Machine_learning/ECLAT at master · sharmasaravanan/Machine_learning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#module Pyfim
from sys  import argv, stderr, maxsize
from math import ceil
from time import time

#-----------------------------------------------------------------------

def report (iset, pexs, supp, data):
    '''Recursively report item sets with the same support.
iset    base item set to report (list of items)
pexs    perfect extensions of the base item set (list of items)
supp    (absolute) support of the item set to report
data    static recursion/output data as a list
        [ target, supp, min, max, maxx, count [, out] ]'''
    if not pexs:                # if no perfect extensions (left)
        data[5] += 1            # count the reported item set
        if len(data) < 7: return# check for a destination
        n = len(iset)           # check the item set size
        if (n < data[2]) or (n > data[3]): return
        if isinstance(data[6], [].__class__):
            data[6].append((tuple(iset), (supp,)))
        else:                   # report the current item set
            for i in iset: data[6].write(str(i)+' '),
            data[6].write('('+str(supp)+')\n')
    else:                       # if perfect extensions to process
        for i in range(len(pexs)): # do include/exclude recursions
            report(iset+[pexs[i]], pexs[i+1:], supp, data)
            report(iset,           pexs[i+1:], supp, data)

#-----------------------------------------------------------------------

def closed (tracts, elim):
    '''Check for a closed item set.
tracts  list of transactions containing the item set
elim    list of lists of transactions for eliminated items
returns whether the item set is closed'''
    for t in reversed(elim):    # try to find a perfect extension
        if tracts <= t: return False
    return True                 # return whether the item set is closed

#-----------------------------------------------------------------------

def maximal (tracts, elim, supp):
    '''Check for a maximal item set.
tracts  list of transactions containing the item set
elim    list of lists of transactions for eliminated items
supp    minimum support of an item set
returns whether the item set is maximal'''
    for t in reversed(elim):    # try to find a frequent extension
        if sum([w for x,w in tracts & t]) >= supp: return False
    return True                 # return whether the item set is maximal

#-----------------------------------------------------------------------

def recurse (tadb, iset, pexs, elim, data):
    '''Recursive part of the eclat algorithm.
tadb    (conditional) transaction database, in vertical representation,
        as a list of item/transaction information, one per (last) item
        (triples of support, item and transaction set)
iset    item set (prefix of conditional transaction database)
pexs    set of perfect extensions (parent equivalent items)
elim    set of eliminated items (for closed/maximal check)
data    static recursion/output data as a list
        [ target, supp, min, max, maxx, count [, out] ]'''
    tadb.sort()                 # sort items by (conditional) support
    xelm = []; m = 0            # init. elim. items and max. support
    for k in range(len(tadb)):  # traverse the items/item sets
        s,i,t = tadb[k]         # unpack the item information
        if s > m: m = s         # find maximum extension support
        if data[0] in 'cm' and not closed(t, elim+xelm):
            continue            # check for a perfect extension
        #if data[0] in 'cm':     # check for a perfect extension
        #    x = set(iset +pexs +[j for r,j,u in tadb[k:]])
        #    y = set().intersection(*[u for u,w in t])
        #    if not (y <= x): continue
        proj = []; xpxs = []    # construct the projection of the
        for r,j,u in tadb[k+1:]:# trans. database to the current item:
            u = u & t           # intersect with subsequent lists
            r = sum([w for x,w in u])
            if   r >= s:       xpxs.append(j)
            elif r >= data[1]: proj.append([r,j,u])
        xpxs = pexs +xpxs       # combine perfect extensions and
        xset = iset +[i]        # add the current item to the set and
        n    = len(xpxs) if data[0] in 'cm' else 0
        r    = recurse(proj, xset, xpxs, elim+xelm, data) \
            if proj and (len(xset)+n < data[4]) else 0
        xelm += [t]             # collect the eliminated items
        if   data[0] == 'm':    # if to report only maximal item sets
            if r < data[1] and maximal(t, elim+xelm[:-1], data[1]):
                report(xset+xpxs, [], s, data)
        elif data[0] == 'c':    # if to report only closed  item sets
            if r < s: report(xset+xpxs, [], s, data)
        else:                   # if to report all frequent item sets
            report(xset, xpxs, s, data)
    return m                    # return the maximum extension support

#-----------------------------------------------------------------------

def eclat (tracts, target='s', supp=2, min=1, max=maxsize, out=0):
    '''Find frequent item set with the eclat algorithm.
tracts  transaction database to mine (mandatory)
        The database must be a list or a tuple of transactions;
        each transaction must be a list or a tuple of items.
        An item can be any hashable object.
target  type of frequent item sets to find     (default: 's')
        s/a   sets/all   all     frequent item sets
        c     closed     closed  frequent item sets
        m     maximal    maximal frequent item sets
supp    minimum support of an item set         (default: 2)
        (positive: percentage, negative: absolute number)
min     minimum number of items per item set   (default: 1)
max     maximum number of items per item set   (default: no limit)
out     output file or array as a destination  (default: None)
returns a list of pairs (i.e. tuples with two elements),
        each consisting of a tuple with a found frequent item set
        and a one-element tuple with its (absolute) support.'''
    supp = -supp if supp < 0 else int(ceil(0.01*supp*len(tracts)))
    if supp <= 0: supp = 1      # check and adapt the minimum support
    if max  <  0: max  = maxsize# and the maximum item set size
    if len(tracts) < supp:      # check whether any set can be frequent
        return out if isinstance(out, [].__class__) else 0
    tadb = dict()               # reduce by combining equal transactions
    for t in [frozenset(t) for t in tracts]:
        if t in tadb: tadb[t] += 1
        else:         tadb[t]  = 1
    tracts = tadb.items()       # get reduced trans. and collect items
    items  = set().union(*[t for t,w in tracts])
    tadb   = dict([(i,[]) for i in items])
    for t in tracts:            # collect transactions per item
        for i in t[0]: tadb[i].append(t)
    tadb = [[sum([w for t,w in tadb[i]]), i, set(tadb[i])]
            for i in tadb]      # build and filter transaction sets
    sall = sum([w for t,w in tracts])
    pexs = [i for s,i,t in tadb if s >= sall]
    tadb = [t for t in tadb if t[0] >= supp and t[0] < sall]
    maxx = max+1 if max < maxsize and target in 'cm' else max
    data = [target, supp, min, max, maxx, 0]
    if not isinstance(out, (0).__class__): data.append(out)
    r = recurse(tadb, [], pexs, [], data)
    if len(pexs) >= min:        # recursively find frequent item sets
        if   target == 'm':     # if to report only maximal item sets
            if r < supp: report(pexs, [], sall, data)
        elif target == 'c':     # if to report only closed  item sets
            if r < s:    report(pexs, [], sall, data)
        else:                   # if to report all frequent item sets
            report([], pexs, sall, data)  # report the empty item set
    if isinstance(out, [].__class__): return out
    return data[5]              # return (number of) found item sets

#-----------------------------------------------------------------------

def fim (tracts, target='s', supp=2, min=1, max=maxsize):
    '''Find frequent item set with the eclat algorithm.
tracts  transaction database to mine (mandatory)
        The database must be a list or a tuple of transactions;
        each transaction must be a list or a tuple of items.
        An item can be any hashable object.
target  type of frequent item sets to find     (default: 's')
        s/a   sets/all   all     frequent item sets
        c     closed     closed  frequent item sets
        m     maximal    maximal frequent item sets
supp    minimum support of an item set         (default: 2)
        (positive: percentage, negative: absolute number)
min     minimum number of items per item set   (default: 1)
max     maximum number of items per item set   (default: no limit)
returns a list of pairs (i.e. tuples with two elements); each
        consisting of a list with a found frequent item set and
        a one-element tuple with its (absolute) support.'''
    return eclat(tracts, target, supp, min, max, [])

#-----------------------------------------------------------------------

if __name__ == '__main__':
    desc    = 'find frequent item sets (with the eclat algorithm)'
    version = 'version 1.2 (2013.01.25)         ' \
              + '(c) 2013   Christian Borgelt'
    opts    = {'t': ['target', 's'],
               'm': ['min',     1 ],
               'n': ['max',    -1 ],
               's': ['supp',   10 ] }
    fixed   = []                # list of program options

    if len(argv) <= 1:          # if no arguments are given
        opts = dict([(o,opts[o][1]) for o in opts])
        print('usage: pyfim.py [options] infile [outfile]')
        print(desc); print(version)
        print('-t#      target type                            '
              +'(default: '+str(opts['t'])+')')
        print('         (s: frequent, c: closed, m: maximal item sets)')
        print('-m#      minimum number of items per item set   '
              +'(default: '  +str(opts['m'])+')')
        print('-n#      maximum number of items per item set   '
              +'(default: no limit)')
        print('-s#      minimum support                        '
              +'(default: '  +str(opts['s'])+'%)')
        print('         (positive: percentage,'
              +'negative: absolute number)')
        print('infile   file to read transactions from         '
              +'[required]')
        print('outfile  file to write frequent item sets to    '
              +'[optional]')
        exit()                  # print a usage message

    stderr.write('pyfim.py')    # print a startup message
    stderr.write(' - ' +desc +'\n' +version +'\n')

    for a in argv[1:]:          # traverse the program arguments
        if a[0] != '-':         # check for a fixed argument
            if len(fixed) < 2: fixed.append(a); continue
            else: print('too many fixed arguments'); exit()
        if a[1] not in opts:    # check for an option
            print('unknown option: '+a[:2]); exit()
        o = opts[a[1]]          # get the corresponding option
        v = a[2:]               # and get the option argument
        if   isinstance(o[1],(0).__class__): o[1] = int(v)
        elif isinstance(o[1],0.0.__class__): o[1] = float(v)
        else:                                o[1] = v
    opts = dict([opts[o] for o in opts])

    t = time()                  # read the transaction data
    stderr.write('reading ' +fixed[0] +' ... ')
    with open(fixed[0], 'rt') as inp:
        tracts = [frozenset(line.split()) for line in inp]
    n = len(tracts)             # get the number of transactions
    stderr.write('[%d transaction(s)] ' % n)
    stderr.write('done [%.2fs].\n' % (time()-t))

    t = time()                  # mine frequent item sets
    stderr.write('mining frequent item sets ... ')
    if len(fixed) < 2:          # if no output file is given
        n = eclat(tracts, **opts)
    else:                       # if an output file is given
        with open(fixed[1], 'w') as out:
            n = eclat(tracts, out=out, **opts)
    stderr.write('[%d sets(s)] ' % n)
    stderr.write('done [%.2fs].\n' % (time()-t))

    #modulle JPA Eclat
    import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from pyfim import eclat
from pyfim import fim
import time
start = time.time()
print("start time is", start)
dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
           ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
#frequent_itemsets = fim(df, target='s', supp=2, min=1, max=100, out=0)
frequent_itemsets = fim(df, target='s', supp=4, min=1, max=10)
print(frequent_itemsets)
etime = time.time() - start
print(etime)