mastersoft/AddressProfiler.py at master · engramar/mastersoft · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#########################################################################################################
#   Program Name : AddressProfiler.py                                                                   #
#   Program Description:                                                                                #
#   This program basically does profiling of profilerinput from PreAddressProfiler.py                  #
#                                                                                                       #
#   Tag Legend                                                                                          #
#       X - Alphanumeric word                                                                           #
#       A - Alphabetic word                                                                             #
#       9 - Numeric word                                                                                #
#       $ - Word with special character/s                                                               #
#       POBox - Postal Type                                                                             #
#       Unit - Subdwelling Type                                                                         #
#       Level - Floor/Level Type                                                                        #
#       Block - Block Type                                                                              #
#       Rural - Rural Delivery Type                                                                     #
#       Street - Street Type                                                                            #
#       State - Au State Name                                                                           #
#       TownCity - NZ Town/City                                                                         #
#                                                                                                       #
#   PreRequisite:                                                                                       #
#   PreAddressProfiler.py must be triggered first.                                                      #
#                                                                                                       #
#   Comment                                         Date                  Author                        #
#   ================================                ==========            ================              #
#   Initial Version                                 04/07/2016            Engramar Bollas               #
#########################################################################################################
import re
import sys
from collections import Counter

# Input file read by this script and the two report files it writes.
INPUT_PATH = 'profilerinput'
SUMMARY_PATH = 'AddressProfilerOutput.txt'
DETAIL_PATH = 'AddressProfilerOutputWithAddresses.txt'

# Zero-based index of the '|'-separated column whose words are profiled.
PROFILED_COLUMN = 3

# Known words/phrases and the tag each one is replaced with before profiling.
lexicals = {
    'B': 'Level',
    'BASEMENT': 'Level',
    'WEST AUST': 'State',
    'WESTERN AUST': 'State',
    'WESTERN AUSTRALIA': 'State',
}

# Words that are already tags; they are copied into the profile unchanged.
PASSTHROUGH_TAGS = frozenset([
    'POBox', 'Unit', 'Level', 'Block', 'Rural', 'Street', 'State',
])

# Any one of these characters marks a word as '$' (special).
SPECIAL_CHARS = frozenset('~`!@#$%^&*()_+={}[]:>;<?')


def build_lexical_pattern(lexicon):
    """Compile one regex that matches any key of *lexicon* as a whole word.

    Keys are escaped so they are always matched literally, and longer keys
    are listed first so a phrase is never shadowed by a shorter key that
    happens to be its prefix.
    """
    phrases = sorted(lexicon, key=len, reverse=True)
    alternation = '|'.join(re.escape(phrase) for phrase in phrases)
    return re.compile(r'\b(' + alternation + r')\b')


def apply_lexicals(line, pattern, lexicon):
    """Return *line* with every lexicon phrase replaced by its tag.

    The substitution runs over the whole line, not only the profiled column.
    A match with no lexicon entry (only possible for the empty match produced
    by an empty lexicon) is left as it is.
    """
    return pattern.sub(
        lambda match: lexicon.get(match.group(0), match.group(0)), line)


def classify_field(field):
    """Return the profile tag for a single whitespace-delimited word.

    Precedence, highest first:
        pass-through tag -> the word itself
        contains a special character -> '$'
        exactly ','      -> ','
        contains '/'     -> 'UnitAndStNum'
        all alphabetic   -> 'A'
        all digits       -> '9'
        anything else    -> 'X'
    """
    if field in PASSTHROUGH_TAGS:
        return field
    if not SPECIAL_CHARS.isdisjoint(field):
        return '$'
    if field == ',':
        return ','
    if '/' in field:
        return 'UnitAndStNum'
    if field.isalpha():
        return 'A'
    if field.isdigit():
        return '9'
    return 'X'


def profile_record(record):
    """Return the tag pattern for the profiled column of *record*.

    *record* is one '|'-separated line with lexicon tags already applied.
    Every tag is followed by a single space (so a non-empty pattern ends with
    a space); this matches the format of the existing report files.

    Raises:
        ValueError: if *record* has too few columns to contain the profiled
            column.
    """
    columns = record.split('|')
    if len(columns) <= PROFILED_COLUMN:
        raise ValueError('expected at least %d columns, found %d'
                         % (PROFILED_COLUMN + 1, len(columns)))
    words = columns[PROFILED_COLUMN].split()
    return ''.join(classify_field(word) + ' ' for word in words)


def iter_profiles(lines, lexicon):
    """Yield ``(pattern, record)`` for every usable line in *lines*.

    Each line has the lexicon applied and trailing whitespace removed.
    Blank lines are skipped silently; lines with too few columns are skipped
    with a warning on stderr instead of aborting the whole run.
    """
    lexical_pattern = build_lexical_pattern(lexicon)
    for lineno, line in enumerate(lines, 1):
        record = apply_lexicals(line, lexical_pattern, lexicon).rstrip()
        if not record:
            continue
        try:
            profile = profile_record(record)
        except ValueError as err:
            sys.stderr.write('AddressProfiler: skipping line %d: %s\n'
                             % (lineno, err))
            continue
        yield profile, record


def main(input_path=INPUT_PATH, summary_path=SUMMARY_PATH,
         detail_path=DETAIL_PATH):
    """Profile *input_path* and write the two report files.

    *detail_path* receives one ``pattern|record`` line per profiled record.
    *summary_path* receives one ``pattern|count`` line per distinct pattern.
    All files are closed on exit, and ``sys.stdout`` is left untouched.
    """
    print('Running AddressProfiler...')

    pattern_counts = Counter()
    # Open everything up front, input first, so a missing input file fails
    # before any report file is created or truncated.
    with open(input_path) as source, \
            open(summary_path, 'w') as summary, \
            open(detail_path, 'w') as detail:
        for profile, record in iter_profiles(source, lexicals):
            pattern_counts[profile] += 1
            detail.write(profile + '|' + record + '\n')

        for profile, count in pattern_counts.items():
            summary.write('%s|%d\n' % (profile, count))


# Run only when executed as a script; importing this module exposes the
# helpers above without reading or writing any files.
if __name__ == '__main__':
    main()