-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAddressProfiler.py
More file actions
123 lines (96 loc) · 5.12 KB
/
AddressProfiler.py
File metadata and controls
123 lines (96 loc) · 5.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#########################################################################################################
# Program Name : AddressProfiler.py #
# Program Description: #
# This program profiles the address records in the profilerinput file produced by PreAddressProfiler.py #
# #
# Tag Legend #
# X - Alphanumeric word #
# A - Alphabetic word #
# 9 - Numeric word #
# $ - Word with special character/s #
# POBox - Postal Type #
# Unit - Subdwelling Type #
# Level - Floor/Level Type #
# Block - Block Type #
# Rural - Rural Delivery Type #
# Street - Street Type #
# State - Au State Name #
# TownCity - NZ Town/City #
# #
# PreRequisite: #
# PreAddressProfiler.py must be triggered first. #
# #
# Comment Date Author #
# ================================ ========== ================ #
# Initial Version 04/07/2016 Engramar Bollas #
#########################################################################################################
import sys
import re
# Whole-word replacements applied to every input line before tagging: each key
# that appears as a complete word is replaced by its tag.
lexicals = {
    'B'                 : 'Level',
    'BASEMENT'          : 'Level',
    'WEST AUST'         : 'State',
    'WESTERN AUST'      : 'State',
    'WESTERN AUSTRALIA' : 'State',
}

# Compiled once instead of once per input line. Keys are escaped so they are
# always matched literally, and tried longest-first so that a short key can
# never win over a longer key that starts with the same words.
LEXICAL_PATTERN = re.compile(
    r'\b('
    + '|'.join(re.escape(key) for key in sorted(lexicals, key=len, reverse=True))
    + r')\b'
)

# Words that are already tags; they are copied to the pattern unchanged.
RESERVED_TAGS = frozenset(
    ('POBox', 'Unit', 'Level', 'Block', 'Rural', 'Street', 'State')
)

# Any of these characters inside a word makes the word a '$' word.
SPECIAL_CHARACTERS = frozenset('~`!@#$%^&*()_+={}[]:>;<?')

# Zero-based index of the '|'-separated column that holds the address text.
ADDRESS_COLUMN = 3


def classify_field(field):
    """Return the profile tag for one whitespace-delimited address word.

    Reserved tags are returned unchanged. Otherwise the checks run from the
    highest priority to the lowest, which yields the same result as the
    original "last matching test wins" sequence of assignments:

        '$'            word contains a special character
        ','            word is a lone comma
        'UnitAndStNum' word contains '/'
        'A'            word is purely alphabetic
        '9'            word is purely numeric
        'X'            anything else
    """
    if field in RESERVED_TAGS:
        return field
    if any(char in SPECIAL_CHARACTERS for char in field):
        return '$'
    if field == ',':
        return ','
    if '/' in field:
        return 'UnitAndStNum'
    if field.isalpha():
        return 'A'
    if field.isdigit():
        return '9'
    return 'X'


def profile_record(line):
    """Profile one raw input line.

    The lexicon replacements are applied to the whole line, trailing
    whitespace is stripped, and the words of the address column are tagged.

    Returns a ``(pattern, record)`` tuple, where ``pattern`` is every tag
    followed by a single space (so a non-empty pattern ends with a space) and
    ``record`` is the normalised line. Returns ``None`` when the line has no
    address column, e.g. a blank line.
    """
    record = LEXICAL_PATTERN.sub(lambda match: lexicals[match.group()], line)
    record = record.rstrip()
    columns = record.split('|')
    if len(columns) <= ADDRESS_COLUMN:
        return None
    tags = [classify_field(word) for word in columns[ADDRESS_COLUMN].split()]
    pattern = ''.join(tag + ' ' for tag in tags)
    return pattern, record


def main(input_path='profilerinput',
         summary_path="AddressProfilerOutput.txt",
         detail_path="AddressProfilerOutputWithAddresses.txt"):
    """Profile every record of ``input_path`` and write the two reports.

    ``detail_path`` receives one ``pattern|record`` line per input record.
    ``summary_path`` receives one ``pattern|count`` line per distinct pattern,
    in dictionary iteration order (first-seen order on Python 3.7+).

    Records without an address column are skipped and reported on stderr
    rather than aborting the run with an IndexError. All files are closed on
    exit and sys.stdout is left untouched.
    """
    print('Running AddressProfiler...')
    counts = {}
    # The input is opened first so that a missing input file does not
    # truncate reports left by a previous run.
    with open(input_path) as records:
        with open(summary_path, 'w') as summary:
            with open(detail_path, 'w') as detail:
                for line_number, line in enumerate(records, 1):
                    profiled = profile_record(line)
                    if profiled is None:
                        sys.stderr.write(
                            'Skipping line %d: fewer than %d columns\n'
                            % (line_number, ADDRESS_COLUMN + 1)
                        )
                        continue
                    pattern, record = profiled
                    counts[pattern] = counts.get(pattern, 0) + 1
                    detail.write(pattern + "|" + record + '\n')
                for pattern, count in counts.items():
                    summary.write(pattern + "|" + str(count) + '\n')


if __name__ == '__main__':
    main()