-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDataProcessing.py
More file actions
94 lines (78 loc) · 2.75 KB
/
DataProcessing.py
File metadata and controls
94 lines (78 loc) · 2.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import nltk
import sys
def create_dataset(input, output, count, average, categories):
    """Write a length-filtered, per-label-capped subset of a dataset file.

    Every line of ``input`` is expected to look like ``label<TAB>text``.  A
    line is copied unchanged to ``output`` when all of the following hold:

    * its label is a key of ``categories``;
    * fewer than ``count`` lines have been kept for that label so far;
    * the character length of its text is strictly between ``average / 2``
      and ``average + average / 2``.

    Lines without a tab separator (for example blank lines) are skipped.

    Args:
        input: Path of the tab-separated source file.
        output: Path of the file to create or overwrite.
        count: Maximum number of lines to keep per label.
        average: Reference text length that defines the accepted window.
        categories: Dict mapping each wanted label to the number of lines
            already kept for it.  It is updated in place, so pass zeroed
            counters for a fresh run.
    """
    # Kept as the original expression: with an int ``average`` Python 2 floors
    # this value, Python 3 does not.
    half = average / 2
    # NOTE(review): the window is measured in characters, while
    # get_average_size() in this file counts space-separated tokens --
    # confirm which unit callers intend for ``average``.
    with open(input, "r") as in_file, open(output, "w") as out_file:
        for example in in_file:
            # partition() splits on the first tab only, so text that itself
            # contains tabs no longer breaks the unpacking.
            label, sep, text = example.strip().partition('\t')
            if not sep or label not in categories:
                continue
            if categories[label] < count and half < len(text) < average + half:
                categories[label] += 1
                # Write the raw line so the original line ending is kept.
                out_file.write(example)
def rearrange_dataset(input, output, categories):
    """Rewrite a dataset so that its lines are grouped by label.

    Every line of ``input`` is expected to look like ``label<TAB>text``.
    Lines whose label appears in ``categories`` are written to ``output``
    grouped in the order the labels are listed; within one label the
    original file order is preserved.  Lines with other labels, and lines
    without a tab separator (for example blank lines), are dropped.

    The input is read once and the selected lines are held in memory until
    they are written.

    Args:
        input: Path of the tab-separated source file.
        output: Path of the file to create or overwrite.
        categories: Iterable of labels giving the output group order.
    """
    wanted = set(categories)
    grouped = {}
    with open(input, "r") as in_file:
        for example in in_file:
            # Only the label is needed; splitting on the first tab keeps
            # text that contains further tabs intact.
            label, sep, _ = example.strip().partition('\t')
            if sep and label in wanted:
                grouped.setdefault(label, []).append(example)

    with open(output, "w") as out_file:
        # Iterating the caller's sequence (not the dict) fixes the group
        # order and repeats a group if a label is listed more than once.
        for category in categories:
            out_file.writelines(grouped.get(category, ()))
def read_dataset(input):
    """Count how many lines of a dataset file carry each label.

    Every line of ``input`` is expected to look like ``label<TAB>text``.
    Lines without a tab separator (for example blank lines) are ignored.

    Args:
        input: Path of the tab-separated dataset file.

    Returns:
        A dict mapping each label found to its number of lines.
    """
    labels = dict()
    with open(input, "r") as in_file:
        for example in in_file:
            # Split on the first tab only so tabs inside the text are harmless.
            label, sep, _ = example.strip().partition('\t')
            if not sep:
                continue
            labels[label] = labels.get(label, 0) + 1
    return labels
def get_average_size(input):
    """Return the average number of space-separated tokens per example.

    Every line of ``input`` is expected to look like ``label<TAB>text``.  The
    text is stripped and split on single spaces; the token counts are summed
    and divided by the number of examples using integer (floor) division.
    Lines without a tab separator (for example blank lines) are ignored.

    Args:
        input: Path of the tab-separated dataset file.

    Returns:
        The floored average token count as an int, or 0 when the file holds
        no usable example.
    """
    total_number = 0
    total_size = 0
    with open(input, "r") as in_file:
        for example in in_file:
            _, sep, text = example.strip().partition('\t')
            if not sep:
                continue
            total_number += 1
            total_size += len(text.strip().split(' '))
    if total_number == 0:
        # Nothing to average; avoid dividing by zero.
        return 0
    # Both operands are ints, so ``//`` gives the same result on every
    # Python version.
    return total_size // total_number
def main(argv):
    """Print the label counts and average example size of a dataset file.

    Args:
        argv: Command-line arguments with the program name first; ``argv[1]``
            is the path of the tab-separated dataset file to summarise.
    """
    if len(argv) < 2:
        sys.exit("usage: DataProcessing.py DATASET_FILE")
    test_output = argv[1]

    # The dataset-building steps below are currently disabled.  They need an
    # extra input path (test_input) and one of these label sets:
    #   ['comp.graphics', 'sci.med', 'soc.religion.christian', 'sci.crypt',
    #    'talk.politics.mideast']
    #   ['project', 'course', 'student', 'faculty']
    #   ['earn', 'money-fx', 'trade', 'acq', 'crude']
    # rearrange_dataset() takes the list itself; create_dataset() takes
    # dict.fromkeys(<list>, 0).
    #   rearrange_dataset(test_input, test_output, categories)
    #   create_dataset(test_input, test_output, 50, 500, categories)
    #   create_dataset(test_input, test_output, 50, 1000, categories)

    labels = read_dataset(test_output)
    avg = get_average_size(test_output)
    # Single-argument print calls produce the same output on Python 2 and 3.
    print(labels)
    print(avg)


if __name__ == "__main__":
    main(sys.argv)