Document-Similarities-Using-Dynamic-Programming-Alignment/DataProcessing.py at master · ayaabdelsalam91/Document-Similarities-Using-Dynamic-Programming-Alignment · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import nltk
import sys
def create_dataset(input,output,count,average,categories):
    """Copy a per-label-capped, length-filtered subset of a dataset file.

    Every non-blank line of ``input`` must look like ``label<TAB>text``.
    A line is written unchanged to ``output`` when all of these hold:

    * its label is a key of ``categories``;
    * fewer than ``count`` lines have been kept for that label so far;
    * ``len(text)`` lies strictly between ``average/2`` and
      ``average + average/2``.

    NOTE(review): the length test uses ``len(text)`` on the raw string, i.e.
    a character count, while ``average`` may have been meant as a word
    count -- confirm with the caller which unit is intended.

    Args:
        input: Path of the tab-separated file to read.
        output: Path of the file to create/overwrite with the kept lines.
        count: Maximum number of lines to keep per label.
        average: Reference size the accepted length window is built from.
        categories: Dict mapping each wanted label to the number of lines
            already kept for it.  It is updated in place, so the caller can
            read the final per-label counts from it afterwards.

    Raises:
        ValueError: If a non-blank line has no tab separator.
    """
    lower = average / 2
    upper = average + average / 2
    # Context managers guarantee both handles are closed even when a
    # malformed line raises half-way through the file.
    with open(input, "r") as in_file, open(output, "w") as out_file:
        for example in in_file:
            record = example.strip()
            if not record:
                # Blank lines (e.g. a trailing newline) carry no example.
                continue
            # Split on the first tab only so text containing tabs is kept.
            label, text = record.split('\t', 1)
            if label not in categories or categories[label] >= count:
                continue
            if lower < len(text) < upper:
                categories[label] += 1
                # Write the untouched input line, not the stripped copy.
                out_file.write(example)

def rearrange_dataset(input,output,categories):
    """Rewrite a dataset file so that its lines are grouped by category.

    Every non-blank line of ``input`` must look like ``label<TAB>text``.
    For each entry of ``categories``, in iteration order, all input lines
    whose label equals that entry are appended to ``output`` in their
    original relative order.  Lines whose label is not listed are dropped.

    Args:
        input: Path of the tab-separated file to read.
        output: Path of the file to create/overwrite.
        categories: Iterable of labels giving the desired group order
            (a list, or a dict whose keys are the labels).

    Raises:
        ValueError: If a non-blank line has no tab separator.
    """
    with open(output, "w") as out_file:
        for category in categories:
            # The input is re-scanned once per category, so only one line
            # is held in memory at a time.
            with open(input, "r") as in_file:
                for example in in_file:
                    record = example.strip()
                    if not record:
                        # Blank lines carry no example.
                        continue
                    # Split on the first tab only so text containing tabs
                    # does not break the unpacking.
                    label, _text = record.split('\t', 1)
                    if label != category:
                        continue
                    out_file.write(example)
                    if not example.endswith('\n'):
                        # A final input line without a newline would
                        # otherwise be fused with the next record written.
                        out_file.write('\n')

def read_dataset(input):
    """Count the examples per label in a dataset file.

    Every non-blank line of ``input`` must look like ``label<TAB>text``.

    Args:
        input: Path of the tab-separated file to read.

    Returns:
        A dict mapping each label found in the file to its number of lines.
        An empty file yields an empty dict.

    Raises:
        ValueError: If a non-blank line has no tab separator.
    """
    labels = dict()
    # The context manager closes the handle, which was previously never
    # closed at all.
    with open(input, "r") as in_file:
        for example in in_file:
            record = example.strip()
            if not record:
                # Blank lines carry no example.
                continue
            # Only the label matters here; split on the first tab so text
            # containing tabs does not break the unpacking.
            label, _text = record.split('\t', 1)
            labels[label] = labels.get(label, 0) + 1
    return labels

def get_average_size(input):
    """Return the average number of space-separated tokens per example.

    Every non-blank line of ``input`` must look like ``label<TAB>text``.
    The text is stripped and split on single spaces, so consecutive spaces
    inside the text produce empty tokens that are counted as well.  The
    token count of every example is also printed to standard output.

    Args:
        input: Path of the tab-separated file to read.

    Returns:
        The integer (floored) average token count, or 0 when the file
        holds no examples.

    Raises:
        ValueError: If a non-blank line has no tab separator.
    """
    total_number = 0
    total_size = 0
    # The context manager closes the handle, which was previously never
    # closed at all.
    with open(input, "r") as in_file:
        for example in in_file:
            record = example.strip()
            if not record:
                # Blank lines carry no example.
                continue
            _label, text = record.split('\t', 1)
            tokens = text.strip().split(' ')
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(len(tokens))
            total_number += 1
            total_size += len(tokens)

    if total_number == 0:
        # Avoid ZeroDivisionError on an empty dataset.
        return 0
    # Floor division keeps the integer result this Python 2 code produced.
    return total_size // total_number


if __name__ == "__main__":
    # Report the label distribution and the average example size of the
    # dataset file given as the first command-line argument.
    if len(sys.argv) < 2:
        # Fail with a usage hint instead of a bare IndexError traceback.
        sys.exit("usage: %s DATASET_FILE" % sys.argv[0])
    test_output = sys.argv[1]

    # Labels of interest; only consumed by the optional steps below.
    categories = dict.fromkeys(
        ['comp.graphics', 'sci.med', 'soc.religion.christian', 'sci.crypt', 'talk.politics.mideast'],
        0,
    )
    # Alternative label sets that were used with this script:
    #categories= dict.fromkeys( ['project', 'course', 'student','faculty'] , 0 )
    #categories= dict.fromkeys( ['earn', 'money-fx', 'trade', 'acq','crude'] , 0 )

    # Optional step: group an input file by category (takes a list of labels).
    #test_input = sys.argv[1]
    #test_output = sys.argv[2]
    #categories= ['comp.graphics', 'sci.med', 'soc.religion.christian', 'sci.crypt','talk.politics.mideast']
    #rearrange_dataset(test_input,test_output,categories)

    # Optional step: build a capped, size-filtered subset before reporting.
    #create_dataset(test_input,test_output,50,500,categories)
    #create_dataset(test_input,test_output,50,1000,categories)

    labels = read_dataset(test_output)
    avg = get_average_size(test_output)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(labels)
    print(avg)