# preprocess_data.py -- tweet text normalisation and feature-vector helpers.
try:
import json
except ImportError:
import simplejson as json
import re
import nltk
from nltk.classify import *
import stopWord
#import pickle
#import os
#dest = os.path.join('genderprediction','pkl_objects')
#if not os.path.exists(dest):
# os.makedirs(dest)
#start process_tweet
def processTweet(tweet):
    """Normalise raw tweet text ahead of feature extraction.

    Steps, in order:
      1. lower-case the text;
      2. replace links (``www.*`` or ``http(s)://*``) with the token ``URL``;
      3. replace ``@user`` mentions with the token ``AT_USER``;
      4. collapse every run of whitespace into a single space;
      5. drop the leading ``#`` from hashtags, keeping the word;
      6. trim surrounding spaces and quote characters.

    The placeholder tokens are inserted after lower-casing, so they stay
    upper-case in the result.

    Args:
        tweet: the tweet text as a string.

    Returns:
        The normalised tweet string.
    """
    tweet = tweet.lower()
    # Patterns are raw strings: '\.' and '\s' inside a plain literal are
    # invalid string escapes that newer Python versions warn about.
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    tweet = re.sub(r'[\s]+', ' ', tweet)
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # Whitespace has already been collapsed to plain spaces, so stripping
    # spaces together with quotes trims the text fully; stripping quotes
    # alone would leave an edge space in place and keep the quote behind it.
    return tweet.strip(' \'"')
#end
def getfeatureVector(word, tweet):
    """Return the usable tokens of *tweet* as a list of lower-cased words.

    Every whitespace-separated token is passed through
    ``stopWord.removeDuplicateWords`` and then stripped of surrounding
    quotes and ``?,.!`` punctuation. A token is discarded when it is found
    in *word* (the disabled training code in this file passes its stop-word
    collection here) or when it is not an ASCII letter followed only by
    ASCII letters and digits.

    Args:
        word: container tested with ``in`` to decide which tokens to skip.
        tweet: tweet text to tokenise.

    Returns:
        List of accepted tokens, lower-cased, in their original order.
    """
    is_plain_word = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*$").match
    kept = []
    for raw in tweet.split():
        token = stopWord.removeDuplicateWords(raw).strip('\'"?,.!')
        # The membership test runs on the token as-is, before lower-casing.
        if token in word:
            continue
        if is_plain_word(token) is None:
            continue
        kept.append(token.lower())
    return kept
#stopwords = stopWord.getStopWord('stopwords.txt')
#save_stopwords = open("E:\\univerHel\\my_github\\genderprediction\\pkl_objects\\stopwords.pkl","wb")
#pickle.dump(stopwords,save_stopwords)
#save_stopwords.close()
#pickle.dump(stopwords, open(os.path.join(dest,'stopwords.pkl'),'wb'),protocol=None)
#print stopwords[1:5]
# Disabled training pipeline, kept inside a bare string literal so that it
# never executes. It reads 'labelledtrainingData.txt' (one JSON tweet per
# line), builds (featureVector, gender) pairs, trains an nltk
# NaiveBayesClassifier on them and pickles the classifier to a hard-coded path.
# NOTE(review): re-enabling it as-is would fail -- `stopwords` and `pickle`
# are only defined/imported in lines that are themselves commented out.
'''
featureList = []
tweetFeature = []
with open('labelledtrainingData.txt', 'r') as f:
for line in f:
tweet = json.loads(line.strip())
#print tweet['user']['gender']
#print tweet['user']['name']
#print tweet['user']['screen_name']
#print tweet['user']['description']
gender = tweet['user']['gender']
#print gender
tweet_text = processTweet(tweet['text'])
#print tweet_text
featureVector = getfeatureVector(stopwords, tweet_text)
#print featureVector
featureList.extend(featureVector)
tweetFeature.append((featureVector, gender));
#print tweetFeature
featureList = list(set(featureList))
#pickle.dump(featureList, open(os.path.join(dest,'featureList.pkl'),'wb'),protocol=None)
def extractFeatures(feature):
features = set(feature)
featureExtractList = {}
for word in featureList:
featureExtractList['contains(%s)'% word] = (word in features)
return featureExtractList
training_set = nltk.classify.util.apply_features(extractFeatures, tweetFeature)
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)
#pickle.dump(NBClassifier, open(os.path.join(dest,"NBClassifier.pkl"),"wb"),protocol=None)
#print nltk.classify.accuracy(NBClassifier, training_set)
save_classifiew = open("E:\\univerHel\\my_github\\genderprediction\\pkl_objects\\NBClassifier.pkl","wb")
pickle.dump(NBClassifier,save_classifiew)
save_classifiew.close()
'''
#NBClassifier.show_most_informative_features(20)
# Disabled evaluation pass, kept inside a bare string literal so that it never
# executes. For each JSON tweet in 'labelledtestData.txt' it classifies the
# tweet's feature vector with NBClassifier and counts how often the predicted
# label equals the tweet's user 'gender' field, then prints count/49.0.
# NOTE(review): the divisor 49.0 is hard-coded -- presumably the number of
# test tweets; confirm before reuse. The `print` statement is Python 2 syntax.
'''
##test part
count = 0
with open('labelledtestData.txt', 'r') as file:
for line in file:
tweet_test = json.loads(line.strip())
#print tweet['user']['gender']
#print tweet['user']['name']
#print tweet['user']['screen_name']
#print tweet['user']['description']
gender = tweet_test['user']['gender']
#print "right"
#print gender
tweet_text_test = processTweet(tweet_test['text'])
#print tweet_text
featureVector = getfeatureVector(stopwords, tweet_text_test)
#print featureVector
#featureList.extend(featureVector)
#tweetFeature.append((featureVector, gender));
label = NBClassifier.classify(extractFeatures(featureVector))
#print label
if(label==gender):
count = count + 1
print count/49.0
'''
#tweet_text = word_tokenize(tweet_text)
#print tweet_text
#print tweet['text']
#hashtags = []
#for hashtag in tweet['entities']['hashtags']:
# hashtags.append(hashtag['text'])
#print hashtags