Sentweet/featureFunctions.py at master · shuklak13/Sentweet · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from unicodeFunctions import *

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# Lookup table of tokens to ignore during feature extraction.
# Keys are the stopwords; every value is True (only membership is used).
stopWords = dict.fromkeys(stopwords.words('english'), True)  # nltk's stopwords

# Hand-picked additions (punctuation and a few extra tokens), converted with
# toUnicode before being used as keys, then merged into the main table.
stopWords2 = dict.fromkeys(
    (toUnicode(token) for token in
     (".", "!", "?", "rt", "@", "=", "+", "-", "&amp;", "follow", "i'm", "i'll")),
    True)  # my own set of stopwords
stopWords.update(stopWords2)

def normalize(word):
    """Return the stem of the lowercased form of *word*.

    The word is lowercased first and then passed to the module-level
    ``stemmer`` (a ``PorterStemmer`` instance).
    """
    lowered = word.lower()
    return stemmer.stem(lowered)


def feature_tuple(fileName, mode, label):
    """Return the pair ``(features, label)`` for one corpus file.

    Intended as input for NLTK's Naive Bayes Classifier.

    fileName, mode -- forwarded unchanged to ``word_features``, which opens
                      the file and builds the feature dictionary.
    label          -- the class label to attach to those features; returned
                      as-is as the second element of the tuple.
    """
    return (word_features(fileName, mode), label)

def word_features(fileName, mode):
    """Collect the features of every line of a corpus file into one dict.

    Each line is converted with ``toUnicode`` and handed to
    ``extractFeaturesFrom``; the per-line feature dicts are merged into a
    single dictionary, which is returned. Stopword filtering and any word
    normalization happen inside ``extractFeaturesFrom``, not here.

    Author's note: the Stanford corpus is already lowercase; other corpora
    might need additional normalization.

    fileName -- path of the corpus file to read.
    mode     -- mode string passed straight to ``open``.

    Returns a dict of features (empty if the file yields no features).
    Exceptions raised by ``open`` or by the helpers propagate to the caller.
    """
    features = {}
    # The with-block guarantees the file is closed even if a helper raises
    # part-way through the corpus.
    with open(fileName, mode) as corpus:
        for line in corpus:
            newFeatures = extractFeaturesFrom(toUnicode(line))
            # extractFeaturesFrom returns None when given None (presumably
            # when toUnicode could not convert the line); skip such lines.
            if newFeatures is not None:
                features.update(newFeatures)
    return features

def extractFeaturesFrom(text):
    """Turn a piece of text into a feature dict, or None if text is None.

    The text is split on whitespace. Every token that is not a key of
    ``stopWords`` is passed through ``normalize`` and stored as a key with
    the value True. The stopword check is made on the raw token, before
    normalization.
    """
    if text is None:
        return None

    extracted = {}
    for token in text.split():
        if token in stopWords:
            continue
        extracted[normalize(token)] = True
    return extracted