Sarcasm-detection-project/PatternExtraction.py at master · pratheeksh/Sarcasm-detection-project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import re
import string

class PatternExtraction:
	"""Split corpus tokens into CW and HFW sets by relative corpus frequency.

	The instance holds a list of review strings. Each review is tokenized
	into words (runs of word characters and apostrophes) and single
	punctuation marks (one of . , ! ? ;). A token's corpus frequency is its
	occurrence count divided by the total number of tokens in all reviews.

	NOTE(review): "CW" and "HFW" presumably stand for "content word" and
	"high-frequency word" -- confirm against the project's reference paper.
	"""

	# Punctuation marks are deliberately tokenized as words of their own.
	_TOKEN_PATTERN = re.compile(r"[\w']+|[.,!?;]")
	# Shared CW upper bound / HFW lower bound: 1000 words per million.
	_FREQ_THRESHOLD = float(1000) / 1000000

	def __init__(self, reviews):
		"""Store the list of review strings to analyse."""
		self.reviewList = reviews

	def calculateCorpusFrequency(self):
		"""Return a dict mapping each token to its normalized corpus frequency.

		The frequency of a token is (occurrences of the token) / (total
		number of tokens in the corpus), so the values sum to 1 for a
		non-empty corpus. An empty corpus yields an empty dict.
		"""
		wordCounts = {}
		totalNumWordsInCorpus = 0

		# Single pass: count every token and the overall corpus size.
		for review in self.reviewList:
			for word in self._TOKEN_PATTERN.findall(review):
				totalNumWordsInCorpus += 1
				wordCounts[word] = wordCounts.get(word, 0) + 1

		# Normalize each distinct token exactly once. Iterating the counts
		# (rather than re-walking the corpus and guessing from the value
		# whether a token was already normalized) keeps a frequency of
		# exactly 1.0 from being divided a second time.
		wordFreqDict = {}
		for word, count in wordCounts.items():
			wordFreqDict[word] = float(count) / totalNumWordsInCorpus
		return wordFreqDict

	def findCW(self):
		"""Return the set of non-punctuation tokens with frequency below the threshold.

		Tokens whose frequency is strictly less than 1000 per million are
		kept; tokens found in string.punctuation are never included.
		"""
		wordFreqDict = self.calculateCorpusFrequency()
		cwSet = set()
		for word, corpusFreqWord in wordFreqDict.items():
			#punctuation is not CW
			if word in string.punctuation:
				continue
			if corpusFreqWord < self._FREQ_THRESHOLD:
				cwSet.add(word)
		return cwSet

	def findHFW(self):
		"""Return the set of tokens with frequency above the threshold.

		Tokens whose frequency is strictly greater than 1000 per million
		are kept; punctuation tokens are eligible here.
		"""
		wordFreqDict = self.calculateCorpusFrequency()
		hfwSet = set()
		for word, corpusFreqWord in wordFreqDict.items():
			if corpusFreqWord > self._FREQ_THRESHOLD:
				hfwSet.add(word)
		return hfwSet


def main():
	"""Demo entry point: print the CW set found in a small sample corpus."""
	sampleReviews = ['this is a review','this is another review','i do not like avacados','casper mattresses are expensive!']
	extractor = PatternExtraction(sampleReviews)
	# findCW computes the corpus frequencies itself, so no separate
	# calculateCorpusFrequency call is needed before it.
	print (extractor.findCW())

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
	main()