-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathFiltrationEngine.py
More file actions
132 lines (103 loc) · 4.43 KB
/
FiltrationEngine.py
File metadata and controls
132 lines (103 loc) · 4.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from tweepy import *
import tweepy as tweepy
import pandas as pd
import csv
import re
import string
import json
import pymongo
from pymongo import MongoClient
# Load the raw tweets. The context manager closes the handle as soon as the
# JSON is parsed (the original left it open for the life of the process), and
# the encoding is pinned so emoji-bearing tweets decode the same way on every
# platform instead of depending on the locale default.
with open('Tweet 3000.json', encoding='utf-8') as jsonfile:
    jsondata = json.load(jsonfile)

# MongoDB handles used by the insert loop at the bottom of the file.
# "key" is the connection string passed to MongoClient.
myclient = pymongo.MongoClient("key")
mydb = myclient['TweetDB']
mycollection = mydb["TweetCollection"]
# NOTE(review): PyMongo's MongoClient is documented to connect lazily, so this
# message only shows that the client objects were created - confirm whether a
# real connectivity check (e.g. a ping command) is wanted here.
print("Mongo DB Connection Success")

# Not referenced anywhere else in the visible file; kept so the module still
# exposes the same names.
index = 0
collectionIndex = 1

# Case-insensitive URL matcher used by filterTweets to find links
# (http/https URLs, www-prefixed hosts and bare "domain.tld/" forms).
regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
# Built once at import time; the original rebuilt this pattern on every call,
# and the function is invoked several times per tweet. The character ranges
# are exactly those of the original inline pattern.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    u"\U0001F680-\U0001F6FF"  # transport and map symbols
    u"\U0001F1E0-\U0001F1FF"  # regional-indicator (flag) letters
    u"\U00002702-\U000027B0"  # dingbats
    # NOTE(review): this range spans everything from U+24C2 up to U+1F251,
    # which also covers CJK ideographs, kana, Hangul and many other non-emoji
    # characters, so text in those scripts is stripped as well. Left unchanged
    # to preserve existing output - confirm whether that is intended.
    u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return ``string`` with every run of characters matched by the emoji pattern removed.

    Args:
        string: The text to clean. Must be a ``str``; other types raise
            ``TypeError`` from the underlying ``re`` call.

    Returns:
        A new string with the matched characters deleted (nothing is inserted
        in their place, so surrounding whitespace is left as it was).
    """
    return _EMOJI_PATTERN.sub(r'', string)
# Tweet fields that are discarded before a tweet is filtered and stored.
_UNNECESSARY_KEYS = (
    "truncated",
    "entities",
    "extended_entities",
    "is_quote_status",
    "favorited",
    "retweeted",
    "possibly_sensitive",
)


def removeUnnecessaryKV(tweet):
    """Remove the unwanted bookkeeping fields from ``tweet`` in place.

    Only the keys listed in ``_UNNECESSARY_KEYS`` are touched; any of them
    that is absent is silently skipped. Nested dictionaries are not visited.

    Args:
        tweet: A mutable mapping (dict) describing one tweet.

    Returns:
        The same ``tweet`` object, to allow ``tweet = removeUnnecessaryKV(tweet)``.
    """
    # pop() with a default replaces the original "scan every key and compare
    # it against seven names" loop: one direct lookup per unwanted key.
    for unwanted in _UNNECESSARY_KEYS:
        tweet.pop(unwanted, None)
    return tweet
def removeNull(tweet, key, value):
    """Delete ``key`` from ``tweet`` in place when ``value`` is ``None``.

    Only ``None`` triggers removal; other falsy values (``0``, ``""``,
    ``False``, empty containers) are kept.

    Args:
        tweet: The mutable mapping to modify.
        key: The key whose entry may be removed.
        value: The value that was read for ``key``; the caller passes it in
            because it usually iterates over a snapshot of ``tweet``.

    Returns:
        None.
    """
    if value is None:
        # pop() instead of del: a caller that already removed the key while
        # filtering must not crash here with KeyError.
        tweet.pop(key, None)
def _drop_url_and_null_fields(record):
    """Delete, in place, every entry of ``record`` that is None or a string containing a URL."""
    # Iterate over a snapshot because entries are deleted along the way.
    for field, content in dict(record).items():
        has_url = isinstance(content, str) and re.search(regex, content)
        if content is None or has_url:
            del record[field]


def _to_plain_text(text):
    """Strip emoji from ``text``, then collapse each run of non-alphanumerics to one space."""
    return re.sub('[^A-Za-z0-9]+', ' ', remove_emoji(text))


def filterTweets(tweet):
    """Scrub URLs, null fields, emoji and punctuation from ``tweet`` in place.

    First pass (per top-level field):
      * ``text``: URLs are replaced by a single space.
      * any other string field containing a URL is deleted.
      * ``user``: entries that are None or URL-bearing strings are deleted.
      * ``retweeted_status``: the unwanted bookkeeping keys are removed via
        ``removeUnnecessaryKV``, then None / URL-bearing entries are deleted,
        and its nested ``user`` gets the same treatment.
      * fields whose value is None are deleted.

    Second pass: ``text`` and ``retweeted_status.text`` lose their emoji and
    have every run of non-alphanumeric characters collapsed to one space;
    the user's ``name``, ``location`` and ``description`` lose their emoji.

    NOTE(review): inside ``retweeted_status`` a ``text`` that contains a URL
    is deleted outright, whereas the top-level ``text`` only has the URL
    blanked. That asymmetry is inherited from the original code and kept
    as-is - confirm whether it is intended.

    Args:
        tweet: A dict describing one tweet; it is modified in place.

    Returns:
        The same ``tweet`` object.
    """
    # ---- pass 1: URLs and nulls -------------------------------------------
    for key, value in dict(tweet).items():
        if isinstance(value, str):
            if key == "text":
                tweet[key] = re.sub(regex, " ", value)
            elif re.search(regex, value):
                del tweet[key]
        elif isinstance(value, dict):
            # The dict checks keep a None or malformed "user" /
            # "retweeted_status" from crashing the whole run; a None value
            # simply falls through to removeNull below.
            if key == "user":
                _drop_url_and_null_fields(value)
            elif key == "retweeted_status":
                # One pass suffices: the original repeated this work once per
                # retweet key, but every step here is idempotent.
                retweet = removeUnnecessaryKV(value)
                tweet[key] = retweet
                _drop_url_and_null_fields(retweet)
                retweet_user = retweet.get("user")
                if isinstance(retweet_user, dict):
                    _drop_url_and_null_fields(retweet_user)
        removeNull(tweet, key, value)

    # ---- pass 2: emoji and punctuation ------------------------------------
    # Runs once. The original wrapped this in a loop over every key of the
    # tweet without using the key, redoing identical (idempotent) work.
    if isinstance(tweet.get("text"), str):
        tweet["text"] = _to_plain_text(tweet["text"])

    retweet = tweet.get("retweeted_status")
    if isinstance(retweet, dict) and isinstance(retweet.get("text"), str):
        retweet["text"] = _to_plain_text(retweet["text"])

    user = tweet.get("user")
    if isinstance(user, dict):
        for field in ("name", "location", "description"):
            if isinstance(user.get(field), str):
                user[field] = remove_emoji(user[field])

    return tweet
# Filter every tweet first, then store them with one bulk write instead of one
# round trip per tweet (the per-tweet insert result was never used).
filtered_tweets = []
for tweet in jsondata['data']:
    tweet = removeUnnecessaryKV(tweet)
    filterTweets(tweet)  # cleans the dict in place
    filtered_tweets.append(tweet)

# insert_many rejects an empty document list, so skip the write when the
# input file contained no tweets.
if filtered_tweets:
    mycollection.insert_many(filtered_tweets)
print("Tweets Filtered Successful and stored in MongoDB")