-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_preprocessing.py
More file actions
58 lines (46 loc) · 2.5 KB
/
data_preprocessing.py
File metadata and controls
58 lines (46 loc) · 2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import numpy as np
import pandas as pd
import collections
from keras.preprocessing.text import text_to_word_sequence
from sklearn.model_selection import train_test_split
# Load the raw articles. The rest of the script reads the "text" column here
# and the "label" column further down.
# NOTE(review): a stray "Unnamed" index column, if the CSV has one, is not
# removed here -- confirm whether it needs to be dropped.
articles = pd.read_csv("all_datasets.csv")

# Lower-case every article and tokenize it into a list of words.
articles["text"] = articles["text"].str.lower().apply(text_to_word_sequence)

# Work on a small leading subset of the corpus. Take an explicit copy so that
# later column assignments modify this frame and never write through a slice
# of `articles`.
N_SAMPLE_ARTICLES = 15
test_articles = articles.iloc[:N_SAMPLE_ARTICLES, :].copy()

# Corpus statistics are computed once, on the subset that is actually used.
total_articles = test_articles.shape[0]
# Each cell holds a token list, so map(len) gives the article length in words.
max_len = max(test_articles["text"].map(len))

# Document frequency: number of articles containing each word. Every article
# is reduced to a set first so a word counts at most once per article.
word_freq = collections.Counter(
    word for art in test_articles["text"] for word in set(art)
)
vocabulary = word_freq.keys()  # distinct words seen across the subset
class TF_IDF:
    """Compute padded TF-IDF vectors for tokenized articles.

    The methods read the module-level names ``total_articles`` (number of
    articles), ``word_freq`` (word -> number of articles containing it) and
    ``max_len`` (length every output vector is padded to); these must be
    defined before ``IDF`` or ``article_tf_idf`` is called.
    """

    def __init__(self):
        print("transforming words to word_freq")

    @staticmethod
    def TF(article, word):  # term_frequency
        """Return the fraction of tokens in *article* that equal *word*.

        An empty article yields 0.0 instead of dividing by zero.
        """
        n_tokens = len(article)
        if n_tokens == 0:
            return 0.0
        return sum(1 for w in article if w == word) / n_tokens

    @staticmethod
    def IDF(word):  # inverse_document_freq
        """Return ``log(total_articles / word_freq[word])``.

        Raises ``ZeroDivisionError`` (or ``KeyError`` for a plain dict) when
        *word* has no entry in ``word_freq``.
        """
        return np.log(total_articles / word_freq[word])

    def article_tf_idf(self, article):
        """Return a ``(1, max_len)`` array of per-token TF-IDF weights.

        Position ``i`` holds the TF-IDF weight of ``article[i]``; positions
        past the end of the article stay zero (padding). An article longer
        than ``max_len`` raises ``IndexError``.
        """
        tf_idf_vec = np.zeros((1, max_len))  # zero-padded up to max_len
        n_tokens = len(article)
        # Count every token once up front instead of rescanning the whole
        # article for each position, and compute each distinct word's weight
        # a single time.
        counts = collections.Counter(article)
        weights = {
            word: (count / n_tokens) * self.IDF(word)
            for word, count in counts.items()
        }
        for i, word in enumerate(article):
            tf_idf_vec[0, i] = weights[word]
        return tf_idf_vec
compute_tf_idf = TF_IDF()
# Replace each article's token list with its padded (1, max_len) TF-IDF vector.
test_articles.loc[:, "text"] = test_articles.apply(
    lambda row: compute_tf_idf.article_tf_idf(row["text"]), axis=1
)

# relevant for decision trees to split data
# Labels as a column vector. reshape fails loudly on a size mismatch, whereas
# np.resize would silently repeat or truncate values.
y_data = test_articles["label"].to_numpy().reshape(-1, 1)
# Stack the per-article (1, max_len) rows into one (n_articles, max_len) matrix.
x_data = np.vstack(test_articles["text"].tolist())
# Features and label side by side; the label is the last column.
data = np.concatenate((x_data, y_data), axis=1)

# Hold out 20% of the rows as the test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    data[:, :-1], data[:, -1], train_size=0.8, test_size=0.2
)
# Carve the validation set out of the remaining training rows only.
# Re-splitting the full `data` here would discard the split above and let
# validation/training rows overlap the held-out test rows.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_train, Y_train, train_size=0.7, test_size=0.3
)