-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlogistic_regression.py
More file actions
82 lines (63 loc) · 2.62 KB
/
logistic_regression.py
File metadata and controls
82 lines (63 loc) · 2.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from pandas import DataFrame
import pandas as pd
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk import word_tokenize
def logistic_regression(X, y):
    """Train a logistic-regression classifier and print its test-set metrics.

    The data is split 80/20 into train and test partitions, stratified on
    ``y`` with a fixed random seed (123). A default ``LogisticRegression``
    is fitted on the training partition, and the accuracy plus the confusion
    matrix divided by the test-set size are printed.

    Args:
        X: Feature matrix accepted by scikit-learn.
        y: Target labels aligned with the rows of ``X``.

    Returns:
        None. Results are only printed.
    """
    split = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)
    features_train, features_test, labels_train, labels_test = split

    print('training data...')
    classifier = LogisticRegression()
    classifier.fit(features_train, labels_train)

    print('making predictions...')
    predictions = classifier.predict(features_test)

    print('performance metrics:')
    accuracy = accuracy_score(labels_test, predictions)
    print('Accuracy score test set: ', accuracy)
    # Dividing by the test-set size turns raw counts into proportions.
    confusion_share = confusion_matrix(labels_test, predictions) / len(labels_test)
    print('Confusion matrix test set: \n', confusion_share)
def vectorize_tfidf(df):
    """Build TF-IDF features from ``df.review`` and run logistic regression.

    The vectorizer uses unigrams and bigrams, keeps at most 100 features,
    removes scikit-learn's English stop words, and only accepts tokens of
    two or more word characters that are not digits. The resulting feature
    frame and ``df.label`` are printed and then passed to
    ``logistic_regression``.

    Args:
        df: DataFrame with a ``review`` text column and a ``label`` column.

    Returns:
        None. Features, labels and metrics are only printed.
    """
    print('applying tfidf vectorizer')
    # Tokens: at least two consecutive word characters, none of them digits.
    my_pattern = r'\b[^\d\W][^\d\W]+\b'
    vect = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=100,
        token_pattern=my_pattern,
        # stop_words is documented as 'english', a list or None; pass the
        # frozenset as a list so versions that validate the type accept it.
        stop_words=list(ENGLISH_STOP_WORDS),
    ).fit(df.review)
    X_txt = vect.transform(df.review)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vect, 'get_feature_names_out'):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()

    # Dense frame with one column per retained term.
    X = pd.DataFrame(X_txt.toarray(), columns=feature_names)
    y = df.label
    print(X)
    print(y)
    logistic_regression(X, y)
def bd_call():
    """Load the hotel reviews via the ``select_all`` stored procedure.

    Connects to the local ``hotel_reviews`` MySQL database, calls the
    procedure, builds a DataFrame from the result set (the last one, if the
    procedure yields several), writes it to a CSV file and returns it.

    Returns:
        DataFrame whose columns are named after the result-set description.

    Raises:
        RuntimeError: If the stored procedure produced no result set.
    """
    # NOTE(review): credentials are hard-coded in the connection URL;
    # consider moving them to configuration or environment variables.
    engine = create_engine('mysql+mysqlconnector://root:root@localhost/hotel_reviews')
    procedure = 'select_all'
    print('getting data from db...')
    raw_conn = engine.raw_connection()
    df = None
    try:
        cur = raw_conn.cursor()
        cur.callproc(procedure)
        for result in cur.stored_results():
            column_names_list = [i[0] for i in result.description]
            # Passing the names here keeps an empty result set from failing
            # on a column-count mismatch when the names are assigned later.
            df = DataFrame(result.fetchall(), columns=column_names_list)
    finally:
        # Release the connection even when the call or a fetch raises.
        raw_conn.close()

    if df is None:
        raise RuntimeError(
            "stored procedure '%s' returned no result set" % procedure
        )

    # Raw string: the backslashes are path separators, not escape sequences.
    df.to_csv(r'E:\HvA\Big Data Scientist & Engineer\Block2\Assignment2\code_and_df\src\data_okay.csv')
    return df
def stemming(df):
    """Stem every review in place, then rerun the TF-IDF pipeline.

    Each entry of ``df.review`` is tokenized with NLTK's ``word_tokenize``,
    every token is reduced with a Porter stemmer, and the stems are joined
    back with single spaces. The ``review`` column of ``df`` is overwritten
    with the stemmed text before ``vectorize_tfidf`` is called on it.

    Args:
        df: DataFrame with a ``review`` text column; it is modified in place.

    Returns:
        None.
    """
    stemmer = PorterStemmer()

    print('creating a list of tokens...')
    tokenized_reviews = []
    for text in df.review:
        tokenized_reviews.append(word_tokenize(text))

    print('stemming the list...')
    stemmed_reviews = [
        list(map(stemmer.stem, words)) for words in tokenized_reviews
    ]

    print('joining the words...')
    # Rebuild one space-separated string per review and overwrite the column.
    df.review = [' '.join(words) for words in stemmed_reviews]

    vectorize_tfidf(df)
# Script entry: fetch the reviews from the database once.
df = bd_call()
# First experiment: TF-IDF + logistic regression on the unstemmed reviews.
vectorize_tfidf(df)
# Second experiment: stemming() overwrites df.review with stemmed text and
# reruns the same pipeline, so it must come after the unstemmed run above.
stemming(df)