-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodeling.py
More file actions
269 lines (222 loc) · 10.2 KB
/
modeling.py
File metadata and controls
269 lines (222 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
import pandas as pd
import numpy as np
import time
#see the data
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
#play with words
import nltk.sentiment
import nltk
import re
from pprint import pprint
#split and model
from scipy.stats import f_oneway
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from nltk.tokenize import ToktokTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
#import
from sklearn.feature_extraction.text import CountVectorizer
#sql creds
import env as e
import acquire as a
#scraping
import requests
from bs4 import BeautifulSoup
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests
from env import github_token, github_username
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
# setting basic style parameters for matplotlib
plt.rc('figure', figsize=(13, 7))
plt.style.use('seaborn-darkgrid')
def prepare_for_modeling(train, validate, test):
"""
Prepares the data for modeling by creating the necessary feature and target variables.
Args:
train (pandas.DataFrame): DataFrame containing the training data.
validate (pandas.DataFrame): DataFrame containing the validation data.
test (pandas.DataFrame): DataFrame containing the test data.
Returns:
tuple: A tuple containing the following elements in order:
- X_bow (scipy.sparse.csr_matrix): Bag-of-words representation of the training data.
- X_validate_bow (scipy.sparse.csr_matrix): Bag-of-words representation of the validation data.
- X_test_bow (scipy.sparse.csr_matrix): Bag-of-words representation of the test data.
- y_train (pandas.Series): Target variable for the training data.
- y_validate (pandas.Series): Target variable for the validation data.
- y_test (pandas.Series): Target variable for the test data.
"""
#create X_train and y_train elements
X_train = train.clean_contents
X_validate = validate.clean_contents
X_test = test.clean_contents
y_train = train.language
y_validate = validate.language
y_test = test.language
#make my bag of words
cv = CountVectorizer()
X_bow = cv.fit_transform(X_train)
X_validate_bow = cv.transform(X_validate)
X_test_bow = cv.transform(X_test)
return X_bow, X_validate_bow, X_test_bow, y_train, y_validate, y_test
def decision_tree(X_bow, X_validate_bow, y_train, y_validate):
"""
This function trains a decision tree classifier on the provided training data, and evaluates its performance on the
validation data for different values of the 'max_depth' hyperparameter. It then generates a plot of the training and
validation accuracy scores as a function of 'max_depth', and returns a DataFrame containing these scores.
Parameters:
- X_train (pandas.DataFrame): A DataFrame containing the features for the training data.
- X_validate (pandas.DataFrame): A DataFrame containing the features for the validation data.
- y_train (pandas.Series): A Series containing the target variable for the training data.
- y_validate (pandas.Series): A Series containing the target variable for the validation data.
Returns:
- scores_df (pandas.DataFrame): A DataFrame containing the training and validation accuracy scores, as well as the
difference between them, for different values of the 'max_depth' hyperparameter.
"""
# get data
scores_all = []
for x in range(1,20):
tree = DecisionTreeClassifier(max_depth=x, random_state=123)
tree.fit(X_bow, y_train)
train_acc = tree.score(X_bow,y_train)
val_acc = tree.score(X_validate_bow, y_validate)
score_diff = train_acc - val_acc
scores_all.append([x, train_acc, val_acc, score_diff])
scores_df = pd.DataFrame(scores_all, columns=['max_depth', 'train_acc','val_acc','score_diff'])
# Plot the results
sns.set_style('whitegrid')
plt.plot(scores_df['max_depth'], scores_df['train_acc'], label='Train score')
plt.plot(scores_df['max_depth'], scores_df['val_acc'], label='Validation score')
plt.fill_between(scores_df['max_depth'], scores_df['train_acc'], scores_df['val_acc'], alpha=0.2, color='gray')
plt.xlabel('Max depth')
plt.ylabel('Accuracy')
plt.title('Decision Tree Accuracy vs Max Depth')
plt.legend()
plt.show()
return scores_df
def random_forest_scores(X_bow, y_train, X_validate_bow, y_validate):
"""
Trains and evaluates a random forest classifier with different combinations of hyperparameters. The function takes in
training and validation datasets, and returns a dataframe summarizing the model performance on each combination of
hyperparameters.
Parameters:
-----------
X_train : pandas DataFrame
Features of the training dataset.
y_train : pandas Series
Target variable of the training dataset.
X_validate : pandas DataFrame
Features of the validation dataset.
y_validate : pandas Series
Target variable of the validation dataset.
Returns:
--------
df : pandas DataFrame
A dataframe summarizing the model performance on each combination of hyperparameters.
"""
#define variables
train_scores = []
validate_scores = []
min_samples_leaf_values = [1, 2, 3, 4, 5, 6, 7, 8 , 9, 10]
max_depth_values = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
for min_samples_leaf, max_depth in zip(min_samples_leaf_values, max_depth_values):
rf = RandomForestClassifier(min_samples_leaf=min_samples_leaf, max_depth=max_depth,random_state=123)
rf.fit(X_bow, y_train)
train_score = rf.score(X_bow, y_train)
validate_score = rf.score(X_validate_bow, y_validate)
train_scores.append(train_score)
validate_scores.append(validate_score)
# Calculate the difference between the train and validation scores
diff_scores = [train_score - validate_score for train_score, validate_score in zip(train_scores, validate_scores)]
#Put results into a dataframe
df = pd.DataFrame({
'min_samples_leaf': min_samples_leaf_values,
'max_depth': max_depth_values,
'train_score': train_scores,
'validate_score': validate_scores,
'diff_score': diff_scores})
# Set plot style
sns.set_style('whitegrid')
# Create plot
plt.figure(figsize=(8, 6))
plt.plot(max_depth_values, train_scores, label='train', marker='o', color='blue')
plt.plot(max_depth_values, validate_scores, label='validation', marker='o', color='orange')
plt.fill_between(max_depth_values, train_scores, validate_scores, alpha=0.2, color='gray')
plt.xticks([2,4,6,8,10],['Leaf 9 and Depth 2','Leaf 7 and Depth 4','Leaf 5 and Depth 6','Leaf 3 and Depth 8','Leaf 1and Depth 10'], rotation = 45)
plt.xlabel('min_samples_leaf and max_depth', fontsize=14)
plt.ylabel('Accuracy', fontsize=14)
plt.title('Random Forest Classifier Performance', fontsize=18)
plt.legend(fontsize=12)
plt.show()
return df
def k_nearest(X_bow, y_train, X_validate_bow, y_validate):
"""
Trains and evaluates KNN models for different values of k and plots the results.
Parameters:
-----------
X_train: array-like, shape (n_samples, n_features)
Training input samples.
y_train: array-like, shape (n_samples,)
Target values for the training input samples.
X_validate: array-like, shape (n_samples, n_features)
Validation input samples.
y_validate: array-like, shape (n_samples,)
Target values for the validation input samples.
Returns:
--------
results: pandas DataFrame
Contains the train and validation accuracy for each value of k.
"""
metrics = []
train_score = []
validate_score = []
for k in range(1,21):
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_bow, y_train)
train_score.append(knn.score(X_bow, y_train))
validate_score.append(knn.score(X_validate_bow, y_validate))
diff_score = train_score[-1] - validate_score[-1]
metrics.append({'k': k, 'train_score': train_score[-1], 'validate_score': validate_score[-1], 'diff_score': diff_score})
baseline_accuracy = (y_train == 6).mean()
results = pd.DataFrame.from_records(metrics)
# modify the last few lines of the function
# drop the diff_score column before plotting
results_for_plotting = results.drop(columns=['diff_score'])
with sns.axes_style('whitegrid'):
ax = results_for_plotting.set_index('k').plot(figsize=(16,9))
plt.ylabel('Accuracy')
#plt.axhline(baseline_accuracy, linewidth=2, color='black', label='baseline')
plt.xticks(np.arange(0,21,1))
#min_diff_idx = np.abs(results['diff_score']).argmin()
#min_diff_k = results.loc[min_diff_idx, 'k']
#min_diff_score = results.loc[min_diff_idx, 'diff_score']
#ax.axvline(min_diff_k, linestyle='--', linewidth=2, color='red', label=f'min diff at k={min_diff_k} (diff={min_diff_score:.3f})')
plt.fill_between(results['k'], train_score, validate_score, alpha=0.2, color='gray', where=(results['k'] > 0))
plt.title('K Nearest Neighbor', fontsize=18)
plt.legend()
plt.show()
return results
def the_chosen_one(X_bow, X_test_bow, y_train, y_test):
"""
Trains a DecisionTree classifier on the provided training data with a pre-selected max_depth and
evaluates the classifier on the test data.
Parameters:
- X_train_scaled (pandas.DataFrame): DataFrame containing the scaled features for the training data.
- X_test_scaled (pandas.DataFrame): DataFrame containing the scaled features for the test data.
- y_train (pandas.Series): Series containing the target variable for the training data.
- y_test (pandas.Series): Series containing the target variable for the test data.
Returns:
- Accuracy score
"""
tree = DecisionTreeClassifier(max_depth=1, random_state=123)
tree.fit(X_bow, y_train)
tree.score(X_test_bow, y_test)
return tree.score(X_test_bow, y_test)