MachineLearning2/rnn.py at master · rizzoMartin/MachineLearning2 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import tensorflow as tf
from tensorflow.keras.models import Sequential
from keras.layers import Dense, SimpleRNN
from keras.layers.recurrent import LSTM
from sklearn.model_selection import train_test_split
import pandas as pd

import keras
from keras.models import Sequential,Input,Model
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.embeddings import Embedding

df = pd.read_csv('E:\HvA\Big Data Scientist & Engineer\Block2\Assignment2\code_and_df\src\data.csv')

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

X = []
sentences = list(df['review'])
for sen in sentences:
    X.append(sen)


y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

max_words=5000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

maxlen = 50

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

from keras.layers.convolutional import Conv1D
from keras.layers import GlobalMaxPooling1D

vocab_size = len(tokenizer.word_index) + 1


model = Sequential()
# vocab_Size is the size of our vocabulary + 1 and it's the input_dim,
# 32 is the output_dim and its the dimension of the dense embedding,
# last thing input_length is used when you are going to connect flatten and dense layers
model.add(Embedding(vocab_size, 32, input_length=maxlen))
# basic LSTM layer with 128 as dimensionality of the output
model.add(LSTM(128))
# Basic dense layer,
# 1 is the dimensionality of the output,
# activation is the activation function sigmoid sigmoid(x) = 1 / (1 + exp(-x))
model.add(Dense(1, activation='sigmoid'))
# loss=binary_crossentropy, it's the algorithm to calculate the losses this way:
# Computes the cross-entropy loss between true labels and predicted labels,
# optimizer=adam, is the algorithm we are going to use to improve our model
# metrics are the metrics we are going to evaluate, in this case the accuracy
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
print(model.summary())

# X_train, y_train, is the data to train the model
# batch_size is the number of samples per gradient update
# epochs are the number of trainings of the model
# verbose is for check in the terminal how is the model fitting
# validation split is a part of the training data splitted to evaluate the accuracy and losses per epoch
history = model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)
score = model.evaluate(X_test, y_test, verbose=1)
print("Loss value:", score[0]) # 0.1362583190202713
print("Test Accuracy:", score[1]) # 0.9522904753684998