CSC561-Final-Project/MainCode.py at main · nick-rommel/CSC561-Final-Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
# import statements
# importing the other files in this folder hierarchy
import CustomDataloader as CDL
from VITNet import VITNet as VN
import time
import torch.optim as TO
import torch.nn as nn
import torch
import numpy as np


# selecting the compute device used by every function in this file:
#   the GPU ("cuda") when torch reports that CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to run the hyperparameter sweep and then evaluate the best model found.
# For every (learning rate, weight decay) pair a fresh VITNet is trained for a fixed
#   number of epochs. After each epoch the validation loss is compared against the best
#   loss seen so far across the whole sweep; on improvement the weights are copied into
#   `best_model` and the hyperparameters are recorded. Per-combination metrics are appended
#   to 'Train_Validation.txt', and finally the best model is passed to test_model().
# Inputs:
#   None (dataset path, sizes and sweep ranges are defined inside the function)
# Outputs:
#   None
def train():
    # path to the dataset
    # this will need to be the absolute path to the dataset's "images_original" folder.
    path = 'C:/Masters/CSC561/Final Project/Code/CSC561-Final-Project/GTZAN/images_original/'

    # parameters that are remaining constant across all model combinations
    batch_size = 100
    # after finishing the first hyperparameter sweep, we determined that we were getting
    #   a severe diminishing margin of returns after 15 epochs. Therefore, 15 epochs is
    #   our new number for training sweeps for each model combination.
    num_epochs = 15

    train_size = 800
    val_size = 100
    test_size = 99

    # creating the dataloaders by invoking their function.
    train_loader, val_loader, test_loader = CDL.CustomLoader(path, batch_size, train_size, val_size, test_size)

    # hyperparameters
    # learning rates start at 0.001 and advance in steps of 0.0002 (stop bound 0.0052);
    #   weight decays start at 0.005 and advance in steps of 0.001 (stop bound 0.016).
    # The learning-rate range was narrowed based on the results of our first search.
    lr = np.arange(0.001, 0.0052, 0.0002)
    Weight_Decay = np.arange(0.005, 0.016, 0.001)
    best_param_dictionary = {
        "LR": lr[0],
        "weight_decay": Weight_Decay[0],
    }

    # declaring the variable to hold the best model
    # its weights are overwritten (via state_dict) each time a better epoch is found below
    best_model = VN(1, 4*256)

    # variable for holding the best validation loss. This is our evaluation metric for our
    #   second hyperparameter sweep.
    # Starting from +infinity so that the first finite validation loss always counts as
    #   an improvement, whatever its magnitude.
    best_val_loss = float("inf")

    # variable for invoking loss calculations
    # the loss function does not depend on the hyperparameters, so it is built once here;
    #   this also guarantees it exists when test_model() is called after the sweep.
    criterion = nn.CrossEntropyLoss(reduction='mean')

    # file that collects the per-combination metrics for later use in graphing
    filename = 'Train_Validation.txt'

    # hyperparameter loop
    for LR in lr:
        for weight_decay in Weight_Decay:

            # instantiating the network.
            Network = VN(1, 4*256)

            # only the 'head', hidden, and output layers are handed to the optimizer,
            #   so the optimizer never updates the remaining ViT parameters.
            params = [
                {'params': Network.vit.heads.parameters()},
                {'params': Network.hidden.parameters()},
                {'params': Network.output.parameters()},
            ]

            # Using AdamW as the optimizer of choice. AdamW is often used as the optimizer in
            #   applications making use of the ViT, as it was used in ViT's introductory paper
            optimizer = TO.AdamW(params=params, lr=LR, weight_decay=weight_decay)

            ################# MODEL TRAINING ################
            # variable for timing metrics
            start = time.time()

            # declaring lists to hold the per-batch metrics of every epoch.
            losses = []
            vlosses = []
            accuracy = []
            vaccuracy = []

            # sending the model to the selected device
            Network.to(device=device)

            # Below is the actual training loop for the model.
            for epoch in range(num_epochs):
                # ensuring that the model is set to "train" mode
                Network.train()

                # one pass over the training batches; returns the per-batch losses and accuracies
                epoch_train_losses, epoch_train_accuracies = trainloop(train_loader, Network, criterion, optimizer)
                losses.extend(epoch_train_losses)
                accuracy.extend(epoch_train_accuracies)

                # changing the network to evaluation mode in order to calculate the validation metrics
                Network.eval()
                with torch.no_grad():
                    epoch_val_losses, epoch_val_accuracies = validationloop(val_loader, Network, criterion)

                # the validation loader normally yields a single batch, in which case this mean
                #   is exactly that batch's loss; averaging keeps the comparison meaningful
                #   should the loader ever yield more than one batch.
                epoch_val_loss = sum(epoch_val_losses) / len(epoch_val_losses)

                # if there is a new best val loss achieved, update current best loss and save the
                #   current model state
                if epoch_val_loss < best_val_loss:
                    # Print statement to show that an update has been made
                    print("New best loss, updating metrics")
                    best_val_loss = epoch_val_loss
                    # copying the weights and biases of the model that produced these new "best" results
                    best_model.load_state_dict(Network.state_dict())

                    # update the dictionary holding the best parameters
                    best_param_dictionary['LR'] = LR
                    best_param_dictionary['weight_decay'] = weight_decay

                # appending the validation values to their respective lists.
                vlosses.extend(epoch_val_losses)
                vaccuracy.extend(epoch_val_accuracies)

            # defining reporting metrics
            name = f'LR:{LR},Weight_Decay:{weight_decay}'
            # calculating the running time of the training
            end = time.time()
            dur = end - start

            # printing these values (the accuracies shown are those of the last recorded batch)
            print(name, f'Duration:{dur:0.2f},Acuracy:{accuracy[-1]:.0f},vAcuracy:{vaccuracy[-1]:.0f}', flush=True)

            # writing these metrics to a file for later use in graphing.
            # the context manager closes the file even if the write fails.
            with open(filename, 'a', encoding='utf-8') as metrics_file:
                metrics_file.write(f'{name}\t{losses}\t{vlosses}\t{accuracy}\t{vaccuracy}\t{dur}\n')

    # calling the evaluation of the test model using the best found model
    best_model.eval()
    with torch.no_grad():
        test_model(test_loader=test_loader, Network=best_model, criterion=criterion, dict=best_param_dictionary)


# Function for evaluating the best model stored
# Runs the given model over every batch of the test dataloader, prints the batch accuracy
#   and timing, and appends one line per batch to 'Test.txt'.
# Inputs:
#   test_loader: the test data's dataloader (yields (inputs, labels) batches)
#   Network: The "best" model (the caller passes it already in eval() mode)
#   criterion: the loss function applied to (predictions, labels)
#   dict: The dictionary holding the LR and weight-decay values used when the best model was found
#         (the name shadows the builtin, but it is kept because callers pass it by keyword)
# Outputs:
#   None
def test_model(test_loader, Network, criterion, dict):
    # sending the model to the selected device
    Network.to(device=device)

    # these do not change between batches, so they are built once up front
    name = f'LR:{dict["LR"]},Weight_Decay:{dict["weight_decay"]}'
    filename = 'Test.txt'

    # inference only: gradient tracking is disabled here as well, so the function does not
    #   depend on the caller having wrapped the call in torch.no_grad().
    with torch.no_grad():
        # loop for going through the batches
        for inputs, labels in test_loader:
            # timing covers the transfer, forward pass and metric calculation of this batch
            start = time.time()

            # sending the data to the selected device
            inputs = inputs.to(device)
            labels = labels.to(device)

            # executing the prediction and calculating the loss.
            pred = Network(inputs)
            # .item() extracts the plain Python number so the file gets a number,
            #   not the textual representation of a tensor.
            loss_value = criterion(pred, labels).item()

            # calculating the accuracy: the index of the largest output along dim 1 is the
            #   predicted label, compared element-wise against the true labels
            _, predictions = pred.cpu().detach().max(1)
            num_correct = (predictions == labels.cpu().detach()).sum()
            num_samples = predictions.size(0)
            accuracy = float(num_correct / num_samples) * 100

            end = time.time()
            dur = end - start
            print(f'Final Inference on hold-out test set...')
            print(f'Test: {accuracy:0.0f}%, {dur:0.2f}seconds\n')

            # writing these metrics to a file for later use.
            # the context manager closes the file even if the write fails.
            with open(filename, 'a', encoding='utf-8') as results_file:
                results_file.write(f'{name}\t{loss_value}\t{accuracy}\t{dur}\n')


# Function performing one optimisation pass over every batch of a dataloader.
# For each batch: gradients are cleared, the batch is moved to the selected device, the
#   forward pass and loss are computed, the loss is back-propagated and the optimizer steps.
# Inputs:
#   dataset: the dataloader (yields (inputs, labels) batches)
#   model: the model being used
#   criterion: the loss function (cross entropy)
#   optimizer: the optimizer whose zero_grad()/step() drive the parameter updates
# Outputs:
#   running_loss: the list of batch losses (one float per batch)
#   running_accuracy: the list of batch accuracies in percent (one float per batch)
def trainloop(dataset,model,criterion,optimizer):
    # per-batch metrics collected over the whole pass
    batch_losses = []
    batch_accuracies = []

    for batch_inputs, batch_labels in dataset:
        # clearing the gradients left over from the previous batch
        optimizer.zero_grad()

        # moving the batch onto the selected device
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        # forward pass, loss, backward pass and parameter update
        outputs = model(batch_inputs)
        batch_loss = criterion(outputs, batch_labels)
        batch_loss.backward()
        optimizer.step()

        # accuracy of this batch alone: the index of the largest output along dim 1 is
        #   taken as the predicted label and compared against the true labels on the CPU
        predicted = outputs.cpu().detach().max(1)[1]
        hits = (predicted == batch_labels.cpu().detach()).sum()
        batch_accuracy = float(hits / predicted.size(0)) * 100

        # recording the metrics of this batch
        batch_accuracies.append(batch_accuracy)
        batch_losses.append(batch_loss.item())

    # returning the lists of the calculated loss and accuracy
    return batch_losses, batch_accuracies


# Function computing the loss and accuracy of a model over every batch of a dataloader,
#   without updating any parameters.
# The function itself neither switches the model to eval mode nor disables gradient
#   tracking; the caller is expected to take care of both.
# Inputs:
#   dataset: the dataloader (yields (inputs, labels) batches)
#   model: the model being used
#   criterion: the loss function (cross entropy)
# Outputs:
#   running_loss: the list of batch losses (one float per batch)
#   running_accuracy: the list of batch accuracies in percent (one float per batch)
def validationloop(dataset,model,criterion):
    # per-batch metrics collected over the whole pass
    batch_losses = []
    batch_accuracies = []

    for batch_inputs, batch_labels in dataset:
        # moving the batch onto the selected device
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        # forward pass and loss for this batch
        outputs = model(batch_inputs)
        batch_losses.append(criterion(outputs, batch_labels).item())

        # accuracy of this batch alone: the index of the largest output along dim 1 is
        #   taken as the predicted label and compared against the true labels on the CPU
        predicted = outputs.cpu().detach().max(1)[1]
        hits = (predicted == batch_labels.cpu().detach()).sum()
        batch_accuracies.append(float(hits / predicted.size(0)) * 100)

    # returning the calculated loss and accuracy lists
    return batch_losses, batch_accuracies


# Entry point of the project: when this file is executed as a script, the complete
#   train() run is timed and the elapsed wall-clock seconds are printed.
if __name__ == '__main__':
    run_started = time.time()
    train()
    run_finished = time.time()
    print(run_finished - run_started)