# VulnerabilityExtractionMethods/LSICAPEC2CVE.py at master · ref3t/VulnerabilityExtractionMethods · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
from numpy.ma import count
import pandas as pd
from gensim.parsing.preprocessing import preprocess_documents

import gensim
from gensim.parsing.preprocessing import preprocess_documents
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import numpy as np, random
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import average_precision_score, precision_recall_curve, auc
# import utils
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sentence_transformers import SentenceTransformer, util
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Download the NLTK data packages when this module is imported: the stop-word
# lists, the 'punkt' tokenizer models and WordNet. Presumably these back the
# stopwords / word_tokenize / WordNetLemmatizer calls used further down.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

import warnings, random
# Suppress every warning category for the remainder of the run.
warnings.filterwarnings("ignore")


import re
# checkCVEUsingBert()
def removeUrls(text):
    """Strip URLs, ``NOTE:`` remarks and standalone numbers from *text*.

    Three substitutions are applied in order, each replacing its matches
    with the empty string:

    1. URL-like spans containing ``://`` (the scheme itself is optional).
    2. Everything from a case-insensitive ``NOTE:`` up to the end of that line.
    3. Standalone digit runs, optionally dotted (e.g. ``9.9.0``).

    Surrounding whitespace is left untouched, so removed spans may leave
    doubled spaces behind.

    :param text: the string to clean.
    :return: the cleaned string.
    """
    # (pattern, flags) pairs, applied top to bottom.
    substitutions = (
        (r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', re.MULTILINE),
        (r'(?i)NOTE:.*', 0),
        (r'\b\d+(\.\d+)*\b', 0),
    )
    cleaned = text
    for pattern, flags in substitutions:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)
    return cleaned


def removeCitation(text):
    """Return *text* truncated just before its first ``(Citation:`` marker.

    Everything from the marker to the end of the string is dropped. When the
    marker is absent the text is returned unchanged; when the text starts
    with the marker the result is the empty string.

    :param text: the string to trim.
    :return: the part of *text* that precedes the first citation marker.
    """
    position = text.find('(Citation:')
    # str.find reports "not found" as -1; index 0 is a genuine match at the
    # very start of the text and must be trimmed as well.
    if position == -1:
        return text
    return text[:position]

def removeURLandCitationBulk(texts):
    """Clean every string in *texts* and return the results as a list.

    Each entry first has its citation tail removed (``removeCitation``) and
    is then passed through ``removeUrls``.

    :param texts: iterable of strings.
    :return: list of cleaned strings, in input order.
    """
    cleaned = []
    for entry in texts:
        without_citation = removeCitation(entry)
        cleaned.append(removeUrls(without_citation))
    return cleaned
# red = removeURLandCitationBulk(['Untrusted search path vulnerability in  PGP Desktop 9.9.0 Build 397, 9.10.x, 10.0.0 Build 2732,and probably other versions allows local users,and possibly remote attackers,to execute arbitrary code and conduct DLL hijacking attacks via a Trojan horse tsp.dll or tvttsp.dll that is located in the same folder as a .p12,.pem,.pgp,.prk,.prvkr,.pubkr,.rnd or .skr file.'])

def dataPreprocessingStopWords(texts):
    """Apply ``preprocess_text_stop_words`` to each string in *texts*.

    :param texts: iterable of strings.
    :return: list with one token list per input string, in input order.
    """
    token_lists = []
    for document in texts:
        token_lists.append(preprocess_text_stop_words(document))
    return token_lists

def dataPreprocessingStemming(texts):
    """Apply ``preprocess_text_stemming`` to each string in *texts*.

    :param texts: iterable of strings.
    :return: list with one stemmed-token list per input string, in input order.
    """
    stemmed_documents = []
    for document in texts:
        stemmed_documents.append(preprocess_text_stemming(document))
    return stemmed_documents

def dataPreprocessingLemmatization(texts):
    """Apply ``preprocess_text_lemmatization`` to each string in *texts*.

    :param texts: iterable of strings.
    :return: list with one lemmatized-token list per input string, in input order.
    """
    lemmatized_documents = []
    for document in texts:
        lemmatized_documents.append(preprocess_text_lemmatization(document))
    return lemmatized_documents


def preprocess_text_stop_words(text):
    """Tokenize *text* and drop English stop words.

    Matching is case-insensitive, so capitalised stop words such as "The" at
    the start of a sentence are removed too. Tokens that are kept retain
    their original casing.

    :param text: the string to tokenize.
    :return: list of the remaining tokens, in their original order.
    """
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))

    # NLTK's English stop-word list is lower-case, so compare lower-cased
    # tokens against it.
    return [token for token in tokens if token.lower() not in stop_words]
def preprocess_text_stemming(text):
    """Tokenize *text* and reduce every token to its Porter stem.

    Stemming is the process of finding the root of words.

    :param text: the string to tokenize and stem.
    :return: list of stemmed tokens, in their original order.
    """
    words = word_tokenize(text)
    porter = PorterStemmer()
    stems = []
    for word in words:
        stems.append(porter.stem(word))
    return stems
def preprocess_text_lemmatization(text):
    """Tokenize *text* and lemmatize every token with WordNet.

    Lemmatization is the process of finding the form of the related word in
    the dictionary.

    :param text: the string to tokenize and lemmatize.
    :return: list of lemmatized tokens, in their original order.
    """
    words = word_tokenize(text)
    wordnet = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemmas.append(wordnet.lemmatize(word))
    return lemmas


def _make_classifier(name):
    """Return a new, unfitted scikit-learn estimator for the short name *name*.

    :param name: one of 'knn', 'nb', 'svm', 'rf', 'dt', 'nn'.
    :raises KeyError: if *name* is not one of the known short names.
    """
    factories = {
        'knn': KNeighborsClassifier,
        'nb': GaussianNB,
        # probability=True so that predict_proba is available for the AUC.
        'svm': lambda: svm.SVC(probability=True),
        'rf': RandomForestClassifier,
        'dt': DecisionTreeClassifier,
        'nn': MLPClassifier,
    }
    return factories[name]()


def _lsi_feature_frame(descriptions):
    """Build the LSI feature table for *descriptions*.

    The texts are cleaned with ``removeURLandCitationBulk``, stemmed with
    ``dataPreprocessingStemming``, turned into a TF-IDF weighted bag of
    words and projected with a gensim ``LsiModel``.

    :param descriptions: sequence of description strings.
    :return: DataFrame with one row per description; columns are named
        ``column0``, ``column1``, ...
    """
    cleaned = removeURLandCitationBulk(descriptions)
    stemmed = dataPreprocessingStemming(cleaned)
    # Join and re-split so every document is a list of whitespace-free tokens.
    tokenized_corpus = [' '.join(tokens).split() for tokens in stemmed]

    dictionary = gensim.corpora.Dictionary(tokenized_corpus)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_corpus]
    tfidf = gensim.models.TfidfModel(bow_corpus, smartirs='nfc', id2word=dictionary)
    corpus_tfidf = tfidf[bow_corpus]

    lsi = gensim.models.LsiModel(corpus_tfidf, num_topics=500, power_iters=100)
    similarity = gensim.similarities.MatrixSimilarity(lsi[corpus_tfidf])
    # The matrix stored on the similarity index is used as the feature table.
    return pd.DataFrame(np.array(similarity.index)).add_prefix('column')


def _score_fold(classifier_name, train_fold, test_fold, binary):
    """Fit one classifier on *train_fold* and score it on *test_fold*.

    Both folds are DataFrames whose first four columns are metadata
    (including the 'CVE-ID' label) and whose remaining columns are features.

    :param classifier_name: short name understood by ``_make_classifier``.
    :param binary: True when exactly two classes are present; selects the
        binary form of ``roc_auc_score``.
    :return: dict mapping the report label (e.g. 'precision macro') to the
        score obtained on this fold.
    """
    clf = _make_classifier(classifier_name).fit(train_fold.iloc[:, 4:], train_fold['CVE-ID'])

    test_features = test_fold.iloc[:, 4:]
    expected = test_fold['CVE-ID']
    predicted = clf.predict(test_features)
    report = classification_report(expected, predicted, output_dict=True)

    probs = clf.predict_proba(test_features)
    if binary:
        auc_value = roc_auc_score(expected, probs[:, 1])
    else:
        auc_value = roc_auc_score(expected, probs, multi_class='ovr', average='weighted')

    return {
        'accuracy': report['accuracy'],
        'precision macro': report['macro avg']['precision'],
        'precision weighted': report['weighted avg']['precision'],
        'recall macro': report['macro avg']['recall'],
        'recall weighted': report['weighted avg']['recall'],
        'f1 macro': report['macro avg']['f1-score'],
        'f1 weighted': report['weighted avg']['f1-score'],
        'auc': auc_value,
    }


def main():
    """Run the LSI classification experiment and write 'results/LSI.txt'.

    Reads 'DataSetFinalCapecCve.xlsx' and, for n in 2..6, keeps the n
    'CVE-ID' values with the most rows. It derives LSI features from the
    'CAPECDescription' column and evaluates six classifiers with 5-fold
    stratified cross-validation. For every classifier the fold-averaged
    accuracy, precision, recall, F1 (macro and weighted) and AUC are written
    to the results file.
    """
    import os  # local import: only needed to make sure the output folder exists

    n_splits = 5
    output_path = 'results/LSI.txt'
    metric_labels = [
        'accuracy',
        'precision macro',
        'precision weighted',
        'recall macro',
        'recall weighted',
        'f1 macro',
        'f1 weighted',
        'auc',
    ]

    dataFrame = pd.read_excel('DataSetFinalCapecCve.xlsx', sheet_name=0)
    dfAttackCut = dataFrame.loc[:, ['CVE-ID','CVE-Description', 'CAPECName', 'CAPECDescription']]

    # Number of rows per CVE-ID, ranked from most to fewest rows.
    classCounts = [
        {'CVE-ID': f'{name}', 'count': group.shape[0]}
        for name, group in dfAttackCut.groupby('CVE-ID')
    ]
    ranked_classes = sorted(classCounts, key=lambda x: x['count'], reverse=True)

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # The context manager closes the file even if a fold or classifier fails.
    with open(output_path, 'w') as file:
        for top_n_class in [2, 3, 4, 5, 6]:
            file.write('\n=================\n')
            file.write(f'n = {top_n_class}\n')
            print(f'n = {top_n_class}\n')

            classCounts_sorted = ranked_classes[:top_n_class]
            classCounts_top_n = [item['CVE-ID'] for item in classCounts_sorted]
            print(classCounts_sorted)

            # Keep only the rows belonging to the selected classes.
            selected = dfAttackCut[dfAttackCut['CVE-ID'].isin(classCounts_top_n)]
            dfLsi = _lsi_feature_frame(selected['CAPECDescription'].values)
            dataset = pd.concat(
                [selected.reset_index(drop=True), dfLsi.reset_index(drop=True)],
                axis=1,
            )

            # Materialise the folds once so every classifier sees the same splits.
            skf = StratifiedKFold(n_splits=n_splits)
            target = dataset.loc[:, 'CVE-ID']
            folds = [
                (dataset.iloc[train_index], dataset.iloc[test_index])
                for train_index, test_index in skf.split(dataset, target)
            ]

            for item in ['knn', 'nb', 'svm', 'rf', 'dt', 'nn']:
                file.write('\n###################\n')
                file.write(f'classifier: {item}\n')

                fold_scores = [
                    _score_fold(item, train_fold, test_fold, binary=(top_n_class == 2))
                    for train_fold, test_fold in folds
                ]

                for label in metric_labels:
                    mean_score = sum(scores[label] for scores in fold_scores) / len(fold_scores)
                    file.write(f'{label}: {mean_score}\n')
                file.write('###################\n')

        file.write('=================\n')

    # implement word embedding
    # prepare oracle for svo extraction
    # take the five data points for precision recall for drawing the roc curve
    # take the true positives and false positives data
    # report the paper's performance with our observed performance

# Run the experiment only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()