-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload_dataset.py
More file actions
31 lines (23 loc) · 770 Bytes
/
load_dataset.py
File metadata and controls
31 lines (23 loc) · 770 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import pandas as pd
import json
# Disabled earlier implementation of load_dataset(), kept only as an unused
# string literal (it follows the imports, so it is not the module docstring
# and is never executed). It built the frame from
# "./dataset_encoded_all_size.csv" using the column list in
# "./option_columns.json", filtered rows on "cid" and "vmlinux", replaced NaN
# with -1, and derived the "nbyes" column as the per-row count of values
# equal to 1. The active load_dataset(nb_yes) reads a pickle instead.
'''def load_dataset():
#Get columns for optimization
with open("./option_columns.json","r") as f:
option_columns = json.load(f)
#Find the dataset
df = pd.read_csv("./dataset_encoded_all_size.csv", dtype={k:"int8" for k in option_columns})
#Filter and clean the data
df.query("cid >= 30000", inplace=True)
df.fillna(-1, inplace=True)
df.query("vmlinux >= 0", inplace=True)
#Add the nbyes column
NBYES = "nbyes"
def nbyes(row):
return sum(row == 1)
df[NBYES] = df.apply(nbyes, axis=1)
return df'''
def load_dataset(nb_yes) -> pd.DataFrame:
    """Load the pickled dataset, optionally without the ``nbyes`` column.

    Args:
        nb_yes: Flag selecting whether the ``nbyes`` column is kept. When it
            compares equal to ``0`` the column is removed; any other value
            leaves the frame as stored.

    Returns:
        The DataFrame read from ``all_size_withyes.pkl`` (resolved relative
        to the current working directory).

    Raises:
        FileNotFoundError: If the pickle file is not present.
        KeyError: If ``nb_yes == 0`` and the frame has no ``nbyes`` column.
    """
    # NOTE: unpickling executes arbitrary code; only load a trusted file here.
    df = pd.read_pickle("all_size_withyes.pkl")
    if nb_yes == 0:
        # DataFrame.drop returns a new frame unless inplace=True, so the
        # result must be rebound; the bare call had no effect.
        df = df.drop(columns=["nbyes"])
    return df