-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataSetLoadin.py
More file actions
71 lines (53 loc) · 2.03 KB
/
dataSetLoadin.py
File metadata and controls
71 lines (53 loc) · 2.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# import pandas as pd
# splits = {
# 'train': 'train.jsonl',
# 'validation': 'valid.jsonl',
# 'test': 'test.jsonl'
# }
# train_df = pd.read_json(
# "hf://datasets/semeru/code-code-DefectDetection/" + splits["train"],
# lines=True
# )
# train_df.to_json("./data/processed/train.json", orient="records", indent=2)
# validation_df = pd.read_json(
# "hf://datasets/semeru/code-code-DefectDetection/" + splits["validation"],
# lines=True
# )
# validation_df.to_json("./data/processed/validation.json", orient="records", indent=2)
# test_df = pd.read_json(
# "hf://datasets/semeru/code-code-DefectDetection/" + splits["test"],
# lines=True
# )
# test_df.to_json("./data/processed/test.json", orient="records", indent=2)
# print("done")
# from datasets import load_dataset
# dataset = load_dataset("benjis/diversevul")
# print(dataset["train"][0])
import json
import os
from decimal import Decimal

import ijson
input_file = r"C:/Users/syedm/Downloads/MSR_data_cleaned_json/MSR_data_cleaned.json"
# NOTE: the output is written as JSON Lines (one object per line), not as a
# single JSON document, despite the ".json" extension.
output_file = r"C:/Users/syedm/Synelime/coirei/ML_model_Vulnerability_Detection/bigvul_20k.json"


def json_default(value):
    """Convert values the ``json`` module cannot serialise on its own.

    ijson's default number handling yields ``decimal.Decimal`` for
    non-integer numbers, and ``json.dump`` raises ``TypeError`` for them.
    They are converted to ``float`` here so that records containing
    fractional numbers can be written out.

    Raises:
        TypeError: for any other unsupported type, mirroring the error the
            ``json`` module itself would raise.
    """
    if isinstance(value, Decimal):
        return float(value)
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def write_records(pairs, out, limit=20000, progress_every=100):
    """Write the values of ``pairs`` to ``out`` as JSON Lines.

    Args:
        pairs: iterable of ``(key, value)`` tuples; only the value of each
            pair is written, the key is ignored.
        out: writable text stream that receives one JSON document per line.
        limit: maximum number of records to write.
        progress_every: print a progress line each time this many records
            have been written.

    Returns:
        The number of records written.
    """
    if limit <= 0:
        return 0

    count = 0
    for _key, record in pairs:
        json.dump(record, out, default=json_default)
        out.write("\n")
        count += 1

        if count % progress_every == 0:
            print(f"Processed {count} records...")

        # Stop right after the last wanted record so the (possibly huge)
        # input is not parsed any further than necessary.
        if count >= limit:
            break
    return count


# Check if input file exists
if not os.path.exists(input_file):
    print(f"Error: Input file not found: {input_file}")
else:
    print(f"Input file found: {input_file}")
    print(f"File size: {os.path.getsize(input_file) / (1024**3):.2f} GB")

    # The JSON structure is {"0": {...}, "1": {...}, ...}.
    # ijson.kvitems with an empty prefix streams the top-level (key, value)
    # pairs one at a time instead of loading the whole file into memory.
    with open(input_file, "rb") as f, open(output_file, "w", encoding="utf-8") as out:
        count = write_records(ijson.kvitems(f, ""), out)

    print(f"Done! Wrote {count} records to {output_file}")