"""
Dataset Generation for Token Prediction Model
This script processes text files (ABAP and Markdown) to create a training dataset
for token prediction models. It extracts linguistic features and calculates actual
token counts using both GPT-4 and Mistral tokenizers.
Usage:
python generate_dataset.py [--input-dirs DIR1 DIR2 ...] [--output OUTPUT_FILE]
"""
import os
import sys
import argparse
from typing import List, Tuple

import pandas as pd
from tqdm import tqdm

from utils import (
    count_tokens_gpt4,
    count_tokens_mistral,
    extract_text_features,
    predict_tokens_gpt4,
    predict_tokens_mistral
)
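# Contract of the utils helpers, as inferred from how they are used below:
# extract_text_features(text) returns a 7-tuple of
# (text_length, word_count, punctuation_count, number_count,
#  whitespace_count, line_count, sentence_count); count_tokens_gpt4/_mistral
# return the actual token counts for a string, and predict_tokens_gpt4/_mistral
# take those same seven features and return an estimated token count.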

def process_text_file(file_path: str) -> dict:
    """
    Process a single text file and extract all relevant features and token counts.

    Args:
        file_path: Path to the text file

    Returns:
        Dictionary containing file metadata, features, and token counts,
        or None if the file could not be read
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

    # Extract features
    features = extract_text_features(text)
    (text_length, word_count, punctuation_count, number_count,
     whitespace_count, line_count, sentence_count) = features

    # Calculate actual token counts
    gpt4_tokens = count_tokens_gpt4(text)
    mistral_tokens = count_tokens_mistral(text)

    # Generate predictions
    predicted_gpt4 = predict_tokens_gpt4(*features)
    predicted_mistral = predict_tokens_mistral(*features)

    # Calculate prediction differences
    diff_gpt4 = gpt4_tokens - predicted_gpt4
    diff_mistral = mistral_tokens - predicted_mistral

    return {
        'filename': os.path.basename(file_path),
        'file_path': file_path,
        'gpt4_tokens': gpt4_tokens,
        'mistral_tokens': mistral_tokens,
        'text_length': text_length,
        'word_count': word_count,
        'punctuation_count': punctuation_count,
        'number_count': number_count,
        'whitespace_count': whitespace_count,
        'line_count': line_count,
        'sentence_count': sentence_count,
        'predicted_gpt4': predicted_gpt4,
        'predicted_mistral': predicted_mistral,
        'diff_gpt4': diff_gpt4,
        'diff_mistral': diff_mistral
    }
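# Shape of a single record produced above (key names come from the return
# dict; the values shown here are purely illustrative):
#
#   {'filename': 'report.abap', 'file_path': './_abap_code/report.abap',
#    'gpt4_tokens': 512, 'mistral_tokens': 498, 'text_length': 2048,
#    'diff_gpt4': 7, 'diff_mistral': -3, ...}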

def scan_directories(directories: List[str], extensions: List[str] = None) -> List[str]:
    """
    Scan directories for files with specified extensions.

    Args:
        directories: List of directory paths to scan
        extensions: List of file extensions to include (e.g., ['.abap', '.md'])

    Returns:
        List of file paths matching the criteria
    """
    if extensions is None:
        extensions = ['.abap', '.md']

    file_paths = []
    for directory in directories:
        if not os.path.exists(directory):
            print(f"Warning: Directory '{directory}' does not exist, skipping...")
            continue
        for root, _, files in os.walk(directory):
            for file in files:
                if any(file.endswith(ext) for ext in extensions):
                    file_paths.append(os.path.join(root, file))

    return file_paths

def generate_dataset(input_directories: List[str],
                     output_file: str,
                     file_extensions: List[str] = None) -> pd.DataFrame:
    """
    Generate a dataset by processing multiple directories of text files.

    Args:
        input_directories: List of directories containing text files
        output_file: Path for the output TSV file
        file_extensions: List of file extensions to process

    Returns:
        DataFrame containing the generated dataset, or None if no files were processed
    """
    print("Token Prediction Dataset Generator")
    print("=" * 50)

    # Scan for files
    print(f"\nScanning directories: {', '.join(input_directories)}")
    file_paths = scan_directories(input_directories, file_extensions)
    print(f"Found {len(file_paths)} files to process")

    if not file_paths:
        print("No files found to process!")
        return None

    # Process files
    print("\nProcessing files...")
    results = []
    for file_path in tqdm(file_paths, desc="Processing"):
        result = process_text_file(file_path)
        if result and result['gpt4_tokens'] > 0:  # Only include files with content
            results.append(result)

    # Guard against the case where every file was empty or unreadable
    if not results:
        print("No non-empty files could be processed!")
        return None

    # Create DataFrame
    df = pd.DataFrame(results)

    # Sort by filename
    df = df.sort_values('filename')

    # Save to TSV
    print(f"\nSaving dataset to: {output_file}")
    output_dir = os.path.dirname(output_file)
    if output_dir:  # Avoid makedirs('') when the path has no directory component
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_file, sep='\t', index=False)

    # Display statistics
    print("\nDataset Statistics:")
    print(f"Total files processed: {len(df)}")
    print(f"Average GPT-4 tokens per file: {df['gpt4_tokens'].mean():.1f}")
    print(f"Average Mistral tokens per file: {df['mistral_tokens'].mean():.1f}")
    print(f"GPT-4 prediction MAE: {df['diff_gpt4'].abs().mean():.1f}")
    print(f"Mistral prediction MAE: {df['diff_mistral'].abs().mean():.1f}")

    return df
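# Programmatic use, as a minimal sketch (paths mirror the CLI defaults in
# main() below and are assumptions about your local layout):
#
#   df = generate_dataset(
#       input_directories=['./_your_specific_dataset/'],
#       output_file='./_predictoken/dataset.tsv',
#       file_extensions=['.abap', '.md'],
#   )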

def main():
    """Main entry point for the script."""
    parser = argparse.ArgumentParser(
        description="Generate training dataset for token prediction models"
    )
    parser.add_argument(
        '--input-dirs',
        nargs='+',
        # default=['./_abap_code/'],
        default=['./_your_specific_dataset/'],
        help='Input directories containing text files (default: ./_your_specific_dataset/)'
    )
    parser.add_argument(
        '--output',
        default='./_predictoken/dataset.tsv',
        help='Output TSV file path (default: ./_predictoken/dataset.tsv)'
    )
    parser.add_argument(
        '--extensions',
        nargs='+',
        default=['.abap', '.md'],
        help='File extensions to process (default: .abap .md)'
    )
    args = parser.parse_args()

    try:
        df = generate_dataset(
            input_directories=args.input_dirs,
            output_file=args.output,
            file_extensions=args.extensions
        )
        if df is not None:
            print(f"\nDataset generation complete! Saved {len(df)} records.")
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        sys.exit(1)
    except Exception as e:
        print(f"\nError: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()