"""
Dataset Generation for Token Prediction Model
This script processes text files (ABAP and Markdown) to create a training dataset
for token prediction models. It extracts linguistic features and calculates actual
token counts using both GPT-4 and Mistral tokenizers.
Usage:
python generate_dataset.py [--input-dirs DIR1 DIR2 ...] [--output OUTPUT_FILE]
"""
import os
import sys
import argparse
from typing import List, Tuple

import pandas as pd
from tqdm import tqdm

from utils import (
    count_tokens_gpt4,
    count_tokens_mistral,
    extract_text_features,
    predict_tokens_gpt4,
    predict_tokens_mistral
)
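# Contract of the utils helpers, as inferred from how they are used below:
# extract_text_features(text) returns a 7-tuple of
# (text_length, word_count, punctuation_count, number_count,
#  whitespace_count, line_count, sentence_count); count_tokens_gpt4/_mistral
# return the actual token counts for a string, and predict_tokens_gpt4/_mistral
# take those same seven features and return an estimated token count.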

def process_text_file(file_path: str) -> dict:
    """
    Process a single text file and extract all relevant features and token counts.

    Args:
        file_path: Path to the text file

    Returns:
        Dictionary containing file metadata, features, and token counts,
        or None if the file could not be read
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

    # Extract features
    features = extract_text_features(text)
    (text_length, word_count, punctuation_count, number_count,
     whitespace_count, line_count, sentence_count) = features

    # Calculate actual token counts
    gpt4_tokens = count_tokens_gpt4(text)
    mistral_tokens = count_tokens_mistral(text)

    # Generate predictions
    predicted_gpt4 = predict_tokens_gpt4(*features)
    predicted_mistral = predict_tokens_mistral(*features)

    # Calculate prediction differences
    diff_gpt4 = gpt4_tokens - predicted_gpt4
    diff_mistral = mistral_tokens - predicted_mistral

    return {
        'filename': os.path.basename(file_path),
        'file_path': file_path,
        'gpt4_tokens': gpt4_tokens,
        'mistral_tokens': mistral_tokens,
        'text_length': text_length,
        'word_count': word_count,
        'punctuation_count': punctuation_count,
        'number_count': number_count,
        'whitespace_count': whitespace_count,
        'line_count': line_count,
        'sentence_count': sentence_count,
        'predicted_gpt4': predicted_gpt4,
        'predicted_mistral': predicted_mistral,
        'diff_gpt4': diff_gpt4,
        'diff_mistral': diff_mistral
    }
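# Shape of a single record produced above (key names come from the return
# dict; the values shown here are purely illustrative):
#
#   {'filename': 'report.abap', 'file_path': './_abap_code/report.abap',
#    'gpt4_tokens': 512, 'mistral_tokens': 498, 'text_length': 2048,
#    'diff_gpt4': 7, 'diff_mistral': -3, ...}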

def scan_directories(directories: List[str], extensions: List[str] = None) -> List[str]:
    """
    Scan directories for files with specified extensions.

    Args:
        directories: List of directory paths to scan
        extensions: List of file extensions to include (e.g., ['.abap', '.md'])

    Returns:
        List of file paths matching the criteria
    """
    if extensions is None:
        extensions = ['.abap', '.md']

    file_paths = []
    for directory in directories:
        if not os.path.exists(directory):
            print(f"Warning: Directory '{directory}' does not exist, skipping...")
            continue
        for root, _, files in os.walk(directory):
            for file in files:
                if any(file.endswith(ext) for ext in extensions):
                    file_paths.append(os.path.join(root, file))

    return file_paths

def generate_dataset(input_directories: List[str],
                     output_file: str,
                     file_extensions: List[str] = None) -> pd.DataFrame:
    """
    Generate a dataset by processing multiple directories of text files.

    Args:
        input_directories: List of directories containing text files
        output_file: Path for the output TSV file
        file_extensions: List of file extensions to process

    Returns:
        DataFrame containing the generated dataset, or None if no files were processed
    """
    print("Token Prediction Dataset Generator")
    print("=" * 50)

    # Scan for files
    print(f"\nScanning directories: {', '.join(input_directories)}")
    file_paths = scan_directories(input_directories, file_extensions)
    print(f"Found {len(file_paths)} files to process")

    if not file_paths:
        print("No files found to process!")
        return None

    # Process files
    print("\nProcessing files...")
    results = []
    for file_path in tqdm(file_paths, desc="Processing"):
        result = process_text_file(file_path)
        if result and result['gpt4_tokens'] > 0:  # Only include files with content
            results.append(result)

    # Guard against the case where every file was empty or unreadable
    if not results:
        print("No non-empty files could be processed!")
        return None

    # Create DataFrame
    df = pd.DataFrame(results)

    # Sort by filename
    df = df.sort_values('filename')

    # Save to TSV
    print(f"\nSaving dataset to: {output_file}")
    output_dir = os.path.dirname(output_file)
    if output_dir:  # Avoid makedirs('') when the path has no directory component
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_file, sep='\t', index=False)

    # Display statistics
    print("\nDataset Statistics:")
    print(f"Total files processed: {len(df)}")
    print(f"Average GPT-4 tokens per file: {df['gpt4_tokens'].mean():.1f}")
    print(f"Average Mistral tokens per file: {df['mistral_tokens'].mean():.1f}")
    print(f"GPT-4 prediction MAE: {df['diff_gpt4'].abs().mean():.1f}")
    print(f"Mistral prediction MAE: {df['diff_mistral'].abs().mean():.1f}")

    return df
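# Programmatic use, as a minimal sketch (paths mirror the CLI defaults in
# main() below and are assumptions about your local layout):
#
#   df = generate_dataset(
#       input_directories=['./_your_specific_dataset/'],
#       output_file='./_predictoken/dataset.tsv',
#       file_extensions=['.abap', '.md'],
#   )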

def main():
    """Main entry point for the script."""
    parser = argparse.ArgumentParser(
        description="Generate training dataset for token prediction models"
    )
    parser.add_argument(
        '--input-dirs',
        nargs='+',
        # default=['./_abap_code/'],
        default=['./_your_specific_dataset/'],
        help='Input directories containing text files (default: ./_your_specific_dataset/)'
    )
    parser.add_argument(
        '--output',
        default='./_predictoken/dataset.tsv',
        help='Output TSV file path (default: ./_predictoken/dataset.tsv)'
    )
    parser.add_argument(
        '--extensions',
        nargs='+',
        default=['.abap', '.md'],
        help='File extensions to process (default: .abap .md)'
    )
    args = parser.parse_args()

    try:
        df = generate_dataset(
            input_directories=args.input_dirs,
            output_file=args.output,
            file_extensions=args.extensions
        )
        if df is not None:
            print(f"\nDataset generation complete! Saved {len(df)} records.")
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        sys.exit(1)
    except Exception as e:
        print(f"\nError: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()