# evaluate_repair.py
import os
import sys
import argparse
import json
import glob
import subprocess
import tempfile
import shutil
import time
import traceback # Needed for traceback.print_exc()
import hashlib # For code hashing
from typing import Optional, Tuple, List, Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import fcntl # For file locking on Unix-like systems
# --- Import markdownify ---
import markdownify
# BeautifulSoup removed - using full problem description without HTML parsing
# --- Import new LLM module ---
import llm
# --- Configuration ---
# Use relative paths for portability
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_TESTCASE_PATH = os.path.join(SCRIPT_DIR, "lftbench", "tests") # Use tests from lftbench
COMPILE_TIMEOUT = 10 # Timeout for compilation
RUN_TIMEOUT = 5 # Timeout for running code on one test case
TOP_K = 10
# --- LLM Configuration for Repair ---
REPAIR_MODEL_NAME = "qwen2.5-coder-7b-instruct" # Model to use for repairs
REPAIR_TEMPERATURE = 0.8
MAX_INPUT_LENGTH_FOR_LLM = 40960 # Max chars for failing input in prompt (40 KB)
# --- Add max length for output context ---
MAX_OUTPUT_LENGTH_FOR_LLM = 8192 # Max chars for actual/expected output in prompt (8 KB)
# Default max threads - can be overridden by command line argument
DEFAULT_MAX_THREADS = 5
# --- Try importing necessary components ---
try:
    # Assuming reduce.py is in the same directory or Python path
    from reduce import compile_program, _parse_problem_id
    print("[Info] Successfully imported required functions from reduce.")
except ImportError as e:
    print(f"[Error] Failed to import from reduce ({e}). Ensure reduce.py is accessible.", file=sys.stderr)
    sys.exit(1)
# --- File Locking Helper ---
class FileLocker:
    """Context manager for file locking to prevent concurrent access to result.json"""
    def __init__(self, file_path: str, mode: str = 'r+', timeout: int = 30):
        self.file_path = file_path
        self.mode = mode
        self.timeout = timeout
        self.file_handle = None
        self.lock_acquired = False

    def __enter__(self):
        """Acquire file lock with timeout"""
        try:
            # Create file if it doesn't exist (for write operations)
            if 'w' in self.mode or 'a' in self.mode:
                os.makedirs(os.path.dirname(self.file_path) if os.path.dirname(self.file_path) else '.', exist_ok=True)
                if not os.path.exists(self.file_path):
                    with open(self.file_path, 'w') as f:
                        json.dump({}, f)
            # Open file with appropriate mode
            self.file_handle = open(self.file_path, self.mode, encoding='utf-8')
            # Try to acquire lock with timeout
            start_time = time.time()
            while time.time() - start_time < self.timeout:
                try:
                    fcntl.flock(self.file_handle.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
                    self.lock_acquired = True
                    print(f"[Lock] Acquired lock for {self.file_path}")
                    return self.file_handle
                except (IOError, OSError):
                    # Lock not available, wait and retry
                    time.sleep(0.1)
                    continue
            # Timeout occurred
            raise TimeoutError(f"Could not acquire lock for {self.file_path} within {self.timeout} seconds")
        except Exception as e:
            if self.file_handle:
                self.file_handle.close()
                self.file_handle = None
            raise e

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Release file lock"""
        if self.file_handle:
            try:
                if self.lock_acquired:
                    fcntl.flock(self.file_handle.fileno(), fcntl.LOCK_UN)
                    print(f"[Lock] Released lock for {self.file_path}")
                self.file_handle.close()
                self.file_handle = None
                self.lock_acquired = False
            except Exception as e:
                print(f"[Warning] Error releasing lock for {self.file_path}: {e}", file=sys.stderr)
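
# Illustrative FileLocker usage sketch (hypothetical file name; not executed here):
#
#   with FileLocker("result_demo.json", 'r') as f:
#       data = json.load(f)   # exclusive lock is held for the duration of the block
#   # The lock is released in __exit__, even if json.load raises.
#
# Note: fcntl.flock provides advisory locking only, so every reader/writer must
# cooperate by going through FileLocker for the exclusion to be effective.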
def load_results_with_lock(json_path: str) -> dict:
    """Load results from JSON file with file locking"""
    if not os.path.exists(json_path):
        return {}
    try:
        with FileLocker(json_path, 'r') as f:
            if os.path.getsize(json_path) > 0:
                f.seek(0)
                results = json.load(f)
                print(f"[Info] Loaded data from {json_path}")
                return results
            else:
                print(f"[Error] {json_path} is empty.", file=sys.stderr)
                return {}
    except json.JSONDecodeError:
        print(f"[Error] Failed to decode JSON from {json_path}.", file=sys.stderr)
        return {}
    except TimeoutError as e:
        print(f"[Error] Lock timeout when loading {json_path}: {e}", file=sys.stderr)
        return {}
    except Exception as e:
        print(f"[Error] Failed to load or parse {json_path}: {e}", file=sys.stderr)
        return {}
def save_results_with_lock(json_path: str, results_data: dict) -> bool:
    """Save results to JSON file with file locking and retry mechanism"""
    max_retries = 5
    base_wait_time = 2
    for attempt in range(max_retries):
        try:
            with FileLocker(json_path, 'w') as f:
                f.seek(0)
                f.truncate()
                json.dump(results_data, f, indent=2, ensure_ascii=False)
                f.flush()
                os.fsync(f.fileno())  # Ensure data is written to disk
                print(f"[Success] Saved updated evaluation results to {json_path}")
                return True
        except TimeoutError as e:
            wait_time = base_wait_time * (2 ** attempt)  # Exponential backoff
            print(f"[Warning] Lock timeout on attempt {attempt + 1}/{max_retries}: {e}")
            if attempt < max_retries - 1:
                print(f"[Retry] Waiting {wait_time} seconds before retry...")
                time.sleep(wait_time)
                continue
            else:
                print(f"[Error] Failed to save after {max_retries} attempts", file=sys.stderr)
                return False
        except Exception as e:
            wait_time = base_wait_time * (2 ** attempt)
            print(f"[Warning] Save error on attempt {attempt + 1}/{max_retries}: {e}")
            if attempt < max_retries - 1:
                print(f"[Retry] Waiting {wait_time} seconds before retry...")
                time.sleep(wait_time)
                continue
            else:
                print(f"[Error] Failed to save after {max_retries} attempts: {e}", file=sys.stderr)
                return False
    return False
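
# Illustrative load/save round trip (hypothetical file name; not executed here):
#
#   results = load_results_with_lock("result_demo.json")   # {} if missing or corrupt
#   results.setdefault("abc123a", {"results": []})
#   save_results_with_lock("result_demo.json", results)
#
# With base_wait_time = 2 and max_retries = 5, the retry delays are
# 2, 4, 8 and 16 seconds before the save is finally given up.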
# --- Helper Functions ---
def run_program(executable_path: str, input_path: str, timeout: int) -> Tuple[Optional[str], Optional[str]]:
    """Runs the compiled program with given input, capturing stdout.
    Returns (stdout, error_message). error_message is None on success.
    """
    try:
        with open(input_path, 'r', encoding='utf-8') as f_in:
            input_content = f_in.read()
        result = subprocess.run(
            [executable_path],
            input=input_content,
            capture_output=True,
            text=True,
            timeout=timeout,
            check=False  # Don't raise exception on non-zero exit code; check manually
        )
        if result.returncode != 0:
            err_msg = f"Runtime Error (Exit Code: {result.returncode})"
            # Include stderr if available
            if result.stderr:
                err_msg += f"\nStderr:\n{result.stderr.strip()}"
            return None, err_msg
        # Check for stderr even on success, could indicate warnings/issues
        # if result.stderr:
        #     print(f" [Warning] Runtime produced stderr (but exit code 0):\n{result.stderr.strip()}", file=sys.stderr)
        return result.stdout, None  # Success
    except subprocess.TimeoutExpired:
        return None, "Timeout Error"
    except FileNotFoundError:
        return None, f"Executable not found at {executable_path}"
    except Exception as e:
        return None, f"Unexpected error during execution: {e}"
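
# Illustrative run_program usage (hypothetical paths; not executed here):
#
#   stdout, err = run_program("/tmp/eval_demo/fixed", "tests/in/sample_01.txt", RUN_TIMEOUT)
#   if err is None:
#       print(stdout)   # program output for this test case
#   else:
#       print(err)      # e.g. "Timeout Error" or "Runtime Error (Exit Code: 1)"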
def get_reference_outputs(ac_executable_path: str, test_case_dir: str, test_case_names: List[str]) -> Tuple[Optional[Dict[str, str]], Optional[str]]:
    """Runs the AC executable on all specified test cases and returns a dict of {case_name: stdout}.
    Returns (output_dict, error_message). error_message is None if all runs succeed.
    """
    reference_outputs = {}
    print(f" Generating reference outputs using {ac_executable_path}...")
    for case_name in test_case_names:
        input_path = os.path.join(test_case_dir, case_name)
        # print(f" Running AC on: {case_name}")  # Reduce verbosity
        stdout, error = run_program(ac_executable_path, input_path, RUN_TIMEOUT)
        if error:
            err_msg = f"Failed to get reference output for {case_name}: {error}"
            print(f" [Error] {err_msg}", file=sys.stderr)
            return None, err_msg  # Fatal error if AC fails/times out
        reference_outputs[case_name] = stdout
    print(f" Successfully generated reference outputs for {len(test_case_names)} cases.")
    return reference_outputs, None
def evaluate_fixed_code(
    fixed_code_str: str,
    reference_outputs: Dict[str, str],
    work_dir: str,
    test_case_dir: str,
    compile_timeout: int,
    run_timeout: int
) -> Tuple[bool, str]:
    """
    Compiles the fixed code, runs it against all test cases, compares with reference outputs.
    Returns (passed_all, status_message).
    Note: original version, superseded by evaluate_fixed_code_refactored below, which adds caching.
    """
    fixed_src_path = os.path.join(work_dir, "fixed.cpp")
    fixed_exe_path = os.path.join(work_dir, "fixed")
    # 1. Write fixed code
    try:
        with open(fixed_src_path, "w", encoding="utf-8") as f:
            f.write(fixed_code_str)
    except IOError as e:
        return False, f"Failed to write fixed code: {e}"
    # 2. Compile fixed code
    # Assuming compile_program works correctly with relative paths within work_dir
    if not compile_program("fixed.cpp", "fixed", work_dir):
        return False, "Compilation Failed"
    # 3. Run on each test case and compare
    # reference_outputs keys are test case base filenames; join them with test_case_dir
    if not reference_outputs:
        return False, "Internal Error: No reference outputs provided"
    all_passed = True
    first_failure_details = ""
    # Normalize whitespace/newlines for comparison - ignore trailing whitespace on each line
    def normalize_output(output):
        return '\n'.join(line.rstrip() for line in output.splitlines()).strip()
    for case_base_name, ref_stdout in reference_outputs.items():
        input_path = os.path.join(test_case_dir, case_base_name)
        fixed_stdout, error = run_program(fixed_exe_path, input_path, run_timeout)
        if error:
            all_passed = False
            first_failure_details = f"Failed on '{case_base_name}': {error}"
            break  # Stop on first error
        if normalize_output(fixed_stdout) != normalize_output(ref_stdout):
            all_passed = False
            first_failure_details = f"Wrong Answer on '{case_base_name}'"
            # Optionally include diff here for debugging
            break  # Stop on first WA
    if all_passed:
        return True, "Correct (Passed All Test Cases)"
    else:
        return False, first_failure_details
# --- Evaluation cache helpers (code hashing and persistent result cache) ---
def get_code_hash(code: str) -> str:
    """Generate a hash for the given code to use as cache key"""
    return hashlib.sha256(code.encode('utf-8')).hexdigest()[:16]
def load_persistent_cache(cache_file: str) -> Dict:
    """Load evaluation cache from file"""
    if os.path.exists(cache_file):
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            print(f" [Warning] Failed to load cache from {cache_file}: {e}", file=sys.stderr)
    return {}

def save_persistent_cache(cache_file: str, cache_data: Dict):
    """Save evaluation cache to file"""
    try:
        # Ensure parent directory exists
        cache_dir = os.path.dirname(cache_file)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump(cache_data, f, indent=2)
    except Exception as e:
        print(f" [Warning] Failed to save cache to {cache_file}: {e}", file=sys.stderr)
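
# Illustrative caching pattern (hypothetical values; not executed here):
#
#   cache = load_persistent_cache(".eval_cache/demo_cache.json")
#   key = get_code_hash("int main() { return 0; }")   # 16-hex-char SHA-256 prefix
#   if key not in cache:
#       cache[key] = {"passed": True, "status": "Correct (Passed All Test Cases)",
#                     "timestamp": time.time()}
#       save_persistent_cache(".eval_cache/demo_cache.json", cache)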
# --- Refactored evaluate_fixed_code (passing test_case_dir) ---
def evaluate_fixed_code_refactored(
    fixed_code_str: str,
    reference_outputs: Dict[str, str],
    work_dir: str,
    test_case_dir: str,  # Added parameter
    compile_timeout: int,
    run_timeout: int,
    evaluation_cache: Optional[Dict] = None  # Added cache parameter
) -> Tuple[bool, str]:
    """
    Compiles the fixed code, runs it against all test cases, compares with reference outputs.
    Returns (passed_all, status_message).
    Supports caching to avoid re-testing identical code.
    """
    # Check cache first if provided
    if evaluation_cache is not None:
        code_hash = get_code_hash(fixed_code_str)
        if code_hash in evaluation_cache:
            cached_result = evaluation_cache[code_hash]
            # Cache hit message now handled by caller
            return cached_result["passed"], cached_result["status"]
    fixed_src_path = os.path.join(work_dir, "fixed.cpp")
    fixed_exe_path = os.path.join(work_dir, "fixed")
    # 1. Write fixed code
    try:
        with open(fixed_src_path, "w", encoding="utf-8") as f:
            f.write(fixed_code_str)
    except IOError as e:
        return False, f"Failed to write fixed code: {e}"
    # 2. Compile fixed code
    if not compile_program("fixed.cpp", "fixed", work_dir):
        # Save the failing code on compilation error (silently)
        try:
            fail_save_path = os.path.join(work_dir, "fixed_compile_fail.cpp")
            with open(fail_save_path, "w", encoding="utf-8") as f_fail:
                f_fail.write(fixed_code_str)
            return False, "Compilation Failed"  # Concise error message
        except Exception as save_e:
            # Fallback error message if saving fails
            error_message = f"Compilation Failed (Save Error: {save_e})"
            print(f" [Error] {error_message}", file=sys.stderr)
            return False, error_message
    # 3. Run on each test case and compare
    if not reference_outputs:
        return False, "Internal Error: No reference outputs provided"
    if not os.path.isdir(test_case_dir):
        return False, f"Internal Error: Invalid test case directory provided: {test_case_dir}"
    all_passed = True
    first_failure_details = ""
    # Sort reference_outputs keys for a consistent evaluation order (glob output is not guaranteed to be sorted)
    sorted_case_names = sorted(reference_outputs.keys())
    # Normalize whitespace for comparison - ignore trailing whitespace on each line
    def normalize_output(output):
        return '\n'.join(line.rstrip() for line in output.splitlines()).strip()
    for case_base_name in sorted_case_names:
        ref_stdout = reference_outputs[case_base_name]
        input_path = os.path.join(test_case_dir, case_base_name)
        if not os.path.exists(input_path):
            print(f" [Warning] Input case file {input_path} missing during evaluation. Skipping case.", file=sys.stderr)
            continue  # Or mark as failure?
        fixed_stdout, error = run_program(fixed_exe_path, input_path, run_timeout)
        if error:
            all_passed = False
            first_failure_details = f"Failed on '{case_base_name}': {error}"
            break
        if normalize_output(fixed_stdout) != normalize_output(ref_stdout):
            all_passed = False
            first_failure_details = f"Wrong Answer on '{case_base_name}'"
            # Add a diff for the first line
            fixed_first_line = fixed_stdout.splitlines()[0].rstrip() if fixed_stdout.strip() else ""
            ref_first_line = ref_stdout.splitlines()[0].rstrip() if ref_stdout.strip() else ""
            if fixed_first_line != ref_first_line:
                first_failure_details += f"\n First line diff:\n Actual: '{fixed_first_line}'\n Expected: '{ref_first_line}'"
            break
    # Determine final result
    if all_passed:
        result_passed, result_status = True, "Correct (Passed All Test Cases)"
    else:
        result_passed, result_status = False, first_failure_details
    # Cache the result if a cache is provided
    if evaluation_cache is not None:
        code_hash = get_code_hash(fixed_code_str)
        evaluation_cache[code_hash] = {
            "passed": result_passed,
            "status": result_status,
            "timestamp": time.time()
        }
    return result_passed, result_status
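
# Illustrative evaluation call (hypothetical paths and variables; not executed here):
#
#   passed, status = evaluate_fixed_code_refactored(
#       fixed_code_str=candidate_code,        # candidate_code: a repaired C++ source string
#       reference_outputs=reference_outputs,  # {case_name: expected stdout} from the AC binary
#       work_dir="/tmp/eval_demo",
#       test_case_dir="lftbench/tests/abc123/A/in",
#       compile_timeout=COMPILE_TIMEOUT,
#       run_timeout=RUN_TIMEOUT,
#       evaluation_cache=cache,               # optional dict, reused across identical candidates
#   )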
# --- Enhanced LLM Repair Function with Optimization ---
def generate_llm_repair(
    problem_description: str,
    wa_code: str,
    submission_id: str,
    repair_context_label: str,  # Context label (e.g., "no_tc", "orig_tc", "reduced_tc")
    failing_input: Optional[str] = None,
    artifact_dir: Optional[str] = None,
    wa_output: Optional[str] = None,
    expected_output: Optional[str] = None,
    # --- Support for dual test case strategy ---
    secondary_input: Optional[str] = None,
    secondary_wa_output: Optional[str] = None,
    secondary_expected_output: Optional[str] = None,
    model_tag: str = ""
) -> Optional[str]:
    """
    Enhanced repair function with an optimized prompt strategy:
    - Full problem description (including samples)
    - Smart truncation with structure preservation
    - Dual test case support for better generalization
    - Precise diff information
    """
    # --- Step 1: Use the full problem description ---
    print(f" [Prompt] Using full problem description ({len(problem_description)} chars)")
    # --- Step 2: Build the prompt with the full problem description ---
    prompt = f"""### Problem Description
{problem_description}
### Your Incorrect Code
```cpp
{wa_code}
```
"""
    # --- Step 3: Handle test case information ---
    input_for_llm_str = None
    wa_output_for_llm_str = None
    expected_output_for_llm_str = None
    prompt_context_type = "no_tc"
    # --- Special handling for the diff_only strategy ---
    if repair_context_label.startswith("diff_only") and wa_output is not None and expected_output is not None:
        # diff_only strategy: no input, only the first diff line with a char limit
        prompt_context_type = "diff_only"
        # Only show the first diff line with 64 chars per output (128 chars total)
        diff_info = get_diff_lines(wa_output, expected_output, max_lines=1, max_chars_per_output=64)
        prompt += f"""
### Error Summary (diff only)
{diff_info}
### Your Task
Analyze and fix the algorithmic errors in the C++ code to make the diff disappear.
### Guidelines
1. Focus on correctness and efficiency
2. Consider edge cases and constraint limits
3. Use clean, readable code structure
### Output Format
Provide ONLY the complete fixed C++ code in a single ```cpp block.
"""
    elif failing_input:
        # Regular strategies with test case input
        prompt_context_type = "with_tc"
        # Smart truncation for the primary input
        input_for_llm_str, input_note = truncate_smart(failing_input, MAX_INPUT_LENGTH_FOR_LLM, "input")
        if input_note:
            print(f" [Smart Truncation] Primary input{input_note}")
        # --- Dual test case strategy ---
        dual_case_info = ""
        if secondary_input and secondary_input != failing_input and len(secondary_input.strip()) > 0:
            # Adjust truncation length: keep the total (primary + secondary) roughly within MAX_INPUT_LENGTH_FOR_LLM
            avail_len = max(0, MAX_INPUT_LENGTH_FOR_LLM - len(input_for_llm_str))
            # For the reduced_plus_diff strategy, use the full remaining budget; otherwise default to half the budget
            sec_limit = avail_len if repair_context_label.startswith("reduced_plus_diff") else MAX_INPUT_LENGTH_FOR_LLM // 2
            secondary_input_truncated, secondary_note = truncate_smart(secondary_input, sec_limit, "input")
            if secondary_note:
                print(f" [Smart Truncation] Secondary input{secondary_note}")
            dual_case_info = f"""
### Failing Case #2 (Alternative Size)
Input:
```
{secondary_input_truncated}
```"""
            if secondary_wa_output and secondary_expected_output:
                sec_wa_truncated, _ = truncate_smart(secondary_wa_output, MAX_OUTPUT_LENGTH_FOR_LLM // 2, "output")
                sec_exp_truncated, _ = truncate_smart(secondary_expected_output, MAX_OUTPUT_LENGTH_FOR_LLM // 2, "output")
                diff_info_2 = get_diff_lines(sec_wa_truncated, sec_exp_truncated)
                dual_case_info += f"""
Error: {diff_info_2}"""
        # Primary test case
        prompt += f"""### Failing Case #1
Input:
```
{input_for_llm_str}
```"""
        # Add output diff information
        if wa_output is not None and expected_output is not None:
            prompt_context_type = "with_tc_outputs"
            wa_output_for_llm_str, wa_note = truncate_smart(wa_output, MAX_OUTPUT_LENGTH_FOR_LLM, "output")
            expected_output_for_llm_str, exp_note = truncate_smart(expected_output, MAX_OUTPUT_LENGTH_FOR_LLM, "output")
            if wa_note:
                print(f" [Smart Truncation] Actual output{wa_note}")
            if exp_note:
                print(f" [Smart Truncation] Expected output{exp_note}")
            # For full context strategies (orig_tc, reduced_tc, reduced_plus_diff), show full outputs
            prompt += f"""
Your Output:
```
{wa_output_for_llm_str}
```
Expected Output:
```
{expected_output_for_llm_str}
```"""
            # Also include diff info for clarity, after showing the full outputs
            diff_info = get_diff_lines(wa_output_for_llm_str, expected_output_for_llm_str, max_lines=10)
            prompt += f"""
Error Analysis: {diff_info}"""
        # Add dual case information
        prompt += dual_case_info
        # --- Enhanced task instructions ---
        prompt += """
### Your Task
Fix the C++ code to pass ALL test cases (including hidden ones).
### Critical Guidelines
1. Focus on algorithmic correctness - NO hard-coded values or special cases
2. Ensure proper data structures and complexity (aim for O(N log N) or better when possible)
3. Handle edge cases (empty input, single elements, max constraints)
4. Use standard C++20 features; avoid non-portable code
### Output Format
Provide ONLY the complete fixed C++ code in a single ```cpp block.
"""
    else:
        # No test case provided (baseline strategy)
        prompt += """### Your Task
Analyze and fix the algorithmic errors in the C++ code.
### Guidelines
1. Focus on correctness and efficiency
2. Consider edge cases and constraint limits
3. Use clean, readable code structure
### Output Format
Provide ONLY the complete fixed C++ code in a single ```cpp block.
"""
    # --- Save Prompt and Components ---
    def save_component(filename, content):
        if content is None:
            return  # Don't save if content is None
        path = os.path.join(artifact_dir, filename)
        try:
            with open(path, "w", encoding="utf-8") as f:
                f.write(content)
        except IOError as e:
            print(f" [Warning] Failed to save {filename}: {e}", file=sys.stderr)

    if artifact_dir:
        os.makedirs(artifact_dir, exist_ok=True)
        # Filenames include the repair_context_label and model_tag,
        # e.g. "_no_tc_qwen-plus", "_orig_tc_llama3"
        context_suffix = f"_{repair_context_label}_{model_tag}"
        save_component(f"{submission_id}.prompt{context_suffix}.txt", prompt)
        save_component(f"{submission_id}.llm_input{context_suffix}.txt", input_for_llm_str)
        save_component(f"{submission_id}.actual_output{context_suffix}.txt", wa_output_for_llm_str)
        save_component(f"{submission_id}.expected_output{context_suffix}.txt", expected_output_for_llm_str)
    fixed_code = None
    raw_response = None
    try:
        prompt_history = [{'role': 'user', 'content': prompt}]
        raw_response = llm.call_llm(
            prompt_history=prompt_history,
            model_name=REPAIR_MODEL_NAME,
            temperature=REPAIR_TEMPERATURE,
        )
        if raw_response:
            # Save the raw LLM response
            if artifact_dir:
                save_component(f"{submission_id}.llm_response_{repair_context_label}_{model_tag}.txt", raw_response)
            # --- Extract the C++ code block directly ---
            start_marker = "```cpp\n"
            end_marker = "\n```"
            start_index = raw_response.find(start_marker)
            if start_index != -1:
                code_start_index = start_index + len(start_marker)
                end_index = raw_response.find(end_marker, code_start_index)
                if end_index != -1:
                    fixed_code = raw_response[code_start_index:end_index].strip()
                    print("[Info] Successfully extracted C++ code block using ```cpp markers.")
                else:
                    # Fall back to a bare ``` at the very end of the response
                    end_marker_alt = "```"
                    if raw_response.strip().endswith(end_marker_alt):
                        temp_end_index = raw_response.rfind(end_marker_alt)
                        if temp_end_index > code_start_index:
                            fixed_code = raw_response[code_start_index:temp_end_index].strip()
                            print("[Info] Successfully extracted C++ code block using ```cpp start and final ``` end markers.")
            if fixed_code:
                # Save the extracted fixed code
                if artifact_dir:
                    save_component(f"{submission_id}.fixed_code_{repair_context_label}_{model_tag}.cpp", fixed_code)
                return fixed_code
            else:
                # Failed to extract a code block
                return None
        else:
            print("[Error] LLM call returned None (likely an API error or client issue).", file=sys.stderr)
            return None
    except Exception as e:
        print(f"[Error] Exception during LLM call or extraction in generate_llm_repair: {e}", file=sys.stderr)
        traceback.print_exc()
        # Save the raw LLM response on exception, if one was received
        if artifact_dir and raw_response is not None:
            save_component(f"{submission_id}.llm_response_on_error_{repair_context_label}_{model_tag}.txt", raw_response)
        return None
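
# Illustrative repair call (hypothetical variables; not executed here):
#
#   fixed = generate_llm_repair(
#       problem_description=desc, wa_code=wa_source,
#       submission_id="12345678", repair_context_label="reduced_tc",
#       failing_input=reduced_input, artifact_dir="results/abc123a/artifacts",
#       wa_output=wa_stdout, expected_output=ref_stdout,
#       model_tag="qwen-coder7b",
#   )
#   # Returns the extracted ```cpp block as a string, or None on any failure.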
# --- Add new prompt optimization helpers ---
# extract_problem_essentials function removed - using full problem description instead
def get_diff_lines(actual_output: str, expected_output: str, max_lines: int = 20, max_chars_per_output: Optional[int] = None) -> str:
    """Return up to `max_lines` lines where actual and expected outputs differ.
    Format: "Line i: Got '...', Expected '...'". If line counts differ, note that too.
    If max_chars_per_output is specified, truncate each output to that length.
    """
    if not actual_output or not expected_output:
        return "Missing output data"
    actual_lines = actual_output.splitlines()
    expected_lines = expected_output.splitlines()
    diffs = []
    for i in range(max(len(actual_lines), len(expected_lines))):
        a_line = actual_lines[i].rstrip() if i < len(actual_lines) else "<EOF>"
        e_line = expected_lines[i].rstrip() if i < len(expected_lines) else "<EOF>"
        if a_line != e_line:
            # Apply the character limit if specified (the "..." counts toward the limit)
            if max_chars_per_output:
                if a_line != "<EOF>" and len(a_line) > max_chars_per_output:
                    a_line = a_line[:max_chars_per_output - 3] + "..."
                if e_line != "<EOF>" and len(e_line) > max_chars_per_output:
                    e_line = e_line[:max_chars_per_output - 3] + "..."
            diffs.append(f"Line {i+1}: Got '{a_line}', Expected '{e_line}'")
            if len(diffs) >= max_lines:
                break
    if not diffs:
        return "Outputs appear identical (possibly whitespace issue)"
    return "\n".join(diffs)
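
# Illustrative get_diff_lines behavior (values made up; not executed here):
#
#   get_diff_lines("1\n2\n3", "1\n5\n3", max_lines=1)
#   # -> "Line 2: Got '2', Expected '5'"
#   get_diff_lines("1\n2", "1\n2\n3")
#   # -> "Line 3: Got '<EOF>', Expected '3'"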
def truncate_smart(text: str, max_length: int, label: str = "") -> Tuple[str, str]:
    """Hard truncation: simply return the first `max_length` characters to provide more context.
    Returns (possibly truncated text, note describing the truncation, or "" if none was needed).
    """
    # Return as-is if within the limit
    if len(text) <= max_length:
        return text, ""
    # Hard truncate to the first max_length characters
    truncated = text[:max_length]
    note = f" (truncated from {len(text)} chars)"
    return truncated, note
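
# Illustrative truncate_smart behavior (values made up; not executed here):
#
#   truncate_smart("a" * 10, 4)   # -> ("aaaa", " (truncated from 10 chars)")
#   truncate_smart("abc", 4)      # -> ("abc", "")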
def generate_versions_parallel(problem_description: str, wa_code: str, submission_id: str,
                               base_context_label: str, artifact_dir: str,
                               num_versions: int,
                               start_version: int = 1,  # Added start_version parameter
                               failing_input: Optional[str] = None,
                               wa_output: Optional[str] = None,
                               expected_output: Optional[str] = None,
                               secondary_input: Optional[str] = None,
                               secondary_wa_output: Optional[str] = None,
                               secondary_expected_output: Optional[str] = None,
                               model_tag: str = "",
                               max_threads: int = DEFAULT_MAX_THREADS) -> List[str]:
    """
    Parallel generation of multiple repair versions.
    Returns a list of successfully generated code strings.
    """
    def generate_single_version(version_num):
        """Generate a single version - thread-safe."""
        context_label = f"{base_context_label}_v{version_num}"
        try:
            result = generate_llm_repair(
                problem_description, wa_code, submission_id=submission_id,
                repair_context_label=context_label, artifact_dir=artifact_dir,
                failing_input=failing_input, wa_output=wa_output,
                expected_output=expected_output,
                secondary_input=secondary_input,
                secondary_wa_output=secondary_wa_output,
                secondary_expected_output=secondary_expected_output,
                model_tag=model_tag
            )
            return version_num, result
        except Exception as e:
            print(f" [Warning] Failed to generate version {version_num}: {e}", file=sys.stderr)
            return version_num, None

    print(f" Generating {num_versions} versions in parallel (starting from v{start_version})...")
    successful_versions = []
    # Use ThreadPoolExecutor for parallel generation, limiting concurrent threads
    with ThreadPoolExecutor(max_workers=min(num_versions, max_threads)) as executor:
        # Submit all generation tasks with the correct version numbers
        futures = {executor.submit(generate_single_version, i): i for i in range(start_version, start_version + num_versions)}
        # Collect results as they complete
        results = {}
        for future in as_completed(futures):
            version_num, code = future.result()
            results[version_num] = code
    # Collect successful results in version order
    for i in range(start_version, start_version + num_versions):
        if i in results and results[i]:
            successful_versions.append(results[i])
    print(f" Generated {len(successful_versions)}/{num_versions} versions successfully")
    return successful_versions
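
# Illustrative parallel-generation call (hypothetical variables; not executed here):
#
#   versions = generate_versions_parallel(
#       desc, wa_source, submission_id="12345678",
#       base_context_label="orig_tc", artifact_dir="results/abc123a/artifacts",
#       num_versions=TOP_K, start_version=1,
#       failing_input=orig_input, model_tag="qwen-coder7b",
#       max_threads=DEFAULT_MAX_THREADS,
#   )
#   # len(versions) <= TOP_K; entries follow version-number order, not completion order.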
# --- Main Evaluation Logic ---
def main():
    parser = argparse.ArgumentParser(description="Evaluate LLM repair performance using different test case inputs.")
    parser.add_argument("problem_id", help="Target AtCoder problem ID (e.g., abc123a)")
    parser.add_argument("--model-tag", required=True, help="Tag for result file naming (e.g., 'qwen-coder7b' -> result_qwen-coder7b.json)")
    parser.add_argument("--reducer-model", default="qwen-plus", help="LLM model name for reducer generation (default: qwen-plus)")
    parser.add_argument("--repair-model", help="LLM model name for repair (overrides default REPAIR_MODEL_NAME)")
    parser.add_argument("--regenerate", action="store_true",
                        help="Force regenerate all repair codes instead of loading cached versions (default: load cached)")
    parser.add_argument("--max-threads", type=int, default=DEFAULT_MAX_THREADS,
                        help=f"Maximum number of parallel threads for LLM generation (default: {DEFAULT_MAX_THREADS})")
    args = parser.parse_args()
    target_problem_id_input = args.problem_id.strip().lower()
    max_threads = args.max_threads
    # Save repair results to the file corresponding to model_tag
    global RESULT_JSON_PATH
    RESULT_JSON_PATH = f"result_{args.model_tag}.json"
    # Override the default repair model if --repair-model is specified
    if args.repair_model:
        global REPAIR_MODEL_NAME
        REPAIR_MODEL_NAME = args.repair_model
        print(f"[Info] Using repair model: {REPAIR_MODEL_NAME} (overridden by --repair-model)")
    else:
        print(f"[Info] Using default repair model: {REPAIR_MODEL_NAME}")
    print(f"[Info] Using max threads: {max_threads}")
    # --- Load Reduction Data from the unified reducer results file ---
    REDUCER_RESULTS_FILE = "reducer_results.json"
    print(f"[Info] Loading reduction results from {REDUCER_RESULTS_FILE}...")
    # Parse problem ID to get the directory structure
    parsed_ids = _parse_problem_id(target_problem_id_input)
    if not parsed_ids:
        print(f"[Error] Invalid problem ID format: '{target_problem_id_input}'", file=sys.stderr)
        sys.exit(1)
    contest_id, problem_letter = parsed_ids
    problem_dir_name = f"results/{contest_id}{problem_letter.lower()}"
    # Load unified reducer results
    if not os.path.exists(REDUCER_RESULTS_FILE):
        print(f"[Error] Reducer results file '{REDUCER_RESULTS_FILE}' not found.")
        print("[Info] Please run: python3 consolidate_reducer_results.py")
        sys.exit(1)
    reducer_results_data = load_results_with_lock(REDUCER_RESULTS_FILE)
    if not reducer_results_data:
        print(f"[Error] Failed to load reducer results from {REDUCER_RESULTS_FILE} or file is empty.", file=sys.stderr)
        sys.exit(1)
    if target_problem_id_input not in reducer_results_data:
        print(f"[Error] Problem ID '{target_problem_id_input}' not found in {REDUCER_RESULTS_FILE}.")
        print(f"[Info] Available problems: {', '.join(reducer_results_data.keys())}")
        print(f"[Info] Please run: python3 consolidate_reducer_results.py {target_problem_id_input}")
        sys.exit(1)
    problem_data = reducer_results_data[target_problem_id_input]
    reduction_results = problem_data.get("results", [])
    if not reduction_results:
        print(f"[Error] No reduction results found for '{target_problem_id_input}' in {REDUCER_RESULTS_FILE}.")
        print(f"[Info] Please run: python3 consolidate_reducer_results.py {target_problem_id_input}")
        sys.exit(1)
    print(f"[Info] Found {len(reduction_results)} reduction results from unified file")
    # --- Get Problem Description ---
    # Get the problem description from the unified reducer results file
    problem_description = problem_data.get("problem_description", "")
    if not problem_description or problem_description.startswith("# Error"):
        print(f"[Warning] Problem description not found in {REDUCER_RESULTS_FILE}, trying fallback methods...")
        # Fallback 1: Try to find it in the cache
        cache_desc_path = f".atcoder_cache/problems/{contest_id}/{contest_id}_{problem_letter.lower()}/description.md"
        if os.path.exists(cache_desc_path):
            print(f"[Info] Reading problem description from cache: {cache_desc_path}")
            try:
                with open(cache_desc_path, 'r', encoding='utf-8') as f:
                    problem_description = f.read()
            except Exception as e:
                print(f"[Warning] Failed to read cached description: {e}", file=sys.stderr)
        # Fallback 2: Try to load from lftbench
        if not problem_description:
            print("[Info] Problem description not found, attempting to load from lftbench...")
            try:
                import lftbench_utils
                problem_description = lftbench_utils.get_problem_description(target_problem_id_input)
                if problem_description and not problem_description.startswith("# Error"):
                    print("[Info] Successfully loaded problem description from lftbench")
                else:
                    print("[Warning] Failed to load problem description from lftbench")
                    problem_description = None
            except Exception as e:
                print(f"[Warning] Failed to load from lftbench: {e}", file=sys.stderr)
        if not problem_description:
            print(f"[Error] Could not obtain problem description for '{target_problem_id_input}'.")
            print(f"[Info] Please run: python3 consolidate_reducer_results.py {target_problem_id_input}")
            sys.exit(1)
    else:
        print(f"[Info] Using problem description from {REDUCER_RESULTS_FILE}")
    # --- Load/Initialize Repair Results file ---
    repair_results_data = load_results_with_lock(RESULT_JSON_PATH)
    if target_problem_id_input not in repair_results_data:
        # Initialize the repair results entry with basic problem info
        repair_results_data[target_problem_id_input] = {
            "problem_description": problem_description,
            "reduction_source": REDUCER_RESULTS_FILE,  # Record that we read from the unified file
            "results": []
        }
    wa_results = reduction_results  # Use the reduction_results loaded earlier
    if not wa_results:
        print(f"[Info] No WA submission results found for '{target_problem_id_input}' to evaluate.", file=sys.stderr)
        sys.exit(0)
    # Use the already parsed problem ID for paths
    ac_path = os.path.join(problem_dir_name, "ac.cpp")
    # Correct test case dir path
    test_case_dir = os.path.join(BASE_TESTCASE_PATH, contest_id, problem_letter.upper(), "in")
    if not os.path.exists(ac_path):
        print(f"[Error] AC code '{ac_path}' not found.", file=sys.stderr)
        sys.exit(1)
    if not os.path.isdir(test_case_dir):
        print(f"[Error] Test case directory '{test_case_dir}' not found.", file=sys.stderr)
        sys.exit(1)
    # --- Prepare AC Code and Reference Outputs ---
    print("\n--- Preparing Reference AC Execution ---")
    ac_work_dir = tempfile.mkdtemp(prefix="eval_ac_")
    ac_executable_path = os.path.join(ac_work_dir, "ac_ref")
    reference_outputs: Optional[Dict[str, str]] = None
    ac_compile_success = False  # Flag to track compilation
    print(f" Compiling AC code: {ac_path} in {ac_work_dir}...")
    # Compile AC in its own temp dir to avoid CWD issues
    if not compile_program(os.path.abspath(ac_path), "ac_ref", ac_work_dir):
        print("[Error] Failed to compile reference AC code.", file=sys.stderr)
        # Don't exit immediately; the failure flag is checked below so the temp dir can be cleaned up first.
    else:
        ac_compile_success = True
        print(f" Successfully compiled AC to {ac_executable_path}")
        # List test cases
        test_case_paths = glob.glob(os.path.join(test_case_dir, "*"))
        if not test_case_paths:
            print(f"[Error] No test cases found in '{test_case_dir}'.", file=sys.stderr)
            ac_compile_success = False  # Mark as failed if there are no test cases
        else:
            test_case_names = [os.path.basename(p) for p in test_case_paths]
            test_case_names.sort()  # Ensure consistent order
            reference_outputs, ref_error = get_reference_outputs(ac_executable_path, test_case_dir, test_case_names)
            if ref_error:
                print(f"[Error] Failed to generate reference outputs: {ref_error}", file=sys.stderr)
                ac_compile_success = False  # Mark as failed
    # ac_work_dir is NOT cleaned up here; it is removed below on failure,
    # and after the main loop on success.
    # Check whether AC compilation or reference output generation failed
    if not ac_compile_success or reference_outputs is None:
        print("[Error] AC compilation or reference output generation failed. Exiting.", file=sys.stderr)
        # Clean up now before exiting
        if os.path.exists(ac_work_dir):
            shutil.rmtree(ac_work_dir)
        sys.exit(1)
    # --- Iterate through WA Submissions and Evaluate Repairs ---
    print(f"\n--- Evaluating Repairs for {len(wa_results)} WA Submissions ---")
    needs_saving = False  # Flag to save JSON only if changes were made
    # --- Counters for the 5 fair strategies ---
    evaluated_submissions_count = 0
    success_counts = {"no_tc": 0, "orig_tc": 0, "diff_only": 0, "reduced_tc": 0, "reduced_plus_diff": 0}
    success_submissions = {"no_tc": [], "orig_tc": [], "diff_only": [], "reduced_tc": [], "reduced_plus_diff": []}
    # --- Outer try/finally block for main loop cleanup ---
    try:
        for wa_result in wa_results:
            submission_id = str(wa_result.get("submission_id", "UNKNOWN"))
            print(f"\n-- Processing WA Submission: {submission_id} --")
            # -------------------------------------------------------------
            # Early Skip Logic: If all strategies already have TOP_K (10)
            # repaired code versions on disk, we can safely skip this WA
            # submission to save time and avoid redundant LLM calls.
            # -------------------------------------------------------------