# openenv.yaml — code-debugger-env (repository: raunitx-02/code-debugger-env)
---
# Environment identity and human-readable summary.
name: code-debugger-env
# Quoted so the version is always read as a string, never as a number.
version: "1.1.0"
# Folded scalar (`>`): the lines below are joined with single spaces into one
# paragraph. The "3 difficulty tiers" are the easy/medium/hard values used by
# the `difficulty` field of the tasks and the observation space.
description: >
  A real-world Python code debugging environment where AI agents identify
  bugs and provide corrected code. Covers 15 tasks across 3 difficulty tiers
  (easy, medium, hard), spanning runtime errors, logic bugs, critical
  security vulnerabilities (SQL injection, OS command injection, weak
  hashing, unsafe eval), mutable state bugs, and multi-file project bugs.
  Graders execute fixed code in sandboxed subprocesses for fully
  deterministic, reproducible scoring. Includes a Regression Test Oracle
  (failing+passing tests) for all 15 tasks and a Code Smell Penalty to
  discourage dangerous patterns.
author: raunit19
# Keyword tags attached to this environment (short leaf list, flow style).
tags: [code-review, debugging, security, python, real-world, openenv]
# Schema of the action an agent submits on each step.
action_space:
  type: object
  description: >-
    Agent submits bug line number, bug type classification, and complete
    fixed code
  properties:
    # Where the agent believes the bug is.
    bug_line:
      type: integer
      description: '1-indexed line number of the bug in the code snippet'
    # Which category the agent assigns to the bug.
    bug_type:
      type: string
      enum:
        - syntax
        - logic
        - runtime
        - security
        - mutable_state
      description: 'Category of the bug'
    # The full replacement snippet, not a diff.
    fixed_code:
      type: string
      description: >-
        Complete corrected Python code (entire snippet, not just the fix)
    # Free-text rationale; the description marks it as optional.
    explanation:
      type: string
      description: 'One-sentence explanation of the bug (optional)'
# Schema of the observation returned to the agent.
observation_space:
  type: object
  properties:
    # --- Task statement ---
    code_snippet:
      type: string
      description: 'Python code containing exactly one planted bug'
    task_description:
      type: string
      description: 'What the function should do when correctly implemented'
    test_hint:
      type: string
      description: 'Description of the test cases that will grade the fix'
    # --- Episode progress ---
    feedback:
      type: string
      description: >-
        Grader feedback from the previous attempt (empty on first step)
    attempt_number:
      type: integer
      description: 'Current attempt number within this episode (1-5)'
    score_so_far:
      type: number
      description: 'Best score achieved so far in this episode (0.0-1.0)'
    difficulty:
      type: string
      enum:
        - easy
        - medium
        - hard
      description: 'Difficulty level of the current task'
    # --- Step outcome ---
    done:
      type: boolean
      description: 'True when the episode has ended'
    reward:
      type: number
      description: 'Score for the most recent step (0.0-1.0)'
# Task catalogue: one entry per task, each with an id, a difficulty tier and a
# one-line description of the planted bug.
tasks:
  # Easy tier
  - id: easy_01
    difficulty: easy
    description: 'Fix off-by-one in list double calculator'
  - id: easy_02
    difficulty: easy
    description: 'Fix IndexError in palindrome checker'
  - id: easy_03
    difficulty: easy
    description: 'Fix assignment error in vowel counter'
  - id: easy_04
    difficulty: easy
    description: 'Fix product initialised to 0 instead of 1 in multiply_list'
  # Medium tier
  - id: medium_01
    difficulty: medium
    description: 'Fix infinite recursion in summing logic'
  - id: medium_02
    difficulty: medium
    description: 'Fix TypeError / float division in binary search'
  - id: medium_03
    difficulty: medium
    description: 'Fix logic error in list flattener'
  - id: medium_04
    difficulty: medium
    description: >-
      Fix wrong return variable (seen vs duplicates) in find_duplicates
  # Hard tier
  - id: hard_01
    difficulty: hard
    description: 'Fix mutable default argument in class constructor'
  - id: hard_02
    difficulty: hard
    description: 'Fix SQL injection vulnerability'
  # NOTE(review): plain SHA-256 is a fast hash and is generally not
  # recommended for password storage either; confirm this wording matches
  # what the grader for this task actually expects before changing it.
  - id: hard_03
    difficulty: hard
    description: 'Replace dangerous MD5 password hashing with SHA-256'
  - id: hard_04
    difficulty: hard
    description: 'Fix OS command injection via shell=True subprocess'
  - id: hard_05
    difficulty: hard
    description: >-
      Multi-file project simulation: fix cross-module dependency bug
  # Tasks whose ids carry the project_ prefix (multi-file projects)
  - id: project_easy_01
    difficulty: easy
    description: >-
      Fix syntax error (= instead of ==) in api.py if-condition
      (multi-file project)
  - id: project_medium_01
    difficulty: medium
    description: >-
      Fix syntax error (= instead of ==) in validator.py if-condition
      (multi-file project)
# Upper bound on steps per episode; matches the "(1-5)" range quoted in the
# attempt_number description of the observation space.
max_episode_steps: 5
# Inference time limit. NOTE(review): the unit is not stated in this file —
# presumably seconds; confirm against the consumer of this manifest.
inference_timeout: 1200
# Baseline score per task, keyed by the ids declared under `tasks`.
# NOTE(review): values look like they use the same 0.0-1.0 scale as `reward`;
# how the baselines were produced is not shown here — confirm before relying
# on them.
baseline_scores:
  # Easy tier
  easy_01: 0.95
  easy_02: 0.90
  easy_03: 0.95
  easy_04: 0.90
  # Medium tier
  medium_01: 0.80
  medium_02: 0.70
  medium_03: 0.80
  medium_04: 0.75
  # Hard tier
  hard_01: 0.50
  hard_02: 0.40
  hard_03: 0.60
  hard_04: 0.45
  hard_05: 0.50
  # project_-prefixed tasks
  project_easy_01: 0.85
  project_medium_01: 0.65