openvax · iskandr · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026
diff --git a/.travis.yml b/.travis.yml
diff --git a/AGENTS.md b/AGENTS.md
@@ -0,0 +1,96 @@
+## Golden Rules
+
+1. **Never commit to `master`.** Always `git checkout -b <feature-branch>` before editing. Land via PR.
+2. **Every PR bumps the version.** Even doc-only PRs — at minimum a patch bump. Update `gtfparse/__init__.py::__version__`.
+3. **"Done" means merged AND deployed to PyPI** — never stop at merge. After a PR merges, run `./deploy.sh` from a clean master. Skipping deploy = task not done.
+4. **File problems as issues, don't silently work around them.** If you hit a bug here or in a sibling openvax/pirl-unc repo, open a GitHub issue on the correct repo and link it from the PR.
+5. **After a PR ships, look for the next block of work.** Read open issues across the relevant openvax repos, group by dependency + urgency. Prefer *foundational* changes that unblock multiple downstream improvements; otherwise chain the smallest independent improvements.
+
+---
+
+## Before Completing Any Task
+
+Before considering any code change complete, you MUST:
+
+1. **Run `./lint.sh`** - Verify linting passes (runs `ruff check`)
+2. **Run `./test.sh`** - Verify all tests pass with coverage
+
+Do not tell the user you are "done" or that changes are "complete" until both pass.
+
+## Scripts
+
+- `./lint.sh` - Checks linting with ruff (must pass). **Always use this for linting if it exists.**
+- `./test.sh` - Runs pytest with coverage (must pass)
+- `./lint-and-test.sh` - Runs lint and test back-to-back
+- `./deploy.sh` - Deploys to PyPI (gates on lint.sh and test.sh). **Always use this for deploying if it exists.**
+- `./develop.sh` - Installs package in development mode (`pip install -e .`)
+
+## Code Style
+
+- Use ruff for linting
+- Configuration is in `pyproject.toml` under `[tool.ruff]`
+- Target Python version: 3.9+ (CI matrix runs 3.9 / 3.10 / 3.11)
+
+## Project Shape
+
+- Package: `gtfparse/` (thin library — attribute parsing, GTF reading, missing feature creation)
+- Tests: `tests/` (pytest; fixtures under `tests/data/`)
+- Core deps: polars, pyarrow, pandas (see `requirements.txt`)
+- Version lives in `gtfparse/__init__.py` and is exposed via `setuptools.dynamic` in `pyproject.toml`
+
+---
+
+## Workflow Orchestration
+
+### 1. Upfront Planning
+- For ANY non-trivial task (3+ steps or architectural decisions): write a detailed spec before touching code
+- If something goes sideways, STOP and re-plan immediately — don't keep pushing
+- Use planning/verification steps, not just building
+- Write detailed specs upfront to reduce ambiguity
+
+### 2. Self-Improvement Loop
+- After ANY correction from the user: update `tasks/lessons.md` with the pattern
+- Write rules for yourself that prevent the same mistake
+- Ruthlessly iterate on these lessons until mistake rate drops
+- At the start of each session, review the lessons relevant to the current project
+
+### 3. Verification Before Done
+- Never mark a task complete without proving it works
+- When relevant, diff behavior between the base branch (`master`) and your changes
+- Ask yourself: "Would a staff engineer approve this?"
+- Run tests, check logs, demonstrate correctness
+
+### 4. Demand Elegance (Balanced)
+- For non-trivial changes: pause and ask "is there a more elegant way?"
+- If a fix feels hacky: "Knowing everything I know now, implement the elegant solution"
+- Skip this for simple, obvious fixes — don't over-engineer
+- Challenge your own work before presenting it
+
+### 5. Autonomous Bug Fixing
+- When given a bug report: just fix it. Don't ask for hand-holding
+- Point at logs, errors, failing tests — then resolve them
+- Zero context switching required from the user
+- Fix failing unit tests without being told how
+
+---
+
+## Task Management
+
+1. **Plan First**: Write plan to `tasks/todo.md` with checkable items
+2. **Verify Plan**: Check in before starting implementation
+3. **Track Progress**: Mark items complete as you go
+4. **Explain Changes**: High-level summary at each step
+5. **Document Results**: Add review section to `tasks/todo.md`
+6. **Capture Lessons**: Update `tasks/lessons.md` after corrections
+
+---
+
+## Core Principles
+
+- **Simplicity First**: Make every change as simple as possible. Touch as little code as possible.
+- **No Laziness**: Find root causes. No temporary fixes. Senior developer standards.
+- **Minimal Impact**: Changes should only touch what's necessary. Avoid introducing bugs.
+
+## Scientific Domain Knowledge
+- **Read the literature**: if some code involves genomic or biological concepts (GTF/GFF formats, feature hierarchies, attribute conventions), feel free to search for specs/review papers before changing code that expresses scientific concepts.
+- **Flag inconsistencies**: if code expresses a scientific model that's at odds with your understanding, note that inconsistency and ask for clarification.
diff --git a/format.sh b/format.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+set -e
+
+SOURCES="gtfparse tests"
+
+echo "Running ruff format..."
+ruff format $SOURCES
+
+echo "Formatting complete!"
diff --git a/gtfparse/__init__.py b/gtfparse/__init__.py
@@ -14,23 +14,23 @@
 from .create_missing_features import create_missing_features
 from .parsing_error import ParsingError
 from .read_gtf import (
-    read_gtf,
+    REQUIRED_COLUMNS,
     parse_gtf,
-    parse_gtf_pandas,
     parse_gtf_and_expand_attributes,
-    REQUIRED_COLUMNS,
+    parse_gtf_pandas,
+    read_gtf,
 )
 
-__version__ = "2.6.2"
+__version__ = "2.6.3"
 
 __all__ = [
-    "__version__",
-    "expand_attribute_strings",
-    "create_missing_features",
-    "parse_gtf_and_expand_attributes",
     "REQUIRED_COLUMNS",
     "ParsingError",
-    "read_gtf",
+    "__version__",
+    "create_missing_features",
+    "expand_attribute_strings",
     "parse_gtf",
+    "parse_gtf_and_expand_attributes",
     "parse_gtf_pandas",
+    "read_gtf",
 ]
diff --git a/gtfparse/attribute_parsing.py b/gtfparse/attribute_parsing.py
@@ -18,12 +18,7 @@
 logger = logging.getLogger(__name__)
 
 
-
-def expand_attribute_strings(
-        attribute_strings,
-        quote_char="'",
-        missing_value="",
-        usecols=None):
+def expand_attribute_strings(attribute_strings, quote_char="'", missing_value="", usecols=None):
     """
     The last column of GTF has a variable number of key value pairs
     of the format: "key1 value1; key2 value2;"
@@ -66,7 +61,7 @@ def expand_attribute_strings(
     # and pair of try/except blocks in the loop.
     column_interned_strings = {}
 
-    for (i, kv_strings) in enumerate(attribute_strings):
+    for i, kv_strings in enumerate(attribute_strings):
         if type(kv_strings) is str:
             kv_strings = kv_strings.split(";")
         for kv in kv_strings:
@@ -92,7 +87,7 @@ def expand_attribute_strings(
 
             if value[0] == quote_char:
                 value = value.replace(quote_char, "")
-                
+
             try:
                 column = extra_columns[column_name]
                 # if an attribute is used repeatedly then
@@ -108,9 +103,5 @@ def expand_attribute_strings(
                 extra_columns[column_name] = column
                 column_order.append(column_name)
 
-
-
     logging.info("Extracted GTF attributes: %s" % column_order)
-    return OrderedDict(
-        (column_name, extra_columns[column_name])
-        for column_name in column_order)
+    return OrderedDict((column_name, extra_columns[column_name]) for column_name in column_order)
diff --git a/gtfparse/create_missing_features.py b/gtfparse/create_missing_features.py
@@ -19,11 +19,7 @@
 logger = logging.getLogger(__name__)
 
 
-def create_missing_features(
-        dataframe,
-        unique_keys={},
-        extra_columns={},
-        missing_value=None):
+def create_missing_features(dataframe, unique_keys={}, extra_columns={}, missing_value=None):
     """
     Helper function used to construct a missing feature such as 'transcript'
     or 'gene'. Some GTF files only have 'exon' and 'CDS' entries, but have
@@ -49,40 +45,35 @@ def create_missing_features(
     missing_value : any
         Which value to fill in for columns that we don't infer values for.
 
-    Returns original dataframe (converted to Pandas if necessary) along with all 
+    Returns original dataframe (converted to Pandas if necessary) along with all
     extra rows created for missing features.
     """
     if hasattr(dataframe, "to_pandas"):
         dataframe = dataframe.to_pandas()
-  
+
     extra_dataframes = []
 
     existing_features = set(dataframe["feature"])
     existing_columns = set(dataframe.columns)
-
-    for (feature_name, groupby_key) in unique_keys.items():
-
+
+    for feature_name, groupby_key in unique_keys.items():
         if feature_name in existing_features:
-            logging.info(
-                "Feature '%s' already exists in GTF data" % feature_name)
+            logging.info("Feature '%s' already exists in GTF data" % feature_name)
             continue
         logging.info("Creating rows for missing feature '%s'" % feature_name)
 
         # don't include rows where the groupby key was missing
-        missing = pd.Series([
-            x is None or x == ""
-            for x in dataframe[groupby_key]])
+        missing = pd.Series([x is None or x == "" for x in dataframe[groupby_key]])
         not_missing = ~missing
         row_groups = dataframe[not_missing].groupby(groupby_key)
 
         # Each group corresponds to a unique feature entry for which the
         # other columns may or may not be uniquely defined. Start off by
         # assuming the values for every column are missing and fill them in
         # where possible.
-        feature_values = OrderedDict([
-            (column_name, [missing_value] * row_groups.ngroups)
-            for column_name in dataframe.keys()
-        ])
+        feature_values = OrderedDict(
+            [(column_name, [missing_value] * row_groups.ngroups) for column_name in dataframe]
+        )
 
         # User specifies which non-required columns should we try to infer
         # values for
@@ -111,8 +102,9 @@ def create_missing_features(
             for column_name in feature_columns:
                 if column_name not in existing_columns:
                     raise ValueError(
-                        "Column '%s' does not exist in GTF, columns = %s" % (
-                            column_name, existing_columns))
+                        "Column '%s' does not exist in GTF, columns = %s"
+                        % (column_name, existing_columns)
+                    )
 
                 # expect that all entries related to a reconstructed feature
                 # are related and are thus within the same interval of
@@ -121,4 +113,4 @@ def create_missing_features(
                 if len(unique_values) == 1:
                     feature_values[column_name][i] = unique_values[0]
         extra_dataframes.append(pd.DataFrame(feature_values))
-    return pd.concat([dataframe] + extra_dataframes, ignore_index=True)
+    return pd.concat([dataframe, *extra_dataframes], ignore_index=True)
diff --git a/gtfparse/parsing_error.py b/gtfparse/parsing_error.py
@@ -10,5 +10,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
 class ParsingError(Exception):
     pass