From 929e83d4069453afd7cf40d6a124719a31f29fed Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Wed, 13 May 2026 14:40:42 -0400 Subject: [PATCH] Fix #56: NameError on expand_attribute_column=False MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The else branch in read_gtf passed `result_df` to parse_gtf before `result_df` had been assigned — a guaranteed NameError that survived because the branch was never tested. Replaced with the correct `filepath_or_buffer` argument and also pass `split_attributes=False` so the unexpanded path doesn't emit the helper `attribute_split` column the user didn't ask for. Added tests/test_expand_attribute_column_false.py covering pandas / polars / dict result types, the features filter, and the alias / version-cast kwargs as graceful no-ops when attribute columns aren't expanded. Bumped to 2.7.1. --- gtfparse/__init__.py | 2 +- gtfparse/read_gtf.py | 5 +- tests/test_expand_attribute_column_false.py | 74 +++++++++++++++++++++ 3 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 tests/test_expand_attribute_column_false.py diff --git a/gtfparse/__init__.py b/gtfparse/__init__.py index bb8908b..436a0de 100644 --- a/gtfparse/__init__.py +++ b/gtfparse/__init__.py @@ -23,7 +23,7 @@ read_gtf, ) -__version__ = "2.7.0" +__version__ = "2.7.1" __all__ = [ "GENCODE_BIOTYPE_ALIASES", diff --git a/gtfparse/read_gtf.py b/gtfparse/read_gtf.py index 23e2647..13ddc84 100644 --- a/gtfparse/read_gtf.py +++ b/gtfparse/read_gtf.py @@ -354,7 +354,10 @@ def read_gtf( features=features, ) else: - result_df = parse_gtf(result_df, features=features) + # When the caller opts out of attribute expansion they want the raw + # 'attribute' column verbatim — no need to also produce the + # 'attribute_split' helper that parse_gtf adds by default. 
+ result_df = parse_gtf(filepath_or_buffer, features=features, split_attributes=False) # converting back to pandas here because Polars bugs manifest # as `pyo3_runtime.PanicException: assertion `left == right` failed: impl error` diff --git a/tests/test_expand_attribute_column_false.py b/tests/test_expand_attribute_column_false.py new file mode 100644 index 0000000..b06a206 --- /dev/null +++ b/tests/test_expand_attribute_column_false.py @@ -0,0 +1,74 @@ +"""Regression tests for #56: read_gtf(expand_attribute_column=False) +used to raise NameError because the else branch referenced `result_df` +before it had been assigned. +""" + +import pandas as pd + +from gtfparse import read_gtf + +from .data import data_path + +GTF_PATH = data_path("ensembl_grch37.head.gtf") + + +def test_expand_attribute_column_false_returns_raw_attribute_pandas(): + df = read_gtf(GTF_PATH, expand_attribute_column=False, result_type="pandas") + assert isinstance(df, pd.DataFrame) + # raw attribute column is preserved verbatim + assert "attribute" in df.columns + # none of the per-key attribute columns are produced + assert "gene_name" not in df.columns + assert "transcript_id" not in df.columns + # the helper 'attribute_split' column from parse_gtf is also suppressed + assert "attribute_split" not in df.columns + # core GTF columns are present and populated + for col in ("seqname", "source", "feature", "start", "end", "strand"): + assert col in df.columns + assert len(df) > 0 + # spot-check that the raw attribute string carries the original key/value form + assert any("gene_id" in val for val in df["attribute"].astype(str)) + + +def test_expand_attribute_column_false_returns_polars(): + df = read_gtf(GTF_PATH, expand_attribute_column=False, result_type="polars") + # polars DataFrame — has the raw 'attribute' column but no per-key columns + assert "attribute" in df.columns + assert "gene_name" not in df.columns + assert "attribute_split" not in df.columns + + +def 
test_expand_attribute_column_false_returns_dict(): + result = read_gtf(GTF_PATH, expand_attribute_column=False, result_type="dict") + assert isinstance(result, dict) + assert "attribute" in result + assert "gene_name" not in result + + +def test_expand_attribute_column_false_with_features_filter(): + """The features filter must still apply when not expanding.""" + df = read_gtf( + GTF_PATH, + expand_attribute_column=False, + features={"gene"}, + result_type="pandas", + ) + assert set(df["feature"]) == {"gene"} + + +def test_expand_attribute_column_false_skips_alias_and_version_logic(): + """When attribute columns aren't expanded, attribute_aliases has + nothing to rename and cast_version_columns has nothing to cast. + Neither should raise — both must be graceful no-ops on the raw + 'attribute'-column-only frame.""" + df = read_gtf( + GTF_PATH, + expand_attribute_column=False, + attribute_aliases={"gene_type": "gene_biotype"}, + cast_version_columns=True, + result_type="pandas", + ) + # alias source wasn't in columns → no rename happened → no canonical added + assert "gene_biotype" not in df.columns + # version columns weren't present → no cast → still nothing + assert "gene_version" not in df.columns