From 83caf8b76dcf6bc542ab1921474f870714a8453a Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Wed, 13 May 2026 11:05:06 -0400 Subject: [PATCH 1/3] Add attribute_aliases + cast_version_columns to read_gtf Closes #63: `attribute_aliases` lets callers normalize GENCODE-format GTFs onto Ensembl column names (`gene_type` -> `gene_biotype`, `transcript_type` -> `transcript_biotype`) at parse time. New `GENCODE_BIOTYPE_ALIASES` constant exports the canonical pair. Closes #64: `cast_version_columns=True` (default) casts the four well-known Ensembl `*_version` attributes (gene/transcript/protein/exon) from strings to pandas nullable Int64 when present, so downstream consumers like pyensembl don't have to `int(...)` themselves. Pass `cast_version_columns=False` to opt out and keep them as strings. Adds tests/data/gencode.head.gtf (synthetic GENCODE-style fixture carrying both alias columns and separate *_version attributes) plus test_gencode_gtf.py covering: alias rename, alias-vs-canonical conflict, missing alias no-op, opt-out, missing-attribute no-op, polars/pandas result types, and interaction with infer_biotype_column. Bump version to 2.7.0. --- gtfparse/__init__.py | 6 +- gtfparse/read_gtf.py | 98 +++++++++++++++++ tests/data/gencode.head.gtf | 10 ++ tests/test_gencode_gtf.py | 207 ++++++++++++++++++++++++++++++++++++ 4 files changed, 320 insertions(+), 1 deletion(-) create mode 100644 tests/data/gencode.head.gtf create mode 100644 tests/test_gencode_gtf.py diff --git a/gtfparse/__init__.py b/gtfparse/__init__.py index 3a46b2c..bb8908b 100644 --- a/gtfparse/__init__.py +++ b/gtfparse/__init__.py @@ -14,6 +14,8 @@ from .create_missing_features import create_missing_features from .parsing_error import ParsingError from .read_gtf import ( + GENCODE_BIOTYPE_ALIASES, + INTEGER_VERSION_COLUMNS, REQUIRED_COLUMNS, parse_gtf, parse_gtf_and_expand_attributes, @@ -21,9 +23,11 @@ read_gtf, ) -__version__ = "2.6.3" +__version__ = "2.7.0" __all__ = [ + "GENCODE_BIOTYPE_ALIASES", + "INTEGER_VERSION_COLUMNS", "REQUIRED_COLUMNS", "ParsingError", "__version__", diff --git a/gtfparse/read_gtf.py b/gtfparse/read_gtf.py index fdf2d4a..d7dfa58 100644 --- a/gtfparse/read_gtf.py +++ b/gtfparse/read_gtf.py @@ -13,6 +13,7 @@ import logging from os.path import exists +import pandas as pd import polars from .attribute_parsing import expand_attribute_strings @@ -22,6 +23,27 @@ logger = logging.getLogger(__name__) +# GENCODE GTFs use *_type where Ensembl GTFs use *_biotype. Pass this +# (or a superset) as `attribute_aliases` to read_gtf to normalize a +# GENCODE-format GTF onto the Ensembl column names that downstream +# tools like pyensembl expect. +GENCODE_BIOTYPE_ALIASES = { + "gene_type": "gene_biotype", + "transcript_type": "transcript_biotype", +} + + +# Ensembl-style attribute columns that are always integer-valued when +# present. read_gtf casts these from string to pandas nullable Int64 by +# default; pass cast_version_columns=False to keep them as strings. +INTEGER_VERSION_COLUMNS = ( + "gene_version", + "transcript_version", + "protein_version", + "exon_version", +) + + """ Columns of a GTF file: @@ -183,6 +205,54 @@ def parse_gtf_and_expand_attributes( ) +def _apply_attribute_aliases(result_df, attribute_aliases): + """ + Rename alias attribute columns onto canonical names in-place. + + For each (alias -> canonical) pair: + * if only the alias is present, rename it to the canonical name. + * if both are present, drop the alias and warn (canonical wins). + * if neither is present, do nothing. + """ + if not attribute_aliases: + return result_df + existing = set(result_df.columns) + rename_map = {} + drop_aliases = [] + for alias, canonical in attribute_aliases.items(): + if alias not in existing: + continue + if canonical in existing: + logger.warning( + "Both alias column '%s' and canonical column '%s' are present; " + "dropping alias and keeping canonical values.", + alias, + canonical, + ) + drop_aliases.append(alias) + else: + rename_map[alias] = canonical + if drop_aliases: + result_df = result_df.drop(columns=drop_aliases) + if rename_map: + result_df = result_df.rename(columns=rename_map) + return result_df + + +def _cast_version_columns(result_df, version_columns=INTEGER_VERSION_COLUMNS): + """ + Cast known Ensembl *_version attribute columns from strings to + pandas nullable Int64 in-place. Missing/empty values become pd.NA. + """ + for column_name in version_columns: + if column_name not in result_df.columns: + continue + result_df[column_name] = pd.to_numeric( + result_df[column_name].replace("", None), errors="coerce" + ).astype("Int64") + return result_df + + def read_gtf( filepath_or_buffer, expand_attribute_column=True, @@ -192,6 +262,8 @@ def read_gtf( usecols=None, features=None, result_type="polars", + attribute_aliases=None, + cast_version_columns=True, ): """ Parse a GTF into a dictionary mapping column names to sequences of values. @@ -231,6 +303,22 @@ def read_gtf( result_type : One of 'polars', 'pandas', or 'dict' Default behavior is to return a Polars DataFrame, but will convert to Pandas DataFrame or dictionary if specified. + + attribute_aliases : dict of str -> str, optional + Maps alias attribute names onto canonical ones. After attributes + are expanded into columns, each alias column is renamed to its + canonical name when the canonical column is absent. If both are + present the alias is dropped and a warning is logged. Pass + `GENCODE_BIOTYPE_ALIASES` to normalize a GENCODE GTF's + `gene_type`/`transcript_type` onto Ensembl's + `gene_biotype`/`transcript_biotype`. + + cast_version_columns : bool + When True (default), cast the well-known integer version + attribute columns (`gene_version`, `transcript_version`, + `protein_version`, `exon_version`) from strings to pandas + nullable Int64 when present. Set to False to keep them as + strings. """ if type(filepath_or_buffer) is str and not exists(filepath_or_buffer): raise ValueError("GTF file does not exist: %s" % filepath_or_buffer) @@ -267,6 +355,16 @@ def wrapped_fn(x): column_type = column_cast_types[column_name] result_df[column_name] = result_df[column_name].astype(column_type) + # Rename alias attribute columns onto their canonical names. Done before + # infer_biotype_column so an aliased gene_biotype/transcript_biotype is + # visible to the inference logic. + result_df = _apply_attribute_aliases(result_df, attribute_aliases) + + # Cast Ensembl *_version columns from strings to nullable integers so + # downstream consumers (e.g. pyensembl) don't have to int(...) themselves. + if cast_version_columns: + result_df = _cast_version_columns(result_df) + # Hackishly infer whether the values in the 'source' column of this GTF # are actually representing a biotype by checking for the most common # gene_biotype and transcript_biotype value 'protein_coding' diff --git a/tests/data/gencode.head.gtf b/tests/data/gencode.head.gtf new file mode 100644 index 0000000..a39b34e --- /dev/null +++ b/tests/data/gencode.head.gtf @@ -0,0 +1,10 @@ +##description: GENCODE-style synthetic head for gtfparse tests +##provider: gtfparse-tests +##format: gtf +chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972"; gene_version "5"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; +chr1 HAVANA transcript 11869 14409 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; gene_type "transcribed_unprocessed_pseudogene"; transcript_type "processed_transcript"; gene_name "DDX11L1"; transcript_name "DDX11L1-202"; +chr1 HAVANA exon 11869 12227 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; exon_id "ENSE00002234944"; exon_version "1"; exon_number "1"; gene_type "transcribed_unprocessed_pseudogene"; transcript_type "processed_transcript"; +chr1 HAVANA gene 65419 71585 . + . gene_id "ENSG00000186092"; gene_version "6"; gene_type "protein_coding"; gene_name "OR4F5"; +chr1 HAVANA transcript 65419 71585 . + . gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "OR4F5"; transcript_name "OR4F5-201"; +chr1 HAVANA CDS 65565 65573 . + 0 gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "1"; protein_id "ENSP00000493376"; protein_version "2"; gene_type "protein_coding"; transcript_type "protein_coding"; +chr1 HAVANA exon 65565 65573 . + . gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_id "ENSE00003812156"; exon_version "1"; exon_number "1"; gene_type "protein_coding"; transcript_type "protein_coding"; diff --git a/tests/test_gencode_gtf.py b/tests/test_gencode_gtf.py new file mode 100644 index 0000000..3dd3bd4 --- /dev/null +++ b/tests/test_gencode_gtf.py @@ -0,0 +1,207 @@ +"""Tests for GENCODE attribute aliases (#63) and version-column casting (#64).""" + +import logging +import os +import tempfile + +from gtfparse import GENCODE_BIOTYPE_ALIASES, INTEGER_VERSION_COLUMNS, read_gtf + +from .data import data_path + +GENCODE_GTF_PATH = data_path("gencode.head.gtf") + + +# ----------------------------- +# attribute_aliases (#63) +# ----------------------------- + + +def test_no_aliases_by_default_leaves_columns_alone(): + df = read_gtf(GENCODE_GTF_PATH, result_type="pandas") + # gene_type / transcript_type come through as-is, gene_biotype / transcript_biotype absent + assert "gene_type" in df.columns + assert "transcript_type" in df.columns + assert "gene_biotype" not in df.columns + assert "transcript_biotype" not in df.columns + + +def test_attribute_aliases_renames_gencode_to_ensembl(): + df = read_gtf( + GENCODE_GTF_PATH, + attribute_aliases=GENCODE_BIOTYPE_ALIASES, + result_type="pandas", + ) + assert "gene_biotype" in df.columns + assert "transcript_biotype" in df.columns + assert "gene_type" not in df.columns + assert "transcript_type" not in df.columns + # values are preserved + gene_rows = df[df["feature"] == "gene"] + assert set(gene_rows["gene_biotype"]) == { + "transcribed_unprocessed_pseudogene", + "protein_coding", + } + + +def test_attribute_aliases_with_polars_result_type(): + df = read_gtf( + GENCODE_GTF_PATH, + attribute_aliases=GENCODE_BIOTYPE_ALIASES, + result_type="polars", + ) + assert "gene_biotype" in df.columns + assert "gene_type" not in df.columns + + +def test_attribute_aliases_when_canonical_already_present_prefers_canonical(caplog): + """If both the alias and canonical column exist, drop the alias and warn.""" + # Hand-build a GTF where gene_biotype is already populated AND gene_type is also present + contents = ( + "chr1\tHAVANA\tgene\t1\t100\t.\t+\t.\t" + 'gene_id "ENSGX"; gene_type "alias_value"; gene_biotype "canonical_value";\n' + ) + fd, path = tempfile.mkstemp(suffix=".gtf") + try: + with os.fdopen(fd, "w") as fh: + fh.write(contents) + with caplog.at_level(logging.WARNING): + df = read_gtf( + path, + attribute_aliases={"gene_type": "gene_biotype"}, + result_type="pandas", + ) + assert "gene_biotype" in df.columns + assert "gene_type" not in df.columns + # canonical wins + assert list(df["gene_biotype"]) == ["canonical_value"] + # warning was logged + assert any("Both alias column" in rec.message for rec in caplog.records) + finally: + os.unlink(path) + + +def test_attribute_aliases_with_missing_alias_is_noop(): + """Aliases that don't appear in the file are silently skipped.""" + df = read_gtf( + GENCODE_GTF_PATH, + attribute_aliases={"never_in_file": "gene_biotype"}, + result_type="pandas", + ) + # gene_biotype did not exist and the alias did not exist -> still no gene_biotype + assert "gene_biotype" not in df.columns + + +def test_attribute_aliases_with_empty_dict_is_noop(): + df = read_gtf(GENCODE_GTF_PATH, attribute_aliases={}, result_type="pandas") + assert "gene_type" in df.columns + + +# ----------------------------- +# cast_version_columns (#64) +# ----------------------------- + + +def test_version_columns_cast_to_int_by_default(): + df = read_gtf(GENCODE_GTF_PATH, result_type="pandas") + # all four version columns should be present and Int64-typed (nullable) + for column_name in INTEGER_VERSION_COLUMNS: + assert column_name in df.columns, column_name + assert str(df[column_name].dtype) == "Int64", ( + f"{column_name} dtype is {df[column_name].dtype}, expected Int64" + ) + # spot-check actual values are integers, not strings + gene_rows = df[df["feature"] == "gene"].reset_index(drop=True) + assert gene_rows.loc[0, "gene_version"] == 5 + assert gene_rows.loc[1, "gene_version"] == 6 + + +def test_version_columns_handle_missing_values(): + """Rows where a version attribute is absent should produce pd.NA, not raise.""" + df = read_gtf(GENCODE_GTF_PATH, result_type="pandas") + # gene rows don't have transcript_version / protein_version / exon_version + gene_rows = df[df["feature"] == "gene"] + assert gene_rows["transcript_version"].isna().all() + assert gene_rows["protein_version"].isna().all() + assert gene_rows["exon_version"].isna().all() + # gene_version IS populated on gene rows + assert gene_rows["gene_version"].notna().all() + + +def test_cast_version_columns_false_keeps_strings(): + df = read_gtf(GENCODE_GTF_PATH, cast_version_columns=False, result_type="pandas") + # column is still object/string when opted out + assert df["gene_version"].dtype == object + gene_rows = df[df["feature"] == "gene"].reset_index(drop=True) + assert gene_rows.loc[0, "gene_version"] == "5" + + +def test_version_columns_present_in_polars_result(): + df = read_gtf(GENCODE_GTF_PATH, result_type="polars") + for column_name in INTEGER_VERSION_COLUMNS: + assert column_name in df.columns + + +def test_version_columns_missing_when_attribute_absent(): + """When the underlying GTF doesn't carry version attributes, the columns + simply don't exist — casting must not raise.""" + # The classic Ensembl release-75 fixture has no *_version attributes + df = read_gtf(data_path("ensembl_grch37.head.gtf"), result_type="pandas") + for column_name in INTEGER_VERSION_COLUMNS: + assert column_name not in df.columns + + +# ----------------------------- +# Interaction between #63 and #64 +# ----------------------------- + + +def test_aliases_and_version_casting_work_together(): + df = read_gtf( + GENCODE_GTF_PATH, + attribute_aliases=GENCODE_BIOTYPE_ALIASES, + result_type="pandas", + ) + assert "gene_biotype" in df.columns + assert "transcript_biotype" in df.columns + assert str(df["gene_version"].dtype) == "Int64" + transcript_rows = df[df["feature"] == "transcript"].reset_index(drop=True) + assert transcript_rows.loc[0, "transcript_biotype"] == "processed_transcript" + assert transcript_rows.loc[0, "transcript_version"] == 2 + + +def test_aliases_visible_to_infer_biotype_column(): + """attribute_aliases is applied before infer_biotype_column, so an aliased + gene_biotype must prevent the inference path from overwriting it.""" + df = read_gtf( + GENCODE_GTF_PATH, + attribute_aliases=GENCODE_BIOTYPE_ALIASES, + infer_biotype_column=True, + result_type="pandas", + ) + # Should keep the GENCODE-derived biotypes, not infer from "source" + gene_rows = df[df["feature"] == "gene"] + assert "protein_coding" in set(gene_rows["gene_biotype"]) + assert "transcribed_unprocessed_pseudogene" in set(gene_rows["gene_biotype"]) + # source column is HAVANA, definitely not a biotype value + assert "HAVANA" not in set(gene_rows["gene_biotype"]) + + +# ----------------------------- +# Sanity: constants are stable +# ----------------------------- + + +def test_gencode_biotype_aliases_constant(): + assert GENCODE_BIOTYPE_ALIASES == { + "gene_type": "gene_biotype", + "transcript_type": "transcript_biotype", + } + + +def test_integer_version_columns_constant(): + assert set(INTEGER_VERSION_COLUMNS) == { + "gene_version", + "transcript_version", + "protein_version", + "exon_version", + } From 948a496adeea9acac0bac8cba72461ad40daccbd Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Wed, 13 May 2026 11:13:28 -0400 Subject: [PATCH 2/3] Use pd.api.types.is_string_dtype to tolerate StringDtype on newer pandas Python 3.9 / 3.11 CI failed on test_cast_version_columns_false_keeps_strings because newer pandas returns its own StringDtype (rather than the numpy object dtype) for string-typed columns. Relax the assertion to pd.api.types.is_string_dtype which returns True for both. --- tests/test_gencode_gtf.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_gencode_gtf.py b/tests/test_gencode_gtf.py index 3dd3bd4..2b84fde 100644 --- a/tests/test_gencode_gtf.py +++ b/tests/test_gencode_gtf.py @@ -128,9 +128,13 @@ def test_version_columns_handle_missing_values(): def test_cast_version_columns_false_keeps_strings(): + import pandas as pd + df = read_gtf(GENCODE_GTF_PATH, cast_version_columns=False, result_type="pandas") - # column is still object/string when opted out - assert df["gene_version"].dtype == object + # column is still string-typed when opted out — accept either the legacy + # object dtype or pandas' newer StringDtype, depending on the installed + # pandas version. + assert pd.api.types.is_string_dtype(df["gene_version"]) gene_rows = df[df["feature"] == "gene"].reset_index(drop=True) assert gene_rows.loc[0, "gene_version"] == "5" From 668dfb0c18692940be01b8f0d22dbe6de12e4114 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Wed, 13 May 2026 11:25:22 -0400 Subject: [PATCH 3/3] Address review: bug fixes + real GENCODE coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes from PR review: 1. _apply_attribute_aliases: two aliases targeting the same canonical (e.g. both gene_type and a hypothetical gene_kind -> gene_biotype) silently produced a duplicate-column rename. Track renames in a running set so the second collision is flagged the same way as a pre-existing canonical (drop + warn). First in iteration order wins. 2. usecols + attribute_aliases: usecols was enforced at parse time (inside parse_gtf_and_expand_attributes) before aliases could run, so requesting `usecols=["gene_biotype"]` against a GENCODE GTF that only had gene_type returned an empty frame. Expand the parse-time filter to pull alias source columns through when their canonical is in usecols. End-of-function usecols filter still narrows down to the canonical name only. New test coverage (18 new tests, 54 total): * explicit attribute_aliases=None matches implicit default * result_type="dict" with both kwargs * multiple aliases -> same canonical: first wins, second warned+dropped * alias-chain semantics pinned (no chasing — each rule judged against the pre-rename column set) * aliasing preserves all unrelated columns * usecols composes with attribute_aliases * gene_version=0 round-trips as Int64(0), not pd.NA * malformed version string ("v3") coerces to pd.NA (lenient by design) * version cast idempotent across repeated reads (global polars cache safety) Real GENCODE fixture (tests/data/gencode.real.head.gtf) reflecting the on-disk shape — versioned IDs embedded in gene_id/transcript_id rather than separate *_version attrs, GENCODE-only fields (level, hgnc_id, havana_gene/transcript, tag, transcript_support_level), and chr-prefixed seqnames — plus 7 tests against it: * alias rename works on the real shape * no synthesized *_version columns when attrs absent * versioned ID strings preserved verbatim (ENSG/ENST/ENSP/ENSE) * GENCODE-only attribute fields all come through * chr1/chrM seqname convention preserved * gzipped input round-trips * infer_biotype_column stays silent on GENCODE (source is HAVANA, not a biotype value) — regression guard Ensembl regression checks: * old release-75 fixture still uses 'source' for biotype inference * passing GENCODE_BIOTYPE_ALIASES against a pure Ensembl GTF is a no-op --- gtfparse/read_gtf.py | 36 +++- tests/data/gencode.real.head.gtf | 13 ++ tests/test_gencode_gtf.py | 352 +++++++++++++++++++++++++++++++ 3 files changed, 396 insertions(+), 5 deletions(-) create mode 100644 tests/data/gencode.real.head.gtf diff --git a/gtfparse/read_gtf.py b/gtfparse/read_gtf.py index d7dfa58..23e2647 100644 --- a/gtfparse/read_gtf.py +++ b/gtfparse/read_gtf.py @@ -209,20 +209,26 @@ def _apply_attribute_aliases(result_df, attribute_aliases): """ Rename alias attribute columns onto canonical names in-place. - For each (alias -> canonical) pair: + For each (alias -> canonical) pair, in iteration order: * if only the alias is present, rename it to the canonical name. * if both are present, drop the alias and warn (canonical wins). * if neither is present, do nothing. + + When two aliases target the same canonical (e.g. both ``gene_type`` + and a hypothetical ``gene_kind`` map to ``gene_biotype``), the first + rename in iteration order wins; subsequent aliases targeting an + already-renamed canonical are treated as collisions, dropped, and + warned about. """ if not attribute_aliases: return result_df - existing = set(result_df.columns) + columns_present = set(result_df.columns) rename_map = {} drop_aliases = [] for alias, canonical in attribute_aliases.items(): - if alias not in existing: + if alias not in columns_present: continue - if canonical in existing: + if canonical in columns_present: logger.warning( "Both alias column '%s' and canonical column '%s' are present; " "dropping alias and keeping canonical values.", @@ -232,6 +238,11 @@ def _apply_attribute_aliases(result_df, attribute_aliases): drop_aliases.append(alias) else: rename_map[alias] = canonical + # Reflect the rename in the running column set so a later + # alias mapping to the same canonical sees the collision + # instead of silently producing a duplicate-named column. + columns_present.discard(alias) + columns_present.add(canonical) if drop_aliases: result_df = result_df.drop(columns=drop_aliases) if rename_map: @@ -323,9 +334,24 @@ def read_gtf( if type(filepath_or_buffer) is str and not exists(filepath_or_buffer): raise ValueError("GTF file does not exist: %s" % filepath_or_buffer) + # If usecols asks for a canonical column that's only present in the + # GTF under an alias name, expand the parse-time column filter to + # also pull the alias through — otherwise it gets dropped at parse + # time before _apply_attribute_aliases can see it. The end-of-function + # usecols filter still narrows the result down to the canonical name. + parse_usecols = usecols + if usecols is not None and attribute_aliases: + usecols_set = set(usecols) + parse_usecols = set(usecols_set) + for alias, canonical in attribute_aliases.items(): + if canonical in usecols_set: + parse_usecols.add(alias) + if expand_attribute_column: result_df = parse_gtf_and_expand_attributes( - filepath_or_buffer, restrict_attribute_columns=usecols, features=features + filepath_or_buffer, + restrict_attribute_columns=parse_usecols, + features=features, ) else: result_df = parse_gtf(result_df, features=features) diff --git a/tests/data/gencode.real.head.gtf b/tests/data/gencode.real.head.gtf new file mode 100644 index 0000000..802d507 --- /dev/null +++ b/tests/data/gencode.real.head.gtf @@ -0,0 +1,13 @@ +##description: evidence-based annotation of the human genome (GRCh38) - test excerpt +##provider: GENCODE +##contact: gencode-help@ebi.ac.uk +##format: gtf +##date: 2024-01-01 +chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; hgnc_id "HGNC:37102"; havana_gene "OTTHUMG00000000961.2"; +chr1 HAVANA transcript 11869 14409 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; transcript_type "processed_transcript"; gene_name "DDX11L1"; transcript_name "DDX11L1-202"; level 2; transcript_support_level "1"; hgnc_id "HGNC:37102"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 11869 12227 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; exon_number 1; exon_id "ENSE00002234944.1"; gene_type "transcribed_unprocessed_pseudogene"; transcript_type "processed_transcript"; gene_name "DDX11L1"; transcript_name "DDX11L1-202"; level 2; hgnc_id "HGNC:37102"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA gene 65419 71585 . + . gene_id "ENSG00000186092.7"; gene_type "protein_coding"; gene_name "OR4F5"; level 2; hgnc_id "HGNC:14825"; havana_gene "OTTHUMG00000001094.4"; +chr1 HAVANA transcript 65419 71585 . + . gene_id "ENSG00000186092.7"; transcript_id "ENST00000641515.2"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "OR4F5"; transcript_name "OR4F5-201"; level 2; tag "MANE_Select"; havana_gene "OTTHUMG00000001094.4"; +chr1 HAVANA CDS 65565 65573 . + 0 gene_id "ENSG00000186092.7"; transcript_id "ENST00000641515.2"; exon_number 1; protein_id "ENSP00000493376.2"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "OR4F5"; tag "MANE_Select"; +chr1 HAVANA exon 65565 65573 . + . gene_id "ENSG00000186092.7"; transcript_id "ENST00000641515.2"; exon_number 1; exon_id "ENSE00003812156.1"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "OR4F5"; tag "MANE_Select"; +chrM ENSEMBL gene 3307 4262 . + . gene_id "ENSG00000198888.2"; gene_type "protein_coding"; gene_name "MT-ND1"; level 3; diff --git a/tests/test_gencode_gtf.py b/tests/test_gencode_gtf.py index 2b84fde..061928a 100644 --- a/tests/test_gencode_gtf.py +++ b/tests/test_gencode_gtf.py @@ -1,5 +1,6 @@ """Tests for GENCODE attribute aliases (#63) and version-column casting (#64).""" +import gzip import logging import os import tempfile @@ -9,6 +10,7 @@ from .data import data_path GENCODE_GTF_PATH = data_path("gencode.head.gtf") +GENCODE_REAL_GTF_PATH = data_path("gencode.real.head.gtf") # ----------------------------- @@ -209,3 +211,353 @@ def test_integer_version_columns_constant(): "protein_version", "exon_version", } + + +# ----------------------------- +# Explicit-None defaults +# ----------------------------- + + +def test_attribute_aliases_none_is_explicit_noop(): + """The kwarg default is None; passing it explicitly must match + the implicit behavior (no rename, no error).""" + df_default = read_gtf(GENCODE_GTF_PATH, result_type="pandas") + df_explicit = read_gtf(GENCODE_GTF_PATH, attribute_aliases=None, result_type="pandas") + assert list(df_default.columns) == list(df_explicit.columns) + assert "gene_type" in df_explicit.columns + assert "gene_biotype" not in df_explicit.columns + + +# ----------------------------- +# result_type="dict" with new kwargs +# ----------------------------- + + +def test_attribute_aliases_with_dict_result_type(): + result = read_gtf( + GENCODE_GTF_PATH, + attribute_aliases=GENCODE_BIOTYPE_ALIASES, + result_type="dict", + ) + assert isinstance(result, dict) + assert "gene_biotype" in result + assert "transcript_biotype" in result + assert "gene_type" not in result + assert "transcript_type" not in result + + +def test_version_columns_with_dict_result_type(): + result = read_gtf(GENCODE_GTF_PATH, result_type="dict") + assert "gene_version" in result + # values come back as ints when present, pd.NA when missing + values = list(result["gene_version"].values()) + non_null = [v for v in values if v is not None and not _is_na(v)] + assert non_null and all(isinstance(v, int) for v in non_null), non_null + + +def _is_na(v): + # pd.NA isn't equal-comparable; use bool() guarded form + try: + return v != v # NaN-style: pd.NA also returns NA from != itself + except TypeError: + return True + + +# ----------------------------- +# Multi-alias and ordering edge cases +# ----------------------------- + + +def test_multiple_aliases_mapping_to_same_canonical_first_wins(caplog): + """Two aliases that both target the same canonical: the first in + iteration order is renamed; the second is treated as a conflict + and dropped. Prevents pandas producing two columns with the same + name.""" + contents = ( + "chr1\tHAVANA\tgene\t1\t100\t.\t+\t.\t" + 'gene_id "ENSGX"; gene_type "alias_a"; gene_kind "alias_b";\n' + ) + fd, path = tempfile.mkstemp(suffix=".gtf") + try: + with os.fdopen(fd, "w") as fh: + fh.write(contents) + with caplog.at_level(logging.WARNING): + df = read_gtf( + path, + # both aliases map to gene_biotype; gene_type comes first + attribute_aliases={ + "gene_type": "gene_biotype", + "gene_kind": "gene_biotype", + }, + result_type="pandas", + ) + # exactly one gene_biotype column, and the first alias's value wins + assert list(df.columns).count("gene_biotype") == 1 + assert "gene_type" not in df.columns + assert "gene_kind" not in df.columns + assert list(df["gene_biotype"]) == ["alias_a"] + # the collision was warned about + assert any("Both alias column" in rec.message for rec in caplog.records) + finally: + os.unlink(path) + + +def test_aliases_iteration_order_matters_for_chains(): + """When canonical of one rule is the alias of another, behavior + depends on iteration order against the original column set. This + test pins the current behavior: each rule is decided against the + pre-rename state, so chains aren't followed. Documents the contract.""" + contents = 'chr1\tHAVANA\tgene\t1\t100\t.\t+\t.\tgene_id "ENSGX"; a "1"; b "2";\n' + fd, path = tempfile.mkstemp(suffix=".gtf") + try: + with os.fdopen(fd, "w") as fh: + fh.write(contents) + # {"a": "b"} with b already in the file should drop a (both + # present, canonical wins). NOT chain to {"a": "b", "b": "c"}. + df = read_gtf( + path, + attribute_aliases={"a": "b"}, + result_type="pandas", + ) + assert "a" not in df.columns + assert "b" in df.columns + assert list(df["b"]) == ["2"] + finally: + os.unlink(path) + + +def test_aliases_preserves_other_attribute_columns(): + """Renaming one attribute must not perturb the others.""" + df = read_gtf( + GENCODE_GTF_PATH, + attribute_aliases=GENCODE_BIOTYPE_ALIASES, + result_type="pandas", + ) + # spot-check several unrelated columns survive + for col in ("gene_id", "transcript_id", "gene_name", "transcript_name"): + assert col in df.columns, col + + +# ----------------------------- +# usecols interaction +# ----------------------------- + + +def test_aliases_with_usecols_filtering_canonical_column(): + """usecols is applied AFTER aliasing, so requesting the canonical + column name returns the data even when the GTF only had the alias.""" + df = read_gtf( + GENCODE_GTF_PATH, + attribute_aliases=GENCODE_BIOTYPE_ALIASES, + usecols=["gene_biotype"], + result_type="pandas", + ) + assert list(df.columns) == ["gene_biotype"] + assert "protein_coding" in set(df["gene_biotype"]) + + +# ----------------------------- +# cast_version_columns edge cases +# ----------------------------- + + +def test_version_zero_is_preserved_not_treated_as_missing(): + """0 must round-trip as Int64(0), not become pd.NA — the + .replace("", None) call only targets empty strings.""" + contents = 'chr1\tHAVANA\tgene\t1\t100\t.\t+\t.\tgene_id "ENSGX"; gene_version "0";\n' + fd, path = tempfile.mkstemp(suffix=".gtf") + try: + with os.fdopen(fd, "w") as fh: + fh.write(contents) + df = read_gtf(path, result_type="pandas") + assert df.loc[0, "gene_version"] == 0 + assert not df["gene_version"].isna().iloc[0] + finally: + os.unlink(path) + + +def test_version_with_corrupted_non_integer_value_coerces_to_na(caplog): + """A malformed version string (e.g. 'v3' instead of '3') is + coerced to pd.NA rather than raising. Documents the lenient + behavior — corrupted-GTF data quality bugs become missing + values, not exceptions.""" + contents = 'chr1\tHAVANA\tgene\t1\t100\t.\t+\t.\tgene_id "ENSGX"; gene_version "v3";\n' + fd, path = tempfile.mkstemp(suffix=".gtf") + try: + with os.fdopen(fd, "w") as fh: + fh.write(contents) + df = read_gtf(path, result_type="pandas") + assert df["gene_version"].isna().iloc[0] + finally: + os.unlink(path) + + +def test_version_cast_is_idempotent_via_double_read(): + """Reading the same file twice with the cast on must produce the + same dtypes — sanity check that the cast doesn't accumulate + side effects via the global polars string cache or similar.""" + df1 = read_gtf(GENCODE_GTF_PATH, result_type="pandas") + df2 = read_gtf(GENCODE_GTF_PATH, result_type="pandas") + for col in INTEGER_VERSION_COLUMNS: + assert str(df1[col].dtype) == str(df2[col].dtype) == "Int64" + + +# ----------------------------- +# Real GENCODE-format fixture (versions in IDs, not in attributes) +# ----------------------------- + + +def test_real_gencode_format_alias_rename(): + """A GENCODE GTF in its real on-disk shape — versioned IDs + embedded in gene_id / transcript_id, GENCODE-only fields like + level / hgnc_id / havana_gene / tag — should normalize to + Ensembl-style biotype column names cleanly.""" + df = read_gtf( + GENCODE_REAL_GTF_PATH, + attribute_aliases=GENCODE_BIOTYPE_ALIASES, + result_type="pandas", + ) + assert "gene_biotype" in df.columns + assert "transcript_biotype" in df.columns + assert "gene_type" not in df.columns + gene_rows = df[df["feature"] == "gene"] + biotypes = set(gene_rows["gene_biotype"]) + assert biotypes == {"transcribed_unprocessed_pseudogene", "protein_coding"} + + +def test_real_gencode_format_has_no_separate_version_attrs(): + """Real GENCODE doesn't carry separate *_version attribute + fields — versions are baked into gene_id/transcript_id strings. + cast_version_columns must be a graceful no-op (not create empty + columns, not raise).""" + df = read_gtf(GENCODE_REAL_GTF_PATH, result_type="pandas") + for col in INTEGER_VERSION_COLUMNS: + assert col not in df.columns, ( + f"Real GENCODE has no '{col}' attribute; " + f"version casting should not have synthesized one." + ) + + +def test_real_gencode_format_preserves_versioned_id_strings(): + """The version baked into gene_id / transcript_id / protein_id / + exon_id must come through verbatim — those strings ARE the + canonical identifier in GENCODE.""" + df = read_gtf(GENCODE_REAL_GTF_PATH, result_type="pandas") + gene_ids = set(df["gene_id"]) + assert "ENSG00000223972.5" in gene_ids + assert "ENSG00000186092.7" in gene_ids + assert "ENSG00000198888.2" in gene_ids + # protein_id and exon_id likewise carry .N + protein_rows = df[df["feature"] == "CDS"] + assert "ENSP00000493376.2" in set(protein_rows["protein_id"]) + + +def test_real_gencode_format_preserves_gencode_only_fields(): + """GENCODE-specific attribute fields (level, hgnc_id, + havana_gene, havana_transcript, tag, transcript_support_level) + must survive parsing — they're useful downstream signal.""" + df = read_gtf(GENCODE_REAL_GTF_PATH, result_type="pandas") + for col in ( + "level", + "hgnc_id", + "havana_gene", + "tag", + "transcript_support_level", + "havana_transcript", + ): + assert col in df.columns, col + # spot-check values + transcript_rows = df[df["feature"] == "transcript"].reset_index(drop=True) + assert "MANE_Select" in set(df["tag"]) + assert "HGNC:14825" in set(df["hgnc_id"]) + # transcript_support_level is "1" (string — we deliberately don't + # auto-cast it since GENCODE allows non-numeric values like "NA") + assert "1" in set(transcript_rows["transcript_support_level"]) + + +def test_real_gencode_format_uses_chr_prefixed_seqnames(): + """GENCODE uses 'chr1' / 'chrM' while Ensembl uses bare '1' / + 'MT'. Confirming the seqname comes through unchanged so + downstream tooling can normalize if needed.""" + df = read_gtf(GENCODE_REAL_GTF_PATH, result_type="pandas") + seqnames = set(df["seqname"]) + assert seqnames <= {"chr1", "chrM"} + assert "chr1" in seqnames + assert "chrM" in seqnames + + +def test_real_gencode_format_via_gzip_compressed_input(): + """End-to-end: gzip a real-format GENCODE GTF, point read_gtf at + the .gz, confirm aliasing + parsing both still work.""" + with open(GENCODE_REAL_GTF_PATH, "rb") as src: + raw = src.read() + fd, gz_path = tempfile.mkstemp(suffix=".gtf.gz") + os.close(fd) + try: + with gzip.open(gz_path, "wb") as out: + out.write(raw) + df = read_gtf( + gz_path, + attribute_aliases=GENCODE_BIOTYPE_ALIASES, + result_type="pandas", + ) + assert "gene_biotype" in df.columns + assert "ENSG00000186092.7" in set(df["gene_id"]) + finally: + os.unlink(gz_path) + + +def test_real_gencode_format_with_infer_biotype_column_is_safe(): + """infer_biotype_column would normally fire if 'protein_coding' + appears in the 'source' column. Real GENCODE's source is + 'HAVANA' / 'ENSEMBL', not 'protein_coding', so inference + correctly stays silent. Regression guard against accidentally + triggering it on GENCODE input.""" + df = read_gtf( + GENCODE_REAL_GTF_PATH, + attribute_aliases=GENCODE_BIOTYPE_ALIASES, + infer_biotype_column=True, + result_type="pandas", + ) + # The aliased gene_biotype should be the authoritative one + gene_rows = df[df["feature"] == "gene"] + assert set(gene_rows["gene_biotype"]) == { + "transcribed_unprocessed_pseudogene", + "protein_coding", + } + + +# ----------------------------- +# Ensembl regression checks +# ----------------------------- + + +def test_old_ensembl_fixture_still_uses_source_for_biotype_inference(): + """Ensembl release-75 puts biotype into the 'source' column. With + infer_biotype_column=True the fix should still populate + gene_biotype / transcript_biotype from source. Make sure the new + kwargs don't break that legacy path.""" + df = read_gtf( + data_path("ensembl_grch37.head.gtf"), + infer_biotype_column=True, + result_type="pandas", + ) + assert "gene_biotype" in df.columns + assert "transcript_biotype" in df.columns + # The release-75 fixture has pseudogenes and protein_coding genes + assert "protein_coding" in set(df["gene_biotype"]) + + +def test_ensembl_attribute_aliases_no_op_when_aliases_absent(): + """Passing GENCODE_BIOTYPE_ALIASES against a pure Ensembl file + that doesn't have gene_type / transcript_type must do nothing + — no spurious columns, no exceptions.""" + df = read_gtf( + data_path("ensembl_grch37.head.gtf"), + attribute_aliases=GENCODE_BIOTYPE_ALIASES, + result_type="pandas", + ) + # gene_biotype exists in this fixture; aliasing didn't fight with it + assert "gene_biotype" in df.columns + assert "gene_type" not in df.columns + assert "transcript_type" not in df.columns