From 83caf8b76dcf6bc542ab1921474f870714a8453a Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Wed, 13 May 2026 11:05:06 -0400
Subject: [PATCH 1/3] Add attribute_aliases + cast_version_columns to read_gtf

Closes #63: `attribute_aliases` lets callers normalize GENCODE-format
GTFs onto Ensembl column names (`gene_type` -> `gene_biotype`,
`transcript_type` -> `transcript_biotype`) at parse time. New
`GENCODE_BIOTYPE_ALIASES` constant exports the canonical pair.

Closes #64: `cast_version_columns=True` (default) casts the four
well-known Ensembl `*_version` attributes (gene/transcript/protein/exon)
from strings to pandas nullable Int64 when present, so downstream
consumers like pyensembl don't have to `int(...)` themselves. Pass
`cast_version_columns=False` to opt out and keep them as strings.

Adds tests/data/gencode.head.gtf (synthetic GENCODE-style fixture
carrying both alias columns and separate *_version attributes) plus
test_gencode_gtf.py covering: alias rename, alias-vs-canonical conflict,
missing alias no-op, opt-out, missing-attribute no-op, polars/pandas
result types, and interaction with infer_biotype_column.

Bump version to 2.7.0.
---
 gtfparse/__init__.py        |   6 +-
 gtfparse/read_gtf.py        |  98 +++++++++++++++++
 tests/data/gencode.head.gtf |  10 ++
 tests/test_gencode_gtf.py   | 207 ++++++++++++++++++++++++++++++++++++
 4 files changed, 320 insertions(+), 1 deletion(-)
 create mode 100644 tests/data/gencode.head.gtf
 create mode 100644 tests/test_gencode_gtf.py

diff --git a/gtfparse/__init__.py b/gtfparse/__init__.py
index 3a46b2c..bb8908b 100644
--- a/gtfparse/__init__.py
+++ b/gtfparse/__init__.py
@@ -14,6 +14,8 @@
 from .create_missing_features import create_missing_features
 from .parsing_error import ParsingError
 from .read_gtf import (
+    GENCODE_BIOTYPE_ALIASES,
+    INTEGER_VERSION_COLUMNS,
     REQUIRED_COLUMNS,
     parse_gtf,
     parse_gtf_and_expand_attributes,
@@ -21,9 +23,11 @@
     read_gtf,
 )
 
-__version__ = "2.6.3"
+__version__ = "2.7.0"
 
 __all__ = [
+    "GENCODE_BIOTYPE_ALIASES",
+    "INTEGER_VERSION_COLUMNS",
     "REQUIRED_COLUMNS",
     "ParsingError",
     "__version__",
diff --git a/gtfparse/read_gtf.py b/gtfparse/read_gtf.py
index fdf2d4a..d7dfa58 100644
--- a/gtfparse/read_gtf.py
+++ b/gtfparse/read_gtf.py
@@ -13,6 +13,7 @@
 import logging
 from os.path import exists
 
+import pandas as pd
 import polars
 
 from .attribute_parsing import expand_attribute_strings
@@ -22,6 +23,27 @@
 logger = logging.getLogger(__name__)
 
 
+# GENCODE GTFs use *_type where Ensembl GTFs use *_biotype. Pass this
+# (or a superset) as `attribute_aliases` to read_gtf to normalize a
+# GENCODE-format GTF onto the Ensembl column names that downstream
+# tools like pyensembl expect.
+GENCODE_BIOTYPE_ALIASES = {
+    "gene_type": "gene_biotype",
+    "transcript_type": "transcript_biotype",
+}
+
+
+# Ensembl-style attribute columns that are always integer-valued when
+# present. read_gtf casts these from string to pandas nullable Int64 by
+# default; pass cast_version_columns=False to keep them as strings.
+INTEGER_VERSION_COLUMNS = (
+    "gene_version",
+    "transcript_version",
+    "protein_version",
+    "exon_version",
+)
+
+
 """
 Columns of a GTF file:
 
@@ -183,6 +205,54 @@ def parse_gtf_and_expand_attributes(
     )
 
 
+def _apply_attribute_aliases(result_df, attribute_aliases):
+    """
+    Rename alias attribute columns onto canonical names in-place.
+
+    For each (alias -> canonical) pair:
+      * if only the alias is present, rename it to the canonical name.
+      * if both are present, drop the alias and warn (canonical wins).
+      * if neither is present, do nothing.
+    """
+    if not attribute_aliases:
+        return result_df
+    existing = set(result_df.columns)
+    rename_map = {}
+    drop_aliases = []
+    for alias, canonical in attribute_aliases.items():
+        if alias not in existing:
+            continue
+        if canonical in existing:
+            logger.warning(
+                "Both alias column '%s' and canonical column '%s' are present; "
+                "dropping alias and keeping canonical values.",
+                alias,
+                canonical,
+            )
+            drop_aliases.append(alias)
+        else:
+            rename_map[alias] = canonical
+    if drop_aliases:
+        result_df = result_df.drop(columns=drop_aliases)
+    if rename_map:
+        result_df = result_df.rename(columns=rename_map)
+    return result_df
+
+
+def _cast_version_columns(result_df, version_columns=INTEGER_VERSION_COLUMNS):
+    """
+    Cast known Ensembl *_version attribute columns from strings to
+    pandas nullable Int64 in-place. Missing/empty values become pd.NA.
+    """
+    for column_name in version_columns:
+        if column_name not in result_df.columns:
+            continue
+        result_df[column_name] = pd.to_numeric(
+            result_df[column_name].replace("", None), errors="coerce"
+        ).astype("Int64")
+    return result_df
+
+
 def read_gtf(
     filepath_or_buffer,
     expand_attribute_column=True,
@@ -192,6 +262,8 @@ def read_gtf(
     usecols=None,
     features=None,
     result_type="polars",
+    attribute_aliases=None,
+    cast_version_columns=True,
 ):
     """
     Parse a GTF into a dictionary mapping column names to sequences of values.
@@ -231,6 +303,22 @@ def read_gtf(
     result_type : One of 'polars', 'pandas', or 'dict'
         Default behavior is to return a Polars DataFrame, but will convert to
         Pandas DataFrame or dictionary if specified.
+
+    attribute_aliases : dict of str -> str, optional
+        Maps alias attribute names onto canonical ones. After attributes
+        are expanded into columns, each alias column is renamed to its
+        canonical name when the canonical column is absent. If both are
+        present the alias is dropped and a warning is logged. Pass
+        `GENCODE_BIOTYPE_ALIASES` to normalize a GENCODE GTF's
+        `gene_type`/`transcript_type` onto Ensembl's
+        `gene_biotype`/`transcript_biotype`.
+
+    cast_version_columns : bool
+        When True (default), cast the well-known integer version
+        attribute columns (`gene_version`, `transcript_version`,
+        `protein_version`, `exon_version`) from strings to pandas
+        nullable Int64 when present. Set to False to keep them as
+        strings.
     """
     if type(filepath_or_buffer) is str and not exists(filepath_or_buffer):
         raise ValueError("GTF file does not exist: %s" % filepath_or_buffer)
@@ -267,6 +355,16 @@ def wrapped_fn(x):
                 column_type = column_cast_types[column_name]
                 result_df[column_name] = result_df[column_name].astype(column_type)
 
+    # Rename alias attribute columns onto their canonical names. Done before
+    # infer_biotype_column so an aliased gene_biotype/transcript_biotype is
+    # visible to the inference logic.
+    result_df = _apply_attribute_aliases(result_df, attribute_aliases)
+
+    # Cast Ensembl *_version columns from strings to nullable integers so
+    # downstream consumers (e.g. pyensembl) don't have to int(...) themselves.
+    if cast_version_columns:
+        result_df = _cast_version_columns(result_df)
+
     # Hackishly infer whether the values in the 'source' column of this GTF
     # are actually representing a biotype by checking for the most common
     # gene_biotype and transcript_biotype value 'protein_coding'
diff --git a/tests/data/gencode.head.gtf b/tests/data/gencode.head.gtf
new file mode 100644
index 0000000..a39b34e
--- /dev/null
+++ b/tests/data/gencode.head.gtf
@@ -0,0 +1,10 @@
+##description: GENCODE-style synthetic head for gtfparse tests
+##provider: gtfparse-tests
+##format: gtf
+chr1	HAVANA	gene	11869	14409	.	+	.	gene_id "ENSG00000223972"; gene_version "5"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1";
+chr1	HAVANA	transcript	11869	14409	.	+	.	gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; gene_type "transcribed_unprocessed_pseudogene"; transcript_type "processed_transcript"; gene_name "DDX11L1"; transcript_name "DDX11L1-202";
+chr1	HAVANA	exon	11869	12227	.	+	.	gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; exon_id "ENSE00002234944"; exon_version "1"; exon_number "1"; gene_type "transcribed_unprocessed_pseudogene"; transcript_type "processed_transcript";
+chr1	HAVANA	gene	65419	71585	.	+	.	gene_id "ENSG00000186092"; gene_version "6"; gene_type "protein_coding"; gene_name "OR4F5";
+chr1	HAVANA	transcript	65419	71585	.	+	.	gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "OR4F5"; transcript_name "OR4F5-201";
+chr1	HAVANA	CDS	65565	65573	.	+	0	gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "1"; protein_id "ENSP00000493376"; protein_version "2"; gene_type "protein_coding"; transcript_type "protein_coding";
+chr1	HAVANA	exon	65565	65573	.	+	.	gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_id "ENSE00003812156"; exon_version "1"; exon_number "1"; gene_type "protein_coding"; transcript_type "protein_coding";
diff --git a/tests/test_gencode_gtf.py b/tests/test_gencode_gtf.py
new file mode 100644
index 0000000..3dd3bd4
--- /dev/null
+++ b/tests/test_gencode_gtf.py
@@ -0,0 +1,207 @@
+"""Tests for GENCODE attribute aliases (#63) and version-column casting (#64)."""
+
+import logging
+import os
+import tempfile
+
+from gtfparse import GENCODE_BIOTYPE_ALIASES, INTEGER_VERSION_COLUMNS, read_gtf
+
+from .data import data_path
+
+GENCODE_GTF_PATH = data_path("gencode.head.gtf")
+
+
+# -----------------------------
+# attribute_aliases (#63)
+# -----------------------------
+
+
+def test_no_aliases_by_default_leaves_columns_alone():
+    df = read_gtf(GENCODE_GTF_PATH, result_type="pandas")
+    # gene_type / transcript_type come through as-is, gene_biotype / transcript_biotype absent
+    assert "gene_type" in df.columns
+    assert "transcript_type" in df.columns
+    assert "gene_biotype" not in df.columns
+    assert "transcript_biotype" not in df.columns
+
+
+def test_attribute_aliases_renames_gencode_to_ensembl():
+    df = read_gtf(
+        GENCODE_GTF_PATH,
+        attribute_aliases=GENCODE_BIOTYPE_ALIASES,
+        result_type="pandas",
+    )
+    assert "gene_biotype" in df.columns
+    assert "transcript_biotype" in df.columns
+    assert "gene_type" not in df.columns
+    assert "transcript_type" not in df.columns
+    # values are preserved
+    gene_rows = df[df["feature"] == "gene"]
+    assert set(gene_rows["gene_biotype"]) == {
+        "transcribed_unprocessed_pseudogene",
+        "protein_coding",
+    }
+
+
+def test_attribute_aliases_with_polars_result_type():
+    df = read_gtf(
+        GENCODE_GTF_PATH,
+        attribute_aliases=GENCODE_BIOTYPE_ALIASES,
+        result_type="polars",
+    )
+    assert "gene_biotype" in df.columns
+    assert "gene_type" not in df.columns
+
+
+def test_attribute_aliases_when_canonical_already_present_prefers_canonical(caplog):
+    """If both the alias and canonical column exist, drop the alias and warn."""
+    # Hand-build a GTF where gene_biotype is already populated AND gene_type is also present
+    contents = (
+        "chr1\tHAVANA\tgene\t1\t100\t.\t+\t.\t"
+        'gene_id "ENSGX"; gene_type "alias_value"; gene_biotype "canonical_value";\n'
+    )
+    fd, path = tempfile.mkstemp(suffix=".gtf")
+    try:
+        with os.fdopen(fd, "w") as fh:
+            fh.write(contents)
+        with caplog.at_level(logging.WARNING):
+            df = read_gtf(
+                path,
+                attribute_aliases={"gene_type": "gene_biotype"},
+                result_type="pandas",
+            )
+        assert "gene_biotype" in df.columns
+        assert "gene_type" not in df.columns
+        # canonical wins
+        assert list(df["gene_biotype"]) == ["canonical_value"]
+        # warning was logged
+        assert any("Both alias column" in rec.message for rec in caplog.records)
+    finally:
+        os.unlink(path)
+
+
+def test_attribute_aliases_with_missing_alias_is_noop():
+    """Aliases that don't appear in the file are silently skipped."""
+    df = read_gtf(
+        GENCODE_GTF_PATH,
+        attribute_aliases={"never_in_file": "gene_biotype"},
+        result_type="pandas",
+    )
+    # gene_biotype did not exist and the alias did not exist -> still no gene_biotype
+    assert "gene_biotype" not in df.columns
+
+
+def test_attribute_aliases_with_empty_dict_is_noop():
+    df = read_gtf(GENCODE_GTF_PATH, attribute_aliases={}, result_type="pandas")
+    assert "gene_type" in df.columns
+
+
+# -----------------------------
+# cast_version_columns (#64)
+# -----------------------------
+
+
+def test_version_columns_cast_to_int_by_default():
+    df = read_gtf(GENCODE_GTF_PATH, result_type="pandas")
+    # all four version columns should be present and Int64-typed (nullable)
+    for column_name in INTEGER_VERSION_COLUMNS:
+        assert column_name in df.columns, column_name
+        assert str(df[column_name].dtype) == "Int64", (
+            f"{column_name} dtype is {df[column_name].dtype}, expected Int64"
+        )
+    # spot-check actual values are integers, not strings
+    gene_rows = df[df["feature"] == "gene"].reset_index(drop=True)
+    assert gene_rows.loc[0, "gene_version"] == 5
+    assert gene_rows.loc[1, "gene_version"] == 6
+
+
+def test_version_columns_handle_missing_values():
+    """Rows where a version attribute is absent should produce pd.NA, not raise."""
+    df = read_gtf(GENCODE_GTF_PATH, result_type="pandas")
+    # gene rows don't have transcript_version / protein_version / exon_version
+    gene_rows = df[df["feature"] == "gene"]
+    assert gene_rows["transcript_version"].isna().all()
+    assert gene_rows["protein_version"].isna().all()
+    assert gene_rows["exon_version"].isna().all()
+    # gene_version IS populated on gene rows
+    assert gene_rows["gene_version"].notna().all()
+
+
+def test_cast_version_columns_false_keeps_strings():
+    df = read_gtf(GENCODE_GTF_PATH, cast_version_columns=False, result_type="pandas")
+    # column is still object/string when opted out
+    assert df["gene_version"].dtype == object
+    gene_rows = df[df["feature"] == "gene"].reset_index(drop=True)
+    assert gene_rows.loc[0, "gene_version"] == "5"
+
+
+def test_version_columns_present_in_polars_result():
+    df = read_gtf(GENCODE_GTF_PATH, result_type="polars")
+    for column_name in INTEGER_VERSION_COLUMNS:
+        assert column_name in df.columns
+
+
+def test_version_columns_missing_when_attribute_absent():
+    """When the underlying GTF doesn't carry version attributes, the columns
+    simply don't exist — casting must not raise."""
+    # The classic Ensembl release-75 fixture has no *_version attributes
+    df = read_gtf(data_path("ensembl_grch37.head.gtf"), result_type="pandas")
+    for column_name in INTEGER_VERSION_COLUMNS:
+        assert column_name not in df.columns
+
+
+# -----------------------------
+# Interaction between #63 and #64
+# -----------------------------
+
+
+def test_aliases_and_version_casting_work_together():
+    df = read_gtf(
+        GENCODE_GTF_PATH,
+        attribute_aliases=GENCODE_BIOTYPE_ALIASES,
+        result_type="pandas",
+    )
+    assert "gene_biotype" in df.columns
+    assert "transcript_biotype" in df.columns
+    assert str(df["gene_version"].dtype) == "Int64"
+    transcript_rows = df[df["feature"] == "transcript"].reset_index(drop=True)
+    assert transcript_rows.loc[0, "transcript_biotype"] == "processed_transcript"
+    assert transcript_rows.loc[0, "transcript_version"] == 2
+
+
+def test_aliases_visible_to_infer_biotype_column():
+    """attribute_aliases is applied before infer_biotype_column, so an aliased
+    gene_biotype must prevent the inference path from overwriting it."""
+    df = read_gtf(
+        GENCODE_GTF_PATH,
+        attribute_aliases=GENCODE_BIOTYPE_ALIASES,
+        infer_biotype_column=True,
+        result_type="pandas",
+    )
+    # Should keep the GENCODE-derived biotypes, not infer from "source"
+    gene_rows = df[df["feature"] == "gene"]
+    assert "protein_coding" in set(gene_rows["gene_biotype"])
+    assert "transcribed_unprocessed_pseudogene" in set(gene_rows["gene_biotype"])
+    # source column is HAVANA, definitely not a biotype value
+    assert "HAVANA" not in set(gene_rows["gene_biotype"])
+
+
+# -----------------------------
+# Sanity: constants are stable
+# -----------------------------
+
+
+def test_gencode_biotype_aliases_constant():
+    assert GENCODE_BIOTYPE_ALIASES == {
+        "gene_type": "gene_biotype",
+        "transcript_type": "transcript_biotype",
+    }
+
+
+def test_integer_version_columns_constant():
+    assert set(INTEGER_VERSION_COLUMNS) == {
+        "gene_version",
+        "transcript_version",
+        "protein_version",
+        "exon_version",
+    }

From 948a496adeea9acac0bac8cba72461ad40daccbd Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Wed, 13 May 2026 11:13:28 -0400
Subject: [PATCH 2/3] Use pd.api.types.is_string_dtype to tolerate StringDtype
 on newer pandas

Python 3.9 / 3.11 CI failed on test_cast_version_columns_false_keeps_strings
because newer pandas returns its own StringDtype (rather than the numpy
object dtype) for string-typed columns. Relax the assertion to
pd.api.types.is_string_dtype which returns True for both.
---
 tests/test_gencode_gtf.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/test_gencode_gtf.py b/tests/test_gencode_gtf.py
index 3dd3bd4..2b84fde 100644
--- a/tests/test_gencode_gtf.py
+++ b/tests/test_gencode_gtf.py
@@ -128,9 +128,13 @@ def test_version_columns_handle_missing_values():
 
 
 def test_cast_version_columns_false_keeps_strings():
+    import pandas as pd
+
     df = read_gtf(GENCODE_GTF_PATH, cast_version_columns=False, result_type="pandas")
-    # column is still object/string when opted out
-    assert df["gene_version"].dtype == object
+    # column is still string-typed when opted out — accept either the legacy
+    # object dtype or pandas' newer StringDtype, depending on the installed
+    # pandas version.
+    assert pd.api.types.is_string_dtype(df["gene_version"])
     gene_rows = df[df["feature"] == "gene"].reset_index(drop=True)
     assert gene_rows.loc[0, "gene_version"] == "5"
 

From 668dfb0c18692940be01b8f0d22dbe6de12e4114 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Wed, 13 May 2026 11:25:22 -0400
Subject: [PATCH 3/3] Address review: bug fixes + real GENCODE coverage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes from PR review:

1. _apply_attribute_aliases: two aliases targeting the same canonical
   (e.g. both gene_type and a hypothetical gene_kind -> gene_biotype)
   silently produced a duplicate-column rename. Track renames in a
   running set so the second collision is flagged the same way as a
   pre-existing canonical (drop + warn). First in iteration order wins.

2. usecols + attribute_aliases: usecols was enforced at parse time
   (inside parse_gtf_and_expand_attributes) before aliases could run,
   so requesting `usecols=["gene_biotype"]` against a GENCODE GTF that
   only had gene_type returned an empty frame. Expand the parse-time
   filter to pull alias source columns through when their canonical is
   in usecols. End-of-function usecols filter still narrows down to
   the canonical name only.

New test coverage (18 new tests, 54 total):

  * explicit attribute_aliases=None matches implicit default
  * result_type="dict" with both kwargs
  * multiple aliases -> same canonical: first wins, second warned+dropped
  * alias-chain semantics pinned (no chasing — each rule judged against
    the pre-rename column set)
  * aliasing preserves all unrelated columns
  * usecols composes with attribute_aliases
  * gene_version=0 round-trips as Int64(0), not pd.NA
  * malformed version string ("v3") coerces to pd.NA (lenient by design)
  * version cast idempotent across repeated reads (global polars cache safety)

Real GENCODE fixture (tests/data/gencode.real.head.gtf) reflecting the
on-disk shape — versioned IDs embedded in gene_id/transcript_id rather
than separate *_version attrs, GENCODE-only fields (level, hgnc_id,
havana_gene/transcript, tag, transcript_support_level), and chr-prefixed
seqnames — plus 7 tests against it:

  * alias rename works on the real shape
  * no synthesized *_version columns when attrs absent
  * versioned ID strings preserved verbatim (ENSG/ENST/ENSP/ENSE)
  * GENCODE-only attribute fields all come through
  * chr1/chrM seqname convention preserved
  * gzipped input round-trips
  * infer_biotype_column stays silent on GENCODE (source is HAVANA, not
    a biotype value) — regression guard

Ensembl regression checks:
  * old release-75 fixture still uses 'source' for biotype inference
  * passing GENCODE_BIOTYPE_ALIASES against a pure Ensembl GTF is a no-op
---
 gtfparse/read_gtf.py             |  36 +++-
 tests/data/gencode.real.head.gtf |  13 ++
 tests/test_gencode_gtf.py        | 352 +++++++++++++++++++++++++++++++
 3 files changed, 396 insertions(+), 5 deletions(-)
 create mode 100644 tests/data/gencode.real.head.gtf

diff --git a/gtfparse/read_gtf.py b/gtfparse/read_gtf.py
index d7dfa58..23e2647 100644
--- a/gtfparse/read_gtf.py
+++ b/gtfparse/read_gtf.py
@@ -209,20 +209,26 @@ def _apply_attribute_aliases(result_df, attribute_aliases):
     """
     Rename alias attribute columns onto canonical names in-place.
 
-    For each (alias -> canonical) pair:
+    For each (alias -> canonical) pair, in iteration order:
       * if only the alias is present, rename it to the canonical name.
       * if both are present, drop the alias and warn (canonical wins).
       * if neither is present, do nothing.
+
+    When two aliases target the same canonical (e.g. both ``gene_type``
+    and a hypothetical ``gene_kind`` map to ``gene_biotype``), the first
+    rename in iteration order wins; subsequent aliases targeting an
+    already-renamed canonical are treated as collisions, dropped, and
+    warned about.
     """
     if not attribute_aliases:
         return result_df
-    existing = set(result_df.columns)
+    columns_present = set(result_df.columns)
     rename_map = {}
     drop_aliases = []
     for alias, canonical in attribute_aliases.items():
-        if alias not in existing:
+        if alias not in columns_present:
             continue
-        if canonical in existing:
+        if canonical in columns_present:
             logger.warning(
                 "Both alias column '%s' and canonical column '%s' are present; "
                 "dropping alias and keeping canonical values.",
@@ -232,6 +238,11 @@ def _apply_attribute_aliases(result_df, attribute_aliases):
             drop_aliases.append(alias)
         else:
             rename_map[alias] = canonical
+            # Reflect the rename in the running column set so a later
+            # alias mapping to the same canonical sees the collision
+            # instead of silently producing a duplicate-named column.
+            columns_present.discard(alias)
+            columns_present.add(canonical)
     if drop_aliases:
         result_df = result_df.drop(columns=drop_aliases)
     if rename_map:
@@ -323,9 +334,24 @@ def read_gtf(
     if type(filepath_or_buffer) is str and not exists(filepath_or_buffer):
         raise ValueError("GTF file does not exist: %s" % filepath_or_buffer)
 
+    # If usecols asks for a canonical column that's only present in the
+    # GTF under an alias name, expand the parse-time column filter to
+    # also pull the alias through — otherwise it gets dropped at parse
+    # time before _apply_attribute_aliases can see it. The end-of-function
+    # usecols filter still narrows the result down to the canonical name.
+    parse_usecols = usecols
+    if usecols is not None and attribute_aliases:
+        usecols_set = set(usecols)
+        parse_usecols = set(usecols_set)
+        for alias, canonical in attribute_aliases.items():
+            if canonical in usecols_set:
+                parse_usecols.add(alias)
+
     if expand_attribute_column:
         result_df = parse_gtf_and_expand_attributes(
-            filepath_or_buffer, restrict_attribute_columns=usecols, features=features
+            filepath_or_buffer,
+            restrict_attribute_columns=parse_usecols,
+            features=features,
         )
     else:
         result_df = parse_gtf(result_df, features=features)
diff --git a/tests/data/gencode.real.head.gtf b/tests/data/gencode.real.head.gtf
new file mode 100644
index 0000000..802d507
--- /dev/null
+++ b/tests/data/gencode.real.head.gtf
@@ -0,0 +1,13 @@
+##description: evidence-based annotation of the human genome (GRCh38) - test excerpt
+##provider: GENCODE
+##contact: gencode-help@ebi.ac.uk
+##format: gtf
+##date: 2024-01-01
+chr1	HAVANA	gene	11869	14409	.	+	.	gene_id "ENSG00000223972.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; hgnc_id "HGNC:37102"; havana_gene "OTTHUMG00000000961.2";
+chr1	HAVANA	transcript	11869	14409	.	+	.	gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; transcript_type "processed_transcript"; gene_name "DDX11L1"; transcript_name "DDX11L1-202"; level 2; transcript_support_level "1"; hgnc_id "HGNC:37102"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1";
+chr1	HAVANA	exon	11869	12227	.	+	.	gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; exon_number 1; exon_id "ENSE00002234944.1"; gene_type "transcribed_unprocessed_pseudogene"; transcript_type "processed_transcript"; gene_name "DDX11L1"; transcript_name "DDX11L1-202"; level 2; hgnc_id "HGNC:37102"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1";
+chr1	HAVANA	gene	65419	71585	.	+	.	gene_id "ENSG00000186092.7"; gene_type "protein_coding"; gene_name "OR4F5"; level 2; hgnc_id "HGNC:14825"; havana_gene "OTTHUMG00000001094.4";
+chr1	HAVANA	transcript	65419	71585	.	+	.	gene_id "ENSG00000186092.7"; transcript_id "ENST00000641515.2"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "OR4F5"; transcript_name "OR4F5-201"; level 2; tag "MANE_Select"; havana_gene "OTTHUMG00000001094.4";
+chr1	HAVANA	CDS	65565	65573	.	+	0	gene_id "ENSG00000186092.7"; transcript_id "ENST00000641515.2"; exon_number 1; protein_id "ENSP00000493376.2"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "OR4F5"; tag "MANE_Select";
+chr1	HAVANA	exon	65565	65573	.	+	.	gene_id "ENSG00000186092.7"; transcript_id "ENST00000641515.2"; exon_number 1; exon_id "ENSE00003812156.1"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "OR4F5"; tag "MANE_Select";
+chrM	ENSEMBL	gene	3307	4262	.	+	.	gene_id "ENSG00000198888.2"; gene_type "protein_coding"; gene_name "MT-ND1"; level 3;
diff --git a/tests/test_gencode_gtf.py b/tests/test_gencode_gtf.py
index 2b84fde..061928a 100644
--- a/tests/test_gencode_gtf.py
+++ b/tests/test_gencode_gtf.py
@@ -1,5 +1,6 @@
 """Tests for GENCODE attribute aliases (#63) and version-column casting (#64)."""
 
+import gzip
 import logging
 import os
 import tempfile
@@ -9,6 +10,7 @@
 from .data import data_path
 
 GENCODE_GTF_PATH = data_path("gencode.head.gtf")
+GENCODE_REAL_GTF_PATH = data_path("gencode.real.head.gtf")
 
 
 # -----------------------------
@@ -209,3 +211,353 @@ def test_integer_version_columns_constant():
         "protein_version",
         "exon_version",
     }
+
+
+# -----------------------------
+# Explicit-None defaults
+# -----------------------------
+
+
+def test_attribute_aliases_none_is_explicit_noop():
+    """The kwarg default is None; passing it explicitly must match
+    the implicit behavior (no rename, no error)."""
+    df_default = read_gtf(GENCODE_GTF_PATH, result_type="pandas")
+    df_explicit = read_gtf(GENCODE_GTF_PATH, attribute_aliases=None, result_type="pandas")
+    assert list(df_default.columns) == list(df_explicit.columns)
+    assert "gene_type" in df_explicit.columns
+    assert "gene_biotype" not in df_explicit.columns
+
+
+# -----------------------------
+# result_type="dict" with new kwargs
+# -----------------------------
+
+
+def test_attribute_aliases_with_dict_result_type():
+    result = read_gtf(
+        GENCODE_GTF_PATH,
+        attribute_aliases=GENCODE_BIOTYPE_ALIASES,
+        result_type="dict",
+    )
+    assert isinstance(result, dict)
+    assert "gene_biotype" in result
+    assert "transcript_biotype" in result
+    assert "gene_type" not in result
+    assert "transcript_type" not in result
+
+
+def test_version_columns_with_dict_result_type():
+    result = read_gtf(GENCODE_GTF_PATH, result_type="dict")
+    assert "gene_version" in result
+    # values come back as ints when present, pd.NA when missing
+    values = list(result["gene_version"].values())
+    non_null = [v for v in values if v is not None and not _is_na(v)]
+    assert non_null and all(isinstance(v, int) for v in non_null), non_null
+
+
+def _is_na(v):
+    # pd.NA isn't equal-comparable; use bool() guarded form
+    try:
+        return v != v  # NaN-style: pd.NA also returns NA from != itself
+    except TypeError:
+        return True
+
+
+# -----------------------------
+# Multi-alias and ordering edge cases
+# -----------------------------
+
+
+def test_multiple_aliases_mapping_to_same_canonical_first_wins(caplog):
+    """Two aliases that both target the same canonical: the first in
+    iteration order is renamed; the second is treated as a conflict
+    and dropped. Prevents pandas producing two columns with the same
+    name."""
+    contents = (
+        "chr1\tHAVANA\tgene\t1\t100\t.\t+\t.\t"
+        'gene_id "ENSGX"; gene_type "alias_a"; gene_kind "alias_b";\n'
+    )
+    fd, path = tempfile.mkstemp(suffix=".gtf")
+    try:
+        with os.fdopen(fd, "w") as fh:
+            fh.write(contents)
+        with caplog.at_level(logging.WARNING):
+            df = read_gtf(
+                path,
+                # both aliases map to gene_biotype; gene_type comes first
+                attribute_aliases={
+                    "gene_type": "gene_biotype",
+                    "gene_kind": "gene_biotype",
+                },
+                result_type="pandas",
+            )
+        # exactly one gene_biotype column, and the first alias's value wins
+        assert list(df.columns).count("gene_biotype") == 1
+        assert "gene_type" not in df.columns
+        assert "gene_kind" not in df.columns
+        assert list(df["gene_biotype"]) == ["alias_a"]
+        # the collision was warned about
+        assert any("Both alias column" in rec.message for rec in caplog.records)
+    finally:
+        os.unlink(path)
+
+
+def test_aliases_iteration_order_matters_for_chains():
+    """When canonical of one rule is the alias of another, behavior
+    depends on iteration order against the original column set. This
+    test pins the current behavior: each rule is decided against the
+    pre-rename state, so chains aren't followed. Documents the contract."""
+    contents = 'chr1\tHAVANA\tgene\t1\t100\t.\t+\t.\tgene_id "ENSGX"; a "1"; b "2";\n'
+    fd, path = tempfile.mkstemp(suffix=".gtf")
+    try:
+        with os.fdopen(fd, "w") as fh:
+            fh.write(contents)
+        # {"a": "b"} with b already in the file should drop a (both
+        # present, canonical wins). NOT chain to {"a": "b", "b": "c"}.
+        df = read_gtf(
+            path,
+            attribute_aliases={"a": "b"},
+            result_type="pandas",
+        )
+        assert "a" not in df.columns
+        assert "b" in df.columns
+        assert list(df["b"]) == ["2"]
+    finally:
+        os.unlink(path)
+
+
+def test_aliases_preserves_other_attribute_columns():
+    """Renaming one attribute must not perturb the others."""
+    df = read_gtf(
+        GENCODE_GTF_PATH,
+        attribute_aliases=GENCODE_BIOTYPE_ALIASES,
+        result_type="pandas",
+    )
+    # spot-check several unrelated columns survive
+    for col in ("gene_id", "transcript_id", "gene_name", "transcript_name"):
+        assert col in df.columns, col
+
+
+# -----------------------------
+# usecols interaction
+# -----------------------------
+
+
+def test_aliases_with_usecols_filtering_canonical_column():
+    """usecols is applied AFTER aliasing, so requesting the canonical
+    column name returns the data even when the GTF only had the alias."""
+    df = read_gtf(
+        GENCODE_GTF_PATH,
+        attribute_aliases=GENCODE_BIOTYPE_ALIASES,
+        usecols=["gene_biotype"],
+        result_type="pandas",
+    )
+    assert list(df.columns) == ["gene_biotype"]
+    assert "protein_coding" in set(df["gene_biotype"])
+
+
+# -----------------------------
+# cast_version_columns edge cases
+# -----------------------------
+
+
+def test_version_zero_is_preserved_not_treated_as_missing():
+    """0 must round-trip as Int64(0), not become pd.NA — the
+    .replace("", None) call only targets empty strings."""
+    contents = 'chr1\tHAVANA\tgene\t1\t100\t.\t+\t.\tgene_id "ENSGX"; gene_version "0";\n'
+    fd, path = tempfile.mkstemp(suffix=".gtf")
+    try:
+        with os.fdopen(fd, "w") as fh:
+            fh.write(contents)
+        df = read_gtf(path, result_type="pandas")
+        assert df.loc[0, "gene_version"] == 0
+        assert not df["gene_version"].isna().iloc[0]
+    finally:
+        os.unlink(path)
+
+
+def test_version_with_corrupted_non_integer_value_coerces_to_na(caplog):
+    """A malformed version string (e.g. 'v3' instead of '3') is
+    coerced to pd.NA rather than raising. Documents the lenient
+    behavior — corrupted-GTF data quality bugs become missing
+    values, not exceptions."""
+    contents = 'chr1\tHAVANA\tgene\t1\t100\t.\t+\t.\tgene_id "ENSGX"; gene_version "v3";\n'
+    fd, path = tempfile.mkstemp(suffix=".gtf")
+    try:
+        with os.fdopen(fd, "w") as fh:
+            fh.write(contents)
+        df = read_gtf(path, result_type="pandas")
+        assert df["gene_version"].isna().iloc[0]
+    finally:
+        os.unlink(path)
+
+
+def test_version_cast_is_idempotent_via_double_read():
+    """Reading the same file twice with the cast on must produce the
+    same dtypes — sanity check that the cast doesn't accumulate
+    side effects via the global polars string cache or similar."""
+    df1 = read_gtf(GENCODE_GTF_PATH, result_type="pandas")
+    df2 = read_gtf(GENCODE_GTF_PATH, result_type="pandas")
+    for col in INTEGER_VERSION_COLUMNS:
+        assert str(df1[col].dtype) == str(df2[col].dtype) == "Int64"
+
+
+# -----------------------------
+# Real GENCODE-format fixture (versions in IDs, not in attributes)
+# -----------------------------
+
+
+def test_real_gencode_format_alias_rename():
+    """A GENCODE GTF in its real on-disk shape — versioned IDs
+    embedded in gene_id / transcript_id, GENCODE-only fields like
+    level / hgnc_id / havana_gene / tag — should normalize to
+    Ensembl-style biotype column names cleanly."""
+    df = read_gtf(
+        GENCODE_REAL_GTF_PATH,
+        attribute_aliases=GENCODE_BIOTYPE_ALIASES,
+        result_type="pandas",
+    )
+    assert "gene_biotype" in df.columns
+    assert "transcript_biotype" in df.columns
+    assert "gene_type" not in df.columns
+    gene_rows = df[df["feature"] == "gene"]
+    biotypes = set(gene_rows["gene_biotype"])
+    assert biotypes == {"transcribed_unprocessed_pseudogene", "protein_coding"}
+
+
+def test_real_gencode_format_has_no_separate_version_attrs():
+    """Real GENCODE doesn't carry separate *_version attribute
+    fields — versions are baked into gene_id/transcript_id strings.
+    cast_version_columns must be a graceful no-op (not create empty
+    columns, not raise)."""
+    df = read_gtf(GENCODE_REAL_GTF_PATH, result_type="pandas")
+    for col in INTEGER_VERSION_COLUMNS:
+        assert col not in df.columns, (
+            f"Real GENCODE has no '{col}' attribute; "
+            f"version casting should not have synthesized one."
+        )
+
+
+def test_real_gencode_format_preserves_versioned_id_strings():
+    """The version baked into gene_id / transcript_id / protein_id /
+    exon_id must come through verbatim — those strings ARE the
+    canonical identifier in GENCODE."""
+    df = read_gtf(GENCODE_REAL_GTF_PATH, result_type="pandas")
+    gene_ids = set(df["gene_id"])
+    assert "ENSG00000223972.5" in gene_ids
+    assert "ENSG00000186092.7" in gene_ids
+    assert "ENSG00000198888.2" in gene_ids
+    # protein_id and exon_id likewise carry .N
+    protein_rows = df[df["feature"] == "CDS"]
+    assert "ENSP00000493376.2" in set(protein_rows["protein_id"])
+
+
+def test_real_gencode_format_preserves_gencode_only_fields():
+    """GENCODE-specific attribute fields (level, hgnc_id,
+    havana_gene, havana_transcript, tag, transcript_support_level)
+    must survive parsing — they're useful downstream signal."""
+    df = read_gtf(GENCODE_REAL_GTF_PATH, result_type="pandas")
+    for col in (
+        "level",
+        "hgnc_id",
+        "havana_gene",
+        "tag",
+        "transcript_support_level",
+        "havana_transcript",
+    ):
+        assert col in df.columns, col
+    # spot-check values
+    transcript_rows = df[df["feature"] == "transcript"].reset_index(drop=True)
+    assert "MANE_Select" in set(df["tag"])
+    assert "HGNC:14825" in set(df["hgnc_id"])
+    # transcript_support_level is "1" (string — we deliberately don't
+    # auto-cast it since GENCODE allows non-numeric values like "NA")
+    assert "1" in set(transcript_rows["transcript_support_level"])
+
+
+def test_real_gencode_format_uses_chr_prefixed_seqnames():
+    """GENCODE uses 'chr1' / 'chrM' while Ensembl uses bare '1' /
+    'MT'. Confirming the seqname comes through unchanged so
+    downstream tooling can normalize if needed."""
+    df = read_gtf(GENCODE_REAL_GTF_PATH, result_type="pandas")
+    seqnames = set(df["seqname"])
+    assert seqnames <= {"chr1", "chrM"}
+    assert "chr1" in seqnames
+    assert "chrM" in seqnames
+
+
+def test_real_gencode_format_via_gzip_compressed_input():
+    """End-to-end: gzip a real-format GENCODE GTF, point read_gtf at
+    the .gz, confirm aliasing + parsing both still work."""
+    with open(GENCODE_REAL_GTF_PATH, "rb") as src:
+        raw = src.read()
+    fd, gz_path = tempfile.mkstemp(suffix=".gtf.gz")
+    os.close(fd)
+    try:
+        with gzip.open(gz_path, "wb") as out:
+            out.write(raw)
+        df = read_gtf(
+            gz_path,
+            attribute_aliases=GENCODE_BIOTYPE_ALIASES,
+            result_type="pandas",
+        )
+        assert "gene_biotype" in df.columns
+        assert "ENSG00000186092.7" in set(df["gene_id"])
+    finally:
+        os.unlink(gz_path)
+
+
+def test_real_gencode_format_with_infer_biotype_column_is_safe():
+    """infer_biotype_column would normally fire if 'protein_coding'
+    appears in the 'source' column. Real GENCODE's source is
+    'HAVANA' / 'ENSEMBL', not 'protein_coding', so inference
+    correctly stays silent. Regression guard against accidentally
+    triggering it on GENCODE input."""
+    df = read_gtf(
+        GENCODE_REAL_GTF_PATH,
+        attribute_aliases=GENCODE_BIOTYPE_ALIASES,
+        infer_biotype_column=True,
+        result_type="pandas",
+    )
+    # The aliased gene_biotype should be the authoritative one
+    gene_rows = df[df["feature"] == "gene"]
+    assert set(gene_rows["gene_biotype"]) == {
+        "transcribed_unprocessed_pseudogene",
+        "protein_coding",
+    }
+
+
+# -----------------------------
+# Ensembl regression checks
+# -----------------------------
+
+
+def test_old_ensembl_fixture_still_uses_source_for_biotype_inference():
+    """Ensembl release-75 puts biotype into the 'source' column. With
+    infer_biotype_column=True the fix should still populate
+    gene_biotype / transcript_biotype from source. Make sure the new
+    kwargs don't break that legacy path."""
+    df = read_gtf(
+        data_path("ensembl_grch37.head.gtf"),
+        infer_biotype_column=True,
+        result_type="pandas",
+    )
+    assert "gene_biotype" in df.columns
+    assert "transcript_biotype" in df.columns
+    # The release-75 fixture has pseudogenes and protein_coding genes
+    assert "protein_coding" in set(df["gene_biotype"])
+
+
+def test_ensembl_attribute_aliases_no_op_when_aliases_absent():
+    """Passing GENCODE_BIOTYPE_ALIASES against a pure Ensembl file
+    that doesn't have gene_type / transcript_type must do nothing
+    — no spurious columns, no exceptions."""
+    df = read_gtf(
+        data_path("ensembl_grch37.head.gtf"),
+        attribute_aliases=GENCODE_BIOTYPE_ALIASES,
+        result_type="pandas",
+    )
+    # gene_biotype exists in this fixture; aliasing didn't fight with it
+    assert "gene_biotype" in df.columns
+    assert "gene_type" not in df.columns
+    assert "transcript_type" not in df.columns