Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion gtfparse/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,20 @@
from .create_missing_features import create_missing_features
from .parsing_error import ParsingError
from .read_gtf import (
GENCODE_BIOTYPE_ALIASES,
INTEGER_VERSION_COLUMNS,
REQUIRED_COLUMNS,
parse_gtf,
parse_gtf_and_expand_attributes,
parse_gtf_pandas,
read_gtf,
)

__version__ = "2.6.3"
__version__ = "2.7.0"

__all__ = [
"GENCODE_BIOTYPE_ALIASES",
"INTEGER_VERSION_COLUMNS",
"REQUIRED_COLUMNS",
"ParsingError",
"__version__",
Expand Down
126 changes: 125 additions & 1 deletion gtfparse/read_gtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import logging
from os.path import exists

import pandas as pd
import polars

from .attribute_parsing import expand_attribute_strings
Expand All @@ -22,6 +23,27 @@
logger = logging.getLogger(__name__)


# GENCODE GTFs use *_type where Ensembl GTFs use *_biotype. Pass this
# (or a superset) as `attribute_aliases` to read_gtf to normalize a
# GENCODE-format GTF onto the Ensembl column names that downstream
# tools like pyensembl expect.
GENCODE_BIOTYPE_ALIASES = {
"gene_type": "gene_biotype",
"transcript_type": "transcript_biotype",
}


# Ensembl-style attribute columns that are always integer-valued when
# present. read_gtf casts these from string to pandas nullable Int64 by
# default; pass cast_version_columns=False to keep them as strings.
INTEGER_VERSION_COLUMNS = (
"gene_version",
"transcript_version",
"protein_version",
"exon_version",
)


"""
Columns of a GTF file:

Expand Down Expand Up @@ -183,6 +205,65 @@ def parse_gtf_and_expand_attributes(
)


def _apply_attribute_aliases(result_df, attribute_aliases):
"""
Rename alias attribute columns onto canonical names in-place.

For each (alias -> canonical) pair, in iteration order:
* if only the alias is present, rename it to the canonical name.
* if both are present, drop the alias and warn (canonical wins).
* if neither is present, do nothing.

When two aliases target the same canonical (e.g. both ``gene_type``
and a hypothetical ``gene_kind`` map to ``gene_biotype``), the first
rename in iteration order wins; subsequent aliases targeting an
already-renamed canonical are treated as collisions, dropped, and
warned about.
"""
if not attribute_aliases:
return result_df
columns_present = set(result_df.columns)
rename_map = {}
drop_aliases = []
for alias, canonical in attribute_aliases.items():
if alias not in columns_present:
continue
if canonical in columns_present:
logger.warning(
"Both alias column '%s' and canonical column '%s' are present; "
"dropping alias and keeping canonical values.",
alias,
canonical,
)
drop_aliases.append(alias)
else:
rename_map[alias] = canonical
# Reflect the rename in the running column set so a later
# alias mapping to the same canonical sees the collision
# instead of silently producing a duplicate-named column.
columns_present.discard(alias)
columns_present.add(canonical)
if drop_aliases:
result_df = result_df.drop(columns=drop_aliases)
if rename_map:
result_df = result_df.rename(columns=rename_map)
return result_df


def _cast_version_columns(result_df, version_columns=INTEGER_VERSION_COLUMNS):
"""
Cast known Ensembl *_version attribute columns from strings to
pandas nullable Int64 in-place. Missing/empty values become pd.NA.
"""
for column_name in version_columns:
if column_name not in result_df.columns:
continue
result_df[column_name] = pd.to_numeric(
result_df[column_name].replace("", None), errors="coerce"
).astype("Int64")
return result_df


def read_gtf(
filepath_or_buffer,
expand_attribute_column=True,
Expand All @@ -192,6 +273,8 @@ def read_gtf(
usecols=None,
features=None,
result_type="polars",
attribute_aliases=None,
cast_version_columns=True,
):
"""
Parse a GTF into a dictionary mapping column names to sequences of values.
Expand Down Expand Up @@ -231,13 +314,44 @@ def read_gtf(
result_type : One of 'polars', 'pandas', or 'dict'
Default behavior is to return a Polars DataFrame, but will convert to
Pandas DataFrame or dictionary if specified.

attribute_aliases : dict of str -> str, optional
Maps alias attribute names onto canonical ones. After attributes
are expanded into columns, each alias column is renamed to its
canonical name when the canonical column is absent. If both are
present the alias is dropped and a warning is logged. Pass
`GENCODE_BIOTYPE_ALIASES` to normalize a GENCODE GTF's
`gene_type`/`transcript_type` onto Ensembl's
`gene_biotype`/`transcript_biotype`.

cast_version_columns : bool
When True (default), cast the well-known integer version
attribute columns (`gene_version`, `transcript_version`,
`protein_version`, `exon_version`) from strings to pandas
nullable Int64 when present. Set to False to keep them as
strings.
"""
if type(filepath_or_buffer) is str and not exists(filepath_or_buffer):
raise ValueError("GTF file does not exist: %s" % filepath_or_buffer)

# If usecols asks for a canonical column that's only present in the
# GTF under an alias name, expand the parse-time column filter to
# also pull the alias through — otherwise it gets dropped at parse
# time before _apply_attribute_aliases can see it. The end-of-function
# usecols filter still narrows the result down to the canonical name.
parse_usecols = usecols
if usecols is not None and attribute_aliases:
usecols_set = set(usecols)
parse_usecols = set(usecols_set)
for alias, canonical in attribute_aliases.items():
if canonical in usecols_set:
parse_usecols.add(alias)

if expand_attribute_column:
result_df = parse_gtf_and_expand_attributes(
filepath_or_buffer, restrict_attribute_columns=usecols, features=features
filepath_or_buffer,
restrict_attribute_columns=parse_usecols,
features=features,
)
else:
result_df = parse_gtf(result_df, features=features)
Expand Down Expand Up @@ -267,6 +381,16 @@ def wrapped_fn(x):
column_type = column_cast_types[column_name]
result_df[column_name] = result_df[column_name].astype(column_type)

# Rename alias attribute columns onto their canonical names. Done before
# infer_biotype_column so an aliased gene_biotype/transcript_biotype is
# visible to the inference logic.
result_df = _apply_attribute_aliases(result_df, attribute_aliases)

# Cast Ensembl *_version columns from strings to nullable integers so
# downstream consumers (e.g. pyensembl) don't have to int(...) themselves.
if cast_version_columns:
result_df = _cast_version_columns(result_df)

# Hackishly infer whether the values in the 'source' column of this GTF
# are actually representing a biotype by checking for the most common
# gene_biotype and transcript_biotype value 'protein_coding'
Expand Down
10 changes: 10 additions & 0 deletions tests/data/gencode.head.gtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
##description: GENCODE-style synthetic head for gtfparse tests
##provider: gtfparse-tests
##format: gtf
chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972"; gene_version "5"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1";
chr1 HAVANA transcript 11869 14409 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; gene_type "transcribed_unprocessed_pseudogene"; transcript_type "processed_transcript"; gene_name "DDX11L1"; transcript_name "DDX11L1-202";
chr1 HAVANA exon 11869 12227 . + . gene_id "ENSG00000223972"; gene_version "5"; transcript_id "ENST00000456328"; transcript_version "2"; exon_id "ENSE00002234944"; exon_version "1"; exon_number "1"; gene_type "transcribed_unprocessed_pseudogene"; transcript_type "processed_transcript";
chr1 HAVANA gene 65419 71585 . + . gene_id "ENSG00000186092"; gene_version "6"; gene_type "protein_coding"; gene_name "OR4F5";
chr1 HAVANA transcript 65419 71585 . + . gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "OR4F5"; transcript_name "OR4F5-201";
chr1 HAVANA CDS 65565 65573 . + 0 gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "1"; protein_id "ENSP00000493376"; protein_version "2"; gene_type "protein_coding"; transcript_type "protein_coding";
chr1 HAVANA exon 65565 65573 . + . gene_id "ENSG00000186092"; gene_version "6"; transcript_id "ENST00000641515"; transcript_version "2"; exon_id "ENSE00003812156"; exon_version "1"; exon_number "1"; gene_type "protein_coding"; transcript_type "protein_coding";
13 changes: 13 additions & 0 deletions tests/data/gencode.real.head.gtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
##description: evidence-based annotation of the human genome (GRCh38) - test excerpt
##provider: GENCODE
##contact: gencode-help@ebi.ac.uk
##format: gtf
##date: 2024-01-01
chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; hgnc_id "HGNC:37102"; havana_gene "OTTHUMG00000000961.2";
chr1 HAVANA transcript 11869 14409 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; transcript_type "processed_transcript"; gene_name "DDX11L1"; transcript_name "DDX11L1-202"; level 2; transcript_support_level "1"; hgnc_id "HGNC:37102"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1";
chr1 HAVANA exon 11869 12227 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; exon_number 1; exon_id "ENSE00002234944.1"; gene_type "transcribed_unprocessed_pseudogene"; transcript_type "processed_transcript"; gene_name "DDX11L1"; transcript_name "DDX11L1-202"; level 2; hgnc_id "HGNC:37102"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1";
chr1 HAVANA gene 65419 71585 . + . gene_id "ENSG00000186092.7"; gene_type "protein_coding"; gene_name "OR4F5"; level 2; hgnc_id "HGNC:14825"; havana_gene "OTTHUMG00000001094.4";
chr1 HAVANA transcript 65419 71585 . + . gene_id "ENSG00000186092.7"; transcript_id "ENST00000641515.2"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "OR4F5"; transcript_name "OR4F5-201"; level 2; tag "MANE_Select"; havana_gene "OTTHUMG00000001094.4";
chr1 HAVANA CDS 65565 65573 . + 0 gene_id "ENSG00000186092.7"; transcript_id "ENST00000641515.2"; exon_number 1; protein_id "ENSP00000493376.2"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "OR4F5"; tag "MANE_Select";
chr1 HAVANA exon 65565 65573 . + . gene_id "ENSG00000186092.7"; transcript_id "ENST00000641515.2"; exon_number 1; exon_id "ENSE00003812156.1"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "OR4F5"; tag "MANE_Select";
chrM ENSEMBL gene 3307 4262 . + . gene_id "ENSG00000198888.2"; gene_type "protein_coding"; gene_name "MT-ND1"; level 3;
Loading
Loading