diff --git a/pyensembl/database.py b/pyensembl/database.py index 70a5435..571963c 100644 --- a/pyensembl/database.py +++ b/pyensembl/database.py @@ -16,7 +16,7 @@ import datacache from typechecks import require_integer, require_string -from gtfparse import read_gtf, create_missing_features +from gtfparse import GENCODE_BIOTYPE_ALIASES, create_missing_features, read_gtf from .common import memoize from .normalization import normalize_chromosome, normalize_strand @@ -626,6 +626,11 @@ def _load_gtf_as_dataframe(self, usecols=None, features=None): "seqname": normalize_chromosome, "strand": normalize_strand, }, + # GENCODE GTFs use gene_type / transcript_type where Ensembl uses + # gene_biotype / transcript_biotype. attribute_aliases (gtfparse + # 2.7.0+) renames the GENCODE columns onto the Ensembl names at + # parse time so the rest of pyensembl can stay Ensembl-centric. + attribute_aliases=GENCODE_BIOTYPE_ALIASES, infer_biotype_column=True, usecols=usecols, features=features, diff --git a/pyensembl/version.py b/pyensembl/version.py index cbd495c..af9817b 100644 --- a/pyensembl/version.py +++ b/pyensembl/version.py @@ -1,4 +1,4 @@ -__version__ = "2.9.7" +__version__ = "2.9.8" def print_version(): print(f"v{__version__}") diff --git a/pyproject.toml b/pyproject.toml index 2e32240..1a2f358 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "datacache>=1.4.0,<2.0.0", "memoized-property>=1.0.2", "tinytimer>=0.0.0,<1.0.0", - "gtfparse>=2.6.0,<3.0.0", + "gtfparse>=2.7.0,<3.0.0", "serializable>=0.2.1,<2.0.0", "numpy>=2.0.0,<3.0.0", ] diff --git a/tests/test_gencode_biotype_aliases.py b/tests/test_gencode_biotype_aliases.py new file mode 100644 index 0000000..18e2102 --- /dev/null +++ b/tests/test_gencode_biotype_aliases.py @@ -0,0 +1,104 @@ +""" +Issue #335 part 1: pyensembl now passes gtfparse's GENCODE_BIOTYPE_ALIASES +into read_gtf, so a vanilla GENCODE GTF (which carries gene_type / +transcript_type) gets normalized onto the Ensembl gene_biotype / +transcript_biotype column names at parse time. Without this, GENCODE +genomes silently looked like every transcript had no biotype and +Transcript.is_protein_coding returned False for everything (see #335 +issue body). +""" + +import os + +from pyensembl import Genome + +from .common import TemporaryDirectory, eq_ + + +GENCODE_STYLE_GTF = """\ +1\tHAVANA\tgene\t100\t800\t.\t+\t.\tgene_id "ENSGTEST00000000001.4"; gene_type "protein_coding"; gene_name "FAKE1"; +1\tHAVANA\ttranscript\t100\t800\t.\t+\t.\tgene_id "ENSGTEST00000000001.4"; transcript_id "ENSTTEST00000000001.5"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "FAKE1"; +1\tHAVANA\texon\t100\t800\t.\t+\t.\tgene_id "ENSGTEST00000000001.4"; transcript_id "ENSTTEST00000000001.5"; exon_number "1"; exon_id "ENSETEST00000000001.2"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "FAKE1"; +1\tHAVANA\tCDS\t100\t798\t.\t+\t0\tgene_id "ENSGTEST00000000001.4"; transcript_id "ENSTTEST00000000001.5"; exon_number "1"; exon_id "ENSETEST00000000001.2"; protein_id "ENSPTEST00000000001.3"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "FAKE1"; +1\tHAVANA\tstart_codon\t100\t102\t.\t+\t0\tgene_id "ENSGTEST00000000001.4"; transcript_id "ENSTTEST00000000001.5"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "FAKE1"; +1\tHAVANA\tstop_codon\t799\t801\t.\t+\t0\tgene_id "ENSGTEST00000000001.4"; transcript_id "ENSTTEST00000000001.5"; gene_type "protein_coding"; transcript_type "protein_coding"; gene_name "FAKE1"; +1\tHAVANA\tgene\t2000\t3000\t.\t+\t.\tgene_id "ENSGTEST00000000002.1"; gene_type "lncRNA"; gene_name "FAKE2"; +1\tHAVANA\ttranscript\t2000\t3000\t.\t+\t.\tgene_id "ENSGTEST00000000002.1"; transcript_id "ENSTTEST00000000002.1"; gene_type "lncRNA"; transcript_type "lncRNA"; gene_name "FAKE2"; +1\tHAVANA\texon\t2000\t3000\t.\t+\t.\tgene_id "ENSGTEST00000000002.1"; transcript_id "ENSTTEST00000000002.1"; exon_number "1"; exon_id "ENSETEST00000000002.1"; gene_type "lncRNA"; transcript_type "lncRNA"; gene_name "FAKE2"; +""" + + +def _make_genome(tmpdir): + gtf_path = os.path.join(tmpdir, "gencode_style.gtf") + with open(gtf_path, "w") as f: + f.write(GENCODE_STYLE_GTF) + return Genome( + reference_name="GRCh38", + annotation_name="_test_gencode_biotype_aliases_335p1", + gtf_path_or_url=gtf_path, + cache_directory_path=tmpdir, + ) + + +def test_gencode_gene_type_normalized_to_gene_biotype(): + """A GENCODE GTF (with gene_type / transcript_type attributes) is + parsed so that the sqlite database stores gene_biotype / + transcript_biotype, and querying by Ensembl biotype names works.""" + with TemporaryDirectory() as tmpdir: + genome = _make_genome(tmpdir) + genome.index() + + # Biotype columns exist in the gene + transcript tables + assert genome.db.column_exists("gene", "gene_biotype") + assert genome.db.column_exists("transcript", "transcript_biotype") + + # Filter by biotype works against the renamed columns + coding_genes = genome.genes(biotype="protein_coding") + eq_(len(coding_genes), 1) + eq_(coding_genes[0].id, "ENSGTEST00000000001.4") + eq_(coding_genes[0].biotype, "protein_coding") + + ncrna_genes = genome.genes(biotype="lncRNA") + eq_(len(ncrna_genes), 1) + eq_(ncrna_genes[0].id, "ENSGTEST00000000002.1") + + +def test_gencode_transcript_type_normalized_to_transcript_biotype(): + with TemporaryDirectory() as tmpdir: + genome = _make_genome(tmpdir) + genome.index() + + coding = genome.transcripts(biotype="protein_coding") + eq_(len(coding), 1) + eq_(coding[0].id, "ENSTTEST00000000001.5") + eq_(coding[0].biotype, "protein_coding") + + +def test_gencode_transcript_is_protein_coding_true_after_alias(): + """The original #335 repro: Variant(...).effects() returned + NoncodingTranscript for every transcript because is_protein_coding + was False. With attribute_aliases the biotype column lands on the + Ensembl name and is_protein_coding returns True for coding rows.""" + with TemporaryDirectory() as tmpdir: + genome = _make_genome(tmpdir) + genome.index() + coding = genome.transcript_by_id("ENSTTEST00000000001.5") + assert coding.is_protein_coding is True + noncoding = genome.transcript_by_id("ENSTTEST00000000002.1") + assert noncoding.is_protein_coding is False + + +def test_pure_ensembl_gtf_unaffected_by_aliasing(): + """Vanilla Ensembl GTFs already use gene_biotype / transcript_biotype. + Passing the alias dict must be a no-op for them (gtfparse skips + missing alias sources). Reuses the existing mouse partial fixture + which is Ensembl 81 style.""" + from .data import ( + custom_mouse_genome_grcm38_subset, + setup_init_custom_mouse_genome, + ) + setup_init_custom_mouse_genome() + coding = custom_mouse_genome_grcm38_subset.transcripts(biotype="protein_coding") + eq_(len(coding), 1) + eq_(coding[0].id, "ENSMUST00000103109") + assert coding[0].is_protein_coding is True