diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 0940d65..0000000 --- a/.travis.yml +++ /dev/null @@ -1,52 +0,0 @@ -sudo: false -language: python -python: - - "3.8" - - "3.9" -git: - # don't need the default depth of 50 - # but don't want to use a depth of 1 since that affects - # whether jobs run when you have multiple commits queued - # https://github.com/travis-ci/travis-ci/issues/4575 - depth: 10 -cache: - pip: true -before_install: - # download different versions of mini-conda for py2 vs. py3 - - | - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then - wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh; - else - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; - fi - - bash miniconda.sh -b -p $HOME/miniconda - - export PATH="$HOME/miniconda/bin:$PATH" - - hash -r - - conda config --set always_yes yes --set changeps1 no - - conda update -q conda - # Useful for debugging any issues with conda - - conda info -a -install: - - > - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION - numpy nose pandas pylint pandoc - - source activate test-environment - # make sure we're using the correct version - - python --version - - pip install -r requirements.txt - - pip install . 
- - pip install pypandoc - - pip install coveralls -script: - - ./lint.sh - - nosetests test --with-coverage --cover-package=gtfparse -after_success: coveralls -deploy: - provider: pypi - distributions: sdist - user: openvax - password: # See http://docs.travis-ci.com/user/encryption-keys/ - secure: "lNFheXNLXUin0p+1LUHpE6CPvLnv0WyaTW7nW+r3gbleI8AocJ3cbJPBExOioTLnyEw+hQsq/aJqaz4eeNoEwJKJi3OasuUuQf3sqF/ujOLTcap09NIknPIJBf0lseh1zQgEZr/ov18kiscY/nPKbXPqoGOINEyMEqYP+Cciiq3FokSD++BU1n55RpH4iya3//DmxuOlkgU0Idxv9KAPy13YP3eymPTix1NkEDwDVglevW4bpZqXjA2iQMzQg0HpwsfkvasrGQ6UZCyg/ukxMfuNna0Ws40frq8xrYz4NROrepwLZ+9XGpPY3/ts3EvMYURDmucAyqcdPmxoBXaKtFsg/C/4Dhqh0l/agXhls0hf9BjSuaZT+mWPXtB9LEjkoHFUIvsxGN53N9xoPABuhjudAJdbAr40/GWTnvJlkULmAJ+ZVHb7GHA39Xsk/Efys27vZeTpM8xjwtCaJ0i61FjWeJP09Rd9bk6QwrpbjjnuwpRl5qyJv9jSi8H+oMaxOWFaSDYEMPyhV3pjtXaGo2CgR8ypYg1g9EuqVmScP9N2L5YzhYmCzan7peqMhDqQFJwPjO5Ger99a5E88yytpqW3PhQG/5AJEV+d/ZGvn71FAnC+cEPJqy2+jrsRw0a04AMmJya2udRJMuzixnBvwbvc7BMNx0xis7LJxDgmOFg=" - on: - branch: master - condition: $TRAVIS_PYTHON_VERSION = "3.9" diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..bbb0c38 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,96 @@ +## Golden Rules + +1. **Never commit to `master`.** Always `git checkout -b <branch-name>` before editing. Land via PR. +2. **Every PR bumps the version.** Even doc-only PRs — at minimum a patch bump. Update `gtfparse/__init__.py::__version__`. +3. **"Done" means merged AND deployed to PyPI** — never stop at merge. After a PR merges, run `./deploy.sh` from a clean master. Skipping deploy = task not done. +4. **File problems as issues, don't silently work around them.** If you hit a bug here or in a sibling openvax/pirl-unc repo, open a GitHub issue on the correct repo and link it from the PR. +5. **After a PR ships, look for the next block of work.** Read open issues across the relevant openvax repos, group by dependency + urgency. 
Prefer *foundational* changes that unblock multiple downstream improvements; otherwise chain the smallest independent improvements. + +--- + +## Before Completing Any Task + +Before considering any code change complete, you MUST: + +1. **Run `./lint.sh`** - Verify linting passes (runs `ruff check` and `ruff format --check`) +2. **Run `./test.sh`** - Verify all tests pass with coverage + +Do not tell the user you are "done" or that changes are "complete" until both pass. + +## Scripts + +- `./lint.sh` - Checks linting and formatting with ruff (must pass). **Always use this for linting if it exists.** +- `./test.sh` - Runs pytest with coverage (must pass) +- `./lint-and-test.sh` - Runs lint and test back-to-back +- `./deploy.sh` - Deploys to PyPI (gates on lint.sh and test.sh). **Always use this for deploying if it exists.** +- `./develop.sh` - Installs package in development mode (`pip install -e .`) + +## Code Style + +- Use ruff for linting and formatting +- Configuration is in `pyproject.toml` under `[tool.ruff]` +- Target Python version: 3.9+ (CI matrix runs 3.9 / 3.10 / 3.11) + +## Project Shape + +- Package: `gtfparse/` (thin library — attribute parsing, GTF reading, missing feature creation) +- Tests: `tests/` (pytest; fixtures under `tests/data/`) +- Core deps: polars, pyarrow, pandas (see `requirements.txt`) +- Version lives in `gtfparse/__init__.py` and is exposed via `[tool.setuptools.dynamic]` in `pyproject.toml` + +--- + +## Workflow Orchestration + +### 1. Upfront Planning +- For ANY non-trivial task (3+ steps or architectural decisions): write a detailed spec before touching code +- If something goes sideways, STOP and re-plan immediately — don't keep pushing +- Use planning/verification steps, not just building +- Write detailed specs upfront to reduce ambiguity + +### 2. 
Self-Improvement Loop +- After ANY correction from the user: update `tasks/lessons.md` with the pattern +- Write rules for yourself that prevent the same mistake +- Ruthlessly iterate on these lessons until the mistake rate drops +- Review the lessons for the relevant project at the start of each session + +### 3. Verification Before Done +- Never mark a task complete without proving it works +- Diff behavior between the latest code and your changes when relevant +- Ask yourself: "Would a staff engineer approve this?" +- Run tests, check logs, demonstrate correctness + +### 4. Demand Elegance (Balanced) +- For non-trivial changes: pause and ask "is there a more elegant way?" +- If a fix feels hacky: "Knowing everything I know now, implement the elegant solution" +- Skip this for simple, obvious fixes — don't over-engineer +- Challenge your own work before presenting it + +### 5. Autonomous Bug Fixing +- When given a bug report: just fix it. Don't ask for hand-holding +- Point at logs, errors, failing tests — then resolve them +- Zero context switching required from the user +- Fix failing unit tests without being told how + +--- + +## Task Management + +1. **Plan First**: Write the plan to `tasks/todo.md` with checkable items +2. **Verify Plan**: Check in before starting implementation +3. **Track Progress**: Mark items complete as you go +4. **Explain Changes**: High-level summary at each step +5. **Document Results**: Add a review section to `tasks/todo.md` +6. **Capture Lessons**: Update `tasks/lessons.md` after corrections + +--- + +## Core Principles + +- **Simplicity First**: Make every change as simple as possible. Touch as little code as possible. +- **No Laziness**: Find root causes. No temporary fixes. Senior developer standards. +- **Minimal Impact**: Changes should only touch what's necessary. Avoid introducing bugs. 
+ +## Scientific Domain Knowledge +- **Read the literature**: if some code involves genomic or biological concepts (GTF/GFF formats, feature hierarchies, attribute conventions), feel free to search for specs/review papers before changing code that expresses scientific concepts. +- **Flag inconsistencies**: if code expresses a scientific model that's at odds with your understanding, note that inconsistency and ask for clarification. diff --git a/format.sh b/format.sh new file mode 100755 index 0000000..a9261d1 --- /dev/null +++ b/format.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +set -e + +SOURCES="gtfparse tests" + +echo "Running ruff format..." +ruff format $SOURCES + +echo "Formatting complete!" diff --git a/gtfparse/__init__.py b/gtfparse/__init__.py index fcdb15a..3a46b2c 100644 --- a/gtfparse/__init__.py +++ b/gtfparse/__init__.py @@ -14,23 +14,23 @@ from .create_missing_features import create_missing_features from .parsing_error import ParsingError from .read_gtf import ( - read_gtf, + REQUIRED_COLUMNS, parse_gtf, - parse_gtf_pandas, parse_gtf_and_expand_attributes, - REQUIRED_COLUMNS, + parse_gtf_pandas, + read_gtf, ) -__version__ = "2.6.2" +__version__ = "2.6.3" __all__ = [ - "__version__", - "expand_attribute_strings", - "create_missing_features", - "parse_gtf_and_expand_attributes", "REQUIRED_COLUMNS", "ParsingError", - "read_gtf", + "__version__", + "create_missing_features", + "expand_attribute_strings", "parse_gtf", + "parse_gtf_and_expand_attributes", "parse_gtf_pandas", + "read_gtf", ] diff --git a/gtfparse/attribute_parsing.py b/gtfparse/attribute_parsing.py index 0a2af17..55ef282 100644 --- a/gtfparse/attribute_parsing.py +++ b/gtfparse/attribute_parsing.py @@ -18,12 +18,7 @@ logger = logging.getLogger(__name__) - -def expand_attribute_strings( - attribute_strings, - quote_char="'", - missing_value="", - usecols=None): +def expand_attribute_strings(attribute_strings, quote_char="'", missing_value="", usecols=None): """ The last column of GTF has a 
variable number of key value pairs of the format: "key1 value1; key2 value2;" @@ -66,7 +61,7 @@ def expand_attribute_strings( # and pair of try/except blocks in the loop. column_interned_strings = {} - for (i, kv_strings) in enumerate(attribute_strings): + for i, kv_strings in enumerate(attribute_strings): if type(kv_strings) is str: kv_strings = kv_strings.split(";") for kv in kv_strings: @@ -92,7 +87,7 @@ def expand_attribute_strings( if value[0] == quote_char: value = value.replace(quote_char, "") - + try: column = extra_columns[column_name] # if an attribute is used repeatedly then @@ -108,9 +103,5 @@ def expand_attribute_strings( extra_columns[column_name] = column column_order.append(column_name) - - logging.info("Extracted GTF attributes: %s" % column_order) - return OrderedDict( - (column_name, extra_columns[column_name]) - for column_name in column_order) + return OrderedDict((column_name, extra_columns[column_name]) for column_name in column_order) diff --git a/gtfparse/create_missing_features.py b/gtfparse/create_missing_features.py index 8758a6b..0a7b50a 100644 --- a/gtfparse/create_missing_features.py +++ b/gtfparse/create_missing_features.py @@ -19,11 +19,7 @@ logger = logging.getLogger(__name__) -def create_missing_features( - dataframe, - unique_keys={}, - extra_columns={}, - missing_value=None): +def create_missing_features(dataframe, unique_keys={}, extra_columns={}, missing_value=None): """ Helper function used to construct a missing feature such as 'transcript' or 'gene'. Some GTF files only have 'exon' and 'CDS' entries, but have @@ -49,29 +45,25 @@ def create_missing_features( missing_value : any Which value to fill in for columns that we don't infer values for. - Returns original dataframe (converted to Pandas if necessary) along with all + Returns original dataframe (converted to Pandas if necessary) along with all extra rows created for missing features. 
""" if hasattr(dataframe, "to_pandas"): dataframe = dataframe.to_pandas() - + extra_dataframes = [] existing_features = set(dataframe["feature"]) existing_columns = set(dataframe.columns) - - for (feature_name, groupby_key) in unique_keys.items(): - + + for feature_name, groupby_key in unique_keys.items(): if feature_name in existing_features: - logging.info( - "Feature '%s' already exists in GTF data" % feature_name) + logging.info("Feature '%s' already exists in GTF data" % feature_name) continue logging.info("Creating rows for missing feature '%s'" % feature_name) # don't include rows where the groupby key was missing - missing = pd.Series([ - x is None or x == "" - for x in dataframe[groupby_key]]) + missing = pd.Series([x is None or x == "" for x in dataframe[groupby_key]]) not_missing = ~missing row_groups = dataframe[not_missing].groupby(groupby_key) @@ -79,10 +71,9 @@ def create_missing_features( # other columns may or may not be uniquely defined. Start off by # assuming the values for every column are missing and fill them in # where possible. 
- feature_values = OrderedDict([ - (column_name, [missing_value] * row_groups.ngroups) - for column_name in dataframe.keys() - ]) + feature_values = OrderedDict( + [(column_name, [missing_value] * row_groups.ngroups) for column_name in dataframe] + ) # User specifies which non-required columns should we try to infer # values for @@ -111,8 +102,9 @@ def create_missing_features( for column_name in feature_columns: if column_name not in existing_columns: raise ValueError( - "Column '%s' does not exist in GTF, columns = %s" % ( - column_name, existing_columns)) + "Column '%s' does not exist in GTF, columns = %s" + % (column_name, existing_columns) + ) # expect that all entries related to a reconstructed feature # are related and are thus within the same interval of @@ -121,4 +113,4 @@ def create_missing_features( if len(unique_values) == 1: feature_values[column_name][i] = unique_values[0] extra_dataframes.append(pd.DataFrame(feature_values)) - return pd.concat([dataframe] + extra_dataframes, ignore_index=True) + return pd.concat([dataframe, *extra_dataframes], ignore_index=True) diff --git a/gtfparse/parsing_error.py b/gtfparse/parsing_error.py index 23cbb4e..6323da8 100644 --- a/gtfparse/parsing_error.py +++ b/gtfparse/parsing_error.py @@ -10,5 +10,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ class ParsingError(Exception): pass diff --git a/gtfparse/read_gtf.py b/gtfparse/read_gtf.py index 7c0c38a..fdf2d4a 100644 --- a/gtfparse/read_gtf.py +++ b/gtfparse/read_gtf.py @@ -13,12 +13,11 @@ import logging from os.path import exists -import polars +import polars from .attribute_parsing import expand_attribute_strings from .parsing_error import ParsingError - logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -76,88 +75,79 @@ DEFAULT_COLUMN_DTYPES = { - "seqname": polars.Categorical, - "source": polars.Categorical, - + "seqname": polars.Categorical, + "source": polars.Categorical, "start": polars.Int64, "end": polars.Int64, "score": polars.Float32, - - "feature": polars.Categorical, - "strand": polars.Categorical, + "feature": polars.Categorical, + "strand": polars.Categorical, "frame": polars.UInt32, } + def parse_with_polars_lazy( - filepath_or_buffer, - split_attributes=True, - features=None, - fix_quotes_columns=["attribute"]): + filepath_or_buffer, split_attributes=True, features=None, fix_quotes_columns=["attribute"] +): # use a global string cache so that all strings get intern'd into # a single numbering system polars.enable_string_cache() - kwargs = dict( - has_header=False, - separator="\t", - comment_prefix="#", - null_values=".", - schema_overrides=DEFAULT_COLUMN_DTYPES) + kwargs = { + "has_header": False, + "separator": "\t", + "comment_prefix": "#", + "null_values": ".", + "schema_overrides": DEFAULT_COLUMN_DTYPES, + } try: - df = polars.read_csv( - filepath_or_buffer, - new_columns=REQUIRED_COLUMNS, - **kwargs).lazy() - except polars.exceptions.ShapeError: - raise ParsingError("Wrong number of columns") + df = polars.read_csv(filepath_or_buffer, new_columns=REQUIRED_COLUMNS, **kwargs).lazy() + except polars.exceptions.ShapeError as err: + raise ParsingError("Wrong number of columns") from err # Drop empty lines that may appear as all-null rows df = df.filter(polars.col("seqname").is_not_null()) - df = 
df.with_columns([ - polars.col("frame").fill_null(0), - polars.col("attribute").str.replace_all('"', "'") - ]) - + df = df.with_columns( + [polars.col("frame").fill_null(0), polars.col("attribute").str.replace_all('"', "'")] + ) + for fix_quotes_column in fix_quotes_columns: # Catch mistaken semicolons by replacing "xyz;" with "xyz" # Required to do this since the Ensembl GTF for Ensembl # release 78 has mistakes such as: # gene_name = "PRAMEF6;" transcript_name = "PRAMEF6;-201" - df = df.with_columns([ - polars.col(fix_quotes_column).str.replace(';\"', '\"').str.replace(";-", "-") - ]) + df = df.with_columns( + [polars.col(fix_quotes_column).str.replace(';"', '"').str.replace(";-", "-")] + ) if features is not None: features = sorted(set(features)) df = df.filter(polars.col("feature").is_in(features)) - if split_attributes: - df = df.with_columns([ - polars.col("attribute").str.split(";").alias("attribute_split") - ]) + df = df.with_columns([polars.col("attribute").str.split(";").alias("attribute_split")]) return df + def parse_gtf( - filepath_or_buffer, - split_attributes=True, - features=None, - fix_quotes_columns=["attribute"]): + filepath_or_buffer, split_attributes=True, features=None, fix_quotes_columns=["attribute"] +): df_lazy = parse_with_polars_lazy( filepath_or_buffer=filepath_or_buffer, split_attributes=split_attributes, features=features, - fix_quotes_columns=fix_quotes_columns) + fix_quotes_columns=fix_quotes_columns, + ) return df_lazy.collect() + def parse_gtf_pandas(*args, **kwargs): return parse_gtf(*args, **kwargs).to_pandas() - + def parse_gtf_and_expand_attributes( - filepath_or_buffer, - restrict_attribute_columns=None, - features=None): + filepath_or_buffer, restrict_attribute_columns=None, features=None +): """ Parse lines into column->values dictionary and then expand the 'attribute' column into multiple columns. 
This expansion happens @@ -177,33 +167,32 @@ def parse_gtf_and_expand_attributes( features : set or None Ignore entries which don't correspond to one of the supplied features """ - df = parse_gtf( - filepath_or_buffer=filepath_or_buffer, - features=features, - split_attributes=True) + df = parse_gtf(filepath_or_buffer=filepath_or_buffer, features=features, split_attributes=True) if type(restrict_attribute_columns) is str: restrict_attribute_columns = {restrict_attribute_columns} elif restrict_attribute_columns: restrict_attribute_columns = set(restrict_attribute_columns) df.drop_in_place("attribute") attribute_pairs = df.drop_in_place("attribute_split") - return df.with_columns([ - polars.Series(k, vs) - for (k, vs) in - expand_attribute_strings(attribute_pairs).items() - if restrict_attribute_columns is None or k in restrict_attribute_columns - ]) - + return df.with_columns( + [ + polars.Series(k, vs) + for (k, vs) in expand_attribute_strings(attribute_pairs).items() + if restrict_attribute_columns is None or k in restrict_attribute_columns + ] + ) + def read_gtf( - filepath_or_buffer, - expand_attribute_column=True, - infer_biotype_column=False, - column_converters={}, - column_cast_types={}, - usecols=None, - features=None, - result_type='polars'): + filepath_or_buffer, + expand_attribute_column=True, + infer_biotype_column=False, + column_converters={}, + column_cast_types={}, + usecols=None, + features=None, + result_type="polars", +): """ Parse a GTF into a dictionary mapping column names to sequences of values. @@ -231,7 +220,7 @@ def read_gtf( column_cast_types : dict, optional Dictionary mapping column names to dtypes. Will cast columns to given Polars types. - + usecols : list of str or None Restrict which columns are loaded to the give set. If None, then load all columns. 
@@ -240,7 +229,7 @@ def read_gtf( Drop rows which aren't one of the features in the supplied set result_type : One of 'polars', 'pandas', or 'dict' - Default behavior is to return a Polars DataFrame, but will convert to + Default behavior is to return a Polars DataFrame, but will convert to Pandas DataFrame or dictionary if specified. """ if type(filepath_or_buffer) is str and not exists(filepath_or_buffer): @@ -248,9 +237,8 @@ def read_gtf( if expand_attribute_column: result_df = parse_gtf_and_expand_attributes( - filepath_or_buffer, - restrict_attribute_columns=usecols, - features=features) + filepath_or_buffer, restrict_attribute_columns=usecols, features=features + ) else: result_df = parse_gtf(result_df, features=features) @@ -259,26 +247,26 @@ def read_gtf( # and are generally insane to chase down result_df = result_df.to_pandas() if column_converters or column_cast_types: + def wrap_to_always_accept_none(f): def wrapped_fn(x): if x is None or x == "": return None else: return f(x) + return wrapped_fn - + column_names = set(column_converters.keys()).union(column_cast_types.keys()) for column_name in column_names: - if column_name in column_converters: - column_fn = wrap_to_always_accept_none( - column_converters[column_name]) + column_fn = wrap_to_always_accept_none(column_converters[column_name]) result_df[column_name] = result_df[column_name].apply(column_fn) if column_name in column_cast_types: column_type = column_cast_types[column_name] result_df[column_name] = result_df[column_name].astype(column_type) - + # Hackishly infer whether the values in the 'source' column of this GTF # are actually representing a biotype by checking for the most common # gene_biotype and transcript_biotype value 'protein_coding' @@ -292,11 +280,11 @@ def wrapped_fn(x): # gene_biotype) if "gene_biotype" not in column_names: logging.info("Using column 'source' to replace missing 'gene_biotype'") - result_df['gene_biotype'] = result_df['source'] + result_df["gene_biotype"] = 
result_df["source"] if "transcript_biotype" not in column_names: logging.info("Using column 'source' to replace missing 'transcript_biotype'") - result_df['transcript_biotype'] = result_df['source'] - + result_df["transcript_biotype"] = result_df["source"] + if usecols is not None: column_names = set(result_df.columns) valid_columns = [c for c in usecols if c in column_names] diff --git a/lint.sh b/lint.sh index 3ec86eb..214d41b 100755 --- a/lint.sh +++ b/lint.sh @@ -1,6 +1,13 @@ -#!/bin/bash -set -o errexit +#!/usr/bin/env bash -ruff check gtfparse/ tests/ +set -e -echo 'Passes ruff check' +SOURCES="gtfparse tests" + +echo "Running ruff check..." +ruff check $SOURCES + +echo "Running ruff format check..." +ruff format --check $SOURCES + +echo "All checks passed!" diff --git a/pylintrc b/pylintrc deleted file mode 100644 index 658d636..0000000 --- a/pylintrc +++ /dev/null @@ -1,6 +0,0 @@ -[TYPECHECK] -# Without ignoring this, we get errors like: -# E:249,20: Module 'numpy' has no 'nan' member (no-member) -ignored-modules = numpy -# getting 'no-value-for-parameter' from nose.tools.assert_raises -ignored-classes = nose.tools \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 3619c98..3f10af2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,20 +1,41 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + [project] name = "gtfparse" -requires-python = ">=3.7" +requires-python = ">=3.9" authors = [ {name="Alex Rubinsteyn", email="alex.rubinsteyn@unc.edu" } ] description = "Parsing library for extracting data frames of genomic features from GTF files" classifiers = [ - 'Development Status :: 4 - Beta', - 'Environment :: Console', - 'Operating System :: OS Independent', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: Python', - 'Topic :: Scientific/Engineering :: Bio-Informatics', + "Development Status :: 4 - 
Beta", + "Environment :: Console", + "Operating System :: OS Independent", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Bio-Informatics", ] readme = "README.md" dynamic = ["version", "dependencies"] +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-cov", + "ruff", + "coveralls", +] + +[project.urls] +"Homepage" = "https://github.com/openvax/gtfparse" +"Bug Tracker" = "https://github.com/openvax/gtfparse/issues" + [tool.setuptools.dynamic] version = {attr = "gtfparse.__version__"} dependencies = {file = ["requirements.txt"]} @@ -22,14 +43,62 @@ dependencies = {file = ["requirements.txt"]} [tool.setuptools] packages = ["gtfparse"] -[project.urls] -"Homepage" = "https://github.com/openvax/gtfparse" -"Bug Tracker" = "https://github.com/openvax/gtfparse" +[tool.ruff] +target-version = "py39" +line-length = 100 +src = ["gtfparse", "tests"] +exclude = [ + ".git", + ".venv", + "__pycache__", + "build", + "dist", + "*.egg-info", + ".eggs", +] [tool.ruff.lint] -select = ["E", "F"] -ignore = ["F821", "E501", "F841", "E731", "E741", "E722", "E721"] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # Pyflakes + "I", # isort + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "UP", # pyupgrade + "SIM", # flake8-simplify + "RUF", # Ruff-specific rules +] +ignore = [ + "E501", # line too long (handled by formatter) + "E741", # ambiguous variable name (pre-existing in codebase) + "B006", # mutable default args (pre-existing; behavior-risky to change) + "B008", # do not perform function calls in argument defaults + "B905", # zip() without explicit strict + "SIM108", # use ternary operator instead of 
if-else + "UP007", # use X | Y for type unions (need Python 3.10+) + "UP031", # %-format strings (pre-existing; out of scope for config PR) +] [tool.ruff.lint.per-file-ignores] "__init__.py" = ["F401"] -"test*/*" = ["F401"] +"tests/*" = ["F401", "B011"] + +[tool.ruff.lint.isort] +known-first-party = ["gtfparse"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = "test_*.py" +python_functions = "test_*" +addopts = "-v --tb=short" + +[tool.coverage.run] +source = ["gtfparse"] + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "if __name__ == .__main__.:", + "raise NotImplementedError", +] diff --git a/tests/data.py b/tests/data.py index 0ea6986..23aaacb 100644 --- a/tests/data.py +++ b/tests/data.py @@ -1,8 +1,9 @@ import os + def data_path(name): """ Return the absolute path to a file in the varcode/test/data directory. The name specified should be relative to varcode/test/data. """ - return os.path.join(os.path.dirname(__file__), "data", name) \ No newline at end of file + return os.path.join(os.path.dirname(__file__), "data", name) diff --git a/tests/test_create_missing_features.py b/tests/test_create_missing_features.py index 5c72e10..d2a697c 100644 --- a/tests/test_create_missing_features.py +++ b/tests/test_create_missing_features.py @@ -1,86 +1,85 @@ -from gtfparse import create_missing_features, parse_gtf_and_expand_attributes from io import StringIO +from gtfparse import create_missing_features, parse_gtf_and_expand_attributes + # two lines from the Ensembl 54 human GTF containing only a stop_codon and # exon features, but from which gene and transcript information could be # inferred -GTF_TEXT = "\n".join([ - "# seqname biotype feature start end score strand frame attribute", - "".join([ - """18\tprotein_coding\tstop_codon\t32630766\t32630768\t.\t-\t0\t""", - """gene_id "ENSG00000134779"; transcript_id "ENST00000334295"; exon_number "7";""" - """ gene_name "C18orf10";""", - """ transcript_name "C18orf10-201";"""]), - 
"".join([ - """18\tprotein_coding\texon\t32663078\t32663157\t.\t+\t.\tgene_id "ENSG00000150477"; """, - """transcript_id "ENST00000383055"; exon_number "1"; gene_name "KIAA1328"; """, - """transcript_name "KIAA1328-202";"""]), -]) +GTF_TEXT = "\n".join( + [ + "# seqname biotype feature start end score strand frame attribute", + "".join( + [ + """18\tprotein_coding\tstop_codon\t32630766\t32630768\t.\t-\t0\t""", + """gene_id "ENSG00000134779"; transcript_id "ENST00000334295"; exon_number "7";""" + """ gene_name "C18orf10";""", + """ transcript_name "C18orf10-201";""", + ] + ), + "".join( + [ + """18\tprotein_coding\texon\t32663078\t32663157\t.\t+\t.\tgene_id "ENSG00000150477"; """, + """transcript_id "ENST00000383055"; exon_number "1"; gene_name "KIAA1328"; """, + """transcript_name "KIAA1328-202";""", + ] + ), + ] +) GTF_DATAFRAME = parse_gtf_and_expand_attributes(StringIO(GTF_TEXT)) GTF_DATAFRAME = GTF_DATAFRAME.to_pandas() + def test_create_missing_features_identity(): df_should_be_same = create_missing_features(GTF_DATAFRAME, {}) - assert len(GTF_DATAFRAME) == len(df_should_be_same), \ - "GTF DataFrames should be same size" + assert len(GTF_DATAFRAME) == len(df_should_be_same), "GTF DataFrames should be same size" + def _check_expanded_dataframe(df): - assert "gene" in set(df["feature"]), \ - "Extended GTF should contain gene feature" - assert "transcript" in set(df["feature"]), \ - "Extended GTF should contain transcript feature" + assert "gene" in set(df["feature"]), "Extended GTF should contain gene feature" + assert "transcript" in set(df["feature"]), "Extended GTF should contain transcript feature" - C18orf10_201_transcript_mask = ( - (df["feature"] == "transcript") & - (df["transcript_name"] == "C18orf10-201")) - assert len(df[C18orf10_201_transcript_mask]) == 1, \ - "Expected only 1 gene entry for C18orf10-201, got %s" % ( - df[C18orf10_201_transcript_mask],) + C18orf10_201_transcript_mask = (df["feature"] == "transcript") & ( + df["transcript_name"] == 
"C18orf10-201" + ) + assert len(df[C18orf10_201_transcript_mask]) == 1, ( + "Expected only 1 gene entry for C18orf10-201, got %s" % (df[C18orf10_201_transcript_mask],) + ) transcript_seqname = df[C18orf10_201_transcript_mask].seqname.iloc[0] - assert (transcript_seqname == "18"), \ - "Wrong seqname for C18orf10-201: %s" % transcript_seqname + assert transcript_seqname == "18", "Wrong seqname for C18orf10-201: %s" % transcript_seqname transcript_start = df[C18orf10_201_transcript_mask].start.iloc[0] - assert (transcript_start == 32630766), \ - "Wrong start for C18orf10-201: %s" % transcript_start + assert transcript_start == 32630766, "Wrong start for C18orf10-201: %s" % transcript_start transcript_end = df[C18orf10_201_transcript_mask].end.iloc[0] - assert (transcript_end == 32630768), \ - "Wrong end for C18orf10-201: %s" % transcript_end + assert transcript_end == 32630768, "Wrong end for C18orf10-201: %s" % transcript_end transcript_strand = df[C18orf10_201_transcript_mask].strand.iloc[0] - assert (transcript_strand == "-"), \ - "Wrong strand for C18orf10-201: %s" % transcript_strand + assert transcript_strand == "-", "Wrong strand for C18orf10-201: %s" % transcript_strand - KIAA1328_gene_mask = ( - (df["feature"] == "gene") & - (df["gene_name"] == "KIAA1328")) + KIAA1328_gene_mask = (df["feature"] == "gene") & (df["gene_name"] == "KIAA1328") assert len(df[KIAA1328_gene_mask]) == 1, "Expected only 1 gene entry for KIAA1328" gene_seqname = df[KIAA1328_gene_mask].seqname.iloc[0] - assert (gene_seqname == "18"), \ - "Wrong seqname for KIAA1328: %s" % gene_seqname + assert gene_seqname == "18", "Wrong seqname for KIAA1328: %s" % gene_seqname gene_start = df[KIAA1328_gene_mask].start.iloc[0] - assert (gene_start == 32663078), \ - "Wrong start for KIAA1328: %s" % (gene_start,) + assert gene_start == 32663078, "Wrong start for KIAA1328: %s" % (gene_start,) gene_end = df[KIAA1328_gene_mask].end.iloc[0] - assert (gene_end == 32663157), \ - "Wrong end for KIAA1328: %s" % 
(gene_end,) + assert gene_end == 32663157, "Wrong end for KIAA1328: %s" % (gene_end,) gene_strand = df[KIAA1328_gene_mask].strand.iloc[0] - assert (gene_strand == "+"), \ - "Wrong strand for KIAA1328: %s" % gene_strand + assert gene_strand == "+", "Wrong strand for KIAA1328: %s" % gene_strand + def test_create_missing_features(): - assert "gene" not in set(GTF_DATAFRAME["feature"]), \ + assert "gene" not in set(GTF_DATAFRAME["feature"]), ( "Original GTF should not contain gene feature" - assert "transcript" not in set(GTF_DATAFRAME["feature"]), \ + ) + assert "transcript" not in set(GTF_DATAFRAME["feature"]), ( "Original GTF should not contain transcript feature" + ) df_extra_features = create_missing_features( GTF_DATAFRAME, - unique_keys={ - "gene": "gene_id", - "transcript": "transcript_id" - }, + unique_keys={"gene": "gene_id", "transcript": "transcript_id"}, extra_columns={ "gene": {"gene_name"}, "transcript": {"gene_id", "gene_name", "transcript_name"}, - }) + }, + ) _check_expanded_dataframe(df_extra_features) diff --git a/tests/test_ensembl_gtf.py b/tests/test_ensembl_gtf.py index 3dc45ef..c897c8b 100644 --- a/tests/test_ensembl_gtf.py +++ b/tests/test_ensembl_gtf.py @@ -4,7 +4,7 @@ ENSEMBL_GTF_PATH = data_path("ensembl_grch37.head.gtf") -EXPECTED_FEATURES = set([ +EXPECTED_FEATURES = { "gene", "transcript", "exon", @@ -12,7 +12,7 @@ "UTR", "start_codon", "stop_codon", -]) +} def test_ensembl_gtf_columns(): @@ -20,62 +20,128 @@ def test_ensembl_gtf_columns(): features = set(df["feature"]) assert features == EXPECTED_FEATURES + # first 1000 lines of GTF only contained these genes EXPECTED_GENE_NAMES = { - 'FAM41C', 'CICP27', 'RNU6-1100P', 'NOC2L', 'AP006222.1', - 'LINC01128', 'RP4-669L17.1', 'RP11-206L10.2', 'PLEKHN1', - 'WBP1LP7', 'RP5-857K21.1', 'RP5-857K21.5', 'RNU6-1199P', - 'RP11-206L10.10', 'RP11-54O7.16', 'CICP7', 'AL627309.1', - 'RP5-857K21.11', 'DDX11L1', 'RP5-857K21.3', 'RP11-34P13.7', - 'AL669831.1', 'MTATP6P1', 'CICP3', 'WBP1LP6', 'LINC00115', - 
'hsa-mir-6723', 'RP5-857K21.7', 'SAMD11', 'RP11-206L10.5', - 'RP11-34P13.8', 'RP11-206L10.9', 'RP11-34P13.15', 'TUBB8P11', - 'MTATP8P1', 'RP4-669L17.8', 'RP11-206L10.1', 'RP11-34P13.13', - 'RP11-206L10.3', 'RP11-206L10.4', 'RP11-54O7.3', 'RP5-857K21.2', - 'OR4F5', 'MTND1P23', 'AL645608.1', 'RP11-34P13.16', 'RP11-34P13.14', - 'AP006222.2', 'OR4F29', 'RP4-669L17.4', 'AL732372.1', 'OR4G4P', - 'MTND2P28', 'OR4F16', 'KLHL17', 'FAM138A', 'OR4G11P', 'FAM87B', - 'RP5-857K21.15', 'AL645608.2', 'RP11-206L10.8', 'RP5-857K21.4', - 'MIR1302-10', 'RP11-54O7.2', 'RP4-669L17.10', 'RP11-54O7.1', - 'RP11-34P13.9', 'WASH7P', 'RP4-669L17.2' + "FAM41C", + "CICP27", + "RNU6-1100P", + "NOC2L", + "AP006222.1", + "LINC01128", + "RP4-669L17.1", + "RP11-206L10.2", + "PLEKHN1", + "WBP1LP7", + "RP5-857K21.1", + "RP5-857K21.5", + "RNU6-1199P", + "RP11-206L10.10", + "RP11-54O7.16", + "CICP7", + "AL627309.1", + "RP5-857K21.11", + "DDX11L1", + "RP5-857K21.3", + "RP11-34P13.7", + "AL669831.1", + "MTATP6P1", + "CICP3", + "WBP1LP6", + "LINC00115", + "hsa-mir-6723", + "RP5-857K21.7", + "SAMD11", + "RP11-206L10.5", + "RP11-34P13.8", + "RP11-206L10.9", + "RP11-34P13.15", + "TUBB8P11", + "MTATP8P1", + "RP4-669L17.8", + "RP11-206L10.1", + "RP11-34P13.13", + "RP11-206L10.3", + "RP11-206L10.4", + "RP11-54O7.3", + "RP5-857K21.2", + "OR4F5", + "MTND1P23", + "AL645608.1", + "RP11-34P13.16", + "RP11-34P13.14", + "AP006222.2", + "OR4F29", + "RP4-669L17.4", + "AL732372.1", + "OR4G4P", + "MTND2P28", + "OR4F16", + "KLHL17", + "FAM138A", + "OR4G11P", + "FAM87B", + "RP5-857K21.15", + "AL645608.2", + "RP11-206L10.8", + "RP5-857K21.4", + "MIR1302-10", + "RP11-54O7.2", + "RP4-669L17.10", + "RP11-54O7.1", + "RP11-34P13.9", + "WASH7P", + "RP4-669L17.2", } + def test_ensembl_gtf_gene_names(): df = read_gtf(ENSEMBL_GTF_PATH) gene_names = set(df["gene_name"]) - assert gene_names == EXPECTED_GENE_NAMES, \ - "Wrong gene names: %s, missing %s and unexpected %s" % ( + assert gene_names == EXPECTED_GENE_NAMES, ( + "Wrong gene 
names: %s, missing %s and unexpected %s" + % ( gene_names, EXPECTED_GENE_NAMES.difference(gene_names), - gene_names.difference(EXPECTED_GENE_NAMES) + gene_names.difference(EXPECTED_GENE_NAMES), ) + ) + def test_ensembl_gtf_gene_names_with_usecols(): df = read_gtf(ENSEMBL_GTF_PATH, usecols=["gene_name"]) gene_names = set(df["gene_name"]) - assert gene_names == EXPECTED_GENE_NAMES, \ - "Wrong gene names: %s, missing %s and unexpected %s" % ( + assert gene_names == EXPECTED_GENE_NAMES, ( + "Wrong gene names: %s, missing %s and unexpected %s" + % ( gene_names, EXPECTED_GENE_NAMES.difference(gene_names), - gene_names.difference(EXPECTED_GENE_NAMES) + gene_names.difference(EXPECTED_GENE_NAMES), ) + ) + def test_ensembl_gtf_gene_names_zip(): df = read_gtf(ENSEMBL_GTF_PATH + ".gz") gene_names = set(df["gene_name"]) - assert gene_names == EXPECTED_GENE_NAMES, \ - "Wrong gene names: %s, missing %s and unexpected %s" % ( + assert gene_names == EXPECTED_GENE_NAMES, ( + "Wrong gene names: %s, missing %s and unexpected %s" + % ( gene_names, EXPECTED_GENE_NAMES.difference(gene_names), - gene_names.difference(EXPECTED_GENE_NAMES) + gene_names.difference(EXPECTED_GENE_NAMES), ) + ) + def test_ensembl_gtf_gene_names_with_usecols_gzip(): df = read_gtf(ENSEMBL_GTF_PATH + ".gz", usecols=["gene_name"]) gene_names = set(df["gene_name"]) - assert gene_names == EXPECTED_GENE_NAMES, \ - "Wrong gene names: %s, missing %s and unexpected %s" % ( + assert gene_names == EXPECTED_GENE_NAMES, ( + "Wrong gene names: %s, missing %s and unexpected %s" + % ( gene_names, EXPECTED_GENE_NAMES.difference(gene_names), - gene_names.difference(EXPECTED_GENE_NAMES) + gene_names.difference(EXPECTED_GENE_NAMES), ) + ) diff --git a/tests/test_expand_attributes.py b/tests/test_expand_attributes.py index 386b36e..9d85baf 100644 --- a/tests/test_expand_attributes.py +++ b/tests/test_expand_attributes.py @@ -1,12 +1,13 @@ from gtfparse import expand_attribute_strings + def test_attributes_in_quotes(): attributes = [ 
- "gene_id \"ENSG001\"; tag \"bogotron\"; version \"1\";", - "gene_id \"ENSG002\"; tag \"wolfpuppy\"; version \"2\";" + 'gene_id "ENSG001"; tag "bogotron"; version "1";', + 'gene_id "ENSG002"; tag "wolfpuppy"; version "2";', ] parsed_dict = expand_attribute_strings(attributes, quote_char='"') - assert list(sorted(parsed_dict.keys())), ["gene_id", "tag", "version"] + assert sorted(parsed_dict.keys()) == ["gene_id", "tag", "version"] assert parsed_dict["gene_id"] == ["ENSG001", "ENSG002"] assert parsed_dict["tag"] == ["bogotron", "wolfpuppy"] assert parsed_dict["version"] == ["1", "2"] @@ -15,10 +16,10 @@ def test_attributes_in_quotes(): def test_attributes_without_quotes(): attributes = [ "gene_id ENSG001; tag bogotron; version 1;", - "gene_id ENSG002; tag wolfpuppy; version 2" + "gene_id ENSG002; tag wolfpuppy; version 2", ] parsed_dict = expand_attribute_strings(attributes) - assert list(sorted(parsed_dict.keys())) == ["gene_id", "tag", "version"] + assert sorted(parsed_dict.keys()) == ["gene_id", "tag", "version"] assert parsed_dict["gene_id"] == ["ENSG001", "ENSG002"] assert parsed_dict["tag"] == ["bogotron", "wolfpuppy"] assert parsed_dict["version"] == ["1", "2"] @@ -31,6 +32,6 @@ def test_optional_attributes(): "gene_id ENSG003; sometimes-present wolfpuppy;", ] parsed_dict = expand_attribute_strings(attributes) - assert list(sorted(parsed_dict.keys())) == ["gene_id", "sometimes-present"] - assert parsed_dict["gene_id"] == ["ENSG001", "ENSG002", "ENSG003"] - assert parsed_dict["sometimes-present"] == ["bogotron", "", "wolfpuppy"] + assert sorted(parsed_dict.keys()) == ["gene_id", "sometimes-present"] + assert parsed_dict["gene_id"] == ["ENSG001", "ENSG002", "ENSG003"] + assert parsed_dict["sometimes-present"] == ["bogotron", "", "wolfpuppy"] diff --git a/tests/test_multiple_values_for_tag_attribute.py b/tests/test_multiple_values_for_tag_attribute.py index fcb4a79..eac905e 100644 --- a/tests/test_multiple_values_for_tag_attribute.py +++
b/tests/test_multiple_values_for_tag_attribute.py @@ -1,4 +1,5 @@ from io import StringIO + from gtfparse import parse_gtf_and_expand_attributes # failing example from https://github.com/openvax/gtfparse/issues/2 @@ -11,25 +12,26 @@ """tag "cds_end_NF"; tag "mRNA_end_NF"; """ ) + def test_parse_tag_attributes(): parsed = parse_gtf_and_expand_attributes(StringIO(GTF_TEXT)) tag_column = parsed["tag"] assert len(tag_column) == 1 tags = tag_column[0] - assert tags == 'cds_end_NF,mRNA_end_NF' + assert tags == "cds_end_NF,mRNA_end_NF" + def test_parse_tag_attributes_with_usecols(): - parsed = parse_gtf_and_expand_attributes( - StringIO(GTF_TEXT), - restrict_attribute_columns=["tag"]) + parsed = parse_gtf_and_expand_attributes(StringIO(GTF_TEXT), restrict_attribute_columns=["tag"]) tag_column = parsed["tag"] assert len(tag_column) == 1 tags = tag_column[0] - assert tags == 'cds_end_NF,mRNA_end_NF' + assert tags == "cds_end_NF,mRNA_end_NF" + def test_parse_tag_attributes_with_usecols_other_column(): parsed = parse_gtf_and_expand_attributes( - StringIO(GTF_TEXT), - restrict_attribute_columns=["exon_id"]) + StringIO(GTF_TEXT), restrict_attribute_columns=["exon_id"] + ) assert "tag" not in parsed, "Expected 'tag' to get dropped but got %s" % (parsed,) diff --git a/tests/test_parse_gtf_lines.py b/tests/test_parse_gtf_lines.py index b26326e..b9bf3ed 100644 --- a/tests/test_parse_gtf_lines.py +++ b/tests/test_parse_gtf_lines.py @@ -1,11 +1,8 @@ +from io import StringIO + from pytest import raises -from gtfparse import ( - parse_gtf, - parse_gtf_and_expand_attributes, - REQUIRED_COLUMNS, - ParsingError -) -from io import StringIO + +from gtfparse import REQUIRED_COLUMNS, ParsingError, parse_gtf, parse_gtf_and_expand_attributes gtf_text = """ # sample GTF data copied from: @@ -14,12 +11,13 @@ 1\tprocessed_transcript\ttranscript\t11869\t14409\t.\t+\t.\tgene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype 
"transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana"; """ + def test_parse_gtf_lines_with_expand_attributes(): df = parse_gtf_and_expand_attributes(StringIO(gtf_text)) - # excluding 'attribute' column from required names - expected_columns = REQUIRED_COLUMNS[:8] + [ + expected_columns = [ + *REQUIRED_COLUMNS[:8], "gene_id", "gene_name", "gene_source", @@ -29,7 +27,7 @@ def test_parse_gtf_lines_with_expand_attributes(): "transcript_source", ] # convert to list since Py3's dictionary keys are a distinct collection type - assert list(df.columns) == expected_columns + assert list(df.columns) == expected_columns assert list(df["seqname"]) == ["1", "1"] # convert to list for comparison since numerical columns may be NumPy arrays assert list(df["start"]) == [11869, 11869] @@ -52,6 +50,7 @@ def test_parse_gtf_lines_without_expand_attributes(): assert df["score"].is_null().all(), "Unexpected scores: %s" % (df["score"],) assert len(df["attribute"]) == 2 + def test_parse_gtf_lines_error_too_few_fields(): bad_gtf_text = gtf_text.replace("\t", " ") # pylint: disable=no-value-for-parameter diff --git a/tests/test_read_stringtie_gtf.py b/tests/test_read_stringtie_gtf.py index 8abcea1..b105439 100644 --- a/tests/test_read_stringtie_gtf.py +++ b/tests/test_read_stringtie_gtf.py @@ -1,8 +1,10 @@ from gtfparse import read_gtf + from .data import data_path B16_GTF_PATH = data_path("B16.stringtie.head.gtf") + def _check_required_columns(gtf_dict): assert "feature" in gtf_dict, "Expected column named 'feature' in StringTie GTF" assert "cov" in gtf_dict, "Expected column named 'cov' in StringTie GTF" @@ -11,32 +13,37 @@ def _check_required_columns(gtf_dict): assert "exon" in features, "No exons in GTF (available: %s)" % features assert "transcript" in features, "No transcripts in GTF (available: %s)" % features + def _check_string_cov_and_FPKM(gtf_dict): for i, feature_name in enumerate(gtf_dict["feature"]): cov = gtf_dict["cov"][i] fpkm = 
gtf_dict["FPKM"][i] if feature_name == "exon": - assert len(fpkm) == 0, \ - "Expected missing FPKM for exon, got %s" % (fpkm,) - assert len(cov) > 0 and float(cov) >= 0, \ + assert len(fpkm) == 0, "Expected missing FPKM for exon, got %s" % (fpkm,) + assert len(cov) > 0 and float(cov) >= 0, ( "Expected non-negative cov for exon, got %s" % (cov,) + ) elif feature_name == "transcript": - assert len(cov) and float(cov) >= 0, \ + assert len(cov) and float(cov) >= 0, ( "Expected non-negative cov for transcript, got %s" % (cov,) - assert len(fpkm) > 0 and float(fpkm) >= 0, \ + ) + assert len(fpkm) > 0 and float(fpkm) >= 0, ( "Expected non-negative FPKM for transcript, got %s" % (fpkm,) + ) + def _check_float_cov_and_FPKM(gtf_dict): for i, feature_name in enumerate(gtf_dict["feature"]): cov = gtf_dict["cov"][i] fpkm = gtf_dict["FPKM"][i] - assert isinstance(cov, float), \ - "Expected cov to be float but got %s : %s" % (cov, type(cov)) + assert isinstance(cov, float), "Expected cov to be float but got %s : %s" % (cov, type(cov)) if feature_name == "exon": assert cov >= 0, "Expected non-negative cov for exon, got %s" % (cov,) elif feature_name == "transcript": - assert isinstance(fpkm, float), \ - "Expected FPKM to be float but got %s : %s" % (fpkm, type(fpkm)) + assert isinstance(fpkm, float), "Expected FPKM to be float but got %s : %s" % ( + fpkm, + type(fpkm), + ) assert cov >= 0, "Expected non-negative cov for transcript, got %s" % (cov,) assert fpkm >= 0, "Expected non-negative FPKM for transcript, got %s" % (fpkm,) @@ -46,9 +53,8 @@ def test_read_stringtie_gtf_as_dataframe(): _check_required_columns(gtf_df) _check_string_cov_and_FPKM(gtf_df) + def test_read_stringtie_gtf_as_dataframe_float_values(): - gtf_df = read_gtf( - B16_GTF_PATH, - column_converters={"cov": float, "FPKM": float}) + gtf_df = read_gtf(B16_GTF_PATH, column_converters={"cov": float, "FPKM": float}) _check_required_columns(gtf_df) _check_float_cov_and_FPKM(gtf_df) diff --git a/tests/test_refseq_gtf.py 
b/tests/test_refseq_gtf.py index 57c52d1..26a8fd4 100644 --- a/tests/test_refseq_gtf.py +++ b/tests/test_refseq_gtf.py @@ -1,8 +1,10 @@ from gtfparse import read_gtf + from .data import data_path REFSEQ_GTF_PATH = data_path("refseq.ucsc.small.gtf") + def _check_required_columns(gtf_dict): assert "feature" in gtf_dict, "Expected column named 'feature' in RefSeq GTF" assert "gene_id" in gtf_dict, "Expected column named 'gene_id' in RefSeq GTF" @@ -11,10 +13,16 @@ def _check_required_columns(gtf_dict): assert "exon" in features, "No exon features in GTF (available: %s)" % features assert "CDS" in features, "No CDS features in GTF (available: %s)" % features + def test_read_refseq_gtf_as_dataframe(): gtf_df = read_gtf(REFSEQ_GTF_PATH) _check_required_columns(gtf_df) + def test_read_refseq_and_transform_columns(): - gtf_df = read_gtf(REFSEQ_GTF_PATH, column_converters={"start": int, "end": int}, column_cast_types={"score": float}) - print(gtf_df) \ No newline at end of file + gtf_df = read_gtf( + REFSEQ_GTF_PATH, + column_converters={"start": int, "end": int}, + column_cast_types={"score": float}, + ) + print(gtf_df)