Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion gffquant/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from enum import Enum, auto, unique


__version__ = "2.18.5"
__version__ = "2.19.0"
__tool__ = "gffquant"


Expand Down
43 changes: 26 additions & 17 deletions gffquant/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,26 +128,34 @@ def main():
**kwargs,
)

if args.input_type == "fastq":
if args.input_type in ("fastq", "bam", "sam"):

stream_alignments(args, profiler)
if args.input_type == "fastq":

else:
stream_alignments(args, profiler)

input_file = args.bam if args.input_type == "bam" else args.sam
debug_samfile = None
if profiler.debug:
debug_samfile = f"{profiler.out_prefix}.{args.input_type}.filtered.sam"

profiler.count_alignments(
sys.stdin if input_file == "-" else input_file,
aln_format=args.input_type,
min_identity=args.min_identity,
min_seqlen=args.min_seqlen,
external_readcounts=args.import_readcounts,
unmarked_orphans=args.unmarked_orphans,
debug_samfile=debug_samfile,
)
else:

input_file = args.bam if args.input_type == "bam" else args.sam
debug_samfile = None
if profiler.debug:
debug_samfile = f"{profiler.out_prefix}.{args.input_type}.filtered.sam"

profiler.count_alignments(
sys.stdin if input_file == "-" else input_file,
aln_format=args.input_type,
min_identity=args.min_identity,
min_seqlen=args.min_seqlen,
external_readcounts=args.import_readcounts,
unmarked_orphans=args.unmarked_orphans,
debug_samfile=debug_samfile,
)

profiler.report_alignments()

else:

...

profiler.finalise(
restrict_reports=args.restrict_metrics,
Expand All @@ -156,6 +164,7 @@ def main():
dump_counters=args.debug,
in_memory=args.db_in_memory,
gene_group_db=args.gene_group_db,
external_gene_counts=args.gene_counts,
)


Expand Down
35 changes: 35 additions & 0 deletions gffquant/annotation/count_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

""" This module contains code for transforming gene counts to feature counts. """

import csv
import logging

from itertools import chain
Expand Down Expand Up @@ -197,6 +198,9 @@ class RegionCountAnnotator(CountAnnotator):
def __init__(self, strand_specific, report_scaling_factors=True):
CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)

def annotate_external(self, fn, db, gene_group_db=False):
raise NotImplementedError()

# pylint: disable=R0914,W0613
def annotate(self, refmgr, db, count_manager, gene_group_db=False):
"""
Expand Down Expand Up @@ -326,3 +330,34 @@ def annotate(self, refmgr, db, count_manager, gene_group_db=False):
self.unannotated_counts += counts[:4]

self.calculate_scaling_factors()

def annotate_external(self, fn, db, gene_group_db=False): # refmgr, db, count_manager, gene_group_db=False):

with open(fn, "rt", encoding="UTF-8") as _in:
for row in csv.DictReader(_in, delimiter="\t"):
# gene uniq_raw uniq_lnorm uniq_scaled combined_raw combined_lnorm combined_scaled
cols = row["uniq_raw"], row["uniq_lnorm"], row["combined_raw"], row["combined_lnorm"]
counts = tuple(map(float, cols))
ref = row["gene"]

if gene_group_db:
# ref_tokens = ref.split(".")
p = ref.rfind(".")
# gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
gene_id, ggroup_id = ref[:p], ref[p + 1:]
else:
ggroup_id, gene_id = ref, ref

gcounts = self.gene_counts.setdefault(gene_id, np.zeros(self.bins))
gcounts += counts
Comment on lines +336 to +352

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Reject or fully parse strand-specific external matrices.

annotate_external() always builds a 4-value counts tuple, but self.bins is 12 when --strand_specific is enabled. That makes gcounts += counts fail with a NumPy shape mismatch on the first imported row.

💡 Minimal guard if 12-bin import is not supported yet
 def annotate_external(self, fn, db, gene_group_db=False):  # refmgr, db, count_manager, gene_group_db=False):
+    if self.bins != 4:
+        raise ValueError(
+            "--gene_counts currently supports only non-strand-specific count matrices."
+        )
 
     with open(fn, "rt", encoding="UTF-8") as _in:
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
with open(fn, "rt", encoding="UTF-8") as _in:
for row in csv.DictReader(_in, delimiter="\t"):
# gene uniq_raw uniq_lnorm uniq_scaled combined_raw combined_lnorm combined_scaled
cols = row["uniq_raw"], row["uniq_lnorm"], row["combined_raw"], row["combined_lnorm"]
counts = tuple(map(float, cols))
ref = row["gene"]
if gene_group_db:
ref_tokens = ref.split(".")
gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
else:
ggroup_id, gene_id = ref, ref
gcounts = self.gene_counts.setdefault(gene_id, np.zeros(self.bins))
gcounts += counts
if self.bins != 4:
raise ValueError(
"--gene_counts currently supports only non-strand-specific count matrices."
)
with open(fn, "rt", encoding="UTF-8") as _in:
for row in csv.DictReader(_in, delimiter="\t"):
# gene uniq_raw uniq_lnorm uniq_scaled combined_raw combined_lnorm combined_scaled
cols = row["uniq_raw"], row["uniq_lnorm"], row["combined_raw"], row["combined_lnorm"]
counts = tuple(map(float, cols))
ref = row["gene"]
if gene_group_db:
ref_tokens = ref.split(".")
gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
else:
ggroup_id, gene_id = ref, ref
gcounts = self.gene_counts.setdefault(gene_id, np.zeros(self.bins))
gcounts += counts
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@gffquant/annotation/count_annotator.py` around lines 333 - 347,
annotate_external() currently always builds a 4-value tuple (counts) and then
adds it to self.gene_counts entries of length self.bins, causing a shape
mismatch when self.bins == 12 (strand-specific). Fix by adding a guard and
proper parsing: if self.bins != 4 then either (A) raise a clear error (e.g.,
RuntimeError) explaining that strand-specific (12-bin) external matrices are not
supported yet, or (B) implement 12-bin parsing by reading the expected
per-strand columns (instead of
("uniq_raw","uniq_lnorm","combined_raw","combined_lnorm")) into a numpy array of
length 12 and then add that array to gcounts; update annotate_external(), the
local variables counts and gcounts, and the use of row[...] keys accordingly so
shapes always match.

self.total_gene_counts += counts[:4]

region_annotation = db.query_sequence(ggroup_id)
if region_annotation is not None:
_, _, region_annotation = region_annotation
self.distribute_feature_counts(counts, region_annotation)

else:
self.unannotated_counts += counts[:4]

self.calculate_scaling_factors()
18 changes: 18 additions & 0 deletions gffquant/counters/count_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,24 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False):
uniq_counts, ambig_counts = [uniq_counter[seqid]], [ambig_counter[seqid]]

return uniq_counts, ambig_counts

# def set_counts(self, seqid, value, which_counter):
# counter = (self.uniq_seqcounts, self.ambig_seqcounts)[which_counter == "ambig"]
# counter[seqid] = value

# def load_data(self, fn):
# # gene uniq_raw uniq_lnorm uniq_scaled combined_raw combined_lnorm combined_scaled
# with open(fn, "rt", encoding="UTF-8") as _in:
# try:
# header = next(_in)
# except StopIteration:
# header = ""
# if header:
# for row in csv.reader(_in, delimiter="\t"):
# gene, counts = row[0], tuple(map(float, row[1:]))
# self.set_counts()



def get_regions(self, rid):
return set(self.uniq_regioncounts.get(rid, set())).union(
Expand Down
45 changes: 33 additions & 12 deletions gffquant/handle_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,7 @@ def validate_args(args):

if not all(os.path.isfile(f) for f in db_files):
raise ValueError(f"Cannot find annotation db at `{args.annotation_db}`.")
if (args.aligner == "bwa" and not check_bwa_index(args.reference)) or (args.aligner == "minimap" and not check_minimap2_index(args.reference)):
raise ValueError(f"Cannot find reference index at `{args.reference}`.")


has_fastq = any(
map(
lambda x: x is not None,
Expand All @@ -39,22 +37,39 @@ def validate_args(args):
)
)

if tuple(map(bool, (has_fastq, args.bam, args.sam))).count(True) != 1:
raise ValueError(f"Need exactly one type of input: bam={bool(args.bam)} sam={bool(args.sam)} fastq={bool(has_fastq)}.")
if tuple(map(bool, (has_fastq, args.bam, args.sam, args.gene_counts))).count(True) != 1:
raise ValueError(
"Need exactly one type of input: "
f"bam={bool(args.bam)} sam={bool(args.sam)} fastq={bool(has_fastq)} "
f"gene_counts={bool(args.gene_counts)}."
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.

args.input_type = "fastq" if has_fastq else ("bam" if args.bam else ("sam" if args.sam else "gene_counts"))

args.input_type = "fastq" if has_fastq else ("bam" if args.bam else "sam")
if has_fastq:
if not bool(args.reference and args.aligner):
raise ValueError("--fastq-<readtype> input requires --reference and --aligner to be set.")

if (args.reference or args.aligner) and not has_fastq:
raise ValueError("--reference/--aligner are not needed with alignment input (bam, sam).")
if bool(args.reference and args.aligner) != has_fastq:
raise ValueError("--fastq requires --reference and --aligner to be set.")
if (args.aligner == "bwa" and not check_bwa_index(args.reference)) or (args.aligner == "minimap" and not check_minimap2_index(args.reference)):
raise ValueError(f"Cannot find `${args.aligner}` reference index at `{args.reference}`.")

if args.input_type == "fastq":
args.input_data = check_input_reads(
fwd_reads=args.reads1, rev_reads=args.reads2,
single_reads=args.singles, orphan_reads=args.orphans,
)

else:
if bool(args.reference or args.aligner):
raise ValueError("--reference/--aligner parameters are only required for --fastq-<readtype> input.")

if (args.strand_specific and args.gene_counts):
raise NotImplementedError("External gene count input is not implemented for strand-specific counts.")

# if (args.reference or args.aligner) and not has_fastq:
# raise ValueError("--reference/--aligner parameters are only required for --fastq-<readtype> input.")

# if bool(args.reference and args.aligner) != has_fastq:
# raise ValueError("--fastq-<readtype> input requires --reference and --aligner to be set.")

if args.restrict_metrics:
restrict_metrics = set(args.restrict_metrics.split(","))
invalid = restrict_metrics.difference(('raw', 'lnorm', 'scaled', 'rpkm'))
Expand Down Expand Up @@ -189,6 +204,12 @@ def handle_args(args):
# Input from STDOUT can be used with '-'."""
# ),
# )
ap.add_argument(
"--gene_counts",
type=str,
help="Path to a file containing a gene_counts matrix from a previous gffquant run."
)

ap.add_argument(
"--bam",
type=str,
Expand Down
Loading
Loading