From 216ed7e2390d20ceeabd843df783c43e2962be2d Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 12 Feb 2025 11:29:19 +0100 Subject: [PATCH 01/19] add skeleton for read count input --- gffquant/__main__.py | 42 +++++++++------ gffquant/handle_args.py | 16 ++++-- gffquant/profilers/feature_quantifier.py | 68 +++++++++++------------- 3 files changed, 70 insertions(+), 56 deletions(-) diff --git a/gffquant/__main__.py b/gffquant/__main__.py index c5804587..f7682142 100644 --- a/gffquant/__main__.py +++ b/gffquant/__main__.py @@ -128,26 +128,34 @@ def main(): **kwargs, ) - if args.input_type == "fastq": + if args.input_type in ("fastq", "bam", "sam"): - stream_alignments(args, profiler) + if args.input_type == "fastq": - else: + stream_alignments(args, profiler) - input_file = args.bam if args.input_type == "bam" else args.sam - debug_samfile = None - if profiler.debug: - debug_samfile = f"{profiler.out_prefix}.{args.input_type}.filtered.sam" - - profiler.count_alignments( - sys.stdin if input_file == "-" else input_file, - aln_format=args.input_type, - min_identity=args.min_identity, - min_seqlen=args.min_seqlen, - external_readcounts=args.import_readcounts, - unmarked_orphans=args.unmarked_orphans, - debug_samfile=debug_samfile, - ) + else: + + input_file = args.bam if args.input_type == "bam" else args.sam + debug_samfile = None + if profiler.debug: + debug_samfile = f"{profiler.out_prefix}.{args.input_type}.filtered.sam" + + profiler.count_alignments( + sys.stdin if input_file == "-" else input_file, + aln_format=args.input_type, + min_identity=args.min_identity, + min_seqlen=args.min_seqlen, + external_readcounts=args.import_readcounts, + unmarked_orphans=args.unmarked_orphans, + debug_samfile=debug_samfile, + ) + + profiler.report_alignments() + + else: + + profiler.load_gene_counts(args.gene_counts) profiler.finalise( restrict_reports=args.restrict_metrics, diff --git a/gffquant/handle_args.py b/gffquant/handle_args.py index 8be6a387..8c70561f 100644 --- a/gffquant/handle_args.py +++ b/gffquant/handle_args.py @@ -39,10 +39,14 @@ def validate_args(args): ) ) - if tuple(map(bool, (has_fastq, args.bam, args.sam))).count(True) != 1: - raise ValueError(f"Need exactly one type of input: bam={bool(args.bam)} sam={bool(args.sam)} fastq={bool(has_fastq)}.") + if tuple(map(bool, (has_fastq, args.bam, args.sam, args.gene_counts))).count(True) != 1: + raise ValueError( + "Need exactly one type of input: " + f"bam={bool(args.bam)} sam={bool(args.sam)} fastq={bool(has_fastq)} " + f"gene_counts={bool(args.gene_count)}." + ) - args.input_type = "fastq" if has_fastq else ("bam" if args.bam else "sam") + args.input_type = "fastq" if has_fastq else ("bam" if args.bam else ("sam" if args.sam else "gene_counts")) if (args.reference or args.aligner) and not has_fastq: raise ValueError("--reference/--aligner are not needed with alignment input (bam, sam).") @@ -189,6 +193,12 @@ def handle_args(args): # Input from STDOUT can be used with '-'.""" # ), # ) + ap.add_argument( + "--gene_counts", + type=str, + help="Path to a file containing a gene_counts matrix from a previous gffquant run." + ) + ap.add_argument( "--bam", type=str, diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index bd5c5c96..1a4ed63c 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -330,16 +330,8 @@ def count_alignments( self.aln_counter.update(aln_reader.get_alignment_stats_dict()) - def finalise( - self, - restrict_reports=None, - report_category=False, - report_unannotated=False, - dump_counters=False, - in_memory=True, - gene_group_db=False, - ): + def report_alignments(self): with gzip.open(f"{self.out_prefix}.aln_stats.txt.gz", "wt") as aln_stats_out: print( AlignmentProcessor.get_alignment_stats_str( @@ -352,19 +344,41 @@ def finalise( ), file=aln_stats_out ) + for metric, value in ( + ("Input reads", "full_read_count"), + ("Aligned reads", "read_count"), + ("Alignments", "pysam_total"), + ("Reads passing filters", "filtered_read_count"), + ("Alignments passing filters", "pysam_passed"), + (" - Discarded due to seqid", "pysam_seqid_filt"), + (" - Discarded due to length", "pysam_len_filt"), + # ("Unannotated multimappers", "unannotated_ambig"), + ): + logger.info("%s: %s", metric, self.aln_counter.get(value)) + + logger.info( + "Alignment rate: %s%%, Filter pass rate: %s%%", + round(self.aln_counter["read_count"] / self.aln_counter["full_read_count"], 3) * 100, + round(self.aln_counter["filtered_read_count"] / self.aln_counter["full_read_count"], 3) * 100, + ) + + def load_gene_counts(self, gene_count_matrix): + self.aln_counter["aln_count"] = 1 + + def finalise( + self, + restrict_reports=None, + report_category=False, + report_unannotated=False, + dump_counters=False, + in_memory=True, + gene_group_db=False, + ): if self.aln_counter.get("aln_count"): if self.adm is None: self.adm = AnnotationDatabaseManager.from_db(self.db, in_memory=in_memory) - report_args = { - "restrict_reports": restrict_reports, - "report_category": report_category, - "report_unannotated": report_unannotated, - } - - # self.write_coverage() - self.process_counters( restrict_reports=restrict_reports, report_category=report_category, @@ -372,25 +386,7 @@ def finalise( dump_counters=dump_counters, in_memory=in_memory, gene_group_db=gene_group_db, - ) - - for metric, value in ( - ("Input reads", "full_read_count"), - ("Aligned reads", "read_count"), - ("Alignments", "pysam_total"), - ("Reads passing filters", "filtered_read_count"), - ("Alignments passing filters", "pysam_passed"), - (" - Discarded due to seqid", "pysam_seqid_filt"), - (" - Discarded due to length", "pysam_len_filt"), - # ("Unannotated multimappers", "unannotated_ambig"), - ): - logger.info("%s: %s", metric, self.aln_counter.get(value)) - - logger.info( - "Alignment rate: %s%%, Filter pass rate: %s%%", - round(self.aln_counter["read_count"] / self.aln_counter["full_read_count"], 3) * 100, - round(self.aln_counter["filtered_read_count"] / self.aln_counter["full_read_count"], 3) * 100, - ) + ) self.adm.clear_caches() logger.info("Finished.") From 5121120f6c513aeb466d4500ac90149c728c65d4 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 12 Feb 2025 11:32:55 +0100 Subject: [PATCH 02/19] version -> 2.19.0, move referencehit to own module --- gffquant/__init__.py | 2 +- gffquant/profilers/feature_quantifier.py | 34 +----------------------- 2 files changed, 2 insertions(+), 34 deletions(-) diff --git a/gffquant/__init__.py b/gffquant/__init__.py index ad7efe99..31f4177f 100644 --- a/gffquant/__init__.py +++ b/gffquant/__init__.py @@ -5,7 +5,7 @@ from enum import Enum, auto, unique -__version__ = "2.18.2" +__version__ = "2.19.0" __tool__ = "gffquant" diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 1a4ed63c..567f0f6b 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -13,6 +13,7 @@ from dataclasses import dataclass, asdict from .panda_coverage_profiler import PandaCoverageProfiler +from .reference_hit import ReferenceHit from ..alignment import AlignmentGroup, AlignmentProcessor, SamFlags from ..annotation import GeneCountAnnotator, RegionCountAnnotator, CountWriter from ..counters import CountManager @@ -24,39 +25,6 @@ logger = logging.getLogger(__name__) -@dataclass(slots=True) -class ReferenceHit: - rid: int = None - start: int = None - end: int = None - rev_strand: bool = None - cov_start: int = None - cov_end: int = None - has_annotation: bool = None - n_aln: int = None - is_ambiguous: bool = None - library_mod: int = None - mate_id: int = None - - def __hash__(self): - return hash(tuple(asdict(self).values())) - - def __eq__(self, other): - return all( - item[0][1] == item[1][1] - for item in zip( - sorted(asdict(self).items()), - sorted(asdict(other).items()) - ) - ) - - def __str__(self): - return "\t".join(map(str, asdict(self).values())) - - def __repr__(self): - return str(self) - - class FeatureQuantifier(ABC): """ Three groups of alignments: From 75aceaa7419ba2571d1dc68dc6b000ab88347479 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 12 Feb 2025 14:59:46 +0100 Subject: [PATCH 03/19] version -> 2.19.0, move referencehit to own module --- gffquant/profilers/reference_hit.py | 36 +++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 gffquant/profilers/reference_hit.py diff --git a/gffquant/profilers/reference_hit.py b/gffquant/profilers/reference_hit.py new file mode 100644 index 00000000..5c1be314 --- /dev/null +++ b/gffquant/profilers/reference_hit.py @@ -0,0 +1,36 @@ +""" module docstring """ + +from dataclasses import asdict, dataclass + + +@dataclass(slots=True) +class ReferenceHit: + rid: int = None + start: int = None + end: int = None + rev_strand: bool = None + cov_start: int = None + cov_end: int = None + has_annotation: bool = None + n_aln: int = None + is_ambiguous: bool = None + library_mod: int = None + mate_id: int = None + + def __hash__(self): + return hash(tuple(asdict(self).values())) + + def __eq__(self, other): + return all( + item[0][1] == item[1][1] + for item in zip( + sorted(asdict(self).items()), + sorted(asdict(other).items()) + ) + ) + + def __str__(self): + return "\t".join(map(str, asdict(self).values())) + + def __repr__(self): + return str(self) \ No newline at end of file From c719aa617ddf78cdc57cd5e4797a12c2934c824c Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 12 Feb 2025 15:00:30 +0100 Subject: [PATCH 04/19] added functionality to add external genecounts to gene_annotator --- gffquant/__main__.py | 3 +- gffquant/annotation/count_annotator.py | 30 ++++++++++++++++ gffquant/counters/count_manager.py | 20 +++++++++++ gffquant/profilers/feature_quantifier.py | 45 ++++++++++++++++-------- 4 files changed, 83 insertions(+), 15 deletions(-) diff --git a/gffquant/__main__.py b/gffquant/__main__.py index f7682142..4133d1a1 100644 --- a/gffquant/__main__.py +++ b/gffquant/__main__.py @@ -155,7 +155,7 @@ def main(): else: - profiler.load_gene_counts(args.gene_counts) + ... profiler.finalise( restrict_reports=args.restrict_metrics, @@ -164,6 +164,7 @@ def main(): dump_counters=args.debug, in_memory=args.db_in_memory, gene_group_db=args.gene_group_db, + external_gene_counts=args.gene_counts, ) diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py index feb96b92..8506b7b4 100644 --- a/gffquant/annotation/count_annotator.py +++ b/gffquant/annotation/count_annotator.py @@ -2,6 +2,7 @@ """ This module contains code for transforming gene counts to feature counts. """ +import csv import logging from itertools import chain @@ -326,3 +327,32 @@ def annotate(self, refmgr, db, count_manager, gene_group_db=False): self.unannotated_counts += counts[:4] self.calculate_scaling_factors() + + def annotate_external(self, fn, db, gene_group_db=False): # refmgr, db, count_manager, gene_group_db=False): + + with open(fn, "rt", encoding="UTF-8") as _in: + for row in csv.DictReader(_in, delimiter="\t"): + # gene uniq_raw uniq_lnorm uniq_scaled combined_raw combined_lnorm combined_scaled + cols = row["uniq_raw"], row["uniq_lnorm"], row["combined_raw"], row["combined_lnorm"] + counts = tuple(map(float, cols)) + ref = row["gene"] + + if gene_group_db: + ref_tokens = ref.split(".") + gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] + else: + ggroup_id, gene_id = ref, ref + + gcounts = self.gene_counts.setdefault(gene_id, np.zeros(self.bins)) + gcounts += counts + self.total_gene_counts += counts[:4] + + region_annotation = db.query_sequence(ggroup_id) + if region_annotation is not None: + _, _, region_annotation = region_annotation + self.distribute_feature_counts(counts, region_annotation) + + else: + self.unannotated_counts += counts[:4] + + self.calculate_scaling_factors() diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py index 40ae72a6..7e7a8205 100644 --- a/gffquant/counters/count_manager.py +++ b/gffquant/counters/count_manager.py @@ -1,5 +1,7 @@ """count_manager""" +import csv + from collections import Counter from .. import DistributionMode @@ -155,6 +157,24 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False): uniq_counts, ambig_counts = [uniq_counter[seqid]], [ambig_counter[seqid]] return uniq_counts, ambig_counts + + # def set_counts(self, seqid, value, which_counter): + # counter = (self.uniq_seqcounts, self.ambig_seqcounts)[which_counter == "ambig"] + # counter[seqid] = value + + # def load_data(self, fn): + # # gene uniq_raw uniq_lnorm uniq_scaled combined_raw combined_lnorm combined_scaled + # with open(fn, "rt", encoding="UTF-8") as _in: + # try: + # header = next(_in) + # except StopIteration: + # header = "" + # if header: + # for row in csv.reader(_in, delimiter="\t"): + # gene, counts = row[0], tuple(map(float, row[1:])) + # self.set_counts() + + def get_regions(self, rid): return set(self.uniq_regioncounts.get(rid, set())).union( diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 567f0f6b..e95fe100 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -121,31 +121,44 @@ def process_counters( dump_counters=True, in_memory=True, gene_group_db=False, + external_gene_counts=None, ): if self.adm is None: self.adm = AnnotationDatabaseManager.from_db(self.db, in_memory=in_memory) - if dump_counters: + if dump_counters and not external_gene_counts: self.count_manager.dump_raw_counters(self.out_prefix, self.reference_manager) report_scaling_factors = restrict_reports is None or "scaled" in restrict_reports Annotator = (GeneCountAnnotator, RegionCountAnnotator)[self.run_mode.overlap_required] count_annotator = Annotator(self.strand_specific, report_scaling_factors=report_scaling_factors) - count_annotator.annotate(self.reference_manager, self.adm, self.count_manager, gene_group_db=gene_group_db,) + + if external_gene_counts: + count_annotator.annotate_external(external_gene_counts) + total_readcount = 0 + filtered_readcount = 0 + has_ambig_counts = True + else: + count_annotator.annotate(self.reference_manager, self.adm, self.count_manager, gene_group_db=gene_group_db,) + total_readcount = self.aln_counter["read_count"] + filtered_readcount = self.aln_counter["filtered_read_count"] + has_ambig_counts = self.count_manager.has_ambig_counts() count_writer = CountWriter( self.out_prefix, - has_ambig_counts=self.count_manager.has_ambig_counts(), + has_ambig_counts=has_ambig_counts, strand_specific=self.strand_specific, restrict_reports=restrict_reports, report_category=report_category, - total_readcount=self.aln_counter["read_count"], - filtered_readcount=self.aln_counter["filtered_read_count"], + total_readcount=total_readcount, + filtered_readcount=filtered_readcount, ) - unannotated_reads = self.count_manager.get_unannotated_reads() - unannotated_reads += self.aln_counter["unannotated_ambig"] + unannotated_reads = 0 + if not external_gene_counts: + unannotated_reads += self.count_manager.get_unannotated_reads() + unannotated_reads += self.aln_counter["unannotated_ambig"] count_writer.write_feature_counts( self.adm, @@ -153,11 +166,12 @@ def process_counters( (None, unannotated_reads)[report_unannotated], ) - count_writer.write_gene_counts( - count_annotator.gene_counts, - count_annotator.scaling_factors["total_gene_uniq"], - count_annotator.scaling_factors["total_gene_ambi"] - ) + if not external_gene_counts: + count_writer.write_gene_counts( + count_annotator.gene_counts, + count_annotator.scaling_factors["total_gene_uniq"], + count_annotator.scaling_factors["total_gene_ambi"] + ) self.adm.clear_caches() @@ -330,8 +344,9 @@ def report_alignments(self): round(self.aln_counter["filtered_read_count"] / self.aln_counter["full_read_count"], 3) * 100, ) - def load_gene_counts(self, gene_count_matrix): - self.aln_counter["aln_count"] = 1 + # def load_gene_counts(self, gene_count_matrix): + # self.aln_counter["aln_count"] = 1 + # self.count_manager.load_data(gene_count_matrix) def finalise( self, @@ -341,6 +356,7 @@ def finalise( dump_counters=False, in_memory=True, gene_group_db=False, + external_gene_counts=None, ): if self.aln_counter.get("aln_count"): @@ -354,6 +370,7 @@ def finalise( dump_counters=dump_counters, in_memory=in_memory, gene_group_db=gene_group_db, + external_gene_counts=external_gene_counts, ) self.adm.clear_caches() From 68aa9901153810c99921c9e6086e9a824e4cc967 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 12 Feb 2025 15:11:46 +0100 Subject: [PATCH 05/19] aln stats are now json dumps --- gffquant/profilers/feature_quantifier.py | 26 +++++++++++++----------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index e95fe100..bb29e721 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -314,18 +314,20 @@ def count_alignments( def report_alignments(self): - with gzip.open(f"{self.out_prefix}.aln_stats.txt.gz", "wt") as aln_stats_out: - print( - AlignmentProcessor.get_alignment_stats_str( - [ - v - for k, v in self.aln_counter.items() - if k.startswith("pysam_") and not k.endswith("total") - ], - table=True, - ), - file=aln_stats_out - ) + with open(f"{self.out_prefix}.aln_stats.json", "wb") as aln_stats_out: + json.dump(self.aln_counter, aln_stats_out) + # print( + # AlignmentProcessor.get_alignment_stats_str( + # [ + # v + # for k, v in self.aln_counter.items() + # if k.startswith("pysam_") and not k.endswith("total") + # ], + # table=True, + # ), + # file=aln_stats_out + # ) + for metric, value in ( ("Input reads", "full_read_count"), ("Aligned reads", "read_count"), From c6e2eff927f424380175e74d2ea2c507bd428ab0 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 12 Feb 2025 16:34:10 +0100 Subject: [PATCH 06/19] minor. --- gffquant/profilers/feature_quantifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index bb29e721..f05e2ce8 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -361,7 +361,7 @@ def finalise( external_gene_counts=None, ): - if self.aln_counter.get("aln_count"): + if self.aln_counter.get("aln_count") or external_gene_counts: if self.adm is None: self.adm = AnnotationDatabaseManager.from_db(self.db, in_memory=in_memory) From fc10c9fd6318d0960b51430058bc673ecb12d19a Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 12 Feb 2025 16:36:46 +0100 Subject: [PATCH 07/19] minor. --- gffquant/profilers/feature_quantifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index f05e2ce8..99300897 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -135,7 +135,7 @@ def process_counters( count_annotator = Annotator(self.strand_specific, report_scaling_factors=report_scaling_factors) if external_gene_counts: - count_annotator.annotate_external(external_gene_counts) + count_annotator.annotate_external(external_gene_counts, self.adm, gene_group_db=gene_group_db,) total_readcount = 0 filtered_readcount = 0 has_ambig_counts = True From b9755191d43106b36f3beaef9483f91ded976bee Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 12 Feb 2025 16:43:13 +0100 Subject: [PATCH 08/19] minor. --- gffquant/profilers/feature_quantifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 99300897..b499e12b 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -136,8 +136,8 @@ def process_counters( if external_gene_counts: count_annotator.annotate_external(external_gene_counts, self.adm, gene_group_db=gene_group_db,) - total_readcount = 0 - filtered_readcount = 0 + total_readcount = 1 + filtered_readcount = 1 has_ambig_counts = True else: count_annotator.annotate(self.reference_manager, self.adm, self.count_manager, gene_group_db=gene_group_db,) From 161aae220ddfe4ae18e4d656573a9a2b05a41541 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 11 May 2026 21:51:36 +0200 Subject: [PATCH 09/19] fix typo --- gffquant/handle_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/handle_args.py b/gffquant/handle_args.py index 8c70561f..50a96357 100644 --- a/gffquant/handle_args.py +++ b/gffquant/handle_args.py @@ -43,7 +43,7 @@ def validate_args(args): raise ValueError( "Need exactly one type of input: " f"bam={bool(args.bam)} sam={bool(args.sam)} fastq={bool(has_fastq)} " - f"gene_counts={bool(args.gene_count)}." + f"gene_counts={bool(args.gene_counts)}." ) args.input_type = "fastq" if has_fastq else ("bam" if args.bam else ("sam" if args.sam else "gene_counts")) From 587155d5486d237c8822ebb28920bcfabaec3bfc Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 11 May 2026 21:54:46 +0200 Subject: [PATCH 10/19] fix ReferenceHit comparison --- gffquant/profilers/reference_hit.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/gffquant/profilers/reference_hit.py b/gffquant/profilers/reference_hit.py index 5c1be314..4cb30f45 100644 --- a/gffquant/profilers/reference_hit.py +++ b/gffquant/profilers/reference_hit.py @@ -21,13 +21,9 @@ def __hash__(self): return hash(tuple(asdict(self).values())) def __eq__(self, other): - return all( - item[0][1] == item[1][1] - for item in zip( - sorted(asdict(self).items()), - sorted(asdict(other).items()) - ) - ) + if not isinstance(other, ReferenceHit): + raise NotImplementedError(f"{other} is not a valid ReferenceHit object.") + return sorted(asdict(self).items()) == sorted(asdict(other).items()) def __str__(self): return "\t".join(map(str, asdict(self).values())) From c2043ce38c91c83856b6cdf537f31edb206d5efc Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 11 May 2026 21:58:33 +0200 Subject: [PATCH 11/19] fix alignment rate report error on zero alignments --- gffquant/profilers/feature_quantifier.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 280d81cf..81729a61 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -340,10 +340,16 @@ def report_alignments(self): ): logger.info("%s: %s", metric, self.aln_counter.get(value)) + if self.aln_counter["full_read_count"]: + alignment_rate = round(self.aln_counter["read_count"] / self.aln_counter["full_read_count"], 3) * 100, + filter_pass_rate = round(self.aln_counter["filtered_read_count"] / self.aln_counter["full_read_count"], 3) * 100, + else: + alignment_rate, filter_pass_rate = None, None + logger.info( - "Alignment rate: %s%%, Filter pass rate: %s%%", - round(self.aln_counter["read_count"] / self.aln_counter["full_read_count"], 3) * 100, - round(self.aln_counter["filtered_read_count"] / self.aln_counter["full_read_count"], 3) * 100, + "Alignment rate: %s%%, Filter pass rate: %s%%" % ( + alignment_rate, filter_pass_rate, + ) ) # def load_gene_counts(self, gene_count_matrix): From 5fdcb27e7e803325562f491d6fc4bcbe0249a936 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 11 May 2026 21:59:03 +0200 Subject: [PATCH 12/19] fix hanging indent --- gffquant/profilers/feature_quantifier.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 81729a61..66bcb5cc 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -329,16 +329,16 @@ def report_alignments(self): # ) for metric, value in ( - ("Input reads", "full_read_count"), - ("Aligned reads", "read_count"), - ("Alignments", "pysam_total"), - ("Reads passing filters", "filtered_read_count"), - ("Alignments passing filters", "pysam_passed"), - (" - Discarded due to seqid", "pysam_seqid_filt"), - (" - Discarded due to length", "pysam_len_filt"), - # ("Unannotated multimappers", "unannotated_ambig"), - ): - logger.info("%s: %s", metric, self.aln_counter.get(value)) + ("Input reads", "full_read_count"), + ("Aligned reads", "read_count"), + ("Alignments", "pysam_total"), + ("Reads passing filters", "filtered_read_count"), + ("Alignments passing filters", "pysam_passed"), + (" - Discarded due to seqid", "pysam_seqid_filt"), + (" - Discarded due to length", "pysam_len_filt"), + # ("Unannotated multimappers", "unannotated_ambig"), + ): + logger.info("%s: %s", metric, self.aln_counter.get(value)) if self.aln_counter["full_read_count"]: alignment_rate = round(self.aln_counter["read_count"] / self.aln_counter["full_read_count"], 3) * 100, From 11e8a8406823b20d096e26d7e4d0bafd3ec76d84 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 11 May 2026 21:59:26 +0200 Subject: [PATCH 13/19] fix aln_stats output --- gffquant/profilers/feature_quantifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 66bcb5cc..e1a862e9 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -314,7 +314,7 @@ def count_alignments( def report_alignments(self): - with open(f"{self.out_prefix}.aln_stats.json", "wb") as aln_stats_out: + with open(f"{self.out_prefix}.aln_stats.json", "wt") as aln_stats_out: json.dump(self.aln_counter, aln_stats_out) # print( # AlignmentProcessor.get_alignment_stats_str( From e19ba121b086f80a74caddafc0437156db1ff021 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 11 May 2026 22:13:13 +0200 Subject: [PATCH 14/19] fix potential crash when region counting operates on external gene counts --- gffquant/annotation/count_annotator.py | 3 +++ gffquant/profilers/feature_quantifier.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py index 8506b7b4..b9db3e3c 100644 --- a/gffquant/annotation/count_annotator.py +++ b/gffquant/annotation/count_annotator.py @@ -198,6 +198,9 @@ class RegionCountAnnotator(CountAnnotator): def __init__(self, strand_specific, report_scaling_factors=True): CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) + def annotate_external(self, fn, db, gene_group_db=False): + raise NotImplementedError() + # pylint: disable=R0914,W0613 def annotate(self, refmgr, db, count_manager, gene_group_db=False): """ diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index e1a862e9..b83c5ba5 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -131,7 +131,7 @@ def process_counters( report_scaling_factors = restrict_reports is None or "scaled" in restrict_reports - Annotator = (GeneCountAnnotator, RegionCountAnnotator)[self.run_mode.overlap_required] + Annotator = (GeneCountAnnotator, RegionCountAnnotator)[self.run_mode.overlap_required and not external_gene_counts] count_annotator = Annotator(self.strand_specific, report_scaling_factors=report_scaling_factors) if external_gene_counts: From 149349f3e2941d28084c3f9cf1271c03b095c31f Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 11 May 2026 22:13:48 +0200 Subject: [PATCH 15/19] removed unused csv import --- gffquant/counters/count_manager.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py index 7e7a8205..0e89b61c 100644 --- a/gffquant/counters/count_manager.py +++ b/gffquant/counters/count_manager.py @@ -1,7 +1,5 @@ """count_manager""" -import csv - from collections import Counter from .. import DistributionMode From 0d0ab7c58503c93fed55dec371f0338d764f27bc Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 11 May 2026 22:49:31 +0200 Subject: [PATCH 16/19] minor refactor --- gffquant/annotation/count_annotator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py index b9db3e3c..16d5425b 100644 --- a/gffquant/annotation/count_annotator.py +++ b/gffquant/annotation/count_annotator.py @@ -341,8 +341,10 @@ def annotate_external(self, fn, db, gene_group_db=False): # refmgr, db, count_m ref = row["gene"] if gene_group_db: - ref_tokens = ref.split(".") - gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] + # ref_tokens = ref.split(".") + p = ref.rfind(".") + # gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] + gene_id, ggroup_id = ref[:p], ref[p + 1:] else: ggroup_id, gene_id = ref, ref From 630097010bb6e1700b2063d8abf3381844ed6bef Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 11 May 2026 22:49:48 +0200 Subject: [PATCH 17/19] prevent strand-specific to be used together with gene_count input --- gffquant/handle_args.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gffquant/handle_args.py b/gffquant/handle_args.py index 50a96357..92385003 100644 --- a/gffquant/handle_args.py +++ b/gffquant/handle_args.py @@ -53,6 +53,9 @@ def validate_args(args): if bool(args.reference and args.aligner) != has_fastq: raise ValueError("--fastq requires --reference and --aligner to be set.") + if (args.strand_specific and args.gene_counts): + raise NotImplementedError("External gene count input is not implemented for strand-specific counts.") + if args.input_type == "fastq": args.input_data = check_input_reads( fwd_reads=args.reads1, rev_reads=args.reads2, From 7090337cc15e7eb9ff396d0baf93b97727fcdfb1 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 12 May 2026 13:59:10 +0200 Subject: [PATCH 18/19] fix error message --- gffquant/handle_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/handle_args.py b/gffquant/handle_args.py index 92385003..ebff1f94 100644 --- a/gffquant/handle_args.py +++ b/gffquant/handle_args.py @@ -49,7 +49,7 @@ def validate_args(args): args.input_type = "fastq" if has_fastq else ("bam" if args.bam else ("sam" if args.sam else "gene_counts")) if (args.reference or args.aligner) and not has_fastq: - raise ValueError("--reference/--aligner are not needed with alignment input (bam, sam).") + raise ValueError("--reference/--aligner parameters are only required for --fastq- input.") if bool(args.reference and args.aligner) != has_fastq: raise ValueError("--fastq requires --reference and --aligner to be set.") From f229f2e1305849a095274a783be0dd34cee5f919 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 12 May 2026 14:16:51 +0200 Subject: [PATCH 19/19] adjusting input argument logic --- gffquant/handle_args.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/gffquant/handle_args.py b/gffquant/handle_args.py index ebff1f94..fe64d9b8 100644 --- a/gffquant/handle_args.py +++ b/gffquant/handle_args.py @@ -27,9 +27,7 @@ def validate_args(args): if not all(os.path.isfile(f) for f in db_files): raise ValueError(f"Cannot find annotation db at `{args.annotation_db}`.") - if (args.aligner == "bwa" and not check_bwa_index(args.reference)) or (args.aligner == "minimap" and not check_minimap2_index(args.reference)): - raise ValueError(f"Cannot find reference index at `{args.reference}`.") - + has_fastq = any( map( lambda x: x is not None, @@ -48,20 +46,30 @@ def validate_args(args): args.input_type = "fastq" if has_fastq else ("bam" if args.bam else ("sam" if args.sam else "gene_counts")) - if (args.reference or args.aligner) and not has_fastq: - raise ValueError("--reference/--aligner parameters are only required for --fastq- input.") - if bool(args.reference and args.aligner) != has_fastq: - raise ValueError("--fastq requires --reference and --aligner to be set.") + if has_fastq: + if not bool(args.reference and args.aligner): + raise ValueError("--fastq- input requires --reference and --aligner to be set.") - if (args.strand_specific and args.gene_counts): - raise NotImplementedError("External gene count input is not implemented for strand-specific counts.") + if (args.aligner == "bwa" and not check_bwa_index(args.reference)) or (args.aligner == "minimap" and not check_minimap2_index(args.reference)): + raise ValueError(f"Cannot find `${args.aligner}` reference index at `{args.reference}`.") - if args.input_type == "fastq": args.input_data = check_input_reads( fwd_reads=args.reads1, rev_reads=args.reads2, single_reads=args.singles, orphan_reads=args.orphans, ) + else: + if bool(args.reference or args.aligner): + raise ValueError("--reference/--aligner parameters are only required for --fastq- input.") + + if (args.strand_specific and args.gene_counts): + raise NotImplementedError("External gene count input is not implemented for strand-specific counts.") + # if (args.reference or args.aligner) and not has_fastq: + # raise ValueError("--reference/--aligner parameters are only required for --fastq- input.") + + # if bool(args.reference and args.aligner) != has_fastq: + # raise ValueError("--fastq- input requires --reference and --aligner to be set.") + if args.restrict_metrics: restrict_metrics = set(args.restrict_metrics.split(",")) invalid = restrict_metrics.difference(('raw', 'lnorm', 'scaled', 'rpkm'))