From 216ed7e2390d20ceeabd843df783c43e2962be2d Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 12 Feb 2025 11:29:19 +0100
Subject: [PATCH 01/19] add skeleton for read count input

---
 gffquant/__main__.py                     | 42 +++++++++------
 gffquant/handle_args.py                  | 16 ++++--
 gffquant/profilers/feature_quantifier.py | 68 +++++++++++-------------
 3 files changed, 70 insertions(+), 56 deletions(-)

diff --git a/gffquant/__main__.py b/gffquant/__main__.py
index c5804587..f7682142 100644
--- a/gffquant/__main__.py
+++ b/gffquant/__main__.py
@@ -128,26 +128,34 @@ def main():
         **kwargs,
     )
 
-    if args.input_type == "fastq":
+    if args.input_type in ("fastq", "bam", "sam"):
 
-        stream_alignments(args, profiler)
+        if args.input_type == "fastq":
 
-    else:
+            stream_alignments(args, profiler)        
 
-        input_file = args.bam if args.input_type == "bam" else args.sam
-        debug_samfile = None
-        if profiler.debug:
-            debug_samfile = f"{profiler.out_prefix}.{args.input_type}.filtered.sam"
-
-        profiler.count_alignments(
-            sys.stdin if input_file == "-" else input_file,
-            aln_format=args.input_type,
-            min_identity=args.min_identity,
-            min_seqlen=args.min_seqlen,
-            external_readcounts=args.import_readcounts,
-            unmarked_orphans=args.unmarked_orphans,
-            debug_samfile=debug_samfile,
-        )
+        else:
+
+            input_file = args.bam if args.input_type == "bam" else args.sam
+            debug_samfile = None
+            if profiler.debug:
+                debug_samfile = f"{profiler.out_prefix}.{args.input_type}.filtered.sam"
+
+            profiler.count_alignments(
+                sys.stdin if input_file == "-" else input_file,
+                aln_format=args.input_type,
+                min_identity=args.min_identity,
+                min_seqlen=args.min_seqlen,
+                external_readcounts=args.import_readcounts,
+                unmarked_orphans=args.unmarked_orphans,
+                debug_samfile=debug_samfile,
+            )
+
+        profiler.report_alignments()
+    
+    else:
+        
+        profiler.load_gene_counts(args.gene_counts)
 
     profiler.finalise(
         restrict_reports=args.restrict_metrics,
diff --git a/gffquant/handle_args.py b/gffquant/handle_args.py
index 8be6a387..8c70561f 100644
--- a/gffquant/handle_args.py
+++ b/gffquant/handle_args.py
@@ -39,10 +39,14 @@ def validate_args(args):
         )
     )
 
-    if tuple(map(bool, (has_fastq, args.bam, args.sam))).count(True) != 1:
-        raise ValueError(f"Need exactly one type of input: bam={bool(args.bam)} sam={bool(args.sam)} fastq={bool(has_fastq)}.")
+    if tuple(map(bool, (has_fastq, args.bam, args.sam, args.gene_counts))).count(True) != 1:
+        raise ValueError(
+            "Need exactly one type of input: "
+            f"bam={bool(args.bam)} sam={bool(args.sam)} fastq={bool(has_fastq)} "
+            f"gene_counts={bool(args.gene_count)}."
+        )
 
-    args.input_type = "fastq" if has_fastq else ("bam" if args.bam else "sam")
+    args.input_type = "fastq" if has_fastq else ("bam" if args.bam else ("sam" if args.sam else "gene_counts"))
 
     if (args.reference or args.aligner) and not has_fastq:
         raise ValueError("--reference/--aligner are not needed with alignment input (bam, sam).")
@@ -189,6 +193,12 @@ def handle_args(args):
     #         Input from STDOUT can be used with '-'."""
     #     ),
     # )
+    ap.add_argument(
+        "--gene_counts",
+        type=str,
+        help="Path to a file containing a gene_counts matrix from a previous gffquant run."
+    )
+
     ap.add_argument(
         "--bam",
         type=str,
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index bd5c5c96..1a4ed63c 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -330,16 +330,8 @@ def count_alignments(
 
         self.aln_counter.update(aln_reader.get_alignment_stats_dict())
 
-    def finalise(
-        self,
-        restrict_reports=None,
-        report_category=False,
-        report_unannotated=False,
-        dump_counters=False,
-        in_memory=True,
-        gene_group_db=False,
-    ):
 
+    def report_alignments(self):
         with gzip.open(f"{self.out_prefix}.aln_stats.txt.gz", "wt") as aln_stats_out:
             print(
                 AlignmentProcessor.get_alignment_stats_str(
@@ -352,19 +344,41 @@ def finalise(
                 ),
                 file=aln_stats_out
             )
+        for metric, value in (
+                ("Input reads", "full_read_count"),
+                ("Aligned reads", "read_count"),
+                ("Alignments", "pysam_total"),
+                ("Reads passing filters", "filtered_read_count"),
+                ("Alignments passing filters", "pysam_passed"),
+                ("  - Discarded due to seqid", "pysam_seqid_filt"),
+                ("  - Discarded due to length", "pysam_len_filt"),
+                # ("Unannotated multimappers", "unannotated_ambig"),
+            ):
+                logger.info("%s: %s", metric, self.aln_counter.get(value))
+
+        logger.info(
+            "Alignment rate: %s%%, Filter pass rate: %s%%",
+            round(self.aln_counter["read_count"] / self.aln_counter["full_read_count"], 3) * 100,
+            round(self.aln_counter["filtered_read_count"] / self.aln_counter["full_read_count"], 3) * 100,
+        )
+
+    def load_gene_counts(self, gene_count_matrix):
+        self.aln_counter["aln_count"] = 1
+
+    def finalise(
+        self,
+        restrict_reports=None,
+        report_category=False,
+        report_unannotated=False,
+        dump_counters=False,
+        in_memory=True,
+        gene_group_db=False,
+    ):
 
         if self.aln_counter.get("aln_count"):
             if self.adm is None:
                 self.adm = AnnotationDatabaseManager.from_db(self.db, in_memory=in_memory)
 
-            report_args = {
-                "restrict_reports": restrict_reports,
-                "report_category": report_category,
-                "report_unannotated": report_unannotated,
-            }
-
-            # self.write_coverage()
-
             self.process_counters(
                 restrict_reports=restrict_reports,
                 report_category=report_category,
@@ -372,25 +386,7 @@ def finalise(
                 dump_counters=dump_counters,
                 in_memory=in_memory,
                 gene_group_db=gene_group_db,
-            )
-
-            for metric, value in (
-                ("Input reads", "full_read_count"),
-                ("Aligned reads", "read_count"),
-                ("Alignments", "pysam_total"),
-                ("Reads passing filters", "filtered_read_count"),
-                ("Alignments passing filters", "pysam_passed"),
-                ("  - Discarded due to seqid", "pysam_seqid_filt"),
-                ("  - Discarded due to length", "pysam_len_filt"),
-                # ("Unannotated multimappers", "unannotated_ambig"),
-            ):
-                logger.info("%s: %s", metric, self.aln_counter.get(value))
-
-            logger.info(
-                "Alignment rate: %s%%, Filter pass rate: %s%%",
-                round(self.aln_counter["read_count"] / self.aln_counter["full_read_count"], 3) * 100,
-                round(self.aln_counter["filtered_read_count"] / self.aln_counter["full_read_count"], 3) * 100,
-            )
+            )            
 
         self.adm.clear_caches()
         logger.info("Finished.")

From 5121120f6c513aeb466d4500ac90149c728c65d4 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 12 Feb 2025 11:32:55 +0100
Subject: [PATCH 02/19] version -> 2.19.0, move referencehit to own module

---
 gffquant/__init__.py                     |  2 +-
 gffquant/profilers/feature_quantifier.py | 34 +-----------------------
 2 files changed, 2 insertions(+), 34 deletions(-)

diff --git a/gffquant/__init__.py b/gffquant/__init__.py
index ad7efe99..31f4177f 100644
--- a/gffquant/__init__.py
+++ b/gffquant/__init__.py
@@ -5,7 +5,7 @@
 from enum import Enum, auto, unique
 
 
-__version__ = "2.18.2"
+__version__ = "2.19.0"
 __tool__ = "gffquant"
 
 
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 1a4ed63c..567f0f6b 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -13,6 +13,7 @@
 from dataclasses import dataclass, asdict
 
 from .panda_coverage_profiler import PandaCoverageProfiler
+from .reference_hit import ReferenceHit
 from ..alignment import AlignmentGroup, AlignmentProcessor, SamFlags
 from ..annotation import GeneCountAnnotator, RegionCountAnnotator, CountWriter
 from ..counters import CountManager
@@ -24,39 +25,6 @@
 logger = logging.getLogger(__name__)
 
 
-@dataclass(slots=True)
-class ReferenceHit:
-    rid: int = None
-    start: int = None
-    end: int = None
-    rev_strand: bool = None
-    cov_start: int = None
-    cov_end: int = None
-    has_annotation: bool = None
-    n_aln: int = None
-    is_ambiguous: bool = None
-    library_mod: int = None
-    mate_id: int = None
-
-    def __hash__(self):
-        return hash(tuple(asdict(self).values()))
-
-    def __eq__(self, other):
-        return all(
-            item[0][1] == item[1][1]
-            for item in zip(
-                sorted(asdict(self).items()),
-                sorted(asdict(other).items())
-            )
-        )
-
-    def __str__(self):
-        return "\t".join(map(str, asdict(self).values()))
-
-    def __repr__(self):
-        return str(self)
-
-
 class FeatureQuantifier(ABC):
     """
         Three groups of alignments:

From 75aceaa7419ba2571d1dc68dc6b000ab88347479 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 12 Feb 2025 14:59:46 +0100
Subject: [PATCH 03/19] version -> 2.19.0, move referencehit to own module

---
 gffquant/profilers/reference_hit.py | 36 +++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 gffquant/profilers/reference_hit.py

diff --git a/gffquant/profilers/reference_hit.py b/gffquant/profilers/reference_hit.py
new file mode 100644
index 00000000..5c1be314
--- /dev/null
+++ b/gffquant/profilers/reference_hit.py
@@ -0,0 +1,36 @@
+""" module docstring """
+
+from dataclasses import asdict, dataclass
+
+
+@dataclass(slots=True)
+class ReferenceHit:
+    rid: int = None
+    start: int = None
+    end: int = None
+    rev_strand: bool = None
+    cov_start: int = None
+    cov_end: int = None
+    has_annotation: bool = None
+    n_aln: int = None
+    is_ambiguous: bool = None
+    library_mod: int = None
+    mate_id: int = None
+
+    def __hash__(self):
+        return hash(tuple(asdict(self).values()))
+
+    def __eq__(self, other):
+        return all(
+            item[0][1] == item[1][1]
+            for item in zip(
+                sorted(asdict(self).items()),
+                sorted(asdict(other).items())
+            )
+        )
+
+    def __str__(self):
+        return "\t".join(map(str, asdict(self).values()))
+
+    def __repr__(self):
+        return str(self)
\ No newline at end of file

From c719aa617ddf78cdc57cd5e4797a12c2934c824c Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 12 Feb 2025 15:00:30 +0100
Subject: [PATCH 04/19] added functionality to add external genecounts to
 gene_annotator

---
 gffquant/__main__.py                     |  3 +-
 gffquant/annotation/count_annotator.py   | 30 ++++++++++++++++
 gffquant/counters/count_manager.py       | 20 +++++++++++
 gffquant/profilers/feature_quantifier.py | 45 ++++++++++++++++--------
 4 files changed, 83 insertions(+), 15 deletions(-)

diff --git a/gffquant/__main__.py b/gffquant/__main__.py
index f7682142..4133d1a1 100644
--- a/gffquant/__main__.py
+++ b/gffquant/__main__.py
@@ -155,7 +155,7 @@ def main():
     
     else:
         
-        profiler.load_gene_counts(args.gene_counts)
+        ...
 
     profiler.finalise(
         restrict_reports=args.restrict_metrics,
@@ -164,6 +164,7 @@ def main():
         dump_counters=args.debug,
         in_memory=args.db_in_memory,
         gene_group_db=args.gene_group_db,
+        external_gene_counts=args.gene_counts,
     )
 
 
diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py
index feb96b92..8506b7b4 100644
--- a/gffquant/annotation/count_annotator.py
+++ b/gffquant/annotation/count_annotator.py
@@ -2,6 +2,7 @@
 
 """ This module contains code for transforming gene counts to feature counts. """
 
+import csv
 import logging
 
 from itertools import chain
@@ -326,3 +327,32 @@ def annotate(self, refmgr, db, count_manager, gene_group_db=False):
                 self.unannotated_counts += counts[:4]
 
         self.calculate_scaling_factors()
+
+    def annotate_external(self, fn, db, gene_group_db=False):  # refmgr, db, count_manager, gene_group_db=False):
+
+        with open(fn, "rt", encoding="UTF-8") as _in:
+            for row in csv.DictReader(_in, delimiter="\t"):
+                # gene    uniq_raw        uniq_lnorm      uniq_scaled     combined_raw    combined_lnorm  combined_scaled
+                cols = row["uniq_raw"], row["uniq_lnorm"], row["combined_raw"], row["combined_lnorm"]
+                counts = tuple(map(float, cols))
+                ref = row["gene"]
+
+                if gene_group_db:
+                    ref_tokens = ref.split(".")
+                    gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
+                else:
+                    ggroup_id, gene_id = ref, ref
+
+                gcounts = self.gene_counts.setdefault(gene_id, np.zeros(self.bins))
+                gcounts += counts
+                self.total_gene_counts += counts[:4]
+
+                region_annotation = db.query_sequence(ggroup_id)
+                if region_annotation is not None:
+                    _, _, region_annotation = region_annotation
+                    self.distribute_feature_counts(counts, region_annotation)
+
+                else:
+                    self.unannotated_counts += counts[:4]
+
+        self.calculate_scaling_factors()
diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py
index 40ae72a6..7e7a8205 100644
--- a/gffquant/counters/count_manager.py
+++ b/gffquant/counters/count_manager.py
@@ -1,5 +1,7 @@
 """count_manager"""
 
+import csv
+
 from collections import Counter
 
 from .. import DistributionMode
@@ -155,6 +157,24 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False):
                 uniq_counts, ambig_counts = [uniq_counter[seqid]], [ambig_counter[seqid]]
 
             return uniq_counts, ambig_counts
+        
+    # def set_counts(self, seqid, value, which_counter):
+    #     counter = (self.uniq_seqcounts, self.ambig_seqcounts)[which_counter == "ambig"]
+    #     counter[seqid] = value
+
+    # def load_data(self, fn):
+    #     # gene    uniq_raw        uniq_lnorm      uniq_scaled     combined_raw    combined_lnorm  combined_scaled
+    #     with open(fn, "rt", encoding="UTF-8") as _in:
+    #         try:
+    #             header = next(_in)
+    #         except StopIteration:
+    #             header = ""
+    #         if header:
+    #             for row in csv.reader(_in, delimiter="\t"):
+    #                 gene, counts = row[0], tuple(map(float, row[1:]))
+    #                 self.set_counts()
+
+
 
     def get_regions(self, rid):
         return set(self.uniq_regioncounts.get(rid, set())).union(
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 567f0f6b..e95fe100 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -121,31 +121,44 @@ def process_counters(
         dump_counters=True,
         in_memory=True,
         gene_group_db=False,
+        external_gene_counts=None,
     ):
         if self.adm is None:
             self.adm = AnnotationDatabaseManager.from_db(self.db, in_memory=in_memory)
 
-        if dump_counters:
+        if dump_counters and not external_gene_counts:
             self.count_manager.dump_raw_counters(self.out_prefix, self.reference_manager)
 
         report_scaling_factors = restrict_reports is None or "scaled" in restrict_reports
 
         Annotator = (GeneCountAnnotator, RegionCountAnnotator)[self.run_mode.overlap_required]
         count_annotator = Annotator(self.strand_specific, report_scaling_factors=report_scaling_factors)
-        count_annotator.annotate(self.reference_manager, self.adm, self.count_manager, gene_group_db=gene_group_db,)
+        
+        if external_gene_counts:
+            count_annotator.annotate_external(external_gene_counts)
+            total_readcount = 0
+            filtered_readcount = 0
+            has_ambig_counts = True
+        else:
+            count_annotator.annotate(self.reference_manager, self.adm, self.count_manager, gene_group_db=gene_group_db,)
+            total_readcount = self.aln_counter["read_count"]
+            filtered_readcount = self.aln_counter["filtered_read_count"]
+            has_ambig_counts = self.count_manager.has_ambig_counts()
 
         count_writer = CountWriter(
             self.out_prefix,
-            has_ambig_counts=self.count_manager.has_ambig_counts(),
+            has_ambig_counts=has_ambig_counts,
             strand_specific=self.strand_specific,
             restrict_reports=restrict_reports,
             report_category=report_category,
-            total_readcount=self.aln_counter["read_count"],
-            filtered_readcount=self.aln_counter["filtered_read_count"],
+            total_readcount=total_readcount,
+            filtered_readcount=filtered_readcount,
         )
 
-        unannotated_reads = self.count_manager.get_unannotated_reads()
-        unannotated_reads += self.aln_counter["unannotated_ambig"]
+        unannotated_reads = 0
+        if not external_gene_counts:
+            unannotated_reads += self.count_manager.get_unannotated_reads()
+            unannotated_reads += self.aln_counter["unannotated_ambig"]
 
         count_writer.write_feature_counts(
             self.adm,
@@ -153,11 +166,12 @@ def process_counters(
             (None, unannotated_reads)[report_unannotated],
         )
 
-        count_writer.write_gene_counts(
-            count_annotator.gene_counts,
-            count_annotator.scaling_factors["total_gene_uniq"],
-            count_annotator.scaling_factors["total_gene_ambi"]
-        )
+        if not external_gene_counts:
+            count_writer.write_gene_counts(
+                count_annotator.gene_counts,
+                count_annotator.scaling_factors["total_gene_uniq"],
+                count_annotator.scaling_factors["total_gene_ambi"]
+            )
 
         self.adm.clear_caches()
 
@@ -330,8 +344,9 @@ def report_alignments(self):
             round(self.aln_counter["filtered_read_count"] / self.aln_counter["full_read_count"], 3) * 100,
         )
 
-    def load_gene_counts(self, gene_count_matrix):
-        self.aln_counter["aln_count"] = 1
+    # def load_gene_counts(self, gene_count_matrix):
+    #     self.aln_counter["aln_count"] = 1
+    #     self.count_manager.load_data(gene_count_matrix)
 
     def finalise(
         self,
@@ -341,6 +356,7 @@ def finalise(
         dump_counters=False,
         in_memory=True,
         gene_group_db=False,
+        external_gene_counts=None,
     ):
 
         if self.aln_counter.get("aln_count"):
@@ -354,6 +370,7 @@ def finalise(
                 dump_counters=dump_counters,
                 in_memory=in_memory,
                 gene_group_db=gene_group_db,
+                external_gene_counts=external_gene_counts,
             )            
 
         self.adm.clear_caches()

From 68aa9901153810c99921c9e6086e9a824e4cc967 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 12 Feb 2025 15:11:46 +0100
Subject: [PATCH 05/19] aln stats are now json dumps

---
 gffquant/profilers/feature_quantifier.py | 26 +++++++++++++-----------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index e95fe100..bb29e721 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -314,18 +314,20 @@ def count_alignments(
 
 
     def report_alignments(self):
-        with gzip.open(f"{self.out_prefix}.aln_stats.txt.gz", "wt") as aln_stats_out:
-            print(
-                AlignmentProcessor.get_alignment_stats_str(
-                    [
-                        v
-                        for k, v in self.aln_counter.items()
-                        if k.startswith("pysam_") and not k.endswith("total")
-                    ],
-                    table=True,
-                ),
-                file=aln_stats_out
-            )
+        with open(f"{self.out_prefix}.aln_stats.json", "wb") as aln_stats_out:
+            json.dump(self.aln_counter, aln_stats_out)
+            # print(
+            #     AlignmentProcessor.get_alignment_stats_str(
+            #         [
+            #             v
+            #             for k, v in self.aln_counter.items()
+            #             if k.startswith("pysam_") and not k.endswith("total")
+            #         ],
+            #         table=True,
+            #     ),
+            #     file=aln_stats_out
+            # )
+
         for metric, value in (
                 ("Input reads", "full_read_count"),
                 ("Aligned reads", "read_count"),

From c6e2eff927f424380175e74d2ea2c507bd428ab0 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 12 Feb 2025 16:34:10 +0100
Subject: [PATCH 06/19] minor.

---
 gffquant/profilers/feature_quantifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index bb29e721..f05e2ce8 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -361,7 +361,7 @@ def finalise(
         external_gene_counts=None,
     ):
 
-        if self.aln_counter.get("aln_count"):
+        if self.aln_counter.get("aln_count") or external_gene_counts:
             if self.adm is None:
                 self.adm = AnnotationDatabaseManager.from_db(self.db, in_memory=in_memory)
 

From fc10c9fd6318d0960b51430058bc673ecb12d19a Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 12 Feb 2025 16:36:46 +0100
Subject: [PATCH 07/19] minor.

---
 gffquant/profilers/feature_quantifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index f05e2ce8..99300897 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -135,7 +135,7 @@ def process_counters(
         count_annotator = Annotator(self.strand_specific, report_scaling_factors=report_scaling_factors)
         
         if external_gene_counts:
-            count_annotator.annotate_external(external_gene_counts)
+            count_annotator.annotate_external(external_gene_counts, self.adm, gene_group_db=gene_group_db,)
             total_readcount = 0
             filtered_readcount = 0
             has_ambig_counts = True

From b9755191d43106b36f3beaef9483f91ded976bee Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 12 Feb 2025 16:43:13 +0100
Subject: [PATCH 08/19] minor.

---
 gffquant/profilers/feature_quantifier.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 99300897..b499e12b 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -136,8 +136,8 @@ def process_counters(
         
         if external_gene_counts:
             count_annotator.annotate_external(external_gene_counts, self.adm, gene_group_db=gene_group_db,)
-            total_readcount = 0
-            filtered_readcount = 0
+            total_readcount = 1
+            filtered_readcount = 1
             has_ambig_counts = True
         else:
             count_annotator.annotate(self.reference_manager, self.adm, self.count_manager, gene_group_db=gene_group_db,)

From 161aae220ddfe4ae18e4d656573a9a2b05a41541 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 11 May 2026 21:51:36 +0200
Subject: [PATCH 09/19] fix typo

---
 gffquant/handle_args.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/handle_args.py b/gffquant/handle_args.py
index 8c70561f..50a96357 100644
--- a/gffquant/handle_args.py
+++ b/gffquant/handle_args.py
@@ -43,7 +43,7 @@ def validate_args(args):
         raise ValueError(
             "Need exactly one type of input: "
             f"bam={bool(args.bam)} sam={bool(args.sam)} fastq={bool(has_fastq)} "
-            f"gene_counts={bool(args.gene_count)}."
+            f"gene_counts={bool(args.gene_counts)}."
         )
 
     args.input_type = "fastq" if has_fastq else ("bam" if args.bam else ("sam" if args.sam else "gene_counts"))

From 587155d5486d237c8822ebb28920bcfabaec3bfc Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 11 May 2026 21:54:46 +0200
Subject: [PATCH 10/19] fix ReferenceHit comparison

---
 gffquant/profilers/reference_hit.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/gffquant/profilers/reference_hit.py b/gffquant/profilers/reference_hit.py
index 5c1be314..4cb30f45 100644
--- a/gffquant/profilers/reference_hit.py
+++ b/gffquant/profilers/reference_hit.py
@@ -21,13 +21,9 @@ def __hash__(self):
         return hash(tuple(asdict(self).values()))
 
     def __eq__(self, other):
-        return all(
-            item[0][1] == item[1][1]
-            for item in zip(
-                sorted(asdict(self).items()),
-                sorted(asdict(other).items())
-            )
-        )
+        if not isinstance(other, ReferenceHit):
+            raise NotImplementedError(f"{other} is not a valid ReferenceHit object.")
+        return sorted(asdict(self).items()) == sorted(asdict(other).items())
 
     def __str__(self):
         return "\t".join(map(str, asdict(self).values()))

From c2043ce38c91c83856b6cdf537f31edb206d5efc Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 11 May 2026 21:58:33 +0200
Subject: [PATCH 11/19] fix alignment rate report error on zero alignments

---
 gffquant/profilers/feature_quantifier.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 280d81cf..81729a61 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -340,10 +340,16 @@ def report_alignments(self):
             ):
                 logger.info("%s: %s", metric, self.aln_counter.get(value))
 
+        if self.aln_counter["full_read_count"]:
+            alignment_rate = round(self.aln_counter["read_count"] / self.aln_counter["full_read_count"], 3) * 100,
+            filter_pass_rate = round(self.aln_counter["filtered_read_count"] / self.aln_counter["full_read_count"], 3) * 100,
+        else:
+            alignment_rate, filter_pass_rate = None, None
+
         logger.info(
-            "Alignment rate: %s%%, Filter pass rate: %s%%",
-            round(self.aln_counter["read_count"] / self.aln_counter["full_read_count"], 3) * 100,
-            round(self.aln_counter["filtered_read_count"] / self.aln_counter["full_read_count"], 3) * 100,
+            "Alignment rate: %s%%, Filter pass rate: %s%%" % (
+                alignment_rate, filter_pass_rate,
+            )
         )
 
     # def load_gene_counts(self, gene_count_matrix):

From 5fdcb27e7e803325562f491d6fc4bcbe0249a936 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 11 May 2026 21:59:03 +0200
Subject: [PATCH 12/19] fix hanging indent

---
 gffquant/profilers/feature_quantifier.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 81729a61..66bcb5cc 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -329,16 +329,16 @@ def report_alignments(self):
             # )
 
         for metric, value in (
-                ("Input reads", "full_read_count"),
-                ("Aligned reads", "read_count"),
-                ("Alignments", "pysam_total"),
-                ("Reads passing filters", "filtered_read_count"),
-                ("Alignments passing filters", "pysam_passed"),
-                ("  - Discarded due to seqid", "pysam_seqid_filt"),
-                ("  - Discarded due to length", "pysam_len_filt"),
-                # ("Unannotated multimappers", "unannotated_ambig"),
-            ):
-                logger.info("%s: %s", metric, self.aln_counter.get(value))
+            ("Input reads", "full_read_count"),
+            ("Aligned reads", "read_count"),
+            ("Alignments", "pysam_total"),
+            ("Reads passing filters", "filtered_read_count"),
+            ("Alignments passing filters", "pysam_passed"),
+            ("  - Discarded due to seqid", "pysam_seqid_filt"),
+            ("  - Discarded due to length", "pysam_len_filt"),
+            # ("Unannotated multimappers", "unannotated_ambig"),
+        ):
+            logger.info("%s: %s", metric, self.aln_counter.get(value))
 
         if self.aln_counter["full_read_count"]:
             alignment_rate = round(self.aln_counter["read_count"] / self.aln_counter["full_read_count"], 3) * 100,

From 11e8a8406823b20d096e26d7e4d0bafd3ec76d84 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 11 May 2026 21:59:26 +0200
Subject: [PATCH 13/19] fix aln_stats output

---
 gffquant/profilers/feature_quantifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 66bcb5cc..e1a862e9 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -314,7 +314,7 @@ def count_alignments(
 
 
     def report_alignments(self):
-        with open(f"{self.out_prefix}.aln_stats.json", "wb") as aln_stats_out:
+        with open(f"{self.out_prefix}.aln_stats.json", "wt") as aln_stats_out:
             json.dump(self.aln_counter, aln_stats_out)
             # print(
             #     AlignmentProcessor.get_alignment_stats_str(

From e19ba121b086f80a74caddafc0437156db1ff021 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 11 May 2026 22:13:13 +0200
Subject: [PATCH 14/19] fix potential crash when region counting operates on
 external gene counts

---
 gffquant/annotation/count_annotator.py   | 3 +++
 gffquant/profilers/feature_quantifier.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py
index 8506b7b4..b9db3e3c 100644
--- a/gffquant/annotation/count_annotator.py
+++ b/gffquant/annotation/count_annotator.py
@@ -198,6 +198,9 @@ class RegionCountAnnotator(CountAnnotator):
     def __init__(self, strand_specific, report_scaling_factors=True):
         CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
 
+    def annotate_external(self, fn, db, gene_group_db=False):
+        raise NotImplementedError()
+
     # pylint: disable=R0914,W0613
     def annotate(self, refmgr, db, count_manager, gene_group_db=False):
         """
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index e1a862e9..b83c5ba5 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -131,7 +131,7 @@ def process_counters(
 
         report_scaling_factors = restrict_reports is None or "scaled" in restrict_reports
 
-        Annotator = (GeneCountAnnotator, RegionCountAnnotator)[self.run_mode.overlap_required]
+        Annotator = (GeneCountAnnotator, RegionCountAnnotator)[self.run_mode.overlap_required and not external_gene_counts]
         count_annotator = Annotator(self.strand_specific, report_scaling_factors=report_scaling_factors)
         
         if external_gene_counts:

From 149349f3e2941d28084c3f9cf1271c03b095c31f Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 11 May 2026 22:13:48 +0200
Subject: [PATCH 15/19] removed unused csv import

---
 gffquant/counters/count_manager.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py
index 7e7a8205..0e89b61c 100644
--- a/gffquant/counters/count_manager.py
+++ b/gffquant/counters/count_manager.py
@@ -1,7 +1,5 @@
 """count_manager"""
 
-import csv
-
 from collections import Counter
 
 from .. import DistributionMode

From 0d0ab7c58503c93fed55dec371f0338d764f27bc Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 11 May 2026 22:49:31 +0200
Subject: [PATCH 16/19] minor refactor

---
 gffquant/annotation/count_annotator.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py
index b9db3e3c..16d5425b 100644
--- a/gffquant/annotation/count_annotator.py
+++ b/gffquant/annotation/count_annotator.py
@@ -341,8 +341,10 @@ def annotate_external(self, fn, db, gene_group_db=False):  # refmgr, db, count_m
                 ref = row["gene"]
 
                 if gene_group_db:
-                    ref_tokens = ref.split(".")
-                    gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
+                    # ref_tokens = ref.split(".")
+                    p = ref.rfind(".")
+                    # gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
+                    gene_id, ggroup_id = ref[:p], ref[p + 1:]
                 else:
                     ggroup_id, gene_id = ref, ref
 

From 630097010bb6e1700b2063d8abf3381844ed6bef Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 11 May 2026 22:49:48 +0200
Subject: [PATCH 17/19] prevent strand-specific to be used together with
 gene_count input

---
 gffquant/handle_args.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gffquant/handle_args.py b/gffquant/handle_args.py
index 50a96357..92385003 100644
--- a/gffquant/handle_args.py
+++ b/gffquant/handle_args.py
@@ -53,6 +53,9 @@ def validate_args(args):
     if bool(args.reference and args.aligner) != has_fastq:
         raise ValueError("--fastq requires --reference and --aligner to be set.")
 
+    if (args.strand_specific and args.gene_counts):
+        raise NotImplementedError("External gene count input is not implemented for strand-specific counts.")
+
     if args.input_type == "fastq":
         args.input_data = check_input_reads(
             fwd_reads=args.reads1, rev_reads=args.reads2,

From 7090337cc15e7eb9ff396d0baf93b97727fcdfb1 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 12 May 2026 13:59:10 +0200
Subject: [PATCH 18/19] fix error message

---
 gffquant/handle_args.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/handle_args.py b/gffquant/handle_args.py
index 92385003..ebff1f94 100644
--- a/gffquant/handle_args.py
+++ b/gffquant/handle_args.py
@@ -49,7 +49,7 @@ def validate_args(args):
     args.input_type = "fastq" if has_fastq else ("bam" if args.bam else ("sam" if args.sam else "gene_counts"))
 
     if (args.reference or args.aligner) and not has_fastq:
-        raise ValueError("--reference/--aligner are not needed with alignment input (bam, sam).")
+        raise ValueError("--reference/--aligner parameters are only required for --fastq-<readtype> input.")
     if bool(args.reference and args.aligner) != has_fastq:
         raise ValueError("--fastq requires --reference and --aligner to be set.")
 

From f229f2e1305849a095274a783be0dd34cee5f919 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 12 May 2026 14:16:51 +0200
Subject: [PATCH 19/19] adjusting input argument logic

---
 gffquant/handle_args.py | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/gffquant/handle_args.py b/gffquant/handle_args.py
index ebff1f94..fe64d9b8 100644
--- a/gffquant/handle_args.py
+++ b/gffquant/handle_args.py
@@ -27,9 +27,7 @@ def validate_args(args):
 
     if not all(os.path.isfile(f) for f in db_files):
         raise ValueError(f"Cannot find annotation db at `{args.annotation_db}`.")
-    if (args.aligner == "bwa" and not check_bwa_index(args.reference)) or (args.aligner == "minimap" and not check_minimap2_index(args.reference)):
-        raise ValueError(f"Cannot find reference index at `{args.reference}`.")
-
+    
     has_fastq = any(
         map(
             lambda x: x is not None,
@@ -48,20 +46,30 @@ def validate_args(args):
 
     args.input_type = "fastq" if has_fastq else ("bam" if args.bam else ("sam" if args.sam else "gene_counts"))
 
-    if (args.reference or args.aligner) and not has_fastq:
-        raise ValueError("--reference/--aligner parameters are only required for --fastq-<readtype> input.")
-    if bool(args.reference and args.aligner) != has_fastq:
-        raise ValueError("--fastq requires --reference and --aligner to be set.")
+    if has_fastq:
+        if not bool(args.reference and args.aligner):
+            raise ValueError("--fastq-<readtype> input requires --reference and --aligner to be set.")
 
-    if (args.strand_specific and args.gene_counts):
-        raise NotImplementedError("External gene count input is not implemented for strand-specific counts.")
+        if (args.aligner == "bwa" and not check_bwa_index(args.reference)) or (args.aligner == "minimap" and not check_minimap2_index(args.reference)):
+            raise ValueError(f"Cannot find `${args.aligner}` reference index at `{args.reference}`.")
 
-    if args.input_type == "fastq":
         args.input_data = check_input_reads(
             fwd_reads=args.reads1, rev_reads=args.reads2,
             single_reads=args.singles, orphan_reads=args.orphans,
         )
+    else:
+        if bool(args.reference or args.aligner):
+            raise ValueError("--reference/--aligner parameters are only required for --fastq-<readtype> input.")
+        
+    if (args.strand_specific and args.gene_counts):
+        raise NotImplementedError("External gene count input is not implemented for strand-specific counts.")
 
+    # if (args.reference or args.aligner) and not has_fastq:
+    #     raise ValueError("--reference/--aligner parameters are only required for --fastq-<readtype> input.")
+    
+    # if bool(args.reference and args.aligner) != has_fastq:
+    #     raise ValueError("--fastq-<readtype> input requires --reference and --aligner to be set.")
+    
     if args.restrict_metrics:
         restrict_metrics = set(args.restrict_metrics.split(","))
         invalid = restrict_metrics.difference(('raw', 'lnorm', 'scaled', 'rpkm'))