cschu · cschu · Feb 12, 2025 · Feb 12, 2025 · Feb 12, 2025 · Feb 12, 2025
diff --git a/gffquant/__init__.py b/gffquant/__init__.py
@@ -5,7 +5,7 @@
 from enum import Enum, auto, unique
 
 
-__version__ = "2.18.5"
+__version__ = "2.19.0"
 __tool__ = "gffquant"
 
 

diff --git a/gffquant/__main__.py b/gffquant/__main__.py
@@ -128,26 +128,34 @@ def main():
         **kwargs,
     )
 
-    if args.input_type == "fastq":
+    if args.input_type in ("fastq", "bam", "sam"):
 
-        stream_alignments(args, profiler)
+        if args.input_type == "fastq":
 
-    else:
+            stream_alignments(args, profiler)        
 
-        input_file = args.bam if args.input_type == "bam" else args.sam
-        debug_samfile = None
-        if profiler.debug:
-            debug_samfile = f"{profiler.out_prefix}.{args.input_type}.filtered.sam"
-
-        profiler.count_alignments(
-            sys.stdin if input_file == "-" else input_file,
-            aln_format=args.input_type,
-            min_identity=args.min_identity,
-            min_seqlen=args.min_seqlen,
-            external_readcounts=args.import_readcounts,
-            unmarked_orphans=args.unmarked_orphans,
-            debug_samfile=debug_samfile,
-        )
+        else:
+
+            input_file = args.bam if args.input_type == "bam" else args.sam
+            debug_samfile = None
+            if profiler.debug:
+                debug_samfile = f"{profiler.out_prefix}.{args.input_type}.filtered.sam"
+
+            profiler.count_alignments(
+                sys.stdin if input_file == "-" else input_file,
+                aln_format=args.input_type,
+                min_identity=args.min_identity,
+                min_seqlen=args.min_seqlen,
+                external_readcounts=args.import_readcounts,
+                unmarked_orphans=args.unmarked_orphans,
+                debug_samfile=debug_samfile,
+            )
+
+        profiler.report_alignments()
+
+    else:
+
+        ...
 
     profiler.finalise(
         restrict_reports=args.restrict_metrics,
@@ -156,6 +164,7 @@ def main():
         dump_counters=args.debug,
         in_memory=args.db_in_memory,
         gene_group_db=args.gene_group_db,
+        external_gene_counts=args.gene_counts,
     )
 
 

diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py
@@ -2,6 +2,7 @@
 
 """ This module contains code for transforming gene counts to feature counts. """
 
+import csv
 import logging
 
 from itertools import chain
@@ -197,6 +198,9 @@ class RegionCountAnnotator(CountAnnotator):
     def __init__(self, strand_specific, report_scaling_factors=True):
         CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
 
+    def annotate_external(self, fn, db, gene_group_db=False):
+        raise NotImplementedError()
+
     # pylint: disable=R0914,W0613
     def annotate(self, refmgr, db, count_manager, gene_group_db=False):
         """
@@ -326,3 +330,34 @@ def annotate(self, refmgr, db, count_manager, gene_group_db=False):
                 self.unannotated_counts += counts[:4]
 
         self.calculate_scaling_factors()
+
+    def annotate_external(self, fn, db, gene_group_db=False):  # refmgr, db, count_manager, gene_group_db=False):
+
+        with open(fn, "rt", encoding="UTF-8") as _in:
+            for row in csv.DictReader(_in, delimiter="\t"):
+                # gene    uniq_raw        uniq_lnorm      uniq_scaled     combined_raw    combined_lnorm  combined_scaled
+                cols = row["uniq_raw"], row["uniq_lnorm"], row["combined_raw"], row["combined_lnorm"]
+                counts = tuple(map(float, cols))
+                ref = row["gene"]
+
+                if gene_group_db:
+                    # ref_tokens = ref.split(".")
+                    p = ref.rfind(".")
+                    # gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
+                    gene_id, ggroup_id = ref[:p], ref[p + 1:]
+                else:
+                    ggroup_id, gene_id = ref, ref
+
+                gcounts = self.gene_counts.setdefault(gene_id, np.zeros(self.bins))
+                gcounts += counts
-        with open(fn, "rt", encoding="UTF-8") as _in:
-            for row in csv.DictReader(_in, delimiter="\t"):
-                # gene    uniq_raw        uniq_lnorm      uniq_scaled     combined_raw    combined_lnorm  combined_scaled
-                cols = row["uniq_raw"], row["uniq_lnorm"], row["combined_raw"], row["combined_lnorm"]
-                counts = tuple(map(float, cols))
-                ref = row["gene"]
-
-                if gene_group_db:
-                    ref_tokens = ref.split(".")
-                    gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
-                else:
-                    ggroup_id, gene_id = ref, ref
-
-                gcounts = self.gene_counts.setdefault(gene_id, np.zeros(self.bins))
-                gcounts += counts
+    if self.bins != 4:
+        raise ValueError(
+            "--gene_counts currently supports only non-strand-specific count matrices."
+        )
+
+    with open(fn, "rt", encoding="UTF-8") as _in:
+        for row in csv.DictReader(_in, delimiter="\t"):
+            # gene    uniq_raw        uniq_lnorm      uniq_scaled     combined_raw    combined_lnorm  combined_scaled
+            cols = row["uniq_raw"], row["uniq_lnorm"], row["combined_raw"], row["combined_lnorm"]
+            counts = tuple(map(float, cols))
+            ref = row["gene"]
+
+            if gene_group_db:
+                ref_tokens = ref.split(".")
+                gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
+            else:
+                ggroup_id, gene_id = ref, ref
+
+            gcounts = self.gene_counts.setdefault(gene_id, np.zeros(self.bins))
+            gcounts += counts
-        with open(fn, "rt", encoding="UTF-8") as _in:
-            for row in csv.DictReader(_in, delimiter="\t"):
-                # gene    uniq_raw        uniq_lnorm      uniq_scaled     combined_raw    combined_lnorm  combined_scaled
-                cols = row["uniq_raw"], row["uniq_lnorm"], row["combined_raw"], row["combined_lnorm"]
-                counts = tuple(map(float, cols))
-                ref = row["gene"]
-
-                if gene_group_db:
-                    ref_tokens = ref.split(".")
-                    gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
-                else:
-                    ggroup_id, gene_id = ref, ref
-
-                gcounts = self.gene_counts.setdefault(gene_id, np.zeros(self.bins))
-                gcounts += counts
+    if self.bins != 4:
+        raise ValueError(
+            "--gene_counts currently supports only non-strand-specific count matrices."
+        )
+
+    with open(fn, "rt", encoding="UTF-8") as _in:
+        for row in csv.DictReader(_in, delimiter="\t"):
+            # gene    uniq_raw        uniq_lnorm      uniq_scaled     combined_raw    combined_lnorm  combined_scaled
+            cols = row["uniq_raw"], row["uniq_lnorm"], row["combined_raw"], row["combined_lnorm"]
+            counts = tuple(map(float, cols))
+            ref = row["gene"]
+
+            if gene_group_db:
+                ref_tokens = ref.split(".")
+                gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
+            else:
+                ggroup_id, gene_id = ref, ref
+
+            gcounts = self.gene_counts.setdefault(gene_id, np.zeros(self.bins))
+            gcounts += counts
+                self.total_gene_counts += counts[:4]
+
+                region_annotation = db.query_sequence(ggroup_id)
+                if region_annotation is not None:
+                    _, _, region_annotation = region_annotation
+                    self.distribute_feature_counts(counts, region_annotation)
+
+                else:
+                    self.unannotated_counts += counts[:4]
+
+        self.calculate_scaling_factors()
diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py
@@ -155,6 +155,24 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False):
                 uniq_counts, ambig_counts = [uniq_counter[seqid]], [ambig_counter[seqid]]
 
             return uniq_counts, ambig_counts
+
+    # def set_counts(self, seqid, value, which_counter):
+    #     counter = (self.uniq_seqcounts, self.ambig_seqcounts)[which_counter == "ambig"]
+    #     counter[seqid] = value
+
+    # def load_data(self, fn):
+    #     # gene    uniq_raw        uniq_lnorm      uniq_scaled     combined_raw    combined_lnorm  combined_scaled
+    #     with open(fn, "rt", encoding="UTF-8") as _in:
+    #         try:
+    #             header = next(_in)
+    #         except StopIteration:
+    #             header = ""
+    #         if header:
+    #             for row in csv.reader(_in, delimiter="\t"):
+    #                 gene, counts = row[0], tuple(map(float, row[1:]))
+    #                 self.set_counts()
+
+
 
     def get_regions(self, rid):
         return set(self.uniq_regioncounts.get(rid, set())).union(

diff --git a/gffquant/handle_args.py b/gffquant/handle_args.py
@@ -27,9 +27,7 @@ def validate_args(args):
 
     if not all(os.path.isfile(f) for f in db_files):
         raise ValueError(f"Cannot find annotation db at `{args.annotation_db}`.")
-    if (args.aligner == "bwa" and not check_bwa_index(args.reference)) or (args.aligner == "minimap" and not check_minimap2_index(args.reference)):
-        raise ValueError(f"Cannot find reference index at `{args.reference}`.")
-
+
     has_fastq = any(
         map(
             lambda x: x is not None,
@@ -39,22 +37,39 @@ def validate_args(args):
         )
     )
 
-    if tuple(map(bool, (has_fastq, args.bam, args.sam))).count(True) != 1:
-        raise ValueError(f"Need exactly one type of input: bam={bool(args.bam)} sam={bool(args.sam)} fastq={bool(has_fastq)}.")
+    if tuple(map(bool, (has_fastq, args.bam, args.sam, args.gene_counts))).count(True) != 1:
+        raise ValueError(
+            "Need exactly one type of input: "
+            f"bam={bool(args.bam)} sam={bool(args.sam)} fastq={bool(has_fastq)} "
+            f"gene_counts={bool(args.gene_counts)}."
+        )
+
+    args.input_type = "fastq" if has_fastq else ("bam" if args.bam else ("sam" if args.sam else "gene_counts"))
 
-    args.input_type = "fastq" if has_fastq else ("bam" if args.bam else "sam")
+    if has_fastq:
+        if not bool(args.reference and args.aligner):
+            raise ValueError("--fastq-<readtype> input requires --reference and --aligner to be set.")
 
-    if (args.reference or args.aligner) and not has_fastq:
-        raise ValueError("--reference/--aligner are not needed with alignment input (bam, sam).")
-    if bool(args.reference and args.aligner) != has_fastq:
-        raise ValueError("--fastq requires --reference and --aligner to be set.")
+        if (args.aligner == "bwa" and not check_bwa_index(args.reference)) or (args.aligner == "minimap" and not check_minimap2_index(args.reference)):
+            raise ValueError(f"Cannot find `${args.aligner}` reference index at `{args.reference}`.")
 
-    if args.input_type == "fastq":
         args.input_data = check_input_reads(
             fwd_reads=args.reads1, rev_reads=args.reads2,
             single_reads=args.singles, orphan_reads=args.orphans,
         )
-
+    else:
+        if bool(args.reference or args.aligner):
+            raise ValueError("--reference/--aligner parameters are only required for --fastq-<readtype> input.")
+
+    if (args.strand_specific and args.gene_counts):
+        raise NotImplementedError("External gene count input is not implemented for strand-specific counts.")
+
+    # if (args.reference or args.aligner) and not has_fastq:
+    #     raise ValueError("--reference/--aligner parameters are only required for --fastq-<readtype> input.")
+
+    # if bool(args.reference and args.aligner) != has_fastq:
+    #     raise ValueError("--fastq-<readtype> input requires --reference and --aligner to be set.")
+
     if args.restrict_metrics:
         restrict_metrics = set(args.restrict_metrics.split(","))
         invalid = restrict_metrics.difference(('raw', 'lnorm', 'scaled', 'rpkm'))
@@ -189,6 +204,12 @@ def handle_args(args):
     #         Input from STDOUT can be used with '-'."""
     #     ),
     # )
+    ap.add_argument(
+        "--gene_counts",
+        type=str,
+        help="Path to a file containing a gene_counts matrix from a previous gffquant run."
+    )
+
     ap.add_argument(
         "--bam",
         type=str,