diff --git a/src/modules/overrepresented_seqs.rs b/src/modules/overrepresented_seqs.rs index 2f718bb..c6b2f0d 100644 --- a/src/modules/overrepresented_seqs.rs +++ b/src/modules/overrepresented_seqs.rs @@ -416,17 +416,18 @@ impl QCModule for OverRepresentedSeqs { Some(hit) => hit.to_string(), None => "No Hit".to_string(), }; - // The Java ResultsTable.getValueAt() for percentage does - // JAVA COMPAT: Math.round(percentage * 100.0) / 100.0, rounding to 2 decimal places. - // The text report then calls String.valueOf() on the Double, producing - // Java's Double.toString() format. - let rounded_pct = (s.percentage * 100.0).round() / 100.0; + // JAVA COMPAT: Java's OverRepresentedSeqs.OverrepresentedSeq stores + // the raw double percentage without rounding (see OverRepresentedSeqs.java:253), + // and AbstractQCModule.writeTable serializes it via String.valueOf(getValueAt(...)) + // (AbstractQCModule.java:159), which returns Java's Double.toString() of the + // unrounded value (e.g. "7.160449112640348"). Pass the raw percentage to the + // formatter; do not round to 2 decimals. writeln!( writer, "{}\t{}\t{}\t{}", s.seq, s.count, - java_format_double(rounded_pct), + java_format_double(s.percentage), source )?; } diff --git a/tests/data/gen_realistic.py b/tests/data/gen_realistic.py new file mode 100755 index 0000000..167ef8b --- /dev/null +++ b/tests/data/gen_realistic.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +"""Deterministically generate a realistic test FASTQ for fastqc-rust equivalence tests. + +Produces 1009 reads of 50bp each at uniform Phred 40, with 5 deliberately +overrepresented sequences at non-round percentages. Background reads are +pseudo-random (fixed seed) and below the 0.1% overrepresented threshold. + +Designed to expose the percentage-precision bug fixed by ewels/FastQC-Rust#2. + +Output is written gzipped (~20 KB vs ~120 KB plain) to keep the repo lean. 
+The gzip stream uses mtime=0 and the deterministic content above so byte- +identical regeneration is possible across machines. +""" +import gzip +import random +import sys + +random.seed(20260426) + +QUAL = "I" * 50 # 'I' (ASCII 73): Phred 40 as Phred+33, but FastQC detects Illumina 1.5 (Phred+64) and reports Q9 +LEN = 50 +TOTAL = 1009 + +OVERREP = [ + ("OVERREP_A_HIGH", 73), # 73/1009 = 7.23488602...% + ("OVERREP_B_MID", 37), # 37/1009 = 3.66699702...% + ("OVERREP_C_LOW", 11), # 11/1009 = 1.09018830...% + ("OVERREP_D_TINY", 5), # 5/1009 = 0.49554014...% + ("OVERREP_E_EDGE", 2), # 2/1009 = 0.19821605...% (just above 0.1% threshold) +] + +def random_seq(rng): + return "".join(rng.choice("ACGT") for _ in range(LEN)) + +# Build the five overrepresented sequences. Use deterministic RNG, but reject +# any candidate that collides with a previous sequence to keep counts exact. +overrep_seqs = [] +seen = set() +seq_rng = random.Random(20260426) +for label, _count in OVERREP: + while True: + s = random_seq(seq_rng) + if s not in seen: + overrep_seqs.append(s) + seen.add(s) + break + +# Build background reads. Each must be unique AND not collide with any +# overrepresented sequence (otherwise the percentages drift). +n_overrep = sum(c for _, c in OVERREP) +n_background = TOTAL - n_overrep +assert n_background > 0 +background_seqs = [] +while len(background_seqs) < n_background: + s = random_seq(seq_rng) + if s in seen: + continue + seen.add(s) + background_seqs.append(s) + +# Assemble reads in a deterministic grouped order; the shuffle below interleaves them. +reads = [] +for s, (label, count) in zip(overrep_seqs, OVERREP): + for i in range(count): + reads.append((f"{label}_{i+1}", s)) +for i, s in enumerate(background_seqs): + reads.append((f"BACKGROUND_{i+1}", s)) + +# Shuffle deterministically so overrepresented reads aren't all clustered. 
+order_rng = random.Random(99) +order_rng.shuffle(reads) +assert len(reads) == TOTAL + +out_path = sys.argv[1] +# mtime=0 makes the gzip header deterministic so re-running the generator +# produces byte-identical output on any machine. +with gzip.GzipFile(filename=out_path, mode="wb", mtime=0) as f: + for header, seq in reads: + f.write(f"@{header}\n{seq}\n+\n{QUAL}\n".encode("ascii")) + +print(f"wrote {out_path}: {TOTAL} reads, {LEN}bp, {len(OVERREP)} overrepresented sequences", file=sys.stderr) +for label, count in OVERREP: + pct = count * 100 / TOTAL + print(f" {label}: {count}/{TOTAL} = {pct}%", file=sys.stderr) diff --git a/tests/data/realistic.fastq.gz b/tests/data/realistic.fastq.gz new file mode 100644 index 0000000..386166d Binary files /dev/null and b/tests/data/realistic.fastq.gz differ diff --git a/tests/equivalence/patches/realistic_default_adapter_content_svg.patch b/tests/equivalence/patches/realistic_default_adapter_content_svg.patch new file mode 100644 index 0000000..a905e30 --- /dev/null +++ b/tests/equivalence/patches/realistic_default_adapter_content_svg.patch @@ -0,0 +1,11 @@ +--- java/Images/adapter_content.svg ++++ rust/Images/adapter_content.svg +@@ -17,7 +17,7 @@ + % Adapter + + +-Position in read (bp) ++Position in read (bp) + 1 + + 2 diff --git a/tests/equivalence/patches/realistic_default_duplication_levels_svg.patch b/tests/equivalence/patches/realistic_default_duplication_levels_svg.patch new file mode 100644 index 0000000..8739842 --- /dev/null +++ b/tests/equivalence/patches/realistic_default_duplication_levels_svg.patch @@ -0,0 +1,22 @@ +--- java/Images/duplication_levels.svg ++++ rust/Images/duplication_levels.svg +@@ -17,7 +17,7 @@ + Percent of seqs remaining if deduplicated 87.81% + + +-Sequence Duplication Level ++Sequence Duplication Level + 1 + + 2 +@@ -68,7 +68,7 @@ + + + +- +- +-% Total sequences ++ ++ ++% Total sequences + diff --git a/tests/equivalence/patches/realistic_default_per_base_n_content_svg.patch 
b/tests/equivalence/patches/realistic_default_per_base_n_content_svg.patch new file mode 100644 index 0000000..4c30aab --- /dev/null +++ b/tests/equivalence/patches/realistic_default_per_base_n_content_svg.patch @@ -0,0 +1,21 @@ +--- java/Images/per_base_n_content.svg ++++ rust/Images/per_base_n_content.svg +@@ -17,7 +17,7 @@ + N content across all bases + + +-Position in read (bp) ++Position in read (bp) + 1 + + 2 +@@ -132,7 +132,7 @@ + + + +- +- ++ ++ + %N + diff --git a/tests/equivalence/patches/realistic_default_per_base_quality_svg.patch b/tests/equivalence/patches/realistic_default_per_base_quality_svg.patch new file mode 100644 index 0000000..eaaef17 --- /dev/null +++ b/tests/equivalence/patches/realistic_default_per_base_quality_svg.patch @@ -0,0 +1,20 @@ +--- java/Images/per_base_quality.svg ++++ rust/Images/per_base_quality.svg +@@ -21,7 +21,7 @@ + 30 + 32 + 34 +-Quality scores across all bases (Illumina 1.5 encoding) ++Quality scores across all bases (Illumina 1.5 encoding) + + + +@@ -203,7 +203,7 @@ + + + +-Position in read (bp) ++Position in read (bp) + + + diff --git a/tests/equivalence/patches/realistic_default_per_base_sequence_content_svg.patch b/tests/equivalence/patches/realistic_default_per_base_sequence_content_svg.patch new file mode 100644 index 0000000..17a07ea --- /dev/null +++ b/tests/equivalence/patches/realistic_default_per_base_sequence_content_svg.patch @@ -0,0 +1,22 @@ +--- java/Images/per_base_sequence_content.svg ++++ rust/Images/per_base_sequence_content.svg +@@ -17,7 +17,7 @@ + Sequence content across all bases + + +-Position in read (bp) ++Position in read (bp) + 1 + + 2 +@@ -279,8 +279,8 @@ + + + +- +- ++ ++ + %T + %C + %A diff --git a/tests/equivalence/patches/realistic_default_per_sequence_gc_content_svg.patch b/tests/equivalence/patches/realistic_default_per_sequence_gc_content_svg.patch new file mode 100644 index 0000000..48fabba --- /dev/null +++ 
b/tests/equivalence/patches/realistic_default_per_sequence_gc_content_svg.patch @@ -0,0 +1,22 @@ +--- java/Images/per_sequence_gc_content.svg ++++ rust/Images/per_sequence_gc_content.svg +@@ -15,7 +15,7 @@ + GC distribution over all sequences + + +-Mean GC content (%) ++Mean GC content (%) + 0 + + 2 +@@ -303,8 +303,8 @@ + + + +- +- ++ ++ + GC count per read + Theoretical Distribution + diff --git a/tests/equivalence/patches/realistic_default_per_sequence_quality_svg.patch b/tests/equivalence/patches/realistic_default_per_sequence_quality_svg.patch new file mode 100644 index 0000000..1b0b9ed --- /dev/null +++ b/tests/equivalence/patches/realistic_default_per_sequence_quality_svg.patch @@ -0,0 +1,26 @@ +--- java/Images/per_sequence_quality.svg ++++ rust/Images/per_sequence_quality.svg +@@ -9,10 +9,10 @@ + 600 + 800 + 1000 +-Quality score distribution over all sequences ++Quality score distribution over all sequences + + +-Mean Sequence Quality (Phred Score) ++Mean Sequence Quality (Phred Score) + 9 + + +@@ -20,7 +20,7 @@ + + + +- +- +-Average Quality per read ++ ++ ++Average Quality per read + diff --git a/tests/equivalence/patches/realistic_default_sequence_length_distribution_svg.patch b/tests/equivalence/patches/realistic_default_sequence_length_distribution_svg.patch new file mode 100644 index 0000000..9a182c7 --- /dev/null +++ b/tests/equivalence/patches/realistic_default_sequence_length_distribution_svg.patch @@ -0,0 +1,13 @@ +--- java/Images/sequence_length_distribution.svg ++++ rust/Images/sequence_length_distribution.svg +@@ -25,7 +25,7 @@ + + + +- +- +-Sequence Length ++ ++ ++Sequence Length + diff --git a/tests/equivalence/reference/realistic_default/Images/adapter_content.png b/tests/equivalence/reference/realistic_default/Images/adapter_content.png new file mode 100644 index 0000000..8d90857 Binary files /dev/null and b/tests/equivalence/reference/realistic_default/Images/adapter_content.png differ diff --git 
a/tests/equivalence/reference/realistic_default/Images/adapter_content.svg b/tests/equivalence/reference/realistic_default/Images/adapter_content.svg new file mode 100644 index 0000000..ca428d2 --- /dev/null +++ b/tests/equivalence/reference/realistic_default/Images/adapter_content.svg @@ -0,0 +1,311 @@ + + + + + +0 +10 +20 +30 +40 +50 +60 +70 +80 +90 +100 +% Adapter + + +Position in read (bp) +1 + +2 +3 + +4 +5 + +6 +7 + +8 +9 + +10 + +12 + +14 + +16 + +18 + +20 + +22 + +24 + +26 + +28 + +30 + +32 + +34 + +36 + +38 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Illumina Universal Adapter +Illumina Small RNA 3' Adapter +Illumina Small RNA 5' Adapter +Nextera Transposase Sequence +PolyA +PolyG + diff --git a/tests/equivalence/reference/realistic_default/Images/duplication_levels.png b/tests/equivalence/reference/realistic_default/Images/duplication_levels.png new file mode 100644 index 0000000..87c570b Binary files /dev/null and b/tests/equivalence/reference/realistic_default/Images/duplication_levels.png differ diff --git a/tests/equivalence/reference/realistic_default/Images/duplication_levels.svg b/tests/equivalence/reference/realistic_default/Images/duplication_levels.svg new file mode 100644 index 0000000..8f44b39 --- /dev/null +++ b/tests/equivalence/reference/realistic_default/Images/duplication_levels.svg @@ -0,0 +1,74 @@ + + + + + +0 +10 +20 +30 +40 +50 +60 +70 +80 +90 +100 +Percent of seqs remaining if deduplicated 87.81% + + +Sequence Duplication Level +1 + +2 +3 + +4 +5 + +6 +7 + +8 +9 + +>10 +>50 + +>100 +>500 + +>1k +>5k + 
+>10k + + + + + + + + + + + + + + + + + + + + + + + + + + + + +% Total sequences + diff --git a/tests/equivalence/reference/realistic_default/Images/per_base_n_content.png b/tests/equivalence/reference/realistic_default/Images/per_base_n_content.png new file mode 100644 index 0000000..0211043 Binary files /dev/null and b/tests/equivalence/reference/realistic_default/Images/per_base_n_content.png differ diff --git a/tests/equivalence/reference/realistic_default/Images/per_base_n_content.svg b/tests/equivalence/reference/realistic_default/Images/per_base_n_content.svg new file mode 100644 index 0000000..0981699 --- /dev/null +++ b/tests/equivalence/reference/realistic_default/Images/per_base_n_content.svg @@ -0,0 +1,138 @@ + + + + + +0 +10 +20 +30 +40 +50 +60 +70 +80 +90 +100 +N content across all bases + + +Position in read (bp) +1 + +2 +3 + +4 +5 + +6 +7 + +8 +9 + +11 + +13 + +15 + +17 + +19 + +21 + +23 + +25 + +27 + +29 + +31 + +33 + +35 + +37 + +39 + +41 + +43 + +45 + +47 + +49 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%N + diff --git a/tests/equivalence/reference/realistic_default/Images/per_base_quality.png b/tests/equivalence/reference/realistic_default/Images/per_base_quality.png new file mode 100644 index 0000000..f82cd15 Binary files /dev/null and b/tests/equivalence/reference/realistic_default/Images/per_base_quality.png differ diff --git a/tests/equivalence/reference/realistic_default/Images/per_base_quality.svg b/tests/equivalence/reference/realistic_default/Images/per_base_quality.svg new file mode 100644 index 0000000..795faca --- /dev/null +++ b/tests/equivalence/reference/realistic_default/Images/per_base_quality.svg @@ -0,0 +1,606 @@ + + + + + +0 +2 +4 +6 +8 +10 +12 +14 +16 +18 +20 +22 +24 +26 +28 +30 +32 +34 +Quality scores across all bases (Illumina 1.5 encoding) + + + +1 + + + +2 + + + +3 + + + +4 + + + +5 + + + +6 + + + +7 + + + +8 + + + +9 + + + + + + +11 + + + 
+ + + +13 + + + + + + +15 + + + + + + +17 + + + + + + +19 + + + + + + +21 + + + + + + +23 + + + + + + +25 + + + + + + +27 + + + + + + +29 + + + + + + +31 + + + + + + +33 + + + + + + +35 + + + + + + +37 + + + + + + +39 + + + + + + +41 + + + + + + +43 + + + + + + +45 + + + + + + +47 + + + + + + +49 + + + + + +Position in read (bp) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/equivalence/reference/realistic_default/Images/per_base_sequence_content.png b/tests/equivalence/reference/realistic_default/Images/per_base_sequence_content.png new file mode 100644 index 0000000..8e80320 Binary files /dev/null and b/tests/equivalence/reference/realistic_default/Images/per_base_sequence_content.png differ diff --git a/tests/equivalence/reference/realistic_default/Images/per_base_sequence_content.svg b/tests/equivalence/reference/realistic_default/Images/per_base_sequence_content.svg new file mode 100644 index 0000000..b676512 --- /dev/null +++ b/tests/equivalence/reference/realistic_default/Images/per_base_sequence_content.svg @@ -0,0 +1,288 @@ + + + + + +0 +10 +20 +30 +40 +50 +60 +70 +80 +90 +100 +Sequence content across all bases + + +Position in read (bp) +1 + +2 +3 + +4 +5 + +6 +7 + +8 +9 + +11 + +13 + +15 + +17 + 
+19 + +21 + +23 + +25 + +27 + +29 + +31 + +33 + +35 + +37 + +39 + +41 + +43 + +45 + +47 + +49 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%T +%C +%A +%G + diff --git a/tests/equivalence/reference/realistic_default/Images/per_sequence_gc_content.png b/tests/equivalence/reference/realistic_default/Images/per_sequence_gc_content.png new file mode 100644 index 0000000..9954155 Binary files /dev/null and b/tests/equivalence/reference/realistic_default/Images/per_sequence_gc_content.png differ diff --git a/tests/equivalence/reference/realistic_default/Images/per_sequence_gc_content.svg b/tests/equivalence/reference/realistic_default/Images/per_sequence_gc_content.svg new file mode 100644 index 0000000..de767aa --- /dev/null +++ b/tests/equivalence/reference/realistic_default/Images/per_sequence_gc_content.svg @@ -0,0 +1,310 @@ + + + + + +0 +20 +40 +60 +80 +100 +120 +140 +160 +GC distribution over all sequences + + +Mean GC content (%) +0 + +2 + +4 + +6 + +8 + + +11 + + +15 + + +19 + + +23 + + +27 + + +31 + + +35 + + +39 + + +43 + + +47 + + +51 + + +55 + + +59 + + +63 + + +67 + + +71 + + +75 + + +79 + + +83 + + +87 + + +91 + + +95 + + +99 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +GC count per read +Theoretical Distribution + 
diff --git a/tests/equivalence/reference/realistic_default/Images/per_sequence_quality.png b/tests/equivalence/reference/realistic_default/Images/per_sequence_quality.png new file mode 100644 index 0000000..f7eddef Binary files /dev/null and b/tests/equivalence/reference/realistic_default/Images/per_sequence_quality.png differ diff --git a/tests/equivalence/reference/realistic_default/Images/per_sequence_quality.svg b/tests/equivalence/reference/realistic_default/Images/per_sequence_quality.svg new file mode 100644 index 0000000..b528334 --- /dev/null +++ b/tests/equivalence/reference/realistic_default/Images/per_sequence_quality.svg @@ -0,0 +1,26 @@ + + + + + +0 +200 +400 +600 +800 +1000 +Quality score distribution over all sequences + + +Mean Sequence Quality (Phred Score) +9 + + + + + + + + +Average Quality per read + diff --git a/tests/equivalence/reference/realistic_default/Images/sequence_length_distribution.png b/tests/equivalence/reference/realistic_default/Images/sequence_length_distribution.png new file mode 100644 index 0000000..5625625 Binary files /dev/null and b/tests/equivalence/reference/realistic_default/Images/sequence_length_distribution.png differ diff --git a/tests/equivalence/reference/realistic_default/Images/sequence_length_distribution.svg b/tests/equivalence/reference/realistic_default/Images/sequence_length_distribution.svg new file mode 100644 index 0000000..45ba1ab --- /dev/null +++ b/tests/equivalence/reference/realistic_default/Images/sequence_length_distribution.svg @@ -0,0 +1,31 @@ + + + + + +0 +200 +400 +600 +800 +1000 +Distribution of sequence lengths over all sequences + + +Sequence Length (bp) +49 + +50 +51 + + + + + + + + + + +Sequence Length + diff --git a/tests/equivalence/reference/realistic_default/fastqc.fo b/tests/equivalence/reference/realistic_default/fastqc.fo new file mode 100644 index 0000000..1c0c5da --- /dev/null +++ b/tests/equivalence/reference/realistic_default/fastqc.fo @@ -0,0 +1,3 @@ + + FASTQC-Report + Basic 
StatisticsFilenamerealistic.fastq.gzFile typeConventional base callsEncodingIllumina 1.5Total Sequences1009Total Bases50.4 kbpSequences flagged as poor quality0Sequence length50%GC49Per base sequence qualityPer sequence quality scoresPer base sequence contentPer sequence GC contentPer base N contentSequence Length DistributionSequence Duplication LevelsOverrepresented sequencesGCAGGACCTCTAGATTGTATCACTCTGGACCGAAGATATTGACCCTCAAA737.234886025768088No HitGTAACAGTAACCGACAACCCGATCACAAGGTTCAAAGACTCCGTGAAAAA373.6669970267591676No HitCAGCAAGTGTGGTCTTTGTTCAAGTAAGCTTGCACCTGAGTTTGCGCTGC111.0901883052527255No HitCCACATCTCTCTCCCATTTGATCTATACGTAGACAGGTTCTAGATCCGGT50.4955401387512388No HitCAGAGTAAATCCTTGAGTGGCCTCTAGGCTCACATAATAGAAGTCAATCC20.19821605550049554No HitAdapter Content \ No newline at end of file diff --git a/tests/equivalence/reference/realistic_default/fastqc_data.txt b/tests/equivalence/reference/realistic_default/fastqc_data.txt new file mode 100644 index 0000000..ab75b6e --- /dev/null +++ b/tests/equivalence/reference/realistic_default/fastqc_data.txt @@ -0,0 +1,353 @@ +##FastQC 0.12.1 +>>Basic Statistics pass +#Measure Value +Filename realistic.fastq.gz +File type Conventional base calls +Encoding Illumina 1.5 +Total Sequences 1009 +Total Bases 50.4 kbp +Sequences flagged as poor quality 0 +Sequence length 50 +%GC 49 +>>END_MODULE +>>Per base sequence quality fail +#Base Mean Median Lower Quartile Upper Quartile 10th Percentile 90th Percentile +1 9.0 9.0 9.0 9.0 9.0 9.0 +2 9.0 9.0 9.0 9.0 9.0 9.0 +3 9.0 9.0 9.0 9.0 9.0 9.0 +4 9.0 9.0 9.0 9.0 9.0 9.0 +5 9.0 9.0 9.0 9.0 9.0 9.0 +6 9.0 9.0 9.0 9.0 9.0 9.0 +7 9.0 9.0 9.0 9.0 9.0 9.0 +8 9.0 9.0 9.0 9.0 9.0 9.0 +9 9.0 9.0 9.0 9.0 9.0 9.0 +10 9.0 9.0 9.0 9.0 9.0 9.0 +11 9.0 9.0 9.0 9.0 9.0 9.0 +12 9.0 9.0 9.0 9.0 9.0 9.0 +13 9.0 9.0 9.0 9.0 9.0 9.0 +14 9.0 9.0 9.0 9.0 9.0 9.0 +15 9.0 9.0 9.0 9.0 9.0 9.0 +16 9.0 9.0 9.0 9.0 9.0 9.0 +17 9.0 9.0 9.0 9.0 9.0 9.0 +18 9.0 9.0 9.0 9.0 9.0 9.0 +19 9.0 9.0 9.0 9.0 9.0 9.0 +20 9.0 
9.0 9.0 9.0 9.0 9.0 +21 9.0 9.0 9.0 9.0 9.0 9.0 +22 9.0 9.0 9.0 9.0 9.0 9.0 +23 9.0 9.0 9.0 9.0 9.0 9.0 +24 9.0 9.0 9.0 9.0 9.0 9.0 +25 9.0 9.0 9.0 9.0 9.0 9.0 +26 9.0 9.0 9.0 9.0 9.0 9.0 +27 9.0 9.0 9.0 9.0 9.0 9.0 +28 9.0 9.0 9.0 9.0 9.0 9.0 +29 9.0 9.0 9.0 9.0 9.0 9.0 +30 9.0 9.0 9.0 9.0 9.0 9.0 +31 9.0 9.0 9.0 9.0 9.0 9.0 +32 9.0 9.0 9.0 9.0 9.0 9.0 +33 9.0 9.0 9.0 9.0 9.0 9.0 +34 9.0 9.0 9.0 9.0 9.0 9.0 +35 9.0 9.0 9.0 9.0 9.0 9.0 +36 9.0 9.0 9.0 9.0 9.0 9.0 +37 9.0 9.0 9.0 9.0 9.0 9.0 +38 9.0 9.0 9.0 9.0 9.0 9.0 +39 9.0 9.0 9.0 9.0 9.0 9.0 +40 9.0 9.0 9.0 9.0 9.0 9.0 +41 9.0 9.0 9.0 9.0 9.0 9.0 +42 9.0 9.0 9.0 9.0 9.0 9.0 +43 9.0 9.0 9.0 9.0 9.0 9.0 +44 9.0 9.0 9.0 9.0 9.0 9.0 +45 9.0 9.0 9.0 9.0 9.0 9.0 +46 9.0 9.0 9.0 9.0 9.0 9.0 +47 9.0 9.0 9.0 9.0 9.0 9.0 +48 9.0 9.0 9.0 9.0 9.0 9.0 +49 9.0 9.0 9.0 9.0 9.0 9.0 +50 9.0 9.0 9.0 9.0 9.0 9.0 +>>END_MODULE +>>Per sequence quality scores fail +#Quality Count +9 1009.0 +>>END_MODULE +>>Per base sequence content warn +#Base G A T C +1 33.10208126858276 21.902874132804754 21.605550049554015 23.389494549058472 +2 22.2001982160555 21.407333994053516 25.272547076313177 31.119920713577798 +3 23.984142715559962 33.79583746283449 20.713577799801783 21.506442021803764 +4 28.543111992071356 23.785926660059463 22.497522299306244 25.173439048562933 +5 29.83151635282458 24.677898909811695 21.110009910802773 24.380574826560952 +6 22.101090188305253 34.093161546085234 19.920713577799802 23.88503468780971 +7 27.056491575817642 19.028741328047573 22.2001982160555 31.714568880079284 +8 20.812685827552034 21.010901883052526 28.93954410307235 29.23686818632309 +9 25.768087215064423 25.074331020812686 28.344895936570865 20.812685827552034 +10 22.794846382556987 23.48860257680872 24.2814667988107 29.43508424182359 +11 23.48860257680872 22.69573835480674 29.038652130822594 24.777006937561943 +12 21.80376610505451 30.426164519326065 20.41625371655104 27.35381565906839 +13 33.0029732408325 22.001982160555006 22.398414271555996 
22.59663032705649 +14 20.911793855302278 33.3994053518335 21.80376610505451 23.88503468780971 +15 22.497522299306244 21.506442021803764 27.452923686818632 28.543111992071356 +16 21.605550049554015 26.362735381565905 29.43508424182359 22.59663032705649 +17 28.14667988107037 25.074331020812686 24.4796828543112 22.29930624380575 +18 22.893954410307234 21.110009910802773 31.020812685827554 24.975222993062438 +19 20.317145688800792 29.23686818632309 24.08325074331021 26.362735381565905 +20 21.407333994053516 22.101090188305253 31.020812685827554 25.47076313181368 +21 28.14667988107037 19.127849355797817 22.398414271555996 30.327056491575817 +22 22.398414271555996 33.30029732408325 23.290386521308225 21.010901883052526 +23 22.398414271555996 22.101090188305253 26.362735381565905 29.137760158572846 +24 20.515361744301288 23.19127849355798 30.22794846382557 26.065411298315162 +25 20.01982160555005 29.33597621407334 20.812685827552034 29.83151635282458 +26 23.785926660059463 23.290386521308225 29.137760158572846 23.785926660059463 +27 28.44400396432111 26.461843409316156 25.074331020812686 20.01982160555005 +28 32.70564915758176 26.065411298315162 21.308225966303272 19.920713577799802 +29 25.569871159563924 30.22794846382557 21.308225966303272 22.893954410307234 +30 22.2001982160555 22.101090188305253 26.7591674925669 28.93954410307235 +31 23.389494549058472 21.902874132804754 26.560951437066404 28.14667988107037 +32 31.41724479682854 20.614469772051535 26.957383548067394 21.010901883052526 +33 20.812685827552034 32.30921704658078 20.118929633300297 26.7591674925669 +34 22.69573835480674 33.201189296333 21.902874132804754 22.2001982160555 +35 29.038652130822594 26.461843409316156 23.290386521308225 21.209117938553025 +36 22.2001982160555 33.3994053518335 20.614469772051535 23.785926660059463 +37 26.26362735381566 21.80376610505451 30.426164519326065 21.506442021803764 +38 22.101090188305253 32.01189296333003 22.993062438057482 22.893954410307234 +39 20.515361744301288 
23.68681863230922 30.327056491575817 25.47076313181368 +40 22.794846382556987 21.902874132804754 32.21010901883052 23.09217046580773 +41 27.94846382556987 22.398414271555996 24.182358771060457 25.47076313181368 +42 22.497522299306244 28.93954410307235 21.605550049554015 26.957383548067394 +43 26.16451932606541 19.920713577799802 23.19127849355798 30.723488602576808 +44 22.398414271555996 22.29930624380575 26.461843409316156 28.840436075322103 +45 26.16451932606541 24.4796828543112 19.920713577799802 29.43508424182359 +46 21.506442021803764 26.16451932606541 31.813676907829535 20.515361744301288 +47 22.59663032705649 23.984142715559962 23.88503468780971 29.534192269573833 +48 22.101090188305253 34.489593657086225 23.09217046580773 20.317145688800792 +49 22.59663032705649 34.19226957383548 19.722497522299307 23.48860257680872 +50 21.605550049554015 33.597621407333996 22.101090188305253 22.69573835480674 +>>END_MODULE +>>Per sequence GC content fail +#GC Content Count +0 0.0 +1 0.0 +2 0.0 +3 0.0 +4 0.0 +5 0.0 +6 0.0 +7 0.0 +8 0.0 +9 0.0 +10 0.0 +11 0.0 +12 0.0 +13 0.0 +14 0.0 +15 0.0 +16 0.0 +17 0.0 +18 0.0 +19 0.0 +20 0.0 +21 0.0 +22 0.0 +23 0.0 +24 0.0 +25 0.0 +26 0.0 +27 0.0 +28 0.0 +29 1.0 +30 2.0 +31 4.5 +32 7.0 +33 8.0 +34 9.0 +35 10.5 +36 12.0 +37 16.5 +38 21.0 +39 31.0 +40 41.0 +41 46.0 +42 51.0 +43 77.0 +44 103.0 +45 139.5 +46 176.0 +47 129.5 +48 83.0 +49 103.5 +50 124.0 +51 106.5 +52 89.0 +53 81.5 +54 74.0 +55 73.0 +56 72.0 +57 67.0 +58 62.0 +59 48.0 +60 34.0 +61 26.5 +62 19.0 +63 15.5 +64 12.0 +65 12.0 +66 12.0 +67 7.5 +68 3.0 +69 2.5 +70 2.0 +71 1.5 +72 1.0 +73 0.5 +74 0.0 +75 0.0 +76 0.0 +77 0.0 +78 0.0 +79 0.0 +80 0.0 +81 0.0 +82 0.0 +83 0.0 +84 0.0 +85 0.0 +86 0.0 +87 0.0 +88 0.0 +89 0.0 +90 0.0 +91 0.0 +92 0.0 +93 0.0 +94 0.0 +95 0.0 +96 0.0 +97 0.0 +98 0.0 +99 0.0 +100 0.0 +>>END_MODULE +>>Per base N content pass +#Base N-Count +1 0.0 +2 0.0 +3 0.0 +4 0.0 +5 0.0 +6 0.0 +7 0.0 +8 0.0 +9 0.0 +10 0.0 +11 0.0 +12 0.0 +13 0.0 +14 0.0 +15 0.0 +16 0.0 +17 
0.0 +18 0.0 +19 0.0 +20 0.0 +21 0.0 +22 0.0 +23 0.0 +24 0.0 +25 0.0 +26 0.0 +27 0.0 +28 0.0 +29 0.0 +30 0.0 +31 0.0 +32 0.0 +33 0.0 +34 0.0 +35 0.0 +36 0.0 +37 0.0 +38 0.0 +39 0.0 +40 0.0 +41 0.0 +42 0.0 +43 0.0 +44 0.0 +45 0.0 +46 0.0 +47 0.0 +48 0.0 +49 0.0 +50 0.0 +>>END_MODULE +>>Sequence Length Distribution pass +#Length Count +50 1009.0 +>>END_MODULE +>>Sequence Duplication Levels pass +#Total Deduplicated Percentage 87.80971258671953 +#Duplication Level Percentage of total +1 87.31417244796829 +2 0.19821605550049554 +3 0.0 +4 0.0 +5 0.4955401387512388 +6 0.0 +7 0.0 +8 0.0 +9 0.0 +>10 4.757185332011893 +>50 7.234886025768088 +>100 0.0 +>500 0.0 +>1k 0.0 +>5k 0.0 +>10k+ 0.0 +>>END_MODULE +>>Overrepresented sequences fail +#Sequence Count Percentage Possible Source +GCAGGACCTCTAGATTGTATCACTCTGGACCGAAGATATTGACCCTCAAA 73 7.234886025768088 No Hit +GTAACAGTAACCGACAACCCGATCACAAGGTTCAAAGACTCCGTGAAAAA 37 3.6669970267591676 No Hit +CAGCAAGTGTGGTCTTTGTTCAAGTAAGCTTGCACCTGAGTTTGCGCTGC 11 1.0901883052527255 No Hit +CCACATCTCTCTCCCATTTGATCTATACGTAGACAGGTTCTAGATCCGGT 5 0.4955401387512388 No Hit +CAGAGTAAATCCTTGAGTGGCCTCTAGGCTCACATAATAGAAGTCAATCC 2 0.19821605550049554 No Hit +>>END_MODULE +>>Adapter Content pass +#Position Illumina Universal Adapter Illumina Small RNA 3' Adapter Illumina Small RNA 5' Adapter Nextera Transposase Sequence PolyA PolyG +1 0.0 0.0 0.0 0.0 0.0 0.0 +2 0.0 0.0 0.0 0.0 0.0 0.0 +3 0.0 0.0 0.0 0.0 0.0 0.0 +4 0.0 0.0 0.0 0.0 0.0 0.0 +5 0.0 0.0 0.0 0.0 0.0 0.0 +6 0.0 0.0 0.0 0.0 0.0 0.0 +7 0.0 0.0 0.0 0.0 0.0 0.0 +8 0.0 0.0 0.0 0.0 0.0 0.0 +9 0.0 0.0 0.0 0.0 0.0 0.0 +10 0.0 0.0 0.0 0.0 0.0 0.0 +11 0.0 0.0 0.0 0.0 0.0 0.0 +12 0.0 0.0 0.0 0.0 0.0 0.0 +13 0.0 0.0 0.0 0.0 0.0 0.0 +14 0.0 0.0 0.0 0.0 0.0 0.0 +15 0.0 0.0 0.0 0.0 0.0 0.0 +16 0.0 0.0 0.0 0.0 0.0 0.0 +17 0.0 0.0 0.0 0.0 0.0 0.0 +18 0.0 0.0 0.0 0.0 0.0 0.0 +19 0.0 0.0 0.0 0.0 0.0 0.0 +20 0.0 0.0 0.0 0.0 0.0 0.0 +21 0.0 0.0 0.0 0.0 0.0 0.0 +22 0.0 0.0 0.0 0.0 0.0 0.0 +23 0.0 0.0 0.0 0.0 0.0 0.0 +24 
0.0 0.0 0.0 0.0 0.0 0.0 +25 0.0 0.0 0.0 0.0 0.0 0.0 +26 0.0 0.0 0.0 0.0 0.0 0.0 +27 0.0 0.0 0.0 0.0 0.0 0.0 +28 0.0 0.0 0.0 0.0 0.0 0.0 +29 0.0 0.0 0.0 0.0 0.0 0.0 +30 0.0 0.0 0.0 0.0 0.0 0.0 +31 0.0 0.0 0.0 0.0 0.0 0.0 +32 0.0 0.0 0.0 0.0 0.0 0.0 +33 0.0 0.0 0.0 0.0 0.0 0.0 +34 0.0 0.0 0.0 0.0 0.0 0.0 +35 0.0 0.0 0.0 0.0 0.0 0.0 +36 0.0 0.0 0.0 0.0 0.0 0.0 +37 0.0 0.0 0.0 0.0 0.0 0.0 +38 0.0 0.0 0.0 0.0 0.0 0.0 +39 0.0 0.0 0.0 0.0 0.0 0.0 +>>END_MODULE diff --git a/tests/equivalence/reference/realistic_default/fastqc_report.html b/tests/equivalence/reference/realistic_default/fastqc_report.html new file mode 100644 index 0000000..b830093 --- /dev/null +++ b/tests/equivalence/reference/realistic_default/fastqc_report.html @@ -0,0 +1,187 @@ +realistic.fastq.gz FastQC Report
FastQCFastQC Report
Sun 26 Apr 2026
realistic.fastq.gz

[OK]Basic Statistics

MeasureValue
Filenamerealistic.fastq.gz
File typeConventional base calls
EncodingIllumina 1.5
Total Sequences1009
Total Bases50.4 kbp
Sequences flagged as poor quality0
Sequence length50
%GC49

[FAIL]Per base sequence quality

Per base quality graph

[FAIL]Per sequence quality scores

Per Sequence quality graph

[WARN]Per base sequence content

Per base sequence content

[FAIL]Per sequence GC content

Per sequence GC content graph

[OK]Per base N content

N content graph

[OK]Sequence Length Distribution

Sequence length distribution

[OK]Sequence Duplication Levels

Duplication level graph

[FAIL]Overrepresented sequences

SequenceCountPercentagePossible Source
GCAGGACCTCTAGATTGTATCACTCTGGACCGAAGATATTGACCCTCAAA737.234886025768088No Hit
GTAACAGTAACCGACAACCCGATCACAAGGTTCAAAGACTCCGTGAAAAA373.6669970267591676No Hit
CAGCAAGTGTGGTCTTTGTTCAAGTAAGCTTGCACCTGAGTTTGCGCTGC111.0901883052527255No Hit
CCACATCTCTCTCCCATTTGATCTATACGTAGACAGGTTCTAGATCCGGT50.4955401387512388No Hit
CAGAGTAAATCCTTGAGTGGCCTCTAGGCTCACATAATAGAAGTCAATCC20.19821605550049554No Hit

[OK]Adapter Content

Adapter graph

\ No newline at end of file diff --git a/tests/equivalence/reference/realistic_default/summary.txt b/tests/equivalence/reference/realistic_default/summary.txt new file mode 100644 index 0000000..1069074 --- /dev/null +++ b/tests/equivalence/reference/realistic_default/summary.txt @@ -0,0 +1,10 @@ +PASS Basic Statistics realistic.fastq.gz +FAIL Per base sequence quality realistic.fastq.gz +FAIL Per sequence quality scores realistic.fastq.gz +WARN Per base sequence content realistic.fastq.gz +FAIL Per sequence GC content realistic.fastq.gz +PASS Per base N content realistic.fastq.gz +PASS Sequence Length Distribution realistic.fastq.gz +PASS Sequence Duplication Levels realistic.fastq.gz +FAIL Overrepresented sequences realistic.fastq.gz +PASS Adapter Content realistic.fastq.gz diff --git a/tests/equivalence/test_cases.yaml b/tests/equivalence/test_cases.yaml index 7e2b758..6ea8f00 100644 --- a/tests/equivalence/test_cases.yaml +++ b/tests/equivalence/test_cases.yaml @@ -76,3 +76,14 @@ - name: complex_dup_length_10 file: complex.fastq args: [--dup_length, "10"] + +# Realistic mid-size input designed to exercise non-round overrepresented +# percentages. 1009 reads (prime), 50bp uniform, 5 deliberately-overrepresented +# sequences at counts (73, 37, 11, 5, 2) producing percentages with full +# Double.toString() precision (e.g. 7.234886025768088). Background reads are +# pseudo-random and below the 0.1% overrepresented threshold so they don't +# pollute the section. Generated deterministically; see the docstring in +# tests/data/gen_realistic.py. +- name: realistic_default + file: realistic.fastq.gz + args: []