Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions src/modules/overrepresented_seqs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -416,17 +416,18 @@ impl QCModule for OverRepresentedSeqs {
Some(hit) => hit.to_string(),
None => "No Hit".to_string(),
};
// The Java ResultsTable.getValueAt() for percentage does
// JAVA COMPAT: Math.round(percentage * 100.0) / 100.0, rounding to 2 decimal places.
// The text report then calls String.valueOf() on the Double, producing
// Java's Double.toString() format.
let rounded_pct = (s.percentage * 100.0).round() / 100.0;
// JAVA COMPAT: Java's OverRepresentedSeqs.OverrepresentedSeq stores
// the raw double percentage without rounding (see OverRepresentedSeqs.java:253),
// and AbstractQCModule.writeTable serializes it via String.valueOf(getValueAt(...))
// (AbstractQCModule.java:159), which returns Java's Double.toString() of the
// unrounded value (e.g. "7.160449112640348"). Pass the raw percentage to the
// formatter; do not round to 2 decimals.
writeln!(
writer,
"{}\t{}\t{}\t{}",
s.seq,
s.count,
java_format_double(rounded_pct),
java_format_double(s.percentage),
source
)?;
}
Expand Down
84 changes: 84 additions & 0 deletions tests/data/gen_realistic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env python3
"""Deterministically generate a realistic test FASTQ for fastqc-rust equivalence tests.

Produces 1009 reads of 50bp each at uniform Phred 40, with 5 deliberately
overrepresented sequences at non-round percentages. Background reads are
pseudo-random (fixed seed) and below the 0.1% overrepresented threshold.

Designed to expose the percentage-precision bug fixed by ewels/FastQC-Rust#2.

Output is written gzipped (~20 KB vs ~120 KB plain) to keep the repo lean.
The gzip stream uses mtime=0 and the deterministic content above so byte-
identical regeneration is possible across machines. Note that the gzip header
also records the base name of the output file, so byte-identical regeneration
additionally requires writing to the same file name.

Usage:
    gen_realistic.py OUTPUT.fastq.gz
"""
import gzip
import random
import sys

QUAL = "I" * 50  # Phred 40
LEN = 50
TOTAL = 1009

OVERREP = [
    ("OVERREP_A_HIGH", 73),  # 73/1009 = 7.23488602...%
    ("OVERREP_B_MID", 37),   # 37/1009 = 3.66699702...%
    ("OVERREP_C_LOW", 11),   # 11/1009 = 1.09018830...%
    ("OVERREP_D_TINY", 5),   # 5/1009 = 0.49554014...%
    ("OVERREP_E_EDGE", 2),   # 2/1009 = 0.19821605...% (just above 0.1% threshold)
]

# All randomness flows through explicitly seeded random.Random instances; the
# module-level generator is never used, so importing this file has no effect
# on global random state.
SEQ_SEED = 20260426   # seeds sequence generation
ORDER_SEED = 99       # seeds the final read shuffle


def random_seq(rng):
    """Return a random DNA string of LEN bases drawn from ``rng``."""
    return "".join(rng.choice("ACGT") for _ in range(LEN))


def build_reads():
    """Build the full, shuffled list of ``(header, sequence)`` reads.

    Returns:
        A list of TOTAL ``(header, sequence)`` tuples. Each entry of OVERREP
        contributes ``count`` copies of one sequence (headers ``<label>_<i>``);
        the remainder are unique background sequences (headers
        ``BACKGROUND_<i>``). The result is identical on every call.

    Raises:
        AssertionError: if OVERREP leaves no room for background reads or the
            assembled list does not contain exactly TOTAL reads.
    """
    seq_rng = random.Random(SEQ_SEED)
    seen = set()

    def fresh_seq():
        # Reject any candidate that collides with a previously issued sequence,
        # otherwise the per-sequence counts (and so the percentages) drift.
        while True:
            candidate = random_seq(seq_rng)
            if candidate not in seen:
                seen.add(candidate)
                return candidate

    # Draw order matters for reproducibility: the overrepresented sequences
    # are taken from the generator first, then the background sequences.
    overrep_seqs = [fresh_seq() for _ in OVERREP]

    n_overrep = sum(count for _, count in OVERREP)
    n_background = TOTAL - n_overrep
    assert n_background > 0
    background_seqs = [fresh_seq() for _ in range(n_background)]

    # Assemble reads grouped by source; the shuffle below spreads them out.
    reads = []
    for seq, (label, count) in zip(overrep_seqs, OVERREP):
        for i in range(count):
            reads.append((f"{label}_{i+1}", seq))
    for i, seq in enumerate(background_seqs):
        reads.append((f"BACKGROUND_{i+1}", seq))

    # Shuffle deterministically so overrepresented reads aren't all clustered.
    order_rng = random.Random(ORDER_SEED)
    order_rng.shuffle(reads)
    assert len(reads) == TOTAL
    return reads


def write_fastq_gz(out_path, reads):
    """Write ``reads`` to ``out_path`` as a gzip-compressed FASTQ file.

    Every record uses the constant quality string QUAL.
    """
    # mtime=0 makes the gzip header deterministic so re-running the generator
    # produces byte-identical output on any machine.
    with gzip.GzipFile(filename=out_path, mode="wb", mtime=0) as f:
        for header, seq in reads:
            f.write(f"@{header}\n{seq}\n+\n{QUAL}\n".encode("ascii"))


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``. The first item is the output path; any further
            items are ignored.

    Returns:
        0 on success, 2 if the output path argument is missing.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print("usage: gen_realistic.py OUTPUT.fastq.gz", file=sys.stderr)
        return 2
    out_path = args[0]

    write_fastq_gz(out_path, build_reads())

    print(f"wrote {out_path}: {TOTAL} reads, {LEN}bp, {len(OVERREP)} overrepresented sequences", file=sys.stderr)
    for label, count in OVERREP:
        pct = count * 100 / TOTAL
        print(f" {label}: {count}/{TOTAL} = {pct}%", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Binary file added tests/data/realistic.fastq.gz
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
--- java/Images/adapter_content.svg
+++ rust/Images/adapter_content.svg
@@ -17,7 +17,7 @@
<text x="380" y="30" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">% Adapter</text>
<line x1="30" y1="560" x2="790" y2="560" stroke="rgb(0,0,0)" stroke-width="[SW]"/>
<line x1="30" y1="560" x2="30" y2="40" stroke="rgb(0,0,0)" stroke-width="[SW]"/>
-<text x="340" y="595" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Position in read (bp)</text>
+<text x="350" y="595" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Position in read (bp)</text>
<text x="30" y="575" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">1</text>
<rect width="20" height="520" x="50" y="40" style="fill:rgb(230,230,230);stroke:none"/>
<text x="50" y="575" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">2</text>
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
--- java/Images/duplication_levels.svg
+++ rust/Images/duplication_levels.svg
@@ -17,7 +17,7 @@
<text x="260" y="30" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Percent of seqs remaining if deduplicated 87.81%</text>
<line x1="30" y1="560" x2="790" y2="560" stroke="rgb(0,0,0)" stroke-width="[SW]"/>
<line x1="30" y1="560" x2="30" y2="40" stroke="rgb(0,0,0)" stroke-width="[SW]"/>
-<text x="320" y="595" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Sequence Duplication Level</text>
+<text x="330" y="595" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Sequence Duplication Level</text>
<text x="50" y="575" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">1</text>
<rect width="50" height="520" x="80" y="40" style="fill:rgb(230,230,230);stroke:none"/>
<text x="100" y="575" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">2</text>
@@ -68,7 +68,7 @@
<line x1="620" y1="560" x2="660" y2="560" stroke="rgb(136,34,85)" stroke-width="[SW]"/>
<line x1="660" y1="560" x2="710" y2="560" stroke="rgb(136,34,85)" stroke-width="[SW]"/>
<line x1="710" y1="560" x2="760" y2="560" stroke="rgb(136,34,85)" stroke-width="[SW]"/>
-<rect width="120" height="23" x="670" y="40" style="fill:rgb(255,255,255);stroke:none"/>
-<rect width="120" height="23" x="670" y="40" rx="0" ry="0" style="fill:none;stroke-width:1;stroke:rgb(192,192,192)"/>
-<text x="670" y="55" fill="rgb(136,34,85)" font-family="[FONT]" font-size="12">% Total sequences</text>
+<rect width="110" height="23" x="680" y="40" style="fill:rgb(255,255,255);stroke:none"/>
+<rect width="110" height="23" x="680" y="40" rx="0" ry="0" style="fill:none;stroke-width:1;stroke:rgb(192,192,192)"/>
+<text x="680" y="55" fill="rgb(136,34,85)" font-family="[FONT]" font-size="12">% Total sequences</text>
</svg>
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
--- java/Images/per_base_n_content.svg
+++ rust/Images/per_base_n_content.svg
@@ -17,7 +17,7 @@
<text x="330" y="30" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">N content across all bases</text>
<line x1="30" y1="560" x2="790" y2="560" stroke="rgb(0,0,0)" stroke-width="[SW]"/>
<line x1="30" y1="560" x2="30" y2="40" stroke="rgb(0,0,0)" stroke-width="[SW]"/>
-<text x="340" y="595" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Position in read (bp)</text>
+<text x="350" y="595" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Position in read (bp)</text>
<text x="30" y="575" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">1</text>
<rect width="20" height="520" x="40" y="40" style="fill:rgb(230,230,230);stroke:none"/>
<text x="50" y="575" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">2</text>
@@ -132,7 +132,7 @@
<line x1="730" y1="560" x2="740" y2="560" stroke="rgb(136,34,85)" stroke-width="[SW]"/>
<line x1="740" y1="560" x2="760" y2="560" stroke="rgb(136,34,85)" stroke-width="[SW]"/>
<line x1="760" y1="560" x2="770" y2="560" stroke="rgb(136,34,85)" stroke-width="[SW]"/>
-<rect width="20" height="23" x="770" y="40" style="fill:rgb(255,255,255);stroke:none"/>
-<rect width="20" height="23" x="770" y="40" rx="0" ry="0" style="fill:none;stroke-width:1;stroke:rgb(192,192,192)"/>
+<rect width="20" height="23" x="760" y="40" style="fill:rgb(255,255,255);stroke:none"/>
+<rect width="20" height="23" x="760" y="40" rx="0" ry="0" style="fill:none;stroke-width:1;stroke:rgb(192,192,192)"/>
<text x="770" y="55" fill="rgb(136,34,85)" font-family="[FONT]" font-size="12">%N</text>
</svg>
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
--- java/Images/per_base_quality.svg
+++ rust/Images/per_base_quality.svg
@@ -21,7 +21,7 @@
<text x="0" y="121" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">30</text>
<text x="0" y="91" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">32</text>
<text x="0" y="61" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">34</text>
-<text x="250" y="30" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Quality scores across all bases (Illumina 1.5 encoding)</text>
+<text x="240" y="30" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Quality scores across all bases (Illumina 1.5 encoding)</text>
<rect width="20" height="297" x="20" y="263" style="fill:rgb(230,175,175);stroke:none"/>
<rect width="20" height="119" x="20" y="144" style="fill:rgb(230,215,175);stroke:none"/>
<rect width="20" height="104" x="20" y="40" style="fill:rgb(175,230,175);stroke:none"/>
@@ -203,7 +203,7 @@
<rect width="20" height="104" x="760" y="40" style="fill:rgb(195,230,195);stroke:none"/>
<line x1="20" y1="560" x2="790" y2="560" stroke="rgb(0,0,0)" stroke-width="[SW]"/>
<line x1="20" y1="560" x2="20" y2="40" stroke="rgb(0,0,0)" stroke-width="[SW]"/>
-<text x="340" y="595" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Position in read (bp)</text>
+<text x="350" y="595" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Position in read (bp)</text>
<rect width="10" height="0" x="20" y="427" style="fill:rgb(240,240,0);stroke:none"/>
<rect width="10" height="0" x="20" y="427" rx="0" ry="0" style="fill:none;stroke-width:1;stroke:rgb(0,0,0)"/>
<line x1="30" y1="427" x2="30" y2="427" stroke="rgb(0,0,0)" stroke-width="[SW]"/>
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
--- java/Images/per_base_sequence_content.svg
+++ rust/Images/per_base_sequence_content.svg
@@ -17,7 +17,7 @@
<text x="310" y="30" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Sequence content across all bases</text>
<line x1="30" y1="560" x2="790" y2="560" stroke="rgb(0,0,0)" stroke-width="[SW]"/>
<line x1="30" y1="560" x2="30" y2="40" stroke="rgb(0,0,0)" stroke-width="[SW]"/>
-<text x="340" y="595" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Position in read (bp)</text>
+<text x="350" y="595" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Position in read (bp)</text>
<text x="30" y="575" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">1</text>
<rect width="20" height="520" x="40" y="40" style="fill:rgb(230,230,230);stroke:none"/>
<text x="50" y="575" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">2</text>
@@ -279,8 +279,8 @@
<line x1="730" y1="443" x2="740" y2="446" stroke="rgb(221,204,119)" stroke-width="[SW]"/>
<line x1="740" y1="446" x2="760" y2="443" stroke="rgb(221,204,119)" stroke-width="[SW]"/>
<line x1="760" y1="443" x2="770" y2="448" stroke="rgb(221,204,119)" stroke-width="[SW]"/>
-<rect width="20" height="83" x="770" y="40" style="fill:rgb(255,255,255);stroke:none"/>
-<rect width="20" height="83" x="770" y="40" rx="0" ry="0" style="fill:none;stroke-width:1;stroke:rgb(192,192,192)"/>
+<rect width="20" height="83" x="760" y="40" style="fill:rgb(255,255,255);stroke:none"/>
+<rect width="20" height="83" x="760" y="40" rx="0" ry="0" style="fill:none;stroke-width:1;stroke:rgb(192,192,192)"/>
<text x="770" y="55" fill="rgb(136,34,85)" font-family="[FONT]" font-size="12">%T</text>
<text x="770" y="75" fill="rgb(51,34,136)" font-family="[FONT]" font-size="12">%C</text>
<text x="770" y="95" fill="rgb(17,119,51)" font-family="[FONT]" font-size="12">%A</text>
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
--- java/Images/per_sequence_gc_content.svg
+++ rust/Images/per_sequence_gc_content.svg
@@ -15,7 +15,7 @@
<text x="310" y="30" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">GC distribution over all sequences</text>
<line x1="30" y1="560" x2="790" y2="560" stroke="rgb(0,0,0)" stroke-width="[SW]"/>
<line x1="30" y1="560" x2="30" y2="40" stroke="rgb(0,0,0)" stroke-width="[SW]"/>
-<text x="340" y="595" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Mean GC content (%)</text>
+<text x="350" y="595" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Mean GC content (%)</text>
<text x="30" y="575" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">0</text>
<rect width="10" height="520" x="40" y="40" style="fill:rgb(230,230,230);stroke:none"/>
<text x="40" y="575" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">2</text>
@@ -303,8 +303,8 @@
<line x1="710" y1="560" x2="720" y2="560" stroke="rgb(51,34,136)" stroke-width="[SW]"/>
<line x1="720" y1="560" x2="720" y2="560" stroke="rgb(51,34,136)" stroke-width="[SW]"/>
<line x1="720" y1="560" x2="730" y2="560" stroke="rgb(51,34,136)" stroke-width="[SW]"/>
-<rect width="160" height="43" x="640" y="40" style="fill:rgb(255,255,255);stroke:none"/>
-<rect width="160" height="43" x="640" y="40" rx="0" ry="0" style="fill:none;stroke-width:1;stroke:rgb(192,192,192)"/>
+<rect width="150" height="43" x="640" y="40" style="fill:rgb(255,255,255);stroke:none"/>
+<rect width="150" height="43" x="640" y="40" rx="0" ry="0" style="fill:none;stroke-width:1;stroke:rgb(192,192,192)"/>
<text x="640" y="55" fill="rgb(136,34,85)" font-family="[FONT]" font-size="12">GC count per read</text>
<text x="640" y="75" fill="rgb(51,34,136)" font-family="[FONT]" font-size="12">Theoretical Distribution</text>
</svg>
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
--- java/Images/per_sequence_quality.svg
+++ rust/Images/per_sequence_quality.svg
@@ -9,10 +9,10 @@
<text x="0" y="257" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">600</text>
<text x="0" y="154" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">800</text>
<text x="0" y="51" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">1000</text>
-<text x="290" y="30" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Quality score distribution over all sequences</text>
+<text x="280" y="30" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Quality score distribution over all sequences</text>
<line x1="40" y1="560" x2="790" y2="560" stroke="rgb(0,0,0)" stroke-width="[SW]"/>
<line x1="40" y1="560" x2="40" y2="40" stroke="rgb(0,0,0)" stroke-width="[SW]"/>
-<text x="290" y="595" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Mean Sequence Quality (Phred Score)</text>
+<text x="300" y="595" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">Mean Sequence Quality (Phred Score)</text>
<text x="410" y="575" fill="rgb(0,0,0)" font-family="[FONT]" font-size="12">9</text>
<line x1="40" y1="560" x2="790" y2="560" stroke="rgb(180,180,180)" stroke-width="[SW]"/>
<line x1="40" y1="457" x2="790" y2="457" stroke="rgb(180,180,180)" stroke-width="[SW]"/>
@@ -20,7 +20,7 @@
<line x1="40" y1="251" x2="790" y2="251" stroke="rgb(180,180,180)" stroke-width="[SW]"/>
<line x1="40" y1="148" x2="790" y2="148" stroke="rgb(180,180,180)" stroke-width="[SW]"/>
<line x1="40" y1="45" x2="790" y2="45" stroke="rgb(180,180,180)" stroke-width="[SW]"/>
-<rect width="160" height="23" x="630" y="40" style="fill:rgb(255,255,255);stroke:none"/>
-<rect width="160" height="23" x="630" y="40" rx="0" ry="0" style="fill:none;stroke-width:1;stroke:rgb(192,192,192)"/>
-<text x="630" y="55" fill="rgb(136,34,85)" font-family="[FONT]" font-size="12">Average Quality per read</text>
+<rect width="150" height="23" x="640" y="40" style="fill:rgb(255,255,255);stroke:none"/>
+<rect width="150" height="23" x="640" y="40" rx="0" ry="0" style="fill:none;stroke-width:1;stroke:rgb(192,192,192)"/>
+<text x="640" y="55" fill="rgb(136,34,85)" font-family="[FONT]" font-size="12">Average Quality per read</text>
</svg>
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
--- java/Images/sequence_length_distribution.svg
+++ rust/Images/sequence_length_distribution.svg
@@ -25,7 +25,7 @@
<line x1="40" y1="45" x2="790" y2="45" stroke="rgb(180,180,180)" stroke-width="[SW]"/>
<line x1="160" y1="560" x2="410" y2="40" stroke="rgb(136,34,85)" stroke-width="[SW]"/>
<line x1="410" y1="40" x2="660" y2="560" stroke="rgb(136,34,85)" stroke-width="[SW]"/>
-<rect width="110" height="23" x="680" y="40" style="fill:rgb(255,255,255);stroke:none"/>
-<rect width="110" height="23" x="680" y="40" rx="0" ry="0" style="fill:none;stroke-width:1;stroke:rgb(192,192,192)"/>
-<text x="680" y="55" fill="rgb(136,34,85)" font-family="[FONT]" font-size="12">Sequence Length</text>
+<rect width="100" height="23" x="680" y="40" style="fill:rgb(255,255,255);stroke:none"/>
+<rect width="100" height="23" x="680" y="40" rx="0" ry="0" style="fill:none;stroke-width:1;stroke:rgb(192,192,192)"/>
+<text x="690" y="55" fill="rgb(136,34,85)" font-family="[FONT]" font-size="12">Sequence Length</text>
</svg>
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading