From 72e86b105d3597e4553ae18607f0bb0f9ce9b172 Mon Sep 17 00:00:00 2001 From: Erick Samera Date: Fri, 29 May 2026 22:47:43 -0700 Subject: [PATCH 1/2] feat(internal)!: improve motif matching with exact detection - Enhance motif matching by adding exact motif detection. - Introduce a new `IsExactACGT` function for checking A/C/G/T motifs. - Update `MatchMask` to utilize the best anchor for faster matching. - Add tests for new functionality and edge cases in motif detection. --- cmd/radigest/main.go | 2 +- internal/digest/digest.go | 38 +++++++++++++++++- internal/digest/digest_test.go | 19 +++++++++ internal/enzyme/iupac.go | 48 +++++++++++++++++------ internal/enzyme/pattern.go | 69 ++++++++++++++++++++++++++------- internal/enzyme/pattern_test.go | 32 +++++++++++++++ 6 files changed, 179 insertions(+), 29 deletions(-) diff --git a/cmd/radigest/main.go b/cmd/radigest/main.go index 83fee5d..4f836fc 100644 --- a/cmd/radigest/main.go +++ b/cmd/radigest/main.go @@ -23,7 +23,7 @@ import ( ) var ( - version = "v0.2.0" + version = "v0.3.0" ) const summarySchemaVersion = 1 diff --git a/internal/digest/digest.go b/internal/digest/digest.go index 25fdda4..00adcf3 100644 --- a/internal/digest/digest.go +++ b/internal/digest/digest.go @@ -1,6 +1,7 @@ package digest import ( + "bytes" "fmt" "strings" @@ -14,7 +15,9 @@ type Fragment struct { } type matcher struct { + exact []byte mask []uint8 + anchor int offset int } @@ -73,7 +76,15 @@ func TryNewPlanWithOptions(ens []enzyme.Enzyme, opt Options) (Plan, error) { if err != nil { return Plan{}, fmt.Errorf("enzyme %s recognition %q: %w", e.Name, e.Recognition, err) } - p.m[i] = matcher{mask: mask, offset: offset} + mat := matcher{ + mask: mask, + anchor: enzyme.BestMaskAnchor(mask), + offset: offset, + } + if enzyme.IsExactACGT(site) { + mat.exact = []byte(strings.ToUpper(site)) + } + p.m[i] = mat } return p, nil } @@ -92,6 +103,13 @@ func newCutScanner(mat matcher, seq []byte) cutScanner { } func (s *cutScanner) next() (int, bool) { + if len(s.mat.exact) > 0 { + return s.nextExact() + } + return s.nextMask() +} + +func (s *cutScanner) nextMask() (int, bool) { n := len(s.mat.mask) if n == 0 || len(s.seq) < n { return 0, false @@ -99,13 +117,29 @@ func (s *cutScanner) next() (int, bool) { for s.pos <= len(s.seq)-n { pos := s.pos s.pos++ - if enzyme.MatchMask(s.mat.mask, s.seq[pos:pos+n]) { + if enzyme.MatchMaskAt(s.mat.mask, s.mat.anchor, s.seq[pos:pos+n]) { return pos + s.mat.offset, true } } return 0, false } +func (s *cutScanner) nextExact() (int, bool) { + n := len(s.mat.exact) + if n == 0 || len(s.seq) < n || s.pos > len(s.seq)-n { + return 0, false + } + + idx := bytes.Index(s.seq[s.pos:], s.mat.exact) + if idx < 0 { + return 0, false + } + + siteStart := s.pos + idx + s.pos = siteStart + 1 // preserve overlapping motif detection + return siteStart + s.mat.offset, true +} + func emitIfKept(start, end, min, max int, emit func(Fragment) error) error { if ln := end - start; ln >= min && ln <= max { return emit(Fragment{Start: start, End: end}) diff --git a/internal/digest/digest_test.go b/internal/digest/digest_test.go index 12233de..bd66bcb 100644 --- a/internal/digest/digest_test.go +++ b/internal/digest/digest_test.go @@ -40,3 +40,22 @@ func TestDoubleDigest_AB_BA_Filter(t *testing.T) { } } } + +func TestExactScannerDetectsOverlappingMotifs(t *testing.T) { + eA := enzyme.Enzyme{Name: "FakeExact", Recognition: "AA^A"} + seq := []byte("AAAAA") + + frags := Digest(seq, []enzyme.Enzyme{eA}, 0, 1<<30) + want := []Fragment{ + {Start: 2, End: 3}, + {Start: 3, End: 4}, + } + if len(frags) != len(want) { + t.Fatalf("got %d fragments, want %d: %#v", len(frags), len(want), frags) + } + for i := range want { + if frags[i] != want[i] { + t.Fatalf("fragment %d got %+v, want %+v", i, frags[i], want[i]) + } + } +} diff --git a/internal/enzyme/iupac.go b/internal/enzyme/iupac.go index 2167eb7..985f4e6 100644 --- a/internal/enzyme/iupac.go +++ b/internal/enzyme/iupac.go @@ -2,7 +2,7 @@ package enzyme import "fmt" -// 4‑bit mask per base +// 4-bit mask per base. var codeMap = map[byte]uint8{ 'A': 1 << 0, 'C': 1 << 1, @@ -21,16 +21,44 @@ var codeMap = map[byte]uint8{ 'N': (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3), // match any base } -// CompilePattern converts an IUPAC recognition string to a slice of 4‑bit masks. +// motifMaskTable maps IUPAC motif symbols to their allowed A/C/G/T masks. +// Motif N means any unambiguous A/C/G/T reference base. +var motifMaskTable [256]uint8 + +// refMaskTable maps reference bases to masks used while matching a reference +// window. Reference N and all unrecognized symbols intentionally map to zero, +// so no recognition site is inferred across assembly gaps or unknown bases. +var refMaskTable [256]uint8 + +func init() { + for b, m := range codeMap { + setMaskBothCases(&motifMaskTable, b, m) + } + + setMaskBothCases(&refMaskTable, 'A', 1<<0) + setMaskBothCases(&refMaskTable, 'C', 1<<1) + setMaskBothCases(&refMaskTable, 'G', 1<<2) + setMaskBothCases(&refMaskTable, 'T', 1<<3) + + // Reference N remains zero by design. + refMaskTable['N'] = 0 + refMaskTable['n'] = 0 +} + +func setMaskBothCases(table *[256]uint8, b byte, mask uint8) { + table[b] = mask + if b >= 'A' && b <= 'Z' { + table[b+'a'-'A'] = mask + } +} + +// CompilePattern converts an IUPAC recognition string to a slice of 4-bit masks. func CompilePattern(seq string) ([]uint8, error) { out := make([]uint8, len(seq)) for i := 0; i < len(seq); i++ { c := seq[i] - if c >= 'a' && c <= 'z' { // ensure upper-case - c -= 'a' - 'A' - } - m, ok := codeMap[c] - if !ok { + m := motifMaskTable[c] + if m == 0 { return nil, fmt.Errorf("invalid IUPAC base %q", c) } out[i] = m @@ -44,11 +72,7 @@ func Match(pattern []uint8, window []byte) bool { return false } for i, m := range pattern { - base := window[i] - if base >= 'a' && base <= 'z' { - base -= 'a' - 'A' - } - if (m & codeMap[base]) == 0 { + if m&refMaskTable[window[i]] == 0 { return false } } diff --git a/internal/enzyme/pattern.go b/internal/enzyme/pattern.go index cac85d6..1967b52 100644 --- a/internal/enzyme/pattern.go +++ b/internal/enzyme/pattern.go @@ -1,5 +1,7 @@ package enzyme +import "math/bits" + // StripCaret removes “^” from the recognition site and returns (cleanSite, cutOffset). func StripCaret(recog string) (string, int) { for i := 0; i < len(recog); i++ { @@ -20,10 +22,7 @@ func CompileMask(site string) []uint8 { b := []byte(site) m := make([]uint8, len(b)) for i, c := range b { - if c >= 'a' && c <= 'z' { // upper-case on the fly - c -= 'a' - 'A' - } - m[i] = codeMap[c] + m[i] = motifMaskTable[c] } return m } @@ -37,29 +36,71 @@ func CompileMaskChecked(site string) ([]uint8, error) { // baseMaskWin maps a reference base to its mask for matching. // NOTE: We *block* 'N' in the reference (mask=0) so 'N' never matches any site. func baseMaskWin(b byte) uint8 { - if b >= 'a' && b <= 'z' { - b -= 'a' - 'A' - } - if b == 'N' { + return refMaskTable[b] +} + +// BestMaskAnchor returns the most selective position in a compiled motif mask. +// Positions with fewer allowed bases are tested first by MatchMaskAt. +func BestMaskAnchor(mask []uint8) int { + if len(mask) == 0 { return 0 } - if m, ok := codeMap[b]; ok { - return m + best := 0 + bestPop := 9 + for i, m := range mask { + pop := bits.OnesCount8(m) + if pop < bestPop { + best = i + bestPop = pop + if pop == 1 { + break + } + } } - return 0 // anything unknown in the sequence fails to match + return best } // MatchMask returns true iff window matches the compiled mask. func MatchMask(mask []uint8, window []byte) bool { + return MatchMaskAt(mask, BestMaskAnchor(mask), window) +} + +// MatchMaskAt returns true iff window matches the compiled mask, testing the +// provided anchor position first for a fast reject. +func MatchMaskAt(mask []uint8, anchor int, window []byte) bool { n := len(mask) - // fast reject on last position - if baseMaskWin(window[n-1])&mask[n-1] == 0 { + if n == 0 || len(window) < n { + return false + } + if anchor < 0 || anchor >= n { + anchor = n - 1 + } + if baseMaskWin(window[anchor])&mask[anchor] == 0 { return false } - for i := 0; i < n-1; i++ { + for i := 0; i < n; i++ { + if i == anchor { + continue + } if baseMaskWin(window[i])&mask[i] == 0 { return false } } return true } + +// IsExactACGT reports whether site contains only unambiguous A/C/G/T bases. +func IsExactACGT(site string) bool { + if site == "" { + return false + } + for i := 0; i < len(site); i++ { + switch site[i] { + case 'A', 'C', 'G', 'T', 'a', 'c', 'g', 't': + continue + default: + return false + } + } + return true +} diff --git a/internal/enzyme/pattern_test.go b/internal/enzyme/pattern_test.go index c4bc028..847ac43 100644 --- a/internal/enzyme/pattern_test.go +++ b/internal/enzyme/pattern_test.go @@ -38,3 +38,35 @@ func TestCompileMaskCheckedRejectsInvalidIUPAC(t *testing.T) { t.Fatalf("CompileMaskChecked returned nil error for invalid IUPAC symbol") } } + +func TestIsExactACGT(t *testing.T) { + if !IsExactACGT("GAATTC") { + t.Fatalf("GAATTC should be recognized as an exact A/C/G/T motif") + } + if !IsExactACGT("gaattc") { + t.Fatalf("lowercase exact motifs should be recognized") + } + if IsExactACGT("GCWGC") { + t.Fatalf("degenerate motifs must not use the exact-match scanner") + } + if IsExactACGT("") { + t.Fatalf("empty motif must not be exact") + } +} + +func TestBestMaskAnchorPrefersMostSpecificPosition(t *testing.T) { + mask := CompileMask("NNACN") + if got := BestMaskAnchor(mask); got != 2 { + t.Fatalf("best anchor = %d, want 2", got) + } +} + +func TestMatchMaskAtUsesAnchorAndReferenceNStillFails(t *testing.T) { + mask := CompileMask("NNACN") + if !MatchMaskAt(mask, BestMaskAnchor(mask), []byte("GGACA")) { + t.Fatalf("expected degenerate motif to match compatible reference") + } + if MatchMaskAt(mask, BestMaskAnchor(mask), []byte("GGNCG")) { + t.Fatalf("reference N must not match even under degenerate motif N") + } +} From 28b53c194083dd3b3b164ee96bd2e67de59c4e3b Mon Sep 17 00:00:00 2001 From: Erick Samera Date: Fri, 29 May 2026 23:10:06 -0700 Subject: [PATCH 2/2] feat(cmd): add stats-only JSON output for improved performance - Introduced `runStatsOnlyJSON` function for optimized stats processing. - Added `DigestStats` method to calculate fragment statistics without materializing fragments. - Enhanced `Stats` struct to summarize fragment counts and bases efficiently. - Updated tests to validate new `DigestStats` functionality against existing digest behavior. --- cmd/radigest/main.go | 216 ++++++++++++++++++++++++++++++++- internal/digest/digest.go | 117 ++++++++++++++++++ internal/digest/digest_test.go | 90 ++++++++++++++ 3 files changed, 418 insertions(+), 5 deletions(-) diff --git a/cmd/radigest/main.go b/cmd/radigest/main.go index 4f836fc..015d148 100644 --- a/cmd/radigest/main.go +++ b/cmd/radigest/main.go @@ -306,6 +306,38 @@ func run(args []string, stdin io.Reader, stdout, stderr io.Writer) error { return usageError{err: err} } + // Resolve the synthetic-genome seed before choosing the execution path so + // JSON summaries report the same value in streaming and stats-only modes. + resolvedSimSeed := int64(0) + if *simLen > 0 { + resolvedSimSeed = sim.ResolveSeed(*simSeed) + } + + if canUseStatsOnlyJSON(gffOutputPath, fragmentsTSVOutputPath, fragmentsFASTAOutputPath, jsonOutputPath, selector.Config()) { + return runStatsOnlyJSON(runStatsOnlyInput{ + Args: args, + Stdin: stdin, + Stdout: stdout, + Stderr: stderr, + Plan: plan, + Selector: selector, + EnzymeNames: enzymeNames, + FastaPath: *fastaPath, + SimLen: *simLen, + SimGC: *simGC, + SimSeedRequested: *simSeed, + SimSeedResolved: resolvedSimSeed, + MinLen: *minLen, + MaxLen: *maxLen, + Threads: *threads, + Verbose: *verbose, + AllowSame: *allowSame, + StrictCuts: *strictCuts, + IncludeEnds: *includeEnds, + JSONPath: jsonOutputPath, + }) + } + // ---- start writers ------------------------------------------------------- writer, err := collector.NewWriterTo(gffOutputPath, stdout) if err != nil { @@ -358,11 +390,6 @@ func run(args []string, stdin io.Reader, stdout, stderr io.Writer) error { }() // ---- source: FASTA or synthetic ---------------------------------------- - resolvedSimSeed := int64(0) - if *simLen > 0 { - resolvedSimSeed = sim.ResolveSeed(*simSeed) - } - faCh := make(chan fasta.Record) sourceErrCh := make(chan error, 1) go func() { @@ -440,6 +467,185 @@ func run(args []string, stdin io.Reader, stdout, stderr io.Writer) error { return nil } +type statsOnlyResult struct { + idx int + chr string + stats digest.Stats +} + +type runStatsOnlyInput struct { + Args []string + Stdin io.Reader + Stdout io.Writer + Stderr io.Writer + Plan digest.Plan + Selector sizeselect.Selector + EnzymeNames []string + FastaPath string + SimLen int + SimGC float64 + SimSeedRequested int64 + SimSeedResolved int64 + MinLen int + MaxLen int + Threads int + Verbose bool + AllowSame bool + StrictCuts bool + IncludeEnds bool + JSONPath string +} + +func canUseStatsOnlyJSON(gffPath, fragmentsTSVPath, fragmentsFASTAPath, jsonPath string, cfg sizeselect.Config) bool { + return jsonPath != "" && + gffPath == "" && + fragmentsTSVPath == "" && + fragmentsFASTAPath == "" && + cfg.Model == sizeselect.ModelHard && + cfg.ScoreMin == cfg.Min && + cfg.ScoreMax == cfg.Max +} + +func runStatsOnlyJSON(in runStatsOnlyInput) error { + type job struct { + idx int + rec fasta.Record + } + + jobs := make(chan job, in.Threads) + results := make(chan statsOnlyResult, in.Threads) + var wg sync.WaitGroup + + for i := 0; i < in.Threads; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for j := range jobs { + results <- statsOnlyResult{ + idx: j.idx, + chr: j.rec.ID, + stats: in.Plan.DigestStats(j.rec.Seq, in.MinLen, in.MaxLen), + } + } + }() + } + go func() { + wg.Wait() + close(results) + }() + + faCh := make(chan fasta.Record) + sourceErrCh := make(chan error, 1) + go func() { + if in.SimLen > 0 { + seq := sim.Make(in.SimLen, in.SimGC, in.SimSeedResolved) + faCh <- fasta.Record{ID: "chr1", Seq: seq} + close(faCh) + sourceErrCh <- nil + return + } + sourceErrCh <- fasta.StreamFrom(in.FastaPath, in.Stdin, faCh) + }() + go func() { + idx := 0 + for rec := range faCh { + jobs <- job{idx: idx, rec: rec} + idx++ + } + close(jobs) + }() + + stats, streamErr := collectStatsOnlyResults(results, in.Verbose, in.Stderr) + if streamErr != nil { + return fmt.Errorf("digest/stats: %w", streamErr) + } + sourceErr := <-sourceErrCh + if sourceErr != nil { + return fmt.Errorf("fasta stream: %w", sourceErr) + } + + sizeStats := hardSizeStatsFromCollector(in.Selector, stats) + + if _, err := fmt.Fprintf(in.Stderr, "Fragments kept: %d\nBases covered: %d\nChromosomes: %d\n", + stats.TotalFragments, stats.TotalBases, len(stats.PerChr)); err != nil { + return fmt.Errorf("write final stats: %w", err) + } + + summary := buildRunSummary(runSummaryInput{ + Args: in.Args, + Enzymes: in.EnzymeNames, + FastaPath: in.FastaPath, + SimLen: in.SimLen, + SimGC: in.SimGC, + SimSeedRequested: in.SimSeedRequested, + SimSeedResolved: in.SimSeedResolved, + MinLen: in.MinLen, + MaxLen: in.MaxLen, + Threads: in.Threads, + AllowSame: in.AllowSame, + StrictCuts: in.StrictCuts, + IncludeEnds: in.IncludeEnds, + SelectorConfig: in.Selector.Config(), + JSONPath: in.JSONPath, + SizeSelection: sizeStats, + Stats: stats, + }) + if err := writeSummaryJSONTo(in.JSONPath, summary, in.Stdout); err != nil { + return fmt.Errorf("write json: %w", err) + } + return nil +} + +func collectStatsOnlyResults(results <-chan statsOnlyResult, verbose bool, stderr io.Writer) (collector.Stats, error) { + pending := make(map[int]statsOnlyResult) + next := 0 + stats := collector.Stats{PerChr: make(map[string]collector.ChrStats)} + + for results != nil || len(pending) > 0 { + if r, ok := pending[next]; ok { + if r.stats.Fragments > 0 { + chrStats := collector.ChrStats{Fragments: r.stats.Fragments, Bases: r.stats.Bases} + stats.PerChr[r.chr] = chrStats + stats.TotalFragments += r.stats.Fragments + stats.TotalBases += r.stats.Bases + } + if verbose { + if _, err := fmt.Fprintf(stderr, "chr=%s fragments=%d\n", r.chr, r.stats.Fragments); err != nil { + return stats, fmt.Errorf("write progress for %s: %w", r.chr, err) + } + } + delete(pending, next) + next++ + continue + } + + if results == nil { + return stats, fmt.Errorf("missing digest result for chromosome index %d", next) + } + r, ok := <-results + if !ok { + results = nil + continue + } + pending[r.idx] = r + } + return stats, nil +} + +func hardSizeStatsFromCollector(selector sizeselect.Selector, stats collector.Stats) sizeselect.Stats { + sizeStats := sizeselect.NewStats(selector) + sizeStats.RawFragmentsScored = stats.TotalFragments + sizeStats.RawBasesScored = int64(stats.TotalBases) + sizeStats.RawFragmentsInWindow = stats.TotalFragments + sizeStats.RawBasesInWindow = int64(stats.TotalBases) + sizeStats.WeightedFragments = float64(stats.TotalFragments) + sizeStats.WeightedBases = float64(stats.TotalBases) + if stats.TotalFragments > 0 { + sizeStats.MeanWeightedLength = sizeStats.WeightedBases / sizeStats.WeightedFragments + } + return sizeStats +} + func writeResultStreams(w *collector.Writer, results <-chan digestResult, verbose bool) error { return writeResultStreamsTo(w, results, verbose, os.Stderr) } diff --git a/internal/digest/digest.go b/internal/digest/digest.go index 00adcf3..8daab3c 100644 --- a/internal/digest/digest.go +++ b/internal/digest/digest.go @@ -14,6 +14,26 @@ type Fragment struct { End int } +// Stats summarizes kept digest fragments without materializing Fragment values. +type Stats struct { + Fragments int + Bases int +} + +func (s *Stats) addIfKept(start, end, min, max int) { + if ln := end - start; ln >= min && ln <= max { + s.Fragments++ + s.Bases += ln + } +} + +func (s *Stats) addTerminalIfKept(start, end, min, max int) { + if end <= start { + return + } + s.addIfKept(start, end, min, max) +} + type matcher struct { exact []byte mask []uint8 @@ -266,6 +286,103 @@ func (p Plan) DigestEach(seq []byte, min, max int, emit func(Fragment) error) er return nil } +// DigestStats returns hard-window fragment counts and bases without constructing +// Fragment values or invoking a per-fragment callback. It uses the same fragment +// semantics as DigestEach for single/double digest, coincident cuts, same-enzyme +// adjacency, and optional terminal fragments. +func (p Plan) DigestStats(seq []byte, min, max int) Stats { + var stats Stats + if p.m[0].mask == nil { // no enzymes compiled + return stats + } + + aScan := newCutScanner(p.m[0], seq) + aPos, aOK := aScan.next() + + // Single-enzyme mode: only the previous cut coordinate is needed. + if p.m[1].mask == nil { + if !aOK { + if p.includeEnds { + stats.addTerminalIfKept(0, len(seq), min, max) + } + return stats + } + if p.includeEnds { + stats.addTerminalIfKept(0, aPos, min, max) + } + prevPos := aPos + for { + pos, ok := aScan.next() + if !ok { + if p.includeEnds { + stats.addTerminalIfKept(prevPos, len(seq), min, max) + } + return stats + } + stats.addIfKept(prevPos, pos, min, max) + prevPos = pos + } + } + + // Double-enzyme mode: merge the two naturally sorted cut-coordinate streams. + bScan := newCutScanner(p.m[1], seq) + bPos, bOK := bScan.next() + prevType := -1 // 0=A, 1=B + prevPos := 0 + sawCut := false + lastPos := 0 + + for aOK || bOK { + var pos int + if aOK && (!bOK || aPos <= bPos) { + pos = aPos + } else { + pos = bPos + } + hasA := aOK && aPos == pos + hasB := bOK && bPos == pos + + if p.includeEnds && !sawCut { + stats.addTerminalIfKept(0, pos, min, max) + } + sawCut = true + lastPos = pos + + if hasA && hasB { + // Coincident cuts are barriers. Count one zero-length fragment for + // the site if the caller's size range allows it, then reset + // adjacency so no fragment bridges across the coincident cut. + stats.addIfKept(pos, pos, min, max) + aPos, aOK = aScan.next() + bPos, bOK = bScan.next() + prevType = -1 + prevPos = pos + continue + } + + curType := 0 + if hasA { + aPos, aOK = aScan.next() + } else { + curType = 1 + bPos, bOK = bScan.next() + } + + if prevType != -1 && (p.allowSame || prevType != curType) { + stats.addIfKept(prevPos, pos, min, max) + } + prevType, prevPos = curType, pos + } + if p.includeEnds { + if !sawCut { + stats.addTerminalIfKept(0, len(seq), min, max) + return stats + } + stats.addTerminalIfKept(lastPos, len(seq), min, max) + } + return stats +} + // Digest supports: // - single-enzyme mode (only A configured): consecutive A cuts // - double-enzyme mode (A,B): adjacent AB/BA only (or AA/BB too if allowSame) diff --git a/internal/digest/digest_test.go b/internal/digest/digest_test.go index bd66bcb..d284174 100644 --- a/internal/digest/digest_test.go +++ b/internal/digest/digest_test.go @@ -59,3 +59,93 @@ func TestExactScannerDetectsOverlappingMotifs(t *testing.T) { } } } + +func TestDigestStatsMatchesDigest(t *testing.T) { + cases := []struct { + name string + seq []byte + ens []enzyme.Enzyme + opt Options + min int + max int + }{ + { + name: "single internal", + seq: []byte("AATTCCCCAATT"), + ens: []enzyme.Enzyme{{Name: "MluCI", Recognition: "^AATT"}}, + min: 1, + max: 100, + }, + { + name: "double AB BA", + seq: []byte("AAAAGAATTCTTAAAGAATTC"), + ens: []enzyme.Enzyme{ + {Name: "EcoRI", Recognition: "G^AATTC"}, + {Name: "MseI", Recognition: "T^TAA"}, + }, + min: 1, + max: 100, + }, + { + name: "double include ends", + seq: []byte("AAAAGAATTCTTAAAGAATTC"), + ens: []enzyme.Enzyme{ + {Name: "EcoRI", Recognition: "G^AATTC"}, + {Name: "MseI", Recognition: "T^TAA"}, + }, + opt: Options{IncludeEnds: true}, + min: 1, + max: 100, + }, + { + name: "same enzyme allowed", + seq: []byte("AAAAGAATTCAAAAGAATTCAAA"), + ens: []enzyme.Enzyme{ + {Name: "EcoRI", Recognition: "G^AATTC"}, + {Name: "NcoI", Recognition: "C^CATGG"}, + }, + opt: Options{AllowSame: true}, + min: 1, + max: 100, + }, + { + name: "coincident zero length", + seq: []byte("AAAGATCAAAGATC"), + ens: []enzyme.Enzyme{ + {Name: "DpnII", Recognition: "^GATC"}, + {Name: "MboI", Recognition: "^GATC"}, + }, + min: 0, + max: 100, + }, + { + name: "degenerate motif", + seq: []byte("AAGCAGCAAGCTGC"), + ens: []enzyme.Enzyme{{Name: "ApeKI", Recognition: "G^CWGC"}}, + min: 1, + max: 100, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + plan := NewPlanWithOptions(tc.ens, tc.opt) + frags := plan.Digest(tc.seq, tc.min, tc.max) + stats := plan.DigestStats(tc.seq, tc.min, tc.max) + + var wantBases int + for _, fr := range frags { + wantBases += fr.End - fr.Start + } + if stats.Fragments != len(frags) || stats.Bases != wantBases { + t.Fatalf( + "DigestStats=(frags=%d bases=%d), Digest=(frags=%d bases=%d)", + stats.Fragments, + stats.Bases, + len(frags), + wantBases, + ) + } + }) + } +}