diff --git a/Makefile b/Makefile index 83fb13b..a43ac1b 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,7 @@ VERSION := $(shell git describe --tags --dirty --always 2>/dev/null || echo dev GOFLAGS ?= -trimpath LDFLAGS := -s -w -X main.version=$(VERSION) SCRIPTS := scripts/radigest-screen-pairs scripts/radigest-rank-pairs scripts/radigest-fit-size-model +CACHED_SCREEN_BIN := $(BIN_DIR)/radigest-screen-pairs-cached .PHONY: all build install test lint tidy clean @@ -14,12 +15,14 @@ all: build build: mkdir -p $(BIN_DIR) $(GO) build $(GOFLAGS) -ldflags "$(LDFLAGS)" -o $(BIN_DIR)/radigest ./cmd/radigest + $(GO) build $(GOFLAGS) -ldflags "$(LDFLAGS)" -o $(CACHED_SCREEN_BIN) ./cmd/radigest-screen-pairs-cached cp $(SCRIPTS) $(BIN_DIR)/ - chmod 0755 $(BIN_DIR)/radigest $(BIN_DIR)/radigest-screen-pairs $(BIN_DIR)/radigest-rank-pairs $(BIN_DIR)/radigest-fit-size-model + chmod 0755 $(BIN_DIR)/radigest $(CACHED_SCREEN_BIN) $(BIN_DIR)/radigest-screen-pairs $(BIN_DIR)/radigest-rank-pairs $(BIN_DIR)/radigest-fit-size-model install: build install -d $(DESTDIR)$(PREFIX)/bin install -m 0755 $(BIN_DIR)/radigest $(DESTDIR)$(PREFIX)/bin/radigest + install -m 0755 $(CACHED_SCREEN_BIN) $(DESTDIR)$(PREFIX)/bin/radigest-screen-pairs-cached install -m 0755 $(BIN_DIR)/radigest-screen-pairs $(DESTDIR)$(PREFIX)/bin/radigest-screen-pairs install -m 0755 $(BIN_DIR)/radigest-rank-pairs $(DESTDIR)$(PREFIX)/bin/radigest-rank-pairs install -m 0755 $(BIN_DIR)/radigest-fit-size-model $(DESTDIR)$(PREFIX)/bin/radigest-fit-size-model diff --git a/cmd/radigest-screen-pairs-cached/main.go b/cmd/radigest-screen-pairs-cached/main.go new file mode 100644 index 0000000..63cb8b9 --- /dev/null +++ b/cmd/radigest-screen-pairs-cached/main.go @@ -0,0 +1,438 @@ +package main + +import ( + "encoding/json" + "errors" + "flag" + "fmt" + "io" + "os" + "path/filepath" + "regexp" + "runtime" + "sort" + "strings" + "sync" + "time" + + "github.com/ericksamera/radigest/internal/digest" + "github.com/ericksamera/radigest/internal/enzyme" + "github.com/ericksamera/radigest/internal/screen" + "github.com/ericksamera/radigest/internal/sizeselect" +) + +var version = "dev" + +var safeTag = regexp.MustCompile(`[^A-Za-z0-9_.-]+`) + +type usageError struct{ err error } + +func (e usageError) Error() string { return e.err.Error() } +func (e usageError) Unwrap() error { return e.err } + +type pairJob struct { + enzymeA string + enzymeB string + tag string + json string + log string +} + +type pairResult struct { + job pairJob + err error +} + +type pairJSON struct { + SchemaVersion int `json:"schema_version"` + RadigestVersion string `json:"radigest_version,omitempty"` + Command []string `json:"command,omitempty"` + Enzymes []string `json:"enzymes"` + MinLength int `json:"min_length"` + MaxLength int `json:"max_length"` + TotalFragments int `json:"total_fragments"` + TotalBases int `json:"total_bases"` + PerChromosome map[string]screen.RecordStats `json:"per_chromosome"` + SizeSelection sizeselect.Stats `json:"size_selection"` + Screening screen.ScreeningStats `json:"screening"` +} + +func main() { + if err := run(os.Args[1:], os.Stdout, os.Stderr); err != nil { + code := 1 + var usage usageError + if errors.As(err, &usage) { + code = 2 + } + _, _ = fmt.Fprintln(os.Stderr, "error:", err) + os.Exit(code) + } +} + +func run(args []string, stdout, stderr io.Writer) error { + if stdout == nil { + stdout = io.Discard + } + if stderr == nil { + stderr = io.Discard + } + + fs := flag.NewFlagSet("radigest-screen-pairs-cached", flag.ContinueOnError) + fs.SetOutput(stderr) + + fastaPath := fs.String("fasta", "", "reference FASTA file") + enzFlag := fs.String("enzymes", "", "comma-separated enzymes or a file with one/comma-separated enzymes per line") + outDir := fs.String("out-dir", "pair_screen_cached", "output directory containing json/ and logs/") + minLen := fs.Int("min", 300, "minimum fragment length (bp) for hard size selection") + maxLen := fs.Int("max", 600, "maximum fragment length (bp) for hard size selection") + scoreMin := fs.Int("score-min", 1, "minimum fragment length included in size-selection scoring") + scoreMax := fs.Int("score-max", 2000, "maximum fragment length included in size-selection scoring") + sizeModel := fs.String("size-model", "normal", "size-selection model: hard, normal, triangular, or soft-window") + sizeMean := fs.Float64("size-mean", 275, "target/peak insert length for normal/triangular models") + sizeSD := fs.Float64("size-sd", 85, "standard deviation for -size-model normal") + sizeEdgeSD := fs.Float64("size-edge-sd", 25, "edge softness for -size-model soft-window") + jobsFlag := fs.Int("jobs", 0, "parallel pair-scoring workers (default: -threads)") + threadsFlag := fs.Int("threads", runtime.NumCPU(), "worker count alias used when -jobs is not set") + allowSame := fs.Bool("allow-same", false, "double digest: also keep AA/BB neighbors (default AB/BA only)") + includeEnds := fs.Bool("include-ends", false, "also score terminal fragments from contig ends to nearest cut") + strictCuts := fs.Bool("strict-cuts", false, "error if an enzyme lacks a caret and CutIndex==0") + force := fs.Bool("force", false, "overwrite existing JSON files") + dryRun := fs.Bool("dry-run", false, "print pair jobs without reading FASTA or writing JSON") + maxPairs := fs.Int("max-pairs", 0, "maximum number of pairs to score; 0 means all pairs") + showVersion := fs.Bool("version", false, "print version and exit") + + fs.Usage = func() { + _, _ = fmt.Fprintln(stderr, "radigest-screen-pairs-cached — cached cut-index enzyme-pair screen") + _, _ = fmt.Fprintln(stderr) + _, _ = fmt.Fprintln(stderr, "Usage:") + _, _ = fmt.Fprintln(stderr, " radigest-screen-pairs-cached --fasta ref.fa.gz --enzymes enzymes.txt [options]") + _, _ = fmt.Fprintln(stderr) + fs.PrintDefaults() + } + + if err := fs.Parse(args); err != nil { + if errors.Is(err, flag.ErrHelp) { + return nil + } + return usageError{err: err} + } + if *showVersion { + _, err := fmt.Fprintf(stdout, "radigest-screen-pairs-cached %s\n", version) + return err + } + if *fastaPath == "" { + return usageError{err: errors.New("--fasta is required")} + } + if *enzFlag == "" { + return usageError{err: errors.New("--enzymes is required")} + } + if *minLen < 0 || *maxLen < *minLen { + return usageError{err: fmt.Errorf("invalid hard size window: min=%d max=%d", *minLen, *maxLen)} + } + + enzymeNames, err := readEnzymeNames(*enzFlag) + if err != nil { + return err + } + enzymes, err := lookupEnzymes(enzymeNames) + if err != nil { + return err + } + pairJobs := buildPairJobs(enzymeNames, *outDir, *force, *maxPairs) + if _, err := fmt.Fprintf(stderr, "candidate_enzymes\t%d\n", len(enzymeNames)); err != nil { + return err + } + if _, err := fmt.Fprintf(stderr, "pair_jobs\t%d\n", len(pairJobs)); err != nil { + return err + } + if len(pairJobs) == 0 { + return nil + } + if *dryRun { + for _, job := range pairJobs { + if _, err := fmt.Fprintf(stdout, "%s\t%s,%s\t%s\n", job.tag, job.enzymeA, job.enzymeB, job.json); err != nil { + return err + } + } + return nil + } + + selector, err := sizeselect.New(sizeselect.Config{ + Model: sizeselect.Model(*sizeModel), + Min: *minLen, + Max: *maxLen, + ScoreMin: *scoreMin, + ScoreMax: *scoreMax, + Mean: *sizeMean, + SD: *sizeSD, + EdgeSD: *sizeEdgeSD, + }) + if err != nil { + return err + } + + index, err := screen.BuildCutIndexFromFASTA(*fastaPath, enzymes, digest.Options{StrictCuts: *strictCuts}) + if err != nil { + return err + } + if _, err := fmt.Fprintf(stderr, "records\t%d\n", len(index.Records)); err != nil { + return err + } + if _, err := fmt.Fprintf(stderr, "cached_cut_sites\t%d\n", index.CachedCutSites()); err != nil { + return err + } + if _, err := fmt.Fprintf(stderr, "cache_memory_estimate_bytes\t%d\n", index.CacheMemoryEstimateBytes()); err != nil { + return err + } + + workers := *jobsFlag + if workers <= 0 { + workers = *threadsFlag + } + if workers <= 0 { + workers = 1 + } + if workers > len(pairJobs) { + workers = len(pairJobs) + } + + opt := digest.Options{ + AllowSame: *allowSame, + IncludeEnds: *includeEnds, + StrictCuts: *strictCuts, + } + return scorePairJobs(pairJobs, index, selector, opt, workers, args, stderr) +} + +func readEnzymeNames(value string) ([]string, error) { + var raw []string + if data, err := os.ReadFile(value); err == nil { + for _, line := range strings.Split(string(data), "\n") { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + raw = append(raw, splitNames(line)...) + } + } else { + raw = splitNames(value) + } + deduped := make([]string, 0, len(raw)) + seen := make(map[string]struct{}, len(raw)) + for _, name := range raw { + name = strings.TrimSpace(name) + if name == "" { + continue + } + if _, ok := seen[name]; ok { + continue + } + seen[name] = struct{}{} + deduped = append(deduped, name) + } + if len(deduped) < 2 { + return nil, fmt.Errorf("need at least two candidate enzymes") + } + return deduped, nil +} + +func splitNames(value string) []string { + fields := strings.FieldsFunc(value, func(r rune) bool { + return r == ',' || r == '\t' || r == ' ' || r == '\r' + }) + out := make([]string, 0, len(fields)) + for _, field := range fields { + field = strings.TrimSpace(field) + if field != "" { + out = append(out, field) + } + } + return out +} + +func lookupEnzymes(names []string) ([]enzyme.Enzyme, error) { + out := make([]enzyme.Enzyme, 0, len(names)) + for _, name := range names { + enz, ok := enzyme.Get(name) + if !ok { + return nil, fmt.Errorf("unknown enzyme %q", name) + } + out = append(out, enz) + } + return out, nil +} + +func buildPairJobs(enzymeNames []string, outDir string, force bool, maxPairs int) []pairJob { + jsonDir := filepath.Join(outDir, "json") + logDir := filepath.Join(outDir, "logs") + jobs := make([]pairJob, 0, len(enzymeNames)*(len(enzymeNames)-1)/2) + for i := 0; i < len(enzymeNames); i++ { + for j := i + 1; j < len(enzymeNames); j++ { + tag := pairTag(enzymeNames[i], enzymeNames[j]) + jsonPath := filepath.Join(jsonDir, tag+".json") + if !force { + if _, err := os.Stat(jsonPath); err == nil { + continue + } + } + jobs = append(jobs, pairJob{ + enzymeA: enzymeNames[i], + enzymeB: enzymeNames[j], + tag: tag, + json: jsonPath, + log: filepath.Join(logDir, tag+".log"), + }) + if maxPairs > 0 && len(jobs) >= maxPairs { + return jobs + } + } + } + return jobs +} + +func pairTag(enzymeA, enzymeB string) string { + return safeTag.ReplaceAllString(enzymeA+"__"+enzymeB, "_") +} + +func scorePairJobs(jobs []pairJob, index screen.CutIndex, selector sizeselect.Selector, opt digest.Options, workers int, command []string, stderr io.Writer) error { + jobCh := make(chan pairJob) + resultCh := make(chan pairResult, len(jobs)) + var wg sync.WaitGroup + for w := 0; w < workers; w++ { + wg.Add(1) + go func() { + defer wg.Done() + for job := range jobCh { + resultCh <- pairResult{job: job, err: scoreOnePair(job, index, selector, opt, command)} + } + }() + } + for _, job := range jobs { + jobCh <- job + } + close(jobCh) + wg.Wait() + close(resultCh) + + var failed []pairResult + results := make([]pairResult, 0, len(jobs)) + for result := range resultCh { + results = append(results, result) + if result.err != nil { + failed = append(failed, result) + } + } + sort.Slice(results, func(i, j int) bool { return results[i].job.tag < results[j].job.tag }) + for _, result := range results { + status := "ok" + if result.err != nil { + status = "error=" + result.err.Error() + } + if _, err := fmt.Fprintf(stderr, "%s\t%s\t%s\t%s\n", result.job.tag, status, result.job.json, result.job.log); err != nil { + return err + } + } + if len(failed) > 0 { + return fmt.Errorf("failed_pairs\t%d", len(failed)) + } + return nil +} + +func scoreOnePair(job pairJob, index screen.CutIndex, selector sizeselect.Selector, opt digest.Options, command []string) error { + started := time.Now() + if err := os.MkdirAll(filepath.Dir(job.json), 0o755); err != nil { + return err + } + if err := os.MkdirAll(filepath.Dir(job.log), 0o755); err != nil { + return err + } + summary, err := screen.ScorePair(index, job.enzymeA, job.enzymeB, selector, opt) + if err != nil { + if logErr := writePairLog(job, command, started, err); logErr != nil { + return errors.Join(err, logErr) + } + return err + } + out := pairJSON{ + SchemaVersion: summary.SchemaVersion, + RadigestVersion: version, + Command: append([]string(nil), command...), + Enzymes: summary.Enzymes, + MinLength: summary.MinLength, + MaxLength: summary.MaxLength, + TotalFragments: summary.TotalFragments, + TotalBases: summary.TotalBases, + PerChromosome: summary.PerChromosome, + SizeSelection: summary.SizeSelection, + Screening: summary.Screening, + } + if err := writeJSONAtomic(job.json, out); err != nil { + if logErr := writePairLog(job, command, started, err); logErr != nil { + return errors.Join(err, logErr) + } + return err + } + return writePairLog(job, command, started, nil) +} + +func writeJSONAtomic(path string, value any) error { + dir := filepath.Dir(path) + if err := os.MkdirAll(dir, 0o755); err != nil { + return err + } + tmp, err := os.CreateTemp(dir, filepath.Base(path)+".*.tmp") + if err != nil { + return err + } + tmpName := tmp.Name() + keepTmp := false + defer func() { + if !keepTmp { + _ = os.Remove(tmpName) + } + }() + + if err := tmp.Chmod(0o644); err != nil { + _ = tmp.Close() + return err + } + + enc := json.NewEncoder(tmp) + enc.SetIndent("", " ") + if err := enc.Encode(value); err != nil { + _ = tmp.Close() + return err + } + if err := tmp.Sync(); err != nil { + _ = tmp.Close() + return err + } + if err := tmp.Close(); err != nil { + return err + } + if err := os.Rename(tmpName, path); err != nil { + return err + } + keepTmp = true + return nil +} + +func writePairLog(job pairJob, command []string, started time.Time, err error) error { + if err := os.MkdirAll(filepath.Dir(job.log), 0o755); err != nil { + return err + } + + var b strings.Builder + b.WriteString(fmt.Sprintf("pair\t%s,%s\n", job.enzymeA, job.enzymeB)) + b.WriteString(fmt.Sprintf("json\t%s\n", job.json)) + b.WriteString(fmt.Sprintf("started_at\t%s\n", started.Format(time.RFC3339Nano))) + b.WriteString(fmt.Sprintf("finished_at\t%s\n", time.Now().Format(time.RFC3339Nano))) + b.WriteString(fmt.Sprintf("command\t%s\n", strings.Join(command, " "))) + if err != nil { + b.WriteString(fmt.Sprintf("status\terror\nerror\t%s\n", err)) + } else { + b.WriteString("status\tok\n") + } + + return os.WriteFile(job.log, []byte(b.String()), 0o644) +} diff --git a/cmd/radigest-screen-pairs-cached/main_test.go b/cmd/radigest-screen-pairs-cached/main_test.go new file mode 100644 index 0000000..e730cf6 --- /dev/null +++ b/cmd/radigest-screen-pairs-cached/main_test.go @@ -0,0 +1,138 @@ +package main + +import ( + "bytes" + "encoding/json" + "os" + "path/filepath" + "testing" +) + +func TestRunCachedScreenWritesRankablePairJSON(t *testing.T) { + dir := t.TempDir() + fastaPath := filepath.Join(dir, "toy.fa") + if err := os.WriteFile(fastaPath, []byte(">ecori_msei_double\nAAAAGAATTCTTAAAGAATTCTTT\n"), 0o644); err != nil { + t.Fatalf("write FASTA: %v", err) + } + outDir := filepath.Join(dir, "screen") + + var stdout, stderr bytes.Buffer + err := run([]string{ + "--fasta", fastaPath, + "--enzymes", "EcoRI,MseI", + "--min", "1", + "--max", "100", + "--score-min", "1", + "--score-max", "100", + "--size-model", "hard", + "--out-dir", outDir, + "--jobs", "1", + }, &stdout, &stderr) + if err != nil { + t.Fatalf("run() error = %v\nstderr:\n%s", err, stderr.String()) + } + + jsonPath := filepath.Join(outDir, "json", "EcoRI__MseI.json") + data, err := os.ReadFile(jsonPath) + if err != nil { + t.Fatalf("read pair JSON: %v", err) + } + var doc struct { + Enzymes []string `json:"enzymes"` + TotalFragments int `json:"total_fragments"` + TotalBases int `json:"total_bases"` + SizeSelection struct { + RawFragmentsInWindow int `json:"raw_fragments_in_window"` + RawBasesInWindow int64 `json:"raw_bases_in_window"` + WeightedFragments float64 `json:"weighted_fragments"` + WeightedBases float64 `json:"weighted_bases"` + } `json:"size_selection"` + Screening struct { + Engine string `json:"engine"` + } `json:"screening"` + } + if err := json.Unmarshal(data, &doc); err != nil { + t.Fatalf("unmarshal pair JSON: %v", err) + } + if got := doc.Enzymes; len(got) != 2 || got[0] != "EcoRI" || got[1] != "MseI" { + t.Fatalf("enzymes = %#v, want EcoRI,MseI", got) + } + if doc.TotalFragments != 2 || doc.TotalBases != 11 { + t.Fatalf("totals = fragments %d bases %d, want 2 and 11", doc.TotalFragments, doc.TotalBases) + } + if doc.SizeSelection.RawFragmentsInWindow != 2 || doc.SizeSelection.RawBasesInWindow != 11 { + t.Fatalf("size-selection raw fields = %d/%d, want 2/11", doc.SizeSelection.RawFragmentsInWindow, doc.SizeSelection.RawBasesInWindow) + } + if doc.SizeSelection.WeightedFragments != 2 || doc.SizeSelection.WeightedBases != 11 { + t.Fatalf("size-selection weighted fields = %g/%g, want 2/11", doc.SizeSelection.WeightedFragments, doc.SizeSelection.WeightedBases) + } + if doc.Screening.Engine != "cached-cut-index" { + t.Fatalf("screening engine = %q, want cached-cut-index", doc.Screening.Engine) + } +} + +func TestDryRunDoesNotReadMissingFASTA(t *testing.T) { + var stdout, stderr bytes.Buffer + err := run([]string{ + "--fasta", filepath.Join(t.TempDir(), "missing.fa"), + "--enzymes", "EcoRI,MseI", + "--out-dir", t.TempDir(), + "--dry-run", + }, &stdout, &stderr) + if err != nil { + t.Fatalf("dry-run returned error: %v", err) + } + if !bytes.Contains(stdout.Bytes(), []byte("EcoRI__MseI")) { + t.Fatalf("dry-run stdout did not contain pair tag; stdout=%q", stdout.String()) + } +} + +func TestWriteJSONAtomicWritesCompleteFile(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "pair.json") + + if err := writeJSONAtomic(path, map[string]any{"ok": true}); err != nil { + t.Fatalf("writeJSONAtomic returned error: %v", err) + } + + data, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read output JSON: %v", err) + } + var doc map[string]bool + if err := json.Unmarshal(data, &doc); err != nil { + t.Fatalf("unmarshal output JSON: %v", err) + } + if !doc["ok"] { + t.Fatalf("decoded JSON = %#v, want ok=true", doc) + } + + matches, err := filepath.Glob(filepath.Join(dir, "pair.json.*.tmp")) + if err != nil { + t.Fatalf("glob temp files: %v", err) + } + if len(matches) != 0 { + t.Fatalf("temporary files remain after successful write: %#v", matches) + } +} + +func TestWriteJSONAtomicDoesNotLeaveFinalFileOnEncodeError(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "pair.json") + + err := writeJSONAtomic(path, map[string]any{"bad": make(chan int)}) + if err == nil { + t.Fatalf("writeJSONAtomic returned nil error for unencodable value") + } + if _, statErr := os.Stat(path); !os.IsNotExist(statErr) { + t.Fatalf("final JSON file exists after failed write; stat error = %v", statErr) + } + + matches, globErr := filepath.Glob(filepath.Join(dir, "pair.json.*.tmp")) + if globErr != nil { + t.Fatalf("glob temp files: %v", globErr) + } + if len(matches) != 0 { + t.Fatalf("temporary files remain after failed write: %#v", matches) + } +} diff --git a/cmd/radigest/main.go b/cmd/radigest/main.go index 015d148..ecb79d6 100644 --- a/cmd/radigest/main.go +++ b/cmd/radigest/main.go @@ -23,7 +23,7 @@ import ( ) var ( - version = "v0.3.0" + version = "v0.3.1" ) const summarySchemaVersion = 1 diff --git a/internal/digest/cut_cache_test.go b/internal/digest/cut_cache_test.go new file mode 100644 index 0000000..e3e076d --- /dev/null +++ b/internal/digest/cut_cache_test.go @@ -0,0 +1,201 @@ +package digest + +import ( + "errors" + "reflect" + "testing" + + "github.com/ericksamera/radigest/internal/enzyme" +) + +func fragmentsEqual(a, b []Fragment) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +func collectCutsForEnzyme(t *testing.T, e enzyme.Enzyme, seq []byte) []int { + t.Helper() + plan := NewPlan([]enzyme.Enzyme{e}) + return plan.Cuts(seq) +} + +func collectDigestCutsEach(t *testing.T, cutsA, cutsB []int, seqLen, min, max int, opt Options) []Fragment { + t.Helper() + var got []Fragment + if err := DigestCutsEach(cutsA, cutsB, seqLen, min, max, opt, func(fr Fragment) error { + got = append(got, fr) + return nil + }); err != nil { + t.Fatalf("DigestCutsEach returned error: %v", err) + } + return got +} + +func TestCutsReturnsSortedCutCoordinates(t *testing.T) { + got := collectCutsForEnzyme(t, enzyme.DB["EcoRI"], []byte(toyChr)) + want := []int{5, 16} + if !reflect.DeepEqual(got, want) { + t.Fatalf("EcoRI cuts mismatch: got %#v want %#v", got, want) + } +} + +func TestCutsSupportsDegenerateMotifsAndReferenceNPolicy(t *testing.T) { + apekiCuts := collectCutsForEnzyme(t, enzyme.Enzyme{Name: "ApeKI", Recognition: "G^CWGC"}, []byte("AAGCAGCAAGCTGC")) + if want := []int{3, 10}; !reflect.DeepEqual(apekiCuts, want) { + t.Fatalf("ApeKI cuts mismatch: got %#v want %#v", apekiCuts, want) + } + + hinfCuts := collectCutsForEnzyme(t, enzyme.Enzyme{Name: "HinfI", Recognition: "G^ANTC"}, []byte("GAATCCCCGANTCCCCGAATC")) + if want := []int{1, 17}; !reflect.DeepEqual(hinfCuts, want) { + t.Fatalf("HinfI cuts mismatch: got %#v want %#v", hinfCuts, want) + } +} + +func TestCutsEachPropagatesEmitError(t *testing.T) { + wantErr := errors.New("stop") + plan := NewPlan([]enzyme.Enzyme{enzyme.DB["EcoRI"]}) + err := plan.CutsEach([]byte(toyChr), func(int) error { return wantErr }) + if !errors.Is(err, wantErr) { + t.Fatalf("CutsEach error = %v, want %v", err, wantErr) + } +} + +func TestCutsEachRejectsNilEmit(t *testing.T) { + plan := NewPlan([]enzyme.Enzyme{enzyme.DB["EcoRI"]}) + if err := plan.CutsEach([]byte(toyChr), nil); err == nil { + t.Fatal("CutsEach with nil emit returned nil error") + } +} + +func TestDigestCutsEachRejectsNilEmit(t *testing.T) { + if err := DigestCutsEach([]int{1, 5}, nil, 10, 1, 100, Options{}, nil); err == nil { + t.Fatal("DigestCutsEach with nil emit returned nil error") + } +} + +func TestDigestCutsEachMatchesPlanDigestEach(t *testing.T) { + cases := []struct { + name string + seq []byte + ens []enzyme.Enzyme + opt Options + min int + max int + }{ + { + name: "single internal", + seq: []byte("AATTCCCCAATT"), + ens: []enzyme.Enzyme{{Name: "MluCI", Recognition: "^AATT"}}, + min: 1, + max: 100, + }, + { + name: "single include ends no cut", + seq: []byte("CCCCCCCC"), + ens: []enzyme.Enzyme{{Name: "EcoRI", Recognition: "G^AATTC"}}, + opt: Options{IncludeEnds: true}, + min: 1, + max: 100, + }, + { + name: "double AB BA", + seq: []byte("AAAAGAATTCTTAAAGAATTC"), + ens: []enzyme.Enzyme{ + {Name: "EcoRI", Recognition: "G^AATTC"}, + {Name: "MseI", Recognition: "T^TAA"}, + }, + min: 1, + max: 100, + }, + { + name: "double include ends", + seq: []byte("AAAAGAATTCTTAAAGAATTC"), + ens: []enzyme.Enzyme{ + {Name: "EcoRI", Recognition: "G^AATTC"}, + {Name: "MseI", Recognition: "T^TAA"}, + }, + opt: Options{IncludeEnds: true}, + min: 1, + max: 100, + }, + { + name: "same enzyme suppressed", + seq: []byte("AAAAGAATTCAAAAGAATTCAAA"), + ens: []enzyme.Enzyme{ + {Name: "EcoRI", Recognition: "G^AATTC"}, + {Name: "NcoI", Recognition: "C^CATGG"}, + }, + min: 1, + max: 100, + }, + { + name: "same enzyme allowed", + seq: []byte("AAAAGAATTCAAAAGAATTCAAA"), + ens: []enzyme.Enzyme{ + {Name: "EcoRI", Recognition: "G^AATTC"}, + {Name: "NcoI", Recognition: "C^CATGG"}, + }, + opt: Options{AllowSame: true}, + min: 1, + max: 100, + }, + { + name: "coincident zero length", + seq: []byte("AAAGATCAAAGATC"), + ens: []enzyme.Enzyme{ + {Name: "DpnII", Recognition: "^GATC"}, + {Name: "MboI", Recognition: "^GATC"}, + }, + min: 0, + max: 100, + }, + { + name: "double second enzyme has no cuts", + seq: []byte("AAAAGAATTCAAAAGAATTCAAA"), + ens: []enzyme.Enzyme{ + {Name: "EcoRI", Recognition: "G^AATTC"}, + {Name: "NcoI", Recognition: "C^CATGG"}, + }, + opt: Options{IncludeEnds: true}, + min: 1, + max: 100, + }, + { + name: "degenerate motif", + seq: []byte("AAGCAGCAAGCTGC"), + ens: []enzyme.Enzyme{{Name: "ApeKI", Recognition: "G^CWGC"}}, + min: 1, + max: 100, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + plan := NewPlanWithOptions(tc.ens, tc.opt) + want := collectDigestEach(t, plan, tc.seq, tc.min, tc.max) + + cutsA := collectCutsForEnzyme(t, tc.ens[0], tc.seq) + var cutsB []int + if len(tc.ens) > 1 { + cutsB = collectCutsForEnzyme(t, tc.ens[1], tc.seq) + } + + got := collectDigestCutsEach(t, cutsA, cutsB, len(tc.seq), tc.min, tc.max, tc.opt) + if !fragmentsEqual(got, want) { + t.Fatalf("DigestCutsEach mismatch:\n got %#v\nwant %#v", got, want) + } + + gotSlice := DigestCuts(cutsA, cutsB, len(tc.seq), tc.min, tc.max, tc.opt) + if !fragmentsEqual(gotSlice, want) { + t.Fatalf("DigestCuts mismatch:\n got %#v\nwant %#v", gotSlice, want) + } + }) + } +} diff --git a/internal/digest/digest.go b/internal/digest/digest.go index 8daab3c..3d3c80b 100644 --- a/internal/digest/digest.go +++ b/internal/digest/digest.go @@ -174,6 +174,43 @@ func emitTerminalIfKept(start, end, min, max int, emit func(Fragment) error) err return emitIfKept(start, end, min, max, emit) } +// CutsEach streams sorted cut coordinates for the first enzyme in the plan. +// Cut coordinates are motif start plus cut offset. The callback is invoked in +// deterministic genomic cut-coordinate order. If emit returns an error, +// scanning stops and that error is returned. +func (p Plan) CutsEach(seq []byte, emit func(int) error) error { + if p.m[0].mask == nil { // no enzymes compiled + return nil + } + if emit == nil { + return fmt.Errorf("digest cut emit callback is nil") + } + + scan := newCutScanner(p.m[0], seq) + for { + cut, ok := scan.next() + if !ok { + return nil + } + if err := emit(cut); err != nil { + return err + } + } +} + +// Cuts returns sorted cut coordinates for the first enzyme in the plan. +func (p Plan) Cuts(seq []byte) []int { + if p.m[0].mask == nil { + return nil + } + cuts := make([]int, 0) + _ = p.CutsEach(seq, func(cut int) error { + cuts = append(cuts, cut) + return nil + }) + return cuts +} + // DigestEach streams kept fragments to emit without materializing cut arrays or // a per-chromosome []Fragment. It supports the same modes as Digest: // - single-enzyme mode (only A configured): consecutive A cuts @@ -286,6 +323,126 @@ func (p Plan) DigestEach(seq []byte, min, max int, emit func(Fragment) error) er return nil } +// DigestCutsEach streams kept fragments from precomputed sorted cut-coordinate +// slices. If cutsB is nil, single-enzyme semantics are used for cutsA. If cutsB +// is non-nil, double-digest semantics are used for cutsA and cutsB. +// +// The behavior matches Plan.DigestEach for single digest, double digest, +// coincident cuts, AA/BB suppression, AllowSame, and IncludeEnds, assuming the +// input cut slices are sorted in ascending cut-coordinate order. +func DigestCutsEach(cutsA, cutsB []int, seqLen, min, max int, opt Options, emit func(Fragment) error) error { + if emit == nil { + return fmt.Errorf("digest emit callback is nil") + } + if seqLen < 0 { + return fmt.Errorf("digest sequence length is negative: %d", seqLen) + } + if cutsB == nil { + return digestSingleCutsEach(cutsA, seqLen, min, max, opt, emit) + } + return digestDoubleCutsEach(cutsA, cutsB, seqLen, min, max, opt, emit) +} + +func digestSingleCutsEach(cuts []int, seqLen, min, max int, opt Options, emit func(Fragment) error) error { + if len(cuts) == 0 { + if opt.IncludeEnds { + return emitTerminalIfKept(0, seqLen, min, max, emit) + } + return nil + } + if opt.IncludeEnds { + if err := emitTerminalIfKept(0, cuts[0], min, max, emit); err != nil { + return err + } + } + prevPos := cuts[0] + for _, pos := range cuts[1:] { + if err := emitIfKept(prevPos, pos, min, max, emit); err != nil { + return err + } + prevPos = pos + } + if opt.IncludeEnds { + return emitTerminalIfKept(prevPos, seqLen, min, max, emit) + } + return nil +} + +func digestDoubleCutsEach(cutsA, cutsB []int, seqLen, min, max int, opt Options, emit func(Fragment) error) error { + ai, bi := 0, 0 + prevType := -1 // 0=A, 1=B + prevPos := 0 + sawCut := false + lastPos := 0 + + for ai < len(cutsA) || bi < len(cutsB) { + var pos int + if ai < len(cutsA) && (bi >= len(cutsB) || cutsA[ai] <= cutsB[bi]) { + pos = cutsA[ai] + } else { + pos = cutsB[bi] + } + hasA := ai < len(cutsA) && cutsA[ai] == pos + hasB := bi < len(cutsB) && cutsB[bi] == pos + + if opt.IncludeEnds && !sawCut { + if err := emitTerminalIfKept(0, pos, min, max, emit); err != nil { + return err + } + } + sawCut = true + lastPos = pos + + if hasA && hasB { + // Coincident cuts are barriers. Emit one zero-length fragment for the + // site if the caller's size range allows it, then reset adjacency so no + // fragment bridges across the coincident cut. + if err := emitIfKept(pos, pos, min, max, emit); err != nil { + return err + } + ai++ + bi++ + prevType = -1 + prevPos = pos + continue + } + + curType := 0 + if hasA { + ai++ + } else { + curType = 1 + bi++ + } + + if prevType != -1 && (opt.AllowSame || prevType != curType) { + if err := emitIfKept(prevPos, pos, min, max, emit); err != nil { + return err + } + } + prevType, prevPos = curType, pos + } + if opt.IncludeEnds { + if !sawCut { + return emitTerminalIfKept(0, seqLen, min, max, emit) + } + return emitTerminalIfKept(lastPos, seqLen, min, max, emit) + } + return nil +} + +// DigestCuts returns kept fragments from precomputed sorted cut-coordinate +// slices. A nil cutsB selects single-enzyme semantics; a non-nil cutsB selects +// double-digest semantics even when the second enzyme has zero cuts. +func DigestCuts(cutsA, cutsB []int, seqLen, min, max int, opt Options) []Fragment { + out := make([]Fragment, 0) + _ = DigestCutsEach(cutsA, cutsB, seqLen, min, max, opt, func(fr Fragment) error { + out = append(out, fr) + return nil + }) + return out +} + // DigestStats returns hard-window fragment counts and bases without constructing // Fragment values or invoking a per-fragment callback. It uses the same fragment // semantics as DigestEach for single/double digest, coincident cuts, same-enzyme diff --git a/internal/screen/screen.go b/internal/screen/screen.go new file mode 100644 index 0000000..104cec5 --- /dev/null +++ b/internal/screen/screen.go @@ -0,0 +1,347 @@ +// Package screen provides cached cut-index helpers for enzyme-pair screening. +// +// The package is intentionally internal and does not define a command-line +// interface. It supports cached screening in three explicit steps: +// +// 1. Scan each candidate enzyme once per FASTA record and store cut positions. +// 2. Reuse those cut streams for many enzyme-pair scoring operations. +// 3. Produce pair summaries with the core fields consumed by +// scripts/radigest-rank-pairs. +package screen + +import ( + "encoding/json" + "fmt" + "io" + "strconv" + + "github.com/ericksamera/radigest/internal/digest" + "github.com/ericksamera/radigest/internal/enzyme" + "github.com/ericksamera/radigest/internal/fasta" + "github.com/ericksamera/radigest/internal/sizeselect" +) + +const EngineCachedCutIndex = "cached-cut-index" + +// RecordCuts stores cached cut coordinates for one FASTA record. +type RecordCuts struct { + ID string + Length int + Cuts map[string][]int +} + +// CutIndex stores per-record, per-enzyme sorted cut coordinates. +type CutIndex struct { + Records []RecordCuts + EnzymeNames []string +} + +// RecordStats summarizes hard-window fragments for one record. +type RecordStats struct { + Fragments int `json:"fragments"` + Bases int `json:"bases"` +} + +// ScreeningStats records cached-screening provenance and cache size information. +type ScreeningStats struct { + Engine string `json:"engine"` + CandidateEnzymes int `json:"candidate_enzymes"` + Records int `json:"records"` + CachedCutSites int `json:"cached_cut_sites"` + CacheMemoryEstimateBytes int64 `json:"cache_memory_estimate_bytes"` +} + +// PairSummary is the JSON-compatible summary emitted by cached pair screening. +// It intentionally preserves fields consumed by scripts/radigest-rank-pairs: +// enzymes and size_selection.raw/weighted fields. +type PairSummary struct { + SchemaVersion int `json:"schema_version"` + Enzymes []string `json:"enzymes"` + MinLength int `json:"min_length"` + MaxLength int `json:"max_length"` + TotalFragments int `json:"total_fragments"` + TotalBases int `json:"total_bases"` + PerChromosome map[string]RecordStats `json:"per_chromosome"` + SizeSelection sizeselect.Stats `json:"size_selection"` + Screening ScreeningStats `json:"screening"` +} + +// Pair identifies one unique enzyme pair. +type Pair struct { + A string + B string +} + +// BuildCutIndex scans every input record once per enzyme and stores sorted cut +// coordinates. Enzyme names must be unique because they are used as map keys. +// +// BuildCutIndex accepts a materialized record slice for tests and callers that +// already hold FASTA records in memory. Cached screening CLIs should prefer +// BuildCutIndexFromFASTA or BuildCutIndexFromRecords so sequence bytes can be +// discarded after each record is scanned. +func BuildCutIndex(records []fasta.Record, enzymes []enzyme.Enzyme, opt digest.Options) (CutIndex, error) { + names, plans, err := compileCutPlans(enzymes, opt) + if err != nil { + return CutIndex{}, err + } + + idx := CutIndex{ + Records: make([]RecordCuts, 0, len(records)), + EnzymeNames: names, + } + + for _, rec := range records { + rc, err := scanRecordCuts(rec, names, plans) + if err != nil { + return CutIndex{}, err + } + idx.Records = append(idx.Records, rc) + } + + return idx, nil +} + +// BuildCutIndexFromRecords scans records from a channel and stores only record +// IDs, record lengths, and per-enzyme cut coordinates. Sequence bytes are not +// retained after each record has been scanned. +func BuildCutIndexFromRecords(records <-chan fasta.Record, enzymes []enzyme.Enzyme, opt digest.Options) (CutIndex, error) { + if records == nil { + return CutIndex{}, fmt.Errorf("screen cut index: records channel is nil") + } + + names, plans, err := compileCutPlans(enzymes, opt) + if err != nil { + return CutIndex{}, err + } + + idx := CutIndex{ + Records: make([]RecordCuts, 0), + EnzymeNames: names, + } + + for rec := range records { + rc, err := scanRecordCuts(rec, names, plans) + if err != nil { + return CutIndex{}, err + } + idx.Records = append(idx.Records, rc) + } + + return idx, nil +} + +// BuildCutIndexFromFASTA streams a FASTA file and builds a cut index without +// retaining full reference sequences. This is the preferred entry point for +// cached pair-screening commands on large genomes. +func BuildCutIndexFromFASTA(path string, enzymes []enzyme.Enzyme, opt digest.Options) (CutIndex, error) { + ch := make(chan fasta.Record) + errCh := make(chan error, 1) + go func() { + errCh <- fasta.Stream(path, ch) + }() + + idx, buildErr := BuildCutIndexFromRecords(ch, enzymes, opt) + if buildErr != nil { + for range ch { + // Drain the FASTA stream so fasta.Stream can return its error and the + // goroutine cannot block while sending a later record. + } + } + streamErr := <-errCh + if buildErr != nil { + return CutIndex{}, buildErr + } + if streamErr != nil { + return CutIndex{}, streamErr + } + return idx, nil +} + +func compileCutPlans(enzymes []enzyme.Enzyme, opt digest.Options) ([]string, []digest.Plan, error) { + if len(enzymes) == 0 { + return nil, nil, fmt.Errorf("screen cut index: no enzymes provided") + } + + names := make([]string, 0, len(enzymes)) + plans := make([]digest.Plan, 0, len(enzymes)) + seen := make(map[string]struct{}, len(enzymes)) + + for _, enz := range enzymes { + if enz.Name == "" { + return nil, nil, fmt.Errorf("screen cut index: enzyme with empty name") + } + if _, ok := seen[enz.Name]; ok { + return nil, nil, fmt.Errorf("screen cut index: duplicate enzyme name %q", enz.Name) + } + seen[enz.Name] = struct{}{} + names = append(names, enz.Name) + + plan, err := digest.TryNewPlanWithOptions([]enzyme.Enzyme{enz}, digest.Options{StrictCuts: opt.StrictCuts}) + if err != nil { + return nil, nil, err + } + plans = append(plans, plan) + } + + return names, plans, nil +} + +func scanRecordCuts(rec fasta.Record, names []string, plans []digest.Plan) (RecordCuts, error) { + if rec.ID == "" { + return RecordCuts{}, fmt.Errorf("screen cut index: record with empty ID") + } + rc := RecordCuts{ + ID: rec.ID, + Length: len(rec.Seq), + Cuts: make(map[string][]int, len(names)), + } + for i, plan := range plans { + rc.Cuts[names[i]] = plan.Cuts(rec.Seq) + } + return rc, nil +} + +// ContainsEnzyme reports whether the cut index includes name. +func (idx CutIndex) ContainsEnzyme(name string) bool { + for _, enzymeName := range idx.EnzymeNames { + if enzymeName == name { + return true + } + } + return false +} + +// PairNames returns all unique enzyme pairs in cut-index enzyme order. +func (idx CutIndex) PairNames() []Pair { + pairs := make([]Pair, 0, len(idx.EnzymeNames)*(len(idx.EnzymeNames)-1)/2) + for i := 0; i < len(idx.EnzymeNames); i++ { + for j := i + 1; j < len(idx.EnzymeNames); j++ { + pairs = append(pairs, Pair{A: idx.EnzymeNames[i], B: idx.EnzymeNames[j]}) + } + } + return pairs +} + +// CachedCutSites returns the total number of cached cut coordinates. +func (idx CutIndex) CachedCutSites() int { + total := 0 + for _, rec := range idx.Records { + for _, cuts := range rec.Cuts { + total += len(cuts) + } + } + return total +} + +// CacheMemoryEstimateBytes returns an approximate in-memory size for cached cut +// coordinates only. It intentionally excludes map, slice, and string overhead. +func (idx CutIndex) CacheMemoryEstimateBytes() int64 { + return int64(idx.CachedCutSites()) * int64(strconv.IntSize/8) +} + +// ScorePair scores one enzyme pair from cached cut-coordinate streams. +func ScorePair(idx CutIndex, enzymeA, enzymeB string, selector sizeselect.Selector, opt digest.Options) (PairSummary, error) { + if enzymeA == "" || enzymeB == "" { + return PairSummary{}, fmt.Errorf("screen score pair: enzyme names must be non-empty") + } + if enzymeA == enzymeB { + return PairSummary{}, fmt.Errorf("screen score pair: self-pair %q is not supported", enzymeA) + } + if !idx.ContainsEnzyme(enzymeA) { + return PairSummary{}, fmt.Errorf("screen score pair: enzyme %q not found in cut index", enzymeA) + } + if !idx.ContainsEnzyme(enzymeB) { + return PairSummary{}, fmt.Errorf("screen score pair: enzyme %q not found in cut index", enzymeB) + } + + cfg := selector.Config() + digestMin := minInt(cfg.Min, cfg.ScoreMin) + digestMax := maxInt(cfg.Max, cfg.ScoreMax) + + sizeStats := sizeselect.NewStats(selector) + perChromosome := make(map[string]RecordStats, len(idx.Records)) + totalFragments := 0 + totalBases := 0 + + for _, rec := range idx.Records { + cutsA := rec.Cuts[enzymeA] + cutsB := rec.Cuts[enzymeB] + local := RecordStats{} + + err := digest.DigestCutsEach(cutsA, cutsB, rec.Length, digestMin, digestMax, opt, func(fr digest.Fragment) error { + length := fr.End - fr.Start + hardKept := selector.InHardWindow(length) + if hardKept { + sizeStats.AddHardKept(length) + local.Fragments++ + local.Bases += length + totalFragments++ + totalBases += length + } + if selector.InScoreRange(length) { + sizeStats.AddScored(length, selector.Weight(length)) + } + return nil + }) + if err != nil { + return PairSummary{}, err + } + perChromosome[rec.ID] = local + } + + return PairSummary{ + SchemaVersion: 1, + Enzymes: []string{enzymeA, enzymeB}, + MinLength: cfg.Min, + MaxLength: cfg.Max, + TotalFragments: totalFragments, + TotalBases: totalBases, + PerChromosome: perChromosome, + SizeSelection: sizeStats, + Screening: ScreeningStats{ + Engine: EngineCachedCutIndex, + CandidateEnzymes: len(idx.EnzymeNames), + Records: len(idx.Records), + CachedCutSites: idx.CachedCutSites(), + CacheMemoryEstimateBytes: idx.CacheMemoryEstimateBytes(), + }, + }, nil +} + +// ScoreAllPairs scores all unique enzyme pairs in cut-index order. +func ScoreAllPairs(idx CutIndex, selector sizeselect.Selector, opt digest.Options) ([]PairSummary, error) { + pairs := idx.PairNames() + summaries := make([]PairSummary, 0, len(pairs)) + for _, pair := range pairs { + summary, err := ScorePair(idx, pair.A, pair.B, selector, opt) + if err != nil { + return nil, err + } + summaries = append(summaries, summary) + } + return summaries, nil +} + +// WritePairSummaryJSON writes one pair summary as indented JSON. +func WritePairSummaryJSON(w io.Writer, summary PairSummary) error { + if w == nil { + return fmt.Errorf("screen pair summary writer is nil") + } + enc := json.NewEncoder(w) + enc.SetIndent("", " ") + return enc.Encode(summary) +} + +func minInt(a, b int) int { + if a < b { + return a + } + return b +} + +func maxInt(a, b int) int { + if a > b { + return a + } + return b +} diff --git a/internal/screen/screen_test.go b/internal/screen/screen_test.go new file mode 100644 index 0000000..36e0306 --- /dev/null +++ b/internal/screen/screen_test.go @@ -0,0 +1,355 @@ +package screen + +import ( + "bytes" + "encoding/json" + "math" + "os" + "path/filepath" + "reflect" + "testing" + + "github.com/ericksamera/radigest/internal/digest" + "github.com/ericksamera/radigest/internal/enzyme" + "github.com/ericksamera/radigest/internal/fasta" + "github.com/ericksamera/radigest/internal/sizeselect" +) + +func testSelector(t *testing.T) sizeselect.Selector { + t.Helper() + sel, err := sizeselect.New(sizeselect.Config{ + Model: sizeselect.ModelSoftWindow, + Min: 1, + Max: 100, + ScoreMin: 1, + ScoreMax: 100, + EdgeSD: 10, + }) + if err != nil { + t.Fatalf("sizeselect.New returned error: %v", err) + } + return sel +} + +func testRecords() []fasta.Record { + return []fasta.Record{ + {ID: "toy", Seq: []byte("AAAAGAATTCTTAAAGAATTCTTT")}, + {ID: "nocut", Seq: []byte("CCCCCCCC")}, + } +} + +func testEnzymes() []enzyme.Enzyme { + return []enzyme.Enzyme{ + {Name: "EcoRI", Recognition: "G^AATTC"}, + {Name: "MseI", Recognition: "T^TAA"}, + {Name: "ApeKI", Recognition: "G^CWGC"}, + } +} + +func expectedFromPlan(t *testing.T, records []fasta.Record, ens []enzyme.Enzyme, selector sizeselect.Selector, opt digest.Options) PairSummary { + t.Helper() + plan, err := digest.TryNewPlanWithOptions(ens, opt) + if err != nil { + t.Fatalf("TryNewPlanWithOptions returned error: %v", err) + } + + cfg := selector.Config() + digestMin := minInt(cfg.Min, cfg.ScoreMin) + digestMax := maxInt(cfg.Max, cfg.ScoreMax) + sizeStats := sizeselect.NewStats(selector) + perChromosome := make(map[string]RecordStats, len(records)) + totalFragments := 0 + totalBases := 0 + + for _, rec := range records { + local := RecordStats{} + err := plan.DigestEach(rec.Seq, digestMin, digestMax, func(fr digest.Fragment) error { + length := fr.End - fr.Start + hardKept := selector.InHardWindow(length) + if hardKept { + sizeStats.AddHardKept(length) + local.Fragments++ + local.Bases += length + totalFragments++ + totalBases += length + } + if selector.InScoreRange(length) { + sizeStats.AddScored(length, selector.Weight(length)) + } + return nil + }) + if err != nil { + t.Fatalf("DigestEach returned error: %v", err) + } + perChromosome[rec.ID] = local + } + + cfg = selector.Config() + return PairSummary{ + Enzymes: []string{ens[0].Name, ens[1].Name}, + MinLength: cfg.Min, + MaxLength: cfg.Max, + TotalFragments: totalFragments, + TotalBases: totalBases, + PerChromosome: perChromosome, + SizeSelection: sizeStats, + } +} + +func assertFloatNear(t *testing.T, name string, got, want float64) { + t.Helper() + if math.Abs(got-want) > 1e-9 { + t.Fatalf("%s got %.12g want %.12g", name, got, want) + } +} + +func assertSummaryMatches(t *testing.T, got, want PairSummary) { + t.Helper() + if !reflect.DeepEqual(got.Enzymes, want.Enzymes) { + t.Fatalf("enzymes got %#v want %#v", got.Enzymes, want.Enzymes) + } + if got.MinLength != want.MinLength || got.MaxLength != want.MaxLength { + t.Fatalf("length window got %d-%d want %d-%d", got.MinLength, got.MaxLength, want.MinLength, want.MaxLength) + } + if got.TotalFragments != want.TotalFragments || got.TotalBases != want.TotalBases { + t.Fatalf("totals got fragments=%d bases=%d want fragments=%d bases=%d", got.TotalFragments, got.TotalBases, want.TotalFragments, want.TotalBases) + } + if !reflect.DeepEqual(got.PerChromosome, want.PerChromosome) { + t.Fatalf("per-chromosome stats got %#v want %#v", got.PerChromosome, want.PerChromosome) + } + if got.SizeSelection.RawFragmentsScored != want.SizeSelection.RawFragmentsScored || + got.SizeSelection.RawBasesScored != want.SizeSelection.RawBasesScored || + got.SizeSelection.RawFragmentsInWindow != want.SizeSelection.RawFragmentsInWindow || + got.SizeSelection.RawBasesInWindow != want.SizeSelection.RawBasesInWindow { + t.Fatalf("raw size stats got %#v want %#v", got.SizeSelection, want.SizeSelection) + } + assertFloatNear(t, "weighted fragments", got.SizeSelection.WeightedFragments, want.SizeSelection.WeightedFragments) + assertFloatNear(t, "weighted bases", got.SizeSelection.WeightedBases, want.SizeSelection.WeightedBases) + assertFloatNear(t, "mean weighted length", got.SizeSelection.MeanWeightedLength, want.SizeSelection.MeanWeightedLength) +} + +func TestBuildCutIndexFromRecordsMatchesBuildCutIndex(t *testing.T) { + records := testRecords() + ch := make(chan fasta.Record) + go func() { + defer close(ch) + for _, rec := range records { + ch <- rec + } + }() + + got, err := BuildCutIndexFromRecords(ch, testEnzymes(), digest.Options{}) + if err != nil { + t.Fatalf("BuildCutIndexFromRecords returned error: %v", err) + } + want, err := BuildCutIndex(records, testEnzymes(), digest.Options{}) + if err != nil { + t.Fatalf("BuildCutIndex returned error: %v", err) + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("streamed cut index got %#v want %#v", got, want) + } +} + +func TestBuildCutIndexFromRecordsRejectsNilChannel(t *testing.T) { + _, err := BuildCutIndexFromRecords(nil, testEnzymes(), digest.Options{}) + if err == nil { + t.Fatal("BuildCutIndexFromRecords accepted nil records channel") + } +} + +func TestBuildCutIndexFromFASTAMatchesBuildCutIndex(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "records.fa") + data := []byte(`>toy +AAAAGAATTCTTAAAGAATTCTTT +>nocut +CCCCCCCC +`) + if err := os.WriteFile(path, data, 0o644); err != nil { + t.Fatalf("write FASTA: %v", err) + } + + got, err := BuildCutIndexFromFASTA(path, testEnzymes(), digest.Options{}) + if err != nil { + t.Fatalf("BuildCutIndexFromFASTA returned error: %v", err) + } + want, err := BuildCutIndex(testRecords(), testEnzymes(), digest.Options{}) + if err != nil { + t.Fatalf("BuildCutIndex returned error: %v", err) + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("FASTA-streamed cut index got %#v want %#v", got, want) + } +} + +func TestBuildCutIndexCachesExpectedCuts(t *testing.T) { + idx, err := BuildCutIndex(testRecords(), testEnzymes(), digest.Options{}) + if err != nil { + t.Fatalf("BuildCutIndex returned error: %v", err) + } + if len(idx.Records) != 2 { + t.Fatalf("records got %d want 2", len(idx.Records)) + } + got := idx.Records[0].Cuts["EcoRI"] + want := []int{5, 16} + if !reflect.DeepEqual(got, want) { + t.Fatalf("EcoRI cuts got %#v want %#v", got, want) + } + got = idx.Records[0].Cuts["MseI"] + want = []int{11} + if !reflect.DeepEqual(got, want) { + t.Fatalf("MseI cuts got %#v want %#v", got, want) + } + if idx.CachedCutSites() != 3 { + t.Fatalf("cached cut sites got %d want 3", idx.CachedCutSites()) + } + if idx.CacheMemoryEstimateBytes() <= 0 { + t.Fatalf("cache memory estimate should be positive") + } +} + +func TestBuildCutIndexRejectsDuplicateEnzymeNames(t *testing.T) { + _, err := BuildCutIndex(testRecords(), []enzyme.Enzyme{ + {Name: "EcoRI", Recognition: "G^AATTC"}, + {Name: "EcoRI", Recognition: "G^AATTC"}, + }, digest.Options{}) + if err == nil { + t.Fatal("BuildCutIndex accepted duplicate enzyme names") + } +} + +func TestPairNames(t *testing.T) { + idx, err := BuildCutIndex(testRecords(), testEnzymes(), digest.Options{}) + if err != nil { + t.Fatalf("BuildCutIndex returned error: %v", err) + } + want := []Pair{{A: "EcoRI", B: "MseI"}, {A: "EcoRI", B: "ApeKI"}, {A: "MseI", B: "ApeKI"}} + if got := idx.PairNames(); !reflect.DeepEqual(got, want) { + t.Fatalf("PairNames got %#v want %#v", got, want) + } +} + +func TestScorePairMatchesPlanDigestEach(t *testing.T) { + records := testRecords() + ens := testEnzymes() + idx, err := BuildCutIndex(records, ens, digest.Options{}) + if err != nil { + t.Fatalf("BuildCutIndex returned error: %v", err) + } + sel := testSelector(t) + + got, err := ScorePair(idx, "EcoRI", "MseI", sel, digest.Options{}) + if err != nil { + t.Fatalf("ScorePair returned error: %v", err) + } + want := expectedFromPlan(t, records, ens[:2], sel, digest.Options{}) + assertSummaryMatches(t, got, want) + if got.Screening.Engine != EngineCachedCutIndex { + t.Fatalf("screening engine got %q", got.Screening.Engine) + } +} + +func TestScoreAllPairs(t *testing.T) { + idx, err := BuildCutIndex(testRecords(), testEnzymes(), digest.Options{}) + if err != nil { + t.Fatalf("BuildCutIndex returned error: %v", err) + } + summaries, err := ScoreAllPairs(idx, testSelector(t), digest.Options{}) + if err != nil { + t.Fatalf("ScoreAllPairs returned error: %v", err) + } + if len(summaries) != 3 { + t.Fatalf("ScoreAllPairs returned %d summaries, want 3", len(summaries)) + } +} + +func TestScorePairMatchesPlanDigestEachWithOptions(t *testing.T) { + records := []fasta.Record{ + {ID: "same", Seq: []byte("AAAAGAATTCAAAAGAATTCAAA")}, + } + ens := []enzyme.Enzyme{ + {Name: "EcoRI", Recognition: "G^AATTC"}, + {Name: "NcoI", Recognition: "C^CATGG"}, + } + opt := digest.Options{AllowSame: true, IncludeEnds: true} + idx, err := BuildCutIndex(records, ens, opt) + if err != nil { + t.Fatalf("BuildCutIndex returned error: %v", err) + } + sel := testSelector(t) + + got, err := ScorePair(idx, "EcoRI", "NcoI", sel, opt) + if err != nil { + t.Fatalf("ScorePair returned error: %v", err) + } + want := expectedFromPlan(t, records, ens, sel, opt) + assertSummaryMatches(t, got, want) +} + +func TestScorePairMatchesPlanDigestEachWithCoincidentCuts(t *testing.T) { + records := []fasta.Record{{ID: "coincident", Seq: []byte("AAAGATCAAAGATC")}} + ens := []enzyme.Enzyme{ + {Name: "DpnII", Recognition: "^GATC"}, + {Name: "MboI", Recognition: "^GATC"}, + } + idx, err := BuildCutIndex(records, ens, digest.Options{}) + if err != nil { + t.Fatalf("BuildCutIndex returned error: %v", err) + } + sel, err := sizeselect.New(sizeselect.Config{Model: sizeselect.ModelHard, Min: 0, Max: 100, ScoreMin: 0, ScoreMax: 100}) + if err != nil { + t.Fatalf("sizeselect.New returned error: %v", err) + } + got, err := ScorePair(idx, "DpnII", "MboI", sel, digest.Options{}) + if err != nil { + t.Fatalf("ScorePair returned error: %v", err) + } + want := expectedFromPlan(t, records, ens, sel, digest.Options{}) + assertSummaryMatches(t, got, want) +} + +func TestScorePairReportsMissingEnzyme(t *testing.T) { + idx, err := BuildCutIndex(testRecords(), testEnzymes(), digest.Options{}) + if err != nil { + t.Fatalf("BuildCutIndex returned error: %v", err) + } + _, err = ScorePair(idx, "EcoRI", "Missing", testSelector(t), digest.Options{}) + if err == nil { + t.Fatal("ScorePair accepted missing enzyme") + } +} + +func TestWritePairSummaryJSON(t *testing.T) { + idx, err := BuildCutIndex(testRecords(), testEnzymes(), digest.Options{}) + if err != nil { + t.Fatalf("BuildCutIndex returned error: %v", err) + } + summary, err := ScorePair(idx, "EcoRI", "MseI", testSelector(t), digest.Options{}) + if err != nil { + t.Fatalf("ScorePair returned error: %v", err) + } + + var buf bytes.Buffer + if err := WritePairSummaryJSON(&buf, summary); err != nil { + t.Fatalf("WritePairSummaryJSON returned error: %v", err) + } + + var decoded struct { + Enzymes []string `json:"enzymes"` + SizeSelection sizeselect.Stats `json:"size_selection"` + Screening ScreeningStats `json:"screening"` + } + if err := json.Unmarshal(buf.Bytes(), &decoded); err != nil { + t.Fatalf("summary JSON did not unmarshal: %v", err) + } + if !reflect.DeepEqual(decoded.Enzymes, []string{"EcoRI", "MseI"}) { + t.Fatalf("decoded enzymes got %#v", decoded.Enzymes) + } + if decoded.SizeSelection.RawFragmentsInWindow != summary.SizeSelection.RawFragmentsInWindow { + t.Fatalf("decoded size selection mismatch") + } + if decoded.Screening.Engine != EngineCachedCutIndex { + t.Fatalf("decoded engine got %q", decoded.Screening.Engine) + } +}