diff --git a/R/Functions.R b/R/Functions.R index 9c1010d..7cf0713 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -314,6 +314,34 @@ pairwiseMutions <- function(germ_imgt, } # ***************************************************************************** +# ***************************************************************************** +# Wrapper around fastDist_rcpp: returns a proper dist object with Labels +fastDist <- function(seqs) { + n <- length(seqs) + v <- fastDist_rcpp(seqs) + structure(v, + class = "dist", + Size = n, + Labels = names(seqs), + Diag = FALSE, + Upper = FALSE) +} +# ***************************************************************************** + +# ***************************************************************************** +# Wrapper around fastDistAA_rcpp: returns a proper dist object with Labels +fastDistAA <- function(seqs) { + n <- length(seqs) + v <- fastDistAA_rcpp(seqs) + structure(v, + class = "dist", + Size = n, + Labels = names(seqs), + Diag = FALSE, + Upper = FALSE) +} +# ***************************************************************************** + # ***************************************************************************** ### make a dataframe of unique seqs in each clone uniqueSeq <- function(seqs) { @@ -1966,7 +1994,7 @@ hierarchicalClones_helper <- function(db_gp, # calculate distance matrix if (method == "nt") { if (! IUPAC){ - dist_mtx <- fastDist_rcpp(seqs_unq) + dist_mtx <- fastDist(seqs_unq) } else{ dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index c447597..39e0a74 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -35,7 +35,7 @@ BEGIN_RCPP END_RCPP } // fastDist_rcpp -IntegerMatrix fastDist_rcpp(CharacterVector seqs); +IntegerVector fastDist_rcpp(CharacterVector seqs); RcppExport SEXP _scoper_fastDist_rcpp(SEXP seqsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; diff --git a/src/fastDist.cpp b/src/fastDist.cpp index d0c94af..85ecc72 100644 --- a/src/fastDist.cpp +++ b/src/fastDist.cpp @@ -37,7 +37,7 @@ int countSeqsWithInvalidBases_rcpp(CharacterVector seqs) { } // [[Rcpp::export]] -IntegerMatrix fastDist_rcpp(CharacterVector seqs) { +IntegerVector fastDist_rcpp(CharacterVector seqs) { int N = seqs.size(); if (N == 0) stop("empty input"); @@ -60,7 +60,16 @@ IntegerMatrix fastDist_rcpp(CharacterVector seqs) { } } - IntegerMatrix Mmatch(N, N); // zero-initialized + // lower-triangle vector (column-major, matching R's dist storage): + // element at row i, col j (i > j, 0-indexed) maps to position: + // j*N - j*(j+1)/2 + (i-j) - 1 + size_t tri_size = (size_t)N * (N - 1) / 2; + IntegerVector tri(tri_size); // zero-initialized + + auto tri_idx = [&](int i, int j) -> size_t { + if (i < j) std::swap(i, j); + return (size_t)j * N - (size_t)j * (j + 1) / 2 + (i - j) - 1; + }; for (int p = 0; p < L; ++p) { std::vector A, Cb, G, Tt, Ns, Q; @@ -78,28 +87,20 @@ IntegerMatrix fastDist_rcpp(CharacterVector seqs) { } } - // column-major safe updaters: iterate over j (column) outermost, - // then use a pointer to column j and bump rows i. - auto bump_within = [&](const std::vector& v){ + // increment lower-triangle match counts for pairs within the same bucket + auto bump_within = [&](const std::vector& v) { int m = (int)v.size(); - for (int jj = 0; jj < m; ++jj) { - int j = v[jj]; - int* colj = &Mmatch(0, j); // pointer to column j - for (int ii = 0; ii < m; ++ii) { - colj[v[ii]] += 1; // increment (i = v[ii], j) - } - } + for (int ii = 1; ii < m; ++ii) + for (int jj = 0; jj < ii; ++jj) + tri[tri_idx(v[ii], v[jj])] += 1; }; - auto bump_pairs = [&](const std::vector& X, const std::vector& Y){ - int mx = (int)X.size(), my = (int)Y.size(); - for (int jj = 0; jj < my; ++jj) { - int j = Y[jj]; - int* colj = &Mmatch(0, j); // pointer to column j - for (int ii = 0; ii < mx; ++ii) { - colj[X[ii]] += 1; // increment (i = X[ii], j) - } - } + // increment lower-triangle match counts for all cross-bucket pairs + auto bump_pairs = [&](const std::vector& X, const std::vector& Y) { + for (int xi : X) + for (int yj : Y) + if (xi != yj) + tri[tri_idx(xi, yj)] += 1; }; // same known base @@ -112,7 +113,7 @@ IntegerMatrix fastDist_rcpp(CharacterVector seqs) { K.insert(K.end(), G.begin(), G.end()); K.insert(K.end(), Tt.begin(), Tt.end()); bump_pairs(Ns, K); - bump_pairs(K, Ns); + // bump_pairs(K, Ns); // The bump_pairs(Ns, K) call already handles all N-vs-known-base pairs in both directions because tri_idx normalizes (i,j) and (j,i) to the same lower-triangle position. bump_within(Ns); // N-N is a match (consistent with getDNAMatrix(gap=0)) // ? with ? @@ -120,11 +121,8 @@ IntegerMatrix fastDist_rcpp(CharacterVector seqs) { } // convert matches -> distances - for (int i = 0; i < N; ++i) { - for (int j = 0; j < N; ++j) { - Mmatch(i, j) = L - Mmatch(i, j); - } - Mmatch(i, i) = 0; - } - return Mmatch; + for (int k = 0; k < tri_size; ++k) + tri[k] = L - tri[k]; + + return tri; } diff --git a/src/fastDistAA.cpp b/src/fastDistAA.cpp new file mode 100644 index 0000000..c5d8083 --- /dev/null +++ b/src/fastDistAA.cpp @@ -0,0 +1,128 @@ +#include +using namespace Rcpp; + +// A=0 C=1 D=2 E=3 F=4 G=5 H=6 I=7 K=8 L=9 +// M=10 N=11 P=12 Q=13 R=14 S=15 T=16 V=17 W=18 Y=19 +// X=20 (wildcard) ?=21 +inline uint8_t code_aa(char c){ + switch(c){ + case 'A': return 0; case 'C': return 1; case 'D': return 2; + case 'E': return 3; case 'F': return 4; case 'G': return 5; + case 'H': return 6; case 'I': return 7; case 'K': return 8; + case 'L': return 9; case 'M': return 10; case 'N': return 11; + case 'P': return 12; case 'Q': return 13; case 'R': return 14; + case 'S': return 15; case 'T': return 16; case 'V': return 17; + case 'W': return 18; case 'Y': return 19; + case 'X': return 20; case '?': return 21; + default: return 255; + } +} + +// [[Rcpp::export]] +int countAASeqsWithInvalidChars_rcpp(CharacterVector seqs) { + int N = seqs.size(); + int bad = 0; + + for (int i = 0; i < N; ++i) { + if (seqs[i] == NA_STRING) { + bad++; + continue; + } + + std::string s = as(seqs[i]); + bool invalid = false; + for (char ch : s) { + char up = (char)std::toupper((unsigned char)ch); + if (code_aa(up) == 255) { + invalid = true; + break; + } + } + + if (invalid) bad++; + } + + return bad; +} + +// [[Rcpp::export]] +IntegerVector fastDistAA_rcpp(CharacterVector seqs) { + int N = seqs.size(); + + if (N == 0) stop("empty input"); + + std::string s0 = as(seqs[0]); + int L = (int)s0.size(); + for (int i = 0; i < N; ++i) { + if ((int)std::string(as(seqs[i])).size() != L) + stop("All sequences must have the same length"); + } + + // encode to uint8: N x L, row-major in a flat vector + std::vector enc((size_t)N * L); + for (int i = 0; i < N; ++i) { + std::string s = as(seqs[i]); + for (int p = 0; p < L; ++p) { + char up = (char)std::toupper((unsigned char)s[p]); + uint8_t c = code_aa(up); + if (c == 255) stop("Only the 20 standard AAs plus X and ? are allowed"); + enc[(size_t)i * L + p] = c; + } + } + + // lower-triangle vector (column-major, matching R's dist storage): + // element at row i, col j (i > j, 0-indexed) maps to: + // j*N - j*(j+1)/2 + (i-j) - 1 + size_t tri_size = (size_t)N * (N - 1) / 2; + IntegerVector tri(tri_size); // zero-initialized + + auto tri_idx = [&](int i, int j) -> size_t { + if (i < j) std::swap(i, j); + return (size_t)j * N - (size_t)j * (j + 1) / 2 + (i - j) - 1; + }; + + for (int p = 0; p < L; ++p) { + // one bucket per code (0-19 known AAs, 20 = X, 21 = ?) + std::vector buckets[22]; + for (int b = 0; b < 22; ++b) buckets[b].reserve(8); + + for (int i = 0; i < N; ++i) + buckets[enc[(size_t)i * L + p]].push_back(i); + + // increment lower-triangle match counts for pairs within the same bucket + auto bump_within = [&](const std::vector& v) { + int m = (int)v.size(); + for (int ii = 1; ii < m; ++ii) + for (int jj = 0; jj < ii; ++jj) + tri[tri_idx(v[ii], v[jj])] += 1; + }; + + // increment lower-triangle match counts for all cross-bucket pairs + auto bump_pairs = [&](const std::vector& X, const std::vector& Y) { + for (int xi : X) + for (int yj : Y) + if (xi != yj) + tri[tri_idx(xi, yj)] += 1; + }; + + // same known AA + for (int b = 0; b < 20; ++b) + bump_within(buckets[b]); + + // X (wildcard) with any known AA — but not X-X + std::vector K; + K.reserve(N); + for (int b = 0; b < 20; ++b) + K.insert(K.end(), buckets[b].begin(), buckets[b].end()); + bump_pairs(buckets[20], K); + + // ? with ? + bump_within(buckets[21]); + } + + // convert matches -> distances + for (size_t k = 0; k < tri_size; ++k) + tri[k] = L - tri[k]; + + return tri; +} diff --git a/src/fastDist_explainer.html b/src/fastDist_explainer.html new file mode 100644 index 0000000..b705975 --- /dev/null +++ b/src/fastDist_explainer.html @@ -0,0 +1,436 @@ + + + + + +fastDist.cpp — Algorithm Explanation + + + + +

fastDist.cpp

+

Algorithm explanation with interactive walkthrough

+ +

Overview

+

+ fastDist.cpp implements two R-callable C++ functions via Rcpp for working with aligned DNA sequences. + The core function, fastDist_rcpp, computes a pairwise Hamming distance matrix + for N sequences of equal length L. It returns a flat integer vector in R's lower-triangle dist format, + ready to be passed directly to structure(..., class="dist"). +

+ +

Helper: code_char

+

Encodes each nucleotide character as a small integer for fast comparison:

+ + + + + + + + + +
CharacterCodeMeaning
A0Adenine
C1Cytosine
G2Guanine
T3Thymine
N4Ambiguous — any base
?5Unknown — do not assume
anything else255Invalid — triggers error
+ +

Function 1: countSeqsWithInvalidBases_rcpp

+

+ A validation pass. Loops over a vector of sequences and counts how many contain characters outside + the allowed set (A, C, G, T, N, ?). NA strings are also counted as invalid. +

+ +

Function 2: fastDist_rcpp

+ +

Step 1 — Encode

+

+ All N sequences (each of length L) are encoded into a flat uint8 array of shape N×L (row-major) + using code_char. This makes column-wise access cache-friendly. +

+ +

Step 2 — Column-wise bucket counting

+

+ Rather than the naïve O(N²×L) loop comparing every pair at every position, the algorithm works + column by column and uses a bucketing trick: +

+

For each alignment column p:

+
    +
  1. Sequences are partitioned into 6 buckets by their symbol at that column (A, C, G, T, N, ?).
  2. +
  3. Pairs within the same known-base bucket (A–A, C–C, G–G, T–T) get +1 in the match triangle — they agree here.
  4. +
  5. Pairs where one sequence has N and the other has a known base also get +1 — N is treated as matching anything known (standard IUPAC convention).
  6. +
  7. Pairs where both sequences have ? also get +1.
  8. +
  9. All other pairs get nothing — they disagree or are too ambiguous to call.
  10. +
+ +
+ Why this is fast: The naïve approach iterates all N(N−1)/2 pairs for every position. + This approach iterates only N sequences per column to fill the buckets, then iterates within/across + small bucket sizes — much cheaper when sequences share bases at each column, which is typical in aligned biological data. +
+ +

Matching rules summary

+ + + + + + + + + +
Pair typeCreditRationale
A–A, C–C, G–G, T–T+1Identical known base
N vs known base+1N could be that base (IUPAC)
? vs ?+1Two unknowns assumed to match
A vs C, G vs T, etc.0Genuine mismatch
N vs N0Two unknowns don't confirm agreement
N vs ?0Both ambiguous — no credit
? vs known base0Unknown treated conservatively
+ +

+ Note the asymmetry between N and ?: N says "I could be anything, assume I match a known base," + while ? says "I'm unknown — don't assume anything." This makes ? strictly more conservative than N. +

+ +

Step 3 — Convert matches to distances

+

+ After all L columns are processed, each triangle entry holds the number of positions where two sequences + are considered to agree. The final conversion is simply: +

+
distance(i, j) = L − matches(i, j)
+ +

Output format

+

+ The returned IntegerVector is a flat lower-triangle in column-major order, + matching R's built-in dist object storage. The index formula for pair (i, j) with i > j (0-indexed) is: +

+
tri_idx(i, j) = j*N − j*(j+1)/2 + (i−j) − 1
+ +

Interactive Walkthrough

+

+ The widget below steps through the algorithm on a concrete example of 4 sequences of length 5, + including one ? character. Use the next / prev buttons to advance through + each column's bucket-and-credit phase. The triangle updates live as matches accumulate, and the + final step converts match counts to Hamming distances. +

+

Input sequences:

+
Seq 0:  A C G T N
+Seq 1:  A G G T A
+Seq 2:  C C G ? A
+Seq 3:  A C T T N
+ + +
+
Step-by-step: column bucketing & match accumulation
+ +
+
+ +
+ +
+
+ +
+ + + +
+ +
+ +
+
Buckets at this column
+
+
+ +
+
Pairs credited (+1 match)
+
+
+
+ +
+
Triangle (match counts → distances)
+
+
+
+
+ + +

Source reference

+

Key functions in fastDist.cpp:

+ + + + + +
FunctionPurpose
code_char(c)Maps A/C/G/T/N/? to 0–5; returns 255 for invalid characters
countSeqsWithInvalidBases_rcpp(seqs)Returns count of sequences containing invalid characters or NA
fastDist_rcpp(seqs)Returns lower-triangle Hamming distance vector in R dist format
+ + + + diff --git a/testAA.R b/testAA.R new file mode 100644 index 0000000..18ebf13 --- /dev/null +++ b/testAA.R @@ -0,0 +1,77 @@ +library(Rcpp) +sourceCpp("src/fastDistAA.cpp") + +fastDistAA <- function(seqs) { + n <- length(seqs) + v <- fastDistAA_rcpp(seqs) + structure(v, class="dist", Size=n, Labels=names(seqs), Diag=FALSE, Upper=FALSE) +} + +# Naive reference: pure Hamming with X/? semantics matching fastDistAA +# Rules: +# - same known AA -> match +# - X vs known AA (either order) -> match (wildcard) +# - ? vs ? -> match +# - everything else -> mismatch +naiveDistAA <- function(s1, s2) { + c1 <- strsplit(toupper(s1), "")[[1]] + c2 <- strsplit(toupper(s2), "")[[1]] + known <- c("A","C","D","E","F","G","H","I","K","L", + "M","N","P","Q","R","S","T","V","W","Y") + mismatches <- 0L + for (i in seq_along(c1)) { + a <- c1[i]; b <- c2[i] + is_match <- (a == b && a %in% known) || + (a == "X" && b %in% known) || + (b == "X" && a %in% known) || + (a == "?" && b == "?") + if (!is_match) mismatches <- mismatches + 1L + } + mismatches +} + +# ---- parse args ---- +args <- commandArgs(trailingOnly=TRUE) +verbose <- "-v" %in% args +k_flag <- which(args == "-k") +l_flag <- which(args == "-l") +K <- if (length(k_flag) && k_flag < length(args)) as.integer(args[k_flag + 1L]) else 500L +L <- if (length(l_flag) && l_flag < length(args)) as.integer(args[l_flag + 1L]) else 20L + +cat(sprintf("Testing fastDistAA: k=%d sequences, l=%d length\n", K, L)) + +set.seed(42) +AAS <- c("A","C","D","E","F","G","H","I","K","L", + "M","N","P","Q","R","S","T","V","W","Y","X","?") +seqs <- replicate(K, paste(sample(AAS, L, replace=TRUE), collapse="")) + +# ---- pairwise naive reference ---- +pairs <- combn(K, 2) # 2 x choose(K,2) matrix +naive <- integer(ncol(pairs)) +for (p in seq_len(ncol(pairs))) { + naive[p] <- naiveDistAA(seqs[pairs[1,p]], seqs[pairs[2,p]]) +} + +# ---- fastDistAA output (lower-triangle, column-major) ---- +fast <- as.integer(fastDistAA(seqs)) + +# combn gives column-major lower-triangle in the same order R's dist uses +errors <- 0L +for (p in seq_len(ncol(pairs))) { + i <- pairs[1,p]; j <- pairs[2,p] + if (verbose) + cat(sprintf("%s %s naive=%d fast=%d\n", + seqs[i], seqs[j], naive[p], fast[p])) + if (fast[p] != naive[p]) { + cat(sprintf("FAIL pair (%d,%d): '%s' vs '%s' fast=%d naive=%d\n", + i, j, seqs[i], seqs[j], fast[p], naive[p])) + errors <- errors + 1L + } +} + +if (errors == 0L) { + cat(sprintf("All %d pairs passed.\n", ncol(pairs))) +} else { + cat(sprintf("%d / %d pairs FAILED.\n", errors, ncol(pairs))) + quit(status=1) +} diff --git a/testNT.R b/testNT.R new file mode 100644 index 0000000..1e7f8b9 --- /dev/null +++ b/testNT.R @@ -0,0 +1,69 @@ +library(Rcpp) +sourceCpp("src/fastDist.cpp") + +fastDist <- function(seqs) { + n <- length(seqs) + v <- fastDist_rcpp(seqs) + structure(v, class="dist", Size=n, Labels=names(seqs), Diag=FALSE, Upper=FALSE) +} + +# Naive reference implementation +trueDist <- function(s1, s2) { + c1 <- strsplit(s1, "")[[1]] + c2 <- strsplit(s2, "")[[1]] + known <- c("A", "C", "G", "T") + mismatches <- 0L + for (i in seq_along(c1)) { + a <- c1[i]; b <- c2[i] + match <- (a == b && a %in% known) || + (a == "N" && b %in% known) || + (b == "N" && a %in% known) || + (a == "?" && b == "?") + if (!match) mismatches <- mismatches + 1L + } + mismatches +} + +# ---- parse args ---- +args <- commandArgs(trailingOnly=TRUE) +verbose <- "-v" %in% args +k_flag <- which(args == "-k") +l_flag <- which(args == "-l") +K <- if (length(k_flag) && k_flag < length(args)) as.integer(args[k_flag + 1L]) else 500L +L <- if (length(l_flag) && l_flag < length(args)) as.integer(args[l_flag + 1L]) else 20L + +cat(sprintf("Testing fastDist: k=%d sequences, l=%d length\n", K, L)) + +set.seed(42) +BASES <- c("A", "C", "G", "T", "N", "?") +seqs <- replicate(K, paste(sample(BASES, L, replace=TRUE), collapse="")) + +# ---- pairwise naive reference ---- +pairs <- combn(K, 2) +naive <- integer(ncol(pairs)) +for (p in seq_len(ncol(pairs))) { + naive[p] <- trueDist(seqs[pairs[1,p]], seqs[pairs[2,p]]) +} + +# ---- fastDist output ---- +fast <- as.integer(fastDist(seqs)) + +errors <- 0L +for (p in seq_len(ncol(pairs))) { + i <- pairs[1,p]; j <- pairs[2,p] + if (verbose) + cat(sprintf("%s %s naive=%d fast=%d\n", + seqs[i], seqs[j], naive[p], fast[p])) + if (fast[p] != naive[p]) { + cat(sprintf("FAIL pair (%d,%d): '%s' vs '%s' fast=%d naive=%d\n", + i, j, seqs[i], seqs[j], fast[p], naive[p])) + errors <- errors + 1L + } +} + +if (errors == 0L) { + cat(sprintf("All %d pairs passed.\n", ncol(pairs))) +} else { + cat(sprintf("%d / %d pairs FAILED.\n", errors, ncol(pairs))) + quit(status=1) +} diff --git a/tests/testthat/test_clone.R b/tests/testthat/test_clone.R index f7d97b2..d36c244 100644 --- a/tests/testthat/test_clone.R +++ b/tests/testthat/test_clone.R @@ -887,7 +887,7 @@ test_that("fastDist_rcpp matches pairwiseDist for ATCG sequences", { "TTTTTTTT" # seq4: 6 mismatches from seq1 (positions 1,2,3,5,6,7) ) - fast_counts <- scoper:::fastDist_rcpp(seqs) + fast_counts <- as.matrix(scoper:::fastDist(seqs)) pw_dist <- alakazam::pairwiseDist(seqs, dna_mat) # Same results when comparing to pairwiseDist with check.attributes=F (ignoring dimnames) @@ -897,7 +897,7 @@ test_that("fastDist_rcpp matches pairwiseDist for ATCG sequences", { # N represents any nucleotide. Distance 0 vs any known base seqs_n <- c("ACGN", "ACGN", "ACGA", "ACGT") - fast_n <- scoper:::fastDist_rcpp(seqs_n) + fast_n <- as.matrix(scoper:::fastDist(seqs_n)) pw_n <- alakazam::pairwiseDist(seqs_n, dna_mat) # Same results when comparing to pairwiseDist with check.attributes=F (ignoring dimnames) @@ -907,7 +907,7 @@ test_that("fastDist_rcpp matches pairwiseDist for ATCG sequences", { # ? means missing data: matches only itself, mismatches everything else seqs_q <- c("ACG?", "ACG?", "ACGA", "ACGN") - fast_q <- scoper:::fastDist_rcpp(seqs_q) + fast_q <- as.matrix(scoper:::fastDist(seqs_q)) pw_q <- alakazam::pairwiseDist(seqs_q, dna_mat) # Same results when comparing to pairwiseDist with check.attributes=F (ignoring dimnames) @@ -915,14 +915,14 @@ test_that("fastDist_rcpp matches pairwiseDist for ATCG sequences", { # --- Mixed N, ?, and ATCG: full matrix matches pairwiseDist --- seqs_mixed <- c("ACGTNACGT?", "ACGTNACGT?", "ACGTAACGTA", "TTTTTTTTTT") - fast_mixed <- scoper:::fastDist_rcpp(seqs_mixed) + fast_mixed <- as.matrix(scoper:::fastDist(seqs_mixed)) pw_mixed <- alakazam::pairwiseDist(seqs_mixed, dna_mat) # Expect same results when comparing to pairwiseDist with check.attributes=F (ignoring dimnames) expect_equal(fast_mixed, pw_mixed, check.attributes=FALSE) # --- Single sequence: 1x1 matrix, diagonal = 0 --- - fast_single <- scoper:::fastDist_rcpp("ACGT") + fast_single <- as.matrix(scoper:::fastDist("ACGT")) single <- alakazam::pairwiseDist("ACGT", dna_mat) # Expect same results when comparing to pairwiseDist with check.attributes=F (ignoring dimnames)