diff --git a/DESCRIPTION b/DESCRIPTION index e2fd51b..f9870ef 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,13 +1,13 @@ Package: scoper Type: Package Version: 1.4.0.999 -Date: 2026-02-13 +Date: 2026-03-17 Authors@R: c(person("Nima", "Nouri", role=c("aut"), email="nima.nouri@yale.edu"), person("Edel", "Aron", role=c("ctb"), email="edel.aron@yale.edu"), person("Gisela", "Gabernet", role=c("ctb"), - email="gisela.gabernet@yale.edu"), + email="gisela.gabernet@yale.edu"), person("Cole", "Jensen", role=c("ctb"), email="cole.jensen@yale.edu"), person("Huimin", "Lyu", role=c("ctb"), diff --git a/R/Functions.R b/R/Functions.R index 99e21de..12b934a 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -887,10 +887,20 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' @param db data.frame containing sequence data. #' @param threshold numeric scalar where the tree should be cut (the distance threshold for clonal grouping). #' @param method one of the \code{"nt"} for nucleotide based clustering or -#' \code{"aa"} for amino acid based clustering. +#' \code{"aa"} for amino acid based clustering. Method `"aa"` still expects nucleotide sequences, +#' which will be translated to amino acids #' @param linkage available linkage are \code{"single"}, \code{"average"}, and \code{"complete"}. #' @param normalize method of normalization. The default is \code{"len"}, which divides the distance by the length #' of the sequence group. If \code{"none"} then no normalization if performed. +#' @param IUPAC If \code{TRUE}, allows sequences with IUPAC codes to pass validation +#' and be used in clustering with IUPAC-aware distance calculation +#' (via \code{alakazam::pairwiseDist}). If \code{FALSE} (default), uses fast Hamming distance +#' (via \code{fastDist_rcpp}) and only allows standard bases (A, T, C, G), N, and ? +#' in sequences.This parameter controls validation and distance +#' calculation method, not sequence filtering. See \code{max_n} for +#' filtering sequences by character content. See the IUPAC and max_n +#' parameters section for more details and examples. Note: This parameter is only available +#' for \code{hierarchicalClones} with \code{method="nt"}. #' @param junction character name of the column containing junction sequences. #' Also used to determine sequence length for grouping. #' @param v_call name of the column containing the V-segment allele calls. @@ -923,11 +933,14 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' less than 7 nucleotides. #' @param mod3 if \code{TRUE} removes records with a \code{junction} length that is not divisible by #' 3 in nucleotide space. -#' @param max_n The maximum number of degenerate characters to permit in the junction sequence -#' before excluding the record from clonal assignment. Note, with -#' \code{linkage="single"} non-informative positions can create artifactual -#' links between unrelated sequences. Use with caution. -#' Default is set to be zero. Set it as \code{"NULL"} for no action. +#' @param max_n The maximum number of non-ATCG characters (degenerate positions) to permit +#' in the junction sequence before excluding the record from clonal assignment. +#' Note: \code{max_n} operates independently +#' from \code{IUPAC} - it controls filtering by character count, while +#' \code{IUPAC} controls validation and distance calculation method. +#' With \code{linkage="single"}, non-informative positions can create +#' artifactual links between unrelated sequences. Use with caution. +#' Default is 0 (ATCG-only). Set to \code{NULL} for no filtering. #' @param nproc number of cores to distribute the function over. #' @param verbose if \code{TRUE} prints out a summary of each step cloning process. #' if \code{FALSE} (default) process cloning silently. @@ -936,7 +949,8 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' The default is \code{NULL} for no action. #' @param summarize_clones if \code{TRUE} performs a series of analysis to assess the clonal landscape #' and returns a \link{ScoperClones} object. If \code{FALSE} (default) then -#' a modified input \code{db} is returned. When grouping by \code{fields}, +#' a modified input \code{db} is returned with clone identifiers in the specified +#' `clone` column. When grouping by \code{fields}, #' \code{summarize_clones} should be \code{FALSE}. #' @param seq_id The column containing sequence ids #' @@ -947,6 +961,55 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' clonal assignment summary information and a modified input \code{db} in the \code{db} slot that #' contains clonal identifiers in the specified \code{clone} column. #' +#' @section IUPAC and max_n parameters: +#' Note: The \code{IUPAC} parameter is only available for \code{hierarchicalClones} with +#' \code{method="nt"} (nucleotide mode). It is ignored when \code{method="aa"} (amino acid mode). +#' The \code{identicalClones} and \code{spectralClones} functions always operate as if +#' \code{IUPAC=FALSE} (standard bases plus N and ? only). The \code{max_n} parameter +#' is available for all cloning functions. +#' +#' The \code{IUPAC} and \code{max_n} parameters serve complementary but distinct purposes: +#' +#' \code{IUPAC} controls: +#' \itemize{ +#' \item Sequence validation (which characters are allowed) +#' \item Distance calculation method (fast Hamming vs IUPAC-aware scoring) +#' } +#' +#' \code{max_n} controls: +#' \itemize{ +#' \item Sequence filtering by counting non-ATCG characters +#' } +#' +#' Common use cases (for \code{method="nt"}): +#' \itemize{ +#' \item \code{IUPAC=FALSE, max_n=0}: Strict ATCG-only mode with fast distance calculation. +#' Will throw an error and exit if sequences with IUPAC codes not A, T, C, G, N, or ? are detected +#' among the first 1000 sequences. Fastest option for high-quality data. +#' \item \code{IUPAC=FALSE, max_n>0}: Will throw an error and exit if sequences with IUPAC +#' codes not A, T, C, G, N, or ? are detected among the first 1000 sequences. Allows sequences +#' with limited N/? characters in distance calculation, using fast Hamming distance. +#' Note: IUPAC codes are rejected during validation (before max_n filtering), so max_n only +#' affects sequences that passed validation with N or ? characters. Useful for data with low-quality +#' or masked positions but no experimental ambiguity codes. +#' \item \code{IUPAC=TRUE, max_n=0}: Uses IUPAC-aware distance but filters out all non-ATCG +#' characters anyway. Only standard bases remain after filtering. +#' \item \code{IUPAC=TRUE, max_n>0}: Allows sequences with limited ambiguity codes and uses +#' proper IUPAC-aware distance calculation. Slower but handles biological ambiguity correctly. +#' Set max_n to the maximum number of ambiguous positions per sequence you want to tolerate +#' (counts all non-ATCG: N, ?, and other IUPAC codes). +#' \item \code{IUPAC=TRUE, max_n=NULL}: Process all sequences with IUPAC codes regardless of the +#' number of ambiguous positions. Uses IUPAC-aware distance calculation with no filtering. +#' Most permissive option for data with extensive IUPAC ambiguity codes. +#' } +#' +#' Note: Validation occurs before filtering. When \code{IUPAC=FALSE}, sequences containing IUPAC +#' ambiguity codes (R, Y, W, S, M, K, etc.) will fail validation and be rejected before the +#' \code{max_n} filtering step. Therefore, with \code{IUPAC=FALSE, max_n > 0}, only sequences +#' with N and ? characters (not IUPAC codes) can pass validation and be filtered by \code{max_n}. +#' The \code{max_n} parameter always counts using regex \code{"[^ATCG]"}, but \code{IUPAC} determines +#' which non-ATCG characters are allowed to reach the filtering step. +#' #' @section Single-cell data: #' To invoke single-cell mode the \code{cell_id} argument must be specified and the \code{locus} #' column must be correct. Otherwise, clustering will be performed with bulk sequencing assumptions, @@ -984,15 +1047,15 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' #' @export hierarchicalClones <- function(db, threshold, method=c("nt", "aa"), linkage=c("single", "average", "complete"), - normalize=c("len", "none"), junction="junction", + normalize=c("len", "none"), IUPAC=FALSE, junction="junction", v_call="v_call", j_call="j_call", clone="clone_id", fields=NULL, cell_id=NULL, locus="locus", only_heavy=TRUE, split_light=FALSE, first=FALSE, cdr3=FALSE, mod3=FALSE, max_n=0, nproc=1, verbose=FALSE, log=NULL, summarize_clones=FALSE, seq_id = "sequence_id") { results <- defineClonesScoper(db = db, threshold = threshold, model = "hierarchical", - method = match.arg(method), linkage = match.arg(linkage), normalize = match.arg(normalize), - junction = junction, v_call = v_call, j_call = j_call, clone = clone, fields = fields, + method = match.arg(method), linkage = match.arg(linkage), normalize = match.arg(normalize), + IUPAC = IUPAC, junction = junction, v_call = v_call, j_call = j_call, clone = clone, fields = fields, cell_id = cell_id, locus = locus, only_heavy = only_heavy, split_light = split_light, first = first, cdr3 = cdr3, mod3 = mod3, max_n = max_n, nproc = nproc, verbose = verbose, log = log, summarize_clones = summarize_clones) @@ -1062,9 +1125,12 @@ hierarchicalClones <- function(db, threshold, method=c("nt", "aa"), linkage=c("s #' less than 7 nucleotides. #' @param mod3 if \code{TRUE} removes records with a \code{junction} length that is not divisible by #' 3 in nucleotide space. -#' @param max_n the maximum number of degenerate characters to permit in the junction sequence before excluding the -#' record from clonal assignment. Default is set to be zero. Set it as \code{"NULL"} -#' for no action. +#' @param max_n The maximum number of degenerate characters to permit in the junction sequence before excluding the +#' record from clonal assignment. Counts non-ATCG characters using regex \code{"[^ATCG]"}, which +#' includes N, ?, and IUPAC ambiguity codes. Note: \code{spectralClones} does not support the +#' \code{IUPAC} parameter and always validates sequences as if \code{IUPAC=FALSE} (only allows +#' standard bases A,T,C,G plus N and ?). IUPAC ambiguity codes will cause validation errors. +#' Default is 0. Set to \code{NULL} for no filtering. #' @param threshold the supervising cut-off to enforce an upper-limit distance for clonal grouping. #' A numeric value between (0,1). #' @param base_sim required similarity cut-off for sequences in equal distances from each other. @@ -1184,7 +1250,7 @@ defineClonesScoper <- function(db, model = c("identical", "hierarchical", "spectral"), method = c("nt", "aa", "novj", "vj"), linkage = c("single", "average", "complete"), normalize = c("len", "none"), - germline = "germline_alignment", sequence = "sequence_alignment", + germline = "germline_alignment", sequence = "sequence_alignment", IUPAC = FALSE, junction = "junction", v_call = "v_call", j_call = "j_call", clone = "clone_id", fields = NULL, cell_id = NULL, locus = NULL, only_heavy = TRUE, split_light = FALSE, targeting_model = NULL, len_limit = NULL, @@ -1193,7 +1259,8 @@ defineClonesScoper <- function(db, iter_max = 1000, nstart = 1000, nproc = 1, verbose = FALSE, log = NULL, summarize_clones = FALSE) { - + + cat("In modified Functions.R") ### get model model <- match.arg(model) @@ -1249,13 +1316,26 @@ defineClonesScoper <- function(db, } ### Check for invalid characters - valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0)) + # IUPAC mode is only supported for hierarchical clustering + if (!IUPAC && model == "hierarchical" && method == "nt") { + valid_chars <- c("A", "T", "C", "G", "N", "?") + } else { + valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0)) + } .validateSeq <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% valid_chars) } - valid_seq <- sapply(db[[junction]], .validateSeq) + valid_seq <- sapply(head(db[[junction]],1000), .validateSeq) not_valid_seq <- which(!valid_seq) if (length(not_valid_seq) > 0) { - stop("invalid sequence characters in the ", junction, " column. ", - length(not_valid_seq)," sequence(s) found.", "\n Valid characters are: '", valid_chars, "'") + error_msg <- paste0("Invalid sequence characters in the ", junction, " column were found when checking the first 1000 sequences. ", + length(not_valid_seq)," sequence(s) found.", "\n Valid characters are: '", valid_chars, "'", + "\n If you have other IUPAC characters in your sequences, set IUPAC=TRUE to allow all IUPAC bases, this will run a slower version of hierarchicalClustering.") + if (method == "aa") { + method_aa_msg <- paste0("Clustering with method='aa' expects nucleotide sequences, which will be translated by this function.") + stop(paste0(error_msg, "\n", method_aa_msg)) + } else { + stop(error_msg) + } + } ### temp columns @@ -1417,7 +1497,14 @@ defineClonesScoper <- function(db, } else { stop('Nproc must be positive.') } - + + ## RDB + ## This is necessary until the modified code is in the default path + #parallel::clusterEvalQ(cluster, { + # .libPaths(c("/gpfs/gibbs/project/support/rdb9/working/Gabernet/251202/modified/devlib", .libPaths())) + # library(scoper) + #}) + ### export function to clusters if (nproc > 1) { export_functions <- list("passToClustering_lev1", "passToClustering_lev2", "passToClustering_lev3", "passToClustering_lev4", @@ -1433,6 +1520,7 @@ defineClonesScoper <- function(db, db_cloned <- foreach::foreach(gp=1:n_groups, .final=dplyr::bind_rows, .inorder=TRUE, + .packages="scoper", .errorhandling='stop') %dopar% { # ********************************************************************************* # filter each group @@ -1450,6 +1538,7 @@ defineClonesScoper <- function(db, method = method, linkage = ifelse(model == "hierarchical", linkage, NA), normalize = ifelse(model == "hierarchical", normalize, NA), + IUPAC = ifelse(model == "hierarchical" & method == "nt", IUPAC, FALSE), germline = germline, sequence = sequence, junction = junction, @@ -1749,6 +1838,7 @@ passToClustering_lev1 <- function (db_gp, method = c("nt", "aa", "novj", "vj"), linkage = c("single", "average", "complete"), normalize = c("len", "none"), + IUPAC = FALSE, germline = "germline_alignment", sequence = "sequence_alignment", junction = "junction", @@ -1775,6 +1865,7 @@ passToClustering_lev1 <- function (db_gp, method = method, linkage = linkage, normalize = normalize, + IUPAC = IUPAC, junction = junction, cdr3 = cdr3, cdr3_col = cdr3_col, @@ -1839,6 +1930,7 @@ hierarchicalClones_helper <- function(db_gp, method = c("nt", "aa"), linkage = c("single", "average", "complete"), normalize = c("len", "none"), + IUPAC = FALSE, junction = "junction", cdr3 = FALSE, cdr3_col = NA, @@ -1854,7 +1946,7 @@ hierarchicalClones_helper <- function(db_gp, ### number of sequences n <- nrow(db_gp) - + # get sequences if (method == "nt") { seqs <- db_gp[[ifelse(cdr3, cdr3_col, junction)]] @@ -1868,16 +1960,22 @@ hierarchicalClones_helper <- function(db_gp, n_unq <- nrow(df) ind_unq <- df$V1 seqs_unq <- df$seqs + if (n_unq == 1) { return(list("idCluster" = rep(1, n), "n_cluster" = 1, "eigen_vals" = rep(0, n))) } - + # calculate distance matrix if (method == "nt") { - dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, + if (! IUPAC){ + dist_mtx <- fastDist_rcpp(seqs_unq) + } + else{ + dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, dist_mat = getDNAMatrix(gap = 0)) + } } else if (method == "aa") { dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, dist_mat = getAAMatrix(gap = 0)) diff --git a/R/RcppExports.R b/R/RcppExports.R index f02781e..c4726d7 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -5,3 +5,11 @@ pairwiseMutMatrixRcpp <- function(informative_pos, mutMtx, motifMtx) { .Call(`_scoper_pairwiseMutMatrixRcpp`, informative_pos, mutMtx, motifMtx) } +countSeqsWithInvalidBases_rcpp <- function(seqs) { + .Call(`_scoper_countSeqsWithInvalidBases_rcpp`, seqs) +} + +fastDist_rcpp <- function(seqs) { + .Call(`_scoper_fastDist_rcpp`, seqs) +} + diff --git a/docs/topics/hierarchicalClones.md b/docs/topics/hierarchicalClones.md index 9fbf0d9..69241dc 100644 --- a/docs/topics/hierarchicalClones.md +++ b/docs/topics/hierarchicalClones.md @@ -51,7 +51,8 @@ threshold method : one of the `"nt"` for nucleotide based clustering or -`"aa"` for amino acid based clustering. +`"aa"` for amino acid based clustering. Method `"aa"` still expects nucleotide sequences, +which will be translated to amino acids. linkage : available linkage are `"single"`, `"average"`, and `"complete"`. diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 04defa6..c447597 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -23,9 +23,33 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// countSeqsWithInvalidBases_rcpp +int countSeqsWithInvalidBases_rcpp(CharacterVector seqs); +RcppExport SEXP _scoper_countSeqsWithInvalidBases_rcpp(SEXP seqsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< CharacterVector >::type seqs(seqsSEXP); + rcpp_result_gen = Rcpp::wrap(countSeqsWithInvalidBases_rcpp(seqs)); + return rcpp_result_gen; +END_RCPP +} +// fastDist_rcpp +IntegerMatrix fastDist_rcpp(CharacterVector seqs); +RcppExport SEXP _scoper_fastDist_rcpp(SEXP seqsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< CharacterVector >::type seqs(seqsSEXP); + rcpp_result_gen = Rcpp::wrap(fastDist_rcpp(seqs)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_scoper_pairwiseMutMatrixRcpp", (DL_FUNC) &_scoper_pairwiseMutMatrixRcpp, 3}, + {"_scoper_countSeqsWithInvalidBases_rcpp", (DL_FUNC) &_scoper_countSeqsWithInvalidBases_rcpp, 1}, + {"_scoper_fastDist_rcpp", (DL_FUNC) &_scoper_fastDist_rcpp, 1}, {NULL, NULL, 0} }; diff --git a/src/fastDist.cpp b/src/fastDist.cpp new file mode 100644 index 0000000..0b89d01 --- /dev/null +++ b/src/fastDist.cpp @@ -0,0 +1,129 @@ +#include +using namespace Rcpp; + +inline uint8_t code_char(char c){ + switch(c){ + case 'A': return 0; case 'C': return 1; case 'G': return 2; case 'T': return 3; + case 'N': return 4; case '?': return 5; + default: return 255; + } +} + +// [[Rcpp::export]] +int countSeqsWithInvalidBases_rcpp(CharacterVector seqs) { + int N = seqs.size(); + int bad = 0; + + for (int i = 0; i < N; ++i) { + if (seqs[i] == NA_STRING) { + bad++; + continue; + } + + std::string s = as(seqs[i]); + bool invalid = false; + for (char ch : s) { + char up = (char)std::toupper((unsigned char)ch); + if (code_char(up) == 255) { + invalid = true; + break; + } + } + + if (invalid) bad++; + } + + return bad; +} + +// [[Rcpp::export]] +IntegerMatrix fastDist_rcpp(CharacterVector seqs) { + int N = seqs.size(); + + if (N == 0) stop("empty input"); + + std::string s0 = as(seqs[0]); + int L = (int)s0.size(); + for (int i = 0; i < N; ++i) { + if ((int)std::string(as(seqs[i])).size() != L) + stop("All sequences must have the same length"); + } + + // encode to uint8: N x L, row-major in a flat vector + std::vector enc((size_t)N * L); + for (int i = 0; i < N; ++i) { + std::string s = as(seqs[i]); + for (int p = 0; p < L; ++p) { + uint8_t c = code_char(s[p]); + if (c == 255) stop("Only A,C,G,T,N,? are allowed"); + enc[(size_t)i * L + p] = c; + } + } + + IntegerMatrix Mmatch(N, N); // zero-initialized + + for (int p = 0; p < L; ++p) { + std::vector A, Cb, G, Tt, Ns, Q; + A.reserve(N); Cb.reserve(N); G.reserve(N); Tt.reserve(N); Ns.reserve(N); Q.reserve(N); + + // bucket row indices by symbol at column p + for (int i = 0; i < N; ++i) { + switch (enc[(size_t)i * L + p]) { + case 0: A.push_back(i); break; + case 1: Cb.push_back(i); break; + case 2: G.push_back(i); break; + case 3: Tt.push_back(i); break; + case 4: Ns.push_back(i); break; + case 5: Q.push_back(i); break; + } + } + + // column-major safe updaters: iterate over j (column) outermost, + // then use a pointer to column j and bump rows i. + auto bump_within = [&](const std::vector& v){ + int m = (int)v.size(); + for (int jj = 0; jj < m; ++jj) { + int j = v[jj]; + int* colj = &Mmatch(0, j); // pointer to column j + for (int ii = 0; ii < m; ++ii) { + colj[v[ii]] += 1; // increment (i = v[ii], j) + } + } + }; + + auto bump_pairs = [&](const std::vector& X, const std::vector& Y){ + int mx = (int)X.size(), my = (int)Y.size(); + for (int jj = 0; jj < my; ++jj) { + int j = Y[jj]; + int* colj = &Mmatch(0, j); // pointer to column j + for (int ii = 0; ii < mx; ++ii) { + colj[X[ii]] += 1; // increment (i = X[ii], j) + } + } + }; + + // same known base + bump_within(A); bump_within(Cb); bump_within(G); bump_within(Tt); + + // N with known (both directions) + std::vector K; K.reserve(A.size()+Cb.size()+G.size()+Tt.size()); + K.insert(K.end(), A.begin(), A.end()); + K.insert(K.end(), Cb.begin(), Cb.end()); + K.insert(K.end(), G.begin(), G.end()); + K.insert(K.end(), Tt.begin(), Tt.end()); + bump_pairs(Ns, K); + bump_pairs(K, Ns); + + // ? with ? + bump_within(Q); + } + + // convert matches -> distances + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + Mmatch(i, j) = L - Mmatch(i, j); + } + Mmatch(i, i) = 0; + } + return Mmatch; +} diff --git a/tests/testthat/test_clone.R b/tests/testthat/test_clone.R index 2293311..49a30c7 100644 --- a/tests/testthat/test_clone.R +++ b/tests/testthat/test_clone.R @@ -70,6 +70,462 @@ test_that("Test hierarchicalClones", { } }) +#### clone - hierarchicalClones with IUPAC parameter #### + +test_that("Test hierarchicalClones with IUPAC parameter", { + # IUPAC and max_n serve different purposes: + # - IUPAC: Controls (1) validation (which characters allowed) and + # (2) distance calculation method (fast Hamming vs IUPAC-aware scoring) + # - max_n: Filters sequences by counting non-ATCG characters using regex "[^ATCG]" + # (includes N, ?, and all IUPAC codes - whatever passed validation) + # + # Processing order: Validation -> Filtering -> Distance calculation + # + # Key use cases tested: + # 1. IUPAC=FALSE, max_n=0: Strict ATCG-only mode with fast distance calculation + # 2. IUPAC=TRUE, max_n=0: IUPAC-aware distance (but filters out IUPAC seqs anyway) + # 3. IUPAC=TRUE, max_n=1+: Main use case - IUPAC codes with proper scoring + # 4. IUPAC=FALSE, max_n=1+: Allow N/? but reject IUPAC codes (validation blocks them) + # 5. IUPAC=FALSE/TRUE with standard bases: Backward compatibility (same results) + + # Create test data with IUPAC ambiguity codes + # With threshold=0.15 and length 15, distance > 0.15 requires >=3 mutations (3/15=0.2) + db_iupac <- data.frame( + sequence_id = paste0("seq", 1:12), + v_call = rep("IGHV1-1*01", 12), + j_call = rep("IGHJ1*01", 12), + # Use IUPAC ambiguity codes: R(A/G), Y(C/T), W(A/T), S(C/G), M(A/C), K(G/T) + junction = c( + # Clone 1: sequences similar to TGTGCAAGCTACTGG (0-1 mutations apart) + "TGTGCRAGCTACTGG", # R = A or G at pos 5 + "TGTGCRAGCTACTGG", # identical to seq1 + "TGTGCAAGCTACTGG", # standard bases only + "TGTGCYAGCTACTGG", # Y = C or T at pos 5, 1 mutation from seq3 + "TGTGCYAGCTACTGG", # identical to seq4 + "TGTGCMAGCTACTGG", # M = A or C at pos 5, 1 mutation from seq3 + "TGTGCWAGCTACTGG", # W = A or T at pos 5, 1 mutation from seq3 + + # Clone 2: sequences similar to ACGTTTGGCCAAACC (3+ mutations from Clone 1) + "ACGTTTGGCCAAACC", # standard bases, distant from Clone 1 + "ACGTTTGGCCAAACC", # identical to seq8 + "ACGKTTGGCCAAACC", # K = G or T at pos 4, 1 mutation from seq8 + "ACGRTTGGCCAAACC", # R = A or G at pos 4, 1 mutation from seq8 + "ACGTTTSGCCAAACC" # S = C or G at pos 7, 1 mutation from seq8 + ), + locus = rep("IGH", 12), + stringsAsFactors = FALSE + ) + + # Test 1: IUPAC=FALSE should reject IUPAC characters at validation stage + # Validation (IUPAC=FALSE): Only allows A,T,C,G,N,? - REJECTS IUPAC codes (R,Y,W,S,M,K,etc) + # Result: Sequences with IUPAC codes fail validation and raise error before any filtering + # Use case: Strict ATCG-only mode with fast Hamming distance (fastDist_rcpp) + expect_error( + hierarchicalClones(db_iupac, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = FALSE, # Use fast Hamming distance for ATCG only + summarize_clones = FALSE + ), + "Invalid sequence characters" + ) + + # Test 2: IUPAC=TRUE with max_n=0 - validates with IUPAC but filters them out + # 1. Validation (IUPAC=TRUE): Allows standard bases, N, ?, and IUPAC codes - all sequences pass + # 2. Filtering (max_n=0): Counts non-ATCG using "[^ATCG]" - removes all sequences with any non-ATCG + # Result: Only seq3, seq8 and seq9 (standard bases only) remain after filtering + expect_warning( + expect_message( + db_result2 <- hierarchicalClones(db_iupac, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = TRUE, # Use IUPAC-aware distance calculation + max_n = 0, # Filter out sequences with ANY non-ATCG chars + summarize_clones = FALSE + ), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ), + "Removed 9 sequences with non ATCG characters." + ) + + # Only 3 sequences with standard bases remain + expect_equal(nrow(db_result2), 3) + expect_equal(sort(db_result2$sequence_id), c("seq3", "seq8", "seq9")) + # They are in different clones (distant sequences) + expect_equal(length(unique(db_result2$clone_id)), 2) + + # Test 3a: Create dataset with N characters to distinguish from IUPAC codes + db_with_n <- db_iupac + db_with_n[13,] <- NA + db_with_n$junction[13] <- "TGTGCNAGCTACTGG" # Add seq13 with N + db_with_n$sequence_id <- c(db_with_n$sequence_id[1:12], "seq13") + db_with_n$v_call <- c(db_with_n$v_call[1:12], "IGHV1-1*01") + db_with_n$j_call <- c(db_with_n$j_call[1:12], "IGHJ1*01") + db_with_n$locus <- c(db_with_n$locus[1:12], "IGH") + + # Test 3b: IUPAC=TRUE with max_n=1 - allows up to 1 non-ATCG character per sequence + # This is the main use case: proper IUPAC distance calculation with ambiguity codes + # 1. Validation (IUPAC=TRUE): Allows standard bases (A,T,C,G) plus N, ?, and IUPAC codes (R,Y,W,S,M,K,etc) + # 2. Filtering (max_n=1): Counts non-ATCG using "[^ATCG]" - keeps sequences with ≤1 such character + # 3. Distance: Uses IUPAC-aware scoring (alakazam::pairwiseDist) for proper ambiguity handling + expect_message( + db_result3 <- hierarchicalClones(db_with_n, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = TRUE, # Use IUPAC-aware distance calculation + max_n = 1, # Allow up to 1 non-ATCG character (IUPAC or N) + summarize_clones = FALSE + ), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ) + + # All 13 sequences kept (each has ≤1 non-ATCG character) + expect_equal(nrow(db_result3), 13) + + # Verify that we have 2 different clusters (Clone 1 and Clone 2) + n_clones <- length(unique(db_result3$clone_id)) + expect_equal(n_clones, 2) + + # Identical sequences with IUPAC codes should cluster together + # Note: rows may be reordered, so reference by sequence_id + get_clone <- function(seq_id) db_result3$clone_id[db_result3$sequence_id == seq_id] + + expect_equal(get_clone("seq1"), get_clone("seq2")) # seq1=seq2 (both R) + expect_equal(get_clone("seq4"), get_clone("seq5")) # seq4=seq5 (both Y) + expect_equal(get_clone("seq8"), get_clone("seq9")) # seq8=seq9 (standard) + + # seq13 (with N) should cluster with Clone 1 (similar to seq3) + expect_equal(get_clone("seq3"), get_clone("seq13")) + + # Verify two distinct clones based on biological distance + clone1_id <- get_clone("seq3") # seq3 in Clone 1 + clone2_id <- get_clone("seq8") # seq8 in Clone 2 + expect_true(clone1_id != clone2_id) + expect_true(all(sapply(paste0("seq", 1:7), get_clone) == clone1_id)) # Clone 1: seq1-7 + expect_true(all(sapply(paste0("seq", 8:12), get_clone) == clone2_id)) # Clone 2: seq8-12 + expect_true(get_clone("seq13") == clone1_id) # seq13 joins Clone 1 + + # Test 4: IUPAC=FALSE with max_n=1+ - allows N/? but rejects other IUPAC codes + # This demonstrates the two-stage process: validation -> filtering + # 1. Validation (IUPAC=FALSE): Allows A,T,C,G,N,? but REJECTS IUPAC codes (R,Y,W,S,M,K,etc) + # 2. Filtering (max_n): Counts non-ATCG using "[^ATCG]" regex on sequences that passed validation + # Use case: Tolerate low-quality positions (N,?) with fast Hamming distance, but no ambiguity codes + db_with_n_only <- data.frame( + sequence_id = paste0("seq", 1:5), + v_call = rep("IGHV1-1*01", 5), + j_call = rep("IGHJ1*01", 5), + junction = c( + "TGTGCAAGCTACTGG", # standard bases only + "TGTGCNAGCTACTGG", # 1 N (allowed with max_n=1) + "TGTGC?AGCTACTGG", # 1 ? (allowed with max_n=1) + "TGTGCNNGCTACTGG", # 2 N's (filtered with max_n=1) + "TGTGCRAGCTACTGG" # IUPAC R code (should fail validation) + ), + locus = rep("IGH", 5), + stringsAsFactors = FALSE + ) + + # Should reject at validation stage due to IUPAC code (R) in seq5 + expect_error( + hierarchicalClones(db_with_n_only, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = FALSE, # Reject IUPAC codes + max_n = 1, # Allow up to 1 N or ? + summarize_clones = FALSE + ), + "Invalid sequence characters" + ) + + # Test with only N/? (no IUPAC codes) - should succeed + db_n_only_valid <- db_with_n_only[1:4, ] # Remove seq5 with IUPAC code + + expect_warning( + expect_message( + db_result5 <- hierarchicalClones(db_n_only_valid, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = FALSE, # Use fast Hamming distance + max_n = 1, # Allow up to 1 N or ? + summarize_clones = FALSE + ), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ), + "Removed 1 sequences with non ATCG characters." # seq4 with 2 N's filtered + ) + + # Should keep seq1, seq2, seq3 (0-1 non-ATCG characters each) + expect_equal(nrow(db_result5), 3) + expect_equal(sort(db_result5$sequence_id), c("seq1", "seq2", "seq3")) + # All are similar sequences, should be in same clone + expect_equal(length(unique(db_result5$clone_id)), 1) + + # Test 5: Standard bases work with both IUPAC=TRUE and IUPAC=FALSE + # This demonstrates backward compatibility and performance consideration: + # - IUPAC=FALSE: Uses fast Hamming distance (fastDist_rcpp) - faster + # - IUPAC=TRUE: Uses IUPAC-aware scoring (alakazam::pairwiseDist) - slower but handles ambiguity + # For standard bases (ATCG), results should be identical + db_standard <- ExampleDb[1:50, ] + + # Verify db_standard contains only standard bases (not IUPAC ambiguity codes) + standard_chars <- c("A", "T", "C", "G", "N", "?") + .is_standard <- function(x) { + all(unique(strsplit(x, "")[[1]]) %in% standard_chars) + } + all_standard <- sapply(db_standard$junction, .is_standard) + expect_true(all(all_standard), + info = "ExampleDb contains IUPAC ambiguity codes - update test data or test expectations" + ) + + expect_message( + db_result_false <- hierarchicalClones(db_standard, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = FALSE, # Fast Hamming distance for ATCG only + summarize_clones = FALSE + ), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ) + + expect_message( + db_result_true <- hierarchicalClones(db_standard, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = TRUE, # IUPAC-aware distance (slower but handles ambiguity) + summarize_clones = FALSE + ), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ) + + # Results should be identical for standard bases + # (IUPAC just changes the distance calculation method, not the result) + expect_equal(nrow(db_result_false), nrow(db_result_true)) + expect_true(all(!is.na(db_result_false$clone_id))) + expect_true(all(!is.na(db_result_true$clone_id))) + # Clone assignments should be the same (though IDs may differ) + expect_equal(length(unique(db_result_false$clone_id)), + length(unique(db_result_true$clone_id))) + + # Test 6: IUPAC=TRUE with max_n=NULL - no filtering, process all sequences + # This is the most permissive option for IUPAC data + # 1. Validation (IUPAC=TRUE): Allows standard bases, N, ?, and all IUPAC codes + # 2. Filtering (max_n=NULL): No filtering - all sequences kept regardless of character count + # 3. Distance: Uses IUPAC-aware scoring for proper ambiguity handling + # Use case: Data with extensive IUPAC codes where you want to process everything + + # Create test data with multiple IUPAC codes per sequence + db_iupac_multi <- data.frame( + sequence_id = paste0("seq", 1:8), + v_call = rep("IGHV1-1*01", 8), + j_call = rep("IGHJ1*01", 8), + junction = c( + # Clone 1: sequences with multiple IUPAC codes, similar to each other + "TGTGCRAGCTRYCTGG", # R at pos 6, R at pos 12, Y at pos 13 (3 non-ATCG) + "TGTGCRAGCTRYCTGG", # identical to seq1 + "TGTGCRAGCTWYNMGG", # R at pos 6, W at pos 12, Y at pos 13, N at pos 14, M at pos 15 (5 non-ATCG) + "TGTGCAAGCTACCTGG", # standard bases only (0 non-ATCG) + # Clone 2: sequences distant from Clone 1, also with IUPAC codes + "ACGKTTRGCCNNNYYY", # K, R, 3 N's, 3 Y's (8 non-ATCG characters!) + "ACGKTTRGCCNNNYYY", # identical to seq5 + "ACGGTTAGCCAAACCC", # standard bases only (0 non-ATCG) + "ACGKTTSGCCNNNYYY" # K, S, 3 N's, 3 Y's (8 non-ATCG) + ), + locus = rep("IGH", 8), + stringsAsFactors = FALSE + ) + + # With max_n=NULL, all sequences should be kept regardless of IUPAC code count + expect_message( + db_result6 <- hierarchicalClones(db_iupac_multi, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = TRUE, # Use IUPAC-aware distance calculation + max_n = NULL, # No filtering - process all sequences + summarize_clones = FALSE + ), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ) + + # All 8 sequences kept (no filtering applied) + expect_equal(nrow(db_result6), 8) + expect_equal(sort(db_result6$sequence_id), paste0("seq", 1:8)) + + # Should have 2 clones based on biological distance + expect_equal(length(unique(db_result6$clone_id)), 2) + + # Helper function to get clone ID by sequence ID + get_clone6 <- function(seq_id) db_result6$clone_id[db_result6$sequence_id == seq_id] + + # Verify identical sequences cluster together despite many IUPAC codes + expect_equal(get_clone6("seq1"), get_clone6("seq2")) # both identical with 3 IUPAC + expect_equal(get_clone6("seq5"), get_clone6("seq6")) # both identical with 8 IUPAC + + # Verify Clone 1 (seq1-4) and Clone 2 (seq5-8) are distinct + clone1_id <- get_clone6("seq1") + clone2_id <- get_clone6("seq5") + expect_true(clone1_id != clone2_id) + expect_true(all(sapply(paste0("seq", 1:4), get_clone6) == clone1_id)) + expect_true(all(sapply(paste0("seq", 5:8), get_clone6) == clone2_id)) + +}) + +#### clone - hierarchicalClones, IUPAC code in sequence 1001 #### + +test_that("Test hierarchicalClones with IUPAC code beyond validation window", { + # Validation only inspects the first 1000 sequences (head(..., 1000)). + # A sequence containing an IUPAC code at position 1001 therefore escapes the check + # regardless of IUPAC=TRUE/FALSE. + # What happens next depends on max_n and IUPAC: + # - max_n=0: prepare_db filters out non-ATCG sequences -> warning, no error (both IUPAC=T/F) + # - max_n=NULL, IUPAC=FALSE: no filtering -> fastDist_rcpp encounters R -> error + # - max_n=NULL, IUPAC=TRUE: no filtering -> pairwiseDist handles R correctly -> succeeds + + # Build 1001-row dataset: rows 1-1000 are standard ATCG, row 1001 has IUPAC code R + n_std <- 1000 + db_iupac_1001 <- data.frame( + sequence_id = paste0("seq", seq_len(n_std + 1)), + v_call = rep("IGHV1-1*01", n_std + 1), + j_call = rep("IGHJ1*01", n_std + 1), + junction = c(rep("TGTGCAAGCTACTGG", n_std), # rows 1-1000: ATCG only + "TGTGCRAGCTACTGG"), # row 1001: R is an IUPAC code + locus = rep("IGH", n_std + 1), + stringsAsFactors = FALSE + ) + + # Confirm row 1001 is the one with the IUPAC code (sanity check) + expect_equal(db_iupac_1001$junction[1001], "TGTGCRAGCTACTGG") + + # Same data in reverse order to put IUPAC code in row 1 + db_iupac_1 <- db_iupac_1001[1001:1, ] + + # Test 1: IUPAC=FALSE, max_n=0 (default) + # - Validation (first 1000 rows): passes - no IUPAC codes in rows 1-1000 + # - max_n=0 filtering: removes row 1001 (R is non-ATCG) and issues a warning + # - Result: runs successfully; row 1001 removed + expect_warning( + expect_message( + db_result1 <- hierarchicalClones(db_iupac_1001, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = FALSE, + max_n = 0, + summarize_clones = FALSE + ), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ), + "Removed 1 sequences with non ATCG characters." + ) + expect_equal(nrow(db_result1), n_std) + expect_false("seq1001" %in% db_result1$sequence_id) + + # Same data in reverse order now throws an error. Potentially confusing for users? + expect_error( + db_result1 <- hierarchicalClones(db_iupac_1, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = FALSE, + max_n = 0, + summarize_clones = FALSE + ), + "Invalid sequence characters in the junction column" + ) + + # Test 2: IUPAC=FALSE, max_n=NULL + # - Validation (first 1000 rows): passes - no IUPAC codes in rows 1-1000 + # - max_n=NULL: no filtering -> row 1001 reaches fastDist_rcpp + # - fastDist_rcpp stops with an error because R is not A,C,G,T,N,? + expect_error( + suppressMessages( + hierarchicalClones(db_iupac_1001, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = FALSE, + max_n = NULL, + summarize_clones = FALSE + ) + ), + "Only A,C,G,T,N,\\? are allowed" + ) + + # Same data in reverse also throws an error. + expect_error( + suppressMessages( + hierarchicalClones(db_iupac_1, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = FALSE, + max_n = NULL, + summarize_clones = FALSE + ) + ), + "Invalid sequence characters in the junction column" + ) + + # Test 3: IUPAC=TRUE, max_n=NULL + # - Validation (first 1000 rows): passes + # - max_n=NULL: no filtering -> row 1001 is kept + # - Distance: alakazam::pairwiseDist handles IUPAC codes correctly -> no error + # - Result: all 1001 sequences processed; seq1001 included in output + expect_message( + db_result3 <- hierarchicalClones(db_iupac_1001, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = TRUE, + max_n = NULL, + summarize_clones = FALSE + ), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ) + expect_equal(nrow(db_result3), n_std + 1) + expect_true("seq1001" %in% db_result3$sequence_id) + # seq1001 (TGTGCRAGCTACTGG) is within distance 0.15 of the standard sequences + # (TGTGCAAGCTACTGG): R matches A under IUPAC scoring -> same clone + expect_equal(length(unique(db_result3$clone_id)), 1) + + # Same data in reverse order gives the same result (IUPAC=TRUE allows R, max_n=NULL means no filtering) + expect_message( + db_result4 <- hierarchicalClones(db_iupac_1, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = TRUE, + max_n = NULL, + summarize_clones = FALSE + ), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ) + # Results should be identical regardless of row order + db_result4_ordered <- db_result4[match(db_result3$sequence_id, db_result4$sequence_id), ] + expect_equal(db_result3, db_result4_ordered, check.attributes = FALSE) +}) + #### clone - spectralClones - novj method #### test_that("Test spectralClones - novj", { @@ -407,7 +863,7 @@ test_that("Test hierarchicalClones only_heavy and first", { # expect sequences from cells 1 and 3 in clone 1, and # sequences from cell 2 in clone 2 clones_split_F_th0@db %>% - mutate(expected_clone_id = dplyr::case_when( + dplyr::mutate(expected_clone_id = dplyr::case_when( cell_id %in% c(1, 3) ~ "1", cell_id %in% c(2) ~ "2", TRUE ~ NA_character_ @@ -484,7 +940,7 @@ test_that("Test hierarchicalClones only_heavy and first", { # sequences from cell 2 in clone 2, regardless of # light chains clones_split_F_th0_2l@db %>% - mutate(expected_clone_id = dplyr::case_when( + dplyr::mutate(expected_clone_id = dplyr::case_when( cell_id %in% c(1, 3) ~ "1", cell_id %in% c(2) ~ "2", TRUE ~ NA_character_ @@ -558,3 +1014,5 @@ test_that("Testing split_light warnings for all cloning mehtods", { cell_id = "cell_id", split_light = TRUE)) expect_equal(db_nsplit[['clone_id']], db_split[['clone_id']]) }) + +