From bf3cd69a609a3f7ce0fd6c7d819c752dfd4e4e92 Mon Sep 17 00:00:00 2001 From: Robert Bjornson Date: Fri, 5 Dec 2025 09:16:30 -0500 Subject: [PATCH 01/25] added code to load devel scoper in foreach worker --- R/Functions.R | 46 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/R/Functions.R b/R/Functions.R index e13bb5d..fc25f75 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -60,6 +60,12 @@ setMethod("as.data.frame", c(x="ScoperClones"), #### Internal functions #### +# RDB +workerlog <- function(msg) { + log_file <- paste0("worker_", Sys.getpid(), ".log") + cat("In foreach", "PID", Sys.getpid(), format(Sys.time(), "%Y-%m-%d_%H:%M:%S"), msg, "\n", file = log_file, append = TRUE) +} + # find density gap findGapSmooth <- function(vec) { # bandwidth <- kedd::h.ucv(vec, 4)$h @@ -1181,7 +1187,8 @@ defineClonesScoper <- function(db, iter_max = 1000, nstart = 1000, nproc = 1, verbose = FALSE, log = NULL, summarize_clones = FALSE) { - + + cat("In modified Functions.R") ### get model model <- match.arg(model) @@ -1405,13 +1412,24 @@ defineClonesScoper <- function(db, } else { stop('Nproc must be positive.') } - + + #RDB + ## Workers: also dev version + parallel::clusterEvalQ(cluster, { + library(devtools) + devtools::load_all("scoper") # same source tree path + Rcpp::sourceCpp("fastDistExt2C.cpp") + }) + + #RDB + cat('Loading fastDistExt2C.cpp') + ### export function to clusters if (nproc > 1) { export_functions <- list("passToClustering_lev1", "passToClustering_lev2", "passToClustering_lev3", "passToClustering_lev4", "findGapSmooth", "infer", "krnlMtxGenerator", "makeAffinity", "laplacianMtx", "rangeAtoB", "likelihoods", "pairwiseMutions", "pairwiseMutMatrix", - "printVerbose", "logVerbose","stri_join") + "printVerbose", "logVerbose","stri_join", "workerlog") parallel::clusterExport(cluster, export_functions, envir=environment()) doParallel::registerDoParallel(cluster, cores=nproc) } @@ -1749,6 +1767,8 @@ passToClustering_lev1 <- function (db_gp, iter_max = 1000, nstart = 1000) { ### get model + # RDB + workerlog('in passToClustering_lev1') model <- match.arg(model) ### begin clustering @@ -1759,6 +1779,8 @@ passToClustering_lev1 <- function (db_gp, cdr3 = cdr3, cdr3_col = cdr3_col) } else if (model == "hierarchical") { + #RDB + workerlog('calling hierarchicalClones_helper') clone_results <- hierarchicalClones_helper(db_gp, method = method, linkage = linkage, @@ -1831,6 +1853,8 @@ hierarchicalClones_helper <- function(db_gp, cdr3 = FALSE, cdr3_col = NA, threshold = NULL) { + #RDB + workerlog('in hierarchicalClones_helper') ### get method method <- match.arg(method) @@ -1842,7 +1866,8 @@ hierarchicalClones_helper <- function(db_gp, ### number of sequences n <- nrow(db_gp) - + + workerlog(paste('in hierarchicalClones_helper, n, method', n, method)) # get sequences if (method == "nt") { seqs <- db_gp[[ifelse(cdr3, cdr3_col, junction)]] @@ -1856,16 +1881,23 @@ hierarchicalClones_helper <- function(db_gp, n_unq <- nrow(df) ind_unq <- df$V1 seqs_unq <- df$seqs + + workerlog(paste('in hierarchicalClones_helper', n_unq)) if (n_unq == 1) { return(list("idCluster" = rep(1, n), "n_cluster" = 1, "eigen_vals" = rep(0, n))) } - + + # RDB + workerlog('just before calculate distance matrix') # calculate distance matrix if (method == "nt") { - dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, - dist_mat = getDNAMatrix(gap = 0)) + workerlog('calling fastDistExt2_rcpp') + dist_mtx <- fastDistExt2_rcpp(seqs_unq) + # RDB + #dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, + # dist_mat = getDNAMatrix(gap = 0)) } else if (method == "aa") { dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, dist_mat = getAAMatrix(gap = 0)) From e2f299e4bb2fc77313f77689a87898167225a184 Mon Sep 17 00:00:00 2001 From: Robert Bjornson Date: Fri, 5 Dec 2025 16:01:40 -0500 Subject: [PATCH 02/25] added fastDist.cpp --- src/fastDist.cpp | 104 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 src/fastDist.cpp diff --git a/src/fastDist.cpp b/src/fastDist.cpp new file mode 100644 index 0000000..3ff9919 --- /dev/null +++ b/src/fastDist.cpp @@ -0,0 +1,104 @@ +#include +using namespace Rcpp; + +inline uint8_t code_char(char c){ + switch(c){ + case 'A': return 0; case 'C': return 1; case 'G': return 2; case 'T': return 3; + case 'N': return 4; case '?': return 5; + default: return 255; + } +} + +// [[Rcpp::export]] +IntegerMatrix fastDist_rcpp(CharacterVector seqs) { + int N = seqs.size(); + + // RDB + printf("In fastDistExt2_rcpp, %d seqs\n", N); + if (N == 0) stop("empty input"); + + std::string s0 = as(seqs[0]); + int L = (int)s0.size(); + for (int i = 0; i < N; ++i) { + if ((int)std::string(as(seqs[i])).size() != L) + stop("All sequences must have the same length"); + } + + // encode to uint8: N x L, row-major in a flat vector + std::vector enc((size_t)N * L); + for (int i = 0; i < N; ++i) { + std::string s = as(seqs[i]); + for (int p = 0; p < L; ++p) { + uint8_t c = code_char(s[p]); + if (c == 255) stop("Only A,C,G,T,N,? are allowed"); + enc[(size_t)i * L + p] = c; + } + } + + IntegerMatrix Mmatch(N, N); // zero-initialized + + for (int p = 0; p < L; ++p) { + std::vector A, Cb, G, Tt, Ns, Q; + A.reserve(N); Cb.reserve(N); G.reserve(N); Tt.reserve(N); Ns.reserve(N); Q.reserve(N); + + // bucket row indices by symbol at column p + for (int i = 0; i < N; ++i) { + switch (enc[(size_t)i * L + p]) { + case 0: A.push_back(i); break; + case 1: Cb.push_back(i); break; + case 2: G.push_back(i); break; + case 3: Tt.push_back(i); break; + case 4: Ns.push_back(i); break; + case 5: Q.push_back(i); break; + } + } + + // column-major safe updaters: iterate over j (column) outermost, + // then use a pointer to column j and bump rows i. + auto bump_within = [&](const std::vector& v){ + int m = (int)v.size(); + for (int jj = 0; jj < m; ++jj) { + int j = v[jj]; + int* colj = &Mmatch(0, j); // pointer to column j + for (int ii = 0; ii < m; ++ii) { + colj[v[ii]] += 1; // increment (i = v[ii], j) + } + } + }; + + auto bump_pairs = [&](const std::vector& X, const std::vector& Y){ + int mx = (int)X.size(), my = (int)Y.size(); + for (int jj = 0; jj < my; ++jj) { + int j = Y[jj]; + int* colj = &Mmatch(0, j); // pointer to column j + for (int ii = 0; ii < mx; ++ii) { + colj[X[ii]] += 1; // increment (i = X[ii], j) + } + } + }; + + // same known base + bump_within(A); bump_within(Cb); bump_within(G); bump_within(Tt); + + // N with known (both directions) + std::vector K; K.reserve(A.size()+Cb.size()+G.size()+Tt.size()); + K.insert(K.end(), A.begin(), A.end()); + K.insert(K.end(), Cb.begin(), Cb.end()); + K.insert(K.end(), G.begin(), G.end()); + K.insert(K.end(), Tt.begin(), Tt.end()); + bump_pairs(Ns, K); + bump_pairs(K, Ns); + + // ? with ? + bump_within(Q); + } + + // convert matches -> distances + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + Mmatch(i, j) = L - Mmatch(i, j); + } + Mmatch(i, i) = 0; + } + return Mmatch; +} From 2b728a6a996fb4aeb13de643921e5a3ff5fffbd8 Mon Sep 17 00:00:00 2001 From: Robert Bjornson Date: Fri, 5 Dec 2025 16:05:34 -0500 Subject: [PATCH 03/25] first attempt at proper integration --- R/Functions.R | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/R/Functions.R b/R/Functions.R index fc25f75..ea38e9f 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -1415,15 +1415,11 @@ defineClonesScoper <- function(db, #RDB ## Workers: also dev version - parallel::clusterEvalQ(cluster, { - library(devtools) - devtools::load_all("scoper") # same source tree path - Rcpp::sourceCpp("fastDistExt2C.cpp") + parallel::clusterEvalQ(cluster, { + .libPaths(c("/path/to/devlib", .libPaths())) + library(scoper) }) - #RDB - cat('Loading fastDistExt2C.cpp') - ### export function to clusters if (nproc > 1) { export_functions <- list("passToClustering_lev1", "passToClustering_lev2", "passToClustering_lev3", "passToClustering_lev4", @@ -1893,8 +1889,8 @@ hierarchicalClones_helper <- function(db_gp, workerlog('just before calculate distance matrix') # calculate distance matrix if (method == "nt") { - workerlog('calling fastDistExt2_rcpp') - dist_mtx <- fastDistExt2_rcpp(seqs_unq) + workerlog('calling fastDist_rcpp') + dist_mtx <- fastDist_rcpp(seqs_unq) # RDB #dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, # dist_mat = getDNAMatrix(gap = 0)) From 8fe9e523bd39e71721c7a60a6453e911e923b8aa Mon Sep 17 00:00:00 2001 From: Robert Bjornson Date: Fri, 5 Dec 2025 16:15:21 -0500 Subject: [PATCH 04/25] fixed libPath --- R/Functions.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/Functions.R b/R/Functions.R index ea38e9f..8c28d68 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -1414,9 +1414,10 @@ defineClonesScoper <- function(db, } #RDB + ## Workers: also dev version parallel::clusterEvalQ(cluster, { - .libPaths(c("/path/to/devlib", .libPaths())) + .libPaths(c("/gpfs/gibbs/project/support/rdb9/working/Gabernet/251202/modified/devlib", .libPaths())) library(scoper) }) From 6e58d4efceebaf25adea12439f64d38d339f91d5 Mon Sep 17 00:00:00 2001 From: Robert Bjornson Date: Mon, 8 Dec 2025 09:52:34 -0500 Subject: [PATCH 05/25] added cpp interface files --- R/RcppExports.R | 4 ++++ src/RcppExports.cpp | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/R/RcppExports.R b/R/RcppExports.R index f02781e..70462b1 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -5,3 +5,7 @@ pairwiseMutMatrixRcpp <- function(informative_pos, mutMtx, motifMtx) { .Call(`_scoper_pairwiseMutMatrixRcpp`, informative_pos, mutMtx, motifMtx) } +fastDist_rcpp <- function(seqs) { + .Call(`_scoper_fastDist_rcpp`, seqs) +} + diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 04defa6..03a552b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -23,9 +23,21 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// fastDist_rcpp +IntegerMatrix fastDist_rcpp(CharacterVector seqs); +RcppExport SEXP _scoper_fastDist_rcpp(SEXP seqsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< CharacterVector >::type seqs(seqsSEXP); + rcpp_result_gen = Rcpp::wrap(fastDist_rcpp(seqs)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_scoper_pairwiseMutMatrixRcpp", (DL_FUNC) &_scoper_pairwiseMutMatrixRcpp, 3}, + {"_scoper_fastDist_rcpp", (DL_FUNC) &_scoper_fastDist_rcpp, 1}, {NULL, NULL, 0} }; From 5668d3fd5afaa133c4c8cd9a12a727c32e12c07b Mon Sep 17 00:00:00 2001 From: Robert Bjornson Date: Mon, 8 Dec 2025 10:28:52 -0500 Subject: [PATCH 06/25] more cleanup --- R/Functions.R | 35 +++++++++-------------------------- src/fastDist.cpp | 2 -- 2 files changed, 9 insertions(+), 28 deletions(-) diff --git a/R/Functions.R b/R/Functions.R index 8c28d68..f1f9018 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -60,12 +60,6 @@ setMethod("as.data.frame", c(x="ScoperClones"), #### Internal functions #### -# RDB -workerlog <- function(msg) { - log_file <- paste0("worker_", Sys.getpid(), ".log") - cat("In foreach", "PID", Sys.getpid(), format(Sys.time(), "%Y-%m-%d_%H:%M:%S"), msg, "\n", file = log_file, append = TRUE) -} - # find density gap findGapSmooth <- function(vec) { # bandwidth <- kedd::h.ucv(vec, 4)$h @@ -1413,20 +1407,19 @@ defineClonesScoper <- function(db, stop('Nproc must be positive.') } - #RDB - - ## Workers: also dev version - parallel::clusterEvalQ(cluster, { - .libPaths(c("/gpfs/gibbs/project/support/rdb9/working/Gabernet/251202/modified/devlib", .libPaths())) - library(scoper) - }) + ## RDB + ## This is necessary until the modified code is in the default path + #parallel::clusterEvalQ(cluster, { + # .libPaths(c("/gpfs/gibbs/project/support/rdb9/working/Gabernet/251202/modified/devlib", .libPaths())) + # library(scoper) + #}) ### export function to clusters if (nproc > 1) { export_functions <- list("passToClustering_lev1", "passToClustering_lev2", "passToClustering_lev3", "passToClustering_lev4", "findGapSmooth", "infer", "krnlMtxGenerator", "makeAffinity", "laplacianMtx", "rangeAtoB", "likelihoods", "pairwiseMutions", "pairwiseMutMatrix", - "printVerbose", "logVerbose","stri_join", "workerlog") + "printVerbose", "logVerbose","stri_join") parallel::clusterExport(cluster, export_functions, envir=environment()) doParallel::registerDoParallel(cluster, cores=nproc) } @@ -1436,6 +1429,7 @@ defineClonesScoper <- function(db, db_cloned <- foreach::foreach(gp=1:n_groups, .final=dplyr::bind_rows, .inorder=TRUE, + .packages="scoper", .errorhandling='stop') %dopar% { # ********************************************************************************* # filter each group @@ -1764,8 +1758,6 @@ passToClustering_lev1 <- function (db_gp, iter_max = 1000, nstart = 1000) { ### get model - # RDB - workerlog('in passToClustering_lev1') model <- match.arg(model) ### begin clustering @@ -1776,8 +1768,6 @@ passToClustering_lev1 <- function (db_gp, cdr3 = cdr3, cdr3_col = cdr3_col) } else if (model == "hierarchical") { - #RDB - workerlog('calling hierarchicalClones_helper') clone_results <- hierarchicalClones_helper(db_gp, method = method, linkage = linkage, @@ -1850,8 +1840,6 @@ hierarchicalClones_helper <- function(db_gp, cdr3 = FALSE, cdr3_col = NA, threshold = NULL) { - #RDB - workerlog('in hierarchicalClones_helper') ### get method method <- match.arg(method) @@ -1864,7 +1852,6 @@ hierarchicalClones_helper <- function(db_gp, ### number of sequences n <- nrow(db_gp) - workerlog(paste('in hierarchicalClones_helper, n, method', n, method)) # get sequences if (method == "nt") { seqs <- db_gp[[ifelse(cdr3, cdr3_col, junction)]] @@ -1879,20 +1866,16 @@ hierarchicalClones_helper <- function(db_gp, ind_unq <- df$V1 seqs_unq <- df$seqs - workerlog(paste('in hierarchicalClones_helper', n_unq)) if (n_unq == 1) { return(list("idCluster" = rep(1, n), "n_cluster" = 1, "eigen_vals" = rep(0, n))) } - # RDB - workerlog('just before calculate distance matrix') # calculate distance matrix if (method == "nt") { - workerlog('calling fastDist_rcpp') + # RDB dist_mtx <- fastDist_rcpp(seqs_unq) - # RDB #dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, # dist_mat = getDNAMatrix(gap = 0)) } else if (method == "aa") { diff --git a/src/fastDist.cpp b/src/fastDist.cpp index 3ff9919..a3254a3 100644 --- a/src/fastDist.cpp +++ b/src/fastDist.cpp @@ -13,8 +13,6 @@ inline uint8_t code_char(char c){ IntegerMatrix fastDist_rcpp(CharacterVector seqs) { int N = seqs.size(); - // RDB - printf("In fastDistExt2_rcpp, %d seqs\n", N); if (N == 0) stop("empty input"); std::string s0 = as(seqs[0]); From 102252c3a5319edd1395393f89dd9cd38cc87fdb Mon Sep 17 00:00:00 2001 From: Robert Bjornson Date: Tue, 23 Dec 2025 15:27:50 -0500 Subject: [PATCH 07/25] added countSeqsWithInvalidBases_rcpp() --- src/fastDist.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/fastDist.cpp b/src/fastDist.cpp index a3254a3..0b89d01 100644 --- a/src/fastDist.cpp +++ b/src/fastDist.cpp @@ -9,6 +9,33 @@ inline uint8_t code_char(char c){ } } +// [[Rcpp::export]] +int countSeqsWithInvalidBases_rcpp(CharacterVector seqs) { + int N = seqs.size(); + int bad = 0; + + for (int i = 0; i < N; ++i) { + if (seqs[i] == NA_STRING) { + bad++; + continue; + } + + std::string s = as(seqs[i]); + bool invalid = false; + for (char ch : s) { + char up = (char)std::toupper((unsigned char)ch); + if (code_char(up) == 255) { + invalid = true; + break; + } + } + + if (invalid) bad++; + } + + return bad; +} + // [[Rcpp::export]] IntegerMatrix fastDist_rcpp(CharacterVector seqs) { int N = seqs.size(); From 9c14a2d59db1604e3ee7fb121d08041d5eec51cb Mon Sep 17 00:00:00 2001 From: Robert Bjornson Date: Tue, 23 Dec 2025 15:54:45 -0500 Subject: [PATCH 08/25] updated exports --- R/RcppExports.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/RcppExports.R b/R/RcppExports.R index 70462b1..c4726d7 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -5,6 +5,10 @@ pairwiseMutMatrixRcpp <- function(informative_pos, mutMtx, motifMtx) { .Call(`_scoper_pairwiseMutMatrixRcpp`, informative_pos, mutMtx, motifMtx) } +countSeqsWithInvalidBases_rcpp <- function(seqs) { + .Call(`_scoper_countSeqsWithInvalidBases_rcpp`, seqs) +} + fastDist_rcpp <- function(seqs) { .Call(`_scoper_fastDist_rcpp`, seqs) } From fcd69a38843e6df8e07316095b468a746e0152ea Mon Sep 17 00:00:00 2001 From: Vivian0105 Date: Mon, 9 Mar 2026 11:23:08 -0400 Subject: [PATCH 09/25] Add parameter IUPAC in functions hierarchicalClones, defineClonesScoper and hierarchicalClones_helper --- R/Functions.R | 47 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/R/Functions.R b/R/Functions.R index f1f9018..02c095c 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -881,6 +881,8 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' @param linkage available linkage are \code{"single"}, \code{"average"}, and \code{"complete"}. #' @param normalize method of normalization. The default is \code{"len"}, which divides the distance by the length #' of the sequence group. If \code{"none"} then no normalization if performed. +#' @param IUPAC If \code{TRUE}, use the IUPAC coding system, which includes both standard bases and ambiguity codes. +#' If \code{FALSE} (default), only standard bases, "N" and "?" are allowed in the sequence. #' @param junction character name of the column containing junction sequences. #' Also used to determine sequence length for grouping. #' @param v_call name of the column containing the V-segment allele calls. @@ -974,15 +976,15 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' #' @export hierarchicalClones <- function(db, threshold, method=c("nt", "aa"), linkage=c("single", "average", "complete"), - normalize=c("len", "none"), junction="junction", + normalize=c("len", "none"), IUPAC=FALSE, junction="junction", v_call="v_call", j_call="j_call", clone="clone_id", fields=NULL, cell_id=NULL, locus="locus", only_heavy=TRUE, split_light=FALSE, first=FALSE, cdr3=FALSE, mod3=FALSE, max_n=0, nproc=1, verbose=FALSE, log=NULL, summarize_clones=FALSE, seq_id = "sequence_id") { results <- defineClonesScoper(db = db, threshold = threshold, model = "hierarchical", - method = match.arg(method), linkage = match.arg(linkage), normalize = match.arg(normalize), - junction = junction, v_call = v_call, j_call = j_call, clone = clone, fields = fields, + method = match.arg(method), linkage = match.arg(linkage), normalize = match.arg(normalize), + IUPAC = match.arg(IUPAC), junction = junction, v_call = v_call, j_call = j_call, clone = clone, fields = fields, cell_id = cell_id, locus = locus, only_heavy = only_heavy, split_light = split_light, first = first, cdr3 = cdr3, mod3 = mod3, max_n = max_n, nproc = nproc, verbose = verbose, log = log, summarize_clones = summarize_clones) @@ -1172,7 +1174,7 @@ defineClonesScoper <- function(db, model = c("identical", "hierarchical", "spectral"), method = c("nt", "aa", "novj", "vj"), linkage = c("single", "average", "complete"), normalize = c("len", "none"), - germline = "germline_alignment", sequence = "sequence_alignment", + germline = "germline_alignment", sequence = "sequence_alignment", IUPAC = FALSE, junction = "junction", v_call = "v_call", j_call = "j_call", clone = "clone_id", fields = NULL, cell_id = NULL, locus = NULL, only_heavy = TRUE, split_light = FALSE, targeting_model = NULL, len_limit = NULL, @@ -1238,13 +1240,25 @@ defineClonesScoper <- function(db, } ### Check for invalid characters - valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0)) - .validateSeq <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% valid_chars) } - valid_seq <- sapply(db[[junction]], .validateSeq) - not_valid_seq <- which(!valid_seq) - if (length(not_valid_seq) > 0) { + if (!IUPAC && model == "hierarchical"){ + nonIUPAC_valid <- c("A", "T", "C", "G", "N", "?") + .validcharacters <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% nonIUPAC_valid) } + nonIUPAC_valid_seq <- sapply(head(db[[junction]],1000), .validcharacters) + not_nonIUPAC_valid_seq <- which(!nonIUPAC_valid_seq) + if (length(not_nonIUPAC_valid_seq) > 0) { stop("invalid sequence characters in the ", junction, " column. ", - length(not_valid_seq)," sequence(s) found.", "\n Valid characters are: '", valid_chars, "'") + "\n Valid characters are: '", nonIUPAC_valid, "'") + } + } + else{ + valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0)) + .validateSeq <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% valid_chars) } + valid_seq <- sapply(db[[junction]], .validateSeq) + not_valid_seq <- which(!valid_seq) + if (length(not_valid_seq) > 0) { + stop("invalid sequence characters in the ", junction, " column. ", + length(not_valid_seq)," sequence(s) found.", "\n Valid characters are: '", valid_chars, "'") + } } ### temp columns @@ -1772,6 +1786,7 @@ passToClustering_lev1 <- function (db_gp, method = method, linkage = linkage, normalize = normalize, + IUPAC = match.arg(IUPAC), junction = junction, cdr3 = cdr3, cdr3_col = cdr3_col, @@ -1836,6 +1851,7 @@ hierarchicalClones_helper <- function(db_gp, method = c("nt", "aa"), linkage = c("single", "average", "complete"), normalize = c("len", "none"), + IUPAC = FALSE, junction = "junction", cdr3 = FALSE, cdr3_col = NA, @@ -1874,10 +1890,13 @@ hierarchicalClones_helper <- function(db_gp, # calculate distance matrix if (method == "nt") { - # RDB - dist_mtx <- fastDist_rcpp(seqs_unq) - #dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, - # dist_mat = getDNAMatrix(gap = 0)) + if (! IUPAC){ + dist_mtx <- fastDist_rcpp(seqs_unq) + } + else{ + dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, + dist_mat = getDNAMatrix(gap = 0)) + } } else if (method == "aa") { dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, dist_mat = getAAMatrix(gap = 0)) From 0af042b2bb61724564ac535f3dcd1737c06977fb Mon Sep 17 00:00:00 2001 From: Vivian0105 Date: Mon, 9 Mar 2026 11:47:41 -0400 Subject: [PATCH 10/25] fix small IUPAC bug --- R/Functions.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/Functions.R b/R/Functions.R index 02c095c..02dffda 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -984,7 +984,7 @@ hierarchicalClones <- function(db, threshold, method=c("nt", "aa"), linkage=c("s results <- defineClonesScoper(db = db, threshold = threshold, model = "hierarchical", method = match.arg(method), linkage = match.arg(linkage), normalize = match.arg(normalize), - IUPAC = match.arg(IUPAC), junction = junction, v_call = v_call, j_call = j_call, clone = clone, fields = fields, + IUPAC = IUPAC, junction = junction, v_call = v_call, j_call = j_call, clone = clone, fields = fields, cell_id = cell_id, locus = locus, only_heavy = only_heavy, split_light = split_light, first = first, cdr3 = cdr3, mod3 = mod3, max_n = max_n, nproc = nproc, verbose = verbose, log = log, summarize_clones = summarize_clones) @@ -1786,7 +1786,7 @@ passToClustering_lev1 <- function (db_gp, method = method, linkage = linkage, normalize = normalize, - IUPAC = match.arg(IUPAC), + IUPAC = IUPAC, junction = junction, cdr3 = cdr3, cdr3_col = cdr3_col, From c83316a7aa7a9a2f3b25deb996d5f55cdff5cec3 Mon Sep 17 00:00:00 2001 From: Vivian0105 Date: Mon, 9 Mar 2026 14:59:16 -0400 Subject: [PATCH 11/25] Update DESCRIPTION, add IUPAC to function passToClustering_lev1 --- DESCRIPTION | 2 +- R/Functions.R | 2 ++ src/RcppExports.cpp | 12 ++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index de164e5..e4778c8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -55,7 +55,7 @@ Suggests: knitr, rmarkdown, testthat (>= 3.0.0) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Collate: 'Data.R' 'Scoper.R' diff --git a/R/Functions.R b/R/Functions.R index 02dffda..34e16c1 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -1461,6 +1461,7 @@ defineClonesScoper <- function(db, method = method, linkage = ifelse(model == "hierarchical", linkage, NA), normalize = ifelse(model == "hierarchical", normalize, NA), + IUPAC = ifelse(model == "hierarchical", IUPAC, FALSE), germline = germline, sequence = sequence, junction = junction, @@ -1760,6 +1761,7 @@ passToClustering_lev1 <- function (db_gp, method = c("nt", "aa", "novj", "vj"), linkage = c("single", "average", "complete"), normalize = c("len", "none"), + IUPAC = FALSE, germline = "germline_alignment", sequence = "sequence_alignment", junction = "junction", diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 03a552b..c447597 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -23,6 +23,17 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// countSeqsWithInvalidBases_rcpp +int countSeqsWithInvalidBases_rcpp(CharacterVector seqs); +RcppExport SEXP _scoper_countSeqsWithInvalidBases_rcpp(SEXP seqsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< CharacterVector >::type seqs(seqsSEXP); + rcpp_result_gen = Rcpp::wrap(countSeqsWithInvalidBases_rcpp(seqs)); + return rcpp_result_gen; +END_RCPP +} // fastDist_rcpp IntegerMatrix fastDist_rcpp(CharacterVector seqs); RcppExport SEXP _scoper_fastDist_rcpp(SEXP seqsSEXP) { @@ -37,6 +48,7 @@ END_RCPP static const R_CallMethodDef CallEntries[] = { {"_scoper_pairwiseMutMatrixRcpp", (DL_FUNC) &_scoper_pairwiseMutMatrixRcpp, 3}, + {"_scoper_countSeqsWithInvalidBases_rcpp", (DL_FUNC) &_scoper_countSeqsWithInvalidBases_rcpp, 1}, {"_scoper_fastDist_rcpp", (DL_FUNC) &_scoper_fastDist_rcpp, 1}, {NULL, NULL, 0} }; From 707a7a10e9421241192c50a8c32f66a043f00d3a Mon Sep 17 00:00:00 2001 From: ssnn-airr Date: Wed, 11 Mar 2026 17:40:23 +0100 Subject: [PATCH 12/25] typo --- R/Functions.R | 2 +- tests/testthat/test_clone.R | 60 +++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/R/Functions.R b/R/Functions.R index 34e16c1..c4cf197 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -536,7 +536,7 @@ prepare_db <- function(db, dplyr::filter(stringi::stri_count(!!rlang::sym(junction), regex = "[^ATCG]") <= max_n) n_after <- nrow(db) if ( n_before > n_after) { - warning(paste("Removed", n_before - n_after, "sequences with non ATCG charachters.")) + warning(paste("Removed", n_before - n_after, "sequences with non ATCG characters.")) } } else { n_rmv_N <- 0 diff --git a/tests/testthat/test_clone.R b/tests/testthat/test_clone.R index 2293311..22909d9 100644 --- a/tests/testthat/test_clone.R +++ b/tests/testthat/test_clone.R @@ -70,6 +70,64 @@ test_that("Test hierarchicalClones", { } }) +#### clone - hierarchicalClones with IUPAC parameter #### + +test_that("Test hierarchicalClones with IUPAC parameter", { + # Create test data with IUPAC ambiguity codes + db_iupac <- data.frame( + sequence_id = paste0("seq", 1:10), + v_call = rep("IGHV1-1*01", 10), + j_call = rep("IGHJ1*01", 10), + # Use IUPAC ambiguity codes: R(A/G), Y(C/T), W(A/T), S(C/G), M(A/C), K(G/T) + junction = c( + "TGTGCRAGCTACTGG", # R = A or G + "TGTGCRAGCTACTGG", # identical to seq1 + "TGTGCAAGCTACTGG", # standard bases only, similar to seq1/2 + "TGTGCYAGCTACTGG", # Y = C or T + "TGTGCYAGCTACTGG", # identical to seq4 + "TGTGCMAGCTACTGG", # M = A or C + "TGTGCWAGCTACTGG", # W = A or T + "TGTGCWAGCTACTGG", # identical to seq7 + "TGTGCAAGCTAYTGG", # Y in different position + "TGTGCAAGCTKCTGG" # K = G or T + ), + locus = rep("IGH", 10), + stringsAsFactors = FALSE + ) + + # Test 1: IUPAC=FALSE should fail with IUPAC characters + expect_error( + hierarchicalClones(db_iupac, threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = FALSE, + summarize_clones = FALSE), + "invalid sequence characters" + ) + + # Test 2: IUPAC=TRUE should run with IUPAC characters + expect_warning( + expect_message( + db_result <- hierarchicalClones(db_iupac, threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = TRUE, + summarize_clones = FALSE), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ), + "Removed 9 sequences with non ATCG charachters." + ) + + # Check that clones were assigned + expect_true("clone_id" %in% colnames(db_result)) + expect_true(all(!is.na(db_result$clone_id))) + # Sequences with IUPAC codes should be removed, leaving only seq3 which has standard bases + expect_equal(nrow(db_result), 1) + +}) + #### clone - spectralClones - novj method #### test_that("Test spectralClones - novj", { @@ -558,3 +616,5 @@ test_that("Testing split_light warnings for all cloning mehtods", { cell_id = "cell_id", split_light = TRUE)) expect_equal(db_nsplit[['clone_id']], db_split[['clone_id']]) }) + + From 653df606dbb32b34baf36781f30b0a4548cab842 Mon Sep 17 00:00:00 2001 From: ssnn-airr Date: Wed, 11 Mar 2026 19:14:45 +0100 Subject: [PATCH 13/25] tests and doc --- R/Functions.R | 83 +++++++-- tests/testthat/test_clone.R | 326 ++++++++++++++++++++++++++++++++---- 2 files changed, 365 insertions(+), 44 deletions(-) diff --git a/R/Functions.R b/R/Functions.R index c4cf197..9bf0ffe 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -881,8 +881,16 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' @param linkage available linkage are \code{"single"}, \code{"average"}, and \code{"complete"}. #' @param normalize method of normalization. The default is \code{"len"}, which divides the distance by the length #' of the sequence group. If \code{"none"} then no normalization if performed. -#' @param IUPAC If \code{TRUE}, use the IUPAC coding system, which includes both standard bases and ambiguity codes. -#' If \code{FALSE} (default), only standard bases, "N" and "?" are allowed in the sequence. +#' @param IUPAC If \code{TRUE}, allows sequences with IUPAC codes to pass validation +#' and be used in clustering with IUPAC-aware distance calculation +#' (via \code{alakazam::pairwiseDist}). If \code{FALSE} (default), uses fast Hamming distance +#' (via \code{fastDist_rcpp}) and only allows standard bases (A, T, C, G), N, and ? +#' in sequences. IUPAC codes will cause validation errors if \code{IUPAC=FALSE}. +#' Note: This parameter is only available for \code{hierarchicalClones} with +#' \code{method="nt"}. \code{identicalClones} and +#' \code{spectralClones} always behave as if \code{IUPAC=FALSE}. This parameter controls +#' the distance calculation method, not sequence filtering. See \code{max_n} for +#' filtering sequences by character content. #' @param junction character name of the column containing junction sequences. #' Also used to determine sequence length for grouping. #' @param v_call name of the column containing the V-segment allele calls. @@ -915,11 +923,16 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' less than 7 nucleotides. #' @param mod3 if \code{TRUE} removes records with a \code{junction} length that is not divisible by #' 3 in nucleotide space. -#' @param max_n The maximum number of degenerate characters to permit in the junction sequence -#' before excluding the record from clonal assignment. Note, with -#' \code{linkage="single"} non-informative positions can create artifactual -#' links between unrelated sequences. Use with caution. -#' Default is set to be zero. Set it as \code{"NULL"} for no action. +#' @param max_n The maximum number of non-ATCG characters (degenerate positions) to permit +#' in the junction sequence before excluding the record from clonal assignment. +#' This filters sequences based on the regex pattern \code{"[^ATCG]"}, which +#' includes N, ?, and IUPAC ambiguity codes (R, Y, W, S, M, K, etc.) +#' Note: \code{max_n} operates independently +#' from \code{IUPAC} - it controls filtering by character count, while +#' \code{IUPAC} controls validation and distance calculation method. +#' With \code{linkage="single"}, non-informative positions can create +#' artifactual links between unrelated sequences. Use with caution. +#' Default is 0 (ATCG-only). Set to \code{NULL} for no filtering. #' @param nproc number of cores to distribute the function over. #' @param verbose if \code{TRUE} prints out a summary of each step cloning process. #' if \code{FALSE} (default) process cloning silently. @@ -939,6 +952,53 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' clonal assignment summary information and a modified input \code{db} in the \code{db} slot that #' contains clonal identifiers in the specified \code{clone} column. #' +#' @section IUPAC and max_n parameters: +#' Note: The \code{IUPAC} parameter is only available for \code{hierarchicalClones} with +#' \code{method="nt"} (nucleotide mode). It is ignored when \code{method="aa"} (amino acid mode). +#' The \code{identicalClones} and \code{spectralClones} functions always operate as if +#' \code{IUPAC=FALSE} (standard bases plus N and ? only). The \code{max_n} parameter +#' is available for all cloning functions. +#' +#' The \code{IUPAC} and \code{max_n} parameters serve complementary but distinct purposes: +#' +#' \code{IUPAC} controls: +#' \itemize{ +#' \item Sequence validation (which characters are allowed) +#' \item Distance calculation method (fast Hamming vs IUPAC-aware scoring) +#' } +#' +#' \code{max_n} controls: +#' \itemize{ +#' \item Sequence filtering by counting non-ATCG characters +#' } +#' +#' Common use cases (for \code{method="nt"}): +#' \itemize{ +#' \item \code{IUPAC=FALSE, max_n=0}: Strict ATCG-only mode with fast distance calculation. +#' Rejects any sequences with IUPAC codes, N, or ?. Fastest option for high-quality data. +#' \item \code{IUPAC=FALSE, max_n=1+}: Allows sequences with limited N/? characters while +#' rejecting IUPAC ambiguity codes at validation. Uses fast Hamming distance. Note: IUPAC +#' codes are rejected during validation (before max_n filtering), so max_n only affects +#' sequences that passed validation with N or ? characters. Useful for data with low-quality +#' or masked positions but no experimental ambiguity codes. +#' \item \code{IUPAC=TRUE, max_n=0}: Uses IUPAC-aware distance but filters out all non-ATCG +#' characters anyway. Only standard bases remain after filtering. +#' \item \code{IUPAC=TRUE, max_n=1+}: Allows sequences with limited ambiguity codes and uses +#' proper IUPAC-aware distance calculation. Slower but handles biological ambiguity correctly. +#' Set max_n to the maximum number of ambiguous positions per sequence you want to tolerate +#' (counts all non-ATCG: N, ?, and IUPAC codes). +#' \item \code{IUPAC=TRUE, max_n=NULL}: Process all sequences with IUPAC codes regardless of the +#' number of ambiguous positions. Uses IUPAC-aware distance calculation with no filtering. +#' Most permissive option for data with extensive IUPAC ambiguity codes. +#' } +#' +#' Note: Validation occurs before filtering. When \code{IUPAC=FALSE}, sequences containing IUPAC +#' ambiguity codes (R, Y, W, S, M, K, etc.) will fail validation and be rejected before the +#' \code{max_n} filtering step. Therefore, with \code{IUPAC=FALSE, max_n > 0}, only sequences +#' with N and ? characters (not IUPAC codes) can pass validation and be filtered by \code{max_n}. +#' The \code{max_n} parameter always counts using regex \code{"[^ATCG]"}, but \code{IUPAC} determines +#' which non-ATCG characters are allowed to reach the filtering step. +#' #' @section Single-cell data: #' To invoke single-cell mode the \code{cell_id} argument must be specified and the \code{locus} #' column must be correct. Otherwise, clustering will be performed with bulk sequencing assumptions, @@ -1054,9 +1114,12 @@ hierarchicalClones <- function(db, threshold, method=c("nt", "aa"), linkage=c("s #' less than 7 nucleotides. #' @param mod3 if \code{TRUE} removes records with a \code{junction} length that is not divisible by #' 3 in nucleotide space. -#' @param max_n the maximum number of degenerate characters to permit in the junction sequence before excluding the -#' record from clonal assignment. Default is set to be zero. Set it as \code{"NULL"} -#' for no action. +#' @param max_n The maximum number of degenerate characters to permit in the junction sequence before excluding the +#' record from clonal assignment. Counts non-ATCG characters using regex \code{"[^ATCG]"}, which +#' includes N, ?, and IUPAC ambiguity codes. Note: \code{spectralClones} does not support the +#' \code{IUPAC} parameter and always validates sequences as if \code{IUPAC=FALSE} (only allows +#' standard bases A,T,C,G plus N and ?). IUPAC ambiguity codes will cause validation errors. +#' Default is 0. Set to \code{NULL} for no filtering. #' @param threshold the supervising cut-off to enforce an upper-limit distance for clonal grouping. #' A numeric value between (0,1). #' @param base_sim required similarity cut-off for sequences in equal distances from each other. diff --git a/tests/testthat/test_clone.R b/tests/testthat/test_clone.R index 22909d9..0c0ff4d 100644 --- a/tests/testthat/test_clone.R +++ b/tests/testthat/test_clone.R @@ -73,59 +73,317 @@ test_that("Test hierarchicalClones", { #### clone - hierarchicalClones with IUPAC parameter #### test_that("Test hierarchicalClones with IUPAC parameter", { + # IUPAC and max_n serve different purposes: + # - IUPAC: Controls (1) validation (which characters allowed) and + # (2) distance calculation method (fast Hamming vs IUPAC-aware scoring) + # - max_n: Filters sequences by counting non-ATCG characters using regex "[^ATCG]" + # (includes N, ?, and all IUPAC codes - whatever passed validation) + # + # Processing order: Validation -> Filtering -> Distance calculation + # + # Key use cases tested: + # 1. IUPAC=FALSE, max_n=0: Strict ATCG-only mode with fast distance calculation + # 2. IUPAC=TRUE, max_n=0: IUPAC-aware distance (but filters out IUPAC seqs anyway) + # 3. IUPAC=TRUE, max_n=1+: Main use case - IUPAC codes with proper scoring + # 4. IUPAC=FALSE, max_n=1+: Allow N/? but reject IUPAC codes (validation blocks them) + # 5. IUPAC=FALSE/TRUE with standard bases: Backward compatibility (same results) + # Create test data with IUPAC ambiguity codes + # With threshold=0.15 and length 15, distance > 0.15 requires >=3 mutations (3/15=0.2) db_iupac <- data.frame( - sequence_id = paste0("seq", 1:10), - v_call = rep("IGHV1-1*01", 10), - j_call = rep("IGHJ1*01", 10), + sequence_id = paste0("seq", 1:12), + v_call = rep("IGHV1-1*01", 12), + j_call = rep("IGHJ1*01", 12), # Use IUPAC ambiguity codes: R(A/G), Y(C/T), W(A/T), S(C/G), M(A/C), K(G/T) junction = c( - "TGTGCRAGCTACTGG", # R = A or G - "TGTGCRAGCTACTGG", # identical to seq1 - "TGTGCAAGCTACTGG", # standard bases only, similar to seq1/2 - "TGTGCYAGCTACTGG", # Y = C or T - "TGTGCYAGCTACTGG", # identical to seq4 - "TGTGCMAGCTACTGG", # M = A or C - "TGTGCWAGCTACTGG", # W = A or T - "TGTGCWAGCTACTGG", # identical to seq7 - "TGTGCAAGCTAYTGG", # Y in different position - "TGTGCAAGCTKCTGG" # K = G or T + # Clone 1: sequences similar to TGTGCAAGCTACTGG (0-1 mutations apart) + "TGTGCRAGCTACTGG", # R = A or G at pos 5 + "TGTGCRAGCTACTGG", # identical to seq1 + "TGTGCAAGCTACTGG", # standard bases only + "TGTGCYAGCTACTGG", # Y = C or T at pos 5, 1 mutation from seq3 + "TGTGCYAGCTACTGG", # identical to seq4 + "TGTGCMAGCTACTGG", # M = A or C at pos 5, 1 mutation from seq3 + "TGTGCWAGCTACTGG", # W = A or T at pos 5, 1 mutation from seq3 + + # Clone 2: sequences similar to ACGTTTGGCCAAACC (3+ mutations from Clone 1) + "ACGTTTGGCCAAACC", # standard bases, distant from Clone 1 + "ACGTTTGGCCAAACC", # identical to seq8 + "ACGKTTGGCCAAACC", # K = G or T at pos 4, 1 mutation from seq8 + "ACGRTTGGCCAAACC", # R = A or G at pos 4, 1 mutation from seq8 + "ACGTTTSGCCAAACC" # S = C or G at pos 7, 1 mutation from seq8 + ), + locus = rep("IGH", 12), + stringsAsFactors = FALSE + ) + + # Test 1: IUPAC=FALSE should reject IUPAC characters at validation stage + # Validation (IUPAC=FALSE): Only allows A,T,C,G,N,? - REJECTS IUPAC codes (R,Y,W,S,M,K,etc) + # Result: Sequences with IUPAC codes fail validation and raise error before any filtering + # Use case: Strict ATCG-only mode with fast Hamming distance (fastDist_rcpp) + expect_error( + hierarchicalClones(db_iupac, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = FALSE, # Use fast Hamming distance for ATCG only + summarize_clones = FALSE + ), + "invalid sequence characters" + ) + + # Test 2: IUPAC=TRUE with max_n=0 - validates with IUPAC but filters them out + # 1. Validation (IUPAC=TRUE): Allows standard bases, N, ?, and IUPAC codes - all sequences pass + # 2. Filtering (max_n=0): Counts non-ATCG using "[^ATCG]" - removes all sequences with any non-ATCG + # Result: Only seq3, seq8 and seq9 (standard bases only) remain after filtering + expect_warning( + expect_message( + db_result2 <- hierarchicalClones(db_iupac, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = TRUE, # Use IUPAC-aware distance calculation + max_n = 0, # Filter out sequences with ANY non-ATCG chars + summarize_clones = FALSE + ), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ), + "Removed 9 sequences with non ATCG characters." + ) + + # Only 3 sequences with standard bases remain + expect_equal(nrow(db_result2), 3) + expect_equal(sort(db_result2$sequence_id), c("seq3", "seq8", "seq9")) + # They are in different clones (distant sequences) + expect_equal(length(unique(db_result2$clone_id)), 2) + + # Test 3a: Create dataset with N characters to distinguish from IUPAC codes + db_with_n <- db_iupac + db_with_n[13,] <- NA + db_with_n$junction[13] <- "TGTGCNAGCTACTGG" # Add seq13 with N + db_with_n$sequence_id <- c(db_with_n$sequence_id[1:12], "seq13") + db_with_n$v_call <- c(db_with_n$v_call[1:12], "IGHV1-1*01") + db_with_n$j_call <- c(db_with_n$j_call[1:12], "IGHJ1*01") + db_with_n$locus <- c(db_with_n$locus[1:12], "IGH") + + # Test 3b: IUPAC=TRUE with max_n=1 - allows up to 1 non-ATCG character per sequence + # This is the main use case: proper IUPAC distance calculation with ambiguity codes + # 1. Validation (IUPAC=TRUE): Allows standard bases (A,T,C,G) plus N, ?, and IUPAC codes (R,Y,W,S,M,K,etc) + # 2. Filtering (max_n=1): Counts non-ATCG using "[^ATCG]" - keeps sequences with ≤1 such character + # 3. Distance: Uses IUPAC-aware scoring (alakazam::pairwiseDist) for proper ambiguity handling + expect_message( + db_result3 <- hierarchicalClones(db_with_n, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = TRUE, # Use IUPAC-aware distance calculation + max_n = 1, # Allow up to 1 non-ATCG character (IUPAC or N) + summarize_clones = FALSE + ), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ) + + # All 13 sequences kept (each has ≤1 non-ATCG character) + expect_equal(nrow(db_result3), 13) + + # Verify that we have 2 different clusters (Clone 1 and Clone 2) + n_clones <- length(unique(db_result3$clone_id)) + expect_equal(n_clones, 2) + + # Identical sequences with IUPAC codes should cluster together + # Note: rows may be reordered, so reference by sequence_id + get_clone <- function(seq_id) db_result3$clone_id[db_result3$sequence_id == seq_id] + + expect_equal(get_clone("seq1"), get_clone("seq2")) # seq1=seq2 (both R) + expect_equal(get_clone("seq4"), get_clone("seq5")) # seq4=seq5 (both Y) + expect_equal(get_clone("seq8"), get_clone("seq9")) # seq8=seq9 (standard) + + # seq13 (with N) should cluster with Clone 1 (similar to seq3) + expect_equal(get_clone("seq3"), get_clone("seq13")) + + # Verify two distinct clones based on biological distance + clone1_id <- get_clone("seq3") # seq3 in Clone 1 + clone2_id <- get_clone("seq8") # seq8 in Clone 2 + expect_true(clone1_id != clone2_id) + expect_true(all(sapply(paste0("seq", 1:7), get_clone) == clone1_id)) # Clone 1: seq1-7 + expect_true(all(sapply(paste0("seq", 8:12), get_clone) == clone2_id)) # Clone 2: seq8-12 + expect_true(get_clone("seq13") == clone1_id) # seq13 joins Clone 1 + + # Test 4: IUPAC=FALSE with max_n=1+ - allows N/? but rejects other IUPAC codes + # This demonstrates the two-stage process: validation -> filtering + # 1. Validation (IUPAC=FALSE): Allows A,T,C,G,N,? but REJECTS IUPAC codes (R,Y,W,S,M,K,etc) + # 2. Filtering (max_n): Counts non-ATCG using "[^ATCG]" regex on sequences that passed validation + # Use case: Tolerate low-quality positions (N,?) with fast Hamming distance, but no ambiguity codes + db_with_n_only <- data.frame( + sequence_id = paste0("seq", 1:5), + v_call = rep("IGHV1-1*01", 5), + j_call = rep("IGHJ1*01", 5), + junction = c( + "TGTGCAAGCTACTGG", # standard bases only + "TGTGCNAGCTACTGG", # 1 N (allowed with max_n=1) + "TGTGC?AGCTACTGG", # 1 ? (allowed with max_n=1) + "TGTGCNNGCTACTGG", # 2 N's (filtered with max_n=1) + "TGTGCRAGCTACTGG" # IUPAC R code (should fail validation) ), - locus = rep("IGH", 10), + locus = rep("IGH", 5), stringsAsFactors = FALSE ) - # Test 1: IUPAC=FALSE should fail with IUPAC characters + # Should reject at validation stage due to IUPAC code (R) in seq5 expect_error( - hierarchicalClones(db_iupac, threshold = 0.15, - method = "nt", linkage = "single", - junction = "junction", - v_call = "v_call", j_call = "j_call", - IUPAC = FALSE, - summarize_clones = FALSE), + hierarchicalClones(db_with_n_only, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = FALSE, # Reject IUPAC codes + max_n = 1, # Allow up to 1 N or ? + summarize_clones = FALSE + ), "invalid sequence characters" ) - # Test 2: IUPAC=TRUE should run with IUPAC characters + # Test with only N/? (no IUPAC codes) - should succeed + db_n_only_valid <- db_with_n_only[1:4, ] # Remove seq5 with IUPAC code + expect_warning( expect_message( - db_result <- hierarchicalClones(db_iupac, threshold = 0.15, - method = "nt", linkage = "single", - junction = "junction", - v_call = "v_call", j_call = "j_call", - IUPAC = TRUE, - summarize_clones = FALSE), + db_result5 <- hierarchicalClones(db_n_only_valid, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = FALSE, # Use fast Hamming distance + max_n = 1, # Allow up to 1 N or ? + summarize_clones = FALSE + ), "Running defineClonesScoper in bulk mode and only keep heavy chains" ), - "Removed 9 sequences with non ATCG charachters." + "Removed 1 sequences with non ATCG characters." # seq4 with 2 N's filtered ) - # Check that clones were assigned - expect_true("clone_id" %in% colnames(db_result)) - expect_true(all(!is.na(db_result$clone_id))) - # Sequences with IUPAC codes should be removed, leaving only seq3 which has standard bases - expect_equal(nrow(db_result), 1) + # Should keep seq1, seq2, seq3 (0-1 non-ATCG characters each) + expect_equal(nrow(db_result5), 3) + expect_equal(sort(db_result5$sequence_id), c("seq1", "seq2", "seq3")) + # All are similar sequences, should be in same clone + expect_equal(length(unique(db_result5$clone_id)), 1) + + # Test 5: Standard bases work with both IUPAC=TRUE and IUPAC=FALSE + # This demonstrates backward compatibility and performance consideration: + # - IUPAC=FALSE: Uses fast Hamming distance (fastDist_rcpp) - faster + # - IUPAC=TRUE: Uses IUPAC-aware scoring (alakazam::pairwiseDist) - slower but handles ambiguity + # For standard bases (ATCG), results should be identical + db_standard <- ExampleDb[1:50, ] + + # Verify db_standard contains only standard bases (not IUPAC ambiguity codes) + standard_chars <- c("A", "T", "C", "G", "N", "?") + .is_standard <- function(x) { + all(unique(strsplit(x, "")[[1]]) %in% standard_chars) + } + all_standard <- sapply(db_standard$junction, .is_standard) + expect_true(all(all_standard), + info = "ExampleDb contains IUPAC ambiguity codes - update test data or test expectations" + ) + expect_message( + db_result_false <- hierarchicalClones(db_standard, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = FALSE, # Fast Hamming distance for ATCG only + summarize_clones = FALSE + ), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ) + + expect_message( + db_result_true <- hierarchicalClones(db_standard, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = TRUE, # IUPAC-aware distance (slower but handles ambiguity) + summarize_clones = FALSE + ), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ) + + # Results should be identical for standard bases + # (IUPAC just changes the distance calculation method, not the result) + expect_equal(nrow(db_result_false), nrow(db_result_true)) + expect_true(all(!is.na(db_result_false$clone_id))) + expect_true(all(!is.na(db_result_true$clone_id))) + # Clone assignments should be the same (though IDs may differ) + expect_equal(length(unique(db_result_false$clone_id)), + length(unique(db_result_true$clone_id))) + + # Test 6: IUPAC=TRUE with max_n=NULL - no filtering, process all sequences + # This is the most permissive option for IUPAC data + # 1. Validation (IUPAC=TRUE): Allows standard bases, N, ?, and all IUPAC codes + # 2. Filtering (max_n=NULL): No filtering - all sequences kept regardless of character count + # 3. Distance: Uses IUPAC-aware scoring for proper ambiguity handling + # Use case: Data with extensive IUPAC codes where you want to process everything + + # Create test data with multiple IUPAC codes per sequence + db_iupac_multi <- data.frame( + sequence_id = paste0("seq", 1:8), + v_call = rep("IGHV1-1*01", 8), + j_call = rep("IGHJ1*01", 8), + junction = c( + # Clone 1: sequences with multiple IUPAC codes, similar to each other + "TGTGCRAGCTRYCTGG", # R at pos 6, R at pos 12, Y at pos 13 (3 non-ATCG) + "TGTGCRAGCTRYCTGG", # identical to seq1 + "TGTGCRAGCTWYNMGG", # R at pos 6, W at pos 12, Y at pos 13, N at pos 14, M at pos 15 (5 non-ATCG) + "TGTGCAAGCTACTGG", # standard bases only (0 non-ATCG) + + # Clone 2: sequences distant from Clone 1, also with IUPAC codes + "ACGKTTRGCCNNNYYY", # K, R, 3 N's, 3 Y's (8 non-ATCG characters!) + "ACGKTTRGCCNNNYYY", # identical to seq5 + "ACGTTTGGCCAAACC", # standard bases only (0 non-ATCG) + "ACGKTTSGCCNNNYYY" # K, S, 3 N's, 3 Y's (8 non-ATCG) + ), + locus = rep("IGH", 8), + stringsAsFactors = FALSE + ) + + # With max_n=NULL, all sequences should be kept regardless of IUPAC code count + expect_message( + db_result6 <- hierarchicalClones(db_iupac_multi, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = TRUE, # Use IUPAC-aware distance calculation + max_n = NULL, # No filtering - process all sequences + summarize_clones = FALSE + ), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ) + + # All 8 sequences kept (no filtering applied) + expect_equal(nrow(db_result6), 8) + expect_equal(sort(db_result6$sequence_id), paste0("seq", 1:8)) + + # Should have 2 clones based on biological distance + expect_equal(length(unique(db_result6$clone_id)), 2) + + # Helper function to get clone ID by sequence ID + get_clone6 <- function(seq_id) db_result6$clone_id[db_result6$sequence_id == seq_id] + + # Verify identical sequences cluster together despite many IUPAC codes + expect_equal(get_clone6("seq1"), get_clone6("seq2")) # both identical with 3 IUPAC + expect_equal(get_clone6("seq5"), get_clone6("seq6")) # both identical with 8 IUPAC + + # Verify Clone 1 (seq1-4) and Clone 2 (seq5-8) are distinct + clone1_id <- get_clone6("seq1") + clone2_id <- get_clone6("seq5") + expect_true(clone1_id != clone2_id) + expect_true(all(sapply(paste0("seq", 1:4), get_clone6) == clone1_id)) + expect_true(all(sapply(paste0("seq", 5:8), get_clone6) == clone2_id)) + }) #### clone - spectralClones - novj method #### From ef421dc2a873bf77a024f87caa04b957bf383bc1 Mon Sep 17 00:00:00 2001 From: ssnn-airr Date: Thu, 12 Mar 2026 10:16:35 +0100 Subject: [PATCH 14/25] updated docs --- R/Functions.R | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/R/Functions.R b/R/Functions.R index 9bf0ffe..bd750c0 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -885,8 +885,7 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' and be used in clustering with IUPAC-aware distance calculation #' (via \code{alakazam::pairwiseDist}). If \code{FALSE} (default), uses fast Hamming distance #' (via \code{fastDist_rcpp}) and only allows standard bases (A, T, C, G), N, and ? -#' in sequences. IUPAC codes will cause validation errors if \code{IUPAC=FALSE}. -#' Note: This parameter is only available for \code{hierarchicalClones} with +#' in sequences. Note: This parameter is only available for \code{hierarchicalClones} with #' \code{method="nt"}. \code{identicalClones} and #' \code{spectralClones} always behave as if \code{IUPAC=FALSE}. This parameter controls #' the distance calculation method, not sequence filtering. See \code{max_n} for @@ -925,8 +924,6 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' 3 in nucleotide space. #' @param max_n The maximum number of non-ATCG characters (degenerate positions) to permit #' in the junction sequence before excluding the record from clonal assignment. -#' This filters sequences based on the regex pattern \code{"[^ATCG]"}, which -#' includes N, ?, and IUPAC ambiguity codes (R, Y, W, S, M, K, etc.) #' Note: \code{max_n} operates independently #' from \code{IUPAC} - it controls filtering by character count, while #' \code{IUPAC} controls validation and distance calculation method. @@ -976,14 +973,14 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' \itemize{ #' \item \code{IUPAC=FALSE, max_n=0}: Strict ATCG-only mode with fast distance calculation. #' Rejects any sequences with IUPAC codes, N, or ?. Fastest option for high-quality data. -#' \item \code{IUPAC=FALSE, max_n=1+}: Allows sequences with limited N/? characters while +#' \item \code{IUPAC=FALSE, max_n>0}: Allows sequences with limited N/? characters while #' rejecting IUPAC ambiguity codes at validation. Uses fast Hamming distance. Note: IUPAC #' codes are rejected during validation (before max_n filtering), so max_n only affects #' sequences that passed validation with N or ? characters. Useful for data with low-quality #' or masked positions but no experimental ambiguity codes. #' \item \code{IUPAC=TRUE, max_n=0}: Uses IUPAC-aware distance but filters out all non-ATCG #' characters anyway. Only standard bases remain after filtering. -#' \item \code{IUPAC=TRUE, max_n=1+}: Allows sequences with limited ambiguity codes and uses +#' \item \code{IUPAC=TRUE, max_n>0}: Allows sequences with limited ambiguity codes and uses #' proper IUPAC-aware distance calculation. Slower but handles biological ambiguity correctly. #' Set max_n to the maximum number of ambiguous positions per sequence you want to tolerate #' (counts all non-ATCG: N, ?, and IUPAC codes). From 308e685f1fd1ed79276407f97d7d7c123d8c97a3 Mon Sep 17 00:00:00 2001 From: ssnn-airr Date: Tue, 17 Mar 2026 14:22:01 +0100 Subject: [PATCH 15/25] fix test data --- tests/testthat/test_clone.R | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/testthat/test_clone.R b/tests/testthat/test_clone.R index 0c0ff4d..378931b 100644 --- a/tests/testthat/test_clone.R +++ b/tests/testthat/test_clone.R @@ -337,12 +337,11 @@ test_that("Test hierarchicalClones with IUPAC parameter", { "TGTGCRAGCTRYCTGG", # R at pos 6, R at pos 12, Y at pos 13 (3 non-ATCG) "TGTGCRAGCTRYCTGG", # identical to seq1 "TGTGCRAGCTWYNMGG", # R at pos 6, W at pos 12, Y at pos 13, N at pos 14, M at pos 15 (5 non-ATCG) - "TGTGCAAGCTACTGG", # standard bases only (0 non-ATCG) - + "TGTGCAAGCTACCTGG", # standard bases only (0 non-ATCG) # Clone 2: sequences distant from Clone 1, also with IUPAC codes "ACGKTTRGCCNNNYYY", # K, R, 3 N's, 3 Y's (8 non-ATCG characters!) "ACGKTTRGCCNNNYYY", # identical to seq5 - "ACGTTTGGCCAAACC", # standard bases only (0 non-ATCG) + "ACGGTTAGCCAAACCC", # standard bases only (0 non-ATCG) "ACGKTTSGCCNNNYYY" # K, S, 3 N's, 3 Y's (8 non-ATCG) ), locus = rep("IGH", 8), @@ -723,7 +722,7 @@ test_that("Test hierarchicalClones only_heavy and first", { # expect sequences from cells 1 and 3 in clone 1, and # sequences from cell 2 in clone 2 clones_split_F_th0@db %>% - mutate(expected_clone_id = dplyr::case_when( + dplyr::mutate(expected_clone_id = dplyr::case_when( cell_id %in% c(1, 3) ~ "1", cell_id %in% c(2) ~ "2", TRUE ~ NA_character_ @@ -800,7 +799,7 @@ test_that("Test hierarchicalClones only_heavy and first", { # sequences from cell 2 in clone 2, regardless of # light chains clones_split_F_th0_2l@db %>% - mutate(expected_clone_id = dplyr::case_when( + dplyr::mutate(expected_clone_id = dplyr::case_when( cell_id %in% c(1, 3) ~ "1", cell_id %in% c(2) ~ "2", TRUE ~ NA_character_ From b17f0543a7d12b73083e947f251eed8536aac866 Mon Sep 17 00:00:00 2001 From: ssnn-airr Date: Thu, 19 Mar 2026 09:34:25 +0100 Subject: [PATCH 16/25] updated docs --- R/Functions.R | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/R/Functions.R b/R/Functions.R index bd750c0..bd3910e 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -886,9 +886,8 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' (via \code{alakazam::pairwiseDist}). If \code{FALSE} (default), uses fast Hamming distance #' (via \code{fastDist_rcpp}) and only allows standard bases (A, T, C, G), N, and ? #' in sequences. Note: This parameter is only available for \code{hierarchicalClones} with -#' \code{method="nt"}. \code{identicalClones} and -#' \code{spectralClones} always behave as if \code{IUPAC=FALSE}. This parameter controls -#' the distance calculation method, not sequence filtering. See \code{max_n} for +#' \code{method="nt"}. This parameter controls validation and distance +#' calculation method, not sequence filtering. See \code{max_n} for #' filtering sequences by character content. #' @param junction character name of the column containing junction sequences. #' Also used to determine sequence length for grouping. @@ -972,18 +971,20 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' Common use cases (for \code{method="nt"}): #' \itemize{ #' \item \code{IUPAC=FALSE, max_n=0}: Strict ATCG-only mode with fast distance calculation. -#' Rejects any sequences with IUPAC codes, N, or ?. Fastest option for high-quality data. -#' \item \code{IUPAC=FALSE, max_n>0}: Allows sequences with limited N/? characters while -#' rejecting IUPAC ambiguity codes at validation. Uses fast Hamming distance. Note: IUPAC -#' codes are rejected during validation (before max_n filtering), so max_n only affects -#' sequences that passed validation with N or ? characters. Useful for data with low-quality +#' Will throw an error and exit if sequences with IUPAC codes not A, T, C, G, N, or ? are detected +#' among the first 1000 sequences. Fastest option for high-quality data. +#' \item \code{IUPAC=FALSE, max_n>0}: Will throw an error and exit if sequences with IUPAC +#' codes not A, T, C, G, N, or ? are detected among the first 1000 sequences. Allows sequences +#' with limited N/? characters in distance calculation, using fast Hamming distance. +#' Note: IUPAC codes are rejected during validation (before max_n filtering), so max_n only +#' affects sequences that passed validation with N or ? characters. Useful for data with low-quality #' or masked positions but no experimental ambiguity codes. #' \item \code{IUPAC=TRUE, max_n=0}: Uses IUPAC-aware distance but filters out all non-ATCG #' characters anyway. Only standard bases remain after filtering. #' \item \code{IUPAC=TRUE, max_n>0}: Allows sequences with limited ambiguity codes and uses #' proper IUPAC-aware distance calculation. Slower but handles biological ambiguity correctly. #' Set max_n to the maximum number of ambiguous positions per sequence you want to tolerate -#' (counts all non-ATCG: N, ?, and IUPAC codes). +#' (counts all non-ATCG: N, ?, and other IUPAC codes). #' \item \code{IUPAC=TRUE, max_n=NULL}: Process all sequences with IUPAC codes regardless of the #' number of ambiguous positions. Uses IUPAC-aware distance calculation with no filtering. #' Most permissive option for data with extensive IUPAC ambiguity codes. From ba063d6dfd16271e9fa6ecfca8282ccf2d136307 Mon Sep 17 00:00:00 2001 From: ssnn-airr Date: Thu, 19 Mar 2026 09:53:38 +0100 Subject: [PATCH 17/25] added Huimin to contributors --- DESCRIPTION | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e4778c8..f4530cb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,14 +1,16 @@ Package: scoper Type: Package Version: 1.3.1.999 -Date: 2025-02-26 +Date: 2026-03-17 Authors@R: c(person("Nima", "Nouri", role=c("aut"), email="nima.nouri@yale.edu"), person("Edel", "Aron", role=c("ctb"), email="edel.aron@yale.edu"), person("Gisela", "Gabernet", role=c("ctb"), email="gisela.gabernet@yale.edu"), - person("Cole", "Jensen", role=c("ctb"), + person("Huimin", "Lyu", role=c("ctb"), + email="huimin.lyu@yale.edu"), + person("Cole", "Jensen", role=c("ctb"), email="cole.jensen@yale.edu"), person("Susanna", "Marquez", role=c("ctb", "cre"), email="susanna.marquez@yale.edu"), From b468e18a37c2baf6abd8a0987d0076a6f7d5ebaf Mon Sep 17 00:00:00 2001 From: ssnn-airr Date: Thu, 19 Mar 2026 09:58:02 +0100 Subject: [PATCH 18/25] updated docs --- R/Functions.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/Functions.R b/R/Functions.R index bd3910e..ad7a2e9 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -885,10 +885,11 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' and be used in clustering with IUPAC-aware distance calculation #' (via \code{alakazam::pairwiseDist}). If \code{FALSE} (default), uses fast Hamming distance #' (via \code{fastDist_rcpp}) and only allows standard bases (A, T, C, G), N, and ? -#' in sequences. Note: This parameter is only available for \code{hierarchicalClones} with -#' \code{method="nt"}. This parameter controls validation and distance +#' in sequences.This parameter controls validation and distance #' calculation method, not sequence filtering. See \code{max_n} for -#' filtering sequences by character content. +#' filtering sequences by character content. See the IUPAC and max_n +#' parameters section for more details and examples. Note: This parameter is only available +#' for \code{hierarchicalClones} with \code{method="nt"}. #' @param junction character name of the column containing junction sequences. #' Also used to determine sequence length for grouping. #' @param v_call name of the column containing the V-segment allele calls. From cd4271604c0a9703fe86d677502c211c27f6cc9e Mon Sep 17 00:00:00 2001 From: Gisela Gabernet Date: Tue, 24 Mar 2026 11:37:32 -0400 Subject: [PATCH 19/25] fix character validation logic --- R/Functions.R | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/R/Functions.R b/R/Functions.R index ad7a2e9..2025d21 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -1302,26 +1302,25 @@ defineClonesScoper <- function(db, } ### Check for invalid characters - if (!IUPAC && model == "hierarchical"){ - nonIUPAC_valid <- c("A", "T", "C", "G", "N", "?") - .validcharacters <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% nonIUPAC_valid) } - nonIUPAC_valid_seq <- sapply(head(db[[junction]],1000), .validcharacters) - not_nonIUPAC_valid_seq <- which(!nonIUPAC_valid_seq) - if (length(not_nonIUPAC_valid_seq) > 0) { - stop("invalid sequence characters in the ", junction, " column. ", - "\n Valid characters are: '", nonIUPAC_valid, "'") - } - } - else{ - valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0)) - .validateSeq <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% valid_chars) } - valid_seq <- sapply(db[[junction]], .validateSeq) - not_valid_seq <- which(!valid_seq) - if (length(not_valid_seq) > 0) { - stop("invalid sequence characters in the ", junction, " column. ", - length(not_valid_seq)," sequence(s) found.", "\n Valid characters are: '", valid_chars, "'") + # IUPAC mode is only supported for hierarchical clustering and nt method + if (!IUPAC && model == "hierarchical" && method == "nt") { + valid_chars <- c("A", "T", "C", "G", "N", "?") + } else { + if (method %in% c("nt", "novj", "vj")) { + valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0)) + } else if (method == "aa") { + valid_chars <- colnames(alakazam::getAAMatrix(gap = 0)) + } else { + stop("invalid method specified.") } } + .validateSeq <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% valid_chars) } + valid_seq <- sapply(head(db[[junction]],1000), .validateSeq) + not_valid_seq <- which(!valid_seq) + if (length(not_valid_seq) > 0) { + stop("invalid sequence characters in the ", junction, " column. ", + length(not_valid_seq)," sequence(s) found.", "\n Valid characters are: '", valid_chars, "'") + } ### temp columns temp_cols <- c("vj_group", "vjl_group", "junction_l", "cdr3_col", "clone_temp", "cell_id_temp") @@ -1523,7 +1522,7 @@ defineClonesScoper <- function(db, method = method, linkage = ifelse(model == "hierarchical", linkage, NA), normalize = ifelse(model == "hierarchical", normalize, NA), - IUPAC = ifelse(model == "hierarchical", IUPAC, FALSE), + IUPAC = ifelse(model == "hierarchical" & method == "nt", IUPAC, FALSE), germline = germline, sequence = sequence, junction = junction, From fa1858e2b80dafd88b891f3c9a264e38db71476c Mon Sep 17 00:00:00 2001 From: ssnn-airr Date: Wed, 25 Mar 2026 12:11:12 +0100 Subject: [PATCH 20/25] added tests --- tests/testthat/test_clone.R | 141 ++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/tests/testthat/test_clone.R b/tests/testthat/test_clone.R index 378931b..c93c991 100644 --- a/tests/testthat/test_clone.R +++ b/tests/testthat/test_clone.R @@ -385,6 +385,147 @@ test_that("Test hierarchicalClones with IUPAC parameter", { }) +#### clone - hierarchicalClones, IUPAC code in sequence 1001 #### + +test_that("Test hierarchicalClones with IUPAC code beyond validation window", { + # Validation only inspects the first 1000 sequences (head(..., 1000)). + # A sequence containing an IUPAC code at position 1001 therefore escapes the check + # regardless of IUPAC=TRUE/FALSE. + # What happens next depends on max_n and IUPAC: + # - max_n=0: prepare_db filters out non-ATCG sequences -> warning, no error (both IUPAC=T/F) + # - max_n=NULL, IUPAC=FALSE: no filtering -> fastDist_rcpp encounters R -> error + # - max_n=NULL, IUPAC=TRUE: no filtering -> pairwiseDist handles R correctly -> succeeds + + # Build 1001-row dataset: rows 1-1000 are standard ATCG, row 1001 has IUPAC code R + n_std <- 1000 + db_iupac_1001 <- data.frame( + sequence_id = paste0("seq", seq_len(n_std + 1)), + v_call = rep("IGHV1-1*01", n_std + 1), + j_call = rep("IGHJ1*01", n_std + 1), + junction = c(rep("TGTGCAAGCTACTGG", n_std), # rows 1-1000: ATCG only + "TGTGCRAGCTACTGG"), # row 1001: R is an IUPAC code + locus = rep("IGH", n_std + 1), + stringsAsFactors = FALSE + ) + + # Confirm row 1001 is the one with the IUPAC code (sanity check) + expect_equal(db_iupac_1001$junction[1001], "TGTGCRAGCTACTGG") + + # Same data in reverse order to put IUPAC code in row 1 + db_iupac_1 <- db_iupac_1001[1001:1, ] + + # Test 1: IUPAC=FALSE, max_n=0 (default) + # - Validation (first 1000 rows): passes - no IUPAC codes in rows 1-1000 + # - max_n=0 filtering: removes row 1001 (R is non-ATCG) and issues a warning + # - Result: runs successfully; row 1001 removed + expect_warning( + expect_message( + db_result1 <- hierarchicalClones(db_iupac_1001, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = FALSE, + max_n = 0, + summarize_clones = FALSE + ), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ), + "Removed 1 sequences with non ATCG characters." + ) + expect_equal(nrow(db_result1), n_std) + expect_false("seq1001" %in% db_result1$sequence_id) + + # Same data in reverse order now throws an error. Potentially confusing for users? + expect_error( + db_result1 <- hierarchicalClones(db_iupac_1, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = FALSE, + max_n = 0, + summarize_clones = FALSE + ), + "invalid sequence characters in the junction column\\. 1 sequence\\(s\\) found\\.\\n Valid characters are: 'ATCGN\\?'" + ) + + # Test 2: IUPAC=FALSE, max_n=NULL + # - Validation (first 1000 rows): passes - no IUPAC codes in rows 1-1000 + # - max_n=NULL: no filtering -> row 1001 reaches fastDist_rcpp + # - fastDist_rcpp stops with an error because R is not A,C,G,T,N,? + expect_error( + suppressMessages( + hierarchicalClones(db_iupac_1001, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = FALSE, + max_n = NULL, + summarize_clones = FALSE + ) + ), + "Only A,C,G,T,N,\\? are allowed" + ) + + # Same data in reverse also throws an error. + expect_error( + suppressMessages( + hierarchicalClones(db_iupac_1, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = FALSE, + max_n = NULL, + summarize_clones = FALSE + ) + ), + "invalid sequence characters in the junction column\\. 1 sequence\\(s\\) found\\.\\n Valid characters are: 'ATCGN\\?'" + ) + + # Test 3: IUPAC=TRUE, max_n=NULL + # - Validation (first 1000 rows): passes + # - max_n=NULL: no filtering -> row 1001 is kept + # - Distance: alakazam::pairwiseDist handles IUPAC codes correctly -> no error + # - Result: all 1001 sequences processed; seq1001 included in output + expect_message( + db_result3 <- hierarchicalClones(db_iupac_1001, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = TRUE, + max_n = NULL, + summarize_clones = FALSE + ), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ) + expect_equal(nrow(db_result3), n_std + 1) + expect_true("seq1001" %in% db_result3$sequence_id) + # seq1001 (TGTGCRAGCTACTGG) is within distance 0.15 of the standard sequences + # (TGTGCAAGCTACTGG): R matches A under IUPAC scoring -> same clone + expect_equal(length(unique(db_result3$clone_id)), 1) + + # Same data in reverse order gives the same result (IUPAC=TRUE allows R, max_n=NULL means no filtering) + expect_message( + db_result4 <- hierarchicalClones(db_iupac_1, + threshold = 0.15, + method = "nt", linkage = "single", + junction = "junction", + v_call = "v_call", j_call = "j_call", + IUPAC = TRUE, + max_n = NULL, + summarize_clones = FALSE + ), + "Running defineClonesScoper in bulk mode and only keep heavy chains" + ) + # Results should be identical regardless of row order + db_result4_ordered <- db_result4[match(db_result3$sequence_id, db_result4$sequence_id), ] + expect_equal(db_result3, db_result4_ordered, check.attributes = FALSE) +}) + #### clone - spectralClones - novj method #### test_that("Test spectralClones - novj", { From 0b7fda1b34a4f16ab44b01c37a438b64fa820bdb Mon Sep 17 00:00:00 2001 From: Gisela Gabernet Date: Wed, 25 Mar 2026 11:16:15 -0400 Subject: [PATCH 21/25] method aa translates on the fly --- R/Functions.R | 26 ++++++++++++++------------ docs/topics/hierarchicalClones.md | 3 ++- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/R/Functions.R b/R/Functions.R index 2025d21..df84db4 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -1302,24 +1302,26 @@ defineClonesScoper <- function(db, } ### Check for invalid characters - # IUPAC mode is only supported for hierarchical clustering and nt method - if (!IUPAC && model == "hierarchical" && method == "nt") { - valid_chars <- c("A", "T", "C", "G", "N", "?") + # IUPAC mode is only supported for hierarchical clustering + if (!IUPAC && model == "hierarchical") { + valid_chars <- c("A", "T", "C", "G", "N", "?") } else { - if (method %in% c("nt", "novj", "vj")) { - valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0)) - } else if (method == "aa") { - valid_chars <- colnames(alakazam::getAAMatrix(gap = 0)) - } else { - stop("invalid method specified.") - } + valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0)) } .validateSeq <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% valid_chars) } valid_seq <- sapply(head(db[[junction]],1000), .validateSeq) not_valid_seq <- which(!valid_seq) if (length(not_valid_seq) > 0) { - stop("invalid sequence characters in the ", junction, " column. ", - length(not_valid_seq)," sequence(s) found.", "\n Valid characters are: '", valid_chars, "'") + error_msg <- paste0("Invalid sequence characters in the ", junction, " column were found when checking the first 1000 sequences. ", + length(not_valid_seq)," sequence(s) found.", "\n Valid characters are: '", valid_chars, "'", + "\n If you have other IUPAC characters in your sequences, set IUPAC=TRUE to allow all IUPAC bases, this will run a slower version of hierarchicalClustering.") + if (method == "aa") { + method_aa_msg <- paste0("Clustering with method='aa' expects nucleotide sequences, which will be translated by this function.") + stop(paste0(error_msg, "\n", method_aa_msg)) + } else { + stop(error_msg) + } + } ### temp columns diff --git a/docs/topics/hierarchicalClones.md b/docs/topics/hierarchicalClones.md index 890dd7b..92b7caf 100644 --- a/docs/topics/hierarchicalClones.md +++ b/docs/topics/hierarchicalClones.md @@ -51,7 +51,8 @@ threshold method : one of the `"nt"` for nucleotide based clustering or -`"aa"` for amino acid based clustering. +`"aa"` for amino acid based clustering. Method `"aa"` still expects nucleotide sequences, +which will be translated to amino acids. linkage : available linkage are `"single"`, `"average"`, and `"complete"`. From ceab078a63660e3faa063aa47ee3f3c10de4de43 Mon Sep 17 00:00:00 2001 From: ssnn-airr Date: Mon, 30 Mar 2026 17:23:05 +0200 Subject: [PATCH 22/25] added humin to docs --- docs/topics/scoper-package.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/topics/scoper-package.md b/docs/topics/scoper-package.md index dbc06cc..f5deda4 100644 --- a/docs/topics/scoper-package.md +++ b/docs/topics/scoper-package.md @@ -42,8 +42,9 @@ Authors: Other contributors: + Edel Aron [edel.aron@yale.edu](edel.aron@yale.edu) [contributor] -+ Cole Jensen [cole.jensen@yale.edu](cole.jensen@yale.edu) [contributor] + Gisela Gabernet [gisela.gabernet@yale.edu](gisela.gabernet@yale.edu) [contributor] ++ Cole Jensen [cole.jensen@yale.edu](cole.jensen@yale.edu) [contributor] ++ Huimin Lyu [huimin.lyu@yale.edu](huimin.lyu@yale.edu) [contributor] From 041cc7e6f7c72198e74bae331bb28d768fe9699c Mon Sep 17 00:00:00 2001 From: ssnn-airr Date: Mon, 30 Mar 2026 19:12:50 +0200 Subject: [PATCH 23/25] refine character validation adding method=nt for hierarchicalClones --- R/Functions.R | 2 +- tests/testthat/test_clone.R | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/Functions.R b/R/Functions.R index df84db4..3a64927 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -1303,7 +1303,7 @@ defineClonesScoper <- function(db, ### Check for invalid characters # IUPAC mode is only supported for hierarchical clustering - if (!IUPAC && model == "hierarchical") { + if (!IUPAC && model == "hierarchical" && method == "nt") { valid_chars <- c("A", "T", "C", "G", "N", "?") } else { valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0)) diff --git a/tests/testthat/test_clone.R b/tests/testthat/test_clone.R index c93c991..49a30c7 100644 --- a/tests/testthat/test_clone.R +++ b/tests/testthat/test_clone.R @@ -129,7 +129,7 @@ test_that("Test hierarchicalClones with IUPAC parameter", { IUPAC = FALSE, # Use fast Hamming distance for ATCG only summarize_clones = FALSE ), - "invalid sequence characters" + "Invalid sequence characters" ) # Test 2: IUPAC=TRUE with max_n=0 - validates with IUPAC but filters them out @@ -242,7 +242,7 @@ test_that("Test hierarchicalClones with IUPAC parameter", { max_n = 1, # Allow up to 1 N or ? summarize_clones = FALSE ), - "invalid sequence characters" + "Invalid sequence characters" ) # Test with only N/? (no IUPAC codes) - should succeed @@ -447,7 +447,7 @@ test_that("Test hierarchicalClones with IUPAC code beyond validation window", { max_n = 0, summarize_clones = FALSE ), - "invalid sequence characters in the junction column\\. 1 sequence\\(s\\) found\\.\\n Valid characters are: 'ATCGN\\?'" + "Invalid sequence characters in the junction column" ) # Test 2: IUPAC=FALSE, max_n=NULL @@ -482,7 +482,7 @@ test_that("Test hierarchicalClones with IUPAC code beyond validation window", { summarize_clones = FALSE ) ), - "invalid sequence characters in the junction column\\. 1 sequence\\(s\\) found\\.\\n Valid characters are: 'ATCGN\\?'" + "Invalid sequence characters in the junction column" ) # Test 3: IUPAC=TRUE, max_n=NULL From 982e47e89e113a61feb9cb3ce79c6e28c8f50540 Mon Sep 17 00:00:00 2001 From: ssnn-airr Date: Mon, 30 Mar 2026 19:22:58 +0200 Subject: [PATCH 24/25] rm moved to github message --- README.md | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/README.md b/README.md index 75f0acf..6fae412 100644 --- a/README.md +++ b/README.md @@ -2,17 +2,6 @@ [![](https://cranlogs.r-pkg.org/badges/scoper)](https://www.r-pkg.org/pkg/scoper) [![](https://img.shields.io/static/v1?label=AIRR-C%20sw-tools%20v1&message=compliant&color=008AFF&labelColor=000000&style=plastic)](https://docs.airr-community.org/en/stable/swtools/airr_swtools_standard.html) -**IMPORTANT!** -SCOPer has moved to https://github.com/immcantation/scoper - -To update Git configuration settings use: - -``` - git config user.email "your-gh-user@email.com" - git config user.name "your-gh-user-name" - git remote set-url origin git@github.com:immcantation/scoper.git -``` - SCOPer ------------------------------------------------------------------------------- From 9e30b9858dd4629d61b5a1c2b02e222fd945a9f3 Mon Sep 17 00:00:00 2001 From: ssnn-airr Date: Mon, 30 Mar 2026 21:50:31 +0200 Subject: [PATCH 25/25] mv to roxygen block comment that was added in docs md --- R/Functions.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/Functions.R b/R/Functions.R index 3a64927..7ec41e9 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -877,7 +877,8 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' @param db data.frame containing sequence data. #' @param threshold numeric scalar where the tree should be cut (the distance threshold for clonal grouping). #' @param method one of the \code{"nt"} for nucleotide based clustering or -#' \code{"aa"} for amino acid based clustering. +#' \code{"aa"} for amino acid based clustering. Method `"aa"` still expects nucleotide sequences, +#' which will be translated to amino acids #' @param linkage available linkage are \code{"single"}, \code{"average"}, and \code{"complete"}. #' @param normalize method of normalization. The default is \code{"len"}, which divides the distance by the length #' of the sequence group. If \code{"none"} then no normalization if performed. @@ -938,7 +939,8 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' The default is \code{NULL} for no action. #' @param summarize_clones if \code{TRUE} performs a series of analysis to assess the clonal landscape #' and returns a \link{ScoperClones} object. If \code{FALSE} (default) then -#' a modified input \code{db} is returned. When grouping by \code{fields}, +#' a modified input \code{db} is returned with clone identifiers in the specified +#' `clone` column. When grouping by \code{fields}, #' \code{summarize_clones} should be \code{FALSE}. #' @param seq_id The column containing sequence ids #'