From 6e6c6e989969c89560b28f4d5b244fbe3fbba31f Mon Sep 17 00:00:00 2001 From: ssnn-airr Date: Fri, 1 May 2026 13:17:42 +0200 Subject: [PATCH 1/4] Check all sequences are valid, not just the first 1000 --- R/Functions.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/Functions.R b/R/Functions.R index 12b934a..9036a47 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -1323,10 +1323,10 @@ defineClonesScoper <- function(db, valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0)) } .validateSeq <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% valid_chars) } - valid_seq <- sapply(head(db[[junction]],1000), .validateSeq) + valid_seq <- sapply(db[[junction]], .validateSeq) not_valid_seq <- which(!valid_seq) if (length(not_valid_seq) > 0) { - error_msg <- paste0("Invalid sequence characters in the ", junction, " column were found when checking the first 1000 sequences. ", + error_msg <- paste0("Invalid sequence characters in the ", junction, " column were found. ", length(not_valid_seq)," sequence(s) found.", "\n Valid characters are: '", valid_chars, "'", "\n If you have other IUPAC characters in your sequences, set IUPAC=TRUE to allow all IUPAC bases, this will run a slower version of hierarchicalClustering.") if (method == "aa") { From db6183eea2b38f85424a3381a75e0a0acd790155 Mon Sep 17 00:00:00 2001 From: ssnn-airr Date: Fri, 1 May 2026 13:20:33 +0200 Subject: [PATCH 2/4] Updated docs --- R/Functions.R | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/R/Functions.R b/R/Functions.R index 9036a47..36f6f7d 100644 --- a/R/Functions.R +++ b/R/Functions.R @@ -790,9 +790,12 @@ plotCloneSummary <- function(data, xmin=NULL, xmax=NULL, breaks=NULL, #' less than 7 nucleotides. #' @param mod3 if \code{TRUE} removes records with a \code{junction} length that is not divisible by #' 3 in nucleotide space. -#' @param max_n The maximum number of degenerate characters to permit in the junction sequence before excluding the -#' record from clonal assignment. Default is set to be zero. Set it as \code{"NULL"} for no -#' action. +#' @param max_n The maximum number of non-ATCG characters to permit in the junction sequence before +#' excluding the record from clonal assignment. Counts non-ATCG characters using regex +#' \code{"[^ATCG]"}, which includes N, ?, and IUPAC ambiguity codes. +#' With the default value of 0, all sequences containing any non-ATCG character +#' (including IUPAC codes) in the junction are removed before clustering. Set to \code{NULL} +#' for no filtering. #' @param nproc number of cores to distribute the function over. #' @param verbose if \code{TRUE} prints out a summary of each step cloning process. #' if \code{FALSE} (default) process cloning silently. @@ -964,9 +967,7 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' @section IUPAC and max_n parameters: #' Note: The \code{IUPAC} parameter is only available for \code{hierarchicalClones} with #' \code{method="nt"} (nucleotide mode). It is ignored when \code{method="aa"} (amino acid mode). -#' The \code{identicalClones} and \code{spectralClones} functions always operate as if -#' \code{IUPAC=FALSE} (standard bases plus N and ? only). The \code{max_n} parameter -#' is available for all cloning functions. +#' The \code{max_n} parameter is available for all cloning functions. #' #' The \code{IUPAC} and \code{max_n} parameters serve complementary but distinct purposes: #' @@ -978,29 +979,34 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction", #' #' \code{max_n} controls: #' \itemize{ -#' \item Sequence filtering by counting non-ATCG characters +#' \item Sequence filtering by counting non-ATCG characters in the junction #' } +#' +#' \code{hierarchicalClones} with \code{method="aa"} accepts the full IUPAC DNA alphabet during validation, +#' then \code{max_n} controls filtering of sequences containing excess non-ATCG characters +#' before translating to amino acids and performing IUPAC-aware clustering. #' -#' Common use cases (for \code{method="nt"}): +#' Example use cases for \code{hierarchicalClones} with \code{method="nt"}: #' \itemize{ #' \item \code{IUPAC=FALSE, max_n=0}: Strict ATCG-only mode with fast distance calculation. -#' Will throw an error and exit if sequences with IUPAC codes not A, T, C, G, N, or ? are detected -#' among the first 1000 sequences. Fastest option for high-quality data. -#' \item \code{IUPAC=FALSE, max_n>0}: Will throw an error and exit if sequences with IUPAC -#' codes not A, T, C, G, N, or ? are detected among the first 1000 sequences. Allows sequences +#' Will throw an error and exit if sequences with characters not A, T, C, G, N, or ? are detected. +#' max_n=0 will filter out sequences with N or ? characters. Fastest option for high-quality data. +#' \item \code{IUPAC=FALSE, max_n>0}: Will throw an error and exit if sequences with characters +#' not A, T, C, G, N, or ? are detected. Allows sequences #' with limited N/? characters in distance calculation, using fast Hamming distance. #' Note: IUPAC codes are rejected during validation (before max_n filtering), so max_n only -#' affects sequences that passed validation with N or ? characters. Useful for data with low-quality +#' controls filtering of sequences with N or ? characters. Useful for data with low-quality #' or masked positions but no experimental ambiguity codes. #' \item \code{IUPAC=TRUE, max_n=0}: Uses IUPAC-aware distance but filters out all non-ATCG -#' characters anyway. Only standard bases remain after filtering. +#' characters anyway. Only standard bases remain after filtering. Slower than IUPAC=FALSE +#' but handles any ambiguity codes in the input by filtering them out before clustering. #' \item \code{IUPAC=TRUE, max_n>0}: Allows sequences with limited ambiguity codes and uses #' proper IUPAC-aware distance calculation. Slower but handles biological ambiguity correctly. #' Set max_n to the maximum number of ambiguous positions per sequence you want to tolerate #' (counts all non-ATCG: N, ?, and other IUPAC codes). #' \item \code{IUPAC=TRUE, max_n=NULL}: Process all sequences with IUPAC codes regardless of the #' number of ambiguous positions. Uses IUPAC-aware distance calculation with no filtering. -#' Most permissive option for data with extensive IUPAC ambiguity codes. +#' Most permissive option. #' } #' #' Note: Validation occurs before filtering. When \code{IUPAC=FALSE}, sequences containing IUPAC @@ -1125,12 +1131,11 @@ hierarchicalClones <- function(db, threshold, method=c("nt", "aa"), linkage=c("s #' less than 7 nucleotides. #' @param mod3 if \code{TRUE} removes records with a \code{junction} length that is not divisible by #' 3 in nucleotide space. -#' @param max_n The maximum number of degenerate characters to permit in the junction sequence before excluding the -#' record from clonal assignment. Counts non-ATCG characters using regex \code{"[^ATCG]"}, which -#' includes N, ?, and IUPAC ambiguity codes. Note: \code{spectralClones} does not support the -#' \code{IUPAC} parameter and always validates sequences as if \code{IUPAC=FALSE} (only allows -#' standard bases A,T,C,G plus N and ?). IUPAC ambiguity codes will cause validation errors. -#' Default is 0. Set to \code{NULL} for no filtering. +#' @param max_n The maximum number of non-ATCG characters to permit in the junction sequence before +#' excluding the record from clonal assignment. Counts non-ATCG characters using regex +#' \code{"[^ATCG]"}, which includes N, ?, and IUPAC ambiguity codes. +#' With the default value of 0, all sequences containing any non-ATCG character +#' are removed before clustering. Set to \code{NULL} for no filtering. #' @param threshold the supervising cut-off to enforce an upper-limit distance for clonal grouping. #' A numeric value between (0,1). #' @param base_sim required similarity cut-off for sequences in equal distances from each other. From fd146b145668a069fc1f088f040dcad3e6485e16 Mon Sep 17 00:00:00 2001 From: ssnn-airr Date: Fri, 1 May 2026 14:28:50 +0200 Subject: [PATCH 3/4] Remove tests not needed since we now validate beyond sequence 1000 --- tests/testthat/test_clone.R | 143 +----------------------------------- 1 file changed, 1 insertion(+), 142 deletions(-) diff --git a/tests/testthat/test_clone.R b/tests/testthat/test_clone.R index 49a30c7..33ae311 100644 --- a/tests/testthat/test_clone.R +++ b/tests/testthat/test_clone.R @@ -21,7 +21,7 @@ pipeline_env <- TRUE #### clone - identicalClones #### test_that("Test identicalClones", { - # Truth + # TruthR expects <- as.integer(c(20, 21, 23, 26, 27, 28, 30, 44, 50, 100)) # Reproduce example @@ -385,147 +385,6 @@ test_that("Test hierarchicalClones with IUPAC parameter", { }) -#### clone - hierarchicalClones, IUPAC code in sequence 1001 #### - -test_that("Test hierarchicalClones with IUPAC code beyond validation window", { - # Validation only inspects the first 1000 sequences (head(..., 1000)). - # A sequence containing an IUPAC code at position 1001 therefore escapes the check - # regardless of IUPAC=TRUE/FALSE. - # What happens next depends on max_n and IUPAC: - # - max_n=0: prepare_db filters out non-ATCG sequences -> warning, no error (both IUPAC=T/F) - # - max_n=NULL, IUPAC=FALSE: no filtering -> fastDist_rcpp encounters R -> error - # - max_n=NULL, IUPAC=TRUE: no filtering -> pairwiseDist handles R correctly -> succeeds - - # Build 1001-row dataset: rows 1-1000 are standard ATCG, row 1001 has IUPAC code R - n_std <- 1000 - db_iupac_1001 <- data.frame( - sequence_id = paste0("seq", seq_len(n_std + 1)), - v_call = rep("IGHV1-1*01", n_std + 1), - j_call = rep("IGHJ1*01", n_std + 1), - junction = c(rep("TGTGCAAGCTACTGG", n_std), # rows 1-1000: ATCG only - "TGTGCRAGCTACTGG"), # row 1001: R is an IUPAC code - locus = rep("IGH", n_std + 1), - stringsAsFactors = FALSE - ) - - # Confirm row 1001 is the one with the IUPAC code (sanity check) - expect_equal(db_iupac_1001$junction[1001], "TGTGCRAGCTACTGG") - - # Same data in reverse order to put IUPAC code in row 1 - db_iupac_1 <- db_iupac_1001[1001:1, ] - - # Test 1: IUPAC=FALSE, max_n=0 (default) - # - Validation (first 1000 rows): passes - no IUPAC codes in rows 1-1000 - # - max_n=0 filtering: removes row 1001 (R is non-ATCG) and issues a warning - # - Result: runs successfully; row 1001 removed - expect_warning( - expect_message( - db_result1 <- hierarchicalClones(db_iupac_1001, - threshold = 0.15, - method = "nt", linkage = "single", - junction = "junction", - v_call = "v_call", j_call = "j_call", - IUPAC = FALSE, - max_n = 0, - summarize_clones = FALSE - ), - "Running defineClonesScoper in bulk mode and only keep heavy chains" - ), - "Removed 1 sequences with non ATCG characters." - ) - expect_equal(nrow(db_result1), n_std) - expect_false("seq1001" %in% db_result1$sequence_id) - - # Same data in reverse order now throws an error. Potentially confusing for users? - expect_error( - db_result1 <- hierarchicalClones(db_iupac_1, - threshold = 0.15, - method = "nt", linkage = "single", - junction = "junction", - v_call = "v_call", j_call = "j_call", - IUPAC = FALSE, - max_n = 0, - summarize_clones = FALSE - ), - "Invalid sequence characters in the junction column" - ) - - # Test 2: IUPAC=FALSE, max_n=NULL - # - Validation (first 1000 rows): passes - no IUPAC codes in rows 1-1000 - # - max_n=NULL: no filtering -> row 1001 reaches fastDist_rcpp - # - fastDist_rcpp stops with an error because R is not A,C,G,T,N,? - expect_error( - suppressMessages( - hierarchicalClones(db_iupac_1001, - threshold = 0.15, - method = "nt", linkage = "single", - junction = "junction", - v_call = "v_call", j_call = "j_call", - IUPAC = FALSE, - max_n = NULL, - summarize_clones = FALSE - ) - ), - "Only A,C,G,T,N,\\? are allowed" - ) - - # Same data in reverse also throws an error. - expect_error( - suppressMessages( - hierarchicalClones(db_iupac_1, - threshold = 0.15, - method = "nt", linkage = "single", - junction = "junction", - v_call = "v_call", j_call = "j_call", - IUPAC = FALSE, - max_n = NULL, - summarize_clones = FALSE - ) - ), - "Invalid sequence characters in the junction column" - ) - - # Test 3: IUPAC=TRUE, max_n=NULL - # - Validation (first 1000 rows): passes - # - max_n=NULL: no filtering -> row 1001 is kept - # - Distance: alakazam::pairwiseDist handles IUPAC codes correctly -> no error - # - Result: all 1001 sequences processed; seq1001 included in output - expect_message( - db_result3 <- hierarchicalClones(db_iupac_1001, - threshold = 0.15, - method = "nt", linkage = "single", - junction = "junction", - v_call = "v_call", j_call = "j_call", - IUPAC = TRUE, - max_n = NULL, - summarize_clones = FALSE - ), - "Running defineClonesScoper in bulk mode and only keep heavy chains" - ) - expect_equal(nrow(db_result3), n_std + 1) - expect_true("seq1001" %in% db_result3$sequence_id) - # seq1001 (TGTGCRAGCTACTGG) is within distance 0.15 of the standard sequences - # (TGTGCAAGCTACTGG): R matches A under IUPAC scoring -> same clone - expect_equal(length(unique(db_result3$clone_id)), 1) - - # Same data in reverse order gives the same result (IUPAC=TRUE allows R, max_n=NULL means no filtering) - expect_message( - db_result4 <- hierarchicalClones(db_iupac_1, - threshold = 0.15, - method = "nt", linkage = "single", - junction = "junction", - v_call = "v_call", j_call = "j_call", - IUPAC = TRUE, - max_n = NULL, - summarize_clones = FALSE - ), - "Running defineClonesScoper in bulk mode and only keep heavy chains" - ) - # Results should be identical regardless of row order - db_result4_ordered <- db_result4[match(db_result3$sequence_id, db_result4$sequence_id), ] - expect_equal(db_result3, db_result4_ordered, check.attributes = FALSE) -}) - #### clone - spectralClones - novj method #### test_that("Test spectralClones - novj", { From 169b67ad3eb25e2d7a15eb357d5253140ad4446d Mon Sep 17 00:00:00 2001 From: ssnn-airr Date: Fri, 1 May 2026 14:57:35 +0200 Subject: [PATCH 4/4] typo --- tests/testthat/test_clone.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test_clone.R b/tests/testthat/test_clone.R index 33ae311..38b8d8b 100644 --- a/tests/testthat/test_clone.R +++ b/tests/testthat/test_clone.R @@ -21,7 +21,7 @@ pipeline_env <- TRUE #### clone - identicalClones #### test_that("Test identicalClones", { - # TruthR + # Truth expects <- as.integer(c(20, 21, 23, 26, 27, 28, 30, 44, 50, 100)) # Reproduce example