immcantation · ssnn-airr · May 1, 2026 · May 1, 2026 · May 1, 2026 · May 1, 2026
diff --git a/R/Functions.R b/R/Functions.R
@@ -790,9 +790,12 @@ plotCloneSummary <- function(data, xmin=NULL, xmax=NULL, breaks=NULL,
 #'                              less than 7 nucleotides.
 #' @param    mod3               if \code{TRUE} removes records with a \code{junction} length that is not divisible by 
 #'                              3 in nucleotide space.
-#' @param    max_n              The maximum number of degenerate characters to permit in the junction sequence before excluding the 
-#'                              record from clonal assignment. Default is set to be zero. Set it as \code{"NULL"} for no 
-#'                              action.
+#' @param    max_n              The maximum number of non-ATCG characters to permit in the junction sequence before 
+#'                              excluding the record from clonal assignment. Counts non-ATCG characters using regex 
+#'                              \code{"[^ATCG]"}, which includes N, ?, and IUPAC ambiguity codes. 
+#'                              With the default value of 0, all sequences containing any non-ATCG character 
+#'                              (including IUPAC codes) in the junction are removed before clustering. Set to \code{NULL} 
+#'                              for no filtering.
 #' @param    nproc              number of cores to distribute the function over.
 #' @param    verbose            if \code{TRUE} prints out a summary of each step cloning process.
 #'                              if \code{FALSE} (default) process cloning silently.
@@ -964,9 +967,7 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction",
 #' @section IUPAC and max_n parameters:
 #' Note: The \code{IUPAC} parameter is only available for \code{hierarchicalClones} with 
 #' \code{method="nt"} (nucleotide mode). It is ignored when \code{method="aa"} (amino acid mode). 
-#' The \code{identicalClones} and \code{spectralClones} functions always operate as if 
-#' \code{IUPAC=FALSE} (standard bases plus N and ? only). The \code{max_n} parameter 
-#' is available for all cloning functions.
+#' The \code{max_n} parameter is available for all cloning functions.
 #' 
 #' The \code{IUPAC} and \code{max_n} parameters serve complementary but distinct purposes:
 #' 
@@ -978,29 +979,34 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction",
 #' 
 #' \code{max_n} controls:
 #' \itemize{
-#'   \item Sequence filtering by counting non-ATCG characters
+#'   \item Sequence filtering by counting non-ATCG characters in the junction
 #' }
+#'
+#' \code{hierarchicalClones} with \code{method="aa"} accepts the full IUPAC DNA alphabet during validation, 
+#' then \code{max_n} controls filtering of sequences containing excess non-ATCG characters 
+#' before translating to amino acids and performing IUPAC-aware clustering.
 #' 
-#' Common use cases (for \code{method="nt"}):
+#' Example use cases for \code{hierarchicalClones} with \code{method="nt"}:
 #' \itemize{
 #'   \item \code{IUPAC=FALSE, max_n=0}: Strict ATCG-only mode with fast distance calculation. 
-#'         Will throw an error and exit if sequences with IUPAC codes not A, T, C, G, N, or ? are detected
-#'         among the first 1000 sequences. Fastest option for high-quality data.
-#'   \item \code{IUPAC=FALSE, max_n>0}: Will throw an error and exit if sequences with IUPAC 
-#'         codes not A, T, C, G, N, or ? are detected among the first 1000 sequences. Allows sequences 
+#'         Will throw an error and exit if sequences with characters not A, T, C, G, N, or ? are detected.
+#'         max_n=0 will filter out sequences with N or ? characters. Fastest option for high-quality data.
+#'   \item \code{IUPAC=FALSE, max_n>0}: Will throw an error and exit if sequences with characters
+#'         not A, T, C, G, N, or ? are detected. Allows sequences 
 #'         with limited N/? characters in distance calculation, using fast Hamming distance. 
 #'         Note: IUPAC codes are rejected during validation (before max_n filtering), so max_n only 
-#'         affects sequences that passed validation with N or ? characters. Useful for data with low-quality 
+#'         controls filtering of sequences with N or ? characters. Useful for data with low-quality 
 #'         or masked positions but no experimental ambiguity codes.
 #'   \item \code{IUPAC=TRUE, max_n=0}: Uses IUPAC-aware distance but filters out all non-ATCG 
-#'         characters anyway. Only standard bases remain after filtering.
+#'         characters anyway. Only standard bases remain after filtering. Slower than IUPAC=FALSE 
+#'         but handles any ambiguity codes in the input by filtering them out before clustering.
 #'   \item \code{IUPAC=TRUE, max_n>0}: Allows sequences with limited ambiguity codes and uses 
 #'         proper IUPAC-aware distance calculation. Slower but handles biological ambiguity correctly. 
 #'         Set max_n to the maximum number of ambiguous positions per sequence you want to tolerate 
 #'         (counts all non-ATCG: N, ?, and other IUPAC codes).
 #'   \item \code{IUPAC=TRUE, max_n=NULL}: Process all sequences with IUPAC codes regardless of the 
 #'         number of ambiguous positions. Uses IUPAC-aware distance calculation with no filtering. 
-#'         Most permissive option for data with extensive IUPAC ambiguity codes.
+#'         Most permissive option.
 #' }
 #' 
 #' Note: Validation occurs before filtering. When \code{IUPAC=FALSE}, sequences containing IUPAC 
@@ -1125,12 +1131,11 @@ hierarchicalClones <- function(db, threshold, method=c("nt", "aa"), linkage=c("s
 #'                              less than 7 nucleotides.
 #' @param    mod3               if \code{TRUE} removes records with a \code{junction} length that is not divisible by 
 #'                              3 in nucleotide space.
-#' @param    max_n              The maximum number of degenerate characters to permit in the junction sequence before excluding the 
-#'                              record from clonal assignment. Counts non-ATCG characters using regex \code{"[^ATCG]"}, which 
-#'                              includes N, ?, and IUPAC ambiguity codes. Note: \code{spectralClones} does not support the 
-#'                              \code{IUPAC} parameter and always validates sequences as if \code{IUPAC=FALSE} (only allows 
-#'                              standard bases A,T,C,G plus N and ?). IUPAC ambiguity codes will cause validation errors. 
-#'                              Default is 0. Set to \code{NULL} for no filtering.
+#' @param    max_n              The maximum number of non-ATCG characters to permit in the junction sequence before 
+#'                              excluding the record from clonal assignment. Counts non-ATCG characters using regex 
+#'                              \code{"[^ATCG]"}, which includes N, ?, and IUPAC ambiguity codes. 
+#'                              With the default value of 0, all sequences containing any non-ATCG character 
+#'                              are removed before clustering. Set to \code{NULL} for no filtering.
 #' @param    threshold          the supervising cut-off to enforce an upper-limit distance for clonal grouping.
 #'                              A numeric value between (0,1).
 #' @param    base_sim           required similarity cut-off for sequences in equal distances from each other.
@@ -1323,10 +1328,10 @@ defineClonesScoper <- function(db,
         valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0))
     }
     .validateSeq <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% valid_chars) }
-    valid_seq <- sapply(head(db[[junction]],1000), .validateSeq)
+    valid_seq <- sapply(db[[junction]], .validateSeq)
     not_valid_seq <- which(!valid_seq)
     if (length(not_valid_seq) > 0) {
-        error_msg <- paste0("Invalid sequence characters in the ", junction, " column were found when checking the first 1000 sequences. ",
+        error_msg <- paste0("Invalid sequence characters in the ", junction, " column were found. ",
             length(not_valid_seq)," sequence(s) found.", "\n Valid characters are: '",  valid_chars, "'",
             "\n If you have other IUPAC characters in your sequences, set IUPAC=TRUE to allow all IUPAC bases, this will run a slower version of hierarchicalClustering.")
         if (method == "aa") {

diff --git a/tests/testthat/test_clone.R b/tests/testthat/test_clone.R
@@ -385,147 +385,6 @@ test_that("Test hierarchicalClones with IUPAC parameter", {
 
 })
 
-#### clone - hierarchicalClones, IUPAC code in sequence 1001 ####
-
-test_that("Test hierarchicalClones with IUPAC code beyond validation window", {
-    # Validation only inspects the first 1000 sequences (head(..., 1000)).
-    # A sequence containing an IUPAC code at position 1001 therefore escapes the check
-    # regardless of IUPAC=TRUE/FALSE.
-    # What happens next depends on max_n and IUPAC:
-    #   - max_n=0:    prepare_db filters out non-ATCG sequences -> warning, no error (both IUPAC=T/F)
-    #   - max_n=NULL, IUPAC=FALSE: no filtering -> fastDist_rcpp encounters R -> error
-    #   - max_n=NULL, IUPAC=TRUE:  no filtering -> pairwiseDist handles R correctly -> succeeds
-
-    # Build 1001-row dataset: rows 1-1000 are standard ATCG, row 1001 has IUPAC code R
-    n_std <- 1000
-    db_iupac_1001 <- data.frame(
-        sequence_id = paste0("seq", seq_len(n_std + 1)),
-        v_call      = rep("IGHV1-1*01", n_std + 1),
-        j_call      = rep("IGHJ1*01",   n_std + 1),
-        junction    = c(rep("TGTGCAAGCTACTGG", n_std),  # rows 1-1000: ATCG only
-                        "TGTGCRAGCTACTGG"),              # row 1001: R is an IUPAC code
-        locus       = rep("IGH", n_std + 1),
-        stringsAsFactors = FALSE
-    )
-
-    # Confirm row 1001 is the one with the IUPAC code (sanity check)
-    expect_equal(db_iupac_1001$junction[1001], "TGTGCRAGCTACTGG")
-
-    # Same data in reverse order to put IUPAC code in row 1
-    db_iupac_1 <- db_iupac_1001[1001:1, ]
-
-    # Test 1: IUPAC=FALSE, max_n=0 (default)
-    # - Validation (first 1000 rows): passes - no IUPAC codes in rows 1-1000
-    # - max_n=0 filtering: removes row 1001 (R is non-ATCG) and issues a warning
-    # - Result: runs successfully; row 1001 removed
-    expect_warning(
-        expect_message(
-            db_result1 <- hierarchicalClones(db_iupac_1001,
-                threshold = 0.15,
-                method = "nt", linkage = "single",
-                junction = "junction",
-                v_call = "v_call", j_call = "j_call",
-                IUPAC = FALSE,
-                max_n = 0,
-                summarize_clones = FALSE
-            ),
-            "Running defineClonesScoper in bulk mode and only keep heavy chains"
-        ),
-        "Removed 1 sequences with non ATCG characters."
-    )
-    expect_equal(nrow(db_result1), n_std)
-    expect_false("seq1001" %in% db_result1$sequence_id)
-
-    # Same data in reverse order now throws an error. Potentially confusing for users?
-    expect_error(
-        db_result1 <- hierarchicalClones(db_iupac_1,
-            threshold = 0.15,
-            method = "nt", linkage = "single",
-            junction = "junction",
-            v_call = "v_call", j_call = "j_call",
-            IUPAC = FALSE,
-            max_n = 0,
-            summarize_clones = FALSE
-        ),
-        "Invalid sequence characters in the junction column"
-    )
-
-    # Test 2: IUPAC=FALSE, max_n=NULL
-    # - Validation (first 1000 rows): passes - no IUPAC codes in rows 1-1000
-    # - max_n=NULL: no filtering -> row 1001 reaches fastDist_rcpp
-    # - fastDist_rcpp stops with an error because R is not A,C,G,T,N,?
-    expect_error(
-        suppressMessages(
-            hierarchicalClones(db_iupac_1001,
-                threshold = 0.15,
-                method = "nt", linkage = "single",
-                junction = "junction",
-                v_call = "v_call", j_call = "j_call",
-                IUPAC = FALSE,
-                max_n = NULL,
-                summarize_clones = FALSE
-            )
-        ),
-        "Only A,C,G,T,N,\\? are allowed"
-    )
-
-    # Same data in reverse also throws an error.
-    expect_error(
-        suppressMessages(
-            hierarchicalClones(db_iupac_1,
-                threshold = 0.15,
-                method = "nt", linkage = "single",
-                junction = "junction",
-                v_call = "v_call", j_call = "j_call",
-                IUPAC = FALSE,
-                max_n = NULL,
-                summarize_clones = FALSE
-            )
-        ),
-        "Invalid sequence characters in the junction column"
-    )
-
-    # Test 3: IUPAC=TRUE, max_n=NULL
-    # - Validation (first 1000 rows): passes
-    # - max_n=NULL: no filtering -> row 1001 is kept
-    # - Distance: alakazam::pairwiseDist handles IUPAC codes correctly -> no error
-    # - Result: all 1001 sequences processed; seq1001 included in output
-    expect_message(
-        db_result3 <- hierarchicalClones(db_iupac_1001,
-            threshold = 0.15,
-            method = "nt", linkage = "single",
-            junction = "junction",
-            v_call = "v_call", j_call = "j_call",
-            IUPAC = TRUE,
-            max_n = NULL,
-            summarize_clones = FALSE
-        ),
-        "Running defineClonesScoper in bulk mode and only keep heavy chains"
-    )
-    expect_equal(nrow(db_result3), n_std + 1)
-    expect_true("seq1001" %in% db_result3$sequence_id)
-    # seq1001 (TGTGCRAGCTACTGG) is within distance 0.15 of the standard sequences
-    # (TGTGCAAGCTACTGG): R matches A under IUPAC scoring -> same clone
-    expect_equal(length(unique(db_result3$clone_id)), 1)
-
-    # Same data in reverse order gives the same result (IUPAC=TRUE allows R, max_n=NULL means no filtering)
-    expect_message(
-        db_result4 <- hierarchicalClones(db_iupac_1,
-            threshold = 0.15,
-            method = "nt", linkage = "single",
-            junction = "junction",
-            v_call = "v_call", j_call = "j_call",
-            IUPAC = TRUE,
-            max_n = NULL,
-            summarize_clones = FALSE
-        ),
-        "Running defineClonesScoper in bulk mode and only keep heavy chains"
-    )
-    # Results should be identical regardless of row order
-    db_result4_ordered <- db_result4[match(db_result3$sequence_id, db_result4$sequence_id), ]
-    expect_equal(db_result3, db_result4_ordered,  check.attributes = FALSE)
-})
-
 #### clone - spectralClones - novj method ####
 
 test_that("Test spectralClones - novj", {