Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 28 additions & 23 deletions R/Functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -790,9 +790,12 @@ plotCloneSummary <- function(data, xmin=NULL, xmax=NULL, breaks=NULL,
#' less than 7 nucleotides.
#' @param mod3 if \code{TRUE} removes records with a \code{junction} length that is not divisible by
#' 3 in nucleotide space.
#' @param max_n The maximum number of degenerate characters to permit in the junction sequence before excluding the
#' record from clonal assignment. Default is set to be zero. Set it as \code{"NULL"} for no
#' action.
#' @param max_n The maximum number of non-ATCG characters to permit in the junction sequence before
#' excluding the record from clonal assignment. Counts non-ATCG characters using regex
#' \code{"[^ATCG]"}, which includes N, ?, and IUPAC ambiguity codes.
#' With the default value of 0, all sequences containing any non-ATCG character
#' (including IUPAC codes) in the junction are removed before clustering. Set to \code{NULL}
#' for no filtering.
#' @param nproc number of cores to distribute the function over.
#' @param verbose if \code{TRUE} prints out a summary of each step cloning process.
#' if \code{FALSE} (default) process cloning silently.
Expand Down Expand Up @@ -964,9 +967,7 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction",
#' @section IUPAC and max_n parameters:
#' Note: The \code{IUPAC} parameter is only available for \code{hierarchicalClones} with
#' \code{method="nt"} (nucleotide mode). It is ignored when \code{method="aa"} (amino acid mode).
#' The \code{identicalClones} and \code{spectralClones} functions always operate as if
#' \code{IUPAC=FALSE} (standard bases plus N and ? only). The \code{max_n} parameter
#' is available for all cloning functions.
#' The \code{max_n} parameter is available for all cloning functions.
#'
#' The \code{IUPAC} and \code{max_n} parameters serve complementary but distinct purposes:
#'
Expand All @@ -978,29 +979,34 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction",
#'
#' \code{max_n} controls:
#' \itemize{
#' \item Sequence filtering by counting non-ATCG characters
#' \item Sequence filtering by counting non-ATCG characters in the junction
#' }
#'
#' \code{hierarchicalClones} with \code{method="aa"} accepts the full IUPAC DNA alphabet during validation,
#' then \code{max_n} controls filtering of sequences containing excess non-ATCG characters
#' before translating to amino acids and performing IUPAC-aware clustering.
#'
#' Common use cases (for \code{method="nt"}):
#' Example use cases for \code{hierarchicalClones} with \code{method="nt"}:
#' \itemize{
#' \item \code{IUPAC=FALSE, max_n=0}: Strict ATCG-only mode with fast distance calculation.
#' Will throw an error and exit if sequences with IUPAC codes not A, T, C, G, N, or ? are detected
#' among the first 1000 sequences. Fastest option for high-quality data.
#' \item \code{IUPAC=FALSE, max_n>0}: Will throw an error and exit if sequences with IUPAC
#' codes not A, T, C, G, N, or ? are detected among the first 1000 sequences. Allows sequences
#' Will throw an error and exit if sequences with characters not A, T, C, G, N, or ? are detected.
#' max_n=0 will filter out sequences with N or ? characters. Fastest option for high-quality data.
#' \item \code{IUPAC=FALSE, max_n>0}: Will throw an error and exit if sequences with characters
#' not A, T, C, G, N, or ? are detected. Allows sequences
#' with limited N/? characters in distance calculation, using fast Hamming distance.
#' Note: IUPAC codes are rejected during validation (before max_n filtering), so max_n only
#' affects sequences that passed validation with N or ? characters. Useful for data with low-quality
#' controls filtering of sequences with N or ? characters. Useful for data with low-quality
#' or masked positions but no experimental ambiguity codes.
#' \item \code{IUPAC=TRUE, max_n=0}: Uses IUPAC-aware distance but filters out all non-ATCG
#' characters anyway. Only standard bases remain after filtering.
#' characters anyway. Only standard bases remain after filtering. Slower than IUPAC=FALSE
#' but handles any ambiguity codes in the input by filtering them out before clustering.
#' \item \code{IUPAC=TRUE, max_n>0}: Allows sequences with limited ambiguity codes and uses
#' proper IUPAC-aware distance calculation. Slower but handles biological ambiguity correctly.
#' Set max_n to the maximum number of ambiguous positions per sequence you want to tolerate
#' (counts all non-ATCG: N, ?, and other IUPAC codes).
#' \item \code{IUPAC=TRUE, max_n=NULL}: Process all sequences with IUPAC codes regardless of the
#' number of ambiguous positions. Uses IUPAC-aware distance calculation with no filtering.
#' Most permissive option for data with extensive IUPAC ambiguity codes.
#' Most permissive option.
#' }
#'
#' Note: Validation occurs before filtering. When \code{IUPAC=FALSE}, sequences containing IUPAC
Expand Down Expand Up @@ -1125,12 +1131,11 @@ hierarchicalClones <- function(db, threshold, method=c("nt", "aa"), linkage=c("s
#' less than 7 nucleotides.
#' @param mod3 if \code{TRUE} removes records with a \code{junction} length that is not divisible by
#' 3 in nucleotide space.
#' @param max_n The maximum number of degenerate characters to permit in the junction sequence before excluding the
#' record from clonal assignment. Counts non-ATCG characters using regex \code{"[^ATCG]"}, which
#' includes N, ?, and IUPAC ambiguity codes. Note: \code{spectralClones} does not support the
#' \code{IUPAC} parameter and always validates sequences as if \code{IUPAC=FALSE} (only allows
#' standard bases A,T,C,G plus N and ?). IUPAC ambiguity codes will cause validation errors.
#' Default is 0. Set to \code{NULL} for no filtering.
#' @param max_n The maximum number of non-ATCG characters to permit in the junction sequence before
#' excluding the record from clonal assignment. Counts non-ATCG characters using regex
#' \code{"[^ATCG]"}, which includes N, ?, and IUPAC ambiguity codes.
#' With the default value of 0, all sequences containing any non-ATCG character
#' are removed before clustering. Set to \code{NULL} for no filtering.
#' @param threshold the supervising cut-off to enforce an upper-limit distance for clonal grouping.
#' A numeric value between (0,1).
#' @param base_sim required similarity cut-off for sequences in equal distances from each other.
Expand Down Expand Up @@ -1323,10 +1328,10 @@ defineClonesScoper <- function(db,
valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0))
}
.validateSeq <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% valid_chars) }
valid_seq <- sapply(head(db[[junction]],1000), .validateSeq)
valid_seq <- sapply(db[[junction]], .validateSeq)
not_valid_seq <- which(!valid_seq)
if (length(not_valid_seq) > 0) {
error_msg <- paste0("Invalid sequence characters in the ", junction, " column were found when checking the first 1000 sequences. ",
error_msg <- paste0("Invalid sequence characters in the ", junction, " column were found. ",
length(not_valid_seq)," sequence(s) found.", "\n Valid characters are: '", valid_chars, "'",
"\n If you have other IUPAC characters in your sequences, set IUPAC=TRUE to allow all IUPAC bases, this will run a slower version of hierarchicalClustering.")
if (method == "aa") {
Expand Down
141 changes: 0 additions & 141 deletions tests/testthat/test_clone.R
Original file line number Diff line number Diff line change
Expand Up @@ -385,147 +385,6 @@ test_that("Test hierarchicalClones with IUPAC parameter", {

})

#### clone - hierarchicalClones, IUPAC code in sequence 1001 ####

test_that("Test hierarchicalClones with IUPAC code beyond validation window", {
# Validation only inspects the first 1000 sequences (head(..., 1000)).
# A sequence containing an IUPAC code at position 1001 therefore escapes the check
# regardless of IUPAC=TRUE/FALSE.
# What happens next depends on max_n and IUPAC:
# - max_n=0: prepare_db filters out non-ATCG sequences -> warning, no error (both IUPAC=T/F)
# - max_n=NULL, IUPAC=FALSE: no filtering -> fastDist_rcpp encounters R -> error
# - max_n=NULL, IUPAC=TRUE: no filtering -> pairwiseDist handles R correctly -> succeeds

# Build 1001-row dataset: rows 1-1000 are standard ATCG, row 1001 has IUPAC code R
n_std <- 1000
db_iupac_1001 <- data.frame(
sequence_id = paste0("seq", seq_len(n_std + 1)),
v_call = rep("IGHV1-1*01", n_std + 1),
j_call = rep("IGHJ1*01", n_std + 1),
junction = c(rep("TGTGCAAGCTACTGG", n_std), # rows 1-1000: ATCG only
"TGTGCRAGCTACTGG"), # row 1001: R is an IUPAC code
locus = rep("IGH", n_std + 1),
stringsAsFactors = FALSE
)

# Confirm row 1001 is the one with the IUPAC code (sanity check)
expect_equal(db_iupac_1001$junction[1001], "TGTGCRAGCTACTGG")

# Same data in reverse order to put IUPAC code in row 1
db_iupac_1 <- db_iupac_1001[1001:1, ]

# Test 1: IUPAC=FALSE, max_n=0 (default)
# - Validation (first 1000 rows): passes - no IUPAC codes in rows 1-1000
# - max_n=0 filtering: removes row 1001 (R is non-ATCG) and issues a warning
# - Result: runs successfully; row 1001 removed
expect_warning(
expect_message(
db_result1 <- hierarchicalClones(db_iupac_1001,
threshold = 0.15,
method = "nt", linkage = "single",
junction = "junction",
v_call = "v_call", j_call = "j_call",
IUPAC = FALSE,
max_n = 0,
summarize_clones = FALSE
),
"Running defineClonesScoper in bulk mode and only keep heavy chains"
),
"Removed 1 sequences with non ATCG characters."
)
expect_equal(nrow(db_result1), n_std)
expect_false("seq1001" %in% db_result1$sequence_id)

# Same data in reverse order now throws an error. Potentially confusing for users?
expect_error(
db_result1 <- hierarchicalClones(db_iupac_1,
threshold = 0.15,
method = "nt", linkage = "single",
junction = "junction",
v_call = "v_call", j_call = "j_call",
IUPAC = FALSE,
max_n = 0,
summarize_clones = FALSE
),
"Invalid sequence characters in the junction column"
)

# Test 2: IUPAC=FALSE, max_n=NULL
# - Validation (first 1000 rows): passes - no IUPAC codes in rows 1-1000
# - max_n=NULL: no filtering -> row 1001 reaches fastDist_rcpp
# - fastDist_rcpp stops with an error because R is not A,C,G,T,N,?
expect_error(
suppressMessages(
hierarchicalClones(db_iupac_1001,
threshold = 0.15,
method = "nt", linkage = "single",
junction = "junction",
v_call = "v_call", j_call = "j_call",
IUPAC = FALSE,
max_n = NULL,
summarize_clones = FALSE
)
),
"Only A,C,G,T,N,\\? are allowed"
)

# Same data in reverse also throws an error.
expect_error(
suppressMessages(
hierarchicalClones(db_iupac_1,
threshold = 0.15,
method = "nt", linkage = "single",
junction = "junction",
v_call = "v_call", j_call = "j_call",
IUPAC = FALSE,
max_n = NULL,
summarize_clones = FALSE
)
),
"Invalid sequence characters in the junction column"
)

# Test 3: IUPAC=TRUE, max_n=NULL
# - Validation (first 1000 rows): passes
# - max_n=NULL: no filtering -> row 1001 is kept
# - Distance: alakazam::pairwiseDist handles IUPAC codes correctly -> no error
# - Result: all 1001 sequences processed; seq1001 included in output
expect_message(
db_result3 <- hierarchicalClones(db_iupac_1001,
threshold = 0.15,
method = "nt", linkage = "single",
junction = "junction",
v_call = "v_call", j_call = "j_call",
IUPAC = TRUE,
max_n = NULL,
summarize_clones = FALSE
),
"Running defineClonesScoper in bulk mode and only keep heavy chains"
)
expect_equal(nrow(db_result3), n_std + 1)
expect_true("seq1001" %in% db_result3$sequence_id)
# seq1001 (TGTGCRAGCTACTGG) is within distance 0.15 of the standard sequences
# (TGTGCAAGCTACTGG): R matches A under IUPAC scoring -> same clone
expect_equal(length(unique(db_result3$clone_id)), 1)

# Same data in reverse order gives the same result (IUPAC=TRUE allows R, max_n=NULL means no filtering)
expect_message(
db_result4 <- hierarchicalClones(db_iupac_1,
threshold = 0.15,
method = "nt", linkage = "single",
junction = "junction",
v_call = "v_call", j_call = "j_call",
IUPAC = TRUE,
max_n = NULL,
summarize_clones = FALSE
),
"Running defineClonesScoper in bulk mode and only keep heavy chains"
)
# Results should be identical regardless of row order
db_result4_ordered <- db_result4[match(db_result3$sequence_id, db_result4$sequence_id), ]
expect_equal(db_result3, db_result4_ordered, check.attributes = FALSE)
})

#### clone - spectralClones - novj method ####

test_that("Test spectralClones - novj", {
Expand Down
Loading