From bf3cd69a609a3f7ce0fd6c7d819c752dfd4e4e92 Mon Sep 17 00:00:00 2001
From: Robert Bjornson <robert.bjornson@yale.edu>
Date: Fri, 5 Dec 2025 09:16:30 -0500
Subject: [PATCH 01/25] added code to load devel scoper in foreach worker

---
 R/Functions.R | 46 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/R/Functions.R b/R/Functions.R
index e13bb5d..fc25f75 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -60,6 +60,12 @@ setMethod("as.data.frame", c(x="ScoperClones"),
 
 #### Internal functions ####
 
+# RDB
+workerlog <- function(msg) {
+   log_file <- paste0("worker_", Sys.getpid(), ".log")
+   cat("In foreach", "PID", Sys.getpid(), format(Sys.time(), "%Y-%m-%d_%H:%M:%S"), msg, "\n", file = log_file, append = TRUE)
+}
+
 # find density gap
 findGapSmooth <- function(vec) {
     # bandwidth <- kedd::h.ucv(vec, 4)$h
@@ -1181,7 +1187,8 @@ defineClonesScoper <- function(db,
                                iter_max = 1000, nstart = 1000, nproc = 1,
                                verbose = FALSE, log = NULL,
                                summarize_clones = FALSE) {
-  
+
+    cat("In modified Functions.R")			       
     ### get model
     model <- match.arg(model)
     
@@ -1405,13 +1412,24 @@ defineClonesScoper <- function(db,
     } else {
         stop('Nproc must be positive.')
     }
-    
+
+    #RDB
+    ## Workers: also dev version
+       parallel::clusterEvalQ(cluster, {
+          library(devtools)
+          devtools::load_all("scoper")   # same source tree path
+	  Rcpp::sourceCpp("fastDistExt2C.cpp")
+    })
+
+    #RDB
+    cat('Loading fastDistExt2C.cpp')
+
     ### export function to clusters
     if (nproc > 1) { 
         export_functions <- list("passToClustering_lev1", "passToClustering_lev2", "passToClustering_lev3", "passToClustering_lev4",
                                  "findGapSmooth", "infer", "krnlMtxGenerator", "makeAffinity", "laplacianMtx", 
                                  "rangeAtoB", "likelihoods", "pairwiseMutions", "pairwiseMutMatrix",
-                                 "printVerbose", "logVerbose","stri_join")
+                                 "printVerbose", "logVerbose","stri_join", "workerlog")
         parallel::clusterExport(cluster, export_functions, envir=environment())
         doParallel::registerDoParallel(cluster, cores=nproc)
     }
@@ -1749,6 +1767,8 @@ passToClustering_lev1 <- function (db_gp,
                                    iter_max = 1000, 
                                    nstart = 1000) {
     ### get model
+    # RDB
+    workerlog('in passToClustering_lev1')
     model <- match.arg(model)
     
     ### begin clustering
@@ -1759,6 +1779,8 @@ passToClustering_lev1 <- function (db_gp,
                                                 cdr3 = cdr3,
                                                 cdr3_col = cdr3_col)
     } else if (model == "hierarchical") {
+        #RDB
+	workerlog('calling hierarchicalClones_helper')
         clone_results <- hierarchicalClones_helper(db_gp,
                                                    method = method,
                                                    linkage = linkage,
@@ -1831,6 +1853,8 @@ hierarchicalClones_helper <- function(db_gp,
                                       cdr3 = FALSE,
                                       cdr3_col = NA,
                                       threshold = NULL) {
+    #RDB
+    workerlog('in hierarchicalClones_helper')
     ### get method
     method <- match.arg(method)
 
@@ -1842,7 +1866,8 @@ hierarchicalClones_helper <- function(db_gp,
     
     ### number of sequences
     n <- nrow(db_gp)
-    
+
+    workerlog(paste('in hierarchicalClones_helper, n, method', n, method))
     # get sequences
     if (method == "nt") {
         seqs <- db_gp[[ifelse(cdr3, cdr3_col, junction)]]   
@@ -1856,16 +1881,23 @@ hierarchicalClones_helper <- function(db_gp,
     n_unq <- nrow(df)
     ind_unq <- df$V1
     seqs_unq <- df$seqs
+
+    workerlog(paste('in hierarchicalClones_helper', n_unq))
     if (n_unq == 1) {
         return(list("idCluster" = rep(1, n), 
                     "n_cluster" = 1, 
                     "eigen_vals" = rep(0, n)))
     }
-    
+
+    # RDB
+    workerlog('just before calculate distance matrix')
     # calculate distance matrix
     if (method == "nt") {
-        dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, 
-                                 dist_mat = getDNAMatrix(gap = 0))
+        workerlog('calling fastDistExt2_rcpp')
+       	dist_mtx <- fastDistExt2_rcpp(seqs_unq)
+	# RDB
+        #dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, 
+        #                         dist_mat = getDNAMatrix(gap = 0))
     } else if (method == "aa") {
         dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, 
                                  dist_mat = getAAMatrix(gap = 0))

From e2f299e4bb2fc77313f77689a87898167225a184 Mon Sep 17 00:00:00 2001
From: Robert Bjornson <robert.bjornson@yale.edu>
Date: Fri, 5 Dec 2025 16:01:40 -0500
Subject: [PATCH 02/25] added fastDist.cpp

---
 src/fastDist.cpp | 104 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 src/fastDist.cpp

diff --git a/src/fastDist.cpp b/src/fastDist.cpp
new file mode 100644
index 0000000..3ff9919
--- /dev/null
+++ b/src/fastDist.cpp
@@ -0,0 +1,104 @@
+#include <Rcpp.h>
+using namespace Rcpp;
+
+inline uint8_t code_char(char c){
+  switch(c){
+    case 'A': return 0; case 'C': return 1; case 'G': return 2; case 'T': return 3;
+    case 'N': return 4; case '?': return 5;
+    default:  return 255;
+  }
+}
+
+// [[Rcpp::export]]
+IntegerMatrix fastDist_rcpp(CharacterVector seqs) {
+  int N = seqs.size();
+
+  // RDB
+  printf("In fastDistExt2_rcpp, %d seqs\n", N);
+  if (N == 0) stop("empty input");
+
+  std::string s0 = as<std::string>(seqs[0]);
+  int L = (int)s0.size();
+  for (int i = 0; i < N; ++i) {
+    if ((int)std::string(as<std::string>(seqs[i])).size() != L)
+      stop("All sequences must have the same length");
+  }
+
+  // encode to uint8: N x L, row-major in a flat vector
+  std::vector<uint8_t> enc((size_t)N * L);
+  for (int i = 0; i < N; ++i) {
+    std::string s = as<std::string>(seqs[i]);
+    for (int p = 0; p < L; ++p) {
+      uint8_t c = code_char(s[p]);
+      if (c == 255) stop("Only A,C,G,T,N,? are allowed");
+      enc[(size_t)i * L + p] = c;
+    }
+  }
+
+  IntegerMatrix Mmatch(N, N); // zero-initialized
+
+  for (int p = 0; p < L; ++p) {
+    std::vector<int> A, Cb, G, Tt, Ns, Q;
+    A.reserve(N); Cb.reserve(N); G.reserve(N); Tt.reserve(N); Ns.reserve(N); Q.reserve(N);
+
+    // bucket row indices by symbol at column p
+    for (int i = 0; i < N; ++i) {
+      switch (enc[(size_t)i * L + p]) {
+        case 0: A.push_back(i);  break;
+        case 1: Cb.push_back(i); break;
+        case 2: G.push_back(i);  break;
+        case 3: Tt.push_back(i); break;
+        case 4: Ns.push_back(i); break;
+        case 5: Q.push_back(i);  break;
+      }
+    }
+
+    // column-major safe updaters: iterate over j (column) outermost,
+    // then use a pointer to column j and bump rows i.
+    auto bump_within = [&](const std::vector<int>& v){
+      int m = (int)v.size();
+      for (int jj = 0; jj < m; ++jj) {
+        int j = v[jj];
+        int* colj = &Mmatch(0, j);        // pointer to column j
+        for (int ii = 0; ii < m; ++ii) {
+          colj[v[ii]] += 1;               // increment (i = v[ii], j)
+        }
+      }
+    };
+
+    auto bump_pairs = [&](const std::vector<int>& X, const std::vector<int>& Y){
+      int mx = (int)X.size(), my = (int)Y.size();
+      for (int jj = 0; jj < my; ++jj) {
+        int j = Y[jj];
+        int* colj = &Mmatch(0, j);        // pointer to column j
+        for (int ii = 0; ii < mx; ++ii) {
+          colj[X[ii]] += 1;               // increment (i = X[ii], j)
+        }
+      }
+    };
+
+    // same known base
+    bump_within(A);  bump_within(Cb);  bump_within(G);  bump_within(Tt);
+
+    // N with known (both directions)
+    std::vector<int> K; K.reserve(A.size()+Cb.size()+G.size()+Tt.size());
+    K.insert(K.end(), A.begin(),  A.end());
+    K.insert(K.end(), Cb.begin(), Cb.end());
+    K.insert(K.end(), G.begin(),  G.end());
+    K.insert(K.end(), Tt.begin(), Tt.end());
+    bump_pairs(Ns, K);
+    bump_pairs(K, Ns);
+
+    // ? with ?
+    bump_within(Q);
+  }
+
+  // convert matches -> distances
+  for (int i = 0; i < N; ++i) {
+    for (int j = 0; j < N; ++j) {
+      Mmatch(i, j) = L - Mmatch(i, j);
+    }
+    Mmatch(i, i) = 0;
+  }
+  return Mmatch;
+}

From 2b728a6a996fb4aeb13de643921e5a3ff5fffbd8 Mon Sep 17 00:00:00 2001
From: Robert Bjornson <robert.bjornson@yale.edu>
Date: Fri, 5 Dec 2025 16:05:34 -0500
Subject: [PATCH 03/25] first attempt at proper integration

---
 R/Functions.R | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/R/Functions.R b/R/Functions.R
index fc25f75..ea38e9f 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -1415,15 +1415,11 @@ defineClonesScoper <- function(db,
 
     #RDB
     ## Workers: also dev version
-       parallel::clusterEvalQ(cluster, {
-          library(devtools)
-          devtools::load_all("scoper")   # same source tree path
-	  Rcpp::sourceCpp("fastDistExt2C.cpp")
+    parallel::clusterEvalQ(cluster, {
+        .libPaths(c("/path/to/devlib", .libPaths()))
+        library(scoper)
     })
 
-    #RDB
-    cat('Loading fastDistExt2C.cpp')
-
     ### export function to clusters
     if (nproc > 1) { 
         export_functions <- list("passToClustering_lev1", "passToClustering_lev2", "passToClustering_lev3", "passToClustering_lev4",
@@ -1893,8 +1889,8 @@ hierarchicalClones_helper <- function(db_gp,
     workerlog('just before calculate distance matrix')
     # calculate distance matrix
     if (method == "nt") {
-        workerlog('calling fastDistExt2_rcpp')
-       	dist_mtx <- fastDistExt2_rcpp(seqs_unq)
+        workerlog('calling fastDist_rcpp')
+       	dist_mtx <- fastDist_rcpp(seqs_unq)
 	# RDB
         #dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, 
         #                         dist_mat = getDNAMatrix(gap = 0))

From 8fe9e523bd39e71721c7a60a6453e911e923b8aa Mon Sep 17 00:00:00 2001
From: Robert Bjornson <robert.bjornson@yale.edu>
Date: Fri, 5 Dec 2025 16:15:21 -0500
Subject: [PATCH 04/25] fixed libPath

---
 R/Functions.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/Functions.R b/R/Functions.R
index ea38e9f..8c28d68 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -1414,9 +1414,10 @@ defineClonesScoper <- function(db,
     }
 
     #RDB
+
     ## Workers: also dev version
     parallel::clusterEvalQ(cluster, {
-        .libPaths(c("/path/to/devlib", .libPaths()))
+        .libPaths(c("/gpfs/gibbs/project/support/rdb9/working/Gabernet/251202/modified/devlib", .libPaths()))
         library(scoper)
     })
 

From 6e58d4efceebaf25adea12439f64d38d339f91d5 Mon Sep 17 00:00:00 2001
From: Robert Bjornson <robert.bjornson@yale.edu>
Date: Mon, 8 Dec 2025 09:52:34 -0500
Subject: [PATCH 05/25] added cpp interface files

---
 R/RcppExports.R     |  4 ++++
 src/RcppExports.cpp | 12 ++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/R/RcppExports.R b/R/RcppExports.R
index f02781e..70462b1 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -5,3 +5,7 @@ pairwiseMutMatrixRcpp <- function(informative_pos, mutMtx, motifMtx) {
     .Call(`_scoper_pairwiseMutMatrixRcpp`, informative_pos, mutMtx, motifMtx)
 }
 
+fastDist_rcpp <- function(seqs) {
+    .Call(`_scoper_fastDist_rcpp`, seqs)
+}
+
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 04defa6..03a552b 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -23,9 +23,21 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// fastDist_rcpp
+IntegerMatrix fastDist_rcpp(CharacterVector seqs);
+RcppExport SEXP _scoper_fastDist_rcpp(SEXP seqsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< CharacterVector >::type seqs(seqsSEXP);
+    rcpp_result_gen = Rcpp::wrap(fastDist_rcpp(seqs));
+    return rcpp_result_gen;
+END_RCPP
+}
 
 static const R_CallMethodDef CallEntries[] = {
     {"_scoper_pairwiseMutMatrixRcpp", (DL_FUNC) &_scoper_pairwiseMutMatrixRcpp, 3},
+    {"_scoper_fastDist_rcpp", (DL_FUNC) &_scoper_fastDist_rcpp, 1},
     {NULL, NULL, 0}
 };
 

From 5668d3fd5afaa133c4c8cd9a12a727c32e12c07b Mon Sep 17 00:00:00 2001
From: Robert Bjornson <robert.bjornson@yale.edu>
Date: Mon, 8 Dec 2025 10:28:52 -0500
Subject: [PATCH 06/25] more cleanup

---
 R/Functions.R    | 35 +++++++++--------------------------
 src/fastDist.cpp |  2 --
 2 files changed, 9 insertions(+), 28 deletions(-)

diff --git a/R/Functions.R b/R/Functions.R
index 8c28d68..f1f9018 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -60,12 +60,6 @@ setMethod("as.data.frame", c(x="ScoperClones"),
 
 #### Internal functions ####
 
-# RDB
-workerlog <- function(msg) {
-   log_file <- paste0("worker_", Sys.getpid(), ".log")
-   cat("In foreach", "PID", Sys.getpid(), format(Sys.time(), "%Y-%m-%d_%H:%M:%S"), msg, "\n", file = log_file, append = TRUE)
-}
-
 # find density gap
 findGapSmooth <- function(vec) {
     # bandwidth <- kedd::h.ucv(vec, 4)$h
@@ -1413,20 +1407,19 @@ defineClonesScoper <- function(db,
         stop('Nproc must be positive.')
     }
 
-    #RDB
-
-    ## Workers: also dev version
-    parallel::clusterEvalQ(cluster, {
-        .libPaths(c("/gpfs/gibbs/project/support/rdb9/working/Gabernet/251202/modified/devlib", .libPaths()))
-        library(scoper)
-    })
+    ## RDB
+    ## This is necessary until the modified code is in the default path
+    #parallel::clusterEvalQ(cluster, {
+    #    .libPaths(c("/gpfs/gibbs/project/support/rdb9/working/Gabernet/251202/modified/devlib", .libPaths()))
+    #    library(scoper)
+    #})
 
     ### export function to clusters
     if (nproc > 1) { 
         export_functions <- list("passToClustering_lev1", "passToClustering_lev2", "passToClustering_lev3", "passToClustering_lev4",
                                  "findGapSmooth", "infer", "krnlMtxGenerator", "makeAffinity", "laplacianMtx", 
                                  "rangeAtoB", "likelihoods", "pairwiseMutions", "pairwiseMutMatrix",
-                                 "printVerbose", "logVerbose","stri_join", "workerlog")
+                                 "printVerbose", "logVerbose","stri_join")
         parallel::clusterExport(cluster, export_functions, envir=environment())
         doParallel::registerDoParallel(cluster, cores=nproc)
     }
@@ -1436,6 +1429,7 @@ defineClonesScoper <- function(db,
     db_cloned <- foreach::foreach(gp=1:n_groups,
                          .final=dplyr::bind_rows,
                          .inorder=TRUE,
+			 .packages="scoper",
                          .errorhandling='stop') %dopar% { 
                              # *********************************************************************************
                              # filter each group
@@ -1764,8 +1758,6 @@ passToClustering_lev1 <- function (db_gp,
                                    iter_max = 1000, 
                                    nstart = 1000) {
     ### get model
-    # RDB
-    workerlog('in passToClustering_lev1')
     model <- match.arg(model)
     
     ### begin clustering
@@ -1776,8 +1768,6 @@ passToClustering_lev1 <- function (db_gp,
                                                 cdr3 = cdr3,
                                                 cdr3_col = cdr3_col)
     } else if (model == "hierarchical") {
-        #RDB
-	workerlog('calling hierarchicalClones_helper')
         clone_results <- hierarchicalClones_helper(db_gp,
                                                    method = method,
                                                    linkage = linkage,
@@ -1850,8 +1840,6 @@ hierarchicalClones_helper <- function(db_gp,
                                       cdr3 = FALSE,
                                       cdr3_col = NA,
                                       threshold = NULL) {
-    #RDB
-    workerlog('in hierarchicalClones_helper')
     ### get method
     method <- match.arg(method)
 
@@ -1864,7 +1852,6 @@ hierarchicalClones_helper <- function(db_gp,
     ### number of sequences
     n <- nrow(db_gp)
 
-    workerlog(paste('in hierarchicalClones_helper, n, method', n, method))
     # get sequences
     if (method == "nt") {
         seqs <- db_gp[[ifelse(cdr3, cdr3_col, junction)]]   
@@ -1879,20 +1866,16 @@ hierarchicalClones_helper <- function(db_gp,
     ind_unq <- df$V1
     seqs_unq <- df$seqs
 
-    workerlog(paste('in hierarchicalClones_helper', n_unq))
     if (n_unq == 1) {
         return(list("idCluster" = rep(1, n), 
                     "n_cluster" = 1, 
                     "eigen_vals" = rep(0, n)))
     }
 
-    # RDB
-    workerlog('just before calculate distance matrix')
     # calculate distance matrix
     if (method == "nt") {
-        workerlog('calling fastDist_rcpp')
+    	# RDB
        	dist_mtx <- fastDist_rcpp(seqs_unq)
-	# RDB
         #dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, 
         #                         dist_mat = getDNAMatrix(gap = 0))
     } else if (method == "aa") {
diff --git a/src/fastDist.cpp b/src/fastDist.cpp
index 3ff9919..a3254a3 100644
--- a/src/fastDist.cpp
+++ b/src/fastDist.cpp
@@ -13,8 +13,6 @@ inline uint8_t code_char(char c){
 IntegerMatrix fastDist_rcpp(CharacterVector seqs) {
   int N = seqs.size();
 
-  // RDB
-  printf("In fastDistExt2_rcpp, %d seqs\n", N);
   if (N == 0) stop("empty input");
 
   std::string s0 = as<std::string>(seqs[0]);

From 102252c3a5319edd1395393f89dd9cd38cc87fdb Mon Sep 17 00:00:00 2001
From: Robert Bjornson <robert.bjornson@yale.edu>
Date: Tue, 23 Dec 2025 15:27:50 -0500
Subject: [PATCH 07/25] added countSeqsWithInvalidBases_rcpp()

---
 src/fastDist.cpp | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/src/fastDist.cpp b/src/fastDist.cpp
index a3254a3..0b89d01 100644
--- a/src/fastDist.cpp
+++ b/src/fastDist.cpp
@@ -9,6 +9,33 @@ inline uint8_t code_char(char c){
   }
 }
 
+// [[Rcpp::export]]
+int countSeqsWithInvalidBases_rcpp(CharacterVector seqs) {
+  int N = seqs.size();
+  int bad = 0;
+
+  for (int i = 0; i < N; ++i) {
+    if (seqs[i] == NA_STRING) {
+      bad++;
+      continue;
+    }
+
+    std::string s = as<std::string>(seqs[i]);
+    bool invalid = false;
+    for (char ch : s) {
+      char up = (char)std::toupper((unsigned char)ch);
+      if (code_char(up) == 255) {
+        invalid = true;
+        break;
+      }
+    }
+
+    if (invalid) bad++;
+  }
+
+  return bad;
+}
+
 // [[Rcpp::export]]
 IntegerMatrix fastDist_rcpp(CharacterVector seqs) {
   int N = seqs.size();

From 9c14a2d59db1604e3ee7fb121d08041d5eec51cb Mon Sep 17 00:00:00 2001
From: Robert Bjornson <robert.bjornson@yale.edu>
Date: Tue, 23 Dec 2025 15:54:45 -0500
Subject: [PATCH 08/25] updated exports

---
 R/RcppExports.R | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/R/RcppExports.R b/R/RcppExports.R
index 70462b1..c4726d7 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -5,6 +5,10 @@ pairwiseMutMatrixRcpp <- function(informative_pos, mutMtx, motifMtx) {
     .Call(`_scoper_pairwiseMutMatrixRcpp`, informative_pos, mutMtx, motifMtx)
 }
 
+countSeqsWithInvalidBases_rcpp <- function(seqs) {
+    .Call(`_scoper_countSeqsWithInvalidBases_rcpp`, seqs)
+}
+
 fastDist_rcpp <- function(seqs) {
     .Call(`_scoper_fastDist_rcpp`, seqs)
 }

From fcd69a38843e6df8e07316095b468a746e0152ea Mon Sep 17 00:00:00 2001
From: Vivian0105 <huiminlyu0105@gmail.com>
Date: Mon, 9 Mar 2026 11:23:08 -0400
Subject: [PATCH 09/25] Add parameter IUPAC in functions hierarchicalClones,
 defineClonesScoper and hierarchicalClones_helper

---
 R/Functions.R | 47 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/R/Functions.R b/R/Functions.R
index f1f9018..02c095c 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -881,6 +881,8 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction",
 #' @param    linkage            available linkage are \code{"single"}, \code{"average"}, and \code{"complete"}.
 #' @param    normalize	        method of normalization. The default is \code{"len"}, which divides the distance by the length 
 #'                              of the sequence group. If \code{"none"} then no normalization if performed.
+#' @param    IUPAC              If \code{TRUE}, use the IUPAC coding system, which includes both standard bases and ambiguity codes. 
+#'                              If \code{FALSE} (default), only standard bases, "N" and "?" are allowed in the sequence. 
 #' @param    junction           character name of the column containing junction sequences.
 #'                              Also used to determine sequence length for grouping.
 #' @param    v_call             name of the column containing the V-segment allele calls.
@@ -974,15 +976,15 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction",
 #' 
 #' @export
 hierarchicalClones <- function(db, threshold, method=c("nt", "aa"), linkage=c("single", "average", "complete"), 
-                               normalize=c("len", "none"), junction="junction", 
+                               normalize=c("len", "none"), IUPAC=FALSE, junction="junction", 
                                v_call="v_call", j_call="j_call", clone="clone_id", fields=NULL,
                                cell_id=NULL, locus="locus", only_heavy=TRUE, split_light=FALSE,
                                first=FALSE, cdr3=FALSE, mod3=FALSE, max_n=0, nproc=1,
                                verbose=FALSE, log=NULL, summarize_clones=FALSE, seq_id = "sequence_id") {
     
     results <- defineClonesScoper(db = db, threshold = threshold, model = "hierarchical", 
-                                  method = match.arg(method), linkage = match.arg(linkage), normalize = match.arg(normalize),
-                                  junction = junction, v_call = v_call, j_call = j_call, clone = clone, fields = fields,
+                                  method = match.arg(method), linkage = match.arg(linkage), normalize = match.arg(normalize), 
+                                  IUPAC = match.arg(IUPAC), junction = junction, v_call = v_call, j_call = j_call, clone = clone, fields = fields,
                                   cell_id = cell_id, locus = locus, only_heavy = only_heavy, split_light = split_light,
                                   first = first, cdr3 = cdr3, mod3 = mod3, max_n = max_n, nproc = nproc,   
                                   verbose = verbose, log = log, summarize_clones = summarize_clones)
@@ -1172,7 +1174,7 @@ defineClonesScoper <- function(db,
                                model = c("identical", "hierarchical", "spectral"),
                                method = c("nt", "aa", "novj", "vj"),
                                linkage = c("single", "average", "complete"), normalize = c("len", "none"),
-                               germline = "germline_alignment", sequence = "sequence_alignment",
+                               germline = "germline_alignment", sequence = "sequence_alignment", IUPAC = FALSE,
                                junction = "junction", v_call = "v_call", j_call = "j_call", clone = "clone_id",
                                fields = NULL, cell_id = NULL, locus = NULL, only_heavy = TRUE, split_light = FALSE,
                                targeting_model = NULL, len_limit = NULL,
@@ -1238,13 +1240,25 @@ defineClonesScoper <- function(db,
     }
     
     ### Check for invalid characters
-    valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0))
-    .validateSeq <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% valid_chars) }
-    valid_seq <- sapply(db[[junction]], .validateSeq)
-    not_valid_seq <- which(!valid_seq)
-    if (length(not_valid_seq) > 0) {
+    if (!IUPAC && model == "hierarchical"){
+        nonIUPAC_valid <- c("A", "T", "C", "G", "N", "?")
+        .validcharacters <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% nonIUPAC_valid) }
+        nonIUPAC_valid_seq <- sapply(head(db[[junction]],1000), .validcharacters)
+        not_nonIUPAC_valid_seq <- which(!nonIUPAC_valid_seq)
+        if (length(not_nonIUPAC_valid_seq) > 0) {
         stop("invalid sequence characters in the ", junction, " column. ",
-             length(not_valid_seq)," sequence(s) found.", "\n Valid characters are: '",  valid_chars, "'")
+             "\n Valid characters are: '",  nonIUPAC_valid, "'")
+        }   
+    }
+    else{
+        valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0))
+        .validateSeq <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% valid_chars) }
+        valid_seq <- sapply(db[[junction]], .validateSeq)
+        not_valid_seq <- which(!valid_seq)
+        if (length(not_valid_seq) > 0) {
+            stop("invalid sequence characters in the ", junction, " column. ",
+                length(not_valid_seq)," sequence(s) found.", "\n Valid characters are: '",  valid_chars, "'")
+        }
     }
     
     ### temp columns
@@ -1772,6 +1786,7 @@ passToClustering_lev1 <- function (db_gp,
                                                    method = method,
                                                    linkage = linkage,
                                                    normalize = normalize,
+                                                   IUPAC = match.arg(IUPAC), 
                                                    junction = junction,
                                                    cdr3 = cdr3,
                                                    cdr3_col = cdr3_col,
@@ -1836,6 +1851,7 @@ hierarchicalClones_helper <- function(db_gp,
                                       method = c("nt", "aa"),
                                       linkage = c("single", "average", "complete"),
                                       normalize = c("len", "none"),
+                                      IUPAC = FALSE,
                                       junction = "junction",
                                       cdr3 = FALSE,
                                       cdr3_col = NA,
@@ -1874,10 +1890,13 @@ hierarchicalClones_helper <- function(db_gp,
 
     # calculate distance matrix
     if (method == "nt") {
-    	# RDB
-       	dist_mtx <- fastDist_rcpp(seqs_unq)
-        #dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, 
-        #                         dist_mat = getDNAMatrix(gap = 0))
+    	if (! IUPAC){
+            dist_mtx <- fastDist_rcpp(seqs_unq)
+        }
+       	else{
+            dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, 
+                                 dist_mat = getDNAMatrix(gap = 0))
+        }
     } else if (method == "aa") {
         dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq, 
                                  dist_mat = getAAMatrix(gap = 0))

From 0af042b2bb61724564ac535f3dcd1737c06977fb Mon Sep 17 00:00:00 2001
From: Vivian0105 <huiminlyu0105@gmail.com>
Date: Mon, 9 Mar 2026 11:47:41 -0400
Subject: [PATCH 10/25] fix small IUPAC bug

---
 R/Functions.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/Functions.R b/R/Functions.R
index 02c095c..02dffda 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -984,7 +984,7 @@ hierarchicalClones <- function(db, threshold, method=c("nt", "aa"), linkage=c("s
     
     results <- defineClonesScoper(db = db, threshold = threshold, model = "hierarchical", 
                                   method = match.arg(method), linkage = match.arg(linkage), normalize = match.arg(normalize), 
-                                  IUPAC = match.arg(IUPAC), junction = junction, v_call = v_call, j_call = j_call, clone = clone, fields = fields,
+                                  IUPAC = IUPAC, junction = junction, v_call = v_call, j_call = j_call, clone = clone, fields = fields,
                                   cell_id = cell_id, locus = locus, only_heavy = only_heavy, split_light = split_light,
                                   first = first, cdr3 = cdr3, mod3 = mod3, max_n = max_n, nproc = nproc,   
                                   verbose = verbose, log = log, summarize_clones = summarize_clones)
@@ -1786,7 +1786,7 @@ passToClustering_lev1 <- function (db_gp,
                                                    method = method,
                                                    linkage = linkage,
                                                    normalize = normalize,
-                                                   IUPAC = match.arg(IUPAC), 
+                                                   IUPAC = IUPAC, 
                                                    junction = junction,
                                                    cdr3 = cdr3,
                                                    cdr3_col = cdr3_col,

From c83316a7aa7a9a2f3b25deb996d5f55cdff5cec3 Mon Sep 17 00:00:00 2001
From: Vivian0105 <huiminlyu0105@gmail.com>
Date: Mon, 9 Mar 2026 14:59:16 -0400
Subject: [PATCH 11/25] Update DESCRIPTION, add IUPAC to function
 passToClustering_lev1

---
 DESCRIPTION         |  2 +-
 R/Functions.R       |  2 ++
 src/RcppExports.cpp | 12 ++++++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index de164e5..e4778c8 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -55,7 +55,7 @@ Suggests:
     knitr,
     rmarkdown,
     testthat (>= 3.0.0)
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.3
 Collate: 
     'Data.R'
     'Scoper.R'
diff --git a/R/Functions.R b/R/Functions.R
index 02dffda..34e16c1 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -1461,6 +1461,7 @@ defineClonesScoper <- function(db,
                                                               method = method,
                                                               linkage = ifelse(model == "hierarchical", linkage, NA),
                                                               normalize = ifelse(model == "hierarchical", normalize, NA),
+                                                              IUPAC = ifelse(model == "hierarchical", IUPAC, FALSE),
                                                               germline = germline,
                                                               sequence = sequence,
                                                               junction = junction,
@@ -1760,6 +1761,7 @@ passToClustering_lev1 <- function (db_gp,
                                    method = c("nt", "aa", "novj", "vj"),
                                    linkage = c("single", "average", "complete"),
                                    normalize = c("len", "none"),
+                                   IUPAC = FALSE,
                                    germline = "germline_alignment",
                                    sequence = "sequence_alignment",
                                    junction = "junction",
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 03a552b..c447597 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -23,6 +23,17 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// countSeqsWithInvalidBases_rcpp
+int countSeqsWithInvalidBases_rcpp(CharacterVector seqs);
+RcppExport SEXP _scoper_countSeqsWithInvalidBases_rcpp(SEXP seqsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< CharacterVector >::type seqs(seqsSEXP);
+    rcpp_result_gen = Rcpp::wrap(countSeqsWithInvalidBases_rcpp(seqs));
+    return rcpp_result_gen;
+END_RCPP
+}
 // fastDist_rcpp
 IntegerMatrix fastDist_rcpp(CharacterVector seqs);
 RcppExport SEXP _scoper_fastDist_rcpp(SEXP seqsSEXP) {
@@ -37,6 +48,7 @@ END_RCPP
 
 static const R_CallMethodDef CallEntries[] = {
     {"_scoper_pairwiseMutMatrixRcpp", (DL_FUNC) &_scoper_pairwiseMutMatrixRcpp, 3},
+    {"_scoper_countSeqsWithInvalidBases_rcpp", (DL_FUNC) &_scoper_countSeqsWithInvalidBases_rcpp, 1},
     {"_scoper_fastDist_rcpp", (DL_FUNC) &_scoper_fastDist_rcpp, 1},
     {NULL, NULL, 0}
 };

From 707a7a10e9421241192c50a8c32f66a043f00d3a Mon Sep 17 00:00:00 2001
From: ssnn-airr <susanna.marquez@yale.edu>
Date: Wed, 11 Mar 2026 17:40:23 +0100
Subject: [PATCH 12/25] typo

---
 R/Functions.R               |  2 +-
 tests/testthat/test_clone.R | 60 +++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/R/Functions.R b/R/Functions.R
index 34e16c1..c4cf197 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -536,7 +536,7 @@ prepare_db <- function(db,
             dplyr::filter(stringi::stri_count(!!rlang::sym(junction), regex = "[^ATCG]") <= max_n)
         n_after <- nrow(db)
         if ( n_before > n_after) {
-            warning(paste("Removed", n_before - n_after, "sequences with non ATCG charachters."))
+            warning(paste("Removed", n_before - n_after, "sequences with non ATCG characters."))
         }
     } else {
         n_rmv_N <- 0
diff --git a/tests/testthat/test_clone.R b/tests/testthat/test_clone.R
index 2293311..22909d9 100644
--- a/tests/testthat/test_clone.R
+++ b/tests/testthat/test_clone.R
@@ -70,6 +70,64 @@ test_that("Test hierarchicalClones", {
     }
 })
 
+#### clone - hierarchicalClones with IUPAC parameter ####
+
+test_that("Test hierarchicalClones with IUPAC parameter", {
+    # Create test data with IUPAC ambiguity codes
+    db_iupac <- data.frame(
+        sequence_id = paste0("seq", 1:10),
+        v_call = rep("IGHV1-1*01", 10),
+        j_call = rep("IGHJ1*01", 10),
+        # Use IUPAC ambiguity codes: R(A/G), Y(C/T), W(A/T), S(C/G), M(A/C), K(G/T)
+        junction = c(
+            "TGTGCRAGCTACTGG",  # R = A or G
+            "TGTGCRAGCTACTGG",  # identical to seq1
+            "TGTGCAAGCTACTGG",  # standard bases only, similar to seq1/2
+            "TGTGCYAGCTACTGG",  # Y = C or T
+            "TGTGCYAGCTACTGG",  # identical to seq4
+            "TGTGCMAGCTACTGG",  # M = A or C
+            "TGTGCWAGCTACTGG",  # W = A or T
+            "TGTGCWAGCTACTGG",  # identical to seq7
+            "TGTGCAAGCTAYTGG",  # Y in different position
+            "TGTGCAAGCTKCTGG"   # K = G or T
+        ),
+        locus = rep("IGH", 10),
+        stringsAsFactors = FALSE
+    )
+    
+    # Test 1: IUPAC=FALSE should fail with IUPAC characters
+    expect_error(
+        hierarchicalClones(db_iupac, threshold = 0.15,
+                          method = "nt", linkage = "single",
+                          junction = "junction",
+                          v_call = "v_call", j_call = "j_call",
+                          IUPAC = FALSE,
+                          summarize_clones = FALSE),
+        "invalid sequence characters"
+    )
+    
+    # Test 2: IUPAC=TRUE should run with IUPAC characters
+    expect_warning(
+        expect_message(
+            db_result <- hierarchicalClones(db_iupac, threshold = 0.15,
+                                        method = "nt", linkage = "single",
+                                        junction = "junction",
+                                        v_call = "v_call", j_call = "j_call",
+                                        IUPAC = TRUE,
+                                        summarize_clones = FALSE),
+            "Running defineClonesScoper in bulk mode and only keep heavy chains"
+        ),
+        "Removed 9 sequences with non ATCG charachters."
+    )
+    
+    # Check that clones were assigned
+    expect_true("clone_id" %in% colnames(db_result))
+    expect_true(all(!is.na(db_result$clone_id)))
+    # Sequences with IUPAC codes should be removed, leaving only seq3 which has standard bases
+    expect_equal(nrow(db_result), 1)
+
+})
+
 #### clone - spectralClones - novj method ####
 
 test_that("Test spectralClones - novj", {
@@ -558,3 +616,5 @@ test_that("Testing split_light warnings for all cloning mehtods", {
                                 cell_id = "cell_id", split_light = TRUE))
    expect_equal(db_nsplit[['clone_id']], db_split[['clone_id']])
 })
+
+

From 653df606dbb32b34baf36781f30b0a4548cab842 Mon Sep 17 00:00:00 2001
From: ssnn-airr <susanna.marquez@yale.edu>
Date: Wed, 11 Mar 2026 19:14:45 +0100
Subject: [PATCH 13/25] tests and doc

---
 R/Functions.R               |  83 +++++++--
 tests/testthat/test_clone.R | 326 ++++++++++++++++++++++++++++++++----
 2 files changed, 365 insertions(+), 44 deletions(-)

diff --git a/R/Functions.R b/R/Functions.R
index c4cf197..9bf0ffe 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -881,8 +881,16 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction",
 #' @param    linkage            available linkage are \code{"single"}, \code{"average"}, and \code{"complete"}.
 #' @param    normalize	        method of normalization. The default is \code{"len"}, which divides the distance by the length 
 #'                              of the sequence group. If \code{"none"} then no normalization if performed.
-#' @param    IUPAC              If \code{TRUE}, use the IUPAC coding system, which includes both standard bases and ambiguity codes. 
-#'                              If \code{FALSE} (default), only standard bases, "N" and "?" are allowed in the sequence. 
+#' @param    IUPAC              If \code{TRUE}, allows sequences with IUPAC codes to pass validation 
+#'                              and be used in clustering with IUPAC-aware distance calculation 
+#'                              (via \code{alakazam::pairwiseDist}). If \code{FALSE} (default), uses fast Hamming distance 
+#'                              (via \code{fastDist_rcpp}) and only allows standard bases (A, T, C, G), N, and ? 
+#'                              in sequences. IUPAC codes will cause validation errors if \code{IUPAC=FALSE}.
+#'                              Note: This parameter is only available for \code{hierarchicalClones} with 
+#'                              \code{method="nt"}. \code{identicalClones} and 
+#'                              \code{spectralClones} always behave as if \code{IUPAC=FALSE}. This parameter controls 
+#'                              the distance calculation method, not sequence filtering. See \code{max_n} for 
+#'                              filtering sequences by character content.
 #' @param    junction           character name of the column containing junction sequences.
 #'                              Also used to determine sequence length for grouping.
 #' @param    v_call             name of the column containing the V-segment allele calls.
@@ -915,11 +923,16 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction",
 #'                              less than 7 nucleotides.
 #' @param    mod3               if \code{TRUE} removes records with a \code{junction} length that is not divisible by 
 #'                              3 in nucleotide space.
-#' @param    max_n              The maximum number of degenerate characters to permit in the junction sequence 
-#'                              before excluding the record from clonal assignment. Note, with 
-#'                              \code{linkage="single"} non-informative positions can create artifactual 
-#'                              links between unrelated sequences. Use with caution. 
-#'                              Default is set to be zero. Set it as \code{"NULL"} for no action.
+#' @param    max_n              The maximum number of non-ATCG characters (degenerate positions) to permit 
+#'                              in the junction sequence before excluding the record from clonal assignment. 
+#'                              This filters sequences based on the regex pattern \code{"[^ATCG]"}, which 
+#'                              includes N, ?, and IUPAC ambiguity codes (R, Y, W, S, M, K, etc.) 
+#'                              Note: \code{max_n} operates independently 
+#'                              from \code{IUPAC} - it controls filtering by character count, while 
+#'                              \code{IUPAC} controls validation and distance calculation method. 
+#'                              With \code{linkage="single"}, non-informative positions can create 
+#'                              artifactual links between unrelated sequences. Use with caution. 
+#'                              Default is 0 (ATCG-only). Set to \code{NULL} for no filtering.
 #' @param    nproc              number of cores to distribute the function over.
 #' @param    verbose            if \code{TRUE} prints out a summary of each step cloning process.
 #'                              if \code{FALSE} (default) process cloning silently.
@@ -939,6 +952,53 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction",
 #' clonal assignment summary information and a modified input \code{db} in the \code{db} slot that 
 #' contains clonal identifiers in the specified \code{clone} column.
 #' 
+#' @section IUPAC and max_n parameters:
+#' Note: The \code{IUPAC} parameter is only available for \code{hierarchicalClones} with 
+#' \code{method="nt"} (nucleotide mode). It is ignored when \code{method="aa"} (amino acid mode). 
+#' The \code{identicalClones} and \code{spectralClones} functions always operate as if 
+#' \code{IUPAC=FALSE} (standard bases plus N and ? only). The \code{max_n} parameter 
+#' is available for all cloning functions.
+#' 
+#' The \code{IUPAC} and \code{max_n} parameters serve complementary but distinct purposes:
+#' 
+#' \code{IUPAC} controls:
+#' \itemize{
+#'   \item Sequence validation (which characters are allowed)
+#'   \item Distance calculation method (fast Hamming vs IUPAC-aware scoring)
+#' }
+#' 
+#' \code{max_n} controls:
+#' \itemize{
+#'   \item Sequence filtering by counting non-ATCG characters
+#' }
+#' 
+#' Common use cases (for \code{method="nt"}):
+#' \itemize{
+#'   \item \code{IUPAC=FALSE, max_n=0}: Strict ATCG-only mode with fast distance calculation. 
+#'         Rejects any sequences with IUPAC codes, N, or ?. Fastest option for high-quality data.
+#'   \item \code{IUPAC=FALSE, max_n=1+}: Allows sequences with limited N/? characters while 
+#'         rejecting IUPAC ambiguity codes at validation. Uses fast Hamming distance. Note: IUPAC 
+#'         codes are rejected during validation (before max_n filtering), so max_n only affects 
+#'         sequences that passed validation with N or ? characters. Useful for data with low-quality 
+#'         or masked positions but no experimental ambiguity codes.
+#'   \item \code{IUPAC=TRUE, max_n=0}: Uses IUPAC-aware distance but filters out all non-ATCG 
+#'         characters anyway. Only standard bases remain after filtering.
+#'   \item \code{IUPAC=TRUE, max_n=1+}: Allows sequences with limited ambiguity codes and uses 
+#'         proper IUPAC-aware distance calculation. Slower but handles biological ambiguity correctly. 
+#'         Set max_n to the maximum number of ambiguous positions per sequence you want to tolerate 
+#'         (counts all non-ATCG: N, ?, and IUPAC codes).
+#'   \item \code{IUPAC=TRUE, max_n=NULL}: Process all sequences with IUPAC codes regardless of the 
+#'         number of ambiguous positions. Uses IUPAC-aware distance calculation with no filtering. 
+#'         Most permissive option for data with extensive IUPAC ambiguity codes.
+#' }
+#' 
+#' Note: Validation occurs before filtering. When \code{IUPAC=FALSE}, sequences containing IUPAC 
+#' ambiguity codes (R, Y, W, S, M, K, etc.) will fail validation and be rejected before the 
+#' \code{max_n} filtering step. Therefore, with \code{IUPAC=FALSE, max_n > 0}, only sequences 
+#' with N and ? characters (not IUPAC codes) can pass validation and be filtered by \code{max_n}. 
+#' The \code{max_n} parameter always counts using regex \code{"[^ATCG]"}, but \code{IUPAC} determines 
+#' which non-ATCG characters are allowed to reach the filtering step.
+#' 
 #' @section Single-cell data:
 #' To invoke single-cell mode the \code{cell_id} argument must be specified and the \code{locus} 
 #' column must be correct. Otherwise, clustering will be performed with bulk sequencing assumptions, 
@@ -1054,9 +1114,12 @@ hierarchicalClones <- function(db, threshold, method=c("nt", "aa"), linkage=c("s
 #'                              less than 7 nucleotides.
 #' @param    mod3               if \code{TRUE} removes records with a \code{junction} length that is not divisible by 
 #'                              3 in nucleotide space.
-#' @param    max_n              the maximum number of degenerate characters to permit in the junction sequence before excluding the 
-#'                              record from clonal assignment. Default is set to be zero. Set it as \code{"NULL"} 
-#'                              for no action.
+#' @param    max_n              The maximum number of degenerate characters to permit in the junction sequence before excluding the 
+#'                              record from clonal assignment. Counts non-ATCG characters using regex \code{"[^ATCG]"}, which 
+#'                              includes N, ?, and IUPAC ambiguity codes. Note: \code{spectralClones} does not support the 
+#'                              \code{IUPAC} parameter and always validates sequences as if \code{IUPAC=FALSE} (only allows 
+#'                              standard bases A,T,C,G plus N and ?). IUPAC ambiguity codes will cause validation errors. 
+#'                              Default is 0. Set to \code{NULL} for no filtering.
 #' @param    threshold          the supervising cut-off to enforce an upper-limit distance for clonal grouping.
 #'                              A numeric value between (0,1).
 #' @param    base_sim           required similarity cut-off for sequences in equal distances from each other.
diff --git a/tests/testthat/test_clone.R b/tests/testthat/test_clone.R
index 22909d9..0c0ff4d 100644
--- a/tests/testthat/test_clone.R
+++ b/tests/testthat/test_clone.R
@@ -73,59 +73,317 @@ test_that("Test hierarchicalClones", {
 #### clone - hierarchicalClones with IUPAC parameter ####
 
 test_that("Test hierarchicalClones with IUPAC parameter", {
+    # IUPAC and max_n serve different purposes:
+    # - IUPAC: Controls (1) validation (which characters allowed) and 
+    #          (2) distance calculation method (fast Hamming vs IUPAC-aware scoring)
+    # - max_n: Filters sequences by counting non-ATCG characters using regex "[^ATCG]"
+    #          (includes N, ?, and all IUPAC codes - whatever passed validation)
+    #
+    # Processing order: Validation -> Filtering -> Distance calculation
+    #
+    # Key use cases tested:
+    # 1. IUPAC=FALSE, max_n=0: Strict ATCG-only mode with fast distance calculation
+    # 2. IUPAC=TRUE, max_n=0: IUPAC-aware distance (but filters out IUPAC seqs anyway)
+    # 3. IUPAC=TRUE, max_n=1+: Main use case - IUPAC codes with proper scoring
+    # 4. IUPAC=FALSE, max_n=1+: Allow N/? but reject IUPAC codes (validation blocks them)
+    # 5. IUPAC=FALSE/TRUE with standard bases: Backward compatibility (same results)
+    
     # Create test data with IUPAC ambiguity codes
+    # With threshold=0.15 and length 15, distance > 0.15 requires >=3 mutations (3/15=0.2)
     db_iupac <- data.frame(
-        sequence_id = paste0("seq", 1:10),
-        v_call = rep("IGHV1-1*01", 10),
-        j_call = rep("IGHJ1*01", 10),
+        sequence_id = paste0("seq", 1:12),
+        v_call = rep("IGHV1-1*01", 12),
+        j_call = rep("IGHJ1*01", 12),
         # Use IUPAC ambiguity codes: R(A/G), Y(C/T), W(A/T), S(C/G), M(A/C), K(G/T)
         junction = c(
-            "TGTGCRAGCTACTGG",  # R = A or G
-            "TGTGCRAGCTACTGG",  # identical to seq1
-            "TGTGCAAGCTACTGG",  # standard bases only, similar to seq1/2
-            "TGTGCYAGCTACTGG",  # Y = C or T
-            "TGTGCYAGCTACTGG",  # identical to seq4
-            "TGTGCMAGCTACTGG",  # M = A or C
-            "TGTGCWAGCTACTGG",  # W = A or T
-            "TGTGCWAGCTACTGG",  # identical to seq7
-            "TGTGCAAGCTAYTGG",  # Y in different position
-            "TGTGCAAGCTKCTGG"   # K = G or T
+            # Clone 1: sequences similar to TGTGCAAGCTACTGG (0-1 mutations apart)
+            "TGTGCRAGCTACTGG", # R = A or G at pos 5
+            "TGTGCRAGCTACTGG", # identical to seq1
+            "TGTGCAAGCTACTGG", # standard bases only
+            "TGTGCYAGCTACTGG", # Y = C or T at pos 5, 1 mutation from seq3
+            "TGTGCYAGCTACTGG", # identical to seq4
+            "TGTGCMAGCTACTGG", # M = A or C at pos 5, 1 mutation from seq3
+            "TGTGCWAGCTACTGG", # W = A or T at pos 5, 1 mutation from seq3
+            
+            # Clone 2: sequences similar to ACGTTTGGCCAAACC (3+ mutations from Clone 1)
+            "ACGTTTGGCCAAACC", # standard bases, distant from Clone 1
+            "ACGTTTGGCCAAACC", # identical to seq8
+            "ACGKTTGGCCAAACC", # K = G or T at pos 4, 1 mutation from seq8
+            "ACGRTTGGCCAAACC", # R = A or G at pos 4, 1 mutation from seq8
+            "ACGTTTSGCCAAACC"  # S = C or G at pos 7, 1 mutation from seq8
+        ),
+        locus = rep("IGH", 12),
+        stringsAsFactors = FALSE
+    )
+
+    # Test 1: IUPAC=FALSE should reject IUPAC characters at validation stage
+    # Validation (IUPAC=FALSE): Only allows A,T,C,G,N,? - REJECTS IUPAC codes (R,Y,W,S,M,K,etc)
+    # Result: Sequences with IUPAC codes fail validation and raise error before any filtering
+    # Use case: Strict ATCG-only mode with fast Hamming distance (fastDist_rcpp)
+    expect_error(
+        hierarchicalClones(db_iupac,
+            threshold = 0.15,
+            method = "nt", linkage = "single",
+            junction = "junction",
+            v_call = "v_call", j_call = "j_call",
+            IUPAC = FALSE,  # Use fast Hamming distance for ATCG only
+            summarize_clones = FALSE
+        ),
+        "invalid sequence characters"
+    )
+
+    # Test 2: IUPAC=TRUE with max_n=0 - validates with IUPAC but filters them out
+    # 1. Validation (IUPAC=TRUE): Allows standard bases, N, ?, and IUPAC codes - all sequences pass
+    # 2. Filtering (max_n=0): Counts non-ATCG using "[^ATCG]" - removes all sequences with any non-ATCG
+    # Result: Only seq3, seq8 and seq9 (standard bases only) remain after filtering
+    expect_warning(
+        expect_message(
+            db_result2 <- hierarchicalClones(db_iupac,
+                threshold = 0.15,
+                method = "nt", linkage = "single",
+                junction = "junction",
+                v_call = "v_call", j_call = "j_call",
+                IUPAC = TRUE,   # Use IUPAC-aware distance calculation
+                max_n = 0,      # Filter out sequences with ANY non-ATCG chars
+                summarize_clones = FALSE
+            ),
+            "Running defineClonesScoper in bulk mode and only keep heavy chains"
+        ),
+        "Removed 9 sequences with non ATCG characters."
+    )
+
+    # Only 3 sequences with standard bases remain
+    expect_equal(nrow(db_result2), 3)
+    expect_equal(sort(db_result2$sequence_id), c("seq3", "seq8", "seq9"))
+    # They are in different clones (distant sequences)
+    expect_equal(length(unique(db_result2$clone_id)), 2)
+
+    # Test 3a: Create dataset with N characters to distinguish from IUPAC codes
+    db_with_n <- db_iupac
+    db_with_n[13,] <- NA
+    db_with_n$junction[13] <- "TGTGCNAGCTACTGG"  # Add seq13 with N
+    db_with_n$sequence_id <- c(db_with_n$sequence_id[1:12], "seq13")
+    db_with_n$v_call <- c(db_with_n$v_call[1:12], "IGHV1-1*01")
+    db_with_n$j_call <- c(db_with_n$j_call[1:12], "IGHJ1*01")
+    db_with_n$locus <- c(db_with_n$locus[1:12], "IGH")
+    
+    # Test 3b: IUPAC=TRUE with max_n=1 - allows up to 1 non-ATCG character per sequence
+    # This is the main use case: proper IUPAC distance calculation with ambiguity codes
+    # 1. Validation (IUPAC=TRUE): Allows standard bases (A,T,C,G) plus N, ?, and IUPAC codes (R,Y,W,S,M,K,etc)
+    # 2. Filtering (max_n=1): Counts non-ATCG using "[^ATCG]" - keeps sequences with ≤1 such character
+    # 3. Distance: Uses IUPAC-aware scoring (alakazam::pairwiseDist) for proper ambiguity handling
+    expect_message(
+        db_result3 <- hierarchicalClones(db_with_n,
+            threshold = 0.15,
+            method = "nt", linkage = "single",
+            junction = "junction",
+            v_call = "v_call", j_call = "j_call",
+            IUPAC = TRUE,   # Use IUPAC-aware distance calculation
+            max_n = 1,      # Allow up to 1 non-ATCG character (IUPAC or N)
+            summarize_clones = FALSE
+        ),
+        "Running defineClonesScoper in bulk mode and only keep heavy chains"
+    )
+    
+    # All 13 sequences kept (each has ≤1 non-ATCG character)
+    expect_equal(nrow(db_result3), 13)
+    
+    # Verify that we have 2 different clusters (Clone 1 and Clone 2)
+    n_clones <- length(unique(db_result3$clone_id))
+    expect_equal(n_clones, 2)
+    
+    # Identical sequences with IUPAC codes should cluster together
+    # Note: rows may be reordered, so reference by sequence_id
+    get_clone <- function(seq_id) db_result3$clone_id[db_result3$sequence_id == seq_id]
+    
+    expect_equal(get_clone("seq1"), get_clone("seq2"))  # seq1=seq2 (both R)
+    expect_equal(get_clone("seq4"), get_clone("seq5"))  # seq4=seq5 (both Y)
+    expect_equal(get_clone("seq8"), get_clone("seq9"))  # seq8=seq9 (standard)
+    
+    # seq13 (with N) should cluster with Clone 1 (similar to seq3)
+    expect_equal(get_clone("seq3"), get_clone("seq13"))
+    
+    # Verify two distinct clones based on biological distance
+    clone1_id <- get_clone("seq3")  # seq3 in Clone 1
+    clone2_id <- get_clone("seq8")  # seq8 in Clone 2
+    expect_true(clone1_id != clone2_id)
+    expect_true(all(sapply(paste0("seq", 1:7), get_clone) == clone1_id))   # Clone 1: seq1-7
+    expect_true(all(sapply(paste0("seq", 8:12), get_clone) == clone2_id))  # Clone 2: seq8-12
+    expect_true(get_clone("seq13") == clone1_id)                           # seq13 joins Clone 1
+
+    # Test 4: IUPAC=FALSE with max_n=1+ - allows N/? but rejects other IUPAC codes
+    # This demonstrates the two-stage process: validation -> filtering
+    # 1. Validation (IUPAC=FALSE): Allows A,T,C,G,N,? but REJECTS IUPAC codes (R,Y,W,S,M,K,etc)
+    # 2. Filtering (max_n): Counts non-ATCG using "[^ATCG]" regex on sequences that passed validation
+    # Use case: Tolerate low-quality positions (N,?) with fast Hamming distance, but no ambiguity codes
+    db_with_n_only <- data.frame(
+        sequence_id = paste0("seq", 1:5),
+        v_call = rep("IGHV1-1*01", 5),
+        j_call = rep("IGHJ1*01", 5),
+        junction = c(
+            "TGTGCAAGCTACTGG",  # standard bases only
+            "TGTGCNAGCTACTGG",  # 1 N (allowed with max_n=1)
+            "TGTGC?AGCTACTGG",  # 1 ? (allowed with max_n=1)
+            "TGTGCNNGCTACTGG",  # 2 N's (filtered with max_n=1)
+            "TGTGCRAGCTACTGG"   # IUPAC R code (should fail validation)
         ),
-        locus = rep("IGH", 10),
+        locus = rep("IGH", 5),
         stringsAsFactors = FALSE
     )
     
-    # Test 1: IUPAC=FALSE should fail with IUPAC characters
+    # Should reject at validation stage due to IUPAC code (R) in seq5
     expect_error(
-        hierarchicalClones(db_iupac, threshold = 0.15,
-                          method = "nt", linkage = "single",
-                          junction = "junction",
-                          v_call = "v_call", j_call = "j_call",
-                          IUPAC = FALSE,
-                          summarize_clones = FALSE),
+        hierarchicalClones(db_with_n_only,
+            threshold = 0.15,
+            method = "nt", linkage = "single",
+            junction = "junction",
+            v_call = "v_call", j_call = "j_call",
+            IUPAC = FALSE,  # Reject IUPAC codes
+            max_n = 1,      # Allow up to 1 N or ?
+            summarize_clones = FALSE
+        ),
         "invalid sequence characters"
     )
     
-    # Test 2: IUPAC=TRUE should run with IUPAC characters
+    # Test with only N/? (no IUPAC codes) - should succeed
+    db_n_only_valid <- db_with_n_only[1:4, ]  # Remove seq5 with IUPAC code
+    
     expect_warning(
         expect_message(
-            db_result <- hierarchicalClones(db_iupac, threshold = 0.15,
-                                        method = "nt", linkage = "single",
-                                        junction = "junction",
-                                        v_call = "v_call", j_call = "j_call",
-                                        IUPAC = TRUE,
-                                        summarize_clones = FALSE),
+            db_result5 <- hierarchicalClones(db_n_only_valid,
+                threshold = 0.15,
+                method = "nt", linkage = "single",
+                junction = "junction",
+                v_call = "v_call", j_call = "j_call",
+                IUPAC = FALSE,  # Use fast Hamming distance
+                max_n = 1,      # Allow up to 1 N or ?
+                summarize_clones = FALSE
+            ),
             "Running defineClonesScoper in bulk mode and only keep heavy chains"
         ),
-        "Removed 9 sequences with non ATCG charachters."
+        "Removed 1 sequences with non ATCG characters."  # seq4 with 2 N's filtered
     )
     
-    # Check that clones were assigned
-    expect_true("clone_id" %in% colnames(db_result))
-    expect_true(all(!is.na(db_result$clone_id)))
-    # Sequences with IUPAC codes should be removed, leaving only seq3 which has standard bases
-    expect_equal(nrow(db_result), 1)
+    # Should keep seq1, seq2, seq3 (0-1 non-ATCG characters each)
+    expect_equal(nrow(db_result5), 3)
+    expect_equal(sort(db_result5$sequence_id), c("seq1", "seq2", "seq3"))
+    # All are similar sequences, should be in same clone
+    expect_equal(length(unique(db_result5$clone_id)), 1)
+
+    # Test 5: Standard bases work with both IUPAC=TRUE and IUPAC=FALSE
+    # This demonstrates backward compatibility and performance consideration:
+    # - IUPAC=FALSE: Uses fast Hamming distance (fastDist_rcpp) - faster
+    # - IUPAC=TRUE: Uses IUPAC-aware scoring (alakazam::pairwiseDist) - slower but handles ambiguity
+    # For standard bases (ATCG), results should be identical
+    db_standard <- ExampleDb[1:50, ]
+
+    # Verify db_standard contains only standard bases (not IUPAC ambiguity codes)
+    standard_chars <- c("A", "T", "C", "G", "N", "?")
+    .is_standard <- function(x) {
+        all(unique(strsplit(x, "")[[1]]) %in% standard_chars)
+    }
+    all_standard <- sapply(db_standard$junction, .is_standard)
+    expect_true(all(all_standard),
+        info = "ExampleDb contains IUPAC ambiguity codes - update test data or test expectations"
+    )
 
+    expect_message(
+        db_result_false <- hierarchicalClones(db_standard,
+            threshold = 0.15,
+            method = "nt", linkage = "single",
+            junction = "junction",
+            v_call = "v_call", j_call = "j_call",
+            IUPAC = FALSE,  # Fast Hamming distance for ATCG only
+            summarize_clones = FALSE
+        ),
+        "Running defineClonesScoper in bulk mode and only keep heavy chains"
+    )
+
+    expect_message(
+        db_result_true <- hierarchicalClones(db_standard,
+            threshold = 0.15,
+            method = "nt", linkage = "single",
+            junction = "junction",
+            v_call = "v_call", j_call = "j_call",
+            IUPAC = TRUE,   # IUPAC-aware distance (slower but handles ambiguity)
+            summarize_clones = FALSE
+        ),
+        "Running defineClonesScoper in bulk mode and only keep heavy chains"
+    )
+
+    # Results should be identical for standard bases
+    # (IUPAC just changes the distance calculation method, not the result)
+    expect_equal(nrow(db_result_false), nrow(db_result_true))
+    expect_true(all(!is.na(db_result_false$clone_id)))
+    expect_true(all(!is.na(db_result_true$clone_id)))
+    # Clone assignments should be the same (though IDs may differ)
+    expect_equal(length(unique(db_result_false$clone_id)),
+                 length(unique(db_result_true$clone_id)))
+    
+    # Test 6: IUPAC=TRUE with max_n=NULL - no filtering, process all sequences
+    # This is the most permissive option for IUPAC data
+    # 1. Validation (IUPAC=TRUE): Allows standard bases, N, ?, and all IUPAC codes
+    # 2. Filtering (max_n=NULL): No filtering - all sequences kept regardless of character count
+    # 3. Distance: Uses IUPAC-aware scoring for proper ambiguity handling
+    # Use case: Data with extensive IUPAC codes where you want to process everything
+    
+    # Create test data with multiple IUPAC codes per sequence
+    db_iupac_multi <- data.frame(
+        sequence_id = paste0("seq", 1:8),
+        v_call = rep("IGHV1-1*01", 8),
+        j_call = rep("IGHJ1*01", 8),
+        junction = c(
+            # Clone 1: sequences with multiple IUPAC codes, similar to each other
+            "TGTGCRAGCTRYCTGG",  # R at pos 6, R at pos 12, Y at pos 13 (3 non-ATCG)
+            "TGTGCRAGCTRYCTGG",  # identical to seq1
+            "TGTGCRAGCTWYNMGG",  # R at pos 6, W at pos 12, Y at pos 13, N at pos 14, M at pos 15 (5 non-ATCG)
+            "TGTGCAAGCTACTGG",   # standard bases only (0 non-ATCG)
+            
+            # Clone 2: sequences distant from Clone 1, also with IUPAC codes
+            "ACGKTTRGCCNNNYYY",  # K, R, 3 N's, 3 Y's (8 non-ATCG characters!)
+            "ACGKTTRGCCNNNYYY",  # identical to seq5
+            "ACGTTTGGCCAAACC",   # standard bases only (0 non-ATCG)
+            "ACGKTTSGCCNNNYYY"   # K, S, 3 N's, 3 Y's (8 non-ATCG)
+        ),
+        locus = rep("IGH", 8),
+        stringsAsFactors = FALSE
+    )
+    
+    # With max_n=NULL, all sequences should be kept regardless of IUPAC code count
+    expect_message(
+        db_result6 <- hierarchicalClones(db_iupac_multi,
+            threshold = 0.15,
+            method = "nt", linkage = "single",
+            junction = "junction",
+            v_call = "v_call", j_call = "j_call",
+            IUPAC = TRUE,     # Use IUPAC-aware distance calculation
+            max_n = NULL,     # No filtering - process all sequences
+            summarize_clones = FALSE
+        ),
+        "Running defineClonesScoper in bulk mode and only keep heavy chains"
+    )
+    
+    # All 8 sequences kept (no filtering applied)
+    expect_equal(nrow(db_result6), 8)
+    expect_equal(sort(db_result6$sequence_id), paste0("seq", 1:8))
+    
+    # Should have 2 clones based on biological distance
+    expect_equal(length(unique(db_result6$clone_id)), 2)
+    
+    # Helper function to get clone ID by sequence ID
+    get_clone6 <- function(seq_id) db_result6$clone_id[db_result6$sequence_id == seq_id]
+    
+    # Verify identical sequences cluster together despite many IUPAC codes
+    expect_equal(get_clone6("seq1"), get_clone6("seq2"))  # both identical with 3 IUPAC
+    expect_equal(get_clone6("seq5"), get_clone6("seq6"))  # both identical with 8 IUPAC
+    
+    # Verify Clone 1 (seq1-4) and Clone 2 (seq5-8) are distinct
+    clone1_id <- get_clone6("seq1")
+    clone2_id <- get_clone6("seq5")
+    expect_true(clone1_id != clone2_id)
+    expect_true(all(sapply(paste0("seq", 1:4), get_clone6) == clone1_id))
+    expect_true(all(sapply(paste0("seq", 5:8), get_clone6) == clone2_id))
+    
 })
 
 #### clone - spectralClones - novj method ####

From ef421dc2a873bf77a024f87caa04b957bf383bc1 Mon Sep 17 00:00:00 2001
From: ssnn-airr <susanna.marquez@yale.edu>
Date: Thu, 12 Mar 2026 10:16:35 +0100
Subject: [PATCH 14/25] updated docs

---
 R/Functions.R | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/R/Functions.R b/R/Functions.R
index 9bf0ffe..bd750c0 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -885,8 +885,7 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction",
 #'                              and be used in clustering with IUPAC-aware distance calculation 
 #'                              (via \code{alakazam::pairwiseDist}). If \code{FALSE} (default), uses fast Hamming distance 
 #'                              (via \code{fastDist_rcpp}) and only allows standard bases (A, T, C, G), N, and ? 
-#'                              in sequences. IUPAC codes will cause validation errors if \code{IUPAC=FALSE}.
-#'                              Note: This parameter is only available for \code{hierarchicalClones} with 
+#'                              in sequences. Note: This parameter is only available for \code{hierarchicalClones} with 
 #'                              \code{method="nt"}. \code{identicalClones} and 
 #'                              \code{spectralClones} always behave as if \code{IUPAC=FALSE}. This parameter controls 
 #'                              the distance calculation method, not sequence filtering. See \code{max_n} for 
@@ -925,8 +924,6 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction",
 #'                              3 in nucleotide space.
 #' @param    max_n              The maximum number of non-ATCG characters (degenerate positions) to permit 
 #'                              in the junction sequence before excluding the record from clonal assignment. 
-#'                              This filters sequences based on the regex pattern \code{"[^ATCG]"}, which 
-#'                              includes N, ?, and IUPAC ambiguity codes (R, Y, W, S, M, K, etc.) 
 #'                              Note: \code{max_n} operates independently 
 #'                              from \code{IUPAC} - it controls filtering by character count, while 
 #'                              \code{IUPAC} controls validation and distance calculation method. 
@@ -976,14 +973,14 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction",
 #' \itemize{
 #'   \item \code{IUPAC=FALSE, max_n=0}: Strict ATCG-only mode with fast distance calculation. 
 #'         Rejects any sequences with IUPAC codes, N, or ?. Fastest option for high-quality data.
-#'   \item \code{IUPAC=FALSE, max_n=1+}: Allows sequences with limited N/? characters while 
+#'   \item \code{IUPAC=FALSE, max_n>0}: Allows sequences with limited N/? characters while 
 #'         rejecting IUPAC ambiguity codes at validation. Uses fast Hamming distance. Note: IUPAC 
 #'         codes are rejected during validation (before max_n filtering), so max_n only affects 
 #'         sequences that passed validation with N or ? characters. Useful for data with low-quality 
 #'         or masked positions but no experimental ambiguity codes.
 #'   \item \code{IUPAC=TRUE, max_n=0}: Uses IUPAC-aware distance but filters out all non-ATCG 
 #'         characters anyway. Only standard bases remain after filtering.
-#'   \item \code{IUPAC=TRUE, max_n=1+}: Allows sequences with limited ambiguity codes and uses 
+#'   \item \code{IUPAC=TRUE, max_n>0}: Allows sequences with limited ambiguity codes and uses 
 #'         proper IUPAC-aware distance calculation. Slower but handles biological ambiguity correctly. 
 #'         Set max_n to the maximum number of ambiguous positions per sequence you want to tolerate 
 #'         (counts all non-ATCG: N, ?, and IUPAC codes).

From 308e685f1fd1ed79276407f97d7d7c123d8c97a3 Mon Sep 17 00:00:00 2001
From: ssnn-airr <susanna.marquez@yale.edu>
Date: Tue, 17 Mar 2026 14:22:01 +0100
Subject: [PATCH 15/25] fix test data

---
 tests/testthat/test_clone.R | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/testthat/test_clone.R b/tests/testthat/test_clone.R
index 0c0ff4d..378931b 100644
--- a/tests/testthat/test_clone.R
+++ b/tests/testthat/test_clone.R
@@ -337,12 +337,11 @@ test_that("Test hierarchicalClones with IUPAC parameter", {
             "TGTGCRAGCTRYCTGG",  # R at pos 6, R at pos 12, Y at pos 13 (3 non-ATCG)
             "TGTGCRAGCTRYCTGG",  # identical to seq1
             "TGTGCRAGCTWYNMGG",  # R at pos 6, W at pos 12, Y at pos 13, N at pos 14, M at pos 15 (5 non-ATCG)
-            "TGTGCAAGCTACTGG",   # standard bases only (0 non-ATCG)
-            
+            "TGTGCAAGCTACCTGG",  # standard bases only (0 non-ATCG)
             # Clone 2: sequences distant from Clone 1, also with IUPAC codes
             "ACGKTTRGCCNNNYYY",  # K, R, 3 N's, 3 Y's (8 non-ATCG characters!)
             "ACGKTTRGCCNNNYYY",  # identical to seq5
-            "ACGTTTGGCCAAACC",   # standard bases only (0 non-ATCG)
+            "ACGGTTAGCCAAACCC",  # standard bases only (0 non-ATCG)
             "ACGKTTSGCCNNNYYY"   # K, S, 3 N's, 3 Y's (8 non-ATCG)
         ),
         locus = rep("IGH", 8),
@@ -723,7 +722,7 @@ test_that("Test hierarchicalClones only_heavy and first", {
     # expect sequences from cells 1 and 3 in clone 1, and
     # sequences from cell 2 in clone 2
     clones_split_F_th0@db %>%
-        mutate(expected_clone_id = dplyr::case_when(
+        dplyr::mutate(expected_clone_id = dplyr::case_when(
             cell_id %in% c(1, 3) ~ "1",
             cell_id %in% c(2) ~ "2",
             TRUE ~ NA_character_
@@ -800,7 +799,7 @@ test_that("Test hierarchicalClones only_heavy and first", {
     # sequences from cell 2 in clone 2, regardless of
     # light chains
     clones_split_F_th0_2l@db %>%
-        mutate(expected_clone_id = dplyr::case_when(
+        dplyr::mutate(expected_clone_id = dplyr::case_when(
             cell_id %in% c(1, 3) ~ "1",
             cell_id %in% c(2) ~ "2",
             TRUE ~ NA_character_

From b17f0543a7d12b73083e947f251eed8536aac866 Mon Sep 17 00:00:00 2001
From: ssnn-airr <susanna.marquez@yale.edu>
Date: Thu, 19 Mar 2026 09:34:25 +0100
Subject: [PATCH 16/25] updated docs

---
 R/Functions.R | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/R/Functions.R b/R/Functions.R
index bd750c0..bd3910e 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -886,9 +886,8 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction",
 #'                              (via \code{alakazam::pairwiseDist}). If \code{FALSE} (default), uses fast Hamming distance 
 #'                              (via \code{fastDist_rcpp}) and only allows standard bases (A, T, C, G), N, and ? 
 #'                              in sequences. Note: This parameter is only available for \code{hierarchicalClones} with 
-#'                              \code{method="nt"}. \code{identicalClones} and 
-#'                              \code{spectralClones} always behave as if \code{IUPAC=FALSE}. This parameter controls 
-#'                              the distance calculation method, not sequence filtering. See \code{max_n} for 
+#'                              \code{method="nt"}. This parameter controls validation and distance
+#'                              calculation method, not sequence filtering. See \code{max_n} for 
 #'                              filtering sequences by character content.
 #' @param    junction           character name of the column containing junction sequences.
 #'                              Also used to determine sequence length for grouping.
@@ -972,18 +971,20 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction",
 #' Common use cases (for \code{method="nt"}):
 #' \itemize{
 #'   \item \code{IUPAC=FALSE, max_n=0}: Strict ATCG-only mode with fast distance calculation. 
-#'         Rejects any sequences with IUPAC codes, N, or ?. Fastest option for high-quality data.
-#'   \item \code{IUPAC=FALSE, max_n>0}: Allows sequences with limited N/? characters while 
-#'         rejecting IUPAC ambiguity codes at validation. Uses fast Hamming distance. Note: IUPAC 
-#'         codes are rejected during validation (before max_n filtering), so max_n only affects 
-#'         sequences that passed validation with N or ? characters. Useful for data with low-quality 
+#'         Will throw an error and exit if sequences with IUPAC codes not A, T, C, G, N, or ? are detected
+#'         among the first 1000 sequences. Fastest option for high-quality data.
+#'   \item \code{IUPAC=FALSE, max_n>0}: Will throw an error and exit if sequences with IUPAC 
+#'         codes not A, T, C, G, N, or ? are detected among the first 1000 sequences. Allows sequences 
+#'         with limited N/? characters in distance calculation, using fast Hamming distance. 
+#'         Note: IUPAC codes are rejected during validation (before max_n filtering), so max_n only 
+#'         affects sequences that passed validation with N or ? characters. Useful for data with low-quality 
 #'         or masked positions but no experimental ambiguity codes.
 #'   \item \code{IUPAC=TRUE, max_n=0}: Uses IUPAC-aware distance but filters out all non-ATCG 
 #'         characters anyway. Only standard bases remain after filtering.
 #'   \item \code{IUPAC=TRUE, max_n>0}: Allows sequences with limited ambiguity codes and uses 
 #'         proper IUPAC-aware distance calculation. Slower but handles biological ambiguity correctly. 
 #'         Set max_n to the maximum number of ambiguous positions per sequence you want to tolerate 
-#'         (counts all non-ATCG: N, ?, and IUPAC codes).
+#'         (counts all non-ATCG: N, ?, and other IUPAC codes).
 #'   \item \code{IUPAC=TRUE, max_n=NULL}: Process all sequences with IUPAC codes regardless of the 
 #'         number of ambiguous positions. Uses IUPAC-aware distance calculation with no filtering. 
 #'         Most permissive option for data with extensive IUPAC ambiguity codes.

From ba063d6dfd16271e9fa6ecfca8282ccf2d136307 Mon Sep 17 00:00:00 2001
From: ssnn-airr <susanna.marquez@yale.edu>
Date: Thu, 19 Mar 2026 09:53:38 +0100
Subject: [PATCH 17/25] added Huimin to contributors

---
 DESCRIPTION | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index e4778c8..f4530cb 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,14 +1,16 @@
 Package: scoper
 Type: Package
 Version: 1.3.1.999
-Date: 2025-02-26
+Date: 2026-03-17
 Authors@R: c(person("Nima", "Nouri", role=c("aut"), 
                     email="nima.nouri@yale.edu"),
              person("Edel", "Aron", role=c("ctb"),
                     email="edel.aron@yale.edu"),
              person("Gisela", "Gabernet", role=c("ctb"),
                     email="gisela.gabernet@yale.edu"),
-              person("Cole", "Jensen", role=c("ctb"),
+             person("Huimin", "Lyu", role=c("ctb"),
+                    email="huimin.lyu@yale.edu"),                    
+             person("Cole", "Jensen", role=c("ctb"),
                     email="cole.jensen@yale.edu"),
              person("Susanna", "Marquez", role=c("ctb", "cre"),
                     email="susanna.marquez@yale.edu"),

From b468e18a37c2baf6abd8a0987d0076a6f7d5ebaf Mon Sep 17 00:00:00 2001
From: ssnn-airr <susanna.marquez@yale.edu>
Date: Thu, 19 Mar 2026 09:58:02 +0100
Subject: [PATCH 18/25] updated docs

---
 R/Functions.R | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/R/Functions.R b/R/Functions.R
index bd3910e..ad7a2e9 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -885,10 +885,11 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction",
 #'                              and be used in clustering with IUPAC-aware distance calculation 
 #'                              (via \code{alakazam::pairwiseDist}). If \code{FALSE} (default), uses fast Hamming distance 
 #'                              (via \code{fastDist_rcpp}) and only allows standard bases (A, T, C, G), N, and ? 
-#'                              in sequences. Note: This parameter is only available for \code{hierarchicalClones} with 
-#'                              \code{method="nt"}. This parameter controls validation and distance
+#'                              in sequences.This parameter controls validation and distance
 #'                              calculation method, not sequence filtering. See \code{max_n} for 
-#'                              filtering sequences by character content.
+#'                              filtering sequences by character content. See the IUPAC and max_n 
+#'                              parameters section for more details and examples. Note: This parameter is only available 
+#'                              for \code{hierarchicalClones} with \code{method="nt"}. 
 #' @param    junction           character name of the column containing junction sequences.
 #'                              Also used to determine sequence length for grouping.
 #' @param    v_call             name of the column containing the V-segment allele calls.

From cd4271604c0a9703fe86d677502c211c27f6cc9e Mon Sep 17 00:00:00 2001
From: Gisela Gabernet <gisela.gabernet@gmail.com>
Date: Tue, 24 Mar 2026 11:37:32 -0400
Subject: [PATCH 19/25] fix character validation logic

---
 R/Functions.R | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/R/Functions.R b/R/Functions.R
index ad7a2e9..2025d21 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -1302,26 +1302,25 @@ defineClonesScoper <- function(db,
     }
     
     ### Check for invalid characters
-    if (!IUPAC && model == "hierarchical"){
-        nonIUPAC_valid <- c("A", "T", "C", "G", "N", "?")
-        .validcharacters <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% nonIUPAC_valid) }
-        nonIUPAC_valid_seq <- sapply(head(db[[junction]],1000), .validcharacters)
-        not_nonIUPAC_valid_seq <- which(!nonIUPAC_valid_seq)
-        if (length(not_nonIUPAC_valid_seq) > 0) {
-        stop("invalid sequence characters in the ", junction, " column. ",
-             "\n Valid characters are: '",  nonIUPAC_valid, "'")
-        }   
-    }
-    else{
-        valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0))
-        .validateSeq <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% valid_chars) }
-        valid_seq <- sapply(db[[junction]], .validateSeq)
-        not_valid_seq <- which(!valid_seq)
-        if (length(not_valid_seq) > 0) {
-            stop("invalid sequence characters in the ", junction, " column. ",
-                length(not_valid_seq)," sequence(s) found.", "\n Valid characters are: '",  valid_chars, "'")
+    # IUPAC mode is only supported for hierarchical clustering and nt method
+    if (!IUPAC && model == "hierarchical" && method == "nt") {
+            valid_chars <- c("A", "T", "C", "G", "N", "?")
+    } else {
+        if (method %in% c("nt", "novj", "vj")) {
+            valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0))
+        } else if (method == "aa") {
+            valid_chars <- colnames(alakazam::getAAMatrix(gap = 0))
+        } else {
+            stop("invalid method specified.")
         }
     }
+    .validateSeq <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% valid_chars) }
+    valid_seq <- sapply(head(db[[junction]],1000), .validateSeq)
+    not_valid_seq <- which(!valid_seq)
+    if (length(not_valid_seq) > 0) {
+        stop("invalid sequence characters in the ", junction, " column. ",
+            length(not_valid_seq)," sequence(s) found.", "\n Valid characters are: '",  valid_chars, "'")
+    }
     
     ### temp columns
     temp_cols <- c("vj_group", "vjl_group", "junction_l",  "cdr3_col", "clone_temp", "cell_id_temp")
@@ -1523,7 +1522,7 @@ defineClonesScoper <- function(db,
                                                               method = method,
                                                               linkage = ifelse(model == "hierarchical", linkage, NA),
                                                               normalize = ifelse(model == "hierarchical", normalize, NA),
-                                                              IUPAC = ifelse(model == "hierarchical", IUPAC, FALSE),
+                                                              IUPAC = ifelse(model == "hierarchical" & method == "nt", IUPAC, FALSE),
                                                               germline = germline,
                                                               sequence = sequence,
                                                               junction = junction,

From fa1858e2b80dafd88b891f3c9a264e38db71476c Mon Sep 17 00:00:00 2001
From: ssnn-airr <susanna.marquez@yale.edu>
Date: Wed, 25 Mar 2026 12:11:12 +0100
Subject: [PATCH 20/25] added tests

---
 tests/testthat/test_clone.R | 141 ++++++++++++++++++++++++++++++++++++
 1 file changed, 141 insertions(+)

diff --git a/tests/testthat/test_clone.R b/tests/testthat/test_clone.R
index 378931b..c93c991 100644
--- a/tests/testthat/test_clone.R
+++ b/tests/testthat/test_clone.R
@@ -385,6 +385,147 @@ test_that("Test hierarchicalClones with IUPAC parameter", {
     
 })
 
+#### clone - hierarchicalClones, IUPAC code in sequence 1001 ####
+
+test_that("Test hierarchicalClones with IUPAC code beyond validation window", {
+    # Validation only inspects the first 1000 sequences (head(..., 1000)).
+    # A sequence containing an IUPAC code at position 1001 therefore escapes the check
+    # regardless of IUPAC=TRUE/FALSE.
+    # What happens next depends on max_n and IUPAC:
+    #   - max_n=0:    prepare_db filters out non-ATCG sequences -> warning, no error (both IUPAC=T/F)
+    #   - max_n=NULL, IUPAC=FALSE: no filtering -> fastDist_rcpp encounters R -> error
+    #   - max_n=NULL, IUPAC=TRUE:  no filtering -> pairwiseDist handles R correctly -> succeeds
+
+    # Build 1001-row dataset: rows 1-1000 are standard ATCG, row 1001 has IUPAC code R
+    n_std <- 1000
+    db_iupac_1001 <- data.frame(
+        sequence_id = paste0("seq", seq_len(n_std + 1)),
+        v_call      = rep("IGHV1-1*01", n_std + 1),
+        j_call      = rep("IGHJ1*01",   n_std + 1),
+        junction    = c(rep("TGTGCAAGCTACTGG", n_std),  # rows 1-1000: ATCG only
+                        "TGTGCRAGCTACTGG"),              # row 1001: R is an IUPAC code
+        locus       = rep("IGH", n_std + 1),
+        stringsAsFactors = FALSE
+    )
+
+    # Confirm row 1001 is the one with the IUPAC code (sanity check)
+    expect_equal(db_iupac_1001$junction[1001], "TGTGCRAGCTACTGG")
+
+    # Same data in reverse order to put IUPAC code in row 1
+    db_iupac_1 <- db_iupac_1001[1001:1, ]
+
+    # Test 1: IUPAC=FALSE, max_n=0 (default)
+    # - Validation (first 1000 rows): passes - no IUPAC codes in rows 1-1000
+    # - max_n=0 filtering: removes row 1001 (R is non-ATCG) and issues a warning
+    # - Result: runs successfully; row 1001 removed
+    expect_warning(
+        expect_message(
+            db_result1 <- hierarchicalClones(db_iupac_1001,
+                threshold = 0.15,
+                method = "nt", linkage = "single",
+                junction = "junction",
+                v_call = "v_call", j_call = "j_call",
+                IUPAC = FALSE,
+                max_n = 0,
+                summarize_clones = FALSE
+            ),
+            "Running defineClonesScoper in bulk mode and only keep heavy chains"
+        ),
+        "Removed 1 sequences with non ATCG characters."
+    )
+    expect_equal(nrow(db_result1), n_std)
+    expect_false("seq1001" %in% db_result1$sequence_id)
+
+    # Same data in reverse order now throws an error. Potentially confusing for users?
+    expect_error(
+        db_result1 <- hierarchicalClones(db_iupac_1,
+            threshold = 0.15,
+            method = "nt", linkage = "single",
+            junction = "junction",
+            v_call = "v_call", j_call = "j_call",
+            IUPAC = FALSE,
+            max_n = 0,
+            summarize_clones = FALSE
+        ),
+        "invalid sequence characters in the junction column\\. 1 sequence\\(s\\) found\\.\\n Valid characters are: 'ATCGN\\?'"
+    )
+
+    # Test 2: IUPAC=FALSE, max_n=NULL
+    # - Validation (first 1000 rows): passes - no IUPAC codes in rows 1-1000
+    # - max_n=NULL: no filtering -> row 1001 reaches fastDist_rcpp
+    # - fastDist_rcpp stops with an error because R is not A,C,G,T,N,?
+    expect_error(
+        suppressMessages(
+            hierarchicalClones(db_iupac_1001,
+                threshold = 0.15,
+                method = "nt", linkage = "single",
+                junction = "junction",
+                v_call = "v_call", j_call = "j_call",
+                IUPAC = FALSE,
+                max_n = NULL,
+                summarize_clones = FALSE
+            )
+        ),
+        "Only A,C,G,T,N,\\? are allowed"
+    )
+
+    # Same data in reverse also throws an error.
+    expect_error(
+        suppressMessages(
+            hierarchicalClones(db_iupac_1,
+                threshold = 0.15,
+                method = "nt", linkage = "single",
+                junction = "junction",
+                v_call = "v_call", j_call = "j_call",
+                IUPAC = FALSE,
+                max_n = NULL,
+                summarize_clones = FALSE
+            )
+        ),
+        "invalid sequence characters in the junction column\\. 1 sequence\\(s\\) found\\.\\n Valid characters are: 'ATCGN\\?'"
+    )
+
+    # Test 3: IUPAC=TRUE, max_n=NULL
+    # - Validation (first 1000 rows): passes
+    # - max_n=NULL: no filtering -> row 1001 is kept
+    # - Distance: alakazam::pairwiseDist handles IUPAC codes correctly -> no error
+    # - Result: all 1001 sequences processed; seq1001 included in output
+    expect_message(
+        db_result3 <- hierarchicalClones(db_iupac_1001,
+            threshold = 0.15,
+            method = "nt", linkage = "single",
+            junction = "junction",
+            v_call = "v_call", j_call = "j_call",
+            IUPAC = TRUE,
+            max_n = NULL,
+            summarize_clones = FALSE
+        ),
+        "Running defineClonesScoper in bulk mode and only keep heavy chains"
+    )
+    expect_equal(nrow(db_result3), n_std + 1)
+    expect_true("seq1001" %in% db_result3$sequence_id)
+    # seq1001 (TGTGCRAGCTACTGG) is within distance 0.15 of the standard sequences
+    # (TGTGCAAGCTACTGG): R matches A under IUPAC scoring -> same clone
+    expect_equal(length(unique(db_result3$clone_id)), 1)
+
+    # Same data in reverse order gives the same result (IUPAC=TRUE allows R, max_n=NULL means no filtering)
+    expect_message(
+        db_result4 <- hierarchicalClones(db_iupac_1,
+            threshold = 0.15,
+            method = "nt", linkage = "single",
+            junction = "junction",
+            v_call = "v_call", j_call = "j_call",
+            IUPAC = TRUE,
+            max_n = NULL,
+            summarize_clones = FALSE
+        ),
+        "Running defineClonesScoper in bulk mode and only keep heavy chains"
+    )
+    # Results should be identical regardless of row order
+    db_result4_ordered <- db_result4[match(db_result3$sequence_id, db_result4$sequence_id), ]
+    expect_equal(db_result3, db_result4_ordered,  check.attributes = FALSE)
+})
+
 #### clone - spectralClones - novj method ####
 
 test_that("Test spectralClones - novj", {

From 0b7fda1b34a4f16ab44b01c37a438b64fa820bdb Mon Sep 17 00:00:00 2001
From: Gisela Gabernet <gisela.gabernet@gmail.com>
Date: Wed, 25 Mar 2026 11:16:15 -0400
Subject: [PATCH 21/25] method aa translates on the fly

---
 R/Functions.R                     | 26 ++++++++++++++------------
 docs/topics/hierarchicalClones.md |  3 ++-
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/R/Functions.R b/R/Functions.R
index 2025d21..df84db4 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -1302,24 +1302,26 @@ defineClonesScoper <- function(db,
     }
     
     ### Check for invalid characters
-    # IUPAC mode is only supported for hierarchical clustering and nt method
-    if (!IUPAC && model == "hierarchical" && method == "nt") {
-            valid_chars <- c("A", "T", "C", "G", "N", "?")
+    # IUPAC mode is only supported for hierarchical clustering
+    if (!IUPAC && model == "hierarchical") {
+        valid_chars <- c("A", "T", "C", "G", "N", "?")
     } else {
-        if (method %in% c("nt", "novj", "vj")) {
-            valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0))
-        } else if (method == "aa") {
-            valid_chars <- colnames(alakazam::getAAMatrix(gap = 0))
-        } else {
-            stop("invalid method specified.")
-        }
+        valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0))
     }
     .validateSeq <- function(x) { all(unique(strsplit(x, "")[[1]]) %in% valid_chars) }
     valid_seq <- sapply(head(db[[junction]],1000), .validateSeq)
     not_valid_seq <- which(!valid_seq)
     if (length(not_valid_seq) > 0) {
-        stop("invalid sequence characters in the ", junction, " column. ",
-            length(not_valid_seq)," sequence(s) found.", "\n Valid characters are: '",  valid_chars, "'")
+        error_msg <- paste0("Invalid sequence characters in the ", junction, " column were found when checking the first 1000 sequences. ",
+            length(not_valid_seq)," sequence(s) found.", "\n Valid characters are: '",  valid_chars, "'",
+            "\n If you have other IUPAC characters in your sequences, set IUPAC=TRUE to allow all IUPAC bases, this will run a slower version of hierarchicalClustering.")
+        if (method == "aa") {
+            method_aa_msg <- paste0("Clustering with method='aa' expects nucleotide sequences, which will be translated by this function.")
+            stop(paste0(error_msg, "\n", method_aa_msg))
+        } else {
+            stop(error_msg)
+        }
+        
     }
     
     ### temp columns
diff --git a/docs/topics/hierarchicalClones.md b/docs/topics/hierarchicalClones.md
index 890dd7b..92b7caf 100644
--- a/docs/topics/hierarchicalClones.md
+++ b/docs/topics/hierarchicalClones.md
@@ -51,7 +51,8 @@ threshold
 
 method
 :   one of the `"nt"` for nucleotide based clustering or 
-`"aa"` for amino acid based clustering.
+`"aa"` for amino acid based clustering. Method `"aa"` still expects nucleotide sequences, 
+which will be translated to amino acids.
 
 linkage
 :   available linkage are `"single"`, `"average"`, and `"complete"`.

From ceab078a63660e3faa063aa47ee3f3c10de4de43 Mon Sep 17 00:00:00 2001
From: ssnn-airr <susanna.marquez@yale.edu>
Date: Mon, 30 Mar 2026 17:23:05 +0200
Subject: [PATCH 22/25] added humin to docs

---
 docs/topics/scoper-package.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/topics/scoper-package.md b/docs/topics/scoper-package.md
index dbc06cc..f5deda4 100644
--- a/docs/topics/scoper-package.md
+++ b/docs/topics/scoper-package.md
@@ -42,8 +42,9 @@ Authors:
 Other contributors:
 
 +  Edel Aron [edel.aron@yale.edu](edel.aron@yale.edu) [contributor]
-+  Cole Jensen [cole.jensen@yale.edu](cole.jensen@yale.edu) [contributor]
 +  Gisela Gabernet [gisela.gabernet@yale.edu](gisela.gabernet@yale.edu) [contributor]
++  Cole Jensen [cole.jensen@yale.edu](cole.jensen@yale.edu) [contributor]
++  Huimin Lyu [huimin.lyu@yale.edu](huimin.lyu@yale.edu) [contributor]
 
 
 

From 041cc7e6f7c72198e74bae331bb28d768fe9699c Mon Sep 17 00:00:00 2001
From: ssnn-airr <susanna.marquez@yale.edu>
Date: Mon, 30 Mar 2026 19:12:50 +0200
Subject: [PATCH 23/25] refine character validation adding method=nt for
 hierarchicalClones

---
 R/Functions.R               | 2 +-
 tests/testthat/test_clone.R | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/R/Functions.R b/R/Functions.R
index df84db4..3a64927 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -1303,7 +1303,7 @@ defineClonesScoper <- function(db,
     
     ### Check for invalid characters
     # IUPAC mode is only supported for hierarchical clustering
-    if (!IUPAC && model == "hierarchical") {
+    if (!IUPAC && model == "hierarchical" && method == "nt") {
         valid_chars <- c("A", "T", "C", "G", "N", "?")
     } else {
         valid_chars <- colnames(alakazam::getDNAMatrix(gap = 0))
diff --git a/tests/testthat/test_clone.R b/tests/testthat/test_clone.R
index c93c991..49a30c7 100644
--- a/tests/testthat/test_clone.R
+++ b/tests/testthat/test_clone.R
@@ -129,7 +129,7 @@ test_that("Test hierarchicalClones with IUPAC parameter", {
             IUPAC = FALSE,  # Use fast Hamming distance for ATCG only
             summarize_clones = FALSE
         ),
-        "invalid sequence characters"
+        "Invalid sequence characters"
     )
 
     # Test 2: IUPAC=TRUE with max_n=0 - validates with IUPAC but filters them out
@@ -242,7 +242,7 @@ test_that("Test hierarchicalClones with IUPAC parameter", {
             max_n = 1,      # Allow up to 1 N or ?
             summarize_clones = FALSE
         ),
-        "invalid sequence characters"
+        "Invalid sequence characters"
     )
     
     # Test with only N/? (no IUPAC codes) - should succeed
@@ -447,7 +447,7 @@ test_that("Test hierarchicalClones with IUPAC code beyond validation window", {
             max_n = 0,
             summarize_clones = FALSE
         ),
-        "invalid sequence characters in the junction column\\. 1 sequence\\(s\\) found\\.\\n Valid characters are: 'ATCGN\\?'"
+        "Invalid sequence characters in the junction column"
     )
 
     # Test 2: IUPAC=FALSE, max_n=NULL
@@ -482,7 +482,7 @@ test_that("Test hierarchicalClones with IUPAC code beyond validation window", {
                 summarize_clones = FALSE
             )
         ),
-        "invalid sequence characters in the junction column\\. 1 sequence\\(s\\) found\\.\\n Valid characters are: 'ATCGN\\?'"
+        "Invalid sequence characters in the junction column"
     )
 
     # Test 3: IUPAC=TRUE, max_n=NULL

From 982e47e89e113a61feb9cb3ce79c6e28c8f50540 Mon Sep 17 00:00:00 2001
From: ssnn-airr <susanna.marquez@yale.edu>
Date: Mon, 30 Mar 2026 19:22:58 +0200
Subject: [PATCH 24/25] rm moved to github message

---
 README.md | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/README.md b/README.md
index 75f0acf..6fae412 100644
--- a/README.md
+++ b/README.md
@@ -2,17 +2,6 @@
 [![](https://cranlogs.r-pkg.org/badges/scoper)](https://www.r-pkg.org/pkg/scoper)
 [![](https://img.shields.io/static/v1?label=AIRR-C%20sw-tools%20v1&message=compliant&color=008AFF&labelColor=000000&style=plastic)](https://docs.airr-community.org/en/stable/swtools/airr_swtools_standard.html)
     
-**IMPORTANT!** 
-SCOPer has moved to https://github.com/immcantation/scoper
-
-To update Git configuration settings use:
-
-```
-   git config user.email "your-gh-user@email.com"
-   git config user.name "your-gh-user-name"
-   git remote set-url origin git@github.com:immcantation/scoper.git
-```    
-
 SCOPer
 -------------------------------------------------------------------------------
 

From 9e30b9858dd4629d61b5a1c2b02e222fd945a9f3 Mon Sep 17 00:00:00 2001
From: ssnn-airr <susanna.marquez@yale.edu>
Date: Mon, 30 Mar 2026 21:50:31 +0200
Subject: [PATCH 25/25]  mv to roxygen block comment that was added in docs md

---
 R/Functions.R | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/R/Functions.R b/R/Functions.R
index 3a64927..7ec41e9 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -877,7 +877,8 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction",
 #' @param    db                 data.frame containing sequence data.
 #' @param    threshold          numeric scalar where the tree should be cut (the distance threshold for clonal grouping).
 #' @param    method             one of the \code{"nt"} for nucleotide based clustering or 
-#'                              \code{"aa"} for amino acid based clustering.
+#'                              \code{"aa"} for amino acid based clustering. Method `"aa"` still expects nucleotide sequences, 
+#'                              which will be translated to amino acids
 #' @param    linkage            available linkage are \code{"single"}, \code{"average"}, and \code{"complete"}.
 #' @param    normalize	        method of normalization. The default is \code{"len"}, which divides the distance by the length 
 #'                              of the sequence group. If \code{"none"} then no normalization if performed.
@@ -938,7 +939,8 @@ identicalClones <- function(db, method=c("nt", "aa"), junction="junction",
 #'                              The default is \code{NULL} for no action.
 #' @param    summarize_clones   if \code{TRUE} performs a series of analysis to assess the clonal landscape
 #'                              and returns a \link{ScoperClones} object. If \code{FALSE} (default) then
-#'                              a modified input \code{db} is returned. When grouping by \code{fields}, 
+#'                              a modified input \code{db} is returned with clone identifiers in the specified 
+#'                              `clone` column. When grouping by \code{fields}, 
 #'                              \code{summarize_clones} should be \code{FALSE}. 
 #' @param   seq_id              The column containing sequence ids
 #'