From 23adf012e7ef6fab647491089d1f5e3ab93922db Mon Sep 17 00:00:00 2001
From: Paul Saary <github@paulsaary.de>
Date: Thu, 9 Apr 2026 16:22:02 +0200
Subject: [PATCH 1/2] refactor of calculateInterVsIntra to reduce complexity
 and increase performance

---
 R/Functions.R | 94 ++++++++++++++++++++-------------------------------
 1 file changed, 37 insertions(+), 57 deletions(-)

diff --git a/R/Functions.R b/R/Functions.R
index 99e21de..b953b12 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -367,75 +367,55 @@ calculateInterVsIntra <- function(db,
     
     k <- NULL
     # open dataframes
-    vec_ff <- foreach(k=1:n_groups,
-                      .combine="c",
+    db_dff <- foreach(k=1:n_groups,
+                      .combine="rbind",
                       .errorhandling='stop') %dopar% {
-                          
-                          # *********************************************************************************
                           clones <- stringi::stri_split_fixed(vjl_gps$clone_id[k], ",")[[1]]
-                          l <- vjl_gps$junction_length[k]
-                          n_clones <- length(clones)
+                          junction_length <- vjl_gps$junction_length[k]
                           in_clones <- db[[clone]] %in% clones
+                          
                           seqs <- db[[ifelse(cdr3, cdr3_col, junction)]][in_clones]
                           names(seqs) <- db[[clone]][in_clones]
                           seqs <- uniqueSeq(seqs)
-                          ### calculate distance matrix among all seqs
+                          
                           dist_mtx <- pairwiseDist(seqs, dist_mat=DNAMatrix_gap0)
-                          ### prealoocate a vector = no. of max-dist in each clone (intra) + no. of min-dist between clones (inter)
-                          nrow_f <- n_clones + n_clones*(n_clones-1)/2
-                          vec_f <- rep(NA, nrow_f)
-                          ### calculate minimum and maximum distance in each clone
-                          n <- 0
-                          if (n_clones == 1) {
-                              n <- n + 1
-                              vec_f[n] <- max(dist_mtx)/l
-                              names(vec_f)[n] <- paste(clones[1], "NA", "intra", sep="_")
-                          } else {
-                              for (i in 1:(n_clones-1)) {
-                                  xx <- dist_mtx[rownames(dist_mtx) == clones[i], colnames(dist_mtx) == clones[i]]
-                                  n <- n + 1
-                                  vec_f[n] <- max(xx)/l
-                                  names(vec_f)[n] <- paste(clones[i], "NA", "intra", sep="_")
-                                  for (j in (i+1):n_clones) {
-                                      xy <- dist_mtx[rownames(dist_mtx) == clones[i], colnames(dist_mtx) == clones[j]]
-                                      n <- n + 1
-                                      vec_f[n] <- min(xy)/l
-                                      names(vec_f)[n] <- paste(clones[i], clones[j], "inter", sep="_")
-                                  }
+
+                          clones_in_group <- unique(rownames(dist_mtx)) # Only unique clones present
+                          n_clones <- length(clones_in_group)
+                          n_rows <- n_clones + (n_clones * (n_clones - 1) / 2)
+
+                          res_mtx <- matrix(NA, nrow = n_rows, ncol = 4)
+
+                          curr_row <- 1
+                          for (i in 1:n_clones) {
+                            idx_i <- which(rownames(dist_mtx) == clones_in_group[i])
+                            
+                            # INTRA: Max distance in sub-matrix
+                            res_mtx[curr_row, ] <- c(i, NA, max(dist_mtx[idx_i, idx_i]) / junction_length, 0)
+                            curr_row <- curr_row + 1
+                                
+                            # INTER: Min distance between pairs
+                            if (n_clones > 1 && i < n_clones) {
+                              for (j in (i + 1):n_clones) {
+                                idx_j <- which(rownames(dist_mtx) == clones_in_group[j])
+                                      
+                                res_mtx[curr_row, ] <- c(i, j, min(dist_mtx[idx_i, idx_j]) / junction_length, 1)
+                                curr_row <- curr_row + 1
+                                }
                               }
-                              yy <- dist_mtx[rownames(dist_mtx) == clones[j], colnames(dist_mtx) == clones[j]]
-                              n <- n + 1
-                              vec_f[n] <- max(yy)/l
-                              names(vec_f)[n] <- paste(clones[j], "NA", "intra", sep="_")
-                          }
-                          
-                          # update progress
-                          if (verbose) { pb$tick() }
-                          
-                          return(vec_f)
-                          # *********************************************************************************
+                            }
+
+                            # Convert result to dataframe
+                            res_k <- data.frame(
+                                clone_id_x = clones_in_group[res_mtx[,1]],
+                                clone_id_y = clones_in_group[res_mtx[,2]],
+                                distance   = res_mtx[,3],
+                                label      = ifelse(res_mtx[,4] == 0, "intra", "inter")
+                            )
                       }
     
-    ### stop the cluster
     if (nproc > 1) { parallel::stopCluster(cluster) }
     
-    # convert to a data.frame
-    db_dff <- data.frame(keyName = names(vec_ff), 
-                         distance = vec_ff, 
-                         row.names=NULL)
-    db_dff$label <- "intra"
-    db_dff$label[grepl("inter", db_dff$keyName)] <- "inter"
-    clones_xy <- data.frame(matrix(unlist(stringi::stri_split_fixed(db_dff$keyName, "_", n=3)),
-                                   nrow=nrow(db_dff),
-                                   byrow=T),
-                            stringsAsFactors=FALSE)
-    db_dff <- cbind(clones_xy, db_dff)
-    db_dff$keyName <- NULL
-    colnames(db_dff)[colnames(db_dff) == "X1"] <- "clone_id_x"
-    colnames(db_dff)[colnames(db_dff) == "X2"] <- "clone_id_y"
-    db_dff$X3 <- NULL
-    
-    # return results
     return(db_dff)
 }
 # *****************************************************************************

From 24458e18979bb384a80396e01449e41006eb5aad Mon Sep 17 00:00:00 2001
From: Paul Saary <github@paulsaary.de>
Date: Mon, 13 Apr 2026 19:08:05 +0200
Subject: [PATCH 2/2] refactor with lower memory usage according to bench:mark

---
 R/Functions.R | 174 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 104 insertions(+), 70 deletions(-)

diff --git a/R/Functions.R b/R/Functions.R
index b953b12..7dd3baa 100644
--- a/R/Functions.R
+++ b/R/Functions.R
@@ -332,90 +332,124 @@ uniqueSeq <- function(seqs) {
 
 # *****************************************************************************
 # inter-clone-distance vs intra-clone-distance
-calculateInterVsIntra <- function(db,
-                                  clone,
-                                  vjl_gps,
-                                  junction = "junction",
-                                  cdr3 = FALSE,
-                                  cdr3_col = NA,
-                                  nproc = 1,
-                                  verbose = FALSE) {
-    ### Create cluster of nproc size and export namespaces
-    if(nproc == 1) {
-        # If needed to run on a single core/cpu then, register DoSEQ
-        # (needed for 'foreach' in non-parallel mode)
+calculateInterVsIntra <- function(
+    db,
+    clone,
+    vjl_gps,
+    junction = "junction",
+    cdr3 = FALSE,
+    cdr3_col = NA,
+    nproc = 1,
+    verbose = FALSE
+) {
+    if (nproc == 1) {
         registerDoSEQ()
-    } else if( nproc > 1 ) {
-        cluster <- parallel::makeCluster(nproc, type="PSOCK")
-        registerDoParallel(cluster,cores=nproc)
+    } else if (nproc > 1) {
+        cluster <- parallel::makeCluster(nproc, type = "PSOCK")
+        registerDoParallel(cluster, cores = nproc)
     } else {
         stop('Nproc must be positive.')
     }
-    
-    ### export function to clusters
+
     DNAMatrix_gap0 <- getDNAMatrix(gap = 0)
-    if (nproc > 1) { 
-        export_functions <- list("pairwiseDist", "DNAMatrix_gap0", "uniqueSeq", "stri_split_fixed")
-        parallel::clusterExport(cluster, export_functions, envir=environment())
+    seq_col <- if (cdr3) cdr3_col else junction
+
+    # this index is later used to quickly get the rows of db corresponding to a clone
+    clone_to_idx <- split(seq_along(db[[clone]]), db[[clone]])
+
+    if (nproc > 1) {
+        parallel::clusterExport(
+            cluster,
+            list(
+                "pairwiseDist",
+                "DNAMatrix_gap0",
+                "stri_split_fixed",
+                "db",
+                "clone",
+                "seq_col",
+                "clone_to_idx"
+            ),
+            envir = environment()
+        )
     }
-    
-    n_groups <- nrow(vjl_gps)  
-    ### check the progress bar
+
+    n_groups <- nrow(vjl_gps)
     if (verbose) {
         pb <- progressBar(n_groups)
     }
-    
+
     k <- NULL
-    # open dataframes
-    db_dff <- foreach(k=1:n_groups,
-                      .combine="rbind",
-                      .errorhandling='stop') %dopar% {
-                          clones <- stringi::stri_split_fixed(vjl_gps$clone_id[k], ",")[[1]]
-                          junction_length <- vjl_gps$junction_length[k]
-                          in_clones <- db[[clone]] %in% clones
-                          
-                          seqs <- db[[ifelse(cdr3, cdr3_col, junction)]][in_clones]
-                          names(seqs) <- db[[clone]][in_clones]
-                          seqs <- uniqueSeq(seqs)
-                          
-                          dist_mtx <- pairwiseDist(seqs, dist_mat=DNAMatrix_gap0)
+    db_dff <- foreach(
+        k = 1:n_groups,
+        .combine = "rbind",
+        .errorhandling = 'stop'
+    ) %dopar%
+        {
+            clones <- stringi::stri_split_fixed(vjl_gps$clone_id[k], ",")[[1]]
+            jlen <- vjl_gps$junction_length[k]
 
-                          clones_in_group <- unique(rownames(dist_mtx)) # Only unique clones present
-                          n_clones <- length(clones_in_group)
-                          n_rows <- n_clones + (n_clones * (n_clones - 1) / 2)
+            # get te sequences and clone ids for all sequences in the group of clones
+            row_idx <- unlist(clone_to_idx[clones], use.names = FALSE)
+            seqs <- db[[seq_col]][row_idx]
+            cids <- db[[clone]][row_idx]
 
-                          res_mtx <- matrix(NA, nrow = n_rows, ncol = 4)
+            # This dedup is faster 
+            # than using a dataframe and calling distinct()
+            key <- paste0(cids, "\t", seqs)
+            keep <- !duplicated(key)
+            seqs <- seqs[keep]
+            names(seqs) <- cids[keep]
 
-                          curr_row <- 1
-                          for (i in 1:n_clones) {
-                            idx_i <- which(rownames(dist_mtx) == clones_in_group[i])
-                            
-                            # INTRA: Max distance in sub-matrix
-                            res_mtx[curr_row, ] <- c(i, NA, max(dist_mtx[idx_i, idx_i]) / junction_length, 0)
-                            curr_row <- curr_row + 1
-                                
-                            # INTER: Min distance between pairs
-                            if (n_clones > 1 && i < n_clones) {
-                              for (j in (i + 1):n_clones) {
-                                idx_j <- which(rownames(dist_mtx) == clones_in_group[j])
-                                      
-                                res_mtx[curr_row, ] <- c(i, j, min(dist_mtx[idx_i, idx_j]) / junction_length, 1)
-                                curr_row <- curr_row + 1
-                                }
-                              }
-                            }
+            dist_mtx <- pairwiseDist(seqs, dist_mat = DNAMatrix_gap0)
 
-                            # Convert result to dataframe
-                            res_k <- data.frame(
-                                clone_id_x = clones_in_group[res_mtx[,1]],
-                                clone_id_y = clones_in_group[res_mtx[,2]],
-                                distance   = res_mtx[,3],
-                                label      = ifelse(res_mtx[,4] == 0, "intra", "inter")
-                            )
-                      }
-    
-    if (nproc > 1) { parallel::stopCluster(cluster) }
-    
+            rn <- rownames(dist_mtx)
+            clone_list <- unique(rn)
+            n_clones <- length(clone_list)
+            clone_idx <- split(seq_along(rn), rn)[clone_list]
+
+            n_rows <- n_clones + n_clones * (n_clones - 1) %/% 2
+            clone_x <- character(n_rows)
+            clone_y <- character(n_rows)
+            dists <- numeric(n_rows)
+            labels <- character(n_rows)
+
+            r <- 0L
+            for (i in seq_along(clone_list)) {
+                ii <- clone_idx[[i]]
+                r <- r + 1L
+                clone_x[r] <- clone_list[i]
+                clone_y[r] <- NA_character_
+                dists[r] <- max(dist_mtx[ii, ii, drop = FALSE]) / jlen
+                labels[r] <- "intra"
+
+                if (i < n_clones) {
+                    for (j in (i + 1L):n_clones) {
+                        jj <- clone_idx[[j]]
+                        r <- r + 1L
+                        clone_x[r] <- clone_list[i]
+                        clone_y[r] <- clone_list[j]
+                        dists[r] <- min(dist_mtx[ii, jj, drop = FALSE]) / jlen
+                        labels[r] <- "inter"
+                    }
+                }
+            }
+
+            if (verbose) {
+                pb$tick()
+            }
+
+            data.frame(
+                clone_id_x = clone_x,
+                clone_id_y = clone_y,
+                distance = dists,
+                label = labels,
+                stringsAsFactors = FALSE
+            )
+        }
+
+    if (nproc > 1) {
+        parallel::stopCluster(cluster)
+    }
     return(db_dff)
 }
 # *****************************************************************************