Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
bf3cd69
added code to load devel scoper in foreach worker
robertbjornson Dec 5, 2025
e2f299e
added fastDist.cpp
robertbjornson Dec 5, 2025
2b728a6
first attempt at proper integration
robertbjornson Dec 5, 2025
8fe9e52
fixed libPath
robertbjornson Dec 5, 2025
6e58d4e
added cpp interface files
robertbjornson Dec 8, 2025
5668d3f
more cleanup
robertbjornson Dec 8, 2025
102252c
added countSeqsWithInvalidBases_rcpp()
robertbjornson Dec 23, 2025
9c14a2d
updated exports
robertbjornson Dec 23, 2025
fcd69a3
Add parameter IUPAC in functions hierarchicalClones, defineClonesScop…
Vivian0105 Mar 9, 2026
0af042b
fix small IUPAC bug
Vivian0105 Mar 9, 2026
c83316a
Update DESCRIPTION, add IUPAC to function passToClustering_lev1
Vivian0105 Mar 9, 2026
707a7a1
typo
ssnn-airr Mar 11, 2026
653df60
tests and doc
ssnn-airr Mar 11, 2026
ef421dc
updated docs
ssnn-airr Mar 12, 2026
308e685
fix test data
ssnn-airr Mar 17, 2026
b17f054
updated docs
ssnn-airr Mar 19, 2026
ba063d6
added Huimin to contributors
ssnn-airr Mar 19, 2026
b468e18
updated docs
ssnn-airr Mar 19, 2026
cd42716
fix character validation logic
ggabernet Mar 24, 2026
fa1858e
added tests
ssnn-airr Mar 25, 2026
0b7fda1
method aa translates on the fly
ggabernet Mar 25, 2026
ae86614
Merge branch 'rdb9_fastDist' of https://github.com/vivian0105/scoper …
ggabernet Mar 25, 2026
ceab078
added humin to docs
ssnn-airr Mar 30, 2026
041cc7e
refine character validation adding method=nt for hierarchicalClones
ssnn-airr Mar 30, 2026
4c5bd02
Merge pull request #44 from Vivian0105/rdb9_fastDist
ssnn-airr Mar 30, 2026
982e47e
rm moved to github message
ssnn-airr Mar 30, 2026
4651c4b
Merge branch 'master' into rdb9_fastDist
ssnn-airr Mar 30, 2026
9e30b98
mv to roxygen block comment that was added in docs md
ssnn-airr Mar 30, 2026
ca57a57
Merge branch 'rdb9_fastDist' of github.com:immcantation/scoper into r…
ssnn-airr Mar 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
Package: scoper
Type: Package
Version: 1.4.0.999
Date: 2026-02-13
Date: 2026-03-17
Authors@R: c(person("Nima", "Nouri", role=c("aut"),
email="nima.nouri@yale.edu"),
person("Edel", "Aron", role=c("ctb"),
email="edel.aron@yale.edu"),
person("Gisela", "Gabernet", role=c("ctb"),
email="gisela.gabernet@yale.edu"),
email="gisela.gabernet@yale.edu"),
person("Cole", "Jensen", role=c("ctb"),
email="cole.jensen@yale.edu"),
person("Huimin", "Lyu", role=c("ctb"),
Expand Down
144 changes: 121 additions & 23 deletions R/Functions.R

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,11 @@ pairwiseMutMatrixRcpp <- function(informative_pos, mutMtx, motifMtx) {
.Call(`_scoper_pairwiseMutMatrixRcpp`, informative_pos, mutMtx, motifMtx)
}

# R-side wrapper: forwards `seqs` unchanged to the registered native routine
# `_scoper_countSeqsWithInvalidBases_rcpp` through .Call and returns its result.
countSeqsWithInvalidBases_rcpp <- function(seqs) {
.Call(`_scoper_countSeqsWithInvalidBases_rcpp`, seqs)
}

# R-side wrapper: forwards `seqs` unchanged to the registered native routine
# `_scoper_fastDist_rcpp` through .Call and returns its result.
fastDist_rcpp <- function(seqs) {
.Call(`_scoper_fastDist_rcpp`, seqs)
}

3 changes: 2 additions & 1 deletion docs/topics/hierarchicalClones.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ threshold

method
: either `"nt"` for nucleotide-based clustering or
`"aa"` for amino acid based clustering.
`"aa"` for amino acid based clustering. Method `"aa"` still expects nucleotide sequences,
which will be translated to amino acids.

linkage
: available linkages are `"single"`, `"average"`, and `"complete"`.
Expand Down
24 changes: 24 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,33 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// countSeqsWithInvalidBases_rcpp
// .Call entry point for countSeqsWithInvalidBases_rcpp(): converts the incoming
// SEXP into a CharacterVector, calls the C++ function and wraps its int result
// back into an R object.
int countSeqsWithInvalidBases_rcpp(CharacterVector seqs);
RcppExport SEXP _scoper_countSeqsWithInvalidBases_rcpp(SEXP seqsSEXP) {
// BEGIN_RCPP / END_RCPP are Rcpp macros defined outside this file; presumably
// they turn C++ exceptions into R errors — confirm against the Rcpp headers.
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
// Convert the raw SEXP argument into the parameter type the C++ function takes.
Rcpp::traits::input_parameter< CharacterVector >::type seqs(seqsSEXP);
rcpp_result_gen = Rcpp::wrap(countSeqsWithInvalidBases_rcpp(seqs));
return rcpp_result_gen;
END_RCPP
}
// fastDist_rcpp
// .Call entry point for fastDist_rcpp(): converts the incoming SEXP into a
// CharacterVector, calls the C++ function and wraps the returned IntegerMatrix
// back into an R object.
IntegerMatrix fastDist_rcpp(CharacterVector seqs);
RcppExport SEXP _scoper_fastDist_rcpp(SEXP seqsSEXP) {
// BEGIN_RCPP / END_RCPP are Rcpp macros defined outside this file; presumably
// they turn C++ exceptions into R errors — confirm against the Rcpp headers.
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
// Convert the raw SEXP argument into the parameter type the C++ function takes.
Rcpp::traits::input_parameter< CharacterVector >::type seqs(seqsSEXP);
rcpp_result_gen = Rcpp::wrap(fastDist_rcpp(seqs));
return rcpp_result_gen;
END_RCPP
}

// Registration table for the package's .Call routines. Each entry is
// {symbol name used from R, pointer to the C entry point, number of arguments};
// the all-NULL entry terminates the table.
static const R_CallMethodDef CallEntries[] = {
{"_scoper_pairwiseMutMatrixRcpp", (DL_FUNC) &_scoper_pairwiseMutMatrixRcpp, 3},
{"_scoper_countSeqsWithInvalidBases_rcpp", (DL_FUNC) &_scoper_countSeqsWithInvalidBases_rcpp, 1},
{"_scoper_fastDist_rcpp", (DL_FUNC) &_scoper_fastDist_rcpp, 1},
{NULL, NULL, 0}
};

Expand Down
129 changes: 129 additions & 0 deletions src/fastDist.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#include <cctype>
#include <cstdint>
#include <string>
#include <vector>

#include <Rcpp.h>
using namespace Rcpp;

// Translate one sequence symbol into its compact numeric code:
//   'A' -> 0, 'C' -> 1, 'G' -> 2, 'T' -> 3, 'N' -> 4, '?' -> 5.
// Any other character (including lower-case letters) yields 255, which
// callers treat as "invalid symbol".
inline uint8_t code_char(char c){
    if (c == 'A') return 0;
    if (c == 'C') return 1;
    if (c == 'G') return 2;
    if (c == 'T') return 3;
    if (c == 'N') return 4;
    if (c == '?') return 5;
    return 255;
}

// Count how many elements of `seqs` are unusable: an element is counted when
// it is NA, or when it contains at least one character that (after
// upper-casing) code_char() does not recognise. Each element is counted at
// most once, however many bad characters it holds.
//
// @param seqs character vector of sequences.
// @return number of NA or invalid sequences.
// [[Rcpp::export]]
int countSeqsWithInvalidBases_rcpp(CharacterVector seqs) {
    const int total = seqs.size();
    int n_invalid = 0;

    for (int idx = 0; idx < total; ++idx) {
        // A missing value is always counted as invalid.
        if (seqs[idx] == NA_STRING) {
            ++n_invalid;
            continue;
        }

        const std::string text = as<std::string>(seqs[idx]);

        // Advance past the leading run of recognised symbols; stopping before
        // the end means an unrecognised character was found.
        std::size_t pos = 0;
        while (pos < text.size() &&
               code_char((char)std::toupper((unsigned char)text[pos])) != 255) {
            ++pos;
        }

        if (pos < text.size()) {
            ++n_invalid;
        }
    }

    return n_invalid;
}

// Compute the pairwise distance matrix for a set of equal-length sequences.
//
// The distance between two sequences is the number of positions that do NOT
// match. Two symbols at the same position match when
//   * both are the same known base (A, C, G or T),
//   * one is N and the other is a known base, or
//   * both are '?'.
// Every other combination (different known bases, N vs N, N vs '?', known
// base vs '?') is a mismatch. The diagonal is forced to 0.
// NOTE(review): because N vs N is a mismatch, two identical sequences that
// contain N get a non-zero distance from each other — confirm this is intended.
//
// Symbols are upper-cased before encoding, the same normalisation
// countSeqsWithInvalidBases_rcpp() applies, so both functions agree on what
// is a valid sequence.
//
// @param seqs character vector of sequences, all of the same length.
// @return N x N integer matrix of pairwise distances (N = length of seqs).
// Raises an R error on empty input, NA elements, unequal sequence lengths, or
// symbols other than A,C,G,T,N,?.
// [[Rcpp::export]]
IntegerMatrix fastDist_rcpp(CharacterVector seqs) {
    const int N = seqs.size();

    if (N == 0) stop("empty input");

    // Reject NA explicitly: as<std::string>() would turn it into the literal
    // text "NA", which consists of two otherwise valid symbols.
    for (int i = 0; i < N; ++i) {
        if (seqs[i] == NA_STRING) stop("NA sequences are not allowed");
    }

    const int L = static_cast<int>(as<std::string>(seqs[0]).size());
    for (int i = 1; i < N; ++i) {
        if (static_cast<int>(as<std::string>(seqs[i]).size()) != L)
            stop("All sequences must have the same length");
    }

    // encode to uint8: N x L, row-major in a flat vector
    std::vector<uint8_t> enc(static_cast<size_t>(N) * L);
    for (int i = 0; i < N; ++i) {
        const std::string s = as<std::string>(seqs[i]);
        const size_t row = static_cast<size_t>(i) * L;
        for (int p = 0; p < L; ++p) {
            const char up = static_cast<char>(
                std::toupper(static_cast<unsigned char>(s[p])));
            const uint8_t c = code_char(up);
            if (c == 255) stop("Only A,C,G,T,N,? are allowed");
            enc[row + p] = c;
        }
    }

    IntegerMatrix Mmatch(N, N); // zero-initialized

    // Bucket indices follow code_char(): 0..3 = A,C,G,T, 4 = N, 5 = '?'.
    const int kKnownBases = 4;
    const int kN = 4;
    const int kQ = 5;

    // Buckets are allocated once and cleared per position to avoid
    // re-allocating seven vectors for every column.
    std::vector<int> bucket[6];
    for (auto& b : bucket) b.reserve(N);
    std::vector<int> known;
    known.reserve(N);

    // Add one match to every cell (i, j) with i in `rows` and j in `cols`.
    // R matrices are column-major, so j is the outer loop and the inner loop
    // walks a single column through a raw pointer.
    auto bump = [&Mmatch](const std::vector<int>& rows,
                          const std::vector<int>& cols) {
        for (int j : cols) {
            int* colj = &Mmatch(0, j); // pointer to column j
            for (int i : rows) {
                colj[i] += 1;
            }
        }
    };

    for (int p = 0; p < L; ++p) {
        // bucket row indices by symbol at column p
        for (auto& b : bucket) b.clear();
        for (int i = 0; i < N; ++i) {
            // enc only holds codes 0..5; 255 was rejected during encoding
            bucket[enc[static_cast<size_t>(i) * L + p]].push_back(i);
        }

        // same known base; also collect all known-base rows for the N step
        known.clear();
        for (int k = 0; k < kKnownBases; ++k) {
            bump(bucket[k], bucket[k]);
            known.insert(known.end(), bucket[k].begin(), bucket[k].end());
        }

        // N with known (both directions, keeps the matrix symmetric)
        bump(bucket[kN], known);
        bump(known, bucket[kN]);

        // ? with ?
        bump(bucket[kQ], bucket[kQ]);
    }

    // convert matches -> distances, column by column
    for (int j = 0; j < N; ++j) {
        int* colj = &Mmatch(0, j);
        for (int i = 0; i < N; ++i) {
            colj[i] = L - colj[i];
        }
        // positions holding N never match themselves, so zero the diagonal
        colj[j] = 0;
    }
    return Mmatch;
}
Loading
Loading