Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 29 additions & 1 deletion R/Functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,34 @@ pairwiseMutions <- function(germ_imgt,
}
# *****************************************************************************

# *****************************************************************************
# Wrapper around fastDist_rcpp: returns a proper dist object with Labels
fastDist <- function(seqs) {
n <- length(seqs)
v <- fastDist_rcpp(seqs)
structure(v,
class = "dist",
Size = n,
Labels = names(seqs),
Diag = FALSE,
Upper = FALSE)
}
# *****************************************************************************

# *****************************************************************************
# Wrapper around fastDistAA_rcpp: returns a proper dist object with Labels
fastDistAA <- function(seqs) {
n <- length(seqs)
v <- fastDistAA_rcpp(seqs)
structure(v,
class = "dist",
Size = n,
Labels = names(seqs),
Diag = FALSE,
Upper = FALSE)
}
# *****************************************************************************

# *****************************************************************************
### make a dataframe of unique seqs in each clone
uniqueSeq <- function(seqs) {
Expand Down Expand Up @@ -1966,7 +1994,7 @@ hierarchicalClones_helper <- function(db_gp,
# calculate distance matrix
if (method == "nt") {
if (! IUPAC){
dist_mtx <- fastDist_rcpp(seqs_unq)
dist_mtx <- fastDist(seqs_unq)
}
else{
dist_mtx <- alakazam::pairwiseDist(seq = seqs_unq,
Expand Down
2 changes: 1 addition & 1 deletion src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ BEGIN_RCPP
END_RCPP
}
// fastDist_rcpp
IntegerMatrix fastDist_rcpp(CharacterVector seqs);
IntegerVector fastDist_rcpp(CharacterVector seqs);
RcppExport SEXP _scoper_fastDist_rcpp(SEXP seqsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Expand Down
56 changes: 27 additions & 29 deletions src/fastDist.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ int countSeqsWithInvalidBases_rcpp(CharacterVector seqs) {
}

// [[Rcpp::export]]
IntegerMatrix fastDist_rcpp(CharacterVector seqs) {
IntegerVector fastDist_rcpp(CharacterVector seqs) {
int N = seqs.size();

if (N == 0) stop("empty input");
Expand All @@ -60,7 +60,16 @@ IntegerMatrix fastDist_rcpp(CharacterVector seqs) {
}
}

IntegerMatrix Mmatch(N, N); // zero-initialized
// lower-triangle vector (column-major, matching R's dist storage):
// element at row i, col j (i > j, 0-indexed) maps to position:
// j*N - j*(j+1)/2 + (i-j) - 1
size_t tri_size = (size_t)N * (N - 1) / 2;
IntegerVector tri(tri_size); // zero-initialized

auto tri_idx = [&](int i, int j) -> size_t {
if (i < j) std::swap(i, j);
return (size_t)j * N - (size_t)j * (j + 1) / 2 + (i - j) - 1;
};

for (int p = 0; p < L; ++p) {
std::vector<int> A, Cb, G, Tt, Ns, Q;
Expand All @@ -78,28 +87,20 @@ IntegerMatrix fastDist_rcpp(CharacterVector seqs) {
}
}

// column-major safe updaters: iterate over j (column) outermost,
// then use a pointer to column j and bump rows i.
auto bump_within = [&](const std::vector<int>& v){
// increment lower-triangle match counts for pairs within the same bucket
auto bump_within = [&](const std::vector<int>& v) {
int m = (int)v.size();
for (int jj = 0; jj < m; ++jj) {
int j = v[jj];
int* colj = &Mmatch(0, j); // pointer to column j
for (int ii = 0; ii < m; ++ii) {
colj[v[ii]] += 1; // increment (i = v[ii], j)
}
}
for (int ii = 1; ii < m; ++ii)
for (int jj = 0; jj < ii; ++jj)
tri[tri_idx(v[ii], v[jj])] += 1;
};

auto bump_pairs = [&](const std::vector<int>& X, const std::vector<int>& Y){
int mx = (int)X.size(), my = (int)Y.size();
for (int jj = 0; jj < my; ++jj) {
int j = Y[jj];
int* colj = &Mmatch(0, j); // pointer to column j
for (int ii = 0; ii < mx; ++ii) {
colj[X[ii]] += 1; // increment (i = X[ii], j)
}
}
// increment lower-triangle match counts for all cross-bucket pairs
auto bump_pairs = [&](const std::vector<int>& X, const std::vector<int>& Y) {
for (int xi : X)
for (int yj : Y)
if (xi != yj)
tri[tri_idx(xi, yj)] += 1;
};

// same known base
Expand All @@ -112,19 +113,16 @@ IntegerMatrix fastDist_rcpp(CharacterVector seqs) {
K.insert(K.end(), G.begin(), G.end());
K.insert(K.end(), Tt.begin(), Tt.end());
bump_pairs(Ns, K);
bump_pairs(K, Ns);
// bump_pairs(K, Ns); // The bump_pairs(Ns, K) call already handles all N-vs-known-base pairs in both directions because tri_idx normalizes (i,j) and (j,i) to the same lower-triangle position.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@robertbjornson I made this change (commented out bump_pairs) to your code. Please review.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I"m unclear where this bump_pairs(K, Ns) came from. It wasn't in my checked in code. Seems fine to remove it.

bump_within(Ns); // N-N is a match (consistent with getDNAMatrix(gap=0))

// ? with ?
bump_within(Q);
}

// convert matches -> distances
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
Mmatch(i, j) = L - Mmatch(i, j);
}
Mmatch(i, i) = 0;
}
return Mmatch;
for (int k = 0; k < tri_size; ++k)
tri[k] = L - tri[k];

return tri;
}
128 changes: 128 additions & 0 deletions src/fastDistAA.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#include <Rcpp.h>
using namespace Rcpp;

// A=0 C=1 D=2 E=3 F=4 G=5 H=6 I=7 K=8 L=9
// M=10 N=11 P=12 Q=13 R=14 S=15 T=16 V=17 W=18 Y=19
// X=20 (wildcard) ?=21
inline uint8_t code_aa(char c){
switch(c){
case 'A': return 0; case 'C': return 1; case 'D': return 2;
case 'E': return 3; case 'F': return 4; case 'G': return 5;
case 'H': return 6; case 'I': return 7; case 'K': return 8;
case 'L': return 9; case 'M': return 10; case 'N': return 11;
case 'P': return 12; case 'Q': return 13; case 'R': return 14;
case 'S': return 15; case 'T': return 16; case 'V': return 17;
case 'W': return 18; case 'Y': return 19;
case 'X': return 20; case '?': return 21;
default: return 255;
}
}

// [[Rcpp::export]]
int countAASeqsWithInvalidChars_rcpp(CharacterVector seqs) {
int N = seqs.size();
int bad = 0;

for (int i = 0; i < N; ++i) {
if (seqs[i] == NA_STRING) {
bad++;
continue;
}

std::string s = as<std::string>(seqs[i]);
bool invalid = false;
for (char ch : s) {
char up = (char)std::toupper((unsigned char)ch);
if (code_aa(up) == 255) {
invalid = true;
break;
}
}

if (invalid) bad++;
}

return bad;
}

// [[Rcpp::export]]
IntegerVector fastDistAA_rcpp(CharacterVector seqs) {
int N = seqs.size();

if (N == 0) stop("empty input");

std::string s0 = as<std::string>(seqs[0]);
int L = (int)s0.size();
for (int i = 0; i < N; ++i) {
if ((int)std::string(as<std::string>(seqs[i])).size() != L)
stop("All sequences must have the same length");
}

// encode to uint8: N x L, row-major in a flat vector
std::vector<uint8_t> enc((size_t)N * L);
for (int i = 0; i < N; ++i) {
std::string s = as<std::string>(seqs[i]);
for (int p = 0; p < L; ++p) {
char up = (char)std::toupper((unsigned char)s[p]);
uint8_t c = code_aa(up);
if (c == 255) stop("Only the 20 standard AAs plus X and ? are allowed");
enc[(size_t)i * L + p] = c;
}
}

// lower-triangle vector (column-major, matching R's dist storage):
// element at row i, col j (i > j, 0-indexed) maps to:
// j*N - j*(j+1)/2 + (i-j) - 1
size_t tri_size = (size_t)N * (N - 1) / 2;
IntegerVector tri(tri_size); // zero-initialized

auto tri_idx = [&](int i, int j) -> size_t {
if (i < j) std::swap(i, j);
return (size_t)j * N - (size_t)j * (j + 1) / 2 + (i - j) - 1;
};

for (int p = 0; p < L; ++p) {
// one bucket per code (0-19 known AAs, 20 = X, 21 = ?)
std::vector<int> buckets[22];
for (int b = 0; b < 22; ++b) buckets[b].reserve(8);

for (int i = 0; i < N; ++i)
buckets[enc[(size_t)i * L + p]].push_back(i);

// increment lower-triangle match counts for pairs within the same bucket
auto bump_within = [&](const std::vector<int>& v) {
int m = (int)v.size();
for (int ii = 1; ii < m; ++ii)
for (int jj = 0; jj < ii; ++jj)
tri[tri_idx(v[ii], v[jj])] += 1;
};

// increment lower-triangle match counts for all cross-bucket pairs
auto bump_pairs = [&](const std::vector<int>& X, const std::vector<int>& Y) {
for (int xi : X)
for (int yj : Y)
if (xi != yj)
tri[tri_idx(xi, yj)] += 1;
};

// same known AA
for (int b = 0; b < 20; ++b)
bump_within(buckets[b]);

// X (wildcard) with any known AA — but not X-X
std::vector<int> K;
K.reserve(N);
for (int b = 0; b < 20; ++b)
K.insert(K.end(), buckets[b].begin(), buckets[b].end());
bump_pairs(buckets[20], K);

// ? with ?
bump_within(buckets[21]);
}

// convert matches -> distances
for (size_t k = 0; k < tri_size; ++k)
tri[k] = L - tri[k];

return tri;
}
Loading
Loading