From 6a3034041d161714f5e4ee52f1a0ba46d2a03817 Mon Sep 17 00:00:00 2001 From: Net Zhang Date: Tue, 2 Jun 2026 10:58:59 -0400 Subject: [PATCH 1/6] Remove FAISS backend The FAISS KMeans backend added meaningful installation weight and startup import noise for a marginal benefit. Removing it simplifies the backend selection logic to two cases: - cuML if GPU available - else sklearn Changes - Drop `faiss-cpu` & `faiss-gpu-cu12` from main deps and `gpu-*` extras - Remove FAISS from backend scripts `resolve_backend()`, `run_kmeans()` dispatch - Remove "faiss" from clustering backend dropdowns in the webUI - Update README & BACKEND_PIPELINE doc to reflect the changes --- README.md | 2 +- docs/BACKEND_PIPELINE.md | 14 ++--- pyproject.toml | 12 ++-- shared/components/clustering_controls.py | 9 +-- shared/services/clustering_service.py | 4 +- shared/utils/__init__.py | 2 +- shared/utils/backend.py | 21 +------ shared/utils/clustering.py | 77 +++--------------------- 8 files changed, 24 insertions(+), 117 deletions(-) diff --git a/README.md b/README.md index 127a842..4e96445 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ uv pip install -e ".[gpu-cu12]" uv pip install -e ".[gpu-cu13]" ``` -The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. You can also manually select backends (cuML, FAISS, sklearn) in the sidebar. +The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. You can also manually select backends (cuML, sklearn) in the sidebar. 
## Usage diff --git a/docs/BACKEND_PIPELINE.md b/docs/BACKEND_PIPELINE.md index 43c1209..0f298d8 100644 --- a/docs/BACKEND_PIPELINE.md +++ b/docs/BACKEND_PIPELINE.md @@ -12,11 +12,11 @@ Raw Embeddings (from parquet or model) ├─ L2 Normalize: project onto unit hypersphere │ ├─► Step 1: KMeans Clustering (high-dimensional) - │ Backend: cuML → FAISS → sklearn + │ Backend: cuML (GPU) → sklearn (CPU) │ ├─► Step 2: Dimensionality Reduction to 2D │ Method: PCA / t-SNE / UMAP - │ Backend: cuML → sklearn + │ Backend: cuML (GPU) → sklearn (CPU) │ └─► Scatter Plot (Altair) Color = cluster, position = 2D projection @@ -46,10 +46,9 @@ feature space, not a lossy 2D projection. | Backend | When It's Used | How It Works | |---------|---------------|--------------| | **cuML** | GPU available + >500 samples | GPU-accelerated KMeans via RAPIDS. Runs on CuPy arrays. Falls back to sklearn on any error. | -| **FAISS** | No GPU + >500 samples | Facebook's optimized CPU KMeans using L2 index. Fast for medium datasets. Falls back to sklearn on error. | -| **sklearn** | Small datasets or fallback | Standard scikit-learn KMeans. Always works, no special dependencies. | +| **sklearn** | CPU path (default on machines without a GPU) | Standard scikit-learn KMeans. Always works, no special dependencies. | -**Auto-selection priority:** cuML > FAISS > sklearn. You can override in the sidebar. +**Auto-selection priority:** cuML > sklearn. You can override in the sidebar. ## Step 2: Dimensionality Reduction @@ -96,7 +95,7 @@ When you select "auto" (the default), the app picks the fastest available backen | Operation | Auto Logic | |-----------|-----------| -| KMeans | cuML if GPU + >500 samples, else FAISS if available + >500 samples, else sklearn | +| KMeans | cuML if GPU + >500 samples, else sklearn | | Dim. 
Reduction | cuML if GPU + >5000 samples, else sklearn | Any GPU error (architecture mismatch, missing libraries, out of memory (OOM)) triggers an @@ -122,9 +121,6 @@ Check the log file for the full picture when debugging. cuML (GPU) │ error? ▼ -FAISS (CPU, optimized) ← KMeans only - │ error? - ▼ sklearn (CPU, always works) ``` diff --git a/pyproject.toml b/pyproject.toml index 24f89af..c793896 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,8 +42,7 @@ dependencies = [ # Machine learning "scikit-learn>=1.0.0", "umap-learn>=0.5.0", - "numba>=0.57.0", - "faiss-cpu>=1.7.0", + "numba>=0.57.0", # Vision-language models "open-clip-torch>=2.20.0", # Custom inference package @@ -69,20 +68,17 @@ gpu = [ ] gpu-cu12 = [ "torch>=2.0.0", - "cuml-cu12>=25.6", - "faiss-gpu-cu12>=1.11.0", + "cuml-cu12>=26.4", # 26.4 removed the sklearn upper bound (compatible with sklearn>=1.8) "pynvml>=11.0.0", ] gpu-cu13 = [ "torch>=2.0.0", - "cuml-cu13>=25.12", - "faiss-gpu-cu12>=1.11.0", # no cu13 build on PyPI; cu12 works via CUDA backward compat + "cuml-cu13>=26.4", # 26.4 removed the sklearn upper bound (compatible with sklearn>=1.8) "pynvml>=11.0.0", ] -# Minimal GPU support (just PyTorch + FAISS GPU, no RAPIDS) +# Minimal GPU support for image embeddings generation (just PyTorch, no RAPIDS) gpu-minimal = [ "torch>=2.0.0", - "faiss-gpu-cu12>=1.11.0", ] all = [ "emb-explorer[dev,gpu]", diff --git a/shared/components/clustering_controls.py b/shared/components/clustering_controls.py index d18eb5d..34d3e1c 100644 --- a/shared/components/clustering_controls.py +++ b/shared/components/clustering_controls.py @@ -5,7 +5,7 @@ import streamlit as st from typing import Tuple, Optional -from shared.utils.backend import HAS_FAISS_PACKAGE, HAS_CUML_PACKAGE, HAS_CUPY_PACKAGE +from shared.utils.backend import HAS_CUML_PACKAGE, HAS_CUPY_PACKAGE def render_clustering_backend_controls(): @@ -19,9 +19,6 @@ def render_clustering_backend_controls(): dim_reduction_options = ["auto", "sklearn"] 
clustering_options = ["auto", "sklearn"] - if HAS_FAISS_PACKAGE: - clustering_options.append("faiss") - if HAS_CUML_PACKAGE and HAS_CUPY_PACKAGE: dim_reduction_options.append("cuml") clustering_options.append("cuml") @@ -73,7 +70,7 @@ def render_clustering_backend_controls(): max_value=64, value=8, step=1, - help="Number of parallel workers for CPU backends (sklearn, FAISS). Not used by cuML (GPU manages parallelization automatically)." + help="Number of parallel workers for CPU sklearn. Not used by cuML (GPU manages parallelization automatically)." ) @@ -118,8 +115,6 @@ def render_kmeans_controls(): Tuple of (clustering_backend, n_workers, seed) """ clustering_options = ["auto", "sklearn"] - if HAS_FAISS_PACKAGE: - clustering_options.append("faiss") if HAS_CUML_PACKAGE and HAS_CUPY_PACKAGE: clustering_options.append("cuml") diff --git a/shared/services/clustering_service.py b/shared/services/clustering_service.py index 150bcfc..6847b9f 100644 --- a/shared/services/clustering_service.py +++ b/shared/services/clustering_service.py @@ -38,8 +38,8 @@ def run_clustering( n_clusters: Number of clusters reduction_method: Dimensionality reduction method n_workers: Number of workers for reduction - dim_reduction_backend: Backend for dimensionality reduction ("auto", "sklearn", "faiss", "cuml") - clustering_backend: Backend for clustering ("auto", "sklearn", "faiss", "cuml") + dim_reduction_backend: Backend for dimensionality reduction ("auto", "sklearn", "cuml") + clustering_backend: Backend for clustering ("auto", "sklearn", "cuml") seed: Random seed for reproducibility (None for random) Returns: diff --git a/shared/utils/__init__.py b/shared/utils/__init__.py index b305aa7..2ae7a57 100644 --- a/shared/utils/__init__.py +++ b/shared/utils/__init__.py @@ -2,7 +2,7 @@ Shared utilities for clustering, IO, models, and taxonomy. Modules are imported lazily to avoid pulling in heavy dependencies -(sklearn, umap, faiss, cuml, torch, open_clip) at startup. 
+(sklearn, umap, cuml, torch, open_clip) at startup. Use direct imports instead: from shared.utils.clustering import reduce_dim, run_kmeans diff --git a/shared/utils/backend.py b/shared/utils/backend.py index ed66cad..c75db3f 100644 --- a/shared/utils/backend.py +++ b/shared/utils/backend.py @@ -20,7 +20,6 @@ # These are safe to call at module-load / render time — they only check # whether the package is installed, without executing it. -HAS_FAISS_PACKAGE: bool = importlib.util.find_spec("faiss") is not None HAS_CUML_PACKAGE: bool = importlib.util.find_spec("cuml") is not None HAS_CUPY_PACKAGE: bool = importlib.util.find_spec("cupy") is not None HAS_TORCH_PACKAGE: bool = importlib.util.find_spec("torch") is not None @@ -84,27 +83,16 @@ def check_cuml_available() -> bool: return False -def check_faiss_available() -> bool: - """Check if FAISS is available (actual import, for runtime use).""" - if not HAS_FAISS_PACKAGE: - return False - try: - import faiss - return True - except ImportError: - return False - - def resolve_backend(backend: str, operation: str = "general") -> str: """ Resolve 'auto' backend to actual backend based on available hardware. Args: - backend: Requested backend ("auto", "sklearn", "cuml", "faiss") + backend: Requested backend ("auto", "sklearn", "cuml") operation: Operation type for logging ("clustering", "reduction", "general") Returns: - Resolved backend name + Resolved backend name. CPU paths always go through sklearn. 
""" if backend != "auto": logger.debug(f"Using explicitly requested backend: {backend}") @@ -112,14 +100,10 @@ def resolve_backend(backend: str, operation: str = "general") -> str: cuda_available, device_info = check_cuda_available() has_cuml = check_cuml_available() - has_faiss = check_faiss_available() if cuda_available and has_cuml: resolved = "cuml" logger.info(f"Auto-resolved {operation} backend to cuML (GPU: {device_info})") - elif has_faiss: - resolved = "faiss" - logger.info(f"Auto-resolved {operation} backend to FAISS (CPU)") else: resolved = "sklearn" logger.info(f"Auto-resolved {operation} backend to sklearn (CPU)") @@ -140,7 +124,6 @@ def get_backend_info() -> dict: "cuda_available": cuda_available, "device_info": device_info, "cuml_available": check_cuml_available(), - "faiss_available": check_faiss_available(), } diff --git a/shared/utils/clustering.py b/shared/utils/clustering.py index f144f4b..ad9fc2e 100644 --- a/shared/utils/clustering.py +++ b/shared/utils/clustering.py @@ -8,8 +8,8 @@ from shared.utils.logging_config import get_logger from shared.utils.backend import ( - HAS_FAISS_PACKAGE, HAS_CUML_PACKAGE, HAS_CUPY_PACKAGE, - check_cuda_available, check_cuml_available, check_faiss_available, + HAS_CUML_PACKAGE, HAS_CUPY_PACKAGE, + check_cuda_available, check_cuml_available, ) logger = get_logger(__name__) @@ -17,7 +17,6 @@ # Legacy module-level flags — now backed by lightweight find_spec() checks # so importing this module no longer triggers heavy library loads. # Functions that actually need the libraries import them locally. -HAS_FAISS: bool = HAS_FAISS_PACKAGE HAS_CUML: bool = HAS_CUML_PACKAGE and HAS_CUPY_PACKAGE HAS_CUDA: bool = False # resolved lazily via check_cuda_available() @@ -342,8 +341,8 @@ def run_kmeans(embeddings: np.ndarray, n_clusters: int, seed: Optional[int] = No embeddings (np.ndarray): The input feature embeddings of shape (n_samples, n_features). n_clusters (int): The number of clusters to form. 
seed (int, optional): Random seed for reproducibility. Defaults to None (random). - n_workers (int, optional): Number of parallel workers (used by FAISS and cuML if available). - backend (str, optional): Clustering backend - "auto", "sklearn", "faiss", or "cuml". Defaults to "auto". + n_workers (int, optional): Number of parallel workers (used by cuML if available). + backend (str, optional): Clustering backend - "auto", "sklearn", or "cuml". Defaults to "auto". Returns: kmeans (KMeans or custom object): The fitted clustering object. @@ -362,20 +361,9 @@ def run_kmeans(embeddings: np.ndarray, n_clusters: int, seed: Optional[int] = No if backend == "cuml" and HAS_CUML and cuda_available: logger.info("Using cuML backend for KMeans") result = _run_kmeans_cuml(embeddings, n_clusters, seed, n_workers) - elif backend == "faiss" and HAS_FAISS: - logger.info("Using FAISS backend for KMeans") - result = _run_kmeans_faiss(embeddings, n_clusters, seed, n_workers) - elif backend == "auto": - # Auto selection priority: cuML > FAISS > sklearn - if HAS_CUML and cuda_available and n_samples > 500: - logger.info("Auto-selected cuML backend for KMeans (GPU available, large dataset)") - result = _run_kmeans_cuml(embeddings, n_clusters, seed, n_workers) - elif HAS_FAISS and n_samples > 500: - logger.info("Auto-selected FAISS backend for KMeans (large dataset)") - result = _run_kmeans_faiss(embeddings, n_clusters, seed, n_workers) - else: - logger.info("Using sklearn backend for KMeans") - result = _run_kmeans_sklearn(embeddings, n_clusters, seed) + elif backend == "auto" and HAS_CUML and cuda_available and n_samples > 500: + logger.info("Auto-selected cuML backend for KMeans (GPU available, large dataset)") + result = _run_kmeans_cuml(embeddings, n_clusters, seed, n_workers) else: logger.info("Using sklearn backend for KMeans") result = _run_kmeans_sklearn(embeddings, n_clusters, seed) @@ -443,55 +431,4 @@ def _run_kmeans_sklearn(embeddings: np.ndarray, n_clusters: int, seed: 
Optional[ return kmeans, labels -def _run_kmeans_faiss(embeddings: np.ndarray, n_clusters: int, seed: Optional[int] = None, n_workers: int = 1): - """KMeans using FAISS backend for faster clustering.""" - try: - import faiss - - # Ensure embeddings are float32 and C-contiguous (FAISS requirement) - embeddings = np.ascontiguousarray(embeddings.astype(np.float32)) - - n_samples, d = embeddings.shape - - # Set number of threads for FAISS - if n_workers > 1: - faiss.omp_set_num_threads(n_workers) - - # Create FAISS KMeans object - kmeans = faiss.Clustering(d, n_clusters) - - # Set clustering parameters - kmeans.verbose = False - kmeans.niter = 20 # Number of iterations - kmeans.nredo = 1 # Number of redos - if seed is not None: - kmeans.seed = seed - - # Use L2 distance (equivalent to sklearn's default) - index = faiss.IndexFlatL2(d) - - # Run clustering - kmeans.train(embeddings, index) - - # Get centroids - centroids = faiss.vector_to_array(kmeans.centroids).reshape(n_clusters, d) - - # Assign labels by finding nearest centroid for each point - _, labels = index.search(embeddings, 1) - labels = labels.flatten() - - # Create a simple object to mimic sklearn KMeans interface - class FAISSKMeans: - def __init__(self, centroids, labels): - self.cluster_centers_ = centroids - self.labels_ = labels - self.n_clusters = len(centroids) - - return FAISSKMeans(centroids, labels), labels - - except Exception as e: - # Fallback to sklearn if FAISS fails - logger.warning(f"FAISS clustering failed ({e}), falling back to sklearn") - return _run_kmeans_sklearn(embeddings, n_clusters, seed) - From 868fb21ea3ae5c33e2c7c1e98ce48e7df7cca160 Mon Sep 17 00:00:00 2001 From: Net Zhang Date: Tue, 2 Jun 2026 11:37:56 -0400 Subject: [PATCH 2/6] Auto-enable `scikit-learn-intelex` for CPU `sklearn` accel Add `scikit-learn-intelex` as default dependency and patch sklearn at import time in `shared/utils/clustering.py`. Accelerates the existing `sklearn` PCA / TSNE / KMeans calls on CPU. 
UMAP is unaffected because `umap-learn` is a separate package, not an `sklearn` algorithm. Set EMB_EXPLORER_DISABLE_SKLEARNEX=1 to opt out for debugging vanilla sklearn behavior. --- README.md | 2 +- docs/BACKEND_PIPELINE.md | 10 +++++----- pyproject.toml | 3 +++ shared/utils/clustering.py | 12 ++++++++++++ 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 4e96445..d0c7d1a 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ uv pip install -e ".[gpu-cu12]" uv pip install -e ".[gpu-cu13]" ``` -The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. You can also manually select backends (cuML, sklearn) in the sidebar. +The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. The CPU sklearn path is auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex) (Intel oneDAL). You can also manually select backends (cuML, sklearn) in the sidebar. ## Usage diff --git a/docs/BACKEND_PIPELINE.md b/docs/BACKEND_PIPELINE.md index 0f298d8..8a35c23 100644 --- a/docs/BACKEND_PIPELINE.md +++ b/docs/BACKEND_PIPELINE.md @@ -12,11 +12,11 @@ Raw Embeddings (from parquet or model) ├─ L2 Normalize: project onto unit hypersphere │ ├─► Step 1: KMeans Clustering (high-dimensional) - │ Backend: cuML (GPU) → sklearn (CPU) + │ Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by sklearn-intelex) │ ├─► Step 2: Dimensionality Reduction to 2D │ Method: PCA / t-SNE / UMAP - │ Backend: cuML (GPU) → sklearn (CPU) + │ Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by sklearn-intelex for PCA/TSNE) │ └─► Scatter Plot (Altair) Color = cluster, position = 2D projection @@ -46,7 +46,7 @@ feature space, not a lossy 2D projection. | Backend | When It's Used | How It Works | |---------|---------------|--------------| | **cuML** | GPU available + >500 samples | GPU-accelerated KMeans via RAPIDS. 
Runs on CuPy arrays. Falls back to sklearn on any error. | -| **sklearn** | CPU path (default on machines without a GPU) | Standard scikit-learn KMeans. Always works, no special dependencies. | +| **sklearn** | CPU path (default on machines without a GPU) | Standard scikit-learn KMeans, auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex) (Intel oneDAL) when installed — typically 10–17× faster than vanilla sklearn on CPU. Disable with `EMB_EXPLORER_DISABLE_SKLEARNEX=1`. | **Auto-selection priority:** cuML > sklearn. You can override in the sidebar. @@ -95,8 +95,8 @@ When you select "auto" (the default), the app picks the fastest available backen | Operation | Auto Logic | |-----------|-----------| -| KMeans | cuML if GPU + >500 samples, else sklearn | -| Dim. Reduction | cuML if GPU + >5000 samples, else sklearn | +| KMeans | cuML if GPU + >500 samples, else sklearn (auto-accelerated by sklearn-intelex when installed) | +| Dim. Reduction | cuML if GPU + >5000 samples, else sklearn (auto-accelerated by sklearn-intelex for PCA / t-SNE) | Any GPU error (architecture mismatch, missing libraries, out of memory (OOM)) triggers an automatic retry with sklearn. OOM errors are surfaced to the user with guidance. diff --git a/pyproject.toml b/pyproject.toml index c793896..0551f5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,9 @@ dependencies = [ "altair>=5.0.0", # Machine learning "scikit-learn>=1.0.0", + # Intel oneDAL acceleration for sklearn (PCA / TSNE / KMeans) auto-patched at runtime. + # Disable with EMB_EXPLORER_DISABLE_SKLEARNEX=1 if you need vanilla sklearn behavior for debugging. 
+ "scikit-learn-intelex>=2025.0", "umap-learn>=0.5.0", "numba>=0.57.0", # Vision-language models diff --git a/shared/utils/clustering.py b/shared/utils/clustering.py index ad9fc2e..4f85e83 100644 --- a/shared/utils/clustering.py +++ b/shared/utils/clustering.py @@ -14,6 +14,18 @@ logger = get_logger(__name__) +# Auto-enable scikit-learn-intelex (Intel oneDAL) acceleration for sklearn's +# PCA / TSNE / KMeans on CPU. Patches sklearn at import time so any downstream +# sklearn call gets the accelerated path transparently. Disable for debugging +# vanilla sklearn behavior with: EMB_EXPLORER_DISABLE_SKLEARNEX=1 +if os.environ.get("EMB_EXPLORER_DISABLE_SKLEARNEX", "0") != "1": + try: + from sklearnex import patch_sklearn + patch_sklearn() + logger.info("scikit-learn-intelex enabled (CPU sklearn auto-accelerated)") + except ImportError: + logger.debug("scikit-learn-intelex not installed; using vanilla sklearn") + # Legacy module-level flags — now backed by lightweight find_spec() checks # so importing this module no longer triggers heavy library loads. # Functions that actually need the libraries import them locally. From 788ec60c4d66ef2504a6c8bac5f93c5e29ce4918 Mon Sep 17 00:00:00 2001 From: Net Zhang <48858129+NetZissou@users.noreply.github.com> Date: Wed, 3 Jun 2026 13:05:01 -0400 Subject: [PATCH 3/6] Added footnote for `sklearn-intelex` in README & BACKEND_PIPELINE Co-authored-by: Net Zhang <48858129+NetZissou@users.noreply.github.com> --- README.md | 4 +++- docs/BACKEND_PIPELINE.md | 8 +++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index d0c7d1a..7b8f58a 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ uv pip install -e ".[gpu-cu12]" uv pip install -e ".[gpu-cu13]" ``` -The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. 
The CPU sklearn path is auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex) (Intel oneDAL). You can also manually select backends (cuML, sklearn) in the sidebar. +The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. The CPU sklearn path is auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex) [^1]. You can also manually select backends (`cuML`, `sklearn`) in the sidebar. ## Usage @@ -96,3 +96,5 @@ ssh -N -L 8501::8501 @ ## Acknowledgements [OpenCLIP](https://github.com/mlfoundations/open_clip) | [Streamlit](https://streamlit.io/) | [Altair](https://altair-viz.github.io/) + +[^1]: [`sklearn-intelex`](https://github.com/uxlfoundation/scikit-learn-intelex) is powered by the [oneDAL](https://github.com/uxlfoundation/oneDAL) library that provides accelerations on x86_64 Linux and Windows machines, and silently fall back to vanilla `sklearn` on unsupported architectures like Apple Silicon and ARM Linux. The package is under the [UXL Foundation](https://github.com/uxlfoundation) (a Linux Foundation project) so cross-vendor support is a stated goal. 
diff --git a/docs/BACKEND_PIPELINE.md b/docs/BACKEND_PIPELINE.md index 8a35c23..1ad8071 100644 --- a/docs/BACKEND_PIPELINE.md +++ b/docs/BACKEND_PIPELINE.md @@ -12,11 +12,11 @@ Raw Embeddings (from parquet or model) ├─ L2 Normalize: project onto unit hypersphere │ ├─► Step 1: KMeans Clustering (high-dimensional) - │ Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by sklearn-intelex) + │ Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by `sklearn-intelex` [^1]) │ ├─► Step 2: Dimensionality Reduction to 2D │ Method: PCA / t-SNE / UMAP - │ Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by sklearn-intelex for PCA/TSNE) + │ Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by `sklearn-intelex` for PCA/TSNE [^1]) │ └─► Scatter Plot (Altair) Color = cluster, position = 2D projection @@ -95,7 +95,7 @@ When you select "auto" (the default), the app picks the fastest available backen | Operation | Auto Logic | |-----------|-----------| -| KMeans | cuML if GPU + >500 samples, else sklearn (auto-accelerated by sklearn-intelex when installed) | +| KMeans | cuML if GPU + >500 samples, else sklearn (auto-accelerated by `sklearn-intelex` when installed [^1]) | | Dim. Reduction | cuML if GPU + >5000 samples, else sklearn (auto-accelerated by sklearn-intelex for PCA / t-SNE) | Any GPU error (architecture mismatch, missing libraries, out of memory (OOM)) triggers an @@ -126,3 +126,5 @@ sklearn (CPU, always works) The app is designed to *always produce a result*. GPU acceleration is a nice-to-have, never a hard requirement. + +[^1]: [`sklearn-intelex`](https://github.com/uxlfoundation/scikit-learn-intelex) is powered by the [oneDAL](https://github.com/uxlfoundation/oneDAL) library that provides accelerations on x86_64 Linux and Windows machines, and silently fall back to vanilla `sklearn` on unsupported architectures like Apple Silicon and ARM Linux. 
The package is under the [UXL Foundation](https://github.com/uxlfoundation) (a Linux Foundation project) so cross-vendor support is a stated goal. From 133e9bcfa0b9576e9fad2669e310c858350d1208 Mon Sep 17 00:00:00 2001 From: egrace479 Date: Mon, 8 Jun 2026 15:49:30 -0400 Subject: [PATCH 4/6] fix for optional extension install only on os that can use it --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0551f5f..2f2f4b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ dependencies = [ "scikit-learn>=1.0.0", # Intel oneDAL acceleration for sklearn (PCA / TSNE / KMeans) auto-patched at runtime. # Disable with EMB_EXPLORER_DISABLE_SKLEARNEX=1 if you need vanilla sklearn behavior for debugging. - "scikit-learn-intelex>=2025.0", + "scikit-learn-intelex>=2025.0; platform_machine == 'AMD64' or platform_machine == 'x86_64'", "umap-learn>=0.5.0", "numba>=0.57.0", # Vision-language models From 3d6e63b7d44d6b87955c89deea427281731fe903 Mon Sep 17 00:00:00 2001 From: Net Zhang Date: Tue, 9 Jun 2026 10:03:31 -0400 Subject: [PATCH 5/6] Apply footnote format changes Co-Authored-By: egrace479 --- README.md | 2 +- docs/BACKEND_PIPELINE.md | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7b8f58a..2cff7ad 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ uv pip install -e ".[gpu-cu12]" uv pip install -e ".[gpu-cu13]" ``` -The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. The CPU sklearn path is auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex) [^1]. You can also manually select backends (`cuML`, `sklearn`) in the sidebar. +The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. 
The CPU sklearn path is auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex)[^1]. You can also manually select backends (`cuML`, `sklearn`) in the sidebar. ## Usage diff --git a/docs/BACKEND_PIPELINE.md b/docs/BACKEND_PIPELINE.md index 1ad8071..92c2b4c 100644 --- a/docs/BACKEND_PIPELINE.md +++ b/docs/BACKEND_PIPELINE.md @@ -12,16 +12,18 @@ Raw Embeddings (from parquet or model) ├─ L2 Normalize: project onto unit hypersphere │ ├─► Step 1: KMeans Clustering (high-dimensional) - │ Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by `sklearn-intelex` [^1]) + │ Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by `sklearn-intelex`) │ ├─► Step 2: Dimensionality Reduction to 2D │ Method: PCA / t-SNE / UMAP - │ Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by `sklearn-intelex` for PCA/TSNE [^1]) + │ Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by `sklearn-intelex` for PCA/TSNE) │ └─► Scatter Plot (Altair) Color = cluster, position = 2D projection ``` +Note that `sklearn-intelex` acceleration is used for CPU operations where available[^1]. + ## Step 0: Embedding Preparation Before any computation, every embedding goes through `_prepare_embeddings()`: @@ -46,7 +48,7 @@ feature space, not a lossy 2D projection. | Backend | When It's Used | How It Works | |---------|---------------|--------------| | **cuML** | GPU available + >500 samples | GPU-accelerated KMeans via RAPIDS. Runs on CuPy arrays. Falls back to sklearn on any error. | -| **sklearn** | CPU path (default on machines without a GPU) | Standard scikit-learn KMeans, auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex) (Intel oneDAL) when installed — typically 10–17× faster than vanilla sklearn on CPU. Disable with `EMB_EXPLORER_DISABLE_SKLEARNEX=1`. 
| +| **sklearn** | CPU path (default on machines without a GPU) | Standard scikit-learn KMeans, auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex) (Intel oneDAL) when installed[^1] — typically 10–17× faster than vanilla sklearn on CPU. Disable with `EMB_EXPLORER_DISABLE_SKLEARNEX=1`. | **Auto-selection priority:** cuML > sklearn. You can override in the sidebar. @@ -95,7 +97,7 @@ When you select "auto" (the default), the app picks the fastest available backen | Operation | Auto Logic | |-----------|-----------| -| KMeans | cuML if GPU + >500 samples, else sklearn (auto-accelerated by `sklearn-intelex` when installed [^1]) | +| KMeans | cuML if GPU + >500 samples, else sklearn (auto-accelerated by `sklearn-intelex` when installed[^1]) | | Dim. Reduction | cuML if GPU + >5000 samples, else sklearn (auto-accelerated by sklearn-intelex for PCA / t-SNE) | Any GPU error (architecture mismatch, missing libraries, out of memory (OOM)) triggers an From a021cc8cc063893b0ba19b443601c7030ee4b614 Mon Sep 17 00:00:00 2001 From: Net Zhang Date: Tue, 9 Jun 2026 10:41:18 -0400 Subject: [PATCH 6/6] Add CI Matrix to test install & import on OS + Py Matrix Matrix: - OS: ubuntu-latest, windows-latest, macos-latest - Python: 3.10, 3.11, 3.12, 3.13 Each cell installs the CPU base (no gpu extras), runs import smoke tests for shared/utils/clustering and both Streamlit app entry points, then verifies the sklearnex platform marker: present on x86_64/AMD64, absent on arm64. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/install-and-import.yaml | 59 +++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 .github/workflows/install-and-import.yaml diff --git a/.github/workflows/install-and-import.yaml b/.github/workflows/install-and-import.yaml new file mode 100644 index 0000000..a188136 --- /dev/null +++ b/.github/workflows/install-and-import.yaml @@ -0,0 +1,59 @@ +name: install + import + +# CI matrix: verify the package installs cleanly and core modules +# import on every supported (OS, Python) pair, per pkg pyproject.toml's +# requires-python = ">=3.10,<3.14". +on: + pull_request: + workflow_dispatch: # allow manual reruns from the Actions tab + +jobs: + install-and-import: + name: ${{ matrix.os }} / py${{ matrix.python-version }} + runs-on: ${{ matrix.os }} + strategy: + # Don't abort the whole matrix on a single failure so we can see all platforms. + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ['3.10', '3.11', '3.12', '3.13'] + steps: + - uses: actions/checkout@v4 + + # uv is our package manager of record, mirrors what users do locally. + - uses: astral-sh/setup-uv@v6 + + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + # CPU-only install. gpu-cu12 / gpu-cu13 extras are skipped on CI runners + # (no NVIDIA hardware, CUDA wheels are large and slow to resolve). + - name: Install package + run: uv pip install --system -e . + + # Smoke-test that the modules that get patched or accelerated import + # cleanly, and that both Streamlit app entry points are importable. + - name: Import smoke test + run: | + python -c "import shared.utils.clustering" + python -c "import apps.precalculated.app" + python -c "import apps.embed_explore.app" + + # On x86_64 / AMD64 sklearnex must install (the platform marker in + # pyproject.toml ensures it). 
On macos-latest (arm64) it must NOT + # install, which is the whole point of the marker. + - name: Verify sklearnex matches platform marker + shell: python + run: | + import platform + import importlib.util + on_x86 = platform.machine() in ('x86_64', 'AMD64') + present = importlib.util.find_spec('sklearnex') is not None + if on_x86: + assert present, 'sklearnex should be installed on x86_64/AMD64' + import sklearnex + print(f'sklearnex {sklearnex.__version__} present on {platform.machine()}') + else: + assert not present, f'sklearnex must not install on {platform.machine()}' + print(f'sklearnex correctly absent on {platform.machine()}')