diff --git a/.github/workflows/install-and-import.yaml b/.github/workflows/install-and-import.yaml new file mode 100644 index 0000000..a188136 --- /dev/null +++ b/.github/workflows/install-and-import.yaml @@ -0,0 +1,59 @@ +name: install + import + +# CI matrix: verify the package installs cleanly and core modules +# import on every supported (OS, Python) pair, per pkg pyproject.toml's +# requires-python = ">=3.10,<3.14". +on: + pull_request: + workflow_dispatch: # allow manual reruns from the Actions tab + +jobs: + install-and-import: + name: ${{ matrix.os }} / py${{ matrix.python-version }} + runs-on: ${{ matrix.os }} + strategy: + # Don't abort the whole matrix on a single failure so we can see all platforms. + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ['3.10', '3.11', '3.12', '3.13'] + steps: + - uses: actions/checkout@v4 + + # uv is our package manager of record, mirrors what users do locally. + - uses: astral-sh/setup-uv@v6 + + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + # CPU-only install. gpu-cu12 / gpu-cu13 extras are skipped on CI runners + # (no NVIDIA hardware, CUDA wheels are large and slow to resolve). + - name: Install package + run: uv pip install --system -e . + + # Smoke-test that the modules that get patched or accelerated import + # cleanly, and that both Streamlit app entry points are importable. + - name: Import smoke test + run: | + python -c "import shared.utils.clustering" + python -c "import apps.precalculated.app" + python -c "import apps.embed_explore.app" + + # On x86_64 / AMD64 sklearnex must install (the platform marker in + # pyproject.toml ensures it). On macos-latest (arm64) it must NOT + # install, which is the whole point of the marker. 
+ - name: Verify sklearnex matches platform marker + shell: python + run: | + import platform + import importlib.util + on_x86 = platform.machine() in ('x86_64', 'AMD64') + present = importlib.util.find_spec('sklearnex') is not None + if on_x86: + assert present, 'sklearnex should be installed on x86_64/AMD64' + import sklearnex + print(f'sklearnex {sklearnex.__version__} present on {platform.machine()}') + else: + assert not present, f'sklearnex must not install on {platform.machine()}' + print(f'sklearnex correctly absent on {platform.machine()}') diff --git a/README.md b/README.md index 127a842..2cff7ad 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ uv pip install -e ".[gpu-cu12]" uv pip install -e ".[gpu-cu13]" ``` -The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. You can also manually select backends (cuML, FAISS, sklearn) in the sidebar. +The app auto-detects GPU availability at runtime and falls back to CPU if anything goes wrong — no configuration needed. The CPU sklearn path is auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex)[^1]. You can also manually select backends (`cuML`, `sklearn`) in the sidebar. ## Usage @@ -96,3 +96,5 @@ ssh -N -L 8501::8501 @ ## Acknowledgements [OpenCLIP](https://github.com/mlfoundations/open_clip) | [Streamlit](https://streamlit.io/) | [Altair](https://altair-viz.github.io/) + +[^1]: [`sklearn-intelex`](https://github.com/uxlfoundation/scikit-learn-intelex) is powered by the [oneDAL](https://github.com/uxlfoundation/oneDAL) library, which provides acceleration on x86_64 Linux and Windows machines; the package silently falls back to vanilla `sklearn` on unsupported architectures like Apple Silicon and ARM Linux. The package is under the [UXL Foundation](https://github.com/uxlfoundation) (a Linux Foundation project) so cross-vendor support is a stated goal. 
diff --git a/docs/BACKEND_PIPELINE.md b/docs/BACKEND_PIPELINE.md index 43c1209..92c2b4c 100644 --- a/docs/BACKEND_PIPELINE.md +++ b/docs/BACKEND_PIPELINE.md @@ -12,16 +12,18 @@ Raw Embeddings (from parquet or model) ├─ L2 Normalize: project onto unit hypersphere │ ├─► Step 1: KMeans Clustering (high-dimensional) - │ Backend: cuML → FAISS → sklearn + │ Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by `sklearn-intelex`) │ ├─► Step 2: Dimensionality Reduction to 2D │ Method: PCA / t-SNE / UMAP - │ Backend: cuML → sklearn + │ Backend: cuML (GPU) → sklearn (CPU, auto-accelerated by `sklearn-intelex` for PCA/TSNE) │ └─► Scatter Plot (Altair) Color = cluster, position = 2D projection ``` +Note that `sklearn-intelex` acceleration is used for CPU operations where available[^1]. + ## Step 0: Embedding Preparation Before any computation, every embedding goes through `_prepare_embeddings()`: @@ -46,10 +48,9 @@ feature space, not a lossy 2D projection. | Backend | When It's Used | How It Works | |---------|---------------|--------------| | **cuML** | GPU available + >500 samples | GPU-accelerated KMeans via RAPIDS. Runs on CuPy arrays. Falls back to sklearn on any error. | -| **FAISS** | No GPU + >500 samples | Facebook's optimized CPU KMeans using L2 index. Fast for medium datasets. Falls back to sklearn on error. | -| **sklearn** | Small datasets or fallback | Standard scikit-learn KMeans. Always works, no special dependencies. | +| **sklearn** | CPU path (default on machines without a GPU) | Standard scikit-learn KMeans, auto-accelerated by [scikit-learn-intelex](https://github.com/uxlfoundation/scikit-learn-intelex) (Intel oneDAL) when installed[^1] — typically 10–17× faster than vanilla sklearn on CPU. Disable with `EMB_EXPLORER_DISABLE_SKLEARNEX=1`. | -**Auto-selection priority:** cuML > FAISS > sklearn. You can override in the sidebar. +**Auto-selection priority:** cuML > sklearn. You can override in the sidebar. 
## Step 2: Dimensionality Reduction @@ -96,8 +97,8 @@ When you select "auto" (the default), the app picks the fastest available backen | Operation | Auto Logic | |-----------|-----------| -| KMeans | cuML if GPU + >500 samples, else FAISS if available + >500 samples, else sklearn | -| Dim. Reduction | cuML if GPU + >5000 samples, else sklearn | +| KMeans | cuML if GPU + >500 samples, else sklearn (auto-accelerated by `sklearn-intelex` when installed[^1]) | +| Dim. Reduction | cuML if GPU + >5000 samples, else sklearn (auto-accelerated by `sklearn-intelex` for PCA / t-SNE) | Any GPU error (architecture mismatch, missing libraries, out of memory (OOM)) triggers an automatic retry with sklearn. OOM errors are surfaced to the user with guidance. @@ -122,11 +123,10 @@ Check the log file for the full picture when debugging. cuML (GPU) │ error? ▼ -FAISS (CPU, optimized) ← KMeans only - │ error? - ▼ sklearn (CPU, always works) ``` The app is designed to *always produce a result*. GPU acceleration is a nice-to-have, never a hard requirement. + +[^1]: [`sklearn-intelex`](https://github.com/uxlfoundation/scikit-learn-intelex) is powered by the [oneDAL](https://github.com/uxlfoundation/oneDAL) library, which provides acceleration on x86_64 Linux and Windows machines; the package silently falls back to vanilla `sklearn` on unsupported architectures like Apple Silicon and ARM Linux. The package is under the [UXL Foundation](https://github.com/uxlfoundation) (a Linux Foundation project) so cross-vendor support is a stated goal. diff --git a/pyproject.toml b/pyproject.toml index 24f89af..2f2f4b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,9 +41,11 @@ dependencies = [ "altair>=5.0.0", # Machine learning "scikit-learn>=1.0.0", + # Intel oneDAL acceleration for sklearn (PCA / TSNE / KMeans) auto-patched at runtime. + # Disable with EMB_EXPLORER_DISABLE_SKLEARNEX=1 if you need vanilla sklearn behavior for debugging. 
+ "scikit-learn-intelex>=2025.0; platform_machine == 'AMD64' or platform_machine == 'x86_64'", "umap-learn>=0.5.0", - "numba>=0.57.0", - "faiss-cpu>=1.7.0", + "numba>=0.57.0", # Vision-language models "open-clip-torch>=2.20.0", # Custom inference package @@ -69,20 +71,17 @@ gpu = [ ] gpu-cu12 = [ "torch>=2.0.0", - "cuml-cu12>=25.6", - "faiss-gpu-cu12>=1.11.0", + "cuml-cu12>=26.4", # 26.4 removed the sklearn upper bound (compatible with sklearn>=1.8) "pynvml>=11.0.0", ] gpu-cu13 = [ "torch>=2.0.0", - "cuml-cu13>=25.12", - "faiss-gpu-cu12>=1.11.0", # no cu13 build on PyPI; cu12 works via CUDA backward compat + "cuml-cu13>=26.4", # 26.4 removed the sklearn upper bound (compatible with sklearn>=1.8) "pynvml>=11.0.0", ] -# Minimal GPU support (just PyTorch + FAISS GPU, no RAPIDS) +# Minimal GPU support for image embeddings generation (just PyTorch, no RAPIDS) gpu-minimal = [ "torch>=2.0.0", - "faiss-gpu-cu12>=1.11.0", ] all = [ "emb-explorer[dev,gpu]", diff --git a/shared/components/clustering_controls.py b/shared/components/clustering_controls.py index d18eb5d..34d3e1c 100644 --- a/shared/components/clustering_controls.py +++ b/shared/components/clustering_controls.py @@ -5,7 +5,7 @@ import streamlit as st from typing import Tuple, Optional -from shared.utils.backend import HAS_FAISS_PACKAGE, HAS_CUML_PACKAGE, HAS_CUPY_PACKAGE +from shared.utils.backend import HAS_CUML_PACKAGE, HAS_CUPY_PACKAGE def render_clustering_backend_controls(): @@ -19,9 +19,6 @@ def render_clustering_backend_controls(): dim_reduction_options = ["auto", "sklearn"] clustering_options = ["auto", "sklearn"] - if HAS_FAISS_PACKAGE: - clustering_options.append("faiss") - if HAS_CUML_PACKAGE and HAS_CUPY_PACKAGE: dim_reduction_options.append("cuml") clustering_options.append("cuml") @@ -73,7 +70,7 @@ def render_clustering_backend_controls(): max_value=64, value=8, step=1, - help="Number of parallel workers for CPU backends (sklearn, FAISS). 
Not used by cuML (GPU manages parallelization automatically)." + help="Number of parallel workers for CPU sklearn. Not used by cuML (GPU manages parallelization automatically)." ) @@ -118,8 +115,6 @@ def render_kmeans_controls(): Tuple of (clustering_backend, n_workers, seed) """ clustering_options = ["auto", "sklearn"] - if HAS_FAISS_PACKAGE: - clustering_options.append("faiss") if HAS_CUML_PACKAGE and HAS_CUPY_PACKAGE: clustering_options.append("cuml") diff --git a/shared/services/clustering_service.py b/shared/services/clustering_service.py index 150bcfc..6847b9f 100644 --- a/shared/services/clustering_service.py +++ b/shared/services/clustering_service.py @@ -38,8 +38,8 @@ def run_clustering( n_clusters: Number of clusters reduction_method: Dimensionality reduction method n_workers: Number of workers for reduction - dim_reduction_backend: Backend for dimensionality reduction ("auto", "sklearn", "faiss", "cuml") - clustering_backend: Backend for clustering ("auto", "sklearn", "faiss", "cuml") + dim_reduction_backend: Backend for dimensionality reduction ("auto", "sklearn", "cuml") + clustering_backend: Backend for clustering ("auto", "sklearn", "cuml") seed: Random seed for reproducibility (None for random) Returns: diff --git a/shared/utils/__init__.py b/shared/utils/__init__.py index b305aa7..2ae7a57 100644 --- a/shared/utils/__init__.py +++ b/shared/utils/__init__.py @@ -2,7 +2,7 @@ Shared utilities for clustering, IO, models, and taxonomy. Modules are imported lazily to avoid pulling in heavy dependencies -(sklearn, umap, faiss, cuml, torch, open_clip) at startup. +(sklearn, umap, cuml, torch, open_clip) at startup. 
Use direct imports instead: from shared.utils.clustering import reduce_dim, run_kmeans diff --git a/shared/utils/backend.py b/shared/utils/backend.py index ed66cad..c75db3f 100644 --- a/shared/utils/backend.py +++ b/shared/utils/backend.py @@ -20,7 +20,6 @@ # These are safe to call at module-load / render time — they only check # whether the package is installed, without executing it. -HAS_FAISS_PACKAGE: bool = importlib.util.find_spec("faiss") is not None HAS_CUML_PACKAGE: bool = importlib.util.find_spec("cuml") is not None HAS_CUPY_PACKAGE: bool = importlib.util.find_spec("cupy") is not None HAS_TORCH_PACKAGE: bool = importlib.util.find_spec("torch") is not None @@ -84,27 +83,16 @@ def check_cuml_available() -> bool: return False -def check_faiss_available() -> bool: - """Check if FAISS is available (actual import, for runtime use).""" - if not HAS_FAISS_PACKAGE: - return False - try: - import faiss - return True - except ImportError: - return False - - def resolve_backend(backend: str, operation: str = "general") -> str: """ Resolve 'auto' backend to actual backend based on available hardware. Args: - backend: Requested backend ("auto", "sklearn", "cuml", "faiss") + backend: Requested backend ("auto", "sklearn", "cuml") operation: Operation type for logging ("clustering", "reduction", "general") Returns: - Resolved backend name + Resolved backend name. CPU paths always go through sklearn. 
""" if backend != "auto": logger.debug(f"Using explicitly requested backend: {backend}") @@ -112,14 +100,10 @@ def resolve_backend(backend: str, operation: str = "general") -> str: cuda_available, device_info = check_cuda_available() has_cuml = check_cuml_available() - has_faiss = check_faiss_available() if cuda_available and has_cuml: resolved = "cuml" logger.info(f"Auto-resolved {operation} backend to cuML (GPU: {device_info})") - elif has_faiss: - resolved = "faiss" - logger.info(f"Auto-resolved {operation} backend to FAISS (CPU)") else: resolved = "sklearn" logger.info(f"Auto-resolved {operation} backend to sklearn (CPU)") @@ -140,7 +124,6 @@ def get_backend_info() -> dict: "cuda_available": cuda_available, "device_info": device_info, "cuml_available": check_cuml_available(), - "faiss_available": check_faiss_available(), } diff --git a/shared/utils/clustering.py b/shared/utils/clustering.py index f144f4b..4f85e83 100644 --- a/shared/utils/clustering.py +++ b/shared/utils/clustering.py @@ -8,16 +8,27 @@ from shared.utils.logging_config import get_logger from shared.utils.backend import ( - HAS_FAISS_PACKAGE, HAS_CUML_PACKAGE, HAS_CUPY_PACKAGE, - check_cuda_available, check_cuml_available, check_faiss_available, + HAS_CUML_PACKAGE, HAS_CUPY_PACKAGE, + check_cuda_available, check_cuml_available, ) logger = get_logger(__name__) +# Auto-enable scikit-learn-intelex (Intel oneDAL) acceleration for sklearn's +# PCA / TSNE / KMeans on CPU. Patches sklearn at import time so any downstream +# sklearn call gets the accelerated path transparently. 
Disable for debugging +# vanilla sklearn behavior with: EMB_EXPLORER_DISABLE_SKLEARNEX=1 +if os.environ.get("EMB_EXPLORER_DISABLE_SKLEARNEX", "0") != "1": + try: + from sklearnex import patch_sklearn + patch_sklearn() + logger.info("scikit-learn-intelex enabled (CPU sklearn auto-accelerated)") + except ImportError: + logger.debug("scikit-learn-intelex not installed; using vanilla sklearn") + # Legacy module-level flags — now backed by lightweight find_spec() checks # so importing this module no longer triggers heavy library loads. # Functions that actually need the libraries import them locally. -HAS_FAISS: bool = HAS_FAISS_PACKAGE HAS_CUML: bool = HAS_CUML_PACKAGE and HAS_CUPY_PACKAGE HAS_CUDA: bool = False # resolved lazily via check_cuda_available() @@ -342,8 +353,8 @@ def run_kmeans(embeddings: np.ndarray, n_clusters: int, seed: Optional[int] = No embeddings (np.ndarray): The input feature embeddings of shape (n_samples, n_features). n_clusters (int): The number of clusters to form. seed (int, optional): Random seed for reproducibility. Defaults to None (random). - n_workers (int, optional): Number of parallel workers (used by FAISS and cuML if available). - backend (str, optional): Clustering backend - "auto", "sklearn", "faiss", or "cuml". Defaults to "auto". + n_workers (int, optional): Number of parallel workers (used by cuML if available). + backend (str, optional): Clustering backend - "auto", "sklearn", or "cuml". Defaults to "auto". Returns: kmeans (KMeans or custom object): The fitted clustering object. 
@@ -362,20 +373,9 @@ def run_kmeans(embeddings: np.ndarray, n_clusters: int, seed: Optional[int] = No if backend == "cuml" and HAS_CUML and cuda_available: logger.info("Using cuML backend for KMeans") result = _run_kmeans_cuml(embeddings, n_clusters, seed, n_workers) - elif backend == "faiss" and HAS_FAISS: - logger.info("Using FAISS backend for KMeans") - result = _run_kmeans_faiss(embeddings, n_clusters, seed, n_workers) - elif backend == "auto": - # Auto selection priority: cuML > FAISS > sklearn - if HAS_CUML and cuda_available and n_samples > 500: - logger.info("Auto-selected cuML backend for KMeans (GPU available, large dataset)") - result = _run_kmeans_cuml(embeddings, n_clusters, seed, n_workers) - elif HAS_FAISS and n_samples > 500: - logger.info("Auto-selected FAISS backend for KMeans (large dataset)") - result = _run_kmeans_faiss(embeddings, n_clusters, seed, n_workers) - else: - logger.info("Using sklearn backend for KMeans") - result = _run_kmeans_sklearn(embeddings, n_clusters, seed) + elif backend == "auto" and HAS_CUML and cuda_available and n_samples > 500: + logger.info("Auto-selected cuML backend for KMeans (GPU available, large dataset)") + result = _run_kmeans_cuml(embeddings, n_clusters, seed, n_workers) else: logger.info("Using sklearn backend for KMeans") result = _run_kmeans_sklearn(embeddings, n_clusters, seed) @@ -443,55 +443,4 @@ def _run_kmeans_sklearn(embeddings: np.ndarray, n_clusters: int, seed: Optional[ return kmeans, labels -def _run_kmeans_faiss(embeddings: np.ndarray, n_clusters: int, seed: Optional[int] = None, n_workers: int = 1): - """KMeans using FAISS backend for faster clustering.""" - try: - import faiss - - # Ensure embeddings are float32 and C-contiguous (FAISS requirement) - embeddings = np.ascontiguousarray(embeddings.astype(np.float32)) - - n_samples, d = embeddings.shape - - # Set number of threads for FAISS - if n_workers > 1: - faiss.omp_set_num_threads(n_workers) - - # Create FAISS KMeans object - kmeans = 
faiss.Clustering(d, n_clusters) - - # Set clustering parameters - kmeans.verbose = False - kmeans.niter = 20 # Number of iterations - kmeans.nredo = 1 # Number of redos - if seed is not None: - kmeans.seed = seed - - # Use L2 distance (equivalent to sklearn's default) - index = faiss.IndexFlatL2(d) - - # Run clustering - kmeans.train(embeddings, index) - - # Get centroids - centroids = faiss.vector_to_array(kmeans.centroids).reshape(n_clusters, d) - - # Assign labels by finding nearest centroid for each point - _, labels = index.search(embeddings, 1) - labels = labels.flatten() - - # Create a simple object to mimic sklearn KMeans interface - class FAISSKMeans: - def __init__(self, centroids, labels): - self.cluster_centers_ = centroids - self.labels_ = labels - self.n_clusters = len(centroids) - - return FAISSKMeans(centroids, labels), labels - - except Exception as e: - # Fallback to sklearn if FAISS fails - logger.warning(f"FAISS clustering failed ({e}), falling back to sklearn") - return _run_kmeans_sklearn(embeddings, n_clusters, seed) -