diff --git a/.changeset/add-mongodb-adapter.md b/.changeset/add-mongodb-adapter.md new file mode 100644 index 0000000..740d9a2 --- /dev/null +++ b/.changeset/add-mongodb-adapter.md @@ -0,0 +1,8 @@ +--- +"payloadcms-vectorize": minor +"@payloadcms-vectorize/pg": minor +"@payloadcms-vectorize/cf": minor +"@payloadcms-vectorize/mongodb": minor +--- + +Add `@payloadcms-vectorize/mongodb` adapter (Atlas + self-hosted Community 8.2+) backed by `$vectorSearch`, with pre/post filter splitting and full WHERE-clause parity across operators (equals, not_equals, in, notIn, like, contains, gt/gte/lt/lte, exists, and/or). Search indexes are auto-ensured on first use. diff --git a/.changeset/config.json b/.changeset/config.json index b0b05fb..d2c86a0 100644 --- a/.changeset/config.json +++ b/.changeset/config.json @@ -6,7 +6,7 @@ ], "commit": false, "fixed": [ - ["payloadcms-vectorize", "@payloadcms-vectorize/pg", "@payloadcms-vectorize/cf"] + ["payloadcms-vectorize", "@payloadcms-vectorize/pg", "@payloadcms-vectorize/cf", "@payloadcms-vectorize/mongodb"] ], "access": "public", "baseBranch": "main", diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7057f71..28a4e43 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -157,6 +157,42 @@ jobs: PAYLOAD_SECRET: test-secret-key TEST_ENV: 1 + test_adapters_mongodb: + runs-on: ubuntu-latest + + services: + mongodb: + image: mongodb/mongodb-atlas-local:latest + ports: + - 27018:27017 + options: >- + --health-cmd "mongosh --quiet --eval 'db.runCommand({ping:1})'" + --health-interval 5s + --health-timeout 10s + --health-retries 30 + + steps: + - uses: actions/checkout@v4 + + - name: Install pnpm + uses: pnpm/action-setup@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'pnpm' + + - name: Install dependencies + run: pnpm install + + - name: Run mongodb adapter tests + run: pnpm test:adapters:mongodb + env: + PAYLOAD_SECRET: test-secret-key + MONGODB_URI: mongodb://localhost:27018/?directConnection=true + TEST_ENV: 1 + test_e2e: runs-on: ubuntu-latest @@ -231,10 +267,11 @@ jobs: test -f dist/index.d.ts test -f adapters/pg/dist/index.d.ts test -f adapters/cf/dist/index.d.ts + test -f adapters/mongodb/dist/index.d.ts test: runs-on: ubuntu-latest - needs: [typecheck, build, test_int, test_adapters_pg, test_adapters_cf, test_e2e] + needs: [typecheck, build, test_int, test_adapters_pg, test_adapters_cf, test_adapters_mongodb, test_e2e] if: always() steps: - name: Check required jobs @@ -244,6 +281,7 @@ jobs: [ "${{ needs.test_int.result }}" != "success" ] || \ [ "${{ needs.test_adapters_pg.result }}" != "success" ] || \ [ "${{ needs.test_adapters_cf.result }}" != "success" ] || \ + [ "${{ needs.test_adapters_mongodb.result }}" != "success" ] || \ [ "${{ needs.test_e2e.result }}" != "success" ]; then echo "One or more required jobs failed" exit 1 diff --git a/.gitignore b/.gitignore index 4e8441e..982a5de 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ node_modules/ /dist /adapters/pg/dist /adapters/cf/dist +/adapters/mongodb/dist # misc .DS_Store @@ -55,4 +56,7 @@ yarn-error.log* */secret # Cursor -.cursor/ \ No newline at end of file +.cursor/ + +# Worktrees +.worktrees/ \ No newline at end of file diff --git a/README.md b/README.md index c446ec9..d7ff370 100644 --- a/README.md +++ b/README.md @@ -52,10 +52,11 @@ A Payload CMS plugin that adds vector search capabilities to your collections. P This plugin requires a database adapter for vector storage. 
Available adapters: -| Adapter | Package | Database | Documentation | -| -------------------- | -------------------------- | --------------------------- | --------------------------------- | -| PostgreSQL | `@payloadcms-vectorize/pg` | PostgreSQL with pgvector | [README](./adapters/pg/README.md) | -| Cloudflare Vectorize | `@payloadcms-vectorize/cf` | Cloudflare Vectorize index | [README](./adapters/cf/README.md) | +| Adapter | Package | Database | Documentation | +| -------------------- | ------------------------------- | --------------------------------------- | -------------------------------------- | +| PostgreSQL | `@payloadcms-vectorize/pg` | PostgreSQL with pgvector | [README](./adapters/pg/README.md) | +| Cloudflare Vectorize | `@payloadcms-vectorize/cf` | Cloudflare Vectorize index | [README](./adapters/cf/README.md) | +| MongoDB | `@payloadcms-vectorize/mongodb` | MongoDB Atlas + self-hosted 8.2+ | [README](./adapters/mongodb/README.md) | See [adapters/README.md](./adapters/README.md) for information on creating custom adapters. @@ -72,8 +73,9 @@ See [adapters/README.md](./adapters/README.md) for information on creating custo pnpm add payloadcms-vectorize # Install a database adapter (one of the following) -pnpm add @payloadcms-vectorize/pg # PostgreSQL + pgvector -pnpm add @payloadcms-vectorize/cf # Cloudflare Vectorize +pnpm add @payloadcms-vectorize/pg # PostgreSQL + pgvector +pnpm add @payloadcms-vectorize/cf # Cloudflare Vectorize +pnpm add @payloadcms-vectorize/mongodb # MongoDB Atlas + self-hosted 8.2+ ``` ## Quick Start @@ -84,6 +86,7 @@ First, configure your database adapter. See the adapter-specific documentation: - **PostgreSQL**: [@payloadcms-vectorize/pg README](./adapters/pg/README.md) — pgvector setup, schema initialization, and migrations. - **Cloudflare Vectorize**: [@payloadcms-vectorize/cf README](./adapters/cf/README.md) — index creation, bindings, and known limitations. +- **MongoDB**: [@payloadcms-vectorize/mongodb README](./adapters/mongodb/README.md) — Atlas / self-hosted 8.2+, `filterableFields`, and the `$vectorSearch` index lifecycle. ### 2. Configure the Plugin @@ -222,6 +225,7 @@ Migration steps depend on your database adapter: - **PostgreSQL**: [@payloadcms-vectorize/pg README → Migrations](./adapters/pg/README.md#migrations) - **Cloudflare Vectorize**: index creation is a one-time setup step — see [@payloadcms-vectorize/cf README](./adapters/cf/README.md#1-create-vectorize-index). +- **MongoDB**: no manual migration — the `$vectorSearch` index is auto-ensured on first write. See [@payloadcms-vectorize/mongodb README → Index lifecycle](./adapters/mongodb/README.md#index-lifecycle). ### 4. Search Your Content @@ -298,6 +302,7 @@ Each adapter has its own configuration shape — this is where index parameters, - **PostgreSQL** (`dims`, `ivfflatLists`, schema initialization): [@payloadcms-vectorize/pg → Static Configuration](./adapters/pg/README.md#static-configuration) - **Cloudflare Vectorize** (`dims`, Vectorize binding): [@payloadcms-vectorize/cf → Configuration](./adapters/cf/README.md#configuration) +- **MongoDB** (`uri`, `dbName`, per-pool `dimensions` / `similarity` / `filterableFields` / `numCandidates` / `forceExact`): [@payloadcms-vectorize/mongodb → API Reference](./adapters/mongodb/README.md#api-reference) The embeddings collection name in Payload will be the same as the knowledge pool name. 
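+
+For a concrete picture of that shape, here is the MongoDB adapter's static config in miniature (a sketch of the Quick Start example from the adapter README; `dimensions` must match your embedding model's output):
+
+```typescript
+import { createMongoVectorIntegration } from '@payloadcms-vectorize/mongodb'
+
+// The returned `adapter` is what you hand to payloadcmsVectorize({ dbAdapter: adapter }).
+const { adapter } = createMongoVectorIntegration({
+  uri: process.env.MONGODB_URI!, // kept in the adapter closure, never written to payload.config
+  dbName: 'payload_vectorize',
+  pools: {
+    default: {
+      dimensions: 1024, // must equal the embedding model's output size
+      similarity: 'cosine', // the default
+      filterableFields: ['status'], // extension fields you plan to filter on
+    },
+  },
+})
+```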
@@ -374,7 +379,7 @@ You can filter on: References to fields that don't exist on the embeddings table are silently dropped (the rest of the clause still applies). -> **Adapter parity.** All operators are implemented in `@payloadcms-vectorize/pg`. The Cloudflare Vectorize adapter has narrower native filtering — see [@payloadcms-vectorize/cf → Known Limitations](./adapters/cf/README.md#metadata-filtering) for what is and isn't supported there. +> **Adapter parity.** All operators are implemented in `@payloadcms-vectorize/pg`. The Cloudflare Vectorize adapter has narrower native filtering — see [@payloadcms-vectorize/cf → Known Limitations](./adapters/cf/README.md#metadata-filtering) for what is and isn't supported there. The MongoDB adapter splits the clause into a native `$vectorSearch` pre-filter and a JS post-filter — `like`/`contains`/`all` and any mixed-pre/post `or` are post-filtered, so they may return fewer than `limit` rows. See [@payloadcms-vectorize/mongodb → WHERE clause behavior](./adapters/mongodb/README.md#where-clause-behavior). ## Chunkers @@ -1006,15 +1011,15 @@ Common scripts: **Already shipped:** - **Multiple Knowledge Pools** — independent configurations and embedding functions per pool. -- **Database Adapter Architecture** — pluggable backends (PostgreSQL, Cloudflare Vectorize today). +- **Database Adapter Architecture** — pluggable backends (PostgreSQL, Cloudflare Vectorize, MongoDB today). - **More expressive queries** — configurable limits, per-collection scoping, and full Payload-style metadata filtering (see [Metadata Filtering](#metadata-filtering-where)). - **Bulk Embed All** — admin button, provider callbacks, and run/batch tracking. - **Serverless-friendly job model** — bulk runs are split into small, requeueable units (`prepare-bulk-embedding` and `poll-or-complete-single-batch`) so individual jobs stay well under typical serverless time limits. The `batchLimit` option (see [CollectionVectorizeOption](#collectionvectorizeoption)) lets you cap docs-per-job to fit your platform. Tested locally and on Node-style hosts; deeper Vercel-specific integration testing is on the help-wanted list. - **Cloudflare Vectorize adapter** — `@payloadcms-vectorize/cf`. +- **MongoDB adapter** — `@payloadcms-vectorize/mongodb` (Atlas + self-hosted Community 8.2+ via `$vectorSearch`). **Help wanted** (priority is driven by community demand — open or 👍 an issue to push something up): -- **MongoDB adapter** — `@payloadcms-vectorize/mongodb` for MongoDB Atlas Vector Search. - **Additional adapters** — Pinecone, Qdrant, SQLite, etc. See [adapters/README.md](./adapters/README.md) for the `DbAdapter` contract. - **Vercel CI matrix** — exercising the serverless job model end-to-end on Vercel preview deployments. 
diff --git a/adapters/README.md b/adapters/README.md index 55ca868..096a6e4 100644 --- a/adapters/README.md +++ b/adapters/README.md @@ -37,10 +37,11 @@ ## Available Adapters -| Adapter | Package | Database | Version | Status | -| -------------------- | --------------------------------------------- | -------------------------- | -------- | ----------- | -| PostgreSQL | [`@payloadcms-vectorize/pg`](./pg/README.md) | PostgreSQL with `pgvector` | `0.7.2` | Stable | -| Cloudflare Vectorize | [`@payloadcms-vectorize/cf`](./cf/README.md) | Cloudflare Vectorize index | `0.7.2` | Beta | +| Adapter | Package | Database | Version | Status | +| -------------------- | ------------------------------------------------------------- | --------------------------------- | -------- | ----------- | +| PostgreSQL | [`@payloadcms-vectorize/pg`](./pg/README.md) | PostgreSQL with `pgvector` | `0.7.2` | Stable | +| Cloudflare Vectorize | [`@payloadcms-vectorize/cf`](./cf/README.md) | Cloudflare Vectorize index | `0.7.2` | Beta | +| MongoDB | [`@payloadcms-vectorize/mongodb`](./mongodb/README.md) | MongoDB Atlas + self-hosted 8.2+ | `0.7.2` | Beta | ## Architecture diff --git a/adapters/mongodb/README.md b/adapters/mongodb/README.md new file mode 100644 index 0000000..e3858d8 --- /dev/null +++ b/adapters/mongodb/README.md @@ -0,0 +1,338 @@ +# @payloadcms-vectorize/mongodb + +[![npm version](https://img.shields.io/npm/v/@payloadcms-vectorize/mongodb.svg)](https://www.npmjs.com/package/@payloadcms-vectorize/mongodb) +[![npm downloads](https://img.shields.io/npm/dm/@payloadcms-vectorize/mongodb.svg)](https://www.npmjs.com/package/@payloadcms-vectorize/mongodb) +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](../../LICENSE) +[![Payload CMS](https://img.shields.io/badge/Payload-3.x-000000.svg)](https://payloadcms.com) + +MongoDB adapter for [`payloadcms-vectorize`](https://github.com/techiejd/payloadcms-vectorize). Stores and queries embeddings via MongoDB's `$vectorSearch` aggregation stage. Targets **MongoDB Atlas** and **self-hosted MongoDB Community 8.2+** through a single code path — connection string is the only difference. + +> **Status:** `0.x` — pre-1.0. Designed for MongoDB Atlas; CI runs against [`mongodb/mongodb-atlas-local`](https://hub.docker.com/r/mongodb/mongodb-atlas-local) (the upstream `mongot` engine in the same image Atlas uses). The public API is stabilizing but may still have breaking changes between minor releases. Track the [CHANGELOG](./CHANGELOG.md) before upgrading. + +## Who is this for? + +Use this adapter if **all** of the following are true: + +- You already use (or plan to use) MongoDB for your application data, or want vector storage to live in the same database as your Payload documents. +- You are deploying to MongoDB Atlas (M0/Flex for development, **M10+** for production), or running self-hosted **MongoDB Community 8.2+** with `mongot` enabled. +- You can live with the [Limitations](#limitations) (post-filter operators may return fewer than `limit` rows, no geo predicates, OR clauses with `like`/`contains`/`all` are evaluated in JS). + +If you're on Postgres with `pgvector`, prefer [`@payloadcms-vectorize/pg`](../pg/README.md). If you're deploying to Cloudflare Workers, prefer [`@payloadcms-vectorize/cf`](../cf/README.md). 
+ +## Table of Contents + +- [Prerequisites](#prerequisites) +- [Installation](#installation) +- [How it works](#how-it-works) +- [Quick Start](#quick-start) +- [API Reference](#api-reference) + - [`createMongoVectorIntegration(options)`](#createmongovectorintegrationoptions) + - [Pool config](#pool-config) +- [`filterableFields` explained](#filterablefields-explained) +- [Index lifecycle](#index-lifecycle) +- [WHERE clause behavior](#where-clause-behavior) +- [Tuning `numCandidates` and `forceExact`](#tuning-numcandidates-and-forceexact) +- [Multiple Knowledge Pools](#multiple-knowledge-pools) +- [Tier guidance](#tier-guidance) +- [Limitations](#limitations) +- [Contributing](#contributing) +- [Changelog](#changelog) +- [License](#license) + +## Prerequisites + +- MongoDB Atlas (M0/Flex for development, **M10+** for production) **or** self-hosted MongoDB Community `>=8.2` with `mongot` enabled (e.g. via the [`mongodb/mongodb-atlas-local`](https://hub.docker.com/r/mongodb/mongodb-atlas-local) Docker image). +- The `mongodb` Node.js driver, `>=6.0.0` (peer dep). +- Payload CMS `3.x` (peer-dep range: `>=3.0.0 <4.0.0`). +- `payloadcms-vectorize` matching this adapter's version (peer-dep range: `>=0.7.2`). +- Node.js `^18.20.2` or `>=20.9.0`. + +## Installation + +```bash +pnpm add payloadcms-vectorize @payloadcms-vectorize/mongodb mongodb +``` + +## How it works + +The adapter is the bridge between Payload's vectorize plugin and a MongoDB collection backed by an Atlas-style search index. There are **two invariants** to know up front: + +> ⚠️ **Dimension parity:** the `dimensions` value on each pool **must equal** your embedding model's output size. Changing `dimensions` after the index exists requires manually dropping the search index — the adapter refuses to silently rebuild it. + +> ⚠️ **`filterableFields` must be declared up front.** MongoDB's `$vectorSearch` only accepts pre-filters on fields declared as `type: 'filter'` in the search index definition. Filtering on a field you forgot to declare throws a clear adapter error before the request hits Mongo. See [`filterableFields` explained](#filterablefields-explained). + +Beyond that, three facts shape day-to-day usage: + +1. **One Mongo collection per pool.** Default name `vectorize_${poolName}`; override with `collectionName`. The adapter does not multiplex pools onto a single collection. +2. **The search index is auto-ensured on first write.** [`storeChunk`](./src/embed.ts) calls [`ensureSearchIndex`](./src/indexes.ts), which creates the `vectorSearch` index if missing, polls until `READY`, and short-circuits on subsequent calls. See [Index lifecycle](#index-lifecycle). +3. **Reserved fields (`sourceCollection`, `docId`, `embeddingVersion`, `chunkIndex`, `chunkText`, `embedding`) are written and managed by the adapter.** `sourceCollection`, `docId`, and `embeddingVersion` are always declared as `type: 'filter'` in the index — you can filter on them without listing them in `filterableFields`. + +## Quick Start + +This Quick Start gets you a working semantic-search wiring against MongoDB. Paste each block in order. + +### 1. Run MongoDB locally (or use Atlas) + +For local development: + +```bash +docker run -d -p 27018:27017 mongodb/mongodb-atlas-local:latest +``` + +Set `MONGODB_URI=mongodb://localhost:27018/?directConnection=true`. + +For Atlas, set `MONGODB_URI` to your `mongodb+srv://...` connection string. Make sure your IP is in the access list and the user has `readWrite` on the database. + +### 2. 
Configure the plugin
+
+```typescript
+import { buildConfig } from 'payload'
+import { mongooseAdapter } from '@payloadcms/db-mongodb'
+import { embed, embedMany } from 'ai'
+import { voyage } from 'voyage-ai-provider'
+import payloadcmsVectorize from 'payloadcms-vectorize'
+import { createMongoVectorIntegration } from '@payloadcms-vectorize/mongodb'
+
+const embedDocs = async (texts: string[]): Promise<number[][]> => {
+  const result = await embedMany({
+    model: voyage.textEmbeddingModel('voyage-3.5-lite'),
+    values: texts,
+    providerOptions: { voyage: { inputType: 'document' } },
+  })
+  return result.embeddings
+}
+
+const embedQuery = async (text: string): Promise<number[]> => {
+  const result = await embed({
+    model: voyage.textEmbeddingModel('voyage-3.5-lite'),
+    value: text,
+    providerOptions: { voyage: { inputType: 'query' } },
+  })
+  return result.embedding
+}
+
+const { adapter } = createMongoVectorIntegration({
+  uri: process.env.MONGODB_URI!,
+  dbName: 'payload_vectorize',
+  pools: {
+    default: {
+      dimensions: 1024, // matches voyage-3.5-lite
+      similarity: 'cosine',
+      filterableFields: ['status', 'category'],
+    },
+  },
+})
+
+export default buildConfig({
+  db: mongooseAdapter({ url: process.env.MONGODB_URI! }),
+  collections: [
+    {
+      slug: 'posts',
+      fields: [
+        { name: 'title', type: 'text' },
+        { name: 'status', type: 'select', options: ['draft', 'published'] },
+        { name: 'category', type: 'text' },
+      ],
+    },
+  ],
+  plugins: [
+    payloadcmsVectorize({
+      dbAdapter: adapter,
+      knowledgePools: {
+        default: {
+          collections: {
+            posts: {
+              toKnowledgePool: async (doc) => [{ chunk: doc.title || '' }],
+            },
+          },
+          embeddingConfig: {
+            version: 'v1.0.0',
+            queryFn: embedQuery,
+            realTimeIngestionFn: embedDocs,
+          },
+        },
+      },
+    }),
+  ],
+  secret: process.env.PAYLOAD_SECRET!,
+})
+```
+
+### 3. Verify it works
+
+After Payload is running, create a post and run a vector search through the plugin's REST endpoint (or `payload.find` from server code):
+
+```bash
+# Create a post (real-time ingestion path embeds + writes a chunk)
+curl -X POST http://localhost:3000/api/posts \
+  -H "Content-Type: application/json" \
+  -d '{"title": "How to cancel a subscription", "status": "published", "category": "billing"}'
+
+# Search by semantic similarity, scoped to published billing posts
+curl -X POST http://localhost:3000/api/payloadcms-vectorize/search \
+  -H "Content-Type: application/json" \
+  -d '{
+    "knowledgePool": "default",
+    "query": "refund my account",
+    "limit": 5,
+    "where": {
+      "and": [
+        { "status": { "equals": "published" } },
+        { "category": { "equals": "billing" } }
+      ]
+    }
+  }'
+```
+
+The first write may take 5–30s while `mongot` builds the search index; subsequent calls are no-ops. If filtering returns nothing, verify the field is in `filterableFields` — see [`filterableFields` explained](#filterablefields-explained).
+
+## API Reference
+
+### `createMongoVectorIntegration(options)`
+
+Creates the `DbAdapter` that the core plugin uses for vector storage.
+
+**Parameters:**
+
+| Parameter | Type | Required | Description |
+| --- | --- | --- | --- |
+| `options.uri` | `string` | Yes | Any valid MongoDB connection string (Atlas SRV or self-hosted). The URI lives in the adapter closure and is **not** written to `payload.config` — credentials never leak via `getConfigExtension`. |
+| `options.dbName` | `string` | Yes | Database that holds the per-pool vector collections. |
+| `options.pools` | `Record<string, PoolConfig>` | Yes | Pools keyed by knowledge-pool name. 
Pool names must match the keys of `knowledgePools` passed to `payloadcmsVectorize(...)`. Must contain at least one pool. |
+
+**Returns:** `{ adapter: DbAdapter }` — pass `adapter` to `payloadcmsVectorize({ dbAdapter })`.
+
+### Pool config
+
+| Field | Type | Required | Default | Description |
+| --- | --- | --- | --- | --- |
+| `dimensions` | `number` | Yes | — | Vector dimensions for this pool. Must match your embedding model's output. |
+| `similarity` | `'cosine' \| 'euclidean' \| 'dotProduct'` | No | `'cosine'` | Similarity metric for the search index. |
+| `numCandidates` | `number` | No | `limit * 10` (search-time) | ANN candidate set size for HNSW. See [Tuning `numCandidates` and `forceExact`](#tuning-numcandidates-and-forceexact). |
+| `filterableFields` | `string[]` | No | `[]` | Extension fields you'll filter on in `where` clauses. Reserved fields (`sourceCollection`, `docId`, `embeddingVersion`) are always filterable. See [`filterableFields` explained](#filterablefields-explained). |
+| `forceExact` | `boolean` | No | `false` | Use ENN exact full-scan instead of ANN. See [Tuning `numCandidates` and `forceExact`](#tuning-numcandidates-and-forceexact). |
+| `collectionName` | `string` | No | `vectorize_${poolName}` | Override Mongo collection name. |
+| `indexName` | `string` | No | `${collectionName}_idx` | Override search index name. |
+
+## `filterableFields` explained
+
+MongoDB's `$vectorSearch` requires every field used in its native pre-filter to be declared as `type: 'filter'` in the search index definition. The adapter automatically declares the reserved fields (`sourceCollection`, `docId`, `embeddingVersion`) and any field name you list in `filterableFields`.
+
+Filtering on a field NOT in `filterableFields` (and not reserved) throws a clear adapter-side error before the request hits Mongo, rather than silently falling back to a slow scan or returning nothing.
+
+Reserved fields are also re-listed under [How it works](#how-it-works) — you don't need to declare them.
+
+## Index lifecycle
+
+`ensureSearchIndex` runs lazily on the first `storeChunk` per pool ([`indexes.ts`](./src/indexes.ts)):
+
+1. Lists existing search indexes via `collection.listSearchIndexes(indexName)`.
+2. If the named index already exists with the **same** definition (`READY` or `BUILDING`), short-circuits.
+3. If it exists with a **different** definition, throws an error. **Auto-dropping is unsafe** — drop manually:
+   ```js
+   db.collection('vectorize_default').dropSearchIndex('vectorize_default_idx')
+   ```
+4. Otherwise creates the index (`createSearchIndex({ type: 'vectorSearch' })`) and polls until `status === 'READY'` (≤ 60s by default).
+
+Concurrent `ensureSearchIndex` calls for the same `(db, collection, indexName)` share a single in-flight promise, so a thundering-herd of writes does not produce duplicate `createSearchIndex` calls.
+
+The first write per pool may take ~5–30s while the index builds; subsequent calls are no-ops. On a cold M10 cluster the first build can occasionally exceed 60s — if you see `Search index ... did not become READY within 60s`, wait, retry, and please [open an issue](https://github.com/techiejd/payloadcms-vectorize/issues) so we can make this configurable.
+
+## WHERE clause behavior
+
+The adapter splits a Payload `Where` clause into two stages ([`convertWhere.ts`](./src/convertWhere.ts)):
+
+| Operator | Stage | Notes |
+| --- | --- | --- |
+| `equals`, `not_equals` (`notEquals`) | Pre-filter | Native `$vectorSearch.filter`, applied **before** topK. 
| +| `in`, `not_in` (`notIn`) | Pre-filter | Native, applied **before** topK. | +| `greater_than` (`greaterThan`), `greater_than_equal` (`greaterThanEqual`) | Pre-filter | Native, applied **before** topK. | +| `less_than` (`lessThan`), `less_than_equal` (`lessThanEqual`) | Pre-filter | Native, applied **before** topK. | +| `exists` | Pre-filter | Maps to `$exists` + null check. | +| `and` | Pre-filter when all branches are pre; mixed pre/post otherwise | Pre-branches stay native; post-branches evaluated in JS. | +| `or` | Pre-filter when all branches are pre; **otherwise entire OR routes to post-filter** | Required to preserve disjunction semantics. | +| `like`, `contains`, `all` | Post-filter | Not expressible in `$vectorSearch.filter`; applied in JS against the post-`$vectorSearch` rows. | +| `near`, `within`, `intersects` | **Unsupported** | Throws a clear adapter error — Mongo's `$vectorSearch` does not expose geo predicates. | + +`id` is automatically mapped to `_id` and 24-hex strings are cast to `ObjectId` (including inside `in`/`notIn` arrays). + +> **Result-count caveat.** `$vectorSearch.limit` is applied **before** any post-filter. If many rows fail the post-filter, you may receive fewer than `limit` results. The adapter does not over-fetch — this matches the [Cloudflare Vectorize adapter's](../cf/README.md#metadata-filtering) post-filter behavior. Best practices: tighten pre-filters, increase `limit`, or split the query. + +> **Mixed-OR caveat.** When any branch of an `or` clause needs a post-filter operator, the entire `or` is routed to post-filter — the pre-filter is dropped from `$vectorSearch.filter`. With a high-cardinality collection the unfiltered top-K may not contain all matching rows. If you can rewrite as `and` of disjunctions, do. + +## Tuning `numCandidates` and `forceExact` + +`$vectorSearch` runs HNSW ANN by default, sampling `numCandidates` vectors and returning the best `limit`. + +- **`numCandidates`** — defaults to `limit * 10`. Atlas docs recommend **10×–20×** of `limit`; bump to `limit * 20` (or higher) when you need better recall, especially with restrictive pre-filters that may force the ANN walk past most candidates. Higher `numCandidates` costs latency and RU/credits. +- **`forceExact: true`** — switches to ENN exact full-scan. Use when (a) recall matters more than latency and (b) the collection is small enough that a full scan is cheap, or (c) your pre-filter is so restrictive that ANN regularly returns < `limit` results because the candidate pool doesn't intersect the filter. Not recommended for collections > ~100k vectors. + +## Multiple Knowledge Pools + +Each pool gets its own collection and its own search index. Configure them in the same `pools` object — no extra wiring needed: + +```typescript +const { adapter } = createMongoVectorIntegration({ + uri: process.env.MONGODB_URI!, + dbName: 'payload_vectorize', + pools: { + posts: { + dimensions: 1024, + filterableFields: ['status', 'category'], + }, + images: { + dimensions: 512, + filterableFields: ['caption'], + collectionName: 'image_vectors', // override default `vectorize_images` + }, + }, +}) +``` + +Pool names must match the keys of `knowledgePools` you pass to `payloadcmsVectorize({...})`. + +## Tier guidance + +- **Atlas M0 / Flex (free):** development only. Search index runs on a single shared replica with limited memory; query latency is unpredictable under load. +- **Atlas M10+:** production. 
Use [Search Nodes](https://www.mongodb.com/docs/atlas/cluster-config/multi-cloud-distribution/) for dedicated `mongot` capacity if your vector workload is meaningful. +- **Self-hosted Community 8.2+:** supported. `mongot` is upstream-source-available (SSPL); verify you're on a build that includes the version you tested against. + +## Limitations + +Each item below links to the section that explains the mechanism, so you can decide if it's a blocker for your workload. + +- **Post-filter result count** — `like`/`contains`/`all` and any mixed-pre/post `or` may return fewer than `limit` results. See [WHERE clause behavior → Result-count caveat](#where-clause-behavior). +- **Geo operators** — `near`/`within`/`intersects` throw at convert time. Mongo's `$vectorSearch` does not expose geo predicates. See [WHERE clause behavior](#where-clause-behavior). +- **Index immutability** — changing `dimensions`, `similarity`, or `filterableFields` after the index exists requires `db.collection(...).dropSearchIndex(...)` first. The adapter refuses to silently rebuild. See [Index lifecycle](#index-lifecycle). +- **No automatic retry/backoff** — transient `mongot` errors propagate to the caller. Wrap your search/store calls if your runtime needs retries. +- **CI runs against `mongodb-atlas-local`, not managed Atlas** — the same `mongot` engine, but managed-Atlas-only behavior (e.g. Search Nodes routing, very-large-collection index build times) is not exercised in CI. If you hit something Atlas-specific, please [open an issue](https://github.com/techiejd/payloadcms-vectorize/issues). + +## Contributing + +Issues and PRs are welcome. The repo lives at [github.com/techiejd/payloadcms-vectorize](https://github.com/techiejd/payloadcms-vectorize) — please open an issue before sending a non-trivial PR so we can align on the approach. + +For local development, see the root [README](../../README.md). The adapter test suite uses the bundled [`dev/docker-compose.yml`](./dev/docker-compose.yml): + +```bash +pnpm --filter @payloadcms-vectorize/mongodb test:setup # starts mongodb-atlas-local on :27018 +pnpm test:adapters:mongodb # runs the spec suite +pnpm --filter @payloadcms-vectorize/mongodb test:teardown # stops the container +``` + +The source layout under [`src/`](./src/) is intentionally small: + +- [`index.ts`](./src/index.ts) — exports `createMongoVectorIntegration`, wires `DbAdapter` methods. +- [`client.ts`](./src/client.ts) — `MongoClient` cache keyed by URI; rejected connects evict, so a transient failure doesn't poison the cache. +- [`embed.ts`](./src/embed.ts) — `storeChunk` (insert + ensure index). +- [`search.ts`](./src/search.ts) — `searchImpl` (build pipeline, run `$vectorSearch`, apply post-filter). +- [`indexes.ts`](./src/indexes.ts) — `ensureSearchIndex` (create / poll / detect drift). +- [`convertWhere.ts`](./src/convertWhere.ts) — Payload `Where` → Mongo pre-filter + JS post-filter splitter. +- [`types.ts`](./src/types.ts) — config shapes and reserved-field constants. + +## Changelog + +See [CHANGELOG.md](./CHANGELOG.md) for release notes. Releases are managed by [Changesets](https://github.com/changesets/changesets) — when contributing, run `pnpm changeset` to describe your change. 
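+
+If a change touches `convertWhere.ts`, the contract to preserve is small. Here is a sketch of the split, mirroring what `dev/specs/convertWhere.spec.ts` asserts:
+
+```typescript
+import { convertWhereToMongo, evaluatePostFilter } from './src/convertWhere.js'
+
+// Natively expressible operators become a $vectorSearch pre-filter.
+convertWhereToMongo({ status: { equals: 'published' } }, ['status'], 'default')
+// => { preFilter: { status: { $eq: 'published' } }, postFilter: null }
+
+// like/contains/all cannot be expressed natively, so the leaf routes to the JS post-filter.
+convertWhereToMongo({ tags: { like: 'javascript' } }, ['tags'], 'default')
+// => { preFilter: null, postFilter: { tags: { like: 'javascript' } } }
+
+// The post-filter runs per returned row; like is a case-insensitive substring match.
+evaluatePostFilter({ tags: 'JavaScript,react' }, { tags: { like: 'javascript' } }) // true
+```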
+ +## License + +[MIT](../../LICENSE) diff --git a/adapters/mongodb/dev/docker-compose.yml b/adapters/mongodb/dev/docker-compose.yml new file mode 100644 index 0000000..7604230 --- /dev/null +++ b/adapters/mongodb/dev/docker-compose.yml @@ -0,0 +1,11 @@ +services: + mongodb-atlas: + image: mongodb/mongodb-atlas-local:latest + container_name: vectorize-mongodb-test + ports: + - "27018:27017" + healthcheck: + test: ["CMD", "mongosh", "--quiet", "--eval", "db.runCommand({ping:1})"] + interval: 2s + timeout: 5s + retries: 30 diff --git a/adapters/mongodb/dev/specs/client.spec.ts b/adapters/mongodb/dev/specs/client.spec.ts new file mode 100644 index 0000000..496bca7 --- /dev/null +++ b/adapters/mongodb/dev/specs/client.spec.ts @@ -0,0 +1,20 @@ +import { MongoClient } from 'mongodb' +import { afterEach, describe, expect, test, vi } from 'vitest' +import { __closeForTests, getMongoClient } from '../../src/client.js' + +afterEach(async () => { + vi.restoreAllMocks() + await __closeForTests() +}) + +describe('getMongoClient cache', () => { + test('a rejected connect attempt is not cached — the next call retries (verified by connect call count)', async () => { + const bad = 'mongodb://127.0.0.1:1/?serverSelectionTimeoutMS=200&directConnection=true' + const connectSpy = vi.spyOn(MongoClient, 'connect') + + await expect(getMongoClient(bad)).rejects.toThrow() + await expect(getMongoClient(bad)).rejects.toThrow() + + expect(connectSpy).toHaveBeenCalledTimes(2) + }) +}) diff --git a/adapters/mongodb/dev/specs/compliance.spec.ts b/adapters/mongodb/dev/specs/compliance.spec.ts new file mode 100644 index 0000000..2e798ea --- /dev/null +++ b/adapters/mongodb/dev/specs/compliance.spec.ts @@ -0,0 +1,237 @@ +import { afterAll, beforeAll, describe, expect, test } from 'vitest' +import { MongoClient } from 'mongodb' +import type { BasePayload } from 'payload' +import type { DbAdapter } from 'payloadcms-vectorize' +import { DIMS, MONGO_URI, TEST_DB } from './constants.js' +import { buildMongoTestPayload, teardownDbs } from './utils.js' +import { + makeDummyEmbedDocs, + makeDummyEmbedQuery, + testEmbeddingVersion, +} from '@shared-test/helpers/embed' + +describe('Mongo Adapter Compliance Tests', () => { + let adapter: DbAdapter + let payload: BasePayload + + beforeAll(async () => { + const built = await buildMongoTestPayload({ + uri: MONGO_URI, + dbName: TEST_DB, + pools: { default: { dimensions: DIMS, filterableFields: [] } }, + knowledgePools: { + default: { + collections: {}, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, + }, + }, + }) + payload = built.payload + adapter = built.adapter + }) + + afterAll(async () => { + await teardownDbs(payload, MONGO_URI, TEST_DB) + }) + + describe('getConfigExtension()', () => { + test('returns object with custom._mongoConfig', () => { + const ext = adapter.getConfigExtension({} as any) + expect(ext.custom?._mongoConfig).toBeDefined() + expect(ext.custom!._mongoConfig).not.toHaveProperty('uri') + expect(ext.custom!._mongoConfig.dbName).toBe(`${TEST_DB}_vectors`) + expect(ext.custom!._mongoConfig.pools.default.dimensions).toBe(DIMS) + }) + + test('does NOT include any collections (Mongo manages docs via raw driver)', () => { + const ext = adapter.getConfigExtension({} as any) + expect(ext.collections).toBeUndefined() + }) + }) + + describe('storeChunk()', () => { + test('persists embedding (number[])', async () => { + const embedding = Array(DIMS) + .fill(0) + .map(() => 
Math.random()) + await expect( + adapter.storeChunk(payload, 'default', { + sourceCollection: 'test-collection', + docId: `embed-1-${Date.now()}`, + chunkIndex: 0, + chunkText: 'test text', + embeddingVersion: 'v1', + embedding, + extensionFields: {}, + }), + ).resolves.not.toThrow() + }) + + test('persists embedding (Float32Array)', async () => { + const embedding = new Float32Array( + Array(DIMS) + .fill(0) + .map(() => Math.random()), + ) + await expect( + adapter.storeChunk(payload, 'default', { + sourceCollection: 'test-collection', + docId: `embed-2-${Date.now()}`, + chunkIndex: 0, + chunkText: 'test text float32', + embeddingVersion: 'v1', + embedding, + extensionFields: {}, + }), + ).resolves.not.toThrow() + }) + }) + + describe('search()', () => { + let target: number[] + beforeAll(async () => { + target = Array(DIMS).fill(0.5) + const similar = target.map((v) => v + Math.random() * 0.05) + await adapter.storeChunk(payload, 'default', { + sourceCollection: 'test-collection', + docId: `search-similar-${Date.now()}`, + chunkIndex: 0, + chunkText: 'similar doc', + embeddingVersion: 'v1', + embedding: similar, + extensionFields: {}, + }) + }) + + test('returns an array of results', async () => { + const results = await adapter.search(payload, target, 'default') + expect(Array.isArray(results)).toBe(true) + }) + + test('results have all required fields with correct types', async () => { + const results = await adapter.search(payload, target, 'default') + for (const r of results) { + expect(typeof r.id).toBe('string') + expect(typeof r.score).toBe('number') + expect(typeof r.sourceCollection).toBe('string') + expect(typeof r.docId).toBe('string') + expect(typeof r.chunkIndex).toBe('number') + expect(typeof r.chunkText).toBe('string') + expect(typeof r.embeddingVersion).toBe('string') + } + }) + + test('results are ordered by score (highest first)', async () => { + const results = await adapter.search(payload, target, 'default', 10) + for (let i = 1; i < results.length; i++) { + expect(results[i - 1].score).toBeGreaterThanOrEqual(results[i].score) + } + }) + + test('respects limit parameter', async () => { + const results = await adapter.search(payload, target, 'default', 1) + expect(results.length).toBeLessThanOrEqual(1) + }) + }) + + describe('deleteChunks()', () => { + test('removes chunks for a doc', async () => { + const docId = `to-delete-${Date.now()}` + await adapter.storeChunk(payload, 'default', { + sourceCollection: 'delete-test', + docId, + chunkIndex: 0, + chunkText: 'doc to delete', + embeddingVersion: 'v1', + embedding: Array(DIMS).fill(0.7), + extensionFields: {}, + }) + + const c = new MongoClient(MONGO_URI) + await c.connect() + const before = await c + .db(`${TEST_DB}_vectors`) + .collection('vectorize_default') + .countDocuments({ sourceCollection: 'delete-test', docId }) + expect(before).toBeGreaterThan(0) + + await adapter.deleteChunks(payload, 'default', 'delete-test', docId) + + const after = await c + .db(`${TEST_DB}_vectors`) + .collection('vectorize_default') + .countDocuments({ sourceCollection: 'delete-test', docId }) + expect(after).toBe(0) + await c.close() + }) + + test('handles missing doc gracefully', async () => { + await expect( + adapter.deleteChunks(payload, 'default', 'never-existed', 'fake-id'), + ).resolves.not.toThrow() + }) + }) + + describe('hasEmbeddingVersion()', () => { + test('true when chunk exists', async () => { + const docId = `has-version-${Date.now()}` + await adapter.storeChunk(payload, 'default', { + sourceCollection: 
'test-collection', + docId, + chunkIndex: 0, + chunkText: 'has version test', + embeddingVersion: 'v1', + embedding: Array(DIMS).fill(0.5), + extensionFields: {}, + }) + const r = await adapter.hasEmbeddingVersion( + payload, 'default', 'test-collection', docId, 'v1', + ) + expect(r).toBe(true) + }) + + test('false when no chunk exists', async () => { + const r = await adapter.hasEmbeddingVersion( + payload, 'default', 'test-collection', 'never-existed', 'v1', + ) + expect(r).toBe(false) + }) + }) + + describe('unknown pool errors', () => { + test('search throws Unknown pool', async () => { + await expect( + adapter.search(payload, Array(DIMS).fill(0.0), 'pool_does_not_exist', 5), + ).rejects.toThrow(/Unknown pool/) + }) + + test('storeChunk throws Unknown pool', async () => { + await expect( + adapter.storeChunk(payload, 'pool_does_not_exist', { + sourceCollection: 'src', + docId: 'x', + chunkIndex: 0, + chunkText: 'x', + embeddingVersion: 'v', + embedding: Array(DIMS).fill(0.0), + extensionFields: {}, + }), + ).rejects.toThrow(/Unknown pool/) + }) + }) + + describe('search input validation', () => { + test.each([0, -1, 1.5, NaN])( + 'search rejects non-positive-integer limit (%s)', + async (limit) => { + await expect( + adapter.search(payload, Array(DIMS).fill(0.0), 'default', limit), + ).rejects.toThrow(/limit must be a positive integer/) + }, + ) + }) +}) diff --git a/adapters/mongodb/dev/specs/constants.ts b/adapters/mongodb/dev/specs/constants.ts new file mode 100644 index 0000000..ccad1a2 --- /dev/null +++ b/adapters/mongodb/dev/specs/constants.ts @@ -0,0 +1,5 @@ +export const DIMS = 8 +export const MONGO_URI = + process.env.MONGODB_URI || 'mongodb://localhost:27018/?directConnection=true' + +export const TEST_DB = `vectorize_mongo_test_${Date.now()}` diff --git a/adapters/mongodb/dev/specs/convertWhere.spec.ts b/adapters/mongodb/dev/specs/convertWhere.spec.ts new file mode 100644 index 0000000..3c8e3e6 --- /dev/null +++ b/adapters/mongodb/dev/specs/convertWhere.spec.ts @@ -0,0 +1,373 @@ +// adapters/mongodb/dev/specs/convertWhere.spec.ts +import { describe, expect, test } from 'vitest' +import { ObjectId } from 'mongodb' +import { convertWhereToMongo } from '../../src/convertWhere.js' + +const FILTERABLE = ['status', 'category', 'views', 'rating', 'published', 'tags'] + +describe('convertWhereToMongo — pre-filter operators', () => { + test('equals', () => { + expect( + convertWhereToMongo({ status: { equals: 'published' } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { status: { $eq: 'published' } }, postFilter: null }) + }) + + test('not_equals (snake) and notEquals (camel)', () => { + expect( + convertWhereToMongo({ status: { not_equals: 'draft' } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { status: { $ne: 'draft' } }, postFilter: null }) + expect( + convertWhereToMongo({ status: { notEquals: 'draft' } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { status: { $ne: 'draft' } }, postFilter: null }) + }) + + test('in / not_in / notIn', () => { + expect( + convertWhereToMongo({ status: { in: ['a', 'b'] } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { status: { $in: ['a', 'b'] } }, postFilter: null }) + expect( + convertWhereToMongo({ status: { not_in: ['a'] } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { status: { $nin: ['a'] } }, postFilter: null }) + expect( + convertWhereToMongo({ status: { notIn: ['a'] } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { status: { $nin: ['a'] } }, postFilter: null }) + }) + + test('greater_than / greaterThan / less_than_equal etc.', () 
=> { + expect( + convertWhereToMongo({ views: { greater_than: 100 } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { views: { $gt: 100 } }, postFilter: null }) + expect( + convertWhereToMongo({ views: { greaterThan: 100 } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { views: { $gt: 100 } }, postFilter: null }) + expect( + convertWhereToMongo({ views: { greater_than_equal: 100 } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { views: { $gte: 100 } }, postFilter: null }) + expect( + convertWhereToMongo({ views: { less_than: 100 } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { views: { $lt: 100 } }, postFilter: null }) + expect( + convertWhereToMongo({ views: { less_than_equal: 100 } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { views: { $lte: 100 } }, postFilter: null }) + }) + + test('exists true → $exists + $ne null', () => { + expect( + convertWhereToMongo({ category: { exists: true } }, FILTERABLE, 'p1'), + ).toEqual({ + preFilter: { category: { $exists: true, $ne: null } }, + postFilter: null, + }) + }) + + test('exists false → $exists false OR $eq null', () => { + expect( + convertWhereToMongo({ category: { exists: false } }, FILTERABLE, 'p1'), + ).toEqual({ + preFilter: { $or: [{ category: { $exists: false } }, { category: { $eq: null } }] }, + postFilter: null, + }) + }) + + test('multiple operators on same field combine via $and', () => { + const result = convertWhereToMongo( + { views: { greater_than: 50, less_than: 200 } }, + FILTERABLE, + 'p1', + ) + expect(result).toEqual({ + preFilter: { $and: [{ views: { $gt: 50 } }, { views: { $lt: 200 } }] }, + postFilter: null, + }) + }) + + test('reserved field always usable even when filterableFields is empty', () => { + expect( + convertWhereToMongo( + { sourceCollection: { equals: 'articles' } }, + [], + 'p1', + ), + ).toEqual({ + preFilter: { sourceCollection: { $eq: 'articles' } }, + postFilter: null, + }) + }) +}) + +describe('convertWhereToMongo — post-filter operators', () => { + test('like routes the whole leaf to post-filter (verbatim Where)', () => { + expect( + convertWhereToMongo({ tags: { like: 'javascript' } }, FILTERABLE, 'p1'), + ).toEqual({ + preFilter: null, + postFilter: { tags: { like: 'javascript' } }, + }) + }) + + test('contains routes the whole leaf to post-filter', () => { + expect( + convertWhereToMongo({ category: { contains: 'tech' } }, FILTERABLE, 'p1'), + ).toEqual({ + preFilter: null, + postFilter: { category: { contains: 'tech' } }, + }) + }) + + test('mixed pre + post operators on same leaf → entire leaf goes to post', () => { + expect( + convertWhereToMongo( + { tags: { equals: 'a', like: 'javascript' } }, + FILTERABLE, + 'p1', + ), + ).toEqual({ + preFilter: null, + postFilter: { tags: { equals: 'a', like: 'javascript' } }, + }) + }) + + test('all routes to post-filter', () => { + expect( + convertWhereToMongo({ tags: { all: ['a', 'b'] } }, FILTERABLE, 'p1'), + ).toEqual({ + preFilter: null, + postFilter: { tags: { all: ['a', 'b'] } }, + }) + }) + + test('unsupported geo op throws', () => { + expect(() => + convertWhereToMongo({ loc: { near: [0, 0] } }, ['loc'], 'p1'), + ).toThrowError(/not supported/) + }) +}) + +describe('convertWhereToMongo — and/or composition', () => { + test('and: all branches pre → combined preFilter via $and', () => { + const result = convertWhereToMongo( + { + and: [ + { status: { equals: 'published' } }, + { views: { greater_than: 100 } }, + ], + }, + FILTERABLE, + 'p1', + ) + expect(result).toEqual({ + preFilter: { + $and: [ + { status: { $eq: 'published' } }, + { 
views: { $gt: 100 } }, + ], + }, + postFilter: null, + }) + }) + + test('and: mix of pre + post → pre kept native, post in {and:[...]}', () => { + const result = convertWhereToMongo( + { + and: [ + { status: { equals: 'published' } }, + { tags: { like: 'javascript' } }, + ], + }, + FILTERABLE, + 'p1', + ) + expect(result).toEqual({ + preFilter: { status: { $eq: 'published' } }, + postFilter: { tags: { like: 'javascript' } }, + }) + }) + + test('or: all branches pre → combined preFilter via $or', () => { + const result = convertWhereToMongo( + { + or: [ + { status: { equals: 'draft' } }, + { status: { equals: 'archived' } }, + ], + }, + FILTERABLE, + 'p1', + ) + expect(result).toEqual({ + preFilter: { + $or: [ + { status: { $eq: 'draft' } }, + { status: { $eq: 'archived' } }, + ], + }, + postFilter: null, + }) + }) + + test('or: any branch is post → entire or goes to post-filter', () => { + const where: any = { + or: [ + { status: { equals: 'published' } }, + { tags: { like: 'javascript' } }, + ], + } + const result = convertWhereToMongo(where, FILTERABLE, 'p1') + expect(result.preFilter).toBeNull() + expect(result.postFilter).toEqual(where) + }) + + test('nested and/or: (published AND tech) OR (archived)', () => { + const where: any = { + or: [ + { + and: [ + { status: { equals: 'published' } }, + { category: { equals: 'tech' } }, + ], + }, + { status: { equals: 'archived' } }, + ], + } + const result = convertWhereToMongo(where, FILTERABLE, 'p1') + expect(result.preFilter).toEqual({ + $or: [ + { $and: [{ status: { $eq: 'published' } }, { category: { $eq: 'tech' } }] }, + { status: { $eq: 'archived' } }, + ], + }) + expect(result.postFilter).toBeNull() + }) + + test('and with single condition reduces to that condition', () => { + const result = convertWhereToMongo( + { and: [{ status: { equals: 'published' } }] }, + FILTERABLE, + 'p1', + ) + expect(result).toEqual({ + preFilter: { status: { $eq: 'published' } }, + postFilter: null, + }) + }) +}) + +import { evaluatePostFilter } from '../../src/convertWhere.js' + +describe('evaluatePostFilter', () => { + test('like with case-insensitive substring match', () => { + expect( + evaluatePostFilter({ tags: 'JavaScript' }, { tags: { like: 'javascript' } }), + ).toBe(true) + expect( + evaluatePostFilter({ tags: 'python' }, { tags: { like: 'javascript' } }), + ).toBe(false) + }) + + test('contains works on scalar string', () => { + expect( + evaluatePostFilter({ category: 'technology' }, { category: { contains: 'tech' } }), + ).toBe(true) + expect( + evaluatePostFilter({ category: 'design' }, { category: { contains: 'tech' } }), + ).toBe(false) + }) + + test('contains on array uses elemMatch-style', () => { + expect( + evaluatePostFilter({ tags: ['react', 'javascript'] }, { tags: { contains: 'java' } }), + ).toBe(true) + expect( + evaluatePostFilter({ tags: ['python'] }, { tags: { contains: 'java' } }), + ).toBe(false) + }) + + test('like with regex special chars does NOT match unintended values', () => { + // Pattern "foo.bar" must match the literal dot, not any char. 
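+    // (That behavior comes from escaping the pattern before building the regex,
+    // presumably via src/escapeRegExp.ts: 'foo.bar' escapes to 'foo\\.bar'.)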
+ expect( + evaluatePostFilter({ tags: 'fooXbar' }, { tags: { like: 'foo.bar' } }), + ).toBe(false) + expect( + evaluatePostFilter({ tags: 'foo.bar' }, { tags: { like: 'foo.bar' } }), + ).toBe(true) + }) + + test('all on array', () => { + expect( + evaluatePostFilter({ tags: ['a', 'b', 'c'] }, { tags: { all: ['a', 'b'] } }), + ).toBe(true) + expect( + evaluatePostFilter({ tags: ['a'] }, { tags: { all: ['a', 'b'] } }), + ).toBe(false) + }) + + test('and combinator', () => { + const w: any = { + and: [ + { status: { equals: 'published' } }, + { tags: { like: 'javascript' } }, + ], + } + expect( + evaluatePostFilter({ status: 'published', tags: 'JavaScript,react' }, w), + ).toBe(true) + expect( + evaluatePostFilter({ status: 'draft', tags: 'JavaScript,react' }, w), + ).toBe(false) + }) + + test('or combinator', () => { + const w: any = { + or: [ + { status: { equals: 'published' } }, + { tags: { like: 'javascript' } }, + ], + } + expect(evaluatePostFilter({ status: 'published', tags: 'python' }, w)).toBe(true) + expect(evaluatePostFilter({ status: 'draft', tags: 'JavaScript' }, w)).toBe(true) + expect(evaluatePostFilter({ status: 'draft', tags: 'python' }, w)).toBe(false) + }) + + test('pre-filter operators also evaluable in post path (for OR mixed branches)', () => { + expect( + evaluatePostFilter({ status: 'published' }, { status: { equals: 'published' } }), + ).toBe(true) + expect( + evaluatePostFilter({ views: 150 }, { views: { greater_than: 100 } }), + ).toBe(true) + expect( + evaluatePostFilter({ views: 50 }, { views: { greater_than: 100 } }), + ).toBe(false) + }) +}) + +describe('convertWhereToMongo — id mapping', () => { + test('id with 24-hex string maps to _id with ObjectId cast', () => { + const hex = '507f1f77bcf86cd799439011' + const result = convertWhereToMongo({ id: { equals: hex } }, [], 'p1') + expect(result.preFilter).toEqual({ _id: { $eq: new ObjectId(hex) } }) + expect(result.postFilter).toBeNull() + }) + + test('id with non-hex string maps to _id with raw value', () => { + const result = convertWhereToMongo({ id: { equals: 'not-an-objectid' } }, [], 'p1') + expect(result.preFilter).toEqual({ _id: { $eq: 'not-an-objectid' } }) + }) + + test('id with in array casts each 24-hex string', () => { + const a = '507f1f77bcf86cd799439011' + const b = 'plain-string-id' + const result = convertWhereToMongo({ id: { in: [a, b] } }, [], 'p1') + expect(result.preFilter).toEqual({ + _id: { $in: [new ObjectId(a), b] }, + }) + }) +}) + +describe('convertWhereToMongo — undeclared filter fields', () => { + test('throws when filtering on a field not in filterableFields and not reserved', () => { + expect(() => + convertWhereToMongo({ unknown_field: { equals: 'x' } }, [], 'default'), + ).toThrow(/filterableFields/) + }) +}) diff --git a/adapters/mongodb/dev/specs/ensureSearchIndex.spec.ts b/adapters/mongodb/dev/specs/ensureSearchIndex.spec.ts new file mode 100644 index 0000000..269c220 --- /dev/null +++ b/adapters/mongodb/dev/specs/ensureSearchIndex.spec.ts @@ -0,0 +1,180 @@ +import { afterEach, describe, expect, test, vi } from 'vitest' +import { __resetIndexCacheForTests, ensureSearchIndex } from '../../src/indexes.js' +import type { ResolvedPoolConfig } from '../../src/types.js' + +const POOL: ResolvedPoolConfig = { + dimensions: 4, + similarity: 'cosine', + filterableFields: [], + forceExact: false, + collectionName: 'vectorize_default', + indexName: 'vectorize_default_idx', +} + +afterEach(() => __resetIndexCacheForTests()) + +describe('ensureSearchIndex', () => { + 
test('listSearchIndexes errors propagate (no silent fallback)', async () => { + const collection = { + listSearchIndexes: () => ({ + toArray: async () => { + throw new Error('boom') + }, + }), + createSearchIndex: async () => undefined, + } + const client = { + db: () => ({ + collection: () => collection, + listCollections: () => ({ toArray: async () => [] }), + createCollection: async () => undefined, + }), + } as any + await expect(ensureSearchIndex(client, 'db', POOL)).rejects.toThrow('boom') + }) + + test('polls until status transitions from BUILDING to READY', async () => { + vi.useFakeTimers() + try { + const definition = { + fields: [ + { type: 'vector', path: 'embedding', numDimensions: 4, similarity: 'cosine' }, + { type: 'filter', path: 'sourceCollection' }, + { type: 'filter', path: 'docId' }, + { type: 'filter', path: 'embeddingVersion' }, + ], + } + let listCount = 0 + const list = vi.fn(() => ({ + toArray: async () => { + listCount += 1 + if (listCount === 1) return [] + if (listCount <= 3) { + return [{ name: POOL.indexName, status: 'BUILDING', latestDefinition: definition }] + } + return [{ name: POOL.indexName, status: 'READY', latestDefinition: definition }] + }, + })) + const create = vi.fn(async () => undefined) + const collection = { + listSearchIndexes: list, + createSearchIndex: create, + } + const client = { + db: () => ({ + collection: () => collection, + listCollections: () => ({ toArray: async () => [] }), + createCollection: async () => undefined, + }), + } as any + + const promise = ensureSearchIndex(client, 'db', POOL) + await vi.advanceTimersByTimeAsync(3000) + await promise + + expect(create).toHaveBeenCalledTimes(1) + expect(list).toHaveBeenCalledTimes(4) + } finally { + vi.useRealTimers() + } + }) + + test('treats existing index as equal when mongot returns reordered fields/keys', async () => { + const reorderedDefinition = { + fields: [ + { path: 'docId', type: 'filter' }, + { path: 'embeddingVersion', type: 'filter' }, + { similarity: 'cosine', path: 'embedding', numDimensions: 4, type: 'vector' }, + { path: 'sourceCollection', type: 'filter' }, + ], + } + const create = vi.fn(async () => undefined) + const collection = { + listSearchIndexes: () => ({ + toArray: async () => [ + { name: POOL.indexName, status: 'READY', latestDefinition: reorderedDefinition }, + ], + }), + createSearchIndex: create, + } + const client = { + db: () => ({ + collection: () => collection, + listCollections: () => ({ toArray: async () => [] }), + createCollection: async () => undefined, + }), + } as any + await expect(ensureSearchIndex(client, 'db', POOL)).resolves.toBeUndefined() + expect(create).not.toHaveBeenCalled() + }) + + test('throws when existing index has a genuinely different definition', async () => { + const differentDefinition = { + fields: [ + { type: 'vector', path: 'embedding', numDimensions: 4, similarity: 'euclidean' }, + { type: 'filter', path: 'sourceCollection' }, + { type: 'filter', path: 'docId' }, + { type: 'filter', path: 'embeddingVersion' }, + ], + } + const collection = { + listSearchIndexes: () => ({ + toArray: async () => [ + { name: POOL.indexName, status: 'READY', latestDefinition: differentDefinition }, + ], + }), + createSearchIndex: async () => undefined, + } + const client = { + db: () => ({ + collection: () => collection, + listCollections: () => ({ toArray: async () => [] }), + createCollection: async () => undefined, + }), + } as any + await expect(ensureSearchIndex(client, 'db', POOL)).rejects.toThrow(/different definition/) + }) + + 
test('concurrent ensureSearchIndex calls share one createSearchIndex call', async () => { + let createCount = 0 + const create = vi.fn(async () => { + createCount += 1 + }) + const collection = { + listSearchIndexes: () => ({ + toArray: async () => { + if (createCount === 0) return [] + return [ + { + name: POOL.indexName, + status: 'READY', + latestDefinition: { + fields: [ + { type: 'vector', path: 'embedding', numDimensions: 4, similarity: 'cosine' }, + { type: 'filter', path: 'sourceCollection' }, + { type: 'filter', path: 'docId' }, + { type: 'filter', path: 'embeddingVersion' }, + ], + }, + }, + ] + }, + }), + createSearchIndex: create, + } + const client = { + db: () => ({ + collection: () => collection, + listCollections: () => ({ toArray: async () => [] }), + createCollection: async () => undefined, + }), + } as any + + await Promise.all([ + ensureSearchIndex(client, 'db', POOL), + ensureSearchIndex(client, 'db', POOL), + ensureSearchIndex(client, 'db', POOL), + ]) + expect(create).toHaveBeenCalledTimes(1) + }) +}) diff --git a/adapters/mongodb/dev/specs/escapeRegExp.spec.ts b/adapters/mongodb/dev/specs/escapeRegExp.spec.ts new file mode 100644 index 0000000..9a7eeb5 --- /dev/null +++ b/adapters/mongodb/dev/specs/escapeRegExp.spec.ts @@ -0,0 +1,21 @@ +import { describe, expect, test } from 'vitest' +import { escapeRegExp } from '../../src/escapeRegExp.js' + +describe('escapeRegExp', () => { + test('escapes regex metacharacters', () => { + expect(escapeRegExp('foo.bar')).toBe('foo\\.bar') + expect(escapeRegExp('a*b')).toBe('a\\*b') + expect(escapeRegExp('(x)')).toBe('\\(x\\)') + expect(escapeRegExp('a+b?c')).toBe('a\\+b\\?c') + expect(escapeRegExp('[abc]')).toBe('\\[abc\\]') + expect(escapeRegExp('a\\b')).toBe('a\\\\b') + expect(escapeRegExp('a^b$')).toBe('a\\^b\\$') + expect(escapeRegExp('a|b')).toBe('a\\|b') + expect(escapeRegExp('{1,2}')).toBe('\\{1,2\\}') + }) + + test('returns plain string unchanged', () => { + expect(escapeRegExp('hello world')).toBe('hello world') + expect(escapeRegExp('')).toBe('') + }) +}) diff --git a/adapters/mongodb/dev/specs/extensionFields.spec.ts b/adapters/mongodb/dev/specs/extensionFields.spec.ts new file mode 100644 index 0000000..ac5d1f2 --- /dev/null +++ b/adapters/mongodb/dev/specs/extensionFields.spec.ts @@ -0,0 +1,109 @@ +// adapters/mongodb/dev/specs/extensionFields.spec.ts +import { afterAll, beforeAll, describe, expect, test } from 'vitest' +import { MongoClient } from 'mongodb' +import type { BasePayload } from 'payload' +import type { DbAdapter } from 'payloadcms-vectorize' +import { DIMS, MONGO_URI } from './constants.js' +import { buildMongoTestPayload, teardownDbs } from './utils.js' +import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from '@shared-test/helpers/embed' + +const DB = `mongo_extension_fields_${Date.now()}` + +describe('Extension fields (mongodb)', () => { + let payload: BasePayload + let adapter: DbAdapter + + beforeAll(async () => { + const built = await buildMongoTestPayload({ + uri: MONGO_URI, + dbName: DB, + pools: { + default: { + dimensions: DIMS, + filterableFields: ['category', 'priority'], + }, + }, + collections: [ + { + slug: 'posts', + fields: [ + { name: 'title', type: 'text' }, + { name: 'category', type: 'text' }, + { name: 'priority', type: 'number' }, + ], + }, + ], + knowledgePools: { + default: { + collections: {}, + extensionFields: [ + { name: 'category', type: 'text' }, + { name: 'priority', type: 'number' }, + ], + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: 
makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, + }, + }, + }) + payload = built.payload + adapter = built.adapter + }) + + afterAll(async () => { + await teardownDbs(payload, MONGO_URI, DB) + }) + + test('search index declares extension fields as filterable', async () => { + await adapter.storeChunk(payload, 'default', { + sourceCollection: 'posts', + docId: 'doc-bootstrap', + chunkIndex: 0, + chunkText: 'bootstrap', + embeddingVersion: testEmbeddingVersion, + embedding: Array(DIMS).fill(0.1), + extensionFields: { category: 'cat-a', priority: 1 }, + }) + + const c = new MongoClient(MONGO_URI) + await c.connect() + const indexes = (await c + .db(`${DB}_vectors`) + .collection('vectorize_default') + .listSearchIndexes('vectorize_default_idx') + .toArray()) as Array<{ latestDefinition: { fields: Array<{ type: string; path: string }> } }> + await c.close() + + const def = indexes[0]?.latestDefinition + expect(def).toBeDefined() + const filterPaths = def!.fields.filter((f) => f.type === 'filter').map((f) => f.path) + expect(filterPaths).toContain('sourceCollection') + expect(filterPaths).toContain('docId') + expect(filterPaths).toContain('embeddingVersion') + expect(filterPaths).toContain('category') + expect(filterPaths).toContain('priority') + }, 90_000) + + test('extensionFields are persisted on the chunk document and returned by search', async () => { + const target = Array(DIMS).fill(0.42) + await adapter.storeChunk(payload, 'default', { + sourceCollection: 'posts', + docId: 'doc-1', + chunkIndex: 0, + chunkText: 'hello', + embeddingVersion: testEmbeddingVersion, + embedding: target, + extensionFields: { category: 'cat-a', priority: 7 }, + }) + + await new Promise((r) => setTimeout(r, 1500)) + + const r = await adapter.search(payload, target, 'default', 5) + const hit = r.find((x) => x.docId === 'doc-1') + expect(hit).toBeDefined() + expect((hit as any).category).toBe('cat-a') + expect((hit as any).priority).toBe(7) + }, 90_000) +}) diff --git a/adapters/mongodb/dev/specs/integration.spec.ts b/adapters/mongodb/dev/specs/integration.spec.ts new file mode 100644 index 0000000..a5c59b1 --- /dev/null +++ b/adapters/mongodb/dev/specs/integration.spec.ts @@ -0,0 +1,184 @@ +import { afterAll, beforeAll, describe, expect, test } from 'vitest' +import { MongoClient } from 'mongodb' +import type { BasePayload } from 'payload' +import type { DbAdapter } from 'payloadcms-vectorize' +import { DIMS, MONGO_URI } from './constants.js' +import { buildMongoTestPayload, teardownDbs } from './utils.js' +import { + makeDummyEmbedDocs, + makeDummyEmbedQuery, + testEmbeddingVersion, +} from '@shared-test/helpers/embed' + +const DB1 = `vectorize_mongo_int_${Date.now()}_a` +const DB1_VECTORS = `${DB1}_vectors` + +describe('Mongo-specific integration tests', () => { + let adapter: DbAdapter + let payload: BasePayload + + beforeAll(async () => { + const built = await buildMongoTestPayload({ + uri: MONGO_URI, + dbName: DB1, + pools: { + default: { dimensions: DIMS, numCandidates: 50 }, + secondary: { dimensions: DIMS, numCandidates: 50 }, + }, + knowledgePools: { + default: { + collections: {}, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, + }, + secondary: { + collections: {}, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, + }, + }, + }) + adapter = built.adapter + payload = 
built.payload + }) + + afterAll(async () => { + await teardownDbs(payload, MONGO_URI, DB1) + }) + + test('ensureSearchIndex is idempotent across multiple storeChunk calls', async () => { + for (let i = 0; i < 3; i++) { + await adapter.storeChunk(payload, 'default', { + sourceCollection: 'idempotent', + docId: `id-${i}`, + chunkIndex: 0, + chunkText: `chunk ${i}`, + embeddingVersion: 'v1', + embedding: Array(DIMS).fill(0.1 + i * 0.01), + extensionFields: {}, + }) + } + + const c = new MongoClient(MONGO_URI) + await c.connect() + const indexes = (await c + .db(DB1_VECTORS) + .collection('vectorize_default') + .listSearchIndexes() + .toArray()) as Array<{ name: string }> + const matches = indexes.filter((i) => i.name === 'vectorize_default_idx') + expect(matches.length).toBe(1) + await c.close() + }, 90_000) + + test('storeChunk → immediate search returns the inserted doc', async () => { + const docId = `imm-${Date.now()}` + const target = Array(DIMS).fill(0.42) + await adapter.storeChunk(payload, 'default', { + sourceCollection: 'immediate', + docId, + chunkIndex: 0, + chunkText: 'immediate test', + embeddingVersion: 'v1', + embedding: target, + extensionFields: {}, + }) + await new Promise((r) => setTimeout(r, 1200)) + const r = await adapter.search(payload, target, 'default', 5) + const found = r.some((x) => x.docId === docId) + expect(found).toBe(true) + }) + + test('multiple pools coexist without collision', async () => { + await adapter.storeChunk(payload, 'secondary', { + sourceCollection: 'sec', + docId: 'sec-1', + chunkIndex: 0, + chunkText: 'secondary pool', + embeddingVersion: 'v1', + embedding: Array(DIMS).fill(0.9), + extensionFields: {}, + }) + + const c = new MongoClient(MONGO_URI) + await c.connect() + const a = await c.db(DB1_VECTORS).collection('vectorize_default').countDocuments() + const b = await c.db(DB1_VECTORS).collection('vectorize_secondary').countDocuments() + expect(a).toBeGreaterThan(0) + expect(b).toBeGreaterThan(0) + await c.close() + }, 90_000) + + test('conflicting index definition throws actionable error', async () => { + // Boot a fresh payload, then pre-seed a conflicting index in the vectors DB + // BEFORE the adapter's first storeChunk runs ensureSearchIndex. + const conflictBase = `${DB1}_conflict` + const conflictVectorsDb = `${conflictBase}_vectors` + + const built = await buildMongoTestPayload({ + uri: MONGO_URI, + dbName: conflictBase, + pools: { default: { dimensions: DIMS, similarity: 'cosine', numCandidates: 50 } }, + knowledgePools: { + default: { + collections: {}, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, + }, + }, + }) + + const c = new MongoClient(MONGO_URI) + await c.connect() + const coll = c.db(conflictVectorsDb).collection('vectorize_default') + // Ensure the collection exists by inserting a sentinel doc, then drop it. 
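+    // (Mirrors ensureCollectionExists in src/indexes.ts: createSearchIndex assumes the collection already exists.)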
+ await coll.insertOne({ _bootstrap: true }) + await coll.deleteMany({ _bootstrap: true }) + await coll.createSearchIndex({ + name: 'vectorize_default_idx', + type: 'vectorSearch', + definition: { + fields: [ + { type: 'vector', path: 'embedding', numDimensions: DIMS, similarity: 'euclidean' }, + { type: 'filter', path: 'sourceCollection' }, + { type: 'filter', path: 'docId' }, + { type: 'filter', path: 'embeddingVersion' }, + ], + }, + }) + + const deadline = Date.now() + 30_000 + while (Date.now() < deadline) { + const list = (await coll.listSearchIndexes('vectorize_default_idx').toArray()) as Array<{ name: string; status: string }> + const status = list.find((i) => i.name === 'vectorize_default_idx')?.status + if (status === 'BUILDING' || status === 'READY') break + await new Promise((r) => setTimeout(r, 200)) + } + + try { + await expect( + built.adapter.storeChunk(built.payload, 'default', { + sourceCollection: 'x', + docId: 'x-1', + chunkIndex: 0, + chunkText: 'should fail', + embeddingVersion: 'v1', + embedding: Array(DIMS).fill(0.5), + extensionFields: {}, + }), + ).rejects.toThrowError(/different definition/) + } finally { + await teardownDbs(built.payload, MONGO_URI, conflictBase) + await c.close() + } + }, 90_000) +}) diff --git a/adapters/mongodb/dev/specs/multipools.spec.ts b/adapters/mongodb/dev/specs/multipools.spec.ts new file mode 100644 index 0000000..ea1cff8 --- /dev/null +++ b/adapters/mongodb/dev/specs/multipools.spec.ts @@ -0,0 +1,115 @@ +// adapters/mongodb/dev/specs/multipools.spec.ts +import { afterAll, beforeAll, describe, expect, test } from 'vitest' +import { MongoClient } from 'mongodb' +import type { BasePayload } from 'payload' +import type { DbAdapter } from 'payloadcms-vectorize' +import { MONGO_URI } from './constants.js' +import { buildMongoTestPayload, teardownDbs } from './utils.js' +import { makeDummyEmbedDocs, makeDummyEmbedQuery } from '@shared-test/helpers/embed' + +const DB = `mongo_multipools_${Date.now()}` +const VECTOR_DB = `${DB}_vectors` +const DIMS_A = 8 +const DIMS_B = 16 + +describe('Multiple knowledge pools (mongodb)', () => { + let payload: BasePayload + let adapter: DbAdapter + + beforeAll(async () => { + const built = await buildMongoTestPayload({ + uri: MONGO_URI, + dbName: DB, + pools: { + pool_a: { dimensions: DIMS_A }, + pool_b: { dimensions: DIMS_B }, + }, + knowledgePools: { + pool_a: { + collections: {}, + embeddingConfig: { + version: 'test-pool-a', + queryFn: makeDummyEmbedQuery(DIMS_A), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS_A), + }, + }, + pool_b: { + collections: {}, + embeddingConfig: { + version: 'test-pool-b', + queryFn: makeDummyEmbedQuery(DIMS_B), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS_B), + }, + }, + }, + }) + payload = built.payload + adapter = built.adapter + }) + + afterAll(async () => { + await teardownDbs(payload, MONGO_URI, DB) + }) + + test('each pool gets its own collection and search index', async () => { + await adapter.storeChunk(payload, 'pool_a', { + sourceCollection: 'src', + docId: 'a-1', + chunkIndex: 0, + chunkText: 'a', + embeddingVersion: 'test-pool-a', + embedding: Array(DIMS_A).fill(0.5), + extensionFields: {}, + }) + await adapter.storeChunk(payload, 'pool_b', { + sourceCollection: 'src', + docId: 'b-1', + chunkIndex: 0, + chunkText: 'b', + embeddingVersion: 'test-pool-b', + embedding: Array(DIMS_B).fill(0.5), + extensionFields: {}, + }) + + const c = new MongoClient(MONGO_URI) + await c.connect() + const collections = (await c + .db(VECTOR_DB) + .listCollections({}, { nameOnly: 
true })
+      .toArray()) as Array<{ name: string }>
+    const names = collections.map((x) => x.name)
+    expect(names).toEqual(expect.arrayContaining(['vectorize_pool_a', 'vectorize_pool_b']))
+
+    for (const [coll, expectedDims] of [
+      ['vectorize_pool_a', DIMS_A],
+      ['vectorize_pool_b', DIMS_B],
+    ] as const) {
+      const idx = (await c.db(VECTOR_DB).collection(coll).listSearchIndexes().toArray()) as Array<{
+        name: string
+        latestDefinition: { fields: Array<{ type: string; numDimensions?: number }> }
+      }>
+      const vectorField = idx[0].latestDefinition.fields.find((f) => f.type === 'vector')
+      expect(vectorField?.numDimensions).toBe(expectedDims)
+    }
+    await c.close()
+  }, 120_000)
+
+  test('search isolation: a vector written to pool_a is not returned from pool_b', async () => {
+    await adapter.storeChunk(payload, 'pool_a', {
+      sourceCollection: 'src',
+      docId: 'a-iso',
+      chunkIndex: 0,
+      chunkText: 'isolated-a',
+      embeddingVersion: 'test-pool-a',
+      embedding: Array(DIMS_A).fill(0.99),
+      extensionFields: {},
+    })
+    await new Promise((r) => setTimeout(r, 1500))
+
+    const aResults = await adapter.search(payload, Array(DIMS_A).fill(0.99), 'pool_a', 5)
+    expect(aResults.some((x) => x.docId === 'a-iso')).toBe(true)
+
+    const bResults = await adapter.search(payload, Array(DIMS_B).fill(0.99), 'pool_b', 5)
+    expect(bResults.some((x) => x.docId === 'a-iso')).toBe(false)
+  }, 90_000)
+})
diff --git a/adapters/mongodb/dev/specs/utils.ts b/adapters/mongodb/dev/specs/utils.ts
new file mode 100644
index 0000000..9966ab8
--- /dev/null
+++ b/adapters/mongodb/dev/specs/utils.ts
@@ -0,0 +1,117 @@
+import { MongoClient } from 'mongodb'
+import { buildConfig, getPayload } from 'payload'
+import { mongooseAdapter } from '@payloadcms/db-mongodb'
+import { lexicalEditor } from '@payloadcms/richtext-lexical'
+import payloadcmsVectorize from 'payloadcms-vectorize'
+import type { BasePayload, CollectionConfig } from 'payload'
+import type { KnowledgePoolDynamicConfig } from 'payloadcms-vectorize'
+
+export type KnowledgePoolsConfig = Record<string, KnowledgePoolDynamicConfig>
+import { __closeForTests } from '../../src/client.js'
+import { __resetIndexCacheForTests } from '../../src/indexes.js'
+import { createMongoVectorIntegration } from '../../src/index.js'
+import type { MongoVectorIntegrationConfig } from '../../src/types.js'
+
+export interface BuildMongoTestPayloadArgs {
+  uri: string
+  dbName: string
+  pools: MongoVectorIntegrationConfig['pools']
+  collections?: CollectionConfig[]
+  knowledgePools: KnowledgePoolsConfig
+}
+
+export async function buildMongoTestPayload(args: BuildMongoTestPayloadArgs): Promise<{
+  payload: BasePayload
+  adapter: ReturnType<typeof createMongoVectorIntegration>['adapter']
+}> {
+  const vectorDbName = `${args.dbName}_vectors`
+
+  await dropTestDb(args.uri, args.dbName)
+  await dropTestDb(args.uri, vectorDbName)
+
+  const { adapter } = createMongoVectorIntegration({
+    uri: args.uri,
+    dbName: vectorDbName,
+    pools: args.pools,
+  })
+
+  const config = await buildConfig({
+    secret: 'test-secret',
+    editor: lexicalEditor(),
+    collections: args.collections ?? [],
+    db: mongooseAdapter({ url: injectDbName(args.uri, args.dbName) }),
+    plugins: [
+      payloadcmsVectorize({
+        dbAdapter: adapter,
+        knowledgePools: args.knowledgePools,
+      }),
+    ],
+  })
+
+  const payload = await getPayload({
+    config,
+    key: `mongodb-test-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
+    cron: false,
+  })
+  return { payload, adapter }
+}
+
+/**
+ * Insert a database name into a Mongo connection string between the host
+ * and the optional query string. Requires a path-less URI (host[:port] only,
+ * optionally followed by `?query`). Throws on URIs that already carry a path
+ * component (e.g. `mongodb+srv://cluster/myapp`) — concatenating onto those
+ * silently produces an invalid double-path URI like `.../myapp/test`.
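+ * For example: injectDbName('mongodb://localhost:27018/?directConnection=true', 'db1')
+ * yields 'mongodb://localhost:27018/db1?directConnection=true'.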
+ */
+function injectDbName(uri: string, dbName: string): string {
+  const queryIdx = uri.indexOf('?')
+  const base = queryIdx === -1 ? uri : uri.slice(0, queryIdx)
+  const query = queryIdx === -1 ? '' : uri.slice(queryIdx)
+  const schemeEnd = base.indexOf('://')
+  const afterScheme = schemeEnd === -1 ? base : base.slice(schemeEnd + 3)
+  const slashIdx = afterScheme.indexOf('/')
+  if (slashIdx !== -1 && afterScheme.slice(slashIdx + 1).replace(/\/+$/, '').length > 0) {
+    throw new Error(
+      `[buildMongoTestPayload] Mongo URI must be path-less (host[:port] only); got ${uri}. ` +
+        `Strip the default-DB path before passing in.`,
+    )
+  }
+  const baseNoSlash = base.replace(/\/+$/, '')
+  return `${baseNoSlash}/${dbName}${query}`
+}
+
+export async function dropTestDb(uri: string, dbName: string): Promise<void> {
+  const c = new MongoClient(uri)
+  try {
+    await c.connect()
+    await c.db(dbName).dropDatabase()
+  } catch {
+    // ignore
+  } finally {
+    await c.close()
+  }
+}
+
+/**
+ * Tear down a booted test payload + both databases + module caches.
+ *
+ * Mirrors the pg adapter's `destroyPayload` pattern: destroying the payload
+ * instance closes the Mongoose connection opened by `mongooseAdapter`. Without
+ * this, each spec leaks a live Mongoose connection and the suite eventually
+ * exhausts the pool.
+ */
+export async function teardownDbs(
+  payload: BasePayload,
+  uri: string,
+  dbName: string,
+): Promise<void> {
+  try {
+    await payload.destroy()
+  } catch {
+    // ignore — destroy is best-effort during teardown
+  }
+  await dropTestDb(uri, dbName)
+  await dropTestDb(uri, `${dbName}_vectors`)
+  __resetIndexCacheForTests()
+  await __closeForTests()
+}
diff --git a/adapters/mongodb/dev/specs/vectorSearchWhere.spec.ts b/adapters/mongodb/dev/specs/vectorSearchWhere.spec.ts
new file mode 100644
index 0000000..4488c5b
--- /dev/null
+++ b/adapters/mongodb/dev/specs/vectorSearchWhere.spec.ts
@@ -0,0 +1,351 @@
+import { afterAll, beforeAll, describe, expect, test } from 'vitest'
+import type { BasePayload, Where } from 'payload'
+import type { DbAdapter, VectorSearchResult } from 'payloadcms-vectorize'
+import { DIMS, MONGO_URI } from './constants.js'
+import { buildMongoTestPayload, teardownDbs } from './utils.js'
+import {
+  makeDummyEmbedDocs,
+  makeDummyEmbedQuery,
+  testEmbeddingVersion,
+} from '@shared-test/helpers/embed'
+
+const TEST_DB = `vectorize_mongo_where_${Date.now()}`
+const FILTERABLE = ['status', 'category', 'views', 'rating', 'published', 'tags']
+
+const articles = [
+  {
+    title: 'Published Tech Article',
+    status: 'published', category: 'tech', views: 150,
+    rating: 4.5, published: true, tags: 'javascript,nodejs,programming',
+  },
+  {
+    title: 'Draft Tech Article',
+    status: 'draft', category: 'tech', views: 0,
+    rating: 0, published: false, tags: 'javascript',
+  },
+  {
+    title: 'Published Design Article',
+    status: 'published', category: 'design', views: 300,
+    rating: 4.8, published: true, tags: 'ui,design,ux',
+  },
+  {
+    title: 'Archived Tech Article',
+    status: 'archived', category: 'tech', views: 50,
+    rating: 3.5, published: false, tags: 'python,legacy',
+  },
+]
+
+async function performVectorSearch(
+  payload: BasePayload,
+  adapter: DbAdapter,
+  where?: Where,
+  limit = 10,
+): Promise<VectorSearchResult[]> {
+  const queryEmbedding =
Array(DIMS).fill(0.5) + return adapter.search(payload, queryEmbedding, 'default', limit, where) +} + +describe('Mongo adapter — WHERE clause operators', () => { + let adapter: DbAdapter + let payload: BasePayload + + beforeAll(async () => { + const built = await buildMongoTestPayload({ + uri: MONGO_URI, + dbName: TEST_DB, + pools: { + default: { + dimensions: DIMS, + filterableFields: FILTERABLE, + numCandidates: 50, + }, + }, + knowledgePools: { + default: { + collections: {}, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, + }, + }, + }) + adapter = built.adapter + payload = built.payload + + let i = 0 + for (const a of articles) { + const embedding = Array(DIMS).fill(0.5).map((v) => v + Math.random() * 0.05) + await adapter.storeChunk(payload, 'default', { + sourceCollection: 'articles', + docId: `art-${i++}`, + chunkIndex: 0, + chunkText: a.title, + embeddingVersion: 'v1', + embedding, + extensionFields: { + status: a.status, + category: a.category, + views: a.views, + rating: a.rating, + published: a.published, + tags: a.tags, + }, + }) + } + await new Promise((r) => setTimeout(r, 1200)) + }, 90_000) + + afterAll(async () => { + await teardownDbs(payload, MONGO_URI, TEST_DB) + }) + + describe('equals operator', () => { + test('filters by exact text match', async () => { + const r = await performVectorSearch(payload, adapter, { status: { equals: 'published' } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.status).toBe('published')) + }) + + test('returns empty when no match', async () => { + const r = await performVectorSearch(payload, adapter, { status: { equals: 'missing' } }) + expect(r).toEqual([]) + }) + }) + + describe('not_equals / notEquals operator', () => { + test('filters by non-equal text match', async () => { + const r = await performVectorSearch(payload, adapter, { status: { not_equals: 'draft' } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.status).not.toBe('draft')) + }) + + test('notEquals variant', async () => { + const r = await performVectorSearch(payload, adapter, { status: { notEquals: 'archived' } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.status).not.toBe('archived')) + }) + }) + + describe('in / not_in / notIn operators', () => { + test('in', async () => { + const r = await performVectorSearch(payload, adapter, { status: { in: ['published', 'draft'] } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(['published', 'draft']).toContain(x.status)) + }) + test('not_in', async () => { + const r = await performVectorSearch(payload, adapter, { status: { not_in: ['draft', 'archived'] } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(['draft', 'archived']).not.toContain(x.status)) + }) + test('notIn', async () => { + const r = await performVectorSearch(payload, adapter, { status: { notIn: ['archived'] } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.status).not.toBe('archived')) + }) + }) + + describe('like / contains operators (post-filter)', () => { + test('like substring match', async () => { + const r = await performVectorSearch(payload, adapter, { tags: { like: 'javascript' } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect((x.tags as string).toLowerCase()).toContain('javascript')) + }) + test('contains substring match', async () => { + const r = await performVectorSearch(payload, adapter, { category: { contains: 'tech' } }) 
+ expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.category).toContain('tech')) + }) + test('like regex special chars do NOT match unintended values', async () => { + // None of our fixtures contain "foo.bar" — the dot must be escaped. + const r = await performVectorSearch(payload, adapter, { tags: { like: 'foo.bar' } }) + expect(r).toEqual([]) + }) + }) + + describe('comparison operators (numbers)', () => { + test('greater_than', async () => { + const r = await performVectorSearch(payload, adapter, { views: { greater_than: 100 } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.views).toBeGreaterThan(100)) + }) + test('greaterThan variant', async () => { + const r = await performVectorSearch(payload, adapter, { views: { greaterThan: 100 } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.views).toBeGreaterThan(100)) + }) + test('greater_than_equal', async () => { + const r = await performVectorSearch(payload, adapter, { views: { greater_than_equal: 150 } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.views).toBeGreaterThanOrEqual(150)) + }) + test('less_than', async () => { + const r = await performVectorSearch(payload, adapter, { views: { less_than: 200 } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.views).toBeLessThan(200)) + }) + test('less_than_equal', async () => { + const r = await performVectorSearch(payload, adapter, { views: { less_than_equal: 150 } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.views).toBeLessThanOrEqual(150)) + }) + test('lessThan variant on float', async () => { + const r = await performVectorSearch(payload, adapter, { rating: { lessThan: 4.6 } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.rating).toBeLessThan(4.6)) + }) + test('range via and', async () => { + const r = await performVectorSearch(payload, adapter, { + and: [{ views: { greater_than: 50 } }, { views: { less_than: 200 } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => { + expect(x.views).toBeGreaterThan(50) + expect(x.views).toBeLessThan(200) + }) + }) + }) + + describe('exists operator', () => { + test('exists true', async () => { + const r = await performVectorSearch(payload, adapter, { category: { exists: true } }) + r.forEach((x) => expect(x.category != null).toBe(true)) + }) + test('exists false', async () => { + const r = await performVectorSearch(payload, adapter, { category: { exists: false } }) + r.forEach((x) => expect(x.category == null).toBe(true)) + }) + }) + + describe('AND operator', () => { + test('text + text', async () => { + const r = await performVectorSearch(payload, adapter, { + and: [{ status: { equals: 'published' } }, { category: { equals: 'tech' } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => { + expect(x.status).toBe('published') + expect(x.category).toBe('tech') + }) + }) + test('text + numeric', async () => { + const r = await performVectorSearch(payload, adapter, { + and: [{ status: { equals: 'published' } }, { views: { greater_than: 100 } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => { + expect(x.status).toBe('published') + expect(x.views).toBeGreaterThan(100) + }) + }) + test('and with single condition', async () => { + const r = await performVectorSearch(payload, adapter, { + and: [{ status: { equals: 'published' } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.status).toBe('published')) + }) + test('and with one pre + one post operator', 
async () => { + const r = await performVectorSearch(payload, adapter, { + and: [{ status: { equals: 'published' } }, { tags: { like: 'javascript' } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => { + expect(x.status).toBe('published') + expect((x.tags as string).toLowerCase()).toContain('javascript') + }) + }) + }) + + describe('OR operator', () => { + test('two text branches', async () => { + const r = await performVectorSearch(payload, adapter, { + or: [{ status: { equals: 'draft' } }, { status: { equals: 'archived' } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(['draft', 'archived']).toContain(x.status)) + }) + test('two numeric branches', async () => { + const r = await performVectorSearch(payload, adapter, { + or: [{ views: { greater_than: 200 } }, { rating: { greater_than: 4.7 } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => { + const a = (x.views as number) > 200 + const b = (x.rating as number) > 4.7 + expect(a || b).toBe(true) + }) + }) + test('or with single condition', async () => { + const r = await performVectorSearch(payload, adapter, { + or: [{ status: { equals: 'published' } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.status).toBe('published')) + }) + test('or with one post-filter branch routes whole or to post', async () => { + const r = await performVectorSearch(payload, adapter, { + or: [{ status: { equals: 'published' } }, { tags: { like: 'python' } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => { + const a = x.status === 'published' + const b = (x.tags as string).toLowerCase().includes('python') + expect(a || b).toBe(true) + }) + }) + }) + + describe('complex nested logic', () => { + test('(published AND tech) OR archived', async () => { + const r = await performVectorSearch(payload, adapter, { + or: [ + { + and: [ + { status: { equals: 'published' } }, + { category: { equals: 'tech' } }, + ], + }, + { status: { equals: 'archived' } }, + ], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => { + const tech = x.status === 'published' && x.category === 'tech' + const arch = x.status === 'archived' + expect(tech || arch).toBe(true) + }) + }) + }) + + describe('reserved fields filterable without declaration', () => { + test('sourceCollection equals works on a pool that did not declare it', async () => { + const r = await performVectorSearch(payload, adapter, { + sourceCollection: { equals: 'articles' }, + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.sourceCollection).toBe('articles')) + }) + }) + + describe('configuration errors', () => { + test('filtering on undeclared field throws clearly', async () => { + await expect( + performVectorSearch(payload, adapter, { + undeclared: { equals: 'x' }, + } as any), + ).rejects.toThrowError(/not configured as filterableFields/) + }) + }) + + describe('limit', () => { + test('returns at most `limit` results ordered by score', async () => { + const r = await performVectorSearch(payload, adapter, undefined, 2) + expect(r.length).toBeLessThanOrEqual(2) + for (let i = 1; i < r.length; i++) { + expect(r[i - 1].score).toBeGreaterThanOrEqual(r[i].score) + } + }) + }) +}) diff --git a/adapters/mongodb/package.json b/adapters/mongodb/package.json new file mode 100644 index 0000000..cbb29c6 --- /dev/null +++ b/adapters/mongodb/package.json @@ -0,0 +1,65 @@ +{ + "name": "@payloadcms-vectorize/mongodb", + "version": "0.7.2", + "description": "MongoDB Atlas + self-hosted vectorSearch adapter for 
payloadcms-vectorize", + "license": "MIT", + "repository": { + "type": "git", + "url": "git+https://github.com/techiejd/payloadcms-vectorize.git", + "directory": "adapters/mongodb" + }, + "homepage": "https://github.com/techiejd/payloadcms-vectorize/tree/main/adapters/mongodb#readme", + "bugs": { + "url": "https://github.com/techiejd/payloadcms-vectorize/issues" + }, + "type": "module", + "files": [ + "dist", + "README.md" + ], + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js", + "default": "./dist/index.js" + } + }, + "scripts": { + "test:setup": "docker-compose -f dev/docker-compose.yml up -d", + "test:teardown": "docker-compose -f dev/docker-compose.yml down" + }, + "keywords": [ + "payloadcms", + "mongodb", + "vector-search", + "rag" + ], + "peerDependencies": { + "mongodb": ">=6.0.0", + "payload": ">=3.0.0 <4.0.0", + "payloadcms-vectorize": ">=0.7.2" + }, + "devDependencies": { + "@payloadcms/db-mongodb": "3.69.0", + "@payloadcms/richtext-lexical": "3.69.0", + "mongodb": "^6.10.0", + "payloadcms-vectorize": "workspace:*" + }, + "engines": { + "node": "^18.20.2 || >=20.9.0", + "pnpm": "^9 || ^10" + }, + "publishConfig": { + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js", + "default": "./dist/index.js" + } + }, + "main": "./dist/index.js", + "types": "./dist/index.d.ts" + } +} diff --git a/adapters/mongodb/src/client.ts b/adapters/mongodb/src/client.ts new file mode 100644 index 0000000..55bd4fd --- /dev/null +++ b/adapters/mongodb/src/client.ts @@ -0,0 +1,32 @@ +import { MongoClient } from 'mongodb' + +const clientCache = new Map>() + +export function getMongoClient(uri: string): Promise { + let p = clientCache.get(uri) + if (!p) { + p = MongoClient.connect(uri).catch((err) => { + clientCache.delete(uri) + throw err + }) + clientCache.set(uri, p) + } + return p +} + +/** + * Test-only helper. NOT exported from `index.ts` — referenced by the dev test + * suites via deep import to avoid leaking into the published API. 
+ */
+export async function __closeForTests(): Promise<void> {
+  const promises = Array.from(clientCache.values())
+  clientCache.clear()
+  for (const p of promises) {
+    try {
+      const c = await p
+      await c.close()
+    } catch {
+      // ignore; client may not have connected
+    }
+  }
+}
diff --git a/adapters/mongodb/src/convertWhere.ts b/adapters/mongodb/src/convertWhere.ts
new file mode 100644
index 0000000..52dfd4b
--- /dev/null
+++ b/adapters/mongodb/src/convertWhere.ts
@@ -0,0 +1,220 @@
+import type { Where } from 'payload'
+import { ObjectId } from 'mongodb'
+import { escapeRegExp } from './escapeRegExp.js'
+import { RESERVED_FILTER_FIELDS } from './types.js'
+
+export interface ConvertResult {
+  preFilter: Record<string, unknown> | null
+  postFilter: Where | null
+}
+
+const PRE_OPS = new Map([
+  ['equals', '$eq'],
+  ['not_equals', '$ne'],
+  ['notEquals', '$ne'],
+  ['in', '$in'],
+  ['not_in', '$nin'],
+  ['notIn', '$nin'],
+  ['greater_than', '$gt'],
+  ['greaterThan', '$gt'],
+  ['greater_than_equal', '$gte'],
+  ['greaterThanEqual', '$gte'],
+  ['less_than', '$lt'],
+  ['lessThan', '$lt'],
+  ['less_than_equal', '$lte'],
+  ['lessThanEqual', '$lte'],
+])
+
+const POST_OPS = new Set(['like', 'contains', 'all'])
+const UNSUPPORTED_OPS = new Set(['near', 'within', 'intersects'])
+
+const HEX24 = /^[a-f\d]{24}$/i
+
+function castIdValue(v: unknown): unknown {
+  if (typeof v === 'string' && HEX24.test(v)) return new ObjectId(v)
+  return v
+}
+
+function castIdOperand(op: string, v: unknown): unknown {
+  if (op === 'in' || op === 'not_in' || op === 'notIn') {
+    return Array.isArray(v) ? v.map(castIdValue) : v
+  }
+  return castIdValue(v)
+}
+
+function isFilterable(field: string, filterable: string[]): boolean {
+  if (field === 'id') return true
+  return (
+    (RESERVED_FILTER_FIELDS as readonly string[]).includes(field) ||
+    filterable.includes(field)
+  )
+}
+
+function leafToPre(field: string, cond: Record<string, unknown>): Record<string, unknown> {
+  const targetField = field === 'id' ? '_id' : field
+  const clauses: Record<string, unknown>[] = []
+  for (const [op, val] of Object.entries(cond)) {
+    if (op === 'exists') {
+      if (val === true) {
+        clauses.push({ [targetField]: { $exists: true, $ne: null } })
+      } else {
+        clauses.push({
+          $or: [
+            { [targetField]: { $exists: false } },
+            { [targetField]: { $eq: null } },
+          ],
+        })
+      }
+      continue
+    }
+    const mongoOp = PRE_OPS.get(op)
+    if (!mongoOp) continue
+    const operand = field === 'id' ? castIdOperand(op, val) : val
+    clauses.push({ [targetField]: { [mongoOp]: operand } })
+  }
+  if (clauses.length === 0) return {}
+  if (clauses.length === 1) return clauses[0]
+  return { $and: clauses }
+}
+
+function convertLeaf(
+  where: Where,
+  filterable: string[],
+  poolName: string,
+): ConvertResult {
+  const keys = Object.keys(where)
+  if (keys.length !== 1) {
+    // Multiple top-level fields on the same object: treat as implicit AND.
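+    // e.g. { status: { equals: 'published' }, views: { greater_than: 100 } } is rewritten to
+    // { and: [{ status: { equals: 'published' } }, { views: { greater_than: 100 } }] }.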
+    const synthetic: Where = { and: keys.map((k) => ({ [k]: where[k] }) as Where) }
+    return convertWhereToMongo(synthetic, filterable, poolName)
+  }
+  const field = keys[0]
+  const cond = where[field] as Record<string, unknown>
+  if (!isFilterable(field, filterable)) {
+    throw new Error(
+      `[@payloadcms-vectorize/mongodb] Field "${field}" is not configured as filterableFields for pool "${poolName}"`,
+    )
+  }
+  for (const op of Object.keys(cond)) {
+    if (UNSUPPORTED_OPS.has(op)) {
+      throw new Error(`[@payloadcms-vectorize/mongodb] Operator "${op}" is not supported`)
+    }
+  }
+  const hasPostOp = Object.keys(cond).some((op) => POST_OPS.has(op))
+  if (hasPostOp) {
+    return { preFilter: null, postFilter: { [field]: cond } as Where }
+  }
+  return { preFilter: leafToPre(field, cond), postFilter: null }
+}
+
+export function convertWhereToMongo(
+  where: Where,
+  filterable: string[],
+  poolName: string,
+): ConvertResult {
+  if ('and' in where && Array.isArray(where.and)) {
+    const branches = where.and.map((b) => convertWhereToMongo(b, filterable, poolName))
+    const preBranches = branches.filter((b) => b.preFilter).map((b) => b.preFilter!)
+    const postBranches = branches.filter((b) => b.postFilter).map((b) => b.postFilter!)
+    const preFilter =
+      preBranches.length === 0
+        ? null
+        : preBranches.length === 1
+          ? preBranches[0]
+          : { $and: preBranches }
+    const postFilter =
+      postBranches.length === 0
+        ? null
+        : postBranches.length === 1
+          ? postBranches[0]
+          : ({ and: postBranches } as Where)
+    return { preFilter, postFilter }
+  }
+
+  if ('or' in where && Array.isArray(where.or)) {
+    const branches = where.or.map((b) => convertWhereToMongo(b, filterable, poolName))
+    const anyPost = branches.some((b) => b.postFilter !== null)
+    if (anyPost) {
+      // Entire OR goes post — semantics require the whole disjunction to apply
+      // to the post-vectorSearch document set.
+      return { preFilter: null, postFilter: where }
+    }
+    const preBranches = branches.map((b) => b.preFilter!).filter((p) => p)
+    const preFilter =
+      preBranches.length === 0
+        ? null
+        : preBranches.length === 1
+          ? preBranches[0]
+          : { $or: preBranches }
+    return { preFilter, postFilter: null }
+  }
+
+  return convertLeaf(where, filterable, poolName)
+}
+
+function valueMatchesOp(value: unknown, op: string, operand: unknown): boolean {
+  switch (op) {
+    case 'equals':
+      return value === operand
+    case 'not_equals':
+    case 'notEquals':
+      return value !== operand
+    case 'in':
+      return Array.isArray(operand) && operand.includes(value as never)
+    case 'not_in':
+    case 'notIn':
+      return Array.isArray(operand) && !operand.includes(value as never)
+    case 'greater_than':
+    case 'greaterThan':
+      return typeof value === 'number' && typeof operand === 'number' && value > operand
+    case 'greater_than_equal':
+    case 'greaterThanEqual':
+      return typeof value === 'number' && typeof operand === 'number' && value >= operand
+    case 'less_than':
+    case 'lessThan':
+      return typeof value === 'number' && typeof operand === 'number' && value < operand
+    case 'less_than_equal':
+    case 'lessThanEqual':
+      return typeof value === 'number' && typeof operand === 'number' && value <= operand
+    case 'exists':
+      return operand
+        ? value !== undefined && value !== null
+        : value === undefined || value === null
+    case 'like':
+    case 'contains': {
+      if (typeof operand !== 'string') return false
+      const re = new RegExp(escapeRegExp(operand), 'i')
+      if (Array.isArray(value)) {
+        return value.some((v) => typeof v === 'string' && re.test(v))
+      }
+      return typeof value === 'string' && re.test(value)
+    }
+    case 'all':
+      return (
+        Array.isArray(value) &&
+        Array.isArray(operand) &&
+        operand.every((o) => value.includes(o as never))
+      )
+    default:
+      return false
+  }
+}
+
+export function evaluatePostFilter(doc: Record<string, unknown>, where: Where): boolean {
+  if (!where || Object.keys(where).length === 0) return true
+  if ('and' in where && Array.isArray(where.and)) {
+    return where.and.every((c: Where) => evaluatePostFilter(doc, c))
+  }
+  if ('or' in where && Array.isArray(where.or)) {
+    return where.or.some((c: Where) => evaluatePostFilter(doc, c))
+  }
+  for (const [field, condition] of Object.entries(where)) {
+    if (field === 'and' || field === 'or') continue
+    if (typeof condition !== 'object' || condition === null) continue
+    const cond = condition as Record<string, unknown>
+    for (const [op, operand] of Object.entries(cond)) {
+      if (!valueMatchesOp(doc[field], op, operand)) return false
+    }
+  }
+  return true
+}
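+
+// Worked example (hypothetical input shaped like the spec fixtures):
+//   convertWhereToMongo(
+//     { and: [{ status: { equals: 'published' } }, { tags: { like: 'js' } }] },
+//     ['status', 'tags'],
+//     'default',
+//   )
+// splits into:
+//   preFilter:  { status: { $eq: 'published' } }  (native $vectorSearch.filter, applied before the ANN scan)
+//   postFilter: { tags: { like: 'js' } }          (evaluated in memory by evaluatePostFilter after the scan)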
diff --git a/adapters/mongodb/src/embed.ts b/adapters/mongodb/src/embed.ts
new file mode 100644
index 0000000..4eaa307
--- /dev/null
+++ b/adapters/mongodb/src/embed.ts
@@ -0,0 +1,43 @@
+import type { BasePayload } from 'payload'
+import type { StoreChunkData } from 'payloadcms-vectorize'
+import { getMongoClient } from './client.js'
+import { ensureSearchIndex } from './indexes.js'
+import type { ResolvedPoolConfig } from './types.js'
+
+export interface MongoStoreCtx {
+  uri: string
+  dbName: string
+  pools: Record<string, ResolvedPoolConfig>
+}
+
+export async function storeChunkImpl(
+  ctx: MongoStoreCtx,
+  _payload: BasePayload,
+  poolName: string,
+  data: StoreChunkData,
+): Promise<void> {
+  const pool = ctx.pools[poolName]
+  if (!pool) {
+    throw new Error(
+      `[@payloadcms-vectorize/mongodb] Unknown pool "${poolName}". Configured pools: ${Object.keys(ctx.pools).join(', ')}`,
+    )
+  }
+  const client = await getMongoClient(ctx.uri)
+  await ensureSearchIndex(client, ctx.dbName, pool)
+
+  const embeddingArray = Array.from(data.embedding)
+
+  const now = new Date()
+  const collection = client.db(ctx.dbName).collection(pool.collectionName)
+  await collection.insertOne({
+    ...data.extensionFields,
+    sourceCollection: data.sourceCollection,
+    docId: String(data.docId),
+    chunkIndex: data.chunkIndex,
+    chunkText: data.chunkText,
+    embeddingVersion: data.embeddingVersion,
+    embedding: embeddingArray,
+    createdAt: now,
+    updatedAt: now,
+  })
+}
diff --git a/adapters/mongodb/src/escapeRegExp.ts b/adapters/mongodb/src/escapeRegExp.ts
new file mode 100644
index 0000000..60bce82
--- /dev/null
+++ b/adapters/mongodb/src/escapeRegExp.ts
@@ -0,0 +1,3 @@
+export function escapeRegExp(s: string): string {
+  return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
+}
diff --git a/adapters/mongodb/src/index.ts b/adapters/mongodb/src/index.ts
new file mode 100644
index 0000000..0827547
--- /dev/null
+++ b/adapters/mongodb/src/index.ts
@@ -0,0 +1,87 @@
+import type { DbAdapter } from 'payloadcms-vectorize'
+import { getMongoClient } from './client.js'
+import { storeChunkImpl } from './embed.js'
+import { searchImpl } from './search.js'
+import {
+  resolvePoolConfig,
+  type MongoVectorIntegrationConfig,
+  type ResolvedPoolConfig,
+} from './types.js'
+
+export type {
+  MongoPoolConfig,
+  MongoVectorIntegrationConfig,
+  Similarity,
+} from './types.js'
+
+export const createMongoVectorIntegration = (
+  options: MongoVectorIntegrationConfig,
+): { adapter: DbAdapter } => {
+  if (!options.uri) throw new Error('[@payloadcms-vectorize/mongodb] `uri` is required')
+  if (!options.dbName) throw new Error('[@payloadcms-vectorize/mongodb] `dbName` is required')
+  if (!options.pools || Object.keys(options.pools).length === 0) {
+    throw new Error('[@payloadcms-vectorize/mongodb] `pools` must contain at least one pool')
+  }
+
+  const resolvedPools: Record<string, ResolvedPoolConfig> = {}
+  for (const [name, p] of Object.entries(options.pools)) {
+    if (typeof p.dimensions !== 'number' || p.dimensions <= 0) {
+      throw new Error(
+        `[@payloadcms-vectorize/mongodb] pool "${name}" requires a positive numeric \`dimensions\``,
+      )
+    }
+    resolvedPools[name] = resolvePoolConfig(name, p)
+  }
+
+  const ctx = { uri: options.uri, dbName: options.dbName, pools: resolvedPools }
+
+  const adapter: DbAdapter = {
+    getConfigExtension: () => ({
+      custom: {
+        _mongoConfig: { dbName: options.dbName, pools: resolvedPools },
+      },
+    }),
+
+    storeChunk: (payload, poolName, chunk) =>
+      storeChunkImpl(ctx, payload, poolName, chunk),
+
+    deleteChunks: async (_payload, poolName, sourceCollection, docId) => {
+      const cfg = ctx.pools[poolName]
+      if (!cfg) {
+        throw new Error(`[@payloadcms-vectorize/mongodb] Unknown pool "${poolName}"`)
+      }
+      const client = await getMongoClient(ctx.uri)
+      await client
+        .db(ctx.dbName)
+        .collection(cfg.collectionName)
+        .deleteMany({ sourceCollection, docId: String(docId) })
+    },
+
+    hasEmbeddingVersion: async (
+      _payload,
+      poolName,
+      sourceCollection,
+      docId,
+      embeddingVersion,
+    ) => {
+      const cfg = ctx.pools[poolName]
+      if (!cfg) {
+        throw new Error(`[@payloadcms-vectorize/mongodb] Unknown pool "${poolName}"`)
+      }
+      const client = await getMongoClient(ctx.uri)
+      const count = await client
+        .db(ctx.dbName)
+        .collection(cfg.collectionName)
+        .countDocuments(
+          { sourceCollection, docId: String(docId), embeddingVersion },
+          { limit: 1 },
+        )
+      return count > 0
+    },
+    search: (payload, queryEmbedding, poolName, limit, where) =>
+      searchImpl(ctx, payload, queryEmbedding, poolName, limit, where),
+  }
+
+  return { adapter }
+}
diff --git a/adapters/mongodb/src/indexes.ts b/adapters/mongodb/src/indexes.ts
new file mode 100644
index 0000000..4e6e236
--- /dev/null
+++ b/adapters/mongodb/src/indexes.ts
@@ -0,0 +1,132 @@
+import type { Db, MongoClient } from 'mongodb'
+import type { ResolvedPoolConfig } from './types.js'
+
+const ensureCache = new Map<string, Promise<void>>()
+
+function cacheKey(dbName: string, collectionName: string, indexName: string): string {
+  return `${dbName}::${collectionName}::${indexName}`
+}
+
+function buildDefinition(pool: ResolvedPoolConfig): Record<string, unknown> {
+  return {
+    fields: [
+      {
+        type: 'vector',
+        path: 'embedding',
+        numDimensions: pool.dimensions,
+        similarity: pool.similarity,
+      },
+      { type: 'filter', path: 'sourceCollection' },
+      { type: 'filter', path: 'docId' },
+      { type: 'filter', path: 'embeddingVersion' },
+      ...pool.filterableFields.map((p) => ({ type: 'filter', path: p })),
+    ],
+  }
+}
+
+function definitionsEqual(a: unknown, b: unknown): boolean {
+  return canonicalize(a) === canonicalize(b)
+}
+
+function canonicalize(value: unknown): string {
+  return JSON.stringify(canonicalValue(value))
+}
+
+function canonicalValue(value: unknown): unknown {
+  if (value === null || typeof value !== 'object') return value
+  if (Array.isArray(value)) return value.map(canonicalValue)
+  const obj = value as Record<string, unknown>
+  const out: Record<string, unknown> = {}
+  for (const key of Object.keys(obj).sort()) {
+    let v = canonicalValue(obj[key])
+    if (key === 'fields' && Array.isArray(v)) {
+      v = [...v].sort((x, y) => {
+        const xs = JSON.stringify(x)
+        const ys = JSON.stringify(y)
+        return xs < ys ? -1 : xs > ys ? 1 : 0
+      })
+    }
+    out[key] = v
+  }
+  return out
+}
+
+async function ensureCollectionExists(db: Db, name: string): Promise<void> {
+  const existing = await db.listCollections({ name }, { nameOnly: true }).toArray()
+  if (existing.length === 0) {
+    await db.createCollection(name)
+  }
+}
+
+async function doEnsure(
+  client: MongoClient,
+  dbName: string,
+  pool: ResolvedPoolConfig,
+): Promise<void> {
+  const db = client.db(dbName)
+  const collection = db.collection(pool.collectionName)
+  const wantedDefinition = buildDefinition(pool)
+
+  const existing = (await collection
+    .listSearchIndexes(pool.indexName)
+    .toArray()) as Array<Record<string, unknown>>
+
+  const found = existing.find((idx) => idx.name === pool.indexName)
+  if (found) {
+    const status = found.status as string | undefined
+    if (status === 'READY' || status === 'BUILDING') {
+      const latest = (found.latestDefinition as Record<string, unknown>) ?? found.definition
+      if (!definitionsEqual(latest, wantedDefinition)) {
+        throw new Error(
+          `[@payloadcms-vectorize/mongodb] Search index "${pool.indexName}" exists with different definition. Drop it manually with db.collection("${pool.collectionName}").dropSearchIndex("${pool.indexName}") before re-running.`,
+        )
+      }
+      if (status === 'READY') return
+    } else {
+      throw new Error(
+        `[@payloadcms-vectorize/mongodb] Search index "${pool.indexName}" is in unexpected state "${status}". Drop and recreate.`,
+      )
+    }
+  } else {
+    await ensureCollectionExists(db, pool.collectionName)
+    await collection.createSearchIndex({
+      name: pool.indexName,
+      type: 'vectorSearch',
+      definition: wantedDefinition,
+    })
+  }
+
+  const deadline = Date.now() + 60_000
+  while (Date.now() < deadline) {
+    const list = (await collection
+      .listSearchIndexes(pool.indexName)
+      .toArray()) as Array<Record<string, unknown>>
+    const idx = list.find((i) => i.name === pool.indexName)
+    if (idx?.status === 'READY') return
+    await new Promise((r) => setTimeout(r, 1000))
+  }
+  throw new Error(
+    `[@payloadcms-vectorize/mongodb] Search index "${pool.indexName}" did not become READY within 60s. Check Mongo logs.`,
+  )
+}
+
+export function ensureSearchIndex(
+  client: MongoClient,
+  dbName: string,
+  pool: ResolvedPoolConfig,
+): Promise<void> {
+  const key = cacheKey(dbName, pool.collectionName, pool.indexName)
+  let p = ensureCache.get(key)
+  if (!p) {
+    p = doEnsure(client, dbName, pool).catch((err) => {
+      ensureCache.delete(key)
+      throw err
+    })
+    ensureCache.set(key, p)
+  }
+  return p
+}
+
+export function __resetIndexCacheForTests(): void {
+  ensureCache.clear()
+}
diff --git a/adapters/mongodb/src/search.ts b/adapters/mongodb/src/search.ts
new file mode 100644
index 0000000..02c5392
--- /dev/null
+++ b/adapters/mongodb/src/search.ts
@@ -0,0 +1,107 @@
+import type { BasePayload, Where } from 'payload'
+import type { VectorSearchResult } from 'payloadcms-vectorize'
+import { getMongoClient } from './client.js'
+import { convertWhereToMongo, evaluatePostFilter } from './convertWhere.js'
+import { ensureSearchIndex } from './indexes.js'
+import { RESERVED_FIELDS, type ResolvedPoolConfig } from './types.js'
+
+export interface MongoSearchCtx {
+  uri: string
+  dbName: string
+  pools: Record<string, ResolvedPoolConfig>
+}
+
+export async function searchImpl(
+  ctx: MongoSearchCtx,
+  _payload: BasePayload,
+  queryEmbedding: number[],
+  poolName: string,
+  limit: number = 10,
+  where?: Where,
+): Promise<VectorSearchResult[]> {
+  const pool = ctx.pools[poolName]
+  if (!pool) {
+    throw new Error(
+      `[@payloadcms-vectorize/mongodb] Unknown pool "${poolName}". Configured pools: ${Object.keys(ctx.pools).join(', ')}`,
+    )
+  }
+  if (!Number.isInteger(limit) || limit <= 0) {
+    throw new Error(
+      `[@payloadcms-vectorize/mongodb] limit must be a positive integer; got ${limit}`,
+    )
+  }
+  const client = await getMongoClient(ctx.uri)
+  await ensureSearchIndex(client, ctx.dbName, pool)
+
+  let preFilter: Record<string, unknown> | null = null
+  let postFilter: Where | null = null
+  if (where && Object.keys(where).length > 0) {
+    const split = convertWhereToMongo(where, pool.filterableFields, poolName)
+    preFilter = split.preFilter
+    postFilter = split.postFilter
+  }
+
+  const numCandidates = pool.numCandidates ?? limit * 10
+
+  const vectorSearchStage: Record<string, unknown> = {
+    index: pool.indexName,
+    path: 'embedding',
+    queryVector: queryEmbedding,
+    numCandidates,
+    limit,
+  }
+  if (pool.forceExact) vectorSearchStage.exact = true
+  if (preFilter) vectorSearchStage.filter = preFilter
+
+  const projection: Record<string, unknown> = {
+    _id: 1,
+    score: { $meta: 'vectorSearchScore' },
+    sourceCollection: 1,
+    docId: 1,
+    chunkIndex: 1,
+    chunkText: 1,
+    embeddingVersion: 1,
+  }
+  for (const f of pool.filterableFields) projection[f] = 1
+
+  const pipeline: Record<string, unknown>[] = [
+    { $vectorSearch: vectorSearchStage },
+    { $project: projection },
+  ]
+
+  const collection = client.db(ctx.dbName).collection(pool.collectionName)
+  const rawDocs = await collection.aggregate(pipeline).toArray()
+
+  const filtered = postFilter
+    ? rawDocs.filter((d) => evaluatePostFilter(d as Record<string, unknown>, postFilter!))
+    : rawDocs
+
+  return filtered.map((d) => mapDocToResult(d as Record<string, unknown>, pool.filterableFields))
+}
+
+function mapDocToResult(
+  doc: Record<string, unknown>,
+  filterable: string[],
+): VectorSearchResult {
+  if (typeof doc.score !== 'number') {
+    throw new Error(
+      `[@payloadcms-vectorize/mongodb] Search result is missing numeric "score" field; ensure $project includes { score: { $meta: 'vectorSearchScore' } }`,
+    )
+  }
+  const result: Record<string, unknown> = {
+    id: String(doc._id),
+    score: doc.score,
+    sourceCollection: String(doc.sourceCollection ?? ''),
+    docId: String(doc.docId ?? ''),
+    chunkIndex:
+      typeof doc.chunkIndex === 'number' ? doc.chunkIndex : Number(doc.chunkIndex ?? 0),
+    chunkText: String(doc.chunkText ?? ''),
+    embeddingVersion: String(doc.embeddingVersion ?? ''),
+  }
+  for (const f of filterable) {
+    if (f in doc && !(RESERVED_FIELDS as readonly string[]).includes(f)) {
+      result[f] = doc[f]
+    }
+  }
+  return result as VectorSearchResult
+}
diff --git a/adapters/mongodb/src/types.ts b/adapters/mongodb/src/types.ts
new file mode 100644
index 0000000..8c8963e
--- /dev/null
+++ b/adapters/mongodb/src/types.ts
@@ -0,0 +1,80 @@
+export type Similarity = 'cosine' | 'euclidean' | 'dotProduct'
+
+export interface MongoPoolConfig {
+  /** Vector dimensions for this pool (must match embedding model output). */
+  dimensions: number
+  /** Similarity metric for the search index. Default 'cosine'. */
+  similarity?: Similarity
+  /** ANN candidate set size. Default at search time: limit * 10. */
+  numCandidates?: number
+  /** Extension fields to declare as filterable in the search index. */
+  filterableFields?: string[]
+  /** ENN exact search (full scan) instead of HNSW ANN. Default false. */
+  forceExact?: boolean
+  /** Override Mongo collection name. Default `vectorize_${poolName}`. */
+  collectionName?: string
+  /** Override search index name. Default `${collectionName}_idx`. */
+  indexName?: string
+}
+
+export interface MongoVectorIntegrationConfig {
+  /** Any valid MongoDB connection string (Atlas SRV or self-hosted). */
+  uri: string
+  /** Database that holds the per-pool vector collections. */
+  dbName: string
+  /** Pools keyed by knowledge pool name. */
+  pools: Record<string, MongoPoolConfig>
+}
+
+/** Resolved per-pool config used internally (defaults applied). */
+export interface ResolvedPoolConfig {
+  dimensions: number
+  similarity: Similarity
+  numCandidates?: number
+  filterableFields: string[]
+  forceExact: boolean
+  collectionName: string
+  indexName: string
+}
+
+/**
+ * Stored on `getConfigExtension().custom._mongoConfig` for introspection.
+ * The connection URI is intentionally NOT included — credentials live in
+ * the adapter closure, never on `payload.config`.
+ */
+export interface MongoConfigCustom {
+  dbName: string
+  pools: Record<string, ResolvedPoolConfig>
+}
+
+export const RESERVED_FILTER_FIELDS = [
+  'sourceCollection',
+  'docId',
+  'embeddingVersion',
+] as const
+
+export const RESERVED_FIELDS = [
+  'sourceCollection',
+  'docId',
+  'chunkIndex',
+  'chunkText',
+  'embeddingVersion',
+  'embedding',
+] as const
+
+export function resolvePoolConfig(
+  poolName: string,
+  cfg: MongoPoolConfig,
+): ResolvedPoolConfig {
+  const collectionName = cfg.collectionName ?? `vectorize_${poolName}`
+  return {
+    dimensions: cfg.dimensions,
+    similarity: cfg.similarity ?? 'cosine',
+    numCandidates: cfg.numCandidates,
+    filterableFields: cfg.filterableFields ?? [],
+    forceExact: cfg.forceExact ?? false,
+    collectionName,
+    indexName: cfg.indexName ??
`${collectionName}_idx`, + } +} + diff --git a/adapters/mongodb/tsconfig.build.json b/adapters/mongodb/tsconfig.build.json new file mode 100644 index 0000000..3a56da8 --- /dev/null +++ b/adapters/mongodb/tsconfig.build.json @@ -0,0 +1,3 @@ +{ + "extends": "../tsconfig.adapter.json" +} diff --git a/adapters/mongodb/vitest.config.ts b/adapters/mongodb/vitest.config.ts new file mode 100644 index 0000000..7ada1b4 --- /dev/null +++ b/adapters/mongodb/vitest.config.ts @@ -0,0 +1,38 @@ +import path from 'path' +import { loadEnv } from 'payload/node' +import { fileURLToPath } from 'url' +import tsconfigPaths from 'vite-tsconfig-paths' +import { defineConfig } from 'vitest/config' + +const filename = fileURLToPath(import.meta.url) +const dirname = path.dirname(filename) + +export default defineConfig(() => { + loadEnv(path.resolve(dirname, '../../dev')) + + return { + plugins: [ + tsconfigPaths({ + ignoreConfigErrors: true, + }), + ], + resolve: { + alias: { + 'payloadcms-vectorize': path.resolve(dirname, '../../src/index.ts'), + '@shared-test/utils': path.resolve(dirname, '../../dev/specs/utils.ts'), + '@shared-test/helpers/chunkers': path.resolve(dirname, '../../dev/helpers/chunkers.ts'), + '@shared-test/helpers/embed': path.resolve(dirname, '../../dev/helpers/embed.ts'), + '@shared-test/constants': path.resolve(dirname, '../../dev/specs/constants.ts'), + }, + }, + test: { + root: dirname, + environment: 'node', + hookTimeout: 120_000, + testTimeout: 120_000, + include: ['dev/specs/**/*.spec.ts'], + exclude: ['**/e2e.spec.{ts,js}', '**/node_modules/**'], + fileParallelism: false, + }, + } +}) diff --git a/adapters/pg/src/index.ts b/adapters/pg/src/index.ts index c2dffcd..ac28c21 100644 --- a/adapters/pg/src/index.ts +++ b/adapters/pg/src/index.ts @@ -100,12 +100,12 @@ export const createPostgresVectorIntegration = ( const created = await payload.create({ collection: poolName as any, data: { + ...data.extensionFields, sourceCollection: data.sourceCollection, docId: data.docId, chunkIndex: data.chunkIndex, chunkText: data.chunkText, embeddingVersion: data.embeddingVersion, - ...data.extensionFields, embedding: embeddingArray, }, }) diff --git a/docs/plans/2026-04-25-mongodb-adapter-deep-dive.md b/docs/plans/2026-04-25-mongodb-adapter-deep-dive.md new file mode 100644 index 0000000..6c0f5be --- /dev/null +++ b/docs/plans/2026-04-25-mongodb-adapter-deep-dive.md @@ -0,0 +1,186 @@ +# Deep dive: MongoDB vector search adapter + +## TL;DR + +**Your intuition is correct, and the timing is excellent.** Self-hosted MongoDB vector search is a real, supported thing as of MongoDB Community Edition 8.2 (Sept 2025), and the engine (`mongot`) went source-available under SSPL in Jan 2026. It is exactly what you described — a separate binary that runs alongside `mongod` and stays in sync via Change Streams. The application still talks to `mongod` on the standard port and just sends a `$vectorSearch` aggregation stage; mongod proxies it to mongot transparently. + +**Effort estimate: ~1.5–2.5 weeks** to ship a quality adapter on par with the PG one (functionally), assuming you adopt the official `mongot` path rather than rolling brute-force. + +The friction is **not** in the adapter code — it's small and clean. The friction is in the **operational story you're asking users to adopt** (replica set + sidecar binary + index lifecycle). + +--- + +## 1. 
The adapter contract is small and Mongo-friendly + +The core defines a 5-method `DbAdapter` interface in [src/types.ts:384-418](../../src/types.ts#L384-L418): + +| Method | What it does | Mongo equivalent | +|---|---|---| +| `getConfigExtension` | Returns Payload collections/bins/custom data the adapter contributes | Same shape — adapter exposes its own collections to Payload | +| `storeChunk` | Insert a chunk row with text, metadata, and the vector | `db.collection.insertOne({ ...meta, embedding })` | +| `deleteChunks` | Delete all chunks for a `(sourceCollection, docId)` | `db.collection.deleteMany({ sourceCollection, docId })` | +| `hasEmbeddingVersion` | Check if a doc already has chunks at a given embedding version | `db.collection.findOne({ docId, embeddingVersion })` | +| `search` | Vector search with optional `Where` filter | `$vectorSearch` aggregation pipeline | + +Input/output types ([src/types.ts:298-308](../../src/types.ts#L298-L308), [src/types.ts:374-382](../../src/types.ts#L374-L382)) are pure data — nothing PG-shaped leaks across the boundary. **Notably, there's no schema migration step required** in the contract. PG needs `afterSchemaInitHook` because Drizzle has to learn about the `vector(dims)` column at startup. Mongo doesn't — collections and search indexes can be created lazily on first use, so the Mongo factory can return just `{ adapter }` (matching the CF Vectorize adapter shape). + +--- + +## 2. mongot's API surface maps cleanly to what we need + +From the official aggregation reference ([$vectorSearch docs](https://www.mongodb.com/docs/manual/reference/operator/aggregation/vectorsearch/)) and the community-edition writeup, here is the actual query shape: + +```js +db.chunks.aggregate([ + { + $vectorSearch: { + index: "vector_index", + path: "embedding", + queryVector: [/* …dims floats… */], + numCandidates: 100, // ANN candidate set + limit: 10, + filter: { // ← native pre-filter + sourceCollection: { $eq: "articles" }, + status: { $in: ["published", "featured"] } + } + } + }, + { $project: { score: { $meta: "vectorSearchScore" }, chunkText: 1, /*…*/ } } +]) +``` + +Two important properties for us: + +- **Native pre-filter.** Mongo's `filter` clause runs *before* the ANN scan, which is the correct ordering — this is the same architectural advantage the CF Vectorize adapter exploits, where it has to split a Payload `Where` into native-supported predicates vs post-filter predicates. +- **Score in `$meta`.** No `1 - cosineDistance` math needed; Mongo gives you a normalized similarity score directly. + +Index definition (created via `createSearchIndexes`): +```js +{ + fields: [{ type: "vector", path: "embedding", numDimensions: 1536, similarity: "cosine" }] +} +``` + +--- + +## 3. Translating Payload's `Where` to Mongo + +> **Verified against [`payloadcms/payload/packages/db-mongodb/src/queries/`](https://github.com/payloadcms/payload/tree/main/packages/db-mongodb/src/queries)** — `operatorMap.ts`, `sanitizeQueryValue.ts`, `parseParams.ts`, `buildAndOrConditions.ts`. Payload's own Mongo adapter is the source of truth for these semantics; the goal is byte-for-byte filter parity with users' CRUD queries. +> +> Authoritative allowlist for `$vectorSearch.filter`: `$eq $ne $gt $gte $lt $lte $in $nin $exists $not $nor $and $or` ([Mongo docs](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-stage/)). 
Anything else must be **post-filtered** as a `$match` stage after the vector scan — the `splitWhere` pattern from [adapters/cf/src/search.ts:91-142](../../adapters/cf/src/search.ts#L91-L142). + +This is the same problem solved in [adapters/pg/src/search.ts:146-282](../../adapters/pg/src/search.ts#L146-L282) (`convertWhereToDrizzle`) and tested heavily in your own [vectorSearchWhere.spec.ts](../../adapters/pg/dev/specs/vectorSearchWhere.spec.ts) suite. + +| Payload operator | Payload's `db-mongodb` mapping | In `$vectorSearch.filter`? | Adapter strategy | +|---|---|---|---| +| `equals` | `$eq` | ✅ | pre-filter | +| `not_equals` | `$ne` | ✅ | pre-filter | +| `in` | `$in` | ✅ | pre-filter | +| `not_in` | `$nin` | ✅ | pre-filter | +| `greater_than` | `$gt` | ✅ | pre-filter | +| `greater_than_equal` | `$gte` | ✅ | pre-filter | +| `less_than` | `$lt` | ✅ | pre-filter | +| `less_than_equal` | `$lte` | ✅ | pre-filter | +| `exists` | `$and`/`$or` of `$exists` + `$ne null` + `$ne ''` (see `buildExistsQuery`) | ✅ (composed of allowed ops) | pre-filter | +| `like` | `$regex` + `$options:'i'` + `escapeRegExp` | ❌ | **post-filter** | +| `contains` (scalar) | `$regex` + `$options:'i'` + `escapeRegExp` | ❌ | **post-filter** | +| `contains` (hasMany) | `$elemMatch` + `$regex` | ❌ | **post-filter** | +| `all` | `$all` | ❌ | **post-filter** | +| `near` / `within` / `intersects` | `$near` / `$geoWithin` / `$geoIntersects` | ❌ | unsupported in vector context — surface a clear error | +| `and` / `or` (case-insensitive keys) | `$and` / `$or` | ✅ | pre-filter, recurse | + +**Six things Payload does that we must mirror** (otherwise filter behavior diverges from CRUD): + +1. **`escapeRegExp` on `like`/`contains`.** Without escaping, user input like `foo.bar` matches `foozbar`. Reuse Payload's exported `escapeRegExp`. +2. **Case-insensitive substring by default** — `$options: 'i'` on every `like`/`contains`. +3. **Case-insensitive `and`/`or` keys** — Payload `.toLowerCase()`s them; accept `and`/`AND`/`And`. +4. **Multi-operator on same path → wrap in `$and`** to avoid object-key overwrite (see `parseParams.ts`). +5. **`exists` is compound, not just `$exists`** — empty strings are treated as missing for most field types. +6. **ObjectId casting on `_id` and relationship IDs** — lift from `sanitizeQueryValue.ts`. Matters when filtering chunks by `docId` against a source collection that uses ObjectIds. + +**Your existing 38-test suite for `vectorSearchWhere` can be reused almost verbatim** — assertions are on result IDs and ordering, not SQL strings, so it ports as-is. + +--- + +## 4. Operational story — the real cost + +This is what your users will actually feel. Vector search in self-hosted Mongo is **not** "just install MongoDB": + +| Requirement | Notes | +|---|---| +| MongoDB Community Edition **8.2+** | Released Sept 2025; many users are on 6.x/7.x | +| **Replica set required** (even single-node) | Atlas hides this; self-hosted users have to `rs.initiate()` | +| `mongot` binary running alongside `mongod` | Separate package: `mongodb/mongodb-community-search` | +| Connectivity between mongod ↔ mongot | mongot port 27028, mongod must be configured to know about it | +| Search indexes created via `createSearchIndexes` | Async — index becomes queryable after a sync delay | +| Public preview status | Mongo flags this as "development and evaluation only, not production" as of Jan 2026 | + +This is the "wrapping two services together" piece you flagged. The adapter code itself doesn't wrap them — `mongod` does. 
But your **adapter README and onboarding docs** will need to walk users through a docker-compose setup like: + +```yaml +services: + mongod: { image: mongodb/mongodb-community-server:8.2.0-ubi9, ... } + mongot: { image: mongodb/mongodb-community-search:0.53.1, ports: ["27028:27028"] } +``` + +…plus replica set init. The dev-environment story for your own [/dev](../../dev/) test app needs the same setup, and your [compliance.spec.ts](../../adapters/pg/dev/specs/compliance.spec.ts) port will need a `beforeAll` that brings both up. + +--- + +## 5. Concrete proposed structure + +Mirroring [adapters/pg/](../../adapters/pg/): + +``` +adapters/mongodb/ +├── package.json # @payloadcms-vectorize/mongodb +│ # peer deps: payload, payloadcms-vectorize, mongodb (>=6.x driver) +├── src/ +│ ├── index.ts # createMongoVectorIntegration({ uri, dbName, knowledgePools }) +│ │ # returns { adapter }; lazily creates collection + search index per pool +│ ├── search.ts # search() → $vectorSearch pipeline; convertWhereToMongo() +│ ├── embed.ts # storeChunk() → insertOne; deleteChunks() → deleteMany +│ ├── indexes.ts # ensureSearchIndex() — createSearchIndexes if missing +│ └── types.ts # MongoConfig, similarity choice, numCandidates default +└── dev/specs/ + ├── compliance.spec.ts # port from PG + ├── vectorSearchWhere.spec.ts # port from PG (38 tests, mostly identical assertions) + └── docker-compose.test.yml # mongod + mongot for CI +``` + +Notable simplifications vs PG: +- No `bin-vectorize-migrate.ts` — Mongo doesn't have a schema migration concept here +- No `drizzle.ts` registry — no ORM to plug into +- No `afterSchemaInitHook` — adapter returns just `{ adapter }` +- Index dimension changes: handled by dropping/recreating the search index + +--- + +## 6. Recommendation & scope + +**Worth doing.** Three reasons: + +1. **The contract fits.** Your interface was clearly designed adapter-first; Mongo doesn't require contract changes. That validates the original design and makes a third adapter low-risk. +2. **Mongo is a major Payload backend.** Payload's first-class DB adapters are PG and Mongo. Shipping only a PG vector adapter implicitly excludes half the Payload userbase from this plugin. +3. **Test reuse.** The 38-test `vectorSearchWhere` suite is the hard part of any adapter; you've already built it. Porting it is mechanical. + +**Estimated breakdown** (calendar time, single dev): +- Adapter scaffolding + `storeChunk`/`deleteChunks`/`hasEmbeddingVersion`: ~1 day +- `convertWhereToMongo` + handling pre-filter vs post-filter split: ~1–2 days +- `search` with `$vectorSearch` + index lifecycle: ~2 days +- Docker-compose + CI for mongod+mongot, port the test suite: ~2–3 days +- README + setup walkthrough (this is genuinely the hardest user-facing piece): ~1–2 days + +**Tradeoff to flag:** the public-preview status of self-hosted vector search means you'd be shipping an adapter against a feature MongoDB themselves label as "not for production." A pragmatic move would be to ship it labeled `experimental` / `^0.x` and let GA timing on Mongo's side drive the 1.0. + +**One thing I'd want to know before we commit:** does PR #35 / Dejan's "issues testing on a real app" feedback include any Mongo-specific requests? If users are already asking for this, that nudges scope toward "just do it." 
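Putting §§2–3 together, here is one plausible end-to-end pipeline shape for a `Where` that mixes pre-filterable and post-filter predicates (hypothetical values throughout; `escapeRegExp` below is a stand-in for Payload's exported helper). Note the deliberate over-fetch in the `$vectorSearch` `limit`: a post-`$match` applied after an exact `limit` would silently return fewer rows than requested.

```ts
// Sketch only. The 4x over-fetch factor is an arbitrary illustration,
// not a tuned value; a real adapter would make it configurable.
const escapeRegExp = (s: string) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')

const limit = 10
const queryVector: number[] = [] // …dims floats from the embedding model…

const pipeline = [
  {
    $vectorSearch: {
      index: 'vector_index',
      path: 'embedding',
      queryVector,
      numCandidates: Math.max(limit * 20, 100),
      limit: limit * 4, // over-fetch: the $match below discards rows
      filter: { sourceCollection: { $eq: 'articles' } }, // native pre-filter
    },
  },
  // Post-filter: `like` becomes the same escaped, case-insensitive $regex
  // that Payload's own db-mongodb adapter would emit for CRUD queries.
  { $match: { category: { $regex: escapeRegExp('foo.bar'), $options: 'i' } } },
  { $limit: limit },
  { $project: { score: { $meta: 'vectorSearchScore' }, chunkText: 1 } },
]
```

If the post-filter is very selective, even the over-fetch can come up short, so a real implementation has to choose between re-querying with a larger limit or documenting the truncation.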
+ +--- + +## Sources +- [MongoDB extends search and vector search to self-managed offerings (press release)](https://www.mongodb.com/press/mongodb-extends-search-and-vector-search-capabilities-to-self-managed-offerings) +- [Public preview: MongoDB Community Edition now offers native full-text and vector search](https://www.mongodb.com/products/updates/public-preview-mongodb-community-edition-now-offers-native-full-text-and-vector-search/) +- [$vectorSearch aggregation stage reference](https://www.mongodb.com/docs/manual/reference/operator/aggregation/vectorsearch/) +- [Now source available: the engine powering MongoDB Search (mongot under SSPL)](https://www.mongodb.com/company/blog/product-release-announcements/now-source-available-the-engine-powering-mongodb-search) +- [MongoDB Community Edition: Vector Search for Everyone (hands-on writeup)](https://www.ostberg.dev/work/2025/10/12/mongodb-community-vector-search.html) +- [Supercharge self-managed apps with search and vector search capabilities](https://www.mongodb.com/company/blog/product-release-announcements/supercharge-self-managed-apps-search-vector-search-capabilities) diff --git a/docs/plans/2026-04-25-mongodb-atlas-adapter-deep-dive.md b/docs/plans/2026-04-25-mongodb-atlas-adapter-deep-dive.md new file mode 100644 index 0000000..4f01a44 --- /dev/null +++ b/docs/plans/2026-04-25-mongodb-atlas-adapter-deep-dive.md @@ -0,0 +1,388 @@ +# Deep dive: MongoDB Atlas vector search adapter + +> Companion to [2026-04-25-mongodb-adapter-deep-dive.md](2026-04-25-mongodb-adapter-deep-dive.md), which covers self-hosted Community Edition. + +## TL;DR + +**The adapter code for Atlas is essentially the same as for self-hosted Community.** Both expose the identical `$vectorSearch` aggregation stage, the identical `createSearchIndexes` API, and the identical `filter` semantics. The interesting question is no longer *"how do I write an Atlas adapter?"* — it's *"do I write one adapter or two?"* + +**My recommendation: ship one adapter (`@payloadcms-vectorize/mongodb`), with Atlas as the default/production target and self-hosted Community as the experimental/dev target.** Atlas is GA, runs on every tier including the free M0, and is what 90%+ of MongoDB-on-Payload users are already using. + +> **Decision (2026-04-25):** Development will use the **direct Docker** path (`mongodb/mongodb-atlas-local` image) as the primary dev/CI target. No Atlas account, no Atlas CLI, no login required. See [section 8](#8-development-environment-local-atlas-deployment-) for the full setup. + +**Effort estimate vs the Community adapter:** +- If you do **both** as one adapter: **+1–2 days** on top of the Community estimate (1.5–2.5 wks). Mostly: connection-string handling, the `filter`-field-must-be-indexed gotcha, and Search Nodes documentation. +- If you do **Atlas only** and skip Community: **~1–1.5 weeks**. You get to skip the docker-compose / mongot / replica-set onboarding burden entirely. + +The friction is **lower than Community in every dimension**: GA instead of preview, no sidecar binary, free tier exists, no replica-set ceremony for the user. + +--- + +## 1. The adapter contract change is zero + +Everything in [section 1 of the Community deep-dive](2026-04-25-mongodb-adapter-deep-dive.md) applies unchanged. The `DbAdapter` interface in [src/types.ts:384-418](../../src/types.ts#L384-L418) doesn't care whether you point it at Atlas or self-hosted — both speak MongoDB wire protocol, both accept the same aggregation pipeline. + +--- + +## 2. 
The query shape is identical, with a few extra knobs + +The `$vectorSearch` aggregation stage is the same one. Atlas exposes a couple of additional production-relevant knobs not emphasized in the Community write-up: + +```js +db.chunks.aggregate([ + { + $vectorSearch: { + index: "vector_index", + path: "embedding", + queryVector: [/* …dims floats… */], + numCandidates: 100, // tune ~20× limit + limit: 10, + exact: false, // ← Atlas: ENN if true, ANN (HNSW) if false/omitted + filter: { + sourceCollection: "articles", // shorthand $eq + status: { $in: ["published", "featured"] } + } + } + }, + { $project: { score: { $meta: "vectorSearchScore" }, chunkText: 1 } } +]) +``` + +New for Atlas: +- **`exact: true`** — opt-in exact nearest-neighbor (full scan) for small datasets or safety-critical paths. Useful as a `forceExact?: boolean` knob in the adapter config. +- **HNSW under the hood** — Atlas docs explicitly say ANN uses Hierarchical Navigable Small Worlds. Not adapter-relevant; just informational. + +--- + +## 3. Filter operators — officially enumerated + +> **Verified against [`payloadcms/payload/packages/db-mongodb/src/queries/`](https://github.com/payloadcms/payload/tree/main/packages/db-mongodb/src/queries)** — `operatorMap.ts`, `sanitizeQueryValue.ts`, `parseParams.ts`. Goal: byte-for-byte filter parity with Payload's own CRUD queries against the same data. + +Atlas docs give the **exhaustive supported list** for the `filter` clause: + +**Supported in `$vectorSearch.filter`:** `$eq`, `$ne`, `$gt`, `$gte`, `$lt`, `$lte`, `$in`, `$nin`, `$exists`, `$and`, `$or`, `$not`, `$nor` + +**NOT supported:** `$regex`, `$all`, `$elemMatch`, geo operators, any aggregation operator, any `$search` operator + +Mapped against Payload's `Where` (per Payload's own `db-mongodb` adapter): + +| Payload operator | Payload's `db-mongodb` mapping | In `$vectorSearch.filter`? | Adapter strategy | +|---|---|---|---| +| `equals` | `$eq` | ✅ | pre-filter | +| `not_equals` | `$ne` | ✅ | pre-filter | +| `in` | `$in` | ✅ | pre-filter | +| `not_in` | `$nin` | ✅ | pre-filter | +| `greater_than` | `$gt` | ✅ | pre-filter | +| `greater_than_equal` | `$gte` | ✅ | pre-filter | +| `less_than` | `$lt` | ✅ | pre-filter | +| `less_than_equal` | `$lte` | ✅ | pre-filter | +| `exists` | `$and`/`$or` of `$exists` + `$ne null` + `$ne ''` (see `buildExistsQuery`) | ✅ (composed of allowed ops) | pre-filter | +| `and` / `or` (case-insensitive keys) | `$and` / `$or` | ✅ | pre-filter, recurse | +| **`like`** | `$regex` + `$options:'i'` + `escapeRegExp` | ❌ | **post-filter** | +| **`contains`** (scalar) | `$regex` + `$options:'i'` + `escapeRegExp` | ❌ | **post-filter** | +| **`contains`** (hasMany) | `$elemMatch` + `$regex` | ❌ | **post-filter** | +| **`all`** | `$all` | ❌ | **post-filter** | +| `near` / `within` / `intersects` | `$near` / `$geoWithin` / `$geoIntersects` | ❌ | unsupported in vector context — surface a clear error | + +So `splitWhere` is needed (same pattern as [adapters/cf/src/search.ts:91-142](../../adapters/cf/src/search.ts#L91-L142)), and its post-filter bucket is wider than originally framed: `like`, `contains`, `all`, and any geo predicate. + +**Six things Payload's own adapter does that we must mirror** (otherwise vector-search filter behavior diverges from CRUD behavior on the same data): + +1. **`escapeRegExp` on `like`/`contains`.** Without escaping, user input like `foo.bar` matches `foozbar`. Reuse Payload's exported `escapeRegExp`. +2. 
**Case-insensitive substring by default** — `$options: 'i'` on every `like`/`contains`. +3. **Case-insensitive `and`/`or` keys** — Payload `.toLowerCase()`s them. +4. **Multi-operator on same path → wrap in `$and`** to avoid object-key overwrite. +5. **`exists` is compound, not just `$exists`** — empty strings are treated as missing for most field types. +6. **ObjectId casting on `_id` and relationship IDs** — lift from `sanitizeQueryValue.ts`. Matters when filtering chunks by `docId` against a source collection that uses ObjectIds. + +⚠️ **The big Atlas gotcha that doesn't exist in PG:** Filter fields must be declared in the index definition as type `"filter"`. You cannot filter on an unindexed field. So the index definition becomes: + +```js +{ + fields: [ + { type: "vector", path: "embedding", numDimensions: 1536, similarity: "cosine" }, + { type: "filter", path: "sourceCollection" }, + { type: "filter", path: "docId" }, + { type: "filter", path: "embeddingVersion" }, + // …plus any extension field a user wants to filter on + ] +} +``` + +This is a **real design constraint for the adapter API**: the user has to declare upfront which `extensionFields` are filterable, or the adapter has to be conservative and index everything. The PG adapter doesn't have this concern — Postgres can filter on any column. Recommend an explicit `filterableFields: string[]` in the knowledge-pool config. + +--- + +## 4. Operational story — much friendlier than Community + +| Concern | Self-hosted Community | Atlas | +|---|---|---| +| MongoDB version | Must be 8.2+ | Always current | +| Replica set | Manual `rs.initiate()` | Automatic | +| `mongot` binary | Run separately, port-wire to mongod | Atlas runs it for you | +| GA status | Public preview, "not for production" | GA, production-supported | +| Free tier | N/A (you're hosting) | M0 free cluster | +| Connection | docker-compose required for dev | Just a connection string | +| Search Nodes (workload isolation) | DIY | Available on dedicated tiers | + +**Tier matrix for vector search** (per Atlas deployment-options docs): + +| Tier | Vector search? | Production-ready? | mongot location | +|---|---|---|---| +| M0 (free, 512 MB) | ✅ | Test only | Same node, shared | +| Flex ($8–$30/mo, 5 GB) | ✅ | Limited | Same node, shared | +| Dedicated M10+ ($57+/mo) | ✅ | Yes | Same node by default | +| Dedicated + Search Nodes | ✅ | Yes (best) | Separate nodes; ~90% RAM for index | + +For our adapter: +- **Dev/CI:** point at the M0 free tier or `atlas deployments setup --type local` (Atlas CLI's local-replica-set, runs mongot locally — no docker-compose needed). The local mode is what makes a "skip Community, Atlas-only" strategy viable for dev story too. +- **Production users:** `mongodb+srv://...` connection string, that's it. + +--- + +## 5. One adapter or two? The strategic call + +Since the wire protocol and aggregation API are identical, you have three structural options: + +**Option A — One unified `@payloadcms-vectorize/mongodb` adapter.** Recommend this. +- Pros: one codebase, one test suite, one README. User picks tier; same code works. +- Cons: must document both setup paths in the README; `filterableFields` config required upfront (because of the index-fields constraint, which exists on both Atlas and Community). + +**Option B — Two adapters, `@payloadcms-vectorize/mongodb-atlas` and `@payloadcms-vectorize/mongodb-community`.** +- Pros: cleaner per-target docs; can mark Community as `experimental` while Atlas is `stable`. 
- Cons: 95% code duplication, double the maintenance, confusing for users.

**Option C — Atlas only, defer Community.**
- Pros: ship faster, target the production-ready path, skip the docker-compose dev story.
- Cons: leaves self-hosted Payload users with no Mongo option — but they could still use the PG adapter.

**My recommendation: Option A**, framed as "Atlas-first." The README leads with `mongodb+srv://` setup, and there's a "Self-hosted Community" subsection at the bottom for advanced users who explicitly want it. Internally the adapter doesn't even branch — it's the same code path either way.

---

## 6. Concrete proposed structure (Option A)

Same as the Community proposal, with these adjustments:

```
adapters/mongodb/
├── package.json        # @payloadcms-vectorize/mongodb
│                       # peer deps: payload, payloadcms-vectorize, mongodb (>=6.x)
├── src/
│   ├── index.ts        # createMongoVectorIntegration({
│   │                   #   uri, dbName, knowledgePools,
│   │                   #   filterableFields?: Record<string, string[]>
│   │                   # })
│   ├── search.ts       # search() → $vectorSearch; convertWhereToMongo();
│   │                   #   splitWhere() to peel off like/contains/all/geo
│   │                   #   for post-filter $match
│   ├── embed.ts        # storeChunk / deleteChunks / hasEmbeddingVersion
│   ├── indexes.ts      # ensureSearchIndex() — declares vector field + all
│   │                   #   filterableFields as `filter` type
│   └── types.ts        # MongoConfig, similarity, numCandidates default,
│                       #   forceExact?: boolean
└── dev/specs/
    ├── compliance.spec.ts        # port from PG, runs against local Atlas (atlas CLI)
    ├── vectorSearchWhere.spec.ts # port from PG; like/contains/all tests verify post-filter
    └── setup.ts                  # `atlas deployments setup --type local` orchestration
```

Net code delta vs the Community-only proposal:
- `+ filterableFields` config plumbing
- `+ forceExact` support in `search.ts`
- `+ splitWhere` for `like` / `contains` / `all` / geo (CF adapter has the template)
- `+ Payload-parity touches`: `escapeRegExp`, `$options:'i'`, case-insensitive `and`/`or` keys, multi-op-on-same-path → `$and`, compound `exists`, ObjectId casting on `_id`/`docId`
- README has a tier-decision table at the top

---

## 7. Recommendation & scope

**Ship it as Option A — one adapter, Atlas-first messaging.**

Three reasons this is more compelling than the Community-only path:

1. **Production-ready today.** Atlas vector search is GA, not preview. You can recommend it to real users with a straight face.
2. **Free tier exists.** M0 means you can have a "Get started in 60 seconds with Atlas" path in the README, which is huge for adoption — no docker, no replica-set CLI, just a connection string.
3. **You get Community for free.** Same code works on self-hosted 8.2+, so the moment Mongo's Community offering goes GA you've already got an adapter for it.

**Estimated breakdown** (calendar time, single dev, building both targets in one adapter):
- Adapter scaffolding + storeChunk/deleteChunks/hasEmbeddingVersion: ~1 day
- `convertWhereToMongo` + `splitWhere` for like/contains: ~1–2 days
- `search` with `$vectorSearch` + index lifecycle (incl. `filterableFields` declaration): ~2–3 days
- Atlas CLI local-deployment for CI, port test suite: ~2 days
- README with tier decision table + connection-string quickstart: ~1–2 days

**Total: ~1.5–2 weeks.** Slightly faster than Community-only because the dev/CI story is simpler (Atlas CLI vs hand-rolled docker-compose).

**Tradeoff to flag — same as Community plan:** the `filterableFields` constraint is a real API ergonomics issue.
Worth a small brainstorm before committing: do we require users to declare them, auto-detect from the `Where` queries we see (lazy index updates), or just index all top-level extension fields by default? Each has trade-offs around index size, change-management, and surprise.

**Companion item to revisit:** if we go with Option A, the [Community-only deep-dive](2026-04-25-mongodb-adapter-deep-dive.md) is partially superseded — it should get a banner pointing at this doc as the canonical plan.

---

## 8. Development environment: local Atlas deployment via Docker 🟢

**Decision: this project uses the direct Docker path** (`mongodb/mongodb-atlas-local` image) for both local dev and CI. Fully free, fully offline-capable after the image pull, and runs the same `mongot` binary that production Atlas uses — so behavior parity is high.

### Why Docker, not the Atlas CLI

The Atlas CLI offers an `atlas local` command that wraps the same container, but it requires a free MongoDB Atlas account and `atlas auth login` before you can use it. We're skipping the CLI for three reasons:

1. **Zero-prereq contributors.** Anyone with Docker can clone, run tests, and submit a PR — no account creation, no browser-based OAuth dance.
2. **CI without secrets.** No `ATLAS_*` credentials in GitHub Actions, no service account to manage.
3. **One path, one set of docs.** Local dev and CI use literally the same `docker run` command.

The CLI is strictly a convenience wrapper — the underlying behavior is identical.

### Prerequisites

- **Docker.** That's it. No Atlas account, no Atlas CLI, no login.
- Docker Desktop 4.31+ on macOS/Windows, or Docker Engine 27+ / Podman 5+ on Linux.
- Min: 2 CPU cores, 2 GB free RAM.
- First run requires internet to pull the image (~few hundred MB); offline thereafter.

> **OrbStack:** widely reported to work as a Docker Desktop drop-in but not officially supported by MongoDB. Use at your own risk.

### Setup

```sh
docker run -d \
  --name vectorize-dev \
  -p 27017:27017 \
  mongodb/mongodb-atlas-local:latest
# → connection string: mongodb://localhost:27017/?directConnection=true
```

The image self-initializes the replica set and starts `mongot` on first boot. The first run takes ~10–30s before `$vectorSearch` is queryable; the test harness should poll and wait rather than assume immediate readiness (a sketch of such a helper follows below).

### Lifecycle

```sh
docker stop vectorize-dev    # pause
docker start vectorize-dev   # resume (state preserved)
docker rm -f vectorize-dev   # delete
```

For local dev, a `docker-compose.yml` at `adapters/mongodb/dev/` keeps the command short:

```yaml
services:
  mongodb-atlas:
    image: mongodb/mongodb-atlas-local:latest
    ports: ["27017:27017"]
    healthcheck:
      test: ["CMD", "mongosh", "--quiet", "--eval", "db.runCommand({ping:1})"]
      interval: 2s
      timeout: 5s
      retries: 30
```

Then `docker compose -f adapters/mongodb/dev/docker-compose.yml up -d`.
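A minimal sketch of such a poll-and-wait helper, assuming the `mongodb` Node driver and a `$listSearchIndexes` aggregation as the probe (the stage is served by `mongot`, so the working assumption is that it errors until the search process is wired up):

```ts
import { MongoClient } from 'mongodb'

export async function waitForVectorSearchReady(
  uri: string,
  timeoutMs = 60_000,
): Promise<void> {
  const client = new MongoClient(uri)
  const deadline = Date.now() + timeoutMs
  try {
    await client.connect()
    const db = client.db('readiness')
    // Make sure the probe collection exists so the stage actually reaches mongot.
    await db.createCollection('probe').catch(() => {})
    const probe = db.collection('probe')
    while (Date.now() < deadline) {
      try {
        await probe.aggregate([{ $listSearchIndexes: {} }]).toArray()
        return // mongot answered; vector search is queryable
      } catch {
        await new Promise((resolve) => setTimeout(resolve, 2_000))
      }
    }
    throw new Error(`mongot not ready after ${timeoutMs}ms`)
  } finally {
    await client.close()
  }
}
```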
+ +--- + +### What you get vs production Atlas + +| Aspect | Local deployment | Production Atlas | +|---|---|---| +| `$vectorSearch` aggregation | ✅ identical | ✅ | +| `createSearchIndexes` driver API | ✅ identical | ✅ | +| `filter` operators supported | ✅ identical list | ✅ | +| `mongot` binary | ✅ runs locally in container | ✅ Atlas-managed | +| Replica set | ✅ single-node, automatic | ✅ | +| Search Nodes (workload isolation) | ❌ same node only | ✅ on dedicated tiers | +| Network latency | ⚡ localhost | ms to cloud region | +| Cost | $0 | tier-dependent | +| Internet required | ❌ offline-capable (after image pull) | ✅ | + +The only behavioral gap that matters for our adapter: **Search Nodes vs same-node `mongot`** affects RAM available to the index, not query semantics. If our test suite passes against local, it will pass against any Atlas tier. The reverse isn't quite true (a query that performs well on Search Nodes might be too slow on a tiny local box), but that's a perf concern, not a correctness one. + +--- + +### Wiring into the test harness + +For `dev/specs/setup.ts`: + +```ts +// pseudo-code outline +const IMAGE = 'mongodb/mongodb-atlas-local:latest' +const CONTAINER = 'vectorize-test' + +export async function setupTestDeployment() { + // Idempotent + await sh`docker rm -f ${CONTAINER} || true` + await sh`docker run -d --name ${CONTAINER} -p 27017:27017 ${IMAGE}` + await waitForVectorSearchReady('mongodb://localhost:27017/?directConnection=true') + return 'mongodb://localhost:27017/?directConnection=true' +} + +export async function teardownTestDeployment() { + await sh`docker rm -f ${CONTAINER}` +} +``` + +Vitest `globalSetup` calls `setupTestDeployment` once per run; individual specs share the deployment. + +--- + +### CI considerations (GitHub Actions) + +This works cleanly on GitHub-hosted Ubuntu runners — Docker is preinstalled. No secrets, no credentials, no Atlas account in the org. + +Sketch: + +```yaml +jobs: + test-mongodb-adapter: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: pnpm/action-setup@v4 + - uses: actions/setup-node@v4 + with: { node-version: 22, cache: pnpm } + - run: pnpm install --frozen-lockfile + + - name: Start local Atlas deployment + run: | + docker run -d --name vectorize-test -p 27017:27017 \ + mongodb/mongodb-atlas-local:latest + # wait for mongot to be ready + for i in {1..30}; do + docker exec vectorize-test mongosh --quiet --eval 'db.runCommand({ping:1})' && break + sleep 2 + done + + - run: pnpm --filter @payloadcms-vectorize/mongodb test + env: + MONGODB_URI: mongodb://localhost:27017/?directConnection=true +``` + +Notes: +- First-run downloads the image (~few hundred MB). Cache via `actions/cache` keyed on the image tag if CI time becomes a concern. +- macOS / Windows runners: Docker isn't always reliable on hosted runners; stick with `ubuntu-latest`. +- No login, no `ATLAS_*` secrets needed. + +--- + +### When to graduate to a real Atlas cluster + +Only at two points in the lifecycle, and **never as a daily-driver dev environment**: + +1. **Pre-1.0 smoke test:** spin up a free M0 cluster once, run the compliance suite against it via `mongodb+srv://` to confirm the connection-string code path works against real Atlas. Tear it down. +2. **Search Nodes perf validation** (optional, only if a user reports perf issues): provision a dedicated tier with Search Nodes and benchmark. This costs real money — defer until there's a concrete reason. + +For day-to-day dev and CI, the local deployment is the path. 
+ +--- + +## Sources +- [$vectorSearch aggregation stage reference (Atlas)](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-stage/) +- [Run Vector Search Queries — Atlas](https://www.mongodb.com/docs/manual/reference/operator/aggregation/vectorsearch/) +- [Review Deployment Options — Atlas Vector Search](https://www.mongodb.com/docs/atlas/atlas-vector-search/deployment-options/) +- [Atlas Free Cluster Limits](https://www.mongodb.com/docs/atlas/reference/free-shared-limitations/) +- [MongoDB Pricing](https://www.mongodb.com/pricing) +- [Pre-filtering Data — MongoDB Search Lab](https://mongodb-developer.github.io/search-lab/docs/vector-search/filtering) +- [vectorSearch operator (within $search)](https://www.mongodb.com/docs/atlas/atlas-search/operators-collectors/vectorsearch/) +- [Create a Local Atlas Deployment with Docker](https://www.mongodb.com/docs/atlas/cli/current/atlas-cli-deploy-docker/) +- [`mongodb/mongodb-atlas-local` Docker image](https://hub.docker.com/r/mongodb/mongodb-atlas-local) diff --git a/docs/plans/2026-04-25-mongodb-unified-adapter-strategy.md b/docs/plans/2026-04-25-mongodb-unified-adapter-strategy.md new file mode 100644 index 0000000..9c3952a --- /dev/null +++ b/docs/plans/2026-04-25-mongodb-unified-adapter-strategy.md @@ -0,0 +1,247 @@ +# Strategy: one MongoDB adapter for both Atlas and Community Edition + +## TL;DR + +Ship **one** package — `@payloadcms-vectorize/mongodb` — that works against both MongoDB Atlas and self-hosted MongoDB Community Edition 8.2+. There is no technical reason to fork into two adapters: the `$vectorSearch` aggregation stage, the `createSearchIndexes` driver API, the filter operator subset, and the score projection are **identical** across both. Community runs the same `mongot` engine Atlas runs (source-available under SSPL since Jan 2026). + +The adapter code is single. The test suite is single. The README has two "Connecting" subsections — Atlas and self-hosted Community — and that is the *only* user-facing fork. + +--- + +## 1. What's actually shared (the entire surface) + +Everything the adapter does over the wire is identical between Atlas and Community: + +| Surface | Atlas | Community 8.2+ | Adapter handles it as | +|---|---|---|---| +| Driver | `mongodb` npm pkg | `mongodb` npm pkg | Single `MongoClient` | +| Vector query | `$vectorSearch` stage | `$vectorSearch` stage | One pipeline builder | +| Index API | `db.collection.createSearchIndexes(...)` | `db.collection.createSearchIndexes(...)` | One ensure-index helper | +| Filter operators | `$eq` `$ne` `$gt` `$gte` `$lt` `$lte` `$in` `$nin` `$exists` `$and` `$or` `$not` `$nor` | Same | One `convertWhereToMongo` | +| Score field | `$meta: "vectorSearchScore"` | `$meta: "vectorSearchScore"` | One `$project` stage | +| Filterable fields | Must be declared in index | Must be declared in index | One `filterableFields` config | +| Sync mechanism | Change Streams (Atlas-managed) | Change Streams (mongot subscribes) | Adapter doesn't care | + +The adapter never branches on "is this Atlas or Community" because nothing it does *can* differ between them. + +--- + +## 2. 
What differs (and why none of it touches code) + +The differences live entirely on the user's side of the connection string: + +| Concern | Atlas | Community 8.2+ | +|---|---|---| +| Connection string | `mongodb+srv://user:pw@cluster.mongodb.net/...` | `mongodb://localhost:27017/?directConnection=true` | +| How `mongot` is run | Atlas provisions and manages it | User runs `mongodb/mongodb-community-search` sidecar (or `mongodb/mongodb-atlas-local` Docker image which bundles it) | +| Replica set | Always (Atlas does it) | Required, even single-node — user runs `rs.initiate()` | +| Index propagation delay | Sub-second typically | Same (`mongot` subscribes to oplog) | +| Production readiness | GA | Public preview as of Jan 2026 | +| Auth | TLS + SCRAM via SRV | Local: none. Self-hosted prod: SCRAM/x509 | + +The adapter takes a `uri` and `dbName` and trusts the user to point them somewhere reachable. **All operational concerns are documented, not coded.** + +--- + +## 3. Where-clause translation: verified against Payload's `db-mongodb` adapter + +Cross-checked against [`payloadcms/payload/packages/db-mongodb/src/queries/`](https://github.com/payloadcms/payload/tree/main/packages/db-mongodb/src/queries) — specifically `operatorMap.ts`, `sanitizeQueryValue.ts`, `parseParams.ts`, `buildAndOrConditions.ts`. Payload's own adapter is the source of truth for what their `Where` shape means; the goal is to mirror their semantics so users get identical filter behavior between their CRUD queries and our vector search. + +### Operator coverage + +`$vectorSearch.filter` only accepts a strict subset of MQL ([Mongo docs](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-stage/)). Anything outside that subset must be split off and applied **after** the vector scan as an additional `$match` stage — the same `splitWhere` pattern the CF adapter uses. + +| Payload `Where` op | Payload mapping (from `db-mongodb`) | Allowed in `$vectorSearch.filter`? | Adapter strategy | +|---|---|---|---| +| `equals` | `$eq` | ✅ | pre-filter | +| `not_equals` | `$ne` | ✅ | pre-filter | +| `in` | `$in` | ✅ | pre-filter | +| `not_in` | `$nin` | ✅ | pre-filter | +| `greater_than` | `$gt` | ✅ | pre-filter | +| `greater_than_equal` | `$gte` | ✅ | pre-filter | +| `less_than` | `$lt` | ✅ | pre-filter | +| `less_than_equal` | `$lte` | ✅ | pre-filter | +| `exists` | `$and`/`$or` of `$exists`/`$ne null`/`$ne ''` (see `buildExistsQuery`) | ✅ (composed of allowed ops) | pre-filter | +| `like` | `$regex` + `$options: 'i'` + `escapeRegExp` | ❌ | **post-filter** | +| `contains` (scalar) | `$regex` + `$options: 'i'` + `escapeRegExp` | ❌ | **post-filter** | +| `contains` (hasMany array) | `$elemMatch` + `$regex` | ❌ | **post-filter** | +| `all` | `$all` | ❌ | **post-filter** | +| `near` / `within` / `intersects` | `$near` / `$geoWithin` / `$geoIntersects` | ❌ | unsupported in vector context — surface a clear error if the user tries it | +| `and` / `or` (case-insensitive) | `$and` / `$or` | ✅ | pre-filter, recurse | +| `not` / `nor` | (Mongo native, not in Payload's `operatorMap` but valid in `filter`) | ✅ | available if we ever surface them | + +### Six things Payload does that we must mirror + +1. **`escapeRegExp` on `like` / `contains`.** Payload imports `escapeRegExp` from the `payload` package and applies it before wrapping in `$regex`. Without it, user input like `foo.bar` matches `foozbar`. Reuse `payload`'s exported helper — don't roll our own. +2. 
**Case-insensitive substring by default.** Payload always sets `$options: 'i'` for `like`/`contains`. Match this so vector-search filtering behaves like CRUD filtering. +3. **Case-insensitive `and` / `or` keys.** `parseParams.ts` does `relationOrPath.toLowerCase() === 'and'`. Accept `and`/`AND`/`And` (and same for `or`). +4. **Multiple operators on the same path → wrap in `$and`.** When a single field has e.g. `{ greater_than: 5, less_than: 10 }`, Mongo's plain object form `{ field: { $gt: 5, $lt: 10 } }` works fine, **but** if Payload-style input collides (e.g. two predicates that would both write to the same path key), Payload promotes them into a `$and: [...]` to avoid object-key overwrite. Mirror this — it shows up when the same field appears under both an explicit predicate and inside a nested `and`. +5. **`exists` is a compound expression, not just `$exists`.** Payload's `buildExistsQuery` checks `$exists: true`, `$ne: null`, and (for most field types) `$ne: ''`. Empty strings are treated as missing. If we want behavior parity, we mirror that compound shape — all components are individually allowed in `filter`, so it stays pre-filterable. +6. **ObjectId casting on `_id` and relationship IDs.** Payload casts string IDs to `Types.ObjectId` for queries. Our chunks store `docId` as the raw string we received from Payload. If a user filters `where: { docId: { equals: '<24-hex>' } }` and the source collection uses ObjectId IDs, we need to cast the comparison value. Lift the casting logic from Payload's `sanitizeQueryValue.ts` (or call it directly if we depend on `payload` as a peer dep — which we already do). + +### What we got wrong in the earlier deep-dives + +Two corrections vs the original [community](./2026-04-25-mongodb-adapter-deep-dive.md) and [Atlas](./2026-04-25-mongodb-atlas-adapter-deep-dive.md) deep-dives: + +- The original tables omitted Payload's **`all`** operator. It maps to `$all`, which is **not** in the `$vectorSearch.filter` allowlist → must be post-filtered. +- The original tables said `like` and `contains` "need post-filtering — same split-pre/post pattern the CF adapter uses." That is correct, but understated the implementation work: Payload uses `$regex` with `$options: 'i'` **and** `escapeRegExp`. Our post-filter `$match` stage must reproduce that exactly, not just naive substring matching. + +### What stays simpler than PG + +Even with the post-filter list, this is *still* the easiest of the three adapters because: +- No SQL escaping (Mongo takes a JS object). +- The 38-test `vectorSearchWhere` suite was written backend-agnostic — assertions are on result IDs and ordering, not on SQL strings. It ports as-is. +- Payload's own `db-mongodb` source is permissively licensed; we can lift `convertWhereToMongo` logic almost verbatim, with attribution. + +--- + +## 4. Public API (single, unified) + +```ts +import { createMongoVectorIntegration } from '@payloadcms-vectorize/mongodb' + +const { adapter } = createMongoVectorIntegration({ + uri: process.env.MONGODB_URI!, // works for both Atlas and Community + dbName: 'payload', + knowledgePools: [ + { + name: 'articles', + sourceCollections: ['articles', 'pages'], + embeddingModel: 'text-embedding-3-small', + dimensions: 1536, + similarity: 'cosine', + // Pre-declared so the search index can filter on them at scan time. + // Same on Atlas and Community. + filterableFields: ['status', 'category', 'publishedAt', 'tags'], + }, + ], +}) +``` + +There is no `mode: 'atlas' | 'community'` flag. There is no `transport` switch. 
The user's `MONGODB_URI` is the *only* thing that determines which backend they're hitting, and the adapter doesn't need to know.

---

## 5. Package layout (single)

```
adapters/mongodb/
├── package.json              # @payloadcms-vectorize/mongodb
│                             # peer deps: payload, payloadcms-vectorize, mongodb (>=6.x)
├── src/
│   ├── index.ts              # createMongoVectorIntegration({ uri, dbName, knowledgePools })
│   ├── search.ts             # search() → $vectorSearch pipeline
│   ├── convertWhere.ts       # Where → Mongo filter (with split-pre/post for unsupported ops)
│   ├── embed.ts              # storeChunk / deleteChunks / hasEmbeddingVersion
│   ├── indexes.ts            # ensureSearchIndex (createSearchIndexes if missing)
│   └── types.ts              # MongoConfig, KnowledgePool, similarity choice, defaults
├── dev/
│   ├── docker-compose.yml    # mongodb/mongodb-atlas-local — used for BOTH local dev and CI
│   └── specs/
│       ├── compliance.spec.ts        # ported from PG
│       └── vectorSearchWhere.spec.ts # ported from PG (38 tests)
└── README.md                 # see §6
```

Notable: there is no `adapters/mongodb-atlas/` and no `adapters/mongodb-community/`. One directory, one `package.json`, one published artifact on npm.

---

## 6. README structure

The README is single, but has two subsections under "Connecting":

```markdown
# @payloadcms-vectorize/mongodb

Vector search adapter for PayloadCMS, backed by MongoDB's `$vectorSearch`.
Works against MongoDB Atlas (GA) and self-hosted MongoDB Community 8.2+ (public preview).

## Install
npm install @payloadcms-vectorize/mongodb mongodb

## Configure
[single createMongoVectorIntegration example]

## Connecting

### → MongoDB Atlas
1. Create a cluster (M10+ recommended for production; M0/Flex fine for dev).
2. Database Access → create a user with `readWrite` on your DB.
3. Network Access → allow your IP (or 0.0.0.0/0 for dev only).
4. Copy the connection string (Drivers → Node).
5. Set `MONGODB_URI=mongodb+srv://user:pw@cluster.xxxxx.mongodb.net/payload`

### → Self-hosted MongoDB Community 8.2+
> ⚠️ Public preview as of Jan 2026 — Mongo labels this "not for production."

You need `mongod` 8.2+ running as a replica set, plus the `mongot` sidecar.
The simplest path is the all-in-one Docker image:

docker run -d --name mongo -p 27017:27017 mongodb/mongodb-atlas-local:latest

Then: `MONGODB_URI=mongodb://localhost:27017/payload?directConnection=true`

For production self-hosted, see [MongoDB's mongot deployment guide].

## Filterable fields
[explain filterableFields config — applies to both backends identically]

## Index lifecycle
[explain createSearchIndexes async behavior — same for both]
```

That's the entire fork. Two subsections under one heading.

---

## 7. Test strategy (single suite, single backend)

The test suite runs against **`mongodb/mongodb-atlas-local`** for both local dev and CI. This image bundles `mongod` + `mongot` + replica-set init in one container and is the same `mongot` build Atlas ships. Tests that pass against it pass against Atlas — that's the entire point of the image.

We do **not** maintain a parallel test job against Atlas. Reasons:
- Adapter has no Atlas-vs-Community branches to cover.
- Atlas in CI requires a paid project, IP allowlisting from GitHub runners, and per-PR cluster lifecycle. Real cost, zero adapter coverage gained.
- If Atlas ever diverges from the local image's `mongot`, that's a Mongo-side regression, not ours.
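Since the index lifecycle is one of those identical surfaces, §5's `indexes.ts` reduces to create-if-missing plus the readiness poll that §8 below calls out. A sketch, assuming the Node driver's `createSearchIndex` / `listSearchIndexes` (driver ≥6.x) and the `status` / `queryable` fields Atlas currently reports:

```ts
import type { Collection, Document } from 'mongodb'

export async function ensureSearchIndex(
  coll: Collection,
  name: string,
  dimensions: number,
  similarity: 'cosine' | 'euclidean' | 'dotProduct',
  filterableFields: string[],
): Promise<void> {
  const existing = await coll.listSearchIndexes(name).toArray()
  if (existing.length === 0) {
    const fields: Document[] = [
      { type: 'vector', path: 'embedding', numDimensions: dimensions, similarity },
      // Reserved fields plus user-declared extension fields, all pre-filterable.
      ...['sourceCollection', 'docId', 'embeddingVersion', ...filterableFields].map(
        (path) => ({ type: 'filter', path }),
      ),
    ]
    await coll.createSearchIndex({ name, type: 'vectorSearch', definition: { fields } })
  }
  // Creation is async on both backends; poll until the index is queryable.
  // (A production version would add a timeout instead of looping forever.)
  for (;;) {
    const [idx] = await coll.listSearchIndexes(name).toArray()
    if (idx && (idx.status === 'READY' || idx.queryable)) return
    await new Promise((resolve) => setTimeout(resolve, 1_000))
  }
}
```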
**Smoke check before each release:** one manual `npm test` run pointed at a real Atlas M0 by setting `MONGODB_URI`. Catches any drift. Documented in `RELEASING.md`, not automated.

---

## 8. Where the (small) Atlas/Community asymmetries actually live

For completeness — these are the things a contributor might *think* should be branched but don't need to be:

- **Replica set init.** `mongodb/mongodb-atlas-local` does it for you. Production self-hosted docs tell users to do it. Adapter never touches it.
- **`mongot` port (27028).** Internal to the Mongo deployment. The driver only ever talks to `mongod` on 27017.
- **Index sync delay.** Same on both — the adapter's `ensureSearchIndex` polls `listSearchIndexes` until status is `READY` regardless of backend.
- **Free-tier quirks (M0/Flex).** Some Atlas free tiers cap search index count or vector dimensions. That's a user-side limit; the adapter surfaces the Mongo error verbatim.
- **`$regex` inside `filter`.** Not supported on either backend. Both use the same `splitWhere` post-filter pattern (already proven in the CF adapter).

---

## 9. Versioning and the preview disclaimer

Self-hosted Community vector search is **public preview** as of Jan 2026; Atlas vector search is **GA**. The adapter itself is GA-quality against either, but we ship `^0.x` and label it `experimental` until Mongo's Community vector search reaches GA. Bumping to `1.0` is gated on Mongo's announcement, not on adapter maturity.

The README's Community subsection carries the preview warning. The Atlas subsection does not. Same code, different runtime maturity — documented honestly.

---

## 10. Summary checklist for the contributor

To ship this:

- [ ] Scaffold `adapters/mongodb/` with the layout in §5
- [ ] Implement `createMongoVectorIntegration` with the §4 signature (no backend flag)
- [ ] Port `convertWhereToDrizzle` → `convertWhereToMongo`, mirroring Payload's `db-mongodb/queries/` (operator map, `escapeRegExp` + `$options:'i'` on `like`/`contains`, case-insensitive `and`/`or` keys, compound `exists`, ObjectId casting on `_id`/`docId`)
- [ ] Use the `splitWhere` pattern from the CF adapter to split pre-filter (allowed in `$vectorSearch.filter`) vs post-filter (`like`, `contains`, `all`, geo) predicates
- [ ] Implement `ensureSearchIndex` against `createSearchIndexes` + poll `listSearchIndexes` until `READY`
- [ ] Add `dev/docker-compose.yml` using `mongodb/mongodb-atlas-local:latest`
- [ ] Port the 38-test `vectorSearchWhere` suite from PG
- [ ] Port `compliance.spec.ts` from PG
- [ ] Write the README with the two "Connecting" subsections in §6
- [ ] Add a release-time smoke checklist for Atlas M0 in `RELEASING.md`
- [ ] Publish at `^0.x` with `experimental` tag in keywords

No fork. No `mongodb-atlas` package. No `mongodb-community` package. One adapter, two onboarding paths in the README.
diff --git a/docs/superpowers/plans/2026-04-25-mongodb-adapter.md b/docs/superpowers/plans/2026-04-25-mongodb-adapter.md
new file mode 100644
index 0000000..5f9c834
--- /dev/null
+++ b/docs/superpowers/plans/2026-04-25-mongodb-adapter.md
@@ -0,0 +1,3035 @@
+# MongoDB Adapter Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+ +**Goal:** Ship `@payloadcms-vectorize/mongodb` — a `DbAdapter` for `payloadcms-vectorize` that targets MongoDB Atlas and self-hosted MongoDB Community 8.2+ via the unified `$vectorSearch` aggregation stage. + +**Architecture:** Single npm package under `adapters/mongodb/` mirroring the layout of `adapters/cf/`. The adapter holds a lazy singleton `MongoClient`, manages one Mongo collection per knowledge pool, lazily ensures a `vectorSearch` index, and translates Payload `Where` clauses into a pre-filter (inside `$vectorSearch.filter`) plus a post-filter (`$match` after the vector scan). No Payload `CollectionConfig` is registered — vector documents are managed via the raw MongoDB driver. + +**Tech Stack:** TypeScript, Node.js MongoDB driver (`mongodb`), `vitest`, `payload` 3.x peerDep, `mongodb/mongodb-atlas-local` Docker image for local dev + CI. + +**Spec:** [`docs/superpowers/specs/2026-04-25-mongodb-adapter.md`](../specs/2026-04-25-mongodb-adapter.md). + +--- + +## File Structure + +``` +adapters/mongodb/ +├── package.json # Task 1 +├── tsconfig.build.json # Task 1 +├── vitest.config.ts # Task 1 +├── README.md # Task 17 +├── src/ +│ ├── escapeRegExp.ts # Task 2 — pure utility +│ ├── types.ts # Task 3 — public types, getMongoConfig() helper +│ ├── client.ts # Task 4 — lazy singleton MongoClient + __closeForTests +│ ├── convertWhere.ts # Tasks 5–8 — pre/post-filter splitter +│ ├── indexes.ts # Task 9 — ensureSearchIndex + cache +│ ├── embed.ts # Task 10 — storeChunk +│ ├── search.ts # Tasks 11–12 — search aggregation +│ └── index.ts # Task 13 — createMongoVectorIntegration wiring +└── dev/ + ├── docker-compose.yml # Task 14 + └── specs/ + ├── constants.ts # Task 15 + ├── utils.ts # Task 15 — waitForVectorSearchReady, dropDb + ├── compliance.spec.ts # Task 15 + ├── vectorSearchWhere.spec.ts # Task 16 + └── integration.spec.ts # Task 16 +``` + +Top-level files touched: +- `package.json` — add `build:adapters:mongodb`, `test:adapters:mongodb`, chain into `build:adapters`. (Task 18) +- `.changeset/config.json` — add `@payloadcms-vectorize/mongodb` to the `fixed` array. (Task 18) +- `.github/workflows/ci.yml` — add `test_adapters_mongodb` job. 
(Task 19) + +--- + +## Task 1: Package skeleton (`adapters/mongodb/package.json`, tsconfig, vitest) + +**Files:** +- Create: `adapters/mongodb/package.json` +- Create: `adapters/mongodb/tsconfig.build.json` +- Create: `adapters/mongodb/vitest.config.ts` + +- [ ] **Step 1: Write `adapters/mongodb/package.json`** + +```json +{ + "name": "@payloadcms-vectorize/mongodb", + "version": "0.7.2", + "description": "MongoDB Atlas + self-hosted vectorSearch adapter for payloadcms-vectorize", + "license": "MIT", + "repository": { + "type": "git", + "url": "git+https://github.com/techiejd/payloadcms-vectorize.git", + "directory": "adapters/mongodb" + }, + "homepage": "https://github.com/techiejd/payloadcms-vectorize/tree/main/adapters/mongodb#readme", + "bugs": { + "url": "https://github.com/techiejd/payloadcms-vectorize/issues" + }, + "type": "module", + "files": [ + "dist", + "README.md" + ], + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js", + "default": "./dist/index.js" + } + }, + "scripts": { + "test:setup": "docker-compose -f dev/docker-compose.yml up -d", + "test:teardown": "docker-compose -f dev/docker-compose.yml down" + }, + "keywords": [ + "payloadcms", + "mongodb", + "vector-search", + "rag", + "experimental" + ], + "peerDependencies": { + "mongodb": ">=6.0.0", + "payload": ">=3.0.0 <4.0.0", + "payloadcms-vectorize": ">=0.7.2" + }, + "devDependencies": { + "mongodb": "^6.10.0", + "payloadcms-vectorize": "workspace:*" + }, + "engines": { + "node": "^18.20.2 || >=20.9.0", + "pnpm": "^9 || ^10" + }, + "publishConfig": { + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js", + "default": "./dist/index.js" + } + }, + "main": "./dist/index.js", + "types": "./dist/index.d.ts" + } +} +``` + +- [ ] **Step 2: Write `adapters/mongodb/tsconfig.build.json`** + +```json +{ + "extends": "../tsconfig.adapter.json" +} +``` + +- [ ] **Step 3: Write `adapters/mongodb/vitest.config.ts`** (mirrors `adapters/cf/vitest.config.ts`) + +```ts +import path from 'path' +import { loadEnv } from 'payload/node' +import { fileURLToPath } from 'url' +import tsconfigPaths from 'vite-tsconfig-paths' +import { defineConfig } from 'vitest/config' + +const filename = fileURLToPath(import.meta.url) +const dirname = path.dirname(filename) + +export default defineConfig(() => { + loadEnv(path.resolve(dirname, '../../dev')) + + return { + plugins: [ + tsconfigPaths({ + ignoreConfigErrors: true, + }), + ], + resolve: { + alias: { + 'payloadcms-vectorize': path.resolve(dirname, '../../src/index.ts'), + }, + }, + test: { + root: dirname, + environment: 'node', + hookTimeout: 120_000, + testTimeout: 120_000, + include: ['dev/specs/**/*.spec.ts'], + exclude: ['**/e2e.spec.{ts,js}', '**/node_modules/**'], + fileParallelism: false, + }, + } +}) +``` + +- [ ] **Step 4: Install workspace deps** + +Run: `pnpm install` +Expected: `mongodb` and `payloadcms-vectorize` linked under `adapters/mongodb/node_modules/`. No errors. + +- [ ] **Step 5: Verify build skeleton compiles** + +Run: `cd adapters/mongodb && pnpm exec tsc -p tsconfig.build.json --noEmit` +Expected: PASS (no `src/` files yet, but config must parse). 
- [ ] **Step 6: Commit**

```bash
git add adapters/mongodb/package.json adapters/mongodb/tsconfig.build.json adapters/mongodb/vitest.config.ts pnpm-lock.yaml
git commit -m "feat(mongodb): scaffold adapter package skeleton"
```

---

## Task 2: `escapeRegExp` utility

**Files:**
- Create: `adapters/mongodb/src/escapeRegExp.ts`
- Create: `adapters/mongodb/dev/specs/escapeRegExp.spec.ts`

- [ ] **Step 1: Write the failing test**

```ts
// adapters/mongodb/dev/specs/escapeRegExp.spec.ts
import { describe, expect, test } from 'vitest'
import { escapeRegExp } from '../../src/escapeRegExp.js'

describe('escapeRegExp', () => {
  test('escapes regex metacharacters', () => {
    expect(escapeRegExp('foo.bar')).toBe('foo\\.bar')
    expect(escapeRegExp('a*b')).toBe('a\\*b')
    expect(escapeRegExp('(x)')).toBe('\\(x\\)')
    expect(escapeRegExp('a+b?c')).toBe('a\\+b\\?c')
    expect(escapeRegExp('[abc]')).toBe('\\[abc\\]')
    expect(escapeRegExp('a\\b')).toBe('a\\\\b')
    expect(escapeRegExp('a^b$')).toBe('a\\^b\\$')
    expect(escapeRegExp('a|b')).toBe('a\\|b')
    expect(escapeRegExp('{1,2}')).toBe('\\{1,2\\}')
  })

  test('returns plain string unchanged', () => {
    expect(escapeRegExp('hello world')).toBe('hello world')
    expect(escapeRegExp('')).toBe('')
  })
})
```

- [ ] **Step 2: Run test to verify it fails**

Run: `cd adapters/mongodb && pnpm exec vitest run dev/specs/escapeRegExp.spec.ts`
Expected: FAIL — module `../../src/escapeRegExp.js` not found.

- [ ] **Step 3: Write the implementation**

```ts
// adapters/mongodb/src/escapeRegExp.ts
export function escapeRegExp(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
}
```

- [ ] **Step 4: Run test to verify it passes**

Run: `cd adapters/mongodb && pnpm exec vitest run dev/specs/escapeRegExp.spec.ts`
Expected: PASS, 2 tests.

- [ ] **Step 5: Commit**

```bash
git add adapters/mongodb/src/escapeRegExp.ts adapters/mongodb/dev/specs/escapeRegExp.spec.ts
git commit -m "feat(mongodb): add escapeRegExp utility"
```

---

## Task 3: Public types + `getMongoConfig` helper

**Files:**
- Create: `adapters/mongodb/src/types.ts`

- [ ] **Step 1: Write `adapters/mongodb/src/types.ts`**

```ts
import type { BasePayload } from 'payload'
import { getVectorizedPayload } from 'payloadcms-vectorize'

export type Similarity = 'cosine' | 'euclidean' | 'dotProduct'

export interface MongoPoolConfig {
  /** Vector dimensions for this pool (must match embedding model output). */
  dimensions: number
  /** Similarity metric for the search index. Default 'cosine'. */
  similarity?: Similarity
  /** ANN candidate set size. Default at search time: max(limit * 20, 100). */
  numCandidates?: number
  /** Extension fields to declare as filterable in the search index. */
  filterableFields?: string[]
  /** ENN exact search (full scan) instead of HNSW ANN. Default false. */
  forceExact?: boolean
  /** Override Mongo collection name. Default `vectorize_${poolName}`. */
  collectionName?: string
  /** Override search index name. Default `${collectionName}_idx`. */
  indexName?: string
}

export interface MongoVectorIntegrationConfig {
  /** Any valid MongoDB connection string (Atlas SRV or self-hosted). */
  uri: string
  /** Database that holds the per-pool vector collections. */
  dbName: string
  /** Pools keyed by knowledge pool name. */
  pools: Record<string, MongoPoolConfig>
}

/** Resolved per-pool config used internally (defaults applied). */
export interface ResolvedPoolConfig {
  dimensions: number
  similarity: Similarity
  numCandidates?: number
  filterableFields: string[]
  forceExact: boolean
  collectionName: string
  indexName: string
}

/**
 * Stored in `getConfigExtension().custom._mongoConfig` so `search()` can
 * recover the same config from a `BasePayload` instance.
 */
export interface MongoConfigCustom {
  uri: string
  dbName: string
  pools: Record<string, ResolvedPoolConfig>
}

export const RESERVED_FILTER_FIELDS = [
  'sourceCollection',
  'docId',
  'embeddingVersion',
] as const

export const RESERVED_FIELDS = [
  'sourceCollection',
  'docId',
  'chunkIndex',
  'chunkText',
  'embeddingVersion',
  'embedding',
] as const

export function resolvePoolConfig(
  poolName: string,
  cfg: MongoPoolConfig,
): ResolvedPoolConfig {
  const collectionName = cfg.collectionName ?? `vectorize_${poolName}`
  return {
    dimensions: cfg.dimensions,
    similarity: cfg.similarity ?? 'cosine',
    numCandidates: cfg.numCandidates,
    filterableFields: cfg.filterableFields ?? [],
    forceExact: cfg.forceExact ?? false,
    collectionName,
    indexName: cfg.indexName ?? `${collectionName}_idx`,
  }
}

export function getMongoConfig(payload: BasePayload): MongoConfigCustom {
  const cfg = getVectorizedPayload(payload)?.getDbAdapterCustom()
    ?._mongoConfig as MongoConfigCustom | undefined
  if (!cfg) {
    throw new Error('[@payloadcms-vectorize/mongodb] _mongoConfig not found on payload — did you register the adapter?')
  }
  return cfg
}
```

- [ ] **Step 2: Verify it compiles**

Run: `cd adapters/mongodb && pnpm exec tsc -p tsconfig.build.json --noEmit`
Expected: PASS.

- [ ] **Step 3: Commit**

```bash
git add adapters/mongodb/src/types.ts
git commit -m "feat(mongodb): add public types and config helpers"
```

---

## Task 4: Lazy singleton MongoClient

**Files:**
- Create: `adapters/mongodb/src/client.ts`

- [ ] **Step 1: Write `adapters/mongodb/src/client.ts`**

```ts
import { MongoClient } from 'mongodb'

const clientCache = new Map<string, Promise<MongoClient>>()

export function getMongoClient(uri: string): Promise<MongoClient> {
  let p = clientCache.get(uri)
  if (!p) {
    p = MongoClient.connect(uri)
    clientCache.set(uri, p)
  }
  return p
}

/**
 * Test-only helper. NOT exported from `index.ts` — referenced by the dev test
 * suites via deep import to avoid leaking into the published API.
 */
export async function __closeForTests(): Promise<void> {
  const promises = Array.from(clientCache.values())
  clientCache.clear()
  for (const p of promises) {
    try {
      const c = await p
      await c.close()
    } catch {
      // ignore; client may not have connected
    }
  }
}
```

- [ ] **Step 2: Verify it compiles**

Run: `cd adapters/mongodb && pnpm exec tsc -p tsconfig.build.json --noEmit`
Expected: PASS.

- [ ] **Step 3: Commit**

```bash
git add adapters/mongodb/src/client.ts
git commit -m "feat(mongodb): add lazy singleton MongoClient with test close helper"
```

---

## Task 5: `convertWhereToMongo` — pre-filter operators on a leaf

**Files:**
- Create: `adapters/mongodb/src/convertWhere.ts`
- Create: `adapters/mongodb/dev/specs/convertWhere.spec.ts`

This task implements the pre-filter operator branch only — `equals`, `not_equals`/`notEquals`, `in`, `not_in`/`notIn`, `gt/gte/lt/lte` (both spellings), and `exists`. Subsequent tasks (6–8) layer on `like`/`contains` post-filter, `and`/`or` recursion, and field validation.
+ +- [ ] **Step 1: Write the failing test** + +```ts +// adapters/mongodb/dev/specs/convertWhere.spec.ts +import { describe, expect, test } from 'vitest' +import { convertWhereToMongo } from '../../src/convertWhere.js' + +const FILTERABLE = ['status', 'category', 'views', 'rating', 'published', 'tags'] + +describe('convertWhereToMongo — pre-filter operators', () => { + test('equals', () => { + expect( + convertWhereToMongo({ status: { equals: 'published' } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { status: { $eq: 'published' } }, postFilter: null }) + }) + + test('not_equals (snake) and notEquals (camel)', () => { + expect( + convertWhereToMongo({ status: { not_equals: 'draft' } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { status: { $ne: 'draft' } }, postFilter: null }) + expect( + convertWhereToMongo({ status: { notEquals: 'draft' } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { status: { $ne: 'draft' } }, postFilter: null }) + }) + + test('in / not_in / notIn', () => { + expect( + convertWhereToMongo({ status: { in: ['a', 'b'] } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { status: { $in: ['a', 'b'] } }, postFilter: null }) + expect( + convertWhereToMongo({ status: { not_in: ['a'] } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { status: { $nin: ['a'] } }, postFilter: null }) + expect( + convertWhereToMongo({ status: { notIn: ['a'] } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { status: { $nin: ['a'] } }, postFilter: null }) + }) + + test('greater_than / greaterThan / less_than_equal etc.', () => { + expect( + convertWhereToMongo({ views: { greater_than: 100 } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { views: { $gt: 100 } }, postFilter: null }) + expect( + convertWhereToMongo({ views: { greaterThan: 100 } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { views: { $gt: 100 } }, postFilter: null }) + expect( + convertWhereToMongo({ views: { greater_than_equal: 100 } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { views: { $gte: 100 } }, postFilter: null }) + expect( + convertWhereToMongo({ views: { less_than: 100 } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { views: { $lt: 100 } }, postFilter: null }) + expect( + convertWhereToMongo({ views: { less_than_equal: 100 } }, FILTERABLE, 'p1'), + ).toEqual({ preFilter: { views: { $lte: 100 } }, postFilter: null }) + }) + + test('exists true → $exists + $ne null', () => { + expect( + convertWhereToMongo({ category: { exists: true } }, FILTERABLE, 'p1'), + ).toEqual({ + preFilter: { category: { $exists: true, $ne: null } }, + postFilter: null, + }) + }) + + test('exists false → $exists false OR $eq null', () => { + expect( + convertWhereToMongo({ category: { exists: false } }, FILTERABLE, 'p1'), + ).toEqual({ + preFilter: { $or: [{ category: { $exists: false } }, { category: { $eq: null } }] }, + postFilter: null, + }) + }) + + test('multiple operators on same field combine via $and', () => { + const result = convertWhereToMongo( + { views: { greater_than: 50, less_than: 200 } }, + FILTERABLE, + 'p1', + ) + expect(result).toEqual({ + preFilter: { $and: [{ views: { $gt: 50 } }, { views: { $lt: 200 } }] }, + postFilter: null, + }) + }) + + test('reserved field always usable even when filterableFields is empty', () => { + expect( + convertWhereToMongo( + { sourceCollection: { equals: 'articles' } }, + [], + 'p1', + ), + ).toEqual({ + preFilter: { sourceCollection: { $eq: 'articles' } }, + postFilter: null, + }) + }) +}) +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd adapters/mongodb && pnpm exec vitest 
run dev/specs/convertWhere.spec.ts`
+Expected: FAIL — `convertWhereToMongo` not exported.
+
+- [ ] **Step 3: Write the minimal implementation**
+
+```ts
+// adapters/mongodb/src/convertWhere.ts
+import type { Where } from 'payload'
+import { RESERVED_FILTER_FIELDS } from './types.js'
+
+export interface ConvertResult {
+  preFilter: Record<string, unknown> | null
+  postFilter: Where | null
+}
+
+const PRE_OPS = new Map([
+  ['equals', '$eq'],
+  ['not_equals', '$ne'],
+  ['notEquals', '$ne'],
+  ['in', '$in'],
+  ['not_in', '$nin'],
+  ['notIn', '$nin'],
+  ['greater_than', '$gt'],
+  ['greaterThan', '$gt'],
+  ['greater_than_equal', '$gte'],
+  ['greaterThanEqual', '$gte'],
+  ['less_than', '$lt'],
+  ['lessThan', '$lt'],
+  ['less_than_equal', '$lte'],
+  ['lessThanEqual', '$lte'],
+])
+
+const POST_OPS = new Set(['like', 'contains', 'all'])
+const UNSUPPORTED_OPS = new Set(['near', 'within', 'intersects'])
+
+function isFilterable(field: string, filterable: string[]): boolean {
+  return (
+    (RESERVED_FILTER_FIELDS as readonly string[]).includes(field) ||
+    filterable.includes(field)
+  )
+}
+
+function leafToPre(field: string, cond: Record<string, unknown>): Record<string, unknown> {
+  const clauses: Record<string, unknown>[] = []
+  for (const [op, val] of Object.entries(cond)) {
+    if (op === 'exists') {
+      if (val === true) {
+        clauses.push({ [field]: { $exists: true, $ne: null } })
+      } else {
+        clauses.push({ $or: [{ [field]: { $exists: false } }, { [field]: { $eq: null } }] })
+      }
+      continue
+    }
+    const mongoOp = PRE_OPS.get(op)
+    if (!mongoOp) continue
+    clauses.push({ [field]: { [mongoOp]: val } })
+  }
+  if (clauses.length === 0) return {}
+  if (clauses.length === 1) return clauses[0]
+  return { $and: clauses }
+}
+
+export function convertWhereToMongo(
+  where: Where,
+  filterable: string[],
+  poolName: string,
+): ConvertResult {
+  // Single-field leaf with only pre-filter operators (the simple, most-common path).
+  const keys = Object.keys(where).filter((k) => k !== 'and' && k !== 'or')
+  if (keys.length === 1) {
+    const field = keys[0]
+    const cond = where[field] as Record<string, unknown>
+    if (!isFilterable(field, filterable)) {
+      throw new Error(
+        `[@payloadcms-vectorize/mongodb] Field "${field}" is not configured as filterableFields for pool "${poolName}"`,
+      )
+    }
+    for (const op of Object.keys(cond)) {
+      if (UNSUPPORTED_OPS.has(op)) {
+        throw new Error(
+          `[@payloadcms-vectorize/mongodb] Operator "${op}" is not supported`,
+        )
+      }
+    }
+    const onlyPreOps = Object.keys(cond).every(
+      (op) => PRE_OPS.has(op) || op === 'exists',
+    )
+    if (onlyPreOps) {
+      return { preFilter: leafToPre(field, cond), postFilter: null }
+    }
+  }
+  // Tasks 6–8 expand this; for now, throw for unimplemented paths.
+  throw new Error('[@payloadcms-vectorize/mongodb] convertWhereToMongo: path not implemented yet')
+}
+
+// POST_OPS is referenced by Task 6 — silences TS unused-symbol warnings until then.
+void POST_OPS
+```
+
+- [ ] **Step 4: Run tests to verify they pass**
+
+Run: `cd adapters/mongodb && pnpm exec vitest run dev/specs/convertWhere.spec.ts`
+Expected: PASS, 8 tests.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add adapters/mongodb/src/convertWhere.ts adapters/mongodb/dev/specs/convertWhere.spec.ts
+git commit -m "feat(mongodb): convertWhereToMongo handles pre-filter leaf operators"
+```
+
+---
+
+## Task 6: `convertWhereToMongo` — post-filter operators (`like`, `contains`, `all`)
+
+**Files:**
+- Modify: `adapters/mongodb/src/convertWhere.ts`
+- Modify: `adapters/mongodb/dev/specs/convertWhere.spec.ts`
+
+- [ ] **Step 1: Add failing tests for post-filter operators**
+
+Append to `adapters/mongodb/dev/specs/convertWhere.spec.ts`:
+
+```ts
+describe('convertWhereToMongo — post-filter operators', () => {
+  test('like routes the whole leaf to post-filter (verbatim Where)', () => {
+    expect(
+      convertWhereToMongo({ tags: { like: 'javascript' } }, FILTERABLE, 'p1'),
+    ).toEqual({
+      preFilter: null,
+      postFilter: { tags: { like: 'javascript' } },
+    })
+  })
+
+  test('contains routes the whole leaf to post-filter', () => {
+    expect(
+      convertWhereToMongo({ category: { contains: 'tech' } }, FILTERABLE, 'p1'),
+    ).toEqual({
+      preFilter: null,
+      postFilter: { category: { contains: 'tech' } },
+    })
+  })
+
+  test('mixed pre + post operators on same leaf → entire leaf goes to post', () => {
+    expect(
+      convertWhereToMongo(
+        { tags: { equals: 'a', like: 'javascript' } },
+        FILTERABLE,
+        'p1',
+      ),
+    ).toEqual({
+      preFilter: null,
+      postFilter: { tags: { equals: 'a', like: 'javascript' } },
+    })
+  })
+
+  test('all routes to post-filter', () => {
+    expect(
+      convertWhereToMongo({ tags: { all: ['a', 'b'] } }, FILTERABLE, 'p1'),
+    ).toEqual({
+      preFilter: null,
+      postFilter: { tags: { all: ['a', 'b'] } },
+    })
+  })
+
+  test('unsupported geo op throws', () => {
+    expect(() =>
+      convertWhereToMongo({ loc: { near: [0, 0] } }, ['loc'], 'p1'),
+    ).toThrowError(/not supported/)
+  })
+})
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run: `cd adapters/mongodb && pnpm exec vitest run dev/specs/convertWhere.spec.ts`
+Expected: FAIL — leaf with `like`/`contains`/`all` currently throws "not implemented".
+
+- [ ] **Step 3: Update `convertWhereToMongo` to handle post-filter leaves**
+
+Replace the body of `convertWhereToMongo` in `adapters/mongodb/src/convertWhere.ts` with:
+
+```ts
+export function convertWhereToMongo(
+  where: Where,
+  filterable: string[],
+  poolName: string,
+): ConvertResult {
+  const keys = Object.keys(where).filter((k) => k !== 'and' && k !== 'or')
+  if (keys.length === 1 && !('and' in where) && !('or' in where)) {
+    const field = keys[0]
+    const cond = where[field] as Record<string, unknown>
+    if (!isFilterable(field, filterable)) {
+      throw new Error(
+        `[@payloadcms-vectorize/mongodb] Field "${field}" is not configured as filterableFields for pool "${poolName}"`,
+      )
+    }
+    for (const op of Object.keys(cond)) {
+      if (UNSUPPORTED_OPS.has(op)) {
+        throw new Error(
+          `[@payloadcms-vectorize/mongodb] Operator "${op}" is not supported`,
+        )
+      }
+    }
+    const hasPostOp = Object.keys(cond).some((op) => POST_OPS.has(op))
+    if (hasPostOp) {
+      return { preFilter: null, postFilter: { [field]: cond } as Where }
+    }
+    return { preFilter: leafToPre(field, cond), postFilter: null }
+  }
+  throw new Error('[@payloadcms-vectorize/mongodb] convertWhereToMongo: and/or not implemented yet')
+}
+```
+
+- [ ] **Step 4: Run tests to verify they pass**
+
+Run: `cd adapters/mongodb && pnpm exec vitest run dev/specs/convertWhere.spec.ts`
+Expected: PASS, 13 tests total.
+ +- [ ] **Step 5: Commit** + +```bash +git add adapters/mongodb/src/convertWhere.ts adapters/mongodb/dev/specs/convertWhere.spec.ts +git commit -m "feat(mongodb): convertWhereToMongo routes like/contains/all to post-filter" +``` + +--- + +## Task 7: `convertWhereToMongo` — `and` / `or` recursion + +**Files:** +- Modify: `adapters/mongodb/src/convertWhere.ts` +- Modify: `adapters/mongodb/dev/specs/convertWhere.spec.ts` + +- [ ] **Step 1: Add failing tests for `and` / `or`** + +Append to `adapters/mongodb/dev/specs/convertWhere.spec.ts`: + +```ts +describe('convertWhereToMongo — and/or composition', () => { + test('and: all branches pre → combined preFilter via $and', () => { + const result = convertWhereToMongo( + { + and: [ + { status: { equals: 'published' } }, + { views: { greater_than: 100 } }, + ], + }, + FILTERABLE, + 'p1', + ) + expect(result).toEqual({ + preFilter: { + $and: [ + { status: { $eq: 'published' } }, + { views: { $gt: 100 } }, + ], + }, + postFilter: null, + }) + }) + + test('and: mix of pre + post → pre kept native, post in {and:[...]}', () => { + const result = convertWhereToMongo( + { + and: [ + { status: { equals: 'published' } }, + { tags: { like: 'javascript' } }, + ], + }, + FILTERABLE, + 'p1', + ) + expect(result).toEqual({ + preFilter: { status: { $eq: 'published' } }, + postFilter: { tags: { like: 'javascript' } }, + }) + }) + + test('or: all branches pre → combined preFilter via $or', () => { + const result = convertWhereToMongo( + { + or: [ + { status: { equals: 'draft' } }, + { status: { equals: 'archived' } }, + ], + }, + FILTERABLE, + 'p1', + ) + expect(result).toEqual({ + preFilter: { + $or: [ + { status: { $eq: 'draft' } }, + { status: { $eq: 'archived' } }, + ], + }, + postFilter: null, + }) + }) + + test('or: any branch is post → entire or goes to post-filter', () => { + const where: any = { + or: [ + { status: { equals: 'published' } }, + { tags: { like: 'javascript' } }, + ], + } + const result = convertWhereToMongo(where, FILTERABLE, 'p1') + expect(result.preFilter).toBeNull() + expect(result.postFilter).toEqual(where) + }) + + test('nested and/or: (published AND tech) OR (archived)', () => { + const where: any = { + or: [ + { + and: [ + { status: { equals: 'published' } }, + { category: { equals: 'tech' } }, + ], + }, + { status: { equals: 'archived' } }, + ], + } + const result = convertWhereToMongo(where, FILTERABLE, 'p1') + expect(result.preFilter).toEqual({ + $or: [ + { $and: [{ status: { $eq: 'published' } }, { category: { $eq: 'tech' } }] }, + { status: { $eq: 'archived' } }, + ], + }) + expect(result.postFilter).toBeNull() + }) + + test('and with single condition reduces to that condition', () => { + const result = convertWhereToMongo( + { and: [{ status: { equals: 'published' } }] }, + FILTERABLE, + 'p1', + ) + expect(result).toEqual({ + preFilter: { status: { $eq: 'published' } }, + postFilter: null, + }) + }) +}) +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd adapters/mongodb && pnpm exec vitest run dev/specs/convertWhere.spec.ts` +Expected: FAIL — `and/or not implemented yet`. 
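+
+Why a mixed OR must go entirely to the post-filter (this is what Step 3 implements): pushing only the pre-capable branch into `$vectorSearch.filter` would silently narrow the disjunction, because documents matching only the post branch would never come back from the vector stage at all:
+
+```ts
+// { or: [{ status: { equals: 'published' } }, { tags: { like: 'javascript' } }] }
+//
+// Wrong split:  preFilter = { status: { $eq: 'published' } }
+//   → $vectorSearch drops every non-published doc before the post-filter
+//     runs, so rows matching only the `like` branch are lost.
+//
+// Correct split: preFilter = null, postFilter = the whole `or` clause,
+//   evaluated per row in JS by evaluatePostFilter (Task 8).
+```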
+
+- [ ] **Step 3: Implement `and` / `or` recursion**
+
+Replace the implementation in `adapters/mongodb/src/convertWhere.ts` with:
+
+```ts
+import type { Where } from 'payload'
+import { RESERVED_FILTER_FIELDS } from './types.js'
+
+export interface ConvertResult {
+  preFilter: Record<string, unknown> | null
+  postFilter: Where | null
+}
+
+const PRE_OPS = new Map([
+  ['equals', '$eq'],
+  ['not_equals', '$ne'],
+  ['notEquals', '$ne'],
+  ['in', '$in'],
+  ['not_in', '$nin'],
+  ['notIn', '$nin'],
+  ['greater_than', '$gt'],
+  ['greaterThan', '$gt'],
+  ['greater_than_equal', '$gte'],
+  ['greaterThanEqual', '$gte'],
+  ['less_than', '$lt'],
+  ['lessThan', '$lt'],
+  ['less_than_equal', '$lte'],
+  ['lessThanEqual', '$lte'],
+])
+
+const POST_OPS = new Set(['like', 'contains', 'all'])
+const UNSUPPORTED_OPS = new Set(['near', 'within', 'intersects'])
+
+function isFilterable(field: string, filterable: string[]): boolean {
+  return (
+    (RESERVED_FILTER_FIELDS as readonly string[]).includes(field) ||
+    filterable.includes(field)
+  )
+}
+
+function leafToPre(field: string, cond: Record<string, unknown>): Record<string, unknown> {
+  const clauses: Record<string, unknown>[] = []
+  for (const [op, val] of Object.entries(cond)) {
+    if (op === 'exists') {
+      if (val === true) {
+        clauses.push({ [field]: { $exists: true, $ne: null } })
+      } else {
+        clauses.push({ $or: [{ [field]: { $exists: false } }, { [field]: { $eq: null } }] })
+      }
+      continue
+    }
+    const mongoOp = PRE_OPS.get(op)
+    if (!mongoOp) continue
+    clauses.push({ [field]: { [mongoOp]: val } })
+  }
+  if (clauses.length === 0) return {}
+  if (clauses.length === 1) return clauses[0]
+  return { $and: clauses }
+}
+
+function convertLeaf(
+  where: Where,
+  filterable: string[],
+  poolName: string,
+): ConvertResult {
+  const keys = Object.keys(where)
+  if (keys.length !== 1) {
+    // Multiple top-level fields on the same object: treat as implicit AND.
+    const synthetic: Where = { and: keys.map((k) => ({ [k]: where[k] }) as Where) }
+    return convertWhereToMongo(synthetic, filterable, poolName)
+  }
+  const field = keys[0]
+  const cond = where[field] as Record<string, unknown>
+  if (!isFilterable(field, filterable)) {
+    throw new Error(
+      `[@payloadcms-vectorize/mongodb] Field "${field}" is not configured as filterableFields for pool "${poolName}"`,
+    )
+  }
+  for (const op of Object.keys(cond)) {
+    if (UNSUPPORTED_OPS.has(op)) {
+      throw new Error(`[@payloadcms-vectorize/mongodb] Operator "${op}" is not supported`)
+    }
+  }
+  const hasPostOp = Object.keys(cond).some((op) => POST_OPS.has(op))
+  if (hasPostOp) {
+    return { preFilter: null, postFilter: { [field]: cond } as Where }
+  }
+  return { preFilter: leafToPre(field, cond), postFilter: null }
+}
+
+export function convertWhereToMongo(
+  where: Where,
+  filterable: string[],
+  poolName: string,
+): ConvertResult {
+  if ('and' in where && Array.isArray(where.and)) {
+    const branches = where.and.map((b) => convertWhereToMongo(b, filterable, poolName))
+    const preBranches = branches.filter((b) => b.preFilter).map((b) => b.preFilter!)
+    const postBranches = branches.filter((b) => b.postFilter).map((b) => b.postFilter!)
+    const preFilter =
+      preBranches.length === 0
+        ? null
+        : preBranches.length === 1
+          ? preBranches[0]
+          : { $and: preBranches }
+    const postFilter =
+      postBranches.length === 0
+        ? null
+        : postBranches.length === 1
+          ? 
postBranches[0] + : ({ and: postBranches } as Where) + return { preFilter, postFilter } + } + + if ('or' in where && Array.isArray(where.or)) { + const branches = where.or.map((b) => convertWhereToMongo(b, filterable, poolName)) + const anyPost = branches.some((b) => b.postFilter !== null) + if (anyPost) { + // Entire OR goes post — semantics require the whole disjunction to apply + // to the post-vectorSearch document set. + return { preFilter: null, postFilter: where } + } + const preBranches = branches.map((b) => b.preFilter!).filter((p) => p) + const preFilter = + preBranches.length === 0 + ? null + : preBranches.length === 1 + ? preBranches[0] + : { $or: preBranches } + return { preFilter, postFilter: null } + } + + return convertLeaf(where, filterable, poolName) +} +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd adapters/mongodb && pnpm exec vitest run dev/specs/convertWhere.spec.ts` +Expected: PASS — 19 tests total. + +- [ ] **Step 5: Commit** + +```bash +git add adapters/mongodb/src/convertWhere.ts adapters/mongodb/dev/specs/convertWhere.spec.ts +git commit -m "feat(mongodb): convertWhereToMongo handles and/or composition with pre/post split" +``` + +--- + +## Task 8: `evaluatePostFilter` — runtime post-filter matcher + +**Files:** +- Modify: `adapters/mongodb/src/convertWhere.ts` +- Modify: `adapters/mongodb/dev/specs/convertWhere.spec.ts` + +The post-filter is applied in JS against the result rows (not as a `$match` — Mongo's `$match` cannot natively express `like`/`contains`/regex with our exact semantics, and we already need JS evaluation for nested-`or` cases). This task adds `evaluatePostFilter`. + +- [ ] **Step 1: Add failing tests** + +Append to `adapters/mongodb/dev/specs/convertWhere.spec.ts`: + +```ts +import { evaluatePostFilter } from '../../src/convertWhere.js' + +describe('evaluatePostFilter', () => { + test('like with case-insensitive substring match', () => { + expect( + evaluatePostFilter({ tags: 'JavaScript' }, { tags: { like: 'javascript' } }), + ).toBe(true) + expect( + evaluatePostFilter({ tags: 'python' }, { tags: { like: 'javascript' } }), + ).toBe(false) + }) + + test('contains works on scalar string', () => { + expect( + evaluatePostFilter({ category: 'technology' }, { category: { contains: 'tech' } }), + ).toBe(true) + expect( + evaluatePostFilter({ category: 'design' }, { category: { contains: 'tech' } }), + ).toBe(false) + }) + + test('contains on array uses elemMatch-style', () => { + expect( + evaluatePostFilter({ tags: ['react', 'javascript'] }, { tags: { contains: 'java' } }), + ).toBe(true) + expect( + evaluatePostFilter({ tags: ['python'] }, { tags: { contains: 'java' } }), + ).toBe(false) + }) + + test('like with regex special chars does NOT match unintended values', () => { + // Pattern "foo.bar" must match the literal dot, not any char. 
+ expect( + evaluatePostFilter({ tags: 'fooXbar' }, { tags: { like: 'foo.bar' } }), + ).toBe(false) + expect( + evaluatePostFilter({ tags: 'foo.bar' }, { tags: { like: 'foo.bar' } }), + ).toBe(true) + }) + + test('all on array', () => { + expect( + evaluatePostFilter({ tags: ['a', 'b', 'c'] }, { tags: { all: ['a', 'b'] } }), + ).toBe(true) + expect( + evaluatePostFilter({ tags: ['a'] }, { tags: { all: ['a', 'b'] } }), + ).toBe(false) + }) + + test('and combinator', () => { + const w: any = { + and: [ + { status: { equals: 'published' } }, + { tags: { like: 'javascript' } }, + ], + } + expect( + evaluatePostFilter({ status: 'published', tags: 'JavaScript,react' }, w), + ).toBe(true) + expect( + evaluatePostFilter({ status: 'draft', tags: 'JavaScript,react' }, w), + ).toBe(false) + }) + + test('or combinator', () => { + const w: any = { + or: [ + { status: { equals: 'published' } }, + { tags: { like: 'javascript' } }, + ], + } + expect(evaluatePostFilter({ status: 'published', tags: 'python' }, w)).toBe(true) + expect(evaluatePostFilter({ status: 'draft', tags: 'JavaScript' }, w)).toBe(true) + expect(evaluatePostFilter({ status: 'draft', tags: 'python' }, w)).toBe(false) + }) + + test('pre-filter operators also evaluable in post path (for OR mixed branches)', () => { + expect( + evaluatePostFilter({ status: 'published' }, { status: { equals: 'published' } }), + ).toBe(true) + expect( + evaluatePostFilter({ views: 150 }, { views: { greater_than: 100 } }), + ).toBe(true) + expect( + evaluatePostFilter({ views: 50 }, { views: { greater_than: 100 } }), + ).toBe(false) + }) +}) +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd adapters/mongodb && pnpm exec vitest run dev/specs/convertWhere.spec.ts` +Expected: FAIL — `evaluatePostFilter` not exported. + +- [ ] **Step 3: Implement `evaluatePostFilter`** + +Append to `adapters/mongodb/src/convertWhere.ts`: + +```ts +import { escapeRegExp } from './escapeRegExp.js' + +function valueMatchesOp(value: unknown, op: string, operand: unknown): boolean { + switch (op) { + case 'equals': + return value === operand + case 'not_equals': + case 'notEquals': + return value !== operand + case 'in': + return Array.isArray(operand) && operand.includes(value as never) + case 'not_in': + case 'notIn': + return Array.isArray(operand) && !operand.includes(value as never) + case 'greater_than': + case 'greaterThan': + return typeof value === 'number' && typeof operand === 'number' && value > operand + case 'greater_than_equal': + case 'greaterThanEqual': + return typeof value === 'number' && typeof operand === 'number' && value >= operand + case 'less_than': + case 'lessThan': + return typeof value === 'number' && typeof operand === 'number' && value < operand + case 'less_than_equal': + case 'lessThanEqual': + return typeof value === 'number' && typeof operand === 'number' && value <= operand + case 'exists': + return operand + ? 
value !== undefined && value !== null
+        : value === undefined || value === null
+    case 'like':
+    case 'contains': {
+      if (typeof operand !== 'string') return false
+      const re = new RegExp(escapeRegExp(operand), 'i')
+      if (Array.isArray(value)) {
+        return value.some((v) => typeof v === 'string' && re.test(v))
+      }
+      return typeof value === 'string' && re.test(value)
+    }
+    case 'all':
+      return (
+        Array.isArray(value) &&
+        Array.isArray(operand) &&
+        operand.every((o) => value.includes(o as never))
+      )
+    default:
+      return false
+  }
+}
+
+export function evaluatePostFilter(doc: Record<string, unknown>, where: Where): boolean {
+  if (!where || Object.keys(where).length === 0) return true
+  if ('and' in where && Array.isArray(where.and)) {
+    return where.and.every((c: Where) => evaluatePostFilter(doc, c))
+  }
+  if ('or' in where && Array.isArray(where.or)) {
+    return where.or.some((c: Where) => evaluatePostFilter(doc, c))
+  }
+  for (const [field, condition] of Object.entries(where)) {
+    if (field === 'and' || field === 'or') continue
+    if (typeof condition !== 'object' || condition === null) continue
+    const cond = condition as Record<string, unknown>
+    for (const [op, operand] of Object.entries(cond)) {
+      if (!valueMatchesOp(doc[field], op, operand)) return false
+    }
+  }
+  return true
+}
+```
+
+- [ ] **Step 4: Run tests to verify they pass**
+
+Run: `cd adapters/mongodb && pnpm exec vitest run dev/specs/convertWhere.spec.ts`
+Expected: PASS — 27 tests total.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add adapters/mongodb/src/convertWhere.ts adapters/mongodb/dev/specs/convertWhere.spec.ts
+git commit -m "feat(mongodb): add evaluatePostFilter for runtime post-filter matching"
+```
+
+---
+
+## Task 9: `ensureSearchIndex` — index lifecycle with cache
+
+**Files:**
+- Create: `adapters/mongodb/src/indexes.ts`
+
+This is verified end-to-end via the integration suite (Task 16). No unit test here because the function is a thin wrapper around the Mongo driver's `listSearchIndexes` / `createSearchIndex`, both of which require a live `mongot`.
+
+Atlas Local rejects `createSearchIndex` if the target collection does not yet exist (`Collection 'X' does not exist`), so we materialize the collection idempotently with `db.createCollection` before the first index creation. Atlas Cloud is more lenient about this, but the adapter must work in both environments.
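+
+While iterating on this task, index state can also be inspected by hand with mongosh's search-index helpers (the db name below is a placeholder; the suites use timestamped names, and pool defaults yield `vectorize_<pool>` / `vectorize_<pool>_idx`):
+
+```bash
+mongosh "mongodb://localhost:27018/my_test_db?directConnection=true" \
+  --eval 'db.vectorize_default.getSearchIndexes("vectorize_default_idx")'
+# Recovery from the definition-mismatch error raised below:
+#   db.vectorize_default.dropSearchIndex("vectorize_default_idx")
+```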
+
+- [ ] **Step 1: Write `adapters/mongodb/src/indexes.ts`**
+
+```ts
+import type { Db, MongoClient } from 'mongodb'
+import type { ResolvedPoolConfig } from './types.js'
+
+const ensureCache = new Set<string>()
+
+function cacheKey(dbName: string, collectionName: string, indexName: string): string {
+  return `${dbName}::${collectionName}::${indexName}`
+}
+
+function buildDefinition(pool: ResolvedPoolConfig): Record<string, unknown> {
+  return {
+    fields: [
+      {
+        type: 'vector',
+        path: 'embedding',
+        numDimensions: pool.dimensions,
+        similarity: pool.similarity,
+      },
+      { type: 'filter', path: 'sourceCollection' },
+      { type: 'filter', path: 'docId' },
+      { type: 'filter', path: 'embeddingVersion' },
+      ...pool.filterableFields.map((p) => ({ type: 'filter', path: p })),
+    ],
+  }
+}
+
+// Key-order-sensitive JSON comparison; good enough while the server echoes
+// back definitions in the same shape buildDefinition created them.
+function definitionsEqual(a: unknown, b: unknown): boolean {
+  return JSON.stringify(a) === JSON.stringify(b)
+}
+
+async function ensureCollectionExists(db: Db, name: string): Promise<void> {
+  const existing = await db.listCollections({ name }, { nameOnly: true }).toArray()
+  if (existing.length === 0) {
+    await db.createCollection(name)
+  }
+}
+
+export async function ensureSearchIndex(
+  client: MongoClient,
+  dbName: string,
+  pool: ResolvedPoolConfig,
+): Promise<void> {
+  const key = cacheKey(dbName, pool.collectionName, pool.indexName)
+  if (ensureCache.has(key)) return
+
+  const db = client.db(dbName)
+  const collection = db.collection(pool.collectionName)
+
+  const wantedDefinition = buildDefinition(pool)
+
+  let existing: Array<Record<string, unknown>>
+  try {
+    existing = (await collection.listSearchIndexes(pool.indexName).toArray()) as Array<
+      Record<string, unknown>
+    >
+  } catch {
+    existing = []
+  }
+
+  const found = existing.find((idx) => idx.name === pool.indexName)
+  if (found) {
+    const status = found.status as string | undefined
+    if (status === 'READY' || status === 'BUILDING') {
+      const latest = (found.latestDefinition as Record<string, unknown>) ?? found.definition
+      if (!definitionsEqual(latest, wantedDefinition)) {
+        throw new Error(
+          `[@payloadcms-vectorize/mongodb] Search index "${pool.indexName}" exists with different definition. Drop it manually with db.collection("${pool.collectionName}").dropSearchIndex("${pool.indexName}") before re-running.`,
+        )
+      }
+      if (status === 'READY') {
+        ensureCache.add(key)
+        return
+      }
+      // BUILDING: fall through to polling
+    } else {
+      throw new Error(
+        `[@payloadcms-vectorize/mongodb] Search index "${pool.indexName}" is in unexpected state "${status}". Drop and recreate.`,
+      )
+    }
+  } else {
+    await ensureCollectionExists(db, pool.collectionName)
+    await collection.createSearchIndex({
+      name: pool.indexName,
+      type: 'vectorSearch',
+      definition: wantedDefinition,
+    })
+  }
+
+  // Poll for READY (≤ 60s)
+  const deadline = Date.now() + 60_000
+  while (Date.now() < deadline) {
+    const list = (await collection.listSearchIndexes(pool.indexName).toArray()) as Array<
+      Record<string, unknown>
+    >
+    const idx = list.find((i) => i.name === pool.indexName)
+    if (idx?.status === 'READY') {
+      ensureCache.add(key)
+      return
+    }
+    await new Promise((r) => setTimeout(r, 1000))
+  }
+  throw new Error(
+    `[@payloadcms-vectorize/mongodb] Search index "${pool.indexName}" did not become READY within 60s. Check Mongo logs.`,
+  )
+}
+
+/** Test-only: clear the in-memory ensure cache. */
+export function __resetIndexCacheForTests(): void {
+  ensureCache.clear()
+}
+```
+
+- [ ] **Step 2: Verify it compiles**
+
+Run: `cd adapters/mongodb && pnpm exec tsc -p tsconfig.build.json --noEmit`
+Expected: PASS.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add adapters/mongodb/src/indexes.ts
+git commit -m "feat(mongodb): ensureSearchIndex with definition-mismatch detection and READY polling"
+```
+
+---
+
+## Task 10: `storeChunk`
+
+**Files:**
+- Create: `adapters/mongodb/src/embed.ts`
+
+- [ ] **Step 1: Write `adapters/mongodb/src/embed.ts`**
+
+```ts
+import type { Payload } from 'payload'
+import type { StoreChunkData } from 'payloadcms-vectorize'
+import { getMongoClient } from './client.js'
+import { ensureSearchIndex } from './indexes.js'
+import { getMongoConfig } from './types.js'
+
+export default async function storeChunk(
+  payload: Payload,
+  poolName: string,
+  data: StoreChunkData,
+): Promise<void> {
+  const cfg = getMongoConfig(payload)
+  const pool = cfg.pools[poolName]
+  if (!pool) {
+    throw new Error(
+      `[@payloadcms-vectorize/mongodb] Unknown pool "${poolName}". Configured pools: ${Object.keys(cfg.pools).join(', ')}`,
+    )
+  }
+  const client = await getMongoClient(cfg.uri)
+  await ensureSearchIndex(client, cfg.dbName, pool)
+
+  const embeddingArray = Array.from(data.embedding)
+
+  const now = new Date()
+  const collection = client.db(cfg.dbName).collection(pool.collectionName)
+  await collection.insertOne({
+    ...data.extensionFields,
+    sourceCollection: data.sourceCollection,
+    docId: String(data.docId),
+    chunkIndex: data.chunkIndex,
+    chunkText: data.chunkText,
+    embeddingVersion: data.embeddingVersion,
+    embedding: embeddingArray,
+    createdAt: now,
+    updatedAt: now,
+  })
+}
+```
+
+- [ ] **Step 2: Verify it compiles**
+
+Run: `cd adapters/mongodb && pnpm exec tsc -p tsconfig.build.json --noEmit`
+Expected: PASS.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add adapters/mongodb/src/embed.ts
+git commit -m "feat(mongodb): add storeChunk that ensures index then inserts document"
+```
+
+---
+
+## Task 11: `search` — `$vectorSearch` aggregation
+
+**Files:**
+- Create: `adapters/mongodb/src/search.ts`
+
+- [ ] **Step 1: Write `adapters/mongodb/src/search.ts`**
+
+```ts
+import type { BasePayload, Where } from 'payload'
+import type { VectorSearchResult } from 'payloadcms-vectorize'
+import { getMongoClient } from './client.js'
+import { convertWhereToMongo, evaluatePostFilter } from './convertWhere.js'
+import { ensureSearchIndex } from './indexes.js'
+import { getMongoConfig, RESERVED_FIELDS } from './types.js'
+
+export default async function search(
+  payload: BasePayload,
+  queryEmbedding: number[],
+  poolName: string,
+  limit: number = 10,
+  where?: Where,
+): Promise<VectorSearchResult[]> {
+  const cfg = getMongoConfig(payload)
+  const pool = cfg.pools[poolName]
+  if (!pool) {
+    throw new Error(
+      `[@payloadcms-vectorize/mongodb] Unknown pool "${poolName}". Configured pools: ${Object.keys(cfg.pools).join(', ')}`,
+    )
+  }
+  const client = await getMongoClient(cfg.uri)
+  await ensureSearchIndex(client, cfg.dbName, pool)
+
+  let preFilter: Record<string, unknown> | null = null
+  let postFilter: Where | null = null
+  if (where && Object.keys(where).length > 0) {
+    const split = convertWhereToMongo(where, pool.filterableFields, poolName)
+    preFilter = split.preFilter
+    postFilter = split.postFilter
+  }
+
+  const numCandidates =
+    pool.numCandidates ?? 
Math.max(limit * 20, 100)
+
+  const vectorSearchStage: Record<string, unknown> = {
+    index: pool.indexName,
+    path: 'embedding',
+    queryVector: queryEmbedding,
+    numCandidates,
+    limit,
+  }
+  if (pool.forceExact) vectorSearchStage.exact = true
+  if (preFilter) vectorSearchStage.filter = preFilter
+
+  const projection: Record<string, unknown> = {
+    _id: 1,
+    score: { $meta: 'vectorSearchScore' },
+    sourceCollection: 1,
+    docId: 1,
+    chunkIndex: 1,
+    chunkText: 1,
+    embeddingVersion: 1,
+  }
+  for (const f of pool.filterableFields) projection[f] = 1
+
+  const pipeline: Record<string, unknown>[] = [
+    { $vectorSearch: vectorSearchStage },
+    { $project: projection },
+  ]
+
+  const collection = client.db(cfg.dbName).collection(pool.collectionName)
+  const rawDocs = await collection.aggregate(pipeline).toArray()
+
+  const filtered = postFilter
+    ? rawDocs.filter((d) => evaluatePostFilter(d as Record<string, unknown>, postFilter!))
+    : rawDocs
+
+  return filtered.map((d) => mapDocToResult(d as Record<string, unknown>, pool.filterableFields))
+}
+
+function mapDocToResult(
+  doc: Record<string, unknown>,
+  filterable: string[],
+): VectorSearchResult {
+  if (typeof doc.score !== 'number') {
+    throw new Error(
+      `[@payloadcms-vectorize/mongodb] Search result is missing numeric "score" field; ensure $project includes { score: { $meta: 'vectorSearchScore' } }`,
+    )
+  }
+  const result: Record<string, unknown> = {
+    id: String(doc._id),
+    score: doc.score,
+    sourceCollection: String(doc.sourceCollection ?? ''),
+    docId: String(doc.docId ?? ''),
+    chunkIndex:
+      typeof doc.chunkIndex === 'number' ? doc.chunkIndex : Number(doc.chunkIndex ?? 0),
+    chunkText: String(doc.chunkText ?? ''),
+    embeddingVersion: String(doc.embeddingVersion ?? ''),
+  }
+  for (const f of filterable) {
+    if (f in doc && !(RESERVED_FIELDS as readonly string[]).includes(f)) {
+      result[f] = doc[f]
+    }
+  }
+  return result as VectorSearchResult
+}
+```
+
+- [ ] **Step 2: Verify it compiles**
+
+Run: `cd adapters/mongodb && pnpm exec tsc -p tsconfig.build.json --noEmit`
+Expected: PASS.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add adapters/mongodb/src/search.ts
+git commit -m "feat(mongodb): implement search via \$vectorSearch with pre/post split"
+```
+
+---
+
+## Task 12: `id` → `_id` `ObjectId` casting in `convertWhereToMongo`
+
+**Files:**
+- Modify: `adapters/mongodb/src/convertWhere.ts`
+- Modify: `adapters/mongodb/dev/specs/convertWhere.spec.ts`
+
+The Payload `id` field maps to Mongo `_id`. When users filter by `id`, cast to `ObjectId` if the value is a 24-hex string; otherwise pass through as-is.
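+
+One trade-off in that heuristic worth keeping in mind (`HEX24` is the regex Step 3 introduces): a custom string id that happens to be exactly 24 hex characters is indistinguishable from an `ObjectId` and gets cast too, which is acceptable because Payload's Mongo ids are `ObjectId`s unless customized:
+
+```ts
+const HEX24 = /^[a-f\d]{24}$/i
+
+HEX24.test('507f1f77bcf86cd799439011') // true  → cast to new ObjectId(...)
+HEX24.test('art-0')                    // false → passed through verbatim
+```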
+
+- [ ] **Step 1: Add failing tests**
+
+Append to `adapters/mongodb/dev/specs/convertWhere.spec.ts`:
+
+```ts
+import { ObjectId } from 'mongodb'
+
+describe('convertWhereToMongo — id mapping', () => {
+  test('id with 24-hex string maps to _id with ObjectId cast', () => {
+    const hex = '507f1f77bcf86cd799439011'
+    const result = convertWhereToMongo({ id: { equals: hex } }, [], 'p1')
+    expect(result.preFilter).toEqual({ _id: { $eq: new ObjectId(hex) } })
+    expect(result.postFilter).toBeNull()
+  })
+
+  test('id with non-hex string maps to _id with raw value', () => {
+    const result = convertWhereToMongo({ id: { equals: 'not-an-objectid' } }, [], 'p1')
+    expect(result.preFilter).toEqual({ _id: { $eq: 'not-an-objectid' } })
+  })
+
+  test('id with in array casts each 24-hex string', () => {
+    const a = '507f1f77bcf86cd799439011'
+    const b = 'plain-string-id'
+    const result = convertWhereToMongo({ id: { in: [a, b] } }, [], 'p1')
+    expect(result.preFilter).toEqual({
+      _id: { $in: [new ObjectId(a), b] },
+    })
+  })
+})
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run: `cd adapters/mongodb && pnpm exec vitest run dev/specs/convertWhere.spec.ts`
+Expected: FAIL — `id` field not in `RESERVED_FILTER_FIELDS`, throws "not configured" or maps to `id` not `_id`.
+
+- [ ] **Step 3: Update `convertWhere.ts`**
+
+In `adapters/mongodb/src/convertWhere.ts`:
+
+a) At the top of the file, add:
+
+```ts
+import { ObjectId } from 'mongodb'
+
+const HEX24 = /^[a-f\d]{24}$/i
+
+function castIdValue(v: unknown): unknown {
+  if (typeof v === 'string' && HEX24.test(v)) return new ObjectId(v)
+  return v
+}
+
+function castIdOperand(op: string, v: unknown): unknown {
+  if (op === 'in' || op === 'not_in' || op === 'notIn') {
+    return Array.isArray(v) ? v.map(castIdValue) : v
+  }
+  return castIdValue(v)
+}
+```
+
+b) Update `isFilterable` to recognize `id`:
+
+```ts
+function isFilterable(field: string, filterable: string[]): boolean {
+  if (field === 'id') return true
+  return (
+    (RESERVED_FILTER_FIELDS as readonly string[]).includes(field) ||
+    filterable.includes(field)
+  )
+}
+```
+
+c) Update `leafToPre` to remap `id` → `_id` and cast:
+
+```ts
+function leafToPre(field: string, cond: Record<string, unknown>): Record<string, unknown> {
+  const targetField = field === 'id' ? '_id' : field
+  const clauses: Record<string, unknown>[] = []
+  for (const [op, val] of Object.entries(cond)) {
+    if (op === 'exists') {
+      if (val === true) {
+        clauses.push({ [targetField]: { $exists: true, $ne: null } })
+      } else {
+        clauses.push({
+          $or: [
+            { [targetField]: { $exists: false } },
+            { [targetField]: { $eq: null } },
+          ],
+        })
+      }
+      continue
+    }
+    const mongoOp = PRE_OPS.get(op)
+    if (!mongoOp) continue
+    const operand = field === 'id' ? castIdOperand(op, val) : val
+    clauses.push({ [targetField]: { [mongoOp]: operand } })
+  }
+  if (clauses.length === 0) return {}
+  if (clauses.length === 1) return clauses[0]
+  return { $and: clauses }
+}
+```
+
+- [ ] **Step 4: Run tests to verify they pass**
+
+Run: `cd adapters/mongodb && pnpm exec vitest run dev/specs/convertWhere.spec.ts`
+Expected: PASS — 30 tests total.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add adapters/mongodb/src/convertWhere.ts adapters/mongodb/dev/specs/convertWhere.spec.ts
+git commit -m "feat(mongodb): map Payload id → Mongo _id with ObjectId casting"
+```
+
+---
+
+## Task 13: `createMongoVectorIntegration` factory + barrel exports
+
+**Files:**
+- Create: `adapters/mongodb/src/index.ts`
+
+- [ ] **Step 1: Write `adapters/mongodb/src/index.ts`**
+
+```ts
+import type { DbAdapter } from 'payloadcms-vectorize'
+import { getMongoClient } from './client.js'
+import storeChunk from './embed.js'
+import search from './search.js'
+import {
+  resolvePoolConfig,
+  type MongoVectorIntegrationConfig,
+  type ResolvedPoolConfig,
+} from './types.js'
+
+export type {
+  MongoPoolConfig,
+  MongoVectorIntegrationConfig,
+  Similarity,
+} from './types.js'
+
+export const createMongoVectorIntegration = (
+  options: MongoVectorIntegrationConfig,
+): { adapter: DbAdapter } => {
+  if (!options.uri) {
+    throw new Error('[@payloadcms-vectorize/mongodb] `uri` is required')
+  }
+  if (!options.dbName) {
+    throw new Error('[@payloadcms-vectorize/mongodb] `dbName` is required')
+  }
+  if (!options.pools || Object.keys(options.pools).length === 0) {
+    throw new Error('[@payloadcms-vectorize/mongodb] `pools` must contain at least one pool')
+  }
+
+  const resolvedPools: Record<string, ResolvedPoolConfig> = {}
+  for (const [name, p] of Object.entries(options.pools)) {
+    if (typeof p.dimensions !== 'number' || p.dimensions <= 0) {
+      throw new Error(
+        `[@payloadcms-vectorize/mongodb] pool "${name}" requires a positive numeric \`dimensions\``,
+      )
+    }
+    resolvedPools[name] = resolvePoolConfig(name, p)
+  }
+
+  const adapter: DbAdapter = {
+    getConfigExtension: () => ({
+      custom: {
+        _mongoConfig: {
+          uri: options.uri,
+          dbName: options.dbName,
+          pools: resolvedPools,
+        },
+      },
+    }),
+
+    storeChunk,
+
+    deleteChunks: async (payload, poolName, sourceCollection, docId) => {
+      const cfg = resolvedPools[poolName]
+      if (!cfg) {
+        throw new Error(
+          `[@payloadcms-vectorize/mongodb] Unknown pool "${poolName}"`,
+        )
+      }
+      const client = await getMongoClient(options.uri)
+      await client
+        .db(options.dbName)
+        .collection(cfg.collectionName)
+        .deleteMany({ sourceCollection, docId: String(docId) })
+    },
+
+    hasEmbeddingVersion: async (
+      payload,
+      poolName,
+      sourceCollection,
+      docId,
+      embeddingVersion,
+    ) => {
+      const cfg = resolvedPools[poolName]
+      if (!cfg) {
+        throw new Error(
+          `[@payloadcms-vectorize/mongodb] Unknown pool "${poolName}"`,
+        )
+      }
+      const client = await getMongoClient(options.uri)
+      const count = await client
+        .db(options.dbName)
+        .collection(cfg.collectionName)
+        .countDocuments(
+          { sourceCollection, docId: String(docId), embeddingVersion },
+          { limit: 1 },
+        )
+      return count > 0
+    },
+
+    search,
+  }
+
+  return { adapter }
+}
+```
+
+- [ ] **Step 2: Verify it compiles**
+
+Run: `cd adapters/mongodb && pnpm exec tsc -p tsconfig.build.json --noEmit`
+Expected: PASS.
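+
+As a quick smoke check of the factory's eager validation (config mirrors the README written in Task 17; no Mongo connection is opened until the first `storeChunk`/`search`):
+
+```ts
+import { createMongoVectorIntegration } from '@payloadcms-vectorize/mongodb'
+
+// Throws synchronously on a missing uri/dbName/pools or a non-positive
+// `dimensions`; the MongoClient itself is created lazily (Task 4).
+const { adapter } = createMongoVectorIntegration({
+  uri: 'mongodb://localhost:27018/?directConnection=true',
+  dbName: 'payload_vectorize',
+  pools: { default: { dimensions: 1536 } },
+})
+```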
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add adapters/mongodb/src/index.ts
+git commit -m "feat(mongodb): add createMongoVectorIntegration factory and adapter wiring"
+```
+
+---
+
+## Task 14: `docker-compose.yml` for a local MongoDB Atlas container
+
+**Files:**
+- Create: `adapters/mongodb/dev/docker-compose.yml`
+
+- [ ] **Step 1: Write `adapters/mongodb/dev/docker-compose.yml`**
+
+```yaml
+services:
+  mongodb-atlas:
+    image: mongodb/mongodb-atlas-local:latest
+    container_name: vectorize-mongodb-test
+    ports:
+      - "27018:27017"
+    healthcheck:
+      test: ["CMD", "mongosh", "--quiet", "--eval", "db.runCommand({ping:1})"]
+      interval: 2s
+      timeout: 5s
+      retries: 30
+```
+
+- [ ] **Step 2: Sanity check that the compose file parses**
+
+Run: `cd adapters/mongodb && docker compose -f dev/docker-compose.yml config`
+Expected: prints normalized YAML with no errors.
+
+- [ ] **Step 3: Bring the container up and verify health**
+
+Run: `cd adapters/mongodb && pnpm test:setup`
+Then: `docker inspect --format='{{.State.Health.Status}}' vectorize-mongodb-test`
+Expected: `healthy` within ~30s.
+
+- [ ] **Step 4: Bring the container down**
+
+Run: `cd adapters/mongodb && pnpm test:teardown`
+Expected: container removed, no errors.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add adapters/mongodb/dev/docker-compose.yml
+git commit -m "feat(mongodb): add docker-compose for local mongodb-atlas-local stack"
+```
+
+---
+
+## Task 15: Compliance suite
+
+**Files:**
+- Create: `adapters/mongodb/dev/specs/constants.ts`
+- Create: `adapters/mongodb/dev/specs/utils.ts`
+- Create: `adapters/mongodb/dev/specs/compliance.spec.ts`
+
+The Mongo adapter does NOT register a Payload collection, so unlike PG we don't spin up the full plugin in compliance tests — we exercise the `DbAdapter` directly with a minimal fake Payload instance whose only role is to surface `_mongoConfig` via the plugin's `getVectorizedPayload` helper, i.e. exactly the in-memory `payload` shape `payloadcms-vectorize` looks for.
+
+- [ ] **Step 1: Write `adapters/mongodb/dev/specs/constants.ts`**
+
+```ts
+import { createMongoVectorIntegration } from '../../src/index.js'
+
+export const DIMS = 8
+export const MONGO_URI =
+  process.env.MONGODB_URI || 'mongodb://localhost:27018/?directConnection=true'
+
+export const TEST_DB = `vectorize_mongo_test_${Date.now()}`
+
+export function makeIntegration(filterableFields: string[] = []) {
+  return createMongoVectorIntegration({
+    uri: MONGO_URI,
+    dbName: TEST_DB,
+    pools: {
+      default: {
+        dimensions: DIMS,
+        filterableFields,
+        // Smaller candidate set so HNSW build/scan stays fast on tiny datasets.
+        numCandidates: 50,
+      },
+    },
+  })
+}
+```
+
+- [ ] **Step 2: Write `adapters/mongodb/dev/specs/utils.ts`**
+
+```ts
+import { MongoClient } from 'mongodb'
+import type { BasePayload } from 'payload'
+import { __closeForTests } from '../../src/client.js'
+import { __resetIndexCacheForTests } from '../../src/indexes.js'
+
+/**
+ * Minimal payload-shaped object that satisfies `getVectorizedPayload(payload).getDbAdapterCustom()`.
+ *
+ * `getVectorizedPayload` (src/types.ts) reads `payload.config.custom.createVectorizedPayloadObject`
+ * and calls it with the payload to produce a `VectorizedPayload` whose `getDbAdapterCustom()`
+ * returns the adapter's `getConfigExtension().custom`. We mirror that contract exactly. 
+ */
+export function makeFakePayload(custom: Record<string, unknown>): BasePayload {
+  const payload = {
+    config: {
+      custom: {
+        createVectorizedPayloadObject: () => ({
+          getDbAdapterCustom: () => custom,
+        }),
+      },
+    },
+    logger: {
+      error: console.error.bind(console),
+      info: console.log.bind(console),
+    },
+  } as unknown as BasePayload
+  return payload
+}
+
+/** Spin up an admin client and drop the test DB. */
+export async function dropTestDb(uri: string, dbName: string): Promise<void> {
+  const c = new MongoClient(uri)
+  try {
+    await c.connect()
+    await c.db(dbName).dropDatabase()
+  } catch {
+    // ignore — DB may not exist
+  } finally {
+    await c.close()
+  }
+}
+
+export async function teardown(): Promise<void> {
+  __resetIndexCacheForTests()
+  await __closeForTests()
+}
+```
+
+- [ ] **Step 3: Write `adapters/mongodb/dev/specs/compliance.spec.ts`**
+
+```ts
+import { afterAll, beforeAll, describe, expect, test } from 'vitest'
+import { MongoClient } from 'mongodb'
+import type { BasePayload } from 'payload'
+import type { DbAdapter } from 'payloadcms-vectorize'
+import { DIMS, MONGO_URI, TEST_DB, makeIntegration } from './constants.js'
+import { dropTestDb, makeFakePayload, teardown } from './utils.js'
+
+describe('Mongo Adapter Compliance Tests', () => {
+  let adapter: DbAdapter
+  let payload: BasePayload
+
+  beforeAll(async () => {
+    await dropTestDb(MONGO_URI, TEST_DB)
+    const integration = makeIntegration()
+    adapter = integration.adapter
+    const ext = adapter.getConfigExtension({} as any)
+    payload = makeFakePayload(ext.custom!)
+  })
+
+  afterAll(async () => {
+    await dropTestDb(MONGO_URI, TEST_DB)
+    await teardown()
+  })
+
+  describe('getConfigExtension()', () => {
+    test('returns object with custom._mongoConfig', () => {
+      const ext = adapter.getConfigExtension({} as any)
+      expect(ext.custom?._mongoConfig).toBeDefined()
+      expect(ext.custom!._mongoConfig.uri).toBe(MONGO_URI)
+      expect(ext.custom!._mongoConfig.dbName).toBe(TEST_DB)
+      expect(ext.custom!._mongoConfig.pools.default.dimensions).toBe(DIMS)
+    })
+
+    test('does NOT include any collections (Mongo manages docs via raw driver)', () => {
+      const ext = adapter.getConfigExtension({} as any)
+      expect(ext.collections).toBeUndefined()
+    })
+  })
+
+  describe('storeChunk()', () => {
+    test('persists embedding (number[])', async () => {
+      const embedding = Array(DIMS)
+        .fill(0)
+        .map(() => Math.random())
+      await expect(
+        adapter.storeChunk(payload, 'default', {
+          sourceCollection: 'test-collection',
+          docId: `embed-1-${Date.now()}`,
+          chunkIndex: 0,
+          chunkText: 'test text',
+          embeddingVersion: 'v1',
+          embedding,
+          extensionFields: {},
+        }),
+      ).resolves.not.toThrow()
+    })
+
+    test('persists embedding (Float32Array)', async () => {
+      const embedding = new Float32Array(
+        Array(DIMS)
+          .fill(0)
+          .map(() => Math.random()),
+      )
+      await expect(
+        adapter.storeChunk(payload, 'default', {
+          sourceCollection: 'test-collection',
+          docId: `embed-2-${Date.now()}`,
+          chunkIndex: 0,
+          chunkText: 'test text float32',
+          embeddingVersion: 'v1',
+          embedding,
+          extensionFields: {},
+        }),
+      ).resolves.not.toThrow()
+    })
+  })
+
+  describe('search()', () => {
+    let target: number[]
+    beforeAll(async () => {
+      target = Array(DIMS).fill(0.5)
+      const similar = target.map((v) => v + Math.random() * 0.05)
+      await adapter.storeChunk(payload, 'default', {
+        sourceCollection: 'test-collection',
+        docId: `search-similar-${Date.now()}`,
+        chunkIndex: 0,
+        chunkText: 'similar doc',
+        embeddingVersion: 'v1',
+        embedding: similar,
+        extensionFields: {},
+      })
+    })
+
+    test('returns an 
array of results', async () => { + const results = await adapter.search(payload, target, 'default') + expect(Array.isArray(results)).toBe(true) + }) + + test('results have all required fields with correct types', async () => { + const results = await adapter.search(payload, target, 'default') + for (const r of results) { + expect(typeof r.id).toBe('string') + expect(typeof r.score).toBe('number') + expect(typeof r.sourceCollection).toBe('string') + expect(typeof r.docId).toBe('string') + expect(typeof r.chunkIndex).toBe('number') + expect(typeof r.chunkText).toBe('string') + expect(typeof r.embeddingVersion).toBe('string') + } + }) + + test('results are ordered by score (highest first)', async () => { + const results = await adapter.search(payload, target, 'default', 10) + for (let i = 1; i < results.length; i++) { + expect(results[i - 1].score).toBeGreaterThanOrEqual(results[i].score) + } + }) + + test('respects limit parameter', async () => { + const results = await adapter.search(payload, target, 'default', 1) + expect(results.length).toBeLessThanOrEqual(1) + }) + }) + + describe('deleteChunks()', () => { + test('removes chunks for a doc', async () => { + const docId = `to-delete-${Date.now()}` + await adapter.storeChunk(payload, 'default', { + sourceCollection: 'delete-test', + docId, + chunkIndex: 0, + chunkText: 'doc to delete', + embeddingVersion: 'v1', + embedding: Array(DIMS).fill(0.7), + extensionFields: {}, + }) + + const c = new MongoClient(MONGO_URI) + await c.connect() + const before = await c + .db(TEST_DB) + .collection('vectorize_default') + .countDocuments({ sourceCollection: 'delete-test', docId }) + expect(before).toBeGreaterThan(0) + + await adapter.deleteChunks(payload, 'default', 'delete-test', docId) + + const after = await c + .db(TEST_DB) + .collection('vectorize_default') + .countDocuments({ sourceCollection: 'delete-test', docId }) + expect(after).toBe(0) + await c.close() + }) + + test('handles missing doc gracefully', async () => { + await expect( + adapter.deleteChunks(payload, 'default', 'never-existed', 'fake-id'), + ).resolves.not.toThrow() + }) + }) + + describe('hasEmbeddingVersion()', () => { + test('true when chunk exists', async () => { + const docId = `has-version-${Date.now()}` + await adapter.storeChunk(payload, 'default', { + sourceCollection: 'test-collection', + docId, + chunkIndex: 0, + chunkText: 'has version test', + embeddingVersion: 'v1', + embedding: Array(DIMS).fill(0.5), + extensionFields: {}, + }) + const r = await adapter.hasEmbeddingVersion( + payload, 'default', 'test-collection', docId, 'v1', + ) + expect(r).toBe(true) + }) + + test('false when no chunk exists', async () => { + const r = await adapter.hasEmbeddingVersion( + payload, 'default', 'test-collection', 'never-existed', 'v1', + ) + expect(r).toBe(false) + }) + }) +}) +``` + +- [ ] **Step 4: Run compliance suite (requires `pnpm test:setup` running)** + +Run: `cd adapters/mongodb && pnpm test:setup && pnpm exec vitest run dev/specs/compliance.spec.ts` +Expected: all tests PASS. The first run may take ~30s while the search index builds. 
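+
+If the suite instead hangs and then fails on the 60s READY poll, the local Atlas container's logs usually say why (container name from the Task 14 compose file):
+
+```bash
+docker logs vectorize-mongodb-test --tail 100
+# For a clean slate:
+pnpm test:teardown && pnpm test:setup
+```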
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add adapters/mongodb/dev/specs/constants.ts adapters/mongodb/dev/specs/utils.ts adapters/mongodb/dev/specs/compliance.spec.ts
+git commit -m "test(mongodb): port compliance suite from PG, exercise adapter directly"
+```
+
+---
+
+## Task 16: WHERE-clause + integration suites against live Mongo
+
+**Files:**
+- Create: `adapters/mongodb/dev/specs/vectorSearchWhere.spec.ts`
+- Create: `adapters/mongodb/dev/specs/integration.spec.ts`
+
+The PG `vectorSearchWhere.spec.ts` runs end-to-end through the plugin's HTTP handler. For Mongo we exercise `adapter.search` directly because we don't register a Payload collection. The fixture data and assertions are otherwise identical so the suite mirrors PG's coverage of all 9 operators.
+
+- [ ] **Step 1: Write `adapters/mongodb/dev/specs/vectorSearchWhere.spec.ts`**
+
+```ts
+import { afterAll, beforeAll, describe, expect, test } from 'vitest'
+import type { BasePayload, Where } from 'payload'
+import type { DbAdapter, VectorSearchResult } from 'payloadcms-vectorize'
+import { createMongoVectorIntegration } from '../../src/index.js'
+import { DIMS, MONGO_URI } from './constants.js'
+import { dropTestDb, makeFakePayload, teardown } from './utils.js'
+
+const TEST_DB = `vectorize_mongo_where_${Date.now()}`
+const FILTERABLE = ['status', 'category', 'views', 'rating', 'published', 'tags']
+
+const articles = [
+  {
+    title: 'Published Tech Article',
+    status: 'published', category: 'tech', views: 150,
+    rating: 4.5, published: true, tags: 'javascript,nodejs,programming',
+  },
+  {
+    title: 'Draft Tech Article',
+    status: 'draft', category: 'tech', views: 0,
+    rating: 0, published: false, tags: 'javascript',
+  },
+  {
+    title: 'Published Design Article',
+    status: 'published', category: 'design', views: 300,
+    rating: 4.8, published: true, tags: 'ui,design,ux',
+  },
+  {
+    title: 'Archived Tech Article',
+    status: 'archived', category: 'tech', views: 50,
+    rating: 3.5, published: false, tags: 'python,legacy',
+  },
+]
+
+async function performVectorSearch(
+  payload: BasePayload,
+  adapter: DbAdapter,
+  where?: Where,
+  limit = 10,
+): Promise<VectorSearchResult[]> {
+  // Atlas Vector Search rejects limit > numCandidates. Pool is numCandidates: 50,
+  // and the WHERE-suite fixture is 4 articles, so limit=10 is effectively unlimited.
+  const queryEmbedding = Array(DIMS).fill(0.5)
+  return adapter.search(payload, queryEmbedding, 'default', limit, where)
+}
+
+describe('Mongo adapter — WHERE clause operators', () => {
+  let adapter: DbAdapter
+  let payload: BasePayload
+
+  beforeAll(async () => {
+    await dropTestDb(MONGO_URI, TEST_DB)
+    const { adapter: a } = createMongoVectorIntegration({
+      uri: MONGO_URI,
+      dbName: TEST_DB,
+      pools: {
+        default: {
+          dimensions: DIMS,
+          filterableFields: FILTERABLE,
+          numCandidates: 50,
+        },
+      },
+    })
+    adapter = a
+    const ext = adapter.getConfigExtension({} as any)
+    payload = makeFakePayload(ext.custom!)
+
+    let i = 0
+    for (const article of articles) {
+      const embedding = Array(DIMS).fill(0.5).map((v) => v + Math.random() * 0.05)
+      await adapter.storeChunk(payload, 'default', {
+        sourceCollection: 'articles',
+        docId: `art-${i++}`,
+        chunkIndex: 0,
+        chunkText: article.title,
+        embeddingVersion: 'v1',
+        embedding,
+        extensionFields: {
+          status: article.status,
+          category: article.category,
+          views: article.views,
+          rating: article.rating,
+          published: article.published,
+          tags: article.tags,
+        },
+      })
+    }
+    // Atlas Local has ~1s lag between insertOne and $vectorSearch visibility,
+    // even after the index is READY. 
Empirically: 0 results at 750ms, 4 at 1000ms. + await new Promise((r) => setTimeout(r, 1200)) + }, 90_000) + + afterAll(async () => { + await dropTestDb(MONGO_URI, TEST_DB) + await teardown() + }) + + describe('equals operator', () => { + test('filters by exact text match', async () => { + const r = await performVectorSearch(payload, adapter, { status: { equals: 'published' } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.status).toBe('published')) + }) + + test('returns empty when no match', async () => { + const r = await performVectorSearch(payload, adapter, { status: { equals: 'missing' } }) + expect(r).toEqual([]) + }) + }) + + describe('not_equals / notEquals operator', () => { + test('filters by non-equal text match', async () => { + const r = await performVectorSearch(payload, adapter, { status: { not_equals: 'draft' } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.status).not.toBe('draft')) + }) + + test('notEquals variant', async () => { + const r = await performVectorSearch(payload, adapter, { status: { notEquals: 'archived' } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.status).not.toBe('archived')) + }) + }) + + describe('in / not_in / notIn operators', () => { + test('in', async () => { + const r = await performVectorSearch(payload, adapter, { status: { in: ['published', 'draft'] } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(['published', 'draft']).toContain(x.status)) + }) + test('not_in', async () => { + const r = await performVectorSearch(payload, adapter, { status: { not_in: ['draft', 'archived'] } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(['draft', 'archived']).not.toContain(x.status)) + }) + test('notIn', async () => { + const r = await performVectorSearch(payload, adapter, { status: { notIn: ['archived'] } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.status).not.toBe('archived')) + }) + }) + + describe('like / contains operators (post-filter)', () => { + test('like substring match', async () => { + const r = await performVectorSearch(payload, adapter, { tags: { like: 'javascript' } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect((x.tags as string).toLowerCase()).toContain('javascript')) + }) + test('contains substring match', async () => { + const r = await performVectorSearch(payload, adapter, { category: { contains: 'tech' } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.category).toContain('tech')) + }) + test('like regex special chars do NOT match unintended values', async () => { + // None of our fixtures contain "foo.bar" — the dot must be escaped. 
+ const r = await performVectorSearch(payload, adapter, { tags: { like: 'foo.bar' } }) + expect(r).toEqual([]) + }) + }) + + describe('comparison operators (numbers)', () => { + test('greater_than', async () => { + const r = await performVectorSearch(payload, adapter, { views: { greater_than: 100 } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.views).toBeGreaterThan(100)) + }) + test('greaterThan variant', async () => { + const r = await performVectorSearch(payload, adapter, { views: { greaterThan: 100 } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.views).toBeGreaterThan(100)) + }) + test('greater_than_equal', async () => { + const r = await performVectorSearch(payload, adapter, { views: { greater_than_equal: 150 } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.views).toBeGreaterThanOrEqual(150)) + }) + test('less_than', async () => { + const r = await performVectorSearch(payload, adapter, { views: { less_than: 200 } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.views).toBeLessThan(200)) + }) + test('less_than_equal', async () => { + const r = await performVectorSearch(payload, adapter, { views: { less_than_equal: 150 } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.views).toBeLessThanOrEqual(150)) + }) + test('lessThan variant on float', async () => { + const r = await performVectorSearch(payload, adapter, { rating: { lessThan: 4.6 } }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.rating).toBeLessThan(4.6)) + }) + test('range via and', async () => { + const r = await performVectorSearch(payload, adapter, { + and: [{ views: { greater_than: 50 } }, { views: { less_than: 200 } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => { + expect(x.views).toBeGreaterThan(50) + expect(x.views).toBeLessThan(200) + }) + }) + }) + + describe('exists operator', () => { + test('exists true', async () => { + const r = await performVectorSearch(payload, adapter, { category: { exists: true } }) + r.forEach((x) => expect(x.category != null).toBe(true)) + }) + test('exists false', async () => { + const r = await performVectorSearch(payload, adapter, { category: { exists: false } }) + r.forEach((x) => expect(x.category == null).toBe(true)) + }) + }) + + describe('AND operator', () => { + test('text + text', async () => { + const r = await performVectorSearch(payload, adapter, { + and: [{ status: { equals: 'published' } }, { category: { equals: 'tech' } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => { + expect(x.status).toBe('published') + expect(x.category).toBe('tech') + }) + }) + test('text + numeric', async () => { + const r = await performVectorSearch(payload, adapter, { + and: [{ status: { equals: 'published' } }, { views: { greater_than: 100 } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => { + expect(x.status).toBe('published') + expect(x.views).toBeGreaterThan(100) + }) + }) + test('and with single condition', async () => { + const r = await performVectorSearch(payload, adapter, { + and: [{ status: { equals: 'published' } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.status).toBe('published')) + }) + test('and with one pre + one post operator', async () => { + const r = await performVectorSearch(payload, adapter, { + and: [{ status: { equals: 'published' } }, { tags: { like: 'javascript' } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => { + expect(x.status).toBe('published') 
+ expect((x.tags as string).toLowerCase()).toContain('javascript') + }) + }) + }) + + describe('OR operator', () => { + test('two text branches', async () => { + const r = await performVectorSearch(payload, adapter, { + or: [{ status: { equals: 'draft' } }, { status: { equals: 'archived' } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(['draft', 'archived']).toContain(x.status)) + }) + test('two numeric branches', async () => { + const r = await performVectorSearch(payload, adapter, { + or: [{ views: { greater_than: 200 } }, { rating: { greater_than: 4.7 } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => { + const a = (x.views as number) > 200 + const b = (x.rating as number) > 4.7 + expect(a || b).toBe(true) + }) + }) + test('or with single condition', async () => { + const r = await performVectorSearch(payload, adapter, { + or: [{ status: { equals: 'published' } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.status).toBe('published')) + }) + test('or with one post-filter branch routes whole or to post', async () => { + const r = await performVectorSearch(payload, adapter, { + or: [{ status: { equals: 'published' } }, { tags: { like: 'python' } }], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => { + const a = x.status === 'published' + const b = (x.tags as string).toLowerCase().includes('python') + expect(a || b).toBe(true) + }) + }) + }) + + describe('complex nested logic', () => { + test('(published AND tech) OR archived', async () => { + const r = await performVectorSearch(payload, adapter, { + or: [ + { + and: [ + { status: { equals: 'published' } }, + { category: { equals: 'tech' } }, + ], + }, + { status: { equals: 'archived' } }, + ], + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => { + const tech = x.status === 'published' && x.category === 'tech' + const arch = x.status === 'archived' + expect(tech || arch).toBe(true) + }) + }) + }) + + describe('reserved fields filterable without declaration', () => { + test('sourceCollection equals works on a pool that did not declare it', async () => { + const r = await performVectorSearch(payload, adapter, { + sourceCollection: { equals: 'articles' }, + }) + expect(r.length).toBeGreaterThan(0) + r.forEach((x) => expect(x.sourceCollection).toBe('articles')) + }) + }) + + describe('configuration errors', () => { + test('filtering on undeclared field throws clearly', async () => { + await expect( + performVectorSearch(payload, adapter, { + undeclared: { equals: 'x' }, + } as any), + ).rejects.toThrowError(/not configured as filterableFields/) + }) + }) + + describe('limit', () => { + test('returns at most `limit` results ordered by score', async () => { + const r = await performVectorSearch(payload, adapter, undefined, 2) + expect(r.length).toBeLessThanOrEqual(2) + for (let i = 1; i < r.length; i++) { + expect(r[i - 1].score).toBeGreaterThanOrEqual(r[i].score) + } + }) + }) +}) +``` + +- [ ] **Step 2: Write `adapters/mongodb/dev/specs/integration.spec.ts`** + +```ts +import { afterAll, beforeAll, describe, expect, test } from 'vitest' +import { MongoClient } from 'mongodb' +import type { BasePayload } from 'payload' +import type { DbAdapter } from 'payloadcms-vectorize' +import { createMongoVectorIntegration } from '../../src/index.js' +import { DIMS, MONGO_URI } from './constants.js' +import { dropTestDb, makeFakePayload, teardown } from './utils.js' + +const DB1 = `vectorize_mongo_int_${Date.now()}_a` + +describe('Mongo-specific integration 
tests', () => { + let adapter: DbAdapter + let payload: BasePayload + + beforeAll(async () => { + await dropTestDb(MONGO_URI, DB1) + const { adapter: a } = createMongoVectorIntegration({ + uri: MONGO_URI, + dbName: DB1, + pools: { + default: { + dimensions: DIMS, + numCandidates: 50, + }, + secondary: { + dimensions: DIMS, + numCandidates: 50, + }, + }, + }) + adapter = a + const ext = adapter.getConfigExtension({} as any) + payload = makeFakePayload(ext.custom!) + }) + + afterAll(async () => { + await dropTestDb(MONGO_URI, DB1) + await teardown() + }) + + test('ensureSearchIndex is idempotent across multiple storeChunk calls', async () => { + for (let i = 0; i < 3; i++) { + await adapter.storeChunk(payload, 'default', { + sourceCollection: 'idempotent', + docId: `id-${i}`, + chunkIndex: 0, + chunkText: `chunk ${i}`, + embeddingVersion: 'v1', + embedding: Array(DIMS).fill(0.1 + i * 0.01), + extensionFields: {}, + }) + } + + const c = new MongoClient(MONGO_URI) + await c.connect() + const indexes = (await c + .db(DB1) + .collection('vectorize_default') + .listSearchIndexes() + .toArray()) as Array<{ name: string }> + const matches = indexes.filter((i) => i.name === 'vectorize_default_idx') + expect(matches.length).toBe(1) + await c.close() + }, 90_000) + + test('storeChunk → immediate search returns the inserted doc', async () => { + const docId = `imm-${Date.now()}` + const target = Array(DIMS).fill(0.42) + await adapter.storeChunk(payload, 'default', { + sourceCollection: 'immediate', + docId, + chunkIndex: 0, + chunkText: 'immediate test', + embeddingVersion: 'v1', + embedding: target, + extensionFields: {}, + }) + // Atlas Local lag — see WHERE suite beforeAll for measurements. + await new Promise((r) => setTimeout(r, 1200)) + const r = await adapter.search(payload, target, 'default', 5) + const found = r.some((x) => x.docId === docId) + expect(found).toBe(true) + }) + + test('multiple pools coexist without collision', async () => { + await adapter.storeChunk(payload, 'secondary', { + sourceCollection: 'sec', + docId: 'sec-1', + chunkIndex: 0, + chunkText: 'secondary pool', + embeddingVersion: 'v1', + embedding: Array(DIMS).fill(0.9), + extensionFields: {}, + }) + + const c = new MongoClient(MONGO_URI) + await c.connect() + const a = await c.db(DB1).collection('vectorize_default').countDocuments() + const b = await c.db(DB1).collection('vectorize_secondary').countDocuments() + expect(a).toBeGreaterThan(0) + expect(b).toBeGreaterThan(0) + await c.close() + }) + + test('conflicting index definition throws actionable error', async () => { + // Manually create an index with a different definition on a fresh pool. + const dbName = `${DB1}_conflict` + await dropTestDb(MONGO_URI, dbName) + const c = new MongoClient(MONGO_URI) + await c.connect() + const coll = c.db(dbName).collection('vectorize_default') + // Ensure the collection exists by inserting a sentinel doc, then drop it. + await coll.insertOne({ _bootstrap: true }) + await coll.deleteMany({ _bootstrap: true }) + await coll.createSearchIndex({ + name: 'vectorize_default_idx', + type: 'vectorSearch', + definition: { + fields: [ + { type: 'vector', path: 'embedding', numDimensions: DIMS, similarity: 'euclidean' }, + { type: 'filter', path: 'sourceCollection' }, + { type: 'filter', path: 'docId' }, + { type: 'filter', path: 'embeddingVersion' }, + ], + }, + }) + + // createSearchIndex returns while the index is in PENDING; ensureSearchIndex + // treats anything not READY|BUILDING as "unexpected state". 
Wait for transition
+    // so the conflict path (different definition) is the one that fires.
+    const deadline = Date.now() + 30_000
+    while (Date.now() < deadline) {
+      const list = (await coll.listSearchIndexes('vectorize_default_idx').toArray()) as Array<{ name: string; status: string }>
+      const status = list.find((i) => i.name === 'vectorize_default_idx')?.status
+      if (status === 'BUILDING' || status === 'READY') break
+      await new Promise((r) => setTimeout(r, 200))
+    }
+
+    const { adapter: badAdapter } = createMongoVectorIntegration({
+      uri: MONGO_URI,
+      dbName,
+      pools: { default: { dimensions: DIMS, similarity: 'cosine', numCandidates: 50 } },
+    })
+    const badExt = badAdapter.getConfigExtension({} as any)
+    const badPayload = makeFakePayload(badExt.custom!)
+
+    try {
+      await expect(
+        badAdapter.storeChunk(badPayload, 'default', {
+          sourceCollection: 'x',
+          docId: 'x-1',
+          chunkIndex: 0,
+          chunkText: 'should fail',
+          embeddingVersion: 'v1',
+          embedding: Array(DIMS).fill(0.5),
+          extensionFields: {},
+        }),
+      ).rejects.toThrowError(/different definition/)
+    } finally {
+      await c.db(dbName).dropDatabase()
+      await c.close()
+    }
+  })
+})
+```
+
+- [ ] **Step 3: Run the integration + where suites**
+
+Run: `cd adapters/mongodb && pnpm test:setup && pnpm exec vitest run`
+Expected: all suites PASS. Total runtime ~2–3 min on first run (multiple search indexes built).
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add adapters/mongodb/dev/specs/vectorSearchWhere.spec.ts adapters/mongodb/dev/specs/integration.spec.ts
+git commit -m "test(mongodb): add WHERE-clause + integration suites against live Mongo"
+```
+
+---
+
+## Task 17: README
+
+**Files:**
+- Create: `adapters/mongodb/README.md`
+
+- [ ] **Step 1: Write `adapters/mongodb/README.md`**
+
+```markdown
+# @payloadcms-vectorize/mongodb
+
+MongoDB adapter for [`payloadcms-vectorize`](https://github.com/techiejd/payloadcms-vectorize). Targets both **MongoDB Atlas** (GA) and **self-hosted MongoDB Community 8.2+** (public preview) via a unified `$vectorSearch` API — connection string is the only difference.
+
+> **Status:** experimental. Atlas behavior is GA-quality; self-hosted Community vector search is in public preview as of MongoDB 8.2.
+
+## Install
+
+```bash
+pnpm add @payloadcms-vectorize/mongodb mongodb
+```
+
+## Connecting to Atlas
+
+```ts
+import { createMongoVectorIntegration } from '@payloadcms-vectorize/mongodb'
+
+const { adapter } = createMongoVectorIntegration({
+  uri: process.env.MONGODB_URI!, // mongodb+srv://...
+  dbName: 'payload_vectorize',
+  pools: {
+    default: {
+      dimensions: 1536,
+      similarity: 'cosine',
+      filterableFields: ['status', 'category', 'publishedAt'],
+    },
+  },
+})
+```
+
+## Connecting to self-hosted (Docker)
+
+```bash
+docker run -d -p 27018:27017 mongodb/mongodb-atlas-local:latest
+```
+
+```ts
+const { adapter } = createMongoVectorIntegration({
+  uri: 'mongodb://localhost:27018/?directConnection=true',
+  dbName: 'payload_vectorize',
+  pools: { default: { dimensions: 1536, filterableFields: ['status'] } },
+})
+```
+
+> Self-hosted vector search uses MongoDB's `mongot` engine (source-available, SSPL). It is in public preview in 8.2 — production-grade workloads should use Atlas.
+
+## Configuration
+
+| Option | Required | Default | Notes |
+|---|---|---|---|
+| `dimensions` | yes | — | Embedding vector dimensions; must match your model. |
+| `similarity` | no | `'cosine'` | `'cosine' \| 'euclidean' \| 'dotProduct'`. |
+| `numCandidates` | no | `max(limit*20, 100)` | ANN candidate set size for HNSW. |
+| `filterableFields` | no | `[]` | Extension fields you'll filter on in `where` clauses. |
+| `forceExact` | no | `false` | Use ENN exact full-scan instead of ANN. |
+| `collectionName` | no | `vectorize_<poolName>` | Override Mongo collection name. |
+| `indexName` | no | `<collectionName>_idx` | Override search index name. |
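+
+For example, with the Atlas pool above (`status` and `category` declared in `filterableFields`), a mixed `where` clause is split automatically. A sketch — `payload` is your initialized Payload instance, `queryEmbedding` stands in for a real embedding vector, and the call uses the adapter's `search(payload, queryEmbedding, poolName, limit, where)` signature:
+
+```ts
+const hits = await adapter.search(payload, queryEmbedding, 'default', 10, {
+  and: [
+    { status: { equals: 'published' } }, // declared → native $vectorSearch pre-filter
+    { category: { like: 'tech' } },      // `like` → post-filter, applied after the scan
+  ],
+})
+
+// Filtering on a field that is neither reserved nor declared throws up front:
+//   Field "undeclared" is not configured as filterableFields for pool "default"
+```
+
+See "WHERE clause behavior" below for the full operator routing.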
+
+## `filterableFields` explained
+
+MongoDB's `$vectorSearch` requires every field used in its native pre-filter to be declared as `type: 'filter'` in the search index definition. The adapter automatically declares the reserved fields (`sourceCollection`, `docId`, `embeddingVersion`) and any field name you list in `filterableFields`.
+
+Filtering on a field NOT in `filterableFields` (and not reserved) throws a clear adapter-side error before the request hits Mongo, rather than silently falling back to a slow scan.
+
+## Index lifecycle
+
+`ensureSearchIndex` runs lazily on the first `storeChunk` per pool:
+
+1. Lists existing search indexes.
+2. If the named index already exists with the same definition (`READY` or `BUILDING`), returns immediately.
+3. If it exists with a *different* definition, throws an error. **Auto-dropping is unsafe** — drop manually:
+   ```js
+   db.collection('vectorize_default').dropSearchIndex('vectorize_default_idx')
+   ```
+4. Otherwise creates the index and polls `listSearchIndexes` (≤ 60s) until `status === 'READY'`.
+
+The first write per pool may take ~5–30s while the index builds; subsequent calls are no-ops.
+
+## WHERE clause behavior
+
+The adapter splits a Payload `Where` clause into two stages:
+
+- **Pre-filter** (fast, applied inside `$vectorSearch.filter`): `equals`, `not_equals`, `in`, `not_in`, `greater_than`/`greater_than_equal`/`less_than`/`less_than_equal`, `exists`, plus `and`/`or` of any of those.
+- **Post-filter** (correct, applied after the vector scan): `like`, `contains`, `all` — these aren't expressible in `$vectorSearch.filter`, so the adapter applies them in JS against the result rows.
+
+### Implications
+
+- `$vectorSearch.limit` is enforced **before** the post-filter. If many rows fail the post-filter, you may receive fewer than `limit` results. The adapter does not over-fetch to compensate; this trade-off matches the Cloudflare Vectorize adapter's behavior.
+- An `or` clause where any branch needs a post-filter operator is routed entirely to the post-filter to preserve disjunction semantics.
+- Geo operators (`near`, `within`, `intersects`) are **not supported** — they throw a clear adapter error.
+
+## Tier guidance
+
+- **Atlas M0/Flex:** development only. Free, but the search index runs on a single shared replica with limited memory.
+- **Atlas M10+:** production. Use [Search Nodes](https://www.mongodb.com/docs/atlas/cluster-config/multi-cloud-distribution/) for dedicated `mongot` capacity.
+- **Self-hosted Community 8.2+:** preview-only. Production use waits on GA.
+
+## Limitations
+
+- Post-filter operators can reduce result count below `limit`.
+- Geo operators (`near`, `within`, `intersects`) throw — Mongo's `$vectorSearch` does not expose geo predicates.
+- Changing `dimensions`, `similarity`, or `filterableFields` after the index exists requires a manual `dropSearchIndex` first.
+
+## License
+
+MIT.
+``` + +- [ ] **Step 2: Commit** + +```bash +git add adapters/mongodb/README.md +git commit -m "docs(mongodb): add README walking from install through Atlas + self-hosted" +``` + +--- + +## Task 18: Wire into root `package.json` and changeset config + +**Files:** +- Modify: `package.json` +- Modify: `.changeset/config.json` + +- [ ] **Step 1: Read current root `package.json`** + +Open `package.json`. Lines to modify: +- `"build:adapters": "pnpm build:adapters:pg && pnpm build:adapters:cf"` (line 35) — chain `mongodb`. +- After `"build:adapters:cf": "..."` (line 37) — add `build:adapters:mongodb`. +- After `"test:adapters:cf": "..."` (line 60) — add `test:adapters:mongodb`. + +- [ ] **Step 2: Apply edits to `package.json`** + +Replace: +```json +"build:adapters": "pnpm build:adapters:pg && pnpm build:adapters:cf", +``` +with: +```json +"build:adapters": "pnpm build:adapters:pg && pnpm build:adapters:cf && pnpm build:adapters:mongodb", +``` + +Replace: +```json +"build:adapters:cf": "cd ./adapters/cf && tsc -p tsconfig.build.json && swc ./src -d ./dist --config-file ../../.swcrc --strip-leading-paths", +``` +with: +```json +"build:adapters:cf": "cd ./adapters/cf && tsc -p tsconfig.build.json && swc ./src -d ./dist --config-file ../../.swcrc --strip-leading-paths", +"build:adapters:mongodb": "cd ./adapters/mongodb && tsc -p tsconfig.build.json && swc ./src -d ./dist --config-file ../../.swcrc --strip-leading-paths", +``` + +Replace: +```json +"test:adapters:cf": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS='--require=dotenv/config --import=tsx --max-old-space-size=8192' vitest --config adapters/cf/vitest.config.ts" +``` +with: +```json +"test:adapters:cf": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS='--require=dotenv/config --import=tsx --max-old-space-size=8192' vitest --config adapters/cf/vitest.config.ts", +"test:adapters:mongodb": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS='--require=dotenv/config --import=tsx --max-old-space-size=8192' vitest --config adapters/mongodb/vitest.config.ts" +``` + +- [ ] **Step 3: Update `.changeset/config.json`** + +Replace: +```json +"fixed": [ + ["payloadcms-vectorize", "@payloadcms-vectorize/pg", "@payloadcms-vectorize/cf"] +], +``` +with: +```json +"fixed": [ + ["payloadcms-vectorize", "@payloadcms-vectorize/pg", "@payloadcms-vectorize/cf", "@payloadcms-vectorize/mongodb"] +], +``` + +- [ ] **Step 4: Verify root scripts work** + +Run: `pnpm build:adapters:mongodb` +Expected: `adapters/mongodb/dist/` populated with `.js` + `.d.ts`. + +Run: `pnpm build:types:all` +Expected: PASS, no type errors anywhere in repo. 
+ +- [ ] **Step 5: Commit** + +```bash +git add package.json .changeset/config.json +git commit -m "feat(mongodb): wire mongodb adapter into root build/test scripts and changesets" +``` + +--- + +## Task 19: CI job + +**Files:** +- Modify: `.github/workflows/ci.yml` + +- [ ] **Step 1: Add `test_adapters_mongodb` job** + +Append to `.github/workflows/ci.yml`, after the `test_adapters_cf` job and before `test_e2e`: + +```yaml + test_adapters_mongodb: + runs-on: ubuntu-latest + + services: + mongodb: + image: mongodb/mongodb-atlas-local:latest + ports: + - 27018:27017 + options: >- + --health-cmd "mongosh --quiet --eval 'db.runCommand({ping:1})'" + --health-interval 5s + --health-timeout 10s + --health-retries 30 + + steps: + - uses: actions/checkout@v4 + + - name: Install pnpm + uses: pnpm/action-setup@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'pnpm' + + - name: Install dependencies + run: pnpm install + + - name: Run mongodb adapter tests + run: pnpm test:adapters:mongodb + env: + PAYLOAD_SECRET: test-secret-key + MONGODB_URI: mongodb://localhost:27018/?directConnection=true + TEST_ENV: 1 +``` + +Also wire the new job into the aggregate `test` gate so PR-merge protection actually depends on it. Find the final `test:` job and update its `needs:` array and the conditional check script: + +```diff + test: + runs-on: ubuntu-latest +- needs: [typecheck, build, test_int, test_adapters_pg, test_adapters_cf, test_e2e] ++ needs: [typecheck, build, test_int, test_adapters_pg, test_adapters_cf, test_adapters_mongodb, test_e2e] + if: always() + steps: + - name: Check required jobs + run: | + if [ "${{ needs.typecheck.result }}" != "success" ] || \ + [ "${{ needs.build.result }}" != "success" ] || \ + [ "${{ needs.test_int.result }}" != "success" ] || \ + [ "${{ needs.test_adapters_pg.result }}" != "success" ] || \ + [ "${{ needs.test_adapters_cf.result }}" != "success" ] || \ ++ [ "${{ needs.test_adapters_mongodb.result }}" != "success" ] || \ + [ "${{ needs.test_e2e.result }}" != "success" ]; then + echo "One or more required jobs failed" + exit 1 + fi +``` + +- [ ] **Step 2: Validate the workflow file** + +Run: `cd /Users/juandominguez/development/payloadcms-vectorize/.worktrees/mongodb-adapter && python3 -c "import yaml; yaml.safe_load(open('.github/workflows/ci.yml'))"` +Expected: no error (YAML parses cleanly). + +- [ ] **Step 3: Commit** + +```bash +git add .github/workflows/ci.yml +git commit -m "ci(mongodb): add test_adapters_mongodb job using mongodb-atlas-local service" +``` + +--- + +## Task 20: End-to-end verification + +**Files:** none modified — verifies the full pipeline. + +- [ ] **Step 1: Clean install + full build** + +Run: `pnpm clean && pnpm install && pnpm build` +Expected: PASS, `adapters/mongodb/dist/` populated. + +- [ ] **Step 2: Type check across the whole repo** + +Run: `pnpm build:types:all` +Expected: PASS. + +- [ ] **Step 3: Bring up Mongo and run the adapter test suite** + +Run: `cd adapters/mongodb && pnpm test:setup` +Then: `cd /Users/juandominguez/development/payloadcms-vectorize/.worktrees/mongodb-adapter && pnpm test:adapters:mongodb` +Expected: all suites PASS (compliance + vectorSearchWhere + integration + convertWhere unit + escapeRegExp unit). + +- [ ] **Step 4: Tear down** + +Run: `cd adapters/mongodb && pnpm test:teardown` +Expected: container removed. 
+ +- [ ] **Step 5: Run the existing PG and CF suites to confirm no regressions** + +Run: `pnpm test:setup && pnpm test:adapters:pg && pnpm test:adapters:cf && pnpm test:teardown` +Expected: PASS for both. + +- [ ] **Step 6: Add a changeset entry** + +Run: `pnpm changeset` +- Select: `@payloadcms-vectorize/mongodb` +- Bump: `minor` +- Summary: `Add MongoDB adapter (Atlas + self-hosted Community 8.2+) with $vectorSearch, pre/post filter split, and full WHERE-clause parity.` + +- [ ] **Step 7: Commit changeset** + +```bash +git add .changeset/ +git commit -m "chore(mongodb): add changeset for new adapter" +``` + +- [ ] **Step 8: Push and open PR** + +```bash +git push -u origin feat/mongodb-adapter +gh pr create --title "feat: add @payloadcms-vectorize/mongodb adapter" --body "$(cat <<'EOF' +## Summary +- New `@payloadcms-vectorize/mongodb` adapter targets MongoDB Atlas + self-hosted Community 8.2+ via unified `$vectorSearch`. +- WHERE-clause parity with the PG adapter: pre-filter for `equals`/`not_equals`/`in`/`not_in`/`gt`/`gte`/`lt`/`lte`/`exists`/`and`/`or`; post-filter for `like`/`contains`/`all`. +- Local dev + CI use `mongodb/mongodb-atlas-local` Docker image — no Atlas account or secrets. + +## Test plan +- [x] `pnpm test:adapters:mongodb` passes locally +- [x] `pnpm test:adapters:pg` and `pnpm test:adapters:cf` still pass (no regressions) +- [x] `pnpm build:types:all` passes +- [x] Spec: `docs/superpowers/specs/2026-04-25-mongodb-adapter.md` +- [x] Plan: `docs/superpowers/plans/2026-04-25-mongodb-adapter.md` +EOF +)" +``` + +Expected: PR URL printed; CI runs the new `test_adapters_mongodb` job alongside existing jobs. + +--- + +## Acceptance Criteria (from spec §13) + +- `pnpm test:adapters:mongodb` passes against the docker-compose stack — Task 20 step 3. +- `pnpm build:adapters:mongodb` produces `adapters/mongodb/dist/` with `.js` + `.d.ts` — Task 18 step 4. +- `pnpm build:types:all` typechecks — Task 20 step 2. +- New CI job `test_adapters_mongodb` passes — Task 19 + Task 20 step 8. +- README walks a fresh user from `npm install` to a working vector search — Task 17. +- `where` parity with Payload CRUD — Task 16 (vectorSearchWhere covers the same operators against the same fixtures as the PG suite). diff --git a/docs/superpowers/specs/2026-04-25-mongodb-adapter.md b/docs/superpowers/specs/2026-04-25-mongodb-adapter.md new file mode 100644 index 0000000..44d0ee8 --- /dev/null +++ b/docs/superpowers/specs/2026-04-25-mongodb-adapter.md @@ -0,0 +1,401 @@ +# Spec: `@payloadcms-vectorize/mongodb` adapter + +> Single MongoDB adapter that targets both **MongoDB Atlas** (GA) and **self-hosted MongoDB Community 8.2+** (public preview) via a unified `$vectorSearch` API. +> +> **Strategy basis:** [`docs/plans/2026-04-25-mongodb-unified-adapter-strategy.md`](../../plans/2026-04-25-mongodb-unified-adapter-strategy.md). Companion deep-dives: [community](../../plans/2026-04-25-mongodb-adapter-deep-dive.md), [Atlas](../../plans/2026-04-25-mongodb-atlas-adapter-deep-dive.md). + +--- + +## 1. Goal + +Ship one published package — `@payloadcms-vectorize/mongodb` — that implements the existing `DbAdapter` contract from [`src/types.ts:384-418`](../../../src/types.ts#L384-L418) on top of MongoDB's `$vectorSearch` aggregation stage. + +The package must: + +1. Work against both Atlas (`mongodb+srv://...`) and self-hosted Community 8.2+ (`mongodb://...`) with **zero code branching** — connection-string is the only difference. +2. 
Reach **filter parity** with Payload's own `db-mongodb` adapter so a `Where` clause that works on the user's CRUD queries works identically against vector search. +3. Use the `mongodb/mongodb-atlas-local` Docker image for local dev and CI — no Atlas account, no CLI login, no secrets. +4. Pass the same compliance + WHERE-clause test suites the PG adapter passes, plus MongoDB-specific tests for the pre/post filter split. + +Out of scope: +- Running `mongod` or `mongot` for users; we only document setup. +- Atlas Search Nodes provisioning; that's user-side ops. +- Bulk-embed support beyond what the existing plugin contract already handles. + +--- + +## 2. Public API + +```ts +import { createMongoVectorIntegration } from '@payloadcms-vectorize/mongodb' + +const { adapter } = createMongoVectorIntegration({ + uri: process.env.MONGODB_URI!, + dbName: 'payload_vectorize', + pools: { + default: { + dimensions: 1536, + similarity: 'cosine', // 'cosine' | 'euclidean' | 'dotProduct'; default 'cosine' + numCandidates: 200, // optional; default = max(limit * 20, 100) + filterableFields: ['status', 'category', 'publishedAt', 'tags'], + forceExact: false, // optional; default false (ANN). true = ENN full scan. + collectionName: 'vectorize_default', // optional; default = `vectorize_${poolName}` + indexName: 'vectorize_default_idx', // optional; default = `${collectionName}_idx` + }, + }, +}) +``` + +- `uri`: any valid MongoDB connection string. SRV (`mongodb+srv://`) for Atlas, standard (`mongodb://`) for self-hosted. Required. +- `dbName`: database that will hold the per-pool vector collections. Required. Created on first write if absent (Mongo behavior). +- `pools`: keyed by pool name (must match a `knowledgePools` key in the main plugin config). +- `dimensions`: required per pool, must match the embedding model's vector dim. +- `similarity`: maps directly to the index definition's `similarity` field. Default `'cosine'`. +- `numCandidates`: ANN candidate set size. Default formula: `Math.max(limit * 20, 100)` per call, computed at search time. +- `filterableFields`: extension fields the user wants to filter on. The adapter pre-declares these as `type: "filter"` in the search index. **Reserved fields (`sourceCollection`, `docId`, `embeddingVersion`) are always declared as filter fields automatically — users do NOT list them.** Optional; default `[]`. +- `forceExact`: opt into ENN exact search instead of HNSW ANN. Default `false`. +- `collectionName` / `indexName`: optional overrides for advanced users. + +The factory returns `{ adapter }`, matching the CF adapter's shape ([adapters/cf/src/index.ts:41](../../../adapters/cf/src/index.ts#L41)). No `afterSchemaInitHook` (Mongo doesn't need schema migration). + +--- + +## 3. Data layout + +For each pool, the adapter manages **one MongoDB collection** in `dbName`. 
Document shape: + +```ts +{ + _id: ObjectId, // auto + sourceCollection: string, // reserved + docId: string, // reserved (always stored as string) + chunkIndex: number, // reserved + chunkText: string, // reserved + embeddingVersion: string, // reserved + embedding: number[], // the vector + ...extensionFields, // user-provided per pool + createdAt: Date, // adapter-set + updatedAt: Date, // adapter-set +} +``` + +**Search index per collection** (created via `createSearchIndex`): + +```js +{ + name: indexName, + type: 'vectorSearch', + definition: { + fields: [ + { type: 'vector', path: 'embedding', numDimensions, similarity }, + { type: 'filter', path: 'sourceCollection' }, + { type: 'filter', path: 'docId' }, + { type: 'filter', path: 'embeddingVersion' }, + ...filterableFields.map(p => ({ type: 'filter', path: p })), + ], + }, +} +``` + +The adapter does **not** register a Payload `CollectionConfig` for vectors — those documents are managed entirely via the raw MongoDB driver, mirroring how the CF adapter delegates storage to Cloudflare Vectorize. The adapter optionally exposes the connection in `getConfigExtension().custom` so the `search()` method can recover it from a `BasePayload` instance. + +--- + +## 4. Method semantics + +### `getConfigExtension(payloadCmsConfig, knowledgePools?)` + +Returns `{ custom: { _mongoConfig: { uri, dbName, pools } } }`. No collections, no bins. The `custom` payload gives `search()` access to the same config the factory was called with, via `getVectorizedPayload(payload)?.getDbAdapterCustom()._mongoConfig`. + +### `storeChunk(payload, poolName, data)` + +1. Resolves the pool's collection. +2. Lazily ensures the search index exists (idempotent: skips if a search index named `indexName` already exists). +3. Inserts one document with `embedding: Array.from(data.embedding)` and all reserved + extension fields. +4. No return value (Promise). + +### `deleteChunks(payload, poolName, sourceCollection, docId)` + +`db.collection(name).deleteMany({ sourceCollection, docId: String(docId) })`. Returns `void` regardless of the deleted count (matches PG and CF behavior). + +### `hasEmbeddingVersion(payload, poolName, sourceCollection, docId, embeddingVersion)` + +`db.collection(name).countDocuments({ sourceCollection, docId: String(docId), embeddingVersion }, { limit: 1 }) > 0`. + +### `search(payload, queryEmbedding, poolName, limit = 10, where?)` + +Pipeline: + +```js +[ + { $vectorSearch: { + index, path: 'embedding', + queryVector: queryEmbedding, + numCandidates, limit, + ...(forceExact ? { exact: true } : {}), + ...(preFilter ? { filter: preFilter } : {}), + }}, + ...(postFilter ? [{ $match: postFilter }] : []), + { $project: { + _id: 1, score: { $meta: 'vectorSearchScore' }, + sourceCollection: 1, docId: 1, chunkIndex: 1, + chunkText: 1, embeddingVersion: 1, + // every field in `pool.filterableFields` is projected by default + // (so `where`-filterable fields are also returnable in results): + ...projectionForFilterableFields, + }}, +] +``` + +Returns `VectorSearchResult[]` ordered by `vectorSearchScore` descending (Mongo's natural order from `$vectorSearch`): + +```ts +{ + id: String(doc._id), + score: doc.score, + sourceCollection, docId, chunkIndex, chunkText, embeddingVersion, + ...extensionFields, +} +``` + +When the post-filter is present, the limit is applied **before** the post-filter (Mongo enforces `limit` inside `$vectorSearch`). 
This is acceptable because: (a) it matches `$vectorSearch` semantics, and (b) the same trade-off exists in the CF adapter. The trade-off is documented in the README.
+
+---
+
+## 5. WHERE clause translation: `convertWhereToMongo`
+
+The function returns:
+
+```ts
+type ConvertResult = {
+  preFilter: Record<string, unknown> | null
+  postFilter: Where | null
+}
+```
+
+### Pre-filter (allowed inside `$vectorSearch.filter`)
+
+| Payload op | Mongo op | Notes |
+|---|---|---|
+| `equals` | `$eq` | |
+| `not_equals` / `notEquals` | `$ne` | |
+| `in` | `$in` | array required |
+| `not_in` / `notIn` | `$nin` | array required |
+| `greater_than` / `greaterThan` | `$gt` | |
+| `greater_than_equal` / `greaterThanEqual` | `$gte` | |
+| `less_than` / `lessThan` | `$lt` | |
+| `less_than_equal` / `lessThanEqual` | `$lte` | |
+| `exists` | compound: `$exists` + `$ne null` (+ `$ne ''` for string-typed fields) | mirrors Payload's `buildExistsQuery` |
+| `and` (case-insensitive) | `$and` | recurse |
+| `or` (case-insensitive) | `$or` | recurse |
+
+Operators not listed in this table are post-filter (see next subsection). Multi-operator on the same path → wrap in `$and: [...]` so two predicates don't collide on the same key.
+
+### Post-filter (NOT allowed in `$vectorSearch.filter`)
+
+| Payload op | Strategy |
+|---|---|
+| `like` | `$match` with `$regex: escapeRegExp(value), $options: 'i'` |
+| `contains` (scalar) | `$match` with `$regex: escapeRegExp(value), $options: 'i'` |
+| `contains` (array hasMany) | `$match` with `$elemMatch: { $regex, $options: 'i' }` |
+| `all` | `$match` with `$all` |
+| `near` / `within` / `intersects` | **not supported** — throw a clear error |
+
+### `escapeRegExp`
+
+Inlined in the adapter (Payload doesn't export it):
+
+```ts
+function escapeRegExp(s: string): string {
+  return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
+}
+```
+
+### Splitting rule
+
+Walk the `Where` tree:
+- A leaf condition with only pre-filter operators → goes to `preFilter`.
+- A leaf condition with any post-filter operator → the whole leaf goes to `postFilter` (so all its predicates are evaluated together against the post-vectorSearch documents).
+- Nested `or`: if any branch contains a post-filter operator, the **entire `or` block** must go to post-filter. (You can't apply half of an `or` natively — semantics would be wrong.)
+- Nested `and`: split per branch — pre-filter compatible branches go to `preFilter`, others to `postFilter`. The combined `preFilter` is implicitly AND-ed; the combined `postFilter` is wrapped in `{ and: [...] }`.
+
+This matches the CF adapter's `splitWhere` ([adapters/cf/src/search.ts:91-142](../../../adapters/cf/src/search.ts#L91-L142)) extended for nested `or` correctness. A worked example follows after the field-mapping notes below.
+
+### Field mapping
+
+- The reserved field `id` (Payload-side string) is mapped to `_id` for the Mongo filter, and the value is cast to `ObjectId` if it's a 24-hex string. All other reserved fields and extension fields use their literal name.
+- Field names not present in `filterableFields` (and not reserved) get rejected at `convertWhereToMongo` time with a clear error: `Field "<field>" is not configured as filterableFields for pool "<pool>"`. This prevents Mongo's silent "no filter on unindexed field" failure mode.
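+
+For illustration, here is how the rules above play out on one clause (a sketch; the exact call shape of `convertWhereToMongo` is assumed — only its return type is specced above):
+
+```ts
+const { preFilter, postFilter } = convertWhereToMongo('default', {
+  and: [
+    { status: { equals: 'published' } }, // pre-filter-capable leaf
+    { tags: { like: 'javascript' } },    // `like` is post-filter-only
+  ],
+})
+// preFilter  => { status: { $eq: 'published' } }
+//   (goes into $vectorSearch.filter; multiple pre leaves would be AND-ed)
+// postFilter => { and: [{ tags: { like: 'javascript' } }] }
+//   (applied as a $match after the vector stage)
+
+// The same `like` branch nested under `or` instead of `and` would route the
+// entire `or` block to postFilter, per the splitting rule above.
+```
+
+---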
+
+## 6. Index lifecycle
+
+`ensureSearchIndex(client, dbName, pool)`:
+
+1. List existing search indexes via `db.collection(name).listSearchIndexes(indexName).toArray()`.
+2. If an index named `indexName` is `READY` or `BUILDING` with the same `definition`, return.
+3. If it exists with a different definition, **throw a clear error** (`"index '<indexName>' exists with different definition; drop it manually with db.collection.dropSearchIndex(...) before re-running"`). Auto-dropping is too risky.
+4. Otherwise create with `db.collection(name).createSearchIndex({ name, type: 'vectorSearch', definition: {...} })`, then poll `listSearchIndexes` every 1s until `status === 'READY'` or 60s timeout. Timeout throws a clear error advising the user to check Mongo logs.
+5. Cache "ensured" status in-memory per `(dbName, collectionName, indexName)` so subsequent `storeChunk` calls don't re-list.
+
+The first `storeChunk` for a pool may take ~5–30s while the index builds; subsequent calls are no-ops.
+
+---
+
+## 7. Connection lifecycle
+
+The adapter holds a singleton `MongoClient` per `createMongoVectorIntegration` call, lazily connected on first method invocation:
+
+```ts
+let clientPromise: Promise<MongoClient> | null = null
+const getClient = () => (clientPromise ??= MongoClient.connect(uri))
+```
+
+- No explicit `close()` in the public API; the client lives for the process lifetime, mirroring how Payload manages its own DB connection.
+- Tests are responsible for shutting down the client via an internal `__closeForTests()` helper exported from the package's `dev/` test utilities (not the public API).
+
+---
+
+## 8. Dev & CI environment
+
+**Image:** `mongodb/mongodb-atlas-local:latest` (bundles `mongod` + `mongot` + replica-set init).
+
+**Local dev:**
+
+`adapters/mongodb/dev/docker-compose.yml`:
+```yaml
+services:
+  mongodb-atlas:
+    image: mongodb/mongodb-atlas-local:latest
+    container_name: vectorize-mongodb-test
+    ports: ["27018:27017"] # 27018 to avoid collision with users' local mongod
+    healthcheck:
+      test: ["CMD", "mongosh", "--quiet", "--eval", "db.runCommand({ping:1})"]
+      interval: 2s
+      timeout: 5s
+      retries: 30
+```
+
+**Connection string:** `mongodb://localhost:27018/?directConnection=true`.
+
+**Adapter `package.json` scripts** (in `adapters/mongodb/package.json` — analogous to root [package.json:54-55](../../../package.json#L54-L55) but local to the adapter so contributors can `cd adapters/mongodb && pnpm test:setup`):
+
+```jsonc
+"test:setup": "docker-compose -f dev/docker-compose.yml up -d",
+"test:teardown": "docker-compose -f dev/docker-compose.yml down",
+```
+
+**Root `package.json` scripts** (test-runner and build scripts grouped with their PG/CF siblings — after [package.json:60](../../../package.json#L60) `test:adapters:cf` and after [package.json:37](../../../package.json#L37) `build:adapters:cf`):
+
+```jsonc
+"build:adapters:mongodb": "cd ./adapters/mongodb && tsc -p tsconfig.build.json && swc ./src -d ./dist --config-file ../../.swcrc --strip-leading-paths",
+"test:adapters:mongodb": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS='--require=dotenv/config --import=tsx --max-old-space-size=8192' vitest --config adapters/mongodb/vitest.config.ts",
+```
+
+`build:adapters` (line 35) is updated to chain the new `build:adapters:mongodb`.
+
+**Test setup helper** waits for `$vectorSearch` readiness by attempting a no-op vector search against a temp collection (with retry/backoff up to ~30s) — a sketch follows at the end of this section.
+
+**CI:** new `test_adapters_mongodb` job in `.github/workflows/ci.yml` starts `mongodb/mongodb-atlas-local` as a service container with the same healthcheck, waits for it to report healthy, then runs `pnpm test:adapters:mongodb` against it. No secrets, no Atlas account.
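+
+A minimal sketch of that helper (the name `waitForVectorSearchReady` comes from §10's layout; the retry shape, and the assumption that a reachable `mongot` answers a `$vectorSearch` against a missing index with an empty result rather than an error, are mine — adjust if your deployment errors instead):
+
+```ts
+import { MongoClient } from 'mongodb'
+
+export async function waitForVectorSearchReady(uri: string, timeoutMs = 30_000): Promise<void> {
+  const client = new MongoClient(uri)
+  await client.connect()
+  try {
+    const coll = client.db('__vectorize_probe').collection('probe')
+    const deadline = Date.now() + timeoutMs
+    let delay = 250
+    // Retry a no-op vector search until the aggregation is accepted.
+    for (;;) {
+      try {
+        await coll
+          .aggregate([
+            {
+              $vectorSearch: {
+                index: 'probe_idx', // intentionally non-existent
+                path: 'embedding',
+                queryVector: [0],
+                numCandidates: 1,
+                limit: 1,
+              },
+            },
+          ])
+          .toArray()
+        return // stage accepted → mongot is up
+      } catch (err) {
+        if (Date.now() + delay > deadline) throw err
+        await new Promise((r) => setTimeout(r, delay))
+        delay = Math.min(delay * 2, 2_000) // exponential backoff, capped at 2s
+      }
+    }
+  } finally {
+    await client.close()
+  }
+}
+```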
+
+---
+
+## 9. Test plan
+
+Three suites under `adapters/mongodb/dev/specs/`:
+
+### `compliance.spec.ts` (port from PG)
+
+Same shape as [adapters/pg/dev/specs/compliance.spec.ts](../../../adapters/pg/dev/specs/compliance.spec.ts):
+- `getConfigExtension()` returns valid extension with `custom._mongoConfig`.
+- `storeChunk()` accepts `number[]` and `Float32Array` embeddings.
+- `search()` returns array, results have all required fields with correct types, ordered by score desc, respects `limit`.
+- `deleteChunks()` removes chunks for a doc; idempotent on missing.
+- `hasEmbeddingVersion()` true/false.
+
+### `vectorSearchWhere.spec.ts` (ported from PG, then adapted)
+
+Port the 38-test PG suite verbatim (assertions are on result IDs/ordering/values, not SQL/Mongo strings). The PG fixtures filter on `status`, `category`, `views`, `rating`, `published`, and `tags` — these MUST all be declared in the Mongo test pool's `filterableFields` so the search index includes them. Plus:
+
+- **Pre/post split coverage** (Mongo-specific):
+  - `like` and `contains` round-trip correctly via post-filter (verifies escape + case-insensitivity).
+  - `like` with regex special chars (`foo.bar`, `a*b`, `(x)`) does NOT match unintended values.
+  - `or` containing one `like` branch goes entirely to post-filter — verify result correctness.
+  - Mixed `and` with both pre and post operators — pre goes native, post applies to native results.
+- **Configuration errors:**
+  - Filtering on a field not in `filterableFields` throws a clear adapter error before hitting Mongo.
+- **Reserved fields always filterable:**
+  - `where: { sourceCollection: { equals: ... } }` works even if `filterableFields` is empty.
+
+### `integration.spec.ts` (Mongo-specific)
+
+- `ensureSearchIndex` is idempotent across multiple `storeChunk` calls.
+- Conflicting index definition throws actionable error.
+- `storeChunk` then immediate `search` works after index ready (waits if needed).
+- Multiple pools coexist in same DB without index/collection collision.
+
+---
+
+## 10. Package layout
+
+```
+adapters/mongodb/
+├── package.json          # @payloadcms-vectorize/mongodb
+├── tsconfig.build.json   # extends ../tsconfig.adapter.json
+├── vitest.config.ts      # mirrors adapters/pg/vitest.config.js
+├── README.md             # see §11
+├── src/
+│   ├── index.ts          # createMongoVectorIntegration + adapter wiring
+│   ├── types.ts          # MongoVectorIntegrationConfig, PoolConfig, etc.
+│   ├── client.ts         # lazy singleton MongoClient
+│   ├── indexes.ts        # ensureSearchIndex
+│   ├── embed.ts          # storeChunk
+│   ├── search.ts         # search() + post-filter $match wiring
+│   ├── convertWhere.ts   # convertWhereToMongo (pre/post split)
+│   └── escapeRegExp.ts   # tiny utility
+└── dev/
+    ├── docker-compose.yml
+    └── specs/
+        ├── constants.ts  # shared test config
+        ├── utils.ts      # waitForVectorSearchReady, dropDb, etc.
+        ├── compliance.spec.ts
+        ├── vectorSearchWhere.spec.ts
+        └── integration.spec.ts
+```
+
+Files mirror PG's responsibility split (`embed.ts`, `search.ts`, `types.ts`, `index.ts`) plus three new files: `client.ts`, `indexes.ts`, `convertWhere.ts`.
+
+`package.json`'s `files` field must include only `dist/` and `README.md` (matching PG/CF) so `dev/` and `__closeForTests` test utilities are NOT in the published artifact.
+
+---
+
+## 11. README outline
+
+1. **Install**: `pnpm add @payloadcms-vectorize/mongodb mongodb`
+2. **Connecting to Atlas**: connection string snippet + `createMongoVectorIntegration` example.
+3. **Connecting to self-hosted (Docker)**: `docker run mongodb/mongodb-atlas-local:latest` + connection string.
Preview-status warning callout. +4. **Configuration**: per-pool config table (dimensions, similarity, numCandidates, filterableFields, forceExact). +5. **`filterableFields` explained**: why filtering requires pre-declaration; what happens if you omit a field. +6. **Index lifecycle**: how `ensureSearchIndex` works, what the first-write delay looks like, how to manually drop an index for redefinition. +7. **WHERE clause behavior**: which operators are pre-filtered (fast) vs post-filtered (correct but applied after vector scan); why `like`/`contains` go post-filter. +8. **Tier guidance**: M0/Flex/M10/Search Nodes for Atlas; preview status for Community. +9. **Limitations**: post-filter operators reduce result count below `limit` when many post-filter rejections occur; geo operators unsupported; index-definition changes require manual drop. + +--- + +## 12. Versioning + +Match existing adapter versioning: `0.x` aligned with the rest of the repo. Mark as `experimental` in keywords until MongoDB Community vector search GAs. Atlas behavior is GA-quality; the experimental label is about Mongo's labelling of self-hosted, not adapter maturity. + +**Changesets registration:** Add `"@payloadcms-vectorize/mongodb"` to the `fixed` array in [`.changeset/config.json`](../../../.changeset/config.json) (line 9) so it stays version-locked with `payloadcms-vectorize`, `@payloadcms-vectorize/pg`, and `@payloadcms-vectorize/cf`. `pnpm-workspace.yaml` already includes `adapters/*` so no workspace change is needed. + +--- + +## 13. Acceptance criteria + +- `pnpm test:adapters:mongodb` runs locally against the docker-compose stack and passes all suites. +- `pnpm build:adapters:mongodb` produces `adapters/mongodb/dist/` with `.js` + `.d.ts`. +- `pnpm build:types:all` typechecks across the whole repo. +- New CI job `test_adapters_mongodb` passes on a clean PR. +- README walks a fresh user from `npm install` to a working vector search in under ~10 minutes via the local Docker path. +- Any `where` clause that works in Payload's CRUD `find({ collection: 'articles', where: ... })` produces the same set of matched documents in vector search (modulo vector ordering and `limit`). 
diff --git a/package.json b/package.json index cb0469c..1ea9265 100644 --- a/package.json +++ b/package.json @@ -32,9 +32,10 @@ "scripts": { "build": "pnpm copyfiles && pnpm build:types && pnpm build:swc && pnpm build:adapters", "build:swc": "swc ./src -d ./dist --config-file .swcrc --strip-leading-paths", - "build:adapters": "pnpm build:adapters:pg && pnpm build:adapters:cf", + "build:adapters": "pnpm build:adapters:pg && pnpm build:adapters:cf && pnpm build:adapters:mongodb", "build:adapters:pg": "cd ./adapters/pg && tsc -p tsconfig.build.json && swc ./src -d ./dist --config-file ../../.swcrc --strip-leading-paths", "build:adapters:cf": "cd ./adapters/cf && tsc -p tsconfig.build.json && swc ./src -d ./dist --config-file ../../.swcrc --strip-leading-paths", + "build:adapters:mongodb": "cd ./adapters/mongodb && tsc -p tsconfig.build.json && swc ./src -d ./dist --config-file ../../.swcrc --strip-leading-paths", "build:types": "tsc -p tsconfig.build.json --outDir dist --rootDir ./src", "build:types:all": "pnpm build:types && tsc --noEmit", "clean": "rimraf {dist,*.tsbuildinfo,adapters/*/dist,adapters/*/*.tsbuildinfo}", @@ -57,7 +58,8 @@ "test:e2e": "playwright test", "test:int": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS='--require=dotenv/config --import=tsx --max-old-space-size=8192' vitest", "test:adapters:pg": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS='--require=dotenv/config --import=tsx --max-old-space-size=8192' vitest --config adapters/pg/vitest.config.js", - "test:adapters:cf": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS='--require=dotenv/config --import=tsx --max-old-space-size=8192' vitest --config adapters/cf/vitest.config.ts" + "test:adapters:cf": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS='--require=dotenv/config --import=tsx --max-old-space-size=8192' vitest --config adapters/cf/vitest.config.ts", + "test:adapters:mongodb": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS='--require=dotenv/config --import=tsx --max-old-space-size=8192' vitest --config adapters/mongodb/vitest.config.ts" }, "devDependencies": { "@changesets/changelog-github": "^0.5.2", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0caa24e..c9e19e1 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -157,6 +157,25 @@ importers: specifier: workspace:* version: link:../.. 
+ adapters/mongodb: + dependencies: + payload: + specifier: '>=3.0.0 <4.0.0' + version: 3.69.0(graphql@16.12.0)(typescript@5.7.3) + devDependencies: + '@payloadcms/db-mongodb': + specifier: 3.69.0 + version: 3.69.0(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3)) + '@payloadcms/richtext-lexical': + specifier: 3.69.0 + version: 3.69.0(@faceless-ui/modal@3.0.0(react-dom@19.1.0(react@19.1.0))(react@19.1.0))(@faceless-ui/scroll-info@2.0.0(react-dom@19.1.0(react@19.1.0))(react@19.1.0))(@payloadcms/next@3.69.0(@types/react@19.1.8)(graphql@16.12.0)(monaco-editor@0.55.1)(next@15.4.4(@babel/core@7.28.5)(@opentelemetry/api@1.9.0)(@playwright/test@1.57.0)(babel-plugin-macros@3.1.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(sass@1.77.4))(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(typescript@5.7.3))(@types/react@19.1.8)(monaco-editor@0.55.1)(next@15.4.4(@babel/core@7.28.5)(@opentelemetry/api@1.9.0)(@playwright/test@1.57.0)(babel-plugin-macros@3.1.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(sass@1.77.4))(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(typescript@5.7.3)(yjs@13.6.28) + mongodb: + specifier: ^6.10.0 + version: 6.21.0 + payloadcms-vectorize: + specifier: workspace:* + version: link:../.. + adapters/pg: dependencies: '@payloadcms/db-postgres': @@ -1652,6 +1671,9 @@ packages: react: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 react-dom: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 + '@mongodb-js/saslprep@1.4.9': + resolution: {integrity: sha512-RXSxsokhAF/4nWys8An8npsqOI33Ex1Hlzqjw2pZOO+GKtMAR2noGnUdsFiGwsaO/xXI+56mtjTmDA3JXJsvmA==} + '@napi-rs/nice-android-arm-eabi@1.1.1': resolution: {integrity: sha512-kjirL3N6TnRPv5iuHw36wnucNqXAO46dzK9oPb0wj076R5Xm8PfUVA9nAFB5ZNMmfJQJVKACAPd/Z2KYMppthw==} engines: {node: '>= 10'} @@ -1896,6 +1918,11 @@ packages: cpu: [x64] os: [win32] + '@payloadcms/db-mongodb@3.69.0': + resolution: {integrity: sha512-FGyI4JjBVxU6I/9r7G737l/ikzs+lhJ5UCZ4L8eKl5v8HKOr+KN1bJkoBa0aPEj08GHeHPB4f1SZhS6L09GmlQ==} + peerDependencies: + payload: 3.69.0 + '@payloadcms/db-postgres@3.69.0': resolution: {integrity: sha512-Fz/hjP0z88zrsYz1UzaqnoM3L+yHymH+yWUIJnIf7jMCtnfi/ws5XBX/0DHILvxoVsCNc5XSx3fTjcBCJoYylw==} peerDependencies: @@ -2476,6 +2503,12 @@ packages: '@types/uuid@10.0.0': resolution: {integrity: sha512-7gqG38EyHgyP1S+7+xomFtL+ZNHcKv6DwNaCZmJmo1vgMugyF3TCnXVg4t1uk89mLNwnLtnY3TpOpCOyp1/xHQ==} + '@types/webidl-conversions@7.0.3': + resolution: {integrity: sha512-CiJJvcRtIgzadHCYXw7dqEnMNRjhGZlYK05Mj9OyktqV8uVT8fD2BFOB7S1uwBE3Kj2Z+4UyPmFw/Ixgw/LAlA==} + + '@types/whatwg-url@11.0.5': + resolution: {integrity: sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==} + '@types/ws@8.18.1': resolution: {integrity: sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg==} @@ -3072,6 +3105,10 @@ packages: bson-objectid@2.0.4: resolution: {integrity: sha512-vgnKAUzcDoa+AeyYwXCoHyF2q6u/8H46dxu5JN+4/TZeq/Dlinn0K6GvxsCLb3LHUJl0m/TLiEK31kUwtgocMQ==} + bson@6.10.4: + resolution: {integrity: sha512-WIsKqkSC0ABoBJuT1LEX+2HEvNmNKKgnTAyd0fL8qzK4SH2i9NXg+t08YtdZp/V9IZ33cxe3iV4yM0qg8lMQng==} + engines: {node: '>=16.20.1'} + buffer-crc32@0.2.13: resolution: {integrity: sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==} @@ -4230,11 +4267,12 @@ packages: glob@10.5.0: resolution: {integrity: 
sha512-DfXN8DfhJ7NH3Oe7cFmu3NCu1wKbkReJ8TorzSAFbSKrlNaQSKfIzqYqVY8zlbs2NLBbWpRiU52GX2PbaBVNkg==} + deprecated: Old versions of glob are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me hasBin: true glob@7.2.3: resolution: {integrity: sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==} - deprecated: Glob versions prior to v9 are no longer supported + deprecated: Old versions of glob are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me globals@14.0.0: resolution: {integrity: sha512-oahGvuMGQlPw/ivIYBjVSrWAfWLBeku5tpPE2fOPLi+WHffIWbuh2tCjhyQhTBPMf5E9jDEH4FOmTYgYwbKwtQ==} @@ -4826,6 +4864,10 @@ packages: resolution: {integrity: sha512-ZZow9HBI5O6EPgSJLUb8n2NKgmVWTwCvHGwFuJlMjvLFqlGG6pjirPhtdsseaLZjSibD8eegzmYpUZwoIlj2cQ==} engines: {node: '>=4.0'} + kareem@2.6.3: + resolution: {integrity: sha512-C3iHfuGUXK2u8/ipq9LfjFfXFxAZMQJJq7vLS45r3D9Y2xQ/m4S8zaR4zMLFWh9AsNPXmcFfUDhTEO8UIC/V6Q==} + engines: {node: '>=12.0.0'} + keyv@4.5.4: resolution: {integrity: sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw==} @@ -5025,6 +5067,9 @@ packages: memoize-one@6.0.0: resolution: {integrity: sha512-rkpe71W0N0c0Xz6QD0eJETuWAJGnJ9afsl1srmwPrI+yBCkge5EycXXbYRyvL29zZVUWQCY7InPRCv3GDXuZNw==} + memory-pager@1.5.0: + resolution: {integrity: sha512-ZS4Bp4r/Zoeq6+NLJpP+0Zzm0pR8whtGPf1XExKLJBAczGMnSi3It14OiNCStjQjM6NU1okjQGSxgEZN8eBYKg==} + merge-stream@2.0.0: resolution: {integrity: sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==} @@ -5161,6 +5206,79 @@ packages: monaco-editor@0.55.1: resolution: {integrity: sha512-jz4x+TJNFHwHtwuV9vA9rMujcZRb0CEilTEwG2rRSpe/A7Jdkuj8xPKttCgOh+v/lkHy7HsZ64oj+q3xoAFl9A==} + mongodb-connection-string-url@3.0.2: + resolution: {integrity: sha512-rMO7CGo/9BFwyZABcKAWL8UJwH/Kc2x0g72uhDWzG48URRax5TCIcJ7Rc3RZqffZzO/Gwff/jyKwCU9TN8gehA==} + + mongodb@6.16.0: + resolution: {integrity: sha512-D1PNcdT0y4Grhou5Zi/qgipZOYeWrhLEpk33n3nm6LGtz61jvO88WlrWCK/bigMjpnOdAUKKQwsGIl0NtWMyYw==} + engines: {node: '>=16.20.1'} + peerDependencies: + '@aws-sdk/credential-providers': ^3.188.0 + '@mongodb-js/zstd': ^1.1.0 || ^2.0.0 + gcp-metadata: ^5.2.0 + kerberos: ^2.0.1 + mongodb-client-encryption: '>=6.0.0 <7' + snappy: ^7.2.2 + socks: ^2.7.1 + peerDependenciesMeta: + '@aws-sdk/credential-providers': + optional: true + '@mongodb-js/zstd': + optional: true + gcp-metadata: + optional: true + kerberos: + optional: true + mongodb-client-encryption: + optional: true + snappy: + optional: true + socks: + optional: true + + mongodb@6.21.0: + resolution: {integrity: sha512-URyb/VXMjJ4da46OeSXg+puO39XH9DeQpWCslifrRn9JWugy0D+DvvBvkm2WxmHe61O/H19JM66p1z7RHVkZ6A==} + engines: {node: '>=16.20.1'} + peerDependencies: + '@aws-sdk/credential-providers': ^3.188.0 + '@mongodb-js/zstd': ^1.1.0 || ^2.0.0 + gcp-metadata: ^5.2.0 + kerberos: ^2.0.1 + mongodb-client-encryption: '>=6.0.0 <7' + snappy: ^7.3.2 + socks: ^2.7.1 + peerDependenciesMeta: + '@aws-sdk/credential-providers': + optional: true + '@mongodb-js/zstd': + optional: true + gcp-metadata: + optional: true + kerberos: + optional: true + mongodb-client-encryption: + optional: true + snappy: + optional: true 
+ socks: + optional: true + + mongoose-paginate-v2@1.8.5: + resolution: {integrity: sha512-kFxhot+yw9KmpAGSSrF/o+f00aC2uawgNUbhyaM0USS9L7dln1NA77/pLg4lgOaRgXMtfgCENamjqZwIM1Zrig==} + engines: {node: '>=4.0.0'} + + mongoose@8.15.1: + resolution: {integrity: sha512-RhQ4DzmBi5BNGcS0w4u1vdMRIKcteXTCNzDt1j7XRcdWYBz1MjMjulBhPaeC5jBCHOD1yinuOFTTSOWLLGexWw==} + engines: {node: '>=16.20.1'} + + mpath@0.9.0: + resolution: {integrity: sha512-ikJRQTk8hw5DEoFVxHG1Gn9T/xcjtdnOKIU1JTmGjZZlg9LST2mBLmcX3/ICIbgJydT2GOc15RnNy5mHmzfSew==} + engines: {node: '>=4.0.0'} + + mquery@5.0.0: + resolution: {integrity: sha512-iQMncpmEK8R8ncT8HJGsGc9Dsp8xcgYMVSbs5jgnm1lFHTZqMJTUWTDx1LBO8+mK3tPNZWFLBghQEIOULSTHZg==} + engines: {node: '>=14.0.0'} + mri@1.2.0: resolution: {integrity: sha512-tzzskb3bG8LvYGFF/mDTpq3jpI6Q9wc3LEmBaghu+DdCssd1FakN7Bc0hVNmEyGq1bq3RgfkCb3cmQLpNPOroA==} engines: {node: '>=4'} @@ -5926,6 +6044,9 @@ packages: resolution: {integrity: sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw==} engines: {node: '>= 0.4'} + sift@17.1.3: + resolution: {integrity: sha512-Rtlj66/b0ICeFzYTuNvX/EF1igRbbnGSvEyT79McoZa/DeGhMyC5pWKOEsZKnpkqtSeovd5FL/bjHWC3CIIvCQ==} + siginfo@2.0.0: resolution: {integrity: sha512-ybx0WO1/8bSBLEWXZvEd7gMW3Sn3JFlW3TvX1nREbDLRNQNaeNN8WK0meBwPdAaOI7TtRRRJn/Es1zhrrCHu7g==} @@ -5995,6 +6116,9 @@ packages: resolution: {integrity: sha512-i5uvt8C3ikiWeNZSVZNWcfZPItFQOsYTUAOkcUPGd8DqDy1uOUikjt5dG+uRlwyvR108Fb9DOd4GvXfT0N2/uQ==} engines: {node: '>= 12'} + sparse-bitfield@3.0.3: + resolution: {integrity: sha512-kvzhi7vqKTfkh0PZU+2D2PIllw2ymqJKujUcyPMd9Y75Nv4nPbGJZXNhxsgdQab2BmlDct1YnfQCguEvHr7VsQ==} + spawndamnit@3.0.1: resolution: {integrity: sha512-MmnduQUuHCoFckZoWnXsTg7JaiLBJrKFj9UI2MbRPGaJeVpsLcVBu6P/IGZovziM/YBsellCmsprgNA+w0CzVg==} @@ -6267,6 +6391,10 @@ packages: tr46@0.0.3: resolution: {integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==} + tr46@5.1.1: + resolution: {integrity: sha512-hdF5ZgjTqgAntKkklYw0R03MG2x/bSzTtkxmIRw/sTNV8YXsCJ1tfLAX23lhxhHJlEf3CRCOCGGWw3vI3GaSPw==} + engines: {node: '>=18'} + truncate-utf8-bytes@1.0.2: resolution: {integrity: sha512-95Pu1QXQvruGEhv62XCMO3Mm90GscOCClvrIUwCM0PYOXK3kaF3l3sIHxx71ThJfcbM2O5Au6SO3AWCSEfW4mQ==} @@ -6580,6 +6708,14 @@ packages: webidl-conversions@3.0.1: resolution: {integrity: sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==} + webidl-conversions@7.0.0: + resolution: {integrity: sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==} + engines: {node: '>=12'} + + whatwg-url@14.2.0: + resolution: {integrity: sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw==} + engines: {node: '>=18'} + whatwg-url@5.0.0: resolution: {integrity: sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==} @@ -8328,6 +8464,10 @@ snapshots: react: 19.1.0 react-dom: 19.1.0(react@19.1.0) + '@mongodb-js/saslprep@1.4.9': + dependencies: + sparse-bitfield: 3.0.3 + '@napi-rs/nice-android-arm-eabi@1.1.1': optional: true @@ -8492,6 +8632,23 @@ snapshots: '@oxc-resolver/binding-win32-x64-msvc@1.12.0': optional: true + '@payloadcms/db-mongodb@3.69.0(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))': + dependencies: + mongoose: 8.15.1 + mongoose-paginate-v2: 1.8.5 + payload: 3.69.0(graphql@16.12.0)(typescript@5.7.3) + prompts: 2.4.2 + uuid: 10.0.0 + transitivePeerDependencies: + - 
'@aws-sdk/credential-providers' + - '@mongodb-js/zstd' + - gcp-metadata + - kerberos + - mongodb-client-encryption + - snappy + - socks + - supports-color + '@payloadcms/db-postgres@3.69.0(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))': dependencies: '@payloadcms/drizzle': 3.69.0(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0)(@types/pg@8.10.2)(payload@3.69.0(graphql@16.12.0)(typescript@5.7.3))(pg@8.16.3) @@ -9312,6 +9469,12 @@ snapshots: '@types/uuid@10.0.0': {} + '@types/webidl-conversions@7.0.3': {} + + '@types/whatwg-url@11.0.5': + dependencies: + '@types/webidl-conversions': 7.0.3 + '@types/ws@8.18.1': dependencies: '@types/node': 22.19.3 @@ -10100,6 +10263,8 @@ snapshots: bson-objectid@2.0.4: {} + bson@6.10.4: {} + buffer-crc32@0.2.13: {} buffer-crc32@1.0.0: {} @@ -12267,6 +12432,8 @@ snapshots: object.assign: 4.1.7 object.values: 1.2.1 + kareem@2.6.3: {} + keyv@4.5.4: dependencies: json-buffer: 3.0.1 @@ -12475,6 +12642,8 @@ snapshots: memoize-one@6.0.0: {} + memory-pager@1.5.0: {} + merge-stream@2.0.0: {} merge2@1.4.1: {} @@ -12692,6 +12861,52 @@ snapshots: dompurify: 3.2.7 marked: 14.0.0 + mongodb-connection-string-url@3.0.2: + dependencies: + '@types/whatwg-url': 11.0.5 + whatwg-url: 14.2.0 + + mongodb@6.16.0: + dependencies: + '@mongodb-js/saslprep': 1.4.9 + bson: 6.10.4 + mongodb-connection-string-url: 3.0.2 + + mongodb@6.21.0: + dependencies: + '@mongodb-js/saslprep': 1.4.9 + bson: 6.10.4 + mongodb-connection-string-url: 3.0.2 + + mongoose-paginate-v2@1.8.5: {} + + mongoose@8.15.1: + dependencies: + bson: 6.10.4 + kareem: 2.6.3 + mongodb: 6.16.0 + mpath: 0.9.0 + mquery: 5.0.0 + ms: 2.1.3 + sift: 17.1.3 + transitivePeerDependencies: + - '@aws-sdk/credential-providers' + - '@mongodb-js/zstd' + - gcp-metadata + - kerberos + - mongodb-client-encryption + - snappy + - socks + - supports-color + + mpath@0.9.0: {} + + mquery@5.0.0: + dependencies: + debug: 4.4.3 + transitivePeerDependencies: + - supports-color + mri@1.2.0: {} ms@2.1.3: {} @@ -13601,6 +13816,8 @@ snapshots: side-channel-map: 1.0.1 side-channel-weakmap: 1.0.2 + sift@17.1.3: {} + siginfo@2.0.0: {} signal-exit@3.0.7: {} @@ -13665,6 +13882,10 @@ snapshots: source-map@0.7.6: {} + sparse-bitfield@3.0.3: + dependencies: + memory-pager: 1.5.0 + spawndamnit@3.0.1: dependencies: cross-spawn: 7.0.6 @@ -13996,6 +14217,10 @@ snapshots: tr46@0.0.3: {} + tr46@5.1.1: + dependencies: + punycode: 2.3.1 + truncate-utf8-bytes@1.0.2: dependencies: utf8-byte-length: 1.0.5 @@ -14348,6 +14573,13 @@ snapshots: webidl-conversions@3.0.1: {} + webidl-conversions@7.0.0: {} + + whatwg-url@14.2.0: + dependencies: + tr46: 5.1.1 + webidl-conversions: 7.0.0 + whatwg-url@5.0.0: dependencies: tr46: 0.0.3