diff --git a/docs/api-reference/index.mdx b/docs/api-reference/index.mdx
index a625607..d3e5168 100644
--- a/docs/api-reference/index.mdx
+++ b/docs/api-reference/index.mdx
@@ -26,7 +26,7 @@ Other language SDKs are available through examples or third-party contributions.
| SDK Examples | Description |
|:--------------|-------------------|
-| [Java API Quickstart](https://github.com/lancedb/vectordb-recipes/tree/main/examples/saas_examples/rest_api_example)| Streamline REST API interactions in Java|
+| [Java API Quickstart](https://lancedb.github.io/lancedb/java/java/)| Streamline REST API interactions in Java|
{/* TODO: Add Go bindings reference page here */}
diff --git a/docs/embedding/index.mdx b/docs/embedding/index.mdx
index f6e0629..d316497 100644
--- a/docs/embedding/index.mdx
+++ b/docs/embedding/index.mdx
@@ -5,6 +5,18 @@ description: "Use the embedding API in LanceDB -- registry, functions, schemas,
icon: "bars"
---
+import {
+ PyOpenaiEmbeddings,
+ PyManualQuerySearch,
+ PyEmbeddingFunction,
+ TsOpenaiEmbeddings,
+ TsManualQuerySearch,
+ TsEmbeddingFunction,
+ RsOpenaiEmbeddings,
+ RsManualQuerySearch,
+ RsEmbeddingFunction,
+} from '/snippets/embedding.mdx';
+
Modern machine learning models can be trained to convert raw data into embeddings, which are vectors
of floating point numbers. The position of an embedding in vector space captures the semantics of
the data, so vectors that are close to each other are considered similar.
@@ -12,8 +24,8 @@ the data, so vectors that are close to each other are considered similar.
LanceDB provides an embedding function registry in OSS as well as its Cloud and Enterprise versions
([see below](#embeddings-in-lancedb-cloud-and-enterprise))
that automatically generates vector embeddings during data ingestion. Automatic query-time embedding
-generation is currently only supported in LanceDB OSS. The API abstracts embedding generation, allowing
-you to focus on your application logic.
+generation is available in LanceDB OSS, with SDK-specific query ergonomics. The API abstracts
+embedding generation, allowing you to focus on your application logic.
## Embedding Registry
@@ -21,47 +33,28 @@ you to focus on your application logic.
In LanceDB OSS, you can get a supported embedding function from the registry, and then use it in your table schema.
Once configured, the embedding function will automatically generate embeddings when you insert data
-into the table. And when you query the table, you can provide a query string or other input, and the
-embedding function will generate an embedding for it.
+into the table. Query-time behavior depends on the SDK: Python/TypeScript can query with text directly,
+while Rust examples typically compute query embeddings explicitly before vector search.
-```python Python icon="python"
-from lancedb.embeddings import get_registry
-from lancedb.pydantic import LanceModel, Vector
-
-# Get a sentence-transformer function
-func = get_registry().get("sentence-transformers").create()
-
-class MySchema(LanceModel):
- # Embed the 'text' field automatically
- text: str = func.SourceField()
- # Store the embeddings in the 'vector' field
- vector: Vector(func.ndims()) = func.VectorField()
-
-# Create a LanceDB table with the schema
-import lancedb
-db = lancedb.connect("./mydb")
-table = db.create_table("mytable", schema=MySchema)
-# Insert data - embeddings are generated automatically
-table.add([
- {"text": "This is a test."},
- {"text": "Another example."}
-])
-
-# Query the table - embeddings are generated for the query
-results = table.search("test example").limit(5).to_pandas()
-print(results)
-
-## Example Output
-# text vector _distance
-# 0 This is a test. [0.0123, -0.0456, ..., 0.0789] 0.123456
-# 1 Another example. [0.0234, -0.0567, ..., 0.0890] 0.234567
-```
+
+ {PyOpenaiEmbeddings}
+
+
+
+ {TsOpenaiEmbeddings}
+
+
+
+ {RsOpenaiEmbeddings}
+
### Using an embedding function
-The `.create()` method accepts several arguments to configure the embedding function's behavior. `max_retries` is a special argument that applies to all providers.
+Python SDK
+
+In the Python SDK, the `.create()` method accepts several arguments to configure embedding function behavior. `max_retries` is a special argument that applies to all providers.
| Argument | Type | Description |
|---|---|---|
@@ -108,59 +101,43 @@ Currently, the embedding registry on LanceDB Cloud
generated on the client side (and stored on the remote table). We don't yet support automatic query-time
embedding generation when sending queries, though this is planned for a future release.
-For now, you can manually generate the embeddings at query time using the same embedding function that
+
+The same manual query-embedding flow shown above in OSS applies to Cloud and Enterprise connections.
+
+
+For search, you can manually generate the embeddings at query time using the same embedding function that
was used during ingestion, and pass the embeddings to the search function.
-```python Python icon="python"
-import lancedb
-from lancedb.embeddings import get_registry
-from lancedb.pydantic import LanceModel, Vector
-
-db = lancedb.connect(...)
-func = get_registry().get("sentence-transformers").create()
-
-class MySchema(LanceModel):
- text: str = func.SourceField()
- vector: Vector(func.ndims()) = func.VectorField()
-
-table = db.create_table("mytable", schema=MySchema)
-table.add([
- {"text": "This is a test."},
- {"text": "Another example."}
-])
-
-# Manually generate embeddings for the query
-query_vector = func.generate_embeddings(["test example"])[0]
-results = table.search(query_vector).limit(5).to_pandas()
-```
+
+ {PyManualQuerySearch}
+
+
+
+ {TsManualQuerySearch}
+
+
+
+ {RsManualQuerySearch}
+
## Custom Embedding Functions
-You can always implement your own embedding function by inheriting from `TextEmbeddingFunction`
-(for text) or `EmbeddingFunction` (for multimodal data).
+You can always implement your own embedding function:
+- Python/TypeScript: subclass `TextEmbeddingFunction` (text) or `EmbeddingFunction` (multimodal).
+- Rust: implement the `EmbeddingFunction` trait.
-```python Python icon="python"
-from lancedb.embeddings import register, TextEmbeddingFunction
-from functools import cached_property
-
-@register("my-embedder")
-class MyTextEmbedder(TextEmbeddingFunction):
- model_name: str = "my-model"
-
- def generate_embeddings(self, texts: list[str]) -> list[list[float]]:
- # Your embedding logic here
- return self._model.encode(texts).tolist()
-
- def ndims(self) -> int:
- # Return the dimensionality of the embeddings
- return len(self.generate_embeddings(["test"])[0])
-
- @cached_property
- def _model(self):
- # Initialize your model once
- return MyEmbeddingModel(self.model_name)
-```
+
+ {PyEmbeddingFunction}
+
+
+
+ {TsEmbeddingFunction}
+
+
+
+ {RsEmbeddingFunction}
+
diff --git a/docs/snippets/embedding.mdx b/docs/snippets/embedding.mdx
index 230c86d..3ce10c3 100644
--- a/docs/snippets/embedding.mdx
+++ b/docs/snippets/embedding.mdx
@@ -2,8 +2,14 @@
export const PyAsyncOpenaiEmbeddings = "db = await lancedb.connect_async(uri)\nfunc = get_registry().get(\"openai\").create(name=\"text-embedding-ada-002\")\n\nclass Words(LanceModel):\n text: str = func.SourceField()\n vector: Vector(func.ndims()) = func.VectorField()\n\ntable = await db.create_table(\"words\", schema=Words, mode=\"overwrite\")\nawait table.add([{\"text\": \"hello world\"}, {\"text\": \"goodbye world\"}])\n\nquery = \"greetings\"\nactual = await (await table.search(query)).limit(1).to_pydantic(Words)[0]\nprint(actual.text)\n";
+export const PyEmbeddingFunction = "from functools import cached_property\n\nfrom lancedb.embeddings import TextEmbeddingFunction, register\n\nclass MyEmbeddingModel:\n def __init__(self, model_name: str):\n self.model_name = model_name\n\n def encode(self, texts: list[str]) -> list[list[float]]:\n return [[1.0, 2.0, 3.0] for _ in texts]\n\n@register(\"my-embedder\")\nclass MyTextEmbedder(TextEmbeddingFunction):\n model_name: str = \"my-model\"\n\n def generate_embeddings(self, texts: list[str]) -> list[list[float]]:\n # Your embedding logic here\n return self._model.encode(texts)\n\n def ndims(self) -> int:\n # Return the dimensionality of the embeddings\n return len(self.generate_embeddings([\"test\"])[0])\n\n @cached_property\n def _model(self) -> MyEmbeddingModel:\n # Initialize your model once\n return MyEmbeddingModel(self.model_name)\n";
+
export const PyImports = "from lancedb.pydantic import LanceModel, Vector\nfrom lancedb.embeddings import get_registry\n";
+export const PyManualQueryEmbeddings = "db = lancedb.connect(\"/tmp/db\")\nfunc = get_registry().get(\"openai\").create(name=\"text-embedding-ada-002\")\n\nclass Words(LanceModel):\n text: str = func.SourceField()\n vector: Vector(func.ndims()) = func.VectorField()\n\ntable = db.create_table(\"words\", schema=Words, mode=\"overwrite\")\ntable.add([{\"text\": \"hello world\"}, {\"text\": \"goodbye world\"}])\n\nquery_vector = func.generate_embeddings([\"greetings\"])[0]\n# --8<-- [start:manual_query_search]\n# query_vector is assumed to already be generated by your embedding function\nactual = table.search(query_vector).limit(1).to_pydantic(Words)[0]\nprint(actual.text)\n# --8<-- [end:manual_query_search]\n";
+
+export const PyManualQuerySearch = "# query_vector is assumed to already be generated by your embedding function\nactual = table.search(query_vector).limit(1).to_pydantic(Words)[0]\nprint(actual.text)\n";
+
export const PyOpenaiEmbeddings = "db = lancedb.connect(\"/tmp/db\")\nfunc = get_registry().get(\"openai\").create(name=\"text-embedding-ada-002\")\n\nclass Words(LanceModel):\n text: str = func.SourceField()\n vector: Vector(func.ndims()) = func.VectorField()\n\ntable = db.create_table(\"words\", schema=Words, mode=\"overwrite\")\ntable.add([{\"text\": \"hello world\"}, {\"text\": \"goodbye world\"}])\n\nquery = \"greetings\"\nactual = table.search(query).limit(1).to_pydantic(Words)[0]\nprint(actual.text)\n";
export const PyRegisterDevice = "import torch\n\nregistry = get_registry()\nif torch.cuda.is_available():\n registry.set_var(\"device\", \"cuda\")\n\nfunc = registry.get(\"huggingface\").create(device=\"$var:device:cpu\")\n";
@@ -14,7 +20,19 @@ export const TsEmbeddingFunction = "const db = await lancedb.connect(databaseDir
export const TsImports = "import * as lancedb from \"@lancedb/lancedb\";\nimport \"@lancedb/lancedb/embedding/openai\";\nimport { LanceSchema, getRegistry, register } from \"@lancedb/lancedb/embedding\";\nimport { EmbeddingFunction } from \"@lancedb/lancedb/embedding\";\nimport { type Float, Float32, Utf8 } from \"apache-arrow\";\n";
+export const TsManualQueryEmbeddings = "const db = await lancedb.connect(databaseDir);\nconst func = getRegistry()\n .get(\"openai\")\n ?.create({ model: \"text-embedding-ada-002\" }) as EmbeddingFunction;\n\nconst wordsSchema = LanceSchema({\n text: func.sourceField(new Utf8()),\n vector: func.vectorField(),\n});\nconst tbl = await db.createEmptyTable(\"words\", wordsSchema, {\n mode: \"overwrite\",\n});\nawait tbl.add([{ text: \"hello world\" }, { text: \"goodbye world\" }]);\n\nconst queryVector = await func.computeQueryEmbeddings(\"greetings\");\n// --8<-- [start:manual_query_search]\n// queryVector is assumed to already be generated by your embedding function\nconst actual = (await tbl.search(queryVector).limit(1).toArray())[0];\n// --8<-- [end:manual_query_search]\n";
+
+export const TsManualQuerySearch = "// queryVector is assumed to already be generated by your embedding function\nconst actual = (await tbl.search(queryVector).limit(1).toArray())[0];\n";
+
export const TsOpenaiEmbeddings = "const db = await lancedb.connect(databaseDir);\nconst func = getRegistry()\n .get(\"openai\")\n ?.create({ model: \"text-embedding-ada-002\" }) as EmbeddingFunction;\n\nconst wordsSchema = LanceSchema({\n text: func.sourceField(new Utf8()),\n vector: func.vectorField(),\n});\nconst tbl = await db.createEmptyTable(\"words\", wordsSchema, {\n mode: \"overwrite\",\n});\nawait tbl.add([{ text: \"hello world\" }, { text: \"goodbye world\" }]);\n\nconst query = \"greetings\";\nconst actual = (await tbl.search(query).limit(1).toArray())[0];\n";
export const TsRegisterSecret = "const registry = getRegistry();\nregistry.setVar(\"api_key\", \"sk-...\");\n\nconst func = registry.get(\"openai\")!.create({\n apiKey: \"$var:api_key\",\n});\n";
+export const RsEmbeddingFunction = "use std::{borrow::Cow, sync::Arc};\n\nuse arrow_array::{Array, FixedSizeListArray, Float32Array};\nuse arrow_schema::{DataType, Field, Schema};\nuse lancedb::{\n connect,\n embeddings::{EmbeddingDefinition, EmbeddingFunction},\n Result,\n};\n\n#[derive(Debug, Clone)]\nstruct MyTextEmbedder {\n dim: usize,\n}\n\nimpl EmbeddingFunction for MyTextEmbedder {\n fn name(&self) -> &str {\n \"my-embedder\"\n }\n\n fn source_type(&self) -> Result> {\n Ok(Cow::Owned(DataType::Utf8))\n }\n\n fn dest_type(&self) -> Result> {\n Ok(Cow::Owned(DataType::new_fixed_size_list(\n DataType::Float32,\n self.dim as i32,\n true,\n )))\n }\n\n fn compute_source_embeddings(&self, source: Arc) -> Result> {\n let values = Arc::new(Float32Array::from(vec![1.0f32; source.len() * self.dim]));\n let field = Arc::new(Field::new(\"item\", DataType::Float32, true));\n Ok(Arc::new(FixedSizeListArray::new(\n field,\n self.dim as i32,\n values,\n None,\n )))\n }\n\n fn compute_query_embeddings(&self, _input: Arc) -> Result> {\n unimplemented!()\n }\n}\n\n#[tokio::main]\nasync fn main() -> Result<()> {\n let db = connect(\"./mydb\").execute().await?;\n db.embedding_registry()\n .register(\"my-embedder\", Arc::new(MyTextEmbedder { dim: 3 }))?;\n\n let schema = Arc::new(Schema::new(vec![Field::new(\"text\", DataType::Utf8, false)]));\n db.create_empty_table(\"mytable\", schema)\n .add_embedding(EmbeddingDefinition::new(\n \"text\",\n \"my-embedder\",\n Some(\"vector\"),\n ))?\n .execute()\n .await?;\n\n Ok(())\n}\n";
+
+export const RsManualQueryEmbeddings = "use std::{iter::once, sync::Arc};\n\nuse arrow_array::{record_batch, StringArray};\nuse arrow_schema::{DataType, Field, Schema};\nuse futures::StreamExt;\nuse lancedb::{\n connect,\n embeddings::{openai::OpenAIEmbeddingFunction, EmbeddingDefinition, EmbeddingFunction},\n query::{ExecutableQuery, QueryBase},\n Result,\n};\n\n#[tokio::main]\nasync fn main() -> Result<()> {\n let db = connect(\"./mydb\").execute().await?;\n let api_key = std::env::var(\"OPENAI_API_KEY\").expect(\"OPENAI_API_KEY is not set\");\n let embedding = Arc::new(OpenAIEmbeddingFunction::new_with_model(\n api_key,\n \"text-embedding-3-large\",\n )?);\n db.embedding_registry().register(\"openai\", embedding.clone())?;\n\n let schema = Arc::new(Schema::new(vec![Field::new(\"text\", DataType::Utf8, false)]));\n let table = db\n .create_empty_table(\"mytable\", schema)\n .add_embedding(EmbeddingDefinition::new(\"text\", \"openai\", Some(\"vector\")))?\n .execute()\n .await?;\n\n table\n .add(record_batch!((\"text\", Utf8, [\"This is a test.\", \"Another example.\"]))?)\n .execute()\n .await?;\n\n // Manually generate embeddings for the query (Cloud/Enterprise path)\n let query = Arc::new(StringArray::from_iter_values(once(\"test example\")));\n let query_vector = embedding.compute_query_embeddings(query)?;\n // --8<-- [start:manual_query_search]\n // query_vector is assumed to already be generated by your embedding function\n let mut results = table.vector_search(query_vector)?.limit(5).execute().await?;\n\n while let Some(batch) = results.next().await {\n println!(\"{:?}\", batch?);\n }\n // --8<-- [end:manual_query_search]\n\n Ok(())\n}\n";
+
+export const RsManualQuerySearch = "// query_vector is assumed to already be generated by your embedding function\nlet mut results = table.vector_search(query_vector)?.limit(5).execute().await?;\n\nwhile let Some(batch) = results.next().await {\n println!(\"{:?}\", batch?);\n}\n";
+
+export const RsOpenaiEmbeddings = "use std::{iter::once, sync::Arc};\n\nuse arrow_array::{record_batch, StringArray};\nuse arrow_schema::{DataType, Field, Schema};\nuse futures::StreamExt;\nuse lancedb::{\n connect,\n embeddings::{openai::OpenAIEmbeddingFunction, EmbeddingDefinition, EmbeddingFunction},\n query::{ExecutableQuery, QueryBase},\n Result,\n};\n\n#[tokio::main]\nasync fn main() -> Result<()> {\n let db = connect(\"./mydb\").execute().await?;\n let api_key = std::env::var(\"OPENAI_API_KEY\").expect(\"OPENAI_API_KEY is not set\");\n let embedding = Arc::new(OpenAIEmbeddingFunction::new_with_model(\n api_key,\n \"text-embedding-3-large\",\n )?);\n\n db.embedding_registry().register(\"openai\", embedding.clone())?;\n\n let schema = Arc::new(Schema::new(vec![Field::new(\"text\", DataType::Utf8, false)]));\n let table = db\n .create_empty_table(\"mytable\", schema)\n .add_embedding(EmbeddingDefinition::new(\"text\", \"openai\", Some(\"vector\")))?\n .execute()\n .await?;\n\n table\n .add(record_batch!((\"text\", Utf8, [\"This is a test.\", \"Another example.\"]))?)\n .execute()\n .await?;\n\n let query = Arc::new(StringArray::from_iter_values(once(\"test example\")));\n let query_vector = embedding.compute_query_embeddings(query)?;\n let mut results = table.vector_search(query_vector)?.limit(5).execute().await?;\n\n while let Some(batch) = results.next().await {\n println!(\"{:?}\", batch?);\n }\n\n Ok(())\n}\n";
+
diff --git a/docs/tables/update.mdx b/docs/tables/update.mdx
index 21c5229..c9862ee 100644
--- a/docs/tables/update.mdx
+++ b/docs/tables/update.mdx
@@ -192,7 +192,9 @@ When rows are updated, they are moved out of the index. The row will still show
### Delete Operations
-Remove rows that match a condition.
+Delete operations **soft delete** rows that match a given condition.
+The underlying data is not immediately removed, but is marked
+for deletion (in the [deletion files](https://lance.org/format/table/#deletion-files) at the Lance format level) and excluded from query results.
@@ -200,14 +202,29 @@ Remove rows that match a condition.
-
-Delete operations soft delete rows. Rows are hard deleted later by compaction and cleanup operations that happen in the background on LanceDB Cloud and Enterprise. The default retention on Cloud is 30 days. During this time, these rows are still accessible to query or restore by accessing old table versions (see [Versioning & Reproducibility in LanceDB](/tables/versioning/)).
-
-
-If a table is emptied, its existing indexes are removed. Recreate indexes after ingesting new data.
+
+**Deleting rows removes them from the index**
+
+When rows are deleted, those rows are also excluded from the index segments, so indexed queries will not return them either. If ALL the rows are deleted (i.e., the table is emptied), ensure that you recreate the index after ingesting new data.
+To permanently remove deleted rows, you can optimize the table, which runs compaction and cleans up the soft-deleted rows, freeing up storage space.
+
+- In LanceDB OSS, compaction and cleanup are manual. Run `table.optimize()` regularly to free up disk space.
+- In LanceDB Cloud, compaction and cleanup runs automatically in the background.
+- In LanceDB Enterprise, files aren't cleaned up by default. You can configure the compaction and cleanup behavior at cluster setup time to suit your organization's retention policy.
+
+By default, table cleanup removes data up to 7 days ago. If you need to reclaim space from deleted rows more aggressively, manually call `table.optimize()` with a shorter retention window as follows:
+
+
+ ```python Python icon=Python
+ from datetime import timedelta
+
+ table.optimize(cleanup_older_than=timedelta(days=1))
+ ```
+
+
## Merge Operations
The merge insert command is a flexible API that can be used to perform `upsert`,
@@ -275,4 +292,3 @@ for the document that are not in the new data, with `when_not_matched_by_source_
We suggest the best batch size to be 500k for optimal performance.
-
diff --git a/docs/tables/versioning.mdx b/docs/tables/versioning.mdx
index fcfa498..d3dd737 100644
--- a/docs/tables/versioning.mdx
+++ b/docs/tables/versioning.mdx
@@ -282,6 +282,10 @@ Each version represents a distinct state of your data, allowing you to:
- Revert to previous states
- Maintain data lineage for ML reproducibility
-
-System operations like index updates and table compaction automatically increment the table version number. These background processes are tracked in the version history, though their version numbers are omitted from this example for clarity.
+
+**System Operations**
+
+System operations like `optimize()`, index updates, and table compaction also increment table version numbers.
+In LanceDB OSS and Cloud, `optimize()` can prune older versions based on its retention setting (`cleanup_older_than`, 7 days by default),
+which is when old-version files are removed and disk space is reclaimed.
diff --git a/tests/py/test_embedding.py b/tests/py/test_embedding.py
index 79add01..7c5e7d7 100644
--- a/tests/py/test_embedding.py
+++ b/tests/py/test_embedding.py
@@ -51,6 +51,60 @@ class Words(LanceModel):
# --8<-- [end:async_openai_embeddings]
+@pytest.mark.skip(reason="OpenAI is not available in the test environment")
+def test_embeddings_manual_query():
+ # --8<-- [start:manual_query_embeddings]
+ db = lancedb.connect("/tmp/db")
+ func = get_registry().get("openai").create(name="text-embedding-ada-002")
+
+ class Words(LanceModel):
+ text: str = func.SourceField()
+ vector: Vector(func.ndims()) = func.VectorField()
+
+ table = db.create_table("words", schema=Words, mode="overwrite")
+ table.add([{"text": "hello world"}, {"text": "goodbye world"}])
+
+ query_vector = func.generate_embeddings(["greetings"])[0]
+ # --8<-- [start:manual_query_search]
+ # query_vector is assumed to already be generated by your embedding function
+ actual = table.search(query_vector).limit(1).to_pydantic(Words)[0]
+ print(actual.text)
+ # --8<-- [end:manual_query_search]
+ # --8<-- [end:manual_query_embeddings]
+
+
+def test_custom_embedding_function():
+ # --8<-- [start:embedding_function]
+ from functools import cached_property
+
+ from lancedb.embeddings import TextEmbeddingFunction, register
+
+ class MyEmbeddingModel:
+ def __init__(self, model_name: str):
+ self.model_name = model_name
+
+ def encode(self, texts: list[str]) -> list[list[float]]:
+ return [[1.0, 2.0, 3.0] for _ in texts]
+
+ @register("my-embedder")
+ class MyTextEmbedder(TextEmbeddingFunction):
+ model_name: str = "my-model"
+
+ def generate_embeddings(self, texts: list[str]) -> list[list[float]]:
+ # Your embedding logic here
+ return self._model.encode(texts)
+
+ def ndims(self) -> int:
+ # Return the dimensionality of the embeddings
+ return len(self.generate_embeddings(["test"])[0])
+
+ @cached_property
+ def _model(self) -> MyEmbeddingModel:
+ # Initialize your model once
+ return MyEmbeddingModel(self.model_name)
+ # --8<-- [end:embedding_function]
+
+
def test_embeddings_secret():
# --8<-- [start:register_secret]
registry = get_registry()
diff --git a/tests/rs/embedding.rs b/tests/rs/embedding.rs
new file mode 100644
index 0000000..c732f6b
--- /dev/null
+++ b/tests/rs/embedding.rs
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The LanceDB Authors
+
+// --8<-- [start:openai_embeddings]
+use std::{iter::once, sync::Arc};
+
+use arrow_array::{record_batch, StringArray};
+use arrow_schema::{DataType, Field, Schema};
+use futures::StreamExt;
+use lancedb::{
+ connect,
+ embeddings::{openai::OpenAIEmbeddingFunction, EmbeddingDefinition, EmbeddingFunction},
+ query::{ExecutableQuery, QueryBase},
+ Result,
+};
+
+#[tokio::main]
+async fn main() -> Result<()> {
+ let db = connect("./mydb").execute().await?;
+ let api_key = std::env::var("OPENAI_API_KEY").expect("OPENAI_API_KEY is not set");
+ let embedding = Arc::new(OpenAIEmbeddingFunction::new_with_model(
+ api_key,
+ "text-embedding-3-large",
+ )?);
+
+ db.embedding_registry().register("openai", embedding.clone())?;
+
+ let schema = Arc::new(Schema::new(vec![Field::new("text", DataType::Utf8, false)]));
+ let table = db
+ .create_empty_table("mytable", schema)
+ .add_embedding(EmbeddingDefinition::new("text", "openai", Some("vector")))?
+ .execute()
+ .await?;
+
+ table
+ .add(record_batch!(("text", Utf8, ["This is a test.", "Another example."]))?)
+ .execute()
+ .await?;
+
+ let query = Arc::new(StringArray::from_iter_values(once("test example")));
+ let query_vector = embedding.compute_query_embeddings(query)?;
+ let mut results = table.vector_search(query_vector)?.limit(5).execute().await?;
+
+ while let Some(batch) = results.next().await {
+ println!("{:?}", batch?);
+ }
+
+ Ok(())
+}
+// --8<-- [end:openai_embeddings]
+
+// --8<-- [start:manual_query_embeddings]
+use std::{iter::once, sync::Arc};
+
+use arrow_array::{record_batch, StringArray};
+use arrow_schema::{DataType, Field, Schema};
+use futures::StreamExt;
+use lancedb::{
+ connect,
+ embeddings::{openai::OpenAIEmbeddingFunction, EmbeddingDefinition, EmbeddingFunction},
+ query::{ExecutableQuery, QueryBase},
+ Result,
+};
+
+#[tokio::main]
+async fn main() -> Result<()> {
+ let db = connect("./mydb").execute().await?;
+ let api_key = std::env::var("OPENAI_API_KEY").expect("OPENAI_API_KEY is not set");
+ let embedding = Arc::new(OpenAIEmbeddingFunction::new_with_model(
+ api_key,
+ "text-embedding-3-large",
+ )?);
+ db.embedding_registry().register("openai", embedding.clone())?;
+
+ let schema = Arc::new(Schema::new(vec![Field::new("text", DataType::Utf8, false)]));
+ let table = db
+ .create_empty_table("mytable", schema)
+ .add_embedding(EmbeddingDefinition::new("text", "openai", Some("vector")))?
+ .execute()
+ .await?;
+
+ table
+ .add(record_batch!(("text", Utf8, ["This is a test.", "Another example."]))?)
+ .execute()
+ .await?;
+
+ // Manually generate embeddings for the query (Cloud/Enterprise path)
+ let query = Arc::new(StringArray::from_iter_values(once("test example")));
+ let query_vector = embedding.compute_query_embeddings(query)?;
+ // --8<-- [start:manual_query_search]
+ // query_vector is assumed to already be generated by your embedding function
+ let mut results = table.vector_search(query_vector)?.limit(5).execute().await?;
+
+ while let Some(batch) = results.next().await {
+ println!("{:?}", batch?);
+ }
+ // --8<-- [end:manual_query_search]
+
+ Ok(())
+}
+// --8<-- [end:manual_query_embeddings]
+
+// --8<-- [start:embedding_function]
+use std::{borrow::Cow, sync::Arc};
+
+use arrow_array::{Array, FixedSizeListArray, Float32Array};
+use arrow_schema::{DataType, Field, Schema};
+use lancedb::{
+ connect,
+ embeddings::{EmbeddingDefinition, EmbeddingFunction},
+ Result,
+};
+
+#[derive(Debug, Clone)]
+struct MyTextEmbedder {
+ dim: usize,
+}
+
+impl EmbeddingFunction for MyTextEmbedder {
+ fn name(&self) -> &str {
+ "my-embedder"
+ }
+
+    fn source_type(&self) -> Result<Cow<'_, DataType>> {
+ Ok(Cow::Owned(DataType::Utf8))
+ }
+
+    fn dest_type(&self) -> Result<Cow<'_, DataType>> {
+ Ok(Cow::Owned(DataType::new_fixed_size_list(
+ DataType::Float32,
+ self.dim as i32,
+ true,
+ )))
+ }
+
+    fn compute_source_embeddings(&self, source: Arc<dyn Array>) -> Result<Arc<dyn Array>> {
+ let values = Arc::new(Float32Array::from(vec![1.0f32; source.len() * self.dim]));
+ let field = Arc::new(Field::new("item", DataType::Float32, true));
+ Ok(Arc::new(FixedSizeListArray::new(
+ field,
+ self.dim as i32,
+ values,
+ None,
+ )))
+ }
+
+    fn compute_query_embeddings(&self, _input: Arc<dyn Array>) -> Result<Arc<dyn Array>> {
+ unimplemented!()
+ }
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+ let db = connect("./mydb").execute().await?;
+ db.embedding_registry()
+ .register("my-embedder", Arc::new(MyTextEmbedder { dim: 3 }))?;
+
+ let schema = Arc::new(Schema::new(vec![Field::new("text", DataType::Utf8, false)]));
+ db.create_empty_table("mytable", schema)
+ .add_embedding(EmbeddingDefinition::new(
+ "text",
+ "my-embedder",
+ Some("vector"),
+ ))?
+ .execute()
+ .await?;
+
+ Ok(())
+}
+// --8<-- [end:embedding_function]
diff --git a/tests/ts/embedding.test.ts b/tests/ts/embedding.test.ts
index 35af9e5..2926da7 100644
--- a/tests/ts/embedding.test.ts
+++ b/tests/ts/embedding.test.ts
@@ -36,6 +36,33 @@ openAiTest("openai embeddings", async () => {
});
});
+openAiTest("manual query embeddings", async () => {
+ await withTempDirectory(async (databaseDir) => {
+ // --8<-- [start:manual_query_embeddings]
+ const db = await lancedb.connect(databaseDir);
+ const func = getRegistry()
+ .get("openai")
+ ?.create({ model: "text-embedding-ada-002" }) as EmbeddingFunction;
+
+ const wordsSchema = LanceSchema({
+ text: func.sourceField(new Utf8()),
+ vector: func.vectorField(),
+ });
+ const tbl = await db.createEmptyTable("words", wordsSchema, {
+ mode: "overwrite",
+ });
+ await tbl.add([{ text: "hello world" }, { text: "goodbye world" }]);
+
+ const queryVector = await func.computeQueryEmbeddings("greetings");
+ // --8<-- [start:manual_query_search]
+ // queryVector is assumed to already be generated by your embedding function
+ const actual = (await tbl.search(queryVector).limit(1).toArray())[0];
+ // --8<-- [end:manual_query_search]
+ // --8<-- [end:manual_query_embeddings]
+ expect(actual).toHaveProperty("text");
+ });
+});
+
test("custom embedding function", async () => {
await withTempDirectory(async (databaseDir) => {
// --8<-- [start:embedding_function]
diff --git a/tests/ts/package-lock.json b/tests/ts/package-lock.json
index 64f338e..b586703 100644
--- a/tests/ts/package-lock.json
+++ b/tests/ts/package-lock.json
@@ -119,7 +119,6 @@
"resolved": "https://registry.npmjs.org/@babel/core/-/core-7.26.0.tgz",
"integrity": "sha512-i1SLeK+DzNnQ3LL/CswPCa/E5u4lh1k6IAEphON8F+cXt0t9euTshDru0q7/IqMa1PMPz5RnHuHscF8/ZJsStg==",
"dev": true,
- "peer": true,
"dependencies": {
"@ampproject/remapping": "^2.2.0",
"@babel/code-frame": "^7.26.0",
@@ -1863,6 +1862,7 @@
"version": "0.5.17",
"resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.17.tgz",
"integrity": "sha512-5IKx/Y13RsYd+sauPb2x+U/xZikHjolzfuDgTAl/Tdf3Q8rslRvC19NKDLgAJQ6wsqADk10ntlv08nPFw/gO/A==",
+ "peer": true,
"dependencies": {
"tslib": "^2.8.0"
}
@@ -1911,12 +1911,14 @@
"node_modules/@types/command-line-args": {
"version": "5.2.3",
"resolved": "https://registry.npmjs.org/@types/command-line-args/-/command-line-args-5.2.3.tgz",
- "integrity": "sha512-uv0aG6R0Y8WHZLTamZwtfsDLVRnOa+n+n5rEvFWL5Na5gZ8V2Teab/duDPFzIIIhs9qizDpcavCusCLJZu62Kw=="
+ "integrity": "sha512-uv0aG6R0Y8WHZLTamZwtfsDLVRnOa+n+n5rEvFWL5Na5gZ8V2Teab/duDPFzIIIhs9qizDpcavCusCLJZu62Kw==",
+ "peer": true
},
"node_modules/@types/command-line-usage": {
"version": "5.0.4",
"resolved": "https://registry.npmjs.org/@types/command-line-usage/-/command-line-usage-5.0.4.tgz",
- "integrity": "sha512-BwR5KP3Es/CSht0xqBcUXS3qCAUVXwpRKsV2+arxeb65atasuXG9LykC9Ab10Cw3s2raH92ZqOeILaQbsB2ACg=="
+ "integrity": "sha512-BwR5KP3Es/CSht0xqBcUXS3qCAUVXwpRKsV2+arxeb65atasuXG9LykC9Ab10Cw3s2raH92ZqOeILaQbsB2ACg==",
+ "peer": true
},
"node_modules/@types/graceful-fs": {
"version": "4.1.9",
@@ -2065,6 +2067,7 @@
"version": "18.1.0",
"resolved": "https://registry.npmjs.org/apache-arrow/-/apache-arrow-18.1.0.tgz",
"integrity": "sha512-v/ShMp57iBnBp4lDgV8Jx3d3Q5/Hac25FWmQ98eMahUiHPXcvwIMKJD0hBIgclm/FCG+LwPkAKtkRO1O/W0YGg==",
+ "peer": true,
"dependencies": {
"@swc/helpers": "^0.5.11",
"@types/command-line-args": "^5.2.3",
@@ -2083,7 +2086,8 @@
"node_modules/apache-arrow/node_modules/flatbuffers": {
"version": "24.12.23",
"resolved": "https://registry.npmjs.org/flatbuffers/-/flatbuffers-24.12.23.tgz",
- "integrity": "sha512-dLVCAISd5mhls514keQzmEG6QHmUUsNuWsb4tFafIUwvvgDjXhtfAYSKOzt5SWOy+qByV5pbsDZ+Vb7HUOBEdA=="
+ "integrity": "sha512-dLVCAISd5mhls514keQzmEG6QHmUUsNuWsb4tFafIUwvvgDjXhtfAYSKOzt5SWOy+qByV5pbsDZ+Vb7HUOBEdA==",
+ "peer": true
},
"node_modules/argparse": {
"version": "1.0.10",
@@ -2098,6 +2102,7 @@
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/array-back/-/array-back-3.1.0.tgz",
"integrity": "sha512-TkuxA4UCOvxuDK6NZYXCalszEzj+TLszyASooky+i742l9TqsOdYCMJJupxRic61hwquNtppB3hgcuq9SVSH1Q==",
+ "peer": true,
"engines": {
"node": ">=6"
}
@@ -2279,7 +2284,6 @@
"url": "https://github.com/sponsors/ai"
}
],
- "peer": true,
"dependencies": {
"caniuse-lite": "^1.0.30001669",
"electron-to-chromium": "^1.5.41",
@@ -2390,6 +2394,7 @@
"version": "0.4.0",
"resolved": "https://registry.npmjs.org/chalk-template/-/chalk-template-0.4.0.tgz",
"integrity": "sha512-/ghrgmhfY8RaSdeo43hNXxpoHAtxdbskUHjPpfqUWGttFgycUhYPGx3YZBCnUCvOa7Doivn1IZec3DEGFoMgLg==",
+ "peer": true,
"dependencies": {
"chalk": "^4.1.2"
},
@@ -2512,6 +2517,7 @@
"version": "5.2.1",
"resolved": "https://registry.npmjs.org/command-line-args/-/command-line-args-5.2.1.tgz",
"integrity": "sha512-H4UfQhZyakIjC74I9d34fGYDwk3XpSr17QhEd0Q3I9Xq1CETHo4Hcuo87WyWHpAF1aSLjLRf5lD9ZGX2qStUvg==",
+ "peer": true,
"dependencies": {
"array-back": "^3.1.0",
"find-replace": "^3.0.0",
@@ -2526,6 +2532,7 @@
"version": "7.0.3",
"resolved": "https://registry.npmjs.org/command-line-usage/-/command-line-usage-7.0.3.tgz",
"integrity": "sha512-PqMLy5+YGwhMh1wS04mVG44oqDsgyLRSKJBdOo1bnYhMKBW65gZF1dRp2OZRhiTjgUHljy99qkO7bsctLaw35Q==",
+ "peer": true,
"dependencies": {
"array-back": "^6.2.2",
"chalk-template": "^0.4.0",
@@ -2540,6 +2547,7 @@
"version": "6.2.2",
"resolved": "https://registry.npmjs.org/array-back/-/array-back-6.2.2.tgz",
"integrity": "sha512-gUAZ7HPyb4SJczXAMUXMGAvI976JoK3qEx9v1FTmeYuJj0IBiaKttG1ydtGKdkfqWkIkouke7nG8ufGy77+Cvw==",
+ "peer": true,
"engines": {
"node": ">=12.17"
}
@@ -2548,6 +2556,7 @@
"version": "7.3.0",
"resolved": "https://registry.npmjs.org/typical/-/typical-7.3.0.tgz",
"integrity": "sha512-ya4mg/30vm+DOWfBg4YK3j2WD6TWtRkCbasOJr40CseYENzCUby/7rIvXA99JGsQHeNxLbnXdyLLxKSv3tauFw==",
+ "peer": true,
"engines": {
"node": ">=12.17"
}
@@ -2939,6 +2948,7 @@
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/find-replace/-/find-replace-3.0.0.tgz",
"integrity": "sha512-6Tb2myMioCAgv5kfvP5/PkZZ/ntTpVK39fHY7WkWBgvbeE+VHd/tZuZ4mrC+bxh4cfOZeYKVPaJIZtZXV7GNCQ==",
+ "peer": true,
"dependencies": {
"array-back": "^3.0.1"
},
@@ -3460,7 +3470,6 @@
"resolved": "https://registry.npmjs.org/jest/-/jest-29.7.0.tgz",
"integrity": "sha512-NIy3oAFp9shda19hy4HK0HRTWKtPJmGdnvywu01nOqNC2vZg+Z+fvJDxpMQA88eb2I9EcafcdjYgsDthnYTvGw==",
"dev": true,
- "peer": true,
"dependencies": {
"@jest/core": "^29.7.0",
"@jest/types": "^29.6.3",
@@ -4071,6 +4080,7 @@
"version": "0.0.3",
"resolved": "https://registry.npmjs.org/json-bignum/-/json-bignum-0.0.3.tgz",
"integrity": "sha512-2WHyXj3OfHSgNyuzDbSxI1w2jgw5gkWSWhS7Qg4bWXx1nLk3jnbwfUeS0PSba3IzpTUWdHxBieELUzXRjQB2zg==",
+ "peer": true,
"engines": {
"node": ">=0.8"
}
@@ -4132,7 +4142,8 @@
"node_modules/lodash.camelcase": {
"version": "4.3.0",
"resolved": "https://registry.npmjs.org/lodash.camelcase/-/lodash.camelcase-4.3.0.tgz",
- "integrity": "sha512-TwuEnCnxbc3rAvhf/LbG7tJUDzhqXyFnv3dtzLOPgCG/hODL7WFnsbwktkD7yUV0RrreP/l1PALq/YSg6VvjlA=="
+ "integrity": "sha512-TwuEnCnxbc3rAvhf/LbG7tJUDzhqXyFnv3dtzLOPgCG/hODL7WFnsbwktkD7yUV0RrreP/l1PALq/YSg6VvjlA==",
+ "peer": true
},
"node_modules/lodash.memoize": {
"version": "4.1.2",
@@ -5010,6 +5021,7 @@
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/table-layout/-/table-layout-4.1.1.tgz",
"integrity": "sha512-iK5/YhZxq5GO5z8wb0bY1317uDF3Zjpha0QFFLA8/trAoiLbQD0HUbMesEaxyzUgDxi2QlcbM8IvqOlEjgoXBA==",
+ "peer": true,
"dependencies": {
"array-back": "^6.2.2",
"wordwrapjs": "^5.1.0"
@@ -5022,6 +5034,7 @@
"version": "6.2.2",
"resolved": "https://registry.npmjs.org/array-back/-/array-back-6.2.2.tgz",
"integrity": "sha512-gUAZ7HPyb4SJczXAMUXMGAvI976JoK3qEx9v1FTmeYuJj0IBiaKttG1ydtGKdkfqWkIkouke7nG8ufGy77+Cvw==",
+ "peer": true,
"engines": {
"node": ">=12.17"
}
@@ -5175,7 +5188,6 @@
"integrity": "sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==",
"dev": true,
"license": "Apache-2.0",
- "peer": true,
"bin": {
"tsc": "bin/tsc",
"tsserver": "bin/tsserver"
@@ -5188,6 +5200,7 @@
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/typical/-/typical-4.0.0.tgz",
"integrity": "sha512-VAH4IvQ7BDFYglMd7BPRDfLgxZZX4O4TFcRDA6EN5X7erNJJq+McIEp8np9aVtxrCJ6qx4GTYVfOWNjcqwZgRw==",
+ "peer": true,
"engines": {
"node": ">=8"
}
@@ -5290,6 +5303,7 @@
"version": "5.1.0",
"resolved": "https://registry.npmjs.org/wordwrapjs/-/wordwrapjs-5.1.0.tgz",
"integrity": "sha512-JNjcULU2e4KJwUNv6CHgI46UvDGitb6dGryHajXTDiLgg1/RiGoPSDw4kZfYnwGtEXf2ZMeIewDQgFGzkCB2Sg==",
+ "peer": true,
"engines": {
"node": ">=12.17"
}