From a42d5bad8c46026a9c7b0ebb0f6ffc6a5616c534 Mon Sep 17 00:00:00 2001 From: prrao87 <35005448+prrao87@users.noreply.github.com> Date: Thu, 19 Feb 2026 21:59:22 -0500 Subject: [PATCH] Revamp table update docs and regenerate snippets --- docs/embedding/index.mdx | 4 +- docs/snippets/tables.mdx | 26 ++- docs/tables/update.mdx | 354 +++++++++++++++++----------------- tests/py/test_tables.py | 398 +++++++++++++++++++++++++++++---------- 4 files changed, 501 insertions(+), 281 deletions(-) diff --git a/docs/embedding/index.mdx b/docs/embedding/index.mdx index d316497..ac89562 100644 --- a/docs/embedding/index.mdx +++ b/docs/embedding/index.mdx @@ -29,9 +29,7 @@ embedding generation, allowing you to focus on your application logic. ## Embedding Registry -OSS - -In LanceDB OSS, you can get a supported embedding function from the registry, and then use it in your table schema. +You can get a supported embedding function from the registry, and then use it in your table schema. Once configured, the embedding function will automatically generate embeddings when you insert data into the table. Query-time behavior depends on SDK: Python/TypeScript can query with text directly, while Rust examples typically compute query embeddings explicitly before vector search. 
diff --git a/docs/snippets/tables.mdx b/docs/snippets/tables.mdx index 05e5c36..c79f6e0 100644 --- a/docs/snippets/tables.mdx +++ b/docs/snippets/tables.mdx @@ -50,7 +50,7 @@ export const PyCreateTableFromPydantic = "from lancedb.pydantic import LanceMode export const PyCreateTableNestedSchema = "from lancedb.pydantic import LanceModel, Vector\n\n# --8<-- [start:tables_document_model]\nfrom pydantic import BaseModel\n\nclass Document(BaseModel):\n content: str\n source: str\n\n# --8<-- [end:tables_document_model]\n\nclass NestedSchema(LanceModel):\n id: str\n vector: Vector(1536)\n document: Document\n\ndb = tmp_db\ntbl = db.create_table(\"nested_table\", schema=NestedSchema, mode=\"overwrite\")\n"; -export const PyDeleteOperation = "# delete data\npredicate = \"price = 30.0\"\ntable.delete(predicate)\n"; +export const PyDeleteOperation = "# delete data\npredicate = \"id = 3\"\ntable.delete(predicate)\n"; export const PyDropColumnsMultiple = "# Remove the second temporary column\ntable.drop_columns([\"temp_col2\"])\n"; @@ -58,11 +58,17 @@ export const PyDropColumnsSingle = "# Remove the first temporary column\ntable.d export const PyDropTable = "db = tmp_db\n# Create a table first\ndata = [{\"vector\": [1.1, 1.2], \"lat\": 45.5}]\ndb.create_table(\"my_table\", data, mode=\"overwrite\")\n\n# Drop the table\ndb.drop_table(\"my_table\")\n"; -export const PyInsertIfNotExists = "# Create example table\ntable = db.create_table(\n \"domains\",\n [\n {\"domain\": \"google.com\", \"name\": \"Google\"},\n {\"domain\": \"github.com\", \"name\": \"GitHub\"},\n ],\n mode=\"overwrite\",\n)\n\n# Prepare new data - one existing and one new record\nnew_domains = [\n {\"domain\": \"google.com\", \"name\": \"Google\"},\n {\"domain\": \"facebook.com\", \"name\": \"Facebook\"},\n]\n\n# Insert only if domain doesn't exist\ntable.merge_insert(\"domain\").when_not_matched_insert_all().execute(new_domains)\n\n# Verify count - should be 3 (original 2 plus 1 new)\nprint(f\"Total domains: 
{table.count_rows()}\") # 3\n"; +export const PyInsertIfNotExists = "import pyarrow as pa\n\ntable = db.create_table(\n \"users_example\",\n data=pa.table(\n {\n \"id\": [1, 2],\n \"name\": [\"Alice\", \"Bob\"],\n \"login_count\": [10, 20],\n }\n ),\n mode=\"overwrite\",\n)\n\nincoming_users = pa.table(\n {\n \"id\": [2, 3],\n \"name\": [\"Bobby\", \"Charlie\"],\n \"login_count\": [21, 5],\n }\n)\n\n(\n table.merge_insert(\"id\")\n .when_not_matched_insert_all()\n .execute(incoming_users)\n)\n"; -export const PyOpenExistingTable = "db = tmp_db\n# Create a table first\ndata = [{\"vector\": [1.1, 1.2], \"lat\": 45.5, \"long\": -122.7}]\ndb.create_table(\"test_table\", data, mode=\"overwrite\")\n\n# List table names\nprint(db.table_names())\n\n# Open existing table\ntbl = db.open_table(\"test_table\")\n"; +export const PyMergeDeleteMissingBySource = "import pyarrow as pa\n\ntable = db.create_table(\n \"users_example\",\n data=pa.table(\n {\n \"id\": [1, 2, 3],\n \"name\": [\"Alice\", \"Bob\", \"Charlie\"],\n \"login_count\": [10, 20, 5],\n }\n ),\n mode=\"overwrite\",\n)\n\nincoming_users = pa.table(\n {\n \"id\": [2, 3],\n \"name\": [\"Bobby\", \"Charlie\"],\n \"login_count\": [21, 5],\n }\n)\n\n(\n table.merge_insert(\"id\")\n .when_matched_update_all()\n .when_not_matched_insert_all()\n .when_not_matched_by_source_delete()\n .execute(incoming_users)\n)\n"; + +export const PyMergeMatchedUpdateOnly = "import pyarrow as pa\n\ntable = db.create_table(\n \"users_example\",\n data=pa.table(\n {\n \"id\": [1, 2],\n \"name\": [\"Alice\", \"Bob\"],\n \"login_count\": [10, 20],\n }\n ),\n mode=\"overwrite\",\n)\n\nincoming_users = pa.table(\n {\n \"id\": [2, 3],\n \"name\": [\"Bobby\", \"Charlie\"],\n \"login_count\": [21, 5],\n }\n)\n\n(\n table.merge_insert(\"id\")\n .when_matched_update_all()\n .execute(incoming_users)\n)\n"; + +export const PyMergePartialColumns = "import pyarrow as pa\n\ntable = db.create_table(\n \"users_example\",\n data=pa.table(\n {\n \"id\": [1, 
2],\n \"name\": [\"Alice\", \"Bob\"],\n \"login_count\": [10, 20],\n }\n ),\n mode=\"overwrite\",\n)\n\nincoming_users = pa.table(\n {\n \"id\": [2, 3],\n \"name\": [\"Bobby\", \"Charlie\"],\n }\n)\n\n(\n table.merge_insert(\"id\")\n .when_matched_update_all()\n .when_not_matched_insert_all()\n .execute(incoming_users)\n)\n"; + +export const PyMergeUpdateInsert = "import pyarrow as pa\n\ntable = db.create_table(\n \"users_example\",\n data=pa.table(\n {\n \"id\": [1, 2],\n \"name\": [\"Alice\", \"Bob\"],\n \"login_count\": [10, 20],\n }\n ),\n mode=\"overwrite\",\n)\n\nincoming_users = pa.table(\n {\n \"id\": [2, 3],\n \"name\": [\"Bobby\", \"Charlie\"],\n \"login_count\": [21, 5],\n }\n)\n\n(\n table.merge_insert(\"id\")\n .when_matched_update_all()\n .when_not_matched_insert_all()\n .execute(incoming_users)\n)\n"; -export const PyReplaceRangeOperation = "# Create example table with document chunks\ntable = db.create_table(\n \"chunks\",\n [\n {\"doc_id\": 0, \"chunk_id\": 0, \"text\": \"Hello\"},\n {\"doc_id\": 0, \"chunk_id\": 1, \"text\": \"World\"},\n {\"doc_id\": 1, \"chunk_id\": 0, \"text\": \"Foo\"},\n {\"doc_id\": 1, \"chunk_id\": 1, \"text\": \"Bar\"},\n {\"doc_id\": 2, \"chunk_id\": 0, \"text\": \"Baz\"},\n ],\n mode=\"overwrite\",\n)\n\n# New data - replacing all chunks for doc_id 1 with just one chunk\nnew_chunks = [\n {\"doc_id\": 1, \"chunk_id\": 0, \"text\": \"Zoo\"},\n]\n\n# Replace all chunks for doc_id 1\n(\n table.merge_insert([\"doc_id\"])\n .when_matched_update_all()\n .when_not_matched_insert_all()\n .when_not_matched_by_source_delete(\"doc_id = 1\")\n .execute(new_chunks)\n)\n\n# Verify count for doc_id = 1 - should be 1\nprint(f\"Chunks for doc_id = 1: {table.count_rows('doc_id = 1')}\") # 1\n"; +export const PyOpenExistingTable = "db = tmp_db\n# Create a table first\ndata = [{\"vector\": [1.1, 1.2], \"lat\": 45.5, \"long\": -122.7}]\ndb.create_table(\"test_table\", data, mode=\"overwrite\")\n\n# List table 
names\nprint(db.table_names())\n\n# Open existing table\ntbl = db.open_table(\"test_table\")\n"; export const PySchemaAddSetup = "table_name = \"schema_evolution_add_example\"\nif data is None:\n data = [\n {\n \"id\": 1,\n \"name\": \"Laptop\",\n \"price\": 1200.00,\n \"vector\": np.random.random(128).tolist(),\n },\n {\n \"id\": 2,\n \"name\": \"Smartphone\",\n \"price\": 800.00,\n \"vector\": np.random.random(128).tolist(),\n },\n {\n \"id\": 3,\n \"name\": \"Headphones\",\n \"price\": 150.00,\n \"vector\": np.random.random(128).tolist(),\n },\n ]\ntable = tmp_db.create_table(table_name, data, mode=\"overwrite\")\n"; @@ -76,11 +82,17 @@ export const PyTablesImports = "import lancedb\nimport numpy as np\nimport panda export const PyTablesTzValidator = "from datetime import datetime\nfrom zoneinfo import ZoneInfo\n\nfrom lancedb.pydantic import LanceModel\nfrom pydantic import Field, ValidationError, ValidationInfo, field_validator\n\ntzname = \"America/New_York\"\ntz = ZoneInfo(tzname)\n\nclass TestModel(LanceModel):\n dt_with_tz: datetime = Field(json_schema_extra={\"tz\": tzname})\n\n @field_validator(\"dt_with_tz\")\n @classmethod\n def tz_must_match(cls, dt: datetime) -> datetime:\n assert dt.tzinfo == tz\n return dt\n\nok = TestModel(dt_with_tz=datetime.now(tz))\n\ntry:\n TestModel(dt_with_tz=datetime.now(ZoneInfo(\"Asia/Shanghai\")))\n assert 0 == 1, \"this should raise ValidationError\"\nexcept ValidationError:\n print(\"A ValidationError was raised.\")\n pass\n"; -export const PyUpdateOperation = "import pandas as pd\n\n# Create a table from a pandas DataFrame\ndata = pd.DataFrame({\"x\": [1, 2, 3], \"vector\": [[1, 2], [3, 4], [5, 6]]})\ntbl = db.create_table(\"test_table\", data, mode=\"overwrite\")\n# Update the table where x = 2\ntbl.update(where=\"x = 2\", values={\"vector\": [10, 10]})\n# Get the updated table as a pandas DataFrame\ndf = tbl.to_pandas()\nprint(df)\n"; +export const PyUpdateConnectCloud = "import lancedb\n\ndb = lancedb.connect(\n 
uri=\"db://your-project-slug\",\n api_key=\"your-api-key\",\n region=\"us-east-1\",\n)\n"; + +export const PyUpdateConnectLocal = "import lancedb\n\ndb = lancedb.connect(\"./data\")\n"; + +export const PyUpdateExampleTableSetup = "import pyarrow as pa\n\ntable = db.create_table(\n \"users_example\",\n data=pa.table(\n {\n \"id\": [1, 2],\n \"name\": [\"Alice\", \"Bob\"],\n \"login_count\": [10, 20],\n }\n ),\n mode=\"overwrite\",\n)\n"; + +export const PyUpdateOperation = "import pyarrow as pa\n\ntable = db.create_table(\n \"users_example\",\n data=pa.table(\n {\n \"id\": [1, 2],\n \"name\": [\"Alice\", \"Bob\"],\n \"login_count\": [10, 20],\n }\n ),\n mode=\"overwrite\",\n)\ntable.update(where=\"id = 2\", values={\"name\": \"Bobby\"})\n"; -export const PyUpdateUsingSql = "import pandas as pd\n\n# Create a table from a pandas DataFrame\ndata = pd.DataFrame({\"x\": [1, 2, 3], \"vector\": [[1, 2], [3, 4], [5, 6]]})\ntbl = db.create_table(\"test_table\", data, mode=\"overwrite\")\n# Update all rows: increment x by 1\ntbl.update(values_sql={\"x\": \"x + 1\"})\nprint(tbl.to_pandas())\n"; +export const PyUpdateOptimizeCleanup = "from datetime import timedelta\n\ntable.optimize(cleanup_older_than=timedelta(days=1))\n"; -export const PyUpsertOperation = "# Create example table\nusers_table_name = \"users_example\"\ntable = db.create_table(\n users_table_name,\n [\n {\"id\": 0, \"name\": \"Alice\"},\n {\"id\": 1, \"name\": \"Bob\"},\n ],\n mode=\"overwrite\",\n)\nprint(f\"Created users table with {table.count_rows()} rows\")\n\n# Prepare data for upsert\nnew_users = [\n {\"id\": 1, \"name\": \"Bobby\"}, # Will update existing record\n {\"id\": 2, \"name\": \"Charlie\"}, # Will insert new record\n]\n\n# Upsert by id\n(\n table.merge_insert(\"id\")\n .when_matched_update_all()\n .when_not_matched_insert_all()\n .execute(new_users)\n)\n\n# Verify results - should be 3 records total\nprint(f\"Total users: {table.count_rows()}\") # 3\n"; +export const PyUpdateUsingSql = "import 
pyarrow as pa\n\ntable = db.create_table(\n \"users_example\",\n data=pa.table(\n {\n \"id\": [1, 2],\n \"name\": [\"Alice\", \"Bob\"],\n \"login_count\": [10, 20],\n }\n ),\n mode=\"overwrite\",\n)\ntable.update(where=\"id = 2\", values_sql={\"login_count\": \"login_count + 1\"})\n"; export const PyVersioningAddData = "# Add more data\nmore_data = [\n {\n \"id\": 4,\n \"author\": \"Richard Daniel Sanchez\",\n \"quote\": \"That's the way the news goes!\",\n },\n {\"id\": 5, \"author\": \"Morty\", \"quote\": \"Aww geez, Rick!\"},\n]\ntable.add(more_data)\n"; diff --git a/docs/tables/update.mdx b/docs/tables/update.mdx index c9862ee..3addb71 100644 --- a/docs/tables/update.mdx +++ b/docs/tables/update.mdx @@ -1,294 +1,296 @@ --- title: "Updating and Modifying Table Data" sidebarTitle: "Update/modify data" -description: "Learn how to update and modify data in LanceDB. Includes incremental updates, batch modifications, and best practices for data maintenance." +description: "Learn how to update, merge, and delete rows in a LanceDB table." 
icon: "clone" --- import { - PyAddDataToTable as AddDataToTable, - PyAddDataPydanticModel as AddDataPydanticModel, - PyAddDataNestedModel as AddDataNestedModel, - PyBatchDataInsertion as BatchDataInsertion, + PyUpdateConnectCloud as UpdateConnectCloud, + PyUpdateConnectLocal as UpdateConnectLocal, + PyUpdateExampleTableSetup as UpdateExampleTableSetup, PyUpdateOperation as UpdateOperation, PyUpdateUsingSql as UpdateUsingSql, - PyDeleteOperation as DeleteOperation, - PyUpsertOperation as UpsertOperation, + PyMergeMatchedUpdateOnly as MergeMatchedUpdateOnly, PyInsertIfNotExists as InsertIfNotExists, - PyReplaceRangeOperation as ReplaceRangeOperation, + PyMergeUpdateInsert as MergeUpdateInsert, + PyMergeDeleteMissingBySource as MergeDeleteMissingBySource, + PyMergePartialColumns as MergePartialColumns, + PyDeleteOperation as DeleteOperation, + PyUpdateOptimizeCleanup as UpdateOptimizeCleanup, } from '/snippets/tables.mdx'; -Once you have created a table, there are several ways to modify its data. You can: - -- Ingest and add new records to your table; -- Update existing records that match specific conditions; -- Use the powerful Merge Insert function for more complex operations like upserting or replacing ranges of data. - -These operations allow you to keep your table data current and maintain it exactly as needed for your use case. Let's look at each of these operations in detail. - - -These examples demonstrate common usage patterns. For complete API details and advanced options, refer to our SDK [documentation page](/api-reference/) and navigate to your client language of choice. - - -## Connecting to LanceDB +Updating or modifying data involves changing rows in an existing table. +LanceDB provides two families of write operations that can modify data in a table: -Before performing any operations, you'll need to connect to LanceDB. The connection method depends on whether you're using LanceDB Cloud or the open source version. 
+- `update(...)`: mutate existing rows that match a SQL filter. +- `merge_insert(...)`: compare incoming rows to existing rows by key, then choose what to do for each case. -```python -import lancedb +The `update` method is simpler to use when you already know which rows you want to modify and you do not need to compare against an incoming dataset. The `merge_insert` method is more powerful when you have a new dataset that you want to merge into an existing table, and you want LanceDB to handle the logic of comparing against existing rows by key. -# Connect to LanceDB Cloud -db = lancedb.connect( - uri="db://your-project-slug", - api_key="your-api-key", - region="us-east-1" -) -``` +Let's look at an example that demonstrates these operations in practice. -You can also connect locally using LanceDB OSS: +## Connect to LanceDB -```python -import lancedb +Connect to your local LanceDB instance: -# Connect to local LanceDB -db = lancedb.connect("./data") # Local directory for data storage -``` + + + {UpdateConnectLocal} + + -## Data Insertion +Expected output: -### Adding data to a table +| Variable | Value | +| --- | --- | +| `db` | A connected LanceDB database handle pointing to `./data` | -Say you created a LanceDB table by passing in a `schema`. -This is an _empty_ table, with no data in it. To add or append data to a table, you can use the `table.add(data)`, -as shown below. +Or, connect to your LanceDB remote cluster: - {AddDataToTable} + {UpdateConnectCloud} - -The vector column needs to be a `pyarrow.FixedSizeList` type. - +Expected output: -### Using Pydantic Models +| Variable | Value | +| --- | --- | +| `db` | A connected LanceDB database handle for your remote project | -Pydantic models provide a more structured way to define your table schema: +## Create the example table +We'll start by creating a simple table with `id`, `name`, and `login_count` columns. All examples below use the same table. 
- {AddDataPydanticModel} + {UpdateExampleTableSetup} -### Using Nested Models +Expected table contents: -You can use nested Pydantic models to represent complex data structures. -For example, you may want to store the document string and the document source name as a nested Document object: +| id | name | login_count | +| --- | --- | --- | +| 1 | Alice | 10 | +| 2 | Bob | 20 | -```python -from pydantic import BaseModel +The example above shows a PyArrow schema. You can just as well create the table using other +table creation patterns (Pandas, Polars, Pydantic, iterators, etc.) -- see the [ingestion](/tables/create/) guide for more details. -class Document(BaseModel): - content: str - source: str -``` +## Choose a write method -This can be used as the type of a LanceDB table column: +| Family | Method | Use this when... | +| --------------- | --------------- | ---------------- | +| `update` | `update(where=..., values=...)` | You want to edit rows that already exist, using a SQL filter. | +| `merge_insert` | `.when_matched_update_all()` | You have incoming rows and want to update keys that already exist in the table. | +| `merge_insert` | `.when_not_matched_insert_all()` | You have incoming rows and want to insert keys that do not exist yet. | +| `merge_insert` | `.when_matched_update_all()` + `.when_not_matched_insert_all()` | You want both behaviors together (often called **upsert**: update existing keys **and** insert missing keys in the same operation). | +| `merge_insert` | `.when_not_matched_by_source_delete(...)` | You want to remove target rows that are missing from the incoming source set. | + +## Update rows + +Use `update` when you already know which target rows to modify and you do not need to compare against an incoming dataset. 
- {AddDataNestedModel} + {UpdateOperation} -This creates a struct column called `document` that has two subfields called `content` and `source`: +Expected table contents: -```bash -In [28]: table.schema -Out[28]: -id: string not null -vector: fixed_size_list[128] not null - child 0, item: float -document: struct not null - child 0, content: string not null - child 1, source: string not null -``` +| id | name | login_count | +| --- | --- | --- | +| 1 | Alice | 10 | +| 2 | Bobby | 20 | -### Batch Data Insertion + +Updating nested columns is not yet supported. + -It is recommended to use iterators to add large datasets in batches when creating -your table in one go. Data will be automatically compacted for the best query performance. +## Update rows with SQL expressions -#### Python Batch Insertion +Use `values_sql` when you want to use SQL-like expressions to update rows. This is useful for operations like incrementing a counter, or setting a column value based on another column. - {BatchDataInsertion} + {UpdateUsingSql} - -LanceDB Cloud is a multi-tenant environment with a 100MB payload limit. Adjust your batch size accordingly. - +Expected table contents: -## Data Modification +| id | name | login_count | +| --- | --- | --- | +| 1 | Alice | 10 | +| 2 | Bob | 21 | -### Update Operations + +See the [SQL queries](/search/sql/) page for more information on the supported SQL syntax. + -This can be used to update zero to all rows depending on how many rows match the where clause. The update queries follow the form of a SQL UPDATE statement. The `where` parameter is a SQL filter that matches on the metadata columns. The `values` or `values_sql` parameters are used to provide the new values for the columns. +When rows are updated, they are moved out of any existing index. The row will still show up in search queries, but the query will not be as fast as it would be if the row was in the index. 
If you update a large proportion of rows, consider triggering an index rebuild afterwards. - -Updating nested columns is not yet supported. - +## Merge incoming rows by key -| Parameter | Type | Description | -| ------------ | ------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `where` | `str` | The SQL where clause to use when updating rows. For example, `'x = 2'` or `'x IN (1, 2, 3)'`. The filter must not be empty, or it will error. | -| `values` | `dict` | The values to update. The keys are the column names and the values are the values to set. | -| `values_sql` | `dict` | The values to update. The keys are the column names and the values are the SQL expressions to set. For example, `{'x': 'x + 1'}` will increment the value of the `x` column by 1. | +Merging is different from updating because it involves comparing incoming rows to existing rows by key, and then choosing what to do based on whether the key exists in the target table or not. +The `merge_insert("...")` method lets you do this. - -See the [SQL queries](/search/sql/) page for more information on the supported SQL syntax. - +In merge operations, rows are split into three groups: - - - {UpdateOperation} - - +- **Matched**: key exists in both source and target. +- **Not matched**: key exists only in source. +- **Not matched by source**: key exists only in target. -Output: + +**Use scalar indexes to speed up merge insert** -```json - x vector -0 1 [1.0, 2.0] -1 3 [5.0, 6.0] -2 2 [10.0, 10.0] -``` +The merge insert command performs a join between the input data and the target table `on` the key you provide. This requires scanning that entire column, which can be expensive for large tables. To speed up this operation, create a scalar index on the join column, which will allow LanceDB to find matches without scanning the whole table.
-### Updating Using SQL +Read more about scalar indices in the [Scalar Index](/indexing/scalar-index/) guide. + + + +If you see this HTTP 400 error from `merge_insert`: `Bad request: Merge insert cannot be performed because the number of unindexed rows exceeds the maximum of 10000`. Verify that the scalar index on the join column is up to date before retrying. + + +Like the create table and add APIs, the merge insert API will automatically compute embeddings based on the [embedding registry](/embedding/index#embedding-registry) if the table has an embedding definition in its schema. -The `values` parameter is used to provide the new values for the columns as literal values. You can also use the `values_sql` / `valuesSql` parameter to provide SQL expressions for the new values. For example, you can use `values_sql="x + 1"` to increment the value of the `x` column by 1. +During `merge_insert`, if the input data doesn't contain the source column (i.e., the original field used to generate embeddings, such as text for a text embedding model or `image_uri` for an image model), or if a vector value is already provided, LanceDB skips embedding generation for that row. Embeddings are only auto-generated when that source field is present in the incoming data, **and** the vector field is empty. + +### Update matched rows only + +This updates keys that already exist in the target table. Source rows with new keys are ignored. - {UpdateUsingSql} + {MergeMatchedUpdateOnly} -Output: +Expected table contents: -```json - x vector -0 2 [1.0, 2.0] -1 4 [5.0, 6.0] -2 3 [10.0, 10.0] -``` +| id | name | login_count | +| --- | --- | --- | +| 1 | Alice | 10 | +| 2 | Bobby | 21 | - -When rows are updated, they are moved out of the index. The row will still show up in ANN queries, but the query will not be as fast as it would be if the row was in the index. If you update a large proportion of rows, consider rebuilding the index afterwards. 
- - -### Delete Operations +### Insert unmatched rows only -Delete operations **soft delete** rows that match a given condition. -The underlying data is not immediately removed, but is marked -for deletion (in the [deletion files](https://lance.org/format/table/#deletion-files) at the Lance format level) and excluded from query results. +This inserts only brand-new keys from the source. Existing keys are left unchanged. - {DeleteOperation} + {InsertIfNotExists} +Expected table contents: - -**Deleting rows removes them from the index** - -When rows are deleted, those rows are also excluded from the index segments, so indexed queries will not return them either. If ALL the rows are deleted (i.e., the table is emptied), ensure that you recreate the index after ingesting new data. - +| id | name | login_count | +| --- | --- | --- | +| 1 | Alice | 10 | +| 2 | Bob | 20 | +| 3 | Charlie | 5 | -To permanently remove deleted rows, you can optimize the table, which will run compaction and cleans up the soft-deleted rows, which frees up storage space. +### Update matched rows and insert unmatched rows -- In LanceDB OSS, compaction and cleanup are manual. Run `table.optimize()` regularly to free up disk space. -- In LanceDB Cloud, compaction and cleanup runs automatically in the background. -- In LanceDB Enterprise, files aren't cleaned up by default. You can configure the compaction and cleanup behavior at cluster setup time to suit your organization's retention policy. +Use both `when_matched_update_all()` and `when_not_matched_insert_all()` when you want to update existing keys and insert missing keys in one operation. -By default, table cleanup removes data up to 7 days ago. If you need to reclaim space from deleted rows more aggressively, manually call `table.optimize()` use a shorter retention window as follows: + +This is a conventional **upsert**. 
+ - ```python Python icon=Python - from datetime import timedelta - - table.optimize(cleanup_older_than=timedelta(days=1)) - ``` + + {MergeUpdateInsert} + -## Merge Operations +Expected table contents: -The merge insert command is a flexible API that can be used to perform `upsert`, -`insert_if_not_exists`, and `replace_range_ operations`. +| id | name | login_count | +| --- | --- | --- | +| 1 | Alice | 10 | +| 2 | Bobby | 21 | +| 3 | Charlie | 5 | - -The merge insert command performs a join between the input data and the target table `on` the key you provide. This requires scanning that entire column, which can be expensive for large tables. To speed up this operation, create a scalar index on the join column, which will allow LanceDB to find matches without scanning the whole table. +### Delete target rows that are missing from source -Read more about scalar indices in the [Scalar Index](/indexing/scalar-index/) guide. - +Use `when_not_matched_by_source_delete()` when you want to remove any target row that does not appear in the incoming source data. - -You may receive an HTTP 400 error from merge insert: `Bad request: Merge insert cannot be performed because the number of unindexed rows exceeds the maximum of 10000`. Verify that the scalar index on the join column is up to date before retrying. - + + + {MergeDeleteMissingBySource} + + - -Like the create table and add APIs, the merge insert API will automatically compute embeddings if the table has an embedding definition in its schema. If the input data doesn't contain the source column, or the vector column is already filled, the embeddings won't be computed. - +Expected table contents: -### Upsert +| id | name | login_count | +| --- | --- | --- | +| 2 | Bobby | 21 | +| 3 | Charlie | 5 | -`upsert` updates rows if they exist and inserts them if they don't. To do this with merge insert, -enable both `when_matched_update_all()` and `when_not_matched_insert_all()`. 
+In the example above, LanceDB matches rows by `id`. Rows with `id=2` and `id=3` exist in both the table and incoming data, so they are updated. Row `id=1` exists only in the target, so it is deleted. -#### Setting Up the Example Table and Performing Upsert +### Use partial columns in merge updates + +Merge updates do not require you to provide values for all columns. You can provide only a subset of columns in source rows. For matched rows, only the provided columns are updated. - {UpsertOperation} + {MergePartialColumns} -### Insert-if-not-exists +Expected table contents: + +| id | name | login_count | +| --- | --- | --- | +| 1 | Alice | 10 | +| 2 | Bobby | 20 | +| 3 | Charlie | null | + +Note that in the example above, when `merge_insert` creates a new row, any missing columns are written as `null`. If a missing column is non-nullable in your schema, the insert will fail. -This will only insert rows that do not have a match in the target table, thus -preventing duplicate rows. To do this with merge insert, enable just -`when_not_matched_insert_all()`. +## Delete rows -#### Setting Up the Example Table and Performing Insert-if-not-exists +Delete operations **soft delete** rows that match a given condition. +The underlying data is not immediately removed, but is marked +for deletion (in the [deletion files](https://lance.org/format/table/#deletion-files) at the Lance format level) and excluded from query results. - {InsertIfNotExists} + {DeleteOperation} -### Replace Range +Expected table contents: -You can also replace a range of rows in the target table with the input data. -For example, if you have a table of document chunks, where each chunk has both -a `doc_id` and a `chunk_id`, you can replace all chunks for a given `doc_id` with updated chunks. 
+| id | name | login_count | +| --- | --- | --- | +| 1 | Alice | 10 | +| 2 | Bob | 20 | -This can be tricky otherwise because if you try to use `upsert` when the new data has fewer -chunks you will end up with extra chunks. To avoid this, add another clause to delete any chunks -for the document that are not in the new data, with `when_not_matched_by_source_delete`. -#### Setting Up the Example Table and Performing Replace Range + +**Deleting rows removes them from the index** + +When rows are deleted, those rows are also excluded from the index segments, so indexed queries will not return them either. If ALL the rows are deleted (i.e., the table is emptied), ensure that you recreate the index after ingesting new data. + + +To permanently remove deleted rows, you can optimize the table, which will run compaction and clean up the soft-deleted rows, which frees up storage space. + +- In LanceDB OSS, compaction and cleanup are manual. Run `table.optimize()` regularly to free up disk space. +- In LanceDB Cloud, compaction and cleanup runs automatically in the background. +- In LanceDB Enterprise, files aren't cleaned up by default. You can configure the compaction and cleanup behavior at cluster setup time to suit your organization's retention policy. + +By default, table cleanup removes data up to 7 days ago. If you need to reclaim space from deleted rows more aggressively, manually call `table.optimize()` and use a shorter retention window as follows: - {ReplaceRangeOperation} + {UpdateOptimizeCleanup} - - -We suggest the best batch size to be 500k for optimal performance. 
- diff --git a/tests/py/test_tables.py b/tests/py/test_tables.py index c1faef7..aa56dfd 100644 --- a/tests/py/test_tables.py +++ b/tests/py/test_tables.py @@ -44,6 +44,60 @@ def fake_connect(uri): assert isinstance(db, DummyDB) +def test_update_connect_cloud_snippet(monkeypatch): + calls = {} + + class DummyDB: + pass + + def fake_connect(**kwargs): + calls.update(kwargs) + return DummyDB() + + import lancedb as _lancedb + + monkeypatch.setattr(_lancedb, "connect", fake_connect) + + # --8<-- [start:update_connect_cloud] + import lancedb + + db = lancedb.connect( + uri="db://your-project-slug", + api_key="your-api-key", + region="us-east-1", + ) + # --8<-- [end:update_connect_cloud] + + assert calls["uri"] == "db://your-project-slug" + assert calls["api_key"] == "your-api-key" + assert calls["region"] == "us-east-1" + assert isinstance(db, DummyDB) + + +def test_update_connect_local_snippet(monkeypatch): + calls = {} + + class DummyDB: + pass + + def fake_connect(uri): + calls["uri"] = uri + return DummyDB() + + import lancedb as _lancedb + + monkeypatch.setattr(_lancedb, "connect", fake_connect) + + # --8<-- [start:update_connect_local] + import lancedb + + db = lancedb.connect("./data") + # --8<-- [end:update_connect_local] + + assert calls["uri"] == "./data" + assert isinstance(db, DummyDB) + + def test_table_creation_from_dicts(tmp_db): # --8<-- [start:create_table_from_dicts] data = [ @@ -417,160 +471,314 @@ def make_batches(): assert table.count_rows() == 10 +def _create_users_example_table(db, table_name="users_example"): + return db.create_table( + table_name, + data=pa.table( + { + "id": [1, 2, 3], + "name": ["Alice", "Bob", "Charlie"], + "login_count": [10, 20, 5], + } + ), + mode="overwrite", + ) + + +def test_update_example_table_setup(tmp_db): + db = tmp_db + + # --8<-- [start:update_example_table_setup] + import pyarrow as pa + + table = db.create_table( + "users_example", + data=pa.table( + { + "id": [1, 2], + "name": ["Alice", "Bob"], + 
"login_count": [10, 20], + } + ), + mode="overwrite", + ) + # --8<-- [end:update_example_table_setup] + assert table.count_rows() == 2 + + def test_update_operation(tmp_db): db = tmp_db # --8<-- [start:update_operation] - import pandas as pd + import pyarrow as pa - # Create a table from a pandas DataFrame - data = pd.DataFrame({"x": [1, 2, 3], "vector": [[1, 2], [3, 4], [5, 6]]}) - tbl = db.create_table("test_table", data, mode="overwrite") - # Update the table where x = 2 - tbl.update(where="x = 2", values={"vector": [10, 10]}) - # Get the updated table as a pandas DataFrame - df = tbl.to_pandas() - print(df) + table = db.create_table( + "users_example", + data=pa.table( + { + "id": [1, 2], + "name": ["Alice", "Bob"], + "login_count": [10, 20], + } + ), + mode="overwrite", + ) + table.update(where="id = 2", values={"name": "Bobby"}) # --8<-- [end:update_operation] - assert df.loc[df["x"] == 2, "vector"].iloc[0] == [10, 10] + rows = table.to_arrow().sort_by("id").to_pylist() + assert rows == [ + {"id": 1, "name": "Alice", "login_count": 10}, + {"id": 2, "name": "Bobby", "login_count": 20}, + ] def test_update_using_sql(tmp_db): db = tmp_db # --8<-- [start:update_using_sql] - import pandas as pd + import pyarrow as pa - # Create a table from a pandas DataFrame - data = pd.DataFrame({"x": [1, 2, 3], "vector": [[1, 2], [3, 4], [5, 6]]}) - tbl = db.create_table("test_table", data, mode="overwrite") - # Update all rows: increment x by 1 - tbl.update(values_sql={"x": "x + 1"}) - print(tbl.to_pandas()) + table = db.create_table( + "users_example", + data=pa.table( + { + "id": [1, 2], + "name": ["Alice", "Bob"], + "login_count": [10, 20], + } + ), + mode="overwrite", + ) + table.update(where="id = 2", values_sql={"login_count": "login_count + 1"}) # --8<-- [end:update_using_sql] - assert sorted(tbl.to_pandas()["x"].tolist()) == [2, 3, 4] + rows = table.to_arrow().sort_by("id").to_pylist() + assert rows == [ + {"id": 1, "name": "Alice", "login_count": 10}, + {"id": 2, 
"name": "Bob", "login_count": 21}, + ] -def test_delete_operation(tmp_db): +def test_merge_matched_update_only(tmp_db): db = tmp_db + + # --8<-- [start:merge_matched_update_only] + import pyarrow as pa + table = db.create_table( - "update_table_example", - [ - {"vector": [3.1, 4.1], "item": "foo", "price": 10.0}, - {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}, - {"vector": [10.2, 100.8], "item": "baz", "price": 30.0}, - ], + "users_example", + data=pa.table( + { + "id": [1, 2], + "name": ["Alice", "Bob"], + "login_count": [10, 20], + } + ), mode="overwrite", ) - # --8<-- [start:delete_operation] - # delete data - predicate = "price = 30.0" - table.delete(predicate) - # --8<-- [end:delete_operation] - assert table.count_rows() == 2 + incoming_users = pa.table( + { + "id": [2, 3], + "name": ["Bobby", "Charlie"], + "login_count": [21, 5], + } + ) + + ( + table.merge_insert("id") + .when_matched_update_all() + .execute(incoming_users) + ) + # --8<-- [end:merge_matched_update_only] + rows = table.to_arrow().sort_by("id").to_pylist() + assert rows == [ + {"id": 1, "name": "Alice", "login_count": 10}, + {"id": 2, "name": "Bobby", "login_count": 21}, + ] -def test_upsert_operation(tmp_db): +def test_insert_if_not_exists(tmp_db): db = tmp_db - # --8<-- [start:upsert_operation] - # Create example table - users_table_name = "users_example" + # --8<-- [start:insert_if_not_exists] + import pyarrow as pa + table = db.create_table( - users_table_name, - [ - {"id": 0, "name": "Alice"}, - {"id": 1, "name": "Bob"}, - ], + "users_example", + data=pa.table( + { + "id": [1, 2], + "name": ["Alice", "Bob"], + "login_count": [10, 20], + } + ), mode="overwrite", ) - print(f"Created users table with {table.count_rows()} rows") - # Prepare data for upsert - new_users = [ - {"id": 1, "name": "Bobby"}, # Will update existing record - {"id": 2, "name": "Charlie"}, # Will insert new record - ] + incoming_users = pa.table( + { + "id": [2, 3], + "name": ["Bobby", "Charlie"], + 
"login_count": [21, 5], + } + ) - # Upsert by id ( table.merge_insert("id") - .when_matched_update_all() .when_not_matched_insert_all() - .execute(new_users) + .execute(incoming_users) ) - - # Verify results - should be 3 records total - print(f"Total users: {table.count_rows()}") # 3 - # --8<-- [end:upsert_operation] - assert table.count_rows() == 3 + # --8<-- [end:insert_if_not_exists] + rows = table.to_arrow().sort_by("id").to_pylist() + assert rows == [ + {"id": 1, "name": "Alice", "login_count": 10}, + {"id": 2, "name": "Bob", "login_count": 20}, + {"id": 3, "name": "Charlie", "login_count": 5}, + ] -def test_insert_if_not_exists(tmp_db): +def test_merge_update_insert(tmp_db): db = tmp_db - # --8<-- [start:insert_if_not_exists] - # Create example table + # --8<-- [start:merge_update_insert] + import pyarrow as pa + table = db.create_table( - "domains", - [ - {"domain": "google.com", "name": "Google"}, - {"domain": "github.com", "name": "GitHub"}, - ], + "users_example", + data=pa.table( + { + "id": [1, 2], + "name": ["Alice", "Bob"], + "login_count": [10, 20], + } + ), mode="overwrite", ) - # Prepare new data - one existing and one new record - new_domains = [ - {"domain": "google.com", "name": "Google"}, - {"domain": "facebook.com", "name": "Facebook"}, + incoming_users = pa.table( + { + "id": [2, 3], + "name": ["Bobby", "Charlie"], + "login_count": [21, 5], + } + ) + + ( + table.merge_insert("id") + .when_matched_update_all() + .when_not_matched_insert_all() + .execute(incoming_users) + ) + # --8<-- [end:merge_update_insert] + rows = table.to_arrow().sort_by("id").to_pylist() + assert rows == [ + {"id": 1, "name": "Alice", "login_count": 10}, + {"id": 2, "name": "Bobby", "login_count": 21}, + {"id": 3, "name": "Charlie", "login_count": 5}, ] - # Insert only if domain doesn't exist - table.merge_insert("domain").when_not_matched_insert_all().execute(new_domains) - # Verify count - should be 3 (original 2 plus 1 new) - print(f"Total domains: 
{table.count_rows()}") # 3 - # --8<-- [end:insert_if_not_exists] - assert table.count_rows() == 3 +def test_merge_delete_missing_by_source(tmp_db): + db = tmp_db + + # --8<-- [start:merge_delete_missing_by_source] + import pyarrow as pa + + table = db.create_table( + "users_example", + data=pa.table( + { + "id": [1, 2, 3], + "name": ["Alice", "Bob", "Charlie"], + "login_count": [10, 20, 5], + } + ), + mode="overwrite", + ) + + incoming_users = pa.table( + { + "id": [2, 3], + "name": ["Bobby", "Charlie"], + "login_count": [21, 5], + } + ) + ( + table.merge_insert("id") + .when_matched_update_all() + .when_not_matched_insert_all() + .when_not_matched_by_source_delete() + .execute(incoming_users) + ) + # --8<-- [end:merge_delete_missing_by_source] + rows = table.to_arrow().sort_by("id").to_pylist() + assert rows == [ + {"id": 2, "name": "Bobby", "login_count": 21}, + {"id": 3, "name": "Charlie", "login_count": 5}, + ] -def test_replace_range_operation(tmp_db): + +def test_merge_partial_columns(tmp_db): db = tmp_db - # --8<-- [start:replace_range_operation] - # Create example table with document chunks + # --8<-- [start:merge_partial_columns] + import pyarrow as pa + table = db.create_table( - "chunks", - [ - {"doc_id": 0, "chunk_id": 0, "text": "Hello"}, - {"doc_id": 0, "chunk_id": 1, "text": "World"}, - {"doc_id": 1, "chunk_id": 0, "text": "Foo"}, - {"doc_id": 1, "chunk_id": 1, "text": "Bar"}, - {"doc_id": 2, "chunk_id": 0, "text": "Baz"}, - ], + "users_example", + data=pa.table( + { + "id": [1, 2], + "name": ["Alice", "Bob"], + "login_count": [10, 20], + } + ), mode="overwrite", ) - # New data - replacing all chunks for doc_id 1 with just one chunk - new_chunks = [ - {"doc_id": 1, "chunk_id": 0, "text": "Zoo"}, - ] + incoming_users = pa.table( + { + "id": [2, 3], + "name": ["Bobby", "Charlie"], + } + ) - # Replace all chunks for doc_id 1 ( - table.merge_insert(["doc_id"]) + table.merge_insert("id") .when_matched_update_all() .when_not_matched_insert_all() - 
.when_not_matched_by_source_delete("doc_id = 1") - .execute(new_chunks) + .execute(incoming_users) ) + # --8<-- [end:merge_partial_columns] + rows = table.to_arrow().sort_by("id").to_pylist() + assert rows == [ + {"id": 1, "name": "Alice", "login_count": 10}, + {"id": 2, "name": "Bobby", "login_count": 20}, + {"id": 3, "name": "Charlie", "login_count": None}, + ] + + +def test_delete_operation(tmp_db): + db = tmp_db + table = _create_users_example_table(db) + + # --8<-- [start:delete_operation] + # delete data + predicate = "id = 3" + table.delete(predicate) + # --8<-- [end:delete_operation] + assert table.count_rows() == 2 + + +def test_update_optimize_cleanup_snippet(tmp_db): + table = _create_users_example_table(tmp_db, table_name="users_cleanup_example") + + # --8<-- [start:update_optimize_cleanup] + from datetime import timedelta - # Verify count for doc_id = 1 - should be 1 - print(f"Chunks for doc_id = 1: {table.count_rows('doc_id = 1')}") # 1 - # --8<-- [end:replace_range_operation] - assert table.count_rows("doc_id = 1") == 1 + table.optimize(cleanup_older_than=timedelta(days=1)) + # --8<-- [end:update_optimize_cleanup] # ============================================================================