From 336591087581acc29728c32b6daa6a834cce43ec Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 8 May 2026 15:51:46 +0200 Subject: [PATCH 1/4] docs: add Docling Serve integration page Co-Authored-By: Claude Sonnet 4.6 --- integrations/docling-serve.md | 102 ++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 integrations/docling-serve.md diff --git a/integrations/docling-serve.md b/integrations/docling-serve.md new file mode 100644 index 0000000..8fa0f25 --- /dev/null +++ b/integrations/docling-serve.md @@ -0,0 +1,102 @@ +--- +layout: integration +name: Docling Serve +description: Use DoclingServe to convert PDF, DOCX, HTML, and other document types to Haystack Documents via a remote HTTP server, with no local ML dependencies +authors: + - name: deepset + socials: + github: deepset-ai +pypi: https://pypi.org/project/docling-serve-haystack +repo: https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling_serve +type: Data Ingestion +report_issue: https://github.com/deepset-ai/haystack-core-integrations/issues +logo: /logos/docling.png +version: Haystack 2.0 +toc: true +--- +### **Table of Contents** +- [Overview](#overview) +- [Installation](#installation) +- [Usage](#usage) +- [License](#license) + +## Overview + +[DoclingServe](https://github.com/docling-project/docling-serve) hosts [Docling](https://github.com/DS4SD/docling) +as a scalable HTTP server, supporting PDFs, Office documents, HTML, and many other formats. + +This integration introduces `DoclingServeConverter`, a Haystack component that sends documents to a running +DoclingServe instance and returns Haystack `Document` objects. Unlike the local `DoclingConverter`, this +component has no heavy ML dependencies — all document parsing happens on the remote server. + +## Installation + +```bash +pip install docling-serve-haystack +``` + +Start a DoclingServe instance locally (requires Docker): + +```bash +docker run -p 5001:5001 ghcr.io/docling-project/docling-serve-cpu:latest +``` + +## Usage + +### Components + +This integration introduces `DoclingServeConverter`, a component which converts documents by sending them +to a DoclingServe HTTP server and returns Haystack `Document` objects. + +Local files and `ByteStream` objects are uploaded via the `/v1/convert/file` endpoint. URL strings are +sent to `/v1/convert/source`. + +The component supports three export modes via the `export_type` parameter: + +- `ExportType.MARKDOWN` (default): Returns document content as a Markdown string. +- `ExportType.TEXT`: Returns plain text extracted from the document. +- `ExportType.JSON`: Returns the full Docling document representation as a JSON string. + +### Standalone + +```python +from haystack_integrations.components.converters.docling_serve import ( + DoclingServeConverter, +) + +# Default: Markdown output +converter = DoclingServeConverter(base_url="http://localhost:5001") +result = converter.run(sources=["https://arxiv.org/pdf/2206.01062"]) +documents = result["documents"] +print(documents[0].content[:200]) +``` + +### In a Pipeline + +```python +from haystack import Pipeline +from haystack.components.preprocessors import DocumentSplitter +from haystack.components.writers import DocumentWriter +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack_integrations.components.converters.docling_serve import ( + DoclingServeConverter, +) + +document_store = InMemoryDocumentStore() + +pipeline = Pipeline() +pipeline.add_component( + "converter", + DoclingServeConverter(base_url="http://localhost:5001"), +) +pipeline.add_component("splitter", DocumentSplitter()) +pipeline.add_component("writer", DocumentWriter(document_store=document_store)) +pipeline.connect("converter", "splitter") +pipeline.connect("splitter", "writer") + +pipeline.run({"converter": {"sources": ["report.pdf", "manual.docx"]}}) +``` + +### License + +`docling-serve-haystack` is distributed under the terms of the Apache-2.0 license. From f1ccffc9758d8ffdb24422cefeba15466160ccfc Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 8 May 2026 15:53:29 +0200 Subject: [PATCH 2/4] docs: rename DoclingServe to Docling Serve in integration page Co-Authored-By: Claude Sonnet 4.6 --- integrations/docling-serve.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/integrations/docling-serve.md b/integrations/docling-serve.md index 8fa0f25..5d82b45 100644 --- a/integrations/docling-serve.md +++ b/integrations/docling-serve.md @@ -1,7 +1,7 @@ --- layout: integration name: Docling Serve -description: Use DoclingServe to convert PDF, DOCX, HTML, and other document types to Haystack Documents via a remote HTTP server, with no local ML dependencies +description: Use Docling Serve to convert PDF, DOCX, HTML, and other document types to Haystack Documents via a remote HTTP server, with no local ML dependencies authors: - name: deepset socials: @@ -22,11 +22,11 @@ toc: true ## Overview -[DoclingServe](https://github.com/docling-project/docling-serve) hosts [Docling](https://github.com/DS4SD/docling) +[Docling Serve](https://github.com/docling-project/docling-serve) hosts [Docling](https://github.com/DS4SD/docling) as a scalable HTTP server, supporting PDFs, Office documents, HTML, and many other formats. This integration introduces `DoclingServeConverter`, a Haystack component that sends documents to a running -DoclingServe instance and returns Haystack `Document` objects. Unlike the local `DoclingConverter`, this +Docling Serve instance and returns Haystack `Document` objects. Unlike the local `DoclingConverter`, this component has no heavy ML dependencies — all document parsing happens on the remote server. ## Installation @@ -35,7 +35,7 @@ component has no heavy ML dependencies — all document parsing happens on the r pip install docling-serve-haystack ``` -Start a DoclingServe instance locally (requires Docker): +Start a Docling Serve instance locally (requires Docker): ```bash docker run -p 5001:5001 ghcr.io/docling-project/docling-serve-cpu:latest @@ -46,7 +46,7 @@ docker run -p 5001:5001 ghcr.io/docling-project/docling-serve-cpu:latest ### Components This integration introduces `DoclingServeConverter`, a component which converts documents by sending them -to a DoclingServe HTTP server and returns Haystack `Document` objects. +to a Docling Serve HTTP server and returns Haystack `Document` objects. Local files and `ByteStream` objects are uploaded via the `/v1/convert/file` endpoint. URL strings are sent to `/v1/convert/source`. From 23aa4cf99e9dbaa8a118f50ad670c7a5c59771f0 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 8 May 2026 15:54:46 +0200 Subject: [PATCH 3/4] docs: shorten overview and components description in Docling Serve integration page Co-Authored-By: Claude Sonnet 4.6 --- integrations/docling-serve.md | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/integrations/docling-serve.md b/integrations/docling-serve.md index 5d82b45..61a7204 100644 --- a/integrations/docling-serve.md +++ b/integrations/docling-serve.md @@ -23,11 +23,8 @@ toc: true ## Overview [Docling Serve](https://github.com/docling-project/docling-serve) hosts [Docling](https://github.com/DS4SD/docling) -as a scalable HTTP server, supporting PDFs, Office documents, HTML, and many other formats. - -This integration introduces `DoclingServeConverter`, a Haystack component that sends documents to a running -Docling Serve instance and returns Haystack `Document` objects. Unlike the local `DoclingConverter`, this -component has no heavy ML dependencies — all document parsing happens on the remote server. +as a scalable HTTP server, supporting PDFs, Office documents, HTML, and many other formats. All document +parsing happens on the remote server, with no local ML dependencies. ## Installation @@ -45,10 +42,7 @@ docker run -p 5001:5001 ghcr.io/docling-project/docling-serve-cpu:latest ### Components -This integration introduces `DoclingServeConverter`, a component which converts documents by sending them -to a Docling Serve HTTP server and returns Haystack `Document` objects. - -Local files and `ByteStream` objects are uploaded via the `/v1/convert/file` endpoint. URL strings are +`DoclingServeConverter` converts documents by sending them to a Docling Serve HTTP server. Local files and `ByteStream` objects are uploaded via the `/v1/convert/file` endpoint. URL strings are sent to `/v1/convert/source`. The component supports three export modes via the `export_type` parameter: From dd5acf73e30b9908d9ab2c7aaab0885374438306 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 8 May 2026 15:59:31 +0200 Subject: [PATCH 4/4] docs: fix pipeline example to use a real URL source Co-Authored-By: Claude Sonnet 4.6 --- integrations/docling-serve.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/docling-serve.md b/integrations/docling-serve.md index 61a7204..b845355 100644 --- a/integrations/docling-serve.md +++ b/integrations/docling-serve.md @@ -60,7 +60,7 @@ from haystack_integrations.components.converters.docling_serve import ( # Default: Markdown output converter = DoclingServeConverter(base_url="http://localhost:5001") -result = converter.run(sources=["https://arxiv.org/pdf/2206.01062"]) +result = converter.run(sources=["https://arxiv.org/pdf/2602.17316"]) documents = result["documents"] print(documents[0].content[:200]) ``` @@ -88,7 +88,7 @@ pipeline.add_component("writer", DocumentWriter(document_store=document_store)) pipeline.connect("converter", "splitter") pipeline.connect("splitter", "writer") -pipeline.run({"converter": {"sources": ["report.pdf", "manual.docx"]}}) +pipeline.run({"converter": {"sources": ["https://arxiv.org/pdf/2602.17316"]}}) ``` ### License