From 6e6e2acbdeee9649085b2de3ac328cd185762b5a Mon Sep 17 00:00:00 2001 From: Rushikesh Sathe Date: Fri, 22 May 2026 11:27:37 +0530 Subject: [PATCH 1/7] docling example --- .../docling-parse-example/README.md | 20 +++ .../docling_parse_example.py | 121 ++++++++++++++++++ .../install_test_example.sh | 68 ++++++++++ .../docling-parse-example/requirements.txt | 3 + .../docling-parse-example/sub-test1.py | 28 ++++ .../docling-parse-example/sub-test2.py | 44 +++++++ .../docling-parse-example/sub-test3.py | 73 +++++++++++ 7 files changed, 357 insertions(+) create mode 100644 examples/Python3.12/docling-parse-example/README.md create mode 100644 examples/Python3.12/docling-parse-example/docling_parse_example.py create mode 100644 examples/Python3.12/docling-parse-example/install_test_example.sh create mode 100644 examples/Python3.12/docling-parse-example/requirements.txt create mode 100644 examples/Python3.12/docling-parse-example/sub-test1.py create mode 100644 examples/Python3.12/docling-parse-example/sub-test2.py create mode 100644 examples/Python3.12/docling-parse-example/sub-test3.py diff --git a/examples/Python3.12/docling-parse-example/README.md b/examples/Python3.12/docling-parse-example/README.md new file mode 100644 index 0000000..ddb500b --- /dev/null +++ b/examples/Python3.12/docling-parse-example/README.md @@ -0,0 +1,20 @@ +## Purpose: Parse PDF documents using the docling-parse library. + +### Packages used: +docling-parse + +### Functionality: +- Parses PDF documents using docling-parse v2 parser. +- Extracts document metadata (page count, text content). +- Creates sample PDF documents for testing (using reportlab). +- Handles parsing errors gracefully. +- Displays parsed document information. + +### How to run the example : +``` +chmod +x install_test_example.sh +./install_test_example.sh +``` + +### License: +It's covered under Apache 2.0 licenses \ No newline at end of file diff --git a/examples/Python3.12/docling-parse-example/docling_parse_example.py b/examples/Python3.12/docling-parse-example/docling_parse_example.py new file mode 100644 index 0000000..2faca6b --- /dev/null +++ b/examples/Python3.12/docling-parse-example/docling_parse_example.py @@ -0,0 +1,121 @@ +import sys +from pathlib import Path +from docling_parse.docling_parse import pdf_parser_v2 + +def parse_pdf_document(pdf_path: str) -> dict: + """ + Parse a PDF document using docling-parse library. + + Args: + pdf_path: Path to the PDF file to parse + + Returns: + Dictionary containing parsed document information + """ + try: + # Parse the PDF document + doc = pdf_parser_v2(pdf_path) + + # Extract basic information + result = { + "success": True, + "num_pages": len(doc.pages) if hasattr(doc, 'pages') else 0, + "has_text": bool(doc.text) if hasattr(doc, 'text') else False, + "parser_version": "v2" + } + + # Try to get page count and basic metadata + if hasattr(doc, 'pages') and doc.pages: + result["first_page_info"] = { + "page_num": 1, + "has_content": bool(doc.pages[0]) if doc.pages else False + } + + return result + + except Exception as e: + return { + "success": False, + "error": str(e), + "error_type": type(e).__name__ + } + + +def create_sample_pdf(): + """ + Create a simple sample PDF for testing purposes. + """ + try: + from reportlab.pdfgen import canvas + from reportlab.lib.pagesizes import letter + + pdf_path = "sample_document.pdf" + c = canvas.Canvas(pdf_path, pagesize=letter) + + # Add some text content + c.drawString(100, 750, "Sample PDF Document") + c.drawString(100, 730, "This is a test document for docling-parse") + c.drawString(100, 710, "Page 1 of 1") + + c.showPage() + c.save() + + print(f"Created sample PDF: {pdf_path}") + return pdf_path + + except ImportError: + print("reportlab not available, skipping PDF creation") + return None + + +if __name__ == "__main__": + print("=" * 60) + print("Docling-Parse PDF Parsing Example") + print("=" * 60) + + # Check if a PDF path was provided as argument + if len(sys.argv) > 1: + pdf_path = sys.argv[1] + print(f"\nParsing provided PDF: {pdf_path}") + else: + # Try to create a sample PDF + pdf_path = create_sample_pdf() + if not pdf_path: + print("\nNo PDF provided and couldn't create sample PDF") + print("Usage: python docling_parse_example.py [path_to_pdf]") + sys.exit(1) + + # Check if file exists + if not Path(pdf_path).exists(): + print(f"\nError: PDF file not found: {pdf_path}") + sys.exit(1) + + # Parse the PDF + print(f"\nParsing PDF document...") + result = parse_pdf_document(pdf_path) + + # Display results + print("\n" + "=" * 60) + print("Parsing Results:") + print("=" * 60) + + if result["success"]: + print(f"✓ Successfully parsed PDF") + print(f" - Number of pages: {result.get('num_pages', 'N/A')}") + print(f" - Has text content: {result.get('has_text', 'N/A')}") + print(f" - Parser version: {result.get('parser_version', 'N/A')}") + + if "first_page_info" in result: + print(f"\n First page information:") + print(f" - Page number: {result['first_page_info']['page_num']}") + print(f" - Has content: {result['first_page_info']['has_content']}") + else: + print(f"✗ Failed to parse PDF") + print(f" - Error type: {result.get('error_type', 'Unknown')}") + print(f" - Error message: {result.get('error', 'No details available')}") + + print("\n" + "=" * 60) + print("Example completed successfully!") + print("=" * 60) + +# Made with Bob diff --git a/examples/Python3.12/docling-parse-example/install_test_example.sh b/examples/Python3.12/docling-parse-example/install_test_example.sh new file mode 100644 index 0000000..a2ca1c4 --- /dev/null +++ b/examples/Python3.12/docling-parse-example/install_test_example.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# ------------------------------- +# Function to detect Linux distro +# ------------------------------- +detect_distro() { + if [ -f /etc/os-release ]; then + . /etc/os-release + echo $ID + elif [ -f /etc/redhat-release ]; then + echo "rhel" + elif [ -f /etc/debian_version ]; then + echo "debian" + else + echo "unknown" + fi +} + +DISTRO=$(detect_distro) +echo "Detected distribution: $DISTRO" +echo "Installing prerequisites..." + +# ------------------------------- +# Install system dependencies +# ------------------------------- +case $DISTRO in + "fedora"|"rhel"|"centos"|"rocky"|"almalinux") + if command -v dnf >/dev/null 2>&1; then + sudo dnf install -y python3.12 python3.12-devel python3-pip + else + sudo yum install -y python3.12 python3.12-devel python3-pip + fi + ;; + "ubuntu"|"debian") + export DEBIAN_FRONTEND=noninteractive + sudo apt update + sudo apt install -y python3.12 python3.12-dev python3-pip python3.12-venv + ;; + "sles") + sudo zypper refresh + sudo zypper install -y python312 python312-pip + + ;; + *) + echo "Unsupported distribution: $DISTRO" + exit 1 + ;; +esac + + + +python3.12 -m venv .venv +source .venv/bin/activate + +python3.12 -m pip install --no-cache --prefer-binary --extra-index-url https://wheels.developerfirst.ibm.com/ppc64le/linux -r requirements.txt + +WORKDIR=$(pwd) + +cd $WORKDIR + +python3.12 docling_parse_example.py + +echo "\n ==== Running tests ==== \n" + +python3.12 sub-test1.py +python3.12 sub-test2.py +python3.12 sub-test3.py + +# Made with Bob diff --git a/examples/Python3.12/docling-parse-example/requirements.txt b/examples/Python3.12/docling-parse-example/requirements.txt new file mode 100644 index 0000000..fee762e --- /dev/null +++ b/examples/Python3.12/docling-parse-example/requirements.txt @@ -0,0 +1,3 @@ +docling-parse==5.8.0+ppc64le1 +reportlab==4.2.5+ppc64le1 +Pillow==11.1.0+ppc64le1 \ No newline at end of file diff --git a/examples/Python3.12/docling-parse-example/sub-test1.py b/examples/Python3.12/docling-parse-example/sub-test1.py new file mode 100644 index 0000000..7b86ea7 --- /dev/null +++ b/examples/Python3.12/docling-parse-example/sub-test1.py @@ -0,0 +1,28 @@ +import unittest +import importlib.metadata + +class TestDoclingParseLibrary(unittest.TestCase): + def test_docling_parse_import(self): + """Check if docling-parse can be imported""" + try: + import docling_parse + except ImportError: + self.fail("docling-parse is not installed") + + def test_docling_parse_version(self): + """Verify docling-parse version""" + version = importlib.metadata.version("docling-parse") + assert "5.8.0" in version, f"Expected docling-parse 5.8.0, got {version}" + + def test_pdf_parser_import(self): + """Check if pdf_parser_v2 can be imported""" + try: + from docling_parse.docling_parse import pdf_parser_v2 + self.assertIsNotNone(pdf_parser_v2, "pdf_parser_v2 should not be None") + except ImportError as e: + self.fail(f"Failed to import pdf_parser_v2: {e}") + +if __name__ == "__main__": + unittest.main() + +# Made with Bob diff --git a/examples/Python3.12/docling-parse-example/sub-test2.py b/examples/Python3.12/docling-parse-example/sub-test2.py new file mode 100644 index 0000000..1aa2538 --- /dev/null +++ b/examples/Python3.12/docling-parse-example/sub-test2.py @@ -0,0 +1,44 @@ +import unittest +from pathlib import Path + +class TestReportlabPDFCreation(unittest.TestCase): + def test_reportlab_import(self): + """Check if reportlab can be imported""" + try: + from reportlab.pdfgen import canvas + from reportlab.lib.pagesizes import letter + except ImportError: + self.fail("reportlab is not installed") + + def test_create_simple_pdf(self): + """Test creating a simple PDF with reportlab""" + try: + from reportlab.pdfgen import canvas + from reportlab.lib.pagesizes import letter + + pdf_path = "test_document.pdf" + c = canvas.Canvas(pdf_path, pagesize=letter) + c.drawString(100, 750, "Test PDF Document") + c.showPage() + c.save() + + # Verify file was created + self.assertTrue(Path(pdf_path).exists(), "PDF file was not created") + + # Clean up + Path(pdf_path).unlink() + + except Exception as e: + self.fail(f"Failed to create PDF: {e}") + + def test_pillow_import(self): + """Check if Pillow can be imported""" + try: + from PIL import Image + except ImportError: + self.fail("Pillow is not installed") + +if __name__ == "__main__": + unittest.main() + +# Made with Bob diff --git a/examples/Python3.12/docling-parse-example/sub-test3.py b/examples/Python3.12/docling-parse-example/sub-test3.py new file mode 100644 index 0000000..0266a39 --- /dev/null +++ b/examples/Python3.12/docling-parse-example/sub-test3.py @@ -0,0 +1,73 @@ +import unittest +from pathlib import Path +import sys + +class TestDoclingParseIntegration(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Create a test PDF before running tests""" + try: + from reportlab.pdfgen import canvas + from reportlab.lib.pagesizes import letter + + cls.test_pdf_path = "integration_test.pdf" + c = canvas.Canvas(cls.test_pdf_path, pagesize=letter) + c.drawString(100, 750, "Integration Test PDF") + c.drawString(100, 730, "This document is used for testing docling-parse") + c.drawString(100, 710, "Line 3 of test content") + c.showPage() + c.save() + + except Exception as e: + print(f"Warning: Could not create test PDF: {e}") + cls.test_pdf_path = None + + @classmethod + def tearDownClass(cls): + """Clean up test PDF after tests""" + if cls.test_pdf_path and Path(cls.test_pdf_path).exists(): + Path(cls.test_pdf_path).unlink() + + def test_parse_pdf_with_docling(self): + """Test parsing a PDF with docling-parse""" + if not self.test_pdf_path: + self.skipTest("Test PDF not available") + + try: + from docling_parse.docling_parse import pdf_parser_v2 + + # Parse the test PDF + doc = pdf_parser_v2(self.test_pdf_path) + + # Basic assertions + self.assertIsNotNone(doc, "Parsed document should not be None") + + # Check if document has expected attributes + has_pages = hasattr(doc, 'pages') + has_text = hasattr(doc, 'text') + + # At least one of these should be true for a valid parse + self.assertTrue(has_pages or has_text, + "Parsed document should have pages or text attribute") + + except Exception as e: + self.fail(f"Failed to parse PDF with docling-parse: {e}") + + def test_docling_parse_error_handling(self): + """Test error handling for non-existent PDF""" + try: + from docling_parse.docling_parse import pdf_parser_v2 + + # Try to parse a non-existent file + non_existent_pdf = "this_file_does_not_exist.pdf" + + with self.assertRaises(Exception): + pdf_parser_v2(non_existent_pdf) + + except ImportError: + self.skipTest("docling-parse not available") + +if __name__ == "__main__": + unittest.main() + +# Made with Bob From 2fcb7e560aa56a46db756d41abd10d7c82036676 Mon Sep 17 00:00:00 2001 From: Rushikesh Sathe Date: Fri, 22 May 2026 12:00:09 +0530 Subject: [PATCH 2/7] docling updated example --- .../docling-parse-example/README.md | 43 +++- .../docling_parse_example.py | 214 ++++++++++++------ .../docling-parse-example/requirements.txt | 3 +- 3 files changed, 187 insertions(+), 73 deletions(-) diff --git a/examples/Python3.12/docling-parse-example/README.md b/examples/Python3.12/docling-parse-example/README.md index ddb500b..fd2f921 100644 --- a/examples/Python3.12/docling-parse-example/README.md +++ b/examples/Python3.12/docling-parse-example/README.md @@ -1,20 +1,45 @@ -## Purpose: Parse PDF documents using the docling-parse library. +## Purpose: Simple RAG (Retrieval-Augmented Generation) Application with Docling-Parse ### Packages used: -docling-parse +- docling-parse (PDF parsing) +- reportlab (PDF creation for testing) +- Pillow (Image processing support) +- numpy (Numerical operations) ### Functionality: -- Parses PDF documents using docling-parse v2 parser. -- Extracts document metadata (page count, text content). -- Creates sample PDF documents for testing (using reportlab). -- Handles parsing errors gracefully. -- Displays parsed document information. +This example demonstrates a simplified RAG (Retrieval-Augmented Generation) pipeline: -### How to run the example : -``` +1. **Document Parsing**: Uses docling-parse v2 to extract text from PDF documents +2. **Text Chunking**: Splits extracted text into manageable chunks for retrieval +3. **Keyword-based Retrieval**: Implements simple keyword matching to find relevant chunks +4. **Result Display**: Shows top matching chunks based on query + +### RAG Pipeline Steps: +1. Parse PDF document with docling-parse +2. Extract text content from parsed document +3. Create document chunks (configurable size) +4. Perform keyword-based retrieval on chunks +5. Display top relevant results + +### Note: +This is a simplified RAG demonstration. Production RAG systems typically use: +- Vector embeddings (e.g., sentence-transformers, OpenAI embeddings) +- Vector databases (e.g., FAISS, Chroma, Pinecone) +- LLM for generation (e.g., vLLM, Ollama, OpenAI GPT) + +### How to run the example: +```bash chmod +x install_test_example.sh ./install_test_example.sh ``` +Or manually: +```bash +python3.12 -m venv .venv +source .venv/bin/activate +pip install --extra-index-url https://wheels.developerfirst.ibm.com/ppc64le/linux -r requirements.txt +python3.12 docling_parse_example.py [optional_pdf_path] +``` + ### License: It's covered under Apache 2.0 licenses \ No newline at end of file diff --git a/examples/Python3.12/docling-parse-example/docling_parse_example.py b/examples/Python3.12/docling-parse-example/docling_parse_example.py index 2faca6b..af863f0 100644 --- a/examples/Python3.12/docling-parse-example/docling_parse_example.py +++ b/examples/Python3.12/docling-parse-example/docling_parse_example.py @@ -1,121 +1,209 @@ import sys from pathlib import Path +from typing import List, Dict from docling_parse.docling_parse import pdf_parser_v2 -def parse_pdf_document(pdf_path: str) -> dict: +def parse_pdf_with_docling(pdf_path: str) -> str: """ - Parse a PDF document using docling-parse library. + Parse a PDF document using docling-parse and extract text content. Args: pdf_path: Path to the PDF file to parse Returns: - Dictionary containing parsed document information + Extracted text content from the PDF """ try: - # Parse the PDF document + # Parse the PDF document using docling-parse v2 doc = pdf_parser_v2(pdf_path) - # Extract basic information - result = { - "success": True, - "num_pages": len(doc.pages) if hasattr(doc, 'pages') else 0, - "has_text": bool(doc.text) if hasattr(doc, 'text') else False, - "parser_version": "v2" - } + # Extract text content + text_content = "" - # Try to get page count and basic metadata - if hasattr(doc, 'pages') and doc.pages: - result["first_page_info"] = { - "page_num": 1, - "has_content": bool(doc.pages[0]) if doc.pages else False - } + # Try different methods to extract text + if hasattr(doc, 'text') and doc.text: + text_content = doc.text + elif hasattr(doc, 'pages') and doc.pages: + # Extract text from each page + for page in doc.pages: + if hasattr(page, 'text'): + text_content += page.text + "\n" - return result + return text_content if text_content else "No text content extracted" except Exception as e: - return { - "success": False, - "error": str(e), - "error_type": type(e).__name__ - } + return f"Error parsing PDF: {str(e)}" + + +def create_document_chunks(text: str, chunk_size: int = 500) -> List[str]: + """ + Split document text into smaller chunks for RAG processing. + + Args: + text: Full document text + chunk_size: Size of each chunk in characters + + Returns: + List of text chunks + """ + # Simple chunking by character count + chunks = [] + words = text.split() + current_chunk = [] + current_size = 0 + + for word in words: + word_size = len(word) + 1 # +1 for space + if current_size + word_size > chunk_size and current_chunk: + chunks.append(" ".join(current_chunk)) + current_chunk = [word] + current_size = word_size + else: + current_chunk.append(word) + current_size += word_size + + if current_chunk: + chunks.append(" ".join(current_chunk)) + + return chunks + + +def simple_keyword_search(chunks: List[str], query: str) -> List[Dict[str, any]]: + """ + Simple keyword-based retrieval (mock RAG retrieval). + In a real RAG system, this would use embeddings and vector search. + + Args: + chunks: List of document chunks + query: Search query + + Returns: + List of relevant chunks with scores + """ + results = [] + query_lower = query.lower() + query_words = set(query_lower.split()) + + for idx, chunk in enumerate(chunks): + chunk_lower = chunk.lower() + # Simple scoring: count matching words + matches = sum(1 for word in query_words if word in chunk_lower) + + if matches > 0: + results.append({ + "chunk_id": idx, + "text": chunk, + "score": matches, + "preview": chunk[:200] + "..." if len(chunk) > 200 else chunk + }) + + # Sort by score (descending) + results.sort(key=lambda x: x["score"], reverse=True) + return results def create_sample_pdf(): """ - Create a simple sample PDF for testing purposes. + Create a sample PDF document with RAG-relevant content. """ try: from reportlab.pdfgen import canvas from reportlab.lib.pagesizes import letter - pdf_path = "sample_document.pdf" + pdf_path = "rag_sample_document.pdf" c = canvas.Canvas(pdf_path, pagesize=letter) - # Add some text content - c.drawString(100, 750, "Sample PDF Document") - c.drawString(100, 730, "This is a test document for docling-parse") - c.drawString(100, 710, "Page 1 of 1") + # Add content about AI and machine learning + y_position = 750 + content = [ + "Introduction to Artificial Intelligence", + "", + "Artificial Intelligence (AI) is the simulation of human intelligence", + "processes by machines, especially computer systems. These processes", + "include learning, reasoning, and self-correction.", + "", + "Machine Learning is a subset of AI that provides systems the ability", + "to automatically learn and improve from experience without being", + "explicitly programmed. Deep learning is a subset of machine learning.", + "", + "Natural Language Processing (NLP) is a branch of AI that helps", + "computers understand, interpret and manipulate human language.", + "NLP draws from many disciplines, including computer science and", + "computational linguistics.", + ] + + for line in content: + c.drawString(50, y_position, line) + y_position -= 20 c.showPage() c.save() - print(f"Created sample PDF: {pdf_path}") + print(f"✓ Created sample PDF: {pdf_path}") return pdf_path except ImportError: - print("reportlab not available, skipping PDF creation") + print("✗ reportlab not available, skipping PDF creation") return None if __name__ == "__main__": - print("=" * 60) - print("Docling-Parse PDF Parsing Example") - print("=" * 60) + print("=" * 70) + print("Simple RAG Application with Docling-Parse") + print("=" * 70) - # Check if a PDF path was provided as argument + # Step 1: Get or create PDF document if len(sys.argv) > 1: pdf_path = sys.argv[1] - print(f"\nParsing provided PDF: {pdf_path}") + print(f"\n[1/5] Using provided PDF: {pdf_path}") else: - # Try to create a sample PDF + print("\n[1/5] Creating sample PDF document...") pdf_path = create_sample_pdf() if not pdf_path: - print("\nNo PDF provided and couldn't create sample PDF") + print("\n✗ No PDF provided and couldn't create sample PDF") print("Usage: python docling_parse_example.py [path_to_pdf]") sys.exit(1) # Check if file exists if not Path(pdf_path).exists(): - print(f"\nError: PDF file not found: {pdf_path}") + print(f"\n✗ Error: PDF file not found: {pdf_path}") sys.exit(1) - # Parse the PDF - print(f"\nParsing PDF document...") - result = parse_pdf_document(pdf_path) + # Step 2: Parse PDF with docling-parse + print("\n[2/5] Parsing PDF with docling-parse...") + document_text = parse_pdf_with_docling(pdf_path) + print(f"✓ Extracted {len(document_text)} characters") - # Display results - print("\n" + "=" * 60) - print("Parsing Results:") - print("=" * 60) + # Step 3: Create document chunks + print("\n[3/5] Creating document chunks for RAG...") + chunks = create_document_chunks(document_text, chunk_size=300) + print(f"✓ Created {len(chunks)} chunks") - if result["success"]: - print(f"✓ Successfully parsed PDF") - print(f" - Number of pages: {result.get('num_pages', 'N/A')}") - print(f" - Has text content: {result.get('has_text', 'N/A')}") - print(f" - Parser version: {result.get('parser_version', 'N/A')}") - - if "first_page_info" in result: - print(f"\n First page information:") - print(f" - Page number: {result['first_page_info']['page_num']}") - print(f" - Has content: {result['first_page_info']['has_content']}") - else: - print(f"✗ Failed to parse PDF") - print(f" - Error type: {result.get('error_type', 'Unknown')}") - print(f" - Error message: {result.get('error', 'No details available')}") + # Step 4: Demonstrate retrieval + print("\n[4/5] Performing keyword-based retrieval...") + query = "machine learning" + print(f"Query: '{query}'") + + results = simple_keyword_search(chunks, query) + print(f"✓ Found {len(results)} relevant chunks") + + # Step 5: Display results + print("\n[5/5] Top Retrieved Chunks:") + print("=" * 70) + + for i, result in enumerate(results[:3], 1): # Show top 3 + print(f"\nChunk #{result['chunk_id']} (Score: {result['score']})") + print("-" * 70) + print(result['preview']) - print("\n" + "=" * 60) - print("Example completed successfully!") - print("=" * 60) + print("\n" + "=" * 70) + print("RAG Example completed successfully!") + print("=" * 70) + print("\nNote: This is a simplified RAG demonstration.") + print("Production RAG systems would use:") + print(" - Vector embeddings (e.g., sentence-transformers)") + print(" - Vector databases (e.g., FAISS, Chroma)") + print(" - LLM for generation (e.g., vLLM, Ollama)") + print("=" * 70) # Made with Bob diff --git a/examples/Python3.12/docling-parse-example/requirements.txt b/examples/Python3.12/docling-parse-example/requirements.txt index fee762e..2fbdec3 100644 --- a/examples/Python3.12/docling-parse-example/requirements.txt +++ b/examples/Python3.12/docling-parse-example/requirements.txt @@ -1,3 +1,4 @@ docling-parse==5.8.0+ppc64le1 reportlab==4.2.5+ppc64le1 -Pillow==11.1.0+ppc64le1 \ No newline at end of file +Pillow==11.1.0+ppc64le1 +numpy==2.4.4+ppc64le1 \ No newline at end of file From ea43bd0ce323ab7e4b9ac80282905fed3b75a9ba Mon Sep 17 00:00:00 2001 From: Rushikesh Sathe Date: Fri, 22 May 2026 12:06:39 +0530 Subject: [PATCH 3/7] changes in examples --- .../Python3.12/docling-parse-example/docling_parse_example.py | 2 +- examples/Python3.12/docling-parse-example/requirements.txt | 4 ++-- examples/Python3.12/docling-parse-example/sub-test1.py | 2 +- examples/Python3.12/docling-parse-example/sub-test3.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/Python3.12/docling-parse-example/docling_parse_example.py b/examples/Python3.12/docling-parse-example/docling_parse_example.py index af863f0..6cd3fbf 100644 --- a/examples/Python3.12/docling-parse-example/docling_parse_example.py +++ b/examples/Python3.12/docling-parse-example/docling_parse_example.py @@ -1,7 +1,7 @@ import sys from pathlib import Path from typing import List, Dict -from docling_parse.docling_parse import pdf_parser_v2 +from docling_parse import pdf_parser_v2 def parse_pdf_with_docling(pdf_path: str) -> str: """ diff --git a/examples/Python3.12/docling-parse-example/requirements.txt b/examples/Python3.12/docling-parse-example/requirements.txt index 2fbdec3..5f47dff 100644 --- a/examples/Python3.12/docling-parse-example/requirements.txt +++ b/examples/Python3.12/docling-parse-example/requirements.txt @@ -1,4 +1,4 @@ docling-parse==5.8.0+ppc64le1 -reportlab==4.2.5+ppc64le1 -Pillow==11.1.0+ppc64le1 +reportlab==4.2.5 +Pillow==11.1.0 numpy==2.4.4+ppc64le1 \ No newline at end of file diff --git a/examples/Python3.12/docling-parse-example/sub-test1.py b/examples/Python3.12/docling-parse-example/sub-test1.py index 7b86ea7..8dd8bf2 100644 --- a/examples/Python3.12/docling-parse-example/sub-test1.py +++ b/examples/Python3.12/docling-parse-example/sub-test1.py @@ -17,7 +17,7 @@ def test_docling_parse_version(self): def test_pdf_parser_import(self): """Check if pdf_parser_v2 can be imported""" try: - from docling_parse.docling_parse import pdf_parser_v2 + from docling_parse import pdf_parser_v2 self.assertIsNotNone(pdf_parser_v2, "pdf_parser_v2 should not be None") except ImportError as e: self.fail(f"Failed to import pdf_parser_v2: {e}") diff --git a/examples/Python3.12/docling-parse-example/sub-test3.py b/examples/Python3.12/docling-parse-example/sub-test3.py index 0266a39..1ae5100 100644 --- a/examples/Python3.12/docling-parse-example/sub-test3.py +++ b/examples/Python3.12/docling-parse-example/sub-test3.py @@ -34,7 +34,7 @@ def test_parse_pdf_with_docling(self): self.skipTest("Test PDF not available") try: - from docling_parse.docling_parse import pdf_parser_v2 + from docling_parse import pdf_parser_v2 # Parse the test PDF doc = pdf_parser_v2(self.test_pdf_path) @@ -56,7 +56,7 @@ def test_parse_pdf_with_docling(self): def test_docling_parse_error_handling(self): """Test error handling for non-existent PDF""" try: - from docling_parse.docling_parse import pdf_parser_v2 + from docling_parse import pdf_parser_v2 # Try to parse a non-existent file non_existent_pdf = "this_file_does_not_exist.pdf" From 120bf4c8f3ced4244967edfc85e8ca4d854a186d Mon Sep 17 00:00:00 2001 From: Rushikesh Sathe Date: Fri, 22 May 2026 12:13:20 +0530 Subject: [PATCH 4/7] updated --- .../docling_parse_example.py | 25 ++++++++++--------- .../docling-parse-example/sub-test1.py | 17 ++++++++++--- .../docling-parse-example/sub-test3.py | 18 ++++++------- 3 files changed, 35 insertions(+), 25 deletions(-) diff --git a/examples/Python3.12/docling-parse-example/docling_parse_example.py b/examples/Python3.12/docling-parse-example/docling_parse_example.py index 6cd3fbf..265c6ef 100644 --- a/examples/Python3.12/docling-parse-example/docling_parse_example.py +++ b/examples/Python3.12/docling-parse-example/docling_parse_example.py @@ -1,7 +1,7 @@ import sys from pathlib import Path from typing import List, Dict -from docling_parse import pdf_parser_v2 +from docling_parse.pdf_parser import DoclingPdfParser def parse_pdf_with_docling(pdf_path: str) -> str: """ @@ -14,22 +14,23 @@ def parse_pdf_with_docling(pdf_path: str) -> str: Extracted text content from the PDF """ try: - # Parse the PDF document using docling-parse v2 - doc = pdf_parser_v2(pdf_path) + # Create parser instance and parse the PDF document + parser = DoclingPdfParser() + doc = parser.parse(pdf_path) - # Extract text content + # Extract text content from pages text_content = "" - # Try different methods to extract text - if hasattr(doc, 'text') and doc.text: - text_content = doc.text - elif hasattr(doc, 'pages') and doc.pages: - # Extract text from each page + if hasattr(doc, 'pages') and doc.pages: for page in doc.pages: - if hasattr(page, 'text'): - text_content += page.text + "\n" + # Extract text from cells in the page + if hasattr(page, 'cells'): + for cell in page.cells: + if hasattr(cell, 'text'): + text_content += cell.text + " " + text_content += "\n" - return text_content if text_content else "No text content extracted" + return text_content.strip() if text_content else "No text content extracted" except Exception as e: return f"Error parsing PDF: {str(e)}" diff --git a/examples/Python3.12/docling-parse-example/sub-test1.py b/examples/Python3.12/docling-parse-example/sub-test1.py index 8dd8bf2..83d5623 100644 --- a/examples/Python3.12/docling-parse-example/sub-test1.py +++ b/examples/Python3.12/docling-parse-example/sub-test1.py @@ -15,12 +15,21 @@ def test_docling_parse_version(self): assert "5.8.0" in version, f"Expected docling-parse 5.8.0, got {version}" def test_pdf_parser_import(self): - """Check if pdf_parser_v2 can be imported""" + """Check if DoclingPdfParser can be imported""" try: - from docling_parse import pdf_parser_v2 - self.assertIsNotNone(pdf_parser_v2, "pdf_parser_v2 should not be None") + from docling_parse.pdf_parser import DoclingPdfParser + self.assertIsNotNone(DoclingPdfParser, "DoclingPdfParser should not be None") except ImportError as e: - self.fail(f"Failed to import pdf_parser_v2: {e}") + self.fail(f"Failed to import DoclingPdfParser: {e}") + + def test_pdf_parser_instantiation(self): + """Check if DoclingPdfParser can be instantiated""" + try: + from docling_parse.pdf_parser import DoclingPdfParser + parser = DoclingPdfParser() + self.assertIsNotNone(parser, "Parser instance should not be None") + except Exception as e: + self.fail(f"Failed to instantiate DoclingPdfParser: {e}") if __name__ == "__main__": unittest.main() diff --git a/examples/Python3.12/docling-parse-example/sub-test3.py b/examples/Python3.12/docling-parse-example/sub-test3.py index 1ae5100..1ff2cb6 100644 --- a/examples/Python3.12/docling-parse-example/sub-test3.py +++ b/examples/Python3.12/docling-parse-example/sub-test3.py @@ -34,21 +34,20 @@ def test_parse_pdf_with_docling(self): self.skipTest("Test PDF not available") try: - from docling_parse import pdf_parser_v2 + from docling_parse.pdf_parser import DoclingPdfParser - # Parse the test PDF - doc = pdf_parser_v2(self.test_pdf_path) + # Create parser and parse the test PDF + parser = DoclingPdfParser() + doc = parser.parse(self.test_pdf_path) # Basic assertions self.assertIsNotNone(doc, "Parsed document should not be None") # Check if document has expected attributes has_pages = hasattr(doc, 'pages') - has_text = hasattr(doc, 'text') - # At least one of these should be true for a valid parse - self.assertTrue(has_pages or has_text, - "Parsed document should have pages or text attribute") + # Document should have pages + self.assertTrue(has_pages, "Parsed document should have pages attribute") except Exception as e: self.fail(f"Failed to parse PDF with docling-parse: {e}") @@ -56,13 +55,14 @@ def test_parse_pdf_with_docling(self): def test_docling_parse_error_handling(self): """Test error handling for non-existent PDF""" try: - from docling_parse import pdf_parser_v2 + from docling_parse.pdf_parser import DoclingPdfParser # Try to parse a non-existent file non_existent_pdf = "this_file_does_not_exist.pdf" + parser = DoclingPdfParser() with self.assertRaises(Exception): - pdf_parser_v2(non_existent_pdf) + parser.parse(non_existent_pdf) except ImportError: self.skipTest("docling-parse not available") From 08c55555f899b0d38a2b6fb6e09b5335caa62a0b Mon Sep 17 00:00:00 2001 From: Rushikesh Sathe Date: Fri, 22 May 2026 12:17:28 +0530 Subject: [PATCH 5/7] updated subtest --- .../docling-parse-example/docling_parse_example.py | 4 ++-- examples/Python3.12/docling-parse-example/sub-test3.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/Python3.12/docling-parse-example/docling_parse_example.py b/examples/Python3.12/docling-parse-example/docling_parse_example.py index 265c6ef..6edc539 100644 --- a/examples/Python3.12/docling-parse-example/docling_parse_example.py +++ b/examples/Python3.12/docling-parse-example/docling_parse_example.py @@ -14,9 +14,9 @@ def parse_pdf_with_docling(pdf_path: str) -> str: Extracted text content from the PDF """ try: - # Create parser instance and parse the PDF document + # Create parser instance and load the PDF document parser = DoclingPdfParser() - doc = parser.parse(pdf_path) + doc = parser.load(pdf_path) # Extract text content from pages text_content = "" diff --git a/examples/Python3.12/docling-parse-example/sub-test3.py b/examples/Python3.12/docling-parse-example/sub-test3.py index 1ff2cb6..6cd156f 100644 --- a/examples/Python3.12/docling-parse-example/sub-test3.py +++ b/examples/Python3.12/docling-parse-example/sub-test3.py @@ -36,9 +36,9 @@ def test_parse_pdf_with_docling(self): try: from docling_parse.pdf_parser import DoclingPdfParser - # Create parser and parse the test PDF + # Create parser and load the test PDF parser = DoclingPdfParser() - doc = parser.parse(self.test_pdf_path) + doc = parser.load(self.test_pdf_path) # Basic assertions self.assertIsNotNone(doc, "Parsed document should not be None") @@ -57,12 +57,12 @@ def test_docling_parse_error_handling(self): try: from docling_parse.pdf_parser import DoclingPdfParser - # Try to parse a non-existent file + # Try to load a non-existent file non_existent_pdf = "this_file_does_not_exist.pdf" parser = DoclingPdfParser() with self.assertRaises(Exception): - parser.parse(non_existent_pdf) + parser.load(non_existent_pdf) except ImportError: self.skipTest("docling-parse not available") From ed51bd6149e7e7461b66518abda5c8ebd880f867 Mon Sep 17 00:00:00 2001 From: Rushikesh Sathe Date: Fri, 22 May 2026 12:22:42 +0530 Subject: [PATCH 6/7] subtest --- .../docling_parse_example.py | 18 +++++++++--------- .../docling-parse-example/sub-test3.py | 18 ++++++++++++++---- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/examples/Python3.12/docling-parse-example/docling_parse_example.py b/examples/Python3.12/docling-parse-example/docling_parse_example.py index 6edc539..855f5c9 100644 --- a/examples/Python3.12/docling-parse-example/docling_parse_example.py +++ b/examples/Python3.12/docling-parse-example/docling_parse_example.py @@ -18,17 +18,17 @@ def parse_pdf_with_docling(pdf_path: str) -> str: parser = DoclingPdfParser() doc = parser.load(pdf_path) - # Extract text content from pages + # Extract text content from all pages text_content = "" - if hasattr(doc, 'pages') and doc.pages: - for page in doc.pages: - # Extract text from cells in the page - if hasattr(page, 'cells'): - for cell in page.cells: - if hasattr(cell, 'text'): - text_content += cell.text + " " - text_content += "\n" + for page_num in range(1, doc.number_of_pages() + 1): + page = doc.get_page(page_num) + + # Extract text from word cells + if hasattr(page, 'word_cells') and page.word_cells: + for cell in page.word_cells: + text_content += cell.text + " " + text_content += "\n" return text_content.strip() if text_content else "No text content extracted" diff --git a/examples/Python3.12/docling-parse-example/sub-test3.py b/examples/Python3.12/docling-parse-example/sub-test3.py index 6cd156f..997ba66 100644 --- a/examples/Python3.12/docling-parse-example/sub-test3.py +++ b/examples/Python3.12/docling-parse-example/sub-test3.py @@ -43,11 +43,21 @@ def test_parse_pdf_with_docling(self): # Basic assertions self.assertIsNotNone(doc, "Parsed document should not be None") - # Check if document has expected attributes - has_pages = hasattr(doc, 'pages') + # Check if document has number_of_pages method + self.assertTrue(hasattr(doc, 'number_of_pages'), + "Document should have number_of_pages method") - # Document should have pages - self.assertTrue(has_pages, "Parsed document should have pages attribute") + # Check that we can get pages + num_pages = doc.number_of_pages() + self.assertGreater(num_pages, 0, "Document should have at least one page") + + # Try to get the first page + page = doc.get_page(1) + self.assertIsNotNone(page, "Should be able to get page 1") + + # Check that page has word_cells + self.assertTrue(hasattr(page, 'word_cells'), + "Page should have word_cells attribute") except Exception as e: self.fail(f"Failed to parse PDF with docling-parse: {e}") From df9cf4445b454b97a545879a5b217e1a6c4d570c Mon Sep 17 00:00:00 2001 From: Rushikesh Sathe Date: Fri, 22 May 2026 08:35:00 +0000 Subject: [PATCH 7/7] Added docling example --- examples/Python3.12/docling-parse-example/README.md | 2 +- .../Python3.12/docling-parse-example/docling_parse_example.py | 1 - .../Python3.12/docling-parse-example/install_test_example.sh | 1 - examples/Python3.12/docling-parse-example/sub-test1.py | 1 - examples/Python3.12/docling-parse-example/sub-test2.py | 1 - examples/Python3.12/docling-parse-example/sub-test3.py | 1 - 6 files changed, 1 insertion(+), 6 deletions(-) mode change 100644 => 100755 examples/Python3.12/docling-parse-example/install_test_example.sh diff --git a/examples/Python3.12/docling-parse-example/README.md b/examples/Python3.12/docling-parse-example/README.md index fd2f921..31d2bdb 100644 --- a/examples/Python3.12/docling-parse-example/README.md +++ b/examples/Python3.12/docling-parse-example/README.md @@ -42,4 +42,4 @@ python3.12 docling_parse_example.py [optional_pdf_path] ``` ### License: -It's covered under Apache 2.0 licenses \ No newline at end of file +It's covered under Apache 2.0 licenses diff --git a/examples/Python3.12/docling-parse-example/docling_parse_example.py b/examples/Python3.12/docling-parse-example/docling_parse_example.py index 855f5c9..d2fca00 100644 --- a/examples/Python3.12/docling-parse-example/docling_parse_example.py +++ b/examples/Python3.12/docling-parse-example/docling_parse_example.py @@ -207,4 +207,3 @@ def create_sample_pdf(): print(" - LLM for generation (e.g., vLLM, Ollama)") print("=" * 70) -# Made with Bob diff --git a/examples/Python3.12/docling-parse-example/install_test_example.sh b/examples/Python3.12/docling-parse-example/install_test_example.sh old mode 100644 new mode 100755 index a2ca1c4..509a866 --- a/examples/Python3.12/docling-parse-example/install_test_example.sh +++ b/examples/Python3.12/docling-parse-example/install_test_example.sh @@ -65,4 +65,3 @@ python3.12 sub-test1.py python3.12 sub-test2.py python3.12 sub-test3.py -# Made with Bob diff --git a/examples/Python3.12/docling-parse-example/sub-test1.py b/examples/Python3.12/docling-parse-example/sub-test1.py index 83d5623..8f00b64 100644 --- a/examples/Python3.12/docling-parse-example/sub-test1.py +++ b/examples/Python3.12/docling-parse-example/sub-test1.py @@ -34,4 +34,3 @@ def test_pdf_parser_instantiation(self): if __name__ == "__main__": unittest.main() -# Made with Bob diff --git a/examples/Python3.12/docling-parse-example/sub-test2.py b/examples/Python3.12/docling-parse-example/sub-test2.py index 1aa2538..181873d 100644 --- a/examples/Python3.12/docling-parse-example/sub-test2.py +++ b/examples/Python3.12/docling-parse-example/sub-test2.py @@ -41,4 +41,3 @@ def test_pillow_import(self): if __name__ == "__main__": unittest.main() -# Made with Bob diff --git a/examples/Python3.12/docling-parse-example/sub-test3.py b/examples/Python3.12/docling-parse-example/sub-test3.py index 997ba66..41933f3 100644 --- a/examples/Python3.12/docling-parse-example/sub-test3.py +++ b/examples/Python3.12/docling-parse-example/sub-test3.py @@ -80,4 +80,3 @@ def test_docling_parse_error_handling(self): if __name__ == "__main__": unittest.main() -# Made with Bob