From 6e6e2acbdeee9649085b2de3ac328cd185762b5a Mon Sep 17 00:00:00 2001
From: Rushikesh Sathe <rushikeshsathe@Rushikeshs-MacBook-Air.local>
Date: Fri, 22 May 2026 11:27:37 +0530
Subject: [PATCH 1/7] docling example

---
 .../docling-parse-example/README.md           |  20 +++
 .../docling_parse_example.py                  | 121 ++++++++++++++++++
 .../install_test_example.sh                   |  68 ++++++++++
 .../docling-parse-example/requirements.txt    |   3 +
 .../docling-parse-example/sub-test1.py        |  28 ++++
 .../docling-parse-example/sub-test2.py        |  44 +++++++
 .../docling-parse-example/sub-test3.py        |  73 +++++++++++
 7 files changed, 357 insertions(+)
 create mode 100644 examples/Python3.12/docling-parse-example/README.md
 create mode 100644 examples/Python3.12/docling-parse-example/docling_parse_example.py
 create mode 100644 examples/Python3.12/docling-parse-example/install_test_example.sh
 create mode 100644 examples/Python3.12/docling-parse-example/requirements.txt
 create mode 100644 examples/Python3.12/docling-parse-example/sub-test1.py
 create mode 100644 examples/Python3.12/docling-parse-example/sub-test2.py
 create mode 100644 examples/Python3.12/docling-parse-example/sub-test3.py

diff --git a/examples/Python3.12/docling-parse-example/README.md b/examples/Python3.12/docling-parse-example/README.md
new file mode 100644
index 0000000..ddb500b
--- /dev/null
+++ b/examples/Python3.12/docling-parse-example/README.md
@@ -0,0 +1,20 @@
+## Purpose: Parse PDF documents using the docling-parse library.
+
+### Packages used:
+docling-parse
+
+### Functionality:
+- Parses PDF documents using docling-parse v2 parser.
+- Extracts document metadata (page count, text content).
+- Creates sample PDF documents for testing (using reportlab).
+- Handles parsing errors gracefully.
+- Displays parsed document information.
+
+### How to run the example :
+```
+chmod +x install_test_example.sh
+./install_test_example.sh
+```
+
+### License:
+It's covered under Apache 2.0 licenses
\ No newline at end of file
diff --git a/examples/Python3.12/docling-parse-example/docling_parse_example.py b/examples/Python3.12/docling-parse-example/docling_parse_example.py
new file mode 100644
index 0000000..2faca6b
--- /dev/null
+++ b/examples/Python3.12/docling-parse-example/docling_parse_example.py
@@ -0,0 +1,121 @@
+import sys
+from pathlib import Path
+from docling_parse.docling_parse import pdf_parser_v2
+
+def parse_pdf_document(pdf_path: str) -> dict:
+    """
+    Parse a PDF document using docling-parse library.
+    
+    Args:
+        pdf_path: Path to the PDF file to parse
+        
+    Returns:
+        Dictionary containing parsed document information
+    """
+    try:
+        # Parse the PDF document
+        doc = pdf_parser_v2(pdf_path)
+        
+        # Extract basic information
+        result = {
+            "success": True,
+            "num_pages": len(doc.pages) if hasattr(doc, 'pages') else 0,
+            "has_text": bool(doc.text) if hasattr(doc, 'text') else False,
+            "parser_version": "v2"
+        }
+        
+        # Try to get page count and basic metadata
+        if hasattr(doc, 'pages') and doc.pages:
+            result["first_page_info"] = {
+                "page_num": 1,
+                "has_content": bool(doc.pages[0]) if doc.pages else False
+            }
+        
+        return result
+        
+    except Exception as e:
+        return {
+            "success": False,
+            "error": str(e),
+            "error_type": type(e).__name__
+        }
+
+
+def create_sample_pdf():
+    """
+    Create a simple sample PDF for testing purposes.
+    """
+    try:
+        from reportlab.pdfgen import canvas
+        from reportlab.lib.pagesizes import letter
+        
+        pdf_path = "sample_document.pdf"
+        c = canvas.Canvas(pdf_path, pagesize=letter)
+        
+        # Add some text content
+        c.drawString(100, 750, "Sample PDF Document")
+        c.drawString(100, 730, "This is a test document for docling-parse")
+        c.drawString(100, 710, "Page 1 of 1")
+        
+        c.showPage()
+        c.save()
+        
+        print(f"Created sample PDF: {pdf_path}")
+        return pdf_path
+        
+    except ImportError:
+        print("reportlab not available, skipping PDF creation")
+        return None
+
+
+if __name__ == "__main__":
+    print("=" * 60)
+    print("Docling-Parse PDF Parsing Example")
+    print("=" * 60)
+    
+    # Check if a PDF path was provided as argument
+    if len(sys.argv) > 1:
+        pdf_path = sys.argv[1]
+        print(f"\nParsing provided PDF: {pdf_path}")
+    else:
+        # Try to create a sample PDF
+        pdf_path = create_sample_pdf()
+        if not pdf_path:
+            print("\nNo PDF provided and couldn't create sample PDF")
+            print("Usage: python docling_parse_example.py [path_to_pdf]")
+            sys.exit(1)
+    
+    # Check if file exists
+    if not Path(pdf_path).exists():
+        print(f"\nError: PDF file not found: {pdf_path}")
+        sys.exit(1)
+    
+    # Parse the PDF
+    print(f"\nParsing PDF document...")
+    result = parse_pdf_document(pdf_path)
+    
+    # Display results
+    print("\n" + "=" * 60)
+    print("Parsing Results:")
+    print("=" * 60)
+    
+    if result["success"]:
+        print(f"✓ Successfully parsed PDF")
+        print(f"  - Number of pages: {result.get('num_pages', 'N/A')}")
+        print(f"  - Has text content: {result.get('has_text', 'N/A')}")
+        print(f"  - Parser version: {result.get('parser_version', 'N/A')}")
+        
+        if "first_page_info" in result:
+            print(f"\n  First page information:")
+            print(f"    - Page number: {result['first_page_info']['page_num']}")
+            print(f"    - Has content: {result['first_page_info']['has_content']}")
+    else:
+        print(f"✗ Failed to parse PDF")
+        print(f"  - Error type: {result.get('error_type', 'Unknown')}")
+        print(f"  - Error message: {result.get('error', 'No details available')}")
+    
+    print("\n" + "=" * 60)
+    print("Example completed successfully!")
+    print("=" * 60)
+
+# Made with Bob
diff --git a/examples/Python3.12/docling-parse-example/install_test_example.sh b/examples/Python3.12/docling-parse-example/install_test_example.sh
new file mode 100644
index 0000000..a2ca1c4
--- /dev/null
+++ b/examples/Python3.12/docling-parse-example/install_test_example.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+# ------------------------------------------------------------------------
+# detect_distro: print the distro ID ("ubuntu", "rhel", ...) or "unknown"
+# ------------------------------------------------------------------------
+detect_distro() {
+    if [ -f /etc/os-release ]; then
+        . /etc/os-release  # defines ID on systems that follow os-release(5)
+        echo "${ID:-unknown}"
+    elif [ -f /etc/redhat-release ]; then
+        echo "rhel"
+    elif [ -f /etc/debian_version ]; then
+        echo "debian"
+    else
+        echo "unknown"
+    fi
+}
+
+DISTRO=$(detect_distro)
+echo "Detected distribution: $DISTRO"
+echo "Installing prerequisites..."
+
+# -------------------------------
+# Install system dependencies
+# -------------------------------
+case "$DISTRO" in
+    "fedora"|"rhel"|"centos"|"rocky"|"almalinux")
+        if command -v dnf >/dev/null 2>&1; then
+            sudo dnf install -y python3.12 python3.12-devel python3-pip
+        else
+            sudo yum install -y python3.12 python3.12-devel python3-pip
+        fi
+        ;;
+    "ubuntu"|"debian")
+        # sudo resets the environment by default, so an exported variable
+        # never reaches apt; set it through env inside the sudo command.
+        sudo apt-get update
+        sudo env DEBIAN_FRONTEND=noninteractive apt-get install -y python3.12 python3.12-dev python3-pip python3.12-venv
+        ;;
+    "sles")
+        sudo zypper refresh
+        sudo zypper install -y python312 python312-pip
+        ;;
+    *)
+        echo "Unsupported distribution: $DISTRO" >&2
+        exit 1
+        ;;
+esac
+
+# Abort on the first failing step so a broken install or a failing test
+# is reflected in this script's exit status instead of being masked.
+set -e
+
+# Use an isolated virtual environment for the Python packages.
+python3.12 -m venv .venv
+source .venv/bin/activate
+
+python3.12 -m pip install --no-cache-dir --prefer-binary --extra-index-url https://wheels.developerfirst.ibm.com/ppc64le/linux -r requirements.txt
+
+python3.12 docling_parse_example.py
+
+# printf interprets \n; a plain bash echo would print it literally.
+printf '\n ==== Running tests ==== \n\n'
+
+python3.12 sub-test1.py
+python3.12 sub-test2.py
+python3.12 sub-test3.py
+
+# Made with Bob
diff --git a/examples/Python3.12/docling-parse-example/requirements.txt b/examples/Python3.12/docling-parse-example/requirements.txt
new file mode 100644
index 0000000..fee762e
--- /dev/null
+++ b/examples/Python3.12/docling-parse-example/requirements.txt
@@ -0,0 +1,3 @@
+docling-parse==5.8.0+ppc64le1
+reportlab==4.2.5+ppc64le1
+Pillow==11.1.0+ppc64le1
\ No newline at end of file
diff --git a/examples/Python3.12/docling-parse-example/sub-test1.py b/examples/Python3.12/docling-parse-example/sub-test1.py
new file mode 100644
index 0000000..7b86ea7
--- /dev/null
+++ b/examples/Python3.12/docling-parse-example/sub-test1.py
@@ -0,0 +1,28 @@
+import unittest
+import importlib.metadata
+
+class TestDoclingParseLibrary(unittest.TestCase):
+    def test_docling_parse_import(self):
+        """Check if docling-parse can be imported"""
+        try:
+            import docling_parse
+        except ImportError:
+            self.fail("docling-parse is not installed")
+
+    def test_docling_parse_version(self):
+        """Verify docling-parse version"""
+        version = importlib.metadata.version("docling-parse")
+        assert "5.8.0" in version, f"Expected docling-parse 5.8.0, got {version}"
+
+    def test_pdf_parser_import(self):
+        """Check if pdf_parser_v2 can be imported"""
+        try:
+            from docling_parse.docling_parse import pdf_parser_v2
+            self.assertIsNotNone(pdf_parser_v2, "pdf_parser_v2 should not be None")
+        except ImportError as e:
+            self.fail(f"Failed to import pdf_parser_v2: {e}")
+
+if __name__ == "__main__":
+    unittest.main()
+
+# Made with Bob
diff --git a/examples/Python3.12/docling-parse-example/sub-test2.py b/examples/Python3.12/docling-parse-example/sub-test2.py
new file mode 100644
index 0000000..1aa2538
--- /dev/null
+++ b/examples/Python3.12/docling-parse-example/sub-test2.py
@@ -0,0 +1,44 @@
+import unittest
+from pathlib import Path
+
+class TestReportlabPDFCreation(unittest.TestCase):
+    """Smoke tests for the PDF tooling (reportlab, Pillow) the example needs."""
+
+    def test_reportlab_import(self):
+        """Check if reportlab can be imported"""
+        try:
+            from reportlab.pdfgen import canvas
+            from reportlab.lib.pagesizes import letter
+        except ImportError:
+            self.fail("reportlab is not installed")
+
+    def test_create_simple_pdf(self):
+        """Write a one-page PDF to a scratch directory and check it exists."""
+        import tempfile
+        try:
+            from reportlab.pdfgen import canvas
+            from reportlab.lib.pagesizes import letter
+        except ImportError:
+            self.fail("reportlab is not installed")
+
+        # The scratch directory is removed even if an assertion fails, so a
+        # failed run never leaves test_document.pdf behind in the working dir.
+        with tempfile.TemporaryDirectory() as scratch_dir:
+            pdf_path = Path(scratch_dir) / "test_document.pdf"
+            c = canvas.Canvas(str(pdf_path), pagesize=letter)
+            c.drawString(100, 750, "Test PDF Document")
+            c.showPage()
+            c.save()
+            self.assertTrue(pdf_path.exists(), "PDF file was not created")
+
+    def test_pillow_import(self):
+        """Check if Pillow can be imported"""
+        try:
+            from PIL import Image
+        except ImportError:
+            self.fail("Pillow is not installed")
+
+if __name__ == "__main__":
+    unittest.main()
+
+# Made with Bob
diff --git a/examples/Python3.12/docling-parse-example/sub-test3.py b/examples/Python3.12/docling-parse-example/sub-test3.py
new file mode 100644
index 0000000..0266a39
--- /dev/null
+++ b/examples/Python3.12/docling-parse-example/sub-test3.py
@@ -0,0 +1,73 @@
+import unittest
+from pathlib import Path
+import sys
+
+class TestDoclingParseIntegration(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        """Create a test PDF before running tests"""
+        try:
+            from reportlab.pdfgen import canvas
+            from reportlab.lib.pagesizes import letter
+            
+            cls.test_pdf_path = "integration_test.pdf"
+            c = canvas.Canvas(cls.test_pdf_path, pagesize=letter)
+            c.drawString(100, 750, "Integration Test PDF")
+            c.drawString(100, 730, "This document is used for testing docling-parse")
+            c.drawString(100, 710, "Line 3 of test content")
+            c.showPage()
+            c.save()
+            
+        except Exception as e:
+            print(f"Warning: Could not create test PDF: {e}")
+            cls.test_pdf_path = None
+
+    @classmethod
+    def tearDownClass(cls):
+        """Clean up test PDF after tests"""
+        if cls.test_pdf_path and Path(cls.test_pdf_path).exists():
+            Path(cls.test_pdf_path).unlink()
+
+    def test_parse_pdf_with_docling(self):
+        """Test parsing a PDF with docling-parse"""
+        if not self.test_pdf_path:
+            self.skipTest("Test PDF not available")
+        
+        try:
+            from docling_parse.docling_parse import pdf_parser_v2
+            
+            # Parse the test PDF
+            doc = pdf_parser_v2(self.test_pdf_path)
+            
+            # Basic assertions
+            self.assertIsNotNone(doc, "Parsed document should not be None")
+            
+            # Check if document has expected attributes
+            has_pages = hasattr(doc, 'pages')
+            has_text = hasattr(doc, 'text')
+            
+            # At least one of these should be true for a valid parse
+            self.assertTrue(has_pages or has_text, 
+                          "Parsed document should have pages or text attribute")
+            
+        except Exception as e:
+            self.fail(f"Failed to parse PDF with docling-parse: {e}")
+
+    def test_docling_parse_error_handling(self):
+        """Test error handling for non-existent PDF"""
+        try:
+            from docling_parse.docling_parse import pdf_parser_v2
+            
+            # Try to parse a non-existent file
+            non_existent_pdf = "this_file_does_not_exist.pdf"
+            
+            with self.assertRaises(Exception):
+                pdf_parser_v2(non_existent_pdf)
+                
+        except ImportError:
+            self.skipTest("docling-parse not available")
+
+if __name__ == "__main__":
+    unittest.main()
+
+# Made with Bob

From 2fcb7e560aa56a46db756d41abd10d7c82036676 Mon Sep 17 00:00:00 2001
From: Rushikesh Sathe <rushikeshsathe@Rushikeshs-MacBook-Air.local>
Date: Fri, 22 May 2026 12:00:09 +0530
Subject: [PATCH 2/7] docling updated example

---
 .../docling-parse-example/README.md           |  43 +++-
 .../docling_parse_example.py                  | 214 ++++++++++++------
 .../docling-parse-example/requirements.txt    |   3 +-
 3 files changed, 187 insertions(+), 73 deletions(-)

diff --git a/examples/Python3.12/docling-parse-example/README.md b/examples/Python3.12/docling-parse-example/README.md
index ddb500b..fd2f921 100644
--- a/examples/Python3.12/docling-parse-example/README.md
+++ b/examples/Python3.12/docling-parse-example/README.md
@@ -1,20 +1,45 @@
-## Purpose: Parse PDF documents using the docling-parse library.
+## Purpose: Simple RAG (Retrieval-Augmented Generation) Application with Docling-Parse
 
 ### Packages used:
-docling-parse
+- docling-parse (PDF parsing)
+- reportlab (PDF creation for testing)
+- Pillow (Image processing support)
+- numpy (Numerical operations)
 
 ### Functionality:
-- Parses PDF documents using docling-parse v2 parser.
-- Extracts document metadata (page count, text content).
-- Creates sample PDF documents for testing (using reportlab).
-- Handles parsing errors gracefully.
-- Displays parsed document information.
+This example demonstrates a simplified RAG (Retrieval-Augmented Generation) pipeline:
 
-### How to run the example :
-```
+1. **Document Parsing**: Uses docling-parse v2 to extract text from PDF documents
+2. **Text Chunking**: Splits extracted text into manageable chunks for retrieval
+3. **Keyword-based Retrieval**: Implements simple keyword matching to find relevant chunks
+4. **Result Display**: Shows top matching chunks based on query
+
+### RAG Pipeline Steps:
+1. Parse PDF document with docling-parse
+2. Extract text content from parsed document
+3. Create document chunks (configurable size)
+4. Perform keyword-based retrieval on chunks
+5. Display top relevant results
+
+### Note:
+This is a simplified RAG demonstration. Production RAG systems typically use:
+- Vector embeddings (e.g., sentence-transformers, OpenAI embeddings)
+- Vector databases (e.g., FAISS, Chroma, Pinecone)
+- LLM for generation (e.g., vLLM, Ollama, OpenAI GPT)
+
+### How to run the example:
+```bash
 chmod +x install_test_example.sh
 ./install_test_example.sh
 ```
 
+Or manually:
+```bash
+python3.12 -m venv .venv
+source .venv/bin/activate
+pip install --extra-index-url https://wheels.developerfirst.ibm.com/ppc64le/linux -r requirements.txt
+python3.12 docling_parse_example.py [optional_pdf_path]
+```
+
 ### License:
 It's covered under Apache 2.0 licenses
\ No newline at end of file
diff --git a/examples/Python3.12/docling-parse-example/docling_parse_example.py b/examples/Python3.12/docling-parse-example/docling_parse_example.py
index 2faca6b..af863f0 100644
--- a/examples/Python3.12/docling-parse-example/docling_parse_example.py
+++ b/examples/Python3.12/docling-parse-example/docling_parse_example.py
@@ -1,121 +1,209 @@
 import sys
 from pathlib import Path
+from typing import List, Dict
 from docling_parse.docling_parse import pdf_parser_v2
 
-def parse_pdf_document(pdf_path: str) -> dict:
+def parse_pdf_with_docling(pdf_path: str) -> str:
     """
-    Parse a PDF document using docling-parse library.
+    Parse a PDF document using docling-parse and extract text content.
     
     Args:
         pdf_path: Path to the PDF file to parse
         
     Returns:
-        Dictionary containing parsed document information
+        Extracted text content from the PDF
     """
     try:
-        # Parse the PDF document
+        # Parse the PDF document using docling-parse v2
         doc = pdf_parser_v2(pdf_path)
         
-        # Extract basic information
-        result = {
-            "success": True,
-            "num_pages": len(doc.pages) if hasattr(doc, 'pages') else 0,
-            "has_text": bool(doc.text) if hasattr(doc, 'text') else False,
-            "parser_version": "v2"
-        }
+        # Extract text content
+        text_content = ""
         
-        # Try to get page count and basic metadata
-        if hasattr(doc, 'pages') and doc.pages:
-            result["first_page_info"] = {
-                "page_num": 1,
-                "has_content": bool(doc.pages[0]) if doc.pages else False
-            }
+        # Try different methods to extract text
+        if hasattr(doc, 'text') and doc.text:
+            text_content = doc.text
+        elif hasattr(doc, 'pages') and doc.pages:
+            # Extract text from each page
+            for page in doc.pages:
+                if hasattr(page, 'text'):
+                    text_content += page.text + "\n"
         
-        return result
+        return text_content if text_content else "No text content extracted"
         
     except Exception as e:
-        return {
-            "success": False,
-            "error": str(e),
-            "error_type": type(e).__name__
-        }
+        return f"Error parsing PDF: {str(e)}"
+
+
+def create_document_chunks(text: str, chunk_size: int = 500) -> List[str]:
+    """
+    Split document text into whitespace-normalised chunks for RAG processing.
+
+    Words are never split: a chunk holds as many whole words as fit, so a
+    single word longer than chunk_size still becomes its own oversized chunk.
+
+    Args:
+        text: Full document text
+        chunk_size: Maximum length of a joined chunk, in characters
+
+    Returns:
+        List of text chunks; empty when text contains no words
+    """
+    chunks = []
+    current_chunk = []
+    current_size = 0  # length of " ".join(current_chunk)
+
+    for word in text.split():
+        # A separating space is only needed when the chunk already has a word;
+        # charging it to every word would close chunks one character early.
+        grown_size = current_size + len(word) + (1 if current_chunk else 0)
+        if current_chunk and grown_size > chunk_size:
+            chunks.append(" ".join(current_chunk))
+            current_chunk, current_size = [word], len(word)
+        else:
+            current_chunk.append(word)
+            current_size = grown_size
+    if current_chunk:
+        chunks.append(" ".join(current_chunk))
+    return chunks
+
+
+def simple_keyword_search(chunks: List[str], query: str) -> List[Dict[str, object]]:
+    """
+    Simple keyword-based retrieval (mock RAG retrieval).
+    In a real RAG system, this would use embeddings and vector search.
+
+    Matching is case-insensitive and substring-based: a query word scores
+    whenever it occurs anywhere in a chunk, even inside a longer word.
+
+    Args:
+        chunks: List of document chunks
+        query: Search query; split on whitespace into distinct keywords
+
+    Returns:
+        One dict per chunk that matched at least one keyword, with keys
+        "chunk_id", "text", "score" and "preview", best score first
+    """
+    results = []
+    query_words = set(query.lower().split())
+    for idx, chunk in enumerate(chunks):
+        chunk_lower = chunk.lower()
+        # Score = number of distinct query words found in the chunk.
+        matches = sum(1 for word in query_words if word in chunk_lower)
+        if matches > 0:
+            results.append({
+                "chunk_id": idx,
+                "text": chunk,
+                "score": matches,
+                "preview": chunk[:200] + "..." if len(chunk) > 200 else chunk
+            })
+    # The sort is stable, so equally scored chunks keep document order.
+    results.sort(key=lambda x: x["score"], reverse=True)
+    return results
 
 
 def create_sample_pdf():
     """
-    Create a simple sample PDF for testing purposes.
+    Create a sample PDF document with RAG-relevant content.
     """
     try:
         from reportlab.pdfgen import canvas
         from reportlab.lib.pagesizes import letter
         
-        pdf_path = "sample_document.pdf"
+        pdf_path = "rag_sample_document.pdf"
         c = canvas.Canvas(pdf_path, pagesize=letter)
         
-        # Add some text content
-        c.drawString(100, 750, "Sample PDF Document")
-        c.drawString(100, 730, "This is a test document for docling-parse")
-        c.drawString(100, 710, "Page 1 of 1")
+        # Add content about AI and machine learning
+        y_position = 750
+        content = [
+            "Introduction to Artificial Intelligence",
+            "",
+            "Artificial Intelligence (AI) is the simulation of human intelligence",
+            "processes by machines, especially computer systems. These processes",
+            "include learning, reasoning, and self-correction.",
+            "",
+            "Machine Learning is a subset of AI that provides systems the ability",
+            "to automatically learn and improve from experience without being",
+            "explicitly programmed. Deep learning is a subset of machine learning.",
+            "",
+            "Natural Language Processing (NLP) is a branch of AI that helps",
+            "computers understand, interpret and manipulate human language.",
+            "NLP draws from many disciplines, including computer science and",
+            "computational linguistics.",
+        ]
+        
+        for line in content:
+            c.drawString(50, y_position, line)
+            y_position -= 20
         
         c.showPage()
         c.save()
         
-        print(f"Created sample PDF: {pdf_path}")
+        print(f"✓ Created sample PDF: {pdf_path}")
         return pdf_path
         
     except ImportError:
-        print("reportlab not available, skipping PDF creation")
+        print("✗ reportlab not available, skipping PDF creation")
         return None
 
 
 if __name__ == "__main__":
-    print("=" * 60)
-    print("Docling-Parse PDF Parsing Example")
-    print("=" * 60)
+    print("=" * 70)
+    print("Simple RAG Application with Docling-Parse")
+    print("=" * 70)
     
-    # Check if a PDF path was provided as argument
+    # Step 1: Get or create PDF document
     if len(sys.argv) > 1:
         pdf_path = sys.argv[1]
-        print(f"\nParsing provided PDF: {pdf_path}")
+        print(f"\n[1/5] Using provided PDF: {pdf_path}")
     else:
-        # Try to create a sample PDF
+        print("\n[1/5] Creating sample PDF document...")
         pdf_path = create_sample_pdf()
         if not pdf_path:
-            print("\nNo PDF provided and couldn't create sample PDF")
+            print("\n✗ No PDF provided and couldn't create sample PDF")
             print("Usage: python docling_parse_example.py [path_to_pdf]")
             sys.exit(1)
     
     # Check if file exists
     if not Path(pdf_path).exists():
-        print(f"\nError: PDF file not found: {pdf_path}")
+        print(f"\n✗ Error: PDF file not found: {pdf_path}")
         sys.exit(1)
     
-    # Parse the PDF
-    print(f"\nParsing PDF document...")
-    result = parse_pdf_document(pdf_path)
+    # Step 2: Parse PDF with docling-parse
+    print("\n[2/5] Parsing PDF with docling-parse...")
+    document_text = parse_pdf_with_docling(pdf_path)
+    print(f"✓ Extracted {len(document_text)} characters")
     
-    # Display results
-    print("\n" + "=" * 60)
-    print("Parsing Results:")
-    print("=" * 60)
+    # Step 3: Create document chunks
+    print("\n[3/5] Creating document chunks for RAG...")
+    chunks = create_document_chunks(document_text, chunk_size=300)
+    print(f"✓ Created {len(chunks)} chunks")
     
-    if result["success"]:
-        print(f"✓ Successfully parsed PDF")
-        print(f"  - Number of pages: {result.get('num_pages', 'N/A')}")
-        print(f"  - Has text content: {result.get('has_text', 'N/A')}")
-        print(f"  - Parser version: {result.get('parser_version', 'N/A')}")
-        
-        if "first_page_info" in result:
-            print(f"\n  First page information:")
-            print(f"    - Page number: {result['first_page_info']['page_num']}")
-            print(f"    - Has content: {result['first_page_info']['has_content']}")
-    else:
-        print(f"✗ Failed to parse PDF")
-        print(f"  - Error type: {result.get('error_type', 'Unknown')}")
-        print(f"  - Error message: {result.get('error', 'No details available')}")
+    # Step 4: Demonstrate retrieval
+    print("\n[4/5] Performing keyword-based retrieval...")
+    query = "machine learning"
+    print(f"Query: '{query}'")
+    
+    results = simple_keyword_search(chunks, query)
+    print(f"✓ Found {len(results)} relevant chunks")
+    
+    # Step 5: Display results
+    print("\n[5/5] Top Retrieved Chunks:")
+    print("=" * 70)
+    
+    for i, result in enumerate(results[:3], 1):  # Show top 3
+        print(f"\nChunk #{result['chunk_id']} (Score: {result['score']})")
+        print("-" * 70)
+        print(result['preview'])
     
-    print("\n" + "=" * 60)
-    print("Example completed successfully!")
-    print("=" * 60)
+    print("\n" + "=" * 70)
+    print("RAG Example completed successfully!")
+    print("=" * 70)
+    print("\nNote: This is a simplified RAG demonstration.")
+    print("Production RAG systems would use:")
+    print("  - Vector embeddings (e.g., sentence-transformers)")
+    print("  - Vector databases (e.g., FAISS, Chroma)")
+    print("  - LLM for generation (e.g., vLLM, Ollama)")
+    print("=" * 70)
 
 # Made with Bob
diff --git a/examples/Python3.12/docling-parse-example/requirements.txt b/examples/Python3.12/docling-parse-example/requirements.txt
index fee762e..2fbdec3 100644
--- a/examples/Python3.12/docling-parse-example/requirements.txt
+++ b/examples/Python3.12/docling-parse-example/requirements.txt
@@ -1,3 +1,4 @@
 docling-parse==5.8.0+ppc64le1
 reportlab==4.2.5+ppc64le1
-Pillow==11.1.0+ppc64le1
\ No newline at end of file
+Pillow==11.1.0+ppc64le1
+numpy==2.4.4+ppc64le1
\ No newline at end of file

From ea43bd0ce323ab7e4b9ac80282905fed3b75a9ba Mon Sep 17 00:00:00 2001
From: Rushikesh Sathe <rushikeshsathe@Rushikeshs-MacBook-Air.local>
Date: Fri, 22 May 2026 12:06:39 +0530
Subject: [PATCH 3/7] changes in examples

---
 .../Python3.12/docling-parse-example/docling_parse_example.py | 2 +-
 examples/Python3.12/docling-parse-example/requirements.txt    | 4 ++--
 examples/Python3.12/docling-parse-example/sub-test1.py        | 2 +-
 examples/Python3.12/docling-parse-example/sub-test3.py        | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/Python3.12/docling-parse-example/docling_parse_example.py b/examples/Python3.12/docling-parse-example/docling_parse_example.py
index af863f0..6cd3fbf 100644
--- a/examples/Python3.12/docling-parse-example/docling_parse_example.py
+++ b/examples/Python3.12/docling-parse-example/docling_parse_example.py
@@ -1,7 +1,7 @@
 import sys
 from pathlib import Path
 from typing import List, Dict
-from docling_parse.docling_parse import pdf_parser_v2
+from docling_parse import pdf_parser_v2
 
 def parse_pdf_with_docling(pdf_path: str) -> str:
     """
diff --git a/examples/Python3.12/docling-parse-example/requirements.txt b/examples/Python3.12/docling-parse-example/requirements.txt
index 2fbdec3..5f47dff 100644
--- a/examples/Python3.12/docling-parse-example/requirements.txt
+++ b/examples/Python3.12/docling-parse-example/requirements.txt
@@ -1,4 +1,4 @@
 docling-parse==5.8.0+ppc64le1
-reportlab==4.2.5+ppc64le1
-Pillow==11.1.0+ppc64le1
+reportlab==4.2.5
+Pillow==11.1.0
 numpy==2.4.4+ppc64le1
\ No newline at end of file
diff --git a/examples/Python3.12/docling-parse-example/sub-test1.py b/examples/Python3.12/docling-parse-example/sub-test1.py
index 7b86ea7..8dd8bf2 100644
--- a/examples/Python3.12/docling-parse-example/sub-test1.py
+++ b/examples/Python3.12/docling-parse-example/sub-test1.py
@@ -17,7 +17,7 @@ def test_docling_parse_version(self):
     def test_pdf_parser_import(self):
         """Check if pdf_parser_v2 can be imported"""
         try:
-            from docling_parse.docling_parse import pdf_parser_v2
+            from docling_parse import pdf_parser_v2
             self.assertIsNotNone(pdf_parser_v2, "pdf_parser_v2 should not be None")
         except ImportError as e:
             self.fail(f"Failed to import pdf_parser_v2: {e}")
diff --git a/examples/Python3.12/docling-parse-example/sub-test3.py b/examples/Python3.12/docling-parse-example/sub-test3.py
index 0266a39..1ae5100 100644
--- a/examples/Python3.12/docling-parse-example/sub-test3.py
+++ b/examples/Python3.12/docling-parse-example/sub-test3.py
@@ -34,7 +34,7 @@ def test_parse_pdf_with_docling(self):
             self.skipTest("Test PDF not available")
         
         try:
-            from docling_parse.docling_parse import pdf_parser_v2
+            from docling_parse import pdf_parser_v2
             
             # Parse the test PDF
             doc = pdf_parser_v2(self.test_pdf_path)
@@ -56,7 +56,7 @@ def test_parse_pdf_with_docling(self):
     def test_docling_parse_error_handling(self):
         """Test error handling for non-existent PDF"""
         try:
-            from docling_parse.docling_parse import pdf_parser_v2
+            from docling_parse import pdf_parser_v2
             
             # Try to parse a non-existent file
             non_existent_pdf = "this_file_does_not_exist.pdf"

From 120bf4c8f3ced4244967edfc85e8ca4d854a186d Mon Sep 17 00:00:00 2001
From: Rushikesh Sathe <rushikeshsathe@Rushikeshs-MacBook-Air.local>
Date: Fri, 22 May 2026 12:13:20 +0530
Subject: [PATCH 4/7] updated

---
 .../docling_parse_example.py                  | 25 ++++++++++---------
 .../docling-parse-example/sub-test1.py        | 17 ++++++++++---
 .../docling-parse-example/sub-test3.py        | 18 ++++++-------
 3 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/examples/Python3.12/docling-parse-example/docling_parse_example.py b/examples/Python3.12/docling-parse-example/docling_parse_example.py
index 6cd3fbf..265c6ef 100644
--- a/examples/Python3.12/docling-parse-example/docling_parse_example.py
+++ b/examples/Python3.12/docling-parse-example/docling_parse_example.py
@@ -1,7 +1,7 @@
 import sys
 from pathlib import Path
 from typing import List, Dict
-from docling_parse import pdf_parser_v2
+from docling_parse.pdf_parser import DoclingPdfParser
 
 def parse_pdf_with_docling(pdf_path: str) -> str:
     """
@@ -14,22 +14,23 @@ def parse_pdf_with_docling(pdf_path: str) -> str:
         Extracted text content from the PDF
     """
     try:
-        # Parse the PDF document using docling-parse v2
-        doc = pdf_parser_v2(pdf_path)
+        # Create parser instance and parse the PDF document
+        parser = DoclingPdfParser()
+        doc = parser.parse(pdf_path)
         
-        # Extract text content
+        # Extract text content from pages
         text_content = ""
         
-        # Try different methods to extract text
-        if hasattr(doc, 'text') and doc.text:
-            text_content = doc.text
-        elif hasattr(doc, 'pages') and doc.pages:
-            # Extract text from each page
+        if hasattr(doc, 'pages') and doc.pages:
             for page in doc.pages:
-                if hasattr(page, 'text'):
-                    text_content += page.text + "\n"
+                # Extract text from cells in the page
+                if hasattr(page, 'cells'):
+                    for cell in page.cells:
+                        if hasattr(cell, 'text'):
+                            text_content += cell.text + " "
+                    text_content += "\n"
         
-        return text_content if text_content else "No text content extracted"
+        return text_content.strip() if text_content else "No text content extracted"
         
     except Exception as e:
         return f"Error parsing PDF: {str(e)}"
diff --git a/examples/Python3.12/docling-parse-example/sub-test1.py b/examples/Python3.12/docling-parse-example/sub-test1.py
index 8dd8bf2..83d5623 100644
--- a/examples/Python3.12/docling-parse-example/sub-test1.py
+++ b/examples/Python3.12/docling-parse-example/sub-test1.py
@@ -15,12 +15,21 @@ def test_docling_parse_version(self):
         assert "5.8.0" in version, f"Expected docling-parse 5.8.0, got {version}"
 
     def test_pdf_parser_import(self):
-        """Check if pdf_parser_v2 can be imported"""
+        """Check if DoclingPdfParser can be imported"""
         try:
-            from docling_parse import pdf_parser_v2
-            self.assertIsNotNone(pdf_parser_v2, "pdf_parser_v2 should not be None")
+            from docling_parse.pdf_parser import DoclingPdfParser
+            self.assertIsNotNone(DoclingPdfParser, "DoclingPdfParser should not be None")
         except ImportError as e:
-            self.fail(f"Failed to import pdf_parser_v2: {e}")
+            self.fail(f"Failed to import DoclingPdfParser: {e}")
+
+    def test_pdf_parser_instantiation(self):
+        """Check if DoclingPdfParser can be instantiated"""
+        try:
+            from docling_parse.pdf_parser import DoclingPdfParser
+            parser = DoclingPdfParser()
+            self.assertIsNotNone(parser, "Parser instance should not be None")
+        except Exception as e:
+            self.fail(f"Failed to instantiate DoclingPdfParser: {e}")
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/examples/Python3.12/docling-parse-example/sub-test3.py b/examples/Python3.12/docling-parse-example/sub-test3.py
index 1ae5100..1ff2cb6 100644
--- a/examples/Python3.12/docling-parse-example/sub-test3.py
+++ b/examples/Python3.12/docling-parse-example/sub-test3.py
@@ -34,21 +34,20 @@ def test_parse_pdf_with_docling(self):
             self.skipTest("Test PDF not available")
         
         try:
-            from docling_parse import pdf_parser_v2
+            from docling_parse.pdf_parser import DoclingPdfParser
             
-            # Parse the test PDF
-            doc = pdf_parser_v2(self.test_pdf_path)
+            # Create parser and parse the test PDF
+            parser = DoclingPdfParser()
+            doc = parser.parse(self.test_pdf_path)
             
             # Basic assertions
             self.assertIsNotNone(doc, "Parsed document should not be None")
             
             # Check if document has expected attributes
             has_pages = hasattr(doc, 'pages')
-            has_text = hasattr(doc, 'text')
             
-            # At least one of these should be true for a valid parse
-            self.assertTrue(has_pages or has_text, 
-                          "Parsed document should have pages or text attribute")
+            # Document should have pages
+            self.assertTrue(has_pages, "Parsed document should have pages attribute")
             
         except Exception as e:
             self.fail(f"Failed to parse PDF with docling-parse: {e}")
@@ -56,13 +55,14 @@ def test_parse_pdf_with_docling(self):
     def test_docling_parse_error_handling(self):
         """Test error handling for non-existent PDF"""
         try:
-            from docling_parse import pdf_parser_v2
+            from docling_parse.pdf_parser import DoclingPdfParser
             
             # Try to parse a non-existent file
             non_existent_pdf = "this_file_does_not_exist.pdf"
+            parser = DoclingPdfParser()
             
             with self.assertRaises(Exception):
-                pdf_parser_v2(non_existent_pdf)
+                parser.parse(non_existent_pdf)
                 
         except ImportError:
             self.skipTest("docling-parse not available")

From 08c55555f899b0d38a2b6fb6e09b5335caa62a0b Mon Sep 17 00:00:00 2001
From: Rushikesh Sathe <rushikeshsathe@Rushikeshs-MacBook-Air.local>
Date: Fri, 22 May 2026 12:17:28 +0530
Subject: [PATCH 5/7] Call DoclingPdfParser.load() instead of parse()

---
 .../docling-parse-example/docling_parse_example.py        | 4 ++--
 examples/Python3.12/docling-parse-example/sub-test3.py    | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/Python3.12/docling-parse-example/docling_parse_example.py b/examples/Python3.12/docling-parse-example/docling_parse_example.py
index 265c6ef..6edc539 100644
--- a/examples/Python3.12/docling-parse-example/docling_parse_example.py
+++ b/examples/Python3.12/docling-parse-example/docling_parse_example.py
@@ -14,9 +14,9 @@ def parse_pdf_with_docling(pdf_path: str) -> str:
         Extracted text content from the PDF
     """
     try:
-        # Create parser instance and parse the PDF document
+        # Create parser instance and load the PDF document
         parser = DoclingPdfParser()
-        doc = parser.parse(pdf_path)
+        doc = parser.load(pdf_path)
         
         # Extract text content from pages
         text_content = ""
diff --git a/examples/Python3.12/docling-parse-example/sub-test3.py b/examples/Python3.12/docling-parse-example/sub-test3.py
index 1ff2cb6..6cd156f 100644
--- a/examples/Python3.12/docling-parse-example/sub-test3.py
+++ b/examples/Python3.12/docling-parse-example/sub-test3.py
@@ -36,9 +36,9 @@ def test_parse_pdf_with_docling(self):
         try:
             from docling_parse.pdf_parser import DoclingPdfParser
             
-            # Create parser and parse the test PDF
+            # Create parser and load the test PDF
             parser = DoclingPdfParser()
-            doc = parser.parse(self.test_pdf_path)
+            doc = parser.load(self.test_pdf_path)
             
             # Basic assertions
             self.assertIsNotNone(doc, "Parsed document should not be None")
@@ -57,12 +57,12 @@ def test_docling_parse_error_handling(self):
         try:
             from docling_parse.pdf_parser import DoclingPdfParser
             
-            # Try to parse a non-existent file
+            # Try to load a non-existent file
             non_existent_pdf = "this_file_does_not_exist.pdf"
             parser = DoclingPdfParser()
             
             with self.assertRaises(Exception):
-                parser.parse(non_existent_pdf)
+                parser.load(non_existent_pdf)
                 
         except ImportError:
             self.skipTest("docling-parse not available")

From ed51bd6149e7e7461b66518abda5c8ebd880f867 Mon Sep 17 00:00:00 2001
From: Rushikesh Sathe <rushikeshsathe@Rushikeshs-MacBook-Air.local>
Date: Fri, 22 May 2026 12:22:42 +0530
Subject: [PATCH 6/7] Read pages via number_of_pages()/get_page() and word_cells

---
 .../docling_parse_example.py                   | 18 +++++++++---------
 .../docling-parse-example/sub-test3.py         | 18 ++++++++++++++----
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/examples/Python3.12/docling-parse-example/docling_parse_example.py b/examples/Python3.12/docling-parse-example/docling_parse_example.py
index 6edc539..855f5c9 100644
--- a/examples/Python3.12/docling-parse-example/docling_parse_example.py
+++ b/examples/Python3.12/docling-parse-example/docling_parse_example.py
@@ -18,17 +18,17 @@ def parse_pdf_with_docling(pdf_path: str) -> str:
         parser = DoclingPdfParser()
         doc = parser.load(pdf_path)
         
-        # Extract text content from pages
+        # Extract text content from all pages
         text_content = ""
         
-        if hasattr(doc, 'pages') and doc.pages:
-            for page in doc.pages:
-                # Extract text from cells in the page
-                if hasattr(page, 'cells'):
-                    for cell in page.cells:
-                        if hasattr(cell, 'text'):
-                            text_content += cell.text + " "
-                    text_content += "\n"
+        for page_num in range(1, doc.number_of_pages() + 1):
+            page = doc.get_page(page_num)
+            
+            # Extract text from word cells
+            if hasattr(page, 'word_cells') and page.word_cells:
+                for cell in page.word_cells:
+                    text_content += cell.text + " "
+                text_content += "\n"
         
         return text_content.strip() if text_content else "No text content extracted"
         
diff --git a/examples/Python3.12/docling-parse-example/sub-test3.py b/examples/Python3.12/docling-parse-example/sub-test3.py
index 6cd156f..997ba66 100644
--- a/examples/Python3.12/docling-parse-example/sub-test3.py
+++ b/examples/Python3.12/docling-parse-example/sub-test3.py
@@ -43,11 +43,21 @@ def test_parse_pdf_with_docling(self):
             # Basic assertions
             self.assertIsNotNone(doc, "Parsed document should not be None")
             
-            # Check if document has expected attributes
-            has_pages = hasattr(doc, 'pages')
+            # Check if document has number_of_pages method
+            self.assertTrue(hasattr(doc, 'number_of_pages'),
+                          "Document should have number_of_pages method")
             
-            # Document should have pages
-            self.assertTrue(has_pages, "Parsed document should have pages attribute")
+            # Check that we can get pages
+            num_pages = doc.number_of_pages()
+            self.assertGreater(num_pages, 0, "Document should have at least one page")
+            
+            # Try to get the first page
+            page = doc.get_page(1)
+            self.assertIsNotNone(page, "Should be able to get page 1")
+            
+            # Check that page has word_cells
+            self.assertTrue(hasattr(page, 'word_cells'),
+                          "Page should have word_cells attribute")
             
         except Exception as e:
             self.fail(f"Failed to parse PDF with docling-parse: {e}")

From df9cf4445b454b97a545879a5b217e1a6c4d570c Mon Sep 17 00:00:00 2001
From: Rushikesh Sathe <Rushikesh.Sathe1@ibm.com>
Date: Fri, 22 May 2026 08:35:00 +0000
Subject: [PATCH 7/7] Remove "Made with Bob" trailers; make install script executable

---
 examples/Python3.12/docling-parse-example/README.md             | 2 +-
 .../Python3.12/docling-parse-example/docling_parse_example.py   | 1 -
 .../Python3.12/docling-parse-example/install_test_example.sh    | 1 -
 examples/Python3.12/docling-parse-example/sub-test1.py          | 1 -
 examples/Python3.12/docling-parse-example/sub-test2.py          | 1 -
 examples/Python3.12/docling-parse-example/sub-test3.py          | 1 -
 6 files changed, 1 insertion(+), 6 deletions(-)
 mode change 100644 => 100755 examples/Python3.12/docling-parse-example/install_test_example.sh

diff --git a/examples/Python3.12/docling-parse-example/README.md b/examples/Python3.12/docling-parse-example/README.md
index fd2f921..31d2bdb 100644
--- a/examples/Python3.12/docling-parse-example/README.md
+++ b/examples/Python3.12/docling-parse-example/README.md
@@ -42,4 +42,4 @@ python3.12 docling_parse_example.py [optional_pdf_path]
 ```
 
 ### License:
-It's covered under Apache 2.0 licenses
\ No newline at end of file
+It's covered under Apache 2.0 licenses
diff --git a/examples/Python3.12/docling-parse-example/docling_parse_example.py b/examples/Python3.12/docling-parse-example/docling_parse_example.py
index 855f5c9..d2fca00 100644
--- a/examples/Python3.12/docling-parse-example/docling_parse_example.py
+++ b/examples/Python3.12/docling-parse-example/docling_parse_example.py
@@ -207,4 +207,3 @@ def create_sample_pdf():
     print("  - LLM for generation (e.g., vLLM, Ollama)")
     print("=" * 70)
 
-# Made with Bob
diff --git a/examples/Python3.12/docling-parse-example/install_test_example.sh b/examples/Python3.12/docling-parse-example/install_test_example.sh
old mode 100644
new mode 100755
index a2ca1c4..509a866
--- a/examples/Python3.12/docling-parse-example/install_test_example.sh
+++ b/examples/Python3.12/docling-parse-example/install_test_example.sh
@@ -65,4 +65,3 @@ python3.12 sub-test1.py
 python3.12 sub-test2.py
 python3.12 sub-test3.py
 
-# Made with Bob
diff --git a/examples/Python3.12/docling-parse-example/sub-test1.py b/examples/Python3.12/docling-parse-example/sub-test1.py
index 83d5623..8f00b64 100644
--- a/examples/Python3.12/docling-parse-example/sub-test1.py
+++ b/examples/Python3.12/docling-parse-example/sub-test1.py
@@ -34,4 +34,3 @@ def test_pdf_parser_instantiation(self):
 if __name__ == "__main__":
     unittest.main()
 
-# Made with Bob
diff --git a/examples/Python3.12/docling-parse-example/sub-test2.py b/examples/Python3.12/docling-parse-example/sub-test2.py
index 1aa2538..181873d 100644
--- a/examples/Python3.12/docling-parse-example/sub-test2.py
+++ b/examples/Python3.12/docling-parse-example/sub-test2.py
@@ -41,4 +41,3 @@ def test_pillow_import(self):
 if __name__ == "__main__":
     unittest.main()
 
-# Made with Bob
diff --git a/examples/Python3.12/docling-parse-example/sub-test3.py b/examples/Python3.12/docling-parse-example/sub-test3.py
index 997ba66..41933f3 100644
--- a/examples/Python3.12/docling-parse-example/sub-test3.py
+++ b/examples/Python3.12/docling-parse-example/sub-test3.py
@@ -80,4 +80,3 @@ def test_docling_parse_error_handling(self):
 if __name__ == "__main__":
     unittest.main()
 
-# Made with Bob