diff --git a/examples/Python3.12/docling-parse-example/README.md b/examples/Python3.12/docling-parse-example/README.md
new file mode 100644
index 0000000..31d2bdb
--- /dev/null
+++ b/examples/Python3.12/docling-parse-example/README.md
@@ -0,0 +1,45 @@
+## Purpose: Simple RAG (Retrieval-Augmented Generation) Application with Docling-Parse
+
+### Packages used:
+- docling-parse (PDF parsing)
+- reportlab (PDF creation for testing)
+- Pillow (Image processing support)
+- numpy (Numerical operations)
+
+### Functionality:
+This example demonstrates a simplified RAG (Retrieval-Augmented Generation) pipeline:
+
+1. **Document Parsing**: Uses docling-parse (`DoclingPdfParser`) to extract text from PDF documents
+2. **Text Chunking**: Splits extracted text into manageable chunks for retrieval
+3. **Keyword-based Retrieval**: Implements simple keyword matching to find relevant chunks
+4. **Result Display**: Shows top matching chunks based on query
+
+### RAG Pipeline Steps:
+1. Parse PDF document with docling-parse
+2. Extract text content from parsed document
+3. Create document chunks (configurable size)
+4. Perform keyword-based retrieval on chunks
+5. Display top relevant results
+
+### Note:
+This is a simplified RAG demonstration.
Production RAG systems typically use:
+- Vector embeddings (e.g., sentence-transformers, OpenAI embeddings)
+- Vector databases (e.g., FAISS, Chroma, Pinecone)
+- LLM for generation (e.g., vLLM, Ollama, OpenAI GPT)
+
+### How to run the example:
+```bash
+chmod +x install_test_example.sh
+./install_test_example.sh
+```
+
+Or manually:
+```bash
+python3.12 -m venv .venv
+source .venv/bin/activate
+pip install --extra-index-url https://wheels.developerfirst.ibm.com/ppc64le/linux -r requirements.txt
+python3.12 docling_parse_example.py [optional_pdf_path]
+```
+
+### License:
+This example is covered under the Apache 2.0 license.
diff --git a/examples/Python3.12/docling-parse-example/docling_parse_example.py b/examples/Python3.12/docling-parse-example/docling_parse_example.py
new file mode 100644
index 0000000..d2fca00
--- /dev/null
+++ b/examples/Python3.12/docling-parse-example/docling_parse_example.py
@@ -0,0 +1,264 @@
+"""Simple keyword-based RAG demo built on docling-parse.
+
+Pipeline: parse a PDF, split the text into chunks, score the chunks against a
+query by keyword matching, and print the best matches.
+"""
+
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from docling_parse.pdf_parser import DoclingPdfParser
+
+NO_TEXT_MESSAGE = "No text content extracted"
+
+
+def extract_pdf_text(pdf_path: str) -> str:
+    """
+    Extract the text of every page of a PDF with docling-parse.
+
+    Args:
+        pdf_path: Path to the PDF file to parse
+
+    Returns:
+        Text of all pages (words joined by spaces, pages joined by newlines),
+        or an empty string when no words were found
+
+    Raises:
+        Exception: Whatever docling-parse raises when the file cannot be
+            loaded or parsed
+    """
+    parser = DoclingPdfParser()
+    doc = parser.load(pdf_path)
+
+    pages = []
+    for page_num in range(1, doc.number_of_pages() + 1):
+        page = doc.get_page(page_num)
+        # Tolerate pages that have no word_cells attribute or an empty one.
+        cells = getattr(page, "word_cells", None) or []
+        pages.append(" ".join(cell.text for cell in cells))
+
+    return "\n".join(pages).strip()
+
+
+def parse_pdf_with_docling(pdf_path: str) -> str:
+    """
+    Parse a PDF document using docling-parse and extract text content.
+
+    This wrapper never raises; use extract_pdf_text() when a failure must be
+    told apart from real document text.
+
+    Args:
+        pdf_path: Path to the PDF file to parse
+
+    Returns:
+        Extracted text content from the PDF, "No text content extracted" when
+        no words were found, or "Error parsing PDF: <reason>" on failure
+    """
+    try:
+        text_content = extract_pdf_text(pdf_path)
+    except Exception as e:  # deliberate catch-all: errors become a message
+        return f"Error parsing PDF: {e}"
+    return text_content or NO_TEXT_MESSAGE
+
+
+def create_document_chunks(text: str, chunk_size: int = 500) -> List[str]:
+    """
+    Split document text into smaller chunks for RAG processing.
+
+    Words are never split, so a single word longer than chunk_size becomes a
+    chunk of its own.
+
+    Args:
+        text: Full document text
+        chunk_size: Maximum size of each chunk in characters
+
+    Returns:
+        List of text chunks (empty when text contains no words)
+    """
+    chunks = []
+    current_chunk = []
+    current_size = 0  # always len(" ".join(current_chunk))
+
+    for word in text.split():
+        # Joining adds a space only when the chunk already holds a word.
+        projected = current_size + len(word) + (1 if current_chunk else 0)
+        if projected > chunk_size and current_chunk:
+            chunks.append(" ".join(current_chunk))
+            current_chunk = [word]
+            current_size = len(word)
+        else:
+            current_chunk.append(word)
+            current_size = projected
+
+    if current_chunk:
+        chunks.append(" ".join(current_chunk))
+
+    return chunks
+
+
+def simple_keyword_search(chunks: List[str], query: str) -> List[Dict[str, Any]]:
+    """
+    Simple keyword-based retrieval (mock RAG retrieval).
+    In a real RAG system, this would use embeddings and vector search.
+
+    A query word matches when it occurs anywhere in the chunk, ignoring case,
+    so "learn" also matches "learning".
+
+    Args:
+        chunks: List of document chunks
+        query: Search query
+
+    Returns:
+        One dict per matching chunk with the keys "chunk_id", "text", "score"
+        and "preview", sorted by score (highest first)
+    """
+    query_words = set(query.lower().split())
+    results = []
+
+    for idx, chunk in enumerate(chunks):
+        chunk_lower = chunk.lower()
+        # Simple scoring: count the distinct query words found in the chunk
+        matches = sum(1 for word in query_words if word in chunk_lower)
+        if matches == 0:
+            continue
+        results.append({
+            "chunk_id": idx,
+            "text": chunk,
+            "score": matches,
+            "preview": chunk[:200] + "..." if len(chunk) > 200 else chunk,
+        })
+
+    # Sort by score (descending); ties keep their document order
+    results.sort(key=lambda item: item["score"], reverse=True)
+    return results
+
+
+def create_sample_pdf() -> Optional[str]:
+    """
+    Create a sample PDF document with RAG-relevant content.
+
+    Returns:
+        Path of the PDF written to the current directory, or None when
+        reportlab cannot be imported
+    """
+    try:
+        from reportlab.pdfgen import canvas
+        from reportlab.lib.pagesizes import letter
+    except ImportError:
+        print("✗ reportlab not available, skipping PDF creation")
+        return None
+
+    pdf_path = "rag_sample_document.pdf"
+    c = canvas.Canvas(pdf_path, pagesize=letter)
+
+    # Add content about AI and machine learning
+    content = [
+        "Introduction to Artificial Intelligence",
+        "",
+        "Artificial Intelligence (AI) is the simulation of human intelligence",
+        "processes by machines, especially computer systems. These processes",
+        "include learning, reasoning, and self-correction.",
+        "",
+        "Machine Learning is a subset of AI that provides systems the ability",
+        "to automatically learn and improve from experience without being",
+        "explicitly programmed. Deep learning is a subset of machine learning.",
+        "",
+        "Natural Language Processing (NLP) is a branch of AI that helps",
+        "computers understand, interpret and manipulate human language.",
+        "NLP draws from many disciplines, including computer science and",
+        "computational linguistics.",
+    ]
+
+    # Draw each entry at x=50, stepping y down by 20 per line from y=750
+    y_position = 750
+    for line in content:
+        c.drawString(50, y_position, line)
+        y_position -= 20
+
+    c.showPage()
+    c.save()
+
+    print(f"✓ Created sample PDF: {pdf_path}")
+    return pdf_path
+
+
+def main(argv: List[str]) -> int:
+    """
+    Run the five-step RAG demo.
+
+    Args:
+        argv: Command-line arguments; argv[1], if present, is the PDF to use
+
+    Returns:
+        Exit code: 0 on success, 1 when no PDF text could be obtained
+    """
+    print("=" * 70)
+    print("Simple RAG Application with Docling-Parse")
+    print("=" * 70)
+
+    # Step 1: Get or create PDF document
+    if len(argv) > 1:
+        pdf_path = argv[1]
+        print(f"\n[1/5] Using provided PDF: {pdf_path}")
+    else:
+        print("\n[1/5] Creating sample PDF document...")
+        pdf_path = create_sample_pdf()
+        if not pdf_path:
+            print("\n✗ No PDF provided and couldn't create sample PDF")
+            print("Usage: python docling_parse_example.py [path_to_pdf]")
+            return 1
+
+    # Check if file exists
+    if not Path(pdf_path).exists():
+        print(f"\n✗ Error: PDF file not found: {pdf_path}")
+        return 1
+
+    # Step 2: Parse PDF with docling-parse
+    print("\n[2/5] Parsing PDF with docling-parse...")
+    try:
+        document_text = extract_pdf_text(pdf_path)
+    except Exception as e:  # top-level boundary: report and stop
+        print(f"✗ Error parsing PDF: {e}")
+        return 1
+    if not document_text:
+        print(f"✗ {NO_TEXT_MESSAGE}")
+        return 1
+    print(f"✓ Extracted {len(document_text)} characters")
+
+    # Step 3: Create document chunks
+    print("\n[3/5] Creating document chunks for RAG...")
+    chunks = create_document_chunks(document_text, chunk_size=300)
+    print(f"✓ Created {len(chunks)} chunks")
+
+    # Step 4: Demonstrate retrieval
+    print("\n[4/5] Performing keyword-based retrieval...")
+    query = "machine learning"
+    print(f"Query: '{query}'")
+
+    results = simple_keyword_search(chunks, query)
+    print(f"✓ Found {len(results)} relevant chunks")
+
+    # Step 5: Display results
+    print("\n[5/5] Top Retrieved Chunks:")
+    print("=" * 70)
+
+    for result in results[:3]:  # Show top 3
+        print(f"\nChunk #{result['chunk_id']} (Score: {result['score']})")
+        print("-" * 70)
+        print(result['preview'])
+
+    print("\n" + "=" * 70)
+    print("RAG Example completed successfully!")
+    print("=" * 70)
+    print("\nNote: This is a simplified RAG demonstration.")
+    print("Production RAG systems would use:")
+    print(" - Vector embeddings (e.g., sentence-transformers)")
+    print(" - Vector databases (e.g., FAISS, Chroma)")
+    print(" - LLM for generation (e.g., vLLM, Ollama)")
+    print("=" * 70)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
diff --git a/examples/Python3.12/docling-parse-example/install_test_example.sh b/examples/Python3.12/docling-parse-example/install_test_example.sh
new file mode 100755
index 0000000..509a866
--- /dev/null
+++ b/examples/Python3.12/docling-parse-example/install_test_example.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+# -------------------------------
+# Function to detect Linux distro
+# -------------------------------
+detect_distro() {
+    if [ -f /etc/os-release ]; then
+        . /etc/os-release
+        echo "$ID"
+    elif [ -f /etc/redhat-release ]; then
+        echo "rhel"
+    elif [ -f /etc/debian_version ]; then
+        echo "debian"
+    else
+        echo "unknown"
+    fi
+}
+
+DISTRO=$(detect_distro)
+echo "Detected distribution: $DISTRO"
+echo "Installing prerequisites..."
+
+# -------------------------------
+# Install system dependencies
+# -------------------------------
+case "$DISTRO" in
+    "fedora"|"rhel"|"centos"|"rocky"|"almalinux")
+        if command -v dnf >/dev/null 2>&1; then
+            sudo dnf install -y python3.12 python3.12-devel python3-pip
+        else
+            sudo yum install -y python3.12 python3.12-devel python3-pip
+        fi
+        ;;
+    "ubuntu"|"debian")
+        export DEBIAN_FRONTEND=noninteractive
+        sudo apt update
+        sudo apt install -y python3.12 python3.12-dev python3-pip python3.12-venv
+        ;;
+    "sles")
+        sudo zypper refresh
+        sudo zypper install -y python312 python312-pip
+
+        ;;
+    *)
+        echo "Unsupported distribution: $DISTRO"
+        exit 1
+        ;;
+esac
+# Abort on the first failing setup step so later steps do not mask the error
+set -e
+
+python3.12 -m venv .venv
+source .venv/bin/activate
+
+python3.12 -m pip install --no-cache --prefer-binary --extra-index-url https://wheels.developerfirst.ibm.com/ppc64le/linux -r requirements.txt
+
+WORKDIR=$(pwd)
+
+cd "$WORKDIR"
+
+python3.12 docling_parse_example.py
+
+printf '\n ==== Running tests ==== \n\n'
+
+python3.12 sub-test1.py || status=1
+python3.12 sub-test2.py || status=1
+python3.12 sub-test3.py || status=1
+exit "${status:-0}"
diff --git a/examples/Python3.12/docling-parse-example/requirements.txt b/examples/Python3.12/docling-parse-example/requirements.txt
new file mode 100644
index 0000000..5f47dff
--- /dev/null
+++ b/examples/Python3.12/docling-parse-example/requirements.txt
@@ -0,0 +1,4 @@
+docling-parse==5.8.0+ppc64le1
+reportlab==4.2.5
+Pillow==11.1.0
+numpy==2.4.4+ppc64le1
diff --git a/examples/Python3.12/docling-parse-example/sub-test1.py b/examples/Python3.12/docling-parse-example/sub-test1.py
new file mode 100644
index 0000000..8f00b64
--- /dev/null
+++ b/examples/Python3.12/docling-parse-example/sub-test1.py
@@ -0,0 +1,43 @@
+import importlib.metadata
+import unittest
+
+
+class TestDoclingParseLibrary(unittest.TestCase):
+    """Smoke tests for the installed docling-parse package."""
+
+    def test_docling_parse_import(self):
+        """Check if docling-parse can be imported"""
+        try:
+            import docling_parse  # noqa: F401
+        except ImportError:
+            self.fail("docling-parse is not installed")
+
+    def test_docling_parse_version(self):
+        """Verify docling-parse version"""
+        version = importlib.metadata.version("docling-parse")
+        # unittest assertion instead of a bare assert, which python -O strips
+        self.assertTrue(
+            version.startswith("5.8.0"),
+            f"Expected docling-parse 5.8.0, got {version}",
+        )
+
+    def test_pdf_parser_import(self):
+        """Check if DoclingPdfParser can be imported"""
+        try:
+            from docling_parse.pdf_parser import DoclingPdfParser
+        except ImportError as e:
+            self.fail(f"Failed to import DoclingPdfParser: {e}")
+        self.assertIsNotNone(DoclingPdfParser, "DoclingPdfParser should not be None")
+
+    def test_pdf_parser_instantiation(self):
+        """Check if DoclingPdfParser can be instantiated"""
+        try:
+            from docling_parse.pdf_parser import DoclingPdfParser
+            parser = DoclingPdfParser()
+        except Exception as e:
+            self.fail(f"Failed to instantiate DoclingPdfParser: {e}")
+        self.assertIsNotNone(parser, "Parser instance should not be None")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/examples/Python3.12/docling-parse-example/sub-test2.py b/examples/Python3.12/docling-parse-example/sub-test2.py
new file mode 100644
index 0000000..181873d
--- /dev/null
+++ b/examples/Python3.12/docling-parse-example/sub-test2.py
@@ -0,0 +1,45 @@
+import unittest
+from pathlib import Path
+
+
+class TestReportlabPDFCreation(unittest.TestCase):
+    """Smoke tests for the PDF-creation dependencies (reportlab, Pillow)."""
+
+    def test_reportlab_import(self):
+        """Check if reportlab can be imported"""
+        try:
+            from reportlab.pdfgen import canvas  # noqa: F401
+            from reportlab.lib.pagesizes import letter  # noqa: F401
+        except ImportError:
+            self.fail("reportlab is not installed")
+
+    def test_create_simple_pdf(self):
+        """Test creating a simple PDF with reportlab"""
+        pdf_path = Path("test_document.pdf")
+        # Remove the file even when creation or the assertion below fails
+        self.addCleanup(pdf_path.unlink, missing_ok=True)
+
+        try:
+            from reportlab.pdfgen import canvas
+            from reportlab.lib.pagesizes import letter
+
+            c = canvas.Canvas(str(pdf_path), pagesize=letter)
+            c.drawString(100, 750, "Test PDF Document")
+            c.showPage()
+            c.save()
+        except Exception as e:
+            self.fail(f"Failed to create PDF: {e}")
+
+        # Verify file was created
+        self.assertTrue(pdf_path.exists(), "PDF file was not created")
+
+    def test_pillow_import(self):
+        """Check if Pillow can be imported"""
+        try:
+            from PIL import Image  # noqa: F401
+        except ImportError:
+            self.fail("Pillow is not installed")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/examples/Python3.12/docling-parse-example/sub-test3.py b/examples/Python3.12/docling-parse-example/sub-test3.py
new file mode 100644
index 0000000..41933f3
--- /dev/null
+++ b/examples/Python3.12/docling-parse-example/sub-test3.py
@@ -0,0 +1,90 @@
+import unittest
+from pathlib import Path
+import sys
+
+
+class TestDoclingParseIntegration(unittest.TestCase):
+    """End-to-end checks of docling-parse against a PDF built with reportlab."""
+
+    # Path of the generated PDF; stays None when the PDF could not be created
+    test_pdf_path = None
+
+    @classmethod
+    def setUpClass(cls):
+        """Create a test PDF before running tests"""
+        pdf_path = Path("integration_test.pdf")
+        try:
+            from reportlab.pdfgen import canvas
+            from reportlab.lib.pagesizes import letter
+
+            c = canvas.Canvas(str(pdf_path), pagesize=letter)
+            c.drawString(100, 750, "Integration Test PDF")
+            c.drawString(100, 730, "This document is used for testing docling-parse")
+            c.drawString(100, 710, "Line 3 of test content")
+            c.showPage()
+            c.save()
+        except Exception as e:
+            print(f"Warning: Could not create test PDF: {e}")
+            # Do not leave a partially written file behind
+            pdf_path.unlink(missing_ok=True)
+            cls.test_pdf_path = None
+        else:
+            cls.test_pdf_path = str(pdf_path)
+
+    @classmethod
+    def tearDownClass(cls):
+        """Clean up test PDF after tests"""
+        if cls.test_pdf_path:
+            Path(cls.test_pdf_path).unlink(missing_ok=True)
+
+    def test_parse_pdf_with_docling(self):
+        """Test parsing a PDF with docling-parse"""
+        if not self.test_pdf_path:
+            self.skipTest("Test PDF not available")
+
+        # Only loading is guarded: assertion failures below must surface with
+        # their own messages instead of being re-reported as parse errors.
+        try:
+            from docling_parse.pdf_parser import DoclingPdfParser
+
+            parser = DoclingPdfParser()
+            doc = parser.load(self.test_pdf_path)
+        except Exception as e:
+            self.fail(f"Failed to parse PDF with docling-parse: {e}")
+
+        # Basic assertions
+        self.assertIsNotNone(doc, "Parsed document should not be None")
+
+        # Check if document has number_of_pages method
+        self.assertTrue(hasattr(doc, 'number_of_pages'),
+                        "Document should have number_of_pages method")
+
+        # Check that we can get pages
+        num_pages = doc.number_of_pages()
+        self.assertGreater(num_pages, 0, "Document should have at least one page")
+
+        # Try to get the first page
+        page = doc.get_page(1)
+        self.assertIsNotNone(page, "Should be able to get page 1")
+
+        # Check that page has word_cells
+        self.assertTrue(hasattr(page, 'word_cells'),
+                        "Page should have word_cells attribute")
+
+    def test_docling_parse_error_handling(self):
+        """Test error handling for non-existent PDF"""
+        try:
+            from docling_parse.pdf_parser import DoclingPdfParser
+        except ImportError:
+            self.skipTest("docling-parse not available")
+
+        # Try to load a non-existent file
+        non_existent_pdf = "this_file_does_not_exist.pdf"
+        parser = DoclingPdfParser()
+
+        with self.assertRaises(Exception):
+            parser.load(non_existent_pdf)
+
+
+if __name__ == "__main__":
+    unittest.main()