Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions examples/Python3.12/docling-parse-example/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
## Purpose: Simple RAG (Retrieval-Augmented Generation) Application with Docling-Parse

### Packages used:
- docling-parse (PDF parsing)
- reportlab (PDF creation for testing)
- Pillow (Image processing support)
- numpy (Numerical operations)

### Functionality:
This example demonstrates a simplified RAG (Retrieval-Augmented Generation) pipeline:

1. **Document Parsing**: Uses docling-parse (`DoclingPdfParser`) to extract text from PDF documents
2. **Text Chunking**: Splits extracted text into manageable chunks for retrieval
3. **Keyword-based Retrieval**: Implements simple keyword matching to find relevant chunks
4. **Result Display**: Shows top matching chunks based on query

### RAG Pipeline Steps:
1. Parse PDF document with docling-parse
2. Extract text content from parsed document
3. Create document chunks (configurable size)
4. Perform keyword-based retrieval on chunks
5. Display top relevant results

### Note:
This is a simplified RAG demonstration. Production RAG systems typically use:
- Vector embeddings (e.g., sentence-transformers, OpenAI embeddings)
- Vector databases (e.g., FAISS, Chroma, Pinecone)
- LLM for generation (e.g., vLLM, Ollama, OpenAI GPT)

### How to run the example:
```bash
chmod +x install_test_example.sh
./install_test_example.sh
```

Or manually:
```bash
python3.12 -m venv .venv
source .venv/bin/activate
pip install --extra-index-url https://wheels.developerfirst.ibm.com/ppc64le/linux -r requirements.txt
python3.12 docling_parse_example.py [optional_pdf_path]
```

### License:
This example is licensed under the Apache License 2.0.
209 changes: 209 additions & 0 deletions examples/Python3.12/docling-parse-example/docling_parse_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
import sys
from pathlib import Path
from typing import Any, Dict, List

from docling_parse.pdf_parser import DoclingPdfParser

def parse_pdf_with_docling(pdf_path: str) -> str:
    """
    Parse a PDF document using docling-parse and extract text content.

    The words of each page are joined with single spaces and pages are
    separated by newlines. Pages that expose no word cells are skipped.

    Args:
        pdf_path: Path to the PDF file to parse

    Returns:
        Extracted text content from the PDF. If the document yields no
        text, the placeholder "No text content extracted" is returned.
        If anything raises while parsing, the exception is not propagated;
        a message starting with "Error parsing PDF:" is returned instead,
        so callers must inspect the string to detect failure.
    """
    try:
        # Create parser instance and load the PDF document
        parser = DoclingPdfParser()
        doc = parser.load(pdf_path)

        # Collect one string per page and join once at the end instead of
        # growing a single string with += inside nested loops.
        page_texts = []

        # Pages are requested by 1-based number, hence the shifted range.
        for page_num in range(1, doc.number_of_pages() + 1):
            page = doc.get_page(page_num)

            # Extract text from word cells (absent or empty -> skip page)
            word_cells = getattr(page, "word_cells", None)
            if word_cells:
                page_texts.append(" ".join(cell.text for cell in word_cells))

        # Test emptiness after stripping so whitespace-only output also
        # falls back to the placeholder.
        text_content = "\n".join(page_texts).strip()
        return text_content or "No text content extracted"

    except Exception as e:
        # Deliberate catch-all: this demo reports failures in-band.
        return f"Error parsing PDF: {str(e)}"


def create_document_chunks(text: str, chunk_size: int = 500) -> List[str]:
    """
    Break document text into word-aligned chunks for RAG processing.

    Words are never split. Each word is budgeted as its length plus one
    separator character, and a chunk is closed as soon as adding the next
    word would push the running budget above ``chunk_size``. A single word
    larger than the budget still becomes a chunk of its own.

    Args:
        text: Full document text
        chunk_size: Character budget for each chunk

    Returns:
        List of text chunks, each with its words joined by single spaces
    """
    pieces: List[str] = []
    pending: List[str] = []
    pending_len = 0

    for token in text.split():
        cost = len(token) + 1  # the word plus its separating space
        # Flush first when this word would overflow a non-empty chunk.
        if pending and pending_len + cost > chunk_size:
            pieces.append(" ".join(pending))
            pending, pending_len = [], 0
        pending.append(token)
        pending_len += cost

    # Emit whatever is left over after the last word.
    if pending:
        pieces.append(" ".join(pending))

    return pieces


def simple_keyword_search(chunks: List[str], query: str) -> List[Dict[str, Any]]:
    """
    Simple keyword-based retrieval (mock RAG retrieval).
    In a real RAG system, this would use embeddings and vector search.

    Matching is case-insensitive and substring-based: a query word counts
    as a hit when it occurs anywhere in the chunk, so "learn" also matches
    "learning". Each distinct query word contributes at most one point.

    Args:
        chunks: List of document chunks
        query: Search query; split on whitespace into keywords

    Returns:
        One dict per chunk that matched at least one keyword, with keys
        "chunk_id" (index into ``chunks``), "text" (the full chunk),
        "score" (number of distinct keywords found) and "preview" (the
        first 200 characters followed by "..." when the chunk is longer).
        Sorted by score, highest first; ties keep their original order.
    """
    query_words = set(query.lower().split())

    results = []
    for idx, chunk in enumerate(chunks):
        chunk_lower = chunk.lower()
        # Simple scoring: count the distinct keywords present in the chunk
        score = sum(word in chunk_lower for word in query_words)

        if score:
            results.append({
                "chunk_id": idx,
                "text": chunk,
                "score": score,
                "preview": chunk[:200] + "..." if len(chunk) > 200 else chunk,
            })

    # Sort by score (descending); list.sort is stable, preserving chunk order
    results.sort(key=lambda item: item["score"], reverse=True)
    return results


def create_sample_pdf():
    """
    Write a single-page sample PDF with RAG-relevant content.

    The file is written to "rag_sample_document.pdf" relative to the
    current working directory, using reportlab.

    Returns:
        The path of the generated PDF, or None when reportlab cannot be
        imported.
    """
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas

        output_name = "rag_sample_document.pdf"
        sheet = canvas.Canvas(output_name, pagesize=letter)

        # Sample text about AI and machine learning, one entry per line
        sample_lines = (
            "Introduction to Artificial Intelligence",
            "",
            "Artificial Intelligence (AI) is the simulation of human intelligence",
            "processes by machines, especially computer systems. These processes",
            "include learning, reasoning, and self-correction.",
            "",
            "Machine Learning is a subset of AI that provides systems the ability",
            "to automatically learn and improve from experience without being",
            "explicitly programmed. Deep learning is a subset of machine learning.",
            "",
            "Natural Language Processing (NLP) is a branch of AI that helps",
            "computers understand, interpret and manipulate human language.",
            "NLP draws from many disciplines, including computer science and",
            "computational linguistics.",
        )

        # First line is drawn at y=750; every following line sits 20 lower.
        for row, text in enumerate(sample_lines):
            sheet.drawString(50, 750 - 20 * row, text)

        sheet.showPage()
        sheet.save()

        print(f"✓ Created sample PDF: {output_name}")
        return output_name

    except ImportError:
        print("✗ reportlab not available, skipping PDF creation")
        return None


def main() -> int:
    """
    Run the demo pipeline: obtain a PDF, parse it, chunk it and search it.

    The first command-line argument is used as the PDF path when given;
    otherwise a sample PDF is generated with create_sample_pdf().

    Returns:
        Process exit status: 0 on success, 1 when no usable PDF is
        available or when no text could be extracted from it.
    """
    print("=" * 70)
    print("Simple RAG Application with Docling-Parse")
    print("=" * 70)

    # Step 1: Get or create PDF document
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
        print(f"\n[1/5] Using provided PDF: {pdf_path}")
    else:
        print("\n[1/5] Creating sample PDF document...")
        pdf_path = create_sample_pdf()
        if not pdf_path:
            print("\n✗ No PDF provided and couldn't create sample PDF")
            print("Usage: python docling_parse_example.py [path_to_pdf]")
            return 1

    # Check if file exists
    if not Path(pdf_path).exists():
        print(f"\n✗ Error: PDF file not found: {pdf_path}")
        return 1

    # Step 2: Parse PDF with docling-parse
    print("\n[2/5] Parsing PDF with docling-parse...")
    document_text = parse_pdf_with_docling(pdf_path)

    # parse_pdf_with_docling reports failures in-band as strings. Detect
    # them here instead of chunking and searching the error message.
    parse_failed = document_text.startswith("Error parsing PDF:")
    nothing_extracted = document_text == "No text content extracted"
    if parse_failed or nothing_extracted:
        print(f"\n✗ {document_text}")
        return 1
    print(f"✓ Extracted {len(document_text)} characters")

    # Step 3: Create document chunks
    print("\n[3/5] Creating document chunks for RAG...")
    chunks = create_document_chunks(document_text, chunk_size=300)
    print(f"✓ Created {len(chunks)} chunks")

    # Step 4: Demonstrate retrieval
    print("\n[4/5] Performing keyword-based retrieval...")
    query = "machine learning"
    print(f"Query: '{query}'")

    results = simple_keyword_search(chunks, query)
    print(f"✓ Found {len(results)} relevant chunks")

    # Step 5: Display results
    print("\n[5/5] Top Retrieved Chunks:")
    print("=" * 70)

    for result in results[:3]:  # Show top 3
        print(f"\nChunk #{result['chunk_id']} (Score: {result['score']})")
        print("-" * 70)
        print(result['preview'])

    print("\n" + "=" * 70)
    print("RAG Example completed successfully!")
    print("=" * 70)
    print("\nNote: This is a simplified RAG demonstration.")
    print("Production RAG systems would use:")
    print("  - Vector embeddings (e.g., sentence-transformers)")
    print("  - Vector databases (e.g., FAISS, Chroma)")
    print("  - LLM for generation (e.g., vLLM, Ollama)")
    print("=" * 70)
    return 0


if __name__ == "__main__":
    sys.exit(main())

67 changes: 67 additions & 0 deletions examples/Python3.12/docling-parse-example/install_test_example.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/bin/bash
# -------------------------------
# Function to detect Linux distro
# -------------------------------
detect_distro() {
    # Print an identifier for the running Linux distribution on stdout.
    # Falls back to "unknown" when nothing recognisable is found.
    if [ -f /etc/os-release ]; then
        # Sourcing the file defines ID; guard against it being unset or
        # empty so the caller never receives a blank identifier.
        . /etc/os-release
        echo "${ID:-unknown}"
    elif [ -f /etc/redhat-release ]; then
        echo "rhel"
    elif [ -f /etc/debian_version ]; then
        echo "debian"
    else
        echo "unknown"
    fi
}

# Stop at the first failing command so a broken install or a failing
# example run is reflected in the script's exit status.
set -e

DISTRO=$(detect_distro)
echo "Detected distribution: $DISTRO"
echo "Installing prerequisites..."

# -------------------------------
# Install system dependencies
# -------------------------------
case "$DISTRO" in
    "fedora"|"rhel"|"centos"|"rocky"|"almalinux")
        if command -v dnf >/dev/null 2>&1; then
            sudo dnf install -y python3.12 python3.12-devel python3-pip
        else
            sudo yum install -y python3.12 python3.12-devel python3-pip
        fi
        ;;
    "ubuntu"|"debian")
        export DEBIAN_FRONTEND=noninteractive
        sudo apt update
        sudo apt install -y python3.12 python3.12-dev python3-pip python3.12-venv
        ;;
    "sles")
        sudo zypper refresh
        sudo zypper install -y python312 python312-pip
        ;;
    *)
        echo "Unsupported distribution: $DISTRO"
        exit 1
        ;;
esac

# -------------------------------
# Create the virtual environment and install Python dependencies
# -------------------------------
python3.12 -m venv .venv
source .venv/bin/activate

python3.12 -m pip install --no-cache-dir --prefer-binary --extra-index-url https://wheels.developerfirst.ibm.com/ppc64le/linux -r requirements.txt

WORKDIR=$(pwd)

cd "$WORKDIR"

# -------------------------------
# Run the example, then the tests
# -------------------------------
python3.12 docling_parse_example.py

# printf, because bash's builtin echo prints "\n" literally without -e.
printf '\n ==== Running tests ==== \n\n'

# Run every test file even if an earlier one fails, then report the
# combined result through the exit status.
status=0
for test_file in sub-test1.py sub-test2.py sub-test3.py; do
    python3.12 "$test_file" || status=1
done

exit "$status"

4 changes: 4 additions & 0 deletions examples/Python3.12/docling-parse-example/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
docling-parse==5.8.0+ppc64le1
reportlab==4.2.5
Pillow==11.1.0
numpy==2.4.4+ppc64le1
36 changes: 36 additions & 0 deletions examples/Python3.12/docling-parse-example/sub-test1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import unittest
import importlib.metadata

class TestDoclingParseLibrary(unittest.TestCase):
    """Smoke tests checking that docling-parse is installed and usable."""

    def test_docling_parse_import(self):
        """Check if docling-parse can be imported"""
        try:
            import docling_parse  # noqa: F401 - the import itself is the check
        except ImportError:
            self.fail("docling-parse is not installed")

    def test_docling_parse_version(self):
        """Verify docling-parse version"""
        version = importlib.metadata.version("docling-parse")
        # Use a unittest assertion: a bare assert is stripped under
        # `python -O` and would make this test pass unconditionally.
        self.assertIn("5.8.0", version, f"Expected docling-parse 5.8.0, got {version}")

    def test_pdf_parser_import(self):
        """Check if DoclingPdfParser can be imported"""
        try:
            from docling_parse.pdf_parser import DoclingPdfParser
        except ImportError as e:
            self.fail(f"Failed to import DoclingPdfParser: {e}")
        self.assertIsNotNone(DoclingPdfParser, "DoclingPdfParser should not be None")

    def test_pdf_parser_instantiation(self):
        """Check if DoclingPdfParser can be instantiated"""
        try:
            from docling_parse.pdf_parser import DoclingPdfParser
            parser = DoclingPdfParser()
        except Exception as e:
            self.fail(f"Failed to instantiate DoclingPdfParser: {e}")
        # Kept outside the try so an assertion failure is reported as
        # itself rather than re-wrapped by the broad handler above.
        self.assertIsNotNone(parser, "Parser instance should not be None")


if __name__ == "__main__":
    unittest.main()

Loading