-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfix_ocr.py
More file actions
74 lines (62 loc) · 2.67 KB
/
fix_ocr.py
File metadata and controls
74 lines (62 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os
# Full replacement source for backend/app/services/ocr_engine.py; this script
# writes it to disk verbatim. Per the payload's own comments it (1) returns the
# contents of .txt/.md files directly instead of sending them through the
# PDF/OCR path, and (2) builds PaddleOCR with use_angle_cls=False so that
# ocr() is called without the 'cls' argument.
#
# The literal is a raw string so that the "\n" escapes inside the payload reach
# the generated file as backslash-n sequences rather than as real newlines.
# The payload's indentation is significant: it must parse as a Python module.
CORRECT_CODE = r'''import logging
import os

# Suppress PaddleOCR's noisy debug logs via standard logging
logging.getLogger("ppocr").setLevel(logging.ERROR)

from paddleocr import PaddleOCR
from pypdf import PdfReader


class OCREngine:
    def __init__(self):
        # Initialize OCR once
        print("👁️ [OCR] Loading Neural Engine...")
        # We disable angle classification globally to prevent the 'cls' argument bug
        self.ocr_model = PaddleOCR(use_angle_cls=False, lang='en')

    def extract_text(self, file_path: str) -> str:
        print(f"👁️ [OCR] Processing: {file_path}")
        text = ""

        # ==============================================================================
        # 🚨 STRATEGY 1: PLAIN TEXT BYPASS (CRITICAL FIX)
        # ==============================================================================
        if file_path.endswith(".txt") or file_path.endswith(".md"):
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    return f.read().strip()
            except Exception as e:
                print(f"⚠️ Text read failed: {e}")
        # ==============================================================================

        # --- STRATEGY 2: NATIVE PDF EXTRACTION ---
        try:
            reader = PdfReader(file_path)
            for page in reader.pages:
                extracted = page.extract_text()
                if extracted:
                    text += extracted + "\n"
        except Exception:
            pass

        # --- STRATEGY 3: NEURAL OCR (Fallback for Scans/Images) ---
        if len(text.strip()) < 50:
            print("👁️ [OCR] Native text failed. Running Neural OCR (Paddle)...")
            try:
                # FIX: Call without arguments. The __init__ setting handles the config.
                result = self.ocr_model.ocr(file_path)
                text = ""
                if result and result[0]:
                    for line in result[0]:
                        text += line[1][0] + "\n"
            except Exception as e:
                print(f"❌ [OCR Failed] {e}")
                return ""

        return text.strip()


# Global Instance
ocr_runner = OCREngine()
'''
# Destination module to replace. The path is relative, so it is resolved
# against the directory the script is launched from.
target_path = os.sep.join(("backend", "app", "services", "ocr_engine.py"))

# Overwrite the destination with CORRECT_CODE and report the outcome on stdout.
# Any Exception raised while writing is printed rather than propagated.
try:
    with open(target_path, mode="w", encoding="utf-8") as out_file:
        out_file.write(CORRECT_CODE)
    print(f"✅ SUCCESS: Overwrote {target_path} with the correct code.")
except Exception as err:
    print(f"❌ FAIL: Could not write file. Error: {err}")