-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathingest.py
More file actions
40 lines (28 loc) · 1.1 KB
/
ingest.py
File metadata and controls
40 lines (28 loc) · 1.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
"""CLI entrypoint to enqueue PDF ingestion jobs."""
import sys
import time
def main() -> None:
    """Enqueue ingestion jobs for the PDFs under the configured data directory.

    Reads ``settings.data_dir``, lists every ``*.pdf`` found recursively
    beneath it, passes the directory to ``enqueue_directory`` and prints how
    many jobs were enqueued versus skipped, along with the elapsed time.

    Raises:
        SystemExit: with code 1 when the data directory is missing or is not
            a directory; with code 0 when no PDF files are found.
    """
    # Imported inside main() as in the original layout.
    # NOTE(review): presumably deferred so importing this module stays
    # side-effect free — confirm before hoisting to module level.
    from ingestion.config import settings

    data_dir = settings.data_dir

    # is_dir() rather than exists(): a path that exists but is a regular file
    # would otherwise fall through to the misleading "No PDF files" message.
    if not data_dir.is_dir():
        print(f"Error: data directory not found at {data_dir.resolve()}", file=sys.stderr)
        sys.exit(1)

    # Sorted so the listing is stable across runs; rglob order depends on the
    # underlying filesystem.
    pdf_files = sorted(data_dir.rglob("*.pdf"))

    if not pdf_files:
        print(f"No PDF files found in {data_dir.resolve()}")
        print("Drop your PDFs into the data/ folder and re-run.")
        sys.exit(0)

    print(f"Found {len(pdf_files)} PDF file(s):")
    for pdf_path in pdf_files:
        # Shown relative to the data directory's parent so the directory's
        # own name is part of each printed path.
        print(f" • {pdf_path.relative_to(data_dir.parent)}")
    print()

    # Only imported once there is something to enqueue, as in the original.
    from ingestion.queue import enqueue_directory

    start = time.perf_counter()
    enqueued, skipped_existing = enqueue_directory(data_dir)
    elapsed = time.perf_counter() - start

    print(f"\nDone in {elapsed:.1f}s.")
    print(f"Enqueued: {enqueued} PDF job(s)")
    print(f"Skipped (already queued/known): {skipped_existing} PDF job(s)")
    print("Start workers with: uv run python worker.py")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()