content-engine/main.py at main · OutRizz/content-engine · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""Main pipeline: load sources, fetch, generate 10 best post ideas from all (no grouping by source)."""

from __future__ import annotations

import logging
import os
import sys

from config import DEFAULT_MODEL, DEFAULT_URLS_FILE
from fetcher import fetch_entries_by_source
from langfuse import get_client
from llm import generate_post_ideas
from models import FeedEntry
from sources import load_sources

# Named logger for this pipeline; main() installs the handler/format via
# logging.basicConfig before the first message is emitted.
logger = logging.getLogger("content_engine")

# Number of post ideas requested from the LLM (passed as `count` in main()).
TOTAL_IDEAS = 10
# main() keeps only the first MAX_ENTRIES_FOR_LLM fetched entries, in fetch order.
MAX_ENTRIES_FOR_LLM = 50  # Cap total entries to avoid token overflow


def build_content_block(
    entries: list[FeedEntry],
    include_links: bool = True,
    *,
    max_content_chars: int | None = 8000,
) -> str:
    """Build a single text block for the LLM from entries.

    Each entry becomes a numbered section (numbering starts at 1) made of
    ``Title``, optionally ``Link``, ``Source``, ``Published`` and ``Content``
    lines. Sections are separated by a ``---`` line followed by a blank line.

    Args:
        entries: Entries to render, in the order they should be numbered.
        include_links: When true, a ``Link:`` line is emitted right after the
            title line of every entry.
        max_content_chars: Maximum number of characters of each entry's
            content to include. ``None`` disables truncation. The default
            (8000) matches the previously hard-coded limit.

    Returns:
        The rendered block, or an empty string when ``entries`` is empty.

    Raises:
        ValueError: If ``max_content_chars`` is negative.
    """
    # A negative slice bound would silently drop characters from the end of
    # the content instead of capping its length, so reject it explicitly.
    if max_content_chars is not None and max_content_chars < 0:
        raise ValueError("max_content_chars must be non-negative or None")

    parts: list[str] = []
    for i, e in enumerate(entries, 1):
        lines = [f"[{i}] Title: {e.title}"]
        if include_links:
            lines.append(f"Link: {e.link}")
        lines.extend(
            (
                f"Source: {e.source_feed}",
                f"Published: {e.published}",
                # Slicing with None as the bound keeps the whole content.
                f"Content:\n{e.content[:max_content_chars]}",
            )
        )
        parts.append("\n".join(lines))
    return "\n---\n\n".join(parts)


def build_sources_list(entries: list[FeedEntry]) -> str:
    """Render one ``[N] url`` line per entry, numbered from 1, for the LLM.

    Returns the lines joined with newlines; an empty input yields ``""``.
    """
    numbered_links: list[str] = []
    for position, entry in enumerate(entries, start=1):
        numbered_links.append(f"[{position}] {entry.link}")
    return "\n".join(numbered_links)


def main() -> None:
    """Run the pipeline: load sources, fetch entries, request ideas, print them.

    Configuration is read from the environment:

    * ``URLS_FILE`` - path of the sources file (default ``DEFAULT_URLS_FILE``).
    * ``OPENAI_MODEL`` - model name handed to the LLM (default ``DEFAULT_MODEL``).
    * ``OPENAI_API_KEY`` - must be set; it is only checked for presence here.

    Prints a message to stderr and exits with status 1 when the API key is
    missing, the sources file yields no sources, or no entries were fetched.
    On success the ideas are printed to stdout in a Markdown-like layout.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        stream=sys.stdout,
    )

    urls_path = os.environ.get("URLS_FILE", DEFAULT_URLS_FILE)
    model = os.environ.get("OPENAI_MODEL", DEFAULT_MODEL)

    if not os.environ.get("OPENAI_API_KEY"):
        print("Set OPENAI_API_KEY in .env or environment.", file=sys.stderr)
        sys.exit(1)

    sources = load_sources(urls_path)
    if not sources:
        print(
            f"No sources in {urls_path}. Add lines: type\\turl (type: rss | html).",
            file=sys.stderr,
        )
        sys.exit(1)

    logger.info("Fetching entries from %s source(s)...", len(sources))
    by_source = fetch_entries_by_source(sources)
    # Flatten the per-source batches into one list; the source key is not needed.
    all_entries: list[FeedEntry] = [
        entry for _src, batch in by_source for entry in batch
    ]
    if not all_entries:
        print("No entries fetched.", file=sys.stderr)
        sys.exit(1)

    # Only the first MAX_ENTRIES_FOR_LLM entries, in fetch order, reach the LLM.
    # NOTE(review): with many entries per source, later sources may be cut off
    # entirely by this cap - confirm that is acceptable.
    entries = all_entries[:MAX_ENTRIES_FOR_LLM]
    logger.info(
        "Using %s entries (of %s) for LLM, generating %s best ideas total",
        len(entries),
        len(all_entries),
        TOTAL_IDEAS,
    )

    content_block = build_content_block(entries, include_links=True)
    sources_list = build_sources_list(entries)
    logger.info("Content block: %s chars, calling LLM...", len(content_block))

    try:
        ideas = generate_post_ideas(
            content_block, model, count=TOTAL_IDEAS, sources_list=sources_list
        )
        logger.info("Post ideas received: %s", len(ideas))

        # Header uses TOTAL_IDEAS so it stays in sync with the requested count.
        print(f"\n--- {TOTAL_IDEAS} best post ideas (all sources) ---\n")
        for i, idea in enumerate(ideas, 1):
            print(f"### Idea {i}")
            print("**Source(s):** " + (", ".join(idea.source_links) if idea.source_links else "—"))
            print(f"**Source insight:** {idea.source_insight}")
            print(f"**Post idea:** {idea.post_idea}")
            print(f"**Description:** {idea.description}")
            print(f"**Recommended format:** {idea.recommended_format}")
            print(f"**How to use:** {idea.how_to_use}")
            print()
    finally:
        # Flush the Langfuse client even when the LLM call or the printing
        # raises, so the flush is not skipped on the failure path.
        get_client().flush()