-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfeeds.py
More file actions
executable file
·127 lines (111 loc) · 4.17 KB
/
feeds.py
File metadata and controls
executable file
·127 lines (111 loc) · 4.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python3
import platform, sys, subprocess, webbrowser, xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor
from urllib.request import Request, urlopen
from urllib.error import URLError
from datetime import datetime, timezone, timedelta
from email.utils import parsedate_to_datetime
from pathlib import Path
# Feed configuration: Atom XML namespace prefix for ElementTree lookups,
# the feeds-list location, and the lookback window in ~30-day months.
ATOM = "{http://www.w3.org/2005/Atom}"
FEED_FILE = Path.home() / ".config/feeds.txt"
MONTHS = 1

# Fail fast with a clear message. An `if`/`sys.exit` is used instead of
# `assert` because asserts are stripped when Python runs with -O.
if not FEED_FILE.is_file():
    sys.exit(f"not found: {FEED_FILE}")

# Entries older than this are dropped (months approximated as 30 days).
cutoff = datetime.now(timezone.utc) - timedelta(days=MONTHS * 30)

# Read the feeds list: one URL per line, skipping blank lines and
# full-line `#` comments, and stripping trailing `#` comments.
urls = [
    line.split("#")[0].strip()
    for line in FEED_FILE.read_text().splitlines()
    if line.strip() and not line.strip().startswith("#")
]
def parse_date(raw):
    """Parse an RFC 2822 or ISO-8601 date string into an aware datetime.

    Tries the email-style parser first, then ISO format. Naive results
    are assumed to be UTC. Returns None when neither format matches.
    """
    attempts = (parsedate_to_datetime, datetime.fromisoformat)
    for attempt in attempts:
        try:
            parsed = attempt(raw)
            if parsed.tzinfo is None:
                parsed = parsed.replace(tzinfo=timezone.utc)
            return parsed
        except Exception:
            pass
    return None
def fetch_feed(url):
    """Download one RSS/Atom feed and return its recent entries.

    Returns a list of (datetime, label, link) tuples for entries newer
    than the module-level `cutoff`. All failure modes (fetch error, XML
    parse error, no entries, unparseable dates) are reported on stderr;
    fatal ones yield an empty list.
    """
    # Download the raw feed bytes with a short timeout.
    try:
        req = Request(url, headers={"User-Agent": "feeds.py/1.0"})
        with urlopen(req, timeout=3) as resp:
            body = resp.read()
    except (URLError, OSError) as e:
        print(f"! fetch failed: {url} ({e})", file=sys.stderr)
        return []

    # Parse the XML document.
    try:
        root = ET.fromstring(body)
    except ET.ParseError:
        print(f"! parse failed: {url}", file=sys.stderr)
        return []

    def rss_entries():
        # RSS: <item> elements with plain (non-namespaced) child tags;
        # author may come from Dublin Core <dc:creator>.
        found = []
        for item in root.iter("item"):
            author = (
                item.findtext("author")
                or item.findtext("{http://purl.org/dc/elements/1.1/}creator")
                or ""
            ).strip()
            title = (item.findtext("title") or "").strip()
            link = (item.findtext("link") or "").strip()
            date = (item.findtext("pubDate") or "").strip()
            if title and link and date:
                found.append((author, title, link, date))
        return found

    def atom_entries():
        # Atom: namespaced <entry> elements; prefer the rel="alternate"
        # link, falling back to the first <link>.
        found = []
        for entry in root.iter(f"{ATOM}entry"):
            link_el = entry.find(f"{ATOM}link[@rel='alternate']")
            if link_el is None:
                link_el = entry.find(f"{ATOM}link")
            link = ("" if link_el is None else link_el.get("href", "")).strip()
            title = (entry.findtext(f"{ATOM}title") or "").strip()
            date = (
                entry.findtext(f"{ATOM}published")
                or entry.findtext(f"{ATOM}updated")
                or ""
            ).strip()
            author = (entry.findtext(f"{ATOM}author/{ATOM}name") or "").strip()
            if title and link and date:
                found.append((author, title, link, date))
        return found

    # Try RSS first; fall back to Atom only when RSS yields nothing.
    entries = rss_entries() or atom_entries()
    if not entries:
        print(f"! no entries: {url}", file=sys.stderr)
        return []

    # Keep only entries newer than the cutoff, prefixing titles with the
    # author when one is present.
    recent = []
    for author, title, link, raw_date in entries:
        dt = parse_date(raw_date)
        if not dt:
            print(f"! bad date: {raw_date!r} in {url}", file=sys.stderr)
            continue
        if dt >= cutoff:
            recent.append((dt, f"[{author}] {title}" if author else title, link))
    return recent
# Fetch every feed concurrently and flatten the per-feed entry lists.
with ThreadPoolExecutor() as pool:
    all_entries = [e for batch in pool.map(fetch_feed, urls) for e in batch]

if not all_entries:
    print(f"No articles in the last {MONTHS} month(s).")
    sys.exit(0)

# Newest articles first.
all_entries.sort(key=lambda entry: entry[0], reverse=True)


def _fzf_line(date, title, url):
    # One fzf line per entry: a visible "date marker title" column, then a
    # hidden tab-separated URL. YouTube links get a ▶ marker.
    mark = "▶" if "youtube.com/" in url else " "
    return f"{date:%Y-%m-%d} {mark} {title}\t{url}"


lines = [_fzf_line(d, t, u) for d, t, u in all_entries]

# Selection loop: fzf shows only the first tab-delimited field; after an
# item is opened we re-prompt so the feed stays open until the user quits.
while True:
    picked = subprocess.run(
        ["fzf", "--with-nth=1", "--delimiter=\t", "--no-multi"],
        input="\n".join(lines),
        capture_output=True,
        text=True,
    )
    # Non-zero exit means the user quit fzf — quit the script too.
    if picked.returncode != 0:
        break
    url = picked.stdout.strip().split("\t")[-1]
    if "youtube.com/" in url:
        # Under WSL2 invoke the Windows mpv binary; otherwise native mpv.
        subprocess.run(["mpv.exe" if "WSL2" in platform.release() else "mpv", url])
    else:
        webbrowser.open(url)