-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrss_fetch.py
More file actions
172 lines (142 loc) · 5.82 KB
/
rss_fetch.py
File metadata and controls
172 lines (142 loc) · 5.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import json
import feedparser
from datetime import datetime as dt
from datetime import timezone as tz
from dateutil import parser as date_parser
from newspaper import Article
import urllib
import ssl
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from markdownify import markdownify as md
# RSS/Atom feeds polled on every run.
FEEDS = [
    "https://export.arxiv.org/rss/cs.AI",  # arXiv – latest AI papers
    "https://techcrunch.com/tag/artificial-intelligence/feed/",  # TechCrunch AI – AI startup and trend coverage
]

# Output root for this run, named after the local date at start-up.
OUTPUT_DIR = "_" + dt.now().strftime("%Y-%m-%d")

# JSON file mapping each feed URL to the ISO timestamp of the newest entry seen.
timestamp_file = "last_fetched.json"


def _load_last_times(path):
    """Return the per-feed timestamps stored at *path*, or ``{}`` if unusable.

    A missing file, an unreadable file, invalid JSON (including an empty or
    truncated file) and a JSON document that is not an object all yield an
    empty dict, so a damaged state file never prevents the script from
    starting.

    Args:
        path: Location of the JSON state file.

    Returns:
        The decoded mapping, or an empty dict when nothing usable was found.
    """
    if not os.path.exists(path):
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        print(f"[startup] Ignoring unreadable {path} → {exc}")
        return {}
    if not isinstance(loaded, dict):
        print(f"[startup] Ignoring {path}: expected a JSON object")
        return {}
    return loaded


# Load the timestamps recorded by the previous run.
last_times = _load_last_times(timestamp_file)

# Newest timestamps collected during the current run.
new_last_times = {}
def sanitize_filename(name: str) -> str:
    """Reduce *name* to characters that are safe to use in a filename.

    Characters for which ``str.isalnum`` is true, plus spaces, dots and
    underscores, are kept in their original order. Everything else is
    dropped, and trailing whitespace is stripped from the result.
    """
    extra_allowed = frozenset(" ._")

    def is_safe(ch: str) -> bool:
        return ch in extra_allowed or ch.isalnum()

    return "".join(filter(is_safe, name)).rstrip()
def _article_html_to_markdown(url, html):
    """Run *html* through newspaper's ``Article`` and return Markdown text.

    Args:
        url: Address the HTML was fetched from; handed to ``Article``.
        html: Page markup exactly as received from the fetcher (``str`` from
            ``requests``, raw ``bytes`` from ``urllib``). It is passed on
            undecoded.

    Returns:
        The stripped Markdown body, or ``"[Empty article body]"`` when the
        conversion produces no text.
    """
    article = Article(url)
    article.set_html(html)
    article.parse()
    # Use the whole page when the parsed article exposes no `article_html`.
    article_html = getattr(article, "article_html", None) or html
    body_md = md(article_html, strip=["img"]).strip()
    return body_md or "[Empty article body]"


def fetch_full_text(url: str) -> str:
    """
    Fetch the full article text with graceful fallbacks:
    1. Try `requests` with retry logic (handles most modern TLS configs).
    2. If that fails, fall back to `urllib` with a relaxed SSL context.

    Args:
        url: Address of the article page.

    Returns:
        The article body as Markdown (images stripped). When the body is
        empty the placeholder ``"[Empty article body]"`` is returned; when
        both attempts fail the placeholder
        ``"[Content unavailable due to access restrictions]"`` is returned.
        This function never raises for fetch/parse errors; it prints them.
    """
    # A bare `import urllib` does not guarantee the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    headers = {"User-Agent": "Mozilla/5.0"}

    # First attempt: `requests` with automatic retries
    try:
        retries = Retry(
            total=3,
            backoff_factor=0.5,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET"],
        )
        # The context manager closes the session's pooled connections.
        with requests.Session() as session:
            adapter = HTTPAdapter(max_retries=retries)
            session.mount("https://", adapter)
            # Plain-HTTP links get the same retry policy.
            session.mount("http://", adapter)
            resp = session.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
            html = resp.text
        return _article_html_to_markdown(url, html)
    except Exception as e:
        # Deliberate best-effort boundary: any failure moves on to the fallback.
        print(f"[fetch_full_text] `requests` failed for {url} → {e}")

    # Fallback: urllib with relaxed SSL checks (last resort)
    try:
        # SECURITY NOTE(review): hostname and certificate verification are
        # switched off here, so the response can be tampered with in transit.
        # Kept as the original last-resort behavior; the fetched text should
        # be treated as untrusted.
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        req = urllib.request.Request(url, headers=headers)
        # Close the response (and its socket) as soon as the body is read.
        with urllib.request.urlopen(req, context=ctx, timeout=10) as resp:
            html = resp.read()
        return _article_html_to_markdown(url, html)
    except Exception as e:
        print(f"[fetch_full_text] urllib fallback failed for {url} → {e}")
        return "[Content unavailable due to access restrictions]"
def _to_aware_datetime(value):
    """Parse a date string with dateutil and return a timezone-aware datetime.

    When the parsed value carries no UTC offset it is tagged as UTC. That is a
    choice made here so every timestamp can be compared with the aware
    default cut-off and with each other.

    Whatever ``date_parser.parse`` raises for unparseable input propagates to
    the caller.
    """
    parsed = date_parser.parse(value)
    if parsed.utcoffset() is None:
        parsed = parsed.replace(tzinfo=tz.utc)
    return parsed


def _yaml_double_quoted(text):
    """Escape *text* for placement inside a YAML double-quoted scalar."""
    # Backslashes first, otherwise the ones added for quotes would be doubled.
    return text.replace("\\", "\\\\").replace('"', '\\"')


def save_feed_as_md(feed_url: str):
    """Save every new entry of one feed as a Markdown file with front matter.

    The feed is parsed with feedparser. Entries whose published (or updated)
    time is not later than the timestamp stored for this feed in
    ``last_times`` are skipped, as are entries without a date, with an
    unparseable date, or whose article body could not be fetched. Each
    remaining entry is written to
    ``OUTPUT_DIR/_<feed title>/<YYYY-MM-DD>_<title>.md``.

    Side effects: creates the per-feed directory, writes files, prints a
    progress line per entry, and stores the newest timestamp seen in
    ``new_last_times[feed_url]``.

    Args:
        feed_url: URL of the RSS/Atom feed.
    """
    feed = feedparser.parse(feed_url)
    feed_title = feed.feed.get("title", feed_url.split("//")[-1].split("/")[0])

    # create a subfolder per feed
    feed_dir = os.path.join(OUTPUT_DIR, "_" + sanitize_filename(feed_title))
    os.makedirs(feed_dir, exist_ok=True)

    default_cutoff = dt(2000, 1, 1, tzinfo=tz.utc)
    last_time_str = last_times.get(feed_url)
    try:
        last_time = _to_aware_datetime(last_time_str) if last_time_str else default_cutoff
    except (ValueError, OverflowError, TypeError) as exc:
        # A damaged stored timestamp must not abort the run; start over instead.
        print(f"[{dt.now().strftime('%Y-%m-%d %H:%M:%S')}] Ignoring stored timestamp for {feed_url} → {exc}")
        last_time = default_cutoff
    latest_seen = last_time

    for entry in feed.entries:
        # grab fields (fallbacks if missing)
        title = entry.get("title", "no-title").strip()
        published = entry.get("published", entry.get("updated", ""))
        if not published:
            continue

        try:
            # Always aware, so the comparisons below cannot raise TypeError.
            published_time = _to_aware_datetime(published)
        except (ValueError, OverflowError) as exc:
            print(f"[{dt.now().strftime('%Y-%m-%d %H:%M:%S')}] Skipped → {title} (unparseable date: {exc})")
            continue

        if published_time <= last_time:
            continue
        if published_time > latest_seen:
            latest_seen = published_time

        link = entry.get("link", "")
        # Fetch the body from the link provided by the feed.
        # NOTE: a disabled alternative is to prefer the feed's own
        # `content` / `summary` and only fetch the page when neither exists.
        body = fetch_full_text(link)

        # Skip saving if the article body is empty or only placeholder text
        if not body or body.strip() in ("[Empty article body]", "[Content unavailable due to access restrictions]"):
            print(f"[{dt.now().strftime('%Y-%m-%d %H:%M:%S')}] Skipped → {title} (no content)")
            continue

        # build a safe filename, e.g. "2025-06-11_my-post-title.md"
        # The prefix comes from the timestamp already parsed above, so it is
        # present even for entries that only carry `updated`. UTC is used to
        # line up with feedparser's `*_parsed` values (documented as UTC).
        date_prefix = published_time.astimezone(tz.utc).strftime("%Y-%m-%d") + "_"
        # Fall back to a fixed stem when the title has no filename-safe characters.
        safe_title = sanitize_filename(title) or "untitled"
        fname = f"{date_prefix}{safe_title}.md"
        path = os.path.join(feed_dir, fname)

        escaped_title = _yaml_double_quoted(title)
        escaped_link = _yaml_double_quoted(link)

        # compose Markdown with YAML front matter
        # (named `document` so the module-level markdownify alias `md` is not shadowed)
        document = (
            f"---\n"
            f"title: \"{escaped_title}\"\n"
            f"date: \"{published_time.isoformat()}\"\n"
            f"link: \"{escaped_link}\"\n"
            f"---\n\n"
            f"{body}\n"
        )

        with open(path, "w", encoding="utf-8") as f:
            f.write(document)
        print(f"[{dt.now().strftime('%Y-%m-%d %H:%M:%S')}] Saved → {path}")

    new_last_times[feed_url] = latest_seen.isoformat()
def main():
    """Process every feed in ``FEEDS`` and write out the collected timestamps.

    Ensures ``OUTPUT_DIR`` exists, calls ``save_feed_as_md`` for each feed
    URL in order, then dumps ``new_last_times`` as indented JSON into
    ``timestamp_file``.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for feed_url in FEEDS:
        save_feed_as_md(feed_url)

    # Persist the per-feed timestamps gathered during this run.
    with open(timestamp_file, "w") as state_file:
        state_file.write(json.dumps(new_last_times, indent=4))


# Run the fetch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()