ScienceNet-Article-Bot/worm.py at main · Photon-gjq/ScienceNet-Article-Bot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import asyncio
import json
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from telegram import Bot

# JSON file holding the IDs of articles seen by earlier runs; it is read by
# load_previous_articles() and rewritten by save_current_articles(). The path
# is relative, so it is resolved against the current working directory.
DATA_FILE = 'previous_articles.json'

def fetch_latest_articles():
    """Scrape the article listings from the ScienceNet blog category pages.

    Each category page is downloaded and its first ``<table>`` is parsed:
    every row after the first that has at least six cells becomes one
    article record.  A page that cannot be downloaded, answers with a
    non-200 status, or contains no table is reported on stdout and skipped,
    so one failing category does not prevent the other from being processed.

    Returns:
        list: one dict per article with the stripped cell texts under the
        keys 'id', 'title', 'author', 'views', 'comments' and 'date', plus
        'link' (absolute URL, or 'No link available' when the title cell has
        no anchor) and 'label' (the category name of the page it came from).
    """
    base_url = "https://blog.sciencenet.cn/"
    urls = [
        ("https://blog.sciencenet.cn/blog.php?mod=type&type=7", "科普集錦"),
        ("https://blog.sciencenet.cn/blog.php?mod=type&type=3", "觀點評述")
    ]

    articles = []

    for url, label in urls:
        try:
            # Without a timeout a stalled connection would block the run
            # indefinitely.
            response = requests.get(url, timeout=30)
        except requests.RequestException as e:
            # Treat a network failure like a bad status: report and move on.
            print(f"Failed to fetch page from {url}: {e}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch page from {url}, status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Assumes the article listing is the first table on the page.
        table = soup.find('table')
        if not table:
            print("No table found on the page.")
            continue

        # Skip the header row.
        rows = table.find_all('tr')[1:]

        for row in rows:
            cols = row.find_all('td')
            if len(cols) < 6:
                continue

            # NOTE(review): the first cell is used as the de-duplication key;
            # this presumes it holds a stable article identifier -- confirm
            # against the live page markup.
            article_id = cols[0].text.strip()
            title = cols[1].text.strip()
            author = cols[2].text.strip()
            views = cols[3].text.strip()
            comments = cols[4].text.strip()
            date = cols[5].text.strip()

            link_tag = cols[1].find('a', href=True)
            if link_tag:
                # urljoin copes with relative, root-relative and absolute
                # hrefs; plain concatenation would produce a doubled slash or
                # a doubled host for the latter two.
                link = urljoin(base_url, link_tag['href'])
            else:
                link = 'No link available'

            articles.append({
                'id': article_id,
                'title': title,
                'author': author,
                'views': views,
                'comments': comments,
                'date': date,
                'link': link,
                'label': label
            })

    return articles

async def send_telegram_message(articles):
    """Send one Telegram message per article and report what was delivered.

    The bot token and the target chat are read from the ``BOT_TOKEN`` and
    ``CHAT_ID`` environment variables.  A failure while sending one message
    is printed and does not stop the remaining messages from being sent.

    Args:
        articles: iterable of article dicts providing the 'title', 'author',
            'label' and 'link' keys.

    Returns:
        list: the articles whose message was sent without raising, in input
        order, so the caller can tell delivered articles from failed ones.
    """
    bot_token = os.getenv('BOT_TOKEN')
    chat_id = os.getenv('CHAT_ID')

    bot = Bot(token=bot_token)
    delivered = []
    for article in articles:
        message = f"{article['title']}\n作者: {article['author']}\n分類: {article['label']}\n鏈結: {article['link']}"
        try:
            await bot.send_message(chat_id=chat_id, text=message)
        except Exception as e:
            # Deliberate best-effort: one failed send must not abort the rest.
            print(f"Failed to send message: {e}")
        else:
            print(f"Message sent: {message}")
            delivered.append(article)
    return delivered

def load_previous_articles(path=None):
    """Load the set of article IDs recorded by earlier runs.

    Args:
        path: JSON file to read.  Defaults to the module-level ``DATA_FILE``.

    Returns:
        set: the IDs stored in the file.  An empty set is returned when the
        file does not exist, is empty or blank, or does not contain valid
        JSON; in the last case a notice is printed.  Recovering with an empty
        set means the current articles are treated as new again, which is
        preferred over failing on every subsequent run.
    """
    if path is None:
        path = DATA_FILE

    # Open directly instead of checking for existence first, so the file
    # disappearing between check and open cannot raise.
    try:
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read()
    except FileNotFoundError:
        return set()

    if not content.strip():
        return set()

    try:
        return set(json.loads(content))
    except json.JSONDecodeError as e:
        print(f"Ignoring unreadable article history in {path}: {e}")
        return set()

def save_current_articles(previous_ids, new_articles, path=None):
    """Persist the union of the already-known IDs and the new articles' IDs.

    Args:
        previous_ids: iterable of IDs recorded by earlier runs; not modified.
        new_articles: iterable of article dicts, each providing an 'id' key.
        path: JSON file to write.  Defaults to the module-level ``DATA_FILE``.

    The IDs are written as a sorted JSON list so the file content is
    deterministic for a given set of IDs.
    """
    if path is None:
        path = DATA_FILE

    # Merge the old and the new article IDs.
    updated_ids = set(previous_ids).union(article['id'] for article in new_articles)

    # Write to a sibling temp file and swap it in, so an interrupted run
    # cannot leave a truncated history file behind.
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as file:
        # key=str keeps the sort well-defined even if IDs of mixed types
        # ever end up in the set.
        json.dump(sorted(updated_ids, key=str), file)
    os.replace(tmp_path, path)

def get_new_articles(current_articles, previous_article_ids):
    """Return the articles whose 'id' is not among the previously seen IDs.

    The relative order of ``current_articles`` is preserved and duplicates
    are not collapsed.
    """
    unseen = []
    for candidate in current_articles:
        if candidate['id'] in previous_article_ids:
            continue
        unseen.append(candidate)
    return unseen

# Main entry point
def main():
    """Fetch the listings, notify about unseen articles and record them.

    Only articles that were actually delivered are recorded as seen, so an
    article whose notification failed is picked up again on the next run.
    """
    current_articles = fetch_latest_articles()
    previous_article_ids = load_previous_articles()

    new_articles = get_new_articles(current_articles, previous_article_ids)

    if not new_articles:
        print("No new articles found.")
        return

    print(f"Found {len(new_articles)} new articles.")
    delivered = asyncio.run(send_telegram_message(new_articles))
    if delivered is None:
        # The sender gave no delivery report; fall back to treating every
        # new article as sent.
        delivered = new_articles

    if delivered:
        save_current_articles(previous_article_ids, delivered)


if __name__ == "__main__":
    main()

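# Legacy entry point, kept for reference only: it sends every fetched article
# without filtering out the ones already seen, and it calls
# save_current_articles() with a single argument, which does not match that
# function's two-argument signature.
# if __name__ == "__main__":
#     current_articles = fetch_latest_articles()

#     if current_articles:
#         print(f"Found {len(current_articles)} articles.")
#         asyncio.run(send_telegram_message(current_articles))
#         save_current_articles(current_articles)  # update the stored article list
#     else:
#         print("No articles found.")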