diff --git a/.gitignore b/.gitignore index f2a9421d..4559a565 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,15 @@ yarn-error.log* lerna-debug.log* .pnpm-debug.log* .DS_Store + +# Local AI agent files +.claude/ +.codex/ +AGENTS.md +CLAUDE.md +MODEL_ROUTING.md +REFLECTION_LOG.md + # Diagnostic reports (https://nodejs.org/api/report.html) report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json BiliNote/pnpm-lock.yaml @@ -324,6 +333,9 @@ cython_debug/ /BiliNote_frontend/.idea/* /BiliNote_frontend/src-tauri/bin/ +# Local platform cookies +**/cookies.txt + # FFmpeg 构建文件(不应该提交到仓库) ffmpeg*/ -ffmpg*/ \ No newline at end of file +ffmpg*/ diff --git a/backend/app/downloaders/bilibili_downloader.py b/backend/app/downloaders/bilibili_downloader.py index 1ee02a42..34e82a34 100644 --- a/backend/app/downloaders/bilibili_downloader.py +++ b/backend/app/downloaders/bilibili_downloader.py @@ -1,8 +1,8 @@ import os import json -import logging import tempfile from abc import ABC +from pathlib import Path from typing import Union, Optional, List import yt_dlp @@ -10,11 +10,25 @@ from app.downloaders.base import Downloader, DownloadQuality, QUALITY_MAP from app.models.notes_model import AudioDownloadResult from app.models.transcriber_model import TranscriptResult, TranscriptSegment +from app.utils.logger import get_logger from app.utils.path_helper import get_data_dir from app.utils.url_parser import extract_video_id from app.services.cookie_manager import CookieConfigManager -logger = logging.getLogger(__name__) +logger = get_logger(__name__) + +BILIBILI_COOKIES_FILE = os.getenv("BILIBILI_COOKIES_FILE", "cookies.txt") +BILIBILI_HTTP_HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', + 'Accept-Encoding': 'gzip, deflate, br', + 'Referer': 'https://www.bilibili.com/', + 'Origin': 'https://www.bilibili.com', + 'Sec-Fetch-Dest': 'empty', + 'Sec-Fetch-Mode': 'cors', + 'Sec-Fetch-Site': 'same-site', +} class BilibiliDownloader(Downloader, ABC): @@ -22,7 +36,7 @@ def __init__(self): super().__init__() self._cookie_mgr = CookieConfigManager() self._cookie = self._cookie_mgr.get('bilibili') - self._cookiefile = self._write_netscape_cookie_file() + self._cookiefile = self._write_netscape_cookie_file() or self._resolve_cookies_file() def _write_netscape_cookie_file(self) -> Optional[str]: """将 Cookie 写入 Netscape 格式临时文件,返回文件路径(供 yt-dlp cookiefile 使用)""" @@ -40,12 +54,54 @@ def _write_netscape_cookie_file(self) -> Optional[str]: logger.info("已生成 B站 Netscape Cookie 文件: %s (条目: %d)", tmp.name, len(lines) - 1) return tmp.name + def _resolve_cookies_file(self) -> Optional[str]: + """按约定位置查找 Netscape cookies.txt 文件。""" + configured = Path(BILIBILI_COOKIES_FILE) + backend_root = Path(__file__).resolve().parents[2] + candidates: list[Path] = [] + + if configured.is_absolute(): + candidates.append(configured) + else: + env_value = os.getenv("BILIBILI_COOKIES_FILE") + if env_value: + candidates.append(Path.cwd() / configured) + candidates.extend([ + backend_root / configured, + Path.cwd() / configured, + Path("/app") / configured, + ]) + + seen: set[Path] = set() + for candidate in candidates: + candidate = candidate.resolve() + if candidate in seen: + continue + seen.add(candidate) + if candidate.is_file(): + logger.info("使用 B站 cookies 文件: %s", candidate) + return str(candidate) + if candidate.exists(): + logger.warning("忽略非文件 cookies 路径: %s", candidate) + + logger.warning("B站 Cookie 未配置且 cookies.txt 不存在,下载可能失败") + return None + + def _apply_common_ydl_opts(self, ydl_opts: dict) -> dict: + existing_headers = ydl_opts.get('http_headers', {}) + ydl_opts['http_headers'] = {**BILIBILI_HTTP_HEADERS, **existing_headers} + ydl_opts['extractor_retries'] = 5 + if self._cookiefile: + ydl_opts['cookiefile'] = self._cookiefile + return ydl_opts + def download( self, video_url: str, output_dir: Union[str, None] = None, quality: DownloadQuality = "fast", - need_video:Optional[bool]=False + need_video: Optional[bool] = False, + skip_download: bool = False, ) -> AudioDownloadResult: if output_dir is None: output_dir = get_data_dir() @@ -58,7 +114,6 @@ def download( ydl_opts = { 'format': 'bestaudio[ext=m4a]/bestaudio/best', 'outtmpl': output_path, - 'http_headers': {'Referer': 'https://www.bilibili.com'}, 'postprocessors': [ { 'key': 'FFmpegExtractAudio', @@ -69,11 +124,12 @@ def download( 'noplaylist': True, 'quiet': False, } - if self._cookiefile: - ydl_opts['cookiefile'] = self._cookiefile + if skip_download: + ydl_opts['skip_download'] = True + self._apply_common_ydl_opts(ydl_opts) with yt_dlp.YoutubeDL(ydl_opts) as ydl: - info = ydl.extract_info(video_url, download=True) + info = ydl.extract_info(video_url, download=not skip_download) video_id = info.get("id") title = info.get("title") duration = info.get("duration", 0) @@ -117,13 +173,11 @@ def download_video( ydl_opts = { 'format': 'bv*[ext=mp4]/bestvideo+bestaudio/best', 'outtmpl': output_path, - 'http_headers': {'Referer': 'https://www.bilibili.com'}, 'noplaylist': True, 'quiet': False, 'merge_output_format': 'mp4', # 确保合并成 mp4 } - if self._cookiefile: - ydl_opts['cookiefile'] = self._cookiefile + self._apply_common_ydl_opts(ydl_opts) with yt_dlp.YoutubeDL(ydl_opts) as ydl: info = ydl.extract_info(video_url, download=True) @@ -175,11 +229,7 @@ def download_subtitles(self, video_url: str, output_dir: str = None, 'outtmpl': os.path.join(output_dir, f'{video_id}.%(ext)s'), 'quiet': True, } - - # 通过 CookieConfigManager 注入 B站 Cookie(Netscape cookiefile) - if self._cookiefile: - ydl_opts['cookiefile'] = self._cookiefile - ydl_opts['http_headers'] = {'Referer': 'https://www.bilibili.com'} + self._apply_common_ydl_opts(ydl_opts) try: with yt_dlp.YoutubeDL(ydl_opts) as ydl: @@ -330,4 +380,4 @@ def _parse_json3_subtitle(self, subtitle_file: str, language: str) -> Optional[T except Exception as e: logger.warning(f"解析字幕文件失败: {e}") - return None \ No newline at end of file + return None diff --git a/backend/app/downloaders/youtube_downloader.py b/backend/app/downloaders/youtube_downloader.py index bb8ed8a7..0534768e 100644 --- a/backend/app/downloaders/youtube_downloader.py +++ b/backend/app/downloaders/youtube_downloader.py @@ -1,5 +1,4 @@ import os -import logging from abc import ABC from typing import Union, Optional, List @@ -9,10 +8,11 @@ from app.downloaders.youtube_subtitle import YouTubeSubtitleFetcher from app.models.notes_model import AudioDownloadResult from app.models.transcriber_model import TranscriptResult +from app.utils.logger import get_logger from app.utils.path_helper import get_data_dir from app.utils.url_parser import extract_video_id -logger = logging.getLogger(__name__) +logger = get_logger(__name__) class YoutubeDownloader(Downloader, ABC): diff --git a/backend/app/routers/note.py b/backend/app/routers/note.py index a9e2d4c3..1482c671 100644 --- a/backend/app/routers/note.py +++ b/backend/app/routers/note.py @@ -175,11 +175,38 @@ def generate_note(data: VideoRequest, background_tasks: BackgroundTasks): def get_task_status(task_id: str): status_path = os.path.join(NOTE_OUTPUT_DIR, f"{task_id}.status.json") result_path = os.path.join(NOTE_OUTPUT_DIR, f"{task_id}.json") + pending_status = { + "status": TaskStatus.PENDING.value, + "message": "任务排队中", + "task_id": task_id, + } # 优先读状态文件 if os.path.exists(status_path): - with open(status_path, "r", encoding="utf-8") as f: - status_content = json.load(f) + try: + with open(status_path, "r", encoding="utf-8") as f: + content = f.read() + if content.strip(): + status_content = json.loads(content) + elif os.path.exists(result_path): + logger.warning(f"状态文件为空但结果文件已存在: {status_path}") + status_content = {"status": TaskStatus.SUCCESS.value, "task_id": task_id} + else: + logger.warning(f"状态文件为空: {status_path}") + status_content = pending_status + except (json.JSONDecodeError, OSError) as e: + logger.warning(f"读取状态文件失败: {status_path}, {e}") + if os.path.exists(result_path): + status_content = {"status": TaskStatus.SUCCESS.value, "task_id": task_id} + else: + status_content = pending_status + + if status_content == pending_status: + try: + with open(status_path, "w", encoding="utf-8") as wf: + json.dump(status_content, wf, ensure_ascii=False) + except OSError as e: + logger.warning(f"重建状态文件失败: {status_path}, {e}") status = status_content.get("status") message = status_content.get("message", "") diff --git a/backend/app/services/note.py b/backend/app/services/note.py index ebbe83a6..71f7008a 100644 --- a/backend/app/services/note.py +++ b/backend/app/services/note.py @@ -321,31 +321,15 @@ def _update_status(self, task_id: Optional[str], status: Union[str, TaskStatus], NOTE_OUTPUT_DIR.mkdir(parents=True, exist_ok=True) status_file = NOTE_OUTPUT_DIR / f"{task_id}.status.json" - print(f"写入状态文件: {status_file} 当前状态: {status}") data = {"status": status.value if isinstance(status, TaskStatus) else status} if message: data["message"] = message try: - # First create a temporary file - temp_file = status_file.with_suffix('.tmp') - - # Write to temporary file - with temp_file.open('w', encoding='utf-8') as f: + with status_file.open('w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=2) - - # Atomic rename operation - temp_file.replace(status_file) - - print(f"状态文件写入成功: {status_file}") except Exception as e: logger.error(f"写入状态文件失败 (task_id={task_id}):{e}") - # Try to write error to file directly as fallback - try: - with status_file.open('w', encoding='utf-8') as f: - f.write(f"Error writing status: {str(e)}") - except: - logger.error(f"写入错误 {e}") def _handle_exception(self, task_id, exc): logger.error(f"任务异常 (task_id={task_id})", exc_info=True) diff --git a/backend/requirements.txt b/backend/requirements.txt index b29b0abc..fda91944 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -125,5 +125,5 @@ webencodings==0.5.1 websockets==15.0.1 yarl==1.19.0 youtube-transcript-api>=1.0.0 -yt-dlp==2025.3.31 +yt-dlp==2026.3.17 zopfli==0.2.3.post1 diff --git a/backend/tests/test_note_status_resilience.py b/backend/tests/test_note_status_resilience.py new file mode 100644 index 00000000..9b0cd9a6 --- /dev/null +++ b/backend/tests/test_note_status_resilience.py @@ -0,0 +1,40 @@ +import inspect +import json + +from app.downloaders.bilibili_downloader import BilibiliDownloader +from app.enmus.task_status_enums import TaskStatus +from app.routers import note as note_router + + +def _response_payload(response): + return json.loads(response.body.decode("utf-8")) + + +def test_empty_status_file_returns_pending_and_rewrites_json(tmp_path, monkeypatch): + task_id = "empty-status" + monkeypatch.setattr(note_router, "NOTE_OUTPUT_DIR", str(tmp_path)) + status_path = tmp_path / f"{task_id}.status.json" + status_path.write_text("", encoding="utf-8") + + payload = _response_payload(note_router.get_task_status(task_id)) + + assert payload["data"]["status"] == TaskStatus.PENDING.value + assert json.loads(status_path.read_text(encoding="utf-8"))["status"] == TaskStatus.PENDING.value + + +def test_invalid_status_file_returns_existing_result(tmp_path, monkeypatch): + task_id = "invalid-status" + monkeypatch.setattr(note_router, "NOTE_OUTPUT_DIR", str(tmp_path)) + (tmp_path / f"{task_id}.status.json").write_text("{", encoding="utf-8") + (tmp_path / f"{task_id}.json").write_text('{"markdown": "done"}', encoding="utf-8") + + payload = _response_payload(note_router.get_task_status(task_id)) + + assert payload["data"]["status"] == TaskStatus.SUCCESS.value + assert payload["data"]["result"] == {"markdown": "done"} + + +def test_bilibili_downloader_accepts_skip_download_argument(): + signature = inspect.signature(BilibiliDownloader.download) + + assert "skip_download" in signature.parameters