diff --git a/apps/backend/app/crawler/sitemap.py b/apps/backend/app/crawler/sitemap.py
index d172eb9..bb17e84 100644
--- a/apps/backend/app/crawler/sitemap.py
+++ b/apps/backend/app/crawler/sitemap.py
@@ -27,6 +27,7 @@
     "nhn-cloud-meetup": re.compile(r"/posts/\d+/?$"),
     "upstage-blog": re.compile(r"/blog/ko/[^/]+/?$"),
 }
+NHN_CLOUD_MEETUP_TITLE_SUFFIX = " : NHN Cloud Meetup"
 
 
 @dataclass(frozen=True)
@@ -139,9 +140,46 @@ def clean_title(value: str | None) -> str | None:
     return re.sub(r"\s+", " ", value).strip()
 
 
+def is_url_like_title(value: str) -> bool:
+    """Return True when a title candidate looks like a URL or a bare domain/path."""
+    try:
+        parsed = urlparse(value)
+    except ValueError:
+        # A malformed netloc (e.g. an unclosed "[") is still URL-shaped if it has a web scheme.
+        return value.lower().startswith(("http://", "https://"))
+    if parsed.scheme in {"http", "https"} and parsed.netloc:
+        return True
+
+    # Bare "host.tld[/path]" form. URLs never contain raw whitespace, so the path is \S*:
+    # a title such as "Next.js/React migration notes" must not be rejected as a URL.
+    return bool(re.fullmatch(r"[\w.-]+\.[a-z]{2,}(?:/\S*)?", value, flags=re.IGNORECASE))
+
+
+def clean_article_title(value: str | None) -> str | None:
+    """Clean a title candidate; return None when it is empty or URL-like."""
+    title = clean_title(value)
+    if not title or is_url_like_title(title):
+        return None
+    return title
+
+
+def normalize_nhn_cloud_title(value: str | None) -> str | None:
+    """Append the site suffix to a usable title, replacing any existing one."""
+    title = clean_article_title(value)
+    if not title:
+        return None
+
+    # Strip an existing ":"/"-" separated site suffix first so it is never duplicated.
+    title = re.sub(r"\s*(?::|-)\s*NHN Cloud Meetup\s*$", "", title).strip()
+    if not title:
+        return None
+
+    return f"{title}{NHN_CLOUD_MEETUP_TITLE_SUFFIX}"
+
+
 def first_heading_title(soup: BeautifulSoup) -> str | None:
     for heading in soup.find_all("h1"):
-        title = clean_title(heading.get_text(" ", strip=True))
+        title = clean_article_title(heading.get_text(" ", strip=True))
         if title:
             return title
 
@@ -176,11 +214,9 @@ def extract_nhn_cloud_payload(
     if not isinstance(post_per_lang, dict):
         raise SkippedArticleError(f"NHN Cloud post API returned no postPerLang: {entry.url}")
 
-    title = clean_title(post_per_lang.get("title"))
+    title = normalize_nhn_cloud_title(post_per_lang.get("title"))
     if not title:
-        raise ValueError(f"NHN Cloud post API returned no title: {entry.url}")
-    if "NHN Cloud Meetup" not in title:
-        title = f"{title} : NHN Cloud Meetup"
+        raise SkippedArticleError(f"NHN Cloud post API returned no usable title: {entry.url}")
 
     content_html = post_per_lang.get("content") or ""
     content_text = clean_feed_text(content_html)
@@ -210,14 +246,17 @@ def extract_article_payload(url: str, html: str, lastmod: str | None) -> dict[st
     readable_html = document.summary(html_partial=True)
     content_text = clean_feed_text(readable_html)
 
-    title = clean_title(
-        meta_content(soup, ("property", "og:title"), ("name", "twitter:title"))
-        or document.short_title()
-        or (soup.title.get_text(" ", strip=True) if soup.title else None)
+    # Try candidates from most to least specific; `or` stops at the first usable one.
+    title = (
+        clean_article_title(
+            meta_content(soup, ("property", "og:title"), ("name", "twitter:title"))
+        )
+        or clean_article_title(document.short_title())
+        or clean_article_title(soup.title.get_text(" ", strip=True) if soup.title else None)
         or first_heading_title(soup)
     )
     if not title:
-        title = url
+        raise SkippedArticleError(f"Article page returned no usable title: {url}")
 
     summary = clean_feed_text(
         meta_content(soup, ("property", "og:description"), ("name", "description")) or ""
diff --git a/apps/backend/tests/conftest.py b/apps/backend/tests/conftest.py
new file mode 100644
index 0000000..d91b6d0
--- /dev/null
+++ b/apps/backend/tests/conftest.py
@@ -0,0 +1,6 @@
+import sys
+from pathlib import Path
+
+BACKEND_ROOT = Path(__file__).resolve().parents[1]
+if str(BACKEND_ROOT) not in sys.path:
+    sys.path.insert(0, str(BACKEND_ROOT))
diff --git a/apps/backend/tests/test_sitemap_crawler.py b/apps/backend/tests/test_sitemap_crawler.py
new file mode 100644
index 0000000..ece3b8b
--- /dev/null
+++ b/apps/backend/tests/test_sitemap_crawler.py
@@ -0,0 +1,90 @@
+from types import SimpleNamespace
+
+import httpx
+import pytest
+
+from app.crawler import sitemap
+from app.crawler.sitemap import (
+    SitemapEntry,
+    SkippedArticleError,
+    clean_article_title,
+    extract_nhn_cloud_payload,
+)
+
+
+def nhn_source() -> SimpleNamespace:
+    return
SimpleNamespace(site_url="https://meetup.nhncloud.com") + + +def nhn_api_client(title: str) -> httpx.Client: + def handler(request: httpx.Request) -> httpx.Response: + assert str(request.url) == "https://meetup.nhncloud.com/tcblog/v1.0/posts/320" + return httpx.Response( + 200, + json={ + "blogPost": { + "regId": "tech-writer", + "publishTime": "2026-05-01T12:00:00+09:00", + "postPerLang": { + "title": title, + "description": "
요약
", + "content": "본문
", + }, + } + }, + ) + + return httpx.Client(transport=httpx.MockTransport(handler)) + + +def test_extract_nhn_cloud_payload_uses_post_api_title_without_duplicate_suffix(): + entry = SitemapEntry(url="https://meetup.nhncloud.com/posts/320") + + with nhn_api_client("Go 제네릭 : NHN Cloud Meetup") as client: + payload = extract_nhn_cloud_payload(client, nhn_source(), entry) + + assert payload["title"] == "Go 제네릭 : NHN Cloud Meetup" + assert payload["summary"] == "요약" + assert payload["content_text"] == "본문" + assert payload["raw_metadata"]["post_id"] == "320" + + +def test_extract_nhn_cloud_payload_rejects_url_title(): + entry = SitemapEntry(url="https://meetup.nhncloud.com/posts/320") + + with nhn_api_client("https://meetup.nhncloud.com/posts/320") as client: + with pytest.raises(SkippedArticleError): + extract_nhn_cloud_payload(client, nhn_source(), entry) + + +def test_clean_article_title_rejects_url_like_values(): + assert clean_article_title("https://meetup.nhncloud.com/posts/320") is None + assert clean_article_title("meetup.nhncloud.com/posts/320") is None + assert clean_article_title("Go 제네릭") == "Go 제네릭" + + +def test_extract_article_payload_skips_when_every_title_candidate_is_url(monkeypatch): + class UrlTitleDocument: + def __init__(self, html: str): + self.html = html + + def summary(self, html_partial: bool = True) -> str: + return "본문
" + + def short_title(self) -> str: + return "https://example.com/posts/123" + + monkeypatch.setattr(sitemap, "Document", UrlTitleDocument) + + html = """ + + + +본문
+ + """ + + with pytest.raises(SkippedArticleError): + sitemap.extract_article_payload("https://example.com/posts/123", html, None) diff --git a/docs/development-log.md b/docs/development-log.md index ef54103..6f58cb5 100644 --- a/docs/development-log.md +++ b/docs/development-log.md @@ -3694,3 +3694,32 @@ API RAG 검색: /api/search?q=RAG&sort=relevance&page=1&page_size=3, total=35 뒤로가기: /?q=RAG&sort=relevance, 1페이지 복원 추천 검색어 RAG 클릭: /?q=RAG&sort=relevance로 URL 갱신 ``` + +## 76. NHN Cloud 제목 추출 보정 + +NHN Cloud Meetup sitemap 수집에서 제목 후보가 URL처럼 들어오는 경우를 방지했습니다. + +보정 내용: + +```text +NHN Cloud post API의 postPerLang.title을 표준 제목으로 사용 +NHN Cloud Meetup suffix 중복 방지 +http/https URL 또는 도메인/path 형태의 제목 후보는 거부 +일반 sitemap HTML 수집에서도 URL 같은 제목을 article title로 저장하지 않고 skip +``` + +현재 로컬/운영 DB에서 `nhn-cloud-meetup` source의 URL 형태 title은 0건임을 확인했습니다. + +검증: + +```bash +uv run pytest tests/test_sitemap_crawler.py +uv run ruff check app/crawler/sitemap.py tests/conftest.py tests/test_sitemap_crawler.py +``` + +결과: + +```text +4 passed +All checks passed +```