Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 46 additions & 11 deletions apps/backend/app/crawler/sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"nhn-cloud-meetup": re.compile(r"/posts/\d+/?$"),
"upstage-blog": re.compile(r"/blog/ko/[^/]+/?$"),
}
# Suffix that normalize_nhn_cloud_title appends to a cleaned NHN Cloud Meetup title.
NHN_CLOUD_MEETUP_TITLE_SUFFIX = " : NHN Cloud Meetup"


@dataclass(frozen=True)
Expand Down Expand Up @@ -139,9 +140,36 @@ def clean_title(value: str | None) -> str | None:
return re.sub(r"\s+", " ", value).strip()


def is_url_like_title(value: str) -> bool:
    """Return True when *value* looks like a URL rather than a written title.

    Two shapes are treated as URL-like:

    * an absolute ``http``/``https`` URL that has a host part, and
    * a bare ``domain.tld`` optionally followed by ``/path``.

    Args:
        value: Candidate title text.

    Returns:
        True if the candidate is URL-like, otherwise False.
    """
    try:
        parsed = urlparse(value)
    except ValueError:
        # urlparse raises on unbalanced "[" / "]" in the host part (for
        # example "http://[::1"). Scraped titles are untrusted, so classify
        # the value instead of letting the error escape to the crawler.
        if value.lower().startswith(("http://", "https://")):
            return True
    else:
        if parsed.scheme in {"http", "https"} and parsed.netloc:
            return True

    # NOTE(review): this pattern also matches dotted product names such as
    # "Node.js" or "ASP.NET"; confirm that rejecting those is acceptable.
    return bool(re.fullmatch(r"[\w.-]+\.[a-z]{2,}(?:/.*)?", value, flags=re.IGNORECASE))


def clean_article_title(value: str | None) -> str | None:
    """Clean a title candidate and discard it when it is empty or URL-like.

    Args:
        value: Raw title candidate, possibly ``None``.

    Returns:
        The cleaned title, or ``None`` when nothing usable remains.
    """
    cleaned = clean_title(value)
    if cleaned and not is_url_like_title(cleaned):
        return cleaned
    return None


def normalize_nhn_cloud_title(value: str | None) -> str | None:
    """Return an NHN Cloud title carrying exactly one trailing site suffix.

    A trailing ``: NHN Cloud Meetup`` or ``- NHN Cloud Meetup`` is stripped
    before the canonical suffix is appended, so the suffix is not duplicated.

    Args:
        value: Raw title candidate, possibly ``None``.

    Returns:
        The normalized title, or ``None`` when the candidate is unusable or
        nothing is left once the suffix has been removed.
    """
    cleaned = clean_article_title(value)
    if not cleaned:
        return None

    base = re.sub(r"\s*(?::|-)\s*NHN Cloud Meetup\s*$", "", cleaned).strip()
    return f"{base}{NHN_CLOUD_MEETUP_TITLE_SUFFIX}" if base else None


def first_heading_title(soup: BeautifulSoup) -> str | None:
for heading in soup.find_all("h1"):
title = clean_title(heading.get_text(" ", strip=True))
title = clean_article_title(heading.get_text(" ", strip=True))
if title:
return title

Expand Down Expand Up @@ -176,11 +204,9 @@ def extract_nhn_cloud_payload(
if not isinstance(post_per_lang, dict):
raise SkippedArticleError(f"NHN Cloud post API returned no postPerLang: {entry.url}")

title = clean_title(post_per_lang.get("title"))
title = normalize_nhn_cloud_title(post_per_lang.get("title"))
if not title:
raise ValueError(f"NHN Cloud post API returned no title: {entry.url}")
if "NHN Cloud Meetup" not in title:
title = f"{title} : NHN Cloud Meetup"
raise SkippedArticleError(f"NHN Cloud post API returned no usable title: {entry.url}")

content_html = post_per_lang.get("content") or ""
content_text = clean_feed_text(content_html)
Expand Down Expand Up @@ -210,14 +236,23 @@ def extract_article_payload(url: str, html: str, lastmod: str | None) -> dict[st
readable_html = document.summary(html_partial=True)
content_text = clean_feed_text(readable_html)

title = clean_title(
meta_content(soup, ("property", "og:title"), ("name", "twitter:title"))
or document.short_title()
or (soup.title.get_text(" ", strip=True) if soup.title else None)
or first_heading_title(soup)
title = next(
(
candidate
for candidate in (
clean_article_title(
meta_content(soup, ("property", "og:title"), ("name", "twitter:title"))
),
clean_article_title(document.short_title()),
clean_article_title(soup.title.get_text(" ", strip=True) if soup.title else None),
first_heading_title(soup),
)
if candidate
),
None,
)
if not title:
title = url
raise SkippedArticleError(f"Article page returned no usable title: {url}")

summary = clean_feed_text(
meta_content(soup, ("property", "og:description"), ("name", "description")) or ""
Expand Down
6 changes: 6 additions & 0 deletions apps/backend/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import sys
from pathlib import Path

# Directory one level above this conftest's own folder.
BACKEND_ROOT = Path(__file__).resolve().parents[1]
# Put it at the front of the import path once, so repeated loads do not add
# duplicate entries.
if sys.path.count(str(BACKEND_ROOT)) == 0:
    sys.path[:0] = [str(BACKEND_ROOT)]
90 changes: 90 additions & 0 deletions apps/backend/tests/test_sitemap_crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from types import SimpleNamespace

import httpx
import pytest

from app.crawler import sitemap
from app.crawler.sitemap import (
SitemapEntry,
SkippedArticleError,
clean_article_title,
extract_nhn_cloud_payload,
)


def nhn_source() -> SimpleNamespace:
    """Build a minimal source stand-in exposing only ``site_url``."""
    source = SimpleNamespace()
    source.site_url = "https://meetup.nhncloud.com"
    return source


def nhn_api_client(title: str) -> httpx.Client:
    """Return an ``httpx.Client`` whose transport serves one canned post.

    Args:
        title: Value placed in ``blogPost.postPerLang.title`` of the response.

    Returns:
        A client backed by ``httpx.MockTransport``; its handler asserts that
        the requested URL is the post-320 API endpoint.
    """
    expected_url = "https://meetup.nhncloud.com/tcblog/v1.0/posts/320"
    post_per_lang = {
        "title": title,
        "description": "<p>요약</p>",
        "content": "<p>본문</p>",
    }
    blog_post = {
        "regId": "tech-writer",
        "publishTime": "2026-05-01T12:00:00+09:00",
        "postPerLang": post_per_lang,
    }

    def handler(request: httpx.Request) -> httpx.Response:
        assert str(request.url) == expected_url
        return httpx.Response(200, json={"blogPost": blog_post})

    transport = httpx.MockTransport(handler)
    return httpx.Client(transport=transport)


def test_extract_nhn_cloud_payload_uses_post_api_title_without_duplicate_suffix():
    """An API title that already ends with the site suffix is kept as-is."""
    api_title = "Go 제네릭 : NHN Cloud Meetup"
    post_entry = SitemapEntry(url="https://meetup.nhncloud.com/posts/320")

    with nhn_api_client(api_title) as client:
        result = extract_nhn_cloud_payload(client, nhn_source(), post_entry)

    expected_fields = {
        "title": "Go 제네릭 : NHN Cloud Meetup",
        "summary": "요약",
        "content_text": "본문",
    }
    for field, expected in expected_fields.items():
        assert result[field] == expected
    assert result["raw_metadata"]["post_id"] == "320"


def test_extract_nhn_cloud_payload_rejects_url_title():
    """A post whose API title is just a URL is skipped, not stored."""
    post_url = "https://meetup.nhncloud.com/posts/320"
    post_entry = SitemapEntry(url="https://meetup.nhncloud.com/posts/320")

    with nhn_api_client(post_url) as client, pytest.raises(SkippedArticleError):
        extract_nhn_cloud_payload(client, nhn_source(), post_entry)


def test_clean_article_title_rejects_url_like_values():
    """URL-shaped candidates become ``None``; ordinary titles pass through."""
    url_like_values = (
        "https://meetup.nhncloud.com/posts/320",
        "meetup.nhncloud.com/posts/320",
    )
    for candidate in url_like_values:
        assert clean_article_title(candidate) is None

    assert clean_article_title("Go 제네릭") == "Go 제네릭"


def test_extract_article_payload_skips_when_every_title_candidate_is_url(monkeypatch):
    """The article is skipped when every title candidate is URL-like."""
    html = """
<html>
<head>
<meta property="og:title" content="https://example.com/posts/123" />
<title>https://example.com/posts/123</title>
</head>
<body><h1>example.com/posts/123</h1><p>본문</p></body>
</html>
"""

    class UrlTitleDocument:
        """Stand-in for ``sitemap.Document`` whose short title is a URL."""

        def __init__(self, html: str):
            self.html = html

        def short_title(self) -> str:
            return "https://example.com/posts/123"

        def summary(self, html_partial: bool = True) -> str:
            return "<p>본문</p>"

    # Swap in the stub so the document-derived candidate is URL-like too.
    monkeypatch.setattr(sitemap, "Document", UrlTitleDocument)

    with pytest.raises(SkippedArticleError):
        sitemap.extract_article_payload("https://example.com/posts/123", html, None)
29 changes: 29 additions & 0 deletions docs/development-log.md
Original file line number Diff line number Diff line change
Expand Up @@ -3694,3 +3694,32 @@ API RAG 검색: /api/search?q=RAG&sort=relevance&page=1&page_size=3, total=35
뒤로가기: /?q=RAG&sort=relevance, 1페이지 복원
추천 검색어 RAG 클릭: /?q=RAG&sort=relevance로 URL 갱신
```

## 76. NHN Cloud 제목 추출 보정

NHN Cloud Meetup sitemap 수집에서 제목 후보가 URL처럼 들어오는 경우를 방지했습니다.

보정 내용:

```text
NHN Cloud post API의 postPerLang.title을 표준 제목으로 사용
NHN Cloud Meetup suffix 중복 방지
http/https URL 또는 도메인/path 형태의 제목 후보는 거부
일반 sitemap HTML 수집에서도 URL 같은 제목을 article title로 저장하지 않고 skip
```

현재 로컬/운영 DB에서 `nhn-cloud-meetup` source의 URL 형태 title은 0건임을 확인했습니다.

검증:

```bash
uv run pytest tests/test_sitemap_crawler.py
uv run ruff check app/crawler/sitemap.py tests/conftest.py tests/test_sitemap_crawler.py
```

결과:

```text
4 passed
All checks passed
```