SmileJune · SmileJune · Jun 1, 2026 · Jun 1, 2026
diff --git a/apps/backend/app/crawler/sitemap.py b/apps/backend/app/crawler/sitemap.py
@@ -27,6 +27,7 @@
     "nhn-cloud-meetup": re.compile(r"/posts/\d+/?$"),
     "upstage-blog": re.compile(r"/blog/ko/[^/]+/?$"),
 }
+# Suffix that normalize_nhn_cloud_title() appends to NHN Cloud post titles.
+NHN_CLOUD_MEETUP_TITLE_SUFFIX = " : NHN Cloud Meetup"
 
 
 @dataclass(frozen=True)
@@ -139,9 +140,36 @@ def clean_title(value: str | None) -> str | None:
     return re.sub(r"\s+", " ", value).strip()
 
 
+def is_url_like_title(value: str) -> bool:
+    """Return True when *value* looks like a URL rather than a written title.
+
+    Two shapes count as URL-like:
+
+    * an absolute ``http``/``https`` URL that has a host, and
+    * a bare ``domain.tld`` optionally followed by a ``/path``.
+
+    NOTE(review): the bare-domain pattern also matches dotted names such as
+    ``Node.js``; confirm that rejecting those as titles is acceptable.
+    """
+    try:
+        parsed = urlparse(value)
+    except ValueError:
+        # urlparse raises ValueError for a malformed host, e.g. the unbalanced
+        # bracket in "https://[broken". That only happens once a "//" netloc is
+        # being parsed, so the bare-domain pattern below can never match such
+        # input; classify it by its scheme prefix instead of letting the error
+        # escape to the caller.
+        return value.lower().startswith(("http://", "https://"))
+
+    if parsed.scheme in {"http", "https"} and parsed.netloc:
+        return True
+
+    return bool(re.fullmatch(r"[\w.-]+\.[a-z]{2,}(?:/.*)?", value, flags=re.IGNORECASE))
+
+
+def clean_article_title(value: str | None) -> str | None:
+    """Clean *value* with ``clean_title`` and reject unusable results.
+
+    Returns the cleaned title, or ``None`` when the cleaned value is empty or
+    ``is_url_like_title`` reports it as URL-shaped.
+    """
+    cleaned = clean_title(value)
+    if cleaned and not is_url_like_title(cleaned):
+        return cleaned
+    return None
+
+
+def normalize_nhn_cloud_title(value: str | None) -> str | None:
+    """Return *value* as an NHN Cloud title ending in the site suffix.
+
+    The value is first passed through ``clean_article_title``. One trailing
+    ``: NHN Cloud Meetup`` or ``- NHN Cloud Meetup`` tail (with optional
+    surrounding whitespace) is then removed before
+    ``NHN_CLOUD_MEETUP_TITLE_SUFFIX`` is appended, so a title that already
+    carries the suffix does not get a second copy.
+
+    Returns ``None`` when the value is unusable or nothing remains once the
+    tail has been stripped.
+    """
+    cleaned = clean_article_title(value)
+    if cleaned:
+        base = re.sub(r"\s*(?::|-)\s*NHN Cloud Meetup\s*$", "", cleaned).strip()
+        if base:
+            return base + NHN_CLOUD_MEETUP_TITLE_SUFFIX
+    return None
+
+
 def first_heading_title(soup: BeautifulSoup) -> str | None:
     for heading in soup.find_all("h1"):
-        title = clean_title(heading.get_text(" ", strip=True))
+        title = clean_article_title(heading.get_text(" ", strip=True))
         if title:
             return title
 
@@ -176,11 +204,9 @@ def extract_nhn_cloud_payload(
     if not isinstance(post_per_lang, dict):
         raise SkippedArticleError(f"NHN Cloud post API returned no postPerLang: {entry.url}")
 
-    title = clean_title(post_per_lang.get("title"))
+    title = normalize_nhn_cloud_title(post_per_lang.get("title"))
     if not title:
-        raise ValueError(f"NHN Cloud post API returned no title: {entry.url}")
-    if "NHN Cloud Meetup" not in title:
-        title = f"{title} : NHN Cloud Meetup"
+        raise SkippedArticleError(f"NHN Cloud post API returned no usable title: {entry.url}")
 
     content_html = post_per_lang.get("content") or ""
     content_text = clean_feed_text(content_html)
@@ -210,14 +236,23 @@ def extract_article_payload(url: str, html: str, lastmod: str | None) -> dict[st
     readable_html = document.summary(html_partial=True)
     content_text = clean_feed_text(readable_html)
 
-    title = clean_title(
-        meta_content(soup, ("property", "og:title"), ("name", "twitter:title"))
-        or document.short_title()
-        or (soup.title.get_text(" ", strip=True) if soup.title else None)
-        or first_heading_title(soup)
+    title = next(
+        (
+            candidate
+            for candidate in (
+                clean_article_title(
+                    meta_content(soup, ("property", "og:title"), ("name", "twitter:title"))
+                ),
+                clean_article_title(document.short_title()),
+                clean_article_title(soup.title.get_text(" ", strip=True) if soup.title else None),
+                first_heading_title(soup),
+            )
+            if candidate
+        ),
+        None,
     )
     if not title:
-        title = url
+        raise SkippedArticleError(f"Article page returned no usable title: {url}")
 
     summary = clean_feed_text(
         meta_content(soup, ("property", "og:description"), ("name", "description")) or ""

diff --git a/apps/backend/tests/conftest.py b/apps/backend/tests/conftest.py
@@ -0,0 +1,6 @@
+import sys
+from pathlib import Path
+
+# Directory one level above this tests/ folder. It is put at the front of
+# sys.path (only once) so test modules can import the ``app`` package, e.g.
+# ``from app.crawler import sitemap``.
+BACKEND_ROOT = Path(__file__).resolve().parents[1]
+if str(BACKEND_ROOT) not in sys.path:
+    sys.path.insert(0, str(BACKEND_ROOT))
diff --git a/apps/backend/tests/test_sitemap_crawler.py b/apps/backend/tests/test_sitemap_crawler.py
@@ -0,0 +1,90 @@
+from types import SimpleNamespace
+
+import httpx
+import pytest
+
+from app.crawler import sitemap
+from app.crawler.sitemap import (
+    SitemapEntry,
+    SkippedArticleError,
+    clean_article_title,
+    extract_nhn_cloud_payload,
+)
+
+
+def nhn_source() -> SimpleNamespace:
+    """Build a minimal stand-in source object exposing only ``site_url``."""
+    source = SimpleNamespace()
+    source.site_url = "https://meetup.nhncloud.com"
+    return source
+
+
+def nhn_api_client(title: str) -> httpx.Client:
+    """Return an ``httpx.Client`` backed by a mock transport for one post.
+
+    The transport asserts that the post API URL for post 320 is requested and
+    answers 200 with a JSON body whose ``postPerLang.title`` is *title*.
+    """
+    post_per_lang = {
+        "title": title,
+        "description": "<p>요약</p>",
+        "content": "<p>본문</p>",
+    }
+    blog_post = {
+        "regId": "tech-writer",
+        "publishTime": "2026-05-01T12:00:00+09:00",
+        "postPerLang": post_per_lang,
+    }
+
+    def handler(request: httpx.Request) -> httpx.Response:
+        assert str(request.url) == "https://meetup.nhncloud.com/tcblog/v1.0/posts/320"
+        return httpx.Response(200, json={"blogPost": blog_post})
+
+    transport = httpx.MockTransport(handler)
+    return httpx.Client(transport=transport)
+
+
+def test_extract_nhn_cloud_payload_uses_post_api_title_without_duplicate_suffix():
+    """A title that already ends with the site suffix is returned unchanged."""
+    expected_title = "Go 제네릭 : NHN Cloud Meetup"
+    post_entry = SitemapEntry(url="https://meetup.nhncloud.com/posts/320")
+
+    with nhn_api_client(expected_title) as api_client:
+        result = extract_nhn_cloud_payload(api_client, nhn_source(), post_entry)
+
+    assert result["title"] == expected_title
+    assert result["summary"] == "요약"
+    assert result["content_text"] == "본문"
+    assert result["raw_metadata"]["post_id"] == "320"
+
+
+def test_extract_nhn_cloud_payload_rejects_url_title():
+    """A post whose API title is just its own URL is skipped."""
+    post_url = "https://meetup.nhncloud.com/posts/320"
+    post_entry = SitemapEntry(url=post_url)
+
+    with nhn_api_client(post_url) as api_client, pytest.raises(SkippedArticleError):
+        extract_nhn_cloud_payload(api_client, nhn_source(), post_entry)
+
+
+def test_clean_article_title_rejects_url_like_values():
+    """Absolute URLs and bare domain/path strings are rejected; plain text is kept."""
+    url_like_values = (
+        "https://meetup.nhncloud.com/posts/320",
+        "meetup.nhncloud.com/posts/320",
+    )
+    for url_like in url_like_values:
+        assert clean_article_title(url_like) is None
+
+    assert clean_article_title("Go 제네릭") == "Go 제네릭"
+
+
+def test_extract_article_payload_skips_when_every_title_candidate_is_url(monkeypatch):
+    """The article is skipped when every title candidate is URL-shaped."""
+
+    class UrlTitleDocument:
+        """Stand-in for ``sitemap.Document`` whose short title is a URL."""
+
+        def __init__(self, html: str):
+            self.html = html
+
+        def short_title(self) -> str:
+            return "https://example.com/posts/123"
+
+        def summary(self, html_partial: bool = True) -> str:
+            return "<p>본문</p>"
+
+    page_url = "https://example.com/posts/123"
+    # og:title, <title>, and <h1> all carry URL-shaped text on purpose.
+    page_html = """
+    <html>
+      <head>
+        <meta property="og:title" content="https://example.com/posts/123" />
+        <title>https://example.com/posts/123</title>
+      </head>
+      <body><h1>example.com/posts/123</h1><p>본문</p></body>
+    </html>
+    """
+    monkeypatch.setattr(sitemap, "Document", UrlTitleDocument)
+
+    with pytest.raises(SkippedArticleError):
+        sitemap.extract_article_payload(page_url, page_html, None)
diff --git a/docs/development-log.md b/docs/development-log.md
@@ -3694,3 +3694,32 @@ API RAG 검색: /api/search?q=RAG&sort=relevance&page=1&page_size=3, total=35
 뒤로가기: /?q=RAG&sort=relevance, 1페이지 복원
 추천 검색어 RAG 클릭: /?q=RAG&sort=relevance로 URL 갱신
 ```
+
+## 76. NHN Cloud 제목 추출 보정
+
+NHN Cloud Meetup sitemap 수집에서 제목 후보가 URL처럼 들어오는 경우를 방지했습니다.
+
+보정 내용:
+
+```text
+NHN Cloud post API의 postPerLang.title을 표준 제목으로 사용
+NHN Cloud Meetup suffix 중복 방지
+http/https URL 또는 도메인/path 형태의 제목 후보는 거부
+일반 sitemap HTML 수집에서도 URL 같은 제목을 article title로 저장하지 않고 skip
+```
+
+현재 로컬/운영 DB에서 `nhn-cloud-meetup` source의 URL 형태 title은 0건임을 확인했습니다.
+
+검증:
+
+```bash
+uv run pytest tests/test_sitemap_crawler.py
+uv run ruff check app/crawler/sitemap.py tests/conftest.py tests/test_sitemap_crawler.py
+```
+
+결과:
+
+```text
+4 passed
+All checks passed
+```