Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 5 additions & 26 deletions python/rhwp/cli/ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import rhwp
from rhwp.cli._state import is_quiet
from rhwp.ir._plain_text import join_inline_blocks
from rhwp.ir.nodes import (
Block,
CaptionBlock,
Expand Down Expand Up @@ -113,9 +114,7 @@ def blocks_cmd(
"--format",
help="출력 포맷 (ndjson/json/text).",
),
limit: int | None = typer.Option(
None, "--limit", help="최대 출고 개수 (None = 전체)."
),
limit: int | None = typer.Option(None, "--limit", help="최대 출고 개수 (None = 전체)."),
) -> None:
if not path.exists():
typer.echo(f"file not found: {path}", err=True)
Expand Down Expand Up @@ -188,40 +187,20 @@ def _block_to_text(block: Block) -> str:
return block.text
if isinstance(block, PictureBlock):
if block.caption is not None:
cap = _caption_plain(block.caption)
cap = join_inline_blocks(block.caption.blocks)
if cap:
return cap
return block.description or ""
if isinstance(block, FormulaBlock):
return block.text_alt or block.script
if isinstance(block, (FootnoteBlock, EndnoteBlock)):
return "\n".join(b.text for b in block.blocks if isinstance(b, ParagraphBlock) and b.text)
if isinstance(block, (FootnoteBlock, EndnoteBlock, CaptionBlock)):
return join_inline_blocks(block.blocks)
if isinstance(block, ListItemBlock):
return f"{block.marker} {block.text}".strip()
if isinstance(block, CaptionBlock):
return _caption_plain(block)
if isinstance(block, TocBlock):
return "\n".join(e.text for e in block.entries if e.text)
if isinstance(block, FieldBlock):
return block.cached_value or ""
# ^ 새 Block variant 추가 시 위 분기를 먼저 확장 — UnknownBlock 폴백은 빈 텍스트
assert isinstance(block, UnknownBlock)
return ""


def _caption_plain(caption: CaptionBlock) -> str:
    """Flatten ``CaptionBlock.blocks`` into newline-joined plain text.

    Contributes paragraph text, formula text (``text_alt`` first, then
    ``script``) and field ``cached_value``; every other block kind and every
    empty value is skipped.

    Intentionally the same policy as the LangChain loader's
    ``_caption_plain_text`` so that RAG output stays consistent.
    """
    lines: list[str] = []
    for inner in caption.blocks:
        if isinstance(inner, ParagraphBlock) and inner.text:
            fragment = inner.text
        elif isinstance(inner, FormulaBlock):
            # ^ may still be empty when both text_alt and script are falsy
            fragment = inner.text_alt or inner.script
        elif isinstance(inner, FieldBlock) and inner.cached_value:
            fragment = inner.cached_value
        else:
            continue
        if fragment:
            lines.append(fragment)
    return "\n".join(lines)
37 changes: 10 additions & 27 deletions python/rhwp/integrations/langchain.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from langchain_core.documents import Document

import rhwp
from rhwp.ir._plain_text import join_inline_blocks
from rhwp.ir.nodes import (
Block,
CaptionBlock,
Expand Down Expand Up @@ -185,7 +186,9 @@ def _block_to_content_and_meta(block: Block) -> tuple[str, dict[str, Any]]:
# caption 은 v0.2.0 호환 평문 우선, 없으면 caption_block.blocks 평문 폴백
# (PictureBlock 분기와 대칭 — caption 정보 손실 회피).
caption_text = block.caption or (
_caption_plain_text(block.caption_block) if block.caption_block is not None else None
join_inline_blocks(block.caption_block.blocks)
if block.caption_block is not None
else None
)
return block.html, {
"kind": "table",
Expand All @@ -200,7 +203,7 @@ def _block_to_content_and_meta(block: Block) -> tuple[str, dict[str, Any]]:
# ^ caption.blocks 평문 우선 (S3 구조화), 없으면 description (S1 호환).
# image meta 는 RAG 가 picture 를 별도 색인할 때 활용. 빈 content 는
# lazy_load 상위에서 strip 후 skip.
caption_text = _caption_plain_text(block.caption) if block.caption is not None else ""
caption_text = join_inline_blocks(block.caption.blocks) if block.caption is not None else ""
content = caption_text or (block.description or "")
meta: dict[str, Any] = {
"kind": "picture",
Expand All @@ -222,11 +225,11 @@ def _block_to_content_and_meta(block: Block) -> tuple[str, dict[str, Any]]:
"inline": block.inline,
}
if isinstance(block, (FootnoteBlock, EndnoteBlock)):
# ^ 각주/미주 본문 paragraphs 의 평문을 합쳐 content 로. marker_prov 는 본문 인용
# 위치를 별도 메타로 노출 — RAG 가 "이 각주는 어디 paragraph 에서 인용됐나" 역추적
text_parts = [b.text for b in block.blocks if isinstance(b, ParagraphBlock) and b.text]
# ^ 각주/미주 본문의 인라인-스러운 블록 (Paragraph/ListItem/Formula/Field) 평문을
# 결합. marker_prov 는 본문 인용 위치를 별도 메타로 노출 — RAG 가 "이 각주는
# 어디 paragraph 에서 인용됐나" 역추적.
kind_label = "footnote" if isinstance(block, FootnoteBlock) else "endnote"
return "\n".join(text_parts), {
return join_inline_blocks(block.blocks), {
"kind": kind_label,
"section_idx": block.prov.section_idx,
"para_idx": block.prov.para_idx,
Expand All @@ -248,7 +251,7 @@ def _block_to_content_and_meta(block: Block) -> tuple[str, dict[str, Any]]:
if isinstance(block, CaptionBlock):
# ^ 단독 CaptionBlock 은 거의 없음 (Picture/Table 자식). 명시적으로 body 에
# 넣은 사용자 경로만 — direction 메타로 노출.
return _caption_plain_text(block), {
return join_inline_blocks(block.blocks), {
"kind": "caption",
"section_idx": block.prov.section_idx,
"para_idx": block.prov.para_idx,
Expand Down Expand Up @@ -283,23 +286,3 @@ def _block_to_content_and_meta(block: Block) -> tuple[str, dict[str, Any]]:
"section_idx": block.prov.section_idx,
"para_idx": block.prov.para_idx,
}


def _caption_plain_text(caption: CaptionBlock) -> str:
    """Join the textual representation of ``CaptionBlock.blocks`` with newlines.

    Included: ``ParagraphBlock.text``, ``FormulaBlock.text_alt`` (falling back
    to ``script``) and ``FieldBlock.cached_value``. Formulas and fields inside
    a caption are treated as part of the plain-text flow (spec § 5) so they are
    naturally included in RAG indexing. Structural blocks such as tables and
    pictures are not handled here.
    """

    def _fragments():
        # ^ lazily yields each non-empty text fragment in document order
        for child in caption.blocks:
            if isinstance(child, ParagraphBlock) and child.text:
                yield child.text
            elif isinstance(child, FormulaBlock):
                if rendered := (child.text_alt or child.script):
                    yield rendered
            elif isinstance(child, FieldBlock) and child.cached_value:
                yield child.cached_value

    return "\n".join(_fragments())
51 changes: 51 additions & 0 deletions python/rhwp/ir/_plain_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""Block 컨테이너 → 평문 변환 헬퍼 — LangChain integration / CLI 공유 SSOT.

캡션·각주·미주 같은 컨테이너 블록의 inner blocks 를 평문으로 합칠 때 사용한다.
RAG 색인에 자연 포함되는 인라인-스러운 블록만 처리한다.

처리 대상 (각 블록의 평문 표현):

- ``ParagraphBlock`` → ``text``
- ``ListItemBlock`` → ``"{marker} {text}"`` — 목록 항목 단위 색인
- ``FormulaBlock`` → ``text_alt`` 우선, 없으면 ``script`` (RAG 폴백)
- ``FieldBlock`` → ``cached_value`` (없으면 None)

처리 안 함 (별도 블록으로 색인되어야 하는 구조 블록):

- ``TableBlock`` / ``PictureBlock`` / ``TocBlock`` / 중첩 컨테이너 등
"""

from rhwp.ir.nodes import (
Block,
FieldBlock,
FormulaBlock,
ListItemBlock,
ParagraphBlock,
)


def block_inline_text(block: Block) -> str | None:
    """Return the plain text of a single inline-like block, or ``None``.

    ``None`` is returned both for block types that are not handled here and
    for handled blocks whose text is empty, so callers can skip both cases
    with a single ``if text:`` check.
    """
    if isinstance(block, ParagraphBlock):
        rendered = block.text
    elif isinstance(block, ListItemBlock):
        # ^ strip() removes the stray space left when marker or text is empty
        rendered = f"{block.marker} {block.text}".strip()
    elif isinstance(block, FormulaBlock):
        rendered = block.text_alt or block.script
    elif isinstance(block, FieldBlock):
        rendered = block.cached_value
    else:
        return None
    # ^ normalise every falsy value (empty string / None) to None
    return rendered or None


def join_inline_blocks(blocks: list[Block]) -> str:
    r"""Join the inline text of each block in ``blocks`` with ``\n``.

    Used to flatten caption / footnote / endnote bodies. Blocks for which
    ``block_inline_text`` returns ``None`` (non-inline or empty) are skipped so
    the result contains no blank-line noise.
    """
    rendered = (block_inline_text(item) for item in blocks)
    # ^ filter(None, ...) drops every falsy entry, i.e. the None results
    return "\n".join(filter(None, rendered))
24 changes: 24 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,3 +296,27 @@ def test_chunks_missing_file_exit_1(tmp_path: Path) -> None:
pytest.importorskip("langchain_text_splitters")
result = _run("chunks", str(tmp_path / "missing.hwp"))
assert result.exit_code == 1


# * footnote/caption 평문화 회귀 — ListItemBlock 누락 방지 (--format text)
#
# ``rhwp.ir._plain_text.join_inline_blocks`` 도입 전에는 footnote/caption 안의
# ListItemBlock 이 평문에 포함되지 않았다. CLI ``--format text`` 도 동일한 누락이
# 있었으므로 같은 회귀를 가드한다.


def test_block_to_text_includes_list_items_in_footnote() -> None:
    """A footnote's list items appear in the CLI plain-text rendering."""
    from rhwp.cli.ir import _block_to_text
    from rhwp.ir.nodes import FootnoteBlock, ListItemBlock, ParagraphBlock, Provenance

    origin = Provenance(section_idx=0, para_idx=0)
    intro = ParagraphBlock(text="참고:", prov=origin)
    item = ListItemBlock(text="첫째", marker="1.", enumerated=True, prov=origin)
    note = FootnoteBlock(
        number=1,
        marker_prov=origin,
        prov=origin,
        blocks=[intro, item],
    )

    expected = "참고:\n1. 첫째"
    assert _block_to_text(note) == expected
144 changes: 144 additions & 0 deletions tests/test_ir_plain_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
"""rhwp.ir._plain_text 단위 테스트 — 컨테이너 평문화 헬퍼.

캡션·각주·미주의 inner blocks 평문화에 ``ListItemBlock`` / ``FormulaBlock`` /
``FieldBlock`` 이 포함되는지 검증 (이전엔 ``ParagraphBlock`` 만 잡아 누락).
"""

from rhwp.ir._plain_text import block_inline_text, join_inline_blocks
from rhwp.ir.nodes import (
Block,
CaptionBlock,
FieldBlock,
FormulaBlock,
ImageRef,
ListItemBlock,
ParagraphBlock,
PictureBlock,
Provenance,
TableBlock,
UnknownBlock,
)

_PROV = Provenance(section_idx=0, para_idx=0)


# * block_inline_text — 인라인-스러운 블록만 평문 반환


def test_paragraph_with_text() -> None:
    """A non-empty paragraph yields its text unchanged."""
    paragraph = ParagraphBlock(text="hello", prov=_PROV)
    assert block_inline_text(paragraph) == "hello"


def test_paragraph_empty_returns_none() -> None:
    """A paragraph with empty text yields ``None``."""
    paragraph = ParagraphBlock(text="", prov=_PROV)
    assert block_inline_text(paragraph) is None


def test_list_item_includes_marker() -> None:
    """A list item renders as its marker, a space, then its text."""
    item = ListItemBlock(text="첫 항목", marker="1.", enumerated=True, prov=_PROV)
    rendered = block_inline_text(item)
    assert rendered == "1. 첫 항목"


def test_list_item_empty_text_with_marker_returns_marker() -> None:
    """A marker-only list item renders as the bare marker."""
    # ^ with a marker but no body the marker is kept as-is (not dropped —
    #   preserves alignment information)
    item = ListItemBlock(text="", marker="•", enumerated=False, prov=_PROV)
    rendered = block_inline_text(item)
    assert rendered == "•"


def test_list_item_fully_empty_returns_none() -> None:
    """A list item with neither marker nor text yields ``None``."""
    item = ListItemBlock(text="", marker="", enumerated=False, prov=_PROV)
    rendered = block_inline_text(item)
    assert rendered is None


def test_formula_prefers_text_alt() -> None:
    """``text_alt`` wins over ``script`` when both are present."""
    formula = FormulaBlock(script="1 over 2", text_alt="1 / 2", prov=_PROV)
    rendered = block_inline_text(formula)
    assert rendered == "1 / 2"


def test_formula_falls_back_to_script() -> None:
    """``script`` is used when ``text_alt`` is ``None``."""
    formula = FormulaBlock(script="x^2", text_alt=None, prov=_PROV)
    rendered = block_inline_text(formula)
    assert rendered == "x^2"


def test_formula_empty_returns_none() -> None:
    """A formula with no usable text yields ``None``."""
    # ^ an empty script is not normally emitted; this guards against
    #   corrupted input
    formula = FormulaBlock(script="", text_alt=None, prov=_PROV)
    rendered = block_inline_text(formula)
    assert rendered is None


def test_field_with_cached_value() -> None:
    """A field renders as its ``cached_value``."""
    field = FieldBlock(field_kind="date", cached_value="2026-04-28", prov=_PROV)
    rendered = block_inline_text(field)
    assert rendered == "2026-04-28"


def test_field_without_cached_value_returns_none() -> None:
    """A field whose ``cached_value`` is ``None`` yields ``None``."""
    field = FieldBlock(field_kind="hyperlink", cached_value=None, prov=_PROV)
    rendered = block_inline_text(field)
    assert rendered is None


def test_structural_blocks_return_none() -> None:
    """Table and picture blocks are not flattened to inline text."""
    # ^ Table / Picture are structural blocks — excluded from plain-text
    #   flattening (they are indexed separately)
    table = TableBlock(rows=1, cols=1, prov=_PROV)
    assert block_inline_text(table) is None

    image_ref = ImageRef(uri="bin://1", mime_type="image/png")
    picture = PictureBlock(image=image_ref, prov=_PROV)
    assert block_inline_text(picture) is None


def test_unknown_block_returns_none() -> None:
    """An ``UnknownBlock`` yields ``None``."""
    unknown = UnknownBlock(kind="future_kind", prov=_PROV)
    assert block_inline_text(unknown) is None


# * join_inline_blocks — 캡션·각주·미주 본문 평문화


def test_join_empty_list() -> None:
    """Joining no blocks produces the empty string."""
    joined = join_inline_blocks([])
    assert joined == ""


def test_join_skips_blocks_with_no_inline_text() -> None:
    """Structural and empty blocks contribute nothing to the joined text."""
    # ^ core regression: only inline text is extracted even when structural
    #   blocks such as TableBlock / PictureBlock are mixed in
    body = ParagraphBlock(text="본문", prov=_PROV)
    table = TableBlock(rows=1, cols=1, prov=_PROV)
    blank = ParagraphBlock(text="", prov=_PROV)  # ^ empty paragraph is skipped
    mixed: list[Block] = [body, table, blank]
    assert join_inline_blocks(mixed) == "본문"


def test_join_includes_list_item_in_caption_or_footnote() -> None:
    """Regression test: list items inside footnotes/endnotes/captions are kept.

    The previous implementation only checked ``isinstance(b, ParagraphBlock)``,
    so a paragraph converted to a ListItemBlock (non-None
    `ParaShape.head_type`) was dropped entirely.
    """
    heading = ParagraphBlock(text="머리말", prov=_PROV)
    first = ListItemBlock(text="첫째", marker="1.", enumerated=True, prov=_PROV)
    second = ListItemBlock(text="둘째", marker="2.", enumerated=True, prov=_PROV)
    sequence: list[Block] = [heading, first, second]

    expected = "머리말\n1. 첫째\n2. 둘째"
    assert join_inline_blocks(sequence) == expected


def test_join_mixes_paragraph_listitem_formula_field() -> None:
    """All four inline-like block kinds are joined in input order."""
    lead = ParagraphBlock(text="식:", prov=_PROV)
    formula = FormulaBlock(script="x+y", text_alt=None, prov=_PROV)
    stamp = FieldBlock(field_kind="date", cached_value="2026-04-28", prov=_PROV)
    bullet = ListItemBlock(text="결론", marker="•", enumerated=False, prov=_PROV)
    sequence: list[Block] = [lead, formula, stamp, bullet]

    expected = "식:\nx+y\n2026-04-28\n• 결론"
    assert join_inline_blocks(sequence) == expected


def test_join_caption_blocks_works_via_attribute() -> None:
    """Call-site usage pattern for captions: pass ``caption.blocks`` directly."""
    label = ParagraphBlock(text="<그림 1>", prov=_PROV)
    formula = FormulaBlock(script="E=mc^2", text_alt=None, prov=_PROV)
    caption = CaptionBlock(blocks=[label, formula], direction="bottom", prov=_PROV)

    expected = "<그림 1>\nE=mc^2"
    assert join_inline_blocks(caption.blocks) == expected
Loading
Loading