Skip to content

Commit 1709b87

Browse files
authored
Merge pull request #7 from DanMeon/fix/listitem-in-furniture-plain-text
fix(ir): 캡션/각주/미주 평문화에서 ListItemBlock 누락 정정
2 parents cfa63dc + f3da6dc commit 1709b87

6 files changed

Lines changed: 294 additions & 53 deletions

File tree

python/rhwp/cli/ir.py

Lines changed: 5 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
import rhwp
1919
from rhwp.cli._state import is_quiet
20+
from rhwp.ir._plain_text import join_inline_blocks
2021
from rhwp.ir.nodes import (
2122
Block,
2223
CaptionBlock,
@@ -113,9 +114,7 @@ def blocks_cmd(
113114
"--format",
114115
help="출력 포맷 (ndjson/json/text).",
115116
),
116-
limit: int | None = typer.Option(
117-
None, "--limit", help="최대 출고 개수 (None = 전체)."
118-
),
117+
limit: int | None = typer.Option(None, "--limit", help="최대 출고 개수 (None = 전체)."),
119118
) -> None:
120119
if not path.exists():
121120
typer.echo(f"file not found: {path}", err=True)
@@ -188,40 +187,20 @@ def _block_to_text(block: Block) -> str:
188187
return block.text
189188
if isinstance(block, PictureBlock):
190189
if block.caption is not None:
191-
cap = _caption_plain(block.caption)
190+
cap = join_inline_blocks(block.caption.blocks)
192191
if cap:
193192
return cap
194193
return block.description or ""
195194
if isinstance(block, FormulaBlock):
196195
return block.text_alt or block.script
197-
if isinstance(block, (FootnoteBlock, EndnoteBlock)):
198-
return "\n".join(b.text for b in block.blocks if isinstance(b, ParagraphBlock) and b.text)
196+
if isinstance(block, (FootnoteBlock, EndnoteBlock, CaptionBlock)):
197+
return join_inline_blocks(block.blocks)
199198
if isinstance(block, ListItemBlock):
200199
return f"{block.marker} {block.text}".strip()
201-
if isinstance(block, CaptionBlock):
202-
return _caption_plain(block)
203200
if isinstance(block, TocBlock):
204201
return "\n".join(e.text for e in block.entries if e.text)
205202
if isinstance(block, FieldBlock):
206203
return block.cached_value or ""
207204
# ^ 새 Block variant 추가 시 위 분기를 먼저 확장 — UnknownBlock 폴백은 빈 텍스트
208205
assert isinstance(block, UnknownBlock)
209206
return ""
210-
211-
212-
def _caption_plain(caption: CaptionBlock) -> str:
213-
"""CaptionBlock.blocks 평문 추출 — Paragraph + Formula(text_alt|script) 결합.
214-
215-
LangChain loader (_caption_plain_text) 와 의도적 동일 정책 — RAG 일관성 보존.
216-
"""
217-
parts: list[str] = []
218-
for b in caption.blocks:
219-
if isinstance(b, ParagraphBlock) and b.text:
220-
parts.append(b.text)
221-
elif isinstance(b, FormulaBlock):
222-
text = b.text_alt or b.script
223-
if text:
224-
parts.append(text)
225-
elif isinstance(b, FieldBlock) and b.cached_value:
226-
parts.append(b.cached_value)
227-
return "\n".join(parts)

python/rhwp/integrations/langchain.py

Lines changed: 10 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from langchain_core.documents import Document
2727

2828
import rhwp
29+
from rhwp.ir._plain_text import join_inline_blocks
2930
from rhwp.ir.nodes import (
3031
Block,
3132
CaptionBlock,
@@ -185,7 +186,9 @@ def _block_to_content_and_meta(block: Block) -> tuple[str, dict[str, Any]]:
185186
# caption 은 v0.2.0 호환 평문 우선, 없으면 caption_block.blocks 평문 폴백
186187
# (PictureBlock 분기와 대칭 — caption 정보 손실 회피).
187188
caption_text = block.caption or (
188-
_caption_plain_text(block.caption_block) if block.caption_block is not None else None
189+
join_inline_blocks(block.caption_block.blocks)
190+
if block.caption_block is not None
191+
else None
189192
)
190193
return block.html, {
191194
"kind": "table",
@@ -200,7 +203,7 @@ def _block_to_content_and_meta(block: Block) -> tuple[str, dict[str, Any]]:
200203
# ^ caption.blocks 평문 우선 (S3 구조화), 없으면 description (S1 호환).
201204
# image meta 는 RAG 가 picture 를 별도 색인할 때 활용. 빈 content 는
202205
# lazy_load 상위에서 strip 후 skip.
203-
caption_text = _caption_plain_text(block.caption) if block.caption is not None else ""
206+
caption_text = join_inline_blocks(block.caption.blocks) if block.caption is not None else ""
204207
content = caption_text or (block.description or "")
205208
meta: dict[str, Any] = {
206209
"kind": "picture",
@@ -222,11 +225,11 @@ def _block_to_content_and_meta(block: Block) -> tuple[str, dict[str, Any]]:
222225
"inline": block.inline,
223226
}
224227
if isinstance(block, (FootnoteBlock, EndnoteBlock)):
225-
# ^ 각주/미주 본문 paragraphs 의 평문을 합쳐 content 로. marker_prov 는 본문 인용
226-
# 위치를 별도 메타로 노출 — RAG 가 "이 각주는 어디 paragraph 에서 인용됐나" 역추적
227-
text_parts = [b.text for b in block.blocks if isinstance(b, ParagraphBlock) and b.text]
228+
# ^ 각주/미주 본문의 인라인-스러운 블록 (Paragraph/ListItem/Formula/Field) 평문을
229+
# 결합. marker_prov 는 본문 인용 위치를 별도 메타로 노출 — RAG 가 "이 각주는
230+
# 어디 paragraph 에서 인용됐나" 역추적.
228231
kind_label = "footnote" if isinstance(block, FootnoteBlock) else "endnote"
229-
return "\n".join(text_parts), {
232+
return join_inline_blocks(block.blocks), {
230233
"kind": kind_label,
231234
"section_idx": block.prov.section_idx,
232235
"para_idx": block.prov.para_idx,
@@ -248,7 +251,7 @@ def _block_to_content_and_meta(block: Block) -> tuple[str, dict[str, Any]]:
248251
if isinstance(block, CaptionBlock):
249252
# ^ 단독 CaptionBlock 은 거의 없음 (Picture/Table 자식). 명시적으로 body 에
250253
# 넣은 사용자 경로만 — direction 메타로 노출.
251-
return _caption_plain_text(block), {
254+
return join_inline_blocks(block.blocks), {
252255
"kind": "caption",
253256
"section_idx": block.prov.section_idx,
254257
"para_idx": block.prov.para_idx,
@@ -283,23 +286,3 @@ def _block_to_content_and_meta(block: Block) -> tuple[str, dict[str, Any]]:
283286
"section_idx": block.prov.section_idx,
284287
"para_idx": block.prov.para_idx,
285288
}
286-
287-
288-
def _caption_plain_text(caption: CaptionBlock) -> str:
289-
"""CaptionBlock.blocks 의 텍스트 표현을 개행 결합 (S3 신규 헬퍼).
290-
291-
포함 대상: ParagraphBlock.text + FormulaBlock.text_alt|script + FieldBlock.cached_value.
292-
캡션 안의 수식·필드도 평문 흐름의 일부 (spec § 5 "캡션 안의 인라인 수식·필드도
293-
자연스럽게 표현") — RAG 색인에 자연 포함. 표/그림 등 구조 블록은 별도 색인.
294-
"""
295-
parts: list[str] = []
296-
for b in caption.blocks:
297-
if isinstance(b, ParagraphBlock) and b.text:
298-
parts.append(b.text)
299-
elif isinstance(b, FormulaBlock):
300-
text = b.text_alt or b.script
301-
if text:
302-
parts.append(text)
303-
elif isinstance(b, FieldBlock) and b.cached_value:
304-
parts.append(b.cached_value)
305-
return "\n".join(parts)

python/rhwp/ir/_plain_text.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
"""Block 컨테이너 → 평문 변환 헬퍼 — LangChain integration / CLI 공유 SSOT.
2+
3+
캡션·각주·미주 같은 컨테이너 블록의 inner blocks 를 평문으로 합칠 때 사용한다.
4+
RAG 색인에 자연 포함되는 인라인-스러운 블록만 처리한다.
5+
6+
처리 대상 (각 블록의 평문 표현):
7+
8+
- ``ParagraphBlock`` → ``text``
9+
- ``ListItemBlock`` → ``"{marker} {text}"`` — 목록 항목 단위 색인
10+
- ``FormulaBlock`` → ``text_alt`` 우선, 없으면 ``script`` (RAG 폴백)
11+
- ``FieldBlock`` → ``cached_value`` (없으면 None)
12+
13+
처리 안 함 (별도 블록으로 색인되어야 하는 구조 블록):
14+
15+
- ``TableBlock`` / ``PictureBlock`` / ``TocBlock`` / 중첩 컨테이너 등
16+
"""
17+
18+
from rhwp.ir.nodes import (
19+
Block,
20+
FieldBlock,
21+
FormulaBlock,
22+
ListItemBlock,
23+
ParagraphBlock,
24+
)
25+
26+
27+
def block_inline_text(block: Block) -> str | None:
    """Return the plain-text form of a single inline-like block.

    Supported block types and the text taken from each:

    - ``ParagraphBlock``: ``text``
    - ``ListItemBlock``: ``marker`` and ``text`` joined by one space, stripped
    - ``FormulaBlock``: ``text_alt``, falling back to ``script``
    - ``FieldBlock``: ``cached_value``

    Returns ``None`` for any other block type and also when the extracted
    text is empty, so callers can skip both cases with one truthiness test.
    """
    if isinstance(block, ParagraphBlock):
        text = block.text
    elif isinstance(block, ListItemBlock):
        # ^ strip() so a missing marker or missing text leaves no stray space
        text = f"{block.marker} {block.text}".strip()
    elif isinstance(block, FormulaBlock):
        text = block.text_alt or block.script
    elif isinstance(block, FieldBlock):
        text = block.cached_value
    else:
        return None
    # ^ collapse every falsy result ("" or None) to None
    return text or None
42+
43+
44+
def join_inline_blocks(blocks: list[Block]) -> str:
    r"""Concatenate the inline text of ``blocks`` using ``\n`` as separator.

    Each block is converted with ``block_inline_text``. Blocks for which that
    returns a falsy value (``None``) are left out, so they contribute no
    blank lines to the result.
    """
    lines: list[str] = []
    for item in blocks:
        rendered = block_inline_text(item)
        if rendered:
            lines.append(rendered)
    return "\n".join(lines)

tests/test_cli.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,3 +296,27 @@ def test_chunks_missing_file_exit_1(tmp_path: Path) -> None:
296296
pytest.importorskip("langchain_text_splitters")
297297
result = _run("chunks", str(tmp_path / "missing.hwp"))
298298
assert result.exit_code == 1
299+
300+
301+
# * footnote/caption 평문화 회귀 — ListItemBlock 누락 방지 (--format text)
302+
#
303+
# ``rhwp.ir._plain_text.join_inline_blocks`` 도입 전에는 footnote/caption 안의
304+
# ListItemBlock 이 평문에 포함되지 않았다. CLI ``--format text`` 도 동일한 누락이
305+
# 있었으므로 같은 회귀를 가드한다.
306+
307+
308+
def test_block_to_text_includes_list_items_in_footnote() -> None:
    """A list item inside a footnote appears in ``_block_to_text`` output."""
    from rhwp.cli.ir import _block_to_text
    from rhwp.ir.nodes import FootnoteBlock, ListItemBlock, ParagraphBlock, Provenance

    origin = Provenance(section_idx=0, para_idx=0)
    lead_in = ParagraphBlock(text="참고:", prov=origin)
    first_item = ListItemBlock(text="첫째", marker="1.", enumerated=True, prov=origin)
    note = FootnoteBlock(
        number=1,
        marker_prov=origin,
        prov=origin,
        blocks=[lead_in, first_item],
    )

    rendered = _block_to_text(note)
    assert rendered == "참고:\n1. 첫째"

tests/test_ir_plain_text.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
"""rhwp.ir._plain_text 단위 테스트 — 컨테이너 평문화 헬퍼.
2+
3+
캡션·각주·미주의 inner blocks 평문화에 ``ListItemBlock`` / ``FormulaBlock`` /
4+
``FieldBlock`` 이 포함되는지 검증 (이전엔 ``ParagraphBlock`` 만 잡아 누락).
5+
"""
6+
7+
from rhwp.ir._plain_text import block_inline_text, join_inline_blocks
8+
from rhwp.ir.nodes import (
9+
Block,
10+
CaptionBlock,
11+
FieldBlock,
12+
FormulaBlock,
13+
ImageRef,
14+
ListItemBlock,
15+
ParagraphBlock,
16+
PictureBlock,
17+
Provenance,
18+
TableBlock,
19+
UnknownBlock,
20+
)
21+
22+
_PROV = Provenance(section_idx=0, para_idx=0)
23+
24+
25+
# * block_inline_text — 인라인-스러운 블록만 평문 반환
26+
27+
28+
def test_paragraph_with_text() -> None:
    """A paragraph with text yields that text unchanged."""
    paragraph = ParagraphBlock(text="hello", prov=_PROV)
    assert block_inline_text(paragraph) == "hello"
30+
31+
32+
def test_paragraph_empty_returns_none() -> None:
    """A paragraph whose text is the empty string yields ``None``."""
    blank = ParagraphBlock(text="", prov=_PROV)
    assert block_inline_text(blank) is None
34+
35+
36+
def test_list_item_includes_marker() -> None:
    """A list item is rendered as its marker, a space, then its text."""
    item = ListItemBlock(text="첫 항목", marker="1.", enumerated=True, prov=_PROV)
    rendered = block_inline_text(item)
    assert rendered == "1. 첫 항목"
39+
40+
41+
def test_list_item_empty_text_with_marker_returns_marker() -> None:
    """A list item with a marker but no body text yields the bare marker."""
    # ^ the marker is kept rather than dropped, which preserves the
    #   alignment/ordering information it carries
    marker_only = ListItemBlock(text="", marker="•", enumerated=False, prov=_PROV)
    rendered = block_inline_text(marker_only)
    assert rendered == "•"
45+
46+
47+
def test_list_item_fully_empty_returns_none() -> None:
    """A list item with neither marker nor text yields ``None``."""
    hollow = ListItemBlock(text="", marker="", enumerated=False, prov=_PROV)
    rendered = block_inline_text(hollow)
    assert rendered is None
50+
51+
52+
def test_formula_prefers_text_alt() -> None:
    """When a formula has ``text_alt`` it wins over ``script``."""
    formula = FormulaBlock(script="1 over 2", text_alt="1 / 2", prov=_PROV)
    rendered = block_inline_text(formula)
    assert rendered == "1 / 2"
55+
56+
57+
def test_formula_falls_back_to_script() -> None:
    """Without ``text_alt`` a formula is rendered as its ``script``."""
    formula = FormulaBlock(script="x^2", text_alt=None, prov=_PROV)
    rendered = block_inline_text(formula)
    assert rendered == "x^2"
60+
61+
62+
def test_formula_empty_returns_none() -> None:
    """A formula with an empty script and no ``text_alt`` yields ``None``."""
    # ^ an empty script is not normally emitted; this guards against
    #   corrupted input
    degenerate = FormulaBlock(script="", text_alt=None, prov=_PROV)
    rendered = block_inline_text(degenerate)
    assert rendered is None
66+
67+
68+
def test_field_with_cached_value() -> None:
    """A field is rendered as its ``cached_value``."""
    date_field = FieldBlock(field_kind="date", cached_value="2026-04-28", prov=_PROV)
    rendered = block_inline_text(date_field)
    assert rendered == "2026-04-28"
71+
72+
73+
def test_field_without_cached_value_returns_none() -> None:
    """A field whose ``cached_value`` is ``None`` yields ``None``."""
    link_field = FieldBlock(field_kind="hyperlink", cached_value=None, prov=_PROV)
    rendered = block_inline_text(link_field)
    assert rendered is None
76+
77+
78+
def test_structural_blocks_return_none() -> None:
    """Table and picture blocks have no inline text."""
    # ^ Table / Picture are structural blocks: excluded from plain-text
    #   flattening because they are indexed separately
    table = TableBlock(rows=1, cols=1, prov=_PROV)
    assert block_inline_text(table) is None

    picture = PictureBlock(image=ImageRef(uri="bin://1", mime_type="image/png"), prov=_PROV)
    assert block_inline_text(picture) is None
87+
88+
89+
def test_unknown_block_returns_none() -> None:
    """An ``UnknownBlock`` has no inline text."""
    unknown = UnknownBlock(kind="future_kind", prov=_PROV)
    assert block_inline_text(unknown) is None
91+
92+
93+
# * join_inline_blocks — 캡션·각주·미주 본문 평문화
94+
95+
96+
def test_join_empty_list() -> None:
    """Joining no blocks produces the empty string."""
    no_blocks: list[Block] = []
    assert join_inline_blocks(no_blocks) == ""
98+
99+
100+
def test_join_skips_blocks_with_no_inline_text() -> None:
    """Structural and empty blocks are skipped when joining."""
    # ^ key regression: only inline text is extracted even when structural
    #   blocks such as TableBlock are mixed in
    body = ParagraphBlock(text="본문", prov=_PROV)
    table = TableBlock(rows=1, cols=1, prov=_PROV)
    blank = ParagraphBlock(text="", prov=_PROV)  # ^ empty paragraph is skipped
    mixed: list[Block] = [body, table, blank]

    assert join_inline_blocks(mixed) == "본문"
108+
109+
110+
def test_join_includes_list_item_in_caption_or_footnote() -> None:
    """Regression test for dropped ``ListItemBlock`` entries.

    Lists inside footnotes, endnotes and captions must appear in the plain
    text. The previous implementation only checked
    ``isinstance(b, ParagraphBlock)``, so a paragraph converted to a
    ``ListItemBlock`` (non-None ``ParaShape.head_type``) was dropped entirely.
    """
    heading = ParagraphBlock(text="머리말", prov=_PROV)
    first = ListItemBlock(text="첫째", marker="1.", enumerated=True, prov=_PROV)
    second = ListItemBlock(text="둘째", marker="2.", enumerated=True, prov=_PROV)
    sequence: list[Block] = [heading, first, second]

    joined = join_inline_blocks(sequence)
    assert joined == "머리말\n1. 첫째\n2. 둘째"
122+
123+
124+
def test_join_mixes_paragraph_listitem_formula_field() -> None:
    """All four inline-like block types are joined in input order."""
    intro = ParagraphBlock(text="식:", prov=_PROV)
    formula = FormulaBlock(script="x+y", text_alt=None, prov=_PROV)
    date_field = FieldBlock(field_kind="date", cached_value="2026-04-28", prov=_PROV)
    bullet = ListItemBlock(text="결론", marker="•", enumerated=False, prov=_PROV)
    sequence: list[Block] = [intro, formula, date_field, bullet]

    joined = join_inline_blocks(sequence)
    assert joined == "식:\nx+y\n2026-04-28\n• 결론"
132+
133+
134+
def test_join_caption_blocks_works_via_attribute() -> None:
    """Call-site usage pattern: pass ``caption.blocks`` straight through."""
    label = ParagraphBlock(text="<그림 1>", prov=_PROV)
    formula = FormulaBlock(script="E=mc^2", text_alt=None, prov=_PROV)
    caption = CaptionBlock(blocks=[label, formula], direction="bottom", prov=_PROV)

    joined = join_inline_blocks(caption.blocks)
    assert joined == "<그림 1>\nE=mc^2"

0 commit comments

Comments
 (0)