diff --git a/CHANGELOG.md b/CHANGELOG.md index 14c50f7..b35ff5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,34 @@ Format: [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). Versioning: ## [Unreleased] +## [0.10.0] — 2026-06-01 + +코드 감사로 포맷별(docx/pptx/xlsx) 격차를 점검해 **Excel 지원을 신규 추가**하고, +inspect 가 병합 docx 에서 깨지던 버그와 머리말/꼬리말·노트 플레이스홀더 누락을 수정. + +### Added +- **Excel(`.xlsx`) 지원 — `XlsxAdapter`** (openpyxl 기반). 각 워크시트를 하나의 표로 + 매핑(`table_index`=시트 인덱스, `location`=시트명). `get_tables`/`get_cell`/ + `set_cell`/`append_to_cell`/`append_row`/`render_template` 구현, `fill_form` 은 + base 구현으로 자동 동작. 병합 셀 anchor/span 인지(비-anchor 쓰기는 + `MergedCellWriteError`), 셀 크기(cm) 메타 제공. `load("*.xlsx")` 자동 디스패치. + MCP 도구는 확장자 디스패치로 그대로 동작. + +### Fixed +- **docx `get_placeholders` 병합표 크래시**: `row.cells` 가 가로+세로 병합 docx 에서 + `ValueError` 로 깨져 `inspect_document`/`get_schema` 가 실패하던 문제 — + `_build_grid` anchor 셀 순회로 수정(`get_tables` 와 동일 견고 경로). +- **머리말/꼬리말·노트 플레이스홀더 누락**: docx `get_placeholders` 가 본문만 보고 + 머리말/꼬리말을, pptx 가 슬라이드 노트를 놓쳐 `render` 는 채우는데 `inspect`/ + `used`/`missing` 에 안 잡히던 불일치 수정. + +### Changed +- 런타임 의존성에 `openpyxl>=3.1` 추가. + +### Verified +- xlsx 폼(병합 헤더·라벨-값·템플릿) inspect/fill_form/render/round-trip + MCP 경로, + docx 머리말/꼬리말·pptx 노트 커버리지 회귀 테스트. 테스트 77 종, ruff·mypy 클린. + ## [0.9.0] — 2026-06-01 실제 공공서식(지급정지요청서 등)과 다운로드한 docx/hwpx 폼들로 검증하며 드러난 diff --git a/README.md b/README.md index 53c3997..6dfd92e 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,11 @@ | `.docx` | `docxtpl` + `python-docx` | Jinja2 (`{%tr%}` loop 포함) | ✅ | ✅ | ✅ | ✅ | ✅ | | `.pptx` | `python-pptx` + 자체 lxml 확장 | `{{key}}` 치환 | ✅ (슬라이드 위치 포함) | ✅ | — (포맷 미지원) | ✅ | ✅ (v0.5+) | | `.hwpx` | 자체 `hwpx_core` (lxml + zipfile) | `{{key}}` 치환 | ✅ | ✅ | ✅ | ✅ | ✅ | +| `.xlsx` | `openpyxl` | `{{key}}` 치환 | ✅ (시트 = 표) | ✅ | — | ✅ | ✅ | - HWPX는 한컴오피스 설치가 **불필요**합니다 (macOS/Linux 서버에서 그대로 동작). - 구버전 `.hwp`(바이너리 포맷)는 지원하지 않습니다 — `.hwpx`로 변환 후 사용하세요. 
+- **XLSX (v0.10+)**: 각 워크시트를 하나의 표로 매핑(`table_index` = 시트 인덱스, `location` = 시트명). 병합 셀·`fill_form`·`render_template` 동일 인터페이스로 동작. - 병합 셀: 3개 포맷 모두 preview에 `null` 슬롯 + `merges` 메타로 구조 노출. non-anchor 좌표에 쓰기는 `MergedCellWriteError`로 거부. - **셀 크기 메타 (v0.6+)**: `get_tables`는 `column_widths_cm` / `row_heights_cm`, `get_cell`은 `width_cm` / `height_cm` / `char_count`를 반환합니다. LLM이 좁은 셀(예: 1.7×0.7cm 배지)에 긴 텍스트를 넣어 오버플로 되는 것을 사전에 판단할 수 있습니다. diff --git a/document_adapter/__init__.py b/document_adapter/__init__.py index 4febdc0..c6f4811 100644 --- a/document_adapter/__init__.py +++ b/document_adapter/__init__.py @@ -15,6 +15,7 @@ from .docx_adapter import DocxAdapter from .hwpx_adapter import HwpxAdapter from .pptx_adapter import PptxAdapter +from .xlsx_adapter import XlsxAdapter __all__ = [ "load", @@ -24,12 +25,14 @@ "DocxAdapter", "PptxAdapter", "HwpxAdapter", + "XlsxAdapter", ] _ADAPTERS: dict[str, type[DocumentAdapter]] = { ".docx": DocxAdapter, ".pptx": PptxAdapter, ".hwpx": HwpxAdapter, + ".xlsx": XlsxAdapter, } diff --git a/document_adapter/docx_adapter.py b/document_adapter/docx_adapter.py index 6e3dd85..aa2e898 100644 --- a/document_adapter/docx_adapter.py +++ b/document_adapter/docx_adapter.py @@ -186,6 +186,11 @@ def get_placeholders(self) -> list[str]: keys: set[str] = set() for p in self._doc.paragraphs: keys.update(TAG_PATTERN.findall(p.text)) + # 머리말/꼬리말 (docxtpl render 는 이미 채우므로 inspect 와 일치시킨다) + for section in self._doc.sections: + for hf in (section.header, section.footer): + for p in hf.paragraphs: + keys.update(TAG_PATTERN.findall(p.text)) # 모든 (중첩 포함) 표 셀에서 수집. row.cells 는 병합표에서 깨지므로 # _build_grid 의 anchor 셀만 순회한다(get_tables 와 동일 견고 경로). 
for _, tbl, _ in self._iter_tables(): diff --git a/document_adapter/pptx_adapter.py b/document_adapter/pptx_adapter.py index 9079cff..0dcb21b 100644 --- a/document_adapter/pptx_adapter.py +++ b/document_adapter/pptx_adapter.py @@ -79,6 +79,9 @@ def _iter_text_frames(self) -> Iterator[Any]: for row in shape.table.rows: for cell in row.cells: yield cell.text_frame + # 슬라이드 노트의 {{key}} 도 포함 (get_placeholders·render 공통) + if slide.has_notes_slide: + yield slide.notes_slide.notes_text_frame @staticmethod def _dimensions(table) -> tuple[int, int]: diff --git a/document_adapter/xlsx_adapter.py b/document_adapter/xlsx_adapter.py new file mode 100644 index 0000000..77731a3 --- /dev/null +++ b/document_adapter/xlsx_adapter.py @@ -0,0 +1,212 @@ +"""XLSX 어댑터: openpyxl 기반. 각 워크시트를 하나의 표로 매핑한다. + +- table_index = 워크시트 인덱스(0-based), location = 시트 이름 +- 좌표 row/col 은 다른 어댑터와 동일하게 0-based 논리 좌표 (openpyxl 은 1-based) +- 병합 셀: top-left 가 anchor, 나머지는 non-anchor. openpyxl 은 병합 non-anchor + 셀이 읽기전용(MergedCell)이라 set_cell 은 anchor 로 redirect 한다. +- fill_form 은 base 구현이 get_tables/get_cell/set_cell 로 자동 동작한다. +""" +from __future__ import annotations + +import re +from pathlib import Path +from typing import Any + +from openpyxl import load_workbook +from openpyxl.utils import get_column_letter + +from .base import ( + CellContent, + CellOutOfBoundsError, + DocumentAdapter, + MergeInfo, + MergedCellWriteError, + TableIndexError, + TableSchema, +) + +TAG_PATTERN = re.compile(r"\{\{\s*(\w+)\s*\}\}") + +# Excel 열 너비(문자 단위) → cm 근사. Calibri 11 기준 MDW≈7px, padding 5px. +# px → cm: px/96*2.54. 
# Screen pixels per centimetre at 96 dpi (96 px per inch / 2.54 cm per inch).
_PX_PER_CM = 96 / 2.54


def _colwidth_to_cm(chars: float | None) -> float | None:
    """Approximate an Excel column width (character units) in centimetres.

    Uses ``px = chars * 7 + 5`` (7 px per character plus 5 px padding) and
    converts pixels to cm via ``_PX_PER_CM``.

    Returns:
        ``None`` when *chars* is ``None``; otherwise the width rounded to one
        decimal place.
    """
    if chars is None:
        return None
    px = chars * 7 + 5
    return round(px / _PX_PER_CM, 1)


def _rowheight_to_cm(points: float | None) -> float | None:
    """Convert a row height in points (1/72 inch) to centimetres.

    Returns:
        ``None`` when *points* is ``None``; otherwise the height rounded to
        one decimal place.
    """
    if points is None:
        return None
    return round(points / 72 * 2.54, 1)


class XlsxAdapter(DocumentAdapter):
    """openpyxl-backed adapter that exposes each worksheet as one table.

    ``table_index`` is the 0-based worksheet index and ``location`` is the
    sheet title. All ``row``/``col`` arguments are 0-based; they are shifted
    by one when talking to openpyxl, which is 1-based.

    For merged ranges the top-left cell is the anchor. Writing to any other
    cell of the range raises ``MergedCellWriteError`` unless the caller passes
    ``allow_merge_redirect=True``, in which case the write goes to the anchor.
    """

    format = "xlsx"

    def _open(self) -> None:
        """Load the workbook at ``self.path`` into ``self._wb``."""
        self._wb = load_workbook(str(self.path))

    def save(self, path: Path | str | None = None) -> Path:
        """Write the workbook to *path* (default: the current ``self.path``).

        ``self.path`` is updated to the saved location, which is returned.
        """
        target = Path(path) if path else self.path
        self._wb.save(str(target))
        self.path = target
        return target

    # ---- helpers ----
    def _ws(self, table_index: int):
        """Return the worksheet at *table_index*.

        Raises:
            TableIndexError: if the index is negative or past the last sheet.
        """
        sheets = self._wb.worksheets
        if table_index < 0 or table_index >= len(sheets):
            raise TableIndexError(f"XLSX sheet index {table_index} not found")
        return sheets[table_index]

    @staticmethod
    def _merge_map(ws) -> tuple[dict, dict]:
        """Describe the merged ranges of *ws* in 0-based coordinates.

        Returns:
            ``(anchors, covered)`` where ``anchors`` maps each anchor
            ``(row, col)`` to its ``(row_span, col_span)`` and ``covered``
            maps every non-anchor cell of a merge to its anchor.
        """
        anchors: dict[tuple[int, int], tuple[int, int]] = {}
        covered: dict[tuple[int, int], tuple[int, int]] = {}
        for rng in ws.merged_cells.ranges:
            anchor = (rng.min_row - 1, rng.min_col - 1)
            anchors[anchor] = (
                rng.max_row - rng.min_row + 1,
                rng.max_col - rng.min_col + 1,
            )
            for r in range(rng.min_row - 1, rng.max_row):
                for c in range(rng.min_col - 1, rng.max_col):
                    if (r, c) != anchor:
                        covered[(r, c)] = anchor
        return anchors, covered

    @staticmethod
    def _dims(ws) -> tuple[int, int]:
        """Return ``(rows, cols)`` from ``ws.max_row`` / ``ws.max_column``."""
        return ws.max_row or 0, ws.max_column or 0

    def _check_bounds(self, ws, row: int, col: int) -> None:
        """Reject coordinates outside the sheet's current used range.

        Raises:
            CellOutOfBoundsError: if *row* or *col* is negative or not below
                the dimensions reported by ``_dims``.
        """
        rows, cols = self._dims(ws)
        if row < 0 or col < 0 or row >= rows or col >= cols:
            raise CellOutOfBoundsError(
                f"cell ({row},{col}) out of bounds ({rows}x{cols})")

    # ---- inspection ----
    def get_placeholders(self) -> list[str]:
        """Return the sorted, de-duplicated ``{{key}}`` names in string cells
        of every worksheet."""
        keys: set[str] = set()
        for ws in self._wb.worksheets:
            for row in ws.iter_rows():
                for cell in row:
                    if isinstance(cell.value, str):
                        keys.update(TAG_PATTERN.findall(cell.value))
        return sorted(keys)

    def get_tables(self, min_rows: int = 1, min_cols: int = 1,
                   preview_rows: int = 4, max_cell_len: int = 40) -> list[TableSchema]:
        """Build one ``TableSchema`` per worksheet that is large enough.

        Args:
            min_rows: Sheets with fewer rows are skipped.
            min_cols: Sheets with fewer columns are skipped.
            preview_rows: Maximum number of leading rows put in the preview.
            max_cell_len: Preview text is truncated to this many characters.

        Returns:
            Schemas whose ``index`` is the worksheet index (skipped sheets
            keep their index, so indices may have gaps). Preview slots of
            merged non-anchor cells stay ``None``; empty cells become ``""``.
        """
        schemas: list[TableSchema] = []
        for idx, ws in enumerate(self._wb.worksheets):
            rows, cols = self._dims(ws)
            if rows < min_rows or cols < min_cols:
                continue
            anchors, covered = self._merge_map(ws)
            visible = min(rows, preview_rows)
            preview: list[list[str | None]] = [
                [None] * cols for _ in range(visible)
            ]
            for r in range(visible):
                for c in range(cols):
                    if (r, c) in covered:
                        continue  # non-anchor merge slot stays None
                    v = ws.cell(row=r + 1, column=c + 1).value
                    preview[r][c] = ("" if v is None else str(v))[:max_cell_len]
            merges = [MergeInfo(anchor=a, span=s) for a, s in anchors.items()]
            col_widths = [
                _colwidth_to_cm(ws.column_dimensions[get_column_letter(c + 1)].width)
                for c in range(cols)
            ]
            row_heights = [
                _rowheight_to_cm(ws.row_dimensions[r + 1].height)
                for r in range(rows)
            ]
            # Compare against None rather than truthiness so that a genuine
            # 0.0 measurement is not mistaken for "no size information".
            has_widths = any(w is not None for w in col_widths)
            has_heights = any(h is not None for h in row_heights)
            schemas.append(TableSchema(
                index=idx, rows=rows, cols=cols, preview=preview,
                location=ws.title, merges=merges,
                column_widths_cm=col_widths if has_widths else None,
                row_heights_cm=row_heights if has_heights else None,
            ))
        return schemas

    def get_cell(self, table_index: int, row: int, col: int) -> CellContent:
        """Read one cell, resolving merged non-anchor cells to their anchor.

        For a non-anchor cell of a merge, the text, span and size metadata
        are taken from the anchor and ``is_anchor`` is ``False``.

        Raises:
            TableIndexError: if *table_index* is not a valid sheet index.
            CellOutOfBoundsError: if the coordinate is outside the sheet.
        """
        ws = self._ws(table_index)
        self._check_bounds(ws, row, col)
        anchors, covered = self._merge_map(ws)
        if (row, col) in covered:
            anchor = covered[(row, col)]
            is_anchor, span = False, anchors[anchor]
        else:
            anchor = (row, col)
            is_anchor, span = True, anchors.get((row, col), (1, 1))
        v = ws.cell(row=anchor[0] + 1, column=anchor[1] + 1).value
        text = "" if v is None else str(v)
        width_cm = _colwidth_to_cm(
            ws.column_dimensions[get_column_letter(anchor[1] + 1)].width)
        height_cm = _rowheight_to_cm(ws.row_dimensions[anchor[0] + 1].height)
        return CellContent(
            row=row, col=col, text=text, paragraphs=[text],
            is_anchor=is_anchor, anchor=anchor, span=span,
            width_cm=width_cm, height_cm=height_cm, char_count=len(text))

    # ---- editing ----
    def render_template(self, context: dict[str, Any], *,
                        on_missing: str = "blank") -> dict[str, list[str]]:
        """Substitute ``{{key}}`` tags in every string cell of the workbook.

        Keys present in *context* are replaced by ``str(context[key])``.
        Missing keys become ``""`` when ``on_missing == "blank"``; for any
        other value the tag is left untouched. The report is produced by
        ``_render_report`` before any cell is modified and is returned.
        """
        report = self._render_report(self.get_placeholders(), context, on_missing)

        def repl(m: "re.Match[str]") -> str:
            key = m.group(1)
            if key in context:
                return str(context[key])
            return "" if on_missing == "blank" else m.group(0)

        for ws in self._wb.worksheets:
            for row in ws.iter_rows():
                for cell in row:
                    if isinstance(cell.value, str) and TAG_PATTERN.search(cell.value):
                        cell.value = TAG_PATTERN.sub(repl, cell.value)
        return report

    def _resolve_writable(self, ws, row: int, col: int,
                          allow_merge_redirect: bool) -> tuple[int, int]:
        """Return the coordinate that a write to ``(row, col)`` should hit.

        Coordinates that are not a merged non-anchor cell are returned as is.
        For a non-anchor cell, the anchor is returned when
        *allow_merge_redirect* is true.

        Raises:
            MergedCellWriteError: for a non-anchor cell when
                *allow_merge_redirect* is false.
        """
        _, covered = self._merge_map(ws)
        if (row, col) in covered:
            if not allow_merge_redirect:
                ar, ac = covered[(row, col)]
                raise MergedCellWriteError(
                    f"cell ({row},{col}) is part of a merge anchored at "
                    f"({ar},{ac}). Write to the anchor, or pass "
                    f"allow_merge_redirect=True.")
            return covered[(row, col)]
        return row, col

    def set_cell(self, table_index: int, row: int, col: int, value: str,
                 *, allow_merge_redirect: bool = False) -> str:
        """Overwrite a cell and return its previous text (``""`` if empty).

        Raises:
            TableIndexError: if *table_index* is not a valid sheet index.
            CellOutOfBoundsError: if the coordinate is outside the sheet.
            MergedCellWriteError: if the target is a merged non-anchor cell
                and *allow_merge_redirect* is false.
        """
        ws = self._ws(table_index)
        self._check_bounds(ws, row, col)
        wr, wc = self._resolve_writable(ws, row, col, allow_merge_redirect)
        cell = ws.cell(row=wr + 1, column=wc + 1)
        old = "" if cell.value is None else str(cell.value)
        cell.value = value
        return old

    def append_to_cell(self, table_index: int, row: int, col: int, value: str,
                       separator: str = " ", *,
                       allow_merge_redirect: bool = False) -> str:
        """Append *value* to a cell and return its previous text.

        When the cell already has text, the new value is joined to it with
        *separator*; an empty cell simply receives *value*.

        Raises:
            TableIndexError: if *table_index* is not a valid sheet index.
            CellOutOfBoundsError: if the coordinate is outside the sheet.
            MergedCellWriteError: if the target is a merged non-anchor cell
                and *allow_merge_redirect* is false.
        """
        ws = self._ws(table_index)
        # Same bounds contract as set_cell: without this check an
        # out-of-range coordinate would be handed straight to ws.cell().
        self._check_bounds(ws, row, col)
        wr, wc = self._resolve_writable(ws, row, col, allow_merge_redirect)
        cell = ws.cell(row=wr + 1, column=wc + 1)
        old = "" if cell.value is None else str(cell.value)
        cell.value = f"{old}{separator}{value}" if old else value
        return old

    def append_row(self, table_index: int, values: list[str]) -> None:
        """Append *values* as a new row to the sheet via ``ws.append``.

        Raises:
            TableIndexError: if *table_index* is not a valid sheet index.
        """
        ws = self._ws(table_index)
        ws.append(list(values))


# ---------------------------------------------------------------------------
# Remainder of the patch that shared these source lines. It is not Python and
# is kept verbatim as comments so that no patch content is lost.
#
# diff --git a/pyproject.toml b/pyproject.toml
# index 71399fa..b8b329d 100644
# --- a/pyproject.toml
# +++ b/pyproject.toml
# @@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
#  [project]
#  name = "document-adapter"
# -version = "0.9.0"
# -description = "LLM-friendly document template editing (DOCX/PPTX/HWPX) with MCP server and Claude API tool-use support"
# +version = "0.10.0"
# +description = "LLM-friendly document template editing (DOCX/PPTX/HWPX/XLSX) with MCP server and Claude API tool-use support"
#  readme = "README.md"
#  requires-python = ">=3.10"
#  license = { text = "MIT" }
# @@ -58,6 +58,7 @@ dependencies = [
#  "python-docx>=1.2",
#  "docxtpl>=0.20",
#  "python-pptx>=1.0",
# + "openpyxl>=3.1",
#  "lxml>=5.0",
#  "mcp>=1.0",
#  ]
# diff --git a/tests/test_scenarios.py b/tests/test_scenarios.py
# index a580df8..bfb4a1a 100644
# ---
a/tests/test_scenarios.py +++ b/tests/test_scenarios.py @@ -496,6 +496,110 @@ def test_fill_form_reports_overflow_warnings(tmp_path: Path) -> None: assert r["overflow_warnings"] == [] +def _make_xlsx_form(path: Path) -> None: + """병합 헤더 + 라벨-값 + 템플릿 키를 가진 xlsx 폼.""" + from openpyxl import Workbook + wb = Workbook() + ws = wb.active + ws.title = "신청서" + ws["A1"] = "신청 정보" + ws.merge_cells("A1:B1") + ws["A2"] = "성명" + ws["A3"] = "부서" + ws["A4"] = "제목 {{title}}" + wb.save(str(path)) + + +def test_xlsx_inspect_fill_render_roundtrip(tmp_path: Path) -> None: + """XlsxAdapter: 시트→표 인식, 병합, fill_form(base 상속), render, 영속.""" + src = tmp_path / "form.xlsx" + _make_xlsx_form(src) + ad = load(src) + assert ad.format == "xlsx" + t = ad.get_tables()[0] + assert (t.rows, t.cols) == (4, 2) + assert t.location == "신청서" + assert [(m.anchor, m.span) for m in t.merges] == [((0, 0), (1, 2))] + assert ad.get_placeholders() == ["title"] + # 병합 anchor/non-anchor + assert ad.get_cell(0, 0, 0).span == (1, 2) + assert ad.get_cell(0, 0, 1).is_anchor is False + # fill_form (base 구현이 자동 동작) + r = ad.fill_form({"성명": "홍길동", "부서": "개발팀"}) + assert len(r["filled"]) == 2 + rr = ad.render_template({"title": "2026 보고서"}) + assert rr["used"] == ["title"] + out = tmp_path / "out.xlsx" + ad.save(out) + ad.close() + + ad2 = load(out) + assert ad2.get_cell(0, 1, 1).text == "홍길동" + assert ad2.get_cell(0, 2, 1).text == "개발팀" + assert "2026 보고서" in ad2.get_cell(0, 3, 0).text + ad2.close() + + +def test_xlsx_merged_cell_write_rejected(tmp_path: Path) -> None: + """병합 non-anchor 좌표 쓰기는 MergedCellWriteError (allow_merge_redirect로 우회).""" + from document_adapter.base import MergedCellWriteError + + src = tmp_path / "f.xlsx" + _make_xlsx_form(src) + ad = load(src) + with pytest.raises(MergedCellWriteError): + ad.set_cell(0, 0, 1, "X") # (0,1)은 A1:B1 병합의 non-anchor + ad.set_cell(0, 0, 1, "X", allow_merge_redirect=True) # anchor로 redirect + assert ad.get_cell(0, 0, 0).text == "X" + ad.close() + + +def 
test_xlsx_via_tools(tmp_path: Path) -> None: + """MCP call_tool 경로(load 디스패치)로도 xlsx 가 동작.""" + from document_adapter.tools import call_tool + + src = tmp_path / "f.xlsx" + _make_xlsx_form(src) + insp = call_tool("inspect_document", {"path": str(src)}) + assert insp["format"] == "xlsx" + assert insp["tables"][0]["location"] == "신청서" + + +def test_docx_header_footer_placeholders(tmp_path: Path) -> None: + """docx get_placeholders 가 머리말/꼬리말의 {{key}} 도 포함해야 한다.""" + from docx import Document + src = tmp_path / "hf.docx" + d = Document() + d.add_paragraph("본문 {{body}}") + sec = d.sections[0] + sec.header.paragraphs[0].text = "머리말 {{header_key}}" + sec.footer.paragraphs[0].text = "꼬리말 {{footer_key}}" + d.save(str(src)) + ad = load(src) + ph = ad.get_placeholders() + ad.close() + assert {"body", "header_key", "footer_key"} <= set(ph) + + +def test_pptx_notes_placeholders_and_render(tmp_path: Path) -> None: + """pptx 슬라이드 노트의 {{key}} 가 감지·렌더돼야 한다.""" + from pptx import Presentation + src = tmp_path / "n.pptx" + pr = Presentation() + s = pr.slides.add_slide(pr.slide_layouts[6]) + s.notes_slide.notes_text_frame.text = "노트 {{note_key}}" + pr.save(str(src)) + ad = load(src) + assert "note_key" in ad.get_placeholders() + ad.render_template({"note_key": "확인됨"}) + ad.save(src) + ad.close() + pr2 = Presentation(str(src)) + notes = [sl.notes_slide.notes_text_frame.text + for sl in pr2.slides if sl.has_notes_slide] + assert any("확인됨" in n for n in notes) + + def test_get_cell_out_of_bounds_raises(tmp_path: Path) -> None: """경계를 벗어난 좌표는 CellOutOfBoundsError(IndexError 하위).""" from document_adapter.base import CellOutOfBoundsError