Files
cvfs/dlib/cv/docx_export.py

77 lines
2.2 KiB
Python

from __future__ import annotations
from collections import defaultdict
from io import BytesIO
from docx import Document
from .parser import _detect_block_type
def _path_to_para_map(doc: Document) -> dict[str, int]:
    """Build a lookup from block path to paragraph index.

    Walks ``doc.paragraphs`` in order, ignoring paragraphs whose text is empty
    or whitespace-only. Each remaining paragraph is classified with
    ``_detect_block_type`` and assigned the key ``"<block_type>[<n>]"``, where
    ``n`` is a 1-based counter kept separately for every block type.

    Args:
        doc: Document whose ``paragraphs`` sequence is scanned.

    Returns:
        Mapping of path string to the paragraph's index in ``doc.paragraphs``.
    """
    seen_per_type: defaultdict[str, int] = defaultdict(int)
    mapping: dict[str, int] = {}
    for position, paragraph in enumerate(doc.paragraphs):
        if paragraph.text.strip():
            style_name = getattr(paragraph.style, "name", None)
            kind = _detect_block_type(style_name, paragraph)
            seen_per_type[kind] += 1
            ordinal = seen_per_type[kind]
            mapping[f"{kind}[{ordinal}]"] = position
    return mapping
def _replace_para_text(para, new_text: str) -> None:
    """Set the paragraph's text to ``new_text``, reusing its first run.

    The first run receives ``new_text`` so that run keeps whatever formatting
    it carries; every later run has its text emptied. A paragraph without
    runs gets a single new run holding ``new_text``.

    Args:
        para: Paragraph-like object exposing ``runs`` and ``add_run``.
        new_text: Text the paragraph should contain afterwards.
    """
    runs = para.runs
    if runs:
        head, *tail = runs
        # Blank the trailing runs first, then write the full text into the head.
        for extra in tail:
            extra.text = ""
        head.text = new_text
    else:
        para.add_run(new_text)
def _remove_paragraph(paragraph) -> None:
    """Detach the paragraph's underlying element from its parent node.

    Args:
        paragraph: Object whose ``_element`` supports ``getparent()`` and whose
            parent supports ``remove(child)``.
    """
    element = paragraph._element
    parent = element.getparent()
    parent.remove(element)
def generate_patched_docx(
    original_bytes: bytes, structured_blocks: list[dict]
) -> bytes:
    """Return DOCX bytes with the text patches from ``structured_blocks`` applied.

    Non-empty paragraphs of the original document are addressed by the paths
    produced by ``_path_to_para_map``. For each block whose ``"path"`` matches
    a paragraph, the paragraph text is replaced when its stripped text differs
    from the block's ``"text"``. Paragraphs whose path does not appear in
    ``structured_blocks`` are removed from the document.

    Args:
        original_bytes: Raw contents of the source ``.docx`` file.
        structured_blocks: Dicts carrying at least ``"path"`` and ``"text"``.
            When a path occurs more than once the last entry wins; paths that
            match no paragraph are ignored.

    Returns:
        The serialized, patched document. If ``structured_blocks`` is empty
        (or otherwise falsy) ``original_bytes`` is returned unchanged and
        nothing is removed.

    Raises:
        KeyError: If a block lacks the ``"path"`` or ``"text"`` key.
    """
    if not structured_blocks:
        return original_bytes

    doc = Document(BytesIO(original_bytes))
    path_map = _path_to_para_map(doc)
    patched = {block["path"]: block["text"] for block in structured_blocks}

    # python-docx rebuilds the paragraph list on every ``doc.paragraphs``
    # access, so take one snapshot. The proxies stay bound to their XML
    # elements, which also makes the removals below independent of position.
    paragraphs = doc.paragraphs

    # Apply text replacements.
    for path, new_text in patched.items():
        idx = path_map.get(path)
        if idx is None:
            continue
        para = paragraphs[idx]
        if para.text.strip() != new_text:
            _replace_para_text(para, new_text)

    # Remove paragraphs whose path is no longer present in the patch set.
    for path, idx in path_map.items():
        if path not in patched:
            _remove_paragraph(paragraphs[idx])

    out = BytesIO()
    doc.save(out)
    return out.getvalue()