mirror of
https://github.com/velocitatem/cvfs.git
synced 2026-05-31 08:43:37 +00:00
Finish MVP and dockerize
This commit is contained in:
22
dlib/cv/__init__.py
Normal file
22
dlib/cv/__init__.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from .schema import (
|
||||
StructuredBlock,
|
||||
StructuredDocument,
|
||||
PatchPayload,
|
||||
PatchSuggestion,
|
||||
PatchOperation,
|
||||
)
|
||||
from .parser import parse_docx_bytes, summarize_keywords
|
||||
from .patcher import apply_patchset
|
||||
from .ats_guard import validate_patchset
|
||||
|
||||
__all__ = [
|
||||
"StructuredBlock",
|
||||
"StructuredDocument",
|
||||
"PatchPayload",
|
||||
"PatchSuggestion",
|
||||
"PatchOperation",
|
||||
"parse_docx_bytes",
|
||||
"summarize_keywords",
|
||||
"apply_patchset",
|
||||
"validate_patchset",
|
||||
]
|
||||
44
dlib/cv/ats_guard.py
Normal file
44
dlib/cv/ats_guard.py
Normal file
@@ -0,0 +1,44 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterable
|
||||
|
||||
from .schema import PatchPayload, PatchOperation, StructuredDocument
|
||||
|
||||
|
||||
class PatchValidationError(ValueError):
    """Raised by ``validate_patchset`` when a patchset is rejected."""
|
||||
|
||||
|
||||
def validate_patchset(
    document: StructuredDocument,
    patches: Iterable[PatchPayload],
    *,
    max_changes: int = 12,
    max_growth_ratio: float = 1.45,
) -> None:
    """Check a patchset against ``document`` and raise on the first violation.

    Args:
        document: Base document whose blocks the patches refer to by path.
        patches: Patches to check; the iterable is consumed once into a list.
        max_changes: Largest number of patches accepted in a single patchset.
        max_growth_ratio: Upper bound on ``len(new text) / len(old text)``
            (both stripped) for ``replace_text`` patches.

    Raises:
        PatchValidationError: If there are more than ``max_changes`` patches,
            a patch targets a path that is not in ``document``, a
            ``replace_text`` patch has no ``new_value`` or grows the text past
            ``max_growth_ratio``, or a heading is removed or suppressed.
    """
    pending = list(patches)
    total = len(pending)
    if total > max_changes:
        raise PatchValidationError(
            f"Patchset exceeds max changes ({total} > {max_changes})"
        )

    # Index blocks by path; if a path repeats, the last block wins.
    blocks_by_path = {}
    for entry in document.blocks:
        blocks_by_path[entry.path] = entry

    for change in pending:
        target = blocks_by_path.get(change.target_path)
        if not target:
            raise PatchValidationError(
                f"Target path {change.target_path} does not exist in base document"
            )

        if change.operation == PatchOperation.REPLACE_TEXT:
            replacement = change.new_value
            if not replacement:
                raise PatchValidationError("replace_text requires new_value")
            # An empty original counts as length 1 to avoid dividing by zero.
            original_length = len(target.text.strip())
            if original_length == 0:
                original_length = 1
            growth = len(replacement.strip()) / original_length
            if growth > max_growth_ratio:
                raise PatchValidationError("Patch grows text beyond ATS safe threshold")

        drops_block = change.operation in {
            PatchOperation.REMOVE_BLOCK,
            PatchOperation.SUPPRESS_BLOCK,
        }
        if drops_block and target.block_type == "heading":
            raise PatchValidationError(
                "Headings cannot be removed without manual confirmation"
            )
|
||||
104
dlib/cv/parser.py
Normal file
104
dlib/cv/parser.py
Normal file
@@ -0,0 +1,104 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
from io import BytesIO
|
||||
from typing import Iterable
|
||||
|
||||
from docx import Document
|
||||
|
||||
from .schema import StructuredBlock, StructuredDocument
|
||||
|
||||
|
||||
def _detect_block_type(style_name: str | None, paragraph) -> str:
|
||||
style = (style_name or "").lower()
|
||||
if style.startswith("heading"):
|
||||
return "heading"
|
||||
if (
|
||||
"bullet" in style
|
||||
or "list" in style
|
||||
or getattr(paragraph, "style", None)
|
||||
and getattr(paragraph.style, "name", "").lower().startswith("list")
|
||||
):
|
||||
return "bullet"
|
||||
return "text"
|
||||
|
||||
|
||||
def _build_path(block_type: str, counter: int, extra: str | None = None) -> str:
|
||||
suffix = f"{block_type}[{counter}]"
|
||||
if extra:
|
||||
return f"{suffix}.{extra}"
|
||||
return suffix
|
||||
|
||||
|
||||
def parse_docx_bytes(
    file_bytes: bytes, *, version_label: str | None = None
) -> StructuredDocument:
    """Parse raw DOCX bytes into a ``StructuredDocument``.

    Blocks are emitted in two passes: every non-empty paragraph from
    ``document.paragraphs`` first, then every non-empty cell of every table in
    ``document.tables``. Paths are numbered per block type starting at 1;
    table cells share one running counter across all tables and carry a
    ``row-cell`` suffix.

    Args:
        file_bytes: Content of the ``.docx`` file.
        version_label: Optional label stored on the returned document.

    Returns:
        The structured document holding all extracted blocks.
    """
    source = Document(BytesIO(file_bytes))
    seen: defaultdict[str, int] = defaultdict(int)
    collected: list[StructuredBlock] = []

    # Pass 1: paragraphs with visible text.
    for para in source.paragraphs:
        content = para.text.strip()
        if content:
            kind = _detect_block_type(getattr(para.style, "name", None), para)
            seen[kind] += 1
            tags = summarize_keywords([content])
            style_label = getattr(getattr(para, "style", None), "name", "")
            collected.append(
                StructuredBlock(
                    path=_build_path(kind, seen[kind]),
                    # _detect_block_type only yields heading/bullet/text.
                    block_type=kind,
                    text=content,
                    keywords=tags,
                    metadata={"style": style_label},
                )
            )

    # Pass 2: table cells with visible text, walked table -> row -> cell.
    cells = (
        (t_idx, r_idx, c_idx, cell)
        for t_idx, table in enumerate(source.tables)
        for r_idx, row in enumerate(table.rows)
        for c_idx, cell in enumerate(row.cells)
    )
    for t_idx, r_idx, c_idx, cell in cells:
        content = cell.text.strip()
        if not content:
            continue
        seen["table"] += 1
        cell_path = _build_path("table", seen["table"], extra=f"{r_idx}-{c_idx}")
        collected.append(
            StructuredBlock(
                path=cell_path,
                block_type="table",
                text=content,
                keywords=summarize_keywords([content]),
                metadata={"table_index": t_idx, "row": r_idx, "cell": c_idx},
            )
        )

    return StructuredDocument(version_label=version_label, blocks=collected)
|
||||
|
||||
|
||||
def summarize_keywords(lines: Iterable[str], *, max_keywords: int = 6) -> list[str]:
    """Return the most frequent terms found in ``lines``.

    Each whitespace-separated token is trimmed of surrounding ``,.;:()[]``
    characters and lower-cased; tokens of two characters or fewer are ignored.

    Args:
        lines: Text lines to scan.
        max_keywords: Upper bound applied as a slice on the ranked terms.

    Returns:
        Terms ordered by descending frequency; ties keep first-seen order.
    """
    counts: defaultdict[str, int] = defaultdict(int)
    tokens = (
        word.strip().strip(",.;:()[]").lower()
        for line in lines
        for word in line.split()
    )
    for token in tokens:
        if len(token) > 2:
            counts[token] += 1

    # The sort is stable, so equally frequent terms stay in insertion order.
    ranked = sorted(counts, key=counts.__getitem__, reverse=True)
    return ranked[:max_keywords]
|
||||
53
dlib/cv/patcher.py
Normal file
53
dlib/cv/patcher.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from copy import deepcopy
|
||||
|
||||
from .schema import PatchOperation, PatchPayload, StructuredDocument
|
||||
|
||||
|
||||
def apply_patchset(
    document: StructuredDocument, patches: list[PatchPayload]
) -> StructuredDocument:
    """Return a patched copy of ``document``; the input is left untouched.

    Patches are applied in order and on a best-effort basis: a patch whose
    ``target_path`` matches no block in the working copy is skipped.

    Operations:
        * ``replace_text``: when ``new_value`` is non-empty, swap the block
          text and store the old text under ``metadata["previous_text"]``.
          ``previous_text`` is recorded only when a replacement is made.
        * ``remove_block``: drop every block carrying the target path.
        * ``reorder_section``: move the block to
          ``patch.metadata["target_index"]``; skipped when that key is absent.
        * ``boost_keyword``: put ``new_value`` at the front of the block's
          keywords unless it is already present.
        * ``suppress_block``: set ``metadata["suppressed"] = True``.

    Args:
        document: Base document to patch.
        patches: Patches to apply, in order.

    Returns:
        A new ``StructuredDocument`` with the patches applied.

    Raises:
        ValueError, TypeError: If a ``target_index`` cannot be converted with
            ``int()``.
    """
    # Round-trip through a dumped copy so edits never touch the caller's model.
    working = StructuredDocument.model_validate(deepcopy(document.model_dump()))

    for patch in patches:
        block = working.get_block(patch.target_path)
        if block is None:
            continue

        operation = patch.operation
        if operation == PatchOperation.REPLACE_TEXT:
            if patch.new_value:
                block.metadata["previous_text"] = block.text
                block.text = patch.new_value
        elif operation == PatchOperation.REMOVE_BLOCK:
            working.blocks = [
                candidate
                for candidate in working.blocks
                if candidate.path != patch.target_path
            ]
        elif operation == PatchOperation.REORDER_SECTION:
            target_index = (
                patch.metadata.get("target_index") if patch.metadata else None
            )
            if target_index is None:
                continue
            # ``block`` is already the first block with this path, so it is
            # the one to move; pull all same-path blocks out, then re-insert.
            working.blocks = [
                candidate
                for candidate in working.blocks
                if candidate.path != patch.target_path
            ]
            working.blocks.insert(int(target_index), block)
        elif operation == PatchOperation.BOOST_KEYWORD and patch.new_value:
            if patch.new_value not in block.keywords:
                block.keywords.insert(0, patch.new_value)
        elif operation == PatchOperation.SUPPRESS_BLOCK:
            block.metadata["suppressed"] = True

    return working
|
||||
54
dlib/cv/schema.py
Normal file
54
dlib/cv/schema.py
Normal file
@@ -0,0 +1,54 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from enum import Enum
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import BaseModel, Field, ConfigDict
|
||||
|
||||
|
||||
class StructuredBlock(BaseModel):
    """Editable slice of a DOCX document."""

    # Fields not declared below are accepted rather than rejected.
    model_config = ConfigDict(extra="allow")

    # Address of the block inside its document; patches refer to it through
    # ``PatchPayload.target_path``.
    path: str
    # Category of the block, restricted to this closed set of labels.
    block_type: Literal[
        "heading",
        "summary",
        "bullet",
        "skills",
        "table",
        "meta",
        "text",
    ]
    # Text content of the block.
    text: str
    # Keywords attached to the block; a fresh empty list per instance.
    keywords: list[str] = Field(default_factory=list)
    # Free-form extra data; a fresh empty dict per instance.
    metadata: dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class StructuredDocument(BaseModel):
    # Fields not declared below are accepted rather than rejected.
    model_config = ConfigDict(extra="allow")

    # Optional label identifying this version of the document.
    version_label: str | None = None
    # Blocks of the document, in order; a fresh empty list per instance.
    blocks: list[StructuredBlock] = Field(default_factory=list)

    def get_block(self, path: str) -> StructuredBlock | None:
        """Return the first block whose ``path`` equals ``path``, else ``None``."""
        for candidate in self.blocks:
            if candidate.path == path:
                return candidate
        return None
|
||||
|
||||
|
||||
class PatchOperation(str, Enum):
    # Kinds of edit a patch can request. Members are also ``str`` instances,
    # so they compare equal to their plain string values.

    # Swap the text of the target block.
    REPLACE_TEXT = "replace_text"
    # Drop the target block from the document.
    REMOVE_BLOCK = "remove_block"
    # Move the target block to another position.
    REORDER_SECTION = "reorder_section"
    # Promote a keyword on the target block.
    BOOST_KEYWORD = "boost_keyword"
    # Flag the target block as suppressed without deleting it.
    SUPPRESS_BLOCK = "suppress_block"
|
||||
|
||||
|
||||
class PatchPayload(BaseModel):
    # One requested edit against a single block of a structured document.

    # Fields not declared below are accepted rather than rejected.
    model_config = ConfigDict(extra="allow")

    # Path of the block this patch applies to.
    target_path: str
    # Kind of edit to perform.
    operation: PatchOperation
    # Payload of the edit (e.g. replacement text or keyword), if the
    # operation needs one.
    new_value: str | None = None
    # NOTE(review): presumably the text expected before the edit; confirm
    # against the producers of patches.
    old_value: str | None = None
    # NOTE(review): presumably a human-readable reason for the edit; confirm.
    rationale: str | None = None
    # Operation-specific extras; a fresh empty dict per instance.
    metadata: dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class PatchSuggestion(PatchPayload):
    # A patch payload extended with two optional suggestion fields.

    # NOTE(review): presumably a score for how sure the suggester is; the
    # scale is not visible here, confirm with the producer.
    confidence: float | None = None
    # Keywords associated with the suggestion; a fresh empty list per instance.
    keywords: list[str] = Field(default_factory=list)
|
||||
Reference in New Issue
Block a user