feat: NLP patch insights + standalone demo mode

- dlib/ai/insights.py: pure-Python NLP analysis that correlates accepted
  AI suggestion operations/keywords/sections with submission outcomes
  (pending_review / published = positive, archived = negative)
- Backend: GET /api/v1/insights route + service + Pydantic schema
- Frontend: InsightsPanel component with bar charts for operation impact,
  section impact, and keyword signal lift scores
- Insights tab added to the version panel; compact preview on doc overview
- NEXT_PUBLIC_DEMO=true makes the webapp fully standalone: loads
  DEMO_DOCUMENTS / DEMO_SUBMISSIONS / DEMO_INSIGHTS from demo-data.ts,
  disables all mutating actions, shows a DEMO badge in the top bar
- apps/webapp/public/demo-cv.docx: static dummy CV (Alex Rivera) for demo
- scripts/gen_demo_cv.py: script to regenerate the demo DOCX
- .env.example: document NEXT_PUBLIC_DEMO flag

https://claude.ai/code/session_01LWxu2qrwY6BRjUFXXn7NiM
This commit is contained in:
Claude
2026-04-05 09:34:01 +00:00
parent 0f32d46404
commit 615d1bdb9e
12 changed files with 780 additions and 17 deletions

189
dlib/ai/insights.py Normal file
View File

@@ -0,0 +1,189 @@
from __future__ import annotations
import re
from collections import Counter, defaultdict
from dataclasses import dataclass, field
from typing import Literal
STOPWORDS = frozenset(
"a an the and or but in on at to for of with is are was were be been have has"
" had do does did this that these those it its i you he she we they their our"
" your my his her from by into through about as so if then when where which who"
" can will may should would could also just not no more some all any each"
" than other up out off over how what new using use used with well per".split()
)
Outcome = Literal["positive", "negative"] # positive = pending_review / published
@dataclass
class SuggestionRecord:
operation: str
target_path: str
proposed_text: str | None
rationale: str | None
accepted: bool | None
@dataclass
class SubmissionRecord:
status: str
suggestions: list[SuggestionRecord] = field(default_factory=list)
@dataclass
class OperationImpact:
operation: str
total: int
positive: int
rate: float
@dataclass
class KeywordSignal:
keyword: str
positive_count: int
negative_count: int
lift: float # positive_count / max(negative_count, 1)
@dataclass
class SectionImpact:
section: str
positive_rate: float
count: int
@dataclass
class InsightsResult:
total_submissions: int
positive_count: int
positive_rate: float
operation_impact: list[OperationImpact]
top_positive_keywords: list[KeywordSignal]
top_negative_keywords: list[KeywordSignal]
section_impact: list[SectionImpact]
has_data: bool
def _outcome(status: str) -> Outcome | None:
if status in ("pending_review", "published"):
return "positive"
if status == "archived":
return "negative"
return None # draft / tailoring — not enough signal
def _tokens(text: str | None) -> list[str]:
if not text:
return []
return [
t for t in re.findall(r"[a-z][a-z0-9+.-]{1,}", text.lower())
if t not in STOPWORDS and len(t) > 2
]
def _section_prefix(path: str) -> str:
"""heading[1] -> heading, bullet[3] -> bullet, table[1].0-1 -> table"""
return re.match(r"([a-z_]+)", path).group(1) if path else "unknown"
def analyze(submissions: list[SubmissionRecord]) -> InsightsResult:
labeled = [(s, _outcome(s.status)) for s in submissions]
labeled_known = [(s, o) for s, o in labeled if o is not None]
positive_count = sum(1 for _, o in labeled_known if o == "positive")
# operation impact: only accepted suggestions in outcome-labeled submissions
op_positive: Counter[str] = Counter()
op_total: Counter[str] = Counter()
for sub, outcome in labeled_known:
for sug in sub.suggestions:
if sug.accepted is not True:
continue
op_total[sug.operation] += 1
if outcome == "positive":
op_positive[sug.operation] += 1
op_impact = sorted(
[
OperationImpact(
operation=op,
total=total,
positive=op_positive[op],
rate=round(op_positive[op] / total, 3),
)
for op, total in op_total.items()
],
key=lambda x: x.rate,
reverse=True,
)
# keyword signals from accepted-suggestion text in outcome-labeled submissions
kw_pos: Counter[str] = Counter()
kw_neg: Counter[str] = Counter()
for sub, outcome in labeled_known:
bucket = kw_pos if outcome == "positive" else kw_neg
for sug in sub.suggestions:
if sug.accepted is not True:
continue
for t in _tokens(sug.proposed_text) + _tokens(sug.rationale):
bucket[t] += 1
all_kws = set(kw_pos) | set(kw_neg)
signals = [
KeywordSignal(
keyword=kw,
positive_count=kw_pos[kw],
negative_count=kw_neg[kw],
lift=round(kw_pos[kw] / max(kw_neg[kw], 1), 2),
)
for kw in all_kws
if kw_pos[kw] + kw_neg[kw] >= 2 # minimum support
]
top_pos_kw = sorted(
[s for s in signals if s.positive_count > 0],
key=lambda s: (s.lift, s.positive_count),
reverse=True,
)[:8]
top_neg_kw = sorted(
[s for s in signals if s.negative_count > 0],
key=lambda s: (s.negative_count, -s.lift),
reverse=True,
)[:8]
# section impact: group target_path prefix by outcome
sec_pos: Counter[str] = Counter()
sec_total: Counter[str] = Counter()
for sub, outcome in labeled_known:
for sug in sub.suggestions:
if sug.accepted is not True:
continue
sec = _section_prefix(sug.target_path)
sec_total[sec] += 1
if outcome == "positive":
sec_pos[sec] += 1
section_impact = sorted(
[
SectionImpact(
section=sec,
positive_rate=round(sec_pos[sec] / total, 3),
count=total,
)
for sec, total in sec_total.items()
],
key=lambda s: s.positive_rate,
reverse=True,
)
return InsightsResult(
total_submissions=len(submissions),
positive_count=positive_count,
positive_rate=round(positive_count / len(submissions), 3) if submissions else 0.0,
operation_impact=op_impact,
top_positive_keywords=top_pos_kw,
top_negative_keywords=top_neg_kw,
section_impact=section_impact,
has_data=bool(labeled_known),
)