mirror of
https://github.com/velocitatem/cvfs.git
synced 2026-05-31 08:43:37 +00:00
- dlib/ai/insights.py: pure-Python NLP analysis that correlates accepted AI suggestion operations/keywords/sections with submission outcomes (pending_review / published = positive, archived = negative) - Backend: GET /api/v1/insights route + service + Pydantic schema - Frontend: InsightsPanel component with bar charts for operation impact, section impact, and keyword signal lift scores - Insights tab added to the version panel; compact preview on doc overview - NEXT_PUBLIC_DEMO=true makes the webapp fully standalone: loads DEMO_DOCUMENTS / DEMO_SUBMISSIONS / DEMO_INSIGHTS from demo-data.ts, disables all mutating actions, shows a DEMO badge in the top bar - apps/webapp/public/demo-cv.docx: static dummy CV (Alex Rivera) for demo - scripts/gen_demo_cv.py: script to regenerate the demo DOCX - .env.example: document NEXT_PUBLIC_DEMO flag https://claude.ai/code/session_01LWxu2qrwY6BRjUFXXn7NiM
190 lines
5.5 KiB
Python
190 lines
5.5 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
from collections import Counter, defaultdict
|
|
from dataclasses import dataclass, field
|
|
from typing import Literal
|
|
|
|
# Common English function words that keyword extraction ignores. The words are
# kept as whitespace-separated text chunks so the list stays easy to scan.
STOPWORDS = frozenset(
    "".join(
        [
            "a an the and or but in on at to for of with is are was were be been have has",
            " had do does did this that these those it its i you he she we they their our",
            " your my his her from by into through about as so if then when where which who",
            " can will may should would could also just not no more some all any each",
            " than other up out off over how what new using use used with well per",
        ]
    ).split()
)

# Outcome label attached to a submission:
#   "positive" = pending_review / published, "negative" = archived.
Outcome = Literal["positive", "negative"]
|
|
|
|
|
|
@dataclass
class SuggestionRecord:
    """A single AI suggestion attached to a submission.

    ``analyze`` only counts a suggestion when ``accepted is True``; rejected
    (``False``) and undecided (``None``) suggestions are skipped.
    """

    # Operation label; used as the grouping key for operation impact.
    operation: str
    # Location the suggestion targets, e.g. "heading[1]" or "table[1].0-1".
    target_path: str
    # Suggested replacement text, if any; mined for keywords.
    proposed_text: str | None
    # Explanation accompanying the suggestion, if any; also mined for keywords.
    rationale: str | None
    # True = accepted, anything else is ignored by the analysis.
    accepted: bool | None
|
|
|
|
|
|
@dataclass
class SubmissionRecord:
    """A submission together with the AI suggestions made for it."""

    # Workflow status string; "pending_review", "published" and "archived"
    # carry an outcome, every other value is treated as unlabeled.
    status: str
    # Suggestions attached to this submission; each instance gets its own list.
    suggestions: list[SuggestionRecord] = field(default_factory=list)
|
|
|
|
|
|
@dataclass
class OperationImpact:
    """Outcome statistics for one suggestion operation."""

    # Operation label the statistics refer to.
    operation: str
    # Number of accepted suggestions with this operation in labeled submissions.
    total: int
    # How many of those belong to positive-outcome submissions.
    positive: int
    # positive / total, rounded to three decimals by the producer.
    rate: float
|
|
|
|
|
|
@dataclass
class KeywordSignal:
    """How often a keyword appears in positive versus negative submissions."""

    # The (lower-cased) keyword token.
    keyword: str
    # Occurrences in accepted suggestions of positive-outcome submissions.
    positive_count: int
    # Occurrences in accepted suggestions of negative-outcome submissions.
    negative_count: int
    # positive_count / max(negative_count, 1)
    lift: float
|
|
|
|
|
|
@dataclass
class SectionImpact:
    """Outcome statistics for one document section type."""

    # Section prefix derived from a suggestion's target path (e.g. "heading").
    section: str
    # Share of accepted suggestions in this section that belong to
    # positive-outcome submissions.
    positive_rate: float
    # Number of accepted suggestions counted for this section.
    count: int
|
|
|
|
|
|
@dataclass
class InsightsResult:
    """Aggregated result returned by ``analyze``."""

    # Number of submissions passed in, labeled or not.
    total_submissions: int
    # Submissions with a positive outcome.
    positive_count: int
    # positive_count / total_submissions (0.0 when there are no submissions).
    positive_rate: float
    # Per-operation statistics, best rate first.
    operation_impact: list[OperationImpact]
    # Keywords most associated with positive outcomes.
    top_positive_keywords: list[KeywordSignal]
    # Keywords most frequent in negative outcomes.
    top_negative_keywords: list[KeywordSignal]
    # Per-section statistics, best positive rate first.
    section_impact: list[SectionImpact]
    # True when at least one submission carried an outcome label.
    has_data: bool
|
|
|
|
|
|
def _outcome(status: str) -> Outcome | None:
|
|
if status in ("pending_review", "published"):
|
|
return "positive"
|
|
if status == "archived":
|
|
return "negative"
|
|
return None # draft / tailoring — not enough signal
|
|
|
|
|
|
def _tokens(text: str | None) -> list[str]:
|
|
if not text:
|
|
return []
|
|
return [
|
|
t for t in re.findall(r"[a-z][a-z0-9+.-]{1,}", text.lower())
|
|
if t not in STOPWORDS and len(t) > 2
|
|
]
|
|
|
|
|
|
def _section_prefix(path: str) -> str:
|
|
"""heading[1] -> heading, bullet[3] -> bullet, table[1].0-1 -> table"""
|
|
return re.match(r"([a-z_]+)", path).group(1) if path else "unknown"
|
|
|
|
|
|
def analyze(submissions: list[SubmissionRecord]) -> InsightsResult:
    """Correlate accepted AI suggestions with submission outcomes.

    Only submissions whose status maps to an outcome (see ``_outcome``)
    contribute to the breakdowns, and within them only suggestions with
    ``accepted is True`` are counted.

    Args:
        submissions: Submissions to analyse; may be empty.

    Returns:
        An ``InsightsResult`` where

        * ``total_submissions`` counts every input submission, labeled or not,
          and ``positive_rate`` is ``positive_count / total_submissions``
          (0.0 for empty input);
        * ``operation_impact`` / ``section_impact`` are sorted by rate,
          highest first;
        * ``top_positive_keywords`` holds up to 8 keywords seen in positive
          submissions, ordered by lift then positive count (descending);
        * ``top_negative_keywords`` holds up to 8 keywords seen in negative
          submissions, ordered by negative count (descending) then lift
          (ascending);
        * keyword ties are broken alphabetically so results are reproducible;
        * ``has_data`` is True when at least one submission was labeled.
    """
    labeled = [(s, _outcome(s.status)) for s in submissions]
    labeled_known = [(s, o) for s, o in labeled if o is not None]

    positive_count = 0
    op_total: Counter[str] = Counter()
    op_positive: Counter[str] = Counter()
    sec_total: Counter[str] = Counter()
    sec_pos: Counter[str] = Counter()
    kw_pos: Counter[str] = Counter()
    kw_neg: Counter[str] = Counter()

    # Single pass: every breakdown looks at the same accepted suggestions.
    for sub, outcome in labeled_known:
        is_positive = outcome == "positive"
        if is_positive:
            positive_count += 1
        kw_bucket = kw_pos if is_positive else kw_neg

        for sug in sub.suggestions:
            if sug.accepted is not True:
                continue

            sec = _section_prefix(sug.target_path)
            op_total[sug.operation] += 1
            sec_total[sec] += 1
            if is_positive:
                op_positive[sug.operation] += 1
                sec_pos[sec] += 1

            kw_bucket.update(_tokens(sug.proposed_text))
            kw_bucket.update(_tokens(sug.rationale))

    op_impact = sorted(
        [
            OperationImpact(
                operation=op,
                total=total,
                positive=op_positive[op],
                rate=round(op_positive[op] / total, 3),
            )
            for op, total in op_total.items()
        ],
        key=lambda x: x.rate,
        reverse=True,
    )

    # Iterate keywords in sorted order: set iteration order of strings varies
    # with hash randomisation, and the stable sorts below would otherwise make
    # tie order (and which keywords survive the [:8] cut) differ between runs.
    signals = [
        KeywordSignal(
            keyword=kw,
            positive_count=kw_pos[kw],
            negative_count=kw_neg[kw],
            lift=round(kw_pos[kw] / max(kw_neg[kw], 1), 2),
        )
        for kw in sorted(kw_pos.keys() | kw_neg.keys())
        if kw_pos[kw] + kw_neg[kw] >= 2  # minimum support
    ]
    top_pos_kw = sorted(
        [s for s in signals if s.positive_count > 0],
        key=lambda s: (s.lift, s.positive_count),
        reverse=True,
    )[:8]
    top_neg_kw = sorted(
        [s for s in signals if s.negative_count > 0],
        key=lambda s: (s.negative_count, -s.lift),
        reverse=True,
    )[:8]

    section_impact = sorted(
        [
            SectionImpact(
                section=sec,
                positive_rate=round(sec_pos[sec] / total, 3),
                count=total,
            )
            for sec, total in sec_total.items()
        ],
        key=lambda s: s.positive_rate,
        reverse=True,
    )

    return InsightsResult(
        total_submissions=len(submissions),
        positive_count=positive_count,
        positive_rate=round(positive_count / len(submissions), 3) if submissions else 0.0,
        operation_impact=op_impact,
        top_positive_keywords=top_pos_kw,
        top_negative_keywords=top_neg_kw,
        section_impact=section_impact,
        has_data=bool(labeled_known),
    )
|