from __future__ import annotations import re from collections import defaultdict from typing import Any, Dict, Iterable, List, Tuple from intel.config import ENTITIES_DIR from intel.models import AdvisoryRecord from intel.utils import isoformat, load_all_json, now_utc, parse_dt, slugify, unique FAMILY_KEYWORDS = { "xss": ["xss", "cross-site scripting", "content injection", "html injection", "dom xss"], "sqli": ["sql injection", "sqli"], "authz-bypass": ["authorization bypass", "access control", "auth bypass", "permission bypass"], "ssrf": ["ssrf", "server-side request forgery"], "file-upload": ["file upload", "upload bypass", "attachment"], "request-smuggling": ["request smuggling", "http desync"], "template-injection": ["template injection", "ssti"], "deserialization": ["deserialization", "serialization"], "proxy-boundary": ["proxy", "middleware", "header trust", "reverse proxy"], "plugin-extension": ["plugin", "extension", "module", "theme", "addon"], "session-token": ["token", "cookie", "session", "jwt"], "path-traversal": ["path traversal", "directory traversal"], "misconfiguration": ["misconfiguration", "default credentials", "admin panel", "debug"], } PLUGINISH_ENTITY_TYPES = {"plugin", "extension", "module", "theme"} FAMILY_TO_REQUIRED_ROLE = { "xss": "editor-or-admin", "sqli": "anonymous-or-low-privileged", "authz-bypass": "cross-tenant-or-low-privileged-user", "ssrf": "editor-or-admin", "file-upload": "authenticated-uploader", "request-smuggling": "edge-access", "template-injection": "template-editor-or-admin", "deserialization": "application-integrator", "proxy-boundary": "reverse-proxy-or-edge-client", "plugin-extension": "plugin-manager-or-admin", "session-token": "authenticated-user", "path-traversal": "anonymous-or-low-privileged", "misconfiguration": "operator-or-admin", "unknown": "unknown", } def _advisory_dict(item: AdvisoryRecord | Dict[str, Any]) -> Dict[str, Any]: if isinstance(item, AdvisoryRecord): return item.to_dict() return dict(item or {}) def infer_family(advisory: Dict[str, Any]) -> str: text = " ".join( filter( None, [ advisory.get("title"), advisory.get("summary"), advisory.get("system_id"), advisory.get("package_name"), " ".join(advisory.get("aliases", []) or []), " ".join(advisory.get("secure_code_topics", []) or []), ], ) ).lower() for family, keywords in FAMILY_KEYWORDS.items(): if any(keyword in text for keyword in keywords): return family return "unknown" def _strip_package_version_suffix(value: str) -> str: stripped = value.strip() stripped = re.sub(r"/v\d+$", "", stripped) return stripped def _display_name(value: str, fallback: str) -> str: candidate = (value or "").strip() if not candidate: return fallback if candidate.startswith("github.com/"): candidate = candidate.split("/", 1)[1] candidate = candidate.replace("@", "").replace("/", " / ") return candidate def infer_entity_type(advisory: Dict[str, Any]) -> str: package_name = (advisory.get("package_name") or "").lower() text = " ".join( filter( None, [ advisory.get("title"), advisory.get("summary"), advisory.get("package_name"), ], ) ).lower() if "theme" in package_name or " theme" in text: return "theme" if "plugin" in package_name or " plugin" in text or "plugins" in text: return "plugin" if "extension" in package_name or " extension" in text: return "extension" if "module" in package_name or " module" in text: return "module" if package_name.startswith("github.com/") or package_name.count("/") >= 2: return "repo" if "/" in package_name: return "package" return "project" def advisory_scope_for_entity(entity_type: str) -> str: if entity_type in {"plugin", "extension", "module", "theme"}: return entity_type if entity_type == "repo": return "repo" if entity_type in {"package", "project"}: return "package" return "core" def _repo_url_from_package(package_name: str) -> str: package_name = _strip_package_version_suffix(package_name) if package_name.startswith("github.com/"): return f"https://{package_name}" if package_name.count("/") == 1 and not package_name.startswith("@"): owner, repo = package_name.split("/", 1) if owner and repo: return f"https://github.com/{owner}/{repo}" return "" def _github_repo_from_url(url: str) -> str: match = re.match(r"https://github\.com/([^/]+)/([^/#?]+)", (url or "").strip(), re.IGNORECASE) if not match: return "" return f"https://github.com/{match.group(1)}/{match.group(2)}" def _marketplace_slug(url: str) -> str: parts = [part for part in re.split(r"[/?#]+", (url or "").strip()) if part] if not parts: return "" return parts[-1] def _package_registry_url(package_name: str) -> str: normalized = _strip_package_version_suffix(package_name) if not normalized: return "" if normalized.startswith("@") or "/" not in normalized: return f"https://www.npmjs.com/package/{normalized}" if normalized.count("/") == 1 and not normalized.startswith("github.com/"): return f"https://packagist.org/packages/{normalized}" return "" def _entity_id(system_id: str, entity_type: str, name: str) -> str: return f"{system_id}--{entity_type}--{slugify(name)}" def _pick_version_boundary(versions: Iterable[str], *, prefer_fixed: bool = False) -> str | None: values = [str(value).strip() for value in versions if str(value).strip()] if not values: return None direct = [value for value in values if re.search(r"\d", value)] if direct: return direct[0] if prefer_fixed else direct[-1] return values[0] def _version_confidence(affected: List[str], fixed: List[str]) -> Tuple[str, str, bool]: if affected and fixed: return "high", "", False if affected or fixed: return "medium", "official bulletin or mirrored source only exposed one side of the version boundary", True return "low", "official bulletin or aggregated source did not expose explicit affected/fixed versions", True def _entry_surface(family: str, scope: str) -> str: mapping = { "xss": "web-ui-render-path", "sqli": "query-or-filter-parameter", "authz-bypass": "privileged-route-or-object-reference", "ssrf": "remote-fetch-or-webhook-endpoint", "file-upload": "upload-or-import-surface", "request-smuggling": "reverse-proxy-boundary", "template-injection": "template-render-entry", "deserialization": "serialized-input-boundary", "proxy-boundary": "proxy-header-or-trust-boundary", "plugin-extension": "extension-management-surface", "session-token": "session-or-token-processing", "path-traversal": "file-read-or-download-path", "misconfiguration": "deployment-or-admin-surface", } return mapping.get(family, f"{scope}-surface") def _request_paths(family: str, scope: str) -> List[str]: mapping = { "xss": ["/admin/editor", "/preview", "/rendered-content"], "sqli": ["/search", "/filter", "/api/list"], "authz-bypass": ["/admin/*", "/api/private/*", "/tenant/*"], "ssrf": ["/webhook/test", "/remote-fetch", "/import-url"], "file-upload": ["/upload", "/import", "/plugin/install"], "request-smuggling": ["/ via reverse proxy", "front proxy -> app origin"], "template-injection": ["/templates", "/email-preview", "/theme-editor"], "deserialization": ["/api/import", "/queue/consumer", "/session/restore"], "proxy-boundary": ["/middleware", "/x-forwarded-* trust path"], "plugin-extension": ["/plugins", "/extensions", "/themes"], "session-token": ["/login", "/callback", "/session"], "path-traversal": ["/download", "/assets", "/attachment"], "misconfiguration": ["/admin", "/debug", "/setup"], } return mapping.get(family, [f"/{scope}"]) def _input_shape(family: str) -> str: mapping = { "xss": "受控 HTML/Markdown/富文本输入,观察渲染上下文是否失去编码或净化。", "sqli": "受控查询参数、排序字段或筛选值,观察是否突破预期查询边界。", "authz-bypass": "使用低权限身份访问高权限对象或跨租户资源。", "ssrf": "提交受控回环或哨兵 URL,验证协议、主机、IP 与重定向限制。", "file-upload": "提交受控非执行样本,验证扩展名、MIME、落盘与执行权限。", "request-smuggling": "构造受控冲突头部组合,仅验证代理与应用解析差异。", "template-injection": "提交受控模板占位符,验证是否存在危险表达式求值。", "deserialization": "提交受控序列化样本,验证类型恢复与危险对象实例化。", "proxy-boundary": "提交受控代理头或来源头,验证信任边界和回源鉴权。", "plugin-extension": "在扩展管理或扩展功能入口中提交受控配置/内容。", "session-token": "使用短期测试令牌或会话,验证生命周期、绑定与失效逻辑。", "path-traversal": "提交规范化路径片段,验证根目录限制与标准化处理。", "misconfiguration": "检查默认入口、调试面板、弱默认项和暴露控制面。", } return mapping.get(family, "提交最小化、可审计、可回滚的受控输入。") def _unsafe_behavior(family: str) -> str: mapping = { "xss": "输入在目标上下文执行或被浏览器解释为主动内容。", "sqli": "响应、日志或 side effect 显示查询边界被打破。", "authz-bypass": "低权限身份可访问本不应可见的数据或操作。", "ssrf": "服务端向受控目标发起非预期请求。", "file-upload": "上传样本被错误接受、可访问或位于可执行路径。", "request-smuggling": "代理和应用对同一请求的边界解释不一致。", "template-injection": "模板引擎对用户输入执行表达式求值。", "deserialization": "反序列化恢复出危险类型或触发危险行为。", "proxy-boundary": "仅凭代理头即可越过鉴权或来源控制。", "plugin-extension": "扩展安装、配置或运行突破了信任边界。", "session-token": "令牌或会话可被重放、固定或越权使用。", "path-traversal": "可读取、列出或访问根目录之外资源。", "misconfiguration": "默认设置暴露管理面、调试面或高权限动作。", } return mapping.get(family, "目标表现出超出设计边界的行为。") def _evidence_points(family: str, advisory_scope: str) -> Tuple[List[str], List[str], List[str], List[str]]: server = [ "应用日志中的命中路径、鉴权决策和异常栈", "反向代理或边界层日志中的请求头、来源 IP 与路由决策", ] browser = [ "基线截图与攻击后截图的 DOM/视觉差异", "console、network 与 response metadata 中的异常信号", ] db_fs = [ "数据库中新增/越权读取的测试数据", "文件系统中新增上传样本、缓存条目或越权读取痕迹", ] detection = [ "WAF / reverse proxy 异常日志、访问日志和告警", "应用审计日志中的权限错误、重定向异常、模板渲染或上传落盘事件", ] if family in {"request-smuggling", "proxy-boundary"}: detection.append("上游代理与应用层对 Content-Length / Transfer-Encoding / forwarded headers 的解释差异") if advisory_scope in {"plugin", "extension", "module", "theme"}: server.append("插件/扩展管理日志、安装日志与版本清单") db_fs.append("插件目录、主题目录或扩展配置表中的测试样本") return server, browser, db_fs, detection def _patch_validation_steps(family: str, advisory: Dict[str, Any]) -> List[str]: patched_version = advisory.get("patched_version") or "修复版本" version_assertion = advisory.get("affected_version_ranges") or advisory.get("affected_versions") or ["受影响版本区间"] return [ f"确认目标版本从 `{', '.join(version_assertion[:3])}` 升级或回移到 `{patched_version}`。", "保留同一组受控输入,在修复前后分别执行并比对响应、日志与浏览器证据。", "确认修复后仅保留预期业务行为,不再触发越权、回显、异常渲染或错误请求。", f"补充 `{family}` 族自动化回归,避免同类路径在插件、主题或代理链中回归。", ] def build_workflow(advisory: Dict[str, Any], system: Dict[str, Any]) -> Dict[str, Any]: family = advisory.get("workflow", {}).get("vuln_family") or infer_family(advisory) scope = advisory.get("advisory_scope") or "core" required_role = FAMILY_TO_REQUIRED_ROLE.get(family, "unknown") affected_assertion = advisory.get("affected_version_ranges") or advisory.get("affected_versions") or ["需要从公告、锁文件、版本页或关于页面人工确认版本命中"] server, browser, db_fs, detection = _evidence_points(family, scope) return { "workflow_id": f"{advisory.get('canonical_id')}--workflow", "vuln_family": family, "entry_surface": _entry_surface(family, scope), "preconditions": [ "仅在 lab-local、lab-public 或明确授权目标中执行。", f"确认目标命中版本断言: {', '.join(affected_assertion[:3])}", f"若对象属于 `{scope}`,先确认扩展/仓库/包已启用并处于受影响版本。", ], "required_role": required_role, "affected_version_assertion": affected_assertion, "trigger_vector": f"对 `{family}` 家族入口投递最小化、可审计、可回滚的受控输入,比较修复前后差异。", "request_or_ui_path": _request_paths(family, scope), "input_shape": _input_shape(family), "expected_unsafe_behavior": _unsafe_behavior(family), "server_evidence_points": server, "browser_evidence_points": browser, "db_or_fs_evidence_points": db_fs, "detection_signals": detection, "mitigation_summary": "优先升级到修复版本,并同时收紧输入校验、服务端鉴权、代理信任边界、扩展安装信任和审计日志。", "patch_validation_steps": _patch_validation_steps(family, advisory), "lab_safety_notes": [ "只使用回环地址、哨兵目标、无害样本或可回滚测试数据。", "禁止造成持久破坏、越权下载真实数据或不可回滚 side effect。", "如需浏览器证据,保留 baseline / proof 两份快照以及 console / network 记录。", ], "review_state": "needs-version-gap-review" if advisory.get("version_resolution_needed") else "ready", } def build_advisory_extensions(advisory: Dict[str, Any], system: Dict[str, Any]) -> Dict[str, Any]: entity_type = infer_entity_type(advisory) advisory_scope = advisory_scope_for_entity(entity_type) if advisory.get("package_name") else "core" root_system_id = advisory.get("system_id") or system.get("system_id") package_name = advisory.get("package_name") or "" child_entity_id = "" if package_name: child_entity_id = _entity_id(root_system_id, entity_type, package_name) entity_refs = [ { "entity_id": root_system_id, "entity_type": "system", "relation": "root-system", "root_system_id": root_system_id, "official": True, } ] if child_entity_id: entity_refs.append( { "entity_id": child_entity_id, "entity_type": entity_type, "relation": "affected-component", "root_system_id": root_system_id, "official": advisory_scope == "core", } ) affected = list(advisory.get("affected_versions") or []) fixed = list(advisory.get("fixed_versions") or []) version_confidence, gap_reason, version_resolution_needed = _version_confidence(affected, fixed) affected_components = [ { "name": _display_name(package_name, system.get("display_name", root_system_id)), "entity_id": child_entity_id or root_system_id, "scope": advisory_scope, "package_name": package_name or None, "official": advisory_scope == "core", } ] version_sources = unique( [advisory.get("official_source_url")] + list(advisory.get("secondary_source_urls") or []) ) enriched = { "entity_refs": entity_refs, "affected_components": affected_components, "affected_version_ranges": affected, "fixed_version_ranges": fixed, "introduced_version": _pick_version_boundary(affected), "patched_version": _pick_version_boundary(fixed, prefer_fixed=True), "version_evidence_sources": version_sources, "advisory_scope": advisory_scope, "version_confidence": version_confidence, "version_gap_reason": gap_reason, "version_resolution_needed": version_resolution_needed, } enriched["workflow"] = build_workflow({**advisory, **enriched}, system) return enriched def enrich_advisory_record(advisory: AdvisoryRecord, system: Dict[str, Any]) -> AdvisoryRecord: enriched = build_advisory_extensions(advisory.to_dict(), system) for key, value in enriched.items(): setattr(advisory, key, value) advisory.metadata = { **(advisory.metadata or {}), "entity_ref_count": len(advisory.entity_refs), "advisory_scope": advisory.advisory_scope, "version_confidence": advisory.version_confidence, "workflow_id": advisory.workflow.get("workflow_id"), } return advisory def _source_refs(system: Dict[str, Any]) -> List[Dict[str, Any]]: refs: List[Dict[str, Any]] = [] for bucket in ("official_sources", "ecosystem_sources", "research_sources"): for source in system.get(bucket, []) or []: refs.append( { "name": source.get("name"), "url": source.get("url"), "kind": source.get("kind"), "status": source.get("status"), "bucket": bucket, "official": bucket == "official_sources", } ) return refs def _entity_payload( *, entity_id: str, entity_type: str, display_name: str, parent_entity_id: str | None, root_system_id: str, category: str, ecosystem: str, official: bool, status: str, history_policy: str, repo_url: str, package_registry: str, marketplace_url: str, latest_version: str, version_scheme: str, source_refs: List[Dict[str, Any]], ) -> Dict[str, Any]: return { "entity_id": entity_id, "entity_type": entity_type, "display_name": display_name, "parent_entity_id": parent_entity_id, "root_system_id": root_system_id, "category": category, "ecosystem": ecosystem, "official": official, "status": status, "history_policy": history_policy, "repo_url": repo_url, "package_registry": package_registry, "marketplace_url": marketplace_url, "latest_version": latest_version, "version_scheme": version_scheme, "latest_release_at": "", "latest_release_url": "", "version_source_refs": [], "version_sync_status": "pending", "security_version_count": 0, "last_version_synced_at": "", "latest_version_evidence": [], "catalog_source": "", "catalog_reason": "", "auto_cataloged": False, "last_discovered_at": "", "last_synced_at": "", "history_backfill_status": "pending", "latest_sync_status": "pending", "official_source_covered": False, "advisory_count": 0, "workflow_complete_advisory_count": 0, "version_mapped_advisory_count": 0, "first_advisory_at": "", "latest_advisory_at": "", "advisory_ids": [], "source_refs": source_refs, } def _merge_source_refs(primary: List[Dict[str, Any]], secondary: List[Dict[str, Any]]) -> List[Dict[str, Any]]: merged: List[Dict[str, Any]] = [] seen = set() for item in (primary or []) + (secondary or []): if not isinstance(item, dict): continue key = ( item.get("name") or "", item.get("url") or "", item.get("kind") or "", item.get("bucket") or "", ) if key in seen: continue seen.add(key) merged.append(item) return merged def _merge_entity_overlay(entity: Dict[str, Any], overlay: Dict[str, Any] | None) -> Dict[str, Any]: if not overlay: return entity merged = dict(entity) for key in ( "status", "history_policy", "repo_url", "package_registry", "marketplace_url", "latest_version", "version_scheme", "latest_release_at", "latest_release_url", "version_source_refs", "version_sync_status", "security_version_count", "last_version_synced_at", "latest_version_evidence", "catalog_source", "catalog_reason", "auto_cataloged", "last_discovered_at", "last_synced_at", "history_backfill_status", "latest_sync_status", "official_source_covered", ): if key not in overlay: continue value = overlay.get(key) if value in (None, "", [], {}): continue merged[key] = value merged["source_refs"] = _merge_source_refs(entity.get("source_refs", []), overlay.get("source_refs", [])) return merged def _update_entity_stats(entity: Dict[str, Any], advisories: List[Dict[str, Any]]) -> None: advisory_ids = [item.get("canonical_id") for item in advisories if item.get("canonical_id")] workflow_count = len([item for item in advisories if item.get("workflow", {}).get("workflow_id")]) version_mapped_count = len([item for item in advisories if not item.get("version_resolution_needed")]) timestamps = [] for advisory in advisories: for field in ("published_at", "updated_at"): dt = parse_dt(advisory.get(field)) if dt is not None: timestamps.append(dt) entity["advisory_count"] = len(advisories) entity["workflow_complete_advisory_count"] = workflow_count entity["version_mapped_advisory_count"] = version_mapped_count entity["advisory_ids"] = advisory_ids if timestamps: entity["first_advisory_at"] = isoformat(min(timestamps)) entity["latest_advisory_at"] = isoformat(max(timestamps)) entity["last_discovered_at"] = entity["latest_advisory_at"] entity["last_synced_at"] = entity["latest_advisory_at"] entity["official_source_covered"] = bool( entity.get("official_source_covered") or any(source.get("official") for source in entity.get("source_refs", [])) or any(advisory.get("official_source_url") for advisory in advisories) ) if entity["advisory_count"]: if entity["entity_type"] == "system" and entity.get("history_policy") == "history-full": entity["history_backfill_status"] = "complete" elif entity["advisory_count"] >= 2 and entity["version_mapped_advisory_count"] >= max(1, entity["advisory_count"] - 1): entity["history_backfill_status"] = "complete" else: entity["history_backfill_status"] = "seeded" entity["latest_sync_status"] = "green" if entity["official_source_covered"] and entity["advisory_count"] else "pending" if not entity.get("latest_version"): for advisory in advisories: patched = advisory.get("patched_version") if patched: entity["latest_version"] = patched break def _candidate_from_source(system: Dict[str, Any], source: Dict[str, Any], known_repo_urls: set[str]) -> Dict[str, Any] | None: url = (source.get("url") or "").strip() entity_type = source.get("entity_type_hint") or "project" repo_url = _github_repo_from_url(url) package_registry = "" marketplace_url = "" display_name = "" stable_url = repo_url if repo_url: if repo_url in known_repo_urls: return None entity_type = source.get("entity_type_hint") or "repo" match = re.match(r"https://github\.com/([^/]+)/([^/#?]+)", repo_url, re.IGNORECASE) if match: display_name = f"{match.group(1)} / {match.group(2)}" elif "npmjs.com/package/" in url: entity_type = source.get("entity_type_hint") or "package" package_name = url.split("/package/", 1)[1].split("?", 1)[0].strip("/") package_registry = f"https://www.npmjs.com/package/{package_name}" display_name = package_name stable_url = package_registry elif "packagist.org/packages/" in url: entity_type = source.get("entity_type_hint") or "package" package_name = url.split("/packages/", 1)[1].split("?", 1)[0].strip("/") package_registry = f"https://packagist.org/packages/{package_name}" display_name = package_name.replace("/", " / ") stable_url = package_registry elif any(token in url.lower() for token in ("/plugins/", "/themes/", "/extensions/", "/modules/", "/marketplace/")): marketplace_url = url slug = _marketplace_slug(url) display_name = slug.replace("-", " ") stable_url = marketplace_url else: return None if not display_name: display_name = source.get("name") or system.get("display_name") or system.get("system_id") return { "candidate_id": f"{system.get('system_id')}--{entity_type}-candidate--{slugify(stable_url or display_name)}", "root_system_id": system.get("system_id"), "display_name": display_name, "entity_type": entity_type, "status": "candidate", "reason": "source catalog exposed a stable security-related object that is not yet cataloged as an entity", "source": url, "source_name": source.get("name") or "", "source_confidence": source.get("confidence") or "unknown", "source_bucket": source.get("bucket_name") or "", "auto_catalog": bool(source.get("auto_catalog")), "repo_url": repo_url, "package_registry": package_registry, "marketplace_url": marketplace_url, "risk": "medium", "waiting_for": "确认是否应升级为 cataloged repo/plugin/package 实体并补齐安全相关版本与历史漏洞", "canonical_id": "", } def build_entity_views(source_map: Dict[str, Any], advisories: List[AdvisoryRecord | Dict[str, Any]]) -> Dict[str, Any]: generated_at = isoformat(now_utc()) systems = {item["system_id"]: item for item in source_map.get("systems", []) or [] if item.get("system_id")} advisory_rows = [_advisory_dict(item) for item in advisories] existing_entities = { item.get("entity_id"): item for item in load_all_json(ENTITIES_DIR) if item.get("entity_id") and item.get("root_system_id") in systems } entities: Dict[str, Dict[str, Any]] = {} advisories_by_entity: Dict[str, List[Dict[str, Any]]] = defaultdict(list) for system_id, system in systems.items(): history_policy = system.get("tier") or "history-full" entities[system_id] = _merge_entity_overlay( _entity_payload( entity_id=system_id, entity_type="system", display_name=system.get("display_name", system_id), parent_entity_id=None, root_system_id=system_id, category=system.get("category", "unknown"), ecosystem=system.get("category", "unknown"), official=True, status="cataloged", history_policy=history_policy, repo_url="", package_registry="", marketplace_url="", latest_version="", version_scheme="vendor", source_refs=_source_refs(system), ), existing_entities.get(system_id), ) for advisory in advisory_rows: root_system_id = advisory.get("system_id") system = systems.get(root_system_id, {}) refs = advisory.get("entity_refs") or [{"entity_id": root_system_id, "entity_type": "system"}] for ref in refs: entity_id = ref.get("entity_id") if not entity_id: continue if entity_id not in entities: package_name = advisory.get("package_name") or advisory.get("title") or entity_id entity_type = ref.get("entity_type") or infer_entity_type(advisory) entities[entity_id] = _merge_entity_overlay( _entity_payload( entity_id=entity_id, entity_type=entity_type, display_name=_display_name(package_name, entity_id), parent_entity_id=root_system_id, root_system_id=root_system_id, category=system.get("category", advisory.get("category", "unknown")), ecosystem=advisory.get("package_name", "").split("/", 1)[0] if advisory.get("package_name") else system.get("category", "unknown"), official=entity_type in {"project", "repo"} and entity_type != "package", status="cataloged", history_policy="history-full", repo_url=_repo_url_from_package(advisory.get("package_name") or ""), package_registry=_package_registry_url(advisory.get("package_name") or ""), marketplace_url="", latest_version=advisory.get("patched_version") or "", version_scheme="semver-ish" if advisory.get("package_name") else "vendor", source_refs=[], ), existing_entities.get(entity_id), ) advisories_by_entity[entity_id].append(advisory) for entity_id, advisories_for_entity in advisories_by_entity.items(): _update_entity_stats(entities[entity_id], advisories_for_entity) entities[entity_id] = _merge_entity_overlay(entities[entity_id], existing_entities.get(entity_id)) for entity_id, item in existing_entities.items(): if entity_id in entities: continue entities[entity_id] = item known_repo_urls = {entity.get("repo_url") for entity in entities.values() if entity.get("repo_url")} candidate_map: Dict[str, Dict[str, Any]] = {} for system in systems.values(): for bucket in ("official_sources", "ecosystem_sources"): for source in system.get(bucket, []) or []: candidate = _candidate_from_source(system, source, known_repo_urls) if candidate: candidate_map[candidate["candidate_id"]] = candidate candidate_backlog = sorted(candidate_map.values(), key=lambda item: (item["root_system_id"], item["display_name"])) system_summary: Dict[str, Dict[str, Any]] = {} for entity in entities.values(): summary = system_summary.setdefault( entity["root_system_id"], { "system_id": entity["root_system_id"], "display_name": systems.get(entity["root_system_id"], {}).get("display_name", entity["root_system_id"]), "cataloged_entity_total": 0, "child_entity_total": 0, "candidate_entity_total": 0, "workflow_complete_count": 0, "version_mapped_count": 0, "official_source_covered_count": 0, "history_full_complete_count": 0, "latest_green_count": 0, "version_gap_entity_count": 0, "workflow_gap_entity_count": 0, "plugin_total": 0, "entity_type_counts": {}, "top_entities": [], "backlog_preview": [], }, ) summary["cataloged_entity_total"] += 1 if entity["entity_type"] != "system": summary["child_entity_total"] += 1 summary["entity_type_counts"][entity["entity_type"]] = summary["entity_type_counts"].get(entity["entity_type"], 0) + 1 summary["workflow_complete_count"] += 1 if entity["advisory_count"] and entity["workflow_complete_advisory_count"] >= entity["advisory_count"] else 0 summary["version_mapped_count"] += 1 if entity["advisory_count"] and entity["version_mapped_advisory_count"] >= entity["advisory_count"] else 0 summary["official_source_covered_count"] += 1 if entity["official_source_covered"] else 0 summary["history_full_complete_count"] += 1 if entity.get("history_backfill_status") == "complete" else 0 summary["latest_green_count"] += 1 if entity.get("latest_sync_status") == "green" else 0 summary["version_gap_entity_count"] += 1 if entity.get("advisory_count") and entity.get("version_mapped_advisory_count", 0) < entity.get("advisory_count", 0) else 0 summary["workflow_gap_entity_count"] += 1 if entity.get("advisory_count") and entity.get("workflow_complete_advisory_count", 0) < entity.get("advisory_count", 0) else 0 if entity["entity_type"] in PLUGINISH_ENTITY_TYPES: summary["plugin_total"] += 1 for system_id, summary in system_summary.items(): ranked_entities = sorted( [ entity for entity in entities.values() if entity["root_system_id"] == system_id and entity["entity_type"] != "system" ], key=lambda item: (-(item.get("advisory_count") or 0), item["entity_type"], item["display_name"].lower()), ) summary["top_entities"] = [ { "entity_id": entity["entity_id"], "entity_type": entity["entity_type"], "display_name": entity["display_name"], "advisory_count": entity.get("advisory_count", 0), "history_backfill_status": entity.get("history_backfill_status"), "latest_sync_status": entity.get("latest_sync_status"), } for entity in ranked_entities[:5] ] for candidate in candidate_backlog: system_summary.setdefault( candidate["root_system_id"], { "system_id": candidate["root_system_id"], "display_name": systems.get(candidate["root_system_id"], {}).get("display_name", candidate["root_system_id"]), "cataloged_entity_total": 0, "child_entity_total": 0, "candidate_entity_total": 0, "workflow_complete_count": 0, "version_mapped_count": 0, "official_source_covered_count": 0, "history_full_complete_count": 0, "latest_green_count": 0, "version_gap_entity_count": 0, "workflow_gap_entity_count": 0, "plugin_total": 0, "entity_type_counts": {}, "top_entities": [], "backlog_preview": [], }, )["candidate_entity_total"] += 1 preview = system_summary[candidate["root_system_id"]]["backlog_preview"] if len(preview) < 5: preview.append( { "candidate_id": candidate["candidate_id"], "display_name": candidate["display_name"], "entity_type": candidate["entity_type"], "risk": candidate["risk"], } ) cataloged_entities = [entity for entity in entities.values() if entity.get("status") == "cataloged"] history_full_complete_count = len( [entity for entity in cataloged_entities if entity.get("history_policy") == "history-full" and entity.get("history_backfill_status") == "complete"] ) latest_green_count = len([entity for entity in cataloged_entities if entity.get("latest_sync_status") == "green"]) workflow_complete_count = len( [entity for entity in cataloged_entities if entity.get("advisory_count") and entity.get("workflow_complete_advisory_count") >= entity.get("advisory_count")] ) version_mapped_count = len( [entity for entity in cataloged_entities if entity.get("advisory_count") and entity.get("version_mapped_advisory_count") >= entity.get("advisory_count")] ) official_source_covered_count = len([entity for entity in cataloged_entities if entity.get("official_source_covered")]) plugin_history_full_count = len( [ entity for entity in cataloged_entities if entity.get("entity_type") in PLUGINISH_ENTITY_TYPES and entity.get("history_backfill_status") == "complete" ] ) queue_payload = { "generated_at": generated_at, "discovery_queue": { "count": len(candidate_backlog), "items": candidate_backlog[:200], }, "history_queue": { "count": len([entity for entity in cataloged_entities if entity.get("history_policy") == "history-full" and entity.get("history_backfill_status") != "complete"]), "items": [ { "entity_id": entity["entity_id"], "display_name": entity["display_name"], "root_system_id": entity["root_system_id"], "history_policy": entity["history_policy"], "history_backfill_status": entity["history_backfill_status"], } for entity in cataloged_entities if entity.get("history_policy") == "history-full" and entity.get("history_backfill_status") != "complete" ][:200], }, "latest_queue": { "count": len([entity for entity in cataloged_entities if entity.get("latest_sync_status") != "green"]), "items": [ { "entity_id": entity["entity_id"], "display_name": entity["display_name"], "root_system_id": entity["root_system_id"], "latest_sync_status": entity["latest_sync_status"], "last_synced_at": entity["last_synced_at"], } for entity in cataloged_entities if entity.get("latest_sync_status") != "green" ][:200], }, "workflow_queue": { "count": len([advisory for advisory in advisory_rows if advisory.get("workflow", {}).get("review_state") != "ready"]), "items": [ { "canonical_id": advisory.get("canonical_id"), "system_id": advisory.get("system_id"), "title": advisory.get("title"), "review_state": advisory.get("workflow", {}).get("review_state"), "version_resolution_needed": advisory.get("version_resolution_needed"), } for advisory in advisory_rows if advisory.get("workflow", {}).get("review_state") != "ready" ][:200], }, } completeness = { "generated_at": generated_at, "cataloged_entity_total": len(cataloged_entities), "candidate_entity_total": len(candidate_backlog), "history_full_complete_count": history_full_complete_count, "latest_green_count": latest_green_count, "workflow_complete_count": workflow_complete_count, "version_mapped_count": version_mapped_count, "official_source_covered_count": official_source_covered_count, "plugin_history_full_count": plugin_history_full_count, "systems": sorted(system_summary.values(), key=lambda item: item["system_id"]), "queues": { key: value["count"] for key, value in queue_payload.items() if key.endswith("_queue") }, } report_lines = [ "# 分层实体覆盖与完整度报告", "", f"- 生成时间: `{generated_at}`", f"- 已编目实体: `{completeness['cataloged_entity_total']}`", f"- 待编目 backlog: `{completeness['candidate_entity_total']}`", f"- history-full 已完成: `{completeness['history_full_complete_count']}`", f"- latest green: `{completeness['latest_green_count']}`", f"- workflow 完整: `{completeness['workflow_complete_count']}`", f"- 版本映射完整: `{completeness['version_mapped_count']}`", f"- 官方源覆盖: `{completeness['official_source_covered_count']}`", f"- 插件 history-full 已完成: `{completeness['plugin_history_full_count']}`", "", "## 系统分层摘要", "", "| 系统 | cataloged entities | candidate backlog | workflow complete | version mapped | official covered | plugins |", "| --- | ---: | ---: | ---: | ---: | ---: | ---: |", ] for item in completeness["systems"]: report_lines.append( f"| {item['system_id']} | {item['cataloged_entity_total']} | {item['candidate_entity_total']} | {item['workflow_complete_count']} | {item['version_mapped_count']} | {item['official_source_covered_count']} | {item['plugin_total']} |" ) backlog_lines = [ "# 分层实体发现 Backlog", "", f"- 生成时间: `{generated_at}`", f"- 待编目数量: `{len(candidate_backlog)}`", "", "| candidate_id | root_system | entity_type | risk | reason | waiting_for | source |", "| --- | --- | --- | --- | --- | --- | --- |", ] if candidate_backlog: for item in candidate_backlog[:500]: backlog_lines.append( f"| {item['candidate_id']} | {item['root_system_id']} | {item['entity_type']} | {item['risk']} | {item['reason']} | {item['waiting_for']} | {item.get('source') or '-'} |" ) else: backlog_lines.append("| - | - | - | - | no backlog | - | - |") entity_payloads = { f"{entity['entity_id']}.json": entity for entity in sorted(entities.values(), key=lambda item: item["entity_id"]) } return { "generated_at": generated_at, "entity_payloads": entity_payloads, "entities": sorted(entities.values(), key=lambda item: item["entity_id"]), "candidate_backlog": candidate_backlog, "completeness": completeness, "queues": queue_payload, "catalog_report_markdown": "\n".join(report_lines), "backlog_report_markdown": "\n".join(backlog_lines), }