# File
# websafe-kb/scripts/intel/entities.py
# 2026-03-21 17:44:22 -07:00

# 958 lines
# 43 KiB
# Python

from __future__ import annotations
import re
from collections import defaultdict
from typing import Any, Dict, Iterable, List, Tuple
from intel.config import ENTITIES_DIR
from intel.models import AdvisoryRecord
from intel.utils import isoformat, load_all_json, now_utc, parse_dt, slugify, unique
# Keyword table consulted by infer_family(): each vulnerability-family slug
# maps to the lowercase substrings that identify it in advisory text.
# Dict order is significant: infer_family() returns the first family whose
# keyword list produces a hit, so the broad keywords further down (for example
# "module" or "token") only apply when no earlier family matched.
FAMILY_KEYWORDS = {
    "xss": ["xss", "cross-site scripting", "content injection", "html injection", "dom xss"],
    "sqli": ["sql injection", "sqli"],
    "authz-bypass": ["authorization bypass", "access control", "auth bypass", "permission bypass"],
    "ssrf": ["ssrf", "server-side request forgery"],
    "file-upload": ["file upload", "upload bypass", "attachment"],
    "request-smuggling": ["request smuggling", "http desync"],
    "template-injection": ["template injection", "ssti"],
    "deserialization": ["deserialization", "serialization"],
    "proxy-boundary": ["proxy", "middleware", "header trust", "reverse proxy"],
    "plugin-extension": ["plugin", "extension", "module", "theme", "addon"],
    "session-token": ["token", "cookie", "session", "jwt"],
    "path-traversal": ["path traversal", "directory traversal"],
    "misconfiguration": ["misconfiguration", "default credentials", "admin panel", "debug"],
}
# Entity types that build_entity_views() counts as plugin-like when it computes
# `plugin_total` and `plugin_history_full_count`.
PLUGINISH_ENTITY_TYPES = {"plugin", "extension", "module", "theme"}
# Role slug that build_workflow() records as `required_role` for each
# vulnerability family; families missing from this table fall back to
# "unknown" at the lookup site.
FAMILY_TO_REQUIRED_ROLE = {
    "xss": "editor-or-admin",
    "sqli": "anonymous-or-low-privileged",
    "authz-bypass": "cross-tenant-or-low-privileged-user",
    "ssrf": "editor-or-admin",
    "file-upload": "authenticated-uploader",
    "request-smuggling": "edge-access",
    "template-injection": "template-editor-or-admin",
    "deserialization": "application-integrator",
    "proxy-boundary": "reverse-proxy-or-edge-client",
    "plugin-extension": "plugin-manager-or-admin",
    "session-token": "authenticated-user",
    "path-traversal": "anonymous-or-low-privileged",
    "misconfiguration": "operator-or-admin",
    "unknown": "unknown",
}
def _advisory_dict(item: AdvisoryRecord | Dict[str, Any]) -> Dict[str, Any]:
    """Return *item* as a plain dict.

    ``AdvisoryRecord`` instances are converted through ``to_dict()``.  Any
    other value is shallow-copied with ``dict()``; falsy input yields ``{}``.
    """
    if not isinstance(item, AdvisoryRecord):
        return dict(item or {})
    return item.to_dict()
def infer_family(advisory: Dict[str, Any]) -> str:
    """Guess the vulnerability family of *advisory* from its text fields.

    The title, summary, system id, package name, aliases and secure-code
    topics are joined into one lowercase string.  The first family in
    ``FAMILY_KEYWORDS`` (dict order) with a keyword occurring as a substring
    is returned; ``"unknown"`` is returned when nothing matches.
    """
    fragments = [
        advisory.get("title"),
        advisory.get("summary"),
        advisory.get("system_id"),
        advisory.get("package_name"),
        " ".join(advisory.get("aliases", []) or []),
        " ".join(advisory.get("secure_code_topics", []) or []),
    ]
    # Empty/None fragments are skipped so they do not add stray separators.
    haystack = " ".join(fragment for fragment in fragments if fragment).lower()
    for family, keywords in FAMILY_KEYWORDS.items():
        for keyword in keywords:
            if keyword in haystack:
                return family
    return "unknown"
def _strip_package_version_suffix(value: str) -> str:
stripped = value.strip()
stripped = re.sub(r"/v\d+$", "", stripped)
return stripped
def _display_name(value: str, fallback: str) -> str:
candidate = (value or "").strip()
if not candidate:
return fallback
if candidate.startswith("github.com/"):
candidate = candidate.split("/", 1)[1]
candidate = candidate.replace("@", "").replace("/", " / ")
return candidate
def infer_entity_type(advisory: Dict[str, Any]) -> str:
    """Classify the component an advisory refers to.

    Keyword checks run first, in the order theme, plugin, extension, module:
    a type matches when its name occurs in the lowercase package name or one
    of its markers occurs in the lowercase title/summary/package text.  If no
    keyword matches, the shape of the package name decides: ``github.com/``
    prefixes or two-plus slashes give ``"repo"``, a single slash gives
    ``"package"``, anything else gives ``"project"``.
    """
    package_name = (advisory.get("package_name") or "").lower()
    fields = (advisory.get("title"), advisory.get("summary"), advisory.get("package_name"))
    text = " ".join(field for field in fields if field).lower()

    # (entity type, substrings searched in the combined text); order matters.
    keyword_rules = (
        ("theme", (" theme",)),
        ("plugin", (" plugin", "plugins")),
        ("extension", (" extension",)),
        ("module", (" module",)),
    )
    for entity_type, text_markers in keyword_rules:
        if entity_type in package_name:
            return entity_type
        if any(marker in text for marker in text_markers):
            return entity_type

    slash_count = package_name.count("/")
    if package_name.startswith("github.com/") or slash_count >= 2:
        return "repo"
    return "package" if slash_count else "project"
def advisory_scope_for_entity(entity_type: str) -> str:
    """Map an entity type to the advisory scope label.

    Plugin-like types and ``"repo"`` map to themselves, ``"package"`` and
    ``"project"`` both map to ``"package"``, and every other value maps to
    ``"core"``.
    """
    scope_by_type = {
        "plugin": "plugin",
        "extension": "extension",
        "module": "module",
        "theme": "theme",
        "repo": "repo",
        "package": "package",
        "project": "package",
    }
    return scope_by_type.get(entity_type, "core")
def _repo_url_from_package(package_name: str) -> str:
    """Derive a repository URL from a package name, or ``""`` if none applies.

    After the version suffix is stripped, a ``github.com/...`` name is
    prefixed with ``https://``.  A non-scoped ``owner/repo`` name (exactly one
    slash, no leading ``@``) is assumed to live on GitHub.
    """
    name = _strip_package_version_suffix(package_name)
    if name.startswith("github.com/"):
        return f"https://{name}"
    if name.startswith("@") or name.count("/") != 1:
        return ""
    owner, repo = name.split("/", 1)
    # Guards against names such as "/repo" or "owner/".
    return f"https://github.com/{owner}/{repo}" if owner and repo else ""
def _github_repo_from_url(url: str) -> str:
match = re.match(r"https://github\.com/([^/]+)/([^/#?]+)", (url or "").strip(), re.IGNORECASE)
if not match:
return ""
return f"https://github.com/{match.group(1)}/{match.group(2)}"
def _marketplace_slug(url: str) -> str:
parts = [part for part in re.split(r"[/?#]+", (url or "").strip()) if part]
if not parts:
return ""
return parts[-1]
def _package_registry_url(package_name: str) -> str:
    """Guess the registry page for a package name, or ``""`` if none applies.

    After the version suffix is stripped, scoped (``@...``) and slash-free
    names are treated as npm packages.  A ``vendor/name`` pair that is not a
    ``github.com/`` path is treated as a Packagist package.
    """
    name = _strip_package_version_suffix(package_name)
    if not name:
        return ""
    slash_count = name.count("/")
    if name.startswith("@") or slash_count == 0:
        return f"https://www.npmjs.com/package/{name}"
    if slash_count == 1 and not name.startswith("github.com/"):
        return f"https://packagist.org/packages/{name}"
    return ""
def _entity_id(system_id: str, entity_type: str, name: str) -> str:
    """Compose ``<system_id>--<entity_type>--<slug>`` with *name* passed through ``slugify``."""
    slug = slugify(name)
    return f"{system_id}--{entity_type}--{slug}"
def _pick_version_boundary(versions: Iterable[str], *, prefer_fixed: bool = False) -> str | None:
values = [str(value).strip() for value in versions if str(value).strip()]
if not values:
return None
direct = [value for value in values if re.search(r"\d", value)]
if direct:
return direct[0] if prefer_fixed else direct[-1]
return values[0]
def _version_confidence(affected: List[str], fixed: List[str]) -> Tuple[str, str, bool]:
if affected and fixed:
return "high", "", False
if affected or fixed:
return "medium", "official bulletin or mirrored source only exposed one side of the version boundary", True
return "low", "official bulletin or aggregated source did not expose explicit affected/fixed versions", True
def _entry_surface(family: str, scope: str) -> str:
mapping = {
"xss": "web-ui-render-path",
"sqli": "query-or-filter-parameter",
"authz-bypass": "privileged-route-or-object-reference",
"ssrf": "remote-fetch-or-webhook-endpoint",
"file-upload": "upload-or-import-surface",
"request-smuggling": "reverse-proxy-boundary",
"template-injection": "template-render-entry",
"deserialization": "serialized-input-boundary",
"proxy-boundary": "proxy-header-or-trust-boundary",
"plugin-extension": "extension-management-surface",
"session-token": "session-or-token-processing",
"path-traversal": "file-read-or-download-path",
"misconfiguration": "deployment-or-admin-surface",
}
return mapping.get(family, f"{scope}-surface")
def _request_paths(family: str, scope: str) -> List[str]:
mapping = {
"xss": ["/admin/editor", "/preview", "/rendered-content"],
"sqli": ["/search", "/filter", "/api/list"],
"authz-bypass": ["/admin/*", "/api/private/*", "/tenant/*"],
"ssrf": ["/webhook/test", "/remote-fetch", "/import-url"],
"file-upload": ["/upload", "/import", "/plugin/install"],
"request-smuggling": ["/ via reverse proxy", "front proxy -> app origin"],
"template-injection": ["/templates", "/email-preview", "/theme-editor"],
"deserialization": ["/api/import", "/queue/consumer", "/session/restore"],
"proxy-boundary": ["/middleware", "/x-forwarded-* trust path"],
"plugin-extension": ["/plugins", "/extensions", "/themes"],
"session-token": ["/login", "/callback", "/session"],
"path-traversal": ["/download", "/assets", "/attachment"],
"misconfiguration": ["/admin", "/debug", "/setup"],
}
return mapping.get(family, [f"/{scope}"])
def _input_shape(family: str) -> str:
mapping = {
"xss": "受控 HTML/Markdown/富文本输入,观察渲染上下文是否失去编码或净化。",
"sqli": "受控查询参数、排序字段或筛选值,观察是否突破预期查询边界。",
"authz-bypass": "使用低权限身份访问高权限对象或跨租户资源。",
"ssrf": "提交受控回环或哨兵 URL,验证协议、主机、IP 与重定向限制。",
"file-upload": "提交受控非执行样本,验证扩展名、MIME、落盘与执行权限。",
"request-smuggling": "构造受控冲突头部组合,仅验证代理与应用解析差异。",
"template-injection": "提交受控模板占位符,验证是否存在危险表达式求值。",
"deserialization": "提交受控序列化样本,验证类型恢复与危险对象实例化。",
"proxy-boundary": "提交受控代理头或来源头,验证信任边界和回源鉴权。",
"plugin-extension": "在扩展管理或扩展功能入口中提交受控配置/内容。",
"session-token": "使用短期测试令牌或会话,验证生命周期、绑定与失效逻辑。",
"path-traversal": "提交规范化路径片段,验证根目录限制与标准化处理。",
"misconfiguration": "检查默认入口、调试面板、弱默认项和暴露控制面。",
}
return mapping.get(family, "提交最小化、可审计、可回滚的受控输入。")
def _unsafe_behavior(family: str) -> str:
mapping = {
"xss": "输入在目标上下文执行或被浏览器解释为主动内容。",
"sqli": "响应、日志或 side effect 显示查询边界被打破。",
"authz-bypass": "低权限身份可访问本不应可见的数据或操作。",
"ssrf": "服务端向受控目标发起非预期请求。",
"file-upload": "上传样本被错误接受、可访问或位于可执行路径。",
"request-smuggling": "代理和应用对同一请求的边界解释不一致。",
"template-injection": "模板引擎对用户输入执行表达式求值。",
"deserialization": "反序列化恢复出危险类型或触发危险行为。",
"proxy-boundary": "仅凭代理头即可越过鉴权或来源控制。",
"plugin-extension": "扩展安装、配置或运行突破了信任边界。",
"session-token": "令牌或会话可被重放、固定或越权使用。",
"path-traversal": "可读取、列出或访问根目录之外资源。",
"misconfiguration": "默认设置暴露管理面、调试面或高权限动作。",
}
return mapping.get(family, "目标表现出超出设计边界的行为。")
def _evidence_points(family: str, advisory_scope: str) -> Tuple[List[str], List[str], List[str], List[str]]:
server = [
"应用日志中的命中路径、鉴权决策和异常栈",
"反向代理或边界层日志中的请求头、来源 IP 与路由决策",
]
browser = [
"基线截图与攻击后截图的 DOM/视觉差异",
"console、network 与 response metadata 中的异常信号",
]
db_fs = [
"数据库中新增/越权读取的测试数据",
"文件系统中新增上传样本、缓存条目或越权读取痕迹",
]
detection = [
"WAF / reverse proxy 异常日志、访问日志和告警",
"应用审计日志中的权限错误、重定向异常、模板渲染或上传落盘事件",
]
if family in {"request-smuggling", "proxy-boundary"}:
detection.append("上游代理与应用层对 Content-Length / Transfer-Encoding / forwarded headers 的解释差异")
if advisory_scope in {"plugin", "extension", "module", "theme"}:
server.append("插件/扩展管理日志、安装日志与版本清单")
db_fs.append("插件目录、主题目录或扩展配置表中的测试样本")
return server, browser, db_fs, detection
def _patch_validation_steps(family: str, advisory: Dict[str, Any]) -> List[str]:
patched_version = advisory.get("patched_version") or "修复版本"
version_assertion = advisory.get("affected_version_ranges") or advisory.get("affected_versions") or ["受影响版本区间"]
return [
f"确认目标版本从 `{', '.join(version_assertion[:3])}` 升级或回移到 `{patched_version}`。",
"保留同一组受控输入,在修复前后分别执行并比对响应、日志与浏览器证据。",
"确认修复后仅保留预期业务行为,不再触发越权、回显、异常渲染或错误请求。",
f"补充 `{family}` 族自动化回归,避免同类路径在插件、主题或代理链中回归。",
]
def build_workflow(advisory: Dict[str, Any], system: Dict[str, Any]) -> Dict[str, Any]:
    """Build the lab-validation workflow record for one advisory.

    Args:
        advisory: Advisory dict.  Reads ``workflow.vuln_family``,
            ``advisory_scope``, ``affected_version_ranges`` /
            ``affected_versions``, ``canonical_id``, ``patched_version`` and
            ``version_resolution_needed``.
        system: Root system dict.  Not read by this function; kept in the
            signature for callers.

    Returns:
        A workflow dict.  ``review_state`` is ``"needs-version-gap-review"``
        when the advisory is flagged ``version_resolution_needed``, otherwise
        ``"ready"``.
    """
    # `workflow` may be present but null (for example after a JSON round
    # trip); `.get("workflow", {})` would return None in that case and crash.
    existing_workflow = advisory.get("workflow") or {}
    family = existing_workflow.get("vuln_family") or infer_family(advisory)
    scope = advisory.get("advisory_scope") or "core"
    required_role = FAMILY_TO_REQUIRED_ROLE.get(family, "unknown")
    affected_assertion = (
        advisory.get("affected_version_ranges")
        or advisory.get("affected_versions")
        or ["需要从公告、锁文件、版本页或关于页面人工确认版本命中"]
    )
    server, browser, db_fs, detection = _evidence_points(family, scope)
    return {
        "workflow_id": f"{advisory.get('canonical_id')}--workflow",
        "vuln_family": family,
        "entry_surface": _entry_surface(family, scope),
        "preconditions": [
            "仅在 lab-local、lab-public 或明确授权目标中执行。",
            # Only the first three version assertions are quoted.
            f"确认目标命中版本断言: {', '.join(affected_assertion[:3])}",
            f"若对象属于 `{scope}`,先确认扩展/仓库/包已启用并处于受影响版本。",
        ],
        "required_role": required_role,
        "affected_version_assertion": affected_assertion,
        "trigger_vector": f"对 `{family}` 家族入口投递最小化、可审计、可回滚的受控输入,比较修复前后差异。",
        "request_or_ui_path": _request_paths(family, scope),
        "input_shape": _input_shape(family),
        "expected_unsafe_behavior": _unsafe_behavior(family),
        "server_evidence_points": server,
        "browser_evidence_points": browser,
        "db_or_fs_evidence_points": db_fs,
        "detection_signals": detection,
        "mitigation_summary": "优先升级到修复版本,并同时收紧输入校验、服务端鉴权、代理信任边界、扩展安装信任和审计日志。",
        "patch_validation_steps": _patch_validation_steps(family, advisory),
        "lab_safety_notes": [
            "只使用回环地址、哨兵目标、无害样本或可回滚测试数据。",
            "禁止造成持久破坏、越权下载真实数据或不可回滚 side effect。",
            "如需浏览器证据,保留 baseline / proof 两份快照以及 console / network 记录。",
        ],
        "review_state": "needs-version-gap-review" if advisory.get("version_resolution_needed") else "ready",
    }
def build_advisory_extensions(advisory: Dict[str, Any], system: Dict[str, Any]) -> Dict[str, Any]:
    """Compute the entity, version and workflow extension fields of an advisory.

    Args:
        advisory: Advisory dict (not mutated).
        system: Root system dict; supplies ``system_id`` and ``display_name``
            fallbacks.

    Returns:
        A dict of extension fields: entity references, affected components,
        version ranges and boundaries, evidence sources, scope, confidence
        fields and the generated ``workflow``.
    """
    package_name = advisory.get("package_name") or ""
    entity_type = infer_entity_type(advisory)
    # Advisories without a package name are attributed to the system core.
    advisory_scope = advisory_scope_for_entity(entity_type) if package_name else "core"
    is_core = advisory_scope == "core"
    root_system_id = advisory.get("system_id") or system.get("system_id")
    child_entity_id = _entity_id(root_system_id, entity_type, package_name) if package_name else ""

    entity_refs = [
        {
            "entity_id": root_system_id,
            "entity_type": "system",
            "relation": "root-system",
            "root_system_id": root_system_id,
            "official": True,
        }
    ]
    if child_entity_id:
        component_ref = {
            "entity_id": child_entity_id,
            "entity_type": entity_type,
            "relation": "affected-component",
            "root_system_id": root_system_id,
            "official": is_core,
        }
        entity_refs.append(component_ref)

    affected = list(advisory.get("affected_versions") or [])
    fixed = list(advisory.get("fixed_versions") or [])
    version_confidence, gap_reason, version_resolution_needed = _version_confidence(affected, fixed)

    component = {
        "name": _display_name(package_name, system.get("display_name", root_system_id)),
        "entity_id": child_entity_id or root_system_id,
        "scope": advisory_scope,
        "package_name": package_name or None,
        "official": is_core,
    }
    source_urls = [advisory.get("official_source_url")]
    source_urls += list(advisory.get("secondary_source_urls") or [])

    enriched = {
        "entity_refs": entity_refs,
        "affected_components": [component],
        "affected_version_ranges": affected,
        "fixed_version_ranges": fixed,
        "introduced_version": _pick_version_boundary(affected),
        "patched_version": _pick_version_boundary(fixed, prefer_fixed=True),
        "version_evidence_sources": unique(source_urls),
        "advisory_scope": advisory_scope,
        "version_confidence": version_confidence,
        "version_gap_reason": gap_reason,
        "version_resolution_needed": version_resolution_needed,
    }
    # The workflow is built from the advisory with the new fields layered on top.
    enriched["workflow"] = build_workflow({**advisory, **enriched}, system)
    return enriched
def enrich_advisory_record(advisory: AdvisoryRecord, system: Dict[str, Any]) -> AdvisoryRecord:
    """Attach the computed extension fields to *advisory* in place and return it.

    Every key produced by ``build_advisory_extensions`` is set as an attribute
    on the record.  Four summary values are then merged into
    ``advisory.metadata``, overriding existing entries with the same keys.
    """
    extensions = build_advisory_extensions(advisory.to_dict(), system)
    for field_name, field_value in extensions.items():
        setattr(advisory, field_name, field_value)

    previous_metadata = advisory.metadata or {}
    summary = {
        "entity_ref_count": len(advisory.entity_refs),
        "advisory_scope": advisory.advisory_scope,
        "version_confidence": advisory.version_confidence,
        "workflow_id": advisory.workflow.get("workflow_id"),
    }
    advisory.metadata = {**previous_metadata, **summary}
    return advisory
def _source_refs(system: Dict[str, Any]) -> List[Dict[str, Any]]:
refs: List[Dict[str, Any]] = []
for bucket in ("official_sources", "ecosystem_sources", "research_sources"):
for source in system.get(bucket, []) or []:
refs.append(
{
"name": source.get("name"),
"url": source.get("url"),
"kind": source.get("kind"),
"status": source.get("status"),
"bucket": bucket,
"official": bucket == "official_sources",
}
)
return refs
def _entity_payload(
*,
entity_id: str,
entity_type: str,
display_name: str,
parent_entity_id: str | None,
root_system_id: str,
category: str,
ecosystem: str,
official: bool,
status: str,
history_policy: str,
repo_url: str,
package_registry: str,
marketplace_url: str,
latest_version: str,
version_scheme: str,
source_refs: List[Dict[str, Any]],
) -> Dict[str, Any]:
return {
"entity_id": entity_id,
"entity_type": entity_type,
"display_name": display_name,
"parent_entity_id": parent_entity_id,
"root_system_id": root_system_id,
"category": category,
"ecosystem": ecosystem,
"official": official,
"status": status,
"history_policy": history_policy,
"repo_url": repo_url,
"package_registry": package_registry,
"marketplace_url": marketplace_url,
"latest_version": latest_version,
"version_scheme": version_scheme,
"latest_release_at": "",
"latest_release_url": "",
"version_source_refs": [],
"version_sync_status": "pending",
"security_version_count": 0,
"last_version_synced_at": "",
"latest_version_evidence": [],
"catalog_source": "",
"catalog_reason": "",
"auto_cataloged": False,
"last_discovered_at": "",
"last_synced_at": "",
"history_backfill_status": "pending",
"latest_sync_status": "pending",
"official_source_covered": False,
"advisory_count": 0,
"workflow_complete_advisory_count": 0,
"version_mapped_advisory_count": 0,
"first_advisory_at": "",
"latest_advisory_at": "",
"advisory_ids": [],
"source_refs": source_refs,
}
def _merge_source_refs(primary: List[Dict[str, Any]], secondary: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
merged: List[Dict[str, Any]] = []
seen = set()
for item in (primary or []) + (secondary or []):
if not isinstance(item, dict):
continue
key = (
item.get("name") or "",
item.get("url") or "",
item.get("kind") or "",
item.get("bucket") or "",
)
if key in seen:
continue
seen.add(key)
merged.append(item)
return merged
def _merge_entity_overlay(entity: Dict[str, Any], overlay: Dict[str, Any] | None) -> Dict[str, Any]:
    """Layer selected fields of *overlay* on top of *entity*.

    With a falsy overlay, *entity* itself is returned.  Otherwise a copy is
    returned in which each overridable key present in the overlay replaces the
    entity value, unless the overlay value is ``None``, ``""``, ``[]`` or
    ``{}``.  ``0`` and ``False`` are kept and do override.  ``source_refs`` is
    the de-duplicated union of both sides, entity references first.
    """
    if not overlay:
        return entity

    overridable = (
        "status",
        "history_policy",
        "repo_url",
        "package_registry",
        "marketplace_url",
        "latest_version",
        "version_scheme",
        "latest_release_at",
        "latest_release_url",
        "version_source_refs",
        "version_sync_status",
        "security_version_count",
        "last_version_synced_at",
        "latest_version_evidence",
        "catalog_source",
        "catalog_reason",
        "auto_cataloged",
        "last_discovered_at",
        "last_synced_at",
        "history_backfill_status",
        "latest_sync_status",
        "official_source_covered",
    )
    overrides = {
        key: overlay[key]
        for key in overridable
        if key in overlay and overlay[key] not in (None, "", [], {})
    }
    merged = {**entity, **overrides}
    merged["source_refs"] = _merge_source_refs(entity.get("source_refs", []), overlay.get("source_refs", []))
    return merged
def _update_entity_stats(entity: Dict[str, Any], advisories: List[Dict[str, Any]]) -> None:
    """Recompute the advisory-derived fields of *entity* in place.

    Updates the advisory counters and id list, the first/latest advisory
    timestamps (with the discovery and sync timestamps mirroring the latest
    one), ``official_source_covered``, ``history_backfill_status``,
    ``latest_sync_status`` and, if still empty, ``latest_version``.

    Args:
        entity: Entity record to mutate; must already hold ``entity_type``.
        advisories: Advisory dicts that reference this entity.
    """
    advisory_ids = [item.get("canonical_id") for item in advisories if item.get("canonical_id")]
    # `workflow` may be present but null; `.get("workflow", {})` would return
    # None in that case and crash on the chained lookup.
    workflow_count = sum(1 for item in advisories if (item.get("workflow") or {}).get("workflow_id"))
    version_mapped_count = sum(1 for item in advisories if not item.get("version_resolution_needed"))

    timestamps = []
    for advisory in advisories:
        for field in ("published_at", "updated_at"):
            moment = parse_dt(advisory.get(field))
            if moment is not None:
                timestamps.append(moment)

    entity["advisory_count"] = len(advisories)
    entity["workflow_complete_advisory_count"] = workflow_count
    entity["version_mapped_advisory_count"] = version_mapped_count
    entity["advisory_ids"] = advisory_ids

    if timestamps:
        entity["first_advisory_at"] = isoformat(min(timestamps))
        entity["latest_advisory_at"] = isoformat(max(timestamps))
        # Discovery and sync timestamps mirror the newest advisory timestamp.
        entity["last_discovered_at"] = entity["latest_advisory_at"]
        entity["last_synced_at"] = entity["latest_advisory_at"]

    entity["official_source_covered"] = bool(
        entity.get("official_source_covered")
        or any(source.get("official") for source in entity.get("source_refs", []))
        or any(advisory.get("official_source_url") for advisory in advisories)
    )

    advisory_count = entity["advisory_count"]
    if advisory_count:
        full_history_system = entity["entity_type"] == "system" and entity.get("history_policy") == "history-full"
        # At most one of two-or-more advisories may still lack a version mapping.
        mostly_mapped = advisory_count >= 2 and version_mapped_count >= max(1, advisory_count - 1)
        entity["history_backfill_status"] = "complete" if full_history_system or mostly_mapped else "seeded"

    entity["latest_sync_status"] = "green" if entity["official_source_covered"] and advisory_count else "pending"

    if not entity.get("latest_version"):
        # The first advisory that names a patched version supplies the value.
        for advisory in advisories:
            patched = advisory.get("patched_version")
            if patched:
                entity["latest_version"] = patched
                break
def _candidate_from_source(system: Dict[str, Any], source: Dict[str, Any], known_repo_urls: set[str]) -> Dict[str, Any] | None:
    """Turn a catalog source into a backlog candidate, if it names a stable object.

    Recognised URL shapes, checked in order: a GitHub repository, an npm
    package page, a Packagist package page, and a marketplace-style path
    (``/plugins/``, ``/themes/``, ``/extensions/``, ``/modules/``,
    ``/marketplace/``).

    Returns:
        The candidate dict, or ``None`` when the URL matches none of the
        shapes or the GitHub repository is already in *known_repo_urls*.
    """
    url = (source.get("url") or "").strip()
    type_hint = source.get("entity_type_hint")
    entity_type = type_hint or "project"
    repo_url = _github_repo_from_url(url)
    package_registry = ""
    marketplace_url = ""
    display_name = ""
    stable_url = repo_url

    if repo_url:
        if repo_url in known_repo_urls:
            return None
        entity_type = type_hint or "repo"
        found = re.match(r"https://github\.com/([^/]+)/([^/#?]+)", repo_url, re.IGNORECASE)
        if found:
            owner, repo = found.groups()
            display_name = f"{owner} / {repo}"
    elif "npmjs.com/package/" in url:
        entity_type = type_hint or "package"
        package_name = url.split("/package/", 1)[1].split("?", 1)[0].strip("/")
        package_registry = f"https://www.npmjs.com/package/{package_name}"
        display_name = package_name
        stable_url = package_registry
    elif "packagist.org/packages/" in url:
        entity_type = type_hint or "package"
        package_name = url.split("/packages/", 1)[1].split("?", 1)[0].strip("/")
        package_registry = f"https://packagist.org/packages/{package_name}"
        display_name = package_name.replace("/", " / ")
        stable_url = package_registry
    elif any(token in url.lower() for token in ("/plugins/", "/themes/", "/extensions/", "/modules/", "/marketplace/")):
        # Marketplace entries keep the hinted type, or the "project" default.
        marketplace_url = url
        display_name = _marketplace_slug(url).replace("-", " ")
        stable_url = marketplace_url
    else:
        return None

    if not display_name:
        display_name = source.get("name") or system.get("display_name") or system.get("system_id")

    system_id = system.get("system_id")
    candidate_slug = slugify(stable_url or display_name)
    return {
        "candidate_id": f"{system_id}--{entity_type}-candidate--{candidate_slug}",
        "root_system_id": system_id,
        "display_name": display_name,
        "entity_type": entity_type,
        "status": "candidate",
        "reason": "source catalog exposed a stable security-related object that is not yet cataloged as an entity",
        "source": url,
        "source_name": source.get("name") or "",
        "source_confidence": source.get("confidence") or "unknown",
        "source_bucket": source.get("bucket_name") or "",
        "auto_catalog": bool(source.get("auto_catalog")),
        "repo_url": repo_url,
        "package_registry": package_registry,
        "marketplace_url": marketplace_url,
        "risk": "medium",
        "waiting_for": "确认是否应升级为 cataloged repo/plugin/package 实体并补齐安全相关版本与历史漏洞",
        "canonical_id": "",
    }
def _empty_system_summary(system_id: Any, display_name: Any) -> Dict[str, Any]:
    """Return a zeroed per-system summary row for the completeness report."""
    return {
        "system_id": system_id,
        "display_name": display_name,
        "cataloged_entity_total": 0,
        "child_entity_total": 0,
        "candidate_entity_total": 0,
        "workflow_complete_count": 0,
        "version_mapped_count": 0,
        "official_source_covered_count": 0,
        "history_full_complete_count": 0,
        "latest_green_count": 0,
        "version_gap_entity_count": 0,
        "workflow_gap_entity_count": 0,
        "plugin_total": 0,
        "entity_type_counts": {},
        "top_entities": [],
        "backlog_preview": [],
    }


def _workflow_section(advisory: Dict[str, Any]) -> Dict[str, Any]:
    """Return the advisory's ``workflow`` dict, treating a missing or null value as empty."""
    return advisory.get("workflow") or {}


def build_entity_views(source_map: Dict[str, Any], advisories: List[AdvisoryRecord | Dict[str, Any]]) -> Dict[str, Any]:
    """Build the entity catalog, backlog, queues and reports from advisories.

    Args:
        source_map: Source catalog; its ``systems`` list supplies the root
            systems and their source buckets.
        advisories: Advisory records or dicts to attribute to entities.

    Returns:
        A dict with ``generated_at``, ``entity_payloads`` (file name to
        entity), ``entities`` (sorted by id), ``candidate_backlog``,
        ``completeness``, ``queues`` and the two Markdown reports
        (``catalog_report_markdown``, ``backlog_report_markdown``).
    """
    generated_at = isoformat(now_utc())
    systems = {item["system_id"]: item for item in source_map.get("systems", []) or [] if item.get("system_id")}
    advisory_rows = [_advisory_dict(item) for item in advisories]
    # Previously persisted entities serve as an overlay; rows whose root system
    # is not in the source map are ignored.
    existing_entities = {
        item.get("entity_id"): item
        for item in load_all_json(ENTITIES_DIR)
        if item.get("entity_id") and item.get("root_system_id") in systems
    }
    entities: Dict[str, Dict[str, Any]] = {}
    advisories_by_entity: Dict[str, List[Dict[str, Any]]] = defaultdict(list)

    # One root entity per configured system.
    for system_id, system in systems.items():
        history_policy = system.get("tier") or "history-full"
        entities[system_id] = _merge_entity_overlay(
            _entity_payload(
                entity_id=system_id,
                entity_type="system",
                display_name=system.get("display_name", system_id),
                parent_entity_id=None,
                root_system_id=system_id,
                category=system.get("category", "unknown"),
                ecosystem=system.get("category", "unknown"),
                official=True,
                status="cataloged",
                history_policy=history_policy,
                repo_url="",
                package_registry="",
                marketplace_url="",
                latest_version="",
                version_scheme="vendor",
                source_refs=_source_refs(system),
            ),
            existing_entities.get(system_id),
        )

    # Create entities referenced by advisories and group advisories per entity.
    for advisory in advisory_rows:
        root_system_id = advisory.get("system_id")
        system = systems.get(root_system_id, {})
        refs = advisory.get("entity_refs") or [{"entity_id": root_system_id, "entity_type": "system"}]
        for ref in refs:
            entity_id = ref.get("entity_id")
            if not entity_id:
                continue
            if entity_id not in entities:
                raw_package_name = advisory.get("package_name")
                package_name = raw_package_name or advisory.get("title") or entity_id
                entity_type = ref.get("entity_type") or infer_entity_type(advisory)
                entities[entity_id] = _merge_entity_overlay(
                    _entity_payload(
                        entity_id=entity_id,
                        entity_type=entity_type,
                        display_name=_display_name(package_name, entity_id),
                        parent_entity_id=root_system_id,
                        root_system_id=root_system_id,
                        category=system.get("category", advisory.get("category", "unknown")),
                        ecosystem=raw_package_name.split("/", 1)[0] if raw_package_name else system.get("category", "unknown"),
                        official=entity_type in {"project", "repo"},
                        status="cataloged",
                        history_policy="history-full",
                        repo_url=_repo_url_from_package(raw_package_name or ""),
                        package_registry=_package_registry_url(raw_package_name or ""),
                        marketplace_url="",
                        latest_version=advisory.get("patched_version") or "",
                        version_scheme="semver-ish" if raw_package_name else "vendor",
                        source_refs=[],
                    ),
                    existing_entities.get(entity_id),
                )
            advisories_by_entity[entity_id].append(advisory)

    # Recompute stats, then re-apply the persisted overlay so stored values
    # for the overridable fields win over the freshly derived ones.
    for entity_id, advisories_for_entity in advisories_by_entity.items():
        _update_entity_stats(entities[entity_id], advisories_for_entity)
        entities[entity_id] = _merge_entity_overlay(entities[entity_id], existing_entities.get(entity_id))

    # Keep persisted entities that no current advisory or system produced.
    for entity_id, item in existing_entities.items():
        if entity_id in entities:
            continue
        entities[entity_id] = item

    known_repo_urls = {entity.get("repo_url") for entity in entities.values() if entity.get("repo_url")}
    candidate_map: Dict[str, Dict[str, Any]] = {}
    for system in systems.values():
        for bucket in ("official_sources", "ecosystem_sources"):
            for source in system.get(bucket, []) or []:
                candidate = _candidate_from_source(system, source, known_repo_urls)
                if candidate:
                    candidate_map[candidate["candidate_id"]] = candidate
    candidate_backlog = sorted(candidate_map.values(), key=lambda item: (item["root_system_id"], item["display_name"]))

    system_summary: Dict[str, Dict[str, Any]] = {}
    for entity in entities.values():
        root_id = entity["root_system_id"]
        summary = system_summary.setdefault(
            root_id,
            _empty_system_summary(root_id, systems.get(root_id, {}).get("display_name", root_id)),
        )
        summary["cataloged_entity_total"] += 1
        if entity["entity_type"] != "system":
            summary["child_entity_total"] += 1
        # NOTE(review): type counts cover every entity, including the root
        # system row — confirm against the intended report semantics.
        type_counts = summary["entity_type_counts"]
        type_counts[entity["entity_type"]] = type_counts.get(entity["entity_type"], 0) + 1
        if entity["advisory_count"] and entity["workflow_complete_advisory_count"] >= entity["advisory_count"]:
            summary["workflow_complete_count"] += 1
        if entity["advisory_count"] and entity["version_mapped_advisory_count"] >= entity["advisory_count"]:
            summary["version_mapped_count"] += 1
        if entity["official_source_covered"]:
            summary["official_source_covered_count"] += 1
        if entity.get("history_backfill_status") == "complete":
            summary["history_full_complete_count"] += 1
        if entity.get("latest_sync_status") == "green":
            summary["latest_green_count"] += 1
        if entity.get("advisory_count") and entity.get("version_mapped_advisory_count", 0) < entity.get("advisory_count", 0):
            summary["version_gap_entity_count"] += 1
        if entity.get("advisory_count") and entity.get("workflow_complete_advisory_count", 0) < entity.get("advisory_count", 0):
            summary["workflow_gap_entity_count"] += 1
        if entity["entity_type"] in PLUGINISH_ENTITY_TYPES:
            summary["plugin_total"] += 1

    # Top five child entities per system, by advisory count, then type and name.
    for system_id, summary in system_summary.items():
        ranked_entities = sorted(
            [
                entity
                for entity in entities.values()
                if entity["root_system_id"] == system_id and entity["entity_type"] != "system"
            ],
            key=lambda item: (-(item.get("advisory_count") or 0), item["entity_type"], item["display_name"].lower()),
        )
        summary["top_entities"] = [
            {
                "entity_id": entity["entity_id"],
                "entity_type": entity["entity_type"],
                "display_name": entity["display_name"],
                "advisory_count": entity.get("advisory_count", 0),
                "history_backfill_status": entity.get("history_backfill_status"),
                "latest_sync_status": entity.get("latest_sync_status"),
            }
            for entity in ranked_entities[:5]
        ]

    for candidate in candidate_backlog:
        root_id = candidate["root_system_id"]
        summary = system_summary.setdefault(
            root_id,
            _empty_system_summary(root_id, systems.get(root_id, {}).get("display_name", root_id)),
        )
        summary["candidate_entity_total"] += 1
        preview = summary["backlog_preview"]
        if len(preview) < 5:
            preview.append(
                {
                    "candidate_id": candidate["candidate_id"],
                    "display_name": candidate["display_name"],
                    "entity_type": candidate["entity_type"],
                    "risk": candidate["risk"],
                }
            )

    cataloged_entities = [entity for entity in entities.values() if entity.get("status") == "cataloged"]
    history_full_complete_count = len(
        [
            entity
            for entity in cataloged_entities
            if entity.get("history_policy") == "history-full" and entity.get("history_backfill_status") == "complete"
        ]
    )
    latest_green_count = len([entity for entity in cataloged_entities if entity.get("latest_sync_status") == "green"])
    workflow_complete_count = len(
        [
            entity
            for entity in cataloged_entities
            if entity.get("advisory_count") and entity.get("workflow_complete_advisory_count") >= entity.get("advisory_count")
        ]
    )
    version_mapped_count = len(
        [
            entity
            for entity in cataloged_entities
            if entity.get("advisory_count") and entity.get("version_mapped_advisory_count") >= entity.get("advisory_count")
        ]
    )
    official_source_covered_count = len([entity for entity in cataloged_entities if entity.get("official_source_covered")])
    plugin_history_full_count = len(
        [
            entity
            for entity in cataloged_entities
            if entity.get("entity_type") in PLUGINISH_ENTITY_TYPES and entity.get("history_backfill_status") == "complete"
        ]
    )

    # Each queue is filtered once; `count` is the full size and `items` is
    # capped at 200 entries.
    history_pending = [
        entity
        for entity in cataloged_entities
        if entity.get("history_policy") == "history-full" and entity.get("history_backfill_status") != "complete"
    ]
    latest_pending = [entity for entity in cataloged_entities if entity.get("latest_sync_status") != "green"]
    workflow_pending = [
        advisory for advisory in advisory_rows if _workflow_section(advisory).get("review_state") != "ready"
    ]
    queue_payload = {
        "generated_at": generated_at,
        "discovery_queue": {
            "count": len(candidate_backlog),
            "items": candidate_backlog[:200],
        },
        "history_queue": {
            "count": len(history_pending),
            "items": [
                {
                    "entity_id": entity["entity_id"],
                    "display_name": entity["display_name"],
                    "root_system_id": entity["root_system_id"],
                    "history_policy": entity["history_policy"],
                    "history_backfill_status": entity["history_backfill_status"],
                }
                for entity in history_pending[:200]
            ],
        },
        "latest_queue": {
            "count": len(latest_pending),
            "items": [
                {
                    "entity_id": entity["entity_id"],
                    "display_name": entity["display_name"],
                    "root_system_id": entity["root_system_id"],
                    "latest_sync_status": entity["latest_sync_status"],
                    "last_synced_at": entity["last_synced_at"],
                }
                for entity in latest_pending[:200]
            ],
        },
        "workflow_queue": {
            "count": len(workflow_pending),
            "items": [
                {
                    "canonical_id": advisory.get("canonical_id"),
                    "system_id": advisory.get("system_id"),
                    "title": advisory.get("title"),
                    "review_state": _workflow_section(advisory).get("review_state"),
                    "version_resolution_needed": advisory.get("version_resolution_needed"),
                }
                for advisory in workflow_pending[:200]
            ],
        },
    }

    completeness = {
        "generated_at": generated_at,
        "cataloged_entity_total": len(cataloged_entities),
        "candidate_entity_total": len(candidate_backlog),
        "history_full_complete_count": history_full_complete_count,
        "latest_green_count": latest_green_count,
        "workflow_complete_count": workflow_complete_count,
        "version_mapped_count": version_mapped_count,
        "official_source_covered_count": official_source_covered_count,
        "plugin_history_full_count": plugin_history_full_count,
        "systems": sorted(system_summary.values(), key=lambda item: item["system_id"]),
        "queues": {
            key: value["count"] for key, value in queue_payload.items() if key.endswith("_queue")
        },
    }

    report_lines = [
        "# 分层实体覆盖与完整度报告",
        "",
        f"- 生成时间: `{generated_at}`",
        f"- 已编目实体: `{completeness['cataloged_entity_total']}`",
        f"- 待编目 backlog: `{completeness['candidate_entity_total']}`",
        f"- history-full 已完成: `{completeness['history_full_complete_count']}`",
        f"- latest green: `{completeness['latest_green_count']}`",
        f"- workflow 完整: `{completeness['workflow_complete_count']}`",
        f"- 版本映射完整: `{completeness['version_mapped_count']}`",
        f"- 官方源覆盖: `{completeness['official_source_covered_count']}`",
        f"- 插件 history-full 已完成: `{completeness['plugin_history_full_count']}`",
        "",
        "## 系统分层摘要",
        "",
        "| 系统 | cataloged entities | candidate backlog | workflow complete | version mapped | official covered | plugins |",
        "| --- | ---: | ---: | ---: | ---: | ---: | ---: |",
    ]
    for item in completeness["systems"]:
        report_lines.append(
            f"| {item['system_id']} | {item['cataloged_entity_total']} | {item['candidate_entity_total']} | {item['workflow_complete_count']} | {item['version_mapped_count']} | {item['official_source_covered_count']} | {item['plugin_total']} |"
        )

    backlog_lines = [
        "# 分层实体发现 Backlog",
        "",
        f"- 生成时间: `{generated_at}`",
        f"- 待编目数量: `{len(candidate_backlog)}`",
        "",
        "| candidate_id | root_system | entity_type | risk | reason | waiting_for | source |",
        "| --- | --- | --- | --- | --- | --- | --- |",
    ]
    if candidate_backlog:
        # The Markdown table lists at most 500 candidates.
        for item in candidate_backlog[:500]:
            backlog_lines.append(
                f"| {item['candidate_id']} | {item['root_system_id']} | {item['entity_type']} | {item['risk']} | {item['reason']} | {item['waiting_for']} | {item.get('source') or '-'} |"
            )
    else:
        backlog_lines.append("| - | - | - | - | no backlog | - | - |")

    sorted_entities = sorted(entities.values(), key=lambda item: item["entity_id"])
    entity_payloads = {f"{entity['entity_id']}.json": entity for entity in sorted_entities}
    return {
        "generated_at": generated_at,
        "entity_payloads": entity_payloads,
        "entities": sorted_entities,
        "candidate_backlog": candidate_backlog,
        "completeness": completeness,
        "queues": queue_payload,
        "catalog_report_markdown": "\n".join(report_lines),
        "backlog_report_markdown": "\n".join(backlog_lines),
    }