更新: 13 个文件 - 2026-03-18 09:44:57

2026-03-18 09:44:57 -07:00
--- a/scripts/intel/monitoring.py
+++ b/scripts/intel/monitoring.py
@@ -0,0 +1,377 @@
+from __future__ import annotations
+
+from datetime import timedelta
+from pathlib import Path
+from typing import Any, Dict, List
+
+from intel.config import (
+    ALERTS_PATH,
+    MACHINE_READABLE_SOURCE_KINDS,
+    MONITORING_DIR,
+    MONITOR_SUMMARY_PATH,
+    RETIRED_SOURCES_PATH,
+    SOURCE_CATALOG_AUDIT_MD_PATH,
+    SOURCE_CATALOG_AUDIT_PATH,
+    SOURCE_HEALTH_PATH,
+    iter_all_sources,
+)
+from intel.utils import ensure_dir, isoformat, now_utc, parse_dt, read_json, write_json, write_text
+
+
+def source_key(system_id: str, source_name: str) -> str:
+    return f"{system_id}::{source_name}"
+
+
+def normalize_failures(items: List[Any]) -> List[Dict[str, Any]]:
+    normalized: List[Dict[str, Any]] = []
+    for item in items or []:
+        if isinstance(item, dict):
+            cloned = dict(item)
+            if not cloned.get("summary"):
+                cloned["summary"] = f"{cloned.get('system_id')}::{cloned.get('source_name')}::{cloned.get('category')}::{cloned.get('message') or cloned.get('exception')}"
+            normalized.append(cloned)
+            continue
+        if not isinstance(item, str):
+            continue
+        parts = item.split("::", 3)
+        system_id = parts[0] if len(parts) > 0 else "unknown"
+        source_name = parts[1] if len(parts) > 1 else "unknown"
+        exception = parts[2] if len(parts) > 2 else "UnknownError"
+        message = parts[3] if len(parts) > 3 else exception
+        category = "tls" if "ssl" in exception.lower() else "http_status" if "http" in exception.lower() else "network"
+        normalized.append(
+            {
+                "system_id": system_id,
+                "display_name": system_id,
+                "source_name": source_name,
+                "source_kind": "",
+                "source_bucket": "",
+                "category": category,
+                "exception": exception,
+                "message": message,
+                "status_code": None,
+                "url": "",
+                "summary": item,
+            }
+        )
+    return normalized
+
+
+def build_source_catalog_audit(source_map: Dict[str, Any]) -> Dict[str, Any]:
+    systems_payload: List[Dict[str, Any]] = []
+    retired_sources: List[Dict[str, Any]] = []
+    replacement_map: List[Dict[str, Any]] = []
+    total_sources = 0
+    active_sources = 0
+    retired_count = 0
+    systems_with_active_official = 0
+    systems_with_machine_source = 0
+
+    for system in source_map.get("systems", []) or []:
+        active_official = 0
+        active_machine = 0
+        bucket_counts = {"official_sources": 0, "ecosystem_sources": 0, "research_sources": 0}
+        retired_for_system: List[Dict[str, Any]] = []
+        for bucket_name, source in ((bucket, src) for _system, bucket, src in iter_all_sources({"systems": [system]}, include_retired=True)):
+            total_sources += 1
+            if source.get("status") == "retired":
+                retired_count += 1
+                retired_entry = {
+                    "system_id": system["system_id"],
+                    "display_name": system["display_name"],
+                    "source_name": source["name"],
+                    "bucket": bucket_name,
+                    "kind": source["kind"],
+                    "retired_reason": source.get("retired_reason") or "",
+                    "replacement_sources": source.get("replacement_sources") or [],
+                    "url": source.get("url") or "",
+                }
+                retired_sources.append(retired_entry)
+                retired_for_system.append(retired_entry)
+                replacement_map.append(
+                    {
+                        "system_id": system["system_id"],
+                        "retired_source": source["name"],
+                        "replacement_sources": source.get("replacement_sources") or [],
+                    }
+                )
+                continue
+            active_sources += 1
+            bucket_counts[bucket_name] += 1
+            if bucket_name == "official_sources":
+                active_official += 1
+            if source.get("kind") in MACHINE_READABLE_SOURCE_KINDS:
+                active_machine += 1
+        if active_official:
+            systems_with_active_official += 1
+        if active_machine:
+            systems_with_machine_source += 1
+        systems_payload.append(
+            {
+                "system_id": system["system_id"],
+                "display_name": system["display_name"],
+                "category": system.get("category"),
+                "tier": system.get("tier"),
+                "source_total": sum(bucket_counts.values()) + len(retired_for_system),
+                "active_source_total": sum(bucket_counts.values()),
+                "retired_source_total": len(retired_for_system),
+                "official_active": bucket_counts["official_sources"],
+                "ecosystem_active": bucket_counts["ecosystem_sources"],
+                "research_active": bucket_counts["research_sources"],
+                "machine_readable_active": active_machine,
+                "has_active_official": bool(active_official),
+                "has_machine_readable_source": bool(active_machine),
+            }
+        )
+
+    audit = {
+        "generated_at": isoformat(now_utc()),
+        "system_count": len(source_map.get("systems", []) or []),
+        "source_count": total_sources,
+        "active_source_count": active_sources,
+        "retired_source_count": retired_count,
+        "systems_with_active_official": systems_with_active_official,
+        "systems_with_machine_readable_source": systems_with_machine_source,
+        "systems": sorted(systems_payload, key=lambda item: item["system_id"]),
+        "retired_sources": sorted(retired_sources, key=lambda item: (item["system_id"], item["source_name"])),
+        "replacement_map": sorted(replacement_map, key=lambda item: (item["system_id"], item["retired_source"])),
+    }
+    return audit
+
+
+def write_source_catalog_audit(source_map: Dict[str, Any]) -> Dict[str, Any]:
+    audit = build_source_catalog_audit(source_map)
+    lines = [
+        "# Source Catalog Audit",
+        "",
+        f"- generated_at: `{audit['generated_at']}`",
+        f"- systems: `{audit['system_count']}`",
+        f"- sources: `{audit['source_count']}`",
+        f"- active_sources: `{audit['active_source_count']}`",
+        f"- retired_sources: `{audit['retired_source_count']}`",
+        f"- systems_with_active_official: `{audit['systems_with_active_official']}/{audit['system_count']}`",
+        f"- systems_with_machine_readable_source: `{audit['systems_with_machine_readable_source']}/{audit['system_count']}`",
+        "",
+        "## Retired Sources",
+        "",
+    ]
+    if audit["retired_sources"]:
+        for item in audit["retired_sources"]:
+            replacements = ", ".join(item.get("replacement_sources") or []) or "-"
+            lines.append(
+                f"- `{item['system_id']}` `{item['source_name']}` -> replacements: `{replacements}` | reason: {item.get('retired_reason') or 'n/a'}"
+            )
+    else:
+        lines.append("- none")
+    write_json(SOURCE_CATALOG_AUDIT_PATH, audit)
+    write_text(SOURCE_CATALOG_AUDIT_MD_PATH, "\n".join(lines))
+    write_json(RETIRED_SOURCES_PATH, audit["retired_sources"])
+    return audit
+
+
+def build_source_health_snapshot(
+    source_map: Dict[str, Any],
+    probes: List[Dict[str, Any]],
+    failures: List[Dict[str, Any]],
+    *,
+    previous: Dict[str, Any] | None = None,
+    retries_performed: int = 0,
+) -> Dict[str, Any]:
+    normalized_failures = normalize_failures(failures)
+    active_source_total = sum(1 for _system, _bucket, _source in iter_all_sources(source_map, include_retired=False))
+    green_sources = max(0, active_source_total - len(normalized_failures))
+    systems: Dict[str, Dict[str, Any]] = {}
+    for system in source_map.get("systems", []) or []:
+        systems[system["system_id"]] = {
+            "system_id": system["system_id"],
+            "display_name": system["display_name"],
+            "active_source_total": 0,
+            "green_source_total": 0,
+            "failure_count": 0,
+        }
+    for system, _bucket, _source in iter_all_sources(source_map, include_retired=False):
+        systems[system["system_id"]]["active_source_total"] += 1
+    for probe in probes:
+        systems.setdefault(
+            probe["system_id"],
+            {"system_id": probe["system_id"], "display_name": probe.get("display_name", probe["system_id"]), "active_source_total": 0, "green_source_total": 0, "failure_count": 0},
+        )["green_source_total"] += 1
+    for failure in normalized_failures:
+        systems.setdefault(
+            failure["system_id"],
+            {"system_id": failure["system_id"], "display_name": failure.get("display_name", failure["system_id"]), "active_source_total": 0, "green_source_total": 0, "failure_count": 0},
+        )["failure_count"] += 1
+
+    generated_at = isoformat(now_utc())
+    all_green = not normalized_failures
+    previous = previous or {}
+    last_fully_green_run = generated_at if all_green else previous.get("last_fully_green_run")
+    return {
+        "generated_at": generated_at,
+        "active_source_count": active_source_total,
+        "green_source_count": green_sources,
+        "failure_count": len(normalized_failures),
+        "all_green": all_green,
+        "last_fully_green_run": last_fully_green_run,
+        "retries_performed": retries_performed,
+        "probes": sorted(probes, key=lambda item: (item["system_id"], item["source_name"])),
+        "failures": normalized_failures,
+        "systems": sorted(systems.values(), key=lambda item: item["system_id"]),
+    }
+
+
+def write_source_health(snapshot: Dict[str, Any]) -> None:
+    write_json(SOURCE_HEALTH_PATH, snapshot)
+
+
+def build_alerts(
+    current_failures: List[Dict[str, Any]],
+    *,
+    previous_alerts: List[Dict[str, Any]] | None = None,
+    bootstrap_failures: List[Dict[str, Any]] | None = None,
+    generated_at: str | None = None,
+) -> List[Dict[str, Any]]:
+    now_value = generated_at or isoformat(now_utc())
+    normalized_current = {source_key(item["system_id"], item["source_name"]): item for item in normalize_failures(current_failures)}
+
+    previous_entries = list(previous_alerts or [])
+    if not previous_entries and bootstrap_failures:
+        for item in normalize_failures(bootstrap_failures):
+            previous_entries.append(
+                {
+                    "alert_id": source_key(item["system_id"], item["source_name"]),
+                    "system_id": item["system_id"],
+                    "display_name": item.get("display_name", item["system_id"]),
+                    "source_name": item["source_name"],
+                    "source_kind": item.get("source_kind"),
+                    "status": "open",
+                    "opened_at": now_value,
+                    "updated_at": now_value,
+                    "resolved_at": None,
+                    "failure_streak": 1,
+                    "last_category": item.get("category"),
+                    "last_failure": item,
+                }
+            )
+
+    previous_by_key = {item["alert_id"]: item for item in previous_entries if item.get("alert_id")}
+    next_alerts: List[Dict[str, Any]] = []
+
+    for alert_id, failure in normalized_current.items():
+        previous = previous_by_key.get(alert_id)
+        if previous and previous.get("status") == "open":
+            streak = int(previous.get("failure_streak") or 0) + 1
+            opened_at = previous.get("opened_at") or now_value
+        else:
+            streak = 1
+            opened_at = now_value
+        next_alerts.append(
+            {
+                "alert_id": alert_id,
+                "system_id": failure["system_id"],
+                "display_name": failure.get("display_name", failure["system_id"]),
+                "source_name": failure["source_name"],
+                "source_kind": failure.get("source_kind"),
+                "status": "open",
+                "opened_at": opened_at,
+                "updated_at": now_value,
+                "resolved_at": None,
+                "failure_streak": streak,
+                "last_category": failure.get("category"),
+                "last_failure": failure,
+            }
+        )
+
+    for alert_id, previous in previous_by_key.items():
+        if alert_id in normalized_current:
+            continue
+        if previous.get("status") == "open":
+            resolved = dict(previous)
+            resolved["status"] = "resolved"
+            resolved["updated_at"] = now_value
+            resolved["resolved_at"] = now_value
+            next_alerts.append(resolved)
+        else:
+            next_alerts.append(previous)
+
+    return sorted(next_alerts, key=lambda item: (item.get("status") != "open", item.get("system_id"), item.get("source_name")))
+
+
+def write_alerts(alerts: List[Dict[str, Any]]) -> None:
+    write_json(ALERTS_PATH, alerts)
+
+
+def _history_filename(generated_at: str) -> Path:
+    safe_name = generated_at.replace(":", "-")
+    return MONITORING_DIR / f"{safe_name}.json"
+
+
+def _prune_monitoring_history(now_value: str) -> None:
+    ensure_dir(MONITORING_DIR)
+    current_dt = parse_dt(now_value)
+    if current_dt is None:
+        return
+    cutoff = current_dt - timedelta(days=90)
+    for path in sorted(MONITORING_DIR.glob("*.json")):
+        stem = path.stem.replace("-", ":", 2)
+        snapshot_dt = parse_dt(stem)
+        if snapshot_dt is None:
+            continue
+        if snapshot_dt < cutoff:
+            path.unlink()
+
+
+def write_monitoring_state(
+    *,
+    audit: Dict[str, Any],
+    source_health: Dict[str, Any],
+    alerts: List[Dict[str, Any]],
+    ingest_summary: Dict[str, Any],
+    validation_errors: List[str],
+) -> Dict[str, Any]:
+    open_alerts = [item for item in alerts if item.get("status") == "open"]
+    generated_at = source_health.get("generated_at") or isoformat(now_utc())
+    summary = {
+        "generated_at": generated_at,
+        "active_source_count": source_health.get("active_source_count", 0),
+        "green_source_count": source_health.get("green_source_count", 0),
+        "source_failure_count": source_health.get("failure_count", 0),
+        "open_alert_count": len(open_alerts),
+        "resolved_alert_count": len([item for item in alerts if item.get("status") == "resolved"]),
+        "last_fully_green_run": source_health.get("last_fully_green_run"),
+        "source_catalog": {
+            "system_count": audit.get("system_count", 0),
+            "source_count": audit.get("source_count", 0),
+            "retired_source_count": audit.get("retired_source_count", 0),
+        },
+        "ingest": {
+            "new_count": ingest_summary.get("new_count", 0),
+            "updated_count": ingest_summary.get("updated_count", 0),
+            "failure_count": len(normalize_failures(ingest_summary.get("failures", []))),
+            "systems_touched": ingest_summary.get("systems_touched", []),
+        },
+        "validation": {
+            "passed": not validation_errors,
+            "error_count": len(validation_errors),
+            "errors": validation_errors,
+        },
+    }
+    snapshot = {
+        "generated_at": generated_at,
+        "source_catalog_audit": audit,
+        "source_health": source_health,
+        "alerts": alerts,
+        "monitor_summary": summary,
+    }
+    write_json(MONITOR_SUMMARY_PATH, summary)
+    write_json(_history_filename(generated_at), snapshot)
+    _prune_monitoring_history(generated_at)
+    return summary
+
+
+def read_previous_source_health() -> Dict[str, Any]:
+    return read_json(SOURCE_HEALTH_PATH, default={}) or {}
+
+
+def read_previous_alerts() -> List[Dict[str, Any]]:
+    return read_json(ALERTS_PATH, default=[]) or []