更新: 109 个文件 - 2026-03-18 10:55:52

2026-03-18 10:55:52 -07:00
--- a/scripts/intel/config.py
+++ b/scripts/intel/config.py
@@ -33,7 +33,7 @@ SOURCE_BUCKETS = ("official_sources", "ecosystem_sources", "research_sources")
 MACHINE_READABLE_SOURCE_KINDS = {"ghsa-global", "osv-batch", "nvd-search", "kev-json", "json-feed", "rss-feed", "atom-feed"}

 DEFAULT_REQUEST_POLICY = {
-    "user_agent": "websafe-intel",
+    "user_agent": "python-requests/2.31.0",
    "accept": "",
    "timeout_seconds": 30,
    "verify_tls": True,
--- a/scripts/intel/http_client.py
+++ b/scripts/intel/http_client.py
@@ -4,14 +4,12 @@ import time
 from typing import Any, Dict

 import requests
-from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry

 from intel.config import DEFAULT_HEALTH_POLICY, DEFAULT_REQUEST_POLICY


 DEFAULT_TIMEOUT = 30
-DEFAULT_USER_AGENT = "websafe-intel"
+DEFAULT_USER_AGENT = "python-requests/2.31.0"


 def _request_policy(source: Dict[str, Any] | None = None) -> Dict[str, Any]:
@@ -23,21 +21,8 @@ def _health_policy(source: Dict[str, Any] | None = None) -> Dict[str, Any]:


 def build_session(source: Dict[str, Any] | None = None) -> requests.Session:
-    health_policy = _health_policy(source)
    session = requests.Session()
-    retry = Retry(
-        total=int(health_policy.get("retries") or 3),
-        connect=int(health_policy.get("retries") or 3),
-        read=int(health_policy.get("retries") or 3),
-        status=int(health_policy.get("retries") or 3),
-        backoff_factor=float(health_policy.get("backoff_seconds") or 0.5),
-        allowed_methods=frozenset(["GET", "POST"]),
-        status_forcelist=[429, 500, 502, 503, 504],
-        raise_on_status=False,
-    )
-    adapter = HTTPAdapter(max_retries=retry)
-    session.mount("https://", adapter)
-    session.mount("http://", adapter)
+    session.trust_env = True
    request_policy = _request_policy(source)
    headers = {"User-Agent": request_policy.get("user_agent") or DEFAULT_USER_AGENT}
    if request_policy.get("accept"):
@@ -63,8 +48,6 @@ def request(
        headers["User-Agent"] = request_policy.get("user_agent") or DEFAULT_USER_AGENT
    if request_policy.get("accept") and "Accept" not in headers:
        headers["Accept"] = request_policy["accept"]
-    if request_policy.get("http_version") == "1.1" and "Connection" not in headers:
-        headers["Connection"] = "close"
    timeout_value = timeout if timeout != DEFAULT_TIMEOUT else int(request_policy.get("timeout_seconds") or DEFAULT_TIMEOUT)
    allow_redirects = kwargs.pop("allow_redirects", bool(request_policy.get("follow_redirects", True)))
    verify = kwargs.pop("verify", bool(request_policy.get("verify_tls", True)))
--- a/scripts/intel/main.py
+++ b/scripts/intel/main.py
@@ -242,9 +242,6 @@ def cmd_source_health(args) -> int:
        retries_performed=retries_performed,
    )
    write_source_health(snapshot)
-    render_map, advisories, triage = _load_existing_selection(full_source_map, source_map)
-    existing_summary = read_json(GENERATED_DIR / "run-summary.json", default={}) or {}
-    render_generated(render_map, advisories, triage, snapshot.get("failures", []), existing_summary)
    print(
        f"Source health checked {len(probes)} active sources across {len(source_map['systems'])} systems; failures {snapshot['failure_count']}; retries {retries_performed}"
    )
@@ -377,6 +374,14 @@ def cmd_monitor(args) -> int:
    )
    write_alerts(alerts)

+    write_monitoring_state(
+        audit=audit,
+        source_health=source_health,
+        alerts=alerts,
+        ingest_summary={**summary, "failures": ingest_failures},
+        validation_errors=[],
+    )
+    _refresh_render_state(full_source_map, source_map)
    validation_errors = validate(source_map)
    write_monitoring_state(
        audit=audit,
@@ -385,7 +390,6 @@ def cmd_monitor(args) -> int:
        ingest_summary={**summary, "failures": ingest_failures},
        validation_errors=validation_errors,
    )
-    _refresh_render_state(full_source_map, source_map)

    passed = not source_health.get("failures") and not ingest_failures and not validation_errors
    _write_state("success" if passed else "degraded", record_success=passed)
--- a/scripts/intel/monitoring.py
+++ b/scripts/intel/monitoring.py
@@ -313,8 +313,8 @@ def _prune_monitoring_history(now_value: str) -> None:
        return
    cutoff = current_dt - timedelta(days=90)
    for path in sorted(MONITORING_DIR.glob("*.json")):
-        stem = path.stem.replace("-", ":", 2)
-        snapshot_dt = parse_dt(stem)
+        snapshot = read_json(path, default={}) or {}
+        snapshot_dt = parse_dt(snapshot.get("generated_at"))
        if snapshot_dt is None:
            continue
        if snapshot_dt < cutoff:
--- a/scripts/intel/sources/nvd_api.py
+++ b/scripts/intel/sources/nvd_api.py
@@ -1,6 +1,8 @@
 from __future__ import annotations

 import os
+import threading
+import time
 from typing import Any, Dict, List

 import requests
@@ -11,6 +13,30 @@ from intel.utils import unique


 API_URL = "https://services.nvd.nist.gov/rest/json/cves/2.0"
+PUBLIC_INTERVAL_SECONDS = 7.0
+_NVD_RATE_LOCK = threading.Lock()
+_NVD_LAST_REQUEST = 0.0
+
+
+def _wait_for_slot() -> None:
+    global _NVD_LAST_REQUEST
+    if os.environ.get("NVD_API_KEY"):
+        return
+    with _NVD_RATE_LOCK:
+        elapsed = time.monotonic() - _NVD_LAST_REQUEST
+        if elapsed < PUBLIC_INTERVAL_SECONDS:
+            time.sleep(PUBLIC_INTERVAL_SECONDS - elapsed)
+        _NVD_LAST_REQUEST = time.monotonic()
+
+
+def request_nvd(source: Dict[str, Any], headers: Dict[str, Any], params: Dict[str, Any]) -> requests.Response:
+    _wait_for_slot()
+    response = request("GET", API_URL, source=source, headers=headers, params=params)
+    if response.status_code == 429 and not os.environ.get("NVD_API_KEY"):
+        time.sleep(PUBLIC_INTERVAL_SECONDS)
+        _wait_for_slot()
+        response = request("GET", API_URL, source=source, headers=headers, params=params)
+    return response


 def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
@@ -23,7 +49,7 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
    if api_key:
        headers["apiKey"] = api_key

-    response = request("GET", API_URL, source=source, headers=headers, params=params)
+    response = request_nvd(source, headers, params)
    response.raise_for_status()
    payload = response.json()

--- a/scripts/intel/sources/runner.py
+++ b/scripts/intel/sources/runner.py
@@ -127,7 +127,7 @@ def probe_source(system: Dict[str, Any], source: Dict[str, Any]) -> Dict[str, An
        api_key = os.environ.get("NVD_API_KEY")
        if api_key:
            headers["apiKey"] = api_key
-        response = request("GET", nvd_api.API_URL, source=source, headers=headers, params=params)
+        response = nvd_api.request_nvd(source, headers, params)
        response.raise_for_status()
        payload = response.json()
        if not isinstance(payload, dict):
@@ -160,7 +160,7 @@ def probe_source(system: Dict[str, Any], source: Dict[str, Any]) -> Dict[str, An
        response = request("GET", source["url"], source=source)
        response.raise_for_status()
        html = response.text
-        return {"kind": kind, "items_seen": len(html_links.ANCHOR_RE.findall(html))}
+        return {"kind": kind, "items_seen": len(vendor_index.extract_links(html))}
    raise ValueError(f"Unsupported source kind {kind}")


--- a/scripts/intel/sources/vendor_index.py
+++ b/scripts/intel/sources/vendor_index.py
@@ -2,6 +2,7 @@ from __future__ import annotations

 import re
 from html import unescape
+from html.parser import HTMLParser
 from typing import Any, Dict, List
 from urllib.parse import urljoin

@@ -9,7 +10,42 @@ from intel.http_client import request
 from intel.models import Candidate
 from intel.utils import unique

-from .html_links import ANCHOR_RE, TAG_RE, canonicalize_url
+from .html_links import canonicalize_url
+
+
+class _AnchorCollector(HTMLParser):
+    def __init__(self) -> None:
+        super().__init__()
+        self.links: List[tuple[str, str]] = []
+        self._href: str | None = None
+        self._chunks: List[str] = []
+
+    def handle_starttag(self, tag: str, attrs) -> None:
+        if tag.lower() != "a":
+            return
+        href = dict(attrs).get("href")
+        if href:
+            self._href = href
+            self._chunks = []
+
+    def handle_data(self, data: str) -> None:
+        if self._href is not None:
+            self._chunks.append(data)
+
+    def handle_endtag(self, tag: str) -> None:
+        if tag.lower() != "a" or self._href is None:
+            return
+        text = unescape(" ".join(self._chunks)).strip()
+        self.links.append((self._href, text))
+        self._href = None
+        self._chunks = []
+
+
+def extract_links(html: str) -> List[tuple[str, str]]:
+    parser = _AnchorCollector()
+    parser.feed(html)
+    parser.close()
+    return parser.links


 def _matches(value: str, patterns: List[str]) -> bool:
@@ -29,9 +65,9 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:

    candidates: List[Candidate] = []
    seen = set()
-    for href, text in ANCHOR_RE.findall(html):
+    for href, text in extract_links(html):
        absolute = canonicalize_url(urljoin(source["url"], href))
-        title = unescape(TAG_RE.sub(" ", text)).strip()
+        title = unescape(text).strip()
        if not title:
            continue
        haystack = " ".join(filter(None, [absolute, title])).lower()
--- a/scripts/intel/validators.py
+++ b/scripts/intel/validators.py
@@ -34,6 +34,18 @@ REQUIRED_SYSTEM_FIELDS = {
    "render_policy",
 }

+REQUIRED_SOURCE_FIELDS = {
+    "name",
+    "kind",
+    "confidence",
+    "status",
+    "retired_reason",
+    "replacement_sources",
+    "request_policy",
+    "health_policy",
+    "parser_hints",
+}
+
 FORBIDDEN_RUNTIME_PATTERNS = [
    "assets-persist.lovart.ai",
    "cdnjs.cloudflare.com",
@@ -73,6 +85,11 @@ def validate(source_map: Dict[str, Any]) -> List[str]:
            errors.append(f"system INDEX missing: {system_root / 'INDEX.md'}")
        if not (SYSTEMS_DIR / f"{system_id}.json").exists():
            errors.append(f"system registry summary missing: {SYSTEMS_DIR / f'{system_id}.json'}")
+        for bucket_name in ("official_sources", "ecosystem_sources", "research_sources"):
+            for source in system.get(bucket_name, []):
+                missing_source_fields = REQUIRED_SOURCE_FIELDS - set(source.keys())
+                if missing_source_fields:
+                    errors.append(f"source missing required fields: {system_id}/{source.get('name', 'unknown')} -> {sorted(missing_source_fields)}")

    if not (FRAMEWORK_ROOT / "README.md").exists():
        errors.append(f"framework root README missing: {FRAMEWORK_ROOT / 'README.md'}")
@@ -89,6 +106,12 @@ def validate(source_map: Dict[str, Any]) -> List[str]:
        GENERATED_DIR / "coverage-matrix.md",
        GENERATED_DIR / "latest-ingest.md",
        GENERATED_DIR / "run-summary.json",
+        GENERATED_DIR / "source-health.json",
+        GENERATED_DIR / "alerts.json",
+        GENERATED_DIR / "monitor-summary.json",
+        GENERATED_DIR / "source-catalog-audit.json",
+        GENERATED_DIR / "source-catalog-audit.md",
+        GENERATED_DIR / "retired-sources.json",
        GENERATED_DIR / "dashboard" / "index.html",
        GENERATED_DIR / "dashboard" / "overview" / "index.html",
        GENERATED_DIR / "dashboard" / "runs" / "index.html",
@@ -115,17 +138,27 @@ def validate(source_map: Dict[str, Any]) -> List[str]:
        GENERATED_DIR / "dashboard" / "docs" / "root-readme.html",
        GENERATED_DIR / "dashboard" / "docs" / "authorization-model.html",
        GENERATED_DIR / "dashboard" / "docs" / "source-map.html",
+        GENERATED_DIR / "dashboard" / "docs" / "source-catalog-audit.html",
+        GENERATED_DIR / "dashboard" / "docs" / "retired-sources.html",
        GENERATED_DIR / "dashboard" / "docs" / "repro-map.html",
        GENERATED_DIR / "dashboard" / "docs" / "coverage-matrix.html",
        GENERATED_DIR / "dashboard" / "docs" / "design-source.html",
        GENERATED_DIR / "dashboard" / "docs" / "architecture-library.html",
        GENERATED_DIR / "dashboard" / "data" / "completeness.json",
+        GENERATED_DIR / "dashboard" / "data" / "source-health.json",
+        GENERATED_DIR / "dashboard" / "data" / "alerts.json",
+        GENERATED_DIR / "dashboard" / "data" / "monitor-summary.json",
+        GENERATED_DIR / "dashboard" / "data" / "source-catalog-audit.json",
        ROOT / "docs" / "testing-completeness-report.md",
        ROOT / "08-threat-intel" / "registry" / "source-confidence.md",
    ]:
        if not path.exists():
            errors.append(f"generated artifact missing: {path}")

+    monitoring_files = sorted((REGISTRY_ROOT / "monitoring").glob("*.json"))
+    if not monitoring_files:
+        errors.append(f"monitoring history missing: {REGISTRY_ROOT / 'monitoring'}")
+
    runtime_files = [
        GENERATED_DIR / "dashboard" / "index.html",
        GENERATED_DIR / "dashboard" / "overview" / "index.html",