更新: 109 个文件 - 2026-03-18 10:55:52

这个提交包含在:
hao
2026-03-18 10:55:52 -07:00
父节点 1d5cb533e3
当前提交 1f9d9b1d16
修改 109 个文件，包含 10958 行新增和 1350 行删除

查看文件

@@ -33,7 +33,7 @@ SOURCE_BUCKETS = ("official_sources", "ecosystem_sources", "research_sources")
MACHINE_READABLE_SOURCE_KINDS = {"ghsa-global", "osv-batch", "nvd-search", "kev-json", "json-feed", "rss-feed", "atom-feed"}
DEFAULT_REQUEST_POLICY = {
"user_agent": "websafe-intel",
"user_agent": "python-requests/2.31.0",
"accept": "",
"timeout_seconds": 30,
"verify_tls": True,

查看文件

@@ -4,14 +4,12 @@ import time
from typing import Any, Dict
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from intel.config import DEFAULT_HEALTH_POLICY, DEFAULT_REQUEST_POLICY
DEFAULT_TIMEOUT = 30
DEFAULT_USER_AGENT = "websafe-intel"
DEFAULT_USER_AGENT = "python-requests/2.31.0"
def _request_policy(source: Dict[str, Any] | None = None) -> Dict[str, Any]:
@@ -23,21 +21,8 @@ def _health_policy(source: Dict[str, Any] | None = None) -> Dict[str, Any]:
def build_session(source: Dict[str, Any] | None = None) -> requests.Session:
health_policy = _health_policy(source)
session = requests.Session()
retry = Retry(
total=int(health_policy.get("retries") or 3),
connect=int(health_policy.get("retries") or 3),
read=int(health_policy.get("retries") or 3),
status=int(health_policy.get("retries") or 3),
backoff_factor=float(health_policy.get("backoff_seconds") or 0.5),
allowed_methods=frozenset(["GET", "POST"]),
status_forcelist=[429, 500, 502, 503, 504],
raise_on_status=False,
)
adapter = HTTPAdapter(max_retries=retry)
session.mount("https://", adapter)
session.mount("http://", adapter)
session.trust_env = True
request_policy = _request_policy(source)
headers = {"User-Agent": request_policy.get("user_agent") or DEFAULT_USER_AGENT}
if request_policy.get("accept"):
@@ -63,8 +48,6 @@ def request(
headers["User-Agent"] = request_policy.get("user_agent") or DEFAULT_USER_AGENT
if request_policy.get("accept") and "Accept" not in headers:
headers["Accept"] = request_policy["accept"]
if request_policy.get("http_version") == "1.1" and "Connection" not in headers:
headers["Connection"] = "close"
timeout_value = timeout if timeout != DEFAULT_TIMEOUT else int(request_policy.get("timeout_seconds") or DEFAULT_TIMEOUT)
allow_redirects = kwargs.pop("allow_redirects", bool(request_policy.get("follow_redirects", True)))
verify = kwargs.pop("verify", bool(request_policy.get("verify_tls", True)))

查看文件

@@ -242,9 +242,6 @@ def cmd_source_health(args) -> int:
retries_performed=retries_performed,
)
write_source_health(snapshot)
render_map, advisories, triage = _load_existing_selection(full_source_map, source_map)
existing_summary = read_json(GENERATED_DIR / "run-summary.json", default={}) or {}
render_generated(render_map, advisories, triage, snapshot.get("failures", []), existing_summary)
print(
f"Source health checked {len(probes)} active sources across {len(source_map['systems'])} systems; failures {snapshot['failure_count']}; retries {retries_performed}"
)
@@ -377,6 +374,14 @@ def cmd_monitor(args) -> int:
)
write_alerts(alerts)
write_monitoring_state(
audit=audit,
source_health=source_health,
alerts=alerts,
ingest_summary={**summary, "failures": ingest_failures},
validation_errors=[],
)
_refresh_render_state(full_source_map, source_map)
validation_errors = validate(source_map)
write_monitoring_state(
audit=audit,
@@ -385,7 +390,6 @@ def cmd_monitor(args) -> int:
ingest_summary={**summary, "failures": ingest_failures},
validation_errors=validation_errors,
)
_refresh_render_state(full_source_map, source_map)
passed = not source_health.get("failures") and not ingest_failures and not validation_errors
_write_state("success" if passed else "degraded", record_success=passed)

查看文件

@@ -313,8 +313,8 @@ def _prune_monitoring_history(now_value: str) -> None:
return
cutoff = current_dt - timedelta(days=90)
for path in sorted(MONITORING_DIR.glob("*.json")):
stem = path.stem.replace("-", ":", 2)
snapshot_dt = parse_dt(stem)
snapshot = read_json(path, default={}) or {}
snapshot_dt = parse_dt(snapshot.get("generated_at"))
if snapshot_dt is None:
continue
if snapshot_dt < cutoff:

查看文件

@@ -1,6 +1,8 @@
from __future__ import annotations
import os
import threading
import time
from typing import Any, Dict, List
import requests
@@ -11,6 +13,30 @@ from intel.utils import unique
# NVD CVE endpoint that request_nvd() sends its GET requests to.
API_URL = "https://services.nvd.nist.gov/rest/json/cves/2.0"
# Minimum spacing, in seconds, that _wait_for_slot() enforces between requests
# when the NVD_API_KEY environment variable is not set.
PUBLIC_INTERVAL_SECONDS = 7.0
# Held by _wait_for_slot() while it reads and updates _NVD_LAST_REQUEST.
_NVD_RATE_LOCK = threading.Lock()
# time.monotonic() reading stored by _wait_for_slot() after its latest wait;
# stays 0.0 until the first call without an API key.
_NVD_LAST_REQUEST = 0.0
def _wait_for_slot() -> None:
    """Block until the next un-keyed NVD request is allowed to go out.

    Returns immediately when ``NVD_API_KEY`` is set in the environment.
    Otherwise takes ``_NVD_RATE_LOCK``, sleeps for whatever is left of
    ``PUBLIC_INTERVAL_SECONDS`` since the previously recorded request, and
    records the current monotonic time as the new last-request mark.
    """
    global _NVD_LAST_REQUEST
    if os.environ.get("NVD_API_KEY"):
        return
    with _NVD_RATE_LOCK:
        # The sleep happens while the lock is held, so concurrent callers
        # queue up behind each other instead of waking at the same moment.
        remaining = PUBLIC_INTERVAL_SECONDS - (time.monotonic() - _NVD_LAST_REQUEST)
        if remaining > 0:
            time.sleep(remaining)
        _NVD_LAST_REQUEST = time.monotonic()
def request_nvd(source: Dict[str, Any], headers: Dict[str, Any], params: Dict[str, Any]) -> requests.Response:
    """Send a rate-limited GET to ``API_URL`` and return the response.

    Each attempt first goes through ``_wait_for_slot()``. If the first
    response has status 429 and no ``NVD_API_KEY`` is set, the call sleeps
    for ``PUBLIC_INTERVAL_SECONDS`` and makes exactly one more attempt.
    The final response is returned as-is; no status check is done here.
    """

    def _attempt() -> requests.Response:
        # One paced request: wait for a slot, then issue the GET.
        _wait_for_slot()
        return request("GET", API_URL, source=source, headers=headers, params=params)

    response = _attempt()
    throttled = response.status_code == 429
    if throttled and not os.environ.get("NVD_API_KEY"):
        # Back off for one full public interval before the single retry.
        time.sleep(PUBLIC_INTERVAL_SECONDS)
        response = _attempt()
    return response
def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
@@ -23,7 +49,7 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
if api_key:
headers["apiKey"] = api_key
response = request("GET", API_URL, source=source, headers=headers, params=params)
response = request_nvd(source, headers, params)
response.raise_for_status()
payload = response.json()

查看文件

@@ -127,7 +127,7 @@ def probe_source(system: Dict[str, Any], source: Dict[str, Any]) -> Dict[str, An
api_key = os.environ.get("NVD_API_KEY")
if api_key:
headers["apiKey"] = api_key
response = request("GET", nvd_api.API_URL, source=source, headers=headers, params=params)
response = nvd_api.request_nvd(source, headers, params)
response.raise_for_status()
payload = response.json()
if not isinstance(payload, dict):
@@ -160,7 +160,7 @@ def probe_source(system: Dict[str, Any], source: Dict[str, Any]) -> Dict[str, An
response = request("GET", source["url"], source=source)
response.raise_for_status()
html = response.text
return {"kind": kind, "items_seen": len(html_links.ANCHOR_RE.findall(html))}
return {"kind": kind, "items_seen": len(vendor_index.extract_links(html))}
raise ValueError(f"Unsupported source kind {kind}")

查看文件

@@ -2,6 +2,7 @@ from __future__ import annotations
import re
from html import unescape
from html.parser import HTMLParser
from typing import Any, Dict, List
from urllib.parse import urljoin
@@ -9,7 +10,42 @@ from intel.http_client import request
from intel.models import Candidate
from intel.utils import unique
from .html_links import ANCHOR_RE, TAG_RE, canonicalize_url
from .html_links import canonicalize_url
class _AnchorCollector(HTMLParser):
def __init__(self) -> None:
super().__init__()
self.links: List[tuple[str, str]] = []
self._href: str | None = None
self._chunks: List[str] = []
def handle_starttag(self, tag: str, attrs) -> None:
if tag.lower() != "a":
return
href = dict(attrs).get("href")
if href:
self._href = href
self._chunks = []
def handle_data(self, data: str) -> None:
if self._href is not None:
self._chunks.append(data)
def handle_endtag(self, tag: str) -> None:
if tag.lower() != "a" or self._href is None:
return
text = unescape(" ".join(self._chunks)).strip()
self.links.append((self._href, text))
self._href = None
self._chunks = []
def extract_links(html: str) -> List[tuple[str, str]]:
    """Parse *html* and return the ``(href, text)`` pairs of its anchors.

    Runs a fresh ``_AnchorCollector`` over the whole string, closes it so
    buffered input is processed, and returns the collector's ``links`` list.
    """
    collector = _AnchorCollector()
    collector.feed(html)
    collector.close()
    collected = collector.links
    return collected
def _matches(value: str, patterns: List[str]) -> bool:
@@ -29,9 +65,9 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
candidates: List[Candidate] = []
seen = set()
for href, text in ANCHOR_RE.findall(html):
for href, text in extract_links(html):
absolute = canonicalize_url(urljoin(source["url"], href))
title = unescape(TAG_RE.sub(" ", text)).strip()
title = unescape(text).strip()
if not title:
continue
haystack = " ".join(filter(None, [absolute, title])).lower()

查看文件

@@ -34,6 +34,18 @@ REQUIRED_SYSTEM_FIELDS = {
"render_policy",
}
# Keys every source entry in a system's source buckets must contain;
# validate() reports an error naming whichever of these are absent.
REQUIRED_SOURCE_FIELDS = {
"name",
"kind",
"confidence",
"status",
"retired_reason",
"replacement_sources",
"request_policy",
"health_policy",
"parser_hints",
}
FORBIDDEN_RUNTIME_PATTERNS = [
"assets-persist.lovart.ai",
"cdnjs.cloudflare.com",
@@ -73,6 +85,11 @@ def validate(source_map: Dict[str, Any]) -> List[str]:
errors.append(f"system INDEX missing: {system_root / 'INDEX.md'}")
if not (SYSTEMS_DIR / f"{system_id}.json").exists():
errors.append(f"system registry summary missing: {SYSTEMS_DIR / f'{system_id}.json'}")
for bucket_name in ("official_sources", "ecosystem_sources", "research_sources"):
for source in system.get(bucket_name, []):
missing_source_fields = REQUIRED_SOURCE_FIELDS - set(source.keys())
if missing_source_fields:
errors.append(f"source missing required fields: {system_id}/{source.get('name', 'unknown')} -> {sorted(missing_source_fields)}")
if not (FRAMEWORK_ROOT / "README.md").exists():
errors.append(f"framework root README missing: {FRAMEWORK_ROOT / 'README.md'}")
@@ -89,6 +106,12 @@ def validate(source_map: Dict[str, Any]) -> List[str]:
GENERATED_DIR / "coverage-matrix.md",
GENERATED_DIR / "latest-ingest.md",
GENERATED_DIR / "run-summary.json",
GENERATED_DIR / "source-health.json",
GENERATED_DIR / "alerts.json",
GENERATED_DIR / "monitor-summary.json",
GENERATED_DIR / "source-catalog-audit.json",
GENERATED_DIR / "source-catalog-audit.md",
GENERATED_DIR / "retired-sources.json",
GENERATED_DIR / "dashboard" / "index.html",
GENERATED_DIR / "dashboard" / "overview" / "index.html",
GENERATED_DIR / "dashboard" / "runs" / "index.html",
@@ -115,17 +138,27 @@ def validate(source_map: Dict[str, Any]) -> List[str]:
GENERATED_DIR / "dashboard" / "docs" / "root-readme.html",
GENERATED_DIR / "dashboard" / "docs" / "authorization-model.html",
GENERATED_DIR / "dashboard" / "docs" / "source-map.html",
GENERATED_DIR / "dashboard" / "docs" / "source-catalog-audit.html",
GENERATED_DIR / "dashboard" / "docs" / "retired-sources.html",
GENERATED_DIR / "dashboard" / "docs" / "repro-map.html",
GENERATED_DIR / "dashboard" / "docs" / "coverage-matrix.html",
GENERATED_DIR / "dashboard" / "docs" / "design-source.html",
GENERATED_DIR / "dashboard" / "docs" / "architecture-library.html",
GENERATED_DIR / "dashboard" / "data" / "completeness.json",
GENERATED_DIR / "dashboard" / "data" / "source-health.json",
GENERATED_DIR / "dashboard" / "data" / "alerts.json",
GENERATED_DIR / "dashboard" / "data" / "monitor-summary.json",
GENERATED_DIR / "dashboard" / "data" / "source-catalog-audit.json",
ROOT / "docs" / "testing-completeness-report.md",
ROOT / "08-threat-intel" / "registry" / "source-confidence.md",
]:
if not path.exists():
errors.append(f"generated artifact missing: {path}")
monitoring_files = sorted((REGISTRY_ROOT / "monitoring").glob("*.json"))
if not monitoring_files:
errors.append(f"monitoring history missing: {REGISTRY_ROOT / 'monitoring'}")
runtime_files = [
GENERATED_DIR / "dashboard" / "index.html",
GENERATED_DIR / "dashboard" / "overview" / "index.html",