更新: 109 个文件 - 2026-03-18 10:55:52
这个提交包含在:
@@ -33,7 +33,7 @@ SOURCE_BUCKETS = ("official_sources", "ecosystem_sources", "research_sources")
|
||||
MACHINE_READABLE_SOURCE_KINDS = {"ghsa-global", "osv-batch", "nvd-search", "kev-json", "json-feed", "rss-feed", "atom-feed"}
|
||||
|
||||
DEFAULT_REQUEST_POLICY = {
|
||||
"user_agent": "websafe-intel",
|
||||
"user_agent": "python-requests/2.31.0",
|
||||
"accept": "",
|
||||
"timeout_seconds": 30,
|
||||
"verify_tls": True,
|
||||
|
||||
@@ -4,14 +4,12 @@ import time
|
||||
from typing import Any, Dict
|
||||
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from intel.config import DEFAULT_HEALTH_POLICY, DEFAULT_REQUEST_POLICY
|
||||
|
||||
|
||||
DEFAULT_TIMEOUT = 30
|
||||
DEFAULT_USER_AGENT = "websafe-intel"
|
||||
DEFAULT_USER_AGENT = "python-requests/2.31.0"
|
||||
|
||||
|
||||
def _request_policy(source: Dict[str, Any] | None = None) -> Dict[str, Any]:
|
||||
@@ -23,21 +21,8 @@ def _health_policy(source: Dict[str, Any] | None = None) -> Dict[str, Any]:
|
||||
|
||||
|
||||
def build_session(source: Dict[str, Any] | None = None) -> requests.Session:
|
||||
health_policy = _health_policy(source)
|
||||
session = requests.Session()
|
||||
retry = Retry(
|
||||
total=int(health_policy.get("retries") or 3),
|
||||
connect=int(health_policy.get("retries") or 3),
|
||||
read=int(health_policy.get("retries") or 3),
|
||||
status=int(health_policy.get("retries") or 3),
|
||||
backoff_factor=float(health_policy.get("backoff_seconds") or 0.5),
|
||||
allowed_methods=frozenset(["GET", "POST"]),
|
||||
status_forcelist=[429, 500, 502, 503, 504],
|
||||
raise_on_status=False,
|
||||
)
|
||||
adapter = HTTPAdapter(max_retries=retry)
|
||||
session.mount("https://", adapter)
|
||||
session.mount("http://", adapter)
|
||||
session.trust_env = True
|
||||
request_policy = _request_policy(source)
|
||||
headers = {"User-Agent": request_policy.get("user_agent") or DEFAULT_USER_AGENT}
|
||||
if request_policy.get("accept"):
|
||||
@@ -63,8 +48,6 @@ def request(
|
||||
headers["User-Agent"] = request_policy.get("user_agent") or DEFAULT_USER_AGENT
|
||||
if request_policy.get("accept") and "Accept" not in headers:
|
||||
headers["Accept"] = request_policy["accept"]
|
||||
if request_policy.get("http_version") == "1.1" and "Connection" not in headers:
|
||||
headers["Connection"] = "close"
|
||||
timeout_value = timeout if timeout != DEFAULT_TIMEOUT else int(request_policy.get("timeout_seconds") or DEFAULT_TIMEOUT)
|
||||
allow_redirects = kwargs.pop("allow_redirects", bool(request_policy.get("follow_redirects", True)))
|
||||
verify = kwargs.pop("verify", bool(request_policy.get("verify_tls", True)))
|
||||
|
||||
@@ -242,9 +242,6 @@ def cmd_source_health(args) -> int:
|
||||
retries_performed=retries_performed,
|
||||
)
|
||||
write_source_health(snapshot)
|
||||
render_map, advisories, triage = _load_existing_selection(full_source_map, source_map)
|
||||
existing_summary = read_json(GENERATED_DIR / "run-summary.json", default={}) or {}
|
||||
render_generated(render_map, advisories, triage, snapshot.get("failures", []), existing_summary)
|
||||
print(
|
||||
f"Source health checked {len(probes)} active sources across {len(source_map['systems'])} systems; failures {snapshot['failure_count']}; retries {retries_performed}"
|
||||
)
|
||||
@@ -377,6 +374,14 @@ def cmd_monitor(args) -> int:
|
||||
)
|
||||
write_alerts(alerts)
|
||||
|
||||
write_monitoring_state(
|
||||
audit=audit,
|
||||
source_health=source_health,
|
||||
alerts=alerts,
|
||||
ingest_summary={**summary, "failures": ingest_failures},
|
||||
validation_errors=[],
|
||||
)
|
||||
_refresh_render_state(full_source_map, source_map)
|
||||
validation_errors = validate(source_map)
|
||||
write_monitoring_state(
|
||||
audit=audit,
|
||||
@@ -385,7 +390,6 @@ def cmd_monitor(args) -> int:
|
||||
ingest_summary={**summary, "failures": ingest_failures},
|
||||
validation_errors=validation_errors,
|
||||
)
|
||||
_refresh_render_state(full_source_map, source_map)
|
||||
|
||||
passed = not source_health.get("failures") and not ingest_failures and not validation_errors
|
||||
_write_state("success" if passed else "degraded", record_success=passed)
|
||||
|
||||
@@ -313,8 +313,8 @@ def _prune_monitoring_history(now_value: str) -> None:
|
||||
return
|
||||
cutoff = current_dt - timedelta(days=90)
|
||||
for path in sorted(MONITORING_DIR.glob("*.json")):
|
||||
stem = path.stem.replace("-", ":", 2)
|
||||
snapshot_dt = parse_dt(stem)
|
||||
snapshot = read_json(path, default={}) or {}
|
||||
snapshot_dt = parse_dt(snapshot.get("generated_at"))
|
||||
if snapshot_dt is None:
|
||||
continue
|
||||
if snapshot_dt < cutoff:
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import requests
|
||||
@@ -11,6 +13,30 @@ from intel.utils import unique
|
||||
|
||||
|
||||
API_URL = "https://services.nvd.nist.gov/rest/json/cves/2.0"
|
||||
PUBLIC_INTERVAL_SECONDS = 7.0
|
||||
_NVD_RATE_LOCK = threading.Lock()
|
||||
_NVD_LAST_REQUEST = 0.0
|
||||
|
||||
|
||||
def _wait_for_slot() -> None:
    """Pace unauthenticated NVD calls to one per ``PUBLIC_INTERVAL_SECONDS``.

    Does nothing when the ``NVD_API_KEY`` environment variable is set.
    Otherwise the caller blocks until at least ``PUBLIC_INTERVAL_SECONDS``
    have elapsed since the previously recorded slot, then records the
    current monotonic time as the new slot.
    """
    global _NVD_LAST_REQUEST
    if not os.environ.get("NVD_API_KEY"):
        # The sleep happens while the lock is held, so concurrent callers
        # line up behind each other instead of all waking at once.
        with _NVD_RATE_LOCK:
            remaining = PUBLIC_INTERVAL_SECONDS - (time.monotonic() - _NVD_LAST_REQUEST)
            if remaining > 0:
                time.sleep(remaining)
            _NVD_LAST_REQUEST = time.monotonic()
|
||||
|
||||
|
||||
def request_nvd(source: Dict[str, Any], headers: Dict[str, Any], params: Dict[str, Any]) -> requests.Response:
    """Send a rate-paced GET to ``API_URL`` and retry once on HTTP 429.

    Args:
        source: Source definition forwarded to ``request`` as ``source=``.
        headers: HTTP headers forwarded unchanged.
        params: Query parameters forwarded unchanged.

    Returns:
        The response of the first attempt, or of the single retry when the
        first attempt returned 429 and no ``NVD_API_KEY`` is set. The status
        is not checked here; callers decide whether to raise.
    """

    def _send() -> requests.Response:
        return request("GET", API_URL, source=source, headers=headers, params=params)

    _wait_for_slot()
    reply = _send()
    throttled = reply.status_code == 429
    if throttled and not os.environ.get("NVD_API_KEY"):
        # Back off for one full public interval, re-acquire a slot, and try
        # exactly once more.
        time.sleep(PUBLIC_INTERVAL_SECONDS)
        _wait_for_slot()
        reply = _send()
    return reply
|
||||
|
||||
|
||||
def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
|
||||
@@ -23,7 +49,7 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
|
||||
if api_key:
|
||||
headers["apiKey"] = api_key
|
||||
|
||||
response = request("GET", API_URL, source=source, headers=headers, params=params)
|
||||
response = request_nvd(source, headers, params)
|
||||
response.raise_for_status()
|
||||
payload = response.json()
|
||||
|
||||
|
||||
@@ -127,7 +127,7 @@ def probe_source(system: Dict[str, Any], source: Dict[str, Any]) -> Dict[str, An
|
||||
api_key = os.environ.get("NVD_API_KEY")
|
||||
if api_key:
|
||||
headers["apiKey"] = api_key
|
||||
response = request("GET", nvd_api.API_URL, source=source, headers=headers, params=params)
|
||||
response = nvd_api.request_nvd(source, headers, params)
|
||||
response.raise_for_status()
|
||||
payload = response.json()
|
||||
if not isinstance(payload, dict):
|
||||
@@ -160,7 +160,7 @@ def probe_source(system: Dict[str, Any], source: Dict[str, Any]) -> Dict[str, An
|
||||
response = request("GET", source["url"], source=source)
|
||||
response.raise_for_status()
|
||||
html = response.text
|
||||
return {"kind": kind, "items_seen": len(html_links.ANCHOR_RE.findall(html))}
|
||||
return {"kind": kind, "items_seen": len(vendor_index.extract_links(html))}
|
||||
raise ValueError(f"Unsupported source kind {kind}")
|
||||
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import re
|
||||
from html import unescape
|
||||
from html.parser import HTMLParser
|
||||
from typing import Any, Dict, List
|
||||
from urllib.parse import urljoin
|
||||
|
||||
@@ -9,7 +10,42 @@ from intel.http_client import request
|
||||
from intel.models import Candidate
|
||||
from intel.utils import unique
|
||||
|
||||
from .html_links import ANCHOR_RE, TAG_RE, canonicalize_url
|
||||
from .html_links import canonicalize_url
|
||||
|
||||
|
||||
class _AnchorCollector(HTMLParser):
    """Collect ``(href, text)`` pairs for ``<a href=...>`` elements.

    Only anchors with a non-empty ``href`` are recorded, and only once their
    closing ``</a>`` is seen. Text fragments inside the anchor (including
    text of nested tags) are joined with a single space.
    """

    def __init__(self) -> None:
        super().__init__()
        # Completed links in document order.
        self.links: List[tuple[str, str]] = []
        # href of the anchor currently being read; None outside an anchor.
        self._href: str | None = None
        # Text fragments seen since the current anchor opened.
        self._chunks: List[str] = []

    def handle_starttag(self, tag: str, attrs) -> None:
        """Start collecting text when an ``<a>`` with a non-empty href opens."""
        if tag.lower() != "a":
            return
        href = dict(attrs).get("href")
        if href:
            self._href = href
            self._chunks = []

    def handle_data(self, data: str) -> None:
        """Buffer text that appears inside the currently open anchor."""
        if self._href is not None:
            self._chunks.append(data)

    def handle_endtag(self, tag: str) -> None:
        """Emit the buffered link when the open anchor is closed."""
        if tag.lower() != "a" or self._href is None:
            return
        # HTMLParser is constructed with its default convert_charrefs=True,
        # so the chunks are already entity-decoded. Decoding them a second
        # time would corrupt visible text such as "&lt;tag&gt;" into "<tag>".
        text = " ".join(self._chunks).strip()
        self.links.append((self._href, text))
        self._href = None
        self._chunks = []


def extract_links(html: str) -> List[tuple[str, str]]:
    """Return ``(href, text)`` for each closed ``<a href=...>`` in *html*.

    Args:
        html: Markup to scan; an empty string yields an empty list.

    Returns:
        Links in document order. ``href`` is the attribute value as given by
        the parser; ``text`` is the anchor's entity-decoded text content with
        surrounding whitespace stripped (it may be empty).
    """
    parser = _AnchorCollector()
    parser.feed(html)
    parser.close()
    return parser.links
|
||||
|
||||
|
||||
def _matches(value: str, patterns: List[str]) -> bool:
|
||||
@@ -29,9 +65,9 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
|
||||
|
||||
candidates: List[Candidate] = []
|
||||
seen = set()
|
||||
for href, text in ANCHOR_RE.findall(html):
|
||||
for href, text in extract_links(html):
|
||||
absolute = canonicalize_url(urljoin(source["url"], href))
|
||||
title = unescape(TAG_RE.sub(" ", text)).strip()
|
||||
title = unescape(text).strip()
|
||||
if not title:
|
||||
continue
|
||||
haystack = " ".join(filter(None, [absolute, title])).lower()
|
||||
|
||||
@@ -34,6 +34,18 @@ REQUIRED_SYSTEM_FIELDS = {
|
||||
"render_policy",
|
||||
}
|
||||
|
||||
REQUIRED_SOURCE_FIELDS = {
|
||||
"name",
|
||||
"kind",
|
||||
"confidence",
|
||||
"status",
|
||||
"retired_reason",
|
||||
"replacement_sources",
|
||||
"request_policy",
|
||||
"health_policy",
|
||||
"parser_hints",
|
||||
}
|
||||
|
||||
FORBIDDEN_RUNTIME_PATTERNS = [
|
||||
"assets-persist.lovart.ai",
|
||||
"cdnjs.cloudflare.com",
|
||||
@@ -73,6 +85,11 @@ def validate(source_map: Dict[str, Any]) -> List[str]:
|
||||
errors.append(f"system INDEX missing: {system_root / 'INDEX.md'}")
|
||||
if not (SYSTEMS_DIR / f"{system_id}.json").exists():
|
||||
errors.append(f"system registry summary missing: {SYSTEMS_DIR / f'{system_id}.json'}")
|
||||
for bucket_name in ("official_sources", "ecosystem_sources", "research_sources"):
|
||||
for source in system.get(bucket_name, []):
|
||||
missing_source_fields = REQUIRED_SOURCE_FIELDS - set(source.keys())
|
||||
if missing_source_fields:
|
||||
errors.append(f"source missing required fields: {system_id}/{source.get('name', 'unknown')} -> {sorted(missing_source_fields)}")
|
||||
|
||||
if not (FRAMEWORK_ROOT / "README.md").exists():
|
||||
errors.append(f"framework root README missing: {FRAMEWORK_ROOT / 'README.md'}")
|
||||
@@ -89,6 +106,12 @@ def validate(source_map: Dict[str, Any]) -> List[str]:
|
||||
GENERATED_DIR / "coverage-matrix.md",
|
||||
GENERATED_DIR / "latest-ingest.md",
|
||||
GENERATED_DIR / "run-summary.json",
|
||||
GENERATED_DIR / "source-health.json",
|
||||
GENERATED_DIR / "alerts.json",
|
||||
GENERATED_DIR / "monitor-summary.json",
|
||||
GENERATED_DIR / "source-catalog-audit.json",
|
||||
GENERATED_DIR / "source-catalog-audit.md",
|
||||
GENERATED_DIR / "retired-sources.json",
|
||||
GENERATED_DIR / "dashboard" / "index.html",
|
||||
GENERATED_DIR / "dashboard" / "overview" / "index.html",
|
||||
GENERATED_DIR / "dashboard" / "runs" / "index.html",
|
||||
@@ -115,17 +138,27 @@ def validate(source_map: Dict[str, Any]) -> List[str]:
|
||||
GENERATED_DIR / "dashboard" / "docs" / "root-readme.html",
|
||||
GENERATED_DIR / "dashboard" / "docs" / "authorization-model.html",
|
||||
GENERATED_DIR / "dashboard" / "docs" / "source-map.html",
|
||||
GENERATED_DIR / "dashboard" / "docs" / "source-catalog-audit.html",
|
||||
GENERATED_DIR / "dashboard" / "docs" / "retired-sources.html",
|
||||
GENERATED_DIR / "dashboard" / "docs" / "repro-map.html",
|
||||
GENERATED_DIR / "dashboard" / "docs" / "coverage-matrix.html",
|
||||
GENERATED_DIR / "dashboard" / "docs" / "design-source.html",
|
||||
GENERATED_DIR / "dashboard" / "docs" / "architecture-library.html",
|
||||
GENERATED_DIR / "dashboard" / "data" / "completeness.json",
|
||||
GENERATED_DIR / "dashboard" / "data" / "source-health.json",
|
||||
GENERATED_DIR / "dashboard" / "data" / "alerts.json",
|
||||
GENERATED_DIR / "dashboard" / "data" / "monitor-summary.json",
|
||||
GENERATED_DIR / "dashboard" / "data" / "source-catalog-audit.json",
|
||||
ROOT / "docs" / "testing-completeness-report.md",
|
||||
ROOT / "08-threat-intel" / "registry" / "source-confidence.md",
|
||||
]:
|
||||
if not path.exists():
|
||||
errors.append(f"generated artifact missing: {path}")
|
||||
|
||||
monitoring_files = sorted((REGISTRY_ROOT / "monitoring").glob("*.json"))
|
||||
if not monitoring_files:
|
||||
errors.append(f"monitoring history missing: {REGISTRY_ROOT / 'monitoring'}")
|
||||
|
||||
runtime_files = [
|
||||
GENERATED_DIR / "dashboard" / "index.html",
|
||||
GENERATED_DIR / "dashboard" / "overview" / "index.html",
|
||||
|
||||
在新工单中引用
屏蔽一个用户