kb: expand authorized lab coverage and intel automation

这个提交包含在:
hao
2026-03-16 22:04:51 -07:00
父节点 cda31e86c7
当前提交 d0120fbf10
修改 592 个文件,包含 29025 行新增和 267 行删除

查看文件

@@ -0,0 +1 @@
"""Source adapters for advisory ingestion."""

查看文件

@@ -0,0 +1,57 @@
from __future__ import annotations
from typing import Any, Dict, List
import requests
from intel.models import Candidate
from intel.utils import unique
def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
    """Build candidates for one system from a KEV-style JSON catalog.

    Downloads ``source["url"]`` and keeps every entry of the payload's
    ``vulnerabilities`` list whose vendor, product, vulnerability name or short
    description contains one of the configured keywords (case-insensitive
    substring match).

    Keywords are taken from ``source["keywords"]``, falling back to
    ``system["kev_keywords"]`` and finally to ``system["display_name"]``.

    Args:
        system: System descriptor; ``system_id``, ``display_name`` and
            ``category`` are copied onto every candidate.
        source: Source descriptor; ``url``, ``kind``, ``name`` and
            ``confidence`` are required, ``keywords`` and ``advisory_mode``
            are optional.

    Returns:
        One ``Candidate`` per matching catalog entry, in catalog order.

    Raises:
        requests.HTTPError: If the catalog request returns an error status.
    """
    response = requests.get(source["url"], headers={"User-Agent": "websafe-intel"}, timeout=30)
    response.raise_for_status()
    payload = response.json()

    configured = source.get("keywords") or system.get("kev_keywords", []) or [system["display_name"]]
    # A blank keyword is a substring of every haystack and would attribute the
    # entire catalog to this system, so blank entries are ignored.
    keywords = {kw.lower() for kw in configured if kw and kw.strip()}

    candidates: List[Candidate] = []
    if not keywords:
        return candidates

    for vuln in payload.get("vulnerabilities", []):
        searchable = [
            vuln.get("vendorProject"),
            vuln.get("product"),
            vuln.get("vulnerabilityName"),
            vuln.get("shortDescription"),
        ]
        haystack = " ".join(filter(None, searchable)).lower()
        if not any(keyword in haystack for keyword in keywords):
            continue

        cve = vuln.get("cveID")
        candidates.append(
            Candidate(
                system_id=system["system_id"],
                display_name=system["display_name"],
                category=system["category"],
                advisory_mode=source.get("advisory_mode", "core"),
                source_kind=source["kind"],
                source_name=source["name"],
                source_confidence=source["confidence"],
                source_url=source["url"],
                title=vuln.get("vulnerabilityName") or cve or f"KEV advisory for {system['display_name']}",
                published_at=vuln.get("dateAdded"),
                # NOTE(review): ``dueDate`` looks like a remediation deadline
                # rather than a modification time; confirm it is the intended
                # value for ``updated_at``.
                updated_at=vuln.get("dueDate"),
                summary=vuln.get("shortDescription") or "",
                # Every entry from this source is reported as critical and
                # known-exploited, regardless of the entry's own fields.
                severity="critical",
                exploit_status="known_exploited",
                aliases=unique([cve]),
                cve_ids=[cve] if cve else [],
                references=[source["url"]],
                raw=vuln,
            )
        )
    return candidates

查看文件

@@ -0,0 +1,120 @@
from __future__ import annotations
import os
from typing import Any, Dict, List
import requests
from intel.models import Candidate
from intel.utils import unique
# Endpoint queried for GitHub advisories.
API_URL = "https://api.github.com/advisories"
def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
    """Collect GitHub advisories that concern one system.

    An advisory is kept when one of its ``vulnerabilities`` names a package
    listed in ``system["package_names"]`` (same name and ecosystem, compared
    case-insensitively), or, failing that, when its summary, description,
    GHSA id or CVE id contains one of ``system["ghsa_keywords"]``.

    Result pages are walked by following the ``Link: rel="next"`` header of
    each response, because this endpoint is cursor-paginated and does not
    honour a numeric ``page`` parameter.

    Args:
        system: System descriptor; ``system_id``, ``display_name`` and
            ``category`` are required, ``package_names`` and ``ghsa_keywords``
            are optional.
        source: Source descriptor; ``kind``, ``name`` and ``confidence`` are
            required, ``ecosystem`` and ``advisory_mode`` are optional.

    Returns:
        One ``Candidate`` per matching advisory, in API order.

    Raises:
        requests.HTTPError: On any error status. A 403 whose body mentions
            "rate limit" is re-raised with a hint to set ``GITHUB_TOKEN``.
    """
    headers = {"Accept": "application/vnd.github+json", "User-Agent": "websafe-intel"}
    token = os.environ.get("GITHUB_TOKEN")
    if token:
        headers["Authorization"] = f"Bearer {token}"

    packages = {
        item["name"].lower(): item["ecosystem"].lower()
        for item in system.get("package_names", [])
        if item.get("name") and item.get("ecosystem")
    }
    # Blank keywords would match every advisory, so they are ignored.
    keyword_set = {
        value.lower() for value in system.get("ghsa_keywords", []) if value and value.strip()
    }

    candidates: List[Candidate] = []
    if not packages and not keyword_set:
        # Nothing could ever match; skip walking the advisory database.
        return candidates

    url = API_URL
    params = {"per_page": 100, "ecosystem": source.get("ecosystem")}
    while url:
        response = requests.get(url, headers=headers, params=params, timeout=30)
        if response.status_code == 403 and "rate limit" in response.text.lower():
            raise requests.HTTPError(
                "GitHub advisory rate limit exceeded; set GITHUB_TOKEN for higher quota",
                response=response,
            )
        response.raise_for_status()

        for advisory in response.json():
            matched_vulns = []
            for vuln in advisory.get("vulnerabilities") or []:
                package = vuln.get("package") or {}
                name = (package.get("name") or "").lower()
                ecosystem = (package.get("ecosystem") or "").lower()
                if name in packages and packages[name] == ecosystem:
                    matched_vulns.append(vuln)

            if not matched_vulns:
                # No package hit: fall back to keyword search over the text fields.
                haystack = " ".join(
                    filter(
                        None,
                        [
                            advisory.get("summary"),
                            advisory.get("description"),
                            advisory.get("ghsa_id"),
                            advisory.get("cve_id"),
                        ],
                    )
                ).lower()
                if not any(keyword in haystack for keyword in keyword_set):
                    continue

            affected_versions = []
            fixed_versions = []
            package_name = None
            for vuln in matched_vulns:
                if vuln.get("vulnerable_version_range"):
                    affected_versions.append(vuln["vulnerable_version_range"])
                patched = vuln.get("first_patched_version") or {}
                if patched.get("identifier"):
                    fixed_versions.append(patched["identifier"])
                if not package_name and vuln.get("package"):
                    package_name = vuln["package"].get("name")

            # ``identifiers`` holds ``{"type": ..., "value": ...}`` objects;
            # only the string values are usable as aliases.
            identifier_values = [
                entry.get("value")
                for entry in advisory.get("identifiers") or []
                if isinstance(entry, dict) and entry.get("value")
            ]
            aliases = unique(
                [
                    advisory.get("ghsa_id"),
                    advisory.get("cve_id"),
                    *identifier_values,
                ]
            )
            cve_ids = [advisory["cve_id"]] if advisory.get("cve_id") else []
            ghsa_ids = [advisory["ghsa_id"]] if advisory.get("ghsa_id") else []
            candidates.append(
                Candidate(
                    system_id=system["system_id"],
                    display_name=system["display_name"],
                    category=system["category"],
                    advisory_mode=source.get("advisory_mode", "core"),
                    source_kind=source["kind"],
                    source_name=source["name"],
                    source_confidence=source["confidence"],
                    source_url=advisory.get("html_url") or API_URL,
                    title=advisory.get("summary") or advisory.get("ghsa_id") or "GitHub advisory",
                    published_at=advisory.get("published_at"),
                    updated_at=advisory.get("updated_at"),
                    summary=advisory.get("description") or "",
                    severity=(advisory.get("severity") or "unknown").lower(),
                    aliases=aliases,
                    cve_ids=cve_ids,
                    ghsa_ids=ghsa_ids,
                    affected_versions=unique(affected_versions),
                    fixed_versions=unique(fixed_versions),
                    package_name=package_name,
                    references=[advisory.get("html_url")] if advisory.get("html_url") else [],
                    raw=advisory,
                )
            )

        # The "next" link already carries the full query string (including the
        # cursor), so the initial params must not be sent again.
        url = (response.links.get("next") or {}).get("url")
        params = None
    return candidates

查看文件

@@ -0,0 +1,56 @@
from __future__ import annotations
import re
from html import unescape
from typing import Any, Dict, List
from urllib.parse import urljoin
import requests
from intel.models import Candidate
from intel.utils import unique
# Captures (href value, inner markup) for each <a ...>...</a> element.
# IGNORECASE tolerates upper-case tags; DOTALL lets the inner markup span lines.
ANCHOR_RE = re.compile(r"<a[^>]+href=[\"']([^\"']+)[\"'][^>]*>(.*?)</a>", re.IGNORECASE | re.DOTALL)
# Matches a single markup tag (anything between "<" and the next ">").
TAG_RE = re.compile(r"<[^>]+>")
def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
    """Turn the hyperlinks of an HTML page into candidates for one system.

    Downloads ``source["url"]``, extracts every anchor with ``ANCHOR_RE`` and
    emits one candidate per distinct absolute http(s) URL. When
    ``source["keywords"]`` is non-empty, a link is kept only if its title or
    URL contains one of the keywords (case-insensitive substring match).

    Args:
        system: System descriptor; ``system_id``, ``display_name`` and
            ``category`` are copied onto every candidate.
        source: Source descriptor; ``url``, ``kind``, ``name`` and
            ``confidence`` are required, ``keywords``, ``advisory_mode`` and
            ``max_items`` (default 50) are optional.

    Returns:
        Candidates in page order; collection stops once ``max_items`` have
        been gathered.

    Raises:
        requests.HTTPError: If the page request returns an error status.
    """
    response = requests.get(source["url"], headers={"User-Agent": "websafe-intel"}, timeout=30)
    response.raise_for_status()
    html = response.text

    keywords = {kw.lower() for kw in source.get("keywords", [])}
    max_items = source.get("max_items", 50)
    candidates: List[Candidate] = []
    seen = set()
    for href, text in ANCHOR_RE.findall(html):
        # Strip nested tags, decode entities, and collapse the whitespace runs
        # (including line breaks) that this leaves behind.
        title = " ".join(unescape(TAG_RE.sub(" ", text)).split())
        if not title:
            continue
        # Attribute values are HTML-escaped in markup ("&amp;" for "&"), so
        # decode them before resolving against the page URL.
        absolute = urljoin(source["url"], unescape(href).strip())
        # Links such as "javascript:" or "mailto:" can never be advisories.
        if not absolute.lower().startswith(("http://", "https://")):
            continue
        haystack = f"{title} {absolute}".lower()
        if keywords and not any(keyword in haystack for keyword in keywords):
            continue
        if absolute in seen:
            continue
        seen.add(absolute)
        candidates.append(
            Candidate(
                system_id=system["system_id"],
                display_name=system["display_name"],
                category=system["category"],
                advisory_mode=source.get("advisory_mode", "core"),
                source_kind=source["kind"],
                source_name=source["name"],
                source_confidence=source["confidence"],
                source_url=absolute,
                title=title,
                summary="",
                severity="unknown",
                references=unique([absolute]),
                raw={"href": absolute, "title": title},
            )
        )
        if len(candidates) >= max_items:
            break
    return candidates

查看文件

@@ -0,0 +1,68 @@
from __future__ import annotations
import os
from typing import Any, Dict, List
import requests
from intel.models import Candidate
from intel.utils import unique
# Endpoint queried for NVD CVE keyword searches.
API_URL = "https://services.nvd.nist.gov/rest/json/cves/2.0"
def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
    """Run one NVD keyword search and convert each hit into a candidate.

    The search term is ``source["keyword"]``, falling back to
    ``system["display_name"]``. Only the first page of results
    (``source["results_per_page"]``, default 50) is requested. If the
    ``NVD_API_KEY`` environment variable is set it is sent as the ``apiKey``
    header.

    Args:
        system: System descriptor; ``system_id``, ``display_name`` and
            ``category`` are copied onto every candidate.
        source: Source descriptor; ``kind``, ``name`` and ``confidence`` are
            required, ``keyword``, ``results_per_page`` and ``advisory_mode``
            are optional.

    Returns:
        One ``Candidate`` per entry of the response's ``vulnerabilities`` list.

    Raises:
        requests.HTTPError: If the search request returns an error status.
    """
    params = {
        "keywordSearch": source.get("keyword") or system["display_name"],
        "resultsPerPage": source.get("results_per_page", 50),
    }
    headers = {"User-Agent": "websafe-intel"}
    api_key = os.environ.get("NVD_API_KEY")
    if api_key:
        headers["apiKey"] = api_key
    response = requests.get(API_URL, headers=headers, params=params, timeout=30)
    response.raise_for_status()
    payload = response.json()

    candidates: List[Candidate] = []
    for item in payload.get("vulnerabilities", []):
        cve = item.get("cve", {})
        cve_id = cve.get("id")
        descriptions = cve.get("descriptions", [])
        description = next((d.get("value") for d in descriptions if d.get("lang") == "en"), "")

        metrics = cve.get("metrics", {})
        severity = "unknown"
        cvss_score = None
        # CVSS 3.x stays first so existing results are unchanged; 4.0 is
        # consulted before the v2 fallback so records without a 3.x score are
        # not reported as "unknown" or rated from v2 alone.
        for key in ("cvssMetricV31", "cvssMetricV30", "cvssMetricV40", "cvssMetricV2"):
            entries = metrics.get(key, [])
            if entries:
                data = entries[0].get("cvssData", {})
                # v2 metrics carry ``baseSeverity`` on the entry itself, the
                # newer ones inside ``cvssData``.
                severity = (entries[0].get("baseSeverity") or data.get("baseSeverity") or "unknown").lower()
                cvss_score = data.get("baseScore")
                break

        refs = [ref.get("url") for ref in cve.get("references", []) if ref.get("url")]
        candidates.append(
            Candidate(
                system_id=system["system_id"],
                display_name=system["display_name"],
                category=system["category"],
                advisory_mode=source.get("advisory_mode", "core"),
                source_kind=source["kind"],
                source_name=source["name"],
                source_confidence=source["confidence"],
                source_url=refs[0] if refs else API_URL,
                title=cve_id or f"NVD advisory for {system['display_name']}",
                published_at=cve.get("published"),
                updated_at=cve.get("lastModified"),
                summary=description or "",
                severity=severity,
                cvss_score=cvss_score,
                aliases=unique([cve_id]),
                cve_ids=[cve_id] if cve_id else [],
                references=refs,
                raw=item,
            )
        )
    return candidates

查看文件

@@ -0,0 +1,154 @@
from __future__ import annotations
import re
from typing import Any, Dict, List
import requests
from intel.models import Candidate
from intel.utils import unique
# Endpoint that accepts several package queries in one POST.
QUERY_BATCH_URL = "https://api.osv.dev/v1/querybatch"
# URL template for the full record of a single vulnerability id.
DETAIL_URL = "https://api.osv.dev/v1/vulns/{vuln_id}"
# Matches a CVSS 3.0/3.1 base vector that is preceded by a "/".
CVSS_SCORE_RE = re.compile(r"/CVSS:3\.[01]/AV:[A-Z]/AC:[A-Z]/PR:[A-Z]/UI:[A-Z]/S:[A-Z]/C:[A-Z]/I:[A-Z]/A:[A-Z]")
# Captures the first integer or decimal number found in a string.
NUMERIC_SCORE_RE = re.compile(r"([0-9]+(?:\.[0-9]+)?)")
def _fetch_detail(session: requests.Session, vuln_id: str) -> Dict[str, Any]:
    """Download and decode the full record for ``vuln_id`` using ``session``.

    Raises whatever ``raise_for_status`` raises when the response carries an
    error status.
    """
    detail_url = DETAIL_URL.format(vuln_id=vuln_id)
    request_headers = {"User-Agent": "websafe-intel"}
    reply = session.get(detail_url, headers=request_headers, timeout=30)
    reply.raise_for_status()
    return reply.json()
def _fixed_versions(vuln: Dict[str, Any]) -> List[str]:
    """Return the de-duplicated ``fixed`` values found in the record's range events.

    Walks ``affected[*].ranges[*].events[*]`` and keeps every truthy ``fixed``
    value; de-duplication is delegated to ``unique``.
    """
    fixed_in = [
        event["fixed"]
        for entry in vuln.get("affected", [])
        for version_range in entry.get("ranges", [])
        for event in version_range.get("events", [])
        if event.get("fixed")
    ]
    return unique(fixed_in)
def _affected_versions(vuln: Dict[str, Any]) -> List[str]:
    """Describe which versions the record marks as affected.

    The result combines two kinds of strings, de-duplicated through
    ``unique``: explicit version strings (at most the first 20 per ``affected``
    entry), followed by one summary per range built from the first truthy
    ``introduced``, ``last_affected``, ``fixed`` and ``limit`` event values.
    """
    explicit: List[str] = []
    described: List[str] = []
    for entry in vuln.get("affected", []):
        explicit.extend(entry.get("versions", [])[:20])
        for version_range in entry.get("ranges", []):
            # Remember only the first truthy value seen for each event key.
            first_seen = {"introduced": None, "fixed": None, "last_affected": None, "limit": None}
            for event in version_range.get("events", []):
                for key in first_seen:
                    first_seen[key] = first_seen[key] or event.get(key)
            labelled = (
                ("introduced=", first_seen["introduced"]),
                ("last_affected=", first_seen["last_affected"]),
                ("fixed<", first_seen["fixed"]),
                ("limit<", first_seen["limit"]),
            )
            parts = [f"{prefix}{value}" for prefix, value in labelled if value]
            if parts:
                described.append(", ".join(parts))
    return unique(explicit + described)
def _severity(vuln: Dict[str, Any]) -> tuple[str, float | None]:
best_score = None
for sev in vuln.get("severity", []):
score = sev.get("score", "")
match = NUMERIC_SCORE_RE.search(score)
if match:
try:
best_score = float(match.group(1))
break
except ValueError:
continue
if best_score is None:
return "unknown", None
if best_score >= 9.0:
return "critical", best_score
if best_score >= 7.0:
return "high", best_score
if best_score >= 4.0:
return "medium", best_score
return "low", best_score
def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
    """Query OSV for every package of a system and build candidates from the hits.

    All packages are sent in one batch query; the full record of each returned
    vulnerability id is then fetched (once per id) to populate the candidate.
    Package entries lacking a ``name`` or ``ecosystem`` are skipped.

    Args:
        system: System descriptor; ``system_id``, ``display_name`` and
            ``category`` are required, ``package_names`` lists the
            ``{"name", "ecosystem"}`` entries to query.
        source: Source descriptor; ``kind``, ``name`` and ``confidence`` are
            required, ``advisory_mode`` is optional.

    Returns:
        One ``Candidate`` per (package, vulnerability) pair; empty when the
        system has no usable package entries.

    Raises:
        requests.HTTPError: If the batch query or any detail request returns
            an error status.
    """
    packages = [
        pkg
        for pkg in system.get("package_names", [])
        if pkg.get("name") and pkg.get("ecosystem")
    ]
    if not packages:
        return []
    queries = [{"package": {"name": pkg["name"], "ecosystem": pkg["ecosystem"]}} for pkg in packages]

    detail_cache: Dict[str, Dict[str, Any]] = {}
    candidates: List[Candidate] = []
    # The context manager releases the session's pooled connections even when
    # one of the requests raises.
    with requests.Session() as session:
        response = session.post(
            QUERY_BATCH_URL,
            json={"queries": queries},
            headers={"User-Agent": "websafe-intel"},
            timeout=30,
        )
        response.raise_for_status()
        payload = response.json()

        # NOTE(review): a batch result may carry a ``next_page_token`` when it
        # is truncated; follow-up pages are not requested here.
        # Results are matched to packages by position.
        for package, result in zip(packages, payload.get("results", [])):
            for summary in result.get("vulns", []):
                vuln_id = summary.get("id")
                if not vuln_id:
                    continue
                if vuln_id not in detail_cache:
                    detail_cache[vuln_id] = _fetch_detail(session, vuln_id)
                vuln = detail_cache[vuln_id]

                # ``aliases`` may be absent or null in a record.
                aliases = unique((vuln.get("aliases") or []) + [vuln.get("id")])
                refs = [ref.get("url") for ref in vuln.get("references", []) if ref.get("url")]
                severity, cvss_score = _severity(vuln)
                candidates.append(
                    Candidate(
                        system_id=system["system_id"],
                        display_name=system["display_name"],
                        category=system["category"],
                        advisory_mode=source.get("advisory_mode", "core"),
                        source_kind=source["kind"],
                        source_name=source["name"],
                        source_confidence=source["confidence"],
                        source_url=refs[0] if refs else DETAIL_URL.format(vuln_id=vuln_id),
                        title=vuln.get("summary") or vuln.get("id") or f"OSV advisory for {package['name']}",
                        published_at=vuln.get("published"),
                        updated_at=vuln.get("modified"),
                        summary=vuln.get("details") or "",
                        severity=severity,
                        cvss_score=cvss_score,
                        aliases=aliases,
                        cve_ids=[item for item in aliases if item and item.startswith("CVE-")],
                        ghsa_ids=[item for item in aliases if item and item.startswith("GHSA-")],
                        osv_ids=[vuln.get("id")] if vuln.get("id") else [],
                        affected_versions=_affected_versions(vuln),
                        fixed_versions=_fixed_versions(vuln),
                        package_name=package["name"],
                        references=refs,
                        raw=vuln,
                    )
                )
    return candidates

查看文件

@@ -0,0 +1,51 @@
from __future__ import annotations
import xml.etree.ElementTree as ET
from typing import Any, Dict, List
import requests
from intel.models import Candidate
def _text(node: ET.Element, name: str) -> str:
child = node.find(name)
return child.text.strip() if child is not None and child.text else ""
def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
    """Convert the ``<item>`` elements of an XML feed into candidates.

    Downloads ``source["url"]``, parses it as XML and inspects at most
    ``source["max_items"]`` (default 50) ``item`` elements in document order.
    When ``source["keywords"]`` is non-empty, an item is kept only if its
    title or description contains one of them (case-insensitive substring
    match); the ``max_items`` cut is applied before this filter.

    Raises ``requests.HTTPError`` on an error status and
    ``xml.etree.ElementTree.ParseError`` on malformed XML.
    """
    # NOTE(review): the feed is remote input handed to the stdlib XML parser;
    # confirm that is acceptable for the configured sources.
    reply = requests.get(source["url"], headers={"User-Agent": "websafe-intel"}, timeout=30)
    reply.raise_for_status()
    feed = ET.fromstring(reply.content)

    wanted = {kw.lower() for kw in source.get("keywords", [])}
    limit = source.get("max_items", 50)
    results: List[Candidate] = []
    for entry in feed.findall(".//item")[:limit]:
        title = _text(entry, "title")
        link = _text(entry, "link") or source["url"]
        description = _text(entry, "description")

        if wanted:
            searchable = f"{title} {description}".lower()
            if all(keyword not in searchable for keyword in wanted):
                continue

        # The feed date is used for both timestamps.
        published = _text(entry, "pubDate")
        candidate = Candidate(
            system_id=system["system_id"],
            display_name=system["display_name"],
            category=system["category"],
            advisory_mode=source.get("advisory_mode", "core"),
            source_kind=source["kind"],
            source_name=source["name"],
            source_confidence=source["confidence"],
            source_url=link,
            title=title or f"RSS entry for {system['display_name']}",
            published_at=published,
            updated_at=published,
            summary=description,
            severity="unknown",
            references=[link],
            raw={"title": title, "link": link},
        )
        results.append(candidate)
    return results

查看文件

@@ -0,0 +1,57 @@
from __future__ import annotations
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple
from intel.models import Candidate
from intel.utils import parse_dt
from . import cisa_kev, github_global, html_links, nvd_api, osv_api, rss_feed
# Maps a source's ``kind`` string to the adapter function that fetches it.
HANDLERS = {
    "ghsa-global": github_global.fetch,
    "osv-batch": osv_api.fetch,
    "kev-json": cisa_kev.fetch,
    "nvd-search": nvd_api.fetch,
    "rss-feed": rss_feed.fetch,
    "html-links": html_links.fetch,
}
def _passes_since(candidate: Candidate, since_dt: Optional[datetime], include_undated: bool) -> bool:
if since_dt is None:
return True
timestamps = [parse_dt(candidate.updated_at), parse_dt(candidate.published_at)]
valid = [item for item in timestamps if item is not None]
if not valid:
return include_undated
return max(valid) >= since_dt
def collect_candidates(
    source_map: Dict[str, Any],
    since_dt: Optional[datetime] = None,
    tier: Optional[str] = None,
    include_undated: bool = False,
) -> Tuple[List[Candidate], List[str]]:
    """Run every configured source of every selected system and gather the results.

    Systems are taken from ``source_map["systems"]``; when ``tier`` is given,
    only systems whose ``tier`` equals it are processed. For each system the
    ``official_sources``, ``ecosystem_sources`` and ``research_sources`` lists
    are visited in that order, and each source is dispatched through
    ``HANDLERS`` by its ``kind``.

    Returns:
        ``(candidates, failures)``. ``candidates`` holds the items that pass
        the ``since_dt`` filter. ``failures`` holds one message per source
        with an unsupported kind or whose handling raised; a raising source
        does not stop the remaining ones.
    """
    collected: List[Candidate] = []
    problems: List[str] = []
    bucket_order = ("official_sources", "ecosystem_sources", "research_sources")

    for system in source_map["systems"]:
        wrong_tier = bool(tier) and system.get("tier") != tier
        if wrong_tier:
            continue

        sources = [entry for bucket in bucket_order for entry in system.get(bucket, [])]
        for source in sources:
            fetcher = HANDLERS.get(source["kind"])
            if fetcher is None:
                problems.append(f"Unsupported source kind {source['kind']} for {system['system_id']}")
                continue
            try:
                for candidate in fetcher(system, source):
                    if _passes_since(candidate, since_dt, include_undated):
                        collected.append(candidate)
            except Exception as exc:
                # Best effort: record which source failed and move on.
                problems.append(f"{system['system_id']}::{source['name']}::{type(exc).__name__}")
    return collected, problems