更新: 13 个文件 - 2026-03-18 09:44:57

2026-03-18 09:44:57 -07:00
--- a/scripts/intel/sources/atom_feed.py
+++ b/scripts/intel/sources/atom_feed.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+import xml.etree.ElementTree as ET
+from typing import Any, Dict, List
+
+from intel.http_client import request
+from intel.models import Candidate
+
+
+ATOM_NS = {"atom": "http://www.w3.org/2005/Atom"}  # prefix map handed to find()/findall() so "atom:..." paths resolve
+
+
+def _node_text(node: ET.Element, path: str) -> str:
+    found = node.find(path, ATOM_NS)  # first match for path; a missing node or empty text yields ""
+    return "" if found is None or not found.text else found.text.strip()
+
+
+def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
+    """Fetch the Atom feed at ``source["url"]`` and return its keyword-filtered entries as candidates."""
+    response = request("GET", source["url"], source=source)
+    response.raise_for_status()
+    root = ET.fromstring(response.content)
+
+    parser_hints = source.get("parser_hints") or {}
+    keywords = {kw.lower() for kw in (parser_hints.get("keywords") or source.get("keywords", []))}
+    candidates: List[Candidate] = []
+    entries = root.findall(".//atom:entry", ATOM_NS) or root.findall(".//entry")
+    for entry in entries[: source.get("max_items", 50)]:
+        title = _node_text(entry, "atom:title") or _node_text(entry, "title")
+        # A childless Element (e.g. <link href="..."/>) is falsy, so `a or b` would discard a real match.
+        link_node = entry.find("atom:link", ATOM_NS)
+        if link_node is None:
+            link_node = entry.find("link")
+        link = (link_node.get("href") or "").strip() if link_node is not None else ""
+        summary = _node_text(entry, "atom:summary") or _node_text(entry, "summary") or _node_text(entry, "atom:content")
+        haystack = " ".join(filter(None, [title, summary, link])).lower()
+        if keywords and not any(keyword in haystack for keyword in keywords):
+            continue
+        candidate = Candidate(
+            system_id=system["system_id"],
+            display_name=system["display_name"],
+            category=system["category"],
+            advisory_mode=source.get("advisory_mode", "core"),
+            source_kind=source["kind"],
+            source_name=source["name"],
+            source_confidence=source["confidence"],
+            source_url=link or source["url"],
+            title=title or f"Atom entry for {system['display_name']}",
+            published_at=_node_text(entry, "atom:published") or _node_text(entry, "published"),
+            updated_at=_node_text(entry, "atom:updated") or _node_text(entry, "updated"),
+            summary=summary,
+            severity="unknown",
+            references=[link] if link else [source["url"]],
+            raw={"title": title, "link": link},
+        )
+        candidates.append(candidate)
+    return candidates
--- a/scripts/intel/sources/cisa_kev.py
+++ b/scripts/intel/sources/cisa_kev.py
@@ -10,7 +10,7 @@ from intel.utils import unique


 def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
-    response = request("GET", source["url"])
+    response = request("GET", source["url"], source=source)
    response.raise_for_status()
    payload = response.json()

--- a/scripts/intel/sources/github_global.py
+++ b/scripts/intel/sources/github_global.py
@@ -31,6 +31,7 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
        response = request(
            "GET",
            API_URL,
+            source=source,
            headers=headers,
            params={"per_page": 100, "page": page, "ecosystem": source.get("ecosystem")},
        )
--- a/scripts/intel/sources/html_links.py
+++ b/scripts/intel/sources/html_links.py
@@ -3,7 +3,7 @@ from __future__ import annotations
 import re
 from html import unescape
 from typing import Any, Dict, List
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlsplit, urlunsplit

 import requests

@@ -16,11 +16,25 @@ ANCHOR_RE = re.compile(r"<a[^>]+href=[\"']([^\"']+)[\"'][^>]*>(.*?)</a>", re.IGN
 TAG_RE = re.compile(r"<[^>]+>")


+def canonicalize_url(url: str) -> str:
+    scheme, netloc, path, query, _fragment = urlsplit(url)  # the fragment is dropped on purpose
+    return urlunsplit((scheme, netloc, path, query, ""))
+
+
+def _matches_patterns(value: str, patterns: List[str]) -> bool:
+    if patterns:  # otherwise: an empty pattern list matches every value
+        return any(re.search(regex, value, re.IGNORECASE) for regex in patterns)
+    return True
+
+
 def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
-    response = request("GET", source["url"])
+    response = request("GET", source["url"], source=source)
    response.raise_for_status()
    html = response.text
-    keywords = {kw.lower() for kw in source.get("keywords", [])}
+    parser_hints = source.get("parser_hints") or {}
+    keywords = {kw.lower() for kw in (parser_hints.get("keywords") or source.get("keywords", []))}
+    include_patterns = parser_hints.get("include_url_patterns") or []
+    exclude_patterns = parser_hints.get("exclude_url_patterns") or []

    candidates: List[Candidate] = []
    seen = set()
@@ -28,10 +42,14 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
        title = unescape(TAG_RE.sub(" ", text)).strip()
        if not title:
            continue
-        absolute = urljoin(source["url"], href)
+        absolute = canonicalize_url(urljoin(source["url"], href))
        haystack = f"{title} {absolute}".lower()
        if keywords and not any(keyword in haystack for keyword in keywords):
            continue
+        if include_patterns and not _matches_patterns(absolute, include_patterns):
+            continue
+        if exclude_patterns and _matches_patterns(absolute, exclude_patterns):
+            continue
        if absolute in seen:
            continue
        seen.add(absolute)
--- a/scripts/intel/sources/json_feed.py
+++ b/scripts/intel/sources/json_feed.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+from typing import Any, Dict, List
+
+from intel.http_client import request
+from intel.models import Candidate
+from intel.utils import unique
+
+
+def _refs(item: Dict[str, Any]) -> List[str]:
+    """Collect reference URLs given as plain strings or as dicts with a truthy "url"."""
+    urls = [
+        ref if isinstance(ref, str) else ref["url"]
+        for ref in item.get("references") or []
+        if isinstance(ref, str) or (isinstance(ref, dict) and ref.get("url"))
+    ]
+    return unique(urls)
+
+
+def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
+    """Fetch a JSON feed and map its items to candidates; raises ValueError on a non-object payload."""
+    response = request("GET", source["url"], source=source)
+    response.raise_for_status()
+    payload = response.json()
+    if not isinstance(payload, dict):  # e.g. a top-level array: fail clearly rather than AttributeError on .get
+        raise ValueError("JSON feed returned non-object payload")
+    items = payload.get("items") or payload.get("entries") or payload.get("advisories") or []
+    if not isinstance(items, list):
+        return []
+    parser_hints = source.get("parser_hints") or {}
+    keywords = {kw.lower() for kw in (parser_hints.get("keywords") or source.get("keywords", []))}
+    candidates: List[Candidate] = []
+    for item in items[: source.get("max_items", 50)]:
+        if not isinstance(item, dict):
+            continue
+        title = item.get("title") or item.get("name") or item.get("summary") or f"JSON entry for {system['display_name']}"
+        link = item.get("url") or item.get("external_url") or item.get("html_url") or source["url"]
+        summary = item.get("summary") or item.get("content_text") or item.get("description") or ""
+        haystack = " ".join(str(part) for part in (title, summary, link) if part).lower()  # str(): tolerate non-string values
+        if keywords and not any(keyword in haystack for keyword in keywords):
+            continue
+        refs = _refs(item)
+        if link and link not in refs:
+            refs.insert(0, link)
+        candidate = Candidate(
+            system_id=system["system_id"],
+            display_name=system["display_name"],
+            category=system["category"],
+            advisory_mode=source.get("advisory_mode", "core"),
+            source_kind=source["kind"],
+            source_name=source["name"],
+            source_confidence=source["confidence"],
+            source_url=link,
+            title=title,
+            published_at=item.get("date_published") or item.get("published_at") or item.get("published") or item.get("created_at"),
+            updated_at=item.get("date_modified") or item.get("updated_at") or item.get("modified") or item.get("updated"),
+            summary=summary,
+            severity=str(item.get("severity") or "unknown").lower(),
+            aliases=unique(item.get("aliases") or ([item["id"]] if item.get("id") else [])),  # never [None]
+            references=refs,
+            raw=item,
+        )
+        candidates.append(candidate)
+    return candidates
--- a/scripts/intel/sources/nvd_api.py
+++ b/scripts/intel/sources/nvd_api.py
@@ -23,7 +23,7 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
    if api_key:
        headers["apiKey"] = api_key

-    response = request("GET", API_URL, headers=headers, params=params)
+    response = request("GET", API_URL, source=source, headers=headers, params=params)
    response.raise_for_status()
    payload = response.json()

--- a/scripts/intel/sources/osv_api.py
+++ b/scripts/intel/sources/osv_api.py
@@ -94,10 +94,11 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
        return []

    queries = [{"package": {"name": pkg["name"], "ecosystem": pkg["ecosystem"]}} for pkg in packages]
-    session = build_session()
+    session = build_session(source)
    response = request(
        "POST",
        QUERY_BATCH_URL,
+        source=source,
        session=session,
        json={"queries": queries},
        headers={"User-Agent": "websafe-intel"},
--- a/scripts/intel/sources/rss_feed.py
+++ b/scripts/intel/sources/rss_feed.py
@@ -15,11 +15,12 @@ def _text(node: ET.Element, name: str) -> str:


 def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
-    response = request("GET", source["url"])
+    response = request("GET", source["url"], source=source)
    response.raise_for_status()
    root = ET.fromstring(response.content)

-    keywords = {kw.lower() for kw in source.get("keywords", [])}
+    parser_hints = source.get("parser_hints") or {}
+    keywords = {kw.lower() for kw in (parser_hints.get("keywords") or source.get("keywords", []))}
    items = root.findall(".//item")
    candidates: List[Candidate] = []
    for item in items[: source.get("max_items", 50)]:
--- a/scripts/intel/sources/runner.py
+++ b/scripts/intel/sources/runner.py
@@ -8,11 +8,12 @@ from typing import Any, Dict, List, Optional, Tuple

 import requests

+from intel.config import iter_all_sources
 from intel.http_client import request
 from intel.models import Candidate
 from intel.utils import parse_dt

-from . import cisa_kev, github_global, html_links, nvd_api, osv_api, rss_feed
+from . import atom_feed, cisa_kev, github_global, html_links, json_feed, nvd_api, osv_api, rss_feed, vendor_index


 HANDLERS = {
@@ -21,11 +22,59 @@ HANDLERS = {
    "kev-json": cisa_kev.fetch,
    "nvd-search": nvd_api.fetch,
    "rss-feed": rss_feed.fetch,
+    "atom-feed": atom_feed.fetch,
+    "json-feed": json_feed.fetch,
    "html-links": html_links.fetch,
+    "vendor-index": vendor_index.fetch,
 }


-def _probe_source(system: Dict[str, Any], source: Dict[str, Any]) -> Dict[str, Any]:
+def _failure_category(exc: Exception) -> str:
+    """Map *exc* to a coarse bucket: tls, rate_limit, http_status, parse, network or schema."""
+    if isinstance(exc, requests.exceptions.SSLError):
+        return "tls"
+    if isinstance(exc, requests.exceptions.HTTPError):
+        status = getattr(getattr(exc, "response", None), "status_code", None)
+        return "rate_limit" if status == 429 else "http_status"
+    # requests' JSONDecodeError (absent from older releases, hence getattr) subclasses RequestException; test it before "network".
+    json_error = getattr(requests.exceptions, "JSONDecodeError", ET.ParseError)
+    if isinstance(exc, (ET.ParseError, json_error)):
+        return "parse"
+    if isinstance(exc, requests.exceptions.RequestException):
+        return "network"
+    if isinstance(exc, ValueError):
+        return "schema"
+    return "parse"
+
+
+def failure_summary(failure: Dict[str, Any]) -> str:
+    if not isinstance(failure, str):  # plain-string entries are returned unchanged below
+        return failure.get("summary") or f"{failure.get('system_id')}::{failure.get('source_name')}::{failure.get('category')}::{failure.get('exception')}"
+    return failure
+
+
+def _build_failure(system: Dict[str, Any], source: Dict[str, Any], exc: Exception) -> Dict[str, Any]:
+    """Describe a failed fetch or probe of *source* as a structured failure record."""
+    exc_name = exc.__class__.__name__
+    category = _failure_category(exc)
+    # An exception whose text is blank falls back to its class name.
+    message = str(exc).strip() or exc_name
+    return {
+        "system_id": system["system_id"],
+        "display_name": system["display_name"],
+        "source_name": source["name"],
+        "source_kind": source["kind"],
+        "source_bucket": source.get("bucket_name"),
+        "category": category,
+        "exception": exc_name,
+        "message": message,
+        "status_code": getattr(getattr(exc, "response", None), "status_code", None),
+        "url": source.get("url") or "",
+        "summary": f"{system['system_id']}::{source['name']}::{category}::{message}",
+    }
+
+
+def probe_source(system: Dict[str, Any], source: Dict[str, Any]) -> Dict[str, Any]:
    kind = source["kind"]
    if kind == "ghsa-global":
        headers = {"Accept": "application/vnd.github+json", "User-Agent": "websafe-intel"}
@@ -35,6 +84,7 @@ def _probe_source(system: Dict[str, Any], source: Dict[str, Any]) -> Dict[str, A
        response = request(
            "GET",
            github_global.API_URL,
+            source=source,
            headers=headers,
            params={"per_page": 1, "page": 1, "ecosystem": source.get("ecosystem")},
        )
@@ -52,6 +102,7 @@ def _probe_source(system: Dict[str, Any], source: Dict[str, Any]) -> Dict[str, A
        response = request(
            "POST",
            osv_api.QUERY_BATCH_URL,
+            source=source,
            json={"queries": [{"package": {"name": packages[0]["name"], "ecosystem": packages[0]["ecosystem"]}}]},
            headers={"User-Agent": "websafe-intel"},
        )
@@ -61,7 +112,7 @@ def _probe_source(system: Dict[str, Any], source: Dict[str, Any]) -> Dict[str, A
            raise ValueError("OSV probe returned non-object payload")
        return {"kind": kind, "items_seen": len(payload.get("results", []))}
    if kind == "kev-json":
-        response = request("GET", source["url"])
+        response = request("GET", source["url"], source=source)
        response.raise_for_status()
        payload = response.json()
        if not isinstance(payload, dict):
@@ -76,19 +127,37 @@ def _probe_source(system: Dict[str, Any], source: Dict[str, Any]) -> Dict[str, A
        api_key = os.environ.get("NVD_API_KEY")
        if api_key:
            headers["apiKey"] = api_key
-        response = request("GET", nvd_api.API_URL, headers=headers, params=params)
+        response = request("GET", nvd_api.API_URL, source=source, headers=headers, params=params)
        response.raise_for_status()
        payload = response.json()
        if not isinstance(payload, dict):
            raise ValueError("NVD probe returned non-object payload")
        return {"kind": kind, "items_seen": len(payload.get("vulnerabilities", []))}
    if kind == "rss-feed":
-        response = request("GET", source["url"])
+        response = request("GET", source["url"], source=source)
        response.raise_for_status()
        root = ET.fromstring(response.content)
        return {"kind": kind, "items_seen": len(root.findall(".//item"))}
+    if kind == "atom-feed":
+        response = request("GET", source["url"], source=source)
+        response.raise_for_status()
+        root = ET.fromstring(response.content)
+        return {"kind": kind, "items_seen": len(root.findall(".//{http://www.w3.org/2005/Atom}entry"))}
+    if kind == "json-feed":
+        response = request("GET", source["url"], source=source)
+        response.raise_for_status()
+        payload = response.json()
+        items = payload.get("items") or payload.get("entries") or payload.get("advisories") or []
+        if not isinstance(items, list):
+            raise ValueError("JSON feed probe returned non-list items")
+        return {"kind": kind, "items_seen": len(items)}
    if kind == "html-links":
-        response = request("GET", source["url"])
+        response = request("GET", source["url"], source=source)
+        response.raise_for_status()
+        html = response.text
+        return {"kind": kind, "items_seen": len(html_links.ANCHOR_RE.findall(html))}
+    if kind == "vendor-index":
+        response = request("GET", source["url"], source=source)
        response.raise_for_status()
        html = response.text
        return {"kind": kind, "items_seen": len(html_links.ANCHOR_RE.findall(html))}
@@ -110,47 +179,59 @@ def collect_candidates(
    since_dt: Optional[datetime] = None,
    tier: Optional[str] = None,
    include_undated: bool = False,
-) -> Tuple[List[Candidate], List[str]]:
+) -> Tuple[List[Candidate], List[Dict[str, Any]]]:
    all_candidates: List[Candidate] = []
-    failures: List[str] = []
+    failures: List[Dict[str, Any]] = []

    for system in source_map["systems"]:
        if tier and system.get("tier") != tier:
            continue
-        for bucket_name in ("official_sources", "ecosystem_sources", "research_sources"):
-            for source in system.get(bucket_name, []):
-                handler = HANDLERS.get(source["kind"])
-                if handler is None:
-                    failures.append(f"Unsupported source kind {source['kind']} for {system['system_id']}")
-                    continue
-                try:
-                    items = handler(system, source)
-                    for item in items:
-                        if _passes_since(item, since_dt, include_undated):
-                            all_candidates.append(item)
-                except Exception as exc:
-                    failures.append(f"{system['system_id']}::{source['name']}::{exc.__class__.__name__}")
+        for _system, _bucket_name, source in iter_all_sources({"systems": [system]}, include_retired=False):
+            handler = HANDLERS.get(source["kind"])
+            if handler is None:
+                failures.append(
+                    {
+                        "system_id": system["system_id"],
+                        "display_name": system["display_name"],
+                        "source_name": source["name"],
+                        "source_kind": source["kind"],
+                        "source_bucket": source.get("bucket_name"),
+                        "category": "schema",
+                        "exception": "UnsupportedSourceKind",
+                        "message": f"Unsupported source kind {source['kind']}",
+                        "status_code": None,
+                        "url": source.get("url") or "",
+                        "summary": f"{system['system_id']}::{source['name']}::schema::Unsupported source kind {source['kind']}",
+                    }
+                )
+                continue
+            try:
+                items = handler(system, source)
+                for item in items:
+                    if _passes_since(item, since_dt, include_undated):
+                        all_candidates.append(item)
+            except Exception as exc:
+                failures.append(_build_failure(system, source, exc))
    return all_candidates, failures


 def probe_sources(
    source_map: Dict[str, Any],
    tier: Optional[str] = None,
-) -> Tuple[List[Dict[str, Any]], List[str]]:
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    jobs: List[Tuple[Dict[str, Any], Dict[str, Any]]] = []
    probes: List[Dict[str, Any]] = []
-    failures: List[str] = []
+    failures: List[Dict[str, Any]] = []

    for system in source_map["systems"]:
        if tier and system.get("tier") != tier:
            continue
-        for bucket_name in ("official_sources", "ecosystem_sources", "research_sources"):
-            for source in system.get(bucket_name, []):
-                jobs.append((system, source))
+        for _system, _bucket_name, source in iter_all_sources({"systems": [system]}, include_retired=False):
+            jobs.append((system, source))

    max_workers = min(16, max(4, len(jobs) or 1))
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        future_map = {executor.submit(_probe_source, system, source): (system, source) for system, source in jobs}
+        future_map = {executor.submit(probe_source, system, source): (system, source) for system, source in jobs}
        for future in as_completed(future_map):
            system, source = future_map[future]
            try:
@@ -164,5 +245,15 @@ def probe_sources(
                    }
                )
            except Exception as exc:
-                failures.append(f"{system['system_id']}::{source['name']}::{exc.__class__.__name__}")
+                failures.append(_build_failure(system, source, exc))
    return probes, failures
+
+
+def find_source(source_map: Dict[str, Any], system_id: str, source_name: str) -> Optional[Tuple[Dict[str, Any], Dict[str, Any]]]:
+    """Return the ``(system, source)`` pair matching both names (retired sources included), or ``None``."""
+    for system in source_map.get("systems") or []:
+        if system.get("system_id") == system_id:
+            for _system, _bucket_name, source in iter_all_sources({"systems": [system]}, include_retired=True):
+                if source.get("name") == source_name:
+                    return system, source
+    return None
--- a/scripts/intel/sources/vendor_index.py
+++ b/scripts/intel/sources/vendor_index.py
@@ -0,0 +1,66 @@
+from __future__ import annotations
+
+import re
+from html import unescape
+from typing import Any, Dict, List
+from urllib.parse import urljoin
+
+from intel.http_client import request
+from intel.models import Candidate
+from intel.utils import unique
+
+from .html_links import ANCHOR_RE, TAG_RE, canonicalize_url
+
+
+def _matches(value: str, patterns: List[str]) -> bool:
+    # An empty pattern list imposes no restriction, so it matches everything.
+    hits = (re.search(pattern, value, re.IGNORECASE) for pattern in patterns or ())
+    return not patterns or any(hits)
+
+
+def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
+    """Scrape anchors from the page at ``source["url"]`` into filtered, de-duplicated candidates."""
+    response = request("GET", source["url"], source=source)
+    response.raise_for_status()
+    hints = source.get("parser_hints") or {}
+    keywords = {kw.lower() for kw in (hints.get("keywords") or source.get("keywords", []))}
+    include_patterns = hints.get("include_url_patterns") or []
+    exclude_patterns = hints.get("exclude_url_patterns") or []
+    limit = source.get("max_items", 50)
+
+    results: List[Candidate] = []
+    visited = set()
+    for href, inner_html in ANCHOR_RE.findall(response.text):
+        url = canonicalize_url(urljoin(source["url"], href))
+        title = unescape(TAG_RE.sub(" ", inner_html)).strip()
+        if not title:
+            continue
+        searchable = " ".join(filter(None, [url, title])).lower()
+        if keywords and not any(keyword in searchable for keyword in keywords):
+            continue
+        if include_patterns and not _matches(url, include_patterns):
+            continue
+        if exclude_patterns and _matches(url, exclude_patterns):
+            continue
+        if url in visited:
+            continue
+        visited.add(url)
+        candidate = Candidate(
+            system_id=system["system_id"],
+            display_name=system["display_name"],
+            category=system["category"],
+            advisory_mode=source.get("advisory_mode", "core"),
+            source_kind=source["kind"],
+            source_name=source["name"],
+            source_confidence=source["confidence"],
+            source_url=url,
+            title=title,
+            summary="",
+            severity="unknown",
+            references=unique([url]),
+            raw={"href": url, "title": title},
+        )
+        results.append(candidate)
+        if len(results) >= limit:
+            break
+    return results