from __future__ import annotations import os import xml.etree.ElementTree as ET from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime from typing import Any, Dict, List, Optional, Tuple import requests from intel.config import iter_all_sources from intel.http_client import request from intel.models import Candidate from intel.utils import parse_dt from . import atom_feed, cisa_kev, github_global, html_links, json_feed, nvd_api, osv_api, rss_feed, vendor_index HANDLERS = { "ghsa-global": github_global.fetch, "osv-batch": osv_api.fetch, "kev-json": cisa_kev.fetch, "nvd-search": nvd_api.fetch, "rss-feed": rss_feed.fetch, "atom-feed": atom_feed.fetch, "json-feed": json_feed.fetch, "html-links": html_links.fetch, "vendor-index": vendor_index.fetch, } def _failure_category(exc: Exception) -> str: if isinstance(exc, requests.exceptions.SSLError): return "tls" if isinstance(exc, requests.exceptions.HTTPError): response = getattr(exc, "response", None) status = getattr(response, "status_code", None) if status == 429: return "rate_limit" return "http_status" if isinstance(exc, requests.exceptions.RequestException): return "network" if isinstance(exc, ET.ParseError): return "parse" if isinstance(exc, ValueError): return "schema" return "parse" def failure_summary(failure: Dict[str, Any]) -> str: if isinstance(failure, str): return failure return failure.get("summary") or f"{failure.get('system_id')}::{failure.get('source_name')}::{failure.get('category')}::{failure.get('exception')}" def build_failure(system: Dict[str, Any], source: Dict[str, Any], exc: Exception) -> Dict[str, Any]: response = getattr(exc, "response", None) status_code = getattr(response, "status_code", None) category = _failure_category(exc) message = str(exc).strip() or exc.__class__.__name__ summary = f"{system['system_id']}::{source['name']}::{category}::{message}" return { "system_id": system["system_id"], "display_name": system["display_name"], "source_name": source["name"], "source_kind": source["kind"], "source_bucket": source.get("bucket_name"), "category": category, "exception": exc.__class__.__name__, "message": message, "status_code": status_code, "url": source.get("url") or "", "summary": summary, } def probe_source(system: Dict[str, Any], source: Dict[str, Any]) -> Dict[str, Any]: kind = source["kind"] if kind == "ghsa-global": headers = {"Accept": "application/vnd.github+json", "User-Agent": "websafe-intel"} token = os.environ.get("GITHUB_TOKEN") if token: headers["Authorization"] = f"Bearer {token}" response = request( "GET", github_global.API_URL, source=source, headers=headers, params={"per_page": 1, "page": 1, "ecosystem": source.get("ecosystem")}, ) if response.status_code == 403 and "rate limit" in response.text.lower(): raise requests.HTTPError("GitHub advisory rate limit exceeded; set GITHUB_TOKEN for higher quota", response=response) response.raise_for_status() payload = response.json() if not isinstance(payload, list): raise ValueError("GitHub advisory probe returned non-list payload") return {"kind": kind, "items_seen": len(payload)} if kind == "osv-batch": packages = system.get("package_names", []) if not packages: return {"kind": kind, "items_seen": 0} response = request( "POST", osv_api.QUERY_BATCH_URL, source=source, json={"queries": [{"package": {"name": packages[0]["name"], "ecosystem": packages[0]["ecosystem"]}}]}, headers={"User-Agent": "websafe-intel"}, ) response.raise_for_status() payload = response.json() if not isinstance(payload, dict): raise ValueError("OSV probe returned non-object payload") return {"kind": kind, "items_seen": len(payload.get("results", []))} if kind == "kev-json": response = request("GET", source["url"], source=source) response.raise_for_status() payload = response.json() if not isinstance(payload, dict): raise ValueError("KEV probe returned non-object payload") return {"kind": kind, "items_seen": len(payload.get("vulnerabilities", []))} if kind == "nvd-search": params = { "keywordSearch": source.get("keyword") or system["display_name"], "resultsPerPage": 1, } headers = {"User-Agent": "websafe-intel"} api_key = os.environ.get("NVD_API_KEY") if api_key: headers["apiKey"] = api_key response = nvd_api.request_nvd(source, headers, params) response.raise_for_status() payload = response.json() if not isinstance(payload, dict): raise ValueError("NVD probe returned non-object payload") return {"kind": kind, "items_seen": len(payload.get("vulnerabilities", []))} if kind == "rss-feed": response = request("GET", source["url"], source=source) response.raise_for_status() root = ET.fromstring(response.content) return {"kind": kind, "items_seen": len(root.findall(".//item"))} if kind == "atom-feed": response = request("GET", source["url"], source=source) response.raise_for_status() root = ET.fromstring(response.content) return {"kind": kind, "items_seen": len(root.findall(".//{http://www.w3.org/2005/Atom}entry"))} if kind == "json-feed": response = request("GET", source["url"], source=source) response.raise_for_status() payload = response.json() items = payload.get("items") or payload.get("entries") or payload.get("advisories") or [] if not isinstance(items, list): raise ValueError("JSON feed probe returned non-list items") return {"kind": kind, "items_seen": len(items)} if kind == "html-links": response = request("GET", source["url"], source=source) response.raise_for_status() html = response.text return {"kind": kind, "items_seen": len(html_links.ANCHOR_RE.findall(html))} if kind == "vendor-index": response = request("GET", source["url"], source=source) response.raise_for_status() html = response.text return {"kind": kind, "items_seen": len(vendor_index.extract_links(html))} raise ValueError(f"Unsupported source kind {kind}") def _passes_since(candidate: Candidate, since_dt: Optional[datetime], include_undated: bool) -> bool: if since_dt is None: return True timestamps = [parse_dt(candidate.updated_at), parse_dt(candidate.published_at)] valid = [item for item in timestamps if item is not None] if not valid: return include_undated return max(valid) >= since_dt def collect_candidates( source_map: Dict[str, Any], since_dt: Optional[datetime] = None, tier: Optional[str] = None, include_undated: bool = False, ) -> Tuple[List[Candidate], List[Dict[str, Any]]]: all_candidates: List[Candidate] = [] failures: List[Dict[str, Any]] = [] for system in source_map["systems"]: if tier and system.get("tier") != tier: continue for _system, _bucket_name, source in iter_all_sources({"systems": [system]}, include_retired=False): handler = HANDLERS.get(source["kind"]) if handler is None: failures.append( { "system_id": system["system_id"], "display_name": system["display_name"], "source_name": source["name"], "source_kind": source["kind"], "source_bucket": source.get("bucket_name"), "category": "schema", "exception": "UnsupportedSourceKind", "message": f"Unsupported source kind {source['kind']}", "status_code": None, "url": source.get("url") or "", "summary": f"{system['system_id']}::{source['name']}::schema::Unsupported source kind {source['kind']}", } ) continue try: items = handler(system, source) for item in items: if _passes_since(item, since_dt, include_undated): all_candidates.append(item) except Exception as exc: failures.append(build_failure(system, source, exc)) return all_candidates, failures def probe_sources( source_map: Dict[str, Any], tier: Optional[str] = None, ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: jobs: List[Tuple[Dict[str, Any], Dict[str, Any]]] = [] probes: List[Dict[str, Any]] = [] failures: List[Dict[str, Any]] = [] for system in source_map["systems"]: if tier and system.get("tier") != tier: continue for _system, _bucket_name, source in iter_all_sources({"systems": [system]}, include_retired=False): jobs.append((system, source)) max_workers = min(16, max(4, len(jobs) or 1)) with ThreadPoolExecutor(max_workers=max_workers) as executor: future_map = {executor.submit(probe_source, system, source): (system, source) for system, source in jobs} for future in as_completed(future_map): system, source = future_map[future] try: result = future.result() probes.append( { "system_id": system["system_id"], "source_name": source["name"], "source_kind": source["kind"], **result, } ) except Exception as exc: failures.append(build_failure(system, source, exc)) return probes, failures def find_source(source_map: Dict[str, Any], system_id: str, source_name: str) -> Tuple[Dict[str, Any], Dict[str, Any]] | None: for system in source_map.get("systems", []) or []: if system.get("system_id") != system_id: continue for _system, _bucket_name, source in iter_all_sources({"systems": [system]}, include_retired=True): if source.get("name") == source_name: return system, source return None