"""Collect candidates from, and probe the reachability of, configured sources.

Each system in the source map lists sources in up to three buckets
(``official_sources``, ``ecosystem_sources``, ``research_sources``).  Every
source has a ``kind`` that selects a fetch handler (``HANDLERS``) and a
lightweight probe (``_PROBES``).
"""
from __future__ import annotations

import logging
import os
import xml.etree.ElementTree as ET
# ``as_completed`` is no longer used in this module; the import is retained so
# that nothing importing it from here breaks.
from concurrent.futures import ThreadPoolExecutor, as_completed  # noqa: F401
from datetime import datetime
from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple

import requests

from intel.http_client import request
from intel.models import Candidate
from intel.utils import parse_dt

from . import cisa_kev, github_global, html_links, nvd_api, osv_api, rss_feed

logger = logging.getLogger(__name__)

# Maps a source ``kind`` to the function that fetches its candidates.
HANDLERS = {
    "ghsa-global": github_global.fetch,
    "osv-batch": osv_api.fetch,
    "kev-json": cisa_kev.fetch,
    "nvd-search": nvd_api.fetch,
    "rss-feed": rss_feed.fetch,
    "html-links": html_links.fetch,
}

# Per-system keys under which sources are listed, in traversal order.
_SOURCE_BUCKETS = ("official_sources", "ecosystem_sources", "research_sources")
_USER_AGENT = "websafe-intel"
# Upper bound on concurrent probe requests.
_MAX_PROBE_WORKERS = 16


def _iter_sources(
    source_map: Dict[str, Any],
    tier: Optional[str],
) -> Iterator[Tuple[Dict[str, Any], Dict[str, Any]]]:
    """Yield ``(system, source)`` pairs in configuration order.

    Systems whose ``tier`` differs from *tier* are skipped; a falsy *tier*
    disables the filter.
    """
    for system in source_map["systems"]:
        if tier and system.get("tier") != tier:
            continue
        for bucket_name in _SOURCE_BUCKETS:
            for source in system.get(bucket_name, []):
                yield system, source


def _record_failure(
    failures: List[str],
    system: Dict[str, Any],
    source: Dict[str, Any],
    exc: Exception,
) -> None:
    """Append the ``system::source::ExceptionClass`` label and log the detail.

    The label format is unchanged for callers; the exception message (for
    example the GitHub rate-limit hint) would otherwise be lost, so it is
    emitted through the module logger.
    """
    failures.append(f"{system['system_id']}::{source['name']}::{exc.__class__.__name__}")
    logger.warning("Source %s::%s failed: %s", system["system_id"], source["name"], exc)


def _json_object(response: Any, label: str) -> Dict[str, Any]:
    """Decode *response* as JSON and require the top-level value to be a dict."""
    payload = response.json()
    if not isinstance(payload, dict):
        raise ValueError(f"{label} probe returned non-object payload")
    return payload


def _probe_ghsa_global(system: Dict[str, Any], source: Dict[str, Any]) -> int:
    """Request one page of one advisory from the GitHub advisory endpoint."""
    headers = {"Accept": "application/vnd.github+json", "User-Agent": _USER_AGENT}
    token = os.environ.get("GITHUB_TOKEN")
    if token:
        headers["Authorization"] = f"Bearer {token}"
    response = request(
        "GET",
        github_global.API_URL,
        headers=headers,
        params={"per_page": 1, "page": 1, "ecosystem": source.get("ecosystem")},
    )
    # Give rate limiting an actionable message instead of a generic 403.
    if response.status_code == 403 and "rate limit" in response.text.lower():
        raise requests.HTTPError(
            "GitHub advisory rate limit exceeded; set GITHUB_TOKEN for higher quota",
            response=response,
        )
    response.raise_for_status()
    payload = response.json()
    if not isinstance(payload, list):
        raise ValueError("GitHub advisory probe returned non-list payload")
    return len(payload)


def _probe_osv_batch(system: Dict[str, Any], source: Dict[str, Any]) -> int:
    """Send a one-query batch to OSV for the system's first package."""
    packages = system.get("package_names", [])
    if not packages:
        # Nothing to query; report zero items without touching the network.
        return 0
    first = packages[0]
    response = request(
        "POST",
        osv_api.QUERY_BATCH_URL,
        json={"queries": [{"package": {"name": first["name"], "ecosystem": first["ecosystem"]}}]},
        headers={"User-Agent": _USER_AGENT},
    )
    response.raise_for_status()
    return len(_json_object(response, "OSV").get("results", []))


def _probe_kev_json(system: Dict[str, Any], source: Dict[str, Any]) -> int:
    """Fetch the KEV JSON document at ``source["url"]`` and count its entries."""
    response = request("GET", source["url"])
    response.raise_for_status()
    return len(_json_object(response, "KEV").get("vulnerabilities", []))


def _probe_nvd_search(system: Dict[str, Any], source: Dict[str, Any]) -> int:
    """Run a single-result NVD keyword search for the source or system name."""
    params = {
        "keywordSearch": source.get("keyword") or system["display_name"],
        "resultsPerPage": 1,
    }
    headers = {"User-Agent": _USER_AGENT}
    api_key = os.environ.get("NVD_API_KEY")
    if api_key:
        headers["apiKey"] = api_key
    response = request("GET", nvd_api.API_URL, headers=headers, params=params)
    response.raise_for_status()
    return len(_json_object(response, "NVD").get("vulnerabilities", []))


def _probe_rss_feed(system: Dict[str, Any], source: Dict[str, Any]) -> int:
    """Fetch the feed at ``source["url"]`` and count its ``<item>`` elements."""
    response = request("GET", source["url"])
    response.raise_for_status()
    # NOTE(review): the stdlib XML parser is not hardened against malicious
    # documents; confirm these feeds are trusted.
    root = ET.fromstring(response.content)
    return len(root.findall(".//item"))


def _probe_html_links(system: Dict[str, Any], source: Dict[str, Any]) -> int:
    """Fetch the page at ``source["url"]`` and count ``ANCHOR_RE`` matches."""
    response = request("GET", source["url"])
    response.raise_for_status()
    return len(html_links.ANCHOR_RE.findall(response.text))


# Maps a source ``kind`` to its probe; keys mirror ``HANDLERS``.
_PROBES: Dict[str, Callable[[Dict[str, Any], Dict[str, Any]], int]] = {
    "ghsa-global": _probe_ghsa_global,
    "osv-batch": _probe_osv_batch,
    "kev-json": _probe_kev_json,
    "nvd-search": _probe_nvd_search,
    "rss-feed": _probe_rss_feed,
    "html-links": _probe_html_links,
}


def _probe_source(system: Dict[str, Any], source: Dict[str, Any]) -> Dict[str, Any]:
    """Issue one minimal request against *source* and report what came back.

    Returns:
        ``{"kind": <source kind>, "items_seen": <count>}``.

    Raises:
        KeyError: if *source* has no ``"kind"`` (or a required key is missing).
        ValueError: if the kind is unsupported or the payload has the wrong
            top-level type.
        requests.HTTPError: on GitHub rate limiting; whatever
            ``response.raise_for_status()`` raises is propagated too.
    """
    kind = source["kind"]
    probe = _PROBES.get(kind)
    if probe is None:
        raise ValueError(f"Unsupported source kind {kind}")
    return {"kind": kind, "items_seen": probe(system, source)}


def _passes_since(candidate: Candidate, since_dt: Optional[datetime], include_undated: bool) -> bool:
    """Return whether *candidate* is recent enough to keep.

    With no *since_dt* everything passes.  Otherwise the later of the
    candidate's parsed ``updated_at`` / ``published_at`` must be at or after
    *since_dt*; a candidate with neither timestamp parseable passes only when
    *include_undated* is true.
    """
    if since_dt is None:
        return True
    # NOTE(review): assumes parse_dt yields values comparable with since_dt
    # (mixing naive and aware datetimes raises TypeError) - confirm.
    stamps = [
        stamp
        for stamp in (parse_dt(candidate.updated_at), parse_dt(candidate.published_at))
        if stamp is not None
    ]
    if not stamps:
        return include_undated
    return max(stamps) >= since_dt


def collect_candidates(
    source_map: Dict[str, Any],
    since_dt: Optional[datetime] = None,
    tier: Optional[str] = None,
    include_undated: bool = False,
) -> Tuple[List[Candidate], List[str]]:
    """Run every configured source handler and gather the candidates.

    Args:
        source_map: Configuration with a ``"systems"`` list.
        since_dt: If given, drop candidates older than this (see
            ``_passes_since``).
        tier: If given, only systems with this ``tier`` are processed.
        include_undated: Whether candidates without a usable timestamp are
            kept when *since_dt* is set.

    Returns:
        ``(candidates, failures)``.  A failing source does not abort the run:
        it contributes a ``system_id::source_name::ExceptionClass`` entry to
        *failures* (or an "Unsupported source kind" entry when no handler
        exists).  Candidates accepted from a source before it failed are kept.
    """
    all_candidates: List[Candidate] = []
    failures: List[str] = []
    for system, source in _iter_sources(source_map, tier):
        handler = HANDLERS.get(source["kind"])
        if handler is None:
            failures.append(f"Unsupported source kind {source['kind']} for {system['system_id']}")
            continue
        try:
            for item in handler(system, source):
                if _passes_since(item, since_dt, include_undated):
                    all_candidates.append(item)
        except Exception as exc:  # best-effort boundary: one bad source must not stop the rest
            _record_failure(failures, system, source, exc)
    return all_candidates, failures


def probe_sources(
    source_map: Dict[str, Any],
    tier: Optional[str] = None,
) -> Tuple[List[Dict[str, Any]], List[str]]:
    """Probe every configured source concurrently.

    Args:
        source_map: Configuration with a ``"systems"`` list.
        tier: If given, only systems with this ``tier`` are probed.

    Returns:
        ``(probes, failures)``.  Each probe dict carries ``system_id``,
        ``source_name``, ``source_kind`` plus the keys returned by
        ``_probe_source``.  Each failure is a
        ``system_id::source_name::ExceptionClass`` string.  Both lists follow
        configuration order, so repeated runs produce stably ordered output.
    """
    jobs = list(_iter_sources(source_map, tier))
    probes: List[Dict[str, Any]] = []
    failures: List[str] = []
    if not jobs:
        return probes, failures

    with ThreadPoolExecutor(max_workers=min(_MAX_PROBE_WORKERS, len(jobs))) as executor:
        futures = [executor.submit(_probe_source, system, source) for system, source in jobs]
    # Leaving the ``with`` block waits for every future, so the results can be
    # read back in submission order without losing any concurrency.
    for (system, source), future in zip(jobs, futures):
        try:
            result = future.result()
            probes.append(
                {
                    "system_id": system["system_id"],
                    "source_name": source["name"],
                    "source_kind": source["kind"],
                    **result,
                }
            )
        except Exception as exc:  # best-effort boundary: report and continue
            _record_failure(failures, system, source, exc)
    return probes, failures