# websafe-kb/scripts/intel/sources/github_global.py

from __future__ import annotations

import os
from typing import Any, Dict, List

import requests

from intel.models import Candidate
from intel.utils import unique


# GitHub REST endpoint queried by fetch() for security advisories; also used
# as the fallback source_url when an advisory carries no html_url.
API_URL = "https://api.github.com/advisories"


def _matching_vulnerabilities(advisory: Dict[str, Any], wanted_packages: set) -> List[Dict[str, Any]]:
    """Return the advisory's vulnerability entries that name a tracked package.

    ``wanted_packages`` holds lower-cased ``(name, ecosystem)`` pairs; both
    parts must match for an entry to be kept.
    """
    matched = []
    for vuln in advisory.get("vulnerabilities") or []:
        package = vuln.get("package") or {}
        key = (
            (package.get("name") or "").lower(),
            (package.get("ecosystem") or "").lower(),
        )
        if key in wanted_packages:
            matched.append(vuln)
    return matched


def _mentions_keyword(advisory: Dict[str, Any], keywords: set) -> bool:
    """Return True when any keyword occurs in the advisory's text or ids.

    The search is a case-insensitive substring test over the summary,
    description, GHSA id and CVE id. ``keywords`` must already be lower-cased.
    """
    if not keywords:
        return False
    haystack = " ".join(
        filter(
            None,
            (
                advisory.get("summary"),
                advisory.get("description"),
                advisory.get("ghsa_id"),
                advisory.get("cve_id"),
            ),
        )
    ).lower()
    return any(keyword in haystack for keyword in keywords)


def _build_candidate(
    system: Dict[str, Any],
    source: Dict[str, Any],
    advisory: Dict[str, Any],
    matched_vulns: List[Dict[str, Any]],
) -> Candidate:
    """Convert one advisory (plus its matched vulnerability entries) to a Candidate."""
    affected_versions = []
    fixed_versions = []
    package_name = None
    for vuln in matched_vulns:
        if vuln.get("vulnerable_version_range"):
            affected_versions.append(vuln["vulnerable_version_range"])
        patched = vuln.get("first_patched_version") or {}
        if patched.get("identifier"):
            fixed_versions.append(patched["identifier"])
        # Only the first matched package name is recorded on the candidate.
        if not package_name and vuln.get("package"):
            package_name = vuln["package"].get("name")

    ghsa_id = advisory.get("ghsa_id")
    cve_id = advisory.get("cve_id")
    html_url = advisory.get("html_url")

    # The API reports ``identifiers`` as {"type": ..., "value": ...} objects;
    # keep only the id strings so aliases stays a flat list of identifiers.
    identifier_values = [
        entry.get("value") if isinstance(entry, dict) else entry
        for entry in advisory.get("identifiers") or []
    ]

    return Candidate(
        system_id=system["system_id"],
        display_name=system["display_name"],
        category=system["category"],
        advisory_mode=source.get("advisory_mode", "core"),
        source_kind=source["kind"],
        source_name=source["name"],
        source_confidence=source["confidence"],
        source_url=html_url or API_URL,
        title=advisory.get("summary") or ghsa_id or "GitHub advisory",
        published_at=advisory.get("published_at"),
        updated_at=advisory.get("updated_at"),
        summary=advisory.get("description") or "",
        severity=(advisory.get("severity") or "unknown").lower(),
        aliases=unique([ghsa_id, cve_id, *identifier_values]),
        cve_ids=[cve_id] if cve_id else [],
        ghsa_ids=[ghsa_id] if ghsa_id else [],
        affected_versions=unique(affected_versions),
        fixed_versions=unique(fixed_versions),
        package_name=package_name,
        references=[html_url] if html_url else [],
        raw=advisory,
    )


def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
    """Collect GitHub global security advisories relevant to ``system``.

    An advisory is kept when at least one of its ``vulnerabilities`` entries
    names a package/ecosystem pair listed in ``system["package_names"]``, or
    when any entry of ``system["ghsa_keywords"]`` occurs (case-insensitively)
    in its summary, description, GHSA id or CVE id.

    Args:
        system: System descriptor. ``system_id``, ``display_name`` and
            ``category`` are required once an advisory matches;
            ``package_names`` (dicts with ``name`` and ``ecosystem``) and
            ``ghsa_keywords`` are optional.
        source: Source descriptor. ``kind``, ``name`` and ``confidence`` are
            required once an advisory matches; ``ecosystem`` (sent as an API
            filter) and ``advisory_mode`` (default ``"core"``) are optional.

    Returns:
        One ``Candidate`` per matching advisory, in the order the API
        returned them.

    Raises:
        requests.HTTPError: When GitHub reports a rate limit or answers with
            any other error status.
    """
    headers = {"Accept": "application/vnd.github+json", "User-Agent": "websafe-intel"}
    token = os.environ.get("GITHUB_TOKEN")
    if token:
        headers["Authorization"] = f"Bearer {token}"

    # (name, ecosystem) pairs rather than a name -> ecosystem mapping, so the
    # same package name can be tracked in more than one ecosystem.
    wanted_packages = {
        (item["name"].lower(), item["ecosystem"].lower())
        for item in system.get("package_names") or []
        if item.get("name") and item.get("ecosystem")
    }
    # Empty keywords are dropped: "" is a substring of every haystack and
    # would otherwise match every advisory.
    keywords = {value.lower() for value in system.get("ghsa_keywords") or [] if value}

    candidates: List[Candidate] = []

    # This endpoint is cursor-paginated: a ``page`` parameter is not honoured,
    # so the next page is taken from the Link header instead. The "next" URL
    # already carries the query string, hence params is cleared after the
    # first request.
    url = API_URL
    params = {"per_page": 100, "ecosystem": source.get("ecosystem")}
    while url:
        response = requests.get(url, headers=headers, params=params, timeout=30)
        if response.status_code in (403, 429) and "rate limit" in response.text.lower():
            raise requests.HTTPError("GitHub advisory rate limit exceeded; set GITHUB_TOKEN for higher quota", response=response)
        response.raise_for_status()
        advisories = response.json()
        if not advisories:
            break

        for advisory in advisories:
            matched_vulns = _matching_vulnerabilities(advisory, wanted_packages)
            if not matched_vulns and not _mentions_keyword(advisory, keywords):
                continue
            candidates.append(_build_candidate(system, source, advisory, matched_vulns))

        url = (response.links.get("next") or {}).get("url")
        params = None

    return candidates