kb: expand authorized lab coverage and intel automation

2026-03-16 22:04:51 -07:00
--- a/scripts/intel/sources/html_links.py
+++ b/scripts/intel/sources/html_links.py
@@ -0,0 +1,85 @@
+from __future__ import annotations
+
+import re
+from html import unescape
+from typing import Any, Dict, List
+from urllib.parse import urljoin
+
+import requests
+
+from intel.models import Candidate
+from intel.utils import unique
+
+
+# Captures (href, inner HTML) for every anchor that carries a quoted href.
+ANCHOR_RE = re.compile(r"<a[^>]+href=[\"']([^\"']+)[\"'][^>]*>(.*?)</a>", re.IGNORECASE | re.DOTALL)
+# Matches any markup tag so nested tags can be stripped from link text.
+TAG_RE = re.compile(r"<[^>]+>")
+
+
+def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
+    """Scrape the anchors on ``source["url"]`` and turn them into candidates.
+
+    A link is kept when its tag-stripped text is non-empty, its "title URL"
+    string contains at least one of ``source["keywords"]`` (case-insensitive;
+    no keywords disables the filter) and its absolute URL has not already
+    been emitted for this page.
+
+    Args:
+        system: Supplies ``system_id``, ``display_name`` and ``category``.
+        source: Supplies ``url``, ``kind``, ``name`` and ``confidence``, plus
+            the optional ``keywords``, ``advisory_mode`` (default ``"core"``)
+            and ``max_items`` (default 50).
+
+    Returns:
+        Up to ``max_items`` candidates in page order; an empty list when
+        ``max_items`` is zero or negative.
+
+    Raises:
+        requests.RequestException: The request failed, or the server replied
+            with an error status (``raise_for_status``).
+        KeyError: A required ``system`` or ``source`` key is missing.
+    """
+    base_url = source["url"]
+    limit = source.get("max_items", 50)
+    response = requests.get(base_url, headers={"User-Agent": "websafe-intel"}, timeout=30)
+    response.raise_for_status()
+    keywords = {kw.lower() for kw in source.get("keywords", [])}
+
+    candidates: List[Candidate] = []
+    seen = set()
+    for href, text in ANCHOR_RE.findall(response.text):
+        # Test the cap before doing any work so that ``max_items <= 0`` yields
+        # nothing instead of leaking the first match.
+        if len(candidates) >= limit:
+            break
+        title = unescape(TAG_RE.sub(" ", text)).strip()
+        if not title:
+            continue
+        # Attribute values are entity-encoded ("?a=1&amp;b=2"); decode them
+        # before resolving or the stored URL keeps the literal "&amp;".
+        absolute = urljoin(base_url, unescape(href).strip())
+        haystack = f"{title} {absolute}".lower()
+        if keywords and not any(keyword in haystack for keyword in keywords):
+            continue
+        if absolute in seen:
+            continue
+        seen.add(absolute)
+        candidates.append(
+            Candidate(
+                system_id=system["system_id"],
+                display_name=system["display_name"],
+                category=system["category"],
+                advisory_mode=source.get("advisory_mode", "core"),
+                source_kind=source["kind"],
+                source_name=source["name"],
+                source_confidence=source["confidence"],
+                source_url=absolute,
+                title=title,
+                summary="",
+                severity="unknown",
+                references=unique([absolute]),
+                raw={"href": absolute, "title": title},
+            )
+        )
+    return candidates