Expand intel coverage and refresh monitoring

2026-03-18 14:18:09 -07:00
--- a/scripts/intel/sources/html_links.py
+++ b/scripts/intel/sources/html_links.py
@@ -14,6 +14,16 @@ from intel.utils import unique

 ANCHOR_RE = re.compile(r"<a[^>]+href=[\"']([^\"']+)[\"'][^>]*>(.*?)</a>", re.IGNORECASE | re.DOTALL)
 TAG_RE = re.compile(r"<[^>]+>")
+GENERIC_TITLES = {  # lowercase anchor labels treated as generic (see _is_generic_title)
+    "permalink",
+    "discuss this topic",
+    "read full topic",
+    "read more",
+}
+
+
+def _is_generic_title(title: str) -> bool:  # True when title is one of GENERIC_TITLES
+    return title.strip().lower() in GENERIC_TITLES  # ignores case and surrounding whitespace


 def canonicalize_url(url: str) -> str:
@@ -37,7 +47,8 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
    exclude_patterns = parser_hints.get("exclude_url_patterns") or []

    candidates: List[Candidate] = []
-    seen = set()
+    by_url: Dict[str, Candidate] = {}
+    ordered_urls: List[str] = []
    for href, text in ANCHOR_RE.findall(html):
        title = unescape(TAG_RE.sub(" ", text)).strip()
        if not title:
@@ -50,11 +61,10 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
            continue
        if exclude_patterns and _matches_patterns(absolute, exclude_patterns):
            continue
-        if absolute in seen:
-            continue
-        seen.add(absolute)
-        candidates.append(
-            Candidate(
+        existing = by_url.get(absolute)
+        if existing is None:
+            ordered_urls.append(absolute)
+            by_url[absolute] = Candidate(
                system_id=system["system_id"],
                display_name=system["display_name"],
                category=system["category"],
@@ -69,7 +79,17 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
                references=unique([absolute]),
                raw={"href": absolute, "title": title},
            )
-        )
-        if len(candidates) >= source.get("max_items", 50):
-            break
+            continue
+        if _is_generic_title(existing.title) and not _is_generic_title(title):
+            existing.title = title
+            existing.raw = {"href": absolute, "title": title}
+            continue
+        if _is_generic_title(title) and not _is_generic_title(existing.title):
+            continue
+        if len(title) > len(existing.title):
+            existing.title = title
+            existing.raw = {"href": absolute, "title": title}
+
+    for absolute in ordered_urls[: source.get("max_items", 50)]:
+        candidates.append(by_url[absolute])
    return candidates