更新: 13 个文件 - 2026-03-18 09:44:57

2026-03-18 09:44:57 -07:00
--- a/scripts/intel/sources/html_links.py
+++ b/scripts/intel/sources/html_links.py
@@ -3,7 +3,7 @@ from __future__ import annotations
 import re
 from html import unescape
 from typing import Any, Dict, List
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlsplit, urlunsplit

 import requests

@@ -16,11 +16,25 @@ ANCHOR_RE = re.compile(r"<a[^>]+href=[\"']([^\"']+)[\"'][^>]*>(.*?)</a>", re.IGN
 TAG_RE = re.compile(r"<[^>]+>")


+def canonicalize_url(url: str) -> str:  # Return `url` with its "#fragment" component removed.
+    parsed = urlsplit(url)  # scheme/netloc/path/query are handed back to urlunsplit; fragment becomes ""
+    return urlunsplit((parsed.scheme, parsed.netloc, parsed.path, parsed.query, ""))
+
+
+def _matches_patterns(value: str, patterns: List[str]) -> bool:  # True if any regex in `patterns` is found in `value`.
+    if not patterns:  # An empty (or otherwise falsy) pattern list imposes no restriction...
+        return True  # ...so every value counts as a match.
+    return any(re.search(pattern, value, re.IGNORECASE) for pattern in patterns)  # unanchored, case-insensitive
+
+
 def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
-    response = request("GET", source["url"])
+    response = request("GET", source["url"], source=source)
    response.raise_for_status()
    html = response.text
-    keywords = {kw.lower() for kw in source.get("keywords", [])}
+    parser_hints = source.get("parser_hints") or {}
+    keywords = {kw.lower() for kw in (parser_hints.get("keywords") or source.get("keywords", []))}
+    include_patterns = parser_hints.get("include_url_patterns") or []
+    exclude_patterns = parser_hints.get("exclude_url_patterns") or []

    candidates: List[Candidate] = []
    seen = set()
@@ -28,10 +42,14 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
        title = unescape(TAG_RE.sub(" ", text)).strip()
        if not title:
            continue
-        absolute = urljoin(source["url"], href)
+        absolute = canonicalize_url(urljoin(source["url"], href))
        haystack = f"{title} {absolute}".lower()
        if keywords and not any(keyword in haystack for keyword in keywords):
            continue
+        if include_patterns and not _matches_patterns(absolute, include_patterns):
+            continue
+        if exclude_patterns and _matches_patterns(absolute, exclude_patterns):
+            continue
        if absolute in seen:
            continue
        seen.add(absolute)