Expand intel coverage and refresh monitoring

这个提交包含在:
hao
2026-03-18 14:18:09 -07:00
父节点 87008d1bd5
当前提交 00d828d090
修改 3658 个文件,包含 124245 行新增和 13073 行删除

查看文件

@@ -14,6 +14,16 @@ from intel.utils import unique
ANCHOR_RE = re.compile(r"<a[^>]+href=[\"']([^\"']+)[\"'][^>]*>(.*?)</a>", re.IGNORECASE | re.DOTALL)
TAG_RE = re.compile(r"<[^>]+>")
GENERIC_TITLES = {
"permalink",
"discuss this topic",
"read full topic",
"read more",
}
def _is_generic_title(title: str) -> bool:
return title.strip().lower() in GENERIC_TITLES
def canonicalize_url(url: str) -> str:
@@ -37,7 +47,8 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
exclude_patterns = parser_hints.get("exclude_url_patterns") or []
candidates: List[Candidate] = []
seen = set()
by_url: Dict[str, Candidate] = {}
ordered_urls: List[str] = []
for href, text in ANCHOR_RE.findall(html):
title = unescape(TAG_RE.sub(" ", text)).strip()
if not title:
@@ -50,11 +61,10 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
continue
if exclude_patterns and _matches_patterns(absolute, exclude_patterns):
continue
if absolute in seen:
continue
seen.add(absolute)
candidates.append(
Candidate(
existing = by_url.get(absolute)
if existing is None:
ordered_urls.append(absolute)
by_url[absolute] = Candidate(
system_id=system["system_id"],
display_name=system["display_name"],
category=system["category"],
@@ -69,7 +79,17 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
references=unique([absolute]),
raw={"href": absolute, "title": title},
)
)
if len(candidates) >= source.get("max_items", 50):
break
continue
if _is_generic_title(existing.title) and not _is_generic_title(title):
existing.title = title
existing.raw = {"href": absolute, "title": title}
continue
if _is_generic_title(title) and not _is_generic_title(existing.title):
continue
if len(title) > len(existing.title):
existing.title = title
existing.raw = {"href": absolute, "title": title}
for absolute in ordered_urls[: source.get("max_items", 50)]:
candidates.append(by_url[absolute])
return candidates

查看文件

@@ -17,11 +17,33 @@ def _refs(item: Dict[str, Any]) -> List[str]:
return unique(values)
def _list_value(item: Dict[str, Any], *keys: str) -> List[str]:
    """Gather the string values stored under *keys* in *item*.

    Keys are visited in the order given. A non-empty string value is taken
    as-is; a list value contributes ``str(entry)`` for each truthy entry.
    Values of any other type, and missing keys, contribute nothing.

    Returns:
        The gathered strings after passing through the project's ``unique``
        helper (presumably order-preserving de-duplication -- confirm
        against ``intel.utils``).
    """
    collected: List[str] = []
    for name in keys:
        field = item.get(name)
        if isinstance(field, list):
            collected += [str(member) for member in field if member]
        elif isinstance(field, str) and field:
            collected.append(field)
    return unique(collected)
def _title(item: Dict[str, Any], system: Dict[str, Any]) -> str:
for key in ("title", "name", "summary", "issue_id", "cve_id", "id"):
value = item.get(key)
if isinstance(value, str) and value.strip():
return value.strip()
return f"JSON entry for {system['display_name']}"
def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
response = request("GET", source["url"], source=source)
response.raise_for_status()
payload = response.json()
items = payload.get("items") or payload.get("entries") or payload.get("advisories") or []
if isinstance(payload, list):
items = payload
else:
items = payload.get("items") or payload.get("entries") or payload.get("advisories") or []
if not isinstance(items, list):
return []
@@ -31,9 +53,15 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
for item in items[: source.get("max_items", 50)]:
if not isinstance(item, dict):
continue
title = item.get("title") or item.get("name") or item.get("summary") or f"JSON entry for {system['display_name']}"
title = _title(item, system)
link = item.get("url") or item.get("external_url") or item.get("html_url") or source["url"]
summary = item.get("summary") or item.get("content_text") or item.get("description") or ""
summary = (
item.get("summary")
or item.get("content_text")
or item.get("description")
or item.get("details")
or ""
)
if keywords:
haystack = " ".join(filter(None, [title, summary, link])).lower()
if not any(keyword in haystack for keyword in keywords):
@@ -41,6 +69,10 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
refs = _refs(item)
if link and link not in refs:
refs.insert(0, link)
aliases = _list_value(item, "aliases", "id", "issue_id", "cve_id", "ghsa_id", "osv_id")
cve_ids = [value for value in aliases if value.startswith("CVE-")]
ghsa_ids = [value for value in aliases if value.startswith("GHSA-")]
osv_ids = [value for value in aliases if value.startswith("OSV-")]
candidates.append(
Candidate(
system_id=system["system_id"],
@@ -52,11 +84,29 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
source_confidence=source["confidence"],
source_url=link,
title=title,
published_at=item.get("date_published") or item.get("published_at") or item.get("published") or item.get("created_at"),
updated_at=item.get("date_modified") or item.get("updated_at") or item.get("modified") or item.get("updated"),
published_at=(
item.get("date_published")
or item.get("published_at")
or item.get("published")
or item.get("created_at")
or item.get("fix_release_date")
),
updated_at=(
item.get("date_modified")
or item.get("updated_at")
or item.get("modified")
or item.get("updated")
or item.get("fix_release_date")
),
summary=summary,
severity=str(item.get("severity") or "unknown").lower(),
aliases=unique(item.get("aliases", []) or [item.get("id")]),
aliases=aliases,
cve_ids=cve_ids,
ghsa_ids=ghsa_ids,
osv_ids=osv_ids,
affected_versions=_list_value(item, "affected_versions"),
fixed_versions=_list_value(item, "fixed_versions", "fix_versions"),
package_name=item.get("package_name") or item.get("platform"),
references=refs,
raw=item,
)

查看文件

@@ -147,7 +147,12 @@ def probe_source(system: Dict[str, Any], source: Dict[str, Any]) -> Dict[str, An
response = request("GET", source["url"], source=source)
response.raise_for_status()
payload = response.json()
items = payload.get("items") or payload.get("entries") or payload.get("advisories") or []
if isinstance(payload, list):
items = payload
elif isinstance(payload, dict):
items = payload.get("items") or payload.get("entries") or payload.get("advisories") or []
else:
raise ValueError("JSON feed probe returned unsupported payload type")
if not isinstance(items, list):
raise ValueError("JSON feed probe returned non-list items")
return {"kind": kind, "items_seen": len(items)}