Expand intel coverage and refresh monitoring
这个提交包含在:
@@ -14,6 +14,16 @@ from intel.utils import unique
|
||||
|
||||
ANCHOR_RE = re.compile(r"<a[^>]+href=[\"']([^\"']+)[\"'][^>]*>(.*?)</a>", re.IGNORECASE | re.DOTALL)
|
||||
TAG_RE = re.compile(r"<[^>]+>")
|
||||
GENERIC_TITLES = {
|
||||
"permalink",
|
||||
"discuss this topic",
|
||||
"read full topic",
|
||||
"read more",
|
||||
}
|
||||
|
||||
|
||||
def _is_generic_title(title: str) -> bool:
|
||||
return title.strip().lower() in GENERIC_TITLES
|
||||
|
||||
|
||||
def canonicalize_url(url: str) -> str:
|
||||
@@ -37,7 +47,8 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
|
||||
exclude_patterns = parser_hints.get("exclude_url_patterns") or []
|
||||
|
||||
candidates: List[Candidate] = []
|
||||
seen = set()
|
||||
by_url: Dict[str, Candidate] = {}
|
||||
ordered_urls: List[str] = []
|
||||
for href, text in ANCHOR_RE.findall(html):
|
||||
title = unescape(TAG_RE.sub(" ", text)).strip()
|
||||
if not title:
|
||||
@@ -50,11 +61,10 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
|
||||
continue
|
||||
if exclude_patterns and _matches_patterns(absolute, exclude_patterns):
|
||||
continue
|
||||
if absolute in seen:
|
||||
continue
|
||||
seen.add(absolute)
|
||||
candidates.append(
|
||||
Candidate(
|
||||
existing = by_url.get(absolute)
|
||||
if existing is None:
|
||||
ordered_urls.append(absolute)
|
||||
by_url[absolute] = Candidate(
|
||||
system_id=system["system_id"],
|
||||
display_name=system["display_name"],
|
||||
category=system["category"],
|
||||
@@ -69,7 +79,17 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
|
||||
references=unique([absolute]),
|
||||
raw={"href": absolute, "title": title},
|
||||
)
|
||||
)
|
||||
if len(candidates) >= source.get("max_items", 50):
|
||||
break
|
||||
continue
|
||||
if _is_generic_title(existing.title) and not _is_generic_title(title):
|
||||
existing.title = title
|
||||
existing.raw = {"href": absolute, "title": title}
|
||||
continue
|
||||
if _is_generic_title(title) and not _is_generic_title(existing.title):
|
||||
continue
|
||||
if len(title) > len(existing.title):
|
||||
existing.title = title
|
||||
existing.raw = {"href": absolute, "title": title}
|
||||
|
||||
for absolute in ordered_urls[: source.get("max_items", 50)]:
|
||||
candidates.append(by_url[absolute])
|
||||
return candidates
|
||||
|
||||
在新工单中引用
屏蔽一个用户