更新: 109 个文件 - 2026-03-18 10:55:52

这个提交包含在:
hao
2026-03-18 10:55:52 -07:00
父节点 1d5cb533e3
当前提交 1f9d9b1d16
修改 109 个文件，包含 10958 行新增和 1350 行删除

查看文件

@@ -2,6 +2,7 @@ from __future__ import annotations
import re
from html import unescape
from html.parser import HTMLParser
from typing import Any, Dict, List
from urllib.parse import urljoin
@@ -9,7 +10,42 @@ from intel.http_client import request
from intel.models import Candidate
from intel.utils import unique
from .html_links import ANCHOR_RE, TAG_RE, canonicalize_url
from .html_links import canonicalize_url
class _AnchorCollector(HTMLParser):
def __init__(self) -> None:
super().__init__()
self.links: List[tuple[str, str]] = []
self._href: str | None = None
self._chunks: List[str] = []
def handle_starttag(self, tag: str, attrs) -> None:
if tag.lower() != "a":
return
href = dict(attrs).get("href")
if href:
self._href = href
self._chunks = []
def handle_data(self, data: str) -> None:
if self._href is not None:
self._chunks.append(data)
def handle_endtag(self, tag: str) -> None:
if tag.lower() != "a" or self._href is None:
return
text = unescape(" ".join(self._chunks)).strip()
self.links.append((self._href, text))
self._href = None
self._chunks = []
def extract_links(html: str) -> List[tuple[str, str]]:
    """Run *html* through an ``_AnchorCollector`` and return its ``links`` list."""
    collector = _AnchorCollector()
    collector.feed(html)
    # close() makes the parser process any input it is still buffering.
    collector.close()
    return collector.links
def _matches(value: str, patterns: List[str]) -> bool:
@@ -29,9 +65,9 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
candidates: List[Candidate] = []
seen = set()
for href, text in ANCHOR_RE.findall(html):
for href, text in extract_links(html):
absolute = canonicalize_url(urljoin(source["url"], href))
title = unescape(TAG_RE.sub(" ", text)).strip()
title = unescape(text).strip()
if not title:
continue
haystack = " ".join(filter(None, [absolute, title])).lower()