更新: 13 个文件 - 2026-03-18 09:44:57

这个提交包含在:
hao
2026-03-18 09:44:57 -07:00
父节点 91d6f4d04e
当前提交 dc31e6e80f
修改 13 个文件,包含 904 行新增和 52 行删除

查看文件

@@ -3,7 +3,7 @@ from __future__ import annotations
import re
from html import unescape
from typing import Any, Dict, List
from urllib.parse import urljoin
from urllib.parse import urljoin, urlsplit, urlunsplit
import requests
@@ -16,11 +16,25 @@ ANCHOR_RE = re.compile(r"<a[^>]+href=[\"']([^\"']+)[\"'][^>]*>(.*?)</a>", re.IGN
TAG_RE = re.compile(r"<[^>]+>")
def canonicalize_url(url: str) -> str:
    """Return *url* with its fragment removed.

    The URL is split with ``urlsplit`` and reassembled with ``urlunsplit``;
    scheme, netloc, path and query are passed through as parsed, and the
    fragment component is replaced by an empty string. Two links that differ
    only in their ``#fragment`` therefore canonicalize to the same string.

    Args:
        url: Absolute or relative URL string.

    Returns:
        The reassembled URL without a fragment.
    """
    parts = urlsplit(url)
    # Only the fragment is discarded; the query string is kept as-is.
    return urlunsplit((parts.scheme, parts.netloc, parts.path, parts.query, ""))
def _matches_patterns(value: str, patterns: List[str]) -> bool:
    """Report whether *value* matches at least one regex in *patterns*.

    Each pattern is applied with ``re.search`` (so it may match anywhere in
    *value*) and with ``re.IGNORECASE``.

    Args:
        value: The string to test.
        patterns: Regular-expression source strings.

    Returns:
        ``True`` if *patterns* is empty, or if any pattern is found in
        *value*; ``False`` otherwise.
    """
    # An empty pattern list is treated as "no restriction", i.e. a match.
    if not patterns:
        return True
    return any(re.search(pattern, value, re.IGNORECASE) for pattern in patterns)
def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
response = request("GET", source["url"])
response = request("GET", source["url"], source=source)
response.raise_for_status()
html = response.text
keywords = {kw.lower() for kw in source.get("keywords", [])}
parser_hints = source.get("parser_hints") or {}
keywords = {kw.lower() for kw in (parser_hints.get("keywords") or source.get("keywords", []))}
include_patterns = parser_hints.get("include_url_patterns") or []
exclude_patterns = parser_hints.get("exclude_url_patterns") or []
candidates: List[Candidate] = []
seen = set()
@@ -28,10 +42,14 @@ def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
title = unescape(TAG_RE.sub(" ", text)).strip()
if not title:
continue
absolute = urljoin(source["url"], href)
absolute = canonicalize_url(urljoin(source["url"], href))
haystack = f"{title} {absolute}".lower()
if keywords and not any(keyword in haystack for keyword in keywords):
continue
if include_patterns and not _matches_patterns(absolute, include_patterns):
continue
if exclude_patterns and _matches_patterns(absolute, exclude_patterns):
continue
if absolute in seen:
continue
seen.add(absolute)