from __future__ import annotations import re from html import unescape from typing import Any, Dict, List from urllib.parse import urljoin from intel.http_client import request from intel.models import Candidate from intel.utils import unique from .html_links import ANCHOR_RE, TAG_RE, canonicalize_url def _matches(value: str, patterns: List[str]) -> bool: if not patterns: return True return any(re.search(pattern, value, re.IGNORECASE) for pattern in patterns) def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]: response = request("GET", source["url"], source=source) response.raise_for_status() html = response.text parser_hints = source.get("parser_hints") or {} keywords = {kw.lower() for kw in (parser_hints.get("keywords") or source.get("keywords", []))} include_patterns = parser_hints.get("include_url_patterns") or [] exclude_patterns = parser_hints.get("exclude_url_patterns") or [] candidates: List[Candidate] = [] seen = set() for href, text in ANCHOR_RE.findall(html): absolute = canonicalize_url(urljoin(source["url"], href)) title = unescape(TAG_RE.sub(" ", text)).strip() if not title: continue haystack = " ".join(filter(None, [absolute, title])).lower() if keywords and not any(keyword in haystack for keyword in keywords): continue if include_patterns and not _matches(absolute, include_patterns): continue if exclude_patterns and _matches(absolute, exclude_patterns): continue if absolute in seen: continue seen.add(absolute) candidates.append( Candidate( system_id=system["system_id"], display_name=system["display_name"], category=system["category"], advisory_mode=source.get("advisory_mode", "core"), source_kind=source["kind"], source_name=source["name"], source_confidence=source["confidence"], source_url=absolute, title=title, summary="", severity="unknown", references=unique([absolute]), raw={"href": absolute, "title": title}, ) ) if len(candidates) >= source.get("max_items", 50): break return candidates