from __future__ import annotations

import re
from html import unescape
from typing import Any, Dict, List
from urllib.parse import urljoin

from intel.http_client import request
from intel.models import Candidate
from intel.utils import unique

from .html_links import ANCHOR_RE, TAG_RE, canonicalize_url


def _matches(value: str, patterns: List[str]) -> bool:
    if not patterns:
        return True
    return any(re.search(pattern, value, re.IGNORECASE) for pattern in patterns)


def fetch(system: Dict[str, Any], source: Dict[str, Any]) -> List[Candidate]:
    """Fetch ``source["url"]`` and turn the anchor links on it into candidates.

    The page is requested with GET, every ``(href, text)`` pair returned by
    ``ANCHOR_RE.findall`` is resolved against the page URL and canonicalized,
    and a link is kept when it has non-empty text and passes the optional
    keyword, include-pattern and exclude-pattern filters. Each canonical URL
    is emitted at most once.

    Args:
        system: Mapping providing ``system_id``, ``display_name`` and
            ``category``; these are copied onto every candidate.
        source: Mapping providing ``url``, ``kind``, ``name`` and
            ``confidence``. Optional keys:

            * ``parser_hints`` - mapping with ``keywords``,
              ``include_url_patterns`` and ``exclude_url_patterns``.
            * ``keywords`` - used when the hints carry no keywords.
            * ``advisory_mode`` - defaults to ``"core"``.
            * ``max_items`` - upper bound on returned candidates; defaults
              to 50 when missing or ``None``. A value of zero or below
              yields an empty list.

    Returns:
        Up to ``max_items`` candidates, in the order the anchors were found.

    Raises:
        KeyError: If a required key is missing from ``system`` or ``source``.
        Exception: Anything raised by ``request`` or by
            ``response.raise_for_status()`` is propagated unchanged.
    """
    base_url = source["url"]
    response = request("GET", base_url, source=source)
    response.raise_for_status()
    html = response.text

    parser_hints = source.get("parser_hints") or {}
    # ``or []`` on the fallback too: a key that is present but null must not
    # break the comprehension below.
    raw_keywords = parser_hints.get("keywords") or source.get("keywords") or []
    keywords = {kw.lower() for kw in raw_keywords}
    include_patterns = parser_hints.get("include_url_patterns") or []
    exclude_patterns = parser_hints.get("exclude_url_patterns") or []

    # Resolve the limit once; an explicit null means "use the default".
    limit = source.get("max_items")
    if limit is None:
        limit = 50

    candidates: List[Candidate] = []
    seen = set()
    for href, text in ANCHOR_RE.findall(html):
        # Checked before doing any work so a non-positive limit returns
        # nothing instead of a single stray candidate.
        if len(candidates) >= limit:
            break
        # NOTE(review): href is used exactly as captured; whether HTML
        # entities inside it get decoded depends on canonicalize_url - confirm.
        absolute = canonicalize_url(urljoin(base_url, href))
        # Replace nested markup with spaces, then decode entities.
        title = unescape(TAG_RE.sub(" ", text)).strip()
        if not title:
            continue
        # Keywords are matched as lowercase substrings of "URL title".
        haystack = " ".join(filter(None, [absolute, title])).lower()
        if keywords and not any(keyword in haystack for keyword in keywords):
            continue
        if include_patterns and not _matches(absolute, include_patterns):
            continue
        if exclude_patterns and _matches(absolute, exclude_patterns):
            continue
        if absolute in seen:
            continue
        seen.add(absolute)
        candidates.append(
            Candidate(
                system_id=system["system_id"],
                display_name=system["display_name"],
                category=system["category"],
                advisory_mode=source.get("advisory_mode", "core"),
                source_kind=source["kind"],
                source_name=source["name"],
                source_confidence=source["confidence"],
                source_url=absolute,
                title=title,
                summary="",
                severity="unknown",
                references=unique([absolute]),
                raw={"href": absolute, "title": title},
            )
        )
    return candidates