# csp/scripts/import_kb_learning_resources.py

#!/usr/bin/env python3
"""Collect C++/CSP learning resources from the web, summarize with LLM, and upsert KB articles."""

from __future__ import annotations

import argparse
import html
import json
import os
import re
import sqlite3
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import requests

# HTTP status codes that llm_request treats as transient and retries.
RETRYABLE_HTTP_CODES = {429, 500, 502, 503, 504}
# Default value of the --timeout CLI option (passed to requests as `timeout=`).
DEFAULT_TIMEOUT = 30
# Browser-like User-Agent header sent by fetch_url when downloading source pages.
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/123.0.0.0 Safari/537.36"
)
# Passed as `proxies=` on every requests call in this file.
# NOTE(review): the empty values are presumably meant to bypass any proxy
# configured through environment variables — confirm against requests docs.
NO_PROXY = {"http": "", "https": ""}


@dataclass(frozen=True)
class ResourceSource:
    """One web page used as source material for a track."""
    # Human-readable name; used in the prompt, in the reference list, and as
    # the fallback page title when the page has no usable <title>.
    label: str
    # Address fetched by collect_source_materials.
    url: str


@dataclass(frozen=True)
class TrackSpec:
    """Description of one knowledge-base article to generate."""
    # Unique key of the article in kb_articles; also the value matched by --only.
    slug: str
    # Article title stored in the database and used as the prompt topic.
    title: str
    # Target readers, interpolated into the LLM prompt and the fallback article.
    audience: str
    # Goal of the article, interpolated into the LLM prompt and the fallback article.
    objective: str
    # Candidate pages, in priority order; only the first
    # `max_sources_per_track` entries are fetched.
    sources: tuple[ResourceSource, ...]


# Tracks to generate; main() writes one kb_articles row per TrackSpec.
# Source order matters: collect_source_materials only fetches the first
# `max_sources_per_track` entries of each `sources` tuple (CLI default: 3),
# so later entries are ignored unless --max-sources-per-track is raised.
TRACKS: tuple[TrackSpec, ...] = (
    TrackSpec(
        slug="learning-roadmap-csp",
        title="CSP 学习总路线（C++ 基础 → CSP-J → CSP-S）",
        audience="准备长期学习 CSP 的初中/高中选手与家长",
        objective="给出分阶段目标、周训练节奏、升阶检查清单和环境规范提醒。",
        sources=(
            ResourceSource("NOI 技术规则", "https://www.noi.cn/gynoi/jsgz/"),
            ResourceSource(
                "NOI Linux 与说明文档下载",
                "https://www.noi.cn/gynoi/jsgz/2018-08-21/710467.shtml",
            ),
            ResourceSource(
                "NOI 标准竞赛环境说明（2012）",
                "https://www.noi.cn/gynoi/jsgz/2018-08-13/710466.shtml",
            ),
            ResourceSource("OI Wiki 竞赛路线图", "https://oi-wiki.org/contest/roadmap/"),
            ResourceSource("OI Wiki 竞赛资源", "https://oi-wiki.org/contest/resources/"),
            ResourceSource(
                "cp-algorithms 首页",
                "https://cp-algorithms.com/",
            ),
        ),
    ),
    TrackSpec(
        slug="learning-cpp-basic",
        title="C++ 基础学习资料（面向 CSP）",
        audience="C++ 零基础或语法不稳，准备进入 CSP-J 的同学",
        objective="梳理语法基础、STL 入门、输入输出与 C++14 兼容写法。",
        sources=(
            ResourceSource("cppreference C++ language", "https://en.cppreference.com/w/cpp/language.html"),
            ResourceSource("OI Wiki 语言基础", "https://oi-wiki.org/lang/basic/"),
            ResourceSource("OI Wiki 数组", "https://oi-wiki.org/lang/array/"),
            ResourceSource("OI Wiki 循环", "https://oi-wiki.org/lang/loop/"),
            ResourceSource("OI Wiki 运算符", "https://oi-wiki.org/lang/op/"),
            ResourceSource("OI Wiki C++ 标准库", "https://oi-wiki.org/lang/csl/"),
            ResourceSource("OI Wiki 文件操作", "https://oi-wiki.org/lang/file-op/"),
        ),
    ),
    TrackSpec(
        slug="learning-csp-j",
        title="CSP-J 学习资料与训练路径",
        audience="目标 CSP-J 提高组入门，正在建立算法基础的同学",
        objective="覆盖模拟、枚举、前缀和、基础搜索与基础 DP，给出循序刷题方案。",
        sources=(
            ResourceSource("NOI 技术规则", "https://www.noi.cn/gynoi/jsgz/"),
            ResourceSource("OI Wiki 模拟", "https://oi-wiki.org/basic/simulate/"),
            ResourceSource("OI Wiki 枚举", "https://oi-wiki.org/basic/enumerate/"),
            ResourceSource("OI Wiki 前缀和与差分", "https://oi-wiki.org/basic/prefix-sum/"),
            ResourceSource("OI Wiki 动态规划基础", "https://oi-wiki.org/dp/basic/"),
            ResourceSource("OI Wiki BFS", "https://oi-wiki.org/search/bfs/"),
            ResourceSource("OI Wiki DFS", "https://oi-wiki.org/search/dfs/"),
            ResourceSource("OI Wiki 常见错误", "https://oi-wiki.org/contest/common-mistakes/"),
        ),
    ),
    TrackSpec(
        slug="learning-csp-s",
        title="CSP-S 学习资料与进阶路径",
        audience="目标 CSP-S，已具备 CSP-J 基础并准备系统进阶的同学",
        objective="覆盖数据结构、图论、字符串与 DP 进阶，强调复杂度与工程规范。",
        sources=(
            ResourceSource(
                "NOI 标准竞赛环境说明（2016）",
                "https://www.noi.cn/gynoi/jsgz/2018-08-13/710465.shtml",
            ),
            ResourceSource(
                "NOI 标准竞赛环境说明（2012）",
                "https://www.noi.cn/gynoi/jsgz/2018-08-13/710466.shtml",
            ),
            ResourceSource("OI Wiki 树状数组", "https://oi-wiki.org/ds/fenwick/"),
            ResourceSource("OI Wiki 线段树", "https://oi-wiki.org/ds/seg/"),
            ResourceSource("OI Wiki 最短路", "https://oi-wiki.org/graph/shortest-path/"),
            ResourceSource("OI Wiki 强连通分量", "https://oi-wiki.org/graph/scc/"),
            ResourceSource("OI Wiki 最大流", "https://oi-wiki.org/graph/flow/max-flow/"),
            ResourceSource("OI Wiki 树上 DP", "https://oi-wiki.org/dp/tree/"),
            ResourceSource("OI Wiki KMP", "https://oi-wiki.org/string/kmp/"),
            ResourceSource(
                "cp-algorithms Segment Tree",
                "https://cp-algorithms.com/data_structures/segment_tree.html",
            ),
        ),
    ),
)


def now_sec() -> int:
    """Return the current Unix timestamp truncated to whole seconds."""
    current = time.time()
    return int(current)


def load_dotenv(path: Path) -> None:
    """Populate ``os.environ`` from a simple ``KEY=VALUE`` file.

    A missing file is silently ignored. Blank lines, lines starting with
    ``#`` and lines without ``=`` are skipped. Variables that are already
    present in the environment are never overwritten. Surrounding double
    and single quotes are stripped from each value.
    """
    if not path.exists():
        return

    content = path.read_text(encoding="utf-8", errors="ignore")
    for entry in map(str.strip, content.splitlines()):
        if entry == "" or entry[0] == "#":
            continue
        name, separator, raw_value = entry.partition("=")
        if not separator:
            continue
        name = name.strip()
        if name and name not in os.environ:
            cleaned = raw_value.strip()
            cleaned = cleaned.strip('"')
            cleaned = cleaned.strip("'")
            os.environ[name] = cleaned


def fetch_url(url: str, timeout: int) -> tuple[str, str]:
    """Download *url* and return ``(decoded_body, final_url)``.

    The request is sent with the module's browser-like User-Agent and the
    module-level ``NO_PROXY`` proxy mapping. ``final_url`` is ``resp.url``,
    i.e. the address reported by requests after any redirects.

    Raises:
        RuntimeError: if the response status code is 400 or above.

    Exceptions raised by ``requests.get`` itself are not caught here.
    """
    resp = requests.get(
        url,
        headers={"User-Agent": USER_AGENT},
        timeout=timeout,
        proxies=NO_PROXY,
    )
    # Reject error responses first: reading resp.apparent_encoding runs charset
    # detection over the body, which is wasted work on a page we discard.
    if resp.status_code >= 400:
        raise RuntimeError(f"HTTP {resp.status_code}")
    # Prefer the charset detected from the body over the declared one, keeping
    # the declared encoding only when detection yields nothing.
    resp.encoding = resp.apparent_encoding or resp.encoding
    return resp.text, resp.url


def strip_html(html_text: str, max_chars: int) -> str:
    """Convert an HTML document to compact plain text.

    Steps, in order: drop ``script``/``style``/``noscript``/``svg``/``canvas``
    elements together with their content, turn ``<br>`` and common block-level
    closing tags into line breaks, remove all remaining tags, unescape HTML
    entities, collapse runs of horizontal whitespace, and drop empty lines.

    Args:
        html_text: Raw HTML markup.
        max_chars: Maximum number of characters kept; longer text is cut and
            a trailing line containing ``...`` is appended.

    Returns:
        The extracted text, possibly empty.
    """
    # The closing tag is matched with the backreference \1 so it pairs with the
    # opening tag name. (A doubled backslash inside a raw string would match a
    # literal backslash instead and never fire on real HTML.)
    text = re.sub(r"(?is)<(script|style|noscript|svg|canvas)[^>]*>.*?</\1\s*>", " ", html_text)
    # <br>, <br/> and <br /> become explicit line breaks.
    text = re.sub(r"(?is)<br\s*/?>", "\n", text)
    # Closing block-level tags end a line so paragraphs do not run together.
    text = re.sub(r"(?is)</(p|div|section|article|h1|h2|h3|h4|h5|h6|li|tr|table|ul|ol)>", "\n", text)
    # Every other tag is replaced by a space to keep adjacent words apart.
    text = re.sub(r"(?is)<[^>]+>", " ", text)
    text = html.unescape(text)
    text = text.replace("\r", "\n").replace("\xa0", " ")
    text = re.sub(r"[ \t\f\v]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = "\n".join(line.strip() for line in text.splitlines() if line.strip())
    if len(text) > max_chars:
        text = text[:max_chars].rstrip() + "\n..."
    return text


def extract_title(html_text: str, fallback: str) -> str:
    """Return the text of the first ``<title>`` element, or *fallback*.

    HTML entities are unescaped and all whitespace runs are collapsed to a
    single space. *fallback* is returned when there is no title element or
    its text is empty after normalization.
    """
    # Allow attributes on the opening tag (e.g. <title data-rh="true">) and
    # optional whitespace before the closing bracket of </title>.
    match = re.search(r"(?is)<title\b[^>]*>(.*?)</title\s*>", html_text)
    if match:
        title = re.sub(r"\s+", " ", html.unescape(match.group(1))).strip()
        if title:
            return title
    return fallback


def extract_message_text(content: Any) -> str:
    """Normalize an LLM message ``content`` value to plain text.

    * ``str``: returned stripped.
    * ``dict``: the stripped ``"text"`` entry when it is a string.
    * ``list``: the non-empty string ``"text"`` entries of its dict items,
      stripped and joined with newlines; other items are ignored.
    * anything else: the empty string.
    """
    if isinstance(content, str):
        return content.strip()

    if isinstance(content, dict):
        value = content.get("text")
        return value.strip() if isinstance(value, str) else ""

    if isinstance(content, list):
        candidates = (entry.get("text") for entry in content if isinstance(entry, dict))
        kept = [piece.strip() for piece in candidates if isinstance(piece, str) and piece.strip()]
        return "\n".join(kept).strip()

    return ""


def llm_request(prompt: str, timeout: int, retries: int, retry_sleep_sec: float) -> str:
    """Send *prompt* to the configured LLM endpoint and return its text reply.

    Configuration is read from the environment: ``OI_LLM_API_URL`` (required),
    ``OI_LLM_API_KEY`` (optional, sent as a Bearer token) and ``OI_LLM_MODEL``
    (defaults to ``qwen3-max``).

    Transport errors and the status codes in ``RETRYABLE_HTTP_CODES`` are
    retried up to *retries* attempts, sleeping ``retry_sleep_sec * attempt``
    seconds between attempts. Any other status of 400 or above fails at once.

    The reply text is taken from ``choices[0].message.content`` and, if that
    is empty, from ``choices[0].text``.
    NOTE(review): this looks like an OpenAI-compatible chat completion
    response — confirm against the actual provider.

    Raises:
        RuntimeError: missing URL, exhausted retries, HTTP error, or a
            response without choices/content.
    """
    endpoint = os.getenv("OI_LLM_API_URL", "").strip()
    token = os.getenv("OI_LLM_API_KEY", "").strip()
    model_name = os.getenv("OI_LLM_MODEL", "qwen3-max").strip()
    if not endpoint:
        raise RuntimeError("missing OI_LLM_API_URL")

    request_headers = {"Content-Type": "application/json"}
    if token:
        request_headers["Authorization"] = f"Bearer {token}"

    system_message = {
        "role": "system",
        "content": "你是资深信息学竞赛教练，请严格基于来源内容整理中文 Markdown 学习资料。",
    }
    user_message = {"role": "user", "content": prompt}
    request_body = {
        "model": model_name,
        "stream": False,
        "temperature": 0.2,
        "messages": [system_message, user_message],
    }

    for attempt in range(1, retries + 1):
        out_of_attempts = attempt >= retries
        backoff = retry_sleep_sec * attempt  # linear back-off between attempts

        try:
            resp = requests.post(
                endpoint,
                headers=request_headers,
                json=request_body,
                timeout=timeout,
                proxies=NO_PROXY,
            )
        except requests.RequestException as exc:
            if out_of_attempts:
                raise RuntimeError(f"llm request failed: {exc}") from exc
            time.sleep(backoff)
            continue

        status = resp.status_code
        if status in RETRYABLE_HTTP_CODES:
            if out_of_attempts:
                raise RuntimeError(f"llm retry exhausted: HTTP {status}")
            time.sleep(backoff)
            continue
        if status >= 400:
            # Non-retryable error: include the start of the body for diagnosis.
            raise RuntimeError(f"llm request failed: HTTP {status}: {resp.text[:300]}")

        data = resp.json()
        candidates = data.get("choices") or []
        if not candidates:
            raise RuntimeError("llm response missing choices")

        first_choice = candidates[0] or {}
        chat_message = first_choice.get("message") or {}
        answer = extract_message_text(chat_message.get("content"))
        if not answer:
            answer = extract_message_text(first_choice.get("text"))
        if not answer:
            raise RuntimeError("llm response missing content")
        return answer

    # Only reachable when the loop body never ran (retries < 1).
    raise RuntimeError("llm request failed")


def remove_outer_markdown_fence(text: str) -> str:
    """Strip a code fence that wraps the whole text, if there is one.

    Models often wrap a Markdown answer in a fence opened by a line holding
    only three backticks, optionally followed by ``markdown`` or ``md``.
    When the stripped text starts with such a line and ends with a closing
    fence, the inner content is returned stripped. Otherwise the stripped
    input is returned unchanged; in particular a fence opened with another
    language tag (for example ``cpp``) is left alone.
    """
    raw = text.strip()
    # The opening fence must be followed by a line break so that a language tag
    # such as "cpp" is never mistaken for article content. Single backslashes
    # are required here: in a raw string a doubled backslash would match a
    # literal backslash and the pattern would never match real output.
    match = re.match(
        r"^```(?:markdown|md)?[ \t]*\r?\n([\s\S]*?)\s*```$",
        raw,
        flags=re.IGNORECASE,
    )
    if match:
        return match.group(1).strip()
    return raw


def build_prompt(spec: TrackSpec, source_materials: list[dict[str, str]]) -> str:
    """Assemble the user prompt asking the LLM to write the article for *spec*.

    The prompt contains the track title, audience and objective, the fixed
    list of hard requirements, a bullet list of the sources (label and URL),
    and one excerpt section per source. Each entry of *source_materials*
    must provide the keys ``label``, ``url``, ``title`` and ``snippet``.
    The result is stripped of leading and trailing whitespace.
    """
    link_lines: list[str] = []
    excerpt_blocks: list[str] = []
    for number, material in enumerate(source_materials, start=1):
        link_lines.append(f"- {material['label']}: {material['url']}")
        excerpt_blocks.append(
            "\n".join(
                (
                    f"[来源 {number}] {material['label']}",
                    f"URL: {material['url']}",
                    f"页面标题: {material['title']}",
                    "摘录:",
                    material["snippet"],
                )
            )
        )

    source_lines = "\n".join(link_lines)
    all_snippets = "\n\n".join(excerpt_blocks)

    prompt = f"""
请整理一篇中文 Markdown 学习资料文章，主题：{spec.title}

目标读者：{spec.audience}
目标：{spec.objective}

硬性要求：
1. 只输出 Markdown 正文，不要输出解释、前言、JSON 或代码块外的多余说明。
2. 正文不少于 900 字，内容要具体可执行，不能只给提纲。
3. 内容结构至少包含：
   - 学习目标
   - 知识图谱（按优先级）
   - 分阶段训练计划（建议按周）
   - 常见失分点与避坑清单
   - C++14 / 评测环境规范提醒（明确写出：优先 C++14；避免 C++17 特性；long long 用 %lld；如赛方要求则使用 freopen；main 返回 int 且 return 0）
4. 结尾必须包含“## 参考来源”章节，并且只列出本次给定来源，使用 Markdown 链接。
5. 不要编造具体年份政策细节；对于地方性要求，写“以当年官方通知为准”。
6. 风格要可执行，尽量给出检查清单与训练顺序。

可用来源列表：
{source_lines}

来源摘录：
{all_snippets}
"""
    return prompt.strip()


def fallback_markdown(spec: TrackSpec, source_materials: list[dict[str, str]], error_text: str) -> str:
    """Build a static article used when the LLM call fails.

    The article contains the track's audience and objective, fixed training
    and C++14 reminders, a reference list built from the ``label`` and
    ``url`` of each entry in *source_materials*, and a closing note carrying
    the first 180 characters of *error_text*. The result ends with exactly
    one newline.
    """
    header = [
        f"# {spec.title}",
        "",
        "## 学习目标",
        f"- 读者：{spec.audience}",
        f"- 目标：{spec.objective}",
        "",
        "## 训练建议",
        "- 每周固定做 3~5 道题：先基础题，再专项题，再限时套题。",
        "- 每次训练后补齐错因：读题失误、边界遗漏、复杂度超限、代码规范错误。",
        "- 建议建立个人模板并反复演练输入输出与边界处理。",
        "",
        "## C++14 / 评测环境规范提醒",
        "- 统一按 C++14 编译，避免 C++17 及以上语法。",
        "- `long long` 输入输出优先使用 `%lld`。",
        '- 若赛方要求文件读写，使用 `freopen("xxx.in", "r", stdin)` / `freopen("xxx.out", "w", stdout)`。',
        "- `main` 必须是 `int main()` 且 `return 0;`。",
        "- 地方考区细则每年会更新，务必以当年官方通知为准。",
        "",
        "## 参考来源",
    ]
    references = [f"- [{material['label']}]({material['url']})" for material in source_materials]
    footer = [
        "",
        "> 说明：本条目由自动整理流程生成。",
        f"> LLM 调用失败原因：{error_text[:180]}",
    ]
    document = "\n".join(header + references + footer)
    return document.strip() + "\n"


def upsert_article(conn: sqlite3.Connection, slug: str, title: str, content_md: str, ts: int) -> None:
    """Insert an article into ``kb_articles`` or overwrite the row with the same slug.

    On a slug conflict the title, content and ``created_at`` columns are all
    replaced by the new values. The statement is executed on *conn* without
    committing; the caller is responsible for the commit.
    """
    statement = """
        INSERT INTO kb_articles(slug, title, content_md, created_at)
        VALUES(?, ?, ?, ?)
        ON CONFLICT(slug) DO UPDATE SET
          title=excluded.title,
          content_md=excluded.content_md,
          created_at=excluded.created_at
        """
    parameters = (slug, title, content_md, ts)
    conn.execute(statement, parameters)


def guess_db_path(cli_path: str | None) -> str:
    """Choose the SQLite database file to use.

    A non-empty *cli_path* always wins. Otherwise the first existing path
    among these candidates is returned, in order: the ``CSP_DB_PATH``
    environment variable, ``/data/csp.db``, the Docker volume path, and
    ``data/csp.db`` under the directory two levels above this file. When
    none of them exists, ``/data/csp.db`` is returned.
    """
    if cli_path:
        return cli_path

    default_path = "/data/csp.db"
    env_path = os.getenv("CSP_DB_PATH", "").strip()
    docker_volume_path = "/var/lib/docker/volumes/csp_csp_data/_data/csp.db"
    repo_local_path = str(Path(__file__).resolve().parents[1] / "data" / "csp.db")

    search_order = (env_path, default_path, docker_volume_path, repo_local_path)
    existing = (candidate for candidate in search_order if candidate and Path(candidate).exists())
    return next(existing, default_path)


def collect_source_materials(
    spec: TrackSpec,
    timeout: int,
    max_chars_per_source: int,
    max_sources_per_track: int,
) -> list[dict[str, str]]:
    """Fetch the leading sources of *spec* and return their extracted text.

    Only the first *max_sources_per_track* entries of ``spec.sources`` are
    attempted. Each success yields a dict with the keys ``label``, ``url``
    (the final URL reported by ``fetch_url``), ``title`` and ``snippet``.
    A source that fails for any reason is reported on stderr and skipped.

    Raises:
        RuntimeError: if no source could be collected at all.
    """
    collected: list[dict[str, str]] = []
    selected = spec.sources[:max_sources_per_track]

    for source in selected:
        try:
            page_html, resolved_url = fetch_url(source.url, timeout=timeout)
            page_title = extract_title(page_html, source.label)
            lowered_title = page_title.lower()
            # Some sites answer a missing page with HTTP 200 and a 404 title.
            if "404" in lowered_title and "not found" in lowered_title:
                raise RuntimeError(f"unexpected 404 title: {page_title}")
            excerpt = strip_html(page_html, max_chars=max_chars_per_source)
            if not excerpt:
                raise RuntimeError("empty extracted text")
            record = {
                "label": source.label,
                "url": resolved_url,
                "title": page_title,
                "snippet": excerpt,
            }
            collected.append(record)
            print(f"[fetch] ok: {spec.slug} <- {source.url}")
        except Exception as exc:
            # Best effort: one bad source must not abort the whole track.
            print(f"[fetch] skip: {spec.slug} <- {source.url} ({exc})", file=sys.stderr)

    if collected:
        return collected
    raise RuntimeError(f"no source material collected for {spec.slug}")


def main() -> int:
    """Command-line entry point: generate and store one article per track.

    For every track in ``TRACKS`` (or only the one named by ``--only``) the
    sources are fetched, the LLM is asked for a Markdown article, and the
    result is upserted into ``kb_articles`` and committed. When the LLM call
    fails or returns fewer than 120 characters, the static fallback article
    is stored instead. With ``--dry-run`` nothing is written, although the
    database connection is still opened.

    CLI values are clamped to safe minimums: fetch timeout >= 5, LLM timeout
    >= 15 (twice ``--timeout``), retries >= 1, retry sleep >= 0.2, at least
    600 characters per source and at least one source per track.

    Returns:
        0 when at least one track was processed, otherwise 1.

    Raises:
        RuntimeError: propagated from ``collect_source_materials`` when a
            track yields no source material; remaining tracks are not run.
    """
    parser = argparse.ArgumentParser(description="Import web learning resources into kb_articles")
    parser.add_argument("--db-path", default="", help="SQLite path (default: auto-detect)")
    parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT)
    parser.add_argument("--retries", type=int, default=5)
    parser.add_argument("--retry-sleep-sec", type=float, default=1.5)
    parser.add_argument("--max-chars-per-source", type=int, default=900)
    parser.add_argument("--max-sources-per-track", type=int, default=3)
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument(
        "--only",
        default="",
        help="Only process one track slug, e.g. learning-csp-j",
    )
    args = parser.parse_args()

    repo_root = Path(__file__).resolve().parents[1]
    load_dotenv(repo_root / ".env")

    db_path = guess_db_path(args.db_path or None)
    print(f"[db] using: {db_path}")

    only_slug = args.only.strip()  # loop-invariant, computed once
    processed = 0

    conn = sqlite3.connect(db_path)
    # try/finally guarantees the connection is closed even when a track raises
    # (for example when no source material could be collected).
    try:
        conn.execute("PRAGMA foreign_keys=ON")
        conn.execute("PRAGMA busy_timeout=10000")

        for spec in TRACKS:
            if args.only and spec.slug != only_slug:
                continue

            print(f"[track] start: {spec.slug}")
            materials = collect_source_materials(
                spec,
                timeout=max(5, args.timeout),
                max_chars_per_source=max(600, args.max_chars_per_source),
                max_sources_per_track=max(1, args.max_sources_per_track),
            )

            prompt = build_prompt(spec, materials)
            markdown: str
            try:
                markdown = llm_request(
                    prompt=prompt,
                    timeout=max(15, args.timeout * 2),
                    retries=max(1, args.retries),
                    retry_sleep_sec=max(0.2, args.retry_sleep_sec),
                )
                markdown = remove_outer_markdown_fence(markdown)
                if os.getenv("KB_IMPORT_DEBUG", "").strip():
                    preview = markdown.strip().replace("\n", "\\n")
                    print(
                        f"[llm] raw: {spec.slug} len={len(markdown.strip())} preview={preview[:220]}",
                        file=sys.stderr,
                    )
                # Very short replies are treated as failures, not as articles.
                if len(markdown.strip()) < 120:
                    raise RuntimeError("llm output too short")
                print(f"[llm] ok: {spec.slug}")
            except Exception as exc:
                # Any LLM problem degrades to the static fallback article.
                print(f"[llm] fallback: {spec.slug} ({exc})", file=sys.stderr)
                markdown = fallback_markdown(spec, materials, str(exc))

            content = markdown.strip() + "\n"
            if args.dry_run:
                print(f"[dry-run] {spec.slug}: {len(content)} chars")
                processed += 1
                continue

            upsert_article(conn, spec.slug, spec.title, content, now_sec())
            # Commit per track so earlier articles survive a later failure.
            conn.commit()
            processed += 1
            print(f"[db] upserted: {spec.slug}")
    finally:
        conn.close()

    print(f"[done] processed tracks: {processed}")
    return 0 if processed > 0 else 1


# When run as a script, execute main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())