#!/usr/bin/env python3
"""Collect C++/CSP learning resources from the web, summarize with LLM, and upsert KB articles."""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import html
|
||
import json
|
||
import os
|
||
import re
|
||
import sqlite3
|
||
import sys
|
||
import time
|
||
from dataclasses import dataclass
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
import requests
|
||
|
||
# HTTP status codes on which llm_request() sleeps and retries instead of failing.
RETRYABLE_HTTP_CODES = {500, 502, 503, 504, 429}

# Default value of the --timeout command-line option, in seconds.
DEFAULT_TIMEOUT = 30

# User-Agent header sent by fetch_url(); the three product tokens are joined
# with single spaces.
USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (X11; Linux x86_64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/123.0.0.0 Safari/537.36",
    )
)

# Passed as ``proxies=`` to every requests call in this file.
# NOTE(review): presumably meant to bypass proxies configured in the
# environment - confirm against the deployment setup.
NO_PROXY = dict.fromkeys(("http", "https"), "")
|
||
|
||
|
||
@dataclass(frozen=True)
class ResourceSource:
    """One web page that is fetched as source material for a track."""

    # Human-readable name; also the fallback when the page has no usable title.
    label: str
    # Address handed to fetch_url().
    url: str
|
||
|
||
|
||
@dataclass(frozen=True)
class TrackSpec:
    """Immutable description of one knowledge-base article to generate."""

    # Article key; used as the ``slug`` column value and for --only filtering.
    slug: str
    # Article title; stored in the database and interpolated into the prompt.
    title: str
    # Intended readers, interpolated into the prompt and the fallback article.
    audience: str
    # What the article should achieve, interpolated likewise.
    objective: str
    # Pages to fetch, in priority order (only a leading slice is used).
    sources: tuple[ResourceSource, ...]
|
||
|
||
|
||
# All articles this script knows how to build, in processing order.
# Source order matters: collect_source_materials() only takes the first
# ``max_sources_per_track`` entries of each ``sources`` tuple.
TRACKS: tuple[TrackSpec, ...] = (
    TrackSpec(
        slug="learning-roadmap-csp",
        title="CSP 学习总路线(C++ 基础 → CSP-J → CSP-S)",
        audience="准备长期学习 CSP 的初中/高中选手与家长",
        objective="给出分阶段目标、周训练节奏、升阶检查清单和环境规范提醒。",
        sources=(
            ResourceSource(label="NOI 技术规则", url="https://www.noi.cn/gynoi/jsgz/"),
            ResourceSource(
                label="NOI Linux 与说明文档下载",
                url="https://www.noi.cn/gynoi/jsgz/2018-08-21/710467.shtml",
            ),
            ResourceSource(
                label="NOI 标准竞赛环境说明(2012)",
                url="https://www.noi.cn/gynoi/jsgz/2018-08-13/710466.shtml",
            ),
            ResourceSource(label="OI Wiki 竞赛路线图", url="https://oi-wiki.org/contest/roadmap/"),
            ResourceSource(label="OI Wiki 竞赛资源", url="https://oi-wiki.org/contest/resources/"),
            ResourceSource(label="cp-algorithms 首页", url="https://cp-algorithms.com/"),
        ),
    ),
    TrackSpec(
        slug="learning-cpp-basic",
        title="C++ 基础学习资料(面向 CSP)",
        audience="C++ 零基础或语法不稳,准备进入 CSP-J 的同学",
        objective="梳理语法基础、STL 入门、输入输出与 C++14 兼容写法。",
        sources=(
            ResourceSource(
                label="cppreference C++ language",
                url="https://en.cppreference.com/w/cpp/language.html",
            ),
            ResourceSource(label="OI Wiki 语言基础", url="https://oi-wiki.org/lang/basic/"),
            ResourceSource(label="OI Wiki 数组", url="https://oi-wiki.org/lang/array/"),
            ResourceSource(label="OI Wiki 循环", url="https://oi-wiki.org/lang/loop/"),
            ResourceSource(label="OI Wiki 运算符", url="https://oi-wiki.org/lang/op/"),
            ResourceSource(label="OI Wiki C++ 标准库", url="https://oi-wiki.org/lang/csl/"),
            ResourceSource(label="OI Wiki 文件操作", url="https://oi-wiki.org/lang/file-op/"),
        ),
    ),
    TrackSpec(
        slug="learning-csp-j",
        title="CSP-J 学习资料与训练路径",
        audience="目标 CSP-J 提高组入门,正在建立算法基础的同学",
        objective="覆盖模拟、枚举、前缀和、基础搜索与基础 DP,给出循序刷题方案。",
        sources=(
            ResourceSource(label="NOI 技术规则", url="https://www.noi.cn/gynoi/jsgz/"),
            ResourceSource(label="OI Wiki 模拟", url="https://oi-wiki.org/basic/simulate/"),
            ResourceSource(label="OI Wiki 枚举", url="https://oi-wiki.org/basic/enumerate/"),
            ResourceSource(label="OI Wiki 前缀和与差分", url="https://oi-wiki.org/basic/prefix-sum/"),
            ResourceSource(label="OI Wiki 动态规划基础", url="https://oi-wiki.org/dp/basic/"),
            ResourceSource(label="OI Wiki BFS", url="https://oi-wiki.org/search/bfs/"),
            ResourceSource(label="OI Wiki DFS", url="https://oi-wiki.org/search/dfs/"),
            ResourceSource(label="OI Wiki 常见错误", url="https://oi-wiki.org/contest/common-mistakes/"),
        ),
    ),
    TrackSpec(
        slug="learning-csp-s",
        title="CSP-S 学习资料与进阶路径",
        audience="目标 CSP-S,已具备 CSP-J 基础并准备系统进阶的同学",
        objective="覆盖数据结构、图论、字符串与 DP 进阶,强调复杂度与工程规范。",
        sources=(
            ResourceSource(
                label="NOI 标准竞赛环境说明(2016)",
                url="https://www.noi.cn/gynoi/jsgz/2018-08-13/710465.shtml",
            ),
            ResourceSource(
                label="NOI 标准竞赛环境说明(2012)",
                url="https://www.noi.cn/gynoi/jsgz/2018-08-13/710466.shtml",
            ),
            ResourceSource(label="OI Wiki 树状数组", url="https://oi-wiki.org/ds/fenwick/"),
            ResourceSource(label="OI Wiki 线段树", url="https://oi-wiki.org/ds/seg/"),
            ResourceSource(label="OI Wiki 最短路", url="https://oi-wiki.org/graph/shortest-path/"),
            ResourceSource(label="OI Wiki 强连通分量", url="https://oi-wiki.org/graph/scc/"),
            ResourceSource(label="OI Wiki 最大流", url="https://oi-wiki.org/graph/flow/max-flow/"),
            ResourceSource(label="OI Wiki 树上 DP", url="https://oi-wiki.org/dp/tree/"),
            ResourceSource(label="OI Wiki KMP", url="https://oi-wiki.org/string/kmp/"),
            ResourceSource(
                label="cp-algorithms Segment Tree",
                url="https://cp-algorithms.com/data_structures/segment_tree.html",
            ),
        ),
    ),
)
|
||
|
||
|
||
def now_sec() -> int:
    """Return the current Unix timestamp truncated to whole seconds."""
    current = time.time()
    return int(current)
|
||
|
||
|
||
def load_dotenv(path: Path) -> None:
    """Copy ``KEY=VALUE`` pairs from a dotenv-style file into ``os.environ``.

    A missing file is silently ignored. Blank lines, lines starting with
    ``#`` and lines without ``=`` are skipped. Variables that already exist
    in the environment are never overwritten. Surrounding whitespace and any
    leading/trailing double or single quote characters are removed from the
    value.
    """
    if not path.exists():
        return

    content = path.read_text(encoding="utf-8", errors="ignore")
    for entry in map(str.strip, content.splitlines()):
        if not entry or entry.startswith("#"):
            continue
        # Split on the first "=" only, so values may themselves contain "=".
        name, separator, raw_value = entry.partition("=")
        if not separator:
            continue
        name = name.strip()
        if name and name not in os.environ:
            os.environ[name] = raw_value.strip().strip('"').strip("'")
|
||
|
||
|
||
def fetch_url(url: str, timeout: int) -> tuple[str, str]:
    """Download *url* and return ``(decoded_body, final_url)``.

    The request carries the module's browser-like User-Agent and the
    ``NO_PROXY`` proxy mapping. The body is decoded with the encoding
    detected from the content when one is available, otherwise with the
    encoding requests already chose.

    Raises:
        RuntimeError: if the response status is 400 or above.
        requests.RequestException: propagated unchanged on transport errors.
    """
    response = requests.get(
        url,
        headers={"User-Agent": USER_AGENT},
        timeout=timeout,
        proxies=NO_PROXY,
    )
    # Prefer the content-sniffed encoding over the declared one.
    response.encoding = response.apparent_encoding or response.encoding
    status = response.status_code
    if status >= 400:
        raise RuntimeError(f"HTTP {status}")
    return response.text, response.url
|
||
|
||
|
||
def strip_html(html_text: str, max_chars: int) -> str:
    """Reduce an HTML document to compact plain text of bounded length.

    Steps, in order: drop ``script``/``style``/``noscript``/``svg``/``canvas``
    elements including their contents, turn ``<br>`` and block-level closing
    tags into newlines, replace every remaining tag with a space, decode HTML
    entities, collapse runs of horizontal whitespace, and discard blank lines.

    Args:
        html_text: Raw HTML markup.
        max_chars: Maximum number of characters kept before the truncation
            marker is appended.

    Returns:
        The extracted text. When it is longer than *max_chars* it is cut to
        that length, right-stripped, and suffixed with ``"\\n..."``.
    """
    # ``\1`` has to be a genuine backreference to the opening tag name; a
    # doubled backslash inside a raw string would match a literal "\" and the
    # element would never be removed.
    text = re.sub(r"(?is)<(script|style|noscript|svg|canvas)[^>]*>.*?</\1>", " ", html_text)
    # ``\s`` (whitespace class) so that "<br>", "<br/>" and "<br />" all match.
    text = re.sub(r"(?is)<br\s*/?>", "\n", text)
    text = re.sub(r"(?is)</(p|div|section|article|h1|h2|h3|h4|h5|h6|li|tr|table|ul|ol)>", "\n", text)
    text = re.sub(r"(?is)<[^>]+>", " ", text)
    text = html.unescape(text)
    text = text.replace("\r", "\n").replace("\xa0", " ")
    text = re.sub(r"[ \t\f\v]+", " ", text)
    # Dropping empty lines here also removes every run of blank lines, so no
    # separate newline-collapsing pass is needed.
    text = "\n".join(line.strip() for line in text.splitlines() if line.strip())
    if len(text) > max_chars:
        text = text[:max_chars].rstrip() + "\n..."
    return text
|
||
|
||
|
||
def extract_title(html_text: str, fallback: str) -> str:
    """Return the text of the first ``<title>`` element, or *fallback*.

    The opening tag may carry attributes (for example ``<title data-x="1">``).
    HTML entities are decoded and all whitespace runs are collapsed to single
    spaces. *fallback* is returned when no title element is found or when the
    title is empty after cleanup.
    """
    # ``\b[^>]*`` accepts attributes on the tag while still rejecting other
    # element names that merely start with "title".
    found = re.search(r"(?is)<title\b[^>]*>(.*?)</title>", html_text)
    if not found:
        return fallback
    title = html.unescape(found.group(1))
    title = re.sub(r"\s+", " ", title).strip()
    return title or fallback
|
||
|
||
|
||
def extract_message_text(content: Any) -> str:
    """Normalise a chat-completion ``content`` value to a plain string.

    Accepted shapes:
      * ``str`` - returned stripped;
      * ``dict`` - its ``"text"`` entry, stripped, if that entry is a string;
      * ``list`` - the non-blank string ``"text"`` entries of its dict items,
        each stripped and joined with newlines (other items are ignored).

    Anything else yields an empty string.
    """
    if isinstance(content, str):
        return content.strip()

    if isinstance(content, dict):
        value = content.get("text")
        return value.strip() if isinstance(value, str) else ""

    if isinstance(content, list):
        candidates = (part.get("text") for part in content if isinstance(part, dict))
        fragments = [
            candidate.strip()
            for candidate in candidates
            if isinstance(candidate, str) and candidate.strip()
        ]
        return "\n".join(fragments).strip()

    return ""
|
||
|
||
|
||
def llm_request(prompt: str, timeout: int, retries: int, retry_sleep_sec: float) -> str:
    """Send *prompt* to the configured chat-completion endpoint and return its text.

    Configuration is read from the environment: ``OI_LLM_API_URL`` (required),
    ``OI_LLM_API_KEY`` (optional; sent as a Bearer token when set) and
    ``OI_LLM_MODEL`` (defaults to ``"qwen3-max"``).

    Args:
        prompt: User message content.
        timeout: Per-request timeout passed to ``requests.post``.
        retries: Maximum number of attempts.
        retry_sleep_sec: Base delay; attempt *k* sleeps ``retry_sleep_sec * k``
            before the next try.

    Returns:
        The non-empty reply text taken from ``choices[0].message.content`` or,
        failing that, from ``choices[0].text``.

    Raises:
        RuntimeError: if the URL is not configured, the attempts are exhausted,
            the server answers with a non-retryable error status, or the
            response body is not a JSON object with usable content.
    """
    url = os.getenv("OI_LLM_API_URL", "").strip()
    api_key = os.getenv("OI_LLM_API_KEY", "").strip()
    model = os.getenv("OI_LLM_MODEL", "qwen3-max").strip()
    if not url:
        raise RuntimeError("missing OI_LLM_API_URL")

    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    payload = {
        "model": model,
        "stream": False,
        "temperature": 0.2,
        "messages": [
            {
                "role": "system",
                "content": "你是资深信息学竞赛教练,请严格基于来源内容整理中文 Markdown 学习资料。",
            },
            {"role": "user", "content": prompt},
        ],
    }

    for idx in range(1, retries + 1):
        try:
            resp = requests.post(url, headers=headers, json=payload, timeout=timeout, proxies=NO_PROXY)
        except requests.RequestException as exc:
            if idx >= retries:
                raise RuntimeError(f"llm request failed: {exc}") from exc
            time.sleep(retry_sleep_sec * idx)
            continue

        if resp.status_code in RETRYABLE_HTTP_CODES:
            if idx >= retries:
                raise RuntimeError(f"llm retry exhausted: HTTP {resp.status_code}")
            time.sleep(retry_sleep_sec * idx)
            continue

        if resp.status_code >= 400:
            raise RuntimeError(f"llm request failed: HTTP {resp.status_code}: {resp.text[:300]}")

        # A 2xx answer is not guaranteed to be JSON (e.g. an HTML error page
        # from a gateway); report that the same way as every other failure.
        try:
            body = resp.json()
        except ValueError as exc:
            raise RuntimeError(f"llm response is not valid JSON: {resp.text[:300]}") from exc
        if not isinstance(body, dict):
            raise RuntimeError("llm response is not a JSON object")

        choices = body.get("choices")
        if not isinstance(choices, list) or not choices:
            raise RuntimeError("llm response missing choices")

        # Tolerate malformed entries instead of failing with AttributeError.
        first = choices[0] if isinstance(choices[0], dict) else {}
        message = first.get("message")
        content = message.get("content") if isinstance(message, dict) else None

        text = extract_message_text(content)
        if not text:
            # Completion-style responses carry the text directly on the choice.
            text = extract_message_text(first.get("text"))
        if not text:
            raise RuntimeError("llm response missing content")
        return text

    # Only reached when the loop body never ran (retries < 1).
    raise RuntimeError("llm request failed")
|
||
|
||
|
||
def remove_outer_markdown_fence(text: str) -> str:
    """Unwrap *text* when the whole of it is enclosed in one code fence.

    The opening fence may be bare or tagged ``markdown`` / ``md`` (case
    insensitive). When the stripped input starts with such a fence and ends
    with a closing fence, the stripped inner content is returned; otherwise
    the stripped input is returned unchanged.
    """
    raw = text.strip()
    # ``\s`` / ``[\s\S]`` must be regex character classes. With doubled
    # backslashes in a raw string they match literal "\" characters and a
    # normal fenced reply is never unwrapped.
    match = re.match(r"^```(?:markdown|md)?\s*([\s\S]*?)\s*```$", raw, flags=re.IGNORECASE)
    if match:
        return match.group(1).strip()
    return raw
|
||
|
||
|
||
def build_prompt(spec: TrackSpec, source_materials: list[dict[str, str]]) -> str:
    """Assemble the user prompt for one track.

    The prompt contains the track's title, audience and objective, a fixed
    list of hard requirements, a bullet list of the available sources and,
    for each source, a numbered excerpt block.

    Args:
        spec: Track whose ``title``, ``audience`` and ``objective`` are used.
        source_materials: Dicts with the keys ``label``, ``url``, ``title``
            and ``snippet``.

    Returns:
        The prompt text with leading and trailing whitespace removed.
    """
    source_lines = "\n".join(
        f"- {material['label']}: {material['url']}" for material in source_materials
    )

    # One block per source, numbered from 1, separated by blank lines.
    all_snippets = "\n\n".join(
        "\n".join(
            (
                f"[来源 {number}] {material['label']}",
                f"URL: {material['url']}",
                f"页面标题: {material['title']}",
                "摘录:",
                material["snippet"],
            )
        )
        for number, material in enumerate(source_materials, start=1)
    )

    prompt_lines = [
        f"请整理一篇中文 Markdown 学习资料文章,主题:{spec.title}",
        "",
        f"目标读者:{spec.audience}",
        f"目标:{spec.objective}",
        "",
        "硬性要求:",
        "1. 只输出 Markdown 正文,不要输出解释、前言、JSON 或代码块外的多余说明。",
        "2. 正文不少于 900 字,内容要具体可执行,不能只给提纲。",
        "3. 内容结构至少包含:",
        "- 学习目标",
        "- 知识图谱(按优先级)",
        "- 分阶段训练计划(建议按周)",
        "- 常见失分点与避坑清单",
        "- C++14 / 评测环境规范提醒(明确写出:优先 C++14;避免 C++17 特性;long long 用 %lld;如赛方要求则使用 freopen;main 返回 int 且 return 0)",
        "4. 结尾必须包含“## 参考来源”章节,并且只列出本次给定来源,使用 Markdown 链接。",
        "5. 不要编造具体年份政策细节;对于地方性要求,写“以当年官方通知为准”。",
        "6. 风格要可执行,尽量给出检查清单与训练顺序。",
        "",
        "可用来源列表:",
        source_lines,
        "",
        "来源摘录:",
        all_snippets,
    ]
    return "\n".join(prompt_lines).strip()
|
||
|
||
|
||
def fallback_markdown(spec: TrackSpec, source_materials: list[dict[str, str]], error_text: str) -> str:
    """Build a static Markdown article used when the LLM call fails.

    The article has a fixed body (goals, training advice, C++14 / judging
    environment reminders), a reference list with one Markdown link per
    collected source, and a closing note that quotes the first 180 characters
    of *error_text*.

    Returns:
        The article text, stripped and terminated by exactly one newline.
    """
    header = [
        f"# {spec.title}",
        "",
        "## 学习目标",
        f"- 读者:{spec.audience}",
        f"- 目标:{spec.objective}",
        "",
        "## 训练建议",
        "- 每周固定做 3~5 道题:先基础题,再专项题,再限时套题。",
        "- 每次训练后补齐错因:读题失误、边界遗漏、复杂度超限、代码规范错误。",
        "- 建议建立个人模板并反复演练输入输出与边界处理。",
        "",
        "## C++14 / 评测环境规范提醒",
        "- 统一按 C++14 编译,避免 C++17 及以上语法。",
        "- `long long` 输入输出优先使用 `%lld`。",
        '- 若赛方要求文件读写,使用 `freopen("xxx.in", "r", stdin)` / `freopen("xxx.out", "w", stdout)`。',
        "- `main` 必须是 `int main()` 且 `return 0;`。",
        "- 地方考区细则每年会更新,务必以当年官方通知为准。",
        "",
        "## 参考来源",
    ]
    references = [f"- [{material['label']}]({material['url']})" for material in source_materials]
    footer = [
        "",
        "> 说明:本条目由自动整理流程生成。",
        f"> LLM 调用失败原因:{error_text[:180]}",
    ]
    document = "\n".join([*header, *references, *footer])
    return document.strip() + "\n"
|
||
|
||
|
||
def upsert_article(conn: sqlite3.Connection, slug: str, title: str, content_md: str, ts: int) -> None:
    """Insert an article into ``kb_articles`` or overwrite the row with the same slug.

    On a slug conflict the existing row's ``title``, ``content_md`` and
    ``created_at`` are replaced with the new values. The statement is executed
    on *conn* without committing; the caller commits.
    """
    statement = """
        INSERT INTO kb_articles(slug, title, content_md, created_at)
        VALUES(?, ?, ?, ?)
        ON CONFLICT(slug) DO UPDATE SET
            title=excluded.title,
            content_md=excluded.content_md,
            created_at=excluded.created_at
        """
    parameters = (slug, title, content_md, ts)
    conn.execute(statement, parameters)
|
||
|
||
|
||
def guess_db_path(cli_path: str | None) -> str:
    """Choose the SQLite database file to use.

    A non-empty *cli_path* always wins. Otherwise the first existing file
    among the following is returned: the ``CSP_DB_PATH`` environment variable,
    ``/data/csp.db``, the Docker volume path, and ``data/csp.db`` one
    directory above this script's directory. When none exists,
    ``/data/csp.db`` is returned.
    """
    if cli_path:
        return cli_path

    default_path = "/data/csp.db"
    search_order = (
        os.getenv("CSP_DB_PATH", "").strip(),
        default_path,
        "/var/lib/docker/volumes/csp_csp_data/_data/csp.db",
        str(Path(__file__).resolve().parents[1] / "data" / "csp.db"),
    )
    existing = (candidate for candidate in search_order if candidate and Path(candidate).exists())
    return next(existing, default_path)
|
||
|
||
|
||
def collect_source_materials(
    spec: TrackSpec,
    timeout: int,
    max_chars_per_source: int,
    max_sources_per_track: int,
) -> list[dict[str, str]]:
    """Fetch and condense the leading sources of *spec*.

    Only the first *max_sources_per_track* sources are attempted. A source
    that fails for any reason (download error, a title that looks like a
    404 page, no extractable text) is reported on stderr and skipped.

    Returns:
        One dict per successful source with the keys ``label``, ``url`` (the
        final URL reported by the fetch), ``title`` and ``snippet``.

    Raises:
        RuntimeError: if not a single source could be collected.
    """
    collected: list[dict[str, str]] = []
    selected = spec.sources[:max_sources_per_track]

    for source in selected:
        try:
            page_html, resolved_url = fetch_url(source.url, timeout=timeout)
            page_title = extract_title(page_html, source.label)
            lowered = page_title.lower()
            # Some sites answer 200 with an error page; detect it by its title.
            if "404" in lowered and "not found" in lowered:
                raise RuntimeError(f"unexpected 404 title: {page_title}")
            excerpt = strip_html(page_html, max_chars=max_chars_per_source)
            if not excerpt:
                raise RuntimeError("empty extracted text")
            collected.append(
                dict(label=source.label, url=resolved_url, title=page_title, snippet=excerpt)
            )
            print(f"[fetch] ok: {spec.slug} <- {source.url}")
        except Exception as exc:
            # Best effort: one bad source must not abort the whole track.
            print(f"[fetch] skip: {spec.slug} <- {source.url} ({exc})", file=sys.stderr)

    if collected:
        return collected
    raise RuntimeError(f"no source material collected for {spec.slug}")
|
||
|
||
|
||
def main() -> int:
    """Command-line entry point: build every selected track and store it.

    For each track the sources are fetched, an article is requested from the
    LLM, and the result is upserted into ``kb_articles`` and committed. When
    the LLM call fails or returns fewer than 120 characters, a static
    fallback article is stored instead. With ``--dry-run`` nothing is written
    and the database is not opened at all.

    Returns:
        ``0`` when at least one track was processed, ``1`` otherwise.

    Raises:
        RuntimeError: propagated from ``collect_source_materials`` when a
            track yields no source material; the connection is closed first.
    """
    parser = argparse.ArgumentParser(description="Import web learning resources into kb_articles")
    parser.add_argument("--db-path", default="", help="SQLite path (default: auto-detect)")
    parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT)
    parser.add_argument("--retries", type=int, default=5)
    parser.add_argument("--retry-sleep-sec", type=float, default=1.5)
    parser.add_argument("--max-chars-per-source", type=int, default=900)
    parser.add_argument("--max-sources-per-track", type=int, default=3)
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument(
        "--only",
        default="",
        help="Only process one track slug, e.g. learning-csp-j",
    )
    args = parser.parse_args()

    repo_root = Path(__file__).resolve().parents[1]
    load_dotenv(repo_root / ".env")

    db_path = guess_db_path(args.db_path or None)
    print(f"[db] using: {db_path}")

    # sqlite3.connect() creates the file when it does not exist, so a dry run
    # must not open the database.
    conn = None if args.dry_run else sqlite3.connect(db_path)

    processed = 0
    try:
        if conn is not None:
            conn.execute("PRAGMA foreign_keys=ON")
            conn.execute("PRAGMA busy_timeout=10000")

        only_slug = args.only.strip()
        for spec in TRACKS:
            if args.only and spec.slug != only_slug:
                continue

            print(f"[track] start: {spec.slug}")
            # Lower bounds keep obviously unusable CLI values from taking effect.
            materials = collect_source_materials(
                spec,
                timeout=max(5, args.timeout),
                max_chars_per_source=max(600, args.max_chars_per_source),
                max_sources_per_track=max(1, args.max_sources_per_track),
            )

            prompt = build_prompt(spec, materials)
            markdown: str
            try:
                markdown = llm_request(
                    prompt=prompt,
                    timeout=max(15, args.timeout * 2),
                    retries=max(1, args.retries),
                    retry_sleep_sec=max(0.2, args.retry_sleep_sec),
                )
                markdown = remove_outer_markdown_fence(markdown)
                if os.getenv("KB_IMPORT_DEBUG", "").strip():
                    preview = markdown.strip().replace("\n", "\\n")
                    print(
                        f"[llm] raw: {spec.slug} len={len(markdown.strip())} preview={preview[:220]}",
                        file=sys.stderr,
                    )
                if len(markdown.strip()) < 120:
                    raise RuntimeError("llm output too short")
                print(f"[llm] ok: {spec.slug}")
            except Exception as exc:
                # Any LLM problem degrades to the static article rather than
                # leaving the track without content.
                print(f"[llm] fallback: {spec.slug} ({exc})", file=sys.stderr)
                markdown = fallback_markdown(spec, materials, str(exc))

            content = markdown.strip() + "\n"
            if conn is None:
                print(f"[dry-run] {spec.slug}: {len(content)} chars")
                processed += 1
                continue

            upsert_article(conn, spec.slug, spec.title, content, now_sec())
            # Commit per track so earlier articles survive a later failure.
            conn.commit()
            processed += 1
            print(f"[db] upserted: {spec.slug}")
    finally:
        # Release the connection even when a track raises.
        if conn is not None:
            conn.close()

    print(f"[done] processed tracks: {processed}")
    return 0 if processed > 0 else 1
|
||
|
||
|
||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|