feat: rebuild CSP practice workflow, UX and automation

这个提交包含在:
Codex CLI
2026-02-13 15:49:05 +08:00
父节点 d33deed4c5
当前提交 e2ab522b78
修改 105 个文件,包含 15669 行新增、428 行删除

查看文件

@@ -10,10 +10,11 @@ sudo apt-get install -y \
libjsoncpp-dev libyaml-cpp-dev libhiredis-dev \
libpq-dev libmariadb-dev libmariadb-dev-compat \
libsqlite3-dev sqlite3 \
zlib1g-dev libssl-dev libbrotli-dev uuid-dev \
catch2
# Node.js / npm 通常由 NodeSource 预装;这里仅做提示
node -v
npm -v
echo "Bootstrap OK"
echo "Bootstrap OK"

查看文件

@@ -0,0 +1,475 @@
#!/usr/bin/env python3
"""Generate new CSP-J problems with RAG + dedupe checks."""
from __future__ import annotations
import argparse
import json
import math
import os
import random
import re
import sqlite3
import time
from dataclasses import dataclass
from difflib import SequenceMatcher
from typing import Any
from urllib.parse import quote
import requests
DEFAULT_BASE_URL = "https://www.luogu.com.cn"
DEFAULT_TAG_IDS = [343, 82] # CSP-J + NOIP junior
RETRYABLE_HTTP_CODES = {429, 500, 502, 503, 504}
CONTEXT_RE = re.compile(
r'<script[^>]*id="lentille-context"[^>]*>(.*?)</script>', re.DOTALL
)
@dataclass
class ExistingProblem:
    """Minimal projection of a `problems` row used for dedupe comparisons."""

    # Primary key of the problems table.
    id: int
    title: str
    # Full Markdown statement; only a prefix is compared during dedupe.
    statement_md: str
def now_sec() -> int:
    """Return the current Unix time as whole seconds."""
    ts = time.time()
    return int(ts)
def normalize(text: str) -> str:
    """Canonicalize *text* for comparison.

    Lowercases, keeps only ASCII digits/letters, CJK ideographs and
    spaces, and collapses whitespace runs into single spaces.
    """
    lowered = text.lower().strip()
    collapsed = re.sub(r"\s+", " ", lowered)
    cleaned = re.sub(r"[^0-9a-z\u4e00-\u9fff ]+", " ", collapsed)
    return re.sub(r"\s+", " ", cleaned).strip()
def similarity(a: str, b: str) -> float:
    """Fuzzy match ratio in [0, 1] between the normalized forms of a and b.

    Either side being empty short-circuits to 0.0.
    """
    if a and b:
        return SequenceMatcher(None, normalize(a), normalize(b)).ratio()
    return 0.0
def requests_with_retry(url: str, timeout: int, retries: int, sleep_sec: float) -> str:
    """GET *url* and return the response body text.

    Retries connection errors and retryable HTTP codes (429/5xx) up to
    *retries* times with linear backoff (``attempt * sleep_sec`` seconds).
    Raises RuntimeError on any final failure.
    """
    last_error: Exception | None = None
    for i in range(1, retries + 1):
        try:
            resp = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            last_error = exc
            if i < retries:
                time.sleep(i * sleep_sec)
                continue
            raise RuntimeError(f"request failed: {exc}") from exc
        if resp.status_code in RETRYABLE_HTTP_CODES:
            if i < retries:
                time.sleep(i * sleep_sec)
                continue
            raise RuntimeError(f"request failed: HTTP {resp.status_code}")
        if resp.status_code >= 400:
            # Non-retryable client/server error: fail immediately.
            raise RuntimeError(f"request failed: HTTP {resp.status_code}")
        return resp.text
    # Defensive tail: the loop always returns or raises on its final pass,
    # so this is normally unreachable.
    if last_error:
        raise RuntimeError(str(last_error))
    raise RuntimeError("request failed")
def extract_context_json(html_text: str) -> dict[str, Any]:
    """Extract and parse the Luogu ``lentille-context`` JSON payload.

    Raises:
        RuntimeError: when the script tag is missing or its payload is
            not valid JSON. Previously a raw ``json.JSONDecodeError``
            escaped on malformed payloads; it is now wrapped so callers
            handle one exception type, matching the sibling importer
            script's version of this helper.
    """
    match = CONTEXT_RE.search(html_text)
    if not match:
        raise RuntimeError("lentille-context script not found")
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError as exc:
        raise RuntimeError("failed to parse lentille-context json") from exc
def crawl_luogu_titles(base_url: str, timeout: int, retries: int, sleep_sec: float) -> list[str]:
    """Fetch page 1 of the Luogu problem list filtered by DEFAULT_TAG_IDS
    and return all non-empty problem titles found in its context JSON.
    """
    tags_csv = ",".join(str(x) for x in DEFAULT_TAG_IDS)
    url = f"{base_url}/problem/list?type=all&tag={quote(tags_csv)}&page=1"
    text = requests_with_retry(url, timeout=timeout, retries=retries, sleep_sec=sleep_sec)
    ctx = extract_context_json(text)
    # Tolerate missing keys at every level of the response shape.
    result = (((ctx.get("data") or {}).get("problems") or {}).get("result") or [])
    titles: list[str] = []
    for row in result:
        if not isinstance(row, dict):
            continue
        title = str(row.get("title") or "").strip()
        if title:
            titles.append(title)
    return titles
def load_existing(conn: sqlite3.Connection) -> list[ExistingProblem]:
    """Read every stored problem into a list of light-weight records."""
    cur = conn.execute("SELECT id,title,statement_md FROM problems")
    return [
        ExistingProblem(
            id=int(pid),
            title=str(title or ""),
            statement_md=str(body or ""),
        )
        for pid, title, body in cur.fetchall()
    ]
def collect_keywords(existing: list[ExistingProblem], luogu_titles: list[str]) -> list[str]:
    """Mine weighted keyword tokens from problem titles.

    Local titles contribute weight 1 per token, crawled Luogu titles
    weight 2; the top 40 tokens by accumulated weight are returned.
    """
    weights: dict[str, int] = {}
    token_re = re.compile(r"[\s,/|+()\[\]【】-]+")

    def tally(raw: str, weight: int = 1) -> None:
        token = normalize(raw)
        # Skip empties, single characters and pure numbers.
        if not token or len(token) < 2 or token.isdigit():
            return
        weights[token] = weights.get(token, 0) + weight

    for problem in existing:
        for piece in token_re.split(problem.title):
            tally(piece, 1)
    for crawled in luogu_titles:
        for piece in token_re.split(crawled):
            tally(piece, 2)
    ranked = sorted(weights.items(), key=lambda kv: kv[1], reverse=True)
    return [token for token, _ in ranked[:40]]
def llm_generate_problem(prompt: str, timeout: int, retries: int, sleep_sec: float) -> dict[str, Any]:
    """Call the chat-completions endpoint configured via OI_LLM_* env vars
    and parse the assistant reply as a JSON object.

    Transient HTTP failures are retried with linear backoff. Raises
    RuntimeError when OI_LLM_API_URL is unset, retries are exhausted, or
    the reply cannot be parsed into a dict.
    """
    url = os.getenv("OI_LLM_API_URL", "").strip()
    api_key = os.getenv("OI_LLM_API_KEY", "").strip()
    model = os.getenv("OI_LLM_MODEL", "qwen3-max").strip()
    if not url:
        raise RuntimeError("missing OI_LLM_API_URL")
    headers = {"Content-Type": "application/json"}
    if api_key:
        # Bearer auth is optional; keyless gateways are allowed.
        headers["Authorization"] = f"Bearer {api_key}"
    body = {
        "model": model,
        "stream": False,
        "temperature": 0.7,
        "messages": [
            {
                "role": "system",
                "content": "你是 CSP-J 出题人。只输出 JSON,不输出额外解释。",
            },
            {"role": "user", "content": prompt},
        ],
    }
    for i in range(1, retries + 1):
        try:
            resp = requests.post(url, headers=headers, json=body, timeout=timeout)
        except requests.RequestException as exc:
            if i < retries:
                time.sleep(i * sleep_sec)
                continue
            raise RuntimeError(f"llm failed: {exc}") from exc
        if resp.status_code in RETRYABLE_HTTP_CODES:
            if i < retries:
                time.sleep(i * sleep_sec)
                continue
            raise RuntimeError(f"llm failed: HTTP {resp.status_code}")
        if resp.status_code >= 400:
            raise RuntimeError(f"llm failed: HTTP {resp.status_code}: {resp.text[:200]}")
        payload = resp.json()
        content = (((payload.get("choices") or [{}])[0].get("message") or {}).get("content") or "")
        text = str(content).strip()
        if text.startswith("```"):
            # Strip a Markdown code fence (``` or ```json ... ```).
            text = re.sub(r"^```[a-zA-Z0-9_-]*", "", text).strip()
            text = text.removesuffix("```").strip()
        try:
            obj = json.loads(text)
            if isinstance(obj, dict):
                return obj
        except json.JSONDecodeError:
            # Fall back to the widest {...} span in free-form text.
            match = re.search(r"\{[\s\S]*\}", text)
            if match:
                obj = json.loads(match.group(0))
                if isinstance(obj, dict):
                    return obj
        # Parse failure is not retried: the HTTP call itself succeeded.
        raise RuntimeError("llm returned non-json content")
    raise RuntimeError("llm failed")
def fallback_generate_problem(sampled_keywords: list[str], llm_error: str) -> dict[str, Any]:
    """Deterministic template problem used when LLM generation fails.

    The current timestamp perturbs the statement parameters and the
    title so consecutive fallbacks do not collide.
    """
    stamp = now_sec()
    seq_len = 5 + (stamp % 6)
    modulus = 7 + (stamp % 9)
    statement_md = f"""
# 题目描述
给定一个长度为 {seq_len} 的整数序列,你需要统计有多少个连续子段的元素和对 {modulus} 取模后等于 0。
## 输入格式
第一行一个整数 n。
第二行 n 个整数 a_i。
## 输出格式
输出一个整数,表示满足条件的连续子段数量。
## 数据范围
- 1 <= n <= 2e5
- |a_i| <= 1e9
## 提示
可以使用前缀和与计数哈希优化到 O(n)。
""".strip()
    return {
        "title": f"CSP-J 训练题·余数统计 {stamp}",
        "difficulty": 3,
        "statement_md": statement_md,
        "sample_input": "6\n1 2 3 4 5 6\n",
        "sample_output": "3\n",
        "answer": "统计前缀和模 m 的相同值配对数量",
        "explanation": "维护 prefix % m 的出现次数,当前值为 x 时,答案增加 cnt[x],再令 cnt[x]++。",
        "knowledge_points": ["前缀和", "哈希计数", "同余"],
        "tags": ["csp-j", "prefix-sum", "hash"],
        "llm_error": llm_error[:200],
        "rag_keywords": sampled_keywords,
    }
def build_problem_md(obj: dict[str, Any]) -> tuple[str, str, str]:
    """Extract (statement_md, sample_input, sample_output) from an LLM dict.

    When `statement_md` is absent, a statement is assembled from the
    description/input_format/output_format fields instead.
    """
    statement = str(obj.get("statement_md") or "").strip()
    if not statement:
        sections = [
            "# 题目描述",
            str(obj.get("description") or "").strip(),
            "## 输入格式",
            str(obj.get("input_format") or "").strip(),
            "## 输出格式",
            str(obj.get("output_format") or "").strip(),
        ]
        statement = "\n\n".join(sections).strip()
    sample_in = str(obj.get("sample_input") or "").strip()
    sample_out = str(obj.get("sample_output") or "").strip()
    return statement, sample_in, sample_out
def maybe_duplicate(existing: list[ExistingProblem], title: str, statement_md: str, threshold: float) -> tuple[bool, int | None, float]:
    """Score the candidate against every known problem.

    Returns (is_duplicate, best_matching_id, best_score). The score
    blends statement similarity (90%) with title similarity (10%) but
    never drops below the pure title similarity; only statement prefixes
    (1200 chars) are compared for speed.
    """
    best_id: int | None = None
    best_score = 0.0
    for candidate in existing:
        title_score = similarity(title, candidate.title)
        body_score = similarity(statement_md[:1200], candidate.statement_md[:1200])
        blended = max(title_score, body_score * 0.9 + title_score * 0.1)
        if blended > best_score:
            best_score = blended
            best_id = candidate.id
    return best_score >= threshold, best_id, best_score
def insert_problem(conn: sqlite3.Connection, title: str, statement_md: str, sample_input: str, sample_output: str, difficulty: int, profile_json: str, tags: list[str]) -> int:
    """Insert a generated problem plus its tags; returns the new row id.

    The slug is derived from the normalized title, truncated to 50 chars
    and suffixed with the current timestamp for uniqueness. Difficulty is
    clamped to [1, 10]. A single commit at the end lands the problem and
    its tags together.
    """
    ts = now_sec()
    slug_base = normalize(title).replace(" ", "-")
    # Bug fix: the previous pattern r"[^a-z0-9\\-]+" double-escaped the
    # backslash inside a raw string, so the character class also admitted
    # literal backslashes. Only lowercase ASCII, digits and '-' belong.
    slug_base = re.sub(r"[^a-z0-9-]+", "", slug_base)
    if not slug_base:
        slug_base = "cspj-generated"
    slug = f"{slug_base[:50]}-{ts}"
    cur = conn.cursor()
    cur.execute(
        """
        INSERT INTO problems(
            slug,title,statement_md,difficulty,source,statement_url,llm_profile_json,sample_input,sample_output,created_at
        ) VALUES(?,?,?,?,?,?,?,?,?,?)
        """,
        (
            slug,
            title,
            statement_md,
            max(1, min(10, difficulty)),  # clamp into the 1..10 scale
            "llm:cspj-generated",
            "",
            profile_json,
            sample_input,
            sample_output,
            ts,
        ),
    )
    problem_id = int(cur.lastrowid)
    for tag in tags:
        cur.execute(
            "INSERT OR IGNORE INTO problem_tags(problem_id,tag) VALUES(?,?)",
            (problem_id, normalize(tag)),
        )
    conn.commit()
    return problem_id
def main() -> int:
    """CLI entry: crawl keyword seeds, generate problems via the LLM (with
    a deterministic fallback), dedupe against the local corpus, insert,
    and print a JSON report.

    Returns 0 when no generation attempt failed, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="RAG generate CSP-J problems")
    parser.add_argument("--db-path", required=True)
    parser.add_argument("--count", type=int, default=1, help="generate count each run")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    parser.add_argument("--timeout", type=int, default=60)
    parser.add_argument("--retries", type=int, default=4)
    parser.add_argument("--retry-sleep-sec", type=float, default=1.5)
    parser.add_argument("--dedupe-threshold", type=float, default=0.72)
    args = parser.parse_args()
    conn = sqlite3.connect(args.db_path)
    conn.execute("PRAGMA foreign_keys=ON")
    conn.execute("PRAGMA busy_timeout=5000")
    existing = load_existing(conn)
    luogu_titles: list[str] = []
    try:
        luogu_titles = crawl_luogu_titles(
            args.base_url, timeout=args.timeout, retries=args.retries, sleep_sec=args.retry_sleep_sec
        )
    except Exception:
        # Crawling is best-effort; keyword seeding then uses local titles only.
        luogu_titles = []
    keywords = collect_keywords(existing, luogu_titles)
    if not keywords:
        keywords = ["模拟", "枚举", "前缀和", "字符串", "贪心", "搜索"]
    inserted = 0
    skipped_duplicate = 0
    failed = 0
    details: list[dict[str, Any]] = []
    for _ in range(max(1, args.count)):
        sampled_keywords = random.sample(keywords, k=min(8, len(keywords)))
        prompt = f"""
请生成一道原创 CSP-J 风格编程题,难度 2~4,禁止与常见模板题同构。
结合关键词:{', '.join(sampled_keywords)}
输出 JSON
{{
"title": "题目标题",
"difficulty": 2,
"statement_md": "Markdown 题面(含描述、输入格式、输出格式、数据范围)",
"sample_input": "样例输入",
"sample_output": "样例输出",
"answer": "简要答案关键点",
"explanation": "讲解",
"knowledge_points": ["知识点1","知识点2"],
"tags": ["csp-j","入门","..."]
}}
""".strip()
        source = "llm"
        llm_error = ""
        try:
            obj = llm_generate_problem(
                prompt, timeout=args.timeout, retries=args.retries, sleep_sec=args.retry_sleep_sec
            )
        except Exception as exc:
            # Any LLM failure downgrades this attempt to the template.
            source = "fallback"
            llm_error = str(exc)
            obj = fallback_generate_problem(sampled_keywords, llm_error)
        try:
            title = str(obj.get("title") or "").strip()
            if not title:
                raise RuntimeError("generated title is empty")
            difficulty = int(obj.get("difficulty") or 2)
            statement_md, sample_input, sample_output = build_problem_md(obj)
            # Pre-check against the corpus snapshot loaded at startup.
            pre_dup, dup_id, dup_score = maybe_duplicate(
                existing, title, statement_md, args.dedupe_threshold
            )
            if pre_dup:
                skipped_duplicate += 1
                details.append(
                    {
                        "title": title,
                        "status": "skip_pre_duplicate",
                        "source": source,
                        "similar_problem_id": dup_id,
                        "similarity": round(dup_score, 4),
                    }
                )
                continue
            profile = {
                "schema_version": 1,
                "platform": "llm-generated" if source == "llm" else "fallback-generated",
                "difficulty": difficulty,
                "answer": str(obj.get("answer") or ""),
                "explanation": str(obj.get("explanation") or ""),
                "knowledge_points": obj.get("knowledge_points") if isinstance(obj.get("knowledge_points"), list) else [],
                "tags": obj.get("tags") if isinstance(obj.get("tags"), list) else [],
                "generated_at": now_sec(),
                "rag_keywords": sampled_keywords,
            }
            if llm_error:
                profile["llm_error"] = llm_error[:300]
            # Post-check against fresh existing corpus before insert.
            existing_latest = load_existing(conn)
            post_dup, post_dup_id, post_dup_score = maybe_duplicate(
                existing_latest, title, statement_md, args.dedupe_threshold
            )
            if post_dup:
                skipped_duplicate += 1
                details.append(
                    {
                        "title": title,
                        "status": "skip_post_duplicate",
                        "source": source,
                        "similar_problem_id": post_dup_id,
                        "similarity": round(post_dup_score, 4),
                    }
                )
                continue
            tags = profile["tags"] if isinstance(profile["tags"], list) else []
            # Guarantee the csp-j tag and cap the tag count.
            if "csp-j" not in [normalize(str(x)) for x in tags]:
                tags = [*tags, "csp-j"]
            tags = [str(x) for x in tags][:12]
            problem_id = insert_problem(
                conn,
                title=title,
                statement_md=statement_md,
                sample_input=sample_input,
                sample_output=sample_output,
                difficulty=difficulty,
                profile_json=json.dumps(profile, ensure_ascii=False),
                tags=tags,
            )
            inserted += 1
            details.append(
                {"title": title, "status": "inserted", "source": source, "problem_id": problem_id}
            )
            # Keep the in-memory corpus current for intra-run dedupe.
            existing.append(ExistingProblem(problem_id, title, statement_md))
        except Exception as exc:
            failed += 1
            details.append({"status": "failed", "source": source, "error": str(exc)})
    conn.close()
    print(
        json.dumps(
            {
                "db_path": args.db_path,
                "requested_count": max(1, args.count),
                "inserted": inserted,
                "skipped_duplicate": skipped_duplicate,
                "failed": failed,
                "details": details,
                "keyword_sample_size": len(keywords),
            },
            ensure_ascii=False,
            indent=2,
        )
    )
    return 0 if failed == 0 else 1
if __name__ == "__main__":
    # The process exit code mirrors main(): 0 = clean run, 1 = failures.
    raise SystemExit(main())

查看文件

@@ -0,0 +1,324 @@
#!/usr/bin/env python3
"""Asynchronously generate multiple solutions for a problem and store into SQLite."""
from __future__ import annotations
import argparse
import json
import os
import re
import sqlite3
import time
from dataclasses import dataclass
from typing import Any
import requests
RETRYABLE_HTTP_CODES = {500, 502, 503, 504}
@dataclass
class Problem:
    """Snapshot of a `problems` row used to build the solution prompt."""

    # Primary key in the problems table.
    id: int
    title: str
    # Full Markdown statement (truncated when embedded in the prompt).
    statement_md: str
    difficulty: int
    source: str
    sample_input: str
    sample_output: str
def now_sec() -> int:
    """Wall-clock time as an integer number of Unix seconds."""
    return int(time.time())
def extract_json_object(text: str) -> dict[str, Any] | None:
    """Best-effort extraction of a JSON object from an LLM reply.

    Strips Markdown code fences, then tries a direct parse; failing that,
    parses the widest ``{...}`` span in the raw text. Returns None when
    no JSON object can be recovered.
    """
    stripped = text.strip()
    if stripped.startswith("```"):
        # Remove the opening fence (optionally tagged) and a closing fence.
        stripped = re.sub(r"^```[a-zA-Z0-9_-]*", "", stripped).strip()
        stripped = stripped.removesuffix("```").strip()
    try:
        parsed = json.loads(stripped)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed
    span = re.search(r"\{[\s\S]*\}", text)
    if span is None:
        return None
    try:
        candidate = json.loads(span.group(0))
    except json.JSONDecodeError:
        return None
    return candidate if isinstance(candidate, dict) else None
def llm_request(prompt: str, timeout: int, retries: int, sleep_sec: float) -> str:
    """POST *prompt* to the OI_LLM_* chat endpoint and return the raw
    assistant message content.

    Connection errors and retryable status codes are retried with linear
    backoff (``sleep_sec * attempt``). Raises RuntimeError when the URL
    is unset, retries are exhausted, or the payload is malformed.
    """
    url = os.getenv("OI_LLM_API_URL", "").strip()
    api_key = os.getenv("OI_LLM_API_KEY", "").strip()
    model = os.getenv("OI_LLM_MODEL", "qwen3-max").strip()
    if not url:
        raise RuntimeError("missing OI_LLM_API_URL")
    headers = {"Content-Type": "application/json"}
    if api_key:
        # Bearer auth is optional; keyless gateways are allowed.
        headers["Authorization"] = f"Bearer {api_key}"
    body = {
        "model": model,
        "stream": False,
        "temperature": 0.3,
        "messages": [
            {
                "role": "system",
                "content": "你是资深 OI/CSP 教练。严格输出 JSON,不要输出任何额外文本。",
            },
            {"role": "user", "content": prompt},
        ],
    }
    last_error: Exception | None = None
    for i in range(1, retries + 1):
        try:
            resp = requests.post(url, headers=headers, json=body, timeout=timeout)
        except requests.RequestException as exc:
            last_error = exc
            if i < retries:
                time.sleep(sleep_sec * i)
                continue
            raise RuntimeError(f"llm request failed: {exc}") from exc
        if resp.status_code in RETRYABLE_HTTP_CODES:
            if i < retries:
                time.sleep(sleep_sec * i)
                continue
            raise RuntimeError(f"llm retry exhausted: HTTP {resp.status_code}")
        if resp.status_code >= 400:
            raise RuntimeError(f"llm request failed: HTTP {resp.status_code}: {resp.text[:300]}")
        payload = resp.json()
        choices = payload.get("choices") or []
        if not choices:
            raise RuntimeError("llm response missing choices")
        content = ((choices[0] or {}).get("message") or {}).get("content")
        if not content:
            raise RuntimeError("llm response missing content")
        return str(content)
    # Defensive tail; the final attempt always returns or raises above.
    if last_error:
        raise RuntimeError(f"llm request failed: {last_error}") from last_error
    raise RuntimeError("llm request failed")
def fallback_solutions(max_solutions: int) -> list[dict[str, Any]]:
    """Canned solution skeletons used when the LLM is unavailable.

    Returns at least one and at most two template entries, truncated to
    *max_solutions* (floored at 1).
    """
    skeleton_cpp = (
        "// TODO: 请根据题意补全\n#include <bits/stdc++.h>\nusing namespace std;\n"
        "int main(){ios::sync_with_stdio(false);cin.tie(nullptr);return 0;}\n"
    )
    templates = [
        {
            "title": "解法一:直接模拟/枚举",
            "idea_md": "按题意拆分步骤,先写可过样例的直观解法,再补边界处理。",
            "explanation_md": "适用于数据范围较小或规则清晰的题。",
            "complexity": "时间复杂度依题而定,通常 O(n)~O(n^2)",
            "code_cpp": skeleton_cpp,
            "tags": ["simulation", "implementation"],
        },
        {
            "title": "解法二:优化思路(前缀/贪心/DP 视题而定)",
            "idea_md": "分析状态与重复计算,尝试用前缀和、贪心或动态规划优化。",
            "explanation_md": "比直接模拟更稳定,通常能覆盖更大数据规模。",
            "complexity": "通常优于朴素解法",
            "code_cpp": skeleton_cpp,
            "tags": ["optimization", "dp"],
        },
    ]
    return templates[: max(1, max_solutions)]
def load_problem(conn: sqlite3.Connection, problem_id: int) -> Problem:
    """Fetch one problem row by id; raises RuntimeError when absent.

    NULL columns are coerced to safe defaults ("" / difficulty 1).
    """
    row = conn.execute(
        "SELECT id,title,statement_md,difficulty,source,sample_input,sample_output FROM problems WHERE id=?",
        (problem_id,),
    ).fetchone()
    if row is None:
        raise RuntimeError(f"problem not found: {problem_id}")
    pid, title, statement, difficulty, src, s_in, s_out = row
    return Problem(
        id=int(pid),
        title=str(title or ""),
        statement_md=str(statement or ""),
        difficulty=int(difficulty or 1),
        source=str(src or ""),
        sample_input=str(s_in or ""),
        sample_output=str(s_out or ""),
    )
def update_job(conn: sqlite3.Connection, job_id: int, **fields: Any) -> None:
    """Apply column=value pairs to one problem_solution_jobs row.

    No-op when *fields* is empty. Column names are interpolated into the
    SQL (callers pass literal keyword names, never user input); values go
    through bound parameters. Commits immediately.
    """
    if not fields:
        return
    assignments = ", ".join(f"{name}=?" for name in fields)
    params = [*fields.values(), job_id]
    conn.execute(
        f"UPDATE problem_solution_jobs SET {assignments} WHERE id=?",
        tuple(params),
    )
    conn.commit()
def store_solutions(conn: sqlite3.Connection, problem_id: int, rows: list[dict[str, Any]], source: str) -> int:
    """Replace all stored solutions for *problem_id* with *rows*.

    Rows with duplicate titles are skipped; the 1-based position within
    *rows* is kept as the variant number even when earlier entries were
    skipped. Returns the number of rows written.
    """
    ts = now_sec()
    conn.execute("DELETE FROM problem_solutions WHERE problem_id=?", (problem_id,))
    saved = 0
    seen_titles: set[str] = set()
    for variant, row in enumerate(rows, start=1):
        title = str(row.get("title") or f"解法 {variant}").strip()
        if title in seen_titles:
            continue
        seen_titles.add(title)
        tags = row.get("tags") if isinstance(row.get("tags"), list) else []
        conn.execute(
            """
            INSERT INTO problem_solutions(
                problem_id,variant,title,idea_md,explanation_md,code_cpp,complexity,tags_json,source,created_at,updated_at
            ) VALUES(?,?,?,?,?,?,?,?,?,?,?)
            """,
            (
                problem_id,
                variant,
                title,
                str(row.get("idea_md") or "").strip(),
                str(row.get("explanation_md") or "").strip(),
                str(row.get("code_cpp") or "").strip(),
                str(row.get("complexity") or "").strip(),
                json.dumps(tags, ensure_ascii=False),
                source,
                ts,
                ts,
            ),
        )
        saved += 1
    conn.commit()
    return saved
def main() -> int:
    """CLI entry: generate multi-approach solutions for one problem while
    tracking progress in its problem_solution_jobs row.

    Always marks the job completed/failed before exiting; returns 0 on
    success, 1 on failure.
    """
    parser = argparse.ArgumentParser(description="Generate multi-solution explanations")
    parser.add_argument("--db-path", required=True)
    parser.add_argument("--problem-id", type=int, required=True)
    parser.add_argument("--job-id", type=int, required=True)
    parser.add_argument("--max-solutions", type=int, default=3)
    parser.add_argument("--timeout", type=int, default=90)
    parser.add_argument("--retries", type=int, default=4)
    parser.add_argument("--retry-sleep-sec", type=float, default=1.5)
    args = parser.parse_args()
    conn = sqlite3.connect(args.db_path)
    conn.execute("PRAGMA foreign_keys=ON")
    conn.execute("PRAGMA busy_timeout=5000")
    ts = now_sec()
    update_job(
        conn,
        args.job_id,
        status="running",
        progress=1,
        message="starting",
        started_at=ts,
        updated_at=ts,
    )
    try:
        problem = load_problem(conn, args.problem_id)
        prompt = f"""
请为下面这道 CSP 题生成 {max(1, min(5, args.max_solutions))} 种不同思路的题解(可从不同角度切入,例如模拟/贪心/DP/数据结构),并给出 C++ 参考代码。
输出 JSON,格式固定
{{
"solutions": [
{{
"title": "解法标题",
"idea_md": "思路要点Markdown",
"explanation_md": "详细讲解Markdown",
"complexity": "时间/空间复杂度",
"code_cpp": "完整 C++17 代码",
"tags": ["标签1","标签2"]
}}
]
}}
题目:{problem.title}
难度:{problem.difficulty}
来源:{problem.source}
题面:
{problem.statement_md[:12000]}
样例输入:
{problem.sample_input[:1200]}
样例输出:
{problem.sample_output[:1200]}
""".strip()
        update_job(conn, args.job_id, progress=25, message="requesting llm", updated_at=now_sec())
        source = "fallback"
        solutions: list[dict[str, Any]]
        try:
            content = llm_request(
                prompt,
                timeout=args.timeout,
                retries=args.retries,
                sleep_sec=args.retry_sleep_sec,
            )
            obj = extract_json_object(content)
            raw = obj.get("solutions") if isinstance(obj, dict) else None
            if not isinstance(raw, list) or len(raw) == 0:
                raise RuntimeError("llm response missing solutions array")
            solutions = [x for x in raw if isinstance(x, dict)]
            if not solutions:
                raise RuntimeError("llm response has empty valid solutions")
            source = "llm"
        except Exception:
            # Any LLM problem downgrades to the canned templates.
            solutions = fallback_solutions(args.max_solutions)
        # Clamp to at most 5 solutions regardless of what came back.
        solutions = solutions[: max(1, min(5, args.max_solutions))]
        update_job(conn, args.job_id, progress=70, message="writing solutions", updated_at=now_sec())
        saved = store_solutions(conn, args.problem_id, solutions, source)
        update_job(
            conn,
            args.job_id,
            status="completed",
            progress=100,
            message=f"completed: {saved} solutions ({source})",
            finished_at=now_sec(),
            updated_at=now_sec(),
        )
        conn.close()
        return 0
    except Exception as exc:
        update_job(
            conn,
            args.job_id,
            status="failed",
            progress=100,
            message=f"failed: {str(exc)[:400]}",
            finished_at=now_sec(),
            updated_at=now_sec(),
        )
        conn.close()
        return 1
if __name__ == "__main__":
    # Exit code mirrors main(): 0 = job completed, 1 = job failed.
    raise SystemExit(main())

904
scripts/import_luogu_csp.py 普通文件
查看文件

@@ -0,0 +1,904 @@
#!/usr/bin/env python3
"""Import Luogu CSP-J/S beginner problem set into local SQLite."""
from __future__ import annotations
import argparse
import json
import math
import re
import sqlite3
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from typing import Any
from urllib.parse import quote
import requests
DEFAULT_BASE_URL = "https://www.luogu.com.cn"
DEFAULT_TAG_IDS = [343, 342, 82, 83] # CSP-J, CSP-S, NOIP-junior, NOIP-senior
RETRYABLE_STATUS = {429, 500, 502, 503, 504}
CONTEXT_RE = re.compile(
r'<script[^>]*id="lentille-context"[^>]*>(.*?)</script>', re.DOTALL
)
@dataclass
class LuoguListItem:
    """One row parsed from the Luogu problem-list context JSON."""

    # Luogu problem id, e.g. "P1001".
    pid: str
    title: str
    difficulty: int
    # Tag ids as delivered by the list payload.
    tags: list[int]
    total_submit: int
    total_accepted: int
    # Problem type discriminator as given by Luogu.
    type: str
@dataclass
class UpsertRecord:
    """Fully-prepared problem payload consumed by upsert_problem()."""

    # Unique key deciding insert vs. update.
    slug: str
    title: str
    statement_md: str
    difficulty: int
    source: str
    statement_url: str
    # Serialized profile dict (JSON text).
    llm_profile_json: str
    sample_input: str
    sample_output: str
    # Tag strings written into problem_tags.
    tags: list[str]
def now_sec() -> int:
    """Integer Unix timestamp in seconds."""
    epoch = time.time()
    return int(epoch)
def requests_retry_text(
    session: requests.Session,
    url: str,
    *,
    timeout: int,
    retries: int,
    sleep_sec: float,
) -> str:
    """GET *url* through *session* and return the response body text.

    Connection errors and retryable status codes (429/5xx) are retried
    with linear backoff (``sleep_sec * attempt``); other >= 400 statuses
    fail immediately. Raises RuntimeError on final failure.
    """
    last_error: Exception | None = None
    for attempt in range(1, retries + 1):
        try:
            resp = session.get(url, timeout=timeout)
        except requests.RequestException as exc:
            last_error = exc
            if attempt < retries:
                time.sleep(sleep_sec * attempt)
                continue
            raise RuntimeError(f"GET failed: {url}: {exc}") from exc
        if resp.status_code in RETRYABLE_STATUS:
            if attempt < retries:
                time.sleep(sleep_sec * attempt)
                continue
            raise RuntimeError(f"GET failed after retry: {url}: {resp.status_code}")
        if resp.status_code >= 400:
            raise RuntimeError(f"GET failed: {url}: {resp.status_code}")
        return resp.text
    # Defensive tail; the loop always returns or raises on its last pass.
    if last_error:
        raise RuntimeError(f"GET failed: {url}: {last_error}") from last_error
    raise RuntimeError(f"GET failed: {url}: unknown error")
def extract_context_json(html_text: str) -> dict[str, Any]:
    """Locate the ``lentille-context`` script payload and decode it.

    Raises RuntimeError when the tag is absent or its payload is invalid
    JSON, so callers deal with a single exception type.
    """
    found = CONTEXT_RE.search(html_text)
    if found is None:
        raise RuntimeError("lentille-context script not found")
    try:
        return json.loads(found.group(1))
    except json.JSONDecodeError as exc:
        raise RuntimeError("failed to parse lentille-context json") from exc
def parse_tag_ids(raw: str) -> list[int]:
    """Parse a comma-separated id list such as "343, 82" into ints.

    Blank segments are skipped; an entirely empty result raises
    ValueError (as does any non-numeric segment, via int()).
    """
    ids = [int(piece) for piece in (part.strip() for part in raw.split(",")) if piece]
    if not ids:
        raise ValueError("at least one tag id is required")
    return ids
def normalize_tag(text: str) -> str:
    """Slugify a tag: lowercase ASCII runs joined by single hyphens.

    Tags containing no ASCII alphanumerics at all (e.g. pure CJK) fall
    back to the stripped original text.
    """
    slug = re.sub(r"[^a-z0-9]+", "-", text.strip().lower()).strip("-")
    return slug if slug else text.strip()
def ensure_problem_columns(conn: sqlite3.Connection) -> None:
    """Add newer optional columns to `problems` when missing (idempotent)."""
    cur = conn.cursor()
    cur.execute("PRAGMA table_info(problems)")
    present = {str(info[1]) for info in cur.fetchall()}
    migrations = (
        ("sample_input", "ALTER TABLE problems ADD COLUMN sample_input TEXT NOT NULL DEFAULT ''"),
        ("sample_output", "ALTER TABLE problems ADD COLUMN sample_output TEXT NOT NULL DEFAULT ''"),
        ("statement_url", "ALTER TABLE problems ADD COLUMN statement_url TEXT NOT NULL DEFAULT ''"),
        ("llm_profile_json", "ALTER TABLE problems ADD COLUMN llm_profile_json TEXT NOT NULL DEFAULT '{}'"),
    )
    for column, ddl in migrations:
        if column not in present:
            cur.execute(ddl)
    conn.commit()
def ensure_core_tables(conn: sqlite3.Connection) -> None:
    """Create the problems/problem_tags tables and tag index if absent."""
    ddl_statements = (
        """
        CREATE TABLE IF NOT EXISTS problems (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            slug TEXT NOT NULL UNIQUE,
            title TEXT NOT NULL,
            statement_md TEXT NOT NULL,
            difficulty INTEGER NOT NULL DEFAULT 1,
            source TEXT NOT NULL DEFAULT '',
            statement_url TEXT NOT NULL DEFAULT '',
            llm_profile_json TEXT NOT NULL DEFAULT '{}',
            sample_input TEXT NOT NULL DEFAULT '',
            sample_output TEXT NOT NULL DEFAULT '',
            created_at INTEGER NOT NULL
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS problem_tags (
            problem_id INTEGER NOT NULL,
            tag TEXT NOT NULL,
            PRIMARY KEY(problem_id, tag)
        )
        """,
        "CREATE INDEX IF NOT EXISTS idx_problem_tags_tag ON problem_tags(tag)",
    )
    for ddl in ddl_statements:
        conn.execute(ddl)
    conn.commit()
def ensure_import_tables(conn: sqlite3.Connection) -> None:
    """Create the import-job bookkeeping tables and indexes if absent.

    NOTE: `trigger` works unquoted as a column name here — it is one of
    SQLite's fallback keywords.
    """
    ddl_statements = (
        """
        CREATE TABLE IF NOT EXISTS import_jobs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            status TEXT NOT NULL,
            trigger TEXT NOT NULL DEFAULT 'manual',
            total_count INTEGER NOT NULL DEFAULT 0,
            processed_count INTEGER NOT NULL DEFAULT 0,
            success_count INTEGER NOT NULL DEFAULT 0,
            failed_count INTEGER NOT NULL DEFAULT 0,
            options_json TEXT NOT NULL DEFAULT '{}',
            last_error TEXT NOT NULL DEFAULT '',
            started_at INTEGER NOT NULL,
            finished_at INTEGER,
            updated_at INTEGER NOT NULL,
            created_at INTEGER NOT NULL
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS import_job_items (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            job_id INTEGER NOT NULL,
            source_path TEXT NOT NULL,
            status TEXT NOT NULL DEFAULT 'queued',
            title TEXT NOT NULL DEFAULT '',
            difficulty INTEGER NOT NULL DEFAULT 0,
            problem_id INTEGER,
            error_text TEXT NOT NULL DEFAULT '',
            started_at INTEGER,
            finished_at INTEGER,
            updated_at INTEGER NOT NULL,
            created_at INTEGER NOT NULL,
            UNIQUE(job_id, source_path)
        )
        """,
        "CREATE INDEX IF NOT EXISTS idx_import_jobs_created_at ON import_jobs(created_at DESC)",
        "CREATE INDEX IF NOT EXISTS idx_import_job_items_job_status "
        "ON import_job_items(job_id, status, updated_at DESC)",
    )
    for ddl in ddl_statements:
        conn.execute(ddl)
    conn.commit()
def create_import_job(
    conn: sqlite3.Connection, trigger: str, total_count: int, options_json: str
) -> int:
    """Insert a new import job in state 'running' and return its row id."""
    started = now_sec()
    values = (
        "running",            # status
        trigger or "manual",  # empty trigger falls back to 'manual'
        total_count,
        0,                    # processed_count
        0,                    # success_count
        0,                    # failed_count
        options_json,
        "",                   # last_error
        started,              # started_at
        None,                 # finished_at: still running
        started,              # updated_at
        started,              # created_at
    )
    cursor = conn.cursor()
    cursor.execute(
        """
        INSERT INTO import_jobs(
            status,trigger,total_count,processed_count,success_count,failed_count,
            options_json,last_error,started_at,finished_at,updated_at,created_at
        ) VALUES(?,?,?,?,?,?,?,?,?,?,?,?)
        """,
        values,
    )
    conn.commit()
    return int(cursor.lastrowid)
def seed_import_items(
    conn: sqlite3.Connection, job_id: int, items: list[LuoguListItem]
) -> None:
    """Bulk-insert queued item rows for *job_id*, one per Luogu pid.

    INSERT OR IGNORE plus the UNIQUE(job_id, source_path) constraint
    keeps re-seeding idempotent.
    """
    created = now_sec()
    rows = [
        (job_id, item.pid, "queued", "", 0, None, "", None, None, created, created)
        for item in items
    ]
    conn.cursor().executemany(
        """
        INSERT OR IGNORE INTO import_job_items(
            job_id,source_path,status,title,difficulty,problem_id,error_text,
            started_at,finished_at,updated_at,created_at
        ) VALUES(?,?,?,?,?,?,?,?,?,?,?)
        """,
        rows,
    )
    conn.commit()
def update_import_item_success(
    conn: sqlite3.Connection,
    job_id: int,
    source_path: str,
    title: str,
    difficulty: int,
    problem_id: int,
    note: str = "",
) -> None:
    """Mark one import item as successfully processed.

    started_at is only backfilled when it was never set; error_text
    doubles as a slot for an optional informational *note*.
    """
    stamp = now_sec()
    params = (title, difficulty, problem_id, note, stamp, stamp, stamp, job_id, source_path)
    conn.execute(
        """
        UPDATE import_job_items
        SET status='success',
            title=?,
            difficulty=?,
            problem_id=?,
            error_text=?,
            started_at=COALESCE(started_at, ?),
            finished_at=?,
            updated_at=?
        WHERE job_id=? AND source_path=?
        """,
        params,
    )
    conn.commit()
def update_import_item_failed(
    conn: sqlite3.Connection, job_id: int, source_path: str, error_text: str
) -> None:
    """Mark one import item failed, truncating the error text to 500 chars."""
    stamp = now_sec()
    params = (error_text[:500], stamp, stamp, stamp, job_id, source_path)
    conn.execute(
        """
        UPDATE import_job_items
        SET status='failed',
            error_text=?,
            started_at=COALESCE(started_at, ?),
            finished_at=?,
            updated_at=?
        WHERE job_id=? AND source_path=?
        """,
        params,
    )
    conn.commit()
def update_import_job_progress(
    conn: sqlite3.Connection,
    job_id: int,
    processed_count: int,
    success_count: int,
    failed_count: int,
    last_error: str,
) -> None:
    """Refresh a running job's counters and last_error (truncated to 500)."""
    stamp = now_sec()
    conn.execute(
        """
        UPDATE import_jobs
        SET processed_count=?,
            success_count=?,
            failed_count=?,
            last_error=?,
            updated_at=?
        WHERE id=?
        """,
        (processed_count, success_count, failed_count, last_error[:500], stamp, job_id),
    )
    conn.commit()
def finish_import_job(
    conn: sqlite3.Connection,
    job_id: int,
    success_count: int,
    failed_count: int,
    last_error: str,
) -> None:
    """Close out a job with final counters, status and finish timestamp.

    Status is 'completed' only for an error-free run, otherwise
    'completed_with_errors'; processed_count is forced to total_count.
    """
    stamp = now_sec()
    final_status = "completed_with_errors" if failed_count else "completed"
    conn.execute(
        """
        UPDATE import_jobs
        SET status=?,
            processed_count=total_count,
            success_count=?,
            failed_count=?,
            last_error=?,
            finished_at=?,
            updated_at=?
        WHERE id=?
        """,
        (final_status, success_count, failed_count, last_error[:500], stamp, stamp, job_id),
    )
    conn.commit()
def upsert_problem(conn: sqlite3.Connection, rec: UpsertRecord) -> tuple[int, bool]:
    """Insert or refresh a problem row keyed by slug; rewrite its tags.

    Returns (problem_id, inserted) where *inserted* is True for a brand
    new row. created_at is only written on first insert.
    """
    cur = conn.cursor()
    cur.execute("SELECT id FROM problems WHERE slug=?", (rec.slug,))
    found = cur.fetchone()
    inserted = found is None
    if inserted:
        cur.execute(
            """
            INSERT INTO problems(
                slug,title,statement_md,difficulty,source,statement_url,llm_profile_json,
                sample_input,sample_output,created_at
            ) VALUES(?,?,?,?,?,?,?,?,?,?)
            """,
            (
                rec.slug,
                rec.title,
                rec.statement_md,
                rec.difficulty,
                rec.source,
                rec.statement_url,
                rec.llm_profile_json,
                rec.sample_input,
                rec.sample_output,
                now_sec(),
            ),
        )
        problem_id = int(cur.lastrowid)
    else:
        problem_id = int(found[0])
        cur.execute(
            """
            UPDATE problems
            SET title=?,statement_md=?,difficulty=?,source=?,statement_url=?,
                llm_profile_json=?,sample_input=?,sample_output=?
            WHERE id=?
            """,
            (
                rec.title,
                rec.statement_md,
                rec.difficulty,
                rec.source,
                rec.statement_url,
                rec.llm_profile_json,
                rec.sample_input,
                rec.sample_output,
                problem_id,
            ),
        )
    # Tags are replaced wholesale on every upsert.
    cur.execute("DELETE FROM problem_tags WHERE problem_id=?", (problem_id,))
    for tag in rec.tags:
        cur.execute(
            "INSERT OR IGNORE INTO problem_tags(problem_id,tag) VALUES(?,?)",
            (problem_id, tag),
        )
    conn.commit()
    return problem_id, inserted
def markdown_to_absolute(base_url: str, text: str) -> str:
    """Rewrite root-relative Markdown link/image targets to absolute URLs.

    Every ``](/...`` becomes ``](<base_url>/...``. This single pass also
    covers image syntax ``![](/...)`` because it contains the same
    ``](/`` substring, so the previous second pass that targeted
    ``![](/`` could never match anything after the first pass and has
    been removed as dead code.

    NOTE(review): protocol-relative targets like ``](//host/...`` are
    also rewritten — identical to the previous behavior.
    """
    if not text:
        return ""
    return re.sub(r"\]\(/", f"]({base_url}/", text)
def build_statement_md(base_url: str, pid: str, detail: dict[str, Any]) -> str:
    """Assemble the Markdown statement for one Luogu problem detail dict.

    Sections (background / description / IO formats / hint) are emitted
    only when non-empty; relative links are made absolute first.
    """
    content = detail.get("content") or {}
    title = str(detail.get("title") or pid).strip()

    def section(key: str) -> str:
        return markdown_to_absolute(base_url, str(content.get(key) or "").strip())

    parts = [
        f"# {pid} {title}",
        "",
        "- Source: Luogu",
        f"- Problem URL: {base_url}/problem/{pid}",
    ]
    for heading, body in (
        ("Background", section("background")),
        ("Description", section("description")),
        ("Input Format", section("formatI")),
        ("Output Format", section("formatO")),
        ("Hint", section("hint")),
    ):
        if body:
            parts += ["", f"## {heading}", "", body]
    return "\n".join(parts).strip()
def build_record(
    base_url: str,
    list_item: LuoguListItem,
    detail: dict[str, Any],
    tag_catalog: dict[int, dict[str, Any]],
) -> UpsertRecord:
    """Combine a list-page item and its detail payload into an UpsertRecord.

    Detail fields win over list fields where both carry data (title,
    difficulty).  Tag ids from both sources are merged (order-preserving,
    de-duplicated) and resolved through ``tag_catalog`` into display names,
    normalized slug tags, and up to 8 knowledge points (catalog entries with
    ``type == 2``).  The LLM profile JSON embeds stats, tags, and a generic
    answer/explanation placeholder for this imported (not generated) problem.
    """
    pid = list_item.pid
    # Prefer detail payload; fall back to list data, then the pid itself.
    title = str(detail.get("title") or list_item.title or pid).strip()
    difficulty = int(detail.get("difficulty") or list_item.difficulty or 1)
    statement_url = f"{base_url}/problem/{pid}"
    statement_md = build_statement_md(base_url, pid, detail)
    samples = detail.get("samples") or []
    sample_input = ""
    sample_output = ""
    # Only the first sample pair is stored; each sample is an [input, output] list.
    if samples and isinstance(samples[0], list) and len(samples[0]) >= 2:
        sample_input = str(samples[0][0] or "")
        sample_output = str(samples[0][1] or "")
    detail_tag_ids = detail.get("tags") or []
    if not isinstance(detail_tag_ids, list):
        detail_tag_ids = []
    # dict.fromkeys keeps first-seen order while removing duplicates.
    tag_ids = list(dict.fromkeys([*list_item.tags, *detail_tag_ids]))
    tag_names: list[str] = []
    knowledge_points: list[str] = []
    # Every imported record is tagged with these two base slugs.
    normalized_tags: set[str] = {"luogu", "csp"}
    for tid in tag_ids:
        tag = tag_catalog.get(int(tid))
        if not tag:
            continue  # id not present in the catalog; skip silently
        name = str(tag.get("name") or "").strip()
        if not name:
            continue
        tag_names.append(name)
        normalized_tags.add(normalize_tag(name))
        ttype = int(tag.get("type") or 0)
        # Catalog type 2 is treated as an algorithm/knowledge tag; cap at 8.
        if ttype == 2 and len(knowledge_points) < 8:
            knowledge_points.append(name)
        upper_name = name.upper()
        # Add contest-level slugs derived from the tag's display name.
        if "CSP-J" in upper_name:
            normalized_tags.add("csp-j")
        if "CSP-S" in upper_name:
            normalized_tags.add("csp-s")
        if "NOIP 普及" in name:
            normalized_tags.add("noip-junior")
        if "NOIP 提高" in name:
            normalized_tags.add("noip-senior")
    # No type-2 tags found: fall back to the first few display names.
    if not knowledge_points:
        knowledge_points = tag_names[:6]
    answer = "See official solutions/discussions and verify with your own proof."
    explanation = (
        "This problem is imported from Luogu. The statement and examples are preserved; "
        "practice with your own derivation and compare with accepted solutions."
    )
    profile = {
        "schema_version": 1,
        "platform": "luogu",
        "pid": pid,
        "difficulty": difficulty,
        "tags": tag_names,
        "tag_ids": tag_ids,
        "knowledge_points": knowledge_points,
        "answer": answer,
        "explanation": explanation,
        "stats": {
            "total_submit": int(list_item.total_submit),
            "total_accepted": int(list_item.total_accepted),
        },
        "source": {
            "url": statement_url,
            "type": list_item.type,
        },
        "generated_at": now_sec(),
    }
    # Deterministic (sorted) slug list, capped at 30 entries.
    all_tags = sorted({t for t in normalized_tags if t})[:30]
    return UpsertRecord(
        slug=f"luogu-{pid.lower()}",
        title=f"{pid} {title}",
        statement_md=statement_md,
        difficulty=max(1, min(10, difficulty)),  # clamp into the 1..10 scale
        source=f"luogu:{pid}",
        statement_url=statement_url,
        llm_profile_json=json.dumps(profile, ensure_ascii=False),
        sample_input=sample_input,
        sample_output=sample_output,
        tags=all_tags,
    )
def build_fallback_record(
    base_url: str,
    list_item: LuoguListItem,
    tag_catalog: dict[int, dict[str, Any]],
    error_text: str,
) -> UpsertRecord:
    """Build a degraded import record after a failed detail fetch.

    Feeds ``build_record`` a stub detail whose description points readers at
    the original problem URL, then flags the profile JSON as a fallback
    import and stores a truncated failure reason.
    """
    stub_detail: dict[str, Any] = {
        "title": list_item.title,
        "difficulty": list_item.difficulty,
        "tags": list_item.tags,
        "samples": [],
        "content": {
            "description": (
                "题面抓取失败(已自动降级导入)。"
                f"请访问原题链接查看完整题面:{base_url}/problem/{list_item.pid}"
            )
        },
    }
    record = build_record(base_url, list_item, stub_detail, tag_catalog)
    profile_data = json.loads(record.llm_profile_json)
    profile_data["fallback_import"] = True
    profile_data["fallback_reason"] = error_text[:240]
    record.llm_profile_json = json.dumps(profile_data, ensure_ascii=False)
    return record
def fetch_tag_catalog(
    session: requests.Session,
    base_url: str,
    timeout: int,
    retries: int,
    sleep_sec: float,
) -> dict[int, dict[str, Any]]:
    """Download Luogu's tag dictionary and index it by integer tag id.

    Entries that are not dicts or lack an ``id`` key are ignored.
    """
    raw = requests_retry_text(
        session,
        f"{base_url}/_lfe/tags/zh-CN",
        timeout=timeout,
        retries=retries,
        sleep_sec=sleep_sec,
    )
    payload = json.loads(raw)
    return {
        int(entry["id"]): entry
        for entry in payload.get("tags") or []
        if isinstance(entry, dict) and "id" in entry
    }
def fetch_list_page(
    session: requests.Session,
    base_url: str,
    tags_csv: str,
    page: int,
    timeout: int,
    retries: int,
    sleep_sec: float,
) -> tuple[int, int, list[LuoguListItem]]:
    """Fetch one page of the Luogu problem list.

    Parses the page's embedded context JSON and returns
    ``(total_count, per_page, items)``; rows without a pid are dropped.
    """
    url = f"{base_url}/problem/list?type=all&tag={quote(tags_csv)}&page={page}"
    page_html = requests_retry_text(
        session, url, timeout=timeout, retries=retries, sleep_sec=sleep_sec
    )
    context = extract_context_json(page_html)
    listing = (context.get("data") or {}).get("problems") or {}
    total = int(listing.get("count") or 0)
    page_size = int(listing.get("perPage") or 50)
    items: list[LuoguListItem] = []
    for entry in listing.get("result") or []:
        if not isinstance(entry, dict):
            continue
        problem_id = str(entry.get("pid") or "").strip()
        if not problem_id:
            continue
        raw_tags = entry.get("tags")
        tag_ids = raw_tags if isinstance(raw_tags, list) else []
        items.append(
            LuoguListItem(
                pid=problem_id,
                title=str(entry.get("title") or "").strip(),
                difficulty=int(entry.get("difficulty") or 1),
                tags=[int(t) for t in tag_ids if isinstance(t, int)],
                total_submit=int(entry.get("totalSubmit") or 0),
                total_accepted=int(entry.get("totalAccepted") or 0),
                type=str(entry.get("type") or "").strip(),
            )
        )
    return total, page_size, items
def fetch_problem_detail(
    base_url: str,
    pid: str,
    timeout: int,
    retries: int,
    sleep_sec: float,
) -> dict[str, Any]:
    """Fetch a single Luogu problem page and return its embedded detail dict.

    A fresh Session (with browser-like headers) is created per call, so this
    function carries no shared state and can run from worker threads.
    Raises RuntimeError when the context payload has no problem dict.
    """
    session = requests.Session()
    session.headers.update(
        {
            "User-Agent": (
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
            ),
            "Referer": f"{base_url}/problem/list",
        }
    )
    page_html = requests_retry_text(
        session,
        f"{base_url}/problem/{pid}",
        timeout=timeout,
        retries=retries,
        sleep_sec=sleep_sec,
    )
    context = extract_context_json(page_html)
    problem = (context.get("data") or {}).get("problem") or {}
    if not isinstance(problem, dict):
        raise RuntimeError(f"problem detail invalid: {pid}")
    return problem
def main() -> int:
    """CLI entry point: import Luogu CSP-J/S problems into a SQLite database.

    Workflow: parse args -> fetch tag catalog -> walk every list page for the
    requested tag ids -> optionally clear prior imports -> fetch problem
    details concurrently and upsert each record (falling back to a degraded
    record when a detail fetch fails) -> track per-item/job progress in the
    import tables -> print a JSON summary.  Always returns 0.
    """
    parser = argparse.ArgumentParser(description="Import Luogu CSP-J/S problem set")
    parser.add_argument("--db-path", required=True, help="SQLite db path")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    parser.add_argument(
        "--tag-ids",
        default=",".join(str(x) for x in DEFAULT_TAG_IDS),
        help="Comma separated Luogu tag IDs",
    )
    parser.add_argument("--workers", type=int, default=3)
    parser.add_argument("--max-problems", type=int, default=0)
    parser.add_argument("--timeout", type=int, default=25)
    parser.add_argument("--retries", type=int, default=5)
    parser.add_argument("--retry-sleep-sec", type=float, default=1.2)
    parser.add_argument("--clear-existing", action="store_true")
    parser.add_argument("--clear-all-problems", action="store_true")
    parser.add_argument("--job-trigger", default="manual")
    # NOTE(review): the following three flags are parsed but not referenced
    # anywhere in this function's body — confirm whether they are consumed by
    # a caller/wrapper or are vestigial.
    parser.add_argument("--clear-existing-source-prefix", default="")
    parser.add_argument("--skip-llm", action="store_true")
    parser.add_argument("--llm-limit", type=int, default=0)
    args = parser.parse_args()
    tag_ids = parse_tag_ids(args.tag_ids)
    tags_csv = ",".join(str(x) for x in tag_ids)
    # Shared session for the tag-catalog and list-page requests, with
    # browser-like headers.
    session = requests.Session()
    session.headers.update(
        {
            "User-Agent": (
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
            ),
            "Referer": f"{args.base_url}/problem/list",
        }
    )
    tag_catalog = fetch_tag_catalog(
        session,
        args.base_url,
        timeout=args.timeout,
        retries=args.retries,
        sleep_sec=args.retry_sleep_sec,
    )
    # Page 1 also tells us the total count and page size, from which the
    # remaining page numbers are derived.
    total_count, per_page, first_page_items = fetch_list_page(
        session,
        args.base_url,
        tags_csv,
        page=1,
        timeout=args.timeout,
        retries=args.retries,
        sleep_sec=args.retry_sleep_sec,
    )
    total_pages = max(1, math.ceil(max(1, total_count) / max(1, per_page)))
    # Keyed by pid so duplicates across pages collapse to the latest entry.
    all_items: dict[str, LuoguListItem] = {item.pid: item for item in first_page_items}
    for page in range(2, total_pages + 1):
        _, _, page_items = fetch_list_page(
            session,
            args.base_url,
            tags_csv,
            page=page,
            timeout=args.timeout,
            retries=args.retries,
            sleep_sec=args.retry_sleep_sec,
        )
        for item in page_items:
            all_items[item.pid] = item
    # Deterministic processing order; optional cap for partial imports.
    selected = sorted(all_items.values(), key=lambda x: x.pid)
    if args.max_problems > 0:
        selected = selected[: args.max_problems]
    conn = sqlite3.connect(args.db_path)
    conn.row_factory = sqlite3.Row
    conn.execute("PRAGMA foreign_keys=ON")
    conn.execute("PRAGMA busy_timeout=5000")
    ensure_core_tables(conn)
    ensure_problem_columns(conn)
    ensure_import_tables(conn)
    # Optional purge before importing: everything, or only Luogu imports.
    cleared_count = 0
    if args.clear_all_problems:
        cur = conn.execute("SELECT COUNT(1) FROM problems")
        cleared_count = int(cur.fetchone()[0] or 0)
        conn.execute("DELETE FROM problems")
        conn.commit()
    elif args.clear_existing:
        cur = conn.execute("SELECT COUNT(1) FROM problems WHERE source LIKE 'luogu:%'")
        cleared_count = int(cur.fetchone()[0] or 0)
        conn.execute("DELETE FROM problems WHERE source LIKE 'luogu:%'")
        conn.commit()
    inserted = 0
    updated = 0
    failed = 0
    fallback_used = 0
    total = len(selected)
    last_error = ""
    options_json = json.dumps(
        {
            "source": "luogu",
            "tag_ids": tag_ids,
            "workers": max(1, args.workers),
            "max_problems": args.max_problems,
            "clear_existing": bool(args.clear_existing),
            "clear_all_problems": bool(args.clear_all_problems),
        },
        ensure_ascii=False,
    )
    job_id = create_import_job(conn, args.job_trigger, total, options_json)
    seed_import_items(conn, job_id, selected)
    # Detail fetches run in worker threads; all DB writes happen on this
    # thread inside the as_completed loop.
    with ThreadPoolExecutor(max_workers=max(1, args.workers)) as executor:
        futures = {
            executor.submit(
                fetch_problem_detail,
                args.base_url,
                item.pid,
                args.timeout,
                args.retries,
                args.retry_sleep_sec,
            ): item
            for item in selected
        }
        done_count = 0
        for future in as_completed(futures):
            item = futures[future]
            done_count += 1
            try:
                detail = future.result()
                record = build_record(args.base_url, item, detail, tag_catalog)
                problem_id, is_insert = upsert_problem(conn, record)
                if is_insert:
                    inserted += 1
                else:
                    updated += 1
                update_import_item_success(
                    conn,
                    job_id,
                    item.pid,
                    record.title,
                    record.difficulty,
                    problem_id,
                )
                print(
                    f"[{done_count}/{total}] {item.pid} -> {record.title} "
                    f"(difficulty={record.difficulty})",
                    flush=True,
                )
            except Exception as exc:
                # Detail fetch/parse failed: try a degraded fallback record
                # that still registers the problem with a link-out statement.
                try:
                    record = build_fallback_record(
                        args.base_url, item, tag_catalog, str(exc)
                    )
                    problem_id, is_insert = upsert_problem(conn, record)
                    if is_insert:
                        inserted += 1
                    else:
                        updated += 1
                    fallback_used += 1
                    update_import_item_success(
                        conn,
                        job_id,
                        item.pid,
                        record.title,
                        record.difficulty,
                        problem_id,
                        note=f"fallback: {str(exc)[:300]}",
                    )
                    print(f"[fallback] {item.pid}: {exc}", flush=True)
                except Exception as inner_exc:
                    # Even the fallback failed: count it, remember the last
                    # error for the job record, and keep going.
                    failed += 1
                    last_error = str(inner_exc)
                    update_import_item_failed(
                        conn,
                        job_id,
                        item.pid,
                        f"{exc}; fallback failed: {inner_exc}",
                    )
                    print(f"[skip] {item.pid}: {exc}; fallback failed: {inner_exc}", flush=True)
            # Progress row is refreshed after every completed future,
            # success or failure.
            update_import_job_progress(
                conn,
                job_id,
                done_count,
                inserted + updated,
                failed,
                last_error,
            )
    finish_import_job(conn, job_id, inserted + updated, failed, last_error)
    conn.close()
    # Machine-readable run summary on stdout.
    print(
        json.dumps(
            {
                "db_path": args.db_path,
                "tags": tag_ids,
                "selected_count": total,
                "inserted": inserted,
                "updated": updated,
                "failed": failed,
                "fallback_used": fallback_used,
                "cleared_count": cleared_count,
                "job_id": job_id,
            },
            ensure_ascii=False,
            indent=2,
        )
    )
    return 0
# Script entry point: propagate main()'s return code as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

1302
scripts/import_winterant_oi.py 普通文件

文件差异内容过多而无法显示 加载差异