feat: expand platform management, admin controls, and learning workflows

这个提交包含在:
Codex CLI
2026-02-15 15:41:56 +08:00
父节点 ad29a9f62d
当前提交 f209ae82da
修改 75 个文件,包含 9663 行新增和 794 行删除

查看文件

@@ -0,0 +1,272 @@
#!/usr/bin/env python3
"""Generate CSP-J/S style feedback for one submission via LLM (with fallback)."""
from __future__ import annotations
import argparse
import json
import os
import re
import sys
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import requests
# Reference links, each a {"title", "url"} dict. They are listed in the
# rule-based fallback feedback and returned by normalize_links() whenever the
# LLM response carries no usable links.
DEFAULT_LINKS: List[Dict[str, str]] = [
{"title": "NOI 官网(规则与环境)", "url": "https://www.noi.cn/"},
{"title": "OI Wiki算法知识库", "url": "https://oi-wiki.org/"},
{"title": "cppreference C++14", "url": "https://en.cppreference.com/w/cpp/14"},
{
"title": "GCC Warning Options",
"url": "https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html",
},
{"title": "洛谷(题解与训练)", "url": "https://www.luogu.com.cn/"},
]
@dataclass
class LlmResult:
    """Feedback produced either by the LLM call or by the rule-based fallback."""

    # Every constructor call in this module passes True.
    ok: bool
    # Markdown body of the feedback.
    feedback_md: str
    # Reference links as {"title": ..., "url": ...} dicts.
    links: List[Dict[str, str]]
    # Model identifier; the fallback path uses "fallback-rules".
    model_name: str
    # Short state label such as "ready" or "fallback".
    status: str
def env(name: str, default: str = "") -> str:
    """Return environment variable *name* stripped of whitespace.

    Falls back to *default* when the variable is unset or blank.
    """
    return os.getenv(name, "").strip() or default
def load_input(path: str) -> Dict[str, Any]:
    """Read the UTF-8 JSON file at *path* and return its top-level object.

    Raises:
        ValueError: if the decoded JSON document is not an object (dict).
    """
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
    if isinstance(payload, dict):
        return payload
    raise ValueError("input json must be object")
def detect_cpp14_risk(code: str, compile_log: str) -> List[str]:
    """Return human-readable hints about C++14 compatibility risks.

    Each regex below is searched in the submission source concatenated with
    the compiler log; one hint is emitted per matching pattern, in table
    order. A trailing hint is added when the compile log mentions
    ``-Wsign-compare``.

    Args:
        code: C++ source text of the submission.
        compile_log: Compiler output for the submission (may be empty).

    Returns:
        A list of hint strings; empty when nothing suspicious was found.
    """
    hints: List[str] = []
    joined = f"{code}\n{compile_log}"
    checks = [
        (r"\bif\s+constexpr\b", "检测到 `if constexpr`C++17,C++14 环境会 CE。"),
        (r"\bstd::optional\b", "检测到 `std::optional`C++17,建议改为普通变量+标记位。"),
        (r"\bstd::variant\b", "检测到 `std::variant`C++17,建议改为 struct/enum 分支。"),
        (r"\bstd::string_view\b", "检测到 `std::string_view`C++17,建议改为 `const string&`。"),
        # Structured bindings always start with `auto` (optionally const / & / &&).
        # Requiring that prefix keeps ordinary array writes such as `a[i] = 1`
        # from being reported as a C++17 feature.
        (
            r"\b(?:const\s+)?auto(?:\s*&|\s*&&)?\s*\[[^\]\n]+\]\s*=",
            "检测到结构化绑定迹象,C++14 不支持,建议改 pair/struct 访问。",
        ),
        (r"%I64d", "检测到 `%I64d`,Linux 评测机应统一使用 `%lld`。"),
        (r"\bvoid\s+main\s*\(", "检测到 `void main()`,需改为 `int main()` 并 `return 0;`。"),
    ]
    for pattern, tip in checks:
        if re.search(pattern, joined):
            hints.append(tip)
    if "-Wsign-compare" in compile_log:
        hints.append("存在 `-Wsign-compare`,建议统一使用 `size_t` 或显式类型转换。")
    return hints
def build_fallback_feedback(payload: Dict[str, Any], llm_error: str = "") -> LlmResult:
    """Build rule-based Markdown feedback without calling the LLM.

    Args:
        payload: Submission dict; the keys ``status``, ``score``,
            ``compile_log``, ``runtime_log`` and ``code`` are read.
        llm_error: Optional error text; when non-empty it is appended as a
            trailing note to the feedback.

    Returns:
        An ``LlmResult`` with ``model_name="fallback-rules"`` and
        ``status="fallback"``.
    """
    status = str(payload.get("status", "Unknown"))
    score = payload.get("score", 0)
    compile_log = str(payload.get("compile_log", ""))
    runtime_log = str(payload.get("runtime_log", ""))
    code = str(payload.get("code", ""))

    # Use generic reminders when no specific risk pattern matched.
    risk_tips = detect_cpp14_risk(code, compile_log) or [
        "请确认只使用 C++14 语法,避免 C++17 特性导致 CE。",
        "若题目要求文件输入输出,使用 `freopen(\"xxx.in\",\"r\",stdin)` / `freopen(\"xxx.out\",\"w\",stdout)`。",
    ]

    if status.upper() == "AC":
        thought = "代码通过了当前评测,核心思路基本正确,建议继续做规范化和鲁棒性收敛。"
    else:
        thought = "当前提交未稳定通过,建议先按日志定位错误,再拆分为思路问题与实现问题逐步修复。"

    report: List[str] = [
        "### 评测结论",
        f"- 本次状态:**{status}**,分数:**{score}**。",
        f"- 思路评价:{thought}",
        "",
        "### 福建 CSP-J/S 规范检查C++14",
    ]
    report.extend(f"- {tip}" for tip in risk_tips)
    if compile_log.strip():
        report.append("- 编译日志有信息,建议逐条清理 warning,减少考场不确定性。")
    if runtime_log.strip():
        report.append("- 运行日志有输出,建议重点检查边界输入与数组越界风险。")
    report.extend(
        [
            "",
            "### 修改建议(可执行)",
            "- 按“先编译通过→再保证正确→最后做优化”的顺序迭代。",
            "- `long long` 读写统一 `%lld`;不要使用 `%I64d`。",
            "- 清理 signed/unsigned 警告,降低不同编译器行为差异。",
            "- 确保 `int main()` 且 `return 0;`。",
            "",
            "### 知识点评测",
            "- 强项:基础实现与调试流程。",
            "- 待加强:边界构造、类型一致性、赛场环境兼容性。",
            "",
            "### 推荐外链资料",
        ]
    )
    report.extend(f"- [{item['title']}]({item['url']})" for item in DEFAULT_LINKS)
    if llm_error:
        report.extend(["", f"> 说明LLM 调用失败,已返回规则兜底建议。错误:{llm_error}"])

    return LlmResult(
        ok=True,
        feedback_md="\n".join(report).strip(),
        links=DEFAULT_LINKS,
        model_name="fallback-rules",
        status="fallback",
    )
def normalize_links(raw: Any) -> List[Dict[str, str]]:
    """Sanitize a list of link dicts taken from the LLM response.

    Only dict entries whose stringified, stripped ``title`` and ``url`` are
    both non-empty are kept. ``DEFAULT_LINKS`` is returned when *raw* is not
    a list or no entry survives.
    """
    if not isinstance(raw, list):
        return DEFAULT_LINKS
    cleaned: List[Dict[str, str]] = []
    for entry in raw:
        if isinstance(entry, dict):
            title = str(entry.get("title", "")).strip()
            url = str(entry.get("url", "")).strip()
            if title and url:
                cleaned.append({"title": title, "url": url})
    return cleaned or DEFAULT_LINKS
def dict_to_markdown(data: Dict[str, Any]) -> str:
    """Render a dict as Markdown, one ``###`` section per key.

    String values are stripped; any other value is pretty-printed as JSON.
    Sections whose body is empty are omitted, and a blank key gets a
    placeholder heading.
    """
    sections: List[str] = []
    for key, value in data.items():
        body = value.strip() if isinstance(value, str) else json.dumps(value, ensure_ascii=False, indent=2)
        if body:
            heading = str(key).strip() or "分析项"
            sections.append(f"### {heading}\n{body}")
    return "\n\n".join(sections)
def call_llm(payload: Dict[str, Any]) -> LlmResult:
    """Send the submission to the configured chat-completions endpoint.

    Configuration is read from ``OI_LLM_*`` environment variables, with
    ``CSP_LLM_API_URL`` / ``CSP_LLM_API_KEY`` as fallbacks for URL and key.

    The reply content is handled in three tiers:
      1. a JSON object with a truthy ``feedback_md`` -> used directly;
      2. any other non-empty JSON object -> rendered via ``dict_to_markdown``;
      3. anything else -> the raw text is returned as the feedback.

    Raises:
        RuntimeError: when the URL is missing, or the response has no
            choices or no textual content.
        requests.HTTPError: via ``raise_for_status`` on an HTTP error status.
    """
    api_url = env("OI_LLM_API_URL") or env("CSP_LLM_API_URL")
    api_key = env("OI_LLM_API_KEY") or env("CSP_LLM_API_KEY")
    model = env("OI_LLM_MODEL", "qwen3-max")
    if not api_url:
        raise RuntimeError("missing OI_LLM_API_URL")

    system_prompt = (
        "你是福建省 CSP-J/S 代码规范与评测老师。"
        "请严格按 C++14 旧 GCC 环境给建议,重点指出会导致 CE/RE/爆零的风险。"
        "输出 JSON,不要输出其他文字。"
    )
    required_sections = [
        "评测结论",
        "福建 CSP-J/S 规范检查C++14",
        "修改建议",
        "知识点评测",
        "推荐外链资料",
    ]
    output_schema = {
        "feedback_md": "markdown string",
        "links": [{"title": "string", "url": "string"}],
        "status": "ready",
    }
    user_prompt = {
        "task": "分析这份提交并给出改进建议",
        "required_sections": required_sections,
        "submission": payload,
        "output_json_schema": output_schema,
    }

    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    request_body = {
        "model": model,
        "stream": False,
        "temperature": 0.1,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": json.dumps(user_prompt, ensure_ascii=False)},
        ],
    }

    resp = requests.post(api_url, headers=headers, json=request_body, timeout=50)
    resp.raise_for_status()
    data = resp.json()

    # Walk down choices[0].message.content, tolerating unexpected shapes.
    choices = data.get("choices") if isinstance(data, dict) else None
    if not choices:
        raise RuntimeError("LLM response missing choices")
    first = choices[0] if isinstance(choices, list) and choices else {}
    message = first.get("message") if isinstance(first, dict) else {}
    content = message.get("content", "") if isinstance(message, dict) else ""
    if not isinstance(content, str) or not content.strip():
        raise RuntimeError("LLM content is empty")
    model_name = str(data.get("model", model)) if isinstance(data, dict) else model

    # Best-effort JSON decode; a non-JSON reply falls through to the raw tier.
    try:
        decoded = json.loads(content)
    except Exception:
        decoded = None
    parsed: Optional[Dict[str, Any]] = decoded if isinstance(decoded, dict) else None

    if not parsed:
        return LlmResult(
            ok=True,
            feedback_md=content.strip(),
            links=DEFAULT_LINKS,
            model_name=model_name,
            status="ready",
        )
    if parsed.get("feedback_md"):
        return LlmResult(
            ok=True,
            feedback_md=str(parsed.get("feedback_md", "")).strip(),
            links=normalize_links(parsed.get("links")),
            model_name=model_name,
            status=str(parsed.get("status", "ready")) or "ready",
        )
    return LlmResult(
        ok=True,
        feedback_md=dict_to_markdown(parsed),
        links=DEFAULT_LINKS,
        model_name=model_name,
        status="ready",
    )
def main() -> int:
    """CLI entry point: analyze one submission and print the result as JSON.

    Reads the submission from ``--input-file``, tries the LLM, and on any
    exception falls back to the rule-based feedback. The JSON result is
    written to stdout. Always returns 0.
    """
    parser = argparse.ArgumentParser(description="Analyze one submission with LLM + fallback")
    parser.add_argument("--input-file", required=True, help="JSON file from backend")
    cli = parser.parse_args()

    submission = load_input(cli.input_file)
    try:
        result = call_llm(submission)
    except Exception as exc:
        # Any LLM failure degrades to the rule-based feedback.
        result = build_fallback_feedback(submission, str(exc))

    json_text = json.dumps(
        {
            "feedback_md": result.feedback_md,
            "links": result.links,
            "model_name": result.model_name,
            "status": result.status,
        },
        ensure_ascii=False,
    )
    sys.stdout.write(json_text)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

查看文件

@@ -7,7 +7,10 @@ import argparse
import json
import os
import re
import shutil
import sqlite3
import subprocess
import tempfile
import time
from dataclasses import dataclass
from typing import Any
@@ -15,6 +18,37 @@ from typing import Any
import requests
# HTTP status codes treated as retryable (the consumer is outside this view —
# presumably llm_request; confirm against the full file).
RETRYABLE_HTTP_CODES = {500, 502, 503, 504}
# Absolute paths of optional external tools, or None when not on PATH.
# format_cpp_code() returns the code unformatted without clang-format, and
# compiles_under_cpp14() skips the compile check without g++.
CLANG_FORMAT_BIN = shutil.which("clang-format")
GXX_BIN = shutil.which("g++")
# Substrings that mark generated code as an unfinished stub. They are compared
# against the lower-cased code by is_placeholder_code().
PLACEHOLDER_CODE_MARKERS = (
"todo",
"to do",
"请根据题意补全",
"待补全",
"自行补全",
"省略",
"your code here",
)
# (compiled pattern, label) pairs; cpp14_violations() reports the label of every
# pattern that matches the candidate C++ source.
CPP17_BANNED_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = (
(re.compile(r"\bif\s+constexpr\b"), "if constexpr"),
(re.compile(r"\bstd::optional\b"), "std::optional"),
(re.compile(r"\bstd::variant\b"), "std::variant"),
(re.compile(r"\bstd::any\b"), "std::any"),
(re.compile(r"\bstd::string_view\b"), "std::string_view"),
(re.compile(r"\bstd::filesystem\b"), "std::filesystem"),
(re.compile(r"\bstd::byte\b"), "std::byte"),
(re.compile(r"\bstd::clamp\s*\("), "std::clamp"),
(re.compile(r"\bstd::gcd\s*\("), "std::gcd"),
(re.compile(r"\bstd::lcm\s*\("), "std::lcm"),
(re.compile(r"#\s*include\s*<\s*(optional|variant|any|string_view|filesystem|charconv|execution)\s*>"), "C++17 header"),
(
# `auto [a, b] = ...`, optionally const-qualified and/or a reference.
re.compile(
r"\b(?:const\s+)?auto(?:\s*&|\s*&&)?\s*\[[^\]\n]+\]\s*=",
flags=re.MULTILINE,
),
"structured bindings",
),
)
@dataclass
@@ -32,6 +66,17 @@ def now_sec() -> int:
return int(time.time())
def env_bool(key: str, default: bool) -> bool:
    """Interpret environment variable *key* as a boolean flag.

    ``1/true/yes/on`` map to True and ``0/false/no/off`` to False, after
    stripping and lower-casing. An unset, blank or unrecognized value yields
    *default*.
    """
    raw = os.getenv(key, "").strip().lower()
    truthy = {"1", "true", "yes", "on"}
    falsy = {"0", "false", "no", "off"}
    if raw in truthy:
        return True
    if raw in falsy:
        return False
    return default
def extract_json_object(text: str) -> dict[str, Any] | None:
raw = text.strip()
if raw.startswith("```"):
@@ -54,6 +99,253 @@ def extract_json_object(text: str) -> dict[str, Any] | None:
return None
def extract_message_text(content: Any) -> str:
    """Flatten a chat message ``content`` value into plain text.

    Supported shapes:
      * str  -> stripped.
      * list -> for each dict item, the first non-blank string among
        ``text`` and ``content`` is taken; the pieces are joined with
        newlines. Non-dict items are ignored.
      * dict -> the first of ``text`` / ``content`` that is a string, stripped.
    Anything else yields an empty string.
    """
    if isinstance(content, str):
        return content.strip()

    if isinstance(content, dict):
        for field in ("text", "content"):
            value = content.get(field)
            if isinstance(value, str):
                return value.strip()
        return ""

    if isinstance(content, list):
        pieces: list[str] = []
        for item in content:
            if not isinstance(item, dict):
                continue
            for field in ("text", "content"):
                value = item.get(field)
                if isinstance(value, str) and value.strip():
                    pieces.append(value.strip())
                    break
        return "\n".join(pieces).strip()

    return ""
def iter_json_candidates(text: str) -> list[str]:
    """Collect substrings of *text* that may be JSON documents.

    Candidates are gathered in this order and then de-duplicated while
    keeping first occurrences:
      1. the whole stripped text;
      2. the body of every triple-backtick fenced block (optionally tagged
         ``json``);
      3. every span starting at ``{`` or ``[`` within the first 200000
         characters that decodes as JSON, including nested ones.
    """
    if not text:
        return []

    stripped = text.strip()
    found: list[str] = [stripped] if stripped else []

    fenced_bodies = (
        m.group(1).strip()
        for m in re.finditer(r"```(?:json)?\s*([\s\S]*?)```", text, flags=re.IGNORECASE)
    )
    found.extend(body for body in fenced_bodies if body)

    decoder = json.JSONDecoder()
    window = text[: min(len(text), 200000)]
    for start, char in enumerate(window):
        if char != "{" and char != "[":
            continue
        try:
            _, length = decoder.raw_decode(window[start:])
        except json.JSONDecodeError:
            continue
        piece = window[start : start + length].strip()
        if piece:
            found.append(piece)

    # dict preserves insertion order, so this drops later duplicates only.
    return list(dict.fromkeys(found))
def extract_solution_rows(content: str) -> list[dict[str, Any]]:
    """Return the first non-empty list of solution dicts found in *content*.

    Each JSON candidate from ``iter_json_candidates`` is tried in order. For
    an object the list is taken from ``solutions``, or from
    ``data.solutions`` when ``solutions`` is absent/None; a top-level array
    is used as-is. Non-dict entries are dropped. Returns ``[]`` when no
    candidate yields at least one dict.
    """
    for candidate in iter_json_candidates(content):
        try:
            document = json.loads(candidate)
        except json.JSONDecodeError:
            continue

        rows: Any = None
        if isinstance(document, list):
            rows = document
        elif isinstance(document, dict):
            rows = document.get("solutions")
            if rows is None and isinstance(document.get("data"), dict):
                rows = document["data"].get("solutions")

        if not isinstance(rows, list):
            continue
        dict_rows = [row for row in rows if isinstance(row, dict)]
        if dict_rows:
            return dict_rows
    return []
def is_placeholder_code(code: str) -> bool:
    """Return True when *code* looks like an unfinished stub.

    A snippet is a placeholder when it contains a literal ``...`` or, after
    lower-casing, any marker from ``PLACEHOLDER_CODE_MARKERS``.

    ``None`` or empty input is treated as empty text and yields False. The
    original guarded the marker check against ``None`` but then evaluated
    ``"..." in code`` on the raw argument, which raised ``TypeError``.
    """
    text = code or ""
    if "..." in text:
        return True
    lowered = text.lower()
    return any(marker in lowered for marker in PLACEHOLDER_CODE_MARKERS)
def cpp14_violations(code: str) -> list[str]:
    """Return the labels of all ``CPP17_BANNED_PATTERNS`` entries matching *code*.

    Labels are returned in table order; the list is empty when nothing matches.
    """
    return [label for pattern, label in CPP17_BANNED_PATTERNS if pattern.search(code)]
def compiles_under_cpp14(code: str) -> tuple[bool, str]:
    """Syntax-check *code* with ``g++ -std=gnu++14 -fsyntax-only``.

    Returns:
        ``(True, "")`` when the check passes or g++ is not installed (the
        check is skipped in that case); otherwise ``(False, message)`` where
        *message* is at most the first 400 characters of the compiler's
        stderr, or a timeout notice.

    A compiler run that exceeds the time limit is reported as a failed check
    for this snippet. Previously ``subprocess.TimeoutExpired`` propagated to
    the caller and discarded every other candidate in the same batch.
    """
    if not GXX_BIN:
        return True, ""
    timeout_sec = 12
    with tempfile.TemporaryDirectory(prefix="csp_sol_cpp14_") as tmp:
        src_path = os.path.join(tmp, "main.cpp")
        with open(src_path, "w", encoding="utf-8") as f:
            # Make sure the file ends with a newline.
            f.write(code if code.endswith("\n") else f"{code}\n")
        try:
            proc = subprocess.run(
                [GXX_BIN, "-std=gnu++14", "-O2", "-Wall", "-Wextra", "-Wpedantic", "-fsyntax-only", src_path],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
                timeout=timeout_sec,
            )
        except subprocess.TimeoutExpired:
            return False, f"g++ timed out after {timeout_sec}s"
    if proc.returncode == 0:
        return True, ""
    err = proc.stderr.decode("utf-8", errors="ignore").strip()
    return False, err[:400]
def normalize_solutions(rows: list[dict[str, Any]], max_solutions: int) -> tuple[list[dict[str, Any]], list[str]]:
    """Validate raw solution rows and keep at most *max_solutions* of them.

    A row is rejected, in this order, when its ``code_cpp`` is blank, lacks
    ``main(``, looks like a placeholder, uses a banned C++17 feature, or
    fails the C++14 syntax check.

    Returns:
        ``(accepted, reasons)``: the accepted rows reduced to the known
        fields, and one human-readable reason per rejected row.
    """
    accepted: list[dict[str, Any]] = []
    reasons: list[str] = []
    for row in rows:
        code = str(row.get("code_cpp") or "")

        # The first failing check decides the rejection reason.
        reason = ""
        if not code.strip():
            reason = "empty code_cpp"
        elif "main(" not in code:
            reason = "missing main()"
        elif is_placeholder_code(code):
            reason = "placeholder code"
        else:
            violations = cpp14_violations(code)
            if violations:
                reason = f"C++17+ feature: {', '.join(violations[:3])}"
            else:
                compiled, compile_msg = compiles_under_cpp14(code)
                if not compiled:
                    reason = f"cannot compile with -std=gnu++14: {compile_msg}"
        if reason:
            reasons.append(reason)
            continue

        raw_tags = row.get("tags")
        accepted.append(
            {
                "title": str(row.get("title") or "").strip(),
                "idea_md": str(row.get("idea_md") or "").strip(),
                "explanation_md": str(row.get("explanation_md") or "").strip(),
                "complexity": str(row.get("complexity") or "").strip(),
                "code_cpp": code,
                "tags": raw_tags if isinstance(raw_tags, list) else [],
            }
        )
        if len(accepted) >= max_solutions:
            break
    return accepted, reasons
# Build the user prompt asking the LLM for `max_solutions` distinct C++14
# solutions to `problem`, answered as a single JSON object with a "solutions"
# array. The problem's title, difficulty, source, full statement and sample
# input/output are interpolated verbatim (no truncation), and the result is
# stripped of leading/trailing whitespace.
# NOTE(review): the prompt text is a runtime string; its exact line breaks and
# indentation cannot be confirmed from this view, so the code is left untouched.
def build_prompt(problem: Problem, max_solutions: int) -> str:
return f"""
请为下面这道 CSP 题生成 {max_solutions} 种不同思路的题解(可从模拟/贪心/DP/图论/数据结构等不同角度切入),并给出可直接提交的 C++14 参考代码。
硬性要求:
1. 必须只输出一个 JSON 对象,不能有任何 JSON 外文本。
2. JSON 必须符合下面格式,且 solutions 数组长度应为 {max_solutions}
3. 每个 code_cpp 必须是完整、可编译、可运行的 C++14 程序(包含 main 函数),不能出现 TODO、伪代码、占位注释、省略号。
4. 必须兼容 GCC 4.9/5.4 + -std=gnu++14严禁使用 C++17 及以上特性(如 structured bindings、if constexpr、std::optional、std::variant、std::any、std::string_view、<filesystem>)。
5. 建议使用标准头文件(如 <iostream>/<vector>/<algorithm> 等),不要使用 <bits/stdc++.h>。
6. main 必须是 int main(),并且 return 0;。若使用 scanf/printf 处理 long long,格式符必须用 %lld,不要用 %I64d。
7. 代码风格清晰,变量命名可读,注释简洁。
输出 JSON,格式固定
{{
"solutions": [
{{
"title": "解法标题",
"idea_md": "思路要点Markdown",
"explanation_md": "详细讲解Markdown",
"complexity": "时间/空间复杂度",
"code_cpp": "完整 C++14 代码",
"tags": ["标签1","标签2"]
}}
]
}}
题目信息:
- 题目:{problem.title}
- 难度:{problem.difficulty}
- 来源:{problem.source}
完整题面(原文,不做截断):
{problem.statement_md}
样例输入(原文):
{problem.sample_input}
样例输出(原文):
{problem.sample_output}
""".strip()
def parse_solutions_or_raise(content: str, max_solutions: int) -> list[dict[str, Any]]:
    """Extract and validate solutions from an LLM reply.

    Raises:
        RuntimeError: when no solutions array can be found, or when every
            row is rejected. In the second case the first rejection reason
            (truncated to 180 characters) is included in the message.
    """
    rows = extract_solution_rows(content)
    if not rows:
        raise RuntimeError("llm response missing valid solutions array")

    accepted, rejected = normalize_solutions(rows, max_solutions=max_solutions)
    if accepted:
        return accepted

    reason = ""
    if rejected:
        reason = f"; rejected sample: {rejected[0][:180]}"
    raise RuntimeError(f"llm response contains no runnable full code{reason}")
def generate_solutions_with_llm(
    prompt: str,
    max_solutions: int,
    timeout: int,
    retries: int,
    sleep_sec: float,
) -> list[dict[str, Any]]:
    """Ask the LLM for solutions, with one repair round when parsing fails.

    If the first reply cannot be parsed or validated, the failure reason is
    prepended to the original prompt and the request is sent once more.

    Raises:
        RuntimeError: when the second reply also fails to parse; the message
            carries both failure reasons (each truncated to 200 characters).
        Exception: whatever ``llm_request`` raises is propagated unchanged.
    """
    request_options = {"timeout": timeout, "retries": retries, "sleep_sec": sleep_sec}

    first_reply = llm_request(prompt, **request_options)
    try:
        return parse_solutions_or_raise(first_reply, max_solutions=max_solutions)
    except Exception as first_exc:
        first_reason = str(first_exc)
        repair_prompt = (
            "你上一条回复不符合要求,原因是:"
            + f"{first_reason[:240]}。请只输出合法 JSON,并确保 code_cpp 是完整可运行 C++14 代码(兼容 -std=gnu++14\n\n"
            + prompt
        )
        second_reply = llm_request(repair_prompt, **request_options)
        try:
            return parse_solutions_or_raise(second_reply, max_solutions=max_solutions)
        except Exception as second_exc:
            second_reason = str(second_exc)
            raise RuntimeError(
                f"parse failed after retry: first={first_reason[:200]}; second={second_reason[:200]}"
            ) from second_exc
def llm_request(prompt: str, timeout: int, retries: int, sleep_sec: float) -> str:
url = os.getenv("OI_LLM_API_URL", "").strip()
api_key = os.getenv("OI_LLM_API_KEY", "").strip()
@@ -102,10 +394,14 @@ def llm_request(prompt: str, timeout: int, retries: int, sleep_sec: float) -> st
choices = payload.get("choices") or []
if not choices:
raise RuntimeError("llm response missing choices")
content = ((choices[0] or {}).get("message") or {}).get("content")
choice0 = choices[0] or {}
message = choice0.get("message") or {}
content = extract_message_text(message.get("content"))
if not content:
content = extract_message_text(choice0.get("text"))
if not content:
raise RuntimeError("llm response missing content")
return str(content)
return content
if last_error:
raise RuntimeError(f"llm request failed: {last_error}") from last_error
@@ -119,7 +415,20 @@ def fallback_solutions(max_solutions: int) -> list[dict[str, Any]]:
"idea_md": "按题意拆分步骤,先写可过样例的直观解法,再补边界处理。",
"explanation_md": "适用于数据范围较小或规则清晰的题。",
"complexity": "时间复杂度依题而定,通常 O(n)~O(n^2)",
"code_cpp": "// TODO: 请根据题意补全\n#include <bits/stdc++.h>\nusing namespace std;\nint main(){ios::sync_with_stdio(false);cin.tie(nullptr);return 0;}\n",
"code_cpp": """
#include <iostream>
#include <vector>
#include <algorithm>
using namespace std;
int main() {
ios::sync_with_stdio(false);
cin.tie(nullptr);
// TODO: 请根据题意补全读入、核心逻辑与输出。
return 0;
}
""".strip(),
"tags": ["simulation", "implementation"],
},
{
@@ -127,13 +436,55 @@ def fallback_solutions(max_solutions: int) -> list[dict[str, Any]]:
"idea_md": "分析状态与重复计算,尝试用前缀和、贪心或动态规划优化。",
"explanation_md": "比直接模拟更稳定,通常能覆盖更大数据规模。",
"complexity": "通常优于朴素解法",
"code_cpp": "// TODO: 请根据题意补全\n#include <bits/stdc++.h>\nusing namespace std;\nint main(){ios::sync_with_stdio(false);cin.tie(nullptr);return 0;}\n",
"code_cpp": """
#include <iostream>
#include <vector>
#include <algorithm>
using namespace std;
int main() {
ios::sync_with_stdio(false);
cin.tie(nullptr);
// TODO: 请根据题意补全优化版思路实现。
return 0;
}
""".strip(),
"tags": ["optimization", "dp"],
},
]
return base[: max(1, max_solutions)]
def format_cpp_code(raw: str) -> str:
    """Normalize line endings and, when possible, run clang-format on *raw*.

    Returns ``""`` for blank input. Otherwise the result always ends with a
    newline. Formatting is best-effort: when clang-format is unavailable,
    exits non-zero, produces blank output or raises, the newline-normalized
    input is returned.
    """
    source = (raw or "").replace("\r\n", "\n").replace("\r", "\n")
    if not source.strip():
        return ""
    if not source.endswith("\n"):
        source = f"{source}\n"

    if CLANG_FORMAT_BIN:
        try:
            result = subprocess.run(
                [CLANG_FORMAT_BIN, "--style={BasedOnStyle: Google, IndentWidth: 2, ColumnLimit: 0}"],
                input=source.encode("utf-8"),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
                timeout=6,
            )
            formatted = ""
            if result.returncode == 0 and result.stdout:
                formatted = result.stdout.decode("utf-8", errors="ignore")
            if formatted.strip():
                return formatted if formatted.endswith("\n") else f"{formatted}\n"
        except Exception:
            # Deliberate best-effort: any formatter failure keeps the original code.
            pass
    return source
def load_problem(conn: sqlite3.Connection, problem_id: int) -> Problem:
cur = conn.execute(
"SELECT id,title,statement_md,difficulty,source,sample_input,sample_output FROM problems WHERE id=?",
@@ -182,7 +533,7 @@ def store_solutions(conn: sqlite3.Connection, problem_id: int, rows: list[dict[s
idea_md = str(row.get("idea_md") or "").strip()
explanation_md = str(row.get("explanation_md") or "").strip()
code_cpp = str(row.get("code_cpp") or "").strip()
code_cpp = format_cpp_code(str(row.get("code_cpp") or ""))
complexity = str(row.get("complexity") or "").strip()
tags = row.get("tags") if isinstance(row.get("tags"), list) else []
@@ -239,68 +590,41 @@ def main() -> int:
try:
problem = load_problem(conn, args.problem_id)
requested_solutions = max(1, min(5, args.max_solutions))
allow_fallback = env_bool("CSP_SOLUTION_ALLOW_FALLBACK", False)
prompt = f"""
请为下面这道 CSP 题生成 {max(1, min(5, args.max_solutions))} 种不同思路的题解(可从不同角度切入,例如模拟/贪心/DP/数据结构),并给出 C++ 参考代码。
输出 JSON,格式固定
{{
"solutions": [
{{
"title": "解法标题",
"idea_md": "思路要点Markdown",
"explanation_md": "详细讲解Markdown",
"complexity": "时间/空间复杂度",
"code_cpp": "完整 C++17 代码",
"tags": ["标签1","标签2"]
}}
]
}}
题目:{problem.title}
难度:{problem.difficulty}
来源:{problem.source}
题面:
{problem.statement_md[:12000]}
样例输入:
{problem.sample_input[:1200]}
样例输出:
{problem.sample_output[:1200]}
""".strip()
prompt = build_prompt(problem, max_solutions=requested_solutions)
update_job(conn, args.job_id, progress=25, message="requesting llm", updated_at=now_sec())
source = "fallback"
source = "llm"
solutions: list[dict[str, Any]]
try:
content = llm_request(
prompt,
solutions = generate_solutions_with_llm(
prompt=prompt,
max_solutions=requested_solutions,
timeout=args.timeout,
retries=args.retries,
sleep_sec=args.retry_sleep_sec,
)
obj = extract_json_object(content)
raw = obj.get("solutions") if isinstance(obj, dict) else None
if not isinstance(raw, list) or len(raw) == 0:
raise RuntimeError("llm response missing solutions array")
solutions = [x for x in raw if isinstance(x, dict)]
if not solutions:
raise RuntimeError("llm response has empty valid solutions")
source = "llm"
except Exception:
except Exception as exc:
if not allow_fallback:
raise RuntimeError(f"llm generation failed: {str(exc)[:280]}") from exc
source = "fallback"
solutions = fallback_solutions(args.max_solutions)
solutions = solutions[: max(1, min(5, args.max_solutions))]
solutions = solutions[:requested_solutions]
update_job(conn, args.job_id, progress=70, message="writing solutions", updated_at=now_sec())
saved = store_solutions(conn, args.problem_id, solutions, source)
done_msg = f"completed: {saved} solutions ({source})"
update_job(
conn,
args.job_id,
status="completed",
progress=100,
message=f"completed: {saved} solutions ({source})",
message=done_msg,
finished_at=now_sec(),
updated_at=now_sec(),
)

查看文件

@@ -0,0 +1,487 @@
#!/usr/bin/env python3
"""Collect C++/CSP learning resources from the web, summarize with LLM, and upsert KB articles."""
from __future__ import annotations
import argparse
import html
import json
import os
import re
import sqlite3
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import requests
# HTTP status codes on which llm_request() sleeps and retries.
RETRYABLE_HTTP_CODES = {429, 500, 502, 503, 504}
# Default value, in seconds, of the --timeout CLI option.
DEFAULT_TIMEOUT = 30
# User-Agent header sent by fetch_url() when downloading source pages.
USER_AGENT = (
"Mozilla/5.0 (X11; Linux x86_64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/123.0.0.0 Safari/537.36"
)
# Passed as `proxies=` on every requests call in this script.
# NOTE(review): presumably meant to bypass environment-configured proxies — confirm.
NO_PROXY = {"http": "", "https": ""}
@dataclass(frozen=True)
class ResourceSource:
    """One web page used as source material for a learning track."""

    # Human-readable name; also the fallback page title.
    label: str
    # Address that is fetched for this source.
    url: str
@dataclass(frozen=True)
class TrackSpec:
    """Static description of one learning track / knowledge-base article."""

    # Article slug; used as the upsert key and by the --only CLI filter.
    slug: str
    # Article title; also interpolated into the LLM prompt.
    title: str
    # Intended readers, interpolated into the prompt.
    audience: str
    # What the article should achieve, interpolated into the prompt.
    objective: str
    # Pages to fetch as source material, in priority order.
    sources: tuple[ResourceSource, ...]
# Learning tracks processed by main(), in order. Each entry becomes one
# kb_articles row keyed by its slug. Only the first --max-sources-per-track
# sources of a track are fetched, so the order of `sources` matters.
TRACKS: tuple[TrackSpec, ...] = (
TrackSpec(
slug="learning-roadmap-csp",
title="CSP 学习总路线C++ 基础 → CSP-J → CSP-S",
audience="准备长期学习 CSP 的初中/高中选手与家长",
objective="给出分阶段目标、周训练节奏、升阶检查清单和环境规范提醒。",
sources=(
ResourceSource("NOI 技术规则", "https://www.noi.cn/gynoi/jsgz/"),
ResourceSource(
"NOI Linux 与说明文档下载",
"https://www.noi.cn/gynoi/jsgz/2018-08-21/710467.shtml",
),
ResourceSource(
"NOI 标准竞赛环境说明2012",
"https://www.noi.cn/gynoi/jsgz/2018-08-13/710466.shtml",
),
ResourceSource("OI Wiki 竞赛路线图", "https://oi-wiki.org/contest/roadmap/"),
ResourceSource("OI Wiki 竞赛资源", "https://oi-wiki.org/contest/resources/"),
ResourceSource(
"cp-algorithms 首页",
"https://cp-algorithms.com/",
),
),
),
TrackSpec(
slug="learning-cpp-basic",
title="C++ 基础学习资料(面向 CSP",
audience="C++ 零基础或语法不稳,准备进入 CSP-J 的同学",
objective="梳理语法基础、STL 入门、输入输出与 C++14 兼容写法。",
sources=(
ResourceSource("cppreference C++ language", "https://en.cppreference.com/w/cpp/language.html"),
ResourceSource("OI Wiki 语言基础", "https://oi-wiki.org/lang/basic/"),
ResourceSource("OI Wiki 数组", "https://oi-wiki.org/lang/array/"),
ResourceSource("OI Wiki 循环", "https://oi-wiki.org/lang/loop/"),
ResourceSource("OI Wiki 运算符", "https://oi-wiki.org/lang/op/"),
ResourceSource("OI Wiki C++ 标准库", "https://oi-wiki.org/lang/csl/"),
ResourceSource("OI Wiki 文件操作", "https://oi-wiki.org/lang/file-op/"),
),
),
TrackSpec(
slug="learning-csp-j",
title="CSP-J 学习资料与训练路径",
# NOTE(review): the audience text pairs CSP-J with the advanced-group wording
# ("提高组"); CSP-J is normally the junior level — confirm the intended text.
audience="目标 CSP-J 提高组入门,正在建立算法基础的同学",
objective="覆盖模拟、枚举、前缀和、基础搜索与基础 DP,给出循序刷题方案。",
sources=(
ResourceSource("NOI 技术规则", "https://www.noi.cn/gynoi/jsgz/"),
ResourceSource("OI Wiki 模拟", "https://oi-wiki.org/basic/simulate/"),
ResourceSource("OI Wiki 枚举", "https://oi-wiki.org/basic/enumerate/"),
ResourceSource("OI Wiki 前缀和与差分", "https://oi-wiki.org/basic/prefix-sum/"),
ResourceSource("OI Wiki 动态规划基础", "https://oi-wiki.org/dp/basic/"),
ResourceSource("OI Wiki BFS", "https://oi-wiki.org/search/bfs/"),
ResourceSource("OI Wiki DFS", "https://oi-wiki.org/search/dfs/"),
ResourceSource("OI Wiki 常见错误", "https://oi-wiki.org/contest/common-mistakes/"),
),
),
TrackSpec(
slug="learning-csp-s",
title="CSP-S 学习资料与进阶路径",
audience="目标 CSP-S,已具备 CSP-J 基础并准备系统进阶的同学",
objective="覆盖数据结构、图论、字符串与 DP 进阶,强调复杂度与工程规范。",
sources=(
ResourceSource(
"NOI 标准竞赛环境说明2016",
"https://www.noi.cn/gynoi/jsgz/2018-08-13/710465.shtml",
),
ResourceSource(
"NOI 标准竞赛环境说明2012",
"https://www.noi.cn/gynoi/jsgz/2018-08-13/710466.shtml",
),
ResourceSource("OI Wiki 树状数组", "https://oi-wiki.org/ds/fenwick/"),
ResourceSource("OI Wiki 线段树", "https://oi-wiki.org/ds/seg/"),
ResourceSource("OI Wiki 最短路", "https://oi-wiki.org/graph/shortest-path/"),
ResourceSource("OI Wiki 强连通分量", "https://oi-wiki.org/graph/scc/"),
ResourceSource("OI Wiki 最大流", "https://oi-wiki.org/graph/flow/max-flow/"),
ResourceSource("OI Wiki 树上 DP", "https://oi-wiki.org/dp/tree/"),
ResourceSource("OI Wiki KMP", "https://oi-wiki.org/string/kmp/"),
ResourceSource(
"cp-algorithms Segment Tree",
"https://cp-algorithms.com/data_structures/segment_tree.html",
),
),
),
)
def now_sec() -> int:
    """Return the current Unix time truncated to whole seconds."""
    current = time.time()
    return int(current)
def load_dotenv(path: Path) -> None:
    """Load ``KEY=VALUE`` lines from *path* into ``os.environ``.

    A missing file is ignored. Blank lines, ``#`` comments and lines without
    ``=`` are skipped. Variables already present in the environment are never
    overwritten. Surrounding whitespace and quote characters are stripped
    from values.
    """
    if not path.exists():
        return
    content = path.read_text(encoding="utf-8", errors="ignore")
    for raw_line in content.splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#"):
            continue
        name, separator, value = entry.partition("=")
        if not separator:
            continue
        name = name.strip()
        if name and name not in os.environ:
            os.environ[name] = value.strip().strip("\"").strip("'")
def fetch_url(url: str, timeout: int) -> tuple[str, str]:
    """Download *url* and return ``(decoded_text, final_url)``.

    The response is decoded with the detected (apparent) encoding when one
    is available. ``final_url`` is the URL reported by the response object.

    Raises:
        RuntimeError: when the HTTP status is 400 or above.
    """
    response = requests.get(
        url,
        headers={"User-Agent": USER_AGENT},
        timeout=timeout,
        proxies=NO_PROXY,
    )
    response.encoding = response.apparent_encoding or response.encoding
    if response.status_code >= 400:
        raise RuntimeError(f"HTTP {response.status_code}")
    return response.text, response.url
def strip_html(html_text: str, max_chars: int) -> str:
    """Reduce an HTML document to plain text of at most *max_chars* characters.

    Steps: drop ``script``/``style``/``noscript``/``svg``/``canvas`` elements
    with their content, turn ``<br>`` and closing block tags into newlines,
    remove the remaining tags, unescape entities, collapse whitespace and
    drop empty lines. Text longer than *max_chars* is cut and ``"\\n..."``
    is appended.

    The original patterns were raw strings with doubled backslashes
    (``\\\\1``, ``\\\\s``). Those match a literal backslash, so script and
    style bodies leaked into the text and ``<br>`` never became a newline.
    """
    # Remove whole elements whose content is never readable text; \1 is a
    # backreference to the opening tag name.
    text = re.sub(r"(?is)<(script|style|noscript|svg|canvas)[^>]*>.*?</\1>", " ", html_text)
    text = re.sub(r"(?is)<br\s*/?>", "\n", text)
    text = re.sub(r"(?is)</(p|div|section|article|h1|h2|h3|h4|h5|h6|li|tr|table|ul|ol)>", "\n", text)
    text = re.sub(r"(?is)<[^>]+>", " ", text)
    text = html.unescape(text)
    text = text.replace("\r", "\n").replace("\xa0", " ")
    text = re.sub(r"[ \t\f\v]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = "\n".join(line.strip() for line in text.splitlines() if line.strip())
    if len(text) > max_chars:
        text = text[:max_chars].rstrip() + "\n..."
    return text
def extract_title(html_text: str, fallback: str) -> str:
    """Return the text of the first ``<title>`` element, or *fallback*.

    Entities are unescaped and runs of whitespace collapse to single spaces.
    *fallback* is returned when there is no title element or it is blank.
    """
    found = re.search(r"(?is)<title>(.*?)</title>", html_text)
    if found:
        cleaned = re.sub(r"\s+", " ", html.unescape(found.group(1))).strip()
        if cleaned:
            return cleaned
    return fallback
def extract_message_text(content: Any) -> str:
    """Flatten a chat message ``content`` value into plain text.

    A string is stripped. For a list, the non-blank ``text`` strings of its
    dict items are joined with newlines. For a dict, its ``text`` string is
    stripped. Every other shape yields an empty string.
    """
    if isinstance(content, str):
        return content.strip()
    if isinstance(content, list):
        texts = [item.get("text") for item in content if isinstance(item, dict)]
        kept = [t.strip() for t in texts if isinstance(t, str) and t.strip()]
        return "\n".join(kept).strip()
    if isinstance(content, dict):
        value = content.get("text")
        if isinstance(value, str):
            return value.strip()
    return ""
def llm_request(prompt: str, timeout: int, retries: int, retry_sleep_sec: float) -> str:
    """POST *prompt* to the configured chat-completions endpoint and return the reply text.

    The endpoint, key and model come from ``OI_LLM_API_URL``,
    ``OI_LLM_API_KEY`` and ``OI_LLM_MODEL``. Transport errors and the status
    codes in ``RETRYABLE_HTTP_CODES`` are retried up to *retries* attempts,
    sleeping ``retry_sleep_sec * attempt`` seconds between attempts.

    Raises:
        RuntimeError: when the URL is missing, attempts are exhausted, the
            server answers with a non-retryable error status, or the
            response has no choices or no text.
    """
    url = os.getenv("OI_LLM_API_URL", "").strip()
    api_key = os.getenv("OI_LLM_API_KEY", "").strip()
    model = os.getenv("OI_LLM_MODEL", "qwen3-max").strip()
    if not url:
        raise RuntimeError("missing OI_LLM_API_URL")

    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    system_message = {
        "role": "system",
        "content": "你是资深信息学竞赛教练,请严格基于来源内容整理中文 Markdown 学习资料。",
    }
    payload = {
        "model": model,
        "stream": False,
        "temperature": 0.2,
        "messages": [system_message, {"role": "user", "content": prompt}],
    }

    for attempt in range(1, retries + 1):
        out_of_attempts = attempt >= retries
        backoff = retry_sleep_sec * attempt  # wait grows linearly with the attempt number
        try:
            resp = requests.post(url, headers=headers, json=payload, timeout=timeout, proxies=NO_PROXY)
        except requests.RequestException as exc:
            if out_of_attempts:
                raise RuntimeError(f"llm request failed: {exc}") from exc
            time.sleep(backoff)
            continue

        status = resp.status_code
        if status in RETRYABLE_HTTP_CODES:
            if out_of_attempts:
                raise RuntimeError(f"llm retry exhausted: HTTP {status}")
            time.sleep(backoff)
            continue
        if status >= 400:
            raise RuntimeError(f"llm request failed: HTTP {status}: {resp.text[:300]}")

        body = resp.json()
        choices = body.get("choices") or []
        if not choices:
            raise RuntimeError("llm response missing choices")
        first_choice = choices[0] or {}
        text = extract_message_text((first_choice.get("message") or {}).get("content"))
        if not text:
            # Some responses carry the text directly on the choice object.
            text = extract_message_text(first_choice.get("text"))
        if not text:
            raise RuntimeError("llm response missing content")
        return text

    # Reached only when the loop body never ran (retries < 1).
    raise RuntimeError("llm request failed")
def remove_outer_markdown_fence(text: str) -> str:
    """Strip a triple-backtick fence that wraps the whole of *text*.

    LLMs often wrap a Markdown answer in a ``markdown`` / ``md`` code fence.
    When the stripped text starts with such an opening fence line (bare, or
    tagged ``markdown`` or ``md``) and ends with a closing fence, the inner
    content is returned stripped. Otherwise the stripped text is returned
    unchanged.

    The original pattern was a raw string with doubled backslashes
    (``\\\\s``), which matches a literal backslash, so the fence was never
    removed. The opening fence must now be alone on its line, so a document
    that merely begins and ends with a language-tagged code block (for
    example a ``cpp`` block) is left intact.
    """
    raw = text.strip()
    match = re.match(
        r"^```(?:markdown|md)?[ \t]*\r?\n([\s\S]*?)\s*```$",
        raw,
        flags=re.IGNORECASE,
    )
    if match:
        return match.group(1).strip()
    return raw
# Build the LLM prompt for one learning track.
# `source_materials` items are dicts with the keys "label", "url", "title" and
# "snippet" (the shape produced by collect_source_materials). The prompt lists
# every source as "label: url", followed by one numbered excerpt block per
# source, and is stripped of leading/trailing whitespace.
# NOTE(review): the prompt text is a runtime string; its exact line breaks
# cannot be confirmed from this view, so the code is left untouched.
def build_prompt(spec: TrackSpec, source_materials: list[dict[str, str]]) -> str:
# One bullet per source, for the "available sources" list.
source_lines = "\n".join([f"- {it['label']}: {it['url']}" for it in source_materials])
snippets: list[str] = []
# Excerpt blocks are numbered from 1.
for idx, item in enumerate(source_materials, start=1):
snippets.append(
"\n".join(
[
f"[来源 {idx}] {item['label']}",
f"URL: {item['url']}",
f"页面标题: {item['title']}",
"摘录:",
item["snippet"],
]
)
)
all_snippets = "\n\n".join(snippets)
return f"""
请整理一篇中文 Markdown 学习资料文章,主题:{spec.title}
目标读者:{spec.audience}
目标:{spec.objective}
硬性要求:
1. 只输出 Markdown 正文,不要输出解释、前言、JSON 或代码块外的多余说明。
2. 正文不少于 900 字,内容要具体可执行,不能只给提纲。
3. 内容结构至少包含:
- 学习目标
- 知识图谱(按优先级)
- 分阶段训练计划(建议按周)
- 常见失分点与避坑清单
- C++14 / 评测环境规范提醒(明确写出:优先 C++14;避免 C++17 特性;long long 用 %lld;如赛方要求则使用 freopen;main 返回 int 且 return 0
4. 结尾必须包含“## 参考来源”章节,并且只列出本次给定来源,使用 Markdown 链接。
5. 不要编造具体年份政策细节;对于地方性要求,写“以当年官方通知为准”。
6. 风格要可执行,尽量给出检查清单与训练顺序。
可用来源列表:
{source_lines}
来源摘录:
{all_snippets}
""".strip()
def fallback_markdown(spec: TrackSpec, source_materials: list[dict[str, str]], error_text: str) -> str:
    """Build a static Markdown article used when the LLM call fails.

    The article contains the track's title, audience and objective, generic
    training and C++14 reminders, a link per collected source, and a closing
    note with the first 180 characters of *error_text*. The result ends with
    exactly one newline.
    """
    header = [
        f"# {spec.title}",
        "",
        "## 学习目标",
        f"- 读者:{spec.audience}",
        f"- 目标:{spec.objective}",
        "",
    ]
    advice = [
        "## 训练建议",
        "- 每周固定做 3~5 道题:先基础题,再专项题,再限时套题。",
        "- 每次训练后补齐错因:读题失误、边界遗漏、复杂度超限、代码规范错误。",
        "- 建议建立个人模板并反复演练输入输出与边界处理。",
        "",
        "## C++14 / 评测环境规范提醒",
        "- 统一按 C++14 编译,避免 C++17 及以上语法。",
        "- `long long` 输入输出优先使用 `%lld`。",
        "- 若赛方要求文件读写,使用 `freopen(\"xxx.in\", \"r\", stdin)` / `freopen(\"xxx.out\", \"w\", stdout)`。",
        "- `main` 必须是 `int main()` 且 `return 0;`。",
        "- 地方考区细则每年会更新,务必以当年官方通知为准。",
        "",
        "## 参考来源",
    ]
    references = [f"- [{item['label']}]({item['url']})" for item in source_materials]
    footer = [
        "",
        "> 说明:本条目由自动整理流程生成。",
        f"> LLM 调用失败原因:{error_text[:180]}",
    ]
    return "\n".join(header + advice + references + footer).strip() + "\n"
def upsert_article(conn: sqlite3.Connection, slug: str, title: str, content_md: str, ts: int) -> None:
    """Insert the article, or overwrite the existing row that has the same slug.

    On a slug conflict the title, content and ``created_at`` are all replaced
    by the new values. The statement is executed but not committed here.
    """
    statement = """
INSERT INTO kb_articles(slug, title, content_md, created_at)
VALUES(?, ?, ?, ?)
ON CONFLICT(slug) DO UPDATE SET
title=excluded.title,
content_md=excluded.content_md,
created_at=excluded.created_at
"""
    params = (slug, title, content_md, ts)
    conn.execute(statement, params)
def guess_db_path(cli_path: str | None) -> str:
    """Pick the SQLite database path.

    An explicit *cli_path* wins. Otherwise the first existing path among
    ``$CSP_DB_PATH``, ``/data/csp.db``, the Docker volume location and
    ``<repo>/data/csp.db`` is used, with ``/data/csp.db`` as the last resort.
    """
    if cli_path:
        return cli_path

    repo_default = str(Path(__file__).resolve().parents[1] / "data" / "csp.db")
    candidates = (
        os.getenv("CSP_DB_PATH", "").strip(),
        "/data/csp.db",
        "/var/lib/docker/volumes/csp_csp_data/_data/csp.db",
        repo_default,
    )
    existing = (path for path in candidates if path and Path(path).exists())
    return next(existing, "/data/csp.db")
def collect_source_materials(
    spec: TrackSpec,
    timeout: int,
    max_chars_per_source: int,
    max_sources_per_track: int,
) -> list[dict[str, str]]:
    """Fetch and excerpt the first *max_sources_per_track* sources of *spec*.

    A source that fails to download, looks like a 404 page, or yields no
    text is skipped with a message on stderr.

    Returns:
        One dict per usable source with the keys ``label``, ``url`` (the
        final URL), ``title`` and ``snippet``.

    Raises:
        RuntimeError: when no source at all could be collected.
    """
    collected: list[dict[str, str]] = []
    for source in spec.sources[:max_sources_per_track]:
        try:
            page_html, resolved_url = fetch_url(source.url, timeout=timeout)
            page_title = extract_title(page_html, source.label)
            lowered_title = page_title.lower()
            # Some sites answer 200 with an error page; detect it by title.
            if "404" in lowered_title and "not found" in lowered_title:
                raise RuntimeError(f"unexpected 404 title: {page_title}")
            excerpt = strip_html(page_html, max_chars=max_chars_per_source)
            if not excerpt:
                raise RuntimeError("empty extracted text")
            collected.append(
                {
                    "label": source.label,
                    "url": resolved_url,
                    "title": page_title,
                    "snippet": excerpt,
                }
            )
            print(f"[fetch] ok: {spec.slug} <- {source.url}")
        except Exception as exc:
            print(f"[fetch] skip: {spec.slug} <- {source.url} ({exc})", file=sys.stderr)

    if collected:
        return collected
    raise RuntimeError(f"no source material collected for {spec.slug}")
def main() -> int:
    """CLI entry point: build one knowledge-base article per learning track.

    For every track (or only ``--only``), the sources are fetched, the LLM is
    asked for an article, and the result is upserted into ``kb_articles`` and
    committed. An LLM failure or a too-short answer falls back to a static
    article. With ``--dry-run`` nothing is written.

    The database connection is now closed in a ``finally`` block. Before, an
    exception from any track (for example when no source could be collected)
    left the connection open.

    Returns:
        0 when at least one track was processed, otherwise 1.
    """
    parser = argparse.ArgumentParser(description="Import web learning resources into kb_articles")
    parser.add_argument("--db-path", default="", help="SQLite path (default: auto-detect)")
    parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT)
    parser.add_argument("--retries", type=int, default=5)
    parser.add_argument("--retry-sleep-sec", type=float, default=1.5)
    parser.add_argument("--max-chars-per-source", type=int, default=900)
    parser.add_argument("--max-sources-per-track", type=int, default=3)
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument(
        "--only",
        default="",
        help="Only process one track slug, e.g. learning-csp-j",
    )
    args = parser.parse_args()

    repo_root = Path(__file__).resolve().parents[1]
    load_dotenv(repo_root / ".env")
    db_path = guess_db_path(args.db_path or None)
    print(f"[db] using: {db_path}")

    conn = sqlite3.connect(db_path)
    processed = 0
    try:
        conn.execute("PRAGMA foreign_keys=ON")
        conn.execute("PRAGMA busy_timeout=10000")
        for spec in TRACKS:
            if args.only and spec.slug != args.only.strip():
                continue
            print(f"[track] start: {spec.slug}")
            # CLI values are clamped to sane minimums before use.
            materials = collect_source_materials(
                spec,
                timeout=max(5, args.timeout),
                max_chars_per_source=max(600, args.max_chars_per_source),
                max_sources_per_track=max(1, args.max_sources_per_track),
            )
            prompt = build_prompt(spec, materials)
            markdown: str
            try:
                markdown = llm_request(
                    prompt=prompt,
                    timeout=max(15, args.timeout * 2),
                    retries=max(1, args.retries),
                    retry_sleep_sec=max(0.2, args.retry_sleep_sec),
                )
                markdown = remove_outer_markdown_fence(markdown)
                if os.getenv("KB_IMPORT_DEBUG", "").strip():
                    preview = markdown.strip().replace("\n", "\\n")
                    print(
                        f"[llm] raw: {spec.slug} len={len(markdown.strip())} preview={preview[:220]}",
                        file=sys.stderr,
                    )
                # An implausibly short answer is treated as a failure.
                if len(markdown.strip()) < 120:
                    raise RuntimeError("llm output too short")
                print(f"[llm] ok: {spec.slug}")
            except Exception as exc:
                print(f"[llm] fallback: {spec.slug} ({exc})", file=sys.stderr)
                markdown = fallback_markdown(spec, materials, str(exc))
            content = markdown.strip() + "\n"
            if args.dry_run:
                print(f"[dry-run] {spec.slug}: {len(content)} chars")
                processed += 1
                continue
            upsert_article(conn, spec.slug, spec.title, content, now_sec())
            # Commit per track so earlier articles survive a later failure.
            conn.commit()
            processed += 1
            print(f"[db] upserted: {spec.slug}")
    finally:
        conn.close()
    print(f"[done] processed tracks: {processed}")
    return 0 if processed > 0 else 1


if __name__ == "__main__":
    raise SystemExit(main())

文件差异内容过多而无法显示 加载差异