feat: expand platform management, admin controls, and learning workflows
这个提交包含在:
@@ -0,0 +1,272 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate CSP-J/S style feedback for one submission via LLM (with fallback)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import requests
|
||||
|
||||
# Reference links attached to a result whenever the LLM supplies no usable
# links of its own (also rendered at the end of the rule-based fallback report).
DEFAULT_LINKS: List[Dict[str, str]] = [
    dict(title="NOI 官网(规则与环境)", url="https://www.noi.cn/"),
    dict(title="OI Wiki(算法知识库)", url="https://oi-wiki.org/"),
    dict(title="cppreference C++14", url="https://en.cppreference.com/w/cpp/14"),
    dict(
        title="GCC Warning Options",
        url="https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html",
    ),
    dict(title="洛谷(题解与训练)", url="https://www.luogu.com.cn/"),
]
|
||||
|
||||
|
||||
@dataclass
class LlmResult:
    """Feedback produced for one submission, by the LLM or by the rule fallback."""

    # Set to True by every constructor call in this script.
    ok: bool
    # Markdown text of the feedback.
    feedback_md: str
    # Reference links; each entry carries "title" and "url".
    links: List[Dict[str, str]]
    # Model name reported by the API, or "fallback-rules" for the rule fallback.
    model_name: str
    # "ready" / LLM-supplied status, or "fallback" for the rule fallback.
    status: str
|
||||
|
||||
|
||||
def env(name: str, default: str = "") -> str:
    """Return environment variable ``name`` stripped of whitespace.

    Falls back to ``default`` when the variable is unset or blank.
    """
    cleaned = os.getenv(name, "").strip()
    if cleaned:
        return cleaned
    return default
|
||||
|
||||
|
||||
def load_input(path: str) -> Dict[str, Any]:
    """Read the UTF-8 JSON file at ``path`` and return its top-level object.

    Raises:
        ValueError: if the document's top-level value is not a JSON object.
    """
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
    if isinstance(payload, dict):
        return payload
    raise ValueError("input json must be object")
|
||||
|
||||
|
||||
# (compiled pattern, hint) pairs; patterns are matched against the submitted
# code concatenated with the compile log.
_CPP14_RISK_CHECKS = (
    (re.compile(r"\bif\s+constexpr\b"), "检测到 `if constexpr`(C++17),C++14 环境会 CE。"),
    (re.compile(r"\bstd::optional\b"), "检测到 `std::optional`(C++17),建议改为普通变量+标记位。"),
    (re.compile(r"\bstd::variant\b"), "检测到 `std::variant`(C++17),建议改为 struct/enum 分支。"),
    (re.compile(r"\bstd::string_view\b"), "检测到 `std::string_view`(C++17),建议改为 `const string&`。"),
    (
        # Structured bindings always start with `auto` (optionally const / & / &&)
        # followed by a bracketed name list, then `=` or the `:` of a range-for.
        # Requiring `auto` avoids flagging ordinary array accesses such as
        # `a[i] = 1` or `a[i] == x`.
        re.compile(r"\b(?:const\s+)?auto(?:\s*&&?)?\s*\[[^\]\n]+\]\s*[=:]"),
        "检测到结构化绑定迹象,C++14 不支持,建议改 pair/struct 访问。",
    ),
    (re.compile(r"%I64d"), "检测到 `%I64d`,Linux 评测机应统一使用 `%lld`。"),
    (re.compile(r"\bvoid\s+main\s*\("), "检测到 `void main()`,需改为 `int main()` 并 `return 0;`。"),
)


def detect_cpp14_risk(code: str, compile_log: str) -> List[str]:
    """Return hints for C++14-incompatible or contest-unsafe constructs.

    Args:
        code: submitted C++ source.
        compile_log: compiler output for the submission (may be empty).

    Returns:
        One hint string per matched check, in check order, followed by a
        sign-compare hint when the compile log mentions ``-Wsign-compare``.
        Empty when nothing matched.
    """
    joined = f"{code}\n{compile_log}"
    hints = [tip for pattern, tip in _CPP14_RISK_CHECKS if pattern.search(joined)]
    if "-Wsign-compare" in compile_log:
        hints.append("存在 `-Wsign-compare`,建议统一使用 `size_t` 或显式类型转换。")
    return hints
|
||||
|
||||
|
||||
def build_fallback_feedback(payload: Dict[str, Any], llm_error: str = "") -> LlmResult:
    """Build rule-based Markdown feedback without calling the LLM.

    Args:
        payload: submission data; the keys ``status``, ``score``,
            ``compile_log``, ``runtime_log`` and ``code`` are read, each with
            a default when missing.
        llm_error: when non-empty, appended as a trailing note explaining that
            the LLM call failed.

    Returns:
        An ``LlmResult`` with ``model_name="fallback-rules"``,
        ``status="fallback"`` and ``DEFAULT_LINKS`` as links.
    """
    status = str(payload.get("status", "Unknown"))
    score = payload.get("score", 0)
    compile_log = str(payload.get("compile_log", ""))
    runtime_log = str(payload.get("runtime_log", ""))
    code = str(payload.get("code", ""))

    # Generic reminders are used only when no concrete risk was detected.
    risk_tips = detect_cpp14_risk(code, compile_log) or [
        "请确认只使用 C++14 语法,避免 C++17 特性导致 CE。",
        '若题目要求文件输入输出,使用 `freopen("xxx.in","r",stdin)` / `freopen("xxx.out","w",stdout)`。',
    ]

    if status.upper() == "AC":
        thought = "代码通过了当前评测,核心思路基本正确,建议继续做规范化和鲁棒性收敛。"
    else:
        thought = "当前提交未稳定通过,建议先按日志定位错误,再拆分为思路问题与实现问题逐步修复。"

    report: List[str] = [
        "### 评测结论",
        f"- 本次状态:**{status}**,分数:**{score}**。",
        f"- 思路评价:{thought}",
        "",
        "### 福建 CSP-J/S 规范检查(C++14)",
    ]
    report.extend(f"- {tip}" for tip in risk_tips)
    if compile_log.strip():
        report.append("- 编译日志有信息,建议逐条清理 warning,减少考场不确定性。")
    if runtime_log.strip():
        report.append("- 运行日志有输出,建议重点检查边界输入与数组越界风险。")

    report += [
        "",
        "### 修改建议(可执行)",
        "- 按“先编译通过→再保证正确→最后做优化”的顺序迭代。",
        "- `long long` 读写统一 `%lld`;不要使用 `%I64d`。",
        "- 清理 signed/unsigned 警告,降低不同编译器行为差异。",
        "- 确保 `int main()` 且 `return 0;`。",
        "",
        "### 知识点评测",
        "- 强项:基础实现与调试流程。",
        "- 待加强:边界构造、类型一致性、赛场环境兼容性。",
        "",
        "### 推荐外链资料",
    ]
    report.extend(f"- [{item['title']}]({item['url']})" for item in DEFAULT_LINKS)

    if llm_error:
        report += ["", f"> 说明:LLM 调用失败,已返回规则兜底建议。错误:{llm_error}"]

    feedback = "\n".join(report).strip()
    return LlmResult(
        ok=True,
        feedback_md=feedback,
        links=DEFAULT_LINKS,
        model_name="fallback-rules",
        status="fallback",
    )
|
||||
|
||||
|
||||
def normalize_links(raw: Any) -> List[Dict[str, str]]:
    """Clean the LLM-supplied link list, falling back to ``DEFAULT_LINKS``.

    Args:
        raw: value of the ``links`` field from the LLM reply; anything other
            than a list yields the defaults.

    Returns:
        ``{"title", "url"}`` dicts with both values stripped and non-empty.
        Non-dict items and items with a missing, null or blank title/url are
        dropped. ``DEFAULT_LINKS`` is returned when nothing usable remains.
    """
    if not isinstance(raw, list):
        return DEFAULT_LINKS

    links: List[Dict[str, str]] = []
    for item in raw:
        if not isinstance(item, dict):
            continue
        # `or ""` keeps a JSON null from being stringified into the literal "None".
        title = str(item.get("title") or "").strip()
        url = str(item.get("url") or "").strip()
        if title and url:
            links.append({"title": title, "url": url})
    return links or DEFAULT_LINKS
|
||||
|
||||
|
||||
def dict_to_markdown(data: Dict[str, Any]) -> str:
    """Render a dict as Markdown, one ``###`` section per non-empty entry.

    String values are stripped; other values are pretty-printed as JSON.
    Entries whose rendered body is empty are skipped, and a blank key gets a
    placeholder heading. Sections are separated by a blank line.
    """
    sections: List[str] = []
    for key, value in data.items():
        heading = str(key).strip() or "分析项"
        body = (
            value.strip()
            if isinstance(value, str)
            else json.dumps(value, ensure_ascii=False, indent=2)
        )
        if body:
            sections.append(f"### {heading}\n{body}")
    return "\n\n".join(sections)
|
||||
|
||||
|
||||
def _parse_llm_json(content: str) -> Optional[Dict[str, Any]]:
    """Parse the LLM reply as a JSON object, tolerating a Markdown code fence.

    Returns the decoded dict, or None when the reply is not a JSON object.
    """
    text = content.strip()
    # Models often wrap JSON in ```json ... ``` even when told not to.
    fenced = re.fullmatch(r"```(?:json)?\s*([\s\S]*?)\s*```", text, flags=re.IGNORECASE)
    if fenced:
        text = fenced.group(1)
    try:
        candidate = json.loads(text)
    except (ValueError, RecursionError):
        return None
    return candidate if isinstance(candidate, dict) else None


def call_llm(payload: Dict[str, Any]) -> LlmResult:
    """Ask the configured chat-completions endpoint to review one submission.

    The endpoint URL and key come from ``OI_LLM_API_URL`` / ``OI_LLM_API_KEY``
    (with ``CSP_LLM_*`` as fallback names); the model from ``OI_LLM_MODEL``.

    Args:
        payload: submission data, embedded verbatim in the user prompt.

    Returns:
        An ``LlmResult`` built, in order of preference, from the reply's
        ``feedback_md`` field, from a Markdown rendering of the reply object,
        or from the raw reply text.

    Raises:
        RuntimeError: when the API URL is not configured, or the response has
            no choices or no text content.
        requests.RequestException: on transport errors or HTTP error statuses.
    """
    api_url = env("OI_LLM_API_URL") or env("CSP_LLM_API_URL")
    api_key = env("OI_LLM_API_KEY") or env("CSP_LLM_API_KEY")
    model = env("OI_LLM_MODEL", "qwen3-max")
    if not api_url:
        raise RuntimeError("missing OI_LLM_API_URL")

    system_prompt = (
        "你是福建省 CSP-J/S 代码规范与评测老师。"
        "请严格按 C++14 旧 GCC 环境给建议,重点指出会导致 CE/RE/爆零的风险。"
        "输出 JSON,不要输出其他文字。"
    )
    user_prompt = {
        "task": "分析这份提交并给出改进建议",
        "required_sections": [
            "评测结论",
            "福建 CSP-J/S 规范检查(C++14)",
            "修改建议",
            "知识点评测",
            "推荐外链资料",
        ],
        "submission": payload,
        "output_json_schema": {
            "feedback_md": "markdown string",
            "links": [{"title": "string", "url": "string"}],
            "status": "ready",
        },
    }

    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    body = {
        "model": model,
        "stream": False,
        "temperature": 0.1,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": json.dumps(user_prompt, ensure_ascii=False)},
        ],
    }

    resp = requests.post(api_url, headers=headers, json=body, timeout=50)
    resp.raise_for_status()
    data = resp.json()

    choices = data.get("choices") if isinstance(data, dict) else None
    if not choices:
        raise RuntimeError("LLM response missing choices")
    first = choices[0] if isinstance(choices, list) else {}
    message = first.get("message") if isinstance(first, dict) else {}
    content = message.get("content", "") if isinstance(message, dict) else ""
    if not isinstance(content, str) or not content.strip():
        raise RuntimeError("LLM content is empty")

    model_name = str(data.get("model", model)) if isinstance(data, dict) else model
    parsed = _parse_llm_json(content)

    if parsed:
        feedback_md = str(parsed.get("feedback_md") or "").strip()
        if feedback_md:
            return LlmResult(
                ok=True,
                feedback_md=feedback_md,
                links=normalize_links(parsed.get("links")),
                model_name=model_name,
                # `or` also covers an explicit null status.
                status=str(parsed.get("status") or "ready"),
            )
        # Valid JSON without the expected field: render whatever came back.
        rendered = dict_to_markdown(parsed)
        if rendered:
            return LlmResult(
                ok=True,
                feedback_md=rendered,
                links=DEFAULT_LINKS,
                model_name=model_name,
                status="ready",
            )

    # Not JSON (or nothing renderable): pass the raw text through.
    return LlmResult(
        ok=True,
        feedback_md=content.strip(),
        links=DEFAULT_LINKS,
        model_name=model_name,
        status="ready",
    )
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: analyze one submission and print the result as JSON.

    Reads the submission from ``--input-file``, tries the LLM first and falls
    back to rule-based feedback on any failure. Returns process exit code 0.
    """
    cli = argparse.ArgumentParser(description="Analyze one submission with LLM + fallback")
    cli.add_argument("--input-file", required=True, help="JSON file from backend")
    options = cli.parse_args()

    payload = load_input(options.input_file)
    try:
        result = call_llm(payload)
    except Exception as exc:
        # Any LLM failure degrades to the rule-based report; the error text
        # is carried into the feedback.
        result = build_fallback_feedback(payload, str(exc))

    report = dict(
        feedback_md=result.feedback_md,
        links=result.links,
        model_name=result.model_name,
        status=result.status,
    )
    sys.stdout.write(json.dumps(report, ensure_ascii=False))
    return 0
|
||||
|
||||
|
||||
# Run as a script: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -7,7 +7,10 @@ import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
@@ -15,6 +18,37 @@ from typing import Any
|
||||
import requests
|
||||
|
||||
# HTTP status codes considered retryable (usage is outside this view —
# presumably the LLM request retry loop; confirm against llm_request).
RETRYABLE_HTTP_CODES = {500, 502, 503, 504}

# Absolute paths of optional external tools; None when not on PATH.
CLANG_FORMAT_BIN = shutil.which("clang-format")
GXX_BIN = shutil.which("g++")

# Lower-case substrings that mark generated code as an unfinished stub.
PLACEHOLDER_CODE_MARKERS = (
    "todo",
    "to do",
    "请根据题意补全",
    "待补全",
    "自行补全",
    "省略",
    "your code here",
)

# (pattern, label) pairs for constructs that are not available in C++14.
CPP17_BANNED_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = (
    (re.compile(r"\bif\s+constexpr\b"), "if constexpr"),
    (re.compile(r"\bstd::optional\b"), "std::optional"),
    (re.compile(r"\bstd::variant\b"), "std::variant"),
    (re.compile(r"\bstd::any\b"), "std::any"),
    (re.compile(r"\bstd::string_view\b"), "std::string_view"),
    (re.compile(r"\bstd::filesystem\b"), "std::filesystem"),
    (re.compile(r"\bstd::byte\b"), "std::byte"),
    (re.compile(r"\bstd::clamp\s*\("), "std::clamp"),
    (re.compile(r"\bstd::gcd\s*\("), "std::gcd"),
    (re.compile(r"\bstd::lcm\s*\("), "std::lcm"),
    (
        re.compile(
            r"#\s*include\s*<\s*(optional|variant|any|string_view|filesystem|charconv|execution)\s*>"
        ),
        "C++17 header",
    ),
    (
        # `auto [a, b] = ...` with optional const / & / && qualifiers.
        re.compile(
            r"\b(?:const\s+)?auto(?:\s*&|\s*&&)?\s*\[[^\]\n]+\]\s*=",
            flags=re.MULTILINE,
        ),
        "structured bindings",
    ),
)
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -32,6 +66,17 @@ def now_sec() -> int:
|
||||
return int(time.time())
|
||||
|
||||
|
||||
def env_bool(key: str, default: bool) -> bool:
    """Read environment variable ``key`` as a boolean flag.

    ``1/true/yes/on`` map to True and ``0/false/no/off`` to False
    (case-insensitive, surrounding whitespace ignored). An unset, blank or
    unrecognized value yields ``default``.
    """
    token = os.getenv(key, "").strip().lower()
    if token in {"1", "true", "yes", "on"}:
        return True
    if token in {"0", "false", "no", "off"}:
        return False
    # Covers both the empty string and any unrecognized spelling.
    return default
|
||||
|
||||
|
||||
def extract_json_object(text: str) -> dict[str, Any] | None:
|
||||
raw = text.strip()
|
||||
if raw.startswith("```"):
|
||||
@@ -54,6 +99,253 @@ def extract_json_object(text: str) -> dict[str, Any] | None:
|
||||
return None
|
||||
|
||||
|
||||
def extract_message_text(content: Any) -> str:
    """Flatten a chat message ``content`` value into plain stripped text.

    * str  -> the stripped string.
    * list -> for each dict item, its non-blank ``text`` (or, failing that,
      its non-blank ``content``); parts are joined with newlines.
    * dict -> its ``text`` when that is a string (even a blank one),
      otherwise its ``content`` when that is a string.
    * anything else -> "".
    """
    if isinstance(content, str):
        return content.strip()

    if isinstance(content, dict):
        for field in ("text", "content"):
            value = content.get(field)
            if isinstance(value, str):
                return value.strip()
        return ""

    if isinstance(content, list):
        pieces: list[str] = []
        for part in content:
            if not isinstance(part, dict):
                continue
            for field in ("text", "content"):
                value = part.get(field)
                if isinstance(value, str) and value.strip():
                    pieces.append(value.strip())
                    break
        return "\n".join(pieces).strip()

    return ""
|
||||
|
||||
|
||||
def iter_json_candidates(text: str) -> list[str]:
    """List substrings of ``text`` that may be the JSON payload, best first.

    Candidates, in order: the whole stripped text, the contents of each
    Markdown code fence, then every JSON object/array that decodes starting
    at a ``{`` or ``[`` within the first 200000 characters (nested values
    included). Duplicates are removed, keeping the first occurrence.

    Returns:
        The de-duplicated candidate strings; empty for empty input.
    """
    if not text:
        return []

    candidates: list[str] = []
    raw = text.strip()
    if raw:
        candidates.append(raw)

    for match in re.finditer(r"```(?:json)?\s*([\s\S]*?)```", text, flags=re.IGNORECASE):
        block = match.group(1).strip()
        if block:
            candidates.append(block)

    decoder = json.JSONDecoder()
    sample = text[:200000]
    for idx, ch in enumerate(sample):
        if ch not in "{[":
            continue
        try:
            # Decode in place from `idx`; slicing `sample[idx:]` here would
            # copy the tail of the text once per bracket.
            _, end = decoder.raw_decode(sample, idx)
        except json.JSONDecodeError:
            continue
        snippet = sample[idx:end].strip()
        if snippet:
            candidates.append(snippet)

    # dict preserves insertion order, so this keeps the first of each duplicate.
    return list(dict.fromkeys(candidates))
|
||||
|
||||
|
||||
def extract_solution_rows(content: str) -> list[dict[str, Any]]:
    """Find the first usable solutions list inside an LLM reply.

    Each JSON candidate from ``iter_json_candidates`` is tried in order. A
    dict contributes its ``solutions`` value (or ``data.solutions`` when the
    former is absent); a list is used directly.

    Returns:
        The dict entries of the first candidate list that has any; otherwise
        an empty list.
    """
    for candidate in iter_json_candidates(content):
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue

        if isinstance(parsed, list):
            rows: Any = parsed
        elif isinstance(parsed, dict):
            rows = parsed.get("solutions")
            nested = parsed.get("data")
            if rows is None and isinstance(nested, dict):
                rows = nested.get("solutions")
        else:
            rows = None

        if not isinstance(rows, list):
            continue
        dict_rows = [entry for entry in rows if isinstance(entry, dict)]
        if dict_rows:
            return dict_rows
    return []
|
||||
|
||||
|
||||
def is_placeholder_code(code: str) -> bool:
    """Tell whether ``code`` looks like an unfinished stub.

    True when the code contains any ``PLACEHOLDER_CODE_MARKERS`` entry
    (case-insensitive) or a literal ``...`` ellipsis. A None/empty value is
    treated as empty text and yields False.
    """
    # Normalize once so both checks tolerate None; the ellipsis check
    # previously used the raw argument and raised TypeError on None.
    text = code or ""
    lowered = text.lower()
    if any(marker in lowered for marker in PLACEHOLDER_CODE_MARKERS):
        return True
    return "..." in text
|
||||
|
||||
|
||||
def cpp14_violations(code: str) -> list[str]:
    """Return the labels of all ``CPP17_BANNED_PATTERNS`` that match ``code``.

    Labels come back in table order; the list is empty when nothing matches.
    """
    return [label for pattern, label in CPP17_BANNED_PATTERNS if pattern.search(code)]
|
||||
|
||||
|
||||
def compiles_under_cpp14(code: str) -> tuple[bool, str]:
    """Syntax-check ``code`` with ``g++ -std=gnu++14 -fsyntax-only``.

    Args:
        code: complete C++ source to check.

    Returns:
        ``(True, "")`` when the check passes or when no ``g++`` was found on
        PATH (the check is then skipped). Otherwise ``(False, message)`` with
        the first 400 characters of the compiler's stderr, or a timeout note
        when the compiler did not finish in time.
    """
    if not GXX_BIN:
        return True, ""

    timeout_sec = 12
    with tempfile.TemporaryDirectory(prefix="csp_sol_cpp14_") as tmp:
        src_path = os.path.join(tmp, "main.cpp")
        with open(src_path, "w", encoding="utf-8") as f:
            f.write(code if code.endswith("\n") else f"{code}\n")
        try:
            proc = subprocess.run(
                [GXX_BIN, "-std=gnu++14", "-O2", "-Wall", "-Wextra", "-Wpedantic", "-fsyntax-only", src_path],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
                timeout=timeout_sec,
            )
        except subprocess.TimeoutExpired:
            # Report a rejection for this one solution instead of letting the
            # exception abort validation of the whole batch.
            return False, f"g++ syntax check timed out after {timeout_sec}s"

    if proc.returncode == 0:
        return True, ""
    err = proc.stderr.decode("utf-8", errors="ignore").strip()
    return False, err[:400]
|
||||
|
||||
|
||||
def normalize_solutions(rows: list[dict[str, Any]], max_solutions: int) -> tuple[list[dict[str, Any]], list[str]]:
    """Validate raw LLM solution rows and keep those with usable C++14 code.

    A row is rejected when its ``code_cpp`` is empty, has no ``main(``
    definition, looks like a placeholder, uses a banned C++17 construct, or
    fails the g++ C++14 syntax check.

    Args:
        rows: candidate solution dicts from the LLM reply.
        max_solutions: stop after this many rows have been accepted.

    Returns:
        ``(normalized, rejected)``: accepted rows reduced to the keys
        ``title``, ``idea_md``, ``explanation_md``, ``complexity``,
        ``code_cpp`` and ``tags``; and one reason string per rejected row.
    """
    normalized: list[dict[str, Any]] = []
    rejected: list[str] = []
    for row in rows:
        title = str(row.get("title") or "").strip()
        idea_md = str(row.get("idea_md") or "").strip()
        explanation_md = str(row.get("explanation_md") or "").strip()
        complexity = str(row.get("complexity") or "").strip()
        code_cpp = str(row.get("code_cpp") or "")
        tags = row.get("tags") if isinstance(row.get("tags"), list) else []

        if not code_cpp.strip():
            rejected.append("empty code_cpp")
            continue
        # Allow whitespace before the parenthesis (`int main ()`), which the
        # plain substring test "main(" wrongly rejected.
        if not re.search(r"\bmain\s*\(", code_cpp):
            rejected.append("missing main()")
            continue
        if is_placeholder_code(code_cpp):
            rejected.append("placeholder code")
            continue
        violations = cpp14_violations(code_cpp)
        if violations:
            rejected.append(f"C++17+ feature: {', '.join(violations[:3])}")
            continue
        ok_cpp14, compile_msg = compiles_under_cpp14(code_cpp)
        if not ok_cpp14:
            rejected.append(f"cannot compile with -std=gnu++14: {compile_msg}")
            continue

        normalized.append(
            {
                "title": title,
                "idea_md": idea_md,
                "explanation_md": explanation_md,
                "complexity": complexity,
                "code_cpp": code_cpp,
                "tags": tags,
            }
        )
        if len(normalized) >= max_solutions:
            break
    return normalized, rejected
|
||||
|
||||
|
||||
def build_prompt(problem: Problem, max_solutions: int) -> str:
    """Compose the LLM prompt asking for ``max_solutions`` C++14 solutions.

    The prompt states the hard output requirements, shows the expected JSON
    shape, and embeds the problem's title, difficulty, source, full statement
    and sample input/output. Leading and trailing whitespace is stripped.
    """
    prompt = f"""
请为下面这道 CSP 题生成 {max_solutions} 种不同思路的题解(可从模拟/贪心/DP/图论/数据结构等不同角度切入),并给出可直接提交的 C++14 参考代码。

硬性要求:
1. 必须只输出一个 JSON 对象,不能有任何 JSON 外文本。
2. JSON 必须符合下面格式,且 solutions 数组长度应为 {max_solutions}。
3. 每个 code_cpp 必须是完整、可编译、可运行的 C++14 程序(包含 main 函数),不能出现 TODO、伪代码、占位注释、省略号。
4. 必须兼容 GCC 4.9/5.4 + -std=gnu++14:严禁使用 C++17 及以上特性(如 structured bindings、if constexpr、std::optional、std::variant、std::any、std::string_view、<filesystem>)。
5. 建议使用标准头文件(如 <iostream>/<vector>/<algorithm> 等),不要使用 <bits/stdc++.h>。
6. main 必须是 int main(),并且 return 0;。若使用 scanf/printf 处理 long long,格式符必须用 %lld,不要用 %I64d。
7. 代码风格清晰,变量命名可读,注释简洁。

输出 JSON,格式固定:
{{
"solutions": [
{{
"title": "解法标题",
"idea_md": "思路要点(Markdown)",
"explanation_md": "详细讲解(Markdown)",
"complexity": "时间/空间复杂度",
"code_cpp": "完整 C++14 代码",
"tags": ["标签1","标签2"]
}}
]
}}

题目信息:
- 题目:{problem.title}
- 难度:{problem.difficulty}
- 来源:{problem.source}

完整题面(原文,不做截断):
{problem.statement_md}

样例输入(原文):
{problem.sample_input}

样例输出(原文):
{problem.sample_output}
"""
    return prompt.strip()
|
||||
|
||||
|
||||
def parse_solutions_or_raise(content: str, max_solutions: int) -> list[dict[str, Any]]:
    """Extract and validate solutions from an LLM reply.

    Returns:
        The validated solution rows (at most ``max_solutions``).

    Raises:
        RuntimeError: when no solutions array can be found, or when every row
            is rejected (the message then quotes the first rejection reason).
    """
    rows = extract_solution_rows(content)
    if not rows:
        raise RuntimeError("llm response missing valid solutions array")

    normalized, rejected = normalize_solutions(rows, max_solutions=max_solutions)
    if normalized:
        return normalized

    reason = ""
    if rejected:
        reason = f"; rejected sample: {rejected[0][:180]}"
    raise RuntimeError(f"llm response contains no runnable full code{reason}")
|
||||
|
||||
|
||||
def generate_solutions_with_llm(
    prompt: str,
    max_solutions: int,
    timeout: int,
    retries: int,
    sleep_sec: float,
) -> list[dict[str, Any]]:
    """Request solutions from the LLM, with one repair round on bad output.

    If the first reply cannot be parsed into valid solutions, the model is
    asked again with the failure reason prepended to the original prompt.

    Raises:
        RuntimeError: when the second reply is unusable as well; the message
            carries both failure reasons.
    """
    request_opts = {"timeout": timeout, "retries": retries, "sleep_sec": sleep_sec}

    first_reply = llm_request(prompt, **request_opts)
    try:
        return parse_solutions_or_raise(first_reply, max_solutions=max_solutions)
    except Exception as first_exc:
        first_reason = str(first_exc)
        # Tell the model what went wrong, then repeat the full original prompt.
        repair_prompt = (
            f"你上一条回复不符合要求,原因是:{first_reason[:240]}。"
            "请只输出合法 JSON,并确保 code_cpp 是完整可运行 C++14 代码(兼容 -std=gnu++14)。\n\n"
            f"{prompt}"
        )
        second_reply = llm_request(repair_prompt, **request_opts)
        try:
            return parse_solutions_or_raise(second_reply, max_solutions=max_solutions)
        except Exception as second_exc:
            second_reason = str(second_exc)
            raise RuntimeError(
                f"parse failed after retry: first={first_reason[:200]}; second={second_reason[:200]}"
            ) from second_exc
|
||||
|
||||
|
||||
def llm_request(prompt: str, timeout: int, retries: int, sleep_sec: float) -> str:
|
||||
url = os.getenv("OI_LLM_API_URL", "").strip()
|
||||
api_key = os.getenv("OI_LLM_API_KEY", "").strip()
|
||||
@@ -102,10 +394,14 @@ def llm_request(prompt: str, timeout: int, retries: int, sleep_sec: float) -> st
|
||||
choices = payload.get("choices") or []
|
||||
if not choices:
|
||||
raise RuntimeError("llm response missing choices")
|
||||
content = ((choices[0] or {}).get("message") or {}).get("content")
|
||||
choice0 = choices[0] or {}
|
||||
message = choice0.get("message") or {}
|
||||
content = extract_message_text(message.get("content"))
|
||||
if not content:
|
||||
content = extract_message_text(choice0.get("text"))
|
||||
if not content:
|
||||
raise RuntimeError("llm response missing content")
|
||||
return str(content)
|
||||
return content
|
||||
|
||||
if last_error:
|
||||
raise RuntimeError(f"llm request failed: {last_error}") from last_error
|
||||
@@ -119,7 +415,20 @@ def fallback_solutions(max_solutions: int) -> list[dict[str, Any]]:
|
||||
"idea_md": "按题意拆分步骤,先写可过样例的直观解法,再补边界处理。",
|
||||
"explanation_md": "适用于数据范围较小或规则清晰的题。",
|
||||
"complexity": "时间复杂度依题而定,通常 O(n)~O(n^2)",
|
||||
"code_cpp": "// TODO: 请根据题意补全\n#include <bits/stdc++.h>\nusing namespace std;\nint main(){ios::sync_with_stdio(false);cin.tie(nullptr);return 0;}\n",
|
||||
"code_cpp": """
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
using namespace std;
|
||||
|
||||
int main() {
|
||||
ios::sync_with_stdio(false);
|
||||
cin.tie(nullptr);
|
||||
|
||||
// TODO: 请根据题意补全读入、核心逻辑与输出。
|
||||
return 0;
|
||||
}
|
||||
""".strip(),
|
||||
"tags": ["simulation", "implementation"],
|
||||
},
|
||||
{
|
||||
@@ -127,13 +436,55 @@ def fallback_solutions(max_solutions: int) -> list[dict[str, Any]]:
|
||||
"idea_md": "分析状态与重复计算,尝试用前缀和、贪心或动态规划优化。",
|
||||
"explanation_md": "比直接模拟更稳定,通常能覆盖更大数据规模。",
|
||||
"complexity": "通常优于朴素解法",
|
||||
"code_cpp": "// TODO: 请根据题意补全\n#include <bits/stdc++.h>\nusing namespace std;\nint main(){ios::sync_with_stdio(false);cin.tie(nullptr);return 0;}\n",
|
||||
"code_cpp": """
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
using namespace std;
|
||||
|
||||
int main() {
|
||||
ios::sync_with_stdio(false);
|
||||
cin.tie(nullptr);
|
||||
|
||||
// TODO: 请根据题意补全优化版思路实现。
|
||||
return 0;
|
||||
}
|
||||
""".strip(),
|
||||
"tags": ["optimization", "dp"],
|
||||
},
|
||||
]
|
||||
return base[: max(1, max_solutions)]
|
||||
|
||||
|
||||
def format_cpp_code(raw: str) -> str:
    """Normalize line endings and, when possible, run clang-format on C++ code.

    Args:
        raw: C++ source; None or blank input yields "".

    Returns:
        The code with ``\\n`` line endings and a trailing newline. When
        ``clang-format`` is available and succeeds, its output is returned
        instead. Formatting is best effort: if the tool is missing, fails to
        start, times out, exits non-zero or prints nothing, the normalized
        input is returned unchanged.
    """
    code = (raw or "").replace("\r\n", "\n").replace("\r", "\n")
    if not code.strip():
        return ""
    if not code.endswith("\n"):
        code += "\n"

    if not CLANG_FORMAT_BIN:
        return code

    try:
        proc = subprocess.run(
            [CLANG_FORMAT_BIN, "--style={BasedOnStyle: Google, IndentWidth: 2, ColumnLimit: 0}"],
            input=code.encode("utf-8"),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
            timeout=6,
        )
    except (OSError, subprocess.SubprocessError):
        # Tool could not be launched or timed out: keep the unformatted code.
        # Only these expected failures are absorbed, so programming errors
        # are no longer silently hidden.
        return code

    if proc.returncode != 0 or not proc.stdout:
        return code
    out = proc.stdout.decode("utf-8", errors="ignore")
    if not out.strip():
        return code
    return out if out.endswith("\n") else f"{out}\n"
|
||||
|
||||
|
||||
def load_problem(conn: sqlite3.Connection, problem_id: int) -> Problem:
|
||||
cur = conn.execute(
|
||||
"SELECT id,title,statement_md,difficulty,source,sample_input,sample_output FROM problems WHERE id=?",
|
||||
@@ -182,7 +533,7 @@ def store_solutions(conn: sqlite3.Connection, problem_id: int, rows: list[dict[s
|
||||
|
||||
idea_md = str(row.get("idea_md") or "").strip()
|
||||
explanation_md = str(row.get("explanation_md") or "").strip()
|
||||
code_cpp = str(row.get("code_cpp") or "").strip()
|
||||
code_cpp = format_cpp_code(str(row.get("code_cpp") or ""))
|
||||
complexity = str(row.get("complexity") or "").strip()
|
||||
tags = row.get("tags") if isinstance(row.get("tags"), list) else []
|
||||
|
||||
@@ -239,68 +590,41 @@ def main() -> int:
|
||||
|
||||
try:
|
||||
problem = load_problem(conn, args.problem_id)
|
||||
requested_solutions = max(1, min(5, args.max_solutions))
|
||||
allow_fallback = env_bool("CSP_SOLUTION_ALLOW_FALLBACK", False)
|
||||
|
||||
prompt = f"""
|
||||
请为下面这道 CSP 题生成 {max(1, min(5, args.max_solutions))} 种不同思路的题解(可从不同角度切入,例如模拟/贪心/DP/数据结构),并给出 C++ 参考代码。
|
||||
|
||||
输出 JSON,格式固定:
|
||||
{{
|
||||
"solutions": [
|
||||
{{
|
||||
"title": "解法标题",
|
||||
"idea_md": "思路要点(Markdown)",
|
||||
"explanation_md": "详细讲解(Markdown)",
|
||||
"complexity": "时间/空间复杂度",
|
||||
"code_cpp": "完整 C++17 代码",
|
||||
"tags": ["标签1","标签2"]
|
||||
}}
|
||||
]
|
||||
}}
|
||||
|
||||
题目:{problem.title}
|
||||
难度:{problem.difficulty}
|
||||
来源:{problem.source}
|
||||
题面:
|
||||
{problem.statement_md[:12000]}
|
||||
样例输入:
|
||||
{problem.sample_input[:1200]}
|
||||
样例输出:
|
||||
{problem.sample_output[:1200]}
|
||||
""".strip()
|
||||
prompt = build_prompt(problem, max_solutions=requested_solutions)
|
||||
|
||||
update_job(conn, args.job_id, progress=25, message="requesting llm", updated_at=now_sec())
|
||||
|
||||
source = "fallback"
|
||||
source = "llm"
|
||||
solutions: list[dict[str, Any]]
|
||||
try:
|
||||
content = llm_request(
|
||||
prompt,
|
||||
solutions = generate_solutions_with_llm(
|
||||
prompt=prompt,
|
||||
max_solutions=requested_solutions,
|
||||
timeout=args.timeout,
|
||||
retries=args.retries,
|
||||
sleep_sec=args.retry_sleep_sec,
|
||||
)
|
||||
obj = extract_json_object(content)
|
||||
raw = obj.get("solutions") if isinstance(obj, dict) else None
|
||||
if not isinstance(raw, list) or len(raw) == 0:
|
||||
raise RuntimeError("llm response missing solutions array")
|
||||
solutions = [x for x in raw if isinstance(x, dict)]
|
||||
if not solutions:
|
||||
raise RuntimeError("llm response has empty valid solutions")
|
||||
source = "llm"
|
||||
except Exception:
|
||||
except Exception as exc:
|
||||
if not allow_fallback:
|
||||
raise RuntimeError(f"llm generation failed: {str(exc)[:280]}") from exc
|
||||
source = "fallback"
|
||||
solutions = fallback_solutions(args.max_solutions)
|
||||
|
||||
solutions = solutions[: max(1, min(5, args.max_solutions))]
|
||||
solutions = solutions[:requested_solutions]
|
||||
|
||||
update_job(conn, args.job_id, progress=70, message="writing solutions", updated_at=now_sec())
|
||||
saved = store_solutions(conn, args.problem_id, solutions, source)
|
||||
|
||||
done_msg = f"completed: {saved} solutions ({source})"
|
||||
update_job(
|
||||
conn,
|
||||
args.job_id,
|
||||
status="completed",
|
||||
progress=100,
|
||||
message=f"completed: {saved} solutions ({source})",
|
||||
message=done_msg,
|
||||
finished_at=now_sec(),
|
||||
updated_at=now_sec(),
|
||||
)
|
||||
|
||||
487
scripts/import_kb_learning_resources.py
可执行文件
487
scripts/import_kb_learning_resources.py
可执行文件
@@ -0,0 +1,487 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Collect C++/CSP learning resources from the web, summarize with LLM, and upsert KB articles."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import html
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sqlite3
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
|
||||
# HTTP status codes considered retryable (usage is outside this view).
RETRYABLE_HTTP_CODES = {429, 500, 502, 503, 504}

# Default request timeout; presumably seconds, as passed to requests — confirm at call sites.
DEFAULT_TIMEOUT = 30

# Browser-like User-Agent header sent when fetching resource pages.
USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (X11; Linux x86_64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/123.0.0.0 Safari/537.36",
    )
)

# Passed as `proxies=` to requests with empty values for both schemes.
NO_PROXY = dict.fromkeys(("http", "https"), "")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ResourceSource:
    """One web page used as source material for a learning track."""

    # Human-readable name of the page.
    label: str
    # Address of the page.
    url: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TrackSpec:
    """Definition of one learning track and the pages it draws on."""

    # Short identifier of the track (e.g. "learning-csp-j").
    slug: str
    # Display title of the track.
    title: str
    # Who the track is written for.
    audience: str
    # What the track should cover.
    objective: str
    # Source pages for the track.
    sources: tuple[ResourceSource, ...]
|
||||
|
||||
|
||||
# The learning tracks defined by this script, in order:
# overall roadmap, C++ basics, CSP-J, CSP-S.
TRACKS: tuple[TrackSpec, ...] = (
    TrackSpec(
        slug="learning-roadmap-csp",
        title="CSP 学习总路线(C++ 基础 → CSP-J → CSP-S)",
        audience="准备长期学习 CSP 的初中/高中选手与家长",
        objective="给出分阶段目标、周训练节奏、升阶检查清单和环境规范提醒。",
        sources=(
            ResourceSource(label="NOI 技术规则", url="https://www.noi.cn/gynoi/jsgz/"),
            ResourceSource(label="NOI Linux 与说明文档下载", url="https://www.noi.cn/gynoi/jsgz/2018-08-21/710467.shtml"),
            ResourceSource(label="NOI 标准竞赛环境说明(2012)", url="https://www.noi.cn/gynoi/jsgz/2018-08-13/710466.shtml"),
            ResourceSource(label="OI Wiki 竞赛路线图", url="https://oi-wiki.org/contest/roadmap/"),
            ResourceSource(label="OI Wiki 竞赛资源", url="https://oi-wiki.org/contest/resources/"),
            ResourceSource(label="cp-algorithms 首页", url="https://cp-algorithms.com/"),
        ),
    ),
    TrackSpec(
        slug="learning-cpp-basic",
        title="C++ 基础学习资料(面向 CSP)",
        audience="C++ 零基础或语法不稳,准备进入 CSP-J 的同学",
        objective="梳理语法基础、STL 入门、输入输出与 C++14 兼容写法。",
        sources=(
            ResourceSource(label="cppreference C++ language", url="https://en.cppreference.com/w/cpp/language.html"),
            ResourceSource(label="OI Wiki 语言基础", url="https://oi-wiki.org/lang/basic/"),
            ResourceSource(label="OI Wiki 数组", url="https://oi-wiki.org/lang/array/"),
            ResourceSource(label="OI Wiki 循环", url="https://oi-wiki.org/lang/loop/"),
            ResourceSource(label="OI Wiki 运算符", url="https://oi-wiki.org/lang/op/"),
            ResourceSource(label="OI Wiki C++ 标准库", url="https://oi-wiki.org/lang/csl/"),
            ResourceSource(label="OI Wiki 文件操作", url="https://oi-wiki.org/lang/file-op/"),
        ),
    ),
    TrackSpec(
        slug="learning-csp-j",
        title="CSP-J 学习资料与训练路径",
        audience="目标 CSP-J 提高组入门,正在建立算法基础的同学",
        objective="覆盖模拟、枚举、前缀和、基础搜索与基础 DP,给出循序刷题方案。",
        sources=(
            ResourceSource(label="NOI 技术规则", url="https://www.noi.cn/gynoi/jsgz/"),
            ResourceSource(label="OI Wiki 模拟", url="https://oi-wiki.org/basic/simulate/"),
            ResourceSource(label="OI Wiki 枚举", url="https://oi-wiki.org/basic/enumerate/"),
            ResourceSource(label="OI Wiki 前缀和与差分", url="https://oi-wiki.org/basic/prefix-sum/"),
            ResourceSource(label="OI Wiki 动态规划基础", url="https://oi-wiki.org/dp/basic/"),
            ResourceSource(label="OI Wiki BFS", url="https://oi-wiki.org/search/bfs/"),
            ResourceSource(label="OI Wiki DFS", url="https://oi-wiki.org/search/dfs/"),
            ResourceSource(label="OI Wiki 常见错误", url="https://oi-wiki.org/contest/common-mistakes/"),
        ),
    ),
    TrackSpec(
        slug="learning-csp-s",
        title="CSP-S 学习资料与进阶路径",
        audience="目标 CSP-S,已具备 CSP-J 基础并准备系统进阶的同学",
        objective="覆盖数据结构、图论、字符串与 DP 进阶,强调复杂度与工程规范。",
        sources=(
            ResourceSource(label="NOI 标准竞赛环境说明(2016)", url="https://www.noi.cn/gynoi/jsgz/2018-08-13/710465.shtml"),
            ResourceSource(label="NOI 标准竞赛环境说明(2012)", url="https://www.noi.cn/gynoi/jsgz/2018-08-13/710466.shtml"),
            ResourceSource(label="OI Wiki 树状数组", url="https://oi-wiki.org/ds/fenwick/"),
            ResourceSource(label="OI Wiki 线段树", url="https://oi-wiki.org/ds/seg/"),
            ResourceSource(label="OI Wiki 最短路", url="https://oi-wiki.org/graph/shortest-path/"),
            ResourceSource(label="OI Wiki 强连通分量", url="https://oi-wiki.org/graph/scc/"),
            ResourceSource(label="OI Wiki 最大流", url="https://oi-wiki.org/graph/flow/max-flow/"),
            ResourceSource(label="OI Wiki 树上 DP", url="https://oi-wiki.org/dp/tree/"),
            ResourceSource(label="OI Wiki KMP", url="https://oi-wiki.org/string/kmp/"),
            ResourceSource(label="cp-algorithms Segment Tree", url="https://cp-algorithms.com/data_structures/segment_tree.html"),
        ),
    ),
)
|
||||
|
||||
|
||||
def now_sec() -> int:
    """Return the current Unix time truncated to whole seconds."""
    timestamp = time.time()
    return int(timestamp)
|
||||
|
||||
|
||||
def load_dotenv(path: Path) -> None:
    """Load ``KEY=VALUE`` lines from a dotenv file into ``os.environ``.

    Does nothing when ``path`` does not exist. Blank lines, ``#`` comment
    lines, lines without ``=`` and lines with an empty key are skipped.
    Variables already present in the environment are never overwritten.
    Values are stripped of surrounding whitespace and of leading/trailing
    double and single quotes.
    """
    if not path.exists():
        return

    content = path.read_text(encoding="utf-8", errors="ignore")
    for raw in content.splitlines():
        entry = raw.strip()
        if not entry or entry.startswith("#"):
            continue
        # Split on the first "=" only, so values may themselves contain "=".
        name, separator, value = entry.partition("=")
        if not separator:
            continue
        name = name.strip()
        if name and name not in os.environ:
            os.environ[name] = value.strip().strip("\"").strip("'")
|
||||
|
||||
|
||||
def fetch_url(url: str, timeout: int) -> tuple[str, str]:
    """Download *url* with an HTTP GET and return ``(body_text, final_url)``.

    The request carries the module's ``USER_AGENT`` header and the module's
    ``NO_PROXY`` proxy mapping. The body is decoded with the encoding that
    ``requests`` detects from the content, falling back to the encoding it
    derived from the response headers.

    Raises:
        RuntimeError: If the response status code is 400 or above.
    """
    response = requests.get(
        url,
        headers={"User-Agent": USER_AGENT},
        timeout=timeout,
        proxies=NO_PROXY,
    )
    detected = response.apparent_encoding
    response.encoding = detected if detected else response.encoding
    status = response.status_code
    if status >= 400:
        raise RuntimeError(f"HTTP {status}")
    # response.url is the address after any redirects were followed.
    return response.text, response.url
|
||||
|
||||
|
||||
def strip_html(html_text: str, max_chars: int) -> str:
    """Reduce an HTML document to compact plain text.

    Steps: drop ``script``/``style``/``noscript``/``svg``/``canvas`` elements
    together with their contents, turn ``<br>`` and closing block-level tags
    into newlines, remove every remaining tag, decode HTML entities, then
    normalise whitespace so each non-empty line is trimmed.

    Args:
        html_text: Raw HTML markup.
        max_chars: Maximum length of the extracted text; longer results are
            cut at this length and suffixed with ``"\\n..."``.

    Returns:
        The extracted text (possibly empty).
    """
    # Inside a raw string the backreference is a single-backslash "\1"; a
    # doubled backslash would match a literal "\" followed by "1" and leave
    # inline JavaScript/CSS in the output.
    text = re.sub(r"(?is)<(script|style|noscript|svg|canvas)[^>]*>.*?</\1>", " ", html_text)
    # Same escaping rule for "\s": match <br>, <br/> and <br />.
    text = re.sub(r"(?is)<br\s*/?>", "\n", text)
    text = re.sub(r"(?is)</(p|div|section|article|h1|h2|h3|h4|h5|h6|li|tr|table|ul|ol)>", "\n", text)
    text = re.sub(r"(?is)<[^>]+>", " ", text)
    text = html.unescape(text)
    text = text.replace("\r", "\n").replace("\xa0", " ")
    text = re.sub(r"[ \t\f\v]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = "\n".join(line.strip() for line in text.splitlines() if line.strip())
    if len(text) > max_chars:
        text = text[:max_chars].rstrip() + "\n..."
    return text
|
||||
|
||||
|
||||
def extract_title(html_text: str, fallback: str) -> str:
    """Return the text of the first ``<title>`` element, or *fallback*.

    Entities are decoded and runs of whitespace collapse to single spaces.
    The opening tag may carry attributes (for example
    ``<title data-rh="true">``), which some site generators emit.

    Args:
        html_text: Raw HTML markup to search.
        fallback: Value returned when no title exists or it is blank.
    """
    # "\b" keeps the match on the <title> tag itself while "[^>]*" tolerates
    # attributes on the opening tag.
    match = re.search(r"(?is)<title\b[^>]*>(.*?)</title>", html_text)
    if not match:
        return fallback
    title = re.sub(r"\s+", " ", html.unescape(match.group(1))).strip()
    return title or fallback
|
||||
|
||||
|
||||
def extract_message_text(content: Any) -> str:
    """Normalise a chat-message ``content`` value into stripped plain text.

    Accepted shapes:
      * ``str`` - returned stripped.
      * ``dict`` - its ``"text"`` entry, stripped, when that entry is a string.
      * ``list`` - the non-blank string ``"text"`` entries of its dict items,
        each stripped and joined with newlines.

    Any other value yields an empty string.
    """
    if isinstance(content, str):
        return content.strip()

    if isinstance(content, dict):
        value = content.get("text")
        return value.strip() if isinstance(value, str) else ""

    if isinstance(content, list):
        candidates = (part.get("text") for part in content if isinstance(part, dict))
        pieces = [piece.strip() for piece in candidates if isinstance(piece, str) and piece.strip()]
        return "\n".join(pieces).strip()

    return ""
|
||||
|
||||
|
||||
def llm_request(prompt: str, timeout: int, retries: int, retry_sleep_sec: float) -> str:
    """Send *prompt* to the configured chat endpoint and return the reply text.

    Configuration is read from the environment: ``OI_LLM_API_URL`` (required),
    ``OI_LLM_API_KEY`` (optional, sent as a Bearer token) and ``OI_LLM_MODEL``
    (defaults to ``qwen3-max``).

    Transport errors and HTTP statuses listed in ``RETRYABLE_HTTP_CODES`` are
    retried up to *retries* attempts, sleeping ``retry_sleep_sec * attempt``
    seconds between attempts. Any other status of 400 or above fails
    immediately.

    Args:
        prompt: User message content.
        timeout: Per-request timeout passed to ``requests.post``.
        retries: Maximum number of attempts.
        retry_sleep_sec: Base delay used for the linear back-off.

    Returns:
        The non-empty text taken from the first entry of ``choices``.

    Raises:
        RuntimeError: Missing URL, exhausted retries, an HTTP error, or a
            response without usable ``choices`` content.
    """
    endpoint = os.getenv("OI_LLM_API_URL", "").strip()
    token = os.getenv("OI_LLM_API_KEY", "").strip()
    model_name = os.getenv("OI_LLM_MODEL", "qwen3-max").strip()
    if not endpoint:
        raise RuntimeError("missing OI_LLM_API_URL")

    request_headers = {"Content-Type": "application/json"}
    if token:
        request_headers["Authorization"] = f"Bearer {token}"

    system_message = {
        "role": "system",
        "content": "你是资深信息学竞赛教练,请严格基于来源内容整理中文 Markdown 学习资料。",
    }
    user_message = {"role": "user", "content": prompt}
    request_body = {
        "model": model_name,
        "stream": False,
        "temperature": 0.2,
        "messages": [system_message, user_message],
    }

    for attempt in range(1, retries + 1):
        out_of_attempts = attempt >= retries
        try:
            resp = requests.post(
                endpoint,
                headers=request_headers,
                json=request_body,
                timeout=timeout,
                proxies=NO_PROXY,
            )
        except requests.RequestException as exc:
            if out_of_attempts:
                raise RuntimeError(f"llm request failed: {exc}") from exc
            time.sleep(retry_sleep_sec * attempt)
            continue

        status = resp.status_code
        if status in RETRYABLE_HTTP_CODES:
            if out_of_attempts:
                raise RuntimeError(f"llm retry exhausted: HTTP {status}")
            time.sleep(retry_sleep_sec * attempt)
            continue

        if status >= 400:
            raise RuntimeError(f"llm request failed: HTTP {status}: {resp.text[:300]}")

        choices = resp.json().get("choices") or []
        if not choices:
            raise RuntimeError("llm response missing choices")
        first_choice = choices[0] or {}
        message = first_choice.get("message") or {}
        # Prefer message.content; fall back to the choice-level "text" field.
        reply = extract_message_text(message.get("content")) or extract_message_text(
            first_choice.get("text")
        )
        if not reply:
            raise RuntimeError("llm response missing content")
        return reply

    # Only reachable when the loop body never ran (retries < 1).
    raise RuntimeError("llm request failed")
|
||||
|
||||
|
||||
def remove_outer_markdown_fence(text: str) -> str:
    """Unwrap a reply that is entirely enclosed in one Markdown code fence.

    Models often return the whole document wrapped in a fence whose opening
    line is bare or tagged ``markdown``/``md`` (case-insensitive). When *text*
    has exactly that outer shape the inner document is returned; otherwise
    the stripped input is returned unchanged. Fences tagged with another
    language (e.g. ``cpp``) are left alone because they are part of the
    document rather than a wrapper.

    Args:
        text: Raw model output.

    Returns:
        The unwrapped, stripped Markdown text.
    """
    raw = text.strip()
    # Raw-string regex: "\s" / "\S" must use single backslashes, otherwise the
    # pattern looks for literal backslashes and never matches real output.
    # The opening fence line must end right after the optional tag.
    match = re.match(
        r"^```(?:markdown|md)?[ \t]*\r?\n([\s\S]*?)\s*```$",
        raw,
        flags=re.IGNORECASE,
    )
    if match:
        return match.group(1).strip()
    return raw
|
||||
|
||||
|
||||
def build_prompt(spec: TrackSpec, source_materials: list[dict[str, str]]) -> str:
    """Compose the user prompt asking the LLM to write one track's article.

    The prompt contains the track title, audience and objective, a fixed list
    of hard requirements, a bullet list of the sources (label and URL) and a
    numbered excerpt section for every source.

    Args:
        spec: Track description; ``title``, ``audience`` and ``objective``
            are read.
        source_materials: Dicts with ``label``, ``url``, ``title`` and
            ``snippet`` keys.

    Returns:
        The prompt text without leading or trailing whitespace.
    """
    source_lines = "\n".join(f"- {entry['label']}: {entry['url']}" for entry in source_materials)

    # One excerpt section per source, numbered from 1 and separated by blank lines.
    all_snippets = "\n\n".join(
        "\n".join(
            (
                f"[来源 {number}] {entry['label']}",
                f"URL: {entry['url']}",
                f"页面标题: {entry['title']}",
                "摘录:",
                entry["snippet"],
            )
        )
        for number, entry in enumerate(source_materials, start=1)
    )

    prompt_text = f"""
请整理一篇中文 Markdown 学习资料文章,主题:{spec.title}

目标读者:{spec.audience}
目标:{spec.objective}

硬性要求:
1. 只输出 Markdown 正文,不要输出解释、前言、JSON 或代码块外的多余说明。
2. 正文不少于 900 字,内容要具体可执行,不能只给提纲。
3. 内容结构至少包含:
- 学习目标
- 知识图谱(按优先级)
- 分阶段训练计划(建议按周)
- 常见失分点与避坑清单
- C++14 / 评测环境规范提醒(明确写出:优先 C++14;避免 C++17 特性;long long 用 %lld;如赛方要求则使用 freopen;main 返回 int 且 return 0)
4. 结尾必须包含“## 参考来源”章节,并且只列出本次给定来源,使用 Markdown 链接。
5. 不要编造具体年份政策细节;对于地方性要求,写“以当年官方通知为准”。
6. 风格要可执行,尽量给出检查清单与训练顺序。

可用来源列表:
{source_lines}

来源摘录:
{all_snippets}
"""
    return prompt_text.strip()
|
||||
|
||||
|
||||
def fallback_markdown(spec: TrackSpec, source_materials: list[dict[str, str]], error_text: str) -> str:
    """Build a static Markdown article used when the LLM call fails.

    The article has a title, learning goals taken from *spec*, fixed training
    advice and C++14 reminders, a reference list with one link per source,
    and a closing note quoting the failure reason.

    Args:
        spec: Track description; ``title``, ``audience`` and ``objective``
            are read.
        source_materials: Dicts providing ``label`` and ``url``.
        error_text: Failure reason; only its first 180 characters are shown.

    Returns:
        Markdown text ending with exactly one trailing newline.
    """
    header = [
        f"# {spec.title}",
        "",
        "## 学习目标",
        f"- 读者:{spec.audience}",
        f"- 目标:{spec.objective}",
        "",
        "## 训练建议",
        "- 每周固定做 3~5 道题:先基础题,再专项题,再限时套题。",
        "- 每次训练后补齐错因:读题失误、边界遗漏、复杂度超限、代码规范错误。",
        "- 建议建立个人模板并反复演练输入输出与边界处理。",
        "",
        "## C++14 / 评测环境规范提醒",
        "- 统一按 C++14 编译,避免 C++17 及以上语法。",
        "- `long long` 输入输出优先使用 `%lld`。",
        '- 若赛方要求文件读写,使用 `freopen("xxx.in", "r", stdin)` / `freopen("xxx.out", "w", stdout)`。',
        "- `main` 必须是 `int main()` 且 `return 0;`。",
        "- 地方考区细则每年会更新,务必以当年官方通知为准。",
        "",
        "## 参考来源",
    ]
    references = [f"- [{entry['label']}]({entry['url']})" for entry in source_materials]
    footer = [
        "",
        "> 说明:本条目由自动整理流程生成。",
        f"> LLM 调用失败原因:{error_text[:180]}",
    ]
    document = "\n".join([*header, *references, *footer])
    return document.strip() + "\n"
|
||||
|
||||
|
||||
def upsert_article(conn: sqlite3.Connection, slug: str, title: str, content_md: str, ts: int) -> None:
    """Insert an article into ``kb_articles`` or overwrite the row sharing *slug*.

    On a slug conflict the existing row's ``title``, ``content_md`` and
    ``created_at`` are replaced with the new values. The statement is only
    executed; committing is left to the caller.

    Args:
        conn: Open SQLite connection.
        slug: Unique article key used as the conflict target.
        title: Article title.
        content_md: Markdown body.
        ts: Timestamp stored in ``created_at``.
    """
    statement = """
        INSERT INTO kb_articles(slug, title, content_md, created_at)
        VALUES(?, ?, ?, ?)
        ON CONFLICT(slug) DO UPDATE SET
            title=excluded.title,
            content_md=excluded.content_md,
            created_at=excluded.created_at
    """
    row = (slug, title, content_md, ts)
    conn.execute(statement, row)
|
||||
|
||||
|
||||
def guess_db_path(cli_path: str | None) -> str:
|
||||
if cli_path:
|
||||
return cli_path
|
||||
candidates = [
|
||||
os.getenv("CSP_DB_PATH", "").strip(),
|
||||
"/data/csp.db",
|
||||
"/var/lib/docker/volumes/csp_csp_data/_data/csp.db",
|
||||
str(Path(__file__).resolve().parents[1] / "data" / "csp.db"),
|
||||
]
|
||||
for path in candidates:
|
||||
if path and Path(path).exists():
|
||||
return path
|
||||
return "/data/csp.db"
|
||||
|
||||
|
||||
def collect_source_materials(
    spec: TrackSpec,
    timeout: int,
    max_chars_per_source: int,
    max_sources_per_track: int,
) -> list[dict[str, str]]:
    """Fetch the leading sources of *spec* and extract a text excerpt from each.

    Only the first *max_sources_per_track* entries of ``spec.sources`` are
    tried. A source that fails to download, whose page title contains both
    "404" and "not found", or that yields no text is reported on stderr and
    skipped.

    Args:
        spec: Track whose ``sources`` (objects with ``url`` and ``label``)
            are fetched.
        timeout: Timeout forwarded to ``fetch_url``.
        max_chars_per_source: Excerpt length limit forwarded to ``strip_html``.
        max_sources_per_track: Number of leading sources to attempt.

    Returns:
        One dict per usable source with ``label``, ``url`` (the final URL
        reported by ``fetch_url``), ``title`` and ``snippet``.

    Raises:
        RuntimeError: If no source produced usable material.
    """
    collected: list[dict[str, str]] = []
    selected = spec.sources[:max_sources_per_track]
    for source in selected:
        # Best-effort: one bad source must not abort the whole track.
        try:
            page_html, resolved_url = fetch_url(source.url, timeout=timeout)
            page_title = extract_title(page_html, source.label)
            lowered = page_title.lower()
            if "404" in lowered and "not found" in lowered:
                raise RuntimeError(f"unexpected 404 title: {page_title}")
            excerpt = strip_html(page_html, max_chars=max_chars_per_source)
            if not excerpt:
                raise RuntimeError("empty extracted text")
            entry = {
                "label": source.label,
                "url": resolved_url,
                "title": page_title,
                "snippet": excerpt,
            }
            collected.append(entry)
            print(f"[fetch] ok: {spec.slug} <- {source.url}")
        except Exception as exc:
            print(f"[fetch] skip: {spec.slug} <- {source.url} ({exc})", file=sys.stderr)

    if collected:
        return collected
    raise RuntimeError(f"no source material collected for {spec.slug}")
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: build and store one article per track.

    For every entry in ``TRACKS`` (or only the one selected with ``--only``)
    the source pages are fetched, an article is requested from the LLM and,
    if that fails or the reply is shorter than 120 characters, a static
    fallback article is used. Unless ``--dry-run`` is given, the article is
    upserted into ``kb_articles`` and committed per track.

    The database connection is always closed, even when a track raises.

    Returns:
        ``0`` when at least one track was processed, otherwise ``1``.
    """
    parser = argparse.ArgumentParser(description="Import web learning resources into kb_articles")
    parser.add_argument("--db-path", default="", help="SQLite path (default: auto-detect)")
    parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT)
    parser.add_argument("--retries", type=int, default=5)
    parser.add_argument("--retry-sleep-sec", type=float, default=1.5)
    parser.add_argument("--max-chars-per-source", type=int, default=900)
    parser.add_argument("--max-sources-per-track", type=int, default=3)
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument(
        "--only",
        default="",
        help="Only process one track slug, e.g. learning-csp-j",
    )
    args = parser.parse_args()

    # Load .env from the parent of this script's directory before reading
    # any configuration from the environment.
    repo_root = Path(__file__).resolve().parents[1]
    load_dotenv(repo_root / ".env")

    db_path = guess_db_path(args.db_path or None)
    print(f"[db] using: {db_path}")

    only_slug = args.only.strip()
    processed = 0
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("PRAGMA foreign_keys=ON")
        conn.execute("PRAGMA busy_timeout=10000")

        for spec in TRACKS:
            if args.only and spec.slug != only_slug:
                continue

            print(f"[track] start: {spec.slug}")
            # CLI values are clamped to sane minimums.
            materials = collect_source_materials(
                spec,
                timeout=max(5, args.timeout),
                max_chars_per_source=max(600, args.max_chars_per_source),
                max_sources_per_track=max(1, args.max_sources_per_track),
            )

            prompt = build_prompt(spec, materials)
            try:
                markdown = llm_request(
                    prompt=prompt,
                    timeout=max(15, args.timeout * 2),
                    retries=max(1, args.retries),
                    retry_sleep_sec=max(0.2, args.retry_sleep_sec),
                )
                markdown = remove_outer_markdown_fence(markdown)
                if os.getenv("KB_IMPORT_DEBUG", "").strip():
                    preview = markdown.strip().replace("\n", "\\n")
                    print(
                        f"[llm] raw: {spec.slug} len={len(markdown.strip())} preview={preview[:220]}",
                        file=sys.stderr,
                    )
                if len(markdown.strip()) < 120:
                    raise RuntimeError("llm output too short")
                print(f"[llm] ok: {spec.slug}")
            except Exception as exc:
                # Any LLM failure degrades to the static article instead of
                # aborting the import.
                print(f"[llm] fallback: {spec.slug} ({exc})", file=sys.stderr)
                markdown = fallback_markdown(spec, materials, str(exc))

            content = markdown.strip() + "\n"
            if args.dry_run:
                print(f"[dry-run] {spec.slug}: {len(content)} chars")
                processed += 1
                continue

            upsert_article(conn, spec.slug, spec.title, content, now_sec())
            conn.commit()
            processed += 1
            print(f"[db] upserted: {spec.slug}")
    finally:
        # Release the SQLite handle even if a track raised part-way through.
        conn.close()

    print(f"[done] processed tracks: {processed}")
    return 0 if processed > 0 else 1
|
||||
|
||||
|
||||
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
1275
scripts/import_local_pdf_rag.py
普通文件
1275
scripts/import_local_pdf_rag.py
普通文件
文件差异内容过多而无法显示
加载差异
在新工单中引用
屏蔽一个用户