feat: rebuild CSP practice workflow, UX and automation
这个提交包含在:
904
scripts/import_luogu_csp.py
普通文件
904
scripts/import_luogu_csp.py
普通文件
@@ -0,0 +1,904 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Import Luogu CSP-J/S beginner problem set into local SQLite."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import re
|
||||
import sqlite3
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
from urllib.parse import quote
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
# Luogu site root; used for scraping and for rewriting relative links.
DEFAULT_BASE_URL = "https://www.luogu.com.cn"
# Default tag filter for the problem list.
DEFAULT_TAG_IDS = [343, 342, 82, 83]  # CSP-J, CSP-S, NOIP-junior, NOIP-senior
# HTTP status codes treated as transient (rate limiting / server hiccups).
RETRYABLE_STATUS = {429, 500, 502, 503, 504}
# Matches the JSON context blob Luogu embeds in each page's HTML.
CONTEXT_RE = re.compile(
    r'<script[^>]*id="lentille-context"[^>]*>(.*?)</script>', re.DOTALL
)
|
||||
|
||||
|
||||
@dataclass
class LuoguListItem:
    """One row of the Luogu problem-list page (summary fields only)."""

    pid: str  # Luogu problem id, e.g. "P1001"
    title: str
    difficulty: int  # Luogu difficulty value as reported by the list page
    tags: list[int]  # numeric tag ids, resolved later via the tag catalog
    total_submit: int
    total_accepted: int
    type: str  # Luogu problem type string
|
||||
|
||||
|
||||
@dataclass
class UpsertRecord:
    """Fully assembled problem row ready to be written into SQLite."""

    slug: str  # unique key, "luogu-<pid>"
    title: str
    statement_md: str  # rendered Markdown statement
    difficulty: int  # clamped to 1..10 by build_record
    source: str  # "luogu:<pid>"
    statement_url: str
    llm_profile_json: str  # serialized metadata profile (see build_record)
    sample_input: str  # first sample only; "" when none
    sample_output: str
    tags: list[str]  # normalized tag slugs stored in problem_tags
|
||||
|
||||
|
||||
def now_sec() -> int:
    """Return the current Unix time truncated to whole seconds."""
    current = time.time()
    return int(current)
|
||||
|
||||
|
||||
def requests_retry_text(
    session: requests.Session,
    url: str,
    *,
    timeout: int,
    retries: int,
    sleep_sec: float,
) -> str:
    """GET *url* and return the response body text, retrying transient failures.

    Network errors and status codes in RETRYABLE_STATUS are retried with a
    linearly growing backoff (sleep_sec * attempt). Any other 4xx/5xx raises
    immediately. Raises RuntimeError once all attempts are exhausted.
    """
    failure: Exception | None = None
    attempt = 0
    while attempt < retries:
        attempt += 1
        has_more_attempts = attempt < retries
        try:
            response = session.get(url, timeout=timeout)
        except requests.RequestException as exc:
            failure = exc
            if has_more_attempts:
                time.sleep(sleep_sec * attempt)
                continue
            raise RuntimeError(f"GET failed: {url}: {exc}") from exc

        status = response.status_code
        if status in RETRYABLE_STATUS:
            if has_more_attempts:
                time.sleep(sleep_sec * attempt)
                continue
            raise RuntimeError(f"GET failed after retry: {url}: {status}")
        if status >= 400:
            raise RuntimeError(f"GET failed: {url}: {status}")
        return response.text

    # Only reachable when retries <= 0, i.e. the loop body never ran.
    if failure:
        raise RuntimeError(f"GET failed: {url}: {failure}") from failure
    raise RuntimeError(f"GET failed: {url}: unknown error")
|
||||
|
||||
|
||||
def extract_context_json(html_text: str) -> dict[str, Any]:
    """Locate and decode the "lentille-context" JSON blob embedded in a page.

    Raises RuntimeError when the script tag is missing or its payload is not
    valid JSON.
    """
    found = CONTEXT_RE.search(html_text)
    if found is None:
        raise RuntimeError("lentille-context script not found")
    payload = found.group(1)
    try:
        return json.loads(payload)
    except json.JSONDecodeError as exc:
        raise RuntimeError("failed to parse lentille-context json") from exc
|
||||
|
||||
|
||||
def parse_tag_ids(raw: str) -> list[int]:
    """Parse a comma-separated tag-id string, ignoring empty entries.

    Raises ValueError on a non-numeric entry or when no ids remain.
    """
    ids = [int(piece) for piece in (p.strip() for p in raw.split(",")) if piece]
    if not ids:
        raise ValueError("at least one tag id is required")
    return ids
|
||||
|
||||
|
||||
def normalize_tag(text: str) -> str:
    """Slugify an ASCII tag (lowercase, hyphen-joined); non-ASCII tags pass through.

    When the slug collapses to empty (e.g. a purely Chinese tag name), the
    stripped original text is returned instead.
    """
    stripped = text.strip()
    slug = re.sub(r"[^a-z0-9]+", "-", stripped.lower()).strip("-")
    if slug:
        return slug
    return stripped
|
||||
|
||||
|
||||
def ensure_problem_columns(conn: sqlite3.Connection) -> None:
    """Add newer optional columns to an existing ``problems`` table.

    Idempotent: columns already present (per PRAGMA table_info) are skipped.
    """
    migrations = {
        "sample_input": "ALTER TABLE problems ADD COLUMN sample_input TEXT NOT NULL DEFAULT ''",
        "sample_output": "ALTER TABLE problems ADD COLUMN sample_output TEXT NOT NULL DEFAULT ''",
        "statement_url": "ALTER TABLE problems ADD COLUMN statement_url TEXT NOT NULL DEFAULT ''",
        "llm_profile_json": "ALTER TABLE problems ADD COLUMN llm_profile_json TEXT NOT NULL DEFAULT '{}'",
    }
    cursor = conn.cursor()
    cursor.execute("PRAGMA table_info(problems)")
    existing = {str(info[1]) for info in cursor.fetchall()}
    for column, statement in migrations.items():
        if column not in existing:
            cursor.execute(statement)
    conn.commit()
|
||||
|
||||
|
||||
def ensure_core_tables(conn: sqlite3.Connection) -> None:
    """Create the problems table, the tag join table, and its index if absent."""
    ddl_statements = (
        """
        CREATE TABLE IF NOT EXISTS problems (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            slug TEXT NOT NULL UNIQUE,
            title TEXT NOT NULL,
            statement_md TEXT NOT NULL,
            difficulty INTEGER NOT NULL DEFAULT 1,
            source TEXT NOT NULL DEFAULT '',
            statement_url TEXT NOT NULL DEFAULT '',
            llm_profile_json TEXT NOT NULL DEFAULT '{}',
            sample_input TEXT NOT NULL DEFAULT '',
            sample_output TEXT NOT NULL DEFAULT '',
            created_at INTEGER NOT NULL
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS problem_tags (
            problem_id INTEGER NOT NULL,
            tag TEXT NOT NULL,
            PRIMARY KEY(problem_id, tag)
        )
        """,
        "CREATE INDEX IF NOT EXISTS idx_problem_tags_tag ON problem_tags(tag)",
    )
    for ddl in ddl_statements:
        conn.execute(ddl)
    conn.commit()
|
||||
|
||||
|
||||
def ensure_import_tables(conn: sqlite3.Connection) -> None:
    """Create the import-job bookkeeping tables and their indexes if absent.

    NOTE: "trigger" is an SQLite fallback keyword, so it is legal unquoted
    as a column name here.
    """
    ddl_statements = (
        """
        CREATE TABLE IF NOT EXISTS import_jobs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            status TEXT NOT NULL,
            trigger TEXT NOT NULL DEFAULT 'manual',
            total_count INTEGER NOT NULL DEFAULT 0,
            processed_count INTEGER NOT NULL DEFAULT 0,
            success_count INTEGER NOT NULL DEFAULT 0,
            failed_count INTEGER NOT NULL DEFAULT 0,
            options_json TEXT NOT NULL DEFAULT '{}',
            last_error TEXT NOT NULL DEFAULT '',
            started_at INTEGER NOT NULL,
            finished_at INTEGER,
            updated_at INTEGER NOT NULL,
            created_at INTEGER NOT NULL
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS import_job_items (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            job_id INTEGER NOT NULL,
            source_path TEXT NOT NULL,
            status TEXT NOT NULL DEFAULT 'queued',
            title TEXT NOT NULL DEFAULT '',
            difficulty INTEGER NOT NULL DEFAULT 0,
            problem_id INTEGER,
            error_text TEXT NOT NULL DEFAULT '',
            started_at INTEGER,
            finished_at INTEGER,
            updated_at INTEGER NOT NULL,
            created_at INTEGER NOT NULL,
            UNIQUE(job_id, source_path)
        )
        """,
        "CREATE INDEX IF NOT EXISTS idx_import_jobs_created_at ON import_jobs(created_at DESC)",
        "CREATE INDEX IF NOT EXISTS idx_import_job_items_job_status "
        "ON import_job_items(job_id, status, updated_at DESC)",
    )
    for ddl in ddl_statements:
        conn.execute(ddl)
    conn.commit()
|
||||
|
||||
|
||||
def create_import_job(
    conn: sqlite3.Connection, trigger: str, total_count: int, options_json: str
) -> int:
    """Insert a new 'running' import job row and return its id.

    An empty *trigger* falls back to 'manual'; all counters start at zero.
    """
    stamp = now_sec()
    row = (
        "running",
        trigger or "manual",
        total_count,
        0,  # processed_count
        0,  # success_count
        0,  # failed_count
        options_json,
        "",  # last_error
        stamp,  # started_at
        None,  # finished_at
        stamp,  # updated_at
        stamp,  # created_at
    )
    cursor = conn.cursor()
    cursor.execute(
        """
        INSERT INTO import_jobs(
            status,trigger,total_count,processed_count,success_count,failed_count,
            options_json,last_error,started_at,finished_at,updated_at,created_at
        ) VALUES(?,?,?,?,?,?,?,?,?,?,?,?)
        """,
        row,
    )
    conn.commit()
    return int(cursor.lastrowid)
|
||||
|
||||
|
||||
def seed_import_items(
    conn: sqlite3.Connection, job_id: int, items: list[LuoguListItem]
) -> None:
    """Queue one 'queued' import_job_items row per problem.

    INSERT OR IGNORE keeps the call idempotent per (job_id, source_path).
    """
    stamp = now_sec()
    rows = [
        (job_id, entry.pid, "queued", "", 0, None, "", None, None, stamp, stamp)
        for entry in items
    ]
    conn.cursor().executemany(
        """
        INSERT OR IGNORE INTO import_job_items(
            job_id,source_path,status,title,difficulty,problem_id,error_text,
            started_at,finished_at,updated_at,created_at
        ) VALUES(?,?,?,?,?,?,?,?,?,?,?)
        """,
        rows,
    )
    conn.commit()
|
||||
|
||||
|
||||
def update_import_item_success(
    conn: sqlite3.Connection,
    job_id: int,
    source_path: str,
    title: str,
    difficulty: int,
    problem_id: int,
    note: str = "",
) -> None:
    """Mark one job item as successfully imported.

    *note* is written into error_text (used to flag fallback imports).
    """
    stamp = now_sec()
    params = (title, difficulty, problem_id, note, stamp, stamp, stamp, job_id, source_path)
    conn.execute(
        """
        UPDATE import_job_items
        SET status='success',
            title=?,
            difficulty=?,
            problem_id=?,
            error_text=?,
            started_at=COALESCE(started_at, ?),
            finished_at=?,
            updated_at=?
        WHERE job_id=? AND source_path=?
        """,
        params,
    )
    conn.commit()
|
||||
|
||||
|
||||
def update_import_item_failed(
    conn: sqlite3.Connection, job_id: int, source_path: str, error_text: str
) -> None:
    """Mark one job item as failed, storing a truncated error message."""
    stamp = now_sec()
    params = (error_text[:500], stamp, stamp, stamp, job_id, source_path)
    conn.execute(
        """
        UPDATE import_job_items
        SET status='failed',
            error_text=?,
            started_at=COALESCE(started_at, ?),
            finished_at=?,
            updated_at=?
        WHERE job_id=? AND source_path=?
        """,
        params,
    )
    conn.commit()
|
||||
|
||||
|
||||
def update_import_job_progress(
    conn: sqlite3.Connection,
    job_id: int,
    processed_count: int,
    success_count: int,
    failed_count: int,
    last_error: str,
) -> None:
    """Refresh a running job's counters; last_error is truncated to 500 chars."""
    stamp = now_sec()
    params = (processed_count, success_count, failed_count, last_error[:500], stamp, job_id)
    conn.execute(
        """
        UPDATE import_jobs
        SET processed_count=?,
            success_count=?,
            failed_count=?,
            last_error=?,
            updated_at=?
        WHERE id=?
        """,
        params,
    )
    conn.commit()
|
||||
|
||||
|
||||
def finish_import_job(
    conn: sqlite3.Connection,
    job_id: int,
    success_count: int,
    failed_count: int,
    last_error: str,
) -> None:
    """Close out a job; the final status reflects whether any item failed."""
    stamp = now_sec()
    final_status = "completed_with_errors" if failed_count else "completed"
    params = (final_status, success_count, failed_count, last_error[:500], stamp, stamp, job_id)
    conn.execute(
        """
        UPDATE import_jobs
        SET status=?,
            processed_count=total_count,
            success_count=?,
            failed_count=?,
            last_error=?,
            finished_at=?,
            updated_at=?
        WHERE id=?
        """,
        params,
    )
    conn.commit()
|
||||
|
||||
|
||||
def upsert_problem(conn: sqlite3.Connection, rec: UpsertRecord) -> tuple[int, bool]:
    """Insert or update a problem row keyed by slug.

    Returns (problem_id, inserted). Tags are replaced wholesale on every
    call so stale tags never linger. created_at is only set on insert.
    """
    cursor = conn.cursor()
    cursor.execute("SELECT id FROM problems WHERE slug=?", (rec.slug,))
    existing = cursor.fetchone()
    inserted = existing is None

    if inserted:
        cursor.execute(
            """
            INSERT INTO problems(
                slug,title,statement_md,difficulty,source,statement_url,llm_profile_json,
                sample_input,sample_output,created_at
            ) VALUES(?,?,?,?,?,?,?,?,?,?)
            """,
            (
                rec.slug,
                rec.title,
                rec.statement_md,
                rec.difficulty,
                rec.source,
                rec.statement_url,
                rec.llm_profile_json,
                rec.sample_input,
                rec.sample_output,
                now_sec(),
            ),
        )
        problem_id = int(cursor.lastrowid)
    else:
        problem_id = int(existing[0])
        cursor.execute(
            """
            UPDATE problems
            SET title=?,statement_md=?,difficulty=?,source=?,statement_url=?,
                llm_profile_json=?,sample_input=?,sample_output=?
            WHERE id=?
            """,
            (
                rec.title,
                rec.statement_md,
                rec.difficulty,
                rec.source,
                rec.statement_url,
                rec.llm_profile_json,
                rec.sample_input,
                rec.sample_output,
                problem_id,
            ),
        )

    # Replace the tag set for this problem in one pass.
    cursor.execute("DELETE FROM problem_tags WHERE problem_id=?", (problem_id,))
    cursor.executemany(
        "INSERT OR IGNORE INTO problem_tags(problem_id,tag) VALUES(?,?)",
        [(problem_id, tag) for tag in rec.tags],
    )
    conn.commit()
    return problem_id, inserted
|
||||
|
||||
|
||||
def markdown_to_absolute(base_url: str, text: str) -> str:
    """Rewrite root-relative Markdown link targets to absolute URLs.

    Luogu statements reference uploads with root-relative paths such as
    ``[file](/upload/x.png)`` or ``![](/upload/x.png)``; prefixing them
    with *base_url* keeps the statement renderable outside luogu.com.cn.

    Fix: the original second substitution was truncated mid-statement
    (a syntax error). The ``](/`` pattern already covers both plain links
    and image links (``![](/...`` ends with the same ``](/`` sequence),
    so a single substitution is sufficient.

    Returns "" for empty/None-ish input.
    """
    if not text:
        return ""
    return re.sub(r"\]\(/", f"]({base_url}/", text)
|
||||
|
||||
|
||||
def build_statement_md(base_url: str, pid: str, detail: dict[str, Any]) -> str:
    """Assemble a Markdown statement from a Luogu problem-detail payload.

    Empty sections are omitted; relative links are made absolute. Samples
    are intentionally not embedded (they are stored in dedicated columns).
    """
    content = detail.get("content") or {}
    title = str(detail.get("title") or pid).strip()

    def section(key: str) -> str:
        # Missing/None fields collapse to "" so the section is skipped below.
        return markdown_to_absolute(base_url, str(content.get(key) or "").strip())

    parts = [
        f"# {pid} {title}",
        "",
        "- Source: Luogu",
        f"- Problem URL: {base_url}/problem/{pid}",
    ]
    for heading, body in (
        ("Background", section("background")),
        ("Description", section("description")),
        ("Input Format", section("formatI")),
        ("Output Format", section("formatO")),
        ("Hint", section("hint")),
    ):
        if body:
            parts += ["", f"## {heading}", "", body]
    return "\n".join(parts).strip()
|
||||
|
||||
|
||||
def build_record(
    base_url: str,
    list_item: LuoguListItem,
    detail: dict[str, Any],
    tag_catalog: dict[int, dict[str, Any]],
) -> UpsertRecord:
    """Merge list-page and detail-page data into a storable UpsertRecord.

    Detail-page values win over list-page values; the pid is the final
    fallback for the title. Tag ids from both sources are deduped, resolved
    through *tag_catalog*, and turned into normalized tag slugs plus a JSON
    metadata profile.
    """
    pid = list_item.pid
    title = str(detail.get("title") or list_item.title or pid).strip()
    difficulty = int(detail.get("difficulty") or list_item.difficulty or 1)
    statement_url = f"{base_url}/problem/{pid}"
    statement_md = build_statement_md(base_url, pid, detail)

    # Luogu samples arrive as [[input, output], ...]; only the first pair
    # is stored (the statement itself is not re-embedded with samples).
    samples = detail.get("samples") or []
    sample_input = ""
    sample_output = ""
    if samples and isinstance(samples[0], list) and len(samples[0]) >= 2:
        sample_input = str(samples[0][0] or "")
        sample_output = str(samples[0][1] or "")

    detail_tag_ids = detail.get("tags") or []
    if not isinstance(detail_tag_ids, list):
        detail_tag_ids = []
    # dict.fromkeys dedupes while preserving first-seen order.
    tag_ids = list(dict.fromkeys([*list_item.tags, *detail_tag_ids]))

    tag_names: list[str] = []
    knowledge_points: list[str] = []
    normalized_tags: set[str] = {"luogu", "csp"}  # baseline tags for every import
    for tid in tag_ids:
        tag = tag_catalog.get(int(tid))
        if not tag:
            # Unknown id (not in the catalog snapshot) -- skip silently.
            continue
        name = str(tag.get("name") or "").strip()
        if not name:
            continue
        tag_names.append(name)
        normalized_tags.add(normalize_tag(name))

        # Tag type 2 appears to denote knowledge-point tags; capped at 8.
        # NOTE(review): type semantics come from Luogu's catalog -- confirm.
        ttype = int(tag.get("type") or 0)
        if ttype == 2 and len(knowledge_points) < 8:
            knowledge_points.append(name)

        # Map contest-family tags onto stable English slugs.
        upper_name = name.upper()
        if "CSP-J" in upper_name:
            normalized_tags.add("csp-j")
        if "CSP-S" in upper_name:
            normalized_tags.add("csp-s")
        if "NOIP 普及" in name:
            normalized_tags.add("noip-junior")
        if "NOIP 提高" in name:
            normalized_tags.add("noip-senior")

    if not knowledge_points:
        # No typed knowledge tags found: fall back to the first few raw names.
        knowledge_points = tag_names[:6]

    answer = "See official solutions/discussions and verify with your own proof."
    explanation = (
        "This problem is imported from Luogu. The statement and examples are preserved; "
        "practice with your own derivation and compare with accepted solutions."
    )

    # Serialized metadata consumed by downstream LLM/practice features.
    profile = {
        "schema_version": 1,
        "platform": "luogu",
        "pid": pid,
        "difficulty": difficulty,
        "tags": tag_names,
        "tag_ids": tag_ids,
        "knowledge_points": knowledge_points,
        "answer": answer,
        "explanation": explanation,
        "stats": {
            "total_submit": int(list_item.total_submit),
            "total_accepted": int(list_item.total_accepted),
        },
        "source": {
            "url": statement_url,
            "type": list_item.type,
        },
        "generated_at": now_sec(),
    }

    # Sort for deterministic storage and cap the tag count at 30.
    all_tags = sorted({t for t in normalized_tags if t})[:30]
    return UpsertRecord(
        slug=f"luogu-{pid.lower()}",
        title=f"{pid} {title}",
        statement_md=statement_md,
        difficulty=max(1, min(10, difficulty)),  # clamp to the local 1..10 scale
        source=f"luogu:{pid}",
        statement_url=statement_url,
        llm_profile_json=json.dumps(profile, ensure_ascii=False),
        sample_input=sample_input,
        sample_output=sample_output,
        tags=all_tags,
    )
|
||||
|
||||
|
||||
def build_fallback_record(
    base_url: str,
    list_item: LuoguListItem,
    tag_catalog: dict[int, dict[str, Any]],
    error_text: str,
) -> UpsertRecord:
    """Build a degraded record from list-page data when the detail fetch failed.

    The statement is replaced by a pointer back to the original problem URL,
    and the profile is annotated with the failure reason.
    """
    placeholder = (
        "题面抓取失败(已自动降级导入)。"
        f"请访问原题链接查看完整题面:{base_url}/problem/{list_item.pid}"
    )
    pseudo_detail: dict[str, Any] = {
        "title": list_item.title,
        "difficulty": list_item.difficulty,
        "tags": list_item.tags,
        "samples": [],
        "content": {"description": placeholder},
    }
    record = build_record(base_url, list_item, pseudo_detail, tag_catalog)
    # Annotate the profile so consumers can tell this import was degraded.
    profile = json.loads(record.llm_profile_json)
    profile["fallback_import"] = True
    profile["fallback_reason"] = error_text[:240]
    record.llm_profile_json = json.dumps(profile, ensure_ascii=False)
    return record
|
||||
|
||||
|
||||
def fetch_tag_catalog(
    session: requests.Session,
    base_url: str,
    timeout: int,
    retries: int,
    sleep_sec: float,
) -> dict[int, dict[str, Any]]:
    """Download Luogu's tag dictionary and index it by numeric id.

    Entries that are not dicts or lack an "id" key are skipped.
    """
    raw = requests_retry_text(
        session,
        f"{base_url}/_lfe/tags/zh-CN",
        timeout=timeout,
        retries=retries,
        sleep_sec=sleep_sec,
    )
    catalog: dict[int, dict[str, Any]] = {}
    for entry in json.loads(raw).get("tags") or []:
        if isinstance(entry, dict) and "id" in entry:
            catalog[int(entry["id"])] = entry
    return catalog
|
||||
|
||||
|
||||
def fetch_list_page(
    session: requests.Session,
    base_url: str,
    tags_csv: str,
    page: int,
    timeout: int,
    retries: int,
    sleep_sec: float,
) -> tuple[int, int, list[LuoguListItem]]:
    """Fetch one page of the Luogu problem list filtered by tag ids.

    Returns (total_count, per_page, items); the first two come from the
    embedded pagination metadata and drive the page loop in main().
    """
    url = f"{base_url}/problem/list?type=all&tag={quote(tags_csv)}&page={page}"
    html_text = requests_retry_text(
        session, url, timeout=timeout, retries=retries, sleep_sec=sleep_sec
    )
    ctx = extract_context_json(html_text)
    problems = ((ctx.get("data") or {}).get("problems") or {})
    count = int(problems.get("count") or 0)
    # Page size as reported by Luogu; default to 50 if absent.
    per_page = int(problems.get("perPage") or 50)

    result: list[LuoguListItem] = []
    for row in problems.get("result") or []:
        # Skip malformed rows and rows without a pid.
        if not isinstance(row, dict):
            continue
        pid = str(row.get("pid") or "").strip()
        if not pid:
            continue
        tags = row.get("tags") if isinstance(row.get("tags"), list) else []
        result.append(
            LuoguListItem(
                pid=pid,
                title=str(row.get("title") or "").strip(),
                difficulty=int(row.get("difficulty") or 1),
                # Keep only integer entries from the raw tag list.
                tags=[int(x) for x in tags if isinstance(x, int)],
                total_submit=int(row.get("totalSubmit") or 0),
                total_accepted=int(row.get("totalAccepted") or 0),
                type=str(row.get("type") or "").strip(),
            )
        )
    return count, per_page, result
|
||||
|
||||
|
||||
def fetch_problem_detail(
    base_url: str,
    pid: str,
    timeout: int,
    retries: int,
    sleep_sec: float,
) -> dict[str, Any]:
    """Fetch one problem page and return its embedded detail payload.

    A fresh Session is created on every call; main() invokes this from
    worker threads, so no session object is shared across threads.
    Raises RuntimeError when the payload is missing or malformed.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
        ),
        "Referer": f"{base_url}/problem/list",
    }
    session = requests.Session()
    session.headers.update(headers)
    page = requests_retry_text(
        session,
        f"{base_url}/problem/{pid}",
        timeout=timeout,
        retries=retries,
        sleep_sec=sleep_sec,
    )
    context = extract_context_json(page)
    detail = (context.get("data") or {}).get("problem") or {}
    if not isinstance(detail, dict):
        raise RuntimeError(f"problem detail invalid: {pid}")
    return detail
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: crawl the tag-filtered Luogu list, fetch each problem
    concurrently, and upsert everything into the local SQLite database while
    tracking progress in the import_jobs/import_job_items tables.

    Returns 0 on completion (individual problem failures are recorded, not
    fatal) and prints a JSON summary to stdout.
    """
    parser = argparse.ArgumentParser(description="Import Luogu CSP-J/S problem set")
    parser.add_argument("--db-path", required=True, help="SQLite db path")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    parser.add_argument(
        "--tag-ids",
        default=",".join(str(x) for x in DEFAULT_TAG_IDS),
        help="Comma separated Luogu tag IDs",
    )
    parser.add_argument("--workers", type=int, default=3)
    parser.add_argument("--max-problems", type=int, default=0)
    parser.add_argument("--timeout", type=int, default=25)
    parser.add_argument("--retries", type=int, default=5)
    parser.add_argument("--retry-sleep-sec", type=float, default=1.2)
    parser.add_argument("--clear-existing", action="store_true")
    parser.add_argument("--clear-all-problems", action="store_true")
    parser.add_argument("--job-trigger", default="manual")
    # NOTE(review): the three flags below are parsed but never read in this
    # function -- presumably consumed by a wrapper script; confirm or remove.
    parser.add_argument("--clear-existing-source-prefix", default="")
    parser.add_argument("--skip-llm", action="store_true")
    parser.add_argument("--llm-limit", type=int, default=0)
    args = parser.parse_args()

    tag_ids = parse_tag_ids(args.tag_ids)
    tags_csv = ",".join(str(x) for x in tag_ids)

    # Shared session for list/catalog fetches (detail fetches build their own
    # per-thread sessions inside fetch_problem_detail).
    session = requests.Session()
    session.headers.update(
        {
            "User-Agent": (
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
            ),
            "Referer": f"{args.base_url}/problem/list",
        }
    )

    tag_catalog = fetch_tag_catalog(
        session,
        args.base_url,
        timeout=args.timeout,
        retries=args.retries,
        sleep_sec=args.retry_sleep_sec,
    )

    # First page yields pagination metadata (count/per_page) for the loop.
    total_count, per_page, first_page_items = fetch_list_page(
        session,
        args.base_url,
        tags_csv,
        page=1,
        timeout=args.timeout,
        retries=args.retries,
        sleep_sec=args.retry_sleep_sec,
    )
    total_pages = max(1, math.ceil(max(1, total_count) / max(1, per_page)))

    # Keyed by pid so duplicate rows across pages collapse to one item.
    all_items: dict[str, LuoguListItem] = {item.pid: item for item in first_page_items}
    for page in range(2, total_pages + 1):
        _, _, page_items = fetch_list_page(
            session,
            args.base_url,
            tags_csv,
            page=page,
            timeout=args.timeout,
            retries=args.retries,
            sleep_sec=args.retry_sleep_sec,
        )
        for item in page_items:
            all_items[item.pid] = item

    # Deterministic order, optionally truncated for partial runs.
    selected = sorted(all_items.values(), key=lambda x: x.pid)
    if args.max_problems > 0:
        selected = selected[: args.max_problems]

    conn = sqlite3.connect(args.db_path)
    conn.row_factory = sqlite3.Row
    conn.execute("PRAGMA foreign_keys=ON")
    conn.execute("PRAGMA busy_timeout=5000")
    ensure_core_tables(conn)
    ensure_problem_columns(conn)
    ensure_import_tables(conn)

    # Optional destructive pre-clean; --clear-all-problems wins over
    # --clear-existing (which only removes luogu-sourced rows).
    cleared_count = 0
    if args.clear_all_problems:
        cur = conn.execute("SELECT COUNT(1) FROM problems")
        cleared_count = int(cur.fetchone()[0] or 0)
        conn.execute("DELETE FROM problems")
        conn.commit()
    elif args.clear_existing:
        cur = conn.execute("SELECT COUNT(1) FROM problems WHERE source LIKE 'luogu:%'")
        cleared_count = int(cur.fetchone()[0] or 0)
        conn.execute("DELETE FROM problems WHERE source LIKE 'luogu:%'")
        conn.commit()

    inserted = 0
    updated = 0
    failed = 0
    fallback_used = 0
    total = len(selected)
    last_error = ""

    options_json = json.dumps(
        {
            "source": "luogu",
            "tag_ids": tag_ids,
            "workers": max(1, args.workers),
            "max_problems": args.max_problems,
            "clear_existing": bool(args.clear_existing),
            "clear_all_problems": bool(args.clear_all_problems),
        },
        ensure_ascii=False,
    )
    job_id = create_import_job(conn, args.job_trigger, total, options_json)
    seed_import_items(conn, job_id, selected)

    # Detail pages are fetched concurrently; all SQLite writes happen on
    # this (main) thread as results are drained via as_completed.
    with ThreadPoolExecutor(max_workers=max(1, args.workers)) as executor:
        futures = {
            executor.submit(
                fetch_problem_detail,
                args.base_url,
                item.pid,
                args.timeout,
                args.retries,
                args.retry_sleep_sec,
            ): item
            for item in selected
        }
        done_count = 0
        for future in as_completed(futures):
            item = futures[future]
            done_count += 1
            try:
                detail = future.result()
                record = build_record(args.base_url, item, detail, tag_catalog)
                problem_id, is_insert = upsert_problem(conn, record)
                if is_insert:
                    inserted += 1
                else:
                    updated += 1
                update_import_item_success(
                    conn,
                    job_id,
                    item.pid,
                    record.title,
                    record.difficulty,
                    problem_id,
                )
                print(
                    f"[{done_count}/{total}] {item.pid} -> {record.title} "
                    f"(difficulty={record.difficulty})",
                    flush=True,
                )
            except Exception as exc:
                # Detail fetch/build failed: degrade to a list-data-only
                # record so the problem is still imported.
                try:
                    record = build_fallback_record(
                        args.base_url, item, tag_catalog, str(exc)
                    )
                    problem_id, is_insert = upsert_problem(conn, record)
                    if is_insert:
                        inserted += 1
                    else:
                        updated += 1
                    fallback_used += 1
                    update_import_item_success(
                        conn,
                        job_id,
                        item.pid,
                        record.title,
                        record.difficulty,
                        problem_id,
                        note=f"fallback: {str(exc)[:300]}",
                    )
                    print(f"[fallback] {item.pid}: {exc}", flush=True)
                except Exception as inner_exc:
                    # Even the fallback failed: record and move on.
                    failed += 1
                    last_error = str(inner_exc)
                    update_import_item_failed(
                        conn,
                        job_id,
                        item.pid,
                        f"{exc}; fallback failed: {inner_exc}",
                    )
                    print(f"[skip] {item.pid}: {exc}; fallback failed: {inner_exc}", flush=True)
            # Progress row is refreshed after every item, success or not.
            update_import_job_progress(
                conn,
                job_id,
                done_count,
                inserted + updated,
                failed,
                last_error,
            )

    finish_import_job(conn, job_id, inserted + updated, failed, last_error)
    conn.close()

    # Machine-readable run summary on stdout.
    print(
        json.dumps(
            {
                "db_path": args.db_path,
                "tags": tag_ids,
                "selected_count": total,
                "inserted": inserted,
                "updated": updated,
                "failed": failed,
                "fallback_used": fallback_used,
                "cleared_count": cleared_count,
                "job_id": job_id,
            },
            ensure_ascii=False,
            indent=2,
        )
    )
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # SystemExit carries main()'s return value as the process exit code.
    raise SystemExit(main())
|
||||
在新工单中引用
屏蔽一个用户