#include #include "csp/app_state.h" #include "csp/services/crawler_service.h" TEST_CASE("crawler target upsert and queue lifecycle") { csp::AppState::Instance().Init(":memory:"); csp::services::CrawlerService svc(csp::AppState::Instance().db()); const auto first = svc.UpsertTarget("https://Example.com/news/?a=1", "test", "u1", "tester"); REQUIRE(first.inserted); REQUIRE(first.target.id > 0); REQUIRE(first.target.normalized_url == "https://example.com/news"); const auto second = svc.UpsertTarget("https://example.com/news", "test", "u1", "tester"); REQUIRE_FALSE(second.inserted); REQUIRE(second.target.id == first.target.id); auto listed = svc.ListTargets("", 50); REQUIRE(listed.size() == 1); csp::services::CrawlerTarget claimed; REQUIRE(svc.ClaimNextTarget(claimed)); REQUIRE(claimed.id == first.target.id); REQUIRE(claimed.status == "generating"); svc.UpdateGenerated(claimed.id, "{}", "/tmp/demo.py"); svc.MarkTesting(claimed.id); svc.InsertRun(claimed.id, "success", 200, "{}", ""); svc.MarkActive(claimed.id, 1700000000); const auto got = svc.GetTargetById(claimed.id); REQUIRE(got.has_value()); REQUIRE(got->status == "active"); csp::services::CrawlerTarget due; REQUIRE_FALSE(svc.EnqueueDueActiveTarget(3600, 1700002000, due)); REQUIRE(svc.EnqueueDueActiveTarget(3600, 1700004000, due)); REQUIRE(due.id == claimed.id); REQUIRE(due.status == "queued"); const auto runs = svc.ListRuns(claimed.id, 20); REQUIRE(runs.size() == 1); REQUIRE(runs[0].status == "success"); } TEST_CASE("crawler extract urls from mixed text") { const auto urls = csp::services::CrawlerService::ExtractUrls( "请收录 https://one.hao.work/path/?a=1 和 www.Example.com/docs, 谢谢"); REQUIRE(urls.size() == 2); REQUIRE(urls[0] == "https://one.hao.work/path"); REQUIRE(urls[1] == "https://www.example.com/docs"); }