From 3f7f4cd53c1e98bee69fbdee03ac3edaf37b6f8d Mon Sep 17 00:00:00 2001 From: Microck Date: Mon, 25 May 2026 19:19:08 +0000 Subject: [PATCH] feat(api): add kagi extract command Add Kagi Extract API support as a CLI command and MCP tool, returning the first extracted page markdown to match kagimcp behavior. Document the new paid API command and cover CLI/MCP request behavior with integration tests. --- README.md | 3 +- docs/commands/extract.mdx | 77 ++++++++++++++++++++++ docs/docs.json | 1 + docs/reference/auth-matrix.mdx | 2 + docs/reference/coverage.mdx | 2 + src/api.rs | 109 +++++++++++++++++++++++++++--- src/cli.rs | 10 +++ src/main.rs | 16 ++++- src/types.rs | 41 ++++++++++++ tests/integration-cli.rs | 117 ++++++++++++++++++++++++++++++++- 10 files changed, 366 insertions(+), 12 deletions(-) create mode 100644 docs/commands/extract.mdx diff --git a/README.md b/README.md index e285334..b06f99f 100644 --- a/README.md +++ b/README.md @@ -123,7 +123,7 @@ export KAGI_API_TOKEN='...' | credential | what it unlocks | | --- | --- | | `KAGI_SESSION_TOKEN` | base search fallback, `search --lens`, filtered search, `quick`, `ask-page`, `assistant`, `translate`, `summarize --subscriber` | -| `KAGI_API_TOKEN` | public `summarize`, `fastgpt`, `enrich web`, `enrich news` | +| `KAGI_API_TOKEN` | public `summarize`, `extract`, `fastgpt`, `enrich web`, `enrich news` | | none | `news`, `smallweb`, `auth status`, `--help` | example config: @@ -165,6 +165,7 @@ for the full command-to-token matrix, use the [`auth-matrix`](https://kagi.micr. 
| `kagi batch` | run multiple searches in parallel with JSON, TOON, compact, pretty, markdown, or csv output and shared filters | | `kagi auth` | launch the auth wizard, or inspect, validate, and save credentials | | `kagi summarize` | use the paid public summarizer API or the subscriber summarizer with `--subscriber` | +| `kagi extract` | extract a page's full content as markdown through the paid API | | `kagi watch` | rerun a search on an interval and emit added/removed result URLs | | `kagi notify` | send search or news output to a webhook | | `kagi history` | inspect local command history and aggregate query stats | diff --git a/docs/commands/extract.mdx b/docs/commands/extract.mdx new file mode 100644 index 0000000..03395d4 --- /dev/null +++ b/docs/commands/extract.mdx @@ -0,0 +1,77 @@ +--- +title: "extract" +description: "Complete reference for the kagi extract command - fetch a page's full content as markdown using Kagi's Extract API." +--- + +# `kagi extract` + +Extract the readable content of a web page as markdown using Kagi's Extract API. + +## Synopsis + +```bash +kagi extract <URL> +``` + +## Description + +The `kagi extract` command sends one HTTPS URL to Kagi's v1 Extract API and prints the extracted page markdown to stdout. It is useful when a search result, article, or documentation page needs full-page text instead of a search snippet. + +The command uses JSON mode internally because that is the stable envelope returned by the API, then prints the first page's `markdown` field. If Kagi returns no page markdown, the CLI reports the Extract API error details and trace id when available. + +## Authentication + +**Required:** `KAGI_API_TOKEN` + +Extract is part of Kagi's paid API surface and consumes API credit per request. + +## Arguments + +### `<URL>` (Required) + +The HTTPS URL of the page to extract. + +```bash +kagi extract "https://example.com/article" +``` + +Only `https://` URLs with a valid host are accepted. 
+ +## Output + +The command prints markdown directly: + +```markdown +# Article title + +Extracted page content... +``` + +## Examples + +### Save an Article + +```bash +kagi extract "https://example.com/article" > article.md +``` + +### Pipe into Another Tool + +```bash +kagi extract "https://example.com/article" | sed -n '1,80p' +``` + +## Exit Codes + +| Code | Meaning | +|------|---------| +| 0 | Success - markdown extracted | +| 1 | Error - see stderr | + +Common errors: + +- Missing API token +- Non-HTTPS or invalid URL +- Insufficient API credit +- Kagi returns no extractable content +- Network error diff --git a/docs/docs.json b/docs/docs.json index 7f9c445..dc9b635 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -73,6 +73,7 @@ "commands/batch", "commands/auth", "commands/summarize", + "commands/extract", "commands/watch", "commands/notify", "commands/history", diff --git a/docs/reference/auth-matrix.mdx b/docs/reference/auth-matrix.mdx index 9cbf12e..6c31cd3 100644 --- a/docs/reference/auth-matrix.mdx +++ b/docs/reference/auth-matrix.mdx @@ -20,6 +20,7 @@ This reference provides a complete mapping of which commands require which authe | `auth set` | None | None | Saves credentials | | `summarize` | `KAGI_API_TOKEN` | None | Paid public API | | `summarize --subscriber` | `KAGI_SESSION_TOKEN` | None | Subscriber web product | +| `extract` | `KAGI_API_TOKEN` | None | Paid public API | | `news` | None | None | Public endpoint | | `quick` | `KAGI_SESSION_TOKEN` | None | Quick Answer web product | | `ask-page` | `KAGI_SESSION_TOKEN` | None | Subscriber feature | @@ -147,6 +148,7 @@ flowchart TD | `assistant custom` | `KAGI_SESSION_TOKEN` | Create and manage saved assistants | | `translate` | `KAGI_SESSION_TOKEN` | Kagi Translate text mode | | `fastgpt` | `KAGI_API_TOKEN` | Quick factual answers | +| `extract` | `KAGI_API_TOKEN` | Full-page markdown extraction | #### Settings Commands diff --git a/docs/reference/coverage.mdx b/docs/reference/coverage.mdx 
index e398b92..e4350ea 100644 --- a/docs/reference/coverage.mdx +++ b/docs/reference/coverage.mdx @@ -17,6 +17,7 @@ These are official, documented API endpoints: |----------|---------|--------| | Search API | `kagi search` | ✅ Implemented for base search | | Universal Summarizer | `kagi summarize` | ✅ Implemented | +| Extract API | `kagi extract` | ✅ Implemented | | FastGPT | `kagi fastgpt` | ✅ Implemented | | Web Enrichment (Teclis) | `kagi enrich web` | ✅ Implemented | | News Enrichment (TinyGem) | `kagi enrich news` | ✅ Implemented | @@ -72,6 +73,7 @@ These require no authentication: | `summarize` | Public API summarizer | API | ✅ | | `summarize --subscriber` | Web summarizer | Session | ✅ | | `summarize --filter` | Summarize stdin items as URLs or text | API or Session | ✅ | +| `extract` | Extract page content as markdown | API | ✅ | | `watch` | Search diff monitoring | API or Session | ✅ | | `notify` | Webhook notifications for search/news | API, Session, or None | ✅ | | `history` | Local history and stats | None | ✅ | diff --git a/src/api.rs b/src/api.rs index 9b46405..6d6389b 100644 --- a/src/api.rs +++ b/src/api.rs @@ -33,14 +33,15 @@ use crate::types::{ AssistantPromptResponse, AssistantThread, AssistantThreadDeleteResponse, AssistantThreadExportResponse, AssistantThreadListResponse, AssistantThreadOpenResponse, AssistantThreadPagination, CustomBangCreateRequest, CustomBangDetails, CustomBangSummary, - CustomBangUpdateRequest, DeletedResourceResponse, EnrichResponse, FastGptRequest, - FastGptResponse, LensCreateRequest, LensDetails, LensSummary, LensUpdateRequest, - NewsBatchCategories, NewsBatchCategory, NewsCategoriesResponse, NewsCategoryMetadata, - NewsCategoryMetadataList, NewsChaos, NewsChaosResponse, NewsContentFilterSummary, - NewsFilterPresetListEntry, NewsFilterPresetListResponse, NewsLatestBatch, NewsResolvedCategory, - NewsStoriesPayload, NewsStoriesResponse, NewsStoryContentFilterSummary, - RedirectRuleCreateRequest, RedirectRuleDetails, 
RedirectRuleSummary, RedirectRuleUpdateRequest, - SmallWebFeed, SubscriberSummarization, SubscriberSummarizeMeta, SubscriberSummarizeRequest, + CustomBangUpdateRequest, DeletedResourceResponse, EnrichResponse, ExtractPageInput, + ExtractRequest, ExtractResponse, FastGptRequest, FastGptResponse, LensCreateRequest, + LensDetails, LensSummary, LensUpdateRequest, NewsBatchCategories, NewsBatchCategory, + NewsCategoriesResponse, NewsCategoryMetadata, NewsCategoryMetadataList, NewsChaos, + NewsChaosResponse, NewsContentFilterSummary, NewsFilterPresetListEntry, + NewsFilterPresetListResponse, NewsLatestBatch, NewsResolvedCategory, NewsStoriesPayload, + NewsStoriesResponse, NewsStoryContentFilterSummary, RedirectRuleCreateRequest, + RedirectRuleDetails, RedirectRuleSummary, RedirectRuleUpdateRequest, SmallWebFeed, + SubscriberSummarization, SubscriberSummarizeMeta, SubscriberSummarizeRequest, SubscriberSummarizeResponse, SummarizeRequest, SummarizeResponse, TextAlignmentsResponse, ToggleResourceResponse, TranslateBootstrapMetadata, TranslateCommandRequest, TranslateDetectedLanguage, TranslateOptionState, TranslateResponse, TranslateTextResponse, @@ -48,6 +49,7 @@ use crate::types::{ }; const KAGI_SUMMARIZE_PATH: &str = "/api/v0/summarize"; +const KAGI_EXTRACT_PATH: &str = "/api/v1/extract"; const KAGI_SUBSCRIBER_SUMMARIZE_PATH: &str = "/mother/summary_labs"; const KAGI_NEWS_LATEST_PATH: &str = "/api/batches/latest"; const KAGI_NEWS_CATEGORIES_METADATA_PATH: &str = "/api/categories/metadata"; @@ -145,6 +147,45 @@ pub async fn execute_summarize( decode_kagi_json(response, "summarizer").await } +/// Extracts a web page as markdown using Kagi's v1 Extract API with API-token auth. +/// +/// # Arguments +/// * `url` - The HTTPS URL to extract. +/// * `token` - The Kagi API token. +/// +/// # Returns +/// Extracted page markdown. 
+/// +/// # Errors +/// Returns `KagiError::Auth` if the token is missing, `KagiError::Config` if the +/// URL does not satisfy the Extract API contract, and network/parse errors on failure. +pub async fn execute_extract(url: &str, token: &str) -> Result<String, KagiError> { + if token.trim().is_empty() { + return Err(KagiError::Auth( + "missing Kagi API token (expected KAGI_API_TOKEN)".to_string(), + )); + } + + let url = normalize_extract_url(url)?; + let request = ExtractRequest { + pages: vec![ExtractPageInput { url }], + format: "json".to_string(), + }; + + let client = build_client()?; + let response = client + .post(http::kagi_url(KAGI_EXTRACT_PATH)) + .header(header::AUTHORIZATION, format!("Bot {token}")) + .header(header::CONTENT_TYPE, "application/json") + .json(&request) + .send() + .await + .map_err(map_transport_error)?; + + let response: ExtractResponse = decode_kagi_json(response, "Extract").await?; + extract_first_markdown(response) +} + /// Summarizes a URL or text using the subscriber web Summarizer with session-token auth. 
/// +/// # Arguments @@ -232,6 +273,58 @@ pub async fn execute_subscriber_summarize( } } +fn normalize_extract_url(url: &str) -> Result<String, KagiError> { + let trimmed = url.trim(); + if trimmed.is_empty() { + return Err(KagiError::Config("extract requires a URL".to_string())); + } + + let parsed = Url::parse(trimmed) + .map_err(|error| KagiError::Config(format!("extract URL is invalid: {error}")))?; + if parsed.scheme() != "https" { + return Err(KagiError::Config( + "extract URL must use the https scheme".to_string(), + )); + } + if parsed.host_str().is_none() { + return Err(KagiError::Config( + "extract URL must include a valid host".to_string(), + )); + } + + Ok(trimmed.to_string()) +} + +fn extract_first_markdown(response: ExtractResponse) -> Result<String, KagiError> { + if let Some(markdown) = response + .data + .first() + .and_then(|page| page.markdown.as_deref()) + .filter(|markdown| !markdown.is_empty()) + { + return Ok(markdown.to_string()); + } + + let suffix = response + .meta + .trace + .as_deref() + .map(|trace| format!(" (trace id: {trace})")) + .unwrap_or_default(); + + if let Some(errors) = response.errors.filter(|errors| !errors.is_empty()) { + return Err(KagiError::Network(format!( + "Kagi Extract API error: {}{}", + serde_json::to_string(&errors).unwrap_or_else(|_| format!("{errors:?}")), + suffix + ))); + } + + Err(KagiError::Parse(format!( + "Kagi Extract API returned no content{suffix}" + ))) +} + /// Fetches Kagi News stories for a given category with optional content filtering. 
/// /// # Arguments diff --git a/src/cli.rs b/src/cli.rs index 62be49b..e8d7e1c 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -223,6 +223,8 @@ pub enum Commands { Auth(AuthCommand), /// Summarize a URL or text with Kagi's public API or subscriber web Summarizer Summarize(SummarizeArgs), + /// Extract a page's full content as markdown through Kagi's Extract API + Extract(ExtractArgs), /// Read Kagi News from the live public JSON endpoints News(NewsArgs), /// Prompt Kagi Assistant and manage Assistant threads @@ -604,6 +606,14 @@ impl SummarizeArgs { } } +#[derive(Debug, Args)] +/// Arguments for the `extract` subcommand. +pub struct ExtractArgs { + /// HTTPS URL of the page to extract as markdown + #[arg(value_name = "URL")] + pub url: String, +} + #[derive(Debug, Args)] /// Arguments for the `fastgpt` subcommand. pub struct FastGptArgs { diff --git a/src/main.rs b/src/main.rs index 1083b8b..e575b08 100644 --- a/src/main.rs +++ b/src/main.rs @@ -22,8 +22,8 @@ use crate::api::{ execute_custom_assistant_create, execute_custom_assistant_delete, execute_custom_assistant_get, execute_custom_assistant_list, execute_custom_assistant_update, execute_custom_bang_create, execute_custom_bang_delete, execute_custom_bang_get, execute_custom_bang_list, - execute_custom_bang_update, execute_enrich_news, execute_enrich_web, execute_fastgpt, - execute_lens_create, execute_lens_delete, execute_lens_get, execute_lens_list, + execute_custom_bang_update, execute_enrich_news, execute_enrich_web, execute_extract, + execute_fastgpt, execute_lens_create, execute_lens_delete, execute_lens_get, execute_lens_list, execute_lens_set_enabled, execute_lens_update, execute_news, execute_news_categories, execute_news_chaos, execute_news_filter_presets, execute_redirect_create, execute_redirect_delete, execute_redirect_get, execute_redirect_list, @@ -252,6 +252,12 @@ async fn run() -> Result<(), KagiError> { print_json(&response) } } + Commands::Extract(args) => { + let token = 
resolve_api_token(profile.as_deref())?; + let markdown = execute_extract(&args.url, &token).await?; + println!("{markdown}"); + Ok(()) + } Commands::News(args) => { args.validate().map_err(KagiError::Config)?; @@ -2070,6 +2076,7 @@ async fn run_mcp(args: McpArgs, profile: Option<&str>) -> Result<(), KagiError> "tools": [ {"name": "kagi_search", "description": "Search Kagi", "inputSchema": {"type": "object"}}, {"name": "kagi_summarize", "description": "Summarize a URL or text", "inputSchema": {"type": "object"}}, + {"name": "kagi_extract", "description": "Extract a page's full content as markdown", "inputSchema": {"type": "object"}}, {"name": "kagi_quick", "description": "Get a Kagi Quick Answer", "inputSchema": {"type": "object"}}, {"name": "kagi_news", "description": "Fetch Kagi News stories for a category", "inputSchema": {"type": "object"}}, {"name": "kagi_news_search", "description": "Search the News tab of kagi.com (clusters of articles)", "inputSchema": {"type": "object"}} @@ -2136,6 +2143,11 @@ async fn run_mcp_tool_call(request: &Value, profile: Option<&str>) -> Result { + let token = resolve_api_token(profile)?; + let url = arguments.get("url").and_then(Value::as_str).unwrap_or(""); + execute_extract(url, &token).await? + } "kagi_quick" => { let token = resolve_session_token(profile)?; let query = arguments.get("query").and_then(Value::as_str).unwrap_or(""); diff --git a/src/types.rs b/src/types.rs index 56c5a5c..2915a86 100644 --- a/src/types.rs +++ b/src/types.rs @@ -106,6 +106,47 @@ pub struct SummarizeResponse { pub data: Summarization, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +/// Request body for Kagi's v1 content extraction endpoint. +pub struct ExtractRequest { + pub pages: Vec, + pub format: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +/// A single page input for content extraction. 
+pub struct ExtractPageInput { + pub url: String, +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)] +/// Metadata returned by the v1 extraction endpoint. +pub struct ExtractMeta { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub trace: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub node: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub ms: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +/// Extracted content for one page. +pub struct ExtractPageOutput { + pub url: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub markdown: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +/// Response from Kagi's v1 content extraction endpoint. +pub struct ExtractResponse { + pub meta: ExtractMeta, + pub data: Vec, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub errors: Option>, +} + #[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)] /// Metadata for the subscriber-mode summarization endpoint. 
pub struct SubscriberSummarizeMeta { diff --git a/tests/integration-cli.rs b/tests/integration-cli.rs index 1b0af9f..cec8e2f 100644 --- a/tests/integration-cli.rs +++ b/tests/integration-cli.rs @@ -486,6 +486,68 @@ fn summarize_url_command_prints_structured_json() { assert_eq!(body["data"]["output"], "A concise summary."); } +#[test] +fn extract_command_prints_markdown_from_mock_api() { + let server = MockServer::start(); + let _extract = server.mock(|when, then| { + when.method(POST) + .path("/api/v1/extract") + .header("authorization", "Bot test-api-token") + .json_body(json!({ + "pages": [ + { + "url": "https://example.com/article" + } + ], + "format": "json" + })); + then.status(200) + .header("content-type", "application/json") + .json_body(json!({ + "meta": { + "trace": "trace-1", + "node": "test", + "ms": 12 + }, + "data": [ + { + "url": "https://example.com/article", + "markdown": "# Article\n\nExtracted content." + } + ] + })); + }); + + let tempdir = TempDir::new().expect("tempdir"); + let env = test_env(&server); + let output = run_kagi( + &["extract", "https://example.com/article"], + &env_refs(&env), + tempdir.path(), + ); + + assert_success(&output); + let stdout = String::from_utf8_lossy(&output.stdout); + assert_eq!(stdout, "# Article\n\nExtracted content.\n"); +} + +#[test] +fn extract_command_rejects_non_https_urls() { + let tempdir = TempDir::new().expect("tempdir"); + let env = [("KAGI_API_TOKEN", API_TOKEN)]; + let output = run_kagi(&["extract", "http://example.com"], &env, tempdir.path()); + + assert!( + !output.status.success(), + "expected non-zero exit for non-HTTPS extract URL" + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("extract URL must use the https scheme"), + "expected HTTPS validation in stderr: {stderr}" + ); +} + fn news_search_html_fixture() -> &'static str { r#"
@@ -870,7 +932,7 @@ fn mcp_initialize_returns_server_info() { } #[test] -fn mcp_tools_list_includes_news() { +fn mcp_tools_list_includes_news_and_extract() { let tempdir = TempDir::new().expect("tempdir"); let output = run_kagi_with_stdin( &["mcp"], @@ -886,6 +948,59 @@ fn mcp_tools_list_includes_news() { tools.iter().any(|tool| tool["name"] == "kagi_news"), "expected kagi_news in tools list, got {tools:?}" ); + assert!( + tools.iter().any(|tool| tool["name"] == "kagi_extract"), + "expected kagi_extract in tools list, got {tools:?}" + ); +} + +#[test] +fn mcp_extract_tool_call_returns_markdown() { + let server = MockServer::start(); + let _extract = server.mock(|when, then| { + when.method(POST) + .path("/api/v1/extract") + .header("authorization", "Bot test-api-token") + .json_body(json!({ + "pages": [ + { + "url": "https://example.com/article" + } + ], + "format": "json" + })); + then.status(200) + .header("content-type", "application/json") + .json_body(json!({ + "meta": { + "trace": "trace-1", + "node": "test", + "ms": 12 + }, + "data": [ + { + "url": "https://example.com/article", + "markdown": "# Article\n\nExtracted content." + } + ] + })); + }); + + let tempdir = TempDir::new().expect("tempdir"); + let env = test_env(&server); + let output = run_kagi_with_stdin( + &["mcp"], + r#"{"jsonrpc":"2.0","id":1,"method":"tools/call","params":{"name":"kagi_extract","arguments":{"url":"https://example.com/article"}}}"#, + &env_refs(&env), + tempdir.path(), + ); + + assert_success(&output); + let response: Value = serde_json::from_slice(&output.stdout).expect("mcp json parses"); + assert_eq!( + response["result"]["content"][0]["text"], + "# Article\n\nExtracted content." + ); } #[test]