From 443361b8778eeffd6d341b2701ebc2eee6094ac6 Mon Sep 17 00:00:00 2001 From: supermomonga <377137+supermomonga@users.noreply.github.com> Date: Sat, 20 Jun 2026 06:04:06 +0900 Subject: [PATCH] Add J-STAGE E2E coverage and DOI fallback --- .../E2E/JStageCliE2ETests.cs | 490 ++++++++++++++++++ .../Sources/SearchSourceTests.cs | 91 ++++ src/PapersCli.Cli/Commands/PaperCommands.cs | 5 + src/PapersCli.Cli/Sources/JStageSource.cs | 180 ++++++- 4 files changed, 759 insertions(+), 7 deletions(-) create mode 100644 src/PapersCli.Cli.Tests/E2E/JStageCliE2ETests.cs diff --git a/src/PapersCli.Cli.Tests/E2E/JStageCliE2ETests.cs b/src/PapersCli.Cli.Tests/E2E/JStageCliE2ETests.cs new file mode 100644 index 0000000..c01a6e1 --- /dev/null +++ b/src/PapersCli.Cli.Tests/E2E/JStageCliE2ETests.cs @@ -0,0 +1,490 @@ +using System.Collections.Concurrent; +using System.Diagnostics; +using System.Net; +using System.Net.Sockets; +using System.Text; +using System.Text.Json; + +namespace PapersCli.Cli.Tests.E2E; + +public class JStageCliE2ETests +{ + private const string TestDoi = "10.4295/audiology.20.27"; + private const string SearchTitle = "Metz-Test (ABLB-Test, SISI-Test, Békésy-Testとの比較)"; + private const string MetadataTitle = "The Metz-Test (Comparison with the ABLB-Test, the SISI-Test, and Békésy Audiometry)"; + private const string SearchAuthor = "北條 和博"; + private const string MetadataAuthor = "Kazuhiro Hojo"; + private const string TestJournal = "AUDIOLOGY JAPAN"; + private static readonly byte[] PdfBytes = Encoding.UTF8.GetBytes("%PDF-1.4\r\n% actual-route fixture\r\n"); + + [Test] + public async Task Search_UsesObservedJStageApiResponseAndReturnsJson() + { + await using var server = JStageFixtureServer.Start(); + using var environment = TestEnvironment.Create(server.SearchApiUrl, server.Origin); + + var result = await CliRunner.RunAsync( + environment, + "search", "test", + "--source", "jstage", + "--limit", "1", + "--json"); + + await Assert.That(result.ExitCode).IsEqualTo(0); + await Assert.That(result.StandardError).IsEqualTo(""); + + using var json = JsonDocument.Parse(result.StandardOutput); + var root = json.RootElement; + await Assert.That(root.GetProperty("source").GetString()).IsEqualTo("jstage"); + await Assert.That(root.GetProperty("query").GetString()).IsEqualTo("test"); + await Assert.That(root.GetProperty("limit").GetInt32()).IsEqualTo(1); + await Assert.That(root.GetProperty("sort_key").GetString()).IsEqualTo("relevance"); + await Assert.That(root.GetProperty("sort_order").GetString()).IsEqualTo("desc"); + await Assert.That(root.GetProperty("total_results").GetInt32()).IsEqualTo(31644); + + var resultItem = root.GetProperty("results")[0]; + await Assert.That(resultItem.GetProperty("source_id").GetString()).IsEqualTo(TestDoi); + await Assert.That(resultItem.GetProperty("title").GetString()).IsEqualTo(SearchTitle); + await Assert.That(resultItem.GetProperty("authors")[0].GetString()).IsEqualTo(SearchAuthor); + await Assert.That(resultItem.GetProperty("journal").GetString()).IsEqualTo(TestJournal); + await Assert.That(resultItem.GetProperty("download_urls").GetProperty("pdf").GetString()) + .IsEqualTo("https://www.jstage.jst.go.jp/article/audiology1968/20/1/20_1_27/_pdf/-char/ja/"); + + await Assert.That(server.RequestTargets.Any(t => + t.StartsWith("/searchapi/do?", StringComparison.Ordinal) + && t.Contains("article=test", StringComparison.Ordinal) + && t.Contains("count=1", StringComparison.Ordinal) + && t.Contains("start=1", StringComparison.Ordinal) + && t.Contains("sortflg=1", StringComparison.Ordinal))).IsTrue(); + } + + [Test] + public async Task DownloadThenShow_HandlesObservedDoiLookupFailureAndUsesCitationMetadata() + { + await using var server = JStageFixtureServer.Start(); + using var environment = TestEnvironment.Create(server.SearchApiUrl, server.Origin); + + var download = await CliRunner.RunAsync(environment, "download", $"jstage:{TestDoi}"); + + await Assert.That(download.ExitCode).IsEqualTo(0); + await Assert.That(download.StandardError).IsEqualTo(""); + await Assert.That(download.StandardOutput.Contains($"Downloaded: jstage:{TestDoi}", StringComparison.Ordinal)).IsTrue(); + await Assert.That(server.DoiApiErrorRequestCount).IsEqualTo(1); + await Assert.That(server.DoiRedirectRequestCount).IsEqualTo(1); + await Assert.That(server.ArticlePageRequestCount).IsEqualTo(1); + await Assert.That(server.PdfRequestCount).IsEqualTo(1); + + var pdfPath = Path.Combine(environment.DownloadDir, "jstage", "10.4295_audiology.20.27.pdf"); + await Assert.That(File.Exists(pdfPath)).IsTrue(); + await Assert.That(Convert.ToBase64String(await File.ReadAllBytesAsync(pdfPath))).IsEqualTo(Convert.ToBase64String(PdfBytes)); + + var show = await CliRunner.RunAsync(environment, "show", $"jstage:{TestDoi}", "--json"); + + await Assert.That(show.ExitCode).IsEqualTo(0); + await Assert.That(show.StandardError).IsEqualTo(""); + + using var json = JsonDocument.Parse(show.StandardOutput); + var root = json.RootElement; + await Assert.That(root.GetProperty("source").GetString()).IsEqualTo("jstage"); + await Assert.That(root.GetProperty("source_id").GetString()).IsEqualTo(TestDoi); + await Assert.That(root.GetProperty("title").GetString()).IsEqualTo(MetadataTitle); + await Assert.That(root.GetProperty("authors")[0].GetString()).IsEqualTo(MetadataAuthor); + await Assert.That(root.GetProperty("doi").GetString()).IsEqualTo(TestDoi); + await Assert.That(root.GetProperty("journal").GetString()).IsEqualTo(TestJournal); + } + + [Test] + public async Task Download_ReturnsNonZeroWhenMetadataCannotBeFetched() + { + await using var server = JStageFixtureServer.Start(); + using var environment = TestEnvironment.Create(server.SearchApiUrl, server.Origin); + + var download = await CliRunner.RunAsync(environment, "download", "jstage:10.0000/missing"); + + await Assert.That(download.ExitCode).IsEqualTo(1); + await Assert.That(download.StandardOutput.Contains("Could not fetch metadata", StringComparison.Ordinal)).IsTrue(); + } + + private sealed class TestEnvironment : IDisposable + { + private TestEnvironment(string rootDir, string downloadDir, IReadOnlyDictionary variables) + { + RootDir = rootDir; + DownloadDir = downloadDir; + Variables = variables; + } + + public string RootDir { get; } + public string DownloadDir { get; } + public IReadOnlyDictionary Variables { get; } + + public static TestEnvironment Create(string jStageApiUrl, string doiBaseUrl) + { + var rootDir = Path.Combine(Path.GetTempPath(), $"papers-cli-e2e-{Guid.NewGuid():N}"); + var configHome = Path.Combine(rootDir, "config"); + var dataHome = Path.Combine(rootDir, "data"); + var home = Path.Combine(rootDir, "home"); + var downloadDir = Path.Combine(rootDir, "downloads"); + var configDir = Path.Combine(configHome, "papers-cli"); + + Directory.CreateDirectory(configDir); + Directory.CreateDirectory(dataHome); + Directory.CreateDirectory(home); + Directory.CreateDirectory(downloadDir); + + File.WriteAllText( + Path.Combine(configDir, "config.toml"), + $""" + download-dir = "{EscapeTomlString(downloadDir)}" + default-source = "jstage" + + [api-keys] + """); + + return new TestEnvironment( + rootDir, + downloadDir, + new Dictionary + { + ["XDG_CONFIG_HOME"] = configHome, + ["XDG_DATA_HOME"] = dataHome, + ["HOME"] = home, + ["PAPERS_CLI_JSTAGE_API_BASE_URL"] = jStageApiUrl, + ["PAPERS_CLI_DOI_BASE_URL"] = doiBaseUrl, + ["DOTNET_CLI_TELEMETRY_OPTOUT"] = "1", + ["NO_COLOR"] = "1", + }); + } + + public void Dispose() + { + if (Directory.Exists(RootDir)) + Directory.Delete(RootDir, recursive: true); + } + + private static string EscapeTomlString(string value) => + value.Replace("\\", "\\\\", StringComparison.Ordinal).Replace("\"", "\\\"", StringComparison.Ordinal); + } + + private static class CliRunner + { + public static async Task RunAsync(TestEnvironment environment, params string[] args) + { + var cliPath = Path.Combine(AppContext.BaseDirectory, "papers-cli.dll"); + if (!File.Exists(cliPath)) + throw new FileNotFoundException("CLI assembly was not copied to the test output directory.", cliPath); + + var startInfo = new ProcessStartInfo("dotnet") + { + RedirectStandardOutput = true, + RedirectStandardError = true, + UseShellExecute = false, + }; + startInfo.ArgumentList.Add(cliPath); + foreach (var arg in args) + startInfo.ArgumentList.Add(arg); + foreach (var (key, value) in environment.Variables) + startInfo.Environment[key] = value; + + using var process = Process.Start(startInfo) + ?? throw new InvalidOperationException("Failed to start papers-cli process."); + + var stdout = process.StandardOutput.ReadToEndAsync(); + var stderr = process.StandardError.ReadToEndAsync(); + var waitForExit = process.WaitForExitAsync(); + var exited = await Task.WhenAny(waitForExit, Task.Delay(TimeSpan.FromSeconds(30))); + if (exited != waitForExit) + { + process.Kill(entireProcessTree: true); + throw new TimeoutException($"papers-cli timed out: {string.Join(" ", args)}"); + } + + return new CliResult(process.ExitCode, await stdout, await stderr); + } + } + + private sealed record CliResult(int ExitCode, string StandardOutput, string StandardError); + + private sealed class JStageFixtureServer : IAsyncDisposable + { + private readonly TcpListener _listener; + private readonly CancellationTokenSource _cancellation = new(); + private readonly Task _acceptLoop; + private readonly ConcurrentQueue _requestTargets = new(); + private int _articlePageRequestCount; + private int _doiApiErrorRequestCount; + private int _doiRedirectRequestCount; + private int _pdfRequestCount; + + private JStageFixtureServer(TcpListener listener) + { + _listener = listener; + var endpoint = (IPEndPoint)_listener.LocalEndpoint; + Origin = $"http://127.0.0.1:{endpoint.Port}"; + SearchApiUrl = $"{Origin}/searchapi/do"; + ArticleUrl = $"{Origin}/article/audiology1968/20/1/20_1_27/_article"; + PdfUrl = $"{Origin}/article/audiology1968/20/1/20_1_27/_pdf"; + _acceptLoop = Task.Run(AcceptLoopAsync); + } + + public string Origin { get; } + public string SearchApiUrl { get; } + public string ArticleUrl { get; } + public string PdfUrl { get; } + public IReadOnlyCollection RequestTargets => _requestTargets.ToArray(); + public int ArticlePageRequestCount => Volatile.Read(ref _articlePageRequestCount); + public int DoiApiErrorRequestCount => Volatile.Read(ref _doiApiErrorRequestCount); + public int DoiRedirectRequestCount => Volatile.Read(ref _doiRedirectRequestCount); + public int PdfRequestCount => Volatile.Read(ref _pdfRequestCount); + + public static JStageFixtureServer Start() + { + var listener = new TcpListener(IPAddress.Loopback, 0); + listener.Start(); + return new JStageFixtureServer(listener); + } + + public async ValueTask DisposeAsync() + { + _cancellation.Cancel(); + _listener.Stop(); + + try + { + await _acceptLoop.WaitAsync(TimeSpan.FromSeconds(5)); + } + catch (TimeoutException) + { + } + catch (OperationCanceledException) + { + } + + _cancellation.Dispose(); + } + + private async Task AcceptLoopAsync() + { + while (!_cancellation.IsCancellationRequested) + { + try + { + var client = await _listener.AcceptTcpClientAsync(_cancellation.Token); + _ = Task.Run(() => HandleClientAsync(client), _cancellation.Token); + } + catch (OperationCanceledException) + { + break; + } + catch (ObjectDisposedException) + { + break; + } + } + } + + private async Task HandleClientAsync(TcpClient client) + { + using var clientRegistration = client; + try + { + await using var stream = client.GetStream(); + using var reader = new StreamReader(stream, Encoding.ASCII, detectEncodingFromByteOrderMarks: false, leaveOpen: true); + var requestLine = await reader.ReadLineAsync(_cancellation.Token); + if (string.IsNullOrEmpty(requestLine)) + return; + + while (!string.IsNullOrEmpty(await reader.ReadLineAsync(_cancellation.Token))) + { + } + + var parts = requestLine.Split(' ', 3); + var target = parts.Length >= 2 ? parts[1] : "/"; + _requestTargets.Enqueue(target); + + if (target.StartsWith("/searchapi/do", StringComparison.Ordinal)) + { + var article = GetQueryParameter(target, "article"); + if (article == TestDoi || article == "10.0000/missing") + { + Interlocked.Increment(ref _doiApiErrorRequestCount); + await WriteResponseAsync(stream, HttpStatusCode.OK, "application/xml", Encoding.UTF8.GetBytes(BuildObservedDoiLookupErrorFeed(article))); + } + else + { + await WriteResponseAsync(stream, HttpStatusCode.OK, "application/xml", Encoding.UTF8.GetBytes(BuildObservedSearchFeed())); + } + } + else if (target == $"/{TestDoi}") + { + Interlocked.Increment(ref _doiRedirectRequestCount); + await WriteRedirectAsync(stream, ArticleUrl); + } + else if (target == "/10.0000/missing") + { + Interlocked.Increment(ref _doiRedirectRequestCount); + await WriteResponseAsync(stream, HttpStatusCode.NotFound, "text/plain", Encoding.UTF8.GetBytes("not found")); + } + else if (target.StartsWith("/article/audiology1968/20/1/20_1_27/_article", StringComparison.Ordinal)) + { + Interlocked.Increment(ref _articlePageRequestCount); + await WriteResponseAsync(stream, HttpStatusCode.OK, "text/html", Encoding.UTF8.GetBytes(BuildObservedArticleHtml())); + } + else if (target.StartsWith("/article/audiology1968/20/1/20_1_27/_pdf", StringComparison.Ordinal)) + { + Interlocked.Increment(ref _pdfRequestCount); + await WriteResponseAsync(stream, HttpStatusCode.OK, "application/pdf", PdfBytes); + } + else + { + await WriteResponseAsync(stream, HttpStatusCode.NotFound, "text/plain", Encoding.UTF8.GetBytes("not found")); + } + } + catch (IOException) + { + } + catch (OperationCanceledException) + { + } + } + + private string BuildObservedSearchFeed() => + """ + + + + 0 + + + Articles + 31644 + 1 + 1 + + + + + + + https://www.jstage.jst.go.jp/article/audiology1968/20/1/20_1_27/_article + https://www.jstage.jst.go.jp/article/audiology1968/20/1/20_1_27/_article/-char/ja/ + + + + + + + + + + + + + + + 1977 + 10.4295/audiology.20.27 + + + """; + + private static string BuildObservedDoiLookupErrorFeed(string article) => + $$""" + + + + ERR_001 + ERR_001 + + Articles + + https://api.jstage.jst.go.jp/searchapi/do?service=3&count=10&article={{WebUtility.HtmlEncode(article)}} + 3 + + + + + + <link/> + <id/> + <updated/> + </entry> + </feed> + """; + + private string BuildObservedArticleHtml() => + $$""" + <!doctype html> + <html> + <head> + <meta name="citation_journal_title" content="AUDIOLOGY JAPAN" /> + <meta name="citation_author" content="Kazuhiro Hojo" /> + <meta name="citation_author_institution" content="Dept. of Otolaryngology, School of Medicine, Niigata Univ." /> + <meta name="citation_author" content="Satoru Hosokawa" /> + <meta name="citation_author" content="Yukio Toda" /> + <meta name="citation_author" content="Yoshiaki Ohno" /> + <meta name="citation_author" content="Kanemasa Mizukoshi" /> + <meta name="citation_title" content="The Metz-Test (Comparison with the ABLB-Test, the SISI-Test, and Békésy Audiometry)" /> + <meta name="citation_publication_date" content="1977" /> + <meta name="citation_doi" content="10.4295/audiology.20.27" /> + <meta name="citation_pdf_url" content="{{PdfUrl}}" /> + <meta name="pdf_url" content="{{PdfUrl}}" /> + </head> + <body> + <a href="{{PdfUrl}}" class="thirdlevel-pdf-btn">Download PDF</a> + </body> + </html> + """; + + private static string? GetQueryParameter(string target, string name) + { + var question = target.IndexOf('?'); + if (question < 0) + return null; + + foreach (var pair in target[(question + 1)..].Split('&', StringSplitOptions.RemoveEmptyEntries)) + { + var parts = pair.Split('=', 2); + if (parts.Length == 0 || parts[0] != name) + continue; + + return parts.Length == 1 + ? "" + : Uri.UnescapeDataString(parts[1].Replace('+', ' ')); + } + + return null; + } + + private static async Task WriteRedirectAsync(NetworkStream stream, string location) + { + var headers = Encoding.ASCII.GetBytes( + "HTTP/1.1 302 Found\r\n" + + $"Location: {location}\r\n" + + "Content-Length: 0\r\n" + + "Connection: close\r\n" + + "\r\n"); + + await stream.WriteAsync(headers); + } + + private static async Task WriteResponseAsync(NetworkStream stream, HttpStatusCode status, string contentType, byte[] body) + { + var reasonPhrase = status switch + { + HttpStatusCode.OK => "OK", + HttpStatusCode.NotFound => "Not Found", + _ => status.ToString(), + }; + var headers = Encoding.ASCII.GetBytes( + $"HTTP/1.1 {(int)status} {reasonPhrase}\r\n" + + $"Content-Type: {contentType}\r\n" + + $"Content-Length: {body.Length}\r\n" + + "Connection: close\r\n" + + "\r\n"); + + await stream.WriteAsync(headers); + await stream.WriteAsync(body); + } + } +} diff --git a/src/PapersCli.Cli.Tests/Sources/SearchSourceTests.cs b/src/PapersCli.Cli.Tests/Sources/SearchSourceTests.cs index 3e5f710..1127703 100644 --- a/src/PapersCli.Cli.Tests/Sources/SearchSourceTests.cs +++ b/src/PapersCli.Cli.Tests/Sources/SearchSourceTests.cs @@ -145,6 +145,84 @@ public async Task JStageSearch_ParsesTotalResultsAndKeepsWarnEntries() await Assert.That(handler.LastRequestUri.Query.Contains("sortflg=5")).IsTrue(); } + [Test] + [NotInParallel("Environment")] + public async Task JStageGetMetadata_FollowsObservedDoiRedirectAndParsesCitationMetadata() + { + var previousJStageBaseUrl = Environment.GetEnvironmentVariable("PAPERS_CLI_JSTAGE_API_BASE_URL"); + var previousDoiBaseUrl = Environment.GetEnvironmentVariable("PAPERS_CLI_DOI_BASE_URL"); + Environment.SetEnvironmentVariable("PAPERS_CLI_JSTAGE_API_BASE_URL", "https://api.test/searchapi/do"); + Environment.SetEnvironmentVariable("PAPERS_CLI_DOI_BASE_URL", "https://doi.test"); + + try + { + var handler = new SequenceHttpMessageHandler(request => + { + if (request.RequestUri!.Host == "api.test") + { + return new HttpResponseMessage(HttpStatusCode.OK) + { + Content = new StringContent(""" + <?xml version="1.0" encoding="UTF-8"?> + <feed xmlns="http://www.w3.org/2005/Atom" xmlns:prism="http://prismstandard.org/namespaces/basic/2.0/" xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/" xml:lang="ja"> + <result><status>ERR_001</status><message>ERR_001</message></result> + <opensearch:totalResults/> + <entry><title/><link/><id/><updated/></entry> + </feed> + """, Encoding.UTF8, "application/xml"), + }; + } + + if (request.RequestUri.Host == "doi.test") + { + return new HttpResponseMessage(HttpStatusCode.Redirect) + { + Headers = { Location = new Uri("http://www.jstage.jst.go.jp/article/audiology1968/20/1/20_1_27/_article") }, + }; + } + + if (request.RequestUri.ToString() == "https://www.jstage.jst.go.jp/article/audiology1968/20/1/20_1_27/_article") + { + return new HttpResponseMessage(HttpStatusCode.OK) + { + Content = new StringContent(""" + <!doctype html> + <html> + <head> + <meta name="citation_journal_title" content="AUDIOLOGY JAPAN" /> + <meta name="citation_author" content="Kazuhiro Hojo" /> + <meta name="citation_title" content="The Metz-Test (Comparison with the ABLB-Test, the SISI-Test, and Békésy Audiometry)" /> + <meta name="citation_publication_date" content="1977" /> + <meta name="citation_doi" content="10.4295/audiology.20.27" /> + <meta name="citation_pdf_url" content="https://www.jstage.jst.go.jp/article/audiology1968/20/1/20_1_27/_pdf" /> + </head> + </html> + """, Encoding.UTF8, "text/html"), + }; + } + + return new HttpResponseMessage(HttpStatusCode.NotFound); + }); + var source = new JStageSource(new HttpClient(handler), new CiNiiSource(new HttpClient(handler))); + + var metadata = await source.GetMetadataAsync("10.4295/audiology.20.27"); + + await Assert.That(metadata).IsNotNull(); + await Assert.That(metadata!.SourceId).IsEqualTo("10.4295/audiology.20.27"); + await Assert.That(metadata.Title).IsEqualTo("The Metz-Test (Comparison with the ABLB-Test, the SISI-Test, and Békésy Audiometry)"); + await Assert.That(metadata.Authors[0]).IsEqualTo("Kazuhiro Hojo"); + await Assert.That(metadata.PublishedAt).IsEqualTo("1977-01-01"); + await Assert.That(metadata.Journal).IsEqualTo("AUDIOLOGY JAPAN"); + await Assert.That(metadata.DownloadUrls["pdf"]).IsEqualTo("https://www.jstage.jst.go.jp/article/audiology1968/20/1/20_1_27/_pdf"); + await Assert.That(handler.RequestUris.Select(uri => uri.ToString()).Contains("https://www.jstage.jst.go.jp/article/audiology1968/20/1/20_1_27/_article")).IsTrue(); + } + finally + { + Environment.SetEnvironmentVariable("PAPERS_CLI_JSTAGE_API_BASE_URL", previousJStageBaseUrl); + Environment.SetEnvironmentVariable("PAPERS_CLI_DOI_BASE_URL", previousDoiBaseUrl); + } + } + private sealed class StubHttpMessageHandler(string response, string mediaType = "application/xml") : HttpMessageHandler { public Uri? LastRequestUri { get; private set; } @@ -158,4 +236,17 @@ protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage reques }); } } + + private sealed class SequenceHttpMessageHandler(Func<HttpRequestMessage, HttpResponseMessage> responseFactory) : HttpMessageHandler + { + private readonly List<Uri> _requestUris = []; + + public IReadOnlyList<Uri> RequestUris => _requestUris; + + protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken) + { + _requestUris.Add(request.RequestUri!); + return Task.FromResult(responseFactory(request)); + } + } } diff --git a/src/PapersCli.Cli/Commands/PaperCommands.cs b/src/PapersCli.Cli/Commands/PaperCommands.cs index a462b7c..351836e 100644 --- a/src/PapersCli.Cli/Commands/PaperCommands.cs +++ b/src/PapersCli.Cli/Commands/PaperCommands.cs @@ -164,6 +164,7 @@ public async Task Download( } var requestedFormats = format?.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries) ?? ["pdf"]; + var hadErrors = false; await AnsiConsole.Progress() .StartAsync(async ctx => @@ -179,11 +180,15 @@ await AnsiConsole.Progress() } catch (Exception ex) { + hadErrors = true; AnsiConsole.MarkupLine($"[red]Error: {Markup.Escape(ex.Message)}[/]"); task.Value = task.MaxValue; } } }); + + if (hadErrors) + Environment.ExitCode = 1; } /// <summary> diff --git a/src/PapersCli.Cli/Sources/JStageSource.cs b/src/PapersCli.Cli/Sources/JStageSource.cs index bb17c21..af24c96 100644 --- a/src/PapersCli.Cli/Sources/JStageSource.cs +++ b/src/PapersCli.Cli/Sources/JStageSource.cs @@ -1,3 +1,4 @@ +using System.Net; using System.Text.RegularExpressions; using System.Web; using System.Xml.Linq; @@ -7,10 +8,12 @@ namespace PapersCli.Cli.Sources; public partial class JStageSource(HttpClient httpClient, CiNiiSource cinii) : IPaperSource { - private const string BaseUrl = "https://api.jstage.jst.go.jp/searchapi/do"; + private const string DefaultBaseUrl = "https://api.jstage.jst.go.jp/searchapi/do"; private static readonly XNamespace AtomNs = "http://www.w3.org/2005/Atom"; private static readonly XNamespace PrismNs = "http://prismstandard.org/namespaces/basic/2.0/"; private static readonly XNamespace OpenSearchNs = "http://a9.com/-/spec/opensearch/1.1/"; + private readonly string _baseUrl = ResolveBaseUrl(); + private readonly string _doiBaseUrl = ResolveDoiBaseUrl(); public string Name => "jstage"; public IReadOnlyList<string> SupportedFormats => ["pdf"]; @@ -40,6 +43,10 @@ public async Task<SearchResultsPage> SearchAsync( if (result is not null) return result; + result = await GetMetadataFromArticlePageAsync(sourceId, cancellationToken); + if (result is not null) + return result; + // Fallback: try CiNii (sourceId might be a CRID) var ciniiResult = await cinii.GetMetadataAsync(sourceId, cancellationToken); if (ciniiResult is not null) @@ -55,13 +62,11 @@ public async Task<Dictionary<string, string>> GetDownloadUrlsAsync(string source if (doi is not null) { - var doiUrl = $"https://doi.org/{doi}"; + var doiUrl = $"{_doiBaseUrl}/{doi}"; try { - using var handler = new HttpClientHandler { AllowAutoRedirect = true }; - using var resolveClient = new HttpClient(handler); var request = new HttpRequestMessage(HttpMethod.Head, doiUrl); - var response = await resolveClient.SendAsync(request, cancellationToken); + var response = await HttpRetryHandler.SendWithRetryAsync(httpClient, request, cancellationToken: cancellationToken); var resolvedUrl = response.RequestMessage?.RequestUri?.ToString(); if (resolvedUrl is not null && resolvedUrl.Contains("jstage.jst.go.jp")) @@ -109,7 +114,7 @@ private async Task<SearchResultsPage> SearchJStageAsync( _ => "sortflg=1", }); - var url = $"{BaseUrl}?{string.Join("&", parameters)}"; + var url = $"{_baseUrl}?{string.Join("&", parameters)}"; try { @@ -163,7 +168,7 @@ private async Task<SearchResultsPage> SearchJStageAsync( if (sourceId.Contains('/')) parameters.Add($"article={HttpUtility.UrlEncode(sourceId)}"); - var url = $"{BaseUrl}?{string.Join("&", parameters)}"; + var url = $"{_baseUrl}?{string.Join("&", parameters)}"; try { @@ -190,6 +195,43 @@ private async Task<SearchResultsPage> SearchJStageAsync( } } + private async Task<SearchResult?> GetMetadataFromArticlePageAsync(string sourceId, CancellationToken cancellationToken) + { + foreach (var url in BuildArticlePageCandidates(sourceId)) + { + var currentUrl = url; + try + { + for (var redirectCount = 0; redirectCount < 5; redirectCount++) + { + var request = new HttpRequestMessage(HttpMethod.Get, currentUrl); + var response = await HttpRetryHandler.SendWithRetryAsync(httpClient, request, cancellationToken: cancellationToken); + if (IsRedirect(response.StatusCode) && response.Headers.Location is not null) + { + currentUrl = ResolveRedirectUrl(currentUrl, response.Headers.Location); + response.Dispose(); + continue; + } + + if (!response.IsSuccessStatusCode) + break; + + var html = await response.Content.ReadAsStringAsync(cancellationToken); + var resolvedUrl = response.RequestMessage?.RequestUri?.ToString() ?? currentUrl; + var result = ParseCitationMetadata(html, resolvedUrl, sourceId); + if (result is not null) + return result; + break; + } + } + catch (HttpRequestException) + { + } + } + + return null; + } + private SearchResult? ParseEntry(XElement entry) { var title = entry.Element(AtomNs + "article_title")?.Element(AtomNs + "ja")?.Value?.Trim() @@ -245,12 +287,136 @@ private async Task<SearchResultsPage> SearchJStageAsync( }; } + private SearchResult? ParseCitationMetadata(string html, string url, string sourceId) + { + var metadata = ExtractMetaTags(html); + var title = GetFirstMeta(metadata, "citation_title"); + if (string.IsNullOrEmpty(title)) + return null; + + var doi = GetFirstMeta(metadata, "citation_doi"); + var resultSourceId = !string.IsNullOrEmpty(doi) + ? doi + : ParseUrl(url) ?? sourceId; + var pdfUrl = GetFirstMeta(metadata, "citation_pdf_url") ?? GetFirstMeta(metadata, "pdf_url"); + var downloadUrls = new Dictionary<string, string>(); + if (!string.IsNullOrEmpty(pdfUrl)) + downloadUrls["pdf"] = pdfUrl; + + return new SearchResult + { + Source = "jstage", + SourceId = resultSourceId, + Title = title, + Authors = metadata.TryGetValue("citation_author", out var authors) ? authors.ToArray() : [], + PublishedAt = FormatPublicationDate(GetFirstMeta(metadata, "citation_publication_date")), + Url = url, + Doi = doi, + Journal = GetFirstMeta(metadata, "citation_journal_title"), + Categories = [], + DownloadUrls = downloadUrls, + }; + } + [GeneratedRegex(@"jstage\.jst\.go\.jp/article/((?:[^/]+/){3}[^/]+)")] private static partial Regex JStageArticleUrlPattern(); [GeneratedRegex(@"/_article(?:/|$)")] private static partial Regex JStageArticleToPdfPattern(); + [GeneratedRegex(@"<meta\s+[^>]*>", RegexOptions.IgnoreCase)] + private static partial Regex MetaTagPattern(); + + [GeneratedRegex(@"(?<name>[A-Za-z_:.-]+)\s*=\s*(?<quote>[""'])(?<value>.*?)\k<quote>", RegexOptions.IgnoreCase | RegexOptions.Singleline)] + private static partial Regex HtmlAttributePattern(); + private static int? ParseIntElement(XElement? root, XName name) => int.TryParse(root?.Element(name)?.Value, out var value) ? value : null; + + private IEnumerable<string> BuildArticlePageCandidates(string sourceId) + { + if (IsDoi(sourceId)) + yield return $"{_doiBaseUrl}/{sourceId}"; + else if (sourceId.Contains('/')) + yield return $"https://www.jstage.jst.go.jp/article/{sourceId}/_article/-char/ja/"; + } + + private static Dictionary<string, List<string>> ExtractMetaTags(string html) + { + var metadata = new Dictionary<string, List<string>>(StringComparer.OrdinalIgnoreCase); + + foreach (Match tag in MetaTagPattern().Matches(html)) + { + string? name = null; + string? content = null; + foreach (Match attr in HtmlAttributePattern().Matches(tag.Value)) + { + var attrName = attr.Groups["name"].Value; + var attrValue = WebUtility.HtmlDecode(attr.Groups["value"].Value.Trim()); + if (attrName.Equals("name", StringComparison.OrdinalIgnoreCase)) + name = attrValue; + else if (attrName.Equals("content", StringComparison.OrdinalIgnoreCase)) + content = attrValue; + } + + if (string.IsNullOrEmpty(name) || string.IsNullOrEmpty(content)) + continue; + + if (!metadata.TryGetValue(name, out var values)) + { + values = []; + metadata[name] = values; + } + values.Add(content); + } + + return metadata; + } + + private static string? GetFirstMeta(Dictionary<string, List<string>> metadata, string name) + => metadata.TryGetValue(name, out var values) + ? values.FirstOrDefault(v => !string.IsNullOrWhiteSpace(v)) + : null; + + private static string? FormatPublicationDate(string? value) + { + if (string.IsNullOrWhiteSpace(value)) + return null; + + var trimmed = value.Trim(); + return trimmed.Length == 4 && int.TryParse(trimmed, out _) + ? $"{trimmed}-01-01" + : trimmed; + } + + private static bool IsDoi(string sourceId) => + sourceId.StartsWith("10.", StringComparison.Ordinal) && sourceId.Contains('/'); + + private static bool IsRedirect(HttpStatusCode statusCode) => + statusCode is HttpStatusCode.Moved + or HttpStatusCode.Redirect + or HttpStatusCode.RedirectMethod + or HttpStatusCode.TemporaryRedirect + or HttpStatusCode.PermanentRedirect; + + private static string ResolveRedirectUrl(string currentUrl, Uri location) + { + var next = location.IsAbsoluteUri ? location : new Uri(new Uri(currentUrl), location); + if (next.Scheme == Uri.UriSchemeHttp && next.Host.Equals("www.jstage.jst.go.jp", StringComparison.OrdinalIgnoreCase)) + next = new UriBuilder(next) { Scheme = Uri.UriSchemeHttps, Port = -1 }.Uri; + + return next.ToString(); + } + + private static string ResolveBaseUrl() + { + var value = Environment.GetEnvironmentVariable("PAPERS_CLI_JSTAGE_API_BASE_URL"); + return string.IsNullOrWhiteSpace(value) ? DefaultBaseUrl : value.TrimEnd('/'); + } + + private static string ResolveDoiBaseUrl() + { + var value = Environment.GetEnvironmentVariable("PAPERS_CLI_DOI_BASE_URL"); + return string.IsNullOrWhiteSpace(value) ? "https://doi.org" : value.TrimEnd('/'); + } }