diff --git a/AGENTS.md b/AGENTS.md index dbfb6a00..1738a8eb 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -78,17 +78,17 @@ Two pieces per engine: ## Build & test -A configured build dir already exists (`cmake-build-debug`, also `…-release`, -`…-relwithdebinfo`). Typical loop: +A configured build dir already exists (`cmake-build-relwithdebinfo`, also `…-debug`, +`…-release`). Typical loop: ```bash # library -cmake --build cmake-build-debug --target odr +cmake --build cmake-build-relwithdebinfo --target odr # tests (the ODR_TEST option is on in this build dir) -cmake --build cmake-build-debug --target odr_test -./cmake-build-debug/test/odr_test --gtest_filter='OldMs.*' +cmake --build cmake-build-relwithdebinfo --target odr_test +./cmake-build-relwithdebinfo/test/odr_test --gtest_filter='OldMs.*' # CLI (renders a file to a directory of HTML) -cmake --build cmake-build-debug --target translate +cmake --build cmake-build-relwithdebinfo --target translate ``` Notable CMake options (`CMakeLists.txt`): `ODR_TEST`, `ODR_CLI`, diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 5229021d..0ef78d0c 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -143,7 +144,7 @@ class HtmlServiceImpl final : public HtmlService { o << "bottom:" << offset[1] / 72.0 << "in;"; o << "font-size:" << size << "pt;"; })); - out.write_raw(unicode); + out.write_raw(escape_text(unicode)); out.write_element_end("span"); } else if (op.type == pdf::GraphicsOperatorType::show_text_manual_spacing) { diff --git a/src/odr/internal/ooxml/ooxml_util.cpp b/src/odr/internal/ooxml/ooxml_util.cpp index 37d365eb..f8c8af7d 100644 --- a/src/odr/internal/ooxml/ooxml_util.cpp +++ b/src/odr/internal/ooxml/ooxml_util.cpp @@ -94,7 +94,7 @@ ooxml::read_pct_attribute(const pugi::xml_attribute attribute) { // potentially this should be moved to a table parser std::string val = 
attribute.value(); - util::string::trim(val); + util::string::trim_inplace(val); if (val.find('%') != std::string::npos) { util::string::replace_all(val, "%", ""); diff --git a/src/odr/internal/pdf/AGENTS.md b/src/odr/internal/pdf/AGENTS.md index 1cfa284f..ab9d2445 100644 --- a/src/odr/internal/pdf/AGENTS.md +++ b/src/odr/internal/pdf/AGENTS.md @@ -9,7 +9,8 @@ module (poppler / pdf2htmlEX, behind `ODR_WITH_PDF2HTMLEX`) is the production-quality alternative engine. **Scope today.** Parse the PDF object/file structure (classic cross-reference -tables, cross-reference streams, object streams, hybrid files), build the page +tables, cross-reference streams, object streams, hybrid files, with a +forward-scan recovery path for broken cross-references), build the page tree with fonts and annotations, tokenize page content streams into graphics operators, and emit a **proof-of-concept HTML rendering**: absolutely positioned text spans per `Tj`, pages sized from `MediaBox`. Encrypted files are decrypted @@ -42,6 +43,15 @@ not production-quality — the HTML path still contains debug `std::cout` output Lenient where the wild demands: `/Type /XRef` only warns, references to free or absent objects resolve to null with a `Logger` warning, `n g obj` need not end with a newline. +- **Cross-reference recovery**: when the trailer-chain walk throws (missing or + garbage `startxref`, a broken `Prev` chain) or the document fails to build + (no `/Root`, offsets pointing at the wrong objects), the whole file is + forward-scanned for `n g obj` starts, rebuilding a synthetic xref (last + definition of an id wins). `trailer` dictionaries are collected for `/Root`, + `/Encrypt`, `/ID`; recovered `/Type /ObjStm` members are indexed as + compressed entries; and, when no trailer supplied a `/Root`, a `/Type + /Catalog` object is searched for. Handles e.g. an HTTP response saved as `.pdf` + (every offset shifted by the header). 
- **Page tree**: `Catalog` → `Pages` (recursive) → `Page` with per-page `Resources` (fonts only) and `Annots` (raw dictionary only). Objects cached by reference (`DocumentParser::m_objects`). @@ -186,7 +196,9 @@ surprises **throw** `std::runtime_error` (missing `obj`/`endobj`/`stream`/ operators ignored by `execute`, annotations keep their raw dictionary, CMap `codespacerange`/`bfrange` parsed past without effect. References to free/absent objects resolve to null with a warning; unknown xref-stream entry types treated -as absent (7.5.8.3). +as absent (7.5.8.3). A structural throw in the cross-reference layer is not +fatal, though: it is caught once and the file is forward-scanned to rebuild the +table (*Cross-reference recovery* above) before giving up. **Debug output still in place.** `html/pdf_file.cpp`, `pdf_graphics_state.cpp`, `pdf_graphics_operator_parser.cpp` and `pdf_cmap_parser.cpp` print diagnostics @@ -222,11 +234,16 @@ and routes its warnings through it — new diagnostics should do the same. variants), plus inherited-page-attribute coverage (a multi-level `Pages` tree: per-page resolved `MediaBox`/`CropBox`/`Rotate`/`Resources`, override vs. inheritance, the `CropBox` ← `MediaBox` default, the missing-`MediaBox` - US-Letter lenience). End-to-end: the classic fixture + US-Letter lenience), plus cross-reference-recovery coverage (inline broken + mini-PDFs: garbage prepended, a bad `startxref`, no trailer at all → catalog + scan, a duplicate id → last definition wins, a page tree living in an object + stream). End-to-end: the classic fixture `odr-public/pdf/style-various-1.pdf`, plus decryption of `odr-public/pdf/Casio_WVA-M650-7AJF.pdf` (RC4, empty password) and `odr-private/pdf/encrypted_fontfile3_opentype.pdf` (AES-256; skipped when the - private submodule is absent). 
The `odr-private` xref-stream/objstm/hybrid + private submodule is absent), and recovery of the real + `odr-private/pdf/order-EK52VKL0.pdf` (an HTTP response saved as `.pdf`; + likewise skipped when absent). The `odr-private` xref-stream/objstm/hybrid fixtures (`basic_text.pdf`, `geneve_1564.pdf`, `test_fail.pdf`, `Kayla….pdf`, `svg_background…issue402.pdf`, `Core_v5.1.pdf`, `onepage.pdf`) were verified manually but are not pinned in unit tests. Also still contains the original @@ -248,31 +265,16 @@ are ordered by what they unlock; 0–2 are roughly sequential, 3 and 4 are independent, 5 builds on whatever pages already render. Each stage gets its own detailed design before implementation. -## Stage 0 — file-format compatibility (prerequisite) — **mostly done** - -Modern producers write PDF 1.5+ structures the original parser rejected. -Cross-reference/object streams + hybrid files, the filter framework (incl. PNG -predictors), inherited page attributes, and encryption (RC4 / AES-128 / AES-256) -are **all implemented** (see *What works*). The one remaining piece: - -**Xref recovery for broken files** (post-stage-0; the WP2 code left room): -- Trigger: any structural throw during xref-chain walking or a failed object - lookup (`startxref` missing/garbage, offsets wrong). -- Recovery: a single forward scan for `n g obj` line starts (the existing - sequential `read_entry` machinery is most of this), building a synthetic - `Xref` (last definition of an id wins), collecting `trailer` dicts and - `/Type /Catalog` objects as `Root` candidates; objstm members indexed by - scanning recovered object streams. -- Tests fit inline strings well: the scan ignores xref offsets, so a broken - mini-PDF needs no offset bookkeeping — write a literal with a garbage - `startxref`, duplicate ids, or a missing trailer, and assert what got rebuilt. - Real-world fixture: `odr-private/pdf/order-EK52VKL0.pdf` — an HTTP response - accidentally saved as `.pdf` (starts with `HTTP/1.0 200 OK`). 
- -Remaining encryption edge cases (deferred until a real file needs them): -per-stream `/Crypt` filter `Name` overrides, the `EncryptMetadata false` -metadata-stream `Identity` special case, and `Perms` (Algorithm 13) validation; -the public-key security handler and R 5 are out of scope. +## Stage 0 — file-format compatibility (prerequisite) — **done** + +The prerequisite for everything below: read the structures modern producers +write that the original parser rejected, so a real-world `.pdf` reaches the page +tree at all. All of it has landed (see *What works*): the stream-filter +framework (incl. PNG predictors), PDF 1.5+ cross-reference/object streams and +hybrid files, inherited page attributes, encryption (RC4 / AES-128 / AES-256), +and last-resort cross-reference recovery for broken files. Remaining odds and +ends are folded into *Other known gaps* below; the staged renderer work now +builds on a parser that opens the common corpus. ## Stage 1 — text extraction: the code → Unicode chain @@ -441,6 +443,15 @@ tree, little else. ## Other known gaps +- **Encryption edge cases** (deferred from stage 0 until a real file needs + them): per-stream `/Crypt` filter `Name` overrides, the `EncryptMetadata + false` metadata-stream `Identity` special case, and `Perms` (Algorithm 13) + validation. The public-key security handler and revision 5 are out of scope. +- **Recovery limitations** (deferred from stage 0): when several `/Type + /Catalog` objects survive, the first in id order is picked rather than the + newest; the constructor-triggered recovery path cannot decode object streams + in an *encrypted* broken file (no decryptor yet), so such members go + unindexed. Both are edge cases beyond the corpus seen so far. - **Linearized files** are not handled specially (the tail-first read usually still works, but hint streams are ignored). 
- **CMap coverage**: only single-byte `bfchar`; `bfrange`/`codespacerange` diff --git a/src/odr/internal/pdf/pdf_document_parser.cpp b/src/odr/internal/pdf/pdf_document_parser.cpp index 1cc217c4..5f934135 100644 --- a/src/odr/internal/pdf/pdf_document_parser.cpp +++ b/src/odr/internal/pdf/pdf_document_parser.cpp @@ -9,10 +9,17 @@ #include #include +#include +#include + +#include #include #include #include #include +#include +#include +#include namespace odr::internal::pdf { @@ -144,17 +151,19 @@ Resources *parse_resources(DocumentParser &parser, const Object &object, return resources; } +Annotation *parse_annotation(Document &document, const Dictionary &dictionary) { + auto *annotation = document.create_element(); + annotation->object = Object(dictionary); + return annotation; +} + Annotation *parse_annotation(DocumentParser &parser, const ObjectReference &reference, Document &document) { - auto *annotation = document.create_element(); - IndirectObject object = parser.read_object(reference); - const Dictionary &dictionary = object.object.as_dictionary(); - + Annotation *annotation = + parse_annotation(document, object.object.as_dictionary()); annotation->object_reference = reference; - annotation->object = Object(dictionary); - return annotation; } @@ -175,12 +184,19 @@ Page *parse_page(DocumentParser &parser, const ObjectReference &reference, const Object resources = attributes.resolve_into(*page, parser, reference); page->resources = parse_resources(parser, resources, document); - if (dictionary["Contents"].is_reference()) { - page->contents_reference = {dictionary["Contents"].as_reference()}; - } else { - for (const Object &e : dictionary["Contents"].as_array()) { + // /Contents is a content stream or an array of them, supplied directly or + // through an indirect reference (7.7.3.3). Resolve a reference first so that + // a reference to an array is expanded into its stream references rather than + // mistaken for a single stream. 
+ const Object &contents = dictionary["Contents"]; + const Object resolved_contents = + contents.is_reference() ? parser.resolve_object_copy(contents) : contents; + if (resolved_contents.is_array()) { + for (const Object &e : resolved_contents.as_array()) { page->contents_reference.push_back(e.as_reference()); } + } else if (contents.is_reference()) { + page->contents_reference = {contents.as_reference()}; } if (dictionary.has_key("Annots")) { @@ -188,8 +204,15 @@ Page *parse_page(DocumentParser &parser, const ObjectReference &reference, Array annotations = parser.resolve_object_copy(dictionary["Annots"]).as_array(); for (const Object &annotation : annotations) { - page->annotations.push_back( - parse_annotation(parser, annotation.as_reference(), document)); + // entries are usually indirect references, but inline annotation + // dictionaries are equally valid (12.5.2) + if (annotation.is_reference()) { + page->annotations.push_back( + parse_annotation(parser, annotation.as_reference(), document)); + } else if (annotation.is_dictionary()) { + page->annotations.push_back( + parse_annotation(document, annotation.as_dictionary())); + } } } @@ -258,9 +281,16 @@ DocumentParser::DocumentParser(std::unique_ptr in, std::optional decryptor, const Logger &logger) : m_stream(std::move(in)), m_parser(*m_stream), m_logger{&logger} { - auto [xref, trailer] = read_trailer_chain(); - m_xref = std::move(xref); - m_trailer = std::move(trailer); + try { + auto [xref, trailer] = read_trailer_chain(); + m_xref = std::move(xref); + m_trailer = std::move(trailer); + } catch (const std::exception &e) { + ODR_WARNING(*m_logger, "pdf: cross-reference parsing failed (" + << e.what() + << "), scanning the file to recover"); + recover_xref(); + } if (m_trailer.has_key("Encrypt")) { // Build an `Authenticator` from the trailer `/Encrypt` and `/ID` @@ -571,6 +601,217 @@ std::pair DocumentParser::read_trailer_chain() { return {std::move(result_xref), std::move(result_trailer).value()}; } 
+namespace { + +/// Trim leading and trailing PDF whitespace from `line`, returning the offset +/// of the first non-whitespace byte (so the caller can map back to a file +/// position) and a view of the trimmed content. +std::pair trim_line(const std::string &line) { + const std::string_view content = + util::string::trim_view(line, &ObjectParser::is_whitespace); + // `content` is a subrange of `line`, so the leading offset is the distance + // between their data pointers. + return {static_cast(content.data() - line.data()), content}; +} + +/// Recognize an `n g obj` object header at the start of `content` (already +/// trimmed). The dictionary/value may follow on the same line (`12 0 obj<<`), +/// so only the leading `id gen obj` token is required. +std::optional match_object_start(std::string_view content) { + util::stream::ViewStreamBuf buffer(content); + std::istream stream(&buffer); + ObjectParser parser(stream); + + // `peek_unsigned_integer` guards each read so a non-matching line is rejected + // without `read_unsigned_integer` throwing (the common case while scanning). 
+ if (!parser.peek_unsigned_integer()) { + return std::nullopt; + } + const UnsignedInteger id = parser.read_unsigned_integer(); + if (!parser.peek_whitespace()) { + return std::nullopt; + } + parser.skip_whitespace(); + if (!parser.peek_unsigned_integer()) { + return std::nullopt; + } + const UnsignedInteger gen = parser.read_unsigned_integer(); + if (!parser.peek_whitespace()) { + return std::nullopt; + } + parser.skip_whitespace(); + + // the `obj` keyword must follow; guard against identifiers like `object` + // that merely start with `obj` + const std::string rest = parser.read_line(); + const std::string_view tail(rest); + if (!tail.starts_with("obj")) { + return std::nullopt; + } + if (tail.size() > 3 && + (std::isalnum(static_cast(tail[3])) || tail[3] == '.')) { + return std::nullopt; + } + return ObjectReference(id, gen); +} + +/// True if `content` (already trimmed) ends with the `stream` keyword on a word +/// boundary. This covers both a bare `stream` line and a compact object that +/// inlines its dictionary and the `stream` token on one line +/// (`N G obj<<...>>stream`). The boundary check rejects `endstream` and +/// identifiers that merely end in `stream`. +bool opens_stream_body(std::string_view content) { + constexpr std::string_view keyword = "stream"; + if (!content.ends_with(keyword)) { + return false; + } + const std::size_t begin = content.size() - keyword.size(); + return begin == 0 || + !std::isalnum(static_cast(content[begin - 1])); +} + +} // namespace + +void DocumentParser::recover_xref() { + // Offsets from the failed attempt may be wrong, so anything cached from it is + // suspect. 
+ m_objects.clear(); + m_object_streams.clear(); + m_recovered = true; + + ObjectParser &p = parser().parser(); + std::istream &stream = in(); + stream.clear(); + stream.seekg(0, std::ios::end); + const auto size = static_cast(stream.tellg()); + + Xref xref; + Dictionary trailer; + + stream.seekg(0); + while (true) { + stream.clear(); // drop any eofbit set by the previous read_line + const std::int64_t tell = stream.tellg(); + if (tell < 0 || static_cast(tell) >= size) { + break; + } + const auto position = static_cast(tell); + + const std::string line = p.read_line(); + const auto [lead, content] = trim_line(line); + + if (std::optional ref = match_object_start(content)) { + // last definition of an id wins (operator[] overwrites) + xref.table[*ref] = Xref::Entry( + Xref::UsedEntry{static_cast(position + lead)}); + // A compact object may inline its dictionary and the `stream` token on + // this same line; fall through to skip the body below. Otherwise the + // header is fully consumed and we advance to the next line. + if (!opens_stream_body(content)) { + continue; + } + } + + if (opens_stream_body(content)) { + // Skip the stream body so its (possibly object-shaped) bytes are not + // mis-scanned. The length is unknown here, so scan past `endstream`. 
+ stream.clear(); + p.skip_past("endstream"); + continue; + } + + if (content.starts_with("trailer")) { + const std::int64_t after = stream.tellg(); // start of the next line + try { + stream.clear(); + stream.seekg(static_cast(position + lead) + + 7); // "trailer" + p.skip_whitespace(); + for (const Dictionary dict = p.read_dictionary(); + const auto &[key, value] : dict) { + trailer[key] = value; // last trailer wins per key + } + } catch (const std::exception &) { + // ignore a malformed trailer and keep scanning + } + stream.clear(); + if (after >= 0) { + stream.seekg(after); + } + continue; + } + } + + m_xref = std::move(xref); + m_trailer = std::move(trailer); + + index_object_streams(); + + if (!m_trailer.has_key("Root")) { + recover_root(); + } +} + +void DocumentParser::index_object_streams() { + // Snapshot the directly recovered objects: reading object streams adds + // compressed entries, which would invalidate an in-flight iterator. + std::vector candidates; + for (const auto &[reference, entry] : m_xref.table) { + if (entry.is_used()) { + candidates.push_back(reference); + } + } + + for (const ObjectReference &reference : candidates) { + try { + const IndirectObject &object = read_object(reference); + if (!object.has_stream || !object.object.is_dictionary()) { + continue; + } + const Dictionary &dictionary = object.object.as_dictionary(); + if (!dictionary.has_key("Type") || !dictionary["Type"].is_name() || + dictionary["Type"].as_name() != "ObjStm") { + continue; + } + const ObjectStream &members = load_object_stream(reference.id); + for (std::size_t i = 0; i < members.size(); ++i) { + // a directly recovered object wins over its compressed copy + m_xref.table.try_emplace(ObjectReference(members[i].id, 0), + Xref::Entry(Xref::CompressedEntry{ + static_cast(reference.id), + static_cast(i)})); + } + } catch (const std::exception &) { + // an unreadable (or, when encrypted, undecryptable) object stream is + // simply not indexed + } + } +} + +void 
DocumentParser::recover_root() { + for (const auto &[reference, entry] : m_xref.table) { + if (entry.is_free()) { + continue; + } + try { + const IndirectObject &object = read_object(reference); + if (!object.object.is_dictionary()) { + continue; + } + const Dictionary &dictionary = object.object.as_dictionary(); + if (dictionary.has_key("Type") && dictionary["Type"].is_name() && + dictionary["Type"].as_name() == "Catalog") { + ODR_WARNING(*m_logger, "pdf: recovered document catalog " << reference); + m_trailer["Root"] = Object(reference); + return; + } + } catch (const std::exception &) { + // skip objects that fail to read during the catalog search + } + } + ODR_WARNING(*m_logger, "pdf: recovery found no document catalog"); +} + void DocumentParser::decrypt_strings(Object &object, const ObjectReference &reference) { if (object.is_standard_string()) { @@ -599,7 +840,21 @@ DocumentParser::build_document(const Dictionary &trailer) { } std::unique_ptr DocumentParser::parse_document() { - return build_document(m_trailer); + try { + return build_document(m_trailer); + } catch (const std::exception &e) { + // The cross-reference table parsed cleanly but does not describe a usable + // document (no `/Root`, offsets pointing at the wrong objects, …). Scan the + // file once and retry; if recovery already ran, give up. + if (m_recovered) { + throw; + } + ODR_WARNING(*m_logger, "pdf: building the document failed (" + << e.what() + << "), scanning the file to recover"); + recover_xref(); + return build_document(m_trailer); + } } void DocumentParser::resolve_object(Object &object) { diff --git a/src/odr/internal/pdf/pdf_document_parser.hpp b/src/odr/internal/pdf/pdf_document_parser.hpp index ffd60556..e22502f2 100644 --- a/src/odr/internal/pdf/pdf_document_parser.hpp +++ b/src/odr/internal/pdf/pdf_document_parser.hpp @@ -110,6 +110,21 @@ class DocumentParser { /// table together with the newest (first-seen) trailer dictionary. 
[[nodiscard]] std::pair read_trailer_chain(); + /// Last-resort cross-reference recovery for broken files (missing/garbage + /// `startxref`, wrong offsets, a damaged chain): forward-scan the whole file + /// for `n g obj` starts, rebuilding `m_xref` (last definition of an id wins) + /// and collecting `trailer` dictionaries into `m_trailer`. Then object-stream + /// members are indexed (`index_object_streams`) and, if no `trailer` supplied + /// a `/Root`, a `/Type /Catalog` object is searched (`recover_root`). Sets + /// `m_recovered`. Any object cached from the failed attempt is dropped first. + void recover_xref(); + /// Index the members of every recovered `/Type /ObjStm` object as compressed + /// cross-reference entries (additive; an existing direct entry wins). + void index_object_streams(); + /// Search the recovered objects for a `/Type /Catalog` and install it as the + /// trailer `/Root`. + void recover_root(); + [[nodiscard]] std::unique_ptr build_document(const Dictionary &trailer); @@ -125,6 +140,7 @@ class DocumentParser { Xref m_xref; Dictionary m_trailer; + bool m_recovered{false}; bool m_is_encrypted{false}; std::optional m_authenticator; diff --git a/src/odr/internal/pdf/pdf_file_parser.cpp b/src/odr/internal/pdf/pdf_file_parser.cpp index 4938f069..4696b730 100644 --- a/src/odr/internal/pdf/pdf_file_parser.cpp +++ b/src/odr/internal/pdf/pdf_file_parser.cpp @@ -31,7 +31,10 @@ IndirectObject FileParser::read_indirect_object() { result.object = m_parser.read_object(); m_parser.skip_whitespace(); - const std::string next = m_parser.read_line(); + // the keyword may carry trailing whitespace (`endobj \n`) or a CR from a + // CRLF line ending (`stream\r\n`), so compare against the trimmed token + std::string next = m_parser.read_line(); + util::string::rtrim_inplace(next); if (next == "endobj") { m_parser.skip_whitespace(); @@ -125,7 +128,7 @@ std::string FileParser::read_stream(const std::int32_t size) { // TODO improve poor solution while (true) { 
std::string line = m_parser.read_line(true); - if (line == "endstream\n") { + if (util::string::trim(line) == "endstream") { result.pop_back(); break; } @@ -134,10 +137,7 @@ std::string FileParser::read_stream(const std::int32_t size) { } m_parser.skip_whitespace(); - if (const std::string line = m_parser.read_line(); line != "endobj") { - throw std::runtime_error("expected endobj"); - } - + m_parser.expect_characters("endobj"); m_parser.skip_whitespace(); return result; diff --git a/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp b/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp index 7621f05d..cb9e3d0b 100644 --- a/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp +++ b/src/odr/internal/pdf/pdf_graphics_operator_parser.cpp @@ -166,9 +166,33 @@ GraphicsOperator GraphicsOperatorParser::read_operator() { std::cerr << "unknown operator: " << operator_name << std::endl; } + // After `ID` the raw image bytes follow inline; consume them up to `EI` so + // they are not mis-tokenized as operators (which corrupts the parse state). + if (result.type == GraphicsOperatorType::begin_inline_image_data) { + skip_inline_image_data(); + } + m_parser.skip_whitespace(); return result; } +void GraphicsOperatorParser::skip_inline_image_data() { + // Exactly one white-space character separates `ID` from the data (8.9.7). + if (m_parser.geti() != eof) { + m_parser.bumpc(); + } + + // The length is not encoded, so scan for the `EI` terminator. `EI` also + // occurs inside the raw image bytes, so only accept one that is followed by + // white-space or eof; otherwise keep scanning past it. 
+ while (m_parser.skip_past("EI")) { + const int_type after = m_parser.geti(); + if (after == eof || + ObjectParser::is_whitespace(static_cast(after))) { + return; + } + } +} + } // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_graphics_operator_parser.hpp b/src/odr/internal/pdf/pdf_graphics_operator_parser.hpp index 3adf02a1..5a34c957 100644 --- a/src/odr/internal/pdf/pdf_graphics_operator_parser.hpp +++ b/src/odr/internal/pdf/pdf_graphics_operator_parser.hpp @@ -18,6 +18,10 @@ class GraphicsOperatorParser { [[nodiscard]] GraphicsOperator read_operator(); private: + // Consume the binary image data of an inline image, from just after the `ID` + // keyword up to and including its `EI` terminator (8.9.7). + void skip_inline_image_data(); + ObjectParser m_parser; }; diff --git a/src/odr/internal/pdf/pdf_graphics_state.cpp b/src/odr/internal/pdf/pdf_graphics_state.cpp index b242e2b3..c7a790d9 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.cpp +++ b/src/odr/internal/pdf/pdf_graphics_state.cpp @@ -63,7 +63,7 @@ void GraphicsState::execute(const GraphicsOperator &op) { std::cout << "dash pattern not implemented" << std::endl; break; case GraphicsOperatorType::set_color_rendering_intent: - current().general.color_rendering_intent = op.arguments.at(0).as_real(); + current().general.color_rendering_intent = op.arguments.at(0).as_name(); break; case GraphicsOperatorType::set_flatness_tolerance: current().general.flatness_tolerance = op.arguments.at(0).as_real(); diff --git a/src/odr/internal/pdf/pdf_graphics_state.hpp b/src/odr/internal/pdf/pdf_graphics_state.hpp index 6e4ea9e0..b65079f2 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.hpp +++ b/src/odr/internal/pdf/pdf_graphics_state.hpp @@ -22,7 +22,7 @@ struct GraphicsState { int join_style{}; double miter_limit{}; int dash_pattern{}; - double color_rendering_intent{}; + std::string color_rendering_intent; double flatness_tolerance{}; std::string graphics_state_parameters; std::array 
transform_matrix{1, 0, 0, 1, 0, 0}; diff --git a/src/odr/internal/pdf/pdf_object_parser.cpp b/src/odr/internal/pdf/pdf_object_parser.cpp index 84a6540d..357cf5ea 100644 --- a/src/odr/internal/pdf/pdf_object_parser.cpp +++ b/src/odr/internal/pdf/pdf_object_parser.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace odr::internal::pdf { @@ -121,6 +122,45 @@ std::string ObjectParser::read_line(const bool inclusive) { return util::stream::read_line(in(), inclusive); } +bool ObjectParser::skip_past(const std::string_view marker) { + if (marker.empty()) { + return true; + } + + // KMP failure function over the (typically tiny) marker, so the streaming + // scan stays correct even when the marker has internal repetition (e.g. the + // two `e`s in `endstream`). + std::vector fail(marker.size(), 0); + for (std::size_t i = 1, k = 0; i < marker.size(); ++i) { + while (k > 0 && marker[i] != marker[k]) { + k = fail[k - 1]; + } + if (marker[i] == marker[k]) { + ++k; + } + fail[i] = k; + } + + std::size_t matched = 0; + while (true) { + const int_type c = sb().sbumpc(); + if (c == eof) { + in().setstate(std::ios::eofbit); + return false; + } + const auto ch = static_cast(c); + while (matched > 0 && ch != marker[matched]) { + matched = fail[matched - 1]; + } + if (ch == marker[matched]) { + ++matched; + if (matched == marker.size()) { + return true; + } + } + } +} + void ObjectParser::expect_characters(const std::string &string) { const std::string observed = bumpnc(string.size()); if (observed != string) { @@ -135,20 +175,38 @@ bool ObjectParser::peek_number() { return c != eof && (c == '+' || c == '-' || c == '.' 
|| std::isdigit(c)); } -UnsignedInteger ObjectParser::read_unsigned_integer() { +bool ObjectParser::peek_unsigned_integer() { + const int_type c = geti(); + return c != eof && std::isdigit(c); +} + +std::pair +ObjectParser::read_unsigned_integer_and_count() { UnsignedInteger result = 0; + std::uint32_t count = 0; while (true) { const int_type c = geti(); if (c == eof) { - return result; + break; } if (!std::isdigit(c)) { - return result; + break; } result = result * 10 + (c - '0'); + ++count; bumpc(); } + + if (count == 0) { + throw std::runtime_error("expected unsigned integer, but got none"); + } + + return {result, count}; +} + +UnsignedInteger ObjectParser::read_unsigned_integer() { + return read_unsigned_integer_and_count().first; } Integer ObjectParser::read_integer() { @@ -172,23 +230,34 @@ Real ObjectParser::read_number() { } std::variant ObjectParser::read_integer_or_real() { - Integer i = 0; + Integer sign = 1; + if (geti() == '-') { + sign = -1; + bumpc(); + } else if (geti() == '+') { + bumpc(); + } - if (char_type c = getc(); c != '.') { - i = read_integer(); - c = getc(); - if (c != '.') { - return i; - } + UnsignedInteger i = 0; + + if (geti() != '.') { + i = read_unsigned_integer(); + } + if (geti() != '.') { + return static_cast(sign * i); } bumpc(); - const pos_type begin = in().tellg(); - const UnsignedInteger i2 = read_unsigned_integer(); - const pos_type end = in().tellg(); + Real r = static_cast(i); + + if (peek_unsigned_integer()) { + const auto [fraction, decimals] = read_unsigned_integer_and_count(); + // `decimals` is unsigned; negate as floating point to avoid wrap-around. 
+ r += static_cast(fraction) * + std::pow(10.0, -static_cast(decimals)); + } - return static_cast(i) + - static_cast(i2) * std::pow(10.0, begin - end); + return static_cast(sign) * r; } bool ObjectParser::peek_name() { diff --git a/src/odr/internal/pdf/pdf_object_parser.hpp b/src/odr/internal/pdf/pdf_object_parser.hpp index 4558bc4b..8ee00ab4 100644 --- a/src/odr/internal/pdf/pdf_object_parser.hpp +++ b/src/odr/internal/pdf/pdf_object_parser.hpp @@ -5,6 +5,7 @@ #include #include #include +#include #include namespace odr::internal::pdf { @@ -45,9 +46,16 @@ class ObjectParser { void skip_whitespace(); void skip_line(); std::string read_line(bool inclusive = false); + /// Advance the cursor just past the next occurrence of `marker`. Returns true + /// if it was found; on false the stream has been consumed to eof. Operates on + /// raw bytes, so the marker may straddle line breaks. + bool skip_past(std::string_view marker); void expect_characters(const std::string &string); [[nodiscard]] bool peek_number(); + [[nodiscard]] bool peek_unsigned_integer(); + [[nodiscard]] std::pair + read_unsigned_integer_and_count(); [[nodiscard]] UnsignedInteger read_unsigned_integer(); [[nodiscard]] Integer read_integer(); [[nodiscard]] Real read_number(); diff --git a/src/odr/internal/util/stream_util.hpp b/src/odr/internal/util/stream_util.hpp index 04ebbcc5..01cd065f 100644 --- a/src/odr/internal/util/stream_util.hpp +++ b/src/odr/internal/util/stream_util.hpp @@ -1,10 +1,26 @@ #pragma once #include +#include #include +#include namespace odr::internal::util::stream { +/// Read-only stream buffer over an existing `string_view`, so a `std::istream` +/// can scan it without copying into a `std::string`/`std::istringstream`. The +/// view must outlive the buffer. Only the get area is exposed; seeking is not +/// supported. 
+class ViewStreamBuf : public std::streambuf { +public: + explicit ViewStreamBuf(std::string_view view) { + // The get area is only ever read, never written through, so dropping the + // `const` is safe. + char *begin = const_cast(view.data()); + setg(begin, begin, begin + view.size()); + } +}; + std::string read(std::istream &in); std::string read(std::istream &in, std::size_t size); diff --git a/src/odr/internal/util/string_util.cpp b/src/odr/internal/util/string_util.cpp index 9a926a6a..661e2fa5 100644 --- a/src/odr/internal/util/string_util.cpp +++ b/src/odr/internal/util/string_util.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -19,22 +20,61 @@ bool string::ends_with(const std::string &string, const std::string &with) { 0; } -void string::ltrim(std::string &s) { - s.erase(s.begin(), std::ranges::find_if(s, [](const std::uint8_t ch) { - return !std::isspace(ch); +bool string::is_ascii_space(const char c) { + return std::isspace(static_cast(c)) != 0; +} + +void string::ltrim_inplace(std::string &s, const CharPredicate is_space) { + s.erase(s.begin(), std::ranges::find_if(s, [is_space](const char ch) { + return !is_space(ch); })); } -void string::rtrim(std::string &s) { +void string::rtrim_inplace(std::string &s, const CharPredicate is_space) { s.erase(std::find_if(s.rbegin(), s.rend(), - [](const std::uint8_t ch) { return !std::isspace(ch); }) + [is_space](const char ch) { return !is_space(ch); }) .base(), s.end()); } -void string::trim(std::string &s) { - rtrim(s); - ltrim(s); +void string::trim_inplace(std::string &s, const CharPredicate is_space) { + rtrim_inplace(s, is_space); + ltrim_inplace(s, is_space); +} + +std::string string::ltrim(const std::string &s, const CharPredicate is_space) { + return std::string(ltrim_view(s, is_space)); +} + +std::string string::rtrim(const std::string &s, const CharPredicate is_space) { + return std::string(rtrim_view(s, is_space)); +} + +std::string string::trim(const std::string &s, const 
CharPredicate is_space) { + return std::string(trim_view(s, is_space)); +} + +std::string_view string::ltrim_view(std::string_view s, + const CharPredicate is_space) { + std::size_t begin = 0; + while (begin < s.size() && is_space(s[begin])) { + ++begin; + } + return s.substr(begin); +} + +std::string_view string::rtrim_view(std::string_view s, + const CharPredicate is_space) { + std::size_t end = s.size(); + while (end > 0 && is_space(s[end - 1])) { + --end; + } + return s.substr(0, end); +} + +std::string_view string::trim_view(std::string_view s, + const CharPredicate is_space) { + return ltrim_view(rtrim_view(s, is_space), is_space); } void string::replace_all(std::string &string, const std::string &search, diff --git a/src/odr/internal/util/string_util.hpp b/src/odr/internal/util/string_util.hpp index d91e2525..42f8fb83 100644 --- a/src/odr/internal/util/string_util.hpp +++ b/src/odr/internal/util/string_util.hpp @@ -2,6 +2,7 @@ #include #include +#include #include namespace odr::internal::util::string { @@ -9,9 +10,33 @@ namespace odr::internal::util::string { bool starts_with(const std::string &string, const std::string &with); bool ends_with(const std::string &string, const std::string &with); -void ltrim(std::string &s); -void rtrim(std::string &s); -void trim(std::string &s); +/// Predicate deciding whether a byte counts as whitespace for the `*_view` +/// trims. Takes a single `char`; implementations must handle the full byte +/// range without relying on the sign of `char`. +using CharPredicate = bool (*)(char); + +/// `std::isspace` for the default C locale, made safe for any `char` value. 
+bool is_ascii_space(char c); + +void ltrim_inplace(std::string &s, CharPredicate is_space = is_ascii_space); +void rtrim_inplace(std::string &s, CharPredicate is_space = is_ascii_space); +void trim_inplace(std::string &s, CharPredicate is_space = is_ascii_space); + +std::string ltrim(const std::string &s, + CharPredicate is_space = is_ascii_space); +std::string rtrim(const std::string &s, + CharPredicate is_space = is_ascii_space); +std::string trim(const std::string &s, CharPredicate is_space = is_ascii_space); + +/// Trim leading/trailing whitespace and return a view into `s`. The result is a +/// subrange of `s`, so the leading offset is recoverable as +/// `result.data() - s.data()`. +std::string_view ltrim_view(std::string_view s, + CharPredicate is_space = is_ascii_space); +std::string_view rtrim_view(std::string_view s, + CharPredicate is_space = is_ascii_space); +std::string_view trim_view(std::string_view s, + CharPredicate is_space = is_ascii_space); void replace_all(std::string &string, const std::string &search, const std::string &replace); diff --git a/test/data/reference-output/odr-private b/test/data/reference-output/odr-private index c6cad8af..f8f00288 160000 --- a/test/data/reference-output/odr-private +++ b/test/data/reference-output/odr-private @@ -1 +1 @@ -Subproject commit c6cad8afe5795d343d3d8cfc634368694d40fc3b +Subproject commit f8f00288248d2d7aef0e113289feaf5fdc69510b diff --git a/test/data/reference-output/odr-public b/test/data/reference-output/odr-public index f62e13cd..9b3c8a3c 160000 --- a/test/data/reference-output/odr-public +++ b/test/data/reference-output/odr-public @@ -1 +1 @@ -Subproject commit f62e13cdba20b099622b0091e6abbcc0675f378b +Subproject commit 9b3c8a3c5c7d97afd206ba29f5682c37faa527b4 diff --git a/test/src/html_output_test.cpp b/test/src/html_output_test.cpp index 2c55789b..b15cab82 100644 --- a/test/src/html_output_test.cpp +++ b/test/src/html_output_test.cpp @@ -70,14 +70,6 @@ TEST_P(HtmlOutputTests, html_meta) { 
GTEST_SKIP(); } - // TODO fix pdf implementation - if (engine == DecoderEngine::odr && - test_file.type == FileType::portable_document_format && - (test_file.short_path.starts_with("odr-private") || - test_file.short_path == "odr-public/pdf/Casio_WVA-M650-7AJF.pdf")) { - GTEST_SKIP(); - } - DecodePreference decode_preference; decode_preference.as_file_type = test_file.type; decode_preference.with_engine = engine; diff --git a/test/src/internal/pdf/pdf_document_parser.cpp b/test/src/internal/pdf/pdf_document_parser.cpp index 32798714..5c9ce769 100644 --- a/test/src/internal/pdf/pdf_document_parser.cpp +++ b/test/src/internal/pdf/pdf_document_parser.cpp @@ -10,6 +10,7 @@ #include +#include #include #include #include @@ -208,6 +209,135 @@ TEST(DocumentParser, inherited_page_attributes) { EXPECT_EQ(page6->rotate, 90); } +// Recovery: a valid file with garbage prepended (the real fixture +// `order-EK52VKL0.pdf` is an HTTP response saved as `.pdf`) has every xref +// offset and the `startxref` shifted, so the chain walk fails. A forward scan +// rebuilds the table from the actual object positions. +TEST(DocumentParser, recovers_from_prepended_garbage) { + const std::string pdf = + "HTTP/1.0 200 OK\r\nContent-Type: application/pdf\r\n\r\n" + + two_object_mini_pdf(true); + check_mini_pdf(pdf); +} + +// Recovery: the `startxref` points nowhere, so locating the table fails. +TEST(DocumentParser, recovers_from_garbage_startxref) { + std::string pdf = two_object_mini_pdf(true); + const std::size_t pos = pdf.find("startxref\n") + std::strlen("startxref\n"); + pdf.replace(pos, pdf.find('\n', pos) - pos, "999999"); + check_mini_pdf(pdf); +} + +// Recovery: no `xref`/`trailer`/`startxref` at all — the catalog is found by +// scanning the recovered objects for `/Type /Catalog`. 
+TEST(DocumentParser, recovers_root_from_catalog_scan) { + const std::string pdf = + "%PDF-1.7\n" + "1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n" + "2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n" + "3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] " + "/Resources << >> /Contents 4 0 R >>\nendobj\n" + "4 0 obj\n<< /Length 5 >>\nstream\nBT ET\nendstream\nendobj\n" + "%%EOF\n"; + check_mini_pdf(pdf); +} + +// Recovery: an id defined more than once (e.g. a botched incremental update) +// resolves to the last definition in the file. +TEST(DocumentParser, recovery_last_definition_wins) { + const std::string pdf = + "%PDF-1.7\n" + "1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n" + "2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n" + "3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] " + "/Resources << >> /Contents 4 0 R >>\nendobj\n" + "4 0 obj\n<< /Length 5 >>\nstream\nBT ET\nendstream\nendobj\n" + "3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 200 200] " + "/Resources << >> /Contents 4 0 R >>\nendobj\n" + "trailer\n<< /Root 1 0 R >>\n%%EOF\n"; + + DocumentParser parser(std::make_unique(pdf)); + const std::unique_ptr document = parser.parse_document(); + const std::vector pages = document->collect_pages(); + + ASSERT_EQ(pages.size(), 1); + EXPECT_EQ(pages[0]->media_box.as_array()[2].as_real(), 200.0); +} + +// Recovery: an object that inlines its dictionary and the `stream` token on a +// single line (`N G obj<<...>>stream`) must still have its body skipped, so +// object-shaped bytes inside the stream (here a fake `1 0 obj`) do not +// overwrite the real recovered entry. 
+TEST(DocumentParser, recovery_skips_same_line_stream_body) { + const std::string pdf = + "%PDF-1.7\n" + "1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n" + "2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n" + "3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] " + "/Resources << >> /Contents 4 0 R >>\nendobj\n" + // dictionary and `stream` keyword share the object's first line; the body + // contains a decoy `1 0 obj` that must not be recorded + "4 0 obj<< /Length 20 >>stream\n1 0 obj garbage BT " + "ET\nendstream\nendobj\n" + "trailer\n<< /Root 1 0 R >>\n%%EOF\n"; + + DocumentParser parser(std::make_unique(pdf)); + const std::unique_ptr document = parser.parse_document(); + const std::vector pages = document->collect_pages(); + + ASSERT_EQ(pages.size(), 1); + // the real catalog (object 1) survived; the decoy inside the stream did not + // clobber it + EXPECT_EQ(pages[0]->media_box.as_array()[2].as_real(), 612.0); +} + +// Recovery: the page tree lives in an (uncompressed) object stream. After the +// forward scan finds the stream, its members are indexed as compressed entries +// so the catalog and pages resolve. 
+TEST(DocumentParser, recovers_object_stream_members) { + const std::vector> members = { + {2, "<< /Type /Catalog /Pages 3 0 R >>"}, + {3, "<< /Type /Pages /Kids [4 0 R] /Count 1 >>"}, + {4, "<< /Type /Page /Parent 3 0 R /MediaBox [0 0 612 792] " + "/Resources << >> /Contents 6 0 R >>"}}; + + std::string header; + std::string payload; + for (const auto &[id, body] : members) { + header += std::to_string(id) + " " + std::to_string(payload.size()) + " "; + payload += body + " "; + } + const std::string objstm = header + payload; + + std::string pdf = "%PDF-1.7\n"; + pdf += "5 0 obj\n<< /Type /ObjStm /N " + std::to_string(members.size()) + + " /First " + std::to_string(header.size()) + " /Length " + + std::to_string(objstm.size()) + " >>\nstream\n" + objstm + + "\nendstream\nendobj\n"; + pdf += "6 0 obj\n<< /Length 5 >>\nstream\nBT ET\nendstream\nendobj\n"; + pdf += "%%EOF\n"; + + DocumentParser parser(std::make_unique(pdf)); + const std::unique_ptr document = parser.parse_document(); + const std::vector pages = document->collect_pages(); + + ASSERT_EQ(pages.size(), 1); + EXPECT_EQ(pages[0]->media_box.as_array()[2].as_real(), 612.0); + EXPECT_EQ(parser.read_decoded_stream(pages[0]->contents_reference.front()), + "BT ET"); +} + +// Real-world recovery: an HTTP response accidentally saved as `.pdf` — the body +// is a valid PDF but the leading `HTTP/1.0 200 OK …` header shifts every +// offset. Skipped when the private submodule is absent. 
+TEST(DocumentParser, recovers_http_response_fixture) { + const std::string path = "odr-private/pdf/order-EK52VKL0.pdf"; + if (!std::filesystem::exists(TestData::test_file_path(path))) { + GTEST_SKIP() << "private fixture not available"; + } + check_fixture_parses(path); +} + TEST(DocumentParser, missing_media_box_defaults_to_us_letter) { PdfFileBuilder builder; builder.object("<< /Type /Catalog /Pages 2 0 R >>") diff --git a/test/src/internal/pdf/pdf_object_parser.cpp b/test/src/internal/pdf/pdf_object_parser.cpp index a9cd15f9..8d56fc85 100644 --- a/test/src/internal/pdf/pdf_object_parser.cpp +++ b/test/src/internal/pdf/pdf_object_parser.cpp @@ -2,6 +2,8 @@ #include #include +#include +#include #include @@ -13,8 +15,119 @@ std::string read_hex_string(const std::string &input) { ObjectParser parser(in); return std::get(parser.read_string()).string; } + +Real read_number(const std::string &input) { + std::istringstream in(input); + ObjectParser parser(in); + return parser.read_number(); +} + +UnsignedInteger read_unsigned_integer(const std::string &input) { + std::istringstream in(input); + ObjectParser parser(in); + return parser.read_unsigned_integer(); +} + +bool peek_unsigned_integer(const std::string &input) { + std::istringstream in(input); + ObjectParser parser(in); + return parser.peek_unsigned_integer(); +} + +Integer read_integer(const std::string &input) { + std::istringstream in(input); + ObjectParser parser(in); + return parser.read_integer(); +} + +// Runs skip_past(marker) on `input` and reports whether the marker was found +// together with the bytes left after the cursor, so a test can pin both the +// result and the resulting position. 
+std::pair skip_past(const std::string &input, + const std::string_view marker) { + std::istringstream in(input); + ObjectParser parser(in); + const bool found = parser.skip_past(marker); + std::string rest; + while (parser.geti() != ObjectParser::eof) { + rest.push_back(parser.bumpc()); + } + return {found, rest}; +} } // namespace +// 7.3.3: a real is an optional integer part, a `.`, and an optional fractional +// part; either part may be absent (but not both). +TEST(PdfObjectParser, read_number) { + EXPECT_DOUBLE_EQ(read_number("3.14"), 3.14); + EXPECT_DOUBLE_EQ(read_number("0.5"), 0.5); + EXPECT_DOUBLE_EQ(read_number("42."), 42.0); + EXPECT_DOUBLE_EQ(read_number(".25"), 0.25); + EXPECT_DOUBLE_EQ(read_number("10"), 10.0); + // sign applies to the whole magnitude, and either part may be absent + EXPECT_DOUBLE_EQ(read_number("-1.5"), -1.5); + EXPECT_DOUBLE_EQ(read_number("-.5"), -0.5); + EXPECT_DOUBLE_EQ(read_number("+.5"), 0.5); + EXPECT_DOUBLE_EQ(read_number("-7"), -7.0); +} + +// 7.3.3: an unsigned integer is one or more digits. A missing number is an +// error rather than a silent 0. +TEST(PdfObjectParser, read_unsigned_integer) { + EXPECT_EQ(read_unsigned_integer("123"), 123u); + EXPECT_EQ(read_unsigned_integer("0"), 0u); + EXPECT_EQ(read_unsigned_integer("007"), 7u); + EXPECT_EQ(read_unsigned_integer("42 0 obj"), 42u); + EXPECT_ANY_THROW(read_unsigned_integer("abc")); + EXPECT_ANY_THROW(read_unsigned_integer("-5")); + EXPECT_ANY_THROW(read_unsigned_integer("")); +} + +TEST(PdfObjectParser, peek_unsigned_integer) { + EXPECT_TRUE(peek_unsigned_integer("5")); + EXPECT_TRUE(peek_unsigned_integer("0xyz")); + EXPECT_FALSE(peek_unsigned_integer("-5")); + EXPECT_FALSE(peek_unsigned_integer("+5")); + EXPECT_FALSE(peek_unsigned_integer(".5")); + EXPECT_FALSE(peek_unsigned_integer("x")); + EXPECT_FALSE(peek_unsigned_integer("")); +} + +// 7.3.3: a signed integer is an optional `+`/`-` followed by digits. 
+TEST(PdfObjectParser, read_integer) { + EXPECT_EQ(read_integer("123"), 123); + EXPECT_EQ(read_integer("-5"), -5); + EXPECT_EQ(read_integer("+7"), 7); + EXPECT_EQ(read_integer("0"), 0); +} + +// skip_past advances just past the first occurrence of the marker and reports +// whether it was found; on a miss it consumes the whole stream. +TEST(PdfObjectParser, skip_past) { + using Result = std::pair; + + // cursor lands immediately after the marker + EXPECT_EQ(skip_past("hello world", "world"), Result(true, "")); + EXPECT_EQ(skip_past("abcXYdef", "XY"), Result(true, "def")); + + // the first occurrence wins + EXPECT_EQ(skip_past("aXYbXYc", "XY"), Result(true, "bXYc")); + + // not found: the stream is consumed to eof + EXPECT_EQ(skip_past("abcdef", "XY"), Result(false, "")); + + // an empty marker matches immediately and consumes nothing + EXPECT_EQ(skip_past("abc", ""), Result(true, "abc")); + + // markers with internal repetition must still match across an overlapping + // partial match (KMP correctness): "aab" in "aaab", and the doubled-`e` / + // doubled prefix cases for "endstream" + EXPECT_EQ(skip_past("aaab rest", "aab"), Result(true, " rest")); + EXPECT_EQ(skip_past("eendstream!", "endstream"), Result(true, "!")); + EXPECT_EQ(skip_past("<<...>>stream\nbytes\nendstreamX", "endstream"), + Result(true, "X")); +} + // 7.3.4.3: a hex string is the bytes of its hex digits, with whitespace ignored // and an odd final digit assumed to be followed by a 0. 
TEST(PdfObjectParser, hex_string_basic) { diff --git a/test/src/internal/util/string_util_test.cpp b/test/src/internal/util/string_util_test.cpp index d6864589..9d6ca1c5 100644 --- a/test/src/internal/util/string_util_test.cpp +++ b/test/src/internal/util/string_util_test.cpp @@ -69,3 +69,55 @@ TEST(string_util, split) { EXPECT_EQ(strings[2], "y"); } } + +TEST(string_util, trim_view) { + EXPECT_EQ(trim_view(" abc "), "abc"); + EXPECT_EQ(ltrim_view(" abc "), "abc "); + EXPECT_EQ(rtrim_view(" abc "), " abc"); + + // No surrounding whitespace leaves the content untouched. + EXPECT_EQ(trim_view("abc"), "abc"); + + // Interior whitespace is preserved. + EXPECT_EQ(trim_view("\t a b c \n"), "a b c"); + + // Empty and all-whitespace inputs collapse to an empty view. + EXPECT_EQ(trim_view(""), ""); + EXPECT_EQ(trim_view(" \t\r\n "), ""); + + // The result is a subrange of the input, so the leading offset is the + // distance between the data pointers. + { + const std::string_view input = " abc "; + const std::string_view trimmed = trim_view(input); + EXPECT_EQ(trimmed, "abc"); + EXPECT_EQ(trimmed.data() - input.data(), 3); + } +} + +TEST(string_util, trim_view_custom_predicate) { + const auto is_dot = [](const char c) { return c == '.'; }; + + EXPECT_EQ(trim_view("..abc..", is_dot), "abc"); + // The default ASCII-space predicate does not treat '.' as whitespace. + EXPECT_EQ(trim_view("..abc.."), "..abc.."); + // Conversely, the dot predicate does not strip spaces. + EXPECT_EQ(trim_view(" abc ", is_dot), " abc "); +} + +TEST(string_util, trim_owning_delegates_to_view) { + EXPECT_EQ(trim(" abc "), "abc"); + EXPECT_EQ(ltrim(" abc "), "abc "); + EXPECT_EQ(rtrim(" abc "), " abc"); + + const auto is_dot = [](const char c) { return c == '.'; }; + EXPECT_EQ(trim("..abc..", is_dot), "abc"); + + std::string s = " abc "; + trim_inplace(s); + EXPECT_EQ(s, "abc"); + + std::string d = "..abc.."; + trim_inplace(d, is_dot); + EXPECT_EQ(d, "abc"); +}