diff --git a/DEPENDENCIES b/DEPENDENCIES index 6590681f7..2c078a711 100644 --- a/DEPENDENCIES +++ b/DEPENDENCIES @@ -1,4 +1,4 @@ vendorpull https://github.com/sourcemeta/vendorpull 1dcbac42809cf87cb5b045106b863e17ad84ba02 -core https://github.com/sourcemeta/core 9a8c6370c53b4b3b24f4ba0caa2d897a5094ff4e -blaze https://github.com/sourcemeta/blaze 4c311d45803eaeeb17cbd49af4fffdb5f224d180 +core https://github.com/sourcemeta/core df8f2970ccf85a3a3f01e004ac436ff916f8c52a +blaze https://github.com/sourcemeta/blaze 7b214cff6d575831c16a2ce33f55d97c02eb6338 bootstrap https://github.com/twbs/bootstrap 1a6fdfae6be09b09eaced8f0e442ca6f7680a61e diff --git a/vendor/blaze/DEPENDENCIES b/vendor/blaze/DEPENDENCIES index 9ae6e513f..2bf88c518 100644 --- a/vendor/blaze/DEPENDENCIES +++ b/vendor/blaze/DEPENDENCIES @@ -1,5 +1,5 @@ vendorpull https://github.com/sourcemeta/vendorpull 1dcbac42809cf87cb5b045106b863e17ad84ba02 -core https://github.com/sourcemeta/core 5d187a796444fef1b5e044c659800085a4be7ef4 +core https://github.com/sourcemeta/core df8f2970ccf85a3a3f01e004ac436ff916f8c52a jsonschema-test-suite https://github.com/json-schema-org/JSON-Schema-Test-Suite 60755c1097769e313fae3ec4d63bcc9d49b5d2d5 jsonschema-2020-12 https://github.com/json-schema-org/json-schema-spec 769daad75a9553562333a8937a187741cb708c72 jsonschema-2019-09 https://github.com/json-schema-org/json-schema-spec 41014ea723120ce70b314d72f863c6929d9f3cfd diff --git a/vendor/blaze/src/bundle/bundle.cc b/vendor/blaze/src/bundle/bundle.cc index ad28457d7..74ee2c58d 100644 --- a/vendor/blaze/src/bundle/bundle.cc +++ b/vendor/blaze/src/bundle/bundle.cc @@ -16,12 +16,17 @@ namespace { -auto is_official_metaschema_reference( +auto is_skippable_metaschema_reference( + const sourcemeta::blaze::BundleMode mode, const sourcemeta::core::WeakPointer &pointer, const std::string &destination) -> bool { assert(!pointer.empty()); assert(pointer.back().is_property()); - return pointer.back().to_property() == "$schema" && + if 
(pointer.back().to_property() != "$schema") { + return false; + } + + return mode == sourcemeta::blaze::BundleMode::References || sourcemeta::blaze::is_official_schema(destination); } @@ -47,7 +52,9 @@ auto dependencies_internal( const auto &reference) { // We don't want to report official schemas, as we can expect // virtually all implementations to understand them out of the box - if (is_official_metaschema_reference(pointer, reference.destination)) { + if (is_skippable_metaschema_reference( + sourcemeta::blaze::BundleMode::NonOfficialMetaschemas, pointer, + reference.destination)) { return; } @@ -236,6 +243,7 @@ auto bundle_schema(sourcemeta::core::JSON &root, sourcemeta::core::JSON &subschema, const sourcemeta::blaze::SchemaWalker &walker, const sourcemeta::blaze::SchemaResolver &resolver, + const sourcemeta::blaze::BundleMode mode, std::string_view default_dialect, std::string_view default_id, const sourcemeta::blaze::SchemaFrame::Paths &paths, @@ -265,8 +273,10 @@ auto bundle_schema(sourcemeta::core::JSON &root, frame.for_each_unresolved_reference([&](const auto &pointer, const auto &reference) { // We don't want to bundle official schemas, as we can expect - // virtually all implementations to understand them out of the box - if (is_official_metaschema_reference(pointer, reference.destination)) { + // virtually all implementations to understand them out of the box. 
+ // Depending on the bundling strategy, we may skip meta-schemas entirely + if (is_skippable_metaschema_reference(mode, pointer, + reference.destination)) { return; } @@ -378,8 +388,8 @@ auto bundle_schema(sourcemeta::core::JSON &root, } for (auto &[remote, effective_id, remote_dialect] : deferred) { - bundle_schema(root, container, remote, walker, resolver, default_dialect, - effective_id, paths, bundled, depth + 1); + bundle_schema(root, container, remote, walker, resolver, mode, + default_dialect, effective_id, paths, bundled, depth + 1); elevate_embedded_resources(remote, root, container, remote_dialect, resolver, default_dialect, bundled); embed_schema(root, container, effective_id, std::move(remote)); @@ -403,8 +413,8 @@ auto dependencies(const sourcemeta::core::JSON &schema, // TODO: Refactor this function to internally rely on the `.dependencies()` // function auto bundle(sourcemeta::core::JSON &schema, const SchemaWalker &walker, - const SchemaResolver &resolver, std::string_view default_dialect, - std::string_view default_id, + const SchemaResolver &resolver, const BundleMode mode, + std::string_view default_dialect, std::string_view default_id, const std::optional &default_container, const SchemaFrame::Paths &paths) -> void { // Pre-scan the schema to find any already-embedded schemas and mark them @@ -424,7 +434,7 @@ auto bundle(sourcemeta::core::JSON &schema, const SchemaWalker &walker, // This is undefined behavior assert(!default_container.value().empty()); bundle_schema(schema, default_container.value(), schema, walker, resolver, - default_dialect, default_id, paths, bundled); + mode, default_dialect, default_id, paths, bundled); return; } @@ -477,18 +487,18 @@ auto bundle(sourcemeta::core::JSON &schema, const SchemaWalker &walker, } bundle_schema(schema, {sourcemeta::core::JSON::String{container_keyword}}, - schema, walker, resolver, default_dialect, default_id, paths, - bundled); + schema, walker, resolver, mode, default_dialect, default_id, + 
paths, bundled); } auto bundle(const sourcemeta::core::JSON &schema, const SchemaWalker &walker, - const SchemaResolver &resolver, std::string_view default_dialect, - std::string_view default_id, + const SchemaResolver &resolver, const BundleMode mode, + std::string_view default_dialect, std::string_view default_id, const std::optional &default_container, const SchemaFrame::Paths &paths) -> sourcemeta::core::JSON { sourcemeta::core::JSON copy = schema; - bundle(copy, walker, resolver, default_dialect, default_id, default_container, - paths); + bundle(copy, walker, resolver, mode, default_dialect, default_id, + default_container, paths); return copy; } diff --git a/vendor/blaze/src/bundle/include/sourcemeta/blaze/bundle.h b/vendor/blaze/src/bundle/include/sourcemeta/blaze/bundle.h index 9a21f9293..b5160e79c 100644 --- a/vendor/blaze/src/bundle/include/sourcemeta/blaze/bundle.h +++ b/vendor/blaze/src/bundle/include/sourcemeta/blaze/bundle.h @@ -20,6 +20,7 @@ #include #include +#include // std::uint8_t #include // std::function #include // std::optional, std::nullopt #include // std::string_view @@ -36,6 +37,18 @@ using DependencyCallback = std::function; +/// @ingroup bundle +/// The strategies that the bundling process can follow +enum class BundleMode : std::uint8_t { + /// Embed every external reference, including any non-official + /// meta-schemas that the schema or its dependencies declare, along + /// with the dependencies of those meta-schemas + NonOfficialMetaschemas, + /// Embed every external reference, skipping meta-schema + /// declarations entirely + References +}; + /// @ingroup bundle /// /// This function recursively traverses and reports the external references in a @@ -117,7 +130,8 @@ auto dependencies(const sourcemeta::core::JSON &schema, /// })JSON"); /// /// sourcemeta::blaze::bundle(document, -/// sourcemeta::blaze::schema_walker, test_resolver); +/// sourcemeta::blaze::schema_walker, test_resolver, +/// 
sourcemeta::blaze::BundleMode::NonOfficialMetaschemas); /// /// const sourcemeta::core::JSON expected = /// sourcemeta::core::parse_json(R"JSON({ @@ -136,7 +150,7 @@ auto dependencies(const sourcemeta::core::JSON &schema, /// ``` SOURCEMETA_BLAZE_BUNDLE_EXPORT auto bundle(sourcemeta::core::JSON &schema, const SchemaWalker &walker, - const SchemaResolver &resolver, + const SchemaResolver &resolver, const BundleMode mode, std::string_view default_dialect = "", std::string_view default_id = "", const std::optional &default_container = @@ -179,7 +193,8 @@ auto bundle(sourcemeta::core::JSON &schema, const SchemaWalker &walker, /// /// const sourcemeta::core::JSON result = /// sourcemeta::blaze::bundle(document, -/// sourcemeta::blaze::schema_walker, test_resolver); +/// sourcemeta::blaze::schema_walker, test_resolver, +/// sourcemeta::blaze::BundleMode::NonOfficialMetaschemas); /// /// const sourcemeta::core::JSON expected = /// sourcemeta::core::parse_json(R"JSON({ @@ -199,8 +214,8 @@ auto bundle(sourcemeta::core::JSON &schema, const SchemaWalker &walker, SOURCEMETA_BLAZE_BUNDLE_EXPORT auto bundle( const sourcemeta::core::JSON &schema, const SchemaWalker &walker, - const SchemaResolver &resolver, std::string_view default_dialect = "", - std::string_view default_id = "", + const SchemaResolver &resolver, const BundleMode mode, + std::string_view default_dialect = "", std::string_view default_id = "", const std::optional &default_container = std::nullopt, const SchemaFrame::Paths &paths = {sourcemeta::core::empty_weak_pointer}) diff --git a/vendor/blaze/src/codegen/codegen.cc b/vendor/blaze/src/codegen/codegen.cc index 583566305..325055116 100644 --- a/vendor/blaze/src/codegen/codegen.cc +++ b/vendor/blaze/src/codegen/codegen.cc @@ -62,8 +62,9 @@ auto compile(const sourcemeta::core::JSON &input, // (1) Bundle the schema to resolve external references // -------------------------------------------------------------------------- - auto 
schema{sourcemeta::blaze::bundle(input, walker, resolver, - default_dialect, default_id)}; + auto schema{sourcemeta::blaze::bundle( + input, walker, resolver, sourcemeta::blaze::BundleMode::References, + default_dialect, default_id)}; // -------------------------------------------------------------------------- // (2) Canonicalize the schema for easier analysis diff --git a/vendor/blaze/src/compiler/compile.cc b/vendor/blaze/src/compiler/compile.cc index 425dc18ac..7cf3354a0 100644 --- a/vendor/blaze/src/compiler/compile.cc +++ b/vendor/blaze/src/compiler/compile.cc @@ -439,9 +439,11 @@ auto compile(const sourcemeta::core::JSON &schema, assert(is_schema(schema)); // Make sure the input schema is bundled, otherwise we won't be able to - // resolve remote references here + // resolve remote references here. Meta-schemas are not needed, as we + // can determine vocabularies through the resolver const sourcemeta::core::JSON result{sourcemeta::blaze::bundle( - schema, walker, resolver, default_dialect, default_id)}; + schema, walker, resolver, sourcemeta::blaze::BundleMode::References, + default_dialect, default_id)}; sourcemeta::blaze::SchemaFrame frame{ sourcemeta::blaze::SchemaFrame::Mode::References}; diff --git a/vendor/blaze/src/configuration/fetch.cc b/vendor/blaze/src/configuration/fetch.cc index 126c5b235..14bdd7bc1 100644 --- a/vendor/blaze/src/configuration/fetch.cc +++ b/vendor/blaze/src/configuration/fetch.cc @@ -117,8 +117,10 @@ auto fetch_and_write( try { const std::string default_dialect_value{default_dialect.value_or("")}; - sourcemeta::blaze::bundle(out_schema, sourcemeta::blaze::schema_walker, - resolver, default_dialect_value, dependency_uri); + sourcemeta::blaze::bundle( + out_schema, sourcemeta::blaze::schema_walker, resolver, + sourcemeta::blaze::BundleMode::NonOfficialMetaschemas, + default_dialect_value, dependency_uri); } catch (...) 
{ emit_event(callback, FetchEvent::Type::Error, dependency_uri, dependency_path, index, total, "Failed to bundle schema", diff --git a/vendor/blaze/src/configuration/include/sourcemeta/blaze/configuration.h b/vendor/blaze/src/configuration/include/sourcemeta/blaze/configuration.h index e8bab468b..4c772494a 100644 --- a/vendor/blaze/src/configuration/include/sourcemeta/blaze/configuration.h +++ b/vendor/blaze/src/configuration/include/sourcemeta/blaze/configuration.h @@ -57,6 +57,7 @@ struct SOURCEMETA_BLAZE_CONFIGURATION_EXPORT Configuration { bool absolute_path_explicit{false}; std::filesystem::path base_path; sourcemeta::core::JSON::String base; + sourcemeta::core::URI base_uri; std::optional default_dialect; std::unordered_set extension{".json", ".yml", ".yaml"}; diff --git a/vendor/blaze/src/configuration/parse.cc b/vendor/blaze/src/configuration/parse.cc index 59fa7b6e8..70b879b88 100644 --- a/vendor/blaze/src/configuration/parse.cc +++ b/vendor/blaze/src/configuration/parse.cc @@ -58,21 +58,22 @@ auto Configuration::from_json(const sourcemeta::core::JSON &value, !value.defines("dependencies") || value.at("dependencies").is_object(), "The dependencies property must be an object", {"dependencies"}); + const sourcemeta::core::JSON null_value{nullptr}; result.title = sourcemeta::core::from_json( - value.at_or("title", sourcemeta::core::JSON{nullptr})); + value.at_or("title", null_value)); result.description = sourcemeta::core::from_json( - value.at_or("description", sourcemeta::core::JSON{nullptr})); + value.at_or("description", null_value)); result.email = sourcemeta::core::from_json( - value.at_or("email", sourcemeta::core::JSON{nullptr})); + value.at_or("email", null_value)); result.github = sourcemeta::core::from_json( - value.at_or("github", sourcemeta::core::JSON{nullptr})); + value.at_or("github", null_value)); result.website = sourcemeta::core::from_json( - value.at_or("website", sourcemeta::core::JSON{nullptr})); + value.at_or("website", null_value)); if 
(value.defines("path")) { const std::filesystem::path path{value.at("path").to_string()}; @@ -92,14 +93,14 @@ auto Configuration::from_json(const sourcemeta::core::JSON &value, if (value.defines("baseUri")) { try { - sourcemeta::core::URI base{value.at("baseUri").to_string()}; - base.canonicalize(); - if (!base.is_absolute()) { + result.base_uri = sourcemeta::core::URI{value.at("baseUri").to_string()}; + result.base_uri.canonicalize(); + if (!result.base_uri.is_absolute()) { CONFIGURATION_ENSURE( false, "The baseUri property must be an absolute URI", {"baseUri"}); } - result.base = base.recompose(); + result.base = result.base_uri.recompose(); } catch (const sourcemeta::core::URIParseError &) { CONFIGURATION_ENSURE(false, "The baseUri property must represent a valid URI", @@ -107,13 +108,13 @@ auto Configuration::from_json(const sourcemeta::core::JSON &value, } } else { // Otherwise the base is the directory - result.base = - sourcemeta::core::URI::from_path(result.absolute_path).recompose(); + result.base_uri = sourcemeta::core::URI::from_path(result.absolute_path); + result.base = result.base_uri.recompose(); } result.default_dialect = sourcemeta::core::from_json( - value.at_or("defaultDialect", sourcemeta::core::JSON{nullptr})); + value.at_or("defaultDialect", null_value)); if (value.defines("extension")) { result.extension.clear(); diff --git a/vendor/blaze/src/editor/include/sourcemeta/blaze/editor.h b/vendor/blaze/src/editor/include/sourcemeta/blaze/editor.h index eb54aa196..3754894d4 100644 --- a/vendor/blaze/src/editor/include/sourcemeta/blaze/editor.h +++ b/vendor/blaze/src/editor/include/sourcemeta/blaze/editor.h @@ -35,6 +35,7 @@ namespace sourcemeta::blaze { /// /// ```cpp /// #include +/// #include /// #include /// #include /// @@ -46,7 +47,8 @@ namespace sourcemeta::blaze { /// /// sourcemeta::blaze::bundle(schema, /// sourcemeta::blaze::schema_walker, -/// sourcemeta::blaze::schema_resolver); +/// sourcemeta::blaze::schema_resolver, +/// 
sourcemeta::blaze::BundleMode::NonOfficialMetaschemas); /// sourcemeta::blaze::for_editor(schema, /// sourcemeta::blaze::schema_walker, /// sourcemeta::blaze::schema_resolver); diff --git a/vendor/blaze/src/frame/frame.cc b/vendor/blaze/src/frame/frame.cc index a8e4a4ee7..dd5aa18fb 100644 --- a/vendor/blaze/src/frame/frame.cc +++ b/vendor/blaze/src/frame/frame.cc @@ -321,6 +321,24 @@ auto supports_id_anchors( } } +// Generic URI normalisation only decodes unreserved characters (see RFC 3986, +// section 6.2.2.2), so a reference destination may still spell its JSON +// Pointer fragment with percent-encoded octets. Re-serialise such fragments +// from their parsed form so that reference destinations match the URIs that +// locations are framed under +auto canonicalize_pointer_fragment(sourcemeta::core::URI &uri) -> void { + const auto fragment{uri.fragment()}; + if (!fragment.has_value() || + fragment.value().find('%') == std::string_view::npos) { + return; + } + + const auto destination{sourcemeta::core::fragment_to_pointer(uri)}; + if (destination.has_value()) { + uri.fragment(sourcemeta::core::to_string(destination.value())); + } +} + auto set_base_and_fragment( sourcemeta::blaze::SchemaFrame::ReferencesEntry &entry) -> void { const std::string_view destination_view{entry.destination}; @@ -772,7 +790,8 @@ auto SchemaFrame::analyse(const sourcemeta::core::JSON &root, entry.id ? 
std::optional{*entry.id} : std::nullopt)}; if (!nearest_bases.first.empty()) { - metaschema.resolve_from(nearest_bases.first.front()); + metaschema.resolve_from( + sourcemeta::core::URI{nearest_bases.first.front()}); } metaschema.canonicalize(); @@ -1037,10 +1056,11 @@ auto SchemaFrame::analyse(const sourcemeta::core::JSON &root, } if (!nearest_bases.first.empty()) { - ref.resolve_from(nearest_bases.first.front()); + ref.resolve_from(sourcemeta::core::URI{nearest_bases.first.front()}); } ref.canonicalize(); + canonicalize_pointer_fragment(ref); auto ref_pointer{common_pointer_weak}; ref_pointer.push_back(std::cref(KEYWORD_REF)); const auto [it, inserted] = this->references_.insert_or_assign( @@ -1122,10 +1142,11 @@ auto SchemaFrame::analyse(const sourcemeta::core::JSON &root, } if (!nearest_bases.first.empty()) { - ref.resolve_from(nearest_bases.first.front()); + ref.resolve_from(sourcemeta::core::URI{nearest_bases.first.front()}); } ref.canonicalize(); + canonicalize_pointer_fragment(ref); auto ref_string{ref.recompose()}; // Note that here we cannot enforce the bookending requirement, diff --git a/vendor/core/CMakeLists.txt b/vendor/core/CMakeLists.txt index 742cb6fcd..6da6d84dd 100644 --- a/vendor/core/CMakeLists.txt +++ b/vendor/core/CMakeLists.txt @@ -165,7 +165,6 @@ endif() if(SOURCEMETA_CORE_GZIP) find_package(LibDeflate REQUIRED) - find_package(ZLIB REQUIRED) add_subdirectory(src/core/gzip) endif() diff --git a/vendor/core/DEPENDENCIES b/vendor/core/DEPENDENCIES index cd1f409ae..73fe46d9a 100644 --- a/vendor/core/DEPENDENCIES +++ b/vendor/core/DEPENDENCIES @@ -2,7 +2,6 @@ vendorpull https://github.com/sourcemeta/vendorpull 1dcbac42809cf87cb5b045106b86 jsontestsuite https://github.com/nst/JSONTestSuite d64aefb55228d9584d3e5b2433f720ea8fd00c82 yaml-test-suite https://github.com/yaml/yaml-test-suite data-2022-01-17 cmark-gfm https://github.com/github/cmark-gfm 587a12bb54d95ac37241377e6ddc93ea0e45439b -zlib https://github.com/madler/zlib v1.3.2 uritemplate-test 
https://github.com/uri-templates/uritemplate-test 1eb27ab4462b9e5819dc47db99044f5fd1fa9bc7 pyca-cryptography https://github.com/pyca/cryptography c4935a7021af37c38e0684b0546c1b4378518342 pcre2 https://github.com/PCRE2Project/pcre2 pcre2-10.47 diff --git a/vendor/core/cmake/FindZLIB.cmake b/vendor/core/cmake/FindZLIB.cmake deleted file mode 100644 index de335c216..000000000 --- a/vendor/core/cmake/FindZLIB.cmake +++ /dev/null @@ -1,126 +0,0 @@ -if(NOT ZLIB_FOUND) - set(ZLIB_DIR "${PROJECT_SOURCE_DIR}/vendor/zlib") - set(ZLIB_PUBLIC_HEADER "${ZLIB_DIR}/zlib.h") - set(ZLIB_PRIVATE_HEADERS "${ZLIB_DIR}/zconf.h") - - add_library(zlib - "${ZLIB_PUBLIC_HEADER}" ${ZLIB_PRIVATE_HEADERS} - "${ZLIB_DIR}/adler32.c" - "${ZLIB_DIR}/compress.c" - "${ZLIB_DIR}/crc32.c" - "${ZLIB_DIR}/crc32.h" - "${ZLIB_DIR}/deflate.c" - "${ZLIB_DIR}/deflate.h" - "${ZLIB_DIR}/gzclose.c" - "${ZLIB_DIR}/gzguts.h" - "${ZLIB_DIR}/gzlib.c" - "${ZLIB_DIR}/gzread.c" - "${ZLIB_DIR}/gzwrite.c" - "${ZLIB_DIR}/infback.c" - "${ZLIB_DIR}/inffast.c" - "${ZLIB_DIR}/inffast.h" - "${ZLIB_DIR}/inffixed.h" - "${ZLIB_DIR}/inflate.c" - "${ZLIB_DIR}/inflate.h" - "${ZLIB_DIR}/inftrees.c" - "${ZLIB_DIR}/inftrees.h" - "${ZLIB_DIR}/trees.c" - "${ZLIB_DIR}/trees.h" - "${ZLIB_DIR}/uncompr.c" - "${ZLIB_DIR}/zutil.c" - "${ZLIB_DIR}/zutil.h") - - target_compile_definitions(zlib PUBLIC NO_FSEEKO) - target_compile_definitions(zlib PUBLIC _LARGEFILE64_SOURCE=1) - - if(SOURCEMETA_COMPILER_MSVC) - target_compile_options(zlib PRIVATE /W3 /MP /wd4996) - target_compile_definitions(zlib PRIVATE _CRT_SECURE_NO_WARNINGS) - else() - target_compile_options(zlib PRIVATE - -Wall - -Wextra - -Wpedantic - -Werror - -Wdouble-promotion - -Wfloat-equal - -Wmissing-declarations - -Wshadow - -Wwrite-strings - -Wno-cast-align - -Wno-cast-qual - -Wno-format-nonliteral - -Wno-sign-conversion - -Wno-shorten-64-to-32 - -Wno-implicit-int-conversion - -Wno-comma - -Wno-implicit-fallthrough) - - if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") - 
target_compile_options(zlib PRIVATE - -funroll-loops - -fstrict-aliasing - -ftree-vectorize - -fno-math-errno - -fwrapv) - endif() - - # Disable LTO for zlib to work around GCC LTO linker plugin not - # properly rescanning this archive for transitive dependencies - if(SOURCEMETA_COMPILER_GCC) - target_compile_options(zlib PRIVATE -fno-lto) - endif() - endif() - - target_include_directories(zlib PUBLIC - "$" - "$") - - add_library(ZLIB::ZLIB ALIAS zlib) - - set_target_properties(zlib - PROPERTIES - OUTPUT_NAME zlib - PUBLIC_HEADER "${ZLIB_PUBLIC_HEADER}" - PRIVATE_HEADER "${ZLIB_PRIVATE_HEADERS}" - C_STANDARD 11 - C_STANDARD_REQUIRED ON - C_EXTENSIONS OFF - POSITION_INDEPENDENT_CODE ON - C_VISIBILITY_PRESET "default" - C_VISIBILITY_INLINES_HIDDEN FALSE - VISIBILITY_INLINES_HIDDEN OFF - WINDOWS_EXPORT_ALL_SYMBOLS TRUE - EXPORT_NAME ZLIB) - - if(SOURCEMETA_CORE_INSTALL) - include(GNUInstallDirs) - install(TARGETS zlib - EXPORT zlib - PUBLIC_HEADER DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" - COMPONENT sourcemeta_core_dev - PRIVATE_HEADER DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" - COMPONENT sourcemeta_core_dev - RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" - COMPONENT sourcemeta_core - LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" - COMPONENT sourcemeta_core - NAMELINK_COMPONENT sourcemeta_core_dev - ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" - COMPONENT sourcemeta_core_dev) - install(EXPORT zlib - DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/zlib" - NAMESPACE ZLIB:: - COMPONENT sourcemeta_core_dev) - - file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/zlib-config.cmake - "include(\"\${CMAKE_CURRENT_LIST_DIR}/zlib.cmake\")\n" - "check_required_components(\"zlib\")\n") - install(FILES - "${CMAKE_CURRENT_BINARY_DIR}/zlib-config.cmake" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/zlib" - COMPONENT sourcemeta_core_dev) - endif() - - set(ZLIB_FOUND ON) -endif() diff --git a/vendor/core/cmake/common/compiler/simd.cmake b/vendor/core/cmake/common/compiler/simd.cmake index 
112c364ec..f213a4f33 100644 --- a/vendor/core/cmake/common/compiler/simd.cmake +++ b/vendor/core/cmake/common/compiler/simd.cmake @@ -31,10 +31,28 @@ macro(sourcemeta_enable_simd) endif() endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64" AND NOT MSVC) - check_cxx_compiler_flag("-march=armv8-a+fp+simd" COMPILER_SUPPORTS_NEON) - if(COMPILER_SUPPORTS_NEON) - message(STATUS "Enabling SIMD NEON") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+fp+simd") + # +crc is part of the optional Armv8.0-A CRC32 extension. It is + # guaranteed on Apple Silicon (Armv8.5+) and several other modern cores, + # but Cortex-A53-class CPUs do not have it. The compiler accepting the + # flag does not prove the runtime CPU supports the instruction, so the + # only platform we auto-enable +crc on is Apple Silicon. Other aarch64 + # targets keep the plain NEON baseline and the CRC32 software fallback. + set(SIMD_NEON_FLAG_APPLIED FALSE) + if(APPLE) + check_cxx_compiler_flag("-march=armv8-a+fp+simd+crc" + COMPILER_SUPPORTS_NEON_CRC) + if(COMPILER_SUPPORTS_NEON_CRC) + message(STATUS "Enabling SIMD NEON + CRC32") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+fp+simd+crc") + set(SIMD_NEON_FLAG_APPLIED TRUE) + endif() + endif() + if(NOT SIMD_NEON_FLAG_APPLIED) + check_cxx_compiler_flag("-march=armv8-a+fp+simd" COMPILER_SUPPORTS_NEON) + if(COMPILER_SUPPORTS_NEON) + message(STATUS "Enabling SIMD NEON") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+fp+simd") + endif() endif() endif() endmacro() diff --git a/vendor/core/config.cmake.in b/vendor/core/config.cmake.in index e02574a6a..e6a48152e 100644 --- a/vendor/core/config.cmake.in +++ b/vendor/core/config.cmake.in @@ -106,12 +106,12 @@ foreach(component ${SOURCEMETA_CORE_COMPONENTS}) include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_json.cmake") elseif(component STREQUAL "jsonl") find_dependency(LibDeflate CONFIG) - find_dependency(ZLIB CONFIG) 
include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_preprocessor.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_numeric.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_io.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_unicode.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_json.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_crypto.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_gzip.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_jsonl.cmake") elseif(component STREQUAL "jsonpointer") @@ -165,7 +165,7 @@ foreach(component ${SOURCEMETA_CORE_COMPONENTS}) include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_semver.cmake") elseif(component STREQUAL "gzip") find_dependency(LibDeflate CONFIG) - find_dependency(ZLIB CONFIG) + include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_crypto.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_gzip.cmake") elseif(component STREQUAL "html") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_preprocessor.cmake") diff --git a/vendor/core/src/core/crypto/CMakeLists.txt b/vendor/core/src/core/crypto/CMakeLists.txt index bc5a62e95..865ae7788 100644 --- a/vendor/core/src/core/crypto/CMakeLists.txt +++ b/vendor/core/src/core/crypto/CMakeLists.txt @@ -1,6 +1,7 @@ sourcemeta_library(NAMESPACE sourcemeta PROJECT core NAME crypto - PRIVATE_HEADERS sha256.h uuid.h crc32.h - SOURCES crypto_sha256.cc crypto_uuid.cc crypto_crc32.cc) + PRIVATE_HEADERS sha256.h sha1.h fnv128.h uuid.h crc32.h + SOURCES crypto_sha256.cc crypto_sha1.cc crypto_fnv128.cc + crypto_uuid.cc crypto_crc32.cc) if(SOURCEMETA_CORE_CRYPTO_USE_SYSTEM_OPENSSL) target_compile_definitions(sourcemeta_core_crypto diff --git a/vendor/core/src/core/crypto/crypto_crc32.cc b/vendor/core/src/core/crypto/crypto_crc32.cc index 1220366b7..d28615603 100644 --- a/vendor/core/src/core/crypto/crypto_crc32.cc +++ b/vendor/core/src/core/crypto/crypto_crc32.cc @@ -2,8 +2,20 @@ #include // std::array #include // 
std::size_t -#include // std::uint8_t, std::uint32_t +#include // std::uint8_t, std::uint32_t, std::uint64_t, std::uintptr_t +#include // std::memcpy +// Only enable the hardware CRC32 path when the target ISA explicitly promises +// the optional ARMv8 CRC32 extension. Running these instructions on a CPU +// without the extension would trap with SIGILL, so we gate on the standard +// feature macro that the compiler sets when the right -march or -mcpu is in +// effect +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) +#include // __crc32b, __crc32d +#define SOURCEMETA_CORE_CRYPTO_CRC32_ARM 1 +#endif + +#ifndef SOURCEMETA_CORE_CRYPTO_CRC32_ARM namespace { constexpr auto compute_crc32_table_entry(const std::uint32_t value) noexcept @@ -17,17 +29,27 @@ constexpr auto compute_crc32_table_entry(const std::uint32_t value) noexcept return entry; } -constexpr auto build_crc32_table() noexcept -> std::array { - std::array table{}; +constexpr auto build_crc32_tables() noexcept + -> std::array, 8> { + std::array, 8> tables{}; for (std::uint32_t index = 0; index < 256u; ++index) { - table[index] = compute_crc32_table_entry(index); + tables[0][index] = compute_crc32_table_entry(index); + } + // Slice-by-8 extension: T[k][b] = (T[k-1][b] >> 8) ^ T[0][T[k-1][b] & 0xff] + for (std::size_t slice = 1; slice < 8; ++slice) { + for (std::uint32_t index = 0; index < 256u; ++index) { + const auto previous{tables[slice - 1][index]}; + tables[slice][index] = (previous >> 8u) ^ tables[0][previous & 0xffu]; + } } - return table; + return tables; } -constexpr std::array CRC32_TABLE{build_crc32_table()}; +constexpr std::array, 8> CRC32_TABLES{ + build_crc32_tables()}; } // namespace +#endif namespace sourcemeta::core { @@ -38,11 +60,54 @@ auto crc32(const std::string_view input) -> std::uint32_t { auto crc32_update(const std::uint32_t previous, const std::string_view input) -> std::uint32_t { auto checksum{previous ^ 0xFFFFFFFFu}; - for (const auto character : input) { - const auto 
byte{static_cast(character)}; - checksum = CRC32_TABLE[(checksum ^ byte) & 0xffu] ^ (checksum >> 8u); + const auto *data{reinterpret_cast(input.data())}; + auto remaining{input.size()}; + +#ifdef SOURCEMETA_CORE_CRYPTO_CRC32_ARM + // ARMv8 hardware CRC32 instruction (~8 bytes per cycle) + while (remaining >= 8) { + std::uint64_t chunk{0}; + std::memcpy(&chunk, data, sizeof(chunk)); + checksum = __crc32d(checksum, chunk); + data += 8; + remaining -= 8; } + while (remaining > 0) { + checksum = __crc32b(checksum, *data++); + --remaining; + } + return checksum ^ 0xFFFFFFFFu; +#else + // Slice-by-8 software fallback: consume 8 bytes per iteration + while (remaining >= 8) { + const std::uint32_t one{(static_cast(data[0])) | + (static_cast(data[1]) << 8u) | + (static_cast(data[2]) << 16u) | + (static_cast(data[3]) << 24u)}; + const std::uint32_t two{(static_cast(data[4])) | + (static_cast(data[5]) << 8u) | + (static_cast(data[6]) << 16u) | + (static_cast(data[7]) << 24u)}; + const auto mixed{checksum ^ one}; + checksum = + CRC32_TABLES[0][(two >> 24u) & 0xffu] ^ + CRC32_TABLES[1][(two >> 16u) & 0xffu] ^ + CRC32_TABLES[2][(two >> 8u) & 0xffu] ^ CRC32_TABLES[3][two & 0xffu] ^ + CRC32_TABLES[4][(mixed >> 24u) & 0xffu] ^ + CRC32_TABLES[5][(mixed >> 16u) & 0xffu] ^ + CRC32_TABLES[6][(mixed >> 8u) & 0xffu] ^ CRC32_TABLES[7][mixed & 0xffu]; + data += 8; + remaining -= 8; + } + + // Tail: byte-by-byte for the final 0..7 bytes + while (remaining > 0) { + checksum = CRC32_TABLES[0][(checksum ^ *data++) & 0xffu] ^ (checksum >> 8u); + --remaining; + } + return checksum ^ 0xFFFFFFFFu; +#endif } } // namespace sourcemeta::core diff --git a/vendor/core/src/core/crypto/crypto_fnv128.cc b/vendor/core/src/core/crypto/crypto_fnv128.cc new file mode 100644 index 000000000..e5d643ca2 --- /dev/null +++ b/vendor/core/src/core/crypto/crypto_fnv128.cc @@ -0,0 +1,86 @@ +#include + +#include // std::array +#include // std::uint8_t, std::uint32_t, std::uint64_t + +namespace { + +constexpr std::array 
HEX_DIGITS{{'0', '1', '2', '3', '4', '5', '6', + '7', '8', '9', 'a', 'b', 'c', 'd', + 'e', 'f', '\0'}}; + +// The 128-bit FNV offset basis, in two 64-bit limbs +// (draft-eastlake-fnv Section 5) +constexpr std::uint64_t OFFSET_BASIS_HIGH{0x6c62272e07bb0142ULL}; +constexpr std::uint64_t OFFSET_BASIS_LOW{0x62b821756295c58dULL}; + +// The 128-bit FNV prime is 2^88 + 2^8 + 0x3b (draft-eastlake-fnv Section 4), +// so multiplying by it reduces to (value << 88) + (value << 8) + value * 0x3b +// modulo 2^128, computed here over two 64-bit limbs +inline constexpr auto multiply_by_prime(std::uint64_t &high, + std::uint64_t &low) noexcept -> void { + // value << 88, whose low limb is always zero + const auto shift_88_high = low << 24u; + + // value << 8 + const auto shift_8_high = (high << 8u) | (low >> 56u); + const auto shift_8_low = low << 8u; + + // value * 0x3b, multiplying the low limb in 32-bit halves to portably + // capture the carry into the high limb + const auto low_half_product = (low & 0xffffffffULL) * 0x3bULL; + const auto high_half_product = (low >> 32u) * 0x3bULL; + const auto small_low = low_half_product + (high_half_product << 32u); + const auto small_carry = + (high_half_product + (low_half_product >> 32u)) >> 32u; + const auto small_high = (high * 0x3bULL) + small_carry; + + const auto result_low = shift_8_low + small_low; + const auto result_carry = result_low < shift_8_low ? 
1ULL : 0ULL; + high = shift_88_high + shift_8_high + small_high + result_carry; + low = result_low; +} + +} // namespace + +namespace sourcemeta::core { + +auto fnv128_digest(const std::string_view input) + -> std::array { + auto high = OFFSET_BASIS_HIGH; + auto low = OFFSET_BASIS_LOW; + + // FNV-1 multiplies first and XORs after (draft-eastlake-fnv Section 2) + for (const auto character : input) { + multiply_by_prime(high, low); + low ^= static_cast(character); + } + + std::array result{}; + for (std::uint64_t index = 0u; index < 8u; ++index) { + const auto shift = 8u * (7u - index); + result[index] = static_cast((high >> shift) & 0xffu); + result[8u + index] = static_cast((low >> shift) & 0xffu); + } + + return result; +} + +auto fnv128(const std::string_view input) -> std::string { + const auto digest = fnv128_digest(input); + std::string result; + result.reserve(32); + for (std::uint64_t index = 0u; index < 16u; ++index) { + result.push_back(HEX_DIGITS[(digest[index] >> 4u) & 0x0fu]); + result.push_back(HEX_DIGITS[digest[index] & 0x0fu]); + } + + return result; +} + +auto fnv128(const std::string_view input, std::ostream &output) -> void { + const auto result = fnv128(input); + output.write(result.data(), static_cast(result.size())); +} + +} // namespace sourcemeta::core diff --git a/vendor/core/src/core/crypto/crypto_sha1.cc b/vendor/core/src/core/crypto/crypto_sha1.cc new file mode 100644 index 000000000..33dccfadf --- /dev/null +++ b/vendor/core/src/core/crypto/crypto_sha1.cc @@ -0,0 +1,223 @@ +#include + +#include // std::array +#include // std::uint32_t, std::uint64_t + +#ifdef SOURCEMETA_CORE_CRYPTO_USE_SYSTEM_OPENSSL +#include // EVP_MD_CTX_new, EVP_DigestInit_ex, EVP_sha1, EVP_DigestUpdate, EVP_DigestFinal_ex, EVP_MD_CTX_free +#include // std::runtime_error +#else +#include // std::memcpy +#endif + +namespace { +constexpr std::array HEX_DIGITS{{'0', '1', '2', '3', '4', '5', '6', + '7', '8', '9', 'a', 'b', 'c', 'd', + 'e', 'f', '\0'}}; +} // namespace + 
+#ifdef SOURCEMETA_CORE_CRYPTO_USE_SYSTEM_OPENSSL + +namespace sourcemeta::core { + +auto sha1(const std::string_view input) -> std::string { + auto *context = EVP_MD_CTX_new(); + if (context == nullptr) { + throw std::runtime_error("Could not allocate OpenSSL digest context"); + } + + if (EVP_DigestInit_ex(context, EVP_sha1(), nullptr) != 1 || + EVP_DigestUpdate(context, input.data(), input.size()) != 1) { + EVP_MD_CTX_free(context); + throw std::runtime_error("Could not compute SHA-1 digest"); + } + + std::array digest{}; + unsigned int length = 0; + if (EVP_DigestFinal_ex(context, digest.data(), &length) != 1) { + EVP_MD_CTX_free(context); + throw std::runtime_error("Could not finalize SHA-1 digest"); + } + + EVP_MD_CTX_free(context); + + std::string result; + result.reserve(40); + for (std::uint64_t index = 0; index < 20u; ++index) { + result.push_back(HEX_DIGITS[(digest[index] >> 4u) & 0x0fu]); + result.push_back(HEX_DIGITS[digest[index] & 0x0fu]); + } + + return result; +} + +auto sha1(const std::string_view input, std::ostream &output) -> void { + const auto result = sha1(input); + output.write(result.data(), static_cast(result.size())); +} + +} // namespace sourcemeta::core + +#else + +namespace { + +inline constexpr auto rotate_left(std::uint32_t value, + std::uint64_t count) noexcept + -> std::uint32_t { + return (value << count) | (value >> (32u - count)); +} + +// Equivalent to (x & y) ^ (~x & z) but avoids a bitwise NOT +// (RFC 3174 Section 5, rounds 0 to 19) +inline constexpr auto choice(std::uint32_t x, std::uint32_t y, + std::uint32_t z) noexcept -> std::uint32_t { + return z ^ (x & (y ^ z)); +} + +// RFC 3174 Section 5, rounds 20 to 39 and 60 to 79 +inline constexpr auto parity(std::uint32_t x, std::uint32_t y, + std::uint32_t z) noexcept -> std::uint32_t { + return x ^ y ^ z; +} + +// RFC 3174 Section 5, rounds 40 to 59 +inline constexpr auto majority(std::uint32_t x, std::uint32_t y, + std::uint32_t z) noexcept -> std::uint32_t { + return (x & 
y) ^ (x & z) ^ (y & z); +} + +inline auto sha1_process_block(const unsigned char *block, + std::array &state) noexcept + -> void { + // Decode 16 big-endian 32-bit words from the block + std::array schedule; + for (std::uint64_t word_index = 0; word_index < 16u; ++word_index) { + const std::uint64_t byte_index = word_index * 4u; + schedule[word_index] = + (static_cast(block[byte_index]) << 24u) | + (static_cast(block[byte_index + 1u]) << 16u) | + (static_cast(block[byte_index + 2u]) << 8u) | + static_cast(block[byte_index + 3u]); + } + + // Extend the message schedule (RFC 3174 Section 6.1 step b) + for (std::uint64_t index = 16u; index < 80u; ++index) { + schedule[index] = + rotate_left(schedule[index - 3u] ^ schedule[index - 8u] ^ + schedule[index - 14u] ^ schedule[index - 16u], + 1u); + } + + auto working = state; + + // Compression function (RFC 3174 Section 6.1 step d), with the round + // constants of RFC 3174 Section 5 + for (std::uint64_t round_index = 0u; round_index < 80u; ++round_index) { + std::uint32_t function_value; + std::uint32_t round_constant; + if (round_index < 20u) { + function_value = choice(working[1], working[2], working[3]); + round_constant = 0x5a827999U; + } else if (round_index < 40u) { + function_value = parity(working[1], working[2], working[3]); + round_constant = 0x6ed9eba1U; + } else if (round_index < 60u) { + function_value = majority(working[1], working[2], working[3]); + round_constant = 0x8f1bbcdcU; + } else { + function_value = parity(working[1], working[2], working[3]); + round_constant = 0xca62c1d6U; + } + + const auto temporary = rotate_left(working[0], 5u) + function_value + + working[4] + schedule[round_index] + round_constant; + + working[4] = working[3]; + working[3] = working[2]; + working[2] = rotate_left(working[1], 30u); + working[1] = working[0]; + working[0] = temporary; + } + + for (std::uint64_t index = 0u; index < 5u; ++index) { + state[index] += working[index]; + } +} + +} // namespace + +namespace 
sourcemeta::core { + +auto sha1(const std::string_view input) -> std::string { + // Initial hash values (RFC 3174 Section 6.1) + std::array state{}; + state[0] = 0x67452301U; + state[1] = 0xefcdab89U; + state[2] = 0x98badcfeU; + state[3] = 0x10325476U; + state[4] = 0xc3d2e1f0U; + + const auto *const input_bytes = + reinterpret_cast(input.data()); + const std::size_t input_length = input.size(); + + // Process all full 64-byte blocks directly from the input (streaming) + std::size_t processed_bytes = 0u; + while (input_length - processed_bytes >= 64u) { + sha1_process_block(input_bytes + processed_bytes, state); + processed_bytes += 64u; + } + + // Prepare the final block(s) (one or two 64-byte blocks) + std::array final_block{}; + const std::size_t remaining_bytes = input_length - processed_bytes; + if (remaining_bytes > 0u) { + std::memcpy(final_block.data(), input_bytes + processed_bytes, + remaining_bytes); + } + + // Append the 0x80 byte after the message data (RFC 3174 Section 4) + final_block[remaining_bytes] = 0x80u; + + // Append length in bits as big-endian 64-bit at the end of the padding + const std::uint64_t message_length_bits = + static_cast(input_length) * 8ull; + + if (remaining_bytes < 56u) { + for (std::uint64_t index = 0u; index < 8u; ++index) { + final_block[56u + index] = static_cast( + (message_length_bits >> (8u * (7u - index))) & 0xffu); + } + sha1_process_block(final_block.data(), state); + } else { + for (std::uint64_t index = 0u; index < 8u; ++index) { + final_block[64u + 56u + index] = static_cast( + (message_length_bits >> (8u * (7u - index))) & 0xffu); + } + + sha1_process_block(final_block.data(), state); + sha1_process_block(final_block.data() + 64u, state); + } + + std::string result; + result.reserve(40); + for (std::uint64_t state_index = 0u; state_index < 5u; ++state_index) { + const auto value = state[state_index]; + for (std::uint64_t nibble = 0u; nibble < 8u; ++nibble) { + const auto shift = 28u - nibble * 4u; + 
result.push_back(HEX_DIGITS[(value >> shift) & 0x0fu]); + } + } + + return result; +} + +auto sha1(const std::string_view input, std::ostream &output) -> void { + const auto result = sha1(input); + output.write(result.data(), static_cast(result.size())); +} + +} // namespace sourcemeta::core + +#endif diff --git a/vendor/core/src/core/crypto/crypto_uuid.cc b/vendor/core/src/core/crypto/crypto_uuid.cc index e69128a19..f572438d2 100644 --- a/vendor/core/src/core/crypto/crypto_uuid.cc +++ b/vendor/core/src/core/crypto/crypto_uuid.cc @@ -40,8 +40,8 @@ auto uuidv4() -> std::string { throw std::runtime_error("Could not generate random bytes with OpenSSL"); } #else - static std::random_device device; - static std::mt19937 generator{device()}; + thread_local std::random_device device; + thread_local std::mt19937 generator{device()}; std::uniform_int_distribution distribution(0, 15); std::uniform_int_distribution diff --git a/vendor/core/src/core/crypto/include/sourcemeta/core/crypto.h b/vendor/core/src/core/crypto/include/sourcemeta/core/crypto.h index 520e0954b..550e949e2 100644 --- a/vendor/core/src/core/crypto/include/sourcemeta/core/crypto.h +++ b/vendor/core/src/core/crypto/include/sourcemeta/core/crypto.h @@ -11,6 +11,8 @@ /// ``` #include +#include +#include #include #include diff --git a/vendor/core/src/core/crypto/include/sourcemeta/core/crypto_fnv128.h b/vendor/core/src/core/crypto/include/sourcemeta/core/crypto_fnv128.h new file mode 100644 index 000000000..2fa0671de --- /dev/null +++ b/vendor/core/src/core/crypto/include/sourcemeta/core/crypto_fnv128.h @@ -0,0 +1,60 @@ +#ifndef SOURCEMETA_CORE_CRYPTO_FNV128_H_ +#define SOURCEMETA_CORE_CRYPTO_FNV128_H_ + +#ifndef SOURCEMETA_CORE_CRYPTO_EXPORT +#include +#endif + +#include // std::array +#include // std::uint8_t +#include // std::ostream +#include // std::string +#include // std::string_view + +namespace sourcemeta::core { + +/// @ingroup crypto +/// Hash a string using the non-cryptographic 128-bit FNV-1 
function, +/// returning the raw digest in big-endian byte order. For example: +/// +/// ```cpp +/// #include +/// +/// const auto digest{sourcemeta::core::fnv128_digest("foo bar")}; +/// assert(digest.size() == 16); +/// ``` +auto SOURCEMETA_CORE_CRYPTO_EXPORT fnv128_digest(const std::string_view input) + -> std::array; + +/// @ingroup crypto +/// Hash a string using the non-cryptographic 128-bit FNV-1 function. For +/// example: +/// +/// ```cpp +/// #include +/// #include +/// #include +/// +/// std::ostringstream result; +/// sourcemeta::core::fnv128("foo bar", result); +/// std::cout << result.str() << "\n"; +/// ``` +auto SOURCEMETA_CORE_CRYPTO_EXPORT fnv128(const std::string_view input, + std::ostream &output) -> void; + +/// @ingroup crypto +/// Hash a string using the non-cryptographic 128-bit FNV-1 function, +/// returning the hex digest as a string. For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// std::cout << sourcemeta::core::fnv128("foo bar") << "\n"; +/// ``` +auto SOURCEMETA_CORE_CRYPTO_EXPORT fnv128(const std::string_view input) + -> std::string; + +} // namespace sourcemeta::core + +#endif diff --git a/vendor/core/src/core/crypto/include/sourcemeta/core/crypto_sha1.h b/vendor/core/src/core/crypto/include/sourcemeta/core/crypto_sha1.h new file mode 100644 index 000000000..83a91b431 --- /dev/null +++ b/vendor/core/src/core/crypto/include/sourcemeta/core/crypto_sha1.h @@ -0,0 +1,44 @@ +#ifndef SOURCEMETA_CORE_CRYPTO_SHA1_H_ +#define SOURCEMETA_CORE_CRYPTO_SHA1_H_ + +#ifndef SOURCEMETA_CORE_CRYPTO_EXPORT +#include +#endif + +#include // std::ostream +#include // std::string +#include // std::string_view + +namespace sourcemeta::core { + +/// @ingroup crypto +/// Hash a string using SHA-1 (RFC 3174). 
For example: +/// +/// ```cpp +/// #include +/// #include +/// #include +/// +/// std::ostringstream result; +/// sourcemeta::core::sha1("foo bar", result); +/// std::cout << result.str() << "\n"; +/// ``` +auto SOURCEMETA_CORE_CRYPTO_EXPORT sha1(const std::string_view input, + std::ostream &output) -> void; + +/// @ingroup crypto +/// Hash a string using SHA-1 (RFC 3174), returning the hex digest as a +/// string. For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// std::cout << sourcemeta::core::sha1("foo bar") << "\n"; +/// ``` +auto SOURCEMETA_CORE_CRYPTO_EXPORT sha1(const std::string_view input) + -> std::string; + +} // namespace sourcemeta::core + +#endif diff --git a/vendor/core/src/core/dns/hostname.cc b/vendor/core/src/core/dns/hostname.cc index e048fba73..4a980f7e2 100644 --- a/vendor/core/src/core/dns/hostname.cc +++ b/vendor/core/src/core/dns/hostname.cc @@ -20,7 +20,9 @@ auto is_hostname(const std::string_view value) -> bool { return false; } - // RFC 1123 §2.1: SHOULD handle host names of up to 255 characters + // RFC 1123 §2.1: SHOULD handle host names of up to 255 characters. This is + // intentionally looser than the stricter 253-octet cap applied to the + // internationalized form if (value.size() > 255) { return false; } diff --git a/vendor/core/src/core/dns/idn_hostname.cc b/vendor/core/src/core/dns/idn_hostname.cc index 16a32faae..b1d102d6d 100644 --- a/vendor/core/src/core/dns/idn_hostname.cc +++ b/vendor/core/src/core/dns/idn_hostname.cc @@ -68,7 +68,7 @@ auto is_idn_hostname(const std::string_view value) -> bool { try { const auto body{utf32_to_punycode(decoded)}; a_label_octets = 4 + body.size(); - } catch (...) 
{ + } catch (const PunycodeError &) { return false; } } else if (*kind == IDNALabelKind::Ascii) { diff --git a/vendor/core/src/core/dns/include/sourcemeta/core/dns.h b/vendor/core/src/core/dns/include/sourcemeta/core/dns.h index 7e84f5674..810f59b27 100644 --- a/vendor/core/src/core/dns/include/sourcemeta/core/dns.h +++ b/vendor/core/src/core/dns/include/sourcemeta/core/dns.h @@ -34,9 +34,10 @@ namespace sourcemeta::core { /// assert(!sourcemeta::core::is_hostname("example.")); /// ``` /// -/// This function implements RFC 1123 §2.1 (ASCII only). It does not -/// perform A-label or Punycode decoding. For internationalized host -/// names see `is_idn_hostname`. +/// This function operates on ASCII input only and caps the total length at +/// 255 octets. Labels matching the case-insensitive "xn--" prefix are +/// additionally validated as RFC 5890 A-labels, so the Punycode body must +/// decode and round-trip. SOURCEMETA_CORE_DNS_EXPORT auto is_hostname(const std::string_view value) -> bool; @@ -45,7 +46,8 @@ auto is_hostname(const std::string_view value) -> bool; /// RFC 5891 Section 4. Each label is validated as an RFC 5890 A-label or /// U-label (with RFC 5892 ContextJ and ContextO contextual rules and the /// RFC 5891 §4.1.2.A NFC requirement), and the RFC 5893 Bidi rule is -/// enforced on every label of a Bidi domain name. For example: +/// enforced on every label of a Bidi domain name. The total length is capped +/// at 253 octets in A-label form. 
For example: /// /// ```cpp /// #include diff --git a/vendor/core/src/core/email/helpers.h b/vendor/core/src/core/email/helpers.h index 1b900eebf..dd93f5bf4 100644 --- a/vendor/core/src/core/email/helpers.h +++ b/vendor/core/src/core/email/helpers.h @@ -3,6 +3,7 @@ #include +#include // std::uint8_t, std::uint16_t #include // std::string_view namespace { @@ -75,6 +76,54 @@ inline constexpr auto is_ldh_str(const std::string_view value) -> bool { return true; } +// RFC 5321 §4.1.3: Snum = 1*3DIGIT ; representing a decimal integer +// value in the range 0 through 255. Leading zeros are permitted, unlike +// the RFC 3986 dec-octet that backs is_ipv4 +inline constexpr auto is_snum(const std::string_view value) -> bool { + if (value.empty() || value.size() > 3) { + return false; + } + std::uint16_t result{0}; + for (const auto character : value) { + if (character < '0' || character > '9') { + return false; + } + result = static_cast( + result * 10 + static_cast(character - '0')); + } + return result <= 255; +} + +// RFC 5321 §4.1.3: IPv4-address-literal = Snum 3("." Snum) +// Returns true only when the input is exactly four dot-separated Snum +// components +inline constexpr auto is_ipv4_address_literal(const std::string_view value) + -> bool { + std::string_view::size_type start{0}; + std::uint8_t octets{0}; + while (true) { + const auto dot{value.find('.', start)}; + const auto octet{dot == std::string_view::npos + ?
value.substr(start) + : value.substr(start, dot - start)}; + if (!is_snum(octet)) { + return false; + } + // A fifth component can never be valid. Rejecting it right away also + // keeps the 8-bit counter from wrapping around, which would otherwise + // let an input with 260 components be counted as 4 and accepted + if (octets == 4) { + return false; + } + octets = static_cast(octets + 1); + if (dot == std::string_view::npos) { + break; + } + start = dot + 1; + } + return octets == 4; +} + // RFC 5234 §2.3: ABNF literal strings are case-insensitive by default // RFC 5321 §4.1.3: IPv6-address-literal prefix is the literal "IPv6:" inline constexpr auto matches_ipv6_tag(const std::string_view value) -> bool { @@ -126,7 +175,7 @@ inline auto is_address_literal(const std::string_view domain) -> bool { // RFC 5321 §4.1.3: IPv4-address-literal has no ":"; // General-address-literal requires ":" if (inner.find(':') == std::string_view::npos) { - return sourcemeta::core::is_ipv4(inner); + return is_ipv4_address_literal(inner); } return is_general_address_literal(inner); } diff --git a/vendor/core/src/core/gzip/CMakeLists.txt b/vendor/core/src/core/gzip/CMakeLists.txt index e4663aa34..4365ee11a 100644 --- a/vendor/core/src/core/gzip/CMakeLists.txt +++ b/vendor/core/src/core/gzip/CMakeLists.txt @@ -2,10 +2,10 @@ sourcemeta_library(NAMESPACE sourcemeta PROJECT core NAME gzip PRIVATE_HEADERS error.h streambuf.h SOURCES gzip.cc streambuf.cc) +target_link_libraries(sourcemeta_core_gzip PRIVATE sourcemeta::core::crypto) + # Way faster for full buffer decompression target_link_libraries(sourcemeta_core_gzip PRIVATE LibDeflate::LibDeflate) -# Supports streaming -target_link_libraries(sourcemeta_core_gzip PRIVATE ZLIB::ZLIB) if(SOURCEMETA_CORE_INSTALL) sourcemeta_library_install(NAMESPACE sourcemeta PROJECT core NAME gzip) diff --git a/vendor/core/src/core/gzip/bit_reader.h b/vendor/core/src/core/gzip/bit_reader.h new file mode 100644 index 000000000..bbffdae05 --- /dev/null +++ b/vendor/core/src/core/gzip/bit_reader.h @@ -0,0 +1,155 @@ +#ifndef SOURCEMETA_CORE_GZIP_BIT_READER_H_ +#define SOURCEMETA_CORE_GZIP_BIT_READER_H_ + +#include + +#include // std::array +#include // assert +#include // std::size_t +#include // std::uint8_t,
std::uint32_t, std::uint64_t +#include // std::istream + +namespace sourcemeta::core { + +class BitReader { +public: + BitReader(std::istream &source) : source_{&source} {} + + auto read_bits(const unsigned int count) -> std::uint32_t { + const auto value{this->peek_bits(count)}; + this->consume_bits(count); + return value; + } + + auto peek_bits(const unsigned int count) -> std::uint32_t { + // Callers in this module always pass count in [0, 32]; larger shifts + // would be undefined behaviour against the 64-bit accumulator. The + // assert documents the contract without paying a release-build cost + assert(count <= 32); + if (this->bits_available_ < count) { + this->refill_for(count); + } + const auto mask{(static_cast(1) << count) - 1}; + return static_cast(this->accumulator_ & mask); + } + + auto consume_bits(const unsigned int count) -> void { + // Consuming more bits than are buffered would underflow the unsigned + // counter. Every call site is preceded by a peek or refill that + // guarantees enough bits, so the assert documents the contract + assert(count <= this->bits_available_); + this->accumulator_ >>= count; + this->bits_available_ -= count; + } + + auto align_to_byte() -> void { + const auto trailing_bits{this->bits_available_ % 8}; + this->accumulator_ >>= trailing_bits; + this->bits_available_ -= trailing_bits; + } + + auto read_byte() -> std::uint8_t { + // Reading a byte while 1 to 7 bits are buffered would return a byte from + // ahead of them. 
Every call site is byte-aligned, so the assert documents + // the invariant without paying a release-build cost + assert(this->bits_available_ % 8 == 0); + if (this->bits_available_ >= 8) { + const auto value{static_cast(this->accumulator_ & 0xff)}; + this->accumulator_ >>= 8; + this->bits_available_ -= 8; + return value; + } + return this->pull_source_byte(); + } + + auto try_read_byte(std::uint8_t &byte) -> bool { + assert(this->bits_available_ % 8 == 0); + if (this->bits_available_ >= 8) { + byte = static_cast(this->accumulator_ & 0xff); + this->accumulator_ >>= 8; + this->bits_available_ -= 8; + return true; + } + if (this->buffer_position_ >= this->buffer_size_ && + !this->try_refill_buffer()) { + return false; + } + byte = this->buffer_[this->buffer_position_++]; + return true; + } + + auto read_bytes(std::uint8_t *destination, const std::size_t count) -> void { + for (std::size_t index = 0; index < count; ++index) { + destination[index] = this->read_byte(); + } + } + +private: + static constexpr std::size_t SOURCE_BUFFER_SIZE{4096}; + + auto pull_source_byte() -> std::uint8_t { + if (this->buffer_position_ >= this->buffer_size_) { + this->refill_buffer(); + } + return this->buffer_[this->buffer_position_++]; + } + + auto refill_for(const unsigned int count) -> void { + // Fast path: if 4 bytes available in the input buffer and the + // accumulator has room for 32 more bits, load 4 bytes at once. + // RFC 1951 packs bits LSB-first within each byte, so the first byte + // contributes the low 8 bits of the loaded word regardless of host + // endianness. 
Construct the 32-bit value explicitly to keep this + // portable on big-endian hosts + if (this->bits_available_ <= 32 && + this->buffer_position_ + 4 <= this->buffer_size_) { + const std::uint32_t four_bytes{ + static_cast(this->buffer_[this->buffer_position_]) | + (static_cast(this->buffer_[this->buffer_position_ + 1]) + << 8) | + (static_cast(this->buffer_[this->buffer_position_ + 2]) + << 16) | + (static_cast(this->buffer_[this->buffer_position_ + 3]) + << 24)}; + this->accumulator_ |= static_cast(four_bytes) + << this->bits_available_; + this->bits_available_ += 32; + this->buffer_position_ += 4; + } + while (this->bits_available_ < count) { + const auto byte{this->pull_source_byte()}; + this->accumulator_ |= static_cast(byte) + << this->bits_available_; + this->bits_available_ += 8; + } + } + + auto refill_buffer() -> void { + if (!this->try_refill_buffer()) { + throw GZIPError{"Unexpected end of source stream"}; + } + } + + auto try_refill_buffer() -> bool { + this->source_->read(reinterpret_cast(this->buffer_.data()), + SOURCE_BUFFER_SIZE); + const auto bytes_read{static_cast(this->source_->gcount())}; + if (bytes_read == 0) { + return false; + } + this->buffer_size_ = bytes_read; + this->buffer_position_ = 0; + return true; + } + + std::istream *source_; + std::uint64_t accumulator_{0}; + unsigned int bits_available_{0}; + std::array buffer_{}; + std::size_t buffer_position_{0}; + std::size_t buffer_size_{0}; +}; + +} // namespace sourcemeta::core + +#endif diff --git a/vendor/core/src/core/gzip/deflate.h b/vendor/core/src/core/gzip/deflate.h new file mode 100644 index 000000000..592341a45 --- /dev/null +++ b/vendor/core/src/core/gzip/deflate.h @@ -0,0 +1,409 @@ +#ifndef SOURCEMETA_CORE_GZIP_DEFLATE_H_ +#define SOURCEMETA_CORE_GZIP_DEFLATE_H_ + +#include "bit_reader.h" +#include "huffman.h" + +#include + +#include // std::min +#include // std::array +#include // std::size_t +#include // std::uint8_t, std::uint16_t +#include // std::memcpy + +namespace 
sourcemeta::core { + +inline constexpr std::size_t DEFLATE_WINDOW_SIZE{32768}; +inline constexpr std::size_t DEFLATE_WINDOW_MASK{DEFLATE_WINDOW_SIZE - 1}; +static_assert((DEFLATE_WINDOW_SIZE & DEFLATE_WINDOW_MASK) == 0, + "DEFLATE_WINDOW_SIZE must be a power of two"); + +// RFC 1951 section 3.2.5 length codes 257-285 +inline constexpr std::array DEFLATE_LENGTH_BASE{ + {3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, + 31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258}}; + +inline constexpr std::array DEFLATE_LENGTH_EXTRA{ + {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, + 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0}}; + +// RFC 1951 section 3.2.5 distance codes 0-29 +inline constexpr std::array DEFLATE_DISTANCE_BASE{ + {1, 2, 3, 4, 5, 7, 9, 13, 17, 25, + 33, 49, 65, 97, 129, 193, 257, 385, 513, 769, + 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577}}; + +inline constexpr std::array DEFLATE_DISTANCE_EXTRA{ + {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13}}; + +// RFC 1951 section 3.2.7 code-length-of-codes order +inline constexpr std::array DEFLATE_CODE_LENGTH_ORDER{ + {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}}; + +class DeflateDecoder { +public: + DeflateDecoder(BitReader &reader) : reader_{&reader} {} + + auto decompress(std::uint8_t *output, const std::size_t output_size) + -> std::size_t { + std::size_t produced{0}; + + while (!this->stream_ended_) { + switch (this->state_) { + case State::BlockHeader: + if (this->final_block_) { + this->state_ = State::End; + } else { + this->start_block(); + } + break; + case State::StoredBlock: + this->process_stored_block(output, output_size, produced); + if (this->state_ == State::StoredBlock) { + return produced; + } + break; + case State::HuffmanBlock: + this->process_huffman_block(output, output_size, produced); + if (this->state_ == State::HuffmanBlock) { + return produced; + } + break; + case State::End: + 
this->reader_->align_to_byte(); + this->stream_ended_ = true; + return produced; + } + } + return produced; + } + + [[nodiscard]] auto stream_ended() const -> bool { + return this->stream_ended_; + } + + auto reset() -> void { + this->state_ = State::BlockHeader; + this->final_block_ = false; + this->stream_ended_ = false; + this->stored_remaining_ = 0; + this->pending_copy_length_ = 0; + this->pending_copy_distance_ = 0; + this->window_position_ = 0; + this->bytes_written_ = 0; + } + +private: + enum class State : std::uint8_t { + BlockHeader, + StoredBlock, + HuffmanBlock, + End, + }; + + auto start_block() -> void { + this->final_block_ = this->reader_->read_bits(1) != 0; + const auto btype{this->reader_->read_bits(2)}; + switch (btype) { + case 0: + this->start_stored_block(); + break; + case 1: + this->build_fixed_trees(); + this->state_ = State::HuffmanBlock; + break; + case 2: + this->read_dynamic_header(); + this->state_ = State::HuffmanBlock; + break; + default: + throw GZIPError{"Reserved deflate block type"}; + } + } + + auto start_stored_block() -> void { + this->reader_->align_to_byte(); + this->stored_remaining_ = + static_cast(this->reader_->read_byte()) | + static_cast( + static_cast(this->reader_->read_byte()) << 8); + const auto nlen_lo{this->reader_->read_byte()}; + const auto nlen_hi{this->reader_->read_byte()}; + const std::uint16_t nlen{static_cast( + static_cast(nlen_lo) | + static_cast(static_cast(nlen_hi) << 8))}; + const std::uint16_t expected_nlen{ + static_cast(~this->stored_remaining_)}; + if (expected_nlen != nlen) { + throw GZIPError{"Stored block LEN/NLEN mismatch"}; + } + this->state_ = State::StoredBlock; + } + + auto build_fixed_trees() -> void { + std::array literal_lengths{}; + for (std::size_t index = 0; index < 144; ++index) { + literal_lengths[index] = 8; + } + for (std::size_t index = 144; index < 256; ++index) { + literal_lengths[index] = 9; + } + for (std::size_t index = 256; index < 280; ++index) { + 
literal_lengths[index] = 7; + } + for (std::size_t index = 280; index < 288; ++index) { + literal_lengths[index] = 8; + } + this->literal_length_tree_.build(literal_lengths.data(), + literal_lengths.size()); + + std::array distance_lengths{}; + for (auto &length : distance_lengths) { + length = 5; + } + this->distance_tree_.build(distance_lengths.data(), distance_lengths.size(), + true); + } + + auto read_dynamic_header() -> void { + const auto hlit{this->reader_->read_bits(5) + 257}; + const auto hdist{this->reader_->read_bits(5) + 1}; + const auto hclen{this->reader_->read_bits(4) + 4}; + + // RFC 1951 section 3.2.7 caps the literal/length alphabet at 286 symbols + if (hlit > 286) { + throw GZIPError{"Too many literal/length codes"}; + } + + std::array code_length_lengths{}; + for (std::size_t index = 0; index < hclen; ++index) { + code_length_lengths[DEFLATE_CODE_LENGTH_ORDER[index]] = + static_cast(this->reader_->read_bits(3)); + } + + HuffmanDecoder code_length_tree; + code_length_tree.build(code_length_lengths.data(), + code_length_lengths.size()); + + std::array all_lengths{}; + std::size_t index{0}; + while (index < hlit + hdist) { + const auto symbol{code_length_tree.decode(*this->reader_)}; + if (symbol < 16) { + all_lengths[index++] = static_cast(symbol); + } else if (symbol == 16) { + if (index == 0) { + throw GZIPError{"Repeat-previous code length with no previous"}; + } + const auto previous{all_lengths[index - 1]}; + const auto repeats{this->reader_->read_bits(2) + 3}; + for (std::size_t step = 0; step < repeats; ++step) { + if (index >= all_lengths.size()) { + throw GZIPError{"Code length count overflow"}; + } + all_lengths[index++] = previous; + } + } else if (symbol == 17) { + const auto repeats{this->reader_->read_bits(3) + 3}; + for (std::size_t step = 0; step < repeats; ++step) { + if (index >= all_lengths.size()) { + throw GZIPError{"Code length count overflow"}; + } + all_lengths[index++] = 0; + } + } else if (symbol == 18) { + const auto 
repeats{this->reader_->read_bits(7) + 11}; + for (std::size_t step = 0; step < repeats; ++step) { + if (index >= all_lengths.size()) { + throw GZIPError{"Code length count overflow"}; + } + all_lengths[index++] = 0; + } + } else { + throw GZIPError{"Invalid code length symbol"}; + } + } + + if (index != hlit + hdist) { + throw GZIPError{"Code length count overflow"}; + } + + this->literal_length_tree_.build(all_lengths.data(), hlit); + this->distance_tree_.build(all_lengths.data() + hlit, hdist); + } + + auto process_stored_block(std::uint8_t *output, const std::size_t output_size, + std::size_t &produced) -> void { + while (this->stored_remaining_ > 0 && produced < output_size) { + const auto byte{this->reader_->read_byte()}; + this->emit(byte, output, output_size, produced); + --this->stored_remaining_; + } + if (this->stored_remaining_ == 0) { + this->state_ = State::BlockHeader; + } + } + + auto process_huffman_block(std::uint8_t *output, + const std::size_t output_size, + std::size_t &produced) -> void { + if (this->pending_copy_length_ > 0) { + this->copy_backref(output, output_size, produced); + if (this->pending_copy_length_ > 0) { + return; + } + } + + while (produced < output_size) { + const auto symbol{this->literal_length_tree_.decode(*this->reader_)}; + if (symbol < 256) { + this->window_[this->window_position_] = + static_cast(symbol); + this->window_position_ = + (this->window_position_ + 1) & DEFLATE_WINDOW_MASK; + if (this->bytes_written_ < DEFLATE_WINDOW_SIZE) { + ++this->bytes_written_; + } + output[produced++] = static_cast(symbol); + } else if (symbol == 256) { + this->state_ = State::BlockHeader; + return; + } else if (symbol <= 285) { + const auto length_index{static_cast(symbol - 257)}; + const std::size_t length{ + static_cast(DEFLATE_LENGTH_BASE[length_index]) + + static_cast( + this->reader_->read_bits(DEFLATE_LENGTH_EXTRA[length_index]))}; + const auto distance_symbol{this->distance_tree_.decode(*this->reader_)}; + if (distance_symbol >= 
30) { + throw GZIPError{"Invalid distance code"}; + } + const std::size_t distance{ + static_cast(DEFLATE_DISTANCE_BASE[distance_symbol]) + + static_cast(this->reader_->read_bits( + DEFLATE_DISTANCE_EXTRA[distance_symbol]))}; + if (distance > this->bytes_written_) { + throw GZIPError{"Backref distance exceeds bytes available"}; + } + this->pending_copy_length_ = length; + this->pending_copy_distance_ = distance; + this->copy_backref(output, output_size, produced); + if (this->pending_copy_length_ > 0) { + return; + } + } else { + throw GZIPError{"Invalid literal/length code"}; + } + } + } + + auto copy_backref(std::uint8_t *output, const std::size_t output_size, + std::size_t &produced) -> void { + const auto remaining{output_size - produced}; + const auto to_copy{std::min(this->pending_copy_length_, remaining)}; + + if (this->pending_copy_distance_ >= this->pending_copy_length_) { + // Source range does not overlap with the bytes about to be written + this->copy_backref_non_overlapping(output, produced, to_copy); + } else { + // RLE-style overlap: must propagate byte by byte + this->copy_backref_overlapping(output, output_size, produced); + } + } + + auto copy_backref_non_overlapping(std::uint8_t *output, std::size_t &produced, + const std::size_t to_copy) -> void { + const std::size_t source_position{(this->window_position_ + + DEFLATE_WINDOW_SIZE - + this->pending_copy_distance_) & + DEFLATE_WINDOW_MASK}; + + // Copy from circular window into linear output (one or two contiguous + // chunks depending on whether the source range wraps the window) + const std::size_t source_first{ + std::min(to_copy, DEFLATE_WINDOW_SIZE - source_position)}; + std::memcpy(output + produced, this->window_.data() + source_position, + source_first); + if (source_first < to_copy) { + std::memcpy(output + produced + source_first, this->window_.data(), + to_copy - source_first); + } + + // Mirror the freshly written bytes back into the circular window + const std::size_t dest_first{ + 
std::min(to_copy, DEFLATE_WINDOW_SIZE - this->window_position_)}; + std::memcpy(this->window_.data() + this->window_position_, + output + produced, dest_first); + if (dest_first < to_copy) { + std::memcpy(this->window_.data(), output + produced + dest_first, + to_copy - dest_first); + } + + produced += to_copy; + this->window_position_ = + (this->window_position_ + to_copy) & DEFLATE_WINDOW_MASK; + this->bytes_written_ = + std::min(this->bytes_written_ + to_copy, DEFLATE_WINDOW_SIZE); + this->pending_copy_length_ -= to_copy; + } + + auto copy_backref_overlapping(std::uint8_t *output, + const std::size_t output_size, + std::size_t &produced) -> void { + std::size_t source_position{(this->window_position_ + DEFLATE_WINDOW_SIZE - + this->pending_copy_distance_) & + DEFLATE_WINDOW_MASK}; + while (this->pending_copy_length_ > 0 && produced < output_size) { + const auto byte{this->window_[source_position]}; + this->window_[this->window_position_] = byte; + this->window_position_ = + (this->window_position_ + 1) & DEFLATE_WINDOW_MASK; + source_position = (source_position + 1) & DEFLATE_WINDOW_MASK; + if (this->bytes_written_ < DEFLATE_WINDOW_SIZE) { + ++this->bytes_written_; + } + output[produced++] = byte; + --this->pending_copy_length_; + } + } + + auto emit(const std::uint8_t byte, std::uint8_t *output, + const std::size_t output_size, std::size_t &produced) -> bool { + this->window_[this->window_position_] = byte; + this->window_position_ = (this->window_position_ + 1) & DEFLATE_WINDOW_MASK; + if (this->bytes_written_ < DEFLATE_WINDOW_SIZE) { + ++this->bytes_written_; + } + if (produced < output_size) { + output[produced++] = byte; + return true; + } + return false; + } + + BitReader *reader_; + State state_{State::BlockHeader}; + bool final_block_{false}; + bool stream_ended_{false}; + + std::uint16_t stored_remaining_{0}; + + HuffmanDecoder literal_length_tree_{}; + HuffmanDecoder distance_tree_{}; + + std::size_t pending_copy_length_{0}; + std::size_t 
pending_copy_distance_{0}; + + std::array window_{}; + std::size_t window_position_{0}; + // Bytes written into the sliding window since the last reset, capped at + // DEFLATE_WINDOW_SIZE. Used to reject back-references whose distance + // exceeds the data we have actually produced for the current member + std::size_t bytes_written_{0}; +}; + +} // namespace sourcemeta::core + +#endif diff --git a/vendor/core/src/core/gzip/gzip.cc b/vendor/core/src/core/gzip/gzip.cc index 83af9a56d..9ee3375a3 100644 --- a/vendor/core/src/core/gzip/gzip.cc +++ b/vendor/core/src/core/gzip/gzip.cc @@ -8,19 +8,27 @@ extern "C" { namespace sourcemeta::core { -auto gzip(const std::uint8_t *input, const std::size_t size) -> std::string { +auto gzip(const std::uint8_t *input, const std::size_t size, const int level) + -> std::string { std::unique_ptr - compressor{libdeflate_alloc_compressor(1), libdeflate_free_compressor}; + compressor{libdeflate_alloc_compressor(level), + libdeflate_free_compressor}; if (!compressor) { throw GZIPError{"Could not allocate compressor"}; } const auto max_size{libdeflate_gzip_compress_bound(compressor.get(), size)}; std::string output; - output.resize(max_size); - - const auto actual_size{libdeflate_gzip_compress( - compressor.get(), input, size, output.data(), output.size())}; + std::size_t actual_size{0}; + // libdeflate overwrites the whole bound, so leaving the buffer uninitialised + // avoids zero-filling multi-megabyte allocations that are immediately + // discarded + output.resize_and_overwrite( + max_size, [&](char *const buffer, const std::size_t capacity) { + actual_size = libdeflate_gzip_compress(compressor.get(), input, size, + buffer, capacity); + return capacity; + }); if (actual_size == 0) { throw GZIPError{"Could not compress input"}; @@ -44,11 +52,17 @@ auto gunzip(const std::uint8_t *input, const std::size_t size, auto capacity{output_hint > 0 ? 
output_hint : size * 4}; for (;;) { - output.resize(capacity); std::size_t actual_size{0}; - const auto result{libdeflate_gzip_decompress(decompressor.get(), input, - size, output.data(), - output.size(), &actual_size)}; + auto result{LIBDEFLATE_BAD_DATA}; + // libdeflate writes only the decompressed bytes, so leaving the buffer + // uninitialised avoids zero-filling multi-megabyte allocations on every + // retry of the doubling loop + output.resize_and_overwrite(capacity, [&](char *const buffer, + const std::size_t buffer_size) { + result = libdeflate_gzip_decompress(decompressor.get(), input, size, + buffer, buffer_size, &actual_size); + return buffer_size; + }); if (result == LIBDEFLATE_SUCCESS) { output.resize(actual_size); diff --git a/vendor/core/src/core/gzip/huffman.h b/vendor/core/src/core/gzip/huffman.h new file mode 100644 index 000000000..01c72b330 --- /dev/null +++ b/vendor/core/src/core/gzip/huffman.h @@ -0,0 +1,166 @@ +#ifndef SOURCEMETA_CORE_GZIP_HUFFMAN_H_ +#define SOURCEMETA_CORE_GZIP_HUFFMAN_H_ + +#include "bit_reader.h" + +#include + +#include // std::ranges::fill +#include // std::array +#include // std::size_t +#include // std::uint8_t, std::uint16_t + +namespace sourcemeta::core { + +// Maximum Huffman code length per RFC 1951 section 3.2.7 +inline constexpr unsigned int MAX_HUFFMAN_BITS{15}; + +// Largest alphabet is the literal/length alphabet (288 symbols) per RFC 1951 +// section 3.2.5 +inline constexpr std::size_t MAX_HUFFMAN_SYMBOLS{288}; + +class HuffmanDecoder { +public: + // Primary lookup table covering all codes of length <= LUT_BITS. 
Misses + // fall back to the canonical-puff traversal for codes of length 10..15 + static constexpr unsigned int LUT_BITS{9}; + static constexpr std::size_t LUT_SIZE{1u << LUT_BITS}; + + HuffmanDecoder() = default; + + // The fixed distance tree of RFC 1951 section 3.2.6 is intentionally + // incomplete (30 codes of length five over a 32-slot space), so the + // completeness check is suppressed for it and enforced everywhere else + auto build(const std::uint8_t *lengths, const std::size_t length_count, + const bool allow_incomplete = false) -> void { + std::ranges::fill(this->count_, std::uint16_t{0}); + std::ranges::fill(this->lut_, std::uint16_t{0}); + + for (std::size_t symbol = 0; symbol < length_count; ++symbol) { + if (lengths[symbol] > MAX_HUFFMAN_BITS) { + throw GZIPError{"Huffman code length out of range"}; + } + this->count_[lengths[symbol]]++; + } + + if (this->count_[0] == length_count) { + return; + } + + // Verify the alphabet is complete or single-symbol per RFC 1951 + int left{1}; + for (unsigned int bits = 1; bits <= MAX_HUFFMAN_BITS; ++bits) { + left <<= 1; + left -= this->count_[bits]; + if (left < 0) { + throw GZIPError{"Over-subscribed Huffman code"}; + } + } + + // Reject incomplete codes, matching zlib and puff. RFC 1951 sanctions + // incompleteness only for the single-code case (a tree built from one + // used code of length one), where every length is either zero or one + if (left > 0 && !allow_incomplete && + length_count != static_cast(this->count_[0]) + + static_cast(this->count_[1])) { + throw GZIPError{"Incomplete Huffman code"}; + } + + std::array offsets{}; + offsets[1] = 0; + for (unsigned int bits = 1; bits < MAX_HUFFMAN_BITS; ++bits) { + offsets[bits + 1] = + static_cast(offsets[bits] + this->count_[bits]); + } + + for (std::size_t symbol = 0; symbol < length_count; ++symbol) { + if (lengths[symbol] != 0) { + this->symbols_[offsets[lengths[symbol]]++] = + static_cast(symbol); + } + } + + // Build the fast-path LUT. 
Each entry packs (symbol << 4) | length; + // a value of 0 means "no short code, fall back". Codes longer than + // LUT_BITS leave their LUT entries at 0 + std::array next_code{}; + next_code[1] = 0; + for (unsigned int bits = 2; bits <= MAX_HUFFMAN_BITS; ++bits) { + next_code[bits] = (next_code[bits - 1] + this->count_[bits - 1]) << 1u; + } + + std::size_t symbol_index{0}; + for (unsigned int code_length = 1; code_length <= MAX_HUFFMAN_BITS; + ++code_length) { + for (unsigned int k = 0; k < this->count_[code_length]; ++k) { + const auto symbol{this->symbols_[symbol_index]}; + if (code_length <= LUT_BITS) { + const auto code{next_code[code_length]}; + const auto lsb_key{reverse_bits(code, code_length)}; + const auto entry{static_cast( + static_cast(code_length) | + static_cast(symbol << 4u))}; + const std::uint32_t stride{1u << code_length}; + for (std::uint32_t slot = lsb_key; slot < LUT_SIZE; slot += stride) { + this->lut_[slot] = entry; + } + } + ++next_code[code_length]; + ++symbol_index; + } + } + } + + auto decode(BitReader &reader) const -> std::uint16_t { + const auto key{reader.peek_bits(LUT_BITS)}; + const auto entry{this->lut_[key]}; + if (entry != 0) { + const unsigned int length{entry & 0xfu}; + reader.consume_bits(length); + return static_cast(entry >> 4u); + } + return this->decode_long(reader); + } + +private: + auto decode_long(BitReader &reader) const -> std::uint16_t { + std::uint32_t bits{reader.peek_bits(MAX_HUFFMAN_BITS)}; + int code{0}; + int first{0}; + int index{0}; + for (unsigned int length = 1; length <= MAX_HUFFMAN_BITS; ++length) { + code |= static_cast(bits & 1u); + bits >>= 1u; + const auto entries{static_cast(this->count_[length])}; + if (code - entries < first) { + const auto position{static_cast(index) + + static_cast(code - first)}; + reader.consume_bits(length); + return this->symbols_[position]; + } + index += entries; + first = (first + entries) << 1; + code <<= 1; + } + throw GZIPError{"Invalid Huffman code"}; + } + + static 
auto reverse_bits(std::uint32_t value, const unsigned int length) + -> std::uint32_t { + std::uint32_t result{0}; + for (unsigned int index = 0; index < length; ++index) { + if ((value & (1u << (length - 1 - index))) != 0u) { + result |= (1u << index); + } + } + return result; + } + + std::array count_{}; + std::array symbols_{}; + std::array lut_{}; +}; + +} // namespace sourcemeta::core + +#endif diff --git a/vendor/core/src/core/gzip/include/sourcemeta/core/gzip.h b/vendor/core/src/core/gzip/include/sourcemeta/core/gzip.h index 2b7fb5918..cd22de5c3 100644 --- a/vendor/core/src/core/gzip/include/sourcemeta/core/gzip.h +++ b/vendor/core/src/core/gzip/include/sourcemeta/core/gzip.h @@ -26,7 +26,8 @@ namespace sourcemeta::core { /// @ingroup gzip -/// Compress a byte buffer using the GZIP format (RFC 1952). For example: +/// Compress a byte buffer using the GZIP format (RFC 1952). An optional +/// compression level from 0 to 12 trades speed for ratio. For example: /// /// ```cpp /// #include @@ -36,7 +37,8 @@ namespace sourcemeta::core { /// reinterpret_cast(input.data()), input.size())}; /// ``` auto SOURCEMETA_CORE_GZIP_EXPORT gzip(const std::uint8_t *input, - std::size_t size) -> std::string; + std::size_t size, int level = 1) + -> std::string; /// @ingroup gzip /// Decompress a GZIP compressed byte buffer (RFC 1952). 
An optional output diff --git a/vendor/core/src/core/gzip/streambuf.cc b/vendor/core/src/core/gzip/streambuf.cc index 75cb2c11d..4ba4801bb 100644 --- a/vendor/core/src/core/gzip/streambuf.cc +++ b/vendor/core/src/core/gzip/streambuf.cc @@ -1,12 +1,15 @@ #include -extern "C" { -#include -} +#include "bit_reader.h" +#include "deflate.h" + +#include -#include // std::array -#include // std::size_t -#include // std::istream +#include // std::array +#include // std::size_t, std::ptrdiff_t +#include // std::uint8_t, std::uint16_t, std::uint32_t +#include // std::istream +#include // std::string_view namespace sourcemeta::core { @@ -15,82 +18,224 @@ static constexpr std::size_t GZIP_BUFFER_SIZE{16384}; struct GZIPStreamBuffer::Internal { // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) std::istream &stream; - z_stream zlib_stream; - bool stream_ended; - std::array compressed_buffer; - std::array decompressed_buffer; + BitReader reader; + DeflateDecoder deflate; + bool stream_ended{false}; + bool any_member_completed{false}; + bool member_started{false}; + std::uint32_t member_crc32{0}; + std::uint32_t member_isize{0}; + std::array decompressed_buffer{}; + + Internal(std::istream &source) + : stream{source}, reader{source}, deflate{reader} {} }; -GZIPStreamBuffer::GZIPStreamBuffer(std::istream &compressed_stream) - : internal{new Internal{.stream = compressed_stream, - .zlib_stream = {}, - .stream_ended = false, - .compressed_buffer = {}, - .decompressed_buffer = {}}} { - this->internal->zlib_stream.zalloc = Z_NULL; - this->internal->zlib_stream.zfree = Z_NULL; - this->internal->zlib_stream.opaque = Z_NULL; - this->internal->zlib_stream.avail_in = 0; - this->internal->zlib_stream.next_in = Z_NULL; - - // MAX_WBITS + 16 selects gzip decoding per the zlib API - const auto result{inflateInit2(&this->internal->zlib_stream, MAX_WBITS + 16)}; - if (result != Z_OK) { - throw GZIPError{"Could not initialize gzip decompressor"}; +namespace { + +// Accumulates 
the running CRC-32 over the header bytes that the FHCRC check +// covers, computing it only when the FHCRC flag is present +class HeaderChecksum { +public: + HeaderChecksum(const bool track) : track_{track} {} + + auto feed(const std::uint8_t byte) -> void { + if (this->track_) { + const auto data{static_cast(byte)}; + this->checksum_ = + crc32_update(this->checksum_, std::string_view{&data, 1}); + } + } + + [[nodiscard]] auto low16() const -> std::uint16_t { + return static_cast(this->checksum_ & 0xffffu); } -} -GZIPStreamBuffer::~GZIPStreamBuffer() { - inflateEnd(&this->internal->zlib_stream); +private: + bool track_; + std::uint32_t checksum_{0}; +}; + +auto read_header_byte(BitReader &reader, HeaderChecksum &checksum) + -> std::uint8_t { + const auto byte{reader.read_byte()}; + checksum.feed(byte); + return byte; } -auto GZIPStreamBuffer::underflow() -> int_type { - if (this->gptr() && this->gptr() < this->egptr()) { - return traits_type::to_int_type(*this->gptr()); +auto parse_member_header(BitReader &reader, const std::uint8_t first_byte) + -> void { + // RFC 1952 section 2.3.1.2: FHCRC covers every header byte up to but not + // including the CRC16 itself, so feeding each byte as it is read produces + // exactly the right value. 
The bytes are not retained, removing an + // unbounded-memory path through FNAME and FCOMMENT + + // Caller already consumed the ID1 byte and verified it is 0x1f + const auto id2{reader.read_byte()}; + if (id2 != 0x8b) { + throw GZIPError{"Invalid gzip magic bytes"}; + } + const auto compression_method{reader.read_byte()}; + if (compression_method != 8) { + throw GZIPError{"Unsupported gzip compression method"}; + } + const auto flag_byte{reader.read_byte()}; + if ((flag_byte & 0xe0) != 0) { + throw GZIPError{"Reserved gzip FLG bits must be zero"}; } - if (this->internal->stream_ended) { - return traits_type::eof(); + HeaderChecksum checksum{(flag_byte & 0x02) != 0}; + checksum.feed(first_byte); + checksum.feed(id2); + checksum.feed(compression_method); + checksum.feed(flag_byte); + + // MTIME (4 bytes) + XFL (1 byte) + OS (1 byte) are informational + for (std::size_t index = 0; index < 6; ++index) { + read_header_byte(reader, checksum); } - if (this->internal->zlib_stream.avail_in == 0) { - this->internal->stream.read( - this->internal->compressed_buffer.data(), - static_cast(this->internal->compressed_buffer.size())); - const auto bytes_read{this->internal->stream.gcount()}; - if (bytes_read > 0) { - this->internal->zlib_stream.next_in = - reinterpret_cast(this->internal->compressed_buffer.data()); - this->internal->zlib_stream.avail_in = static_cast(bytes_read); + if ((flag_byte & 0x04) != 0) { + // FEXTRA + const auto xlen_lo{read_header_byte(reader, checksum)}; + const auto xlen_hi{read_header_byte(reader, checksum)}; + const auto xlen{static_cast(xlen_lo) | + (static_cast(xlen_hi) << 8)}; + for (std::size_t index = 0; index < xlen; ++index) { + read_header_byte(reader, checksum); } } - this->internal->zlib_stream.next_out = - reinterpret_cast(this->internal->decompressed_buffer.data()); - this->internal->zlib_stream.avail_out = - static_cast(this->internal->decompressed_buffer.size()); + if ((flag_byte & 0x08) != 0) { + // FNAME (null-terminated) + while 
(read_header_byte(reader, checksum) != 0) { + } + } - const auto result{inflate(&this->internal->zlib_stream, Z_NO_FLUSH)}; + if ((flag_byte & 0x10) != 0) { + // FCOMMENT (null-terminated) + while (read_header_byte(reader, checksum) != 0) { + } + } - if (result != Z_OK && result != Z_STREAM_END) { - throw GZIPError{"Could not decompress gzip stream"}; + if ((flag_byte & 0x02) != 0) { + // FHCRC: low 16 bits of CRC-32 over all preceding header bytes + const auto stored_lo{reader.read_byte()}; + const auto stored_hi{reader.read_byte()}; + const std::uint16_t stored{static_cast( + static_cast(stored_lo) | + static_cast(static_cast(stored_hi) + << 8))}; + if (stored != checksum.low16()) { + throw GZIPError{"FHCRC mismatch"}; + } } +} - const auto bytes_produced{this->internal->decompressed_buffer.size() - - this->internal->zlib_stream.avail_out}; - if (bytes_produced == 0) { - this->internal->stream_ended = true; - return traits_type::eof(); +// Used for members past the first, where gzip(1) tolerates trailing +// non-member data, so a header that fails to validate is reported as +// trailing garbage rather than propagated as an error +auto try_parse_member_header(BitReader &reader, const std::uint8_t first_byte) + -> bool { + try { + parse_member_header(reader, first_byte); + return true; + } catch (const GZIPError &) { + return false; } +} + +} // namespace + +GZIPStreamBuffer::GZIPStreamBuffer(std::istream &compressed_stream) + : internal{new Internal{compressed_stream}} {} - if (result == Z_STREAM_END) { - this->internal->stream_ended = true; +GZIPStreamBuffer::~GZIPStreamBuffer() = default; + +auto GZIPStreamBuffer::underflow() -> int_type { + if (this->gptr() && this->gptr() < this->egptr()) { + return traits_type::to_int_type(*this->gptr()); } + if (this->internal->stream_ended) { + return traits_type::eof(); + } + + while (true) { + if (!this->internal->member_started) { + std::uint8_t first_byte{0}; + if (!this->internal->reader.try_read_byte(first_byte)) { + 
if (!this->internal->any_member_completed) { + throw GZIPError{"Empty source stream"}; + } + this->internal->stream_ended = true; + return traits_type::eof(); + } + if (this->internal->any_member_completed) { + // gzip(1) silently ignores any trailing data after a complete member + // rather than treating it as the start of a new member. Bytes that do + // not form a valid member header end the stream without error, + // independent of the first byte value + if (first_byte != 0x1f || + !try_parse_member_header(this->internal->reader, first_byte)) { + this->internal->stream_ended = true; + return traits_type::eof(); + } + } else { + if (first_byte != 0x1f) { + throw GZIPError{"Invalid gzip magic bytes"}; + } + parse_member_header(this->internal->reader, first_byte); + } + + this->internal->deflate.reset(); + this->internal->member_started = true; + this->internal->member_crc32 = 0; + this->internal->member_isize = 0; + } - auto *buffer_start{this->internal->decompressed_buffer.data()}; - this->setg(buffer_start, buffer_start, - buffer_start + static_cast(bytes_produced)); - return traits_type::to_int_type(*this->gptr()); + const auto produced{this->internal->deflate.decompress( + this->internal->decompressed_buffer.data(), + this->internal->decompressed_buffer.size())}; + + if (produced > 0) { + this->internal->member_crc32 = crc32_update( + this->internal->member_crc32, + std::string_view{reinterpret_cast( + this->internal->decompressed_buffer.data()), + produced}); + this->internal->member_isize += static_cast(produced); + + auto *buffer_start{ + reinterpret_cast(this->internal->decompressed_buffer.data())}; + this->setg(buffer_start, buffer_start, + buffer_start + static_cast(produced)); + return traits_type::to_int_type(*this->gptr()); + } + + if (!this->internal->deflate.stream_ended()) { + throw GZIPError{"Deflate stream ended unexpectedly"}; + } + + std::array trailer{}; + this->internal->reader.read_bytes(trailer.data(), trailer.size()); + const auto 
stored_crc32{static_cast(trailer[0]) | + (static_cast(trailer[1]) << 8) | + (static_cast(trailer[2]) << 16) | + (static_cast(trailer[3]) << 24)}; + const auto stored_isize{static_cast(trailer[4]) | + (static_cast(trailer[5]) << 8) | + (static_cast(trailer[6]) << 16) | + (static_cast(trailer[7]) << 24)}; + if (stored_crc32 != this->internal->member_crc32) { + throw GZIPError{"Gzip member CRC32 mismatch"}; + } + if (stored_isize != this->internal->member_isize) { + throw GZIPError{"Gzip member ISIZE mismatch"}; + } + + this->internal->any_member_completed = true; + this->internal->member_started = false; + } } } // namespace sourcemeta::core diff --git a/vendor/core/src/core/html/include/sourcemeta/core/html_writer.h b/vendor/core/src/core/html/include/sourcemeta/core/html_writer.h index 384f1307d..777408112 100644 --- a/vendor/core/src/core/html/include/sourcemeta/core/html_writer.h +++ b/vendor/core/src/core/html/include/sourcemeta/core/html_writer.h @@ -37,10 +37,14 @@ class SOURCEMETA_CORE_HTML_EXPORT HTMLWriter { this->buffer_.reserve(bytes); } - /// Close the most recently opened element + /// Close the most recently opened element. Closing when no element is open + /// has no effect. 
SOURCEMETA_FORCEINLINE inline auto close() -> HTMLWriter & { this->flush_open_tag(); assert(!this->tag_stack_.empty()); + if (this->tag_stack_.empty()) [[unlikely]] { + return *this; + } this->buffer_.append("buffer_.append(this->tag_stack_.back()); this->buffer_.append(">"); diff --git a/vendor/core/src/core/http/helpers.h b/vendor/core/src/core/http/helpers.h index abfabd40a..40d78b2b5 100644 --- a/vendor/core/src/core/http/helpers.h +++ b/vendor/core/src/core/http/helpers.h @@ -144,8 +144,20 @@ inline auto http_for_each_parameter(const std::string_view parameters, ++position; } std::size_t end_position{position}; - while (end_position < parameters.size() && - parameters[end_position] != ';') { + bool in_quotes{false}; + while (end_position < parameters.size()) { + const char current{parameters[end_position]}; + if (in_quotes) { + if (current == '\\' && end_position + 1 < parameters.size()) { + ++end_position; + } else if (current == '"') { + in_quotes = false; + } + } else if (current == '"') { + in_quotes = true; + } else if (current == ';') { + break; + } ++end_position; } const auto raw{http_subview(parameters, position, end_position - position)}; @@ -167,27 +179,29 @@ inline auto http_for_each_parameter(const std::string_view parameters, } } -// RFC 9110 §5.6.5 q-value. Defaults to 1.0 on malformed input. +// RFC 9110 §12.4.2 q-value. A malformed weight is a fail-safe refusal, so it +// is treated as 0 rather than maximal preference. An absent weight is not +// routed here and keeps its 1.0 default at the call site. inline auto http_parse_qvalue(const std::string_view value) noexcept -> float { if (value.empty()) { - return 1.0f; + return 0.0f; } if (value[0] != '0' && value[0] != '1') { - return 1.0f; + return 0.0f; } const float integer_part{static_cast(value[0] - '0')}; if (value.size() == 1) { return integer_part; } if (value[1] != '.' 
|| value.size() > 5) { - return 1.0f; + return 0.0f; } std::uint16_t numerator{0}; std::uint16_t denominator{1}; for (std::size_t index{2}; index < value.size(); ++index) { const char character{value[index]}; if (character < '0' || character > '9') { - return 1.0f; + return 0.0f; } numerator = static_cast(numerator * 10 + (character - '0')); denominator = static_cast(denominator * 10); @@ -196,7 +210,7 @@ inline auto http_parse_qvalue(const std::string_view value) noexcept -> float { static_cast(denominator)}; const float result{integer_part + fraction}; if (result > 1.0f) { - return 1.0f; + return 0.0f; } return result; } diff --git a/vendor/core/src/core/idna/idna.cc b/vendor/core/src/core/idna/idna.cc index da161023d..87018b7ed 100644 --- a/vendor/core/src/core/idna/idna.cc +++ b/vendor/core/src/core/idna/idna.cc @@ -12,6 +12,54 @@ namespace sourcemeta::core { +namespace { + +// RFC 5890 §2.3.2.1: the maximum length of a label in A-label form +constexpr std::size_t MAXIMUM_LABEL_OCTETS{63}; + +// Decode and fully validate a Punycode A-label body (the substring after the +// "xn--" prefix), writing the decoded U-label out on success. Returns false +// when the body is not a canonical A-label. +auto validate_a_label_body(const std::string_view encoded, + std::u32string &decoded) -> bool { + if (encoded.empty()) { + return false; + } + + try { + decoded = punycode_to_utf32(encoded); + } catch (const PunycodeError &) { + return false; + } + + // RFC 5890 §2.3.2.1: a U-label contains at least one non-ASCII codepoint. + // A Punycode body that decodes to pure ASCII is not a real A-label. + bool has_non_ascii{false}; + for (const auto codepoint : decoded) { + if (codepoint > 0x7F) { + has_non_ascii = true; + break; + } + } + if (!has_non_ascii) { + return false; + } + + if (!idna_is_valid_u_label(decoded)) { + return false; + } + + // RFC 5891 §4.2: A-labels must be in canonical Punycode form, so + // re-encoding the decoded U-label must yield the original bytes. 
+ try { + return utf32_to_punycode(decoded) == encoded; + } catch (const PunycodeError &) { + return false; + } +} + +} // namespace + auto idna_classify_label(const std::u32string_view label, std::u32string &decoded) -> std::optional { @@ -34,23 +82,23 @@ auto idna_classify_label(const std::u32string_view label, ((label[1] | 0x20) == U'n') && label[2] == U'-' && label[3] == U'-'}; if (has_a_label_prefix) { + // RFC 5890 §2.3.2.1: a label in A-label form is at most 63 octets + if (label.size() > MAXIMUM_LABEL_OCTETS) { + return std::nullopt; + } + std::string ascii; ascii.reserve(label.size()); for (const auto codepoint : label) { ascii.push_back(static_cast(codepoint)); } // Normalise the prefix to canonical lowercase before validating, so - // the round-trip equality inside `idna_is_valid_a_label` does not - // reject input that only differs in the case of the prefix + // the round-trip equality does not reject input that only differs in + // the case of the prefix ascii[0] = 'x'; ascii[1] = 'n'; - if (!idna_is_valid_a_label(ascii)) { - return std::nullopt; - } - try { - decoded = punycode_to_utf32( - std::string_view{ascii.data() + 4, ascii.size() - 4}); - } catch (...) { + if (!validate_a_label_body( + std::string_view{ascii.data() + 4, ascii.size() - 4}, decoded)) { return std::nullopt; } return IDNALabelKind::ALabel; @@ -128,24 +176,23 @@ auto idna_passes_contexto(const std::u32string_view label, break; } - // RFC 5892 Appendix A.8 ARABIC-INDIC DIGITS (U+0660..U+0669) - if (codepoint >= 0x0660 && codepoint <= 0x0669) { - for (const auto other : label) { - if (other >= 0x06F0 && other <= 0x06F9) { - return false; - } - } - return true; - } - - // RFC 5892 Appendix A.9 EXTENDED ARABIC-INDIC DIGITS (U+06F0..U+06F9) - if (codepoint >= 0x06F0 && codepoint <= 0x06F9) { + // RFC 5892 Appendix A.8 ARABIC-INDIC DIGITS (U+0660..U+0669) and + // Appendix A.9 EXTENDED ARABIC-INDIC DIGITS (U+06F0..U+06F9): a label must + // not mix the two blocks. 
A single scan over the label settles both rules. + const bool is_arabic_indic{codepoint >= 0x0660 && codepoint <= 0x0669}; + const bool is_extended_arabic_indic{codepoint >= 0x06F0 && + codepoint <= 0x06F9}; + if (is_arabic_indic || is_extended_arabic_indic) { + bool has_arabic_indic{false}; + bool has_extended_arabic_indic{false}; for (const auto other : label) { if (other >= 0x0660 && other <= 0x0669) { - return false; + has_arabic_indic = true; + } else if (other >= 0x06F0 && other <= 0x06F9) { + has_extended_arabic_indic = true; } } - return true; + return !(has_arabic_indic && has_extended_arabic_indic); } // No RFC 5892 Appendix A.3-A.9 rule applies to this codepoint, so there @@ -252,6 +299,16 @@ auto idna_is_valid_u_label(const std::u32string_view label) -> bool { } } + // RFC 5890 §2.3.2.1: the corresponding A-label (the "xn--" prefix plus the + // Punycode-encoded body) must not exceed 63 octets + try { + if (4 + utf32_to_punycode(label).size() > MAXIMUM_LABEL_OCTETS) { + return false; + } + } catch (const PunycodeError &) { + return false; + } + return true; } @@ -350,6 +407,11 @@ auto idna_is_valid_a_label(const std::string_view label) -> bool { return false; } + // RFC 5890 §2.3.2.1: a label in A-label form is at most 63 octets + if (label.size() > MAXIMUM_LABEL_OCTETS) { + return false; + } + // RFC 5890 §2.3.2.1: A-labels are pure ASCII for (const auto byte : label) { if (static_cast(byte) > 0x7F) { @@ -361,41 +423,9 @@ auto idna_is_valid_a_label(const std::string_view label) -> bool { // avoids `std::string_view::substr`, which is not noexcept. const std::string_view encoded{label.data() + prefix.size(), label.size() - prefix.size()}; - if (encoded.empty()) { - return false; - } std::u32string decoded; - try { - decoded = punycode_to_utf32(encoded); - } catch (...) { - return false; - } - - // RFC 5890 §2.3.2.1: a U-label contains at least one non-ASCII codepoint. - // A Punycode body that decodes to pure ASCII is not a real A-label. 
- bool has_non_ascii{false}; - for (const auto codepoint : decoded) { - if (codepoint > 0x7F) { - has_non_ascii = true; - break; - } - } - if (!has_non_ascii) { - return false; - } - - if (!idna_is_valid_u_label(decoded)) { - return false; - } - - // RFC 5891 §4.2: A-labels must be in canonical Punycode form, so - // re-encoding the decoded U-label must yield the original bytes. - try { - return utf32_to_punycode(decoded) == encoded; - } catch (...) { - return false; - } + return validate_a_label_body(encoded, decoded); } } // namespace sourcemeta::core diff --git a/vendor/core/src/core/idna/include/sourcemeta/core/idna.h b/vendor/core/src/core/idna/include/sourcemeta/core/idna.h index 166cb5ea6..9c7874682 100644 --- a/vendor/core/src/core/idna/include/sourcemeta/core/idna.h +++ b/vendor/core/src/core/idna/include/sourcemeta/core/idna.h @@ -142,7 +142,10 @@ auto idna_passes_bidi_rule(const std::u32string_view label) noexcept -> bool; /// Return whether the given label is a valid U-label per RFC 5891 §4. See /// https://www.rfc-editor.org/rfc/rfc5891#section-4 for the criteria. /// The Bidi rule is not checked here because Bidi domain detection is a -/// property of the whole domain, not of a single label. For example: +/// property of the whole domain, not of a single label. A pure-ASCII label +/// that satisfies the structural rules is accepted even though it carries no +/// non-ASCII codepoint, so this check is not on its own a guarantee that the +/// label requires IDNA processing. For example: /// /// ```cpp /// #include @@ -158,10 +161,13 @@ auto idna_is_valid_u_label(const std::u32string_view label) -> bool; /// @ingroup idna /// Return whether the given label is a valid A-label per RFC 5891 §4. See /// https://www.rfc-editor.org/rfc/rfc5891#section-4 for the criteria. 
-/// A valid A-label starts with the ACE prefix "xn--", is pure ASCII, has a -/// non-empty Punycode body that decodes to a U-label containing at least -/// one non-ASCII codepoint, and round-trips through Punycode in its -/// canonical form. For example: +/// A valid A-label starts with the lowercase ACE prefix "xn--", is pure +/// ASCII, is at most 63 octets, has a non-empty Punycode body that decodes to +/// a U-label containing at least one non-ASCII codepoint, and round-trips +/// through Punycode in its canonical form. Both the prefix and the Punycode +/// body are matched case-sensitively, so an uppercase prefix or a mixed-case +/// body is rejected. This is intended for registration-side validation rather +/// than case-folding lookup. For example: /// /// ```cpp /// #include diff --git a/vendor/core/src/core/json/construct.h b/vendor/core/src/core/json/construct.h index f7c5661bf..8c385096a 100644 --- a/vendor/core/src/core/json/construct.h +++ b/vendor/core/src/core/json/construct.h @@ -9,26 +9,26 @@ #include "parser.h" -#include // assert -#include // std::size_t -#include // std::uint64_t, std::uint32_t -#include // std::memchr -#include // std::reference_wrapper -#include // std::invalid_argument -#include // std::move -#include // std::vector +#include // assert +#include // std::size_t +#include // std::uint64_t, std::uint32_t, std::uint8_t +#include // std::reference_wrapper +#include // std::invalid_argument +#include // std::string_view +#include // std::move +#include // std::vector namespace sourcemeta::core { namespace internal { -inline auto unescape_string(const char *data, const std::uint32_t length) -> - typename JSON::String { +inline auto unescape_string(const char *data, const std::uint32_t length, + const bool has_escape) -> typename JSON::String { typename JSON::String result; const char *cursor{data}; const char *string_end{data + length}; - if (!std::memchr(data, '\\', length)) { + if (!has_escape) { result.append(data, length); 
return result; } @@ -118,13 +118,10 @@ inline auto unescape_string(const char *data, const std::uint32_t length) -> return result; } -inline auto construct_number(const char *data, const std::uint32_t length) - -> JSON { - const bool has_dot{std::memchr(data, '.', length) != nullptr}; - const bool has_exponent{std::memchr(data, 'e', length) != nullptr || - std::memchr(data, 'E', length) != nullptr}; - - if (has_exponent) { +inline auto construct_number(const char *data, const std::uint32_t length, + const std::uint8_t flags, + const std::uint32_t significant_digits) -> JSON { + if (flags & TAPE_FLAG_NUMBER_EXPONENT) { try { return JSON{Decimal{std::string_view{data, length}}}; } catch (const DecimalParseError &) { @@ -134,27 +131,8 @@ inline auto construct_number(const char *data, const std::uint32_t length) } } - if (has_dot) { - std::size_t first_nonzero_position{JSON::String::npos}; - const auto decimal_position{static_cast( - static_cast(std::memchr(data, '.', length)) - data)}; - for (std::size_t index = 0; index < length; index++) { - if (index != decimal_position && data[index] != '0' && - data[index] != '-') { - first_nonzero_position = index; - break; - } - } - - if (first_nonzero_position == JSON::String::npos) { - first_nonzero_position = 0; - } - - const auto decimal_after_first_nonzero{decimal_position > - first_nonzero_position}; - const auto significant_digits{length - first_nonzero_position - - (decimal_after_first_nonzero ? 
1 : 0)}; - constexpr std::size_t MAX_SAFE_SIGNIFICANT_DIGITS{15}; + if (flags & TAPE_FLAG_NUMBER_DOT) { + constexpr std::uint32_t MAX_SAFE_SIGNIFICANT_DIGITS{15}; if (significant_digits > MAX_SAFE_SIGNIFICANT_DIGITS) { try { return JSON{Decimal{std::string_view{data, length}}}; @@ -165,13 +143,13 @@ inline auto construct_number(const char *data, const std::uint32_t length) } } - const typename JSON::String string_value{data, length}; - const auto double_result{sourcemeta::core::to_double(string_value)}; + const std::string_view value{data, length}; + const auto double_result{sourcemeta::core::to_double(value)}; if (double_result.has_value()) { return JSON{double_result.value()}; } try { - return JSON{Decimal{string_value}}; + return JSON{Decimal{value}}; } catch (const DecimalParseError &) { throw JSONParseError(1, 1); } catch (const std::invalid_argument &) { @@ -185,13 +163,13 @@ inline auto construct_number(const char *data, const std::uint32_t length) } if (digit_length <= 19) { - const typename JSON::String string_value{data, length}; - const auto int_result{sourcemeta::core::to_int64_t(string_value)}; + const std::string_view value{data, length}; + const auto int_result{sourcemeta::core::to_int64_t(value)}; if (int_result.has_value()) { return JSON{int_result.value()}; } try { - return JSON{Decimal{string_value}}; + return JSON{Decimal{value}}; } catch (const DecimalParseError &) { throw JSONParseError(1, 1); } catch (const std::invalid_argument &) { @@ -282,15 +260,16 @@ inline auto construct_json(const char *buffer, return; case TapeType::String: { CALLBACK_PRE(String, entry, JSON::ParseContext::Root, 0, empty_property); - auto value{Result{ - internal::unescape_string(buffer + entry.offset, entry.length)}}; + auto value{Result{internal::unescape_string( + buffer + entry.offset, entry.length, + (entry.flags & TAPE_FLAG_STRING_ESCAPE) != 0)}}; CALLBACK_POST(String, entry.line, internal::post_column_for(entry)); output = std::move(value); return; } case 
TapeType::Number: { - auto value = - internal::construct_number(buffer + entry.offset, entry.length); + auto value = internal::construct_number( + buffer + entry.offset, entry.length, entry.flags, entry.count); if (value.is_integer()) { CALLBACK_PRE(Integer, entry, JSON::ParseContext::Root, 0, empty_property); @@ -336,13 +315,14 @@ do_construct_array: { frames.emplace_back(frames.back().get().back()); } else if (levels.back() == Container::Object) { levels.push_back(Container::Array); - frames.back().get().assign(key, Result::make_array()); + auto &inserted{frames.back().get().assign_assume_new( + std::move(key), Result::make_array(), key_hash)}; if (callback) { callback(JSON::ParsePhase::Pre, JSON::Type::Array, key_line, key_column, JSON::ParseContext::Property, 0, frames.back().get().as_object().back_key()); } - frames.emplace_back(frames.back().get().at(key)); + frames.emplace_back(inserted); } frames.back().get().as_array().reserve(child_count); @@ -400,15 +380,17 @@ do_construct_array_item: { CALLBACK_PRE(String, item_entry, JSON::ParseContext::Index, frames.back().get().size(), empty_property); frames.back().get().push_back(Result{internal::unescape_string( - buffer + item_entry.offset, item_entry.length)}); + buffer + item_entry.offset, item_entry.length, + (item_entry.flags & TAPE_FLAG_STRING_ESCAPE) != 0)}); tape_index++; CALLBACK_POST(String, item_entry.line, internal::post_column_for(item_entry)); goto do_construct_array_item_separator; case TapeType::Number: { const auto current_index{frames.back().get().size()}; - auto value = internal::construct_number(buffer + item_entry.offset, - item_entry.length); + auto value = internal::construct_number( + buffer + item_entry.offset, item_entry.length, item_entry.flags, + item_entry.count); if (value.is_integer()) { CALLBACK_PRE(Integer, item_entry, JSON::ParseContext::Index, current_index, empty_property); @@ -469,13 +451,14 @@ do_construct_object: { frames.emplace_back(frames.back().get().back()); } else if 
(levels.back() == Container::Object) { levels.push_back(Container::Object); - frames.back().get().assign(key, Result::make_object()); + auto &inserted{frames.back().get().assign_assume_new( + std::move(key), Result::make_object(), key_hash)}; if (callback) { callback(JSON::ParsePhase::Pre, JSON::Type::Object, key_line, key_column, JSON::ParseContext::Property, 0, frames.back().get().as_object().back_key()); } - frames.emplace_back(frames.back().get().at(key)); + frames.emplace_back(inserted); } frames.back().get().as_object().reserve(property_count); @@ -498,8 +481,8 @@ do_construct_object_key: { assert(key_entry.type == TapeType::Key); const char *key_data{buffer + key_entry.offset}; const auto key_length{key_entry.length}; - if (std::memchr(key_data, '\\', key_length)) { - key = internal::unescape_string(key_data, key_length); + if (key_entry.flags & TAPE_FLAG_STRING_ESCAPE) { + key = internal::unescape_string(key_data, key_length, true); key_hash = Result::Object::hash(key); } else { key.assign(key_data, key_length); @@ -558,8 +541,9 @@ do_construct_object_value: { case TapeType::String: frames.back().get().assign_assume_new( std::move(key), - Result{internal::unescape_string(buffer + value_entry.offset, - value_entry.length)}, + Result{internal::unescape_string( + buffer + value_entry.offset, value_entry.length, + (value_entry.flags & TAPE_FLAG_STRING_ESCAPE) != 0)}, key_hash); if (callback) { callback(JSON::ParsePhase::Pre, JSON::Type::String, key_line, @@ -571,8 +555,9 @@ do_construct_object_value: { internal::post_column_for(value_entry)); goto do_construct_object_property_end; case TapeType::Number: { - auto value = internal::construct_number(buffer + value_entry.offset, - value_entry.length); + auto value = internal::construct_number( + buffer + value_entry.offset, value_entry.length, value_entry.flags, + value_entry.count); const auto value_type{value.type()}; frames.back().get().assign_assume_new(std::move(key), std::move(value), key_hash); diff --git 
a/vendor/core/src/core/json/include/sourcemeta/core/json_object.h b/vendor/core/src/core/json/include/sourcemeta/core/json_object.h index 51ca77628..79eb8795c 100644 --- a/vendor/core/src/core/json/include/sourcemeta/core/json_object.h +++ b/vendor/core/src/core/json/include/sourcemeta/core/json_object.h @@ -56,37 +56,50 @@ template class JSONObject { auto operator<(const JSONObject &other) const noexcept -> bool { - // The `std::unordered_map` container, by definition, does not provide - // ordering. However, we still want some level of ordering to allow - // arrays of objects to be sorted. - - // First try a size comparison + // Objects have no inherent order, but a deterministic strict weak ordering + // independent of insertion order is needed so that collections of objects + // can be sorted. Smaller objects come first, and objects of equal size are + // ordered as their entries would compare in key order. That outcome is + // decided entirely by the smallest key at which the two objects differ, + // which is found by scanning the entries in place to avoid allocating if (this->data.size() != other.data.size()) { return this->data.size() < other.data.size(); } - // Otherwise do value comparison for common properties - for (const auto &entry : *this) { - const auto other_entry{other.find(entry.first)}; - if (other_entry != other.cend() && entry.second < other_entry->second) { - return true; + const Key *decisive_key{nullptr}; + bool decision{false}; + for (const auto &entry : this->data) { + const auto match{other.find(entry.first)}; + const bool differs{match == other.cend() || + !(entry.second == match->second)}; + if (differs && (decisive_key == nullptr || entry.first < *decisive_key)) { + decisive_key = &entry.first; + decision = match == other.cend() || entry.second < match->second; } } - return false; + for (const auto &entry : other.data) { + if (this->find(entry.first) == this->cend() && + (decisive_key == nullptr || entry.first < *decisive_key)) { + 
decisive_key = &entry.first; + decision = false; + } + } + + return decision; } auto operator<=(const JSONObject &other) const noexcept -> bool { - return this->data <= other.data; + return !(other < *this); } auto operator>(const JSONObject &other) const noexcept -> bool { - return this->data > other.data; + return other < *this; } auto operator>=(const JSONObject &other) const noexcept -> bool { - return this->data >= other.data; + return !(*this < other); } auto operator==(const JSONObject &other) const noexcept @@ -532,7 +545,31 @@ template class JSONObject { } } - this->data.push_back({key, value, key_hash}); + this->data.push_back({std::move(key), std::move(value), key_hash}); + return key_hash; + } + + /// Emplace an object property + inline auto emplace(const Key &key, mapped_type &&value) -> hash_type { + const auto key_hash{this->hash(key)}; + + if (this->hasher.is_perfect(key_hash)) { + for (auto &entry : this->data) { + if (entry.hash == key_hash) { + entry.second = std::move(value); + return key_hash; + } + } + } else { + for (auto &entry : this->data) { + if (entry.hash == key_hash && entry.first == key) { + entry.second = std::move(value); + return key_hash; + } + } + } + + this->data.push_back({key, std::move(value), key_hash}); return key_hash; } @@ -575,10 +612,12 @@ template class JSONObject { return key_hash; } - /// Emplace an object property with a pre-computed hash + /// Emplace an object property with a pre-computed hash, returning the + /// inserted value inline auto emplace_assume_new(Key &&key, mapped_type &&value, - const hash_type key_hash) -> void { + const hash_type key_hash) -> mapped_type & { this->data.push_back({std::move(key), std::move(value), key_hash}); + return this->data.back().second; } /// Emplace an object property with a pre-computed hash diff --git a/vendor/core/src/core/json/include/sourcemeta/core/json_value.h b/vendor/core/src/core/json/include/sourcemeta/core/json_value.h index 37f057686..e862de2a7 100644 --- 
a/vendor/core/src/core/json/include/sourcemeta/core/json_value.h +++ b/vendor/core/src/core/json/include/sourcemeta/core/json_value.h @@ -174,6 +174,16 @@ class SOURCEMETA_CORE_JSON_EXPORT JSON { /// ``` explicit JSON(const String &value); + /// This constructor creates a JSON document from a string type. For example: + /// + /// ```cpp + /// #include + /// + /// sourcemeta::core::JSON::String value{"foo"}; + /// const sourcemeta::core::JSON my_string{std::move(value)}; + /// ``` + explicit JSON(String &&value); + /// This constructor creates a JSON document from a string type. For example: /// /// ```cpp @@ -627,9 +637,9 @@ class SOURCEMETA_CORE_JSON_EXPORT JSON { [[nodiscard]] SOURCEMETA_FORCEINLINE inline auto to_decimal() const noexcept -> const Decimal & { assert(this->is_decimal()); - assert(this->data_decimal->is_finite()); - assert(!this->data_decimal->is_nan()); - return *this->data_decimal; + assert(this->data_decimal.is_finite()); + assert(!this->data_decimal.is_nan()); + return this->data_decimal; } /// Convert a JSON instance into a standard string value. The result of this @@ -998,6 +1008,11 @@ class SOURCEMETA_CORE_JSON_EXPORT JSON { [[nodiscard]] auto at_or(const String &key, const JSON &otherwise) const -> const JSON &; + /// This overload avoids misuses of returning a const reference parameter as a + /// constant reference. + [[nodiscard]] auto at_or(const String &key, JSON &&otherwise) const + -> const JSON & = delete; + /// This method retrieves an object property given a pre-calculated property /// hash, or a user provided value if such property is not defined. /// @@ -1018,13 +1033,8 @@ class SOURCEMETA_CORE_JSON_EXPORT JSON { const typename Object::hash_type hash, const JSON &otherwise) const -> const JSON &; - // Constant reference parameters can accept xvalues which will be destructed - // after the call. 
When the function returns such a parameter also as constant - // reference, then the returned reference can be used after the object it - // refers to has been destroyed. - // https://clang.llvm.org/extra/clang-tidy/checks/bugprone/return-const-ref-from-parameter.html - // This overload avoids mis-uses of retuning const reference parameter as - // constant reference. + /// This overload avoids misuses of returning a const reference parameter as a + /// constant reference. [[nodiscard]] auto at_or(const String &key, const typename Object::hash_type hash, JSON &&otherwise) const -> const JSON & = delete; @@ -1808,9 +1818,10 @@ class SOURCEMETA_CORE_JSON_EXPORT JSON { /// ``` auto assign_assume_new(String &&key, JSON &&value) -> void; - /// This method sets an object key with a pre-computed hash + /// This method sets an object key with a pre-computed hash, returning the + /// inserted value auto assign_assume_new(String &&key, JSON &&value, Object::hash_type hash) - -> void; + -> JSON &; /// This method deletes an object key. For example: /// @@ -2202,11 +2213,12 @@ class SOURCEMETA_CORE_JSON_EXPORT JSON { String data_string; Array data_array; Object data_object; - // Move Decimal to the heap to reduce the size of the JSON class. - // Dealing with arbitrary precision numbers is not common, so we pay the - // indirection cost only when needed. 
- Decimal *data_decimal; + Decimal data_decimal; }; + + // Storing the decimal inline must not grow the union beyond its existing + // footprint + static_assert(sizeof(Decimal) <= sizeof(String)); #if defined(_MSC_VER) #pragma warning(default : 4251) #endif diff --git a/vendor/core/src/core/json/json.cc b/vendor/core/src/core/json/json.cc index 3fbf22e1b..d4229a7f3 100644 --- a/vendor/core/src/core/json/json.cc +++ b/vendor/core/src/core/json/json.cc @@ -12,7 +12,9 @@ #include // std::uint64_t #include // std::filesystem #include // std::basic_istream +#include // std::numeric_limits #include // std::basic_ostream +#include // std::cmp_greater #include // std::vector namespace sourcemeta::core { @@ -23,6 +25,13 @@ static auto internal_parse_json(const char *&cursor, const char *end, const bool track_positions, JSON &output) -> void { const char *buffer_start{cursor}; + // Tape entries address the input with 32-bit offsets and lengths, so a larger + // input cannot be represented without truncation + if (std::cmp_greater(end - cursor, + std::numeric_limits::max())) { + throw JSONParseError(line, column); + } + std::vector tape; tape.reserve(static_cast(end - cursor) / 8); if (callback || track_positions) { diff --git a/vendor/core/src/core/json/json_value.cc b/vendor/core/src/core/json/json_value.cc index efff89988..5231525f8 100644 --- a/vendor/core/src/core/json/json_value.cc +++ b/vendor/core/src/core/json/json_value.cc @@ -56,6 +56,10 @@ JSON::JSON(const String &value) : current_type{Type::String} { std::construct_at(&this->data_string, value); } +JSON::JSON(String &&value) : current_type{Type::String} { + std::construct_at(&this->data_string, std::move(value)); +} + JSON::JSON(const std::basic_string_view &value) : current_type{Type::String} { std::construct_at(&this->data_string, value); @@ -99,7 +103,7 @@ JSON::JSON(const Decimal &value) : current_type{Type::Decimal} { throw std::invalid_argument("JSON does not support Infinity or NaN"); } - 
this->data_decimal = new Decimal{value}; + std::construct_at(&this->data_decimal, value); } JSON::JSON(Decimal &&value) : current_type{Type::Decimal} { @@ -107,7 +111,7 @@ JSON::JSON(Decimal &&value) : current_type{Type::Decimal} { throw std::invalid_argument("JSON does not support Infinity or NaN"); } - this->data_decimal = new Decimal{std::move(value)}; + std::construct_at(&this->data_decimal, std::move(value)); } JSON::JSON(const JSON &other) { @@ -133,7 +137,7 @@ JSON::JSON(const JSON &other) { this->current_type = Type::String; return; case Type::Decimal: - this->data_decimal = new Decimal{*other.data_decimal}; + std::construct_at(&this->data_decimal, other.data_decimal); this->current_type = Type::Decimal; return; case Type::Array: @@ -183,7 +187,7 @@ JSON::JSON(const JSON &other) { destination.current_type = Type::String; break; case Type::Decimal: - destination.data_decimal = new Decimal{*source.data_decimal}; + std::construct_at(&destination.data_decimal, source.data_decimal); destination.current_type = Type::Decimal; break; case Type::Array: { @@ -251,7 +255,12 @@ JSON::JSON(JSON &&other) noexcept : current_type{other.current_type} { other.current_type = Type::Null; break; case Type::Decimal: - this->data_decimal = std::exchange(other.data_decimal, nullptr); + std::construct_at(&this->data_decimal, std::move(other.data_decimal)); + // Marking the source as empty means its destructor will never visit + // the decimal member again, so end that member's lifetime here. The + // moved-from state owns no heap coefficient, making this a no-op + // branch rather than a deallocation + other.data_decimal.~Decimal(); other.current_type = Type::Null; break; default: @@ -264,39 +273,51 @@ auto JSON::operator=(const JSON &other) -> JSON & { return *this; } - // Fast path for scalar sources: destroy this iteratively (safe for any - // depth) then assign the scalar directly. 
Avoids the copy-then-move dance - // that the container path needs for strong exception safety + // Fast path for scalar sources: tear this value down iteratively, which is + // safe for any depth, then assign the scalar directly. Each scalar is + // buffered into a local first, because the source may be nested inside this + // value, and tearing this value down would otherwise free the storage still + // being read from switch (other.current_type) { case Type::Null: this->~JSON(); this->current_type = Type::Null; return *this; - case Type::Boolean: + case Type::Boolean: { + const auto value{other.data_boolean}; this->~JSON(); - this->data_boolean = other.data_boolean; + this->data_boolean = value; this->current_type = Type::Boolean; return *this; - case Type::Integer: + } + case Type::Integer: { + const auto value{other.data_integer}; this->~JSON(); - this->data_integer = other.data_integer; + this->data_integer = value; this->current_type = Type::Integer; return *this; - case Type::Real: + } + case Type::Real: { + const auto value{other.data_real}; this->~JSON(); - this->data_real = other.data_real; + this->data_real = value; this->current_type = Type::Real; return *this; - case Type::String: + } + case Type::String: { + String value{other.data_string}; this->~JSON(); - std::construct_at(&this->data_string, other.data_string); + std::construct_at(&this->data_string, std::move(value)); this->current_type = Type::String; return *this; - case Type::Decimal: + } + case Type::Decimal: { + Decimal value{other.data_decimal}; this->~JSON(); - this->data_decimal = new Decimal{*other.data_decimal}; + std::construct_at(&this->data_decimal, std::move(value)); this->current_type = Type::Decimal; return *this; + } case Type::Array: case Type::Object: break; @@ -312,13 +333,17 @@ auto JSON::operator=(const JSON &other) -> JSON & { } auto JSON::operator=(JSON &&other) noexcept -> JSON & { - // Destroy-then-rebuild so the existing value in this is torn down by the - // iterative 
destructor if (this == &other) { return *this; } + + // Steal the source into a local before this value is torn down, because the + // source may be nested inside this value, and tearing it down first would + // free the storage still being moved from. Parentheses select the move + // constructor rather than the list constructor + JSON moved(std::move(other)); this->~JSON(); - std::construct_at(this, std::move(other)); + std::construct_at(this, std::move(moved)); return *this; } @@ -495,7 +520,7 @@ auto JSON::operator==(const JSON &other) const noexcept -> bool { case Type::Real: return this->data_real == other.data_real; case Type::Decimal: - return *this->data_decimal == *other.data_decimal; + return this->data_decimal == other.data_decimal; case Type::String: return this->data_string == other.data_string; case Type::Array: @@ -825,7 +850,7 @@ auto JSON::assign(const JSON::String &key, const JSON &value) -> void { auto JSON::assign(const JSON::String &key, JSON &&value) -> void { assert(this->is_object()); - this->data_object.emplace(key, value); + this->data_object.emplace(key, std::move(value)); } auto JSON::try_assign_before(const String &key, const JSON &value, @@ -860,9 +885,10 @@ auto JSON::assign_assume_new(JSON::String &&key, JSON &&value) -> void { } auto JSON::assign_assume_new(JSON::String &&key, JSON &&value, - Object::hash_type hash) -> void { + Object::hash_type hash) -> JSON & { assert(this->is_object()); - this->data_object.emplace_assume_new(std::move(key), std::move(value), hash); + return this->data_object.emplace_assume_new(std::move(key), std::move(value), + hash); } auto JSON::erase(const JSON::String &key) -> typename Object::size_type { @@ -907,6 +933,15 @@ auto JSON::clear_except(std::initializer_list keys) -> void { auto JSON::merge(const JSON::Object &other) -> void { assert(this->is_object()); + // When the source is this object's own container, the insertions below would + // reallocate the very storage being iterated, so it is 
snapshotted first + if (&other == &this->data_object) { + // NOLINTNEXTLINE(performance-unnecessary-copy-initialization) + const JSON::Object snapshot{other}; + this->merge(snapshot); + return; + } + for (const auto &pair : other) { const auto maybe_key{this->try_at(pair.first, pair.hash)}; if (maybe_key && maybe_key->is_object() && pair.second.is_object()) { @@ -974,7 +1009,7 @@ auto JSON::maybe_destruct_union() -> void { this->data_object.~JSONObject(); break; case Type::Decimal: - delete this->data_decimal; + this->data_decimal.~Decimal(); break; default: break; diff --git a/vendor/core/src/core/json/parser.h b/vendor/core/src/core/json/parser.h index b5f8eed1e..e40228020 100644 --- a/vendor/core/src/core/json/parser.h +++ b/vendor/core/src/core/json/parser.h @@ -6,8 +6,11 @@ #include "grammar.h" +#include // std::countr_zero, std::endian #include // assert -#include // std::uint64_t, std::uint32_t +#include // std::size_t +#include // std::uint64_t, std::uint32_t, std::uint8_t +#include // std::memcpy #include // std::vector namespace sourcemeta::core { @@ -25,8 +28,15 @@ enum class TapeType : std::uint8_t { False }; +constexpr std::uint8_t TAPE_FLAG_STRING_ESCAPE{0x01}; +constexpr std::uint8_t TAPE_FLAG_NUMBER_DOT{0x02}; +constexpr std::uint8_t TAPE_FLAG_NUMBER_EXPONENT{0x04}; + +// The flags byte lives in what was already padding, so recording scan facts +// for construct does not grow the entry struct TapeEntry { TapeType type; + std::uint8_t flags; std::uint32_t offset; std::uint32_t length; std::uint32_t count; @@ -34,8 +44,43 @@ struct TapeEntry { std::uint64_t column; }; +static_assert(sizeof(TapeEntry) == 32); + namespace internal { +constexpr std::uint64_t WORD_LOW_BITS{0x0101010101010101ULL}; +constexpr std::uint64_t WORD_HIGH_BITS{0x8080808080808080ULL}; + +inline auto is_plain_string_byte(const char character) -> bool { + return character != internal::token_string_quote && + character != internal::token_string_escape && + static_cast(character) >= 
0x20; +} + +// Flag every byte that ends the plain run of a string: the closing quote, +// the escape introducer, or a control character that RFC 8259 forbids +// unescaped. Subtraction borrows can falsely flag the byte directly after a +// genuinely flagged byte, which is harmless because callers only act on the +// first flagged byte, and the first flag is always genuine +inline auto match_string_special_bytes(const std::uint64_t word) + -> std::uint64_t { + constexpr auto quote_pattern{ + WORD_LOW_BITS * + static_cast(internal::token_string_quote)}; + constexpr auto escape_pattern{ + WORD_LOW_BITS * + static_cast(internal::token_string_escape)}; + const auto quote_difference{word ^ quote_pattern}; + const auto escape_difference{word ^ escape_pattern}; + const auto quote_matches{(quote_difference - WORD_LOW_BITS) & + ~quote_difference & WORD_HIGH_BITS}; + const auto escape_matches{(escape_difference - WORD_LOW_BITS) & + ~escape_difference & WORD_HIGH_BITS}; + const auto control_matches{(word - (WORD_LOW_BITS * 0x20ULL)) & ~word & + WORD_HIGH_BITS}; + return quote_matches | escape_matches | control_matches; +} + template inline auto skip_whitespace(const char *&cursor, const char *end, std::uint64_t &line, std::uint64_t &column) @@ -228,13 +273,26 @@ inline auto scan_string_escape(const std::uint64_t line, std::uint64_t &column, template inline auto scan_string(const std::uint64_t line, std::uint64_t &column, - const char *&cursor, const char *end) -> void { - using CharT = typename JSON::Char; + const char *&cursor, const char *end) -> bool { + bool has_escape{false}; while (cursor < end) { const char *scan{cursor}; - while (scan < end && *scan != internal::token_string_quote && - *scan != internal::token_string_escape && - static_cast(*scan) >= 0x20) { + + if constexpr (std::endian::native == std::endian::little) { + while (scan + sizeof(std::uint64_t) <= end) { + std::uint64_t word; + std::memcpy(&word, scan, sizeof(word)); + const auto 
matches{internal::match_string_special_bytes(word)}; + if (matches != 0) { + scan += static_cast(std::countr_zero(matches)) >> 3; + break; + } + + scan += sizeof(std::uint64_t); + } + } + + while (scan < end && internal::is_plain_string_byte(*scan)) { scan++; } @@ -259,8 +317,9 @@ inline auto scan_string(const std::uint64_t line, std::uint64_t &column, switch (character) { case internal::token_string_quote: - return; + return has_escape; case internal::token_string_escape: + has_escape = true; scan_string_escape(line, column, cursor, end); break; default: @@ -296,11 +355,18 @@ inline auto scan_digits(const std::uint64_t line, std::uint64_t &column, } } +struct NumberFacts { + std::uint8_t flags{0}; + std::uint32_t significant_digits{0}; +}; + template inline auto scan_number(const std::uint64_t line, std::uint64_t &column, const char *&cursor, const char *end, const char first) - -> void { + -> NumberFacts { using CharT = typename JSON::Char; + NumberFacts facts; + const char *literal_start{cursor - 1}; if (first == internal::token_number_minus) { if (cursor >= end || *cursor < internal::token_number_zero || *cursor > internal::token_number_nine) [[unlikely]] { @@ -313,6 +379,8 @@ inline auto scan_number(const std::uint64_t line, std::uint64_t &column, const char int_start{first == internal::token_number_minus ? *cursor : first}; + const char *integer_begin{ + first == internal::token_number_minus ? 
cursor : cursor - 1}; if (first == internal::token_number_minus) { if constexpr (TrackPositions) { column += 1; @@ -332,7 +400,10 @@ inline auto scan_number(const std::uint64_t line, std::uint64_t &column, scan_digits(line, column, cursor, end, false); } + const char *dot_position{nullptr}; if (cursor < end && *cursor == internal::token_number_decimal_point) { + dot_position = cursor; + facts.flags |= TAPE_FLAG_NUMBER_DOT; if constexpr (TrackPositions) { column += 1; } @@ -343,6 +414,7 @@ inline auto scan_number(const std::uint64_t line, std::uint64_t &column, if (cursor < end && (*cursor == internal::token_number_exponent_lowercase || *cursor == internal::token_number_exponent_uppercase)) { + facts.flags |= TAPE_FLAG_NUMBER_EXPONENT; if constexpr (TrackPositions) { column += 1; } @@ -356,6 +428,35 @@ inline auto scan_number(const std::uint64_t line, std::uint64_t &column, } scan_digits(line, column, cursor, end, true); } + + // The significant digit count only matters for choosing between a double + // and an arbitrary precision representation, a choice that exponents force + // on their own + if (dot_position && !(facts.flags & TAPE_FLAG_NUMBER_EXPONENT)) { + const char *first_significant{nullptr}; + if (int_start != internal::token_number_zero) { + first_significant = integer_begin; + } else { + for (const char *pointer = dot_position + 1; pointer < cursor; + pointer++) { + if (*pointer != internal::token_number_zero) { + first_significant = pointer; + break; + } + } + } + + if (first_significant) { + facts.significant_digits = static_cast( + cursor - first_significant - + (dot_position > first_significant ? 
1 : 0)); + } else { + facts.significant_digits = + static_cast(cursor - literal_start - 1); + } + } + + return facts; } } // namespace internal @@ -395,24 +496,27 @@ inline auto scan_json(const char *&cursor, const char *end, switch (character) { case internal::token_true: internal::scan_true(line, column, cursor, end); - tape.push_back({TapeType::True, 0, 0, 0, value_line, value_column}); + tape.push_back({TapeType::True, 0, 0, 0, 0, value_line, value_column}); return; case internal::token_false: internal::scan_false(line, column, cursor, end); - tape.push_back({TapeType::False, 0, 0, 0, value_line, value_column}); + tape.push_back({TapeType::False, 0, 0, 0, 0, value_line, value_column}); return; case internal::token_null: internal::scan_null(line, column, cursor, end); - tape.push_back({TapeType::Null, 0, 0, 0, value_line, value_column}); + tape.push_back({TapeType::Null, 0, 0, 0, 0, value_line, value_column}); return; case internal::token_string_quote: { const auto string_start{ static_cast(cursor - buffer_start)}; - internal::scan_string(line, column, cursor, end); + const auto string_has_escape{ + internal::scan_string(line, column, cursor, end)}; const auto string_length{static_cast( cursor - buffer_start - string_start - 1)}; - tape.push_back({TapeType::String, string_start, string_length, 0, - value_line, value_column}); + tape.push_back( + {TapeType::String, + string_has_escape ? 
TAPE_FLAG_STRING_ESCAPE : std::uint8_t{0}, + string_start, string_length, 0, value_line, value_column}); return; } case internal::token_array_begin: @@ -432,11 +536,12 @@ inline auto scan_json(const char *&cursor, const char *end, case internal::token_number_nine: { const auto number_start{ static_cast(cursor - buffer_start - 1)}; - internal::scan_number(line, column, cursor, end, - character); + const auto number_facts{internal::scan_number( + line, column, cursor, end, character)}; const auto number_length{ static_cast(cursor - buffer_start - number_start)}; - tape.push_back({TapeType::Number, number_start, number_length, 0, + tape.push_back({TapeType::Number, number_facts.flags, number_start, + number_length, number_facts.significant_digits, value_line, value_column}); return; } @@ -451,7 +556,7 @@ inline auto scan_json(const char *&cursor, const char *end, do_scan_array: { const auto start_index{tape.size()}; - tape.push_back({TapeType::ArrayStart, 0, 0, 0, line, column}); + tape.push_back({TapeType::ArrayStart, 0, 0, 0, 0, line, column}); container_stack.push_back({start_index, 0}); internal::skip_whitespace(cursor, end, line, column); @@ -468,7 +573,7 @@ do_scan_array: { } cursor++; tape[start_index].count = 0; - tape.push_back({TapeType::ArrayEnd, 0, 0, 0, line, column}); + tape.push_back({TapeType::ArrayEnd, 0, 0, 0, 0, line, column}); container_stack.pop_back(); goto do_scan_container_end; } @@ -502,24 +607,27 @@ do_scan_array: { goto do_scan_object; case internal::token_true: internal::scan_true(line, column, cursor, end); - tape.push_back({TapeType::True, 0, 0, 0, value_line, value_column}); + tape.push_back({TapeType::True, 0, 0, 0, 0, value_line, value_column}); goto do_scan_array_item_separator; case internal::token_false: internal::scan_false(line, column, cursor, end); - tape.push_back({TapeType::False, 0, 0, 0, value_line, value_column}); + tape.push_back({TapeType::False, 0, 0, 0, 0, value_line, value_column}); goto do_scan_array_item_separator; 
case internal::token_null: internal::scan_null(line, column, cursor, end); - tape.push_back({TapeType::Null, 0, 0, 0, value_line, value_column}); + tape.push_back({TapeType::Null, 0, 0, 0, 0, value_line, value_column}); goto do_scan_array_item_separator; case internal::token_string_quote: { const auto string_start{ static_cast(cursor - buffer_start)}; - internal::scan_string(line, column, cursor, end); + const auto string_has_escape{ + internal::scan_string(line, column, cursor, end)}; const auto string_length{static_cast( cursor - buffer_start - string_start - 1)}; - tape.push_back({TapeType::String, string_start, string_length, 0, - value_line, value_column}); + tape.push_back( + {TapeType::String, + string_has_escape ? TAPE_FLAG_STRING_ESCAPE : std::uint8_t{0}, + string_start, string_length, 0, value_line, value_column}); goto do_scan_array_item_separator; } case internal::token_number_minus: @@ -535,11 +643,12 @@ do_scan_array: { case internal::token_number_nine: { const auto number_start{ static_cast(cursor - buffer_start - 1)}; - internal::scan_number(line, column, cursor, end, - character); + const auto number_facts{internal::scan_number( + line, column, cursor, end, character)}; const auto number_length{ static_cast(cursor - buffer_start - number_start)}; - tape.push_back({TapeType::Number, number_start, number_length, 0, + tape.push_back({TapeType::Number, number_facts.flags, number_start, + number_length, number_facts.significant_digits, value_line, value_column}); goto do_scan_array_item_separator; } @@ -567,7 +676,7 @@ do_scan_array: { assert(!container_stack.empty()); auto &frame{container_stack.back()}; tape[frame.tape_index].count = frame.child_count; - tape.push_back({TapeType::ArrayEnd, 0, 0, 0, line, column}); + tape.push_back({TapeType::ArrayEnd, 0, 0, 0, 0, line, column}); container_stack.pop_back(); goto do_scan_container_end; } @@ -581,7 +690,7 @@ do_scan_array: { do_scan_object: { const auto start_index{tape.size()}; - 
tape.push_back({TapeType::ObjectStart, 0, 0, 0, line, column}); + tape.push_back({TapeType::ObjectStart, 0, 0, 0, 0, line, column}); container_stack.push_back({start_index, 0}); internal::skip_whitespace(cursor, end, line, column); @@ -598,7 +707,7 @@ do_scan_object: { } cursor++; tape[start_index].count = 0; - tape.push_back({TapeType::ObjectEnd, 0, 0, 0, line, column}); + tape.push_back({TapeType::ObjectEnd, 0, 0, 0, 0, line, column}); container_stack.pop_back(); goto do_scan_container_end; } @@ -626,11 +735,14 @@ do_scan_object: { const auto key_start{static_cast(cursor - buffer_start)}; const auto key_line{line}; const auto key_column{column}; - internal::scan_string(line, column, cursor, end); + const auto key_has_escape{ + internal::scan_string(line, column, cursor, end)}; const auto key_length{ static_cast(cursor - buffer_start - key_start - 1)}; tape.push_back( - {TapeType::Key, key_start, key_length, 0, key_line, key_column}); + {TapeType::Key, + key_has_escape ? TAPE_FLAG_STRING_ESCAPE : std::uint8_t{0}, + key_start, key_length, 0, key_line, key_column}); goto do_scan_object_separator; } default: @@ -679,24 +791,27 @@ do_scan_object: { goto do_scan_object; case internal::token_true: internal::scan_true(line, column, cursor, end); - tape.push_back({TapeType::True, 0, 0, 0, value_line, value_column}); + tape.push_back({TapeType::True, 0, 0, 0, 0, value_line, value_column}); goto do_scan_object_property_end; case internal::token_false: internal::scan_false(line, column, cursor, end); - tape.push_back({TapeType::False, 0, 0, 0, value_line, value_column}); + tape.push_back({TapeType::False, 0, 0, 0, 0, value_line, value_column}); goto do_scan_object_property_end; case internal::token_null: internal::scan_null(line, column, cursor, end); - tape.push_back({TapeType::Null, 0, 0, 0, value_line, value_column}); + tape.push_back({TapeType::Null, 0, 0, 0, 0, value_line, value_column}); goto do_scan_object_property_end; case internal::token_string_quote: { const auto 
string_start{ static_cast(cursor - buffer_start)}; - internal::scan_string(line, column, cursor, end); + const auto string_has_escape{ + internal::scan_string(line, column, cursor, end)}; const auto string_length{static_cast( cursor - buffer_start - string_start - 1)}; - tape.push_back({TapeType::String, string_start, string_length, 0, - value_line, value_column}); + tape.push_back( + {TapeType::String, + string_has_escape ? TAPE_FLAG_STRING_ESCAPE : std::uint8_t{0}, + string_start, string_length, 0, value_line, value_column}); goto do_scan_object_property_end; } case internal::token_number_minus: @@ -712,11 +827,12 @@ do_scan_object: { case internal::token_number_nine: { const auto number_start{ static_cast(cursor - buffer_start - 1)}; - internal::scan_number(line, column, cursor, end, - character); + const auto number_facts{internal::scan_number( + line, column, cursor, end, character)}; const auto number_length{ static_cast(cursor - buffer_start - number_start)}; - tape.push_back({TapeType::Number, number_start, number_length, 0, + tape.push_back({TapeType::Number, number_facts.flags, number_start, + number_length, number_facts.significant_digits, value_line, value_column}); goto do_scan_object_property_end; } @@ -744,7 +860,7 @@ do_scan_object: { assert(!container_stack.empty()); auto &frame{container_stack.back()}; tape[frame.tape_index].count = frame.child_count; - tape.push_back({TapeType::ObjectEnd, 0, 0, 0, line, column}); + tape.push_back({TapeType::ObjectEnd, 0, 0, 0, 0, line, column}); container_stack.pop_back(); goto do_scan_container_end; } diff --git a/vendor/core/src/core/jsonpointer/include/sourcemeta/core/jsonpointer.h b/vendor/core/src/core/jsonpointer/include/sourcemeta/core/jsonpointer.h index 7f0b88253..d3f79ebc6 100644 --- a/vendor/core/src/core/jsonpointer/include/sourcemeta/core/jsonpointer.h +++ b/vendor/core/src/core/jsonpointer/include/sourcemeta/core/jsonpointer.h @@ -18,6 +18,7 @@ #include // assert #include // std::reference_wrapper 
#include // std::allocator +#include // std::optional #include // std::basic_ostream #include // std::basic_string #include // std::string_view @@ -404,6 +405,25 @@ auto to_pointer( const std::basic_string_view input) -> Pointer; +/// @ingroup jsonpointer +/// Parse the URI fragment representation of a JSON Pointer, percent-decoding +/// the fragment data before interpretation as mandated by RFC 3986, Section +/// 2.4 and RFC 6901, Section 6. The result is not set if the URI has no +/// fragment or if its fragment is not a valid JSON Pointer. For example: +/// +/// ```cpp +/// #include +/// #include +/// #include +/// +/// const sourcemeta::core::URI uri{"https://www.example.com#/foo/bar"}; +/// const auto pointer{sourcemeta::core::fragment_to_pointer(uri)}; +/// assert(pointer.has_value()); +/// assert(pointer.value().size() == 2); +/// ``` +SOURCEMETA_CORE_JSONPOINTER_EXPORT +auto fragment_to_pointer(const URI &uri) -> std::optional; + /// @ingroup jsonpointer /// Convert a JSON WeakPointer into a JSON Pointer. 
For example: /// diff --git a/vendor/core/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_pointer.h b/vendor/core/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_pointer.h index a07242a02..d28ccd4ce 100644 --- a/vendor/core/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_pointer.h +++ b/vendor/core/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_pointer.h @@ -868,7 +868,7 @@ template class GenericPointer { for (const auto &element : value.as_array()) { if (element.is_string()) { result.emplace_back(element.to_string()); - } else if (element.is_integer()) { + } else if (element.is_integer() && element.to_integer() >= 0) { result.emplace_back( static_cast(element.to_integer())); } else { diff --git a/vendor/core/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_token.h b/vendor/core/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_token.h index 69d639940..01f4d020d 100644 --- a/vendor/core/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_token.h +++ b/vendor/core/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_token.h @@ -79,7 +79,9 @@ template class GenericToken { /// ``` GenericToken(const int value) : as_property{false}, property{DEFAULT_PROPERTY}, hash{0}, - index{static_cast(value)} {} + index{static_cast(value)} { + assert(value >= 0); + } #if defined(_MSC_VER) /// This constructor creates an JSON Pointer token from an item index. 
For diff --git a/vendor/core/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_walker.h b/vendor/core/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_walker.h index 9441f564b..610099c2a 100644 --- a/vendor/core/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_walker.h +++ b/vendor/core/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_walker.h @@ -16,7 +16,10 @@ template class GenericPointerWalker { using internal = typename std::vector; public: - GenericPointerWalker(const JSON &document) { this->walk(document, {}); } + GenericPointerWalker(const JSON &document) { + PointerT accumulator; + this->walk(document, accumulator); + } using const_iterator = typename internal::const_iterator; [[nodiscard]] auto begin() const -> const_iterator { @@ -33,19 +36,19 @@ template class GenericPointerWalker { }; private: - auto walk(const JSON &document, const PointerT &pointer) -> void { + auto walk(const JSON &document, PointerT &pointer) -> void { this->pointers.push_back(pointer); if (document.is_array()) { for (std::size_t index = 0; index < document.size(); index++) { - PointerT subpointer{pointer}; - subpointer.emplace_back(index); - this->walk(document.at(index), subpointer); + pointer.emplace_back(index); + this->walk(document.at(index), pointer); + pointer.pop_back(); } } else if (document.is_object()) { for (const auto &pair : document.as_object()) { - PointerT subpointer{pointer}; - subpointer.emplace_back(pair.first); - this->walk(pair.second, subpointer); + pointer.emplace_back(pair.first); + this->walk(pair.second, pointer); + pointer.pop_back(); } } } diff --git a/vendor/core/src/core/jsonpointer/jsonpointer.cc b/vendor/core/src/core/jsonpointer/jsonpointer.cc index 576cb53b8..6677cb508 100644 --- a/vendor/core/src/core/jsonpointer/jsonpointer.cc +++ b/vendor/core/src/core/jsonpointer/jsonpointer.cc @@ -12,8 +12,10 @@ #include // std::array #include // assert #include // std::to_chars +#include // std::size_t #include // std::cbegin, 
std::cend, std::prev, std::advance #include // std::allocator +#include // std::optional, std::nullopt #include // std::basic_ostream #include // std::basic_ostringstream, std::basic_stringstream #include // std::basic_string @@ -23,6 +25,18 @@ namespace { +auto uri_hex_value(const char character) -> int { + if (character >= '0' && character <= '9') { + return character - '0'; + } else if (character >= 'A' && character <= 'F') { + return character - 'A' + 10; + } else if (character >= 'a' && character <= 'f') { + return character - 'a' + 10; + } else { + return -1; + } +} + template