From 7bbde582ce2e669af199febb758c10ac636e8ba9 Mon Sep 17 00:00:00 2001 From: Tobiasz Laskowski Date: Thu, 5 Feb 2026 17:54:37 +0000 Subject: [PATCH 1/8] Merge encodings into a single file The decode methods use methods from other encodings. For example, Utf8::decode calls Utf16::getCharCount and Utf16::encode in a loop. Placing them in the same file makes it easier for them to be inlined which improves performance. --- src/cpp/encoding/Ascii.cpp | 66 ---- src/cpp/encoding/Encodings.cpp | 641 +++++++++++++++++++++++++++++++++ src/cpp/encoding/Utf16.cpp | 279 -------------- src/cpp/encoding/Utf8.cpp | 303 ---------------- toolchain/haxe-target.xml | 10 +- 5 files changed, 644 insertions(+), 655 deletions(-) delete mode 100644 src/cpp/encoding/Ascii.cpp create mode 100644 src/cpp/encoding/Encodings.cpp delete mode 100644 src/cpp/encoding/Utf16.cpp delete mode 100644 src/cpp/encoding/Utf8.cpp diff --git a/src/cpp/encoding/Ascii.cpp b/src/cpp/encoding/Ascii.cpp deleted file mode 100644 index 7a0acd8bf..000000000 --- a/src/cpp/encoding/Ascii.cpp +++ /dev/null @@ -1,66 +0,0 @@ -#include - -using namespace cpp::marshal; - -bool cpp::encoding::Ascii::isEncoded(const String& string) -{ - if (null() == string) - { - hx::NullReference("String", false); - } - - return string.isAsciiEncoded(); -} - -int64_t cpp::encoding::Ascii::encode(const String& string, View buffer) -{ - if (null() == string) - { - hx::NullReference("String", false); - } - - if (string.isUTF16Encoded()) - { - hx::Throw(HX_CSTRING("String cannot be encoded to ASCII")); - } - - auto src = cpp::marshal::View(string.raw_ptr(), string.length).reinterpret(); - - if (src.tryCopyTo(buffer)) - { - return src.length; - } - else - { - return hx::Throw(HX_CSTRING("Buffer too small")); - } -} - -String cpp::encoding::Ascii::decode(View view) -{ - if (view.isEmpty()) - { - return hx::Throw(HX_CSTRING("View is empty")); - } - - auto bytes = int64_t{ 0 }; - auto i = int64_t{ 0 }; - auto chars = view.reinterpret(); - - while (i < chars.length && 0 != chars.ptr[i]) - { - bytes += sizeof(char); - i++; - } - - if (0 == bytes) - { - return String::emptyString; - } - - auto backing = hx::NewGCPrivate(0, bytes + sizeof(char)); - - std::memcpy(backing, view.ptr.ptr, bytes); - - return String(static_cast(backing), bytes / sizeof(char)); -} diff --git a/src/cpp/encoding/Encodings.cpp b/src/cpp/encoding/Encodings.cpp new file mode 100644 index 000000000..c3d30d314 --- /dev/null +++ b/src/cpp/encoding/Encodings.cpp @@ -0,0 +1,641 @@ +#include +#include + +using namespace cpp::marshal; + +bool cpp::encoding::Ascii::isEncoded(const String& string) +{ + if (null() == string) + { + hx::NullReference("String", false); + } + + return string.isAsciiEncoded(); +} + +int64_t cpp::encoding::Ascii::encode(const String& string, View buffer) +{ + if (null() == string) + { + hx::NullReference("String", false); + } + + if (string.isUTF16Encoded()) + { + hx::Throw(HX_CSTRING("String cannot be encoded to ASCII")); + } + + auto src = cpp::marshal::View(string.raw_ptr(), string.length).reinterpret(); + + if (src.tryCopyTo(buffer)) + { + return src.length; + } + else + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } +} + +String cpp::encoding::Ascii::decode(View view) +{ + if (view.isEmpty()) + { + return hx::Throw(HX_CSTRING("View is empty")); + } + + auto bytes = int64_t{ 0 }; + auto i = int64_t{ 0 }; + auto chars = view.reinterpret(); + + while (i < chars.length && 0 != chars.ptr[i]) + { + bytes += sizeof(char); + i++; + } + + if (0 == bytes) + { + return String::emptyString; + } + + auto backing = hx::NewGCPrivate(0, bytes + sizeof(char)); + + std::memcpy(backing, view.ptr.ptr, bytes); + + return String(static_cast(backing), bytes / sizeof(char)); +} + +namespace +{ + bool isAsciiUtf8Buffer(const View& buffer) + { + auto i = int64_t{ 0 }; + while (i < buffer.length) + { + auto p = cpp::encoding::Utf8::codepoint(buffer.slice(i)); + + if (p > 127) + { + return false; + } + + i += cpp::encoding::Utf8::getByteCount(p); + } + + return true; + } +} + +int cpp::encoding::Utf8::getByteCount(const null&) +{ + hx::NullReference("String", false); + return 0; +} + +int cpp::encoding::Utf8::getByteCount(const char32_t& codepoint) +{ + if (codepoint <= 0x7F) + { + return 1; + } + else if (codepoint <= 0x7FF) + { + return 2; + } + else if (codepoint <= 0xFFFF) + { + return 3; + } + else + { + return 4; + } +} + +int64_t cpp::encoding::Utf8::getByteCount(const String& string) +{ + if (null() == string) + { + hx::NullReference("String", false); + } + + if (string.isAsciiEncoded()) + { + return string.length; + } + +#if defined(HX_SMART_STRINGS) + auto source = View(string.raw_wptr(), string.length).reinterpret(); + auto length = source.length; + auto bytes = int64_t{ 0 }; + auto i = int64_t{ 0 }; + + while (i < source.length) + { + auto slice = source.slice(i); + auto p = Utf16::codepoint(slice); + + i += Utf16::getByteCount(p); + bytes += getByteCount(p); + } + + return bytes; +#else + return hx::Throw(HX_CSTRING("Unexpected encoding error")); +#endif +} + +int cpp::encoding::Utf8::getCharCount(const null&) +{ + hx::NullReference("String", false); + return 0; +} + +int cpp::encoding::Utf8::getCharCount(const char32_t& codepoint) +{ + return getByteCount(codepoint) / sizeof(char); +} + +int64_t cpp::encoding::Utf8::getCharCount(const String& string) +{ + return getByteCount(string) / sizeof(char); +} + +int cpp::encoding::Utf8::encode(const null&, const cpp::marshal::View& buffer) +{ + hx::NullReference("String", false); + return 0; +} + +int64_t cpp::encoding::Utf8::encode(const String& string, const cpp::marshal::View& buffer) +{ + if (null() == string) + { + hx::NullReference("String", false); + } + + if (0 == string.length) + { + return 0; + } + + if (buffer.isEmpty()) + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } + + if (string.isAsciiEncoded()) + { + auto src = cpp::marshal::View(reinterpret_cast(const_cast(string.raw_ptr())), string.length); + + if (src.tryCopyTo(buffer)) + { + return src.length; + } + else + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } + } + +#if defined(HX_SMART_STRINGS) + if (getByteCount(string) > buffer.length) + { + hx::Throw(HX_CSTRING("Buffer too small")); + } + + auto initialPtr = buffer.ptr.ptr; + auto source = View(string.raw_wptr(), string.length).reinterpret(); + auto i = int64_t{ 0 }; + auto k = int64_t{ 0 }; + + while (i < source.length) + { + auto p = Utf16::codepoint(source.slice(i)); + + i += Utf16::getByteCount(p); + k += encode(p, buffer.slice(k)); + } + + return k; +#else + return hx::Throw(HX_CSTRING("Unexpected encoding error")); +#endif +} + +int cpp::encoding::Utf8::encode(const char32_t& codepoint, const cpp::marshal::View& buffer) +{ + if (codepoint <= 0x7F) + { + buffer[0] = static_cast(codepoint); + + return 1; + } + else if (codepoint <= 0x7FF) + { + auto data = std::array + { { + static_cast(0xC0 | (codepoint >> 6)), + static_cast(0x80 | (codepoint & 63)) + } }; + auto src = View(data.data(), data.size()); + + src.copyTo(buffer); + + return data.size(); + } + else if (codepoint <= 0xFFFF) + { + auto data = std::array + { { + static_cast(0xE0 | (codepoint >> 12)), + static_cast(0x80 | ((codepoint >> 6) & 63)), + static_cast(0x80 | (codepoint & 63)) + } }; + + auto src = View(data.data(), data.size()); + + src.copyTo(buffer); + + return data.size(); + } + else + { + auto data = std::array + { { + static_cast(0xF0 | (codepoint >> 18)), + static_cast(0x80 | ((codepoint >> 12) & 63)), + static_cast(0x80 | ((codepoint >> 6) & 63)), + static_cast(0x80 | (codepoint & 63)) + } }; + + auto src = View(data.data(), data.size()); + + src.copyTo(buffer); + + return data.size(); + } +} + +String cpp::encoding::Utf8::decode(const cpp::marshal::View& buffer) +{ + if (buffer.isEmpty()) + { + return String::emptyString; + } + + if (isAsciiUtf8Buffer(buffer)) + { + return Ascii::decode(buffer); + } + +#if defined(HX_SMART_STRINGS) + auto chars = int64_t{ 0 }; + auto i = int64_t{ 0 }; + + while (i < buffer.length) + { + auto p = codepoint(buffer.slice(i)); + + i += getByteCount(p); + chars += Utf16::getCharCount(p); + } + + auto backing = View(::String::allocChar16Ptr(chars), chars); + auto output = backing.reinterpret(); + auto k = int64_t{ 0 }; + + i = 0; + while (i < buffer.length) + { + auto p = codepoint(buffer.slice(i)); + + i += getByteCount(p); + k += Utf16::encode(p, output.slice(k)); + } + + return String(backing.ptr.ptr, chars); +#else + auto backing = View(hx::InternalNew(buffer.length, false), buffer.length); + + std::memcpy(backing.ptr.ptr, buffer.ptr.ptr, buffer.length); + + return String(backing.ptr.ptr, static_cast(buffer.length)); +#endif +} + +char32_t cpp::encoding::Utf8::codepoint(const cpp::marshal::View& buffer) +{ + auto b0 = static_cast(buffer[0]); + + if ((b0 & 0x80) == 0) + { + return b0; + } + else if ((b0 & 0xE0) == 0xC0) + { + return (static_cast(b0 & 0x1F) << 6) | static_cast(buffer.slice(1)[0] & 0x3F); + } + else if ((b0 & 0xF0) == 0xE0) + { + auto staging = std::array(); + auto dst = View(staging.data(), staging.size()); + + buffer.slice(1, staging.size()).copyTo(dst); + + return (static_cast(b0 & 0x0F) << 12) | (static_cast(staging[0] & 0x3F) << 6) | static_cast(staging[1] & 0x3F); + } + else if ((b0 & 0xF8) == 0xF0) + { + auto staging = std::array(); + auto dst = View(staging.data(), staging.size()); + + buffer.slice(1, staging.size()).copyTo(dst); + + return + (static_cast(b0 & 0x07) << 18) | + (static_cast(staging[0] & 0x3F) << 12) | + (static_cast(staging[1] & 0x3F) << 6) | + static_cast(staging[2] & 0x3F); + } + else + { + return int{ hx::Throw(HX_CSTRING("Failed to read codepoint")) }; + } +} + +namespace +{ + bool isSurrogate(char32_t codepoint) + { + return codepoint >= 0xd800 && codepoint < 0xe000; + } + + bool isLowSurrogate(char32_t codepoint) + { + return codepoint >= 0xdc00 && codepoint < 0xe000; + } + + bool isHighSurrogate(char32_t codepoint) + { + return codepoint >= 0xd800 && codepoint < 0xdc00; + } + + bool isAsciiUtf16Buffer(const View& buffer) + { + auto i = int64_t{ 0 }; + while (i < buffer.length) + { + auto p = cpp::encoding::Utf16::codepoint(buffer.slice(i)); + + if (p > 127) + { + return false; + } + + i += cpp::encoding::Utf16::getByteCount(p); + } + + return true; + } + + String toAsciiString(const View& buffer) + { + auto bytes = buffer.length / sizeof(char16_t); + auto chars = View(hx::InternalNew(bytes + 1, false), bytes * sizeof(char)); + auto i = int64_t{ 0 }; + auto k = int64_t{ 0 }; + + while (i < buffer.length) + { + auto p = cpp::encoding::Utf16::codepoint(buffer.slice(i)); + + chars[k++] = static_cast(p); + + i += cpp::encoding::Utf16::getByteCount(p); + } + + return String(chars.ptr.ptr, chars.length); + } +} + +bool cpp::encoding::Utf16::isEncoded(const String& string) +{ + if (null() == string) + { + hx::NullReference("String", false); + } + + return string.isUTF16Encoded(); +} + +int cpp::encoding::Utf16::getByteCount(const null&) +{ + hx::NullReference("String", false); + return 0; +} + +int cpp::encoding::Utf16::getByteCount(const char32_t& codepoint) +{ + return codepoint <= 0xFFFF ? 2 : 4; +} + +int64_t cpp::encoding::Utf16::getByteCount(const String& string) +{ + if (null() == string) + { + hx::NullReference("String", false); + } + + if (string.isUTF16Encoded()) + { + return string.length * sizeof(char16_t); + } + else + { + auto bytes = int64_t{ 0 }; + for (auto i = 0; i < string.length; i++) + { + bytes += getByteCount(static_cast(string.raw_ptr()[i])); + } + + return bytes; + } +} + +int cpp::encoding::Utf16::getCharCount(const null&) +{ + hx::NullReference("String", false); + return 0; +} + +int cpp::encoding::Utf16::getCharCount(const char32_t& codepoint) +{ + return getByteCount(codepoint) / sizeof(char16_t); +} + +int64_t cpp::encoding::Utf16::getCharCount(const String& string) +{ + return getByteCount(string) / sizeof(char16_t); +} + +int cpp::encoding::Utf16::encode(const null&, const cpp::marshal::View& buffer) +{ + hx::NullReference("String", false); + return 0; +} + +int64_t cpp::encoding::Utf16::encode(const String& string, const cpp::marshal::View& buffer) +{ + if (null() == string) + { + hx::NullReference("String", false); + } + + if (0 == string.length) + { + return 0; + } + + if (buffer.isEmpty()) + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } + +#if defined(HX_SMART_STRINGS) + if (string.isUTF16Encoded()) + { + auto src = cpp::marshal::View(reinterpret_cast(const_cast(string.raw_wptr())), string.length * sizeof(char16_t)); + + if (src.tryCopyTo(buffer)) + { + return src.length; + } + else + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } + } + else +#endif + { + auto bytes = int64_t{ 0 }; + for (auto i = 0; i < string.length; i++) + { + bytes += getByteCount(static_cast(string.raw_ptr()[i])); + } + + if (bytes > buffer.length) + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } + + auto i = int64_t{ 0 }; + for (auto k = 0; k < string.length; k++) + { + i += encode(static_cast(string.raw_ptr()[k]), buffer.slice(i)); + } + + return bytes; + } +} + +int cpp::encoding::Utf16::encode(const char32_t& codepoint, const cpp::marshal::View& buffer) +{ + if (codepoint < 0xD800) + { + Marshal::writeUInt16(buffer, static_cast(codepoint)); + + return 2; + } + else if (codepoint < 0xE000) + { + // D800 - DFFF is invalid + + return hx::Throw(HX_CSTRING("Invalid UTF16")); + } + else if (codepoint < 0x10000) + { + Marshal::writeUInt16(buffer, static_cast(codepoint)); + + return 2; + } + else if (codepoint < 0x110000) + { + auto staging = std::array(); + auto fst = View(staging.data(), 2); + auto snd = View(staging.data() + 2, 2); + auto all = View(staging.data(), staging.size()); + + Marshal::writeUInt16(fst, 0xD800 + (((codepoint - 0x10000) >> 10) & 0x3FF)); + Marshal::writeUInt16(snd, 0xDC00 + ((codepoint - 0x10000) & 0x3FF)); + + all.copyTo(buffer); + + return 4; + } + + return 0; +} + +String cpp::encoding::Utf16::decode(const cpp::marshal::View& buffer) +{ + if (buffer.isEmpty()) + { + return String::emptyString; + } + + if (isAsciiUtf16Buffer(buffer)) + { + return toAsciiString(buffer); + } + +#if defined(HX_SMART_STRINGS) + auto i = int64_t{ 0 }; + while (i < buffer.length) + { + auto p = codepoint(buffer.slice(i)); + + i += getByteCount(p); + } + + auto chars = i / sizeof(char16_t); + auto backing = View(::String::allocChar16Ptr(chars), chars); + auto output = backing.reinterpret(); + auto k = int64_t{ 0 }; + + i = 0; + while (i < buffer.length) + { + auto p = codepoint(buffer.slice(i)); + + i += getByteCount(p); + k += encode(p, output.slice(k)); + } + + return String(backing.ptr.ptr, chars); +#else + return hx::Throw(HX_CSTRING("Not Implemented : UTF16 decode when HX_SMART_STRINGS is not defined")); +#endif +} + +char32_t cpp::encoding::Utf16::codepoint(const cpp::marshal::View& buffer) +{ + auto first = static_cast(Marshal::readUInt16(buffer)); + + if (0xD800 <= first && first < 0xDc00) + { + auto second = static_cast(Marshal::readUInt16(buffer.slice(2))); + if (0xDC00 <= second && second < 0xE000) + { + return static_cast((((first - 0xD800) << 10) | (second - 0xDC00)) + 0x10000); + } + + return int{ hx::Throw(HX_CSTRING("Invalid UTF16")) }; + } + else + { + return static_cast(first); + } +} diff --git a/src/cpp/encoding/Utf16.cpp b/src/cpp/encoding/Utf16.cpp deleted file mode 100644 index 930513259..000000000 --- a/src/cpp/encoding/Utf16.cpp +++ /dev/null @@ -1,279 +0,0 @@ -#include -#include - -using namespace cpp::marshal; - -namespace -{ - bool isSurrogate(char32_t codepoint) - { - return codepoint >= 0xd800 && codepoint < 0xe000; - } - - bool isLowSurrogate(char32_t codepoint) - { - return codepoint >= 0xdc00 && codepoint < 0xe000; - } - - bool isHighSurrogate(char32_t codepoint) - { - return codepoint >= 0xd800 && codepoint < 0xdc00; - } - - bool isAsciiBuffer(const View& buffer) - { - auto i = int64_t{ 0 }; - while (i < buffer.length) - { - auto p = cpp::encoding::Utf16::codepoint(buffer.slice(i)); - - if (p > 127) - { - return false; - } - - i += cpp::encoding::Utf16::getByteCount(p); - } - - return true; - } - - String toAsciiString(const View& buffer) - { - auto bytes = buffer.length / sizeof(char16_t); - auto chars = View(hx::InternalNew(bytes + 1, false), bytes * sizeof(char)); - auto i = int64_t{ 0 }; - auto k = int64_t{ 0 }; - - while (i < buffer.length) - { - auto p = cpp::encoding::Utf16::codepoint(buffer.slice(i)); - - chars[k++] = static_cast(p); - - i += cpp::encoding::Utf16::getByteCount(p); - } - - return String(chars.ptr.ptr, chars.length); - } -} - -bool cpp::encoding::Utf16::isEncoded(const String& string) -{ - if (null() == string) - { - hx::NullReference("String", false); - } - - return string.isUTF16Encoded(); -} - -int cpp::encoding::Utf16::getByteCount(const null&) -{ - hx::NullReference("String", false); - return 0; -} - -int cpp::encoding::Utf16::getByteCount(const char32_t& codepoint) -{ - return codepoint <= 0xFFFF ? 2 : 4; -} - -int64_t cpp::encoding::Utf16::getByteCount(const String& string) -{ - if (null() == string) - { - hx::NullReference("String", false); - } - - if (string.isUTF16Encoded()) - { - return string.length * sizeof(char16_t); - } - else - { - auto bytes = int64_t{ 0 }; - for (auto i = 0; i < string.length; i++) - { - bytes += getByteCount(static_cast(string.raw_ptr()[i])); - } - - return bytes; - } -} - -int cpp::encoding::Utf16::getCharCount(const null&) -{ - hx::NullReference("String", false); - return 0; -} - -int cpp::encoding::Utf16::getCharCount(const char32_t& codepoint) -{ - return getByteCount(codepoint) / sizeof(char16_t); -} - -int64_t cpp::encoding::Utf16::getCharCount(const String& string) -{ - return getByteCount(string) / sizeof(char16_t); -} - -int cpp::encoding::Utf16::encode(const null&, const cpp::marshal::View& buffer) -{ - hx::NullReference("String", false); - return 0; -} - -int64_t cpp::encoding::Utf16::encode(const String& string, const cpp::marshal::View& buffer) -{ - if (null() == string) - { - hx::NullReference("String", false); - } - - if (0 == string.length) - { - return 0; - } - - if (buffer.isEmpty()) - { - return hx::Throw(HX_CSTRING("Buffer too small")); - } - -#if defined(HX_SMART_STRINGS) - if (string.isUTF16Encoded()) - { - auto src = cpp::marshal::View(reinterpret_cast(const_cast(string.raw_wptr())), string.length * sizeof(char16_t)); - - if (src.tryCopyTo(buffer)) - { - return src.length; - } - else - { - return hx::Throw(HX_CSTRING("Buffer too small")); - } - } - else -#endif - { - auto bytes = int64_t{ 0 }; - for (auto i = 0; i < string.length; i++) - { - bytes += getByteCount(static_cast(string.raw_ptr()[i])); - } - - if (bytes > buffer.length) - { - return hx::Throw(HX_CSTRING("Buffer too small")); - } - - auto i = int64_t{ 0 }; - for (auto k = 0; k < string.length; k++) - { - i += encode(static_cast(string.raw_ptr()[k]), buffer.slice(i)); - } - - return bytes; - } -} - -int cpp::encoding::Utf16::encode(const char32_t& codepoint, const cpp::marshal::View& buffer) -{ - if (codepoint < 0xD800) - { - Marshal::writeUInt16(buffer, static_cast(codepoint)); - - return 2; - } - else if (codepoint < 0xE000) - { - // D800 - DFFF is invalid - - return hx::Throw(HX_CSTRING("Invalid UTF16")); - } - else if (codepoint < 0x10000) - { - Marshal::writeUInt16(buffer, static_cast(codepoint)); - - return 2; - } - else if (codepoint < 0x110000) - { - auto staging = std::array(); - auto fst = View(staging.data(), 2); - auto snd = View(staging.data() + 2, 2); - auto all = View(staging.data(), staging.size()); - - Marshal::writeUInt16(fst, 0xD800 + (((codepoint - 0x10000) >> 10) & 0x3FF)); - Marshal::writeUInt16(snd, 0xDC00 + ((codepoint - 0x10000) & 0x3FF)); - - all.copyTo(buffer); - - return 4; - } - - return 0; -} - -String cpp::encoding::Utf16::decode(const cpp::marshal::View& buffer) -{ - if (buffer.isEmpty()) - { - return String::emptyString; - } - - if (isAsciiBuffer(buffer)) - { - return toAsciiString(buffer); - } - -#if defined(HX_SMART_STRINGS) - auto i = int64_t{ 0 }; - while (i < buffer.length) - { - auto p = codepoint(buffer.slice(i)); - - i += getByteCount(p); - } - - auto chars = i / sizeof(char16_t); - auto backing = View(::String::allocChar16Ptr(chars), chars); - auto output = backing.reinterpret(); - auto k = int64_t{ 0 }; - - i = 0; - while (i < buffer.length) - { - auto p = codepoint(buffer.slice(i)); - - i += getByteCount(p); - k += encode(p, output.slice(k)); - } - - return String(backing.ptr.ptr, chars); -#else - return hx::Throw(HX_CSTRING("Not Implemented : UTF16 decode when HX_SMART_STRINGS is not defined")); -#endif -} - -char32_t cpp::encoding::Utf16::codepoint(const cpp::marshal::View& buffer) -{ - auto first = static_cast(Marshal::readUInt16(buffer)); - - if (0xD800 <= first && first < 0xDc00) - { - auto second = static_cast(Marshal::readUInt16(buffer.slice(2))); - if (0xDC00 <= second && second < 0xE000) - { - return static_cast((((first - 0xD800) << 10) | (second - 0xDC00)) + 0x10000); - } - - return int{ hx::Throw(HX_CSTRING("Invalid UTF16")) }; - } - else - { - return static_cast(first); - } -} diff --git a/src/cpp/encoding/Utf8.cpp b/src/cpp/encoding/Utf8.cpp deleted file mode 100644 index 6ff51af96..000000000 --- a/src/cpp/encoding/Utf8.cpp +++ /dev/null @@ -1,303 +0,0 @@ -#include -#include - -using namespace cpp::marshal; - -namespace -{ - bool isAsciiBuffer(const View& buffer) - { - auto i = int64_t{ 0 }; - while (i < buffer.length) - { - auto p = cpp::encoding::Utf8::codepoint(buffer.slice(i)); - - if (p > 127) - { - return false; - } - - i += cpp::encoding::Utf8::getByteCount(p); - } - - return true; - } -} - -int cpp::encoding::Utf8::getByteCount(const null&) -{ - hx::NullReference("String", false); - return 0; -} - -int cpp::encoding::Utf8::getByteCount(const char32_t& codepoint) -{ - if (codepoint <= 0x7F) - { - return 1; - } - else if (codepoint <= 0x7FF) - { - return 2; - } - else if (codepoint <= 0xFFFF) - { - return 3; - } - else - { - return 4; - } -} - -int64_t cpp::encoding::Utf8::getByteCount(const String& string) -{ - if (null() == string) - { - hx::NullReference("String", false); - } - - if (string.isAsciiEncoded()) - { - return string.length; - } - -#if defined(HX_SMART_STRINGS) - auto source = View(string.raw_wptr(), string.length).reinterpret(); - auto length = source.length; - auto bytes = int64_t{ 0 }; - auto i = int64_t{ 0 }; - - while (i < source.length) - { - auto slice = source.slice(i); - auto p = Utf16::codepoint(slice); - - i += Utf16::getByteCount(p); - bytes += getByteCount(p); - } - - return bytes; -#else - return hx::Throw(HX_CSTRING("Unexpected encoding error")); -#endif -} - -int cpp::encoding::Utf8::getCharCount(const null&) -{ - hx::NullReference("String", false); - return 0; -} - -int cpp::encoding::Utf8::getCharCount(const char32_t& codepoint) -{ - return getByteCount(codepoint) / sizeof(char); -} - -int64_t cpp::encoding::Utf8::getCharCount(const String& string) -{ - return getByteCount(string) / sizeof(char); -} - -int cpp::encoding::Utf8::encode(const null&, const cpp::marshal::View& buffer) -{ - hx::NullReference("String", false); - return 0; -} - -int64_t cpp::encoding::Utf8::encode(const String& string, const cpp::marshal::View& buffer) -{ - if (null() == string) - { - hx::NullReference("String", false); - } - - if (0 == string.length) - { - return 0; - } - - if (buffer.isEmpty()) - { - return hx::Throw(HX_CSTRING("Buffer too small")); - } - - if (string.isAsciiEncoded()) - { - auto src = cpp::marshal::View(reinterpret_cast(const_cast(string.raw_ptr())), string.length); - - if (src.tryCopyTo(buffer)) - { - return src.length; - } - else - { - return hx::Throw(HX_CSTRING("Buffer too small")); - } - } - -#if defined(HX_SMART_STRINGS) - if (getByteCount(string) > buffer.length) - { - hx::Throw(HX_CSTRING("Buffer too small")); - } - - auto initialPtr = buffer.ptr.ptr; - auto source = View(string.raw_wptr(), string.length).reinterpret(); - auto i = int64_t{ 0 }; - auto k = int64_t{ 0 }; - - while (i < source.length) - { - auto p = Utf16::codepoint(source.slice(i)); - - i += Utf16::getByteCount(p); - k += encode(p, buffer.slice(k)); - } - - return k; -#else - return hx::Throw(HX_CSTRING("Unexpected encoding error")); -#endif -} - -int cpp::encoding::Utf8::encode(const char32_t& codepoint, const cpp::marshal::View& buffer) -{ - if (codepoint <= 0x7F) - { - buffer[0] = static_cast(codepoint); - - return 1; - } - else if (codepoint <= 0x7FF) - { - auto data = std::array - { { - static_cast(0xC0 | (codepoint >> 6)), - static_cast(0x80 | (codepoint & 63)) - } }; - auto src = View(data.data(), data.size()); - - src.copyTo(buffer); - - return data.size(); - } - else if (codepoint <= 0xFFFF) - { - auto data = std::array - { { - static_cast(0xE0 | (codepoint >> 12)), - static_cast(0x80 | ((codepoint >> 6) & 63)), - static_cast(0x80 | (codepoint & 63)) - } }; - - auto src = View(data.data(), data.size()); - - src.copyTo(buffer); - - return data.size(); - } - else - { - auto data = std::array - { { - static_cast(0xF0 | (codepoint >> 18)), - static_cast(0x80 | ((codepoint >> 12) & 63)), - static_cast(0x80 | ((codepoint >> 6) & 63)), - static_cast(0x80 | (codepoint & 63)) - } }; - - auto src = View(data.data(), data.size()); - - src.copyTo(buffer); - - return data.size(); - } -} - -String cpp::encoding::Utf8::decode(const cpp::marshal::View& buffer) -{ - if (buffer.isEmpty()) - { - return String::emptyString; - } - - if (isAsciiBuffer(buffer)) - { - return Ascii::decode(buffer); - } - -#if defined(HX_SMART_STRINGS) - auto chars = int64_t{ 0 }; - auto i = int64_t{ 0 }; - - while (i < buffer.length) - { - auto p = codepoint(buffer.slice(i)); - - i += getByteCount(p); - chars += Utf16::getCharCount(p); - } - - auto backing = View(::String::allocChar16Ptr(chars), chars); - auto output = backing.reinterpret(); - auto k = int64_t{ 0 }; - - i = 0; - while (i < buffer.length) - { - auto p = codepoint(buffer.slice(i)); - - i += getByteCount(p); - k += Utf16::encode(p, output.slice(k)); - } - - return String(backing.ptr.ptr, chars); -#else - auto backing = View(hx::InternalNew(buffer.length, false), buffer.length); - - std::memcpy(backing.ptr.ptr, buffer.ptr.ptr, buffer.length); - - return String(backing.ptr.ptr, static_cast(buffer.length)); -#endif -} - -char32_t cpp::encoding::Utf8::codepoint(const cpp::marshal::View& buffer) -{ - auto b0 = static_cast(buffer[0]); - - if ((b0 & 0x80) == 0) - { - return b0; - } - else if ((b0 & 0xE0) == 0xC0) - { - return (static_cast(b0 & 0x1F) << 6) | static_cast(buffer.slice(1)[0] & 0x3F); - } - else if ((b0 & 0xF0) == 0xE0) - { - auto staging = std::array(); - auto dst = View(staging.data(), staging.size()); - - buffer.slice(1, staging.size()).copyTo(dst); - - return (static_cast(b0 & 0x0F) << 12) | (static_cast(staging[0] & 0x3F) << 6) | static_cast(staging[1] & 0x3F); - } - else if ((b0 & 0xF8) == 0xF0) - { - auto staging = std::array(); - auto dst = View(staging.data(), staging.size()); - - buffer.slice(1, staging.size()).copyTo(dst); - - return - (static_cast(b0 & 0x07) << 18) | - (static_cast(staging[0] & 0x3F) << 12) | - (static_cast(staging[1] & 0x3F) << 6) | - static_cast(staging[2] & 0x3F); - } - else - { - return int{ hx::Throw(HX_CSTRING("Failed to read codepoint")) }; - } -} diff --git a/toolchain/haxe-target.xml b/toolchain/haxe-target.xml index 0c50477bb..4a253bc12 100644 --- a/toolchain/haxe-target.xml +++ b/toolchain/haxe-target.xml @@ -83,7 +83,7 @@ - + @@ -188,7 +188,7 @@ - + @@ -202,9 +202,7 @@ - - - + @@ -291,5 +289,3 @@ - - From 854eef56696c579c83c16fb4d98785bd9b11c5b0 Mon Sep 17 00:00:00 2001 From: Tobiasz Laskowski Date: Thu, 5 Feb 2026 19:16:48 +0000 Subject: [PATCH 2/8] Avoid duplicate string iteration for ascii checks --- src/cpp/encoding/Encodings.cpp | 37 +++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/src/cpp/encoding/Encodings.cpp b/src/cpp/encoding/Encodings.cpp index c3d30d314..98c206e97 100644 --- a/src/cpp/encoding/Encodings.cpp +++ b/src/cpp/encoding/Encodings.cpp @@ -285,23 +285,29 @@ String cpp::encoding::Utf8::decode(const cpp::marshal::View& buffer) return String::emptyString; } - if (isAsciiUtf8Buffer(buffer)) - { - return Ascii::decode(buffer); - } - #if defined(HX_SMART_STRINGS) auto chars = int64_t{ 0 }; auto i = int64_t{ 0 }; + bool isAscii = true; while (i < buffer.length) { auto p = codepoint(buffer.slice(i)); + if (p > 127) + { + isAscii = false; + } + i += getByteCount(p); chars += Utf16::getCharCount(p); } + if (isAscii) + { + return Ascii::decode(buffer); + } + auto backing = View(::String::allocChar16Ptr(chars), chars); auto output = backing.reinterpret(); auto k = int64_t{ 0 }; @@ -317,6 +323,11 @@ String cpp::encoding::Utf8::decode(const cpp::marshal::View& buffer) return String(backing.ptr.ptr, chars); #else + if (isAsciiUtf8Buffer(buffer)) + { + return Ascii::decode(buffer); + } + auto backing = View(hx::InternalNew(buffer.length, false), buffer.length); std::memcpy(backing.ptr.ptr, buffer.ptr.ptr, buffer.length); @@ -586,20 +597,26 @@ String cpp::encoding::Utf16::decode(const cpp::marshal::View& buffer) return String::emptyString; } - if (isAsciiUtf16Buffer(buffer)) - { - return toAsciiString(buffer); - } - #if defined(HX_SMART_STRINGS) auto i = int64_t{ 0 }; + bool isAscii = true; while (i < buffer.length) { auto p = codepoint(buffer.slice(i)); + if (p > 127) + { + isAscii = false; + } + i += getByteCount(p); } + if (isAscii) + { + return toAsciiString(buffer); + } + auto chars = i / sizeof(char16_t); auto backing = View(::String::allocChar16Ptr(chars), chars); auto output = backing.reinterpret(); From 7ede40a856d2c70e45d020787baffbddac49df74 Mon Sep 17 00:00:00 2001 From: Tobiasz Laskowski Date: Thu, 5 Feb 2026 18:54:23 +0000 Subject: [PATCH 3/8] Add utf8 encode function that allocates output The existing Utf8::encode function takes in a buffer, but we don't know what size is required so we have to iterate through the string before writing to make sure the buffer is big enough. If the caller already ran getByteCount, then this means we have duplicated their work just for the case where they did not do it properly. This new method allocates its own buffer that is definitely the right size, which avoids the need for unnecessary checks --- include/cpp/encoding/Utf8.hpp | 1 + src/cpp/encoding/Encodings.cpp | 47 ++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/include/cpp/encoding/Utf8.hpp b/include/cpp/encoding/Utf8.hpp index 9636b8b51..8eeebead5 100644 --- a/include/cpp/encoding/Utf8.hpp +++ b/include/cpp/encoding/Utf8.hpp @@ -17,6 +17,7 @@ namespace cpp static int encode(const null&, const cpp::marshal::View& buffer); static int encode(const char32_t& codepoint, const cpp::marshal::View& buffer); static int64_t encode(const String& string, const cpp::marshal::View& buffer); + static Array encode(const String& string); static char32_t codepoint(const cpp::marshal::View& buffer); static String decode(const cpp::marshal::View& buffer); diff --git a/src/cpp/encoding/Encodings.cpp b/src/cpp/encoding/Encodings.cpp index 98c206e97..59228194f 100644 --- a/src/cpp/encoding/Encodings.cpp +++ b/src/cpp/encoding/Encodings.cpp @@ -224,6 +224,53 @@ int64_t cpp::encoding::Utf8::encode(const String& string, const cpp::marshal::Vi #endif } +Array cpp::encoding::Utf8::encode(const String& string) +{ + if (null() == string) + { + hx::NullReference("String", false); + } + + if (0 == string.length) + { + return 0; + } + + if (string.isAsciiEncoded()) + { + Array out(string.length, 0); + + View src(reinterpret_cast(const_cast(string.raw_ptr())), string.length); + View buffer(out->Pointer(), out->length); + + src.copyTo(buffer); + + return out; + } + +#if defined(HX_SMART_STRINGS) + Array out(getByteCount(string), 0); + View buffer(out->Pointer(), out->length); + + auto initialPtr = buffer.ptr.ptr; + auto source = View(string.raw_wptr(), string.length).reinterpret(); + auto i = int64_t{ 0 }; + auto k = int64_t{ 0 }; + + while (i < source.length) + { + auto p = Utf16::codepoint(source.slice(i)); + + i += Utf16::getByteCount(p); + k += encode(p, buffer.slice(k)); + } + + return out; +#else + return hx::Throw(HX_CSTRING("Unexpected encoding error")); +#endif +} + int cpp::encoding::Utf8::encode(const char32_t& codepoint, const cpp::marshal::View& buffer) { if (codepoint <= 0x7F) From 13e0791b398c04d0cdcccff2a3754d85c7407960 Mon Sep 17 00:00:00 2001 From: Tobiasz Laskowski Date: Thu, 5 Feb 2026 21:01:47 +0000 Subject: [PATCH 4/8] Pass char32_t by value instead of by reference --- include/cpp/encoding/Utf16.hpp | 6 +++--- include/cpp/encoding/Utf8.hpp | 6 +++--- src/cpp/encoding/Encodings.cpp | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/cpp/encoding/Utf16.hpp b/include/cpp/encoding/Utf16.hpp index ae2aacf31..6d851e14f 100644 --- a/include/cpp/encoding/Utf16.hpp +++ b/include/cpp/encoding/Utf16.hpp @@ -9,15 +9,15 @@ namespace cpp static bool isEncoded(const String& string); static int getByteCount(const null&); - static int getByteCount(const char32_t& codepoint); + static int getByteCount(char32_t codepoint); static int64_t getByteCount(const String& string); static int getCharCount(const null&); - static int getCharCount(const char32_t& codepoint); + static int getCharCount(char32_t codepoint); static int64_t getCharCount(const String& string); static int encode(const null&, const cpp::marshal::View& buffer); - static int encode(const char32_t& codepoint, const cpp::marshal::View& buffer); + static int encode(char32_t codepoint, const cpp::marshal::View& buffer); static int64_t encode(const String& string, const cpp::marshal::View& buffer); static char32_t codepoint(const cpp::marshal::View& buffer); diff --git a/include/cpp/encoding/Utf8.hpp b/include/cpp/encoding/Utf8.hpp index 8eeebead5..3ff12ba4b 100644 --- a/include/cpp/encoding/Utf8.hpp +++ b/include/cpp/encoding/Utf8.hpp @@ -7,15 +7,15 @@ namespace cpp struct Utf8 final { static int getByteCount(const null&); - static int getByteCount(const char32_t& codepoint); + static int getByteCount(char32_t codepoint); static int64_t getByteCount(const String& string); static int getCharCount(const null&); - static int getCharCount(const char32_t& codepoint); + static int getCharCount(char32_t codepoint); static int64_t getCharCount(const String& string); static int encode(const null&, const cpp::marshal::View& buffer); - static int encode(const char32_t& codepoint, const cpp::marshal::View& buffer); + static int encode(char32_t codepoint, const cpp::marshal::View& buffer); static int64_t encode(const String& string, const cpp::marshal::View& buffer); static Array encode(const String& string); diff --git a/src/cpp/encoding/Encodings.cpp b/src/cpp/encoding/Encodings.cpp index 59228194f..472e77094 100644 --- a/src/cpp/encoding/Encodings.cpp +++ b/src/cpp/encoding/Encodings.cpp @@ -93,7 +93,7 @@ int cpp::encoding::Utf8::getByteCount(const null&) return 0; } -int cpp::encoding::Utf8::getByteCount(const char32_t& codepoint) +int cpp::encoding::Utf8::getByteCount(char32_t codepoint) { if (codepoint <= 0x7F) { @@ -152,7 +152,7 @@ int cpp::encoding::Utf8::getCharCount(const null&) return 0; } -int cpp::encoding::Utf8::getCharCount(const char32_t& codepoint) +int cpp::encoding::Utf8::getCharCount(char32_t codepoint) { return getByteCount(codepoint) / sizeof(char); } @@ -271,7 +271,7 @@ Array cpp::encoding::Utf8::encode(const String& string) #endif } -int cpp::encoding::Utf8::encode(const char32_t& codepoint, const cpp::marshal::View& buffer) +int cpp::encoding::Utf8::encode(char32_t codepoint, const cpp::marshal::View& buffer) { if (codepoint <= 0x7F) { @@ -494,7 +494,7 @@ int cpp::encoding::Utf16::getByteCount(const null&) return 0; } -int cpp::encoding::Utf16::getByteCount(const char32_t& codepoint) +int cpp::encoding::Utf16::getByteCount(char32_t codepoint) { return codepoint <= 0xFFFF ? 2 : 4; } @@ -528,7 +528,7 @@ int cpp::encoding::Utf16::getCharCount(const null&) return 0; } -int cpp::encoding::Utf16::getCharCount(const char32_t& codepoint) +int cpp::encoding::Utf16::getCharCount(char32_t codepoint) { return getByteCount(codepoint) / sizeof(char16_t); } @@ -599,7 +599,7 @@ int64_t cpp::encoding::Utf16::encode(const String& string, const cpp::marshal::V } } -int cpp::encoding::Utf16::encode(const char32_t& codepoint, const cpp::marshal::View& buffer) +int cpp::encoding::Utf16::encode(char32_t codepoint, const cpp::marshal::View& buffer) { if (codepoint < 0xD800) { From 26f03c7f3ceaa1377f4c9f1245314dfba67fd6bc Mon Sep 17 00:00:00 2001 From: Tobiasz Laskowski Date: Thu, 5 Feb 2026 23:03:40 +0000 Subject: [PATCH 5/8] Estimate utf8 length instead of iterating --- src/cpp/encoding/Encodings.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/cpp/encoding/Encodings.cpp b/src/cpp/encoding/Encodings.cpp index 472e77094..5abd4c84d 100644 --- a/src/cpp/encoding/Encodings.cpp +++ b/src/cpp/encoding/Encodings.cpp @@ -249,7 +249,8 @@ Array cpp::encoding::Utf8::encode(const String& string) } #if defined(HX_SMART_STRINGS) - Array out(getByteCount(string), 0); + // estimate the utf8 length with an upper bound + Array out(string.length * 3, 0); View buffer(out->Pointer(), out->length); auto initialPtr = buffer.ptr.ptr; @@ -265,6 +266,8 @@ Array cpp::encoding::Utf8::encode(const String& string) k += encode(p, buffer.slice(k)); } + out->resize(i); + return out; #else return hx::Throw(HX_CSTRING("Unexpected encoding error")); From b5351504ab445261ea9c330c2339a439f18f8195 Mon Sep 17 00:00:00 2001 From: Tobiasz Laskowski Date: Thu, 5 Feb 2026 23:18:53 +0000 Subject: [PATCH 6/8] Make Utf16::codepoint locally inline --- src/cpp/encoding/Encodings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/encoding/Encodings.cpp b/src/cpp/encoding/Encodings.cpp index 5abd4c84d..23c83c38c 100644 --- a/src/cpp/encoding/Encodings.cpp +++ b/src/cpp/encoding/Encodings.cpp @@ -687,7 +687,7 @@ String cpp::encoding::Utf16::decode(const cpp::marshal::View& buffer) #endif } -char32_t cpp::encoding::Utf16::codepoint(const cpp::marshal::View& buffer) +inline char32_t cpp::encoding::Utf16::codepoint(const cpp::marshal::View& buffer) { auto first = static_cast(Marshal::readUInt16(buffer)); From 9411971dfcd17783e50f6c029548130b8b1f3c97 Mon Sep 17 00:00:00 2001 From: Tobiasz Laskowski Date: Tue, 10 Feb 2026 00:29:46 +0000 Subject: [PATCH 7/8] Simplify ascii check --- src/cpp/encoding/Encodings.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/cpp/encoding/Encodings.cpp b/src/cpp/encoding/Encodings.cpp index 23c83c38c..c5eb0401d 100644 --- a/src/cpp/encoding/Encodings.cpp +++ b/src/cpp/encoding/Encodings.cpp @@ -70,19 +70,13 @@ namespace { bool isAsciiUtf8Buffer(const View& buffer) { - auto i = int64_t{ 0 }; - while (i < buffer.length) + for (int64_t i = 0; i < buffer.length; i++) { - auto p = cpp::encoding::Utf8::codepoint(buffer.slice(i)); - - if (p > 127) + if (buffer[i] > 127) { return false; } - - i += cpp::encoding::Utf8::getByteCount(p); } - return true; } } From a441f63edb3437c4188dc4223e94ade5838e8302 Mon Sep 17 00:00:00 2001 From: Tobiasz Laskowski Date: Tue, 10 Feb 2026 03:45:08 +0000 Subject: [PATCH 8/8] Fix utf8 encode resize --- src/cpp/encoding/Encodings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/encoding/Encodings.cpp b/src/cpp/encoding/Encodings.cpp index c5eb0401d..f0853138c 100644 --- a/src/cpp/encoding/Encodings.cpp +++ b/src/cpp/encoding/Encodings.cpp @@ -260,7 +260,7 @@ Array cpp::encoding::Utf8::encode(const String& string) k += encode(p, buffer.slice(k)); } - out->resize(i); + out->resize(k); return out; #else