Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 54 additions & 50 deletions src/encoding_binding.cc
Original file line number | Diff line number | Diff line change
Expand Up @@ -340,53 +340,43 @@ void BindingData::EncodeUtf8String(const FunctionCallbackInfo<Value>& args) {

size_t length = source->Length();
size_t utf8_length = 0;
bool is_one_byte = source->IsOneByte();

if (is_one_byte) {
// One-byte string (Latin1) - copy to buffer first, then process
MaybeStackBuffer<uint8_t, MAX_SIZE_FOR_STACK_ALLOC> latin1_buffer(length);
source->WriteOneByteV2(isolate, 0, length, latin1_buffer.out());

auto data = reinterpret_cast<const char*>(latin1_buffer.out());

// Check if it's pure ASCII - if so, we can just copy
simdutf::result result = simdutf::validate_ascii_with_errors(data, length);
if (result.error == simdutf::SUCCESS) {
// Pure ASCII - direct copy
std::unique_ptr<BackingStore> bs = ArrayBuffer::NewBackingStore(
isolate, length, BackingStoreInitializationMode::kUninitialized);
CHECK(bs);
memcpy(bs->Data(), data, length);
Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, std::move(bs));
args.GetReturnValue().Set(Uint8Array::New(ab, 0, length));
return;
}

// Latin1 with non-ASCII characters - need conversion
utf8_length = simdutf::utf8_length_from_latin1(data, length);
std::unique_ptr<BackingStore> bs = ArrayBuffer::NewBackingStore(
isolate, utf8_length, BackingStoreInitializationMode::kUninitialized);
CHECK(bs);
[[maybe_unused]] size_t written = simdutf::convert_latin1_to_utf8(
data, length, static_cast<char*>(bs->Data()));
DCHECK_EQ(written, utf8_length);
Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, std::move(bs));
args.GetReturnValue().Set(Uint8Array::New(ab, 0, utf8_length));
return;
// Inspect the string's flat content directly to determine the encoding and
// the exact UTF-8 output size, without copying it out of the V8 heap.
//
// v8::String::ValueView holds a DisallowGarbageCollection scope, so it must
// be released before allocating the backing store below. Flattening is cached
// on the string, so re-acquiring the view for the conversion pass is cheap.
bool is_one_byte;
bool is_ascii = false;
bool is_well_formed = true;
{
v8::String::ValueView view(isolate, source);
is_one_byte = view.is_one_byte();
if (is_one_byte) {
auto data = reinterpret_cast<const char*>(view.data8());
is_ascii = simdutf::validate_ascii_with_errors(data, length).error ==
simdutf::SUCCESS;
utf8_length =
is_ascii ? length : simdutf::utf8_length_from_latin1(data, length);
} else {
auto data = reinterpret_cast<const char16_t*>(view.data16());
is_well_formed = simdutf::validate_utf16_with_errors(data, length).error ==
simdutf::SUCCESS;
if (is_well_formed) {
utf8_length = simdutf::utf8_length_from_utf16(data, length);
}
}
}

// Two-byte string (UTF-16) - copy to buffer first
MaybeStackBuffer<uint16_t, MAX_SIZE_FOR_STACK_ALLOC> utf16_buffer(length);
source->WriteV2(isolate, 0, length, utf16_buffer.out());

auto data = reinterpret_cast<char16_t*>(utf16_buffer.out());

// Check for unpaired surrogates
simdutf::result validation_result =
simdutf::validate_utf16_with_errors(data, length);
// Rare path: two-byte string with unpaired surrogates. Copy into a mutable
// buffer, make it well-formed, then encode.
if (!is_well_formed) {
MaybeStackBuffer<uint16_t, MAX_SIZE_FOR_STACK_ALLOC> utf16_buffer(length);
source->WriteV2(isolate, 0, length, utf16_buffer.out());
auto data = reinterpret_cast<char16_t*>(utf16_buffer.out());
simdutf::to_well_formed_utf16(data, length, data);

if (validation_result.error == simdutf::SUCCESS) {
// Valid UTF-16 - use the fast path
utf8_length = simdutf::utf8_length_from_utf16(data, length);
std::unique_ptr<BackingStore> bs = ArrayBuffer::NewBackingStore(
isolate, utf8_length, BackingStoreInitializationMode::kUninitialized);
Expand All @@ -399,16 +389,30 @@ void BindingData::EncodeUtf8String(const FunctionCallbackInfo<Value>& args) {
return;
}

// Invalid UTF-16 with unpaired surrogates - convert to well-formed in place
simdutf::to_well_formed_utf16(data, length, data);

utf8_length = simdutf::utf8_length_from_utf16(data, length);
// Common path: allocate the exact-size output, then re-acquire the flat
// content and encode directly into the backing store.
std::unique_ptr<BackingStore> bs = ArrayBuffer::NewBackingStore(
isolate, utf8_length, BackingStoreInitializationMode::kUninitialized);
CHECK(bs);
[[maybe_unused]] size_t written = simdutf::convert_utf16_to_utf8(
data, length, static_cast<char*>(bs->Data()));
DCHECK_EQ(written, utf8_length);
char* out = static_cast<char*>(bs->Data());
{
v8::String::ValueView view(isolate, source);
if (is_one_byte) {
auto data = reinterpret_cast<const char*>(view.data8());
if (is_ascii) {
memcpy(out, data, length);
} else {
[[maybe_unused]] size_t written =
simdutf::convert_latin1_to_utf8(data, length, out);
DCHECK_EQ(written, utf8_length);
}
} else {
auto data = reinterpret_cast<const char16_t*>(view.data16());
[[maybe_unused]] size_t written =
simdutf::convert_utf16_to_utf8(data, length, out);
DCHECK_EQ(written, utf8_length);
}
}
Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, std::move(bs));
args.GetReturnValue().Set(Uint8Array::New(ab, 0, utf8_length));
}
Expand Down
Loading