Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions ios/extensions/StringExtension.mm
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,81 @@ + (NSDictionary *)getEscapedCharactersInfoFrom:(NSString *)text {
}
}

// Numeric character references: &#NNNN; (decimal) and &#xHHHH; (hex)
NSRegularExpression *numericEntityRegex = [NSRegularExpression
regularExpressionWithPattern:@"&#([xX][0-9a-fA-F]+|[0-9]+);"
options:0
error:nil];
Comment thread
Copilot marked this conversation as resolved.

// Decode every numeric character reference matched in `text` and record it in
// `results`, keyed by the match location, as @[ entity text, decoded text ].
[numericEntityRegex
    enumerateMatchesInString:text
                     options:0
                       range:NSMakeRange(0, text.length)
                  usingBlock:^(NSTextCheckingResult *match,
                               NSMatchingFlags flags, BOOL *stop) {
                    if (match == nil) {
                      return;
                    }
                    NSRange fullRange = [match range];
                    NSString *entityStr = [text substringWithRange:fullRange];
                    NSString *valueStr =
                        [text substringWithRange:[match rangeAtIndex:1]];

                    // A leading 'x'/'X' marks a hexadecimal reference;
                    // otherwise the digits are decimal.
                    BOOL isHex = [valueStr hasPrefix:@"x"] ||
                                 [valueStr hasPrefix:@"X"];
                    NSString *digits =
                        isHex ? [valueStr substringFromIndex:1] : valueStr;

                    // Parse into a wide integer and validate BEFORE narrowing
                    // to 32 bits. The regex accepts any number of digits, so
                    // casting first would truncate values such as
                    // 4294967361 (2^32 + 65) to 0x41 and let them slip past
                    // the range check. strtoull saturates at ULLONG_MAX on
                    // overflow, which the range check below also rejects.
                    unsigned long long parsed =
                        strtoull([digits UTF8String], NULL, isHex ? 16 : 10);

                    // HTML numeric character references should map to valid
                    // Unicode scalar values (0x1..0x10FFFF), excluding
                    // surrogate code points (0xD800-0xDFFF). Per HTML5, code
                    // point 0 is treated as U+FFFD. Replace every invalid
                    // value with U+FFFD (Replacement Character).
                    BOOL isSurrogate = (parsed >= 0xD800 && parsed <= 0xDFFF);
                    UTF32Char codePoint = 0xFFFD;
                    if (parsed != 0 && parsed <= 0x10FFFF && !isSurrogate) {
                      codePoint = (UTF32Char)parsed;
                    }

                    NSString *decoded;
                    if (codePoint <= 0xFFFF) {
                      // Basic Multilingual Plane: fits in one UTF-16 unit.
                      unichar ch = (unichar)codePoint;
                      decoded = [NSString stringWithCharacters:&ch length:1];
                    } else {
                      // Supplementary plane: NSString stores UTF-16, so the
                      // code point is split into a high/low surrogate pair.
                      UTF32Char offset = codePoint - 0x10000;
                      UniChar surrogate[2];
                      surrogate[0] = (UniChar)(0xD800 + (offset >> 10));
                      surrogate[1] = (UniChar)(0xDC00 + (offset & 0x3FF));
                      decoded = [NSString stringWithCharacters:surrogate
                                                        length:2];
                    }

                    results[@(fullRange.location)] = @[ entityStr, decoded ];
                  }];

return results;
}

Expand Down
Loading