diff --git a/ios/extensions/StringExtension.mm b/ios/extensions/StringExtension.mm
index a4ad69e5..6f0a9d5a 100644
--- a/ios/extensions/StringExtension.mm
+++ b/ios/extensions/StringExtension.mm
@@ -51,6 +51,64 @@ + (NSDictionary *)getEscapedCharactersInfoFrom:(NSString *)text {
     }
   }
 
+  // Numeric character references: &#NNNN; (decimal) and &#xHHHH; (hex).
+  NSRegularExpression *numericEntityRegex = [NSRegularExpression
+      regularExpressionWithPattern:@"&#([xX][0-9a-fA-F]+|[0-9]+);"
+                           options:0
+                             error:nil];
+
+  [numericEntityRegex
+      enumerateMatchesInString:text
+                       options:0
+                         range:NSMakeRange(0, text.length)
+                    usingBlock:^(NSTextCheckingResult *match,
+                                 NSMatchingFlags flags, BOOL *stop) {
+        if (match == nil) {
+          return;
+        }
+        NSRange fullRange = [match range];
+        NSString *entityStr = [text substringWithRange:fullRange];
+        NSString *valueStr = [text substringWithRange:[match rangeAtIndex:1]];
+
+        // Capture group 1 is either "x"/"X" followed by hex digits, or plain
+        // decimal digits; the regex guarantees at least one digit.
+        BOOL isHex = [valueStr hasPrefix:@"x"] || [valueStr hasPrefix:@"X"];
+        NSString *digits = isHex ? [valueStr substringFromIndex:1] : valueStr;
+
+        // Keep the parsed value at strtoul's full width. Narrowing to
+        // UTF32Char before validating would wrap large inputs modulo 2^32
+        // (e.g. "&#x100000041;" would decode as "A"). On overflow strtoul
+        // returns ULONG_MAX, which the range check below also rejects.
+        unsigned long parsed =
+            strtoul([digits UTF8String], NULL, isHex ? 16 : 10);
+
+        // Numeric character references must name a Unicode scalar value:
+        // 0x1..0x10FFFF excluding the surrogate range 0xD800..0xDFFF. Per
+        // HTML5, code point 0 and every other invalid value decode to
+        // U+FFFD (REPLACEMENT CHARACTER).
+        BOOL isSurrogate = parsed >= 0xD800 && parsed <= 0xDFFF;
+        BOOL isScalarValue = parsed != 0 && parsed <= 0x10FFFF && !isSurrogate;
+        UTF32Char codePoint = isScalarValue ? (UTF32Char)parsed : 0xFFFD;
+
+        NSString *decoded;
+        if (codePoint <= 0xFFFF) {
+          // Basic Multilingual Plane: one UTF-16 code unit.
+          unichar ch = (unichar)codePoint;
+          decoded = [NSString stringWithCharacters:&ch length:1];
+        } else {
+          // Supplementary plane: NSString stores UTF-16, so encode the code
+          // point as a high/low surrogate pair.
+          UTF32Char offset = codePoint - 0x10000;
+          unichar surrogates[2] = {
+            (unichar)(0xD800 + (offset >> 10)),
+            (unichar)(0xDC00 + (offset & 0x3FF)),
+          };
+          decoded = [NSString stringWithCharacters:surrogates length:2];
+        }
+
+        results[@(fullRange.location)] = @[ entityStr, decoded ];
+      }];
+
   return results;
 }
 