From f263e7d9bb994533fb9811c635798c05f96cc62b Mon Sep 17 00:00:00 2001 From: "ap@webkit.org" Date: Tue, 13 Nov 2007 07:12:55 +0000 Subject: [PATCH] Reviewed by Darin. http://bugs.webkit.org/show_bug.cgi?id=15953 Add UTF-8 encoding/decoding to WTF * kjs/ustring.h: Moved UTF8SequenceLength() and decodeUTF8Sequence() to wtf/unicode. * kjs/ustring.cpp: (KJS::UString::UTF8String): Changed this function to take a strict/lenient parameter. Callers are not interested in getting decoding results in strict mode, so this allows for bailing out as soon as an error is seen. * kjs/function.cpp: (KJS::encode): Updated for new UString::UTF8String() signature. * API/JSStringRef.cpp: (JSStringCreateWithCharacters): Disambiguate UChar. (JSStringCreateWithUTF8CString): Actually use UTF-8 when creating the string! * bindings/c/c_utility.cpp: (KJS::Bindings::convertUTF8ToUTF16): Use ConvertUTF8ToUTF16(). * wtf/unicode/UTF8.cpp: Added. (WTF::Unicode::inlineUTF8SequenceLengthNonASCII): (WTF::Unicode::inlineUTF8SequenceLength): (WTF::Unicode::UTF8SequenceLength): (WTF::Unicode::decodeUTF8Sequence): (WTF::Unicode::): (WTF::Unicode::ConvertUTF16ToUTF8): (WTF::Unicode::isLegalUTF8): (WTF::Unicode::ConvertUTF8ToUTF16): * wtf/unicode/UTF8.h: Added. (WTF::Unicode::): Some code moved from ustring.h, some adapted from unicode.org sources. 
* JavaScriptCore.exp: * JavaScriptCore.pri: * JavaScriptCore.vcproj/WTF/WTF.vcproj: * JavaScriptCore.xcodeproj/project.pbxproj: * JavaScriptCoreSources.bkl: Added UTF8.{h,cpp} git-svn-id: http://svn.webkit.org/repository/webkit/trunk@27746 268f45cc-cd09-0410-ab3c-d52691b4dbfc --- JavaScriptCore/API/JSStringRef.cpp | 13 +- JavaScriptCore/ChangeLog | 40 +++ JavaScriptCore/JavaScriptCore.exp | 2 +- JavaScriptCore/JavaScriptCore.pri | 1 + .../JavaScriptCore.vcproj/WTF/WTF.vcproj | 8 + .../JavaScriptCore.xcodeproj/project.pbxproj | 9 +- JavaScriptCore/JavaScriptCoreSources.bkl | 1 + JavaScriptCore/bindings/c/c_utility.cpp | 44 ++- JavaScriptCore/kjs/function.cpp | 7 +- JavaScriptCore/kjs/ustring.cpp | 139 +------- JavaScriptCore/kjs/ustring.h | 24 +- JavaScriptCore/wtf/unicode/UTF8.cpp | 303 ++++++++++++++++++ JavaScriptCore/wtf/unicode/UTF8.h | 75 +++++ 13 files changed, 484 insertions(+), 182 deletions(-) create mode 100644 JavaScriptCore/wtf/unicode/UTF8.cpp create mode 100644 JavaScriptCore/wtf/unicode/UTF8.h diff --git a/JavaScriptCore/API/JSStringRef.cpp b/JavaScriptCore/API/JSStringRef.cpp index 4fe767cb384..e9bca87627a 100644 --- a/JavaScriptCore/API/JSStringRef.cpp +++ b/JavaScriptCore/API/JSStringRef.cpp @@ -36,20 +36,27 @@ #include #include #include +#include using namespace KJS; +using namespace WTF::Unicode; JSStringRef JSStringCreateWithCharacters(const JSChar* chars, size_t numChars) { JSLock lock; - return toRef(UString(reinterpret_cast(chars), static_cast(numChars)).rep()->ref()); + return toRef(UString(reinterpret_cast(chars), static_cast(numChars)).rep()->ref()); } JSStringRef JSStringCreateWithUTF8CString(const char* string) { JSLock lock; - // FIXME: - return toRef(UString(string).rep()->ref()); + + size_t length = strlen(string); + Vector< ::UChar, 1024> buffer(length); + ::UChar* p = buffer.data(); + ConvertUTF8ToUTF16(&string, string + length, &p, p + length, false); + + return toRef(UString(reinterpret_cast(buffer.data()), p - 
buffer.data()).rep()->ref()); } JSStringRef JSStringRetain(JSStringRef string) diff --git a/JavaScriptCore/ChangeLog b/JavaScriptCore/ChangeLog index 7eb293a7cde..3aca314e309 100644 --- a/JavaScriptCore/ChangeLog +++ b/JavaScriptCore/ChangeLog @@ -1,3 +1,43 @@ +2007-11-12 Alexey Proskuryakov + + Reviewed by Darin. + + http://bugs.webkit.org/show_bug.cgi?id=15953 + Add UTF-8 encoding/decoding to WTF + + * kjs/ustring.h: Moved UTF8SequenceLength() and decodeUTF8Sequence() to wtf/unicode. + * kjs/ustring.cpp: (KJS::UString::UTF8String): Changed this function to take a strict/lenient + parameter. Callers are not interested in getting decoding results in strict mode, so + this allows for bailing out as soon as an error is seen. + + * kjs/function.cpp: + (KJS::encode): Updated for new UString::UTF8String() signature. + + * API/JSStringRef.cpp: + (JSStringCreateWithCharacters): Disambiguate UChar. + (JSStringCreateWithUTF8CString): Actually use UTF-8 when creating the string! + * bindings/c/c_utility.cpp: (KJS::Bindings::convertUTF8ToUTF16): Use ConvertUTF8ToUTF16(). + + * wtf/unicode/UTF8.cpp: Added. + (WTF::Unicode::inlineUTF8SequenceLengthNonASCII): + (WTF::Unicode::inlineUTF8SequenceLength): + (WTF::Unicode::UTF8SequenceLength): + (WTF::Unicode::decodeUTF8Sequence): + (WTF::Unicode::): + (WTF::Unicode::ConvertUTF16ToUTF8): + (WTF::Unicode::isLegalUTF8): + (WTF::Unicode::ConvertUTF8ToUTF16): + * wtf/unicode/UTF8.h: Added. + (WTF::Unicode::): + Some code moved from ustring.h, some adapted from unicode.org sources. + + * JavaScriptCore.exp: + * JavaScriptCore.pri: + * JavaScriptCore.vcproj/WTF/WTF.vcproj: + * JavaScriptCore.xcodeproj/project.pbxproj: + * JavaScriptCoreSources.bkl: + Added UTF8.{h,cpp} + 2007-11-12 Josh Aas Reviewed by Darin. 
diff --git a/JavaScriptCore/JavaScriptCore.exp b/JavaScriptCore/JavaScriptCore.exp index d8a50b1eb49..a3cecd56dc8 100644 --- a/JavaScriptCore/JavaScriptCore.exp +++ b/JavaScriptCore/JavaScriptCore.exp @@ -259,7 +259,7 @@ __ZNK3KJS7JSValue15toInt32SlowCaseEPNS_9ExecStateERb __ZNK3KJS7JSValue16toUInt32SlowCaseEPNS_9ExecStateERb __ZNK3KJS7JSValue7toFloatEPNS_9ExecStateE __ZNK3KJS7JSValue9toIntegerEPNS_9ExecStateE -__ZNK3KJS7UString10UTF8StringEv +__ZNK3KJS7UString10UTF8StringEb __ZNK3KJS7UString14toStrictUInt32EPb __ZNK3KJS7UString5asciiEv __ZNK3KJS7UString6is8BitEv diff --git a/JavaScriptCore/JavaScriptCore.pri b/JavaScriptCore/JavaScriptCore.pri index 5147b2837c2..e481374927d 100644 --- a/JavaScriptCore/JavaScriptCore.pri +++ b/JavaScriptCore/JavaScriptCore.pri @@ -33,6 +33,7 @@ SOURCES += \ wtf/Assertions.cpp \ wtf/HashTable.cpp \ wtf/FastMalloc.cpp \ + wtf/unicode/UTF8.cpp \ bindings/NP_jsobject.cpp \ bindings/npruntime.cpp \ bindings/runtime_array.cpp \ diff --git a/JavaScriptCore/JavaScriptCore.vcproj/WTF/WTF.vcproj b/JavaScriptCore/JavaScriptCore.vcproj/WTF/WTF.vcproj index babc57e0cfd..b35a6e91e21 100644 --- a/JavaScriptCore/JavaScriptCore.vcproj/WTF/WTF.vcproj +++ b/JavaScriptCore/JavaScriptCore.vcproj/WTF/WTF.vcproj @@ -311,6 +311,14 @@ RelativePath="..\..\wtf\VectorTraits.h" > + + + + diff --git a/JavaScriptCore/JavaScriptCore.xcodeproj/project.pbxproj b/JavaScriptCore/JavaScriptCore.xcodeproj/project.pbxproj index 5f99c4b338d..7cef1e5cd2b 100644 --- a/JavaScriptCore/JavaScriptCore.xcodeproj/project.pbxproj +++ b/JavaScriptCore/JavaScriptCore.xcodeproj/project.pbxproj @@ -235,6 +235,8 @@ E11D51760B2E798D0056C188 /* StringExtras.h in Headers */ = {isa = PBXBuildFile; fileRef = E11D51750B2E798D0056C188 /* StringExtras.h */; settings = {ATTRIBUTES = (Private, ); }; }; E195679609E7CF1200B89D13 /* UnicodeIcu.h in Headers */ = {isa = PBXBuildFile; fileRef = E195678F09E7CF1200B89D13 /* UnicodeIcu.h */; settings = {ATTRIBUTES = (Private, ); }; }; 
E195679809E7CF1200B89D13 /* Unicode.h in Headers */ = {isa = PBXBuildFile; fileRef = E195679409E7CF1200B89D13 /* Unicode.h */; settings = {ATTRIBUTES = (Private, ); }; }; + E1EF79AA0CE97BA60088D500 /* UTF8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E1EF79A80CE97BA60088D500 /* UTF8.cpp */; }; + E1EF79AB0CE97BA60088D500 /* UTF8.h in Headers */ = {isa = PBXBuildFile; fileRef = E1EF79A90CE97BA60088D500 /* UTF8.h */; }; /* End PBXBuildFile section */ /* Begin PBXBuildRule section */ @@ -589,6 +591,8 @@ E11D51750B2E798D0056C188 /* StringExtras.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = StringExtras.h; sourceTree = ""; }; E195678F09E7CF1200B89D13 /* UnicodeIcu.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = UnicodeIcu.h; sourceTree = ""; }; E195679409E7CF1200B89D13 /* Unicode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Unicode.h; sourceTree = ""; }; + E1EF79A80CE97BA60088D500 /* UTF8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = UTF8.cpp; sourceTree = ""; }; + E1EF79A90CE97BA60088D500 /* UTF8.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = UTF8.h; sourceTree = ""; }; F5BB2BC5030F772101FCFE1D /* completion.h */ = {isa = PBXFileReference; fileEncoding = 30; indentWidth = 4; lastKnownFileType = sourcecode.c.h; path = completion.h; sourceTree = ""; tabWidth = 8; }; F5C290E60284F98E018635CA /* JavaScriptCorePrefix.h */ = {isa = PBXFileReference; fileEncoding = 30; indentWidth = 4; lastKnownFileType = sourcecode.c.h; name = JavaScriptCorePrefix.h; path = ../JavaScriptCorePrefix.h; sourceTree = ""; tabWidth = 8; }; F5FFE656026B47A6018635CA /* nodes2string.cpp */ = {isa = PBXFileReference; fileEncoding = 30; indentWidth = 4; lastKnownFileType = sourcecode.cpp.cpp; path = nodes2string.cpp; sourceTree = ""; tabWidth = 8; }; @@ -1087,6 
+1091,8 @@ children = ( E195678E09E7CF1200B89D13 /* icu */, E195679409E7CF1200B89D13 /* Unicode.h */, + E1EF79A90CE97BA60088D500 /* UTF8.h */, + E1EF79A80CE97BA60088D500 /* UTF8.cpp */, ); path = unicode; sourceTree = ""; @@ -1253,6 +1259,7 @@ 93E26BFE08B151D400F85226 /* ucpinternal.h in Headers */, 932F5B5C0822A1C700736975 /* ustring.h in Headers */, 14ABB36F099C076400E2A24F /* value.h in Headers */, + E1EF79AB0CE97BA60088D500 /* UTF8.h in Headers */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -1368,7 +1375,6 @@ 0867D690FE84028FC02AAC07 /* Project object */ = { isa = PBXProject; buildConfigurationList = 149C277108902AFE008A9EFC /* Build configuration list for PBXProject "JavaScriptCore" */; - compatibilityVersion = "Xcode 2.4"; hasScannedForEncodings = 1; mainGroup = 0867D691FE84028FC02AAC07 /* JavaScriptCore */; productRefGroup = 034768DFFF38A50411DB9C8B /* Products */; @@ -1541,6 +1547,7 @@ 932F5BBD0822A1C700736975 /* runtime_method.cpp in Sources */, 932F5BBA0822A1C700736975 /* runtime_object.cpp in Sources */, 932F5BC50822A1C700736975 /* runtime_root.cpp in Sources */, + E1EF79AA0CE97BA60088D500 /* UTF8.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/JavaScriptCore/JavaScriptCoreSources.bkl b/JavaScriptCore/JavaScriptCoreSources.bkl index 337108fa940..9b933d9813f 100644 --- a/JavaScriptCore/JavaScriptCoreSources.bkl +++ b/JavaScriptCore/JavaScriptCoreSources.bkl @@ -113,6 +113,7 @@ Source files for JSCore. 
wtf/FastMalloc.cpp wtf/HashTable.cpp wtf/TCSystemAlloc.cpp + wtf/unicode/UTF8.cpp diff --git a/JavaScriptCore/bindings/c/c_utility.cpp b/JavaScriptCore/bindings/c/c_utility.cpp index c20daa80005..9cc26ccc415 100644 --- a/JavaScriptCore/bindings/c/c_utility.cpp +++ b/JavaScriptCore/bindings/c/c_utility.cpp @@ -38,10 +38,10 @@ #include "runtime_object.h" #include "runtime_root.h" #include "Platform.h" -#if USE(ICU_UNICODE) -#include -#endif #include +#include + +using namespace WTF::Unicode; namespace KJS { namespace Bindings { @@ -52,46 +52,40 @@ void convertNPStringToUTF16(const NPString *string, NPUTF16 **UTF16Chars, unsign } // Requires free() of returned UTF16Chars. -void convertUTF8ToUTF16(const NPUTF8 *UTF8Chars, int UTF8Length, NPUTF16 **UTF16Chars, unsigned int *UTF16Length) +void convertUTF8ToUTF16(const NPUTF8* UTF8Chars, int UTF8Length, NPUTF16** UTF16Chars, unsigned int* UTF16Length) { -#if USE(ICU_UNICODE) ASSERT(UTF8Chars || UTF8Length == 0); ASSERT(UTF16Chars); if (UTF8Length == -1) UTF8Length = static_cast(strlen(UTF8Chars)); - - // UTF16Length maximum length is the length of the UTF8 string, plus one to include terminator - // Without the plus one, it will convert ok, but a warning is generated from the converter as - // there is not enough room for a terminating character. 
- *UTF16Length = UTF8Length + 1; - - *UTF16Chars = 0; - UErrorCode status = U_ZERO_ERROR; - UConverter* conv = ucnv_open("utf8", &status); - if (U_SUCCESS(status)) { - *UTF16Chars = (NPUTF16 *)malloc(sizeof(NPUTF16) * (*UTF16Length)); - ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP, 0, 0, 0, &status); - *UTF16Length = ucnv_toUChars(conv, (::UChar*)*UTF16Chars, *UTF16Length, UTF8Chars, UTF8Length, &status); - ucnv_close(conv); - } + + *UTF16Length = UTF8Length; + *UTF16Chars = static_cast(malloc(sizeof(NPUTF16) * (*UTF16Length))); + const char* sourcestart = UTF8Chars; + const char* sourceend = sourcestart + UTF8Length; + + ::UChar* targetstart = reinterpret_cast< ::UChar*>(*UTF16Chars); + ::UChar* targetend = targetstart + UTF8Length; + + ConversionResult result = ConvertUTF8ToUTF16(&sourcestart, sourceend, &targetstart, targetend, true); + + *UTF16Length = targetstart - *UTF16Chars; + // Check to see if the conversion was successful // Some plugins return invalid UTF-8 in NPVariantType_String, see // There is no "bad data" for latin1. It is unlikely that the plugin was really sending text in this encoding, // but it should have used UTF-8, and now we are simply avoiding a crash. - if (!U_SUCCESS(status)) { + if (result != conversionOK) { *UTF16Length = UTF8Length; if (!*UTF16Chars) // If the memory wasn't allocated, allocate it. 
- *UTF16Chars = (NPUTF16 *)malloc(sizeof(NPUTF16) * (*UTF16Length)); + *UTF16Chars = (NPUTF16*)malloc(sizeof(NPUTF16) * (*UTF16Length)); for (unsigned i = 0; i < *UTF16Length; i++) (*UTF16Chars)[i] = UTF8Chars[i] & 0xFF; } -#else - ASSERT(!"Implement me!"); -#endif } // Variant value must be released with NPReleaseVariantValue() diff --git a/JavaScriptCore/kjs/function.cpp b/JavaScriptCore/kjs/function.cpp index 569ca180244..fadd37af215 100644 --- a/JavaScriptCore/kjs/function.cpp +++ b/JavaScriptCore/kjs/function.cpp @@ -42,7 +42,7 @@ #include #include #include -#include +#include using namespace WTF; using namespace Unicode; @@ -514,9 +514,8 @@ GlobalFuncImp::GlobalFuncImp(ExecState* exec, FunctionPrototype* funcProto, int static JSValue* encode(ExecState* exec, const List& args, const char* do_not_escape) { UString r = "", s, str = args[0]->toString(exec); - bool wasGoodUTF16; - CString cstr = str.UTF8String(&wasGoodUTF16); - if (!wasGoodUTF16) + CString cstr = str.UTF8String(true); + if (!cstr.c_str()) return throwError(exec, URIError, "String contained an illegal UTF-16 sequence."); const char* p = cstr.c_str(); for (size_t k = 0; k < cstr.size(); k++, p++) { diff --git a/JavaScriptCore/kjs/ustring.cpp b/JavaScriptCore/kjs/ustring.cpp index 8805a4dad78..2b4b54cb83e 100644 --- a/JavaScriptCore/kjs/ustring.cpp +++ b/JavaScriptCore/kjs/ustring.cpp @@ -1271,145 +1271,20 @@ int compare(const UString& s1, const UString& s2) return (l1 > l2) ? 1 : -1; } -inline int inlineUTF8SequenceLengthNonASCII(char b0) +CString UString::UTF8String(bool strict) const { - if ((b0 & 0xC0) != 0xC0) - return 0; - if ((b0 & 0xE0) == 0xC0) - return 2; - if ((b0 & 0xF0) == 0xE0) - return 3; - if ((b0 & 0xF8) == 0xF0) - return 4; - return 0; -} - -int UTF8SequenceLengthNonASCII(char b0) -{ - return inlineUTF8SequenceLengthNonASCII(b0); -} - -inline int inlineUTF8SequenceLength(char b0) -{ - return (b0 & 0x80) == 0 ? 
1 : UTF8SequenceLengthNonASCII(b0); -} - -// Given a first byte, gives the length of the UTF-8 sequence it begins. -// Returns 0 for bytes that are not legal starts of UTF-8 sequences. -// Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF). -int UTF8SequenceLength(char b0) -{ - return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); -} - -// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character. -// Only allows Unicode characters (U-00000000 to U-0010FFFF). -// Returns -1 if the sequence is not valid (including presence of extra bytes). -int decodeUTF8Sequence(const char *sequence) -{ - // Handle 0-byte sequences (never valid). - const unsigned char b0 = sequence[0]; - const int length = inlineUTF8SequenceLength(b0); - if (length == 0) - return -1; - - // Handle 1-byte sequences (plain ASCII). - const unsigned char b1 = sequence[1]; - if (length == 1) { - if (b1) - return -1; - return b0; - } - - // Handle 2-byte sequences. - if ((b1 & 0xC0) != 0x80) - return -1; - const unsigned char b2 = sequence[2]; - if (length == 2) { - if (b2) - return -1; - const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F); - if (c < 0x80) - return -1; - return c; - } - - // Handle 3-byte sequences. - if ((b2 & 0xC0) != 0x80) - return -1; - const unsigned char b3 = sequence[3]; - if (length == 3) { - if (b3) - return -1; - const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); - if (c < 0x800) - return -1; - // UTF-16 surrogates should never appear in UTF-8 data. - if (c >= 0xD800 && c <= 0xDFFF) - return -1; - return c; - } - - // Handle 4-byte sequences. 
- if ((b3 & 0xC0) != 0x80) - return -1; - const unsigned char b4 = sequence[4]; - if (length == 4) { - if (b4) - return -1; - const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); - if (c < 0x10000 || c > 0x10FFFF) - return -1; - return c; - } - - return -1; -} - -CString UString::UTF8String(bool* utf16WasGood) const -{ - if (utf16WasGood) - *utf16WasGood = true; - // Allocate a buffer big enough to hold all the characters. const int length = size(); Vector buffer(length * 3); // Convert to runs of 8-bit characters. - char *p = buffer.begin(); - const UChar *d = data(); - for (int i = 0; i != length; ++i) { - unsigned short c = d[i].unicode(); - if (c < 0x80) { - *p++ = (char)c; - } else if (c < 0x800) { - *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8 - *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set - } else if (c >= 0xD800 && c <= 0xDBFF && i < length && d[i+1].uc >= 0xDC00 && d[i+1].uc <= 0xDFFF) { - unsigned sc = 0x10000 + (((c & 0x3FF) << 10) | (d[i+1].uc & 0x3FF)); - *p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8 - *p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set - *p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set - *p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set - ++i; - } else { - if (utf16WasGood && c >= 0xD800 && c <= 0xDFFF) - *utf16WasGood = false; - *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8 - *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set - *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set - } - } + char* p = buffer.data(); + const ::UChar* d = &data()->uc; + ConversionResult result = ConvertUTF16ToUTF8(&d, d + length, &p, p + buffer.size(), strict); + if (result != conversionOK) + return CString(); - // Return the result as a C string. 
- CString result(buffer.data(), p - buffer.data()); - - return result; -} - -CString UString::UTF8String() const -{ - return UTF8String(0); + return CString(buffer.data(), p - buffer.data()); } diff --git a/JavaScriptCore/kjs/ustring.h b/JavaScriptCore/kjs/ustring.h index e5bf33df6c7..412514ba12f 100644 --- a/JavaScriptCore/kjs/ustring.h +++ b/JavaScriptCore/kjs/ustring.h @@ -265,6 +265,8 @@ namespace KJS { /** * @return The string converted to the 8-bit string type CString(). + * This method is not Unicode safe and shouldn't be used unless the string + * is known to be ASCII. */ CString cstring() const; /** @@ -278,13 +280,13 @@ namespace KJS { /** * Convert the string to UTF-8, assuming it is UTF-16 encoded. - * Since this function is tolerant of badly formed UTF-16, it can create UTF-8 - * strings that are invalid because they have characters in the range - * U+D800-U+DDFF, U+FFFE, or U+FFFF, but the UTF-8 string is guaranteed to - * be otherwise valid. + * In non-strict mode, this function is tolerant of badly formed UTF-16, it + * can create UTF-8 strings that are invalid because they have characters in + * the range U+D800-U+DDFF, U+FFFE, or U+FFFF, but the UTF-8 string is + * guaranteed to be otherwise valid. + * In strict mode, error is returned as null CString. */ - CString UTF8String() const; - CString UTF8String(bool* utf16WasGood) const; + CString UTF8String(bool strict = false) const; /** * @see UString(const DOM::DOMString&). @@ -427,16 +429,6 @@ namespace KJS { int compare(const UString &, const UString &); - // Given a first byte, gives the length of the UTF-8 sequence it begins. - // Returns 0 for bytes that are not legal starts of UTF-8 sequences. - // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF). - int UTF8SequenceLength(char); - - // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character. 
- // Only allows Unicode characters (U-00000000 to U-0010FFFF). - // Returns -1 if the sequence is not valid (including presence of extra bytes). - int decodeUTF8Sequence(const char *); - inline UString::UString() : m_rep(&Rep::null) { diff --git a/JavaScriptCore/wtf/unicode/UTF8.cpp b/JavaScriptCore/wtf/unicode/UTF8.cpp new file mode 100644 index 00000000000..527195aee41 --- /dev/null +++ b/JavaScriptCore/wtf/unicode/UTF8.cpp @@ -0,0 +1,303 @@ +/* + * Copyright (C) 2007 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "UTF8.h" + +namespace WTF { +namespace Unicode { + +inline int inlineUTF8SequenceLengthNonASCII(char b0) +{ + if ((b0 & 0xC0) != 0xC0) + return 0; + if ((b0 & 0xE0) == 0xC0) + return 2; + if ((b0 & 0xF0) == 0xE0) + return 3; + if ((b0 & 0xF8) == 0xF0) + return 4; + return 0; +} + +inline int inlineUTF8SequenceLength(char b0) +{ + return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); +} + +int UTF8SequenceLength(char b0) +{ + return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); +} + +int decodeUTF8Sequence(const char* sequence) +{ + // Handle 0-byte sequences (never valid). + const unsigned char b0 = sequence[0]; + const int length = inlineUTF8SequenceLength(b0); + if (length == 0) + return -1; + + // Handle 1-byte sequences (plain ASCII). + const unsigned char b1 = sequence[1]; + if (length == 1) { + if (b1) + return -1; + return b0; + } + + // Handle 2-byte sequences. + if ((b1 & 0xC0) != 0x80) + return -1; + const unsigned char b2 = sequence[2]; + if (length == 2) { + if (b2) + return -1; + const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F); + if (c < 0x80) + return -1; + return c; + } + + // Handle 3-byte sequences. + if ((b2 & 0xC0) != 0x80) + return -1; + const unsigned char b3 = sequence[3]; + if (length == 3) { + if (b3) + return -1; + const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); + if (c < 0x800) + return -1; + // UTF-16 surrogates should never appear in UTF-8 data. + if (c >= 0xD800 && c <= 0xDFFF) + return -1; + return c; + } + + // Handle 4-byte sequences. + if ((b3 & 0xC0) != 0x80) + return -1; + const unsigned char b4 = sequence[4]; + if (length == 4) { + if (b4) + return -1; + const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); + if (c < 0x10000 || c > 0x10FFFF) + return -1; + return c; + } + + return -1; +} + +// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed +// into the first byte, depending on how many bytes follow. 
There are +// as many entries in this table as there are UTF-8 sequence types. +// (I.e., one byte sequence, two byte... etc.). Remember that sequences +// for *legal* UTF-8 will be 4 or fewer bytes total. +static const char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + +ConversionResult ConvertUTF16ToUTF8( + const UChar** sourceStart, const UChar* sourceEnd, + char** targetStart, char* targetEnd, bool strict) +{ + ConversionResult result = conversionOK; + const UChar* source = *sourceStart; + char* target = *targetStart; + while (source < sourceEnd) { + UChar32 ch; + unsigned short bytesToWrite = 0; + const UChar32 byteMask = 0xBF; + const UChar32 byteMark = 0x80; + const UChar* oldSource = source; // In case we have to back up because of target overflow. + ch = static_cast<UChar32>(*source++); + // If we have a surrogate pair, convert to UChar32 first. + if (ch >= 0xD800 && ch <= 0xDBFF) { + // If the 16 bits following the high surrogate are in the source buffer... + if (source < sourceEnd) { + UChar32 ch2 = static_cast<UChar32>(*source); + // If it's a low surrogate, convert to UChar32. + if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { + ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; + ++source; + } else if (strict) { // it's an unpaired high surrogate + --source; // return to the illegal value itself + result = sourceIllegal; + break; + } + } else { // We don't have the 16 bits following the high surrogate. 
+ --source; // return to the high surrogate + result = sourceExhausted; + break; + } + } else if (strict) { + // UTF-16 surrogate values are illegal in UTF-32 + if (ch >= 0xDC00 && ch <= 0xDFFF) { + --source; // return to the illegal value itself + result = sourceIllegal; + break; + } + } + // Figure out how many bytes the result will require + if (ch < (UChar32)0x80) { + bytesToWrite = 1; + } else if (ch < (UChar32)0x800) { + bytesToWrite = 2; + } else if (ch < (UChar32)0x10000) { + bytesToWrite = 3; + } else if (ch < (UChar32)0x110000) { + bytesToWrite = 4; + } else { + bytesToWrite = 3; + ch = 0xFFFD; + } + + target += bytesToWrite; + if (target > targetEnd) { + source = oldSource; // Back up source pointer! + target -= bytesToWrite; + result = targetExhausted; + break; + } + switch (bytesToWrite) { // note: everything falls through. + case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (char)(ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; +} + +// This must be called with the length pre-determined by the first byte. +// If presented with a length > 4, this returns false. The Unicode +// definition of UTF-8 goes up to 4-byte sequences. +static bool isLegalUTF8(const unsigned char* source, int length) +{ + unsigned char a; + const unsigned char* srcptr = source + length; + switch (length) { + default: return false; + // Everything else falls through when "true"... 
+ case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) > 0xBF) return false; + + switch (*source) { + // no fall-through in this inner switch + case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if (a > 0x9F) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; + } + + case 1: if (*source >= 0x80 && *source < 0xC2) return false; + } + if (*source > 0xF4) + return false; + return true; +} + +// Magic values subtracted from a buffer value during UTF8 conversion. +// This table contains as many values as there might be trailing bytes +// in a UTF-8 sequence. +static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; + +ConversionResult ConvertUTF8ToUTF16( + const char** sourceStart, const char* sourceEnd, + UChar** targetStart, UChar* targetEnd, bool strict) +{ + ConversionResult result = conversionOK; + const char* source = *sourceStart; + UChar* target = *targetStart; + while (source < sourceEnd) { + UChar32 ch = 0; + unsigned short extraBytesToRead = UTF8SequenceLength(*source) - 1; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; + break; + } + // Do this check whether lenient or strict + if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), extraBytesToRead + 1)) { + result = sourceIllegal; + break; + } + // The cases all fall through. 
+ switch (extraBytesToRead) { + case 5: ch += static_cast(*source++); ch <<= 6; // remember, illegal UTF-8 + case 4: ch += static_cast(*source++); ch <<= 6; // remember, illegal UTF-8 + case 3: ch += static_cast(*source++); ch <<= 6; + case 2: ch += static_cast(*source++); ch <<= 6; + case 1: ch += static_cast(*source++); ch <<= 6; + case 0: ch += static_cast(*source++); + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead + 1); // Back up source pointer! + result = targetExhausted; break; + } + if (ch <= 0xFFFF) { + // UTF-16 surrogate values are illegal in UTF-32 + if (ch >= 0xD800 && ch <= 0xDFFF) { + if (strict) { + source -= (extraBytesToRead + 1); // return to the illegal value itself + result = sourceIllegal; + break; + } else + *target++ = 0xFFFD; + } else + *target++ = (UChar)ch; // normal case + } else if (ch > 0x10FFFF) { + if (strict) { + result = sourceIllegal; + source -= (extraBytesToRead + 1); // return to the start + break; // Bail out; shouldn't continue + } else + *target++ = 0xFFFD; + } else { + // target is a character in range 0xFFFF - 0x10FFFF + if (target + 1 >= targetEnd) { + source -= (extraBytesToRead + 1); // Back up source pointer! + result = targetExhausted; + break; + } + ch -= 0x0010000UL; + *target++ = (UChar)((ch >> 10) + 0xD800); + *target++ = (UChar)((ch & 0x03FF) + 0xDC00); + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +} +} diff --git a/JavaScriptCore/wtf/unicode/UTF8.h b/JavaScriptCore/wtf/unicode/UTF8.h new file mode 100644 index 00000000000..a09bc41212e --- /dev/null +++ b/JavaScriptCore/wtf/unicode/UTF8.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2007 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef WTF_UTF8_h +#define WTF_UTF8_h + +#include "Unicode.h" + +namespace WTF { + namespace Unicode { + + // Given a first byte, gives the length of the UTF-8 sequence it begins. + // Returns 0 for bytes that are not legal starts of UTF-8 sequences. + // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF). + int UTF8SequenceLength(char); + + // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character. + // Only allows Unicode characters (U-00000000 to U-0010FFFF). + // Returns -1 if the sequence is not valid (including presence of extra bytes). + int decodeUTF8Sequence(const char*); + + typedef enum { + conversionOK, // conversion successful + sourceExhausted, // partial character in source, but hit end + targetExhausted, // insuff. 
room in target for conversion + sourceIllegal // source sequence is illegal/malformed + } ConversionResult; + + // These conversion functions take a "strict" argument. When this + // flag is set to strict, both irregular sequences and isolated surrogates + // will cause an error. When the flag is set to lenient, both irregular + // sequences and isolated surrogates are converted. + // + // Whether the flag is strict or lenient, all illegal sequences will cause + // an error return. This includes sequences such as: , , + // or in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code + // must check for illegal sequences. + // + // When the flag is set to lenient, characters over 0x10FFFF are converted + // to the replacement character; otherwise (when the flag is set to strict) + // they constitute an error. + + ConversionResult ConvertUTF8ToUTF16( + const char** sourceStart, const char* sourceEnd, + UChar** targetStart, UChar* targetEnd, bool strict = true); + + ConversionResult ConvertUTF16ToUTF8( + const UChar** sourceStart, const UChar* sourceEnd, + char** targetStart, char* targetEnd, bool strict = true); + } +} + +#endif // WTF_UTF8_h -- GitLab