Commit f263e7d9 authored by ap@webkit.org

Reviewed by Darin.

        http://bugs.webkit.org/show_bug.cgi?id=15953
        Add UTF-8 encoding/decoding to WTF

        * kjs/ustring.h: Moved UTF8SequenceLength() and decodeUTF8Sequence() to wtf/unicode.
        * kjs/ustring.cpp: (KJS::UString::UTF8String): Changed this function to take a strict/lenient
        parameter. Callers that request strict mode are not interested in partial decoding
        results, so this allows bailing out as soon as an error is seen.
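
        A minimal sketch of the new calling convention, mirroring the
        KJS::encode() change below (in strict mode, a null result signals
        malformed UTF-16):

            CString utf8 = str.UTF8String(true); // strict: bail out on error
            if (!utf8.c_str())
                return throwError(exec, URIError, "String contained an illegal UTF-16 sequence.");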

        * kjs/function.cpp:
        (KJS::encode): Updated for new UString::UTF8String() signature.

        * API/JSStringRef.cpp:
        (JSStringCreateWithCharacters): Disambiguate UChar.
        (JSStringCreateWithUTF8CString): Actually use UTF-8 when creating the string!
        * bindings/c/c_utility.cpp: (KJS::Bindings::convertUTF8ToUTF16): Use ConvertUTF8ToUTF16().

        * wtf/unicode/UTF8.cpp: Added.
        (WTF::Unicode::inlineUTF8SequenceLengthNonASCII):
        (WTF::Unicode::inlineUTF8SequenceLength):
        (WTF::Unicode::UTF8SequenceLength):
        (WTF::Unicode::decodeUTF8Sequence):
        (WTF::Unicode::):
        (WTF::Unicode::ConvertUTF16ToUTF8):
        (WTF::Unicode::isLegalUTF8):
        (WTF::Unicode::ConvertUTF8ToUTF16):
        * wtf/unicode/UTF8.h: Added.
        (WTF::Unicode::):
        Some code moved from ustring.h, some adapted from unicode.org sources.

        * JavaScriptCore.exp:
        * JavaScriptCore.pri:
        * JavaScriptCore.vcproj/WTF/WTF.vcproj:
        * JavaScriptCore.xcodeproj/project.pbxproj:
        * JavaScriptCoreSources.bkl:
        Added UTF8.{h,cpp}



git-svn-id: http://svn.webkit.org/repository/webkit/trunk@27746 268f45cc-cd09-0410-ab3c-d52691b4dbfc
parent 16d9491e
@@ -36,20 +36,27 @@
#include <kjs/operations.h>
#include <kjs/ustring.h>
#include <kjs/value.h>
#include <wtf/unicode/UTF8.h>
using namespace KJS;
using namespace WTF::Unicode;
JSStringRef JSStringCreateWithCharacters(const JSChar* chars, size_t numChars)
{
JSLock lock;
return toRef(UString(reinterpret_cast<const UChar*>(chars), static_cast<int>(numChars)).rep()->ref());
return toRef(UString(reinterpret_cast<const KJS::UChar*>(chars), static_cast<int>(numChars)).rep()->ref());
}
JSStringRef JSStringCreateWithUTF8CString(const char* string)
{
JSLock lock;
// FIXME: <rdar://problem/4949018>
return toRef(UString(string).rep()->ref());
size_t length = strlen(string);
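// The UTF-16 result can never have more code units than the UTF-8 input
// has bytes, so a buffer of length units is always large enough.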
Vector< ::UChar, 1024> buffer(length);
::UChar* p = buffer.data();
ConvertUTF8ToUTF16(&string, string + length, &p, p + length, false);
return toRef(UString(reinterpret_cast<KJS::UChar*>(buffer.data()), p - buffer.data()).rep()->ref());
}
JSStringRef JSStringRetain(JSStringRef string)
......
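For reference, a minimal sketch of the fixed call (hypothetical caller; JSStringRelease is the matching release function in the JavaScriptCore C API):

    JSStringRef str = JSStringCreateWithUTF8CString("caf\xC3\xA9"); // UTF-8 bytes for "café"
    // ... use str ...
    JSStringRelease(str);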
2007-11-12 Alexey Proskuryakov <ap@webkit.org>
Reviewed by Darin.
http://bugs.webkit.org/show_bug.cgi?id=15953
Add UTF-8 encoding/decoding to WTF
* kjs/ustring.h: Moved UTF8SequenceLength() and decodeUTF8Sequence() to wtf/unicode.
* kjs/ustring.cpp: (KJS::UString::UTF8String): Changed this function to take a strict/lenient
parameter. Callers that request strict mode are not interested in partial decoding
results, so this allows bailing out as soon as an error is seen.
* kjs/function.cpp:
(KJS::encode): Updated for new UString::UTF8String() signature.
* API/JSStringRef.cpp:
(JSStringCreateWithCharacters): Disambiguate UChar.
(JSStringCreateWithUTF8CString): Actually use UTF-8 when creating the string!
* bindings/c/c_utility.cpp: (KJS::Bindings::convertUTF8ToUTF16): Use ConvertUTF8ToUTF16().
* wtf/unicode/UTF8.cpp: Added.
(WTF::Unicode::inlineUTF8SequenceLengthNonASCII):
(WTF::Unicode::inlineUTF8SequenceLength):
(WTF::Unicode::UTF8SequenceLength):
(WTF::Unicode::decodeUTF8Sequence):
(WTF::Unicode::):
(WTF::Unicode::ConvertUTF16ToUTF8):
(WTF::Unicode::isLegalUTF8):
(WTF::Unicode::ConvertUTF8ToUTF16):
* wtf/unicode/UTF8.h: Added.
(WTF::Unicode::):
Some code moved from ustring.h, some adapted from unicode.org sources.
* JavaScriptCore.exp:
* JavaScriptCore.pri:
* JavaScriptCore.vcproj/WTF/WTF.vcproj:
* JavaScriptCore.xcodeproj/project.pbxproj:
* JavaScriptCoreSources.bkl:
Added UTF8.{h,cpp}
2007-11-12 Josh Aas <joshmoz@gmail.com>
Reviewed by Darin.
......
@@ -259,7 +259,7 @@ __ZNK3KJS7JSValue15toInt32SlowCaseEPNS_9ExecStateERb
__ZNK3KJS7JSValue16toUInt32SlowCaseEPNS_9ExecStateERb
__ZNK3KJS7JSValue7toFloatEPNS_9ExecStateE
__ZNK3KJS7JSValue9toIntegerEPNS_9ExecStateE
__ZNK3KJS7UString10UTF8StringEv
__ZNK3KJS7UString10UTF8StringEb
__ZNK3KJS7UString14toStrictUInt32EPb
__ZNK3KJS7UString5asciiEv
__ZNK3KJS7UString6is8BitEv
......
@@ -33,6 +33,7 @@ SOURCES += \
wtf/Assertions.cpp \
wtf/HashTable.cpp \
wtf/FastMalloc.cpp \
wtf/unicode/UTF8.cpp \
bindings/NP_jsobject.cpp \
bindings/npruntime.cpp \
bindings/runtime_array.cpp \
......
@@ -311,6 +311,14 @@
RelativePath="..\..\wtf\VectorTraits.h"
>
</File>
<File
RelativePath="..\..\wtf\unicode\UTF8.h"
>
</File>
<File
RelativePath="..\..\wtf\unicode\UTF8.cpp"
>
</File>
</Files>
<Globals>
</Globals>
......
@@ -235,6 +235,8 @@
E11D51760B2E798D0056C188 /* StringExtras.h in Headers */ = {isa = PBXBuildFile; fileRef = E11D51750B2E798D0056C188 /* StringExtras.h */; settings = {ATTRIBUTES = (Private, ); }; };
E195679609E7CF1200B89D13 /* UnicodeIcu.h in Headers */ = {isa = PBXBuildFile; fileRef = E195678F09E7CF1200B89D13 /* UnicodeIcu.h */; settings = {ATTRIBUTES = (Private, ); }; };
E195679809E7CF1200B89D13 /* Unicode.h in Headers */ = {isa = PBXBuildFile; fileRef = E195679409E7CF1200B89D13 /* Unicode.h */; settings = {ATTRIBUTES = (Private, ); }; };
E1EF79AA0CE97BA60088D500 /* UTF8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E1EF79A80CE97BA60088D500 /* UTF8.cpp */; };
E1EF79AB0CE97BA60088D500 /* UTF8.h in Headers */ = {isa = PBXBuildFile; fileRef = E1EF79A90CE97BA60088D500 /* UTF8.h */; };
/* End PBXBuildFile section */
/* Begin PBXBuildRule section */
@@ -589,6 +591,8 @@
E11D51750B2E798D0056C188 /* StringExtras.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = StringExtras.h; sourceTree = "<group>"; };
E195678F09E7CF1200B89D13 /* UnicodeIcu.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = UnicodeIcu.h; sourceTree = "<group>"; };
E195679409E7CF1200B89D13 /* Unicode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Unicode.h; sourceTree = "<group>"; };
E1EF79A80CE97BA60088D500 /* UTF8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = UTF8.cpp; sourceTree = "<group>"; };
E1EF79A90CE97BA60088D500 /* UTF8.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = UTF8.h; sourceTree = "<group>"; };
F5BB2BC5030F772101FCFE1D /* completion.h */ = {isa = PBXFileReference; fileEncoding = 30; indentWidth = 4; lastKnownFileType = sourcecode.c.h; path = completion.h; sourceTree = "<group>"; tabWidth = 8; };
F5C290E60284F98E018635CA /* JavaScriptCorePrefix.h */ = {isa = PBXFileReference; fileEncoding = 30; indentWidth = 4; lastKnownFileType = sourcecode.c.h; name = JavaScriptCorePrefix.h; path = ../JavaScriptCorePrefix.h; sourceTree = "<group>"; tabWidth = 8; };
F5FFE656026B47A6018635CA /* nodes2string.cpp */ = {isa = PBXFileReference; fileEncoding = 30; indentWidth = 4; lastKnownFileType = sourcecode.cpp.cpp; path = nodes2string.cpp; sourceTree = "<group>"; tabWidth = 8; };
@@ -1087,6 +1091,8 @@
children = (
E195678E09E7CF1200B89D13 /* icu */,
E195679409E7CF1200B89D13 /* Unicode.h */,
E1EF79A90CE97BA60088D500 /* UTF8.h */,
E1EF79A80CE97BA60088D500 /* UTF8.cpp */,
);
path = unicode;
sourceTree = "<group>";
@@ -1253,6 +1259,7 @@
93E26BFE08B151D400F85226 /* ucpinternal.h in Headers */,
932F5B5C0822A1C700736975 /* ustring.h in Headers */,
14ABB36F099C076400E2A24F /* value.h in Headers */,
E1EF79AB0CE97BA60088D500 /* UTF8.h in Headers */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@@ -1368,7 +1375,6 @@
0867D690FE84028FC02AAC07 /* Project object */ = {
isa = PBXProject;
buildConfigurationList = 149C277108902AFE008A9EFC /* Build configuration list for PBXProject "JavaScriptCore" */;
compatibilityVersion = "Xcode 2.4";
hasScannedForEncodings = 1;
mainGroup = 0867D691FE84028FC02AAC07 /* JavaScriptCore */;
productRefGroup = 034768DFFF38A50411DB9C8B /* Products */;
@@ -1541,6 +1547,7 @@
932F5BBD0822A1C700736975 /* runtime_method.cpp in Sources */,
932F5BBA0822A1C700736975 /* runtime_object.cpp in Sources */,
932F5BC50822A1C700736975 /* runtime_root.cpp in Sources */,
E1EF79AA0CE97BA60088D500 /* UTF8.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
......
@@ -113,6 +113,7 @@ Source files for JSCore.
wtf/FastMalloc.cpp
wtf/HashTable.cpp
wtf/TCSystemAlloc.cpp
wtf/unicode/UTF8.cpp
</set>
</makefile>
@@ -38,10 +38,10 @@
#include "runtime_object.h"
#include "runtime_root.h"
#include "Platform.h"
#if USE(ICU_UNICODE)
#include <unicode/ucnv.h>
#endif
#include <wtf/Assertions.h>
#include <wtf/unicode/UTF8.h>
using namespace WTF::Unicode;
namespace KJS { namespace Bindings {
@@ -52,46 +52,40 @@ void convertNPStringToUTF16(const NPString *string, NPUTF16 **UTF16Chars, unsigned
}
// Requires free() of returned UTF16Chars.
void convertUTF8ToUTF16(const NPUTF8 *UTF8Chars, int UTF8Length, NPUTF16 **UTF16Chars, unsigned int *UTF16Length)
void convertUTF8ToUTF16(const NPUTF8* UTF8Chars, int UTF8Length, NPUTF16** UTF16Chars, unsigned int* UTF16Length)
{
#if USE(ICU_UNICODE)
ASSERT(UTF8Chars || UTF8Length == 0);
ASSERT(UTF16Chars);
if (UTF8Length == -1)
UTF8Length = static_cast<int>(strlen(UTF8Chars));
// UTF16Length maximum length is the length of the UTF8 string, plus one to include terminator
// Without the plus one, it will convert ok, but a warning is generated from the converter as
// there is not enough room for a terminating character.
*UTF16Length = UTF8Length + 1;
*UTF16Chars = 0;
UErrorCode status = U_ZERO_ERROR;
UConverter* conv = ucnv_open("utf8", &status);
if (U_SUCCESS(status)) {
*UTF16Chars = (NPUTF16 *)malloc(sizeof(NPUTF16) * (*UTF16Length));
ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP, 0, 0, 0, &status);
*UTF16Length = ucnv_toUChars(conv, (::UChar*)*UTF16Chars, *UTF16Length, UTF8Chars, UTF8Length, &status);
ucnv_close(conv);
}
*UTF16Length = UTF8Length;
*UTF16Chars = static_cast<NPUTF16*>(malloc(sizeof(NPUTF16) * (*UTF16Length)));
const char* sourcestart = UTF8Chars;
const char* sourceend = sourcestart + UTF8Length;
::UChar* targetstart = reinterpret_cast< ::UChar*>(*UTF16Chars);
::UChar* targetend = targetstart + UTF8Length;
ConversionResult result = ConvertUTF8ToUTF16(&sourcestart, sourceend, &targetstart, targetend, true);
*UTF16Length = targetstart - *UTF16Chars;
// Check to see if the conversion was successful
// Some plugins return invalid UTF-8 in NPVariantType_String, see <http://bugs.webkit.org/show_bug.cgi?id=5163>
// There is no "bad data" for latin1. It is unlikely that the plugin was really sending text in this encoding,
// but it should have used UTF-8, and now we are simply avoiding a crash.
if (!U_SUCCESS(status)) {
if (result != conversionOK) {
*UTF16Length = UTF8Length;
if (!*UTF16Chars) // If the memory wasn't allocated, allocate it.
*UTF16Chars = (NPUTF16 *)malloc(sizeof(NPUTF16) * (*UTF16Length));
*UTF16Chars = (NPUTF16*)malloc(sizeof(NPUTF16) * (*UTF16Length));
for (unsigned i = 0; i < *UTF16Length; i++)
(*UTF16Chars)[i] = UTF8Chars[i] & 0xFF;
}
#else
ASSERT(!"Implement me!");
#endif
}
// Variant value must be released with NPReleaseVariantValue()
......
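A hypothetical caller of convertUTF8ToUTF16(), following the ownership rule in the comment above (the returned buffer must be free()d; utf8Chars here is an assumed NUL-terminated input):

    NPUTF16* utf16Chars;
    unsigned int utf16Length;
    convertUTF8ToUTF16(utf8Chars, -1, &utf16Chars, &utf16Length); // -1: take length from strlen()
    // ... use utf16Chars[0 .. utf16Length - 1] ...
    free(utf16Chars);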
@@ -42,7 +42,7 @@
#include <wtf/ASCIICType.h>
#include <wtf/Assertions.h>
#include <wtf/MathExtras.h>
#include <wtf/unicode/Unicode.h>
#include <wtf/unicode/UTF8.h>
using namespace WTF;
using namespace Unicode;
@@ -514,9 +514,8 @@ GlobalFuncImp::GlobalFuncImp(ExecState* exec, FunctionPrototype* funcProto, int
static JSValue* encode(ExecState* exec, const List& args, const char* do_not_escape)
{
UString r = "", s, str = args[0]->toString(exec);
bool wasGoodUTF16;
CString cstr = str.UTF8String(&wasGoodUTF16);
if (!wasGoodUTF16)
CString cstr = str.UTF8String(true);
if (!cstr.c_str())
return throwError(exec, URIError, "String contained an illegal UTF-16 sequence.");
const char* p = cstr.c_str();
for (size_t k = 0; k < cstr.size(); k++, p++) {
......
@@ -1271,145 +1271,20 @@ int compare(const UString& s1, const UString& s2)
return (l1 > l2) ? 1 : -1;
}
inline int inlineUTF8SequenceLengthNonASCII(char b0)
CString UString::UTF8String(bool strict) const
{
if ((b0 & 0xC0) != 0xC0)
return 0;
if ((b0 & 0xE0) == 0xC0)
return 2;
if ((b0 & 0xF0) == 0xE0)
return 3;
if ((b0 & 0xF8) == 0xF0)
return 4;
return 0;
}
int UTF8SequenceLengthNonASCII(char b0)
{
return inlineUTF8SequenceLengthNonASCII(b0);
}
inline int inlineUTF8SequenceLength(char b0)
{
return (b0 & 0x80) == 0 ? 1 : UTF8SequenceLengthNonASCII(b0);
}
// Given a first byte, gives the length of the UTF-8 sequence it begins.
// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
// Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
int UTF8SequenceLength(char b0)
{
return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
}
// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
// Only allows Unicode characters (U-00000000 to U-0010FFFF).
// Returns -1 if the sequence is not valid (including presence of extra bytes).
int decodeUTF8Sequence(const char *sequence)
{
// Handle 0-byte sequences (never valid).
const unsigned char b0 = sequence[0];
const int length = inlineUTF8SequenceLength(b0);
if (length == 0)
return -1;
// Handle 1-byte sequences (plain ASCII).
const unsigned char b1 = sequence[1];
if (length == 1) {
if (b1)
return -1;
return b0;
}
// Handle 2-byte sequences.
if ((b1 & 0xC0) != 0x80)
return -1;
const unsigned char b2 = sequence[2];
if (length == 2) {
if (b2)
return -1;
const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
if (c < 0x80)
return -1;
return c;
}
// Handle 3-byte sequences.
if ((b2 & 0xC0) != 0x80)
return -1;
const unsigned char b3 = sequence[3];
if (length == 3) {
if (b3)
return -1;
const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
if (c < 0x800)
return -1;
// UTF-16 surrogates should never appear in UTF-8 data.
if (c >= 0xD800 && c <= 0xDFFF)
return -1;
return c;
}
// Handle 4-byte sequences.
if ((b3 & 0xC0) != 0x80)
return -1;
const unsigned char b4 = sequence[4];
if (length == 4) {
if (b4)
return -1;
const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
if (c < 0x10000 || c > 0x10FFFF)
return -1;
return c;
}
return -1;
}
CString UString::UTF8String(bool* utf16WasGood) const
{
if (utf16WasGood)
*utf16WasGood = true;
// Allocate a buffer big enough to hold all the characters.
const int length = size();
Vector<char, 1024> buffer(length * 3);
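// Each UTF-16 code unit expands to at most three UTF-8 bytes (a surrogate
// pair, two units, expands to four), so length * 3 is always sufficient.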
// Convert to runs of 8-bit characters.
char *p = buffer.begin();
const UChar *d = data();
for (int i = 0; i != length; ++i) {
unsigned short c = d[i].unicode();
if (c < 0x80) {
*p++ = (char)c;
} else if (c < 0x800) {
*p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
*p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
} else if (c >= 0xD800 && c <= 0xDBFF && i < length && d[i+1].uc >= 0xDC00 && d[i+1].uc <= 0xDFFF) {
unsigned sc = 0x10000 + (((c & 0x3FF) << 10) | (d[i+1].uc & 0x3FF));
*p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8
*p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set
*p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
*p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set
++i;
} else {
if (utf16WasGood && c >= 0xD800 && c <= 0xDFFF)
*utf16WasGood = false;
*p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
*p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
*p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
}
}
char* p = buffer.data();
const ::UChar* d = &data()->uc;
ConversionResult result = ConvertUTF16ToUTF8(&d, d + length, &p, p + buffer.size(), strict);
if (result != conversionOK)
return CString();
// Return the result as a C string.
CString result(buffer.data(), p - buffer.data());
return result;
}
CString UString::UTF8String() const
{
return UTF8String(0);
return CString(buffer.data(), p - buffer.data());
}
......
@@ -265,6 +265,8 @@ namespace KJS {
/**
* @return The string converted to the 8-bit string type CString().
* This method is not Unicode safe and shouldn't be used unless the string
* is known to be ASCII.
*/
CString cstring() const;
/**
@@ -278,13 +280,13 @@
/**
* Convert the string to UTF-8, assuming it is UTF-16 encoded.
* Since this function is tolerant of badly formed UTF-16, it can create UTF-8
* strings that are invalid because they have characters in the range
* U+D800-U+DDFF, U+FFFE, or U+FFFF, but the UTF-8 string is guaranteed to
* be otherwise valid.
* In non-strict mode, this function is tolerant of badly formed UTF-16; it
* can create UTF-8 strings that are invalid because they have characters in
* the range U+D800-U+DFFF, U+FFFE, or U+FFFF, but the UTF-8 string is
* guaranteed to be otherwise valid.
* In strict mode, an error is reported by returning a null CString.
*/
CString UTF8String() const;
CString UTF8String(bool* utf16WasGood) const;
CString UTF8String(bool strict = false) const;
/**
* @see UString(const DOM::DOMString&).
@@ -427,16 +429,6 @@
int compare(const UString &, const UString &);
// Given a first byte, gives the length of the UTF-8 sequence it begins.
// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
// Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
int UTF8SequenceLength(char);
// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
// Only allows Unicode characters (U-00000000 to U-0010FFFF).
// Returns -1 if the sequence is not valid (including presence of extra bytes).
int decodeUTF8Sequence(const char *);
inline UString::UString()
: m_rep(&Rep::null)
{
......
/*
* Copyright (C) 2007 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "UTF8.h"
namespace WTF {
namespace Unicode {
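// UTF-8 lead bytes encode the sequence length in their high bits:
// 110xxxxx (0xC0) starts a 2-byte sequence, 1110xxxx (0xE0) a 3-byte
// sequence, and 11110xxx (0xF0) a 4-byte sequence.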
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
if ((b0 & 0xC0) != 0xC0)
return 0;
if ((b0 & 0xE0) == 0xC0)
return 2;
if ((b0 & 0xF0) == 0xE0)
return 3;
if ((b0 & 0xF8) == 0xF0)
return 4;
return 0;
}
inline int inlineUTF8SequenceLength(char b0)
{
return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
}
int UTF8SequenceLength(char b0)
{
return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
}
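// Worked example: decodeUTF8Sequence("\xE2\x82\xAC") returns 0x20AC (EURO SIGN),
// since 0xE2 begins a 3-byte sequence and
// ((0xE2 & 0xF) << 12) | ((0x82 & 0x3F) << 6) | (0xAC & 0x3F) == 0x20AC.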
int decodeUTF8Sequence(const char* sequence)
{
// Handle 0-byte sequences (never valid).
const unsigned char b0 = sequence[0];
const int length = inlineUTF8SequenceLength(b0);
if (length == 0)
return -1;
// Handle 1-byte sequences (plain ASCII).
const unsigned char b1 = sequence[1];
if (length == 1) {
if (b1)
return -1;
return b0;
}
// Handle 2-byte sequences.
if ((b1 & 0xC0) != 0x80)
return -1;
const unsigned char b2 = sequence[2];
if (length == 2) {
if (b2)
return -1;
const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
if (c < 0x80)
return -1;
return c;
}
// Handle 3-byte sequences.
if ((b2 & 0xC0) != 0x80)
return -1;
const unsigned char b3 = sequence[3];
if (length == 3) {
if (b3)
return -1;
const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
if (c < 0x800)
return -1;
// UTF-16 surrogates should never appear in UTF-8 data.
if (c >= 0xD800 && c <= 0xDFFF)
return -1;
return c;
}
// Handle 4-byte sequences.
if ((b3 & 0xC0) != 0x80)
return -1;
const unsigned char b4 = sequence[4];
if (length == 4) {
if (b4)
return -1;
const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
if (c < 0x10000 || c > 0x10FFFF)
return -1;
return c;
}
return -1;
}
// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
// into the first byte, depending on how many bytes follow. There are
// as many entries in this table as there are UTF-8 sequence types.
// (I.e., one byte sequence, two byte... etc.). Remember that sequences
// for *legal* UTF-8 will be 4 or fewer bytes total.
static const char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
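// E.g. the first byte of a 3-byte sequence carries firstByteMark[3]
// (0xE0, binary 1110xxxx) OR-ed with the character's top bits.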
ConversionResult ConvertUTF16ToUTF8(
const UChar** sourceStart, const UChar* sourceEnd,
char** targetStart, char* targetEnd, bool strict)
{
ConversionResult result = conversionOK;
const UChar* source = *sourceStart;
char* target = *targetStart;
while (source < sourceEnd) {
UChar32 ch;
unsigned short bytesToWrite = 0;
const UChar32 byteMask = 0xBF;
const UChar32 byteMark = 0x80;
const UChar* oldSource = source; // In case we have to back up because of target overflow.
ch = static_cast<unsigned short>(*source++);
// If we have a surrogate pair, convert to UChar32 first.
if (ch >= 0xD800 && ch <= 0xDBFF) {
// If the 16 bits following the high surrogate are in the source buffer...
if (source < sourceEnd) {
UChar32 ch2 = static_cast<unsigned short>(*source);
// If it's a low surrogate, convert to UChar32.
if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
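// E.g. the pair <0xD834, 0xDD1E> combines to U+1D11E:
// ((0xD834 - 0xD800) << 10) + (0xDD1E - 0xDC00) + 0x10000 == 0x1D11E.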
++source;
} else if (strict) { // it's an unpaired high surrogate
--source; // return to the illegal value itself
result = sourceIllegal;
break;
}
} else { // We don't have the 16 bits following the high surrogate.