TextCodecICU.cpp 16.5 KB
Newer Older
ap's avatar
ap committed
1
/*
darin@apple.com's avatar
darin@apple.com committed
2
 * Copyright (C) 2004, 2006, 2007, 2008 Apple Inc. All rights reserved.
ap's avatar
ap committed
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "config.h"
darin's avatar
darin committed
28
#include "TextCodecICU.h"
ap's avatar
ap committed
29

ap@webkit.org's avatar
ap@webkit.org committed
30
#include "CharacterNames.h"
darin's avatar
darin committed
31 32 33
#include "CString.h"
#include "PlatformString.h"
#include <unicode/ucnv.h>
34
#include <unicode/ucnv_cb.h>
weinig's avatar
weinig committed
35
#include <wtf/Assertions.h>
ap's avatar
ap committed
36

darin's avatar
darin committed
37
using std::auto_ptr;
ap's avatar
ap committed
38 39 40 41
using std::min;

namespace WebCore {

darin's avatar
darin committed
42
const size_t ConversionBufferSize = 16384;
darin@apple.com's avatar
darin@apple.com committed
43

ap's avatar
ap committed
44 45
static UConverter* cachedConverterICU;

darin's avatar
darin committed
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
// Factory function handed to the codec registrar: creates a TextCodecICU for
// the given encoding. The second argument is ignored.
static auto_ptr<TextCodec> newTextCodecICU(const TextEncoding& encoding, const void*)
{
    auto_ptr<TextCodec> codec(new TextCodecICU(encoding));
    return codec;
}

// Registers the base encoding names handled by this codec: only UTF-8,
// registered under its own name.
void TextCodecICU::registerBaseEncodingNames(EncodingNameRegistrar registrar)
{
    static const char* const utf8Name = "UTF-8";
    registrar(utf8Name, utf8Name);
}

// Registers the codec factory for the base encodings: UTF-8 is decoded and
// encoded through newTextCodecICU, with no additional data.
void TextCodecICU::registerBaseCodecs(TextCodecRegistrar registrar)
{
    registrar("UTF-8", &newTextCodecICU, 0);
}

darin's avatar
darin committed
61 62 63 64
// FIXME: Registering all the encodings we get from ucnv_getAvailableName
// includes encodings we don't want or need. For example: UTF16_PlatformEndian,
// UTF16_OppositeEndian, UTF32_PlatformEndian, UTF32_OppositeEndian, and all
// the encodings with commas and version numbers.
ap's avatar
ap committed
65

darin's avatar
darin committed
66
// Registers the names of all encodings ICU makes available, using the IANA
// standard name as the canonical name (with a few overrides, see below), and
// registers every ICU alias of each converter against that canonical name.
// A fixed list of extra aliases that ICU does not know about is added at the end.
void TextCodecICU::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
{
    // We register Hebrew with logical ordering using a separate name.
    // Otherwise, this would share the same canonical name as the
    // visual ordering case, and then TextEncoding could not tell them
    // apart; ICU works with either name.
    registrar("ISO-8859-8-I", "ISO-8859-8-I");

    int32_t numEncodings = ucnv_countAvailable();
    for (int32_t i = 0; i < numEncodings; ++i) {
        const char* name = ucnv_getAvailableName(i);
        UErrorCode error = U_ZERO_ERROR;
        // FIXME: Should we use the "MIME" standard instead of "IANA"?
        const char* standardName = ucnv_getStandardName(name, "IANA", &error);
        // Converters without an IANA name are not registered at all.
        if (!U_SUCCESS(error) || !standardName)
            continue;

        // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers.
        // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding
        //    for encoding GB_2312-80 and several others. So, we need to override this behavior, too.
        if (strcmp(standardName, "GB2312") == 0 || strcmp(standardName, "GB_2312-80") == 0)
            standardName = "GBK";
        // Similarly, EUC-KR encodings all map to an extended version.
        else if (strcmp(standardName, "KS_C_5601-1987") == 0 || strcmp(standardName, "EUC-KR") == 0)
            standardName = "windows-949-2000";
        // And so on.
        else if (strcmp(standardName, "ISO_8859-9:1989") == 0)
            standardName = "windows-1254";
        else if (strcmp(standardName, "TIS-620") == 0)
            standardName = "windows-874-2000";

        registrar(standardName, standardName);

        uint16_t numAliases = ucnv_countAliases(name, &error);
        ASSERT(U_SUCCESS(error));
        if (!U_SUCCESS(error))
            continue;

        for (uint16_t j = 0; j < numAliases; ++j) {
            error = U_ZERO_ERROR;
            const char* alias = ucnv_getAlias(name, j, &error);
            ASSERT(U_SUCCESS(error));
            // Skip the alias that is the canonical name itself; it was registered above.
            // Compare the string contents rather than the pointers: standardName may have
            // been replaced by one of the literals above, and ICU gives no guarantee that
            // equal names share storage.
            if (U_SUCCESS(error) && alias && strcmp(alias, standardName) != 0)
                registrar(alias, standardName);
        }
    }

    // Additional aliases.
    // Perhaps we can get these added to ICU.
    registrar("macroman", "macintosh");
    registrar("xmacroman", "macintosh");

    // Additional aliases that historically were present in the encoding
    // table in WebKit on Macintosh that don't seem to be present in ICU.
    // Perhaps we can prove these are not used on the web and remove them.
    // Or perhaps we can get them added to ICU.
    registrar("cnbig5", "Big5");
    registrar("cngb", "EUC-CN");
    registrar("csISO88598I", "ISO_8859-8-I");
    registrar("csgb231280", "EUC-CN");
    registrar("dos720", "cp864");
    registrar("dos874", "cp874");
    registrar("jis7", "ISO-2022-JP");
    registrar("koi", "KOI8-R");
    registrar("logical", "ISO-8859-8-I");
    registrar("unicode11utf8", "UTF-8");
    registrar("unicode20utf8", "UTF-8");
    registrar("visual", "ISO-8859-8");
    registrar("winarabic", "windows-1256");
    registrar("winbaltic", "windows-1257");
    registrar("wincyrillic", "windows-1251");
    registrar("windows874", "windows874-2000");
    registrar("iso885911", "windows874-2000");
    registrar("wingreek", "windows-1253");
    registrar("winhebrew", "windows-1255");
    registrar("winlatin2", "windows-1250");
    registrar("winturkish", "windows-1254");
    registrar("winvietnamese", "windows-1258");
    registrar("xcp1250", "windows-1250");
    registrar("xcp1251", "windows-1251");
    registrar("xeuc", "EUC-JP");
    registrar("xeuccn", "EUC-CN");
    registrar("xgbk", "EUC-CN");
    registrar("xunicode20utf8", "UTF-8");
    registrar("xxbig5", "Big5");
}

darin's avatar
darin committed
151
// Registers newTextCodecICU as the codec factory for every encoding ICU makes
// available that has an IANA standard name, plus the logical-order Hebrew name.
void TextCodecICU::registerExtendedCodecs(TextCodecRegistrar registrar)
{
    // Logical-order Hebrew gets its own name; see the comment in
    // registerExtendedEncodingNames.
    registrar("ISO-8859-8-I", newTextCodecICU, 0);

    const int32_t converterCount = ucnv_countAvailable();
    for (int32_t index = 0; index < converterCount; ++index) {
        UErrorCode status = U_ZERO_ERROR;
        // FIXME: Should we use the "MIME" standard instead of "IANA"?
        const char* ianaName = ucnv_getStandardName(ucnv_getAvailableName(index), "IANA", &status);
        // Only converters that have an IANA name get a codec.
        if (U_SUCCESS(status) && ianaName)
            registrar(ianaName, newTextCodecICU, 0);
    }
}

darin's avatar
darin committed
168 169 170 171
// Records the encoding only; the ICU converter itself is created on first use
// by decode() or encode().
TextCodecICU::TextCodecICU(const TextEncoding& encoding)
    : m_encoding(encoding), m_numBufferedBytes(0), m_converterICU(0), m_needsGBKFallbacks(false)
{
}
ap's avatar
ap committed
175

darin's avatar
darin committed
176 177 178
// Hands the ICU converter (if one was created) over to the single-entry
// converter cache instead of closing it.
TextCodecICU::~TextCodecICU()
{
    releaseICUConverter();
}

darin's avatar
darin committed
181
void TextCodecICU::releaseICUConverter() const
ap's avatar
ap committed
182
{
darin's avatar
darin committed
183 184 185 186 187
    if (m_converterICU) {
        if (cachedConverterICU)
            ucnv_close(cachedConverterICU);
        cachedConverterICU = m_converterICU;
        m_converterICU = 0;
ap's avatar
ap committed
188 189 190
    }
}

darin's avatar
darin committed
191
void TextCodecICU::createICUConverter() const
ap's avatar
ap committed
192
{
darin's avatar
darin committed
193 194
    ASSERT(!m_converterICU);

darin's avatar
darin committed
195 196 197
    const char* name = m_encoding.name();
    m_needsGBKFallbacks = name[0] == 'G' && name[1] == 'B' && name[2] == 'K' && !name[3];

darin's avatar
darin committed
198 199 200 201 202 203 204 205 206
    UErrorCode err;

    if (cachedConverterICU) {
        err = U_ZERO_ERROR;
        const char* cachedName = ucnv_getName(cachedConverterICU, &err);
        if (U_SUCCESS(err) && m_encoding == cachedName) {
            m_converterICU = cachedConverterICU;
            cachedConverterICU = 0;
            return;
ap's avatar
ap committed
207 208
        }
    }
darin's avatar
darin committed
209 210 211 212 213 214 215

    err = U_ZERO_ERROR;
    m_converterICU = ucnv_open(m_encoding.name(), &err);
#if !LOG_DISABLED
    if (err == U_AMBIGUOUS_ALIAS_WARNING)
        LOG_ERROR("ICU ambiguous alias warning for encoding: %s", m_encoding.name());
#endif
darin's avatar
darin committed
216 217
    if (m_converterICU)
        ucnv_setFallback(m_converterICU, TRUE);
ap's avatar
ap committed
218 219
}

eric@webkit.org's avatar
eric@webkit.org committed
220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
// Runs one ucnv_toUnicode pass from [source, sourceLimit) into
// [target, targetLimit). `source` is advanced past the consumed bytes and
// `err` receives the ICU status (it is cleared first). Returns the number of
// UChars written.
int TextCodecICU::decodeToBuffer(UChar* target, UChar* targetLimit, const char*& source, const char* sourceLimit, int32_t* offsets, bool flush, UErrorCode& err)
{
    UChar* const outputBegin = target;

    err = U_ZERO_ERROR;
    ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, offsets, flush, &err);

    // ucnv_toUnicode moved `target` to one past the last UChar it wrote.
    return target - outputBegin;
}

class ErrorCallbackSetter {
public:
    ErrorCallbackSetter(UConverter* converter, bool stopOnError)
        : m_converter(converter)
        , m_shouldStopOnEncodingErrors(stopOnError)
    {
        if (m_shouldStopOnEncodingErrors) {
            UErrorCode err = U_ZERO_ERROR;
            ucnv_setToUCallBack(m_converter, UCNV_TO_U_CALLBACK_SUBSTITUTE,
                           UCNV_SUB_STOP_ON_ILLEGAL, &m_savedAction,
                           &m_savedContext, &err);
            ASSERT(err == U_ZERO_ERROR);
        }
    }
    ~ErrorCallbackSetter()
    {
        if (m_shouldStopOnEncodingErrors) {
            UErrorCode err = U_ZERO_ERROR;
            const void* oldContext;
            UConverterToUCallback oldAction;
            ucnv_setToUCallBack(m_converter, m_savedAction,
                   m_savedContext, &oldAction,
                   &oldContext, &err);
            ASSERT(oldAction == UCNV_TO_U_CALLBACK_SUBSTITUTE);
252
            ASSERT(!strcmp(static_cast<const char*>(oldContext), UCNV_SUB_STOP_ON_ILLEGAL));
eric@webkit.org's avatar
eric@webkit.org committed
253 254 255 256 257 258 259 260 261 262 263
            ASSERT(err == U_ZERO_ERROR);
        }
    }
private:
    UConverter* m_converter;
    bool m_shouldStopOnEncodingErrors;
    const void* m_savedContext;
    UConverterToUCallback m_savedAction;
};

// Decodes `length` bytes starting at `bytes` into a String using the ICU
// converter for m_encoding. `flush` is forwarded to ICU. When `stopOnError`
// is true the converter is temporarily switched to stop on illegal input.
// `sawError` is set to true if ICU reported a failure; it is never set to
// false here. Returns a null String if no converter could be created.
String TextCodecICU::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    // The converter is created lazily on first use.
    if (!m_converterICU) {
        createICUConverter();
        ASSERT(m_converterICU);
        if (!m_converterICU) {
            LOG_ERROR("error creating ICU encoder even though encoding was in table");
            return String();
        }
    }

    // Restores the converter's previous error callback when this function returns.
    ErrorCallbackSetter callbackSetter(m_converterICU, stopOnError);

    Vector<UChar> decoded;

    UChar chunk[ConversionBufferSize];
    UChar* const chunkEnd = chunk + ConversionBufferSize;
    const char* cursor = bytes;
    const char* const inputEnd = bytes + length;
    UErrorCode status = U_ZERO_ERROR;

    // Convert one chunk at a time until ICU stops reporting a full output buffer.
    for (;;) {
        const int produced = decodeToBuffer(chunk, chunkEnd, cursor, inputEnd, 0, flush, status);
        decoded.append(chunk, produced);
        if (status != U_BUFFER_OVERFLOW_ERROR)
            break;
    }

    if (U_FAILURE(status)) {
        // Flush the converter so it can be reused without being affected by
        // this error; the output of these passes is discarded.
        do {
            decodeToBuffer(chunk, chunkEnd, cursor, inputEnd, 0, true, status);
        } while (cursor < inputEnd);
        sawError = true;
    }

    String resultString = String::adopt(decoded);

    // <http://bugs.webkit.org/show_bug.cgi?id=17014>
    // Simplified Chinese pages use the code A3A0 to mean "full-width space", but ICU decodes it as U+E5E5.
    const bool isSimplifiedChinese = m_encoding == "GBK" || m_encoding == "gb18030";
    if (isSimplifiedChinese)
        resultString.replace(0xE5E5, ideographicSpace);

    return resultString;
}

309 310 311
// We need to apply these fallbacks ourselves as they are not currently supported by ICU and
// they were provided by the old TEC encoding path
// Needed to fix <rdar://problem/4708689>
312 313 314 315 316 317 318 319 320 321 322 323 324
static UChar getGbkEscape(UChar32 codePoint)
{
    switch (codePoint) {
        case 0x01F9:
            return 0xE7C8;
        case 0x1E3F:
            return 0xE7C7;
        case 0x22EF:
            return 0x2026;
        case 0x301C:
            return 0xFF5E;
        default:
            return 0;
325 326 327
    }
}

darin@apple.com's avatar
darin@apple.com committed
328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343
// Invalid character handler when writing escaped entities for unrepresentable
// characters. See the declaration of TextCodec::encode for more.
static void urlEscapedEntityCallback(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length,
                                     UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err)
{
    if (reason == UCNV_UNASSIGNED) {
        *err = U_ZERO_ERROR;

        UnencodableReplacementArray entity;
        int entityLen = TextCodec::getUnencodableReplacement(codePoint, URLEncodedEntitiesForUnencodables, entity);
        ucnv_cbFromUWriteBytes(fromUArgs, entity, entityLen, 0, err);
    } else
        UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, err);
}

// Substitutes special GBK characters, escaping all other unassigned entities.
344 345 346
static void gbkCallbackEscape(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length,
                              UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err) 
{
347 348
    UChar outChar;
    if (reason == UCNV_UNASSIGNED && (outChar = getGbkEscape(codePoint))) {
349 350 351 352 353 354 355 356
        const UChar* source = &outChar;
        *err = U_ZERO_ERROR;
        ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, err);
        return;
    }
    UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, err);
}

darin@apple.com's avatar
darin@apple.com committed
357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373
// Combines both gbkUrlEscapedEntityCallback and GBK character substitution.
static void gbkUrlEscapedEntityCallack(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length,
                                       UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err) 
{
    if (reason == UCNV_UNASSIGNED) {
        if (UChar outChar = getGbkEscape(codePoint)) {
            const UChar* source = &outChar;
            *err = U_ZERO_ERROR;
            ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, err);
            return;
        }
        urlEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, reason, err);
        return;
    }
    UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, err);
}

374 375 376
static void gbkCallbackSubstitute(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length,
                                  UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err) 
{
377 378
    UChar outChar;
    if (reason == UCNV_UNASSIGNED && (outChar = getGbkEscape(codePoint))) {
379 380 381 382 383 384 385 386
        const UChar* source = &outChar;
        *err = U_ZERO_ERROR;
        ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, err);
        return;
    }
    UCNV_FROM_U_CALLBACK_SUBSTITUTE(context, fromUArgs, codeUnits, length, codePoint, reason, err);
}

darin@apple.com's avatar
darin@apple.com committed
387
// Encodes `length` UChars starting at `characters` into bytes in m_encoding.
// `handling` selects what is emitted for characters the encoding cannot
// represent (question marks, XML decimal entities, or URL-encoded entities);
// for GBK the callbacks additionally apply the GBK fallback characters.
// Returns an empty string for empty input and a null CString if the converter
// could not be created or configured.
CString TextCodecICU::encode(const UChar* characters, size_t length, UnencodableHandling handling)
{
    if (!length)
        return "";

    // The converter is created lazily on first use.
    if (!m_converterICU) {
        createICUConverter();
        if (!m_converterICU)
            return CString();
    }

    // FIXME: We should see if there is "force ASCII range" mode in ICU;
    // until then, we change the backslash into a yen sign.
    // Encoding will change the yen sign back into a backslash.
    String copy(characters, length);
    copy.replace('\\', m_encoding.backslashAsCurrencySymbol());

    const UChar* source = copy.characters();
    const UChar* const sourceLimit = source + copy.length();

    UErrorCode err = U_ZERO_ERROR;

    // Choose the callback (and its context) for unencodable characters.
    UConverterFromUCallback unencodableCallback = 0;
    const void* callbackContext = 0;
    switch (handling) {
        case QuestionMarksForUnencodables:
            ucnv_setSubstChars(m_converterICU, "?", 1, &err);
            unencodableCallback = m_needsGBKFallbacks ? gbkCallbackSubstitute : UCNV_FROM_U_CALLBACK_SUBSTITUTE;
            break;
        case EntitiesForUnencodables:
            unencodableCallback = m_needsGBKFallbacks ? gbkCallbackEscape : UCNV_FROM_U_CALLBACK_ESCAPE;
            callbackContext = UCNV_ESCAPE_XML_DEC;
            break;
        case URLEncodedEntitiesForUnencodables:
            unencodableCallback = m_needsGBKFallbacks ? gbkUrlEscapedEntityCallack : urlEscapedEntityCallback;
            break;
    }
    // Left null only when `handling` matched none of the cases above, in
    // which case the converter's callback is not touched.
    if (unencodableCallback)
        ucnv_setFromUCallBack(m_converterICU, unencodableCallback, callbackContext, 0, 0, &err);

    ASSERT(U_SUCCESS(err));
    if (U_FAILURE(err))
        return CString();

    // Convert one chunk at a time for as long as ICU reports a full output buffer.
    Vector<char> encoded;
    char chunk[ConversionBufferSize];
    do {
        char* out = chunk;
        err = U_ZERO_ERROR;
        ucnv_fromUnicode(m_converterICU, &out, chunk + ConversionBufferSize, &source, sourceLimit, 0, true, &err);
        encoded.append(chunk, out - chunk);
    } while (err == U_BUFFER_OVERFLOW_ERROR);

    return CString(encoded.data(), encoded.size());
}


} // namespace WebCore