TextCodecMac.cpp 12.7 KB
Newer Older
ap's avatar
ap committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
/*
 * Copyright (C) 2004, 2006 Apple Computer, Inc.  All rights reserved.
 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "config.h"
darin's avatar
darin committed
28
#include "TextCodecMac.h"
ap's avatar
ap committed
29

darin's avatar
darin committed
30
#include "CString.h"
darin's avatar
darin committed
31 32
#include "CharacterNames.h"
#include "CharsetData.h"
darin's avatar
darin committed
33
#include "PlatformString.h"
ap's avatar
ap committed
34 35
#include <wtf/Assertions.h>

darin's avatar
darin committed
36
using std::auto_ptr;
ap's avatar
ap committed
37 38 39 40
using std::min;

namespace WebCore {

darin's avatar
darin committed
41
// We need to keep this because ICU doesn't support some of the encodings that we need:
darin's avatar
darin committed
42
// <http://bugs.webkit.org/show_bug.cgi?id=4195>.
ap's avatar
ap committed
43

darin's avatar
darin committed
44
const size_t ConversionBufferSize = 16384;
ap's avatar
ap committed
45 46

static TECObjectRef cachedConverterTEC;
darin's avatar
darin committed
47
static TECTextEncodingID cachedConverterEncoding = invalidEncoding;
ap's avatar
ap committed
48

darin's avatar
darin committed
49
// Walks CharsetTable (terminated by an entry with a null name) and hands every
// name to the registrar, paired with the first name seen in the current run of
// consecutive entries that share an encoding.
// NOTE(review): the second registrar argument is presumably the canonical name
// for the alias passed first; confirm against EncodingNameRegistrar.
void TextCodecMac::registerEncodingNames(EncodingNameRegistrar registrar)
{
    TECTextEncodingID runEncoding = invalidEncoding;
    const char* runFirstName = 0;

    size_t index = 0;
    while (CharsetTable[index].name) {
        const bool startsNewRun = CharsetTable[index].encoding != runEncoding;
        if (startsNewRun) {
            runEncoding = CharsetTable[index].encoding;
            runFirstName = CharsetTable[index].name;
        }
        registrar(CharsetTable[index].name, runFirstName);
        ++index;
    }
}

darin's avatar
darin committed
63
// Factory callback passed to the codec registrar by registerCodecs().
// additionalData points at the TECTextEncodingID the new codec should use;
// the TextEncoding argument is unused.
static auto_ptr<TextCodec> newTextCodecMac(const TextEncoding&, const void* additionalData)
{
    const TECTextEncodingID* encoding = static_cast<const TECTextEncodingID*>(additionalData);
    return auto_ptr<TextCodec>(new TextCodecMac(*encoding));
}

darin's avatar
darin committed
68
// Registers newTextCodecMac once per run of consecutive CharsetTable entries
// that share an encoding, using the first name of the run. A pointer to the
// table's encoding value is passed as the factory's additional data.
void TextCodecMac::registerCodecs(TextCodecRegistrar registrar)
{
    TECTextEncodingID previousEncoding = invalidEncoding;

    for (size_t index = 0; CharsetTable[index].name; ++index) {
        if (CharsetTable[index].encoding == previousEncoding)
            continue; // Same encoding as the preceding entry; already registered.

        registrar(CharsetTable[index].name, newTextCodecMac, &CharsetTable[index].encoding);
        previousEncoding = CharsetTable[index].encoding;
    }
}

darin's avatar
darin committed
79 80 81 82 83
// Records the encoding to convert from. No TEC converter is set up here
// (m_converterTEC starts out as 0); decode() calls createTECConverter() when
// it finds no converter.
TextCodecMac::TextCodecMac(TECTextEncodingID encoding)
    : m_encoding(encoding)
    , m_error(false)
    , m_numBufferedBytes(0)
    , m_converterTEC(0)
{
}
ap's avatar
ap committed
86

darin's avatar
darin committed
87 88 89
// Hands the converter (if any) to the file-level cache via releaseTECConverter().
TextCodecMac::~TextCodecMac()
{
    releaseTECConverter();
}

darin's avatar
darin committed
92
void TextCodecMac::releaseTECConverter() const
ap's avatar
ap committed
93
{
darin's avatar
darin committed
94 95 96 97 98 99 100 101
    if (m_converterTEC) {
        if (cachedConverterTEC != 0)
            TECDisposeConverter(cachedConverterTEC);
        cachedConverterTEC = m_converterTEC;
        cachedConverterEncoding = m_encoding;
        m_converterTEC = 0;
    }
}
ap's avatar
ap committed
102

darin's avatar
darin committed
103 104 105 106
// Obtains a converter from m_encoding to 16-bit Unicode and stores it in
// m_converterTEC. The cached converter is reused (after clearing its context)
// when it was created for the same encoding; otherwise a new one is created.
// Returns noErr on success, or the status reported by TECCreateConverter.
OSStatus TextCodecMac::createTECConverter() const
{
    const bool canReuseCachedConverter = cachedConverterEncoding == m_encoding && cachedConverterTEC;

    // The cached encoding is reset on every call, whether or not the cache is used.
    cachedConverterEncoding = invalidEncoding;

    if (canReuseCachedConverter) {
        m_converterTEC = cachedConverterTEC;
        cachedConverterTEC = 0;
        TECClearConverterContextInfo(m_converterTEC);
        return noErr;
    }

    if (OSStatus status = TECCreateConverter(&m_converterTEC, m_encoding,
            CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicode16BitFormat)))
        return status;

    TECSetBasicOptions(m_converterTEC, kUnicodeForceASCIIRangeMask);
    return noErr;
}

darin's avatar
darin committed
124 125
// Performs one TECConvertText pass.
//
// If bytes of a partial character are pending in m_bufferedBytes, the pass
// runs over that small buffer (topped up from inputBuffer); otherwise it runs
// directly over inputBuffer. On return, inputLength is the number of bytes
// consumed from inputBuffer and outputLength is the number of bytes written
// to outputBuffer. The (possibly adjusted) TEC status is returned.
OSStatus TextCodecMac::decode(const unsigned char* inputBuffer, int inputBufferLength, int& inputLength,
    void *outputBuffer, int outputBufferLength, int& outputLength)
{
    OSStatus status;
    unsigned long bytesRead = 0;
    unsigned long bytesWritten = 0;
    unsigned char* const destination = static_cast<unsigned char*>(outputBuffer);

    if (!m_numBufferedBytes) {
        // Nothing is pending from an earlier call: convert straight from the caller's buffer.
        status = TECConvertText(m_converterTEC, inputBuffer, inputBufferLength, &bytesRead,
            destination, outputBufferLength, &bytesWritten);
        ASSERT(static_cast<int>(bytesRead) <= inputBufferLength);
    } else {
        // Finish converting a partial character that's in our buffer.

        // First, fill the partial character buffer with as many bytes as are available.
        ASSERT(m_numBufferedBytes < sizeof(m_bufferedBytes));
        const int spaceInBuffer = sizeof(m_bufferedBytes) - m_numBufferedBytes;
        const int bytesToPutInBuffer = std::min(spaceInBuffer, inputBufferLength);
        ASSERT(bytesToPutInBuffer != 0);
        memcpy(m_bufferedBytes + m_numBufferedBytes, inputBuffer, bytesToPutInBuffer);

        // Now, do a conversion on the buffer.
        status = TECConvertText(m_converterTEC, m_bufferedBytes, m_numBufferedBytes + bytesToPutInBuffer, &bytesRead,
            destination, outputBufferLength, &bytesWritten);
        ASSERT(bytesRead <= m_numBufferedBytes + bytesToPutInBuffer);

        const bool partialCharacterUntouched = status == kTECPartialCharErr && bytesRead == 0;
        if (!partialCharacterUntouched) {
            // We are done with the partial character buffer.
            // Also, we have read some of the bytes from the main buffer.
            if (bytesRead > m_numBufferedBytes) {
                // Report only the bytes that came from the caller's buffer.
                bytesRead -= m_numBufferedBytes;
            } else {
                LOG_ERROR("TECConvertText accepted some bytes it previously rejected with kTECPartialCharErr");
                bytesRead = 0;
            }
            m_numBufferedBytes = 0;

            // While there may be a partial character problem in the small buffer,
            // we have to try again and not get confused and think there is a partial
            // character problem in the large buffer.
            if (status == kTECPartialCharErr)
                status = noErr;
        } else if (bytesToPutInBuffer < spaceInBuffer) {
            // The character is still incomplete and the small buffer is not yet full.
            // Tell the caller we read all the source bytes and keep them in the buffer.
            m_numBufferedBytes += bytesToPutInBuffer;
            bytesRead = bytesToPutInBuffer;
            status = noErr;
        } else {
            // The small buffer is full and still nothing was converted.
            LOG_ERROR("TECConvertText gave a kTECPartialCharErr but read none of the %zu bytes in the buffer", sizeof(m_bufferedBytes));
            m_numBufferedBytes = 0;
            status = kTECUnmappableElementErr; // should never happen, but use this error code
        }
    }

    // Work around bug 3351093, where sometimes we get kTECBufferBelowMinimumSizeErr instead of kTECOutputBufferFullStatus.
    if (status == kTECBufferBelowMinimumSizeErr && bytesWritten != 0)
        status = kTECOutputBufferFullStatus;

    inputLength = bytesRead;
    outputLength = bytesWritten;
    return status;
}

darin's avatar
darin committed
191
// Decodes `length` bytes starting at `bytes` into a String using the Text
// Encoding Converter.
//
// Input is converted in passes through a fixed-size stack buffer. If the input
// ends in the middle of a character, the leftover bytes are saved in
// m_bufferedBytes so the next call can complete the character. When `flush` is
// true, TECFlushText is called afterwards and its output is appended as well.
//
// Returns a default-constructed String if no converter could be created, or if
// a conversion pass fails with an unexpected status (m_error is then set too).
String TextCodecMac::decode(const char* bytes, size_t length, bool flush)
{
    // Get a converter for the passed-in encoding.
    if (!m_converterTEC && createTECConverter() != noErr)
        return String();

    Vector<UChar> result;

    const unsigned char* sourcePointer = reinterpret_cast<const unsigned char*>(bytes);
    int sourceLength = static_cast<int>(length);
    bool bufferWasFull = false;
    UniChar buffer[ConversionBufferSize];

    // Keep going while input remains, or while the previous pass filled the
    // output buffer and may therefore have more output pending.
    while (sourceLength || bufferWasFull) {
        int bytesRead = 0;
        int bytesWritten = 0;
        OSStatus status = decode(sourcePointer, sourceLength, bytesRead, buffer, sizeof(buffer), bytesWritten);
        ASSERT(bytesRead <= sourceLength);
        sourcePointer += bytesRead;
        sourceLength -= bytesRead;

        switch (status) {
            case noErr:
            case kTECOutputBufferFullStatus:
                break;
            case kTextMalformedInputErr:
            case kTextUndefinedElementErr:
                // FIXME: Put FFFD character into the output string in this case?
                TECClearConverterContextInfo(m_converterTEC);
                // Skip one byte of the offending input and carry on.
                if (sourceLength) {
                    sourcePointer += 1;
                    sourceLength -= 1;
                }
                break;
            case kTECPartialCharErr: {
                // Put the partial character into the buffer.
                ASSERT(m_numBufferedBytes == 0);
                // The limit is the capacity of the byte buffer itself, not the
                // size of the counter that tracks how full it is.
                const int bufferSize = sizeof(m_bufferedBytes);
                if (sourceLength < bufferSize) {
                    memcpy(m_bufferedBytes, sourcePointer, sourceLength);
                    m_numBufferedBytes = sourceLength;
                } else {
                    LOG_ERROR("TECConvertText gave a kTECPartialCharErr, but left %d bytes in the buffer", sourceLength);
                }
                // Either way the remaining input has been dealt with.
                sourceLength = 0;
                break;
            }
            default:
                LOG_ERROR("text decoding failed with error %ld", static_cast<long>(status));
                m_error = true;
                return String();
        }

        ASSERT(!(bytesWritten % sizeof(UChar)));
        appendOmittingBOM(result, buffer, bytesWritten / sizeof(UChar));

        bufferWasFull = status == kTECOutputBufferFullStatus;
    }

    if (flush) {
        unsigned long bytesWritten = 0;
        TECFlushText(m_converterTEC, reinterpret_cast<unsigned char*>(buffer), sizeof(buffer), &bytesWritten);
        ASSERT(!(bytesWritten % sizeof(UChar)));
        appendOmittingBOM(result, buffer, bytesWritten / sizeof(UChar));
    }

    String resultString = String::adopt(result);

    // Workaround for a bug in the Text Encoding Converter (see bug 3225472).
    // Simplified Chinese pages use the code U+A3A0 to mean "full-width space".
    // But GB18030 decodes it to U+E5E5, which is correct in theory but not in practice.
    // To work around, just change all occurrences of U+E5E5 to U+3000 (ideographic space).
    if (m_encoding == kCFStringEncodingGB_18030_2000)
        resultString.replace(0xE5E5, ideographicSpace);

    return resultString;
}

darin's avatar
darin committed
269
// Encodes `length` UTF-16 code units from `characters` into m_encoding using
// CFStringGetBytes. When allowEntities is true, a character that cannot be
// converted is emitted as a decimal numeric entity ("&#N;"); otherwise
// CFString substitutes '?' as the loss byte.
CString TextCodecMac::encode(const UChar* characters, size_t length, bool allowEntities)
{
    // FIXME: We should really use TEC here instead of CFString for consistency with the other direction.

    // FIXME: Since there's no "force ASCII range" mode in CFString, we change the backslash into a yen sign.
    // Encoding will change the yen sign back into a backslash.
    String source(characters, length);
    source.replace('\\', m_backslashAsCurrencySymbol);
    CFStringRef sourceString = source.createCFString();

    // A loss byte of 0 makes CFStringGetBytes stop at the first character it cannot convert.
    const UInt8 lossByte = allowEntities ? 0 : '?';

    Vector<char> encoded;
    size_t encodedSize = 0;

    CFIndex position = 0;
    CFIndex remaining = CFStringGetLength(sourceString);
    while (remaining > 0) {
        const CFRange range = CFRangeMake(position, remaining);

        // First call: no output buffer, just measure how many bytes are needed.
        CFIndex chunkBytes;
        CFStringGetBytes(sourceString, range, m_encoding, lossByte, false, NULL, 0x7FFFFFFF, &chunkBytes);

        // Second call: convert into the newly grown tail of the output vector.
        encoded.resize(encodedSize + chunkBytes);
        unsigned char* destination = reinterpret_cast<unsigned char*>(encoded.data() + encodedSize);
        CFIndex consumed = CFStringGetBytes(sourceString, range, m_encoding, lossByte, false, destination, chunkBytes, &chunkBytes);
        encodedSize += chunkBytes;

        if (consumed != remaining) {
            // Conversion stopped early; the character at the stopping point gets an entity.
            unsigned codePoint = CFStringGetCharacterAtIndex(sourceString, position + consumed);
            ++consumed;

            const bool isHighSurrogate = (codePoint & 0xFC00) == 0xD800;
            if (isHighSurrogate && consumed != remaining) {
                UniChar low = CFStringGetCharacterAtIndex(sourceString, position + consumed);
                const bool isLowSurrogate = (low & 0xFC00) == 0xDC00;
                if (isLowSurrogate) {
                    // Combine the surrogate pair into a single code point.
                    codePoint = (codePoint << 10) + low + (0x10000 - (0xD800 << 10) - 0xDC00);
                    ++consumed;
                }
            }

            char entity[16];
            sprintf(entity, "&#%u;", codePoint);
            const size_t entityLength = strlen(entity);
            encoded.resize(encodedSize + entityLength);
            memcpy(encoded.data() + encodedSize, entity, entityLength);
            encodedSize += entityLength;
        }

        position += consumed;
        remaining -= consumed;
    }

    CFRelease(sourceString);
    return CString(encoded.data(), encodedSize);
}

} // namespace WebCore