Commit a1b6f102 authored by ap@webkit.org

Reviewed by Eric Seidel.

        https://bugs.webkit.org/show_bug.cgi?id=18681
        <rdar://problem/5888130> WebKit should not remove BOM characters from content.

        We were only trying to match Firefox, and it doesn't do this any more.

        Tests: fast/encoding/bom-in-content.html
               fast/encoding/bom-in-content-utf16.html

        * platform/text/TextDecoder.cpp: (WebCore::TextDecoder::checkForBOM): Skip the BOM if it's
        at the start of input stream.

        * platform/text/TextCodec.cpp:
        * platform/text/TextCodec.h:
        * platform/text/TextCodecICU.cpp:
        (WebCore::TextCodecICU::decode):
        * platform/text/TextCodecUTF16.cpp:
        (WebCore::TextCodecUTF16::decode):
        * platform/text/mac/TextCodecMac.cpp:
        (WebCore::TextCodecMac::decode):
        Don't remove the BOM.



git-svn-id: http://svn.webkit.org/repository/webkit/trunk@33380 268f45cc-cd09-0410-ab3c-d52691b4dbfc
parent d45dad02
2008-05-13 Alexey Proskuryakov <ap@webkit.org>
Reviewed by Eric Seidel.
https://bugs.webkit.org/show_bug.cgi?id=18681
<rdar://problem/5888130> WebKit should not remove BOM characters from content.
* fast/encoding/bom-in-content-expected.txt: Added.
* fast/encoding/bom-in-content.html: Added.
* fast/encoding/bom-in-content-utf16-expected.txt: Added.
* fast/encoding/bom-in-content-utf16.html: Added.
* http/tests/incremental/resources: Added.
* http/tests/incremental/resources/slow-utf8-css.pl: Copied from LayoutTests/http/tests/incremental/slow-utf8-css.pl.
* http/tests/incremental/slow-utf8-css-expected.txt: Added.
* http/tests/incremental/slow-utf8-css.html: Added.
* http/tests/incremental/slow-utf8-css.pl: Removed.
* platform/mac/http/tests/incremental: Removed.
* platform/mac/http/tests/incremental/slow-utf8-css-expected.checksum: Removed.
* platform/mac/http/tests/incremental/slow-utf8-css-expected.png: Removed.
* platform/mac/http/tests/incremental/slow-utf8-css-expected.txt: Removed.
This test was relying on BOM characters being removed, but this was not what it tested for.
Rewrote it and made text-only.
2008-05-13 Alexey Proskuryakov <ap@webkit.org>
Reviewed by Dan Bernstein.
Test for bug 18681: BOM characters should not be removed from input stream.

PASS
Test for bug 18681: BOM characters should not be removed from input stream.

PASS
<head>
<meta charset="utf-8">
</head>
<body>
<p>Test for <a href="https://bugs.webkit.org/show_bug.cgi?id=18681">bug 18681</a>:
BOM characters should not be removed from input stream.<p>
<div id=BOMs></div>
<script>
if (window.layoutTestController)
layoutTestController.dumpAsText();
document.write(document.getElementById("BOMs").innerHTML.length == 1 ? "PASS" : "FAIL");
</script>
</body>
......@@ -10,12 +10,12 @@ print "Cache-Control: no-store, no-cache, must-revalidate\n";
print "Pragma: no-cache\n";
print "\n";
print "\xef\xbb\xbfTest for bug 10753: The beginning of a CSS file is missing.\n\n";
# Dump some BOMs to bypass CFNetwork buffering.
print "\xef\xbb\xbf#result {color:green;}\n";
# Dump some spaces to bypass CFNetwork buffering.
for ($count = 1; $count < 4000; $count++) {
print "\xef\xbb\xbf";
print " ";
}
# Delay to force the second line of text to be decoded as a separate chunk.
sleep 1;
print "You should see a bug description on a separate line above this one.";
print "body {}";
Test for bug 10753: The beginning of a CSS file is missing.
PASS
<head>
<link rel="stylesheet" href="resources/slow-utf8-css.pl" type="text/css" charset="utf-8">
<script>
// Invoked from the body's onload attribute. Writes "PASS" into #result when
// the first rule of the first style sheet has the selector "#result", and
// "FAIL" otherwise.
function test() {
    // layoutTestController is presumably supplied by the layout test harness;
    // when it exists, ask for the result to be dumped as text.
    if (window.layoutTestController) {
        layoutTestController.dumpAsText();
    }

    var firstSelector = document.styleSheets.item(0).cssRules.item(0).selectorText;
    var verdict = "FAIL";
    if (firstSelector == "#result") {
        verdict = "PASS";
    }
    document.getElementById("result").innerHTML = verdict;
}
</script>
</head>
<body onload="test()">
<p>Test for <a href="https://bugs.webkit.org/show_bug.cgi?id=10753">bug 10753</a>:
The beginning of a CSS file is missing.
<div id=result>Should be green</div>
</body>
b11756f779dae6e4e4852507da3ed50b
\ No newline at end of file
layer at (0,0) size 800x600
RenderView at (0,0) size 800x600
layer at (0,0) size 800x600
RenderBlock {HTML} at (0,0) size 800x600
RenderBody {BODY} at (8,8) size 784x579
RenderBlock {PRE} at (0,0) size 784x45
RenderText {#text} at (0,0) size 472x30
text run at (0,0) width 472: "Test for bug 10753: The beginning of a CSS file is missing."
text run at (472,0) width 0: " "
text run at (0,15) width 0: " "
RenderText {#text} at (0,30) size 536x15
text run at (0,30) width 536: "You should see a bug description on a separate line above this one."
2008-05-13 Alexey Proskuryakov <ap@webkit.org>
Reviewed by Eric Seidel.
https://bugs.webkit.org/show_bug.cgi?id=18681
<rdar://problem/5888130> WebKit should not remove BOM characters from content.
We were only trying to match Firefox, and it doesn't do this any more.
Tests: fast/encoding/bom-in-content.html
fast/encoding/bom-in-content-utf16.html
* platform/text/TextDecoder.cpp: (WebCore::TextDecoder::checkForBOM): Skip the BOM if it's
at the start of input stream.
* platform/text/TextCodec.cpp:
* platform/text/TextCodec.h:
* platform/text/TextCodecICU.cpp:
(WebCore::TextCodecICU::decode):
* platform/text/TextCodecUTF16.cpp:
(WebCore::TextCodecUTF16::decode):
* platform/text/mac/TextCodecMac.cpp:
(WebCore::TextCodecMac::decode):
Don't remove the BOM.
2008-05-13 Anders Carlsson <andersca@apple.com>
Reviewed by Darin.
......@@ -32,28 +32,10 @@
namespace WebCore {
const UChar BOM = 0xFEFF;
// Destructor, defined out of line; its body is empty.
TextCodec::~TextCodec()
{
}
// BOM characters can occur at the very beginning of the content as well as
// anywhere inside it, and none of them should reach the decoded text. Appends
// |characters| (of the given |length|) to |v|, leaving out every BOM.
void TextCodec::appendOmittingBOM(Vector<UChar>& v, const UChar* characters, size_t length)
{
    // First character of the current BOM-free run that has not been appended yet.
    size_t runStart = 0;
    size_t pos = 0;
    while (pos < length) {
        if (characters[pos] != BOM) {
            ++pos;
            continue;
        }
        // Flush the run that precedes this BOM, then resume just past it.
        if (pos > runStart)
            v.append(characters + runStart, pos - runStart);
        runStart = ++pos;
    }
    // Flush whatever follows the last BOM (or everything, if there was none).
    if (runStart < length)
        v.append(characters + runStart, length - runStart);
}
int TextCodec::getUnencodableReplacement(unsigned codePoint, UnencodableHandling handling, UnencodableReplacementArray replacement)
{
switch (handling) {
......
......@@ -72,9 +72,6 @@ namespace WebCore {
// unencodable character into the given replacement buffer.
// The length of the string (not including the null) will be returned.
static int getUnencodableReplacement(unsigned codePoint, UnencodableHandling, UnencodableReplacementArray);
protected:
static void appendOmittingBOM(Vector<UChar>&, const UChar*, size_t length);
};
typedef void (*EncodingNameRegistrar)(const char* alias, const char* name);
......
......@@ -282,7 +282,7 @@ String TextCodecICU::decode(const char* bytes, size_t length, bool flush, bool s
do {
int ucharsDecoded = decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, flush, err);
appendOmittingBOM(result, buffer, ucharsDecoded);
result.append(buffer, ucharsDecoded);
} while (err == U_BUFFER_OVERFLOW_ERROR);
if (U_FAILURE(err)) {
......
......@@ -34,8 +34,6 @@ using std::auto_ptr;
namespace WebCore {
const UChar BOM = 0xFEFF;
void TextCodecUTF16::registerEncodingNames(EncodingNameRegistrar registrar)
{
registrar("UTF-16LE", "UTF-16LE");
......@@ -85,8 +83,7 @@ String TextCodecUTF16::decode(const char* bytes, size_t length, bool, bool stopO
c = m_bufferedByte | (p[0] << 8);
else
c = (m_bufferedByte << 8) | p[0];
if (c != BOM)
*q++ = c;
*q++ = c;
m_haveBufferedByte = false;
p += 1;
numChars -= 1;
......@@ -96,15 +93,13 @@ String TextCodecUTF16::decode(const char* bytes, size_t length, bool, bool stopO
for (size_t i = 0; i < numChars; ++i) {
UChar c = p[0] | (p[1] << 8);
p += 2;
if (c != BOM)
*q++ = c;
*q++ = c;
}
else
for (size_t i = 0; i < numChars; ++i) {
UChar c = (p[0] << 8) | p[1];
p += 2;
if (c != BOM)
*q++ = c;
*q++ = c;
}
if (numBytes & 1) {
......
......@@ -49,6 +49,8 @@ void TextDecoder::reset(const TextEncoding& encoding)
String TextDecoder::checkForBOM(const char* data, size_t length, bool flush, bool stopOnError, bool& sawError)
{
ASSERT(!m_checkedForBOM);
// Check to see if we found a BOM.
size_t numBufferedBytes = m_numBufferedBytes;
size_t buf1Len = numBufferedBytes;
......@@ -62,22 +64,28 @@ String TextDecoder::checkForBOM(const char* data, size_t length, bool flush, boo
const TextEncoding* encodingConsideringBOM = &m_encoding;
bool foundBOM = true;
size_t lengthOfBOM = 0;
if (c1 == 0xFF && c2 == 0xFE) {
if (c3 != 0 || c4 != 0)
if (c3 != 0 || c4 != 0) {
encodingConsideringBOM = &UTF16LittleEndianEncoding();
else if (numBufferedBytes + length > sizeof(m_bufferedBytes))
lengthOfBOM = 2;
} else if (numBufferedBytes + length > sizeof(m_bufferedBytes)) {
encodingConsideringBOM = &UTF32LittleEndianEncoding();
else
lengthOfBOM = 4;
} else
foundBOM = false;
}
else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
} else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
encodingConsideringBOM = &UTF8Encoding();
else if (c1 == 0xFE && c2 == 0xFF)
lengthOfBOM = 3;
} else if (c1 == 0xFE && c2 == 0xFF) {
encodingConsideringBOM = &UTF16BigEndianEncoding();
else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF)
lengthOfBOM = 2;
} else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
encodingConsideringBOM = &UTF32BigEndianEncoding();
else
lengthOfBOM = 4;
} else
foundBOM = false;
if (!foundBOM && numBufferedBytes + length <= sizeof(m_bufferedBytes) && !flush) {
// Continue to look for the BOM.
memcpy(&m_bufferedBytes[numBufferedBytes], data, length);
......@@ -91,6 +99,18 @@ String TextDecoder::checkForBOM(const char* data, size_t length, bool flush, boo
return String();
m_checkedForBOM = true;
// Skip the BOM.
if (foundBOM) {
ASSERT(numBufferedBytes < lengthOfBOM);
size_t numUnbufferedBOMBytes = lengthOfBOM - numBufferedBytes;
ASSERT(numUnbufferedBOMBytes <= length);
data += numUnbufferedBOMBytes;
length -= numUnbufferedBOMBytes;
numBufferedBytes = 0;
m_numBufferedBytes = 0;
}
// Handle case where we have some buffered bytes to deal with.
if (numBufferedBytes) {
char bufferedBytes[sizeof(m_bufferedBytes)];
......
......@@ -243,7 +243,7 @@ String TextCodecMac::decode(const char* bytes, size_t length, bool flush, bool s
}
ASSERT(!(bytesWritten % sizeof(UChar)));
appendOmittingBOM(result, buffer, bytesWritten / sizeof(UChar));
result.append(buffer, bytesWritten / sizeof(UChar));
bufferWasFull = status == kTECOutputBufferFullStatus;
}
......@@ -252,7 +252,7 @@ String TextCodecMac::decode(const char* bytes, size_t length, bool flush, bool s
unsigned long bytesWritten = 0;
TECFlushText(m_converterTEC, reinterpret_cast<unsigned char*>(buffer), sizeof(buffer), &bytesWritten);
ASSERT(!(bytesWritten % sizeof(UChar)));
appendOmittingBOM(result, buffer, bytesWritten / sizeof(UChar));
result.append(buffer, bytesWritten / sizeof(UChar));
}
String resultString = String::adopt(result);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment