Commit b9a7ac12 authored by jianli@chromium.org

Refactor text encoding detection logic in FileReader.

https://bugs.webkit.org/show_bug.cgi?id=39131

Reviewed by Alexey Proskuryakov.

WebCore:

Changed FileReader::convertToText to call TextResourceDecoder::decode to
detect the encoding from the BOM and decode the text. Although the File API
spec says that the supplied encoding should be used if it is valid, we
choose to ignore this requirement in order to be consistent with how
WebKit decodes web content: a BOM, when present, always overrides the
provided encoding.

* html/FileReader.cpp:
(WebCore::FileReader::convertToText):
* html/FileReader.h:

LayoutTests:

Add more test coverage for FileReader.

* fast/files/file-reader-expected.txt:
* fast/files/file-reader.html:
* fast/files/resources/binary-file: Added.

git-svn-id: http://svn.webkit.org/repository/webkit/trunk@59797 268f45cc-cd09-0410-ab3c-d52691b4dbfc
parent 0f317229
2010-05-19 Jian Li <jianli@chromium.org>
Reviewed by Alexey Proskuryakov.
Refactor text encoding detection logic in FileReader.
https://bugs.webkit.org/show_bug.cgi?id=39131
Add more test coverage for FileReader.
* fast/files/file-reader-expected.txt:
* fast/files/file-reader.html:
* fast/files/resources/binary-file: Added.
2010-05-19 Martin Robinson <mrobinson@igalia.com>
Not reviewed.
......@@ -41,6 +41,15 @@ readyState: 2
result size: 5
result: Hello
Received loadend event
Test reading a binary file as binary string
readyState: 0
Received loadstart event
readyState: 1
Received load event
readyState: 2
result size: 9
result: 0x0 0x1 0x2 0x80 0x81 0x82 0xfd 0xfe 0xff
Received loadend event
Test reading a UTF-8 file as text
readyState: 0
Received loadstart event
......@@ -86,6 +95,24 @@ readyState: 2
result size: 5
result: Hello
Received loadend event
Test reading a UTF-16BE BOM file as text with UTF8 encoding
readyState: 0
Received loadstart event
readyState: 1
Received load event
readyState: 2
result size: 5
result: Hello
Received loadend event
Test reading a UTF-16BE BOM file as text with invalid encoding
readyState: 0
Received loadstart event
readyState: 1
Received load event
readyState: 2
result size: 5
result: Hello
Received loadend event
Test reading a UTF-8 file as data URL
readyState: 0
Received loadstart event
......
......@@ -11,11 +11,14 @@ var testCases = [
"testReadingEmptyFileAsText",
"testReadingEmptyFileAsDataURL",
"testReadingUTF8EncodedFileAsBinaryString",
"testReadingBinaryFileAsBinaryString",
"testReadingUTF8EncodedFileAsText",
"testReadingUTF16BEBOMEncodedFileAsText",
"testReadingUTF16LEBOMEncodedFileAsText",
"testReadingUTF8BOMEncodedFileAsText",
"testReadingUTF16BEEncodedFileAsTextWithUTF16Encoding",
"testReadingUTF16BEBOMEncodedFileAsTextWithUTF8Encoding",
"testReadingUTF16BEBOMEncodedFileAsTextWithInvalidEncoding",
"testReadingUTF8EncodedFileAsDataURL",
"testMultipleReads",
];
......@@ -29,6 +32,7 @@ var testFileInfoList = [
{ 'name': 'UTF16LE-BOM-file', 'path': 'resources/UTF16LE-BOM.txt' },
{ 'name': 'UTF8-BOM-file', 'path': 'resources/UTF8-BOM.txt' },
{ 'name': 'UTF16BE-file', 'path': 'resources/UTF16BE.txt' },
{ 'name': 'binary-file', 'path': 'resources/binary-file' },
];
var testFiles = { };
......@@ -37,6 +41,27 @@ function log(message)
document.getElementById('console').appendChild(document.createTextNode(message + "\n"));
}
// Returns true when every UTF-16 code unit of |str| is below 128 (i.e. the
// string is pure ASCII); an empty string is considered ASCII.
function isASCIIString(str)
{
    var index = str.length;
    while (index-- > 0) {
        if (str.charCodeAt(index) > 127)
            return false;
    }
    return true;
}
// Formats |str| as a space-separated list of "0x"-prefixed hexadecimal values,
// one per code unit. Only the low byte of each code unit is kept, and the
// digits are lowercase without zero padding (e.g. "0x0 0x80 0xff").
function toHexadecimal(str)
{
    var pieces = [];
    for (var index = 0; index < str.length; ++index) {
        var lowByte = str.charCodeAt(index) & 0xFF;
        pieces.push("0x" + lowByte.toString(16));
    }
    return pieces.join(" ");
}
function createFileReader()
{
var reader = new FileReader();
......@@ -67,7 +92,10 @@ function loaded(event)
logEvent(event);
log("readyState: " + event.target.readyState);
log("result size: " + event.target.result.length);
log("result: " + event.target.result);
var result = event.target.result;
var resultOutput = isASCIIString(result) ? result : toHexadecimal(result);
log("result: " + resultOutput);
}
function loadFailed(event)
......@@ -139,6 +167,13 @@ function testReadingUTF8EncodedFileAsBinaryString()
reader.readAsBinaryString(testFiles['UTF8-file']);
}
// Logs the test banner, then starts a readAsBinaryString() of the
// 'binary-file' entry in testFiles on a reader from createFileReader().
function testReadingBinaryFileAsBinaryString()
{
    log("Test reading a binary file as binary string");
    createFileReader().readAsBinaryString(testFiles['binary-file']);
}
function testReadingUTF8EncodedFileAsText()
{
log("Test reading a UTF-8 file as text");
......@@ -174,6 +209,20 @@ function testReadingUTF16BEEncodedFileAsTextWithUTF16Encoding()
reader.readAsText(testFiles['UTF16BE-file'], "UTF-16BE");
}
// Logs the test banner, then starts a readAsText() of the 'UTF16BE-BOM-file'
// entry in testFiles while passing "UTF-8" as the encoding argument.
function testReadingUTF16BEBOMEncodedFileAsTextWithUTF8Encoding()
{
    log("Test reading a UTF-16BE BOM file as text with UTF8 encoding");
    createFileReader().readAsText(testFiles['UTF16BE-BOM-file'], "UTF-8");
}
// Logs the test banner, then starts a readAsText() of the 'UTF16BE-BOM-file'
// entry in testFiles while passing an unrecognized encoding name.
function testReadingUTF16BEBOMEncodedFileAsTextWithInvalidEncoding()
{
    log("Test reading a UTF-16BE BOM file as text with invalid encoding");
    createFileReader().readAsText(testFiles['UTF16BE-BOM-file'], "AnyInvalidEncoding");
}
function testReadingUTF8EncodedFileAsDataURL()
{
log("Test reading a UTF-8 file as data URL");
......
2010-05-19 Jian Li <jianli@chromium.org>
Reviewed by Alexey Proskuryakov.
Refactor text encoding detection logic in FileReader.
https://bugs.webkit.org/show_bug.cgi?id=39131
Changed FileReader::convertToText to call TextResourceDecoder::decode to
detect the encoding from the BOM and decode the text. Although the File API
spec says that the supplied encoding should be used if it is valid, we
choose to ignore this requirement in order to be consistent with how
WebKit decodes web content: a BOM, when present, always overrides the
provided encoding.
* html/FileReader.cpp:
(WebCore::FileReader::convertToText):
* html/FileReader.h:
2010-05-19 Abhishek Arya <inferno@chromium.org>
Reviewed by David Hyatt.
......@@ -41,6 +41,7 @@
#include "Logging.h"
#include "ProgressEvent.h"
#include "ScriptExecutionContext.h"
#include "TextResourceDecoder.h"
#include <wtf/CurrentTime.h>
namespace WebCore {
......@@ -266,26 +267,17 @@ void FileReader::convertToText()
return;
}
// Try to determine the encoding if it is not provided.
// FIXME: move the following logic to a more generic place.
int offset = 0;
if (!m_encoding.isValid()) {
if (m_rawData.size() >= 2 && m_rawData[0] == '\xFE' && m_rawData[1] == '\xFF') {
offset = 2;
m_encoding = UTF16BigEndianEncoding();
} else if (m_rawData.size() >= 2 && m_rawData[0] == '\xFF' && m_rawData[1] == '\xFE') {
offset = 2;
m_encoding = UTF16LittleEndianEncoding();
} else if (m_rawData.size() >= 2 && m_rawData[0] == '\xEF' && m_rawData[1] == '\xBB' && m_rawData[2] == '\xBF') {
offset = 3;
m_encoding = UTF8Encoding();
} else
m_encoding = UTF8Encoding();
}
// Decode the data.
// The File API spec says that we should use the supplied encoding if it is valid. However, we choose to ignore this
// requirement in order to be consistent with how WebKit decodes the web content: always has the BOM override the
// provided encoding.
// FIXME: consider supporting incremental decoding to improve the perf.
m_result = m_encoding.decode(&m_rawData.at(0) + offset, m_rawData.size() - offset);
if (!m_decoder)
m_decoder = TextResourceDecoder::create("text/plain", m_encoding.isValid() ? m_encoding : UTF8Encoding());
m_result = m_decoder->decode(&m_rawData.at(0), m_rawData.size());
if (m_state == Completed && !m_error)
m_result += m_decoder->flush();
}
void FileReader::convertToDataURL()
......
......@@ -51,6 +51,7 @@ class Blob;
class File;
class FileStreamProxy;
class ScriptExecutionContext;
class TextResourceDecoder;
class FileReader : public RefCounted<FileReader>, public ActiveDOMObject, public EventTarget, public FileStreamClient {
public:
......@@ -135,6 +136,7 @@ private:
RefPtr<Blob> m_fileBlob;
ReadType m_readType;
TextEncoding m_encoding;
// Like XMLHttpRequest.m_responseText, we keep this as a ScriptString, not a WebCore::String.
// That's because these strings can easily get huge (they are filled from the file) and
......@@ -148,8 +150,8 @@ private:
Vector<char> m_rawData;
bool m_isRawDataConverted;
// Encoding scheme used to decode the data.
TextEncoding m_encoding;
// The decoder used to decode the text data.
RefPtr<TextResourceDecoder> m_decoder;
// Needed to create data URL.
String m_fileType;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment