Commit 64ee2fbe authored by abarth@webkit.org

2010-08-29 Adam Barth <abarth@webkit.org>

        Reviewed by Darin Adler.

        Move UTF16 LEAD/TRAIL logic into the HTMLEntityParser
        https://bugs.webkit.org/show_bug.cgi?id=44790

        We now block this attack.

        * http/tests/security/xssAuditor/javascript-link-HTML-entities-null-char-expected.txt:

git-svn-id: http://svn.webkit.org/repository/webkit/trunk@66359 268f45cc-cd09-0410-ab3c-d52691b4dbfc
parent 911c44f2
2010-08-29 Adam Barth <abarth@webkit.org>
Reviewed by Darin Adler.
Move UTF16 LEAD/TRAIL logic into the HTMLEntityParser
https://bugs.webkit.org/show_bug.cgi?id=44790
We now block this attack.
* http/tests/security/xssAuditor/javascript-link-HTML-entities-null-char-expected.txt:
2010-08-29 Adam Barth <abarth@webkit.org>
Reviewed by Darin Adler.
Move UTF16 LEAD/TRAIL logic into the HTMLEntityParser
https://bugs.webkit.org/show_bug.cgi?id=44790
We now block this attack.
* http/tests/security/xssAuditor/javascript-link-HTML-entities-null-char-expected.txt:
2010-08-29 Yuzo Fujishima <yuzo@google.com>
Unreviewed Chromium test expectation change for r66282.
CONSOLE MESSAGE: line 1: SyntaxError: Parse error
CONSOLE MESSAGE: line 1: Refused to execute a JavaScript script. Source code of script found within request.
......@@ -45,23 +45,36 @@ static const UChar windowsLatin1ExtensionArray[32] = {
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
};
inline UChar adjustEntity(unsigned value)
inline UChar adjustEntity(UChar32 value)
{
if ((value & ~0x1F) != 0x0080)
return value;
return windowsLatin1ExtensionArray[value - 0x80];
}
inline unsigned legalEntityFor(unsigned value)
inline UChar32 legalEntityFor(UChar32 value)
{
// FIXME: A number of specific entity values generate parse errors.
if (value == 0 || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
return 0xFFFD;
if (value < 0xFFFF)
if (U_IS_BMP(value))
return adjustEntity(value);
return value;
}
// Appends the UTF-16 encoding of |value| to |decodedEntity|: a single code
// unit when U_IS_BMP(value) holds, otherwise the U16_LEAD/U16_TRAIL pair.
// Every path returns true.
inline bool convertToUTF16(UChar32 value, Vector<UChar, 16>& decodedEntity)
{
    if (!U_IS_BMP(value)) {
        // Outside the BMP: emit the lead unit followed by the trail unit.
        decodedEntity.append(U16_LEAD(value));
        decodedEntity.append(U16_TRAIL(value));
        return true;
    }

    // BMP value: the narrowing cast must round-trip without loss.
    UChar bmpUnit = static_cast<UChar>(value);
    ASSERT(bmpUnit == value);
    decodedEntity.append(bmpUnit);
    return true;
}
inline bool isHexDigit(UChar cc)
{
return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
......@@ -85,14 +98,15 @@ void unconsumeCharacters(SegmentedString& source, const Vector<UChar, 10>& consu
}
unsigned consumeHTMLEntity(SegmentedString& source, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
bool consumeHTMLEntity(SegmentedString& source, Vector<UChar, 16>& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
{
ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
ASSERT(!notEnoughCharacters);
ASSERT(decodedEntity.isEmpty());
enum EntityState {
Initial,
NumberType,
Number,
MaybeHexLowerCaseX,
MaybeHexUpperCaseX,
Hex,
......@@ -100,7 +114,7 @@ unsigned consumeHTMLEntity(SegmentedString& source, bool& notEnoughCharacters, U
Named
};
EntityState entityState = Initial;
unsigned result = 0;
UChar32 result = 0;
Vector<UChar, 10> consumedCharacters;
while (!source.isEmpty()) {
......@@ -108,20 +122,20 @@ unsigned consumeHTMLEntity(SegmentedString& source, bool& notEnoughCharacters, U
switch (entityState) {
case Initial: {
if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
return 0;
return false;
if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
return 0;
return false;
if (cc == '#') {
entityState = NumberType;
entityState = Number;
break;
}
if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
entityState = Named;
continue;
}
return 0;
return false;
}
case NumberType: {
case Number: {
if (cc == 'x') {
entityState = MaybeHexLowerCaseX;
break;
......@@ -135,7 +149,7 @@ unsigned consumeHTMLEntity(SegmentedString& source, bool& notEnoughCharacters, U
continue;
}
source.push('#');
return 0;
return false;
}
case MaybeHexLowerCaseX: {
if (isHexDigit(cc)) {
......@@ -144,7 +158,7 @@ unsigned consumeHTMLEntity(SegmentedString& source, bool& notEnoughCharacters, U
}
source.push('#');
source.push('x');
return 0;
return false;
}
case MaybeHexUpperCaseX: {
if (isHexDigit(cc)) {
......@@ -153,7 +167,7 @@ unsigned consumeHTMLEntity(SegmentedString& source, bool& notEnoughCharacters, U
}
source.push('#');
source.push('X');
return 0;
return false;
}
case Hex: {
if (cc >= '0' && cc <= '9')
......@@ -162,21 +176,21 @@ unsigned consumeHTMLEntity(SegmentedString& source, bool& notEnoughCharacters, U
result = result * 16 + 10 + cc - 'a';
else if (cc >= 'A' && cc <= 'F')
result = result * 16 + 10 + cc - 'A';
else if (cc == ';') {
source.advancePastNonNewline();
return legalEntityFor(result);
} else
return legalEntityFor(result);
else {
if (cc == ';')
source.advanceAndASSERT(cc);
return convertToUTF16(legalEntityFor(result), decodedEntity);
}
break;
}
case Decimal: {
if (cc >= '0' && cc <= '9')
result = result * 10 + cc - '0';
else if (cc == ';') {
source.advancePastNonNewline();
return legalEntityFor(result);
} else
return legalEntityFor(result);
else {
if (cc == ';')
source.advanceAndASSERT(cc);
return convertToUTF16(legalEntityFor(result), decodedEntity);
}
break;
}
case Named: {
......@@ -194,12 +208,12 @@ unsigned consumeHTMLEntity(SegmentedString& source, bool& notEnoughCharacters, U
// We can't decide on an entity because there might be a longer entity
// that we could match if we had more data.
unconsumeCharacters(source, consumedCharacters);
return 0;
return false;
}
if (!entitySearch.mostRecentMatch()) {
ASSERT(!entitySearch.currentValue());
unconsumeCharacters(source, consumedCharacters);
return 0;
return false;
}
if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
// We've consumed too many characters. We need to walk the
......@@ -218,12 +232,13 @@ unsigned consumeHTMLEntity(SegmentedString& source, bool& notEnoughCharacters, U
}
cc = *source;
}
if (entitySearch.mostRecentMatch()->lastCharacter() == ';')
return entitySearch.mostRecentMatch()->value;
if (!additionalAllowedCharacter || !(isAlphaNumeric(cc) || cc == '='))
return entitySearch.mostRecentMatch()->value;
if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
|| !additionalAllowedCharacter
|| !(isAlphaNumeric(cc) || cc == '=')) {
return convertToUTF16(entitySearch.mostRecentMatch()->value, decodedEntity);
}
unconsumeCharacters(source, consumedCharacters);
return 0;
return false;
}
}
consumedCharacters.append(cc);
......@@ -232,7 +247,7 @@ unsigned consumeHTMLEntity(SegmentedString& source, bool& notEnoughCharacters, U
ASSERT(source.isEmpty());
notEnoughCharacters = true;
unconsumeCharacters(source, consumedCharacters);
return 0;
return false;
}
UChar decodeNamedEntity(const char* name)
......
......@@ -31,7 +31,7 @@
namespace WebCore {
unsigned consumeHTMLEntity(SegmentedString&, bool& notEnoughCharacters, UChar additionalAllowedCharacter = '\0');
bool consumeHTMLEntity(SegmentedString&, Vector<UChar, 16>& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter = '\0');
// Used by the XML parser. Not suitable for use in HTML parsing. Use consumeHTMLEntity instead.
UChar decodeNamedEntity(const char*);
......
......@@ -39,7 +39,7 @@ public:
void advance(UChar);
bool isEntityPrefix() const { return !!m_first; }
int currentValue() const { return m_currentValue; }
UChar32 currentValue() const { return m_currentValue; }
int currentLength() const { return m_currentLength; }
const HTMLEntityTableEntry* mostRecentMatch() const { return m_mostRecentMatch; }
......@@ -63,7 +63,7 @@ private:
}
int m_currentLength;
int m_currentValue;
UChar32 m_currentValue;
const HTMLEntityTableEntry* m_mostRecentMatch;
const HTMLEntityTableEntry* m_first;
......
......@@ -35,7 +35,7 @@ struct HTMLEntityTableEntry {
const UChar* entity;
int length;
int value;
UChar32 value;
};
class HTMLEntityTable {
......
......@@ -119,13 +119,18 @@ void HTMLTokenizer::reset()
inline bool HTMLTokenizer::processEntity(SegmentedString& source)
{
bool notEnoughCharacters = false;
unsigned value = consumeHTMLEntity(source, notEnoughCharacters);
Vector<UChar, 16> decodedEntity;
bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters);
if (notEnoughCharacters)
return false;
if (!value)
if (!success) {
ASSERT(decodedEntity.isEmpty());
bufferCharacter('&');
else
bufferCodePoint(value);
} else {
Vector<UChar>::const_iterator iter = decodedEntity.begin();
for (; iter != decodedEntity.end(); ++iter)
bufferCharacter(*iter);
}
return true;
}
......@@ -1027,16 +1032,17 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
BEGIN_STATE(CharacterReferenceInAttributeValueState) {
bool notEnoughCharacters = false;
unsigned value = consumeHTMLEntity(source, notEnoughCharacters, m_additionalAllowedCharacter);
Vector<UChar, 16> decodedEntity;
bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter);
if (notEnoughCharacters)
return haveBufferedCharacterToken();
if (!value)
if (!success) {
ASSERT(decodedEntity.isEmpty());
m_token->appendToAttributeValue('&');
else if (value < 0xFFFF)
m_token->appendToAttributeValue(value);
else {
m_token->appendToAttributeValue(U16_LEAD(value));
m_token->appendToAttributeValue(U16_TRAIL(value));
} else {
Vector<UChar>::const_iterator iter = decodedEntity.begin();
for (; iter != decodedEntity.end(); ++iter)
m_token->appendToAttributeValue(*iter);
}
// We're supposed to switch back to the attribute value state that
// we were in when we were switched into this state. Rather than
......@@ -1634,16 +1640,6 @@ inline void HTMLTokenizer::bufferCharacter(UChar character)
m_token->appendToCharacter(character);
}
inline void HTMLTokenizer::bufferCodePoint(unsigned value)
{
if (value < 0xFFFF) {
bufferCharacter(value);
return;
}
bufferCharacter(U16_LEAD(value));
bufferCharacter(U16_TRAIL(value));
}
inline void HTMLTokenizer::parseError()
{
notImplemented();
......
......@@ -277,19 +277,18 @@ String XSSAuditor::decodeHTMLEntities(const String& string, bool leaveUndecodabl
if (leaveUndecodableEntitiesUntouched)
sourceShadow = source;
bool notEnoughCharacters = false;
unsigned entity = consumeHTMLEntity(source, notEnoughCharacters);
Vector<UChar, 16> decodedEntity;
bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters);
// We ignore notEnoughCharacters because we might as well use this loop
// to copy the remaining characters into |result|.
if (entity > 0xFFFF) {
result.append(U16_LEAD(entity));
result.append(U16_TRAIL(entity));
} else if (entity && (!leaveUndecodableEntitiesUntouched || entity != 0xFFFD)){
result.append(entity);
} else {
if (!success || (!leaveUndecodableEntitiesUntouched && decodedEntity.size() == 1 && decodedEntity[0] == 0xFFFD)) {
result.append('&');
if (leaveUndecodableEntitiesUntouched)
source = sourceShadow;
} else {
Vector<UChar>::const_iterator iter = decodedEntity.begin();
for (; iter != decodedEntity.end(); ++iter)
result.append(*iter);
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment