Commit 88ce5b4f authored by abarth@webkit.org

2010-06-04 Adam Barth <abarth@webkit.org>

        Reviewed by Eric Seidel.

        Make HTML5Lexer go fast
        https://bugs.webkit.org/show_bug.cgi?id=40048

        We're going to do this patch in small steps to make it easier to verify correctness.

        * html/HTML5Lexer.cpp:
        (WebCore::HTML5Lexer::nextToken):


git-svn-id: http://svn.webkit.org/repository/webkit/trunk@60694 268f45cc-cd09-0410-ab3c-d52691b4dbfc
parent 87dbb021
2010-06-04 Adam Barth <abarth@webkit.org>
Reviewed by Eric Seidel.
Make HTML5Lexer go fast
https://bugs.webkit.org/show_bug.cgi?id=40048
We're going to do this patch in small steps to make it easier to verify correctness.
* html/HTML5Lexer.cpp:
(WebCore::HTML5Lexer::nextToken):
2010-06-04 Jay Civelli <jcivelli@chromium.org> 2010-06-04 Jay Civelli <jcivelli@chromium.org>
Reviewed by David Levin. Reviewed by David Levin.
......
...@@ -318,6 +318,14 @@ inline bool HTML5Lexer::processEntity(SegmentedString& source) ...@@ -318,6 +318,14 @@ inline bool HTML5Lexer::processEntity(SegmentedString& source)
return true; return true;
} }
// RECONSUME_IN(stateName): sets m_state to |stateName| and immediately
// executes `continue`, so nothing after the macro in the current case runs.
// It must be expanded inside the loop it is meant to restart.
// NOTE(review): presumably the restarted loop iteration re-dispatches on
// m_state with the same input character (hence "reconsume") -- confirm
// against the loop in nextToken.
//
// The expansion is a bare braced block, so `RECONSUME_IN(x);` leaves an empty
// statement after the block. Do not use it as the unbraced body of an `if`
// that has an `else`; wrap the call in braces instead.
//
// We'd like to use the standard do { } while (false) pattern here, but it
// doesn't play nicely with continue.
#define RECONSUME_IN(stateName) \
{ \
m_state = stateName; \
continue; \
}
bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
{ {
// If we have a token in progress, then we're supposed to be called back // If we have a token in progress, then we're supposed to be called back
...@@ -363,8 +371,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -363,8 +371,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
case CharacterReferenceInDataState: { case CharacterReferenceInDataState: {
if (!processEntity(source)) if (!processEntity(source))
return shouldEmitBufferedCharacterToken(source); return shouldEmitBufferedCharacterToken(source);
m_state = DataState; RECONSUME_IN(DataState);
continue;
} }
case RCDATAState: { case RCDATAState: {
if (cc == '&') if (cc == '&')
...@@ -378,8 +385,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -378,8 +385,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
case CharacterReferenceInRCDATAState: { case CharacterReferenceInRCDATAState: {
if (!processEntity(source)) if (!processEntity(source))
return shouldEmitBufferedCharacterToken(source); return shouldEmitBufferedCharacterToken(source);
m_state = RCDATAState; RECONSUME_IN(RCDATAState);
continue;
} }
case RAWTEXTState: { case RAWTEXTState: {
if (cc == '<') if (cc == '<')
...@@ -437,8 +443,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -437,8 +443,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
m_state = DataState; m_state = DataState;
} else { } else {
emitParseError(); emitParseError();
m_state = BogusCommentState; RECONSUME_IN(BogusCommentState);
continue;
} }
// FIXME: Handle EOF properly. // FIXME: Handle EOF properly.
break; break;
...@@ -465,8 +470,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -465,8 +470,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
m_state = RCDATAEndTagOpenState; m_state = RCDATAEndTagOpenState;
} else { } else {
emitCharacter('<'); emitCharacter('<');
m_state = RCDATAState; RECONSUME_IN(RCDATAState);
continue;
} }
break; break;
} }
...@@ -482,8 +486,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -482,8 +486,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
} else { } else {
emitCharacter('<'); emitCharacter('<');
emitCharacter('/'); emitCharacter('/');
m_state = RCDATAState; RECONSUME_IN(RCDATAState);
continue;
} }
break; break;
} }
...@@ -518,8 +521,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -518,8 +521,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
emitCharacter('/'); emitCharacter('/');
m_token->appendToCharacter(m_temporaryBuffer); m_token->appendToCharacter(m_temporaryBuffer);
m_bufferedEndTagName.clear(); m_bufferedEndTagName.clear();
m_state = RCDATAState; RECONSUME_IN(RCDATAState);
continue;
} }
break; break;
} }
...@@ -530,8 +532,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -530,8 +532,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
m_state = RAWTEXTEndTagOpenState; m_state = RAWTEXTEndTagOpenState;
} else { } else {
emitCharacter('<'); emitCharacter('<');
m_state = RAWTEXTState; RECONSUME_IN(RAWTEXTState);
continue;
} }
break; break;
} }
...@@ -547,8 +548,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -547,8 +548,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
} else { } else {
emitCharacter('<'); emitCharacter('<');
emitCharacter('/'); emitCharacter('/');
m_state = RAWTEXTState; RECONSUME_IN(RAWTEXTState);
continue;
} }
break; break;
} }
...@@ -583,8 +583,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -583,8 +583,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
emitCharacter('/'); emitCharacter('/');
m_token->appendToCharacter(m_temporaryBuffer); m_token->appendToCharacter(m_temporaryBuffer);
m_bufferedEndTagName.clear(); m_bufferedEndTagName.clear();
m_state = RAWTEXTState; RECONSUME_IN(RAWTEXTState);
continue;
} }
break; break;
} }
...@@ -599,8 +598,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -599,8 +598,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
m_state = ScriptDataEscapeStartState; m_state = ScriptDataEscapeStartState;
} else { } else {
emitCharacter('<'); emitCharacter('<');
m_state = ScriptDataState; RECONSUME_IN(ScriptDataState);
continue;
} }
break; break;
} }
...@@ -616,8 +614,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -616,8 +614,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
} else { } else {
emitCharacter('<'); emitCharacter('<');
emitCharacter('/'); emitCharacter('/');
m_state = ScriptDataState; RECONSUME_IN(ScriptDataState);
continue;
} }
break; break;
} }
...@@ -652,8 +649,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -652,8 +649,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
emitCharacter('/'); emitCharacter('/');
m_token->appendToCharacter(m_temporaryBuffer); m_token->appendToCharacter(m_temporaryBuffer);
m_bufferedEndTagName.clear(); m_bufferedEndTagName.clear();
m_state = ScriptDataState; RECONSUME_IN(ScriptDataState);
continue;
} }
break; break;
} }
...@@ -662,8 +658,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -662,8 +658,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
emitCharacter(cc); emitCharacter(cc);
m_state = ScriptDataEscapeStartDashState; m_state = ScriptDataEscapeStartDashState;
} else { } else {
m_state = ScriptDataState; RECONSUME_IN(ScriptDataState);
continue;
} }
break; break;
} }
...@@ -672,8 +667,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -672,8 +667,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
emitCharacter(cc); emitCharacter(cc);
m_state = ScriptDataEscapedDashDashState; m_state = ScriptDataEscapedDashDashState;
} else { } else {
m_state = ScriptDataState; RECONSUME_IN(ScriptDataState);
continue;
} }
break; break;
} }
...@@ -735,8 +729,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -735,8 +729,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
m_state = ScriptDataDoubleEscapeStartState; m_state = ScriptDataDoubleEscapeStartState;
} else { } else {
emitCharacter('<'); emitCharacter('<');
m_state = ScriptDataEscapedState; RECONSUME_IN(ScriptDataEscapedState);
continue;
} }
break; break;
} }
...@@ -752,8 +745,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -752,8 +745,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
} else { } else {
emitCharacter('<'); emitCharacter('<');
emitCharacter('/'); emitCharacter('/');
m_state = ScriptDataEscapedState; RECONSUME_IN(ScriptDataEscapedState);
continue;
} }
break; break;
} }
...@@ -788,8 +780,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -788,8 +780,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
emitCharacter('/'); emitCharacter('/');
m_token->appendToCharacter(m_temporaryBuffer); m_token->appendToCharacter(m_temporaryBuffer);
m_bufferedEndTagName.clear(); m_bufferedEndTagName.clear();
m_state = ScriptDataEscapedState; RECONSUME_IN(ScriptDataEscapedState);
continue;
} }
break; break;
} }
...@@ -807,8 +798,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -807,8 +798,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
emitCharacter(cc); emitCharacter(cc);
m_temporaryBuffer.append(cc); m_temporaryBuffer.append(cc);
} else { } else {
m_state = ScriptDataEscapedState; RECONSUME_IN(ScriptDataEscapedState);
continue;
} }
break; break;
} }
...@@ -860,8 +850,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -860,8 +850,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
m_temporaryBuffer.clear(); m_temporaryBuffer.clear();
m_state = ScriptDataDoubleEscapeEndState; m_state = ScriptDataDoubleEscapeEndState;
} else { } else {
m_state = ScriptDataDoubleEscapedState; RECONSUME_IN(ScriptDataDoubleEscapedState);
continue;
} }
break; break;
} }
...@@ -879,8 +868,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -879,8 +868,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
emitCharacter(cc); emitCharacter(cc);
m_temporaryBuffer.append(cc); m_temporaryBuffer.append(cc);
} else { } else {
m_state = ScriptDataDoubleEscapedState; RECONSUME_IN(ScriptDataDoubleEscapedState);
continue;
} }
break; break;
} }
...@@ -957,8 +945,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -957,8 +945,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
else if (cc == '"') else if (cc == '"')
m_state = AttributeValueDoubleQuotedState; m_state = AttributeValueDoubleQuotedState;
else if (cc == '&') { else if (cc == '&') {
m_state = AttributeValueUnquotedState; RECONSUME_IN(AttributeValueUnquotedState);
continue;
} else if (cc == '\'') } else if (cc == '\'')
m_state = AttributeValueSingleQuotedState; m_state = AttributeValueSingleQuotedState;
else if (cc == '>') { else if (cc == '>') {
...@@ -1049,8 +1036,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -1049,8 +1036,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
m_state = DataState; m_state = DataState;
} else { } else {
emitParseError(); emitParseError();
m_state = BeforeAttributeNameState; RECONSUME_IN(BeforeAttributeNameState);
continue;
} }
// FIXME: Handle EOF properly. // FIXME: Handle EOF properly.
break; break;
...@@ -1062,8 +1048,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -1062,8 +1048,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
m_state = DataState; m_state = DataState;
} else { } else {
emitParseError(); emitParseError();
m_state = BeforeAttributeNameState; RECONSUME_IN(BeforeAttributeNameState);
continue;
} }
// FIXME: Handle EOF properly. // FIXME: Handle EOF properly.
break; break;
...@@ -1093,16 +1078,14 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -1093,16 +1078,14 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
source.advanceAndASSERT('-'); source.advanceAndASSERT('-');
source.advanceAndASSERT('-'); source.advanceAndASSERT('-');
m_token->beginComment(); m_token->beginComment();
m_state = CommentStartState; RECONSUME_IN(CommentStartState);
continue;
} else if (result == SegmentedString::NotEnoughCharacters) } else if (result == SegmentedString::NotEnoughCharacters)
return shouldEmitBufferedCharacterToken(source); return shouldEmitBufferedCharacterToken(source);
} else if (cc == 'D' || cc == 'd') { } else if (cc == 'D' || cc == 'd') {
SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString); SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString);
if (result == SegmentedString::DidMatch) { if (result == SegmentedString::DidMatch) {
advanceStringAndASSERTIgnoringCase(source, "doctype"); advanceStringAndASSERTIgnoringCase(source, "doctype");
m_state = DOCTYPEState; RECONSUME_IN(DOCTYPEState);
continue;
} else if (result == SegmentedString::NotEnoughCharacters) } else if (result == SegmentedString::NotEnoughCharacters)
return shouldEmitBufferedCharacterToken(source); return shouldEmitBufferedCharacterToken(source);
} }
...@@ -1110,8 +1093,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -1110,8 +1093,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
// FIXME: We're still missing the bits about the insertion mode being in foreign content: // FIXME: We're still missing the bits about the insertion mode being in foreign content:
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#markup-declaration-open-state // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#markup-declaration-open-state
emitParseError(); emitParseError();
m_state = BogusCommentState; RECONSUME_IN(BogusCommentState);
continue;
} }
case CommentStartState: { case CommentStartState: {
if (cc == '-') if (cc == '-')
...@@ -1227,8 +1209,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -1227,8 +1209,7 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
m_state = BeforeDOCTYPENameState; m_state = BeforeDOCTYPENameState;
else { else {
emitParseError(); emitParseError();
m_state = BeforeDOCTYPENameState; RECONSUME_IN(BeforeDOCTYPENameState);
continue;
} }
// FIXME: Handle EOF properly. // FIXME: Handle EOF properly.
break; break;
...@@ -1278,16 +1259,14 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token) ...@@ -1278,16 +1259,14 @@ bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString); SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString);
if (result == SegmentedString::DidMatch) { if (result == SegmentedString::DidMatch) {
advanceStringAndASSERTIgnoringCase(source, "public"); advanceStringAndASSERTIgnoringCase(source, "public");
m_state = AfterDOCTYPEPublicKeywordState; RECONSUME_IN(AfterDOCTYPEPublicKeywordState);
continue;
} else if (result == SegmentedString::NotEnoughCharacters) } else if (result == SegmentedString::NotEnoughCharacters)
return shouldEmitBufferedCharacterToken(source); return shouldEmitBufferedCharacterToken(source);
} else if (cc == 'S' || cc == 's') { } else if (cc == 'S' || cc == 's') {
SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString); SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString);
if (result == SegmentedString::DidMatch) { if (result == SegmentedString::DidMatch) {
advanceStringAndASSERTIgnoringCase(source, "system"); advanceStringAndASSERTIgnoringCase(source, "system");
m_state = AfterDOCTYPESystemKeywordState; RECONSUME_IN(AfterDOCTYPESystemKeywordState);
continue;
} else if (result == SegmentedString::NotEnoughCharacters) } else if (result == SegmentedString::NotEnoughCharacters)
return shouldEmitBufferedCharacterToken(source); return shouldEmitBufferedCharacterToken(source);
} }
...@@ -1595,3 +1574,4 @@ inline bool HTML5Lexer::shouldEmitBufferedCharacterToken(const SegmentedString& ...@@ -1595,3 +1574,4 @@ inline bool HTML5Lexer::shouldEmitBufferedCharacterToken(const SegmentedString&
} }
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment