/*
 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

28
#include "config.h"
29
#include "HTML5Lexer.h"
30
31

#include "AtomicString.h"
32
#include "HTML5Token.h"
33
#include "HTMLNames.h"
34
#include "NotImplemented.h"
35
#include <wtf/CurrentTime.h>
36
37
#include <wtf/UnusedParam.h>
#include <wtf/text/CString.h>
38
39
#include <wtf/unicode/Unicode.h>

40

41
42
43
44
45
46
47
48
49
50
51
52
53
54
// Use __GNUC__ instead of PLATFORM(GCC) to stay consistent with the gperf generated c file
#ifdef __GNUC__
// The main tokenizer includes this too so we are getting two copies of the data. However, this way the code gets inlined.
#include "HTMLEntityNames.c"
#else
// Not inlined for non-GCC compilers
// One entry of the HTML entity table: the entity's name and the code it
// stands for.
// NOTE(review): presumably this layout has to match the struct used by the
// gperf-generated HTMLEntityNames.c that the GCC branch includes - confirm.
struct Entity {
    const char* name;
    int code;
};
// Looks up an entity by name. Callers in this file pass a character buffer
// plus its length and treat a null result as "no such entity". Only declared
// here; the definition is expected to come from elsewhere in the build.
// The 'register' specifiers were dropped: they are not part of the function's
// type, and the keyword is no longer a valid storage class in C++17.
const struct Entity* findEntity(const char* str, unsigned int len);
#endif

using namespace WTF;
55
56

namespace WebCore {
57

58
using namespace HTMLNames;
59

60
61
namespace {

62
63
64
65
66
67
68
// Replacement values for the 32 code points 0x80-0x9F, indexed by
// (value - 0x80); adjustEntity() is the consumer. Entries such as 0x0081 or
// 0x008D map a code point to itself. The trailing comment on each row gives
// the range of input values that row covers.
static const UChar windowsLatin1ExtensionArray[32] = {
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
};

69
70
71
72
73
74
75
// Converts an ASCII upper-case letter to its lower-case counterpart.
// The caller must pass a character in 'A'..'Z'; this is only asserted.
inline UChar toLowerCase(UChar cc)
{
    ASSERT('A' <= cc && cc <= 'Z');
    // Upper- and lower-case ASCII letters are a fixed distance (0x20) apart.
    return cc + ('a' - 'A');
}

76
77
78
79
80
81
// Walks the NUL-terminated |expectedCharacters| and, for each character in
// turn, calls source.advanceAndASSERTIgnoringCase() with it.
inline void advanceStringAndASSERTIgnoringCase(SegmentedString& source, const char* expectedCharacters)
{
    for (const char* current = expectedCharacters; *current; ++current)
        source.advanceAndASSERTIgnoringCase(*current);
}

82
83
84
85
86
87
88
89
90
91
// Returns true when |vector| and |string| contain the same sequence of
// UChars (same length, same code units).
inline bool vectorEqualsString(const Vector<UChar, 32>& vector, const String& string)
{
    const size_t length = vector.size();
    if (length != string.length())
        return false;
    // Two empty sequences are equal. Returning here also keeps memcmp from
    // being handed a pointer that might be null, which is undefined even for
    // a zero-byte comparison.
    // NOTE(review): assumes String::characters() can be null for a null
    // String - confirm.
    if (!length)
        return true;
    // FIXME: Is there a higher-level function we should be calling here?
    return !memcmp(string.characters(), vector.data(), length * sizeof(UChar));
}

92
93
94
95
96
97
98
99
// Values in 0x80..0x9F are replaced by their entry in
// windowsLatin1ExtensionArray; every other value is returned as is (narrowed
// to UChar by the return type).
inline UChar adjustEntity(unsigned value)
{
    const bool inExtensionRange = value >= 0x80 && value <= 0x9F;
    if (inExtensionRange)
        return windowsLatin1ExtensionArray[value - 0x80];
    return value;
}

inline unsigned legalEntityFor(unsigned value)
100
{
101
    // FIXME: A number of specific entity values generate parse errors.
102
103
    if (value == 0 || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
        return 0xFFFD;
104
105
    if (value < 0xFFFF)
        return adjustEntity(value);
106
107
108
109
110
111
112
113
114
115
116
117
118
    return value;
}

// True for the ASCII characters 0-9, a-f and A-F.
inline bool isHexDigit(UChar cc)
{
    if (cc >= '0' && cc <= '9')
        return true;
    if (cc >= 'a' && cc <= 'f')
        return true;
    return cc >= 'A' && cc <= 'F';
}

// True for the ASCII characters 0-9, a-z and A-Z.
inline bool isAlphaNumeric(UChar cc)
{
    if (cc >= '0' && cc <= '9')
        return true;
    if (cc >= 'a' && cc <= 'z')
        return true;
    return cc >= 'A' && cc <= 'Z';
}

119
// Hands the characters in |consumedCharacters| back to |source| after an
// entity parse was abandoned. One or two characters are returned with
// source.push(), in the order they were consumed; any other count (including
// zero) is returned by prepending a SegmentedString built from the buffer.
void unconsumeCharacters(SegmentedString& source, const Vector<UChar, 10>& consumedCharacters)
{
    if (consumedCharacters.size() == 1)
        source.push(consumedCharacters[0]);
    else if (consumedCharacters.size() == 2) {
        source.push(consumedCharacters[0]);
        source.push(consumedCharacters[1]);
    } else
        source.prepend(SegmentedString(String(consumedCharacters.data(), consumedCharacters.size())));
}

130
131
132
133
134
135
136
137
138
139
140
141
// True when |state| is one of the "end tag open" / "end tag name" states of
// RCDATA, RAWTEXT, script data or escaped script data.
inline bool isEndTagBufferingState(HTML5Lexer::State state)
{
    switch (state) {
    case HTML5Lexer::RCDATAEndTagOpenState:
    case HTML5Lexer::RCDATAEndTagNameState:
    case HTML5Lexer::RAWTEXTEndTagOpenState:
    case HTML5Lexer::RAWTEXTEndTagNameState:
    case HTML5Lexer::ScriptDataEndTagOpenState:
    case HTML5Lexer::ScriptDataEndTagNameState:
    case HTML5Lexer::ScriptDataEscapedEndTagOpenState:
    case HTML5Lexer::ScriptDataEscapedEndTagNameState:
        return true;
    default:
        return false;
    }
}

142
143
}

144
// Constructs a lexer; all state that reset() covers is initialized by
// delegating to it.
HTML5Lexer::HTML5Lexer()
{
    reset();
}
148

149
// Nothing to release explicitly.
HTML5Lexer::~HTML5Lexer()
{
}
152

153
void HTML5Lexer::reset()
154
{
155
    m_state = DataState;
156
    m_token = 0;
157
    m_lineNumber = 0;
158
    m_skipLeadingNewLineForListing = false;
159
    m_emitPending = false;
160
    m_additionalAllowedCharacter = '\0';
161
}
162

163
// Tries to consume a character reference from |source|; the caller has
// already consumed the leading '&'.
//
// Returns the code for the reference, or 0 when no reference was consumed
// (processEntity() then emits a literal '&'). |notEnoughCharacters| must be
// false on entry; it is set to true when |source| ran out before a decision
// could be made, in which case everything consumed so far is handed back to
// |source| and 0 is returned.
//
// Numeric references are accumulated with saturation: once the running value
// exceeds 0x10FFFF it is pinned just above that limit, so an arbitrarily long
// digit sequence can never wrap around 'unsigned' into a valid-looking code
// point. legalEntityFor() turns the pinned value into U+FFFD.
unsigned HTML5Lexer::consumeEntity(SegmentedString& source, bool& notEnoughCharacters)
{
    ASSERT(m_state != CharacterReferenceInAttributeValueState || m_additionalAllowedCharacter == '"' || m_additionalAllowedCharacter == '\'' || m_additionalAllowedCharacter == '>');
    ASSERT(!notEnoughCharacters);

    enum EntityState {
        Initial,
        NumberType,
        MaybeHexLowerCaseX,
        MaybeHexUpperCaseX,
        Hex,
        Decimal,
        Named
    };
    // Largest value legalEntityFor() accepts; anything above it is replaced.
    const unsigned maximumCodePoint = 0x10FFFF;
    EntityState entityState = Initial;
    unsigned result = 0;
    Vector<UChar, 10> consumedCharacters;
    Vector<char, 10> entityName;

    while (!source.isEmpty()) {
        UChar cc = *source;
        // Inside the switch, 'break' consumes cc (see the bottom of the loop)
        // while 'continue' re-examines the same character in the new state.
        switch (entityState) {
        case Initial: {
            if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
                return 0;
            if (m_state == CharacterReferenceInAttributeValueState && cc == m_additionalAllowedCharacter)
                return 0;
            if (cc == '#') {
                entityState = NumberType;
                break;
            }
            if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
                entityState = Named;
                continue;
            }
            return 0;
        }
        case NumberType: {
            if (cc == 'x') {
                entityState = MaybeHexLowerCaseX;
                break;
            }
            if (cc == 'X') {
                entityState = MaybeHexUpperCaseX;
                break;
            }
            if (cc >= '0' && cc <= '9') {
                entityState = Decimal;
                continue;
            }
            // Not a number after all: give the '#' back.
            source.push('#');
            return 0;
        }
        case MaybeHexLowerCaseX: {
            if (isHexDigit(cc)) {
                entityState = Hex;
                continue;
            }
            source.push('#');
            source.push('x');
            return 0;
        }
        case MaybeHexUpperCaseX: {
            if (isHexDigit(cc)) {
                entityState = Hex;
                continue;
            }
            source.push('#');
            source.push('X');
            return 0;
        }
        case Hex: {
            unsigned digit;
            if (cc >= '0' && cc <= '9')
                digit = cc - '0';
            else if (cc >= 'a' && cc <= 'f')
                digit = 10 + cc - 'a';
            else if (cc >= 'A' && cc <= 'F')
                digit = 10 + cc - 'A';
            else {
                // The reference ends here; a terminating ';' is part of it,
                // any other character is left in the source.
                if (cc == ';')
                    source.advancePastNonNewline();
                return legalEntityFor(result);
            }
            result = result * 16 + digit;
            if (result > maximumCodePoint)
                result = maximumCodePoint + 1;
            break;
        }
        case Decimal: {
            if (cc < '0' || cc > '9') {
                // The reference ends here; a terminating ';' is part of it,
                // any other character is left in the source.
                if (cc == ';')
                    source.advancePastNonNewline();
                return legalEntityFor(result);
            }
            result = result * 10 + (cc - '0');
            if (result > maximumCodePoint)
                result = maximumCodePoint + 1;
            break;
        }
        case Named: {
            // FIXME: This code is wrong. We need to find the longest matching entity.
            //        The examples from the spec are:
            //            I'm &notit; I tell you
            //            I'm &notin; I tell you
            //        In the first case, "&not" is the entity.  In the second
            //        case, "&notin;" is the entity.
            // FIXME: Our list of HTML entities is incomplete.
            // FIXME: The number 8 below is bogus.
            while (!source.isEmpty() && entityName.size() <= 8) {
                cc = *source;
                if (cc == ';') {
                    const Entity* entity = findEntity(entityName.data(), entityName.size());
                    if (entity) {
                        source.advanceAndASSERT(';');
                        return entity->code;
                    }
                    emitParseError();
                    break;
                }
                if (!isAlphaNumeric(cc)) {
                    const Entity* entity = findEntity(entityName.data(), entityName.size());
                    if (entity) {
                        // HTML5 tells us to ignore this entity, for historical reasons,
                        // if the lookhead character is '='.
                        if (m_state == CharacterReferenceInAttributeValueState && cc == '=')
                            break;
                        emitParseError();
                        return entity->code;
                    }
                    break;
                }
                entityName.append(cc);
                consumedCharacters.append(cc);
                source.advanceAndASSERT(cc);
            }
            // No entity was recognized: report whether that was for lack of
            // input and hand back whatever was consumed.
            notEnoughCharacters = source.isEmpty();
            unconsumeCharacters(source, consumedCharacters);
            return 0;
        }
        }
        consumedCharacters.append(cc);
        source.advanceAndASSERT(cc);
    }
    // The input ended in the middle of a potential reference.
    ASSERT(source.isEmpty());
    notEnoughCharacters = true;
    unconsumeCharacters(source, consumedCharacters);
    return 0;
}

308
309
310
311
312
313
314
315
316
317
318
319
320
// Runs consumeEntity() on |source| and emits the outcome. Returns false,
// emitting nothing, when consumeEntity() reported that it needs more input.
// Otherwise returns true after emitting either the decoded code point or,
// when consumeEntity() returned 0, a literal '&'.
inline bool HTML5Lexer::processEntity(SegmentedString& source)
{
    bool needMoreInput = false;
    const unsigned codePoint = consumeEntity(source, needMoreInput);
    if (needMoreInput)
        return false;

    if (codePoint)
        emitCodePoint(codePoint);
    else
        emitCharacter('&');
    return true;
}

321
322
323
// BEGIN_STATE opens the handler for one lexer state as a 'case' label of the
// enclosing switch. END_STATE closes it: control is not expected to arrive
// there (handlers leave through a transition macro or a return), so it
// asserts and then breaks out of the switch.
#define BEGIN_STATE(stateName) case stateName:
#define END_STATE() ASSERT_NOT_REACHED(); break;

324
325
326
327
// Emits the token under construction via emitCurrentToken(), records
// |stateName| as the state to resume in, and jumps to breakLabel, which the
// function expanding the macro must define.
// The state stored is the macro argument; it was previously hard-coded to
// DataState regardless of what the caller asked for.
#define EMIT_AND_RESUME_IN(stateName)                                       \
    do {                                                                    \
        emitCurrentToken();                                                 \
        m_state = stateName;                                                \
        goto breakLabel;                                                    \
    } while (false)

331
// Switches the lexer to |stateName| and jumps to breakLabel, which the
// function expanding the macro must define.
// NOTE(review): breakLabel presumably consumes the current character before
// the next loop iteration - confirm against the label's definition.
#define ADVANCE_TO(stateName)                                               \
    do {                                                                    \
        m_state = stateName;                                                \
        goto breakLabel;                                                    \
    } while (false)
336

337
// Switches the lexer to |stateName| and jumps to continueLabel, which the
// function expanding the macro must define.
// NOTE(review): continueLabel presumably re-runs the state machine on the
// same character without consuming it - confirm against the label's
// definition.
#define RECONSUME_IN(stateName)                                             \
    do {                                                                    \
        m_state = stateName;                                                \
        goto continueLabel;                                                 \
    } while (false)
342

343
// Switches the lexer to |stateName|, then calls maybeFlushBufferedEndTag()
// (so the flush already observes the new state) and jumps to breakLabel,
// which the function expanding the macro must define.
#define FLUSH_EMIT_AND_RESUME_IN(stateName)                                 \
    do {                                                                    \
        m_state = stateName;                                                \
        maybeFlushBufferedEndTag();                                         \
        goto breakLabel;                                                    \
    } while (false)
349

350
351
352
// When we move away from using a jump table, these macros will be different.
// For now FLUSH_AND_ADVANCE_TO simply expands to FLUSH_EMIT_AND_RESUME_IN
// with the same state argument.
#define FLUSH_AND_ADVANCE_TO(stateName) FLUSH_EMIT_AND_RESUME_IN(stateName)

353
bool HTML5Lexer::nextToken(SegmentedString& source, HTML5Token& token)
354
{
355
356
357
    // If we have a token in progress, then we're supposed to be called back
    // with the same token so we can finish it.
    ASSERT(!m_token || m_token == &token || token.type() == HTML5Token::Uninitialized);
358
    m_token = &token;
359

360
    if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) {
361
        // FIXME: This should call flushBufferedEndTag().
362
363
364
365
366
367
368
369
370
        // We started an end tag during our last iteration.
        m_token->beginEndTag(m_bufferedEndTagName);
        m_bufferedEndTagName.clear();
        if (m_state == DataState) {
            // We're back in the data state, so we must be done with the tag.
            return true;
        }
    }

371
372
373
374
375
    // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
    if (m_skipLeadingNewLineForListing && m_state == DataState && !source.isEmpty() && *source == '\x0A')
        source.advanceAndASSERT('\x0A');
    m_skipLeadingNewLineForListing = false;

376
    // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
377
378
    // FIXME: This while should stop as soon as we have a token to return.
    while (!source.isEmpty()) {
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
    // FIXME: This is a purposeful style violation because this while loop is
    // going to be removed soon.

    UChar cc = *source;
    switch (m_state) {
    BEGIN_STATE(DataState) {
        if (cc == '&')
            ADVANCE_TO(CharacterReferenceInDataState);
        else if (cc == '<') {
            if (m_token->type() == HTML5Token::Character) {
                // We have a bunch of character tokens queued up that we
                // are emitting lazily here.
                return true;
            }
            ADVANCE_TO(TagOpenState);
394
        } else {
395
            emitCharacter(cc);
396
397
            ADVANCE_TO(DataState);
        }
398
399
    }
    END_STATE()
400

401
402
403
404
405
406
407
408
409
410
411
412
    BEGIN_STATE(CharacterReferenceInDataState) {
        if (!processEntity(source))
            return shouldEmitBufferedCharacterToken(source);
        RECONSUME_IN(DataState);
    }
    END_STATE()

    BEGIN_STATE(RCDATAState) {
        if (cc == '&')
            ADVANCE_TO(CharacterReferenceInRCDATAState);
        else if (cc == '<')
            ADVANCE_TO(RCDATALessThanSignState);
413
        else {
414
            emitCharacter(cc);
415
416
            ADVANCE_TO(RCDATAState);
        }
417
418
    }
    END_STATE()
419

420
421
422
423
424
425
    BEGIN_STATE(CharacterReferenceInRCDATAState) {
        if (!processEntity(source))
            return shouldEmitBufferedCharacterToken(source);
        RECONSUME_IN(RCDATAState);
    }
    END_STATE()
426

427
428
429
    BEGIN_STATE(RAWTEXTState) {
        if (cc == '<')
            ADVANCE_TO(RAWTEXTLessThanSignState);
430
        else {
431
            emitCharacter(cc);
432
433
            ADVANCE_TO(RAWTEXTState);
        }
434
435
    }
    END_STATE()
436

437
438
439
    BEGIN_STATE(ScriptDataState) {
        if (cc == '<')
            ADVANCE_TO(ScriptDataLessThanSignState);
440
        else {
441
            emitCharacter(cc);
442
443
            ADVANCE_TO(ScriptDataState);
        }
444
445
446
447
448
    }
    END_STATE()

    BEGIN_STATE(PLAINTEXTState) {
        emitCharacter(cc);
449
        ADVANCE_TO(PLAINTEXTState);
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
    }
    END_STATE()

    BEGIN_STATE(TagOpenState) {
        if (cc == '!')
            ADVANCE_TO(MarkupDeclarationOpenState);
        else if (cc == '/')
            ADVANCE_TO(EndTagOpenState);
        else if (cc >= 'A' && cc <= 'Z') {
            m_token->beginStartTag(toLowerCase(cc));
            ADVANCE_TO(TagNameState);
        } else if (cc >= 'a' && cc <= 'z') {
            m_token->beginStartTag(cc);
            ADVANCE_TO(TagNameState);
        } else if (cc == '?') {
            emitParseError();
            // The spec consumes the current character before switching
            // to the bogus comment state, but it's easier to implement
            // if we reconsume the current character.
            RECONSUME_IN(BogusCommentState);
        } else {
            emitParseError();
            emitCharacter('<');
            RECONSUME_IN(DataState);
474
        }
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
    }
    END_STATE()

    BEGIN_STATE(EndTagOpenState) {
        if (cc >= 'A' && cc <= 'Z') {
            m_token->beginEndTag(toLowerCase(cc));
            ADVANCE_TO(TagNameState);
        } else if (cc >= 'a' && cc <= 'z') {
            m_token->beginEndTag(cc);
            ADVANCE_TO(TagNameState);
        } else if (cc == '>') {
            emitParseError();
            ADVANCE_TO(DataState);
        } else {
            emitParseError();
            RECONSUME_IN(BogusCommentState);
491
        }
492
493
494
495
496
497
498
499
500
501
502
        // FIXME: Handle EOF properly.
    }
    END_STATE()

    BEGIN_STATE(TagNameState) {
        if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
            ADVANCE_TO(BeforeAttributeNameState);
        else if (cc == '/')
            ADVANCE_TO(SelfClosingStartTagState);
        else if (cc == '>') {
            EMIT_AND_RESUME_IN(DataState);
503
        } else if (cc >= 'A' && cc <= 'Z') {
504
            m_token->appendToName(toLowerCase(cc));
505
506
            ADVANCE_TO(TagNameState);
        } else {
507
            m_token->appendToName(cc);
508
509
            ADVANCE_TO(TagNameState);
        }
510
511
512
513
514
515
516
517
518
519
520
521
        // FIXME: Handle EOF properly.
    }
    END_STATE()

    BEGIN_STATE(RCDATALessThanSignState) {
        if (cc == '/') {
            m_temporaryBuffer.clear();
            ASSERT(m_bufferedEndTagName.isEmpty());
            ADVANCE_TO(RCDATAEndTagOpenState);
        } else {
            emitCharacter('<');
            RECONSUME_IN(RCDATAState);
522
        }
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
    }
    END_STATE()

    BEGIN_STATE(RCDATAEndTagOpenState) {
        if (cc >= 'A' && cc <= 'Z') {
            m_temporaryBuffer.append(cc);
            addToPossibleEndTag(toLowerCase(cc));
            ADVANCE_TO(RCDATAEndTagNameState);
        } else if (cc >= 'a' && cc <= 'z') {
            m_temporaryBuffer.append(cc);
            addToPossibleEndTag(cc);
            ADVANCE_TO(RCDATAEndTagNameState);
        } else {
            emitCharacter('<');
            emitCharacter('/');
            RECONSUME_IN(RCDATAState);
539
        }
540
541
542
543
544
545
546
    }
    END_STATE()

    BEGIN_STATE(RCDATAEndTagNameState) {
        if (cc >= 'A' && cc <= 'Z') {
            m_temporaryBuffer.append(cc);
            addToPossibleEndTag(toLowerCase(cc));
547
            ADVANCE_TO(RCDATAEndTagNameState);
548
549
550
        } else if (cc >= 'a' && cc <= 'z') {
            m_temporaryBuffer.append(cc);
            addToPossibleEndTag(cc);
551
            ADVANCE_TO(RCDATAEndTagNameState);
552
553
        } else {
            if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
554
                if (isAppropriateEndTag())
555
556
                    FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
            } else if (cc == '/') {
557
                if (isAppropriateEndTag())
558
559
                    FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
            } else if (cc == '>') {
560
                if (isAppropriateEndTag())
561
                    FLUSH_EMIT_AND_RESUME_IN(DataState);
562
            }
563
564
565
566
567
            emitCharacter('<');
            emitCharacter('/');
            m_token->appendToCharacter(m_temporaryBuffer);
            m_bufferedEndTagName.clear();
            RECONSUME_IN(RCDATAState);
568
        }
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
    }
    END_STATE()

    BEGIN_STATE(RAWTEXTLessThanSignState) {
        if (cc == '/') {
            m_temporaryBuffer.clear();
            ASSERT(m_bufferedEndTagName.isEmpty());
            ADVANCE_TO(RAWTEXTEndTagOpenState);
        } else {
            emitCharacter('<');
            RECONSUME_IN(RAWTEXTState);
        }
    }
    END_STATE()

    BEGIN_STATE(RAWTEXTEndTagOpenState) {
        if (cc >= 'A' && cc <= 'Z') {
            m_temporaryBuffer.append(cc);
            addToPossibleEndTag(toLowerCase(cc));
            ADVANCE_TO(RAWTEXTEndTagNameState);
        } else if (cc >= 'a' && cc <= 'z') {
            m_temporaryBuffer.append(cc);
            addToPossibleEndTag(cc);
            ADVANCE_TO(RAWTEXTEndTagNameState);
        } else {
            emitCharacter('<');
            emitCharacter('/');
            RECONSUME_IN(RAWTEXTState);
        }
    }
    END_STATE()

    BEGIN_STATE(RAWTEXTEndTagNameState) {
        if (cc >= 'A' && cc <= 'Z') {
            m_temporaryBuffer.append(cc);
            addToPossibleEndTag(toLowerCase(cc));
605
            ADVANCE_TO(RAWTEXTEndTagNameState);
606
607
608
        } else if (cc >= 'a' && cc <= 'z') {
            m_temporaryBuffer.append(cc);
            addToPossibleEndTag(cc);
609
            ADVANCE_TO(RAWTEXTEndTagNameState);
610
611
        } else {
            if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
612
                if (isAppropriateEndTag())
613
614
                    FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
            } else if (cc == '/') {
615
                if (isAppropriateEndTag())
616
                    FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
617
            } else if (cc == '>') {
618
                if (isAppropriateEndTag())
619
                    FLUSH_EMIT_AND_RESUME_IN(DataState);
620
            }
621
622
623
624
625
            emitCharacter('<');
            emitCharacter('/');
            m_token->appendToCharacter(m_temporaryBuffer);
            m_bufferedEndTagName.clear();
            RECONSUME_IN(RAWTEXTState);
626
        }
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
    }
    END_STATE()

    BEGIN_STATE(ScriptDataLessThanSignState) {
        if (cc == '/') {
            m_temporaryBuffer.clear();
            ASSERT(m_bufferedEndTagName.isEmpty());
            ADVANCE_TO(ScriptDataEndTagOpenState);
        } else if (cc == '!') {
            emitCharacter('<');
            emitCharacter('!');
            ADVANCE_TO(ScriptDataEscapeStartState);
        } else {
            emitCharacter('<');
            RECONSUME_IN(ScriptDataState);
        }
    }
    END_STATE()

    BEGIN_STATE(ScriptDataEndTagOpenState) {
        if (cc >= 'A' && cc <= 'Z') {
            m_temporaryBuffer.append(cc);
            addToPossibleEndTag(toLowerCase(cc));
            ADVANCE_TO(ScriptDataEndTagNameState);
        } else if (cc >= 'a' && cc <= 'z') {
            m_temporaryBuffer.append(cc);
            addToPossibleEndTag(cc);
            ADVANCE_TO(ScriptDataEndTagNameState);
        } else {
            emitCharacter('<');
            emitCharacter('/');
            RECONSUME_IN(ScriptDataState);
        }
    }
    END_STATE()

    BEGIN_STATE(ScriptDataEndTagNameState) {
        if (cc >= 'A' && cc <= 'Z') {
            m_temporaryBuffer.append(cc);
            addToPossibleEndTag(toLowerCase(cc));
667
            ADVANCE_TO(ScriptDataEndTagNameState);
668
669
670
        } else if (cc >= 'a' && cc <= 'z') {
            m_temporaryBuffer.append(cc);
            addToPossibleEndTag(cc);
671
            ADVANCE_TO(ScriptDataEndTagNameState);
672
673
        } else {
            if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
674
                if (isAppropriateEndTag())
675
676
                    FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
            } else if (cc == '/') {
677
                if (isAppropriateEndTag())
678
679
                    FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
            } else if (cc == '>') {
680
                if (isAppropriateEndTag())
681
                    FLUSH_EMIT_AND_RESUME_IN(DataState);
682
            }
683
684
685
686
687
            emitCharacter('<');
            emitCharacter('/');
            m_token->appendToCharacter(m_temporaryBuffer);
            m_bufferedEndTagName.clear();
            RECONSUME_IN(ScriptDataState);
688
        }
689
690
    }
    END_STATE()
691

692
693
694
695
    BEGIN_STATE(ScriptDataEscapeStartState) {
        if (cc == '-') {
            emitCharacter(cc);
            ADVANCE_TO(ScriptDataEscapeStartDashState);
696
        } else
697
698
699
            RECONSUME_IN(ScriptDataState);
    }
    END_STATE()
700

701
702
703
704
    BEGIN_STATE(ScriptDataEscapeStartDashState) {
        if (cc == '-') {
            emitCharacter(cc);
            ADVANCE_TO(ScriptDataEscapedDashDashState);
705
        } else
706
707
708
            RECONSUME_IN(ScriptDataState);
    }
    END_STATE()
709

710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
    BEGIN_STATE(ScriptDataEscapedState) {
        if (cc == '-') {
            emitCharacter(cc);
            ADVANCE_TO(ScriptDataEscapedDashState);
        } else if (cc == '<')
            ADVANCE_TO(ScriptDataEscapedLessThanSignState);
        else
            emitCharacter(cc);
        // FIXME: Handle EOF properly.
        break;
    }
    END_STATE()

    BEGIN_STATE(ScriptDataEscapedDashState) {
        if (cc == '-') {
            emitCharacter(cc);
            ADVANCE_TO(ScriptDataEscapedDashDashState);
        } else if (cc == '<')
            ADVANCE_TO(ScriptDataEscapedLessThanSignState);
        else {
            emitCharacter(cc);
            ADVANCE_TO(ScriptDataEscapedState);
732
        }
733
734
735
736
737
        // FIXME: Handle EOF properly.
    }
    END_STATE()

    BEGIN_STATE(ScriptDataEscapedDashDashState) {
738
        if (cc == '-') {
739
            emitCharacter(cc);
740
741
            ADVANCE_TO(ScriptDataEscapedDashDashState);
        } else if (cc == '<')
742
743
744
745
746
747
748
            ADVANCE_TO(ScriptDataEscapedLessThanSignState);
        else if (cc == '>') {
            emitCharacter(cc);
            ADVANCE_TO(ScriptDataState);
        } else {
            emitCharacter(cc);
            ADVANCE_TO(ScriptDataEscapedState);
749
        }
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
        // FIXME: Handle EOF properly.
    }
    END_STATE()

    BEGIN_STATE(ScriptDataEscapedLessThanSignState) {
        if (cc == '/') {
            m_temporaryBuffer.clear();
            ASSERT(m_bufferedEndTagName.isEmpty());
            ADVANCE_TO(ScriptDataEscapedEndTagOpenState);
        } else if (cc >= 'A' && cc <= 'Z') {
            emitCharacter('<');
            emitCharacter(cc);
            m_temporaryBuffer.clear();
            m_temporaryBuffer.append(toLowerCase(cc));
            ADVANCE_TO(ScriptDataDoubleEscapeStartState);
        } else if (cc >= 'a' && cc <= 'z') {
            emitCharacter('<');
            emitCharacter(cc);
            m_temporaryBuffer.clear();
            m_temporaryBuffer.append(cc);
            ADVANCE_TO(ScriptDataDoubleEscapeStartState);
        } else {
            emitCharacter('<');
            RECONSUME_IN(ScriptDataEscapedState);
        }
    }
    END_STATE()

    BEGIN_STATE(ScriptDataEscapedEndTagOpenState) {
        if (cc >= 'A' && cc <= 'Z') {
            m_temporaryBuffer.append(cc);
            addToPossibleEndTag(toLowerCase(cc));
            ADVANCE_TO(ScriptDataEscapedEndTagNameState);
        } else if (cc >= 'a' && cc <= 'z') {
            m_temporaryBuffer.append(cc);
            addToPossibleEndTag(cc);
            ADVANCE_TO(ScriptDataEscapedEndTagNameState);
        } else {
            emitCharacter('<');
            emitCharacter('/');
            RECONSUME_IN(ScriptDataEscapedState);
        }
    }
    END_STATE()

    BEGIN_STATE(ScriptDataEscapedEndTagNameState) {
        if (cc >= 'A' && cc <= 'Z') {
            m_temporaryBuffer.append(cc);
            addToPossibleEndTag(toLowerCase(cc));
799
            ADVANCE_TO(ScriptDataEscapedEndTagNameState);
800
801
802
        } else if (cc >= 'a' && cc <= 'z') {
            m_temporaryBuffer.append(cc);
            addToPossibleEndTag(cc);
803
            ADVANCE_TO(ScriptDataEscapedEndTagNameState);
804
805
        } else {
            if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ') {
806
                if (isAppropriateEndTag())
807
808
                    FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
            } else if (cc == '/') {
809
                if (isAppropriateEndTag())
810
                    FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
811
            } else if (cc == '>') {
812
                if (isAppropriateEndTag())
813
                    FLUSH_EMIT_AND_RESUME_IN(DataState);
814
            }
815
816
817
818
819
            emitCharacter('<');
            emitCharacter('/');
            m_token->appendToCharacter(m_temporaryBuffer);
            m_bufferedEndTagName.clear();
            RECONSUME_IN(ScriptDataEscapedState);
820
        }
821
822
    }
    END_STATE()
823

824
825
826
827
828
    // Script data double escape start state.
    //
    // ASCII letters are emitted and accumulated (lower-cased) in
    // m_temporaryBuffer. A whitespace, '/' or '>' delimiter is emitted and then
    // decides the next state: ScriptDataDoubleEscapedState when the buffer equals
    // scriptTag.localName(), ScriptDataEscapedState otherwise. Any other
    // character is reconsumed in ScriptDataEscapedState without being emitted.
    BEGIN_STATE(ScriptDataDoubleEscapeStartState) {
        if (cc >= 'a' && cc <= 'z') {
            emitCharacter(cc);
            m_temporaryBuffer.append(cc);
            ADVANCE_TO(ScriptDataDoubleEscapeStartState);
        } else if (cc >= 'A' && cc <= 'Z') {
            // The character is emitted as written, but buffered in lower case.
            emitCharacter(cc);
            m_temporaryBuffer.append(toLowerCase(cc));
            ADVANCE_TO(ScriptDataDoubleEscapeStartState);
        } else if (cc != '\x09' && cc != '\x0A' && cc != '\x0C' && cc != ' ' && cc != '/' && cc != '>') {
            // Neither a letter nor a delimiter.
            RECONSUME_IN(ScriptDataEscapedState);
        } else {
            // Delimiter reached: compare the buffered name against "script".
            emitCharacter(cc);
            if (temporaryBufferIs(scriptTag.localName()))
                ADVANCE_TO(ScriptDataDoubleEscapedState);
            else
                ADVANCE_TO(ScriptDataEscapedState);
        }
    }
    END_STATE()
844

845
846
847
848
849
850
851
    // Script data double escaped state.
    //
    // Every character is emitted unchanged; only the follow-up state depends on
    // it: '-' moves to the dash state, '<' to the less-than-sign state, and
    // anything else stays in this state.
    BEGIN_STATE(ScriptDataDoubleEscapedState) {
        emitCharacter(cc);
        if (cc == '-')
            ADVANCE_TO(ScriptDataDoubleEscapedDashState);
        else if (cc == '<')
            ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
        else
            ADVANCE_TO(ScriptDataDoubleEscapedState);
        // FIXME: Handle EOF properly.
    }
    END_STATE()
859

860
861
862
863
864
865
866
867
868
869
    // Script data double escaped dash state (one '-' already seen).
    //
    // The character is always emitted. A second '-' moves to the dash-dash
    // state, '<' to the less-than-sign state, and anything else returns to
    // ScriptDataDoubleEscapedState.
    BEGIN_STATE(ScriptDataDoubleEscapedDashState) {
        emitCharacter(cc);
        if (cc == '-')
            ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
        else if (cc == '<')
            ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
        else
            ADVANCE_TO(ScriptDataDoubleEscapedState);
        // FIXME: Handle EOF properly.
    }
    END_STATE()
874

875
    // Script data double escaped dash dash state ("--" already seen).
    //
    // The character is always emitted. Further '-' characters stay here, '<'
    // moves to the less-than-sign state, '>' leaves the escaped section for
    // ScriptDataState, and anything else returns to ScriptDataDoubleEscapedState.
    BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) {
        emitCharacter(cc);
        if (cc == '-')
            ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
        else if (cc == '<')
            ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
        else if (cc == '>')
            ADVANCE_TO(ScriptDataState);
        else
            ADVANCE_TO(ScriptDataDoubleEscapedState);
        // FIXME: Handle EOF properly.
    }
    END_STATE()
892

893
894
895
896
897
    // Script data double escaped less-than sign state ('<' already emitted).
    //
    // Only '/' is interesting: it is emitted, m_temporaryBuffer is reset, and
    // the lexer moves on to ScriptDataDoubleEscapeEndState. Any other character
    // is reconsumed in ScriptDataDoubleEscapedState.
    BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) {
        if (cc != '/')
            RECONSUME_IN(ScriptDataDoubleEscapedState);
        else {
            emitCharacter(cc);
            // Start collecting a fresh tag name for the double-escape-end check.
            m_temporaryBuffer.clear();
            ADVANCE_TO(ScriptDataDoubleEscapeEndState);
        }
    }
    END_STATE()
902

903
904
905
906
907
908
909
910
911
912
    // Script data double escape end state.
    //
    // ASCII letters are emitted and accumulated (lower-cased) in
    // m_temporaryBuffer. A whitespace, '/' or '>' delimiter is emitted and then
    // decides the next state: ScriptDataEscapedState when the buffer equals
    // scriptTag.localName(), ScriptDataDoubleEscapedState otherwise. Any other
    // character is reconsumed in ScriptDataDoubleEscapedState without being
    // emitted.
    BEGIN_STATE(ScriptDataDoubleEscapeEndState) {
        if (cc >= 'a' && cc <= 'z') {
            emitCharacter(cc);
            m_temporaryBuffer.append(cc);
            ADVANCE_TO(ScriptDataDoubleEscapeEndState);
        } else if (cc >= 'A' && cc <= 'Z') {
            // The character is emitted as written, but buffered in lower case.
            emitCharacter(cc);
            m_temporaryBuffer.append(toLowerCase(cc));
            ADVANCE_TO(ScriptDataDoubleEscapeEndState);
        } else if (cc != '\x09' && cc != '\x0A' && cc != '\x0C' && cc != ' ' && cc != '/' && cc != '>') {
            // Neither a letter nor a delimiter.
            RECONSUME_IN(ScriptDataDoubleEscapedState);
        } else {
            // Delimiter reached: a buffered "script" ends the double escape.
            emitCharacter(cc);
            if (temporaryBufferIs(scriptTag.localName()))
                ADVANCE_TO(ScriptDataEscapedState);
            else
                ADVANCE_TO(ScriptDataDoubleEscapedState);
        }
    }
    END_STATE()
922

923
924
    // Before attribute name state.
    //
    // Skips whitespace, hands '/' to SelfClosingStartTagState, and on '>' emits
    // the current token and resumes in DataState. Any other character starts a
    // new attribute on m_token whose name begins with that character
    // (lower-cased for A-Z); '"', '\'', '<' and '=' additionally raise a parse
    // error before the attribute is created.
    //
    // NOTE(review): the bare numeric lines in this block look like blame-view
    // line-number residue from extraction, not code.
    BEGIN_STATE(BeforeAttributeNameState) {
        if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
925
            ADVANCE_TO(BeforeAttributeNameState);
926
927
        else if (cc == '/')
            ADVANCE_TO(SelfClosingStartTagState);
928
        else if (cc == '>')
929
            EMIT_AND_RESUME_IN(DataState);
930
        else if (cc >= 'A' && cc <= 'Z') {
931
932
933
934
935
936
937
938
939
            // Attribute names are stored lower-cased.
            m_token->addNewAttribute();
            m_token->appendToAttributeName(toLowerCase(cc));
            ADVANCE_TO(AttributeNameState);
        } else {
            // These characters are flagged as a parse error but still become
            // the first character of the attribute name.
            if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
                emitParseError();
            m_token->addNewAttribute();
            m_token->appendToAttributeName(cc);
            ADVANCE_TO(AttributeNameState);
940
        }
941
942
943
944
945
946
947
948
949
950
951
        // FIXME: Handle EOF properly.
    }
    END_STATE()

    // Attribute name state.
    //
    // Whitespace ends the name (AfterAttributeNameState), '/' goes to
    // SelfClosingStartTagState, '=' to BeforeAttributeValueState, and '>' emits
    // the current token and resumes in DataState. Every other character is
    // appended to the current attribute name (lower-cased for A-Z); '"', '\''
    // and '<' additionally raise a parse error.
    BEGIN_STATE(AttributeNameState) {
        if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
            ADVANCE_TO(AfterAttributeNameState);
        else if (cc == '/')
            ADVANCE_TO(SelfClosingStartTagState);
        else if (cc == '=')
            ADVANCE_TO(BeforeAttributeValueState);
        else if (cc == '>')
            EMIT_AND_RESUME_IN(DataState);
        else if (cc >= 'A' && cc <= 'Z') {
            // Attribute names are stored lower-cased.
            m_token->appendToAttributeName(toLowerCase(cc));
            ADVANCE_TO(AttributeNameState);
        } else {
            // '=' is consumed by an earlier branch, so it can never reach this
            // check; only these three characters are parse errors here.
            if (cc == '"' || cc == '\'' || cc == '<')
                emitParseError();
            m_token->appendToAttributeName(cc);
            ADVANCE_TO(AttributeNameState);
        }
        // FIXME: Handle EOF properly.
    }
    END_STATE()
966

967
968
    // After attribute name state.
    //
    // Skips whitespace, hands '/' to SelfClosingStartTagState and '=' to
    // BeforeAttributeValueState, and on '>' emits the current token and resumes
    // in DataState. Any other character starts a new attribute on m_token whose
    // name begins with that character (lower-cased for A-Z); '"', '\'' and '<'
    // additionally raise a parse error before the attribute is created.
    BEGIN_STATE(AfterAttributeNameState) {
        if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
            ADVANCE_TO(AfterAttributeNameState);
        else if (cc == '/')
            ADVANCE_TO(SelfClosingStartTagState);
        else if (cc == '=')
            ADVANCE_TO(BeforeAttributeValueState);
        else if (cc == '>') {
            // '>' closes the tag, exactly as in BeforeAttributeNameState and
            // AttributeNameState. Testing '=' here instead would be unreachable
            // and would let '>' start a bogus attribute name.
            EMIT_AND_RESUME_IN(DataState);
        } else if (cc >= 'A' && cc <= 'Z') {
            // Attribute names are stored lower-cased.
            m_token->addNewAttribute();
            m_token->appendToAttributeName(toLowerCase(cc));
            ADVANCE_TO(AttributeNameState);
        } else {
            // Flagged as a parse error, but the character still becomes the
            // first character of the new attribute name.
            if (cc == '"' || cc == '\'' || cc == '<')
                emitParseError();
            m_token->addNewAttribute();
            m_token->appendToAttributeName(cc);
            ADVANCE_TO(AttributeNameState);
        }
        // FIXME: Handle EOF properly.
    }
    END_STATE()

    BEGIN_STATE(BeforeAttributeValueState) {
        if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ')
993
            ADVANCE_TO(BeforeAttributeValueState);
994
995
        else if (cc == '"')
            ADVANCE_TO(AttributeValueDoubleQuotedState);
996
        else if (cc == '&')
997
            RECONSUME_IN(AttributeValueUnquotedState);
998
        else if (cc == '\'')
999
1000
            ADVANCE_TO(AttributeValueSingleQuotedState);
        else if (cc == '>') {
For faster browsing, not all history is shown. View entire blame