// Copyright 2010 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package html import ( "bytes" "io" "io/ioutil" "reflect" "runtime" "strings" "testing" ) type tokenTest struct { // A short description of the test case. desc string // The HTML to parse. html string // The string representations of the expected tokens, joined by '$'. golden string } var tokenTests = []tokenTest{ { "empty", "", "", }, // A single text node. The tokenizer should not break text nodes on whitespace, // nor should it normalize whitespace within a text node. { "text", "foo bar", "foo bar", }, // An entity. { "entity", "one < two", "one < two", }, // A start, self-closing and end tag. The tokenizer does not care if the start // and end tokens don't match; that is the job of the parser. { "tags", "<a>b<c/>d</e>", "<a>$b$<c/>$d$</e>", }, // Angle brackets that aren't a tag. { "not a tag #0", "<", "<", }, { "not a tag #1", "</", "</", }, { "not a tag #2", "</>", "<!---->", }, { "not a tag #3", "a</>b", "a$<!---->$b", }, { "not a tag #4", "</ >", "<!-- -->", }, { "not a tag #5", "</.", "<!--.-->", }, { "not a tag #6", "</.>", "<!--.-->", }, { "not a tag #7", "a < b", "a < b", }, { "not a tag #8", "<.>", "<.>", }, { "not a tag #9", "a<<<b>>>c", "a<<$<b>$>>c", }, { "not a tag #10", "if x<0 and y < 0 then x*y>0", "if x<0 and y < 0 then x*y>0", }, { "not a tag #11", "<<p>", "<$<p>", }, // EOF in a tag name. { "tag name eof #0", "<a", "", }, { "tag name eof #1", "<a ", "", }, { "tag name eof #2", "a<b", "a", }, { "tag name eof #3", "<a><b", "<a>", }, { "tag name eof #4", `<a x`, ``, }, // Some malformed tags that are missing a '>'. { "malformed tag #0", `<p</p>`, `<p< p="">`, }, { "malformed tag #1", `<p </p>`, `<p <="" p="">`, }, { "malformed tag #2", `<p id`, ``, }, { "malformed tag #3", `<p id=`, ``, }, { "malformed tag #4", `<p id=>`, `<p id="">`, }, { "malformed tag #5", `<p id=0`, ``, }, { "malformed tag #6", `<p id=0</p>`, `<p id="0</p">`, }, { "malformed tag #7", `<p id="0</p>`, ``, }, { "malformed tag #8", `<p id="0"</p>`, `<p id="0" <="" p="">`, }, { "malformed tag #9", `<p></p id`, `<p>`, }, // Raw text and RCDATA. { "basic raw text", "<script><a></b></script>", "<script>$<a></b>$</script>", }, { "unfinished script end tag", "<SCRIPT>a</SCR", "<script>$a</SCR", }, { "broken script end tag", "<SCRIPT>a</SCR ipt>", "<script>$a</SCR ipt>", }, { "EOF in script end tag", "<SCRIPT>a</SCRipt", "<script>$a</SCRipt", }, { "scriptx end tag", "<SCRIPT>a</SCRiptx", "<script>$a</SCRiptx", }, { "' ' completes script end tag", "<SCRIPT>a</SCRipt ", "<script>$a", }, { "'>' completes script end tag", "<SCRIPT>a</SCRipt>", "<script>$a$</script>", }, { "self-closing script end tag", "<SCRIPT>a</SCRipt/>", "<script>$a$</script>", }, { "nested script tag", "<SCRIPT>a</SCRipt<script>", "<script>$a</SCRipt<script>", }, { "script end tag after unfinished", "<SCRIPT>a</SCRipt</script>", "<script>$a</SCRipt$</script>", }, { "script/style mismatched tags", "<script>a</style>", "<script>$a</style>", }, { "style element with entity", "<style>'", "<style>$&apos;", }, { "textarea with tag", "<textarea><div></textarea>", "<textarea>$<div>$</textarea>", }, { "title with tag and entity", "<title><b>K&R C</b></title>", "<title>$<b>K&R C</b>$</title>", }, { "title with trailing '<' entity", "<title>foobar<</title>", "<title>$foobar<$</title>", }, // DOCTYPE tests. { "Proper DOCTYPE", "<!DOCTYPE html>", "<!DOCTYPE html>", }, { "DOCTYPE with no space", "<!doctypehtml>", "<!DOCTYPE html>", }, { "DOCTYPE with two spaces", "<!doctype html>", "<!DOCTYPE html>", }, { "looks like DOCTYPE but isn't", "<!DOCUMENT html>", "<!--DOCUMENT html-->", }, { "DOCTYPE at EOF", "<!DOCtype", "<!DOCTYPE >", }, // XML processing instructions. { "XML processing instruction", "<?xml?>", "<!--?xml?-->", }, // Comments. { "comment0", "abc<b><!-- skipme --></b>def", "abc$<b>$<!-- skipme -->$</b>$def", }, { "comment1", "a<!-->z", "a$<!---->$z", }, { "comment2", "a<!--->z", "a$<!---->$z", }, { "comment3", "a<!--x>-->z", "a$<!--x>-->$z", }, { "comment4", "a<!--x->-->z", "a$<!--x->-->$z", }, { "comment5", "a<!>z", "a$<!---->$z", }, { "comment6", "a<!->z", "a$<!----->$z", }, { "comment7", "a<!---<>z", "a$<!---<>z-->", }, { "comment8", "a<!--z", "a$<!--z-->", }, { "comment9", "a<!--z-", "a$<!--z-->", }, { "comment10", "a<!--z--", "a$<!--z-->", }, { "comment11", "a<!--z---", "a$<!--z--->", }, { "comment12", "a<!--z----", "a$<!--z---->", }, { "comment13", "a<!--x--!>z", "a$<!--x-->$z", }, // An attribute with a backslash. { "backslash", `<p id="a\"b">`, `<p id="a\" b"="">`, }, // Entities, tag name and attribute key lower-casing, and whitespace // normalization within a tag. { "tricky", "<p \t\n iD=\"a"B\" foo=\"bar\"><EM>te<&;xt</em></p>", `<p id="a"B" foo="bar">$<em>$te<&;xt$</em>$</p>`, }, // A nonexistent entity. Tokenizing and converting back to a string should // escape the "&" to become "&". { "noSuchEntity", `<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`, `<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`, }, { "entity without semicolon", `¬it;∉<a b="q=z&=5¬ice=hello¬=world">`, `¬it;∉$<a b="q=z&amp=5&notice=hello¬=world">`, }, { "entity with digits", "½", "½", }, // Attribute tests: // http://dev.w3.org/html5/pf-summary/Overview.html#attributes { "Empty attribute", `<input disabled FOO>`, `<input disabled="" foo="">`, }, { "Empty attribute, whitespace", `<input disabled FOO >`, `<input disabled="" foo="">`, }, { "Unquoted attribute value", `<input value=yes FOO=BAR>`, `<input value="yes" foo="BAR">`, }, { "Unquoted attribute value, spaces", `<input value = yes FOO = BAR>`, `<input value="yes" foo="BAR">`, }, { "Unquoted attribute value, trailing space", `<input value=yes FOO=BAR >`, `<input value="yes" foo="BAR">`, }, { "Single-quoted attribute value", `<input value='yes' FOO='BAR'>`, `<input value="yes" foo="BAR">`, }, { "Single-quoted attribute value, trailing space", `<input value='yes' FOO='BAR' >`, `<input value="yes" foo="BAR">`, }, { "Double-quoted attribute value", `<input value="I'm an attribute" FOO="BAR">`, `<input value="I'm an attribute" foo="BAR">`, }, { "Attribute name characters", `<meta http-equiv="content-type">`, `<meta http-equiv="content-type">`, }, { "Mixed attributes", `a<P V="0 1" w='2' X=3 y>z`, `a$<p v="0 1" w="2" x="3" y="">$z`, }, { "Attributes with a solitary single quote", `<p id=can't><p id=won't>`, `<p id="can't">$<p id="won't">`, }, } func TestTokenizer(t *testing.T) { loop: for _, tt := range tokenTests { z := NewTokenizer(strings.NewReader(tt.html)) if tt.golden != "" { for i, s := range strings.Split(tt.golden, "$") { if z.Next() == ErrorToken { t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err()) continue loop } actual := z.Token().String() if s != actual { t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual) continue loop } } } z.Next() if z.Err() != io.EOF { t.Errorf("%s: want EOF got %q", tt.desc, z.Err()) } } } func TestMaxBuffer(t *testing.T) { // Exceeding the maximum buffer size generates ErrBufferExceeded. z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10))) z.SetMaxBuf(5) tt := z.Next() if got, want := tt, ErrorToken; got != want { t.Fatalf("token type: got: %v want: %v", got, want) } if got, want := z.Err(), ErrBufferExceeded; got != want { t.Errorf("error type: got: %v want: %v", got, want) } if got, want := string(z.Raw()), "<tttt"; got != want { t.Fatalf("buffered before overflow: got: %q want: %q", got, want) } } func TestMaxBufferReconstruction(t *testing.T) { // Exceeding the maximum buffer size at any point while tokenizing permits // reconstructing the original input. tests: for _, test := range tokenTests { for maxBuf := 1; ; maxBuf++ { r := strings.NewReader(test.html) z := NewTokenizer(r) z.SetMaxBuf(maxBuf) var tokenized bytes.Buffer for { tt := z.Next() tokenized.Write(z.Raw()) if tt == ErrorToken { if err := z.Err(); err != io.EOF && err != ErrBufferExceeded { t.Errorf("%s: unexpected error: %v", test.desc, err) } break } } // Anything tokenized along with untokenized input or data left in the reader. assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r)) if err != nil { t.Errorf("%s: ReadAll: %v", test.desc, err) continue tests } if got, want := string(assembled), test.html; got != want { t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want) continue tests } // EOF indicates that we completed tokenization and hence found the max // maxBuf that generates ErrBufferExceeded, so continue to the next test. if z.Err() == io.EOF { break } } // buffer sizes } // tests } func TestPassthrough(t *testing.T) { // Accumulating the raw output for each parse event should reconstruct the // original input. for _, test := range tokenTests { z := NewTokenizer(strings.NewReader(test.html)) var parsed bytes.Buffer for { tt := z.Next() parsed.Write(z.Raw()) if tt == ErrorToken { break } } if got, want := parsed.String(), test.html; got != want { t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want) } } } func TestBufAPI(t *testing.T) { s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9" z := NewTokenizer(bytes.NewBufferString(s)) var result bytes.Buffer depth := 0 loop: for { tt := z.Next() switch tt { case ErrorToken: if z.Err() != io.EOF { t.Error(z.Err()) } break loop case TextToken: if depth > 0 { result.Write(z.Text()) } case StartTagToken, EndTagToken: tn, _ := z.TagName() if len(tn) == 1 && tn[0] == 'a' { if tt == StartTagToken { depth++ } else { depth-- } } } } u := "14567" v := string(result.Bytes()) if u != v { t.Errorf("TestBufAPI: want %q got %q", u, v) } } func TestConvertNewlines(t *testing.T) { testCases := map[string]string{ "Mac\rDOS\r\nUnix\n": "Mac\nDOS\nUnix\n", "Unix\nMac\rDOS\r\n": "Unix\nMac\nDOS\n", "DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n", "": "", "\n": "\n", "\n\r": "\n\n", "\r": "\n", "\r\n": "\n", "\r\n\n": "\n\n", "\r\n\r": "\n\n", "\r\n\r\n": "\n\n", "\r\r": "\n\n", "\r\r\n": "\n\n", "\r\r\n\n": "\n\n\n", "\r\r\r\n": "\n\n\n", "\r \n": "\n \n", "xyz": "xyz", } for in, want := range testCases { if got := string(convertNewlines([]byte(in))); got != want { t.Errorf("input %q: got %q, want %q", in, got, want) } } } func TestReaderEdgeCases(t *testing.T) { const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>" testCases := []io.Reader{ &zeroOneByteReader{s: s}, &eofStringsReader{s: s}, &stuckReader{}, } for i, tc := range testCases { got := []TokenType{} z := NewTokenizer(tc) for { tt := z.Next() if tt == ErrorToken { break } got = append(got, tt) } if err := z.Err(); err != nil && err != io.EOF { if err != io.ErrNoProgress { t.Errorf("i=%d: %v", i, err) } continue } want := []TokenType{ StartTagToken, TextToken, EndTagToken, } if !reflect.DeepEqual(got, want) { t.Errorf("i=%d: got %v, want %v", i, got, want) continue } } } // zeroOneByteReader is like a strings.Reader that alternates between // returning 0 bytes and 1 byte at a time. type zeroOneByteReader struct { s string n int } func (r *zeroOneByteReader) Read(p []byte) (int, error) { if len(p) == 0 { return 0, nil } if len(r.s) == 0 { return 0, io.EOF } r.n++ if r.n%2 != 0 { return 0, nil } p[0], r.s = r.s[0], r.s[1:] return 1, nil } // eofStringsReader is like a strings.Reader but can return an (n, err) where // n > 0 && err != nil. type eofStringsReader struct { s string } func (r *eofStringsReader) Read(p []byte) (int, error) { n := copy(p, r.s) r.s = r.s[n:] if r.s != "" { return n, nil } return n, io.EOF } // stuckReader is an io.Reader that always returns no data and no error. type stuckReader struct{} func (*stuckReader) Read(p []byte) (int, error) { return 0, nil } const ( rawLevel = iota lowLevel highLevel ) func benchmarkTokenizer(b *testing.B, level int) { buf, err := ioutil.ReadFile("testdata/go1.html") if err != nil { b.Fatalf("could not read testdata/go1.html: %v", err) } b.SetBytes(int64(len(buf))) runtime.GC() b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { z := NewTokenizer(bytes.NewBuffer(buf)) for { tt := z.Next() if tt == ErrorToken { if err := z.Err(); err != nil && err != io.EOF { b.Fatalf("tokenizer error: %v", err) } break } switch level { case rawLevel: // Calling z.Raw just returns the raw bytes of the token. It does // not unescape < to <, or lower-case tag names and attribute keys. z.Raw() case lowLevel: // Caling z.Text, z.TagName and z.TagAttr returns []byte values // whose contents may change on the next call to z.Next. switch tt { case TextToken, CommentToken, DoctypeToken: z.Text() case StartTagToken, SelfClosingTagToken: _, more := z.TagName() for more { _, _, more = z.TagAttr() } case EndTagToken: z.TagName() } case highLevel: // Calling z.Token converts []byte values to strings whose validity // extend beyond the next call to z.Next. z.Token() } } } } func BenchmarkRawLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, rawLevel) } func BenchmarkLowLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, lowLevel) } func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }