// Copyright 2010 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package html import ( "bufio" "bytes" "errors" "fmt" "io" "io/ioutil" "os" "path/filepath" "runtime" "sort" "strings" "testing" "golang.org/x/net/html/atom" ) type testAttrs struct { text, want, context string scripting bool } // readParseTest reads a single test case from r. func readParseTest(r *bufio.Reader) (*testAttrs, error) { ta := &testAttrs{scripting: true} line, err := r.ReadSlice('\n') if err != nil { return nil, err } var b []byte // Read the HTML. if string(line) != "#data\n" { return nil, fmt.Errorf(`got %q want "#data\n"`, line) } for { line, err = r.ReadSlice('\n') if err != nil { return nil, err } if line[0] == '#' { break } b = append(b, line...) } ta.text = strings.TrimSuffix(string(b), "\n") b = b[:0] // Skip the error list. if string(line) != "#errors\n" { return nil, fmt.Errorf(`got %q want "#errors\n"`, line) } for { line, err = r.ReadSlice('\n') if err != nil { return nil, err } if line[0] == '#' { break } } if ls := string(line); strings.HasPrefix(ls, "#script-") { switch { case strings.HasSuffix(ls, "-on\n"): ta.scripting = true case strings.HasSuffix(ls, "-off\n"): ta.scripting = false default: return nil, fmt.Errorf(`got %q, want "#script-on" or "#script-off"`, line) } for { line, err = r.ReadSlice('\n') if err != nil { return nil, err } if line[0] == '#' { break } } } if string(line) == "#document-fragment\n" { line, err = r.ReadSlice('\n') if err != nil { return nil, err } ta.context = strings.TrimSpace(string(line)) line, err = r.ReadSlice('\n') if err != nil { return nil, err } } // Read the dump of what the parse tree should be. if string(line) != "#document\n" { return nil, fmt.Errorf(`got %q want "#document\n"`, line) } inQuote := false for { line, err = r.ReadSlice('\n') if err != nil && err != io.EOF { return nil, err } trimmed := bytes.Trim(line, "| \n") if len(trimmed) > 0 { if line[0] == '|' && trimmed[0] == '"' { inQuote = true } if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) { inQuote = false } } if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote { break } b = append(b, line...) } ta.want = string(b) return ta, nil } func dumpIndent(w io.Writer, level int) { io.WriteString(w, "| ") for i := 0; i < level; i++ { io.WriteString(w, " ") } } type sortedAttributes []Attribute func (a sortedAttributes) Len() int { return len(a) } func (a sortedAttributes) Less(i, j int) bool { if a[i].Namespace != a[j].Namespace { return a[i].Namespace < a[j].Namespace } return a[i].Key < a[j].Key } func (a sortedAttributes) Swap(i, j int) { a[i], a[j] = a[j], a[i] } func dumpLevel(w io.Writer, n *Node, level int) error { dumpIndent(w, level) level++ switch n.Type { case ErrorNode: return errors.New("unexpected ErrorNode") case DocumentNode: return errors.New("unexpected DocumentNode") case ElementNode: if n.Namespace != "" { fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data) } else { fmt.Fprintf(w, "<%s>", n.Data) } attr := sortedAttributes(n.Attr) sort.Sort(attr) for _, a := range attr { io.WriteString(w, "\n") dumpIndent(w, level) if a.Namespace != "" { fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val) } else { fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val) } } if n.Namespace == "" && n.DataAtom == atom.Template { io.WriteString(w, "\n") dumpIndent(w, level) level++ io.WriteString(w, "content") } case TextNode: fmt.Fprintf(w, `"%s"`, n.Data) case CommentNode: fmt.Fprintf(w, "<!-- %s -->", n.Data) case DoctypeNode: fmt.Fprintf(w, "<!DOCTYPE %s", n.Data) if n.Attr != nil { var p, s string for _, a := range n.Attr { switch a.Key { case "public": p = a.Val case "system": s = a.Val } } if p != "" || s != "" { fmt.Fprintf(w, ` "%s"`, p) fmt.Fprintf(w, ` "%s"`, s) } } io.WriteString(w, ">") case scopeMarkerNode: return errors.New("unexpected scopeMarkerNode") default: return errors.New("unknown node type") } io.WriteString(w, "\n") for c := n.FirstChild; c != nil; c = c.NextSibling { if err := dumpLevel(w, c, level); err != nil { return err } } return nil } func dump(n *Node) (string, error) { if n == nil || n.FirstChild == nil { return "", nil } var b bytes.Buffer for c := n.FirstChild; c != nil; c = c.NextSibling { if err := dumpLevel(&b, c, 0); err != nil { return "", err } } return b.String(), nil } var testDataDirs = []string{"testdata/webkit/", "testdata/go/"} func TestParser(t *testing.T) { for _, testDataDir := range testDataDirs { testFiles, err := filepath.Glob(testDataDir + "*.dat") if err != nil { t.Fatal(err) } for _, tf := range testFiles { f, err := os.Open(tf) if err != nil { t.Fatal(err) } defer f.Close() r := bufio.NewReader(f) for i := 0; ; i++ { ta, err := readParseTest(r) if err == io.EOF { break } if err != nil { t.Fatal(err) } err = testParseCase(ta.text, ta.want, ta.context, ParseOptionEnableScripting(ta.scripting)) if err != nil { t.Errorf("%s test #%d %q, %s", tf, i, ta.text, err) } } } } } // Issue 16318 func TestParserWithoutScripting(t *testing.T) { text := `<noscript><img src='https://golang.org/doc/gopher/frontpage.png' /></noscript><p><img src='https://golang.org/doc/gopher/doc.png' /></p>` want := `| <html> | <head> | <noscript> | <body> | <img> | src="https://golang.org/doc/gopher/frontpage.png" | <p> | <img> | src="https://golang.org/doc/gopher/doc.png" ` if err := testParseCase(text, want, "", ParseOptionEnableScripting(false)); err != nil { t.Errorf("test with scripting is disabled, %q, %s", text, err) } } // testParseCase tests one test case from the test files. If the test does not // pass, it returns an error that explains the failure. // text is the HTML to be parsed, want is a dump of the correct parse tree, // and context is the name of the context node, if any. func testParseCase(text, want, context string, opts ...ParseOption) (err error) { defer func() { if x := recover(); x != nil { switch e := x.(type) { case error: err = e default: err = fmt.Errorf("%v", e) } } }() var doc *Node if context == "" { doc, err = ParseWithOptions(strings.NewReader(text), opts...) if err != nil { return err } } else { namespace := "" if i := strings.IndexByte(context, ' '); i >= 0 { namespace, context = context[:i], context[i+1:] } contextNode := &Node{ Data: context, DataAtom: atom.Lookup([]byte(context)), Namespace: namespace, Type: ElementNode, } nodes, err := ParseFragmentWithOptions(strings.NewReader(text), contextNode, opts...) if err != nil { return err } doc = &Node{ Type: DocumentNode, } for _, n := range nodes { doc.AppendChild(n) } } if err := checkTreeConsistency(doc); err != nil { return err } got, err := dump(doc) if err != nil { return err } // Compare the parsed tree to the #document section. if got != want { return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want) } if renderTestBlacklist[text] || context != "" { return nil } // Check that rendering and re-parsing results in an identical tree. pr, pw := io.Pipe() go func() { pw.CloseWithError(Render(pw, doc)) }() doc1, err := ParseWithOptions(pr, opts...) if err != nil { return err } got1, err := dump(doc1) if err != nil { return err } if got != got1 { return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1) } return nil } // Some test input result in parse trees are not 'well-formed' despite // following the HTML5 recovery algorithms. Rendering and re-parsing such a // tree will not result in an exact clone of that tree. We blacklist such // inputs from the render test. var renderTestBlacklist = map[string]bool{ // The second <a> will be reparented to the first <table>'s parent. This // results in an <a> whose parent is an <a>, which is not 'well-formed'. `<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true, // The same thing with a <p>: `<p><table></p>`: true, // More cases of <a> being reparented: `<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true, `<a><table><a></table><p><a><div><a>`: true, `<a><table><td><a><table></table><a></tr><a></table><a>`: true, `<template><a><table><a>`: true, // A similar reparenting situation involving <nobr>: `<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true, // A <plaintext> element is reparented, putting it before a table. // A <plaintext> element can't have anything after it in HTML. `<table><plaintext><td>`: true, `<!doctype html><table><plaintext></plaintext>`: true, `<!doctype html><table><tbody><plaintext></plaintext>`: true, `<!doctype html><table><tbody><tr><plaintext></plaintext>`: true, // A form inside a table inside a form doesn't work either. `<!doctype html><form><table></form><form></table></form>`: true, // A script that ends at EOF may escape its own closing tag when rendered. `<!doctype html><script><!--<script `: true, `<!doctype html><script><!--<script <`: true, `<!doctype html><script><!--<script <a`: true, `<!doctype html><script><!--<script </`: true, `<!doctype html><script><!--<script </s`: true, `<!doctype html><script><!--<script </script`: true, `<!doctype html><script><!--<script </scripta`: true, `<!doctype html><script><!--<script -`: true, `<!doctype html><script><!--<script -a`: true, `<!doctype html><script><!--<script -<`: true, `<!doctype html><script><!--<script --`: true, `<!doctype html><script><!--<script --a`: true, `<!doctype html><script><!--<script --<`: true, `<script><!--<script `: true, `<script><!--<script <a`: true, `<script><!--<script </script`: true, `<script><!--<script </scripta`: true, `<script><!--<script -`: true, `<script><!--<script -a`: true, `<script><!--<script --`: true, `<script><!--<script --a`: true, `<script><!--<script <`: true, `<script><!--<script </`: true, `<script><!--<script </s`: true, // Reconstructing the active formatting elements results in a <plaintext> // element that contains an <a> element. `<!doctype html><p><a><plaintext>b`: true, `<table><math><select><mi><select></table>`: true, } func TestNodeConsistency(t *testing.T) { // inconsistentNode is a Node whose DataAtom and Data do not agree. inconsistentNode := &Node{ Type: ElementNode, DataAtom: atom.Frameset, Data: "table", } if _, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode); err == nil { t.Errorf("got nil error, want non-nil") } } func TestParseFragmentWithNilContext(t *testing.T) { // This shouldn't panic. ParseFragment(strings.NewReader("<p>hello</p>"), nil) } func BenchmarkParser(b *testing.B) { buf, err := ioutil.ReadFile("testdata/go1.html") if err != nil { b.Fatalf("could not read testdata/go1.html: %v", err) } b.SetBytes(int64(len(buf))) runtime.GC() b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { Parse(bytes.NewBuffer(buf)) } }