exp/html: make the tokenizer return atoms for tag tokens.

This is part 1 of a 2 part changelist. Part 2 contains the mechanical
change to parse.go to compare atoms (ints) instead of strings.

The overall effect of the two changes are:
benchmark                      old ns/op    new ns/op    delta
BenchmarkParser                  4462274      4058254   -9.05%
BenchmarkRawLevelTokenizer        913202       912917   -0.03%
BenchmarkLowLevelTokenizer       1268626      1267836   -0.06%
BenchmarkHighLevelTokenizer      1947305      1968944   +1.11%

R=rsc
CC=andybalholm, golang-dev, r
https://golang.org/cl/6305053
This commit is contained in:
Nigel Tao 2012-06-07 13:05:35 +10:00
parent 6423682019
commit cd21eff705
4 changed files with 57 additions and 36 deletions

View file

@ -4,8 +4,12 @@
package html
import (
"exp/html/atom"
)
// A NodeType is the type of a Node.
type NodeType int
type NodeType uint32
const (
ErrorNode NodeType = iota
@ -25,7 +29,8 @@ var scopeMarker = Node{Type: scopeMarkerNode}
// A Node consists of a NodeType and some Data (tag name for element nodes,
// content for text) and are part of a tree of Nodes. Element nodes may also
// have a Namespace and contain a slice of Attributes. Data is unescaped, so
// that it looks like "a<b" rather than "a&lt;b".
// that it looks like "a<b" rather than "a&lt;b". For element nodes, DataAtom
// is the atom for Data, or zero if Data is not a known tag name.
//
// An empty Namespace implies a "http://www.w3.org/1999/xhtml" namespace.
// Similarly, "math" is short for "http://www.w3.org/1998/Math/MathML", and
@ -34,6 +39,7 @@ type Node struct {
Parent *Node
Child []*Node
Type NodeType
DataAtom atom.Atom
Data string
Namespace string
Attr []Attribute
@ -83,9 +89,10 @@ func reparentChildren(dst, src *Node) {
// The clone has no parent and no children.
func (n *Node) clone() *Node {
m := &Node{
Type: n.Type,
Data: n.Data,
Attr: make([]Attribute, len(n.Attr)),
Type: n.Type,
DataAtom: n.DataAtom,
Data: n.Data,
Attr: make([]Attribute, len(n.Attr)),
}
copy(m.Attr, n.Attr)
return m

View file

@ -5,6 +5,7 @@
package html
import (
a "exp/html/atom"
"io"
"strings"
)
@ -280,7 +281,7 @@ func (p *parser) addText(text string) {
func (p *parser) addElement(tag string, attr []Attribute) {
p.addChild(&Node{
Type: ElementNode,
Data: tag,
Data: tag, // TODO: also set DataAtom.
Attr: attr,
})
}
@ -310,9 +311,9 @@ findIdenticalElements:
continue
}
compareAttributes:
for _, a := range n.Attr {
for _, b := range attr {
if a.Key == b.Key && a.Namespace == b.Namespace && a.Val == b.Val {
for _, t0 := range n.Attr {
for _, t1 := range attr {
if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {
// Found a match for this attribute, continue with the next attribute.
continue compareAttributes
}
@ -676,13 +677,13 @@ func copyAttributes(dst *Node, src Token) {
return
}
attr := map[string]string{}
for _, a := range dst.Attr {
attr[a.Key] = a.Val
for _, t := range dst.Attr {
attr[t.Key] = t.Val
}
for _, a := range src.Attr {
if _, ok := attr[a.Key]; !ok {
dst.Attr = append(dst.Attr, a)
attr[a.Key] = a.Val
for _, t := range src.Attr {
if _, ok := attr[t.Key]; !ok {
dst.Attr = append(dst.Attr, t)
attr[t.Key] = t.Val
}
}
}
@ -843,9 +844,9 @@ func inBodyIM(p *parser) bool {
p.oe.pop()
p.acknowledgeSelfClosingTag()
if p.tok.Data == "input" {
for _, a := range p.tok.Attr {
if a.Key == "type" {
if strings.ToLower(a.Val) == "hidden" {
for _, t := range p.tok.Attr {
if t.Key == "type" {
if strings.ToLower(t.Val) == "hidden" {
// Skip setting framesetOK = false
return true
}
@ -874,16 +875,16 @@ func inBodyIM(p *parser) bool {
action := ""
prompt := "This is a searchable index. Enter search keywords: "
attr := []Attribute{{Key: "name", Val: "isindex"}}
for _, a := range p.tok.Attr {
switch a.Key {
for _, t := range p.tok.Attr {
switch t.Key {
case "action":
action = a.Val
action = t.Val
case "name":
// Ignore the attribute.
case "prompt":
prompt = a.Val
prompt = t.Val
default:
attr = append(attr, a)
attr = append(attr, t)
}
}
p.acknowledgeSelfClosingTag()
@ -1231,8 +1232,8 @@ func inTableIM(p *parser) bool {
case "style", "script":
return inHeadIM(p)
case "input":
for _, a := range p.tok.Attr {
if a.Key == "type" && strings.ToLower(a.Val) == "hidden" {
for _, t := range p.tok.Attr {
if t.Key == "type" && strings.ToLower(t.Val) == "hidden" {
p.addElement(p.tok.Data, p.tok.Attr)
p.oe.pop()
return true
@ -1863,6 +1864,7 @@ func parseForeignContent(p *parser) bool {
// Adjust SVG tag names. The tokenizer lower-cases tag names, but
// SVG wants e.g. "foreignObject" with a capital second "O".
if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
p.tok.DataAtom = a.Lookup([]byte(x))
p.tok.Data = x
}
adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
@ -1929,7 +1931,7 @@ func (p *parser) parseImpliedToken(t TokenType, data string, attr []Attribute) {
realToken, selfClosing := p.tok, p.hasSelfClosingToken
p.tok = Token{
Type: t,
Data: data,
Data: data, // TODO: also set DataAtom.
Attr: attr,
}
p.hasSelfClosingToken = false
@ -2014,7 +2016,7 @@ func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
root := &Node{
Type: ElementNode,
Data: "html",
Data: "html", // TODO: also set DataAtom.
}
p.doc.Add(root)
p.oe = nodeStack{root}

View file

@ -8,6 +8,7 @@ import (
"bufio"
"bytes"
"errors"
"exp/html/atom"
"flag"
"fmt"
"io"
@ -320,8 +321,9 @@ func testParseCase(text, want, context string) (result parseTestResult, err erro
}
} else {
contextNode := &Node{
Type: ElementNode,
Data: context,
Type: ElementNode,
DataAtom: atom.Lookup([]byte(context)),
Data: context,
}
nodes, err := ParseFragment(strings.NewReader(text), contextNode)
if err != nil {

View file

@ -13,7 +13,7 @@ import (
)
// A TokenType is the type of a Token.
type TokenType int
type TokenType uint32
const (
// ErrorToken means that an error occurred during tokenization.
@ -66,11 +66,13 @@ type Attribute struct {
// A Token consists of a TokenType and some Data (tag name for start and end
// tags, content for text, comments and doctypes). A tag Token may also contain
// a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
// rather than "a&lt;b").
// rather than "a&lt;b"). For tag Tokens, DataAtom is the atom for Data, or
// zero if Data is not a known tag name.
type Token struct {
Type TokenType
Data string
Attr []Attribute
Type TokenType
DataAtom atom.Atom
Data string
Attr []Attribute
}
// tagString returns a string representation of a tag Token's Data and Attr.
@ -794,11 +796,19 @@ func (z *Tokenizer) Token() Token {
key, val, moreAttr = z.TagAttr()
attr = append(attr, Attribute{"", atom.String(key), string(val)})
}
t.Data = atom.String(name)
if a := atom.Lookup(name); a != 0 {
t.DataAtom, t.Data = a, a.String()
} else {
t.DataAtom, t.Data = 0, string(name)
}
t.Attr = attr
case EndTagToken:
name, _ := z.TagName()
t.Data = atom.String(name)
if a := atom.Lookup(name); a != 0 {
t.DataAtom, t.Data = a, a.String()
} else {
t.DataAtom, t.Data = 0, string(name)
}
}
return t
}