html: remove the Tokenizer.ReturnComments option.

The original intention was to simplify the parser, in making it skip
all comment tokens. However, checking that the Go html package is
100% compatible with the WebKit HTML test suite requires parsing the
comments. There is no longer any real benefit for the option.

R=gri, andybalholm
CC=golang-dev
https://golang.org/cl/5321043
This commit is contained in:
Nigel Tao 2011-10-25 11:28:07 +11:00
parent 5791233461
commit 18b025d530
4 changed files with 21 additions and 34 deletions

View file

@ -70,9 +70,6 @@ call to Next. For example, to extract an HTML page's anchor text:
}
}
A Tokenizer typically skips over HTML comments. To return comment tokens, set
Tokenizer.ReturnComments to true before looping over calls to Next.
Parsing is done by calling Parse with an io.Reader, which returns the root of
the parse tree (the document element) as a *Node. It is the caller's
responsibility to ensure that the Reader provides UTF-8 encoded HTML. For

View file

@ -1067,7 +1067,6 @@ func Parse(r io.Reader) (*Node, os.Error) {
scripting: true,
framesetOK: true,
}
p.tokenizer.ReturnComments = true
// Iterate until EOF. Any other error will cause an early return.
im, consumed := initialIM, true
for {

View file

@ -116,10 +116,6 @@ type span struct {
// A Tokenizer returns a stream of HTML Tokens.
type Tokenizer struct {
// If ReturnComments is set, Next returns comment tokens;
// otherwise it skips over comments (default).
ReturnComments bool
// r is the source of the HTML text.
r io.Reader
// tt is the TokenType of the current token.
@ -546,17 +542,19 @@ func (z *Tokenizer) readTagAttrVal() {
}
}
// next scans the next token and returns its type.
func (z *Tokenizer) next() TokenType {
// Next scans the next token and returns its type.
func (z *Tokenizer) Next() TokenType {
if z.err != nil {
return ErrorToken
z.tt = ErrorToken
return z.tt
}
z.raw.start = z.raw.end
z.data.start = z.raw.end
z.data.end = z.raw.end
if z.rawTag != "" {
z.readRawOrRCDATA()
return TextToken
z.tt = TextToken
return z.tt
}
z.textIsRaw = false
@ -596,11 +594,13 @@ loop:
if x := z.raw.end - len("<a"); z.raw.start < x {
z.raw.end = x
z.data.end = x
return TextToken
z.tt = TextToken
return z.tt
}
switch tokenType {
case StartTagToken:
return z.readStartTag()
z.tt = z.readStartTag()
return z.tt
case EndTagToken:
c = z.readByte()
if z.err != nil {
@ -616,39 +616,31 @@ loop:
}
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
z.readEndTag()
return EndTagToken
z.tt = EndTagToken
return z.tt
}
z.raw.end--
z.readUntilCloseAngle()
return CommentToken
z.tt = CommentToken
return z.tt
case CommentToken:
if c == '!' {
return z.readMarkupDeclaration()
z.tt = z.readMarkupDeclaration()
return z.tt
}
z.raw.end--
z.readUntilCloseAngle()
return CommentToken
z.tt = CommentToken
return z.tt
}
}
if z.raw.start < z.raw.end {
z.data.end = z.raw.end
return TextToken
}
return ErrorToken
}
// Next scans the next token and returns its type.
func (z *Tokenizer) Next() TokenType {
for {
z.tt = z.next()
// TODO: remove the ReturnComments option. A tokenizer should
// always return comment tags.
if z.tt == CommentToken && !z.ReturnComments {
continue
}
z.tt = TextToken
return z.tt
}
panic("unreachable")
z.tt = ErrorToken
return z.tt
}
// Raw returns the unmodified text of the current token. Calling Next, Token,

View file

@ -424,7 +424,6 @@ func TestTokenizer(t *testing.T) {
loop:
for _, tt := range tokenTests {
z := NewTokenizer(strings.NewReader(tt.html))
z.ReturnComments = true
if tt.golden != "" {
for i, s := range strings.Split(tt.golden, "$") {
if z.Next() == ErrorToken {