mirror of
https://github.com/golang/go
synced 2024-11-02 11:50:30 +00:00
html: remove the Tokenizer.ReturnComments option.
The original intention was to simplify the parser by making it skip all comment tokens. However, checking that the Go html package is 100% compatible with the WebKit HTML test suite requires parsing the comments, so there is no longer any real benefit to the option. R=gri, andybalholm CC=golang-dev https://golang.org/cl/5321043
This commit is contained in:
parent
5791233461
commit
18b025d530
4 changed files with 21 additions and 34 deletions
|
@ -70,9 +70,6 @@ call to Next. For example, to extract an HTML page's anchor text:
|
|||
}
|
||||
}
|
||||
|
||||
A Tokenizer typically skips over HTML comments. To return comment tokens, set
|
||||
Tokenizer.ReturnComments to true before looping over calls to Next.
|
||||
|
||||
Parsing is done by calling Parse with an io.Reader, which returns the root of
|
||||
the parse tree (the document element) as a *Node. It is the caller's
|
||||
responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
|
||||
|
|
|
@ -1067,7 +1067,6 @@ func Parse(r io.Reader) (*Node, os.Error) {
|
|||
scripting: true,
|
||||
framesetOK: true,
|
||||
}
|
||||
p.tokenizer.ReturnComments = true
|
||||
// Iterate until EOF. Any other error will cause an early return.
|
||||
im, consumed := initialIM, true
|
||||
for {
|
||||
|
|
|
@ -116,10 +116,6 @@ type span struct {
|
|||
|
||||
// A Tokenizer returns a stream of HTML Tokens.
|
||||
type Tokenizer struct {
|
||||
// If ReturnComments is set, Next returns comment tokens;
|
||||
// otherwise it skips over comments (default).
|
||||
ReturnComments bool
|
||||
|
||||
// r is the source of the HTML text.
|
||||
r io.Reader
|
||||
// tt is the TokenType of the current token.
|
||||
|
@ -546,17 +542,19 @@ func (z *Tokenizer) readTagAttrVal() {
|
|||
}
|
||||
}
|
||||
|
||||
// next scans the next token and returns its type.
|
||||
func (z *Tokenizer) next() TokenType {
|
||||
// Next scans the next token and returns its type.
|
||||
func (z *Tokenizer) Next() TokenType {
|
||||
if z.err != nil {
|
||||
return ErrorToken
|
||||
z.tt = ErrorToken
|
||||
return z.tt
|
||||
}
|
||||
z.raw.start = z.raw.end
|
||||
z.data.start = z.raw.end
|
||||
z.data.end = z.raw.end
|
||||
if z.rawTag != "" {
|
||||
z.readRawOrRCDATA()
|
||||
return TextToken
|
||||
z.tt = TextToken
|
||||
return z.tt
|
||||
}
|
||||
z.textIsRaw = false
|
||||
|
||||
|
@ -596,11 +594,13 @@ loop:
|
|||
if x := z.raw.end - len("<a"); z.raw.start < x {
|
||||
z.raw.end = x
|
||||
z.data.end = x
|
||||
return TextToken
|
||||
z.tt = TextToken
|
||||
return z.tt
|
||||
}
|
||||
switch tokenType {
|
||||
case StartTagToken:
|
||||
return z.readStartTag()
|
||||
z.tt = z.readStartTag()
|
||||
return z.tt
|
||||
case EndTagToken:
|
||||
c = z.readByte()
|
||||
if z.err != nil {
|
||||
|
@ -616,39 +616,31 @@ loop:
|
|||
}
|
||||
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
|
||||
z.readEndTag()
|
||||
return EndTagToken
|
||||
z.tt = EndTagToken
|
||||
return z.tt
|
||||
}
|
||||
z.raw.end--
|
||||
z.readUntilCloseAngle()
|
||||
return CommentToken
|
||||
z.tt = CommentToken
|
||||
return z.tt
|
||||
case CommentToken:
|
||||
if c == '!' {
|
||||
return z.readMarkupDeclaration()
|
||||
z.tt = z.readMarkupDeclaration()
|
||||
return z.tt
|
||||
}
|
||||
z.raw.end--
|
||||
z.readUntilCloseAngle()
|
||||
return CommentToken
|
||||
z.tt = CommentToken
|
||||
return z.tt
|
||||
}
|
||||
}
|
||||
if z.raw.start < z.raw.end {
|
||||
z.data.end = z.raw.end
|
||||
return TextToken
|
||||
}
|
||||
return ErrorToken
|
||||
}
|
||||
|
||||
// Next scans the next token and returns its type.
|
||||
func (z *Tokenizer) Next() TokenType {
|
||||
for {
|
||||
z.tt = z.next()
|
||||
// TODO: remove the ReturnComments option. A tokenizer should
|
||||
// always return comment tags.
|
||||
if z.tt == CommentToken && !z.ReturnComments {
|
||||
continue
|
||||
}
|
||||
z.tt = TextToken
|
||||
return z.tt
|
||||
}
|
||||
panic("unreachable")
|
||||
z.tt = ErrorToken
|
||||
return z.tt
|
||||
}
|
||||
|
||||
// Raw returns the unmodified text of the current token. Calling Next, Token,
|
||||
|
|
|
@ -424,7 +424,6 @@ func TestTokenizer(t *testing.T) {
|
|||
loop:
|
||||
for _, tt := range tokenTests {
|
||||
z := NewTokenizer(strings.NewReader(tt.html))
|
||||
z.ReturnComments = true
|
||||
if tt.golden != "" {
|
||||
for i, s := range strings.Split(tt.golden, "$") {
|
||||
if z.Next() == ErrorToken {
|
||||
|
|
Loading…
Reference in a new issue