html: parse raw text and RCDATA elements, such as <script> and <title>.

Pass tests1.dat, test 26:
#data
<script><div></script></div><title><p></title><p><p>
#document
| <html>
|   <head>
|     <script>
|       "<div>"
|     <title>
|       "<p>"
|   <body>
|     <p>
|     <p>

Thanks to Andy Balholm for driving this change.

R=andybalholm
CC=golang-dev
https://golang.org/cl/5301042
This commit is contained in:
Nigel Tao 2011-10-19 08:03:30 +11:00
parent 78ad19f214
commit b1fd528db5
5 changed files with 209 additions and 21 deletions

View file

@ -29,6 +29,9 @@ type parser struct {
head, form *Node
// Other parsing state flags (section 11.2.3.5).
scripting, framesetOK bool
// originalIM is the insertion mode to go back to after completing a text
// or inTableText insertion mode.
originalIM insertionMode
}
func (p *parser) top() *Node {
@ -214,12 +217,23 @@ type insertionMode func(*parser) (insertionMode, bool)
// Section 11.2.3.1, "using the rules for".
func useTheRulesFor(p *parser, actual, delegate insertionMode) (insertionMode, bool) {
im, consumed := delegate(p)
// TODO: do we need to update p.originalMode if it equals delegate?
if im != delegate {
return im, consumed
}
return actual, consumed
}
// setOriginalIM sets the insertion mode to return to after completing a text or
// inTableText insertion mode.
// Section 11.2.3.1, "using the rules for".
func (p *parser) setOriginalIM(im insertionMode) {
if p.originalIM != nil {
panic("html: bad parser state: originalIM was set twice")
}
p.originalIM = im
}
// Section 11.2.5.4.1.
func initialIM(p *parser) (insertionMode, bool) {
if p.tok.Type == DoctypeToken {
@ -318,8 +332,10 @@ func inHeadIM(p *parser) (insertionMode, bool) {
switch p.tok.Data {
case "meta":
// TODO.
case "script":
// TODO.
case "script", "title":
p.addElement(p.tok.Data, p.tok.Attr)
p.setOriginalIM(inHeadIM)
return textIM, true
default:
implied = true
}
@ -574,6 +590,20 @@ func (p *parser) inBodyEndTagFormatting(tag string) {
}
}
// Section 11.2.5.4.8.
func textIM(p *parser) (insertionMode, bool) {
switch p.tok.Type {
case TextToken:
p.addText(p.tok.Data)
return textIM, true
case EndTagToken:
p.oe.pop()
}
o := p.originalIM
p.originalIM = nil
return o, p.tok.Type == EndTagToken
}
// Section 11.2.5.4.9.
func inTableIM(p *parser) (insertionMode, bool) {
var (

View file

@ -80,13 +80,13 @@ func dumpLevel(w io.Writer, n *Node, level int) os.Error {
case DocumentNode:
return os.NewError("unexpected DocumentNode")
case ElementNode:
fmt.Fprintf(w, "<%s>", EscapeString(n.Data))
fmt.Fprintf(w, "<%s>", n.Data)
case TextNode:
fmt.Fprintf(w, "%q", EscapeString(n.Data))
fmt.Fprintf(w, "%q", n.Data)
case CommentNode:
return os.NewError("COMMENT")
case DoctypeNode:
fmt.Fprintf(w, "<!DOCTYPE %s>", EscapeString(n.Data))
fmt.Fprintf(w, "<!DOCTYPE %s>", n.Data)
case scopeMarkerNode:
return os.NewError("unexpected scopeMarkerNode")
default:
@ -123,7 +123,7 @@ func TestParser(t *testing.T) {
rc := make(chan io.Reader)
go readDat(filename, rc)
// TODO(nigeltao): Process all test cases, not just a subset.
for i := 0; i < 26; i++ {
for i := 0; i < 27; i++ {
// Parse the #data section.
b, err := ioutil.ReadAll(<-rc)
if err != nil {

View file

@ -74,17 +74,6 @@ func render(w writer, n *Node) os.Error {
return os.NewError("html: unknown node type")
}
// TODO: figure out what to do with <script>, <style>, <noembed>,
// <noframes> and <noscript> elements. A tentative plan:
// 1. render the <xxx> opening tag as normal.
// 2. maybe error out if any child is not a text node.
// 3. render the text nodes (without escaping??).
// 4. maybe error out if `</xxx` is a case-insensitive substring of the
// concatenation of the children's data.
// 5. maybe error out if the concatenation of the children's data contains an
// unbalanced escaping text span start ("<!--") not followed by an end ("-->").
// 6. render the closing tag as normal.
// Render the <xxx> opening tag.
if err := w.WriteByte('<'); err != nil {
return err
@ -121,9 +110,30 @@ func render(w writer, n *Node) os.Error {
}
// Render any child nodes.
for _, c := range n.Child {
if err := render(w, c); err != nil {
return err
switch n.Data {
case "noembed", "noframes", "noscript", "script", "style":
for _, c := range n.Child {
if c.Type != TextNode {
return fmt.Errorf("html: raw text element <%s> has non-text child node", n.Data)
}
if _, err := w.WriteString(c.Data); err != nil {
return err
}
}
case "textarea", "title":
for _, c := range n.Child {
if c.Type != TextNode {
return fmt.Errorf("html: RCDATA element <%s> has non-text child node", n.Data)
}
if err := render(w, c); err != nil {
return err
}
}
default:
for _, c := range n.Child {
if err := render(w, c); err != nil {
return err
}
}
}

View file

@ -9,6 +9,7 @@ import (
"io"
"os"
"strconv"
"strings"
)
// A TokenType is the type of a Token.
@ -144,6 +145,13 @@ type Tokenizer struct {
pendingAttr [2]span
attr [][2]span
nAttrReturned int
// rawTag is the "script" in "</script>" that closes the next token. If
// non-empty, the subsequent call to Next will return a raw or RCDATA text
// token: one that treats "<p>" as text instead of an element.
// rawTag's contents are lower-cased.
rawTag string
// textIsRaw is whether the current text token's data is not escaped.
textIsRaw bool
}
// Error returns the error associated with the most recent ErrorToken token.
@ -225,6 +233,54 @@ func (z *Tokenizer) skipWhiteSpace() {
}
}
// readRawOrRCDATA reads until the next "</foo>", where "foo" is z.rawTag and
// is typically something like "script" or "textarea".
func (z *Tokenizer) readRawOrRCDATA() {
loop:
for {
c := z.readByte()
if z.err != nil {
break loop
}
if c != '<' {
continue loop
}
c = z.readByte()
if z.err != nil {
break loop
}
if c != '/' {
continue loop
}
for i := 0; i < len(z.rawTag); i++ {
c = z.readByte()
if z.err != nil {
break loop
}
if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') {
continue loop
}
}
c = z.readByte()
if z.err != nil {
break loop
}
switch c {
case ' ', '\n', '\r', '\t', '\f', '/', '>':
// The 3 is 2 for the leading "</" plus 1 for the trailing character c.
z.raw.end -= 3 + len(z.rawTag)
break loop
case '<':
// Step back one, to catch "</foo</foo>".
z.raw.end--
}
}
z.data.end = z.raw.end
// A textarea's or title's RCDATA can contain escaped entities.
z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title"
z.rawTag = ""
}
// readComment reads the next comment token starting with "<!--". The opening
// "<!--" has already been consumed.
func (z *Tokenizer) readComment() {
@ -350,6 +406,19 @@ func (z *Tokenizer) readStartTag() TokenType {
break
}
}
// Any "<noembed>", "<noframes>", "<noscript>", "<script>", "<style>",
// "<textarea>" or "<title>" tag flags the tokenizer's next token as raw.
// The tag name lengths of these special cases ranges in [5, 8].
if x := z.data.end - z.data.start; 5 <= x && x <= 8 {
switch z.buf[z.data.start] {
case 'n', 's', 't', 'N', 'S', 'T':
switch s := strings.ToLower(string(z.buf[z.data.start:z.data.end])); s {
case "noembed", "noframes", "noscript", "script", "style", "textarea", "title":
z.rawTag = s
}
}
}
// Look for a self-closing token like "<br/>".
if z.err == nil && z.buf[z.raw.end-2] == '/' {
return SelfClosingTagToken
}
@ -485,6 +554,11 @@ func (z *Tokenizer) next() TokenType {
z.raw.start = z.raw.end
z.data.start = z.raw.end
z.data.end = z.raw.end
if z.rawTag != "" {
z.readRawOrRCDATA()
return TextToken
}
z.textIsRaw = false
loop:
for {
@ -591,7 +665,10 @@ func (z *Tokenizer) Text() []byte {
s := z.buf[z.data.start:z.data.end]
z.data.start = z.raw.end
z.data.end = z.raw.end
return unescape(s)
if !z.textIsRaw {
s = unescape(s)
}
return s
}
return nil
}

View file

@ -174,6 +174,77 @@ var tokenTests = []tokenTest{
`<p id="0"</p>`,
`<p id="0" <="" p="">`,
},
// Raw text and RCDATA.
{
"basic raw text",
"<script><a></b></script>",
"<script>$&lt;a&gt;&lt;/b&gt;$</script>",
},
{
"unfinished script end tag",
"<SCRIPT>a</SCR",
"<script>$a&lt;/SCR",
},
{
"broken script end tag",
"<SCRIPT>a</SCR ipt>",
"<script>$a&lt;/SCR ipt&gt;",
},
{
"EOF in script end tag",
"<SCRIPT>a</SCRipt",
"<script>$a&lt;/SCRipt",
},
{
"scriptx end tag",
"<SCRIPT>a</SCRiptx",
"<script>$a&lt;/SCRiptx",
},
{
"' ' completes script end tag",
"<SCRIPT>a</SCRipt ",
"<script>$a$</script>",
},
{
"'>' completes script end tag",
"<SCRIPT>a</SCRipt>",
"<script>$a$</script>",
},
{
"self-closing script end tag",
"<SCRIPT>a</SCRipt/>",
"<script>$a$</script>",
},
{
"nested script tag",
"<SCRIPT>a</SCRipt<script>",
"<script>$a&lt;/SCRipt&lt;script&gt;",
},
{
"script end tag after unfinished",
"<SCRIPT>a</SCRipt</script>",
"<script>$a&lt;/SCRipt$</script>",
},
{
"script/style mismatched tags",
"<script>a</style>",
"<script>$a&lt;/style&gt;",
},
{
"style element with entity",
"<style>&apos;",
"<style>$&amp;apos;",
},
{
"textarea with tag",
"<textarea><div></textarea>",
"<textarea>$&lt;div&gt;$</textarea>",
},
{
"title with tag and entity",
"<title><b>K&amp;R C</b></title>",
"<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
},
// DOCTYPE tests.
{
"Proper DOCTYPE",