Change the scanner API from a class to a functional style

This commit is contained in:
Martin Aeschlimann 2016-08-25 09:46:56 +02:00
parent 5d8e0b6185
commit 5c4f879b89
3 changed files with 157 additions and 168 deletions

View file

@ -4,7 +4,7 @@
*--------------------------------------------------------------------------------------------*/
'use strict';
import { Scanner, TokenType } from './htmlScanner';
import { TokenType, createScanner } from './htmlScanner';
import { findFirst } from '../utils/arrays';
import { isEmptyElement } from './htmlTags';
@ -33,8 +33,7 @@ export interface HTMLDocument {
}
export function parse(text: string) : HTMLDocument {
let scanner = new Scanner();
scanner.setSource(text);
let scanner = createScanner(text);
let htmlDocument = new Node(0, text.length, [], null);
let curr = htmlDocument;
@ -43,21 +42,21 @@ export function parse(text: string) : HTMLDocument {
while (token !== TokenType.EOS) {
switch (token) {
case TokenType.StartTagOpen:
let child = new Node(scanner.tokenOffset, text.length, [], curr);
let child = new Node(scanner.getTokenOffset(), text.length, [], curr);
curr.children.push(child);
curr = child;
break;
case TokenType.StartTag:
curr.tag = scanner.tokenText;
curr.tag = scanner.getTokenText();
break;
case TokenType.StartTagClose:
curr.end = scanner.position; // might be later set to end tag position
curr.end = scanner.getTokenEnd(); // might be later set to end tag position
if (isEmptyElement(curr.tag) && curr !== htmlDocument) {
curr = curr.parent;
}
break;
case TokenType.EndTag:
let closeTag = scanner.tokenText;
let closeTag = scanner.getTokenText();
while (curr.tag !== closeTag && curr !== htmlDocument) {
curr = curr.parent;
}
@ -65,7 +64,7 @@ export function parse(text: string) : HTMLDocument {
case TokenType.StartTagSelfClose:
case TokenType.EndTagClose:
if (curr !== htmlDocument) {
curr.end = scanner.position;
curr.end = scanner.getTokenEnd();
curr = curr.parent;
}
break;

View file

@ -191,226 +191,216 @@ export enum ScannerState {
AttributeValue
}
export class Scanner {
/**
 * Token-by-token view over an HTML input, as returned by `createScanner`.
 * All getters describe the token produced by the most recent `scan()` call.
 */
export interface Scanner {
/** Reads the next token and returns its type; returns `TokenType.EOS` once the input is exhausted. */
scan() : TokenType;
/** Type of the token produced by the last `scan()` call. */
getTokenType(): TokenType;
/** Offset in the input at which the last token starts. */
getTokenOffset(): number;
/** Number of characters between the last token's start offset and the current stream position. */
getTokenLength(): number;
/** Current stream position, i.e. the offset just past the last token. */
getTokenEnd(): number;
/** Substring of the input between the last token's start offset and the current stream position. */
getTokenText(): string;
/** Current scanner state; can be passed as `initialState` to `createScanner` to continue on further input. */
getScannerState(): ScannerState;
}
private _stream: MultiLineStream;
private _state: ScannerState;
private _tokenType: TokenType;
private _tokenOffset: number;
/**
 * Creates a scanner over `input` that produces HTML tokens one at a time.
 *
 * All scanner state (the stream, the current ScannerState, the offset and type of the
 * last token) lives in this closure; the returned object only exposes `scan` and
 * read-only accessors.
 *
 * @param input the text to tokenize
 * @param initialState the state to start scanning in; defaults to `ScannerState.Content`
 * @returns a Scanner whose accessors describe the token produced by the latest `scan()` call
 */
export function createScanner(input: string, initialState: ScannerState = ScannerState.Content) : Scanner {
private _hasSpaceAfterTag: boolean;
private _lastTag: string;
let stream = new MultiLineStream(input);
let state = initialState;
let tokenOffset: number = 0;
let tokenType: number = void 0;
public setSource(input: string, initialState: ScannerState = ScannerState.Content): void {
this._stream = new MultiLineStream(input);
this._state = initialState;
let hasSpaceAfterTag: boolean;
let lastTag: string;
function nextElementName(): string {
return stream.advanceIfRegExp(/^[_:\w][_:\w-.\d]*/).toLowerCase();
}
public get position(): number {
return this._stream.pos();
function nextAttributeName(): string {
return stream.advanceIfRegExp(/^[^\s"'>/=\x00-\x0F\x7F\x80-\x9F]*/).toLowerCase();
}
public get scannerState(): number {
return this._state;
}
public get tokenType(): number {
return this._tokenType;
}
public get tokenOffset(): number {
return this._tokenOffset;
}
public get tokenLength(): number {
return this._stream.pos() - this._tokenOffset;
}
public get tokenText(): string {
return this._stream.getSource().substring(this._tokenOffset, this._stream.pos());
}
private nextElementName(): string {
return this._stream.advanceIfRegExp(/^[_:\w][_:\w-.\d]*/).toLowerCase();
}
private nextAttributeName(): string {
return this._stream.advanceIfRegExp(/^[^\s"'>/=\x00-\x0F\x7F\x80-\x9F]*/).toLowerCase();
}
private finishToken(offset: number, type: TokenType): TokenType {
this._tokenType = type;
this._tokenOffset = offset;
// records the start offset and type of the token just read and returns the type
function finishToken(offset: number, type: TokenType): TokenType {
tokenType = type;
tokenOffset = offset;
return type;
}
public scan(): TokenType {
let offset = this._stream.pos();
if (this._stream.eos()) {
return this.finishToken(offset, TokenType.EOS);
// Reads one token starting at the current stream position, updates the scanner state
// and returns the token type. Input that no state can handle is consumed one character
// at a time as TokenType.Unknown.
function scan(): TokenType {
let offset = stream.pos();
if (stream.eos()) {
return finishToken(offset, TokenType.EOS);
}
switch (this._state) {
switch (state) {
case ScannerState.WithinComment:
if (this._stream.advanceIfChars([_MIN, _MIN, _RAN])) { // -->
this._state = ScannerState.Content;
return this.finishToken(offset, TokenType.EndCommentTag);
if (stream.advanceIfChars([_MIN, _MIN, _RAN])) { // -->
state = ScannerState.Content;
return finishToken(offset, TokenType.EndCommentTag);
}
this._stream.advanceUntilChars([_MIN, _MIN, _RAN]); // -->
return this.finishToken(offset, TokenType.Comment);
stream.advanceUntilChars([_MIN, _MIN, _RAN]); // -->
return finishToken(offset, TokenType.Comment);
case ScannerState.WithinDoctype:
if (this._stream.advanceIfChar(_RAN)) {
this._state = ScannerState.Content;
return this.finishToken(offset, TokenType.EndDoctypeTag);
if (stream.advanceIfChar(_RAN)) {
state = ScannerState.Content;
return finishToken(offset, TokenType.EndDoctypeTag);
}
this._stream.advanceUntilChar(_RAN); // >
return this.finishToken(offset, TokenType.Doctype);
stream.advanceUntilChar(_RAN); // >
return finishToken(offset, TokenType.Doctype);
case ScannerState.Content:
if (this._stream.advanceIfChar(_LAN)) { // <
if (!this._stream.eos() && this._stream.peekChar() === _BNG) { // !
if (this._stream.advanceIfChars([_BNG, _MIN, _MIN])) { // <!--
this._state = ScannerState.WithinComment;
return this.finishToken(offset, TokenType.StartCommentTag);
if (stream.advanceIfChar(_LAN)) { // <
if (!stream.eos() && stream.peekChar() === _BNG) { // !
if (stream.advanceIfChars([_BNG, _MIN, _MIN])) { // <!--
state = ScannerState.WithinComment;
return finishToken(offset, TokenType.StartCommentTag);
}
if (this._stream.advanceIfRegExp(/^!doctype/i)) {
this._state = ScannerState.WithinDoctype;
return this.finishToken(offset, TokenType.StartDoctypeTag);
if (stream.advanceIfRegExp(/^!doctype/i)) {
state = ScannerState.WithinDoctype;
return finishToken(offset, TokenType.StartDoctypeTag);
}
}
if (this._stream.advanceIfChar(_FSL)) { // /
this._state = ScannerState.OpeningEndTag;
return this.finishToken(offset, TokenType.EndTagOpen);
if (stream.advanceIfChar(_FSL)) { // /
state = ScannerState.OpeningEndTag;
return finishToken(offset, TokenType.EndTagOpen);
}
this._state = ScannerState.OpeningStartTag;
return this.finishToken(offset, TokenType.StartTagOpen);
state = ScannerState.OpeningStartTag;
return finishToken(offset, TokenType.StartTagOpen);
}
this._stream.advanceUntilChar(_LAN);
return this.finishToken(offset, TokenType.Content);
stream.advanceUntilChar(_LAN);
return finishToken(offset, TokenType.Content);
case ScannerState.OpeningEndTag:
let tagName = this.nextElementName();
let tagName = nextElementName();
if (tagName.length > 0) {
return this.finishToken(offset, TokenType.EndTag);
} else if (this._stream.advanceIfChar(_RAN)) { // >
this._state = ScannerState.Content;
return this.finishToken(offset, TokenType.EndTagClose);
return finishToken(offset, TokenType.EndTag);
} else if (stream.advanceIfChar(_RAN)) { // >
state = ScannerState.Content;
return finishToken(offset, TokenType.EndTagClose);
}
this._stream.advanceUntilChar(_RAN);
return this.finishToken(offset, TokenType.Whitespace);
stream.advanceUntilChar(_RAN);
return finishToken(offset, TokenType.Whitespace);
case ScannerState.OpeningStartTag:
this._lastTag = this.nextElementName();
if (this._lastTag.length > 0) {
this._hasSpaceAfterTag = false;
this._state = ScannerState.WithinTag;
return this.finishToken(offset, TokenType.StartTag);
lastTag = nextElementName();
if (lastTag.length > 0) {
hasSpaceAfterTag = false;
state = ScannerState.WithinTag;
return finishToken(offset, TokenType.StartTag);
}
break;
case ScannerState.WithinTag:
if (this._stream.skipWhitespace()) {
this._hasSpaceAfterTag = true; // remember that we have seen a whitespace
return this.finishToken(offset, TokenType.Whitespace);
if (stream.skipWhitespace()) {
hasSpaceAfterTag = true; // remember that we have seen a whitespace
return finishToken(offset, TokenType.Whitespace);
}
if (this._hasSpaceAfterTag) {
let name = this.nextAttributeName();
if (hasSpaceAfterTag) {
let name = nextAttributeName();
if (name.length > 0) {
this._state = ScannerState.AttributeName;
this._hasSpaceAfterTag = false;
return this.finishToken(offset, TokenType.AttributeName);
state = ScannerState.AttributeName;
hasSpaceAfterTag = false;
return finishToken(offset, TokenType.AttributeName);
}
}
if (this._stream.advanceIfChars([_FSL, _RAN])) { // />
this._state = ScannerState.Content;
return this.finishToken(offset, TokenType.StartTagSelfClose);
if (stream.advanceIfChars([_FSL, _RAN])) { // />
state = ScannerState.Content;
return finishToken(offset, TokenType.StartTagSelfClose);
}
if (this._stream.advanceIfChar(_RAN)) { // >
if (this._lastTag === 'script') {
this._state = ScannerState.WithinScriptContent;
} else if (this._lastTag === 'style') {
this._state = ScannerState.WithinStyleContent;
if (stream.advanceIfChar(_RAN)) { // >
if (lastTag === 'script') {
state = ScannerState.WithinScriptContent;
} else if (lastTag === 'style') {
state = ScannerState.WithinStyleContent;
} else {
this._state = ScannerState.Content;
state = ScannerState.Content;
}
return this.finishToken(offset, TokenType.StartTagClose);
return finishToken(offset, TokenType.StartTagClose);
}
this._stream.advance(1);
return this.finishToken(offset, TokenType.Unknown);
stream.advance(1);
return finishToken(offset, TokenType.Unknown);
case ScannerState.AttributeName:
if (this._stream.skipWhitespace()) {
this._hasSpaceAfterTag = true;
return this.finishToken(offset, TokenType.Whitespace);
if (stream.skipWhitespace()) {
hasSpaceAfterTag = true;
return finishToken(offset, TokenType.Whitespace);
}
if (this._stream.advanceIfChar(_EQS)) {
this._state = ScannerState.AttributeValue;
return this.finishToken(offset, TokenType.DelimiterAssign);
if (stream.advanceIfChar(_EQS)) {
state = ScannerState.AttributeValue;
return finishToken(offset, TokenType.DelimiterAssign);
}
this._state = ScannerState.WithinTag;
return this.scan(); // no advance yet - jump to WithinTag
state = ScannerState.WithinTag;
return scan(); // no advance yet - jump to WithinTag
case ScannerState.AttributeValue:
if (this._stream.skipWhitespace()) {
return this.finishToken(offset, TokenType.Whitespace);
if (stream.skipWhitespace()) {
return finishToken(offset, TokenType.Whitespace);
}
let attributeValue = this._stream.advanceIfRegExp(/^[^\s"'`=<>]+/);
let attributeValue = stream.advanceIfRegExp(/^[^\s"'`=<>]+/);
if (attributeValue.length > 0) {
this._state = ScannerState.WithinTag;
this._hasSpaceAfterTag = false;
return this.finishToken(offset, TokenType.AttributeValue);
state = ScannerState.WithinTag;
hasSpaceAfterTag = false;
return finishToken(offset, TokenType.AttributeValue);
}
let ch = this._stream.peekChar();
let ch = stream.peekChar();
if (ch === _SQO || ch === _DQO) {
this._stream.advance(1); // consume quote
if (this._stream.advanceUntilChar(ch)) {
this._stream.advance(1); // consume quote
stream.advance(1); // consume quote
if (stream.advanceUntilChar(ch)) {
stream.advance(1); // consume quote
}
this._state = ScannerState.WithinTag;
this._hasSpaceAfterTag = false;
return this.finishToken(offset, TokenType.AttributeValue);
state = ScannerState.WithinTag;
hasSpaceAfterTag = false;
return finishToken(offset, TokenType.AttributeValue);
}
this._state = ScannerState.WithinTag;
this._hasSpaceAfterTag = false;
return this.scan(); // no advance yet - jump to WithinTag
state = ScannerState.WithinTag;
hasSpaceAfterTag = false;
return scan(); // no advance yet - jump to WithinTag
case ScannerState.WithinScriptContent:
// see http://stackoverflow.com/questions/14574471/how-do-browsers-parse-a-script-tag-exactly
let state = 1;
while (!this._stream.eos()) {
let match = this._stream.advanceIfRegExp(/<!--|-->|<\/?script\s*\/?>?/i);
let scriptState = 1;
while (!stream.eos()) {
let match = stream.advanceIfRegExp(/<!--|-->|<\/?script\s*\/?>?/i);
if (match.length === 0) {
this._stream.goToEnd();
return this.finishToken(offset, TokenType.Script);
stream.goToEnd();
return finishToken(offset, TokenType.Script);
} else if (match === '<!--') {
if (state === 1) {
state = 2;
if (scriptState === 1) {
scriptState = 2;
}
} else if (match === '-->') {
state = 1;
scriptState = 1;
} else if (match[1] !== '/') { // <script
if (state === 2) {
state = 3;
if (scriptState === 2) {
scriptState = 3;
}
} else { // </script
if (state === 3) {
state = 2;
if (scriptState === 3) {
scriptState = 2;
} else {
this._stream.goBack(match.length); // to the beginning of the closing tag
stream.goBack(match.length); // to the beginning of the closing tag
break;
}
}
}
this._state = ScannerState.Content;
if (offset < this._stream.pos()) {
return this.finishToken(offset, TokenType.Script);
state = ScannerState.Content;
if (offset < stream.pos()) {
return finishToken(offset, TokenType.Script);
}
return this.scan(); // no advance yet - jump to content
return scan(); // no advance yet - jump to content
// style content: everything up to the closing </style tag is a single Styles token
case ScannerState.WithinStyleContent:
this._stream.advanceUntilRegExp(/<\/style/i);
this._state = ScannerState.Content;
if (offset < this._stream.pos()) {
return this.finishToken(offset, TokenType.Styles);
stream.advanceUntilRegExp(/<\/style/i);
state = ScannerState.Content;
if (offset < stream.pos()) {
return finishToken(offset, TokenType.Styles);
}
return this.scan(); // no advance yet - jump to content
return scan(); // no advance yet - jump to content
}
this._stream.advance(1);
this._state = ScannerState.Content;
return this.finishToken(offset, TokenType.Unknown);
stream.advance(1);
state = ScannerState.Content;
return finishToken(offset, TokenType.Unknown);
}
return {
scan,
getTokenType: () => tokenType,
getTokenOffset: () => tokenOffset,
getTokenLength: () => stream.pos() - tokenOffset,
getTokenEnd: () => stream.pos(),
getTokenText: () => stream.getSource().substring(tokenOffset, stream.pos()),
getScannerState: () => state
};
}

View file

@ -5,7 +5,7 @@
'use strict';
import * as assert from 'assert';
import {Scanner, TokenType, ScannerState} from '../parser/htmlScanner';
import {Scanner, TokenType, ScannerState, createScanner} from '../parser/htmlScanner';
suite('HTML Scanner', () => {
@ -16,22 +16,22 @@ suite('HTML Scanner', () => {
}
/**
 * Scans every test input and compares the produced tokens with the expected ones.
 * The state the scanner ends in for one input becomes the initial state for the next input.
 */
function assertTokens(tests: {input: string; tokens: Token[]; }[]) {
let scanner = new Scanner();
let scannerState = ScannerState.Content;
for (let t of tests) {
scanner.setSource(t.input, scannerState);
let scanner = createScanner(t.input, scannerState);
let tokenType = scanner.scan();
let actual : Token[] = [];
// collect offset and type of every token up to (not including) EOS
while (tokenType !== TokenType.EOS) {
let actualToken : Token= { offset: scanner.tokenOffset, type: tokenType };
let actualToken : Token= { offset: scanner.getTokenOffset(), type: tokenType };
// only start and end tag name tokens also record their text
if (tokenType == TokenType.StartTag || tokenType == TokenType.EndTag) {
actualToken.content = t.input.substr(scanner.tokenOffset, scanner.tokenLength);
actualToken.content = t.input.substr(scanner.getTokenOffset(), scanner.getTokenLength());
}
actual.push(actualToken);
tokenType = scanner.scan();
}
assert.deepEqual(actual, t.tokens);
// carry the final scanner state over to the next input
scannerState = scanner.scannerState;
scannerState = scanner.getScannerState();
}
}