Add a fast TypeScript classifier

This commit is contained in:
Alex Dima 2019-07-15 23:59:47 +02:00
parent 8154b723da
commit 1b8a37d64e
8 changed files with 601 additions and 13 deletions

View file

@ -119,7 +119,8 @@ const copyrightFilter = [
'!resources/completions/**',
'!extensions/markdown-language-features/media/highlight.css',
'!extensions/html-language-features/server/src/modes/typescript/*',
'!extensions/*/server/bin/*'
'!extensions/*/server/bin/*',
'!src/vs/editor/test/node/classification/typescript-test.ts',
];
const eslintFilter = [

View file

@ -53,13 +53,7 @@
{
"language": "typescript",
"scopeName": "source.ts",
"path": "./syntaxes/TypeScript.tmLanguage.json",
"tokenTypes": {
"entity.name.type.instance.jsdoc": "other",
"entity.name.function.tagged-template": "other",
"meta.import string.quoted": "other",
"variable.other.jsdoc": "other"
}
"path": "./syntaxes/TypeScript.tmLanguage.json"
},
{
"language": "typescriptreact",

View file

@ -117,6 +117,9 @@ export class TokenizationStateStore {
if (deleteCount === 0) {
return;
}
if (start + deleteCount > this._len) {
deleteCount = this._len - start;
}
this._beginState.splice(start, deleteCount);
this._valid.splice(start, deleteCount);
this._len -= deleteCount;

View file

@ -116,6 +116,9 @@ export class TokensStore {
if (deleteCount === 0) {
return;
}
if (start + deleteCount > this._len) {
deleteCount = this._len - start;
}
this._lineTokens.splice(start, deleteCount);
this._len -= deleteCount;
}

View file

@ -0,0 +1,304 @@
/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See License.txt in the project root for license information.
*--------------------------------------------------------------------------------------------*/
import { StandardTokenType } from 'vs/editor/common/modes';
import { CharCode } from 'vs/base/common/charCode';
/**
 * Mutable cursor over the input text that accumulates classified tokens.
 * Tokens are stored in a flat number array as consecutive
 * (startOffset, length, tokenType) triples.
 */
class ParserContext {
	public readonly text: string;
	public readonly len: number;
	public readonly tokens: number[];
	public pos: number;

	private _tokenStartOffset: number;
	private _tokenType: StandardTokenType;

	constructor(text: string) {
		this.text = text;
		this.len = text.length;
		this.tokens = [];
		this.pos = 0;
		this._tokenStartOffset = 0;
		this._tokenType = StandardTokenType.Other;
	}

	private _charCodeAt(index: number): number {
		// Reading past the end yields NUL instead of NaN.
		return (index < this.len ? this.text.charCodeAt(index) : CharCode.Null);
	}

	/** Look at the char code `distance` positions ahead without consuming it. */
	peek(distance: number = 0): number {
		return this._charCodeAt(this.pos + distance);
	}

	/** Consume and return the char code at the current position. */
	next(): number {
		const ch = this._charCodeAt(this.pos);
		this.pos++;
		return ch;
	}

	advance(distance: number): void {
		this.pos += distance;
	}

	eof(): boolean {
		return this.pos >= this.len;
	}

	/** Mark the start of a token at `pos + deltaPos`. */
	beginToken(tokenType: StandardTokenType, deltaPos: number = 0): void {
		this._tokenStartOffset = this.pos + deltaPos;
		this._tokenType = tokenType;
	}

	/** Close the current token at `pos + deltaPos` and record it. */
	endToken(deltaPos: number = 0): void {
		const length = this.pos + deltaPos - this._tokenStartOffset;
		// Merge with the previous token when they touch and share a type.
		if (this.tokens.length > 0) {
			const prevStartOffset = this.tokens[this.tokens.length - 3];
			const prevLength = this.tokens[this.tokens.length - 2];
			const prevTokenType = this.tokens[this.tokens.length - 1];
			if (prevStartOffset + prevLength === this._tokenStartOffset && prevTokenType === this._tokenType) {
				// extend previous token instead of emitting a new triple
				this.tokens[this.tokens.length - 2] += length;
				return;
			}
		}
		this.tokens.push(this._tokenStartOffset, length, this._tokenType);
	}
}
/**
 * Classify `text` into strings, comments and regexes.
 * @returns a flat array of (startOffset, length, StandardTokenType) triples.
 */
export function parse(text: string): number[] {
	const context = new ParserContext(text);
	while (!context.eof()) {
		parseRoot(context);
	}
	return context.tokens;
}
/**
 * Scan code at the top level (or inside a `${...}` hole of a template
 * string), dispatching into the specialised parsers. Returns when an
 * unbalanced `}` is reached, which closes a template-string hole.
 */
function parseRoot(ctx: ParserContext): void {
	let curlyCount = 0;
	while (!ctx.eof()) {
		const ch = ctx.peek();
		if (ch === CharCode.SingleQuote) {
			parseSimpleString(ctx, CharCode.SingleQuote);
		} else if (ch === CharCode.DoubleQuote) {
			parseSimpleString(ctx, CharCode.DoubleQuote);
		} else if (ch === CharCode.BackTick) {
			parseInterpolatedString(ctx);
		} else if (ch === CharCode.Slash) {
			parseSlash(ctx);
		} else if (ch === CharCode.OpenCurlyBrace) {
			ctx.advance(1);
			curlyCount++;
		} else if (ch === CharCode.CloseCurlyBrace) {
			ctx.advance(1);
			curlyCount--;
			if (curlyCount < 0) {
				// unbalanced `}` — this must close a template-string hole
				return;
			}
		} else {
			ctx.advance(1);
		}
	}
}
/**
 * Consume a single- or double-quoted string (opening quote at `ctx.pos`)
 * and record it as one String token. Backslash escapes are skipped,
 * including a `\` line continuation over a \r\n pair.
 */
function parseSimpleString(ctx: ParserContext, closingQuote: number): void {
	ctx.beginToken(StandardTokenType.String);
	ctx.advance(1); // opening quote
	while (!ctx.eof()) {
		const ch = ctx.next();
		if (ch === CharCode.Backslash) {
			// an escaped \r\n counts as a single line continuation
			if (ctx.peek() === CharCode.CarriageReturn && ctx.peek(1) === CharCode.LineFeed) {
				ctx.advance(2);
			} else {
				ctx.advance(1);
			}
		} else if (ch === closingQuote) {
			// hit end quote, so stop
			break;
		}
	}
	ctx.endToken();
}
/**
 * Consume a backtick template string. The literal text is emitted as
 * String tokens; each `${...}` hole is handed back to `parseRoot` so the
 * embedded expression is classified as regular code.
 */
function parseInterpolatedString(ctx: ParserContext): void {
	ctx.beginToken(StandardTokenType.String);
	ctx.advance(1); // opening backtick
	while (!ctx.eof()) {
		const ch = ctx.next();
		if (ch === CharCode.Backslash) {
			// an escaped \r\n counts as a single line continuation
			if (ctx.peek() === CharCode.CarriageReturn && ctx.peek(1) === CharCode.LineFeed) {
				ctx.advance(2);
			} else {
				ctx.advance(1);
			}
		} else if (ch === CharCode.BackTick) {
			// hit end quote, so stop
			break;
		} else if (ch === CharCode.DollarSign && ctx.peek() === CharCode.OpenCurlyBrace) {
			// `${` ends the current string token; the hole is parsed as code
			ctx.advance(1);
			ctx.endToken();
			parseRoot(ctx);
			// parseRoot consumed the closing `}`; include it in the string
			ctx.beginToken(StandardTokenType.String, -1);
		}
	}
	ctx.endToken();
}
/**
 * Dispatch on a `/` at `ctx.pos`: block comment, line comment, regex
 * literal, or — failing all of those — a plain division operator.
 */
function parseSlash(ctx: ParserContext): void {
	const lookAhead = ctx.peek(1);
	if (lookAhead === CharCode.Asterisk) {
		parseMultiLineComment(ctx);
	} else if (lookAhead === CharCode.Slash) {
		parseSingleLineComment(ctx);
	} else if (!tryParseRegex(ctx)) {
		// just a division operator
		ctx.advance(1);
	}
}
/**
 * Attempt to parse a regular expression literal starting at the `/` at
 * `ctx.pos`. Two heuristics disambiguate it from division:
 *  - a regex cannot directly follow an identifier or number, and
 *  - after its flags it must be followed by `.` `;` `/` `,` `)` `]` `}`
 *    or end-of-text.
 * On success a RegEx token is emitted, `ctx.pos` is advanced past the
 * literal and true is returned; otherwise `ctx.pos` is left untouched
 * and false is returned.
 * See https://www.ecma-international.org/ecma-262/10.0/index.html#prod-RegularExpressionLiteral
 */
function tryParseRegex(ctx: ParserContext): boolean {
	// TODO: avoid regex...
	// Look at (up to) the 100 characters before the slash. Note: substring
	// clamps negative starts to 0; the previous substr(ctx.pos - 100, 100)
	// interpreted a negative start relative to the END of the text and read
	// unrelated content whenever ctx.pos < 100.
	const contentBefore = ctx.text.substring(Math.max(ctx.pos - 100, 0), ctx.pos);
	if (/[a-zA-Z0-9](\s*)$/.test(contentBefore)) {
		// Cannot start after an identifier
		return false;
	}

	let pos = 0;
	const len = ctx.len - ctx.pos;
	let inClass = false;

	// skip /
	pos++;

	while (pos < len) {
		const ch = ctx.peek(pos++);

		if (ch === CharCode.CarriageReturn || ch === CharCode.LineFeed) {
			// regex literals cannot span lines
			return false;
		}
		if (ch === CharCode.Backslash) {
			// Inspect the character FOLLOWING the backslash (offset `pos`);
			// peek() with no argument would look at the opening `/` instead.
			const nextCh = ctx.peek(pos);
			if (nextCh === CharCode.CarriageReturn || nextCh === CharCode.LineFeed) {
				return false;
			}
			// skip next character
			pos++;
			continue;
		}

		if (inClass) {
			if (ch === CharCode.CloseSquareBracket) {
				inClass = false;
			}
			continue;
		}

		if (ch === CharCode.Slash) {
			// cannot be directly followed by a /
			if (ctx.peek(pos) === CharCode.Slash) {
				return false;
			}
			// consume trailing flags (lowercase letters)
			do {
				let nextCh = ctx.peek(pos);
				if (nextCh >= CharCode.a && nextCh <= CharCode.z) {
					pos++;
					continue;
				} else {
					break;
				}
			} while (true);
			// TODO: avoid regex...
			if (/^(\s*)(\.|;|\/|,|\)|\]|\}|$)/.test(ctx.text.substring(ctx.pos + pos))) {
				// Must be followed by an operator of sorts
				ctx.beginToken(StandardTokenType.RegEx);
				ctx.advance(pos);
				ctx.endToken();
				return true;
			}
			return false;
		}
		if (ch === CharCode.OpenSquareBracket) {
			inClass = true;
			continue;
		}
	}
	// ran out of text without finding a closing /
	return false;
}
/**
 * Consume a `/* ... *` + `/` block comment (unterminated comments run to
 * end-of-text) and record it as one Comment token.
 */
function parseMultiLineComment(ctx: ParserContext): void {
	ctx.beginToken(StandardTokenType.Comment);
	ctx.advance(2); // skip the /*
	while (!ctx.eof()) {
		if (ctx.next() === CharCode.Asterisk && ctx.peek() === CharCode.Slash) {
			ctx.advance(1);
			break;
		}
	}
	ctx.endToken();
}
/**
 * Consume a `//` comment up to (and including) the next line break, or
 * to end-of-text, and record it as one Comment token.
 */
function parseSingleLineComment(ctx: ParserContext): void {
	ctx.beginToken(StandardTokenType.Comment);
	ctx.advance(2); // skip the //
	while (!ctx.eof()) {
		const ch = ctx.next();
		if (ch === CharCode.LineFeed || ch === CharCode.CarriageReturn) {
			break;
		}
	}
	ctx.endToken();
}

View file

@ -9,6 +9,7 @@ import { EditorAction, ServicesAccessor, registerEditorAction } from 'vs/editor/
import { StopWatch } from 'vs/base/common/stopwatch';
import { StandardTokenType } from 'vs/editor/common/modes';
import { ITextModel } from 'vs/editor/common/model';
import { parse } from 'vs/editor/common/modes/tokenization/typescript';
class ForceRetokenizeAction extends EditorAction {
constructor() {
@ -25,19 +26,88 @@ class ForceRetokenizeAction extends EditorAction {
return;
}
const model = editor.getModel();
model.resetTokenization();
// model.resetTokenization();
const sw = new StopWatch(true);
model.forceTokenization(model.getLineCount());
sw.stop();
console.log(`tokenization took ${sw.elapsed()}`);
if (!true) {
extractTokenTypes(model);
const expected = extractTokenTypes(model);
const sw2 = new StopWatch(true);
const actual = parse(model.getValue());
sw2.stop();
console.log(`classification took ${sw2.elapsed()}`);
let expectedIndex = 0, expectedCount = expected.length / 3;
let actualIndex = 0, actualCount = actual.length / 3;
outer: while (expectedIndex < expectedCount && actualIndex < actualCount) {
const expectedOffset = expected[3 * expectedIndex];
const expectedLength = expected[3 * expectedIndex + 1];
const expectedType = expected[3 * expectedIndex + 2];
const actualOffset = actual[3 * actualIndex];
const actualLength = actual[3 * actualIndex + 1];
const actualType = actual[3 * actualIndex + 2];
// TS breaks up comments or begins them before (in case of whitespace)...
if (actualType === StandardTokenType.Comment && expectedOffset <= actualOffset && expectedType === actualType) {
const actualEndOffset = actualOffset + actualLength;
while (expectedIndex < expectedCount && expected[3 * expectedIndex] + expected[3 * expectedIndex + 1] <= actualEndOffset) {
// console.log(`(Fuzzy match):`);
// console.log(`--- Expected: ${model.getPositionAt(expected[3 * expectedIndex])} - ${expected[3 * expectedIndex]}, ${expected[3 * expectedIndex + 1]}, ${expected[3 * expectedIndex + 2]}`);
// console.log(`--- Actual: ${model.getPositionAt(actualOffset)} - ${actualOffset}, ${actualLength}, ${actualType}`);
expectedIndex++;
}
actualIndex++;
continue;
}
// TS identifies regexes as strings and begins them before (in case of whitespace)...
if (actualType === StandardTokenType.RegEx && expectedOffset <= actualOffset && expectedType === StandardTokenType.String) {
const actualEndOffset = actualOffset + actualLength;
while (expectedIndex < expectedCount && expected[3 * expectedIndex] + expected[3 * expectedIndex + 1] <= actualEndOffset) {
expectedIndex++;
}
actualIndex++;
continue;
}
if (actualType === StandardTokenType.String && expectedType === actualType) {
const actualEndOffset = actualOffset + actualLength;
while (expectedIndex < expectedCount && expected[3 * expectedIndex] + expected[3 * expectedIndex + 1] <= actualEndOffset) {
// console.log(`(Fuzzy match):`);
// console.log(`--- Expected: ${model.getPositionAt(expected[3 * expectedIndex])} - ${expected[3 * expectedIndex]}, ${expected[3 * expectedIndex + 1]}, ${expected[3 * expectedIndex + 2]}`);
// console.log(`--- Actual: ${model.getPositionAt(actualOffset)} - ${actualOffset}, ${actualLength}, ${actualType}`);
expectedIndex++;
}
actualIndex++;
continue;
}
if (expectedOffset === actualOffset && expectedLength === actualLength && expectedType === actualType) {
expectedIndex++;
actualIndex++;
continue;
}
const expectedPosition = model.getPositionAt(expectedOffset);
console.error(`Missmatch at position: ${expectedPosition}`);
console.error(`Expected: ${model.getPositionAt(expectedOffset)} - ${expectedOffset}, ${expectedLength}, ${expectedType}`);
console.error(`Actual: ${model.getPositionAt(actualOffset)} - ${actualOffset}, ${actualLength}, ${actualType}`);
break;
}
if (expectedIndex !== expectedCount || actualIndex !== actualCount) {
console.error(`Missmatch at the end`);
}
console.log(`Finished comparison!`);
}
}
}
function extractTokenTypes(model: ITextModel): void {
function extractTokenTypes(model: ITextModel): number[] {
const eolLength = model.getEOL().length;
let result: number[] = [];
let resultLen: number = 0;
@ -46,6 +116,7 @@ function extractTokenTypes(model: ITextModel): void {
let offset = 0;
for (let lineNumber = 1, lineCount = model.getLineCount(); lineNumber <= lineCount; lineNumber++) {
const lineTokens = model.getLineTokens(lineNumber);
const lineText = lineTokens.getLineContent();
for (let i = 0, len = lineTokens.getCount(); i < len; i++) {
const tokenType = lineTokens.getStandardTokenType(i);
@ -67,7 +138,7 @@ function extractTokenTypes(model: ITextModel): void {
continue;
}
result[resultLen++] = startOffset; // - lastEndOffset
result[resultLen++] = startOffset;
result[resultLen++] = length;
result[resultLen++] = tokenType;
@ -75,8 +146,10 @@ function extractTokenTypes(model: ITextModel): void {
lastEndOffset = endOffset;
}
offset += lineTokens.getLineContent().length + eolLength;
offset += lineText.length + eolLength;
}
return result;
}
registerEditorAction(ForceRetokenizeAction);

View file

@ -0,0 +1,71 @@
///
/* tslint:disable */
const x01 = "string";
/// ^^^^^^^^ string
const x02 = '\'';
/// ^^^^ string
const x03 = '\n\'\t';
/// ^^^^^^^^ string
const x04 = 'this is\
/// ^^^^^^^^^ string\
a multiline string';
/// <------------------- string
const x05 = x01;// just some text
/// ^^^^^^^^^^^^^^^^^ comment
const x06 = x05;/* multi
/// ^^^^^^^^ comment
line *comment */
/// <---------------- comment
const x07 = 4 / 5;
const x08 = `howdy`;
/// ^^^^^^^ string
const x09 = `\'\"\``;
/// ^^^^^^^^ string
const x10 = `$[]`;
/// ^^^^^ string
const x11 = `${x07 +/**/3}px`;
/// ^^^ string
/// ^^^^ comment
/// ^^^^ string
const x12 = `${x07 + (function () { return 5; })()/**/}px`;
/// ^^^ string
/// ^^^^ comment
/// ^^^^ string
const x13 = /([\w\-]+)?(#([\w\-]+))?((.([\w\-]+))*)/;
/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ regex
const x14 = /\./g;
/// ^^^^^ regex
const x15 = Math.abs(x07) / x07; // speed
/// ^^^^^^^^ comment
const x16 = / x07; /.test('3');
/// ^^^^^^^^ regex
/// ^^^ string
const x17 = `.dialog-modal-block${true ? '.dimmed' : ''}`;
/// ^^^^^^^^^^^^^^^^^^^^^^ string
/// ^^^^^^^^^ string
/// ^^^^ string
const x18 = Math.min((14 <= 0.5 ? 123 / (2 * 1) : ''.length / (2 - (2 * 1))), 1);
/// ^^ string
const x19 = `${3 / '5'.length} km/h)`;
/// ^^^ string
/// ^^^ string
/// ^^^^^^^ string

View file

@ -0,0 +1,139 @@
/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See License.txt in the project root for license information.
*--------------------------------------------------------------------------------------------*/
import * as assert from 'assert';
import { StandardTokenType } from 'vs/editor/common/modes';
import * as fs from 'fs';
import { getPathFromAmdModule } from 'vs/base/common/amd';
import { parse } from 'vs/editor/common/modes/tokenization/typescript';
import { toStandardTokenType } from 'vs/editor/common/modes/supports/tokenization';
/** Signature of the classifier under test: text in, flat (offset, length, type) triples out. */
interface IParseFunc {
	(text: string): number[];
}

/** One expectation extracted from a fixture file's `///` annotation lines. */
interface IAssertion {
	testLineNumber: number; // 1-based line in the fixture file, for failure messages
	startOffset: number; // offset into the reconstructed content
	length: number;
	tokenType: StandardTokenType;
}

/** A fixture file reduced to its parseable content plus the assertions to check. */
interface ITest {
	content: string;
	assertions: IAssertion[];
}
/**
 * Read a fixture file and split it into the source content to classify and
 * the expected-token assertions embedded in it.
 *
 * Fixture format: the FIRST line is the "magic token" (e.g. `///`). Any
 * later line starting with that token is an assertion about the most recent
 * non-assertion line, in one of two shapes:
 *   `<magic>   ^^^^ tokenType`  — a span at the marked columns, or
 *   `<magic> <---- tokenType`   — a span starting at column 0 with the
 *                                  marker's length.
 * All other lines are source content. Assertion lines are removed from the
 * content, and per-line offsets are converted to absolute offsets.
 */
function parseTest(fileName: string): ITest {
	interface ILineWithAssertions {
		line: string;
		assertions: ILineAssertion[];
	}
	interface ILineAssertion {
		testLineNumber: number;
		startOffset: number; // relative to the owning line at this stage
		length: number;
		expectedTokenType: StandardTokenType;
	}

	const testContents = fs.readFileSync(fileName).toString();
	const lines = testContents.split(/\r\n|\n/);
	// line 0 declares the magic token; line 1 is the first content line
	const magicToken = lines[0];

	let currentElement: ILineWithAssertions = {
		line: lines[1],
		assertions: []
	};

	let parsedTest: ILineWithAssertions[] = [];
	for (let i = 2; i < lines.length; i++) {
		let line = lines[i];
		if (line.substr(0, magicToken.length) === magicToken) {
			// this is an assertion line
			// shape 1: spaces, a run of ^, a token type, optional trailing \
			let m1 = line.substr(magicToken.length).match(/^( +)([\^]+) (\w+)\\?$/);
			if (m1) {
				currentElement.assertions.push({
					testLineNumber: i + 1,
					// the marked span starts where the ^ run starts
					startOffset: magicToken.length + m1[1].length,
					length: m1[2].length,
					expectedTokenType: toStandardTokenType(m1[3])
				});
			} else {
				// shape 2: spaces, <---- arrow, a token type — span from column 0
				let m2 = line.substr(magicToken.length).match(/^( +)<(-+) (\w+)\\?$/);
				if (m2) {
					currentElement.assertions.push({
						testLineNumber: i + 1,
						startOffset: 0,
						length: m2[2].length,
						expectedTokenType: toStandardTokenType(m2[3])
					});
				} else {
					throw new Error(`Invalid test line at line number ${i + 1}.`);
				}
			}
		} else {
			// this is a line to be parsed
			parsedTest.push(currentElement);
			currentElement = {
				line: line,
				assertions: []
			};
		}
	}
	// flush the last pending content line
	parsedTest.push(currentElement);

	// convert per-line offsets into absolute offsets in the joined content
	let assertions: IAssertion[] = [];
	let offset = 0;
	for (let i = 0; i < parsedTest.length; i++) {
		const parsedTestLine = parsedTest[i];
		for (let j = 0; j < parsedTestLine.assertions.length; j++) {
			const assertion = parsedTestLine.assertions[j];
			assertions.push({
				testLineNumber: assertion.testLineNumber,
				startOffset: offset + assertion.startOffset,
				length: assertion.length,
				tokenType: assertion.expectedTokenType
			});
		}
		// +1 for the '\n' used when joining below
		offset += parsedTestLine.line.length + 1;
	}

	let content: string = parsedTest.map(parsedTestLine => parsedTestLine.line).join('\n');

	return { content, assertions };
}
/**
 * Run `parseFunc` over a fixture's content and verify that every assertion
 * falls entirely inside an actual token of the expected type. Tokens in
 * `actual` are flat (startOffset, length, tokenType) triples.
 */
function executeTest(fileName: string, parseFunc: IParseFunc): void {
	const { content, assertions } = parseTest(fileName);
	const actual = parseFunc(content);

	const actualCount = actual.length / 3;
	let actualIndex = 0;
	for (const assertion of assertions) {
		// advance to the first actual token that ends after the assertion start
		while (actualIndex < actualCount && actual[3 * actualIndex] + actual[3 * actualIndex + 1] <= assertion.startOffset) {
			actualIndex++;
		}
		// the token must start at or before the asserted span...
		assert.ok(
			actual[3 * actualIndex] <= assertion.startOffset,
			`Line ${assertion.testLineNumber} : startOffset : ${actual[3 * actualIndex]} <= ${assertion.startOffset}`
		);
		// ...and must cover it entirely...
		assert.ok(
			actual[3 * actualIndex] + actual[3 * actualIndex + 1] >= assertion.startOffset + assertion.length,
			`Line ${assertion.testLineNumber} : length : ${actual[3 * actualIndex]} + ${actual[3 * actualIndex + 1]} >= ${assertion.startOffset} + ${assertion.length}.`
		);
		// ...with the expected token type.
		assert.equal(
			actual[3 * actualIndex + 2],
			assertion.tokenType,
			`Line ${assertion.testLineNumber} : tokenType`);
	}
}
suite('Classification', () => {
	test('TypeScript', () => {
		// The fixture ships with the sources; map the compiled 'out' path back to 'src'.
		executeTest(getPathFromAmdModule(require, 'vs/editor/test/node/classification/typescript-test.ts').replace(/\bout\b/, 'src'), parse);
	});
});