Add a fast TypeScript classifier

This commit is contained in:
Alex Dima 2019-07-15 23:59:47 +02:00
parent 8154b723da
commit 1b8a37d64e
8 changed files with 601 additions and 13 deletions

View file

@ -119,7 +119,8 @@ const copyrightFilter = [
'!resources/completions/**',
'!extensions/markdown-language-features/media/highlight.css',
'!extensions/html-language-features/server/src/modes/typescript/*',
'!extensions/*/server/bin/*'
'!extensions/*/server/bin/*',
'!src/vs/editor/test/node/classification/typescript-test.ts',
];
const eslintFilter = [

View file

@ -53,13 +53,7 @@
{
"language": "typescript",
"scopeName": "source.ts",
"path": "./syntaxes/TypeScript.tmLanguage.json",
"tokenTypes": {
"entity.name.type.instance.jsdoc": "other",
"entity.name.function.tagged-template": "other",
"meta.import string.quoted": "other",
"variable.other.jsdoc": "other"
}
"path": "./syntaxes/TypeScript.tmLanguage.json"
},
{
"language": "typescriptreact",

View file

@ -117,6 +117,9 @@ export class TokenizationStateStore {
if (deleteCount === 0) {
return;
}
if (start + deleteCount > this._len) {
deleteCount = this._len - start;
}
this._beginState.splice(start, deleteCount);
this._valid.splice(start, deleteCount);
this._len -= deleteCount;

View file

@ -116,6 +116,9 @@ export class TokensStore {
if (deleteCount === 0) {
return;
}
if (start + deleteCount > this._len) {
deleteCount = this._len - start;
}
this._lineTokens.splice(start, deleteCount);
this._len -= deleteCount;
}

View file

@ -0,0 +1,304 @@
/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See License.txt in the project root for license information.
*--------------------------------------------------------------------------------------------*/
import { StandardTokenType } from 'vs/editor/common/modes';
import { CharCode } from 'vs/base/common/charCode';
/**
 * Mutable cursor over the input text that accumulates classified tokens.
 * Tokens are stored in a flat number array as consecutive
 * (startOffset, length, tokenType) triples.
 */
class ParserContext {
	public readonly text: string;
	public readonly len: number;
	public readonly tokens: number[];
	public pos: number;

	private _tokenStartOffset: number;
	private _tokenType: StandardTokenType;

	constructor(text: string) {
		this.text = text;
		this.len = text.length;
		this.tokens = [];
		this.pos = 0;
		this._tokenStartOffset = 0;
		this._tokenType = StandardTokenType.Other;
	}

	private _charCodeAt(index: number): number {
		// Reading past the end yields NUL instead of NaN.
		return (index < this.len ? this.text.charCodeAt(index) : CharCode.Null);
	}

	/** Look at the char code `distance` positions ahead without consuming it. */
	peek(distance: number = 0): number {
		return this._charCodeAt(this.pos + distance);
	}

	/** Consume and return the char code at the current position. */
	next(): number {
		const ch = this._charCodeAt(this.pos);
		this.pos++;
		return ch;
	}

	advance(distance: number): void {
		this.pos += distance;
	}

	eof(): boolean {
		return this.pos >= this.len;
	}

	/** Mark the start of a token at `pos + deltaPos`. */
	beginToken(tokenType: StandardTokenType, deltaPos: number = 0): void {
		this._tokenStartOffset = this.pos + deltaPos;
		this._tokenType = tokenType;
	}

	/** Close the current token at `pos + deltaPos` and record it. */
	endToken(deltaPos: number = 0): void {
		const length = this.pos + deltaPos - this._tokenStartOffset;
		// Merge with the previous token when they touch and share a type.
		if (this.tokens.length > 0) {
			const prevStartOffset = this.tokens[this.tokens.length - 3];
			const prevLength = this.tokens[this.tokens.length - 2];
			const prevTokenType = this.tokens[this.tokens.length - 1];
			if (prevStartOffset + prevLength === this._tokenStartOffset && prevTokenType === this._tokenType) {
				// extend previous token instead of emitting a new triple
				this.tokens[this.tokens.length - 2] += length;
				return;
			}
		}
		this.tokens.push(this._tokenStartOffset, length, this._tokenType);
	}
}
/**
 * Classify `text` into strings, comments and regexes.
 * @returns a flat array of (startOffset, length, StandardTokenType) triples.
 */
export function parse(text: string): number[] {
	const context = new ParserContext(text);
	while (!context.eof()) {
		parseRoot(context);
	}
	return context.tokens;
}
/**
 * Scan code at the top level (or inside a `${...}` hole of a template
 * string), dispatching into the specialised parsers. Returns when an
 * unbalanced `}` is reached, which closes a template-string hole.
 */
function parseRoot(ctx: ParserContext): void {
	let curlyCount = 0;
	while (!ctx.eof()) {
		const ch = ctx.peek();
		if (ch === CharCode.SingleQuote) {
			parseSimpleString(ctx, CharCode.SingleQuote);
		} else if (ch === CharCode.DoubleQuote) {
			parseSimpleString(ctx, CharCode.DoubleQuote);
		} else if (ch === CharCode.BackTick) {
			parseInterpolatedString(ctx);
		} else if (ch === CharCode.Slash) {
			parseSlash(ctx);
		} else if (ch === CharCode.OpenCurlyBrace) {
			ctx.advance(1);
			curlyCount++;
		} else if (ch === CharCode.CloseCurlyBrace) {
			ctx.advance(1);
			curlyCount--;
			if (curlyCount < 0) {
				// unbalanced `}` — this must close a template-string hole
				return;
			}
		} else {
			ctx.advance(1);
		}
	}
}
/**
 * Consume a single- or double-quoted string (opening quote at `ctx.pos`)
 * and record it as one String token. Backslash escapes are skipped,
 * including a `\` line continuation over a \r\n pair.
 */
function parseSimpleString(ctx: ParserContext, closingQuote: number): void {
	ctx.beginToken(StandardTokenType.String);
	ctx.advance(1); // opening quote
	while (!ctx.eof()) {
		const ch = ctx.next();
		if (ch === CharCode.Backslash) {
			// an escaped \r\n counts as a single line continuation
			if (ctx.peek() === CharCode.CarriageReturn && ctx.peek(1) === CharCode.LineFeed) {
				ctx.advance(2);
			} else {
				ctx.advance(1);
			}
		} else if (ch === closingQuote) {
			// hit end quote, so stop
			break;
		}
	}
	ctx.endToken();
}
/**
 * Consume a backtick template string. The literal text is emitted as
 * String tokens; each `${...}` hole is handed back to `parseRoot` so the
 * embedded expression is classified as regular code.
 */
function parseInterpolatedString(ctx: ParserContext): void {
	ctx.beginToken(StandardTokenType.String);
	ctx.advance(1); // opening backtick
	while (!ctx.eof()) {
		const ch = ctx.next();
		if (ch === CharCode.Backslash) {
			// an escaped \r\n counts as a single line continuation
			if (ctx.peek() === CharCode.CarriageReturn && ctx.peek(1) === CharCode.LineFeed) {
				ctx.advance(2);
			} else {
				ctx.advance(1);
			}
		} else if (ch === CharCode.BackTick) {
			// hit end quote, so stop
			break;
		} else if (ch === CharCode.DollarSign && ctx.peek() === CharCode.OpenCurlyBrace) {
			// `${` ends the current string token; the hole is parsed as code
			ctx.advance(1);
			ctx.endToken();
			parseRoot(ctx);
			// parseRoot consumed the closing `}`; include it in the string
			ctx.beginToken(StandardTokenType.String, -1);
		}
	}
	ctx.endToken();
}
/**
 * Dispatch on a `/` at `ctx.pos`: block comment, line comment, regex
 * literal, or — failing all of those — a plain division operator.
 */
function parseSlash(ctx: ParserContext): void {
	const lookAhead = ctx.peek(1);
	if (lookAhead === CharCode.Asterisk) {
		parseMultiLineComment(ctx);
	} else if (lookAhead === CharCode.Slash) {
		parseSingleLineComment(ctx);
	} else if (!tryParseRegex(ctx)) {
		// just a division operator
		ctx.advance(1);
	}
}
/**
 * Attempt to parse a regular expression literal starting at the `/` at
 * `ctx.pos`. Two heuristics disambiguate it from division:
 *  - a regex cannot directly follow an identifier or number, and
 *  - after its flags it must be followed by `.` `;` `/` `,` `)` `]` `}`
 *    or end-of-text.
 * On success a RegEx token is emitted, `ctx.pos` is advanced past the
 * literal and true is returned; otherwise `ctx.pos` is left untouched
 * and false is returned.
 * See https://www.ecma-international.org/ecma-262/10.0/index.html#prod-RegularExpressionLiteral
 */
function tryParseRegex(ctx: ParserContext): boolean {
	// TODO: avoid regex...
	// Look at (up to) the 100 characters before the slash. Note: substring
	// clamps negative starts to 0; the previous substr(ctx.pos - 100, 100)
	// interpreted a negative start relative to the END of the text and read
	// unrelated content whenever ctx.pos < 100.
	const contentBefore = ctx.text.substring(Math.max(ctx.pos - 100, 0), ctx.pos);
	if (/[a-zA-Z0-9](\s*)$/.test(contentBefore)) {
		// Cannot start after an identifier
		return false;
	}

	let pos = 0;
	const len = ctx.len - ctx.pos;
	let inClass = false;

	// skip /
	pos++;

	while (pos < len) {
		const ch = ctx.peek(pos++);

		if (ch === CharCode.CarriageReturn || ch === CharCode.LineFeed) {
			// regex literals cannot span lines
			return false;
		}
		if (ch === CharCode.Backslash) {
			// Inspect the character FOLLOWING the backslash (offset `pos`);
			// peek() with no argument would look at the opening `/` instead.
			const nextCh = ctx.peek(pos);
			if (nextCh === CharCode.CarriageReturn || nextCh === CharCode.LineFeed) {
				return false;
			}
			// skip next character
			pos++;
			continue;
		}

		if (inClass) {
			if (ch === CharCode.CloseSquareBracket) {
				inClass = false;
			}
			continue;
		}

		if (ch === CharCode.Slash) {
			// cannot be directly followed by a /
			if (ctx.peek(pos) === CharCode.Slash) {
				return false;
			}
			// consume trailing flags (lowercase letters)
			do {
				let nextCh = ctx.peek(pos);
				if (nextCh >= CharCode.a && nextCh <= CharCode.z) {
					pos++;
					continue;
				} else {
					break;
				}
			} while (true);
			// TODO: avoid regex...
			if (/^(\s*)(\.|;|\/|,|\)|\]|\}|$)/.test(ctx.text.substring(ctx.pos + pos))) {
				// Must be followed by an operator of sorts
				ctx.beginToken(StandardTokenType.RegEx);
				ctx.advance(pos);
				ctx.endToken();
				return true;
			}
			return false;
		}
		if (ch === CharCode.OpenSquareBracket) {
			inClass = true;
			continue;
		}
	}
	// ran out of text without finding a closing /
	return false;
}
/**
 * Consume a `/* ... *` + `/` block comment (unterminated comments run to
 * end-of-text) and record it as one Comment token.
 */
function parseMultiLineComment(ctx: ParserContext): void {
	ctx.beginToken(StandardTokenType.Comment);
	ctx.advance(2); // skip the /*
	while (!ctx.eof()) {
		if (ctx.next() === CharCode.Asterisk && ctx.peek() === CharCode.Slash) {
			ctx.advance(1);
			break;
		}
	}
	ctx.endToken();
}
/**
 * Consume a `//` comment up to (and including) the next line break, or
 * to end-of-text, and record it as one Comment token.
 */
function parseSingleLineComment(ctx: ParserContext): void {
	ctx.beginToken(StandardTokenType.Comment);
	ctx.advance(2); // skip the //
	while (!ctx.eof()) {
		const ch = ctx.next();
		if (ch === CharCode.LineFeed || ch === CharCode.CarriageReturn) {
			break;
		}
	}
	ctx.endToken();
}

View file

@ -9,6 +9,7 @@ import { EditorAction, ServicesAccessor, registerEditorAction } from 'vs/editor/
import { StopWatch } from 'vs/base/common/stopwatch';
import { StandardTokenType } from 'vs/editor/common/modes';
import { ITextModel } from 'vs/editor/common/model';
import { parse } from 'vs/editor/common/modes/tokenization/typescript';
class ForceRetokenizeAction extends EditorAction {
constructor() {
@ -25,19 +26,88 @@ class ForceRetokenizeAction extends EditorAction {
return;
}
const model = editor.getModel();
model.resetTokenization();
// model.resetTokenization();
const sw = new StopWatch(true);
model.forceTokenization(model.getLineCount());
sw.stop();
console.log(`tokenization took ${sw.elapsed()}`);
if (!true) {
extractTokenTypes(model);
const expected = extractTokenTypes(model);
const sw2 = new StopWatch(true);
const actual = parse(model.getValue());
sw2.stop();
console.log(`classification took ${sw2.elapsed()}`);
let expectedIndex = 0, expectedCount = expected.length / 3;
let actualIndex = 0, actualCount = actual.length / 3;
outer: while (expectedIndex < expectedCount && actualIndex < actualCount) {
const expectedOffset = expected[3 * expectedIndex];
const expectedLength = expected[3 * expectedIndex + 1];
const expectedType = expected[3 * expectedIndex + 2];
const actualOffset = actual[3 * actualIndex];
const actualLength = actual[3 * actualIndex + 1];
const actualType = actual[3 * actualIndex + 2];
// TS breaks up comments or begins them before (in case of whitespace)...
if (actualType === StandardTokenType.Comment && expectedOffset <= actualOffset && expectedType === actualType) {
const actualEndOffset = actualOffset + actualLength;
while (expectedIndex < expectedCount && expected[3 * expectedIndex] + expected[3 * expectedIndex + 1] <= actualEndOffset) {
// console.log(`(Fuzzy match):`);
// console.log(`--- Expected: ${model.getPositionAt(expected[3 * expectedIndex])} - ${expected[3 * expectedIndex]}, ${expected[3 * expectedIndex + 1]}, ${expected[3 * expectedIndex + 2]}`);
// console.log(`--- Actual: ${model.getPositionAt(actualOffset)} - ${actualOffset}, ${actualLength}, ${actualType}`);
expectedIndex++;
}
actualIndex++;
continue;
}
// TS identifies regexes as strings and begins them before (in case of whitespace)...
if (actualType === StandardTokenType.RegEx && expectedOffset <= actualOffset && expectedType === StandardTokenType.String) {
const actualEndOffset = actualOffset + actualLength;
while (expectedIndex < expectedCount && expected[3 * expectedIndex] + expected[3 * expectedIndex + 1] <= actualEndOffset) {
expectedIndex++;
}
actualIndex++;
continue;
}
if (actualType === StandardTokenType.String && expectedType === actualType) {
const actualEndOffset = actualOffset + actualLength;
while (expectedIndex < expectedCount && expected[3 * expectedIndex] + expected[3 * expectedIndex + 1] <= actualEndOffset) {
// console.log(`(Fuzzy match):`);
// console.log(`--- Expected: ${model.getPositionAt(expected[3 * expectedIndex])} - ${expected[3 * expectedIndex]}, ${expected[3 * expectedIndex + 1]}, ${expected[3 * expectedIndex + 2]}`);
// console.log(`--- Actual: ${model.getPositionAt(actualOffset)} - ${actualOffset}, ${actualLength}, ${actualType}`);
expectedIndex++;
}
actualIndex++;
continue;
}
if (expectedOffset === actualOffset && expectedLength === actualLength && expectedType === actualType) {
expectedIndex++;
actualIndex++;
continue;
}
const expectedPosition = model.getPositionAt(expectedOffset);
console.error(`Missmatch at position: ${expectedPosition}`);
console.error(`Expected: ${model.getPositionAt(expectedOffset)} - ${expectedOffset}, ${expectedLength}, ${expectedType}`);
console.error(`Actual: ${model.getPositionAt(actualOffset)} - ${actualOffset}, ${actualLength}, ${actualType}`);
break;
}
if (expectedIndex !== expectedCount || actualIndex !== actualCount) {
console.error(`Missmatch at the end`);
}
console.log(`Finished comparison!`);
}
}
}
function extractTokenTypes(model: ITextModel): void {
function extractTokenTypes(model: ITextModel): number[] {
const eolLength = model.getEOL().length;
let result: number[] = [];
let resultLen: number = 0;
@ -46,6 +116,7 @@ function extractTokenTypes(model: ITextModel): void {
let offset = 0;
for (let lineNumber = 1, lineCount = model.getLineCount(); lineNumber <= lineCount; lineNumber++) {
const lineTokens = model.getLineTokens(lineNumber);
const lineText = lineTokens.getLineContent();
for (let i = 0, len = lineTokens.getCount(); i < len; i++) {
const tokenType = lineTokens.getStandardTokenType(i);
@ -67,7 +138,7 @@ function extractTokenTypes(model: ITextModel): void {
continue;
}
result[resultLen++] = startOffset; // - lastEndOffset
result[resultLen++] = startOffset;
result[resultLen++] = length;
result[resultLen++] = tokenType;
@ -75,8 +146,10 @@ function extractTokenTypes(model: ITextModel): void {
lastEndOffset = endOffset;
}
offset += lineTokens.getLineContent().length + eolLength;
offset += lineText.length + eolLength;
}
return result;
}
registerEditorAction(ForceRetokenizeAction);

View file

@ -0,0 +1,71 @@
///
/* tslint:disable */
const x01 = "string";
/// ^^^^^^^^ string
const x02 = '\'';
/// ^^^^ string
const x03 = '\n\'\t';
/// ^^^^^^^^ string
const x04 = 'this is\
/// ^^^^^^^^^ string\
a multiline string';
/// <------------------- string
const x05 = x01;// just some text
/// ^^^^^^^^^^^^^^^^^ comment
const x06 = x05;/* multi
/// ^^^^^^^^ comment
line *comment */
/// <---------------- comment
const x07 = 4 / 5;
const x08 = `howdy`;
/// ^^^^^^^ string
const x09 = `\'\"\``;
/// ^^^^^^^^ string
const x10 = `$[]`;
/// ^^^^^ string
const x11 = `${x07 +/**/3}px`;
/// ^^^ string
/// ^^^^ comment
/// ^^^^ string
const x12 = `${x07 + (function () { return 5; })()/**/}px`;
/// ^^^ string
/// ^^^^ comment
/// ^^^^ string
const x13 = /([\w\-]+)?(#([\w\-]+))?((.([\w\-]+))*)/;
/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ regex
const x14 = /\./g;
/// ^^^^^ regex
const x15 = Math.abs(x07) / x07; // speed
/// ^^^^^^^^ comment
const x16 = / x07; /.test('3');
/// ^^^^^^^^ regex
/// ^^^ string
const x17 = `.dialog-modal-block${true ? '.dimmed' : ''}`;
/// ^^^^^^^^^^^^^^^^^^^^^^ string
/// ^^^^^^^^^ string
/// ^^^^ string
const x18 = Math.min((14 <= 0.5 ? 123 / (2 * 1) : ''.length / (2 - (2 * 1))), 1);
/// ^^ string
const x19 = `${3 / '5'.length} km/h)`;
/// ^^^ string
/// ^^^ string
/// ^^^^^^^ string

View file

@ -0,0 +1,139 @@
/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See License.txt in the project root for license information.
*--------------------------------------------------------------------------------------------*/
import * as assert from 'assert';
import { StandardTokenType } from 'vs/editor/common/modes';
import * as fs from 'fs';
import { getPathFromAmdModule } from 'vs/base/common/amd';
import { parse } from 'vs/editor/common/modes/tokenization/typescript';
import { toStandardTokenType } from 'vs/editor/common/modes/supports/tokenization';
/** Signature of the classifier under test: text in, flat (offset, length, type) triples out. */
interface IParseFunc {
	(text: string): number[];
}

/** One expectation extracted from a fixture file's `///` annotation lines. */
interface IAssertion {
	testLineNumber: number; // 1-based line in the fixture file, for failure messages
	startOffset: number; // offset into the reconstructed content
	length: number;
	tokenType: StandardTokenType;
}

/** A fixture file reduced to its parseable content plus the assertions to check. */
interface ITest {
	content: string;
	assertions: IAssertion[];
}
/**
 * Read a fixture file and split it into the source content to classify and
 * the expected-token assertions embedded in it.
 *
 * Fixture format: the FIRST line is the "magic token" (e.g. `///`). Any
 * later line starting with that token is an assertion about the most recent
 * non-assertion line, in one of two shapes:
 *   `<magic>   ^^^^ tokenType`  — a span at the marked columns, or
 *   `<magic> <---- tokenType`   — a span starting at column 0 with the
 *                                  marker's length.
 * All other lines are source content. Assertion lines are removed from the
 * content, and per-line offsets are converted to absolute offsets.
 */
function parseTest(fileName: string): ITest {
	interface ILineWithAssertions {
		line: string;
		assertions: ILineAssertion[];
	}
	interface ILineAssertion {
		testLineNumber: number;
		startOffset: number; // relative to the owning line at this stage
		length: number;
		expectedTokenType: StandardTokenType;
	}

	const testContents = fs.readFileSync(fileName).toString();
	const lines = testContents.split(/\r\n|\n/);
	// line 0 declares the magic token; line 1 is the first content line
	const magicToken = lines[0];

	let currentElement: ILineWithAssertions = {
		line: lines[1],
		assertions: []
	};

	let parsedTest: ILineWithAssertions[] = [];
	for (let i = 2; i < lines.length; i++) {
		let line = lines[i];
		if (line.substr(0, magicToken.length) === magicToken) {
			// this is an assertion line
			// shape 1: spaces, a run of ^, a token type, optional trailing \
			let m1 = line.substr(magicToken.length).match(/^( +)([\^]+) (\w+)\\?$/);
			if (m1) {
				currentElement.assertions.push({
					testLineNumber: i + 1,
					// the marked span starts where the ^ run starts
					startOffset: magicToken.length + m1[1].length,
					length: m1[2].length,
					expectedTokenType: toStandardTokenType(m1[3])
				});
			} else {
				// shape 2: spaces, <---- arrow, a token type — span from column 0
				let m2 = line.substr(magicToken.length).match(/^( +)<(-+) (\w+)\\?$/);
				if (m2) {
					currentElement.assertions.push({
						testLineNumber: i + 1,
						startOffset: 0,
						length: m2[2].length,
						expectedTokenType: toStandardTokenType(m2[3])
					});
				} else {
					throw new Error(`Invalid test line at line number ${i + 1}.`);
				}
			}
		} else {
			// this is a line to be parsed
			parsedTest.push(currentElement);
			currentElement = {
				line: line,
				assertions: []
			};
		}
	}
	// flush the last pending content line
	parsedTest.push(currentElement);

	// convert per-line offsets into absolute offsets in the joined content
	let assertions: IAssertion[] = [];
	let offset = 0;
	for (let i = 0; i < parsedTest.length; i++) {
		const parsedTestLine = parsedTest[i];
		for (let j = 0; j < parsedTestLine.assertions.length; j++) {
			const assertion = parsedTestLine.assertions[j];
			assertions.push({
				testLineNumber: assertion.testLineNumber,
				startOffset: offset + assertion.startOffset,
				length: assertion.length,
				tokenType: assertion.expectedTokenType
			});
		}
		// +1 for the '\n' used when joining below
		offset += parsedTestLine.line.length + 1;
	}

	let content: string = parsedTest.map(parsedTestLine => parsedTestLine.line).join('\n');

	return { content, assertions };
}
/**
 * Run `parseFunc` over a fixture's content and verify that every assertion
 * falls entirely inside an actual token of the expected type. Tokens in
 * `actual` are flat (startOffset, length, tokenType) triples.
 */
function executeTest(fileName: string, parseFunc: IParseFunc): void {
	const { content, assertions } = parseTest(fileName);
	const actual = parseFunc(content);

	const actualCount = actual.length / 3;
	let actualIndex = 0;
	for (const assertion of assertions) {
		// advance to the first actual token that ends after the assertion start
		while (actualIndex < actualCount && actual[3 * actualIndex] + actual[3 * actualIndex + 1] <= assertion.startOffset) {
			actualIndex++;
		}
		// the token must start at or before the asserted span...
		assert.ok(
			actual[3 * actualIndex] <= assertion.startOffset,
			`Line ${assertion.testLineNumber} : startOffset : ${actual[3 * actualIndex]} <= ${assertion.startOffset}`
		);
		// ...and must cover it entirely...
		assert.ok(
			actual[3 * actualIndex] + actual[3 * actualIndex + 1] >= assertion.startOffset + assertion.length,
			`Line ${assertion.testLineNumber} : length : ${actual[3 * actualIndex]} + ${actual[3 * actualIndex + 1]} >= ${assertion.startOffset} + ${assertion.length}.`
		);
		// ...with the expected token type.
		assert.equal(
			actual[3 * actualIndex + 2],
			assertion.tokenType,
			`Line ${assertion.testLineNumber} : tokenType`);
	}
}
suite('Classification', () => {
	test('TypeScript', () => {
		// The fixture ships with the sources; map the compiled 'out' path back to 'src'.
		executeTest(getPathFromAmdModule(require, 'vs/editor/test/node/classification/typescript-test.ts').replace(/\bout\b/, 'src'), parse);
	});
});