Improve markdown link regexp (#152533)

* Improve markdown link regexp

This makes the Markdown link regexp more readable and combines the two regular expressions we were previously running into a single one.

* Fixed backtracking
This commit is contained in:
Matt Bierner 2022-06-18 21:25:54 -07:00 committed by GitHub
parent 5ffcfde11d
commit 5a175207de
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 45 additions and 39 deletions

View file

@ -131,11 +131,14 @@ export type MdLink = MdInlineLink | MdLinkDefinition;
function extractDocumentLink(
document: SkinnyTextDocument,
pre: number,
link: string,
pre: string,
rawLink: string,
matchIndex: number | undefined
): MdLink | undefined {
const offset = (matchIndex || 0) + pre;
const isAngleBracketLink = rawLink.startsWith('<');
const link = stripAngleBrackets(rawLink);
const offset = (matchIndex || 0) + pre.length + (isAngleBracketLink ? 1 : 0);
const linkStart = document.positionAt(offset);
const linkEnd = document.positionAt(offset + link.length);
try {
@ -185,20 +188,36 @@ function stripAngleBrackets(link: string) {
return link.replace(angleBracketLinkRe, '$1');
}
/**
* Matches `[text](link)`
*/
const linkPattern = /(\[((!\[[^\]]*?\]\(\s*)([^\s\(\)]+?)\s*\)\]|(?:\\\]|[^\]]|\][^(])*\])\(\s*)(([^\s\(\)]|\([^\s\(\)]*?\))+)\s*("[^"]*"|'[^']*'|\([^\(\)]*\))?\s*\)/g;
const r = String.raw;
/**
* Matches `[text](<link>)`
* Matches `[text](link)` or `[text](<link>)`
*/
const linkPatternAngle = /(\[((!\[[^\]]*?\]\(\s*)([^\s\(\)]+?)\s*\)\]|(?:\\\]|[^\]]|\][^(])*\])\(\s*<)(([^<>]|\([^\s\(\)]*?\))+)>\s*("[^"]*"|'[^']*'|\([^\(\)]*\))?\s*\)/g;
const linkPattern = new RegExp([
	// Capture group 1 (prefix): the bracketed link text up to and including the opening paren
	r`(\[`,
	r`(?:`,
	r`[^\[\]\\]|`, // a char that is neither a bracket nor a backslash, or...
	r`\\.|`, // a backslash-escaped char, or...
	r`\[[^\[\]]*\]`, // one nested, balanced bracket pair
	r`)*`,
	r`\]`,
	r`\(\s*)`, // the opening paren plus leading whitespace ends group 1

	// Capture group 2: the link destination
	r`(`,
	r`[^\s\(\)\<](?:[^\s\(\)]|\([^\s\(\)]*?\))*|`, // bare link containing no whitespace, or...
	r`<[^<>]*>`, // link wrapped in angle brackets
	r`)`,

	// Optional title (double-quoted, single-quoted or parenthesized); not captured
	r`\s*(?:"[^"]*"|'[^']*'|\([^\(\)]*\))?\s*`,
	r`\)`,
].join(''), 'g');
/**
* Matches `[text][ref]` or `[shorthand]`
*/
* Matches `[text][ref]` or `[shorthand]`
*/
const referenceLinkPattern = /(^|[^\]\\])(?:(?:(\[((?:\\\]|[^\]])+)\]\[\s*?)([^\s\]]*?)\]|\[\s*?([^\s\]]*?)\])(?![\:\(]))/gm;
/**
@ -270,36 +289,23 @@ export class MdLinkComputer {
/**
 * Generator over the inline links found in the text of `document`.
 *
 * Each regexp match is handed to `extractDocumentLink`; a result is yielded
 * only when one is returned and, for the top-level matches, when its
 * `source.hrefRange` is not contained in `noLinkRanges`.
 *
 * NOTE(review): this span appears to carry both the removed and the added
 * side of a diff (a separate `linkPatternAngle` pass, and `matchLinkData`
 * declared twice in the second loop) — confirm which lines are current.
 */
private *getInlineLinks(document: SkinnyTextDocument, noLinkRanges: NoLinkRanges): Iterable<MdLink> {
const text = document.getText();
// Pass over `[text](<link>)` matches, using the separate angle-bracket pattern.
for (const match of text.matchAll(linkPatternAngle)) {
const matchImageData = match[4] && extractDocumentLink(document, match[3].length + 1, match[4], match.index);
if (matchImageData && !noLinkRanges.contains(matchImageData.source.hrefRange)) {
yield matchImageData;
}
const matchLinkData = extractDocumentLink(document, match[1].length, match[5], match.index);
if (matchLinkData && !noLinkRanges.contains(matchLinkData.source.hrefRange)) {
yield matchLinkData;
}
}
// Pass over matches of `linkPattern`.
for (const match of text.matchAll(linkPattern)) {
const matchImageData = match[4] && extractDocumentLink(document, match[3].length + 1, match[4], match.index);
if (matchImageData && !noLinkRanges.contains(matchImageData.source.hrefRange)) {
yield matchImageData;
}
// Skip destinations that start with '<' in this pass.
if (match[5] !== undefined && match[5].startsWith('<')) {
continue;
}
// NOTE(review): `matchLinkData` is declared on both of the next two lines; the first
// (numeric prefix length, match[5]) looks like the superseded call — confirm.
const matchLinkData = extractDocumentLink(document, match[1].length, match[5], match.index);
// match[1] is the prefix capture (`[text](` plus whitespace) and match[2] the raw destination.
const matchLinkData = extractDocumentLink(document, match[1], match[2], match.index);
if (matchLinkData && !noLinkRanges.contains(matchLinkData.source.hrefRange)) {
yield matchLinkData;
// Also scan the prefix capture (match[1], i.e. the link text) for nested links,
// such as an image used as the text of a link.
for (const innerMatch of match[1].matchAll(linkPattern)) {
// innerMatch.index is relative to match[1], so it is added to the outer match index.
const innerData = extractDocumentLink(document, innerMatch[1], innerMatch[2], (match.index ?? 0) + (innerMatch.index ?? 0));
// Nested results are yielded without consulting noLinkRanges.
if (innerData) {
yield innerData;
}
}
}
}
}
private *getAutoLinks(document: SkinnyTextDocument, noLinkRanges: NoLinkRanges): Iterable<MdLink> {
private * getAutoLinks(document: SkinnyTextDocument, noLinkRanges: NoLinkRanges): Iterable<MdLink> {
const text = document.getText();
for (const match of text.matchAll(autoLinkPattern)) {

View file

@ -32,7 +32,7 @@ function assertLinksEqual(actualLinks: readonly vscode.DocumentLink[], expectedR
}
}
suite('markdown.DocumentLinkProvider', () => {
suite('Markdown: DocumentLinkProvider', () => {
test('Should not return anything for empty document', async () => {
const links = await getLinksForFile('');
assert.strictEqual(links.length, 0);
@ -131,24 +131,24 @@ suite('markdown.DocumentLinkProvider', () => {
{
const links = await getLinksForFile('[![alt text](image.jpg)](https://example.com)');
assertLinksEqual(links, [
new vscode.Range(0, 25, 0, 44),
new vscode.Range(0, 13, 0, 22),
new vscode.Range(0, 25, 0, 44)
]);
}
{
const links = await getLinksForFile('[![a]( whitespace.jpg )]( https://whitespace.com )');
assertLinksEqual(links, [
new vscode.Range(0, 26, 0, 48),
new vscode.Range(0, 7, 0, 21),
new vscode.Range(0, 26, 0, 48)
]);
}
{
const links = await getLinksForFile('[![a](img1.jpg)](file1.txt) text [![a](img2.jpg)](file2.txt)');
assertLinksEqual(links, [
new vscode.Range(0, 6, 0, 14),
new vscode.Range(0, 17, 0, 26),
new vscode.Range(0, 39, 0, 47),
new vscode.Range(0, 6, 0, 14),
new vscode.Range(0, 50, 0, 59),
new vscode.Range(0, 39, 0, 47),
]);
}
});