deno/std/path/glob.ts
Nayeem Rahman 326ccb1095
feat(std/path): Align globToRegExp() with bash glob expansion (#7209)
- feat: Support escaping glob characters
- feat: Support more character classes
- feat: Match characters literally on segment parse failure
- fix: Match nothing for empty globs
- fix: Contain any glob syntax to its path segment
- perf: Remove extraneous separators from generated regex
- doc: Add detailed JSDoc
- chore: Remove old copyright headers
2020-10-01 11:37:03 +02:00

389 lines
12 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright 2018-2020 the Deno authors. All rights reserved. MIT license.
// This module is browser compatible.
import { NATIVE_OS } from "./_constants.ts";
import { join, normalize } from "./mod.ts";
import { SEP, SEP_PATTERN } from "./separator.ts";
export interface GlobOptions {
/** Extended glob syntax.
* See https://www.linuxjournal.com/content/bash-extended-globbing. Defaults
* to true. */
extended?: boolean;
/** Globstar syntax.
* See https://www.linuxjournal.com/content/globstar-new-bash-globbing-option.
* If false, `**` is treated like `*`. Defaults to true. */
globstar?: boolean;
/** Operating system. Defaults to the native OS. */
os?: typeof Deno.build.os;
}
export type GlobToRegExpOptions = GlobOptions;
// deno-fmt-ignore
const regExpEscapeChars = ["!", "$", "(", ")", "*", "+", ".", "=", "?", "[", "\\", "^", "{", "|"];
const rangeEscapeChars = ["-", "\\", "]"];
/** Convert a glob string to a regular expression.
*
* Tries to match bash glob expansion as closely as possible.
*
* Basic glob syntax:
* - `*` - Matches everything without leaving the path segment.
* - `{foo,bar}` - Matches `foo` or `bar`.
* - `[abcd]` - Matches `a`, `b`, `c` or `d`.
* - `[a-d]` - Matches `a`, `b`, `c` or `d`.
* - `[!abcd]` - Matches any single character besides `a`, `b`, `c` or `d`.
* - `[[:<class>:]]` - Matches any character belonging to `<class>`.
* - `[[:alnum:]]` - Matches any digit or letter.
* - `[[:digit:]abc]` - Matches any digit, `a`, `b` or `c`.
* - See https://facelessuser.github.io/wcmatch/glob/#posix-character-classes
* for a complete list of supported character classes.
* - `\` - Escapes the next character for an `os` other than `"windows"`.
* - \` - Escapes the next character for `os` set to `"windows"`.
* - `/` - Path separator.
* - `\` - Additional path separator only for `os` set to `"windows"`.
*
* Extended syntax:
* - Requires `{ extended: true }`.
* - `?(foo|bar)` - Matches 0 or 1 instance of `{foo,bar}`.
* - `@(foo|bar)` - Matches 1 instance of `{foo,bar}`. They behave the same.
* - `*(foo|bar)` - Matches _n_ instances of `{foo,bar}`.
* - `+(foo|bar)` - Matches _n > 0_ instances of `{foo,bar}`.
* - `!(foo|bar)` - Matches anything other than `{foo,bar}`.
* - See https://www.linuxjournal.com/content/bash-extended-globbing.
*
* Globstar syntax:
* - Requires `{ globstar: true }`.
* - `**` - Matches any number of any path segments.
* - Must comprise its entire path segment in the provided glob.
* - See https://www.linuxjournal.com/content/globstar-new-bash-globbing-option.
*
* Note the following properties:
* - The generated `RegExp` is anchored at both start and end.
* - Repeating and trailing separators are tolerated. Trailing separators in the
* provided glob have no meaning and are discarded.
* - Absolute globs will only match absolute paths, etc.
* - Empty globs will match nothing.
* - Any special glob syntax must be contained to one path segment. For example,
* `?(foo|bar/baz)` is invalid. The separator will take precendence and the
* first segment ends with an unclosed group.
* - If a path segment ends with unclosed groups or a dangling escape prefix, a
* parse error has occured. Every character for that segment is taken
* literally in this event.
*
* Limitations:
* - A negative group like `!(foo|bar)` will wrongly be converted to a negative
* look-ahead followed by a wildcard. This means that `!(foo).js` will wrongly
* fail to match `foobar.js`, even though `foobar` is not `foo`. Effectively,
* `!(foo|bar)` is treated like `!(@(foo|bar)*)`. This will work correctly if
* the group occurs not nested at the end of the segment. */
export function globToRegExp(
glob: string,
{ extended = true, globstar: globstarOption = true, os = NATIVE_OS }:
GlobToRegExpOptions = {},
): RegExp {
if (glob == "") {
return /(?!)/;
}
const sep = os == "windows" ? "(?:\\\\|/)+" : "/+";
const sepMaybe = os == "windows" ? "(?:\\\\|/)*" : "/*";
const seps = os == "windows" ? ["\\", "/"] : ["/"];
const globstar = os == "windows"
? "(?:[^\\\\/]*(?:\\\\|/|$)+)*"
: "(?:[^/]*(?:/|$)+)*";
const wildcard = os == "windows" ? "[^\\\\/]*" : "[^/]*";
const escapePrefix = os == "windows" ? "`" : "\\";
// Remove trailing separators.
let newLength = glob.length;
for (; newLength > 1 && seps.includes(glob[newLength - 1]); newLength--);
glob = glob.slice(0, newLength);
let regExpString = "";
// Terminates correctly. Trust that `j` is incremented every iteration.
for (let j = 0; j < glob.length;) {
let segment = "";
const groupStack = [];
let inRange = false;
let inEscape = false;
let endsWithSep = false;
let i = j;
// Terminates with `i` at the non-inclusive end of the current segment.
for (; i < glob.length && !seps.includes(glob[i]); i++) {
if (inEscape) {
inEscape = false;
const escapeChars = inRange ? rangeEscapeChars : regExpEscapeChars;
segment += escapeChars.includes(glob[i]) ? `\\${glob[i]}` : glob[i];
continue;
}
if (glob[i] == escapePrefix) {
inEscape = true;
continue;
}
if (glob[i] == "[") {
if (!inRange) {
inRange = true;
segment += "[";
if (glob[i + 1] == "!") {
i++;
segment += "^";
} else if (glob[i + 1] == "^") {
i++;
segment += "\\^";
}
continue;
} else if (glob[i + 1] == ":") {
let k = i + 1;
let value = "";
while (glob[k + 1] != null && glob[k + 1] != ":") {
value += glob[k + 1];
k++;
}
if (glob[k + 1] == ":" && glob[k + 2] == "]") {
i = k + 2;
if (value == "alnum") segment += "\\dA-Za-z";
else if (value == "alpha") segment += "A-Za-z";
else if (value == "ascii") segment += "\x00-\x7F";
else if (value == "blank") segment += "\t ";
else if (value == "cntrl") segment += "\x00-\x1F\x7F";
else if (value == "digit") segment += "\\d";
else if (value == "graph") segment += "\x21-\x7E";
else if (value == "lower") segment += "a-z";
else if (value == "print") segment += "\x20-\x7E";
else if (value == "punct") {
segment += "!\"#$%&'()*+,\\-./:;<=>?@[\\\\\\]^_{|}~";
} else if (value == "space") segment += "\\s\v";
else if (value == "upper") segment += "A-Z";
else if (value == "word") segment += "\\w";
else if (value == "xdigit") segment += "\\dA-Fa-f";
continue;
}
}
}
if (glob[i] == "]" && inRange) {
inRange = false;
segment += "]";
continue;
}
if (inRange) {
if (glob[i] == "\\") {
segment += `\\\\`;
} else {
segment += glob[i];
}
continue;
}
if (
glob[i] == ")" && groupStack.length > 0 &&
groupStack[groupStack.length - 1] != "BRACE"
) {
segment += ")";
const type = groupStack.pop()!;
if (type == "!") {
segment += wildcard;
} else if (type != "@") {
segment += type;
}
continue;
}
if (
glob[i] == "|" && groupStack.length > 0 &&
groupStack[groupStack.length - 1] != "BRACE"
) {
segment += "|";
continue;
}
if (glob[i] == "+" && extended && glob[i + 1] == "(") {
i++;
groupStack.push("+");
segment += "(?:";
continue;
}
if (glob[i] == "@" && extended && glob[i + 1] == "(") {
i++;
groupStack.push("@");
segment += "(?:";
continue;
}
if (glob[i] == "?") {
if (extended && glob[i + 1] == "(") {
i++;
groupStack.push("?");
segment += "(?:";
} else {
segment += ".";
}
continue;
}
if (glob[i] == "!" && extended && glob[i + 1] == "(") {
i++;
groupStack.push("!");
segment += "(?!";
continue;
}
if (glob[i] == "{") {
groupStack.push("BRACE");
segment += "(?:";
continue;
}
if (glob[i] == "}" && groupStack[groupStack.length - 1] == "BRACE") {
groupStack.pop();
segment += ")";
continue;
}
if (glob[i] == "," && groupStack[groupStack.length - 1] == "BRACE") {
segment += "|";
continue;
}
if (glob[i] == "*") {
if (extended && glob[i + 1] == "(") {
i++;
groupStack.push("*");
segment += "(?:";
} else {
const prevChar = glob[i - 1];
let numStars = 1;
while (glob[i + 1] == "*") {
i++;
numStars++;
}
const nextChar = glob[i + 1];
if (
globstarOption && numStars == 2 &&
[...seps, undefined].includes(prevChar) &&
[...seps, undefined].includes(nextChar)
) {
segment += globstar;
endsWithSep = true;
} else {
segment += wildcard;
}
}
continue;
}
segment += regExpEscapeChars.includes(glob[i]) ? `\\${glob[i]}` : glob[i];
}
// Check for unclosed groups or a dangling backslash.
if (groupStack.length > 0 || inRange || inEscape) {
// Parse failure. Take all characters from this segment literally.
segment = "";
for (const c of glob.slice(j, i)) {
segment += regExpEscapeChars.includes(c) ? `\\${c}` : c;
endsWithSep = false;
}
}
regExpString += segment;
if (!endsWithSep) {
regExpString += i < glob.length ? sep : sepMaybe;
endsWithSep = true;
}
// Terminates with `i` at the start of the next segment.
while (seps.includes(glob[i])) i++;
// Check that the next value of `j` is indeed higher than the current value.
if (!(i > j)) {
throw new Error("Assertion failure: i > j (potential infinite loop)");
}
j = i;
}
regExpString = `^${regExpString}$`;
return new RegExp(regExpString);
}
/** Test whether the given string is a glob */
export function isGlob(str: string): boolean {
const chars: Record<string, string> = { "{": "}", "(": ")", "[": "]" };
/* eslint-disable-next-line max-len */
const regex =
/\\(.)|(^!|\*|[\].+)]\?|\[[^\\\]]+\]|\{[^\\}]+\}|\(\?[:!=][^\\)]+\)|\([^|]+\|[^\\)]+\))/;
if (str === "") {
return false;
}
let match: RegExpExecArray | null;
while ((match = regex.exec(str))) {
if (match[2]) return true;
let idx = match.index + match[0].length;
// if an open bracket/brace/paren is escaped,
// set the index to the next closing character
const open = match[1];
const close = open ? chars[open] : null;
if (open && close) {
const n = str.indexOf(close, idx);
if (n !== -1) {
idx = n + 1;
}
}
str = str.slice(idx);
}
return false;
}
/** Like normalize(), but doesn't collapse "**\/.." when `globstar` is true. */
export function normalizeGlob(
glob: string,
{ globstar = false }: GlobOptions = {},
): string {
if (glob.match(/\0/g)) {
throw new Error(`Glob contains invalid characters: "${glob}"`);
}
if (!globstar) {
return normalize(glob);
}
const s = SEP_PATTERN.source;
const badParentPattern = new RegExp(
`(?<=(${s}|^)\\*\\*${s})\\.\\.(?=${s}|$)`,
"g",
);
return normalize(glob.replace(badParentPattern, "\0")).replace(/\0/g, "..");
}
/** Like join(), but doesn't collapse "**\/.." when `globstar` is true. */
export function joinGlobs(
globs: string[],
{ extended = false, globstar = false }: GlobOptions = {},
): string {
if (!globstar || globs.length == 0) {
return join(...globs);
}
if (globs.length === 0) return ".";
let joined: string | undefined;
for (const glob of globs) {
const path = glob;
if (path.length > 0) {
if (!joined) joined = path;
else joined += `${SEP}${path}`;
}
}
if (!joined) return ".";
return normalizeGlob(joined, { extended, globstar });
}