mirror of
https://github.com/zsh-users/zsh
synced 2024-09-29 04:35:22 +00:00
25138(? mailing list stuck): rewrite of completion matching.
Will one day use multibyte/wide characters, doesn't yet.
This commit is contained in:
parent
2dcc8627c9
commit
bb68ee8db7
11
ChangeLog
11
ChangeLog
|
@ -1,3 +1,14 @@
|
|||
2008-06-08 Peter Stephenson <p.w.stephenson@ntlworld.com>
|
||||
|
||||
* 25138(?): Completion/compinstall, Doc/Zsh/compwid.yo,
|
||||
Src/pattern.c, Src/zsh.h, Src/Zle/comp.h, Src/Zle/complete.c,
|
||||
Src/Zle/compmatch.c, Src/Zle/computil.c: enhance completion
|
||||
matching to make it possible to use wide characters (not yet
|
||||
done); also allows named character ranges, so that
|
||||
{[:lower:]}={[:upper:]} works (for now, just for single-byte
|
||||
character sets). Committing this while the mailing list
|
||||
appears to be stuck.
|
||||
|
||||
2008-06-05 Doug Kearns <dougkearns@gmail.com>
|
||||
|
||||
* unposted: Completion/Redhat/Command/_rpm: fix typo
|
||||
|
|
|
@ -985,8 +985,14 @@ __ci_do_matchers() {
|
|||
for (( eltcnt = 1; eltcnt <= $#mlist; eltcnt++ )); do
|
||||
[[ $mlist[eltcnt] == "+"* ]] && a_or_r[$eltcnt]='+'
|
||||
[[ -z $mlist[$eltcnt] ]] && n_list[$eltcnt]=$eltcnt
|
||||
# Accept the old form of lower/upper correspondence, but we'll
|
||||
# output the new one instead.
|
||||
[[ $mlist[$eltcnt] = *"m:{a-z}={A-Z}"* ]] && c_list[$eltcnt]=$eltcnt
|
||||
[[ $mlist[$eltcnt] = *"m:{[:lower:]}={[:upper:]}"* ]] &&
|
||||
c_list[$eltcnt]=$eltcnt
|
||||
[[ $mlist[$eltcnt] = *"m:{a-zA-Z}={A-Za-z}"* ]] && C_list[$eltcnt]=$eltcnt
|
||||
[[ $mlist[$eltcnt] = *"m:{[:lower:][:upper:]}={[:upper:][:lower:]}"* ]] &&
|
||||
C_list[$eltcnt]=$eltcnt
|
||||
# For partial word stuff, we use backreferences to find out what
|
||||
# the set of separators was.
|
||||
if [[ $mlist[$eltcnt] = (#b)*"r:|["([^\]]#)"]=*"#" r:|=*"* ]]; then
|
||||
|
@ -1105,8 +1111,10 @@ terminators to be matched in this way? (y/n) [n] "
|
|||
# and reconstructing the elements of the matcher array.
|
||||
for (( eltcnt = 1; eltcnt <= 4; eltcnt++ )); do
|
||||
elt=
|
||||
[[ $c_list[$eltcnt] != ' ' ]] && elt="${elt:+$elt }m:{a-z}={A-Z}"
|
||||
[[ $C_list[$eltcnt] != ' ' ]] && elt="${elt:+$elt }m:{a-zA-Z}={A-Za-z}"
|
||||
[[ $c_list[$eltcnt] != ' ' ]] &&
|
||||
elt="${elt:+$elt }m:{[:lower:]}={[:upper:]}"
|
||||
[[ $C_list[$eltcnt] != ' ' ]] &&
|
||||
elt="${elt:+$elt }m:{[:lower:][:upper:]}={[:upper:][:lower:]}"
|
||||
[[ $p_list[$eltcnt] != ' ' ]] &&
|
||||
elt="${elt:+$elt }r:|[${pw_seps[$eltcnt]}]=*${pw_dstar[$eltcnt]}\
|
||||
r:|=*${pw_dstar[$eltcnt]}"
|
||||
|
|
|
@ -577,7 +577,7 @@ the next character typed inserts one of the characters given in the
|
|||
var(remove-chars). This string is parsed as a character class and
|
||||
understands the backslash sequences used by the tt(print) command. For
|
||||
example, `tt(-r "a-z\t")' removes the suffix if the next character typed
|
||||
inserts a lowercase character or a TAB, and `tt(-r "^0-9")' removes the
|
||||
inserts a lower case character or a TAB, and `tt(-r "^0-9")' removes the
|
||||
suffix if the next character typed inserts anything but a digit. One extra
|
||||
backslash sequence is understood in this string: `tt(\-)' stands for
|
||||
all characters that insert nothing. Thus `tt(-S "=" -q)' is the same
|
||||
|
@ -857,9 +857,9 @@ which character sequences in the trial completion. Any sequence of
|
|||
characters not handled in this fashion must match exactly, as usual.
|
||||
|
||||
The forms of var(match-spec) understood are as follows. In each case, the
|
||||
form with an uppercase initial character retains the string already
|
||||
form with an upper case initial character retains the string already
|
||||
typed on the command line as the final result of completion, while with
|
||||
a lowercase initial character the string on the command line is changed
|
||||
a lower case initial character the string on the command line is changed
|
||||
into the corresponding part of the trial completion.
|
||||
|
||||
startitem()
|
||||
|
@ -918,15 +918,35 @@ are not allowed, so the characters tt(!) and tt(^) have no special
|
|||
meaning directly after the opening brace. They indicate that a range of
|
||||
characters on the line match a range of characters in the trial
|
||||
completion, but (unlike ordinary character classes) paired according to
|
||||
the corresponding position in the sequence. For example, to make any
|
||||
lowercase letter on the line match the corresponding uppercase letter in
|
||||
the trial completion, you can use `tt(m:{a-z}={A-Z})'. More than one
|
||||
pair of classes can occur, in which case the first class before the
|
||||
tt(=) corresponds to the first after it, and so on. If one side has
|
||||
the corresponding position in the sequence. For example, to make any
|
||||
ASCII lower case letter on the line match the corresponding upper case
|
||||
letter in the trial completion, you can use `tt(m:{a-z}={A-Z})'
|
||||
(however, see below for the recommended form for this). More
|
||||
than one pair of classes can occur, in which case the first class before
|
||||
the tt(=) corresponds to the first after it, and so on. If one side has
|
||||
more such classes than the other side, the superfluous classes behave
|
||||
like normal character classes. In anchor patterns correspondence classes
|
||||
also behave like normal character classes.
|
||||
|
||||
The standard `tt([:)var(name)tt(:])' forms described for standard shell
|
||||
patterns,
|
||||
ifnzman(noderef(Filename Generation))\
|
||||
ifzman(see the section FILENAME GENERATION in zmanref(zshexpn)),
|
||||
may appear in correspondence classes as well as normal character
|
||||
classes. The only special behaviour in correspondence classes is if
|
||||
the form on the left and the form on the right are each one of
|
||||
tt([:upper:]), tt([:lower:]). In these cases the
|
||||
character in the word and the character on the line must be the same up
|
||||
to a difference in case. Hence to make any lower case character on the
|
||||
line match the corresponding upper case character in the trial
|
||||
completion you can use `tt(m:{[:lower:]}={[:upper:]})'. Although the
|
||||
matching system does not yet handle multibyte characters, this is likely
|
||||
to be a future extension, at which point this syntax will handle
|
||||
arbitrary alphabets; hence this form, rather than the use of explicit
|
||||
ranges, is the recommended form. In other cases
|
||||
`tt([:)var(name)tt(:])' forms are allowed, but imply no special
|
||||
constraint on the characters beyond that implied by the test itself.
|
||||
|
||||
The pattern var(tpat) may also be one or two stars, `tt(*)' or
|
||||
`tt(**)'. This means that the pattern on the command line can match
|
||||
any number of characters in the trial completion. In this case the
|
||||
|
@ -939,16 +959,16 @@ anchor can be matched, too.
|
|||
Examples:
|
||||
|
||||
The keys of the tt(options) association defined by the tt(parameter)
|
||||
module are the option names in all-lowercase form, without
|
||||
module are the option names in all-lower-case form, without
|
||||
underscores, and without the optional tt(no) at the beginning even
|
||||
though the builtins tt(setopt) and tt(unsetopt) understand option names
|
||||
with uppercase letters, underscores, and the optional tt(no). The
|
||||
with upper case letters, underscores, and the optional tt(no). The
|
||||
following alters the matching rules so that the prefix tt(no) and any
|
||||
underscore are ignored when trying to match the trial completions
|
||||
generated and uppercase letters on the line match the corresponding
|
||||
lowercase letters in the words:
|
||||
generated and upper case letters on the line match the corresponding
|
||||
lower case letters in the words:
|
||||
|
||||
example(compadd -M 'L:|[nN][oO]= M:_= M:{A-Z}={a-z}' - \
|
||||
example(compadd -M 'L:|[nN][oO]= M:_= M:{[:upper:]}={[:lower:]}' - \
|
||||
${(k)options} )
|
||||
|
||||
The first part says that the pattern `tt([nN][oO])' at the beginning
|
||||
|
@ -957,8 +977,8 @@ line matches the empty string in the list of words generated by
|
|||
completion, so it will be ignored if present. The second part does the
|
||||
same for an underscore anywhere in the command line string, and the
|
||||
third part uses correspondence classes so that any
|
||||
uppercase letter on the line matches the corresponding lowercase
|
||||
letter in the word. The use of the uppercase forms of the
|
||||
upper case letter on the line matches the corresponding lower case
|
||||
letter in the word. The use of the upper case forms of the
|
||||
specification characters (tt(L) and tt(M)) guarantees that what has
|
||||
already been typed on the command line (in particular the prefix
|
||||
tt(no)) will not be deleted.
|
||||
|
@ -979,12 +999,12 @@ The second example makes completion case insensitive. This is just
|
|||
the same as in the option example, except here we wish to retain the
|
||||
characters in the list of completions:
|
||||
|
||||
example(compadd -M 'm:{a-z}={A-Z}' ... )
|
||||
example(compadd -M 'm:{[:lower:]}={[:upper:]}' ... )
|
||||
|
||||
This makes lowercase letters match their uppercase counterparts.
|
||||
To make uppercase letters match the lowercase forms as well:
|
||||
This makes lower case letters match their upper case counterparts.
|
||||
To make upper case letters match the lower case forms as well:
|
||||
|
||||
example(compadd -M 'm:{a-zA-Z}={A-Za-z}' ... )
|
||||
example(compadd -M 'm:{[:lower:][:upper:]}={[:upper:][:lower:]}' ... )
|
||||
|
||||
A nice example for the use of tt(*) patterns is partial word
|
||||
completion. Sometimes you would like to make strings like `tt(c.s.u)'
|
||||
|
@ -1042,27 +1062,27 @@ The specifications with both a left and a right anchor are useful to
|
|||
complete partial words whose parts are not separated by some
|
||||
special character. For example, in some places strings have to be
|
||||
completed that are formed `tt(LikeThis)' (i.e. the separate parts are
|
||||
determined by a leading uppercase letter) or maybe one has to
|
||||
determined by a leading upper case letter) or maybe one has to
|
||||
complete strings with trailing numbers. Here one could use the simple
|
||||
form with only one anchor as in:
|
||||
|
||||
example(compadd -M 'r:|[A-Z0-9]=* r:|=*' LikeTHIS FooHoo 5foo123 5bar234)
|
||||
example(compadd -M 'r:|[[:upper:]0-9]=* r:|=*' LikeTHIS FooHoo 5foo123 5bar234)
|
||||
|
||||
But with this, the string `tt(H)' would neither complete to `tt(FooHoo)'
|
||||
nor to `tt(LikeTHIS)' because in each case there is an uppercase
|
||||
nor to `tt(LikeTHIS)' because in each case there is an upper case
|
||||
letter before the `tt(H)' and that is matched by the anchor. Likewise,
|
||||
a `tt(2)' would not be completed. In both cases this could be changed
|
||||
by using `tt(r:|[A-Z0-9]=**)', but then `tt(H)' completes to both
|
||||
by using `tt(r:|[[:upper:]0-9]=**)', but then `tt(H)' completes to both
|
||||
`tt(LikeTHIS)' and `tt(FooHoo)' and a `tt(2)' matches the other
|
||||
strings because characters can be inserted before every uppercase
|
||||
strings because characters can be inserted before every upper case
|
||||
letter and digit. To avoid this one would use:
|
||||
|
||||
example(compadd -M 'r:[^A-Z0-9]||[A-Z0-9]=** r:|=*' \
|
||||
example(compadd -M 'r:[^[:upper:]0-9]||[[:upper:]0-9]=** r:|=*' \
|
||||
LikeTHIS FooHoo foo123 bar234)
|
||||
|
||||
By using these two anchors, a `tt(H)' matches only uppercase `tt(H)'s that
|
||||
By using these two anchors, a `tt(H)' matches only upper case `tt(H)'s that
|
||||
are immediately preceded by something matching the left anchor
|
||||
`tt([^A-Z0-9])'. The effect is, of course, that `tt(H)' matches only
|
||||
`tt([^[:upper:]0-9])'. The effect is, of course, that `tt(H)' matches only
|
||||
the string `tt(FooHoo)', a `tt(2)' matches only `tt(bar234)' and so on.
|
||||
|
||||
When using the completion system (see
|
||||
|
|
|
@ -162,12 +162,49 @@ struct cmatcher {
|
|||
#define CMF_RIGHT 4
|
||||
#define CMF_INTER 8
|
||||
|
||||
/*
|
||||
* Types of cpattern structure.
|
||||
* Note freecpattern() assumes any type <= CPAT_EQUIV has a string.
|
||||
*/
|
||||
enum {
|
||||
CPAT_CCLASS, /* [...]: ordinary character class */
|
||||
CPAT_NCLASS, /* [!...]: ordinary character class, negated */
|
||||
CPAT_EQUIV, /* {...}: equivalence class */
|
||||
CPAT_ANY, /* ?: any character */
|
||||
CPAT_CHAR /* Single character given explicitly */
|
||||
};
|
||||
|
||||
/*
|
||||
* A pattern element in a matcher specification.
|
||||
* Unlike normal patterns this only represents one character in
|
||||
* either the test completion or the word on the command line.
|
||||
*/
|
||||
struct cpattern {
|
||||
Cpattern next; /* next sub-pattern */
|
||||
unsigned char tab[256]; /* table of matched characters */
|
||||
int equiv; /* if this is a {...} class */
|
||||
int tp; /* type of object as above */
|
||||
union {
|
||||
char *str; /* if a character class, the objects
|
||||
* in it in a similar form to normal
|
||||
* pattern matching (a metafied string
|
||||
* with tokens).
|
||||
* Note the allocated length may be longer
|
||||
* than the null-terminated string.
|
||||
*/
|
||||
int chr; /* if a single character, it
|
||||
* TODO: eventually should be a
|
||||
* convchar_t.
|
||||
*/
|
||||
} u;
|
||||
};
|
||||
|
||||
/*
|
||||
* For now this just handles single-byte characters.
|
||||
* TODO: this will change.
|
||||
*/
|
||||
#define PATMATCHRANGE(r, c, ip, mtp) patmatchrange(r, c, ip, mtp)
|
||||
#define PATMATCHINDEX(r, i, cp, mtp) patmatchindex(r, i, cp, mtp)
|
||||
#define CONVCAST(c) (c)
|
||||
|
||||
/* This is a special return value for parse_cmatcher(), *
|
||||
* signalling an error. */
|
||||
|
||||
|
|
|
@ -122,13 +122,15 @@ freecpattern(Cpattern p)
|
|||
|
||||
while (p) {
|
||||
n = p->next;
|
||||
if (p->tp <= CPAT_EQUIV)
|
||||
free(p->u.str);
|
||||
zfree(p, sizeof(struct cpattern));
|
||||
|
||||
p = n;
|
||||
}
|
||||
}
|
||||
|
||||
/* Copy a completion matcher list. */
|
||||
/* Copy a completion matcher list into permanent storage. */
|
||||
|
||||
/**/
|
||||
mod_export Cmatcher
|
||||
|
@ -157,22 +159,51 @@ cpcmatcher(Cmatcher m)
|
|||
return r;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy a single entry in a matcher pattern.
|
||||
* The new element and any class string are allocated with zalloc()/ztrdup().
|
||||
*/
|
||||
|
||||
/**/
|
||||
mod_export Cpattern
|
||||
cp_cpattern_element(Cpattern o)
|
||||
{
|
||||
Cpattern n = zalloc(sizeof(struct cpattern));
|
||||
|
||||
n->next = NULL;
|
||||
|
||||
n->tp = o->tp;
|
||||
switch (o->tp)
|
||||
{
|
||||
case CPAT_CCLASS:
|
||||
case CPAT_NCLASS:
|
||||
case CPAT_EQUIV:
|
||||
n->u.str = ztrdup(o->u.str);
|
||||
break;
|
||||
|
||||
case CPAT_CHAR:
|
||||
n->u.chr = o->u.chr;
|
||||
break;
|
||||
|
||||
default:
|
||||
/* just to keep compiler quiet */
|
||||
break;
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
/* Copy a completion matcher pattern. */
|
||||
|
||||
/**/
|
||||
static Cpattern
|
||||
cpcpattern(Cpattern o)
|
||||
{
|
||||
Cpattern r = NULL, *p = &r, n;
|
||||
Cpattern r = NULL, *p = &r;
|
||||
|
||||
while (o) {
|
||||
*p = n = (Cpattern) zalloc(sizeof(struct cpattern));
|
||||
|
||||
n->next = NULL;
|
||||
memcpy(n->tab, o->tab, 256);
|
||||
n->equiv = o->equiv;
|
||||
|
||||
p = &(n->next);
|
||||
*p = cp_cpattern_element(o);
|
||||
p = &((*p)->next);
|
||||
o = o->next;
|
||||
}
|
||||
return r;
|
||||
|
@ -331,14 +362,26 @@ parse_cmatcher(char *name, char *s)
|
|||
return ret;
|
||||
}
|
||||
|
||||
/* Parse a pattern for matcher control. */
|
||||
/*
|
||||
* Parse a pattern for matcher control.
|
||||
* name is the name of the builtin from which this is called, for errors.
|
||||
* *sp is the input string and will be updated to the end of the parsed
|
||||
* pattern.
|
||||
* *lp will be set to the number of characters (possibly multibyte)
|
||||
* that the pattern will match. This must be deterministic, given
|
||||
* the syntax allowed here.
|
||||
* e, if non-zero, is the ASCII end character to match; if zero,
|
||||
* stop on a blank.
|
||||
* *err is set to 1 to indicate an error, else to 0.
|
||||
*/
|
||||
|
||||
/**/
|
||||
static Cpattern
|
||||
parse_pattern(char *name, char **sp, int *lp, char e, int *err)
|
||||
{
|
||||
Cpattern ret = NULL, r = NULL, n;
|
||||
unsigned char *s = (unsigned char *) *sp;
|
||||
char *s = *sp;
|
||||
int inchar;
|
||||
int l = 0;
|
||||
|
||||
*err = 0;
|
||||
|
@ -346,25 +389,18 @@ parse_pattern(char *name, char **sp, int *lp, char e, int *err)
|
|||
while (*s && (e ? (*s != e) : !inblank(*s))) {
|
||||
n = (Cpattern) hcalloc(sizeof(*n));
|
||||
n->next = NULL;
|
||||
n->equiv = 0;
|
||||
|
||||
if (*s == '[') {
|
||||
s = parse_class(n, s + 1, ']');
|
||||
if (!*s) {
|
||||
*err = 1;
|
||||
zwarnnam(name, "unterminated character class");
|
||||
return NULL;
|
||||
}
|
||||
} else if (*s == '{') {
|
||||
n->equiv = 1;
|
||||
s = parse_class(n, s + 1, '}');
|
||||
if (*s == '[' || *s == '{') {
|
||||
s = parse_class(n, s);
|
||||
if (!*s) {
|
||||
*err = 1;
|
||||
zwarnnam(name, "unterminated character class");
|
||||
return NULL;
|
||||
}
|
||||
s++;
|
||||
} else if (*s == '?') {
|
||||
memset(n->tab, 1, 256);
|
||||
n->tp = CPAT_ANY;
|
||||
s++;
|
||||
} else if (*s == '*' || *s == '(' || *s == ')' || *s == '=') {
|
||||
*err = 1;
|
||||
zwarnnam(name, "invalid pattern character `%c'", *s);
|
||||
|
@ -373,8 +409,13 @@ parse_pattern(char *name, char **sp, int *lp, char e, int *err)
|
|||
if (*s == '\\' && s[1])
|
||||
s++;
|
||||
|
||||
memset(n->tab, 0, 256);
|
||||
n->tab[*s] = 1;
|
||||
if (*s == Meta)
|
||||
inchar = STOUC(*++s) ^ 32;
|
||||
else
|
||||
inchar = STOUC(*s);
|
||||
s++;
|
||||
n->tp = CPAT_CHAR;
|
||||
n->u.chr = inchar;
|
||||
}
|
||||
if (ret)
|
||||
r->next = n;
|
||||
|
@ -384,7 +425,6 @@ parse_pattern(char *name, char **sp, int *lp, char e, int *err)
|
|||
r = n;
|
||||
|
||||
l++;
|
||||
s++;
|
||||
}
|
||||
*sp = (char *) s;
|
||||
*lp = l;
|
||||
|
@ -394,28 +434,86 @@ parse_pattern(char *name, char **sp, int *lp, char e, int *err)
|
|||
/* Parse a character class for matcher control. */
|
||||
|
||||
/**/
|
||||
static unsigned char *
|
||||
parse_class(Cpattern p, unsigned char *s, unsigned char e)
|
||||
static char *
|
||||
parse_class(Cpattern p, char *iptr)
|
||||
{
|
||||
int n = 0, i = 1, j, eq = (e == '}'), k = 1;
|
||||
int endchar, firsttime = 1;
|
||||
char *optr, *nptr;
|
||||
|
||||
if (!eq && (*s == '!' || *s == '^') && s[1] != e) { n = 1; s++; }
|
||||
|
||||
memset(p->tab, n, 256);
|
||||
|
||||
n = !n;
|
||||
while (*s && (k || *s != e)) {
|
||||
if (s[1] == '-' && s[2] && s[2] != e) {
|
||||
/* a run of characters */
|
||||
for (j = (int) *s; j <= (int) s[2]; j++)
|
||||
p->tab[j] = (eq ? i++ : n);
|
||||
|
||||
s += 3;
|
||||
if (*iptr++ == '[') {
|
||||
endchar = ']';
|
||||
/* TODO: surely [^]] is valid? */
|
||||
if ((*iptr == '!' || *iptr == '^') && iptr[1] != ']') {
|
||||
p->tp = CPAT_NCLASS;
|
||||
iptr++;
|
||||
} else
|
||||
p->tab[*s++] = (eq ? i++ : n);
|
||||
k = 0;
|
||||
p->tp = CPAT_CCLASS;
|
||||
} else {
|
||||
endchar = '}';
|
||||
p->tp = CPAT_EQUIV;
|
||||
}
|
||||
return s;
|
||||
|
||||
/* find end of class. End character can appear literally first. */
|
||||
for (optr = iptr; optr == iptr || *optr != endchar; optr++)
|
||||
if (!*optr)
|
||||
return optr;
|
||||
/*
|
||||
* We can always fit the parsed class within the same length
|
||||
* because of the tokenization (including a null byte).
|
||||
*
|
||||
* As the input string is metafied, but shouldn't contain shell
|
||||
* tokens, we can just add our own tokens willy nilly.
|
||||
*/
|
||||
optr = p->u.str = zalloc((optr-iptr) + 1);
|
||||
|
||||
while (firsttime || *iptr != endchar) {
|
||||
int ch;
|
||||
|
||||
if (*iptr == '[' && iptr[1] == ':' &&
|
||||
(nptr = strchr((char *)iptr + 2, ':')) && nptr[1] == ']') {
|
||||
/* Range type */
|
||||
iptr += 2;
|
||||
ch = range_type((char *)iptr, nptr-iptr);
|
||||
iptr = nptr + 2;
|
||||
if (ch != PP_UNKWN)
|
||||
*optr++ = STOUC(Meta) + ch;
|
||||
} else {
|
||||
/* characters stay metafied */
|
||||
char *ptr1 = iptr;
|
||||
if (*iptr == Meta)
|
||||
iptr++;
|
||||
iptr++;
|
||||
if (*iptr == '-' && iptr[1] && iptr[1] != endchar) {
|
||||
/* a run of characters */
|
||||
iptr++;
|
||||
/* range token */
|
||||
*optr++ = Meta + PP_RANGE;
|
||||
|
||||
/* start of range character */
|
||||
if (*ptr1 == Meta) {
|
||||
*optr++ = Meta;
|
||||
*optr++ = ptr1[1] ^ 32;
|
||||
} else
|
||||
*optr++ = *ptr1;
|
||||
|
||||
if (*iptr == Meta) {
|
||||
*optr++ = *iptr++;
|
||||
*optr++ = *iptr++;
|
||||
} else
|
||||
*optr++ = *iptr++;
|
||||
} else {
|
||||
if (*ptr1 == Meta) {
|
||||
*optr++ = Meta;
|
||||
*optr++ = ptr1[1] ^ 32;
|
||||
} else
|
||||
*optr++ = *ptr1;
|
||||
}
|
||||
}
|
||||
firsttime = 0;
|
||||
}
|
||||
|
||||
*optr = '\0';
|
||||
return iptr;
|
||||
}
|
||||
|
||||
/**/
|
||||
|
|
|
@ -30,37 +30,68 @@
|
|||
#include "complete.mdh"
|
||||
#include "compmatch.pro"
|
||||
|
||||
/* This compares two cpattern lists and returns non-zero if they are
|
||||
* equal. */
|
||||
/*
|
||||
* This compares two cpattern lists and returns non-zero if they are
|
||||
* equal (N.B. opposite sense to usual *cmp()).
|
||||
*
|
||||
* The old version of this didn't worry about whether the lists
|
||||
* were the same length. This one does. It's hard to see how
|
||||
* that can be wrong even if it's unnecessary.
|
||||
*/
|
||||
|
||||
/**/
|
||||
static int
|
||||
cmp_cpatterns(Cpattern a, Cpattern b)
|
||||
cpatterns_same(Cpattern a, Cpattern b)
|
||||
{
|
||||
while (a) {
|
||||
if (a->equiv != b->equiv || memcmp(a->tab, b->tab, 256))
|
||||
if (!b)
|
||||
return 0;
|
||||
if (a->tp != b->tp)
|
||||
return 0;
|
||||
switch (a->tp) {
|
||||
case CPAT_CCLASS:
|
||||
case CPAT_NCLASS:
|
||||
case CPAT_EQUIV:
|
||||
/*
|
||||
* Patterns can actually match the same even if
|
||||
* the range strings compare differently, but
|
||||
* I don't think we need to handle that subtlety.
|
||||
*/
|
||||
if (strcmp(a->u.str, b->u.str) != 0)
|
||||
return 0;
|
||||
break;
|
||||
|
||||
case CPAT_CHAR:
|
||||
if (a->u.chr != b->u.chr)
|
||||
return 0;
|
||||
break;
|
||||
|
||||
default:
|
||||
/* here to silence compiler */
|
||||
break;
|
||||
}
|
||||
|
||||
a = a->next;
|
||||
b = b->next;
|
||||
}
|
||||
return 1;
|
||||
return !b;
|
||||
}
|
||||
|
||||
/* This compares two cmatchers and returns non-zero if they are equal. */
|
||||
|
||||
/**/
|
||||
static int
|
||||
cmp_cmatchers(Cmatcher a, Cmatcher b)
|
||||
cmatchers_same(Cmatcher a, Cmatcher b)
|
||||
{
|
||||
return (a == b ||
|
||||
(a->flags == b->flags &&
|
||||
a->llen == b->llen && a->wlen == b->wlen &&
|
||||
(!a->llen || cmp_cpatterns(a->line, b->line)) &&
|
||||
(a->wlen <= 0 || cmp_cpatterns(a->word, b->word)) &&
|
||||
(!a->llen || cpatterns_same(a->line, b->line)) &&
|
||||
(a->wlen <= 0 || cpatterns_same(a->word, b->word)) &&
|
||||
(!(a->flags & (CMF_LEFT | CMF_RIGHT)) ||
|
||||
(a->lalen == b->lalen && a->ralen == b->ralen &&
|
||||
(!a->lalen || cmp_cpatterns(a->left, b->left)) &&
|
||||
(!a->ralen || cmp_cpatterns(a->right, b->right))))));
|
||||
(!a->lalen || cpatterns_same(a->left, b->left)) &&
|
||||
(!a->ralen || cpatterns_same(a->right, b->right))))));
|
||||
}
|
||||
|
||||
/* Add the given matchers to the bmatcher list. */
|
||||
|
@ -97,7 +128,7 @@ update_bmatchers(void)
|
|||
t = 0;
|
||||
for (ms = mstack; ms && !t; ms = ms->next)
|
||||
for (mp = ms->matcher; mp && !t; mp = mp->next)
|
||||
t = cmp_cmatchers(mp, p->matcher);
|
||||
t = cmatchers_same(mp, p->matcher);
|
||||
|
||||
p = p->next;
|
||||
if (!t) {
|
||||
|
@ -449,7 +480,7 @@ add_match_sub(Cmatcher m, char *l, int ll, char *w, int wl)
|
|||
}
|
||||
}
|
||||
|
||||
/* This tests if the string from the line l matches the word w. In bp
|
||||
/* This tests if the string from the line l matches the word w. In *bpp
|
||||
* the offset for the brace is returned, in rwlp the length of the
|
||||
* matched prefix or suffix, not including the stuff before or after
|
||||
* the last anchor is given. When sfx is non-zero matching is done from
|
||||
|
@ -1113,55 +1144,330 @@ comp_match(char *pfx, char *sfx, char *w, Patprog cp, Cline *clp, int qu,
|
|||
return r;
|
||||
}
|
||||
|
||||
/* Check if the given pattern matches the given string. *
|
||||
* p and s are either anchor or line pattern and string;
|
||||
* wp and ws are word (candidate) pattern and string
|
||||
*
|
||||
* If only one pattern is given, we just check if characters match
|
||||
* If both line and word are given, we check that characters match
|
||||
* for {...} classes by comparing relative numbers in sequence.
|
||||
*
|
||||
* Patterns and strings are always passed in pairs, so it is enough
|
||||
* to check for non-NULL wp. p should always be present.
|
||||
|
||||
/*
|
||||
* Guts of a single pattern for pattern_match().
|
||||
* Return non-zero if match successful.
|
||||
* If the class was an equivalence, return 1 + the index into
|
||||
* the equivalence class (see pattern.c for how this is calculated).
|
||||
*/
|
||||
|
||||
/**/
|
||||
mod_export int
|
||||
pattern_match(Cpattern p, char *s, Cpattern wp, char *ws)
|
||||
pattern_match1(Cpattern p, int c, int *mtp)
|
||||
{
|
||||
unsigned char c;
|
||||
unsigned char wc;
|
||||
/* TODO: should become convchar_t */
|
||||
int ind;
|
||||
|
||||
while (p && wp && *s && *ws) {
|
||||
c = p->tab[*((unsigned char *) s)];
|
||||
wc = wp->tab[*((unsigned char *) ws)];
|
||||
*mtp = 0;
|
||||
switch (p->tp) {
|
||||
case CPAT_CCLASS:
|
||||
return PATMATCHRANGE(p->u.str, CONVCAST(c), NULL, NULL);
|
||||
|
||||
if (!c || !wc || c != wc)
|
||||
case CPAT_NCLASS:
|
||||
return !PATMATCHRANGE(p->u.str, CONVCAST(c), NULL, NULL);
|
||||
|
||||
case CPAT_EQUIV:
|
||||
if (PATMATCHRANGE(p->u.str, CONVCAST(c), &ind, mtp))
|
||||
return ind + 1;
|
||||
else
|
||||
return 0;
|
||||
|
||||
s++;
|
||||
ws++;
|
||||
case CPAT_ANY:
|
||||
return 1;
|
||||
|
||||
case CPAT_CHAR:
|
||||
return (p->u.chr == c);
|
||||
|
||||
default:
|
||||
DPUTS(1, "bad matcher pattern type");
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Use an equivalence to deduce the line character from the word, or
|
||||
* vice versa. (If vice versa, then "line" and "word" are reversed
|
||||
* in what follows. The logic is symmetric.)
|
||||
* lp is the line pattern.
|
||||
* wind is the index returned by a pattern match on the word pattern,
|
||||
* with type wmtp.
|
||||
* wchr is the word character.
|
||||
* Return -1 if no matching character, else the character.
|
||||
*
|
||||
* Only makes sense if lp->tp == CPAT_EQUIV and the (unseen) word
|
||||
* pattern also has that type.
|
||||
*/
|
||||
static int
|
||||
pattern_match_equivalence(Cpattern lp, int wind, int wmtp, int wchr)
|
||||
{
|
||||
int lchr, lmtp;
|
||||
|
||||
if (!PATMATCHINDEX(lp->u.str, wind-1, &lchr, &lmtp)) {
|
||||
/*
|
||||
* No equivalent. No possible match; give up.
|
||||
*/
|
||||
return -1;
|
||||
}
|
||||
/*
|
||||
* If we matched an exact character rather than a range
|
||||
* type, return it.
|
||||
*/
|
||||
if (lchr != -1)
|
||||
return lchr;
|
||||
|
||||
/*
|
||||
* Check the match types. We may want a case-changed
|
||||
* version of the word character.
|
||||
*/
|
||||
if (wmtp == PP_UPPER && lmtp == PP_LOWER)
|
||||
return tulower(wchr);
|
||||
else if (wmtp == PP_LOWER && lmtp == PP_UPPER)
|
||||
return tuupper(wchr);
|
||||
else if (wmtp == lmtp) {
|
||||
/*
|
||||
* Be lenient and allow identical replacements
|
||||
* for character classes, although in fact this
|
||||
* doesn't give special functionality for equivalence
|
||||
* classes.
|
||||
*/
|
||||
return wchr;
|
||||
} else {
|
||||
/*
|
||||
* Non-matching generic types; this can't work.
|
||||
*/
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if the given pattern matches the given string.
|
||||
* p and s are either anchor or line pattern and string;
|
||||
* wp and ws are word (candidate) pattern and string
|
||||
*
|
||||
* If only one pattern is given, we just check if characters match.
|
||||
* If both line and word are given, we check that characters match
|
||||
* for {...} classes by comparing positions in the strings.
|
||||
*
|
||||
* Patterns and strings are always passed in pairs, so it is enough
|
||||
* to check for non-NULL wp. p should always be present.
|
||||
*
|
||||
* If prestrict is not NULL, it is a chain of patterns at least as long
|
||||
* as the line string. In this case we are still assembling the line at
|
||||
* s (which has been allocated but doesn't yet contain anything useful)
|
||||
* and must continue to do so as we go along; prestrict gives
|
||||
* restrictions on the line character to be applied alongside the other
|
||||
* patterns. In the simple case a restriction is a character to be put
|
||||
* in place; otherwise it is a set of possible characters and we have to
|
||||
* deduce an actual matching character. Note prestrict is never an
|
||||
* equivalence class. In extreme cases we can't deduce a unique
|
||||
* character; then the match fails.
|
||||
*/
|
||||
|
||||
/**/
|
||||
mod_export int
|
||||
pattern_match_restrict(Cpattern p, char *s, Cpattern wp, char *ws,
|
||||
Cpattern prestrict)
|
||||
{
|
||||
int c, ind;
|
||||
int wc, wind;
|
||||
int len, wlen, mt, wmt;
|
||||
|
||||
while (p && wp && *s && *ws) {
|
||||
/* First test the word character */
|
||||
if (*ws == Meta) {
|
||||
wc = STOUC(ws[1]) ^ 32;
|
||||
wlen = 2;
|
||||
} else {
|
||||
wc = STOUC(*ws);
|
||||
wlen = 1;
|
||||
}
|
||||
wind = pattern_match1(wp, wc, &wmt);
|
||||
if (!wind)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Now the line character; deal with the case where
|
||||
* we don't yet have it, only a restriction on it.
|
||||
*/
|
||||
if (prestrict) {
|
||||
if (prestrict->tp == CPAT_CHAR) {
|
||||
/*
|
||||
* Easy case: restricted to an exact character on
|
||||
* the line. Procede as normal.
|
||||
*/
|
||||
c = prestrict->u.chr;
|
||||
} else {
|
||||
if (p->tp == CPAT_CHAR) {
|
||||
/*
|
||||
* Normal line pattern is an exact character: as
|
||||
* long as this matches prestrict, we can proceed
|
||||
* as usual.
|
||||
*/
|
||||
c = p->u.chr;
|
||||
} else if (p->tp == CPAT_EQUIV) {
|
||||
/*
|
||||
* An equivalence, so we can deduce the character
|
||||
* backwards from the word pattern and see if it
|
||||
* matches prestrict.
|
||||
*/
|
||||
if ((c = pattern_match_equivalence(p, wind, wmt, wc)) == -1)
|
||||
return 0;
|
||||
} else {
|
||||
/*
|
||||
* Not an equivalence, so that means we must match
|
||||
* the word (not just the word pattern), so grab it
|
||||
* and make sure it fulfills our needs. I think.
|
||||
* Not 100% sure about that, but what else can
|
||||
* we do? We haven't actually been passed a string
|
||||
* from the command line.
|
||||
*/
|
||||
c = wc;
|
||||
}
|
||||
/* Character so deduced must match the restriction. */
|
||||
if (!pattern_match1(prestrict, c, &mt))
|
||||
return 0;
|
||||
}
|
||||
len = imeta(c) ? 2 : 1;
|
||||
} else {
|
||||
/* We have the character itself. */
|
||||
if (*s == Meta) {
|
||||
c = STOUC(s[1]) ^ 32;
|
||||
len = 2;
|
||||
} else {
|
||||
c = STOUC(*s);
|
||||
len = 1;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* If either is "?", they match each other; no further tests.
|
||||
* Apply this even if the character wasn't convertable;
|
||||
* there's no point trying to be clever in that case.
|
||||
*/
|
||||
if (p->tp != CPAT_ANY || wp->tp != CPAT_ANY)
|
||||
{
|
||||
ind = pattern_match1(p, c, &mt);
|
||||
if (!ind)
|
||||
return 0;
|
||||
if (ind != wind)
|
||||
return 0;
|
||||
if (mt != wmt) {
|
||||
/*
|
||||
* Special case if matching lower vs. upper or
|
||||
* vice versa. The transformed characters must match.
|
||||
* We don't need to check the transformation is
|
||||
* the appropriate one for each character separately,
|
||||
* since that was done in pattern_match1(), so just
|
||||
* compare lower-cased versions of both.
|
||||
*/
|
||||
if ((mt == PP_LOWER || mt == PP_UPPER) &&
|
||||
(wmt == PP_LOWER || wmt == PP_UPPER)) {
|
||||
if (tulower(c) != tulower(wc))
|
||||
return 0;
|
||||
} else {
|
||||
/* Other different classes can't match. */
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (prestrict) {
|
||||
/* We need to assemble the line */
|
||||
if (imeta(c)) {
|
||||
*s++ = Meta;
|
||||
*s++ = c ^ 32;
|
||||
} else {
|
||||
*s++ = c;
|
||||
}
|
||||
prestrict = prestrict->next;
|
||||
} else
|
||||
s += len;
|
||||
ws += wlen;
|
||||
p = p->next;
|
||||
wp = wp->next;
|
||||
}
|
||||
|
||||
while (p && *s) {
|
||||
if (!p->tab[*((unsigned char *) s)])
|
||||
if (prestrict) {
|
||||
/*
|
||||
* As above, but with even less info to go on.
|
||||
* (Can this happen?) At least handle the cases where
|
||||
* one of our patterns has given us a specific character.
|
||||
*/
|
||||
if (prestrict->tp == CPAT_CHAR) {
|
||||
c = prestrict->u.chr;
|
||||
} else {
|
||||
if (p->tp == CPAT_CHAR) {
|
||||
c = p->u.chr;
|
||||
} else {
|
||||
/*
|
||||
* OK. Here we are in a function with just a line
|
||||
* pattern and another pattern to restrict the
|
||||
* characters that can go on the line, and no actual
|
||||
* characters. We're matching two patterns against
|
||||
* one another to generate a character to insert.
|
||||
* This is a bit too psychedelic, so I'm going to
|
||||
* bale out now. See you on the ground.
|
||||
*/
|
||||
return 0;
|
||||
}
|
||||
if (!pattern_match1(prestrict, c, &mt))
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
if (*s == Meta) {
|
||||
c = STOUC(s[1]) ^ 32;
|
||||
len = 2;
|
||||
} else {
|
||||
c = STOUC(*s);
|
||||
len = 1;
|
||||
}
|
||||
}
|
||||
if (!pattern_match1(p, c, &mt))
|
||||
return 0;
|
||||
p = p->next;
|
||||
s++;
|
||||
if (prestrict) {
|
||||
if (imeta(c)) {
|
||||
*s++ = Meta;
|
||||
*s++ = c ^ 32;
|
||||
} else {
|
||||
*s++ = c;
|
||||
}
|
||||
prestrict = prestrict->next;
|
||||
} else
|
||||
s += len;
|
||||
}
|
||||
|
||||
while (wp && *ws) {
|
||||
if (!wp->tab[*((unsigned char *) ws)])
|
||||
/* No funny business when we only have the word pattern. */
|
||||
if (*ws == Meta) {
|
||||
wc = STOUC(ws[1]) ^ 32;
|
||||
wlen = 2;
|
||||
} else {
|
||||
wc = STOUC(*ws);
|
||||
wlen = 1;
|
||||
}
|
||||
if (!pattern_match1(wp, wc, &wmt))
|
||||
return 0;
|
||||
wp = wp->next;
|
||||
ws++;
|
||||
ws += wlen;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* The usual version of pattern matching, without the line string
|
||||
* being handled by restriction.
|
||||
*/
|
||||
/**/
|
||||
mod_export int
|
||||
pattern_match(Cpattern p, char *s, Cpattern wp, char *ws)
|
||||
{
|
||||
return pattern_match_restrict(p, s, wp, ws, NULL);
|
||||
}
|
||||
|
||||
/* This splits the given string into a list of cline structs, separated
|
||||
* at those places where one of the anchors of an `*' pattern was found.
|
||||
* plen gives the number of characters on the line that matched this
|
||||
|
@ -1256,11 +1562,11 @@ bld_parts(char *str, int len, int plen, Cline *lp, Cline *lprem)
|
|||
return ret;
|
||||
}
|
||||
|
||||
/* This builds all the possible line patterns for the pattern pat in the
|
||||
* buffer line. Initially line is the same as lp, but during recursive
|
||||
* calls lp is incremented for storing successive characters. Whenever
|
||||
* a full possible string is build, we test if this line matches the
|
||||
* string given by wlen and word.
|
||||
|
||||
/*
|
||||
* This builds all the possible line patterns for the pattern pat in the
|
||||
* buffer line. Then we test if this line matches the string given by
|
||||
* wlen and word.
|
||||
*
|
||||
* wpat contains pattern that matched previously
|
||||
* lpat contains the pattern for line we build
|
||||
|
@ -1269,91 +1575,297 @@ bld_parts(char *str, int len, int plen, Cline *lp, Cline *lprem)
|
|||
*
|
||||
* The return value is the length of the string matched in the word, it
|
||||
* is zero if we couldn't build a line that matches the word.
|
||||
*
|
||||
* TODO: a lot of the nastiness associated with variable string
|
||||
* lengths can go when we switch to wide characters. (Why didn't
|
||||
* I just keep line unmetafied and metafy into place at the end? Er...)
|
||||
*/
|
||||
|
||||
|
||||
/**/
|
||||
static int
|
||||
bld_line(Cpattern wpat, Cpattern lpat, char *line, char *lp,
|
||||
char *mword, char *word, int wlen, int sfx)
|
||||
bld_line(Cmatcher mp, char **linep, char *mword, char *word, int wlen, int sfx)
|
||||
{
|
||||
if (lpat) {
|
||||
/* Still working on the pattern. */
|
||||
Cpattern lpat = mp->line;
|
||||
Cpattern wpat = mp->word;
|
||||
Cpattern curgenpat;
|
||||
VARARR(struct cpattern, genpatarr, mp->llen);
|
||||
Cmlist ms;
|
||||
int llen, rl;
|
||||
char *oword = word, *line = *linep;
|
||||
|
||||
int i, l;
|
||||
unsigned char c = 0;
|
||||
|
||||
/* Get the number of the character for a correspondence class
|
||||
* if it has a corresponding class. */
|
||||
if (lpat->equiv)
|
||||
if (wpat && *mword) {
|
||||
c = wpat->tab[STOUC(*mword)];
|
||||
wpat = wpat->next;
|
||||
mword++;
|
||||
/*
|
||||
* Loop over all characters. At this stage, line is an empty
|
||||
* space of length llen (not counting the null byte) which we assemble as
|
||||
* we go along.
|
||||
*
|
||||
* However, first we need to know what characters can appear at each
|
||||
* point in the line. For this we assemble an list genpatarr of the
|
||||
* same length as the line. (It's convenient to store this as an
|
||||
* array but it's linked as a list, too.) If there are equivalences
|
||||
* we use mword to derive the equivalent character; when we've
|
||||
* reached the end of mword, equivalences are treated just like
|
||||
* ordinary character classes. For character classes we just attach
|
||||
* the class to the genpatarr list and apply it as a restriction
|
||||
* when we finally match the line against the set of matchers.
|
||||
*/
|
||||
curgenpat = genpatarr;
|
||||
while (lpat) {
|
||||
int wchr = (*mword == Meta) ? STOUC(mword[1]) ^ 32 : STOUC(*mword);
|
||||
int wmtp, wind;
|
||||
/*
|
||||
* If the line pattern is an equivalence, query wpat to find the
|
||||
* word part of the equivalence. If we don't find one we don't try
|
||||
* equivalencing but use lpat as an ordinary match. (It's not
|
||||
* entirely clear to me this is the correct behaviour on a
|
||||
* failed character match within the equivalence, but that was
|
||||
* the behaviour of the old logic that this replaces.)
|
||||
*/
|
||||
if (lpat->tp == CPAT_EQUIV && wpat && *mword) {
|
||||
wind = pattern_match1(wpat, wchr, &wmtp);
|
||||
wpat = wpat->next;
|
||||
mword += (*mword == Meta) ? 2 : 1;
|
||||
} else
|
||||
wind = 0;
|
||||
if (wind) {
|
||||
/*
|
||||
* Successful match for word side of equivalence.
|
||||
* Find the line equivalent.
|
||||
*/
|
||||
int lchr;
|
||||
if ((lchr = pattern_match_equivalence(lpat, wind, wmtp, wchr))
|
||||
== -1) {
|
||||
/*
|
||||
* No equivalent. No possible match; give up.
|
||||
*/
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* Walk through the table in the pattern and try the characters
|
||||
* that may appear in the current position. */
|
||||
for (i = 0; i < 256; i++)
|
||||
if ((lpat->equiv && c) ? (c == lpat->tab[i]) : lpat->tab[i]) {
|
||||
*lp = i;
|
||||
/* We stored the character, now call ourselves to build
|
||||
* the rest. */
|
||||
if ((l = bld_line(wpat, lpat->next, line, lp + 1,
|
||||
mword, word, wlen, sfx)))
|
||||
return l;
|
||||
/*
|
||||
* We now have an exact character to match,
|
||||
* so make up a pattern element for it.
|
||||
*/
|
||||
curgenpat->tp = CPAT_CHAR;
|
||||
curgenpat->u.chr = lchr;
|
||||
} else {
|
||||
/*
|
||||
* Not an equivalence class, so we just keep the
|
||||
* test in the lpat as it is.
|
||||
*/
|
||||
curgenpat->tp = lpat->tp;
|
||||
if (lpat->tp == CPAT_CHAR)
|
||||
curgenpat->u.chr = lpat->u.chr;
|
||||
else if (lpat->tp != CPAT_ANY) {
|
||||
/*
|
||||
* The string isn't modified and is only needed in calls from
|
||||
* this function, so we don't even need to copy it.
|
||||
*/
|
||||
curgenpat->u.str = lpat->u.str;
|
||||
}
|
||||
} else {
|
||||
/* We reached the end, i.e. the line string is fully build, now
|
||||
* see if it matches the given word. */
|
||||
}
|
||||
lpat = lpat->next;
|
||||
/*
|
||||
* This linked list is defined above as an array.
|
||||
* We could get away with just keeping it as an array
|
||||
* and passing it down as such, but that's a bit icky
|
||||
* since the generic linkage of Cpatterns is as a linked
|
||||
* list and we should keep our local memory management
|
||||
* problems to ourselvess.
|
||||
*/
|
||||
if (lpat)
|
||||
curgenpat->next = curgenpat+1;
|
||||
else
|
||||
curgenpat->next = NULL;
|
||||
curgenpat++;
|
||||
}
|
||||
|
||||
Cmlist ms;
|
||||
Cmatcher mp;
|
||||
int l = lp - line, t, rl = 0, ind, add;
|
||||
/*
|
||||
* We now know how to match the word with the line patterns; let's
|
||||
* see if it does. We will use the information in curgenpat if we
|
||||
* are successful to work out what character goes on the line. This
|
||||
* is a bit hairy, as in "the Yeti is a creature that is a bit
|
||||
* hairy".
|
||||
*/
|
||||
llen = mp->llen;
|
||||
rl = 0;
|
||||
|
||||
/* Quick test if the strings are exactly the same. */
|
||||
if (l == wlen && !strncmp(line, word, l))
|
||||
return l;
|
||||
if (sfx)
|
||||
{
|
||||
/*
|
||||
* We need to work backwards from the end of both the
|
||||
* word and the line strings.
|
||||
*
|
||||
* Position at the end of the word by counting characters.
|
||||
*/
|
||||
int l = wlen;
|
||||
while (l--)
|
||||
word += (*word == Meta) ? 2 : 1;
|
||||
|
||||
/*
|
||||
* We construct the line from the end. We've left
|
||||
* enough space for possible Meta's.
|
||||
*/
|
||||
line += 2 * llen;
|
||||
*line = '\0';
|
||||
curgenpat = genpatarr + llen;
|
||||
} else
|
||||
curgenpat = genpatarr;
|
||||
|
||||
/* we now reuse mp, lpat, wpat for the global matchers */
|
||||
while (llen && wlen) {
|
||||
int wchr, wmtp;
|
||||
char *wp;
|
||||
Cpattern tmpgenpat;
|
||||
|
||||
if (sfx) {
|
||||
line = lp; word += wlen;
|
||||
ind = -1; add = -1;
|
||||
} else {
|
||||
ind = 0; add = 1;
|
||||
}
|
||||
/* We loop through the whole line string built. */
|
||||
while (l && wlen) {
|
||||
if (word[ind] == line[ind]) {
|
||||
/* The same character in both strings, skip over. */
|
||||
line += add; word += add;
|
||||
l--; wlen--; rl++;
|
||||
if (word > oword + 1 && word[-2] == Meta)
|
||||
wp = word - 2;
|
||||
else
|
||||
wp = word - 1;
|
||||
curgenpat--;
|
||||
} else
|
||||
wp = word;
|
||||
if (*wp == Meta)
|
||||
wchr = STOUC(wp[1]) ^ 32;
|
||||
else
|
||||
wchr = STOUC(*wp);
|
||||
if (pattern_match1(curgenpat, wchr, &wmtp))
|
||||
{
|
||||
int lchr;
|
||||
/*
|
||||
* We can match the line character directly with the word
|
||||
* character. If the line character is a fixed one,
|
||||
* keep it, since we went to all that trouble above,
|
||||
* else if it's generic, keep the word character,
|
||||
* since we have no choice.
|
||||
*/
|
||||
if (curgenpat->tp == CPAT_CHAR)
|
||||
lchr = curgenpat->u.chr;
|
||||
else
|
||||
lchr = wchr;
|
||||
if (imeta(lchr)) {
|
||||
if (sfx)
|
||||
line -= 2;
|
||||
line[0] = Meta;
|
||||
line[1] = lchr ^ 32;
|
||||
if (!sfx)
|
||||
line += 2;
|
||||
} else {
|
||||
t = 0;
|
||||
for (ms = bmatchers; ms && !t; ms = ms->next) {
|
||||
mp = ms->matcher;
|
||||
if (mp && !mp->flags && mp->wlen <= wlen && mp->llen <= l &&
|
||||
pattern_match(mp->line, (sfx ? line - mp->llen : line),
|
||||
mp->word, (sfx ? word - mp->wlen : word))) {
|
||||
/* Both the line and the word pattern matched,
|
||||
* now skip over the matched portions. */
|
||||
if (sfx) {
|
||||
line -= mp->llen; word -= mp->wlen;
|
||||
} else {
|
||||
line += mp->llen; word += mp->wlen;
|
||||
}
|
||||
l -= mp->llen; wlen -= mp->wlen; rl += mp->wlen;
|
||||
t = 1;
|
||||
}
|
||||
}
|
||||
if (!t)
|
||||
/* Didn't match, give up. */
|
||||
return 0;
|
||||
if (sfx)
|
||||
line--;
|
||||
line[0] = lchr;
|
||||
if (!sfx)
|
||||
line++;
|
||||
}
|
||||
|
||||
llen--;
|
||||
wlen--;
|
||||
rl++;
|
||||
|
||||
if (sfx)
|
||||
word = wp;
|
||||
else {
|
||||
if (llen)
|
||||
curgenpat++;
|
||||
word += (*word == Meta) ? 2 : 1;
|
||||
}
|
||||
}
|
||||
if (!l)
|
||||
/* Unmatched portion in the line built, return matched length. */
|
||||
return rl;
|
||||
else
|
||||
{
|
||||
char *lp;
|
||||
/*
|
||||
* Need to loop over pattern matchers.
|
||||
*/
|
||||
for (ms = bmatchers; ms; ms = ms->next) {
|
||||
mp = ms->matcher;
|
||||
/*
|
||||
* This is the nightmare case: we have line and
|
||||
* and word matchers and some pattern which restricts
|
||||
* the value on the line without us knowing exactly
|
||||
* what it is. Despatch to the special function
|
||||
* for that.
|
||||
*/
|
||||
if (mp && !mp->flags && mp->wlen <= wlen &&
|
||||
mp->llen <= llen)
|
||||
{
|
||||
if (sfx) {
|
||||
/*
|
||||
* We haven't assembled the line yet, and with
|
||||
* Meta characters we don't yet know the length.
|
||||
* We'll fix this up later.
|
||||
*/
|
||||
lp = line - 2 * mp->llen;
|
||||
} else
|
||||
lp = line;
|
||||
wp = word;
|
||||
if (sfx) {
|
||||
int l = mp->wlen;
|
||||
while (l--) {
|
||||
if (wp > oword + 1 && wp[-2] == Meta)
|
||||
wp -= 2;
|
||||
else
|
||||
wp--;
|
||||
}
|
||||
|
||||
tmpgenpat = curgenpat - mp->llen;
|
||||
} else
|
||||
tmpgenpat = curgenpat;
|
||||
if (pattern_match_restrict(mp->line, lp,
|
||||
mp->word, wp, tmpgenpat)) {
|
||||
/*
|
||||
* Matched: advance over as many characters
|
||||
* of the patterns and strings as
|
||||
* we've done matches.
|
||||
*/
|
||||
if (sfx) {
|
||||
int imove = mp->llen, nchar;
|
||||
char *pmove = lp;
|
||||
word = wp;
|
||||
|
||||
/* Close the gap we left in the line string */
|
||||
while (imove--)
|
||||
pmove += (*pmove == Meta) ? 2 : 1;
|
||||
/* Number of bytes to move */
|
||||
nchar = (int)(pmove - lp);
|
||||
/* The size of the gap */
|
||||
imove = 2 * mp->llen - nchar;
|
||||
if (imove) {
|
||||
lp = line - imove;
|
||||
/* Moving up, so start at the top */
|
||||
while (nchar--)
|
||||
*--line = *--lp;
|
||||
/* line is at the start of the moved text */
|
||||
}
|
||||
|
||||
curgenpat = tmpgenpat;
|
||||
} else {
|
||||
int cnt = mp->llen;
|
||||
while (cnt--) {
|
||||
line += (*line == Meta) ? 2 : 1;
|
||||
}
|
||||
|
||||
cnt = mp->wlen;
|
||||
while (cnt--)
|
||||
word += (*word == Meta) ? 2 : 1;
|
||||
|
||||
curgenpat += mp->llen;
|
||||
}
|
||||
llen -= mp->llen;
|
||||
wlen -= mp->wlen;
|
||||
rl += mp->wlen;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!ms)
|
||||
return 0; /* Didn't match, give up */
|
||||
}
|
||||
}
|
||||
if (!llen) {
|
||||
/* Unmatched portion in the line built, return matched length. */
|
||||
if (sfx)
|
||||
*linep = line;
|
||||
else
|
||||
*line = '\0';
|
||||
return rl;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -1386,8 +1898,10 @@ join_strs(int la, char *sa, int lb, char *sb)
|
|||
if ((t = pattern_match(mp->word, sa, NULL, NULL)) ||
|
||||
pattern_match(mp->word, sb, NULL, NULL)) {
|
||||
/* It matched one of the strings, t says which one. */
|
||||
VARARR(char, line, mp->llen + 1);
|
||||
char **ap, **bp;
|
||||
/* TODO: double to allow Meta, not necessary
|
||||
when properly unmetafied */
|
||||
VARARR(char, linearr, 2*mp->llen + 1);
|
||||
char **ap, **bp, *line = linearr;
|
||||
int *alp, *blp;
|
||||
|
||||
if (t) {
|
||||
|
@ -1399,10 +1913,8 @@ join_strs(int la, char *sa, int lb, char *sb)
|
|||
}
|
||||
/* Now try to build a string that matches the other
|
||||
* string. */
|
||||
if ((bl = bld_line(mp->word, mp->line, line, line,
|
||||
*ap, *bp, *blp, 0))) {
|
||||
if ((bl = bld_line(mp, &line, *ap, *bp, *blp, 0))) {
|
||||
/* Found one, put it into the return string. */
|
||||
line[mp->llen] = '\0';
|
||||
if (rr <= mp->llen) {
|
||||
char *or = rs;
|
||||
|
||||
|
@ -1444,7 +1956,11 @@ join_strs(int la, char *sa, int lb, char *sb)
|
|||
return rs;
|
||||
}
|
||||
|
||||
/* This compares the anchors stored in two top-level clines. */
|
||||
/*
|
||||
* This compares the anchors stored in two top-level clines.
|
||||
* It returns 1 if the anchors are the same, 2 if they are
|
||||
* compatible (and have been combined in "o"), 0 otherwise.
|
||||
*/
|
||||
|
||||
/**/
|
||||
static int
|
||||
|
@ -1591,9 +2107,11 @@ join_sub(Cmdata md, char *str, int len, int *mlen, int sfx, int join)
|
|||
NULL, NULL)) ||
|
||||
pattern_match(mp->word, nw - (sfx ? mp->wlen : 0),
|
||||
NULL, NULL))) {
|
||||
VARARR(char, line, mp->llen + 1);
|
||||
/* TODO: doubled to allow Meta, not necessary
|
||||
* when properly unmetafied */
|
||||
VARARR(char, linearr, 2*mp->llen + 1);
|
||||
int bl;
|
||||
char *mw;
|
||||
char *mw, *line = linearr;
|
||||
|
||||
/* Then build all the possible lines and see
|
||||
* if one of them matches the other string. */
|
||||
|
@ -1602,11 +2120,10 @@ join_sub(Cmdata md, char *str, int len, int *mlen, int sfx, int join)
|
|||
else
|
||||
mw = nw - (sfx ? mp->wlen : 0);
|
||||
|
||||
if ((bl = bld_line(mp->word, mp->line, line, line,
|
||||
mw, (t ? nw : ow), (t ? nl : ol), sfx))) {
|
||||
if ((bl = bld_line(mp, &line, mw, (t ? nw : ow),
|
||||
(t ? nl : ol), sfx))) {
|
||||
/* Yep, one of the lines matched the other
|
||||
* string. */
|
||||
line[mp->llen] = '\0';
|
||||
|
||||
if (t) {
|
||||
ol = mp->wlen; nl = bl;
|
||||
|
|
|
@ -3997,6 +3997,239 @@ cfp_test_exact(LinkList names, char **accept, char *skipped)
|
|||
return (found ? ret : NULL);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* This code constructs (from heap) and returns a string that
|
||||
* corresponds to a series of matches; when compiled as a pattern, at
|
||||
* each position it matches either the character from the string "add"
|
||||
* or the corresponding single-character match from the set of matchers.
|
||||
* To take a simple case, if add is "a" and the single matcher for the
|
||||
* character position matches "[0-9]", the pattern returned is "[0-9a]".
|
||||
* We take account of equivalences between the word and line, too.
|
||||
*
|
||||
* As there are virtually no comments in this file, I don't really
|
||||
* know why we're doing this, but it's to do with a matcher which
|
||||
* is passed as an argument to the utility compfiles -p/-P.
|
||||
*/
|
||||
static char *
|
||||
cfp_matcher_range(Cmatcher *ms, char *add)
|
||||
{
|
||||
Cmatcher *mp, m;
|
||||
int len = 0, mt;
|
||||
char *ret = NULL, *p = NULL, *adds = add;
|
||||
|
||||
/*
|
||||
* Do this twice: once to work out the length of the
|
||||
* string in len, the second time to build it in ret.
|
||||
* This is probably worthwhile because otherwise memory
|
||||
* management is difficult.
|
||||
*/
|
||||
for (;;) {
|
||||
for (mp = ms; *add; add++, mp++) {
|
||||
if (!(m = *mp)) {
|
||||
/*
|
||||
* No matcher, so just match the character
|
||||
* itself.
|
||||
*
|
||||
* TODO: surely this needs quoting if it's a
|
||||
* metacharacter?
|
||||
*/
|
||||
if (ret) {
|
||||
if (imeta(*add)) {
|
||||
*p++ = Meta;
|
||||
*p++ = *add ^ 32;
|
||||
} else
|
||||
*p++ = *add;
|
||||
} else
|
||||
len += imeta(*add) ? 2 : 1;
|
||||
} else if (m->flags & CMF_RIGHT) {
|
||||
/*
|
||||
* Right-anchored: match anything followed
|
||||
* by the character itself.
|
||||
*/
|
||||
if (ret) {
|
||||
*p++ = '*';
|
||||
/* TODO: quote again? */
|
||||
if (imeta(*add)) {
|
||||
*p++ = Meta;
|
||||
*p++ = *add ^ 32;
|
||||
} else
|
||||
*p++ = *add;
|
||||
} else
|
||||
len += imeta(*add) ? 3 : 2;
|
||||
} else {
|
||||
/* The usual set of matcher possibilities. */
|
||||
int ind;
|
||||
if (m->line->tp == CPAT_EQUIV &&
|
||||
m->word->tp == CPAT_EQUIV) {
|
||||
/*
|
||||
* Genuine equivalence. Add the character to match
|
||||
* and the equivalent character from the word
|
||||
* pattern.
|
||||
*
|
||||
* TODO: we could be more careful here with special
|
||||
* cases as we are in the basic character class
|
||||
* code below.
|
||||
*/
|
||||
if (ret) {
|
||||
*p++ = '[';
|
||||
if (imeta(*add)) {
|
||||
*p++ = Meta;
|
||||
*p++ = *add ^ 32;
|
||||
} else
|
||||
*p++ = *add;
|
||||
} else
|
||||
len += imeta(*add) ? 3 : 2;
|
||||
if (PATMATCHRANGE(m->line->u.str, CONVCAST(*add),
|
||||
&ind, &mt)) {
|
||||
/*
|
||||
* Find the equivalent match for ind in the
|
||||
* word pattern.
|
||||
*/
|
||||
if ((ind = pattern_match_equivalence
|
||||
(m->word, ind, mt, CONVCAST(*add))) != -1) {
|
||||
if (ret) {
|
||||
if (imeta(ind)) {
|
||||
*p++ = Meta;
|
||||
*p++ = ind ^ 32;
|
||||
} else
|
||||
*p++ = ind;
|
||||
} else
|
||||
len += imeta(ind) ? 2 : 1;
|
||||
}
|
||||
}
|
||||
if (ret)
|
||||
*p++ = ']';
|
||||
else
|
||||
len++;
|
||||
} else {
|
||||
int newlen, addadd;
|
||||
|
||||
switch (m->word->tp) {
|
||||
case CPAT_NCLASS:
|
||||
/*
|
||||
* TODO: the old logic implies that we need to
|
||||
* match *add, i.e. it should be deleted from
|
||||
* the set of character's we're not allowed to
|
||||
* match. That's too much like hard work for
|
||||
* now. Indeed, in general it's impossible
|
||||
* without trickery. Consider *add == 'A',
|
||||
* range == "[^[:upper:]]": we would have to
|
||||
* resort to something like "(A|[^[:upper:]])";
|
||||
* and in an expression like that *add may or
|
||||
* may not need backslashing. So we're deep
|
||||
* into see-if-we-can-get-away-without
|
||||
* territory.
|
||||
*/
|
||||
if (ret) {
|
||||
*p++ = '[';
|
||||
*p++ = '^';
|
||||
} else
|
||||
len += 2;
|
||||
/*
|
||||
* Convert the compiled range string back
|
||||
* to an ordinary string.
|
||||
*/
|
||||
newlen =
|
||||
pattern_range_to_string(m->word->u.str, p);
|
||||
DPUTS(!newlen, "empty character range");
|
||||
if (ret) {
|
||||
p += newlen;
|
||||
*p++ = ']';
|
||||
} else
|
||||
len += newlen + 1;
|
||||
break;
|
||||
|
||||
case CPAT_CCLASS:
|
||||
/*
|
||||
* If there is an equivalence only on one
|
||||
* side it's not equivalent to anything.
|
||||
* Treat it as an ordinary character class.
|
||||
*/
|
||||
case CPAT_EQUIV:
|
||||
case CPAT_CHAR:
|
||||
if (ret)
|
||||
*p++ = '[';
|
||||
else
|
||||
len++;
|
||||
/*
|
||||
* We needed to add *add specially only if
|
||||
* it is not covered by the range. This
|
||||
* is necessary for correct syntax---consider
|
||||
* if *add is ] and ] is also the first
|
||||
* character in the range.
|
||||
*/
|
||||
addadd = !pattern_match1(m->word, CONVCAST(*add), &mt);
|
||||
if (addadd && *add == ']') {
|
||||
if (ret)
|
||||
*p++ = *add;
|
||||
else
|
||||
len++;
|
||||
}
|
||||
if (m->word->tp == CPAT_CHAR) {
|
||||
/*
|
||||
* The matcher just matches a single
|
||||
* character, but we need to be able
|
||||
* to match *add, too, hence we do
|
||||
* this as a [...].
|
||||
*/
|
||||
if (ret) {
|
||||
if (imeta(m->word->u.chr)) {
|
||||
*p++ = Meta;
|
||||
*p++ = m->word->u.chr ^ 32;
|
||||
} else
|
||||
*p++ = m->word->u.chr;
|
||||
} else
|
||||
len += imeta(m->word->u.chr) ? 2 : 1;
|
||||
} else {
|
||||
/*
|
||||
* Convert the compiled range string back
|
||||
* to an ordinary string.
|
||||
*/
|
||||
newlen =
|
||||
pattern_range_to_string(m->word->u.str, p);
|
||||
DPUTS(!newlen, "empty character range");
|
||||
if (ret)
|
||||
p += newlen;
|
||||
else
|
||||
len += newlen;
|
||||
}
|
||||
if (addadd && *add != ']') {
|
||||
if (ret) {
|
||||
if (imeta(*add)) {
|
||||
*p++ = Meta;
|
||||
*p++ = *add ^ 32;
|
||||
} else
|
||||
*p++ = *add;
|
||||
} else
|
||||
len += imeta(*add) ? 2 : 1;
|
||||
}
|
||||
if (ret)
|
||||
*p++ = ']';
|
||||
else
|
||||
len++;
|
||||
break;
|
||||
|
||||
case CPAT_ANY:
|
||||
if (ret)
|
||||
*p++ = '?';
|
||||
else
|
||||
len++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (ret) {
|
||||
*p = '\0';
|
||||
return ret;
|
||||
}
|
||||
p = ret = zhalloc(len + 1);
|
||||
add = adds;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static char *
|
||||
cfp_matcher_pats(char *matcher, char *add)
|
||||
{
|
||||
|
@ -4064,64 +4297,8 @@ cfp_matcher_pats(char *matcher, char *add)
|
|||
break;
|
||||
}
|
||||
}
|
||||
if (*add) {
|
||||
char *ret = "", buf[259];
|
||||
|
||||
for (mp = ms; *add; add++, mp++) {
|
||||
if (!(m = *mp)) {
|
||||
buf[0] = *add;
|
||||
buf[1] = '\0';
|
||||
} else if (m->flags & CMF_RIGHT) {
|
||||
buf[0] = '*';
|
||||
buf[1] = *add;
|
||||
buf[2] = '\0';
|
||||
} else {
|
||||
unsigned char *t, c;
|
||||
char *p = buf;
|
||||
int i;
|
||||
|
||||
for (i = 256, t = m->word->tab; i--; t++)
|
||||
if (*t)
|
||||
break;
|
||||
if (i) {
|
||||
t = m->word->tab;
|
||||
*p++ = '[';
|
||||
if (m->line->equiv && m->word->equiv) {
|
||||
*p++ = *add;
|
||||
c = m->line->tab[STOUC(*add)];
|
||||
for (i = 0; i < 256; i++)
|
||||
if (m->word->tab[i] == c) {
|
||||
*p++ = (char) i;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
if (*add == ']' || t[STOUC(']')])
|
||||
*p++ = ']';
|
||||
for (i = 0; i < 256; i++, t++)
|
||||
if (*t && ((char) i) != *add &&
|
||||
i != ']' && i != '-' &&
|
||||
i != '^' && i != '!')
|
||||
*p++ = (char) i;
|
||||
*p++ = *add;
|
||||
t = m->word->tab;
|
||||
if (*add != '^' && t[STOUC('^')])
|
||||
*p++ = '^';
|
||||
if (*add != '!' && t[STOUC('!')])
|
||||
*p++ = '!';
|
||||
if (*add != '-' && t[STOUC('-')])
|
||||
*p++ = '-';
|
||||
}
|
||||
*p++ = ']';
|
||||
*p = '\0';
|
||||
} else {
|
||||
*p = '?';
|
||||
p[1] = '\0';
|
||||
}
|
||||
}
|
||||
ret = dyncat(ret, buf);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
if (*add)
|
||||
return cfp_matcher_range(ms, add);
|
||||
}
|
||||
return add;
|
||||
}
|
||||
|
|
456
Src/pattern.c
456
Src/pattern.c
|
@ -193,25 +193,6 @@ typedef union upat *Upat;
|
|||
* v v ^
|
||||
* ------------------------
|
||||
*/
|
||||
#define PP_ALPHA 1
|
||||
#define PP_ALNUM 2
|
||||
#define PP_ASCII 3
|
||||
#define PP_BLANK 4
|
||||
#define PP_CNTRL 5
|
||||
#define PP_DIGIT 6
|
||||
#define PP_GRAPH 7
|
||||
#define PP_LOWER 8
|
||||
#define PP_PRINT 9
|
||||
#define PP_PUNCT 10
|
||||
#define PP_SPACE 11
|
||||
#define PP_UPPER 12
|
||||
#define PP_XDIGIT 13
|
||||
#define PP_IDENT 14
|
||||
#define PP_IFS 15
|
||||
#define PP_IFSSPACE 16
|
||||
#define PP_WORD 17
|
||||
#define PP_UNKWN 18
|
||||
#define PP_RANGE 19
|
||||
|
||||
#define P_OP(p) ((p)->l & 0xff)
|
||||
#define P_NEXT(p) ((p)->l >> 8)
|
||||
|
@ -1057,6 +1038,127 @@ patgetglobflags(char **strp, long *assertp, int *ignore)
|
|||
return 1;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Names accepted inside [:...:], terminated by NULL.  The position of
 * each name matters: range_type() returns the index plus PP_FIRST, and
 * pattern_range_to_string() looks names up by (code - PP_FIRST).
 */
static const char *colon_stuffs[] = {
    "alpha",
    "alnum",
    "ascii",
    "blank",
    "cntrl",
    "digit",
    "graph",
    "lower",
    "print",
    "punct",
    "space",
    "upper",
    "xdigit",
    "IDENT",
    "IFS",
    "IFSSPACE",
    "WORD",
    NULL
};
|
||||
|
||||
/*
|
||||
* Handle the guts of a [:stuff:] character class element.
|
||||
* start is the beginning of "stuff" and len is its length.
|
||||
* This code is exported for the benefit of completion matching.
|
||||
*/
|
||||
|
||||
/**/
|
||||
mod_export int
|
||||
range_type(char *start, int len)
|
||||
{
|
||||
const char **csp;
|
||||
|
||||
for (csp = colon_stuffs; *csp; csp++) {
|
||||
if (!strncmp(start, *csp, len))
|
||||
return (csp - colon_stuffs) + PP_FIRST;
|
||||
}
|
||||
|
||||
return PP_UNKWN;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Convert the contents of a [...] or [^...] expression (just the
|
||||
* ... part) back into a string. This is used by compfiles -p/-P
|
||||
* for some reason. The compiled form (a metafied string) is
|
||||
* passed in rangestr.
|
||||
*
|
||||
* If outstr is non-NULL the resulting string is placed there.  It
|
||||
* must be sufficiently long.  A terminating NUL is appended.
|
||||
*
|
||||
* Return the length required, not including the terminating NULL.
|
||||
*
|
||||
* TODO: this is non-multibyte for now. It will need to be defined
|
||||
* appropriately with MULTIBYTE_SUPPORT when the completion matching
|
||||
* code catches up.
|
||||
*/
|
||||
|
||||
/**/
|
||||
mod_export int
|
||||
pattern_range_to_string(char *rangestr, char *outstr)
|
||||
{
|
||||
int len = 0;
|
||||
|
||||
while (*rangestr) {
|
||||
if (imeta(STOUC(*rangestr))) {
|
||||
int swtype = STOUC(*rangestr) - STOUC(Meta);
|
||||
|
||||
if (swtype == 0) {
|
||||
/* Ordindary metafied character */
|
||||
if (outstr)
|
||||
{
|
||||
*outstr++ = Meta;
|
||||
*outstr++ = rangestr[1] ^ 32;
|
||||
}
|
||||
len += 2;
|
||||
rangestr += 2;
|
||||
} else if (swtype == PP_RANGE) {
|
||||
/* X-Y range */
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 2; i++) {
|
||||
if (*rangestr == Meta) {
|
||||
if (outstr) {
|
||||
*outstr++ = Meta;
|
||||
*outstr++ = rangestr[1];
|
||||
}
|
||||
len += 2;
|
||||
rangestr += 2;
|
||||
} else {
|
||||
if (outstr)
|
||||
*outstr++ = *rangestr;
|
||||
len++;
|
||||
rangestr++;
|
||||
}
|
||||
|
||||
if (i == 0) {
|
||||
if (outstr)
|
||||
*outstr++ = '-';
|
||||
len++;
|
||||
}
|
||||
}
|
||||
} else if (swtype >= PP_FIRST && swtype <= PP_LAST) {
|
||||
/* [:stuff:]; we need to output [: and :] */
|
||||
const char *found = colon_stuffs[swtype - PP_FIRST];
|
||||
int newlen = strlen(found);
|
||||
if (outstr) {
|
||||
strcpy(outstr, "[:");
|
||||
outstr += 2;
|
||||
memcpy(outstr, found, newlen);
|
||||
outstr += newlen;
|
||||
strcpy(outstr, ":]");
|
||||
outstr += 2;
|
||||
}
|
||||
len += newlen + 4;
|
||||
rangestr++;
|
||||
} else {
|
||||
/* shouldn't happen */
|
||||
DPUTS(1, "BUG: unknown PP_ code in pattern range");
|
||||
rangestr++;
|
||||
}
|
||||
} else {
|
||||
/* ordinary character, guaranteed no Meta handling needed */
|
||||
if (outstr)
|
||||
*outstr++ = *rangestr;
|
||||
len++;
|
||||
rangestr++;
|
||||
}
|
||||
}
|
||||
|
||||
if (outstr)
|
||||
*outstr = '\0';
|
||||
return len;
|
||||
}
|
||||
|
||||
/*
|
||||
* compile a chunk such as a literal string or a [...] followed
|
||||
* by a possible hash operator
|
||||
|
@ -1230,45 +1332,10 @@ patcomppiece(int *flagp)
|
|||
/* Posix range. */
|
||||
patparse += 2;
|
||||
len = nptr - patparse;
|
||||
if (!strncmp(patparse, "alpha", len))
|
||||
ch = PP_ALPHA;
|
||||
else if (!strncmp(patparse, "alnum", len))
|
||||
ch = PP_ALNUM;
|
||||
else if (!strncmp(patparse, "ascii", len))
|
||||
ch = PP_ASCII;
|
||||
else if (!strncmp(patparse, "blank", len))
|
||||
ch = PP_BLANK;
|
||||
else if (!strncmp(patparse, "cntrl", len))
|
||||
ch = PP_CNTRL;
|
||||
else if (!strncmp(patparse, "digit", len))
|
||||
ch = PP_DIGIT;
|
||||
else if (!strncmp(patparse, "graph", len))
|
||||
ch = PP_GRAPH;
|
||||
else if (!strncmp(patparse, "lower", len))
|
||||
ch = PP_LOWER;
|
||||
else if (!strncmp(patparse, "print", len))
|
||||
ch = PP_PRINT;
|
||||
else if (!strncmp(patparse, "punct", len))
|
||||
ch = PP_PUNCT;
|
||||
else if (!strncmp(patparse, "space", len))
|
||||
ch = PP_SPACE;
|
||||
else if (!strncmp(patparse, "upper", len))
|
||||
ch = PP_UPPER;
|
||||
else if (!strncmp(patparse, "xdigit", len))
|
||||
ch = PP_XDIGIT;
|
||||
else if (!strncmp(patparse, "IDENT", len))
|
||||
ch = PP_IDENT;
|
||||
else if (!strncmp(patparse, "IFS", len))
|
||||
ch = PP_IFS;
|
||||
else if (!strncmp(patparse, "IFSSPACE", len))
|
||||
ch = PP_IFSSPACE;
|
||||
else if (!strncmp(patparse, "WORD", len))
|
||||
ch = PP_WORD;
|
||||
else
|
||||
ch = PP_UNKWN;
|
||||
ch = range_type(patparse, len);
|
||||
patparse = nptr + 2;
|
||||
if (ch != PP_UNKWN)
|
||||
patadd(NULL, STOUC(Meta+ch), 1, PA_NOALIGN);
|
||||
patadd(NULL, STOUC(Meta) + ch, 1, PA_NOALIGN);
|
||||
continue;
|
||||
}
|
||||
charstart = patparse;
|
||||
|
@ -1276,7 +1343,7 @@ patcomppiece(int *flagp)
|
|||
|
||||
if (*patparse == '-' && patparse[1] &&
|
||||
patparse[1] != Outbrack) {
|
||||
patadd(NULL, STOUC(Meta+PP_RANGE), 1, PA_NOALIGN);
|
||||
patadd(NULL, STOUC(Meta)+PP_RANGE, 1, PA_NOALIGN);
|
||||
if (itok(*charstart)) {
|
||||
patadd(0, STOUC(ztokens[*charstart - Pound]), 1,
|
||||
PA_NOALIGN);
|
||||
|
@ -2369,19 +2436,19 @@ patmatch(Upat prog)
|
|||
wchar_t cr = CHARREF(patinput, patinend);
|
||||
char *scanop = (char *)P_OPERAND(scan);
|
||||
if (patglobflags & GF_MULTIBYTE) {
|
||||
if (mb_patmatchrange(scanop, cr) ^
|
||||
if (mb_patmatchrange(scanop, cr, NULL, NULL) ^
|
||||
(P_OP(scan) == P_ANYOF))
|
||||
fail = 1;
|
||||
else
|
||||
CHARINC(patinput, patinend);
|
||||
} else if (patmatchrange(scanop, (int)cr) ^
|
||||
} else if (patmatchrange(scanop, (int)cr, NULL, NULL) ^
|
||||
(P_OP(scan) == P_ANYOF))
|
||||
fail = 1;
|
||||
else
|
||||
CHARINC(patinput, patinend);
|
||||
#else
|
||||
if (patmatchrange((char *)P_OPERAND(scan),
|
||||
CHARREF(patinput, patinend)) ^
|
||||
CHARREF(patinput, patinend), NULL, NULL) ^
|
||||
(P_OP(scan) == P_ANYOF))
|
||||
fail = 1;
|
||||
else
|
||||
|
@ -3122,12 +3189,33 @@ patmatch(Upat prog)
|
|||
/**/
|
||||
#ifdef MULTIBYTE_SUPPORT
|
||||
|
||||
/*
|
||||
* See if character ch matches a pattern range specification.
|
||||
* The null-terminated specification is in range; the test
|
||||
* character is in ch.
|
||||
*
|
||||
* indptr is used by completion matching, which is why this
|
||||
* function is exported. If indptr is not NULL we set *indptr
|
||||
* to the index of the character in the range string, adjusted
|
||||
* in the case of "A-B" ranges such that A would count as its
|
||||
* normal index (say IA), B would count as IA + (B-A), and any
|
||||
* character within the range as appropriate. We're not strictly
|
||||
* guaranteed this fits within a wint_t, but if this is Unicode
|
||||
* in 32 bits we have a fair amount of distance left over.
|
||||
*
|
||||
* mtp is used in the same circumstances. *mtp returns the match type:
|
||||
* 0 for a standard character, else the PP_ index. It's not
|
||||
* useful if the match failed.
|
||||
*/
|
||||
|
||||
/**/
|
||||
static int
|
||||
mb_patmatchrange(char *range, wchar_t ch)
|
||||
mod_export int
|
||||
mb_patmatchrange(char *range, wchar_t ch, wint_t *indptr, int *mtp)
|
||||
{
|
||||
wchar_t r1, r2;
|
||||
|
||||
if (indptr)
|
||||
*indptr = 0;
|
||||
/*
|
||||
* Careful here: unlike other strings, range is a NUL-terminated,
|
||||
* metafied string, because we need to treat the Posix and hyphenated
|
||||
|
@ -3135,7 +3223,10 @@ mb_patmatchrange(char *range, wchar_t ch)
|
|||
*/
|
||||
while (*range) {
|
||||
if (imeta(STOUC(*range))) {
|
||||
switch (STOUC(*range++) - STOUC(Meta)) {
|
||||
int swtype = STOUC(*range++) - STOUC(Meta);
|
||||
if (mtp)
|
||||
*mtp = swtype;
|
||||
switch (swtype) {
|
||||
case 0:
|
||||
/* ordinary metafied character */
|
||||
range--;
|
||||
|
@ -3214,8 +3305,19 @@ mb_patmatchrange(char *range, wchar_t ch)
|
|||
case PP_RANGE:
|
||||
r1 = metacharinc(&range);
|
||||
r2 = metacharinc(&range);
|
||||
if (r1 <= ch && ch <= r2)
|
||||
if (r1 <= ch && ch <= r2) {
|
||||
if (indptr)
|
||||
*indptr += ch - r1;
|
||||
return 1;
|
||||
}
|
||||
/* Careful not to screw up counting with bogus range */
|
||||
if (indptr && r1 < r2) {
|
||||
/*
|
||||
* This gets incremented again below to get
|
||||
* us past the range end. This is correct.
|
||||
*/
|
||||
*indptr += r2 - r1;
|
||||
}
|
||||
break;
|
||||
case PP_UNKWN:
|
||||
DPUTS(1, "BUG: unknown posix range passed through.\n");
|
||||
|
@ -3224,21 +3326,130 @@ mb_patmatchrange(char *range, wchar_t ch)
|
|||
DPUTS(1, "BUG: unknown metacharacter in range.");
|
||||
break;
|
||||
}
|
||||
} else if (metacharinc(&range) == ch)
|
||||
} else if (metacharinc(&range) == ch) {
|
||||
if (mtp)
|
||||
*mtp = 0;
|
||||
return 1;
|
||||
}
|
||||
if (indptr)
|
||||
(*indptr)++;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#if 0
|
||||
/*
|
||||
* This is effectively the reverse of mb_patmatchrange().
|
||||
* Given a range descriptor of the same form, and an index into it,
|
||||
* try to determine the character that is matched. If the index
|
||||
* points to a [:...:] generic style match, set chr to WEOF and
|
||||
* return the type in mtp instead. Return 1 if successful, 0 if
|
||||
* there was no corresponding index. Note all pointer arguments
|
||||
* must be non-null.
|
||||
*
|
||||
* TODO: for now the completion matching code does not handle
|
||||
* multibyte. When it does, we will need either this, or
|
||||
* patmatchindex(), but not both---unlike user-initiated pattern
|
||||
* matching, multibyte mode in the line editor is always on when available.
|
||||
*/
|
||||
|
||||
/**/
|
||||
mod_export int
|
||||
mb_patmatchindex(char *range, wint_t ind, wint_t *chr, int *mtp)
|
||||
{
|
||||
wchar_t r1, r2, rchr;
|
||||
wint_t rdiff;
|
||||
|
||||
*chr = WEOF;
|
||||
*mtp = 0;
|
||||
|
||||
while (*range) {
|
||||
if (imeta(STOUC(*range))) {
|
||||
int swtype = STOUC(*range++) - STOUC(Meta);
|
||||
switch (swtype) {
|
||||
case 0:
|
||||
range--;
|
||||
rchr = metacharinc(&range);
|
||||
if (!ind) {
|
||||
*chr = (wint_t) rchr;
|
||||
return 1;
|
||||
}
|
||||
break;
|
||||
|
||||
case PP_ALPHA:
|
||||
case PP_ALNUM:
|
||||
case PP_ASCII:
|
||||
case PP_BLANK:
|
||||
case PP_CNTRL:
|
||||
case PP_DIGIT:
|
||||
case PP_GRAPH:
|
||||
case PP_LOWER:
|
||||
case PP_PRINT:
|
||||
case PP_PUNCT:
|
||||
case PP_SPACE:
|
||||
case PP_UPPER:
|
||||
case PP_XDIGIT:
|
||||
case PP_IDENT:
|
||||
case PP_IFS:
|
||||
case PP_IFSSPACE:
|
||||
case PP_WORD:
|
||||
if (!ind) {
|
||||
*mtp = swtype;
|
||||
return 1;
|
||||
}
|
||||
break;
|
||||
|
||||
case PP_RANGE:
|
||||
r1 = metacharinc(&range);
|
||||
r2 = metacharinc(&range);
|
||||
rdiff = (wint_t)r2 - (wint_t)r1;
|
||||
if (rdiff >= ind) {
|
||||
*chr = (wint_t)r1 + ind;
|
||||
return 1;
|
||||
}
|
||||
/* note the extra decrement to ind below */
|
||||
ind -= rdiff;
|
||||
break;
|
||||
case PP_UNKWN:
|
||||
DPUTS(1, "BUG: unknown posix range passed through.\n");
|
||||
break;
|
||||
default:
|
||||
DPUTS(1, "BUG: unknown metacharacter in range.");
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
rchr = metacharinc(&range);
|
||||
if (!ind) {
|
||||
*chr = (wint_t)rchr;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (!ind--)
|
||||
break;
|
||||
}
|
||||
|
||||
/* No corresponding index. */
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
/**/
|
||||
static int
|
||||
patmatchrange(char *range, int ch)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Identical function to mb_patmatchrange() above for single-byte
|
||||
* characters.
|
||||
*/
|
||||
|
||||
/**/
|
||||
mod_export int
|
||||
patmatchrange(char *range, int ch, int *indptr, int *mtp)
|
||||
{
|
||||
int r1, r2;
|
||||
|
||||
if (indptr)
|
||||
*indptr = 0;
|
||||
/*
|
||||
* Careful here: unlike other strings, range is a NUL-terminated,
|
||||
* metafied string, because we need to treat the Posix and hyphenated
|
||||
|
@ -3246,7 +3457,10 @@ patmatchrange(char *range, int ch)
|
|||
*/
|
||||
for (; *range; range++) {
|
||||
if (imeta(STOUC(*range))) {
|
||||
switch (STOUC(*range)-STOUC(Meta)) {
|
||||
int swtype = STOUC(*range) - STOUC(Meta);
|
||||
if (mtp)
|
||||
*mtp = swtype;
|
||||
switch (swtype) {
|
||||
case 0:
|
||||
if (STOUC(*++range ^ 32) == ch)
|
||||
return 1;
|
||||
|
@ -3326,8 +3540,13 @@ patmatchrange(char *range, int ch)
|
|||
r2 = STOUC(UNMETA(range));
|
||||
if (*range == Meta)
|
||||
range++;
|
||||
if (r1 <= ch && ch <= r2)
|
||||
if (r1 <= ch && ch <= r2) {
|
||||
if (indptr)
|
||||
*indptr += ch - r1;
|
||||
return 1;
|
||||
}
|
||||
if (indptr && r1 < r2)
|
||||
*indptr += r2 - r1;
|
||||
break;
|
||||
case PP_UNKWN:
|
||||
DPUTS(1, "BUG: unknown posix range passed through.\n");
|
||||
|
@ -3336,12 +3555,103 @@ patmatchrange(char *range, int ch)
|
|||
DPUTS(1, "BUG: unknown metacharacter in range.");
|
||||
break;
|
||||
}
|
||||
} else if (STOUC(*range) == ch)
|
||||
} else if (STOUC(*range) == ch) {
|
||||
if (mtp)
|
||||
*mtp = 0;
|
||||
return 1;
|
||||
}
|
||||
if (indptr)
|
||||
(*indptr)++;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Identical function to mb_patmatchindex() above for single-byte
|
||||
* characters. Here -1 represents a character that needs a special type.
|
||||
*/
|
||||
|
||||
/**/
|
||||
mod_export int
|
||||
patmatchindex(char *range, int ind, int *chr, int *mtp)
|
||||
{
|
||||
int r1, r2, rdiff, rchr;
|
||||
|
||||
*chr = -1;
|
||||
*mtp = 0;
|
||||
|
||||
for (; *range; range++) {
|
||||
if (imeta(STOUC(*range))) {
|
||||
int swtype = STOUC(*range) - STOUC(Meta);
|
||||
switch (swtype) {
|
||||
case 0:
|
||||
/* ordinary metafied character */
|
||||
rchr = STOUC(*++range) ^ 32;
|
||||
if (!ind) {
|
||||
*chr = rchr;
|
||||
return 1;
|
||||
}
|
||||
break;
|
||||
|
||||
case PP_ALPHA:
|
||||
case PP_ALNUM:
|
||||
case PP_ASCII:
|
||||
case PP_BLANK:
|
||||
case PP_CNTRL:
|
||||
case PP_DIGIT:
|
||||
case PP_GRAPH:
|
||||
case PP_LOWER:
|
||||
case PP_PRINT:
|
||||
case PP_PUNCT:
|
||||
case PP_SPACE:
|
||||
case PP_UPPER:
|
||||
case PP_XDIGIT:
|
||||
case PP_IDENT:
|
||||
case PP_IFS:
|
||||
case PP_IFSSPACE:
|
||||
case PP_WORD:
|
||||
if (!ind) {
|
||||
*mtp = swtype;
|
||||
return 1;
|
||||
}
|
||||
break;
|
||||
|
||||
case PP_RANGE:
|
||||
range++;
|
||||
r1 = STOUC(UNMETA(range));
|
||||
METACHARINC(range);
|
||||
r2 = STOUC(UNMETA(range));
|
||||
if (*range == Meta)
|
||||
range++;
|
||||
rdiff = r2 - r1;
|
||||
if (rdiff >= ind) {
|
||||
*chr = r1 + ind;
|
||||
return 1;
|
||||
}
|
||||
/* note the extra decrement to ind below */
|
||||
ind -= rdiff;
|
||||
break;
|
||||
case PP_UNKWN:
|
||||
DPUTS(1, "BUG: unknown posix range passed through.\n");
|
||||
break;
|
||||
default:
|
||||
DPUTS(1, "BUG: unknown metacharacter in range.");
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
if (!ind) {
|
||||
*chr = STOUC(*range);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (!ind--)
|
||||
break;
|
||||
}
|
||||
|
||||
/* No corresponding index. */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Repeatedly match something simple and say how many times.
|
||||
* charstart is an array parallel to that starting at patinput
|
||||
|
@ -3382,14 +3692,14 @@ static int patrepeat(Upat p, char *charstart)
|
|||
#ifdef MULTIBYTE_SUPPORT
|
||||
wchar_t cr = CHARREF(scan, patinend);
|
||||
if (patglobflags & GF_MULTIBYTE) {
|
||||
if (mb_patmatchrange(opnd, cr) ^
|
||||
if (mb_patmatchrange(opnd, cr, NULL, NULL) ^
|
||||
(P_OP(p) == P_ANYOF))
|
||||
break;
|
||||
} else if (patmatchrange(opnd, (int)cr) ^
|
||||
} else if (patmatchrange(opnd, (int)cr, NULL, NULL) ^
|
||||
(P_OP(p) == P_ANYOF))
|
||||
break;
|
||||
#else
|
||||
if (patmatchrange(opnd, CHARREF(scan, patinend)) ^
|
||||
if (patmatchrange(opnd, CHARREF(scan, patinend), NULL, NULL) ^
|
||||
(P_OP(p) == P_ANYOF))
|
||||
break;
|
||||
#endif
|
||||
|
|
42
Src/zsh.h
42
Src/zsh.h
|
@ -1307,6 +1307,48 @@ struct patprog {
|
|||
#define PAT_HAS_EXCLUDP 0x0800 /* (internal): top-level path1~path2. */
|
||||
#define PAT_LCMATCHUC 0x1000 /* equivalent to setting (#l) */
|
||||
|
||||
/*
|
||||
* Special match types used in character classes. These
|
||||
* are represented as tokens, with Meta added. The character
|
||||
* class is represented as a metafied string, with only these
|
||||
* tokens special. Note that an active leading "!" or "^" for
|
||||
* negation is not part of the string but is flagged in the
|
||||
* surrounding context.
|
||||
*
|
||||
* These types are also used in character and equivalence classes
|
||||
* in completion matching.
|
||||
*
|
||||
* This must be kept ordered by the array colon_stuffs in pattern.c.
|
||||
*/
|
||||
/* Special value for first definition */
|
||||
#define PP_FIRST 1
|
||||
/* POSIX-defined types: [:alpha:] etc. */
|
||||
#define PP_ALPHA 1
|
||||
#define PP_ALNUM 2
|
||||
#define PP_ASCII 3
|
||||
#define PP_BLANK 4
|
||||
#define PP_CNTRL 5
|
||||
#define PP_DIGIT 6
|
||||
#define PP_GRAPH 7
|
||||
#define PP_LOWER 8
|
||||
#define PP_PRINT 9
|
||||
#define PP_PUNCT 10
|
||||
#define PP_SPACE 11
|
||||
#define PP_UPPER 12
|
||||
#define PP_XDIGIT 13
|
||||
/* Zsh additions: [:IDENT:] etc. */
|
||||
#define PP_IDENT 14
|
||||
#define PP_IFS 15
|
||||
#define PP_IFSSPACE 16
|
||||
#define PP_WORD 17
|
||||
/* Special value for last definition */
|
||||
#define PP_LAST 17
|
||||
|
||||
/* Unknown type. Not used in a valid token. */
|
||||
#define PP_UNKWN 18
|
||||
/* Range: token followed by the (possibly multibyte) start and end */
|
||||
#define PP_RANGE 19
|
||||
|
||||
/* Globbing flags: lower 8 bits gives approx count */
|
||||
#define GF_LCMATCHUC 0x0100
|
||||
#define GF_IGNCASE 0x0200
|
||||
|
|
Loading…
Reference in a new issue