diff --git a/ChangeLog b/ChangeLog index 130dfc92c..4d5c8e7c7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2006-07-24 Peter Stephenson + + * 22556: Doc/Zsh/builtins.yo, Functions/Zle/insert-composed-char, + Src/builtin.c, Src/pattern.c, Src/subst.c, Src/utils.c, Src/zsh.h, + Src/ztype.h, Src/Zle/zle.h, Src/Zle/zle_main.c, + Test/D04parameter.ztst, Test/D07multibyte.ztst: Multibyte + separators and delimiters. + 2006-07-18 Clint Adams * 22554: Jesse Weinstein: Completion/Unix/Command/_vorbiscomment: diff --git a/Doc/Zsh/builtins.yo b/Doc/Zsh/builtins.yo index bd81a7746..d8892cd5c 100644 --- a/Doc/Zsh/builtins.yo +++ b/Doc/Zsh/builtins.yo @@ -1003,6 +1003,10 @@ Read only one (or var(num)) characters. All are assigned to the first var(name), without word splitting. This flag is ignored when tt(-q) is present. Input is read from the terminal unless one of tt(-u) or tt(-p) is present. This option may also be used within zle widgets. + +Note that despite the mnemonic `key' this option does read full +characters, which may consist of multiple bytes if the option +tt(MULTIBYTE) is set. ) item(tt(-z))( Read one entry from the editor buffer stack and assign it to the first diff --git a/Functions/Zle/insert-composed-char b/Functions/Zle/insert-composed-char index 2ed008990..7978a7589 100644 --- a/Functions/Zle/insert-composed-char +++ b/Functions/Zle/insert-composed-char @@ -128,7 +128,7 @@ # 'm Macron # '' Acute -emulate -LR zsh +emulate -L zsh setopt cbases extendedglob printeightbit local accent basechar ochar error @@ -165,7 +165,8 @@ else fi local -A charmap -charmap=(${=zsh_accented_chars[$accent]}) +# just in case someone is monkeying with IFS... +charmap=(${(s. .)zsh_accented_chars[$accent]}) if [[ ${#charmap} -eq 0 || -z $charmap[$basechar] ]]; then $error "Combination ${basechar}${accent} is not available." 
diff --git a/Src/Zle/zle.h b/Src/Zle/zle.h index 3671f90f3..69c73f4cf 100644 --- a/Src/Zle/zle.h +++ b/Src/Zle/zle.h @@ -62,11 +62,11 @@ typedef wint_t ZLE_INT_T; #define ZC_iblank wcsiblank #define ZC_icntrl iswcntrl #define ZC_idigit iswdigit -#define ZC_iident wcsiident +#define ZC_iident(x) wcsitype((x), IIDENT) #define ZC_ilower iswlower #define ZC_inblank iswspace #define ZC_iupper iswupper -#define ZC_iword wcsiword +#define ZC_iword(x) wcsitype((x), IWORD) #define ZC_tolower towlower #define ZC_toupper towupper diff --git a/Src/Zle/zle_main.c b/Src/Zle/zle_main.c index 1c82611c2..1d4636937 100644 --- a/Src/Zle/zle_main.c +++ b/Src/Zle/zle_main.c @@ -1290,32 +1290,40 @@ bin_vared(char *name, char **args, Options ops, UNUSED(int func)) char **arr = getarrvalue(v), **aptr, **tmparr, **tptr; tptr = tmparr = (char **)zhalloc(sizeof(char *)*(arrlen(arr)+1)); for (aptr = arr; *aptr; aptr++) { - int sepcount = 0; + int sepcount = 0, clen; + convchar_t c; /* * See if this word contains a separator character * or backslash */ - for (t = *aptr; *t; t++) { - if (*t == Meta) { - if (isep(t[1] ^ 32)) - sepcount++; + MB_METACHARINIT(); + for (t = *aptr; *t; ) { + if (*t == '\\') { t++; - } else if (isep(*t) || *t == '\\') sepcount++; + } else { + t += MB_METACHARLENCONV(t, &c); + if (MB_ZISTYPE(c, ISEP)) + sepcount++; + } } if (sepcount) { /* Yes, so allocate enough space to quote it. 
*/ char *newstr, *nptr; newstr = zhalloc(strlen(*aptr)+sepcount+1); /* Go through string quoting separators */ + MB_METACHARINIT(); for (t = *aptr, nptr = newstr; *t; ) { - if (*t == Meta) { - if (isep(t[1] ^ 32)) - *nptr++ = '\\'; - *nptr++ = *t++; - } else if (isep(*t) || *t == '\\') + if (*t == '\\') { *nptr++ = '\\'; - *nptr++ = *t++; + *nptr++ = *t++; + } else { + clen = MB_METACHARLENCONV(t, &c); + if (MB_ZISTYPE(c, ISEP)) + *nptr++ = '\\'; + while (clen--) + *nptr++ = *t++; + } } *nptr = '\0'; /* Stick this into the array of words to join up */ diff --git a/Src/builtin.c b/Src/builtin.c index 71dcbffc3..3bd3b63cb 100644 --- a/Src/builtin.c +++ b/Src/builtin.c @@ -4266,7 +4266,7 @@ bin_break(char *name, char **argv, UNUSED(Options ops), int func) zerrnam(name, "not in while, until, select, or repeat loop"); return 1; } - contflag = 1; /* ARE WE SUPPOSED TO FALL THROUGH HERE? */ + contflag = 1; /* FALLTHROUGH */ case BIN_BREAK: if (!loops) { /* break is only permitted in loops */ zerrnam(name, "not in while, until, select, or repeat loop"); @@ -4560,7 +4560,14 @@ bin_read(char *name, char **args, Options ops, UNUSED(int func)) int readchar = -1, val, resettty = 0; struct ttyinfo saveti; char d; +#ifdef MULTIBYTE_SUPPORT + wchar_t delim = L'\n', wc; + mbstate_t mbs; + char *laststart; + size_t ret; +#else char delim = '\n'; +#endif if (OPT_HASARG(ops,c='k')) { char *eptr, *optarg = OPT_ARG(ops,c); @@ -4666,7 +4673,23 @@ bin_read(char *name, char **args, Options ops, UNUSED(int func)) } if (OPT_ISSET(ops,'d')) { char *delimstr = OPT_ARG(ops,'d'); +#ifdef MULTIBYTE_SUPPORT + wint_t wc; + + if (isset(MULTIBYTE)) { + mb_metacharinit(); + (void)mb_metacharlenconv(delimstr, &wc); + } + else + wc = WEOF; + if (wc != WEOF) + delim = (wchar_t)wc; + else + delim = (wchar_t)((delimstr[0] == Meta) ? + delimstr[1] ^ 32 : delimstr[0]); +#else delim = (delimstr[0] == Meta) ? 
delimstr[1] ^ 32 : delimstr[0]; +#endif if (SHTTY != -1) { struct ttyinfo ti; gettyinfo(&ti); @@ -4710,26 +4733,74 @@ bin_read(char *name, char **args, Options ops, UNUSED(int func)) } } +#ifdef MULTIBYTE_SUPPORT + memset(&mbs, 0, sizeof(mbs)); +#endif + /* option -k means read only a given number of characters (default 1) */ if (OPT_ISSET(ops,'k')) { + int eof = 0; /* allocate buffer space for result */ bptr = buf = (char *)zalloc(nchars+1); do { if (izle) { - if ((val = getkeyptr(0, NULL)) < 0) + if ((val = getkeyptr(0, NULL)) < 0) { + eof = 1; break; - *bptr++ = (char) val; + } + *bptr = (char) val; +#ifdef MULTIBYTE_SUPPORT + if (isset(MULTIBYTE)) { + ret = mbrlen(bptr++, 1, &mbs); + if (ret == MB_INVALID) + memset(&mbs, 0, sizeof(mbs)); + /* treat invalid as single character */ + if (ret != MB_INCOMPLETE) + nchars--; + continue; + } else { + bptr++; + nchars--; + } +#else + bptr++; nchars--; +#endif } else { /* If read returns 0, is end of file */ if (readchar >= 0) { *bptr = readchar; val = 1; readchar = -1; - } else if ((val = read(readfd, bptr, nchars)) <= 0) + } else if ((val = read(readfd, bptr, nchars)) <= 0) { + eof = 1; break; + } +#ifdef MULTIBYTE_SUPPORT + if (isset(MULTIBYTE)) { + while (val > 0) { + ret = mbrlen(bptr, val, &mbs); + if (ret == MB_INCOMPLETE) { + bptr += val; + break; + } else { + if (ret == MB_INVALID) { + memset(&mbs, 0, sizeof(mbs)); + /* treat as single byte */ + ret = 1; + } + else if (ret == 0) /* handle null as normal char */ + ret = 1; + nchars--; + val -= ret; + bptr += ret; + } + } + continue; + } +#endif /* decrement number of characters read from number required */ nchars -= val; @@ -4761,7 +4832,7 @@ bin_read(char *name, char **args, Options ops, UNUSED(int func)) zfree(buf, bptr - buf + 1); if (resettty && SHTTY != -1) settyinfo(&saveti); - return val <= 0; + return eof; } /* option -q means get one character, and interpret it as a Y or N */ @@ -4770,10 +4841,25 @@ bin_read(char *name, char **args, Options ops, 
UNUSED(int func)) /* set up the buffer */ readbuf[1] = '\0'; - + /* get, and store, reply */ if (izle) { +#ifdef MULTIBYTE_SUPPORT + int key; + + while ((key = getkeyptr(0, NULL)) >= 0) { + char c = (char)key; + /* + * If multibyte, it can't be y, so we don't care + * what key gets set to; just read to end of character. + */ + if (!isset(MULTIBYTE) || + mbrlen(&c, 1, &mbs) != MB_INCOMPLETE) + break; + } +#else int key = getkeyptr(0, NULL); +#endif readbuf[0] = (key == 'y' ? 'y' : 'n'); } else { @@ -4786,6 +4872,7 @@ bin_read(char *name, char **args, Options ops, UNUSED(int func)) SHTTY = -1; } } + if (OPT_ISSET(ops,'e') || OPT_ISSET(ops,'E')) printf("%s\n", readbuf); if (!OPT_ISSET(ops,'e')) @@ -4808,16 +4895,79 @@ bin_read(char *name, char **args, Options ops, UNUSED(int func)) while (*args || (OPT_ISSET(ops,'A') && !gotnl)) { sigset_t s = child_unblock(); buf = bptr = (char *)zalloc(bsiz = 64); +#ifdef MULTIBYTE_SUPPORT + laststart = buf; + ret = MB_INCOMPLETE; +#endif /* get input, a character at a time */ while (!gotnl) { c = zread(izle, &readchar); /* \ at the end of a line indicates a continuation * * line, except in raw mode (-r option) */ +#ifdef MULTIBYTE_SUPPORT + if (c == EOF) { + /* not waiting to be completed any more */ + ret = 0; + break; + } + *bptr = (char)c; + if (isset(MULTIBYTE)) { + ret = mbrtowc(&wc, bptr, 1, &mbs); + if (!ret) /* NULL */ + ret = 1; + } else { + ret = 1; + wc = (wchar_t)c; + } + if (ret != MB_INCOMPLETE) { + if (ret == MB_INVALID) + memset(&mbs, 0, sizeof(mbs)); + if (bslash && wc == delim) { + bslash = 0; + continue; + } + if (wc == delim) + break; + /* + * `first' is non-zero if any separator we encounter is a + * non-whitespace separator, which means that anything + * (even an empty string) between, before or after separators + * is significant. If it is zero, we have a whitespace + * separator, which shouldn't cause extra empty strings to + * be emitted. 
Hence the test for (*buf || first) when + * we assign the result of reading a word. + */ + if (!bslash && wcsitype(wc, ISEP)) { + if (bptr != buf || + (!(c < 128 && iwsep(c)) && first)) { + first |= !(c < 128 && iwsep(c)); + break; + } + first |= !(c < 128 && iwsep(c)); + continue; + } + bslash = (wc == L'\\' && !bslash && !OPT_ISSET(ops,'r')); + if (bslash) + continue; + first = 0; + } + if (imeta(STOUC(*bptr))) { + bptr[1] = bptr[0] ^ 32; + bptr[0] = Meta; + bptr += 2; + } + else + bptr++; + if (ret != MB_INCOMPLETE) + laststart = bptr; +#else + if (c == EOF) + break; if (bslash && c == delim) { bslash = 0; continue; } - if (c == EOF || c == delim) + if (c == delim) break; /* * `first' is non-zero if any separator we encounter is a @@ -4845,18 +4995,42 @@ bin_read(char *name, char **args, Options ops, UNUSED(int func)) *bptr++ = c ^ 32; } else *bptr++ = c; +#endif /* increase the buffer size, if necessary */ if (bptr >= buf + bsiz - 1) { int blen = bptr - buf; +#ifdef MULTIBYTE_SUPPORT + int llen = laststart - buf; +#endif buf = realloc(buf, bsiz *= 2); bptr = buf + blen; +#ifdef MULTIBYTE_SUPPORT + laststart = buf + llen; +#endif } } signal_setmask(s); +#ifdef MULTIBYTE_SUPPORT + if (c == EOF) + gotnl = 1; + if (ret == MB_INCOMPLETE) { + /* + * We can only get here if there is an EOF in the + * middle of a character... safest to keep the debris, + * I suppose. 
+ */ + *bptr = '\0'; + } else { + if (wc == delim) + gotnl = 1; + *laststart = '\0'; + } +#else if (c == delim || c == EOF) gotnl = 1; *bptr = '\0'; +#endif /* dispose of word appropriately */ if (OPT_ISSET(ops,'e') || OPT_ISSET(ops,'E')) { zputs(buf, stdout); @@ -4908,12 +5082,66 @@ bin_read(char *name, char **args, Options ops, UNUSED(int func)) return c == EOF; } buf = bptr = (char *)zalloc(bsiz = 64); +#ifdef MULTIBYTE_SUPPORT + laststart = buf; + ret = MB_INCOMPLETE; +#endif /* any remaining part of the line goes into one parameter */ bslash = 0; if (!gotnl) { sigset_t s = child_unblock(); for (;;) { c = zread(izle, &readchar); +#ifdef MULTIBYTE_SUPPORT + if (c == EOF) { + /* not waiting to be completed any more */ + ret = 0; + break; + } + *bptr = (char)c; + if (isset(MULTIBYTE)) { + ret = mbrtowc(&wc, bptr, 1, &mbs); + if (!ret) /* NULL */ + ret = 1; + } else { + ret = 1; + wc = (wchar_t)c; + } + if (ret != MB_INCOMPLETE) { + if (ret == MB_INVALID) + memset(&mbs, 0, sizeof(mbs)); + /* + * \ at the end of a line introduces a continuation line, + * except in raw mode (-r option) + */ + if (bslash && wc == delim) { + bslash = 0; + continue; + } + if (wc == delim && !zbuf) + break; + if (!bslash && bptr == buf && wcsitype(wc, ISEP)) { + if (c < 128 && iwsep(c)) + continue; + else if (!first) { + first = 1; + continue; + } + } + bslash = (wc == L'\\' && !bslash && !OPT_ISSET(ops,'r')); + if (bslash) + continue; + } + if (imeta(STOUC(*bptr))) { + bptr[1] = bptr[0] ^ 32; + bptr[0] = Meta; + bptr += 2; + } + else + bptr++; + if (ret != MB_INCOMPLETE) + laststart = bptr; +#else /* \ at the end of a line introduces a continuation line, except in raw mode (-r option) */ if (bslash && c == delim) { @@ -4938,22 +5166,36 @@ bin_read(char *name, char **args, Options ops, UNUSED(int func)) *bptr++ = c ^ 32; } else *bptr++ = c; +#endif /* increase the buffer size, if necessary */ if (bptr >= buf + bsiz - 1) { int blen = bptr - buf; +#ifdef MULTIBYTE_SUPPORT + int llen = 
laststart - buf; +#endif buf = realloc(buf, bsiz *= 2); bptr = buf + blen; +#ifdef MULTIBYTE_SUPPORT + laststart = buf + llen; +#endif } } signal_setmask(s); } +#ifdef MULTIBYTE_SUPPORT + if (ret != MB_INCOMPLETE) + bptr = laststart; +#endif + /* + * Strip trailing IFS whitespace. + * iwsep can only be certain single-byte ASCII bytes, but we + * must check the byte isn't metafied. + */ while (bptr > buf) { if (bptr > buf + 1 && bptr[-2] == Meta) { - if (iwsep(bptr[-1] ^ 32)) - bptr -= 2; - else - break; + /* non-ASCII, can't be IWSEP */ + break; } else if (iwsep(bptr[-1])) bptr--; else diff --git a/Src/pattern.c b/Src/pattern.c index bc9afbae3..39c146b86 100644 --- a/Src/pattern.c +++ b/Src/pattern.c @@ -318,7 +318,7 @@ metacharinc(char **x) inchar = *inptr++; } *x = inptr; - return (wchar_t)inchar; + return (wchar_t)STOUC(inchar); } while (*inptr) { @@ -352,12 +352,14 @@ typedef int patint_t; #define PEOF EOF #define METACHARINC(x) ((void)((x) += (*(x) == Meta) ? 2 : 1)) -/* - * Return unmetafied char from string (x is any char *) - */ -#define UNMETA(x) (*(x) == Meta ? (x)[1] ^ 32 : *(x)) #endif +/* + * Return unmetafied char from string (x is any char *). + * Used with MULTIBYTE_SUPPORT if the GF_MULTIBYTE is not + * in effect. + */ +#define UNMETA(x) (*(x) == Meta ? (x)[1] ^ 32 : *(x)) /* Add n more characters, ensuring there is enough space. */ @@ -1575,7 +1577,7 @@ charref(char *x, char *y) size_t ret; if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(*x) & 0x80)) - return (wchar_t) *x; + return (wchar_t) STOUC(*x); ret = mbrtowc(&wc, x, y-x, &shiftstate); @@ -1583,7 +1585,7 @@ charref(char *x, char *y) /* Error. Treat as single byte. */ /* Reset the shift state for next time. 
*/ memset(&shiftstate, 0, sizeof(shiftstate)); - return (wchar_t) *x; + return (wchar_t) STOUC(*x); } return wc; @@ -1626,7 +1628,7 @@ charrefinc(char **x, char *y) size_t ret; if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(**x) & 0x80)) - return (wchar_t) *(*x)++; + return (wchar_t) STOUC(*(*x)++); ret = mbrtowc(&wc, *x, y-*x, &shiftstate); @@ -1634,7 +1636,7 @@ charrefinc(char **x, char *y) /* Error. Treat as single byte. */ /* Reset the shift state for next time. */ memset(&shiftstate, 0, sizeof(shiftstate)); - return (wchar_t) *(*x)++; + return (wchar_t) STOUC(*(*x)++); } /* Nulls here are normal characters */ @@ -2222,20 +2224,33 @@ patmatch(Upat prog) } break; case P_ANYOF: - if (patinput == patinend || - !patmatchrange((char *)P_OPERAND(scan), - CHARREF(patinput, patinend))) - fail = 1; - else - CHARINC(patinput, patinend); - break; case P_ANYBUT: - if (patinput == patinend || - patmatchrange((char *)P_OPERAND(scan), - CHARREF(patinput, patinend))) + if (patinput == patinend) fail = 1; - else - CHARINC(patinput, patinend); + else { +#ifdef MULTIBYTE_SUPPORT + wchar_t cr = CHARREF(patinput, patinend); + char *scanop = (char *)P_OPERAND(scan); + if (patglobflags & GF_MULTIBYTE) { + if (mb_patmatchrange(scanop, cr) ^ + (P_OP(scan) == P_ANYOF)) + fail = 1; + else + CHARINC(patinput, patinend); + } else if (patmatchrange(scanop, (int)cr) ^ + (P_OP(scan) == P_ANYOF)) + fail = 1; + else + CHARINC(patinput, patinend); +#else + if (patmatchrange((char *)P_OPERAND(scan), + CHARREF(patinput, patinend)) ^ + (P_OP(scan) == P_ANYOF)) + fail = 1; + else + CHARINC(patinput, patinend); +#endif + } break; case P_NUMRNG: case P_NUMFROM: @@ -2923,7 +2938,7 @@ patmatch(Upat prog) /**/ static int -patmatchrange(char *range, wchar_t ch) +mb_patmatchrange(char *range, wchar_t ch) { wchar_t r1, r2; @@ -2994,21 +3009,20 @@ patmatchrange(char *range, wchar_t ch) return 1; break; case PP_IDENT: - if (wcsiident(ch)) + if (wcsitype(ch, IIDENT)) return 1; break; case PP_IFS: - /* TODO */ - 
if (isep(ch)) + if (wcsitype(ch, ISEP)) return 1; break; case PP_IFSSPACE: - /* TODO */ - if (iwsep(ch)) + /* must be ASCII space character */ + if (ch < 128 && iwsep((int)ch)) return 1; break; case PP_WORD: - if (wcsiword(ch)) + if (wcsitype(ch, IWORD)) return 1; break; case PP_RANGE: @@ -3031,7 +3045,7 @@ patmatchrange(char *range, wchar_t ch) } /**/ -#else +#endif /**/ static int @@ -3142,9 +3156,6 @@ patmatchrange(char *range, int ch) return 0; } -/**/ -#endif - /* * Repeatedly match something simple and say how many times. * charstart is an array parallel to that starting at patinput @@ -3180,20 +3191,26 @@ static int patrepeat(Upat p, char *charstart) } break; case P_ANYOF: - while (scan < patinend && - patmatchrange(opnd, CHARREF(scan, patinend))) { - charstart[scan-patinput] = 1; - count++; - CHARINC(scan, patinend); - } - break; case P_ANYBUT: - while (scan < patinend && - !patmatchrange(opnd, CHARREF(scan, patinend))) { + while (scan < patinend) { +#ifdef MULTIBYTE_SUPPORT + wchar_t cr = CHARREF(scan, patinend); + if (patglobflags & GF_MULTIBYTE) { + if (mb_patmatchrange(opnd, cr) ^ + (P_OP(p) == P_ANYOF)) + break; + } else if (patmatchrange(opnd, (int)cr) ^ + (P_OP(p) == P_ANYOF)) + break; +#else + if (patmatchrange(opnd, CHARREF(scan, patinend)) ^ + P_OP(p) == P_ANYOF) + break; +#endif charstart[scan-patinput] = 1; count++; CHARINC(scan, patinend); - } + } break; #ifdef DEBUG default: diff --git a/Src/subst.c b/Src/subst.c index 821c1c79a..9f2703326 100644 --- a/Src/subst.c +++ b/Src/subst.c @@ -316,9 +316,14 @@ multsub(char **s, int split, char ***a, int *isarr, char *sep) local_list1(foo); if (split) { - for ( ; *x; x += l+1) { + /* + * This doesn't handle multibyte characters, but we're + * looking for whitespace separators which must be ASCII. + */ + for ( ; *x; x += l) { char c = (l = *x == Meta) ? 
x[1] ^ 32 : *x; - if (!iwsep(c)) + l++; + if (!iwsep(STOUC(c))) break; } } @@ -328,20 +333,35 @@ multsub(char **s, int split, char ***a, int *isarr, char *sep) if (split) { LinkNode n = firstnode(&foo); int inq = 0, inp = 0; - for ( ; *x; x += l+1) { - char c = (l = *x == Meta) ? x[1] ^ 32 : *x; - if (!inq && !inp && isep(c)) { - *x = '\0'; - for (x += l+1; *x; x += l+1) { - c = (l = *x == Meta) ? x[1] ^ 32 : *x; - if (!isep(c)) + MB_METACHARINIT(); + for ( ; *x; x += l) { + int rawc = -1; + convchar_t c; + if (itok(STOUC(*x))) { + /* token, can't be separator, must be single byte */ + rawc = *x; + l = 1; + } else { + l = MB_METACHARLENCONV(x, &c); + if (!inq && !inp && MB_ZISTYPE(c, ISEP)) { + *x = '\0'; + for (x += l; *x; x += l) { + if (itok(STOUC(*x))) { + /* as above */ + rawc = *x; + l = 1; + break; + } + l = MB_METACHARLENCONV(x, &c); + if (!MB_ZISTYPE(c, ISEP)) + break; + } + if (!*x) break; + insertlinknode(&foo, n, (void *)x), incnode(n); } - if (!*x) - break; - insertlinknode(&foo, n, (void *)x), incnode(n); } - switch (c) { + switch (rawc) { case Dnull: /* " */ case Snull: /* ' */ case Tick: /* ` (note: no Qtick!) */ @@ -357,8 +377,8 @@ multsub(char **s, int split, char ***a, int *isarr, char *sep) case Bnull: /* \ */ case Bnullkeep: /* The parser verified the following char's existence. */ - x += l+1; - l = *x == Meta; + x += l; + l = MB_METACHARLEN(x); break; } } @@ -685,12 +705,14 @@ invinstrpcmp(const void *a, const void *b) static char * dopadding(char *str, int prenum, int postnum, char *preone, char *postone, char *premul, char *postmul) { - char def[3], *ret, *t, *r; + char *def, *ret, *t, *r; int ls, ls2, lpreone, lpostone, lpremul, lpostmul, lr, f, m, c, cc; - def[0] = *ifs ? *ifs : ' '; - def[1] = *ifs == Meta ? 
ifs[1] ^ 32 : '\0'; - def[2] = '\0'; + MB_METACHARINIT(); + if (*ifs) + def = dupstrpfx(ifs, MB_METACHARLEN(ifs)); + else + def = ""; if (preone && !*preone) preone = def; if (postone && !*postone) diff --git a/Src/utils.c b/Src/utils.c index 0d6cd8866..6ea254a4d 100644 --- a/Src/utils.c +++ b/Src/utils.c @@ -35,16 +35,65 @@ /**/ char *scriptname; -/**/ #ifdef MULTIBYTE_SUPPORT +struct widechar_array { + wchar_t *chars; + size_t len; +}; +typedef struct widechar_array *Widechar_array; + /* * The wordchars variable turned into a wide character array. * This is much more convenient for testing. */ +struct widechar_array wordchars_wide; -/**/ -mod_export wchar_t *wordchars_wide; -/**/ +/* + * The same for the separators (IFS) array. + */ +struct widechar_array ifs_wide; + +/* Function to set one of the above from the multibyte array; the result is left empty (chars NULL, len 0) if MULTIBYTE is unset or any character converts to WEOF */ + +static void +set_widearray(char *mb_array, Widechar_array wca) +{ + if (wca->chars) { + free(wca->chars); + wca->chars = NULL; + } + wca->len = 0; + + if (!isset(MULTIBYTE)) + return; + + if (mb_array) { + VARARR(wchar_t, tmpwcs, strlen(mb_array)); + wchar_t *wcptr = tmpwcs; + wint_t wci; + + mb_metacharinit(); + while (*mb_array) { + int mblen = mb_metacharlenconv(mb_array, &wci); + + if (!mblen) + break; + /* No good unless all characters are convertible: test the value just converted, not the unwritten output slot */ + if (wci == WEOF) + return; + *wcptr++ = (wchar_t)wci; +#ifdef DEBUG + if (wcptr[-1] < 0) + fprintf(stderr, "BUG: Bad cast to wchar_t\n"); +#endif + mb_array += mblen; + } + + wca->len = wcptr - tmpwcs; + wca->chars = (wchar_t *)zalloc(wca->len * sizeof(wchar_t)); + wmemcpy(wca->chars, tmpwcs, wca->len); + } +} #endif @@ -1853,9 +1902,34 @@ getquery(char *valid_chars, int purge) if (c != '\n') while ((d = read1char()) >= 0 && d != '\n'); } else { - settyinfo(&shttyinfo); - if (c != '\n' && !valid_chars) + if (c != '\n' && !valid_chars) { +#ifdef MULTIBYTE_SUPPORT + if (isset(MULTIBYTE) && c >= 0) { + /* + * No waiting for a valid character, and no draining; + * we should 
ensure we haven't stopped in the middle + * of a multibyte character. + */ + mbstate_t mbs; + char cc = (char)c; + memset(&mbs, 0, sizeof(mbs)); + for (;;) { + size_t ret = mbrlen(&cc, 1, &mbs); + + if (ret != MB_INCOMPLETE) + break; + c = read1char(); + if (c < 0) + break; + cc = (char)c; + } + } +#endif + settyinfo(&shttyinfo); write(SHTTY, "\n", 1); + } + else + settyinfo(&shttyinfo); } return c; } @@ -2253,6 +2327,10 @@ skipwsep(char **s) char *t = *s; int i = 0; + /* + * Don't need to handle multibyte characters, they can't + * be IWSEP. Do need to check for metafication. + */ while (*t && iwsep(*t == Meta ? t[1] ^ 32 : *t)) { if (*t == Meta) t++; @@ -2293,19 +2371,23 @@ spacesplit(char *s, int allownull, int heap, int quote) t = s; skipwsep(&s); - if (*s && isep(*s == Meta ? s[1] ^ 32 : *s)) + MB_METACHARINIT(); + if (*s && itype_end(s, ISEP, 1) != s) *ptr++ = dup(allownull ? "" : nulstring); else if (!allownull && t != s) *ptr++ = dup(""); while (*s) { - if (isep(*s == Meta ? s[1] ^ 32 : *s) || (quote && *s == '\\')) { - if (*s == Meta) - s++; + char *iend = itype_end(s, ISEP, 1); + if (iend != s) { + s = iend; + skipwsep(&s); + } + else if (quote && *s == '\\') { s++; skipwsep(&s); } t = s; - findsep(&s, NULL, quote); + (void)findsep(&s, NULL, quote); if (s > t || allownull) { *ptr = (heap ? (char *) hcalloc((s - t) + 1) : (char *) zshcalloc((s - t) + 1)); @@ -2321,68 +2403,87 @@ spacesplit(char *s, int allownull, int heap, int quote) return ret; } +/* + * Find a separator. Return 0 if already at separator, 1 if separator + * found later, else -1. (Historical note: used to return length into + * string but this is all that is necessary and is less ambiguous with + * multibyte characters around.) + * + * *s is the string we are looking along, which will be updated + * to the point we have got to. + * + * sep is a possibly multicharacter separator to look for. If NULL, + * use normal separator characters. If *sep is NULL, split on individual + * characters. 
+ * + * quote is a flag that '\' should not be treated as a separator. + * in this case we need to be able to strip the backslash directly + * in the string, so the calling function must have sent us something + * modifiable. currently this only works for sep == NULL. also in + * in this case only, we need to turn \\ into \. + */ + /**/ static int findsep(char **s, char *sep, int quote) { /* - * *s is the string we are looking along, which will be updated - * to the point we have got to. - * - * sep is a possibly multicharacter separator to look for. If NULL, - * use normal separator characters. - * - * quote is a flag that '\' should not be treated as a separator. - * in this case we need to be able to strip the backslash directly - * in the string, so the calling function must have sent us something - * modifiable. currently this only works for sep == NULL. also in - * in this case only, we need to turn \\ into \. */ - int i; + int i, ilen; char *t, *tt; + convchar_t c; + MB_METACHARINIT(); if (!sep) { - for (t = *s; *t; t++) { - if (quote && *t == '\\' && - (isep(t[1] == Meta ? (t[2] ^ 32) : t[1]) || t[1] == '\\')) { - chuck(t); - if (*t == Meta) - t++; - continue; - } - if (*t == Meta) { - if (isep(t[1] ^ 32)) + for (t = *s; *t; t += ilen) { + if (quote && *t == '\\') { + if (t[1] == '\\') { + chuck(t); + ilen = 1; + continue; + } else { + ilen = MB_METACHARLENCONV(t+1, &c); + if (MB_ZISTYPE(c, ISEP)) { + chuck(t); + /* then advance over new character, length ilen */ + } else { + /* treat *t (backslash) as normal byte */ + if (isep(*t)) + break; + ilen = 1; + } + } + } else { + ilen = MB_METACHARLENCONV(t, &c); + if (MB_ZISTYPE(c, ISEP)) break; - t++; - } else if (isep(*t)) - break; + } } - i = t - *s; + i = (t > *s); *s = t; return i; } if (!sep[0]) { + /* + * NULL separator just means advance past first character, + * if any. 
+ */ if (**s) { - if (**s == Meta) - *s += 2; - else - ++*s; + *s += MB_METACHARLEN(*s); return 1; } return -1; } for (i = 0; **s; i++) { + /* + * The following works for multibyte characters by virtue of + * the fact that sep may be a string (and we don't care how + * it divides up, we need to match all of it). + */ for (t = sep, tt = *s; *t && *tt && *t == *tt; t++, tt++); if (!*t) - return i; - if (*(*s)++ == Meta) { -#ifdef DEBUG - if (! *(*s)++) - fprintf(stderr, "BUG: unexpected end of string in findsep()\n"); -#else - (*s)++; -#endif - } + return (i > 0); + *s += MB_METACHARLEN(*s); } return -1; } @@ -2405,16 +2506,15 @@ findword(char **s, char *sep) } return r; } - for (t = *s; *t; t++) { - if (*t == Meta) { - if (! isep(t[1] ^ 32)) - break; - t++; - } else if (! isep(*t)) + MB_METACHARINIT(); + for (t = *s; *t; t += sl) { + convchar_t c; + sl = MB_METACHARLENCONV(t, &c); + if (!MB_ZISTYPE(c, ISEP)) break; } *s = t; - findsep(s, sep, 0); + (void)findsep(s, sep, 0); return t; } @@ -2436,18 +2536,17 @@ wordcount(char *s, char *sep, int mul) r = 0; if (mul <= 0) skipwsep(&s); - if ((*s && isep(*s == Meta ? s[1] ^ 32 : *s)) || + if ((*s && itype_end(s, ISEP, 1) != s) || (mul < 0 && t != s)) r++; for (; *s; r++) { - if (isep(*s == Meta ? s[1] ^ 32 : *s)) { - if (*s == Meta) - s++; - s++; + char *ie = itype_end(s, ISEP, 1); + if (ie != s) { + s = ie; if (mul <= 0) skipwsep(&s); } - findsep(&s, NULL, 0); + (void)findsep(&s, NULL, 0); t = s; if (mul <= 0) skipwsep(&s); @@ -2464,19 +2563,20 @@ sepjoin(char **s, char *sep, int heap) { char *r, *p, **t; int l, sl; - char sepbuf[3]; + char sepbuf[2]; if (!*s) return heap ? "" : ztrdup(""); if (!sep) { - p = sep = sepbuf; - if (ifs) { - *p++ = *ifs; - *p++ = *ifs == Meta ? 
ifs[1] ^ 32 : '\0'; + /* optimise common case that ifs[0] is space */ + if (ifs && *ifs != ' ') { + MB_METACHARINIT(); + sep = dupstrpfx(ifs, MB_METACHARLEN(ifs)); } else { + p = sep = sepbuf; *p++ = ' '; + *p = '\0'; } - *p = '\0'; } sl = strlen(sep); for (t = s, l = 1 - sl; *t; l += strlen(*t) + sl, t++); @@ -2508,7 +2608,7 @@ sepsplit(char *s, char *sep, int allownull, int heap) for (t = s; n--;) { tt = t; - findsep(&t, sep, 0); + (void)findsep(&t, sep, 0); *p = (heap ? (char *) hcalloc(t - tt + 1) : (char *) zshcalloc(t - tt + 1)); strncpy(*p, tt, t - tt); @@ -2637,39 +2737,21 @@ inittyptab(void) for (t0 = (int)STOUC(Snull); t0 <= (int)STOUC(Nularg); t0++) typtab[t0] |= ITOK | IMETA | INULL; for (s = ifs ? ifs : DEFAULT_IFS; *s; s++) { - if (inblank(*s)) { - if (s[1] == *s) + int c = STOUC(*s == Meta ? *++s ^ 32 : *s); +#ifdef MULTIBYTE_SUPPORT + if (!isascii(c)) { + /* see comment for wordchars below */ + continue; + } +#endif + if (inblank(c)) { + if (s[1] == c) s++; else - typtab[STOUC(*s)] |= IWSEP; + typtab[c] |= IWSEP; } - typtab[STOUC(*s == Meta ? *++s ^ 32 : *s)] |= ISEP; + typtab[c] |= ISEP; } -#ifdef MULTIBYTE_SUPPORT - if (wordchars) { - char *wordchars_unmeta; - const char *wordchars_ptr; - mbstate_t mbs; - size_t nchars; - int unmetalen; - - wordchars_unmeta = dupstring(wordchars); - wordchars_ptr = unmetafy(wordchars_unmeta, &unmetalen); - - memset(&mbs, 0, sizeof(mbs)); - wordchars_wide = (wchar_t *) - zrealloc(wordchars_wide, (unmetalen+1)*sizeof(wchar_t)); - nchars = mbsrtowcs(wordchars_wide, &wordchars_ptr, unmetalen, &mbs); - if (nchars == MB_INVALID || nchars == MB_INCOMPLETE) { - /* Conversion state is undefined: better just set to null */ - nchars = 0; - } - wordchars_wide[nchars] = L'\0'; - } else { - wordchars_wide = zrealloc(wordchars_wide, sizeof(wchar_t)); - *wordchars_wide = L'\0'; - } -#endif for (s = wordchars ? wordchars : DEFAULT_WORDCHARS; *s; s++) { int c = STOUC(*s == Meta ? 
*++s ^ 32 : *s); #ifdef MULTIBYTE_SUPPORT @@ -2686,6 +2768,10 @@ inittyptab(void) #endif typtab[c] |= IWORD; } +#ifdef MULTIBYTE_SUPPORT + set_widearray(wordchars, &wordchars_wide); + set_widearray(ifs, &ifs_wide); +#endif for (s = SPECCHARS; *s; s++) typtab[STOUC(*s)] |= ISPECIAL; if (specialcomma) @@ -2718,62 +2804,60 @@ wcsiblank(wint_t wc) } /* - * iword() macro extended to support wide characters. + * zistype macro extended to support wide characters. + * Works for IIDENT, IWORD, IALNUM, ISEP. + * We don't need this for IWSEP because that only applies to + * a fixed set of ASCII characters. + * Note here that use of multibyte mode is not tested: + * that's because for ZLE this is unconditional, + * not dependent on the option. The caller must decide. */ /**/ mod_export int -wcsiword(wchar_t c) +wcsitype(wchar_t c, int itype) { int len; VARARR(char, outstr, MB_CUR_MAX); + + if (!isset(MULTIBYTE)) + return zistype(c, itype); + /* * Strategy: the shell requires that the multibyte representation * be an extension of ASCII. So see if converting the character - * produces an ASCII character. If it does, use iword on that. - * If it doesn't, use iswalnum on the original character. This - * is pretty good most of the time. + * produces an ASCII character. If it does, use zistype on that. + * If it doesn't, use iswalnum on the original character. + * If that fails, resort to the appropriate wide character array. 
*/ len = wctomb(outstr, c); if (len == 0) { /* NULL is special */ - return iword(0); + return zistype(0, itype); } else if (len == 1 && iascii(*outstr)) { - return iword(*outstr); + return zistype(*outstr, itype); } else { - return iswalnum(c) || wcschr(wordchars_wide, c); + switch (itype) { + case IIDENT: + if (!isset(POSIXIDENTIFIERS)) + return 0; + return iswalnum(c); + + case IWORD: + if (iswalnum(c)) + return 1; + return !!wmemchr(wordchars_wide.chars, c, wordchars_wide.len); + + case ISEP: + return !!wmemchr(ifs_wide.chars, c, ifs_wide.len); + + default: + return iswalnum(c); + } } } -/* - * iident() macro extended to support wide characters. - * - * The macro is intended to test if a character is allowed in an - * internal zsh identifier. We allow all alphanumerics outside - * the ASCII range unless POSIXIDENTIFIERS is set. - * - * Otherwise similar to wcsiword. - */ - -/**/ -mod_export int -wcsiident(wchar_t c) -{ - int len; - VARARR(char, outstr, MB_CUR_MAX); - - len = wctomb(outstr, c); - - if (len == 0) { - /* NULL is special */ - return 0; - } else if (len == 1 && iascii(*outstr)) { - return iident(*outstr); - } else { - return !isset(POSIXIDENTIFIERS) && iswalnum(c); - } -} /**/ #endif @@ -2789,7 +2873,7 @@ wcsiident(wchar_t c) * If "once" is set, just test the first character, i.e. (outptr != * inptr) tests whether the first character is valid in an identifier. * - * Currently this is only called with itype IIDENT or IUSER. + * Currently this is only called with itype IIDENT, IUSER or ISEP. */ /**/ @@ -2819,12 +2903,25 @@ itype_end(const char *ptr, int itype, int once) break; } else { /* - * Valid non-ASCII character. Allow all alphanumerics; - * if testing for words, allow all wordchars. + * Valid non-ASCII character. 
*/ - if (!(iswalnum(wc) || - (itype == IWORD && wcschr(wordchars_wide, wc)))) + switch (itype) { + case IWORD: + if (!iswalnum(wc) && + !wmemchr(wordchars_wide.chars, wc, + wordchars_wide.len)) + return (char *)ptr; break; + + case ISEP: + if (!wmemchr(ifs_wide.chars, wc, ifs_wide.len)) + return (char *)ptr; + break; + + default: + if (!iswalnum(wc)) + return (char *)ptr; + } } ptr += len; @@ -3791,16 +3888,22 @@ mb_metacharlenconv(const char *s, wint_t *wcp) wchar_t wc; if (!isset(MULTIBYTE)) { + /* treat as single byte, possibly metafied */ if (wcp) - *wcp = WEOF; + *wcp = (wint_t)(*s == Meta ? s[1] ^ 32 : *s); return 1 + (*s == Meta); } ret = MB_INVALID; for (ptr = s; *ptr; ) { - if (*ptr == Meta) + if (*ptr == Meta) { inchar = *++ptr ^ 32; - else +#ifdef DEBUG + if (!*ptr) + fprintf(stderr, + "BUG: unexpected end of string in mb_metacharlen()\n"); +#endif + } else inchar = *ptr; ptr++; ret = mbrtowc(&wc, &inchar, 1, &mb_shiftstate); @@ -3873,6 +3976,23 @@ mb_metastrlen(char *ptr) return num + num_in_char; } +/**/ +#else + +/* Simple replacement for mb_metacharlenconv: unmetafy one char into *c */ +int +metacharlenconv(char *x, int *c) +{ + if (*x == Meta) { + if (c) + *c = STOUC(x[1] ^ 32); + return 2; + } + if (c) + *c = STOUC(*x); + return 1; +} + /**/ #endif /* MULTIBYTE_SUPPORT */ diff --git a/Src/zsh.h b/Src/zsh.h index b5f675db5..25399b9d9 100644 --- a/Src/zsh.h +++ b/Src/zsh.h @@ -1925,6 +1925,8 @@ typedef char *(*ZleGetLineFn) _((int *, int *)); #ifdef MULTIBYTE_SUPPORT #define nicezputs(str, outs) (void)mb_niceformat((str), (outs), NULL, 0) #define MB_METACHARINIT() mb_metacharinit() +typedef wint_t convchar_t; +#define MB_METACHARLENCONV(str, cp) mb_metacharlenconv((str), (cp)) #define MB_METACHARLEN(str) mb_metacharlenconv(str, NULL) #define MB_METASTRLEN(str) mb_metastrlen(str) @@ -1948,6 +1950,8 @@ typedef char *(*ZleGetLineFn) _((int *, int *)); #else #define MB_METACHARINIT() +typedef int convchar_t; +#define MB_METACHARLENCONV(str, cp) metacharlenconv((str), (cp)) #define
MB_METACHARLEN(str) (*(str) == Meta ? 2 : 1) #define MB_METASTRLEN(str) ztrlen(str) diff --git a/Src/ztype.h b/Src/ztype.h index 7b7973602..7aa56b073 100644 --- a/Src/ztype.h +++ b/Src/ztype.h @@ -59,6 +59,12 @@ #define iwsep(X) zistype(X,IWSEP) #define inull(X) zistype(X,INULL) +#ifdef MULTIBYTE_SUPPORT +#define MB_ZISTYPE(X,Y) wcsitype((X),(Y)) +#else +#define MB_ZISTYPE(X,Y) zistype((X),(Y)) +#endif + #define iascii(X) isascii(STOUC(X)) #define ilower(X) islower(STOUC(X)) #define iprint(X) isprint(STOUC(X)) diff --git a/Test/D04parameter.ztst b/Test/D04parameter.ztst index 73e87d0f8..6e97d7450 100644 --- a/Test/D04parameter.ztst +++ b/Test/D04parameter.ztst @@ -725,6 +725,29 @@ >7 >8 +# Tests a long-standing bug with joining on metafied characters in IFS + (array=(one two three) + IFS=$'\0' + foo="$array" + for (( i = 1; i <= ${#foo}; i++ )); do + char=${foo[i]} + print $(( #char )) + done) +0:Joining with NULL character from IFS +>111 +>110 +>101 +>0 +>116 +>119 +>111 +>0 +>116 +>104 +>114 +>101 +>101 + unset SHLVL (( SHLVL++ )) print $SHLVL diff --git a/Test/D07multibyte.ztst b/Test/D07multibyte.ztst index 20c967540..683e8350e 100644 --- a/Test/D07multibyte.ztst +++ b/Test/D07multibyte.ztst @@ -174,3 +174,57 @@ 1:POSIX_IDENTIFIERS option >3 ?(eval):1: command not found: hähä=3 + + foo="Ølaf«Ødd«øpénëd«ån«àpple" + print -l ${(s.«.)foo} + ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος." + print -l ${=ioh} + print ${(w)#ioh} +0:Splitting with multibyte characters +>Ølaf +>Ødd +>øpénëd +>ån +>àpple +>Ἐν +>ἀρχῇ +>ἦν +>ὁ +>λόγος, +>καὶ +>ὁ +>λόγος +>ἦν +>πρὸς +>τὸν +>θεόν, +>καὶ +>θεὸς +>ἦν +>ὁ +>λόγος. +>17 + + read -d £ one + read -d £ two + print $one + print $two +0:read with multibyte delimiter +<first£second£ +>first +>second + + (IFS=« + read -d » -A array + print -l $array) +0:read -A with multibyte IFS +<dominus«illuminatio«mea»ignored +>dominus +>illuminatio +>mea + + read -k2 -u0 twochars + print $twochars +0:read multibyte characters +<«»ignored +>«»