Add non-metafied character length handling.

Use this in regex module and add test using $'\ua0'.

Rename mb_metacharinit() to mb_charinit() as it does not involve
metafied characters.
This commit is contained in:
Peter Stephenson 2015-06-12 09:30:39 +01:00
parent 370e7f73f6
commit f1923bdfa6
12 changed files with 154 additions and 31 deletions

View file

@ -1,3 +1,13 @@
2015-06-12 Peter Stephenson <p.stephenson@samsung.com>
* 35448: Src/Modules/curses.c, Src/Modules/regex.c,
Src/Zle/complist.c, Src/Zle/zle_utils.c, Src/builtin.c,
Src/glob.c, Src/hist.c, Src/prompt.c, Src/utils.c, Src/zsh.h,
Test/D07multibyte.ztst: Add non-metafied character length
handling and use this for regex module. Add test.
Rename mb_metacharinit() to mb_charinit() since it doesn't
involve metafied characters.
2015-06-11 Peter Stephenson <p.stephenson@samsung.com>
* 35442: Doc/Zsh/options.yo: multibyte option now on

View file

@ -765,7 +765,7 @@ zccmd_string(const char *nam, char **args)
w = (ZCWin)getdata(node);
#ifdef HAVE_WADDWSTR
mb_metacharinit();
mb_charinit();
wptr = wstr = zhalloc((strlen(str)+1) * sizeof(wchar_t));
while (*str && (clen = mb_metacharlenconv(str, &wc))) {

View file

@ -115,6 +115,7 @@ zcond_regex_match(char **a, int id)
} else {
zlong offs;
char *ptr;
int clen, leftlen;
m = matches;
s = metafy(lhstr + m->rm_so, m->rm_eo - m->rm_so, META_DUP);
@ -123,19 +124,25 @@ zcond_regex_match(char **a, int id)
* Count the characters before the match.
*/
ptr = lhstr;
leftlen = m->rm_so;
offs = 0;
MB_METACHARINIT();
while (ptr < lhstr + m->rm_so) {
MB_CHARINIT();
while (leftlen) {
offs++;
ptr += MB_METACHARLEN(ptr);
clen = MB_CHARLEN(ptr, leftlen);
ptr += clen;
leftlen -= clen;
}
setiparam("MBEGIN", offs + !isset(KSHARRAYS));
/*
* Add on the characters in the match.
*/
while (ptr < lhstr + m->rm_eo) {
leftlen = m->rm_eo - m->rm_so;
while (leftlen) {
offs++;
ptr += MB_METACHARLEN(ptr);
clen = MB_CHARLEN(ptr, leftlen);
ptr += clen;
leftlen -= clen;
}
setiparam("MEND", offs + !isset(KSHARRAYS) - 1);
if (nelem) {
@ -149,19 +156,25 @@ zcond_regex_match(char **a, int id)
{
char buf[DIGBUFSIZE];
ptr = lhstr;
leftlen = m->rm_so;
offs = 0;
/* Find the start offset */
MB_METACHARINIT();
while (ptr < lhstr + m->rm_so) {
MB_CHARINIT();
while (leftlen) {
offs++;
ptr += MB_METACHARLEN(ptr);
clen = MB_CHARLEN(ptr, leftlen);
ptr += clen;
leftlen -= clen;
}
convbase(buf, offs + !isset(KSHARRAYS), 10);
*bptr = ztrdup(buf);
/* Continue to the end offset */
while (ptr < lhstr + m->rm_eo) {
leftlen = m->rm_eo - m->rm_so;
while (leftlen ) {
offs++;
ptr += MB_METACHARLEN(ptr);
clen = MB_CHARLEN(ptr, leftlen);
ptr += clen;
leftlen -= clen;
}
convbase(buf, offs + !isset(KSHARRAYS) - 1, 10);
*eptr = ztrdup(buf);

View file

@ -728,7 +728,7 @@ clnicezputs(int do_colors, char *s, int ml)
if (do_colors)
initiscol();
mb_metacharinit();
mb_charinit();
while (umleft > 0) {
size_t cnt = eol ? MB_INVALID : mbrtowc(&cc, uptr, umleft, &mbs);

View file

@ -1288,7 +1288,7 @@ showmsg(char const *msg)
p = unmetafy(umsg, &ulen);
memset(&mbs, 0, sizeof mbs);
mb_metacharinit();
mb_charinit();
while (ulen > 0) {
char const *n;
if (*p == '\n') {

View file

@ -4582,7 +4582,7 @@ bin_print(char *name, char **args, Options ops, int func)
convchar_t cc;
#ifdef MULTIBYTE_SUPPORT
if (isset(MULTIBYTE)) {
mb_metacharinit();
mb_charinit();
(void)mb_metacharlenconv(metafy(curarg+1, curlen-1,
META_USEHEAP), &cc);
}
@ -5557,7 +5557,7 @@ bin_read(char *name, char **args, Options ops, UNUSED(int func))
wint_t wi;
if (isset(MULTIBYTE)) {
mb_metacharinit();
mb_charinit();
(void)mb_metacharlenconv(delimstr, &wi);
}
else

View file

@ -2237,7 +2237,7 @@ xpandbraces(LinkList list, LinkNode *np)
#ifdef MULTIBYTE_SUPPORT
char *ncptr;
int nclen;
mb_metacharinit();
mb_charinit();
ncptr = wcs_nicechar(cend, NULL, NULL);
nclen = strlen(ncptr);
p = zhalloc(lenalloc + nclen);
@ -2805,7 +2805,7 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr,
* ... now we know whether it's worth looking for the
* shortest, which we do by brute force.
*/
mb_metacharinit();
mb_charinit();
for (t = s, umlen = 0; t < s + mlen; ) {
set_pat_end(p, *t);
if (pattrylen(p, s, t - s, umlen, 0)) {
@ -2831,7 +2831,7 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr,
* so that match, mbegin, mend and MATCH, MBEGIN, MEND are
* correct.
*/
mb_metacharinit();
mb_charinit();
tmatch = NULL;
for (ioff = 0, t = s, umlen = umltot; t < s + l; ioff++) {
set_pat_start(p, t-s);
@ -2855,7 +2855,7 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr,
/* Largest possible match at tail of string: *
* move forward along string until we get a match. *
* Again there's no optimisation. */
mb_metacharinit();
mb_charinit();
for (ioff = 0, t = s, umlen = umltot; t < s + l; ioff++) {
set_pat_start(p, t-s);
if (pattrylen(p, t, s + l - t, umlen, ioff)) {
@ -2889,7 +2889,7 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr,
}
ioff = 0; /* offset into string */
umlen = umltot;
mb_metacharinit();
mb_charinit();
do {
/* loop over all matches for global substitution */
matched = 0;
@ -2986,7 +2986,7 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr,
*/
nmatches = 0;
tmatch = NULL;
mb_metacharinit();
mb_charinit();
for (ioff = 0, t = s, umlen = umltot; t < s + l; ioff++) {
set_pat_start(p, t-s);
if (pattrylen(p, t, s + l - t, umlen, ioff)) {
@ -3002,7 +3002,7 @@ igetmatch(char **sp, Patprog p, int fl, int n, char *replstr,
* We need to find the n'th last match.
*/
n = nmatches - n;
mb_metacharinit();
mb_charinit();
for (ioff = 0, t = s, umlen = umltot; t < s + l; ioff++) {
set_pat_start(p, t-s);
if (pattrylen(p, t, s + l - t, umlen, ioff) &&

View file

@ -2000,7 +2000,7 @@ casemodify(char *str, int how)
VARARR(char, mbstr, MB_CUR_MAX);
mbstate_t ps;
mb_metacharinit();
mb_charinit();
memset(&ps, 0, sizeof(ps));
while (*str) {
wint_t wc;

View file

@ -964,7 +964,7 @@ stradd(char *d)
/* FALL THROUGH */
default:
/* Take full wide character in one go */
mb_metacharinit();
mb_charinit();
pc = wcs_nicechar(cc, NULL, NULL);
break;
}

View file

@ -82,7 +82,7 @@ set_widearray(char *mb_array, Widechar_array wca)
wchar_t *wcptr = tmpwcs;
wint_t wci;
mb_metacharinit();
mb_charinit();
while (*mb_array) {
int mblen = mb_metacharlenconv(mb_array, &wci);
@ -332,7 +332,7 @@ zerrmsg(FILE *file, const char *fmt, va_list ap)
case 'c':
num = va_arg(ap, int);
#ifdef MULTIBYTE_SUPPORT
mb_metacharinit();
mb_charinit();
zputs(wcs_nicechar(num, NULL, NULL), file);
#else
zputs(nicechar(num), file);
@ -461,12 +461,13 @@ static mbstate_t mb_shiftstate;
/*
* Initialise multibyte state: called before a sequence of
* wcs_nicechar() or mb_metacharlenconv().
* wcs_nicechar(), mb_metacharlenconv(), or
* mb_charlenconv().
*/
/**/
mod_export void
mb_metacharinit(void)
mb_charinit(void)
{
memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
}
@ -500,7 +501,7 @@ mb_metacharinit(void)
* (but not both). (Note the complication that the wide character
* part may contain metafied characters.)
*
* The caller needs to call mb_metacharinit() before the first call, to
* The caller needs to call mb_charinit() before the first call, to
* set up the multibyte shift state for a range of characters.
*/
@ -3832,7 +3833,7 @@ itype_end(const char *ptr, int itype, int once)
#ifdef MULTIBYTE_SUPPORT
if (isset(MULTIBYTE) &&
(itype != IIDENT || !isset(POSIXIDENTIFIERS))) {
mb_metacharinit();
mb_charinit();
while (*ptr) {
wint_t wc;
int len = mb_metacharlenconv(ptr, &wc);
@ -4972,6 +4973,65 @@ mb_metastrlenend(char *ptr, int width, char *eptr)
return num + num_in_char;
}
/*
 * The equivalent of mb_metacharlenconv_r() for strings that
 * aren't metafied and hence have explicit lengths.
 *
 * At most slen bytes starting at s are passed to mbrtowc() one
 * at a time, with mbsp carrying the conversion state between calls.
 * As soon as mbrtowc() reports a complete character, that character
 * is stored through wcp (if wcp is non-NULL) and the number of bytes
 * consumed so far is returned.
 *
 * If a byte is rejected as invalid, or the bytes run out while the
 * character is still incomplete, WEOF is stored through wcp (if
 * non-NULL), *mbsp is zeroed, and the return value is 1 if at least
 * one byte was examined and 0 otherwise.
 */
/**/
mod_export int
mb_charlenconv_r(const char *s, int slen, wint_t *wcp, mbstate_t *mbsp)
{
    const char *cur = s;
    int remaining = slen;

    while (remaining) {
        wchar_t wide;
        char byte = *cur++;
        size_t status;

        remaining--;
        /* Feed exactly one byte; mbsp remembers any partial character */
        status = mbrtowc(&wide, &byte, 1, mbsp);
        if (status == MB_INVALID)
            break;
        if (status != MB_INCOMPLETE) {
            /* Character complete: report it and the bytes it used */
            if (wcp)
                *wcp = wide;
            return cur - s;
        }
    }

    /* No valid multibyte sequence: discard any partial state */
    if (wcp)
        *wcp = WEOF;
    memset(mbsp, 0, sizeof(*mbsp));

    /*
     * If anything was examined, treat it as a single byte character;
     * otherwise there was no input (probably shouldn't happen).
     */
    return (cur > s) ? 1 : 0;
}
/*
 * The equivalent of mb_metacharlenconv() for strings that
 * aren't metafied and hence have explicit lengths.
 *
 * s points to the bytes to examine and slen gives how many are
 * available.  If wcp is non-NULL the character found is stored
 * through it.  The return value is the number of bytes making up
 * the character, or 0 if slen is zero.
 *
 * If the MULTIBYTE option is unset every byte is a character in
 * its own right.  Otherwise the work is passed to
 * mb_charlenconv_r() using the shared shift state mb_shiftstate.
 */
/**/
mod_export int
mb_charlenconv(const char *s, int slen, wint_t *wcp)
{
    if (!isset(MULTIBYTE)) {
        if (!slen) {
            /*
             * Nothing to read: don't dereference s, and report no
             * character, as mb_charlenconv_r() and charlenconv() do
             * for an empty range.
             */
            if (wcp)
                *wcp = WEOF;
            return 0;
        }
        if (wcp)
            *wcp = (wint_t)*s;
        return 1;
    }

    return mb_charlenconv_r(s, slen, wcp, &mb_shiftstate);
}
/**/
#else
@ -4996,6 +5056,23 @@ metacharlenconv(const char *x, int *c)
return 1;
}
/*
 * Simple replacement for mb_charlenconv: every byte counts as
 * one character.
 *
 * Returns 1 if len is non-zero and 0 otherwise.  If c is non-NULL
 * it receives the byte at x, or '\0' when len is zero (in which
 * case x is not read).
 */
/**/
mod_export int
charlenconv(const char *x, int len, int *c)
{
    int have_char = (len != 0);

    if (c)
        *c = have_char ? (char)*x : '\0';
    return have_char;
}
/**/
#endif /* MULTIBYTE_SUPPORT */

View file

@ -2921,8 +2921,9 @@ enum {
#define AFTERTRAPHOOK (zshhooks + 2)
#ifdef MULTIBYTE_SUPPORT
/* Metafied input */
#define nicezputs(str, outs) (void)mb_niceformat((str), (outs), NULL, 0)
#define MB_METACHARINIT() mb_metacharinit()
#define MB_METACHARINIT() mb_charinit()
typedef wint_t convchar_t;
#define MB_METACHARLENCONV(str, cp) mb_metacharlenconv((str), (cp))
#define MB_METACHARLEN(str) mb_metacharlenconv(str, NULL)
@ -2932,6 +2933,11 @@ typedef wint_t convchar_t;
#define MB_METASTRLEN2END(str, widthp, eptr) \
mb_metastrlenend(str, widthp, eptr)
/* Unmetafied input */
#define MB_CHARINIT() mb_charinit()
#define MB_CHARLENCONV(str, len, cp) mb_charlenconv((str), (len), (cp))
#define MB_CHARLEN(str, len) mb_charlenconv((str), (len), NULL)
/*
* We replace broken implementations with one that uses Unicode
* characters directly as wide characters. In principle this is only
@ -3015,6 +3021,10 @@ typedef int convchar_t;
#define MB_METASTRLEN2(str, widthp) ztrlen(str)
#define MB_METASTRLEN2END(str, widthp, eptr) ztrlenend(str, eptr)
#define MB_CHARINIT()
#define MB_CHARLENCONV(str, len, cp) charlenconv((str), (len), (cp))
#define MB_CHARLEN(str, len) ((len) ? 1 : 0)
#define WCWIDTH_WINT(c) (1)
/* Leave character or string as is. */

View file

@ -484,3 +484,16 @@
# This doesn't look aligned in my editor because actually the characters
# aren't quite double width, but the arithmetic is correct.
# It appears just to be an effect of the font.
if zmodload -i zsh/regex 2>/dev/null; then
[[ $'\ua0' =~ '^.$' ]] && print OK
[[ $'\ua0' =~ $'^\ua0$' ]] && print OK
[[ $'\ua0'X =~ '^X$' ]] || print OK
else
print -u$ZTST_fd "Regexp test skipped, regexp library not found."
print -l OK OK OK
fi
0:Ensure no confusion on metafied input to regex module
>OK
>OK
>OK