36415: remap bytes from invalid multibyte characters.

These now go to 0xdc00 + index.  If wchar_t is a Unicode code point,
this is by construction an invalid character within the Unicode range.
If it isn't, we would hope the result was no worse than the current
fudge.
This commit is contained in:
Peter Stephenson 2015-09-04 10:07:51 +01:00
parent 32f5d3d8c1
commit f52795ea3e
3 changed files with 47 additions and 6 deletions

View file

@ -1,3 +1,11 @@
2015-09-04 Peter Stephenson <p.stephenson@samsung.com>
* 36415: Src/pattern.c, Test/D07multibyte.ztst: remap bytes from
invalid multibyte characters to 0xDC00 + index which is invalid
in Unicode. Strictly this only works if wchar_t is
ISO-10646-compliant, however it ought to be at least as good as
the current fudge in any case.
2015-09-03 Peter Stephenson <p.stephenson@samsung.com>
* 36416: Src/Zle/zle_refresh.c, Src/Zle/zle_utils.c: If

View file

@ -224,6 +224,22 @@ typedef zlong zrange_t;
typedef unsigned long zrange_t;
#endif
#ifdef MULTIBYTE_SUPPORT
/*
* WCHAR_INVALID(ch): map a byte that's not part of a valid multibyte
* character to a wide character value of 0xDC00 plus STOUC(ch).
*
* STOUC() is defined elsewhere; presumably it yields the byte as an
* unsigned value, which would make the result 0xDC00..0xDCFF -- confirm
* against its definition.
*
* This range in Unicode (the low surrogates, as far as the Unicode
* standard goes) is recommended for purposes of this kind as it
* corresponds to invalid characters, so the result should not compare
* equal to any genuinely converted character.
*
* Note that this strictly only works if wchar_t represents
* Unicode code points, which isn't necessarily true; however,
* converting an invalid character into an unknown format is
* a bit tricky...
*
* ch appears once in the expansion.  Callers pass expressions with
* side effects, such as a post-incremented pointer dereference, so
* this relies on STOUC() also evaluating its argument only once.
*/
#define WCHAR_INVALID(ch) \
((wchar_t) (0xDC00 + STOUC(ch)))
#endif /* MULTIBYTE_SUPPORT */
/*
* Array of characters corresponding to zpc_chars enum, which it must match.
*/
@ -353,10 +369,10 @@ metacharinc(char **x)
return wc;
}
/* Error. Treat as single byte. */
/* Error. */
/* Reset the shift state for next time. */
memset(&shiftstate, 0, sizeof(shiftstate));
return (wchar_t) STOUC(*(*x)++);
return WCHAR_INVALID(*(*x)++);
}
#else
@ -1867,10 +1883,10 @@ charref(char *x, char *y)
ret = mbrtowc(&wc, x, y-x, &shiftstate);
if (ret == MB_INVALID || ret == MB_INCOMPLETE) {
/* Error. Treat as single byte. */
/* Error. */
/* Reset the shift state for next time. */
memset(&shiftstate, 0, sizeof(shiftstate));
return (wchar_t) STOUC(*x);
return WCHAR_INVALID(*x);
}
return wc;
@ -1913,7 +1929,7 @@ charrefinc(char **x, char *y, int *z)
size_t ret;
if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(**x) & 0x80))
return (wchar_t) STOUC(*(*x)++);
return WCHAR_INVALID(*(*x)++);
ret = mbrtowc(&wc, *x, y-*x, &shiftstate);
@ -1922,7 +1938,7 @@ charrefinc(char **x, char *y, int *z)
*z = 1;
/* Reset the shift state for next time. */
memset(&shiftstate, 0, sizeof(shiftstate));
return (wchar_t) STOUC(*(*x)++);
return WCHAR_INVALID(*(*x)++);
}
/* Nulls here are normal characters */

View file

@ -508,3 +508,20 @@
cd ..
}
0:cd with special characters
# Each entry is a [[ ... ]] condition that is expected to succeed.
# The \x.. and \u.. sequences are kept as literal escapes here and are
# only expanded in the loop below, just before the condition is run.
# Presumably \xcc / \xcd become single raw bytes that do not form a valid
# character in the test locale, and \ucc becomes the character U+00CC --
# confirm against the zsh escape-sequence documentation.
test_array=(
# A stray byte matches itself and differs from another stray byte.
'[[ \xcc = \xcc ]]'
'[[ \xcc != \xcd ]]'
# A stray byte must not match the real character with the same number.
'[[ \xcc != \ucc ]]'
# A real character matches itself, literally and in a bracket expression.
'[[ \ucc = \ucc ]]'
'[[ \ucc = [\ucc] ]]'
# A stray byte must not match a bracket expression holding the real character.
'[[ \xcc != [\ucc] ]]'
# Not clear how useful the following is...
'[[ \xcc = [\xcc] ]]'
)
# Run every condition; report any that fails on stderr, naming the
# unexpanded test text so the failing entry is easy to identify.
for test in $test_array; do
# The (g::) flag processes the escape sequences before eval sees the text.
if ! eval ${(g::)test} ; then
print -rl "Test $test failed" >&2
fi
done
0:Invalid characters in pattern matching