23119: lower case in sorting properly

This commit is contained in:
Peter Stephenson 2007-01-22 14:35:12 +00:00
parent c53aa4adee
commit e375d5ee88
4 changed files with 67 additions and 4 deletions

View file

@ -1,3 +1,8 @@
2007-01-22 Peter Stephenson <pws@csr.com>
* 23119: Src/sort.c, Test/B03print.ztst, Test/D07multibyte.ztst:
do lowering of multibyte character case in sorting properly.
2007-01-21 Peter Stephenson <p.w.stephenson@ntlworld.com>
* 23118: Doc/Zsh/expn.yo, Src/builtin.c, Src/glob.c, Src/jobs.c,

View file

@ -248,7 +248,8 @@ strmetasort(char **array, int sortwhat, int *unmetalenp)
|| *metaptr == Meta) {
char *s, *t, *src = *arrptr, *dst;
int len;
sortarrptr->cmp = dst = (char *)zhalloc(strlen(src) + 1);
sortarrptr->cmp = dst =
(char *)zhalloc(((sortwhat & SORTIT_IGNORING_CASE)?2:1)*strlen(src)+1);
if (unmetalenp) {
/* Already unmetafied and we have the length. */
@ -283,8 +284,49 @@ strmetasort(char **array, int sortwhat, int *unmetalenp)
len = metaptr - src;
}
if (sortwhat & SORTIT_IGNORING_CASE) {
for (s = src, t = dst; s - src != len; )
*t++ = tulower(*s++);
char *send = src + len;
#ifdef MULTIBYTE_SUPPORT
if (isset(MULTIBYTE)) {
/*
* Lower the case the hard way. Convert to a wide
* character, process that, and convert back. We
* don't assume the characters have the same
* multibyte length. We can't use casemodify()
* because we have unmetafied data, which may have
* been passed down to us.
*/
mbstate_t mbsin, mbsout;
int clen;
wchar_t wc;
memset(&mbsin, 0, sizeof(mbstate_t));
memset(&mbsout, 0, sizeof(mbstate_t));
for (s = src, t = dst; s < send; ) {
clen = mbrtowc(&wc, s, send-s, &mbsin);
if (clen < 0) {
/* invalid or unfinished: treat as single bytes */
while (s < send)
*t++ = tulower(*s++);
break;
}
if (clen == 0) {
/* embedded null */
*t++ = '\0';
s++;
continue;
}
s += clen;
wc = towlower(wc);
clen = wcrtomb(t, wc, &mbsout);
t += clen;
DPUTS(clen < 0, "Bad conversion when lowering case");
}
*t = '\0';
len = t - dst;
} else
#endif
for (s = src, t = dst; s < send; )
*t++ = tulower(*s++);
src = dst;
}
if (sortwhat & SORTIT_IGNORING_BACKSLASHES) {

View file

@ -34,7 +34,12 @@
>baz
>bar
print -io a B c
# some locales force case-insensitive sorting
(LC_ALL=C; print -o a B c)
0:case-sensitive argument sorting
>B a c
(LC_ALL=C; print -io a B c)
0:case-insensitive argument sorting
>a B c

View file

@ -2,6 +2,8 @@
# Find a UTF-8 locale.
setopt multibyte
# Don't let LC_* override our choice of locale.
unset -m LC_\*
mb_ok=
langs=(en_US.UTF-8 en_GB.UTF-8 en.UTF-8
$(locale -a 2>/dev/null | sed -e 's/utf8/UTF-8/' | grep UTF-8))
@ -315,3 +317,12 @@
printf "%4.3s\n" főobar
0:Multibyte characters in printf widths
> főo
# We ask for case-insensitive sorting here (and supply upper case
# characters) so that we exercise the logic in the shell that lowers the
# case of the string for case-insensitive sorting.
print -oi HAH HUH HEH HÉH HÈH
(LC_ALL=C; print -oi HAH HUH HEH HÉH HÈH)
0:Multibyte characters in print sorting
>HAH HEH HÉH HÈH HUH
>HAH HEH HUH HÈH HÉH