22557: turn on multibyte option by default

This commit is contained in:
Peter Stephenson 2006-07-25 18:10:37 +00:00
parent b9bf52d1fc
commit 09bc7ee2b5
7 changed files with 79 additions and 16 deletions

View file

@ -1,5 +1,10 @@
2006-07-25 Peter Stephenson <pws@csr.com>
* 22557: Doc/Zsh/options.yo, Misc/globtests, Src/options.c,
Src/pattern.c, Test/D02glob.ztst, Test/D07multibyte.ztst:
Turn on multibyte option by default for MULTIBYTE_SUPPORT and fix
tests and patterns.
* unposted: Src/pattern.c, Src/utils.c: minor typos in
22556 found when MULTIBYTE_SUPPORT is not defined.

View file

@ -411,19 +411,31 @@ item(tt(MARK_DIRS) (tt(-8), ksh: tt(-X)))(
Append a trailing `tt(/)' to all directory
names resulting from filename generation (globbing).
)
pindex(MULTIBYTE)
pindex(MULTIBYTE <D>)
cindex(characters, multibyte, in expansion and globbing)
cindex(multibyte characters, in expansion and globbing)
item(tt(MULTIBYTE))(
Respect multibyte characters when found during pattern matching.
When this option is set, characters strings are examined using the
Respect multibyte characters when found in strings.
When this option is set, strings are examined using the
system library to determine how many bytes form a character, depending
on the current locale. If the option is unset
(or the shell was not compiled with the configuration option
tt(MULTIBYTE_SUPPORT)) a single byte is always treated as a single
character. The option will eventually be extended to cover expansion.
Note, however, that it does not affect the shellʼs editor, which always
uses the locale to determine multibyte characters.
on the current locale. This affects the way characters are counted in
pattern matching, parameter values and various delimiters.
The option is on by default if the shell was compiled with
tt(MULTIBYTE_SUPPORT); otherwise it is off by default and has no effect if
turned on.
If the option is off, a single byte is always treated as a single
character. This setting is designed purely for examining strings
known to contain raw bytes or other values that may not be characters
in the current locale. It is not necessary to unset the option merely
because the character set for the current locale does not contain multibyte
characters.
The option does not affect the shell's editor, which always uses the
locale to determine multibyte characters. This is because
the character set displayed by the terminal emulator is independent of
shell settings.
)
pindex(NOMATCH)
cindex(globbing, no matches)

View file

@ -182,6 +182,5 @@ f atest/path *((#s)|/)test((#e)|/)*
f path/testy *((#s)|/)test((#e)|/)*
f path/testy/ohyes *((#s)|/)test((#e)|/)*
f path/atest/ohyes *((#s)|/)test((#e)|/)*
t björn *[åäöÅÄÖ]*
EOT
print "$failed tests failed."

View file

@ -166,7 +166,13 @@ static struct optname optns[] = {
{{NULL, "markdirs", 0}, MARKDIRS},
{{NULL, "menucomplete", 0}, MENUCOMPLETE},
{{NULL, "monitor", OPT_SPECIAL}, MONITOR},
{{NULL, "multibyte", 0/*TBD*/}, MULTIBYTE},
{{NULL, "multibyte",
#ifdef MULTIBYTE_SUPPORT
OPT_ALL
#else
0
#endif
}, MULTIBYTE},
{{NULL, "multios", OPT_EMULATE|OPT_ZSH}, MULTIOS},
{{NULL, "nomatch", OPT_EMULATE|OPT_NONBOURNE},NOMATCH},
{{NULL, "notify", OPT_ZSH}, NOTIFY},

View file

@ -343,7 +343,7 @@ metacharinc(char **x)
/* Error. Treat as single byte. */
/* Reset the shift state for next time. */
memset(&shiftstate, 0, sizeof(shiftstate));
return (wchar_t) *(*x)++;
return (wchar_t) STOUC(*(*x)++);
}
#else
@ -595,7 +595,7 @@ patcompile(char *exp, int inflags, char **endexp)
while (oplen--) {
if (imeta(*opnd)) {
*dst++ = Meta;
*dst++ = *opnd ^ 32;
*dst++ = *opnd++ ^ 32;
} else {
*dst++ = *opnd++;
}

View file

@ -6,7 +6,9 @@
mkdir glob.tmp/dir3/subdir
: >glob.tmp/{,{dir1,dir2}/}{a,b,c}
globtest () { $ZTST_testdir/../Src/zsh -f $ZTST_srcdir/../Misc/$1 }
globtest () {
$ZTST_testdir/../Src/zsh -f $ZTST_srcdir/../Misc/$1
}
regress_absolute_path_and_core_dump() {
local absolute_dir=$(cd glob.tmp && pwd -P)
@ -175,7 +177,6 @@
>1: [[ path/testy = *((#s)|/)test((#e)|/)* ]]
>1: [[ path/testy/ohyes = *((#s)|/)test((#e)|/)* ]]
>1: [[ path/atest/ohyes = *((#s)|/)test((#e)|/)* ]]
>0: [[ björn = *[åäöÅÄÖ]* ]]
>0 tests failed.
globtest globtests.ksh
@ -263,6 +264,10 @@
>0: [[ Modules = (#i)*m* ]]
>0 tests failed.
(unsetopt multibyte
[[ björn = *[åäöÅÄÖ]* ]])
0:single byte match with top bit set
( regress_absolute_path_and_core_dump )
0:exclusions regression test
>

View file

@ -176,7 +176,7 @@
?(eval):1: command not found: hähä=3
foo="Ølaf«Ødd«øpénëd«ån«àpple"
print -l ${(s.«.)foo}
print -l ${(s.«.)foo}
ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος."
print -l ${=ioh}
print ${(w)#ioh}
@ -228,3 +228,39 @@
0:read multibyte characters
<«»ignored
>«»
# See if the system groks first-century Greek...
ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος."
for (( i = 1; i <= ${#ioh}; i++ )); do
# FC3 doesn't recognise ῇ (U+1FC7: Greek small letter eta with
# perispomeni and ypogegrammeni, of course) as a lower case character.
if [[ $ioh[i] != [[:lower:]] && $i != 7 ]]; then
for tp in upper space punct invalid; do
if [[ $tp = invalid || $ioh[i] = [[:${tp}:]] ]]; then
print "$i: $tp"
break
fi
done
fi
done
0:isw* functions on non-ASCII wide characters
>1: upper
>3: space
>8: space
>11: space
>13: space
>19: punct
>20: space
>24: space
>26: space
>32: space
>35: space
>40: space
>44: space
>49: punct
>50: space
>54: space
>59: space
>62: space
>64: space
>70: punct