mirror of
https://github.com/zsh-users/zsh
synced 2024-07-21 10:14:19 +00:00
f52795ea3e
These now go to 0xdc00 + index. If wchar_t is a Unicode code point, this is by construction an invalid character within the Unicode range. If it isn't, we would hope the result was no worse than the current fudge.
528 lines
13 KiB
Plaintext
528 lines
13 KiB
Plaintext
%prep
|
||
|
||
# Find a UTF-8 locale.
|
||
setopt multibyte
|
||
# Don't let LC_* override our choice of locale.
|
||
unset -m LC_\*
|
||
mb_ok=
|
||
# Candidate locales: the usual English UTF-8 spellings first, followed by
# every UTF-8 locale that `locale -a' reports.  `grep -E' is used instead
# of `egrep', which is obsolescent and makes recent GNU grep print a
# warning on stderr.
langs=(en_{US,GB}.{UTF-,utf}8 en.UTF-8
|
||
$(locale -a 2>/dev/null | grep -E 'utf8|UTF-8'))
|
||
# Try each candidate as LANG.  The locale is usable as soon as é is
# matched by the single-character pattern `?'.
for LANG in $langs; do
|
||
if [[ é = ? ]]; then
|
||
mb_ok=1
|
||
break;
|
||
fi
|
||
done
|
||
# No working locale: set ZTST_unimplemented with the reason.  Otherwise
# report the chosen locale on $ZTST_fd and move into a scratch directory.
if [[ -z $mb_ok ]]; then
|
||
ZTST_unimplemented="no UTF-8 locale or multibyte mode is not implemented"
|
||
else
|
||
print -u $ZTST_fd Testing multibyte with locale $LANG
|
||
mkdir multibyte.tmp && cd multibyte.tmp
|
||
fi
|
||
|
||
%test
|
||
|
||
# For every index i of a nine-character string print the character a[i],
# then for each j >= i the forward slice a[i,j] next to the slice
# a[-j,-i] counted from the end.  The expected output treats each é as a
# single element.  Later chunks in this file reuse $a.
a=ténébreux
|
||
for i in {1..9}; do
|
||
print ${a[i]}
|
||
for j in {$i..9}; do
|
||
print $i $j ${a[i,j]} ${a[-j,-i]}
|
||
done
|
||
done
|
||
0:Basic indexing with multibyte characters
|
||
>t
|
||
>1 1 t x
|
||
>1 2 té ux
|
||
>1 3 tén eux
|
||
>1 4 téné reux
|
||
>1 5 ténéb breux
|
||
>1 6 ténébr ébreux
|
||
>1 7 ténébre nébreux
|
||
>1 8 ténébreu énébreux
|
||
>1 9 ténébreux ténébreux
|
||
>é
|
||
>2 2 é u
|
||
>2 3 én eu
|
||
>2 4 éné reu
|
||
>2 5 énéb breu
|
||
>2 6 énébr ébreu
|
||
>2 7 énébre nébreu
|
||
>2 8 énébreu énébreu
|
||
>2 9 énébreux ténébreu
|
||
>n
|
||
>3 3 n e
|
||
>3 4 né re
|
||
>3 5 néb bre
|
||
>3 6 nébr ébre
|
||
>3 7 nébre nébre
|
||
>3 8 nébreu énébre
|
||
>3 9 nébreux ténébre
|
||
>é
|
||
>4 4 é r
|
||
>4 5 éb br
|
||
>4 6 ébr ébr
|
||
>4 7 ébre nébr
|
||
>4 8 ébreu énébr
|
||
>4 9 ébreux ténébr
|
||
>b
|
||
>5 5 b b
|
||
>5 6 br éb
|
||
>5 7 bre néb
|
||
>5 8 breu énéb
|
||
>5 9 breux ténéb
|
||
>r
|
||
>6 6 r é
|
||
>6 7 re né
|
||
>6 8 reu éné
|
||
>6 9 reux téné
|
||
>e
|
||
>7 7 e n
|
||
>7 8 eu én
|
||
>7 9 eux tén
|
||
>u
|
||
>8 8 u é
|
||
>8 9 ux té
|
||
>x
|
||
>9 9 x t
|
||
|
||
# A one-character string: subscripts -1 and 1 are both expected to give
# é, while -2, 0 and 2 are expected to expand to nothing.
s=é
|
||
print A${s[-2]}A B${s[-1]}B C${s[0]}C D${s[1]}D E${s[2]}E
|
||
0:Out of range subscripts with multibyte characters
|
||
>AA BéB CC DéD EE
|
||
|
||
# Uses $a (ténébreux) from the basic indexing chunk.  (i) is expected to
# give the index of the first é (2) and (I) that of the last (4); both
# are character positions, so feeding them back as a range gives éné.
print ${a[(i)é]} ${a[(I)é]} ${a[${a[(i)é]},${a[(I)é]}]}
|
||
0:Reverse indexing with multibyte characters
|
||
>2 4 éné
|
||
|
||
# (r) takes a pattern instead of a number.  The expected slice starts
# where én matches in $a and ends with the text matched by éb.
print ${a[(r)én,(r)éb]}
|
||
0:Subscript searching with multibyte characters
|
||
>énéb
|
||
|
||
# b:N: starts the (r) search at character N of $a.  Expected: offsets 1
# and 2 find the é at position 2, offsets 3 and 4 find the one at
# position 4, and from offset 5 nothing matches (empty line).
print ${a[(rb:1:)é,-1]}
|
||
print ${a[(rb:2:)é,-1]}
|
||
print ${a[(rb:3:)é,-1]}
|
||
print ${a[(rb:4:)é,-1]}
|
||
print ${a[(rb:5:)é,-1]}
|
||
0:Subscript searching with initial offset
|
||
>énébreux
|
||
>énébreux
|
||
>ébreux
|
||
>ébreux
|
||
>
|
||
|
||
# n:N: selects the Nth match of the (r) search.  $a contains two é, so
# the third request is expected to produce an empty line.
print ${a[(rn:1:)é,-1]}
|
||
print ${a[(rn:2:)é,-1]}
|
||
print ${a[(rn:3:)é,-1]}
|
||
0:Subscript searching with count
|
||
>énébreux
|
||
>ébreux
|
||
>
|
||
|
||
# Same range as the forward (r) search but using (R).  én and éb each
# occur only once in $a, so the expected result is again énéb.
print ${a[(R)én,(R)éb]}
|
||
0:Backward subscript searching with multibyte characters
|
||
>énéb
|
||
|
||
# Starting offsets with (R) seem to be so strange as to be hardly
|
||
# worth testing.
|
||
|
||
# (#b) enables backreferences, so the two parenthesised groups fill
# $match, $mbegin and $mend.  The expected offsets (2 3 and 4 5) are
# character positions: using them as a range on $a must give back the
# matched text.
setopt extendedglob
|
||
[[ $a = (#b)t(én)(éb)reux ]] || print "Failed to match." >&2
|
||
for i in {1..${#match}}; do
|
||
print $match[i] $mbegin[i] $mend[i] ${a[$mbegin[i],$mend[i]]}
|
||
done
|
||
0:Multibyte offsets in pattern tests
|
||
>én 2 3 én
|
||
>éb 4 5 éb
|
||
|
||
# (U) upper-cases, (L) lower-cases and (C) capitalises words.  Accented
# letters must change case as well, and the expected (C) output starts a
# new word after an apostrophe (L'Inconsolé, C'Est).
b=${(U)a}
|
||
print $b
|
||
print ${(L)b}
|
||
desdichado="Je suis le $a, le veuf, l'inconsolé"
|
||
print ${(C)desdichado}
|
||
lxiv="l'état c'est moi"
|
||
print ${(C)lxiv}
|
||
0:Case modification of multibyte strings
|
||
>TÉNÉBREUX
|
||
>ténébreux
|
||
>Je Suis Le Ténébreux, Le Veuf, L'Inconsolé
|
||
>L'État C'Est Moi
|
||
|
||
# The same case flags applied to each element of an array.  (C) is
# expected to give identical output for the original array and for the
# upper-cased copy.
array=(ølaf ødd øpened án encyclopædia)
|
||
barray=(${(U)array})
|
||
print $barray
|
||
print ${(L)barray}
|
||
print ${(C)array}
|
||
print ${(C)barray}
|
||
0:Case modification of arrays with multibyte strings
|
||
>ØLAF ØDD ØPENED ÁN ENCYCLOPÆDIA
|
||
>ølaf ødd øpened án encyclopædia
|
||
>Ølaf Ødd Øpened Án Encyclopædia
|
||
>Ølaf Ødd Øpened Án Encyclopædia
|
||
|
||
# In arithmetic, ##c gives the value of the literal character c and
# #name the value of the first character of $name.  The expected values
# (165, 163, 945) are those of the whole characters, not of their
# individual bytes.
print $(( ##¥ ))
|
||
pound=£
|
||
print $(( #pound ))
|
||
alpha=α
|
||
print $(( ##α )) $(( #alpha ))
|
||
0:Conversion to Unicode in mathematical expressions
|
||
>165
|
||
>163
|
||
>945 945
|
||
|
||
# With POSIX_IDENTIFIERS unset, hähä=3 is an assignment and 3 is
# printed.  With the option set the same text is expected to be run as a
# command, fail with "command not found" and fall through to `exit 1';
# the subshell confines that exit and supplies the chunk's status of 1.
unsetopt posix_identifiers
|
||
expr='hähä=3 || exit 1; print $hähä'
|
||
eval $expr
|
||
setopt posix_identifiers
|
||
(eval $expr)
|
||
1:POSIX_IDENTIFIERS option
|
||
>3
|
||
?(eval):1: command not found: hähä=3
|
||
|
||
# (s.«.) splits on the multibyte separator «.  ${=ioh} applies word
# splitting to the Greek sentence and ${(w)#ioh} counts its words (17
# expected).
foo="Ølaf«Ødd«øpénëd«ån«àpple"
|
||
print -l ${(s.«.)foo}
|
||
ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος."
|
||
print -l ${=ioh}
|
||
print ${(w)#ioh}
|
||
0:Splitting with multibyte characters
|
||
>Ølaf
|
||
>Ødd
|
||
>øpénëd
|
||
>ån
|
||
>àpple
|
||
>Ἐν
|
||
>ἀρχῇ
|
||
>ἦν
|
||
>ὁ
|
||
>λόγος,
|
||
>καὶ
|
||
>ὁ
|
||
>λόγος
|
||
>ἦν
|
||
>πρὸς
|
||
>τὸν
|
||
>θεόν,
|
||
>καὶ
|
||
>θεὸς
|
||
>ἦν
|
||
>ὁ
|
||
>λόγος.
|
||
>17
|
||
|
||
# -d makes £ the input terminator.  Each read is expected to take the
# text up to the next £ from the chunk's standard input.
read -d £ one
|
||
read -d £ two
|
||
print $one
|
||
print $two
|
||
0:read with multibyte delimiter
|
||
<first£second£
|
||
>first
|
||
>second
|
||
|
||
# IFS is changed inside a subshell so the setting does not outlive this
# chunk.  read -A stores the fields separated by « in $array, and -d »
# ends the input at », so the trailing "ignored" must not appear.
(IFS=«
|
||
read -d » -A array
|
||
print -l $array)
|
||
0:read -A with multibyte IFS
|
||
<dominus«illuminatio«mea»ignored
|
||
>dominus
|
||
>illuminatio
|
||
>mea
|
||
|
||
# -k2 on file descriptor 0 is expected to return two whole characters
# (both guillemets), not two bytes, leaving the rest of the input unread.
read -k2 -u0 twochars
|
||
print $twochars
|
||
0:read multibyte characters
|
||
<«»ignored
|
||
>«»
|
||
|
||
# read -q treats its single input character as a yes/no answer.  The
# multibyte « is not a yes, so the status printed is expected to be 1.
read -q -u0 mb
|
||
print $?
|
||
0:multibyte character makes read -q return false
|
||
<«
|
||
>1
|
||
|
||
# See if the system grokks first-century Greek...
# Every character of the sentence that is not [[:lower:]] is classified
# as upper, space or punct; "invalid" is the fallback when none of those
# classes matches.  Position 7 (ῇ) is excluded explicitly.
|
||
ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος."
|
||
for (( i = 1; i <= ${#ioh}; i++ )); do
|
||
# FC3 doesn't recognise ῇ (U+1FC7: Greek small letter eta with
|
||
# perispomeni and ypogegrammeni, of course) as a lower case character.
|
||
if [[ $ioh[i] != [[:lower:]] && $i != 7 ]]; then
|
||
for tp in upper space punct invalid; do
|
||
if [[ $tp = invalid || $ioh[i] = [[:${tp}:]] ]]; then
|
||
print "$i: $tp"
|
||
break
|
||
fi
|
||
done
|
||
fi
|
||
done
|
||
0:isw* functions on non-ASCII wide characters
|
||
>1: upper
|
||
>3: space
|
||
>8: space
|
||
>11: space
|
||
>13: space
|
||
>19: punct
|
||
>20: space
|
||
>24: space
|
||
>26: space
|
||
>32: space
|
||
>35: space
|
||
>40: space
|
||
>44: space
|
||
>49: punct
|
||
>50: space
|
||
>54: space
|
||
>59: space
|
||
>62: space
|
||
>64: space
|
||
>70: punct
|
||
|
||
# # and % remove the shortest match from the start/end, ## and %% the
# longest.  The pattern [[:alpha:]]## means one or more alphabetic
# characters (extendedglob was set by an earlier chunk).  With (S) the
# pattern may match anywhere in the string rather than only at an end.
ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος"
|
||
print ${ioh#[[:alpha:]]##}
|
||
print ${ioh##[[:alpha:]]##}
|
||
print ${ioh%[[:alpha:]]##}
|
||
print ${ioh%%[[:alpha:]]##}
|
||
print ${(S)ioh#λ*ς}
|
||
print ${(S)ioh##λ*ς}
|
||
print ${(S)ioh%θ*ς}
|
||
print ${(S)ioh%%θ*ς}
|
||
0:Parameter #, ##, %, %% with multibyte characters
|
||
>ν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
|
||
> ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
|
||
>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγο
|
||
>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ
|
||
>Ἐν ἀρχῇ ἦν ὁ , καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
|
||
>Ἐν ἀρχῇ ἦν ὁ
|
||
>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ ἦν ὁ λόγος
|
||
>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ
|
||
|
||
# Left (l) and right (r) padding flags used together, with multibyte
# fill characters and, in the last two cases, multibyte one-off strings.
# Every expected field is exactly l+r characters wide, counted in
# characters rather than bytes; longer words are truncated.
foo=(κατέβην χθὲς εἰς Πειραιᾶ)
|
||
print ${(l.3..¥.r.3..£.)foo}
|
||
print ${(l.4..¥.r.2..£.)foo}
|
||
print ${(l.5..¥.r.1..£.)foo}
|
||
print ${(l.4..¥..«.r.4..£..».)foo}
|
||
print ${(l.4..¥..Σωκράτης.r.4..£..Γλαύκωνος.)foo}
|
||
0:simultaneous left and right padding
|
||
>κατέβη ¥χθὲς£ ¥¥εἰς£ Πειραι
|
||
>¥κατέβ ¥¥χθὲς ¥¥¥εἰς ¥Πειρα
|
||
>¥¥κατέ ¥¥¥χθὲ ¥¥¥¥εἰ ¥¥Πειρ
|
||
>«κατέβην ¥«χθὲς»£ ¥¥«εἰς»£ «Πειραιᾶ
|
||
>ςκατέβην ηςχθὲςΓλ τηςεἰςΓλ ςΠειραιᾶ
|
||
# er... yeah, that looks right...
|
||
|
||
# Two chained :s substitutions whose delimiters are the multibyte
# characters £ and ¥: bar -> rod, then rod -> stick.
foo=picobarn
|
||
print ${foo:s£bar£rod£:s¥rod¥stick¥}
|
||
0:Delimiters in modifiers
|
||
>picostickn
|
||
|
||
# TODO: if we get paired multibyte bracket delimiters to work
|
||
# (as Emacs does, the smug so-and-so), the following should change.
|
||
# Padding flags whose arguments are delimited by £ and «.  Expected:
# right-pad to 5 with X, and left-pad to 10 with Y with HI placed once
# directly before the value.
foo=bar
|
||
print ${(r£5££X£)foo}
|
||
print ${(l«10««Y««HI«)foo}
|
||
0:Delimiters in parameter flags
|
||
>barXX
|
||
>YYYYYHIbar
|
||
|
||
# Precision 3 must keep three characters (főo) and width 4 must add a
# single leading space: both are counted in characters, not bytes.
printf "%4.3s\n" főobar
|
||
0:Multibyte characters in printf widths
|
||
> főo
|
||
|
||
# We ask for case-insensitive sorting here (and supply upper case
|
||
# characters) so that we exercise the logic in the shell that lowers the
|
||
# case of the string for case-insensitive sorting.
|
||
# First sort runs in the UTF-8 locale chosen in %prep.  The second runs
# in a subshell with LC_ALL=C, where the accented words are expected to
# sort after the plain ASCII ones.
print -oi HÛH HÔH HÎH HÊH HÂH
|
||
(LC_ALL=C; print -oi HAH HUH HEH HÉH HÈH)
|
||
0:Multibyte characters in print sorting
|
||
>HÂH HÊH HÎH HÔH HÛH
|
||
>HAH HEH HUH HÈH HÉH
|
||
|
||
# These are control characters in Unicode, so don't show up.
|
||
# We just want to check they're not being treated as tokens.
|
||
# ${(#)x} outputs the character whose number is $x.  Each line read back
# must be one character long, and #line must give the original number.
for x in {128..150}; do
|
||
print ${(#)x}
|
||
done | while read line; do
|
||
print ${#line} $(( #line ))
|
||
done
|
||
0:evaluated character number with multibyte characters
|
||
>1 128
|
||
>1 129
|
||
>1 130
|
||
>1 131
|
||
>1 132
|
||
>1 133
|
||
>1 134
|
||
>1 135
|
||
>1 136
|
||
>1 137
|
||
>1 138
|
||
>1 139
|
||
>1 140
|
||
>1 141
|
||
>1 142
|
||
>1 143
|
||
>1 144
|
||
>1 145
|
||
>1 146
|
||
>1 147
|
||
>1 148
|
||
>1 149
|
||
>1 150
|
||
|
||
# With numericglobsort the digit runs in the names are expected to sort
# by numeric value: 1, 2, 10, 20, 100, 200.
touch ngs1txt ngs2txt ngs10txt ngs20txt ngs100txt ngs200txt
|
||
setopt numericglobsort
|
||
print -l ngs*
|
||
0:NUMERIC_GLOB_SORT option in UTF-8 locale
|
||
>ngs1txt
|
||
>ngs2txt
|
||
>ngs10txt
|
||
>ngs20txt
|
||
>ngs100txt
|
||
>ngs200txt
|
||
|
||
# Not strictly multibyte, but gives us a well-defined locale for testing.
|
||
# (q) quoting of a string containing a lone \xc0 byte, BEL (\x07) and
# DEL (\x7f): each is expected to come out as its own $'...' segment
# between the unchanged letters.
foo=$'X\xc0Y\x07Z\x7fT'
|
||
print -r ${(q)foo}
|
||
0:Backslash-quoting of unprintable/invalid characters uses $'...'
|
||
>X$'\300'Y$'\a'Z$'\177'T
|
||
|
||
# This also isn't strictly multibyte and is here to reduce the
|
||
# likelihood of a "cannot do character set conversion" error.
|
||
# Step 1: check that $'\u00e9' converts to é in the current locale.  If
# it does not, print a warning to $ZTST_fd and emit the four expected OK
# lines unconditionally.
# Step 2 (conversion works): print the same escape four times from a
# subshell with LC_ALL=C.  Each resulting line must be either a
# "character not in range" error or a literal ?.
# The final `true' makes the chunk's status 0 in either case.
(print $'\u00e9') 2>&1 | read
|
||
if [[ $REPLY != é ]]; then
|
||
print "warning: your system can't do simple Unicode conversion." >&$ZTST_fd
|
||
print "Check you have a correctly installed iconv library." >&$ZTST_fd
|
||
# cheat
|
||
repeat 4 print OK
|
||
else
|
||
testfn() { (LC_ALL=C; print $'\u00e9') }
|
||
repeat 4 testfn 2>&1 | while read line; do
|
||
if [[ $line = *"character not in range"* ]]; then
|
||
print OK
|
||
elif [[ $line = "?" ]]; then
|
||
print OK
|
||
else
|
||
print Failed: no error message and no question mark
|
||
fi
|
||
done
|
||
fi
|
||
true
|
||
0:error handling in Unicode quoting
|
||
>OK
|
||
>OK
|
||
>OK
|
||
>OK
|
||
|
||
# $~tmp1 makes the parameter's value act as a pattern.  The
# backslash-quoted parentheses must match literally while the trailing *
# stays a wildcard.
# NOTE(review): Ą and Ā are presumably chosen because their encodings
# are stored metafied inside the shell, per the description; confirm.
tmp1='glob/\(\)Ą/*'
|
||
[[ glob/'()Ą'/foo == $~tmp1 ]] && print "Matched against $tmp1"
|
||
tmp1='glob/\(\)Ā/*'
|
||
[[ glob/'()Ā'/bar == $~tmp1 ]] && print "Matched against $tmp1"
|
||
0:Backslashes and metafied characters in patterns
|
||
>Matched against glob/()Ą/*
|
||
>Matched against glob/()Ā/*
|
||
|
||
# (%) applies prompt expansion to the string %~ (the current directory)
# and :t keeps its last path component.  After cd into each new
# directory the result must be that directory's name unchanged.
mkdir 梶浦由記 'Пётр Ильич Чайковский'
|
||
(cd 梶浦由記; print ${${(%):-%~}:t})
|
||
(cd 'Пётр Ильич Чайковский'; print ${${(%):-%~}:t})
|
||
0:Metafied characters in prompt expansion
|
||
>梶浦由記
|
||
>Пётр Ильич Чайковский
|
||
|
||
# (%) and (%%) applied to values that contain no prompt escapes: the
# strings and their lengths are expected to come through unchanged, for
# a scalar and for an array.  The subshell keeps nonomatch local.
(
|
||
setopt nonomatch
|
||
tmp1=Ą
|
||
tmpA=(Ą 'Пётр Ильич Чайковский' 梶浦由記)
|
||
print ${tmp1} ${(%)tmp1} ${(%%)tmp1}
|
||
print ${#tmp1} ${#${(%)tmp1}} ${#${(%%)tmp1}}
|
||
print ${tmpA}
|
||
print ${(%)tmpA}
|
||
print ${(%%)tmpA}
|
||
)
|
||
0:More metafied characters in prompt expansion
|
||
>Ą Ą Ą
|
||
>1 1 1
|
||
>Ą Пётр Ильич Чайковский 梶浦由記
|
||
>Ą Пётр Ильич Чайковский 梶浦由記
|
||
>Ą Пётр Ильич Чайковский 梶浦由記
|
||
|
||
# A lone \xc5 byte is piped to read; #REPLY must still give 0xC5, i.e.
# the byte is passed through rather than dropped or replaced.
# NOTE(review): cbases presumably selects the 0x output prefix for
# [#16]; confirm against the option documentation.
setopt cbases
|
||
print $'\xc5' | read
|
||
print $(( [#16] #REPLY ))
|
||
0:read passes through invalid multibyte characters
|
||
>0xC5
|
||
|
||
# Assign to single elements of a string that ends in a multibyte
# character.  Negative indices count whole characters from the end
# (-1 is ま, -2 is c), and assigning a longer string to one index
# replaces just that character.
word=abcま
|
||
word[-1]=
|
||
print $word
|
||
word=abcま
|
||
word[-2]=
|
||
print $word
|
||
word=abcま
|
||
word[4]=d
|
||
print $word
|
||
word=abcま
|
||
word[3]=not_c
|
||
print $word
|
||
0:assignment with negative indices
|
||
>abc
|
||
>abま
|
||
>abcd
|
||
>abnot_cま
|
||
|
||
# The following doesn't necessarily need UTF-8, but this gives
|
||
# us the full effect --- if we parse this wrongly the \xe9
|
||
# in combination with the tokenized input afterwards looks like a
|
||
# valid UTF-8 character. But it isn't.
|
||
# Write a one-line script consisting of `$', the lone byte \xe9, `#' and
# two backquotes, then source it in a subshell with nonomatch set.
# Expected: status 127 and a "command not found" message.
print $'$\xe9#``' >test_bad_param
|
||
(setopt nonomatch
|
||
. ./test_bad_param)
|
||
127:Invalid parameter name with following tokenized input
|
||
?./test_bad_param:1: command not found: $\M-i#
|
||
|
||
lines=$'one\tZSH\tthree\nfour\tfive\tsix'
|
||
print -X8 -r -- $lines
|
||
0:Tab expansion with extra-wide characters
|
||
>one ZSH three
|
||
>four five six
|
||
# This doesn't look aligned in my editor because actually the characters
|
||
# aren't quite double width, but the arithmetic is correct.
|
||
# It appears just to be an effect of the font.
|
||
|
||
# Needs the zsh/regex module; if it cannot be loaded, ZTST_skip is set
# with the reason.  U+00A0 must count as exactly one character for `.',
# must match itself, and must not vanish in front of X (the third test
# prints OK only when ^X$ does NOT match).
if zmodload -i zsh/regex 2>/dev/null; then
|
||
[[ $'\ua0' =~ '^.$' ]] && print OK
|
||
[[ $'\ua0' =~ $'^\ua0$' ]] && print OK
|
||
[[ $'\ua0'X =~ '^X$' ]] || print OK
|
||
else
|
||
ZTST_skip="regexp library not found."
|
||
fi
|
||
0:Ensure no confusion on metafied input to regex module
|
||
>OK
|
||
>OK
|
||
>OK
|
||
|
||
# Anonymous function with options reset locally (emulate -L zsh) and
# errreturn set, so a failing mkdir or cd makes the function return
# non-zero.  With cdpath set to (.), the multibyte directory is entered
# both by bare name and as ./ホ; no output is expected, only status 0.
() {
|
||
emulate -L zsh
|
||
setopt errreturn
|
||
local cdpath=(.)
|
||
mkdir ホ
|
||
cd ホ
|
||
cd ..
|
||
cd ./ホ
|
||
cd ..
|
||
}
|
||
0:cd with special characters
|
||
|
||
# Each entry is a [[ ... ]] test written with \x (single byte) and \u
# (Unicode character) escapes.  ${(g::)test} expands those escapes
# before eval runs the test.  A failing entry is reported on stderr; the
# chunk lists no expected stderr, so any such report fails the test.
test_array=(
|
||
'[[ \xcc = \xcc ]]'
|
||
'[[ \xcc != \xcd ]]'
|
||
'[[ \xcc != \ucc ]]'
|
||
'[[ \ucc = \ucc ]]'
|
||
'[[ \ucc = [\ucc] ]]'
|
||
'[[ \xcc != [\ucc] ]]'
|
||
# Not clear how useful the following is...
|
||
'[[ \xcc = [\xcc] ]]'
|
||
)
|
||
for test in $test_array; do
|
||
if ! eval ${(g::)test} ; then
|
||
print -rl "Test $test failed" >&2
|
||
fi
|
||
done
|
||
0:Invalid characters in pattern matching
|