1%prep 2 3# Find a UTF-8 locale. 4 setopt multibyte 5# Don't let LC_* override our choice of locale. 6 unset -m LC_\* 7 mb_ok= 8 langs=(en_{US,GB}.{UTF-,utf}8 en.UTF-8 9 $(locale -a 2>/dev/null | egrep 'utf8|UTF-8')) 10 for LANG in $langs; do 11 if [[ é = ? ]]; then 12 mb_ok=1 13 break; 14 fi 15 done 16 if [[ -z $mb_ok ]]; then 17 ZTST_unimplemented="no UTF-8 locale or multibyte mode is not implemented" 18 else 19 print -u $ZTST_fd Testing multibyte with locale $LANG 20 mkdir multibyte.tmp && cd multibyte.tmp 21 fi 22 23%test 24 25 a=ténébreux 26 for i in {1..9}; do 27 print ${a[i]} 28 for j in {$i..9}; do 29 print $i $j ${a[i,j]} ${a[-j,-i]} 30 done 31 done 320:Basic indexing with multibyte characters 33>t 34>1 1 t x 35>1 2 té ux 36>1 3 tén eux 37>1 4 téné reux 38>1 5 ténéb breux 39>1 6 ténébr ébreux 40>1 7 ténébre nébreux 41>1 8 ténébreu énébreux 42>1 9 ténébreux ténébreux 43>é 44>2 2 é u 45>2 3 én eu 46>2 4 éné reu 47>2 5 énéb breu 48>2 6 énébr ébreu 49>2 7 énébre nébreu 50>2 8 énébreu énébreu 51>2 9 énébreux ténébreu 52>n 53>3 3 n e 54>3 4 né re 55>3 5 néb bre 56>3 6 nébr ébre 57>3 7 nébre nébre 58>3 8 nébreu énébre 59>3 9 nébreux ténébre 60>é 61>4 4 é r 62>4 5 éb br 63>4 6 ébr ébr 64>4 7 ébre nébr 65>4 8 ébreu énébr 66>4 9 ébreux ténébr 67>b 68>5 5 b b 69>5 6 br éb 70>5 7 bre néb 71>5 8 breu énéb 72>5 9 breux ténéb 73>r 74>6 6 r é 75>6 7 re né 76>6 8 reu éné 77>6 9 reux téné 78>e 79>7 7 e n 80>7 8 eu én 81>7 9 eux tén 82>u 83>8 8 u é 84>8 9 ux té 85>x 86>9 9 x t 87 88 s=é 89 print A${s[-2]}A B${s[-1]}B C${s[0]}C D${s[1]}D E${s[2]}E 900:Out of range subscripts with multibyte characters 91>AA BéB CC DéD EE 92 93 print ${a[(i)é]} ${a[(I)é]} ${a[${a[(i)é]},${a[(I)é]}]} 940:Reverse indexing with multibyte characters 95>2 4 éné 96 97 print ${a[(r)én,(r)éb]} 980:Subscript searching with multibyte characters 99>énéb 100 101 print ${a[(rb:1:)é,-1]} 102 print ${a[(rb:2:)é,-1]} 103 print ${a[(rb:3:)é,-1]} 104 print ${a[(rb:4:)é,-1]} 105 print ${a[(rb:5:)é,-1]} 
1060:Subscript searching with initial offset 107>énébreux 108>énébreux 109>ébreux 110>ébreux 111> 112 113 print ${a[(rn:1:)é,-1]} 114 print ${a[(rn:2:)é,-1]} 115 print ${a[(rn:3:)é,-1]} 1160:Subscript searching with count 117>énébreux 118>ébreux 119> 120 121 print ${a[(R)én,(R)éb]} 1220:Backward subscript searching with multibyte characters 123>énéb 124 125# Starting offsets with (R) seem to be so strange as to be hardly 126# worth testing. 127 128 setopt extendedglob 129 [[ $a = (#b)t(én)(éb)reux ]] || print "Failed to match." >&2 130 for i in {1..${#match}}; do 131 print $match[i] $mbegin[i] $mend[i] ${a[$mbegin[i],$mend[i]]} 132 done 1330:Multibyte offsets in pattern tests 134>én 2 3 én 135>éb 4 5 éb 136 137 b=${(U)a} 138 print $b 139 print ${(L)b} 140 desdichado="Je suis le $a, le veuf, l'inconsolé" 141 print ${(C)desdichado} 142 lxiv="l'état c'est moi" 143 print ${(C)lxiv} 1440:Case modification of multibyte strings 145>TÉNÉBREUX 146>ténébreux 147>Je Suis Le Ténébreux, Le Veuf, L'Inconsolé 148>L'État C'Est Moi 149 150 array=(ølaf ødd øpened án encyclopædia) 151 barray=(${(U)array}) 152 print $barray 153 print ${(L)barray} 154 print ${(C)array} 155 print ${(C)barray} 1560:Case modification of arrays with multibyte strings 157>ØLAF ØDD ØPENED ÁN ENCYCLOPÆDIA 158>ølaf ødd øpened án encyclopædia 159>Ølaf Ødd Øpened Án Encyclopædia 160>Ølaf Ødd Øpened Án Encyclopædia 161 162 print $(( ##¥ )) 163 pound=£ 164 print $(( #pound )) 165 alpha=α 166 print $(( ##α )) $(( #alpha )) 1670:Conversion to Unicode in mathematical expressions 168>165 169>163 170>945 945 171 172 unsetopt posix_identifiers 173 expr='hähä=3 || exit 1; print $hähä' 174 eval $expr 175 setopt posix_identifiers 176 (eval $expr) 1771:POSIX_IDENTIFIERS option 178>3 179?(eval):1: command not found: hähä=3 180 181 foo="Ølaf«Ødd«øpénëd«ån«àpple" 182 print -l ${(s.«.)foo} 183 ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος." 
184 print -l ${=ioh} 185 print ${(w)#ioh} 1860:Splitting with multibyte characters 187>Ølaf 188>Ødd 189>øpénëd 190>ån 191>àpple 192>Ἐν 193>ἀρχῇ 194>ἦν 195>ὁ 196>λόγος, 197>καὶ 198>ὁ 199>λόγος 200>ἦν 201>πρὸς 202>τὸν 203>θεόν, 204>καὶ 205>θεὸς 206>ἦν 207>ὁ 208>λόγος. 209>17 210 211 read -d £ one 212 read -d £ two 213 print $one 214 print $two 2150:read with multibyte delimiter 216<first£second£ 217>first 218>second 219 220 (IFS=« 221 read -d » -A array 222 print -l $array) 2230:read -A with multibyte IFS 224<dominus«illuminatio«mea»ignored 225>dominus 226>illuminatio 227>mea 228 229 read -k2 -u0 twochars 230 print $twochars 2310:read multibyte characters 232<«»ignored 233>«» 234 235 read -q -u0 mb 236 print $? 2370:multibyte character makes read -q return false 238<« 239>1 240 241 # See if the system grokks first-century Greek... 242 ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος." 243 for (( i = 1; i <= ${#ioh}; i++ )); do 244 # FC3 doesn't recognise ῇ (U+1FC7: Greek small letter eta with 245 # perispomeni and ypogegrammeni, of course) as a lower case character. 
246 if [[ $ioh[i] != [[:lower:]] && $i != 7 ]]; then 247 for tp in upper space punct invalid; do 248 if [[ $tp = invalid || $ioh[i] = [[:${tp}:]] ]]; then 249 print "$i: $tp" 250 break 251 fi 252 done 253 fi 254 done 2550:isw* functions on non-ASCII wide characters 256>1: upper 257>3: space 258>8: space 259>11: space 260>13: space 261>19: punct 262>20: space 263>24: space 264>26: space 265>32: space 266>35: space 267>40: space 268>44: space 269>49: punct 270>50: space 271>54: space 272>59: space 273>62: space 274>64: space 275>70: punct 276 277 ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος" 278 print ${ioh#[[:alpha:]]##} 279 print ${ioh##[[:alpha:]]##} 280 print ${ioh%[[:alpha:]]##} 281 print ${ioh%%[[:alpha:]]##} 282 print ${(S)ioh#λ*ς} 283 print ${(S)ioh##λ*ς} 284 print ${(S)ioh%θ*ς} 285 print ${(S)ioh%%θ*ς} 2860:Parameter #, ##, %, %% with multibyte characters 287>ν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος 288> ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος 289>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγο 290>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ 291>Ἐν ἀρχῇ ἦν ὁ , καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος 292>Ἐν ἀρχῇ ἦν ὁ 293>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ ἦν ὁ λόγος 294>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ 295 296 foo=(κατέβην χθὲς εἰς Πειραιᾶ) 297 print ${(l.3..¥.r.3..£.)foo} 298 print ${(l.4..¥.r.2..£.)foo} 299 print ${(l.5..¥.r.1..£.)foo} 300 print ${(l.4..¥..«.r.4..£..».)foo} 301 print ${(l.4..¥..Σωκράτης.r.4..£..Γλαύκωνος.)foo} 3020:simultaneous left and right padding 303>κατέβη ¥χθὲς£ ¥¥εἰς£ Πειραι 304>¥κατέβ ¥¥χθὲς ¥¥¥εἰς ¥Πειρα 305>¥¥κατέ ¥¥¥χθὲ ¥¥¥¥εἰ ¥¥Πειρ 306>«κατέβην ¥«χθὲς»£ ¥¥«εἰς»£ «Πειραιᾶ 307>ςκατέβην ηςχθὲςΓλ τηςεἰςΓλ ςΠειραιᾶ 308# er... yeah, that looks right... 
# Multibyte characters used as delimiters in history-style modifiers.

  foo=picobarn
  print ${foo:s£bar£rod£:s¥rod¥stick¥}
0:Delimiters in modifiers
>picostickn

# TODO: if we get paired multibyte bracket delimiters to work
# (as Emacs does, the smug so-and-so), the following should change.
  foo=bar
  print ${(r£5££X£)foo}
  print ${(l«10««Y««HI«)foo}
0:Delimiters in parameter flags
>barXX
>YYYYYHIbar

# Width 4 with precision 3: three characters are printed, padded on the
# left with one space, so the expected line starts with a space.
  printf "%4.3s\n" főobar
0:Multibyte characters in printf widths
> főo

# We ask for case-insensitive sorting here (and supply upper case
# characters) so that we exercise the logic in the shell that lowers the
# case of the string for case-insensitive sorting.
  print -oi HÛH HÔH HÎH HÊH HÂH
  (LC_ALL=C; print -oi HAH HUH HEH HÉH HÈH)
0:Multibyte characters in print sorting
>HÂH HÊH HÎH HÔH HÛH
>HAH HEH HUH HÈH HÉH

# These are control characters in Unicode, so don't show up.
# We just want to check they're not being treated as tokens.
  for x in {128..150}; do
     print ${(#)x}
  done | while read line; do
    print ${#line} $(( #line ))
  done
0:evaluated character number with multibyte characters
>1 128
>1 129
>1 130
>1 131
>1 132
>1 133
>1 134
>1 135
>1 136
>1 137
>1 138
>1 139
>1 140
>1 141
>1 142
>1 143
>1 144
>1 145
>1 146
>1 147
>1 148
>1 149
>1 150

  touch ngs1txt ngs2txt ngs10txt ngs20txt ngs100txt ngs200txt
  setopt numericglobsort
  print -l ngs*
0:NUMERIC_GLOB_SORT option in UTF-8 locale
>ngs1txt
>ngs2txt
>ngs10txt
>ngs20txt
>ngs100txt
>ngs200txt

# Not strictly multibyte, but gives us a well-defined locale for testing.
  foo=$'X\xc0Y\x07Z\x7fT'
  print -r ${(q)foo}
0:Backslash-quoting of unprintable/invalid characters uses $'...'
>X$'\300'Y$'\a'Z$'\177'T

# This also isn't strictly multibyte and is here to reduce the
# likelihood of a "cannot do character set conversion" error.
388 (print $'\u00e9') 2>&1 | read 389 if [[ $REPLY != é ]]; then 390 print "warning: your system can't do simple Unicode conversion." >&$ZTST_fd 391 print "Check you have a correctly installed iconv library." >&$ZTST_fd 392 # cheat 393 repeat 4 print OK 394 else 395 testfn() { (LC_ALL=C; print $'\u00e9') } 396 repeat 4 testfn 2>&1 | while read line; do 397 if [[ $line = *"character not in range"* ]]; then 398 print OK 399 elif [[ $line = "?" ]]; then 400 print OK 401 else 402 print Failed: no error message and no question mark 403 fi 404 done 405 fi 406 true 4070:error handling in Unicode quoting 408>OK 409>OK 410>OK 411>OK 412 413 tmp1='glob/\(\)Ą/*' 414 [[ glob/'()Ą'/foo == $~tmp1 ]] && print "Matched against $tmp1" 415 tmp1='glob/\(\)Ā/*' 416 [[ glob/'()Ā'/bar == $~tmp1 ]] && print "Matched against $tmp1" 4170:Backslashes and metafied characters in patterns 418>Matched against glob/()Ą/* 419>Matched against glob/()Ā/* 420 421 mkdir 梶浦由記 'Пётр Ильич Чайковский' 422 (cd 梶浦由記; print ${${(%):-%~}:t}) 423 (cd 'Пётр Ильич Чайковский'; print ${${(%):-%~}:t}) 4240:Metafied characters in prompt expansion 425>梶浦由記 426>Пётр Ильич Чайковский 427 428 ( 429 setopt nonomatch 430 tmp1=Ą 431 tmpA=(Ą 'Пётр Ильич Чайковский' 梶浦由記) 432 print ${tmp1} ${(%)tmp1} ${(%%)tmp1} 433 print ${#tmp1} ${#${(%)tmp1}} ${#${(%%)tmp1}} 434 print ${tmpA} 435 print ${(%)tmpA} 436 print ${(%%)tmpA} 437 ) 4380:More metafied characters in prompt expansion 439>Ą Ą Ą 440>1 1 1 441>Ą Пётр Ильич Чайковский 梶浦由記 442>Ą Пётр Ильич Чайковский 梶浦由記 443>Ą Пётр Ильич Чайковский 梶浦由記 444 445 setopt cbases 446 print $'\xc5' | read 447 print $(( [#16] #REPLY )) 4480:read passes through invalid multibyte characters 449>0xC5 450 451 word=abcま 452 word[-1]= 453 print $word 454 word=abcま 455 word[-2]= 456 print $word 457 word=abcま 458 word[4]=d 459 print $word 460 word=abcま 461 word[3]=not_c 462 print $word 4630:assignment with negative indices 464>abc 465>abま 466>abcd 467>abnot_cま 468