1%prep
2
3# Find a UTF-8 locale.
# Strategy: enable multibyte mode, clear every LC_* variable, then try a list
# of candidate locale names by assigning each one to LANG in turn.  A
# candidate is accepted as soon as the pattern test `[[ é = ? ]]` succeeds,
# i.e. as soon as the shell treats é as one single character.
# Candidates are the brace-expanded en_US/en_GB spellings, en.UTF-8, and any
# name printed by `locale -a` that contains "utf8" or "UTF-8".
# `grep -E` is used rather than `egrep`: egrep is obsolescent and recent GNU
# grep prints a warning on stderr, which is not redirected here (only the
# stderr of `locale -a` is discarded).
4  setopt multibyte
5# Don't let LC_* override our choice of locale.
6  unset -m LC_\*
7  mb_ok=
8  langs=(en_{US,GB}.{UTF-,utf}8 en.UTF-8
9	 $(locale -a 2>/dev/null | grep -E 'utf8|UTF-8'))
10  for LANG in $langs; do
11    if [[ é = ? ]]; then
12      mb_ok=1
13      break;
14    fi
15  done
# No candidate worked: record the reason in ZTST_unimplemented.  Otherwise
# report the chosen locale on descriptor $ZTST_fd and move into a scratch
# directory, multibyte.tmp, created here.
16  if [[ -z $mb_ok ]]; then
17    ZTST_unimplemented="no UTF-8 locale or multibyte mode is not implemented"
18  else
19    print -u $ZTST_fd Testing multibyte with locale $LANG
20    mkdir multibyte.tmp && cd multibyte.tmp
21  fi
22
23%test
24
# Index the nine-character string $a character by character.  For every
# start index i, print the single character a[i]; then for every end index
# j from i to 9 print i, j, the forward slice a[i,j] and the slice a[-j,-i]
# taken with negative (from-the-end) indices.
# $a is reused by several later chunks in this section.
25  a=ténébreux
26  for i in {1..9}; do
27      print ${a[i]}
28      for j in {$i..9}; do
29	  print $i $j ${a[i,j]} ${a[-j,-i]}
30      done
31  done
320:Basic indexing with multibyte characters
33>t
34>1 1 t x
35>1 2 té ux
36>1 3 tén eux
37>1 4 téné reux
38>1 5 ténéb breux
39>1 6 ténébr ébreux
40>1 7 ténébre nébreux
41>1 8 ténébreu énébreux
42>1 9 ténébreux ténébreux
43>é
44>2 2 é u
45>2 3 én eu
46>2 4 éné reu
47>2 5 énéb breu
48>2 6 énébr ébreu
49>2 7 énébre nébreu
50>2 8 énébreu énébreu
51>2 9 énébreux ténébreu
52>n
53>3 3 n e
54>3 4 né re
55>3 5 néb bre
56>3 6 nébr ébre
57>3 7 nébre nébre
58>3 8 nébreu énébre
59>3 9 nébreux ténébre
60>é
61>4 4 é r
62>4 5 éb br
63>4 6 ébr ébr
64>4 7 ébre nébr
65>4 8 ébreu énébr
66>4 9 ébreux ténébr
67>b
68>5 5 b b
69>5 6 br éb
70>5 7 bre néb
71>5 8 breu énéb
72>5 9 breux ténéb
73>r
74>6 6 r é
75>6 7 re né
76>6 8 reu éné
77>6 9 reux téné
78>e
79>7 7 e n
80>7 8 eu én
81>7 9 eux tén
82>u
83>8 8 u é
84>8 9 ux té
85>x
86>9 9 x t
87
# $s is a single multibyte character.  Subscripts -2, 0 and 2 lie outside
# the string and are expected to expand to nothing; -1 and 1 both select
# the one character.  The A..E letters bracket each result in the output.
88  s=é
89  print A${s[-2]}A B${s[-1]}B C${s[0]}C D${s[1]}D E${s[2]}E
900:Out of range subscripts with multibyte characters
91>AA BéB CC DéD EE
92
# (i) and (I) return the index of the first and last é in $a (expected 2 and
# 4, counted in characters); the third word uses those two indices as the
# bounds of a slice.
93  print ${a[(i)é]} ${a[(I)é]} ${a[${a[(i)é]},${a[(I)é]}]}
940:Reverse indexing with multibyte characters
95>2 4 éné
96
# Both slice bounds are given as (r) pattern searches instead of numbers.
# Per the expected output the slice runs from the start of "én" through the
# end of "éb".
97  print ${a[(r)én,(r)éb]}
980:Subscript searching with multibyte characters
99>énéb
100
# (rb:N:) begins the search for é at character offset N; the slice then
# extends to the end of the string.  Expected: offsets 1 and 2 find the é at
# index 2, offsets 3 and 4 find the é at index 4, and offset 5 finds none,
# giving an empty line.
101  print ${a[(rb:1:)é,-1]}
102  print ${a[(rb:2:)é,-1]}
103  print ${a[(rb:3:)é,-1]}
104  print ${a[(rb:4:)é,-1]}
105  print ${a[(rb:5:)é,-1]}
1060:Subscript searching with initial offset
107>énébreux
108>énébreux
109>ébreux
110>ébreux
111>
112
# (rn:N:) selects the Nth occurrence of é as the start of the slice.  $a
# contains two, so N=3 is expected to produce an empty line.
113  print ${a[(rn:1:)é,-1]}
114  print ${a[(rn:2:)é,-1]}
115  print ${a[(rn:3:)é,-1]}
1160:Subscript searching with count
117>énébreux
118>ébreux
119>
120
# Same slice as the (r) test above, but with the (R) search flag on both
# bounds; the expected output is again "énéb".
121  print ${a[(R)én,(R)éb]}
1220:Backward subscript searching with multibyte characters
123>énéb
124
125# Starting offsets with (R) seem to be so strange as to be hardly
126# worth testing.
127
# Match $a against a pattern with two parenthesised groups, using the (#b)
# backreference flag (extendedglob is switched on first).  For each captured
# group print the text from $match, its $mbegin/$mend offsets, and the slice
# of $a taken with those offsets; the expected output shows they are
# character positions (2 3 and 4 5), and each slice equals the captured text.
128  setopt extendedglob
129  [[ $a = (#b)t(én)(éb)reux ]] || print "Failed to match." >&2
130  for i in {1..${#match}}; do
131    print $match[i] $mbegin[i] $mend[i] ${a[$mbegin[i],$mend[i]]}
132  done
1330:Multibyte offsets in pattern tests
134>én 2 3 én
135>éb 4 5 éb
136
# Exercise the (U) upper-case, (L) lower-case and (C) capitalise flags on
# scalars containing accented letters.  The two (C) cases include words that
# begin right after an apostrophe (l'inconsolé, l'état, c'est).
137  b=${(U)a}
138  print $b
139  print ${(L)b}
140  desdichado="Je suis le $a, le veuf, l'inconsolé"
141  print ${(C)desdichado}
142  lxiv="l'état c'est moi"
143  print ${(C)lxiv}
1440:Case modification of multibyte strings
145>TÉNÉBREUX
146>ténébreux
147>Je Suis Le Ténébreux, Le Veuf, L'Inconsolé
148>L'État C'Est Moi
149
# Same (U)/(L)/(C) flags as the scalar test, applied to arrays.  barray is
# the upper-cased copy of array; (C) on either one is expected to give the
# same capitalised words.
150  array=(ølaf ødd øpened án encyclopædia)
151  barray=(${(U)array})
152  print $barray
153  print ${(L)barray}
154  print ${(C)array}
155  print ${(C)barray}
1560:Case modification of arrays with multibyte strings
157>ØLAF ØDD ØPENED ÁN ENCYCLOPÆDIA
158>ølaf ødd øpened án encyclopædia
159>Ølaf Ødd Øpened Án Encyclopædia
160>Ølaf Ødd Øpened Án Encyclopædia
161
# Inside $(( )), `##c` takes a literal character and `#name` takes the first
# character of the named variable; each is expected to evaluate to the
# character's code point (¥ 165, £ 163, α 945).
162  print $(( ##¥ ))
163  pound=£
164  print $(( #pound ))
165  alpha=α
166  print $(( ##α )) $(( #alpha ))
1670:Conversion to Unicode in mathematical expressions
168>165
169>163
170>945 945
171
# Evaluate the same snippet twice.  With posix_identifiers unset, "hähä=3"
# is an assignment and 3 is printed.  With it set, the second eval (run in a
# subshell) is expected to treat "hähä=3" as a command name, producing the
# "command not found" message on the `?` line; the `|| exit 1` then yields
# the status 1 given on the status line.
172  unsetopt posix_identifiers
173  expr='hähä=3 || exit 1; print $hähä'
174  eval $expr
175  setopt posix_identifiers
176  (eval $expr)
1771:POSIX_IDENTIFIERS option
178>3
179?(eval):1: command not found: hähä=3
180
# Three kinds of splitting on multibyte text: (s.«.) splits $foo on the
# multibyte separator «; ${=ioh} word-splits the Greek sentence; and
# ${(w)#ioh} counts its words (expected 17).
181  foo="Ølaf«Ødd«øpénëd«ån«àpple"
182  print -l ${(s.«.)foo}
183  ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος."
184  print -l ${=ioh}
185  print ${(w)#ioh}
1860:Splitting with multibyte characters
187>Ølaf
188>Ødd
189>øpénëd
190>ån
191>àpple
192>Ἐν
193>ἀρχῇ
194>ἦν
195>ὁ
196>λόγος,
197>καὶ
198>ὁ
199>λόγος
200>ἦν
201>πρὸς
202>τὸν
203>θεόν,
204>καὶ
205>θεὸς
206>ἦν
207>ὁ
208>λόγος.
209>17
210
# Read two fields from the chunk's input (the `<` line), each terminated by
# the multibyte delimiter £ given to read -d.
211  read -d £ one
212  read -d £ two
213  print $one
214  print $two
2150:read with multibyte delimiter
216<first£second£
217>first
218>second
219
# In a subshell, set IFS to the multibyte character « and read an array up
# to the delimiter »; text after » in the input is expected to be ignored.
220  (IFS=«
221  read -d » -A array
222  print -l $array)
2230:read -A with multibyte IFS
224<dominus«illuminatio«mea»ignored
225>dominus
226>illuminatio
227>mea
228
# read -k2 from descriptor 0: the expected output shows the count is in
# characters, so both « and » are returned and the rest of the input line
# is left unread.
229  read -k2 -u0 twochars
230  print $twochars
2310:read multibyte characters
232<«»ignored
233>«»
234
# read -q from descriptor 0 and print its exit status.  Per the test title,
# the input is a multibyte character and the expected status is 1.
235  read -q -u0 mb
236  print $?
2370:multibyte character makes read -q return false
238<«
239>1
240
# Walk $ioh one character at a time.  Characters that match [[:lower:]] are
# skipped silently (position 7 is exempted as well, see the note in the
# loop).  Every other character is reported as the first of upper, space or
# punct that it matches, or as "invalid" if it matches none of them.
241  # See if the system groks first-century Greek...
242  ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος."
243  for (( i = 1; i <= ${#ioh}; i++ )); do
244    # FC3 doesn't recognise ῇ (U+1FC7: Greek small letter eta with
245    # perispomeni and ypogegrammeni, of course) as a lower case character.
246    if [[ $ioh[i] != [[:lower:]] && $i != 7 ]]; then
247      for tp in upper space punct invalid; do
248        if [[ $tp = invalid || $ioh[i] = [[:${tp}:]] ]]; then
249          print "$i: $tp"
250	  break
251	fi
252      done
253    fi
254  done
2550:isw* functions on non-ASCII wide characters
256>1: upper
257>3: space
258>8: space
259>11: space
260>13: space
261>19: punct
262>20: space
263>24: space
264>26: space
265>32: space
266>35: space
267>40: space
268>44: space
269>49: punct
270>50: space
271>54: space
272>59: space
273>62: space
274>64: space
275>70: punct
276
# Strip matches from the Greek sentence with #, ##, % and %%.  The first
# four use the pattern [[:alpha:]]## at the start or end of the string; the
# last four add the (S) flag with the patterns λ*ς and θ*ς, which per the
# expected output remove text from the middle of the string.
# NOTE(review): the `##` pattern operator presumably relies on extendedglob
# having been set by an earlier chunk; confirm options persist between chunks.
277  ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος"
278  print ${ioh#[[:alpha:]]##}
279  print ${ioh##[[:alpha:]]##}
280  print ${ioh%[[:alpha:]]##}
281  print ${ioh%%[[:alpha:]]##}
282  print ${(S)ioh#λ*ς}
283  print ${(S)ioh##λ*ς}
284  print ${(S)ioh%θ*ς}
285  print ${(S)ioh%%θ*ς}
2860:Parameter #, ##, %, %% with multibyte characters
287>ν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
288> ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
289>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγο
290>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ 
291>Ἐν ἀρχῇ ἦν ὁ , καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
292>Ἐν ἀρχῇ ἦν ὁ 
293>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ  ἦν ὁ λόγος
294>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ 
295
# Apply the l (left) and r (right) padding flags together to every element
# of an array of Greek words, with multibyte fill characters ¥ and £.  The
# first three lines vary the two widths; the last two give each flag a
# second string argument as well («/» and then whole Greek words).
296  foo=(κατέβην χθὲς εἰς Πειραιᾶ)
297  print ${(l.3..¥.r.3..£.)foo}
298  print ${(l.4..¥.r.2..£.)foo}
299  print ${(l.5..¥.r.1..£.)foo}
300  print ${(l.4..¥..«.r.4..£..».)foo}
301  print ${(l.4..¥..Σωκράτης.r.4..£..Γλαύκωνος.)foo}
3020:simultaneous left and right padding
303>κατέβη ¥χθὲς£ ¥¥εἰς£ Πειραι
304>¥κατέβ ¥¥χθὲς ¥¥¥εἰς ¥Πειρα
305>¥¥κατέ ¥¥¥χθὲ ¥¥¥¥εἰ ¥¥Πειρ
306>«κατέβην ¥«χθὲς»£ ¥¥«εἰς»£ «Πειραιᾶ
307>ςκατέβην ηςχθὲςΓλ τηςεἰςΓλ ςΠειραιᾶ
308# er... yeah, that looks right...
309
# Chain two :s substitution modifiers whose delimiters are the multibyte
# characters £ and ¥: bar -> rod, then rod -> stick.
310  foo=picobarn
311  print ${foo:s£bar£rod£:s¥rod¥stick¥}
3120:Delimiters in modifiers
313>picostickn
314
# Use multibyte characters (£ and «) as the argument delimiters of the r and
# l padding flags.  « is used here as an ordinary delimiter on both sides of
# each argument, not as one half of a «…» pair; see the TODO below.
315# TODO: if we get paired multibyte bracket delimiters to work
316# (as Emacs does, the smug so-and-so), the following should change.
317  foo=bar
318  print ${(r£5££X£)foo}
319  print ${(l«10««Y««HI«)foo}
3200:Delimiters in parameter flags
321>barXX
322>YYYYYHIbar
323
# %4.3s: precision 3, field width 4.  The expected output (one space, then
# "főo") shows both are counted in characters even though ő is multibyte.
324  printf "%4.3s\n" főobar
3250:Multibyte characters in printf widths
326> főo
327
# Sort words with print -oi: first in the locale chosen during %prep, then
# in a subshell with LC_ALL=C, where the expected order puts the two
# accented words after all the plain ASCII ones.
328# We ask for case-insensitive sorting here (and supply upper case
329# characters) so that we exercise the logic in the shell that lowers the
330# case of the string for case-insensitive sorting.
331  print -oi HÛH HÔH HÎH HÊH HÂH
332  (LC_ALL=C; print -oi HAH HUH HEH HÉH HÈH)
3330:Multibyte characters in print sorting
334>HÂH HÊH HÎH HÔH HÛH
335>HAH HEH HUH HÈH HÉH
336
# For each number 128..150, ${(#)x} produces the character with that code
# and prints it on its own line; the reading loop then prints the line's
# length and the code of its first character.  Expected: length 1 and the
# original number for every value.
337# These are control characters in Unicode, so don't show up.
338# We just want to check they're not being treated as tokens.
339  for x in {128..150}; do
340     print ${(#)x}
341  done | while read line; do
342    print ${#line} $(( #line ))
343  done
3440:evaluated character number with multibyte characters
345>1 128
346>1 129
347>1 130
348>1 131
349>1 132
350>1 133
351>1 134
352>1 135
353>1 136
354>1 137
355>1 138
356>1 139
357>1 140
358>1 141
359>1 142
360>1 143
361>1 144
362>1 145
363>1 146
364>1 147
365>1 148
366>1 149
367>1 150
368
# Create six files whose names differ only in an embedded number, enable
# numericglobsort, and list them with a glob.  The expected order is by
# numeric value (1, 2, 10, 20, 100, 200), not by character.
369  touch ngs1txt ngs2txt ngs10txt ngs20txt ngs100txt ngs200txt
370  setopt numericglobsort
371  print -l ngs*
3720:NUMERIC_GLOB_SORT option in UTF-8 locale
373>ngs1txt
374>ngs2txt
375>ngs10txt
376>ngs20txt
377>ngs100txt
378>ngs200txt
379
# $foo contains the raw bytes 0xC0, 0x07 and 0x7F between ASCII letters.
# ${(q)foo} is expected to render each of them as a $'...' escape
# ($'\300', $'\a', $'\177') and leave the letters as they are.
380# Not strictly multibyte, but gives us a well-defined locale for testing.
381  foo=$'X\xc0Y\x07Z\x7fT'
382  print -r ${(q)foo}
3830:Backslash-quoting of unprintable/invalid characters uses $'...'
384>X$'\300'Y$'\a'Z$'\177'T
385
# First check that $'\u00e9' converts to é at all.  If it doesn't, warn on
# descriptor $ZTST_fd and print the four expected "OK" lines anyway so the
# chunk still passes.  Otherwise run the conversion four times under
# LC_ALL=C and accept either a "character not in range" message or a
# literal "?" as the result; anything else prints a failure line.  The final
# `true` fixes the chunk's exit status at 0.
386# This also isn't strictly multibyte and is here to reduce the
387# likelihood of a "cannot do character set conversion" error.
388  (print $'\u00e9') 2>&1 | read
389  if [[ $REPLY != é ]]; then
390    print "warning: your system can't do simple Unicode conversion." >&$ZTST_fd
391    print "Check you have a correctly installed iconv library." >&$ZTST_fd
392    # cheat
393    repeat 4 print OK
394  else
395    testfn() { (LC_ALL=C; print $'\u00e9') }
396    repeat 4 testfn 2>&1 | while read line; do
397      if [[ $line = *"character not in range"* ]]; then
398        print OK
399      elif [[ $line = "?" ]]; then
400        print OK
401      else
402        print Failed: no error message and no question mark
403      fi
404    done
405  fi
406  true
4070:error handling in Unicode quoting
408>OK
409>OK
410>OK
411>OK
412
# $~tmp1 makes the value of tmp1 act as a pattern.  Each pattern contains
# backslash-escaped parentheses followed by a multibyte character (Ą, then
# Ā) and must match a path containing the literal text "()" plus that
# character.
413  tmp1='glob/\(\)Ą/*'
414  [[ glob/'()Ą'/foo == $~tmp1 ]] && print "Matched against $tmp1"
415  tmp1='glob/\(\)Ā/*'
416  [[ glob/'()Ā'/bar == $~tmp1 ]] && print "Matched against $tmp1"
4170:Backslashes and metafied characters in patterns
418>Matched against glob/()Ą/*
419>Matched against glob/()Ā/*
420
# Create two directories with multibyte names (one containing spaces).  In
# a subshell for each, cd into it, prompt-expand %~ with the (%) flag, and
# keep the last path component with :t; the directory name is expected to
# come back unchanged.
421  mkdir 梶浦由記 'Пётр Ильич Чайковский'
422  (cd 梶浦由記; print ${${(%):-%~}:t})
423  (cd 'Пётр Ильич Чайковский'; print ${${(%):-%~}:t})
4240:Metafied characters in prompt expansion
425>梶浦由記
426>Пётр Ильич Чайковский
427
# In a subshell with nonomatch set, apply the (%) and (%%) prompt-expansion
# flags to a one-character scalar and to an array of multibyte strings.
# Expected: the text comes through unchanged and the scalar's length is 1
# in all three forms.
428  (
429  setopt nonomatch
430  tmp1=Ą
431  tmpA=(Ą 'Пётр Ильич Чайковский' 梶浦由記)
432  print ${tmp1} ${(%)tmp1} ${(%%)tmp1}
433  print ${#tmp1} ${#${(%)tmp1}} ${#${(%%)tmp1}}
434  print ${tmpA}
435  print ${(%)tmpA}
436  print ${(%%)tmpA}
437  )
4380:More metafied characters in prompt expansion
439>Ą Ą Ą
440>1 1 1
441>Ą Пётр Ильич Чайковский 梶浦由記
442>Ą Пётр Ильич Чайковский 梶浦由記
443>Ą Пётр Ильич Чайковский 梶浦由記
444
# Pipe the single byte 0xC5 (an invalid multibyte character, per the test
# title) into read, then print the code of the first character of $REPLY in
# base 16.  With cbases set, the expected output is 0xC5.
445  setopt cbases
446  print $'\xc5' | read
447  print $(( [#16] #REPLY ))
4480:read passes through invalid multibyte characters
449>0xC5
450
# Assign to single characters of a string whose last character is
# multibyte.  Cases in order: delete index -1 (the last character), delete
# index -2 ("c"), overwrite index 4 with "d", and replace index 3 with the
# longer string "not_c".  Only the first two use negative indices, despite
# the chunk's title.
451  word=abcま          
452  word[-1]=
453  print $word
454  word=abcま 
455  word[-2]=
456  print $word
457  word=abcま 
458  word[4]=d
459  print $word
460  word=abcま 
461  word[3]=not_c
462  print $word  
4630:assignment with negative indices
464>abc
465>abま
466>abcd
467>abnot_cま
468