1# Copyright (c) 2012-2014 International Business Machines
2# Corporation and others. All Rights Reserved.
3#
4# This file should be in UTF-8 with a signature byte sequence ("BOM").
5#
6# collationtest.txt: Collation test data.
7#
8# created on: 2012apr13
9# created by: Markus W. Scherer
10
11# A line with "** test: description" is used for verbose and error output.
12
13# A collator can be set with "@ root" or "@ locale language-tag",
14# for example "@ locale de-u-co-phonebk".
15
16# A collator can be built with "@ rules".
17# An "@ rules" line is followed by one or more lines with the tailoring rules.
18
19# A collator can be modified with "% attribute=value".
20
21# "* compare" tests the order (= or <) of the following strings.
22# The relation can be "=" or "<" (the level of the difference is not specified)
23# or "<1", "<2", "<c", "<3", "<4", "<i" (indicating the level of the difference; "<i" is the identical level).
24
25# Test sections ("* compare") are terminated by
26# definitions of new collators, changing attributes, or new test sections.
27
28** test: simple CEs & expansions
29# Many types of mappings are tested elsewhere, including via the UCA conformance tests.
30# Here we mostly cover a few unusual mappings.
31@ rules
32&\x01                           # most control codes are ignorable
33<<<\u0300                       # tertiary CE
34&9<\x00                         # NUL not ignorable
35&\uA00A\uA00B=\uA002            # two long-primary CEs
36&\uA00A\uA00B\u00050005=\uA003  # three CEs, require 64 bits
37
38* compare
39=  \x01
40=  \x02
41<3 \u0300
42<1 9
43<1 \x00
44=  \x01\x00\x02
45<1 a
46<3 a\u0300
47<2 a\u0308
48=  ä
49<1 b
50<1 か        # Hiragana Ka (U+304B)
51<2 か\u3099  # plus voiced sound mark
52=  が        # Hiragana Ga (U+304C)
53<1 \uA00A\uA00B
54=  \uA002
55<1 \uA00A\uA00B\u00050004
56<1 \uA00A\uA00B\u00050005
57=  \uA003
58<1 \uA00A\uA00B\u00050006
59
60** test: contractions
61# Create some interesting mappings, and map some normalization-inert characters
62# (which are not subject to canonical reordering)
63# to some of the same CEs to check the sequence of CEs.
64@ rules
65
66# Contractions starting with 'a' should not continue with any character < U+0300
67# so that we can test a shortcut for that.
68&a=ⓐ
69&b<bz=ⓑ
70&d<dz\u0301=ⓓ           # d+z+acute
71&z
72<a\u0301=Ⓐ              # a+acute sorts after z
73<a\u0301\u0301=Ⓑ        # a+acute+acute
74<a\u0301\u0301\u0358=Ⓒ  # a+acute+acute+dot above right
75<a\u030a=Ⓓ              # a+ring
76<a\u0323=Ⓔ              # a+dot below
77<a\u0323\u0358=Ⓕ        # a+dot below+dot above right
78<a\u0327\u0323\u030a=Ⓖ  # a+cedilla+dot below+ring
79<a\u0327\u0323bz=Ⓗ      # a+cedilla+dot below+b+z
80
81&\U0001D158=⁰           # musical notehead black (has a symbol primary)
82<\U0001D158\U0001D165=¼ # musical quarter note
83
84# deliberately missing prefix contractions:
85# dz
86# a\u0327
87# a\u0327\u0323
88# a\u0327\u0323b
89
90&\x01
91<<<\U0001D165=¹         # musical stem (ccc=216)
92<<<\U0001D16D=²         # musical augmentation dot (ccc=226)
93<<<\U0001D165\U0001D16D=³  # stem+dot (ccc=216 226)
94&\u0301=❶               # acute (ccc=230)
95&\u030a=❷               # ring (ccc=230)
96&\u0308=❸               # diaeresis (ccc=230)
97<<\u0308\u0301=❹        # diaeresis+acute (=dialytika tonos) (ccc=230 230)
98&\u0327=❺               # cedilla (ccc=202)
99&\u0323=❻               # dot below (ccc=220)
100&\u0331=❼               # macron below (ccc=220)
101<<\u0331\u0358=❽        # macron below+dot above right (ccc=220 232)
102&\u0334=❾               # tilde overlay (ccc=1)
103&\u0358=❿               # dot above right (ccc=232)
104
105&\u0f71=①               # tibetan vowel sign aa
106&\u0f72=②               # tibetan vowel sign i
107#  \u0f71\u0f72         # tibetan vowel sign aa + i = ii = U+0F73
108&\u0f73=③               # tibetan vowel sign ii (ccc=0 but lccc=129)
109
110** test: simple contractions
111
112# Some strings are chosen to cause incremental contiguous contraction matching to
113# go into partial matches for prefixes of contractions
114# (where the prefixes are deliberately not also contractions).
115# When there is no complete match, then the matching code must back out of those
116# so that discontiguous contractions work as specified.
117
118* compare
119# contraction starter with no following text, or mismatch, or blocked
120<1 a
121=  ⓐ
122<1 aa
123=  ⓐⓐ
124<1 ab
125=  ⓐb
126<1 az
127=  ⓐz
128
129* compare
130<1 a
131<2 a\u0308\u030a  # ring blocked by diaeresis
132=  ⓐ❸❷
133<2 a\u0327
134=  ⓐ❺
135
136* compare
137<2 \u0308
138=  ❸
139<2 \u0308\u030a\u0301  # acute blocked by ring
140=  ❸❷❶
141
142* compare
143<1 \U0001D158
144=  ⁰
145<1 \U0001D158\U0001D165
146=  ¼
147
148# no discontiguous contraction because of missing prefix contraction d+z,
149# and a starter ('z') after the 'd'
150* compare
151<1 dz\u0323\u0301
152=  dz❻❶
153
154# contiguous contractions
155* compare
156<1 abz
157=  ⓐⓑ
158<1 abzz
159=  ⓐⓑz
160
161* compare
162<1 a
163<1 z
164<1 a\u0301
165=  Ⓐ
166<1 a\u0301\u0301
167=  Ⓑ
168<1 a\u0301\u0301\u0358
169=  Ⓒ
170<1 a\u030a
171=  Ⓓ
172<1 a\u0323\u0358
173=  Ⓕ
174<1 a\u0327\u0323\u030a  # match despite missing prefix
175=  Ⓖ
176<1 a\u0327\u0323bz
177=  Ⓗ
178
179* compare
180<2 \u0308\u0308\u0301  # acute blocked from first diaeresis, contracts with second
181=  ❸❹
182
183* compare
184<1 \U0001D158\U0001D165
185=  ¼
186
187* compare
188<3 \U0001D165\U0001D16D
189=  ³
190
191** test: discontiguous contractions
192* compare
193<1 a\u0327\u030a                # a+ring skips cedilla
194=  Ⓓ❺
195<2 a\u0327\u0327\u030a          # a+ring skips 2 cedillas
196=  Ⓓ❺❺
197<2 a\u0327\u0327\u0327\u030a    # a+ring skips 3 cedillas
198=  Ⓓ❺❺❺
199<2 a\u0334\u0327\u0327\u030a    # a+ring skips tilde overlay & 2 cedillas
200=  Ⓓ❾❺❺
201<1 a\u0327\u0323                # a+dot below skips cedilla
202=  Ⓔ❺
203<1 a\u0323\u0301\u0358          # a+dot below+dot ab.r.: 2-char match, then skips acute
204=  Ⓕ❶
205<2 a\u0334\u0323\u0358          # a+dot below skips tilde overlay
206=  Ⓕ❾
207
208* compare
209<2 \u0331\u0331\u0358           # macron below+dot ab.r. skips the second macron below
210=  ❽❼
211
212* compare
213<1 a\u0327\u0331\u0323\u030a    # a+ring skips cedilla, macron below, dot below (dot blocked by macron)
214=  Ⓓ❺❼❻
215<1 a\u0327\u0323\U0001D16D\u030a  # a+dot below skips cedilla
216=  Ⓔ❺²❷
217<2 a\u0327\u0327\u0323\u030a    # a+dot below skips 2 cedillas
218=  Ⓔ❺❺❷
219<2 a\u0327\u0323\u0323\u030a    # a+dot below skips cedilla
220=  Ⓔ❺❻❷
221<2 a\u0334\u0327\u0323\u030a    # a+dot below skips tilde overlay & cedilla
222=  Ⓔ❾❺❷
223
224* compare
225<1 \U0001D158\u0327\U0001D165   # quarter note skips cedilla
226=  ¼❺
227<1 a\U0001D165\u0323            # a+dot below skips stem
228=  Ⓔ¹
229
230# partial contiguous match, backs up, matches discontiguous contraction
231<1 a\u0327\u0323b
232=  Ⓔ❺b
233<1 a\u0327\u0323ba
234=  Ⓔ❺bⓐ
235
236# a+acute+acute+dot above right skips cedilla, continues matching 2 same-ccc combining marks
237* compare
238<1 a\u0327\u0301\u0301\u0358
239=  Ⓒ❺
240
241# FCD but not NFD
242* compare
243<1 a\u0f73\u0301                # a+acute skips tibetan ii
244=  Ⓐ③
245
246# FCD but the 0f71 inside the 0f73 must be skipped
247# to match the discontiguous contraction of the first 0f71 with the trailing 0f72 inside the 0f73
248* compare
249<1 \u0f71\u0f73                 # == \u0f73\u0f71 == \u0f71\u0f71\u0f72
250=  ③①
251
252** test: discontiguous contractions with nested contractions
253* compare
254<1 a\u0323\u0308\u0301\u0358
255=  Ⓕ❹
256<2 a\u0323\u0308\u0301\u0308\u0301\u0358
257=  Ⓕ❹❹
258
259** test: discontiguous contractions with interleaved contractions
260* compare
261# a+ring & cedilla & macron below+dot above right
262<1 a\u0327\u0331\u030a\u0358
263=  Ⓓ❺❽
264
265# a+ring & 1x..3x macron below+dot above right
266<2 a\u0331\u030a\u0358
267=  Ⓓ❽
268<2 a\u0331\u0331\u030a\u0358\u0358
269=  Ⓓ❽❽
270# also skips acute
271<2 a\u0331\u0331\u0331\u030a\u0301\u0358\u0358\u0358
272=  Ⓓ❽❽❽❶
273
274# a+dot below & stem+augmentation dot, followed by contiguous d+z+acute
275<1 a\U0001D165\u0323\U0001D16Ddz\u0301
276=  Ⓔ³ⓓ
277
278** test: some simple string comparisons
279@ root
280* compare
281# first string compares against ""
282= \u0000
283< a
284<1 b
285<3 B
286= \u0000B\u0000
287
288** test: compare with strength=primary
289% strength=primary
290* compare
291<1 a
292<1 b
293= B
294
295** test: compare with strength=secondary
296% strength=secondary
297* compare
298<1 a
299<1 b
300= B
301
302** test: compare with strength=tertiary
303% strength=tertiary
304* compare
305<1 a
306<1 b
307<3 B
308
309** test: compare with strength=quaternary
310% strength=quaternary
311* compare
312<1 a
313<1 b
314<3 B
315
316** test: compare with strength=identical
317% strength=identical
318* compare
319<1 a
320<1 b
321<3 B
322
323** test: côté with forwards secondary
324@ root
325* compare
326<1 cote
327<2 coté
328<2 côte
329<2 côté
330
331** test: côté with forwards secondary vs. U+FFFE merge separator
332# Merged sort keys: On each level, any difference in the first segment
333# must trump any further difference.
334* compare
335<1 cote\uFFFEcôté
336<2 coté\uFFFEcôte
337<2 côte\uFFFEcoté
338<2 côté\uFFFEcote
339
340** test: côté with backwards secondary
341% backwards=on
342* compare
343<1 cote
344<2 côte
345<2 coté
346<2 côté
347
348** test: côté with backwards secondary vs. U+FFFE merge separator
349# Merged sort keys: On each level, any difference in the first segment
350# must trump any further difference.
351* compare
352<1 cote\uFFFEcôté
353<2 côte\uFFFEcoté
354<2 coté\uFFFEcôte
355<2 côté\uFFFEcote
356
357** test: U+FFFE on identical level
358@ root
359% strength=identical
360* compare
361# All of these control codes are completely-ignorable, so that
362# their low code points are compared with the merge separator.
363# The merge separator must compare less than any other character.
364<1 \uFFFE\u0001\u0002\u0003
365<i \u0001\uFFFE\u0002\u0003
366<i \u0001\u0002\uFFFE\u0003
367<i \u0001\u0002\u0003\uFFFE
368
369* compare
370# The merge separator must even compare less than U+0000.
371<1 \uFFFE\u0000\u0000
372<i \u0000\uFFFE\u0000
373<i \u0000\u0000\uFFFE
374
375** test: Hani < surrogates < U+FFFD
376# Note: compareUTF8() treats unpaired surrogates like U+FFFD.
377# With that API, the strings with unpaired surrogates therefore compare equal to each other
378# and equal to the string with U+FFFD.
379@ root
380% strength=identical
381* compare
382<1 abz
383<1 a\u4e00z
384<1 a\U00020000z
385<1 a\ud800z
386<1 a\udbffz
387<1 a\udc00z
388<1 a\udfffz
389<1 a\ufffdz
390
391** test: script reordering
392@ root
393% reorder Hani Zzzz digit
394* compare
395<1 ?
396<1 +
397<1 丂
398<1 a
399<1 α
400<1 5
401
402% reorder default
403* compare
404<1 ?
405<1 +
406<1 5
407<1 a
408<1 α
409<1 丂
410
411** test: empty rules
412@ rules
413* compare
414<1 a
415<2 ä
416<3 Ä
417<1 b
418
419** test: very simple rules
420@ rules
421&a=e<<<<q<<<<r<x<<<X<<y<<<Y;z,Z
422% strength=quaternary
423* compare
424<1 a
425=  e
426<4 q
427<4 r
428<1 x
429<3 X
430<2 y
431<3 Y
432<2 z
433<3 Z
434
435** test: tailoring twice before a root position: primary
436@ rules
437&[before 1]b<p
438&[before 1]b<q
439* compare
440<1 a
441<1 p
442<1 q
443<1 b
444
445** test: tailoring twice before a root position: secondary
446@ rules
447&[before 2]ſ<<p
448&[before 2]ſ<<q
449* compare
450<1 s
451<2 p
452<2 q
453<2 ſ
454
455# secondary-before common weight
456@ rules
457&[before 2]b<<p
458&[before 2]b<<q
459* compare
460<1 a
461<1 p
462<2 q
463<2 b
464
465** test: tailoring twice before a root position: tertiary
466@ rules
467&[before 3]B<<<p
468&[before 3]B<<<q
469* compare
470<1 b
471<3 p
472<3 q
473<3 B
474
475# tertiary-before common weight
476@ rules
477&[before 3]b<<<p
478&[before 3]b<<<q
479* compare
480<1 a
481<1 p
482<3 q
483<3 b
484
485@ rules
486&[before 2]b<<s
487&[before 3]s<<<p
488&[before 3]s<<<q
489* compare
490<1 a
491<1 p
492<3 q
493<3 s
494<2 b
495
496** test: tailor after completely ignorable
497@ rules
498&\x00<<<x<<y
499* compare
500= \x00
501= \x1F
502<3 x
503<2 y
504
505** test: secondary tailoring gaps, ICU ticket 9362
506@ rules
507&[before 2]s<<'_'
508&s<<r  # secondary between s and ſ (long s)
509&ſ<<*a-q  # more than 15 between ſ and secondary CE boundary
510&[before 2][first primary ignorable]<<u<<v  # between secondary CE boundary & lowest secondary CE
511&[last primary ignorable]<<y<<z
512
513* compare
514<2 u
515<2 v
516<2 \u0332  # lowest secondary CE
517<2 \u0308
518<2 y
519<2 z
520<1 s_
521<2 ss
522<2 sr
523<2 sſ
524<2 sa
525<2 sb
526<2 sp
527<2 sq
528<2 sus
529<2 svs
530<2 rs
531
532** test: tertiary tailoring gaps, ICU ticket 9362
533@ rules
534&[before 3]t<<<'_'
535&t<<<r  # tertiary between t and fullwidth t
536&ᵀ<<<*a-q  # more than 15 between ᵀ (modifier letter T) and tertiary CE boundary
537&[before 3][first secondary ignorable]<<<u<<<v  # between tertiary CE boundary & lowest tertiary CE
538&[last secondary ignorable]<<<y<<<z
539
540* compare
541<3 u
542<3 v
543# Note: The root collator currently does not map any characters to tertiary CEs.
544<3 y
545<3 z
546<1 t_
547<3 tt
548<3 tr
549<3 tｔ  # fullwidth t (U+FF54)
550<3 tᵀ
551<3 ta
552<3 tb
553<3 tp
554<3 tq
555<3 tut
556<3 tvt
557<3 rt
558
559** test: secondary & tertiary around root character
560@ rules
561&[before 2]m<<r
562&m<<s
563&[before 3]m<<<u
564&m<<<v
565* compare
566<1 l
567<1 r
568<2 u
569<3 m
570<3 v
571<2 s
572<1 n
573
574** test: secondary & tertiary around tailored item
575@ rules
576&m<x
577&[before 2]x<<r
578&x<<s
579&[before 3]x<<<u
580&x<<<v
581* compare
582<1 m
583<1 r
584<2 u
585<3 x
586<3 v
587<2 s
588<1 n
589
590** test: more nesting of secondary & tertiary before
591@ rules
592&[before 3]m<<<u
593&[before 2]m<<r
594&[before 3]r<<<q
595&m<<<w
596&m<<t
597&[before 3]w<<<v
598&w<<<x
599&w<<s
600* compare
601<1 l
602<1 q
603<3 r
604<2 u
605<3 m
606<3 v
607<3 w
608<3 x
609<2 s
610<2 t
611<1 n
612
613** test: case bits
614@ rules
615&w<x  # tailored CE getting case bits
616  =uv=uV=Uv=UV  # 2 chars -> 1 CE
617&ae=ch=cH=Ch=CH  # 2 chars -> 2 CEs
618&rst=yz=yZ=Yz=YZ  # 2 chars -> 3 CEs
619% caseFirst=lower
620* compare
621<1 ae
622=  ch
623<3 cH
624<3 Ch
625<3 CH
626<1 rst
627=  yz
628<3 yZ
629<3 Yz
630<3 YZ
631<1 w
632<1 x
633=  uv
634<3 uV
635=  Uv  # mixed case on single CE cannot distinguish variations
636<3 UV
637
638** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=lower
639@ rules
640&\u0001<<<t<<<T  # tertiary CEs
641% caseFirst=lower
642* compare
643<1 aa
644<3 aat
645<3 aaT
646<3 aA
647<3 aAt
648<3 ata
649<3 aTa
650
651** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=upper
652% caseFirst=upper
653* compare
654<1 aA
655<3 aAt
656<3 aa
657<3 aat
658<3 aaT
659<3 ata
660<3 aTa
661
662** test: reset on expansion, ICU tickets 9415 & 9593
663@ rules
664&æ<x    # tailor the last primary CE so that x sorts between ae and af
665&æb=bæ  # copy all reset CEs to make bæ sort the same
666&각<h    # copy/tailor 3 CEs to make h sort before the next Hangul syllable 갂
667&⒀<<y   # copy/tailor 4 CEs to make y sort with only a secondary difference
668&l·=z   # handle the pre-context for · when fetching reset CEs
669   <<u  # copy/tailor 2 CEs
670
671* compare
672<1 ae
673<2 æ
674<1 x
675<1 af
676
677* compare
678<1 aeb
679<2 æb
680=  bæ
681
682* compare
683<1 각
684<1 h
685<1 갂
686<1 갃
687
688* compare
689<1 ·    # by itself: primary CE
690<1 l
691<2 l·   # l+middle dot has only a secondary difference from l
692=  z
693<2 u
694
695* compare
696<1 (13)
697<3 ⒀  # DUCET sets special tertiary weights in all CEs
698<2 y
699<1 (13[
700
701% alternate=shifted
702* compare
703<1 (13)
704=  13
705<3 ⒀
706=  y  # alternate=shifted removes the tailoring difference on the last CE
707<1 14
708
709** test: contraction inside extension, ICU ticket 9378
710@ rules
711&а<<х/й     # all letters are Cyrillic
712* compare
713<1 ай
714<2 х
715
716** test: no duplicate tailored CEs for different reset positions with same CEs, ICU ticket 10104
717@ rules
718&t<x &ᵀ<y           # same primary weights
719&q<u &[before 1]ꝗ<v # q and ꝗ are primary adjacent
720* compare
721<1 q
722<1 u
723<1 v
724<1 ꝗ
725<1 t
726<3 ᵀ
727<1 y
728<1 x
729
730# Principle: Each rule builds on the state of preceding rules and ignores following rules.
731
732** test: later rule does not affect earlier reset position, ICU ticket 10105
733@ rules
734&a < u < v < w  &ov < x  &b < v
735* compare
736<1 oa
737<1 ou
738<1 x    # CE(o) followed by CE between u and w
739<1 ow
740<1 ob
741<1 ov
742
743** test: later rule does not affect earlier extension (1), ICU ticket 10105
744@ rules
745&a=x/b &v=b
746% strength=secondary
747* compare
748<1 B
749<1 c
750<1 v
751=  b
752* compare
753<1 AB
754=  x
755<1 ac
756<1 av
757=  ab
758
759** test: later rule does not affect earlier extension (2), ICU ticket 10105
760@ rules
761&a <<< c / e &g <<< e / l
762% strength=secondary
763* compare
764<1 AE
765=  c
766<2 æ
767<1 agl
768=  ae
769
770** test: later rule does not affect earlier extension (3), ICU ticket 10105
771@ rules
772&a = b / c  &d = c / e
773% strength=secondary
774* compare
775<1 AC  # C is still only tertiary different from the original c
776=  b
777<1 ade
778=  ac
779
780** test: extension contains tailored character, ICU ticket 10105
781@ rules
782&a=e &b=u/e
783* compare
784<1 a
785=  e
786<1 ba
787=  be
788=  u
789
790** test: add simple mappings for characters with root context
791@ rules
792&z=·    # middle dot has a prefix mapping in the CLDR root
793&n=и    # и (U+0438) has contractions in the root
794* compare
795<1 l
796<2 l·   # root mapping for l|· still works
797<1 z
798=  ·
799* compare
800<1 n
801=  и
802<1 И
803<1 и\u0306  # root mapping for й=и\u0306 still works
804=  й
805<3 Й
806
807** test: add context mappings around characters with root context
808@ rules
809&z=·h   # middle dot has a prefix mapping in the CLDR root
810&n=ә|и  # и (U+0438) has contractions in the root
811* compare
812<1 l
813<2 l·   # root mapping for l|· still works
814<1 z
815=  ·h
816* compare
817<1 и
818<3 И
819<1 и\u0306  # root mapping for й=и\u0306 still works
820=  й
821* compare
822<1 әn
823=  әи
824<1 әo
825
826** test: many secondary CEs at the top of their range
827@ rules
828&[last primary ignorable]<<*\u2801-\u28ff
829* compare
830<2 \u0308
831<2 \u2801
832<2 \u2802
833<2 \u2803
834<2 \u2804
835<2 \u28fd
836<2 \u28fe
837<2 \u28ff
838<1 \x20
839
840** test: many tertiary CEs at the top of their range
841@ rules
842&[last secondary ignorable]<<<*a-z
843* compare
844<3 a
845<3 b
846<3 c
847<3 d
848# e..w
849<3 x
850<3 y
851<3 z
852<2 \u0308
853
854** test: tailor contraction together with nearly equivalent prefix, ICU ticket 10101
855@ rules
856&a=p|x &b=px &c=op
857* compare
858<1 b
859=  px
860<3 B
861<1 c
862=  op
863<3 C
864* compare
865<1 ca
866=  opx  # first contraction op, then prefix p|x
867<3 cA
868<3 Ca
869
870** test: reset position with prefix (pre-context), ICU ticket 10102
871@ rules
872&a=p|x &px=y
873* compare
874<1 pa
875=  px
876=  y
877<3 pA
878<1 q
879<1 x
880
881** test: prefix+contraction together (1), ICU ticket 10071
882@ rules
883&x=a|bc
884* compare
885<1 ab
886<1 Abc
887<1 abd
888<1 ac
889<1 aw
890<1 ax
891=  abc
892<3 aX
893<3 Ax
894<1 b
895<1 bb
896<1 bc
897<3 bC
898<3 Bc
899<1 bd
900
901** test: prefix+contraction together (2), ICU ticket 10071
902@ rules
903&w=bc &x=a|b
904* compare
905<1 w
906=  bc
907<3 W
908* compare
909<1 aw
910<1 ax
911=  ab
912<3 aX
913<1 axb
914<1 axc
915=  abc  # prefix match a|b takes precedence over contraction match bc
916<3 abC
917<1 abd
918<1 ay
919
920** test: prefix+contraction together (3), ICU ticket 10071
921@ rules
922&x=a|b &w=bc    # same rules as the previous test but in reverse order; the order should not matter here
923* compare       # same "compare" sequences as previous test
924<1 w
925=  bc
926<3 W
927* compare
928<1 aw
929<1 ax
930=  ab
931<3 aX
932<1 axb
933<1 axc
934=  abc  # prefix match a|b takes precedence over contraction match bc
935<3 abC
936<1 abd
937<1 ay
938
939** test: no mapping p|c, falls back to contraction ch, CLDR ticket 5962
940@ rules
941&d=ch &v=p|ci
942* compare
943<1 pc
944<3 pC
945<1 pcH
946<1 pcI
947<1 pd
948=  pch  # no-prefix contraction ch matches
949<3 pD
950<1 pv
951=  pci  # prefix+contraction p|ci matches
952<3 pV
953
954** test: tailor in & around compact ranges of root primaries
955# The Ogham characters U+1681..U+169A are in simple ascending order of primary CEs
956# which should be reliably encoded as one range in the root elements data.
957@ rules
958&[before 1]ᚁ<a
959&ᚁ<b
960&[before 1]ᚂ<c
961&ᚂ<d
962&[before 1]ᚚ<y
963&ᚚ<z
964&[before 2]ᚁ<<r
965&ᚁ<<s
966&[before 3]ᚚ<<<t
967&ᚚ<<<u
968* compare
969<1 ᣵ    # U+18F5 last Canadian Aboriginal
970<1 a
971<1 r
972<2 ᚁ
973<2 s
974<1 b
975<1 c
976<1 ᚂ
977<1 d
978<1 ᚃ
979<1 ᚙ
980<1 y
981<1 t
982<3 ᚚ
983<3 u
984<1 z
985<1 ᚠ    # U+16A0 first Runic
986
987** test: suppressContractions
988@ rules
989&z<ch<әж [suppressContractions [·cә]]
990* compare
991<1 ch
992<3 cH   # ch was suppressed
993<1 l
994<1 l·   # primary difference, not secondary, because l|· was suppressed
995<1 ә
996<2 ә\u0308  # secondary difference, not primary, because contractions for ә were suppressed
997<1 әж
998<3 әЖ
999
1000** test: Hangul & Jamo
1001@ rules
1002&L=\u1100  # first Jamo L
1003&V=\u1161  # first Jamo V
1004&T=\u11A8  # first Jamo T
1005&\uAC01<<*\u4E00-\u4EFF  # first Hangul LVT syllable & lots of secondary diffs
1006* compare
1007<1 Lv
1008<3 LV
1009=  \u1100\u1161
1010=  \uAC00
1011<1 LVt
1012<3 LVT
1013=  \u1100\u1161\u11A8
1014=  \uAC00\u11A8
1015=  \uAC01
1016<2 LVT\u0308
1017<2 \u4E00
1018<2 \u4E01
1019<2 \u4E80
1020<2 \u4EFF
1021<2 LV\u0308T
1022<1 \uAC02
1023
1024** test: adjust special reset positions according to previous rules, CLDR ticket 6070
1025@ rules
1026&[last variable]<x
1027[maxVariable space]  # has effect only after building, no effect on following rules
1028&[last variable]<y
1029&[before 1][first regular]<z
1030* compare
1031<1 ?  # some punctuation
1032<1 x
1033<1 y
1034<1 z
1035<1 $  # some symbol
1036
1037@ rules
1038&[last primary ignorable]<<x<<<y
1039&[last primary ignorable]<<z
1040* compare
1041<2 \u0358
1042<2 x
1043<3 y
1044<2 z
1045<1 \x20
1046
1047@ rules
1048&[last secondary ignorable]<<<x
1049&[last secondary ignorable]<<<y
1050* compare
1051<3 x
1052<3 y
1053<2 \u0358
1054
1055@ rules
1056&[before 2][first variable]<<z
1057&[before 2][first variable]<<y
1058&[before 3][first variable]<<<x
1059&[before 3][first variable]<<<w
1060&[before 1][first variable]<v
1061&[before 2][first variable]<<u
1062&[before 3][first variable]<<<t
1063&[before 2]\uFDD1\xA0<<s  # FractionalUCA.txt: FDD1 00A0, SPACE first primary
1064* compare
1065<2 \u0358
1066<1 s
1067<2 \uFDD1\xA0
1068<1 t
1069<3 u
1070<2 v
1071<1 w
1072<3 x
1073<3 y
1074<2 z
1075<2 \t
1076
1077@ rules
1078&[before 2][first regular]<<z
1079&[before 3][first regular]<<<y
1080&[before 1][first regular]<x
1081&[before 3][first regular]<<<w
1082&[before 2]\uFDD1\u263A<<v  # FractionalUCA.txt: FDD1 263A, SYMBOL first primary
1083&[before 3][first regular]<<<u
1084&[before 1][first regular]<p  # primary before the boundary: becomes variable
1085&[before 3][first regular]<<<t  # not affected by p
1086&[last variable]<q              # after p!
1087* compare
1088<1 ?
1089<1 p
1090<1 q
1091<1 t
1092<3 u
1093<3 v
1094<1 w
1095<3 x
1096<1 y
1097<3 z
1098<1 $
1099
1100# check that p & q are indeed variable
1101% alternate=shifted
1102* compare
1103=  ?
1104=  p
1105=  q
1106<1 t
1107<3 u
1108<3 v
1109<1 w
1110<3 x
1111<1 y
1112<3 z
1113<1 $
1114
1115@ rules
1116&[before 2][first trailing]<<z
1117&[before 1][first trailing]<y
1118&[before 3][first trailing]<<<x
1119* compare
1120<1 \u4E00  # first Han, first implicit
1121<1 \uFDD1\uFDD0  # FractionalUCA.txt: unassigned first primary
1122# Note: The root collator currently does not map any characters to the trailing first boundary primary.
1123<1 x
1124<3 y
1125<1 z
1126<2 \uFFFD  # The root collator currently maps U+FFFD to the first real trailing primary.
1127
1128@ rules
1129&[before 2][first primary ignorable]<<z
1130&[before 2][first primary ignorable]<<y
1131&[before 3][first primary ignorable]<<<x
1132&[before 3][first primary ignorable]<<<w
1133* compare
1134=  \x01
1135<2 w
1136<3 x
1137<3 y
1138<2 z
1139<2 \u0301
1140
1141@ rules
1142&[before 3][first secondary ignorable]<<<y
1143&[before 3][first secondary ignorable]<<<x
1144* compare
1145=  \x01
1146<3 x
1147<3 y
1148<2 \u0301
1149
1150** test: canonical closure
1151@ rules
1152&X=A &U=Â
1153* compare
1154<1 U
1155=  Â
1156=  A\u0302
1157<2 Ú  # U with acute
1158=  U\u0301
1159=  Ấ  # A with circumflex & acute
1160=  Â\u0301
1161=  A\u0302\u0301
1162<1 X
1163=  A
1164<2 X\u030A  # with ring above
1165=  Å
1166=  A\u030A
1167=  \u212B  # Angstrom sign
1168
1169@ rules
1170&x=\u5140\u55C0
1171* compare
1172<1 x
1173=  \u5140\u55C0
1174=  \u5140\uFA0D
1175=  \uFA0C\u55C0
1176=  \uFA0C\uFA0D  # CJK compatibility characters
1177<3 X
1178
1179# canonical closure on prefix rules, ICU ticket 9444
1180@ rules
1181&x=ä|ŝ
1182* compare
1183<1 äs  # not tailored
1184<1 äx
1185=  äŝ
1186=  a\u0308s\u0302
1187=  a\u0308ŝ
1188=  äs\u0302
1189<3 äX
1190
1191** test: conjoining Jamo map to expansions
1192@ rules
1193&gg=\u1101  # Jamo Lead consonant GG
1194&nj=\u11AC  # Jamo Trail consonant NJ
1195* compare
1196<1 gg\u1161nj
1197=  \u1101\u1161\u11AC
1198=  \uAE4C\u11AC
1199=  \uAE51
1200<3 gg\u1161nJ
1201<1 \u1100\u1100
1202
1203** test: canonical tail closure, ICU ticket 5913
1204@ rules
1205&a<â
1206* compare
1207<1 a
1208<1 â              # tailored
1209=  a\u0302
1210<2 a\u0323\u0302  # discontiguous contraction
1211=  ạ\u0302        # equivalent
1212=  ậ              # equivalent
1213<1 b
1214
1215@ rules
1216&a<ạ
1217* compare
1218<1 a
1219<1 ạ              # tailored
1220=  a\u0323
1221<2 a\u0323\u0302  # contiguous contraction plus extra diacritic
1222=  ạ\u0302        # equivalent
1223=  ậ              # equivalent
1224<1 b
1225
1226# Tail closure should work even if there is a prefix and/or contraction.
1227@ rules
1228&a<\u5140|câ
1229# In order to find discontiguous contractions for \u5140|câ
1230# there must exist a mapping for \u5140|ca, regardless of what it maps to.
1231# (This follows from the UCA spec.)
1232&x=\u5140|ca
1233* compare
1234<1 \u5140a
1235=  \uFA0Ca
1236<1 \u5140câ              # tailored
1237=  \uFA0Ccâ
1238=  \u5140ca\u0302
1239=  \uFA0Cca\u0302
1240<2 \u5140ca\u0323\u0302  # discontiguous contraction
1241=  \uFA0Cca\u0323\u0302
1242=  \u5140cạ\u0302
1243=  \uFA0Ccạ\u0302
1244=  \u5140cậ
1245=  \uFA0Ccậ
1246<1 \u5140b
1247=  \uFA0Cb
1248<1 \u5140x
1249=  \u5140ca
1250
1251# Double-check that without the extra mapping there will be no discontiguous match.
1252@ rules
1253&a<\u5140|câ
1254* compare
1255<1 \u5140a
1256=  \uFA0Ca
1257<1 \u5140câ              # tailored
1258=  \uFA0Ccâ
1259=  \u5140ca\u0302
1260=  \uFA0Cca\u0302
1261<1 \u5140b
1262=  \uFA0Cb
1263<1 \u5140ca\u0323\u0302  # no discontiguous contraction
1264=  \uFA0Cca\u0323\u0302
1265=  \u5140cạ\u0302
1266=  \uFA0Ccạ\u0302
1267=  \u5140cậ
1268=  \uFA0Ccậ
1269
1270@ rules
1271&a<cạ
1272* compare
1273<1 a
1274<1 cạ              # tailored
1275=  ca\u0323
1276<2 ca\u0323\u0302  # contiguous contraction plus extra diacritic
1277=  cạ\u0302        # equivalent
1278=  cậ              # equivalent
1279<1 b
1280
1281# ᾢ = U+1FA2 GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
1282#   = 03C9 0313 0300 0345
1283# ccc = 0, 230, 230, 240
1284@ rules
1285&δ=αῳ
1286# In order to find discontiguous contractions for αῳ
1287# there must exist a mapping for αω, regardless of what it maps to.
1288# (This follows from the UCA spec.)
1289&ε=αω
1290* compare
1291<1 δ
1292=  αῳ
1293=  αω\u0345
1294<2 αω\u0313\u0300\u0345  # discontiguous contraction
1295=  αὠ\u0300\u0345
1296=  αὢ\u0345
1297=  αᾢ
1298<2 αω\u0300\u0313\u0345
1299=  αὼ\u0313\u0345
1300=  αῲ\u0313  # not FCD
1301<1 ε
1302=  αω
1303
1304# Double-check that without the extra mapping there will be no discontiguous match.
1305@ rules
1306&δ=αῳ
1307* compare
1308<1 αω\u0313\u0300\u0345  # no discontiguous contraction
1309=  αὠ\u0300\u0345
1310=  αὢ\u0345
1311=  αᾢ
1312<2 αω\u0300\u0313\u0345
1313=  αὼ\u0313\u0345
1314=  αῲ\u0313  # not FCD
1315<1 δ
1316=  αῳ
1317=  αω\u0345
1318
1319# Add U+0315 COMBINING COMMA ABOVE RIGHT which has ccc=232.
1320# Tests code paths where the tailored string has a combining mark
1321# that does not occur in any composite's decomposition.
1322@ rules
1323&δ=αὼ\u0315
1324* compare
1325<1 αω\u0313\u0300\u0315  # Not tailored: The grave accent blocks the comma above.
1326=  αὠ\u0300\u0315
1327=  αὢ\u0315
1328<1 δ
1329=  αὼ\u0315
1330=  αω\u0300\u0315
1331<2 αω\u0300\u0315\u0345
1332=  αὼ\u0315\u0345
1333=  αῲ\u0315  # not FCD
1334
1335** test: danish a+a vs. a-umlaut, ICU ticket 9319
1336@ rules
1337&z<aa
1338* compare
1339<1 z
1340<1 aa
1341<2 aa\u0308
1342=  aä
1343
1344** test: Jamo L with and in prefix
1345# Useful for the Korean "searchjl" tailoring (instead of contractions of pairs of Jamo L).
1346@ rules
1347# Jamo Lead consonant G after G or GG
1348&[last primary ignorable]<<\u1100|\u1100=\u1101|\u1100
1349# Jamo Lead consonant GG sorts like G+G
1350&\u1100\u1100=\u1101
1351# Note: Making G|GG and GG|GG sort the same as G|G+G
1352# would require the ability to reset on G|G+G,
1353# or we could make G-after-G equal to some secondary-CE character,
1354# and reset on a pair of those.
1355# (It does not matter much if there are at most two G in a row in real text.)
1356* compare
1357<1 \u1100
1358<2 \u1100\u1100  # only one primary from a sequence of G lead consonants
1359=  \u1101
1360<2 \u1100\u1100\u1100
1361=  \u1101\u1100
1362# but not = \u1100\u1101, see above
1363<1 \u1100\u1161
1364=  \uAC00
1365<2 \u1100\u1100\u1161
1366=  \u1100\uAC00  # prefix match from the L of the LV syllable
1367=  \u1101\u1161
1368=  \uAE4C
1369
1370** test: proposed Korean "searchjl" tailoring with prefixes, CLDR ticket 6546
1371@ rules
1372# Low secondary CEs for Jamo V & T.
1373# Note: T should sort before V for proper syllable order.
1374&\u0332  # COMBINING LOW LINE (first primary ignorable)
1375<<\u1161<<\u1162
1376
1377# Korean Jamo lead consonant search rules, part 2:
1378# Make modern compound L jamo primary equivalent to non-compound forms.
1379
1380# Secondary CEs for Jamo L-after-L, greater than Jamo V & T.
1381&\u0313  # COMBINING COMMA ABOVE (second primary ignorable)
1382=\u1100|\u1100
1383=\u1103|\u1103
1384=\u1107|\u1107
1385=\u1109|\u1109
1386=\u110C|\u110C
1387
1388# Compound L Jamo map to equivalent expansions of primary+secondary CE.
1389&\u1100\u0313=\u1101<<<\u3132  # HANGUL CHOSEONG SSANGKIYEOK, HANGUL LETTER SSANGKIYEOK
1390&\u1103\u0313=\u1104<<<\u3138  # HANGUL CHOSEONG SSANGTIKEUT, HANGUL LETTER SSANGTIKEUT
1391&\u1107\u0313=\u1108<<<\u3143  # HANGUL CHOSEONG SSANGPIEUP, HANGUL LETTER SSANGPIEUP
1392&\u1109\u0313=\u110A<<<\u3146  # HANGUL CHOSEONG SSANGSIOS, HANGUL LETTER SSANGSIOS
1393&\u110C\u0313=\u110D<<<\u3149  # HANGUL CHOSEONG SSANGCIEUC, HANGUL LETTER SSANGCIEUC
1394
1395* compare
1396<1 \u1100\u1161
1397=  \uAC00
1398<2 \u1100\u1162
1399=  \uAC1C
1400<2 \u1100\u1100\u1161
1401=  \u1100\uAC00
1402=  \u1101\u1161
1403=  \uAE4C
1404<3 \u3132\u1161
1405
1406** test: Hangul syllables in prefix & in the interior of a contraction
1407@ rules
1408&x=\u1100\u1161|a\u1102\u1162z
1409* compare
1410<1 \u1100\u1161x
1411=  \u1100\u1161a\u1102\u1162z
1412=  \u1100\u1161a\uB0B4z
1413=  \uAC00a\u1102\u1162z
1414=  \uAC00a\uB0B4z
1415
1416** test: digits are unsafe-backwards when numeric=on
1417@ root
1418% numeric=on
1419* compare
1420# If digits are not unsafe, then numeric collation sees "1"=="01" and "b">"a".
1421# We need to back up before the identical prefix "1" and compare the full numbers.
1422<1 11b
1423<1 101a
1424
1425** test: simple locale data test
1426@ locale de
1427* compare
1428<1 a
1429<2 ä
1430<1 ae
1431<2 æ
1432
1433@ locale de-u-co-phonebk
1434* compare
1435<1 a
1436<1 ae
1437<2 ä
1438<2 æ
1439
1440# The following test cases were moved here from ICU 52's DataDrivenCollationTest.txt.
1441
1442** test: DataDrivenCollationTest/TestMorePinyin
1443# Testing the primary strength.
1444@ locale zh
1445% strength=primary
1446* compare
1447< lā
1448= lĀ
1449= Lā
1450= LĀ
1451< lān
1452= lĀn
1453< lē
1454= lĒ
1455= Lē
1456= LĒ
1457< lēn
1458= lĒn
1459
1460** test: DataDrivenCollationTest/TestLithuanian
1461# Lithuanian sort order.
1462@ locale lt
1463* compare
1464< cz
1465< č
1466< d
1467< iz
1468< j
1469< sz
1470< š
1471< t
1472< zz
1473< ž
1474
1475** test: DataDrivenCollationTest/TestLatvian
1476# Latvian sort order.
1477@ locale lv
1478* compare
1479< cz
1480< č
1481< d
1482< gz
1483< ģ
1484< h
1485< iz
1486< j
1487< kz
1488< ķ
1489< l
1490< lz
1491< ļ
1492< m
1493< nz
1494< ņ
1495< o
1496< rz
1497< ŗ
1498< s
1499< sz
1500< š
1501< t
1502< zz
1503< ž
1504
1505** test: DataDrivenCollationTest/TestEstonian
1506# Estonian sort order.
1507@ locale et
1508* compare
1509< sy
1510< š
1511< šy
1512< z
1513< zy
1514< ž
1515< v
1516< w
1517< va
1518< õ
1519< õy
1520< ä
1521< äy
1522< ö
1523< öy
1524< ü
1525< üy
1526< x
1527
1528** test: DataDrivenCollationTest/TestAlbanian
1529# Albanian sort order.
1530@ locale sq
1531* compare
1532< cz
1533< ç
1534< d
1535< dz
1536< dh
1537< e
1538< ez
1539< ë
1540< f
1541< gz
1542< gj
1543< h
1544< lz
1545< ll
1546< m
1547< nz
1548< nj
1549< o
1550< rz
1551< rr
1552< s
1553< sz
1554< sh
1555< t
1556< tz
1557< th
1558< u
1559< xz
1560< xh
1561< y
1562< zz
1563< zh
1564
1565** test: DataDrivenCollationTest/TestSimplifiedChineseOrder
1566# Sorted file has different order.
1567@ root
1568# normalization=on turned on & off automatically.
1569* compare
1570< \u5F20
1571< \u5F20\u4E00\u8E3F
1572
1573** test: DataDrivenCollationTest/TestTibetanNormalizedIterativeCrash
1574# This pretty much crashes.
1575@ root
1576* compare
1577< \u0f71\u0f72\u0f80\u0f71\u0f72
1578< \u0f80
1579
1580** test: DataDrivenCollationTest/TestThaiPartialSortKeyProblems
1581# These are examples of strings that caused trouble in partial sort key testing.
1582@ locale th-TH
1583* compare
1584< \u0E01\u0E01\u0E38\u0E18\u0E20\u0E31\u0E13\u0E11\u0E4C
1585< \u0E01\u0E01\u0E38\u0E2A\u0E31\u0E19\u0E42\u0E18
1586* compare
1587< \u0E01\u0E07\u0E01\u0E32\u0E23
1588< \u0E01\u0E07\u0E42\u0E01\u0E49
1589* compare
1590< \u0E01\u0E23\u0E19\u0E17\u0E32
1591< \u0E01\u0E23\u0E19\u0E19\u0E40\u0E0A\u0E49\u0E32
1592* compare
1593< \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E22\u0E27
1594< \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E4A\u0E22\u0E27
1595* compare
1596< \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E2D
1597< \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E49\u0E32
1598
1599** test: DataDrivenCollationTest/TestJavaStyleRule
1600# java.text allows rules to start with a relation, as in '<<<x<<<y...'.
1601# We emulate this by assuming a &[first tertiary ignorable] in this case.
1602@ rules
1603&\u0001=equal<<<z<<x<<<w &[first tertiary ignorable]=a &[first primary ignorable]=b
1604* compare
1605= a
1606= equal
1607< z
1608< x
1609= b  # x had become the new first primary ignorable
1610< w
1611
1612** test: DataDrivenCollationTest/TestShiftedIgnorable
1613# The UCA states that primary ignorables should be completely
1614# ignorable when following a shifted code point.
1615@ root
1616% alternate=shifted
1617% strength=quaternary
1618* compare
1619< a\u0020b
1620= a\u0020\u0300b
1621= a\u0020\u0301b
1622< a_b
1623= a_\u0300b
1624= a_\u0301b
1625< A\u0020b
1626= A\u0020\u0300b
1627= A\u0020\u0301b
1628< A_b
1629= A_\u0300b
1630= A_\u0301b
1631< a\u0301b
1632< A\u0301b
1633< a\u0300b
1634< A\u0300b
1635
1636** test: DataDrivenCollationTest/TestNShiftedIgnorable
1637# The UCA states that primary ignorables should be completely ignorable when following
1638# a shifted code point. Here alternate=non-ignorable, so nothing is shifted and the accents still count.
1639@ root
1640% alternate=non-ignorable
1641% strength=tertiary
1642* compare
1643< a\u0020b
1644< A\u0020b
1645< a\u0020\u0301b
1646< A\u0020\u0301b
1647< a\u0020\u0300b
1648< A\u0020\u0300b
1649< a_b
1650< A_b
1651< a_\u0301b
1652< A_\u0301b
1653< a_\u0300b
1654< A_\u0300b
1655< a\u0301b
1656< A\u0301b
1657< a\u0300b
1658< A\u0300b
1659
1660** test: DataDrivenCollationTest/TestSafeSurrogates
1661# It turned out that surrogates were not skipped properly
1662# when iterating backwards if they were in the middle of a
1663# contraction. This test ensures that this is fixed.
1664@ rules
1665&a < x\ud800\udc00b
1666* compare
1667< a
1668< x\ud800\udc00b
1669
1670** test: DataDrivenCollationTest/da_TestPrimary
1671# This test goes through primary strength cases
1672@ locale da
1673% strength=primary
1674* compare
1675< Lvi
1676< Lwi
1677* compare
1678< L\u00e4vi
1679< L\u00f6wi
1680* compare
1681< L\u00fcbeck
1682= Lybeck
1683
1684** test: DataDrivenCollationTest/da_TestTertiary
1685# This test goes through tertiary strength cases
1686@ locale da
1687% strength=tertiary
1688* compare
1689< Luc
1690< luck
1691* compare
1692< luck
1693< L\u00fcbeck
1694* compare
1695< lybeck
1696< L\u00fcbeck
1697* compare
1698< L\u00e4vi
1699< L\u00f6we
1700* compare
1701< L\u00f6ww
1702< mast
1703
1704* compare
1705< A/S
1706< ANDRE
1707< ANDR\u00c9
1708< ANDREAS
1709< AS
1710< CA
1711< \u00c7A
1712< CB
1713< \u00c7C
1714< D.S.B.
1715< DA
1716< \u00d0A
1717< DB
1718< \u00d0C
1719< DSB
1720< DSC
1721< EKSTRA_ARBEJDE
1722< EKSTRABUD0
1723< H\u00d8ST
1724< HAAG
1725< H\u00c5NDBOG
1726< HAANDV\u00c6RKSBANKEN
1727< Karl
1728< karl
1729< NIELS\u0020J\u00d8RGEN
1730< NIELS-J\u00d8RGEN
1731< NIELSEN
1732< R\u00c9E,\u0020A
1733< REE,\u0020B
1734< R\u00c9E,\u0020L
1735< REE,\u0020V
1736< SCHYTT,\u0020B
1737< SCHYTT,\u0020H
1738< SCH\u00dcTT,\u0020H
1739< SCHYTT,\u0020L
1740< SCH\u00dcTT,\u0020M
1741< SS
1742< \u00df
1743< SSA
1744< STORE\u0020VILDMOSE
1745< STOREK\u00c6R0
1746< STORM\u0020PETERSEN
1747< STORMLY
1748< THORVALD
1749< THORVARDUR
1750< \u00feORVAR\u00d0UR
1751< THYGESEN
1752< VESTERG\u00c5RD,\u0020A
1753< VESTERGAARD,\u0020A
1754< VESTERG\u00c5RD,\u0020B
1755< \u00c6BLE
1756< \u00c4BLE
1757< \u00d8BERG
1758< \u00d6BERG
1759
1760* compare
1761< andere
1762< chaque
1763< chemin
1764< cote
1765< cot\u00e9
1766< c\u00f4te
1767< c\u00f4t\u00e9
1768< \u010du\u010d\u0113t
1769< Czech
1770< hi\u0161a
1771< irdisch
1772< lie
1773< lire
1774< llama
1775< l\u00f5ug
1776< l\u00f2za
1777< lu\u010d
1778< luck
1779< L\u00fcbeck
1780< lye
1781< l\u00e4vi
1782< L\u00f6wen
1783< m\u00e0\u0161ta
1784< m\u00eer
1785< myndig
1786< M\u00e4nner
1787< m\u00f6chten
1788< pi\u00f1a
1789< pint
1790< pylon
1791< \u0161\u00e0ran
1792< savoir
1793< \u0160erb\u016bra
1794< Sietla
1795< \u015blub
1796< subtle
1797< symbol
1798< s\u00e4mtlich
1799< verkehrt
1800< vox
1801< v\u00e4ga
1802< waffle
1803< wood
1804< yen
1805< yuan
1806< yucca
1807< \u017eal
1808< \u017eena
1809< \u017den\u0113va
1810< zoo0
1811< Zviedrija
1812< Z\u00fcrich
1813< zysk0
1814< \u00e4ndere
1815
1816** test: DataDrivenCollationTest/hi_TestNewRules
1817# This test goes through new rules and tests against old rules
1818@ locale hi
1819* compare
1820< कॐ
1821< कं
1822< कँ
1823< कः
1824
1825** test: DataDrivenCollationTest/ro_TestNewRules
1826# This test goes through new rules and tests against old rules
1827@ locale ro
1828* compare
1829< xAx
1830< xă
1831< xĂ
1832< Xă
1833< XĂ
1834< xăx
1835< xĂx
1836< xâ
1837< xÂ
1838< Xâ
1839< XÂ
1840< xâx
1841< xÂx
1842< xb
1843< xIx
1844< xî
1845< xÎ
1846< Xî
1847< XÎ
1848< xîx
1849< xÎx
1850< xj
1851< xSx
1852< xș
1853= xş
1854< xȘ
1855= xŞ
1856< Xș
1857= Xş
1858< XȘ
1859= XŞ
1860< xșx
1861= xşx
1862< xȘx
1863= xŞx
1864< xT
1865< xTx
1866< xț
1867= xţ
1868< xȚ
1869= xŢ
1870< Xț
1871= Xţ
1872< XȚ
1873= XŢ
1874< xțx
1875= xţx
1876< xȚx
1877= xŢx
1878< xU
1879
1880** test: DataDrivenCollationTest/testOffsets
1881# This tests cases where forwards and backwards iteration get different offsets
1882@ locale en
1883% strength=tertiary
1884* compare
1885< a\uD800\uDC00\uDC00
1886< b\uD800\uDC00\uDC00
1887* compare
1888< \u0301A\u0301\u0301
1889< \u0301B\u0301\u0301
1890* compare
1891< abcd\r\u0301
1892< abce\r\u0301
1893# TODO: test offsets in new CollationTest
1894
1895# End of test cases moved here from ICU 52's DataDrivenCollationTest.txt.
1896
1897** test: was ICU 52 cmsccoll/TestRedundantRules
1898@ rules
1899& a < b < c < d& [before 1] c < m
1900* compare
1901<1 a
1902<1 b
1903<1 m
1904<1 c
1905<1 d
1906
1907@ rules
1908& a < b <<< c << d <<< e& [before 3] e <<< x
1909* compare
1910<1 a
1911<1 b
1912<3 c
1913<2 d
1914<3 x
1915<3 e
1916
1917@ rules
1918& a < b <<< c << d <<< e <<< f < g& [before 1] g < x
1919* compare
1920<1 a
1921<1 b
1922<3 c
1923<2 d
1924<3 e
1925<3 f
1926<1 x
1927<1 g
1928
1929@ rules
1930& a <<< b << c < d& a < m
1931* compare
1932<1 a
1933<3 b
1934<2 c
1935<1 m
1936<1 d
1937
1938@ rules
1939&a<b<<b\u0301 &z<b
1940* compare
1941<1 a
1942<1 b\u0301
1943<1 z
1944<1 b
1945
1946@ rules
1947&z<m<<<q<<<m
1948* compare
1949<1 z
1950<1 q
1951<3 m
1952
1953@ rules
1954&z<<<m<q<<<m
1955* compare
1956<1 z
1957<1 q
1958<3 m
1959
1960@ rules
1961& a < b < c < d& r < c
1962* compare
1963<1 a
1964<1 b
1965<1 d
1966<1 r
1967<1 c
1968
1969@ rules
1970& a < b < c < d& c < m
1971* compare
1972<1 a
1973<1 b
1974<1 c
1975<1 m
1976<1 d
1977
1978@ rules
1979& a < b < c < d& a < m
1980* compare
1981<1 a
1982<1 m
1983<1 b
1984<1 c
1985<1 d
1986
1987** test: was ICU 52 cmsccoll/TestExpansionSyntax
1988# The following two rule sets should sort this particular list of strings the same way.
1989@ rules
1990&AE <<< a << b <<< c &d <<< f
1991* compare
1992<1 AE
1993<3 a
1994<2 b
1995<3 c
1996<1 d
1997<3 f
1998
1999@ rules
2000&A <<< a / E << b / E <<< c /E  &d <<< f
2001* compare
2002<1 AE
2003<3 a
2004<2 b
2005<3 c
2006<1 d
2007<3 f
2008
2009# The following two rule sets should sort this particular list of strings the same way.
2010@ rules
2011&AE <<< a <<< b << c << d < e < f <<< g
2012* compare
2013<1 AE
2014<3 a
2015<3 b
2016<2 c
2017<2 d
2018<1 e
2019<1 f
2020<3 g
2021
2022@ rules
2023&A <<< a / E <<< b / E << c / E << d / E < e < f <<< g
2024* compare
2025<1 AE
2026<3 a
2027<3 b
2028<2 c
2029<2 d
2030<1 e
2031<1 f
2032<3 g
2033
2034# The following two rule sets should sort this particular list of strings the same way.
2035@ rules
2036&AE <<< B <<< C / D <<< F
2037* compare
2038<1 AE
2039<3 B
2040<3 F
2041<1 AED
2042<3 C
2043
2044@ rules
2045&A <<< B / E <<< C / ED <<< F / E
2046* compare
2047<1 AE
2048<3 B
2049<3 F
2050<1 AED
2051<3 C
2052
2053** test: never reorder trailing primaries
2054@ root
2055% reorder Zzzz Grek
2056* compare
2057<1 L
2058<1 字
2059<1 Ω
2060<1 \uFFFD
2061<1 \uFFFF
2062
2063** test: fall back to mappings with shorter prefixes, not immediately to ones with no prefixes
2064@ rules
2065&u=ab|cd
2066&v=b|ce
2067* compare
2068<1 abc
2069<1 abcc
2070<1 abcf
2071<1 abcd
2072=  abu
2073<1 abce
2074=  abv
2075
2076# With the following rules, there is only one prefix per composite ĉ or ç,
2077# but both prefixes apply to just c in NFD form.
2078# We would get different results for composed vs. NFD input
2079# if we fell back directly from longest-prefix mappings to no-prefix mappings.
2080@ rules
2081&x=op|ĉ
2082&y=p|ç
2083* compare
2084<1 opc
2085<2 opć
2086<1 opcz
2087<1 opd
2088<1 opĉ
2089=  opc\u0302
2090=  opx
2091<1 opç
2092=  opc\u0327
2093=  opy
2094
2095# The mapping that is used is the one with the longest matching prefix for which
2096# there is also a suffix match; among several mappings for that prefix, the longest suffix match wins.
2097@ rules
2098&❶=d
2099&❷=de
2100&❸=def
2101&①=c|d
2102&②=c|de
2103&③=c|def
2104&④=bc|d
2105&⑤=bc|de
2106&⑥=bc|def
2107&⑦=abc|d
2108&⑧=abc|de
2109&⑨=abc|def
2110* compare
2111<1 9aadzz
2112=  9aa❶zz
2113<1 9aadez
2114=  9aa❷z
2115<1 9aadef
2116=  9aa❸
2117<1 9acdzz
2118=  9ac①zz
2119<1 9acdez
2120=  9ac②z
2121<1 9acdef
2122=  9ac③
2123<1 9bcdzz
2124=  9bc④zz
2125<1 9bcdez
2126=  9bc⑤z
2127<1 9bcdef
2128=  9bc⑥
2129<1 abcdzz
2130=  abc⑦zz
2131<1 abcdez
2132=  abc⑧z
2133<1 abcdef
2134=  abc⑨
2135
2136** test: prefix + discontiguous contraction with missing prefix contraction
2137# Unfortunate terminology: The first "prefix" here is the pre-context,
2138# the second "prefix" refers to the contraction/relation string that is
2139# one character shorter than the one being tested.
2140@ rules
2141&x=p|e
2142&y=p|ê
2143&z=op|ê
2144# No mapping for op|e:
2145# Discontiguous contraction matching should not match op|ê in opệ
2146# because it would have to skip the dot below and extend a match on op|e by the circumflex,
2147# but there is no match on op|e.
2148* compare
2149<1 oPe
2150<1 ope
2151=  opx
2152<1 opệ
2153=  opy\u0323  # y not z
2154<1 opê
2155=  opz
2156
2157# We cannot test for fallback by whether the contraction default CE32
2158# is for another contraction. With the following rules, there is no mapping for op|e,
2159# and the fallback to prefix p has no contractions.
2160@ rules
2161&x=p|e
2162&z=op|ê
2163* compare
2164<1 oPe
2165<1 ope
2166=  opx
2167<2 opệ
2168=  opx\u0323\u0302  # x not z
2169<1 opê
2170=  opz
2171
2172# One more variation: Fallback to the simple code point, no shorter non-empty prefix.
2173@ rules
2174&x=e
2175&z=op|ê
2176* compare
2177<1 ope
2178=  opx
2179<3 oPe
2180=  oPx
2181<2 opệ
2182=  opx\u0323\u0302  # x not z
2183<1 opê
2184=  opz
2185
2186** test: maxVariable via rules
2187@ rules
2188[maxVariable space][alternate shifted]
2189* compare
2190=  \u0020
2191=  \u000A
2192<1 .
2193<1 °  # degree sign
2194<1 $
2195<1 0
2196
2197** test: maxVariable via setting
2198@ root
2199% maxVariable=currency
2200% alternate=shifted
2201* compare
2202=  \u0020
2203=  \u000A
2204=  .
2205=  °  # degree sign
2206=  $
2207<1 0
2208
2209** test: ICU4J CollationMiscTest/TestContractionClosure (ää)
2210# This tests canonical closure, but it also tests that CollationFastLatin
2211# bails out properly for contractions with combining marks.
2212# For that we need pairs of strings that remain in the Latin fastpath
2213# long enough, hence the extra "= b" lines.
2214@ rules
2215&b=\u00e4\u00e4
2216* compare
2217<1 b
2218=  \u00e4\u00e4
2219=  b
2220=  a\u0308a\u0308
2221=  b
2222=  \u00e4a\u0308
2223=  b
2224=  a\u0308\u00e4
2225
2226** test: ICU4J CollationMiscTest/TestContractionClosure (Å)
2227@ rules
2228&b=\u00C5
2229* compare
2230<1 b
2231=  \u00C5
2232=  b
2233=  A\u030A
2234=  b
2235=  \u212B
2236
2237** test: reset-before on already-tailored characters, ICU ticket 10108
2238@ rules
2239&a<w<<x &[before 2]x<<y
2240* compare
2241<1 a
2242<1 w
2243<2 y
2244<2 x
2245
2246@ rules
2247&a<<w<<<x &[before 2]x<<y
2248* compare
2249<1 a
2250<2 y
2251<2 w
2252<3 x
2253
2254@ rules
2255&a<w<x &[before 2]x<<y
2256* compare
2257<1 a
2258<1 w
2259<1 y
2260<2 x
2261
2262@ rules
2263&a<w<<<x &[before 2]x<<y
2264* compare
2265<1 a
2266<1 y
2267<2 w
2268<3 x
2269
2270** test: numeric collation with other settings, ICU ticket 9092
2271@ root
2272% strength=identical
2273% caseFirst=upper
2274% numeric=on
2275* compare
2276<1 100\u0020a
2277<1 101
2278
2279** test: collation type fallback from unsupported type, ICU ticket 10149
2280@ locale fr-CA-u-co-phonebk
2281# Expect the same result as with fr-CA, using backwards-secondary order.
2282# That is, we should fall back from the unsupported collation type
2283# to the locale's default collation type.
2284* compare
2285<1 cote
2286<2 côte
2287<2 coté
2288<2 côté
2289
2290** test: @ is equivalent to [backwards 2], ICU ticket 9956
2291@ rules
2292&b<a @ &v<<w
2293* compare
2294<1 b
2295<1 a
2296<1 cote
2297<2 côte
2298<2 coté
2299<2 côté
2300<1 v
2301<2 w
2302<1 x
2303
2304** test: shifted+reordering, ICU ticket 9507
2305@ root
2306% reorder Grek punct space
2307% alternate=shifted
2308% strength=quaternary
2309# Which primaries are "variable" should be determined without script reordering,
2310# and then primaries should be reordered whether they are shifted to quaternary or not.
2311* compare
2312<4 (  # punctuation
2313<4 )
2314<4 \u0020  # space
2315<1 `  # symbol
2316<1 ^
2317<1 $  # currency symbol
2318<1 €
2319<1 0  # numbers
2320<1 ε  # Greek
2321<1 e  # Latin
2322<1 e(e
2323<4 e)e
2324<4 e\u0020e
2325<4 ee
2326<3 e(E
2327<4 e)E
2328<4 e\u0020E
2329<4 eE
2330
2331** test: "uppercase first" could sort a string before its prefix, ICU ticket 9351
2332@ rules
2333&\u0001<<<b<<<B
2334% caseFirst=upper
2335* compare
2336<1 aaa
2337<3 aaaB
2338
2339** test: secondary+case ignores secondary ignorables, ICU ticket 9355
2340@ rules
2341&\u0001<<<b<<<B
2342% strength=secondary
2343% caseLevel=on
2344* compare
2345<1 a
2346=  ab
2347=  aB
2348
2349** test: custom collation rules involving tail of a contraction in Malayalam, ICU ticket 6328
2350@ rules
2351&[before 2] ൌ << ൗ  # U+0D57 << U+0D4C == 0D46+0D57
2352* compare
2353<1 ൗx
2354<2 ൌx
2355<1 ൗy
2356<2 ൌy
2357
2358** test: quoted apostrophe in compact syntax, ICU ticket 8204
2359@ rules
2360&q<<*a''c
2361* compare
2362<1 d
2363<1 p
2364<1 q
2365<2 a
2366<2 \u0027
2367<2 c
2368<1 r
2369