1/* Test of case and normalization insensitive comparison of strings.
2   Copyright (C) 2009, 2010 Free Software Foundation, Inc.
3
4   This program is free software: you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 3 of the License, or
7   (at your option) any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
16
17/* Written by Bruno Haible <bruno@clisp.org>, 2009.  */
18
19#include <config.h>
20
21#include "unicase.h"
22
23#include <locale.h>
24#include "macros.h"
25
26#define UNIT char
27#include "test-casecmp.h"
28#undef UNIT
29
30static void
31test_iso_8859_1 (int (*my_casecmp) (const char *, size_t, const char *, size_t, const char *, uninorm_t, int *))
32{
33  {
34    static const char input1[] = { 'H', 0xF6, 'h', 'l', 'e' };
35    static const char input2[] = { 'H', 0xD6, 'h', 'L', 'e' };
36    static const char input3[] = { 'H', 0xF6, 'h', 'l', 'e', 'n' };
37    static const char input4[] = { 'H', 0xD6, 'h', 'L', 'e', 'n' };
38    static const char input5[] = { 'H', 'u', 'r', 'z' };
39    int cmp;
40
41    ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
42    ASSERT (cmp == 0);
43
44    ASSERT (my_casecmp (input2, SIZEOF (input2), input1, SIZEOF (input1), NULL, UNINORM_NFD, &cmp) == 0);
45    ASSERT (cmp == 0);
46
47    ASSERT (my_casecmp (input3, SIZEOF (input3), input4, SIZEOF (input4), NULL, UNINORM_NFD, &cmp) == 0);
48    ASSERT (cmp == 0);
49
50    ASSERT (my_casecmp (input4, SIZEOF (input4), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0);
51    ASSERT (cmp == 0);
52
53    ASSERT (my_casecmp (input2, SIZEOF (input2), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0);
54    ASSERT (cmp == -1);
55
56    ASSERT (my_casecmp (input1, SIZEOF (input1), input4, SIZEOF (input4), NULL, UNINORM_NFD, &cmp) == 0);
57    ASSERT (cmp == -1);
58
59    ASSERT (my_casecmp (input1, SIZEOF (input1), input5, SIZEOF (input5), NULL, UNINORM_NFD, &cmp) == 0);
60    ASSERT (cmp == -1);
61
62    ASSERT (my_casecmp (input2, SIZEOF (input2), input5, SIZEOF (input5), NULL, UNINORM_NFD, &cmp) == 0);
63    ASSERT (cmp == -1);
64  }
65
66  /* Uppercasing can increase the number of Unicode characters.  */
67  { /* "hei��" */
68    static const char input1[] = { 0x68, 0x65, 0x69, 0xDF };
69    static const char input2[] = { 0x68, 0x65, 0x69, 0x73, 0x73 };
70    int cmp;
71
72    ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, NULL, &cmp) == 0);
73    ASSERT (cmp == 0);
74  }
75}
76
77static void
78test_utf_8 (int (*my_casecmp) (const char *, size_t, const char *, size_t, const char *, uninorm_t, int *))
79{
80  /* Normalization effects.  */
81  {
82    static const char input1[] = { 'H', 0xC3, 0xB6, 'h', 'l', 'e' };
83    static const char input2[] = { 'H', 'O', 0xCC, 0x88, 'h', 'L', 'e' };
84    static const char input3[] = { 'H', 0xC3, 0xB6, 'h', 'l', 'e', 'n' };
85    static const char input4[] = { 'H', 'O', 0xCC, 0x88, 'h', 'L', 'e', 'n' };
86    static const char input5[] = { 'H', 'u', 'r', 'z' };
87    int cmp;
88
89    ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
90    ASSERT (cmp == 0);
91
92    ASSERT (my_casecmp (input2, SIZEOF (input2), input1, SIZEOF (input1), NULL, UNINORM_NFD, &cmp) == 0);
93    ASSERT (cmp == 0);
94
95    ASSERT (my_casecmp (input3, SIZEOF (input3), input4, SIZEOF (input4), NULL, UNINORM_NFD, &cmp) == 0);
96    ASSERT (cmp == 0);
97
98    ASSERT (my_casecmp (input4, SIZEOF (input4), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0);
99    ASSERT (cmp == 0);
100
101    ASSERT (my_casecmp (input2, SIZEOF (input2), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0);
102    ASSERT (cmp == -1);
103
104    ASSERT (my_casecmp (input1, SIZEOF (input1), input4, SIZEOF (input4), NULL, UNINORM_NFD, &cmp) == 0);
105    ASSERT (cmp == -1);
106
107    ASSERT (my_casecmp (input1, SIZEOF (input1), input5, SIZEOF (input5), NULL, UNINORM_NFD, &cmp) == 0);
108    ASSERT (cmp == -1);
109
110    ASSERT (my_casecmp (input2, SIZEOF (input2), input5, SIZEOF (input5), NULL, UNINORM_NFD, &cmp) == 0);
111    ASSERT (cmp == -1);
112  }
113  { /* LATIN CAPITAL LETTER A WITH DIAERESIS */
114    static const char input1[] = { 0xC3, 0x84 };
115    static const char input2[] = { 0x41, 0xCC, 0x88 };
116    int cmp;
117
118    ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
119    ASSERT (cmp == 0);
120  }
121  { /* LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON */
122    static const char input1[] = { 0xC7, 0x9E };
123    static const char input2[] = { 0x41, 0xCC, 0x88, 0xCC, 0x84 };
124    int cmp;
125
126    ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
127    ASSERT (cmp == 0);
128  }
129  { /* GREEK DIALYTIKA AND PERISPOMENI */
130    static const char input1[] = { 0xE1, 0xBF, 0x81 };
131    static const char input2[] = { 0xC2, 0xA8, 0xCD, 0x82 };
132    int cmp;
133
134    ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
135    ASSERT (cmp == 0);
136  }
137  { /* HANGUL SYLLABLE GEUL */
138    static const char input1[] = { 0xEA, 0xB8, 0x80 };
139    static const char input2[] = { 0xEA, 0xB7, 0xB8, 0xE1, 0x86, 0xAF };
140    static const char input3[] = { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF };
141    int cmp;
142
143    ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
144    ASSERT (cmp == 0);
145
146    ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0);
147    ASSERT (cmp == 0);
148  }
149  { /* HANGUL SYLLABLE GEU */
150    static const char input1[] = { 0xEA, 0xB7, 0xB8 };
151    static const char input2[] = { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3 };
152    int cmp;
153
154    ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
155    ASSERT (cmp == 0);
156  }
157
158  /* Simple string.  */
159  { /* "Gr���� Gott. ������������������������! x=(-b��sqrt(b��-4ac))/(2a)  ���������,������,������" */
160    static const char input1[] =
161      { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.', ' ',
162        0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1, 0x81,
163        0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9, 0xD1, 0x82, 0xD0, 0xB5,
164        '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1, 's', 'q', 'r', 't', '(',
165        'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')', '/', '(', '2', 'a', ')',
166        ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 0xE8, 0xAA, 0x9E, ',',
167        0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
168        0xED, 0x95, 0x9C, 0xEA, 0xB8, 0x80, '\n'
169      };
170    static const char input2[] =
171      { 'g', 'r', 0xC3, 0xBC, 0x73, 0x73, ' ', 'g', 'o', 't', 't', '.', ' ',
172        0xD0, 0xB7, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1, 0x81,
173        0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9, 0xD1, 0x82, 0xD0, 0xB5,
174        '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1, 's', 'q', 'r', 't', '(',
175        'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')', '/', '(', '2', 'a', ')',
176        ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 0xE8, 0xAA, 0x9E, ',',
177        0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
178        0xED, 0x95, 0x9C, 0xEA, 0xB8, 0x80, '\n'
179      };
180    static const char input3[] =
181      { 'G', 'R', 0xC3, 0x9C, 0x53, 0x53, ' ', 'G', 'O', 'T', 'T', '.', ' ',
182        0xD0, 0x97, 0xD0, 0x94, 0xD0, 0xA0, 0xD0, 0x90, 0xD0, 0x92, 0xD0, 0xA1,
183        0xD0, 0xA2, 0xD0, 0x92, 0xD0, 0xA3, 0xD0, 0x99, 0xD0, 0xA2, 0xD0, 0x95,
184        '!', ' ', 'X', '=', '(', '-', 'B', 0xC2, 0xB1, 'S', 'Q', 'R', 'T', '(',
185        'B', 0xC2, 0xB2, '-', '4', 'A', 'C', ')', ')', '/', '(', '2', 'A', ')',
186        ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 0xE8, 0xAA, 0x9E, ',',
187        0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
188        0xED, 0x95, 0x9C, 0xEA, 0xB8, 0x80, '\n'
189      };
190    int cmp;
191
192    ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, NULL, &cmp) == 0);
193    ASSERT (cmp == 0);
194
195    ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
196    ASSERT (cmp == 0);
197
198    ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, NULL, &cmp) == 0);
199    ASSERT (cmp == 0);
200
201    ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0);
202    ASSERT (cmp == 0);
203
204    ASSERT (my_casecmp (input2, SIZEOF (input2), input3, SIZEOF (input3), NULL, NULL, &cmp) == 0);
205    ASSERT (cmp == 0);
206
207    ASSERT (my_casecmp (input2, SIZEOF (input2), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0);
208    ASSERT (cmp == 0);
209  }
210
211  /* Case mapping can increase the number of Unicode characters.  */
212  { /* LATIN SMALL LETTER N PRECEDED BY APOSTROPHE */
213    static const char input1[] = { 0xC5, 0x89 };
214    static const char input2[] = { 0xCA, 0xBC, 0x6E };
215    static const char input3[] = { 0xCA, 0xBC, 0x4E };
216    int cmp;
217
218    ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, NULL, &cmp) == 0);
219    ASSERT (cmp == 0);
220
221    ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
222    ASSERT (cmp == 0);
223
224    ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, NULL, &cmp) == 0);
225    ASSERT (cmp == 0);
226
227    ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0);
228    ASSERT (cmp == 0);
229  }
230  { /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS */
231    static const char input1[] = { 0xCE, 0x90 };
232    static const char input2[] = { 0xCE, 0xB9, 0xCC, 0x88, 0xCC, 0x81 };
233    int cmp;
234
235    ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, NULL, &cmp) == 0);
236    ASSERT (cmp == 0);
237
238    ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0);
239    ASSERT (cmp == 0);
240  }
241
242  /* Turkish letters i �� �� I */
243  { /* LATIN CAPITAL LETTER I */
244    static const char input[]         = { 0x49 };
245    static const char casefolded[]    = { 0x69 };
246    static const char casefolded_tr[] = { 0xC4, 0xB1 };
247    int cmp;
248
249    ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0);
250    ASSERT (cmp == 0);
251
252    ASSERT (my_casecmp (input, SIZEOF (input), casefolded_tr, SIZEOF (casefolded_tr), "tr", NULL, &cmp) == 0);
253    ASSERT (cmp == 0);
254  }
255  { /* LATIN SMALL LETTER I */
256    static const char input[]         = { 0x69 };
257    static const char casefolded[]    = { 0x49 };
258    static const char casefolded_tr[] = { 0xC4, 0xB0 };
259    int cmp;
260
261    ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0);
262    ASSERT (cmp == 0);
263
264    ASSERT (my_casecmp (input, SIZEOF (input), casefolded_tr, SIZEOF (casefolded_tr), "tr", NULL, &cmp) == 0);
265    ASSERT (cmp == 0);
266  }
267  { /* LATIN CAPITAL LETTER I WITH DOT ABOVE */
268    static const char input[]         = { 0xC4, 0xB0 };
269    static const char casefolded[]    = { 0x69, 0xCC, 0x87 };
270    static const char casefolded_tr[] = { 0x69 };
271    int cmp;
272
273    ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0);
274    ASSERT (cmp == 0);
275
276    ASSERT (my_casecmp (input, SIZEOF (input), casefolded_tr, SIZEOF (casefolded_tr), "tr", NULL, &cmp) == 0);
277    ASSERT (cmp == 0);
278  }
279  { /* LATIN SMALL LETTER DOTLESS I */
280    static const char input[]      = { 0xC4, 0xB1 };
281    static const char casefolded[] = { 0x49 };
282    int cmp;
283
284    ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0);
285    ASSERT (cmp == 1);
286
287    ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), "tr", NULL, &cmp) == 0);
288    ASSERT (cmp == 0);
289  }
290  { /* "topkap��" */
291    static const char input[] =
292      { 0x54, 0x4F, 0x50, 0x4B, 0x41, 0x50, 0x49 };
293    static const char casefolded[] =
294      { 0x74, 0x6F, 0x70, 0x6B, 0x61, 0x70, 0xC4, 0xB1 };
295    int cmp;
296
297    ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0);
298    ASSERT (cmp == -1);
299
300    ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), "tr", NULL, &cmp) == 0);
301    ASSERT (cmp == 0);
302  }
303
304  /* Uppercasing can increase the number of Unicode characters.  */
305  { /* "hei��" */
306    static const char input1[] = { 0x68, 0x65, 0x69, 0xC3, 0x9F };
307    static const char input2[] = { 0x68, 0x65, 0x69, 0x73, 0x73 };
308    int cmp;
309
310    ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, NULL, &cmp) == 0);
311    ASSERT (cmp == 0);
312  }
313
314  /* Case mappings for some characters can depend on the surrounding characters.  */
315  { /* "������������������������ ����������������������" */
316    static const char input1[] =
317      {
318        0xCF, 0x80, 0xCE, 0xB5, 0xCF, 0x81, 0xCE, 0xB9, 0xCF, 0x83, 0xCF, 0x83,
319        0xCF, 0x8C, 0xCF, 0x84, 0xCE, 0xB5, 0xCF, 0x81, 0xCE, 0xB5, 0xCF, 0x82,
320        ' ', 0xCF, 0x80, 0xCE, 0xBB, 0xCE, 0xB7, 0xCF, 0x81, 0xCE, 0xBF,
321        0xCF, 0x86, 0xCE, 0xBF, 0xCF, 0x81, 0xCE, 0xAF, 0xCE, 0xB5, 0xCF, 0x82
322      };
323    static const char input2[] =
324      {
325        0xCF, 0x80, 0xCE, 0xB5, 0xCF, 0x81, 0xCE, 0xB9, 0xCF, 0x83, 0xCF, 0x83,
326        0xCF, 0x8C, 0xCF, 0x84, 0xCE, 0xB5, 0xCF, 0x81, 0xCE, 0xB5, 0xCF, 0x83,
327        ' ', 0xCF, 0x80, 0xCE, 0xBB, 0xCE, 0xB7, 0xCF, 0x81, 0xCE, 0xBF,
328        0xCF, 0x86, 0xCE, 0xBF, 0xCF, 0x81, 0xCE, 0xAF, 0xCE, 0xB5, 0xCF, 0x83
329      };
330    static const char input3[] =
331      {
332        0xCE, 0xA0, 0xCE, 0x95, 0xCE, 0xA1, 0xCE, 0x99, 0xCE, 0xA3, 0xCE, 0xA3,
333        0xCE, 0x8C, 0xCE, 0xA4, 0xCE, 0x95, 0xCE, 0xA1, 0xCE, 0x95, 0xCE, 0xA3,
334        ' ', 0xCE, 0xA0, 0xCE, 0x9B, 0xCE, 0x97, 0xCE, 0xA1, 0xCE, 0x9F,
335        0xCE, 0xA6, 0xCE, 0x9F, 0xCE, 0xA1, 0xCE, 0x8A, 0xCE, 0x95, 0xCE, 0xA3
336      };
337    int cmp;
338
339    ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, NULL, &cmp) == 0);
340    ASSERT (cmp == 0);
341
342    ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, NULL, &cmp) == 0);
343    ASSERT (cmp == 0);
344
345    ASSERT (my_casecmp (input2, SIZEOF (input2), input3, SIZEOF (input3), NULL, NULL, &cmp) == 0);
346    ASSERT (cmp == 0);
347  }
348
349  /* Case mapping can require subsequent normalization.  */
350  { /* LATIN SMALL LETTER J WITH CARON, COMBINING DOT BELOW */
351    static const char input[]                 = { 0xC7, 0xB0, 0xCC, 0xA3 };
352    static const char casefolded[]            = { 0x6A, 0xCC, 0x8C, 0xCC, 0xA3 };
353    static const char casefolded_decomposed[] = { 0x6A, 0xCC, 0xA3, 0xCC, 0x8C };
354    int cmp;
355
356    ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0);
357    ASSERT (cmp == 0);
358
359    ASSERT (my_casecmp (input, SIZEOF (input), casefolded_decomposed, SIZEOF (casefolded_decomposed), NULL, NULL, &cmp) == 0);
360    ASSERT (cmp != 0);
361
362    ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, UNINORM_NFD, &cmp) == 0);
363    ASSERT (cmp == 0);
364
365    ASSERT (my_casecmp (input, SIZEOF (input), casefolded_decomposed, SIZEOF (casefolded_decomposed), NULL, UNINORM_NFD, &cmp) == 0);
366    ASSERT (cmp == 0);
367  }
368}
369
370int
371main (int argc, char *argv[])
372{
373  /* configure should already have checked that the locale is supported.  */
374  if (setlocale (LC_ALL, "") == NULL)
375    return 1;
376
377  test_ascii (ulc_casecmp, UNINORM_NFD);
378
379  if (argc > 1)
380    switch (argv[1][0])
381      {
382      case '1':
383        /* Locale encoding is ISO-8859-1 or ISO-8859-15.  */
384        test_iso_8859_1 (ulc_casecmp);
385        return 0;
386
387      case '2':
388        /* Locale encoding is UTF-8.  */
389        test_utf_8 (ulc_casecmp);
390        return 0;
391      }
392
393  return 1;
394}
395