1/* Test of case and normalization insensitive comparison of UTF-16 strings. 2 Copyright (C) 2009, 2010 Free Software Foundation, Inc. 3 4 This program is free software: you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 3 of the License, or 7 (at your option) any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 16 17/* Written by Bruno Haible <bruno@clisp.org>, 2009. */ 18 19#include <config.h> 20 21#include "unicase.h" 22 23#include "uninorm.h" 24#include "macros.h" 25 26#define UNIT uint16_t 27#include "test-casecmp.h" 28#undef UNIT 29 30static void 31test_nonascii (int (*my_casecmp) (const uint16_t *, size_t, const uint16_t *, size_t, const char *, uninorm_t, int *)) 32{ 33 /* Normalization effects. */ 34 { 35 static const uint16_t input1[] = { 'H', 0x00F6, 'h', 'l', 'e' }; 36 static const uint16_t input2[] = { 'H', 'O', 0x0308, 'h', 'L', 'e' }; 37 static const uint16_t input3[] = { 'H', 0x00F6, 'h', 'l', 'e', 'n' }; 38 static const uint16_t input4[] = { 'H', 'O', 0x0308, 'h', 'L', 'e', 'n' }; 39 static const uint16_t input5[] = { 'H', 'u', 'r', 'z' }; 40 int cmp; 41 42 ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0); 43 ASSERT (cmp == 0); 44 45 ASSERT (my_casecmp (input2, SIZEOF (input2), input1, SIZEOF (input1), NULL, UNINORM_NFD, &cmp) == 0); 46 ASSERT (cmp == 0); 47 48 ASSERT (my_casecmp (input3, SIZEOF (input3), input4, SIZEOF (input4), NULL, UNINORM_NFD, &cmp) == 0); 49 ASSERT (cmp == 0); 50 51 ASSERT (my_casecmp (input4, SIZEOF (input4), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0); 52 ASSERT (cmp == 0); 53 54 ASSERT (my_casecmp (input2, SIZEOF (input2), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0); 55 ASSERT (cmp == -1); 56 57 ASSERT (my_casecmp (input1, SIZEOF (input1), input4, SIZEOF (input4), NULL, UNINORM_NFD, &cmp) == 0); 58 ASSERT (cmp == -1); 59 60 ASSERT (my_casecmp (input1, SIZEOF (input1), input5, SIZEOF (input5), NULL, UNINORM_NFD, &cmp) == 0); 61 ASSERT (cmp == -1); 62 63 ASSERT (my_casecmp (input2, SIZEOF (input2), input5, SIZEOF (input5), NULL, UNINORM_NFD, &cmp) == 0); 64 ASSERT (cmp == -1); 65 } 66 { /* LATIN CAPITAL LETTER A WITH DIAERESIS */ 67 static const uint16_t input1[] = { 0x00C4 }; 68 static const uint16_t input2[] = { 0x0041, 0x0308 }; 69 int cmp; 70 71 ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0); 72 ASSERT (cmp == 0); 73 } 74 { /* LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON */ 75 static const uint16_t input1[] = { 0x01DE }; 76 static const uint16_t input2[] = { 0x0041, 0x0308, 0x0304 }; 77 int cmp; 78 79 ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0); 80 ASSERT (cmp == 0); 81 } 82 { /* GREEK DIALYTIKA AND PERISPOMENI */ 83 static const uint16_t input1[] = { 0x1FC1 }; 84 static const uint16_t input2[] = { 0x00A8, 0x0342 }; 85 int cmp; 86 87 ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0); 88 ASSERT (cmp == 0); 89 } 90 { /* HANGUL SYLLABLE GEUL */ 91 static const uint16_t input1[] = { 0xAE00 }; 92 static const uint16_t input2[] = { 0xADF8, 0x11AF }; 93 static const uint16_t input3[] = { 0x1100, 0x1173, 0x11AF }; 94 int cmp; 95 96 ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0); 97 ASSERT (cmp == 0); 98 99 ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0); 100 ASSERT (cmp == 0); 101 } 102 { /* HANGUL SYLLABLE GEU */ 103 static const uint16_t input1[] = { 0xADF8 }; 104 static const uint16_t input2[] = { 0x1100, 0x1173 }; 105 int cmp; 106 107 ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0); 108 ASSERT (cmp == 0); 109 } 110 111 /* Simple string. */ 112 { /* "Gr���� Gott. ������������������������! x=(-b��sqrt(b��-4ac))/(2a) ���������,������,������" */ 113 static const uint16_t input1[] = 114 { 'G', 'r', 0x00FC, 0x00DF, ' ', 'G', 'o', 't', 't', '.', ' ', 115 0x0417, 0x0434, 0x0440, 0x0430, 0x0432, 0x0441, 0x0442, 0x0432, 0x0443, 116 0x0439, 0x0442, 0x0435, '!', ' ', 117 'x', '=', '(', '-', 'b', 0x00B1, 's', 'q', 'r', 't', '(', 'b', 0x00B2, 118 '-', '4', 'a', 'c', ')', ')', '/', '(', '2', 'a', ')', ' ', ' ', 119 0x65E5, 0x672C, 0x8A9E, ',', 0x4E2D, 0x6587, ',', 0xD55C, 0xAE00, '\n' 120 }; 121 static const uint16_t input2[] = 122 { 'g', 'r', 0x00FC, 0x0073, 0x0073, ' ', 'g', 'o', 't', 't', '.', ' ', 123 0x0437, 0x0434, 0x0440, 0x0430, 0x0432, 0x0441, 0x0442, 0x0432, 0x0443, 124 0x0439, 0x0442, 0x0435, '!', ' ', 125 'x', '=', '(', '-', 'b', 0x00B1, 's', 'q', 'r', 't', '(', 'b', 0x00B2, 126 '-', '4', 'a', 'c', ')', ')', '/', '(', '2', 'a', ')', ' ', ' ', 127 0x65E5, 0x672C, 0x8A9E, ',', 0x4E2D, 0x6587, ',', 0xD55C, 0xAE00, '\n' 128 }; 129 static const uint16_t input3[] = 130 { 'G', 'R', 0x00DC, 0x0053, 0x0053, ' ', 'G', 'O', 'T', 'T', '.', ' ', 131 0x0417, 0x0414, 0x0420, 0x0410, 0x0412, 0x0421, 0x0422, 0x0412, 0x0423, 132 0x0419, 0x0422, 0x0415, '!', ' ', 133 'X', '=', '(', '-', 'B', 0x00B1, 'S', 'Q', 'R', 'T', '(', 'B', 0x00B2, 134 '-', '4', 'A', 'C', ')', ')', '/', '(', '2', 'A', ')', ' ', ' ', 135 0x65E5, 0x672C, 0x8A9E, ',', 0x4E2D, 0x6587, ',', 0xD55C, 0xAE00, '\n' 136 }; 137 int cmp; 138 139 ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, NULL, &cmp) == 0); 140 ASSERT (cmp == 0); 141 142 ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0); 143 ASSERT (cmp == 0); 144 145 ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, NULL, &cmp) == 0); 146 ASSERT (cmp == 0); 147 148 ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0); 149 ASSERT (cmp == 0); 150 151 ASSERT (my_casecmp (input2, SIZEOF (input2), input3, SIZEOF (input3), NULL, NULL, &cmp) == 0); 152 ASSERT (cmp == 0); 153 154 ASSERT (my_casecmp (input2, SIZEOF (input2), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0); 155 ASSERT (cmp == 0); 156 } 157 158 /* Case mapping can increase the number of Unicode characters. */ 159 { /* LATIN SMALL LETTER N PRECEDED BY APOSTROPHE */ 160 static const uint16_t input1[] = { 0x0149 }; 161 static const uint16_t input2[] = { 0x02BC, 0x006E }; 162 static const uint16_t input3[] = { 0x02BC, 0x004E }; 163 int cmp; 164 165 ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, NULL, &cmp) == 0); 166 ASSERT (cmp == 0); 167 168 ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0); 169 ASSERT (cmp == 0); 170 171 ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, NULL, &cmp) == 0); 172 ASSERT (cmp == 0); 173 174 ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, UNINORM_NFD, &cmp) == 0); 175 ASSERT (cmp == 0); 176 } 177 { /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS */ 178 static const uint16_t input1[] = { 0x0390 }; 179 static const uint16_t input2[] = { 0x03B9, 0x0308, 0x0301 }; 180 int cmp; 181 182 ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, NULL, &cmp) == 0); 183 ASSERT (cmp == 0); 184 185 ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, UNINORM_NFD, &cmp) == 0); 186 ASSERT (cmp == 0); 187 } 188 189 /* Turkish letters i �� �� I */ 190 { /* LATIN CAPITAL LETTER I */ 191 static const uint16_t input[] = { 0x0049 }; 192 static const uint16_t casefolded[] = { 0x0069 }; 193 static const uint16_t casefolded_tr[] = { 0x0131 }; 194 int cmp; 195 196 ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0); 197 ASSERT (cmp == 0); 198 199 ASSERT (my_casecmp (input, SIZEOF (input), casefolded_tr, SIZEOF (casefolded_tr), "tr", NULL, &cmp) == 0); 200 ASSERT (cmp == 0); 201 } 202 { /* LATIN SMALL LETTER I */ 203 static const uint16_t input[] = { 0x0069 }; 204 static const uint16_t casefolded[] = { 0x0049 }; 205 static const uint16_t casefolded_tr[] = { 0x0130 }; 206 int cmp; 207 208 ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0); 209 ASSERT (cmp == 0); 210 211 ASSERT (my_casecmp (input, SIZEOF (input), casefolded_tr, SIZEOF (casefolded_tr), "tr", NULL, &cmp) == 0); 212 ASSERT (cmp == 0); 213 } 214 { /* LATIN CAPITAL LETTER I WITH DOT ABOVE */ 215 static const uint16_t input[] = { 0x0130 }; 216 static const uint16_t casefolded[] = { 0x0069, 0x0307 }; 217 static const uint16_t casefolded_tr[] = { 0x0069 }; 218 int cmp; 219 220 ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0); 221 ASSERT (cmp == 0); 222 223 ASSERT (my_casecmp (input, SIZEOF (input), casefolded_tr, SIZEOF (casefolded_tr), "tr", NULL, &cmp) == 0); 224 ASSERT (cmp == 0); 225 } 226 { /* LATIN SMALL LETTER DOTLESS I */ 227 static const uint16_t input[] = { 0x0131 }; 228 static const uint16_t casefolded[] = { 0x0049 }; 229 int cmp; 230 231 ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0); 232 ASSERT (cmp == 1); 233 234 ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), "tr", NULL, &cmp) == 0); 235 ASSERT (cmp == 0); 236 } 237 { /* "topkap��" */ 238 static const uint16_t input[] = 239 { 0x0054, 0x004F, 0x0050, 0x004B, 0x0041, 0x0050, 0x0049 }; 240 static const uint16_t casefolded[] = 241 { 0x0074, 0x006F, 0x0070, 0x006B, 0x0061, 0x0070, 0x0131 }; 242 int cmp; 243 244 ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0); 245 ASSERT (cmp == -1); 246 247 ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), "tr", NULL, &cmp) == 0); 248 ASSERT (cmp == 0); 249 } 250 251 /* Uppercasing can increase the number of Unicode characters. */ 252 { /* "hei��" */ 253 static const uint16_t input1[] = { 0x0068, 0x0065, 0x0069, 0x00DF }; 254 static const uint16_t input2[] = { 0x0068, 0x0065, 0x0069, 0x0073, 0x0073 }; 255 int cmp; 256 257 ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, NULL, &cmp) == 0); 258 ASSERT (cmp == 0); 259 } 260 261 /* Case mappings for some characters can depend on the surrounding characters. */ 262 { /* "������������������������ ����������������������" */ 263 static const uint16_t input1[] = 264 { 265 0x03C0, 0x03B5, 0x03C1, 0x03B9, 0x03C3, 0x03C3, 0x03CC, 0x03C4, 266 0x03B5, 0x03C1, 0x03B5, 0x03C2, 0x0020, 0x03C0, 0x03BB, 0x03B7, 267 0x03C1, 0x03BF, 0x03C6, 0x03BF, 0x03C1, 0x03AF, 0x03B5, 0x03C2 268 }; 269 static const uint16_t input2[] = 270 { 271 0x03C0, 0x03B5, 0x03C1, 0x03B9, 0x03C3, 0x03C3, 0x03CC, 0x03C4, 272 0x03B5, 0x03C1, 0x03B5, 0x03C3, 0x0020, 0x03C0, 0x03BB, 0x03B7, 273 0x03C1, 0x03BF, 0x03C6, 0x03BF, 0x03C1, 0x03AF, 0x03B5, 0x03C3 274 }; 275 static const uint16_t input3[] = 276 { 277 0x03A0, 0x0395, 0x03A1, 0x0399, 0x03A3, 0x03A3, 0x038C, 0x03A4, 278 0x0395, 0x03A1, 0x0395, 0x03A3, 0x0020, 0x03A0, 0x039B, 0x0397, 279 0x03A1, 0x039F, 0x03A6, 0x039F, 0x03A1, 0x038A, 0x0395, 0x03A3 280 }; 281 int cmp; 282 283 ASSERT (my_casecmp (input1, SIZEOF (input1), input2, SIZEOF (input2), NULL, NULL, &cmp) == 0); 284 ASSERT (cmp == 0); 285 286 ASSERT (my_casecmp (input1, SIZEOF (input1), input3, SIZEOF (input3), NULL, NULL, &cmp) == 0); 287 ASSERT (cmp == 0); 288 289 ASSERT (my_casecmp (input2, SIZEOF (input2), input3, SIZEOF (input3), NULL, NULL, &cmp) == 0); 290 ASSERT (cmp == 0); 291 } 292 293 /* Case mapping can require subsequent normalization. */ 294 { /* LATIN SMALL LETTER J WITH CARON, COMBINING DOT BELOW */ 295 static const uint16_t input[] = { 0x01F0, 0x0323 }; 296 static const uint16_t casefolded[] = { 0x006A, 0x030C, 0x0323 }; 297 static const uint16_t casefolded_decomposed[] = { 0x006A, 0x0323, 0x030C }; 298 int cmp; 299 300 ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, NULL, &cmp) == 0); 301 ASSERT (cmp == 0); 302 303 ASSERT (my_casecmp (input, SIZEOF (input), casefolded_decomposed, SIZEOF (casefolded_decomposed), NULL, NULL, &cmp) == 0); 304 ASSERT (cmp != 0); 305 306 ASSERT (my_casecmp (input, SIZEOF (input), casefolded, SIZEOF (casefolded), NULL, UNINORM_NFD, &cmp) == 0); 307 ASSERT (cmp == 0); 308 309 ASSERT (my_casecmp (input, SIZEOF (input), casefolded_decomposed, SIZEOF (casefolded_decomposed), NULL, UNINORM_NFD, &cmp) == 0); 310 ASSERT (cmp == 0); 311 } 312} 313 314int 315main () 316{ 317 test_ascii (u16_casecmp, UNINORM_NFD); 318 test_nonascii (u16_casecmp); 319 320 return 0; 321} 322