1/* Test of compatibility normalization of UTF-8 strings. 2 Copyright (C) 2009, 2010 Free Software Foundation, Inc. 3 4 This program is free software: you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 3 of the License, or 7 (at your option) any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 16 17/* Written by Bruno Haible <bruno@clisp.org>, 2009. */ 18 19#include <config.h> 20 21#if GNULIB_TEST_UNINORM_U8_NORMALIZE 22 23#include "uninorm.h" 24 25#include <signal.h> 26#include <stdlib.h> 27#include <unistd.h> 28 29#include "unistr.h" 30#include "macros.h" 31 32static int 33check (const uint8_t *input, size_t input_length, 34 const uint8_t *expected, size_t expected_length) 35{ 36 size_t length; 37 uint8_t *result; 38 39 /* Test return conventions with resultbuf == NULL. */ 40 result = u8_normalize (UNINORM_NFKC, input, input_length, NULL, &length); 41 if (!(result != NULL)) 42 return 1; 43 if (!(length == expected_length)) 44 return 2; 45 if (!(u8_cmp (result, expected, expected_length) == 0)) 46 return 3; 47 free (result); 48 49 /* Test return conventions with resultbuf too small. */ 50 if (expected_length > 0) 51 { 52 uint8_t *preallocated; 53 54 length = expected_length - 1; 55 preallocated = (uint8_t *) malloc (length * sizeof (uint8_t)); 56 result = u8_normalize (UNINORM_NFKC, input, input_length, preallocated, &length); 57 if (!(result != NULL)) 58 return 4; 59 if (!(result != preallocated)) 60 return 5; 61 if (!(length == expected_length)) 62 return 6; 63 if (!(u8_cmp (result, expected, expected_length) == 0)) 64 return 7; 65 free (result); 66 free (preallocated); 67 } 68 69 /* Test return conventions with resultbuf large enough. */ 70 { 71 uint8_t *preallocated; 72 73 length = expected_length; 74 preallocated = (uint8_t *) malloc (length * sizeof (uint8_t)); 75 result = u8_normalize (UNINORM_NFKC, input, input_length, preallocated, &length); 76 if (!(result != NULL)) 77 return 8; 78 if (!(preallocated == NULL || result == preallocated)) 79 return 9; 80 if (!(length == expected_length)) 81 return 10; 82 if (!(u8_cmp (result, expected, expected_length) == 0)) 83 return 11; 84 free (preallocated); 85 } 86 87 return 0; 88} 89 90void 91test_u8_nfkc (void) 92{ 93 { /* Empty string. */ 94 ASSERT (check (NULL, 0, NULL, 0) == 0); 95 } 96 { /* SPACE */ 97 static const uint8_t input[] = { 0x20 }; 98 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 99 } 100 101 { /* LATIN CAPITAL LETTER A WITH DIAERESIS */ 102 static const uint8_t input[] = { 0xC3, 0x84 }; 103 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88 }; 104 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 105 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); 106 } 107 108 { /* LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON */ 109 static const uint8_t input[] = { 0xC7, 0x9E }; 110 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88, 0xCC, 0x84 }; 111 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 112 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); 113 } 114 115 { /* ANGSTROM SIGN */ 116 static const uint8_t input[] = { 0xE2, 0x84, 0xAB }; 117 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x8A }; 118 static const uint8_t expected[] = { 0xC3, 0x85 }; 119 ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0); 120 ASSERT (check (decomposed, SIZEOF (decomposed), expected, SIZEOF (expected)) == 0); 121 ASSERT (check (expected, SIZEOF (expected), expected, SIZEOF (expected)) == 0); 122 } 123 124 { /* GREEK DIALYTIKA AND PERISPOMENI */ 125 static const uint8_t input[] = { 0xE1, 0xBF, 0x81 }; 126 static const uint8_t decomposed[] = { 0x20, 0xCC, 0x88, 0xCD, 0x82 }; 127 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 128 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 129 } 130 131 { /* SCRIPT SMALL L */ 132 static const uint8_t input[] = { 0xE2, 0x84, 0x93 }; 133 static const uint8_t decomposed[] = { 0x6C }; 134 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 135 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 136 } 137 138 { /* NO-BREAK SPACE */ 139 static const uint8_t input[] = { 0xC2, 0xA0 }; 140 static const uint8_t decomposed[] = { 0x20 }; 141 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 142 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 143 } 144 145 { /* ARABIC LETTER VEH INITIAL FORM */ 146 static const uint8_t input[] = { 0xEF, 0xAD, 0xAC }; 147 static const uint8_t decomposed[] = { 0xDA, 0xA4 }; 148 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 149 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 150 } 151 152 { /* ARABIC LETTER VEH MEDIAL FORM */ 153 static const uint8_t input[] = { 0xEF, 0xAD, 0xAD }; 154 static const uint8_t decomposed[] = { 0xDA, 0xA4 }; 155 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 156 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 157 } 158 159 { /* ARABIC LETTER VEH FINAL FORM */ 160 static const uint8_t input[] = { 0xEF, 0xAD, 0xAB }; 161 static const uint8_t decomposed[] = { 0xDA, 0xA4 }; 162 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 163 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 164 } 165 166 { /* ARABIC LETTER VEH ISOLATED FORM */ 167 static const uint8_t input[] = { 0xEF, 0xAD, 0xAA }; 168 static const uint8_t decomposed[] = { 0xDA, 0xA4 }; 169 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 170 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 171 } 172 173 { /* CIRCLED NUMBER FIFTEEN */ 174 static const uint8_t input[] = { 0xE2, 0x91, 0xAE }; 175 static const uint8_t decomposed[] = { 0x31, 0x35 }; 176 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 177 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 178 } 179 180 { /* TRADE MARK SIGN */ 181 static const uint8_t input[] = { 0xE2, 0x84, 0xA2 }; 182 static const uint8_t decomposed[] = { 0x54, 0x4D }; 183 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 184 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 185 } 186 187 { /* LATIN SUBSCRIPT SMALL LETTER I */ 188 static const uint8_t input[] = { 0xE1, 0xB5, 0xA2 }; 189 static const uint8_t decomposed[] = { 0x69 }; 190 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 191 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 192 } 193 194 { /* PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS */ 195 static const uint8_t input[] = { 0xEF, 0xB8, 0xB5 }; 196 static const uint8_t decomposed[] = { 0x28 }; 197 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 198 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 199 } 200 201 { /* FULLWIDTH LATIN CAPITAL LETTER A */ 202 static const uint8_t input[] = { 0xEF, 0xBC, 0xA1 }; 203 static const uint8_t decomposed[] = { 0x41 }; 204 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 205 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 206 } 207 208 { /* HALFWIDTH IDEOGRAPHIC COMMA */ 209 static const uint8_t input[] = { 0xEF, 0xBD, 0xA4 }; 210 static const uint8_t decomposed[] = { 0xE3, 0x80, 0x81 }; 211 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 212 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 213 } 214 215 { /* SMALL IDEOGRAPHIC COMMA */ 216 static const uint8_t input[] = { 0xEF, 0xB9, 0x91 }; 217 static const uint8_t decomposed[] = { 0xE3, 0x80, 0x81 }; 218 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 219 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 220 } 221 222 { /* SQUARE MHZ */ 223 static const uint8_t input[] = { 0xE3, 0x8E, 0x92 }; 224 static const uint8_t decomposed[] = { 0x4D, 0x48, 0x7A }; 225 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 226 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 227 } 228 229 { /* VULGAR FRACTION THREE EIGHTHS */ 230 static const uint8_t input[] = { 0xE2, 0x85, 0x9C }; 231 static const uint8_t decomposed[] = { 0x33, 0xE2, 0x81, 0x84, 0x38 }; 232 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 233 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 234 } 235 236 { /* MICRO SIGN */ 237 static const uint8_t input[] = { 0xC2, 0xB5 }; 238 static const uint8_t decomposed[] = { 0xCE, 0xBC }; 239 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 240 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 241 } 242 243 { /* ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM */ 244 static const uint8_t input[] = { 0xEF, 0xB7, 0xBA }; 245 static const uint8_t decomposed[] = 246 { 0xD8, 0xB5, 0xD9, 0x84, 0xD9, 0x89, 0x20, 0xD8, 0xA7, 0xD9, 0x84, 0xD9, 247 0x84, 0xD9, 0x87, 0x20, 0xD8, 0xB9, 0xD9, 0x84, 0xD9, 0x8A, 0xD9, 0x87, 248 0x20, 0xD9, 0x88, 0xD8, 0xB3, 0xD9, 0x84, 0xD9, 0x85 249 }; 250 ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0); 251 ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0); 252 } 253 254 { /* HANGUL SYLLABLE GEUL */ 255 static const uint8_t input[] = { 0xEA, 0xB8, 0x80 }; 256 static const uint8_t decomposed[] = 257 { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF }; 258 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 259 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); 260 } 261 262 { /* HANGUL SYLLABLE GEU */ 263 static const uint8_t input[] = { 0xEA, 0xB7, 0xB8 }; 264 static const uint8_t decomposed[] = { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3 }; 265 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 266 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); 267 } 268 269 { /* "Gr���� Gott. ������������������������! x=(-b��sqrt(b��-4ac))/(2a) ���������,������,������" */ 270 static const uint8_t input[] = 271 { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.', 272 ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1, 273 0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9, 274 0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1, 275 's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')', 276 '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 277 0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',', 278 0xED, 0x95, 0x9C, 279 0xEA, 0xB8, 0x80, '\n' 280 }; 281 static const uint8_t decomposed[] = 282 { 'G', 'r', 0x75, 0xCC, 0x88, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.', 283 ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1, 284 0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB8, 0xCC, 0x86, 285 0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1, 286 's', 'q', 'r', 't', '(', 'b', 0x32, '-', '4', 'a', 'c', ')', ')', 287 '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 288 0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',', 289 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x86, 0xAB, 290 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF, '\n' 291 }; 292 static const uint8_t expected[] = 293 { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.', 294 ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1, 295 0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9, 296 0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1, 297 's', 'q', 'r', 't', '(', 'b', 0x32, '-', '4', 'a', 'c', ')', ')', 298 '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 299 0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',', 300 0xED, 0x95, 0x9C, 301 0xEA, 0xB8, 0x80, '\n' 302 }; 303 ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0); 304 ASSERT (check (decomposed, SIZEOF (decomposed), expected, SIZEOF (expected)) == 0); 305 ASSERT (check (expected, SIZEOF (expected), expected, SIZEOF (expected)) == 0); 306 } 307 308#if HAVE_DECL_ALARM 309 /* Declare failure if test takes too long, by using default abort 310 caused by SIGALRM. */ 311 signal (SIGALRM, SIG_DFL); 312 alarm (50); 313#endif 314 315 /* Check that the sorting is not O(n��) but O(n log n). */ 316 { 317 int pass; 318 for (pass = 0; pass < 3; pass++) 319 { 320 size_t repeat = 1; 321 size_t m = 100000; 322 uint8_t *input = (uint8_t *) malloc (2 * (2 * m - 1) * sizeof (uint8_t)); 323 if (input != NULL) 324 { 325 uint8_t *expected = input + (2 * m - 1); 326 size_t m1 = m / 2; 327 size_t m2 = (m - 1) / 2; 328 /* NB: m1 + m2 == m - 1. */ 329 uint8_t *p; 330 size_t i; 331 332 input[0] = 0x41; 333 p = input + 1; 334 switch (pass) 335 { 336 case 0: 337 for (i = 0; i < m1; i++) 338 { 339 *p++ = 0xCC; 340 *p++ = 0x99; 341 } 342 for (i = 0; i < m2; i++) 343 { 344 *p++ = 0xCC; 345 *p++ = 0x80; 346 } 347 break; 348 349 case 1: 350 for (i = 0; i < m2; i++) 351 { 352 *p++ = 0xCC; 353 *p++ = 0x80; 354 } 355 for (i = 0; i < m1; i++) 356 { 357 *p++ = 0xCC; 358 *p++ = 0x99; 359 } 360 break; 361 362 case 2: 363 for (i = 0; i < m2; i++) 364 { 365 *p++ = 0xCC; 366 *p++ = 0x99; 367 *p++ = 0xCC; 368 *p++ = 0x80; 369 } 370 for (; i < m1; i++) 371 { 372 *p++ = 0xCC; 373 *p++ = 0x99; 374 } 375 break; 376 377 default: 378 abort (); 379 } 380 381 expected[0] = 0xC3; 382 expected[1] = 0x80; 383 p = expected + 2; 384 for (i = 0; i < m1; i++) 385 { 386 *p++ = 0xCC; 387 *p++ = 0x99; 388 } 389 for (i = 0; i < m2 - 1; i++) 390 { 391 *p++ = 0xCC; 392 *p++ = 0x80; 393 } 394 395 for (; repeat > 0; repeat--) 396 { 397 ASSERT (check (input, 2 * m - 1, expected, 2 * m - 2) == 0); 398 ASSERT (check (expected, 2 * m - 2, expected, 2 * m - 2) == 0); 399 } 400 401 free (input); 402 } 403 } 404 } 405} 406 407#else 408 409void 410test_u8_nfkc (void) 411{ 412} 413 414#endif 415