1/* Test of canonical normalization of UTF-8 strings. 2 Copyright (C) 2009, 2010 Free Software Foundation, Inc. 3 4 This program is free software: you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 3 of the License, or 7 (at your option) any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 16 17/* Written by Bruno Haible <bruno@clisp.org>, 2009. */ 18 19#include <config.h> 20 21#if GNULIB_TEST_UNINORM_U8_NORMALIZE 22 23#include "uninorm.h" 24 25#include <signal.h> 26#include <stdlib.h> 27#include <unistd.h> 28 29#include "unistr.h" 30#include "macros.h" 31 32static int 33check (const uint8_t *input, size_t input_length, 34 const uint8_t *expected, size_t expected_length) 35{ 36 size_t length; 37 uint8_t *result; 38 39 /* Test return conventions with resultbuf == NULL. */ 40 result = u8_normalize (UNINORM_NFC, input, input_length, NULL, &length); 41 if (!(result != NULL)) 42 return 1; 43 if (!(length == expected_length)) 44 return 2; 45 if (!(u8_cmp (result, expected, expected_length) == 0)) 46 return 3; 47 free (result); 48 49 /* Test return conventions with resultbuf too small. */ 50 if (expected_length > 0) 51 { 52 uint8_t *preallocated; 53 54 length = expected_length - 1; 55 preallocated = (uint8_t *) malloc (length * sizeof (uint8_t)); 56 result = u8_normalize (UNINORM_NFC, input, input_length, preallocated, &length); 57 if (!(result != NULL)) 58 return 4; 59 if (!(result != preallocated)) 60 return 5; 61 if (!(length == expected_length)) 62 return 6; 63 if (!(u8_cmp (result, expected, expected_length) == 0)) 64 return 7; 65 free (result); 66 free (preallocated); 67 } 68 69 /* Test return conventions with resultbuf large enough. */ 70 { 71 uint8_t *preallocated; 72 73 length = expected_length; 74 preallocated = (uint8_t *) malloc (length * sizeof (uint8_t)); 75 result = u8_normalize (UNINORM_NFC, input, input_length, preallocated, &length); 76 if (!(result != NULL)) 77 return 8; 78 if (!(preallocated == NULL || result == preallocated)) 79 return 9; 80 if (!(length == expected_length)) 81 return 10; 82 if (!(u8_cmp (result, expected, expected_length) == 0)) 83 return 11; 84 free (preallocated); 85 } 86 87 return 0; 88} 89 90void 91test_u8_nfc (void) 92{ 93 { /* Empty string. */ 94 ASSERT (check (NULL, 0, NULL, 0) == 0); 95 } 96 { /* SPACE */ 97 static const uint8_t input[] = { 0x20 }; 98 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 99 } 100 101 { /* LATIN CAPITAL LETTER A WITH DIAERESIS */ 102 static const uint8_t input[] = { 0xC3, 0x84 }; 103 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88 }; 104 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 105 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); 106 } 107 108 { /* LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON */ 109 static const uint8_t input[] = { 0xC7, 0x9E }; 110 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88, 0xCC, 0x84 }; 111 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 112 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); 113 } 114 115 { /* ANGSTROM SIGN */ 116 static const uint8_t input[] = { 0xE2, 0x84, 0xAB }; 117 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x8A }; 118 static const uint8_t expected[] = { 0xC3, 0x85 }; 119 ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0); 120 ASSERT (check (decomposed, SIZEOF (decomposed), expected, SIZEOF (expected)) == 0); 121 ASSERT (check (expected, SIZEOF (expected), expected, SIZEOF (expected)) == 0); 122 } 123 124 { /* GREEK DIALYTIKA AND PERISPOMENI */ 125 static const uint8_t input[] = { 0xE1, 0xBF, 0x81 }; 126 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 127 } 128 129 { /* SCRIPT SMALL L */ 130 static const uint8_t input[] = { 0xE2, 0x84, 0x93 }; 131 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 132 } 133 134 { /* NO-BREAK SPACE */ 135 static const uint8_t input[] = { 0xC2, 0xA0 }; 136 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 137 } 138 139 { /* ARABIC LETTER VEH INITIAL FORM */ 140 static const uint8_t input[] = { 0xEF, 0xAD, 0xAC }; 141 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 142 } 143 144 { /* ARABIC LETTER VEH MEDIAL FORM */ 145 static const uint8_t input[] = { 0xEF, 0xAD, 0xAD }; 146 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 147 } 148 149 { /* ARABIC LETTER VEH FINAL FORM */ 150 static const uint8_t input[] = { 0xEF, 0xAD, 0xAB }; 151 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 152 } 153 154 { /* ARABIC LETTER VEH ISOLATED FORM */ 155 static const uint8_t input[] = { 0xEF, 0xAD, 0xAA }; 156 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 157 } 158 159 { /* CIRCLED NUMBER FIFTEEN */ 160 static const uint8_t input[] = { 0xE2, 0x91, 0xAE }; 161 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 162 } 163 164 { /* TRADE MARK SIGN */ 165 static const uint8_t input[] = { 0xE2, 0x84, 0xA2 }; 166 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 167 } 168 169 { /* LATIN SUBSCRIPT SMALL LETTER I */ 170 static const uint8_t input[] = { 0xE1, 0xB5, 0xA2 }; 171 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 172 } 173 174 { /* PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS */ 175 static const uint8_t input[] = { 0xEF, 0xB8, 0xB5 }; 176 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 177 } 178 179 { /* FULLWIDTH LATIN CAPITAL LETTER A */ 180 static const uint8_t input[] = { 0xEF, 0xBC, 0xA1 }; 181 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 182 } 183 184 { /* HALFWIDTH IDEOGRAPHIC COMMA */ 185 static const uint8_t input[] = { 0xEF, 0xBD, 0xA4 }; 186 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 187 } 188 189 { /* SMALL IDEOGRAPHIC COMMA */ 190 static const uint8_t input[] = { 0xEF, 0xB9, 0x91 }; 191 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 192 } 193 194 { /* SQUARE MHZ */ 195 static const uint8_t input[] = { 0xE3, 0x8E, 0x92 }; 196 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 197 } 198 199 { /* VULGAR FRACTION THREE EIGHTHS */ 200 static const uint8_t input[] = { 0xE2, 0x85, 0x9C }; 201 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 202 } 203 204 { /* MICRO SIGN */ 205 static const uint8_t input[] = { 0xC2, 0xB5 }; 206 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 207 } 208 209 { /* ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM */ 210 static const uint8_t input[] = { 0xEF, 0xB7, 0xBA }; 211 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 212 } 213 214 { /* HANGUL SYLLABLE GEUL */ 215 static const uint8_t input[] = { 0xEA, 0xB8, 0x80 }; 216 static const uint8_t decomposed[] = 217 { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF }; 218 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 219 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); 220 } 221 222 { /* HANGUL SYLLABLE GEU */ 223 static const uint8_t input[] = { 0xEA, 0xB7, 0xB8 }; 224 static const uint8_t decomposed[] = { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3 }; 225 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 226 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); 227 } 228 229 { /* "Gr���� Gott. ������������������������! x=(-b��sqrt(b��-4ac))/(2a) ���������,������,������" */ 230 static const uint8_t input[] = 231 { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.', 232 ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1, 233 0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9, 234 0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1, 235 's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')', 236 '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 237 0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',', 238 0xED, 0x95, 0x9C, 239 0xEA, 0xB8, 0x80, '\n' 240 }; 241 static const uint8_t decomposed[] = 242 { 'G', 'r', 0x75, 0xCC, 0x88, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.', 243 ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1, 244 0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB8, 0xCC, 0x86, 245 0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1, 246 's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')', 247 '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 248 0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',', 249 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x86, 0xAB, 250 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF, '\n' 251 }; 252 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0); 253 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0); 254 } 255 256#if HAVE_DECL_ALARM 257 /* Declare failure if test takes too long, by using default abort 258 caused by SIGALRM. */ 259 signal (SIGALRM, SIG_DFL); 260 alarm (50); 261#endif 262 263 /* Check that the sorting is not O(n��) but O(n log n). */ 264 { 265 int pass; 266 for (pass = 0; pass < 3; pass++) 267 { 268 size_t repeat = 1; 269 size_t m = 100000; 270 uint8_t *input = (uint8_t *) malloc (2 * (2 * m - 1) * sizeof (uint8_t)); 271 if (input != NULL) 272 { 273 uint8_t *expected = input + (2 * m - 1); 274 size_t m1 = m / 2; 275 size_t m2 = (m - 1) / 2; 276 /* NB: m1 + m2 == m - 1. */ 277 uint8_t *p; 278 size_t i; 279 280 input[0] = 0x41; 281 p = input + 1; 282 switch (pass) 283 { 284 case 0: 285 for (i = 0; i < m1; i++) 286 { 287 *p++ = 0xCC; 288 *p++ = 0x99; 289 } 290 for (i = 0; i < m2; i++) 291 { 292 *p++ = 0xCC; 293 *p++ = 0x80; 294 } 295 break; 296 297 case 1: 298 for (i = 0; i < m2; i++) 299 { 300 *p++ = 0xCC; 301 *p++ = 0x80; 302 } 303 for (i = 0; i < m1; i++) 304 { 305 *p++ = 0xCC; 306 *p++ = 0x99; 307 } 308 break; 309 310 case 2: 311 for (i = 0; i < m2; i++) 312 { 313 *p++ = 0xCC; 314 *p++ = 0x99; 315 *p++ = 0xCC; 316 *p++ = 0x80; 317 } 318 for (; i < m1; i++) 319 { 320 *p++ = 0xCC; 321 *p++ = 0x99; 322 } 323 break; 324 325 default: 326 abort (); 327 } 328 329 expected[0] = 0xC3; 330 expected[1] = 0x80; 331 p = expected + 2; 332 for (i = 0; i < m1; i++) 333 { 334 *p++ = 0xCC; 335 *p++ = 0x99; 336 } 337 for (i = 0; i < m2 - 1; i++) 338 { 339 *p++ = 0xCC; 340 *p++ = 0x80; 341 } 342 343 for (; repeat > 0; repeat--) 344 { 345 ASSERT (check (input, 2 * m - 1, expected, 2 * m - 2) == 0); 346 ASSERT (check (expected, 2 * m - 2, expected, 2 * m - 2) == 0); 347 } 348 349 free (input); 350 } 351 } 352 } 353} 354 355#else 356 357void 358test_u8_nfc (void) 359{ 360} 361 362#endif 363