1/* Titlecase mapping for UTF-8/UTF-16/UTF-32 substrings (locale dependent). 2 Copyright (C) 2009-2010 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2009. 4 5 This program is free software: you can redistribute it and/or modify it 6 under the terms of the GNU Lesser General Public License as published 7 by the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18/* Quoting the Unicode standard, section "Default Case Algorithms": 19 Find the word boundaries in X according to Unicode Standard Annex #29, 20 ���Text Boundaries.��� For each word boundary, find the first cased character 21 F following the word boundary. If F exists, map F to Titlecase_Mapping(F); 22 then map all characters C between F and the following word boundary to 23 Lowercase_Mapping(C). */ 24 25UNIT * 26FUNC (const UNIT *s, size_t n, 27 casing_prefix_context_t prefix_context, 28 casing_suffix_context_t suffix_context, 29 const char *iso639_language, 30 uninorm_t nf, 31 UNIT *resultbuf, size_t *lengthp) 32{ 33 /* The result being accumulated. */ 34 UNIT *result; 35 size_t length; 36 size_t allocated; 37 /* An array containing the word break positions. */ 38 char *wordbreaks; 39 40 /* Initialize the accumulator. */ 41 if (nf != NULL || resultbuf == NULL) 42 { 43 result = NULL; 44 allocated = 0; 45 } 46 else 47 { 48 result = resultbuf; 49 allocated = *lengthp; 50 } 51 length = 0; 52 53 /* Initialize the word breaks array. */ 54 if (n > 0) 55 { 56 wordbreaks = (char *) malloc (n); 57 if (wordbreaks == NULL) 58 { 59 errno = ENOMEM; 60 goto fail2; 61 } 62 U_WORDBREAKS (s, n, wordbreaks); 63 } 64 else 65 wordbreaks = NULL; 66 67 { 68 const UNIT *s_end = s + n; 69 const char *wp = wordbreaks; 70 71 /* When considering the string as segmented by word boundaries: For each 72 such segment: 73 - In the first part, we are searching for the first cased character. 74 In this state, in_word_first_part = true, and no conversion takes 75 place. 76 - In the second part, we are converting every character: the first 77 among these characters to title case, the other ones to lower case. 78 In this state, in_word_first_part = false. */ 79 bool in_word_first_part = true; 80 81 /* Helper for evaluating the FINAL_SIGMA condition: 82 Last character that was not case-ignorable. */ 83 ucs4_t last_char_except_ignorable = 84 prefix_context.last_char_except_ignorable; 85 86 /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions: 87 Last character that was of combining class 230 ("Above") or 0. */ 88 ucs4_t last_char_normal_or_above = 89 prefix_context.last_char_normal_or_above; 90 91 while (s < s_end) 92 { 93 /* Fetch the next character. */ 94 ucs4_t uc; 95 int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); 96 97 ucs4_t (*single_character_map) (ucs4_t); 98 size_t offset_in_rule; /* offset in 'struct special_casing_rule' */ 99 100 ucs4_t mapped_uc[3]; 101 unsigned int mapped_count; 102 103 if (*wp) 104 /* Crossing a word boundary. */ 105 in_word_first_part = true; 106 107 /* Determine single_character_map, offset_in_rule. 108 There are three possibilities: 109 - uc should not be converted. 110 - uc should be titlecased. 111 - uc should be lowercased. */ 112 if (in_word_first_part) 113 { 114 if (uc_is_cased (uc)) 115 { 116 /* uc is to be titlecased. */ 117 single_character_map = uc_totitle; 118 offset_in_rule = offsetof (struct special_casing_rule, title[0]); 119 in_word_first_part = false; 120 } 121 else 122 { 123 /* uc is not converted. */ 124 single_character_map = NULL; 125 offset_in_rule = 0; 126 } 127 } 128 else 129 { 130 /* uc is to be lowercased. */ 131 single_character_map = uc_tolower; 132 offset_in_rule = offsetof (struct special_casing_rule, lower[0]); 133 } 134 135 /* Actually map uc. */ 136 if (single_character_map == NULL) 137 { 138 mapped_uc[0] = uc; 139 mapped_count = 1; 140 goto found_mapping; 141 } 142 143 if (uc < 0x10000) 144 { 145 /* Look first in the special-casing table. */ 146 char code[3]; 147 148 code[0] = (uc >> 8) & 0xff; 149 code[1] = uc & 0xff; 150 151 for (code[2] = 0; ; code[2]++) 152 { 153 const struct special_casing_rule *rule = 154 gl_unicase_special_lookup (code, 3); 155 156 if (rule == NULL) 157 break; 158 159 /* Test if the condition applies. */ 160 /* Does the language apply? */ 161 if (rule->language[0] == '\0' 162 || (iso639_language != NULL 163 && iso639_language[0] == rule->language[0] 164 && iso639_language[1] == rule->language[1])) 165 { 166 /* Does the context apply? */ 167 int context = rule->context; 168 bool applies; 169 170 if (context < 0) 171 context = - context; 172 switch (context) 173 { 174 case SCC_ALWAYS: 175 applies = true; 176 break; 177 178 case SCC_FINAL_SIGMA: 179 /* "Before" condition: preceded by a sequence 180 consisting of a cased letter and a case-ignorable 181 sequence. 182 "After" condition: not followed by a sequence 183 consisting of a case-ignorable sequence and then a 184 cased letter. */ 185 /* Test the "before" condition. */ 186 applies = uc_is_cased (last_char_except_ignorable); 187 /* Test the "after" condition. */ 188 if (applies) 189 { 190 const UNIT *s2 = s + count; 191 for (;;) 192 { 193 if (s2 < s_end) 194 { 195 ucs4_t uc2; 196 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); 197 /* Our uc_is_case_ignorable function is 198 known to return false for all cased 199 characters. So we can call 200 uc_is_case_ignorable first. */ 201 if (!uc_is_case_ignorable (uc2)) 202 { 203 applies = ! uc_is_cased (uc2); 204 break; 205 } 206 s2 += count2; 207 } 208 else 209 { 210 applies = ! uc_is_cased (suffix_context.first_char_except_ignorable); 211 break; 212 } 213 } 214 } 215 break; 216 217 case SCC_AFTER_SOFT_DOTTED: 218 /* "Before" condition: There is a Soft_Dotted character 219 before it, with no intervening character of 220 combining class 0 or 230 (Above). */ 221 /* Test the "before" condition. */ 222 applies = uc_is_property_soft_dotted (last_char_normal_or_above); 223 break; 224 225 case SCC_MORE_ABOVE: 226 /* "After" condition: followed by a character of 227 combining class 230 (Above) with no intervening 228 character of combining class 0 or 230 (Above). */ 229 /* Test the "after" condition. */ 230 { 231 const UNIT *s2 = s + count; 232 applies = false; 233 for (;;) 234 { 235 if (s2 < s_end) 236 { 237 ucs4_t uc2; 238 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); 239 int ccc = uc_combining_class (uc2); 240 if (ccc == UC_CCC_A) 241 { 242 applies = true; 243 break; 244 } 245 if (ccc == UC_CCC_NR) 246 break; 247 s2 += count2; 248 } 249 else 250 { 251 applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0); 252 break; 253 } 254 } 255 } 256 break; 257 258 case SCC_BEFORE_DOT: 259 /* "After" condition: followed by COMBINING DOT ABOVE 260 (U+0307). Any sequence of characters with a 261 combining class that is neither 0 nor 230 may 262 intervene between the current character and the 263 combining dot above. */ 264 /* Test the "after" condition. */ 265 { 266 const UNIT *s2 = s + count; 267 applies = false; 268 for (;;) 269 { 270 if (s2 < s_end) 271 { 272 ucs4_t uc2; 273 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); 274 if (uc2 == 0x0307) /* COMBINING DOT ABOVE */ 275 { 276 applies = true; 277 break; 278 } 279 { 280 int ccc = uc_combining_class (uc2); 281 if (ccc == UC_CCC_A || ccc == UC_CCC_NR) 282 break; 283 } 284 s2 += count2; 285 } 286 else 287 { 288 applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0); 289 break; 290 } 291 } 292 } 293 break; 294 295 case SCC_AFTER_I: 296 /* "Before" condition: There is an uppercase I before 297 it, and there is no intervening character of 298 combining class 0 or 230 (Above). */ 299 /* Test the "before" condition. */ 300 applies = (last_char_normal_or_above == 'I'); 301 break; 302 303 default: 304 abort (); 305 } 306 if (rule->context < 0) 307 applies = !applies; 308 309 if (applies) 310 { 311 /* The rule applies. 312 Look up the mapping (0 to 3 characters). */ 313 const unsigned short *mapped_in_rule = 314 (const unsigned short *)((const char *)rule + offset_in_rule); 315 316 if (mapped_in_rule[0] == 0) 317 mapped_count = 0; 318 else 319 { 320 mapped_uc[0] = mapped_in_rule[0]; 321 if (mapped_in_rule[1] == 0) 322 mapped_count = 1; 323 else 324 { 325 mapped_uc[1] = mapped_in_rule[1]; 326 if (mapped_in_rule[2] == 0) 327 mapped_count = 2; 328 else 329 { 330 mapped_uc[2] = mapped_in_rule[2]; 331 mapped_count = 3; 332 } 333 } 334 } 335 goto found_mapping; 336 } 337 } 338 339 /* Optimization: Save a hash table lookup in the next round. */ 340 if (!rule->has_next) 341 break; 342 } 343 } 344 345 /* No special-cased mapping. So use the locale and context independent 346 mapping. */ 347 mapped_uc[0] = single_character_map (uc); 348 mapped_count = 1; 349 350 found_mapping: 351 /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1]. */ 352 { 353 unsigned int i; 354 355 for (i = 0; i < mapped_count; i++) 356 { 357 ucs4_t muc = mapped_uc[i]; 358 359 /* Append muc to the result accumulator. */ 360 if (length < allocated) 361 { 362 int ret = U_UCTOMB (result + length, muc, allocated - length); 363 if (ret == -1) 364 { 365 errno = EINVAL; 366 goto fail1; 367 } 368 if (ret >= 0) 369 { 370 length += ret; 371 goto done_appending; 372 } 373 } 374 { 375 size_t old_allocated = allocated; 376 size_t new_allocated = 2 * old_allocated; 377 if (new_allocated < 64) 378 new_allocated = 64; 379 if (new_allocated < old_allocated) /* integer overflow? */ 380 abort (); 381 { 382 UNIT *larger_result; 383 if (result == NULL) 384 { 385 larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); 386 if (larger_result == NULL) 387 { 388 errno = ENOMEM; 389 goto fail1; 390 } 391 } 392 else if (result == resultbuf) 393 { 394 larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); 395 if (larger_result == NULL) 396 { 397 errno = ENOMEM; 398 goto fail1; 399 } 400 U_CPY (larger_result, resultbuf, length); 401 } 402 else 403 { 404 larger_result = 405 (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); 406 if (larger_result == NULL) 407 { 408 errno = ENOMEM; 409 goto fail1; 410 } 411 } 412 result = larger_result; 413 allocated = new_allocated; 414 { 415 int ret = U_UCTOMB (result + length, muc, allocated - length); 416 if (ret == -1) 417 { 418 errno = EINVAL; 419 goto fail1; 420 } 421 if (ret < 0) 422 abort (); 423 length += ret; 424 goto done_appending; 425 } 426 } 427 } 428 done_appending: ; 429 } 430 } 431 432 if (!uc_is_case_ignorable (uc)) 433 last_char_except_ignorable = uc; 434 435 { 436 int ccc = uc_combining_class (uc); 437 if (ccc == UC_CCC_A || ccc == UC_CCC_NR) 438 last_char_normal_or_above = uc; 439 } 440 441 s += count; 442 wp += count; 443 } 444 } 445 446 free (wordbreaks); 447 448 if (nf != NULL) 449 { 450 /* Finally, normalize the result. */ 451 UNIT *normalized_result; 452 453 normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp); 454 if (normalized_result == NULL) 455 goto fail2; 456 457 free (result); 458 return normalized_result; 459 } 460 461 if (length == 0) 462 { 463 if (result == NULL) 464 { 465 /* Return a non-NULL value. NULL means error. */ 466 result = (UNIT *) malloc (1); 467 if (result == NULL) 468 { 469 errno = ENOMEM; 470 goto fail2; 471 } 472 } 473 } 474 else if (result != resultbuf && length < allocated) 475 { 476 /* Shrink the allocated memory if possible. */ 477 UNIT *memory; 478 479 memory = (UNIT *) realloc (result, length * sizeof (UNIT)); 480 if (memory != NULL) 481 result = memory; 482 } 483 484 *lengthp = length; 485 return result; 486 487 fail1: 488 { 489 int saved_errno = errno; 490 free (wordbreaks); 491 errno = saved_errno; 492 } 493 fail2: 494 if (result != resultbuf) 495 { 496 int saved_errno = errno; 497 free (result); 498 errno = saved_errno; 499 } 500 return NULL; 501} 502