1/* Case mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent). 2 Copyright (C) 2009-2010 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2009. 4 5 This program is free software: you can redistribute it and/or modify it 6 under the terms of the GNU Lesser General Public License as published 7 by the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18UNIT * 19FUNC (const UNIT *s, size_t n, 20 casing_prefix_context_t prefix_context, 21 casing_suffix_context_t suffix_context, 22 const char *iso639_language, 23 ucs4_t (*single_character_map) (ucs4_t), 24 size_t offset_in_rule, /* offset in 'struct special_casing_rule' */ 25 uninorm_t nf, 26 UNIT *resultbuf, size_t *lengthp) 27{ 28 /* The result being accumulated. */ 29 UNIT *result; 30 size_t length; 31 size_t allocated; 32 33 /* Initialize the accumulator. */ 34 if (nf != NULL || resultbuf == NULL) 35 { 36 result = NULL; 37 allocated = 0; 38 } 39 else 40 { 41 result = resultbuf; 42 allocated = *lengthp; 43 } 44 length = 0; 45 46 { 47 const UNIT *s_end = s + n; 48 49 /* Helper for evaluating the FINAL_SIGMA condition: 50 Last character that was not case-ignorable. */ 51 ucs4_t last_char_except_ignorable = 52 prefix_context.last_char_except_ignorable; 53 54 /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions: 55 Last character that was of combining class 230 ("Above") or 0. */ 56 ucs4_t last_char_normal_or_above = 57 prefix_context.last_char_normal_or_above; 58 59 while (s < s_end) 60 { 61 ucs4_t uc; 62 int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); 63 64 ucs4_t mapped_uc[3]; 65 unsigned int mapped_count; 66 67 if (uc < 0x10000) 68 { 69 /* Look first in the special-casing table. */ 70 char code[3]; 71 72 code[0] = (uc >> 8) & 0xff; 73 code[1] = uc & 0xff; 74 75 for (code[2] = 0; ; code[2]++) 76 { 77 const struct special_casing_rule *rule = 78 gl_unicase_special_lookup (code, 3); 79 80 if (rule == NULL) 81 break; 82 83 /* Test if the condition applies. */ 84 /* Does the language apply? */ 85 if (rule->language[0] == '\0' 86 || (iso639_language != NULL 87 && iso639_language[0] == rule->language[0] 88 && iso639_language[1] == rule->language[1])) 89 { 90 /* Does the context apply? */ 91 int context = rule->context; 92 bool applies; 93 94 if (context < 0) 95 context = - context; 96 switch (context) 97 { 98 case SCC_ALWAYS: 99 applies = true; 100 break; 101 102 case SCC_FINAL_SIGMA: 103 /* "Before" condition: preceded by a sequence 104 consisting of a cased letter and a case-ignorable 105 sequence. 106 "After" condition: not followed by a sequence 107 consisting of a case-ignorable sequence and then a 108 cased letter. */ 109 /* Test the "before" condition. */ 110 applies = uc_is_cased (last_char_except_ignorable); 111 /* Test the "after" condition. */ 112 if (applies) 113 { 114 const UNIT *s2 = s + count; 115 for (;;) 116 { 117 if (s2 < s_end) 118 { 119 ucs4_t uc2; 120 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); 121 /* Our uc_is_case_ignorable function is 122 known to return false for all cased 123 characters. So we can call 124 uc_is_case_ignorable first. */ 125 if (!uc_is_case_ignorable (uc2)) 126 { 127 applies = ! uc_is_cased (uc2); 128 break; 129 } 130 s2 += count2; 131 } 132 else 133 { 134 applies = ! uc_is_cased (suffix_context.first_char_except_ignorable); 135 break; 136 } 137 } 138 } 139 break; 140 141 case SCC_AFTER_SOFT_DOTTED: 142 /* "Before" condition: There is a Soft_Dotted character 143 before it, with no intervening character of 144 combining class 0 or 230 (Above). */ 145 /* Test the "before" condition. */ 146 applies = uc_is_property_soft_dotted (last_char_normal_or_above); 147 break; 148 149 case SCC_MORE_ABOVE: 150 /* "After" condition: followed by a character of 151 combining class 230 (Above) with no intervening 152 character of combining class 0 or 230 (Above). */ 153 /* Test the "after" condition. */ 154 { 155 const UNIT *s2 = s + count; 156 applies = false; 157 for (;;) 158 { 159 if (s2 < s_end) 160 { 161 ucs4_t uc2; 162 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); 163 int ccc = uc_combining_class (uc2); 164 if (ccc == UC_CCC_A) 165 { 166 applies = true; 167 break; 168 } 169 if (ccc == UC_CCC_NR) 170 break; 171 s2 += count2; 172 } 173 else 174 { 175 applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0); 176 break; 177 } 178 } 179 } 180 break; 181 182 case SCC_BEFORE_DOT: 183 /* "After" condition: followed by COMBINING DOT ABOVE 184 (U+0307). Any sequence of characters with a 185 combining class that is neither 0 nor 230 may 186 intervene between the current character and the 187 combining dot above. */ 188 /* Test the "after" condition. */ 189 { 190 const UNIT *s2 = s + count; 191 applies = false; 192 for (;;) 193 { 194 if (s2 < s_end) 195 { 196 ucs4_t uc2; 197 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); 198 if (uc2 == 0x0307) /* COMBINING DOT ABOVE */ 199 { 200 applies = true; 201 break; 202 } 203 { 204 int ccc = uc_combining_class (uc2); 205 if (ccc == UC_CCC_A || ccc == UC_CCC_NR) 206 break; 207 } 208 s2 += count2; 209 } 210 else 211 { 212 applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0); 213 break; 214 } 215 } 216 } 217 break; 218 219 case SCC_AFTER_I: 220 /* "Before" condition: There is an uppercase I before 221 it, and there is no intervening character of 222 combining class 0 or 230 (Above). */ 223 /* Test the "before" condition. */ 224 applies = (last_char_normal_or_above == 'I'); 225 break; 226 227 default: 228 abort (); 229 } 230 if (rule->context < 0) 231 applies = !applies; 232 233 if (applies) 234 { 235 /* The rule applies. 236 Look up the mapping (0 to 3 characters). */ 237 const unsigned short *mapped_in_rule = 238 (const unsigned short *)((const char *)rule + offset_in_rule); 239 240 if (mapped_in_rule[0] == 0) 241 mapped_count = 0; 242 else 243 { 244 mapped_uc[0] = mapped_in_rule[0]; 245 if (mapped_in_rule[1] == 0) 246 mapped_count = 1; 247 else 248 { 249 mapped_uc[1] = mapped_in_rule[1]; 250 if (mapped_in_rule[2] == 0) 251 mapped_count = 2; 252 else 253 { 254 mapped_uc[2] = mapped_in_rule[2]; 255 mapped_count = 3; 256 } 257 } 258 } 259 goto found_mapping; 260 } 261 } 262 263 /* Optimization: Save a hash table lookup in the next round. */ 264 if (!rule->has_next) 265 break; 266 } 267 } 268 269 /* No special-cased mapping. So use the locale and context independent 270 mapping. */ 271 mapped_uc[0] = single_character_map (uc); 272 mapped_count = 1; 273 274 found_mapping: 275 /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1]. */ 276 { 277 unsigned int i; 278 279 for (i = 0; i < mapped_count; i++) 280 { 281 ucs4_t muc = mapped_uc[i]; 282 283 /* Append muc to the result accumulator. */ 284 if (length < allocated) 285 { 286 int ret = U_UCTOMB (result + length, muc, allocated - length); 287 if (ret == -1) 288 { 289 errno = EINVAL; 290 goto fail; 291 } 292 if (ret >= 0) 293 { 294 length += ret; 295 goto done_appending; 296 } 297 } 298 { 299 size_t old_allocated = allocated; 300 size_t new_allocated = 2 * old_allocated; 301 if (new_allocated < 64) 302 new_allocated = 64; 303 if (new_allocated < old_allocated) /* integer overflow? */ 304 abort (); 305 { 306 UNIT *larger_result; 307 if (result == NULL) 308 { 309 larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); 310 if (larger_result == NULL) 311 { 312 errno = ENOMEM; 313 goto fail; 314 } 315 } 316 else if (result == resultbuf) 317 { 318 larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); 319 if (larger_result == NULL) 320 { 321 errno = ENOMEM; 322 goto fail; 323 } 324 U_CPY (larger_result, resultbuf, length); 325 } 326 else 327 { 328 larger_result = 329 (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); 330 if (larger_result == NULL) 331 { 332 errno = ENOMEM; 333 goto fail; 334 } 335 } 336 result = larger_result; 337 allocated = new_allocated; 338 { 339 int ret = U_UCTOMB (result + length, muc, allocated - length); 340 if (ret == -1) 341 { 342 errno = EINVAL; 343 goto fail; 344 } 345 if (ret < 0) 346 abort (); 347 length += ret; 348 goto done_appending; 349 } 350 } 351 } 352 done_appending: ; 353 } 354 } 355 356 if (!uc_is_case_ignorable (uc)) 357 last_char_except_ignorable = uc; 358 359 { 360 int ccc = uc_combining_class (uc); 361 if (ccc == UC_CCC_A || ccc == UC_CCC_NR) 362 last_char_normal_or_above = uc; 363 } 364 365 s += count; 366 } 367 } 368 369 if (nf != NULL) 370 { 371 /* Finally, normalize the result. */ 372 UNIT *normalized_result; 373 374 normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp); 375 if (normalized_result == NULL) 376 goto fail; 377 378 free (result); 379 return normalized_result; 380 } 381 382 if (length == 0) 383 { 384 if (result == NULL) 385 { 386 /* Return a non-NULL value. NULL means error. */ 387 result = (UNIT *) malloc (1); 388 if (result == NULL) 389 { 390 errno = ENOMEM; 391 goto fail; 392 } 393 } 394 } 395 else if (result != resultbuf && length < allocated) 396 { 397 /* Shrink the allocated memory if possible. */ 398 UNIT *memory; 399 400 memory = (UNIT *) realloc (result, length * sizeof (UNIT)); 401 if (memory != NULL) 402 result = memory; 403 } 404 405 *lengthp = length; 406 return result; 407 408 fail: 409 if (result != resultbuf) 410 { 411 int saved_errno = errno; 412 free (result); 413 errno = saved_errno; 414 } 415 return NULL; 416} 417