1/* Decomposition and composition of Unicode strings. 2 Copyright (C) 2009-2010 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2009. 4 5 This program is free software: you can redistribute it and/or modify it 6 under the terms of the GNU Lesser General Public License as published 7 by the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18UNIT * 19FUNC (uninorm_t nf, const UNIT *s, size_t n, 20 UNIT *resultbuf, size_t *lengthp) 21{ 22 int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer; 23 ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer; 24 25 /* The result being accumulated. */ 26 UNIT *result; 27 size_t length; 28 size_t allocated; 29 /* The buffer for sorting. */ 30 #define SORTBUF_PREALLOCATED 64 31 struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED]; 32 struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */ 33 size_t sortbuf_allocated; 34 size_t sortbuf_count; 35 36 /* Initialize the accumulator. */ 37 if (resultbuf == NULL) 38 { 39 result = NULL; 40 allocated = 0; 41 } 42 else 43 { 44 result = resultbuf; 45 allocated = *lengthp; 46 } 47 length = 0; 48 49 /* Initialize the buffer for sorting. */ 50 sortbuf = sortbuf_preallocated; 51 sortbuf_allocated = SORTBUF_PREALLOCATED; 52 sortbuf_count = 0; 53 54 { 55 const UNIT *s_end = s + n; 56 57 for (;;) 58 { 59 int count; 60 ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH]; 61 int decomposed_count; 62 int i; 63 64 if (s < s_end) 65 { 66 /* Fetch the next character. */ 67 count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s); 68 decomposed_count = 1; 69 70 /* Decompose it, recursively. 71 It would be possible to precompute the recursive decomposition 72 and store it in a table. But this would significantly increase 73 the size of the decomposition tables, because for example for 74 U+1FC1 the recursive canonical decomposition and the recursive 75 compatibility decomposition are different. */ 76 { 77 int curr; 78 79 for (curr = 0; curr < decomposed_count; ) 80 { 81 /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e. 82 all elements are atomic. */ 83 ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH]; 84 int curr_decomposed_count; 85 86 curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed); 87 if (curr_decomposed_count >= 0) 88 { 89 /* Move curr_decomposed[0..curr_decomposed_count-1] over 90 decomposed[curr], making room. It's not worth using 91 memcpy() here, since the counts are so small. */ 92 int shift = curr_decomposed_count - 1; 93 94 if (shift < 0) 95 abort (); 96 if (shift > 0) 97 { 98 int j; 99 100 decomposed_count += shift; 101 if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH) 102 abort (); 103 for (j = decomposed_count - 1 - shift; j > curr; j--) 104 decomposed[j + shift] = decomposed[j]; 105 } 106 for (; shift >= 0; shift--) 107 decomposed[curr + shift] = curr_decomposed[shift]; 108 } 109 else 110 { 111 /* decomposed[curr] is atomic. */ 112 curr++; 113 } 114 } 115 } 116 } 117 else 118 { 119 count = 0; 120 decomposed_count = 0; 121 } 122 123 i = 0; 124 for (;;) 125 { 126 ucs4_t uc; 127 int ccc; 128 129 if (s < s_end) 130 { 131 /* Fetch the next character from the decomposition. */ 132 if (i == decomposed_count) 133 break; 134 uc = decomposed[i]; 135 ccc = uc_combining_class (uc); 136 } 137 else 138 { 139 /* End of string reached. */ 140 uc = 0; 141 ccc = 0; 142 } 143 144 if (ccc == 0) 145 { 146 size_t j; 147 148 /* Apply the canonical ordering algorithm to the accumulated 149 sequence of characters. */ 150 if (sortbuf_count > 1) 151 gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count, 152 sortbuf + sortbuf_count); 153 154 if (composer != NULL) 155 { 156 /* Attempt to combine decomposed characters, as specified 157 in the Unicode Standard Annex #15 "Unicode Normalization 158 Forms". We need to check 159 1. whether the first accumulated character is a 160 "starter" (i.e. has ccc = 0). This is usually the 161 case. But when the string starts with a 162 non-starter, the sortbuf also starts with a 163 non-starter. Btw, this check could also be 164 omitted, because the composition table has only 165 entries (code1, code2) for which code1 is a 166 starter; if the first accumulated character is not 167 a starter, no lookup will succeed. 168 2. If the sortbuf has more than one character, check 169 for each of these characters that are not "blocked" 170 from the starter (i.e. have a ccc that is higher 171 than the ccc of the previous character) whether it 172 can be combined with the first character. 173 3. If only one character is left in sortbuf, check 174 whether it can be combined with the next character 175 (also a starter). */ 176 if (sortbuf_count > 0 && sortbuf[0].ccc == 0) 177 { 178 for (j = 1; j < sortbuf_count; ) 179 { 180 if (sortbuf[j].ccc > sortbuf[j - 1].ccc) 181 { 182 ucs4_t combined = 183 composer (sortbuf[0].code, sortbuf[j].code); 184 if (combined) 185 { 186 size_t k; 187 188 sortbuf[0].code = combined; 189 /* sortbuf[0].ccc = 0, still valid. */ 190 for (k = j + 1; k < sortbuf_count; k++) 191 sortbuf[k - 1] = sortbuf[k]; 192 sortbuf_count--; 193 continue; 194 } 195 } 196 j++; 197 } 198 if (s < s_end && sortbuf_count == 1) 199 { 200 ucs4_t combined = 201 composer (sortbuf[0].code, uc); 202 if (combined) 203 { 204 uc = combined; 205 ccc = 0; 206 /* uc could be further combined with subsequent 207 characters. So don't put it into sortbuf[0] in 208 this round, only in the next round. */ 209 sortbuf_count = 0; 210 } 211 } 212 } 213 } 214 215 for (j = 0; j < sortbuf_count; j++) 216 { 217 ucs4_t muc = sortbuf[j].code; 218 219 /* Append muc to the result accumulator. */ 220 if (length < allocated) 221 { 222 int ret = 223 U_UCTOMB (result + length, muc, allocated - length); 224 if (ret == -1) 225 { 226 errno = EINVAL; 227 goto fail; 228 } 229 if (ret >= 0) 230 { 231 length += ret; 232 goto done_appending; 233 } 234 } 235 { 236 size_t old_allocated = allocated; 237 size_t new_allocated = 2 * old_allocated; 238 if (new_allocated < 64) 239 new_allocated = 64; 240 if (new_allocated < old_allocated) /* integer overflow? */ 241 abort (); 242 { 243 UNIT *larger_result; 244 if (result == NULL) 245 { 246 larger_result = 247 (UNIT *) malloc (new_allocated * sizeof (UNIT)); 248 if (larger_result == NULL) 249 { 250 errno = ENOMEM; 251 goto fail; 252 } 253 } 254 else if (result == resultbuf) 255 { 256 larger_result = 257 (UNIT *) malloc (new_allocated * sizeof (UNIT)); 258 if (larger_result == NULL) 259 { 260 errno = ENOMEM; 261 goto fail; 262 } 263 U_CPY (larger_result, resultbuf, length); 264 } 265 else 266 { 267 larger_result = 268 (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); 269 if (larger_result == NULL) 270 { 271 errno = ENOMEM; 272 goto fail; 273 } 274 } 275 result = larger_result; 276 allocated = new_allocated; 277 { 278 int ret = 279 U_UCTOMB (result + length, muc, allocated - length); 280 if (ret == -1) 281 { 282 errno = EINVAL; 283 goto fail; 284 } 285 if (ret < 0) 286 abort (); 287 length += ret; 288 goto done_appending; 289 } 290 } 291 } 292 done_appending: ; 293 } 294 295 /* sortbuf is now empty. */ 296 sortbuf_count = 0; 297 } 298 299 if (!(s < s_end)) 300 /* End of string reached. */ 301 break; 302 303 /* Append (uc, ccc) to sortbuf. */ 304 if (sortbuf_count == sortbuf_allocated) 305 { 306 struct ucs4_with_ccc *new_sortbuf; 307 308 sortbuf_allocated = 2 * sortbuf_allocated; 309 if (sortbuf_allocated < sortbuf_count) /* integer overflow? */ 310 abort (); 311 new_sortbuf = 312 (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc)); 313 memcpy (new_sortbuf, sortbuf, 314 sortbuf_count * sizeof (struct ucs4_with_ccc)); 315 if (sortbuf != sortbuf_preallocated) 316 free (sortbuf); 317 sortbuf = new_sortbuf; 318 } 319 sortbuf[sortbuf_count].code = uc; 320 sortbuf[sortbuf_count].ccc = ccc; 321 sortbuf_count++; 322 323 i++; 324 } 325 326 if (!(s < s_end)) 327 /* End of string reached. */ 328 break; 329 330 s += count; 331 } 332 } 333 334 if (length == 0) 335 { 336 if (result == NULL) 337 { 338 /* Return a non-NULL value. NULL means error. */ 339 result = (UNIT *) malloc (1); 340 if (result == NULL) 341 { 342 errno = ENOMEM; 343 goto fail; 344 } 345 } 346 } 347 else if (result != resultbuf && length < allocated) 348 { 349 /* Shrink the allocated memory if possible. */ 350 UNIT *memory; 351 352 memory = (UNIT *) realloc (result, length * sizeof (UNIT)); 353 if (memory != NULL) 354 result = memory; 355 } 356 357 if (sortbuf_count > 0) 358 abort (); 359 if (sortbuf != sortbuf_preallocated) 360 free (sortbuf); 361 362 *lengthp = length; 363 return result; 364 365 fail: 366 { 367 int saved_errno = errno; 368 if (sortbuf != sortbuf_preallocated) 369 free (sortbuf); 370 if (result != resultbuf) 371 free (result); 372 errno = saved_errno; 373 } 374 return NULL; 375} 376