1/* Stream-based normalization of Unicode strings. 2 Copyright (C) 2009-2010 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2009. 4 5 This program is free software: you can redistribute it and/or modify it 6 under the terms of the GNU Lesser General Public License as published 7 by the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18#include <config.h> 19 20/* Specification. */ 21#include "uninorm.h" 22 23#include <errno.h> 24#include <stddef.h> 25#include <stdlib.h> 26#include <string.h> 27 28#include "unictype.h" 29#include "normalize-internal.h" 30#include "decompose-internal.h" 31 32 33struct uninorm_filter 34{ 35 /* Characteristics of the normalization form. */ 36 int (*decomposer) (ucs4_t uc, ucs4_t *decomposition); 37 ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2); 38 39 /* The encapsulated stream. */ 40 int (*stream_func) (void *stream_data, ucs4_t uc); 41 void *stream_data; 42 43 /* The buffer for sorting. */ 44 #define SORTBUF_PREALLOCATED 64 45 struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED]; 46 struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */ 47 size_t sortbuf_allocated; 48 size_t sortbuf_count; 49}; 50 51struct uninorm_filter * 52uninorm_filter_create (uninorm_t nf, 53 int (*stream_func) (void *stream_data, ucs4_t uc), 54 void *stream_data) 55{ 56 struct uninorm_filter *filter = 57 (struct uninorm_filter *) malloc (sizeof (struct uninorm_filter)); 58 59 if (filter == NULL) 60 /* errno is ENOMEM. */ 61 return NULL; 62 63 filter->decomposer = nf->decomposer; 64 filter->composer = nf->composer; 65 filter->stream_func = stream_func; 66 filter->stream_data = stream_data; 67 filter->sortbuf = filter->sortbuf_preallocated; 68 filter->sortbuf_allocated = SORTBUF_PREALLOCATED; 69 filter->sortbuf_count = 0; 70 71 return filter; 72} 73 74int 75uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc_arg) 76{ 77 ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH]; 78 int decomposed_count; 79 80 /* Accept the next character. */ 81 decomposed[0] = uc_arg; 82 decomposed_count = 1; 83 84 /* Decompose it, recursively. 85 It would be possible to precompute the recursive decomposition 86 and store it in a table. But this would significantly increase 87 the size of the decomposition tables, because for example for 88 U+1FC1 the recursive canonical decomposition and the recursive 89 compatibility decomposition are different. */ 90 { 91 int curr; 92 93 for (curr = 0; curr < decomposed_count; ) 94 { 95 /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e. 96 all elements are atomic. */ 97 ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH]; 98 int curr_decomposed_count; 99 100 curr_decomposed_count = 101 filter->decomposer (decomposed[curr], curr_decomposed); 102 if (curr_decomposed_count >= 0) 103 { 104 /* Move curr_decomposed[0..curr_decomposed_count-1] over 105 decomposed[curr], making room. It's not worth using 106 memcpy() here, since the counts are so small. */ 107 int shift = curr_decomposed_count - 1; 108 109 if (shift < 0) 110 abort (); 111 if (shift > 0) 112 { 113 int j; 114 115 decomposed_count += shift; 116 if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH) 117 abort (); 118 for (j = decomposed_count - 1 - shift; j > curr; j--) 119 decomposed[j + shift] = decomposed[j]; 120 } 121 for (; shift >= 0; shift--) 122 decomposed[curr + shift] = curr_decomposed[shift]; 123 } 124 else 125 { 126 /* decomposed[curr] is atomic. */ 127 curr++; 128 } 129 } 130 } 131 132 { 133 /* Cache sortbuf and sortbuf_count in local register variables. */ 134 struct ucs4_with_ccc * const sortbuf = filter->sortbuf; 135 size_t sortbuf_count = filter->sortbuf_count; 136 int i; 137 138 for (i = 0; i < decomposed_count; i++) 139 { 140 /* Fetch the next character from the decomposition. */ 141 ucs4_t uc = decomposed[i]; 142 int ccc = uc_combining_class (uc); 143 144 if (ccc == 0) 145 { 146 size_t j; 147 148 /* Apply the canonical ordering algorithm to the accumulated 149 sequence of characters. */ 150 if (sortbuf_count > 1) 151 gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count, 152 sortbuf + sortbuf_count); 153 154 if (filter->composer != NULL) 155 { 156 /* Attempt to combine decomposed characters, as specified 157 in the Unicode Standard Annex #15 "Unicode Normalization 158 Forms". We need to check 159 1. whether the first accumulated character is a 160 "starter" (i.e. has ccc = 0). This is usually the 161 case. But when the string starts with a 162 non-starter, the sortbuf also starts with a 163 non-starter. Btw, this check could also be 164 omitted, because the composition table has only 165 entries (code1, code2) for which code1 is a 166 starter; if the first accumulated character is not 167 a starter, no lookup will succeed. 168 2. If the sortbuf has more than one character, check 169 for each of these characters that are not "blocked" 170 from the starter (i.e. have a ccc that is higher 171 than the ccc of the previous character) whether it 172 can be combined with the first character. 173 3. If only one character is left in sortbuf, check 174 whether it can be combined with the next character 175 (also a starter). */ 176 if (sortbuf_count > 0 && sortbuf[0].ccc == 0) 177 { 178 for (j = 1; j < sortbuf_count; ) 179 { 180 if (sortbuf[j].ccc > sortbuf[j - 1].ccc) 181 { 182 ucs4_t combined = 183 filter->composer (sortbuf[0].code, sortbuf[j].code); 184 if (combined) 185 { 186 size_t k; 187 188 sortbuf[0].code = combined; 189 /* sortbuf[0].ccc = 0, still valid. */ 190 for (k = j + 1; k < sortbuf_count; k++) 191 sortbuf[k - 1] = sortbuf[k]; 192 sortbuf_count--; 193 continue; 194 } 195 } 196 j++; 197 } 198 if (sortbuf_count == 1) 199 { 200 ucs4_t combined = 201 filter->composer (sortbuf[0].code, uc); 202 if (combined) 203 { 204 uc = combined; 205 ccc = 0; 206 /* uc could be further combined with subsequent 207 characters. So don't put it into sortbuf[0] in 208 this round, only in the next round. */ 209 sortbuf_count = 0; 210 } 211 } 212 } 213 } 214 215 for (j = 0; j < sortbuf_count; j++) 216 { 217 ucs4_t muc = sortbuf[j].code; 218 219 /* Output muc to the encapsulated stream. */ 220 int ret = filter->stream_func (filter->stream_data, muc); 221 if (ret < 0) 222 { 223 /* errno is set here. */ 224 filter->sortbuf_count = 0; 225 return -1; 226 } 227 } 228 229 /* sortbuf is now empty. */ 230 sortbuf_count = 0; 231 } 232 233 /* Append (uc, ccc) to sortbuf. */ 234 if (sortbuf_count == filter->sortbuf_allocated) 235 { 236 struct ucs4_with_ccc *new_sortbuf; 237 238 filter->sortbuf_allocated = 2 * filter->sortbuf_allocated; 239 if (filter->sortbuf_allocated < sortbuf_count) /* integer overflow? */ 240 abort (); 241 new_sortbuf = 242 (struct ucs4_with_ccc *) 243 malloc (2 * filter->sortbuf_allocated * sizeof (struct ucs4_with_ccc)); 244 memcpy (new_sortbuf, filter->sortbuf, 245 sortbuf_count * sizeof (struct ucs4_with_ccc)); 246 if (filter->sortbuf != filter->sortbuf_preallocated) 247 free (filter->sortbuf); 248 filter->sortbuf = new_sortbuf; 249 } 250 filter->sortbuf[sortbuf_count].code = uc; 251 filter->sortbuf[sortbuf_count].ccc = ccc; 252 sortbuf_count++; 253 } 254 255 filter->sortbuf_count = sortbuf_count; 256 } 257 258 return 0; 259} 260 261/* Bring data buffered in the filter to its destination, the encapsulated 262 stream. 263 Return 0 if successful, or -1 with errno set upon failure. 264 Note! If after calling this function, additional characters are written 265 into the filter, the resulting character sequence in the encapsulated stream 266 will not necessarily be normalized. */ 267int 268uninorm_filter_flush (struct uninorm_filter *filter) 269{ 270 /* Cache sortbuf and sortbuf_count in local register variables. */ 271 struct ucs4_with_ccc * const sortbuf = filter->sortbuf; 272 size_t sortbuf_count = filter->sortbuf_count; 273 size_t j; 274 275 /* Apply the canonical ordering algorithm to the accumulated 276 sequence of characters. */ 277 if (sortbuf_count > 1) 278 gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count, 279 sortbuf + sortbuf_count); 280 281 if (filter->composer != NULL) 282 { 283 /* Attempt to combine decomposed characters, as specified 284 in the Unicode Standard Annex #15 "Unicode Normalization 285 Forms". We need to check 286 1. whether the first accumulated character is a 287 "starter" (i.e. has ccc = 0). This is usually the 288 case. But when the string starts with a 289 non-starter, the sortbuf also starts with a 290 non-starter. Btw, this check could also be 291 omitted, because the composition table has only 292 entries (code1, code2) for which code1 is a 293 starter; if the first accumulated character is not 294 a starter, no lookup will succeed. 295 2. If the sortbuf has more than one character, check 296 for each of these characters that are not "blocked" 297 from the starter (i.e. have a ccc that is higher 298 than the ccc of the previous character) whether it 299 can be combined with the first character. 300 3. If only one character is left in sortbuf, check 301 whether it can be combined with the next character 302 (also a starter). */ 303 if (sortbuf_count > 0 && sortbuf[0].ccc == 0) 304 { 305 for (j = 1; j < sortbuf_count; ) 306 { 307 if (sortbuf[j].ccc > sortbuf[j - 1].ccc) 308 { 309 ucs4_t combined = 310 filter->composer (sortbuf[0].code, sortbuf[j].code); 311 if (combined) 312 { 313 size_t k; 314 315 sortbuf[0].code = combined; 316 /* sortbuf[0].ccc = 0, still valid. */ 317 for (k = j + 1; k < sortbuf_count; k++) 318 sortbuf[k - 1] = sortbuf[k]; 319 sortbuf_count--; 320 continue; 321 } 322 } 323 j++; 324 } 325 } 326 } 327 328 for (j = 0; j < sortbuf_count; j++) 329 { 330 ucs4_t muc = sortbuf[j].code; 331 332 /* Output muc to the encapsulated stream. */ 333 int ret = filter->stream_func (filter->stream_data, muc); 334 if (ret < 0) 335 { 336 /* errno is set here. */ 337 filter->sortbuf_count = 0; 338 return -1; 339 } 340 } 341 342 /* sortbuf is now empty. */ 343 filter->sortbuf_count = 0; 344 345 return 0; 346} 347 348/* Bring data buffered in the filter to its destination, the encapsulated 349 stream, then close and free the filter. 350 Return 0 if successful, or -1 with errno set upon failure. */ 351int 352uninorm_filter_free (struct uninorm_filter *filter) 353{ 354 int ret = uninorm_filter_flush (filter); 355 356 if (ret < 0) 357 /* errno is set here. */ 358 return -1; 359 360 if (filter->sortbuf_count > 0) 361 abort (); 362 if (filter->sortbuf != filter->sortbuf_preallocated) 363 free (filter->sortbuf); 364 free (filter); 365 366 return 0; 367} 368