1/* Decomposition and composition of Unicode strings.
2   Copyright (C) 2009-2010 Free Software Foundation, Inc.
3   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5   This program is free software: you can redistribute it and/or modify it
6   under the terms of the GNU Lesser General Public License as published
7   by the Free Software Foundation; either version 3 of the License, or
8   (at your option) any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18UNIT *
19FUNC (uninorm_t nf, const UNIT *s, size_t n,
20      UNIT *resultbuf, size_t *lengthp)
21{
22  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
23  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
24
25  /* The result being accumulated.  */
26  UNIT *result;
27  size_t length;
28  size_t allocated;
29  /* The buffer for sorting.  */
30  #define SORTBUF_PREALLOCATED 64
31  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
32  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
33  size_t sortbuf_allocated;
34  size_t sortbuf_count;
35
36  /* Initialize the accumulator.  */
37  if (resultbuf == NULL)
38    {
39      result = NULL;
40      allocated = 0;
41    }
42  else
43    {
44      result = resultbuf;
45      allocated = *lengthp;
46    }
47  length = 0;
48
49  /* Initialize the buffer for sorting.  */
50  sortbuf = sortbuf_preallocated;
51  sortbuf_allocated = SORTBUF_PREALLOCATED;
52  sortbuf_count = 0;
53
54  {
55    const UNIT *s_end = s + n;
56
57    for (;;)
58      {
59        int count;
60        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
61        int decomposed_count;
62        int i;
63
64        if (s < s_end)
65          {
66            /* Fetch the next character.  */
67            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
68            decomposed_count = 1;
69
70            /* Decompose it, recursively.
71               It would be possible to precompute the recursive decomposition
72               and store it in a table.  But this would significantly increase
73               the size of the decomposition tables, because for example for
74               U+1FC1 the recursive canonical decomposition and the recursive
75               compatibility decomposition are different.  */
76            {
77              int curr;
78
79              for (curr = 0; curr < decomposed_count; )
80                {
81                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
82                     all elements are atomic.  */
83                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
84                  int curr_decomposed_count;
85
86                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
87                  if (curr_decomposed_count >= 0)
88                    {
89                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
90                         decomposed[curr], making room.  It's not worth using
91                         memcpy() here, since the counts are so small.  */
92                      int shift = curr_decomposed_count - 1;
93
94                      if (shift < 0)
95                        abort ();
96                      if (shift > 0)
97                        {
98                          int j;
99
100                          decomposed_count += shift;
101                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
102                            abort ();
103                          for (j = decomposed_count - 1 - shift; j > curr; j--)
104                            decomposed[j + shift] = decomposed[j];
105                        }
106                      for (; shift >= 0; shift--)
107                        decomposed[curr + shift] = curr_decomposed[shift];
108                    }
109                  else
110                    {
111                      /* decomposed[curr] is atomic.  */
112                      curr++;
113                    }
114                }
115            }
116          }
117        else
118          {
119            count = 0;
120            decomposed_count = 0;
121          }
122
123        i = 0;
124        for (;;)
125          {
126            ucs4_t uc;
127            int ccc;
128
129            if (s < s_end)
130              {
131                /* Fetch the next character from the decomposition.  */
132                if (i == decomposed_count)
133                  break;
134                uc = decomposed[i];
135                ccc = uc_combining_class (uc);
136              }
137            else
138              {
139                /* End of string reached.  */
140                uc = 0;
141                ccc = 0;
142              }
143
144            if (ccc == 0)
145              {
146                size_t j;
147
148                /* Apply the canonical ordering algorithm to the accumulated
149                   sequence of characters.  */
150                if (sortbuf_count > 1)
151                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
152                                                           sortbuf + sortbuf_count);
153
154                if (composer != NULL)
155                  {
156                    /* Attempt to combine decomposed characters, as specified
157                       in the Unicode Standard Annex #15 "Unicode Normalization
158                       Forms".  We need to check
159                         1. whether the first accumulated character is a
160                            "starter" (i.e. has ccc = 0).  This is usually the
161                            case.  But when the string starts with a
162                            non-starter, the sortbuf also starts with a
163                            non-starter.  Btw, this check could also be
164                            omitted, because the composition table has only
165                            entries (code1, code2) for which code1 is a
166                            starter; if the first accumulated character is not
167                            a starter, no lookup will succeed.
168                         2. If the sortbuf has more than one character, check
169                            for each of these characters that are not "blocked"
170                            from the starter (i.e. have a ccc that is higher
171                            than the ccc of the previous character) whether it
172                            can be combined with the first character.
173                         3. If only one character is left in sortbuf, check
174                            whether it can be combined with the next character
175                            (also a starter).  */
176                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
177                      {
178                        for (j = 1; j < sortbuf_count; )
179                          {
180                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
181                              {
182                                ucs4_t combined =
183                                  composer (sortbuf[0].code, sortbuf[j].code);
184                                if (combined)
185                                  {
186                                    size_t k;
187
188                                    sortbuf[0].code = combined;
189                                    /* sortbuf[0].ccc = 0, still valid.  */
190                                    for (k = j + 1; k < sortbuf_count; k++)
191                                      sortbuf[k - 1] = sortbuf[k];
192                                    sortbuf_count--;
193                                    continue;
194                                  }
195                              }
196                            j++;
197                          }
198                        if (s < s_end && sortbuf_count == 1)
199                          {
200                            ucs4_t combined =
201                              composer (sortbuf[0].code, uc);
202                            if (combined)
203                              {
204                                uc = combined;
205                                ccc = 0;
206                                /* uc could be further combined with subsequent
207                                   characters.  So don't put it into sortbuf[0] in
208                                   this round, only in the next round.  */
209                                sortbuf_count = 0;
210                              }
211                          }
212                      }
213                  }
214
215                for (j = 0; j < sortbuf_count; j++)
216                  {
217                    ucs4_t muc = sortbuf[j].code;
218
219                    /* Append muc to the result accumulator.  */
220                    if (length < allocated)
221                      {
222                        int ret =
223                          U_UCTOMB (result + length, muc, allocated - length);
224                        if (ret == -1)
225                          {
226                            errno = EINVAL;
227                            goto fail;
228                          }
229                        if (ret >= 0)
230                          {
231                            length += ret;
232                            goto done_appending;
233                          }
234                      }
235                    {
236                      size_t old_allocated = allocated;
237                      size_t new_allocated = 2 * old_allocated;
238                      if (new_allocated < 64)
239                        new_allocated = 64;
240                      if (new_allocated < old_allocated) /* integer overflow? */
241                        abort ();
242                      {
243                        UNIT *larger_result;
244                        if (result == NULL)
245                          {
246                            larger_result =
247                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
248                            if (larger_result == NULL)
249                              {
250                                errno = ENOMEM;
251                                goto fail;
252                              }
253                          }
254                        else if (result == resultbuf)
255                          {
256                            larger_result =
257                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
258                            if (larger_result == NULL)
259                              {
260                                errno = ENOMEM;
261                                goto fail;
262                              }
263                            U_CPY (larger_result, resultbuf, length);
264                          }
265                        else
266                          {
267                            larger_result =
268                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
269                            if (larger_result == NULL)
270                              {
271                                errno = ENOMEM;
272                                goto fail;
273                              }
274                          }
275                        result = larger_result;
276                        allocated = new_allocated;
277                        {
278                          int ret =
279                            U_UCTOMB (result + length, muc, allocated - length);
280                          if (ret == -1)
281                            {
282                              errno = EINVAL;
283                              goto fail;
284                            }
285                          if (ret < 0)
286                            abort ();
287                          length += ret;
288                          goto done_appending;
289                        }
290                      }
291                    }
292                   done_appending: ;
293                  }
294
295                /* sortbuf is now empty.  */
296                sortbuf_count = 0;
297              }
298
299            if (!(s < s_end))
300              /* End of string reached.  */
301              break;
302
303            /* Append (uc, ccc) to sortbuf.  */
304            if (sortbuf_count == sortbuf_allocated)
305              {
306                struct ucs4_with_ccc *new_sortbuf;
307
308                sortbuf_allocated = 2 * sortbuf_allocated;
309                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
310                  abort ();
311                new_sortbuf =
312                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
313                memcpy (new_sortbuf, sortbuf,
314                        sortbuf_count * sizeof (struct ucs4_with_ccc));
315                if (sortbuf != sortbuf_preallocated)
316                  free (sortbuf);
317                sortbuf = new_sortbuf;
318              }
319            sortbuf[sortbuf_count].code = uc;
320            sortbuf[sortbuf_count].ccc = ccc;
321            sortbuf_count++;
322
323            i++;
324          }
325
326        if (!(s < s_end))
327          /* End of string reached.  */
328          break;
329
330        s += count;
331      }
332  }
333
334  if (length == 0)
335    {
336      if (result == NULL)
337        {
338          /* Return a non-NULL value.  NULL means error.  */
339          result = (UNIT *) malloc (1);
340          if (result == NULL)
341            {
342              errno = ENOMEM;
343              goto fail;
344            }
345        }
346    }
347  else if (result != resultbuf && length < allocated)
348    {
349      /* Shrink the allocated memory if possible.  */
350      UNIT *memory;
351
352      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
353      if (memory != NULL)
354        result = memory;
355    }
356
357  if (sortbuf_count > 0)
358    abort ();
359  if (sortbuf != sortbuf_preallocated)
360    free (sortbuf);
361
362  *lengthp = length;
363  return result;
364
365 fail:
366  {
367    int saved_errno = errno;
368    if (sortbuf != sortbuf_preallocated)
369      free (sortbuf);
370    if (result != resultbuf)
371      free (result);
372    errno = saved_errno;
373  }
374  return NULL;
375}
376