/* Case mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent).
   Copyright (C) 2009-2010 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2009.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

18UNIT *
19FUNC (const UNIT *s, size_t n,
20      casing_prefix_context_t prefix_context,
21      casing_suffix_context_t suffix_context,
22      const char *iso639_language,
23      ucs4_t (*single_character_map) (ucs4_t),
24      size_t offset_in_rule, /* offset in 'struct special_casing_rule' */
25      uninorm_t nf,
26      UNIT *resultbuf, size_t *lengthp)
27{
28  /* The result being accumulated.  */
29  UNIT *result;
30  size_t length;
31  size_t allocated;
32
33  /* Initialize the accumulator.  */
34  if (nf != NULL || resultbuf == NULL)
35    {
36      result = NULL;
37      allocated = 0;
38    }
39  else
40    {
41      result = resultbuf;
42      allocated = *lengthp;
43    }
44  length = 0;
45
46  {
47    const UNIT *s_end = s + n;
48
49    /* Helper for evaluating the FINAL_SIGMA condition:
50       Last character that was not case-ignorable.  */
51    ucs4_t last_char_except_ignorable =
52      prefix_context.last_char_except_ignorable;
53
54    /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
55       Last character that was of combining class 230 ("Above") or 0.  */
56    ucs4_t last_char_normal_or_above =
57      prefix_context.last_char_normal_or_above;
58
59    while (s < s_end)
60      {
61        ucs4_t uc;
62        int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
63
64        ucs4_t mapped_uc[3];
65        unsigned int mapped_count;
66
67        if (uc < 0x10000)
68          {
69            /* Look first in the special-casing table.  */
70            char code[3];
71
72            code[0] = (uc >> 8) & 0xff;
73            code[1] = uc & 0xff;
74
75            for (code[2] = 0; ; code[2]++)
76              {
77                const struct special_casing_rule *rule =
78                  gl_unicase_special_lookup (code, 3);
79
80                if (rule == NULL)
81                  break;
82
83                /* Test if the condition applies.  */
84                /* Does the language apply?  */
85                if (rule->language[0] == '\0'
86                    || (iso639_language != NULL
87                        && iso639_language[0] == rule->language[0]
88                        && iso639_language[1] == rule->language[1]))
89                  {
90                    /* Does the context apply?  */
91                    int context = rule->context;
92                    bool applies;
93
94                    if (context < 0)
95                      context = - context;
96                    switch (context)
97                      {
98                      case SCC_ALWAYS:
99                        applies = true;
100                        break;
101
102                      case SCC_FINAL_SIGMA:
103                        /* "Before" condition: preceded by a sequence
104                           consisting of a cased letter and a case-ignorable
105                           sequence.
106                           "After" condition: not followed by a sequence
107                           consisting of a case-ignorable sequence and then a
108                           cased letter.  */
109                        /* Test the "before" condition.  */
110                        applies = uc_is_cased (last_char_except_ignorable);
111                        /* Test the "after" condition.  */
112                        if (applies)
113                          {
114                            const UNIT *s2 = s + count;
115                            for (;;)
116                              {
117                                if (s2 < s_end)
118                                  {
119                                    ucs4_t uc2;
120                                    int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
121                                    /* Our uc_is_case_ignorable function is
122                                       known to return false for all cased
123                                       characters.  So we can call
124                                       uc_is_case_ignorable first.  */
125                                    if (!uc_is_case_ignorable (uc2))
126                                      {
127                                        applies = ! uc_is_cased (uc2);
128                                        break;
129                                      }
130                                    s2 += count2;
131                                  }
132                                else
133                                  {
134                                    applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
135                                    break;
136                                  }
137                              }
138                          }
139                        break;
140
141                      case SCC_AFTER_SOFT_DOTTED:
142                        /* "Before" condition: There is a Soft_Dotted character
143                           before it, with no intervening character of
144                           combining class 0 or 230 (Above).  */
145                        /* Test the "before" condition.  */
146                        applies = uc_is_property_soft_dotted (last_char_normal_or_above);
147                        break;
148
149                      case SCC_MORE_ABOVE:
150                        /* "After" condition: followed by a character of
151                           combining class 230 (Above) with no intervening
152                           character of combining class 0 or 230 (Above).  */
153                        /* Test the "after" condition.  */
154                        {
155                          const UNIT *s2 = s + count;
156                          applies = false;
157                          for (;;)
158                            {
159                              if (s2 < s_end)
160                                {
161                                  ucs4_t uc2;
162                                  int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
163                                  int ccc = uc_combining_class (uc2);
164                                  if (ccc == UC_CCC_A)
165                                    {
166                                      applies = true;
167                                      break;
168                                    }
169                                  if (ccc == UC_CCC_NR)
170                                    break;
171                                  s2 += count2;
172                                }
173                              else
174                                {
175                                  applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
176                                  break;
177                                }
178                            }
179                        }
180                        break;
181
182                      case SCC_BEFORE_DOT:
183                        /* "After" condition: followed by COMBINING DOT ABOVE
184                           (U+0307). Any sequence of characters with a
185                           combining class that is neither 0 nor 230 may
186                           intervene between the current character and the
187                           combining dot above.  */
188                        /* Test the "after" condition.  */
189                        {
190                          const UNIT *s2 = s + count;
191                          applies = false;
192                          for (;;)
193                            {
194                              if (s2 < s_end)
195                                {
196                                  ucs4_t uc2;
197                                  int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
198                                  if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
199                                    {
200                                      applies = true;
201                                      break;
202                                    }
203                                  {
204                                    int ccc = uc_combining_class (uc2);
205                                    if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
206                                      break;
207                                  }
208                                  s2 += count2;
209                                }
210                              else
211                                {
212                                  applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
213                                  break;
214                                }
215                            }
216                        }
217                        break;
218
219                      case SCC_AFTER_I:
220                        /* "Before" condition: There is an uppercase I before
221                           it, and there is no intervening character of
222                           combining class 0 or 230 (Above).  */
223                        /* Test the "before" condition.  */
224                        applies = (last_char_normal_or_above == 'I');
225                        break;
226
227                      default:
228                        abort ();
229                      }
230                    if (rule->context < 0)
231                      applies = !applies;
232
233                    if (applies)
234                      {
235                        /* The rule applies.
236                           Look up the mapping (0 to 3 characters).  */
237                        const unsigned short *mapped_in_rule =
238                          (const unsigned short *)((const char *)rule + offset_in_rule);
239
240                        if (mapped_in_rule[0] == 0)
241                          mapped_count = 0;
242                        else
243                          {
244                            mapped_uc[0] = mapped_in_rule[0];
245                            if (mapped_in_rule[1] == 0)
246                              mapped_count = 1;
247                            else
248                              {
249                                mapped_uc[1] = mapped_in_rule[1];
250                                if (mapped_in_rule[2] == 0)
251                                  mapped_count = 2;
252                                else
253                                  {
254                                    mapped_uc[2] = mapped_in_rule[2];
255                                    mapped_count = 3;
256                                  }
257                              }
258                          }
259                        goto found_mapping;
260                      }
261                  }
262
263                /* Optimization: Save a hash table lookup in the next round.  */
264                if (!rule->has_next)
265                  break;
266              }
267          }
268
269        /* No special-cased mapping.  So use the locale and context independent
270           mapping.  */
271        mapped_uc[0] = single_character_map (uc);
272        mapped_count = 1;
273
274       found_mapping:
275        /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
276        {
277          unsigned int i;
278
279          for (i = 0; i < mapped_count; i++)
280            {
281              ucs4_t muc = mapped_uc[i];
282
283              /* Append muc to the result accumulator.  */
284              if (length < allocated)
285                {
286                  int ret = U_UCTOMB (result + length, muc, allocated - length);
287                  if (ret == -1)
288                    {
289                      errno = EINVAL;
290                      goto fail;
291                    }
292                  if (ret >= 0)
293                    {
294                      length += ret;
295                      goto done_appending;
296                    }
297                }
298              {
299                size_t old_allocated = allocated;
300                size_t new_allocated = 2 * old_allocated;
301                if (new_allocated < 64)
302                  new_allocated = 64;
303                if (new_allocated < old_allocated) /* integer overflow? */
304                  abort ();
305                {
306                  UNIT *larger_result;
307                  if (result == NULL)
308                    {
309                      larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
310                      if (larger_result == NULL)
311                        {
312                          errno = ENOMEM;
313                          goto fail;
314                        }
315                    }
316                  else if (result == resultbuf)
317                    {
318                      larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
319                      if (larger_result == NULL)
320                        {
321                          errno = ENOMEM;
322                          goto fail;
323                        }
324                      U_CPY (larger_result, resultbuf, length);
325                    }
326                  else
327                    {
328                      larger_result =
329                        (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
330                      if (larger_result == NULL)
331                        {
332                          errno = ENOMEM;
333                          goto fail;
334                        }
335                    }
336                  result = larger_result;
337                  allocated = new_allocated;
338                  {
339                    int ret = U_UCTOMB (result + length, muc, allocated - length);
340                    if (ret == -1)
341                      {
342                        errno = EINVAL;
343                        goto fail;
344                      }
345                    if (ret < 0)
346                      abort ();
347                    length += ret;
348                    goto done_appending;
349                  }
350                }
351              }
352             done_appending: ;
353            }
354        }
355
356        if (!uc_is_case_ignorable (uc))
357          last_char_except_ignorable = uc;
358
359        {
360          int ccc = uc_combining_class (uc);
361          if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
362            last_char_normal_or_above = uc;
363        }
364
365        s += count;
366      }
367  }
368
369  if (nf != NULL)
370    {
371      /* Finally, normalize the result.  */
372      UNIT *normalized_result;
373
374      normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
375      if (normalized_result == NULL)
376        goto fail;
377
378      free (result);
379      return normalized_result;
380    }
381
382  if (length == 0)
383    {
384      if (result == NULL)
385        {
386          /* Return a non-NULL value.  NULL means error.  */
387          result = (UNIT *) malloc (1);
388          if (result == NULL)
389            {
390              errno = ENOMEM;
391              goto fail;
392            }
393        }
394    }
395  else if (result != resultbuf && length < allocated)
396    {
397      /* Shrink the allocated memory if possible.  */
398      UNIT *memory;
399
400      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
401      if (memory != NULL)
402        result = memory;
403    }
404
405  *lengthp = length;
406  return result;
407
408 fail:
409  if (result != resultbuf)
410    {
411      int saved_errno = errno;
412      free (result);
413      errno = saved_errno;
414    }
415  return NULL;
416}
417