1/* Titlecase mapping for UTF-8/UTF-16/UTF-32 substrings (locale dependent).
2   Copyright (C) 2009-2010 Free Software Foundation, Inc.
3   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5   This program is free software: you can redistribute it and/or modify it
6   under the terms of the GNU Lesser General Public License as published
7   by the Free Software Foundation; either version 3 of the License, or
8   (at your option) any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18/* Quoting the Unicode standard, section "Default Case Algorithms":
19     Find the word boundaries in X according to Unicode Standard Annex #29,
20     ���Text Boundaries.��� For each word boundary, find the first cased character
21     F following the word boundary. If F exists, map F to Titlecase_Mapping(F);
22     then map all characters C between F and the following word boundary to
23     Lowercase_Mapping(C).  */
24
25UNIT *
26FUNC (const UNIT *s, size_t n,
27      casing_prefix_context_t prefix_context,
28      casing_suffix_context_t suffix_context,
29      const char *iso639_language,
30      uninorm_t nf,
31      UNIT *resultbuf, size_t *lengthp)
32{
33  /* The result being accumulated.  */
34  UNIT *result;
35  size_t length;
36  size_t allocated;
37  /* An array containing the word break positions.  */
38  char *wordbreaks;
39
40  /* Initialize the accumulator.  */
41  if (nf != NULL || resultbuf == NULL)
42    {
43      result = NULL;
44      allocated = 0;
45    }
46  else
47    {
48      result = resultbuf;
49      allocated = *lengthp;
50    }
51  length = 0;
52
53  /* Initialize the word breaks array.  */
54  if (n > 0)
55    {
56      wordbreaks = (char *) malloc (n);
57      if (wordbreaks == NULL)
58        {
59          errno = ENOMEM;
60          goto fail2;
61        }
62      U_WORDBREAKS (s, n, wordbreaks);
63    }
64  else
65    wordbreaks = NULL;
66
67  {
68    const UNIT *s_end = s + n;
69    const char *wp = wordbreaks;
70
71    /* When considering the string as segmented by word boundaries: For each
72       such segment:
73        - In the first part, we are searching for the first cased character.
74          In this state, in_word_first_part = true, and no conversion takes
75          place.
76        - In the second part, we are converting every character: the first
77          among these characters to title case, the other ones to lower case.
78          In this state, in_word_first_part = false.  */
79    bool in_word_first_part = true;
80
81    /* Helper for evaluating the FINAL_SIGMA condition:
82       Last character that was not case-ignorable.  */
83    ucs4_t last_char_except_ignorable =
84      prefix_context.last_char_except_ignorable;
85
86    /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
87       Last character that was of combining class 230 ("Above") or 0.  */
88    ucs4_t last_char_normal_or_above =
89      prefix_context.last_char_normal_or_above;
90
91    while (s < s_end)
92      {
93        /* Fetch the next character.  */
94        ucs4_t uc;
95        int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
96
97        ucs4_t (*single_character_map) (ucs4_t);
98        size_t offset_in_rule; /* offset in 'struct special_casing_rule' */
99
100        ucs4_t mapped_uc[3];
101        unsigned int mapped_count;
102
103        if (*wp)
104          /* Crossing a word boundary.  */
105          in_word_first_part = true;
106
107        /* Determine single_character_map, offset_in_rule.
108           There are three possibilities:
109             - uc should not be converted.
110             - uc should be titlecased.
111             - uc should be lowercased.  */
112        if (in_word_first_part)
113          {
114            if (uc_is_cased (uc))
115              {
116                /* uc is to be titlecased.  */
117                single_character_map = uc_totitle;
118                offset_in_rule = offsetof (struct special_casing_rule, title[0]);
119                in_word_first_part = false;
120              }
121            else
122              {
123                /* uc is not converted.  */
124                single_character_map = NULL;
125                offset_in_rule = 0;
126              }
127          }
128        else
129          {
130            /* uc is to be lowercased.  */
131            single_character_map = uc_tolower;
132            offset_in_rule = offsetof (struct special_casing_rule, lower[0]);
133          }
134
135        /* Actually map uc.  */
136        if (single_character_map == NULL)
137          {
138            mapped_uc[0] = uc;
139            mapped_count = 1;
140            goto found_mapping;
141          }
142
143        if (uc < 0x10000)
144          {
145            /* Look first in the special-casing table.  */
146            char code[3];
147
148            code[0] = (uc >> 8) & 0xff;
149            code[1] = uc & 0xff;
150
151            for (code[2] = 0; ; code[2]++)
152              {
153                const struct special_casing_rule *rule =
154                  gl_unicase_special_lookup (code, 3);
155
156                if (rule == NULL)
157                  break;
158
159                /* Test if the condition applies.  */
160                /* Does the language apply?  */
161                if (rule->language[0] == '\0'
162                    || (iso639_language != NULL
163                        && iso639_language[0] == rule->language[0]
164                        && iso639_language[1] == rule->language[1]))
165                  {
166                    /* Does the context apply?  */
167                    int context = rule->context;
168                    bool applies;
169
170                    if (context < 0)
171                      context = - context;
172                    switch (context)
173                      {
174                      case SCC_ALWAYS:
175                        applies = true;
176                        break;
177
178                      case SCC_FINAL_SIGMA:
179                        /* "Before" condition: preceded by a sequence
180                           consisting of a cased letter and a case-ignorable
181                           sequence.
182                           "After" condition: not followed by a sequence
183                           consisting of a case-ignorable sequence and then a
184                           cased letter.  */
185                        /* Test the "before" condition.  */
186                        applies = uc_is_cased (last_char_except_ignorable);
187                        /* Test the "after" condition.  */
188                        if (applies)
189                          {
190                            const UNIT *s2 = s + count;
191                            for (;;)
192                              {
193                                if (s2 < s_end)
194                                  {
195                                    ucs4_t uc2;
196                                    int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
197                                    /* Our uc_is_case_ignorable function is
198                                       known to return false for all cased
199                                       characters.  So we can call
200                                       uc_is_case_ignorable first.  */
201                                    if (!uc_is_case_ignorable (uc2))
202                                      {
203                                        applies = ! uc_is_cased (uc2);
204                                        break;
205                                      }
206                                    s2 += count2;
207                                  }
208                                else
209                                  {
210                                    applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
211                                    break;
212                                  }
213                              }
214                          }
215                        break;
216
217                      case SCC_AFTER_SOFT_DOTTED:
218                        /* "Before" condition: There is a Soft_Dotted character
219                           before it, with no intervening character of
220                           combining class 0 or 230 (Above).  */
221                        /* Test the "before" condition.  */
222                        applies = uc_is_property_soft_dotted (last_char_normal_or_above);
223                        break;
224
225                      case SCC_MORE_ABOVE:
226                        /* "After" condition: followed by a character of
227                           combining class 230 (Above) with no intervening
228                           character of combining class 0 or 230 (Above).  */
229                        /* Test the "after" condition.  */
230                        {
231                          const UNIT *s2 = s + count;
232                          applies = false;
233                          for (;;)
234                            {
235                              if (s2 < s_end)
236                                {
237                                  ucs4_t uc2;
238                                  int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
239                                  int ccc = uc_combining_class (uc2);
240                                  if (ccc == UC_CCC_A)
241                                    {
242                                      applies = true;
243                                      break;
244                                    }
245                                  if (ccc == UC_CCC_NR)
246                                    break;
247                                  s2 += count2;
248                                }
249                              else
250                                {
251                                  applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
252                                  break;
253                                }
254                            }
255                        }
256                        break;
257
258                      case SCC_BEFORE_DOT:
259                        /* "After" condition: followed by COMBINING DOT ABOVE
260                           (U+0307). Any sequence of characters with a
261                           combining class that is neither 0 nor 230 may
262                           intervene between the current character and the
263                           combining dot above.  */
264                        /* Test the "after" condition.  */
265                        {
266                          const UNIT *s2 = s + count;
267                          applies = false;
268                          for (;;)
269                            {
270                              if (s2 < s_end)
271                                {
272                                  ucs4_t uc2;
273                                  int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
274                                  if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
275                                    {
276                                      applies = true;
277                                      break;
278                                    }
279                                  {
280                                    int ccc = uc_combining_class (uc2);
281                                    if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
282                                      break;
283                                  }
284                                  s2 += count2;
285                                }
286                              else
287                                {
288                                  applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
289                                  break;
290                                }
291                            }
292                        }
293                        break;
294
295                      case SCC_AFTER_I:
296                        /* "Before" condition: There is an uppercase I before
297                           it, and there is no intervening character of
298                           combining class 0 or 230 (Above).  */
299                        /* Test the "before" condition.  */
300                        applies = (last_char_normal_or_above == 'I');
301                        break;
302
303                      default:
304                        abort ();
305                      }
306                    if (rule->context < 0)
307                      applies = !applies;
308
309                    if (applies)
310                      {
311                        /* The rule applies.
312                           Look up the mapping (0 to 3 characters).  */
313                        const unsigned short *mapped_in_rule =
314                          (const unsigned short *)((const char *)rule + offset_in_rule);
315
316                        if (mapped_in_rule[0] == 0)
317                          mapped_count = 0;
318                        else
319                          {
320                            mapped_uc[0] = mapped_in_rule[0];
321                            if (mapped_in_rule[1] == 0)
322                              mapped_count = 1;
323                            else
324                              {
325                                mapped_uc[1] = mapped_in_rule[1];
326                                if (mapped_in_rule[2] == 0)
327                                  mapped_count = 2;
328                                else
329                                  {
330                                    mapped_uc[2] = mapped_in_rule[2];
331                                    mapped_count = 3;
332                                  }
333                              }
334                          }
335                        goto found_mapping;
336                      }
337                  }
338
339                /* Optimization: Save a hash table lookup in the next round.  */
340                if (!rule->has_next)
341                  break;
342              }
343          }
344
345        /* No special-cased mapping.  So use the locale and context independent
346           mapping.  */
347        mapped_uc[0] = single_character_map (uc);
348        mapped_count = 1;
349
350       found_mapping:
351        /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
352        {
353          unsigned int i;
354
355          for (i = 0; i < mapped_count; i++)
356            {
357              ucs4_t muc = mapped_uc[i];
358
359              /* Append muc to the result accumulator.  */
360              if (length < allocated)
361                {
362                  int ret = U_UCTOMB (result + length, muc, allocated - length);
363                  if (ret == -1)
364                    {
365                      errno = EINVAL;
366                      goto fail1;
367                    }
368                  if (ret >= 0)
369                    {
370                      length += ret;
371                      goto done_appending;
372                    }
373                }
374              {
375                size_t old_allocated = allocated;
376                size_t new_allocated = 2 * old_allocated;
377                if (new_allocated < 64)
378                  new_allocated = 64;
379                if (new_allocated < old_allocated) /* integer overflow? */
380                  abort ();
381                {
382                  UNIT *larger_result;
383                  if (result == NULL)
384                    {
385                      larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
386                      if (larger_result == NULL)
387                        {
388                          errno = ENOMEM;
389                          goto fail1;
390                        }
391                    }
392                  else if (result == resultbuf)
393                    {
394                      larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
395                      if (larger_result == NULL)
396                        {
397                          errno = ENOMEM;
398                          goto fail1;
399                        }
400                      U_CPY (larger_result, resultbuf, length);
401                    }
402                  else
403                    {
404                      larger_result =
405                        (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
406                      if (larger_result == NULL)
407                        {
408                          errno = ENOMEM;
409                          goto fail1;
410                        }
411                    }
412                  result = larger_result;
413                  allocated = new_allocated;
414                  {
415                    int ret = U_UCTOMB (result + length, muc, allocated - length);
416                    if (ret == -1)
417                      {
418                        errno = EINVAL;
419                        goto fail1;
420                      }
421                    if (ret < 0)
422                      abort ();
423                    length += ret;
424                    goto done_appending;
425                  }
426                }
427              }
428             done_appending: ;
429            }
430        }
431
432        if (!uc_is_case_ignorable (uc))
433          last_char_except_ignorable = uc;
434
435        {
436          int ccc = uc_combining_class (uc);
437          if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
438            last_char_normal_or_above = uc;
439        }
440
441        s += count;
442        wp += count;
443      }
444  }
445
446  free (wordbreaks);
447
448  if (nf != NULL)
449    {
450      /* Finally, normalize the result.  */
451      UNIT *normalized_result;
452
453      normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
454      if (normalized_result == NULL)
455        goto fail2;
456
457      free (result);
458      return normalized_result;
459    }
460
461  if (length == 0)
462    {
463      if (result == NULL)
464        {
465          /* Return a non-NULL value.  NULL means error.  */
466          result = (UNIT *) malloc (1);
467          if (result == NULL)
468            {
469              errno = ENOMEM;
470              goto fail2;
471            }
472        }
473    }
474  else if (result != resultbuf && length < allocated)
475    {
476      /* Shrink the allocated memory if possible.  */
477      UNIT *memory;
478
479      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
480      if (memory != NULL)
481        result = memory;
482    }
483
484  *lengthp = length;
485  return result;
486
487 fail1:
488  {
489    int saved_errno = errno;
490    free (wordbreaks);
491    errno = saved_errno;
492  }
493 fail2:
494  if (result != resultbuf)
495    {
496      int saved_errno = errno;
497      free (result);
498      errno = saved_errno;
499    }
500  return NULL;
501}
502