1/* Word breaks in UTF-8/UTF-16/UTF-32 strings.
2   Copyright (C) 2009-2010 Free Software Foundation, Inc.
3   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5   This program is free software: you can redistribute it and/or modify it
6   under the terms of the GNU Lesser General Public License as published
7   by the Free Software Foundation; either version 3 of the License, or
8   (at your option) any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18void
19FUNC (const UNIT *s, size_t n, char *p)
20{
21  if (n > 0)
22    {
23      const UNIT *s_end = s + n;
24
25      /* Word break property of the last character.
26         -1 at the very beginning of the string.  */
27      int last_char_prop = -1;
28
29      /* Format and Extend characters are ignored; this means, the mostly used
30         unit is the complex character (= character with subsequent ignored
31         characters).
32         Word break property of the last complex character.
33         -1 at the very beginning of the string.  */
34      int last_compchar_prop = -1;
35      char *last_compchar_ptr = NULL;
36
37      /* For recognizing rules involving 3 complex characters:
38         Word break property of the second-to-last complex character.
39         -1 at the very beginning of the string.  */
40      int secondlast_compchar_prop = -1;
41
42      /* Don't break inside multibyte characters.  */
43      memset (p, 0, n);
44
45      while (s < s_end)
46        {
47          ucs4_t uc;
48          int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
49          int prop = uc_wordbreak_property (uc);
50
51          /* No break at the start of the string.  */
52          if (last_char_prop >= 0)
53            {
54              /* No break between CR and LF.  */
55              if (last_char_prop == WBP_CR && prop == WBP_LF)
56                /* *p = 0 */;
57              /* Break before and after newlines.  */
58              else if (last_char_prop >= WBP_NEWLINE
59                       /* same as:
60                          last_char_prop == WBP_CR
61                          || last_char_prop == WBP_LF
62                          || last_char_prop == WBP_NEWLINE */
63                       || prop >= WBP_NEWLINE
64                          /* same as:
65                             prop == WBP_CR
66                             || prop == WBP_LF
67                             || prop == WBP_NEWLINE */)
68                *p = 1;
69              /* Ignore Format and Extend characters.  */
70              else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT))
71                {
72                  /* No break in these situations (see UAX #29):
73
74                      secondlast          last             current
75
76                       ALetter   (MidLetter | MidNumLet) �� ALetter      (WB7)
77                       ALetter �� (MidLetter | MidNumLet)   ALetter      (WB6)
78                       Numeric   (MidNum | MidNumLet)    �� Numeric      (WB11)
79                       Numeric �� (MidNum | MidNumLet)      Numeric      (WB12)
80                                                 ALetter �� ALetter      (WB5)
81                                                 ALetter �� Numeric      (WB9)
82                                                 Numeric �� ALetter      (WB10)
83                                                 Numeric �� Numeric      (WB8)
84                                                Katakana �� Katakana     (WB13)
85                          (ALetter | Numeric | Katakana) �� ExtendNumLet (WB13a)
86                                            ExtendNumLet �� ExtendNumLet (WB13a)
87                         ExtendNumLet �� (ALetter | Numeric | Katakana)  (WB13b)
88                   */
89                  /* No break across certain punctuation.  Also, disable word
90                     breaks that were recognized earlier (due to lookahead of
91                     only one complex character).  */
92                  if ((prop == WBP_ALETTER
93                       && (last_compchar_prop == WBP_MIDLETTER
94                           || last_compchar_prop == WBP_MIDNUMLET)
95                       && secondlast_compchar_prop == WBP_ALETTER)
96                      || (prop == WBP_NUMERIC
97                          && (last_compchar_prop == WBP_MIDNUM
98                              || last_compchar_prop == WBP_MIDNUMLET)
99                          && secondlast_compchar_prop == WBP_NUMERIC))
100                    {
101                      *last_compchar_ptr = 0;
102                      /* *p = 0; */
103                    }
104                  else
105                    {
106                      /* Perform a single table lookup.  */
107                      if (uniwbrk_table[last_compchar_prop][prop])
108                        *p = 1;
109                      /* else *p = 0; */
110                    }
111                }
112            }
113
114          last_char_prop = prop;
115          /* Ignore Format and Extend characters, except at the start of the string.  */
116          if (last_compchar_prop < 0 || !(prop == WBP_EXTEND || prop == WBP_FORMAT))
117            {
118              secondlast_compchar_prop = last_compchar_prop;
119              last_compchar_prop = prop;
120              last_compchar_ptr = p;
121            }
122
123          s += count;
124          p += count;
125        }
126    }
127}
128