1/*
 * Copyright © 2011,2012,2014  Google, Inc.
3 *
4 *  This is part of HarfBuzz, a text shaping library.
5 *
6 * Permission is hereby granted, without written agreement and without
7 * license or royalty fees, to use, copy, modify, and distribute this
8 * software and its documentation for any purpose, provided that the
9 * above copyright notice and the following two paragraphs appear in
10 * all copies of this software.
11 *
12 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
13 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
14 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
15 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
16 * DAMAGE.
17 *
18 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
19 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
20 * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
21 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
22 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
23 *
24 * Google Author(s): Behdad Esfahbod
25 */
26
27#ifndef HB_UTF_PRIVATE_HH
28#define HB_UTF_PRIVATE_HH
29
30#include "hb-private.hh"
31
32
33struct hb_utf8_t
34{
35  typedef uint8_t codepoint_t;
36
37  static inline const uint8_t *
38  next (const uint8_t *text,
39        const uint8_t *end,
40        hb_codepoint_t *unicode,
41        hb_codepoint_t replacement)
42  {
43    /* Written to only accept well-formed sequences.
44     * Based on ideas from ICU's U8_NEXT.
45     * Generates one "replacement" for each ill-formed byte. */
46
47    hb_codepoint_t c = *text++;
48
49    if (c > 0x7Fu)
50    {
51      if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */
52      {
53        unsigned int t1;
54        if (likely (text < end &&
55                    (t1 = text[0] - 0x80u) <= 0x3Fu))
56        {
57          c = ((c&0x1Fu)<<6) | t1;
58          text++;
59        }
60        else
61          goto error;
62      }
63      else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */
64      {
65        unsigned int t1, t2;
66        if (likely (1 < end - text &&
67                    (t1 = text[0] - 0x80u) <= 0x3Fu &&
68                    (t2 = text[1] - 0x80u) <= 0x3Fu))
69        {
70          c = ((c&0xFu)<<12) | (t1<<6) | t2;
71          if (unlikely (c < 0x0800u || hb_in_range (c, 0xD800u, 0xDFFFu)))
72            goto error;
73          text += 2;
74        }
75        else
76          goto error;
77      }
78      else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */
79      {
80        unsigned int t1, t2, t3;
81        if (likely (2 < end - text &&
82                    (t1 = text[0] - 0x80u) <= 0x3Fu &&
83                    (t2 = text[1] - 0x80u) <= 0x3Fu &&
84                    (t3 = text[2] - 0x80u) <= 0x3Fu))
85        {
86          c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3;
87          if (unlikely (!hb_in_range (c, 0x10000u, 0x10FFFFu)))
88            goto error;
89          text += 3;
90        }
91        else
92          goto error;
93      }
94      else
95        goto error;
96    }
97
98    *unicode = c;
99    return text;
100
101  error:
102    *unicode = replacement;
103    return text;
104  }
105
106  static inline const uint8_t *
107  prev (const uint8_t *text,
108        const uint8_t *start,
109        hb_codepoint_t *unicode,
110        hb_codepoint_t replacement)
111  {
112    const uint8_t *end = text--;
113    while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
114      text--;
115
116    if (likely (next (text, end, unicode, replacement) == end))
117      return text;
118
119    *unicode = replacement;
120    return end - 1;
121  }
122
123  static inline unsigned int
124  strlen (const uint8_t *text)
125  {
126    return ::strlen ((const char *) text);
127  }
128};
129
130
131struct hb_utf16_t
132{
133  typedef uint16_t codepoint_t;
134
135  static inline const uint16_t *
136  next (const uint16_t *text,
137        const uint16_t *end,
138        hb_codepoint_t *unicode,
139        hb_codepoint_t replacement)
140  {
141    hb_codepoint_t c = *text++;
142
143    if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu)))
144    {
145      *unicode = c;
146      return text;
147    }
148
149    if (likely (c <= 0xDBFFu && text < end))
150    {
151      /* High-surrogate in c */
152      hb_codepoint_t l = *text;
153      if (likely (hb_in_range (l, 0xDC00u, 0xDFFFu)))
154      {
155        /* Low-surrogate in l */
156        *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u);
157         text++;
158         return text;
159      }
160    }
161
162    /* Lonely / out-of-order surrogate. */
163    *unicode = replacement;
164    return text;
165  }
166
167  static inline const uint16_t *
168  prev (const uint16_t *text,
169        const uint16_t *start,
170        hb_codepoint_t *unicode,
171        hb_codepoint_t replacement)
172  {
173    hb_codepoint_t c = *--text;
174
175    if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu)))
176    {
177      *unicode = c;
178      return text;
179    }
180
181    if (likely (c >= 0xDC00u && start < text))
182    {
183      /* Low-surrogate in c */
184      hb_codepoint_t h = text[-1];
185      if (likely (hb_in_range (h, 0xD800u, 0xDBFFu)))
186      {
187        /* High-surrogate in h */
188        *unicode = (h << 10) + c - ((0xD800u << 10) - 0x10000u + 0xDC00u);
189        text--;
190        return text;
191      }
192    }
193
194    /* Lonely / out-of-order surrogate. */
195    *unicode = replacement;
196    return text;
197  }
198
199
200  static inline unsigned int
201  strlen (const uint16_t *text)
202  {
203    unsigned int l = 0;
204    while (*text++) l++;
205    return l;
206  }
207};
208
209
210template <bool validate=true>
211struct hb_utf32_t
212{
213  typedef uint32_t codepoint_t;
214
215  static inline const uint32_t *
216  next (const uint32_t *text,
217        const uint32_t *end HB_UNUSED,
218        hb_codepoint_t *unicode,
219        hb_codepoint_t replacement)
220  {
221    hb_codepoint_t c = *unicode = *text++;
222    if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu)))
223      *unicode = replacement;
224    return text;
225  }
226
227  static inline const uint32_t *
228  prev (const uint32_t *text,
229        const uint32_t *start HB_UNUSED,
230        hb_codepoint_t *unicode,
231        hb_codepoint_t replacement)
232  {
233    hb_codepoint_t c = *unicode = *--text;
234    if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu)))
235      *unicode = replacement;
236    return text;
237  }
238
239  static inline unsigned int
240  strlen (const uint32_t *text)
241  {
242    unsigned int l = 0;
243    while (*text++) l++;
244    return l;
245  }
246};
247
248
249struct hb_latin1_t
250{
251  typedef uint8_t codepoint_t;
252
253  static inline const uint8_t *
254  next (const uint8_t *text,
255        const uint8_t *end HB_UNUSED,
256        hb_codepoint_t *unicode,
257        hb_codepoint_t replacement HB_UNUSED)
258  {
259    *unicode = *text++;
260    return text;
261  }
262
263  static inline const uint8_t *
264  prev (const uint8_t *text,
265        const uint8_t *start HB_UNUSED,
266        hb_codepoint_t *unicode,
267        hb_codepoint_t replacement)
268  {
269    *unicode = *--text;
270    return text;
271  }
272
273  static inline unsigned int
274  strlen (const uint8_t *text)
275  {
276    unsigned int l = 0;
277    while (*text++) l++;
278    return l;
279  }
280};
281
282#endif /* HB_UTF_PRIVATE_HH */
283