1/**********************************************************************
2  utf_8.c -  Oniguruma (regular expression library)
3**********************************************************************/
4/*-
5 * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include "regenc.h"
31
/* Enables the "invalid code" scheme used by the conversion routines in this
 * file: the bytes 0xfe and 0xff are mapped to dedicated codepoint values
 * (and back) instead of being treated as ordinary codes. */
#define USE_INVALID_CODE_SCHEME

#ifdef USE_INVALID_CODE_SCHEME
/* virtual codepoint values for invalid encoding byte 0xfe and 0xff */
#define INVALID_CODE_FE   0xfffffffe
#define INVALID_CODE_FF   0xffffffff
/* NOTE(review): not referenced by the code visible in this file; presumably
 * the largest codepoint value regarded as valid -- confirm before use. */
#define VALID_CODE_LIMIT  0x7fffffff
#endif

/* True when byte c is NOT of the form 10xxxxxx, i.e. it is not a UTF-8
 * continuation (trail) byte.  left_adjust_char_head treats such a byte as
 * the head of a character. */
#define utf8_islead(c)     ((UChar )((c) & 0xc0) != 0x80)
42
/*
 * Sequence length implied by the first byte of a character, indexed by the
 * byte value (0x00-0xff).  mbc_enc_len uses it only to compute how many more
 * bytes are needed when the input ends in the middle of a character.
 * Bytes that cannot start a multi-byte sequence (80-bf, f5-ff) map to 1.
 */
#define ENCLEN_ROW16(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n
#define ENCLEN_ROW64(n) ENCLEN_ROW16(n), ENCLEN_ROW16(n), \
                        ENCLEN_ROW16(n), ENCLEN_ROW16(n)
static const int EncLen_UTF8[] = {
  ENCLEN_ROW64(1),                                /* 00-3f */
  ENCLEN_ROW64(1),                                /* 40-7f */
  ENCLEN_ROW64(1),                                /* 80-bf */
  ENCLEN_ROW16(2), ENCLEN_ROW16(2),               /* c0-df */
  ENCLEN_ROW16(3),                                /* e0-ef */
  4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1  /* f0-ff */
};
#undef ENCLEN_ROW64
#undef ENCLEN_ROW16
61
/*
 * States of the UTF-8 byte-sequence automaton.  The two negative values are
 * terminal results; S0..S7 are the row indexes of the transition table.
 */
typedef enum {
  FAILURE = -2,   /* byte sequence is not acceptable */
  ACCEPT  = -1,   /* a complete character has been read */
  S0 = 0,         /* initial state: expecting the first byte */
  S1 = 1,
  S2 = 2,
  S3 = 3,
  S4 = 4,
  S5 = 5,
  S6 = 6,
  S7 = 7
} state_t;
68#define A ACCEPT
69#define F FAILURE
70static const signed char trans[][0x100] = {
71  { /* S0   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
72    /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
73    /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
74    /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
75    /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
76    /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
77    /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
78    /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
79    /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
80    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
81    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
82    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
83    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
84    /* c */ F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
85    /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
86    /* e */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
87    /* f */ 5, 6, 6, 6, 7, F, F, F, F, F, F, F, F, F, F, F
88  },
89  { /* S1   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
90    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
91    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
92    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
93    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
94    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
95    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
96    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
97    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
98    /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
99    /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
100    /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
101    /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
102    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
103    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
104    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
105    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
106  },
107  { /* S2   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
108    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
109    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
110    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
111    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
112    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
113    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
114    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
115    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
116    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
117    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
118    /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
119    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
120    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
121    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
122    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
123    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
124  },
125  { /* S3   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
126    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
127    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
128    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
129    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
130    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
131    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
132    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
133    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
134    /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
135    /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
136    /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
137    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
138    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
139    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
140    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
141    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
142  },
143  { /* S4   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
144    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
145    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
146    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
147    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
148    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
149    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
150    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
151    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
152    /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
153    /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
154    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
155    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
156    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
157    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
158    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
159    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
160  },
161  { /* S5   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
162    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
163    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
164    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
165    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
166    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
167    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
168    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
169    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
170    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
171    /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
172    /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
173    /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
174    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
175    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
176    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
177    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
178  },
179  { /* S6   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
180    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
181    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
182    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
183    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
184    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
185    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
186    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
187    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
188    /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
189    /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
190    /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
191    /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
192    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
193    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
194    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
195    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
196  },
197  { /* S7   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
198    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
199    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
200    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
201    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
202    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
203    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
204    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
205    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
206    /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
207    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
208    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
209    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
210    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
211    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
212    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
213    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
214  },
215};
216#undef A
217#undef F
218
219static int
220mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
221{
222  int firstbyte = *p++;
223  state_t s;
224  s = trans[0][firstbyte];
225  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
226                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
227
228  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1);
229  s = trans[s][*p++];
230  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
231                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
232
233  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2);
234  s = trans[s][*p++];
235  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
236                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
237
238  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3);
239  s = trans[s][*p++];
240  return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) :
241                       ONIGENC_CONSTRUCT_MBCLEN_INVALID();
242}
243
244static int
245is_mbc_newline(const UChar* p, const UChar* end, OnigEncoding enc)
246{
247  if (p < end) {
248    if (*p == 0x0a) return 1;
249
250#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
251    if (*p == 0x0b || *p == 0x0c || *p == 0x0d) return 1;
252    if (p + 1 < end) {
253      if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */
254	return 1;
255      if (p + 2 < end) {
256	if ((*(p+2) == 0xa8 || *(p+2) == 0xa9)
257	    && *(p+1) == 0x80 && *p == 0xe2)  /* U+2028, U+2029 */
258	  return 1;
259      }
260    }
261#endif
262  }
263
264  return 0;
265}
266
267static OnigCodePoint
268mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
269{
270  int c, len;
271  OnigCodePoint n;
272
273  len = mbc_enc_len(p, end, enc);
274  c = *p++;
275  if (len > 1) {
276    len--;
277    n = c & ((1 << (6 - len)) - 1);
278    while (len--) {
279      c = *p++;
280      n = (n << 6) | (c & ((1 << 6) - 1));
281    }
282    return n;
283  }
284  else {
285#ifdef USE_INVALID_CODE_SCHEME
286    if (c > 0xfd) {
287      return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF);
288    }
289#endif
290    return (OnigCodePoint )c;
291  }
292}
293
294static int
295code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
296{
297  if      ((code & 0xffffff80) == 0) return 1;
298  else if ((code & 0xfffff800) == 0) return 2;
299  else if ((code & 0xffff0000) == 0) return 3;
300  else if ((code & 0xffe00000) == 0) return 4;
301  else if ((code & 0xfc000000) == 0) return 5;
302  else if ((code & 0x80000000) == 0) return 6;
303#ifdef USE_INVALID_CODE_SCHEME
304  else if (code == INVALID_CODE_FE) return 1;
305  else if (code == INVALID_CODE_FF) return 1;
306#endif
307  else
308    return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
309}
310
311static int
312code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED)
313{
314#define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80)
315#define UTF8_TRAIL0(code)        (UChar )(((code) & 0x3f) | 0x80)
316
317  if ((code & 0xffffff80) == 0) {
318    *buf = (UChar )code;
319    return 1;
320  }
321  else {
322    UChar *p = buf;
323
324    if ((code & 0xfffff800) == 0) {
325      *p++ = (UChar )(((code>>6)& 0x1f) | 0xc0);
326    }
327    else if ((code & 0xffff0000) == 0) {
328      *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0);
329      *p++ = UTF8_TRAILS(code, 6);
330    }
331    else if ((code & 0xffe00000) == 0) {
332      *p++ = (UChar )(((code>>18) & 0x07) | 0xf0);
333      *p++ = UTF8_TRAILS(code, 12);
334      *p++ = UTF8_TRAILS(code,  6);
335    }
336    else if ((code & 0xfc000000) == 0) {
337      *p++ = (UChar )(((code>>24) & 0x03) | 0xf8);
338      *p++ = UTF8_TRAILS(code, 18);
339      *p++ = UTF8_TRAILS(code, 12);
340      *p++ = UTF8_TRAILS(code,  6);
341    }
342    else if ((code & 0x80000000) == 0) {
343      *p++ = (UChar )(((code>>30) & 0x01) | 0xfc);
344      *p++ = UTF8_TRAILS(code, 24);
345      *p++ = UTF8_TRAILS(code, 18);
346      *p++ = UTF8_TRAILS(code, 12);
347      *p++ = UTF8_TRAILS(code,  6);
348    }
349#ifdef USE_INVALID_CODE_SCHEME
350    else if (code == INVALID_CODE_FE) {
351      *p = 0xfe;
352      return 1;
353    }
354    else if (code == INVALID_CODE_FF) {
355      *p = 0xff;
356      return 1;
357    }
358#endif
359    else {
360      return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
361    }
362
363    *p++ = UTF8_TRAIL0(code);
364    return (int )(p - buf);
365  }
366}
367
368static int
369mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
370		   const UChar* end, UChar* fold, OnigEncoding enc)
371{
372  const UChar* p = *pp;
373
374  if (ONIGENC_IS_MBC_ASCII(p)) {
375#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
376    if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
377      if (*p == 0x49) {
378	*fold++ = 0xc4;
379	*fold   = 0xb1;
380	(*pp)++;
381	return 2;
382      }
383    }
384#endif
385
386    *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
387    (*pp)++;
388    return 1; /* return byte length of converted char to lower */
389  }
390  else {
391    return onigenc_unicode_mbc_case_fold(enc, flag, pp, end, fold);
392  }
393}
394
395
396static int
397get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out,
398			  const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
399{
400  *sb_out = 0x80;
401  return onigenc_unicode_ctype_code_range(ctype, ranges);
402}
403
404
405static UChar*
406left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED)
407{
408  const UChar *p;
409
410  if (s <= start) return (UChar* )s;
411  p = s;
412
413  while (!utf8_islead(*p) && p > start) p--;
414  return (UChar* )p;
415}
416
/*
 * Thin adapter for the encoding table: forwards all arguments, with `enc`
 * moved to the front, to onigenc_unicode_get_case_fold_codes_by_str() and
 * returns its result unchanged.
 */
static int
get_case_fold_codes_by_str(OnigCaseFoldType flag,
    const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[],
    OnigEncoding enc)
{
  return onigenc_unicode_get_case_fold_codes_by_str(enc, flag, p, end, items);
}
424
/*
 * Encoding descriptor for UTF-8: the functions defined in this file plus
 * the shared onigenc_unicode_* helpers, in the slot order expected by
 * OnigEncodingDefine (declared outside this file).
 *
 * NOTE(review): the maximum byte length is given as 6 and code_to_mbc can
 * emit 5- and 6-byte forms, but mbc_enc_len never accepts a sequence
 * longer than 4 bytes -- confirm this asymmetry is intended.
 */
OnigEncodingDefine(utf_8, UTF_8) = {
  mbc_enc_len,
  "UTF-8",     /* name */
  6,           /* max byte length */
  1,           /* min byte length */
  is_mbc_newline,
  mbc_to_code,
  code_to_mbclen,
  code_to_mbc,
  mbc_case_fold,
  onigenc_unicode_apply_all_case_fold,
  get_case_fold_codes_by_str,
  onigenc_unicode_property_name_to_ctype,
  onigenc_unicode_is_code_ctype,
  get_ctype_code_range,
  left_adjust_char_head,
  onigenc_always_true_is_allowed_reverse_match,
  0,
  ONIGENC_FLAG_UNICODE,
};
/* presumably registers "CP65001" as another name for UTF-8 -- ENC_ALIAS is
 * defined outside this file */
ENC_ALIAS("CP65001", "UTF-8")
446
/*
 * Name: UTF8-MAC
 * Link: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/BPFileSystem.html
 * Link: http://developer.apple.com/qa/qa2001/qa1235.html
 * Link: http://developer.apple.com/jp/qa/qa2001/qa1235.html
 * Link: http://www.gnu.org/software/emacs/NEWS.23.2
 *
 * NOTE(review): ENC_REPLICATE / ENC_ALIAS are defined outside this file;
 * presumably they declare UTF8-MAC as a separate encoding that reuses the
 * UTF-8 implementation above, plus two alternative names for it -- confirm.
 */
ENC_REPLICATE("UTF8-MAC", "UTF-8")
ENC_ALIAS("UTF-8-MAC", "UTF8-MAC")
ENC_ALIAS("UTF-8-HFS", "UTF8-MAC") /* Emacs 23.2 */
457
458