1/**********************************************************************
2  emacs_mule.c -  Oniguruma (regular expression library)
3**********************************************************************/
4/*-
5 * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include "regint.h"
31
32
/* Non-zero when byte c lies below 0x9e, i.e. it is not a trailing byte
 * and may therefore be the first byte of a character. */
#define emacsmule_islead(c)    (0x9e > (UChar )(c))
34
35/*
36    CHARACTER := ASCII_CHAR | MULTIBYTE_CHAR
37    MULTIBYTE_CHAR := PRIMARY_CHAR_1 | PRIMARY_CHAR_2
38                      | SECONDARY_CHAR_1 | SECONDARY_CHAR_2
39    PRIMARY_CHAR_1   := LEADING_CODE_PRI C1
40    PRIMARY_CHAR_2   := LEADING_CODE_PRI C1 C2
41    SECONDARY_CHAR_1 := LEADING_CODE_SEC LEADING_CODE_EXT C1
42    SECONDARY_CHAR_2 := LEADING_CODE_SEC LEADING_CODE_EXT C1 C2
43    ASCII_CHAR := 0 | 1 | ... | 127
44    LEADING_CODE_PRI := 129 | 130 | ... | 153
45    LEADING_CODE_SEC := 154 | 155 | 156 | 157
46    C1, C2, LEADING_CODE_EXT := 160 | 161 | ... | 255
47 */
48
/*
 * Total byte length of an Emacs-Mule character, indexed by its first byte.
 *
 *   0x00-0x7f : 1  (ASCII)
 *   0x81-0x8f : 2  (leading code, C1)
 *   0x90-0x99 : 3  (leading code, C1, C2)
 *   0x9a-0x9b : 3  (leading code, extension code, C1)
 *   0x9c-0x9d : 4  (leading code, extension code, C1, C2)
 *
 * Any other byte cannot start a multibyte character and is counted as 1.
 *
 * mbc_enc_len() subtracts the number of bytes already read from these
 * values to report how many bytes are still missing, so every entry has
 * to match the length at which the trans[] state machine accepts the
 * corresponding sequence.  0x9a and 0x9b are accepted after three bytes,
 * hence 3 (not 4) for those two leading codes.
 */
static const int EncLen_EmacsMule[] = {
  /* 0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 1 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 2 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 3 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 4 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 5 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 6 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 7 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 8 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 1, 1,
  /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
67
/*
 * Byte-level validation automaton for Emacs-Mule characters.
 *
 * trans[state][byte] yields the next state, ACCEPT when the byte completes
 * a character, or FAILURE when the byte cannot appear in that position.
 *
 *   S0  start: ASCII is accepted at once; a leading code selects the
 *       state that describes what must follow it.
 *   S1  one trailing byte (0xa0-0xff) remains.
 *   S2  two trailing bytes (0xa0-0xff each) remain.
 *   S3  after leading code 0x9a: extension code 0xa0-0xdf, then S1.
 *   S4  after leading code 0x9b: extension code 0xe0-0xef, then S1.
 *   S5  after leading code 0x9c: extension code 0xf0-0xf4, then S2.
 *   S6  after leading code 0x9d: extension code 0xf5-0xfe, then S2.
 *
 * Each of the four secondary leading codes 0x9a-0x9d enters its own
 * state, so that every extension-code range is reachable from exactly
 * one leading code.
 */
typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2, S3, S4, S5, S6 } state_t;
#define A ACCEPT
#define F FAILURE
static const signed char trans[][0x100] = {
  { /* S0   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 8 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 9 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 5, 6, F, F,
    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S1   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A
  },
  { /* S2   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
  },
  { /* S3   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S4   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S5   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ 2, 2, 2, 2, 2, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S6   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, F
  },
};
#undef A
#undef F
202
203static int
204mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
205{
206  int firstbyte = *p++;
207  state_t s;
208  s = trans[0][firstbyte];
209  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
210                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
211  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EmacsMule[firstbyte]-1);
212  s = trans[s][*p++];
213  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
214                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
215  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EmacsMule[firstbyte]-2);
216  s = trans[s][*p++];
217  if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
218                                  ONIGENC_CONSTRUCT_MBCLEN_INVALID();
219  if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EmacsMule[firstbyte]-3);
220  s = trans[s][*p++];
221  return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) :
222                       ONIGENC_CONSTRUCT_MBCLEN_INVALID();
223}
224
225static OnigCodePoint
226mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
227{
228  int c, i, len;
229  OnigCodePoint n;
230
231  len = enclen(enc, p, end);
232  n = (OnigCodePoint )*p++;
233  if (len == 1) return n;
234
235  for (i = 1; i < len; i++) {
236    if (p >= end) break;
237    c = *p++;
238    n <<= 8;  n += c;
239  }
240  return n;
241}
242
243static int
244code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
245{
246  if (ONIGENC_IS_CODE_ASCII(code)) return 1;
247  else if (code > 0xffffffff) return 0;
248  else if ((code & 0xff000000) >= 0x80000000) return 4;
249  else if ((code &   0xff0000) >= 0x800000) return 3;
250  else if ((code &     0xff00) >= 0x8000) return 2;
251  else
252    return ONIGERR_INVALID_CODE_POINT_VALUE;
253}
254
255static int
256code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
257{
258  UChar *p = buf;
259
260  if ((code & 0xff000000) != 0) *p++ = (UChar )(((code >> 24) & 0xff));
261  if ((code &   0xff0000) != 0) *p++ = (UChar )(((code >> 16) & 0xff));
262  if ((code &     0xff00) != 0) *p++ = (UChar )(((code >>  8) & 0xff));
263  *p++ = (UChar )(code & 0xff);
264
265  if (enclen(enc, buf, p) != (p - buf))
266    return ONIGERR_INVALID_CODE_POINT_VALUE;
267  return (int)(p - buf);
268}
269
270static int
271mbc_case_fold(OnigCaseFoldType flag,
272	      const UChar** pp, const UChar* end, UChar* lower,
273	      OnigEncoding enc)
274{
275  int len;
276  const UChar* p = *pp;
277
278  if (ONIGENC_IS_MBC_ASCII(p)) {
279    *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
280    (*pp)++;
281    return 1;
282  }
283  else {
284    int i;
285
286    len = mbc_enc_len(p, end, enc);
287    for (i = 0; i < len; i++) {
288      *lower++ = *p++;
289    }
290    (*pp) += len;
291    return len; /* return byte length of converted char to lower */
292  }
293}
294
295static UChar*
296left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
297{
298  const UChar *p;
299
300  if (s <= start) return (UChar* )s;
301  p = s;
302
303  while (!emacsmule_islead(*p) && p > start) p--;
304  return (UChar* )p;
305}
306
307static int
308is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
309{
310  if (code < 128)
311    return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
312  else
313    return (code_to_mbclen(code, enc) > 1 ? TRUE : FALSE);
314}
315
316/*
317 * Name: Emacs-Mule
318 * Link: http://www.m17n.org/mule/pricai96/mule.en.html
319 */
320OnigEncodingDefine(emacs_mule, Emacs_Mule) = {
321  mbc_enc_len,
322  "Emacs-Mule",   /* name */
323  4,          /* max enc length */
324  1,          /* min enc length */
325  onigenc_is_mbc_newline_0x0a,
326  mbc_to_code,
327  code_to_mbclen,
328  code_to_mbc,
329  mbc_case_fold,
330  onigenc_ascii_apply_all_case_fold,
331  onigenc_ascii_get_case_fold_codes_by_str,
332  onigenc_minimum_property_name_to_ctype,
333  is_code_ctype,
334  onigenc_not_support_get_ctype_code_range,
335  left_adjust_char_head,
336  onigenc_always_true_is_allowed_reverse_match,
337  0,
338  ONIGENC_FLAG_NONE,
339};
340
341ENC_REPLICATE("stateless-ISO-2022-JP", "Emacs-Mule")
342