/**********************************************************************
  utf_16le.c -  Oniguruma (regular expression library)
**********************************************************************/
/*-
 * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
29
30#include "regenc.h"
31
/*
 * Surrogate tests on a single byte value.  In this file they are applied to
 * the second byte of a 16-bit unit (p[1], p[3]), i.e. its high-order byte
 * in little-endian order.
 *   UTF16_IS_SURROGATE_FIRST  : 0xd8..0xdb (first half of a surrogate pair)
 *   UTF16_IS_SURROGATE_SECOND : 0xdc..0xdf (second half of a surrogate pair)
 *   UTF16_IS_SURROGATE        : 0xd8..0xdf (either half)
 */
#define UTF16_IS_SURROGATE_FIRST(c)    (((c) & 0xfc) == 0xd8)
#define UTF16_IS_SURROGATE_SECOND(c)   (((c) & 0xfc) == 0xdc)
#define UTF16_IS_SURROGATE(c)          (((c) & 0xf8) == 0xd8)
35
/*
 * Character length in bytes, indexed by one byte value (0x00..0xff).
 * Entries 0xd8..0xdb hold 4; every other entry holds 2.
 * Each row below covers sixteen consecutive index values.
 */
static const int EncLen_UTF16[256] = {
  /* 0x00 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x10 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x20 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x30 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x40 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x50 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x60 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x70 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x80 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
54
55static int
56utf16le_mbc_enc_len(const UChar* p, const OnigUChar* e,
57		    OnigEncoding enc ARG_UNUSED)
58{
59  int len = (int)(e - p);
60  UChar byte;
61  if (len < 2)
62    return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
63  byte = p[1];
64  if (!UTF16_IS_SURROGATE(byte)) {
65    return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2);
66  }
67  if (UTF16_IS_SURROGATE_FIRST(byte)) {
68    if (len < 4)
69      return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-len);
70    if (UTF16_IS_SURROGATE_SECOND(p[3]))
71      return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4);
72  }
73  return ONIGENC_CONSTRUCT_MBCLEN_INVALID();
74}
75
76static int
77utf16le_is_mbc_newline(const UChar* p, const UChar* end,
78		       OnigEncoding enc ARG_UNUSED)
79{
80  if (p + 1 < end) {
81    if (*p == 0x0a && *(p+1) == 0x00)
82      return 1;
83#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
84    if ((*p == 0x0b || *p == 0x0c || *p == 0x0d || *p == 0x85)
85	&& *(p+1) == 0x00)
86      return 1;
87    if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28))
88      return 1;
89#endif
90  }
91  return 0;
92}
93
94static OnigCodePoint
95utf16le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED,
96		    OnigEncoding enc ARG_UNUSED)
97{
98  OnigCodePoint code;
99  UChar c0 = *p;
100  UChar c1 = *(p+1);
101
102  if (UTF16_IS_SURROGATE_FIRST(c1)) {
103    code = ((((c1 << 8) + c0) & 0x03ff) << 10)
104         + (((p[3] << 8) + p[2]) & 0x03ff) + 0x10000;
105  }
106  else {
107    code = c1 * 256 + p[0];
108  }
109  return code;
110}
111
112static int
113utf16le_code_to_mbclen(OnigCodePoint code,
114		       OnigEncoding enc ARG_UNUSED)
115{
116  return (code > 0xffff ? 4 : 2);
117}
118
119static int
120utf16le_code_to_mbc(OnigCodePoint code, UChar *buf,
121		    OnigEncoding enc ARG_UNUSED)
122{
123  UChar* p = buf;
124
125  if (code > 0xffff) {
126    unsigned int high = (code >> 10) + 0xD7C0;
127    unsigned int low = (code & 0x3FF) + 0xDC00;
128    *p++ = high & 0xFF;
129    *p++ = (high >> 8) & 0xFF;
130    *p++ = low & 0xFF;
131    *p++ = (low >> 8) & 0xFF;
132    return 4;
133  }
134  else {
135    *p++ = (UChar )(code & 0xff);
136    *p++ = (UChar )((code & 0xff00) >> 8);
137    return 2;
138  }
139}
140
141static int
142utf16le_mbc_case_fold(OnigCaseFoldType flag,
143		      const UChar** pp, const UChar* end, UChar* fold,
144		      OnigEncoding enc)
145{
146  const UChar* p = *pp;
147
148  if (ONIGENC_IS_ASCII_CODE(*p) && *(p+1) == 0) {
149#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
150    if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
151      if (*p == 0x49) {
152	*fold++ = 0x31;
153	*fold   = 0x01;
154	(*pp) += 2;
155	return 2;
156      }
157    }
158#endif
159
160    *fold++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
161    *fold   = 0;
162    *pp += 2;
163    return 2;
164  }
165  else
166    return onigenc_unicode_mbc_case_fold(enc, flag, pp,
167					 end, fold);
168}
169
#if 0
/*
 * Disabled: this function is compiled out by the surrounding #if 0.
 *
 * Advances *pp by the EncLen_UTF16 entry for the second byte of the unit
 * at *pp, then returns TRUE or FALSE for that unit.  Only units whose
 * second (high) byte is zero can yield TRUE.
 */
static int
utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp,
                         const UChar* end)
{
  const UChar* p = *pp;

  /* Move the caller's cursor past this character before classifying it. */
  (*pp) += EncLen_UTF16[*(p+1)];

  if (*(p+1) == 0) {
    int c, v;

    /* 0xdf counts only when the multi-char fold flag is set. */
    if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
      return TRUE;
    }

    c = *p;
    v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c,
                       (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));
    /* NOTE(review): `v | BIT_CTYPE_LOWER` is non-zero whenever
       BIT_CTYPE_LOWER itself is non-zero, which would make the final
       return below unreachable; `&` may have been intended -- confirm
       before re-enabling this code. */
    if ((v | BIT_CTYPE_LOWER) != 0) {
      /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
      /* The test below excludes the whole range 0xaa..0xba. */
      if (c >= 0xaa && c <= 0xba)
        return FALSE;
      else
        return TRUE;
    }
    return (v != 0 ? TRUE : FALSE);
  }

  return FALSE;
}
#endif
202
203static UChar*
204utf16le_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end,
205			      OnigEncoding enc ARG_UNUSED)
206{
207  if (s <= start) return (UChar* )s;
208
209  if ((s - start) % 2 == 1) {
210    s--;
211  }
212
213  if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1)
214    s -= 2;
215
216  return (UChar* )s;
217}
218
219static int
220utf16le_get_case_fold_codes_by_str(OnigCaseFoldType flag,
221				   const OnigUChar* p, const OnigUChar* end,
222				   OnigCaseFoldCodeItem items[],
223				   OnigEncoding enc)
224{
225  return onigenc_unicode_get_case_fold_codes_by_str(enc,
226						    flag, p, end, items);
227}
228
/*
 * Encoding descriptor for UTF-16LE.
 *
 * This is a positional initializer: the order of the entries must match
 * the structure declared behind OnigEncodingDefine (declared outside this
 * file).  Entries named utf16le_* are the handlers defined above; the
 * onigenc_* entries are shared routines defined elsewhere.
 */
OnigEncodingDefine(utf_16le, UTF_16LE) = {
  utf16le_mbc_enc_len,
  "UTF-16LE",   /* name */
  4,            /* max byte length */
  2,            /* min byte length */
  utf16le_is_mbc_newline,
  utf16le_mbc_to_code,
  utf16le_code_to_mbclen,
  utf16le_code_to_mbc,
  utf16le_mbc_case_fold,
  onigenc_unicode_apply_all_case_fold,
  utf16le_get_case_fold_codes_by_str,
  onigenc_unicode_property_name_to_ctype,
  onigenc_unicode_is_code_ctype,
  onigenc_utf16_32_get_ctype_code_range,
  utf16le_left_adjust_char_head,
  onigenc_always_false_is_allowed_reverse_match,
  0,            /* NOTE(review): meaning of this slot is set by the struct declaration -- confirm there */
  ONIGENC_FLAG_UNICODE,
};
249