1/**********************************************************************
2  utf_16be.c -  Oniguruma (regular expression library)
3**********************************************************************/
4/*-
5 * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include "regenc.h"
31
/*
 * Surrogate tests on a single byte.  Throughout this file they are applied
 * to the first (high-order) byte of a big-endian 16-bit code unit.
 */
/* True for bytes 0xd8-0xdb: first byte of a high (leading) surrogate unit. */
#define UTF16_IS_SURROGATE_FIRST(c)    (((c) & 0xfc) == 0xd8)
/* True for bytes 0xdc-0xdf: first byte of a low (trailing) surrogate unit. */
#define UTF16_IS_SURROGATE_SECOND(c)   (((c) & 0xfc) == 0xdc)
/* True for bytes 0xd8-0xdf: first byte of either kind of surrogate unit. */
#define UTF16_IS_SURROGATE(c)          (((c) & 0xf8) == 0xd8)
35
/*
 * Character length in bytes, indexed by a byte value (0x00-0xff).
 * Entries 0xd8-0xdb hold 4; every other entry holds 2.
 * Each row below covers 16 consecutive indices starting at the offset
 * shown in the row comment.
 */
static const int EncLen_UTF16[256] = {
  /* 0x00 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x10 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x20 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x30 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x40 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x50 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x60 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x70 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x80 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
54
55static int
56utf16be_mbc_enc_len(const UChar* p, const OnigUChar* e ARG_UNUSED,
57		    OnigEncoding enc ARG_UNUSED)
58{
59  int byte = p[0];
60  if (!UTF16_IS_SURROGATE(byte)) {
61    if (2 <= e-p)
62      return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2);
63    else
64      return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
65  }
66  if (UTF16_IS_SURROGATE_FIRST(byte)) {
67    switch (e-p) {
68      case 1: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(3);
69      case 2: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(2);
70      case 3:
71        if (UTF16_IS_SURROGATE_SECOND(p[2]))
72          return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
73        break;
74      default:
75        if (UTF16_IS_SURROGATE_SECOND(p[2]))
76          return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4);
77        break;
78    }
79  }
80  return ONIGENC_CONSTRUCT_MBCLEN_INVALID();
81}
82
83static int
84utf16be_is_mbc_newline(const UChar* p, const UChar* end,
85		       OnigEncoding enc)
86{
87  if (p + 1 < end) {
88    if (*(p+1) == 0x0a && *p == 0x00)
89      return 1;
90#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
91    if ((*(p+1) == 0x0b || *(p+1) == 0x0c || *(p+1) == 0x0d || *(p+1) == 0x85)
92	&& *p == 0x00)
93      return 1;
94    if (*p == 0x20 && (*(p+1) == 0x29 || *(p+1) == 0x28))
95      return 1;
96#endif
97  }
98  return 0;
99}
100
101static OnigCodePoint
102utf16be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED,
103		    OnigEncoding enc)
104{
105  OnigCodePoint code;
106
107  if (UTF16_IS_SURROGATE_FIRST(*p)) {
108    code = ((((p[0] << 8) + p[1]) & 0x03ff) << 10)
109         + (((p[2] << 8) + p[3]) & 0x03ff) + 0x10000;
110  }
111  else {
112    code = p[0] * 256 + p[1];
113  }
114  return code;
115}
116
117static int
118utf16be_code_to_mbclen(OnigCodePoint code,
119		       OnigEncoding enc)
120{
121  return (code > 0xffff ? 4 : 2);
122}
123
124static int
125utf16be_code_to_mbc(OnigCodePoint code, UChar *buf,
126		    OnigEncoding enc)
127{
128  UChar* p = buf;
129
130  if (code > 0xffff) {
131    unsigned int high = (code >> 10) + 0xD7C0;
132    unsigned int low = (code & 0x3FF) + 0xDC00;
133    *p++ = (high >> 8) & 0xFF;
134    *p++ = high & 0xFF;
135    *p++ = (low >> 8) & 0xFF;
136    *p++ = low & 0xFF;
137    return 4;
138  }
139  else {
140    *p++ = (UChar )((code & 0xff00) >> 8);
141    *p++ = (UChar )(code & 0xff);
142    return 2;
143  }
144}
145
146static int
147utf16be_mbc_case_fold(OnigCaseFoldType flag,
148		      const UChar** pp, const UChar* end, UChar* fold,
149		      OnigEncoding enc)
150{
151  const UChar* p = *pp;
152
153  if (ONIGENC_IS_ASCII_CODE(*(p+1)) && *p == 0) {
154    p++;
155#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
156    if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
157      if (*p == 0x49) {
158	*fold++ = 0x01;
159	*fold   = 0x31;
160	(*pp) += 2;
161	return 2;
162      }
163    }
164#endif
165
166    *fold++ = 0;
167    *fold   = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
168    *pp += 2;
169    return 2;
170  }
171  else
172    return onigenc_unicode_mbc_case_fold(enc, flag,
173					 pp, end, fold);
174}
175
#if 0
/*
 * Compiled out by the enclosing #if 0; kept for reference only.
 *
 * Advances *pp by EncLen_UTF16[first byte] and returns TRUE or FALSE.
 * Only characters whose first byte is 0 (U+0000-U+00FF) can yield TRUE;
 * everything else returns FALSE.  `end` is not consulted.
 *
 * NOTE(review): unlike the live functions in this file, this one takes
 * no OnigEncoding argument -- presumably it predates the current
 * callback signatures; confirm before re-enabling.
 */
static int
utf16be_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end)
{
  const UChar* p = *pp;

  (*pp) += EncLen_UTF16[*p];

  if (*p == 0) {
    int c, v;

    /* Move to the low-order byte, which holds the whole code point here. */
    p++;
    /* 0xdf is only reported when multi-character folding is requested. */
    if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
      return TRUE;
    }

    c = *p;
    v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c,
		(BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));

    /*
     * NOTE(review): this test ORs v with BIT_CTYPE_LOWER, so assuming
     * BIT_CTYPE_LOWER is non-zero it is always true and the final
     * `return (v != 0 ...)` below is unreachable.  A bitwise AND may have
     * been intended, but that depends on what the macro above returns
     * (masked bits vs. a boolean) -- verify before changing.
     */
    if ((v | BIT_CTYPE_LOWER) != 0) {
      /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
      if (c >= 0xaa && c <= 0xba)
	return FALSE;
      else
	return TRUE;
    }
    return (v != 0 ? TRUE : FALSE);
  }

  return FALSE;
}
#endif
209
210static UChar*
211utf16be_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end,
212			      OnigEncoding enc ARG_UNUSED)
213{
214  if (s <= start) return (UChar* )s;
215
216  if ((s - start) % 2 == 1) {
217    s--;
218  }
219
220  if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1)
221    s -= 2;
222
223  return (UChar* )s;
224}
225
226static int
227utf16be_get_case_fold_codes_by_str(OnigCaseFoldType flag,
228				   const OnigUChar* p, const OnigUChar* end,
229				   OnigCaseFoldCodeItem items[],
230				   OnigEncoding enc)
231{
232  return onigenc_unicode_get_case_fold_codes_by_str(enc,
233						    flag, p, end, items);
234}
235
/*
 * Encoding descriptor for UTF-16BE.
 *
 * The initializer is positional: it combines the utf16be_* functions
 * defined in this file with shared onigenc_* helpers declared elsewhere.
 * The entry order must match the structure that OnigEncodingDefine
 * declares; that declaration is not visible in this file (presumably it
 * comes via regenc.h -- confirm before reordering or inserting entries).
 */
OnigEncodingDefine(utf_16be, UTF_16BE) = {
  utf16be_mbc_enc_len,
  "UTF-16BE",   /* name */
  4,            /* max byte length */
  2,            /* min byte length */
  utf16be_is_mbc_newline,
  utf16be_mbc_to_code,
  utf16be_code_to_mbclen,
  utf16be_code_to_mbc,
  utf16be_mbc_case_fold,
  onigenc_unicode_apply_all_case_fold,
  utf16be_get_case_fold_codes_by_str,
  onigenc_unicode_property_name_to_ctype,
  onigenc_unicode_is_code_ctype,
  onigenc_utf16_32_get_ctype_code_range,
  utf16be_left_adjust_char_head,
  onigenc_always_false_is_allowed_reverse_match,
  0,            /* NOTE(review): meaning of this slot is not visible here; check the struct declaration */
  ONIGENC_FLAG_UNICODE,
};
/* NOTE(review): presumably makes "UCS-2BE" an alternate name for "UTF-16BE"; ENC_ALIAS is defined elsewhere -- confirm. */
ENC_ALIAS("UCS-2BE", "UTF-16BE")
257