1/********************************************************************** 2 utf_16be.c - Oniguruma (regular expression library) 3**********************************************************************/ 4/*- 5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30#include "regenc.h" 31 32#define UTF16_IS_SURROGATE_FIRST(c) (((c) & 0xfc) == 0xd8) 33#define UTF16_IS_SURROGATE_SECOND(c) (((c) & 0xfc) == 0xdc) 34#define UTF16_IS_SURROGATE(c) (((c) & 0xf8) == 0xd8) 35 36static const int EncLen_UTF16[] = { 37 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 38 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 39 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 40 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 41 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 42 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 43 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 44 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 45 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 46 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 47 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 48 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 49 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 50 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 51 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 52 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 53}; 54 55static int 56utf16be_mbc_enc_len(const UChar* p, const OnigUChar* e ARG_UNUSED, 57 OnigEncoding enc ARG_UNUSED) 58{ 59 int byte = p[0]; 60 if (!UTF16_IS_SURROGATE(byte)) { 61 if (2 <= e-p) 62 return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2); 63 else 64 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1); 65 } 66 if (UTF16_IS_SURROGATE_FIRST(byte)) { 67 switch (e-p) { 68 case 1: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(3); 69 case 2: return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(2); 70 case 3: 71 if (UTF16_IS_SURROGATE_SECOND(p[2])) 72 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1); 73 break; 74 default: 75 if (UTF16_IS_SURROGATE_SECOND(p[2])) 76 return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4); 77 break; 78 } 79 } 80 return ONIGENC_CONSTRUCT_MBCLEN_INVALID(); 81} 82 83static int 84utf16be_is_mbc_newline(const UChar* p, const UChar* end, 85 OnigEncoding enc) 86{ 87 if (p + 1 < end) { 88 if (*(p+1) == 0x0a && *p == 0x00) 89 return 1; 90#ifdef USE_UNICODE_ALL_LINE_TERMINATORS 91 if ((*(p+1) == 0x0b || *(p+1) == 0x0c || *(p+1) == 0x0d || *(p+1) == 0x85) 92 && *p == 0x00) 93 return 1; 94 if (*p == 0x20 && (*(p+1) == 0x29 || *(p+1) == 0x28)) 95 return 1; 96#endif 97 } 98 return 0; 99} 100 101static OnigCodePoint 102utf16be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED, 103 OnigEncoding enc) 104{ 105 OnigCodePoint code; 106 107 if (UTF16_IS_SURROGATE_FIRST(*p)) { 108 code = ((((p[0] << 8) + p[1]) & 0x03ff) << 10) 109 + (((p[2] << 8) + p[3]) & 0x03ff) + 0x10000; 110 } 111 else { 112 code = p[0] * 256 + p[1]; 113 } 114 return code; 115} 116 117static int 118utf16be_code_to_mbclen(OnigCodePoint code, 119 OnigEncoding enc) 120{ 121 return (code > 0xffff ? 4 : 2); 122} 123 124static int 125utf16be_code_to_mbc(OnigCodePoint code, UChar *buf, 126 OnigEncoding enc) 127{ 128 UChar* p = buf; 129 130 if (code > 0xffff) { 131 unsigned int high = (code >> 10) + 0xD7C0; 132 unsigned int low = (code & 0x3FF) + 0xDC00; 133 *p++ = (high >> 8) & 0xFF; 134 *p++ = high & 0xFF; 135 *p++ = (low >> 8) & 0xFF; 136 *p++ = low & 0xFF; 137 return 4; 138 } 139 else { 140 *p++ = (UChar )((code & 0xff00) >> 8); 141 *p++ = (UChar )(code & 0xff); 142 return 2; 143 } 144} 145 146static int 147utf16be_mbc_case_fold(OnigCaseFoldType flag, 148 const UChar** pp, const UChar* end, UChar* fold, 149 OnigEncoding enc) 150{ 151 const UChar* p = *pp; 152 153 if (ONIGENC_IS_ASCII_CODE(*(p+1)) && *p == 0) { 154 p++; 155#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI 156 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { 157 if (*p == 0x49) { 158 *fold++ = 0x01; 159 *fold = 0x31; 160 (*pp) += 2; 161 return 2; 162 } 163 } 164#endif 165 166 *fold++ = 0; 167 *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); 168 *pp += 2; 169 return 2; 170 } 171 else 172 return onigenc_unicode_mbc_case_fold(enc, flag, 173 pp, end, fold); 174} 175 176#if 0 177static int 178utf16be_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) 179{ 180 const UChar* p = *pp; 181 182 (*pp) += EncLen_UTF16[*p]; 183 184 if (*p == 0) { 185 int c, v; 186 187 p++; 188 if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { 189 return TRUE; 190 } 191 192 c = *p; 193 v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, 194 (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); 195 196 if ((v | BIT_CTYPE_LOWER) != 0) { 197 /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ 198 if (c >= 0xaa && c <= 0xba) 199 return FALSE; 200 else 201 return TRUE; 202 } 203 return (v != 0 ? TRUE : FALSE); 204 } 205 206 return FALSE; 207} 208#endif 209 210static UChar* 211utf16be_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, 212 OnigEncoding enc ARG_UNUSED) 213{ 214 if (s <= start) return (UChar* )s; 215 216 if ((s - start) % 2 == 1) { 217 s--; 218 } 219 220 if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1) 221 s -= 2; 222 223 return (UChar* )s; 224} 225 226static int 227utf16be_get_case_fold_codes_by_str(OnigCaseFoldType flag, 228 const OnigUChar* p, const OnigUChar* end, 229 OnigCaseFoldCodeItem items[], 230 OnigEncoding enc) 231{ 232 return onigenc_unicode_get_case_fold_codes_by_str(enc, 233 flag, p, end, items); 234} 235 236OnigEncodingDefine(utf_16be, UTF_16BE) = { 237 utf16be_mbc_enc_len, 238 "UTF-16BE", /* name */ 239 4, /* max byte length */ 240 2, /* min byte length */ 241 utf16be_is_mbc_newline, 242 utf16be_mbc_to_code, 243 utf16be_code_to_mbclen, 244 utf16be_code_to_mbc, 245 utf16be_mbc_case_fold, 246 onigenc_unicode_apply_all_case_fold, 247 utf16be_get_case_fold_codes_by_str, 248 onigenc_unicode_property_name_to_ctype, 249 onigenc_unicode_is_code_ctype, 250 onigenc_utf16_32_get_ctype_code_range, 251 utf16be_left_adjust_char_head, 252 onigenc_always_false_is_allowed_reverse_match, 253 0, 254 ONIGENC_FLAG_UNICODE, 255}; 256ENC_ALIAS("UCS-2BE", "UTF-16BE") 257