1/********************************************************************** 2 utf_16le.c - Oniguruma (regular expression library) 3**********************************************************************/ 4/*- 5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30#include "regenc.h" 31 32#define UTF16_IS_SURROGATE_FIRST(c) (((c) & 0xfc) == 0xd8) 33#define UTF16_IS_SURROGATE_SECOND(c) (((c) & 0xfc) == 0xdc) 34#define UTF16_IS_SURROGATE(c) (((c) & 0xf8) == 0xd8) 35 36static const int EncLen_UTF16[] = { 37 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 38 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 39 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 40 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 41 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 42 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 43 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 44 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 45 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 46 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 47 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 48 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 49 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 50 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 51 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 52 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 53}; 54 55static int 56utf16le_mbc_enc_len(const UChar* p, const OnigUChar* e, 57 OnigEncoding enc ARG_UNUSED) 58{ 59 int len = (int)(e - p); 60 UChar byte; 61 if (len < 2) 62 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1); 63 byte = p[1]; 64 if (!UTF16_IS_SURROGATE(byte)) { 65 return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2); 66 } 67 if (UTF16_IS_SURROGATE_FIRST(byte)) { 68 if (len < 4) 69 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-len); 70 if (UTF16_IS_SURROGATE_SECOND(p[3])) 71 return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4); 72 } 73 return ONIGENC_CONSTRUCT_MBCLEN_INVALID(); 74} 75 76static int 77utf16le_is_mbc_newline(const UChar* p, const UChar* end, 78 OnigEncoding enc ARG_UNUSED) 79{ 80 if (p + 1 < end) { 81 if (*p == 0x0a && *(p+1) == 0x00) 82 return 1; 83#ifdef USE_UNICODE_ALL_LINE_TERMINATORS 84 if ((*p == 0x0b || *p == 0x0c || *p == 0x0d || *p == 0x85) 85 && *(p+1) == 0x00) 86 return 1; 87 if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28)) 88 return 1; 89#endif 90 } 91 return 0; 92} 93 94static OnigCodePoint 95utf16le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED, 96 OnigEncoding enc ARG_UNUSED) 97{ 98 OnigCodePoint code; 99 UChar c0 = *p; 100 UChar c1 = *(p+1); 101 102 if (UTF16_IS_SURROGATE_FIRST(c1)) { 103 code = ((((c1 << 8) + c0) & 0x03ff) << 10) 104 + (((p[3] << 8) + p[2]) & 0x03ff) + 0x10000; 105 } 106 else { 107 code = c1 * 256 + p[0]; 108 } 109 return code; 110} 111 112static int 113utf16le_code_to_mbclen(OnigCodePoint code, 114 OnigEncoding enc ARG_UNUSED) 115{ 116 return (code > 0xffff ? 4 : 2); 117} 118 119static int 120utf16le_code_to_mbc(OnigCodePoint code, UChar *buf, 121 OnigEncoding enc ARG_UNUSED) 122{ 123 UChar* p = buf; 124 125 if (code > 0xffff) { 126 unsigned int high = (code >> 10) + 0xD7C0; 127 unsigned int low = (code & 0x3FF) + 0xDC00; 128 *p++ = high & 0xFF; 129 *p++ = (high >> 8) & 0xFF; 130 *p++ = low & 0xFF; 131 *p++ = (low >> 8) & 0xFF; 132 return 4; 133 } 134 else { 135 *p++ = (UChar )(code & 0xff); 136 *p++ = (UChar )((code & 0xff00) >> 8); 137 return 2; 138 } 139} 140 141static int 142utf16le_mbc_case_fold(OnigCaseFoldType flag, 143 const UChar** pp, const UChar* end, UChar* fold, 144 OnigEncoding enc) 145{ 146 const UChar* p = *pp; 147 148 if (ONIGENC_IS_ASCII_CODE(*p) && *(p+1) == 0) { 149#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI 150 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { 151 if (*p == 0x49) { 152 *fold++ = 0x31; 153 *fold = 0x01; 154 (*pp) += 2; 155 return 2; 156 } 157 } 158#endif 159 160 *fold++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); 161 *fold = 0; 162 *pp += 2; 163 return 2; 164 } 165 else 166 return onigenc_unicode_mbc_case_fold(enc, flag, pp, 167 end, fold); 168} 169 170#if 0 171static int 172utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, 173 const UChar* end) 174{ 175 const UChar* p = *pp; 176 177 (*pp) += EncLen_UTF16[*(p+1)]; 178 179 if (*(p+1) == 0) { 180 int c, v; 181 182 if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { 183 return TRUE; 184 } 185 186 c = *p; 187 v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, 188 (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); 189 if ((v | BIT_CTYPE_LOWER) != 0) { 190 /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ 191 if (c >= 0xaa && c <= 0xba) 192 return FALSE; 193 else 194 return TRUE; 195 } 196 return (v != 0 ? TRUE : FALSE); 197 } 198 199 return FALSE; 200} 201#endif 202 203static UChar* 204utf16le_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, 205 OnigEncoding enc ARG_UNUSED) 206{ 207 if (s <= start) return (UChar* )s; 208 209 if ((s - start) % 2 == 1) { 210 s--; 211 } 212 213 if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1) 214 s -= 2; 215 216 return (UChar* )s; 217} 218 219static int 220utf16le_get_case_fold_codes_by_str(OnigCaseFoldType flag, 221 const OnigUChar* p, const OnigUChar* end, 222 OnigCaseFoldCodeItem items[], 223 OnigEncoding enc) 224{ 225 return onigenc_unicode_get_case_fold_codes_by_str(enc, 226 flag, p, end, items); 227} 228 229OnigEncodingDefine(utf_16le, UTF_16LE) = { 230 utf16le_mbc_enc_len, 231 "UTF-16LE", /* name */ 232 4, /* max byte length */ 233 2, /* min byte length */ 234 utf16le_is_mbc_newline, 235 utf16le_mbc_to_code, 236 utf16le_code_to_mbclen, 237 utf16le_code_to_mbc, 238 utf16le_mbc_case_fold, 239 onigenc_unicode_apply_all_case_fold, 240 utf16le_get_case_fold_codes_by_str, 241 onigenc_unicode_property_name_to_ctype, 242 onigenc_unicode_is_code_ctype, 243 onigenc_utf16_32_get_ctype_code_range, 244 utf16le_left_adjust_char_head, 245 onigenc_always_false_is_allowed_reverse_match, 246 0, 247 ONIGENC_FLAG_UNICODE, 248}; 249