1/********************************************************************** 2 utf_8.c - Oniguruma (regular expression library) 3**********************************************************************/ 4/*- 5 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30#include "regenc.h" 31 32#define USE_INVALID_CODE_SCHEME 33 34#ifdef USE_INVALID_CODE_SCHEME 35/* virtual codepoint values for invalid encoding byte 0xfe and 0xff */ 36#define INVALID_CODE_FE 0xfffffffe 37#define INVALID_CODE_FF 0xffffffff 38#define VALID_CODE_LIMIT 0x7fffffff 39#endif 40 41#define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80) 42 43static const int EncLen_UTF8[] = { 44 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 45 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 46 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 47 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 48 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 49 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 50 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 51 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 52 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 53 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 54 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 55 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 56 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 57 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 58 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 59 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 60}; 61 62typedef enum { 63 FAILURE = -2, 64 ACCEPT, 65 S0, S1, S2, S3, 66 S4, S5, S6, S7 67} state_t; 68#define A ACCEPT 69#define F FAILURE 70static const signed char trans[][0x100] = { 71 { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 72 /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 73 /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 74 /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 75 /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 76 /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 77 /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 78 /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 79 /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 80 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 81 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 82 /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 83 /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 84 /* c */ F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 85 /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 86 /* e */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 87 /* f */ 5, 6, 6, 6, 7, F, F, F, F, F, F, F, F, F, F, F 88 }, 89 { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 90 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 91 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 92 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 93 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 94 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 95 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 96 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 97 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 98 /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 99 /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 100 /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 101 /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 102 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 103 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 104 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 105 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 106 }, 107 { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 108 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 109 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 110 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 111 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 112 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 113 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 114 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 115 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 116 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 117 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 118 /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 119 /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 120 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 121 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 122 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 123 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 124 }, 125 { /* S3 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 126 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 127 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 128 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 129 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 130 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 131 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 132 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 133 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 134 /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 135 /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 136 /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 137 /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 138 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 139 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 140 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 141 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 142 }, 143 { /* S4 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 144 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 145 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 146 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 147 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 148 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 149 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 150 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 151 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 152 /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 153 /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 154 /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 155 /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 156 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 157 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 158 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 159 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 160 }, 161 { /* S5 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 162 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 163 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 164 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 165 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 166 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 167 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 168 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 169 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 170 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 171 /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 172 /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 173 /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 174 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 175 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 176 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 177 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 178 }, 179 { /* S6 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 180 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 181 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 182 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 183 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 184 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 185 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 186 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 187 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 188 /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 189 /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 190 /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 191 /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 192 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 193 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 194 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 195 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 196 }, 197 { /* S7 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 198 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 199 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 200 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 201 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 202 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 203 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 204 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 205 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 206 /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 207 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 208 /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 209 /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 210 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 211 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 212 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 213 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 214 }, 215}; 216#undef A 217#undef F 218 219static int 220mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) 221{ 222 int firstbyte = *p++; 223 state_t s; 224 s = trans[0][firstbyte]; 225 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) : 226 ONIGENC_CONSTRUCT_MBCLEN_INVALID(); 227 228 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1); 229 s = trans[s][*p++]; 230 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) : 231 ONIGENC_CONSTRUCT_MBCLEN_INVALID(); 232 233 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2); 234 s = trans[s][*p++]; 235 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) : 236 ONIGENC_CONSTRUCT_MBCLEN_INVALID(); 237 238 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3); 239 s = trans[s][*p++]; 240 return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) : 241 ONIGENC_CONSTRUCT_MBCLEN_INVALID(); 242} 243 244static int 245is_mbc_newline(const UChar* p, const UChar* end, OnigEncoding enc) 246{ 247 if (p < end) { 248 if (*p == 0x0a) return 1; 249 250#ifdef USE_UNICODE_ALL_LINE_TERMINATORS 251 if (*p == 0x0b || *p == 0x0c || *p == 0x0d) return 1; 252 if (p + 1 < end) { 253 if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */ 254 return 1; 255 if (p + 2 < end) { 256 if ((*(p+2) == 0xa8 || *(p+2) == 0xa9) 257 && *(p+1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */ 258 return 1; 259 } 260 } 261#endif 262 } 263 264 return 0; 265} 266 267static OnigCodePoint 268mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc) 269{ 270 int c, len; 271 OnigCodePoint n; 272 273 len = mbc_enc_len(p, end, enc); 274 c = *p++; 275 if (len > 1) { 276 len--; 277 n = c & ((1 << (6 - len)) - 1); 278 while (len--) { 279 c = *p++; 280 n = (n << 6) | (c & ((1 << 6) - 1)); 281 } 282 return n; 283 } 284 else { 285#ifdef USE_INVALID_CODE_SCHEME 286 if (c > 0xfd) { 287 return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF); 288 } 289#endif 290 return (OnigCodePoint )c; 291 } 292} 293 294static int 295code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED) 296{ 297 if ((code & 0xffffff80) == 0) return 1; 298 else if ((code & 0xfffff800) == 0) return 2; 299 else if ((code & 0xffff0000) == 0) return 3; 300 else if ((code & 0xffe00000) == 0) return 4; 301 else if ((code & 0xfc000000) == 0) return 5; 302 else if ((code & 0x80000000) == 0) return 6; 303#ifdef USE_INVALID_CODE_SCHEME 304 else if (code == INVALID_CODE_FE) return 1; 305 else if (code == INVALID_CODE_FF) return 1; 306#endif 307 else 308 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; 309} 310 311static int 312code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED) 313{ 314#define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80) 315#define UTF8_TRAIL0(code) (UChar )(((code) & 0x3f) | 0x80) 316 317 if ((code & 0xffffff80) == 0) { 318 *buf = (UChar )code; 319 return 1; 320 } 321 else { 322 UChar *p = buf; 323 324 if ((code & 0xfffff800) == 0) { 325 *p++ = (UChar )(((code>>6)& 0x1f) | 0xc0); 326 } 327 else if ((code & 0xffff0000) == 0) { 328 *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0); 329 *p++ = UTF8_TRAILS(code, 6); 330 } 331 else if ((code & 0xffe00000) == 0) { 332 *p++ = (UChar )(((code>>18) & 0x07) | 0xf0); 333 *p++ = UTF8_TRAILS(code, 12); 334 *p++ = UTF8_TRAILS(code, 6); 335 } 336 else if ((code & 0xfc000000) == 0) { 337 *p++ = (UChar )(((code>>24) & 0x03) | 0xf8); 338 *p++ = UTF8_TRAILS(code, 18); 339 *p++ = UTF8_TRAILS(code, 12); 340 *p++ = UTF8_TRAILS(code, 6); 341 } 342 else if ((code & 0x80000000) == 0) { 343 *p++ = (UChar )(((code>>30) & 0x01) | 0xfc); 344 *p++ = UTF8_TRAILS(code, 24); 345 *p++ = UTF8_TRAILS(code, 18); 346 *p++ = UTF8_TRAILS(code, 12); 347 *p++ = UTF8_TRAILS(code, 6); 348 } 349#ifdef USE_INVALID_CODE_SCHEME 350 else if (code == INVALID_CODE_FE) { 351 *p = 0xfe; 352 return 1; 353 } 354 else if (code == INVALID_CODE_FF) { 355 *p = 0xff; 356 return 1; 357 } 358#endif 359 else { 360 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; 361 } 362 363 *p++ = UTF8_TRAIL0(code); 364 return (int )(p - buf); 365 } 366} 367 368static int 369mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, 370 const UChar* end, UChar* fold, OnigEncoding enc) 371{ 372 const UChar* p = *pp; 373 374 if (ONIGENC_IS_MBC_ASCII(p)) { 375#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI 376 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) { 377 if (*p == 0x49) { 378 *fold++ = 0xc4; 379 *fold = 0xb1; 380 (*pp)++; 381 return 2; 382 } 383 } 384#endif 385 386 *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); 387 (*pp)++; 388 return 1; /* return byte length of converted char to lower */ 389 } 390 else { 391 return onigenc_unicode_mbc_case_fold(enc, flag, pp, end, fold); 392 } 393} 394 395 396static int 397get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out, 398 const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED) 399{ 400 *sb_out = 0x80; 401 return onigenc_unicode_ctype_code_range(ctype, ranges); 402} 403 404 405static UChar* 406left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED) 407{ 408 const UChar *p; 409 410 if (s <= start) return (UChar* )s; 411 p = s; 412 413 while (!utf8_islead(*p) && p > start) p--; 414 return (UChar* )p; 415} 416 417static int 418get_case_fold_codes_by_str(OnigCaseFoldType flag, 419 const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[], 420 OnigEncoding enc) 421{ 422 return onigenc_unicode_get_case_fold_codes_by_str(enc, flag, p, end, items); 423} 424 425OnigEncodingDefine(utf_8, UTF_8) = { 426 mbc_enc_len, 427 "UTF-8", /* name */ 428 6, /* max byte length */ 429 1, /* min byte length */ 430 is_mbc_newline, 431 mbc_to_code, 432 code_to_mbclen, 433 code_to_mbc, 434 mbc_case_fold, 435 onigenc_unicode_apply_all_case_fold, 436 get_case_fold_codes_by_str, 437 onigenc_unicode_property_name_to_ctype, 438 onigenc_unicode_is_code_ctype, 439 get_ctype_code_range, 440 left_adjust_char_head, 441 onigenc_always_true_is_allowed_reverse_match, 442 0, 443 ONIGENC_FLAG_UNICODE, 444}; 445ENC_ALIAS("CP65001", "UTF-8") 446 447/* 448 * Name: UTF8-MAC 449 * Link: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/BPFileSystem.html 450 * Link: http://developer.apple.com/qa/qa2001/qa1235.html 451 * Link: http://developer.apple.com/jp/qa/qa2001/qa1235.html 452 * Link: http://www.gnu.org/software/emacs/NEWS.23.2 453 */ 454ENC_REPLICATE("UTF8-MAC", "UTF-8") 455ENC_ALIAS("UTF-8-MAC", "UTF8-MAC") 456ENC_ALIAS("UTF-8-HFS", "UTF8-MAC") /* Emacs 23.2 */ 457 458