1/********************************************************************** 2 euc_tw.c - Oniguruma (regular expression library) 3**********************************************************************/ 4/*- 5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30#include "regenc.h" 31 32static const int EncLen_EUCTW[] = { 33 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 34 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 35 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 36 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 37 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 38 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 39 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 40 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 41 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 42 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 43 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 44 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 45 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 46 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 47 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 48 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 49}; 50 51typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2, S3 } state_t; 52#define A ACCEPT 53#define F FAILURE 54static const signed char trans[][0x100] = { 55 { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 56 /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 57 /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 58 /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 59 /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 60 /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 61 /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 62 /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 63 /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 64 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, 2, F, 65 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 66 /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 67 /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 68 /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 69 /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 70 /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 71 /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F 72 }, 73 { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 74 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 75 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 76 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 77 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 78 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 79 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 80 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 81 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 82 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 83 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 84 /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 85 /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 86 /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 87 /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 88 /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, 89 /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F 90 }, 91 { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 92 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 93 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 94 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 95 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 96 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 97 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 98 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 99 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 100 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 101 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 102 /* a */ F, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 103 /* b */ 3, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 104 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 105 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 106 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 107 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F 108 }, 109 { /* S3 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 110 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 111 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 112 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 113 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 114 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 115 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 116 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 117 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 118 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 119 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, 120 /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 121 /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 122 /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 123 /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 124 /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 125 /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F 126 } 127}; 128#undef A 129#undef F 130 131static int 132euctw_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED) 133{ 134 int firstbyte = *p++; 135 state_t s = trans[0][firstbyte]; 136#define RETURN(n) \ 137 return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) : \ 138 ONIGENC_CONSTRUCT_MBCLEN_INVALID() 139 if (s < 0) RETURN(1); 140 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCTW[firstbyte]-1); 141 s = trans[s][*p++]; 142 if (s < 0) RETURN(2); 143 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-2); 144 s = trans[s][*p++]; 145 if (s < 0) RETURN(3); 146 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-3); 147 s = trans[s][*p++]; 148 RETURN(4); 149#undef RETURN 150} 151 152static OnigCodePoint 153euctw_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc ARG_UNUSED) 154{ 155 return onigenc_mbn_mbc_to_code(enc, p, end); 156} 157 158static int 159euctw_code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc) 160{ 161 return onigenc_mb4_code_to_mbc(enc, code, buf); 162} 163 164static int 165euctw_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end, 166 UChar* lower, OnigEncoding enc) 167{ 168 return onigenc_mbn_mbc_case_fold(enc, flag, 169 pp, end, lower); 170} 171 172static int 173euctw_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc) 174{ 175 return onigenc_mb4_is_code_ctype(enc, code, ctype); 176} 177 178#define euctw_islead(c) ((UChar )((c) - 0xa1) > 0xfe - 0xa1) 179 180static UChar* 181euctw_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc) 182{ 183 /* Assumed in this encoding, 184 mb-trail bytes don't mix with single bytes. 185 */ 186 const UChar *p; 187 int len; 188 189 if (s <= start) return (UChar* )s; 190 p = s; 191 192 while (!euctw_islead(*p) && p > start) p--; 193 len = enclen(enc, p, end); 194 if (p + len > s) return (UChar* )p; 195 p += len; 196 return (UChar* )(p + ((s - p) & ~1)); 197} 198 199static int 200euctw_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED, OnigEncoding enc ARG_UNUSED) 201{ 202 const UChar c = *s; 203 if (c <= 0x7e) return TRUE; 204 else return FALSE; 205} 206 207OnigEncodingDefine(euc_tw, EUC_TW) = { 208 euctw_mbc_enc_len, 209 "EUC-TW", /* name */ 210 4, /* max enc length */ 211 1, /* min enc length */ 212 onigenc_is_mbc_newline_0x0a, 213 euctw_mbc_to_code, 214 onigenc_mb4_code_to_mbclen, 215 euctw_code_to_mbc, 216 euctw_mbc_case_fold, 217 onigenc_ascii_apply_all_case_fold, 218 onigenc_ascii_get_case_fold_codes_by_str, 219 onigenc_minimum_property_name_to_ctype, 220 euctw_is_code_ctype, 221 onigenc_not_support_get_ctype_code_range, 222 euctw_left_adjust_char_head, 223 euctw_is_allowed_reverse_match, 224 0, 225 ONIGENC_FLAG_NONE, 226}; 227ENC_ALIAS("eucTW", "EUC-TW") 228