1/********************************************************************** 2 iso8859_1.c - Oniguruma (regular expression library) 3**********************************************************************/ 4/*- 5 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30#include "regenc.h" 31 32#define numberof(array) (int)(sizeof(array) / sizeof((array)[0])) 33 34#define ENC_IS_ISO_8859_1_CTYPE(code,ctype) \ 35 ((EncISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0) 36 37static const unsigned short EncISO_8859_1_CtypeTable[256] = { 38 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 39 0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008, 40 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 41 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 42 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 43 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 44 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 45 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 46 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2, 47 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 48 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 49 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0, 50 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2, 51 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 52 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 53 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008, 54 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 55 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 56 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 57 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 58 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 59 0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0, 60 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0, 61 0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0, 62 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 63 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 64 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0, 65 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2, 66 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 67 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 68 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0, 69 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2 70}; 71 72static const OnigPairCaseFoldCodes CaseFoldMap[] = { 73 { 0xc0, 0xe0 }, 74 { 0xc1, 0xe1 }, 75 { 0xc2, 0xe2 }, 76 { 0xc3, 0xe3 }, 77 { 0xc4, 0xe4 }, 78 { 0xc5, 0xe5 }, 79 { 0xc6, 0xe6 }, 80 { 0xc7, 0xe7 }, 81 { 0xc8, 0xe8 }, 82 { 0xc9, 0xe9 }, 83 { 0xca, 0xea }, 84 { 0xcb, 0xeb }, 85 { 0xcc, 0xec }, 86 { 0xcd, 0xed }, 87 { 0xce, 0xee }, 88 { 0xcf, 0xef }, 89 90 { 0xd0, 0xf0 }, 91 { 0xd1, 0xf1 }, 92 { 0xd2, 0xf2 }, 93 { 0xd3, 0xf3 }, 94 { 0xd4, 0xf4 }, 95 { 0xd5, 0xf5 }, 96 { 0xd6, 0xf6 }, 97 { 0xd8, 0xf8 }, 98 { 0xd9, 0xf9 }, 99 { 0xda, 0xfa }, 100 { 0xdb, 0xfb }, 101 { 0xdc, 0xfc }, 102 { 0xdd, 0xfd }, 103 { 0xde, 0xfe } 104}; 105 106static int 107apply_all_case_fold(OnigCaseFoldType flag, 108 OnigApplyAllCaseFoldFunc f, void* arg, 109 OnigEncoding enc ARG_UNUSED) 110{ 111 return onigenc_apply_all_case_fold_with_map( 112 numberof(CaseFoldMap), CaseFoldMap, 1, 113 flag, f, arg); 114} 115 116static int 117get_case_fold_codes_by_str(OnigCaseFoldType flag, 118 const OnigUChar* p, const OnigUChar* end, 119 OnigCaseFoldCodeItem items[], 120 OnigEncoding enc ARG_UNUSED) 121{ 122 if (0x41 <= *p && *p <= 0x5a) { 123 items[0].byte_len = 1; 124 items[0].code_len = 1; 125 items[0].code[0] = (OnigCodePoint )(*p + 0x20); 126 if (*p == 0x53 && end > p + 1 127 && (*(p+1) == 0x53 || *(p+1) == 0x73)) { /* SS */ 128 items[1].byte_len = 2; 129 items[1].code_len = 1; 130 items[1].code[0] = (OnigCodePoint )0xdf; 131 return 2; 132 } 133 else 134 return 1; 135 } 136 else if (0x61 <= *p && *p <= 0x7a) { 137 items[0].byte_len = 1; 138 items[0].code_len = 1; 139 items[0].code[0] = (OnigCodePoint )(*p - 0x20); 140 if (*p == 0x73 && end > p + 1 141 && (*(p+1) == 0x73 || *(p+1) == 0x53)) { /* ss */ 142 items[1].byte_len = 2; 143 items[1].code_len = 1; 144 items[1].code[0] = (OnigCodePoint )0xdf; 145 return 2; 146 } 147 else 148 return 1; 149 } 150 else if (0xc0 <= *p && *p <= 0xcf) { 151 items[0].byte_len = 1; 152 items[0].code_len = 1; 153 items[0].code[0] = (OnigCodePoint )(*p + 0x20); 154 return 1; 155 } 156 else if (0xd0 <= *p && *p <= 0xdf) { 157 if (*p == 0xdf) { 158 items[0].byte_len = 1; 159 items[0].code_len = 2; 160 items[0].code[0] = (OnigCodePoint )'s'; 161 items[0].code[1] = (OnigCodePoint )'s'; 162 163 items[1].byte_len = 1; 164 items[1].code_len = 2; 165 items[1].code[0] = (OnigCodePoint )'S'; 166 items[1].code[1] = (OnigCodePoint )'S'; 167 168 items[2].byte_len = 1; 169 items[2].code_len = 2; 170 items[2].code[0] = (OnigCodePoint )'s'; 171 items[2].code[1] = (OnigCodePoint )'S'; 172 173 items[3].byte_len = 1; 174 items[3].code_len = 2; 175 items[3].code[0] = (OnigCodePoint )'S'; 176 items[3].code[1] = (OnigCodePoint )'s'; 177 178 return 4; 179 } 180 else if (*p != 0xd7) { 181 items[0].byte_len = 1; 182 items[0].code_len = 1; 183 items[0].code[0] = (OnigCodePoint )(*p + 0x20); 184 return 1; 185 } 186 } 187 else if (0xe0 <= *p && *p <= 0xef) { 188 items[0].byte_len = 1; 189 items[0].code_len = 1; 190 items[0].code[0] = (OnigCodePoint )(*p - 0x20); 191 return 1; 192 } 193 else if (0xf0 <= *p && *p <= 0xfe) { 194 if (*p != 0xf7) { 195 items[0].byte_len = 1; 196 items[0].code_len = 1; 197 items[0].code[0] = (OnigCodePoint )(*p - 0x20); 198 return 1; 199 } 200 } 201 202 return 0; 203} 204 205static int 206mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED, 207 UChar* lower, OnigEncoding enc ARG_UNUSED) 208{ 209 const UChar* p = *pp; 210 211 if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { 212 *lower++ = 's'; 213 *lower = 's'; 214 (*pp)++; 215 return 2; 216 } 217 218 *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); 219 (*pp)++; 220 return 1; 221} 222 223#if 0 224static int 225is_mbc_ambiguous(OnigCaseFoldType flag, 226 const UChar** pp, const UChar* end) 227{ 228 int v; 229 const UChar* p = *pp; 230 231 if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { 232 (*pp)++; 233 return TRUE; 234 } 235 236 (*pp)++; 237 v = (EncISO_8859_1_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); 238 if ((v | BIT_CTYPE_LOWER) != 0) { 239 /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ 240 if (*p >= 0xaa && *p <= 0xba) 241 return FALSE; 242 else 243 return TRUE; 244 } 245 246 return (v != 0 ? TRUE : FALSE); 247} 248#endif 249 250static int 251is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED) 252{ 253 if (code < 256) 254 return ENC_IS_ISO_8859_1_CTYPE(code, ctype); 255 else 256 return FALSE; 257} 258 259OnigEncodingDefine(iso_8859_1, ISO_8859_1) = { 260 onigenc_single_byte_mbc_enc_len, 261 "ISO-8859-1", /* name */ 262 1, /* max enc length */ 263 1, /* min enc length */ 264 onigenc_is_mbc_newline_0x0a, 265 onigenc_single_byte_mbc_to_code, 266 onigenc_single_byte_code_to_mbclen, 267 onigenc_single_byte_code_to_mbc, 268 mbc_case_fold, 269 apply_all_case_fold, 270 get_case_fold_codes_by_str, 271 onigenc_minimum_property_name_to_ctype, 272 is_code_ctype, 273 onigenc_not_support_get_ctype_code_range, 274 onigenc_single_byte_left_adjust_char_head, 275 onigenc_always_true_is_allowed_reverse_match, 276 0, 277 ONIGENC_FLAG_NONE, 278}; 279ENC_ALIAS("ISO8859-1", "ISO-8859-1") 280 281/* 282 * Name: windows-1252 283 * MIBenum: 2252 284 * Link: http://www.iana.org/assignments/character-sets 285 * Link: http://www.microsoft.com/globaldev/reference/sbcs/1252.mspx 286 * Link: http://en.wikipedia.org/wiki/Windows-1252 287 */ 288ENC_REPLICATE("Windows-1252", "ISO-8859-1") 289ENC_ALIAS("CP1252", "Windows-1252") 290