1/* Character set conversion with error handling and autodetection. 2 Copyright (C) 2002, 2005, 2007, 2009, 2010 Free Software Foundation, Inc. 3 Written by Bruno Haible. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU Lesser General Public License as published by 7 the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18#include <config.h> 19 20/* Specification. */ 21#include "striconveha.h" 22 23#include <errno.h> 24#include <stdlib.h> 25#include <string.h> 26 27#include "malloca.h" 28#include "c-strcase.h" 29#include "striconveh.h" 30 31#define SIZEOF(a) (sizeof(a)/sizeof(a[0])) 32 33 34/* Autodetection list. */ 35 36struct autodetect_alias 37{ 38 struct autodetect_alias *next; 39 const char *name; 40 const char * const *encodings_to_try; 41}; 42 43static const char * const autodetect_utf8_try[] = 44{ 45 /* Try UTF-8 first. There are very few ISO-8859-1 inputs that would 46 be valid UTF-8, but many UTF-8 inputs are valid ISO-8859-1. */ 47 "UTF-8", "ISO-8859-1", 48 NULL 49}; 50static const char * const autodetect_jp_try[] = 51{ 52 /* Try 7-bit encoding first. If the input contains bytes >= 0x80, 53 it will fail. 54 Try EUC-JP next. Short SHIFT_JIS inputs may come out wrong. This 55 is unavoidable. People will condemn SHIFT_JIS. 56 If we tried SHIFT_JIS first, then some short EUC-JP inputs would 57 come out wrong, and people would condemn EUC-JP and Unix, which 58 would not be good. 59 Finally try SHIFT_JIS. */ 60 "ISO-2022-JP-2", "EUC-JP", "SHIFT_JIS", 61 NULL 62}; 63static const char * const autodetect_kr_try[] = 64{ 65 /* Try 7-bit encoding first. If the input contains bytes >= 0x80, 66 it will fail. 67 Finally try EUC-KR. */ 68 "ISO-2022-KR", "EUC-KR", 69 NULL 70}; 71 72static struct autodetect_alias autodetect_predefined[] = 73{ 74 { &autodetect_predefined[1], "autodetect_utf8", autodetect_utf8_try }, 75 { &autodetect_predefined[2], "autodetect_jp", autodetect_jp_try }, 76 { NULL, "autodetect_kr", autodetect_kr_try } 77}; 78 79static struct autodetect_alias *autodetect_list = &autodetect_predefined[0]; 80static struct autodetect_alias **autodetect_list_end = 81 &autodetect_predefined[SIZEOF(autodetect_predefined)-1].next; 82 83int 84uniconv_register_autodetect (const char *name, 85 const char * const *try_in_order) 86{ 87 size_t namelen; 88 size_t listlen; 89 size_t memneed; 90 size_t i; 91 char *memory; 92 struct autodetect_alias *new_alias; 93 char *new_name; 94 const char **new_try_in_order; 95 96 /* The TRY_IN_ORDER list must not be empty. */ 97 if (try_in_order[0] == NULL) 98 { 99 errno = EINVAL; 100 return -1; 101 } 102 103 /* We must deep-copy NAME and TRY_IN_ORDER, because they may be allocated 104 with dynamic extent. */ 105 namelen = strlen (name) + 1; 106 memneed = sizeof (struct autodetect_alias) + namelen + sizeof (char *); 107 for (i = 0; try_in_order[i] != NULL; i++) 108 memneed += sizeof (char *) + strlen (try_in_order[i]) + 1; 109 listlen = i; 110 111 memory = (char *) malloc (memneed); 112 if (memory != NULL) 113 { 114 new_alias = (struct autodetect_alias *) memory; 115 memory += sizeof (struct autodetect_alias); 116 117 new_try_in_order = (const char **) memory; 118 memory += (listlen + 1) * sizeof (char *); 119 120 new_name = (char *) memory; 121 memcpy (new_name, name, namelen); 122 memory += namelen; 123 124 for (i = 0; i < listlen; i++) 125 { 126 size_t len = strlen (try_in_order[i]) + 1; 127 memcpy (memory, try_in_order[i], len); 128 new_try_in_order[i] = (const char *) memory; 129 memory += len; 130 } 131 new_try_in_order[i] = NULL; 132 133 /* Now insert the new alias. */ 134 new_alias->name = new_name; 135 new_alias->encodings_to_try = new_try_in_order; 136 new_alias->next = NULL; 137 /* FIXME: Not multithread-safe. */ 138 *autodetect_list_end = new_alias; 139 autodetect_list_end = &new_alias->next; 140 return 0; 141 } 142 else 143 { 144 errno = ENOMEM; 145 return -1; 146 } 147} 148 149/* Like mem_iconveha, except no handling of transliteration. */ 150static int 151mem_iconveha_notranslit (const char *src, size_t srclen, 152 const char *from_codeset, const char *to_codeset, 153 enum iconv_ilseq_handler handler, 154 size_t *offsets, 155 char **resultp, size_t *lengthp) 156{ 157 int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler, 158 offsets, resultp, lengthp); 159 if (retval >= 0 || errno != EINVAL) 160 return retval; 161 else 162 { 163 struct autodetect_alias *alias; 164 165 /* Unsupported from_codeset or to_codeset. Check whether the caller 166 requested autodetection. */ 167 for (alias = autodetect_list; alias != NULL; alias = alias->next) 168 if (strcmp (from_codeset, alias->name) == 0) 169 { 170 const char * const *encodings; 171 172 if (handler != iconveh_error) 173 { 174 /* First try all encodings without any forgiving. */ 175 encodings = alias->encodings_to_try; 176 do 177 { 178 retval = mem_iconveha_notranslit (src, srclen, 179 *encodings, to_codeset, 180 iconveh_error, offsets, 181 resultp, lengthp); 182 if (!(retval < 0 && errno == EILSEQ)) 183 return retval; 184 encodings++; 185 } 186 while (*encodings != NULL); 187 } 188 189 encodings = alias->encodings_to_try; 190 do 191 { 192 retval = mem_iconveha_notranslit (src, srclen, 193 *encodings, to_codeset, 194 handler, offsets, 195 resultp, lengthp); 196 if (!(retval < 0 && errno == EILSEQ)) 197 return retval; 198 encodings++; 199 } 200 while (*encodings != NULL); 201 202 /* Return the last call's result. */ 203 return -1; 204 } 205 206 /* It wasn't an autodetection name. */ 207 errno = EINVAL; 208 return -1; 209 } 210} 211 212int 213mem_iconveha (const char *src, size_t srclen, 214 const char *from_codeset, const char *to_codeset, 215 bool transliterate, 216 enum iconv_ilseq_handler handler, 217 size_t *offsets, 218 char **resultp, size_t *lengthp) 219{ 220 if (srclen == 0) 221 { 222 /* Nothing to convert. */ 223 *lengthp = 0; 224 return 0; 225 } 226 227 /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5, 228 we want to use transliteration. */ 229#if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105 230 if (transliterate) 231 { 232 int retval; 233 size_t len = strlen (to_codeset); 234 char *to_codeset_suffixed = (char *) malloca (len + 10 + 1); 235 memcpy (to_codeset_suffixed, to_codeset, len); 236 memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1); 237 238 retval = mem_iconveha_notranslit (src, srclen, 239 from_codeset, to_codeset_suffixed, 240 handler, offsets, resultp, lengthp); 241 242 freea (to_codeset_suffixed); 243 244 return retval; 245 } 246 else 247#endif 248 return mem_iconveha_notranslit (src, srclen, 249 from_codeset, to_codeset, 250 handler, offsets, resultp, lengthp); 251} 252 253/* Like str_iconveha, except no handling of transliteration. */ 254static char * 255str_iconveha_notranslit (const char *src, 256 const char *from_codeset, const char *to_codeset, 257 enum iconv_ilseq_handler handler) 258{ 259 char *result = str_iconveh (src, from_codeset, to_codeset, handler); 260 261 if (result != NULL || errno != EINVAL) 262 return result; 263 else 264 { 265 struct autodetect_alias *alias; 266 267 /* Unsupported from_codeset or to_codeset. Check whether the caller 268 requested autodetection. */ 269 for (alias = autodetect_list; alias != NULL; alias = alias->next) 270 if (strcmp (from_codeset, alias->name) == 0) 271 { 272 const char * const *encodings; 273 274 if (handler != iconveh_error) 275 { 276 /* First try all encodings without any forgiving. */ 277 encodings = alias->encodings_to_try; 278 do 279 { 280 result = str_iconveha_notranslit (src, 281 *encodings, to_codeset, 282 iconveh_error); 283 if (!(result == NULL && errno == EILSEQ)) 284 return result; 285 encodings++; 286 } 287 while (*encodings != NULL); 288 } 289 290 encodings = alias->encodings_to_try; 291 do 292 { 293 result = str_iconveha_notranslit (src, 294 *encodings, to_codeset, 295 handler); 296 if (!(result == NULL && errno == EILSEQ)) 297 return result; 298 encodings++; 299 } 300 while (*encodings != NULL); 301 302 /* Return the last call's result. */ 303 return NULL; 304 } 305 306 /* It wasn't an autodetection name. */ 307 errno = EINVAL; 308 return NULL; 309 } 310} 311 312char * 313str_iconveha (const char *src, 314 const char *from_codeset, const char *to_codeset, 315 bool transliterate, 316 enum iconv_ilseq_handler handler) 317{ 318 if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0) 319 { 320 char *result = strdup (src); 321 322 if (result == NULL) 323 errno = ENOMEM; 324 return result; 325 } 326 327 /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5, 328 we want to use transliteration. */ 329#if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105 330 if (transliterate) 331 { 332 char *result; 333 size_t len = strlen (to_codeset); 334 char *to_codeset_suffixed = (char *) malloca (len + 10 + 1); 335 memcpy (to_codeset_suffixed, to_codeset, len); 336 memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1); 337 338 result = str_iconveha_notranslit (src, from_codeset, to_codeset_suffixed, 339 handler); 340 341 freea (to_codeset_suffixed); 342 343 return result; 344 } 345 else 346#endif 347 return str_iconveha_notranslit (src, from_codeset, to_codeset, handler); 348} 349