/*
 * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#if HAVE_CONFIG_H
#  include <config.h>
#endif

#ifdef HAVE_ICONV

#include <assert.h>
#include <errno.h>
#include <iconv.h>
#include <stdlib.h>
#include <string.h>

#include "iconvert.h"
#include "share/alloc.h"

/*
 * Step over exactly one UTF-8 encoded character: the byte at *ib plus
 * any continuation bytes (10xxxxxx) that follow it.  Requires *ibl > 0.
 * Both UTF-8 -> target passes use this so that they skip the same bytes
 * and therefore agree on the output length.
 */
static void skip_utf8_char_(char **ib, size_t *ibl)
{
	++*ib;
	--*ibl;
	while (*ibl && ((unsigned char)**ib & 0xC0) == 0x80) {
		++*ib;
		--*ibl;
	}
}

/*
 * Convert data from one encoding to another. Return:
 *
 *  -2 : memory allocation failed
 *  -1 : unknown encoding
 *   0 : data was converted exactly
 *   1 : data was converted inexactly
 *   2 : data was invalid (but still converted)
 *
 * Parameters:
 *
 *  fromcode, tocode : iconv encoding names of the input and the output
 *  from, fromlen    : input bytes and their count
 *  to               : if non-null, receives a NUL-terminated heap buffer
 *                     holding the converted data; this function does not
 *                     free it (presumably the caller releases it with
 *                     free())
 *  tolen            : if non-null, receives the converted length in
 *                     bytes, not counting the terminating NUL
 *
 * Input bytes that are invalid in fromcode are replaced by '#';
 * characters that cannot be represented in tocode are replaced by '?'.
 *
 * We convert in two steps, via UTF-8, as this is the only
 * reliable way of distinguishing between invalid input
 * and valid input which iconv refuses to transliterate.
 * We convert from UTF-8 twice, because we have no way of
 * knowing whether the conversion was exact if iconv returns
 * E2BIG (due to a bug in the specification of iconv).
 * An alternative approach is to assume that the output of
 * iconv is never more than 4 times as long as the input,
 * but I prefer to avoid that assumption if possible.
 */

int iconvert(const char *fromcode, const char *tocode,
	     const char *from, size_t fromlen,
	     char **to, size_t *tolen)
{
	int ret = 0;
	iconv_t cd1, cd2;
	char *ib;
	char *ob;
	char *utfbuf = 0, *outbuf, *newbuf;
	size_t utflen, outlen, ibl, obl, k, used;
	char tbuf[2048];

	cd1 = iconv_open("UTF-8", fromcode);
	if (cd1 == (iconv_t)(-1))
		return -1;

	cd2 = (iconv_t)(-1);
	/*
	 * Don't use strcasecmp() as it's locale-dependent.
	 * Don't use strchr() either: it matches the terminating NUL, so a
	 * short name such as "" or "U" would be read past its end.  These
	 * comparisons short-circuit at the first mismatch, including NUL.
	 */
	if ((tocode[0] != 'U' && tocode[0] != 'u') ||
	    (tocode[1] != 'T' && tocode[1] != 't') ||
	    (tocode[2] != 'F' && tocode[2] != 'f') ||
	    tocode[3] != '-' ||
	    tocode[4] != '8' ||
	    tocode[5] != '\0') {
		char *tocode1;

		/*
		 * Try using this non-standard feature of glibc and libiconv.
		 * This is deliberately not a config option as people often
		 * change their iconv library without rebuilding applications.
		 */
		/* 11 == strlen("//TRANSLIT") + 1 for the terminating NUL */
		tocode1 = (char *)safe_malloc_add_2op_(strlen(tocode), /*+*/11);
		if (!tocode1)
			goto fail;

		strcpy(tocode1, tocode);
		strcat(tocode1, "//TRANSLIT");
		cd2 = iconv_open(tocode1, "UTF-8");
		free(tocode1);

		/*
		 * Fall back to plain conversion.  The source must be UTF-8,
		 * not fromcode: cd2 is only ever fed the intermediate UTF-8
		 * buffer produced by cd1.
		 */
		if (cd2 == (iconv_t)(-1))
			cd2 = iconv_open(tocode, "UTF-8");

		if (cd2 == (iconv_t)(-1)) {
			iconv_close(cd1);
			return -1;
		}
	}

	utflen = 1; /*fromlen * 2 + 1; XXX */
	utfbuf = (char *)malloc(utflen);
	if (!utfbuf)
		goto fail;

	/* Convert to UTF-8 */
	ib = (char *)from;
	ibl = fromlen;
	ob = utfbuf;
	obl = utflen;
	for (;;) {
		k = iconv(cd1, &ib, &ibl, &ob, &obl);
		assert((!k && !ibl) ||
		       (k == (size_t)(-1) && errno == E2BIG && ibl && obl < 6) ||
		       (k == (size_t)(-1) &&
			(errno == EILSEQ || errno == EINVAL) && ibl));
		if (!ibl)
			break;
		if (obl < 6) {
			/* Enlarge the buffer */
			if (utflen * 2 < utflen) /* overflow check */
				goto fail;
			/*
			 * Record the write offset before realloc(); the old
			 * pointer value must not be used afterwards.
			 */
			used = (size_t)(ob - utfbuf);
			utflen *= 2;
			newbuf = (char *)realloc(utfbuf, utflen);
			if (!newbuf)
				goto fail;
			utfbuf = newbuf;
			ob = utfbuf + used;
			obl = utflen - used;
		}
		else {
			/* Invalid input: emit '#' for the offending byte */
			ib++, ibl--;
			*ob++ = '#', obl--;
			ret = 2;
			/* Reset the conversion state before resuming */
			iconv(cd1, 0, 0, 0, 0);
		}
	}

	if (cd2 == (iconv_t)(-1)) {
		/* The target encoding was UTF-8 */
		used = (size_t)(ob - utfbuf);
		if (tolen)
			*tolen = used;
		if (!to) {
			free(utfbuf);
			iconv_close(cd1);
			return ret;
		}
		/* Resize to the exact length plus room for the NUL */
		newbuf = (char *)safe_realloc_add_2op_(utfbuf, used, /*+*/1);
		if (!newbuf)
			goto fail;
		newbuf[used] = '\0';
		*to = newbuf;
		iconv_close(cd1);
		return ret;
	}

	/*
	 * Truncate the buffer to be tidy.  This is purely cosmetic, so a
	 * failed shrink is ignored and the larger buffer is kept.  A zero
	 * length is never passed to realloc(): that may free the buffer and
	 * return a null pointer, which would lead to a double free.
	 */
	utflen = (size_t)(ob - utfbuf);
	if (utflen > 0) {
		newbuf = (char *)realloc(utfbuf, utflen);
		if (newbuf)
			utfbuf = newbuf;
	}

	/* Convert from UTF-8 to discover how long the output is */
	outlen = 0;
	ib = utfbuf;
	ibl = utflen;
	while (ibl) {
		ob = tbuf;
		obl = sizeof(tbuf);
		k = iconv(cd2, &ib, &ibl, &ob, &obl);
		assert((k != (size_t)(-1) && !ibl) ||
		       (k == (size_t)(-1) && errno == E2BIG && ibl) ||
		       (k == (size_t)(-1) && errno == EILSEQ && ibl));
		if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
			/* Replace one character */
			char *tb = "?";
			size_t tbl = 1;

			outlen += ob - tbuf;
			ob = tbuf;
			obl = sizeof(tbuf);
			k = iconv(cd2, &tb, &tbl, &ob, &obl);
			assert((!k && !tbl) ||
			       (k == (size_t)(-1) && errno == EILSEQ && tbl));
			skip_utf8_char_(&ib, &ibl);
		}
		outlen += ob - tbuf;
	}
	/* Flush any pending shift sequence and count it as well */
	ob = tbuf;
	obl = sizeof(tbuf);
	k = iconv(cd2, 0, 0, &ob, &obl);
	assert(!k);
	outlen += ob - tbuf;

	/* Convert from UTF-8 for real */
	outbuf = (char *)safe_malloc_add_2op_(outlen, /*+*/1);
	if (!outbuf)
		goto fail;
	ib = utfbuf;
	ibl = utflen;
	ob = outbuf;
	obl = outlen;
	while (ibl) {
		k = iconv(cd2, &ib, &ibl, &ob, &obl);
		assert((k != (size_t)(-1) && !ibl) ||
		       (k == (size_t)(-1) && errno == EILSEQ && ibl));
		/* Any non-zero result (error or inexact count) means inexact */
		if (k && !ret)
			ret = 1;
		if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
			/* Replace one character */
			char *tb = "?";
			size_t tbl = 1;

			k = iconv(cd2, &tb, &tbl, &ob, &obl);
			assert((!k && !tbl) ||
			       (k == (size_t)(-1) && errno == EILSEQ && tbl));
			skip_utf8_char_(&ib, &ibl);
		}
	}
	k = iconv(cd2, 0, 0, &ob, &obl);
	assert(!k);
	/* The measuring pass and this pass must agree exactly */
	assert(!obl);
	*ob = '\0';

	free(utfbuf);
	iconv_close(cd1);
	iconv_close(cd2);
	if (tolen)
		*tolen = outlen;
	if (!to) {
		free(outbuf);
		return ret;
	}
	*to = outbuf;
	return ret;

 fail:
	free(utfbuf); /* free(NULL) is a no-op */
	iconv_close(cd1);
	if (cd2 != (iconv_t)(-1))
		iconv_close(cd2);
	return -2;
}

#endif /* HAVE_ICONV */