1/* 2 * Copyright (C) 2001 Peter Harris <peter.harris@hummingbird.com> 3 * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org> 4 * 5 * Buffer overflow checking added: Josh Coalson, 9/9/2007 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with this program; if not, write to the Free Software 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 20 */ 21 22/* 23 * Convert a string between UTF-8 and the locale's charset. 24 */ 25 26#if HAVE_CONFIG_H 27# include <config.h> 28#endif 29 30#include <stdlib.h> 31#include <string.h> 32 33#include "share/alloc.h" 34#include "utf8.h" 35#include "charset.h" 36 37 38#ifdef _WIN32 39 40 /* Thanks to Peter Harris <peter.harris@hummingbird.com> for this win32 41 * code. 42 */ 43 44#include <stdio.h> 45#include <windows.h> 46 47static unsigned char *make_utf8_string(const wchar_t *unicode) 48{ 49 size_t size = 0, n; 50 int index = 0, out_index = 0; 51 unsigned char *out; 52 unsigned short c; 53 54 /* first calculate the size of the target string */ 55 c = unicode[index++]; 56 while(c) { 57 if(c < 0x0080) { 58 n = 1; 59 } else if(c < 0x0800) { 60 n = 2; 61 } else { 62 n = 3; 63 } 64 if(size+n < size) /* overflow check */ 65 return NULL; 66 size += n; 67 c = unicode[index++]; 68 } 69 70 out = safe_malloc_add_2op_(size, /*+*/1); 71 if (out == NULL) 72 return NULL; 73 index = 0; 74 75 c = unicode[index++]; 76 while(c) 77 { 78 if(c < 0x080) { 79 out[out_index++] = (unsigned char)c; 80 } else if(c < 0x800) { 81 out[out_index++] = 0xc0 | (c >> 6); 82 out[out_index++] = 0x80 | (c & 0x3f); 83 } else { 84 out[out_index++] = 0xe0 | (c >> 12); 85 out[out_index++] = 0x80 | ((c >> 6) & 0x3f); 86 out[out_index++] = 0x80 | (c & 0x3f); 87 } 88 c = unicode[index++]; 89 } 90 out[out_index] = 0x00; 91 92 return out; 93} 94 95static wchar_t *make_unicode_string(const unsigned char *utf8) 96{ 97 size_t size = 0; 98 int index = 0, out_index = 0; 99 wchar_t *out; 100 unsigned char c; 101 102 /* first calculate the size of the target string */ 103 c = utf8[index++]; 104 while(c) { 105 if((c & 0x80) == 0) { 106 index += 0; 107 } else if((c & 0xe0) == 0xe0) { 108 index += 2; 109 } else { 110 index += 1; 111 } 112 if(size + 1 == 0) /* overflow check */ 113 return NULL; 114 size++; 115 c = utf8[index++]; 116 } 117 118 if(size + 1 == 0) /* overflow check */ 119 return NULL; 120 out = safe_malloc_mul_2op_(size+1, /*times*/sizeof(wchar_t)); 121 if (out == NULL) 122 return NULL; 123 index = 0; 124 125 c = utf8[index++]; 126 while(c) 127 { 128 if((c & 0x80) == 0) { 129 out[out_index++] = c; 130 } else if((c & 0xe0) == 0xe0) { 131 out[out_index] = (c & 0x1F) << 12; 132 c = utf8[index++]; 133 out[out_index] |= (c & 0x3F) << 6; 134 c = utf8[index++]; 135 out[out_index++] |= (c & 0x3F); 136 } else { 137 out[out_index] = (c & 0x3F) << 6; 138 c = utf8[index++]; 139 out[out_index++] |= (c & 0x3F); 140 } 141 c = utf8[index++]; 142 } 143 out[out_index] = 0; 144 145 return out; 146} 147 148int utf8_encode(const char *from, char **to) 149{ 150 wchar_t *unicode; 151 int wchars, err; 152 153 wchars = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from, 154 strlen(from), NULL, 0); 155 156 if(wchars == 0) 157 { 158 fprintf(stderr, "Unicode translation error %d\n", GetLastError()); 159 return -1; 160 } 161 162 if(wchars < 0) /* underflow check */ 163 return -1; 164 165 unicode = safe_calloc_((size_t)wchars + 1, sizeof(unsigned short)); 166 if(unicode == NULL) 167 { 168 fprintf(stderr, "Out of memory processing string to UTF8\n"); 169 return -1; 170 } 171 172 err = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from, 173 strlen(from), unicode, wchars); 174 if(err != wchars) 175 { 176 free(unicode); 177 fprintf(stderr, "Unicode translation error %d\n", GetLastError()); 178 return -1; 179 } 180 181 /* On NT-based windows systems, we could use WideCharToMultiByte(), but 182 * MS doesn't actually have a consistent API across win32. 183 */ 184 *to = make_utf8_string(unicode); 185 186 free(unicode); 187 return 0; 188} 189 190int utf8_decode(const char *from, char **to) 191{ 192 wchar_t *unicode; 193 int chars, err; 194 195 /* On NT-based windows systems, we could use MultiByteToWideChar(CP_UTF8), but 196 * MS doesn't actually have a consistent API across win32. 197 */ 198 unicode = make_unicode_string(from); 199 if(unicode == NULL) 200 { 201 fprintf(stderr, "Out of memory processing string from UTF8 to UNICODE16\n"); 202 return -1; 203 } 204 205 chars = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode, 206 -1, NULL, 0, NULL, NULL); 207 208 if(chars < 0) /* underflow check */ 209 return -1; 210 211 if(chars == 0) 212 { 213 fprintf(stderr, "Unicode translation error %d\n", GetLastError()); 214 free(unicode); 215 return -1; 216 } 217 218 *to = safe_calloc_((size_t)chars + 1, sizeof(unsigned char)); 219 if(*to == NULL) 220 { 221 fprintf(stderr, "Out of memory processing string to local charset\n"); 222 free(unicode); 223 return -1; 224 } 225 226 err = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode, 227 -1, *to, chars, NULL, NULL); 228 if(err != chars) 229 { 230 fprintf(stderr, "Unicode translation error %d\n", GetLastError()); 231 free(unicode); 232 free(*to); 233 *to = NULL; 234 return -1; 235 } 236 237 free(unicode); 238 return 0; 239} 240 241#else /* End win32. Rest is for real operating systems */ 242 243 244#ifdef HAVE_LANGINFO_CODESET 245#include <langinfo.h> 246#endif 247 248#include "iconvert.h" 249 250static const char *current_charset(void) 251{ 252 const char *c = 0; 253#ifdef HAVE_LANGINFO_CODESET 254 c = nl_langinfo(CODESET); 255#endif 256 257 if (!c) 258 c = getenv("CHARSET"); 259 260 return c? c : "US-ASCII"; 261} 262 263static int convert_buffer(const char *fromcode, const char *tocode, 264 const char *from, size_t fromlen, 265 char **to, size_t *tolen) 266{ 267 int ret = -1; 268 269#ifdef HAVE_ICONV 270 ret = iconvert(fromcode, tocode, from, fromlen, to, tolen); 271 if (ret != -1) 272 return ret; 273#endif 274 275#ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */ 276 ret = charset_convert(fromcode, tocode, from, fromlen, to, tolen); 277 if (ret != -1) 278 return ret; 279#endif 280 281 return ret; 282} 283 284static int convert_string(const char *fromcode, const char *tocode, 285 const char *from, char **to, char replace) 286{ 287 int ret; 288 size_t fromlen; 289 char *s; 290 291 fromlen = strlen(from); 292 ret = convert_buffer(fromcode, tocode, from, fromlen, to, 0); 293 if (ret == -2) 294 return -1; 295 if (ret != -1) 296 return ret; 297 298 s = safe_malloc_add_2op_(fromlen, /*+*/1); 299 if (!s) 300 return -1; 301 strcpy(s, from); 302 *to = s; 303 for (; *s; s++) 304 if (*s & ~0x7f) 305 *s = replace; 306 return 3; 307} 308 309int utf8_encode(const char *from, char **to) 310{ 311 return convert_string(current_charset(), "UTF-8", from, to, '#'); 312} 313 314int utf8_decode(const char *from, char **to) 315{ 316 return convert_string("UTF-8", current_charset(), from, to, '?'); 317} 318 319#endif 320