1/* Copyright 2008,2009 Alain Knaff. 2 * This file is part of mtools. 3 * 4 * Mtools is free software: you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation, either version 3 of the License, or 7 * (at your option) any later version. 8 * 9 * Mtools is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with Mtools. If not, see <http://www.gnu.org/licenses/>. 16 * 17 * Various character set conversions used by mtools 18 */ 19#include "sysincludes.h" 20#include "msdos.h" 21#include "mtools.h" 22 23#include <stdio.h> 24#include <errno.h> 25#include <stdlib.h> 26#include "file_name.h" 27 28 29#ifdef HAVE_ICONV_H 30#include <iconv.h> 31 32struct doscp_t { 33 iconv_t from; 34 iconv_t to; 35}; 36 37static char *wcharCp=NULL; 38 39static char* wcharTries[] = { 40 "WCHAR_T", 41 "UTF-32BE", "UTF-32LE", 42 "UTF-16BE", "UTF-16LE", 43 "UTF-32", "UTF-16", 44 "UCS-4BE", "UCS-4LE", 45 "UCS-2BE", "UCS-2LE", 46 "UCS-4", "UCS-2" 47}; 48 49static wchar_t *testString = L"ab"; 50 51static int try(char *testCp) { 52 size_t res; 53 char *inbuf = (char *)testString; 54 size_t inbufLen = 2*sizeof(wchar_t); 55 char outbuf[3]; 56 char *outbufP = outbuf; 57 size_t outbufLen = 2*sizeof(char); 58 iconv_t test = iconv_open("ASCII", testCp); 59 60 if(test == (iconv_t) -1) 61 goto fail0; 62 res = iconv(test, 63 &inbuf, &inbufLen, 64 &outbufP, &outbufLen); 65 if(res != 0 || outbufLen != 0 || inbufLen != 0) 66 goto fail; 67 if(memcmp(outbuf, "ab", 2)) 68 goto fail; 69 /* fprintf(stderr, "%s ok\n", testCp); */ 70 return 1; 71 fail: 72 iconv_close(test); 73 fail0: 74 /*fprintf(stderr, "%s fail\n", testCp);*/ 75 return 0; 76} 77 78static const char *getWcharCp() { 79 int i; 80 if(wcharCp != NULL) 81 return wcharCp; 82 for(i=0; i< sizeof(wcharTries) / sizeof(wcharTries[0]); i++) { 83 if(try(wcharTries[i])) 84 return (wcharCp=wcharTries[i]); 85 } 86 fprintf(stderr, "No codepage found for wchar_t\n"); 87 return NULL; 88} 89 90 91doscp_t *cp_open(int codepage) 92{ 93 char dosCp[17]; 94 doscp_t *ret; 95 iconv_t *from; 96 iconv_t *to; 97 98 if(codepage == 0) 99 codepage = mtools_default_codepage; 100 if(codepage < 0 || codepage > 9999) { 101 fprintf(stderr, "Bad codepage %d\n", codepage); 102 return NULL; 103 } 104 105 if(getWcharCp() == NULL) 106 return NULL; 107 108 sprintf(dosCp, "CP%d", codepage); 109 from = iconv_open(wcharCp, dosCp); 110 if(from == (iconv_t)-1) { 111 fprintf(stderr, "Error converting to codepage %d %s\n", 112 codepage, strerror(errno)); 113 return NULL; 114 } 115 116 sprintf(dosCp, "CP%d//TRANSLIT", codepage); 117 to = iconv_open(dosCp, wcharCp); 118 if(to == (iconv_t)-1) { 119 /* Transliteration not supported? */ 120 sprintf(dosCp, "CP%d", codepage); 121 to = iconv_open(dosCp, wcharCp); 122 } 123 if(to == (iconv_t)-1) { 124 iconv_close(from); 125 fprintf(stderr, "Error converting to codepage %d %s\n", 126 codepage, strerror(errno)); 127 return NULL; 128 } 129 130 ret = New(doscp_t); 131 if(ret == NULL) 132 return ret; 133 ret->from = from; 134 ret->to = to; 135 return ret; 136} 137 138void cp_close(doscp_t *cp) 139{ 140 iconv_close(cp->to); 141 iconv_close(cp->from); 142 free(cp); 143} 144 145int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len) 146{ 147 int r; 148 size_t in_len=len; 149 size_t out_len=len*sizeof(wchar_t); 150 wchar_t *dptr=wchar; 151 r=iconv(cp->from, &dos, &in_len, (char **)&dptr, &out_len); 152 if(r < 0) 153 return r; 154 *dptr = L'\0'; 155 return dptr-wchar; 156} 157 158/** 159 * Converts len wide character to destination. Caller's responsibility to 160 * ensure that dest is large enough. 161 * mangled will be set if there has been an untranslatable character. 162 */ 163static int safe_iconv(iconv_t conv, const wchar_t *wchar, char *dest, 164 size_t len, int *mangled) 165{ 166 int r; 167 int i; 168 size_t in_len=len*sizeof(wchar_t); 169 size_t out_len=len*4; 170 char *dptr = dest; 171 172 while(in_len > 0) { 173 r=iconv(conv, (char**)&wchar, &in_len, &dptr, &out_len); 174 if(r >= 0 || errno != EILSEQ) { 175 /* everything transformed, or error that is _not_ a bad 176 * character */ 177 break; 178 } 179 *mangled |= 1; 180 181 if(dptr) 182 *dptr++ = '_'; 183 in_len--; 184 185 wchar++; 186 out_len--; 187 } 188 189 len = dptr-dest; /* how many dest characters have there been 190 generated */ 191 192 /* eliminate question marks which might have been formed by 193 untransliterable characters */ 194 for(i=0; i<len; i++) { 195 if(dest[i] == '?') { 196 dest[i] = '_'; 197 *mangled |= 1; 198 } 199 } 200 return len; 201} 202 203void wchar_to_dos(doscp_t *cp, 204 wchar_t *wchar, char *dos, size_t len, int *mangled) 205{ 206 safe_iconv(cp->to, wchar, dos, len, mangled); 207} 208 209#else 210 211#include "codepage.h" 212 213struct doscp_t { 214 unsigned char *from_dos; 215 unsigned char to_dos[0x80]; 216}; 217 218doscp_t *cp_open(int codepage) 219{ 220 doscp_t *ret; 221 int i; 222 Codepage_t *cp; 223 224 if(codepage == 0) 225 codepage = 850; 226 227 ret = New(doscp_t); 228 if(ret == NULL) 229 return ret; 230 231 for(cp=codepages; cp->nr ; cp++) 232 if(cp->nr == codepage) { 233 ret->from_dos = cp->tounix; 234 break; 235 } 236 237 if(ret->from_dos == NULL) { 238 fprintf(stderr, "Bad codepage %d\n", codepage); 239 free(ret); 240 return NULL; 241 } 242 243 for(i=0; i<0x80; i++) { 244 char native = ret->from_dos[i]; 245 if(! (native & 0x80)) 246 continue; 247 ret->to_dos[native & 0x7f] = 0x80 | i; 248 } 249 return ret; 250} 251 252void cp_close(doscp_t *cp) 253{ 254 free(cp); 255} 256 257int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len) 258{ 259 int i; 260 261 for(i=0; i<len && dos[i]; i++) { 262 char c = dos[i]; 263 if(c >= ' ' && c <= '~') 264 wchar[i] = c; 265 else { 266 wchar[i] = cp->from_dos[c & 0x7f]; 267 } 268 } 269 wchar[i] = '\0'; 270 return i; 271} 272 273 274void wchar_to_dos(doscp_t *cp, 275 wchar_t *wchar, char *dos, size_t len, int *mangled) 276{ 277 int i; 278 for(i=0; i<len && wchar[i]; i++) { 279 char c = wchar[i]; 280 if(c >= ' ' && c <= '~') 281 dos[i] = c; 282 else { 283 dos[i] = cp->to_dos[c & 0x7f]; 284 if(dos[i] == '\0') { 285 dos[i]='_'; 286 *mangled=1; 287 } 288 } 289 } 290} 291 292#endif 293 294 295#ifndef HAVE_WCHAR_H 296 297typedef int mbstate_t; 298 299static inline size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps) 300{ 301 *s = wc; 302 return 1; 303} 304 305static inline size_t mbrtowc(wchar_t *pwc, const char *s, 306 size_t n, mbstate_t *ps) 307{ 308 *pwc = *s; 309 return 1; 310} 311 312#endif 313 314#ifdef HAVE_ICONV_H 315 316#include <langinfo.h> 317 318static iconv_t to_native = NULL; 319 320static void initialize_to_native(void) 321{ 322 char *li, *cp; 323 int len; 324 if(to_native != NULL) 325 return; 326 li = nl_langinfo(CODESET); 327 len = strlen(li) + 11; 328 if(getWcharCp() == NULL) 329 exit(1); 330 cp = safe_malloc(len); 331 strcpy(cp, li); 332 strcat(cp, "//TRANSLIT"); 333 to_native = iconv_open(cp, wcharCp); 334 if(to_native == (iconv_t) -1) 335 to_native = iconv_open(li, wcharCp); 336 if(to_native == (iconv_t) -1) 337 fprintf(stderr, "Could not allocate iconv for %s\n", cp); 338 free(cp); 339 if(to_native == (iconv_t) -1) 340 exit(1); 341} 342 343 344 345#endif 346 347 348/** 349 * Convert wchar string to native, converting at most len wchar characters 350 * Returns number of generated native characters 351 */ 352int wchar_to_native(const wchar_t *wchar, char *native, size_t len) 353{ 354#ifdef HAVE_ICONV_H 355 int mangled; 356 int r; 357 initialize_to_native(); 358 len = wcsnlen(wchar,len); 359 r=safe_iconv(to_native, wchar, native, len, &mangled); 360 native[r]='\0'; 361 return r; 362#else 363 int i; 364 char *dptr = native; 365 mbstate_t ps; 366 memset(&ps, 0, sizeof(ps)); 367 for(i=0; i<len && wchar[i] != 0; i++) { 368 int r = wcrtomb(dptr, wchar[i], &ps); 369 if(r < 0 && errno == EILSEQ) { 370 r=1; 371 *dptr='_'; 372 } 373 if(r < 0) 374 return r; 375 dptr+=r; 376 } 377 *dptr='\0'; 378 return dptr-native; 379#endif 380} 381 382/** 383 * Convert native string to wchar string, converting at most len wchar 384 * characters. If end is supplied, stop conversion when source pointer 385 * exceeds end. Returns number of converted wchars 386 */ 387int native_to_wchar(const char *native, wchar_t *wchar, size_t len, 388 const char *end, int *mangled) 389{ 390 mbstate_t ps; 391 int i; 392 memset(&ps, 0, sizeof(ps)); 393 394 for(i=0; i<len && (native < end || !end); i++) { 395 int r = mbrtowc(wchar+i, native, len, &ps); 396 if(r < 0) { 397 /* Unconvertible character. Just pretend it's Latin1 398 encoded (if valid Latin1 character) or substitue 399 with an underscore if not 400 */ 401 char c = *native; 402 if(c >= '\xa0' && c < '\xff') 403 wchar[i] = c & 0xff; 404 else 405 wchar[i] = '_'; 406 memset(&ps, 0, sizeof(ps)); 407 r=1; 408 } 409 if(r == 0) 410 break; 411 native += r; 412 } 413 if(mangled && end && native < end) 414 *mangled |= 3; 415 wchar[i]='\0'; 416 return i; 417} 418 419