1/*  Copyright 2008,2009 Alain Knaff.
2 *  This file is part of mtools.
3 *
4 *  Mtools is free software: you can redistribute it and/or modify
5 *  it under the terms of the GNU General Public License as published by
6 *  the Free Software Foundation, either version 3 of the License, or
7 *  (at your option) any later version.
8 *
9 *  Mtools is distributed in the hope that it will be useful,
10 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 *  GNU General Public License for more details.
13 *
14 *  You should have received a copy of the GNU General Public License
15 *  along with Mtools.  If not, see <http://www.gnu.org/licenses/>.
16 *
17 * Various character set conversions used by mtools
18 */
19#include "sysincludes.h"
20#include "msdos.h"
21#include "mtools.h"
22
23#include <stdio.h>
24#include <errno.h>
25#include <stdlib.h>
26#include "file_name.h"
27
28
29#ifdef HAVE_ICONV_H
30#include <iconv.h>
31
/*
 * Conversion state for one DOS codepage (iconv variant): one iconv
 * descriptor per direction, both opened by cp_open() and released by
 * cp_close().
 */
struct doscp_t {
	iconv_t from;	/* DOS codepage -> wchar_t, used by dos_to_wchar() */
	iconv_t to;	/* wchar_t -> DOS codepage, used by wchar_to_dos() */
};
36
/*
 * iconv name of the encoding that matches this platform's wchar_t.
 * Stays NULL until getWcharCp() has found a working candidate.
 */
static char *wcharCp = NULL;

/*
 * Candidate iconv encoding names for wchar_t, probed in this order by
 * getWcharCp(); the first one that passes try() wins.
 */
static char *wcharTries[] = {
	"WCHAR_T",
	"UTF-32BE",
	"UTF-32LE",
	"UTF-16BE",
	"UTF-16LE",
	"UTF-32",
	"UTF-16",
	"UCS-4BE",
	"UCS-4LE",
	"UCS-2BE",
	"UCS-2LE",
	"UCS-4",
	"UCS-2"
};
48
/* Two-character probe string converted by try() */
static wchar_t *testString = L"ab";

/*
 * Checks whether the iconv encoding named testCp describes this
 * platform's wchar_t: the wide string L"ab" must convert to exactly the
 * two ASCII bytes "ab", consuming all input and filling the 2-byte
 * output window completely.
 *
 * Returns 1 if the encoding matches, 0 otherwise (including when iconv
 * does not know the encoding). The temporary descriptor is closed on
 * every path.
 */
static int try(const char *testCp) {
	size_t res;
	char *inbuf = (char *)testString;
	size_t inbufLen = 2*sizeof(wchar_t);
	char outbuf[3];
	char *outbufP = outbuf;
	size_t outbufLen = 2*sizeof(char);
	int ok;
	iconv_t test = iconv_open("ASCII", testCp);

	if(test == (iconv_t) -1)
		return 0;

	res = iconv(test,
		    &inbuf, &inbufLen,
		    &outbufP, &outbufLen);
	/* A wrongly sized or wrongly ordered candidate either fails,
	 * leaves input/output space over, or yields different bytes */
	ok = (res == 0 && outbufLen == 0 && inbufLen == 0 &&
	      memcmp(outbuf, "ab", 2) == 0);

	iconv_close(test);
	return ok;
}
77
78static const char *getWcharCp() {
79	int i;
80	if(wcharCp != NULL)
81		return wcharCp;
82	for(i=0; i< sizeof(wcharTries) / sizeof(wcharTries[0]); i++) {
83		if(try(wcharTries[i]))
84			return (wcharCp=wcharTries[i]);
85	}
86	fprintf(stderr, "No codepage found for wchar_t\n");
87	return NULL;
88}
89
90
91doscp_t *cp_open(int codepage)
92{
93	char dosCp[17];
94	doscp_t *ret;
95	iconv_t *from;
96	iconv_t *to;
97
98	if(codepage == 0)
99		codepage = mtools_default_codepage;
100	if(codepage < 0 || codepage > 9999) {
101		fprintf(stderr, "Bad codepage %d\n", codepage);
102		return NULL;
103	}
104
105	if(getWcharCp() == NULL)
106		return NULL;
107
108	sprintf(dosCp, "CP%d", codepage);
109	from = iconv_open(wcharCp, dosCp);
110	if(from == (iconv_t)-1) {
111		fprintf(stderr, "Error converting to codepage %d %s\n",
112			codepage, strerror(errno));
113		return NULL;
114	}
115
116	sprintf(dosCp, "CP%d//TRANSLIT", codepage);
117	to   =  iconv_open(dosCp, wcharCp);
118	if(to == (iconv_t)-1) {
119		/* Transliteration not supported? */
120		sprintf(dosCp, "CP%d", codepage);
121		to   =  iconv_open(dosCp, wcharCp);
122	}
123	if(to == (iconv_t)-1) {
124		iconv_close(from);
125		fprintf(stderr, "Error converting to codepage %d %s\n",
126			codepage, strerror(errno));
127		return NULL;
128	}
129
130	ret = New(doscp_t);
131	if(ret == NULL)
132		return ret;
133	ret->from = from;
134	ret->to   = to;
135	return ret;
136}
137
138void cp_close(doscp_t *cp)
139{
140	iconv_close(cp->to);
141	iconv_close(cp->from);
142	free(cp);
143}
144
145int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len)
146{
147	int r;
148	size_t in_len=len;
149	size_t out_len=len*sizeof(wchar_t);
150	wchar_t *dptr=wchar;
151	r=iconv(cp->from, &dos, &in_len, (char **)&dptr, &out_len);
152	if(r < 0)
153		return r;
154	*dptr = L'\0';
155	return dptr-wchar;
156}
157
/**
 * Converts len wide characters from wchar into dest using conv.
 *
 * Every wide character that iconv() rejects with EILSEQ is replaced by a
 * single '_' and skipped as a whole; any other iconv() error stops the
 * conversion. Afterwards every '?' in the output (which may have been
 * produced for untransliterable characters) is turned into '_' as well.
 *
 * dest is the caller's responsibility: it must have room for len*4
 * bytes. No terminating NUL is written.
 *
 * mangled, if not NULL, gets bit 0 set whenever a substitution took
 * place; it is never cleared here.
 *
 * Returns the number of bytes stored in dest.
 */
static int safe_iconv(iconv_t conv, const wchar_t *wchar, char *dest,
		      size_t len, int *mangled)
{
	/* iconv() takes a non-const char ** although it only reads the
	 * input, hence the cast */
	char *in = (char *) wchar;
	size_t in_len = len * sizeof(wchar_t);
	size_t out_len = len * 4;
	char *dptr = dest;
	size_t produced;
	size_t i;

	while(in_len > 0) {
		size_t r = iconv(conv, &in, &in_len, &dptr, &out_len);
		if(r != (size_t) -1 || errno != EILSEQ) {
			/* everything transformed, or error that is _not_ a bad
			 * character */
			break;
		}
		if(mangled)
			*mangled |= 1;

		/* no room for the substitute, or no complete character
		 * left to skip */
		if(out_len == 0 || in_len < sizeof(wchar_t))
			break;

		if(dptr)
			*dptr++ = '_';
		out_len--;

		/* in_len counts bytes: step over the whole wide character */
		in += sizeof(wchar_t);
		in_len -= sizeof(wchar_t);
	}

	/* how many dest characters have been generated */
	produced = (size_t) (dptr - dest);

	/* eliminate question marks which might have been formed by
	   untransliterable characters */
	for(i = 0; i < produced; i++) {
		if(dest[i] == '?') {
			dest[i] = '_';
			if(mangled)
				*mangled |= 1;
		}
	}
	return (int) produced;
}
202
203void wchar_to_dos(doscp_t *cp,
204		  wchar_t *wchar, char *dos, size_t len, int *mangled)
205{
206	safe_iconv(cp->to, wchar, dos, len, mangled);
207}
208
209#else
210
211#include "codepage.h"
212
/*
 * Conversion state for one DOS codepage (table variant, used when iconv
 * is not available). Both tables are indexed with the low 7 bits of a
 * character.
 */
struct doscp_t {
	/* DOS -> native table; points at the tounix table of the matching
	 * codepages[] entry (set by cp_open()) */
	unsigned char *from_dos;
	/* native -> DOS table, filled in by cp_open() from from_dos;
	 * 0 marks "no DOS equivalent" */
	unsigned char to_dos[0x80];
};
217
218doscp_t *cp_open(int codepage)
219{
220	doscp_t *ret;
221	int i;
222	Codepage_t *cp;
223
224	if(codepage == 0)
225		codepage = 850;
226
227	ret = New(doscp_t);
228	if(ret == NULL)
229		return ret;
230
231	for(cp=codepages; cp->nr ; cp++)
232		if(cp->nr == codepage) {
233			ret->from_dos = cp->tounix;
234			break;
235		}
236
237	if(ret->from_dos == NULL) {
238		fprintf(stderr, "Bad codepage %d\n", codepage);
239		free(ret);
240		return NULL;
241	}
242
243	for(i=0; i<0x80; i++) {
244		char native = ret->from_dos[i];
245		if(! (native & 0x80))
246			continue;
247		ret->to_dos[native & 0x7f] = 0x80 | i;
248	}
249	return ret;
250}
251
252void cp_close(doscp_t *cp)
253{
254	free(cp);
255}
256
257int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len)
258{
259	int i;
260
261	for(i=0; i<len && dos[i]; i++) {
262		char c = dos[i];
263		if(c >= ' ' && c <= '~')
264			wchar[i] = c;
265		else {
266			wchar[i] = cp->from_dos[c & 0x7f];
267		}
268	}
269	wchar[i] = '\0';
270	return i;
271}
272
273
274void wchar_to_dos(doscp_t *cp,
275		  wchar_t *wchar, char *dos, size_t len, int *mangled)
276{
277	int i;
278	for(i=0; i<len && wchar[i]; i++) {
279		char c = wchar[i];
280		if(c >= ' ' && c <= '~')
281			dos[i] = c;
282		else {
283			dos[i] = cp->to_dos[c & 0x7f];
284			if(dos[i] == '\0') {
285				dos[i]='_';
286				*mangled=1;
287			}
288		}
289	}
290}
291
292#endif
293
294
295#ifndef HAVE_WCHAR_H
296
/* Minimal stand-ins for the <wchar.h> conversion API: every character is
 * exactly one byte, and there is no conversion state to keep. */
typedef int mbstate_t;

/*
 * Stores wc as the single byte *s and reports one byte written.
 * ps is ignored.
 */
static inline size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps)
{
	(void) ps;
	*s = (char) wc;
	return 1;
}

/*
 * Widens the single byte *s into *pwc. The byte is read as unsigned so
 * that values above 0x7f do not turn into negative or sign-extended
 * wide characters where char is signed.
 *
 * Returns 0 for the NUL byte and 1 for any other byte, matching the
 * standard mbrtowc() convention that native_to_wchar() relies on to
 * find the end of the string. n and ps are ignored.
 */
static inline size_t mbrtowc(wchar_t *pwc, const char *s,
			     size_t n, mbstate_t *ps)
{
	(void) n;
	(void) ps;
	*pwc = (unsigned char) *s;
	return *s != '\0';
}
311
312#endif
313
314#ifdef HAVE_ICONV_H
315
316#include <langinfo.h>
317
318static iconv_t to_native = NULL;
319
320static void initialize_to_native(void)
321{
322	char *li, *cp;
323	int len;
324	if(to_native != NULL)
325		return;
326	li = nl_langinfo(CODESET);
327	len = strlen(li) + 11;
328	if(getWcharCp() == NULL)
329		exit(1);
330	cp = safe_malloc(len);
331	strcpy(cp, li);
332	strcat(cp, "//TRANSLIT");
333	to_native = iconv_open(cp, wcharCp);
334	if(to_native == (iconv_t) -1)
335		to_native = iconv_open(li, wcharCp);
336	if(to_native == (iconv_t) -1)
337		fprintf(stderr, "Could not allocate iconv for %s\n", cp);
338	free(cp);
339	if(to_native == (iconv_t) -1)
340		exit(1);
341}
342
343
344
345#endif
346
347
/**
 * Convert wchar string to native, converting at most len wchar
 * characters and stopping earlier at a terminating L'\0'.
 *
 * Characters that cannot be represented are replaced by '_'. The result
 * in native is always NUL terminated on success. The size of native is
 * not passed in, so the caller must provide a buffer large enough for
 * the converted text plus the terminator.
 *
 * Returns the number of generated native characters (not counting the
 * terminator). Without iconv, returns -1 if wcrtomb() fails for a
 * reason other than an unrepresentable character.
 */
int wchar_to_native(const wchar_t *wchar, char *native, size_t len)
{
#ifdef HAVE_ICONV_H
	/* safe_iconv() ORs into this flag, so it must start out defined */
	int mangled = 0;
	int r;
	initialize_to_native();
	len = wcsnlen(wchar,len);
	r=safe_iconv(to_native, wchar, native, len, &mangled);
	native[r]='\0';
	return r;
#else
	size_t i;
	char *dptr = native;
	mbstate_t ps;
	memset(&ps, 0, sizeof(ps));
	for(i=0; i<len && wchar[i] != 0; i++) {
		size_t r = wcrtomb(dptr, wchar[i], &ps);
		/* wcrtomb() signals failure with (size_t)-1 */
		if(r == (size_t) -1) {
			if(errno != EILSEQ)
				return -1;
			/* unrepresentable character: substitute it and
			 * start over from a clean conversion state */
			*dptr='_';
			r=1;
			memset(&ps, 0, sizeof(ps));
		}
		dptr+=r;
	}
	*dptr='\0';
	return dptr-native;
#endif
}
381
/**
 * Convert native string to wchar string, converting at most len wchar
 * characters. If end is supplied, stop conversion when the source
 * pointer reaches end, and never let a character extend beyond it;
 * otherwise stop at the terminating NUL of native.
 *
 * Bytes that do not form a valid character in the current locale are
 * taken as Latin1 if they lie in 0xa0..0xfe, and replaced by '_'
 * otherwise.
 *
 * wchar is always terminated, so it needs room for len + 1 elements.
 * If mangled and end are both supplied and input is left over because
 * len was reached first, *mangled gets the bits of value 3 set.
 *
 * Returns number of converted wchars.
 */
int native_to_wchar(const char *native, wchar_t *wchar, size_t len,
		    const char *end, int *mangled)
{
	mbstate_t ps;
	size_t i;
	memset(&ps, 0, sizeof(ps));

	for(i=0; i<len && (!end || native < end); i++) {
		/* Byte budget for one character: what is left before end,
		 * or the longest character of the locale when the input is
		 * NUL terminated. (len counts wide characters and says
		 * nothing about the bytes available.) */
		size_t avail = end ? (size_t) (end - native)
				   : (size_t) MB_CUR_MAX;
		size_t r = mbrtowc(wchar+i, native, avail, &ps);

		if(r == (size_t) -1 || r == (size_t) -2) {
			/* Unconvertible character. Just pretend it's Latin1
			   encoded (if valid Latin1 character) or substitute
			   with an underscore if not
			*/
			unsigned char c = (unsigned char) *native;
			if(c >= 0xa0 && c < 0xff)
				wchar[i] = c;
			else
				wchar[i] = '_';
			memset(&ps, 0, sizeof(ps));
			r=1;
		}
		/* mbrtowc() returns 0 for the terminating NUL */
		if(r == 0)
			break;
		native += r;
	}
	if(mangled && end && native < end)
		*mangled |= 3;
	wchar[i]='\0';
	return (int) i;
}
418
419