1/*
2 * Copyright 2003-2008, Haiku, Inc. All Rights Reserved.
3 * Distributed under the terms of the MIT License.
4 *
5 * Authors:
6 *		Andrew Bachmann
7 */
8
9
10#include <CharacterSet.h>
11#include <CharacterSetRoster.h>
12#include <UTF8.h>
13
14#include <errno.h>
15#include <iconv.h>
16#include <stdio.h>
17
18
// Uncomment to enable the debug output of this file.
//#define DEBUG_CONV 1

// DEBPRINT takes a parenthesized printf() argument list, e.g.
//     DEBPRINT(("value: %d\n", value));
// Both variants expand to exactly one statement without a trailing
// semicolon, so the macro can be used safely as the body of an unbraced
// if/else.
#ifdef DEBUG_CONV
#	define DEBPRINT(ARGS) do { printf ARGS; } while (0)
#else
#	define DEBPRINT(ARGS) do { } while (0)
#endif
26
// Make the names declared in the BPrivate namespace usable without
// qualification in the rest of this file.
using namespace BPrivate;

// Forward declaration only; no code visible in this file calls iconvctl().
// NOTE(review): presumably this mirrors the iconv library's control
// function -- confirm whether the declaration is still needed.
int iconvctl(iconv_t icd, int request, void* argument);
30
31
/*!	Skips the invalid byte sequence at the start of \a *inputBuffer.

	Prefixes of growing length (1, 2, ... \a *inputLeft bytes) are fed to
	iconv() with a freshly reset conversion state until one of them is
	rejected with EILSEQ; that prefix is the shortest one iconv() recognizes
	as invalid and is the part that gets dropped.

	If no prefix is ever rejected with EILSEQ (for example because all of
	the remaining input is an incomplete sequence), everything that is left
	is dropped. The amount skipped never exceeds \a *inputLeft.

	\param conversion The conversion descriptor; its state is reset.
	\param inputBuffer In/out: the input cursor, advanced past the skipped
		bytes.
	\param inputLeft In/out: the number of input bytes remaining, reduced by
		the number of skipped bytes.
*/
static void
discard_invalid_input_character(iconv_t* conversion, char** inputBuffer,
	size_t* inputLeft)
{
	if (*inputLeft == 0)
		return;

	// The converted output is of no interest, one byte of room is enough
	// to let iconv() look at the input.
	char scratch[1];

	// Fallback when no probe below reports EILSEQ: drop all remaining input
	// instead of running past the end of the buffer.
	size_t skip = *inputLeft;

	for (size_t length = 1; length <= *inputLeft; length++) {
		// reset internal state
		iconv(*conversion, NULL, NULL, NULL, NULL);

		// iconv() updates the pointers and counters it is given, so it only
		// ever sees copies; "length" and the caller's cursor stay untouched.
		char* probe = *inputBuffer;
		size_t probeLeft = length;
		char* output = scratch;
		size_t outputLeft = sizeof(scratch);
		size_t result = iconv(*conversion, &probe, &probeLeft,
			&output, &outputLeft);

		if (result != (size_t)-1) {
			// should not reach here: after the reset the prefix converted
			// cleanly, so there is nothing invalid to discard
			skip = 0;
			break;
		}

		if (errno == EILSEQ) {
			// minimal size of the invalid sequence found
			skip = length;
			break;
		}

		// EINVAL: too few input bytes provided, try again with a longer
		// prefix. Any other error is unexpected and handled the same way.
	}

	*inputBuffer += skip;
	*inputLeft -= skip;
}
75
76
77status_t
78convert_encoding(const char* from, const char* to, const char* src,
79	int32* srcLen, char* dst, int32* dstLen, int32* state,
80	char substitute)
81{
82	if (*srcLen == 0) {
83		// nothing to do!
84		*dstLen = 0;
85		return B_OK;
86	}
87
88	// TODO: this doesn't work, as the state is reset every time!
89	iconv_t conversion = iconv_open(to, from);
90	if (conversion == (iconv_t)-1) {
91		DEBPRINT(("iconv_open failed\n"));
92		return B_ERROR;
93	}
94
95	size_t outputLeft = *dstLen;
96
97	if (state == NULL || *state == 0) {
98		if (state != NULL)
99			*state = 1;
100
101		iconv(conversion, NULL, NULL, &dst, &outputLeft);
102	}
103
104	char** inputBuffer = const_cast<char**>(&src);
105	size_t inputLeft = *srcLen;
106	do {
107		size_t nonReversibleConversions = iconv(conversion, inputBuffer,
108			&inputLeft, &dst, &outputLeft);
109		if (nonReversibleConversions == (size_t)-1) {
110			if (errno == E2BIG) {
111				// Not enough room in the output buffer for the next converted character
112				// This is not a "real" error, we just quit out.
113				break;
114			}
115
116			switch (errno) {
117				case EILSEQ: // unable to generate a corresponding character
118				{
119					discard_invalid_input_character(&conversion, inputBuffer,
120						&inputLeft);
121
122					// prepare to convert the substitute character to target encoding
123					char original = substitute;
124					size_t len = 1;
125					char* copy = &original;
126
127					// Perform the conversion
128					// We ignore any errors during this as part of robustness/best-effort
129					// We use ISO-8859-1 as a source because it is a single byte encoding
130					// It also overlaps UTF-8 for the lower 128 characters.  It is also
131					// likely to have a mapping to almost any target encoding.
132					iconv_t iso8859_1to = iconv_open(to,"ISO-8859-1");
133					if (iso8859_1to != (iconv_t)-1) {
134						iconv(iso8859_1to, 0, 0, 0, 0);
135						iconv(iso8859_1to, &copy, &len, &dst, &outputLeft);
136						iconv_close(iso8859_1to);
137					}
138					break;
139				}
140
141				case EINVAL: // incomplete multibyte sequence at the end of the input
142					// TODO inputLeft bytes from inputBuffer should
143					// be stored in state variable, so that conversion
144					// can continue when the caller provides the missing
145					// bytes with the next call of this method
146
147					// we just eat bad bytes, as part of robustness/best-effort
148					inputBuffer++;
149					inputLeft--;
150					break;
151
152				default:
153					// unknown error, completely bail
154					status_t status = errno;
155					iconv_close(conversion);
156					return status;
157			}
158		}
159	} while (inputLeft > 0 && outputLeft > 0);
160
161	*srcLen -= inputLeft;
162	*dstLen -= outputLeft;
163	iconv_close(conversion);
164
165	return B_OK;
166}
167
168
169status_t
170convert_to_utf8(uint32 srcEncoding, const char* src, int32* srcLen,
171	char* dst, int32* dstLen, int32* state, char substitute)
172{
173	const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID(
174		srcEncoding);
175	if (charset == NULL)
176		return B_ERROR;
177
178#if DEBUG_CONV
179	fprintf(stderr, "convert_to_utf8(%s) : \"", charset->GetName());
180	for (int i = 0 ; i < *srcLen ; i++) {
181		fprintf(stderr, "%c", src[i]);
182	}
183	fprintf(stderr, "\"\n");
184#endif
185
186	return convert_encoding(charset->GetName(), "UTF-8", src, srcLen,
187		dst, dstLen, state, substitute);
188}
189
190
191status_t
192convert_from_utf8(uint32 dstEncoding, const char* src, int32* srcLen,
193	char* dst, int32* dstLen, int32* state, char substitute)
194{
195	const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID(
196		dstEncoding);
197	if (charset == NULL)
198		return B_ERROR;
199
200#if DEBUG_CONV
201	fprintf(stderr, "convert_from_utf8(%s) : \"", charset->GetName());
202	for (int i = 0 ; i < *srcLen ; i++) {
203		fprintf(stderr, "%c", src[i]);
204	}
205	fprintf(stderr, "\"\n");
206#endif
207
208	return convert_encoding("UTF-8", charset->GetName(), src, srcLen,
209		dst, dstLen, state, substitute);
210}
211
212