1#include "UdfString.h"
2
3#include <ByteOrder.h>
4
5#include <AutoDeleter.h>
6
7
/*! \brief Converts the given unicode character to utf8.

	Code points up to 0x10ffff are encoded as 1 to 4 bytes. Larger values
	are not representable in utf8; for those nothing is written and \a out
	is left unchanged.

	\param c The unicode character.
	\param out Pointer to a C-string of at least 4 characters
	           long into which the output utf8 characters will
	           be written. The string that is pointed to will
	           be incremented to reflect the number of characters
	           written, i.e. if \a out initially points to a pointer
	           to the first character in string named \c str, and
	           the function writes 4 characters to \c str, then
	           upon returning, out will point to a pointer to
	           the fifth character in \c str. If \a out or \c *out
	           is NULL, the function does nothing.
*/
static void
unicode_to_utf8(uint32 c, char **out)
{
	// Mirror the NULL handling of utf8_to_unicode() instead of crashing.
	if (out == NULL || *out == NULL)
		return;

	char *s = *out;

	if (c < 0x80) {
		// 0xxxxxxx
		*(s++) = char(c);
	} else if (c < 0x800) {
		// 110xxxxx 10xxxxxx
		*(s++) = char(0xc0 | (c >> 6));
		*(s++) = char(0x80 | (c & 0x3f));
	} else if (c < 0x10000) {
		// 1110xxxx 10xxxxxx 10xxxxxx
		*(s++) = char(0xe0 | (c >> 12));
		*(s++) = char(0x80 | ((c >> 6) & 0x3f));
		*(s++) = char(0x80 | (c & 0x3f));
	} else if (c <= 0x10ffff) {
		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		*(s++) = char(0xf0 | (c >> 18));
		*(s++) = char(0x80 | ((c >> 12) & 0x3f));
		*(s++) = char(0x80 | ((c >> 6) & 0x3f));
		*(s++) = char(0x80 | (c & 0x3f));
	}
	// else: beyond the unicode range, nothing is emitted

	*out = s;
}
43
/*! \brief Converts the given utf8 character to 4-byte unicode.

	A multi-byte sequence is only accepted if its lead byte is a valid
	2-, 3- or 4-byte lead (0xc0-0xf7) and every following byte is a proper
	continuation byte of the form 10xxxxxx. Since a terminating NUL is not
	a continuation byte, decoding never reads past the end of a
	NUL-terminated string.

	\param in Pointer to a C-String from which utf8 characters
	          will be read. *in will be incremented to reflect
	          the number of characters read, similarly to the
	          \c out parameter for unicode_to_utf8(). For a
	          single-byte or invalid character, *in is advanced by
	          exactly one byte.

	\return The 4-byte unicode character, or **in if passed an
	        invalid character, or 0 if passed any NULL pointers.
*/
static uint32
utf8_to_unicode(const char **in)
{
	if (in == NULL || *in == NULL)
		return 0;

	const uint8 *bytes = reinterpret_cast<const uint8 *>(*in);
	const uint8 lead = bytes[0];

	int32 length;
	uint32 c;
	if ((lead & 0xe0) == 0xc0) {
		// 110xxxxx: 5 payload bits
		length = 2;
		c = lead & 0x1f;
	} else if ((lead & 0xf0) == 0xe0) {
		// 1110xxxx: 4 payload bits
		length = 3;
		c = lead & 0x0f;
	} else if ((lead & 0xf8) == 0xf0) {
		// 11110xxx: 3 payload bits
		length = 4;
		c = lead & 0x07;
	} else {
		// valid 1-byte character, stray continuation byte, or an
		// invalid lead byte (0xf8 - 0xff)
		(*in)++;
		return lead;
	}

	// Continuation bytes must match 10xxxxxx; merely testing the high bit
	// would also accept the lead byte of the next character.
	int32 i = 1;
	for (; i < length && (bytes[i] & 0xc0) == 0x80; i++)
		c = (c << 6) | (bytes[i] & 0x3f);

	if (i < length) {
		// truncated or malformed sequence: hand back the lead byte as is
		(*in)++;
		return lead;
	}

	*in += length;
	return c;
}
93
94
95// #pragma mark -
96
97
98/*! \brief Creates an empty string object. */
99UdfString::UdfString()
100	:
101	fCs0String(NULL),
102	fUtf8String(NULL)
103{
104}
105
106
107/*! \brief Creates a new UdfString object from the given Utf8 string. */
108UdfString::UdfString(const char *utf8)
109	:
110	fCs0String(NULL),
111	fUtf8String(NULL)
112{
113	SetTo(utf8);
114}
115
116
117/*! \brief Creates a new UdfString object from the given Cs0 string. */
118UdfString::UdfString(const char *cs0, uint32 length)
119	:
120	fCs0String(NULL),
121	fUtf8String(NULL)
122{
123	SetTo(cs0, length);
124}
125
126
127UdfString::~UdfString()
128{
129	_Clear();
130}
131
132
133/*! \brief Assignment from a Utf8 string. */
134void
135UdfString::SetTo(const char *utf8)
136{
137	TRACE(("UdfString::SetTo: utf8 = `%s', strlen(utf8) = %ld\n",
138		utf8, utf8 ? strlen(utf8) : 0));
139	_Clear();
140
141	if (utf8 == NULL) {
142		TRACE_ERROR(("UdfString::SetTo: passed NULL utf8 string\n"));
143		return;
144	}
145
146	uint32 length = strlen(utf8);
147	// First copy the utf8 string
148	fUtf8String = new(nothrow) char[length + 1];
149	if (fUtf8String == NULL) {
150		TRACE_ERROR(("UdfString::SetTo: fUtf8String[%ld] allocation failed\n",
151			length + 1));
152		return;
153	}
154
155	memcpy(fUtf8String, utf8, length + 1);
156	// Next convert to raw 4-byte unicode. Then we'll do some
157	// analysis to figure out if we have any invalid characters,
158	// and whether we can get away with compressed 8-bit unicode,
159	// or have to use burly 16-bit unicode.
160	uint32 *raw = new(nothrow) uint32[length];
161	if (raw == NULL) {
162		TRACE_ERROR(("UdfString::SetTo: uint32 raw[%ld] temporary string "
163			"allocation failed\n", length));
164		_Clear();
165		return;
166	}
167
168	ArrayDeleter<uint32> rawDeleter(raw);
169
170	const char *in = utf8;
171	uint32 rawLength = 0;
172	for (uint32 i = 0; i < length && uint32(in - utf8) < length; i++, rawLength++)
173		raw[i] = utf8_to_unicode(&in);
174
175	// Check for invalids.
176	uint32 mask = 0xffff0000;
177	for (uint32 i = 0; i < rawLength; i++) {
178		if (raw[i] & mask) {
179			TRACE(("WARNING: utf8 string contained a multi-byte sequence which "
180			       "was converted into a unicode character larger than 16-bits; "
181			       "character will be converted to an underscore character for "
182			       "safety.\n"));
183			raw[i] = '_';
184		}
185	}
186	// See if we can get away with 8-bit compressed unicode
187	mask = 0xffffff00;
188	bool canUse8bit = true;
189	for (uint32 i = 0; i < rawLength; i++) {
190		if (raw[i] & mask) {
191			canUse8bit = false;
192			break;
193		}
194	}
195	// Build our cs0 string
196	if (canUse8bit) {
197		fCs0Length = rawLength + 1;
198		fCs0String = new(nothrow) char[fCs0Length];
199		if (fCs0String != NULL) {
200			fCs0String[0] = '\x08';	// 8-bit compressed unicode
201			for (uint32 i = 0; i < rawLength; i++)
202				fCs0String[i + 1] = raw[i] % 256;
203		} else {
204			TRACE_ERROR(("UdfString::SetTo: fCs0String[%ld] allocation failed\n",
205				fCs0Length));
206			_Clear();
207			return;
208		}
209	} else {
210		fCs0Length = rawLength * 2 + 1;
211		fCs0String = new(nothrow) char[fCs0Length];
212		if (fCs0String != NULL) {
213			uint32 pos = 0;
214			fCs0String[pos++] = '\x10';	// 16-bit unicode
215			for (uint32 i = 0; i < rawLength; i++) {
216				// 16-bit unicode chars must be written big endian
217				uint16 value = uint16(raw[i]);
218				uint8 high = uint8(value >> 8 & 0xff);
219				uint8 low = uint8(value & 0xff);
220				fCs0String[pos++] = high;
221				fCs0String[pos++] = low;
222			}
223		} else {
224			TRACE_ERROR(("UdfString::SetTo: fCs0String[%ld] allocation failed\n",
225				fCs0Length));
226			_Clear();
227			return;
228		}
229	}
230}
231
232
233/*! \brief Assignment from a Cs0 string. */
234void
235UdfString::SetTo(const char *cs0, uint32 length)
236{
237	DEBUG_INIT_ETC("UdfString", ("cs0: %p, length: %ld", cs0, length));
238
239	_Clear();
240	if (length == 0)
241		return;
242	if (!cs0) {
243		PRINT(("passed NULL cs0 string\n"));
244		return;
245	}
246
247	// First copy the Cs0 string and length
248	fCs0String = new(nothrow) char[length];
249	if (fCs0String) {
250		memcpy(fCs0String, cs0, length);
251		fCs0Length = length;
252	} else {
253		PRINT(("new fCs0String[%ld] allocation failed\n", length));
254		return;
255	}
256
257	// Now convert to utf8
258
259	// The first byte of the CS0 string is the compression ID.
260	// - 8: 1 byte characters
261	// - 16: 2 byte, big endian characters
262	// - 254: "CS0 expansion is empty and unique", 1 byte characters
263	// - 255: "CS0 expansion is empty and unique", 2 byte, big endian characters
264	PRINT(("compression ID: %d\n", cs0[0]));
265	switch (reinterpret_cast<const uint8*>(cs0)[0]) {
266		case 8:
267		case 254:
268		{
269			const uint8 *inputString = reinterpret_cast<const uint8*>(&(cs0[1]));
270			int32 maxLength = length-1;				// Max length of input string in uint8 characters
271			int32 allocationLength = maxLength*2+1;	// Need at most 2 utf8 chars per uint8 char
272			fUtf8String = new(nothrow) char[allocationLength];
273			if (fUtf8String) {
274				char *outputString = fUtf8String;
275
276				for (int32 i = 0; i < maxLength && inputString[i]; i++) {
277					unicode_to_utf8(inputString[i], &outputString);
278				}
279				outputString[0] = 0;
280			} else {
281				PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
282			}
283
284			break;
285		}
286
287		case 16:
288		case 255:
289		{
290			const uint16 *inputString = reinterpret_cast<const uint16*>(&(cs0[1]));
291			int32 maxLength = (length-1) / 2;		// Max length of input string in uint16 characters
292			int32 allocationLength = maxLength*3+1;	// Need at most 3 utf8 chars per uint16 char
293			fUtf8String = new(nothrow) char[allocationLength];
294			if (fUtf8String) {
295				char *outputString = fUtf8String;
296
297				for (int32 i = 0; i < maxLength && inputString[i]; i++) {
298					unicode_to_utf8(B_BENDIAN_TO_HOST_INT16(inputString[i]), &outputString);
299				}
300				outputString[0] = 0;
301			} else {
302				PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
303			}
304
305			break;
306		}
307
308		default:
309			PRINT(("invalid compression id!\n"));
310			break;
311	}
312}
313
314void
315UdfString::_Clear()
316{
317	DEBUG_INIT("UdfString");
318
319	delete [] fCs0String;
320	fCs0String = NULL;
321	delete [] fUtf8String;
322	fUtf8String = NULL;
323}
324