1#include "UdfString.h"
2
3#include "ByteOrder.h"
4
5
/*! \brief Encodes a single unicode code point as utf8.

	Code points above 0x10ffff are not representable; for those nothing
	is written and \c *out is left where it was.

	\param c The unicode code point to encode.
	\param out Address of a write cursor into a character buffer that
	           has room for at least 4 more bytes. On return the cursor
	           has been advanced past the bytes that were written, i.e.
	           if the cursor pointed at the first character of \c str
	           and 4 bytes were emitted, it points at the fifth
	           character of \c str afterwards. No terminating NUL is
	           written.
*/
static
void
unicode_to_utf8(uint32 c, char **out)
{
	char *cursor = *out;

	// Choose the lead-byte marker bits and how many continuation
	// bytes follow, based on the magnitude of the code point.
	uint32 leadMarker;
	int trailing;
	if (c <= 0x7f) {
		leadMarker = 0x00;
		trailing = 0;
	} else if (c <= 0x7ff) {
		leadMarker = 0xc0;
		trailing = 1;
	} else if (c <= 0xffff) {
		leadMarker = 0xe0;
		trailing = 2;
	} else if (c <= 0x10ffff) {
		leadMarker = 0xf0;
		trailing = 3;
	} else {
		// Out of range: emit nothing, cursor stays put.
		return;
	}

	// Lead byte: marker bits plus the most significant payload bits.
	*cursor++ = leadMarker | (c >> (6 * trailing));

	// Continuation bytes carry 6 payload bits each, most significant
	// group first.
	for (int shift = 6 * (trailing - 1); shift >= 0; shift -= 6)
		*cursor++ = 0x80 | ((c >> shift) & 0x3f);

	*out = cursor;
}
42
/*! \brief Converts the given utf8 character to 4-byte unicode.

	A multi-byte sequence is accepted only if its lead byte has one of
	the forms 110xxxxx, 1110xxxx or 11110xxx and every following byte
	of the sequence has the continuation form 10xxxxxx. Anything else
	(including a stray continuation byte, a truncated sequence, or a
	lead byte of 0xf8 and above) is treated as an invalid character.
	Overlong encodings and surrogate values are not rejected.

	\param in Pointer to a C-String from which utf8 characters
	          will be read. *in will be incremented to reflect
	          the number of characters read, similarly to the
	          \c out parameter of unicode_to_utf8(). For a plain
	          1-byte character or an invalid character exactly one
	          byte is consumed.

	\return The 4-byte unicode character, or **in if passed an
	        invalid character, or 0 if passed any NULL pointers.
*/
static
uint32
utf8_to_unicode(const char **in)
{
	if (!in)
		return 0;
	const uint8 *bytes = reinterpret_cast<const uint8 *>(*in);
	if (!bytes)
		return 0;

	// Classify the lead byte: sequence length and the payload bits
	// it contributes.
	int32 length;
	uint32 c;
	if ((bytes[0] & 0xe0) == 0xc0) {
		length = 2;
		c = bytes[0] & 0x1f;
	} else if ((bytes[0] & 0xf0) == 0xe0) {
		length = 3;
		c = bytes[0] & 0x0f;
	} else if ((bytes[0] & 0xf8) == 0xf0) {
		length = 4;
		c = bytes[0] & 0x07;
	} else {
		// valid 1-byte character
		// and invalid characters
		(*in)++;
		return bytes[0];
	}

	// Fold in the continuation bytes. Checking the full 10xxxxxx
	// pattern (not just the high bit) keeps a following lead byte
	// from being swallowed as payload; it also stops at the NUL
	// terminator, so a truncated sequence never reads past the string.
	int32 i = 1;
	for (; i < length && (bytes[i] & 0xc0) == 0x80; i++)
		c = (c << 6) | (bytes[i] & 0x3f);

	if (i < length) {
		// invalid character
		(*in)++;
		return bytes[0];
	}
	*in += length;
	return c;
}
93
94using namespace Udf;
95
96/*! \brief Creates an empty string object.
97*/
98String::String()
99	: fCs0String(NULL)
100	, fUtf8String(NULL)
101{
102}
103
104/*! \brief Creates a new String object from the given Utf8 string.
105*/
106String::String(const char *utf8)
107	: fCs0String(NULL)
108	, fUtf8String(NULL)
109{
110	SetTo(utf8);
111}
112
113/*! \brief Creates a new String object from the given Cs0 string.
114*/
115String::String(const char *cs0, uint32 length)
116	: fCs0String(NULL)
117	, fUtf8String(NULL)
118{
119	SetTo(cs0, length);
120}
121
122String::~String()
123{
124	DEBUG_INIT("String");
125
126	_Clear();
127}
128
129/*! \brief Assignment from a Utf8 string.
130*/
131void
132String::SetTo(const char *utf8)
133{
134	DEBUG_INIT_ETC("String", ("utf8: `%s', strlen(utf8): %ld", utf8,
135	               utf8 ? strlen(utf8) : 0));
136	_Clear();
137	if (!utf8) {
138		PRINT(("passed NULL utf8 string\n"));
139		return;
140	}
141	uint32 length = strlen(utf8);
142	// First copy the utf8 string
143	fUtf8String = new(nothrow) char[length+1];
144	if (!fUtf8String){
145		PRINT(("new fUtf8String[%ld] allocation failed\n", length+1));
146		return;
147	}
148	memcpy(fUtf8String, utf8, length+1);
149	// Next convert to raw 4-byte unicode. Then we'll do some
150	// analysis to figure out if we have any invalid characters,
151	// and whether we can get away with compressed 8-bit unicode,
152	// or have to use burly 16-bit unicode.
153	uint32 *raw = new(nothrow) uint32[length];
154	if (!raw) {
155		PRINT(("new uint32 raw[%ld] temporary string allocation failed\n", length));
156		_Clear();
157		return;
158	}
159	const char *in = utf8;
160	uint32 rawLength = 0;
161	for (uint32 i = 0; i < length && uint32(in-utf8) < length; i++, rawLength++)
162		raw[i] = utf8_to_unicode(&in);
163	// Check for invalids.
164	uint32 mask = 0xffff0000;
165	for (uint32 i = 0; i < rawLength; i++) {
166		if (raw[i] & mask) {
167			PRINT(("WARNING: utf8 string contained a multi-byte sequence which "
168			       "was converted into a unicode character larger than 16-bits; "
169			       "character will be converted to an underscore character for "
170			       "safety.\n"));
171			raw[i] = '_';
172		}
173	}
174	// See if we can get away with 8-bit compressed unicode
175	mask = 0xffffff00;
176	bool canUse8bit = true;
177	for (uint32 i = 0; i < rawLength; i++) {
178		if (raw[i] & mask) {
179			canUse8bit = false;
180			break;
181		}
182	}
183	// Build our cs0 string
184	if (canUse8bit) {
185		fCs0Length = rawLength+1;
186		fCs0String = new(nothrow) char[fCs0Length];
187		if (fCs0String) {
188			fCs0String[0] = '\x08';	// 8-bit compressed unicode
189			for (uint32 i = 0; i < rawLength; i++)
190				fCs0String[i+1] = raw[i] % 256;
191		} else {
192			PRINT(("new fCs0String[%ld] allocation failed\n", fCs0Length));
193			_Clear();
194			return;
195		}
196	} else {
197		fCs0Length = rawLength*2+1;
198		fCs0String = new(nothrow) char[fCs0Length];
199		if (fCs0String) {
200			uint32 pos = 0;
201			fCs0String[pos++] = '\x10';	// 16-bit unicode
202			for (uint32 i = 0; i < rawLength; i++) {
203				// 16-bit unicode chars must be written big endian
204				uint16 value = uint16(raw[i]);
205				uint8 high = uint8(value >> 8 & 0xff);
206				uint8 low = uint8(value & 0xff);
207				fCs0String[pos++] = high;
208				fCs0String[pos++] = low;
209			}
210		} else {
211			PRINT(("new fCs0String[%ld] allocation failed\n", fCs0Length));
212			_Clear();
213			return;
214		}
215	}
216	// Clean up
217	delete [] raw;
218	raw = NULL;
219}
220
221/*! \brief Assignment from a Cs0 string.
222*/
223void
224String::SetTo(const char *cs0, uint32 length)
225{
226	DEBUG_INIT_ETC("String", ("cs0: %p, length: %ld", cs0, length));
227
228	_Clear();
229	if (length == 0)
230		return;
231	if (!cs0) {
232		PRINT(("passed NULL cs0 string\n"));
233		return;
234	}
235
236	// First copy the Cs0 string and length
237	fCs0String = new(nothrow) char[length];
238	if (fCs0String) {
239		memcpy(fCs0String, cs0, length);
240		fCs0Length = length;
241	} else {
242		PRINT(("new fCs0String[%ld] allocation failed\n", length));
243		return;
244	}
245
246	// Now convert to utf8
247
248	// The first byte of the CS0 string is the compression ID.
249	// - 8: 1 byte characters
250	// - 16: 2 byte, big endian characters
251	// - 254: "CS0 expansion is empty and unique", 1 byte characters
252	// - 255: "CS0 expansion is empty and unique", 2 byte, big endian characters
253	PRINT(("compression ID: %d\n", cs0[0]));
254	switch (reinterpret_cast<const uint8*>(cs0)[0]) {
255		case 8:
256		case 254:
257		{
258			const uint8 *inputString = reinterpret_cast<const uint8*>(&(cs0[1]));
259			int32 maxLength = length-1;				// Max length of input string in uint8 characters
260			int32 allocationLength = maxLength*2+1;	// Need at most 2 utf8 chars per uint8 char
261			fUtf8String = new(nothrow) char[allocationLength];
262			if (fUtf8String) {
263				char *outputString = fUtf8String;
264
265				for (int32 i = 0; i < maxLength && inputString[i]; i++) {
266					unicode_to_utf8(inputString[i], &outputString);
267				}
268				outputString[0] = 0;
269			} else {
270				PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
271			}
272
273			break;
274		}
275
276		case 16:
277		case 255:
278		{
279			const uint16 *inputString = reinterpret_cast<const uint16*>(&(cs0[1]));
280			int32 maxLength = (length-1) / 2;		// Max length of input string in uint16 characters
281			int32 allocationLength = maxLength*3+1;	// Need at most 3 utf8 chars per uint16 char
282			fUtf8String = new(nothrow) char[allocationLength];
283			if (fUtf8String) {
284				char *outputString = fUtf8String;
285
286				for (int32 i = 0; i < maxLength && inputString[i]; i++) {
287					unicode_to_utf8(B_BENDIAN_TO_HOST_INT16(inputString[i]), &outputString);
288				}
289				outputString[0] = 0;
290			} else {
291				PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
292			}
293
294			break;
295		}
296
297		default:
298			PRINT(("invalid compression id!\n"));
299			break;
300	}
301}
302
303void
304String::_Clear()
305{
306	DEBUG_INIT("String");
307
308	delete [] fCs0String;
309	fCs0String = NULL;
310	delete [] fUtf8String;
311	fUtf8String = NULL;
312}
313