1#include "UdfString.h"
2
3#include <ByteOrder.h>
4
5#include <AutoDeleter.h>
6
7
8using std::nothrow;
9
10
11/*! \brief Converts the given unicode character to utf8.
12
13	\param c The unicode character.
14	\param out Pointer to a C-string of at least 4 characters
15	           long into which the output utf8 characters will
16	           be written. The string that is pointed to will
17	           be incremented to reflect the number of characters
18	           written, i.e. if \a out initially points to a pointer
19	           to the first character in string named \c str, and
20	           the function writes 4 characters to \c str, then
21	           upon returning, out will point to a pointer to
22	           the fifth character in \c str.
23*/
24static void
25unicode_to_utf8(uint32 c, char **out)
26{
27	char *s = *out;
28
29	if (c < 0x80)
30		*(s++) = c;
31	else if (c < 0x800) {
32		*(s++) = 0xc0 | (c>>6);
33		*(s++) = 0x80 | (c & 0x3f);
34	} else if (c < 0x10000) {
35		*(s++) = 0xe0 | (c>>12);
36		*(s++) = 0x80 | ((c>>6) & 0x3f);
37		*(s++) = 0x80 | (c & 0x3f);
38	} else if (c <= 0x10ffff) {
39		*(s++) = 0xf0 | (c>>18);
40		*(s++) = 0x80 | ((c>>12) & 0x3f);
41		*(s++) = 0x80 | ((c>>6) & 0x3f);
42		*(s++) = 0x80 | (c & 0x3f);
43	}
44	*out = s;
45}
46
47/*! \brief Converts the given utf8 character to 4-byte unicode.
48
49	\param in Pointer to a C-String from which utf8 characters
50	          will be read. *in will be incremented to reflect
51	          the number of characters read, similarly to the
52	          \c out parameter for unicode_to_utf8().
53
54	\return The 4-byte unicode character, or **in if passed an
55	        invalid character, or 0 if passed any NULL pointers.
56*/
57static uint32
58utf8_to_unicode(const char **in)
59{
60	if (!in)
61		return 0;
62	uint8 *bytes = (uint8 *)*in;
63	if (!bytes)
64		return 0;
65
66	int32 length;
67	uint8 mask = 0x1f;
68
69	switch (bytes[0] & 0xf0) {
70		case 0xc0:
71		case 0xd0:	length = 2; break;
72		case 0xe0:	length = 3; break;
73		case 0xf0:
74			mask = 0x0f;
75			length = 4;
76			break;
77		default:
78			// valid 1-byte character
79			// and invalid characters
80			(*in)++;
81			return bytes[0];
82	}
83	uint32 c = bytes[0] & mask;
84	int32 i = 1;
85	for (;i < length && (bytes[i] & 0x80) > 0;i++)
86		c = (c << 6) | (bytes[i] & 0x3f);
87
88	if (i < length) {
89		// invalid character
90		(*in)++;
91		return (uint32)bytes[0];
92	}
93	*in += length;
94	return c;
95}
96
97
98// #pragma mark -
99
100
101/*! \brief Creates an empty string object. */
102UdfString::UdfString()
103	:
104	fCs0String(NULL),
105	fUtf8String(NULL)
106{
107}
108
109
110/*! \brief Creates a new UdfString object from the given Utf8 string. */
111UdfString::UdfString(const char *utf8)
112	:
113	fCs0String(NULL),
114	fUtf8String(NULL)
115{
116	SetTo(utf8);
117}
118
119
120/*! \brief Creates a new UdfString object from the given Cs0 string. */
121UdfString::UdfString(const char *cs0, uint32 length)
122	:
123	fCs0String(NULL),
124	fUtf8String(NULL)
125{
126	SetTo(cs0, length);
127}
128
129
/*! \brief Releases the string buffers by way of _Clear(). */
UdfString::~UdfString()
{
	_Clear();
}
134
135
136/*! \brief Assignment from a Utf8 string. */
137void
138UdfString::SetTo(const char *utf8)
139{
140	TRACE(("UdfString::SetTo: utf8 = `%s', strlen(utf8) = %ld\n",
141		utf8, utf8 ? strlen(utf8) : 0));
142	_Clear();
143
144	if (utf8 == NULL) {
145		TRACE_ERROR(("UdfString::SetTo: passed NULL utf8 string\n"));
146		return;
147	}
148
149	uint32 length = strlen(utf8);
150	// First copy the utf8 string
151	fUtf8String = new(nothrow) char[length + 1];
152	if (fUtf8String == NULL) {
153		TRACE_ERROR(("UdfString::SetTo: fUtf8String[%" B_PRIu32
154			"] allocation failed\n", length + 1));
155		return;
156	}
157
158	memcpy(fUtf8String, utf8, length + 1);
159	// Next convert to raw 4-byte unicode. Then we'll do some
160	// analysis to figure out if we have any invalid characters,
161	// and whether we can get away with compressed 8-bit unicode,
162	// or have to use burly 16-bit unicode.
163	uint32 *raw = new(nothrow) uint32[length];
164	if (raw == NULL) {
165		TRACE_ERROR(("UdfString::SetTo: uint32 raw[%" B_PRIu32 "] temporary"
166			" string allocation failed\n", length));
167		_Clear();
168		return;
169	}
170
171	ArrayDeleter<uint32> rawDeleter(raw);
172
173	const char *in = utf8;
174	uint32 rawLength = 0;
175	for (uint32 i = 0; i < length && uint32(in - utf8) < length; i++, rawLength++)
176		raw[i] = utf8_to_unicode(&in);
177
178	// Check for invalids.
179	uint32 mask = 0xffff0000;
180	for (uint32 i = 0; i < rawLength; i++) {
181		if (raw[i] & mask) {
182			TRACE(("WARNING: utf8 string contained a multi-byte sequence which "
183			       "was converted into a unicode character larger than 16-bits; "
184			       "character will be converted to an underscore character for "
185			       "safety.\n"));
186			raw[i] = '_';
187		}
188	}
189	// See if we can get away with 8-bit compressed unicode
190	mask = 0xffffff00;
191	bool canUse8bit = true;
192	for (uint32 i = 0; i < rawLength; i++) {
193		if (raw[i] & mask) {
194			canUse8bit = false;
195			break;
196		}
197	}
198	// Build our cs0 string
199	if (canUse8bit) {
200		fCs0Length = rawLength + 1;
201		fCs0String = new(nothrow) char[fCs0Length];
202		if (fCs0String != NULL) {
203			fCs0String[0] = '\x08';	// 8-bit compressed unicode
204			for (uint32 i = 0; i < rawLength; i++)
205				fCs0String[i + 1] = raw[i] % 256;
206		} else {
207			TRACE_ERROR(("UdfString::SetTo: fCs0String[%" B_PRIu32
208				"] allocation failed\n", fCs0Length));
209			_Clear();
210			return;
211		}
212	} else {
213		fCs0Length = rawLength * 2 + 1;
214		fCs0String = new(nothrow) char[fCs0Length];
215		if (fCs0String != NULL) {
216			uint32 pos = 0;
217			fCs0String[pos++] = '\x10';	// 16-bit unicode
218			for (uint32 i = 0; i < rawLength; i++) {
219				// 16-bit unicode chars must be written big endian
220				uint16 value = uint16(raw[i]);
221				uint8 high = uint8(value >> 8 & 0xff);
222				uint8 low = uint8(value & 0xff);
223				fCs0String[pos++] = high;
224				fCs0String[pos++] = low;
225			}
226		} else {
227			TRACE_ERROR(("UdfString::SetTo: fCs0String[%" B_PRIu32
228				"] allocation failed\n", fCs0Length));
229			_Clear();
230			return;
231		}
232	}
233}
234
235
236/*! \brief Assignment from a Cs0 string. */
237void
238UdfString::SetTo(const char *cs0, uint32 length)
239{
240	DEBUG_INIT_ETC("UdfString", ("cs0: %p, length: %" B_PRIu32, cs0, length));
241
242	_Clear();
243	if (length == 0)
244		return;
245	if (!cs0) {
246		PRINT(("passed NULL cs0 string\n"));
247		return;
248	}
249
250	// First copy the Cs0 string and length
251	fCs0String = new(nothrow) char[length];
252	if (fCs0String) {
253		memcpy(fCs0String, cs0, length);
254		fCs0Length = length;
255	} else {
256		PRINT(("new fCs0String[%" B_PRIu32 "] allocation failed\n", length));
257		return;
258	}
259
260	// Now convert to utf8
261
262	// The first byte of the CS0 string is the compression ID.
263	// - 8: 1 byte characters
264	// - 16: 2 byte, big endian characters
265	// - 254: "CS0 expansion is empty and unique", 1 byte characters
266	// - 255: "CS0 expansion is empty and unique", 2 byte, big endian characters
267	PRINT(("compression ID: %d\n", cs0[0]));
268	switch (reinterpret_cast<const uint8*>(cs0)[0]) {
269		case 8:
270		case 254:
271		{
272			const uint8 *inputString = reinterpret_cast<const uint8*>(&(cs0[1]));
273			int32 maxLength = length-1;				// Max length of input string in uint8 characters
274			int32 allocationLength = maxLength*2+1;	// Need at most 2 utf8 chars per uint8 char
275			fUtf8String = new(nothrow) char[allocationLength];
276			if (fUtf8String) {
277				char *outputString = fUtf8String;
278
279				for (int32 i = 0; i < maxLength && inputString[i]; i++) {
280					unicode_to_utf8(inputString[i], &outputString);
281				}
282				outputString[0] = 0;
283			} else {
284				PRINT(("new fUtf8String[%" B_PRId32 "] allocation failed\n",
285					allocationLength));
286			}
287
288			break;
289		}
290
291		case 16:
292		case 255:
293		{
294			const uint16 *inputString = reinterpret_cast<const uint16*>(&(cs0[1]));
295			int32 maxLength = (length-1) / 2;		// Max length of input string in uint16 characters
296			int32 allocationLength = maxLength*3+1;	// Need at most 3 utf8 chars per uint16 char
297			fUtf8String = new(nothrow) char[allocationLength];
298			if (fUtf8String) {
299				char *outputString = fUtf8String;
300
301				for (int32 i = 0; i < maxLength && inputString[i]; i++) {
302					unicode_to_utf8(B_BENDIAN_TO_HOST_INT16(inputString[i]), &outputString);
303				}
304				outputString[0] = 0;
305			} else {
306				PRINT(("new fUtf8String[%" B_PRId32 "] allocation failed\n",
307					allocationLength));
308			}
309
310			break;
311		}
312
313		default:
314			PRINT(("invalid compression id!\n"));
315			break;
316	}
317}
318
319void
320UdfString::_Clear()
321{
322	DEBUG_INIT("UdfString");
323
324	delete [] fCs0String;
325	fCs0String = NULL;
326	delete [] fUtf8String;
327	fUtf8String = NULL;
328}
329