udf/r5/UdfString.cpp

#include "UdfString.h"

#include "ByteOrder.h"


/*! \brief Converts the given unicode character to utf8.

	\param c The unicode character.
	\param out Pointer to a C-string of at least 4 characters
	           long into which the output utf8 characters will
	           be written. The string that is pointed to will
	           be incremented to reflect the number of characters
	           written, i.e. if \a out initially points to a pointer
	           to the first character in string named \c str, and
	           the function writes 4 characters to \c str, then
	           upon returning, out will point to a pointer to
	           the fifth character in \c str.
*/
static
void
unicode_to_utf8(uint32 c, char **out)
{
	char *s = *out;

	if (c < 0x80)
		*(s++) = c;
	else if (c < 0x800) {
		*(s++) = 0xc0 | (c>>6);
		*(s++) = 0x80 | (c & 0x3f);
	} else if (c < 0x10000) {
		*(s++) = 0xe0 | (c>>12);
		*(s++) = 0x80 | ((c>>6) & 0x3f);
		*(s++) = 0x80 | (c & 0x3f);
	} else if (c <= 0x10ffff) {
		*(s++) = 0xf0 | (c>>18);
		*(s++) = 0x80 | ((c>>12) & 0x3f);
		*(s++) = 0x80 | ((c>>6) & 0x3f);
		*(s++) = 0x80 | (c & 0x3f);
	}
	*out = s;
}

/*! \brief Converts the given utf8 character to 4-byte unicode.

	\param in Pointer to a C-String from which utf8 characters
	          will be read. *in will be incremented to reflect
	          the number of characters read, similarly to the
	          \c out parameter for Udf::unicode_to_utf8().

	\return The 4-byte unicode character, or **in if passed an
	        invalid character, or 0 if passed any NULL pointers.
*/
static
uint32
utf8_to_unicode(const char **in)
{
	if (!in)
		return 0;
	uint8 *bytes = (uint8 *)*in;
	if (!bytes)
		return 0;

	int32 length;
	uint8 mask = 0x1f;

	switch (bytes[0] & 0xf0) {
		case 0xc0:
		case 0xd0:	length = 2; break;
		case 0xe0:	length = 3; break;
		case 0xf0:
			mask = 0x0f;
			length = 4;
			break;
		default:
			// valid 1-byte character
			// and invalid characters
			(*in)++;
			return bytes[0];
	}
	uint32 c = bytes[0] & mask;
	int32 i = 1;
	for (;i < length && (bytes[i] & 0x80) > 0;i++)
		c = (c << 6) | (bytes[i] & 0x3f);

	if (i < length) {
		// invalid character
		(*in)++;
		return (uint32)bytes[0];
	}
	*in += length;
	return c;
}

using namespace Udf;

/*! \brief Creates an empty string object.
*/
String::String()
	: fCs0String(NULL)
	, fUtf8String(NULL)
{
}

/*! \brief Creates a new String object from the given Utf8 string.
*/
String::String(const char *utf8)
	: fCs0String(NULL)
	, fUtf8String(NULL)
{
	SetTo(utf8);
}

/*! \brief Creates a new String object from the given Cs0 string.
*/
String::String(const char *cs0, uint32 length)
	: fCs0String(NULL)
	, fUtf8String(NULL)
{
	SetTo(cs0, length);
}

String::~String()
{
	DEBUG_INIT("String");

	_Clear();
}

/*! \brief Assignment from a Utf8 string.
*/
void
String::SetTo(const char *utf8)
{
	DEBUG_INIT_ETC("String", ("utf8: `%s', strlen(utf8): %ld", utf8,
	               utf8 ? strlen(utf8) : 0));
	_Clear();
	if (!utf8) {
		PRINT(("passed NULL utf8 string\n"));
		return;
	}
	uint32 length = strlen(utf8);
	// First copy the utf8 string
	fUtf8String = new(nothrow) char[length+1];
	if (!fUtf8String){
		PRINT(("new fUtf8String[%ld] allocation failed\n", length+1));
		return;
	}
	memcpy(fUtf8String, utf8, length+1);
	// Next convert to raw 4-byte unicode. Then we'll do some
	// analysis to figure out if we have any invalid characters,
	// and whether we can get away with compressed 8-bit unicode,
	// or have to use burly 16-bit unicode.
	uint32 *raw = new(nothrow) uint32[length];
	if (!raw) {
		PRINT(("new uint32 raw[%ld] temporary string allocation failed\n", length));
		_Clear();
		return;
	}
	const char *in = utf8;
	uint32 rawLength = 0;
	for (uint32 i = 0; i < length && uint32(in-utf8) < length; i++, rawLength++)
		raw[i] = utf8_to_unicode(&in);
	// Check for invalids.
	uint32 mask = 0xffff0000;
	for (uint32 i = 0; i < rawLength; i++) {
		if (raw[i] & mask) {
			PRINT(("WARNING: utf8 string contained a multi-byte sequence which "
			       "was converted into a unicode character larger than 16-bits; "
			       "character will be converted to an underscore character for "
			       "safety.\n"));
			raw[i] = '_';
		}
	}
	// See if we can get away with 8-bit compressed unicode
	mask = 0xffffff00;
	bool canUse8bit = true;
	for (uint32 i = 0; i < rawLength; i++) {
		if (raw[i] & mask) {
			canUse8bit = false;
			break;
		}
	}
	// Build our cs0 string
	if (canUse8bit) {
		fCs0Length = rawLength+1;
		fCs0String = new(nothrow) char[fCs0Length];
		if (fCs0String) {
			fCs0String[0] = '\x08';	// 8-bit compressed unicode
			for (uint32 i = 0; i < rawLength; i++)
				fCs0String[i+1] = raw[i] % 256;
		} else {
			PRINT(("new fCs0String[%ld] allocation failed\n", fCs0Length));
			_Clear();
			return;
		}
	} else {
		fCs0Length = rawLength*2+1;
		fCs0String = new(nothrow) char[fCs0Length];
		if (fCs0String) {
			uint32 pos = 0;
			fCs0String[pos++] = '\x10';	// 16-bit unicode
			for (uint32 i = 0; i < rawLength; i++) {
				// 16-bit unicode chars must be written big endian
				uint16 value = uint16(raw[i]);
				uint8 high = uint8(value >> 8 & 0xff);
				uint8 low = uint8(value & 0xff);
				fCs0String[pos++] = high;
				fCs0String[pos++] = low;
			}
		} else {
			PRINT(("new fCs0String[%ld] allocation failed\n", fCs0Length));
			_Clear();
			return;
		}
	}
	// Clean up
	delete [] raw;
	raw = NULL;
}

/*! \brief Assignment from a Cs0 string.
*/
void
String::SetTo(const char *cs0, uint32 length)
{
	DEBUG_INIT_ETC("String", ("cs0: %p, length: %ld", cs0, length));

	_Clear();
	if (length == 0)
		return;
	if (!cs0) {
		PRINT(("passed NULL cs0 string\n"));
		return;
	}

	// First copy the Cs0 string and length
	fCs0String = new(nothrow) char[length];
	if (fCs0String) {
		memcpy(fCs0String, cs0, length);
		fCs0Length = length;
	} else {
		PRINT(("new fCs0String[%ld] allocation failed\n", length));
		return;
	}

	// Now convert to utf8

	// The first byte of the CS0 string is the compression ID.
	// - 8: 1 byte characters
	// - 16: 2 byte, big endian characters
	// - 254: "CS0 expansion is empty and unique", 1 byte characters
	// - 255: "CS0 expansion is empty and unique", 2 byte, big endian characters
	PRINT(("compression ID: %d\n", cs0[0]));
	switch (reinterpret_cast<const uint8*>(cs0)[0]) {
		case 8:
		case 254:
		{
			const uint8 *inputString = reinterpret_cast<const uint8*>(&(cs0[1]));
			int32 maxLength = length-1;				// Max length of input string in uint8 characters
			int32 allocationLength = maxLength*2+1;	// Need at most 2 utf8 chars per uint8 char
			fUtf8String = new(nothrow) char[allocationLength];
			if (fUtf8String) {
				char *outputString = fUtf8String;

				for (int32 i = 0; i < maxLength && inputString[i]; i++) {
					unicode_to_utf8(inputString[i], &outputString);
				}
				outputString[0] = 0;
			} else {
				PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
			}

			break;
		}

		case 16:
		case 255:
		{
			const uint16 *inputString = reinterpret_cast<const uint16*>(&(cs0[1]));
			int32 maxLength = (length-1) / 2;		// Max length of input string in uint16 characters
			int32 allocationLength = maxLength*3+1;	// Need at most 3 utf8 chars per uint16 char
			fUtf8String = new(nothrow) char[allocationLength];
			if (fUtf8String) {
				char *outputString = fUtf8String;

				for (int32 i = 0; i < maxLength && inputString[i]; i++) {
					unicode_to_utf8(B_BENDIAN_TO_HOST_INT16(inputString[i]), &outputString);
				}
				outputString[0] = 0;
			} else {
				PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
			}

			break;
		}

		default:
			PRINT(("invalid compression id!\n"));
			break;
	}
}

void
String::_Clear()
{
	DEBUG_INIT("String");

	delete [] fCs0String;
	fCs0String = NULL;
	delete [] fUtf8String;
	fUtf8String = NULL;
}