1/*
2 * Copyright 2014 Jonathan Schleifer <js@webkeks.org>
3 * Copyright 2014 Haiku, Inc. All rights reserved.
4 *
5 * Distributed under the terms of the MIT License.
6 *
7 * Authors:
8 *		Jonathan Schleifer, js@webkeks.org
9 *		John Scipione, jscipione@gmail.com
10 */
11
12
13#include "convertutf.h"
14
15
16#include <ByteOrder.h>
17#include <Errors.h>
18#include <StorageDefs.h>
19
20
21static inline size_t
22glyph_length(uint32 glyph)
23{
24	if (glyph < 0x80)
25		return 1;
26	else if (glyph < 0x800)
27		return 2;
28	else if (glyph < 0x10000)
29		return 3;
30	else if (glyph < 0x110000)
31		return 4;
32
33	return 0;
34}
35
36
37static void
38encode_glyph(uint32 glyph, size_t glyphLength, char* buffer)
39{
40	if (glyphLength == 1) {
41		*buffer = glyph;
42	} else if (glyphLength == 2) {
43		*buffer++ = 0xC0 | (glyph >> 6);
44		*buffer = 0x80 | (glyph & 0x3F);
45	} else if (glyphLength == 3) {
46		*buffer++ = 0xE0 | (glyph >> 12);
47		*buffer++ = 0x80 | (glyph >> 6 & 0x3F);
48		*buffer = 0x80 | (glyph & 0x3F);
49	} else if (glyphLength == 4) {
50		*buffer++ = 0xF0 | (glyph >> 18);
51		*buffer++ = 0x80 | (glyph >> 12 & 0x3F);
52		*buffer++ = 0x80 | (glyph >> 6 & 0x3F);
53		*buffer = 0x80 | (glyph & 0x3F);
54	}
55}
56
57
58static ssize_t
59utf16_to_utf8(const uint16* source, size_t sourceCodeUnitCount, char* target,
60	size_t targetLength, bool isLittleEndian)
61{
62	if (source == NULL || sourceCodeUnitCount == 0
63		|| target == NULL || targetLength == 0) {
64		return B_BAD_VALUE;
65	}
66
67	ssize_t outLength = 0;
68
69	for (size_t i = 0; i < sourceCodeUnitCount; i++) {
70		uint32 glyph = isLittleEndian
71			? B_LENDIAN_TO_HOST_INT32(source[i])
72			: B_BENDIAN_TO_HOST_INT32(source[i]);
73
74		if ((glyph & 0xFC00) == 0xDC00) {
75			// missing high surrogate
76			return B_BAD_VALUE;
77		}
78
79		if ((glyph & 0xFC00) == 0xD800) {
80			if (sourceCodeUnitCount <= i + 1) {
81				// high surrogate at end of string
82				return B_BAD_VALUE;
83			}
84
85			uint32 low = isLittleEndian
86				? B_LENDIAN_TO_HOST_INT32(source[i + 1])
87				: B_BENDIAN_TO_HOST_INT32(source[i + 1]);
88			if ((low & 0xFC00) != 0xDC00) {
89				// missing low surrogate
90				return B_BAD_VALUE;
91			}
92
93			glyph = (((glyph & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000;
94			i++;
95		}
96
97		size_t glyphLength = glyph_length(glyph);
98		if (glyphLength == 0)
99			return B_BAD_VALUE;
100		else if (outLength + glyphLength >= targetLength
101			|| outLength + glyphLength >= B_FILE_NAME_LENGTH) {
102			// NUL terminate the string so the caller can use the
103			// abbreviated version in this case. Since the length
104			// isn't returned the caller will need to call strlen()
105			// to get the length of the string.
106			target[outLength] = '\0';
107			return B_NAME_TOO_LONG;
108		}
109
110		encode_glyph(glyph, glyphLength, target + outLength);
111		outLength += glyphLength;
112	}
113
114	target[outLength] = '\0';
115
116	return outLength;
117}
118
119
120ssize_t
121utf16le_to_utf8(const uint16* source, size_t sourceCodeUnitCount,
122	char* target, size_t targetLength)
123{
124	return utf16_to_utf8(source, sourceCodeUnitCount, target, targetLength,
125		true);
126}
127
128
129ssize_t
130utf16be_to_utf8(const uint16* source, size_t sourceCodeUnitCount,
131	char* target, size_t targetLength)
132{
133	return utf16_to_utf8(source, sourceCodeUnitCount, target, targetLength,
134		false);
135}
136