1/*
2 * Copyright 2004-2010, Haiku, Inc.
3 * Distributed under the terms of the MIT License.
4 */
5#ifndef _UTF8_FUNCTIONS_H
6#define _UTF8_FUNCTIONS_H
7
8
9#include <SupportDefs.h>
10
11
12static inline bool
13IsInsideGlyph(uchar ch)
14{
15	return (ch & 0xc0) == 0x80;
16}
17
18
19static inline uint32
20UTF8NextCharLenUnsafe(const char *text)
21{
22	const char *ptr = text;
23
24	do {
25		ptr++;
26	} while (IsInsideGlyph(*ptr));
27
28	return ptr - text;
29}
30
31
32static inline uint32
33UTF8NextCharLen(const char *text)
34{
35	if (text == NULL || *text == 0)
36		return 0;
37
38	return UTF8NextCharLenUnsafe(text);
39}
40
41
42static inline uint32
43UTF8NextCharLen(const char *bytes, size_t length)
44{
45	if (bytes == NULL || length == 0 || bytes[0] == 0)
46		return 0;
47
48	if ((bytes[0] & 0x80) == 0) {
49		// A single ASCII char - or so...
50		return 1;
51	}
52
53	if (IsInsideGlyph(bytes[0])) {
54		// Not a proper multibyte start.
55		return 0;
56	}
57
58	// We already know that we have the upper two bits set due to the above
59	// two checks.
60	uint8 mask = 0x20;
61	size_t bytesExpected = 2;
62	while ((bytes[0] & mask) != 0) {
63		if (mask == 0x02) {
64			// Seven byte char - invalid.
65			return 0;
66		}
67
68		bytesExpected++;
69		mask >>= 1;
70	}
71
72	// There would need to be more bytes to satisfy the char.
73	if (bytesExpected > length)
74		return 0;
75
76	// We already know the first byte is fine, check the rest.
77	for (size_t i = 1; i < bytesExpected; i++) {
78		if (!IsInsideGlyph(bytes[i])) {
79			// The sequence is incomplete.
80			return 0;
81		}
82	}
83
84	// Puh, everything's fine.
85	return bytesExpected;
86}
87
88
89static inline uint32
90UTF8PreviousCharLen(const char *text, const char *limit)
91{
92	const char *ptr = text;
93
94	if (ptr == NULL || limit == NULL)
95		return 0;
96
97	do {
98		if (ptr == limit)
99			break;
100		ptr--;
101	} while (IsInsideGlyph(*ptr));
102
103	return text - ptr;
104}
105
106
107/*!	UTF8CountBytes gets the length (in bytes) of a UTF8 string. Up to
108	numChars characters are read. If numChars is a negative value it is ignored
109	and the string is read up to the terminating 0.
110*/
111static inline uint32
112UTF8CountBytes(const char *bytes, int32 numChars)
113{
114	if (bytes == NULL)
115		return 0;
116
117	if (numChars < 0)
118		numChars = INT_MAX;
119
120	const char *base = bytes;
121	while (bytes[0] != '\0') {
122		if ((bytes[0] & 0xc0) != 0x80) {
123			if (--numChars < 0)
124				break;
125		}
126		bytes++;
127	}
128
129	return bytes - base;
130}
131
132
133/*!	UTF8CountChars gets the length (in characters) of a UTF8 string. Up to
134	numBytes bytes are read. If numBytes is a negative value it is ignored
135	and the string is read up to the terminating 0.
136*/
137static inline uint32
138UTF8CountChars(const char *bytes, int32 numBytes)
139{
140	if (bytes == NULL)
141		return 0;
142
143	uint32 length = 0;
144	const char *last;
145	if (numBytes < 0)
146		last = (const char *)SIZE_MAX;
147	else
148		last = bytes + numBytes - 1;
149
150	while (bytes[0] && bytes <= last) {
151		if ((bytes++[0] & 0xc0) != 0x80)
152			length++;
153	}
154
155	return length;
156}
157
158
159/*!	UTF8ToCharCode converts the input that includes potential multibyte chars
160	to UTF-32 char codes that can be used by FreeType. The string pointer is
161	then advanced to the next character in the string. In case the terminating
162	0 is reached, the string pointer is not advanced anymore and nulls are
163	returned. This makes it safe to overruns and enables streamed processing
164	of UTF8 strings.
165*/
166static inline uint32
167UTF8ToCharCode(const char **bytes)
168{
169	#define UTF8_SUBSTITUTE_CHARACTER	0xfffd
170
171	uint32 result;
172	if (((*bytes)[0] & 0x80) == 0) {
173		// a single byte character
174		result = (*bytes)[0];
175		if (result != '\0') {
176			// do not advance beyond the terminating '\0'
177			(*bytes)++;
178		}
179
180		return result;
181	}
182
183	if (((*bytes)[0] & 0xc0) == 0x80) {
184		// not a proper multibyte start
185		(*bytes)++;
186		return UTF8_SUBSTITUTE_CHARACTER;
187	}
188
189	// start of a multibyte character
190	uint8 mask = 0x80;
191	result = (uint32)((*bytes)[0] & 0xff);
192	(*bytes)++;
193
194	while (result & mask) {
195		if (mask == 0x02) {
196			// seven byte char - invalid
197			return UTF8_SUBSTITUTE_CHARACTER;
198		}
199
200		result &= ~mask;
201		mask >>= 1;
202	}
203
204	while (((*bytes)[0] & 0xc0) == 0x80) {
205		result <<= 6;
206		result += (*bytes)[0] & 0x3f;
207		(*bytes)++;
208
209		mask <<= 1;
210		if (mask == 0x40)
211			return result;
212	}
213
214	if (mask == 0x40)
215		return result;
216
217	if ((*bytes)[0] == '\0') {
218		// string terminated within multibyte char
219		return 0x00;
220	}
221
222	// not enough bytes in multibyte char
223	return UTF8_SUBSTITUTE_CHARACTER;
224
225	#undef UTF8_SUBSTITUTE_CHARACTER
226}
227
228#endif	// _UTF8_FUNCTIONS_H
229