1/*
2 * Copyright (c) 2004, 2006, 2007, 2008 Kungliga Tekniska Högskolan
3 * (Royal Institute of Technology, Stockholm, Sweden).
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the Institute nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 */
33
34#include <config.h>
35#include "windlocl.h"
36
37static int
38utf8toutf32(const unsigned char **pp, uint32_t *out)
39{
40    const unsigned char *p = *pp;
41    unsigned c = *p;
42
43    if (c & 0x80) {
44	if ((c & 0xE0) == 0xC0) {
45	    const unsigned c2 = *++p;
46	    if ((c2 & 0xC0) == 0x80) {
47		*out =  ((c  & 0x1F) << 6)
48		    | (c2 & 0x3F);
49	    } else {
50		return WIND_ERR_INVALID_UTF8;
51	    }
52	} else if ((c & 0xF0) == 0xE0) {
53	    const unsigned c2 = *++p;
54	    if ((c2 & 0xC0) == 0x80) {
55		const unsigned c3 = *++p;
56		if ((c3 & 0xC0) == 0x80) {
57		    *out =   ((c  & 0x0F) << 12)
58			| ((c2 & 0x3F) << 6)
59			|  (c3 & 0x3F);
60		} else {
61		    return WIND_ERR_INVALID_UTF8;
62		}
63	    } else {
64		return WIND_ERR_INVALID_UTF8;
65	    }
66	} else if ((c & 0xF8) == 0xF0) {
67	    const unsigned c2 = *++p;
68	    if ((c2 & 0xC0) == 0x80) {
69		const unsigned c3 = *++p;
70		if ((c3 & 0xC0) == 0x80) {
71		    const unsigned c4 = *++p;
72		    if ((c4 & 0xC0) == 0x80) {
73			*out =   ((c  & 0x07) << 18)
74			    | ((c2 & 0x3F) << 12)
75			    | ((c3 & 0x3F) <<  6)
76			    |  (c4 & 0x3F);
77		    } else {
78			return WIND_ERR_INVALID_UTF8;
79		    }
80		} else {
81		    return WIND_ERR_INVALID_UTF8;
82		}
83	    } else {
84		return WIND_ERR_INVALID_UTF8;
85	    }
86	} else {
87	    return WIND_ERR_INVALID_UTF8;
88	}
89    } else {
90	*out = c;
91    }
92
93    *pp = p;
94
95    return 0;
96}
97
98/**
99 * Convert an UTF-8 string to an UCS4 string.
100 *
101 * @param in an UTF-8 string to convert.
102 * @param out the resulting UCS4 strint, must be at least
103 * wind_utf8ucs4_length() long.  If out is NULL, the function will
104 * calculate the needed space for the out variable (just like
105 * wind_utf8ucs4_length()).
106 * @param out_len before processing out_len should be the length of
107 * the out variable, after processing it will be the length of the out
108 * string.
109 *
110 * @return returns 0 on success, an wind error code otherwise
111 * @ingroup wind
112 */
113
114int
115wind_utf8ucs4(const char *in, uint32_t *out, size_t *out_len)
116{
117    const unsigned char *p;
118    size_t o = 0;
119    int ret;
120
121    for (p = (const unsigned char *)in; *p != '\0'; ++p) {
122	uint32_t u;
123
124	ret = utf8toutf32(&p, &u);
125	if (ret)
126	    return ret;
127
128	if (out) {
129	    if (o >= *out_len)
130		return WIND_ERR_OVERRUN;
131	    out[o] = u;
132	}
133	o++;
134    }
135    *out_len = o;
136    return 0;
137}
138
/**
 * Calculate the number of UCS4 elements produced by converting a
 * UTF-8 string.
 *
 * @param in the UTF-8 string to measure.
 * @param out_len set to the length of the resulting UCS4 string.
 *
 * @return returns 0 on success, a wind error code otherwise
 * @ingroup wind
 */

int
wind_utf8ucs4_length(const char *in, size_t *out_len)
{
    int ret;

    /* With no destination buffer the converter only counts. */
    ret = wind_utf8ucs4(in, NULL, out_len);
    return ret;
}
155
/**
 * Convert a UTF-8 string to a newly allocated UCS4 string.
 *
 * @param in the UTF-8 string to convert.
 * @param out set to the resulting UCS4 string, which must be released
 * with free().  It is set to NULL when the input is empty and on every
 * failure, so it is always safe to pass to free().
 * @param out_len set to the length of the out string; 0 on failure.
 *
 * @return returns 0 on success, a wind error code, ERANGE or ENOMEM
 * otherwise
 * @ingroup wind
 */

int
wind_utf8ucs4_copy(const char *in, uint32_t **out, size_t *out_len)
{
    int ret;

    /*
     * Give both outputs a defined value up front: several paths below
     * return without allocating, and the caller must never be left
     * holding an indeterminate pointer.
     */
    *out = NULL;
    *out_len = 0;

    ret = wind_utf8ucs4_length(in, out_len);
    if (ret) {
	*out_len = 0;
	return ret;
    }
    /* Guard the size multiplication below against overflow. */
    if (*out_len > UINT_MAX / sizeof((*out)[0])) {
	*out_len = 0;
	return ERANGE;
    }
    /* Empty input: success, nothing to allocate, *out stays NULL. */
    if (*out_len == 0)
	return 0;

    *out = malloc(*out_len * sizeof((*out)[0]));
    if (*out == NULL) {
	*out_len = 0;
	return ENOMEM;
    }

    ret = wind_utf8ucs4(in, *out, out_len);
    if (ret) {
	free(*out);
	*out = NULL;
	*out_len = 0;
    }
    return ret;
}
194
195
/*
 * Lead-byte marker bits for a UTF-8 sequence of 1, 2, 3 and 4 bytes,
 * indexed by (sequence length - 1).  The element type is unsigned char
 * because 0xC0, 0xE0 and 0xF0 do not fit in a signed char.
 */
static const unsigned char first_char[4] =
    { 0x00, 0xC0, 0xE0, 0xF0 };
198
199/**
200 * Convert an UCS4 string to a UTF-8 string.
201 *
202 * @param in an UCS4 string to convert.
203 * @param in_len the length input array.
204
205 * @param out the resulting UTF-8 strint, must be at least
206 * wind_ucs4utf8_length() + 1 long (the extra char for the NUL).  If
207 * out is NULL, the function will calculate the needed space for the
208 * out variable (just like wind_ucs4utf8_length()).
209
210 * @param out_len before processing out_len should be the length of
211 * the out variable, after processing it will be the length of the out
212 * string. NUL not included.
213 *
214 * @return returns 0 on success, an wind error code otherwise
215 * @ingroup wind
216 */
217
218int
219wind_ucs4utf8(const uint32_t *in, size_t in_len, char *out, size_t *out_len)
220{
221    uint32_t ch;
222    size_t i, len, o;
223
224    for (o = 0, i = 0; i < in_len; i++) {
225	ch = in[i];
226
227	if (ch < 0x80) {
228	    len = 1;
229	} else if (ch < 0x800) {
230	    len = 2;
231	} else if (ch < 0x10000) {
232	    len = 3;
233	} else if (ch <= 0x10FFFF) {
234	    len = 4;
235	} else
236	    return WIND_ERR_INVALID_UTF32;
237
238	o += len;
239
240	if (out) {
241	    if (o >= *out_len)
242		return WIND_ERR_OVERRUN;
243
244	    switch(len) {
245	    case 4:
246		out[3] = (ch | 0x80) & 0xbf;
247		ch = ch << 6;
248	    case 3:
249		out[2] = (ch | 0x80) & 0xbf;
250		ch = ch << 6;
251	    case 2:
252		out[1] = (ch | 0x80) & 0xbf;
253		ch = ch << 6;
254	    case 1:
255		out[0] = ch | first_char[len - 1];
256	    }
257	    out += len;
258	}
259    }
260    if (out) {
261	if (o >= *out_len)
262	    return WIND_ERR_OVERRUN;
263	*out = '\0';
264    }
265    *out_len = o;
266    return 0;
267}
268
/**
 * Calculate the number of bytes produced by converting a UCS4 string
 * to UTF-8.
 *
 * @param in the UCS4 string to measure.
 * @param in_len the number of elements in the UCS4 string.
 * @param out_len set to the length of the resulting UTF-8 string.
 *
 * @return returns 0 on success, a wind error code otherwise
 * @ingroup wind
 */

int
wind_ucs4utf8_length(const uint32_t *in, size_t in_len, size_t *out_len)
{
    int ret;

    /* With no destination buffer the converter only counts. */
    ret = wind_ucs4utf8(in, in_len, NULL, out_len);
    return ret;
}
285
/**
 * Convert a UCS4 string to a newly allocated UTF-8 string.
 *
 * @param in the UCS4 string to convert.
 * @param in_len the number of elements in the input array.
 * @param out set to an allocated, NUL-terminated string that should be
 * released with free().  It is set to NULL on every failure.
 * @param out_len if not NULL, set on success to the size of the out
 * string, NUL not included in count.
 *
 * @return returns 0 on success, a wind error code or ENOMEM otherwise
 * @ingroup wind
 */

int
wind_ucs4utf8_copy(const uint32_t *in, size_t in_len, char **out, size_t *out_len)
{
    size_t size;
    int ret;

    /* Defined value for the caller even if the length pass fails. */
    *out = NULL;

    ret = wind_ucs4utf8_length(in, in_len, &size);
    if (ret)
	return ret;

    /* one extra byte for the terminating NUL */
    size += 1;

    *out = malloc(size);
    if (*out == NULL)
	return ENOMEM;

    ret = wind_ucs4utf8(in, in_len, *out, &size);
    if (ret) {
	free(*out);
	*out = NULL;
	return ret;
    }

    if (out_len)
	*out_len = size;

    return 0;
}
326
327
328/**
329 * Read in an UCS2 from a buffer.
330 *
331 * @param ptr The input buffer to read from.
332 * @param len the length of the input buffer.
333 * @param flags Flags to control the behavior of the function.
334 * @param out the output UCS2, the array must be at least out/2 long.
335 * @param out_len the output length
336 *
337 * @return returns 0 on success, an wind error code otherwise.
338 * @ingroup wind
339 */
340
341int
342wind_ucs2read(const void *ptr, size_t len, unsigned int *flags,
343	      uint16_t *out, size_t *out_len)
344{
345    const unsigned char *p = ptr;
346    int little = ((*flags) & WIND_RW_LE);
347    size_t olen = *out_len;
348
349    /** if len is zero, flags are unchanged */
350    if (len == 0) {
351	*out_len = 0;
352	return 0;
353    }
354
355    /** if len is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */
356    if (len & 1)
357	return WIND_ERR_LENGTH_NOT_MOD2;
358
359    /**
360     * If the flags WIND_RW_BOM is set, check for BOM. If not BOM is
361     * found, check is LE/BE flag is already and use that otherwise
362     * fail with WIND_ERR_NO_BOM. When done, clear WIND_RW_BOM and
363     * the LE/BE flag and set the resulting LE/BE flag.
364     */
365    if ((*flags) & WIND_RW_BOM) {
366	uint16_t bom = (p[0] << 8) + p[1];
367	if (bom == 0xfffe || bom == 0xfeff) {
368	    little = (bom == 0xfffe);
369	    p += 2;
370	    len -= 2;
371	} else if (((*flags) & (WIND_RW_LE|WIND_RW_BE)) != 0) {
372	    /* little already set */
373	} else
374	    return WIND_ERR_NO_BOM;
375	*flags = ((*flags) & ~(WIND_RW_BOM|WIND_RW_LE|WIND_RW_BE));
376	*flags |= little ? WIND_RW_LE : WIND_RW_BE;
377    }
378
379    while (len) {
380	if (olen < 1)
381	    return WIND_ERR_OVERRUN;
382	if (little)
383	    *out = (p[1] << 8) + p[0];
384	else
385	    *out = (p[0] << 8) + p[1];
386	out++; p += 2; len -= 2; olen--;
387    }
388    *out_len -= olen;
389    return 0;
390}
391
392/**
393 * Write an UCS2 string to a buffer.
394 *
395 * @param in The input UCS2 string.
396 * @param in_len the length of the input buffer.
397 * @param flags Flags to control the behavior of the function.
398 * @param ptr The input buffer to write to, the array must be at least
399 * (in + 1) * 2 bytes long.
400 * @param out_len the output length
401 *
402 * @return returns 0 on success, an wind error code otherwise.
403 * @ingroup wind
404 */
405
406int
407wind_ucs2write(const uint16_t *in, size_t in_len, unsigned int *flags,
408	       void *ptr, size_t *out_len)
409{
410    unsigned char *p = ptr;
411    size_t len = *out_len;
412
413    /** If in buffer is not of length be mod 2, WIND_ERR_LENGTH_NOT_MOD2 is returned*/
414    if (len & 1)
415	return WIND_ERR_LENGTH_NOT_MOD2;
416
417    /** On zero input length, flags are preserved */
418    if (in_len == 0) {
419	*out_len = 0;
420	return 0;
421    }
422    /** If flags have WIND_RW_BOM set, the byte order mark is written
423     * first to the output data */
424    if ((*flags) & WIND_RW_BOM) {
425	uint16_t bom = 0xfffe;
426
427	if (len < 2)
428	    return WIND_ERR_OVERRUN;
429
430	if ((*flags) & WIND_RW_LE) {
431	    p[0] = (bom >> 8) & 0xff;
432	    p[1] = (bom     ) & 0xff;
433	} else {
434	    p[1] = (bom     ) & 0xff;
435	    p[0] = (bom >> 8) & 0xff;
436	}
437	len -= 2;
438    }
439
440    while (in_len) {
441	/** If the output wont fit into out_len, WIND_ERR_OVERRUN is returned */
442	if (len < 2)
443	    return WIND_ERR_OVERRUN;
444	if ((*flags) & WIND_RW_LE) {
445	    p[0] = (in[0] >> 8) & 0xff;
446	    p[1] = (in[0]     ) & 0xff;
447	} else {
448	    p[1] = (in[0]     ) & 0xff;
449	    p[0] = (in[0] >> 8) & 0xff;
450	}
451	len -= 2;
452	in_len--;
453	p += 2;
454	in++;
455    }
456    *out_len -= len;
457    return 0;
458}
459
460
461/**
462 * Convert an UTF-8 string to an UCS2 string.
463 *
464 * @param in an UTF-8 string to convert.
465 * @param out the resulting UCS2 strint, must be at least
466 * wind_utf8ucs2_length() long.  If out is NULL, the function will
467 * calculate the needed space for the out variable (just like
468 * wind_utf8ucs2_length()).
469 * @param out_len before processing out_len should be the length of
470 * the out variable, after processing it will be the length of the out
471 * string.
472 *
473 * @return returns 0 on success, an wind error code otherwise
474 * @ingroup wind
475 */
476
477int
478wind_utf8ucs2(const char *in, uint16_t *out, size_t *out_len)
479{
480    const unsigned char *p;
481    size_t o = 0;
482    int ret;
483
484    for (p = (const unsigned char *)in; *p != '\0'; ++p) {
485	uint32_t u;
486
487	ret = utf8toutf32(&p, &u);
488	if (ret)
489	    return ret;
490
491	if (u & 0xffff0000)
492	    return WIND_ERR_NOT_UTF16;
493
494	if (out) {
495	    if (o >= *out_len)
496		return WIND_ERR_OVERRUN;
497	    out[o] = u;
498	}
499	o++;
500    }
501    *out_len = o;
502    return 0;
503}
504
/**
 * Calculate the number of UCS2 elements produced by converting a
 * UTF-8 string.
 *
 * @param in the UTF-8 string to measure.
 * @param out_len set to the length of the resulting UCS2 string.
 *
 * @return returns 0 on success, a wind error code otherwise
 * @ingroup wind
 */

int
wind_utf8ucs2_length(const char *in, size_t *out_len)
{
    int ret;

    /* With no destination buffer the converter only counts. */
    ret = wind_utf8ucs2(in, NULL, out_len);
    return ret;
}
521
522/**
523 * Convert an UCS2 string to a UTF-8 string.
524 *
525 * @param in an UCS2 string to convert.
526 * @param in_len the length of the in UCS2 string.
527 * @param out the resulting UTF-8 strint, must be at least
528 * wind_ucs2utf8_length() long.  If out is NULL, the function will
529 * calculate the needed space for the out variable (just like
530 * wind_ucs2utf8_length()).
531 * @param out_len before processing out_len should be the length of
532 * the out variable, after processing it will be the length of the out
533 * string.
534 *
535 * @return returns 0 on success, an wind error code otherwise
536 * @ingroup wind
537 */
538
539int
540wind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len)
541{
542    uint16_t ch;
543    size_t i, len, o;
544
545    for (o = 0, i = 0; i < in_len; i++) {
546	ch = in[i];
547
548	if (ch < 0x80) {
549	    len = 1;
550	} else if (ch < 0x800) {
551	    len = 2;
552	} else
553	    len = 3;
554
555	o += len;
556
557	if (out) {
558	    if (o >= *out_len)
559		return WIND_ERR_OVERRUN;
560
561	    switch(len) {
562	    case 3:
563		out[2] = (ch | 0x80) & 0xbf;
564		ch = ch << 6;
565	    case 2:
566		out[1] = (ch | 0x80) & 0xbf;
567		ch = ch << 6;
568	    case 1:
569		out[0] = ch | first_char[len - 1];
570	    }
571	    out += len;
572	}
573    }
574    if (out) {
575	if (o >= *out_len)
576	    return WIND_ERR_OVERRUN;
577	*out = '\0';
578    }
579    *out_len = o;
580    return 0;
581}
582
/**
 * Calculate the number of bytes produced by converting a UCS2 string
 * to UTF-8.
 *
 * @param in the UCS2 string to measure.
 * @param in_len the number of elements in the UCS2 string.
 * @param out_len set to the length of the resulting UTF-8 string.
 *
 * @return returns 0 on success, a wind error code otherwise
 * @ingroup wind
 */

int
wind_ucs2utf8_length(const uint16_t *in, size_t in_len, size_t *out_len)
{
    int ret;

    /* With no destination buffer the converter only counts. */
    ret = wind_ucs2utf8(in, in_len, NULL, out_len);
    return ret;
}
599