1/*	$NetBSD$	*/
2
3/* OpenLDAP: pkg/ldap/libraries/libldap/utf-8-conv.c,v 1.16.2.6 2010/04/13 20:23:01 kurt Exp */
4/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 *
6 * Copyright 1998-2010 The OpenLDAP Foundation.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted only as authorized by the OpenLDAP
11 * Public License.
12 *
13 * A copy of this license is available in the file LICENSE in the
14 * top-level directory of the distribution or, alternatively, at
15 * <http://www.OpenLDAP.org/license.html>.
16 */
17/* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
18 *
19 * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
20 * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
21 * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
22 * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
23 * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
24 * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
25 * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
26 * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
27 *---
28 * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
29 * can be found in the file "build/LICENSE-2.0.1" in this distribution
30 * of OpenLDAP Software.
31 */
32
33/*
34 * UTF-8 Conversion Routines
35 *
36 * These routines convert between Wide Character and UTF-8,
37 * or between MultiByte and UTF-8 encodings.
38 *
39 * Both single character and string versions of the functions are provided.
40 * All functions return -1 if the character or string cannot be converted.
41 */
42
43#include "portable.h"
44
45#if SIZEOF_WCHAR_T >= 4
46/* These routines assume ( sizeof(wchar_t) >= 4 ) */
47
#include <stdio.h>
#include <limits.h>		/* For MB_LEN_MAX */
#include <ac/stdlib.h>		/* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>		/* for time_t */
52
53#include "ldap-int.h"
54
55#include <ldap_utf8.h>
56
/* Payload-bit mask for the first byte of a UTF-8 sequence, indexed by the
 * sequence length in bytes (1..6).  Index 0 is unused.  The table is
 * read-only, hence const. */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
58
59
60/*-----------------------------------------------------------------------------
61					UTF-8 Format Summary
62
63ASCII chars 						7 bits
64    0xxxxxxx
65
662-character UTF-8 sequence:        11 bits
67    110xxxxx  10xxxxxx
68
693-character UTF-8                  16 bits
70    1110xxxx  10xxxxxx  10xxxxxx
71
724-char UTF-8                       21 bits
73    11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
74
755-char UTF-8                       26 bits
76    111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
77
786-char UTF-8                       31 bits
79    1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
80
81Unicode address space   (0 - 0x10FFFF)    21 bits
82ISO-10646 address space (0 - 0x7FFFFFFF)  31 bits
83
84Note: This code does not prevent UTF-8 sequences which are longer than
85      necessary from being decoded.
86*/
87
88/*-----------------------------------------------------------------------------
89   Convert a UTF-8 character to a wide char.
90   Return the length of the UTF-8 input character in bytes.
91*/
92int
93ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
94{
95	int utflen, i;
96	wchar_t ch;
97
98	if (utf8char == NULL) return -1;
99
100	/* Get UTF-8 sequence length from 1st byte */
101	utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
102
103	if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
104
105	/* First byte minus length tag */
106	ch = (wchar_t)(utf8char[0] & mask[utflen]);
107
108	for(i=1; i < utflen; i++) {
109		/* Subsequent bytes must start with 10 */
110		if ((utf8char[i] & 0xc0) != 0x80) return -1;
111
112		ch <<= 6;			/* 6 bits of data in each subsequent byte */
113		ch |= (wchar_t)(utf8char[i] & 0x3f);
114	}
115
116	if (wchar) *wchar = ch;
117
118	return utflen;
119}
120
121/*-----------------------------------------------------------------------------
122   Convert a UTF-8 string to a wide char string.
123   No more than 'count' wide chars will be written to the output buffer.
124   Return the size of the converted string in wide chars, excl null terminator.
125*/
126int
127ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
128{
129	size_t wclen = 0;
130	int utflen, i;
131	wchar_t ch;
132
133
134	/* If input ptr is NULL or empty... */
135	if (utf8str == NULL || !*utf8str) {
136		if ( wcstr )
137			*wcstr = 0;
138		return 0;
139	}
140
141	/* Examine next UTF-8 character.  If output buffer is NULL, ignore count */
142	while ( *utf8str && (wcstr==NULL || wclen<count) ) {
143		/* Get UTF-8 sequence length from 1st byte */
144		utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen);
145
146		if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
147
148		/* First byte minus length tag */
149		ch = (wchar_t)(utf8str[0] & mask[utflen]);
150
151		for(i=1; i < utflen; i++) {
152			/* Subsequent bytes must start with 10 */
153			if ((utf8str[i] & 0xc0) != 0x80) return -1;
154
155			ch <<= 6;			/* 6 bits of data in each subsequent byte */
156			ch |= (wchar_t)(utf8str[i] & 0x3f);
157		}
158
159		if (wcstr) wcstr[wclen] = ch;
160
161		utf8str += utflen;	/* Move to next UTF-8 character */
162		wclen++;			/* Count number of wide chars stored/required */
163	}
164
165	/* Add null terminator if there's room in the buffer. */
166	if (wcstr && wclen < count) wcstr[wclen] = 0;
167
168	return wclen;
169}
170
171
/*-----------------------------------------------------------------------------
   Convert one wide char to a UTF-8 character.
   Return the length of the converted UTF-8 character in bytes.
   No more than 'count' bytes will be written to the output buffer.

   If 'utf8char' is NULL, nothing is written, 'count' is ignored, and the
   return value is the number of bytes the encoding requires.
   Returns 0 (writing nothing) if the encoding does not fit in 'count'
   bytes, and -1 if 'wchar' is outside the encodable range 0 - 0x7FFFFFFF.
   No null terminator is appended.
*/
int
ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
{
	/* Length tag for the first byte, indexed by sequence length (1..6) */
	static const unsigned char lead[] =
		{ 0, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
	int len, i;

	/* Only 31 bits can be encoded.  Testing the bits above bit 30 rejects
	   negative values when wchar_t is signed and values >= 0x80000000 when
	   it is unsigned or wider than 32 bits, without depending on the
	   signedness or size of wchar_t. */
	if ( ( wchar & ~(wchar_t)0x7fffffffL ) != 0 )
		return -1;

	/* Number of bytes needed for this code point */
	if ( wchar < 0x80 )
		len = 1;
	else if ( wchar < 0x800 )
		len = 2;
	else if ( wchar < 0x10000 )
		len = 3;
	else if ( wchar < 0x200000 )
		len = 4;
	else if ( wchar < 0x4000000 )
		len = 5;
	else
		len = 6;

	if ( utf8char == NULL )		/* Caller only wants the required length */
		return len;

	if ( count < (size_t)len )	/* Won't fit: write nothing */
		return 0;

	/* Fill the continuation bytes from last to first, 6 bits apiece */
	for ( i = len - 1; i > 0; i-- ) {
		utf8char[i] = (char)( 0x80 | ( wchar & 0x3f ) );
		wchar >>= 6;
	}

	/* What remains of wchar fits beside the length tag in the first byte */
	utf8char[0] = (char)( lead[len] | wchar );

	return len;
}
267
268
269/*-----------------------------------------------------------------------------
270   Convert a wide char string to a UTF-8 string.
271   No more than 'count' bytes will be written to the output buffer.
272   Return the # of bytes written to the output buffer, excl null terminator.
273*/
274int
275ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
276{
277	int len = 0;
278	int n;
279	char *p = utf8str;
280	wchar_t empty = 0;		/* To avoid use of L"" construct */
281
282	if (wcstr == NULL)		/* Treat input ptr NULL as an empty string */
283		wcstr = &empty;
284
285	if (utf8str == NULL)	/* Just compute size of output, excl null */
286	{
287		while (*wcstr)
288		{
289			/* Get UTF-8 size of next wide char */
290			n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
291			if (n == -1)
292				return -1;
293			len += n;
294		}
295
296		return len;
297	}
298
299
300	/* Do the actual conversion. */
301
302	n = 1;					/* In case of empty wcstr */
303	while (*wcstr)
304	{
305		n = ldap_x_wc_to_utf8( p, *wcstr++, count);
306
307		if (n <= 0)  		/* If encoding error (-1) or won't fit (0), quit */
308			break;
309
310		p += n;
311		count -= n;			/* Space left in output buffer */
312	}
313
314	/* If not enough room for last character, pad remainder with null
315	   so that return value = original count, indicating buffer full. */
316	if (n == 0)
317	{
318		while (count--)
319			*p++ = 0;
320	}
321
322	/* Add a null terminator if there's room. */
323	else if (count)
324		*p = 0;
325
326	if (n == -1)			/* Conversion encountered invalid wide char. */
327		return -1;
328
329	/* Return the number of bytes written to output buffer, excl null. */
330	return (p - utf8str);
331}
332
333
/*-----------------------------------------------------------------------------
   Convert a UTF-8 character to a MultiByte character.
   'f_wctomb' is the wide-to-multibyte converter to use; NULL selects the
   C library's wctomb().
   If 'mbchar' is NULL the character is converted into a scratch buffer,
   so the size is still reported but no output is kept.
   Return the size of the converted character in bytes (as reported by the
   converter), or -1 if 'utf8char' is not a valid UTF-8 character.
*/
int
ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
		int (*f_wctomb)(char *mbchar, wchar_t wchar) )
{
	wchar_t wchar;
	int n;
	/* Scratch output for the mbchar == NULL case.  wctomb() may store up
	   to MB_CUR_MAX bytes, which is bounded by MB_LEN_MAX; never make the
	   buffer smaller than the 6 bytes it has always provided. */
	char tmp[ MB_LEN_MAX > 6 ? MB_LEN_MAX : 6 ];

	if (f_wctomb == NULL)		/* If no conversion function was given... */
		f_wctomb = wctomb;		/*    use the local ANSI C function */

	/* First convert UTF-8 char to a wide char */
	n = ldap_x_utf8_to_wc( &wchar, utf8char);

	if (n == -1)
		return -1;		/* Invalid UTF-8 character */

	/* Then convert the wide char to a multibyte char */
	if (mbchar == NULL)
		n = f_wctomb( tmp, wchar );
	else
		n = f_wctomb( mbchar, wchar);

	return n;
}
362
/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a MultiByte string.
   The input is first decoded into a temporary wide char string, which is
   then passed to 'f_wcstombs' (NULL selects the C library's wcstombs())
   together with 'mbstr' and 'count'.
   A NULL or empty 'utf8str' yields an empty output string and returns 0.
   Return the value reported by the converter, or -1 if memory cannot be
   allocated or the input is not valid UTF-8.
*/
int
ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
		size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
{
	wchar_t *widebuf;
	size_t widecap;
	int result;

	/* Fall back on the ANSI C converter when the caller supplies none */
	if ( f_wcstombs == NULL ) {
		f_wcstombs = wcstombs;
	}

	/* Nothing to convert */
	if ( utf8str == NULL || *utf8str == 0 ) {
		if ( mbstr != NULL ) {
			*mbstr = 0;
		}
		return 0;
	}

	/* Every wide char consumes at least one input byte, so the byte
	   length plus a terminator is always enough room. */
	widecap = strlen( utf8str ) + 1;
	widebuf = (wchar_t *)LDAP_MALLOC( widecap * sizeof(wchar_t) );
	if ( widebuf == NULL ) {
		return -1;			/* Out of memory */
	}

	/* UTF-8 -> wide chars, then wide chars -> multibyte */
	result = ldap_x_utf8s_to_wcs( widebuf, utf8str, widecap );
	if ( result != -1 ) {
		result = f_wcstombs( mbstr, widebuf, count );
	}

	LDAP_FREE( widebuf );

	return result;
}
405
406/*-----------------------------------------------------------------------------
407   Convert a MultiByte character to a UTF-8 character.
408   'mbsize' indicates the number of bytes of 'mbchar' to check.
409   Returns the number of bytes written to the output character.
410*/
411int
412ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
413		int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
414{
415    wchar_t wchar;
416    int n;
417
418	if (f_mbtowc == NULL)		/* If no conversion function was given... */
419		f_mbtowc = mbtowc;		/*    use the local ANSI C function */
420
421    if (mbsize == 0)				/* 0 is not valid. */
422        return -1;
423
424    if (mbchar == NULL || *mbchar == 0)
425    {
426        if (utf8char)
427            *utf8char = 0;
428        return 1;
429    }
430
431	/* First convert the MB char to a Wide Char */
432	n = f_mbtowc( &wchar, mbchar, mbsize);
433
434	if (n == -1)
435		return -1;
436
437	/* Convert the Wide Char to a UTF-8 character. */
438	n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
439
440	return n;
441}
442
443
/*-----------------------------------------------------------------------------
   Convert a MultiByte string to a UTF-8 string.
   The input is first converted into a temporary wide char string by
   'f_mbstowcs' (NULL selects the C library's mbstowcs()), which is then
   encoded by ldap_x_wcs_to_utf8s() using 'utf8str' and 'count'.
   A NULL 'mbstr' is handled as an empty string.
   Return the result of ldap_x_wcs_to_utf8s(), or -1 if memory cannot be
   allocated or the converter reports failure.
*/
int
ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
		size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
{
	wchar_t *widebuf;
	size_t widecap;
	int result;

	if ( mbstr == NULL ) {
		mbstr = "";			/* NULL input counts as empty */
	}

	/* Fall back on the ANSI C converter when the caller supplies none */
	if ( f_mbstowcs == NULL ) {
		f_mbstowcs = mbstowcs;
	}

	/* One wide char per input byte, plus a terminator, is the most the
	   temporary buffer is asked to hold. */
	widecap = strlen( mbstr ) + 1;
	widebuf = (wchar_t *)LDAP_MALLOC( widecap * sizeof(wchar_t) );
	if ( widebuf == NULL ) {
		return -1;
	}

	/* Multibyte -> wide chars, then wide chars -> UTF-8 */
	result = (int)f_mbstowcs( widebuf, mbstr, widecap );
	if ( result != -1 ) {
		result = ldap_x_wcs_to_utf8s( utf8str, widebuf, count );
	}

	LDAP_FREE( widebuf );

	return result;
}
482
483#endif /* SIZEOF_WCHAR_T >= 4 */
484