1/*
2 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
3 * Use is subject to license terms.
4 */
5
6/*
7 * util/support/utf8_conv.c
8 *
9 * Copyright 2008 by the Massachusetts Institute of Technology.
10 * All Rights Reserved.
11 *
12 * Export of this software from the United States of America may
13 *   require a specific license from the United States Government.
14 *   It is the responsibility of any person or organization contemplating
15 *   export to obtain such a license before exporting.
16 *
17 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
18 * distribute this software and its documentation for any purpose and
19 * without fee is hereby granted, provided that the above copyright
20 * notice appear in all copies and that both that copyright notice and
21 * this permission notice appear in supporting documentation, and that
22 * the name of M.I.T. not be used in advertising or publicity pertaining
23 * to distribution of the software without specific, written prior
24 * permission.  Furthermore if you modify this software you must label
25 * your software as modified software and not distribute it in such a
26 * fashion that it might be confused with the original M.I.T. software.
27 * M.I.T. makes no representations about the suitability of
28 * this software for any purpose.  It is provided "as is" without express
29 * or implied warranty.
30 */
31/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
32 *
33 * Copyright 1998-2008 The OpenLDAP Foundation.
34 * All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted only as authorized by the OpenLDAP
38 * Public License.
39 *
40 * A copy of this license is available in the file LICENSE in the
41 * top-level directory of the distribution or, alternatively, at
42 * <http://www.OpenLDAP.org/license.html>.
43 */
44/* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
45 *
46 * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
47 * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
48 * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
49 * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
50 * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
51 * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
52 * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
53 * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
54 */
55
56/*
57 * UTF-8 Conversion Routines
58 *
59 * These routines convert between Wide Character and UTF-8,
60 * or between MultiByte and UTF-8 encodings.
61 *
62 * Both single character and string versions of the functions are provided.
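 * The internal (static) helpers return -1 if the character or string cannot
 * be converted; the public krb5int_* wrappers return 0 on success or an
 * errno value (ENOMEM, EINVAL, ERANGE) on failure.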
64 */
65
#include "k5-platform.h"
#include "k5-utf8.h"
#include "supp-int.h"
#include "errno.h"  /* SUNW17PACresync */
#include <string.h>
70
/*
 * Data-bit masks for the first byte of a UTF-8 sequence, indexed by the
 * sequence length in bytes (index 0 is unused).  ANDing the lead byte with
 * mask[len] strips the length tag and leaves only the payload bits.
 * The table is only ever read, so it is const.
 */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
72
73static ssize_t
74k5_utf8s_to_ucs2s(krb5_ucs2 *ucs2str,
75		  const char *utf8str,
76		  size_t count,
77		  int little_endian)
78{
79    size_t ucs2len = 0;
80    size_t utflen, i;
81    krb5_ucs2 ch;
82
83    /* If input ptr is NULL or empty... */
84    if (utf8str == NULL || *utf8str == '\0') {
85	*ucs2str = 0;
86
87	return 0;
88    }
89
90    /* Examine next UTF-8 character.  */
91    while (*utf8str && ucs2len < count) {
92	/* Get UTF-8 sequence length from 1st byte */
93	utflen = KRB5_UTF8_CHARLEN2(utf8str, utflen);
94
95	if (utflen == 0 || utflen > KRB5_MAX_UTF8_LEN)
96	    return -1;
97
98	/* First byte minus length tag */
99	ch = (krb5_ucs2)(utf8str[0] & mask[utflen]);
100
101	for (i = 1; i < utflen; i++) {
102	    /* Subsequent bytes must start with 10 */
103	    if ((utf8str[i] & 0xc0) != 0x80)
104		return -1;
105
106	    ch <<= 6;			/* 6 bits of data in each subsequent byte */
107	    ch |= (krb5_ucs2)(utf8str[i] & 0x3f);
108	}
109
110	if (ucs2str != NULL) {
111#ifdef K5_BE
112#ifndef SWAP16
113#define SWAP16(X)	((((X) << 8) | ((X) >> 8)) & 0xFFFF)
114#endif
115	    if (little_endian)
116		ucs2str[ucs2len] = SWAP16(ch);
117	    else
118#endif
119		ucs2str[ucs2len] = ch;
120	}
121
122	utf8str += utflen;	/* Move to next UTF-8 character */
123	ucs2len++;		/* Count number of wide chars stored/required */
124    }
125
126    assert(ucs2len < count);
127
128    if (ucs2str != NULL) {
129	/* Add null terminator if there's room in the buffer. */
130	ucs2str[ucs2len] = 0;
131    }
132
133    return ucs2len;
134}
135
136int
137krb5int_utf8s_to_ucs2s(const char *utf8s,
138		       krb5_ucs2 **ucs2s,
139		       size_t *ucs2chars)
140{
141    ssize_t len;
142    size_t chars;
143
144    chars = krb5int_utf8_chars(utf8s);
145    *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2));
146    if (*ucs2s == NULL) {
147	return ENOMEM;
148    }
149
150    len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars + 1, 0);
151    if (len < 0) {
152	free(*ucs2s);
153	*ucs2s = NULL;
154	return EINVAL;
155    }
156
157    if (ucs2chars != NULL) {
158	*ucs2chars = chars;
159    }
160
161    return 0;
162}
163
164int
165krb5int_utf8cs_to_ucs2s(const char *utf8s,
166			size_t utf8slen,
167			krb5_ucs2 **ucs2s,
168			size_t *ucs2chars)
169{
170    ssize_t len;
171    size_t chars;
172
173    chars = krb5int_utf8c_chars(utf8s, utf8slen);
174    *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2));
175    if (*ucs2s == NULL) {
176	return ENOMEM;
177    }
178
179    len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars + 1, 0);
180    if (len < 0) {
181	free(*ucs2s);
182	*ucs2s = NULL;
183	return EINVAL;
184    }
185
186    if (ucs2chars != NULL) {
187	*ucs2chars = chars;
188    }
189
190    return 0;
191}
192
193int
194krb5int_utf8s_to_ucs2les(const char *utf8s,
195                         unsigned char **ucs2les,
196			 size_t *ucs2leslen)
197{
198    ssize_t len;
199    size_t chars;
200
201    chars = krb5int_utf8_chars(utf8s);
202
203    *ucs2les = (unsigned char *)malloc((chars + 1) * sizeof(krb5_ucs2));
204    if (*ucs2les == NULL) {
205	return ENOMEM;
206    }
207
208    len = k5_utf8s_to_ucs2s((krb5_ucs2 *)*ucs2les, utf8s, chars + 1, 1);
209    if (len < 0) {
210	free(*ucs2les);
211	*ucs2les = NULL;
212	return EINVAL;
213    }
214
215    if (ucs2leslen != NULL) {
216	*ucs2leslen = chars * sizeof(krb5_ucs2);
217    }
218
219    return 0;
220}
221
222int
223krb5int_utf8cs_to_ucs2les(const char *utf8s,
224			  size_t utf8slen,
225			  unsigned char **ucs2les,
226			  size_t *ucs2leslen)
227{
228    ssize_t len;
229    size_t chars;
230
231    chars = krb5int_utf8c_chars(utf8s, utf8slen);
232
233    *ucs2les = (unsigned char *)malloc((chars + 1) * sizeof(krb5_ucs2));
234    if (*ucs2les == NULL) {
235	return ENOMEM;
236    }
237
238    len = k5_utf8s_to_ucs2s((krb5_ucs2 *)*ucs2les, utf8s, chars + 1, 1);
239    if (len < 0) {
240	free(*ucs2les);
241	*ucs2les = NULL;
242	return EINVAL;
243    }
244
245    if (ucs2leslen != NULL) {
246	*ucs2leslen = chars * sizeof(krb5_ucs2);
247    }
248
249    return 0;
250}
251
252/*-----------------------------------------------------------------------------
253   Convert a wide char string to a UTF-8 string.
254   No more than 'count' bytes will be written to the output buffer.
255   Return the # of bytes written to the output buffer, excl null terminator.
256
257   ucs2len is -1 if the UCS-2 string is NUL terminated, otherwise it is the
258   length of the UCS-2 string in characters
259*/
260static ssize_t
261k5_ucs2s_to_utf8s(char *utf8str, const krb5_ucs2 *ucs2str,
262		  size_t count, ssize_t ucs2len, int little_endian)
263{
264    int len = 0;
265    int n;
266    char *p = utf8str;
267    krb5_ucs2 empty = 0, ch;
268
269    if (ucs2str == NULL)	/* Treat input ptr NULL as an empty string */
270	ucs2str = &empty;
271
272    if (utf8str == NULL)	/* Just compute size of output, excl null */
273    {
274	while (ucs2len == -1 ? *ucs2str : --ucs2len >= 0) {
275	    /* Get UTF-8 size of next wide char */
276	  ch = *ucs2str++;
277#ifdef K5_BE
278	    if (little_endian)
279		ch = SWAP16(ch);
280#endif
281
282	    n = krb5int_ucs2_to_utf8(ch, NULL);
283	    if (n < 1)
284		return -1;
285	    if (len + n < len)
286		return -1; /* overflow */
287	    len += n;
288	}
289
290	return len;
291    }
292
293    /* Do the actual conversion. */
294
295    n = 1;					/* In case of empty ucs2str */
296    while (ucs2len == -1 ? *ucs2str != 0 : --ucs2len >= 0) {
297      ch = *ucs2str++;
298#ifdef K5_BE
299	if (little_endian)
300	    ch = SWAP16(ch);
301#endif
302
303	n = krb5int_ucs2_to_utf8(ch, p);
304
305	if (n < 1)
306	    break;
307
308	p += n;
309	count -= n;			/* Space left in output buffer */
310    }
311
312    /* If not enough room for last character, pad remainder with null
313       so that return value = original count, indicating buffer full. */
314    if (n == 0) {
315	while (count--)
316	    *p++ = 0;
317    }
318    /* Add a null terminator if there's room. */
319    else if (count)
320	*p = 0;
321
322    if (n == -1)			/* Conversion encountered invalid wide char. */
323	return -1;
324
325    /* Return the number of bytes written to output buffer, excl null. */
326    return (p - utf8str);
327}
328
329int
330krb5int_ucs2s_to_utf8s(const krb5_ucs2 *ucs2s,
331		       char **utf8s,
332		       size_t *utf8slen)
333{
334    ssize_t len;
335
336    len = k5_ucs2s_to_utf8s(NULL, ucs2s, 0, -1, 0);
337    if (len < 0) {
338	return EINVAL;
339    }
340
341    *utf8s = (char *)malloc((size_t)len + 1);
342    if (*utf8s == NULL) {
343	return ENOMEM;
344    }
345
346    len = k5_ucs2s_to_utf8s(*utf8s, ucs2s, (size_t)len + 1, -1, 0);
347    if (len < 0) {
348	free(*utf8s);
349	*utf8s = NULL;
350	return EINVAL;
351    }
352
353    if (utf8slen != NULL) {
354	*utf8slen = len;
355    }
356
357    return 0;
358}
359
360int
361krb5int_ucs2les_to_utf8s(const unsigned char *ucs2les,
362			 char **utf8s,
363			 size_t *utf8slen)
364{
365    ssize_t len;
366
367    len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0, -1, 1);
368    if (len < 0)
369	return EINVAL;
370
371    *utf8s = (char *)malloc((size_t)len + 1);
372    if (*utf8s == NULL) {
373	return ENOMEM;
374    }
375
376    len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les, (size_t)len + 1, -1, 1);
377    if (len < 0) {
378	free(*utf8s);
379	*utf8s = NULL;
380	return EINVAL;
381    }
382
383    if (utf8slen != NULL) {
384	*utf8slen = len;
385    }
386
387    return 0;
388}
389
390int
391krb5int_ucs2cs_to_utf8s(const krb5_ucs2 *ucs2s,
392                        size_t ucs2slen,
393                        char **utf8s,
394                        size_t *utf8slen)
395{
396    ssize_t len;
397
398    if (ucs2slen > SSIZE_MAX)
399	return ERANGE;
400
401    len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2s, 0,
402			    (ssize_t)ucs2slen, 0);
403    if (len < 0)
404	return EINVAL;
405
406    *utf8s = (char *)malloc((size_t)len + 1);
407    if (*utf8s == NULL) {
408	return ENOMEM;
409    }
410
411    len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2s,
412			    (size_t)len + 1, (ssize_t)ucs2slen, 0);
413    if (len < 0) {
414	free(*utf8s);
415	*utf8s = NULL;
416	return EINVAL;
417    }
418
419    if (utf8slen != NULL) {
420	*utf8slen = len;
421    }
422
423    return 0;
424}
425
426int
427krb5int_ucs2lecs_to_utf8s(const unsigned char *ucs2les,
428                          size_t ucs2leslen,
429                          char **utf8s,
430                          size_t *utf8slen)
431{
432    ssize_t len;
433
434    if (ucs2leslen > SSIZE_MAX)
435	return ERANGE;
436
437    len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0,
438			    (ssize_t)ucs2leslen, 1);
439    if (len < 0)
440	return EINVAL;
441
442    *utf8s = (char *)malloc((size_t)len + 1);
443    if (*utf8s == NULL) {
444	return ENOMEM;
445    }
446
447    len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les,
448			    (size_t)len + 1, (ssize_t)ucs2leslen, 1);
449    if (len < 0) {
450	free(*utf8s);
451	*utf8s = NULL;
452	return EINVAL;
453    }
454
455    if (utf8slen != NULL) {
456	*utf8slen = len;
457    }
458
459    return 0;
460}
461
462