1/*
2 * util/support/utf8.c
3 *
4 * Copyright 2008 by the Massachusetts Institute of Technology.
5 * All Rights Reserved.
6 *
7 * Export of this software from the United States of America may
8 *   require a specific license from the United States Government.
9 *   It is the responsibility of any person or organization contemplating
10 *   export to obtain such a license before exporting.
11 *
12 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
13 * distribute this software and its documentation for any purpose and
14 * without fee is hereby granted, provided that the above copyright
15 * notice appear in all copies and that both that copyright notice and
16 * this permission notice appear in supporting documentation, and that
17 * the name of M.I.T. not be used in advertising or publicity pertaining
18 * to distribution of the software without specific, written prior
19 * permission.  Furthermore if you modify this software you must label
20 * your software as modified software and not distribute it in such a
21 * fashion that it might be confused with the original M.I.T. software.
22 * M.I.T. makes no representations about the suitability of
23 * this software for any purpose.  It is provided "as is" without express
24 * or implied warranty.
25 */
26/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
27 *
28 * Copyright 1998-2008 The OpenLDAP Foundation.
29 * All rights reserved.
30 *
31 * Redistribution and use in source and binary forms, with or without
32 * modification, are permitted only as authorized by the OpenLDAP
33 * Public License.
34 *
35 * A copy of this license is available in the file LICENSE in the
36 * top-level directory of the distribution or, alternatively, at
37 * <http://www.OpenLDAP.org/license.html>.
38 */
39/* Basic UTF-8 routines
40 *
41 * These routines are "dumb".  Though they understand UTF-8,
42 * they don't grok Unicode.  That is, they can push bits,
43 * but don't have a clue what the bits represent.  That's
44 * good enough for use with the KRB5 Client SDK.
45 *
46 * These routines are not optimized.
47 */
48
49#include "k5-platform.h"
50#include "k5-utf8.h"
51#include "supp-int.h"
52
/*
 * Return the number of bytes occupied by the NUL-terminated UTF-8
 * string p, NOT counting the terminating NUL byte.
 */
size_t krb5int_utf8_bytes(const char *p)
{
    const char *cursor = p;

    /* Walk forward until the terminator is reached. */
    while (*cursor != '\0')
        cursor++;

    return (size_t) (cursor - p);
}
67
/*
 * Count the characters in the NUL-terminated string p by stepping
 * through it with KRB5_UTF8_INCR until the terminator is reached.
 * No validation of the byte sequences is performed here.
 */
size_t krb5int_utf8_chars(const char *p)
{
    size_t count = 0;

    while (*p) {
        count++;
        KRB5_UTF8_INCR(p);
    }

    return count;
}
78
/*
 * Count the characters in the length-delimited buffer starting at p:
 * the result is the number of KRB5_UTF8_INCR steps taken before p
 * reaches or passes p + length.  No validation of the byte sequences
 * is performed here.
 */
size_t krb5int_utf8c_chars(const char *p, size_t length)
{
    const char *limit = p + length;
    size_t count = 0;

    while (p < limit) {
        count++;
        KRB5_UTF8_INCR(p);
    }

    return count;
}
90
/* Return the byte distance from p to the position KRB5_UTF8_NEXT(p). */
int krb5int_utf8_offset(const char *p)
{
    const char *following = KRB5_UTF8_NEXT(p);

    return following - p;
}
96
/*
 * Sequence length implied by a lead octet whose high bit is set.
 * The table is indexed by (octet ^ 0x80), i.e. entry 0 describes octet
 * 0x80 and entry 127 describes octet 0xff.  A value of 0 marks an octet
 * that cannot start a sequence.
 */
const char krb5int_utf8_lentab[] = {
    /* 0x80 - 0x8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 - 0xaf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 - 0xbf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xcf */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 - 0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};

/*
 * Return the length indicated by the first byte at p: 1 when the high
 * bit is clear, otherwise the krb5int_utf8_lentab[] entry for that byte
 * (which is 0 for bytes that cannot begin a sequence).
 */
int krb5int_utf8_charlen(const char *p)
{
    const unsigned char lead = *(const unsigned char *) p;

    if ((lead & 0x80) == 0)
        return 1;

    return krb5int_utf8_lentab[lead ^ 0x80];
}
117
118/*
119 * Make sure the UTF-8 char used the shortest possible encoding
120 * returns charlen if valid, 0 if not.
121 *
122 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
123 * The table is slightly modified from that of the RFC.
124 *
125 * UCS-4 range (hex)      UTF-8 sequence (binary)
126 * 0000 0000-0000 007F   0.......
127 * 0000 0080-0000 07FF   110++++. 10......
128 * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
129 * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
130 * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
131 * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
132 *
133 * The '.' bits are "don't cares". When validating a UTF-8 sequence,
134 * at least one of the '+' bits must be set, otherwise the character
135 * should have been encoded in fewer octets. Note that in the two-octet
136 * case, only the first octet needs to be validated, and this is done
137 * in the krb5int_utf8_lentab[] above.
138 */
139
/*
 * Mask of the bits of which at least one must be set in the second
 * octet of a sequence longer than two octets.  Callers index this
 * table with the low five bits of the lead octet (lead & 0x1f).
 */
/* Make sure no macro named "c" is active; code below uses c as an
 * ordinary identifier. */
#undef c
const char krb5int_utf8_mintab[] = {
    /*  0 -  3 */
    (const char)0x20, (const char)0x80, (const char)0x80, (const char)0x80,
    /*  4 -  7 */
    (const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
    /*  8 - 11 */
    (const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
    /* 12 - 15 */
    (const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
    /* 16 - 19 */
    (const char)0x30, (const char)0x80, (const char)0x80, (const char)0x80,
    /* 20 - 23 */
    (const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
    /* 24 - 27 */
    (const char)0x38, (const char)0x80, (const char)0x80, (const char)0x80,
    /* 28 - 31 */
    (const char)0x3c, (const char)0x80, (const char)0x00, (const char)0x00
};
149
150int krb5int_utf8_charlen2(const char *p)
151{
152    int i = KRB5_UTF8_CHARLEN(p);
153
154    if (i > 2) {
155	if (!(krb5int_utf8_mintab[*p & 0x1f] & p[1]))
156	    i = 0;
157    }
158
159    return i;
160}
161
162/*
163 * Convert a UTF8 character to a UCS4 character.  Return 0 on success,
164 * -1 on failure.
165 */
166int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out)
167{
168    const unsigned char *c = (const unsigned char *) p;
169    krb5_ucs4 ch;
170    int len, i;
171    static unsigned char mask[] = {
172	0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
173
174    *out = 0;
175    len = KRB5_UTF8_CHARLEN2(p, len);
176
177    if (len == 0)
178	return -1;
179
180    ch = c[0] & mask[len];
181
182    for (i = 1; i < len; i++) {
183	if ((c[i] & 0xc0) != 0x80)
184	    return -1;
185
186	ch <<= 6;
187	ch |= c[i] & 0x3f;
188    }
189
190    *out = ch;
191    return 0;
192}
193
194int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out)
195{
196    krb5_ucs4 ch;
197
198    *out = 0;
199    if (krb5int_utf8_to_ucs4(p, &ch) == -1 || ch > 0xFFFF)
200	return -1;
201    *out = (krb5_ucs2) ch;
202    return 0;
203}
204
205/* conv UCS-2 to UTF-8, not used */
206size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf)
207{
208    size_t len = 0;
209    unsigned char *p = (unsigned char *) buf;
210
211    /* not a valid Unicode character */
212    if (c < 0)
213	return 0;
214
215    /* Just return length, don't convert */
216    if (buf == NULL) {
217	if (c < 0x80) return 1;
218	else if (c < 0x800) return 2;
219	else if (c < 0x10000) return 3;
220	else if (c < 0x200000) return 4;
221	else if (c < 0x4000000) return 5;
222	else return 6;
223    }
224
225    if (c < 0x80) {
226	p[len++] = c;
227    } else if (c < 0x800) {
228	p[len++] = 0xc0 | ( c >> 6 );
229	p[len++] = 0x80 | ( c & 0x3f );
230    } else if (c < 0x10000) {
231	p[len++] = 0xe0 | ( c >> 12 );
232	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
233	p[len++] = 0x80 | ( c & 0x3f );
234    } else if (c < 0x200000) {
235	p[len++] = 0xf0 | ( c >> 18 );
236	p[len++] = 0x80 | ( (c >> 12) & 0x3f );
237	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
238	p[len++] = 0x80 | ( c & 0x3f );
239    } else if (c < 0x4000000) {
240	p[len++] = 0xf8 | ( c >> 24 );
241	p[len++] = 0x80 | ( (c >> 18) & 0x3f );
242	p[len++] = 0x80 | ( (c >> 12) & 0x3f );
243	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
244	p[len++] = 0x80 | ( c & 0x3f );
245    } else /* if( c < 0x80000000 ) */ {
246	p[len++] = 0xfc | ( c >> 30 );
247	p[len++] = 0x80 | ( (c >> 24) & 0x3f );
248	p[len++] = 0x80 | ( (c >> 18) & 0x3f );
249	p[len++] = 0x80 | ( (c >> 12) & 0x3f );
250	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
251	p[len++] = 0x80 | ( c & 0x3f );
252    }
253
254    return len;
255}
256
257size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf)
258{
259    return krb5int_ucs4_to_utf8((krb5_ucs4)c, buf);
260}
261
/*
 * Number of octets needed to encode the UCS value c as UTF-8 (1 to 6),
 * or 0 if c is negative.
 *
 * The argument and the whole expansion are parenthesized so the macro
 * can be used inside larger expressions and with expression arguments.
 * Note that c is evaluated more than once; do not pass an argument with
 * side effects.
 */
#define KRB5_UCS_UTF8LEN(c)                     \
    ((c) < 0 ? 0 :                              \
     (c) < 0x80 ? 1 :                           \
     (c) < 0x800 ? 2 :                          \
     (c) < 0x10000 ? 3 :                        \
     (c) < 0x200000 ? 4 :                       \
     (c) < 0x4000000 ? 5 : 6)
265
/*
 * Advance to the next UTF-8 character.
 *
 * The length encoded in the lead octet is ignored.  Instead, octets
 * following p are skipped for as long as they are continuation octets
 * (10xxxxxx), looking no further than p[5].  This allows resyncing
 * after invalid input, provided the start of the next character appears
 * within the six bytes examined.  If p[1] through p[5] are all
 * continuation octets, p + 6 is returned.
 */
char *krb5int_utf8_next(const char *p)
{
    const unsigned char *cur = (const unsigned char *) p;
    int off = 1;

    /* An ASCII character is always exactly one byte long. */
    if (KRB5_UTF8_ISASCII(cur))
        return (char *) (p + 1);

    while (off < 6 && (cur[off] & 0xc0) == 0x80)
        off++;

    return (char *) (p + off);
}
292
/*
 * Step back to the previous UTF-8 character.
 *
 * Octets before p are skipped for as long as they are continuation
 * octets (10xxxxxx), looking no further back than p[-5]; the address of
 * the first octet found that is not a continuation octet is returned.
 * This allows resyncing after invalid input, provided the start of the
 * previous character appears within the bytes examined.  If p[-1]
 * through p[-5] are all continuation octets, p - 6 is returned.
 */
char *krb5int_utf8_prev(const char *p)
{
    const unsigned char *cur = (const unsigned char *) p;
    int back = 1;

    while (back < 6 && (cur[-back] & 0xc0) == 0x80)
        back++;

    return (char *) (p - back);
}
315
/*
 * Copy one UTF-8 character from src to dst and return the number of
 * bytes copied.
 *
 * The length encoded in the lead octet is ignored.  The first byte is
 * always copied; if it is not ASCII, the following bytes are copied for
 * as long as they are continuation octets (10xxxxxx), up to a total of
 * six bytes.  This allows resyncing after invalid input, provided the
 * start of the next character appears within the six bytes examined.
 * No terminating NUL is written to dst.
 */
int krb5int_utf8_copy(char* dst, const char *src)
{
    const unsigned char *cur = (const unsigned char *) src;
    int copied = 1;

    dst[0] = src[0];

    /* An ASCII character is always exactly one byte long. */
    if (KRB5_UTF8_ISASCII(cur))
        return 1;

    while (copied < 6 && (cur[copied] & 0xc0) == 0x80) {
        dst[copied] = src[copied];
        copied++;
    }

    return copied;
}
346
347#ifndef UTF8_ALPHA_CTYPE
/*
 * UTF-8 ctype routines
 * These only deal with characters < 0x80 (i.e. US-ASCII).
 */
352
/* Return the result of KRB5_ASCII() applied to the first byte at p. */
int krb5int_utf8_isascii(const char * p)
{
    unsigned octet = *(const unsigned char *) p;

    return KRB5_ASCII(octet);
}
359
/*
 * Return the result of KRB5_DIGIT() for the first byte at p, or 0 if
 * that byte is not ASCII according to KRB5_ASCII().
 */
int krb5int_utf8_isdigit(const char * p)
{
    unsigned octet = *(const unsigned char *) p;

    if (!KRB5_ASCII(octet)) {
        return 0;
    }

    return KRB5_DIGIT(octet);
}
369
/*
 * Return the result of KRB5_HEX() for the first byte at p, or 0 if
 * that byte is not ASCII according to KRB5_ASCII().
 */
int krb5int_utf8_isxdigit(const char * p)
{
    unsigned octet = *(const unsigned char *) p;

    if (!KRB5_ASCII(octet)) {
        return 0;
    }

    return KRB5_HEX(octet);
}
379
/*
 * Return 1 if the first byte at p is one of the ASCII white-space
 * characters space, tab, newline, carriage return, vertical tab or
 * form feed; return 0 otherwise.
 */
int krb5int_utf8_isspace(const char * p)
{
    unsigned octet = *(const unsigned char *) p;

    if (!KRB5_ASCII(octet)) {
        return 0;
    }

    return octet == ' ' || octet == '\t' || octet == '\n'
        || octet == '\r' || octet == '\v' || octet == '\f';
}
399
400/*
401 * These are not needed by the C SDK and are
402 * not "good enough" for general use.
403 */
/*
 * Return the result of KRB5_ALPHA() for the first byte at p, or 0 if
 * that byte is not ASCII according to KRB5_ASCII().
 */
int krb5int_utf8_isalpha(const char * p)
{
    unsigned octet = *(const unsigned char *) p;

    if (!KRB5_ASCII(octet)) {
        return 0;
    }

    return KRB5_ALPHA(octet);
}
413
/*
 * Return the result of KRB5_ALNUM() for the first byte at p, or 0 if
 * that byte is not ASCII according to KRB5_ASCII().
 */
int krb5int_utf8_isalnum(const char * p)
{
    unsigned octet = *(const unsigned char *) p;

    if (!KRB5_ASCII(octet)) {
        return 0;
    }

    return KRB5_ALNUM(octet);
}
423
424#if 0
/*
 * Return the result of KRB5_LOWER() for the first byte at p, or 0 if
 * that byte is not ASCII according to KRB5_ASCII().
 */
int krb5int_utf8_islower(const char * p)
{
    unsigned octet = *(const unsigned char *) p;

    if (!KRB5_ASCII(octet)) {
        return 0;
    }

    return KRB5_LOWER(octet);
}
434
/*
 * Return the result of KRB5_UPPER() for the first byte at p, or 0 if
 * that byte is not ASCII according to KRB5_ASCII().
 */
int krb5int_utf8_isupper(const char * p)
{
    unsigned octet = *(const unsigned char *) p;

    if (!KRB5_ASCII(octet)) {
        return 0;
    }

    return KRB5_UPPER(octet);
}
444#endif
445#endif
446
447
448/*
449 * UTF-8 string routines
450 */
451
452/* like strchr() */
453char *krb5int_utf8_strchr(const char *str, const char *chr)
454{
455    krb5_ucs4 chs, ch;
456
457    if (krb5int_utf8_to_ucs4(chr, &ch) == -1)
458	return NULL;
459    for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
460	if (krb5int_utf8_to_ucs4(str, &chs) == 0 && chs == ch)
461	    return (char *)str;
462    }
463
464    return NULL;
465}
466
467/* like strcspn() but returns number of bytes, not characters */
468size_t krb5int_utf8_strcspn(const char *str, const char *set)
469{
470    const char *cstr, *cset;
471    krb5_ucs4 chstr, chset;
472
473    for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
474	for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
475	    if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
476		&& krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
477		return cstr - str;
478	}
479    }
480
481    return cstr - str;
482}
483
484/* like strspn() but returns number of bytes, not characters */
485size_t krb5int_utf8_strspn(const char *str, const char *set)
486{
487    const char *cstr, *cset;
488    krb5_ucs4 chstr, chset;
489
490    for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
491	for (cset = set; ; KRB5_UTF8_INCR(cset)) {
492	    if (*cset == '\0')
493		return cstr - str;
494	    if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
495		&& krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
496		break;
497	}
498    }
499
500    return cstr - str;
501}
502
503/* like strpbrk(), replaces strchr() as well */
504char *krb5int_utf8_strpbrk(const char *str, const char *set)
505{
506    const char *cset;
507    krb5_ucs4 chstr, chset;
508
509    for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
510	for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
511	    if (krb5int_utf8_to_ucs4(str, &chstr) == 0
512		&& krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
513		return (char *)str;
514	}
515    }
516
517    return NULL;
518}
519
/*
 * Like strtok_r() (not strtok()), but for UTF-8.
 *
 * str  - string to tokenize on the first call; NULL on later calls to
 *        continue from the position saved in *last.
 * sep  - set of separator characters.
 * last - caller-provided storage for the scan position; must not be
 *        NULL.
 *
 * Returns a pointer to the next token, which is NUL-terminated in place
 * by overwriting the first byte of the separator that follows it, or
 * NULL when there are no more tokens.  Once NULL has been returned
 * *last is NULL, and further calls with str == NULL keep returning NULL.
 */
char *krb5int_utf8_strtok(char *str, const char *sep, char **last)
{
    char *begin;
    char *end;

    if (last == NULL)
        return NULL;

    begin = str ? str : *last;

    /*
     * *last is set to NULL once the tokens are exhausted; without this
     * check a further continuation call would scan from a NULL pointer.
     */
    if (begin == NULL)
        return NULL;

    /* Skip any leading separators. */
    begin += krb5int_utf8_strspn(begin, sep);

    if (*begin == '\0') {
        *last = NULL;
        return NULL;
    }

    end = &begin[krb5int_utf8_strcspn(begin, sep)];

    if (*end != '\0') {
        /* Find where the separator ends before overwriting its start. */
        char *next = KRB5_UTF8_NEXT(end);
        *end = '\0';
        end = next;
    }

    *last = end;

    return begin;
}
550