1/* utf-8.c -- Basic UTF-8 routines */
2/* $OpenLDAP$ */
3/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
4 *
5 * Copyright 1998-2011 The OpenLDAP Foundation.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted only as authorized by the OpenLDAP
10 * Public License.
11 *
12 * A copy of this license is available in the file LICENSE in the
13 * top-level directory of the distribution or, alternatively, at
14 * <http://www.OpenLDAP.org/license.html>.
15 */
16/* Basic UTF-8 routines
17 *
18 * These routines are "dumb".  Though they understand UTF-8,
19 * they don't grok Unicode.  That is, they can push bits,
20 * but don't have a clue what the bits represent.  That's
21 * good enough for use with the LDAP Client SDK.
22 *
23 * These routines are not optimized.
24 */
25
26#include "portable.h"
27
28#include <stdio.h>
29
30#include <ac/stdlib.h>
31
32#include <ac/socket.h>
33#include <ac/string.h>
34#include <ac/time.h>
35
36#include "ldap_utf8.h"
37
38#include "ldap-int.h"
39#include "ldap_defaults.h"
40
41/*
42 * return the number of bytes required to hold the
43 * NULL-terminated UTF-8 string NOT INCLUDING the
44 * termination.
45 */
46ber_len_t ldap_utf8_bytes( const char * p )
47{
48	ber_len_t bytes;
49
50	for( bytes=0; p[bytes]; bytes++ ) {
51		/* EMPTY */ ;
52	}
53
54	return bytes;
55}
56
57ber_len_t ldap_utf8_chars( const char * p )
58{
59	/* could be optimized and could check for invalid sequences */
60	ber_len_t chars=0;
61
62	for( ; *p ; LDAP_UTF8_INCR(p) ) {
63		chars++;
64	}
65
66	return chars;
67}
68
/* return the offset, in bytes, from p to the next character */
int ldap_utf8_offset( const char * p )
{
	const char *next = LDAP_UTF8_NEXT(p);

	return next - p;
}
74
/*
 * Sequence length indicated by a non-ASCII first octet.
 *
 * The table is indexed by (octet ^ 0x80), so octets 0x80..0xFF select
 * entries 0..127 (this is how ldap_utf8_charlen() uses it).  An entry
 * of 0 means the octet is not accepted as the start of a sequence.
 */
const char ldap_utf8_lentab[] = {
	/* 0x80-0xBF: never a first octet */
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xC0-0xC7: 0xC0 and 0xC1 rejected, the rest start 2 octets */
	0, 0, 2, 2, 2, 2, 2, 2,
	/* 0xC8-0xDF: 2 octets */
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0-0xEF: 3 octets */
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xF0-0xF7: 4 octets */
	4, 4, 4, 4, 4, 4, 4, 4,
	/* 0xF8-0xFB: 5, 0xFC-0xFD: 6, 0xFE-0xFF: rejected */
	5, 5, 5, 5, 6, 6, 0, 0
};
87
88int ldap_utf8_charlen( const char * p )
89{
90	if (!(*p & 0x80))
91		return 1;
92
93	return ldap_utf8_lentab[*(const unsigned char *)p ^ 0x80];
94}
95
96/*
97 * Make sure the UTF-8 char used the shortest possible encoding
98 * returns charlen if valid, 0 if not.
99 *
100 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
101 * The table is slightly modified from that of the RFC.
102 *
103 * UCS-4 range (hex)      UTF-8 sequence (binary)
104 * 0000 0000-0000 007F   0.......
105 * 0000 0080-0000 07FF   110++++. 10......
106 * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
107 * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
108 * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
109 * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
110 *
111 * The '.' bits are "don't cares". When validating a UTF-8 sequence,
112 * at least one of the '+' bits must be set, otherwise the character
113 * should have been encoded in fewer octets. Note that in the two-octet
114 * case, only the first octet needs to be validated, and this is done
115 * in the ldap_utf8_lentab[] above.
116 */
117
/*
 * Mask of required bits in the second octet, indexed by the low five
 * bits of the first octet (ldap_utf8_charlen2() uses *p & 0x1f).  The
 * sequence is accepted only if the second octet has at least one of
 * the masked bits set.
 *
 * The one-letter macro "c" is a local shorthand for the cast; it is
 * undefined before use (in case it was already defined) and again
 * right after the table.
 */
#undef c
#define c const char
c ldap_utf8_mintab[] = {
	/* index 0x00-0x0f: first octets 0xE0-0xEF (three-octet sequences) */
	(c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
	(c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
	/* index 0x10-0x17: first octets 0xF0-0xF7 (four-octet sequences) */
	(c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
	/* index 0x18-0x1b: 0xF8-0xFB (five), 0x1c-0x1d: 0xFC-0xFD (six),
	 * 0x1e-0x1f: 0xFE-0xFF, for which ldap_utf8_lentab[] already gives 0 */
	(c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 };
#undef c
127
128int ldap_utf8_charlen2( const char * p )
129{
130	int i = LDAP_UTF8_CHARLEN( p );
131
132	if ( i > 2 ) {
133		if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
134			i = 0;
135	}
136	return i;
137}
138
139/* conv UTF-8 to UCS-4, useful for comparisons */
140ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
141{
142    const unsigned char *c = (const unsigned char *) p;
143    ldap_ucs4_t ch;
144	int len, i;
145	static unsigned char mask[] = {
146		0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
147
148	len = LDAP_UTF8_CHARLEN2(p, len);
149
150	if( len == 0 ) return LDAP_UCS4_INVALID;
151
152	ch = c[0] & mask[len];
153
154	for(i=1; i < len; i++) {
155		if ((c[i] & 0xc0) != 0x80) {
156			return LDAP_UCS4_INVALID;
157		}
158
159		ch <<= 6;
160		ch |= c[i] & 0x3f;
161	}
162
163	return ch;
164}
165
166/* conv UCS-4 to UTF-8, not used */
167int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
168{
169	int len=0;
170	unsigned char* p = (unsigned char *) buf;
171
172	/* not a valid Unicode character */
173	if ( c < 0 ) return 0;
174
175	/* Just return length, don't convert */
176	if(buf == NULL) {
177		if( c < 0x80 ) return 1;
178		else if( c < 0x800 ) return 2;
179		else if( c < 0x10000 ) return 3;
180		else if( c < 0x200000 ) return 4;
181		else if( c < 0x4000000 ) return 5;
182		else return 6;
183	}
184
185	if( c < 0x80 ) {
186		p[len++] = c;
187
188	} else if( c < 0x800 ) {
189		p[len++] = 0xc0 | ( c >> 6 );
190		p[len++] = 0x80 | ( c & 0x3f );
191
192	} else if( c < 0x10000 ) {
193		p[len++] = 0xe0 | ( c >> 12 );
194		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
195		p[len++] = 0x80 | ( c & 0x3f );
196
197	} else if( c < 0x200000 ) {
198		p[len++] = 0xf0 | ( c >> 18 );
199		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
200		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
201		p[len++] = 0x80 | ( c & 0x3f );
202
203	} else if( c < 0x4000000 ) {
204		p[len++] = 0xf8 | ( c >> 24 );
205		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
206		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
207		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
208		p[len++] = 0x80 | ( c & 0x3f );
209
210	} else /* if( c < 0x80000000 ) */ {
211		p[len++] = 0xfc | ( c >> 30 );
212		p[len++] = 0x80 | ( (c >> 24) & 0x3f );
213		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
214		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
215		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
216		p[len++] = 0x80 | ( c & 0x3f );
217	}
218
219	return len;
220}
221
/*
 * Number of octets needed to encode the UCS-4 value c in UTF-8,
 * or 0 if c is negative (not a valid character).
 *
 * The argument and the whole expansion are parenthesized so the macro
 * can be used inside larger expressions.  The argument is evaluated
 * more than once, so it must not have side effects.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	( (c) < 0 ? 0 : ( (c) < 0x80 ? 1 : ( (c) < 0x800 ? 2 : \
	( (c) < 0x10000 ? 3 : ( (c) < 0x200000 ? 4 : \
	( (c) < 0x4000000 ? 5 : 6 ) ) ) ) ) )
225
226/* Convert a string to UTF-8 format. The input string is expected to
227 * have characters of 1, 2, or 4 octets (in network byte order)
228 * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
229 * types respectively. (Here T61STRING just means that there is one
230 * octet per character and characters may use the high bit of the octet.
231 * The characters are assumed to use ISO mappings, no provision is made
232 * for converting from T.61 coding rules to Unicode.)
233 */
234
235int
236ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
237{
238	unsigned char *in, *end;
239	char *ptr;
240	ldap_ucs4_t u;
241	int i, l = 0;
242
243	utf8s->bv_val = NULL;
244	utf8s->bv_len = 0;
245
246	in = (unsigned char *)ucs->bv_val;
247
248	/* Make sure we stop at an even multiple of csize */
249	end = in + ( ucs->bv_len & ~(csize-1) );
250
251	for (; in < end; ) {
252		u = *in++;
253		if (csize > 1) {
254			u <<= 8;
255			u |= *in++;
256		}
257		if (csize > 2) {
258			u <<= 8;
259			u |= *in++;
260			u <<= 8;
261			u |= *in++;
262		}
263		i = LDAP_UCS_UTF8LEN(u);
264		if (i == 0)
265			return LDAP_INVALID_SYNTAX;
266		l += i;
267	}
268
269	utf8s->bv_val = LDAP_MALLOC( l+1 );
270	if (utf8s->bv_val == NULL)
271		return LDAP_NO_MEMORY;
272	utf8s->bv_len = l;
273
274	ptr = utf8s->bv_val;
275	for (in = (unsigned char *)ucs->bv_val; in < end; ) {
276		u = *in++;
277		if (csize > 1) {
278			u <<= 8;
279			u |= *in++;
280		}
281		if (csize > 2) {
282			u <<= 8;
283			u |= *in++;
284			u <<= 8;
285			u |= *in++;
286		}
287		ptr += ldap_x_ucs4_to_utf8(u, ptr);
288	}
289	*ptr = '\0';
290	return LDAP_SUCCESS;
291}
292
/*
 * Advance to the next UTF-8 character
 *
 * Ignores the length indicated by the first octet; instead relies on
 * continuation markers (10xxxxxx) to find the start of the next
 * character.  This allows for "resyncing" when invalid characters are
 * provided, as long as the start of the next character appears within
 * the 6 bytes examined.  If p[1]..p[5] are all continuation octets,
 * p + 6 is returned.
 */
char* ldap_utf8_next( const char * p )
{
	const unsigned char *u = (const unsigned char *) p;
	int skip = 1;

	if ( !LDAP_UTF8_ISASCII(u) ) {
		/* step over continuation octets, at most five of them */
		while ( skip < 6 && ( u[skip] & 0xc0 ) == 0x80 ) {
			skip++;
		}
	}

	return (char *) &p[skip];
}
319
/*
 * Step back to the previous UTF-8 character
 *
 * Scans backwards from p[-1] over continuation octets (10xxxxxx) and
 * returns the address of the first octet that is not one.  This allows
 * for "resyncing" when invalid characters are provided.  At most five
 * octets are examined; if p[-1]..p[-5] are all continuation octets,
 * p - 6 is returned.
 *
 * p[-1] is always read, so p must not point at the start of its buffer.
 */
char* ldap_utf8_prev( const char * p )
{
	const unsigned char *u = (const unsigned char *) p;
	int back = 1;

	while ( back < 6 && ( u[-back] & 0xc0 ) == 0x80 ) {
		back++;
	}

	return (char *) ( p - back );
}
342
/*
 * Copy one UTF-8 character from src to dst returning
 * number of bytes copied.
 *
 * Ignores the length indicated by the first octet; instead the first
 * octet is copied and then every following continuation octet
 * (10xxxxxx), up to a total of 6 bytes.  This allows for "resyncing"
 * when invalid characters are provided.  No terminator is written.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	const unsigned char *u = (const unsigned char *) src;
	int n = 1;

	dst[0] = src[0];

	if ( !LDAP_UTF8_ISASCII(u) ) {
		/* copy trailing continuation octets, at most five of them */
		while ( n < 6 && ( u[n] & 0xc0 ) == 0x80 ) {
			dst[n] = src[n];
			n++;
		}
	}

	return n;
}
373
374#ifndef UTF8_ALPHA_CTYPE
375/*
376 * UTF-8 ctype routines
377 * Only deals with characters < 0x80 (ie: US-ASCII)
378 */
379
/* result of LDAP_ASCII() for the octet at p */
int ldap_utf8_isascii( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet);
}
385
/* result of LDAP_DIGIT() for the octet at p; 0 if it fails LDAP_ASCII() */
int ldap_utf8_isdigit( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if ( LDAP_ASCII(octet) ) {
		return LDAP_DIGIT( octet );
	}

	return 0;
}
394
/* result of LDAP_HEX() for the octet at p; 0 if it fails LDAP_ASCII() */
int ldap_utf8_isxdigit( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if ( LDAP_ASCII(octet) ) {
		return LDAP_HEX(octet);
	}

	return 0;
}
403
/*
 * 1 if the octet at p is a space, tab, newline, carriage return,
 * vertical tab or form feed; 0 otherwise
 */
int ldap_utf8_isspace( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if ( !LDAP_ASCII(octet) ) {
		return 0;
	}

	return octet == ' ' || octet == '\t' || octet == '\n'
		|| octet == '\r' || octet == '\v' || octet == '\f';
}
422
423/*
424 * These are not needed by the C SDK and are
425 * not "good enough" for general use.
426 */
/* result of LDAP_ALPHA() for the octet at p; 0 if it fails LDAP_ASCII() */
int ldap_utf8_isalpha( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if ( LDAP_ASCII(octet) ) {
		return LDAP_ALPHA(octet);
	}

	return 0;
}
435
/* result of LDAP_ALNUM() for the octet at p; 0 if it fails LDAP_ASCII() */
int ldap_utf8_isalnum( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if ( LDAP_ASCII(octet) ) {
		return LDAP_ALNUM(octet);
	}

	return 0;
}
444
/* result of LDAP_LOWER() for the octet at p; 0 if it fails LDAP_ASCII() */
int ldap_utf8_islower( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if ( LDAP_ASCII(octet) ) {
		return LDAP_LOWER(octet);
	}

	return 0;
}
453
/* result of LDAP_UPPER() for the octet at p; 0 if it fails LDAP_ASCII() */
int ldap_utf8_isupper( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if ( LDAP_ASCII(octet) ) {
		return LDAP_UPPER(octet);
	}

	return 0;
}
462#endif
463
464
465/*
466 * UTF-8 string routines
467 */
468
469/* like strchr() */
470char * (ldap_utf8_strchr)( const char *str, const char *chr )
471{
472	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
473		if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
474			return (char *) str;
475		}
476	}
477
478	return NULL;
479}
480
481/* like strcspn() but returns number of bytes, not characters */
482ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
483{
484	const char *cstr;
485	const char *cset;
486
487	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
488		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
489			if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
490				return cstr - str;
491			}
492		}
493	}
494
495	return cstr - str;
496}
497
498/* like strspn() but returns number of bytes, not characters */
499ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
500{
501	const char *cstr;
502	const char *cset;
503
504	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
505		for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
506			if( *cset == '\0' ) {
507				return cstr - str;
508			}
509
510			if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
511				break;
512			}
513		}
514	}
515
516	return cstr - str;
517}
518
519/* like strpbrk(), replaces strchr() as well */
520char *(ldap_utf8_strpbrk)( const char *str, const char *set )
521{
522	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
523		const char *cset;
524
525		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
526			if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
527				return (char *) str;
528			}
529		}
530	}
531
532	return NULL;
533}
534
/*
 * like strtok_r(), not strtok()
 *
 * str   string to tokenize on the first call, NULL to continue with
 *       the position saved in *last
 * sep   characters that separate tokens
 * last  caller-provided storage for the scan position; must not be NULL
 *
 * Returns the next token, NUL-terminated in place (str is modified),
 * or NULL when there are no more tokens.  Once NULL has been returned
 * *last is NULL, and further calls with str == NULL keep returning
 * NULL.
 */
char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
{
	char *begin;
	char *end;

	if( last == NULL ) return NULL;

	begin = str ? str : *last;

	/* an earlier call ran out of tokens and left *last NULL */
	if( begin == NULL ) return NULL;

	/* skip leading separators */
	begin += ldap_utf8_strspn( begin, sep );

	if( *begin == '\0' ) {
		*last = NULL;
		return NULL;
	}

	end = &begin[ ldap_utf8_strcspn( begin, sep ) ];

	if( *end != '\0' ) {
		/* find the character after the separator before overwriting it */
		char *next = LDAP_UTF8_NEXT( end );
		*end = '\0';
		end = next;
	}

	*last = end;
	return begin;
}
563