1/*	$NetBSD: utf-8.c,v 1.3 2021/08/14 16:14:56 christos Exp $	*/
2
3/* utf-8.c -- Basic UTF-8 routines */
4/* $OpenLDAP$ */
5/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
6 *
7 * Copyright 1998-2021 The OpenLDAP Foundation.
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted only as authorized by the OpenLDAP
12 * Public License.
13 *
14 * A copy of this license is available in the file LICENSE in the
15 * top-level directory of the distribution or, alternatively, at
16 * <http://www.OpenLDAP.org/license.html>.
17 */
18/* Basic UTF-8 routines
19 *
20 * These routines are "dumb".  Though they understand UTF-8,
21 * they don't grok Unicode.  That is, they can push bits,
22 * but don't have a clue what the bits represent.  That's
23 * good enough for use with the LDAP Client SDK.
24 *
25 * These routines are not optimized.
26 */
27
28#include <sys/cdefs.h>
29__RCSID("$NetBSD: utf-8.c,v 1.3 2021/08/14 16:14:56 christos Exp $");
30
31#include "portable.h"
32
33#include <stdio.h>
34
35#include <ac/stdlib.h>
36
37#include <ac/socket.h>
38#include <ac/string.h>
39#include <ac/time.h>
40
41#include "ldap_utf8.h"
42
43#include "ldap-int.h"
44#include "ldap_defaults.h"
45
46/*
47 * return the number of bytes required to hold the
48 * NULL-terminated UTF-8 string NOT INCLUDING the
49 * termination.
50 */
51ber_len_t ldap_utf8_bytes( const char * p )
52{
53	ber_len_t bytes;
54
55	for( bytes=0; p[bytes]; bytes++ ) {
56		/* EMPTY */ ;
57	}
58
59	return bytes;
60}
61
62ber_len_t ldap_utf8_chars( const char * p )
63{
64	/* could be optimized and could check for invalid sequences */
65	ber_len_t chars=0;
66
67	for( ; *p ; LDAP_UTF8_INCR(p) ) {
68		chars++;
69	}
70
71	return chars;
72}
73
/*
 * Return the distance, in bytes, from p to the position that
 * LDAP_UTF8_NEXT() reports for it.
 */
int ldap_utf8_offset( const char * p )
{
	const char *following = LDAP_UTF8_NEXT(p);

	return following - p;
}
79
/*
 * Sequence length implied by a first byte that has its high bit set.
 * The table has 128 entries and is indexed by (first byte ^ 0x80);
 * an entry of 0 means the byte cannot start a sequence.
 */
const char ldap_utf8_lentab[] = {
	/* 0x80 - 0xBF: continuation octets */
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xC0 - 0xDF: two octets; 0xC0 and 0xC1 are rejected */
	0, 0, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0 - 0xEF: three octets */
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xF0 - 0xF7: four octets */
	4, 4, 4, 4, 4, 4, 4, 4,
	/* 0xF8 - 0xFB: five, 0xFC - 0xFD: six, 0xFE - 0xFF: rejected */
	5, 5, 5, 5, 6, 6, 0, 0
};
92
93int ldap_utf8_charlen( const char * p )
94{
95	if (!(*p & 0x80))
96		return 1;
97
98	return ldap_utf8_lentab[*(const unsigned char *)p ^ 0x80];
99}
100
101/*
 * Make sure the UTF-8 character uses the shortest possible encoding;
 * returns charlen if valid, 0 if not.
104 *
105 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
106 * The table is slightly modified from that of the RFC.
107 *
108 * UCS-4 range (hex)      UTF-8 sequence (binary)
109 * 0000 0000-0000 007F   0.......
110 * 0000 0080-0000 07FF   110++++. 10......
111 * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
112 * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
113 * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
114 * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
115 *
116 * The '.' bits are "don't cares". When validating a UTF-8 sequence,
117 * at least one of the '+' bits must be set, otherwise the character
118 * should have been encoded in fewer octets. Note that in the two-octet
119 * case, only the first octet needs to be validated, and this is done
120 * in the ldap_utf8_lentab[] above.
121 */
122
/*
 * Mask of required bits in the second octet, indexed by the low
 * five bits of the first octet.  A sequence of three or more octets
 * passes the shortest-form check when (mask & second octet) is
 * non-zero.  A mask of 0x80 only asks for the second octet's high bit.
 */
#undef c	/* `c` is used as an ordinary identifier further down */
#define LDAP_UTF8_MINBITS(bits)	((const char)(bits))
const char ldap_utf8_mintab[] = {
	/* first octet 0xE0 - 0xEF */
	LDAP_UTF8_MINBITS(0x20), LDAP_UTF8_MINBITS(0x80),
	LDAP_UTF8_MINBITS(0x80), LDAP_UTF8_MINBITS(0x80),
	LDAP_UTF8_MINBITS(0x80), LDAP_UTF8_MINBITS(0x80),
	LDAP_UTF8_MINBITS(0x80), LDAP_UTF8_MINBITS(0x80),
	LDAP_UTF8_MINBITS(0x80), LDAP_UTF8_MINBITS(0x80),
	LDAP_UTF8_MINBITS(0x80), LDAP_UTF8_MINBITS(0x80),
	LDAP_UTF8_MINBITS(0x80), LDAP_UTF8_MINBITS(0x80),
	LDAP_UTF8_MINBITS(0x80), LDAP_UTF8_MINBITS(0x80),
	/* first octet 0xF0 - 0xF7 */
	LDAP_UTF8_MINBITS(0x30), LDAP_UTF8_MINBITS(0x80),
	LDAP_UTF8_MINBITS(0x80), LDAP_UTF8_MINBITS(0x80),
	LDAP_UTF8_MINBITS(0x80), LDAP_UTF8_MINBITS(0x80),
	LDAP_UTF8_MINBITS(0x80), LDAP_UTF8_MINBITS(0x80),
	/* first octet 0xF8 - 0xFB */
	LDAP_UTF8_MINBITS(0x38), LDAP_UTF8_MINBITS(0x80),
	LDAP_UTF8_MINBITS(0x80), LDAP_UTF8_MINBITS(0x80),
	/* first octet 0xFC - 0xFD, then 0xFE - 0xFF */
	LDAP_UTF8_MINBITS(0x3c), LDAP_UTF8_MINBITS(0x80),
	LDAP_UTF8_MINBITS(0x00), LDAP_UTF8_MINBITS(0x00)
};
#undef LDAP_UTF8_MINBITS
132
133int ldap_utf8_charlen2( const char * p )
134{
135	int i = LDAP_UTF8_CHARLEN( p );
136
137	if ( i > 2 ) {
138		if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
139			i = 0;
140	}
141	return i;
142}
143
144/* conv UTF-8 to UCS-4, useful for comparisons */
145ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
146{
147    const unsigned char *c = (const unsigned char *) p;
148    ldap_ucs4_t ch;
149	int len, i;
150	static unsigned char mask[] = {
151		0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
152
153	len = LDAP_UTF8_CHARLEN2(p, len);
154
155	if( len == 0 ) return LDAP_UCS4_INVALID;
156
157	ch = c[0] & mask[len];
158
159	for(i=1; i < len; i++) {
160		if ((c[i] & 0xc0) != 0x80) {
161			return LDAP_UCS4_INVALID;
162		}
163
164		ch <<= 6;
165		ch |= c[i] & 0x3f;
166	}
167
168	return ch;
169}
170
171/* conv UCS-4 to UTF-8, not used */
172int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
173{
174	int len=0;
175	unsigned char* p = (unsigned char *) buf;
176
177	/* not a valid Unicode character */
178	if ( c < 0 ) return 0;
179
180	/* Just return length, don't convert */
181	if(buf == NULL) {
182		if( c < 0x80 ) return 1;
183		else if( c < 0x800 ) return 2;
184		else if( c < 0x10000 ) return 3;
185		else if( c < 0x200000 ) return 4;
186		else if( c < 0x4000000 ) return 5;
187		else return 6;
188	}
189
190	if( c < 0x80 ) {
191		p[len++] = c;
192
193	} else if( c < 0x800 ) {
194		p[len++] = 0xc0 | ( c >> 6 );
195		p[len++] = 0x80 | ( c & 0x3f );
196
197	} else if( c < 0x10000 ) {
198		p[len++] = 0xe0 | ( c >> 12 );
199		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
200		p[len++] = 0x80 | ( c & 0x3f );
201
202	} else if( c < 0x200000 ) {
203		p[len++] = 0xf0 | ( c >> 18 );
204		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
205		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
206		p[len++] = 0x80 | ( c & 0x3f );
207
208	} else if( c < 0x4000000 ) {
209		p[len++] = 0xf8 | ( c >> 24 );
210		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
211		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
212		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
213		p[len++] = 0x80 | ( c & 0x3f );
214
215	} else /* if( c < 0x80000000 ) */ {
216		p[len++] = 0xfc | ( c >> 30 );
217		p[len++] = 0x80 | ( (c >> 24) & 0x3f );
218		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
219		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
220		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
221		p[len++] = 0x80 | ( c & 0x3f );
222	}
223
224	return len;
225}
226
/*
 * Number of octets needed to encode the UCS value c as UTF-8, or 0
 * when c is negative.  The argument and the whole expansion are
 * parenthesized so the macro can be used inside larger expressions;
 * note that c is still evaluated more than once.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	( (c) < 0 ? 0 : ( (c) < 0x80 ? 1 : ( (c) < 0x800 ? 2 : \
	( (c) < 0x10000 ? 3 : ( (c) < 0x200000 ? 4 : \
	( (c) < 0x4000000 ? 5 : 6 ) ) ) ) ) )
230
231/* Convert a string to UTF-8 format. The input string is expected to
232 * have characters of 1, 2, or 4 octets (in network byte order)
233 * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
234 * types respectively. (Here T61STRING just means that there is one
235 * octet per character and characters may use the high bit of the octet.
236 * The characters are assumed to use ISO mappings, no provision is made
237 * for converting from T.61 coding rules to Unicode.)
238 */
239
240int
241ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
242{
243	unsigned char *in, *end;
244	char *ptr;
245	ldap_ucs4_t u;
246	int i, l = 0;
247
248	utf8s->bv_val = NULL;
249	utf8s->bv_len = 0;
250
251	in = (unsigned char *)ucs->bv_val;
252
253	/* Make sure we stop at an even multiple of csize */
254	end = in + ( ucs->bv_len & ~(csize-1) );
255
256	for (; in < end; ) {
257		u = *in++;
258		if (csize > 1) {
259			u <<= 8;
260			u |= *in++;
261		}
262		if (csize > 2) {
263			u <<= 8;
264			u |= *in++;
265			u <<= 8;
266			u |= *in++;
267		}
268		i = LDAP_UCS_UTF8LEN(u);
269		if (i == 0)
270			return LDAP_INVALID_SYNTAX;
271		l += i;
272	}
273
274	utf8s->bv_val = LDAP_MALLOC( l+1 );
275	if (utf8s->bv_val == NULL)
276		return LDAP_NO_MEMORY;
277	utf8s->bv_len = l;
278
279	ptr = utf8s->bv_val;
280	for (in = (unsigned char *)ucs->bv_val; in < end; ) {
281		u = *in++;
282		if (csize > 1) {
283			u <<= 8;
284			u |= *in++;
285		}
286		if (csize > 2) {
287			u <<= 8;
288			u |= *in++;
289			u <<= 8;
290			u |= *in++;
291		}
292		ptr += ldap_x_ucs4_to_utf8(u, ptr);
293	}
294	*ptr = '\0';
295	return LDAP_SUCCESS;
296}
297
/*
 * Advance to the next UTF-8 character.
 *
 * The length encoded in the first byte is ignored; instead,
 * continuation octets (10xxxxxx) following p are skipped.  This
 * lets the caller resynchronize after an invalid sequence, as long
 * as the start of the next character appears within the 6 bytes
 * examined.  At most 6 bytes are stepped over.
 */
char* ldap_utf8_next( const char * p )
{
	const unsigned char *bytes = (const unsigned char *) p;
	int off = 1;

	if ( !LDAP_UTF8_ISASCII(bytes) ) {
		/* skip continuation octets, up to five of them */
		while ( off < 6 && ( bytes[off] & 0xc0 ) == 0x80 ) {
			off++;
		}
	}

	return (char *) &p[off];
}
324
/*
 * Step back to the previous UTF-8 character.
 *
 * Walks backwards from p over continuation octets (10xxxxxx) and
 * returns the first position that is not one.  At most five bytes
 * are inspected; if all of them are continuation octets, p - 6 is
 * returned.  The caller must ensure those bytes before p are readable.
 */
char* ldap_utf8_prev( const char * p )
{
	const unsigned char *bytes = (const unsigned char *) p;
	int back = -1;

	while ( back > -6 && ( bytes[back] & 0xc0 ) == 0x80 ) {
		back--;
	}

	return (char *) &p[back];
}
347
/*
 * Copy one UTF-8 character from src to dst and return the number
 * of bytes copied (1 to 6).  No terminating NUL is written.
 *
 * The length encoded in the first byte is ignored; the first byte
 * is always copied, followed by however many continuation octets
 * (10xxxxxx) come after it, up to five.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	const unsigned char *bytes = (const unsigned char *) src;
	int n = 1;

	dst[0] = src[0];

	if ( LDAP_UTF8_ISASCII(bytes) ) {
		return 1;
	}

	while ( n < 6 && ( bytes[n] & 0xc0 ) == 0x80 ) {
		dst[n] = src[n];
		n++;
	}

	return n;
}
378
379#ifndef UTF8_ALPHA_CTYPE
380/*
381 * UTF-8 ctype routines
382 * Only deals with characters < 0x80 (ie: US-ASCII)
383 */
384
/* Apply LDAP_ASCII() to the first byte at p and return its result. */
int ldap_utf8_isascii( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	return LDAP_ASCII(first);
}
390
/*
 * Return LDAP_DIGIT() of the first byte at p, or 0 when that byte
 * fails LDAP_ASCII().
 */
int ldap_utf8_isdigit( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	if ( LDAP_ASCII(first) ) {
		return LDAP_DIGIT( first );
	}

	return 0;
}
399
/*
 * Return LDAP_HEX() of the first byte at p, or 0 when that byte
 * fails LDAP_ASCII().
 */
int ldap_utf8_isxdigit( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	if ( LDAP_ASCII(first) ) {
		return LDAP_HEX(first);
	}

	return 0;
}
408
/*
 * Return 1 when the first byte at p is a space, tab, newline,
 * carriage return, vertical tab or form feed; 0 otherwise.
 */
int ldap_utf8_isspace( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	if ( !LDAP_ASCII(first) ) {
		return 0;
	}

	return ( first == ' ' || first == '\t' || first == '\n' ||
		first == '\r' || first == '\v' || first == '\f' ) ? 1 : 0;
}
427
428/*
429 * These are not needed by the C SDK and are
430 * not "good enough" for general use.
431 */
/*
 * Return LDAP_ALPHA() of the first byte at p, or 0 when that byte
 * fails LDAP_ASCII().
 */
int ldap_utf8_isalpha( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	if ( LDAP_ASCII(first) ) {
		return LDAP_ALPHA(first);
	}

	return 0;
}
440
/*
 * Return LDAP_ALNUM() of the first byte at p, or 0 when that byte
 * fails LDAP_ASCII().
 */
int ldap_utf8_isalnum( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	if ( LDAP_ASCII(first) ) {
		return LDAP_ALNUM(first);
	}

	return 0;
}
449
/*
 * Return LDAP_LOWER() of the first byte at p, or 0 when that byte
 * fails LDAP_ASCII().
 */
int ldap_utf8_islower( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	if ( LDAP_ASCII(first) ) {
		return LDAP_LOWER(first);
	}

	return 0;
}
458
/*
 * Return LDAP_UPPER() of the first byte at p, or 0 when that byte
 * fails LDAP_ASCII().
 */
int ldap_utf8_isupper( const char * p )
{
	const unsigned first = * (const unsigned char *) p;

	if ( LDAP_ASCII(first) ) {
		return LDAP_UPPER(first);
	}

	return 0;
}
467#endif
468
469
470/*
471 * UTF-8 string routines
472 */
473
474/* like strchr() */
475char * (ldap_utf8_strchr)( const char *str, const char *chr )
476{
477	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
478		if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
479			return (char *) str;
480		}
481	}
482
483	return NULL;
484}
485
486/* like strcspn() but returns number of bytes, not characters */
487ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
488{
489	const char *cstr;
490	const char *cset;
491
492	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
493		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
494			if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
495				return cstr - str;
496			}
497		}
498	}
499
500	return cstr - str;
501}
502
503/* like strspn() but returns number of bytes, not characters */
504ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
505{
506	const char *cstr;
507	const char *cset;
508
509	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
510		for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
511			if( *cset == '\0' ) {
512				return cstr - str;
513			}
514
515			if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
516				break;
517			}
518		}
519	}
520
521	return cstr - str;
522}
523
524/* like strpbrk(), replaces strchr() as well */
525char *(ldap_utf8_strpbrk)( const char *str, const char *set )
526{
527	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
528		const char *cset;
529
530		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
531			if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
532				return (char *) str;
533			}
534		}
535	}
536
537	return NULL;
538}
539
/*
 * Like strtok_r(), not strtok(): split a string into tokens
 * separated by any of the characters in sep.
 *
 * str   string to tokenize on the first call (it is modified in
 *       place); NULL on later calls to continue from *last
 * sep   UTF-8 string listing the separator characters
 * last  caller-provided state; must not be NULL
 *
 * Returns a pointer to the next token, or NULL when last is NULL or
 * no token remains.  Once the input is exhausted *last is set to
 * NULL, and any further continuation call keeps returning NULL.
 */
char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
{
	char *begin;
	char *end;

	if( last == NULL ) return NULL;

	begin = str ? str : *last;

	/* continuing after the end was already reported: nothing to scan */
	if( begin == NULL ) return NULL;

	/* skip leading separators */
	begin += ldap_utf8_strspn( begin, sep );

	if( *begin == '\0' ) {
		*last = NULL;
		return NULL;
	}

	end = &begin[ ldap_utf8_strcspn( begin, sep ) ];

	if( *end != '\0' ) {
		/* find the character after the separator before its
		 * first byte is overwritten with the terminator */
		char *next = LDAP_UTF8_NEXT( end );
		*end = '\0';
		end = next;
	}

	*last = end;
	return begin;
}
568