1#pragma ident	"%Z%%M%	%I%	%E% SMI"
2
3/*
4 * The contents of this file are subject to the Netscape Public
5 * License Version 1.1 (the "License"); you may not use this file
6 * except in compliance with the License. You may obtain a copy of
7 * the License at http://www.mozilla.org/NPL/
8 *
9 * Software distributed under the License is distributed on an "AS
10 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
11 * implied. See the License for the specific language governing
12 * rights and limitations under the License.
13 *
14 * The Original Code is Mozilla Communicator client code, released
15 * March 31, 1998.
16 *
17 * The Initial Developer of the Original Code is Netscape
18 * Communications Corporation. Portions created by Netscape are
19 * Copyright (C) 1998-1999 Netscape Communications Corporation. All
20 * Rights Reserved.
21 *
22 * Contributor(s):
23 */
24
/* utf8.c - misc. utf8 "string" functions. */
26#include "ldap-int.h"
27
/* Sequence length implied by the first byte of a UTF-8 character, indexed by
 * the top six bits of that byte: UTF8len[(byte >> 2) & 0x3F].
 *   0x00..0x7F -> 1 (ASCII)
 *   0x80..0xBF -> 0 (a continuation byte, i.e. not the start of a character)
 *   0xC0..0xFF -> 2..6 (lead byte of a multi-byte sequence)
 * The table is only ever read, so it is const.
 */
static const char UTF8len[64]
= {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6};
33
int
LDAP_CALL
ldap_utf8len (const char* s)
     /* Report how many char's make up the character that starts at *s,
	measured as the distance ldap_utf8next() advances from s.
     */
{
    /* ldap_utf8next() takes a non-const pointer; the cast only satisfies
       its signature. */
    const char* following = ldap_utf8next((char*)s);

    return (int)(following - s);
}
41
42char*
43LDAP_CALL
44ldap_utf8next (char* s)
45     /* Return a pointer to the character immediately following *s.
46	Handle any valid UTF-8 character, including '\0' and ASCII.
47	Try to handle a misaligned pointer or a malformed character.
48     */
49{
50    register unsigned char* next = (unsigned char*)s;
51    switch (UTF8len [(*next >> 2) & 0x3F]) {
52      case 0: /* erroneous: s points to the middle of a character. */
53      case 6: if ((*++next & 0xC0) != 0x80) break;
54      case 5: if ((*++next & 0xC0) != 0x80) break;
55      case 4: if ((*++next & 0xC0) != 0x80) break;
56      case 3: if ((*++next & 0xC0) != 0x80) break;
57      case 2: if ((*++next & 0xC0) != 0x80) break;
58      case 1: ++next;
59    }
60    return (char*) next;
61}
62
char*
LDAP_CALL
ldap_utf8prev (char* s)
     /* Return a pointer to the character immediately preceding *s.
	Handle any valid UTF-8 character, including '\0' and ASCII.
	Try to handle a misaligned pointer or a malformed character:
	the scan walks backwards over continuation bytes (10xxxxxx) and
	gives up after six bytes, returning s - 6 in that case.
	The caller must ensure at least one byte precedes s.
     */
{
    unsigned char* prev = (unsigned char*)s;
    int steps;

    /* Count the steps instead of precomputing the pointer s - 6: forming
       that pointer is undefined when fewer than six bytes precede s in
       the buffer, even if it is never dereferenced. */
    for (steps = 0; steps < 6; ++steps) {
	--prev;
	if ((*prev & 0xC0) != 0x80) {
	    break;	/* found a lead byte or an ASCII byte */
	}
    }
    return (char*) prev;
}
78
79int
80LDAP_CALL
81ldap_utf8copy (char* dst, const char* src)
82     /* Copy a character from src to dst; return the number of char's copied.
83	Handle any valid UTF-8 character, including '\0' and ASCII.
84	Try to handle a misaligned pointer or a malformed character.
85     */
86{
87    register const unsigned char* s = (const unsigned char*)src;
88    switch (UTF8len [(*s >> 2) & 0x3F]) {
89      case 0: /* erroneous: s points to the middle of a character. */
90      case 6: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
91      case 5: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
92      case 4: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
93      case 3: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
94      case 2: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
95      case 1: *dst   = *s++;
96    }
97    return s - (const unsigned char*)src;
98}
99
size_t
LDAP_CALL
ldap_utf8characters (const char* src)
     /* Count the UTF-8 characters in the 0-terminated array src.
	The terminating 0 is not counted.
     */
{
    size_t count = 0;
    char* cursor = (char*)src;

    while (*cursor) {
	++count;
	LDAP_UTF8INC(cursor);	/* advance to the next character */
    }
    return count;
}
110
111unsigned long LDAP_CALL
112ldap_utf8getcc( const char** src )
113{
114    register unsigned long c;
115    register const unsigned char* s = (const unsigned char*)*src;
116    switch (UTF8len [(*s >> 2) & 0x3F]) {
117      case 0: /* erroneous: s points to the middle of a character. */
118	      c = (*s++) & 0x3F; goto more5;
119      case 1: c = (*s++); break;
120      case 2: c = (*s++) & 0x1F; goto more1;
121      case 3: c = (*s++) & 0x0F; goto more2;
122      case 4: c = (*s++) & 0x07; goto more3;
123      case 5: c = (*s++) & 0x03; goto more4;
124      case 6: c = (*s++) & 0x01; goto more5;
125      more5: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
126      more4: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
127      more3: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
128      more2: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
129      more1: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
130	break;
131    }
132    *src = (const char*)s;
133    return c;
134}
135
char*
LDAP_CALL
ldap_utf8strtok_r( char* sp, const char* brk, char** next)
     /* UTF-8 aware, re-entrant tokenizer in the manner of strtok_r.
	sp   - string to tokenize, or NULL to continue from *next.
	brk  - 0-terminated set of delimiter characters.
	next - caller-owned state; set to the position after the delimiter
	       that ended the token, or to NULL when the string is exhausted.
	Returns the start of the next token, or NULL if there is none.
	The string is modified: the first byte of the delimiter that ends a
	token is overwritten with 0.
     */
{
    const char *bp;
    unsigned long sc, bc;
    char *tok;
    char *cstart;

    if (sp == NULL && (sp = *next) == NULL)
      return NULL;

    /* Skip leading delimiters; roughly, sp += strspn(sp, brk).
     * tok records where the character being examined starts, so that no
     * backward scan is needed once a non-delimiter is found (a backward
     * scan could run off the front of the buffer on malformed input).
     */
  cont:
    tok = sp;
    sc = LDAP_UTF8GETC(sp);
    for (bp = brk; (bc = LDAP_UTF8GETCC(bp)) != 0;) {
	if (sc == bc)
	  goto cont;
    }

    if (sc == 0) { /* no non-delimiter characters */
	*next = NULL;
	return NULL;
    }

    /* Scan token; roughly, sp += strcspn(sp, brk)
     * Note that brk must be 0-terminated; we stop if we see that, too.
     */
    while (1) {
	cstart = sp;	/* start of the character about to be decoded */
	sc = LDAP_UTF8GETC(sp);
	bp = brk;
	do {
	    if ((bc = LDAP_UTF8GETCC(bp)) == sc) {
		if (sc == 0) {
		    /* End of string: this is the last token. */
		    *next = NULL;
		} else {
		    /* Terminate the token at the delimiter and resume
		       after it on the next call. */
		    *next = sp;
		    *cstart = 0;
		}
		return tok;
	    }
	} while (bc != 0);
    }
    /* NOTREACHED */
}
181
int
LDAP_CALL
ldap_utf8isalnum( char* s )
     /* Return 1 if the character at *s is an ASCII letter or decimal
	digit, 0 otherwise.  Any byte with the high bit set (i.e. part of
	a multi-byte character) yields 0.
     */
{
    const unsigned char ch = *(const unsigned char*)s;

    if (ch & 0x80) {
	return 0;
    }
    return ('0' <= ch && ch <= '9')
	|| ('A' <= ch && ch <= 'Z')
	|| ('a' <= ch && ch <= 'z');
}
193
int
LDAP_CALL
ldap_utf8isalpha( char* s )
     /* Return 1 if the character at *s is an ASCII letter (A-Z or a-z),
	0 otherwise.  Any byte with the high bit set yields 0.
     */
{
    const unsigned char ch = *(const unsigned char*)s;

    if (ch & 0x80) {
	return 0;
    }
    return ('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z');
}
204
int
LDAP_CALL
ldap_utf8isdigit( char* s )
     /* Return 1 if the character at *s is an ASCII decimal digit (0-9),
	0 otherwise.  Any byte with the high bit set yields 0.
     */
{
    const unsigned char ch = *(const unsigned char*)s;

    if (ch & 0x80) {
	return 0;
    }
    return ('0' <= ch && ch <= '9') ? 1 : 0;
}
214
int
LDAP_CALL
ldap_utf8isxdigit( char* s )
     /* Return 1 if the character at *s is an ASCII hexadecimal digit
	(0-9, A-F or a-f), 0 otherwise.  Any byte with the high bit set
	yields 0.
     */
{
    const unsigned char ch = *(const unsigned char*)s;

    if (ch & 0x80) {
	return 0;
    }
    return ('0' <= ch && ch <= '9')
	|| ('A' <= ch && ch <= 'F')
	|| ('a' <= ch && ch <= 'f');
}
226
int
LDAP_CALL
ldap_utf8isspace( char* s )
     /* Return 1 if the UTF-8 character at *s is one of the recognized
	space characters, 0 otherwise:
	  U+0009..U+000D, U+0020	(1 byte:  09..0D, 20)
	  U+00A0 NO-BREAK SPACE		(2 bytes: C2 A0)
	  U+2000..U+200A		(3 bytes: E2 80 80..8A)
	  U+3000 IDEOGRAPHIC SPACE	(3 bytes: E3 80 80)
	  U+FEFF			(3 bytes: EF BB BF)
	Each byte is examined only after the preceding byte has matched,
	so the terminating 0 of the string is never read past.
     */
{
    const unsigned char *c = (const unsigned char*)s;

    switch (c[0]) {
	case 0x09:
	case 0x0A:
	case 0x0B:
	case 0x0C:
	case 0x0D:
	case 0x20:
	    return 1;
	case 0xC2:
	    /* NO-BREAK SPACE is U+00A0, encoded as C2 A0.  (C2 80 would be
	       U+0080, a control character, not a space.) */
	    return c[1] == 0xA0;
	case 0xE2:
	    return (c[1] == 0x80) && (c[2] >= 0x80 && c[2] <= 0x8A);
	case 0xE3:
	    return (c[1] == 0x80) && (c[2] == 0x80);
	case 0xEF:
	    return (c[1] == 0xBB) && (c[2] == 0xBF);
	default:
	    return 0;
    }
}
270