1/*
2 * Copyright (C) 2008 by the Massachusetts Institute of Technology,
3 * Cambridge, MA, USA.  All Rights Reserved.
4 *
5 * This software is being provided to you, the LICENSEE, by the
6 * Massachusetts Institute of Technology (M.I.T.) under the following
7 * license.  By obtaining, using and/or copying this software, you agree
8 * that you have read, understood, and will comply with these terms and
9 * conditions:
10 *
11 * Export of this software from the United States of America may
12 * require a specific license from the United States Government.
13 * It is the responsibility of any person or organization contemplating
14 * export to obtain such a license before exporting.
15 *
16 * WITHIN THAT CONSTRAINT, permission to use, copy, modify and distribute
17 * this software and its documentation for any purpose and without fee or
18 * royalty is hereby granted, provided that you agree to comply with the
19 * following copyright notice and statements, including the disclaimer, and
20 * that the same appear on ALL copies of the software and documentation,
21 * including modifications that you make for internal use or for
22 * distribution:
23 *
24 * THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. MAKES NO REPRESENTATIONS
25 * OR WARRANTIES, EXPRESS OR IMPLIED.  By way of example, but not
26 * limitation, M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF
27 * MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF
28 * THE LICENSED SOFTWARE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY
29 * PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS.
30 *
31 * The name of the Massachusetts Institute of Technology or M.I.T. may NOT
32 * be used in advertising or publicity pertaining to distribution of the
33 * software.  Title to copyright in this software and any associated
34 * documentation shall at all times remain with M.I.T., and USER agrees to
35 * preserve same.
36 *
37 * Furthermore if you modify this software you must label
38 * your software as modified software and not distribute it in such a
39 * fashion that it might be confused with the original M.I.T. software.
40 */
41/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
42 *
43 * Copyright 1998-2008 The OpenLDAP Foundation.
44 * All rights reserved.
45 *
46 * Redistribution and use in source and binary forms, with or without
47 * modification, are permitted only as authorized by the OpenLDAP
48 * Public License.
49 *
50 * A copy of this license is available in file LICENSE in the
51 * top-level directory of the distribution or, alternatively, at
52 * <http://www.OpenLDAP.org/license.html>.
53 */
54/* This notice applies to changes, created by or for Novell, Inc.,
55 * to preexisting works for which notices appear elsewhere in this file.
56 *
57 * Copyright (C) 2000 Novell, Inc. All Rights Reserved.
58 *
59 * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND TREATIES.
60 * USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT TO VERSION
61 * 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS AVAILABLE AT
62 * HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE" IN THE
63 * TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION OF THIS
64 * WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP PUBLIC
65 * LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT THE
66 * PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
67 */
68
69#ifndef K5_UTF8_H
70#define K5_UTF8_H
71
#include "autoconf.h"

/* INT_MAX, SHRT_MAX and LONG_MAX are tested by the #if directives below. */
#include <limits.h>

#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
85
/*
 * The type selection below is driven entirely by the <limits.h> range
 * macros.  An identifier that is not a defined macro evaluates to 0 inside
 * #if, so without <limits.h> every comparison would silently be false and
 * the chain would end in a misleading "undefined ... type" error.  Fail
 * early with an accurate diagnostic instead.
 */
#if !defined(INT_MAX) || !defined(SHRT_MAX) || !defined(LONG_MAX)
#error <limits.h> must be included before selecting krb5_ucs2 and krb5_ucs4
#endif

/*
 * krb5_ucs2: unsigned integer type used for UCS-2 values.
 * unsigned int is chosen when INT_MAX is 0x7fff, otherwise unsigned short
 * when SHRT_MAX is 0x7fff; any other platform is rejected at compile time.
 */
#if INT_MAX == 0x7fff
typedef unsigned int	krb5_ucs2;
#elif SHRT_MAX == 0x7fff
typedef unsigned short	krb5_ucs2;
#else
#error undefined 16 bit type
#endif

/*
 * krb5_ucs4: signed integer type used for UCS-4 values.
 * The first of int, long, short whose maximum is 0x7fffffff is chosen;
 * any other platform is rejected at compile time.
 */
#if INT_MAX == 0x7fffffffL
typedef int	krb5_ucs4;
#elif LONG_MAX == 0x7fffffffL
typedef long	krb5_ucs4;
#elif SHRT_MAX == 0x7fffffffL
typedef short	krb5_ucs4;
#else
#error undefined 32 bit type
#endif
103
/*
 * Evaluates to sizeof(krb5_ucs2) * 3 / 2 as a size_t, i.e. 3 when
 * krb5_ucs2 is two bytes wide.
 * NOTE(review): presumably the largest number of UTF-8 bytes produced for
 * one krb5_ucs2 character; it is derived from krb5_ucs2 only, so confirm
 * before using it to size buffers for krb5_ucs4 conversions.
 */
#define KRB5_MAX_UTF8_LEN \
	((sizeof(krb5_ucs2) * 3) / 2)
105
106int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out);
107size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf);
108
109int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out);
110size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf);
111
112int
113krb5int_ucs2s_to_utf8s(const krb5_ucs2 *ucs2s,
114		       char **utf8s,
115		       size_t *utf8slen);
116
117int
118krb5int_ucs2cs_to_utf8s(const krb5_ucs2 *ucs2s,
119			size_t ucs2slen,
120		        char **utf8s,
121		        size_t *utf8slen);
122
/*
 * Convert UCS-2 data held in a byte buffer to UTF-8.
 *
 * The "cs" variant additionally takes ucs2leslen, an explicit input length.
 * NOTE(review): the "le" in the names suggests the bytes are little-endian
 * UCS-2; presumably *utf8s receives a newly allocated result, *utf8slen
 * its length, and the int result is an error code.  None of this is
 * visible in this header -- confirm against the implementation (including
 * whether ucs2leslen counts bytes or characters).
 */
int krb5int_ucs2les_to_utf8s(const unsigned char *ucs2les, char **utf8s,
			     size_t *utf8slen);

int krb5int_ucs2lecs_to_utf8s(const unsigned char *ucs2les,
			      size_t ucs2leslen, char **utf8s,
			      size_t *utf8slen);
133
134int
135krb5int_utf8s_to_ucs2s(const char *utf8s,
136		       krb5_ucs2 **ucs2s,
137		       size_t *ucs2chars);
138
139int
140krb5int_utf8cs_to_ucs2s(const char *utf8s,
141			size_t utf8slen,
142		        krb5_ucs2 **ucs2s,
143		        size_t *ucs2chars);
144
/*
 * Convert UTF-8 text to UCS-2 data stored in a byte buffer.
 *
 * The "cs" variant additionally takes utf8slen, an explicit input length.
 * NOTE(review): the "le" in the names suggests little-endian output;
 * presumably *ucs2les receives a newly allocated buffer, *ucs2leslen its
 * length, and the int result is an error code.  None of this is visible in
 * this header -- confirm against the implementation (including the unit of
 * *ucs2leslen).
 */
int krb5int_utf8s_to_ucs2les(const char *utf8s, unsigned char **ucs2les,
			     size_t *ucs2leslen);

int krb5int_utf8cs_to_ucs2les(const char *utf8s, size_t utf8slen,
			      unsigned char **ucs2les, size_t *ucs2leslen);
155
/* Number of bytes in the UTF-8 string p. */
size_t
krb5int_utf8_bytes(const char *p);

/* Number of UTF-8 characters in the string p. */
size_t
krb5int_utf8_chars(const char *p);

/*
 * Number of UTF-8 characters in the counted string p.
 * NOTE(review): len is presumably the length of p in bytes -- confirm.
 */
size_t
krb5int_utf8c_chars(const char *p, size_t len);

/* Length, in bytes, of the UTF-8 character at p. */
int
krb5int_utf8_offset(const char *p);

/* Length, in bytes, indicated by the UTF-8 character at p. */
int
krb5int_utf8_charlen(const char *p);

/*
 * Length, in bytes, indicated by the UTF-8 character at p; additionally
 * checks that the shortest possible encoding was used.
 */
int
krb5int_utf8_charlen2(const char *p);
171
/*
 * Copies one UTF-8 character and returns the number of bytes copied.
 * NOTE(review): the parameters are unnamed in the original; the
 * (non-const, const) order suggests (destination, source) -- confirm.
 */
int
krb5int_utf8_copy(char *dst, const char *src);

/* Returns a pointer to the next UTF-8 character in the string. */
char *
krb5int_utf8_next(const char *p);

/* Returns a pointer to the previous UTF-8 character in the string. */
char *
krb5int_utf8_prev(const char *p);
179
/*
 * Primitive ctype-style predicates.  They are not aware of non-ASCII
 * characters.
 * NOTE(review): each presumably classifies the character at p and returns
 * nonzero on a match -- confirm against the implementation.
 */
int krb5int_utf8_isascii(const char *p);
int krb5int_utf8_isalpha(const char *p);
int krb5int_utf8_isalnum(const char *p);
int krb5int_utf8_isdigit(const char *p);
int krb5int_utf8_isxdigit(const char *p);
int krb5int_utf8_isspace(const char *p);
187
/* Spans the characters of str that are not in set; returns bytes spanned. */
size_t
krb5int_utf8_strcspn(const char *str, const char *set);

/* Spans the characters of str that are in set; returns bytes spanned. */
size_t
krb5int_utf8_strspn(const char *str, const char *set);

/* Returns the first occurrence of the character chr in str. */
char *
krb5int_utf8_strchr(const char *str, const char *chr);

/* Returns the first character of str that is a member of set. */
char *
krb5int_utf8_strpbrk(const char *str, const char *set);

/*
 * Reentrant tokenizer.
 * NOTE(review): presumably follows the strtok_r() convention, with sep as
 * the separator set and *last carrying the position between calls --
 * confirm against the implementation.
 */
char *
krb5int_utf8_strtok(char *sp, const char *sep, char **last);
198
/*
 * Optimizations: lookup tables used by the KRB5_UTF8_CHARLEN and
 * KRB5_UTF8_CHARLEN2 macros in this header.
 */

/* Indexed by (first byte ^ 0x80) for a non-ASCII first byte. */
extern const char krb5int_utf8_lentab[128];

/* Indexed by (first byte & 0x1f); the entry is ANDed with the second byte. */
extern const char krb5int_utf8_mintab[32];
202
/* 1 when the byte at p has its high bit clear (7-bit ASCII), else 0. */
#define KRB5_UTF8_ISASCII(p) \
	( (*(const unsigned char *)(p) & 0x80) == 0 )

/*
 * 1 for an ASCII byte; otherwise the krb5int_utf8_lentab entry selected
 * by the first byte.  p is evaluated more than once.
 */
#define KRB5_UTF8_CHARLEN(p) \
	( KRB5_UTF8_ISASCII(p) ? 1 \
	  : krb5int_utf8_lentab[*(const unsigned char *)(p) ^ 0x80] )

/*
 * Like KRB5_UTF8_CHARLEN, but additionally validates that the character
 * used the shortest possible encoding; evaluates to 0 when it did not.
 * 'l' must be a modifiable lvalue: it temporarily holds the result of
 * KRB5_UTF8_CHARLEN.  The second byte, (p)[1], is only examined when that
 * result is 3 or more.  p is evaluated more than once.
 */
#define KRB5_UTF8_CHARLEN2(p, l) \
	( ( ((l) = KRB5_UTF8_CHARLEN(p)) >= 3 && \
	    !( krb5int_utf8_mintab[*(const unsigned char *)(p) & 0x1f] \
	       & (p)[1] ) ) \
	  ? 0 : (l) )
214
/*
 * Fast paths: each macro handles an ASCII byte inline and otherwise defers
 * to the corresponding krb5int_utf8_* function.  Arguments are evaluated
 * more than once, so they must not have side effects.
 */

/* 1 for an ASCII byte, otherwise krb5int_utf8_offset(p). */
#define KRB5_UTF8_OFFSET(p) \
	( KRB5_UTF8_ISASCII(p) ? 1 : krb5int_utf8_offset((p)) )

/* Copies a single ASCII byte and yields 1, otherwise krb5int_utf8_copy(d, s). */
#define KRB5_UTF8_COPY(d,s) \
	( KRB5_UTF8_ISASCII(s) ? (*(d) = *(s), 1) : krb5int_utf8_copy((d),(s)) )

/* (char *)p + 1 for an ASCII byte, otherwise krb5int_utf8_next(p). */
#define KRB5_UTF8_NEXT(p) \
	( KRB5_UTF8_ISASCII(p) ? (char *)(p)+1 : krb5int_utf8_next((p)) )

/* Assigns KRB5_UTF8_NEXT(p) to p. */
#define KRB5_UTF8_INCR(p) \
	((p) = KRB5_UTF8_NEXT(p))

/* For symmetry: these always call krb5int_utf8_prev(). */
#define KRB5_UTF8_PREV(p) \
	(krb5int_utf8_prev((p)))
#define KRB5_UTF8_DECR(p) \
	((p)=KRB5_UTF8_PREV((p)))
229
/*
 * Character-class tests on a single character value c.
 * These macros assume c is an ASCII character and assume the "C" locale.
 * c is evaluated more than once, so it must not have side effects.
 */

/* High bit clear. */
#define KRB5_ASCII(c)		(((c) & 0x80) == 0)
/* Space, horizontal tab or newline only. */
#define KRB5_SPACE(c)		((c) == ' ' || (c) == '\t' || (c) == '\n')
#define KRB5_DIGIT(c)		('0' <= (c) && (c) <= '9')
#define KRB5_LOWER(c)		('a' <= (c) && (c) <= 'z')
#define KRB5_UPPER(c)		('A' <= (c) && (c) <= 'Z')
#define KRB5_ALPHA(c)		(KRB5_LOWER(c) || KRB5_UPPER(c))
#define KRB5_ALNUM(c)		(KRB5_ALPHA(c) || KRB5_DIGIT(c))

/* Letter, digit or hyphen. */
#define KRB5_LDH(c)		(KRB5_ALNUM(c) || (c) == '-')

#define KRB5_HEXLOWER(c)	('a' <= (c) && (c) <= 'f')
#define KRB5_HEXUPPER(c)	('A' <= (c) && (c) <= 'F')
#define KRB5_HEX(c) \
	(KRB5_DIGIT(c) || KRB5_HEXLOWER(c) || KRB5_HEXUPPER(c))
248
249#endif /* K5_UTF8_H */
250