/* $OpenLDAP$ */
/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
 *
 * Copyright 2002-2011 The OpenLDAP Foundation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>.
 */
/* ACKNOWLEDGEMENTS:
 * This work was initially developed by Howard Chu for inclusion in
 * OpenLDAP Software.
 */
19
/*
 * Basic T.61 <-> UTF-8 conversion
 *
 * These routines will perform a lossless translation from T.61 to UTF-8
 * and a lossy translation from UTF-8 to T.61.
 */
26
27#include "portable.h"
28
29#include <stdio.h>
30
31#include <ac/stdlib.h>
32
33#include <ac/socket.h>
34#include <ac/string.h>
35#include <ac/time.h>
36
37#include "ldap-int.h"
38#include "ldap_utf8.h"
39
40#include "ldap_defaults.h"
41
/*
 * T.61 is somewhat braindead; even in the 7-bit space it is not
 * completely equivalent to 7-bit US-ASCII. Our definition of the
 * character set comes from RFC 1345 with a slightly more readable
 * rendition at http://std.dkuug.dk/i18n/charmaps/T.61-8BIT.
 *
 * Even though '#' and '$' are present in the 7-bit US-ASCII space,
 * (x23 and x24, resp.) in T.61 they are mapped to 8-bit characters
 * xA6 and xA4.
 *
 * Also T.61 lacks
 *	backslash 	\	(x5C)
 *	caret		^	(x5E)
 *	backquote	`	(x60)
 *	left brace	{	(x7B)
 *	right brace	}	(x7D)
 *	tilde		~	(x7E)
 *
 * In T.61, the codes xC1 to xCF (excluding xC9, unused) are non-spacing
 * accents of some form or another. There are predefined combinations
 * for certain characters, but they can also be used arbitrarily. The
 * table at dkuug.dk maps these accents to the E000 "private use" range
 * of the Unicode space, but I believe they more properly belong in the
 * 0300 range (non-spacing accents). The transformation is complicated
 * slightly because Unicode wants the non-spacing character to follow
 * the base character, while T.61 has the non-spacing character leading.
 * Also, T.61 specifically recognizes certain combined pairs as "characters"
 * but doesn't specify how to treat unrecognized pairs. This code will
 * always attempt to combine pairs when a known Unicode composite exists.
 */
72
/* Forward map from a T.61 octet (the array index, 0x00-0xff) to the
 * Unicode code point it stands for.
 *
 * A zero entry marks an octet that the routines in this file reject as
 * undefined T.61; note that this includes octet 0x00 itself. The
 * non-spacing accents 0xc1-0xcf (0xc9 is unused) map to Unicode
 * combining marks in the U+03xx range.
 */
static const wchar_t t61_tab[] = {
	0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
	0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
	0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
	0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f,
	0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027,
	0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f,
	0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037,
	0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f,
	0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047,
	0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f,
	0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057,
	0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f,
	0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067,
	0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f,
	0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077,
	0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f,
	0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087,
	0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f,
	0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097,
	0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f,
	0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7,
	0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000,
	0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7,
	0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf,
	0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307,
	0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c,
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
	0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f,
	0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149,
	0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140,
	0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000
};
107
/* Fixed-size vectors of code points used by the lookup tables below */
typedef wchar_t wvec16[16];
typedef wchar_t wvec32[32];
typedef wchar_t wvec64[64];

/* Substitutions when 0xc1-0xcf appears by itself or with space 0x20.
 * Indexed by the low nibble of the accent octet; a zero entry means no
 * spacing form is provided for that accent.
 */
static const wvec16 accents = {
	0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
	0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};

/* In the following tables, base characters commented in (parentheses)
 * are not defined by T.61 but are mapped anyway since their Unicode
 * composite exists.
 *
 * Each accent has an array of eight vector pointers selected by the
 * top three bits of the base octet (base >> 5); a vector is then
 * indexed by the low five bits (base & 0x1f). Slot 2 therefore covers
 * 0x40-0x5f (upper case), slot 3 covers 0x60-0x7f (lower case) and
 * slot 7 covers 0xe0-0xff. A NULL slot or a zero entry means there is
 * no precomposed Unicode character for that pair.
 */

/* Grave accented chars AEIOU (NWY) */
static const wvec32 c1_vec1 = {
	/* Upper case */
	0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
	0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0};
static const wvec32 c1_vec2 = {
	/* Lower case */
	0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
	0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0};

static const wvec32 *c1_grave[] = {
	NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL
};

/* Acute accented chars AEIOUYCLNRSZ (GKMPW) */
static const wvec32 c2_vec1 = {
	/* Upper case */
	0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
	0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
	0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
	0, 0xdd, 0x179, 0, 0, 0, 0, 0};
static const wvec32 c2_vec2 = {
	/* Lower case */
	0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
	0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
	0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
	0, 0xfd, 0x17a, 0, 0, 0, 0, 0};
static const wvec32 c2_vec3 = {
	/* (AE and ae) */
	0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

static const wvec32 *c2_acute[] = {
	NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3
};

/* Circumflex AEIOUYCGHJSW (Z) */
static const wvec32 c3_vec1 = {
	/* Upper case */
	0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
	0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
	0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
	0, 0x176, 0x1e90, 0, 0, 0, 0, 0};
static const wvec32 c3_vec2 = {
	/* Lower case */
	0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
	0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
	0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
	0, 0x177, 0x1e91, 0, 0, 0, 0, 0};
static const wvec32 *c3_circumflex[] = {
	NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL
};

/* Tilde AIOUN (EVY) */
static const wvec32 c4_vec1 = {
	/* Upper case */
	0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
	0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
static const wvec32 c4_vec2 = {
	/* Lower case */
	0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
	0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
static const wvec32 *c4_tilde[] = {
	NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
};

/* Macron AEIOU (YG) */
static const wvec32 c5_vec1 = {
	/* Upper case */
	0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
	0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0};
static const wvec32 c5_vec2 = {
	/* Lower case */
	0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
	0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0};
static const wvec32 c5_vec3 = {
	/* (AE and ae) */
	0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 *c5_macron[] = {
	NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3
};

/* Breve AUG (EIO) */
static const wvec32 c6_vec1 = {
	/* Upper case */
	0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
	0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 c6_vec2 = {
	/* Lower case */
	0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
	0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 *c6_breve[] = {
	NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL
};

/* Dot Above CEGIZ (AOBDFHMNPRSTWXY) */
static const wvec32 c7_vec1 = {
	/* Upper case */
	0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
	0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
	0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
	0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0};
static const wvec32 c7_vec2 = {
	/* Lower case */
	0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
	0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
	0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
	0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0};
static const wvec32 *c7_dotabove[] = {
	NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL
};

/* Diaeresis AEIOUY (HWXt) */
static const wvec32 c8_vec1 = {
	/* Upper case */
	0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
	0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0};
static const wvec32 c8_vec2 = {
	/* Lower case */
	0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
	0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0};
static const wvec32 *c8_diaeresis[] = {
	NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL
};

/* Ring Above AU (wy) */
static const wvec32 ca_vec1 = {
	/* Upper case */
	0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 ca_vec2 = {
	/* Lower case */
	0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0};
static const wvec32 *ca_ringabove[] = {
	NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL
};

/* Cedilla CGKLNRST (EDH) */
static const wvec32 cb_vec1 = {
	/* Upper case */
	0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
	0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
	0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 cb_vec2 = {
	/* Lower case */
	0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
	0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
	0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 *cb_cedilla[] = {
	NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL
};

/* Double Acute Accent OU */
static const wvec32 cd_vec1 = {
	/* Upper case */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150,
	0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 cd_vec2 = {
	/* Lower case */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151,
	0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 *cd_doubleacute[] = {
	NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL
};

/* Ogonek AEIU (O) */
static const wvec32 ce_vec1 = {
	/* Upper case */
	0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
	0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 ce_vec2 = {
	/* Lower case */
	0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
	0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 *ce_ogonek[] = {
	NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL
};

/* Caron CDELNRSTZ (AIOUGKjH) */
static const wvec32 cf_vec1 = {
	/* Upper case */
	0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
	0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
	0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
	0, 0, 0x17d, 0, 0, 0, 0, 0};
static const wvec32 cf_vec2 = {
	/* Lower case */
	0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
	0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
	0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
	0, 0, 0x17e, 0, 0, 0, 0, 0};
static const wvec32 *cf_caron[] = {
	NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL
};

/* Master composite table, indexed by the low nibble of the accent
 * octet (0xc0-0xcf). NULL means the accent has no composites at all
 * (0xc0 and 0xc9 are undefined, 0xcc is the non-spacing underline).
 */
static const wvec32 **cx_tab[] = {
	NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
	c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
	cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron };
323
324int ldap_t61s_valid( struct berval *str )
325{
326	unsigned char *c = (unsigned char *)str->bv_val;
327	int i;
328
329	for (i=0; i < str->bv_len; c++,i++)
330		if (!t61_tab[*c])
331			return 0;
332	return 1;
333}
334
335/* Transform a T.61 string to UTF-8.
336 */
337int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst )
338{
339	unsigned char *c;
340	char *d;
341	int i, wlen = 0;
342
343	/* Just count the length of the UTF-8 result first */
344	for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
345		/* Invalid T.61 characters? */
346		if (!t61_tab[*c])
347			return LDAP_INVALID_SYNTAX;
348		if ((*c & 0xf0) == 0xc0) {
349			int j = *c & 0x0f;
350			/* If this is the end of the string, or if the base
351			 * character is just a space, treat this as a regular
352			 * spacing character.
353			 */
354			if ((!c[1] || c[1] == 0x20) && accents[j]) {
355				wlen += ldap_x_wc_to_utf8(NULL, accents[j], 0);
356			} else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
357			/* We have a composite mapping for this pair */
358				(*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
359				wlen += ldap_x_wc_to_utf8( NULL,
360					(*cx_tab[j][c[1]>>5])[c[1]&0x1f], 0);
361			} else {
362			/* No mapping, just swap it around so the base
363			 * character comes first.
364			 */
365			 	wlen += ldap_x_wc_to_utf8(NULL, c[1], 0);
366				wlen += ldap_x_wc_to_utf8(NULL,
367					t61_tab[*c], 0);
368			}
369			c++; i++;
370			continue;
371		} else {
372			wlen += ldap_x_wc_to_utf8(NULL, t61_tab[*c], 0);
373		}
374	}
375
376	/* Now transform the string */
377	dst->bv_len = wlen;
378	dst->bv_val = LDAP_MALLOC( wlen+1 );
379	d = dst->bv_val;
380	if (!d)
381		return LDAP_NO_MEMORY;
382
383	for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
384		if ((*c & 0xf0) == 0xc0) {
385			int j = *c & 0x0f;
386			/* If this is the end of the string, or if the base
387			 * character is just a space, treat this as a regular
388			 * spacing character.
389			 */
390			if ((!c[1] || c[1] == 0x20) && accents[j]) {
391				d += ldap_x_wc_to_utf8(d, accents[j], 6);
392			} else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
393			/* We have a composite mapping for this pair */
394				(*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
395				d += ldap_x_wc_to_utf8(d,
396				(*cx_tab[j][c[1]>>5])[c[1]&0x1f], 6);
397			} else {
398			/* No mapping, just swap it around so the base
399			 * character comes first.
400			 */
401				d += ldap_x_wc_to_utf8(d, c[1], 6);
402				d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
403			}
404			c++; i++;
405			continue;
406		} else {
407			d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
408		}
409	}
410	*d = '\0';
411	return LDAP_SUCCESS;
412}
413
/* For the reverse mapping, we just pay attention to the Latin-oriented
 * code blocks. These are
 *	0000 - 007f Basic Latin
 *	0080 - 00ff Latin-1 Supplement
 *	0100 - 017f Latin Extended-A
 *	0180 - 024f Latin Extended-B
 *	1e00 - 1eff Latin Extended Additional
 *
 * We have a special case to map Ohm U2126 back to T.61 0xe0. All other
 * unrecognized characters are replaced with '?' 0x3f.
 */
425
426static const wvec64 u000 = {
427	0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
428	0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
429	0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
430	0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
431	0x0020, 0x0021, 0x0022, 0x00a6, 0x00a4, 0x0025, 0x0026, 0x0027,
432	0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
433	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
434	0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f};
435
436/* In this range, we've mapped caret to xc3/x20, backquote to xc1/x20,
437 * and tilde to xc4/x20. T.61 (stupidly!) doesn't define these characters
438 * on their own, even though it provides them as combiners for other
439 * letters. T.61 doesn't define these pairings either, so this may just
440 * have to be replaced with '?' 0x3f if other software can't cope with it.
441 */
442static const wvec64 u001 = {
443	0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
444	0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
445	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
446	0x0058, 0x0059, 0x005a, 0x005b, 0x003f, 0x005d, 0xc320, 0x005f,
447	0xc120, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
448	0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
449	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
450	0x0078, 0x0079, 0x007a, 0x003f, 0x007c, 0x003f, 0xc420, 0x007f};
451
452static const wvec64 u002 = {
453	0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
454	0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
455	0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
456	0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
457	0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a8, 0x00a5, 0x003f, 0x00a7,
458	0xc820, 0x003f, 0x00e3, 0x00ab, 0x003f, 0x003f, 0x003f, 0xc520,
459	0x00b0, 0x00b1, 0x00b2, 0x00b3, 0xc220, 0x00b5, 0x00b6, 0x00b7,
460	0xcb20, 0x003f, 0x00eb, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf};
461
462static const wvec64 u003 = {
463	0xc141, 0xc241, 0xc341, 0xc441, 0xc841, 0xca41, 0x00e1, 0xcb43,
464	0xc145, 0xc245, 0xc345, 0xc845, 0xc149, 0xc249, 0xc349, 0xc849,
465	0x00e2, 0xc44e, 0xc14f, 0xc24f, 0xc34f, 0xc44f, 0xc84f, 0x00b4,
466	0x00e9, 0xc155, 0xc255, 0xc355, 0xc855, 0xc259, 0x00ec, 0x00fb,
467	0xc161, 0xc261, 0xc361, 0xc461, 0xc861, 0xca61, 0x00f1, 0xcb63,
468	0xc165, 0xc265, 0xc365, 0xc865, 0xc169, 0xc269, 0xc369, 0xc869,
469	0x00f3, 0xc46e, 0xc16f, 0xc26f, 0xc36f, 0xc46f, 0xc86f, 0x00b8,
470	0x00f9, 0xc175, 0xc275, 0xc375, 0xc875, 0xc279, 0x00fc, 0xc879};
471
472/* These codes are used here but not defined by T.61:
473 * x114 = xc6/x45, x115 = xc6/x65, x12c = xc6/x49, x12d = xc6/x69
474 */
475static const wvec64 u010 = {
476	0xc541, 0xc561, 0xc641, 0xc661, 0xce41, 0xce61, 0xc243, 0xc263,
477	0xc343, 0xc363, 0xc743, 0xc763, 0xcf43, 0xcf63, 0xcf44, 0xcf64,
478	0x003f, 0x00f2, 0xc545, 0xc565, 0xc645, 0xc665, 0xc745, 0xc765,
479	0xce45, 0xce65, 0xcf45, 0xcf65, 0xc347, 0xc367, 0xc647, 0xc667,
480	0xc747, 0xc767, 0xcb47, 0xcb67, 0xc348, 0xc368, 0x00e4, 0x00f4,
481	0xc449, 0xc469, 0xc549, 0xc569, 0xc649, 0xc669, 0xce49, 0xce69,
482	0xc749, 0x00f5, 0x00e6, 0x00f6, 0xc34a, 0xc36a, 0xcb4b, 0xcb6b,
483	0x00f0, 0xc24c, 0xc26c, 0xcb4c, 0xcb6c, 0xcf4c, 0xcf6c, 0x00e7};
484
485/* These codes are used here but not defined by T.61:
486 * x14e = xc6/x4f, x14f = xc6/x6f
487 */
488static const wvec64 u011 = {
489	0x00f7, 0x00e8, 0x00f8, 0xc24e, 0xc26e, 0xcb4e, 0xcb6e, 0xcf4e,
490	0xcf6e, 0x00ef, 0x00ee, 0x00fe, 0xc54f, 0xc56f, 0xc64f, 0xc66f,
491	0xcd4f, 0xcd6f, 0x00ea, 0x00fa, 0xc252, 0xc272, 0xcb52, 0xcb72,
492	0xcf52, 0xcf72, 0xc253, 0xc273, 0xc353, 0xc373, 0xcb53, 0xcb73,
493	0xcf53, 0xcf73, 0xcb54, 0xcb74, 0xcf54, 0xcf74, 0x00ed, 0x00fd,
494	0xc455, 0xc475, 0xc555, 0xc575, 0xc655, 0xc675, 0xca55, 0xca75,
495	0xcd55, 0xcd75, 0xce55, 0xce75, 0xc357, 0xc377, 0xc359, 0xc379,
496	0xc859, 0xc25a, 0xc27a, 0xc75a, 0xc77a, 0xcf5a, 0xcf7a, 0x003f};
497
498/* All of the codes in this block are undefined in T.61.
499 */
500static const wvec64 u013 = {
501	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
502	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf41, 0xcf61, 0xcf49,
503	0xcf69, 0xcf4f, 0xcf6f, 0xcf55, 0xcf75, 0x003f, 0x003f, 0x003f,
504	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
505	0x003f, 0x003f, 0xc5e1, 0xc5f1, 0x003f, 0x003f, 0xcf47, 0xcf67,
506	0xcf4b, 0xcf6b, 0xce4f, 0xce6f, 0x003f, 0x003f, 0x003f, 0x003f,
507	0xcf6a, 0x003f, 0x003f, 0x003f, 0xc247, 0xc267, 0x003f, 0x003f,
508	0xc14e, 0xc16e, 0x003f, 0x003f, 0xc2e1, 0xc2f1, 0x003f, 0x003f};
509
510/* All of the codes in this block are undefined in T.61.
511 */
512static const wvec64 u020 = {
513	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
514	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
515	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
516	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf48, 0xcf68,
517	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc741, 0xc761,
518	0xcb45, 0xcb65, 0x003f, 0x003f, 0x003f, 0x003f, 0xc74f, 0xc76f,
519	0x003f, 0x003f, 0xc559, 0xc579, 0x003f, 0x003f, 0x003f, 0x003f,
520	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
521
522static const wvec64 u023 = {
523	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf20,
524	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
525	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
526	0xc620, 0xc720, 0xca20, 0xce20, 0x003f, 0xcd20, 0x003f, 0x003f,
527	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
528	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
529	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
530	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
531
532/* These are the non-spacing characters by themselves. They should
533 * never appear by themselves in actual text.
534 */
535static const wvec64 u030 = {
536	0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x003f, 0x00c6, 0x00c7,
537	0x00c8, 0x003f, 0x00ca, 0x00cd, 0x00cf, 0x003f, 0x003f, 0x003f,
538	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
539	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
540	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x00cb,
541	0x00ce, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
542	0x003f, 0x003f, 0x00cc, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
543	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
544
545/* None of the following blocks are defined in T.61.
546 */
547static const wvec64 u1e0 = {
548	0x003f, 0x003f, 0xc742, 0xc762, 0x003f, 0x003f, 0x003f, 0x003f,
549	0x003f, 0x003f, 0xc744, 0xc764, 0x003f, 0x003f, 0x003f, 0x003f,
550	0xcb44, 0xcb64, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
551	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc746, 0xc766,
552	0xc547, 0xc567, 0xc748, 0xc768, 0x003f, 0x003f, 0xc848, 0xc868,
553	0xcb48, 0xcb68, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
554	0xc24b, 0xc26b, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
555	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc24d, 0xc26d,
556};
557
558static const wvec64 u1e1 = {
559	0xc74d, 0xc76d, 0x003f, 0x003f, 0xc74e, 0xc76e, 0x003f, 0x003f,
560	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
561	0x003f, 0x003f, 0x003f, 0x003f, 0xc250, 0xc270, 0xc750, 0xc770,
562	0xc752, 0xc772, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
563	0xc753, 0xc773, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
564	0x003f, 0x003f, 0xc754, 0xc774, 0x003f, 0x003f, 0x003f, 0x003f,
565	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
566	0x003f, 0x003f, 0x003f, 0x003f, 0xc456, 0xc476, 0x003f, 0x003f,
567};
568
569static const wvec64 u1e2 = {
570	0xc157, 0xc177, 0xc257, 0xc277, 0xc857, 0xc877, 0xc757, 0xc777,
571	0x003f, 0x003f, 0xc758, 0xc778, 0xc858, 0xc878, 0xc759, 0xc779,
572	0xc35a, 0xc37a, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc874,
573	0xca77, 0xca79, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
574	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
575	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
576	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
577	0x003f, 0x003f, 0x003f, 0x003f, 0xc445, 0xc465, 0x003f, 0x003f,
578};
579
580static const wvec64 u1e3 = {
581	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
582	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
583	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
584	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
585	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
586	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
587	0x003f, 0x003f, 0xc159, 0xc179, 0x003f, 0x003f, 0x003f, 0x003f,
588	0xc459, 0xc479, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
589};
590
591static const wvec64 *wc00[] = {
592	&u000, &u001, &u002, &u003,
593	&u010, &u011, NULL, &u013,
594	&u020, NULL, NULL, &u023,
595	&u030, NULL, NULL, NULL};
596
597static const wvec64 *wc1e[] = {
598	&u1e0, &u1e1, &u1e2, &u1e3};
599
600
601int ldap_utf8s_to_t61s( struct berval *src, struct berval *dst )
602{
603	char *c, *d;
604	wchar_t tmp;
605	int i, j, tlen = 0;
606
607	/* Just count the length of the T.61 result first */
608	for (i=0,c=src->bv_val; i < src->bv_len;) {
609		j = ldap_x_utf8_to_wc( &tmp, c );
610		if (j == -1)
611			return LDAP_INVALID_SYNTAX;
612		switch (tmp >> 8) {
613		case 0x00:
614		case 0x01:
615		case 0x02:
616		case 0x03:
617			if (wc00[tmp >> 6] &&
618				((*wc00[tmp >> 6])[tmp & 0x3f] & 0xff00)) {
619				tlen++;
620			}
621			tlen++;
622			break;
623		case 0x1e:
624			if ((*wc1e[(tmp >> 6) & 3])[tmp & 0x3f] & 0xff00) {
625				tlen++;
626			}
627		case 0x21:
628		default:
629			tlen ++;
630			break;
631		}
632		i += j;
633		c += j;
634	}
635	dst->bv_len = tlen;
636	dst->bv_val = LDAP_MALLOC( tlen+1 );
637	if (!dst->bv_val)
638		return LDAP_NO_MEMORY;
639
640	d = dst->bv_val;
641	for (i=0,c=src->bv_val; i < src->bv_len;) {
642		j = ldap_x_utf8_to_wc( &tmp, c );
643		switch (tmp >> 8) {
644		case 0x00:
645		case 0x01:
646		case 0x02:
647			if (wc00[tmp >> 6]) {
648				tmp = (*wc00[tmp >> 6])[tmp & 0x3f];
649				if (tmp & 0xff00)
650					*d++ = (tmp >> 8);
651				*d++ = tmp & 0xff;
652			} else {
653				*d++ = 0x3f;
654			}
655			break;
656		case 0x03:
657			/* swap order of non-spacing characters */
658			if (wc00[tmp >> 6]) {
659				wchar_t t2 = (*wc00[tmp >> 6])[tmp & 0x3f];
660				if (t2 != 0x3f) {
661					d[0] = d[-1];
662					d[-1] = t2;
663					d++;
664				} else {
665					*d++ = 0x3f;
666				}
667			} else {
668				*d++ = 0x3f;
669			}
670			break;
671		case 0x1e:
672			tmp = (*wc1e[(tmp >> 6) & 3])[tmp & 0x3f];
673			if (tmp & 0xff00)
674				*d++ = (tmp >> 8);
675			*d++ = tmp & 0xff;
676			break;
677		case 0x21:
678			if (tmp == 0x2126) {
679				*d++ = 0xe0;
680				break;
681			}
682			/* FALLTHRU */
683		default:
684			*d++ = 0x3f;
685			break;
686		}
687		i += j;
688		c += j;
689	}
690	*d = '\0';
691	return LDAP_SUCCESS;
692}
693