t61.c revision 1.1.1.3
1/*	$NetBSD: t61.c,v 1.1.1.3 2010/03/08 02:14:20 lukem Exp $	*/
2
3/* OpenLDAP: pkg/ldap/libraries/libldap/t61.c,v 1.9.2.5 2009/01/22 00:00:56 kurt Exp */
4/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 *
6 * Copyright 2002-2009 The OpenLDAP Foundation.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted only as authorized by the OpenLDAP
11 * Public License.
12 *
13 * A copy of this license is available in the file LICENSE in the
14 * top-level directory of the distribution or, alternatively, at
15 * <http://www.OpenLDAP.org/license.html>.
16 */
17/* ACKNOWLEDGEMENTS:
18 * This work was initially developed by Howard Chu for inclusion in
19 * OpenLDAP Software.
20 */
21
22/*
23 * Basic T.61 <-> UTF-8 conversion
24 *
25 * These routines will perform a lossless translation from T.61 to UTF-8
26 * and a lossy translation from UTF-8 to T.61.
27 */
28
29#include "portable.h"
30
31#include <stdio.h>
32
33#include <ac/stdlib.h>
34
35#include <ac/socket.h>
36#include <ac/string.h>
37#include <ac/time.h>
38
39#include "ldap-int.h"
40#include "ldap_utf8.h"
41
42#include "ldap_defaults.h"
43
44/*
45 * T.61 is somewhat braindead; even in the 7-bit space it is not
46 * completely equivalent to 7-bit US-ASCII. Our definition of the
47 * character set comes from RFC 1345 with a slightly more readable
48 * rendition at http://std.dkuug.dk/i18n/charmaps/T.61-8BIT.
49 *
50 * Even though '#' and '$' are present in the 7-bit US-ASCII space,
51 * (x23 and x24, resp.) in T.61 they are mapped to 8-bit characters
52 * xA6 and xA4.
53 *
54 * Also T.61 lacks
55 *	backslash 	\	(x5C)
56 *	caret		^	(x5E)
57 *	backquote	`	(x60)
58 *	left brace	{	(x7B)
59 *	right brace	}	(x7D)
60 *	tilde		~	(x7E)
61 *
62 * In T.61, the codes xC1 to xCF (excluding xC9, unused) are non-spacing
63 * accents of some form or another. There are predefined combinations
64 * for certain characters, but they can also be used arbitrarily. The
65 * table at dkuug.dk maps these accents to the E000 "private use" range
66 * of the Unicode space, but I believe they more properly belong in the
67 * 0300 range (non-spacing accents). The transformation is complicated
68 * slightly because Unicode wants the non-spacing character to follow
69 * the base character, while T.61 has the non-spacing character leading.
70 * Also, T.61 specifically recognizes certain combined pairs as "characters"
71 * but doesn't specify how to treat unrecognized pairs. This code will
72 * always attempt to combine pairs when a known Unicode composite exists.
73 */
74
/* Forward map: T.61 byte value (0x00-0xff) -> Unicode code point.
 *
 * A zero entry marks a byte that the validation and conversion routines
 * in this file reject as invalid T.61 (byte 0x00 itself is one of them).
 * '$' and '#' live at 0xa4 and 0xa6; entries 0xc1-0xcf hold the Unicode
 * combining (U+03xx) form of the T.61 non-spacing accents.
 */
75static const wchar_t t61_tab[] = {
76	0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
77	0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
78	0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
79	0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f,
80	0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027,
81	0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f,
82	0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037,
83	0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f,
84	0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047,
85	0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f,
86	0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057,
87	0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f,
88	0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067,
89	0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f,
90	0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077,
91	0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f,
92	0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087,
93	0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f,
94	0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097,
95	0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f,
96	0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7,
97	0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000,
98	0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7,
99	0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf,
100	0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307,
101	0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c,
102	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
103	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
104	0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f,
105	0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149,
106	0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140,
107	0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000
108};
109
/* Fixed-size rows of code points used by the lookup tables in this file:
 * 16 entries for the spacing accents, 32 for the accent/base composite
 * rows and 64 for the Unicode -> T.61 reverse-map rows.
 */
110typedef wchar_t wvec16[16];
111typedef wchar_t wvec32[32];
112typedef wchar_t wvec64[64];
113
114/* Substitutions when 0xc1-0xcf appears by itself or with space 0x20 */
/* Indexed by the low nibble of the accent byte; each entry is the Unicode
 * spacing form of that accent.  A 0 entry (slots 0x0, 0x9 and 0xc) means
 * no spacing substitute is available.
 */
115static const wvec16 accents = {
116	0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
117	0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};
118
119/* In the following tables, base characters commented in (parentheses)
120 * are not defined by T.61 but are mapped anyway since their Unicode
121 * composite exists.
122 */
123
124/* Grave accented chars AEIOU (NWY) */
/* Unicode composites for T.61 accent 0xc1 followed by a base letter.
 * Each row is indexed by (base & 0x1f); a 0 entry means no composite.
 */
125static const wvec32 c1_vec1 = {
126	/* Upper case */
127	0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
128	0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0};
129static const wvec32 c1_vec2 = {
130	/* Lower case */
131	0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
132	0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0};
133
/* Row selector indexed by (base >> 5): slot 2 covers base bytes 0x40-0x5f,
 * slot 3 covers 0x60-0x7f; NULL means no composites in that range.
 */
134static const wvec32 *c1_grave[] = {
135	NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL
136};
137
138/* Acute accented chars AEIOUYCLNRSZ (GKMPW) */
/* Unicode composites for T.61 accent 0xc2 followed by a base character.
 * Each row is indexed by (base & 0x1f); a 0 entry means no composite.
 */
139static const wvec32 c2_vec1 = {
140	/* Upper case */
141	0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
142	0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
143	0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
144	0, 0xdd, 0x179, 0, 0, 0, 0, 0};
145static const wvec32 c2_vec2 = {
146	/* Lower case */
147	0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
148	0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
149	0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
150	0, 0xfd, 0x17a, 0, 0, 0, 0, 0};
151static const wvec32 c2_vec3 = {
152	/* (AE and ae) */
	/* base bytes 0xe1 and 0xf1, i.e. row indexes 1 and 17 */
153	0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
154	0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
155
/* Row selector indexed by (base >> 5): slot 2 covers base bytes 0x40-0x5f,
 * slot 3 covers 0x60-0x7f, slot 7 covers 0xe0-0xff; NULL means no
 * composites in that range.
 */
156static const wvec32 *c2_acute[] = {
157	NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3
158};
159
160/* Circumflex AEIOUYCGHJSW (Z) */
/* Unicode composites for T.61 accent 0xc3 followed by a base letter.
 * Each row is indexed by (base & 0x1f); a 0 entry means no composite.
 */
161static const wvec32 c3_vec1 = {
162	/* Upper case */
163	0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
164	0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
165	0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
166	0, 0x176, 0x1e90, 0, 0, 0, 0, 0};
167static const wvec32 c3_vec2 = {
168	/* Lower case */
169	0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
170	0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
171	0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
172	0, 0x177, 0x1e91, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 covers base bytes 0x40-0x5f,
 * slot 3 covers 0x60-0x7f; NULL means no composites in that range.
 */
173static const wvec32 *c3_circumflex[] = {
174	NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL
175};
176
177/* Tilde AIOUN (EVY) */
/* Unicode composites for T.61 accent 0xc4 followed by a base letter.
 * Each row is indexed by (base & 0x1f); a 0 entry means no composite.
 */
178static const wvec32 c4_vec1 = {
179	/* Upper case */
180	0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
181	0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
182static const wvec32 c4_vec2 = {
183	/* Lower case */
184	0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
185	0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 covers base bytes 0x40-0x5f,
 * slot 3 covers 0x60-0x7f; NULL means no composites in that range.
 */
186static const wvec32 *c4_tilde[] = {
187	NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
188};
189
190/* Macron AEIOU (YG) */
/* Unicode composites for T.61 accent 0xc5 followed by a base character.
 * Each row is indexed by (base & 0x1f); a 0 entry means no composite.
 */
191static const wvec32 c5_vec1 = {
192	/* Upper case */
193	0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
194	0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0};
195static const wvec32 c5_vec2 = {
196	/* Lower case */
197	0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
198	0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0};
199static const wvec32 c5_vec3 = {
200	/* (AE and ae) */
	/* base bytes 0xe1 and 0xf1, i.e. row indexes 1 and 17 */
201	0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202	0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 covers base bytes 0x40-0x5f,
 * slot 3 covers 0x60-0x7f, slot 7 covers 0xe0-0xff; NULL means no
 * composites in that range.
 */
203static const wvec32 *c5_macron[] = {
204	NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3
205};
206
207/* Breve AUG (EIO) */
/* Unicode composites for T.61 accent 0xc6 followed by a base letter.
 * Each row is indexed by (base & 0x1f); a 0 entry means no composite.
 */
208static const wvec32 c6_vec1 = {
209	/* Upper case */
210	0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
211	0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
212static const wvec32 c6_vec2 = {
213	/* Lower case */
214	0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
215	0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 covers base bytes 0x40-0x5f,
 * slot 3 covers 0x60-0x7f; NULL means no composites in that range.
 */
216static const wvec32 *c6_breve[] = {
217	NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL
218};
219
220/* Dot Above CEGIZ (AOBDFHMNPRSTWXY) */
/* Unicode composites for T.61 accent 0xc7 followed by a base letter.
 * Each row is indexed by (base & 0x1f); a 0 entry means no composite.
 */
221static const wvec32 c7_vec1 = {
222	/* Upper case */
223	0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
224	0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
225	0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
226	0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0};
227static const wvec32 c7_vec2 = {
228	/* Lower case */
229	0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
230	0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
231	0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
232	0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 covers base bytes 0x40-0x5f,
 * slot 3 covers 0x60-0x7f; NULL means no composites in that range.
 */
233static const wvec32 *c7_dotabove[] = {
234	NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL
235};
236
237/* Diaeresis AEIOUY (HWXt) */
/* Unicode composites for T.61 accent 0xc8 followed by a base letter.
 * Each row is indexed by (base & 0x1f); a 0 entry means no composite.
 */
238static const wvec32 c8_vec1 = {
239	/* Upper case */
240	0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
241	0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0};
242static const wvec32 c8_vec2 = {
243	/* Lower case */
244	0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
245	0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 covers base bytes 0x40-0x5f,
 * slot 3 covers 0x60-0x7f; NULL means no composites in that range.
 */
246static const wvec32 *c8_diaeresis[] = {
247	NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL
248};
249
250/* Ring Above AU (wy) */
/* Unicode composites for T.61 accent 0xca followed by a base letter.
 * Each row is indexed by (base & 0x1f); a 0 entry means no composite.
 */
251static const wvec32 ca_vec1 = {
252	/* Upper case */
253	0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
254	0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
255static const wvec32 ca_vec2 = {
256	/* Lower case */
257	0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
258	0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 covers base bytes 0x40-0x5f,
 * slot 3 covers 0x60-0x7f; NULL means no composites in that range.
 */
259static const wvec32 *ca_ringabove[] = {
260	NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL
261};
262
263/* Cedilla CGKLNRST (EDH) */
/* Unicode composites for T.61 accent 0xcb followed by a base letter.
 * Each row is indexed by (base & 0x1f); a 0 entry means no composite.
 */
264static const wvec32 cb_vec1 = {
265	/* Upper case */
266	0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
267	0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
268	0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
269static const wvec32 cb_vec2 = {
270	/* Lower case */
271	0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
272	0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
273	0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 covers base bytes 0x40-0x5f,
 * slot 3 covers 0x60-0x7f; NULL means no composites in that range.
 */
274static const wvec32 *cb_cedilla[] = {
275	NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL
276};
277
278/* Double Acute Accent OU */
/* Unicode composites for T.61 accent 0xcd followed by a base letter.
 * Each row is indexed by (base & 0x1f); a 0 entry means no composite.
 */
279static const wvec32 cd_vec1 = {
280	/* Upper case */
281	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150,
282	0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
283static const wvec32 cd_vec2 = {
284	/* Lower case */
285	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151,
286	0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 covers base bytes 0x40-0x5f,
 * slot 3 covers 0x60-0x7f; NULL means no composites in that range.
 */
287static const wvec32 *cd_doubleacute[] = {
288	NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL
289};
290
291/* Ogonek AEIU (O) */
/* Unicode composites for T.61 accent 0xce followed by a base letter.
 * Each row is indexed by (base & 0x1f); a 0 entry means no composite.
 */
292static const wvec32 ce_vec1 = {
293	/* Upper case */
294	0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
295	0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
296static const wvec32 ce_vec2 = {
297	/* Lower case */
298	0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
299	0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 covers base bytes 0x40-0x5f,
 * slot 3 covers 0x60-0x7f; NULL means no composites in that range.
 */
300static const wvec32 *ce_ogonek[] = {
301	NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL
302};
303
304/* Caron CDELNRSTZ (AIOUGKjH) */
/* Unicode composites for T.61 accent 0xcf followed by a base letter.
 * Each row is indexed by (base & 0x1f); a 0 entry means no composite.
 */
305static const wvec32 cf_vec1 = {
306	/* Upper case */
307	0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
308	0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
309	0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
310	0, 0, 0x17d, 0, 0, 0, 0, 0};
311static const wvec32 cf_vec2 = {
312	/* Lower case */
313	0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
314	0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
315	0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
316	0, 0, 0x17e, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 covers base bytes 0x40-0x5f,
 * slot 3 covers 0x60-0x7f; NULL means no composites in that range.
 */
317static const wvec32 *cf_caron[] = {
318	NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL
319};
320
/* Composite tables for each T.61 accent, indexed by the low nibble of the
 * accent byte (0xc0-0xcf).  NULL (slots 0x0, 0x9 and 0xc) means there are
 * no composite tables for that byte.
 */
321static const wvec32 **cx_tab[] = {
322	NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
323	c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
324	cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron };
325
326int ldap_t61s_valid( struct berval *str )
327{
328	unsigned char *c = (unsigned char *)str->bv_val;
329	int i;
330
331	for (i=0; i < str->bv_len; c++,i++)
332		if (!t61_tab[*c])
333			return 0;
334	return 1;
335}
336
337/* Transform a T.61 string to UTF-8.
338 */
339int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst )
340{
341	unsigned char *c;
342	char *d;
343	int i, wlen = 0;
344
345	/* Just count the length of the UTF-8 result first */
346	for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
347		/* Invalid T.61 characters? */
348		if (!t61_tab[*c])
349			return LDAP_INVALID_SYNTAX;
350		if ((*c & 0xf0) == 0xc0) {
351			int j = *c & 0x0f;
352			/* If this is the end of the string, or if the base
353			 * character is just a space, treat this as a regular
354			 * spacing character.
355			 */
356			if ((!c[1] || c[1] == 0x20) && accents[j]) {
357				wlen += ldap_x_wc_to_utf8(NULL, accents[j], 0);
358			} else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
359			/* We have a composite mapping for this pair */
360				(*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
361				wlen += ldap_x_wc_to_utf8( NULL,
362					(*cx_tab[j][c[1]>>5])[c[1]&0x1f], 0);
363			} else {
364			/* No mapping, just swap it around so the base
365			 * character comes first.
366			 */
367			 	wlen += ldap_x_wc_to_utf8(NULL, c[1], 0);
368				wlen += ldap_x_wc_to_utf8(NULL,
369					t61_tab[*c], 0);
370			}
371			c++; i++;
372			continue;
373		} else {
374			wlen += ldap_x_wc_to_utf8(NULL, t61_tab[*c], 0);
375		}
376	}
377
378	/* Now transform the string */
379	dst->bv_len = wlen;
380	dst->bv_val = LDAP_MALLOC( wlen+1 );
381	d = dst->bv_val;
382	if (!d)
383		return LDAP_NO_MEMORY;
384
385	for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
386		if ((*c & 0xf0) == 0xc0) {
387			int j = *c & 0x0f;
388			/* If this is the end of the string, or if the base
389			 * character is just a space, treat this as a regular
390			 * spacing character.
391			 */
392			if ((!c[1] || c[1] == 0x20) && accents[j]) {
393				d += ldap_x_wc_to_utf8(d, accents[j], 6);
394			} else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
395			/* We have a composite mapping for this pair */
396				(*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
397				d += ldap_x_wc_to_utf8(d,
398				(*cx_tab[j][c[1]>>5])[c[1]&0x1f], 6);
399			} else {
400			/* No mapping, just swap it around so the base
401			 * character comes first.
402			 */
403				d += ldap_x_wc_to_utf8(d, c[1], 6);
404				d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
405			}
406			c++; i++;
407			continue;
408		} else {
409			d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
410		}
411	}
412	*d = '\0';
413	return LDAP_SUCCESS;
414}
415
416/* For the reverse mapping, we just pay attention to the Latin-oriented
417 * code blocks. These are
418 *	0000 - 007f Basic Latin
419 *	0080 - 00ff Latin-1 Supplement
420 *	0100 - 017f Latin Extended-A
421 *	0180 - 024f Latin Extended-B
422 *	1e00 - 1eff Latin Extended Additional
423 *
424 * We have a special case to map Ohm U2126 back to T.61 0xe0. All other
425 * unrecognized characters are replaced with '?' 0x3f.
426 */
427
/* Reverse map for U+0000-U+003F, indexed by (code point & 0x3f).
 * Every entry is a single T.61 byte; '#' and '$' move to 0xa6 and 0xa4.
 */
428static const wvec64 u000 = {
429	0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
430	0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
431	0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
432	0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
433	0x0020, 0x0021, 0x0022, 0x00a6, 0x00a4, 0x0025, 0x0026, 0x0027,
434	0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
435	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
436	0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f};
437
438/* In this range, we've mapped caret to xc3/x20, backquote to xc1/x20,
 * and tilde to xc4/x20. T.61 (stupidly!) doesn't define these characters
 * on their own, even though it provides them as combiners for other
 * letters. T.61 doesn't define these pairings either, so this may just
 * have to be replaced with '?' 0x3f if other software can't cope with it.
 *
 * Reverse map for U+0040-U+007F, indexed by (code point & 0x3f).  An
 * entry with a non-zero high byte stands for two T.61 bytes, high byte
 * first; 0x3f ('?') marks a character T.61 cannot represent.
 */
443static const wvec64 u001 = {
444	0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
445	0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
446	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
447	0x0058, 0x0059, 0x005a, 0x005b, 0x003f, 0x005d, 0xc320, 0x005f,
448	0xc120, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
449	0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
450	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
451	0x0078, 0x0079, 0x007a, 0x003f, 0x007c, 0x003f, 0xc420, 0x007f};
453
/* Reverse map for U+0080-U+00BF, indexed by (code point & 0x3f).  An
 * entry with a non-zero high byte stands for two T.61 bytes (accent, then
 * space 0x20); 0x3f ('?') marks a character T.61 cannot represent.
 */
454static const wvec64 u002 = {
455	0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
456	0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
457	0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
458	0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
459	0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a8, 0x00a5, 0x003f, 0x00a7,
460	0xc820, 0x003f, 0x00e3, 0x00ab, 0x003f, 0x003f, 0x003f, 0xc520,
461	0x00b0, 0x00b1, 0x00b2, 0x00b3, 0xc220, 0x00b5, 0x00b6, 0x00b7,
462	0xcb20, 0x003f, 0x00eb, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf};
463
/* Reverse map for U+00C0-U+00FF, indexed by (code point & 0x3f).  An
 * entry with a non-zero high byte stands for two T.61 bytes: the accent
 * (high byte) followed by the base letter (low byte).
 */
464static const wvec64 u003 = {
465	0xc141, 0xc241, 0xc341, 0xc441, 0xc841, 0xca41, 0x00e1, 0xcb43,
466	0xc145, 0xc245, 0xc345, 0xc845, 0xc149, 0xc249, 0xc349, 0xc849,
467	0x00e2, 0xc44e, 0xc14f, 0xc24f, 0xc34f, 0xc44f, 0xc84f, 0x00b4,
468	0x00e9, 0xc155, 0xc255, 0xc355, 0xc855, 0xc259, 0x00ec, 0x00fb,
469	0xc161, 0xc261, 0xc361, 0xc461, 0xc861, 0xca61, 0x00f1, 0xcb63,
470	0xc165, 0xc265, 0xc365, 0xc865, 0xc169, 0xc269, 0xc369, 0xc869,
471	0x00f3, 0xc46e, 0xc16f, 0xc26f, 0xc36f, 0xc46f, 0xc86f, 0x00b8,
472	0x00f9, 0xc175, 0xc275, 0xc375, 0xc875, 0xc279, 0x00fc, 0xc879};
473
474/* These codes are used here but not defined by T.61:
 * x114 = xc6/x45, x115 = xc6/x65, x12c = xc6/x49, x12d = xc6/x69
 *
 * Reverse map for U+0100-U+013F, indexed by (code point & 0x3f).  An
 * entry with a non-zero high byte stands for two T.61 bytes (accent, then
 * base letter); 0x3f ('?') marks a character T.61 cannot represent.
 */
477static const wvec64 u010 = {
478	0xc541, 0xc561, 0xc641, 0xc661, 0xce41, 0xce61, 0xc243, 0xc263,
479	0xc343, 0xc363, 0xc743, 0xc763, 0xcf43, 0xcf63, 0xcf44, 0xcf64,
480	0x003f, 0x00f2, 0xc545, 0xc565, 0xc645, 0xc665, 0xc745, 0xc765,
481	0xce45, 0xce65, 0xcf45, 0xcf65, 0xc347, 0xc367, 0xc647, 0xc667,
482	0xc747, 0xc767, 0xcb47, 0xcb67, 0xc348, 0xc368, 0x00e4, 0x00f4,
483	0xc449, 0xc469, 0xc549, 0xc569, 0xc649, 0xc669, 0xce49, 0xce69,
484	0xc749, 0x00f5, 0x00e6, 0x00f6, 0xc34a, 0xc36a, 0xcb4b, 0xcb6b,
485	0x00f0, 0xc24c, 0xc26c, 0xcb4c, 0xcb6c, 0xcf4c, 0xcf6c, 0x00e7};
486
487/* These codes are used here but not defined by T.61:
 * x14e = xc6/x4f, x14f = xc6/x6f
 *
 * Reverse map for U+0140-U+017F, indexed by (code point & 0x3f).  An
 * entry with a non-zero high byte stands for two T.61 bytes (accent, then
 * base letter); 0x3f ('?') marks a character T.61 cannot represent.
 */
490static const wvec64 u011 = {
491	0x00f7, 0x00e8, 0x00f8, 0xc24e, 0xc26e, 0xcb4e, 0xcb6e, 0xcf4e,
492	0xcf6e, 0x00ef, 0x00ee, 0x00fe, 0xc54f, 0xc56f, 0xc64f, 0xc66f,
493	0xcd4f, 0xcd6f, 0x00ea, 0x00fa, 0xc252, 0xc272, 0xcb52, 0xcb72,
494	0xcf52, 0xcf72, 0xc253, 0xc273, 0xc353, 0xc373, 0xcb53, 0xcb73,
495	0xcf53, 0xcf73, 0xcb54, 0xcb74, 0xcf54, 0xcf74, 0x00ed, 0x00fd,
496	0xc455, 0xc475, 0xc555, 0xc575, 0xc655, 0xc675, 0xca55, 0xca75,
497	0xcd55, 0xcd75, 0xce55, 0xce75, 0xc357, 0xc377, 0xc359, 0xc379,
498	0xc859, 0xc25a, 0xc27a, 0xc75a, 0xc77a, 0xcf5a, 0xcf7a, 0x003f};
499
500/* All of the codes in this block are undefined in T.61.
 *
 * Reverse map for U+01C0-U+01FF, indexed by (code point & 0x3f).  An
 * entry with a non-zero high byte stands for two T.61 bytes (accent, then
 * base character); 0x3f ('?') marks a character with no mapping.
 */
502static const wvec64 u013 = {
503	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
504	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf41, 0xcf61, 0xcf49,
505	0xcf69, 0xcf4f, 0xcf6f, 0xcf55, 0xcf75, 0x003f, 0x003f, 0x003f,
506	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
507	0x003f, 0x003f, 0xc5e1, 0xc5f1, 0x003f, 0x003f, 0xcf47, 0xcf67,
508	0xcf4b, 0xcf6b, 0xce4f, 0xce6f, 0x003f, 0x003f, 0x003f, 0x003f,
509	0xcf6a, 0x003f, 0x003f, 0x003f, 0xc247, 0xc267, 0x003f, 0x003f,
510	0xc14e, 0xc16e, 0x003f, 0x003f, 0xc2e1, 0xc2f1, 0x003f, 0x003f};
511
512/* All of the codes in this block are undefined in T.61.
 *
 * Reverse map for U+0200-U+023F, indexed by (code point & 0x3f).  An
 * entry with a non-zero high byte stands for two T.61 bytes (accent, then
 * base letter); 0x3f ('?') marks a character with no mapping.
 */
514static const wvec64 u020 = {
515	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
516	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
517	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
518	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf48, 0xcf68,
519	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc741, 0xc761,
520	0xcb45, 0xcb65, 0x003f, 0x003f, 0x003f, 0x003f, 0xc74f, 0xc76f,
521	0x003f, 0x003f, 0xc559, 0xc579, 0x003f, 0x003f, 0x003f, 0x003f,
522	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
523
/* Reverse map for U+02C0-U+02FF, indexed by (code point & 0x3f).  The
 * mapped entries are spacing accents, written as two T.61 bytes: the
 * accent followed by space 0x20.  0x3f ('?') marks a character with no
 * mapping.
 */
524static const wvec64 u023 = {
525	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf20,
526	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
527	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
528	0xc620, 0xc720, 0xca20, 0xce20, 0x003f, 0xcd20, 0x003f, 0x003f,
529	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
530	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
531	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
532	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
533
534/* These are the non-spacing characters by themselves. They should
 * never appear by themselves in actual text.
 *
 * Reverse map for U+0300-U+033F, indexed by (code point & 0x3f).  Each
 * mapped entry is the single T.61 accent byte (0xc1-0xcf); 0x3f ('?')
 * marks a combining character with no T.61 equivalent.
 */
537static const wvec64 u030 = {
538	0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x003f, 0x00c6, 0x00c7,
539	0x00c8, 0x003f, 0x00ca, 0x00cd, 0x00cf, 0x003f, 0x003f, 0x003f,
540	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
541	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
542	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x00cb,
543	0x00ce, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
544	0x003f, 0x003f, 0x00cc, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
545	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
546
547/* None of the following blocks are defined in T.61.
 *
 * Reverse map for U+1E00-U+1E3F, indexed by (code point & 0x3f).  An
 * entry with a non-zero high byte stands for two T.61 bytes (accent, then
 * base letter); 0x3f ('?') marks a character with no mapping.
 */
549static const wvec64 u1e0 = {
550	0x003f, 0x003f, 0xc742, 0xc762, 0x003f, 0x003f, 0x003f, 0x003f,
551	0x003f, 0x003f, 0xc744, 0xc764, 0x003f, 0x003f, 0x003f, 0x003f,
552	0xcb44, 0xcb64, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
553	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc746, 0xc766,
554	0xc547, 0xc567, 0xc748, 0xc768, 0x003f, 0x003f, 0xc848, 0xc868,
555	0xcb48, 0xcb68, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
556	0xc24b, 0xc26b, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
557	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc24d, 0xc26d,
558};
559
/* Reverse map for U+1E40-U+1E7F, indexed by (code point & 0x3f).  An
 * entry with a non-zero high byte stands for two T.61 bytes (accent, then
 * base letter); 0x3f ('?') marks a character with no mapping.
 */
560static const wvec64 u1e1 = {
561	0xc74d, 0xc76d, 0x003f, 0x003f, 0xc74e, 0xc76e, 0x003f, 0x003f,
562	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
563	0x003f, 0x003f, 0x003f, 0x003f, 0xc250, 0xc270, 0xc750, 0xc770,
564	0xc752, 0xc772, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
565	0xc753, 0xc773, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
566	0x003f, 0x003f, 0xc754, 0xc774, 0x003f, 0x003f, 0x003f, 0x003f,
567	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
568	0x003f, 0x003f, 0x003f, 0x003f, 0xc456, 0xc476, 0x003f, 0x003f,
569};
570
/* Reverse map for U+1E80-U+1EBF, indexed by (code point & 0x3f).  An
 * entry with a non-zero high byte stands for two T.61 bytes (accent, then
 * base letter); 0x3f ('?') marks a character with no mapping.
 */
571static const wvec64 u1e2 = {
572	0xc157, 0xc177, 0xc257, 0xc277, 0xc857, 0xc877, 0xc757, 0xc777,
573	0x003f, 0x003f, 0xc758, 0xc778, 0xc858, 0xc878, 0xc759, 0xc779,
574	0xc35a, 0xc37a, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc874,
575	0xca77, 0xca79, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
576	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
577	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
578	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
579	0x003f, 0x003f, 0x003f, 0x003f, 0xc445, 0xc465, 0x003f, 0x003f,
580};
581
/* Reverse map for U+1EC0-U+1EFF, indexed by (code point & 0x3f).  An
 * entry with a non-zero high byte stands for two T.61 bytes (accent, then
 * base letter); 0x3f ('?') marks a character with no mapping.
 */
582static const wvec64 u1e3 = {
583	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
584	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
585	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
586	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
587	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
588	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
589	0x003f, 0x003f, 0xc159, 0xc179, 0x003f, 0x003f, 0x003f, 0x003f,
590	0xc459, 0xc479, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
591};
592
/* Reverse-map rows for U+0000-U+03FF, indexed by (code point >> 6), i.e.
 * one row per 64 code points.  NULL means the whole 64-character range
 * has no T.61 mapping.
 */
593static const wvec64 *wc00[] = {
594	&u000, &u001, &u002, &u003,
595	&u010, &u011, NULL, &u013,
596	&u020, NULL, NULL, &u023,
597	&u030, NULL, NULL, NULL};
598
/* Reverse-map rows for U+1E00-U+1EFF, indexed by ((code point >> 6) & 3) */
599static const wvec64 *wc1e[] = {
600	&u1e0, &u1e1, &u1e2, &u1e3};
601
602
603int ldap_utf8s_to_t61s( struct berval *src, struct berval *dst )
604{
605	char *c, *d;
606	wchar_t tmp;
607	int i, j, tlen = 0;
608
609	/* Just count the length of the T.61 result first */
610	for (i=0,c=src->bv_val; i < src->bv_len;) {
611		j = ldap_x_utf8_to_wc( &tmp, c );
612		if (j == -1)
613			return LDAP_INVALID_SYNTAX;
614		switch (tmp >> 8) {
615		case 0x00:
616		case 0x01:
617		case 0x02:
618		case 0x03:
619			if (wc00[tmp >> 6] &&
620				((*wc00[tmp >> 6])[tmp & 0x3f] & 0xff00)) {
621				tlen++;
622			}
623			tlen++;
624			break;
625		case 0x1e:
626			if ((*wc1e[(tmp >> 6) & 3])[tmp & 0x3f] & 0xff00) {
627				tlen++;
628			}
629		case 0x21:
630		default:
631			tlen ++;
632			break;
633		}
634		i += j;
635		c += j;
636	}
637	dst->bv_len = tlen;
638	dst->bv_val = LDAP_MALLOC( tlen+1 );
639	if (!dst->bv_val)
640		return LDAP_NO_MEMORY;
641
642	d = dst->bv_val;
643	for (i=0,c=src->bv_val; i < src->bv_len;) {
644		j = ldap_x_utf8_to_wc( &tmp, c );
645		switch (tmp >> 8) {
646		case 0x00:
647		case 0x01:
648		case 0x02:
649			if (wc00[tmp >> 6]) {
650				tmp = (*wc00[tmp >> 6])[tmp & 0x3f];
651				if (tmp & 0xff00)
652					*d++ = (tmp >> 8);
653				*d++ = tmp & 0xff;
654			} else {
655				*d++ = 0x3f;
656			}
657			break;
658		case 0x03:
659			/* swap order of non-spacing characters */
660			if (wc00[tmp >> 6]) {
661				wchar_t t2 = (*wc00[tmp >> 6])[tmp & 0x3f];
662				if (t2 != 0x3f) {
663					d[0] = d[-1];
664					d[-1] = t2;
665					d++;
666				} else {
667					*d++ = 0x3f;
668				}
669			} else {
670				*d++ = 0x3f;
671			}
672			break;
673		case 0x1e:
674			tmp = (*wc1e[(tmp >> 6) & 3])[tmp & 0x3f];
675			if (tmp & 0xff00)
676				*d++ = (tmp >> 8);
677			*d++ = tmp & 0xff;
678			break;
679		case 0x21:
680			if (tmp == 0x2126) {
681				*d++ = 0xe0;
682				break;
683			}
684			/* FALLTHRU */
685		default:
686			*d++ = 0x3f;
687			break;
688		}
689		i += j;
690		c += j;
691	}
692	*d = '\0';
693	return LDAP_SUCCESS;
694}
695