1/*	$NetBSD: t61.c,v 1.3 2021/08/14 16:14:56 christos Exp $	*/
2
3/* $OpenLDAP$ */
4/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 *
6 * Copyright 2002-2021 The OpenLDAP Foundation.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted only as authorized by the OpenLDAP
11 * Public License.
12 *
13 * A copy of this license is available in the file LICENSE in the
14 * top-level directory of the distribution or, alternatively, at
15 * <http://www.OpenLDAP.org/license.html>.
16 */
17/* ACKNOWLEDGEMENTS:
18 * This work was initially developed by Howard Chu for inclusion in
19 * OpenLDAP Software.
20 */
21
22/*
23 * Basic T.61 <-> UTF-8 conversion
24 *
25 * These routines will perform a lossless translation from T.61 to UTF-8
26 * and a lossy translation from UTF-8 to T.61.
27 */
28
29#include <sys/cdefs.h>
30__RCSID("$NetBSD: t61.c,v 1.3 2021/08/14 16:14:56 christos Exp $");
31
32#include "portable.h"
33
34#include <stdio.h>
35
36#include <ac/stdlib.h>
37
38#include <ac/socket.h>
39#include <ac/string.h>
40#include <ac/time.h>
41
42#include "ldap-int.h"
43#include "ldap_utf8.h"
44
45#include "ldap_defaults.h"
46
47/*
48 * T.61 is somewhat braindead; even in the 7-bit space it is not
49 * completely equivalent to 7-bit US-ASCII. Our definition of the
50 * character set comes from RFC 1345 with a slightly more readable
51 * rendition at http://std.dkuug.dk/i18n/charmaps/T.61-8BIT.
52 *
53 * Even though '#' and '$' are present in the 7-bit US-ASCII space,
54 * (x23 and x24, resp.) in T.61 they are mapped to 8-bit characters
55 * xA6 and xA4.
56 *
57 * Also T.61 lacks
58 *	backslash 	\	(x5C)
59 *	caret		^	(x5E)
60 *	backquote	`	(x60)
61 *	left brace	{	(x7B)
62 *	right brace	}	(x7D)
63 *	tilde		~	(x7E)
64 *
65 * In T.61, the codes xC1 to xCF (excluding xC9, unused) are non-spacing
66 * accents of some form or another. There are predefined combinations
67 * for certain characters, but they can also be used arbitrarily. The
68 * table at dkuug.dk maps these accents to the E000 "private use" range
69 * of the Unicode space, but I believe they more properly belong in the
70 * 0300 range (non-spacing accents). The transformation is complicated
71 * slightly because Unicode wants the non-spacing character to follow
72 * the base character, while T.61 has the non-spacing character leading.
73 * Also, T.61 specifically recognizes certain combined pairs as "characters"
74 * but doesn't specify how to treat unrecognized pairs. This code will
75 * always attempt to combine pairs when a known Unicode composite exists.
76 */
77
/* Forward map from a T.61 byte (the array index, 0x00-0xff) to its
 * Unicode code point.  An entry of zero marks a byte that the routines
 * below reject as invalid T.61; note that this includes byte 0x00.
 * Entries 0xc1-0xcf hold the Unicode combining form of each accent.
 */
static const wchar_t t61_tab[] = {
	0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
	0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
	0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
	0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f,
	0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027,
	0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f,
	0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037,
	0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f,
	0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047,
	0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f,
	0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057,
	0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f,
	0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067,
	0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f,
	0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077,
	0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f,
	0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087,
	0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f,
	0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097,
	0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f,
	0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7,
	0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000,
	0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7,
	0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf,
	0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307,
	0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c,
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
	0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f,
	0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149,
	0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140,
	0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000
};
112
/* Fixed-length rows of wide characters used by the lookup tables in
 * this file (16, 32 and 64 entries respectively).
 */
typedef wchar_t wvec16[16];
typedef wchar_t wvec32[32];
typedef wchar_t wvec64[64];
116
/* Substitutions when 0xc1-0xcf appears by itself or with space 0x20.
 * Indexed by the low nibble of the accent byte; a zero entry means no
 * spacing substitute is provided for that accent.
 */
static const wvec16 accents = {
	0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
	0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};
121
122/* In the following tables, base characters commented in (parentheses)
123 * are not defined by T.61 but are mapped anyway since their Unicode
124 * composite exists.
125 */
126
/* Grave accented chars AEIOU (NWY).
 * Each vector is indexed by (base byte & 0x1f); zero means no composite.
 */
static const wvec32 c1_vec1 = {
	/* Upper case */
	0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
	0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0};
static const wvec32 c1_vec2 = {
	/* Lower case */
	0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
	0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0};

/* Indexed by (base byte >> 5); NULL where that 32-code range has no
 * composites for this accent.
 */
static const wvec32 *c1_grave[] = {
	NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL
};
140
/* Acute accented chars AEIOUYCLNRSZ (GKMPW).
 * Each vector is indexed by (base byte & 0x1f); zero means no composite.
 */
static const wvec32 c2_vec1 = {
	/* Upper case */
	0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
	0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
	0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
	0, 0xdd, 0x179, 0, 0, 0, 0, 0};
static const wvec32 c2_vec2 = {
	/* Lower case */
	0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
	0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
	0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
	0, 0xfd, 0x17a, 0, 0, 0, 0, 0};
static const wvec32 c2_vec3 = {
	/* (AE and ae) -- T.61 bytes 0xe1 and 0xf1 */
	0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

/* Indexed by (base byte >> 5); NULL where that 32-code range has no
 * composites for this accent.
 */
static const wvec32 *c2_acute[] = {
	NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3
};
162
/* Circumflex AEIOUYCGHJSW (Z).
 * Each vector is indexed by (base byte & 0x1f); zero means no composite.
 */
static const wvec32 c3_vec1 = {
	/* Upper case */
	0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
	0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
	0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
	0, 0x176, 0x1e90, 0, 0, 0, 0, 0};
static const wvec32 c3_vec2 = {
	/* Lower case */
	0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
	0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
	0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
	0, 0x177, 0x1e91, 0, 0, 0, 0, 0};
/* Indexed by (base byte >> 5); NULL where that 32-code range has no
 * composites for this accent.
 */
static const wvec32 *c3_circumflex[] = {
	NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL
};
179
/* Tilde AIOUN (EVY).
 * Each vector is indexed by (base byte & 0x1f); zero means no composite.
 */
static const wvec32 c4_vec1 = {
	/* Upper case */
	0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
	0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
static const wvec32 c4_vec2 = {
	/* Lower case */
	0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
	0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
/* Indexed by (base byte >> 5); NULL where that 32-code range has no
 * composites for this accent.
 */
static const wvec32 *c4_tilde[] = {
	NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
};
192
/* Macron AEIOU (YG).
 * Each vector is indexed by (base byte & 0x1f); zero means no composite.
 */
static const wvec32 c5_vec1 = {
	/* Upper case */
	0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
	0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0};
static const wvec32 c5_vec2 = {
	/* Lower case */
	0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
	0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0};
static const wvec32 c5_vec3 = {
	/* (AE and ae) -- T.61 bytes 0xe1 and 0xf1 */
	0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Indexed by (base byte >> 5); NULL where that 32-code range has no
 * composites for this accent.
 */
static const wvec32 *c5_macron[] = {
	NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3
};
209
/* Breve AUG (EIO).
 * Each vector is indexed by (base byte & 0x1f); zero means no composite.
 */
static const wvec32 c6_vec1 = {
	/* Upper case */
	0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
	0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 c6_vec2 = {
	/* Lower case */
	0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
	0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Indexed by (base byte >> 5); NULL where that 32-code range has no
 * composites for this accent.
 */
static const wvec32 *c6_breve[] = {
	NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL
};
222
/* Dot Above CEGIZ (AOBDFHMNPRSTWXY).
 * Each vector is indexed by (base byte & 0x1f); zero means no composite.
 */
static const wvec32 c7_vec1 = {
	/* Upper case */
	0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
	0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
	0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
	0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0};
static const wvec32 c7_vec2 = {
	/* Lower case */
	0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
	0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
	0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
	0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0};
/* Indexed by (base byte >> 5); NULL where that 32-code range has no
 * composites for this accent.
 */
static const wvec32 *c7_dotabove[] = {
	NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL
};
239
/* Diaeresis AEIOUY (HWXt).
 * Each vector is indexed by (base byte & 0x1f); zero means no composite.
 */
static const wvec32 c8_vec1 = {
	/* Upper case */
	0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
	0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0};
static const wvec32 c8_vec2 = {
	/* Lower case */
	0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
	0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0};
/* Indexed by (base byte >> 5); NULL where that 32-code range has no
 * composites for this accent.
 */
static const wvec32 *c8_diaeresis[] = {
	NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL
};
252
/* Ring Above AU (wy).
 * Each vector is indexed by (base byte & 0x1f); zero means no composite.
 */
static const wvec32 ca_vec1 = {
	/* Upper case */
	0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 ca_vec2 = {
	/* Lower case */
	0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0};
/* Indexed by (base byte >> 5); NULL where that 32-code range has no
 * composites for this accent.
 */
static const wvec32 *ca_ringabove[] = {
	NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL
};
265
/* Cedilla CGKLNRST (EDH).
 * Each vector is indexed by (base byte & 0x1f); zero means no composite.
 */
static const wvec32 cb_vec1 = {
	/* Upper case */
	0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
	0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
	0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 cb_vec2 = {
	/* Lower case */
	0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
	0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
	0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Indexed by (base byte >> 5); NULL where that 32-code range has no
 * composites for this accent.
 */
static const wvec32 *cb_cedilla[] = {
	NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL
};
280
/* Double Acute Accent OU.
 * Each vector is indexed by (base byte & 0x1f); zero means no composite.
 */
static const wvec32 cd_vec1 = {
	/* Upper case */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150,
	0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 cd_vec2 = {
	/* Lower case */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151,
	0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Indexed by (base byte >> 5); NULL where that 32-code range has no
 * composites for this accent.
 */
static const wvec32 *cd_doubleacute[] = {
	NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL
};
293
/* Ogonek AEIU (O).
 * Each vector is indexed by (base byte & 0x1f); zero means no composite.
 */
static const wvec32 ce_vec1 = {
	/* Upper case */
	0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
	0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 ce_vec2 = {
	/* Lower case */
	0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
	0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Indexed by (base byte >> 5); NULL where that 32-code range has no
 * composites for this accent.
 */
static const wvec32 *ce_ogonek[] = {
	NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL
};
306
/* Caron CDELNRSTZ (AIOUGKjH).
 * Each vector is indexed by (base byte & 0x1f); zero means no composite.
 */
static const wvec32 cf_vec1 = {
	/* Upper case */
	0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
	0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
	0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
	0, 0, 0x17d, 0, 0, 0, 0, 0};
static const wvec32 cf_vec2 = {
	/* Lower case */
	0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
	0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
	0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
	0, 0, 0x17e, 0, 0, 0, 0, 0};
/* Indexed by (base byte >> 5); NULL where that 32-code range has no
 * composites for this accent.
 */
static const wvec32 *cf_caron[] = {
	NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL
};
323
/* Top-level composite lookup, indexed by the low nibble of the accent
 * byte (0xc0-0xcf).  NULL entries are accents with no composite tables.
 * A composite for accent byte a and base byte b is found at
 * (*cx_tab[a & 0x0f][b >> 5])[b & 0x1f], after checking both pointer
 * levels for NULL.
 */
static const wvec32 **cx_tab[] = {
	NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
	c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
	cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron };
328
329int ldap_t61s_valid( struct berval *str )
330{
331	unsigned char *c = (unsigned char *)str->bv_val;
332	int i;
333
334	for (i=0; i < str->bv_len; c++,i++)
335		if (!t61_tab[*c])
336			return 0;
337	return 1;
338}
339
340/* Transform a T.61 string to UTF-8.
341 */
342int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst )
343{
344	unsigned char *c;
345	char *d;
346	int i, wlen = 0;
347
348	/* Just count the length of the UTF-8 result first */
349	for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
350		/* Invalid T.61 characters? */
351		if (!t61_tab[*c])
352			return LDAP_INVALID_SYNTAX;
353		if ((*c & 0xf0) == 0xc0) {
354			int j = *c & 0x0f;
355			/* If this is the end of the string, or if the base
356			 * character is just a space, treat this as a regular
357			 * spacing character.
358			 */
359			if ((!c[1] || c[1] == 0x20) && accents[j]) {
360				wlen += ldap_x_wc_to_utf8(NULL, accents[j], 0);
361			} else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
362			/* We have a composite mapping for this pair */
363				(*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
364				wlen += ldap_x_wc_to_utf8( NULL,
365					(*cx_tab[j][c[1]>>5])[c[1]&0x1f], 0);
366			} else {
367			/* No mapping, just swap it around so the base
368			 * character comes first.
369			 */
370			 	wlen += ldap_x_wc_to_utf8(NULL, c[1], 0);
371				wlen += ldap_x_wc_to_utf8(NULL,
372					t61_tab[*c], 0);
373			}
374			c++; i++;
375			continue;
376		} else {
377			wlen += ldap_x_wc_to_utf8(NULL, t61_tab[*c], 0);
378		}
379	}
380
381	/* Now transform the string */
382	dst->bv_len = wlen;
383	dst->bv_val = LDAP_MALLOC( wlen+1 );
384	d = dst->bv_val;
385	if (!d)
386		return LDAP_NO_MEMORY;
387
388	for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
389		if ((*c & 0xf0) == 0xc0) {
390			int j = *c & 0x0f;
391			/* If this is the end of the string, or if the base
392			 * character is just a space, treat this as a regular
393			 * spacing character.
394			 */
395			if ((!c[1] || c[1] == 0x20) && accents[j]) {
396				d += ldap_x_wc_to_utf8(d, accents[j], 6);
397			} else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
398			/* We have a composite mapping for this pair */
399				(*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
400				d += ldap_x_wc_to_utf8(d,
401				(*cx_tab[j][c[1]>>5])[c[1]&0x1f], 6);
402			} else {
403			/* No mapping, just swap it around so the base
404			 * character comes first.
405			 */
406				d += ldap_x_wc_to_utf8(d, c[1], 6);
407				d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
408			}
409			c++; i++;
410			continue;
411		} else {
412			d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
413		}
414	}
415	*d = '\0';
416	return LDAP_SUCCESS;
417}
418
419/* For the reverse mapping, we just pay attention to the Latin-oriented
420 * code blocks. These are
421 *	0000 - 007f Basic Latin
422 *	0080 - 00ff Latin-1 Supplement
423 *	0100 - 017f Latin Extended-A
424 *	0180 - 024f Latin Extended-B
425 *	1e00 - 1eff Latin Extended Additional
426 *
427 * We have a special case to map Ohm U2126 back to T.61 0xe0. All other
428 * unrecognized characters are replaced with '?' 0x3f.
429 */
430
/* Reverse map for U+0000-U+003F, indexed by (code point & 0x3f).
 * Each entry is the T.61 encoding: a single byte in the low 8 bits, or,
 * when the high byte is non-zero, two bytes (high byte first).
 */
static const wvec64 u000 = {
	0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
	0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
	0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
	0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
	0x0020, 0x0021, 0x0022, 0x00a6, 0x00a4, 0x0025, 0x0026, 0x0027,
	0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
	0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f};
440
/* Reverse map for U+0040-U+007F, indexed by (code point & 0x3f).
 *
 * In this range, we've mapped caret to xc3/x20, backquote to xc1/x20,
 * and tilde to xc4/x20. T.61 (stupidly!) doesn't define these characters
 * on their own, even though it provides them as combiners for other
 * letters. T.61 doesn't define these pairings either, so this may just
 * have to be replaced with '?' 0x3f if other software can't cope with it.
 */
static const wvec64 u001 = {
	0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
	0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
	0x0058, 0x0059, 0x005a, 0x005b, 0x003f, 0x005d, 0xc320, 0x005f,
	0xc120, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
	0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
	0x0078, 0x0079, 0x007a, 0x003f, 0x007c, 0x003f, 0xc420, 0x007f};
456
/* Reverse map for U+0080-U+00BF, indexed by (code point & 0x3f).
 * Spacing accents map to an accent byte followed by space (0x..20);
 * 0x003f ('?') marks characters with no T.61 equivalent.
 */
static const wvec64 u002 = {
	0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
	0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
	0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
	0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
	0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a8, 0x00a5, 0x003f, 0x00a7,
	0xc820, 0x003f, 0x00e3, 0x00ab, 0x003f, 0x003f, 0x003f, 0xc520,
	0x00b0, 0x00b1, 0x00b2, 0x00b3, 0xc220, 0x00b5, 0x00b6, 0x00b7,
	0xcb20, 0x003f, 0x00eb, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf};
466
/* Reverse map for U+00C0-U+00FF, indexed by (code point & 0x3f).
 * Accented letters map to a two-byte pair: T.61 accent byte in the
 * high byte, base letter in the low byte.
 */
static const wvec64 u003 = {
	0xc141, 0xc241, 0xc341, 0xc441, 0xc841, 0xca41, 0x00e1, 0xcb43,
	0xc145, 0xc245, 0xc345, 0xc845, 0xc149, 0xc249, 0xc349, 0xc849,
	0x00e2, 0xc44e, 0xc14f, 0xc24f, 0xc34f, 0xc44f, 0xc84f, 0x00b4,
	0x00e9, 0xc155, 0xc255, 0xc355, 0xc855, 0xc259, 0x00ec, 0x00fb,
	0xc161, 0xc261, 0xc361, 0xc461, 0xc861, 0xca61, 0x00f1, 0xcb63,
	0xc165, 0xc265, 0xc365, 0xc865, 0xc169, 0xc269, 0xc369, 0xc869,
	0x00f3, 0xc46e, 0xc16f, 0xc26f, 0xc36f, 0xc46f, 0xc86f, 0x00b8,
	0x00f9, 0xc175, 0xc275, 0xc375, 0xc875, 0xc279, 0x00fc, 0xc879};
476
/* Reverse map for U+0100-U+013F, indexed by (code point & 0x3f).
 *
 * These codes are used here but not defined by T.61:
 * x114 = xc6/x45, x115 = xc6/x65, x12c = xc6/x49, x12d = xc6/x69
 */
static const wvec64 u010 = {
	0xc541, 0xc561, 0xc641, 0xc661, 0xce41, 0xce61, 0xc243, 0xc263,
	0xc343, 0xc363, 0xc743, 0xc763, 0xcf43, 0xcf63, 0xcf44, 0xcf64,
	0x003f, 0x00f2, 0xc545, 0xc565, 0xc645, 0xc665, 0xc745, 0xc765,
	0xce45, 0xce65, 0xcf45, 0xcf65, 0xc347, 0xc367, 0xc647, 0xc667,
	0xc747, 0xc767, 0xcb47, 0xcb67, 0xc348, 0xc368, 0x00e4, 0x00f4,
	0xc449, 0xc469, 0xc549, 0xc569, 0xc649, 0xc669, 0xce49, 0xce69,
	0xc749, 0x00f5, 0x00e6, 0x00f6, 0xc34a, 0xc36a, 0xcb4b, 0xcb6b,
	0x00f0, 0xc24c, 0xc26c, 0xcb4c, 0xcb6c, 0xcf4c, 0xcf6c, 0x00e7};
489
/* Reverse map for U+0140-U+017F, indexed by (code point & 0x3f).
 *
 * These codes are used here but not defined by T.61:
 * x14e = xc6/x4f, x14f = xc6/x6f
 */
static const wvec64 u011 = {
	0x00f7, 0x00e8, 0x00f8, 0xc24e, 0xc26e, 0xcb4e, 0xcb6e, 0xcf4e,
	0xcf6e, 0x00ef, 0x00ee, 0x00fe, 0xc54f, 0xc56f, 0xc64f, 0xc66f,
	0xcd4f, 0xcd6f, 0x00ea, 0x00fa, 0xc252, 0xc272, 0xcb52, 0xcb72,
	0xcf52, 0xcf72, 0xc253, 0xc273, 0xc353, 0xc373, 0xcb53, 0xcb73,
	0xcf53, 0xcf73, 0xcb54, 0xcb74, 0xcf54, 0xcf74, 0x00ed, 0x00fd,
	0xc455, 0xc475, 0xc555, 0xc575, 0xc655, 0xc675, 0xca55, 0xca75,
	0xcd55, 0xcd75, 0xce55, 0xce75, 0xc357, 0xc377, 0xc359, 0xc379,
	0xc859, 0xc25a, 0xc27a, 0xc75a, 0xc77a, 0xcf5a, 0xcf7a, 0x003f};
502
/* Reverse map for U+01C0-U+01FF, indexed by (code point & 0x3f).
 *
 * All of the codes in this block are undefined in T.61.
 */
static const wvec64 u013 = {
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf41, 0xcf61, 0xcf49,
	0xcf69, 0xcf4f, 0xcf6f, 0xcf55, 0xcf75, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0xc5e1, 0xc5f1, 0x003f, 0x003f, 0xcf47, 0xcf67,
	0xcf4b, 0xcf6b, 0xce4f, 0xce6f, 0x003f, 0x003f, 0x003f, 0x003f,
	0xcf6a, 0x003f, 0x003f, 0x003f, 0xc247, 0xc267, 0x003f, 0x003f,
	0xc14e, 0xc16e, 0x003f, 0x003f, 0xc2e1, 0xc2f1, 0x003f, 0x003f};
514
/* Reverse map for U+0200-U+023F, indexed by (code point & 0x3f).
 *
 * All of the codes in this block are undefined in T.61.
 */
static const wvec64 u020 = {
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf48, 0xcf68,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc741, 0xc761,
	0xcb45, 0xcb65, 0x003f, 0x003f, 0x003f, 0x003f, 0xc74f, 0xc76f,
	0x003f, 0x003f, 0xc559, 0xc579, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
526
/* Reverse map for U+02C0-U+02FF, indexed by (code point & 0x3f).
 * The spacing accents in this range map to a T.61 accent byte followed
 * by space (0x..20); everything else becomes '?' 0x3f.
 */
static const wvec64 u023 = {
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf20,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0xc620, 0xc720, 0xca20, 0xce20, 0x003f, 0xcd20, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
536
/* Reverse map for U+0300-U+033F, indexed by (code point & 0x3f).
 *
 * These are the non-spacing characters by themselves. They should
 * never appear by themselves in actual text.
 */
static const wvec64 u030 = {
	0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x003f, 0x00c6, 0x00c7,
	0x00c8, 0x003f, 0x00ca, 0x00cd, 0x00cf, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x00cb,
	0x00ce, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x00cc, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
549
/* None of the following blocks are defined in T.61.
 */
/* Reverse map for U+1E00-U+1E3F, indexed by (code point & 0x3f). */
static const wvec64 u1e0 = {
	0x003f, 0x003f, 0xc742, 0xc762, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0xc744, 0xc764, 0x003f, 0x003f, 0x003f, 0x003f,
	0xcb44, 0xcb64, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc746, 0xc766,
	0xc547, 0xc567, 0xc748, 0xc768, 0x003f, 0x003f, 0xc848, 0xc868,
	0xcb48, 0xcb68, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0xc24b, 0xc26b, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc24d, 0xc26d,
};
562
/* Reverse map for U+1E40-U+1E7F, indexed by (code point & 0x3f).
 * Entries are accent/base byte pairs, or '?' 0x3f where unmapped.
 */
static const wvec64 u1e1 = {
	0xc74d, 0xc76d, 0x003f, 0x003f, 0xc74e, 0xc76e, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0xc250, 0xc270, 0xc750, 0xc770,
	0xc752, 0xc772, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0xc753, 0xc773, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0xc754, 0xc774, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0xc456, 0xc476, 0x003f, 0x003f,
};
573
/* Reverse map for U+1E80-U+1EBF, indexed by (code point & 0x3f).
 * Entries are accent/base byte pairs, or '?' 0x3f where unmapped.
 */
static const wvec64 u1e2 = {
	0xc157, 0xc177, 0xc257, 0xc277, 0xc857, 0xc877, 0xc757, 0xc777,
	0x003f, 0x003f, 0xc758, 0xc778, 0xc858, 0xc878, 0xc759, 0xc779,
	0xc35a, 0xc37a, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc874,
	0xca77, 0xca79, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0xc445, 0xc465, 0x003f, 0x003f,
};
584
/* Reverse map for U+1EC0-U+1EFF, indexed by (code point & 0x3f).
 * Entries are accent/base byte pairs, or '?' 0x3f where unmapped.
 */
static const wvec64 u1e3 = {
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0xc159, 0xc179, 0x003f, 0x003f, 0x003f, 0x003f,
	0xc459, 0xc479, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
};
595
/* Reverse-map directory for U+0000-U+03FF, indexed by (code point >> 6).
 * A NULL entry means the whole 64-code range has no T.61 mapping.
 */
static const wvec64 *wc00[] = {
	&u000, &u001, &u002, &u003,
	&u010, &u011, NULL, &u013,
	&u020, NULL, NULL, &u023,
	&u030, NULL, NULL, NULL};

/* Reverse-map directory for U+1E00-U+1EFF, indexed by
 * ((code point >> 6) & 3).
 */
static const wvec64 *wc1e[] = {
	&u1e0, &u1e1, &u1e2, &u1e3};
604
605
606int ldap_utf8s_to_t61s( struct berval *src, struct berval *dst )
607{
608	char *c, *d;
609	wchar_t tmp;
610	int i, j, tlen = 0;
611
612	/* Just count the length of the T.61 result first */
613	for (i=0,c=src->bv_val; i < src->bv_len;) {
614		j = ldap_x_utf8_to_wc( &tmp, c );
615		if (j == -1)
616			return LDAP_INVALID_SYNTAX;
617		switch (tmp >> 8) {
618		case 0x00:
619		case 0x01:
620		case 0x02:
621		case 0x03:
622			if (wc00[tmp >> 6] &&
623				((*wc00[tmp >> 6])[tmp & 0x3f] & 0xff00)) {
624				tlen++;
625			}
626			tlen++;
627			break;
628		case 0x1e:
629			if ((*wc1e[(tmp >> 6) & 3])[tmp & 0x3f] & 0xff00) {
630				tlen++;
631			}
632		case 0x21:
633		default:
634			tlen ++;
635			break;
636		}
637		i += j;
638		c += j;
639	}
640	dst->bv_len = tlen;
641	dst->bv_val = LDAP_MALLOC( tlen+1 );
642	if (!dst->bv_val)
643		return LDAP_NO_MEMORY;
644
645	d = dst->bv_val;
646	for (i=0,c=src->bv_val; i < src->bv_len;) {
647		j = ldap_x_utf8_to_wc( &tmp, c );
648		switch (tmp >> 8) {
649		case 0x00:
650		case 0x01:
651		case 0x02:
652			if (wc00[tmp >> 6]) {
653				tmp = (*wc00[tmp >> 6])[tmp & 0x3f];
654				if (tmp & 0xff00)
655					*d++ = (tmp >> 8);
656				*d++ = tmp & 0xff;
657			} else {
658				*d++ = 0x3f;
659			}
660			break;
661		case 0x03:
662			/* swap order of non-spacing characters */
663			if (wc00[tmp >> 6]) {
664				wchar_t t2 = (*wc00[tmp >> 6])[tmp & 0x3f];
665				if (t2 != 0x3f) {
666					d[0] = d[-1];
667					d[-1] = t2;
668					d++;
669				} else {
670					*d++ = 0x3f;
671				}
672			} else {
673				*d++ = 0x3f;
674			}
675			break;
676		case 0x1e:
677			tmp = (*wc1e[(tmp >> 6) & 3])[tmp & 0x3f];
678			if (tmp & 0xff00)
679				*d++ = (tmp >> 8);
680			*d++ = tmp & 0xff;
681			break;
682		case 0x21:
683			if (tmp == 0x2126) {
684				*d++ = 0xe0;
685				break;
686			}
687			/* FALLTHRU */
688		default:
689			*d++ = 0x3f;
690			break;
691		}
692		i += j;
693		c += j;
694	}
695	*d = '\0';
696	return LDAP_SUCCESS;
697}
698