/* t61.c revision 1.1.1.5 */
/* $NetBSD: t61.c,v 1.1.1.5 2014/05/28 09:58:42 tron Exp $ */

/* $OpenLDAP$ */
/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
 *
 * Copyright 2002-2014 The OpenLDAP Foundation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>.
 */
/* ACKNOWLEDGEMENTS:
 * This work was initially developed by Howard Chu for inclusion in
 * OpenLDAP Software.
 */

/*
 * Basic T.61 <-> UTF-8 conversion
 *
 * These routines will perform a lossless translation from T.61 to UTF-8
 * and a lossy translation from UTF-8 to T.61.
 */

#include "portable.h"

#include <stdio.h>

#include <ac/stdlib.h>

#include <ac/socket.h>
#include <ac/string.h>
#include <ac/time.h>

#include "ldap-int.h"
#include "ldap_utf8.h"

#include "ldap_defaults.h"

/*
 * T.61 is somewhat braindead; even in the 7-bit space it is not
 * completely equivalent to 7-bit US-ASCII. Our definition of the
 * character set comes from RFC 1345 with a slightly more readable
 * rendition at http://std.dkuug.dk/i18n/charmaps/T.61-8BIT.
 *
 * Even though '#' and '$' are present in the 7-bit US-ASCII space,
 * (x23 and x24, resp.) in T.61 they are mapped to 8-bit characters
 * xA6 and xA4.
 *
 * Also T.61 lacks
 *	backslash	\	(x5C)
 *	caret		^	(x5E)
 *	backquote	`	(x60)
 *	left brace	{	(x7B)
 *	right brace	}	(x7D)
 *	tilde		~	(x7E)
 *
 * In T.61, the codes xC1 to xCF (excluding xC9, unused) are non-spacing
 * accents of some form or another. There are predefined combinations
 * for certain characters, but they can also be used arbitrarily.
The
 * table at dkuug.dk maps these accents to the E000 "private use" range
 * of the Unicode space, but I believe they more properly belong in the
 * 0300 range (non-spacing accents). The transformation is complicated
 * slightly because Unicode wants the non-spacing character to follow
 * the base character, while T.61 has the non-spacing character leading.
 * Also, T.61 specifically recognizes certain combined pairs as "characters"
 * but doesn't specify how to treat unrecognized pairs. This code will
 * always attempt to combine pairs when a known Unicode composite exists.
 */

/* T.61 byte -> Unicode code point, indexed by the T.61 byte value
 * (256 entries, 8 per row). An entry of 0 marks a byte that has no
 * mapping; ldap_t61s_valid() and ldap_t61s_to_utf8s() treat such a byte
 * as invalid. Note that entry 0 itself is 0, so a NUL byte counts as
 * invalid too. Bytes 0xc1-0xcf map to combining marks in the
 * U+0300 range.
 */
static const wchar_t t61_tab[] = {
	0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
	0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
	0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
	0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f,
	0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027,
	0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f,
	0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037,
	0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f,
	0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047,
	0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f,
	0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057,
	0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f,
	0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067,
	0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f,
	0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077,
	0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f,
	0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087,
	0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f,
	0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097,
	0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f,
	0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7,
	0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000,
	0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7,
	0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf,
	0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307,
	0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c,
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
	0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f,
	0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149,
	0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140,
	0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000
};

/* Fixed-size rows of wide characters used by the lookup tables below */
typedef wchar_t wvec16[16];
typedef wchar_t wvec32[32];
typedef wchar_t wvec64[64];

/* Substitutions when 0xc1-0xcf appears by itself or with space 0x20.
 * Indexed by the low nibble of the accent byte; 0 means there is no
 * spacing form for that accent.
 */
static const wvec16 accents = {
	0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
	0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};

/* In the following tables, base characters commented in (parentheses)
 * are not defined by T.61 but are mapped anyway since their Unicode
 * composite exists.
122 */ 123 124/* Grave accented chars AEIOU (NWY) */ 125static const wvec32 c1_vec1 = { 126 /* Upper case */ 127 0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2, 128 0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0}; 129static const wvec32 c1_vec2 = { 130 /* Lower case */ 131 0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2, 132 0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0}; 133 134static const wvec32 *c1_grave[] = { 135 NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL 136}; 137 138/* Acute accented chars AEIOUYCLNRSZ (GKMPW) */ 139static const wvec32 c2_vec1 = { 140 /* Upper case */ 141 0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4, 142 0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3, 143 0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82, 144 0, 0xdd, 0x179, 0, 0, 0, 0, 0}; 145static const wvec32 c2_vec2 = { 146 /* Lower case */ 147 0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5, 148 0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3, 149 0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83, 150 0, 0xfd, 0x17a, 0, 0, 0, 0, 0}; 151static const wvec32 c2_vec3 = { 152 /* (AE and ae) */ 153 0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 154 0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 155 156static const wvec32 *c2_acute[] = { 157 NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3 158}; 159 160/* Circumflex AEIOUYCGHJSW (Z) */ 161static const wvec32 c3_vec1 = { 162 /* Upper case */ 163 0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c, 164 0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4, 165 0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174, 166 0, 0x176, 0x1e90, 0, 0, 0, 0, 0}; 167static const wvec32 c3_vec2 = { 168 /* Lower case */ 169 0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d, 170 0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4, 171 0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175, 172 0, 0x177, 0x1e91, 0, 0, 0, 0, 0}; 173static const wvec32 *c3_circumflex[] = { 174 NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL 175}; 176 177/* Tilde AIOUN (EVY) */ 178static const wvec32 c4_vec1 = { 
179 /* Upper case */ 180 0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5, 181 0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0}; 182static const wvec32 c4_vec2 = { 183 /* Lower case */ 184 0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5, 185 0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0}; 186static const wvec32 *c4_tilde[] = { 187 NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL 188}; 189 190/* Macron AEIOU (YG) */ 191static const wvec32 c5_vec1 = { 192 /* Upper case */ 193 0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c, 194 0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0}; 195static const wvec32 c5_vec2 = { 196 /* Lower case */ 197 0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d, 198 0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0}; 199static const wvec32 c5_vec3 = { 200 /* (AE and ae) */ 201 0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 202 0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 203static const wvec32 *c5_macron[] = { 204 NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3 205}; 206 207/* Breve AUG (EIO) */ 208static const wvec32 c6_vec1 = { 209 /* Upper case */ 210 0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e, 211 0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 212static const wvec32 c6_vec2 = { 213 /* Lower case */ 214 0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f, 215 0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 216static const wvec32 *c6_breve[] = { 217 NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL 218}; 219 220/* Dot Above CEGIZ (AOBDFHMNPRSTWXY) */ 221static const wvec32 c7_vec1 = { 222 /* Upper case */ 223 0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120, 224 0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e, 225 0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86, 226 0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0}; 227static const wvec32 c7_vec2 = { 228 /* 
Lower case */ 229 0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121, 230 0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f, 231 0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87, 232 0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0}; 233static const wvec32 *c7_dotabove[] = { 234 NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL 235}; 236 237/* Diaeresis AEIOUY (HWXt) */ 238static const wvec32 c8_vec1 = { 239 /* Upper case */ 240 0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6, 241 0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0}; 242static const wvec32 c8_vec2 = { 243 /* Lower case */ 244 0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6, 245 0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0}; 246static const wvec32 *c8_diaeresis[] = { 247 NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL 248}; 249 250/* Ring Above AU (wy) */ 251static const wvec32 ca_vec1 = { 252 /* Upper case */ 253 0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 254 0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 255static const wvec32 ca_vec2 = { 256 /* Lower case */ 257 0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 258 0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0}; 259static const wvec32 *ca_ringabove[] = { 260 NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL 261}; 262 263/* Cedilla CGKLNRST (EDH) */ 264static const wvec32 cb_vec1 = { 265 /* Upper case */ 266 0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122, 267 0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0, 268 0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 269static const wvec32 cb_vec2 = { 270 /* Lower case */ 271 0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123, 272 0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0, 273 0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 274static const wvec32 *cb_cedilla[] = { 275 NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL 276}; 277 278/* Double Acute Accent OU */ 279static const wvec32 cd_vec1 = { 280 /* Upper case 
*/ 281 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150, 282 0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 283static const wvec32 cd_vec2 = { 284 /* Lower case */ 285 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151, 286 0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 287static const wvec32 *cd_doubleacute[] = { 288 NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL 289}; 290 291/* Ogonek AEIU (O) */ 292static const wvec32 ce_vec1 = { 293 /* Upper case */ 294 0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea, 295 0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 296static const wvec32 ce_vec2 = { 297 /* Lower case */ 298 0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb, 299 0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 300static const wvec32 *ce_ogonek[] = { 301 NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL 302}; 303 304/* Caron CDELNRSTZ (AIOUGKjH) */ 305static const wvec32 cf_vec1 = { 306 /* Upper case */ 307 0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6, 308 0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1, 309 0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0, 310 0, 0, 0x17d, 0, 0, 0, 0, 0}; 311static const wvec32 cf_vec2 = { 312 /* Lower case */ 313 0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7, 314 0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2, 315 0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0, 316 0, 0, 0x17e, 0, 0, 0, 0, 0}; 317static const wvec32 *cf_caron[] = { 318 NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL 319}; 320 321static const wvec32 **cx_tab[] = { 322 NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron, 323 c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove, 324 cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron }; 325 326int ldap_t61s_valid( struct berval *str ) 327{ 328 unsigned char *c = (unsigned char *)str->bv_val; 329 int i; 330 331 for (i=0; i < str->bv_len; c++,i++) 332 if (!t61_tab[*c]) 333 return 0; 334 return 1; 335} 336 337/* Transform a T.61 string to UTF-8. 
338 */ 339int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst ) 340{ 341 unsigned char *c; 342 char *d; 343 int i, wlen = 0; 344 345 /* Just count the length of the UTF-8 result first */ 346 for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) { 347 /* Invalid T.61 characters? */ 348 if (!t61_tab[*c]) 349 return LDAP_INVALID_SYNTAX; 350 if ((*c & 0xf0) == 0xc0) { 351 int j = *c & 0x0f; 352 /* If this is the end of the string, or if the base 353 * character is just a space, treat this as a regular 354 * spacing character. 355 */ 356 if ((!c[1] || c[1] == 0x20) && accents[j]) { 357 wlen += ldap_x_wc_to_utf8(NULL, accents[j], 0); 358 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] && 359 /* We have a composite mapping for this pair */ 360 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) { 361 wlen += ldap_x_wc_to_utf8( NULL, 362 (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 0); 363 } else { 364 /* No mapping, just swap it around so the base 365 * character comes first. 366 */ 367 wlen += ldap_x_wc_to_utf8(NULL, c[1], 0); 368 wlen += ldap_x_wc_to_utf8(NULL, 369 t61_tab[*c], 0); 370 } 371 c++; i++; 372 continue; 373 } else { 374 wlen += ldap_x_wc_to_utf8(NULL, t61_tab[*c], 0); 375 } 376 } 377 378 /* Now transform the string */ 379 dst->bv_len = wlen; 380 dst->bv_val = LDAP_MALLOC( wlen+1 ); 381 d = dst->bv_val; 382 if (!d) 383 return LDAP_NO_MEMORY; 384 385 for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) { 386 if ((*c & 0xf0) == 0xc0) { 387 int j = *c & 0x0f; 388 /* If this is the end of the string, or if the base 389 * character is just a space, treat this as a regular 390 * spacing character. 
391 */ 392 if ((!c[1] || c[1] == 0x20) && accents[j]) { 393 d += ldap_x_wc_to_utf8(d, accents[j], 6); 394 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] && 395 /* We have a composite mapping for this pair */ 396 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) { 397 d += ldap_x_wc_to_utf8(d, 398 (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 6); 399 } else { 400 /* No mapping, just swap it around so the base 401 * character comes first. 402 */ 403 d += ldap_x_wc_to_utf8(d, c[1], 6); 404 d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6); 405 } 406 c++; i++; 407 continue; 408 } else { 409 d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6); 410 } 411 } 412 *d = '\0'; 413 return LDAP_SUCCESS; 414} 415 416/* For the reverse mapping, we just pay attention to the Latin-oriented 417 * code blocks. These are 418 * 0000 - 007f Basic Latin 419 * 0080 - 00ff Latin-1 Supplement 420 * 0100 - 017f Latin Extended-A 421 * 0180 - 024f Latin Extended-B 422 * 1e00 - 1eff Latin Extended Additional 423 * 424 * We have a special case to map Ohm U2126 back to T.61 0xe0. All other 425 * unrecognized characters are replaced with '?' 0x3f. 426 */ 427 428static const wvec64 u000 = { 429 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 430 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 431 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 432 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 433 0x0020, 0x0021, 0x0022, 0x00a6, 0x00a4, 0x0025, 0x0026, 0x0027, 434 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 435 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 436 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f}; 437 438/* In this range, we've mapped caret to xc3/x20, backquote to xc1/x20, 439 * and tilde to xc4/x20. T.61 (stupidly!) doesn't define these characters 440 * on their own, even though it provides them as combiners for other 441 * letters. 
T.61 doesn't define these pairings either, so this may just
 * have to be replaced with '?' 0x3f if other software can't cope with it.
 */
/* U+0040 - U+007F */
static const wvec64 u001 = {
	0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
	0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
	0x0058, 0x0059, 0x005a, 0x005b, 0x003f, 0x005d, 0xc320, 0x005f,
	0xc120, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
	0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
	0x0078, 0x0079, 0x007a, 0x003f, 0x007c, 0x003f, 0xc420, 0x007f};

/* U+0080 - U+00BF */
static const wvec64 u002 = {
	0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
	0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
	0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
	0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
	0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a8, 0x00a5, 0x003f, 0x00a7,
	0xc820, 0x003f, 0x00e3, 0x00ab, 0x003f, 0x003f, 0x003f, 0xc520,
	0x00b0, 0x00b1, 0x00b2, 0x00b3, 0xc220, 0x00b5, 0x00b6, 0x00b7,
	0xcb20, 0x003f, 0x00eb, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf};

/* U+00C0 - U+00FF; entries with a non-zero high byte are accent/base
 * byte pairs.
 */
static const wvec64 u003 = {
	0xc141, 0xc241, 0xc341, 0xc441, 0xc841, 0xca41, 0x00e1, 0xcb43,
	0xc145, 0xc245, 0xc345, 0xc845, 0xc149, 0xc249, 0xc349, 0xc849,
	0x00e2, 0xc44e, 0xc14f, 0xc24f, 0xc34f, 0xc44f, 0xc84f, 0x00b4,
	0x00e9, 0xc155, 0xc255, 0xc355, 0xc855, 0xc259, 0x00ec, 0x00fb,
	0xc161, 0xc261, 0xc361, 0xc461, 0xc861, 0xca61, 0x00f1, 0xcb63,
	0xc165, 0xc265, 0xc365, 0xc865, 0xc169, 0xc269, 0xc369, 0xc869,
	0x00f3, 0xc46e, 0xc16f, 0xc26f, 0xc36f, 0xc46f, 0xc86f, 0x00b8,
	0x00f9, 0xc175, 0xc275, 0xc375, 0xc875, 0xc279, 0x00fc, 0xc879};

/* These codes are used here but not defined by T.61:
 * x114 = xc6/x45, x115 = xc6/x65, x12c = xc6/x49,
x12d = xc6/x69
 */
/* U+0100 - U+013F */
static const wvec64 u010 = {
	0xc541, 0xc561, 0xc641, 0xc661, 0xce41, 0xce61, 0xc243, 0xc263,
	0xc343, 0xc363, 0xc743, 0xc763, 0xcf43, 0xcf63, 0xcf44, 0xcf64,
	0x003f, 0x00f2, 0xc545, 0xc565, 0xc645, 0xc665, 0xc745, 0xc765,
	0xce45, 0xce65, 0xcf45, 0xcf65, 0xc347, 0xc367, 0xc647, 0xc667,
	0xc747, 0xc767, 0xcb47, 0xcb67, 0xc348, 0xc368, 0x00e4, 0x00f4,
	0xc449, 0xc469, 0xc549, 0xc569, 0xc649, 0xc669, 0xce49, 0xce69,
	0xc749, 0x00f5, 0x00e6, 0x00f6, 0xc34a, 0xc36a, 0xcb4b, 0xcb6b,
	0x00f0, 0xc24c, 0xc26c, 0xcb4c, 0xcb6c, 0xcf4c, 0xcf6c, 0x00e7};

/* These codes are used here but not defined by T.61:
 * x14e = xc6/x4f, x14f = xc6/x6f
 */
/* U+0140 - U+017F */
static const wvec64 u011 = {
	0x00f7, 0x00e8, 0x00f8, 0xc24e, 0xc26e, 0xcb4e, 0xcb6e, 0xcf4e,
	0xcf6e, 0x00ef, 0x00ee, 0x00fe, 0xc54f, 0xc56f, 0xc64f, 0xc66f,
	0xcd4f, 0xcd6f, 0x00ea, 0x00fa, 0xc252, 0xc272, 0xcb52, 0xcb72,
	0xcf52, 0xcf72, 0xc253, 0xc273, 0xc353, 0xc373, 0xcb53, 0xcb73,
	0xcf53, 0xcf73, 0xcb54, 0xcb74, 0xcf54, 0xcf74, 0x00ed, 0x00fd,
	0xc455, 0xc475, 0xc555, 0xc575, 0xc655, 0xc675, 0xca55, 0xca75,
	0xcd55, 0xcd75, 0xce55, 0xce75, 0xc357, 0xc377, 0xc359, 0xc379,
	0xc859, 0xc25a, 0xc27a, 0xc75a, 0xc77a, 0xcf5a, 0xcf7a, 0x003f};

/* All of the codes in this block are undefined in T.61.
 */
/* U+01C0 - U+01FF (U+0180 - U+01BF has no table and maps to '?') */
static const wvec64 u013 = {
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf41, 0xcf61, 0xcf49,
	0xcf69, 0xcf4f, 0xcf6f, 0xcf55, 0xcf75, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0xc5e1, 0xc5f1, 0x003f, 0x003f, 0xcf47, 0xcf67,
	0xcf4b, 0xcf6b, 0xce4f, 0xce6f, 0x003f, 0x003f, 0x003f, 0x003f,
	0xcf6a, 0x003f, 0x003f, 0x003f, 0xc247, 0xc267, 0x003f, 0x003f,
	0xc14e, 0xc16e, 0x003f, 0x003f, 0xc2e1, 0xc2f1, 0x003f, 0x003f};

/* All of the codes in this block are undefined in T.61.
*/
/* U+0200 - U+023F */
static const wvec64 u020 = {
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf48, 0xcf68,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc741, 0xc761,
	0xcb45, 0xcb65, 0x003f, 0x003f, 0x003f, 0x003f, 0xc74f, 0xc76f,
	0x003f, 0x003f, 0xc559, 0xc579, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};

/* U+02C0 - U+02FF: spacing accents, encoded as accent byte + space */
static const wvec64 u023 = {
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf20,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0xc620, 0xc720, 0xca20, 0xce20, 0x003f, 0xcd20, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};

/* These are the non-spacing characters by themselves. They should
 * never appear by themselves in actual text.
 */
/* U+0300 - U+033F; every mapped entry is a single T.61 accent byte */
static const wvec64 u030 = {
	0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x003f, 0x00c6, 0x00c7,
	0x00c8, 0x003f, 0x00ca, 0x00cd, 0x00cf, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x00cb,
	0x00ce, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x00cc, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};

/* None of the following blocks are defined in T.61.
548 */ 549static const wvec64 u1e0 = { 550 0x003f, 0x003f, 0xc742, 0xc762, 0x003f, 0x003f, 0x003f, 0x003f, 551 0x003f, 0x003f, 0xc744, 0xc764, 0x003f, 0x003f, 0x003f, 0x003f, 552 0xcb44, 0xcb64, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 553 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc746, 0xc766, 554 0xc547, 0xc567, 0xc748, 0xc768, 0x003f, 0x003f, 0xc848, 0xc868, 555 0xcb48, 0xcb68, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 556 0xc24b, 0xc26b, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 557 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc24d, 0xc26d, 558}; 559 560static const wvec64 u1e1 = { 561 0xc74d, 0xc76d, 0x003f, 0x003f, 0xc74e, 0xc76e, 0x003f, 0x003f, 562 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 563 0x003f, 0x003f, 0x003f, 0x003f, 0xc250, 0xc270, 0xc750, 0xc770, 564 0xc752, 0xc772, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 565 0xc753, 0xc773, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 566 0x003f, 0x003f, 0xc754, 0xc774, 0x003f, 0x003f, 0x003f, 0x003f, 567 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 568 0x003f, 0x003f, 0x003f, 0x003f, 0xc456, 0xc476, 0x003f, 0x003f, 569}; 570 571static const wvec64 u1e2 = { 572 0xc157, 0xc177, 0xc257, 0xc277, 0xc857, 0xc877, 0xc757, 0xc777, 573 0x003f, 0x003f, 0xc758, 0xc778, 0xc858, 0xc878, 0xc759, 0xc779, 574 0xc35a, 0xc37a, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc874, 575 0xca77, 0xca79, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 576 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 577 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 578 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 579 0x003f, 0x003f, 0x003f, 0x003f, 0xc445, 0xc465, 0x003f, 0x003f, 580}; 581 582static const wvec64 u1e3 = { 583 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 584 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 585 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 
0x003f, 586 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 587 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 588 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 589 0x003f, 0x003f, 0xc159, 0xc179, 0x003f, 0x003f, 0x003f, 0x003f, 590 0xc459, 0xc479, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 591}; 592 593static const wvec64 *wc00[] = { 594 &u000, &u001, &u002, &u003, 595 &u010, &u011, NULL, &u013, 596 &u020, NULL, NULL, &u023, 597 &u030, NULL, NULL, NULL}; 598 599static const wvec64 *wc1e[] = { 600 &u1e0, &u1e1, &u1e2, &u1e3}; 601 602 603int ldap_utf8s_to_t61s( struct berval *src, struct berval *dst ) 604{ 605 char *c, *d; 606 wchar_t tmp; 607 int i, j, tlen = 0; 608 609 /* Just count the length of the T.61 result first */ 610 for (i=0,c=src->bv_val; i < src->bv_len;) { 611 j = ldap_x_utf8_to_wc( &tmp, c ); 612 if (j == -1) 613 return LDAP_INVALID_SYNTAX; 614 switch (tmp >> 8) { 615 case 0x00: 616 case 0x01: 617 case 0x02: 618 case 0x03: 619 if (wc00[tmp >> 6] && 620 ((*wc00[tmp >> 6])[tmp & 0x3f] & 0xff00)) { 621 tlen++; 622 } 623 tlen++; 624 break; 625 case 0x1e: 626 if ((*wc1e[(tmp >> 6) & 3])[tmp & 0x3f] & 0xff00) { 627 tlen++; 628 } 629 case 0x21: 630 default: 631 tlen ++; 632 break; 633 } 634 i += j; 635 c += j; 636 } 637 dst->bv_len = tlen; 638 dst->bv_val = LDAP_MALLOC( tlen+1 ); 639 if (!dst->bv_val) 640 return LDAP_NO_MEMORY; 641 642 d = dst->bv_val; 643 for (i=0,c=src->bv_val; i < src->bv_len;) { 644 j = ldap_x_utf8_to_wc( &tmp, c ); 645 switch (tmp >> 8) { 646 case 0x00: 647 case 0x01: 648 case 0x02: 649 if (wc00[tmp >> 6]) { 650 tmp = (*wc00[tmp >> 6])[tmp & 0x3f]; 651 if (tmp & 0xff00) 652 *d++ = (tmp >> 8); 653 *d++ = tmp & 0xff; 654 } else { 655 *d++ = 0x3f; 656 } 657 break; 658 case 0x03: 659 /* swap order of non-spacing characters */ 660 if (wc00[tmp >> 6]) { 661 wchar_t t2 = (*wc00[tmp >> 6])[tmp & 0x3f]; 662 if (t2 != 0x3f) { 663 d[0] = d[-1]; 664 d[-1] = t2; 665 d++; 666 } else { 667 *d++ = 
0x3f; 668 } 669 } else { 670 *d++ = 0x3f; 671 } 672 break; 673 case 0x1e: 674 tmp = (*wc1e[(tmp >> 6) & 3])[tmp & 0x3f]; 675 if (tmp & 0xff00) 676 *d++ = (tmp >> 8); 677 *d++ = tmp & 0xff; 678 break; 679 case 0x21: 680 if (tmp == 0x2126) { 681 *d++ = 0xe0; 682 break; 683 } 684 /* FALLTHRU */ 685 default: 686 *d++ = 0x3f; 687 break; 688 } 689 i += j; 690 c += j; 691 } 692 *d = '\0'; 693 return LDAP_SUCCESS; 694} 695