t61.c revision 1.1.1.1
1/* $OpenLDAP: pkg/ldap/libraries/libldap/t61.c,v 1.9.2.3 2008/02/11 23:26:41 kurt Exp $ */ 2/* This work is part of OpenLDAP Software <http://www.openldap.org/>. 3 * 4 * Copyright 2002-2008 The OpenLDAP Foundation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted only as authorized by the OpenLDAP 9 * Public License. 10 * 11 * A copy of this license is available in the file LICENSE in the 12 * top-level directory of the distribution or, alternatively, at 13 * <http://www.OpenLDAP.org/license.html>. 14 */ 15/* ACKNOWLEDGEMENTS: 16 * This work was initially developed by Howard Chu for inclusion in 17 * OpenLDAP Software. 18 */ 19 20/* 21 * Basic T.61 <-> UTF-8 conversion 22 * 23 * These routines will perform a lossless translation from T.61 to UTF-8 24 * and a lossy translation from UTF-8 to T.61. 25 */ 26 27#include "portable.h" 28 29#include <stdio.h> 30 31#include <ac/stdlib.h> 32 33#include <ac/socket.h> 34#include <ac/string.h> 35#include <ac/time.h> 36 37#include "ldap-int.h" 38#include "ldap_utf8.h" 39 40#include "ldap_defaults.h" 41 42/* 43 * T.61 is somewhat braindead; even in the 7-bit space it is not 44 * completely equivalent to 7-bit US-ASCII. Our definition of the 45 * character set comes from RFC 1345 with a slightly more readable 46 * rendition at http://std.dkuug.dk/i18n/charmaps/T.61-8BIT. 47 * 48 * Even though '#' and '$' are present in the 7-bit US-ASCII space, 49 * (x23 and x24, resp.) in T.61 they are mapped to 8-bit characters 50 * xA6 and xA4. 51 * 52 * Also T.61 lacks 53 * backslash \ (x5C) 54 * caret ^ (x5E) 55 * backquote ` (x60) 56 * left brace { (x7B) 57 * right brace } (x7D) 58 * tilde ~ (x7E) 59 * 60 * In T.61, the codes xC1 to xCF (excluding xC9, unused) are non-spacing 61 * accents of some form or another. There are predefined combinations 62 * for certain characters, but they can also be used arbitrarily. The 63 * table at dkuug.dk maps these accents to the E000 "private use" range 64 * of the Unicode space, but I believe they more properly belong in the 65 * 0300 range (non-spacing accents). The transformation is complicated 66 * slightly because Unicode wants the non-spacing character to follow 67 * the base character, while T.61 has the non-spacing character leading. 68 * Also, T.61 specifically recognizes certain combined pairs as "characters" 69 * but doesn't specify how to treat unrecognized pairs. This code will 70 * always attempt to combine pairs when a known Unicode composite exists. 71 */ 72 73static const wchar_t t61_tab[] = { 74 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 75 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, 76 0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017, 77 0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f, 78 0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027, 79 0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f, 80 0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037, 81 0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f, 82 0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047, 83 0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f, 84 0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057, 85 0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f, 86 0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067, 87 0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f, 88 0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077, 89 0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f, 90 0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087, 91 0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f, 92 0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097, 93 0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f, 94 0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7, 95 0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000, 96 0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7, 97 0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf, 98 0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307, 99 0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c, 100 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 101 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 102 0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f, 103 0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149, 104 0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140, 105 0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000 106}; 107 108typedef wchar_t wvec16[16]; 109typedef wchar_t wvec32[32]; 110typedef wchar_t wvec64[64]; 111 112/* Substitutions when 0xc1-0xcf appears by itself or with space 0x20 */ 113static const wvec16 accents = { 114 0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9, 115 0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7}; 116 117/* In the following tables, base characters commented in (parentheses) 118 * are not defined by T.61 but are mapped anyway since their Unicode 119 * composite exists. 120 */ 121 122/* Grave accented chars AEIOU (NWY) */ 123static const wvec32 c1_vec1 = { 124 /* Upper case */ 125 0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2, 126 0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0}; 127static const wvec32 c1_vec2 = { 128 /* Lower case */ 129 0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2, 130 0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0}; 131 132static const wvec32 *c1_grave[] = { 133 NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL 134}; 135 136/* Acute accented chars AEIOUYCLNRSZ (GKMPW) */ 137static const wvec32 c2_vec1 = { 138 /* Upper case */ 139 0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4, 140 0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3, 141 0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82, 142 0, 0xdd, 0x179, 0, 0, 0, 0, 0}; 143static const wvec32 c2_vec2 = { 144 /* Lower case */ 145 0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5, 146 0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3, 147 0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83, 148 0, 0xfd, 0x17a, 0, 0, 0, 0, 0}; 149static const wvec32 c2_vec3 = { 150 /* (AE and ae) */ 151 0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 152 0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 153 154static const wvec32 *c2_acute[] = { 155 NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3 156}; 157 158/* Circumflex AEIOUYCGHJSW (Z) */ 159static const wvec32 c3_vec1 = { 160 /* Upper case */ 161 0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c, 162 0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4, 163 0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174, 164 0, 0x176, 0x1e90, 0, 0, 0, 0, 0}; 165static const wvec32 c3_vec2 = { 166 /* Lower case */ 167 0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d, 168 0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4, 169 0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175, 170 0, 0x177, 0x1e91, 0, 0, 0, 0, 0}; 171static const wvec32 *c3_circumflex[] = { 172 NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL 173}; 174 175/* Tilde AIOUN (EVY) */ 176static const wvec32 c4_vec1 = { 177 /* Upper case */ 178 0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5, 179 0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0}; 180static const wvec32 c4_vec2 = { 181 /* Lower case */ 182 0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5, 183 0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0}; 184static const wvec32 *c4_tilde[] = { 185 NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL 186}; 187 188/* Macron AEIOU (YG) */ 189static const wvec32 c5_vec1 = { 190 /* Upper case */ 191 0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c, 192 0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0}; 193static const wvec32 c5_vec2 = { 194 /* Lower case */ 195 0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d, 196 0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0}; 197static const wvec32 c5_vec3 = { 198 /* (AE and ae) */ 199 0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 200 0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 201static const wvec32 *c5_macron[] = { 202 NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3 203}; 204 205/* Breve AUG (EIO) */ 206static const wvec32 c6_vec1 = { 207 /* Upper case */ 208 0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e, 209 0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 210static const wvec32 c6_vec2 = { 211 /* Lower case */ 212 0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f, 213 0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 214static const wvec32 *c6_breve[] = { 215 NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL 216}; 217 218/* Dot Above CEGIZ (AOBDFHMNPRSTWXY) */ 219static const wvec32 c7_vec1 = { 220 /* Upper case */ 221 0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120, 222 0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e, 223 0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86, 224 0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0}; 225static const wvec32 c7_vec2 = { 226 /* Lower case */ 227 0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121, 228 0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f, 229 0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87, 230 0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0}; 231static const wvec32 *c7_dotabove[] = { 232 NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL 233}; 234 235/* Diaeresis AEIOUY (HWXt) */ 236static const wvec32 c8_vec1 = { 237 /* Upper case */ 238 0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6, 239 0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0}; 240static const wvec32 c8_vec2 = { 241 /* Lower case */ 242 0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6, 243 0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0}; 244static const wvec32 *c8_diaeresis[] = { 245 NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL 246}; 247 248/* Ring Above AU (wy) */ 249static const wvec32 ca_vec1 = { 250 /* Upper case */ 251 0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 252 0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 253static const wvec32 ca_vec2 = { 254 /* Lower case */ 255 0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256 0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0}; 257static const wvec32 *ca_ringabove[] = { 258 NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL 259}; 260 261/* Cedilla CGKLNRST (EDH) */ 262static const wvec32 cb_vec1 = { 263 /* Upper case */ 264 0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122, 265 0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0, 266 0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 267static const wvec32 cb_vec2 = { 268 /* Lower case */ 269 0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123, 270 0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0, 271 0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 272static const wvec32 *cb_cedilla[] = { 273 NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL 274}; 275 276/* Double Acute Accent OU */ 277static const wvec32 cd_vec1 = { 278 /* Upper case */ 279 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150, 280 0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 281static const wvec32 cd_vec2 = { 282 /* Lower case */ 283 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151, 284 0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 285static const wvec32 *cd_doubleacute[] = { 286 NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL 287}; 288 289/* Ogonek AEIU (O) */ 290static const wvec32 ce_vec1 = { 291 /* Upper case */ 292 0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea, 293 0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 294static const wvec32 ce_vec2 = { 295 /* Lower case */ 296 0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb, 297 0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 298static const wvec32 *ce_ogonek[] = { 299 NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL 300}; 301 302/* Caron CDELNRSTZ (AIOUGKjH) */ 303static const wvec32 cf_vec1 = { 304 /* Upper case */ 305 0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6, 306 0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1, 307 0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0, 308 0, 0, 0x17d, 0, 0, 0, 0, 0}; 309static const wvec32 cf_vec2 = { 310 /* Lower case */ 311 0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7, 312 0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2, 313 0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0, 314 0, 0, 0x17e, 0, 0, 0, 0, 0}; 315static const wvec32 *cf_caron[] = { 316 NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL 317}; 318 319static const wvec32 **cx_tab[] = { 320 NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron, 321 c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove, 322 cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron }; 323 324int ldap_t61s_valid( struct berval *str ) 325{ 326 unsigned char *c = (unsigned char *)str->bv_val; 327 int i; 328 329 for (i=0; i < str->bv_len; c++,i++) 330 if (!t61_tab[*c]) 331 return 0; 332 return 1; 333} 334 335/* Transform a T.61 string to UTF-8. 336 */ 337int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst ) 338{ 339 unsigned char *c; 340 char *d; 341 int i, wlen = 0; 342 343 /* Just count the length of the UTF-8 result first */ 344 for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) { 345 /* Invalid T.61 characters? */ 346 if (!t61_tab[*c]) 347 return LDAP_INVALID_SYNTAX; 348 if ((*c & 0xf0) == 0xc0) { 349 int j = *c & 0x0f; 350 /* If this is the end of the string, or if the base 351 * character is just a space, treat this as a regular 352 * spacing character. 353 */ 354 if ((!c[1] || c[1] == 0x20) && accents[j]) { 355 wlen += ldap_x_wc_to_utf8(NULL, accents[j], 0); 356 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] && 357 /* We have a composite mapping for this pair */ 358 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) { 359 wlen += ldap_x_wc_to_utf8( NULL, 360 (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 0); 361 } else { 362 /* No mapping, just swap it around so the base 363 * character comes first. 364 */ 365 wlen += ldap_x_wc_to_utf8(NULL, c[1], 0); 366 wlen += ldap_x_wc_to_utf8(NULL, 367 t61_tab[*c], 0); 368 } 369 c++; i++; 370 continue; 371 } else { 372 wlen += ldap_x_wc_to_utf8(NULL, t61_tab[*c], 0); 373 } 374 } 375 376 /* Now transform the string */ 377 dst->bv_len = wlen; 378 dst->bv_val = LDAP_MALLOC( wlen+1 ); 379 d = dst->bv_val; 380 if (!d) 381 return LDAP_NO_MEMORY; 382 383 for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) { 384 if ((*c & 0xf0) == 0xc0) { 385 int j = *c & 0x0f; 386 /* If this is the end of the string, or if the base 387 * character is just a space, treat this as a regular 388 * spacing character. 389 */ 390 if ((!c[1] || c[1] == 0x20) && accents[j]) { 391 d += ldap_x_wc_to_utf8(d, accents[j], 6); 392 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] && 393 /* We have a composite mapping for this pair */ 394 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) { 395 d += ldap_x_wc_to_utf8(d, 396 (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 6); 397 } else { 398 /* No mapping, just swap it around so the base 399 * character comes first. 400 */ 401 d += ldap_x_wc_to_utf8(d, c[1], 6); 402 d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6); 403 } 404 c++; i++; 405 continue; 406 } else { 407 d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6); 408 } 409 } 410 *d = '\0'; 411 return LDAP_SUCCESS; 412} 413 414/* For the reverse mapping, we just pay attention to the Latin-oriented 415 * code blocks. These are 416 * 0000 - 007f Basic Latin 417 * 0080 - 00ff Latin-1 Supplement 418 * 0100 - 017f Latin Extended-A 419 * 0180 - 024f Latin Extended-B 420 * 1e00 - 1eff Latin Extended Additional 421 * 422 * We have a special case to map Ohm U2126 back to T.61 0xe0. All other 423 * unrecognized characters are replaced with '?' 0x3f. 424 */ 425 426static const wvec64 u000 = { 427 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 428 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 429 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 430 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 431 0x0020, 0x0021, 0x0022, 0x00a6, 0x00a4, 0x0025, 0x0026, 0x0027, 432 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 433 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 434 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f}; 435 436/* In this range, we've mapped caret to xc3/x20, backquote to xc1/x20, 437 * and tilde to xc4/x20. T.61 (stupidly!) doesn't define these characters 438 * on their own, even though it provides them as combiners for other 439 * letters. T.61 doesn't define these pairings either, so this may just 440 * have to be replaced with '?' 0x3f if other software can't cope with it. 441 */ 442static const wvec64 u001 = { 443 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 444 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 445 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 446 0x0058, 0x0059, 0x005a, 0x005b, 0x003f, 0x005d, 0xc320, 0x005f, 447 0xc120, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 448 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 449 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 450 0x0078, 0x0079, 0x007a, 0x003f, 0x007c, 0x003f, 0xc420, 0x007f}; 451 452static const wvec64 u002 = { 453 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 454 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 455 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 456 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, 457 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a8, 0x00a5, 0x003f, 0x00a7, 458 0xc820, 0x003f, 0x00e3, 0x00ab, 0x003f, 0x003f, 0x003f, 0xc520, 459 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0xc220, 0x00b5, 0x00b6, 0x00b7, 460 0xcb20, 0x003f, 0x00eb, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf}; 461 462static const wvec64 u003 = { 463 0xc141, 0xc241, 0xc341, 0xc441, 0xc841, 0xca41, 0x00e1, 0xcb43, 464 0xc145, 0xc245, 0xc345, 0xc845, 0xc149, 0xc249, 0xc349, 0xc849, 465 0x00e2, 0xc44e, 0xc14f, 0xc24f, 0xc34f, 0xc44f, 0xc84f, 0x00b4, 466 0x00e9, 0xc155, 0xc255, 0xc355, 0xc855, 0xc259, 0x00ec, 0x00fb, 467 0xc161, 0xc261, 0xc361, 0xc461, 0xc861, 0xca61, 0x00f1, 0xcb63, 468 0xc165, 0xc265, 0xc365, 0xc865, 0xc169, 0xc269, 0xc369, 0xc869, 469 0x00f3, 0xc46e, 0xc16f, 0xc26f, 0xc36f, 0xc46f, 0xc86f, 0x00b8, 470 0x00f9, 0xc175, 0xc275, 0xc375, 0xc875, 0xc279, 0x00fc, 0xc879}; 471 472/* These codes are used here but not defined by T.61: 473 * x114 = xc6/x45, x115 = xc6/x65, x12c = xc6/x49, x12d = xc6/x69 474 */ 475static const wvec64 u010 = { 476 0xc541, 0xc561, 0xc641, 0xc661, 0xce41, 0xce61, 0xc243, 0xc263, 477 0xc343, 0xc363, 0xc743, 0xc763, 0xcf43, 0xcf63, 0xcf44, 0xcf64, 478 0x003f, 0x00f2, 0xc545, 0xc565, 0xc645, 0xc665, 0xc745, 0xc765, 479 0xce45, 0xce65, 0xcf45, 0xcf65, 0xc347, 0xc367, 0xc647, 0xc667, 480 0xc747, 0xc767, 0xcb47, 0xcb67, 0xc348, 0xc368, 0x00e4, 0x00f4, 481 0xc449, 0xc469, 0xc549, 0xc569, 0xc649, 0xc669, 0xce49, 0xce69, 482 0xc749, 0x00f5, 0x00e6, 0x00f6, 0xc34a, 0xc36a, 0xcb4b, 0xcb6b, 483 0x00f0, 0xc24c, 0xc26c, 0xcb4c, 0xcb6c, 0xcf4c, 0xcf6c, 0x00e7}; 484 485/* These codes are used here but not defined by T.61: 486 * x14e = xc6/x4f, x14f = xc6/x6f 487 */ 488static const wvec64 u011 = { 489 0x00f7, 0x00e8, 0x00f8, 0xc24e, 0xc26e, 0xcb4e, 0xcb6e, 0xcf4e, 490 0xcf6e, 0x00ef, 0x00ee, 0x00fe, 0xc54f, 0xc56f, 0xc64f, 0xc66f, 491 0xcd4f, 0xcd6f, 0x00ea, 0x00fa, 0xc252, 0xc272, 0xcb52, 0xcb72, 492 0xcf52, 0xcf72, 0xc253, 0xc273, 0xc353, 0xc373, 0xcb53, 0xcb73, 493 0xcf53, 0xcf73, 0xcb54, 0xcb74, 0xcf54, 0xcf74, 0x00ed, 0x00fd, 494 0xc455, 0xc475, 0xc555, 0xc575, 0xc655, 0xc675, 0xca55, 0xca75, 495 0xcd55, 0xcd75, 0xce55, 0xce75, 0xc357, 0xc377, 0xc359, 0xc379, 496 0xc859, 0xc25a, 0xc27a, 0xc75a, 0xc77a, 0xcf5a, 0xcf7a, 0x003f}; 497 498/* All of the codes in this block are undefined in T.61. 499 */ 500static const wvec64 u013 = { 501 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 502 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf41, 0xcf61, 0xcf49, 503 0xcf69, 0xcf4f, 0xcf6f, 0xcf55, 0xcf75, 0x003f, 0x003f, 0x003f, 504 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 505 0x003f, 0x003f, 0xc5e1, 0xc5f1, 0x003f, 0x003f, 0xcf47, 0xcf67, 506 0xcf4b, 0xcf6b, 0xce4f, 0xce6f, 0x003f, 0x003f, 0x003f, 0x003f, 507 0xcf6a, 0x003f, 0x003f, 0x003f, 0xc247, 0xc267, 0x003f, 0x003f, 508 0xc14e, 0xc16e, 0x003f, 0x003f, 0xc2e1, 0xc2f1, 0x003f, 0x003f}; 509 510/* All of the codes in this block are undefined in T.61. 511 */ 512static const wvec64 u020 = { 513 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 514 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 515 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 516 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf48, 0xcf68, 517 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc741, 0xc761, 518 0xcb45, 0xcb65, 0x003f, 0x003f, 0x003f, 0x003f, 0xc74f, 0xc76f, 519 0x003f, 0x003f, 0xc559, 0xc579, 0x003f, 0x003f, 0x003f, 0x003f, 520 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f}; 521 522static const wvec64 u023 = { 523 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf20, 524 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 525 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 526 0xc620, 0xc720, 0xca20, 0xce20, 0x003f, 0xcd20, 0x003f, 0x003f, 527 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 528 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 529 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 530 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f}; 531 532/* These are the non-spacing characters by themselves. They should 533 * never appear by themselves in actual text. 534 */ 535static const wvec64 u030 = { 536 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x003f, 0x00c6, 0x00c7, 537 0x00c8, 0x003f, 0x00ca, 0x00cd, 0x00cf, 0x003f, 0x003f, 0x003f, 538 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 539 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 540 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x00cb, 541 0x00ce, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 542 0x003f, 0x003f, 0x00cc, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 543 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f}; 544 545/* None of the following blocks are defined in T.61. 546 */ 547static const wvec64 u1e0 = { 548 0x003f, 0x003f, 0xc742, 0xc762, 0x003f, 0x003f, 0x003f, 0x003f, 549 0x003f, 0x003f, 0xc744, 0xc764, 0x003f, 0x003f, 0x003f, 0x003f, 550 0xcb44, 0xcb64, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 551 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc746, 0xc766, 552 0xc547, 0xc567, 0xc748, 0xc768, 0x003f, 0x003f, 0xc848, 0xc868, 553 0xcb48, 0xcb68, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 554 0xc24b, 0xc26b, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 555 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc24d, 0xc26d, 556}; 557 558static const wvec64 u1e1 = { 559 0xc74d, 0xc76d, 0x003f, 0x003f, 0xc74e, 0xc76e, 0x003f, 0x003f, 560 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 561 0x003f, 0x003f, 0x003f, 0x003f, 0xc250, 0xc270, 0xc750, 0xc770, 562 0xc752, 0xc772, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 563 0xc753, 0xc773, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 564 0x003f, 0x003f, 0xc754, 0xc774, 0x003f, 0x003f, 0x003f, 0x003f, 565 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 566 0x003f, 0x003f, 0x003f, 0x003f, 0xc456, 0xc476, 0x003f, 0x003f, 567}; 568 569static const wvec64 u1e2 = { 570 0xc157, 0xc177, 0xc257, 0xc277, 0xc857, 0xc877, 0xc757, 0xc777, 571 0x003f, 0x003f, 0xc758, 0xc778, 0xc858, 0xc878, 0xc759, 0xc779, 572 0xc35a, 0xc37a, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc874, 573 0xca77, 0xca79, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 574 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 575 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 576 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 577 0x003f, 0x003f, 0x003f, 0x003f, 0xc445, 0xc465, 0x003f, 0x003f, 578}; 579 580static const wvec64 u1e3 = { 581 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 582 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 583 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 584 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 585 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 586 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 587 0x003f, 0x003f, 0xc159, 0xc179, 0x003f, 0x003f, 0x003f, 0x003f, 588 0xc459, 0xc479, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 589}; 590 591static const wvec64 *wc00[] = { 592 &u000, &u001, &u002, &u003, 593 &u010, &u011, NULL, &u013, 594 &u020, NULL, NULL, &u023, 595 &u030, NULL, NULL, NULL}; 596 597static const wvec64 *wc1e[] = { 598 &u1e0, &u1e1, &u1e2, &u1e3}; 599 600 601int ldap_utf8s_to_t61s( struct berval *src, struct berval *dst ) 602{ 603 char *c, *d; 604 wchar_t tmp; 605 int i, j, tlen = 0; 606 607 /* Just count the length of the T.61 result first */ 608 for (i=0,c=src->bv_val; i < src->bv_len;) { 609 j = ldap_x_utf8_to_wc( &tmp, c ); 610 if (j == -1) 611 return LDAP_INVALID_SYNTAX; 612 switch (tmp >> 8) { 613 case 0x00: 614 case 0x01: 615 case 0x02: 616 case 0x03: 617 if (wc00[tmp >> 6] && 618 ((*wc00[tmp >> 6])[tmp & 0x3f] & 0xff00)) { 619 tlen++; 620 } 621 tlen++; 622 break; 623 case 0x1e: 624 if ((*wc1e[(tmp >> 6) & 3])[tmp & 0x3f] & 0xff00) { 625 tlen++; 626 } 627 case 0x21: 628 default: 629 tlen ++; 630 break; 631 } 632 i += j; 633 c += j; 634 } 635 dst->bv_len = tlen; 636 dst->bv_val = LDAP_MALLOC( tlen+1 ); 637 if (!dst->bv_val) 638 return LDAP_NO_MEMORY; 639 640 d = dst->bv_val; 641 for (i=0,c=src->bv_val; i < src->bv_len;) { 642 j = ldap_x_utf8_to_wc( &tmp, c ); 643 switch (tmp >> 8) { 644 case 0x00: 645 case 0x01: 646 case 0x02: 647 if (wc00[tmp >> 6]) { 648 tmp = (*wc00[tmp >> 6])[tmp & 0x3f]; 649 if (tmp & 0xff00) 650 *d++ = (tmp >> 8); 651 *d++ = tmp & 0xff; 652 } else { 653 *d++ = 0x3f; 654 } 655 break; 656 case 0x03: 657 /* swap order of non-spacing characters */ 658 if (wc00[tmp >> 6]) { 659 wchar_t t2 = (*wc00[tmp >> 6])[tmp & 0x3f]; 660 if (t2 != 0x3f) { 661 d[0] = d[-1]; 662 d[-1] = t2; 663 d++; 664 } else { 665 *d++ = 0x3f; 666 } 667 } else { 668 *d++ = 0x3f; 669 } 670 break; 671 case 0x1e: 672 tmp = (*wc1e[(tmp >> 6) & 3])[tmp & 0x3f]; 673 if (tmp & 0xff00) 674 *d++ = (tmp >> 8); 675 *d++ = tmp & 0xff; 676 break; 677 case 0x21: 678 if (tmp == 0x2126) { 679 *d++ = 0xe0; 680 break; 681 } 682 /* FALLTHRU */ 683 default: 684 *d++ = 0x3f; 685 break; 686 } 687 } 688 *d = '\0'; 689 return LDAP_SUCCESS; 690} 691