/* t61.c revision 1.1.1.6 */
1/* $NetBSD: t61.c,v 1.1.1.6 2017/02/09 01:46:46 christos Exp $ */ 2 3/* $OpenLDAP$ */ 4/* This work is part of OpenLDAP Software <http://www.openldap.org/>. 5 * 6 * Copyright 2002-2016 The OpenLDAP Foundation. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted only as authorized by the OpenLDAP 11 * Public License. 12 * 13 * A copy of this license is available in the file LICENSE in the 14 * top-level directory of the distribution or, alternatively, at 15 * <http://www.OpenLDAP.org/license.html>. 16 */ 17/* ACKNOWLEDGEMENTS: 18 * This work was initially developed by Howard Chu for inclusion in 19 * OpenLDAP Software. 20 */ 21 22/* 23 * Basic T.61 <-> UTF-8 conversion 24 * 25 * These routines will perform a lossless translation from T.61 to UTF-8 26 * and a lossy translation from UTF-8 to T.61. 27 */ 28 29#include <sys/cdefs.h> 30__RCSID("$NetBSD: t61.c,v 1.1.1.6 2017/02/09 01:46:46 christos Exp $"); 31 32#include "portable.h" 33 34#include <stdio.h> 35 36#include <ac/stdlib.h> 37 38#include <ac/socket.h> 39#include <ac/string.h> 40#include <ac/time.h> 41 42#include "ldap-int.h" 43#include "ldap_utf8.h" 44 45#include "ldap_defaults.h" 46 47/* 48 * T.61 is somewhat braindead; even in the 7-bit space it is not 49 * completely equivalent to 7-bit US-ASCII. Our definition of the 50 * character set comes from RFC 1345 with a slightly more readable 51 * rendition at http://std.dkuug.dk/i18n/charmaps/T.61-8BIT. 52 * 53 * Even though '#' and '$' are present in the 7-bit US-ASCII space, 54 * (x23 and x24, resp.) in T.61 they are mapped to 8-bit characters 55 * xA6 and xA4. 56 * 57 * Also T.61 lacks 58 * backslash \ (x5C) 59 * caret ^ (x5E) 60 * backquote ` (x60) 61 * left brace { (x7B) 62 * right brace } (x7D) 63 * tilde ~ (x7E) 64 * 65 * In T.61, the codes xC1 to xCF (excluding xC9, unused) are non-spacing 66 * accents of some form or another. 
There are predefined combinations 67 * for certain characters, but they can also be used arbitrarily. The 68 * table at dkuug.dk maps these accents to the E000 "private use" range 69 * of the Unicode space, but I believe they more properly belong in the 70 * 0300 range (non-spacing accents). The transformation is complicated 71 * slightly because Unicode wants the non-spacing character to follow 72 * the base character, while T.61 has the non-spacing character leading. 73 * Also, T.61 specifically recognizes certain combined pairs as "characters" 74 * but doesn't specify how to treat unrecognized pairs. This code will 75 * always attempt to combine pairs when a known Unicode composite exists. 76 */ 77 78static const wchar_t t61_tab[] = { 79 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007, 80 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f, 81 0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017, 82 0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f, 83 0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027, 84 0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f, 85 0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037, 86 0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f, 87 0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047, 88 0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f, 89 0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057, 90 0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f, 91 0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067, 92 0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f, 93 0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077, 94 0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f, 95 0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087, 96 0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f, 97 0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097, 98 0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f, 99 0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7, 100 0x0a4, 
0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000, 101 0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7, 102 0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf, 103 0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307, 104 0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c, 105 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 106 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 107 0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f, 108 0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149, 109 0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140, 110 0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000 111}; 112 113typedef wchar_t wvec16[16]; 114typedef wchar_t wvec32[32]; 115typedef wchar_t wvec64[64]; 116 117/* Substitutions when 0xc1-0xcf appears by itself or with space 0x20 */ 118static const wvec16 accents = { 119 0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9, 120 0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7}; 121 122/* In the following tables, base characters commented in (parentheses) 123 * are not defined by T.61 but are mapped anyway since their Unicode 124 * composite exists. 
125 */ 126 127/* Grave accented chars AEIOU (NWY) */ 128static const wvec32 c1_vec1 = { 129 /* Upper case */ 130 0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2, 131 0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0}; 132static const wvec32 c1_vec2 = { 133 /* Lower case */ 134 0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2, 135 0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0}; 136 137static const wvec32 *c1_grave[] = { 138 NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL 139}; 140 141/* Acute accented chars AEIOUYCLNRSZ (GKMPW) */ 142static const wvec32 c2_vec1 = { 143 /* Upper case */ 144 0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4, 145 0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3, 146 0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82, 147 0, 0xdd, 0x179, 0, 0, 0, 0, 0}; 148static const wvec32 c2_vec2 = { 149 /* Lower case */ 150 0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5, 151 0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3, 152 0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83, 153 0, 0xfd, 0x17a, 0, 0, 0, 0, 0}; 154static const wvec32 c2_vec3 = { 155 /* (AE and ae) */ 156 0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 157 0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 158 159static const wvec32 *c2_acute[] = { 160 NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3 161}; 162 163/* Circumflex AEIOUYCGHJSW (Z) */ 164static const wvec32 c3_vec1 = { 165 /* Upper case */ 166 0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c, 167 0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4, 168 0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174, 169 0, 0x176, 0x1e90, 0, 0, 0, 0, 0}; 170static const wvec32 c3_vec2 = { 171 /* Lower case */ 172 0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d, 173 0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4, 174 0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175, 175 0, 0x177, 0x1e91, 0, 0, 0, 0, 0}; 176static const wvec32 *c3_circumflex[] = { 177 NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL 178}; 179 180/* Tilde AIOUN (EVY) */ 181static const wvec32 c4_vec1 = { 
182 /* Upper case */ 183 0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5, 184 0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0}; 185static const wvec32 c4_vec2 = { 186 /* Lower case */ 187 0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5, 188 0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0}; 189static const wvec32 *c4_tilde[] = { 190 NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL 191}; 192 193/* Macron AEIOU (YG) */ 194static const wvec32 c5_vec1 = { 195 /* Upper case */ 196 0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c, 197 0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0}; 198static const wvec32 c5_vec2 = { 199 /* Lower case */ 200 0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d, 201 0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0}; 202static const wvec32 c5_vec3 = { 203 /* (AE and ae) */ 204 0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 205 0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 206static const wvec32 *c5_macron[] = { 207 NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3 208}; 209 210/* Breve AUG (EIO) */ 211static const wvec32 c6_vec1 = { 212 /* Upper case */ 213 0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e, 214 0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 215static const wvec32 c6_vec2 = { 216 /* Lower case */ 217 0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f, 218 0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 219static const wvec32 *c6_breve[] = { 220 NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL 221}; 222 223/* Dot Above CEGIZ (AOBDFHMNPRSTWXY) */ 224static const wvec32 c7_vec1 = { 225 /* Upper case */ 226 0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120, 227 0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e, 228 0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86, 229 0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0}; 230static const wvec32 c7_vec2 = { 231 /* 
Lower case */ 232 0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121, 233 0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f, 234 0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87, 235 0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0}; 236static const wvec32 *c7_dotabove[] = { 237 NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL 238}; 239 240/* Diaeresis AEIOUY (HWXt) */ 241static const wvec32 c8_vec1 = { 242 /* Upper case */ 243 0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6, 244 0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0}; 245static const wvec32 c8_vec2 = { 246 /* Lower case */ 247 0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6, 248 0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0}; 249static const wvec32 *c8_diaeresis[] = { 250 NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL 251}; 252 253/* Ring Above AU (wy) */ 254static const wvec32 ca_vec1 = { 255 /* Upper case */ 256 0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 257 0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 258static const wvec32 ca_vec2 = { 259 /* Lower case */ 260 0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 261 0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0}; 262static const wvec32 *ca_ringabove[] = { 263 NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL 264}; 265 266/* Cedilla CGKLNRST (EDH) */ 267static const wvec32 cb_vec1 = { 268 /* Upper case */ 269 0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122, 270 0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0, 271 0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 272static const wvec32 cb_vec2 = { 273 /* Lower case */ 274 0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123, 275 0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0, 276 0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 277static const wvec32 *cb_cedilla[] = { 278 NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL 279}; 280 281/* Double Acute Accent OU */ 282static const wvec32 cd_vec1 = { 283 /* Upper case 
*/ 284 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150, 285 0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 286static const wvec32 cd_vec2 = { 287 /* Lower case */ 288 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151, 289 0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 290static const wvec32 *cd_doubleacute[] = { 291 NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL 292}; 293 294/* Ogonek AEIU (O) */ 295static const wvec32 ce_vec1 = { 296 /* Upper case */ 297 0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea, 298 0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 299static const wvec32 ce_vec2 = { 300 /* Lower case */ 301 0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb, 302 0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 303static const wvec32 *ce_ogonek[] = { 304 NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL 305}; 306 307/* Caron CDELNRSTZ (AIOUGKjH) */ 308static const wvec32 cf_vec1 = { 309 /* Upper case */ 310 0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6, 311 0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1, 312 0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0, 313 0, 0, 0x17d, 0, 0, 0, 0, 0}; 314static const wvec32 cf_vec2 = { 315 /* Lower case */ 316 0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7, 317 0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2, 318 0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0, 319 0, 0, 0x17e, 0, 0, 0, 0, 0}; 320static const wvec32 *cf_caron[] = { 321 NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL 322}; 323 324static const wvec32 **cx_tab[] = { 325 NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron, 326 c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove, 327 cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron }; 328 329int ldap_t61s_valid( struct berval *str ) 330{ 331 unsigned char *c = (unsigned char *)str->bv_val; 332 int i; 333 334 for (i=0; i < str->bv_len; c++,i++) 335 if (!t61_tab[*c]) 336 return 0; 337 return 1; 338} 339 340/* Transform a T.61 string to UTF-8. 
341 */ 342int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst ) 343{ 344 unsigned char *c; 345 char *d; 346 int i, wlen = 0; 347 348 /* Just count the length of the UTF-8 result first */ 349 for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) { 350 /* Invalid T.61 characters? */ 351 if (!t61_tab[*c]) 352 return LDAP_INVALID_SYNTAX; 353 if ((*c & 0xf0) == 0xc0) { 354 int j = *c & 0x0f; 355 /* If this is the end of the string, or if the base 356 * character is just a space, treat this as a regular 357 * spacing character. 358 */ 359 if ((!c[1] || c[1] == 0x20) && accents[j]) { 360 wlen += ldap_x_wc_to_utf8(NULL, accents[j], 0); 361 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] && 362 /* We have a composite mapping for this pair */ 363 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) { 364 wlen += ldap_x_wc_to_utf8( NULL, 365 (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 0); 366 } else { 367 /* No mapping, just swap it around so the base 368 * character comes first. 369 */ 370 wlen += ldap_x_wc_to_utf8(NULL, c[1], 0); 371 wlen += ldap_x_wc_to_utf8(NULL, 372 t61_tab[*c], 0); 373 } 374 c++; i++; 375 continue; 376 } else { 377 wlen += ldap_x_wc_to_utf8(NULL, t61_tab[*c], 0); 378 } 379 } 380 381 /* Now transform the string */ 382 dst->bv_len = wlen; 383 dst->bv_val = LDAP_MALLOC( wlen+1 ); 384 d = dst->bv_val; 385 if (!d) 386 return LDAP_NO_MEMORY; 387 388 for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) { 389 if ((*c & 0xf0) == 0xc0) { 390 int j = *c & 0x0f; 391 /* If this is the end of the string, or if the base 392 * character is just a space, treat this as a regular 393 * spacing character. 
394 */ 395 if ((!c[1] || c[1] == 0x20) && accents[j]) { 396 d += ldap_x_wc_to_utf8(d, accents[j], 6); 397 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] && 398 /* We have a composite mapping for this pair */ 399 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) { 400 d += ldap_x_wc_to_utf8(d, 401 (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 6); 402 } else { 403 /* No mapping, just swap it around so the base 404 * character comes first. 405 */ 406 d += ldap_x_wc_to_utf8(d, c[1], 6); 407 d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6); 408 } 409 c++; i++; 410 continue; 411 } else { 412 d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6); 413 } 414 } 415 *d = '\0'; 416 return LDAP_SUCCESS; 417} 418 419/* For the reverse mapping, we just pay attention to the Latin-oriented 420 * code blocks. These are 421 * 0000 - 007f Basic Latin 422 * 0080 - 00ff Latin-1 Supplement 423 * 0100 - 017f Latin Extended-A 424 * 0180 - 024f Latin Extended-B 425 * 1e00 - 1eff Latin Extended Additional 426 * 427 * We have a special case to map Ohm U2126 back to T.61 0xe0. All other 428 * unrecognized characters are replaced with '?' 0x3f. 429 */ 430 431static const wvec64 u000 = { 432 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 433 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 434 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 435 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 436 0x0020, 0x0021, 0x0022, 0x00a6, 0x00a4, 0x0025, 0x0026, 0x0027, 437 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 438 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 439 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f}; 440 441/* In this range, we've mapped caret to xc3/x20, backquote to xc1/x20, 442 * and tilde to xc4/x20. T.61 (stupidly!) doesn't define these characters 443 * on their own, even though it provides them as combiners for other 444 * letters. 
T.61 doesn't define these pairings either, so this may just 445 * have to be replaced with '?' 0x3f if other software can't cope with it. 446 */ 447static const wvec64 u001 = { 448 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 449 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 450 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 451 0x0058, 0x0059, 0x005a, 0x005b, 0x003f, 0x005d, 0xc320, 0x005f, 452 0xc120, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 453 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 454 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 455 0x0078, 0x0079, 0x007a, 0x003f, 0x007c, 0x003f, 0xc420, 0x007f}; 456 457static const wvec64 u002 = { 458 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 459 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 460 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 461 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, 462 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a8, 0x00a5, 0x003f, 0x00a7, 463 0xc820, 0x003f, 0x00e3, 0x00ab, 0x003f, 0x003f, 0x003f, 0xc520, 464 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0xc220, 0x00b5, 0x00b6, 0x00b7, 465 0xcb20, 0x003f, 0x00eb, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf}; 466 467static const wvec64 u003 = { 468 0xc141, 0xc241, 0xc341, 0xc441, 0xc841, 0xca41, 0x00e1, 0xcb43, 469 0xc145, 0xc245, 0xc345, 0xc845, 0xc149, 0xc249, 0xc349, 0xc849, 470 0x00e2, 0xc44e, 0xc14f, 0xc24f, 0xc34f, 0xc44f, 0xc84f, 0x00b4, 471 0x00e9, 0xc155, 0xc255, 0xc355, 0xc855, 0xc259, 0x00ec, 0x00fb, 472 0xc161, 0xc261, 0xc361, 0xc461, 0xc861, 0xca61, 0x00f1, 0xcb63, 473 0xc165, 0xc265, 0xc365, 0xc865, 0xc169, 0xc269, 0xc369, 0xc869, 474 0x00f3, 0xc46e, 0xc16f, 0xc26f, 0xc36f, 0xc46f, 0xc86f, 0x00b8, 475 0x00f9, 0xc175, 0xc275, 0xc375, 0xc875, 0xc279, 0x00fc, 0xc879}; 476 477/* These codes are used here but not defined by T.61: 478 * x114 = xc6/x45, x115 = xc6/x65, x12c = xc6/x49, 
x12d = xc6/x69 479 */ 480static const wvec64 u010 = { 481 0xc541, 0xc561, 0xc641, 0xc661, 0xce41, 0xce61, 0xc243, 0xc263, 482 0xc343, 0xc363, 0xc743, 0xc763, 0xcf43, 0xcf63, 0xcf44, 0xcf64, 483 0x003f, 0x00f2, 0xc545, 0xc565, 0xc645, 0xc665, 0xc745, 0xc765, 484 0xce45, 0xce65, 0xcf45, 0xcf65, 0xc347, 0xc367, 0xc647, 0xc667, 485 0xc747, 0xc767, 0xcb47, 0xcb67, 0xc348, 0xc368, 0x00e4, 0x00f4, 486 0xc449, 0xc469, 0xc549, 0xc569, 0xc649, 0xc669, 0xce49, 0xce69, 487 0xc749, 0x00f5, 0x00e6, 0x00f6, 0xc34a, 0xc36a, 0xcb4b, 0xcb6b, 488 0x00f0, 0xc24c, 0xc26c, 0xcb4c, 0xcb6c, 0xcf4c, 0xcf6c, 0x00e7}; 489 490/* These codes are used here but not defined by T.61: 491 * x14e = xc6/x4f, x14f = xc6/x6f 492 */ 493static const wvec64 u011 = { 494 0x00f7, 0x00e8, 0x00f8, 0xc24e, 0xc26e, 0xcb4e, 0xcb6e, 0xcf4e, 495 0xcf6e, 0x00ef, 0x00ee, 0x00fe, 0xc54f, 0xc56f, 0xc64f, 0xc66f, 496 0xcd4f, 0xcd6f, 0x00ea, 0x00fa, 0xc252, 0xc272, 0xcb52, 0xcb72, 497 0xcf52, 0xcf72, 0xc253, 0xc273, 0xc353, 0xc373, 0xcb53, 0xcb73, 498 0xcf53, 0xcf73, 0xcb54, 0xcb74, 0xcf54, 0xcf74, 0x00ed, 0x00fd, 499 0xc455, 0xc475, 0xc555, 0xc575, 0xc655, 0xc675, 0xca55, 0xca75, 500 0xcd55, 0xcd75, 0xce55, 0xce75, 0xc357, 0xc377, 0xc359, 0xc379, 501 0xc859, 0xc25a, 0xc27a, 0xc75a, 0xc77a, 0xcf5a, 0xcf7a, 0x003f}; 502 503/* All of the codes in this block are undefined in T.61. 504 */ 505static const wvec64 u013 = { 506 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 507 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf41, 0xcf61, 0xcf49, 508 0xcf69, 0xcf4f, 0xcf6f, 0xcf55, 0xcf75, 0x003f, 0x003f, 0x003f, 509 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 510 0x003f, 0x003f, 0xc5e1, 0xc5f1, 0x003f, 0x003f, 0xcf47, 0xcf67, 511 0xcf4b, 0xcf6b, 0xce4f, 0xce6f, 0x003f, 0x003f, 0x003f, 0x003f, 512 0xcf6a, 0x003f, 0x003f, 0x003f, 0xc247, 0xc267, 0x003f, 0x003f, 513 0xc14e, 0xc16e, 0x003f, 0x003f, 0xc2e1, 0xc2f1, 0x003f, 0x003f}; 514 515/* All of the codes in this block are undefined in T.61. 
516 */ 517static const wvec64 u020 = { 518 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 519 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 520 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 521 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf48, 0xcf68, 522 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc741, 0xc761, 523 0xcb45, 0xcb65, 0x003f, 0x003f, 0x003f, 0x003f, 0xc74f, 0xc76f, 524 0x003f, 0x003f, 0xc559, 0xc579, 0x003f, 0x003f, 0x003f, 0x003f, 525 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f}; 526 527static const wvec64 u023 = { 528 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf20, 529 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 530 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 531 0xc620, 0xc720, 0xca20, 0xce20, 0x003f, 0xcd20, 0x003f, 0x003f, 532 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 533 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 534 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 535 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f}; 536 537/* These are the non-spacing characters by themselves. They should 538 * never appear by themselves in actual text. 539 */ 540static const wvec64 u030 = { 541 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x003f, 0x00c6, 0x00c7, 542 0x00c8, 0x003f, 0x00ca, 0x00cd, 0x00cf, 0x003f, 0x003f, 0x003f, 543 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 544 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 545 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x00cb, 546 0x00ce, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 547 0x003f, 0x003f, 0x00cc, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 548 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f}; 549 550/* None of the following blocks are defined in T.61. 
551 */ 552static const wvec64 u1e0 = { 553 0x003f, 0x003f, 0xc742, 0xc762, 0x003f, 0x003f, 0x003f, 0x003f, 554 0x003f, 0x003f, 0xc744, 0xc764, 0x003f, 0x003f, 0x003f, 0x003f, 555 0xcb44, 0xcb64, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 556 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc746, 0xc766, 557 0xc547, 0xc567, 0xc748, 0xc768, 0x003f, 0x003f, 0xc848, 0xc868, 558 0xcb48, 0xcb68, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 559 0xc24b, 0xc26b, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 560 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc24d, 0xc26d, 561}; 562 563static const wvec64 u1e1 = { 564 0xc74d, 0xc76d, 0x003f, 0x003f, 0xc74e, 0xc76e, 0x003f, 0x003f, 565 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 566 0x003f, 0x003f, 0x003f, 0x003f, 0xc250, 0xc270, 0xc750, 0xc770, 567 0xc752, 0xc772, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 568 0xc753, 0xc773, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 569 0x003f, 0x003f, 0xc754, 0xc774, 0x003f, 0x003f, 0x003f, 0x003f, 570 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 571 0x003f, 0x003f, 0x003f, 0x003f, 0xc456, 0xc476, 0x003f, 0x003f, 572}; 573 574static const wvec64 u1e2 = { 575 0xc157, 0xc177, 0xc257, 0xc277, 0xc857, 0xc877, 0xc757, 0xc777, 576 0x003f, 0x003f, 0xc758, 0xc778, 0xc858, 0xc878, 0xc759, 0xc779, 577 0xc35a, 0xc37a, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc874, 578 0xca77, 0xca79, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 579 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 580 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 581 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 582 0x003f, 0x003f, 0x003f, 0x003f, 0xc445, 0xc465, 0x003f, 0x003f, 583}; 584 585static const wvec64 u1e3 = { 586 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 587 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 588 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 
0x003f, 589 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 590 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 591 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 592 0x003f, 0x003f, 0xc159, 0xc179, 0x003f, 0x003f, 0x003f, 0x003f, 593 0xc459, 0xc479, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 594}; 595 596static const wvec64 *wc00[] = { 597 &u000, &u001, &u002, &u003, 598 &u010, &u011, NULL, &u013, 599 &u020, NULL, NULL, &u023, 600 &u030, NULL, NULL, NULL}; 601 602static const wvec64 *wc1e[] = { 603 &u1e0, &u1e1, &u1e2, &u1e3}; 604 605 606int ldap_utf8s_to_t61s( struct berval *src, struct berval *dst ) 607{ 608 char *c, *d; 609 wchar_t tmp; 610 int i, j, tlen = 0; 611 612 /* Just count the length of the T.61 result first */ 613 for (i=0,c=src->bv_val; i < src->bv_len;) { 614 j = ldap_x_utf8_to_wc( &tmp, c ); 615 if (j == -1) 616 return LDAP_INVALID_SYNTAX; 617 switch (tmp >> 8) { 618 case 0x00: 619 case 0x01: 620 case 0x02: 621 case 0x03: 622 if (wc00[tmp >> 6] && 623 ((*wc00[tmp >> 6])[tmp & 0x3f] & 0xff00)) { 624 tlen++; 625 } 626 tlen++; 627 break; 628 case 0x1e: 629 if ((*wc1e[(tmp >> 6) & 3])[tmp & 0x3f] & 0xff00) { 630 tlen++; 631 } 632 case 0x21: 633 default: 634 tlen ++; 635 break; 636 } 637 i += j; 638 c += j; 639 } 640 dst->bv_len = tlen; 641 dst->bv_val = LDAP_MALLOC( tlen+1 ); 642 if (!dst->bv_val) 643 return LDAP_NO_MEMORY; 644 645 d = dst->bv_val; 646 for (i=0,c=src->bv_val; i < src->bv_len;) { 647 j = ldap_x_utf8_to_wc( &tmp, c ); 648 switch (tmp >> 8) { 649 case 0x00: 650 case 0x01: 651 case 0x02: 652 if (wc00[tmp >> 6]) { 653 tmp = (*wc00[tmp >> 6])[tmp & 0x3f]; 654 if (tmp & 0xff00) 655 *d++ = (tmp >> 8); 656 *d++ = tmp & 0xff; 657 } else { 658 *d++ = 0x3f; 659 } 660 break; 661 case 0x03: 662 /* swap order of non-spacing characters */ 663 if (wc00[tmp >> 6]) { 664 wchar_t t2 = (*wc00[tmp >> 6])[tmp & 0x3f]; 665 if (t2 != 0x3f) { 666 d[0] = d[-1]; 667 d[-1] = t2; 668 d++; 669 } else { 670 *d++ = 
0x3f; 671 } 672 } else { 673 *d++ = 0x3f; 674 } 675 break; 676 case 0x1e: 677 tmp = (*wc1e[(tmp >> 6) & 3])[tmp & 0x3f]; 678 if (tmp & 0xff00) 679 *d++ = (tmp >> 8); 680 *d++ = tmp & 0xff; 681 break; 682 case 0x21: 683 if (tmp == 0x2126) { 684 *d++ = 0xe0; 685 break; 686 } 687 /* FALLTHRU */ 688 default: 689 *d++ = 0x3f; 690 break; 691 } 692 i += j; 693 c += j; 694 } 695 *d = '\0'; 696 return LDAP_SUCCESS; 697} 698