1#ifndef lint 2static char *rcsid = "$Id: punycode.c,v 1.1 2003/06/04 00:26:06 marka Exp $"; 3#endif 4 5/* 6 * Copyright (c) 2001,2002 Japan Network Information Center. 7 * All rights reserved. 8 * 9 * By using this file, you agree to the terms and conditions set forth bellow. 10 * 11 * LICENSE TERMS AND CONDITIONS 12 * 13 * The following License Terms and Conditions apply, unless a different 14 * license is obtained from Japan Network Information Center ("JPNIC"), 15 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda, 16 * Chiyoda-ku, Tokyo 101-0047, Japan. 17 * 18 * 1. Use, Modification and Redistribution (including distribution of any 19 * modified or derived work) in source and/or binary forms is permitted 20 * under this License Terms and Conditions. 21 * 22 * 2. Redistribution of source code must retain the copyright notices as they 23 * appear in each source code file, this License Terms and Conditions. 24 * 25 * 3. Redistribution in binary form must reproduce the Copyright Notice, 26 * this License Terms and Conditions, in the documentation and/or other 27 * materials provided with the distribution. For the purposes of binary 28 * distribution the "Copyright Notice" refers to the following language: 29 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved." 30 * 31 * 4. The name of JPNIC may not be used to endorse or promote products 32 * derived from this Software without specific prior written approval of 33 * JPNIC. 34 * 35 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC 36 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 37 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 38 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE 39 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 40 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 41 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 42 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 43 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 44 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 45 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 46 */ 47 48#include <config.h> 49 50#include <stddef.h> 51#include <stdlib.h> 52#include <string.h> 53 54#include <idn/result.h> 55#include <idn/assert.h> 56#include <idn/logmacro.h> 57#include <idn/converter.h> 58#include <idn/ucs4.h> 59#include <idn/debug.h> 60#include <idn/punycode.h> 61#include <idn/util.h> 62 63/* 64 * Although draft-ietf-idn-punycode-00.txt doesn't specify the ACE 65 * signature, we have to choose one. In order to prevent the converted 66 * name from beginning with a hyphen, we should choose a prefix rather 67 * than a suffix. 68 */ 69#ifndef IDN_PUNYCODE_PREFIX 70#define IDN_PUNYCODE_PREFIX "xn--" 71#endif 72 73#define INVALID_UCS 0x80000000 74#define MAX_UCS 0x10FFFF 75 76/* 77 * As the draft states, it is possible that `delta' may overflow during 78 * the encoding. The upper bound of 'delta' is: 79 * <# of chars. of input string> + <max. difference in code point> * 80 * <# of chars. of input string + 1> 81 * For this value not to be greater than 0xffffffff (since the calculation 82 * is done using unsigned long, which is at least 32bit long), the maxmum 83 * input string size is about 3850 characters, which is long enough for 84 * a domain label... 85 */ 86#define PUNYCODE_MAXINPUT 3800 87 88/* 89 * Parameters. 90 */ 91#define PUNYCODE_BASE 36 92#define PUNYCODE_TMIN 1 93#define PUNYCODE_TMAX 26 94#define PUNYCODE_SKEW 38 95#define PUNYCODE_DAMP 700 96#define PUNYCODE_INITIAL_BIAS 72 97#define PUNYCODE_INITIAL_N 0x80 98 99static int punycode_getwc(const char *s, size_t len, 100 int bias, unsigned long *vp); 101static int punycode_putwc(char *s, size_t len, 102 unsigned long delta, int bias); 103static int punycode_update_bias(unsigned long delta, 104 size_t npoints, int first); 105 106idn_result_t 107idn__punycode_decode(idn_converter_t ctx, void *privdata, 108 const char *from, unsigned long *to, size_t tolen) { 109 unsigned long *to_org = to; 110 unsigned long c, idx; 111 size_t prefixlen = strlen(IDN_PUNYCODE_PREFIX); 112 size_t fromlen; 113 size_t uidx, fidx, ucslen; 114 int first, bias; 115 idn_result_t r; 116 117 assert(ctx != NULL); 118 119 TRACE(("idn__punycode_decode(from=\"%s\", tolen=%d)\n", 120 idn__debug_xstring(from, 50), (int)tolen)); 121 122 if (!idn__util_asciihaveaceprefix(from, IDN_PUNYCODE_PREFIX)) { 123 if (*from == '\0') { 124 r = idn_ucs4_utf8toucs4(from, to, tolen); 125 goto ret; 126 } 127 r = idn_invalid_encoding; 128 goto ret; 129 } 130 from += prefixlen; 131 fromlen = strlen(from); 132 133 /* 134 * Find the last delimiter, and copy the characters 135 * before it verbatim. 136 */ 137 ucslen = 0; 138 for (fidx = fromlen; fidx > 0; fidx--) { 139 if (from[fidx - 1] == '-') { 140 if (tolen < fidx) { 141 r = idn_buffer_overflow; 142 goto ret; 143 } 144 for (uidx = 0; uidx < fidx - 1; uidx++) { 145 to[uidx] = from[uidx]; 146 } 147 ucslen = uidx; 148 break; 149 } 150 } 151 152 first = 1; 153 bias = PUNYCODE_INITIAL_BIAS; 154 c = PUNYCODE_INITIAL_N; 155 idx = 0; 156 while (fidx < fromlen) { 157 int len; 158 unsigned long delta; 159 int i; 160 161 len = punycode_getwc(from + fidx, fromlen - fidx, bias, &delta); 162 if (len == 0) { 163 r = idn_invalid_encoding; 164 goto ret; 165 } 166 fidx += len; 167 168 bias = punycode_update_bias(delta, ucslen + 1, first); 169 first = 0; 170 idx += delta; 171 c += idx / (ucslen + 1); 172 uidx = idx % (ucslen + 1); 173 174 /* Insert 'c' at uidx. */ 175 if (tolen-- <= 0) { 176 r = idn_buffer_overflow; 177 goto ret; 178 } 179 for (i = ucslen; i > uidx; i--) 180 to[i] = to[i - 1]; 181 to[uidx] = c; 182 183 ucslen++; 184 idx = uidx + 1; 185 } 186 187 /* Terminate with NUL. */ 188 if (tolen <= 0) { 189 r = idn_buffer_overflow; 190 goto ret; 191 } 192 to[ucslen] = '\0'; 193 r = idn_success; 194 195ret: 196 if (r == idn_success) { 197 TRACE(("idn__punycode_decode(): succcess (to=\"%s\")\n", 198 idn__debug_ucs4xstring(to_org, 50))); 199 } else { 200 TRACE(("idn__punycode_decode(): %s\n", idn_result_tostring(r))); 201 } 202 return (r); 203} 204 205idn_result_t 206idn__punycode_encode(idn_converter_t ctx, void *privdata, 207 const unsigned long *from, char *to, size_t tolen) { 208 char *to_org = to; 209 unsigned long cur_code, next_code, delta; 210 size_t prefixlen = strlen(IDN_PUNYCODE_PREFIX); 211 size_t fromlen; 212 size_t ucsdone; 213 size_t toidx; 214 int uidx, bias, first; 215 idn_result_t r; 216 217 assert(ctx != NULL); 218 219 TRACE(("idn__punycode_encode(from=\"%s\", tolen=%d)\n", 220 idn__debug_ucs4xstring(from, 50), (int)tolen)); 221 222 if (*from == '\0') { 223 r = idn_ucs4_ucs4toutf8(from, to, tolen); 224 goto ret; 225 } else if (idn__util_ucs4haveaceprefix(from, IDN_PUNYCODE_PREFIX)) { 226 r = idn_prohibited; 227 goto ret; 228 } 229 230 if (tolen < prefixlen) { 231 r = idn_buffer_overflow; 232 goto ret; 233 } 234 memcpy(to, IDN_PUNYCODE_PREFIX, prefixlen); 235 to += prefixlen; 236 tolen -= prefixlen; 237 238 fromlen = idn_ucs4_strlen(from); 239 240 /* 241 * If the input string is too long (actually too long to be sane), 242 * return failure in order to prevent possible overflow. 243 */ 244 if (fromlen > PUNYCODE_MAXINPUT) { 245 ERROR(("idn__punycode_encode(): " 246 "the input string is too long to convert Punycode\n", 247 idn__debug_ucs4xstring(from, 50))); 248 r = idn_failure; 249 goto ret; 250 } 251 252 ucsdone = 0; /* number of characters processed */ 253 toidx = 0; 254 255 /* 256 * First, pick up basic code points and copy them to 'to'. 257 */ 258 for (uidx = 0; uidx < fromlen; uidx++) { 259 if (from[uidx] < 0x80) { 260 if (toidx >= tolen) { 261 r = idn_buffer_overflow; 262 goto ret; 263 } 264 to[toidx++] = from[uidx]; 265 ucsdone++; 266 } 267 } 268 269 /* 270 * If there are any basic code points, output a delimiter 271 * (hyphen-minus). 272 */ 273 if (toidx > 0) { 274 if (toidx >= tolen) { 275 r = idn_buffer_overflow; 276 goto ret; 277 } 278 to[toidx++] = '-'; 279 to += toidx; 280 tolen -= toidx; 281 } 282 283 /* 284 * Then encode non-basic characters. 285 */ 286 first = 1; 287 cur_code = PUNYCODE_INITIAL_N; 288 bias = PUNYCODE_INITIAL_BIAS; 289 delta = 0; 290 while (ucsdone < fromlen) { 291 int limit = -1, rest; 292 293 /* 294 * Find the smallest code point equal to or greater 295 * than 'cur_code'. Also remember the index of the 296 * last occurence of the code point. 297 */ 298 for (next_code = MAX_UCS, uidx = fromlen - 1; 299 uidx >= 0; uidx--) { 300 if (from[uidx] >= cur_code && from[uidx] < next_code) { 301 next_code = from[uidx]; 302 limit = uidx; 303 } 304 } 305 /* There must be such code point. */ 306 assert(limit >= 0); 307 308 delta += (next_code - cur_code) * (ucsdone + 1); 309 cur_code = next_code; 310 311 /* 312 * Scan the input string again, and encode characters 313 * whose code point is 'cur_code'. Use 'limit' to avoid 314 * unnecessary scan. 315 */ 316 for (uidx = 0, rest = ucsdone; uidx <= limit; uidx++) { 317 if (from[uidx] < cur_code) { 318 delta++; 319 rest--; 320 } else if (from[uidx] == cur_code) { 321 int sz = punycode_putwc(to, tolen, delta, bias); 322 if (sz == 0) { 323 r = idn_buffer_overflow; 324 goto ret; 325 } 326 to += sz; 327 tolen -= sz; 328 ucsdone++; 329 bias = punycode_update_bias(delta, ucsdone, 330 first); 331 delta = 0; 332 first = 0; 333 } 334 } 335 delta += rest + 1; 336 cur_code++; 337 } 338 339 /* 340 * Terminate with NUL. 341 */ 342 if (tolen <= 0) { 343 r = idn_buffer_overflow; 344 goto ret; 345 } 346 *to = '\0'; 347 r = idn_success; 348 349ret: 350 if (r == idn_success) { 351 TRACE(("idn__punycode_encode(): succcess (to=\"%s\")\n", 352 idn__debug_xstring(to_org, 50))); 353 } else { 354 TRACE(("idn__punycode_encode(): %s\n", idn_result_tostring(r))); 355 } 356 return (r); 357} 358 359static int 360punycode_getwc(const char *s, size_t len, int bias, unsigned long *vp) { 361 size_t orglen = len; 362 unsigned long v = 0, w = 1; 363 int k; 364 365 for (k = PUNYCODE_BASE - bias; len > 0; k += PUNYCODE_BASE) { 366 int c = *s++; 367 int t = (k < PUNYCODE_TMIN) ? PUNYCODE_TMIN : 368 (k > PUNYCODE_TMAX) ? PUNYCODE_TMAX : k; 369 370 len--; 371 if ('a' <= c && c <= 'z') 372 c = c - 'a'; 373 else if ('A' <= c && c <= 'Z') 374 c = c - 'A'; 375 else if ('0' <= c && c <= '9') 376 c = c - '0' + 26; 377 else 378 c = -1; 379 380 if (c < 0) 381 return (0); /* invalid character */ 382 383 v += c * w; 384 385 if (c < t) { 386 *vp = v; 387 return (orglen - len); 388 } 389 390 w *= (PUNYCODE_BASE - t); 391 } 392 393 return (0); /* final character missing */ 394} 395 396static int 397punycode_putwc(char *s, size_t len, unsigned long delta, int bias) { 398 const char *punycode_base36 = "abcdefghijklmnopqrstuvwxyz0123456789"; 399 int k; 400 char *sorg = s; 401 402 for (k = PUNYCODE_BASE - bias; 1; k += PUNYCODE_BASE) { 403 int t = (k < PUNYCODE_TMIN) ? PUNYCODE_TMIN : 404 (k > PUNYCODE_TMAX) ? PUNYCODE_TMAX : k; 405 406 if (delta < t) 407 break; 408 if (len < 1) 409 return (0); 410 *s++ = punycode_base36[t + ((delta - t) % (PUNYCODE_BASE - t))]; 411 len--; 412 delta = (delta - t) / (PUNYCODE_BASE - t); 413 } 414 if (len < 1) 415 return (0); 416 *s++ = punycode_base36[delta]; 417 return (s - sorg); 418} 419 420static int 421punycode_update_bias(unsigned long delta, size_t npoints, int first) { 422 int k = 0; 423 424 delta /= first ? PUNYCODE_DAMP : 2; 425 delta += delta / npoints; 426 427 while (delta > ((PUNYCODE_BASE - PUNYCODE_TMIN) * PUNYCODE_TMAX) / 2) { 428 delta /= PUNYCODE_BASE - PUNYCODE_TMIN; 429 k++; 430 } 431 return (PUNYCODE_BASE * k + 432 (((PUNYCODE_BASE - PUNYCODE_TMIN + 1) * delta) / 433 (delta + PUNYCODE_SKEW))); 434} 435