1/* $NetBSD: punycode.c,v 1.2.6.1 2012/06/06 18:18:07 bouyer Exp $ */ 2 3#ifndef lint 4static char *rcsid = "Id: punycode.c,v 1.1 2003/06/04 00:26:06 marka Exp "; 5#endif 6 7/* 8 * Copyright (c) 2001,2002 Japan Network Information Center. 9 * All rights reserved. 10 * 11 * By using this file, you agree to the terms and conditions set forth bellow. 12 * 13 * LICENSE TERMS AND CONDITIONS 14 * 15 * The following License Terms and Conditions apply, unless a different 16 * license is obtained from Japan Network Information Center ("JPNIC"), 17 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda, 18 * Chiyoda-ku, Tokyo 101-0047, Japan. 19 * 20 * 1. Use, Modification and Redistribution (including distribution of any 21 * modified or derived work) in source and/or binary forms is permitted 22 * under this License Terms and Conditions. 23 * 24 * 2. Redistribution of source code must retain the copyright notices as they 25 * appear in each source code file, this License Terms and Conditions. 26 * 27 * 3. Redistribution in binary form must reproduce the Copyright Notice, 28 * this License Terms and Conditions, in the documentation and/or other 29 * materials provided with the distribution. For the purposes of binary 30 * distribution the "Copyright Notice" refers to the following language: 31 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved." 32 * 33 * 4. The name of JPNIC may not be used to endorse or promote products 34 * derived from this Software without specific prior written approval of 35 * JPNIC. 36 * 37 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC 38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 40 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE 41 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 44 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 45 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 46 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 47 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 48 */ 49 50#include <config.h> 51 52#include <stddef.h> 53#include <stdlib.h> 54#include <string.h> 55 56#include <idn/result.h> 57#include <idn/assert.h> 58#include <idn/logmacro.h> 59#include <idn/converter.h> 60#include <idn/ucs4.h> 61#include <idn/debug.h> 62#include <idn/punycode.h> 63#include <idn/util.h> 64 65/* 66 * Although draft-ietf-idn-punycode-00.txt doesn't specify the ACE 67 * signature, we have to choose one. In order to prevent the converted 68 * name from beginning with a hyphen, we should choose a prefix rather 69 * than a suffix. 70 */ 71#ifndef IDN_PUNYCODE_PREFIX 72#define IDN_PUNYCODE_PREFIX "xn--" 73#endif 74 75#define INVALID_UCS 0x80000000 76#define MAX_UCS 0x10FFFF 77 78/* 79 * As the draft states, it is possible that `delta' may overflow during 80 * the encoding. The upper bound of 'delta' is: 81 * <# of chars. of input string> + <max. difference in code point> * 82 * <# of chars. of input string + 1> 83 * For this value not to be greater than 0xffffffff (since the calculation 84 * is done using unsigned long, which is at least 32bit long), the maxmum 85 * input string size is about 3850 characters, which is long enough for 86 * a domain label... 87 */ 88#define PUNYCODE_MAXINPUT 3800 89 90/* 91 * Parameters. 92 */ 93#define PUNYCODE_BASE 36 94#define PUNYCODE_TMIN 1 95#define PUNYCODE_TMAX 26 96#define PUNYCODE_SKEW 38 97#define PUNYCODE_DAMP 700 98#define PUNYCODE_INITIAL_BIAS 72 99#define PUNYCODE_INITIAL_N 0x80 100 101static int punycode_getwc(const char *s, size_t len, 102 int bias, unsigned long *vp); 103static int punycode_putwc(char *s, size_t len, 104 unsigned long delta, int bias); 105static int punycode_update_bias(unsigned long delta, 106 size_t npoints, int first); 107 108idn_result_t 109idn__punycode_decode(idn_converter_t ctx, void *privdata, 110 const char *from, unsigned long *to, size_t tolen) { 111 unsigned long *to_org = to; 112 unsigned long c, idx; 113 size_t prefixlen = strlen(IDN_PUNYCODE_PREFIX); 114 size_t fromlen; 115 size_t uidx, fidx, ucslen; 116 int first, bias; 117 idn_result_t r; 118 119 assert(ctx != NULL); 120 121 TRACE(("idn__punycode_decode(from=\"%s\", tolen=%d)\n", 122 idn__debug_xstring(from, 50), (int)tolen)); 123 124 if (!idn__util_asciihaveaceprefix(from, IDN_PUNYCODE_PREFIX)) { 125 if (*from == '\0') { 126 r = idn_ucs4_utf8toucs4(from, to, tolen); 127 goto ret; 128 } 129 r = idn_invalid_encoding; 130 goto ret; 131 } 132 from += prefixlen; 133 fromlen = strlen(from); 134 135 /* 136 * Find the last delimiter, and copy the characters 137 * before it verbatim. 138 */ 139 ucslen = 0; 140 for (fidx = fromlen; fidx > 0; fidx--) { 141 if (from[fidx - 1] == '-') { 142 if (tolen < fidx) { 143 r = idn_buffer_overflow; 144 goto ret; 145 } 146 for (uidx = 0; uidx < fidx - 1; uidx++) { 147 to[uidx] = from[uidx]; 148 } 149 ucslen = uidx; 150 break; 151 } 152 } 153 154 first = 1; 155 bias = PUNYCODE_INITIAL_BIAS; 156 c = PUNYCODE_INITIAL_N; 157 idx = 0; 158 while (fidx < fromlen) { 159 int len; 160 unsigned long delta; 161 int i; 162 163 len = punycode_getwc(from + fidx, fromlen - fidx, bias, &delta); 164 if (len == 0) { 165 r = idn_invalid_encoding; 166 goto ret; 167 } 168 fidx += len; 169 170 bias = punycode_update_bias(delta, ucslen + 1, first); 171 first = 0; 172 idx += delta; 173 c += idx / (ucslen + 1); 174 uidx = idx % (ucslen + 1); 175 176 /* Insert 'c' at uidx. */ 177 if (tolen-- <= 0) { 178 r = idn_buffer_overflow; 179 goto ret; 180 } 181 for (i = ucslen; i > uidx; i--) 182 to[i] = to[i - 1]; 183 to[uidx] = c; 184 185 ucslen++; 186 idx = uidx + 1; 187 } 188 189 /* Terminate with NUL. */ 190 if (tolen <= 0) { 191 r = idn_buffer_overflow; 192 goto ret; 193 } 194 to[ucslen] = '\0'; 195 r = idn_success; 196 197ret: 198 if (r == idn_success) { 199 TRACE(("idn__punycode_decode(): succcess (to=\"%s\")\n", 200 idn__debug_ucs4xstring(to_org, 50))); 201 } else { 202 TRACE(("idn__punycode_decode(): %s\n", idn_result_tostring(r))); 203 } 204 return (r); 205} 206 207idn_result_t 208idn__punycode_encode(idn_converter_t ctx, void *privdata, 209 const unsigned long *from, char *to, size_t tolen) { 210 char *to_org = to; 211 unsigned long cur_code, next_code, delta; 212 size_t prefixlen = strlen(IDN_PUNYCODE_PREFIX); 213 size_t fromlen; 214 size_t ucsdone; 215 size_t toidx; 216 int uidx, bias, first; 217 idn_result_t r; 218 219 assert(ctx != NULL); 220 221 TRACE(("idn__punycode_encode(from=\"%s\", tolen=%d)\n", 222 idn__debug_ucs4xstring(from, 50), (int)tolen)); 223 224 if (*from == '\0') { 225 r = idn_ucs4_ucs4toutf8(from, to, tolen); 226 goto ret; 227 } else if (idn__util_ucs4haveaceprefix(from, IDN_PUNYCODE_PREFIX)) { 228 r = idn_prohibited; 229 goto ret; 230 } 231 232 if (tolen < prefixlen) { 233 r = idn_buffer_overflow; 234 goto ret; 235 } 236 memcpy(to, IDN_PUNYCODE_PREFIX, prefixlen); 237 to += prefixlen; 238 tolen -= prefixlen; 239 240 fromlen = idn_ucs4_strlen(from); 241 242 /* 243 * If the input string is too long (actually too long to be sane), 244 * return failure in order to prevent possible overflow. 245 */ 246 if (fromlen > PUNYCODE_MAXINPUT) { 247 ERROR(("idn__punycode_encode(): " 248 "the input string is too long to convert Punycode\n", 249 idn__debug_ucs4xstring(from, 50))); 250 r = idn_failure; 251 goto ret; 252 } 253 254 ucsdone = 0; /* number of characters processed */ 255 toidx = 0; 256 257 /* 258 * First, pick up basic code points and copy them to 'to'. 259 */ 260 for (uidx = 0; uidx < fromlen; uidx++) { 261 if (from[uidx] < 0x80) { 262 if (toidx >= tolen) { 263 r = idn_buffer_overflow; 264 goto ret; 265 } 266 to[toidx++] = from[uidx]; 267 ucsdone++; 268 } 269 } 270 271 /* 272 * If there are any basic code points, output a delimiter 273 * (hyphen-minus). 274 */ 275 if (toidx > 0) { 276 if (toidx >= tolen) { 277 r = idn_buffer_overflow; 278 goto ret; 279 } 280 to[toidx++] = '-'; 281 to += toidx; 282 tolen -= toidx; 283 } 284 285 /* 286 * Then encode non-basic characters. 287 */ 288 first = 1; 289 cur_code = PUNYCODE_INITIAL_N; 290 bias = PUNYCODE_INITIAL_BIAS; 291 delta = 0; 292 while (ucsdone < fromlen) { 293 int limit = -1, rest; 294 295 /* 296 * Find the smallest code point equal to or greater 297 * than 'cur_code'. Also remember the index of the 298 * last occurence of the code point. 299 */ 300 for (next_code = MAX_UCS, uidx = fromlen - 1; 301 uidx >= 0; uidx--) { 302 if (from[uidx] >= cur_code && from[uidx] < next_code) { 303 next_code = from[uidx]; 304 limit = uidx; 305 } 306 } 307 /* There must be such code point. */ 308 assert(limit >= 0); 309 310 delta += (next_code - cur_code) * (ucsdone + 1); 311 cur_code = next_code; 312 313 /* 314 * Scan the input string again, and encode characters 315 * whose code point is 'cur_code'. Use 'limit' to avoid 316 * unnecessary scan. 317 */ 318 for (uidx = 0, rest = ucsdone; uidx <= limit; uidx++) { 319 if (from[uidx] < cur_code) { 320 delta++; 321 rest--; 322 } else if (from[uidx] == cur_code) { 323 int sz = punycode_putwc(to, tolen, delta, bias); 324 if (sz == 0) { 325 r = idn_buffer_overflow; 326 goto ret; 327 } 328 to += sz; 329 tolen -= sz; 330 ucsdone++; 331 bias = punycode_update_bias(delta, ucsdone, 332 first); 333 delta = 0; 334 first = 0; 335 } 336 } 337 delta += rest + 1; 338 cur_code++; 339 } 340 341 /* 342 * Terminate with NUL. 343 */ 344 if (tolen <= 0) { 345 r = idn_buffer_overflow; 346 goto ret; 347 } 348 *to = '\0'; 349 r = idn_success; 350 351ret: 352 if (r == idn_success) { 353 TRACE(("idn__punycode_encode(): succcess (to=\"%s\")\n", 354 idn__debug_xstring(to_org, 50))); 355 } else { 356 TRACE(("idn__punycode_encode(): %s\n", idn_result_tostring(r))); 357 } 358 return (r); 359} 360 361static int 362punycode_getwc(const char *s, size_t len, int bias, unsigned long *vp) { 363 size_t orglen = len; 364 unsigned long v = 0, w = 1; 365 int k; 366 367 for (k = PUNYCODE_BASE - bias; len > 0; k += PUNYCODE_BASE) { 368 int c = *s++; 369 int t = (k < PUNYCODE_TMIN) ? PUNYCODE_TMIN : 370 (k > PUNYCODE_TMAX) ? PUNYCODE_TMAX : k; 371 372 len--; 373 if ('a' <= c && c <= 'z') 374 c = c - 'a'; 375 else if ('A' <= c && c <= 'Z') 376 c = c - 'A'; 377 else if ('0' <= c && c <= '9') 378 c = c - '0' + 26; 379 else 380 c = -1; 381 382 if (c < 0) 383 return (0); /* invalid character */ 384 385 v += c * w; 386 387 if (c < t) { 388 *vp = v; 389 return (orglen - len); 390 } 391 392 w *= (PUNYCODE_BASE - t); 393 } 394 395 return (0); /* final character missing */ 396} 397 398static int 399punycode_putwc(char *s, size_t len, unsigned long delta, int bias) { 400 const char *punycode_base36 = "abcdefghijklmnopqrstuvwxyz0123456789"; 401 int k; 402 char *sorg = s; 403 404 for (k = PUNYCODE_BASE - bias; 1; k += PUNYCODE_BASE) { 405 int t = (k < PUNYCODE_TMIN) ? PUNYCODE_TMIN : 406 (k > PUNYCODE_TMAX) ? PUNYCODE_TMAX : k; 407 408 if (delta < t) 409 break; 410 if (len < 1) 411 return (0); 412 *s++ = punycode_base36[t + ((delta - t) % (PUNYCODE_BASE - t))]; 413 len--; 414 delta = (delta - t) / (PUNYCODE_BASE - t); 415 } 416 if (len < 1) 417 return (0); 418 *s++ = punycode_base36[delta]; 419 return (s - sorg); 420} 421 422static int 423punycode_update_bias(unsigned long delta, size_t npoints, int first) { 424 int k = 0; 425 426 delta /= first ? PUNYCODE_DAMP : 2; 427 delta += delta / npoints; 428 429 while (delta > ((PUNYCODE_BASE - PUNYCODE_TMIN) * PUNYCODE_TMAX) / 2) { 430 delta /= PUNYCODE_BASE - PUNYCODE_TMIN; 431 k++; 432 } 433 return (PUNYCODE_BASE * k + 434 (((PUNYCODE_BASE - PUNYCODE_TMIN + 1) * delta) / 435 (delta + PUNYCODE_SKEW))); 436} 437