1/* IRI related functions. 2 Copyright (C) 2008, 2009 Free Software Foundation, Inc. 3 4This file is part of GNU Wget. 5 6GNU Wget is free software; you can redistribute it and/or modify 7it under the terms of the GNU General Public License as published by 8the Free Software Foundation; either version 3 of the License, or (at 9your option) any later version. 10 11GNU Wget is distributed in the hope that it will be useful, 12but WITHOUT ANY WARRANTY; without even the implied warranty of 13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14GNU General Public License for more details. 15 16You should have received a copy of the GNU General Public License 17along with Wget. If not, see <http://www.gnu.org/licenses/>. 18 19Additional permission under GNU GPL version 3 section 7 20 21If you modify this program, or any covered work, by linking or 22combining it with the OpenSSL project's OpenSSL library (or a 23modified version of that library), containing parts covered by the 24terms of the OpenSSL or SSLeay licenses, the Free Software Foundation 25grants you additional permission to convey the resulting work. 26Corresponding Source for a non-source form of such a combination 27shall include the source code for the parts of OpenSSL used as well 28as that of the covered work. */ 29 30#include "wget.h" 31 32#include <stdio.h> 33#include <stdlib.h> 34#include <string.h> 35#include <iconv.h> 36#include <stringprep.h> 37#include <idna.h> 38#include <errno.h> 39 40#include "utils.h" 41 42/* RFC3987 section 3.1 mandates STD3 ASCII RULES */ 43#define IDNA_FLAGS IDNA_USE_STD3_ASCII_RULES 44 45/* Note: locale encoding is kept in options struct (opt.locale) */ 46 47static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out); 48 49 50/* Given a string containing "charset=XXX", return the encoding if found, 51 or NULL otherwise */ 52char * 53parse_charset (char *str) 54{ 55 char *charset; 56 57 if (!str || !*str) 58 return NULL; 59 60 str = strcasestr (str, "charset="); 61 if (!str) 62 return NULL; 63 64 str += 8; 65 charset = str; 66 67 /* sXXXav: which chars should be banned ??? */ 68 while (*charset && !c_isspace (*charset)) 69 charset++; 70 71 /* sXXXav: could strdupdelim return NULL ? */ 72 charset = strdupdelim (str, charset); 73 74 /* Do a minimum check on the charset value */ 75 if (!check_encoding_name (charset)) 76 { 77 xfree (charset); 78 return NULL; 79 } 80 81 /*logprintf (LOG_VERBOSE, "parse_charset: %s\n", quote (charset));*/ 82 83 return charset; 84} 85 86/* Find the locale used, or fall back on a default value */ 87char * 88find_locale (void) 89{ 90 return (char *) stringprep_locale_charset (); 91} 92 93/* Basic check of an encoding name. */ 94bool 95check_encoding_name (char *encoding) 96{ 97 char *s = encoding; 98 99 while (*s) 100 { 101 if (!c_isascii (*s) || c_isspace (*s)) 102 { 103 logprintf (LOG_VERBOSE, _("Encoding %s isn't valid\n"), quote (encoding)); 104 return false; 105 } 106 107 s++; 108 } 109 110 return true; 111} 112 113/* Try opening an iconv_t descriptor for conversion from locale to UTF-8 */ 114static bool 115open_locale_to_utf8 (void) 116{ 117 118} 119 120/* Try converting string str from locale to UTF-8. Return a new string 121 on success, or str on error or if conversion isn't needed. */ 122const char * 123locale_to_utf8 (const char *str) 124{ 125 iconv_t l2u; 126 char *new; 127 128 /* That shouldn't happen, just in case */ 129 if (!opt.locale) 130 { 131 logprintf (LOG_VERBOSE, _("locale_to_utf8: locale is unset\n")); 132 opt.locale = find_locale (); 133 } 134 135 if (!opt.locale || !strcasecmp (opt.locale, "utf-8")) 136 return str; 137 138 l2u = iconv_open ("UTF-8", opt.locale); 139 if (l2u != (iconv_t)(-1)) 140 { 141 logprintf (LOG_VERBOSE, _("Conversion from %s to %s isn't supported\n"), 142 quote (opt.locale), quote ("UTF-8")); 143 return str; 144 } 145 146 if (do_conversion (l2u, (char *) str, strlen ((char *) str), &new)) 147 return (const char *) new; 148 149 return str; 150} 151 152/* Do the conversion according to the passed conversion descriptor cd. *out 153 will contain the transcoded string on success. *out content is 154 unspecified otherwise. */ 155static bool 156do_conversion (iconv_t cd, char *in, size_t inlen, char **out) 157{ 158 /* sXXXav : hummm hard to guess... */ 159 size_t len, done, outlen = inlen * 2; 160 int invalid = 0, tooshort = 0; 161 char *s; 162 163 s = xmalloc (outlen + 1); 164 *out = s; 165 len = outlen; 166 done = 0; 167 168 for (;;) 169 { 170 if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1)) 171 { 172 *out = s; 173 *(s + len - outlen - done) = '\0'; 174 return true; 175 } 176 177 /* Incomplete or invalid multibyte sequence */ 178 if (errno == EINVAL || errno == EILSEQ) 179 { 180 if (!invalid) 181 logprintf (LOG_VERBOSE, 182 _("Incomplete or invalid multibyte sequence encountered\n")); 183 184 invalid++; 185 **out = *in; 186 in++; 187 inlen--; 188 (*out)++; 189 outlen--; 190 } 191 else if (errno == E2BIG) /* Output buffer full */ 192 { 193 char *new; 194 195 tooshort++; 196 done = len; 197 outlen = done + inlen * 2; 198 new = xmalloc (outlen + 1); 199 memcpy (new, s, done); 200 xfree (s); 201 s = new; 202 len = outlen; 203 *out = s + done; 204 } 205 else /* Weird, we got an unspecified error */ 206 { 207 logprintf (LOG_VERBOSE, _("Unhandled errno %d\n"), errno); 208 break; 209 } 210 } 211 212 return false; 213} 214 215/* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL 216 on error. */ 217char * 218idn_encode (struct iri *i, char *host) 219{ 220 char *new; 221 int ret; 222 223 /* Encode to UTF-8 if not done */ 224 if (!i->utf8_encode) 225 { 226 if (!remote_to_utf8 (i, (const char *) host, (const char **) &new)) 227 return NULL; /* Nothing to encode or an error occured */ 228 host = new; 229 } 230 231 /* toASCII UTF-8 NULL terminated string */ 232 ret = idna_to_ascii_8z (host, &new, IDNA_FLAGS); 233 if (ret != IDNA_SUCCESS) 234 { 235 /* sXXXav : free new when needed ! */ 236 logprintf (LOG_VERBOSE, _("idn_encode failed (%d): %s\n"), ret, 237 quote (idna_strerror (ret))); 238 return NULL; 239 } 240 241 return new; 242} 243 244/* Try to decode an "ASCII encoded" host. Return the new domain in the locale 245 on success or NULL on error. */ 246char * 247idn_decode (char *host) 248{ 249 char *new; 250 int ret; 251 252 ret = idna_to_unicode_8zlz (host, &new, IDNA_FLAGS); 253 if (ret != IDNA_SUCCESS) 254 { 255 logprintf (LOG_VERBOSE, _("idn_decode failed (%d): %s\n"), ret, 256 quote (idna_strerror (ret))); 257 return NULL; 258 } 259 260 return new; 261} 262 263/* Try to transcode string str from remote encoding to UTF-8. On success, *new 264 contains the transcoded string. *new content is unspecified otherwise. */ 265bool 266remote_to_utf8 (struct iri *i, const char *str, const char **new) 267{ 268 iconv_t cd; 269 bool ret = false; 270 271 if (!i->uri_encoding) 272 return false; 273 274 cd = iconv_open ("UTF-8", i->uri_encoding); 275 if (cd == (iconv_t)(-1)) 276 return false; 277 278 if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new)) 279 ret = true; 280 281 iconv_close (cd); 282 283 /* Test if something was converted */ 284 if (!strcmp (str, *new)) 285 { 286 xfree ((char *) *new); 287 return false; 288 } 289 290 return ret; 291} 292 293/* Allocate a new iri structure and return a pointer to it. */ 294struct iri * 295iri_new (void) 296{ 297 struct iri *i = xmalloc (sizeof *i); 298 i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL; 299 i->content_encoding = NULL; 300 i->orig_url = NULL; 301 i->utf8_encode = opt.enable_iri; 302 return i; 303} 304 305struct iri *iri_dup (const struct iri *src) 306{ 307 struct iri *i = xmalloc (sizeof *i); 308 i->uri_encoding = src->uri_encoding ? xstrdup (src->uri_encoding) : NULL; 309 i->content_encoding = (src->content_encoding ? 310 xstrdup (src->content_encoding) : NULL); 311 i->orig_url = src->orig_url ? xstrdup (src->orig_url) : NULL; 312 i->utf8_encode = src->utf8_encode; 313 return i; 314} 315 316/* Completely free an iri structure. */ 317void 318iri_free (struct iri *i) 319{ 320 xfree_null (i->uri_encoding); 321 xfree_null (i->content_encoding); 322 xfree_null (i->orig_url); 323 xfree (i); 324} 325 326/* Set uri_encoding of struct iri i. If a remote encoding was specified, use 327 it unless force is true. */ 328void 329set_uri_encoding (struct iri *i, char *charset, bool force) 330{ 331 DEBUGP (("URI encoding = %s\n", charset ? quote (charset) : "None")); 332 if (!force && opt.encoding_remote) 333 return; 334 if (i->uri_encoding) 335 { 336 if (charset && !strcasecmp (i->uri_encoding, charset)) 337 return; 338 xfree (i->uri_encoding); 339 } 340 341 i->uri_encoding = charset ? xstrdup (charset) : NULL; 342} 343 344/* Set content_encoding of struct iri i. */ 345void 346set_content_encoding (struct iri *i, char *charset) 347{ 348 DEBUGP (("URI content encoding = %s\n", charset ? quote (charset) : "None")); 349 if (opt.encoding_remote) 350 return; 351 if (i->content_encoding) 352 { 353 if (charset && !strcasecmp (i->content_encoding, charset)) 354 return; 355 xfree (i->content_encoding); 356 } 357 358 i->content_encoding = charset ? xstrdup (charset) : NULL; 359} 360 361