1/* IRI related functions.
2   Copyright (C) 2008, 2009 Free Software Foundation, Inc.
3
4This file is part of GNU Wget.
5
6GNU Wget is free software; you can redistribute it and/or modify
7it under the terms of the GNU General Public License as published by
8the Free Software Foundation; either version 3 of the License, or (at
9your option) any later version.
10
11GNU Wget is distributed in the hope that it will be useful,
12but WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14GNU General Public License for more details.
15
16You should have received a copy of the GNU General Public License
17along with Wget.  If not, see <http://www.gnu.org/licenses/>.
18
19Additional permission under GNU GPL version 3 section 7
20
21If you modify this program, or any covered work, by linking or
22combining it with the OpenSSL project's OpenSSL library (or a
23modified version of that library), containing parts covered by the
24terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
25grants you additional permission to convey the resulting work.
26Corresponding Source for a non-source form of such a combination
27shall include the source code for the parts of OpenSSL used as well
28as that of the covered work.  */
29
30#include "wget.h"
31
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35#include <iconv.h>
36#include <stringprep.h>
37#include <idna.h>
38#include <errno.h>
39
40#include "utils.h"
41
/* RFC3987 section 3.1 mandates STD3 ASCII RULES.  These flags are handed
   to the libidn calls made in this file (idna_to_ascii_8z and
   idna_to_unicode_8zlz). */
#define IDNA_FLAGS  IDNA_USE_STD3_ASCII_RULES

/* Note: locale encoding is kept in options struct (opt.locale) */

/* Forward declaration: do_conversion is defined further down in this file
   but is called by locale_to_utf8 before that point. */
static bool do_conversion (iconv_t cd, char *in, size_t inlen, char **out);
48
49
/* Extract the encoding name from a string containing "charset=XXX".
   The name starts right after the (case-insensitively matched) "charset="
   token and runs up to the next whitespace character or the end of the
   string.  Return an allocated copy of the name, or NULL when STR is NULL
   or empty, holds no "charset=" token, or the name is rejected by
   check_encoding_name. */
char *
parse_charset (char *str)
{
  char *begin;
  char *end;
  char *name;

  if (str == NULL || *str == '\0')
    return NULL;

  begin = strcasestr (str, "charset=");
  if (begin == NULL)
    return NULL;

  /* Step over the 8 characters of "charset=" */
  begin += 8;

  /* sXXXav: which chars should be banned ??? */
  for (end = begin; *end && !c_isspace (*end); end++)
    ;

  /* sXXXav: could strdupdelim return NULL ? */
  name = strdupdelim (begin, end);

  /* Only hand the name back if it survives the minimal sanity check */
  if (check_encoding_name (name))
    return name;

  xfree (name);
  return NULL;
}
85
/* Report the charset of the current locale, as given by libidn's
   stringprep_locale_charset.  The pointer obtained from that call is
   returned as is (only its const qualifier is cast away); no copy is
   made here. */
char *
find_locale (void)
{
  const char *charset = stringprep_locale_charset ();

  return (char *) charset;
}
92
93/* Basic check of an encoding name. */
94bool
95check_encoding_name (char *encoding)
96{
97  char *s = encoding;
98
99  while (*s)
100    {
101      if (!c_isascii (*s) || c_isspace (*s))
102        {
103          logprintf (LOG_VERBOSE, _("Encoding %s isn't valid\n"), quote (encoding));
104          return false;
105        }
106
107      s++;
108    }
109
110  return true;
111}
112
113/* Try opening an iconv_t descriptor for conversion from locale to UTF-8 */
114static bool
115open_locale_to_utf8 (void)
116{
117
118}
119
120/* Try converting string str from locale to UTF-8. Return a new string
121   on success, or str on error or if conversion isn't needed. */
122const char *
123locale_to_utf8 (const char *str)
124{
125  iconv_t l2u;
126  char *new;
127
128  /* That shouldn't happen, just in case */
129  if (!opt.locale)
130    {
131      logprintf (LOG_VERBOSE, _("locale_to_utf8: locale is unset\n"));
132      opt.locale = find_locale ();
133    }
134
135  if (!opt.locale || !strcasecmp (opt.locale, "utf-8"))
136    return str;
137
138  l2u = iconv_open ("UTF-8", opt.locale);
139  if (l2u != (iconv_t)(-1))
140    {
141      logprintf (LOG_VERBOSE, _("Conversion from %s to %s isn't supported\n"),
142                 quote (opt.locale), quote ("UTF-8"));
143      return str;
144    }
145
146  if (do_conversion (l2u, (char *) str, strlen ((char *) str), &new))
147    return (const char *) new;
148
149  return str;
150}
151
152/* Do the conversion according to the passed conversion descriptor cd. *out
153   will contain the transcoded string on success. *out content is
154   unspecified otherwise. */
155static bool
156do_conversion (iconv_t cd, char *in, size_t inlen, char **out)
157{
158  /* sXXXav : hummm hard to guess... */
159  size_t len, done, outlen = inlen * 2;
160  int invalid = 0, tooshort = 0;
161  char *s;
162
163  s = xmalloc (outlen + 1);
164  *out = s;
165  len = outlen;
166  done = 0;
167
168  for (;;)
169    {
170      if (iconv (cd, &in, &inlen, out, &outlen) != (size_t)(-1))
171        {
172          *out = s;
173          *(s + len - outlen - done) = '\0';
174          return true;
175        }
176
177      /* Incomplete or invalid multibyte sequence */
178      if (errno == EINVAL || errno == EILSEQ)
179        {
180          if (!invalid)
181            logprintf (LOG_VERBOSE,
182                      _("Incomplete or invalid multibyte sequence encountered\n"));
183
184          invalid++;
185          **out = *in;
186          in++;
187          inlen--;
188          (*out)++;
189          outlen--;
190        }
191      else if (errno == E2BIG) /* Output buffer full */
192        {
193          char *new;
194
195          tooshort++;
196          done = len;
197          outlen = done + inlen * 2;
198          new = xmalloc (outlen + 1);
199          memcpy (new, s, done);
200          xfree (s);
201          s = new;
202          len = outlen;
203          *out = s + done;
204        }
205      else /* Weird, we got an unspecified error */
206        {
207          logprintf (LOG_VERBOSE, _("Unhandled errno %d\n"), errno);
208          break;
209        }
210    }
211
212    return false;
213}
214
215/* Try to "ASCII encode" UTF-8 host. Return the new domain on success or NULL
216   on error. */
217char *
218idn_encode (struct iri *i, char *host)
219{
220  char *new;
221  int ret;
222
223  /* Encode to UTF-8 if not done */
224  if (!i->utf8_encode)
225    {
226      if (!remote_to_utf8 (i, (const char *) host, (const char **) &new))
227          return NULL;  /* Nothing to encode or an error occured */
228      host = new;
229    }
230
231  /* toASCII UTF-8 NULL terminated string */
232  ret = idna_to_ascii_8z (host, &new, IDNA_FLAGS);
233  if (ret != IDNA_SUCCESS)
234    {
235      /* sXXXav : free new when needed ! */
236      logprintf (LOG_VERBOSE, _("idn_encode failed (%d): %s\n"), ret,
237                 quote (idna_strerror (ret)));
238      return NULL;
239    }
240
241  return new;
242}
243
244/* Try to decode an "ASCII encoded" host. Return the new domain in the locale
245   on success or NULL on error. */
246char *
247idn_decode (char *host)
248{
249  char *new;
250  int ret;
251
252  ret = idna_to_unicode_8zlz (host, &new, IDNA_FLAGS);
253  if (ret != IDNA_SUCCESS)
254    {
255      logprintf (LOG_VERBOSE, _("idn_decode failed (%d): %s\n"), ret,
256                 quote (idna_strerror (ret)));
257      return NULL;
258    }
259
260  return new;
261}
262
263/* Try to transcode string str from remote encoding to UTF-8. On success, *new
264   contains the transcoded string. *new content is unspecified otherwise. */
265bool
266remote_to_utf8 (struct iri *i, const char *str, const char **new)
267{
268  iconv_t cd;
269  bool ret = false;
270
271  if (!i->uri_encoding)
272    return false;
273
274  cd = iconv_open ("UTF-8", i->uri_encoding);
275  if (cd == (iconv_t)(-1))
276    return false;
277
278  if (do_conversion (cd, (char *) str, strlen ((char *) str), (char **) new))
279    ret = true;
280
281  iconv_close (cd);
282
283  /* Test if something was converted */
284  if (!strcmp (str, *new))
285    {
286      xfree ((char *) *new);
287      return false;
288    }
289
290  return ret;
291}
292
293/* Allocate a new iri structure and return a pointer to it. */
294struct iri *
295iri_new (void)
296{
297  struct iri *i = xmalloc (sizeof *i);
298  i->uri_encoding = opt.encoding_remote ? xstrdup (opt.encoding_remote) : NULL;
299  i->content_encoding = NULL;
300  i->orig_url = NULL;
301  i->utf8_encode = opt.enable_iri;
302  return i;
303}
304
305struct iri *iri_dup (const struct iri *src)
306{
307  struct iri *i = xmalloc (sizeof *i);
308  i->uri_encoding = src->uri_encoding ? xstrdup (src->uri_encoding) : NULL;
309  i->content_encoding = (src->content_encoding ?
310                         xstrdup (src->content_encoding) : NULL);
311  i->orig_url = src->orig_url ? xstrdup (src->orig_url) : NULL;
312  i->utf8_encode = src->utf8_encode;
313  return i;
314}
315
316/* Completely free an iri structure. */
317void
318iri_free (struct iri *i)
319{
320  xfree_null (i->uri_encoding);
321  xfree_null (i->content_encoding);
322  xfree_null (i->orig_url);
323  xfree (i);
324}
325
326/* Set uri_encoding of struct iri i. If a remote encoding was specified, use
327   it unless force is true. */
328void
329set_uri_encoding (struct iri *i, char *charset, bool force)
330{
331  DEBUGP (("URI encoding = %s\n", charset ? quote (charset) : "None"));
332  if (!force && opt.encoding_remote)
333    return;
334  if (i->uri_encoding)
335    {
336      if (charset && !strcasecmp (i->uri_encoding, charset))
337        return;
338      xfree (i->uri_encoding);
339    }
340
341  i->uri_encoding = charset ? xstrdup (charset) : NULL;
342}
343
344/* Set content_encoding of struct iri i. */
345void
346set_content_encoding (struct iri *i, char *charset)
347{
348  DEBUGP (("URI content encoding = %s\n", charset ? quote (charset) : "None"));
349  if (opt.encoding_remote)
350    return;
351  if (i->content_encoding)
352    {
353      if (charset && !strcasecmp (i->content_encoding, charset))
354        return;
355      xfree (i->content_encoding);
356    }
357
358  i->content_encoding = charset ? xstrdup (charset) : NULL;
359}
360
361