/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2014 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18#include <config.h>
19
20/* Specification.  */
21#include <wchar.h>
22
23#if GNULIB_defined_mbstate_t
24/* Implement mbrtowc() on top of mbtowc().  */
25
26# include <errno.h>
27# include <stdlib.h>
28
29# include "localcharset.h"
30# include "streq.h"
31# include "verify.h"
32
33
34verify (sizeof (mbstate_t) >= 4);
35
36static char internal_state[4];
37
38size_t
39mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
40{
41  char *pstate = (char *)ps;
42
43  if (s == NULL)
44    {
45      pwc = NULL;
46      s = "";
47      n = 1;
48    }
49
50  if (n == 0)
51    return (size_t)(-2);
52
53  /* Here n > 0.  */
54
55  if (pstate == NULL)
56    pstate = internal_state;
57
58  {
59    size_t nstate = pstate[0];
60    char buf[4];
61    const char *p;
62    size_t m;
63
64    switch (nstate)
65      {
66      case 0:
67        p = s;
68        m = n;
69        break;
70      case 3:
71        buf[2] = pstate[3];
72        /*FALLTHROUGH*/
73      case 2:
74        buf[1] = pstate[2];
75        /*FALLTHROUGH*/
76      case 1:
77        buf[0] = pstate[1];
78        p = buf;
79        m = nstate;
80        buf[m++] = s[0];
81        if (n >= 2 && m < 4)
82          {
83            buf[m++] = s[1];
84            if (n >= 3 && m < 4)
85              buf[m++] = s[2];
86          }
87        break;
88      default:
89        errno = EINVAL;
90        return (size_t)(-1);
91      }
92
93    /* Here m > 0.  */
94
95# if __GLIBC__ || defined __UCLIBC__
96    /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
97    mbtowc (NULL, NULL, 0);
98# endif
99    {
100      int res = mbtowc (pwc, p, m);
101
102      if (res >= 0)
103        {
104          if (pwc != NULL && ((*pwc == 0) != (res == 0)))
105            abort ();
106          if (nstate >= (res > 0 ? res : 1))
107            abort ();
108          res -= nstate;
109          pstate[0] = 0;
110          return res;
111        }
112
113      /* mbtowc does not distinguish between invalid and incomplete multibyte
114         sequences.  But mbrtowc needs to make this distinction.
115         There are two possible approaches:
116           - Use iconv() and its return value.
117           - Use built-in knowledge about the possible encodings.
118         Given the low quality of implementation of iconv() on the systems that
119         lack mbrtowc(), we use the second approach.
120         The possible encodings are:
121           - 8-bit encodings,
122           - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
123           - UTF-8.
124         Use specialized code for each.  */
125      if (m >= 4 || m >= MB_CUR_MAX)
126        goto invalid;
127      /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
128      {
129        const char *encoding = locale_charset ();
130
131        if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
132          {
133            /* Cf. unistr/u8-mblen.c.  */
134            unsigned char c = (unsigned char) p[0];
135
136            if (c >= 0xc2)
137              {
138                if (c < 0xe0)
139                  {
140                    if (m == 1)
141                      goto incomplete;
142                  }
143                else if (c < 0xf0)
144                  {
145                    if (m == 1)
146                      goto incomplete;
147                    if (m == 2)
148                      {
149                        unsigned char c2 = (unsigned char) p[1];
150
151                        if ((c2 ^ 0x80) < 0x40
152                            && (c >= 0xe1 || c2 >= 0xa0)
153                            && (c != 0xed || c2 < 0xa0))
154                          goto incomplete;
155                      }
156                  }
157                else if (c <= 0xf4)
158                  {
159                    if (m == 1)
160                      goto incomplete;
161                    else /* m == 2 || m == 3 */
162                      {
163                        unsigned char c2 = (unsigned char) p[1];
164
165                        if ((c2 ^ 0x80) < 0x40
166                            && (c >= 0xf1 || c2 >= 0x90)
167                            && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
168                          {
169                            if (m == 2)
170                              goto incomplete;
171                            else /* m == 3 */
172                              {
173                                unsigned char c3 = (unsigned char) p[2];
174
175                                if ((c3 ^ 0x80) < 0x40)
176                                  goto incomplete;
177                              }
178                          }
179                      }
180                  }
181              }
182            goto invalid;
183          }
184
185        /* As a reference for this code, you can use the GNU libiconv
186           implementation.  Look for uses of the RET_TOOFEW macro.  */
187
188        if (STREQ_OPT (encoding,
189                       "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
190          {
191            if (m == 1)
192              {
193                unsigned char c = (unsigned char) p[0];
194
195                if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
196                  goto incomplete;
197              }
198            if (m == 2)
199              {
200                unsigned char c = (unsigned char) p[0];
201
202                if (c == 0x8f)
203                  {
204                    unsigned char c2 = (unsigned char) p[1];
205
206                    if (c2 >= 0xa1 && c2 < 0xff)
207                      goto incomplete;
208                  }
209              }
210            goto invalid;
211          }
212        if (STREQ_OPT (encoding,
213                       "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
214            || STREQ_OPT (encoding,
215                          "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
216            || STREQ_OPT (encoding,
217                          "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
218          {
219            if (m == 1)
220              {
221                unsigned char c = (unsigned char) p[0];
222
223                if (c >= 0xa1 && c < 0xff)
224                  goto incomplete;
225              }
226            goto invalid;
227          }
228        if (STREQ_OPT (encoding,
229                       "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
230          {
231            if (m == 1)
232              {
233                unsigned char c = (unsigned char) p[0];
234
235                if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
236                  goto incomplete;
237              }
238            else /* m == 2 || m == 3 */
239              {
240                unsigned char c = (unsigned char) p[0];
241
242                if (c == 0x8e)
243                  goto incomplete;
244              }
245            goto invalid;
246          }
247        if (STREQ_OPT (encoding,
248                       "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
249          {
250            if (m == 1)
251              {
252                unsigned char c = (unsigned char) p[0];
253
254                if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
255                  goto incomplete;
256              }
257            else /* m == 2 || m == 3 */
258              {
259                unsigned char c = (unsigned char) p[0];
260
261                if (c >= 0x90 && c <= 0xe3)
262                  {
263                    unsigned char c2 = (unsigned char) p[1];
264
265                    if (c2 >= 0x30 && c2 <= 0x39)
266                      {
267                        if (m == 2)
268                          goto incomplete;
269                        else /* m == 3 */
270                          {
271                            unsigned char c3 = (unsigned char) p[2];
272
273                            if (c3 >= 0x81 && c3 <= 0xfe)
274                              goto incomplete;
275                          }
276                      }
277                  }
278              }
279            goto invalid;
280          }
281        if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
282          {
283            if (m == 1)
284              {
285                unsigned char c = (unsigned char) p[0];
286
287                if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
288                    || (c >= 0xf0 && c <= 0xf9))
289                  goto incomplete;
290              }
291            goto invalid;
292          }
293
294        /* An unknown multibyte encoding.  */
295        goto incomplete;
296      }
297
298     incomplete:
299      {
300        size_t k = nstate;
301        /* Here 0 <= k < m < 4.  */
302        pstate[++k] = s[0];
303        if (k < m)
304          {
305            pstate[++k] = s[1];
306            if (k < m)
307              pstate[++k] = s[2];
308          }
309        if (k != m)
310          abort ();
311      }
312      pstate[0] = m;
313      return (size_t)(-2);
314
315     invalid:
316      errno = EILSEQ;
317      /* The conversion state is undefined, says POSIX.  */
318      return (size_t)(-1);
319    }
320  }
321}
322
323#else
324/* Override the system's mbrtowc() function.  */
325
326# undef mbrtowc
327
/* Wrapper around the system's mbrtowc() that compensates for the platform
   bugs selected through the MBRTOWC_*_BUG configuration macros.

   PWC  where to store the converted wide character; may be NULL.
   S    the multibyte input.  When any workaround that inspects S, N or the
        converted character is enabled, a null S is first rewritten to the
        call (NULL, "", 1, ps).
   N    the maximum number of bytes of S to examine.
   PS   the conversion state; may be NULL.

   Returns the result of the system's mbrtowc(), adjusted as follows:
     - MBRTOWC_EMPTY_INPUT_BUG: N == 0 yields (size_t)(-2) without calling
       the system function.
     - MBRTOWC_RETVAL_BUG: when the state is not initial, the input is fed
       one byte at a time and the number of bytes consumed by this call is
       returned (0 for the null wide character).
     - MBRTOWC_NUL_RETVAL_BUG: a converted null wide character yields 0.
     - MBRTOWC_NULL_ARG1_BUG: a null PWC is replaced by a dummy object.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  /* MBRTOWC_NUL_RETVAL_BUG is part of this condition because its code path
     reads a local wide character after the call; with a null S the system
     function stores nothing there, so the value read (and copied to *pwc)
     would be indeterminate.  After this rewrite PWC is NULL and the local
     is always stored by the call.  */
# if (MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG \
      || MBRTOWC_NUL_RETVAL_BUG)
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            wchar_t wc;
            size_t ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                if (pwc != NULL)
                  *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        /* All N bytes consumed and the character is still incomplete.  */
        return (size_t)(-2);
      }
  }
# endif

# if MBRTOWC_NUL_RETVAL_BUG
  {
    wchar_t wc;
    size_t ret = mbrtowc (&wc, s, n, ps);

    if (ret != (size_t)(-1) && ret != (size_t)(-2))
      {
        if (pwc != NULL)
          *pwc = wc;
        if (wc == 0)
          ret = 0;
      }
    return ret;
  }
# else
  {
#   if MBRTOWC_NULL_ARG1_BUG
    wchar_t dummy;

    if (pwc == NULL)
      pwc = &dummy;
#   endif

    return mbrtowc (pwc, s, n, ps);
  }
# endif
}
406
407#endif
408