1/* Convert multibyte character to wide character.
2   Copyright (C) 1999-2002, 2005-2010 Free Software Foundation, Inc.
3   Written by Bruno Haible <bruno@clisp.org>, 2008.
4
5   This program is free software: you can redistribute it and/or modify
6   it under the terms of the GNU Lesser General Public License as published by
7   the Free Software Foundation; either version 3 of the License, or
8   (at your option) any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18#include <config.h>
19
20/* Specification.  */
21#include <wchar.h>
22
23#if GNULIB_defined_mbstate_t
24/* Implement mbrtowc() on top of mbtowc().  */
25
26# include <errno.h>
27# include <stdlib.h>
28
29# include "localcharset.h"
30# include "streq.h"
31# include "verify.h"
32
33
34verify (sizeof (mbstate_t) >= 4);
35
36static char internal_state[4];
37
38size_t
39mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
40{
41  char *pstate = (char *)ps;
42
43  if (pstate == NULL)
44    pstate = internal_state;
45
46  if (s == NULL)
47    {
48      pwc = NULL;
49      s = "";
50      n = 1;
51    }
52
53  if (n == 0)
54    return (size_t)(-2);
55
56  /* Here n > 0.  */
57  {
58    size_t nstate = pstate[0];
59    char buf[4];
60    const char *p;
61    size_t m;
62
63    switch (nstate)
64      {
65      case 0:
66        p = s;
67        m = n;
68        break;
69      case 3:
70        buf[2] = pstate[3];
71        /*FALLTHROUGH*/
72      case 2:
73        buf[1] = pstate[2];
74        /*FALLTHROUGH*/
75      case 1:
76        buf[0] = pstate[1];
77        p = buf;
78        m = nstate;
79        buf[m++] = s[0];
80        if (n >= 2 && m < 4)
81          {
82            buf[m++] = s[1];
83            if (n >= 3 && m < 4)
84              buf[m++] = s[2];
85          }
86        break;
87      default:
88        errno = EINVAL;
89        return (size_t)(-1);
90      }
91
92    /* Here m > 0.  */
93
94# if __GLIBC__
95    /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
96    mbtowc (NULL, NULL, 0);
97# endif
98    {
99      int res = mbtowc (pwc, p, m);
100
101      if (res >= 0)
102        {
103          if (pwc != NULL && ((*pwc == 0) != (res == 0)))
104            abort ();
105          if (nstate >= (res > 0 ? res : 1))
106            abort ();
107          res -= nstate;
108          pstate[0] = 0;
109          return res;
110        }
111
112      /* mbtowc does not distinguish between invalid and incomplete multibyte
113         sequences.  But mbrtowc needs to make this distinction.
114         There are two possible approaches:
115           - Use iconv() and its return value.
116           - Use built-in knowledge about the possible encodings.
117         Given the low quality of implementation of iconv() on the systems that
118         lack mbrtowc(), we use the second approach.
119         The possible encodings are:
120           - 8-bit encodings,
121           - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
122           - UTF-8.
123         Use specialized code for each.  */
124      if (m >= 4 || m >= MB_CUR_MAX)
125        goto invalid;
126      /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
127      {
128        const char *encoding = locale_charset ();
129
130        if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
131          {
132            /* Cf. unistr/u8-mblen.c.  */
133            unsigned char c = (unsigned char) p[0];
134
135            if (c >= 0xc2)
136              {
137                if (c < 0xe0)
138                  {
139                    if (m == 1)
140                      goto incomplete;
141                  }
142                else if (c < 0xf0)
143                  {
144                    if (m == 1)
145                      goto incomplete;
146                    if (m == 2)
147                      {
148                        unsigned char c2 = (unsigned char) p[1];
149
150                        if ((c2 ^ 0x80) < 0x40
151                            && (c >= 0xe1 || c2 >= 0xa0)
152                            && (c != 0xed || c2 < 0xa0))
153                          goto incomplete;
154                      }
155                  }
156                else if (c <= 0xf4)
157                  {
158                    if (m == 1)
159                      goto incomplete;
160                    else /* m == 2 || m == 3 */
161                      {
162                        unsigned char c2 = (unsigned char) p[1];
163
164                        if ((c2 ^ 0x80) < 0x40
165                            && (c >= 0xf1 || c2 >= 0x90)
166                            && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
167                          {
168                            if (m == 2)
169                              goto incomplete;
170                            else /* m == 3 */
171                              {
172                                unsigned char c3 = (unsigned char) p[2];
173
174                                if ((c3 ^ 0x80) < 0x40)
175                                  goto incomplete;
176                              }
177                          }
178                      }
179                  }
180              }
181            goto invalid;
182          }
183
184        /* As a reference for this code, you can use the GNU libiconv
185           implementation.  Look for uses of the RET_TOOFEW macro.  */
186
187        if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
188          {
189            if (m == 1)
190              {
191                unsigned char c = (unsigned char) p[0];
192
193                if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
194                  goto incomplete;
195              }
196            if (m == 2)
197              {
198                unsigned char c = (unsigned char) p[0];
199
200                if (c == 0x8f)
201                  {
202                    unsigned char c2 = (unsigned char) p[1];
203
204                    if (c2 >= 0xa1 && c2 < 0xff)
205                      goto incomplete;
206                  }
207              }
208            goto invalid;
209          }
210        if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
211            || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
212            || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
213          {
214            if (m == 1)
215              {
216                unsigned char c = (unsigned char) p[0];
217
218                if (c >= 0xa1 && c < 0xff)
219                  goto incomplete;
220              }
221            goto invalid;
222          }
223        if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
224          {
225            if (m == 1)
226              {
227                unsigned char c = (unsigned char) p[0];
228
229                if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
230                  goto incomplete;
231              }
232            else /* m == 2 || m == 3 */
233              {
234                unsigned char c = (unsigned char) p[0];
235
236                if (c == 0x8e)
237                  goto incomplete;
238              }
239            goto invalid;
240          }
241        if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
242          {
243            if (m == 1)
244              {
245                unsigned char c = (unsigned char) p[0];
246
247                if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
248                  goto incomplete;
249              }
250            else /* m == 2 || m == 3 */
251              {
252                unsigned char c = (unsigned char) p[0];
253
254                if (c >= 0x90 && c <= 0xe3)
255                  {
256                    unsigned char c2 = (unsigned char) p[1];
257
258                    if (c2 >= 0x30 && c2 <= 0x39)
259                      {
260                        if (m == 2)
261                          goto incomplete;
262                        else /* m == 3 */
263                          {
264                            unsigned char c3 = (unsigned char) p[2];
265
266                            if (c3 >= 0x81 && c3 <= 0xfe)
267                              goto incomplete;
268                          }
269                      }
270                  }
271              }
272            goto invalid;
273          }
274        if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
275          {
276            if (m == 1)
277              {
278                unsigned char c = (unsigned char) p[0];
279
280                if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
281                    || (c >= 0xf0 && c <= 0xf9))
282                  goto incomplete;
283              }
284            goto invalid;
285          }
286
287        /* An unknown multibyte encoding.  */
288        goto incomplete;
289      }
290
291     incomplete:
292      {
293        size_t k = nstate;
294        /* Here 0 <= k < m < 4.  */
295        pstate[++k] = s[0];
296        if (k < m)
297          {
298            pstate[++k] = s[1];
299            if (k < m)
300              pstate[++k] = s[2];
301          }
302        if (k != m)
303          abort ();
304      }
305      pstate[0] = m;
306      return (size_t)(-2);
307
308     invalid:
309      errno = EILSEQ;
310      /* The conversion state is undefined, says POSIX.  */
311      return (size_t)(-1);
312    }
313  }
314}
315
316#else
317/* Override the system's mbrtowc() function.  */
318
319# undef mbrtowc
320
/* Wrapper around the system's mbrtowc() that compensates for the defects
   selected at configure time:
     MBRTOWC_NULL_ARG_BUG    - a NULL S is rewritten into the equivalent call
                               with PWC = NULL, S = "", N = 1;
     MBRTOWC_RETVAL_BUG      - when the state is not initial, the rest of the
                               character is parsed one byte at a time and the
                               number of bytes taken from S is returned;
     MBRTOWC_NUL_RETVAL_BUG  - the return value for the null wide character
                               is forced to 0.
   With none of them defined this is a plain call to mbrtowc().  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
# if MBRTOWC_NULL_ARG_BUG || MBRTOWC_RETVAL_BUG
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t fallback_state;

    /* Substitute our own state for mbrtowc's hidden one: mbsinit() cannot
       be applied to the hidden state, but it can be applied to ours.  */
    if (ps == NULL)
      ps = &fallback_state;

    if (!mbsinit (ps))
      {
        /* A character is pending: feed the input one byte at a time and
           count how many bytes were needed to finish it.  */
        size_t consumed;

        for (consumed = 0; consumed < n; )
          {
            wchar_t wide;
            size_t rc = mbrtowc (&wide, s + consumed, 1, ps);

            if (rc == (size_t)(-1))
              return rc;
            consumed++;
            if (rc == (size_t)(-2))
              continue;

            /* The multibyte character has been completed.  */
            if (pwc != NULL)
              *pwc = wide;
            return (wide != 0 ? consumed : 0);
          }
        return (size_t)(-2);
      }
  }
# endif

# if MBRTOWC_NUL_RETVAL_BUG
  {
    wchar_t wide;
    size_t rc = mbrtowc (&wide, s, n, ps);

    /* Error and incomplete results are passed through untouched.  */
    if (rc == (size_t)(-1) || rc == (size_t)(-2))
      return rc;

    if (pwc != NULL)
      *pwc = wide;
    return (wide == 0 ? 0 : rc);
  }
# else
  return mbrtowc (pwc, s, n, ps);
# endif
}
385
386#endif
387