1263409Smarcel/* Convert multibyte character to wide character.
2263409Smarcel   Copyright (C) 1999-2002, 2005-2010 Free Software Foundation, Inc.
3263409Smarcel   Written by Bruno Haible <bruno@clisp.org>, 2008.
4263409Smarcel
5263409Smarcel   This program is free software: you can redistribute it and/or modify
6263409Smarcel   it under the terms of the GNU General Public License as published by
7263409Smarcel   the Free Software Foundation; either version 3 of the License, or
8263409Smarcel   (at your option) any later version.
9263409Smarcel
10263409Smarcel   This program is distributed in the hope that it will be useful,
11263409Smarcel   but WITHOUT ANY WARRANTY; without even the implied warranty of
12263409Smarcel   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13263409Smarcel   GNU General Public License for more details.
14263409Smarcel
15263409Smarcel   You should have received a copy of the GNU General Public License
16263409Smarcel   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17263409Smarcel
18263409Smarcel#include <config.h>
19263409Smarcel
20263409Smarcel/* Specification.  */
21263409Smarcel#include <wchar.h>
22263409Smarcel
23263409Smarcel#if GNULIB_defined_mbstate_t
24263409Smarcel/* Implement mbrtowc() on top of mbtowc().  */
25263409Smarcel
26263409Smarcel# include <errno.h>
27263409Smarcel# include <stdlib.h>
28263409Smarcel
29263409Smarcel# include "localcharset.h"
30263409Smarcel# include "streq.h"
31263442Smarcel# include "verify.h"
32263674Smarcel
33263442Smarcel
34263409Smarcelverify (sizeof (mbstate_t) >= 4);
35263674Smarcel
36263674Smarcelstatic char internal_state[4];
37263409Smarcel
38268161Smarcelsize_t
39263409Smarcelmbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
40263409Smarcel{
41263409Smarcel  char *pstate = (char *)ps;
42268161Smarcel
43268161Smarcel  if (pstate == NULL)
44268161Smarcel    pstate = internal_state;
45268161Smarcel
46263409Smarcel  if (s == NULL)
47263674Smarcel    {
48263674Smarcel      pwc = NULL;
49263487Smarcel      s = "";
50263409Smarcel      n = 1;
51263409Smarcel    }
52272030Smarcel
53272030Smarcel  if (n == 0)
54263409Smarcel    return (size_t)(-2);
55263409Smarcel
56272030Smarcel  /* Here n > 0.  */
57272030Smarcel  {
58263409Smarcel    size_t nstate = pstate[0];
59263409Smarcel    char buf[4];
60263674Smarcel    const char *p;
61272776Smarcel    size_t m;
62263674Smarcel
63272776Smarcel    switch (nstate)
64263674Smarcel      {
65272776Smarcel      case 0:
66272776Smarcel        p = s;
67272776Smarcel        m = n;
68272776Smarcel        break;
69263674Smarcel      case 3:
70263674Smarcel        buf[2] = pstate[3];
71263442Smarcel        /*FALLTHROUGH*/
72268161Smarcel      case 2:
73263442Smarcel        buf[1] = pstate[2];
74263674Smarcel        /*FALLTHROUGH*/
75263674Smarcel      case 1:
76263674Smarcel        buf[0] = pstate[1];
77272776Smarcel        p = buf;
78263674Smarcel        m = nstate;
79263674Smarcel        buf[m++] = s[0];
80263674Smarcel        if (n >= 2 && m < 4)
81263674Smarcel          {
82263674Smarcel            buf[m++] = s[1];
83263674Smarcel            if (n >= 3 && m < 4)
84263674Smarcel              buf[m++] = s[2];
85263674Smarcel          }
86263674Smarcel        break;
87263674Smarcel      default:
88263843Smarcel        errno = EINVAL;
89272776Smarcel        return (size_t)(-1);
90263674Smarcel      }
91263843Smarcel
92263674Smarcel    /* Here m > 0.  */
93263674Smarcel
94272776Smarcel# if __GLIBC__
95263843Smarcel    /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
96272776Smarcel    mbtowc (NULL, NULL, 0);
97263674Smarcel# endif
98263674Smarcel    {
99263674Smarcel      int res = mbtowc (pwc, p, m);
100272776Smarcel
101263674Smarcel      if (res >= 0)
102263674Smarcel        {
103263843Smarcel          if (pwc != NULL && ((*pwc == 0) != (res == 0)))
104263674Smarcel            abort ();
105263674Smarcel          if (nstate >= (res > 0 ? res : 1))
106272776Smarcel            abort ();
107263843Smarcel          res -= nstate;
108272776Smarcel          pstate[0] = 0;
109263674Smarcel          return res;
110263674Smarcel        }
111268161Smarcel
112263674Smarcel      /* mbtowc does not distinguish between invalid and incomplete multibyte
113263674Smarcel         sequences.  But mbrtowc needs to make this distinction.
114263674Smarcel         There are two possible approaches:
115263674Smarcel           - Use iconv() and its return value.
116263674Smarcel           - Use built-in knowledge about the possible encodings.
117263674Smarcel         Given the low quality of implementation of iconv() on the systems that
118263674Smarcel         lack mbrtowc(), we use the second approach.
119263674Smarcel         The possible encodings are:
120263442Smarcel           - 8-bit encodings,
121263442Smarcel           - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
122263409Smarcel           - UTF-8.
123263409Smarcel         Use specialized code for each.  */
124263409Smarcel      if (m >= 4 || m >= MB_CUR_MAX)
125263409Smarcel        goto invalid;
126263440Smarcel      /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
127263442Smarcel      {
128263700Smarcel        const char *encoding = locale_charset ();
129263700Smarcel
130263409Smarcel        if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
131263409Smarcel          {
132263409Smarcel            /* Cf. unistr/u8-mblen.c.  */
133            unsigned char c = (unsigned char) p[0];
134
135            if (c >= 0xc2)
136              {
137                if (c < 0xe0)
138                  {
139                    if (m == 1)
140                      goto incomplete;
141                  }
142                else if (c < 0xf0)
143                  {
144                    if (m == 1)
145                      goto incomplete;
146                    if (m == 2)
147                      {
148                        unsigned char c2 = (unsigned char) p[1];
149
150                        if ((c2 ^ 0x80) < 0x40
151                            && (c >= 0xe1 || c2 >= 0xa0)
152                            && (c != 0xed || c2 < 0xa0))
153                          goto incomplete;
154                      }
155                  }
156                else if (c <= 0xf4)
157                  {
158                    if (m == 1)
159                      goto incomplete;
160                    else /* m == 2 || m == 3 */
161                      {
162                        unsigned char c2 = (unsigned char) p[1];
163
164                        if ((c2 ^ 0x80) < 0x40
165                            && (c >= 0xf1 || c2 >= 0x90)
166                            && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
167                          {
168                            if (m == 2)
169                              goto incomplete;
170                            else /* m == 3 */
171                              {
172                                unsigned char c3 = (unsigned char) p[2];
173
174                                if ((c3 ^ 0x80) < 0x40)
175                                  goto incomplete;
176                              }
177                          }
178                      }
179                  }
180              }
181            goto invalid;
182          }
183
184        /* As a reference for this code, you can use the GNU libiconv
185           implementation.  Look for uses of the RET_TOOFEW macro.  */
186
187        if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
188          {
189            if (m == 1)
190              {
191                unsigned char c = (unsigned char) p[0];
192
193                if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
194                  goto incomplete;
195              }
196            if (m == 2)
197              {
198                unsigned char c = (unsigned char) p[0];
199
200                if (c == 0x8f)
201                  {
202                    unsigned char c2 = (unsigned char) p[1];
203
204                    if (c2 >= 0xa1 && c2 < 0xff)
205                      goto incomplete;
206                  }
207              }
208            goto invalid;
209          }
210        if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
211            || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
212            || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
213          {
214            if (m == 1)
215              {
216                unsigned char c = (unsigned char) p[0];
217
218                if (c >= 0xa1 && c < 0xff)
219                  goto incomplete;
220              }
221            goto invalid;
222          }
223        if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
224          {
225            if (m == 1)
226              {
227                unsigned char c = (unsigned char) p[0];
228
229                if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
230                  goto incomplete;
231              }
232            else /* m == 2 || m == 3 */
233              {
234                unsigned char c = (unsigned char) p[0];
235
236                if (c == 0x8e)
237                  goto incomplete;
238              }
239            goto invalid;
240          }
241        if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
242          {
243            if (m == 1)
244              {
245                unsigned char c = (unsigned char) p[0];
246
247                if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
248                  goto incomplete;
249              }
250            else /* m == 2 || m == 3 */
251              {
252                unsigned char c = (unsigned char) p[0];
253
254                if (c >= 0x90 && c <= 0xe3)
255                  {
256                    unsigned char c2 = (unsigned char) p[1];
257
258                    if (c2 >= 0x30 && c2 <= 0x39)
259                      {
260                        if (m == 2)
261                          goto incomplete;
262                        else /* m == 3 */
263                          {
264                            unsigned char c3 = (unsigned char) p[2];
265
266                            if (c3 >= 0x81 && c3 <= 0xfe)
267                              goto incomplete;
268                          }
269                      }
270                  }
271              }
272            goto invalid;
273          }
274        if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
275          {
276            if (m == 1)
277              {
278                unsigned char c = (unsigned char) p[0];
279
280                if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
281                    || (c >= 0xf0 && c <= 0xf9))
282                  goto incomplete;
283              }
284            goto invalid;
285          }
286
287        /* An unknown multibyte encoding.  */
288        goto incomplete;
289      }
290
291     incomplete:
292      {
293        size_t k = nstate;
294        /* Here 0 <= k < m < 4.  */
295        pstate[++k] = s[0];
296        if (k < m)
297          {
298            pstate[++k] = s[1];
299            if (k < m)
300              pstate[++k] = s[2];
301          }
302        if (k != m)
303          abort ();
304      }
305      pstate[0] = m;
306      return (size_t)(-2);
307
308     invalid:
309      errno = EILSEQ;
310      /* The conversion state is undefined, says POSIX.  */
311      return (size_t)(-1);
312    }
313  }
314}
315
316#else
317/* Override the system's mbrtowc() function.  */
318
319# undef mbrtowc
320
/* Replacement for mbrtowc() that calls the system's function and works
   around the platform bugs selected by the MBRTOWC_*_BUG macros.

   PWC  receives the wide character, unless it is NULL.
   S    is the input; NULL means the same as the call (NULL, "", 1, PS).
   N    is the number of input bytes available.
   PS   is the conversion state, or NULL for an internal state.

   Return the number of bytes consumed (0 for the null wide character),
   (size_t)(-2) for an incomplete character or when N is 0 for a non-null S,
   or (size_t)(-1) if the system's mbrtowc() reports an error.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
# if MBRTOWC_NULL_ARG_BUG || MBRTOWC_RETVAL_BUG
  /* Spell out the meaning of a null S, so that neither the system's
     function nor the byte-wise loop below has to cope with it.  */
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

  /* With no input bytes available the character cannot be completed.
     Answer this directly and leave the state untouched, rather than relying
     on the system's function, which may return 0 here.  A null S is
     excluded because N is ignored in that case.  */
  if (s != NULL && n == 0)
    return (size_t)(-2);

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte, counting
           the bytes ourselves instead of trusting the return value.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            wchar_t wc;
            size_t ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                if (pwc != NULL)
                  *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        /* All N bytes were consumed without completing the character.  */
        return (size_t)(-2);
      }
  }
# endif

# if MBRTOWC_NUL_RETVAL_BUG
  {
    /* Convert into a local variable, so that the result can be inspected
       even when PWC is NULL, and return 0 for the null wide character.  */
    wchar_t wc;
    size_t ret = mbrtowc (&wc, s, n, ps);

    if (ret != (size_t)(-1) && ret != (size_t)(-2))
      {
        if (pwc != NULL)
          *pwc = wc;
        if (wc == 0)
          ret = 0;
      }
    return ret;
  }
# else
  return mbrtowc (pwc, s, n, ps);
# endif
}
385
386#endif
387