/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2022 Free Software Foundation, Inc.

   This file is free software: you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of the
   License, or (at your option) any later version.

   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

/* Written by Bruno Haible <bruno@clisp.org>, 2008.  */

/* This file contains the body of the mbrtowc and mbrtoc32 functions,
   when GNULIB_defined_mbstate_t is defined.  */

22  char *pstate = (char *)ps;
23
24  if (s == NULL)
25    {
26      pwc = NULL;
27      s = "";
28      n = 1;
29    }
30
31  if (n == 0)
32    return (size_t)(-2);
33
34  /* Here n > 0.  */
35
36  if (pstate == NULL)
37    pstate = internal_state;
38
39  {
40    size_t nstate = pstate[0];
41    char buf[4];
42    const char *p;
43    size_t m;
44    enc_t enc;
45    int res;
46
47    switch (nstate)
48      {
49      case 0:
50        p = s;
51        m = n;
52        break;
53      case 3:
54        buf[2] = pstate[3];
55        FALLTHROUGH;
56      case 2:
57        buf[1] = pstate[2];
58        FALLTHROUGH;
59      case 1:
60        buf[0] = pstate[1];
61        p = buf;
62        m = nstate;
63        buf[m++] = s[0];
64        if (n >= 2 && m < 4)
65          {
66            buf[m++] = s[1];
67            if (n >= 3 && m < 4)
68              buf[m++] = s[2];
69          }
70        break;
71      default:
72        errno = EINVAL;
73        return (size_t)(-1);
74      }
75
76    /* Here m > 0.  */
77
78    enc = locale_encoding_classification ();
79
80    if (enc == enc_utf8) /* UTF-8 */
81      {
82        /* Achieve
83             - multi-thread safety and
84             - the ability to produce wide character values > WCHAR_MAX
85           by not calling mbtowc() at all.  */
86#include "mbrtowc-impl-utf8.h"
87      }
88    else
89      {
90        /* The hidden internal state of mbtowc would make this function not
91           multi-thread safe.  Achieve multi-thread safety through a lock.  */
92        wchar_t wc;
93        res = mbtowc_with_lock (&wc, p, m);
94
95        if (res >= 0)
96          {
97            if ((wc == 0) != (res == 0))
98              abort ();
99            if (pwc != NULL)
100              *pwc = wc;
101            goto success;
102          }
103
104        /* mbtowc does not distinguish between invalid and incomplete multibyte
105           sequences.  But mbrtowc needs to make this distinction.
106           There are two possible approaches:
107             - Use iconv() and its return value.
108             - Use built-in knowledge about the possible encodings.
109           Given the low quality of implementation of iconv() on the systems
110           that lack mbrtowc(), we use the second approach.
111           The possible encodings are:
112             - 8-bit encodings,
113             - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
114             - UTF-8 (already handled above).
115           Use specialized code for each.  */
116        if (m >= 4 || m >= MB_CUR_MAX)
117          goto invalid;
118        /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
119        switch (enc)
120          {
121          /* As a reference for this code, you can use the GNU libiconv
122             implementation.  Look for uses of the RET_TOOFEW macro.  */
123
124          case enc_eucjp: /* EUC-JP */
125            {
126              if (m == 1)
127                {
128                  unsigned char c = (unsigned char) p[0];
129
130                  if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
131                    goto incomplete;
132                }
133              if (m == 2)
134                {
135                  unsigned char c = (unsigned char) p[0];
136
137                  if (c == 0x8f)
138                    {
139                      unsigned char c2 = (unsigned char) p[1];
140
141                      if (c2 >= 0xa1 && c2 < 0xff)
142                        goto incomplete;
143                    }
144                }
145              goto invalid;
146            }
147
148          case enc_94: /* EUC-KR, GB2312, BIG5 */
149            {
150              if (m == 1)
151                {
152                  unsigned char c = (unsigned char) p[0];
153
154                  if (c >= 0xa1 && c < 0xff)
155                    goto incomplete;
156                }
157              goto invalid;
158            }
159
160          case enc_euctw: /* EUC-TW */
161            {
162              if (m == 1)
163                {
164                  unsigned char c = (unsigned char) p[0];
165
166                  if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
167                    goto incomplete;
168                }
169              else /* m == 2 || m == 3 */
170                {
171                  unsigned char c = (unsigned char) p[0];
172
173                  if (c == 0x8e)
174                    goto incomplete;
175                }
176              goto invalid;
177            }
178
179          case enc_gb18030: /* GB18030 */
180            {
181              if (m == 1)
182                {
183                  unsigned char c = (unsigned char) p[0];
184
185                  if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
186                    goto incomplete;
187                }
188              else /* m == 2 || m == 3 */
189                {
190                  unsigned char c = (unsigned char) p[0];
191
192                  if (c >= 0x90 && c <= 0xe3)
193                    {
194                      unsigned char c2 = (unsigned char) p[1];
195
196                      if (c2 >= 0x30 && c2 <= 0x39)
197                        {
198                          if (m == 2)
199                            goto incomplete;
200                          else /* m == 3 */
201                            {
202                              unsigned char c3 = (unsigned char) p[2];
203
204                              if (c3 >= 0x81 && c3 <= 0xfe)
205                                goto incomplete;
206                            }
207                        }
208                    }
209                }
210              goto invalid;
211            }
212
213          case enc_sjis: /* SJIS */
214            {
215              if (m == 1)
216                {
217                  unsigned char c = (unsigned char) p[0];
218
219                  if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
220                      || (c >= 0xf0 && c <= 0xf9))
221                    goto incomplete;
222                }
223              goto invalid;
224            }
225
226          default:
227            /* An unknown multibyte encoding.  */
228            goto incomplete;
229          }
230      }
231
232   success:
233    /* res >= 0 is the corrected return value of
234       mbtowc_with_lock (&wc, p, m).  */
235    if (nstate >= (res > 0 ? res : 1))
236      abort ();
237    res -= nstate;
238    pstate[0] = 0;
239    return res;
240
241   incomplete:
242    {
243      size_t k = nstate;
244      /* Here 0 <= k < m < 4.  */
245      pstate[++k] = s[0];
246      if (k < m)
247        {
248          pstate[++k] = s[1];
249          if (k < m)
250            pstate[++k] = s[2];
251        }
252      if (k != m)
253        abort ();
254    }
255    pstate[0] = m;
256    return (size_t)(-2);
257
258   invalid:
259    errno = EILSEQ;
260    /* The conversion state is undefined, says POSIX.  */
261    return (size_t)(-1);
262  }
263