1/* Convert multibyte character to wide character.
2   Copyright (C) 1999-2002, 2005-2009 Free Software Foundation, Inc.
3   Written by Bruno Haible <bruno@clisp.org>, 2008.
4
5   This program is free software: you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation; either version 3 of the License, or
8   (at your option) any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18#include <config.h>
19
20/* Specification.  */
21#include <wchar.h>
22
23#if GNULIB_defined_mbstate_t
24/* Implement mbrtowc() on top of mbtowc().  */
25
26# include <errno.h>
27# include <stdlib.h>
28
29# include "localcharset.h"
30# include "streq.h"
31# include "verify.h"
32
33
34verify (sizeof (mbstate_t) >= 4);
35
36static char internal_state[4];
37
38size_t
39mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
40{
41  char *pstate = (char *)ps;
42
43  if (pstate == NULL)
44    pstate = internal_state;
45
46  if (s == NULL)
47    {
48      pwc = NULL;
49      s = "";
50      n = 1;
51    }
52
53  if (n == 0)
54    return (size_t)(-2);
55
56  /* Here n > 0.  */
57  {
58    size_t nstate = pstate[0];
59    char buf[4];
60    const char *p;
61    size_t m;
62
63    switch (nstate)
64      {
65      case 0:
66	p = s;
67	m = n;
68	break;
69      case 3:
70	buf[2] = pstate[3];
71	/*FALLTHROUGH*/
72      case 2:
73	buf[1] = pstate[2];
74	/*FALLTHROUGH*/
75      case 1:
76	buf[0] = pstate[1];
77	p = buf;
78	m = nstate;
79	buf[m++] = s[0];
80	if (n >= 2 && m < 4)
81	  {
82	    buf[m++] = s[1];
83	    if (n >= 3 && m < 4)
84	      buf[m++] = s[2];
85	  }
86	break;
87      default:
88	errno = EINVAL;
89	return (size_t)(-1);
90      }
91
92    /* Here m > 0.  */
93
94# if __GLIBC__
95    /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
96    mbtowc (NULL, NULL, 0);
97# endif
98    {
99      int res = mbtowc (pwc, p, m);
100
101      if (res >= 0)
102	{
103	  if (pwc != NULL && ((*pwc == 0) != (res == 0)))
104	    abort ();
105	  if (nstate >= (res > 0 ? res : 1))
106	    abort ();
107	  res -= nstate;
108	  pstate[0] = 0;
109	  return res;
110	}
111
112      /* mbtowc does not distinguish between invalid and incomplete multibyte
113	 sequences.  But mbrtowc needs to make this distinction.
114	 There are two possible approaches:
115	   - Use iconv() and its return value.
116	   - Use built-in knowledge about the possible encodings.
117	 Given the low quality of implementation of iconv() on the systems that
118	 lack mbrtowc(), we use the second approach.
119	 The possible encodings are:
120	   - 8-bit encodings,
121	   - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
122	   - UTF-8.
123	 Use specialized code for each.  */
124      if (m >= 4 || m >= MB_CUR_MAX)
125	goto invalid;
126      /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
127      {
128	const char *encoding = locale_charset ();
129
130	if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
131	  {
132	    /* Cf. unistr/u8-mblen.c.  */
133	    unsigned char c = (unsigned char) p[0];
134
135	    if (c >= 0xc2)
136	      {
137		if (c < 0xe0)
138		  {
139		    if (m == 1)
140		      goto incomplete;
141		  }
142		else if (c < 0xf0)
143		  {
144		    if (m == 1)
145		      goto incomplete;
146		    if (m == 2)
147		      {
148			unsigned char c2 = (unsigned char) p[1];
149
150			if ((c2 ^ 0x80) < 0x40
151			    && (c >= 0xe1 || c2 >= 0xa0)
152			    && (c != 0xed || c2 < 0xa0))
153			  goto incomplete;
154		      }
155		  }
156		else if (c <= 0xf4)
157		  {
158		    if (m == 1)
159		      goto incomplete;
160		    else /* m == 2 || m == 3 */
161		      {
162			unsigned char c2 = (unsigned char) p[1];
163
164			if ((c2 ^ 0x80) < 0x40
165			    && (c >= 0xf1 || c2 >= 0x90)
166			    && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
167			  {
168			    if (m == 2)
169			      goto incomplete;
170			    else /* m == 3 */
171			      {
172				unsigned char c3 = (unsigned char) p[2];
173
174				if ((c3 ^ 0x80) < 0x40)
175				  goto incomplete;
176			      }
177			  }
178		      }
179		  }
180	      }
181	    goto invalid;
182	  }
183
184	/* As a reference for this code, you can use the GNU libiconv
185	   implementation.  Look for uses of the RET_TOOFEW macro.  */
186
187	if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
188	  {
189	    if (m == 1)
190	      {
191		unsigned char c = (unsigned char) p[0];
192
193		if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
194		  goto incomplete;
195	      }
196	    if (m == 2)
197	      {
198		unsigned char c = (unsigned char) p[0];
199
200		if (c == 0x8f)
201		  {
202		    unsigned char c2 = (unsigned char) p[1];
203
204		    if (c2 >= 0xa1 && c2 < 0xff)
205		      goto incomplete;
206		  }
207	      }
208	    goto invalid;
209	  }
210	if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
211	    || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
212	    || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
213	  {
214	    if (m == 1)
215	      {
216		unsigned char c = (unsigned char) p[0];
217
218		if (c >= 0xa1 && c < 0xff)
219		  goto incomplete;
220	      }
221	    goto invalid;
222	  }
223	if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
224	  {
225	    if (m == 1)
226	      {
227		unsigned char c = (unsigned char) p[0];
228
229		if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
230		  goto incomplete;
231	      }
232	    else /* m == 2 || m == 3 */
233	      {
234		unsigned char c = (unsigned char) p[0];
235
236		if (c == 0x8e)
237		  goto incomplete;
238	      }
239	    goto invalid;
240	  }
241	if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
242	  {
243	    if (m == 1)
244	      {
245		unsigned char c = (unsigned char) p[0];
246
247		if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
248		  goto incomplete;
249	      }
250	    else /* m == 2 || m == 3 */
251	      {
252		unsigned char c = (unsigned char) p[0];
253
254		if (c >= 0x90 && c <= 0xe3)
255		  {
256		    unsigned char c2 = (unsigned char) p[1];
257
258		    if (c2 >= 0x30 && c2 <= 0x39)
259		      {
260			if (m == 2)
261			  goto incomplete;
262			else /* m == 3 */
263			  {
264			    unsigned char c3 = (unsigned char) p[2];
265
266			    if (c3 >= 0x81 && c3 <= 0xfe)
267			      goto incomplete;
268			  }
269		      }
270		  }
271	      }
272	    goto invalid;
273	  }
274	if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
275	  {
276	    if (m == 1)
277	      {
278		unsigned char c = (unsigned char) p[0];
279
280		if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
281		    || (c >= 0xf0 && c <= 0xf9))
282		  goto incomplete;
283	      }
284	    goto invalid;
285	  }
286
287	/* An unknown multibyte encoding.  */
288	goto incomplete;
289      }
290
291     incomplete:
292      {
293	size_t k = nstate;
294	/* Here 0 <= k < m < 4.  */
295	pstate[++k] = s[0];
296	if (k < m)
297	  {
298	    pstate[++k] = s[1];
299	    if (k < m)
300	      pstate[++k] = s[2];
301	  }
302	if (k != m)
303	  abort ();
304      }
305      pstate[0] = m;
306      return (size_t)(-2);
307
308     invalid:
309      errno = EILSEQ;
310      /* The conversion state is undefined, says POSIX.  */
311      return (size_t)(-1);
312    }
313  }
314}
315
316#else
317/* Override the system's mbrtowc() function.  */
318
319# undef mbrtowc
320
/* Replacement for a system mbrtowc() that has known bugs.  Which
   workarounds are compiled in depends on the MBRTOWC_NULL_ARG_BUG,
   MBRTOWC_RETVAL_BUG and MBRTOWC_NUL_RETVAL_BUG configuration macros; with
   none of them set, this simply forwards to the system function.
   Arguments and return value are those of mbrtowc().  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
# if MBRTOWC_NULL_ARG_BUG || MBRTOWC_RETVAL_BUG
  /* Spell out the meaning of a NULL S ourselves: it is equivalent to
     mbrtowc (NULL, "", 1, ps).  */
  if (s == NULL)
    {
      s = "";
      n = 1;
      pwc = NULL;
    }
# endif

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* A character is pending in *PS.  Feed the remaining bytes to the
           system function one at a time and count them ourselves.  */
        size_t used = 0;

        while (used < n)
          {
            wchar_t wc;
            size_t status = mbrtowc (&wc, s + used, 1, ps);

            if (status == (size_t)(-1))
              return (size_t)(-1);
            used++;
            if (status != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                if (pwc != NULL)
                  *pwc = wc;
                return (wc == 0 ? 0 : used);
              }
          }
        /* All N bytes consumed, character still incomplete.  */
        return (size_t)(-2);
      }
  }
# endif

# if MBRTOWC_NUL_RETVAL_BUG
  {
    /* Always obtain the wide character, so that a converted NUL can be
       recognized and reported as 0 even when PWC is NULL.  */
    wchar_t wc;
    size_t ret = mbrtowc (&wc, s, n, ps);

    if (ret == (size_t)(-1) || ret == (size_t)(-2))
      return ret;
    if (pwc != NULL)
      *pwc = wc;
    return (wc == 0 ? 0 : ret);
  }
# else
  return mbrtowc (pwc, s, n, ps);
# endif
}
385
386#endif
387