1/* Convert multibyte character to wide character. 2 Copyright (C) 1999-2002, 2005-2014 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2008. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18#include <config.h> 19 20/* Specification. */ 21#include <wchar.h> 22 23#if GNULIB_defined_mbstate_t 24/* Implement mbrtowc() on top of mbtowc(). */ 25 26# include <errno.h> 27# include <stdlib.h> 28 29# include "localcharset.h" 30# include "streq.h" 31# include "verify.h" 32 33 34verify (sizeof (mbstate_t) >= 4); 35 36static char internal_state[4]; 37 38size_t 39mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) 40{ 41 char *pstate = (char *)ps; 42 43 if (s == NULL) 44 { 45 pwc = NULL; 46 s = ""; 47 n = 1; 48 } 49 50 if (n == 0) 51 return (size_t)(-2); 52 53 /* Here n > 0. */ 54 55 if (pstate == NULL) 56 pstate = internal_state; 57 58 { 59 size_t nstate = pstate[0]; 60 char buf[4]; 61 const char *p; 62 size_t m; 63 64 switch (nstate) 65 { 66 case 0: 67 p = s; 68 m = n; 69 break; 70 case 3: 71 buf[2] = pstate[3]; 72 /*FALLTHROUGH*/ 73 case 2: 74 buf[1] = pstate[2]; 75 /*FALLTHROUGH*/ 76 case 1: 77 buf[0] = pstate[1]; 78 p = buf; 79 m = nstate; 80 buf[m++] = s[0]; 81 if (n >= 2 && m < 4) 82 { 83 buf[m++] = s[1]; 84 if (n >= 3 && m < 4) 85 buf[m++] = s[2]; 86 } 87 break; 88 default: 89 errno = EINVAL; 90 return (size_t)(-1); 91 } 92 93 /* Here m > 0. */ 94 95# if __GLIBC__ || defined __UCLIBC__ 96 /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */ 97 mbtowc (NULL, NULL, 0); 98# endif 99 { 100 int res = mbtowc (pwc, p, m); 101 102 if (res >= 0) 103 { 104 if (pwc != NULL && ((*pwc == 0) != (res == 0))) 105 abort (); 106 if (nstate >= (res > 0 ? res : 1)) 107 abort (); 108 res -= nstate; 109 pstate[0] = 0; 110 return res; 111 } 112 113 /* mbtowc does not distinguish between invalid and incomplete multibyte 114 sequences. But mbrtowc needs to make this distinction. 115 There are two possible approaches: 116 - Use iconv() and its return value. 117 - Use built-in knowledge about the possible encodings. 118 Given the low quality of implementation of iconv() on the systems that 119 lack mbrtowc(), we use the second approach. 120 The possible encodings are: 121 - 8-bit encodings, 122 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS, 123 - UTF-8. 124 Use specialized code for each. */ 125 if (m >= 4 || m >= MB_CUR_MAX) 126 goto invalid; 127 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */ 128 { 129 const char *encoding = locale_charset (); 130 131 if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)) 132 { 133 /* Cf. unistr/u8-mblen.c. */ 134 unsigned char c = (unsigned char) p[0]; 135 136 if (c >= 0xc2) 137 { 138 if (c < 0xe0) 139 { 140 if (m == 1) 141 goto incomplete; 142 } 143 else if (c < 0xf0) 144 { 145 if (m == 1) 146 goto incomplete; 147 if (m == 2) 148 { 149 unsigned char c2 = (unsigned char) p[1]; 150 151 if ((c2 ^ 0x80) < 0x40 152 && (c >= 0xe1 || c2 >= 0xa0) 153 && (c != 0xed || c2 < 0xa0)) 154 goto incomplete; 155 } 156 } 157 else if (c <= 0xf4) 158 { 159 if (m == 1) 160 goto incomplete; 161 else /* m == 2 || m == 3 */ 162 { 163 unsigned char c2 = (unsigned char) p[1]; 164 165 if ((c2 ^ 0x80) < 0x40 166 && (c >= 0xf1 || c2 >= 0x90) 167 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90))) 168 { 169 if (m == 2) 170 goto incomplete; 171 else /* m == 3 */ 172 { 173 unsigned char c3 = (unsigned char) p[2]; 174 175 if ((c3 ^ 0x80) < 0x40) 176 goto incomplete; 177 } 178 } 179 } 180 } 181 } 182 goto invalid; 183 } 184 185 /* As a reference for this code, you can use the GNU libiconv 186 implementation. Look for uses of the RET_TOOFEW macro. */ 187 188 if (STREQ_OPT (encoding, 189 "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)) 190 { 191 if (m == 1) 192 { 193 unsigned char c = (unsigned char) p[0]; 194 195 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f) 196 goto incomplete; 197 } 198 if (m == 2) 199 { 200 unsigned char c = (unsigned char) p[0]; 201 202 if (c == 0x8f) 203 { 204 unsigned char c2 = (unsigned char) p[1]; 205 206 if (c2 >= 0xa1 && c2 < 0xff) 207 goto incomplete; 208 } 209 } 210 goto invalid; 211 } 212 if (STREQ_OPT (encoding, 213 "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0) 214 || STREQ_OPT (encoding, 215 "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) 216 || STREQ_OPT (encoding, 217 "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)) 218 { 219 if (m == 1) 220 { 221 unsigned char c = (unsigned char) p[0]; 222 223 if (c >= 0xa1 && c < 0xff) 224 goto incomplete; 225 } 226 goto invalid; 227 } 228 if (STREQ_OPT (encoding, 229 "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)) 230 { 231 if (m == 1) 232 { 233 unsigned char c = (unsigned char) p[0]; 234 235 if ((c >= 0xa1 && c < 0xff) || c == 0x8e) 236 goto incomplete; 237 } 238 else /* m == 2 || m == 3 */ 239 { 240 unsigned char c = (unsigned char) p[0]; 241 242 if (c == 0x8e) 243 goto incomplete; 244 } 245 goto invalid; 246 } 247 if (STREQ_OPT (encoding, 248 "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) 249 { 250 if (m == 1) 251 { 252 unsigned char c = (unsigned char) p[0]; 253 254 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe)) 255 goto incomplete; 256 } 257 else /* m == 2 || m == 3 */ 258 { 259 unsigned char c = (unsigned char) p[0]; 260 261 if (c >= 0x90 && c <= 0xe3) 262 { 263 unsigned char c2 = (unsigned char) p[1]; 264 265 if (c2 >= 0x30 && c2 <= 0x39) 266 { 267 if (m == 2) 268 goto incomplete; 269 else /* m == 3 */ 270 { 271 unsigned char c3 = (unsigned char) p[2]; 272 273 if (c3 >= 0x81 && c3 <= 0xfe) 274 goto incomplete; 275 } 276 } 277 } 278 } 279 goto invalid; 280 } 281 if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0)) 282 { 283 if (m == 1) 284 { 285 unsigned char c = (unsigned char) p[0]; 286 287 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea) 288 || (c >= 0xf0 && c <= 0xf9)) 289 goto incomplete; 290 } 291 goto invalid; 292 } 293 294 /* An unknown multibyte encoding. */ 295 goto incomplete; 296 } 297 298 incomplete: 299 { 300 size_t k = nstate; 301 /* Here 0 <= k < m < 4. */ 302 pstate[++k] = s[0]; 303 if (k < m) 304 { 305 pstate[++k] = s[1]; 306 if (k < m) 307 pstate[++k] = s[2]; 308 } 309 if (k != m) 310 abort (); 311 } 312 pstate[0] = m; 313 return (size_t)(-2); 314 315 invalid: 316 errno = EILSEQ; 317 /* The conversion state is undefined, says POSIX. */ 318 return (size_t)(-1); 319 } 320 } 321} 322 323#else 324/* Override the system's mbrtowc() function. */ 325 326# undef mbrtowc 327 328size_t 329rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) 330{ 331# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG 332 if (s == NULL) 333 { 334 pwc = NULL; 335 s = ""; 336 n = 1; 337 } 338# endif 339 340# if MBRTOWC_EMPTY_INPUT_BUG 341 if (n == 0) 342 return (size_t) -2; 343# endif 344 345# if MBRTOWC_RETVAL_BUG 346 { 347 static mbstate_t internal_state; 348 349 /* Override mbrtowc's internal state. We cannot call mbsinit() on the 350 hidden internal state, but we can call it on our variable. */ 351 if (ps == NULL) 352 ps = &internal_state; 353 354 if (!mbsinit (ps)) 355 { 356 /* Parse the rest of the multibyte character byte for byte. */ 357 size_t count = 0; 358 for (; n > 0; s++, n--) 359 { 360 wchar_t wc; 361 size_t ret = mbrtowc (&wc, s, 1, ps); 362 363 if (ret == (size_t)(-1)) 364 return (size_t)(-1); 365 count++; 366 if (ret != (size_t)(-2)) 367 { 368 /* The multibyte character has been completed. */ 369 if (pwc != NULL) 370 *pwc = wc; 371 return (wc == 0 ? 0 : count); 372 } 373 } 374 return (size_t)(-2); 375 } 376 } 377# endif 378 379# if MBRTOWC_NUL_RETVAL_BUG 380 { 381 wchar_t wc; 382 size_t ret = mbrtowc (&wc, s, n, ps); 383 384 if (ret != (size_t)(-1) && ret != (size_t)(-2)) 385 { 386 if (pwc != NULL) 387 *pwc = wc; 388 if (wc == 0) 389 ret = 0; 390 } 391 return ret; 392 } 393# else 394 { 395# if MBRTOWC_NULL_ARG1_BUG 396 wchar_t dummy; 397 398 if (pwc == NULL) 399 pwc = &dummy; 400# endif 401 402 return mbrtowc (pwc, s, n, ps); 403 } 404# endif 405} 406 407#endif 408