1263409Smarcel/* Convert multibyte character to wide character. 2263409Smarcel Copyright (C) 1999-2002, 2005-2010 Free Software Foundation, Inc. 3263409Smarcel Written by Bruno Haible <bruno@clisp.org>, 2008. 4263409Smarcel 5263409Smarcel This program is free software: you can redistribute it and/or modify 6263409Smarcel it under the terms of the GNU General Public License as published by 7263409Smarcel the Free Software Foundation; either version 3 of the License, or 8263409Smarcel (at your option) any later version. 9263409Smarcel 10263409Smarcel This program is distributed in the hope that it will be useful, 11263409Smarcel but WITHOUT ANY WARRANTY; without even the implied warranty of 12263409Smarcel MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13263409Smarcel GNU General Public License for more details. 14263409Smarcel 15263409Smarcel You should have received a copy of the GNU General Public License 16263409Smarcel along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17263409Smarcel 18263409Smarcel#include <config.h> 19263409Smarcel 20263409Smarcel/* Specification. */ 21263409Smarcel#include <wchar.h> 22263409Smarcel 23263409Smarcel#if GNULIB_defined_mbstate_t 24263409Smarcel/* Implement mbrtowc() on top of mbtowc(). */ 25263409Smarcel 26263409Smarcel# include <errno.h> 27263409Smarcel# include <stdlib.h> 28263409Smarcel 29263409Smarcel# include "localcharset.h" 30263409Smarcel# include "streq.h" 31263442Smarcel# include "verify.h" 32263674Smarcel 33263442Smarcel 34263409Smarcelverify (sizeof (mbstate_t) >= 4); 35263674Smarcel 36263674Smarcelstatic char internal_state[4]; 37263409Smarcel 38268161Smarcelsize_t 39263409Smarcelmbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) 40263409Smarcel{ 41263409Smarcel char *pstate = (char *)ps; 42268161Smarcel 43268161Smarcel if (pstate == NULL) 44268161Smarcel pstate = internal_state; 45268161Smarcel 46263409Smarcel if (s == NULL) 47263674Smarcel { 48263674Smarcel pwc = NULL; 49263487Smarcel s = ""; 50263409Smarcel n = 1; 51263409Smarcel } 52272030Smarcel 53272030Smarcel if (n == 0) 54263409Smarcel return (size_t)(-2); 55263409Smarcel 56272030Smarcel /* Here n > 0. */ 57272030Smarcel { 58263409Smarcel size_t nstate = pstate[0]; 59263409Smarcel char buf[4]; 60263674Smarcel const char *p; 61272776Smarcel size_t m; 62263674Smarcel 63272776Smarcel switch (nstate) 64263674Smarcel { 65272776Smarcel case 0: 66272776Smarcel p = s; 67272776Smarcel m = n; 68272776Smarcel break; 69263674Smarcel case 3: 70263674Smarcel buf[2] = pstate[3]; 71263442Smarcel /*FALLTHROUGH*/ 72268161Smarcel case 2: 73263442Smarcel buf[1] = pstate[2]; 74263674Smarcel /*FALLTHROUGH*/ 75263674Smarcel case 1: 76263674Smarcel buf[0] = pstate[1]; 77272776Smarcel p = buf; 78263674Smarcel m = nstate; 79263674Smarcel buf[m++] = s[0]; 80263674Smarcel if (n >= 2 && m < 4) 81263674Smarcel { 82263674Smarcel buf[m++] = s[1]; 83263674Smarcel if (n >= 3 && m < 4) 84263674Smarcel buf[m++] = s[2]; 85263674Smarcel } 86263674Smarcel break; 87263674Smarcel default: 88263843Smarcel errno = EINVAL; 89272776Smarcel return (size_t)(-1); 90263674Smarcel } 91263843Smarcel 92263674Smarcel /* Here m > 0. */ 93263674Smarcel 94272776Smarcel# if __GLIBC__ 95263843Smarcel /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */ 96272776Smarcel mbtowc (NULL, NULL, 0); 97263674Smarcel# endif 98263674Smarcel { 99263674Smarcel int res = mbtowc (pwc, p, m); 100272776Smarcel 101263674Smarcel if (res >= 0) 102263674Smarcel { 103263843Smarcel if (pwc != NULL && ((*pwc == 0) != (res == 0))) 104263674Smarcel abort (); 105263674Smarcel if (nstate >= (res > 0 ? res : 1)) 106272776Smarcel abort (); 107263843Smarcel res -= nstate; 108272776Smarcel pstate[0] = 0; 109263674Smarcel return res; 110263674Smarcel } 111268161Smarcel 112263674Smarcel /* mbtowc does not distinguish between invalid and incomplete multibyte 113263674Smarcel sequences. But mbrtowc needs to make this distinction. 114263674Smarcel There are two possible approaches: 115263674Smarcel - Use iconv() and its return value. 116263674Smarcel - Use built-in knowledge about the possible encodings. 117263674Smarcel Given the low quality of implementation of iconv() on the systems that 118263674Smarcel lack mbrtowc(), we use the second approach. 119263674Smarcel The possible encodings are: 120263442Smarcel - 8-bit encodings, 121263442Smarcel - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS, 122263409Smarcel - UTF-8. 123263409Smarcel Use specialized code for each. */ 124263409Smarcel if (m >= 4 || m >= MB_CUR_MAX) 125263409Smarcel goto invalid; 126263440Smarcel /* Here MB_CUR_MAX > 1 and 0 < m < 4. */ 127263442Smarcel { 128263700Smarcel const char *encoding = locale_charset (); 129263700Smarcel 130263409Smarcel if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)) 131263409Smarcel { 132263409Smarcel /* Cf. unistr/u8-mblen.c. */ 133 unsigned char c = (unsigned char) p[0]; 134 135 if (c >= 0xc2) 136 { 137 if (c < 0xe0) 138 { 139 if (m == 1) 140 goto incomplete; 141 } 142 else if (c < 0xf0) 143 { 144 if (m == 1) 145 goto incomplete; 146 if (m == 2) 147 { 148 unsigned char c2 = (unsigned char) p[1]; 149 150 if ((c2 ^ 0x80) < 0x40 151 && (c >= 0xe1 || c2 >= 0xa0) 152 && (c != 0xed || c2 < 0xa0)) 153 goto incomplete; 154 } 155 } 156 else if (c <= 0xf4) 157 { 158 if (m == 1) 159 goto incomplete; 160 else /* m == 2 || m == 3 */ 161 { 162 unsigned char c2 = (unsigned char) p[1]; 163 164 if ((c2 ^ 0x80) < 0x40 165 && (c >= 0xf1 || c2 >= 0x90) 166 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90))) 167 { 168 if (m == 2) 169 goto incomplete; 170 else /* m == 3 */ 171 { 172 unsigned char c3 = (unsigned char) p[2]; 173 174 if ((c3 ^ 0x80) < 0x40) 175 goto incomplete; 176 } 177 } 178 } 179 } 180 } 181 goto invalid; 182 } 183 184 /* As a reference for this code, you can use the GNU libiconv 185 implementation. Look for uses of the RET_TOOFEW macro. */ 186 187 if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)) 188 { 189 if (m == 1) 190 { 191 unsigned char c = (unsigned char) p[0]; 192 193 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f) 194 goto incomplete; 195 } 196 if (m == 2) 197 { 198 unsigned char c = (unsigned char) p[0]; 199 200 if (c == 0x8f) 201 { 202 unsigned char c2 = (unsigned char) p[1]; 203 204 if (c2 >= 0xa1 && c2 < 0xff) 205 goto incomplete; 206 } 207 } 208 goto invalid; 209 } 210 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0) 211 || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) 212 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)) 213 { 214 if (m == 1) 215 { 216 unsigned char c = (unsigned char) p[0]; 217 218 if (c >= 0xa1 && c < 0xff) 219 goto incomplete; 220 } 221 goto invalid; 222 } 223 if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)) 224 { 225 if (m == 1) 226 { 227 unsigned char c = (unsigned char) p[0]; 228 229 if ((c >= 0xa1 && c < 0xff) || c == 0x8e) 230 goto incomplete; 231 } 232 else /* m == 2 || m == 3 */ 233 { 234 unsigned char c = (unsigned char) p[0]; 235 236 if (c == 0x8e) 237 goto incomplete; 238 } 239 goto invalid; 240 } 241 if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) 242 { 243 if (m == 1) 244 { 245 unsigned char c = (unsigned char) p[0]; 246 247 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe)) 248 goto incomplete; 249 } 250 else /* m == 2 || m == 3 */ 251 { 252 unsigned char c = (unsigned char) p[0]; 253 254 if (c >= 0x90 && c <= 0xe3) 255 { 256 unsigned char c2 = (unsigned char) p[1]; 257 258 if (c2 >= 0x30 && c2 <= 0x39) 259 { 260 if (m == 2) 261 goto incomplete; 262 else /* m == 3 */ 263 { 264 unsigned char c3 = (unsigned char) p[2]; 265 266 if (c3 >= 0x81 && c3 <= 0xfe) 267 goto incomplete; 268 } 269 } 270 } 271 } 272 goto invalid; 273 } 274 if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0)) 275 { 276 if (m == 1) 277 { 278 unsigned char c = (unsigned char) p[0]; 279 280 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea) 281 || (c >= 0xf0 && c <= 0xf9)) 282 goto incomplete; 283 } 284 goto invalid; 285 } 286 287 /* An unknown multibyte encoding. */ 288 goto incomplete; 289 } 290 291 incomplete: 292 { 293 size_t k = nstate; 294 /* Here 0 <= k < m < 4. */ 295 pstate[++k] = s[0]; 296 if (k < m) 297 { 298 pstate[++k] = s[1]; 299 if (k < m) 300 pstate[++k] = s[2]; 301 } 302 if (k != m) 303 abort (); 304 } 305 pstate[0] = m; 306 return (size_t)(-2); 307 308 invalid: 309 errno = EILSEQ; 310 /* The conversion state is undefined, says POSIX. */ 311 return (size_t)(-1); 312 } 313 } 314} 315 316#else 317/* Override the system's mbrtowc() function. */ 318 319# undef mbrtowc 320 321size_t 322rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) 323{ 324# if MBRTOWC_NULL_ARG_BUG || MBRTOWC_RETVAL_BUG 325 if (s == NULL) 326 { 327 pwc = NULL; 328 s = ""; 329 n = 1; 330 } 331# endif 332 333# if MBRTOWC_RETVAL_BUG 334 { 335 static mbstate_t internal_state; 336 337 /* Override mbrtowc's internal state. We can not call mbsinit() on the 338 hidden internal state, but we can call it on our variable. */ 339 if (ps == NULL) 340 ps = &internal_state; 341 342 if (!mbsinit (ps)) 343 { 344 /* Parse the rest of the multibyte character byte for byte. */ 345 size_t count = 0; 346 for (; n > 0; s++, n--) 347 { 348 wchar_t wc; 349 size_t ret = mbrtowc (&wc, s, 1, ps); 350 351 if (ret == (size_t)(-1)) 352 return (size_t)(-1); 353 count++; 354 if (ret != (size_t)(-2)) 355 { 356 /* The multibyte character has been completed. */ 357 if (pwc != NULL) 358 *pwc = wc; 359 return (wc == 0 ? 0 : count); 360 } 361 } 362 return (size_t)(-2); 363 } 364 } 365# endif 366 367# if MBRTOWC_NUL_RETVAL_BUG 368 { 369 wchar_t wc; 370 size_t ret = mbrtowc (&wc, s, n, ps); 371 372 if (ret != (size_t)(-1) && ret != (size_t)(-2)) 373 { 374 if (pwc != NULL) 375 *pwc = wc; 376 if (wc == 0) 377 ret = 0; 378 } 379 return ret; 380 } 381# else 382 return mbrtowc (pwc, s, n, ps); 383# endif 384} 385 386#endif 387