gb18030.c revision 128004
1127834Stjr/*- 2127834Stjr * Copyright (c) 2002-2004 Tim J. Robbins 3127834Stjr * All rights reserved. 4118146Sache * 5118146Sache * Redistribution and use in source and binary forms, with or without 6118146Sache * modification, are permitted provided that the following conditions 7118146Sache * are met: 8118146Sache * 1. Redistributions of source code must retain the above copyright 9118146Sache * notice, this list of conditions and the following disclaimer. 10118146Sache * 2. Redistributions in binary form must reproduce the above copyright 11118146Sache * notice, this list of conditions and the following disclaimer in the 12118146Sache * documentation and/or other materials provided with the distribution. 13118146Sache * 14127834Stjr * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15118146Sache * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16118146Sache * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17127834Stjr * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18118146Sache * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19118146Sache * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20118146Sache * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21118146Sache * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22118146Sache * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23118146Sache * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24118146Sache * SUCH DAMAGE. 25118146Sache */ 26127834Stjr/* 27127834Stjr * PRC National Standard GB 18030-2000 encoding of Chinese text. 28127834Stjr * 29127834Stjr * See gb18030(5) for details. 30127834Stjr */ 31118146Sache 32128004Stjr#include <sys/param.h> 33118146Sache__FBSDID("$FreeBSD: head/lib/libc/locale/gb18030.c 128004 2004-04-07 10:48:19Z tjr $"); 34118146Sache 35127834Stjr#include <errno.h> 36127834Stjr#include <runetype.h> 37118146Sache#include <stdlib.h> 38128004Stjr#include <string.h> 39127834Stjr#include <wchar.h> 40118146Sache 41127834Stjrextern size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict, 42127834Stjr size_t, mbstate_t * __restrict); 43128004Stjrextern int (*__mbsinit)(const mbstate_t *); 44127834Stjrextern size_t (*__wcrtomb)(char * __restrict, wchar_t, mbstate_t * __restrict); 45118146Sache 46127834Stjrint _GB18030_init(_RuneLocale *); 47128004Stjrsize_t _GB18030_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, 48127834Stjr mbstate_t * __restrict); 49128004Stjrint _GB18030_mbsinit(const mbstate_t *); 50128004Stjrsize_t _GB18030_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict); 51127834Stjr 52128004Stjrtypedef struct { 53128004Stjr int count; 54128004Stjr u_char bytes[4]; 55128004Stjr} _GB18030State; 56128004Stjr 57118146Sacheint 58127834Stjr_GB18030_init(_RuneLocale *rl) 59118146Sache{ 60127834Stjr 61127834Stjr __mbrtowc = _GB18030_mbrtowc; 62127834Stjr __wcrtomb = _GB18030_wcrtomb; 63128004Stjr __mbsinit = _GB18030_mbsinit; 64118146Sache _CurrentRuneLocale = rl; 65118146Sache __mb_cur_max = 4; 66127834Stjr 67118146Sache return (0); 68118146Sache} 69118146Sache 70128004Stjrint 71128004Stjr_GB18030_mbsinit(const mbstate_t *ps) 72128004Stjr{ 73128004Stjr 74128004Stjr return (ps == NULL || ((_GB18030State *)ps)->count == 0); 75128004Stjr} 76128004Stjr 77127834Stjrsize_t 78127834Stjr_GB18030_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, 79128004Stjr size_t n, mbstate_t * __restrict ps) 80118146Sache{ 81128004Stjr _GB18030State *gs; 82127834Stjr wchar_t wch; 83128004Stjr int ch, len, ocount; 84128004Stjr size_t ncopy; 85118146Sache 86128004Stjr gs = (_GB18030State *)ps; 87128004Stjr 88128004Stjr if (s == NULL) { 89128004Stjr s = ""; 90128004Stjr n = 1; 91128004Stjr pwc = NULL; 92128004Stjr } 93128004Stjr 94128004Stjr ncopy = MIN(MIN(n, MB_CUR_MAX), sizeof(gs->bytes) - gs->count); 95128004Stjr memcpy(gs->bytes + gs->count, s, ncopy); 96128004Stjr ocount = gs->count; 97128004Stjr gs->count += ncopy; 98128004Stjr s = (char *)gs->bytes; 99128004Stjr n = gs->count; 100128004Stjr 101127834Stjr if (n == 0) 102127834Stjr /* Incomplete multibyte sequence */ 103127834Stjr return ((size_t)-2); 104118146Sache 105127834Stjr /* 106127834Stjr * Single byte: [00-7f] 107127834Stjr * Two byte: [81-fe][40-7e,80-fe] 108127834Stjr * Four byte: [81-fe][30-39][81-fe][30-39] 109127834Stjr */ 110127834Stjr ch = (unsigned char)*s++; 111127834Stjr if (ch <= 0x7f) { 112127834Stjr len = 1; 113127834Stjr wch = ch; 114127834Stjr } else if (ch >= 0x81 && ch <= 0xfe) { 115127834Stjr wch = ch; 116127834Stjr if (n < 2) 117127834Stjr return ((size_t)-2); 118127834Stjr ch = (unsigned char)*s++; 119127834Stjr if ((ch >= 0x40 && ch <= 0x7e) || (ch >= 0x80 && ch <= 0xfe)) { 120127834Stjr wch = (wch << 8) | ch; 121127834Stjr len = 2; 122127834Stjr } else if (ch >= 0x30 && ch <= 0x39) { 123127834Stjr /* 124127834Stjr * Strip high bit off the wide character we will 125127834Stjr * eventually output so that it is positive when 126127834Stjr * cast to wint_t on 32-bit twos-complement machines. 127127834Stjr */ 128127834Stjr wch = ((wch & 0x7f) << 8) | ch; 129127834Stjr if (n < 3) 130127834Stjr return ((size_t)-2); 131127834Stjr ch = (unsigned char)*s++; 132127834Stjr if (ch < 0x81 || ch > 0xfe) 133127834Stjr goto ilseq; 134127834Stjr wch = (wch << 8) | ch; 135127834Stjr if (n < 4) 136127834Stjr return ((size_t)-2); 137127834Stjr ch = (unsigned char)*s++; 138127834Stjr if (ch < 0x30 || ch > 0x39) 139127834Stjr goto ilseq; 140127834Stjr wch = (wch << 8) | ch; 141127834Stjr len = 4; 142127834Stjr } else 143127834Stjr goto ilseq; 144127834Stjr } else 145127834Stjr goto ilseq; 146118146Sache 147127834Stjr if (pwc != NULL) 148127834Stjr *pwc = wch; 149128004Stjr gs->count = 0; 150128004Stjr return (wch == L'\0' ? 0 : len - ocount); 151127834Stjrilseq: 152127834Stjr errno = EILSEQ; 153127834Stjr return ((size_t)-1); 154118146Sache} 155118146Sache 156127834Stjrsize_t 157127834Stjr_GB18030_wcrtomb(char * __restrict s, wchar_t wc, 158127834Stjr mbstate_t * __restrict ps __unused) 159118146Sache{ 160127834Stjr size_t len; 161127834Stjr int c; 162118146Sache 163127834Stjr if (s == NULL) 164127834Stjr /* Reset to initial shift state (no-op) */ 165127834Stjr return (1); 166127834Stjr if ((wc & ~0x7fffffff) != 0) 167127834Stjr goto ilseq; 168127834Stjr if (wc & 0x7f000000) { 169127834Stjr /* Replace high bit that mbrtowc() removed. */ 170127834Stjr wc |= 0x80000000; 171127834Stjr c = (wc >> 24) & 0xff; 172127834Stjr if (c < 0x81 || c > 0xfe) 173127834Stjr goto ilseq; 174127834Stjr *s++ = c; 175127834Stjr c = (wc >> 16) & 0xff; 176127834Stjr if (c < 0x30 || c > 0x39) 177127834Stjr goto ilseq; 178127834Stjr *s++ = c; 179127834Stjr c = (wc >> 8) & 0xff; 180127834Stjr if (c < 0x81 || c > 0xfe) 181127834Stjr goto ilseq; 182127834Stjr *s++ = c; 183127834Stjr c = wc & 0xff; 184127834Stjr if (c < 0x30 || c > 0x39) 185127834Stjr goto ilseq; 186127834Stjr *s++ = c; 187127834Stjr len = 4; 188127834Stjr } else if (wc & 0x00ff0000) 189127834Stjr goto ilseq; 190127834Stjr else if (wc & 0x0000ff00) { 191127834Stjr c = (wc >> 8) & 0xff; 192127834Stjr if (c < 0x81 || c > 0xfe) 193127834Stjr goto ilseq; 194127834Stjr *s++ = c; 195127834Stjr c = wc & 0xff; 196127834Stjr if (c < 0x40 || c == 0x7f || c == 0xff) 197127834Stjr goto ilseq; 198127834Stjr *s++ = c; 199127834Stjr len = 2; 200127834Stjr } else if (wc <= 0x7f) { 201127834Stjr *s++ = wc; 202127834Stjr len = 1; 203127834Stjr } else 204127834Stjr goto ilseq; 205127834Stjr 206127834Stjr return (len); 207127834Stjrilseq: 208127834Stjr errno = EILSEQ; 209127834Stjr return ((size_t)-1); 210118146Sache} 211