gb18030.c revision 128004
/*-
 * Copyright (c) 2002-2004 Tim J. Robbins
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * PRC National Standard GB 18030-2000 encoding of Chinese text.
 *
 * See gb18030(5) for details.
 */
31118146Sache
32128004Stjr#include <sys/param.h>
33118146Sache__FBSDID("$FreeBSD: head/lib/libc/locale/gb18030.c 128004 2004-04-07 10:48:19Z tjr $");
34118146Sache
35127834Stjr#include <errno.h>
36127834Stjr#include <runetype.h>
37118146Sache#include <stdlib.h>
38128004Stjr#include <string.h>
39127834Stjr#include <wchar.h>
40118146Sache
41127834Stjrextern size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict,
42127834Stjr    size_t, mbstate_t * __restrict);
43128004Stjrextern int (*__mbsinit)(const mbstate_t *);
44127834Stjrextern size_t (*__wcrtomb)(char * __restrict, wchar_t, mbstate_t * __restrict);
45118146Sache
46127834Stjrint	_GB18030_init(_RuneLocale *);
47128004Stjrsize_t	_GB18030_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t,
48127834Stjr	    mbstate_t * __restrict);
49128004Stjrint	_GB18030_mbsinit(const mbstate_t *);
50128004Stjrsize_t	_GB18030_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict);
51127834Stjr
52128004Stjrtypedef struct {
53128004Stjr	int	count;
54128004Stjr	u_char	bytes[4];
55128004Stjr} _GB18030State;
56128004Stjr
57118146Sacheint
58127834Stjr_GB18030_init(_RuneLocale *rl)
59118146Sache{
60127834Stjr
61127834Stjr	__mbrtowc = _GB18030_mbrtowc;
62127834Stjr	__wcrtomb = _GB18030_wcrtomb;
63128004Stjr	__mbsinit = _GB18030_mbsinit;
64118146Sache	_CurrentRuneLocale = rl;
65118146Sache	__mb_cur_max = 4;
66127834Stjr
67118146Sache	return (0);
68118146Sache}
69118146Sache
70128004Stjrint
71128004Stjr_GB18030_mbsinit(const mbstate_t *ps)
72128004Stjr{
73128004Stjr
74128004Stjr	return (ps == NULL || ((_GB18030State *)ps)->count == 0);
75128004Stjr}
76128004Stjr
77127834Stjrsize_t
78127834Stjr_GB18030_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
79128004Stjr    size_t n, mbstate_t * __restrict ps)
80118146Sache{
81128004Stjr	_GB18030State *gs;
82127834Stjr	wchar_t wch;
83128004Stjr	int ch, len, ocount;
84128004Stjr	size_t ncopy;
85118146Sache
86128004Stjr	gs = (_GB18030State *)ps;
87128004Stjr
88128004Stjr	if (s == NULL) {
89128004Stjr		s = "";
90128004Stjr		n = 1;
91128004Stjr		pwc = NULL;
92128004Stjr	}
93128004Stjr
94128004Stjr	ncopy = MIN(MIN(n, MB_CUR_MAX), sizeof(gs->bytes) - gs->count);
95128004Stjr	memcpy(gs->bytes + gs->count, s, ncopy);
96128004Stjr	ocount = gs->count;
97128004Stjr	gs->count += ncopy;
98128004Stjr	s = (char *)gs->bytes;
99128004Stjr	n = gs->count;
100128004Stjr
101127834Stjr	if (n == 0)
102127834Stjr		/* Incomplete multibyte sequence */
103127834Stjr		return ((size_t)-2);
104118146Sache
105127834Stjr	/*
106127834Stjr	 * Single byte:		[00-7f]
107127834Stjr	 * Two byte:		[81-fe][40-7e,80-fe]
108127834Stjr	 * Four byte:		[81-fe][30-39][81-fe][30-39]
109127834Stjr	 */
110127834Stjr	ch = (unsigned char)*s++;
111127834Stjr	if (ch <= 0x7f) {
112127834Stjr		len = 1;
113127834Stjr		wch = ch;
114127834Stjr	} else if (ch >= 0x81 && ch <= 0xfe) {
115127834Stjr		wch = ch;
116127834Stjr		if (n < 2)
117127834Stjr			return ((size_t)-2);
118127834Stjr		ch = (unsigned char)*s++;
119127834Stjr		if ((ch >= 0x40 && ch <= 0x7e) || (ch >= 0x80 && ch <= 0xfe)) {
120127834Stjr			wch = (wch << 8) | ch;
121127834Stjr			len = 2;
122127834Stjr		} else if (ch >= 0x30 && ch <= 0x39) {
123127834Stjr			/*
124127834Stjr			 * Strip high bit off the wide character we will
125127834Stjr			 * eventually output so that it is positive when
126127834Stjr			 * cast to wint_t on 32-bit twos-complement machines.
127127834Stjr			 */
128127834Stjr			wch = ((wch & 0x7f) << 8) | ch;
129127834Stjr			if (n < 3)
130127834Stjr				return ((size_t)-2);
131127834Stjr			ch = (unsigned char)*s++;
132127834Stjr			if (ch < 0x81 || ch > 0xfe)
133127834Stjr				goto ilseq;
134127834Stjr			wch = (wch << 8) | ch;
135127834Stjr			if (n < 4)
136127834Stjr				return ((size_t)-2);
137127834Stjr			ch = (unsigned char)*s++;
138127834Stjr			if (ch < 0x30 || ch > 0x39)
139127834Stjr				goto ilseq;
140127834Stjr			wch = (wch << 8) | ch;
141127834Stjr			len = 4;
142127834Stjr		} else
143127834Stjr			goto ilseq;
144127834Stjr	} else
145127834Stjr		goto ilseq;
146118146Sache
147127834Stjr	if (pwc != NULL)
148127834Stjr		*pwc = wch;
149128004Stjr	gs->count = 0;
150128004Stjr	return (wch == L'\0' ? 0 : len - ocount);
151127834Stjrilseq:
152127834Stjr	errno = EILSEQ;
153127834Stjr	return ((size_t)-1);
154118146Sache}
155118146Sache
/*
 * Convert the wide character wc to its GB 18030 byte sequence.
 *
 * Wide characters use the packed form produced by _GB18030_mbrtowc():
 *
 *	0x00 - 0x7f	one byte
 *	0xAABB		two bytes	[81-fe][40-7e,80-fe]
 *	0xAABBCCDD	four bytes	[81-fe][30-39][81-fe][30-39],
 *			stored with the top bit of the first byte cleared
 *
 * Returns the number of bytes stored in s (1, 2 or 4), or 1 when s is
 * NULL.  If wc does not correspond to a valid sequence, (size_t)-1 is
 * returned with errno set to EILSEQ and nothing is written to s.
 *
 * The conversion state ps is not used.
 */
size_t
_GB18030_wcrtomb(char * __restrict s, wchar_t wc,
    mbstate_t * __restrict ps __unused)
{
	int b1, b2, b3, b4;

	if (s == NULL)
		/* Reset to initial shift state (no-op) */
		return (1);
	if ((wc & ~0x7fffffff) != 0)
		goto ilseq;

	if (wc & 0x7f000000) {
		/*
		 * Four bytes.  Put back the high bit that mbrtowc() removed
		 * on the extracted byte itself, so wc is never made
		 * negative and no signed value is shifted.
		 */
		b1 = ((wc >> 24) & 0x7f) | 0x80;
		b2 = (wc >> 16) & 0xff;
		b3 = (wc >> 8) & 0xff;
		b4 = wc & 0xff;
		if (b1 < 0x81 || b1 > 0xfe || b2 < 0x30 || b2 > 0x39 ||
		    b3 < 0x81 || b3 > 0xfe || b4 < 0x30 || b4 > 0x39)
			goto ilseq;
		/* Store only once the whole character is known to be valid. */
		s[0] = (char)b1;
		s[1] = (char)b2;
		s[2] = (char)b3;
		s[3] = (char)b4;
		return (4);
	}

	/* There is no three-byte form. */
	if (wc & 0x00ff0000)
		goto ilseq;

	if (wc & 0x0000ff00) {
		b1 = (wc >> 8) & 0xff;
		b2 = wc & 0xff;
		if (b1 < 0x81 || b1 > 0xfe ||
		    b2 < 0x40 || b2 == 0x7f || b2 == 0xff)
			goto ilseq;
		s[0] = (char)b1;
		s[1] = (char)b2;
		return (2);
	}

	/* Remaining values are 0x00-0xff; only 0x00-0x7f are single bytes. */
	if (wc > 0x7f)
		goto ilseq;
	s[0] = (char)wc;
	return (1);

ilseq:
	errno = EILSEQ;
	return ((size_t)-1);
}
211