gb18030.c revision 127834
1127834Stjr/*-
2127834Stjr * Copyright (c) 2002-2004 Tim J. Robbins
3127834Stjr * All rights reserved.
4118146Sache *
5118146Sache * Redistribution and use in source and binary forms, with or without
6118146Sache * modification, are permitted provided that the following conditions
7118146Sache * are met:
8118146Sache * 1. Redistributions of source code must retain the above copyright
9118146Sache *    notice, this list of conditions and the following disclaimer.
10118146Sache * 2. Redistributions in binary form must reproduce the above copyright
11118146Sache *    notice, this list of conditions and the following disclaimer in the
12118146Sache *    documentation and/or other materials provided with the distribution.
13118146Sache *
14127834Stjr * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15118146Sache * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16118146Sache * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17127834Stjr * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18118146Sache * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19118146Sache * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20118146Sache * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21118146Sache * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22118146Sache * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23118146Sache * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24118146Sache * SUCH DAMAGE.
25118146Sache */
26127834Stjr/*
27127834Stjr * PRC National Standard GB 18030-2000 encoding of Chinese text.
28127834Stjr *
29127834Stjr * See gb18030(5) for details.
30127834Stjr */
31118146Sache
32118146Sache#include <sys/cdefs.h>
33118146Sache__FBSDID("$FreeBSD: head/lib/libc/locale/gb18030.c 127834 2004-04-04 11:00:42Z tjr $");
34118146Sache
35127834Stjr#include <errno.h>
36127834Stjr#include <runetype.h>
37118146Sache#include <stdlib.h>
38127834Stjr#include <wchar.h>
39118146Sache
40127834Stjrextern size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict,
41127834Stjr    size_t, mbstate_t * __restrict);
42127834Stjrextern size_t (*__wcrtomb)(char * __restrict, wchar_t, mbstate_t * __restrict);
43118146Sache
44127834Stjrint	_GB18030_init(_RuneLocale *);
45127834Stjrsize_t  _GB18030_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t,
46127834Stjr	    mbstate_t * __restrict);
47127834Stjrsize_t  _GB18030_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict);
48127834Stjr
49118146Sacheint
50127834Stjr_GB18030_init(_RuneLocale *rl)
51118146Sache{
52127834Stjr
53127834Stjr	__mbrtowc = _GB18030_mbrtowc;
54127834Stjr	__wcrtomb = _GB18030_wcrtomb;
55118146Sache	_CurrentRuneLocale = rl;
56118146Sache	__mb_cur_max = 4;
57127834Stjr
58118146Sache	return (0);
59118146Sache}
60118146Sache
/*
 * Decode one GB 18030 character from the first `n' bytes of `s'.
 *
 * Recognized forms:
 *	one byte:	[00-7f]
 *	two bytes:	[81-fe][40-7e,80-fe]
 *	four bytes:	[81-fe][30-39][81-fe][30-39]
 *
 * The wide character is simply the bytes of the sequence packed big-endian;
 * for the four byte form the top bit of the lead byte is cleared so that the
 * value stays positive in a 32-bit signed type.
 *
 * If `pwc' is not NULL the decoded character is stored through it.
 * The conversion state `ps' is not consulted.
 *
 * Returns:
 *	0		`s' is NULL, or the decoded character is the null
 *			wide character
 *	1, 2 or 4	number of bytes consumed
 *	(size_t)-2	the bytes examined so far form a valid prefix but
 *			`n' is too small to complete the character
 *	(size_t)-1	invalid sequence; errno is set to EILSEQ
 */
size_t
_GB18030_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
    size_t n, mbstate_t * __restrict ps)
{
	const unsigned char *src;
	unsigned int lead, second, third, fourth;
	wchar_t result;
	size_t consumed;

	(void)ps;

	if (s == NULL) {
		/* Reset to initial shift state (no-op) */
		return (0);
	}
	if (n == 0) {
		/* Nothing to look at yet. */
		return ((size_t)-2);
	}

	src = (const unsigned char *)s;
	lead = src[0];

	/* Plain ASCII is passed straight through. */
	if (lead <= 0x7f) {
		if (pwc != NULL)
			*pwc = (wchar_t)lead;
		return (lead == 0 ? 0 : 1);
	}

	/* 0x80 and 0xff can never start a character. */
	if (lead < 0x81 || lead > 0xfe)
		goto ilseq;
	if (n < 2)
		return ((size_t)-2);

	second = src[1];
	if (second >= 0x30 && second <= 0x39) {
		/*
		 * Four byte form.  Each byte is validated as soon as it is
		 * available, so a bad byte is reported as EILSEQ even when
		 * the sequence is also truncated after it.
		 */
		if (n < 3)
			return ((size_t)-2);
		third = src[2];
		if (third < 0x81 || third > 0xfe)
			goto ilseq;
		if (n < 4)
			return ((size_t)-2);
		fourth = src[3];
		if (fourth < 0x30 || fourth > 0x39)
			goto ilseq;

		/* Drop the lead byte's high bit to keep the value positive. */
		result = (wchar_t)(((lead & 0x7f) << 24) | (second << 16) |
		    (third << 8) | fourth);
		consumed = 4;
	} else if (second >= 0x40 && second != 0x7f && second != 0xff) {
		/* Two byte form. */
		result = (wchar_t)((lead << 8) | second);
		consumed = 2;
	} else
		goto ilseq;

	if (pwc != NULL)
		*pwc = result;
	return (consumed);

ilseq:
	errno = EILSEQ;
	return ((size_t)-1);
}
124118146Sache
/*
 * Encode the wide character `wc' as a GB 18030 byte sequence in `s'.
 *
 * The wide character is expected in the packed form produced by
 * _GB18030_mbrtowc(): the bytes of the sequence in big-endian order, with
 * the high bit of the lead byte cleared for four byte characters.
 *
 *	one byte:	[00-7f]
 *	two bytes:	[81-fe][40-7e,80-fe]
 *	four bytes:	[81-fe][30-39][81-fe][30-39]
 *
 * The conversion state `ps' is not consulted.
 *
 * Returns:
 *	1		`s' is NULL (nothing is stored)
 *	1, 2 or 4	number of bytes stored in `s'
 *	(size_t)-1	`wc' does not correspond to a valid sequence; errno
 *			is set to EILSEQ and `s' is left untouched
 */
size_t
_GB18030_wcrtomb(char * __restrict s, wchar_t wc,
    mbstate_t * __restrict ps)
{
	unsigned char bytes[4];
	unsigned long code;
	size_t i, len;

	(void)ps;

	if (s == NULL)
		/* Reset to initial shift state (no-op) */
		return (1);

	/* Anything outside the 31-bit positive range cannot be valid. */
	if ((wc & ~0x7fffffff) != 0)
		goto ilseq;

	/*
	 * wc is known to be non-negative here, so this conversion is exact.
	 * All further bit manipulation is done on the unsigned copy to avoid
	 * setting the sign bit of, and right-shifting, a signed value.
	 */
	code = (unsigned long)wc;

	/*
	 * The sequence is assembled and validated in a scratch buffer first
	 * so that nothing is written to the caller's buffer on failure.
	 */
	if (code & 0x7f000000UL) {
		/* Replace high bit that mbrtowc() removed. */
		bytes[0] = (unsigned char)(((code >> 24) & 0xff) | 0x80);
		bytes[1] = (unsigned char)((code >> 16) & 0xff);
		bytes[2] = (unsigned char)((code >> 8) & 0xff);
		bytes[3] = (unsigned char)(code & 0xff);
		if (bytes[0] < 0x81 || bytes[0] > 0xfe)
			goto ilseq;
		if (bytes[1] < 0x30 || bytes[1] > 0x39)
			goto ilseq;
		if (bytes[2] < 0x81 || bytes[2] > 0xfe)
			goto ilseq;
		if (bytes[3] < 0x30 || bytes[3] > 0x39)
			goto ilseq;
		len = 4;
	} else if (code & 0x00ff0000UL) {
		/* There is no three byte form. */
		goto ilseq;
	} else if (code & 0x0000ff00UL) {
		bytes[0] = (unsigned char)((code >> 8) & 0xff);
		bytes[1] = (unsigned char)(code & 0xff);
		if (bytes[0] < 0x81 || bytes[0] > 0xfe)
			goto ilseq;
		if (bytes[1] < 0x40 || bytes[1] == 0x7f || bytes[1] == 0xff)
			goto ilseq;
		len = 2;
	} else if (code <= 0x7f) {
		bytes[0] = (unsigned char)code;
		len = 1;
	} else
		goto ilseq;

	for (i = 0; i < len; i++)
		s[i] = (char)bytes[i];
	return (len);

ilseq:
	errno = EILSEQ;
	return ((size_t)-1);
}
181