/* $NetBSD: citrus_gbk2k.c,v 1.9 2022/04/19 20:32:14 rillig Exp $ */

/*-
 * Copyright (c)2003 Citrus Project,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

291541Srgrimes#include <sys/cdefs.h>
301541Srgrimes#if defined(LIBC_SCCS) && !defined(lint)
311541Srgrimes__RCSID("$NetBSD: citrus_gbk2k.c,v 1.9 2022/04/19 20:32:14 rillig Exp $");
3236503Speter#endif /* LIBC_SCCS and not lint */
331541Srgrimes
341541Srgrimes#include <assert.h>
3583651Speter#include <errno.h>
3683651Speter#include <string.h>
3783651Speter#include <stdio.h>
381541Srgrimes#include <stdlib.h>
391541Srgrimes#include <stddef.h>
401541Srgrimes#include <wchar.h>
411541Srgrimes#include <sys/types.h>
421541Srgrimes#include <limits.h>
4383651Speter
44190380Srwatson#include "citrus_namespace.h"
45190380Srwatson#include "citrus_types.h"
461541Srgrimes#include "citrus_bcs.h"
4748274Speter#include "citrus_module.h"
4848274Speter#include "citrus_ctype.h"
4960041Sphk#include "citrus_stdenc.h"
5031886Sbde#include "citrus_gbk2k.h"
511541Srgrimes
521541Srgrimes
/* ----------------------------------------------------------------------
 * private stuff used by templates
 */

/*
 * Conversion state: the bytes of one multibyte character that have been
 * collected so far but not yet converted.
 */
typedef struct _GBK2KState {
	char ch[4];	/* collected bytes, oldest first */
	int chlen;	/* number of bytes currently held in ch[] */
} _GBK2KState;

/*
 * Per-encoding settings, filled in by _citrus_GBK2K_encoding_module_init().
 */
typedef struct {
	int mb_cur_max;	/* 4 by default; 2 when the "2byte" variable is given */
} _GBK2KEncodingInfo;

6692783Sjefftypedef struct {
673305Sphk	_GBK2KEncodingInfo	ei;
68122698Salfred	struct {
69122698Salfred		/* for future multi-locale facility */
701541Srgrimes		_GBK2KState	s_mblen;
719336Sdfr		_GBK2KState	s_mbrlen;
7283651Speter		_GBK2KState	s_mbrtowc;
7383651Speter		_GBK2KState	s_mbtowc;
74190380Srwatson		_GBK2KState	s_mbsrtowcs;
751541Srgrimes		_GBK2KState	s_mbsnrtowcs;
7683651Speter		_GBK2KState	s_wcrtomb;
7783651Speter		_GBK2KState	s_wcsrtombs;
781541Srgrimes		_GBK2KState	s_wcsnrtombs;
791541Srgrimes		_GBK2KState	s_wctomb;
801541Srgrimes	} states;
811541Srgrimes} _GBK2KCTypeInfo;
82158739Smohans
/* Accessors from a _GBK2KCTypeInfo pointer to its parts. */
#define _CEI_TO_EI(_cei_)		(&(_cei_)->ei)
#define _CEI_TO_STATE(_cei_, _func_)	(_cei_)->states.s_##_func_

/* Names and properties of this encoding, as spelled by the macros below. */
#define _FUNCNAME(m)			_citrus_GBK2K_##m
#define _ENCODING_INFO			_GBK2KEncodingInfo
#define _CTYPE_INFO			_GBK2KCTypeInfo
#define _ENCODING_STATE			_GBK2KState
#define _ENCODING_MB_CUR_MAX(_ei_)	(_ei_)->mb_cur_max
#define _ENCODING_IS_STATE_DEPENDENT	0
#define _STATE_NEEDS_EXPLICIT_INIT(_ps_)	0

94190380Srwatsonstatic __inline void
95190380Srwatson/*ARGSUSED*/
96190380Srwatson_citrus_GBK2K_init_state(_GBK2KEncodingInfo * __restrict ei,
97190380Srwatson			 _GBK2KState * __restrict s)
98190380Srwatson{
99190380Srwatson	memset(s, 0, sizeof(*s));
100190380Srwatson}
101190380Srwatson
102190380Srwatsonstatic __inline void
103190380Srwatson/*ARGSUSED*/
104190380Srwatson_citrus_GBK2K_pack_state(_GBK2KEncodingInfo * __restrict ei,
105158739Smohans			 void * __restrict pspriv,
1061541Srgrimes			 const _GBK2KState * __restrict s)
1071541Srgrimes{
1081541Srgrimes	memcpy(pspriv, (const void *)s, sizeof(*s));
10983651Speter}
11083651Speter
11183651Speterstatic __inline void
11283651Speter/*ARGSUSED*/
1131541Srgrimes_citrus_GBK2K_unpack_state(_GBK2KEncodingInfo * __restrict ei,
1141541Srgrimes			   _GBK2KState * __restrict s,
115176224Sjhb			   const void * __restrict pspriv)
11612911Sphk{
11783651Speter	memcpy((void *)s, pspriv, sizeof(*s));
11812911Sphk}
11912911Sphk
/*
 * Return non-zero if the low 8 bits of `c' form a complete one-byte
 * character (0x00-0x7f).  Higher bits of `c' are ignored, so a
 * sign-extended char may be passed directly.
 */
static __inline int
_mb_singlebyte(int c)
{

	c &= 0xff;
	return (c <= 0x7f);
}

/*
 * Return non-zero if the low 8 bits of `c' lie in 0x81-0xfe, the range
 * accepted for the first byte of a multibyte character (and for the third
 * byte of a four-byte one).  Higher bits of `c' are ignored.
 */
static __inline int
_mb_leadbyte(int c)
{

	c &= 0xff;
	return (0x81 <= c && c <= 0xfe);
}

/*
 * Return non-zero if the low 8 bits of `c' lie in 0x40-0x7e or 0x80-0xfe,
 * the ranges accepted for the second byte of a two-byte character.
 * Higher bits of `c' are ignored.
 */
static __inline int
_mb_trailbyte(int c)
{

	c &= 0xff;
	return ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfe));
}

/*
 * Return non-zero if the low 8 bits of `c' lie in 0x30-0x39, the range
 * accepted for the second and fourth bytes of a four-byte character.
 * Higher bits of `c' are ignored.
 */
static __inline int
_mb_surrogate(int c)
{

	c &= 0xff;
	return (0x30 <= c && c <= 0x39);
}

/*
 * Return how many bytes the wide character `v' occupies when written out
 * by _citrus_GBK2K_wcrtomb_priv(): 1 if only the low 8 bits are set, 2 if
 * only the low 16 bits are set, otherwise 4.  No 3-byte form exists.
 */
static __inline int
_mb_count(wchar_t v)
{
	u_int32_t c;

	c = (u_int32_t)v; /* XXX */
	if (!(c & 0xffffff00))
		return (1);
	if (!(c & 0xffff0000))
		return (2);
	return (4);
}

161176224Sjhb#define	_PSENC		(psenc->ch[psenc->chlen - 1])
162176224Sjhb#define	_PUSH_PSENC(c)	(psenc->ch[psenc->chlen++] = (c))
163176224Sjhb
164176224Sjhbstatic int
165176224Sjhb_citrus_GBK2K_mbrtowc_priv(_GBK2KEncodingInfo * __restrict ei,
166176224Sjhb			   wchar_t * __restrict pwc,
167176224Sjhb			   const char ** __restrict s, size_t n,
168176224Sjhb			   _GBK2KState * __restrict psenc,
169176224Sjhb			   size_t * __restrict nresult)
170176224Sjhb{
171176224Sjhb	int chlenbak, len;
172176224Sjhb	const char *s0, *s1;
173176224Sjhb	wchar_t wc;
174176224Sjhb
175176224Sjhb	_DIAGASSERT(ei != NULL);
176176224Sjhb	/* pwc may be NULL */
177176224Sjhb	_DIAGASSERT(s != NULL);
178176224Sjhb	_DIAGASSERT(psenc != NULL);
179176224Sjhb
180176224Sjhb	s0 = *s;
1811541Srgrimes
1821541Srgrimes	if (s0 == NULL) {
1831541Srgrimes		/* _citrus_GBK2K_init_state(ei, psenc); */
1841541Srgrimes		psenc->chlen = 0;
1851541Srgrimes		*nresult = 0;
1861541Srgrimes		return (0);
18783651Speter	}
1881541Srgrimes
18983651Speter	chlenbak = psenc->chlen;
1901541Srgrimes
191177599Sru	switch (psenc->chlen) {
1921541Srgrimes	case 3:
193177599Sru		if (!_mb_leadbyte (_PSENC))
1941541Srgrimes			goto invalid;
1951541Srgrimes	/* FALLTHROUGH */
1961541Srgrimes	case 2:
1971541Srgrimes		if (!_mb_surrogate(_PSENC) || _mb_trailbyte(_PSENC))
1981541Srgrimes			goto invalid;
1991541Srgrimes	/* FALLTHROUGH */
2001541Srgrimes	case 1:
2011541Srgrimes		if (!_mb_leadbyte (_PSENC))
2021541Srgrimes			goto invalid;
2031541Srgrimes	/* FALLTHOROUGH */
2041541Srgrimes	case 0:
20583651Speter		break;
20683651Speter	default:
207152652Srees		goto invalid;
2081541Srgrimes	}
20983651Speter
21083651Speter	for (;;) {
21183651Speter		if (n-- < 1)
21283651Speter			goto restart;
21383651Speter
21483651Speter		_PUSH_PSENC(*s0++);
2151541Srgrimes
2161541Srgrimes		switch (psenc->chlen) {
217177599Sru		case 1:
2189336Sdfr			if (_mb_singlebyte(_PSENC))
219177599Sru				goto convert;
2209336Sdfr			if (_mb_leadbyte  (_PSENC))
2219336Sdfr				continue;
2221541Srgrimes			goto ilseq;
2239336Sdfr		case 2:
2241541Srgrimes			if (_mb_trailbyte (_PSENC))
2251541Srgrimes				goto convert;
2261541Srgrimes			if (ei->mb_cur_max == 4 &&
2271541Srgrimes			    _mb_surrogate (_PSENC))
2281541Srgrimes				continue;
2291541Srgrimes			goto ilseq;
2301541Srgrimes		case 3:
2311541Srgrimes			if (_mb_leadbyte  (_PSENC))
23284002Speter				continue;
23317186Sdfr			goto ilseq;
234152652Srees		case 4:
235176224Sjhb			if (_mb_surrogate (_PSENC))
2361541Srgrimes				goto convert;
2371541Srgrimes			goto ilseq;
23883651Speter		}
23983651Speter	}
24083651Speter
24183651Speterconvert:
2421541Srgrimes	len = psenc->chlen;
24383651Speter	s1  = &psenc->ch[0];
24483651Speter	wc  = 0;
2451541Srgrimes	while (len-- > 0)
2461541Srgrimes		wc = (wc << 8) | (*s1++ & 0xff);
2471541Srgrimes
2481541Srgrimes	if (pwc != NULL)
2491541Srgrimes		*pwc = wc;
2501541Srgrimes	*s = s0;
2511541Srgrimes	*nresult = (wc == 0) ? 0 : psenc->chlen - chlenbak;
2521541Srgrimes	/* _citrus_GBK2K_init_state(ei, psenc); */
2531541Srgrimes	psenc->chlen = 0;
25484002Speter
2551541Srgrimes	return (0);
2561541Srgrimes
2571541Srgrimesrestart:
2581541Srgrimes	*s = s0;
2591541Srgrimes	*nresult = (size_t)-2;
2601541Srgrimes
2611541Srgrimes	return (0);
2621541Srgrimes
2631541Srgrimesinvalid:
26483651Speter	return (EINVAL);
2659336Sdfr
2669336Sdfrilseq:
2679336Sdfr	*nresult = (size_t)-1;
2689336Sdfr	return (EILSEQ);
26984002Speter}
27083651Speter
27183651Speterstatic int
2721541Srgrimes_citrus_GBK2K_wcrtomb_priv(_GBK2KEncodingInfo * __restrict ei,
2739336Sdfr			   char * __restrict s, size_t n, wchar_t wc,
27499797Sdillon			   _GBK2KState * __restrict psenc,
2751541Srgrimes			   size_t * __restrict nresult)
2761541Srgrimes{
2771541Srgrimes	int len, ret;
2781541Srgrimes
2791541Srgrimes	_DIAGASSERT(ei != NULL);
28017186Sdfr	_DIAGASSERT(s != NULL);
28117186Sdfr	_DIAGASSERT(psenc != NULL);
2821541Srgrimes
2831549Srgrimes	if (psenc->chlen != 0) {
28483651Speter		ret = EINVAL;
2851541Srgrimes		goto err;
28683651Speter	}
28783651Speter
28883651Speter	len = _mb_count(wc);
2891541Srgrimes	if (n < len) {
2901541Srgrimes		ret = E2BIG;
2911541Srgrimes		goto err;
29236519Speter	}
29317186Sdfr
29417186Sdfr	switch (len) {
29536519Speter	case 1:
29617186Sdfr		if (!_mb_singlebyte(_PUSH_PSENC(wc     ))) {
2971541Srgrimes			ret = EILSEQ;
2981541Srgrimes			goto err;
2991541Srgrimes		}
3001541Srgrimes		break;
3011541Srgrimes	case 2:
3021541Srgrimes		if (!_mb_leadbyte  (_PUSH_PSENC(wc >> 8)) ||
3031541Srgrimes		    !_mb_trailbyte (_PUSH_PSENC(wc     ))) {
3041541Srgrimes			ret = EILSEQ;
3051541Srgrimes			goto err;
3061541Srgrimes		}
3071541Srgrimes		break;
3081541Srgrimes	case 4:
3091541Srgrimes		if (ei->mb_cur_max != 4 ||
3101541Srgrimes		    !_mb_leadbyte  (_PUSH_PSENC(wc >> 24)) ||
3111541Srgrimes		    !_mb_surrogate (_PUSH_PSENC(wc >> 16)) ||
312177599Sru		    !_mb_leadbyte  (_PUSH_PSENC(wc >>  8)) ||
3131541Srgrimes		    !_mb_surrogate (_PUSH_PSENC(wc      ))) {
314177599Sru			ret = EILSEQ;
3151541Srgrimes			goto err;
3161541Srgrimes		}
3171541Srgrimes		break;
3181541Srgrimes	}
3191541Srgrimes
3201541Srgrimes	_DIAGASSERT(len == psenc->chlen);
3211541Srgrimes
3221541Srgrimes	memcpy(s, psenc->ch, psenc->chlen);
3231541Srgrimes	*nresult = psenc->chlen;
3241541Srgrimes	/* _citrus_GBK2K_init_state(ei, psenc); */
3251541Srgrimes	psenc->chlen = 0;
3261541Srgrimes
3271541Srgrimes	return (0);
3281541Srgrimes
3291541Srgrimeserr:
3301541Srgrimes	*nresult = (size_t)-1;
3311541Srgrimes	return ret;
3321541Srgrimes}
3331541Srgrimes
3341541Srgrimesstatic __inline int
3351541Srgrimes/*ARGSUSED*/
3361541Srgrimes_citrus_GBK2K_stdenc_wctocs(_GBK2KEncodingInfo * __restrict ei,
3371541Srgrimes			    _csid_t * __restrict csid,
338104908Smike			    _index_t * __restrict idx, wchar_t wc)
339104908Smike{
34017186Sdfr	u_int8_t ch, cl;
3411541Srgrimes
3421541Srgrimes	_DIAGASSERT(csid != NULL && idx != NULL);
3431541Srgrimes
3441541Srgrimes	if ((u_int32_t)wc<0x80) {
345177599Sru		/* ISO646 */
3461541Srgrimes		*csid = 0;
3471541Srgrimes		*idx = (_index_t)wc;
3481541Srgrimes	} else if ((u_int32_t)wc>=0x10000) {
3491541Srgrimes		/* GBKUCS : XXX */
3501541Srgrimes		*csid = 3;
3511541Srgrimes		*idx = (_index_t)wc;
3521541Srgrimes	} else {
3531541Srgrimes		ch = (u_int8_t)(wc >> 8);
3541541Srgrimes		cl = (u_int8_t)wc;
3551541Srgrimes		if (ch>=0xA1 && cl>=0xA1) {
3561541Srgrimes			/* EUC G1 */
3571541Srgrimes			*csid = 1;
3581541Srgrimes			*idx = (_index_t)wc & 0x7F7FU;
3591541Srgrimes		} else {
3601541Srgrimes			/* extended area (0x8140-) */
3611541Srgrimes			*csid = 2;
3621541Srgrimes			*idx = (_index_t)wc;
3631549Srgrimes		}
36483651Speter	}
3651541Srgrimes
36683651Speter	return 0;
3671541Srgrimes}
36836541Speter
3691541Srgrimesstatic __inline int
3701541Srgrimes/*ARGSUSED*/
3711541Srgrimes_citrus_GBK2K_stdenc_cstowc(_GBK2KEncodingInfo * __restrict ei,
3721541Srgrimes			    wchar_t * __restrict wc,
3731541Srgrimes			    _csid_t csid, _index_t idx)
3741541Srgrimes{
37536541Speter
3761541Srgrimes	_DIAGASSERT(wc != NULL);
3771541Srgrimes
3781541Srgrimes	switch (csid) {
3791541Srgrimes	case 0:
3801541Srgrimes		/* ISO646 */
3811541Srgrimes		*wc = (wchar_t)idx;
3821541Srgrimes		break;
3831541Srgrimes	case 1:
3841541Srgrimes		/* EUC G1 */
3851541Srgrimes		*wc = (wchar_t)idx | 0x8080U;
3861541Srgrimes		break;
3871541Srgrimes	case 2:
3881541Srgrimes		/* extended area */
3891541Srgrimes		*wc = (wchar_t)idx;
390177599Sru		break;
3911541Srgrimes	case 3:
392177599Sru		/* GBKUCS : XXX */
3931541Srgrimes		if (ei->mb_cur_max != 4)
3941541Srgrimes			return EINVAL;
3951541Srgrimes		*wc = (wchar_t)idx;
39636541Speter		break;
3971541Srgrimes	default:
3981541Srgrimes		return EILSEQ;
3991541Srgrimes	}
4001541Srgrimes
4011541Srgrimes	return 0;
4021541Srgrimes}
4031541Srgrimes
4041541Srgrimesstatic __inline int
4051541Srgrimes/*ARGSUSED*/
4061541Srgrimes_citrus_GBK2K_stdenc_get_state_desc_generic(_GBK2KEncodingInfo * __restrict ei,
4071541Srgrimes					    _GBK2KState * __restrict psenc,
4081541Srgrimes					    int * __restrict rstate)
4091541Srgrimes{
4101541Srgrimes
4111541Srgrimes	if (psenc->chlen == 0)
4121541Srgrimes		*rstate = _STDENC_SDGEN_INITIAL;
4131541Srgrimes	else
4141541Srgrimes		*rstate = _STDENC_SDGEN_INCOMPLETE_CHAR;
4151541Srgrimes
4161541Srgrimes	return 0;
4171541Srgrimes}
4181541Srgrimes
4191541Srgrimesstatic int
4201541Srgrimes/*ARGSUSED*/
4211541Srgrimes_citrus_GBK2K_encoding_module_init(_GBK2KEncodingInfo * __restrict ei,
4221541Srgrimes				   const void * __restrict var, size_t lenvar)
4231541Srgrimes{
4241541Srgrimes	const char *p;
4251549Srgrimes
42683651Speter	_DIAGASSERT(ei != NULL);
4271541Srgrimes
42883651Speter	p = var;
4291541Srgrimes#define MATCH(x, act)                                           \
43092783Sjeffdo {                                                            \
43192783Sjeff        if (lenvar >= (sizeof(#x)-1) &&                         \
4321541Srgrimes            _bcs_strncasecmp(p, #x, sizeof(#x)-1) == 0) {       \
4331541Srgrimes                act;                                            \
4341541Srgrimes                lenvar -= sizeof(#x)-1;                         \
4351541Srgrimes                p += sizeof(#x)-1;                              \
4361541Srgrimes        }                                                       \
4371541Srgrimes} while (0)
4381541Srgrimes	memset((void *)ei, 0, sizeof(*ei));
4391541Srgrimes	ei->mb_cur_max = 4;
4401541Srgrimes	while (lenvar>0) {
4411541Srgrimes		switch (_bcs_tolower(*p)) {
4423664Sphk		case '2':
4439336Sdfr			MATCH("2byte", ei->mb_cur_max = 2);
4449336Sdfr			break;
4459336Sdfr		}
4461541Srgrimes		p++;
44719449Sdfr		lenvar--;
44899797Sdillon	}
44999797Sdillon
45019449Sdfr	return (0);
4511541Srgrimes}
4521541Srgrimes
4531541Srgrimesstatic void
4541541Srgrimes/*ARGSUSED*/
4551541Srgrimes_citrus_GBK2K_encoding_module_uninit(_GBK2KEncodingInfo *ei)
456184588Sdfr{
4573664Sphk}
458184588Sdfr
459148162Sps
460184588Sdfr/* ----------------------------------------------------------------------
461158739Smohans * public interface for ctype
462172600Smohans */
46316365Sphk
46442957Sdillon_CITRUS_CTYPE_DECLS(GBK2K);
46542957Sdillon_CITRUS_CTYPE_DEF_OPS(GBK2K);
4661549Srgrimes
4671541Srgrimes#include "citrus_ctype_template.h"
4681541Srgrimes
46938894Sbde/* ----------------------------------------------------------------------
47083651Speter * public interface for stdenc
47138894Sbde */
472128111Speadar
47338894Sbde_CITRUS_STDENC_DECLS(GBK2K);
474184588Sdfr_CITRUS_STDENC_DEF_OPS(GBK2K);
475127421Srees
476128111Speadar#include "citrus_stdenc_template.h"
477128126Smarcel