1219019Sgabor/* $FreeBSD$ */
2219019Sgabor/*	$NetBSD: citrus_utf1632.c,v 1.9 2008/06/14 16:01:08 tnozaki Exp $	*/
3219019Sgabor
4219019Sgabor/*-
5219019Sgabor * Copyright (c)2003 Citrus Project,
6219019Sgabor * All rights reserved.
7219019Sgabor *
8219019Sgabor * Redistribution and use in source and binary forms, with or without
9219019Sgabor * modification, are permitted provided that the following conditions
10219019Sgabor * are met:
11219019Sgabor * 1. Redistributions of source code must retain the above copyright
12219019Sgabor *    notice, this list of conditions and the following disclaimer.
13219019Sgabor * 2. Redistributions in binary form must reproduce the above copyright
14219019Sgabor *    notice, this list of conditions and the following disclaimer in the
15219019Sgabor *    documentation and/or other materials provided with the distribution.
16219019Sgabor *
17219019Sgabor * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18219019Sgabor * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19219019Sgabor * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20219019Sgabor * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21219019Sgabor * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22219019Sgabor * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23219019Sgabor * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24219019Sgabor * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25219019Sgabor * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26219019Sgabor * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27219019Sgabor * SUCH DAMAGE.
28219019Sgabor */
29219019Sgabor
30219019Sgabor#include <sys/cdefs.h>
31219019Sgabor#include <sys/endian.h>
32219019Sgabor#include <sys/types.h>
33219019Sgabor
34219019Sgabor#include <assert.h>
35219019Sgabor#include <errno.h>
36219019Sgabor#include <limits.h>
37219019Sgabor#include <stddef.h>
38219019Sgabor#include <stdio.h>
39219019Sgabor#include <stdlib.h>
40219019Sgabor#include <string.h>
41219019Sgabor#include <wchar.h>
42219019Sgabor
43219019Sgabor#include "citrus_namespace.h"
44219019Sgabor#include "citrus_types.h"
45219019Sgabor#include "citrus_module.h"
46219019Sgabor#include "citrus_stdenc.h"
47219019Sgabor#include "citrus_bcs.h"
48219019Sgabor
49219019Sgabor#include "citrus_utf1632.h"
50219019Sgabor
51219019Sgabor
52219019Sgabor/* ----------------------------------------------------------------------
 * private definitions used by the templates
54219019Sgabor */
55219019Sgabor
/*
 * Per-stream conversion state.  An all-zero object is the initial state
 * (no buffered bytes, byte order _ENDIAN_UNKNOWN).
 */
typedef struct {
	/* number of bytes of an incomplete character buffered in ch[] */
	int		 chlen;
	/*
	 * one of the _ENDIAN_* values: the byte order learned from a BOM
	 * while decoding, or the order chosen for output while encoding
	 */
	int		 current_endian;
	/* raw bytes of the character being assembled (at most 4) */
	uint8_t		 ch[4];
} _UTF1632State;
61219019Sgabor
/*
 * Byte-order identifiers.  _ENDIAN_UNKNOWN must stay 0 so that a zeroed
 * state or encoding-info structure starts out as "not yet decided".
 */
#define _ENDIAN_UNKNOWN		0
#define _ENDIAN_BIG		1
#define _ENDIAN_LITTLE		2
/* _ENDIAN_INTERNAL is the host byte order, _ENDIAN_SWAPPED its opposite. */
#if BYTE_ORDER == BIG_ENDIAN
#define _ENDIAN_INTERNAL	_ENDIAN_BIG
#define _ENDIAN_SWAPPED		_ENDIAN_LITTLE
#else
#define _ENDIAN_INTERNAL	_ENDIAN_LITTLE
#define _ENDIAN_SWAPPED	_ENDIAN_BIG
#endif
/*
 * Bits of _UTF1632EncodingInfo.mode:
 *   _MODE_UTF32        - 4-byte UTF-32 code units; when clear, UTF-16.
 *   _MODE_FORCE_ENDIAN - always decode with the preferred byte order and
 *                        do not emit a BOM when encoding.
 */
#define _MODE_UTF32		0x00000001U
#define _MODE_FORCE_ENDIAN	0x00000002U
74219019Sgabor
/*
 * Per-encoding configuration, filled in by
 * _citrus_UTF1632_encoding_module_init().
 */
typedef struct {
	/*
	 * _ENDIAN_BIG or _ENDIAN_LITTLE once initialised: the byte order
	 * used when none has been detected, or always when
	 * _MODE_FORCE_ENDIAN is set.  (The spelling is historical; the
	 * field name is used throughout this file.)
	 */
	int		 preffered_endian;
	/* value reported through _ENCODING_MB_CUR_MAX(): 6 or 8 */
	unsigned int	 cur_max;
	/* bitwise OR of the _MODE_* flags */
	uint32_t	 mode;
} _UTF1632EncodingInfo;
80219019Sgabor
/*
 * Configuration macros.  NOTE(review): these are presumably consumed by
 * "citrus_stdenc_template.h", included at the end of this file -- confirm
 * against that header.
 */
/* _FUNCNAME(m) pastes the module prefix onto m: _citrus_UTF1632_<m>. */
#define _FUNCNAME(m)			_citrus_UTF1632_##m
#define _ENCODING_INFO			_UTF1632EncodingInfo
#define _ENCODING_STATE			_UTF1632State
#define _ENCODING_MB_CUR_MAX(_ei_)	((_ei_)->cur_max)
#define _ENCODING_IS_STATE_DEPENDENT	0
#define _STATE_NEEDS_EXPLICIT_INIT(_ps_)	0
87219019Sgabor
88219019Sgabor
89219019Sgaborstatic __inline void
90219019Sgabor/*ARGSUSED*/
91219019Sgabor_citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei __unused,
92219019Sgabor    _UTF1632State *s)
93219019Sgabor{
94219019Sgabor
95219019Sgabor	memset(s, 0, sizeof(*s));
96219019Sgabor}
97219019Sgabor
98219019Sgaborstatic int
99219019Sgabor_citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc,
100281550Stijl    char **s, size_t n, _UTF1632State *psenc, size_t *nresult)
101219019Sgabor{
102281550Stijl	char *s0;
103219019Sgabor	size_t result;
104219019Sgabor	wchar_t wc = L'\0';
105219019Sgabor	int chlenbak, endian, needlen;
106219019Sgabor
107219019Sgabor	s0 = *s;
108219019Sgabor
109219019Sgabor	if (s0 == NULL) {
110219019Sgabor		_citrus_UTF1632_init_state(ei, psenc);
111219019Sgabor		*nresult = 0; /* state independent */
112219019Sgabor		return (0);
113219019Sgabor	}
114219019Sgabor
115219019Sgabor	result = 0;
116219019Sgabor	chlenbak = psenc->chlen;
117219019Sgabor
118219019Sgaborrefetch:
119219019Sgabor	needlen = ((ei->mode & _MODE_UTF32) != 0 || chlenbak >= 2) ? 4 : 2;
120219019Sgabor
121219019Sgabor	while (chlenbak < needlen) {
122219019Sgabor		if (n == 0)
123219019Sgabor			goto restart;
124219019Sgabor		psenc->ch[chlenbak++] = *s0++;
125219019Sgabor		n--;
126219019Sgabor		result++;
127219019Sgabor	}
128219019Sgabor
129219019Sgabor	/* judge endian marker */
130219019Sgabor	if ((ei->mode & _MODE_UTF32) == 0) {
131219019Sgabor		/* UTF16 */
132219019Sgabor		if (psenc->ch[0] == 0xFE && psenc->ch[1] == 0xFF) {
133219019Sgabor			psenc->current_endian = _ENDIAN_BIG;
134219019Sgabor			chlenbak = 0;
135219019Sgabor			goto refetch;
136219019Sgabor		} else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE) {
137219019Sgabor			psenc->current_endian = _ENDIAN_LITTLE;
138219019Sgabor			chlenbak = 0;
139219019Sgabor			goto refetch;
140219019Sgabor		}
141219019Sgabor	} else {
142219019Sgabor		/* UTF32 */
143219019Sgabor		if (psenc->ch[0] == 0x00 && psenc->ch[1] == 0x00 &&
144219019Sgabor		    psenc->ch[2] == 0xFE && psenc->ch[3] == 0xFF) {
145219019Sgabor			psenc->current_endian = _ENDIAN_BIG;
146219019Sgabor			chlenbak = 0;
147219019Sgabor			goto refetch;
148219019Sgabor		} else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE &&
149219019Sgabor			   psenc->ch[2] == 0x00 && psenc->ch[3] == 0x00) {
150219019Sgabor			psenc->current_endian = _ENDIAN_LITTLE;
151219019Sgabor			chlenbak = 0;
152219019Sgabor			goto refetch;
153219019Sgabor		}
154219019Sgabor	}
155219019Sgabor	endian = ((ei->mode & _MODE_FORCE_ENDIAN) != 0 ||
156219019Sgabor	    psenc->current_endian == _ENDIAN_UNKNOWN) ? ei->preffered_endian :
157219019Sgabor	    psenc->current_endian;
158219019Sgabor
159219019Sgabor	/* get wc */
160219019Sgabor	if ((ei->mode & _MODE_UTF32) == 0) {
161219019Sgabor		/* UTF16 */
162219019Sgabor		if (needlen == 2) {
163219019Sgabor			switch (endian) {
164219019Sgabor			case _ENDIAN_LITTLE:
165219019Sgabor				wc = (psenc->ch[0] |
166219019Sgabor				    ((wchar_t)psenc->ch[1] << 8));
167219019Sgabor				break;
168219019Sgabor			case _ENDIAN_BIG:
169219019Sgabor				wc = (psenc->ch[1] |
170219019Sgabor				    ((wchar_t)psenc->ch[0] << 8));
171219019Sgabor				break;
172219019Sgabor			default:
173219019Sgabor				goto ilseq;
174219019Sgabor			}
175219019Sgabor			if (wc >= 0xD800 && wc <= 0xDBFF) {
176219019Sgabor				/* surrogate high */
177219019Sgabor				needlen = 4;
178219019Sgabor				goto refetch;
179219019Sgabor			}
180219019Sgabor		} else {
181219019Sgabor			/* surrogate low */
182219019Sgabor			wc -= 0xD800; /* wc : surrogate high (see above) */
183219019Sgabor			wc <<= 10;
184219019Sgabor			switch (endian) {
185219019Sgabor			case _ENDIAN_LITTLE:
186219019Sgabor				if (psenc->ch[3] < 0xDC || psenc->ch[3] > 0xDF)
187219019Sgabor					goto ilseq;
188219019Sgabor				wc |= psenc->ch[2];
189219019Sgabor				wc |= (wchar_t)(psenc->ch[3] & 3) << 8;
190219019Sgabor				break;
191219019Sgabor			case _ENDIAN_BIG:
192219019Sgabor				if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF)
193219019Sgabor					goto ilseq;
194219019Sgabor				wc |= psenc->ch[3];
195219019Sgabor				wc |= (wchar_t)(psenc->ch[2] & 3) << 8;
196219019Sgabor				break;
197219019Sgabor			default:
198219019Sgabor				goto ilseq;
199219019Sgabor			}
200219019Sgabor			wc += 0x10000;
201219019Sgabor		}
202219019Sgabor	} else {
203219019Sgabor		/* UTF32 */
204219019Sgabor		switch (endian) {
205219019Sgabor		case _ENDIAN_LITTLE:
206219019Sgabor			wc = (psenc->ch[0] |
207219019Sgabor			    ((wchar_t)psenc->ch[1] << 8) |
208219019Sgabor			    ((wchar_t)psenc->ch[2] << 16) |
209219019Sgabor			    ((wchar_t)psenc->ch[3] << 24));
210219019Sgabor			break;
211219019Sgabor		case _ENDIAN_BIG:
212219019Sgabor			wc = (psenc->ch[3] |
213219019Sgabor			    ((wchar_t)psenc->ch[2] << 8) |
214219019Sgabor			    ((wchar_t)psenc->ch[1] << 16) |
215219019Sgabor			    ((wchar_t)psenc->ch[0] << 24));
216219019Sgabor			break;
217219019Sgabor		default:
218219019Sgabor			goto ilseq;
219219019Sgabor		}
220219019Sgabor		if (wc >= 0xD800 && wc <= 0xDFFF)
221219019Sgabor			goto ilseq;
222219019Sgabor	}
223219019Sgabor
224219019Sgabor
225219019Sgabor	*pwc = wc;
226219019Sgabor	psenc->chlen = 0;
227219019Sgabor	*nresult = result;
228219019Sgabor	*s = s0;
229219019Sgabor
230219019Sgabor	return (0);
231219019Sgabor
232219019Sgaborilseq:
233219019Sgabor	*nresult = (size_t)-1;
234219019Sgabor	psenc->chlen = 0;
235219019Sgabor	return (EILSEQ);
236219019Sgabor
237219019Sgaborrestart:
238219019Sgabor	*nresult = (size_t)-2;
239219019Sgabor	psenc->chlen = chlenbak;
240219019Sgabor	*s = s0;
241219019Sgabor	return (0);
242219019Sgabor}
243219019Sgabor
244219019Sgaborstatic int
245219019Sgabor_citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n,
246219019Sgabor    wchar_t wc, _UTF1632State *psenc, size_t *nresult)
247219019Sgabor{
248219019Sgabor	wchar_t wc2;
249219019Sgabor	static const char _bom[4] = {
250219019Sgabor	    0x00, 0x00, 0xFE, 0xFF,
251219019Sgabor	};
252219019Sgabor	const char *bom = &_bom[0];
253219019Sgabor	size_t cnt;
254219019Sgabor
255219019Sgabor	cnt = (size_t)0;
256219019Sgabor	if (psenc->current_endian == _ENDIAN_UNKNOWN) {
257219019Sgabor		if ((ei->mode & _MODE_FORCE_ENDIAN) == 0) {
258219019Sgabor			if (ei->mode & _MODE_UTF32)
259219019Sgabor				cnt = 4;
260219019Sgabor			else {
261219019Sgabor				cnt = 2;
262219019Sgabor				bom += 2;
263219019Sgabor			}
264219019Sgabor			if (n < cnt)
265219019Sgabor				goto e2big;
266219019Sgabor			memcpy(s, bom, cnt);
267219019Sgabor			s += cnt, n -= cnt;
268219019Sgabor		}
269219019Sgabor		psenc->current_endian = ei->preffered_endian;
270219019Sgabor	}
271219019Sgabor
272219019Sgabor	wc2 = 0;
273219019Sgabor	if ((ei->mode & _MODE_UTF32)==0) {
274219019Sgabor		/* UTF16 */
275219019Sgabor		if (wc > 0xFFFF) {
276219019Sgabor			/* surrogate */
277219019Sgabor			if (wc > 0x10FFFF)
278219019Sgabor				goto ilseq;
279219019Sgabor			if (n < 4)
280219019Sgabor				goto e2big;
281219019Sgabor			cnt += 4;
282219019Sgabor			wc -= 0x10000;
283219019Sgabor			wc2 = (wc & 0x3FF) | 0xDC00;
284219019Sgabor			wc = (wc>>10) | 0xD800;
285219019Sgabor		} else {
286219019Sgabor			if (n < 2)
287219019Sgabor				goto e2big;
288219019Sgabor			cnt += 2;
289219019Sgabor		}
290219019Sgabor
291219019Sgaborsurrogate:
292219019Sgabor		switch (psenc->current_endian) {
293219019Sgabor		case _ENDIAN_BIG:
294219019Sgabor			s[1] = wc;
295219019Sgabor			s[0] = (wc >>= 8);
296219019Sgabor			break;
297219019Sgabor		case _ENDIAN_LITTLE:
298219019Sgabor			s[0] = wc;
299219019Sgabor			s[1] = (wc >>= 8);
300219019Sgabor			break;
301219019Sgabor		}
302219019Sgabor		if (wc2 != 0) {
303219019Sgabor			wc = wc2;
304219019Sgabor			wc2 = 0;
305219019Sgabor			s += 2;
306219019Sgabor			goto surrogate;
307219019Sgabor		}
308219019Sgabor	} else {
309219019Sgabor		/* UTF32 */
310219019Sgabor		if (wc >= 0xD800 && wc <= 0xDFFF)
311219019Sgabor			goto ilseq;
312219019Sgabor		if (n < 4)
313219019Sgabor			goto e2big;
314219019Sgabor		cnt += 4;
315219019Sgabor		switch (psenc->current_endian) {
316219019Sgabor		case _ENDIAN_BIG:
317219019Sgabor			s[3] = wc;
318219019Sgabor			s[2] = (wc >>= 8);
319219019Sgabor			s[1] = (wc >>= 8);
320219019Sgabor			s[0] = (wc >>= 8);
321219019Sgabor			break;
322219019Sgabor		case _ENDIAN_LITTLE:
323219019Sgabor			s[0] = wc;
324219019Sgabor			s[1] = (wc >>= 8);
325219019Sgabor			s[2] = (wc >>= 8);
326219019Sgabor			s[3] = (wc >>= 8);
327219019Sgabor			break;
328219019Sgabor		}
329219019Sgabor	}
330219019Sgabor	*nresult = cnt;
331219019Sgabor
332219019Sgabor	return (0);
333219019Sgabor
334219019Sgaborilseq:
335219019Sgabor	*nresult = (size_t)-1;
336219019Sgabor	return (EILSEQ);
337219019Sgabore2big:
338219019Sgabor	*nresult = (size_t)-1;
339219019Sgabor	return (E2BIG);
340219019Sgabor}
341219019Sgabor
342219019Sgaborstatic void
343219019Sgaborparse_variable(_UTF1632EncodingInfo * __restrict ei,
344219019Sgabor    const void * __restrict var, size_t lenvar)
345219019Sgabor{
346219019Sgabor	const char *p;
347219019Sgabor
348219019Sgabor	p = var;
349219019Sgabor	while (lenvar > 0) {
350219019Sgabor		switch (*p) {
351219019Sgabor		case 'B':
352219019Sgabor		case 'b':
353219019Sgabor			MATCH(big, ei->preffered_endian = _ENDIAN_BIG);
354219019Sgabor			break;
355219019Sgabor		case 'L':
356219019Sgabor		case 'l':
357219019Sgabor			MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE);
358219019Sgabor			break;
359219019Sgabor		case 'i':
360219019Sgabor		case 'I':
361219019Sgabor			MATCH(internal, ei->preffered_endian = _ENDIAN_INTERNAL);
362219019Sgabor			break;
363219019Sgabor		case 's':
364219019Sgabor		case 'S':
365219019Sgabor			MATCH(swapped, ei->preffered_endian = _ENDIAN_SWAPPED);
366219019Sgabor			break;
367219019Sgabor		case 'F':
368219019Sgabor		case 'f':
369219019Sgabor			MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN);
370219019Sgabor			break;
371219019Sgabor		case 'U':
372219019Sgabor		case 'u':
373219019Sgabor			MATCH(utf32, ei->mode |= _MODE_UTF32);
374219019Sgabor			break;
375219019Sgabor		}
376219019Sgabor		p++;
377219019Sgabor		lenvar--;
378219019Sgabor	}
379219019Sgabor}
380219019Sgabor
381219019Sgaborstatic int
382219019Sgabor/*ARGSUSED*/
383219019Sgabor_citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei,
384219019Sgabor    const void * __restrict var, size_t lenvar)
385219019Sgabor{
386219019Sgabor
387219019Sgabor	memset((void *)ei, 0, sizeof(*ei));
388219019Sgabor
389219019Sgabor	parse_variable(ei, var, lenvar);
390219019Sgabor
391219019Sgabor	ei->cur_max = ((ei->mode&_MODE_UTF32) == 0) ? 6 : 8;
392219019Sgabor	/* 6: endian + surrogate */
393219019Sgabor	/* 8: endian + normal */
394219019Sgabor
395219019Sgabor	if (ei->preffered_endian == _ENDIAN_UNKNOWN) {
396219019Sgabor		ei->preffered_endian = _ENDIAN_BIG;
397219019Sgabor	}
398219019Sgabor
399219019Sgabor	return (0);
400219019Sgabor}
401219019Sgabor
/*
 * Counterpart of _citrus_UTF1632_encoding_module_init().  The init
 * function acquires no resources, so there is nothing to release and the
 * body is intentionally empty.
 */
static void
/*ARGSUSED*/
_citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei __unused)
{

}
408219019Sgabor
409219019Sgaborstatic __inline int
410219019Sgabor/*ARGSUSED*/
411219019Sgabor_citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei __unused,
412219019Sgabor     _csid_t * __restrict csid, _index_t * __restrict idx, _wc_t wc)
413219019Sgabor{
414219019Sgabor
415219019Sgabor	*csid = 0;
416219019Sgabor	*idx = (_index_t)wc;
417219019Sgabor
418219019Sgabor	return (0);
419219019Sgabor}
420219019Sgabor
421219019Sgaborstatic __inline int
422219019Sgabor/*ARGSUSED*/
423219019Sgabor_citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei __unused,
424219019Sgabor    _wc_t * __restrict wc, _csid_t csid, _index_t idx)
425219019Sgabor{
426219019Sgabor
427219019Sgabor	if (csid != 0)
428219019Sgabor		return (EILSEQ);
429219019Sgabor
430219019Sgabor	*wc = (_wc_t)idx;
431219019Sgabor
432219019Sgabor	return (0);
433219019Sgabor}
434219019Sgabor
435219019Sgaborstatic __inline int
436219019Sgabor/*ARGSUSED*/
437219019Sgabor_citrus_UTF1632_stdenc_get_state_desc_generic(_UTF1632EncodingInfo * __restrict ei __unused,
438219019Sgabor    _UTF1632State * __restrict psenc, int * __restrict rstate)
439219019Sgabor{
440219019Sgabor
441219019Sgabor	*rstate = (psenc->chlen == 0) ? _STDENC_SDGEN_INITIAL :
442219019Sgabor	    _STDENC_SDGEN_INCOMPLETE_CHAR;
443219019Sgabor	return (0);
444219019Sgabor}
445219019Sgabor
446219019Sgabor/* ----------------------------------------------------------------------
447219019Sgabor * public interface for stdenc
448219019Sgabor */
449219019Sgabor
/*
 * NOTE(review): _CITRUS_STDENC_DECLS and _CITRUS_STDENC_DEF_OPS are defined
 * outside this file; presumably they declare the module's stdenc entry
 * points and define its operations table -- confirm in citrus_stdenc.h.
 */
_CITRUS_STDENC_DECLS(UTF1632);
_CITRUS_STDENC_DEF_OPS(UTF1632);

/*
 * NOTE(review): the template is included last, presumably so that it can
 * build the generic entry points from the _FUNCNAME/_ENCODING_* macros and
 * the static functions defined above -- confirm against the header.
 */
#include "citrus_stdenc_template.h"
454