1219019Sgabor/* $FreeBSD$ */ 2219019Sgabor/* $NetBSD: citrus_utf1632.c,v 1.9 2008/06/14 16:01:08 tnozaki Exp $ */ 3219019Sgabor 4219019Sgabor/*- 5219019Sgabor * Copyright (c)2003 Citrus Project, 6219019Sgabor * All rights reserved. 7219019Sgabor * 8219019Sgabor * Redistribution and use in source and binary forms, with or without 9219019Sgabor * modification, are permitted provided that the following conditions 10219019Sgabor * are met: 11219019Sgabor * 1. Redistributions of source code must retain the above copyright 12219019Sgabor * notice, this list of conditions and the following disclaimer. 13219019Sgabor * 2. Redistributions in binary form must reproduce the above copyright 14219019Sgabor * notice, this list of conditions and the following disclaimer in the 15219019Sgabor * documentation and/or other materials provided with the distribution. 16219019Sgabor * 17219019Sgabor * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18219019Sgabor * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19219019Sgabor * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20219019Sgabor * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21219019Sgabor * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22219019Sgabor * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23219019Sgabor * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24219019Sgabor * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25219019Sgabor * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26219019Sgabor * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27219019Sgabor * SUCH DAMAGE. 28219019Sgabor */ 29219019Sgabor 30219019Sgabor#include <sys/cdefs.h> 31219019Sgabor#include <sys/endian.h> 32219019Sgabor#include <sys/types.h> 33219019Sgabor 34219019Sgabor#include <assert.h> 35219019Sgabor#include <errno.h> 36219019Sgabor#include <limits.h> 37219019Sgabor#include <stddef.h> 38219019Sgabor#include <stdio.h> 39219019Sgabor#include <stdlib.h> 40219019Sgabor#include <string.h> 41219019Sgabor#include <wchar.h> 42219019Sgabor 43219019Sgabor#include "citrus_namespace.h" 44219019Sgabor#include "citrus_types.h" 45219019Sgabor#include "citrus_module.h" 46219019Sgabor#include "citrus_stdenc.h" 47219019Sgabor#include "citrus_bcs.h" 48219019Sgabor 49219019Sgabor#include "citrus_utf1632.h" 50219019Sgabor 51219019Sgabor 52219019Sgabor/* ---------------------------------------------------------------------- 53219019Sgabor * private stuffs used by templates 54219019Sgabor */ 55219019Sgabor 56219019Sgabortypedef struct { 57219019Sgabor int chlen; 58219019Sgabor int current_endian; 59219019Sgabor uint8_t ch[4]; 60219019Sgabor} _UTF1632State; 61219019Sgabor 62219019Sgabor#define _ENDIAN_UNKNOWN 0 63219019Sgabor#define _ENDIAN_BIG 1 64219019Sgabor#define _ENDIAN_LITTLE 2 65219019Sgabor#if BYTE_ORDER == BIG_ENDIAN 66219019Sgabor#define _ENDIAN_INTERNAL _ENDIAN_BIG 67219019Sgabor#define _ENDIAN_SWAPPED _ENDIAN_LITTLE 68219019Sgabor#else 69219019Sgabor#define _ENDIAN_INTERNAL _ENDIAN_LITTLE 70219019Sgabor#define _ENDIAN_SWAPPED _ENDIAN_BIG 71219019Sgabor#endif 72219019Sgabor#define _MODE_UTF32 0x00000001U 73219019Sgabor#define _MODE_FORCE_ENDIAN 0x00000002U 74219019Sgabor 75219019Sgabortypedef struct { 76219019Sgabor int preffered_endian; 77219019Sgabor unsigned int cur_max; 78219019Sgabor uint32_t mode; 79219019Sgabor} _UTF1632EncodingInfo; 80219019Sgabor 81219019Sgabor#define _FUNCNAME(m) _citrus_UTF1632_##m 82219019Sgabor#define _ENCODING_INFO _UTF1632EncodingInfo 83219019Sgabor#define _ENCODING_STATE _UTF1632State 84219019Sgabor#define _ENCODING_MB_CUR_MAX(_ei_) ((_ei_)->cur_max) 85219019Sgabor#define _ENCODING_IS_STATE_DEPENDENT 0 86219019Sgabor#define _STATE_NEEDS_EXPLICIT_INIT(_ps_) 0 87219019Sgabor 88219019Sgabor 89219019Sgaborstatic __inline void 90219019Sgabor/*ARGSUSED*/ 91219019Sgabor_citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei __unused, 92219019Sgabor _UTF1632State *s) 93219019Sgabor{ 94219019Sgabor 95219019Sgabor memset(s, 0, sizeof(*s)); 96219019Sgabor} 97219019Sgabor 98219019Sgaborstatic int 99219019Sgabor_citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc, 100281550Stijl char **s, size_t n, _UTF1632State *psenc, size_t *nresult) 101219019Sgabor{ 102281550Stijl char *s0; 103219019Sgabor size_t result; 104219019Sgabor wchar_t wc = L'\0'; 105219019Sgabor int chlenbak, endian, needlen; 106219019Sgabor 107219019Sgabor s0 = *s; 108219019Sgabor 109219019Sgabor if (s0 == NULL) { 110219019Sgabor _citrus_UTF1632_init_state(ei, psenc); 111219019Sgabor *nresult = 0; /* state independent */ 112219019Sgabor return (0); 113219019Sgabor } 114219019Sgabor 115219019Sgabor result = 0; 116219019Sgabor chlenbak = psenc->chlen; 117219019Sgabor 118219019Sgaborrefetch: 119219019Sgabor needlen = ((ei->mode & _MODE_UTF32) != 0 || chlenbak >= 2) ? 4 : 2; 120219019Sgabor 121219019Sgabor while (chlenbak < needlen) { 122219019Sgabor if (n == 0) 123219019Sgabor goto restart; 124219019Sgabor psenc->ch[chlenbak++] = *s0++; 125219019Sgabor n--; 126219019Sgabor result++; 127219019Sgabor } 128219019Sgabor 129219019Sgabor /* judge endian marker */ 130219019Sgabor if ((ei->mode & _MODE_UTF32) == 0) { 131219019Sgabor /* UTF16 */ 132219019Sgabor if (psenc->ch[0] == 0xFE && psenc->ch[1] == 0xFF) { 133219019Sgabor psenc->current_endian = _ENDIAN_BIG; 134219019Sgabor chlenbak = 0; 135219019Sgabor goto refetch; 136219019Sgabor } else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE) { 137219019Sgabor psenc->current_endian = _ENDIAN_LITTLE; 138219019Sgabor chlenbak = 0; 139219019Sgabor goto refetch; 140219019Sgabor } 141219019Sgabor } else { 142219019Sgabor /* UTF32 */ 143219019Sgabor if (psenc->ch[0] == 0x00 && psenc->ch[1] == 0x00 && 144219019Sgabor psenc->ch[2] == 0xFE && psenc->ch[3] == 0xFF) { 145219019Sgabor psenc->current_endian = _ENDIAN_BIG; 146219019Sgabor chlenbak = 0; 147219019Sgabor goto refetch; 148219019Sgabor } else if (psenc->ch[0] == 0xFF && psenc->ch[1] == 0xFE && 149219019Sgabor psenc->ch[2] == 0x00 && psenc->ch[3] == 0x00) { 150219019Sgabor psenc->current_endian = _ENDIAN_LITTLE; 151219019Sgabor chlenbak = 0; 152219019Sgabor goto refetch; 153219019Sgabor } 154219019Sgabor } 155219019Sgabor endian = ((ei->mode & _MODE_FORCE_ENDIAN) != 0 || 156219019Sgabor psenc->current_endian == _ENDIAN_UNKNOWN) ? ei->preffered_endian : 157219019Sgabor psenc->current_endian; 158219019Sgabor 159219019Sgabor /* get wc */ 160219019Sgabor if ((ei->mode & _MODE_UTF32) == 0) { 161219019Sgabor /* UTF16 */ 162219019Sgabor if (needlen == 2) { 163219019Sgabor switch (endian) { 164219019Sgabor case _ENDIAN_LITTLE: 165219019Sgabor wc = (psenc->ch[0] | 166219019Sgabor ((wchar_t)psenc->ch[1] << 8)); 167219019Sgabor break; 168219019Sgabor case _ENDIAN_BIG: 169219019Sgabor wc = (psenc->ch[1] | 170219019Sgabor ((wchar_t)psenc->ch[0] << 8)); 171219019Sgabor break; 172219019Sgabor default: 173219019Sgabor goto ilseq; 174219019Sgabor } 175219019Sgabor if (wc >= 0xD800 && wc <= 0xDBFF) { 176219019Sgabor /* surrogate high */ 177219019Sgabor needlen = 4; 178219019Sgabor goto refetch; 179219019Sgabor } 180219019Sgabor } else { 181219019Sgabor /* surrogate low */ 182219019Sgabor wc -= 0xD800; /* wc : surrogate high (see above) */ 183219019Sgabor wc <<= 10; 184219019Sgabor switch (endian) { 185219019Sgabor case _ENDIAN_LITTLE: 186219019Sgabor if (psenc->ch[3] < 0xDC || psenc->ch[3] > 0xDF) 187219019Sgabor goto ilseq; 188219019Sgabor wc |= psenc->ch[2]; 189219019Sgabor wc |= (wchar_t)(psenc->ch[3] & 3) << 8; 190219019Sgabor break; 191219019Sgabor case _ENDIAN_BIG: 192219019Sgabor if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF) 193219019Sgabor goto ilseq; 194219019Sgabor wc |= psenc->ch[3]; 195219019Sgabor wc |= (wchar_t)(psenc->ch[2] & 3) << 8; 196219019Sgabor break; 197219019Sgabor default: 198219019Sgabor goto ilseq; 199219019Sgabor } 200219019Sgabor wc += 0x10000; 201219019Sgabor } 202219019Sgabor } else { 203219019Sgabor /* UTF32 */ 204219019Sgabor switch (endian) { 205219019Sgabor case _ENDIAN_LITTLE: 206219019Sgabor wc = (psenc->ch[0] | 207219019Sgabor ((wchar_t)psenc->ch[1] << 8) | 208219019Sgabor ((wchar_t)psenc->ch[2] << 16) | 209219019Sgabor ((wchar_t)psenc->ch[3] << 24)); 210219019Sgabor break; 211219019Sgabor case _ENDIAN_BIG: 212219019Sgabor wc = (psenc->ch[3] | 213219019Sgabor ((wchar_t)psenc->ch[2] << 8) | 214219019Sgabor ((wchar_t)psenc->ch[1] << 16) | 215219019Sgabor ((wchar_t)psenc->ch[0] << 24)); 216219019Sgabor break; 217219019Sgabor default: 218219019Sgabor goto ilseq; 219219019Sgabor } 220219019Sgabor if (wc >= 0xD800 && wc <= 0xDFFF) 221219019Sgabor goto ilseq; 222219019Sgabor } 223219019Sgabor 224219019Sgabor 225219019Sgabor *pwc = wc; 226219019Sgabor psenc->chlen = 0; 227219019Sgabor *nresult = result; 228219019Sgabor *s = s0; 229219019Sgabor 230219019Sgabor return (0); 231219019Sgabor 232219019Sgaborilseq: 233219019Sgabor *nresult = (size_t)-1; 234219019Sgabor psenc->chlen = 0; 235219019Sgabor return (EILSEQ); 236219019Sgabor 237219019Sgaborrestart: 238219019Sgabor *nresult = (size_t)-2; 239219019Sgabor psenc->chlen = chlenbak; 240219019Sgabor *s = s0; 241219019Sgabor return (0); 242219019Sgabor} 243219019Sgabor 244219019Sgaborstatic int 245219019Sgabor_citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n, 246219019Sgabor wchar_t wc, _UTF1632State *psenc, size_t *nresult) 247219019Sgabor{ 248219019Sgabor wchar_t wc2; 249219019Sgabor static const char _bom[4] = { 250219019Sgabor 0x00, 0x00, 0xFE, 0xFF, 251219019Sgabor }; 252219019Sgabor const char *bom = &_bom[0]; 253219019Sgabor size_t cnt; 254219019Sgabor 255219019Sgabor cnt = (size_t)0; 256219019Sgabor if (psenc->current_endian == _ENDIAN_UNKNOWN) { 257219019Sgabor if ((ei->mode & _MODE_FORCE_ENDIAN) == 0) { 258219019Sgabor if (ei->mode & _MODE_UTF32) 259219019Sgabor cnt = 4; 260219019Sgabor else { 261219019Sgabor cnt = 2; 262219019Sgabor bom += 2; 263219019Sgabor } 264219019Sgabor if (n < cnt) 265219019Sgabor goto e2big; 266219019Sgabor memcpy(s, bom, cnt); 267219019Sgabor s += cnt, n -= cnt; 268219019Sgabor } 269219019Sgabor psenc->current_endian = ei->preffered_endian; 270219019Sgabor } 271219019Sgabor 272219019Sgabor wc2 = 0; 273219019Sgabor if ((ei->mode & _MODE_UTF32)==0) { 274219019Sgabor /* UTF16 */ 275219019Sgabor if (wc > 0xFFFF) { 276219019Sgabor /* surrogate */ 277219019Sgabor if (wc > 0x10FFFF) 278219019Sgabor goto ilseq; 279219019Sgabor if (n < 4) 280219019Sgabor goto e2big; 281219019Sgabor cnt += 4; 282219019Sgabor wc -= 0x10000; 283219019Sgabor wc2 = (wc & 0x3FF) | 0xDC00; 284219019Sgabor wc = (wc>>10) | 0xD800; 285219019Sgabor } else { 286219019Sgabor if (n < 2) 287219019Sgabor goto e2big; 288219019Sgabor cnt += 2; 289219019Sgabor } 290219019Sgabor 291219019Sgaborsurrogate: 292219019Sgabor switch (psenc->current_endian) { 293219019Sgabor case _ENDIAN_BIG: 294219019Sgabor s[1] = wc; 295219019Sgabor s[0] = (wc >>= 8); 296219019Sgabor break; 297219019Sgabor case _ENDIAN_LITTLE: 298219019Sgabor s[0] = wc; 299219019Sgabor s[1] = (wc >>= 8); 300219019Sgabor break; 301219019Sgabor } 302219019Sgabor if (wc2 != 0) { 303219019Sgabor wc = wc2; 304219019Sgabor wc2 = 0; 305219019Sgabor s += 2; 306219019Sgabor goto surrogate; 307219019Sgabor } 308219019Sgabor } else { 309219019Sgabor /* UTF32 */ 310219019Sgabor if (wc >= 0xD800 && wc <= 0xDFFF) 311219019Sgabor goto ilseq; 312219019Sgabor if (n < 4) 313219019Sgabor goto e2big; 314219019Sgabor cnt += 4; 315219019Sgabor switch (psenc->current_endian) { 316219019Sgabor case _ENDIAN_BIG: 317219019Sgabor s[3] = wc; 318219019Sgabor s[2] = (wc >>= 8); 319219019Sgabor s[1] = (wc >>= 8); 320219019Sgabor s[0] = (wc >>= 8); 321219019Sgabor break; 322219019Sgabor case _ENDIAN_LITTLE: 323219019Sgabor s[0] = wc; 324219019Sgabor s[1] = (wc >>= 8); 325219019Sgabor s[2] = (wc >>= 8); 326219019Sgabor s[3] = (wc >>= 8); 327219019Sgabor break; 328219019Sgabor } 329219019Sgabor } 330219019Sgabor *nresult = cnt; 331219019Sgabor 332219019Sgabor return (0); 333219019Sgabor 334219019Sgaborilseq: 335219019Sgabor *nresult = (size_t)-1; 336219019Sgabor return (EILSEQ); 337219019Sgabore2big: 338219019Sgabor *nresult = (size_t)-1; 339219019Sgabor return (E2BIG); 340219019Sgabor} 341219019Sgabor 342219019Sgaborstatic void 343219019Sgaborparse_variable(_UTF1632EncodingInfo * __restrict ei, 344219019Sgabor const void * __restrict var, size_t lenvar) 345219019Sgabor{ 346219019Sgabor const char *p; 347219019Sgabor 348219019Sgabor p = var; 349219019Sgabor while (lenvar > 0) { 350219019Sgabor switch (*p) { 351219019Sgabor case 'B': 352219019Sgabor case 'b': 353219019Sgabor MATCH(big, ei->preffered_endian = _ENDIAN_BIG); 354219019Sgabor break; 355219019Sgabor case 'L': 356219019Sgabor case 'l': 357219019Sgabor MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE); 358219019Sgabor break; 359219019Sgabor case 'i': 360219019Sgabor case 'I': 361219019Sgabor MATCH(internal, ei->preffered_endian = _ENDIAN_INTERNAL); 362219019Sgabor break; 363219019Sgabor case 's': 364219019Sgabor case 'S': 365219019Sgabor MATCH(swapped, ei->preffered_endian = _ENDIAN_SWAPPED); 366219019Sgabor break; 367219019Sgabor case 'F': 368219019Sgabor case 'f': 369219019Sgabor MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN); 370219019Sgabor break; 371219019Sgabor case 'U': 372219019Sgabor case 'u': 373219019Sgabor MATCH(utf32, ei->mode |= _MODE_UTF32); 374219019Sgabor break; 375219019Sgabor } 376219019Sgabor p++; 377219019Sgabor lenvar--; 378219019Sgabor } 379219019Sgabor} 380219019Sgabor 381219019Sgaborstatic int 382219019Sgabor/*ARGSUSED*/ 383219019Sgabor_citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei, 384219019Sgabor const void * __restrict var, size_t lenvar) 385219019Sgabor{ 386219019Sgabor 387219019Sgabor memset((void *)ei, 0, sizeof(*ei)); 388219019Sgabor 389219019Sgabor parse_variable(ei, var, lenvar); 390219019Sgabor 391219019Sgabor ei->cur_max = ((ei->mode&_MODE_UTF32) == 0) ? 6 : 8; 392219019Sgabor /* 6: endian + surrogate */ 393219019Sgabor /* 8: endian + normal */ 394219019Sgabor 395219019Sgabor if (ei->preffered_endian == _ENDIAN_UNKNOWN) { 396219019Sgabor ei->preffered_endian = _ENDIAN_BIG; 397219019Sgabor } 398219019Sgabor 399219019Sgabor return (0); 400219019Sgabor} 401219019Sgabor 402219019Sgaborstatic void 403219019Sgabor/*ARGSUSED*/ 404219019Sgabor_citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei __unused) 405219019Sgabor{ 406219019Sgabor 407219019Sgabor} 408219019Sgabor 409219019Sgaborstatic __inline int 410219019Sgabor/*ARGSUSED*/ 411219019Sgabor_citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei __unused, 412219019Sgabor _csid_t * __restrict csid, _index_t * __restrict idx, _wc_t wc) 413219019Sgabor{ 414219019Sgabor 415219019Sgabor *csid = 0; 416219019Sgabor *idx = (_index_t)wc; 417219019Sgabor 418219019Sgabor return (0); 419219019Sgabor} 420219019Sgabor 421219019Sgaborstatic __inline int 422219019Sgabor/*ARGSUSED*/ 423219019Sgabor_citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei __unused, 424219019Sgabor _wc_t * __restrict wc, _csid_t csid, _index_t idx) 425219019Sgabor{ 426219019Sgabor 427219019Sgabor if (csid != 0) 428219019Sgabor return (EILSEQ); 429219019Sgabor 430219019Sgabor *wc = (_wc_t)idx; 431219019Sgabor 432219019Sgabor return (0); 433219019Sgabor} 434219019Sgabor 435219019Sgaborstatic __inline int 436219019Sgabor/*ARGSUSED*/ 437219019Sgabor_citrus_UTF1632_stdenc_get_state_desc_generic(_UTF1632EncodingInfo * __restrict ei __unused, 438219019Sgabor _UTF1632State * __restrict psenc, int * __restrict rstate) 439219019Sgabor{ 440219019Sgabor 441219019Sgabor *rstate = (psenc->chlen == 0) ? _STDENC_SDGEN_INITIAL : 442219019Sgabor _STDENC_SDGEN_INCOMPLETE_CHAR; 443219019Sgabor return (0); 444219019Sgabor} 445219019Sgabor 446219019Sgabor/* ---------------------------------------------------------------------- 447219019Sgabor * public interface for stdenc 448219019Sgabor */ 449219019Sgabor 450219019Sgabor_CITRUS_STDENC_DECLS(UTF1632); 451219019Sgabor_CITRUS_STDENC_DEF_OPS(UTF1632); 452219019Sgabor 453219019Sgabor#include "citrus_stdenc_template.h" 454