1/* 2 Unix SMB/CIFS implementation. 3 minimal iconv implementation 4 Copyright (C) Andrew Tridgell 2001 5 Copyright (C) Jelmer Vernooij 2002,2003 6 7 This program is free software; you can redistribute it and/or modify 8 it under the terms of the GNU General Public License as published by 9 the Free Software Foundation; either version 2 of the License, or 10 (at your option) any later version. 11 12 This program is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License 18 along with this program; if not, write to the Free Software 19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 20 21 From samba 3.0 beta and GNU libiconv-1.8 22 It's bad but most of the time we can't use libc iconv service: 23 - it doesn't round trip for most encoding 24 - it doesn't know about Apple extension 25*/ 26 27#ifdef HAVE_CONFIG_H 28#include "config.h" 29#endif /* HAVE_CONFIG_H */ 30#include <stdlib.h> 31#include <errno.h> 32 33#include <netatalk/endian.h> 34#include <atalk/unicode.h> 35#include <atalk/logger.h> 36#include <atalk/unicode.h> 37#include "byteorder.h" 38 39/* Given a trailing UTF-8 byte, get the contribution from it to 40 * the Unicode scalar value for a particular bit shift amount 41 */ 42#define GETUCVAL(utf8_trailbyte,shift) ((unsigned int) (( utf8_trailbyte & 0x3F) << shift)) 43 44/* Given a unicode scalar, get a trail UTF-8 byte for a particular bit shift amount */ 45#define GETUTF8TRAILBYTE(uc,shift) ((char)( 0x80 | ((uc >> shift) & 0x3F) ) ) 46 47 48 49static size_t utf8_pull(void *,char **, size_t *, char **, size_t *); 50static size_t utf8_push(void *,char **, size_t *, char **, size_t *); 51 52struct charset_functions charset_utf8 = 53{ 54 "UTF8", 55 0x08000103, 56 utf8_pull, 57 utf8_push, 58 CHARSET_VOLUME | CHARSET_MULTIBYTE | CHARSET_PRECOMPOSED, 59 NULL, 60 NULL, NULL 61}; 62 63struct charset_functions charset_utf8_mac = 64{ 65 "UTF8-MAC", 66 0x08000103, 67 utf8_pull, 68 utf8_push, 69 CHARSET_VOLUME | CHARSET_CLIENT | CHARSET_MULTIBYTE | CHARSET_DECOMPOSED, 70 NULL, 71 NULL, NULL 72}; 73 74/* ------------------- Convert from UTF-8 to UTF-16 -------------------*/ 75static size_t utf8_pull(void *cd _U_, char **inbuf, size_t *inbytesleft, 76 char **outbuf, size_t *outbytesleft) 77{ 78 ucs2_t uc = 0; 79 unsigned int codepoint; 80 int len; 81 82 while (*inbytesleft >= 1 && *outbytesleft >= 2) { 83 unsigned char *c = (unsigned char *)*inbuf; 84 len = 1; 85 86 /* Arrange conditionals in the order of most frequent occurrence 87 * for users of Latin-based chars */ 88 if ((c[0] & 0x80) == 0) { 89 uc = c[0]; 90 } else if ((c[0] & 0xe0) == 0xc0) { 91 if (*inbytesleft < 2) { 92 LOG(log_debug, logtype_default, "short utf8 char"); 93 goto badseq; 94 } 95 uc = (ucs2_t) (((c[0] & 0x1f) << 6) | GETUCVAL(c[1],0)) ; 96 len = 2; 97 } else if ((c[0] & 0xf0) == 0xe0) { 98 if (*inbytesleft < 3) { 99 LOG(log_debug, logtype_default, "short utf8 char"); 100 goto badseq; 101 } 102 uc = (ucs2_t) (((c[0] & 0x0f) << 12) | GETUCVAL(c[1],6) | GETUCVAL(c[2],0)) ; 103 len = 3; 104 } else if ((c[0] & 0xf8) == 0xf0) { 105 /* 4 bytes, which happens for surrogate pairs only */ 106 if (*inbytesleft < 4) { 107 LOG(log_debug, logtype_default, "short utf8 char"); 108 goto badseq; 109 } 110 if (*outbytesleft < 4) { 111 LOG(log_debug, logtype_default, "short ucs-2 write"); 112 errno = E2BIG; 113 return -1; 114 } 115 codepoint = ((c[0] & 0x07) << 18) | GETUCVAL(c[1],12) | 116 GETUCVAL(c[2],6) | GETUCVAL(c[3],0); 117 SSVAL(*outbuf,0,(((codepoint - 0x10000) >> 10) + 0xD800)); /* hi */ 118 SSVAL(*outbuf,2,(0xDC00 + (codepoint & 0x03FF))); /* low */ 119 len = 4; 120 (*inbuf) += 4; 121 (*inbytesleft) -= 4; 122 (*outbytesleft) -= 4; 123 (*outbuf) += 4; 124 continue; 125 } 126 else { 127 errno = EINVAL; 128 return -1; 129 } 130 131 SSVAL(*outbuf,0,uc); 132 (*inbuf) += len; 133 (*inbytesleft) -= len; 134 (*outbytesleft) -= 2; 135 (*outbuf) += 2; 136 } 137 138 if (*inbytesleft > 0) { 139 errno = E2BIG; 140 return -1; 141 } 142 143 return 0; 144 145badseq: 146 errno = EINVAL; 147 return -1; 148} 149 150/* --------------------- Convert from UTF-16 to UTF-8 -----------*/ 151static size_t utf8_push(void *cd _U_, char **inbuf, size_t *inbytesleft, 152 char **outbuf, size_t *outbytesleft) 153{ 154 ucs2_t uc=0; 155 ucs2_t hi, low; 156 unsigned int codepoint; 157 int olen, ilen; 158 159 while (*inbytesleft >= 2 && *outbytesleft >= 1) { 160 unsigned char *c = (unsigned char *)*outbuf; 161 uc = SVAL((*inbuf),0); 162 olen=1; 163 ilen=2; 164 165 /* Arrange conditionals in the order of most frequent occurrence for 166 users of Latin-based chars */ 167 if (uc < 0x80) { 168 c[0] = uc; 169 } else if (uc < 0x800) { 170 if (*outbytesleft < 2) { 171 LOG(log_debug, logtype_default, "short utf8 write"); 172 goto toobig; 173 } 174 c[1] = GETUTF8TRAILBYTE(uc, 0); 175 c[0] = (char)(0xc0 | ((uc >> 6) & 0x1f)); 176 olen = 2; 177 } 178 else if ( uc >= 0x202a && uc <= 0x202e ) { 179 /* ignore bidi hint characters */ 180 olen = 0; 181 } 182 /* 183 * A 2-byte uc value represents a stand-alone Unicode character if 184 * 0 <= uc < 0xd800 or 0xdfff < uc <= 0xffff. 185 * If 0xd800 <= uc <= 0xdfff, uc itself does not represent a Unicode character. 186 * Rather, it is just part of a surrogate pair. A surrogate pair consists of 187 * a high surrogate in the range [0xd800 ... 0xdbff] and a low surrogate in the 188 * range [0xdc00 ... 0xdfff]. Together the pair maps to a single Unicode character 189 * whose scalar value is 64K or larger. It is this scalar value that is transformed 190 * to UTF-8, not the individual surrogates. 191 * 192 * See www.unicode.org/faq/utf_bom.html for more info. 193 */ 194 195 else if ( 0xd800 <= uc && uc <= 0xdfff) { 196 /* surrogate - needs 4 bytes from input and 4 bytes for output to UTF-8 */ 197 if (*outbytesleft < 4) { 198 LOG(log_debug, logtype_default, "short utf8 write"); 199 goto toobig; 200 } 201 if (*inbytesleft < 4) { 202 errno = EINVAL; 203 return -1; 204 } 205 hi = SVAL((*inbuf),0); 206 low = SVAL((*inbuf),2); 207 if ( 0xd800 <= hi && hi <= 0xdbff && 0xdc00 <= low && low <= 0xdfff) { 208 codepoint = ((hi - 0xd800) << 10) + (low - 0xdc00) + 0x10000; 209 c[3] = GETUTF8TRAILBYTE(codepoint, 0); 210 c[2] = GETUTF8TRAILBYTE(codepoint, 6); 211 c[1] = GETUTF8TRAILBYTE(codepoint, 12); 212 c[0] = (char)(0xf0 | ((codepoint >> 18) & 0x07)); 213 ilen = olen = 4; 214 } else { /* invalid values for surrogate */ 215 errno = EINVAL; 216 return -1; 217 } 218 } else { 219 if (*outbytesleft < 3) { 220 LOG(log_debug, logtype_default, "short utf8 write"); 221 goto toobig; 222 } 223 c[2] = GETUTF8TRAILBYTE(uc, 0); 224 c[1] = GETUTF8TRAILBYTE(uc, 6); 225 c[0] = (char)(0xe0 | ((uc >> 12) & 0x0f)); 226 olen = 3; 227 } 228 229 (*inbytesleft) -= ilen; 230 (*outbytesleft) -= olen; 231 (*inbuf) += ilen; 232 (*outbuf) += olen; 233 } 234 235 if (*inbytesleft == 1) { 236 errno = EINVAL; 237 return -1; 238 } 239 240 if (*inbytesleft > 1) { 241 errno = E2BIG; 242 return -1; 243 } 244 245 return 0; 246 247toobig: 248 errno = E2BIG; 249 return -1; 250} 251