1/* 2 * libid3tag - ID3 tag manipulation library 3 * Copyright (C) 2000-2003 Underbit Technologies, Inc. 4 * 5 * This program is free software; you can redistribute it and/or modify 6 * it under the terms of the GNU General Public License as published by 7 * the Free Software Foundation; either version 2 of the License, or 8 * (at your option) any later version. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program; if not, write to the Free Software 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 18 * 19 * $Id: utf8.c,v 1.8 2003/04/19 00:14:33 rob Exp $ 20 */ 21 22# ifdef HAVE_CONFIG_H 23# include "config.h" 24# endif 25 26# include "global.h" 27 28# include <stdlib.h> 29 30# include "id3tag.h" 31# include "utf8.h" 32# include "ucs4.h" 33 34/* 35 * NAME: utf8->length() 36 * DESCRIPTION: return the number of ucs4 chars represented by a utf8 string 37 */ 38id3_length_t id3_utf8_length(id3_utf8_t const *utf8) 39{ 40 id3_length_t length = 0; 41 42 while (*utf8) { 43 if ((utf8[0] & 0x80) == 0x00) 44 ++length; 45 else if ((utf8[0] & 0xe0) == 0xc0 && 46 (utf8[1] & 0xc0) == 0x80) { 47 if (((utf8[0] & 0x1fL) << 6) >= 0x00000080L) { 48 ++length; 49 utf8 += 1; 50 } 51 } 52 else if ((utf8[0] & 0xf0) == 0xe0 && 53 (utf8[1] & 0xc0) == 0x80 && 54 (utf8[2] & 0xc0) == 0x80) { 55 if ((((utf8[0] & 0x0fL) << 12) | 56 ((utf8[1] & 0x3fL) << 6)) >= 0x00000800L) { 57 ++length; 58 utf8 += 2; 59 } 60 } 61 else if ((utf8[0] & 0xf8) == 0xf0 && 62 (utf8[1] & 0xc0) == 0x80 && 63 (utf8[2] & 0xc0) == 0x80 && 64 (utf8[3] & 0xc0) == 0x80) { 65 if ((((utf8[0] & 0x07L) << 18) | 66 ((utf8[1] & 0x3fL) << 12)) >= 0x00010000L) { 67 ++length; 68 utf8 += 3; 69 } 70 } 71 else if ((utf8[0] & 0xfc) == 0xf8 && 72 (utf8[1] & 0xc0) == 0x80 && 73 (utf8[2] & 0xc0) == 0x80 && 74 (utf8[3] & 0xc0) == 0x80 && 75 (utf8[4] & 0xc0) == 0x80) { 76 if ((((utf8[0] & 0x03L) << 24) | 77 ((utf8[0] & 0x3fL) << 18)) >= 0x00200000L) { 78 ++length; 79 utf8 += 4; 80 } 81 } 82 else if ((utf8[0] & 0xfe) == 0xfc && 83 (utf8[1] & 0xc0) == 0x80 && 84 (utf8[2] & 0xc0) == 0x80 && 85 (utf8[3] & 0xc0) == 0x80 && 86 (utf8[4] & 0xc0) == 0x80 && 87 (utf8[5] & 0xc0) == 0x80) { 88 if ((((utf8[0] & 0x01L) << 30) | 89 ((utf8[0] & 0x3fL) << 24)) >= 0x04000000L) { 90 ++length; 91 utf8 += 5; 92 } 93 } 94 95 ++utf8; 96 } 97 98 return length; 99} 100 101/* 102 * NAME: utf8->size() 103 * DESCRIPTION: return the encoding size of a utf8 string 104 */ 105id3_length_t id3_utf8_size(id3_utf8_t const *utf8) 106{ 107 id3_utf8_t const *ptr = utf8; 108 109 while (*ptr) 110 ++ptr; 111 112 return ptr - utf8 + 1; 113} 114 115/* 116 * NAME: utf8->ucs4duplicate() 117 * DESCRIPTION: duplicate and decode a utf8 string into ucs4 118 */ 119id3_ucs4_t *id3_utf8_ucs4duplicate(id3_utf8_t const *utf8) 120{ 121 id3_ucs4_t *ucs4; 122 123 ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4)); 124 if (ucs4) 125 id3_utf8_decode(utf8, ucs4); 126 127 return release(ucs4); 128} 129 130/* 131 * NAME: utf8->decodechar() 132 * DESCRIPTION: decode a series of utf8 chars into a single ucs4 char 133 */ 134id3_length_t id3_utf8_decodechar(id3_utf8_t const *utf8, id3_ucs4_t *ucs4) 135{ 136 id3_utf8_t const *start = utf8; 137 138 while (1) { 139 if ((utf8[0] & 0x80) == 0x00) { 140 *ucs4 = utf8[0]; 141 return utf8 - start + 1; 142 } 143 else if ((utf8[0] & 0xe0) == 0xc0 && 144 (utf8[1] & 0xc0) == 0x80) { 145 *ucs4 = 146 ((utf8[0] & 0x1fL) << 6) | 147 ((utf8[1] & 0x3fL) << 0); 148 if (*ucs4 >= 0x00000080L) 149 return utf8 - start + 2; 150 } 151 else if ((utf8[0] & 0xf0) == 0xe0 && 152 (utf8[1] & 0xc0) == 0x80 && 153 (utf8[2] & 0xc0) == 0x80) { 154 *ucs4 = 155 ((utf8[0] & 0x0fL) << 12) | 156 ((utf8[1] & 0x3fL) << 6) | 157 ((utf8[2] & 0x3fL) << 0); 158 if (*ucs4 >= 0x00000800L) 159 return utf8 - start + 3; 160 } 161 else if ((utf8[0] & 0xf8) == 0xf0 && 162 (utf8[1] & 0xc0) == 0x80 && 163 (utf8[2] & 0xc0) == 0x80 && 164 (utf8[3] & 0xc0) == 0x80) { 165 *ucs4 = 166 ((utf8[0] & 0x07L) << 18) | 167 ((utf8[1] & 0x3fL) << 12) | 168 ((utf8[2] & 0x3fL) << 6) | 169 ((utf8[3] & 0x3fL) << 0); 170 if (*ucs4 >= 0x00010000L) 171 return utf8 - start + 4; 172 } 173 else if ((utf8[0] & 0xfc) == 0xf8 && 174 (utf8[1] & 0xc0) == 0x80 && 175 (utf8[2] & 0xc0) == 0x80 && 176 (utf8[3] & 0xc0) == 0x80 && 177 (utf8[4] & 0xc0) == 0x80) { 178 *ucs4 = 179 ((utf8[0] & 0x03L) << 24) | 180 ((utf8[1] & 0x3fL) << 18) | 181 ((utf8[2] & 0x3fL) << 12) | 182 ((utf8[3] & 0x3fL) << 6) | 183 ((utf8[4] & 0x3fL) << 0); 184 if (*ucs4 >= 0x00200000L) 185 return utf8 - start + 5; 186 } 187 else if ((utf8[0] & 0xfe) == 0xfc && 188 (utf8[1] & 0xc0) == 0x80 && 189 (utf8[2] & 0xc0) == 0x80 && 190 (utf8[3] & 0xc0) == 0x80 && 191 (utf8[4] & 0xc0) == 0x80 && 192 (utf8[5] & 0xc0) == 0x80) { 193 *ucs4 = 194 ((utf8[0] & 0x01L) << 30) | 195 ((utf8[1] & 0x3fL) << 24) | 196 ((utf8[2] & 0x3fL) << 18) | 197 ((utf8[3] & 0x3fL) << 12) | 198 ((utf8[4] & 0x3fL) << 6) | 199 ((utf8[5] & 0x3fL) << 0); 200 if (*ucs4 >= 0x04000000L) 201 return utf8 - start + 6; 202 } 203 204 ++utf8; 205 } 206} 207 208/* 209 * NAME: utf8->encodechar() 210 * DESCRIPTION: encode a single ucs4 char into a series of up to 6 utf8 chars 211 */ 212id3_length_t id3_utf8_encodechar(id3_utf8_t *utf8, id3_ucs4_t ucs4) 213{ 214 if (ucs4 <= 0x0000007fL) { 215 utf8[0] = ucs4; 216 217 return 1; 218 } 219 else if (ucs4 <= 0x000007ffL) { 220 utf8[0] = 0xc0 | ((ucs4 >> 6) & 0x1f); 221 utf8[1] = 0x80 | ((ucs4 >> 0) & 0x3f); 222 223 return 2; 224 } 225 else if (ucs4 <= 0x0000ffffL) { 226 utf8[0] = 0xe0 | ((ucs4 >> 12) & 0x0f); 227 utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f); 228 utf8[2] = 0x80 | ((ucs4 >> 0) & 0x3f); 229 230 return 3; 231 } 232 else if (ucs4 <= 0x001fffffL) { 233 utf8[0] = 0xf0 | ((ucs4 >> 18) & 0x07); 234 utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f); 235 utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f); 236 utf8[3] = 0x80 | ((ucs4 >> 0) & 0x3f); 237 238 return 4; 239 } 240 else if (ucs4 <= 0x03ffffffL) { 241 utf8[0] = 0xf8 | ((ucs4 >> 24) & 0x03); 242 utf8[1] = 0x80 | ((ucs4 >> 18) & 0x3f); 243 utf8[2] = 0x80 | ((ucs4 >> 12) & 0x3f); 244 utf8[3] = 0x80 | ((ucs4 >> 6) & 0x3f); 245 utf8[4] = 0x80 | ((ucs4 >> 0) & 0x3f); 246 247 return 5; 248 } 249 else if (ucs4 <= 0x7fffffffL) { 250 utf8[0] = 0xfc | ((ucs4 >> 30) & 0x01); 251 utf8[1] = 0x80 | ((ucs4 >> 24) & 0x3f); 252 utf8[2] = 0x80 | ((ucs4 >> 18) & 0x3f); 253 utf8[3] = 0x80 | ((ucs4 >> 12) & 0x3f); 254 utf8[4] = 0x80 | ((ucs4 >> 6) & 0x3f); 255 utf8[5] = 0x80 | ((ucs4 >> 0) & 0x3f); 256 257 return 6; 258 } 259 260 /* default */ 261 262 return id3_utf8_encodechar(utf8, ID3_UCS4_REPLACEMENTCHAR); 263} 264 265/* 266 * NAME: utf8->decode() 267 * DESCRIPTION: decode a complete utf8 string into a ucs4 string 268 */ 269void id3_utf8_decode(id3_utf8_t const *utf8, id3_ucs4_t *ucs4) 270{ 271 do 272 utf8 += id3_utf8_decodechar(utf8, ucs4); 273 while (*ucs4++); 274} 275 276/* 277 * NAME: utf8->encode() 278 * DESCRIPTION: encode a complete ucs4 string into a utf8 string 279 */ 280void id3_utf8_encode(id3_utf8_t *utf8, id3_ucs4_t const *ucs4) 281{ 282 do 283 utf8 += id3_utf8_encodechar(utf8, *ucs4); 284 while (*ucs4++); 285} 286 287/* 288 * NAME: utf8->put() 289 * DESCRIPTION: serialize a single utf8 character 290 */ 291id3_length_t id3_utf8_put(id3_byte_t **ptr, id3_utf8_t utf8) 292{ 293 if (ptr) 294 *(*ptr)++ = utf8; 295 296 return 1; 297} 298 299/* 300 * NAME: utf8->get() 301 * DESCRIPTION: deserialize a single utf8 character 302 */ 303id3_utf8_t id3_utf8_get(id3_byte_t const **ptr) 304{ 305 return *(*ptr)++; 306} 307 308/* 309 * NAME: utf8->serialize() 310 * DESCRIPTION: serialize a ucs4 string using utf8 encoding 311 */ 312id3_length_t id3_utf8_serialize(id3_byte_t **ptr, id3_ucs4_t const *ucs4, 313 int terminate) 314{ 315 id3_length_t size = 0; 316 id3_utf8_t utf8[6], *out; 317 318 while (*ucs4) { 319 switch (id3_utf8_encodechar(out = utf8, *ucs4++)) { 320 case 6: size += id3_utf8_put(ptr, *out++); 321 case 5: size += id3_utf8_put(ptr, *out++); 322 case 4: size += id3_utf8_put(ptr, *out++); 323 case 3: size += id3_utf8_put(ptr, *out++); 324 case 2: size += id3_utf8_put(ptr, *out++); 325 case 1: size += id3_utf8_put(ptr, *out++); 326 case 0: break; 327 } 328 } 329 330 if (terminate) 331 size += id3_utf8_put(ptr, 0); 332 333 return size; 334} 335 336/* 337 * NAME: utf8->deserialize() 338 * DESCRIPTION: deserialize a ucs4 string using utf8 encoding 339 */ 340id3_ucs4_t *id3_utf8_deserialize(id3_byte_t const **ptr, id3_length_t length) 341{ 342 id3_byte_t const *end; 343 id3_utf8_t *utf8ptr, *utf8; 344 id3_ucs4_t *ucs4; 345 346 end = *ptr + length; 347 348 utf8 = malloc((length + 1) * sizeof(*utf8)); 349 if (utf8 == 0) 350 return 0; 351 352 utf8ptr = utf8; 353 while (end - *ptr > 0 && (*utf8ptr = id3_utf8_get(ptr))) 354 ++utf8ptr; 355 356 *utf8ptr = 0; 357 358 ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4)); 359 if (ucs4) 360 id3_utf8_decode(utf8, ucs4); 361 362 free(utf8); 363 364 return ucs4; 365} 366