1/*
2 * libid3tag - ID3 tag manipulation library
3 * Copyright (C) 2000-2003 Underbit Technologies, Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18 *
19 * $Id: utf8.c,v 1.8 2003/04/19 00:14:33 rob Exp $
20 */
21
22# ifdef HAVE_CONFIG_H
23#  include "config.h"
24# endif
25
26# include "global.h"
27
28# include <stdlib.h>
29
30# include "id3tag.h"
31# include "utf8.h"
32# include "ucs4.h"
33
34/*
35 * NAME:	utf8->length()
36 * DESCRIPTION:	return the number of ucs4 chars represented by a utf8 string
37 */
38id3_length_t id3_utf8_length(id3_utf8_t const *utf8)
39{
40  id3_length_t length = 0;
41
42  while (*utf8) {
43    if ((utf8[0] & 0x80) == 0x00)
44      ++length;
45    else if ((utf8[0] & 0xe0) == 0xc0 &&
46	     (utf8[1] & 0xc0) == 0x80) {
47      if (((utf8[0] & 0x1fL) << 6) >= 0x00000080L) {
48	++length;
49	utf8 += 1;
50      }
51    }
52    else if ((utf8[0] & 0xf0) == 0xe0 &&
53	     (utf8[1] & 0xc0) == 0x80 &&
54	     (utf8[2] & 0xc0) == 0x80) {
55      if ((((utf8[0] & 0x0fL) << 12) |
56	   ((utf8[1] & 0x3fL) <<  6)) >= 0x00000800L) {
57	++length;
58	utf8 += 2;
59      }
60    }
61    else if ((utf8[0] & 0xf8) == 0xf0 &&
62	     (utf8[1] & 0xc0) == 0x80 &&
63	     (utf8[2] & 0xc0) == 0x80 &&
64	     (utf8[3] & 0xc0) == 0x80) {
65      if ((((utf8[0] & 0x07L) << 18) |
66	   ((utf8[1] & 0x3fL) << 12)) >= 0x00010000L) {
67	++length;
68	utf8 += 3;
69      }
70    }
71    else if ((utf8[0] & 0xfc) == 0xf8 &&
72	     (utf8[1] & 0xc0) == 0x80 &&
73	     (utf8[2] & 0xc0) == 0x80 &&
74	     (utf8[3] & 0xc0) == 0x80 &&
75	     (utf8[4] & 0xc0) == 0x80) {
76      if ((((utf8[0] & 0x03L) << 24) |
77	   ((utf8[0] & 0x3fL) << 18)) >= 0x00200000L) {
78	++length;
79	utf8 += 4;
80      }
81    }
82    else if ((utf8[0] & 0xfe) == 0xfc &&
83	     (utf8[1] & 0xc0) == 0x80 &&
84	     (utf8[2] & 0xc0) == 0x80 &&
85	     (utf8[3] & 0xc0) == 0x80 &&
86	     (utf8[4] & 0xc0) == 0x80 &&
87	     (utf8[5] & 0xc0) == 0x80) {
88      if ((((utf8[0] & 0x01L) << 30) |
89	   ((utf8[0] & 0x3fL) << 24)) >= 0x04000000L) {
90	++length;
91	utf8 += 5;
92      }
93    }
94
95    ++utf8;
96  }
97
98  return length;
99}
100
101/*
102 * NAME:	utf8->size()
103 * DESCRIPTION:	return the encoding size of a utf8 string
104 */
105id3_length_t id3_utf8_size(id3_utf8_t const *utf8)
106{
107  id3_utf8_t const *ptr = utf8;
108
109  while (*ptr)
110    ++ptr;
111
112  return ptr - utf8 + 1;
113}
114
115/*
116 * NAME:	utf8->ucs4duplicate()
117 * DESCRIPTION:	duplicate and decode a utf8 string into ucs4
118 */
119id3_ucs4_t *id3_utf8_ucs4duplicate(id3_utf8_t const *utf8)
120{
121  id3_ucs4_t *ucs4;
122
123  ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
124  if (ucs4)
125    id3_utf8_decode(utf8, ucs4);
126
127  return release(ucs4);
128}
129
130/*
131 * NAME:	utf8->decodechar()
132 * DESCRIPTION:	decode a series of utf8 chars into a single ucs4 char
133 */
134id3_length_t id3_utf8_decodechar(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
135{
136  id3_utf8_t const *start = utf8;
137
138  while (1) {
139    if ((utf8[0] & 0x80) == 0x00) {
140      *ucs4 = utf8[0];
141      return utf8 - start + 1;
142    }
143    else if ((utf8[0] & 0xe0) == 0xc0 &&
144	     (utf8[1] & 0xc0) == 0x80) {
145      *ucs4 =
146	((utf8[0] & 0x1fL) << 6) |
147	((utf8[1] & 0x3fL) << 0);
148      if (*ucs4 >= 0x00000080L)
149	return utf8 - start + 2;
150    }
151    else if ((utf8[0] & 0xf0) == 0xe0 &&
152	     (utf8[1] & 0xc0) == 0x80 &&
153	     (utf8[2] & 0xc0) == 0x80) {
154      *ucs4 =
155	((utf8[0] & 0x0fL) << 12) |
156	((utf8[1] & 0x3fL) <<  6) |
157	((utf8[2] & 0x3fL) <<  0);
158      if (*ucs4 >= 0x00000800L)
159	return utf8 - start + 3;
160    }
161    else if ((utf8[0] & 0xf8) == 0xf0 &&
162	     (utf8[1] & 0xc0) == 0x80 &&
163	     (utf8[2] & 0xc0) == 0x80 &&
164	     (utf8[3] & 0xc0) == 0x80) {
165      *ucs4 =
166	((utf8[0] & 0x07L) << 18) |
167	((utf8[1] & 0x3fL) << 12) |
168	((utf8[2] & 0x3fL) <<  6) |
169	((utf8[3] & 0x3fL) <<  0);
170      if (*ucs4 >= 0x00010000L)
171	return utf8 - start + 4;
172    }
173    else if ((utf8[0] & 0xfc) == 0xf8 &&
174	     (utf8[1] & 0xc0) == 0x80 &&
175	     (utf8[2] & 0xc0) == 0x80 &&
176	     (utf8[3] & 0xc0) == 0x80 &&
177	     (utf8[4] & 0xc0) == 0x80) {
178      *ucs4 =
179	((utf8[0] & 0x03L) << 24) |
180	((utf8[1] & 0x3fL) << 18) |
181	((utf8[2] & 0x3fL) << 12) |
182	((utf8[3] & 0x3fL) <<  6) |
183	((utf8[4] & 0x3fL) <<  0);
184      if (*ucs4 >= 0x00200000L)
185	return utf8 - start + 5;
186    }
187    else if ((utf8[0] & 0xfe) == 0xfc &&
188	     (utf8[1] & 0xc0) == 0x80 &&
189	     (utf8[2] & 0xc0) == 0x80 &&
190	     (utf8[3] & 0xc0) == 0x80 &&
191	     (utf8[4] & 0xc0) == 0x80 &&
192	     (utf8[5] & 0xc0) == 0x80) {
193      *ucs4 =
194	((utf8[0] & 0x01L) << 30) |
195	((utf8[1] & 0x3fL) << 24) |
196	((utf8[2] & 0x3fL) << 18) |
197	((utf8[3] & 0x3fL) << 12) |
198	((utf8[4] & 0x3fL) <<  6) |
199	((utf8[5] & 0x3fL) <<  0);
200      if (*ucs4 >= 0x04000000L)
201	return utf8 - start + 6;
202    }
203
204    ++utf8;
205  }
206}
207
208/*
209 * NAME:	utf8->encodechar()
210 * DESCRIPTION:	encode a single ucs4 char into a series of up to 6 utf8 chars
211 */
212id3_length_t id3_utf8_encodechar(id3_utf8_t *utf8, id3_ucs4_t ucs4)
213{
214  if (ucs4 <= 0x0000007fL) {
215    utf8[0] = ucs4;
216
217    return 1;
218  }
219  else if (ucs4 <= 0x000007ffL) {
220    utf8[0] = 0xc0 | ((ucs4 >>  6) & 0x1f);
221    utf8[1] = 0x80 | ((ucs4 >>  0) & 0x3f);
222
223    return 2;
224  }
225  else if (ucs4 <= 0x0000ffffL) {
226    utf8[0] = 0xe0 | ((ucs4 >> 12) & 0x0f);
227    utf8[1] = 0x80 | ((ucs4 >>  6) & 0x3f);
228    utf8[2] = 0x80 | ((ucs4 >>  0) & 0x3f);
229
230    return 3;
231  }
232  else if (ucs4 <= 0x001fffffL) {
233    utf8[0] = 0xf0 | ((ucs4 >> 18) & 0x07);
234    utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f);
235    utf8[2] = 0x80 | ((ucs4 >>  6) & 0x3f);
236    utf8[3] = 0x80 | ((ucs4 >>  0) & 0x3f);
237
238    return 4;
239  }
240  else if (ucs4 <= 0x03ffffffL) {
241    utf8[0] = 0xf8 | ((ucs4 >> 24) & 0x03);
242    utf8[1] = 0x80 | ((ucs4 >> 18) & 0x3f);
243    utf8[2] = 0x80 | ((ucs4 >> 12) & 0x3f);
244    utf8[3] = 0x80 | ((ucs4 >>  6) & 0x3f);
245    utf8[4] = 0x80 | ((ucs4 >>  0) & 0x3f);
246
247    return 5;
248  }
249  else if (ucs4 <= 0x7fffffffL) {
250    utf8[0] = 0xfc | ((ucs4 >> 30) & 0x01);
251    utf8[1] = 0x80 | ((ucs4 >> 24) & 0x3f);
252    utf8[2] = 0x80 | ((ucs4 >> 18) & 0x3f);
253    utf8[3] = 0x80 | ((ucs4 >> 12) & 0x3f);
254    utf8[4] = 0x80 | ((ucs4 >>  6) & 0x3f);
255    utf8[5] = 0x80 | ((ucs4 >>  0) & 0x3f);
256
257    return 6;
258  }
259
260  /* default */
261
262  return id3_utf8_encodechar(utf8, ID3_UCS4_REPLACEMENTCHAR);
263}
264
265/*
266 * NAME:	utf8->decode()
267 * DESCRIPTION:	decode a complete utf8 string into a ucs4 string
268 */
269void id3_utf8_decode(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
270{
271  do
272    utf8 += id3_utf8_decodechar(utf8, ucs4);
273  while (*ucs4++);
274}
275
276/*
277 * NAME:	utf8->encode()
278 * DESCRIPTION:	encode a complete ucs4 string into a utf8 string
279 */
280void id3_utf8_encode(id3_utf8_t *utf8, id3_ucs4_t const *ucs4)
281{
282  do
283    utf8 += id3_utf8_encodechar(utf8, *ucs4);
284  while (*ucs4++);
285}
286
287/*
288 * NAME:	utf8->put()
289 * DESCRIPTION:	serialize a single utf8 character
290 */
291id3_length_t id3_utf8_put(id3_byte_t **ptr, id3_utf8_t utf8)
292{
293  if (ptr)
294    *(*ptr)++ = utf8;
295
296  return 1;
297}
298
299/*
300 * NAME:	utf8->get()
301 * DESCRIPTION:	deserialize a single utf8 character
302 */
303id3_utf8_t id3_utf8_get(id3_byte_t const **ptr)
304{
305  return *(*ptr)++;
306}
307
308/*
309 * NAME:	utf8->serialize()
310 * DESCRIPTION:	serialize a ucs4 string using utf8 encoding
311 */
312id3_length_t id3_utf8_serialize(id3_byte_t **ptr, id3_ucs4_t const *ucs4,
313				int terminate)
314{
315  id3_length_t size = 0;
316  id3_utf8_t utf8[6], *out;
317
318  while (*ucs4) {
319    switch (id3_utf8_encodechar(out = utf8, *ucs4++)) {
320    case 6: size += id3_utf8_put(ptr, *out++);
321    case 5: size += id3_utf8_put(ptr, *out++);
322    case 4: size += id3_utf8_put(ptr, *out++);
323    case 3: size += id3_utf8_put(ptr, *out++);
324    case 2: size += id3_utf8_put(ptr, *out++);
325    case 1: size += id3_utf8_put(ptr, *out++);
326    case 0: break;
327    }
328  }
329
330  if (terminate)
331    size += id3_utf8_put(ptr, 0);
332
333  return size;
334}
335
336/*
337 * NAME:	utf8->deserialize()
338 * DESCRIPTION:	deserialize a ucs4 string using utf8 encoding
339 */
340id3_ucs4_t *id3_utf8_deserialize(id3_byte_t const **ptr, id3_length_t length)
341{
342  id3_byte_t const *end;
343  id3_utf8_t *utf8ptr, *utf8;
344  id3_ucs4_t *ucs4;
345
346  end = *ptr + length;
347
348  utf8 = malloc((length + 1) * sizeof(*utf8));
349  if (utf8 == 0)
350    return 0;
351
352  utf8ptr = utf8;
353  while (end - *ptr > 0 && (*utf8ptr = id3_utf8_get(ptr)))
354    ++utf8ptr;
355
356  *utf8ptr = 0;
357
358  ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
359  if (ucs4)
360    id3_utf8_decode(utf8, ucs4);
361
362  free(utf8);
363
364  return ucs4;
365}
366