1/*
2 * "$Id: transcode.c 11093 2013-07-03 20:48:42Z msweet $"
3 *
4 *   Transcoding support for CUPS.
5 *
6 *   Copyright 2007-2010 by Apple Inc.
7 *   Copyright 1997-2007 by Easy Software Products.
8 *
9 *   These coded instructions, statements, and computer programs are the
10 *   property of Apple Inc. and are protected by Federal copyright
11 *   law.  Distribution and use rights are outlined in the file "LICENSE.txt"
 *   which should have been included with this file.  If this
 *   file is missing or damaged, see the license at "http://www.cups.org/".
14 *
15 *   This file is subject to the Apple OS-Developed Software exception.
16 *
17 * Contents:
18 *
19 *   _cupsCharmapFlush() - Flush all character set maps out of cache.
20 *   cupsCharsetToUTF8() - Convert legacy character set to UTF-8.
21 *   cupsUTF8ToCharset() - Convert UTF-8 to legacy character set.
22 *   cupsUTF8ToUTF32()   - Convert UTF-8 to UTF-32.
23 *   cupsUTF32ToUTF8()   - Convert UTF-32 to UTF-8.
24 */
25
26/*
27 * Include necessary headers...
28 */
29
30#include "cups-private.h"
31#include <limits.h>
32#include <time.h>
33#ifdef HAVE_ICONV_H
34#  include <iconv.h>
35#endif /* HAVE_ICONV_H */
36
37
38/*
39 * Local globals...
40 */
41
42#ifdef HAVE_ICONV_H
43static _cups_mutex_t	map_mutex = _CUPS_MUTEX_INITIALIZER;
44					/* Mutex to control access to maps */
45static iconv_t		map_from_utf8 = (iconv_t)-1;
46					/* Convert from UTF-8 to charset */
47static iconv_t		map_to_utf8 = (iconv_t)-1;
48					/* Convert from charset to UTF-8 */
49static cups_encoding_t	map_encoding = CUPS_AUTO_ENCODING;
50					/* Which charset is cached */
51#endif /* HAVE_ICONV_H */
52
53
/*
 * '_cupsCharmapFlush()' - Flush all character set maps out of cache.
 *
 * Closes each cached iconv descriptor that is currently open, marks it as
 * closed, and resets the cached encoding to CUPS_AUTO_ENCODING.  This
 * function does not take map_mutex itself; the converters in this file call
 * it while already holding that mutex.  Without iconv support it does
 * nothing.
 */

void
_cupsCharmapFlush(void)
{
#ifdef HAVE_ICONV_H
  size_t	i;			/* Looping var */
  iconv_t	*maps[2];		/* Cached descriptors to release */


  maps[0] = &map_from_utf8;
  maps[1] = &map_to_utf8;

  for (i = 0; i < sizeof(maps) / sizeof(maps[0]); i ++)
  {
   /*
    * Skip descriptors that were never opened (or are already closed)...
    */

    if (*maps[i] == (iconv_t)-1)
      continue;

    iconv_close(*maps[i]);
    *maps[i] = (iconv_t)-1;
  }

  map_encoding = CUPS_AUTO_ENCODING;
#endif /* HAVE_ICONV_H */
}
77
78
79/*
80 * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
81 */
82
83int					/* O - Count or -1 on error */
84cupsCharsetToUTF8(
85    cups_utf8_t           *dest,	/* O - Target string */
86    const char            *src,		/* I - Source string */
87    const int             maxout,	/* I - Max output */
88    const cups_encoding_t encoding)	/* I - Encoding */
89{
90  cups_utf8_t	*destptr;		/* Pointer into UTF-8 buffer */
91#ifdef HAVE_ICONV_H
92  size_t	srclen,			/* Length of source string */
93		outBytesLeft;		/* Bytes remaining in output buffer */
94#endif /* HAVE_ICONV_H */
95
96
97 /*
98  * Check for valid arguments...
99  */
100
101  DEBUG_printf(("2cupsCharsetToUTF8(dest=%p, src=\"%s\", maxout=%d, encoding=%d)",
102	        dest, src, maxout, encoding));
103
104  if (!dest || !src || maxout < 1)
105  {
106    if (dest)
107      *dest = '\0';
108
109    DEBUG_puts("3cupsCharsetToUTF8: Bad arguments, returning -1");
110    return (-1);
111  }
112
113 /*
114  * Handle identity conversions...
115  */
116
117  if (encoding == CUPS_UTF8 || encoding <= CUPS_US_ASCII ||
118      encoding >= CUPS_ENCODING_VBCS_END)
119  {
120    strlcpy((char *)dest, src, maxout);
121    return ((int)strlen((char *)dest));
122  }
123
124 /*
125  * Handle ISO-8859-1 to UTF-8 directly...
126  */
127
128  destptr = dest;
129
130  if (encoding == CUPS_ISO8859_1)
131  {
132    int		ch;			/* Character from string */
133    cups_utf8_t	*destend;		/* End of UTF-8 buffer */
134
135
136    destend = dest + maxout - 2;
137
138    while (*src && destptr < destend)
139    {
140      ch = *src++ & 255;
141
142      if (ch & 128)
143      {
144	*destptr++ = 0xc0 | (ch >> 6);
145	*destptr++ = 0x80 | (ch & 0x3f);
146      }
147      else
148	*destptr++ = ch;
149    }
150
151    *destptr = '\0';
152
153    return ((int)(destptr - dest));
154  }
155
156 /*
157  * Convert input legacy charset to UTF-8...
158  */
159
160#ifdef HAVE_ICONV_H
161  _cupsMutexLock(&map_mutex);
162
163  if (map_encoding != encoding)
164  {
165    _cupsCharmapFlush();
166
167    map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
168    map_to_utf8   = iconv_open("UTF-8", _cupsEncodingName(encoding));
169    map_encoding     = encoding;
170  }
171
172  if (map_to_utf8 != (iconv_t)-1)
173  {
174    char *altdestptr = (char *)dest;	/* Silence bogus GCC type-punned */
175
176    srclen       = strlen(src);
177    outBytesLeft = maxout - 1;
178
179    iconv(map_to_utf8, (char **)&src, &srclen, &altdestptr, &outBytesLeft);
180    *altdestptr = '\0';
181
182    _cupsMutexUnlock(&map_mutex);
183
184    return ((int)(altdestptr - (char *)dest));
185  }
186
187  _cupsMutexUnlock(&map_mutex);
188#endif /* HAVE_ICONV_H */
189
190 /*
191  * No iconv() support, so error out...
192  */
193
194  *destptr = '\0';
195
196  return (-1);
197}
198
199
200/*
201 * 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
202 */
203
204int					/* O - Count or -1 on error */
205cupsUTF8ToCharset(
206    char		  *dest,	/* O - Target string */
207    const cups_utf8_t	  *src,		/* I - Source string */
208    const int		  maxout,	/* I - Max output */
209    const cups_encoding_t encoding)	/* I - Encoding */
210{
211  char		*destptr;		/* Pointer into destination */
212#ifdef HAVE_ICONV_H
213  size_t	srclen,			/* Length of source string */
214		outBytesLeft;		/* Bytes remaining in output buffer */
215#endif /* HAVE_ICONV_H */
216
217
218 /*
219  * Check for valid arguments...
220  */
221
222  if (!dest || !src || maxout < 1)
223  {
224    if (dest)
225      *dest = '\0';
226
227    return (-1);
228  }
229
230 /*
231  * Handle identity conversions...
232  */
233
234  if (encoding == CUPS_UTF8 ||
235      encoding >= CUPS_ENCODING_VBCS_END)
236  {
237    strlcpy(dest, (char *)src, maxout);
238    return ((int)strlen(dest));
239  }
240
241 /*
242  * Handle UTF-8 to ISO-8859-1 directly...
243  */
244
245  destptr = dest;
246
247  if (encoding == CUPS_ISO8859_1 || encoding <= CUPS_US_ASCII)
248  {
249    int		ch,			/* Character from string */
250		maxch;			/* Maximum character for charset */
251    char	*destend;		/* End of ISO-8859-1 buffer */
252
253    maxch   = encoding == CUPS_ISO8859_1 ? 256 : 128;
254    destend = dest + maxout - 1;
255
256    while (*src && destptr < destend)
257    {
258      ch = *src++;
259
260      if ((ch & 0xe0) == 0xc0)
261      {
262	ch = ((ch & 0x1f) << 6) | (*src++ & 0x3f);
263
264	if (ch < maxch)
265          *destptr++ = ch;
266	else
267          *destptr++ = '?';
268      }
269      else if ((ch & 0xf0) == 0xe0 ||
270               (ch & 0xf8) == 0xf0)
271        *destptr++ = '?';
272      else if (!(ch & 0x80))
273	*destptr++ = ch;
274    }
275
276    *destptr = '\0';
277
278    return ((int)(destptr - dest));
279  }
280
281#ifdef HAVE_ICONV_H
282 /*
283  * Convert input UTF-8 to legacy charset...
284  */
285
286  _cupsMutexLock(&map_mutex);
287
288  if (map_encoding != encoding)
289  {
290    _cupsCharmapFlush();
291
292    map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
293    map_to_utf8   = iconv_open("UTF-8", _cupsEncodingName(encoding));
294    map_encoding  = encoding;
295  }
296
297  if (map_from_utf8 != (iconv_t)-1)
298  {
299    char *altsrc = (char *)src;		/* Silence bogus GCC type-punned */
300
301    srclen       = strlen((char *)src);
302    outBytesLeft = maxout - 1;
303
304    iconv(map_from_utf8, &altsrc, &srclen, &destptr, &outBytesLeft);
305    *destptr = '\0';
306
307    _cupsMutexUnlock(&map_mutex);
308
309    return ((int)(destptr - dest));
310  }
311
312  _cupsMutexUnlock(&map_mutex);
313#endif /* HAVE_ICONV_H */
314
315 /*
316  * No iconv() support, so error out...
317  */
318
319  *destptr = '\0';
320
321  return (-1);
322}
323
324
325/*
326 * 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
327 *
328 * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
329 *
330 *   UTF-32 char     UTF-8 char(s)
331 *   --------------------------------------------------
332 *	  0 to 127 = 0xxxxxxx (US-ASCII)
333 *     128 to 2047 = 110xxxxx 10yyyyyy
334 *   2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
335 *	   > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
336 *
337 * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
338 * which would convert to five- or six-octet UTF-8 sequences...
339 */
340
341int					/* O - Count or -1 on error */
342cupsUTF8ToUTF32(
343    cups_utf32_t      *dest,		/* O - Target string */
344    const cups_utf8_t *src,		/* I - Source string */
345    const int         maxout)		/* I - Max output */
346{
347  int		i;			/* Looping variable */
348  cups_utf8_t	ch;			/* Character value */
349  cups_utf8_t	next;			/* Next character value */
350  cups_utf32_t	ch32;			/* UTF-32 character value */
351
352
353 /*
354  * Check for valid arguments and clear output...
355  */
356
357  DEBUG_printf(("2cupsUTF8ToUTF32(dest=%p, src=\"%s\", maxout=%d)", dest,
358                src, maxout));
359
360  if (dest)
361    *dest = 0;
362
363  if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
364  {
365    DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad arguments)");
366
367    return (-1);
368  }
369
370 /*
371  * Convert input UTF-8 to output UTF-32...
372  */
373
374  for (i = maxout - 1; *src && i > 0; i --)
375  {
376    ch = *src++;
377
378   /*
379    * Convert UTF-8 character(s) to UTF-32 character...
380    */
381
382    if (!(ch & 0x80))
383    {
384     /*
385      * One-octet UTF-8 <= 127 (US-ASCII)...
386      */
387
388      *dest++ = ch;
389
390      DEBUG_printf(("4cupsUTF8ToUTF32: %02x => %08X", src[-1], ch));
391      continue;
392    }
393    else if ((ch & 0xe0) == 0xc0)
394    {
395     /*
396      * Two-octet UTF-8 <= 2047 (Latin-x)...
397      */
398
399      next = *src++;
400      if ((next & 0xc0) != 0x80)
401      {
402        DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
403
404	return (-1);
405      }
406
407      ch32 = ((ch & 0x1f) << 6) | (next & 0x3f);
408
409     /*
410      * Check for non-shortest form (invalid UTF-8)...
411      */
412
413      if (ch32 < 0x80)
414      {
415        DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
416
417	return (-1);
418      }
419
420      *dest++ = ch32;
421
422      DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x => %08X",
423                    src[-2], src[-1], (unsigned)ch32));
424    }
425    else if ((ch & 0xf0) == 0xe0)
426    {
427     /*
428      * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
429      */
430
431      next = *src++;
432      if ((next & 0xc0) != 0x80)
433      {
434        DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
435
436	return (-1);
437      }
438
439      ch32 = ((ch & 0x0f) << 6) | (next & 0x3f);
440
441      next = *src++;
442      if ((next & 0xc0) != 0x80)
443      {
444        DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
445
446	return (-1);
447      }
448
449      ch32 = (ch32 << 6) | (next & 0x3f);
450
451     /*
452      * Check for non-shortest form (invalid UTF-8)...
453      */
454
455      if (ch32 < 0x800)
456      {
457        DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
458
459	return (-1);
460      }
461
462      *dest++ = ch32;
463
464      DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x => %08X",
465                    src[-3], src[-2], src[-1], (unsigned)ch32));
466    }
467    else if ((ch & 0xf8) == 0xf0)
468    {
469     /*
470      * Four-octet UTF-8...
471      */
472
473      next = *src++;
474      if ((next & 0xc0) != 0x80)
475      {
476        DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
477
478	return (-1);
479      }
480
481      ch32 = ((ch & 0x07) << 6) | (next & 0x3f);
482
483      next = *src++;
484      if ((next & 0xc0) != 0x80)
485      {
486        DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
487
488	return (-1);
489      }
490
491      ch32 = (ch32 << 6) | (next & 0x3f);
492
493      next = *src++;
494      if ((next & 0xc0) != 0x80)
495      {
496        DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
497
498	return (-1);
499      }
500
501      ch32 = (ch32 << 6) | (next & 0x3f);
502
503     /*
504      * Check for non-shortest form (invalid UTF-8)...
505      */
506
507      if (ch32 < 0x10000)
508      {
509        DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
510
511	return (-1);
512      }
513
514      *dest++ = ch32;
515
516      DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x %02x => %08X",
517                    src[-4], src[-3], src[-2], src[-1], (unsigned)ch32));
518    }
519    else
520    {
521     /*
522      * More than 4-octet (invalid UTF-8 sequence)...
523      */
524
525      DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
526
527      return (-1);
528    }
529
530   /*
531    * Check for UTF-16 surrogate (illegal UTF-8)...
532    */
533
534    if (ch32 >= 0xd800 && ch32 <= 0xdfff)
535      return (-1);
536  }
537
538  *dest = 0;
539
540  DEBUG_printf(("3cupsUTF8ToUTF32: Returning %d characters", maxout - 1 - i));
541
542  return (maxout - 1 - i);
543}
544
545
546/*
547 * 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
548 *
549 * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
550 *
551 *   UTF-32 char     UTF-8 char(s)
552 *   --------------------------------------------------
553 *	  0 to 127 = 0xxxxxxx (US-ASCII)
554 *     128 to 2047 = 110xxxxx 10yyyyyy
555 *   2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
556 *	   > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
557 *
558 * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
559 * which would convert to five- or six-octet UTF-8 sequences...
560 */
561
562int					/* O - Count or -1 on error */
563cupsUTF32ToUTF8(
564    cups_utf8_t        *dest,		/* O - Target string */
565    const cups_utf32_t *src,		/* I - Source string */
566    const int          maxout)		/* I - Max output */
567{
568  cups_utf8_t	*start;			/* Start of destination string */
569  int		i;			/* Looping variable */
570  int		swap;			/* Byte-swap input to output */
571  cups_utf32_t	ch;			/* Character value */
572
573
574 /*
575  * Check for valid arguments and clear output...
576  */
577
578  DEBUG_printf(("2cupsUTF32ToUTF8(dest=%p, src=%p, maxout=%d)", dest, src,
579                maxout));
580
581  if (dest)
582    *dest = '\0';
583
584  if (!dest || !src || maxout < 1)
585  {
586    DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (bad args)");
587
588    return (-1);
589  }
590
591 /*
592  * Check for leading BOM in UTF-32 and inverted BOM...
593  */
594
595  start = dest;
596  swap  = *src == 0xfffe0000;
597
598  DEBUG_printf(("4cupsUTF32ToUTF8: swap=%d", swap));
599
600  if (*src == 0xfffe0000 || *src == 0xfeff)
601    src ++;
602
603 /*
604  * Convert input UTF-32 to output UTF-8...
605  */
606
607  for (i = maxout - 1; *src && i > 0;)
608  {
609    ch = *src++;
610
611   /*
612    * Byte swap input UTF-32, if necessary...
613    * (only byte-swapping 24 of 32 bits)
614    */
615
616    if (swap)
617      ch = ((ch >> 24) | ((ch >> 8) & 0xff00) | ((ch << 8) & 0xff0000));
618
619   /*
620    * Check for beyond Plane 16 (invalid UTF-32)...
621    */
622
623    if (ch > 0x10ffff)
624    {
625      DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (character out of range)");
626
627      return (-1);
628    }
629
630   /*
631    * Convert UTF-32 character to UTF-8 character(s)...
632    */
633
634    if (ch < 0x80)
635    {
636     /*
637      * One-octet UTF-8 <= 127 (US-ASCII)...
638      */
639
640      *dest++ = (cups_utf8_t)ch;
641      i --;
642
643      DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x", (unsigned)ch, dest[-1]));
644    }
645    else if (ch < 0x800)
646    {
647     /*
648      * Two-octet UTF-8 <= 2047 (Latin-x)...
649      */
650
651      if (i < 2)
652      {
653        DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 2)");
654
655        return (-1);
656      }
657
658      *dest++ = (cups_utf8_t)(0xc0 | ((ch >> 6) & 0x1f));
659      *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
660      i -= 2;
661
662      DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x", (unsigned)ch,
663                    dest[-2], dest[-1]));
664    }
665    else if (ch < 0x10000)
666    {
667     /*
668      * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
669      */
670
671      if (i < 3)
672      {
673        DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 3)");
674
675        return (-1);
676      }
677
678      *dest++ = (cups_utf8_t)(0xe0 | ((ch >> 12) & 0x0f));
679      *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
680      *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
681      i -= 3;
682
683      DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x", (unsigned)ch,
684                    dest[-3], dest[-2], dest[-1]));
685    }
686    else
687    {
688     /*
689      * Four-octet UTF-8...
690      */
691
692      if (i < 4)
693      {
694        DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 4)");
695
696        return (-1);
697      }
698
699      *dest++ = (cups_utf8_t)(0xf0 | ((ch >> 18) & 0x07));
700      *dest++ = (cups_utf8_t)(0x80 | ((ch >> 12) & 0x3f));
701      *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
702      *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
703      i -= 4;
704
705      DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x %02x",
706                    (unsigned)ch, dest[-4], dest[-3], dest[-2], dest[-1]));
707    }
708  }
709
710  *dest = '\0';
711
712  DEBUG_printf(("3cupsUTF32ToUTF8: Returning %d", (int)(dest - start)));
713
714  return ((int)(dest - start));
715}
716
717
718/*
719 * End of "$Id: transcode.c 11093 2013-07-03 20:48:42Z msweet $"
720 */
721