1/*---------------------------------------------------------------------------*
2 |              PDFlib - A library for generating PDF on the fly             |
3 +---------------------------------------------------------------------------+
4 | Copyright (c) 1997-2004 Thomas Merz and PDFlib GmbH. All rights reserved. |
5 +---------------------------------------------------------------------------+
6 |                                                                           |
7 |    This software is subject to the PDFlib license. It is NOT in the       |
8 |    public domain. Extended versions and commercial licenses are           |
9 |    available, please check http://www.pdflib.com.                         |
10 |                                                                           |
11 *---------------------------------------------------------------------------*/
12
13/* $Id: pc_unicode.c 14574 2005-10-29 16:27:43Z bonefish $
14 *
15 * PDFlib routines for converting between Unicode values and Adobe glyph names
16 *
17 */
18
19#include "pc_util.h"
20#include "pc_chartabs.h"
21
22
23/*
24 * Returns the Unicode value of a glyph name. If the name is not
25 * contained in the Adobe Glyph List (AGL) 0 will be returned.
26 */
27
28pdc_ushort
29pdc_adobe2unicode(const char *name)
30{
31    int lo = 0;
32    int hi = ((sizeof tab_agl2uni) / (sizeof (pdc_glyph_tab)));
33
34    if (name)
35    {
36        while (lo < hi)
37        {
38            int i = (lo + hi) / 2;
39            int cmp = strcmp(name, tab_agl2uni[i].glyphname);
40
41            if (cmp == 0)
42                return tab_agl2uni[i].code;
43
44            if (cmp < 0)
45                hi = i;
46            else
47                lo = i + 1;
48        }
49    }
50
51    return 0;
52}
53
54/*
55 * Returns the name in the Adobe Glyph List which corresponds to
56 * the supplied Unicode value. If the value doesn't have a
57 * corresponding Unicode name NULL will be returned.
58 */
59
60const char *
61pdc_unicode2adobe(pdc_ushort uv)
62{
63    int lo = 0;
64    int hi = ((sizeof tab_uni2agl) / (sizeof (pdc_glyph_tab)));
65
66    if (uv)
67    {
68        while (lo < hi)
69        {
70            int i = (lo + hi) / 2;
71
72            if (uv == tab_uni2agl[i].code)
73                return tab_uni2agl[i].glyphname;
74
75            if (uv < tab_uni2agl[i].code)
76                hi = i;
77            else
78                lo = i + 1;
79        }
80    }
81
82    return (char *) 0;
83}
84
85
86
87/*
88 * Returns true if a character name is contained in pc_standard_latin_charset.
89 * Otherwise false will be returned.
90 */
91
92pdc_bool
93pdc_is_std_charname(const char *name)
94{
95    int lo = 0;
96    int hi = ((sizeof pc_standard_latin_charset) / (sizeof (char *)));
97
98    if (name)
99    {
100        while (lo < hi)
101        {
102            int i = (lo + hi) / 2;
103            int cmp = strcmp(name, pc_standard_latin_charset[i]);
104
105            if (cmp == 0)
106                return pdc_true;
107
108            if (cmp < 0)
109                hi = i;
110            else
111                lo = i + 1;
112        }
113    }
114
115    return pdc_false;
116}
117
118/*
119 *  The following source is based on Unicode's original source
120 *  code ConvertUTF.c. It has been adapted to PDFlib programming
121 *  conventions.
122 *
123 *  The original file had the following notice:
124 *
125 *      Copyright 2001 Unicode, Inc.
126 *
127 *      Limitations on Rights to Redistribute This Code
128 *
129 *      Author: Mark E. Davis, 1994.
130 *      Rev History: Rick McGowan, fixes & updates May 2001.
131 *
132 *
 *  Functions for conversions between UTF-32, UTF-16, and UTF-8.
 *  These functions form a complete set of conversions between
 *  the three formats. UTF-7 is not included here.
136 *
137 *  Each of these routines takes pointers to input buffers and output
138 *  buffers. The input buffers are const.
139 *
140 *  Each routine converts the text between *sourceStart and sourceEnd,
141 *  putting the result into the buffer between *targetStart and
142 *  targetEnd. Note: the end pointers are *after* the last item: e.g.
143 *  *(sourceEnd - 1) is the last item.
144 *
145 *  The return result indicates whether the conversion was successful,
146 *  and if not, whether the problem was in the source or target buffers.
147 *  (Only the first encountered problem is indicated.)
148 *
149 *  After the conversion, *sourceStart and *targetStart are both
150 *  updated to point to the end of last text successfully converted in
151 *  the respective buffers.
152 *
153 *  Input parameters:
154 *      sourceStart - pointer to a pointer to the source buffer.
155 *              The contents of this are modified on return so that
156 *              it points at the next thing to be converted.
157 *      targetStart - similarly, pointer to pointer to the target buffer.
158 *      sourceEnd, targetEnd - respectively pointers to the ends of the
159 *              two buffers, for overflow checking only.
160 *
161 *  These conversion functions take a pdc_convers_flags argument. When this
162 *  flag is set to strict, both irregular sequences and isolated surrogates
163 *  will cause an error.  When the flag is set to lenient, both irregular
164 *  sequences and isolated surrogates are converted.
165 *
166 *  Whether the flag is strict or lenient, all illegal sequences will cause
167 *  an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
168 *  or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
169 *  must check for illegal sequences.
170 *
171 *  When the flag is set to lenient, characters over 0x10FFFF are converted
172 *  to the replacement character; otherwise (when the flag is set to strict)
173 *  they constitute an error.
174 *
175 *  Output parameters:
176 *      The value "sourceIllegal" is returned from some routines if the input
177 *      sequence is malformed.  When "sourceIllegal" is returned, the source
178 *      value will point to the illegal value that caused the problem. E.g.,
179 *      in UTF-8 when a sequence is malformed, it points to the start of the
180 *      malformed sequence.
181 *
182 *  Author: Mark E. Davis, 1994.
183 *  Rev History: Rick McGowan, fixes & updates May 2001.
184 *
185 */
186
/*
 * The following three typedefs are compiler-specific.
 * The C standard does not guarantee that wchar_t has at least
 * 16 bits, so wchar_t is no less portable than unsigned short!
 * All should be unsigned values to avoid sign extension during
 * bit mask & shift operations.
 *
 * Note that "unsigned long" is only guaranteed to have at least 32 bits;
 * it may be wider, so code must not rely on UTF32 arithmetic wrapping
 * at 2^32.
 */

typedef unsigned long   UTF32;  /* at least 32 bits */
typedef unsigned short  UTF16;  /* at least 16 bits */
typedef unsigned char   UTF8;   /* typically 8 bits */

/*
 * Some fundamental constants. Each replacement list is fully
 * parenthesized so that the macro behaves like a single primary
 * expression wherever it is expanded (e.g. as operand of sizeof).
 */
#define UNI_SUR_HIGH_START      ((UTF32)0xD800)     /* first high surrogate */
#define UNI_SUR_HIGH_END        ((UTF32)0xDBFF)     /* last high surrogate */
#define UNI_SUR_LOW_START       ((UTF32)0xDC00)     /* first low surrogate */
#define UNI_SUR_LOW_END         ((UTF32)0xDFFF)     /* last low surrogate */
#define UNI_REPLACEMENT_CHAR    ((UTF32)0x0000FFFD)
#define UNI_MAX_BMP             ((UTF32)0x0000FFFF)
#define UNI_MAX_UTF16           ((UTF32)0x0010FFFF)
#define UNI_MAX_UTF32           ((UTF32)0x7FFFFFFF)

static const int halfShift      = 10; /* used for shifting by 10 bits */

/* value subtracted before / added after splitting into surrogates */
static const UTF32 halfBase     = 0x0010000UL;
/* selects the low 10 bits that go into the low surrogate */
static const UTF32 halfMask     = 0x3FFUL;
213
214
215/* --------------------------------------------------------------------- */
216
217#if 0
218static pdc_convers_result
219pdc_convertUTF32toUTF16 (
220                UTF32** sourceStart, const UTF32* sourceEnd,
221                UTF16** targetStart, const UTF16* targetEnd,
222                const pdc_convers_flags flags) {
223    pdc_convers_result result = conversionOK;
224    UTF32* source = *sourceStart;
225    UTF16* target = *targetStart;
226    while (source < sourceEnd) {
227        UTF32 ch;
228        if (target >= targetEnd) {
229            result = targetExhausted; break;
230        }
231        ch = *source++;
232        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
233            if ((flags == strictConversion) &&
234                (ch >= UNI_SUR_HIGH_START &&
235                 ch <= UNI_SUR_LOW_END)) {
236                --source; /* return to the illegal value itself */
237                result = sourceIllegal;
238                break;
239            } else {
240                *target++ = (UTF16) ch;     /* normal case */
241            }
242        } else if (ch > UNI_MAX_UTF16) {
243            if (flags == strictConversion) {
244                result = sourceIllegal;
245            } else {
246                *target++ = UNI_REPLACEMENT_CHAR;
247            }
248        } else {
249            /* target is a character in range 0xFFFF - 0x10FFFF. */
250            if (target + 1 >= targetEnd) {
251                result = targetExhausted;
252                break;
253            }
254            ch -= halfBase;
255            *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
256            *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
257        }
258    }
259    *sourceStart = source;
260    *targetStart = target;
261    return result;
262}
263
264/* --------------------------------------------------------------------- */
265
266static pdc_convers_result
267pdc_convertUTF16toUTF32 (
268                UTF16** sourceStart, UTF16* sourceEnd,
269                UTF32** targetStart, const UTF32* targetEnd,
270                const pdc_convers_flags flags) {
271    pdc_convers_result result = conversionOK;
272    UTF16* source = *sourceStart;
273    UTF32* target = *targetStart;
274    UTF32 ch, ch2;
275    while (source < sourceEnd) {
276        ch = *source++;
277        if (ch >= UNI_SUR_HIGH_START &&
278            ch <= UNI_SUR_HIGH_END &&
279            source < sourceEnd) {
280            ch2 = *source;
281            if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
282                ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
283                      + (ch2 - UNI_SUR_LOW_START) + halfBase;
284                ++source;
285            } else if (flags == strictConversion) {
286                /* it's an unpaired high surrogate */
287                --source; /* return to the illegal value itself */
288                result = sourceIllegal;
289                break;
290            }
291        } else if ((flags == strictConversion) &&
292                   (ch >= UNI_SUR_LOW_START &&
293                    ch <= UNI_SUR_LOW_END)) {
294            /* an unpaired low surrogate */
295            --source; /* return to the illegal value itself */
296            result = sourceIllegal;
297            break;
298        }
299        if (target >= targetEnd) {
300            result = targetExhausted;
301            break;
302        }
303        *target++ = ch;
304    }
305    *sourceStart = source;
306    *targetStart = target;
307#ifdef CVTUTF_DEBUG
308if (result == sourceIllegal) {
309    fprintf(stderr, "pdc_convertUTF16toUTF32 illegal seq 0x%04x,%04x\n",
310            ch, ch2);
311    fflush(stderr);
312}
313#endif
314    return result;
315}
316#endif
317
318/* --------------------------------------------------------------------- */
319
/*
 * Number of trailing bytes that are supposed to follow a UTF-8 lead
 * byte; index the table with the first byte of the sequence.
 * One row per high nibble of the lead byte.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
334
335#if 0
336static const char
337pdc_get_trailingBytesForUTF8(int i) {
338    return (trailingBytesForUTF8[i]);
339}
340#endif
341
342/*
343 * Magic values subtracted from a buffer value during UTF8 conversion.
344 * This table contains as many values as there might be trailing bytes
345 * in a UTF-8 sequence.
346 */
347static const UTF32 offsetsFromUTF8[6] = {
348    0x00000000UL, 0x00003080UL, 0x000E2080UL,
349    0x03C82080UL, 0xFA082080UL, 0x82082080UL
350};
351
352/*
353 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
354 * into the first byte, depending on how many bytes follow.  There are
355 * as many entries in this table as there are UTF-8 sequence types.
356 * (I.e., one byte sequence, two byte... six byte sequence.)
357 */
358static const UTF8 firstByteMark[7] = {
359    0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
360};
361
362/* --------------------------------------------------------------------- */
363
364/* The interface converts a whole buffer to avoid function-call overhead.
365 * Constants have been gathered. Loops & conditionals have been removed as
366 * much as possible for efficiency, in favor of drop-through switches.
367 * (See "Note A" at the bottom of the file for equivalent code.)
368 * If your compiler supports it, the "pdc_islegalUTF8" call can be turned
369 * into an inline function.
370 */
371
372/* --------------------------------------------------------------------- */
373
374static pdc_convers_result
375pdc_convertUTF16toUTF8 (
376                UTF16** sourceStart, const UTF16* sourceEnd,
377                UTF8** targetStart, const UTF8* targetEnd,
378                const pdc_convers_flags flags) {
379    pdc_convers_result result = conversionOK;
380    UTF16* source = *sourceStart;
381    UTF8* target = *targetStart;
382    while (source < sourceEnd) {
383        UTF32 ch;
384        unsigned short bytesToWrite = 0;
385        const UTF32 byteMask = 0xBF;
386        const UTF32 byteMark = 0x80;
387        ch = *source++;
388        /* If we have a surrogate pair, convert to UTF32 first. */
389        if (ch >= UNI_SUR_HIGH_START &&
390            ch <= UNI_SUR_HIGH_END &&
391            source < sourceEnd) {
392            UTF32 ch2 = *source;
393            if (ch2 >= UNI_SUR_LOW_START &&
394                ch2 <= UNI_SUR_LOW_END) {
395                ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
396                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
397                ++source;
398            } else if (flags == strictConversion) {
399                /* it's an unpaired high surrogate */
400                --source; /* return to the illegal value itself */
401                result = sourceIllegal;
402                break;
403            }
404        } else if ((flags == strictConversion) &&
405                   (ch >= UNI_SUR_LOW_START &&
406                    ch <= UNI_SUR_LOW_END)) {
407            --source; /* return to the illegal value itself */
408            result = sourceIllegal;
409            break;
410        }
411        /* Figure out how many bytes the result will require */
412        if (ch < (UTF32)0x80) {                 bytesToWrite = 1;
413        } else if (ch < (UTF32)0x800) {         bytesToWrite = 2;
414        } else if (ch < (UTF32)0x10000) {       bytesToWrite = 3;
415        } else if (ch < (UTF32)0x200000) {      bytesToWrite = 4;
416        } else {                                bytesToWrite = 2;
417                                                ch = UNI_REPLACEMENT_CHAR;
418        }
419
420        target += bytesToWrite;
421        if (target > targetEnd) {
422            target -= bytesToWrite; result = targetExhausted; break;
423        }
424        switch (bytesToWrite) { /* note: everything falls through. */
425            case 4: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
426            case 3: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
427            case 2: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
428            case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
429        }
430        target += bytesToWrite;
431    }
432    *sourceStart = source;
433    *targetStart = target;
434    return result;
435}
436
437/* --------------------------------------------------------------------- */
438
439/*
440 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
441 * This must be called with the length pre-determined by the first byte.
442 * If not calling this from pdc_convertUTF8to*, then the length can be set by:
443 *      length = trailingBytesForUTF8[*source]+1;
444 * and the sequence is illegal right away if there aren't that many bytes
445 * available.
446 * If presented with a length > 4, this returns pdc_false.  The Unicode
447 * definition of UTF-8 goes up to 4-byte sequences.
448 */
449
450static pdc_bool
451pdc_islegalUTF8(UTF8 *source, int length) {
452    UTF8 a;
453    UTF8 *srcptr = source+length;
454    switch (length) {
455    default: return pdc_false;
456        /* Everything else falls through when "pdc_true"... */
457    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return pdc_false;
458    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return pdc_false;
459    case 2: if ((a = (*--srcptr)) > 0xBF) return pdc_false;
460        switch (*source) {
461            /* no fall-through in this inner switch */
462            case 0xE0: if (a < 0xA0) return pdc_false; break;
463            case 0xF0: if (a < 0x90) return pdc_false; break;
464            case 0xF4: if (a > 0x8F) return pdc_false; break;
465            default:  if (a < 0x80) return pdc_false;
466        }
467    case 1: if (*source >= 0x80 && *source < 0xC2) return pdc_false;
468            if (*source > 0xF4) return pdc_false;
469    }
470    return pdc_true;
471}
472
473/* --------------------------------------------------------------------- */
474
475#if 0
476/*
477 * Exported function to return whether a UTF-8 sequence is legal or not.
478 * This is not used here; it's just exported.
479 */
480static pdc_bool pdc_islegalUTF8sequence(UTF8 *source, UTF8 *sourceEnd) {
481    int length = trailingBytesForUTF8[*source]+1;
482    if (source+length > sourceEnd) {
483        return pdc_false;
484    }
485    return pdc_islegalUTF8(source, length);
486}
487#endif
488
489/* --------------------------------------------------------------------- */
490
491static pdc_convers_result
492pdc_convertUTF8toUTF16 (
493                UTF8** sourceStart, UTF8* sourceEnd,
494                UTF16** targetStart, const UTF16* targetEnd,
495                const pdc_convers_flags flags) {
496    pdc_convers_result result = conversionOK;
497    UTF8* source = *sourceStart;
498    UTF16* target = *targetStart;
499    while (source < sourceEnd) {
500        UTF32 ch = 0L;
501        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
502        if (source + extraBytesToRead >= sourceEnd) {
503            result = sourceExhausted;
504            break;
505        }
506        /* Do this check whether lenient or strict */
507        if (! pdc_islegalUTF8(source, extraBytesToRead+1)) {
508            result = sourceIllegal;
509            break;
510        }
511        /*
512         * The cases all fall through. See "Note A" below.
513         */
514        switch (extraBytesToRead) {
515            case 3: ch += *source++; ch <<= 6;
516            case 2: ch += *source++; ch <<= 6;
517            case 1: ch += *source++; ch <<= 6;
518            case 0: ch += *source++;
519        }
520        ch -= offsetsFromUTF8[extraBytesToRead];
521
522        if (target >= targetEnd) {
523            result = targetExhausted;
524            break;
525        }
526        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
527            if ((flags == strictConversion) &&
528                (ch >= UNI_SUR_HIGH_START &&
529                 ch <= UNI_SUR_LOW_END)) {
530                --source; /* return to the illegal value itself */
531                result = sourceIllegal;
532                break;
533            } else {
534                *target++ = (UTF16) ch;     /* normal case */
535            }
536        } else if (ch > UNI_MAX_UTF16) {
537            if (flags == strictConversion) {
538                    result = sourceIllegal;
539                    source -= extraBytesToRead; /* return to the start */
540            } else {
541                    *target++ = UNI_REPLACEMENT_CHAR;
542            }
543        } else {
544            /* target is a character in range 0xFFFF - 0x10FFFF. */
545            if (target + 1 >= targetEnd) {
546                    result = targetExhausted;
547                    break;
548            }
549            ch -= halfBase;
550            *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
551            *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
552        }
553    }
554    *sourceStart = source;
555    *targetStart = target;
556    return result;
557}
558
559/* --------------------------------------------------------------------- */
560
561#if 0
562static pdc_convers_result
563pdc_convertUTF32toUTF8 (
564                UTF32** sourceStart, const UTF32* sourceEnd,
565                UTF8** targetStart, const UTF8* targetEnd,
566                const pdc_convers_flags flags) {
567    pdc_convers_result result = conversionOK;
568    UTF32* source = *sourceStart;
569    UTF8* target = *targetStart;
570    while (source < sourceEnd) {
571        UTF32 ch;
572        unsigned short bytesToWrite = 0;
573        const UTF32 byteMask = 0x000000BF;
574        const UTF32 byteMark = 0x00000080;
575        ch = *source++;
576        /* surrogates of any stripe are not legal UTF32 characters */
577        if (flags == strictConversion ) {
578            if ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_LOW_END)) {
579                --source; /* return to the illegal value itself */
580                result = sourceIllegal;
581                break;
582            }
583        }
584        /* Figure out how many bytes the result will require */
585        if (ch < (UTF32)0x80) {                 bytesToWrite = 1;
586        } else if (ch < (UTF32)0x800) {         bytesToWrite = 2;
587        } else if (ch < (UTF32)0x10000) {       bytesToWrite = 3;
588        } else if (ch < (UTF32)0x200000) {      bytesToWrite = 4;
589        } else {                                bytesToWrite = 2;
590                                                ch = UNI_REPLACEMENT_CHAR;
591        }
592
593        target += bytesToWrite;
594        if (target > targetEnd) {
595            target -= bytesToWrite; result = targetExhausted; break;
596        }
597        switch (bytesToWrite) { /* note: everything falls through. */
598            case 4: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
599            case 3: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
600            case 2: *--target = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
601            case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
602        }
603        target += bytesToWrite;
604    }
605    *sourceStart = source;
606    *targetStart = target;
607    return result;
608}
609
610/* --------------------------------------------------------------------- */
611
612static pdc_convers_result
613pdc_convertUTF8toUTF32 (
614                UTF8** sourceStart, UTF8* sourceEnd,
615                UTF32** targetStart, const UTF32* targetEnd,
616                const pdc_convers_flags flags) {
617    pdc_convers_result result = conversionOK;
618    UTF8* source = *sourceStart;
619    UTF32* target = *targetStart;
620
621    (void) flags;
622
623    while (source < sourceEnd) {
624        UTF32 ch = 0;
625        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
626        if (source + extraBytesToRead >= sourceEnd) {
627            result = sourceExhausted; break;
628        }
629        /* Do this check whether lenient or strict */
630        if (! pdc_islegalUTF8(source, extraBytesToRead+1)) {
631            result = sourceIllegal;
632            break;
633        }
634        /*
635         * The cases all fall through. See "Note A" below.
636         */
637        switch (extraBytesToRead) {
638            case 3: ch += *source++; ch <<= 6;
639            case 2: ch += *source++; ch <<= 6;
640            case 1: ch += *source++; ch <<= 6;
641            case 0: ch += *source++;
642        }
643        ch -= offsetsFromUTF8[extraBytesToRead];
644
645        if (target >= targetEnd) {
646            result = targetExhausted;
647            break;
648        }
649        if (ch <= UNI_MAX_UTF32) {
650            *target++ = ch;
651        } else if (ch > UNI_MAX_UTF32) {
652            *target++ = UNI_REPLACEMENT_CHAR;
653        } else {
654            if (target + 1 >= targetEnd) {
655                result = targetExhausted;
656                break;
657            }
658            ch -= halfBase;
659            *target++ = (ch >> halfShift) + UNI_SUR_HIGH_START;
660            *target++ = (ch & halfMask) + UNI_SUR_LOW_START;
661        }
662    }
663    *sourceStart = source;
664    *targetStart = target;
665    return result;
666}
667#endif
668
669/* ---------------------------------------------------------------------
670
671        Note A.
672        The fall-through switches in UTF-8 reading code save a
673        temp variable, some decrements & conditionals.  The switches
674        are equivalent to the following loop:
675                {
676                        int tmpBytesToRead = extraBytesToRead+1;
677                        do {
678                                ch += *source++;
679                                --tmpBytesToRead;
680                                if (tmpBytesToRead) ch <<= 6;
681                        } while (tmpBytesToRead > 0);
682                }
683        In UTF-8 writing code, the switches on "bytesToWrite" are
684        similarly unrolled loops.
685
686   --------------------------------------------------------------------- */
687
688/*
 *  pdc_convert_string converts an arbitrarily encoded string (possibly UTF)
 *  to another string.
691 *
692 *  The new converted string is allocated and terminated by required zeros.
693 *  The caller is responsible for freeing the string buffer.
694 *
695 *
696 *  LBP: low byte picking
697 *
698 *  Input-Parameter:
699 *
700 *  inutf:      input string format (see pc_unicode.h):
701 *
702 *              pdc_auto:     If a BOM is recognized:
703 *                                pdc_utf8 or pdc_utf16xx resp.
704 *                            Otherwise if input encoding <inev> is specified:
705 *                                pdc_bytes
706 *                            Otherwise:
707 *                                pdc_utf16
708 *
 *              pdc_auto2:    If input encoding is not specified:
 *                                pdc_utf16
 *                            Otherwise after successful LBP:
 *                                pdc_auto
 *                            Otherwise
 *                                pdc_utf16
 *
 *              pdc_bytes:    8-bit string. Encoding is <inev> if specified.
 *
 *              pdc_bytes2:   After successful LBP:
 *                                pdc_bytes
 *                            Otherwise
 *                                pdc_utf16
722 *
723 *              pdc_utf8:     UTF-8 formatted string.
724 *
725 *              pdc_utf16:    If a UTF16 BOM is recognized:
726 *                                pdc_utf16be or pdc_utf16le
727 *                            Otherwise UTF-16 machine byte ordered string.
728 *
729 *              pdc_utf16be   UTF-16 big endian formatted string.
730 *
731 *              pdc_utf16le   UTF-16 little endian formatted string.
732 *
733 *  inev:       Encoding vector for input pdc_bytes string.
734 *
735 *  instring:   Input string.
736 *
737 *  inlen:      Length of input string in byte.
738 *
739 *  oututf:     Target format for output string.
740 *              pdc_auto, pdc_auto2 and pdc_bytes2 are not supported.
741 *
742 *  outev:      Encoding vector for output pdc_bytes string.
743 *
744 *  flags:      PDC_CONV_KEEPBYTES:
745 *              Input pdc_bytes strings will be kept differing from oututf.
746 *              *oututf: pdc_byte.
747 *
748 *              PDC_CONV_TRY7BYTES:
749 *              UTF-8 output strings will have no BOM if every byte
750 *              is smaller than x80.
751 *              *oututf: pdc_byte.
752 *
 *              PDC_CONV_TRYBYTES:
 *              UTF-16xx output strings will be converted by LBP
 *              if every character is smaller than x0100.
 *              *oututf: pdc_byte.
 *
 *              PDC_CONV_WITHBOM:
 *              UTF-8 or UTF-16xx output strings will be armed
 *              with an appropriate BOM.
 *
 *              PDC_CONV_NOBOM:
 *              In UTF-8 or UTF-16xx output strings any BOM sequence
 *              will be removed.
765 *
766 *  verbose:    Error messages are put out. Otherwise they are saved only.
767 *
768 *  Output-Parameter:
769 *
770 *  oututf:     Reached format for output string.
771 *
772 *  outstring:  Pointer of allocated output string
773 *
774 *  outlen:     Length of output string.
775 *
776 */
777
778int
779pdc_convert_string(pdc_core *pdc,
780                   pdc_text_format inutf, pdc_encodingvector *inev,
781                   pdc_byte *instring, int inlen,
782                   pdc_text_format *oututf_p, pdc_encodingvector *outev,
783                   pdc_byte **outstring, int *outlen, int flags,
784                   pdc_bool verbose)
785{
786    static const char *fn = "pdc_convert_string";
787    pdc_text_format oututf = *oututf_p;
788    pdc_text_format oututf_s;
789    pdc_ushort *usinstr = (pdc_ushort *) instring;
790    pdc_ushort uv = 0;
791    pdc_byte *instr = (pdc_byte *) instring;
792    pdc_bool inalloc = pdc_false;
793    pdc_bool hasbom = pdc_false;
794    pdc_bool toswap = pdc_false;
795    int errcode = 0;
796    int i, j, len;
797
798    /* analyzing 2 byte textformat */
799    if (inutf == pdc_auto2 || inutf == pdc_bytes2)
800    {
801        if (inutf == pdc_auto2 && !inev)
802        {
803            inutf = pdc_utf16;
804        }
805        else
806        {
807            len = inlen / 2;
808            if (2 * len != inlen)
809            {
810                errcode = PDC_E_CONV_ILLUTF16;
811                goto PDC_CONV_ERROR;
812            }
813            for (i = 0; i < len; i++)
814                if (usinstr[i] > 0x00FF)
815                    break;
816
817            /* low byte picking */
818            if (i == len)
819            {
820                instr = (pdc_byte *) pdc_malloc(pdc, (size_t) (len + 2), fn);
821                for (i = 0; i < len; i++)
822                    instr[i] = (pdc_byte) usinstr[i];
823                instr[len] = 0;
824                instr[len + 1] = 0;
825
826                inalloc = pdc_true;
827                instring = instr;
828                inlen = len;
829
830                if (inutf == pdc_bytes2)
831                    inutf = pdc_bytes;
832                else
833                    inutf = pdc_auto;
834            }
835            else
836            {
837                inutf = pdc_utf16;
838            }
839        }
840    }
841
842    /* analyzing UTF-16 textformat */
843    if (inutf == pdc_utf16)
844    {
845        if (pdc_is_utf16be_unicode(instring))
846            inutf = pdc_utf16be;
847        else if (pdc_is_utf16le_unicode(instring))
848            inutf = pdc_utf16le;
849    }
850
851    /* analyzing auto textformat */
852    else if (inutf == pdc_auto)
853    {
854        if (pdc_is_utf8_unicode(instring))
855            inutf = pdc_utf8;
856        else if (pdc_is_utf16be_unicode(instring))
857            inutf = pdc_utf16be;
858        else if (pdc_is_utf16le_unicode(instring))
859            inutf = pdc_utf16le;
860        else if (inev)
861            inutf = pdc_bytes;
862        else
863            inutf = pdc_utf16;
864    }
865
866    /* conversion to UTF-16 by swapping */
867    if ((inutf == pdc_utf16be  || inutf == pdc_utf16le) &&
868        (inutf != oututf || flags & PDC_CONV_TRYBYTES))
869    {
870        if (inlen &&
871            ((inutf == pdc_utf16be && !PDC_ISBIGENDIAN) ||
872             (inutf == pdc_utf16le &&  PDC_ISBIGENDIAN)))
873        {
874            if (inalloc)
875                pdc_swap_bytes((char *) instring, inlen, NULL);
876            else
877            {
878                instr = (pdc_byte *) pdc_malloc(pdc, (size_t) inlen, fn);
879                pdc_swap_bytes((char *) instring, inlen, (char *) instr);
880
881                inalloc = pdc_true;
882                instring = instr;
883            }
884        }
885        inutf = pdc_utf16;
886    }
887
888    /* conversion to UTF-16 by inflation or encoding vector */
889    if (inutf == pdc_bytes)
890    {
891        if ((oututf != pdc_bytes && !(flags & PDC_CONV_KEEPBYTES)) ||
892            inev != NULL || outev != NULL)
893        {
894            len = 2 * inlen;
895            instr = (pdc_byte *) pdc_malloc(pdc, (size_t) (len + 2), fn);
896            usinstr = (pdc_ushort *) instr;
897
898            for (i = 0; i < inlen; i++)
899            {
900                uv = (pdc_ushort) instring[i];
901                if (inev && uv)
902                {
903                    uv = inev->codes[uv];
904                    if (!uv) uv = 0x0020;
905                }
906                usinstr[i] = uv;
907            }
908
909            if (inalloc)
910                pdc_free(pdc, instring);
911
912            inalloc = pdc_true;
913            instring = instr;
914            inlen = len;
915            inutf = pdc_utf16;
916        }
917        else if (flags & PDC_CONV_KEEPBYTES)
918        {
919            oututf = pdc_bytes;
920        }
921    }
922
923    /* illegal UTF-16 */
924    if (inutf != pdc_bytes && inutf != pdc_utf8 && inlen % 2)
925    {
926        if (inalloc)
927            pdc_free(pdc, instring);
928        errcode = PDC_E_CONV_ILLUTF16;
929        goto PDC_CONV_ERROR;
930    }
931
932    /* UTF conversion */
933    oututf_s = oututf;
934    if ((oututf_s == pdc_bytes && inutf == pdc_utf8) ||
935         oututf_s == pdc_utf16be || oututf_s == pdc_utf16le)
936        oututf_s = pdc_utf16;
937    if (inutf != oututf_s && oututf_s != pdc_bytes)
938    {
939        len = 4 * inlen + 2;
940        instr = (pdc_byte *) pdc_malloc(pdc, (size_t) len, fn);
941
942        if (inlen)
943        {
944            pdc_convers_result result;
945            pdc_byte *instringa, *instra, *instringe, *instre;
946
947            instringa = instring;
948            instringe = instring + inlen;
949            instra = instr;
950            instre = instr + len;
951
952            if (inutf == pdc_utf8)
953                result = pdc_convertUTF8toUTF16(
954                             (UTF8 **) &instringa, (UTF8 *) instringe,
955                             (UTF16 **) &instra, (UTF16 *) instre,
956                             strictConversion);
957            else
958                result = pdc_convertUTF16toUTF8(
959                             (UTF16 **) &instringa, (UTF16 *) instringe,
960                             (UTF8 **) &instra, (UTF8 *) instre,
961                             strictConversion);
962
963            if (inalloc)
964                pdc_free(pdc, instring);
965
966            switch (result)
967            {
968                case targetExhausted:
969                errcode = PDC_E_CONV_MEMOVERFLOW;
970                break;
971
972                case sourceExhausted:
973                case sourceIllegal:
974                errcode = PDC_E_CONV_ILLUTF;
975                break;
976
977                default:
978                break;
979            }
980
981            if (errcode)
982            {
983                pdc_free(pdc, instr);
984                goto PDC_CONV_ERROR;
985            }
986
987            inlen = instra - instr;
988        }
989
990        if (inlen + 2 != len)
991            instr = pdc_realloc(pdc, instr, (size_t) (inlen + 2), fn);
992        instr[inlen] = 0;
993        instr[inlen + 1] = 0;
994
995        inalloc = pdc_true;
996        instring = instr;
997        inutf = oututf_s;
998    }
999
1000    if (inutf == pdc_bytes)
1001    {
1002        if (!inalloc)
1003        {
1004            instr = (pdc_byte *) pdc_malloc(pdc, (size_t) (inlen + 2), fn);
1005            memcpy(instr, instring, (size_t) inlen);
1006            instr[inlen] = 0;
1007            instr[inlen + 1] = 0;
1008
1009            instring = instr;
1010        }
1011    }
1012
1013    /* trying to reduce UTF-16 string to bytes string */
1014    if (inutf == pdc_utf16 &&
1015        (flags & PDC_CONV_TRYBYTES || oututf == pdc_bytes))
1016    {
1017        len = inlen / 2;
1018        instr = (pdc_byte *) pdc_malloc(pdc, (size_t) (len + 2), fn);
1019        usinstr = (pdc_ushort *) instring;
1020
1021        for (i = 0; i < len; i++)
1022        {
1023            uv = usinstr[i];
1024            if (outev && uv)
1025                uv = (pdc_ushort) pdc_get_encoding_bytecode(pdc, outev, uv);
1026            if (uv > 0x00FF)
1027                break;
1028
1029            instr[i] = (pdc_byte) uv;
1030        }
1031
1032        if (i == len)
1033        {
1034            instr[len] = 0;
1035            instr[len + 1] = 0;
1036
1037            if (inalloc)
1038                pdc_free(pdc, instring);
1039
1040            inalloc = pdc_true;
1041            instring = instr;
1042            inlen = len;
1043            inutf = pdc_bytes;
1044        }
1045        else
1046            pdc_free(pdc, instr);
1047    }
1048
1049    /* UTF-8 format */
1050    if (inutf == pdc_utf8)
1051    {
1052        hasbom = pdc_is_utf8_unicode(instring);
1053
1054        if (flags & PDC_CONV_TRY7BYTES)
1055        {
1056            for (i = hasbom ? 3 : 0; i < inlen; i++)
1057                if (instring[i] > 0x7F)
1058                    break;
1059            if (i == inlen)
1060            {
1061                flags &= ~PDC_CONV_WITHBOM;
1062                flags |= PDC_CONV_NOBOM;
1063                inutf = pdc_bytes;
1064            }
1065        }
1066
1067        if (!inalloc || flags & PDC_CONV_WITHBOM || flags & PDC_CONV_NOBOM)
1068        {
1069            i = (flags & PDC_CONV_WITHBOM && !hasbom) ? 3 : 0;
1070            j = (flags & PDC_CONV_NOBOM && hasbom) ? 3 : 0;
1071
1072            len = inlen + i - j;
1073            instr = (pdc_byte *) pdc_malloc(pdc, (size_t) (len + 1), fn);
1074            memcpy(&instr[i], &instring[j], (size_t) (inlen - j));
1075            instr[len] = 0;
1076
1077            if (inalloc)
1078                pdc_free(pdc, instring);
1079
1080            instring = instr;
1081            inlen = len;
1082
1083            hasbom = (flags & PDC_CONV_WITHBOM);
1084        }
1085
1086        if (hasbom)
1087        {
1088            instring[0] = PDF_BOM2;
1089            instring[1] = PDF_BOM3;
1090            instring[2] = PDF_BOM4;
1091        }
1092    }
1093
1094    /* UTF-16 formats */
1095    if (inutf == pdc_utf16 || inutf == pdc_utf16be || inutf == pdc_utf16le)
1096    {
1097        hasbom = pdc_is_utf16be_unicode(instring) ||
1098                 pdc_is_utf16le_unicode(instring);
1099
1100        if (!inalloc || oututf == pdc_utf16be || oututf == pdc_utf16le ||
1101            flags & PDC_CONV_WITHBOM || flags & PDC_CONV_NOBOM)
1102        {
1103            i = (flags & PDC_CONV_WITHBOM && !hasbom) ? 2 : 0;
1104            j = (flags & PDC_CONV_NOBOM && hasbom) ? 2 : 0;
1105
1106            len = inlen + i - j;
1107            instr = (pdc_byte *) pdc_malloc(pdc, (size_t) (len + 2), fn);
1108            memcpy(&instr[i], &instring[j], (size_t) (inlen - j));
1109            instr[len] = 0;
1110            instr[len + 1] = 0;
1111
1112            if (inalloc)
1113                pdc_free(pdc, instring);
1114
1115            instring = instr;
1116            inlen = len;
1117
1118            hasbom = (flags & PDC_CONV_WITHBOM);
1119        }
1120
1121        i = hasbom ? 2 : 0;
1122        if (inutf == pdc_utf16)
1123        {
1124            if (oututf == pdc_utf16be)
1125            {
1126                inutf = pdc_utf16be;
1127                toswap = !PDC_ISBIGENDIAN;
1128            }
1129            if (oututf == pdc_utf16le)
1130            {
1131                inutf = pdc_utf16le;
1132                toswap = PDC_ISBIGENDIAN;
1133            }
1134            if (toswap)
1135                pdc_swap_bytes((char *) &instring[i], inlen - i, NULL);
1136        }
1137
1138        if (hasbom)
1139        {
1140            if (inutf == pdc_utf16be ||
1141                (inutf == pdc_utf16 && PDC_ISBIGENDIAN))
1142            {
1143                instring[0] = PDF_BOM0;
1144                instring[1] = PDF_BOM1;
1145            }
1146            if (inutf == pdc_utf16le ||
1147                (inutf == pdc_utf16 && !PDC_ISBIGENDIAN))
1148            {
1149                instring[0] = PDF_BOM1;
1150                instring[1] = PDF_BOM0;
1151            }
1152        }
1153    }
1154
1155    *oututf_p = inutf;
1156    *outlen = inlen;
1157    *outstring = instring;
1158    return 0;
1159
1160    PDC_CONV_ERROR:
1161    *outlen = 0;
1162    *outstring = NULL;
1163
1164    if (errcode == PDC_E_CONV_ILLUTF)
1165    {
1166        const char *stemp =
1167            pdc_errprintf(pdc, "%d", inutf == pdc_utf8 ? 8 : 16);
1168        pdc_set_errmsg(pdc, errcode, stemp, 0, 0, 0);
1169    }
1170    else
1171        pdc_set_errmsg(pdc, errcode, 0, 0, 0, 0);
1172
1173    if (verbose)
1174        pdc_error(pdc, -1, 0, 0, 0, 0);
1175
1176    return errcode;
1177}
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215