1/* utf8.c -- convert characters to/from UTF-8
2
3  (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
4  See tidy.h for the copyright notice.
5
6  CVS Info :
7
8    $Author: iccir $
9    $Date: 2007/01/30 23:46:52 $
10    $Revision: 1.3 $
11
12  Uses public interfaces to abstract input source and output
13  sink, which may be user supplied or either FILE* or memory
14  based Tidy implementations.  Encoding support is uniform
15  regardless of I/O mechanism.
16
17  Note, UTF-8 encoding, by itself, does not affect the actual
18  "codepoints" of the underlying character encoding.  In the
19  cases of ASCII, Latin1, Unicode (16-bit, BMP), these all
20  refer to ISO-10646 "codepoints".  For anything else, they
21  refer to some other "codepoint" set.
22
  Put another way, UTF-8 is a variable-length method to
  represent any non-negative integer value.  The glyph
  that an integer value represents is unchanged and defined
  externally (e.g. by ISO-10646, Big5, Win1252, MacRoman,
  Latin2-9, and so on).
28
29  Put still another way, UTF-8 is more of a _transfer_ encoding
30  than a _character_ encoding, per se.
31*/
32
33#include "tidy.h"
34#include "forward.h"
35#include "utf8.h"
36
37/*
UTF-8 encoding/decoding functions
The encode/decode routines return 0 on success and -1 for an illegal
sequence; the number of bytes in the UTF-8 sequence is passed back
through the "count" argument.
40
41Also see below for UTF-16 encoding/decoding functions
42
43References :
44
451) UCS Transformation Format 8 (UTF-8):
46ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D
47<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335>
48<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>
49
50Table 4 - Mapping from UCS-4 to UTF-8
51
522) Unicode standards:
53<http://www.unicode.org/unicode/standard/standard.html>
54
553) Legal UTF-8 byte sequences:
56<http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html>
57
58Code point          1st byte    2nd byte    3rd byte    4th byte
59----------          --------    --------    --------    --------
60U+0000..U+007F      00..7F
61U+0080..U+07FF      C2..DF      80..BF
62U+0800..U+0FFF      E0          A0..BF      80..BF
63U+1000..U+FFFF      E1..EF      80..BF      80..BF
64U+10000..U+3FFFF    F0          90..BF      80..BF      80..BF
65U+40000..U+FFFFF    F1..F3      80..BF      80..BF      80..BF
66U+100000..U+10FFFF  F4          80..8F      80..BF      80..BF
67
68The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also
69allows for the use of five- and six-byte sequences to encode
70characters that are outside the range of the Unicode character
71set; those five- and six-byte sequences are illegal for the use
72of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646
73does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF
74(but it does allow other noncharacters).
75
764) RFC 2279: UTF-8, a transformation format of ISO 10646:
77<http://www.ietf.org/rfc/rfc2279.txt>
78
795) UTF-8 and Unicode FAQ:
80<http://www.cl.cam.ac.uk/~mgk25/unicode.html>
81
826) Markus Kuhn's UTF-8 decoder stress test file:
83<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
84
857) UTF-8 Demo:
86<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt>
87
888) UTF-8 Sampler:
89<http://www.columbia.edu/kermit/utf8.html>
90
919) Transformation Format for 16 Planes of Group 00 (UTF-16):
92ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C
93<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf>
94<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html>
95
9610) RFC 2781: UTF-16, an encoding of ISO 10646:
97<http://www.ietf.org/rfc/rfc2781.txt>
98
9911) UTF-16 invalid surrogate pairs:
100<http://www.unicode.org/unicode/faq/utf_bom.html#16>
101
102UTF-16       UTF-8          UCS-4
103D83F DFF*    F0 9F BF B*    0001FFF*
104D87F DFF*    F0 AF BF B*    0002FFF*
105D8BF DFF*    F0 BF BF B*    0003FFF*
106D8FF DFF*    F1 8F BF B*    0004FFF*
107D93F DFF*    F1 9F BF B*    0005FFF*
108D97F DFF*    F1 AF BF B*    0006FFF*
109                ...
110DBBF DFF*    F3 BF BF B*    000FFFF*
111DBFF DFF*    F4 8F BF B*    0010FFF*
112
113* = E or F
114
1151010  A
1161011  B
1171100  C
1181101  D
1191110  E
1201111  F
121
122*/
123
/* Shape of the validUTF8 table: its number of rows, and the longest
** sequence length (in bytes) that it describes.
*/
#define kNumUTF8Sequences        7
#define kMaxUTF8Bytes            4

/* Values that the decoder and encoder in this file report as errors */
#define kUTF8ByteSwapNotAChar    0xFFFE
#define kUTF8NotAChar            0xFFFF

/* Largest value accepted for UTF-8 encoding/decoding */
#define kMaxUTF8FromUCS4         0x10FFFF

/* Values in kUTF16SurrogatesBegin..kMaxUTF16FromUCS4 need a surrogate
** pair when written as UTF-16.
*/
#define kUTF16SurrogatesBegin    0x10000
#define kMaxUTF16FromUCS4        0x10FFFF

/* UTF-16 surrogate pair areas.
**
** NOTE: in this file "low" names the range D800..DBFF (the first unit of
** a pair) and "high" names DC00..DFFF (the second unit).  The Unicode
** Standard uses the opposite wording (high-surrogate = D800..DBFF,
** low-surrogate = DC00..DFFF), so take care when comparing with it.
*/
#define kUTF16LowSurrogateBegin  0xD800
#define kUTF16LowSurrogateEnd    0xDBFF
#define kUTF16HighSurrogateBegin 0xDC00
#define kUTF16HighSurrogateEnd   0xDFFF
140
141
142/* offsets into validUTF8 table below */
143static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] =
144{
145    0, /* 1 byte */
146    1, /* 2 bytes */
147    2, /* 3 bytes */
148    4, /* 4 bytes */
149    kNumUTF8Sequences /* must be last */
150};
151
152static const struct validUTF8Sequence
153{
154     uint lowChar;
155     uint highChar;
156     int  numBytes;
157     byte validBytes[8];
158} validUTF8[kNumUTF8Sequences] =
159{
160/*   low       high   #bytes  byte 1      byte 2      byte 3      byte 4 */
161    {0x0000,   0x007F,   1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
162    {0x0080,   0x07FF,   2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}},
163    {0x0800,   0x0FFF,   3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
164    {0x1000,   0xFFFF,   3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
165    {0x10000,  0x3FFFF,  4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
166    {0x40000,  0xFFFFF,  4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
167    {0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}}
168};
169
170int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes,
171                                TidyInputSource* inp, int* count )
172{
173    byte tempbuf[10];
174    byte *buf = &tempbuf[0];
175    uint ch = 0, n = 0;
176    int i, bytes = 0;
177    Bool hasError = no;
178
179    if ( successorBytes )
180        buf = (byte*) successorBytes;
181
182    /* special check if we have been passed an EOF char */
183    if ( firstByte == EndOfStream )
184    {
185        /* at present */
186        *c = firstByte;
187        *count = 1;
188        return 0;
189    }
190
191    ch = firstByte; /* first byte is passed in separately */
192
193    if (ch <= 0x7F) /* 0XXX XXXX one byte */
194    {
195        n = ch;
196        bytes = 1;
197    }
198    else if ((ch & 0xE0) == 0xC0)  /* 110X XXXX  two bytes */
199    {
200        n = ch & 31;
201        bytes = 2;
202    }
203    else if ((ch & 0xF0) == 0xE0)  /* 1110 XXXX  three bytes */
204    {
205        n = ch & 15;
206        bytes = 3;
207    }
208    else if ((ch & 0xF8) == 0xF0)  /* 1111 0XXX  four bytes */
209    {
210        n = ch & 7;
211        bytes = 4;
212    }
213    else if ((ch & 0xFC) == 0xF8)  /* 1111 10XX  five bytes */
214    {
215        n = ch & 3;
216        bytes = 5;
217        hasError = yes;
218    }
219    else if ((ch & 0xFE) == 0xFC)  /* 1111 110X  six bytes */
220    {
221        n = ch & 1;
222        bytes = 6;
223        hasError = yes;
224    }
225    else
226    {
227        /* not a valid first byte of a UTF-8 sequence */
228        n = ch;
229        bytes = 1;
230        hasError = yes;
231    }
232
233    /* successor bytes should have the form 10XX XXXX */
234
235    /* If caller supplied buffer, use it.  Else see if caller
236    ** supplied an input source, use that.
237    */
238    if ( successorBytes )
239    {
240        for ( i=0; i < bytes-1; ++i )
241        {
242            if ( !buf[i] || (buf[i] & 0xC0) != 0x80 )
243            {
244                hasError = yes;
245                bytes = i;
246                break;
247            }
248            n = (n << 6) | (buf[i] & 0x3F);
249        }
250    }
251    else if ( inp )
252    {
253        for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i )
254        {
255            int b = inp->getByte( inp->sourceData );
256            buf[i] = (tmbchar) b;
257
258            /* End of data or illegal successor byte value */
259            if ( b == EOF || (buf[i] & 0xC0) != 0x80 )
260            {
261                hasError = yes;
262                bytes = i;
263                if ( b != EOF )
264                    inp->ungetByte( inp->sourceData, buf[i] );
265                break;
266            }
267            n = (n << 6) | (buf[i] & 0x3F);
268        }
269    }
270    else if ( bytes > 1 )
271    {
272        hasError = yes;
273        bytes = 1;
274    }
275
276    if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar)))
277        hasError = yes;
278
279    if (!hasError && (n > kMaxUTF8FromUCS4))
280        hasError = yes;
281
282#if 0 /* Breaks Big5 D8 - DF */
283    if (!hasError && (n >= kUTF16LowSurrogateBegin) && (n <= kUTF16HighSurrogateEnd))
284        /* unpaired surrogates not allowed */
285        hasError = yes;
286#endif
287
288    if (!hasError)
289    {
290        int lo, hi;
291
292        lo = offsetUTF8Sequences[bytes - 1];
293        hi = offsetUTF8Sequences[bytes] - 1;
294
295        /* check for overlong sequences */
296        if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar))
297            hasError = yes;
298        else
299        {
300            hasError = yes; /* assume error until proven otherwise */
301
302            for (i = lo; i <= hi; i++)
303            {
304                int tempCount;
305                byte theByte;
306
307                for (tempCount = 0; tempCount < bytes; tempCount++)
308                {
309                    if (!tempCount)
310                        theByte = (tmbchar) firstByte;
311                    else
312                        theByte = buf[tempCount - 1];
313
314                    if ( theByte >= validUTF8[i].validBytes[(tempCount * 2)] &&
315                         theByte <= validUTF8[i].validBytes[(tempCount * 2) + 1] )
316                        hasError = no;
317                    if (hasError)
318                        break;
319                }
320            }
321        }
322    }
323
324#if 1 && defined(_DEBUG)
325    if ( hasError )
326    {
327       /* debug */
328       fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes );
329       fprintf( stderr, "0x%02x ", firstByte );
330       for (i = 1; i < bytes; i++)
331           fprintf( stderr, "0x%02x ", buf[i - 1] );
332       fprintf( stderr, " = U+%04ulx\n", n );
333    }
334#endif
335
336    *count = bytes;
337    *c = n;
338    if ( hasError )
339        return -1;
340    return 0;
341}
342
343int TY_(EncodeCharToUTF8Bytes)( uint c, tmbstr encodebuf,
344                                TidyOutputSink* outp, int* count )
345{
346    byte tempbuf[10] = {0};
347    byte* buf = &tempbuf[0];
348    int bytes = 0;
349    Bool hasError = no;
350
351    if ( encodebuf )
352        buf = (byte*) encodebuf;
353
354    if (c <= 0x7F)  /* 0XXX XXXX one byte */
355    {
356        buf[0] = (tmbchar) c;
357        bytes = 1;
358    }
359    else if (c <= 0x7FF)  /* 110X XXXX  two bytes */
360    {
361        buf[0] = (tmbchar) ( 0xC0 | (c >> 6) );
362        buf[1] = (tmbchar) ( 0x80 | (c & 0x3F) );
363        bytes = 2;
364    }
365    else if (c <= 0xFFFF)  /* 1110 XXXX  three bytes */
366    {
367        buf[0] = (tmbchar) (0xE0 | (c >> 12));
368        buf[1] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
369        buf[2] = (tmbchar) (0x80 | (c & 0x3F));
370        bytes = 3;
371        if ( c == kUTF8ByteSwapNotAChar || c == kUTF8NotAChar )
372            hasError = yes;
373#if 0 /* Breaks Big5 D8 - DF */
374        else if ( c >= kUTF16LowSurrogateBegin && c <= kUTF16HighSurrogateEnd )
375            /* unpaired surrogates not allowed */
376            hasError = yes;
377#endif
378    }
379    else if (c <= 0x1FFFFF)  /* 1111 0XXX  four bytes */
380    {
381        buf[0] = (tmbchar) (0xF0 | (c >> 18));
382        buf[1] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
383        buf[2] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
384        buf[3] = (tmbchar) (0x80 | (c & 0x3F));
385        bytes = 4;
386        if (c > kMaxUTF8FromUCS4)
387            hasError = yes;
388    }
389    else if (c <= 0x3FFFFFF)  /* 1111 10XX  five bytes */
390    {
391        buf[0] = (tmbchar) (0xF8 | (c >> 24));
392        buf[1] = (tmbchar) (0x80 | (c >> 18));
393        buf[2] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
394        buf[3] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
395        buf[4] = (tmbchar) (0x80 | (c & 0x3F));
396        bytes = 5;
397        hasError = yes;
398    }
399    else if (c <= 0x7FFFFFFF)  /* 1111 110X  six bytes */
400    {
401        buf[0] = (tmbchar) (0xFC | (c >> 30));
402        buf[1] = (tmbchar) (0x80 | ((c >> 24) & 0x3F));
403        buf[2] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
404        buf[3] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
405        buf[4] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
406        buf[5] = (tmbchar) (0x80 | (c & 0x3F));
407        bytes = 6;
408        hasError = yes;
409    }
410    else
411        hasError = yes;
412
413    /* don't output invalid UTF-8 byte sequence to a stream */
414    if ( !hasError && outp != NULL )
415    {
416        int ix;
417        for ( ix=0; ix < bytes; ++ix )
418          outp->putByte( outp->sinkData, buf[ix] );
419    }
420
421#if 1 && defined(_DEBUG)
422    if ( hasError )
423    {
424        int i;
425        fprintf( stderr, "UTF-8 encoding error for U+%x : ", c );
426        for (i = 0; i < bytes; i++)
427            fprintf( stderr, "0x%02x ", buf[i] );
428        fprintf( stderr, "\n" );
429    }
430#endif
431
432    *count = bytes;
433    if (hasError)
434        return -1;
435    return 0;
436}
437
438
439/* return one less than the number of bytes used by the UTF-8 byte sequence */
440/* str points to the UTF-8 byte sequence */
441/* the Unicode char is returned in *ch */
442uint TY_(GetUTF8)( ctmbstr str, uint *ch )
443{
444    uint n;
445    int bytes;
446
447    int err;
448
449    bytes = 0;
450
451    /* first byte "str[0]" is passed in separately from the */
452    /* rest of the UTF-8 byte sequence starting at "str[1]" */
453    err = TY_(DecodeUTF8BytesToChar)( &n, str[0], str+1, NULL, &bytes );
454    if (err)
455    {
456#if 1 && defined(_DEBUG)
457        fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n);
458#endif
459        n = 0xFFFD; /* replacement char */
460    }
461
462    *ch = n;
463    return bytes - 1;
464}
465
466/* store char c as UTF-8 encoded byte stream */
467tmbstr TY_(PutUTF8)( tmbstr buf, uint c )
468{
469    int err, count = 0;
470
471    err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count );
472    if (err)
473    {
474#if 1 && defined(_DEBUG)
475        fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c);
476#endif
477        /* replacement char 0xFFFD encoded as UTF-8 */
478        buf[0] = (byte) 0xEF;
479        buf[1] = (byte) 0xBF;
480        buf[2] = (byte) 0xBD;
481        count = 3;
482    }
483
484    buf += count;
485    return buf;
486}
487
488Bool    TY_(IsValidUTF16FromUCS4)( tchar ucs4 )
489{
490  return ( ucs4 <= kMaxUTF16FromUCS4 );
491}
492
493Bool    TY_(IsHighSurrogate)( tchar ch )
494{
495    return ( ch >= kUTF16HighSurrogateBegin && ch <= kUTF16HighSurrogateEnd );
496}
497Bool    TY_(IsLowSurrogate)( tchar ch )
498{
499    return ( ch >= kUTF16LowSurrogateBegin && ch <= kUTF16LowSurrogateEnd );
500}
501
502tchar   TY_(CombineSurrogatePair)( tchar high, tchar low )
503{
504    assert( TY_(IsHighSurrogate)(high) && TY_(IsLowSurrogate)(low) );
505    return ( ((low - kUTF16LowSurrogateBegin) * 0x400) +
506             high - kUTF16HighSurrogateBegin + 0x10000 );
507}
508
509Bool   TY_(SplitSurrogatePair)( tchar utf16, tchar* low, tchar* high )
510{
511    Bool status = ( TY_(IsValidCombinedChar)( utf16 ) && high && low );
512    if ( status )
513    {
514        *low  = (utf16 - kUTF16SurrogatesBegin) / 0x400 + kUTF16LowSurrogateBegin;
515        *high = (utf16 - kUTF16SurrogatesBegin) % 0x400 + kUTF16HighSurrogateBegin;
516    }
517    return status;
518}
519
520Bool    TY_(IsValidCombinedChar)( tchar ch )
521{
522    return ( ch >= kUTF16SurrogatesBegin &&
523             (ch & 0x0000FFFE) != 0x0000FFFE &&
524             (ch & 0x0000FFFF) != 0x0000FFFF );
525}
526
527Bool    TY_(IsCombinedChar)( tchar ch )
528{
529    return ( ch >= kUTF16SurrogatesBegin );
530}
531
532/*
533 * local variables:
534 * mode: c
535 * indent-tabs-mode: nil
536 * c-basic-offset: 4
537 * eval: (c-set-offset 'substatement-open 0)
538 * end:
539 */
540