1/*---------------------------------------------------------------------------
2|   Copyright (C) 1999  Jochen C. Loewer (loewerj@hotmail.com)
3+----------------------------------------------------------------------------
4|
5|   $Id: utf8conv.c,v 1.2 2004/08/14 14:42:27 rolf Exp $
6|
7|
8|   Functions, which (try) to convert UTF-8 encoded Unicode strings back
9|   to some 8bit encodings like ISO-8859-*, ...
10|
11|
12|   The contents of this file are subject to the Mozilla Public License
13|   Version 1.1 (the "License"); you may not use this file except in
14|   compliance with the License. You may obtain a copy of the License at
15|   http://www.mozilla.org/MPL/
16|
17|   Software distributed under the License is distributed on an "AS IS"
18|   basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
19|   License for the specific language governing rights and limitations
20|   under the License.
21|
22|   The Original Code is tDOM.
23|
24|   The Initial Developer of the Original Code is Jochen Loewer
25|   Portions created by Jochen Loewer are Copyright (C) 1998, 1999
26|   Jochen Loewer. All Rights Reserved.
27|
28|   Contributor(s):
29|
30|
31|   $Log: utf8conv.c,v $
32|   Revision 1.2  2004/08/14 14:42:27  rolf
33|   Use 'Id' cvs keyword (instead of 'Header') in the file heads.
34|
35|   Revision 1.1.1.1  2002/02/22 01:05:35  rolf
36|   tDOM0.7test with Jochens first set of patches
37|
38|
39|
40|   written by Jochen Loewer
41|   November, 1999
42|
43\--------------------------------------------------------------------------*/
44
45
46
47/*---------------------------------------------------------------------------
48|   Includes
49|
50\--------------------------------------------------------------------------*/
51#include <tcl.h>
52#include <stdlib.h>
53#include <string.h>
54#include <utf8conv.h>
55
56/*---------------------------------------------------------------------------
57|   Defines
58|
59\--------------------------------------------------------------------------*/
60#define DBG(x)
61
62#define ENC_END       0
63#define ENC_IDENTITY  1
64#define ENC_MAP       2
65
66#if defined(_MSC_VER)
67# define STRCASECMP(a,b)  stricmp (a,b)
68#else
69# define STRCASECMP(a,b)  strcasecmp (a,b)
70#endif
71
72
73/*---------------------------------------------------------------------------
74|   Static Globals
75|
76\--------------------------------------------------------------------------*/
77#include "encodings.inc"
78
79
80
81/*---------------------------------------------------------------------------
82|   tdom_GetEncoding  -  Looks up a encoding table for the given encoding
83|                        name. If nothing was found NULL is returned.
84|
85\--------------------------------------------------------------------------*/
86TEncoding *
87tdom_GetEncoding (
88    char  * name
89)
90{
91    TEncoding *encoding = TDOM_UnicodeTo8bitEncodings;
92
93    while (encoding && encoding->name) {
94        DBG(fprintf(stderr, "encoding=%x encoding->name='%s' name='%s'",
95                             encoding, encoding->name, name);)
96        if (STRCASECMP(encoding->name,name)==0) {
97            return encoding;
98        }
99        encoding++;
100    }
101    return NULL;
102}
103
104
105/*---------------------------------------------------------------------------
106|   tdom_GetEncodingName
107|
108\--------------------------------------------------------------------------*/
109char *
110tdom_GetEncodingName (TEncoding *encoding)
111{
112    TEncoding *knownencoding = TDOM_UnicodeTo8bitEncodings;
113
114    while (knownencoding && knownencoding->name) {
115        if (knownencoding == encoding) {
116            return (char*) knownencoding->name;
117        }
118        knownencoding++;
119    }
120    return NULL;
121}
122
123
124/*---------------------------------------------------------------------------
125|   tdom_Utf8to8Bit  -  Convert a UTF-8 encode string with byte length
126|                       *len to 8bit encoding using the specify encoding.
127|
128\--------------------------------------------------------------------------*/
129void
130tdom_Utf8to8Bit (
131    TEncoding  * encoding,
132    const char * utf8_string,
133    int        * len
134)
135{
136    unsigned char  *in, *end, *out;
137    TEncodingRule  *rule;
138    int             byte;
139    int             unicode;
140
141
142    if (encoding == NULL) {
143       /* don't convert; keep UTF-8 */
144       return;
145    }
146
147    in  = (unsigned char*) utf8_string;
148    out = (unsigned char*) utf8_string;
149    end = in + *len;
150    unicode = 0;
151
152    while (in < end) {
153
154        byte = *in;
155
156        /* extract unicode character from (multiple) UTF-8 bytes */
157
158        if (byte < 0xC0) {
159            unicode = byte;
160            in++;
161        } else if (byte < 0xE0) {
162            if ((in[1] & 0xC0) == 0x80) {
163                unicode = ((byte & 0x1F) << 6) | (in[1] & 0x3F);
164                in += 2;
165            } else {
166                unicode = byte;
167                in++;
168            }
169        } else if (byte < 0xF0) {
170            if (((in[1] & 0xC0) == 0x80) && ((in[2] & 0xC0) == 0x80)) {
171                unicode =  ((byte  & 0x0F) << 12)
172                         | ((in[1] & 0x3F) << 6 )
173                         | ((in[2] & 0x3F)      );
174                in += 3;
175            } else {
176                unicode = byte;
177                in++;
178            }
179        } else {
180            /* ??? > 3 bytes UTF chars ??? */
181            in++;
182        }
183
184        /* convert unicode character to 8bit representation */
185        rule = encoding->rules;
186        while (rule && rule->type != ENC_END) {
187            if (   (unicode >= rule->start_code)
188                && (unicode < (rule->start_code + rule->len)) ) {
189
190                if (rule->type == ENC_MAP) {
191                    *out++ = rule->map[unicode - rule->start_code];
192                } else {
193                    *out++ = unicode & 0xFF;
194                }
195                break;
196            }
197            rule++;
198        }
199        if (rule->type == ENC_END) {
200            /* no rule foun, use fallback */
201            *out++ = encoding->fallback_char & 0x0FF;
202        }
203    }
204    if (out < end) {
205        *out = '\0';
206    }
207    *len = ( (char*)out - utf8_string);
208}
209
210