1/*--------------------------------------------------------------------------- 2| Copyright (C) 1999 Jochen C. Loewer (loewerj@hotmail.com) 3+---------------------------------------------------------------------------- 4| 5| $Id: utf8conv.c,v 1.2 2004/08/14 14:42:27 rolf Exp $ 6| 7| 8| Functions, which (try) to convert UTF-8 encoded Unicode strings back 9| to some 8bit encodings like ISO-8859-*, ... 10| 11| 12| The contents of this file are subject to the Mozilla Public License 13| Version 1.1 (the "License"); you may not use this file except in 14| compliance with the License. You may obtain a copy of the License at 15| http://www.mozilla.org/MPL/ 16| 17| Software distributed under the License is distributed on an "AS IS" 18| basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 19| License for the specific language governing rights and limitations 20| under the License. 21| 22| The Original Code is tDOM. 23| 24| The Initial Developer of the Original Code is Jochen Loewer 25| Portions created by Jochen Loewer are Copyright (C) 1998, 1999 26| Jochen Loewer. All Rights Reserved. 27| 28| Contributor(s): 29| 30| 31| $Log: utf8conv.c,v $ 32| Revision 1.2 2004/08/14 14:42:27 rolf 33| Use 'Id' cvs keyword (instead of 'Header') in the file heads. 34| 35| Revision 1.1.1.1 2002/02/22 01:05:35 rolf 36| tDOM0.7test with Jochens first set of patches 37| 38| 39| 40| written by Jochen Loewer 41| November, 1999 42| 43\--------------------------------------------------------------------------*/ 44 45 46 47/*--------------------------------------------------------------------------- 48| Includes 49| 50\--------------------------------------------------------------------------*/ 51#include <tcl.h> 52#include <stdlib.h> 53#include <string.h> 54#include <utf8conv.h> 55 56/*--------------------------------------------------------------------------- 57| Defines 58| 59\--------------------------------------------------------------------------*/ 60#define DBG(x) 61 62#define ENC_END 0 63#define ENC_IDENTITY 1 64#define ENC_MAP 2 65 66#if defined(_MSC_VER) 67# define STRCASECMP(a,b) stricmp (a,b) 68#else 69# define STRCASECMP(a,b) strcasecmp (a,b) 70#endif 71 72 73/*--------------------------------------------------------------------------- 74| Static Globals 75| 76\--------------------------------------------------------------------------*/ 77#include "encodings.inc" 78 79 80 81/*--------------------------------------------------------------------------- 82| tdom_GetEncoding - Looks up a encoding table for the given encoding 83| name. If nothing was found NULL is returned. 84| 85\--------------------------------------------------------------------------*/ 86TEncoding * 87tdom_GetEncoding ( 88 char * name 89) 90{ 91 TEncoding *encoding = TDOM_UnicodeTo8bitEncodings; 92 93 while (encoding && encoding->name) { 94 DBG(fprintf(stderr, "encoding=%x encoding->name='%s' name='%s'", 95 encoding, encoding->name, name);) 96 if (STRCASECMP(encoding->name,name)==0) { 97 return encoding; 98 } 99 encoding++; 100 } 101 return NULL; 102} 103 104 105/*--------------------------------------------------------------------------- 106| tdom_GetEncodingName 107| 108\--------------------------------------------------------------------------*/ 109char * 110tdom_GetEncodingName (TEncoding *encoding) 111{ 112 TEncoding *knownencoding = TDOM_UnicodeTo8bitEncodings; 113 114 while (knownencoding && knownencoding->name) { 115 if (knownencoding == encoding) { 116 return (char*) knownencoding->name; 117 } 118 knownencoding++; 119 } 120 return NULL; 121} 122 123 124/*--------------------------------------------------------------------------- 125| tdom_Utf8to8Bit - Convert a UTF-8 encode string with byte length 126| *len to 8bit encoding using the specify encoding. 127| 128\--------------------------------------------------------------------------*/ 129void 130tdom_Utf8to8Bit ( 131 TEncoding * encoding, 132 const char * utf8_string, 133 int * len 134) 135{ 136 unsigned char *in, *end, *out; 137 TEncodingRule *rule; 138 int byte; 139 int unicode; 140 141 142 if (encoding == NULL) { 143 /* don't convert; keep UTF-8 */ 144 return; 145 } 146 147 in = (unsigned char*) utf8_string; 148 out = (unsigned char*) utf8_string; 149 end = in + *len; 150 unicode = 0; 151 152 while (in < end) { 153 154 byte = *in; 155 156 /* extract unicode character from (multiple) UTF-8 bytes */ 157 158 if (byte < 0xC0) { 159 unicode = byte; 160 in++; 161 } else if (byte < 0xE0) { 162 if ((in[1] & 0xC0) == 0x80) { 163 unicode = ((byte & 0x1F) << 6) | (in[1] & 0x3F); 164 in += 2; 165 } else { 166 unicode = byte; 167 in++; 168 } 169 } else if (byte < 0xF0) { 170 if (((in[1] & 0xC0) == 0x80) && ((in[2] & 0xC0) == 0x80)) { 171 unicode = ((byte & 0x0F) << 12) 172 | ((in[1] & 0x3F) << 6 ) 173 | ((in[2] & 0x3F) ); 174 in += 3; 175 } else { 176 unicode = byte; 177 in++; 178 } 179 } else { 180 /* ??? > 3 bytes UTF chars ??? */ 181 in++; 182 } 183 184 /* convert unicode character to 8bit representation */ 185 rule = encoding->rules; 186 while (rule && rule->type != ENC_END) { 187 if ( (unicode >= rule->start_code) 188 && (unicode < (rule->start_code + rule->len)) ) { 189 190 if (rule->type == ENC_MAP) { 191 *out++ = rule->map[unicode - rule->start_code]; 192 } else { 193 *out++ = unicode & 0xFF; 194 } 195 break; 196 } 197 rule++; 198 } 199 if (rule->type == ENC_END) { 200 /* no rule foun, use fallback */ 201 *out++ = encoding->fallback_char & 0x0FF; 202 } 203 } 204 if (out < end) { 205 *out = '\0'; 206 } 207 *len = ( (char*)out - utf8_string); 208} 209 210