1/* 2 * Summary: interface for the encoding conversion functions 3 * Description: interface for the encoding conversion functions needed for 4 * XML basic encoding and iconv() support. 5 * 6 * Related specs are 7 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies 8 * [ISO-10646] UTF-8 and UTF-16 in Annexes 9 * [ISO-8859-1] ISO Latin-1 characters codes. 10 * [UNICODE] The Unicode Consortium, "The Unicode Standard -- 11 * Worldwide Character Encoding -- Version 1.0", Addison- 12 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is 13 * described in Unicode Technical Report #4. 14 * [US-ASCII] Coded Character Set--7-bit American Standard Code for 15 * Information Interchange, ANSI X3.4-1986. 16 * 17 * Copy: See Copyright for the status of this software. 18 * 19 * Author: Daniel Veillard 20 */ 21 22#ifndef __XML_CHAR_ENCODING_H__ 23#define __XML_CHAR_ENCODING_H__ 24 25#include <libxml/xmlversion.h> 26 27#ifdef LIBXML_ICONV_ENABLED 28#include <iconv.h> 29#endif 30#ifdef __cplusplus 31extern "C" { 32#endif 33 34/* 35 * xmlCharEncoding: 36 * 37 * Predefined values for some standard encodings. 38 * Libxml does not do beforehand translation on UTF8 and ISOLatinX. 39 * It also supports ASCII, ISO-8859-1, and UTF16 (LE and BE) by default. 40 * 41 * Anything else would have to be translated to UTF8 before being 42 * given to the parser itself. The BOM for UTF16 and the encoding 43 * declaration are looked at and a converter is looked for at that 44 * point. If not found the parser stops here as asked by the XML REC. A 45 * converter can be registered by the user using xmlRegisterCharEncodingHandler 46 * but the current form doesn't allow stateful transcoding (a serious 47 * problem agreed !). If iconv has been found it will be used 48 * automatically and allow stateful transcoding, the simplest is then 49 * to be sure to enable iconv and to provide iconv libs for the encoding 50 * support needed. 51 * 52 * Note that the generic "UTF-16" is not a predefined value. Instead, only 53 * the specific UTF-16LE and UTF-16BE are present. 54 */ 55typedef enum { 56 XML_CHAR_ENCODING_ERROR= -1, /* No char encoding detected */ 57 XML_CHAR_ENCODING_NONE= 0, /* No char encoding detected */ 58 XML_CHAR_ENCODING_UTF8= 1, /* UTF-8 */ 59 XML_CHAR_ENCODING_UTF16LE= 2, /* UTF-16 little endian */ 60 XML_CHAR_ENCODING_UTF16BE= 3, /* UTF-16 big endian */ 61 XML_CHAR_ENCODING_UCS4LE= 4, /* UCS-4 little endian */ 62 XML_CHAR_ENCODING_UCS4BE= 5, /* UCS-4 big endian */ 63 XML_CHAR_ENCODING_EBCDIC= 6, /* EBCDIC uh! */ 64 XML_CHAR_ENCODING_UCS4_2143=7, /* UCS-4 unusual ordering */ 65 XML_CHAR_ENCODING_UCS4_3412=8, /* UCS-4 unusual ordering */ 66 XML_CHAR_ENCODING_UCS2= 9, /* UCS-2 */ 67 XML_CHAR_ENCODING_8859_1= 10,/* ISO-8859-1 ISO Latin 1 */ 68 XML_CHAR_ENCODING_8859_2= 11,/* ISO-8859-2 ISO Latin 2 */ 69 XML_CHAR_ENCODING_8859_3= 12,/* ISO-8859-3 */ 70 XML_CHAR_ENCODING_8859_4= 13,/* ISO-8859-4 */ 71 XML_CHAR_ENCODING_8859_5= 14,/* ISO-8859-5 */ 72 XML_CHAR_ENCODING_8859_6= 15,/* ISO-8859-6 */ 73 XML_CHAR_ENCODING_8859_7= 16,/* ISO-8859-7 */ 74 XML_CHAR_ENCODING_8859_8= 17,/* ISO-8859-8 */ 75 XML_CHAR_ENCODING_8859_9= 18,/* ISO-8859-9 */ 76 XML_CHAR_ENCODING_2022_JP= 19,/* ISO-2022-JP */ 77 XML_CHAR_ENCODING_SHIFT_JIS=20,/* Shift_JIS */ 78 XML_CHAR_ENCODING_EUC_JP= 21,/* EUC-JP */ 79 XML_CHAR_ENCODING_ASCII= 22 /* pure ASCII */ 80} xmlCharEncoding; 81 82/** 83 * xmlCharEncodingInputFunc: 84 * @out: a pointer to an array of bytes to store the UTF-8 result 85 * @outlen: the length of @out 86 * @in: a pointer to an array of chars in the original encoding 87 * @inlen: the length of @in 88 * 89 * Take a block of chars in the original encoding and try to convert 90 * it to an UTF-8 block of chars out. 91 * 92 * Returns the number of bytes written, -1 if lack of space, or -2 93 * if the transcoding failed. 94 * The value of @inlen after return is the number of octets consumed 95 * if the return value is positive, else unpredictiable. 96 * The value of @outlen after return is the number of octets consumed. 97 */ 98typedef int (* xmlCharEncodingInputFunc)(unsigned char *out, int *outlen, 99 const unsigned char *in, int *inlen); 100 101 102/** 103 * xmlCharEncodingOutputFunc: 104 * @out: a pointer to an array of bytes to store the result 105 * @outlen: the length of @out 106 * @in: a pointer to an array of UTF-8 chars 107 * @inlen: the length of @in 108 * 109 * Take a block of UTF-8 chars in and try to convert it to another 110 * encoding. 111 * Note: a first call designed to produce heading info is called with 112 * in = NULL. If stateful this should also initialize the encoder state. 113 * 114 * Returns the number of bytes written, -1 if lack of space, or -2 115 * if the transcoding failed. 116 * The value of @inlen after return is the number of octets consumed 117 * if the return value is positive, else unpredictiable. 118 * The value of @outlen after return is the number of octets produced. 119 */ 120typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen, 121 const unsigned char *in, int *inlen); 122 123 124/* 125 * Block defining the handlers for non UTF-8 encodings. 126 * If iconv is supported, there are two extra fields. 127 */ 128 129typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler; 130typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr; 131struct _xmlCharEncodingHandler { 132 char *name; 133 xmlCharEncodingInputFunc input; 134 xmlCharEncodingOutputFunc output; 135#ifdef LIBXML_ICONV_ENABLED 136 iconv_t iconv_in; 137 iconv_t iconv_out; 138#endif /* LIBXML_ICONV_ENABLED */ 139}; 140 141#ifdef __cplusplus 142} 143#endif 144#include <libxml/tree.h> 145#ifdef __cplusplus 146extern "C" { 147#endif 148 149/* 150 * Interfaces for encoding handlers. 151 */ 152XMLPUBFUN void XMLCALL 153 xmlInitCharEncodingHandlers (void); 154XMLPUBFUN void XMLCALL 155 xmlCleanupCharEncodingHandlers (void); 156XMLPUBFUN void XMLCALL 157 xmlRegisterCharEncodingHandler (xmlCharEncodingHandlerPtr handler); 158XMLPUBFUN xmlCharEncodingHandlerPtr XMLCALL 159 xmlGetCharEncodingHandler (xmlCharEncoding enc); 160XMLPUBFUN xmlCharEncodingHandlerPtr XMLCALL 161 xmlFindCharEncodingHandler (const char *name); 162XMLPUBFUN xmlCharEncodingHandlerPtr XMLCALL 163 xmlNewCharEncodingHandler (const char *name, 164 xmlCharEncodingInputFunc input, 165 xmlCharEncodingOutputFunc output); 166 167/* 168 * Interfaces for encoding names and aliases. 169 */ 170XMLPUBFUN int XMLCALL 171 xmlAddEncodingAlias (const char *name, 172 const char *alias); 173XMLPUBFUN int XMLCALL 174 xmlDelEncodingAlias (const char *alias); 175XMLPUBFUN const char * XMLCALL 176 xmlGetEncodingAlias (const char *alias); 177XMLPUBFUN void XMLCALL 178 xmlCleanupEncodingAliases (void); 179XMLPUBFUN xmlCharEncoding XMLCALL 180 xmlParseCharEncoding (const char *name); 181XMLPUBFUN const char * XMLCALL 182 xmlGetCharEncodingName (xmlCharEncoding enc); 183 184/* 185 * Interfaces directly used by the parsers. 186 */ 187XMLPUBFUN xmlCharEncoding XMLCALL 188 xmlDetectCharEncoding (const unsigned char *in, 189 int len); 190 191XMLPUBFUN int XMLCALL 192 xmlCharEncOutFunc (xmlCharEncodingHandler *handler, 193 xmlBufferPtr out, 194 xmlBufferPtr in); 195 196XMLPUBFUN int XMLCALL 197 xmlCharEncInFunc (xmlCharEncodingHandler *handler, 198 xmlBufferPtr out, 199 xmlBufferPtr in); 200XMLPUBFUN int XMLCALL 201 xmlCharEncFirstLine (xmlCharEncodingHandler *handler, 202 xmlBufferPtr out, 203 xmlBufferPtr in); 204XMLPUBFUN int XMLCALL 205 xmlCharEncCloseFunc (xmlCharEncodingHandler *handler); 206 207/* 208 * Export a few useful functions 209 */ 210#ifdef LIBXML_OUTPUT_ENABLED 211XMLPUBFUN int XMLCALL 212 UTF8Toisolat1 (unsigned char *out, 213 int *outlen, 214 const unsigned char *in, 215 int *inlen); 216#endif /* LIBXML_OUTPUT_ENABLED */ 217XMLPUBFUN int XMLCALL 218 isolat1ToUTF8 (unsigned char *out, 219 int *outlen, 220 const unsigned char *in, 221 int *inlen); 222#ifdef __cplusplus 223} 224#endif 225 226#endif /* __XML_CHAR_ENCODING_H__ */ 227