/*
 * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#ifndef CHARSET_H
#define CHARSET_H

#include <stdlib.h>

/*
 * These functions are like the C library's mbtowc() and wctomb(),
 * but instead of depending on the locale they always work in UTF-8,
 * and they use int instead of wchar_t.
 */

int utf8_mbtowc(int *pwc, const char *s, size_t n);
int utf8_wctomb(char *s, int wc);

/*
 * This is an object-oriented version of mbtowc() and wctomb().
 * The caller first uses charset_find() to get a pointer to a struct
 * charset, then uses the mbtowc() and wctomb() methods on it.
 * The function charset_max() gives the maximum length of a
 * multibyte character in that encoding.
 * This API is only appropriate for stateless encodings like UTF-8
 * or ISO-8859-3, but I have no intention of implementing anything
 * other than UTF-8 and 8-bit encodings.
 *
 * MINOR BUG: If there is no memory, charset_find() may return 0, and
 * there is no way to distinguish this case from an unknown encoding.
 */

/* Opaque handle; the definition is not part of this header. */
struct charset;

struct charset *charset_find(const char *code);

int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n);
int charset_wctomb(struct charset *charset, char *s, int wc);
int charset_max(struct charset *charset);

/*
 * Function to convert a buffer from one encoding to another.
 * Invalid bytes are replaced by '#', and characters that are
 * not available in the target encoding are replaced by '?'.
 * Each of TO and TOLEN may be zero if the result is not wanted.
 * The input or output may contain null bytes, but the output
 * buffer is also null-terminated, so it is all right to
 * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
 *
 * NOTE(review): the buffer stored in *TO is presumably allocated by
 * this function and owned by the caller afterwards; likewise *TOLEN
 * presumably receives the output length excluding the terminator.
 * Neither is stated in this header -- confirm against the
 * implementation.
 *
 * Return value:
 *
 *  -2 : memory allocation failed
 *  -1 : unknown encoding
 *   0 : data was converted exactly
 *   1 : valid data was converted approximately (using '?')
 *   2 : input was invalid (but still converted, using '#')
 */

int charset_convert(const char *fromcode, const char *tocode,
                    const char *from, size_t fromlen,
                    char **to, size_t *tolen);

#endif /* CHARSET_H */