1#ifndef lint 2static char *rcsid = "$Id: utf8.c,v 1.1 2003/06/04 00:26:44 marka Exp $"; 3#endif 4 5/* 6 * Copyright (c) 2000 Japan Network Information Center. All rights reserved. 7 * 8 * By using this file, you agree to the terms and conditions set forth bellow. 9 * 10 * LICENSE TERMS AND CONDITIONS 11 * 12 * The following License Terms and Conditions apply, unless a different 13 * license is obtained from Japan Network Information Center ("JPNIC"), 14 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda, 15 * Chiyoda-ku, Tokyo 101-0047, Japan. 16 * 17 * 1. Use, Modification and Redistribution (including distribution of any 18 * modified or derived work) in source and/or binary forms is permitted 19 * under this License Terms and Conditions. 20 * 21 * 2. Redistribution of source code must retain the copyright notices as they 22 * appear in each source code file, this License Terms and Conditions. 23 * 24 * 3. Redistribution in binary form must reproduce the Copyright Notice, 25 * this License Terms and Conditions, in the documentation and/or other 26 * materials provided with the distribution. For the purposes of binary 27 * distribution the "Copyright Notice" refers to the following language: 28 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved." 29 * 30 * 4. The name of JPNIC may not be used to endorse or promote products 31 * derived from this Software without specific prior written approval of 32 * JPNIC. 33 * 34 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC 35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 37 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE 38 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 39 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 40 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 41 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 42 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 43 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 44 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 45 */ 46 47#include <config.h> 48 49#include <stddef.h> 50 51#include <idn/assert.h> 52#include <idn/logmacro.h> 53#include <idn/utf8.h> 54#include <idn/debug.h> 55 56#define UTF8_WIDTH(c) \ 57 (((c) < 0x80) ? 1 : \ 58 ((c) < 0xc0) ? 0 : \ 59 ((c) < 0xe0) ? 2 : \ 60 ((c) < 0xf0) ? 3 : \ 61 ((c) < 0xf8) ? 4 : \ 62 ((c) < 0xfc) ? 5 : \ 63 ((c) < 0xfe) ? 6 : 0) 64 65#define VALID_CONT_BYTE(c) (0x80 <= (c) && (c) < 0xc0) 66 67int 68idn_utf8_mblen(const char *s) { 69 int c = *(unsigned char *)s; 70 71 assert(s != NULL); 72 73#if 0 74 TRACE(("idn_utf8_mblen(s=<%s>)\n", idn__debug_hexstring(s, 6))); 75#endif 76 77 return UTF8_WIDTH(c); 78} 79 80int 81idn_utf8_getmb(const char *s, size_t len, char *buf) { 82 /* buf must be at least 7-bytes long */ 83 const unsigned char *p = (const unsigned char *)s; 84 unsigned char *q = (unsigned char *)buf; 85 int width = UTF8_WIDTH(*p); 86 int w; 87 88 assert(s != NULL); 89 90#if 0 91 TRACE(("idn_utf8_getmb(s=<%s>,len=%d)\n", 92 idn__debug_hexstring(s, 6), len)); 93#endif 94 95 if (width == 0 || len < width) 96 return (0); 97 98 /* Copy the first byte. */ 99 *q++ = *p++; 100 101 /* .. and the rest. */ 102 w = width; 103 while (--w > 0) { 104 if (!VALID_CONT_BYTE(*p)) 105 return (0); 106 *q++ = *p++; 107 } 108 return (width); 109} 110 111int 112idn_utf8_getwc(const char *s, size_t len, unsigned long *vp) { 113 unsigned long v; 114 unsigned long min; 115 const unsigned char *p = (const unsigned char *)s; 116 int c; 117 int width; 118 int rest; 119 120 assert(s != NULL); 121 122#if 0 123 TRACE(("idn_utf8_getwc(s=<%s>,len=%d)\n", 124 idn__debug_hexstring(s, 10), len)); 125#endif 126 127 c = *p++; 128 width = UTF8_WIDTH(c); 129 130 switch (width) { 131 case 0: 132 return (0); 133 case 1: 134 v = c; 135 min = 0; 136 break; 137 case 2: 138 v = c & 0x1f; 139 min = 0x80; 140 break; 141 case 3: 142 v = c & 0xf; 143 min = 0x800; 144 break; 145 case 4: 146 v = c & 0x7; 147 min = 0x10000; 148 break; 149 case 5: 150 v = c & 3; 151 min = 0x200000; 152 break; 153 case 6: 154 v = c & 1; 155 min = 0x4000000; 156 break; 157 default: 158 FATAL(("idn_utf8_getint: internal error\n")); 159 return (0); 160 } 161 162 if (len < width) 163 return (0); 164 165 rest = width - 1; 166 while (rest-- > 0) { 167 if (!VALID_CONT_BYTE(*p)) 168 return (0); 169 v = (v << 6) | (*p & 0x3f); 170 p++; 171 } 172 173 if (v < min) 174 return (0); 175 176 *vp = v; 177 return (width); 178} 179 180int 181idn_utf8_putwc(char *s, size_t len, unsigned long v) { 182 unsigned char *p = (unsigned char *)s; 183 int mask; 184 int off; 185 int l; 186 187 assert(s != NULL); 188 189#if 0 190 TRACE(("idn_utf8_putwc(v=%lx)\n", v)); 191#endif 192 193 if (v < 0x80) { 194 mask = 0; 195 l = 1; 196 } else if (v < 0x800) { 197 mask = 0xc0; 198 l = 2; 199 } else if (v < 0x10000) { 200 mask = 0xe0; 201 l = 3; 202 } else if (v < 0x200000) { 203 mask = 0xf0; 204 l = 4; 205 } else if (v < 0x4000000) { 206 mask = 0xf8; 207 l = 5; 208 } else if (v < 0x80000000) { 209 mask = 0xfc; 210 l = 6; 211 } else { 212 return (0); 213 } 214 215 if (len < l) 216 return (0); 217 218 off = 6 * (l - 1); 219 *p++ = (v >> off) | mask; 220 mask = 0x80; 221 while (off > 0) { 222 off -= 6; 223 *p++ = ((v >> off) & 0x3f) | mask; 224 } 225 return l; 226} 227 228int 229idn_utf8_isvalidchar(const char *s) { 230 unsigned long dummy; 231 232 TRACE(("idn_utf8_isvalidchar(s=<%s>)\n", 233 idn__debug_hexstring(s, 6))); 234 235 return (idn_utf8_getwc(s, 6, &dummy) > 0); 236} 237 238int 239idn_utf8_isvalidstring(const char *s) { 240 unsigned long dummy; 241 int width; 242 243 assert(s != NULL); 244 245 TRACE(("idn_utf8_isvalidstring(s=<%s>)\n", 246 idn__debug_hexstring(s, 20))); 247 248 while (*s != '\0') { 249 width = idn_utf8_getwc(s, 6, &dummy); 250 if (width == 0) 251 return (0); 252 s += width; 253 } 254 return (1); 255} 256 257char * 258idn_utf8_findfirstbyte(const char *s, const char *known_top) { 259 const unsigned char *p = (const unsigned char *)s; 260 const unsigned char *t = (const unsigned char *)known_top; 261 262 assert(s != NULL && known_top != NULL && known_top <= s); 263 264 TRACE(("idn_utf8_findfirstbyte(s=<%s>)\n", 265 idn__debug_hexstring(s, 8))); 266 267 while (p >= t) { 268 if (!VALID_CONT_BYTE(*p)) 269 break; 270 p--; 271 } 272 if (p < t || UTF8_WIDTH(*p) == 0) 273 return (NULL); 274 275 return ((char *)p); 276} 277