1/* $NetBSD: chartype.c,v 1.10 2011/08/16 16:25:15 christos Exp $ */ 2 3/*- 4 * Copyright (c) 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. All advertising materials mentioning features or use of this software 16 * must display the following acknowledgement: 17 * This product includes software developed by the NetBSD 18 * Foundation, Inc. and its contributors. 19 * 4. Neither the name of The NetBSD Foundation nor the names of its 20 * contributors may be used to endorse or promote products derived 21 * from this software without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 24 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 25 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 26 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 27 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 28 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 29 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 30 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 31 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 32 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 33 * POSSIBILITY OF SUCH DAMAGE. 34 */ 35 36/* 37 * chartype.c: character classification and meta information 38 */ 39#include "config.h" 40#if !defined(lint) && !defined(SCCSID) 41__RCSID("$NetBSD: chartype.c,v 1.10 2011/08/16 16:25:15 christos Exp $"); 42#endif /* not lint && not SCCSID */ 43#include "el.h" 44#include <stdlib.h> 45 46#define CT_BUFSIZ ((size_t)1024) 47 48#ifdef WIDECHAR 49protected void 50ct_conv_buff_resize(ct_buffer_t *conv, size_t mincsize, size_t minwsize) 51{ 52 void *p; 53 if (mincsize > conv->csize) { 54 conv->csize = mincsize; 55 p = el_realloc(conv->cbuff, conv->csize * sizeof(*conv->cbuff)); 56 if (p == NULL) { 57 conv->csize = 0; 58 el_free(conv->cbuff); 59 conv->cbuff = NULL; 60 } else 61 conv->cbuff = p; 62 } 63 64 if (minwsize > conv->wsize) { 65 conv->wsize = minwsize; 66 p = el_realloc(conv->wbuff, conv->wsize * sizeof(*conv->wbuff)); 67 if (p == NULL) { 68 conv->wsize = 0; 69 el_free(conv->wbuff); 70 conv->wbuff = NULL; 71 } else 72 conv->wbuff = p; 73 } 74} 75 76 77public char * 78ct_encode_string(const Char *s, ct_buffer_t *conv) 79{ 80 char *dst; 81 ssize_t used = 0; 82 83 if (!s) 84 return NULL; 85 if (!conv->cbuff) 86 ct_conv_buff_resize(conv, CT_BUFSIZ, (size_t)0); 87 if (!conv->cbuff) 88 return NULL; 89 90 dst = conv->cbuff; 91 while (*s) { 92 used = (ssize_t)(conv->csize - (size_t)(dst - conv->cbuff)); 93 if (used < 5) { 94 used = dst - conv->cbuff; 95 ct_conv_buff_resize(conv, conv->csize + CT_BUFSIZ, 96 (size_t)0); 97 if (!conv->cbuff) 98 return NULL; 99 dst = conv->cbuff + used; 100 } 101 used = ct_encode_char(dst, (size_t)5, *s); 102 if (used == -1) /* failed to encode, need more buffer space */ 103 abort(); 104 ++s; 105 dst += used; 106 } 107 *dst = '\0'; 108 return conv->cbuff; 109} 110 111public Char * 112ct_decode_string(const char *s, ct_buffer_t *conv) 113{ 114 size_t len = 0; 115 116 if (!s) 117 return NULL; 118 if (!conv->wbuff) 119 ct_conv_buff_resize(conv, (size_t)0, CT_BUFSIZ); 120 if (!conv->wbuff) 121 return NULL; 122 123 len = ct_mbstowcs(NULL, s, (size_t)0); 124 if (len == (size_t)-1) 125 return NULL; 126 if (len > conv->wsize) 127 ct_conv_buff_resize(conv, (size_t)0, len + 1); 128 if (!conv->wbuff) 129 return NULL; 130 ct_mbstowcs(conv->wbuff, s, conv->wsize); 131 return conv->wbuff; 132} 133 134 135protected Char ** 136ct_decode_argv(int argc, const char *argv[], ct_buffer_t *conv) 137{ 138 size_t bufspace; 139 int i; 140 Char *p; 141 Char **wargv; 142 ssize_t bytes; 143 144 /* Make sure we have enough space in the conversion buffer to store all 145 * the argv strings. */ 146 for (i = 0, bufspace = 0; i < argc; ++i) 147 bufspace += argv[i] ? strlen(argv[i]) + 1 : 0; 148 ct_conv_buff_resize(conv, (size_t)0, bufspace); 149 if (!conv->wsize) 150 return NULL; 151 152 wargv = el_malloc((size_t)argc * sizeof(*wargv)); 153 154 for (i = 0, p = conv->wbuff; i < argc; ++i) { 155 if (!argv[i]) { /* don't pass null pointers to mbstowcs */ 156 wargv[i] = NULL; 157 continue; 158 } else { 159 wargv[i] = p; 160 bytes = (ssize_t)mbstowcs(p, argv[i], bufspace); 161 } 162 if (bytes == -1) { 163 el_free(wargv); 164 return NULL; 165 } else 166 bytes++; /* include '\0' in the count */ 167 bufspace -= (size_t)bytes; 168 p += bytes; 169 } 170 171 return wargv; 172} 173 174 175protected size_t 176ct_enc_width(Char c) 177{ 178 /* UTF-8 encoding specific values */ 179 if (c < 0x80) 180 return 1; 181 else if (c < 0x0800) 182 return 2; 183 else if (c < 0x10000) 184 return 3; 185 else if (c < 0x110000) 186 return 4; 187 else 188 return 0; /* not a valid codepoint */ 189} 190 191protected ssize_t 192ct_encode_char(char *dst, size_t len, Char c) 193{ 194 ssize_t l = 0; 195 if (len < ct_enc_width(c)) 196 return -1; 197 l = ct_wctomb(dst, c); 198 199 if (l < 0) { 200 ct_wctomb_reset; 201 l = 0; 202 } 203 return l; 204} 205#endif 206 207protected const Char * 208ct_visual_string(const Char *s) 209{ 210 static Char *buff = NULL; 211 static size_t buffsize = 0; 212 void *p; 213 Char *dst; 214 ssize_t used = 0; 215 216 if (!s) 217 return NULL; 218 if (!buff) { 219 buffsize = CT_BUFSIZ; 220 buff = el_malloc(buffsize * sizeof(*buff)); 221 } 222 dst = buff; 223 while (*s) { 224 used = ct_visual_char(dst, buffsize - (size_t)(dst - buff), *s); 225 if (used == -1) { /* failed to encode, need more buffer space */ 226 used = dst - buff; 227 buffsize += CT_BUFSIZ; 228 p = el_realloc(buff, buffsize * sizeof(*buff)); 229 if (p == NULL) 230 goto out; 231 buff = p; 232 dst = buff + used; 233 /* don't increment s here - we want to retry it! */ 234 continue; 235 } 236 else 237 ++s; 238 dst += used; 239 } 240 if (dst >= (buff + buffsize)) { /* sigh */ 241 buffsize += 1; 242 p = el_realloc(buff, buffsize * sizeof(*buff)); 243 if (p == NULL) 244 goto out; 245 buff = p; 246 dst = buff + buffsize - 1; 247 } 248 *dst = 0; 249 return buff; 250out: 251 el_free(buff); 252 buffsize = 0; 253 return NULL; 254} 255 256 257 258protected int 259ct_visual_width(Char c) 260{ 261 int t = ct_chr_class(c); 262 switch (t) { 263 case CHTYPE_ASCIICTL: 264 return 2; /* ^@ ^? etc. */ 265 case CHTYPE_TAB: 266 return 1; /* Hmm, this really need to be handled outside! */ 267 case CHTYPE_NL: 268 return 0; /* Should this be 1 instead? */ 269#ifdef WIDECHAR 270 case CHTYPE_PRINT: 271 return wcwidth(c); 272 case CHTYPE_NONPRINT: 273 if (c > 0xffff) /* prefer standard 4-byte display over 5-byte */ 274 return 8; /* \U+12345 */ 275 else 276 return 7; /* \U+1234 */ 277#else 278 case CHTYPE_PRINT: 279 return 1; 280 case CHTYPE_NONPRINT: 281 return 4; /* \123 */ 282#endif 283 default: 284 return 0; /* should not happen */ 285 } 286} 287 288 289protected ssize_t 290ct_visual_char(Char *dst, size_t len, Char c) 291{ 292 int t = ct_chr_class(c); 293 switch (t) { 294 case CHTYPE_TAB: 295 case CHTYPE_NL: 296 case CHTYPE_ASCIICTL: 297 if (len < 2) 298 return -1; /* insufficient space */ 299 *dst++ = '^'; 300 if (c == '\177') 301 *dst = '?'; /* DEL -> ^? */ 302 else 303 *dst = c | 0100; /* uncontrolify it */ 304 return 2; 305 case CHTYPE_PRINT: 306 if (len < 1) 307 return -1; /* insufficient space */ 308 *dst = c; 309 return 1; 310 case CHTYPE_NONPRINT: 311 /* we only use single-width glyphs for display, 312 * so this is right */ 313 if ((ssize_t)len < ct_visual_width(c)) 314 return -1; /* insufficient space */ 315#ifdef WIDECHAR 316 *dst++ = '\\'; 317 *dst++ = 'U'; 318 *dst++ = '+'; 319#define tohexdigit(v) "0123456789ABCDEF"[v] 320 if (c > 0xffff) /* prefer standard 4-byte display over 5-byte */ 321 *dst++ = tohexdigit(((unsigned int) c >> 16) & 0xf); 322 *dst++ = tohexdigit(((unsigned int) c >> 12) & 0xf); 323 *dst++ = tohexdigit(((unsigned int) c >> 8) & 0xf); 324 *dst++ = tohexdigit(((unsigned int) c >> 4) & 0xf); 325 *dst = tohexdigit(((unsigned int) c ) & 0xf); 326 return c > 0xffff ? 8 : 7; 327#else 328 *dst++ = '\\'; 329#define tooctaldigit(v) ((v) + '0') 330 *dst++ = tooctaldigit(((unsigned int) c >> 6) & 0x7); 331 *dst++ = tooctaldigit(((unsigned int) c >> 3) & 0x7); 332 *dst++ = tooctaldigit(((unsigned int) c ) & 0x7); 333#endif 334 /*FALLTHROUGH*/ 335 /* these two should be handled outside this function */ 336 default: /* we should never hit the default */ 337 return 0; 338 } 339} 340 341 342 343 344protected int 345ct_chr_class(Char c) 346{ 347 if (c == '\t') 348 return CHTYPE_TAB; 349 else if (c == '\n') 350 return CHTYPE_NL; 351 else if (IsASCII(c) && Iscntrl(c)) 352 return CHTYPE_ASCIICTL; 353 else if (Isprint(c)) 354 return CHTYPE_PRINT; 355 else 356 return CHTYPE_NONPRINT; 357} 358