charset.c revision 128345
115920Speter/* 2139969Simp * Copyright (C) 1984-2002 Mark Nudelman 3139969Simp * 415920Speter * You may distribute under the terms of either the GNU General Public 515920Speter * License or the Less License, as specified in the README file. 615920Speter * 715920Speter * For more information about less, or for information on how to 815920Speter * contact the author, see the README file. 915920Speter */ 1015920Speter 1115920Speter 1215920Speter/* 1315920Speter * Functions to define the character set 1415920Speter * and do things specific to the character set. 1515920Speter */ 1615920Speter 1715920Speter#include "less.h" 1815920Speter#if HAVE_LOCALE 1915920Speter#include <locale.h> 2015920Speter#include <ctype.h> 2115920Speter#endif 2215920Speter 2315920Speterpublic int utf_mode = 0; 2415920Speter 2515920Speter/* 2615920Speter * Predefined character sets, 2715920Speter * selected by the LESSCHARSET environment variable. 2815920Speter */ 2915920Speterstruct charset { 3015920Speter char *name; 3115920Speter int *p_flag; 3215920Speter char *desc; 3315920Speter} charsets[] = { 3415920Speter { "ascii", NULL, "8bcccbcc18b95.b" }, 3550471Speter { "dos", NULL, "8bcccbcc12bc5b223.b" }, 3615920Speter { "ebcdic", NULL, "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." }, 3715920Speter { "IBM-1047", NULL, "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" }, 3815920Speter { "iso8859", NULL, "8bcccbcc18b95.33b." }, 39157413Sstefanf { "koi8-r", NULL, "8bcccbcc18b95.b128." }, 40157413Sstefanf { "next", NULL, "8bcccbcc18b95.bb125.bb" }, 41157413Sstefanf { "utf-8", &utf_mode, "8bcccbcc18b." }, 42157413Sstefanf { NULL, NULL, NULL } 43157413Sstefanf}; 44157413Sstefanf 45157413Sstefanfstruct cs_alias { 46157413Sstefanf char *name; 47157413Sstefanf char *oname; 4815920Speter} cs_aliases[] = { 4915920Speter { "latin1", "iso8859" }, 5015920Speter { "latin9", "iso8859" }, 51100437Stjr { NULL, NULL } 52153094Sstefanf}; 5315920Speter 54153094Sstefanf#define IS_BINARY_CHAR 01 55157413Sstefanf#define IS_CONTROL_CHAR 02 5615920Speter 57153094Sstefanfstatic char chardef[256]; 58157413Sstefanfstatic char *binfmt = NULL; 5915920Speterpublic int binattr = AT_STANDOUT; 60157413Sstefanf 61157413Sstefanf 62157413Sstefanf/* 6315920Speter * Define a charset, given a description string. 64157413Sstefanf * The string consists of 256 letters, 6586505Sknu * one for each character in the charset. 6615920Speter * If the string is shorter than 256 letters, missing letters 6715920Speter * are taken to be identical to the last one. 6815920Speter * A decimal number followed by a letter is taken to be a 6915920Speter * repetition of the letter. 70153094Sstefanf * 7115920Speter * Each letter is one of: 7215920Speter * . normal character 7315920Speter * b binary character 74215520Sjilles * c control character 7515920Speter */ 7615920Speter static void 77157413Sstefanfichardef(s) 78157413Sstefanf char *s; 7915920Speter{ 80157413Sstefanf register char *cp; 81153094Sstefanf register int n; 82157413Sstefanf register char v; 83157413Sstefanf 84157413Sstefanf n = 0; 8525235Ssteve v = 0; 86153094Sstefanf cp = chardef; 8715920Speter while (*s != '\0') 8815920Speter { 89157413Sstefanf switch (*s++) 9015920Speter { 91108286Stjr case '.': 92 v = 0; 93 break; 94 case 'c': 95 v = IS_CONTROL_CHAR; 96 break; 97 case 'b': 98 v = IS_BINARY_CHAR|IS_CONTROL_CHAR; 99 break; 100 101 case '0': case '1': case '2': case '3': case '4': 102 case '5': case '6': case '7': case '8': case '9': 103 n = (10 * n) + (s[-1] - '0'); 104 continue; 105 106 default: 107 error("invalid chardef", NULL_PARG); 108 quit(QUIT_ERROR); 109 /*NOTREACHED*/ 110 } 111 112 do 113 { 114 if (cp >= chardef + sizeof(chardef)) 115 { 116 error("chardef longer than 256", NULL_PARG); 117 quit(QUIT_ERROR); 118 /*NOTREACHED*/ 119 } 120 *cp++ = v; 121 } while (--n > 0); 122 n = 0; 123 } 124 125 while (cp < chardef + sizeof(chardef)) 126 *cp++ = v; 127} 128 129/* 130 * Define a charset, given a charset name. 131 * The valid charset names are listed in the "charsets" array. 132 */ 133 static int 134icharset(name) 135 register char *name; 136{ 137 register struct charset *p; 138 register struct cs_alias *a; 139 140 if (name == NULL || *name == '\0') 141 return (0); 142 143 /* First see if the name is an alias. */ 144 for (a = cs_aliases; a->name != NULL; a++) 145 { 146 if (strcmp(name, a->name) == 0) 147 { 148 name = a->oname; 149 break; 150 } 151 } 152 153 for (p = charsets; p->name != NULL; p++) 154 { 155 if (strcmp(name, p->name) == 0) 156 { 157 ichardef(p->desc); 158 if (p->p_flag != NULL) 159 *(p->p_flag) = 1; 160 return (1); 161 } 162 } 163 164 error("invalid charset name", NULL_PARG); 165 quit(QUIT_ERROR); 166 /*NOTREACHED*/ 167 return (0); 168} 169 170#if HAVE_LOCALE 171/* 172 * Define a charset, given a locale name. 173 */ 174 static void 175ilocale() 176{ 177 register int c; 178 179 setlocale(LC_ALL, ""); 180 for (c = 0; c < (int) sizeof(chardef); c++) 181 { 182 if (isprint(c)) 183 chardef[c] = 0; 184 else if (iscntrl(c)) 185 chardef[c] = IS_CONTROL_CHAR; 186 else 187 chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR; 188 } 189} 190#endif 191 192/* 193 * Define the printing format for control chars. 194 */ 195 public void 196setbinfmt(s) 197 char *s; 198{ 199 if (s == NULL || *s == '\0') 200 s = "*s<%X>"; 201 /* 202 * Select the attributes if it starts with "*". 203 */ 204 if (*s == '*') 205 { 206 switch (s[1]) 207 { 208 case 'd': binattr = AT_BOLD; break; 209 case 'k': binattr = AT_BLINK; break; 210 case 's': binattr = AT_STANDOUT; break; 211 case 'u': binattr = AT_UNDERLINE; break; 212 default: binattr = AT_NORMAL; break; 213 } 214 s += 2; 215 } 216 binfmt = s; 217} 218 219/* 220 * Initialize charset data structures. 221 */ 222 public void 223init_charset() 224{ 225 register char *s; 226 227 s = lgetenv("LESSBINFMT"); 228 setbinfmt(s); 229 230 /* 231 * See if environment variable LESSCHARSET is defined. 232 */ 233 s = lgetenv("LESSCHARSET"); 234 if (icharset(s)) 235 return; 236 /* 237 * LESSCHARSET is not defined: try LESSCHARDEF. 238 */ 239 s = lgetenv("LESSCHARDEF"); 240 if (s != NULL && *s != '\0') 241 { 242 ichardef(s); 243 return; 244 } 245 246#if HAVE_STRSTR 247 /* 248 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used. 249 */ 250 if ((s = lgetenv("LC_ALL")) != NULL || 251 (s = lgetenv("LC_CTYPE")) != NULL || 252 (s = lgetenv("LANG")) != NULL) 253 { 254 if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL) 255 if (icharset("utf-8")) 256 return; 257 } 258#endif 259 260#if HAVE_LOCALE 261 /* 262 * Use setlocale. 263 */ 264 ilocale(); 265#else 266#if MSDOS_COMPILER 267 /* 268 * Default to "dos". 269 */ 270 (void) icharset("dos"); 271#else 272 /* 273 * Default to "latin1". 274 */ 275 (void) icharset("latin1"); 276#endif 277#endif 278} 279 280/* 281 * Is a given character a "binary" character? 282 */ 283 public int 284binary_char(c) 285 unsigned char c; 286{ 287 c &= 0377; 288 return (chardef[c] & IS_BINARY_CHAR); 289} 290 291/* 292 * Is a given character a "control" character? 293 */ 294 public int 295control_char(c) 296 int c; 297{ 298 c &= 0377; 299 return (chardef[c] & IS_CONTROL_CHAR); 300} 301 302/* 303 * Return the printable form of a character. 304 * For example, in the "ascii" charset '\3' is printed as "^C". 305 */ 306 public char * 307prchar(c) 308 int c; 309{ 310 static char buf[8]; 311 312 c &= 0377; 313 if (!control_char(c)) 314 sprintf(buf, "%c", c); 315 else if (c == ESC) 316 sprintf(buf, "ESC"); 317#if IS_EBCDIC_HOST 318 else if (!binary_char(c) && c < 64) 319 sprintf(buf, "^%c", 320 /* 321 * This array roughly inverts CONTROL() #defined in less.h, 322 * and should be kept in sync with CONTROL() and IBM-1047. 323 */ 324 "@ABC.I.?...KLMNO" 325 "PQRS.JH.XY.." 326 "\\]^_" 327 "......W[.....EFG" 328 "..V....D....TU.Z"[c]); 329#else 330 else if (c < 128 && !control_char(c ^ 0100)) 331 sprintf(buf, "^%c", c ^ 0100); 332#endif 333 else 334 sprintf(buf, binfmt, c); 335 return (buf); 336} 337