/* charset.c revision 89019 */
11556Srgrimes/* 21556Srgrimes * Copyright (C) 1984-2000 Mark Nudelman 31556Srgrimes * 41556Srgrimes * You may distribute under the terms of either the GNU General Public 51556Srgrimes * License or the Less License, as specified in the README file. 61556Srgrimes * 71556Srgrimes * For more information about less, or for information on how to 81556Srgrimes * contact the author, see the README file. 91556Srgrimes */ 101556Srgrimes 111556Srgrimes 121556Srgrimes/* 131556Srgrimes * Functions to define the character set 141556Srgrimes * and do things specific to the character set. 151556Srgrimes */ 161556Srgrimes 171556Srgrimes#include "less.h" 181556Srgrimes#if HAVE_LOCALE 191556Srgrimes#include <locale.h> 201556Srgrimes#include <ctype.h> 211556Srgrimes#endif 221556Srgrimes 231556Srgrimespublic int utf_mode = 0; 241556Srgrimes 251556Srgrimes/* 261556Srgrimes * Predefined character sets, 271556Srgrimes * selected by the LESSCHARSET environment variable. 281556Srgrimes */ 291556Srgrimesstruct charset { 301556Srgrimes char *name; 311556Srgrimes int *p_flag; 321556Srgrimes char *desc; 331556Srgrimes} charsets[] = { 341556Srgrimes { "ascii", NULL, "8bcccbcc18b95.b" }, 351556Srgrimes { "dos", NULL, "8bcccbcc12bc5b223.b" }, 361556Srgrimes { "ebcdic", NULL, "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." }, 371556Srgrimes { "IBM-1047", NULL, "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" }, 3836150Scharnier { "iso8859", NULL, "8bcccbcc18b95.33b." }, 3936150Scharnier { "koi8-r", NULL, "8bcccbcc18b95.b128." }, 4036150Scharnier { "next", NULL, "8bcccbcc18b95.bb125.bb" }, 4136150Scharnier { "utf-8", &utf_mode, "8bcccbcc18b." 
}, 4238521Scracauer { NULL, NULL, NULL } 431556Srgrimes}; 441556Srgrimes 4517987Speterstruct cs_alias { 4617987Speter char *name; 4717987Speter char *oname; 4817987Speter} cs_aliases[] = { 4917987Speter { "latin1", "iso8859" }, 5017987Speter { "latin9", "iso8859" }, 5117987Speter { NULL, NULL } 5217987Speter}; 5317987Speter 5417987Speter#define IS_BINARY_CHAR 01 5517987Speter#define IS_CONTROL_CHAR 02 5618018Speter 5717987Speterstatic char chardef[256]; 581556Srgrimesstatic char *binfmt = NULL; 591556Srgrimespublic int binattr = AT_STANDOUT; 6025222Ssteve 611556Srgrimes 6217987Speter/* 6317987Speter * Define a charset, given a description string. 6417987Speter * The string consists of 256 letters, 651556Srgrimes * one for each character in the charset. 661556Srgrimes * If the string is shorter than 256 letters, missing letters 6717987Speter * are taken to be identical to the last one. 6817987Speter * A decimal number followed by a letter is taken to be a 691556Srgrimes * repetition of the letter. 701556Srgrimes * 711556Srgrimes * Each letter is one of: 721556Srgrimes * . 
normal character 731556Srgrimes * b binary character 741556Srgrimes * c control character 751556Srgrimes */ 761556Srgrimes static void 771556Srgrimesichardef(s) 781556Srgrimes char *s; 791556Srgrimes{ 801556Srgrimes register char *cp; 811556Srgrimes register int n; 821556Srgrimes register char v; 831556Srgrimes 841556Srgrimes n = 0; 8528346Ssteve v = 0; 861556Srgrimes cp = chardef; 871556Srgrimes while (*s != '\0') 8828346Ssteve { 891556Srgrimes switch (*s++) 9038521Scracauer { 9138521Scracauer case '.': 921556Srgrimes v = 0; 9320425Ssteve break; 9417987Speter case 'c': 9520425Ssteve v = IS_CONTROL_CHAR; 9617987Speter break; 9717987Speter case 'b': 9817987Speter v = IS_BINARY_CHAR|IS_CONTROL_CHAR; 9920425Ssteve break; 10038521Scracauer 10120425Ssteve case '0': case '1': case '2': case '3': case '4': 10217987Speter case '5': case '6': case '7': case '8': case '9': 10317987Speter n = (10 * n) + (s[-1] - '0'); 10417987Speter continue; 1051556Srgrimes 1061556Srgrimes default: 1071556Srgrimes error("invalid chardef", NULL_PARG); 1081556Srgrimes quit(QUIT_ERROR); 1091556Srgrimes /*NOTREACHED*/ 1101556Srgrimes } 1111556Srgrimes 1121556Srgrimes do 1131556Srgrimes { 1141556Srgrimes if (cp >= chardef + sizeof(chardef)) 1151556Srgrimes { 1161556Srgrimes error("chardef longer than 256", NULL_PARG); 11720425Ssteve quit(QUIT_ERROR); 1181556Srgrimes /*NOTREACHED*/ 11920425Ssteve } 12017987Speter *cp++ = v; 12117987Speter } while (--n > 0); 1221556Srgrimes n = 0; 1231556Srgrimes } 1241556Srgrimes 1251556Srgrimes while (cp < chardef + sizeof(chardef)) 1261556Srgrimes *cp++ = v; 1271556Srgrimes} 1281556Srgrimes 1291556Srgrimes/* 13020425Ssteve * Define a charset, given a charset name. 1311556Srgrimes * The valid charset names are listed in the "charsets" array. 
13220425Ssteve */ 13320425Ssteve static int 13420425Ssteveicharset(name) 13520425Ssteve register char *name; 1361556Srgrimes{ 1371556Srgrimes register struct charset *p; 1381556Srgrimes register struct cs_alias *a; 1391556Srgrimes 1401556Srgrimes if (name == NULL || *name == '\0') 14117987Speter return (0); 14217987Speter 1431556Srgrimes /* First see if the name is an alias. */ 1441556Srgrimes for (a = cs_aliases; a->name != NULL; a++) 1451556Srgrimes { 1461556Srgrimes if (strcmp(name, a->name) == 0) 1471556Srgrimes { 1481556Srgrimes name = a->oname; 1491556Srgrimes break; 1501556Srgrimes } 1511556Srgrimes } 1521556Srgrimes 1531556Srgrimes for (p = charsets; p->name != NULL; p++) 1541556Srgrimes { 1551556Srgrimes if (strcmp(name, p->name) == 0) 1561556Srgrimes { 15717987Speter ichardef(p->desc); 15820425Ssteve if (p->p_flag != NULL) 1591556Srgrimes *(p->p_flag) = 1; 16020425Ssteve return (1); 16120425Ssteve } 16220425Ssteve } 1631556Srgrimes 16417987Speter error("invalid charset name", NULL_PARG); 16520425Ssteve quit(QUIT_ERROR); 1661556Srgrimes /*NOTREACHED*/ 16720425Ssteve} 16820425Ssteve 16920425Ssteve#if HAVE_LOCALE 1701556Srgrimes/* 1711556Srgrimes * Define a charset, given a locale name. 1721556Srgrimes */ 1731556Srgrimes static void 1741556Srgrimesilocale() 1751556Srgrimes{ 17620425Ssteve register int c; 1771556Srgrimes 1781556Srgrimes setlocale(LC_ALL, ""); 1791556Srgrimes for (c = 0; c < (int) sizeof(chardef); c++) 18028346Ssteve { 18117987Speter if (isprint(c)) 1821556Srgrimes chardef[c] = 0; 1831556Srgrimes else if (iscntrl(c)) 1841556Srgrimes chardef[c] = IS_CONTROL_CHAR; 1851556Srgrimes else 1861556Srgrimes chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR; 1871556Srgrimes } 1881556Srgrimes} 1891556Srgrimes#endif 1901556Srgrimes 1911556Srgrimes/* 1921556Srgrimes * Define the printing format for control chars. 
1931556Srgrimes */ 1941556Srgrimes public void 19517987Spetersetbinfmt(s) 19617987Speter char *s; 19725905Ssteve{ 19820425Ssteve if (s == NULL || *s == '\0') 19917987Speter s = "*s<%X>"; 2001556Srgrimes /* 2011556Srgrimes * Select the attributes if it starts with "*". 2021556Srgrimes */ 2031556Srgrimes if (*s == '*') 2041556Srgrimes { 2051556Srgrimes switch (s[1]) 2061556Srgrimes { 2071556Srgrimes case 'd': binattr = AT_BOLD; break; 20820425Ssteve case 'k': binattr = AT_BLINK; break; 2091556Srgrimes case 's': binattr = AT_STANDOUT; break; 21020425Ssteve case 'u': binattr = AT_UNDERLINE; break; 21120425Ssteve default: binattr = AT_NORMAL; break; 21220425Ssteve } 2131556Srgrimes s += 2; 2141556Srgrimes } 2151556Srgrimes binfmt = s; 2161556Srgrimes} 2171556Srgrimes 2181556Srgrimes/* 2191556Srgrimes * Initialize charset data structures. 2201556Srgrimes */ 22117987Speter public void 22217987Speterinit_charset() 22317987Speter{ 22420425Ssteve register char *s; 22517987Speter 2261556Srgrimes s = lgetenv("LESSBINFMT"); 2271556Srgrimes setbinfmt(s); 2281556Srgrimes 2291556Srgrimes /* 2301556Srgrimes * See if environment variable LESSCHARSET is defined. 2311556Srgrimes */ 2321556Srgrimes s = lgetenv("LESSCHARSET"); 2331556Srgrimes if (icharset(s)) 2341556Srgrimes return; 2351556Srgrimes /* 2361556Srgrimes * LESSCHARSET is not defined: try LESSCHARDEF. 2371556Srgrimes */ 2381556Srgrimes s = lgetenv("LESSCHARDEF"); 2391556Srgrimes if (s != NULL && *s != '\0') 2401556Srgrimes { 24117987Speter ichardef(s); 2421556Srgrimes return; 2431556Srgrimes } 2441556Srgrimes 2451556Srgrimes#if HAVE_STRSTR 2461556Srgrimes /* 2471556Srgrimes * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used. 
2481556Srgrimes */ 2491556Srgrimes if ((s = lgetenv("LC_ALL")) != NULL || 25026104Ssteve (s = lgetenv("LC_CTYPE")) != NULL || 2511556Srgrimes (s = lgetenv("LANG")) != NULL) 2521556Srgrimes { 2531556Srgrimes if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL) 2541556Srgrimes if (icharset("utf-8")) 2551556Srgrimes return; 2561556Srgrimes } 2571556Srgrimes#endif 2581556Srgrimes 2591556Srgrimes#if HAVE_LOCALE 2601556Srgrimes /* 26117987Speter * Use setlocale. 26225905Ssteve */ 26325905Ssteve ilocale(); 26417987Speter#else 2651556Srgrimes#if MSDOS_COMPILER 2661556Srgrimes /* 2671556Srgrimes * Default to "dos". 2681556Srgrimes */ 2691556Srgrimes (void) icharset("dos"); 2701556Srgrimes#else 2711556Srgrimes /* 2721556Srgrimes * Default to "latin1". 2731556Srgrimes */ 2741556Srgrimes (void) icharset("latin1"); 2751556Srgrimes#endif 2761556Srgrimes#endif 2771556Srgrimes} 2781556Srgrimes 2791556Srgrimes/* 28020425Ssteve * Is a given character a "binary" character? 28117987Speter */ 28217987Speter public int 2831556Srgrimesbinary_char(c) 2841556Srgrimes unsigned char c; 2851556Srgrimes{ 2861556Srgrimes c &= 0377; 2871556Srgrimes return (chardef[c] & IS_BINARY_CHAR); 2881556Srgrimes} 2891556Srgrimes 2901556Srgrimes/* 2911556Srgrimes * Is a given character a "control" character? 2921556Srgrimes */ 2931556Srgrimes public int 2941556Srgrimescontrol_char(c) 2951556Srgrimes int c; 2961556Srgrimes{ 2971556Srgrimes c &= 0377; 2981556Srgrimes return (chardef[c] & IS_CONTROL_CHAR); 2991556Srgrimes} 3001556Srgrimes 3011556Srgrimes/* 3021556Srgrimes * Return the printable form of a character. 3031556Srgrimes * For example, in the "ascii" charset '\3' is printed as "^C". 
3041556Srgrimes */ 3051556Srgrimes public char * 3061556Srgrimesprchar(c) 3071556Srgrimes int c; 3081556Srgrimes{ 3091556Srgrimes static char buf[8]; 3101556Srgrimes 3111556Srgrimes c &= 0377; 3121556Srgrimes if (!control_char(c)) 31326104Ssteve sprintf(buf, "%c", c); 31426104Ssteve else if (c == ESC) 3151556Srgrimes sprintf(buf, "ESC"); 3161556Srgrimes#if IS_EBCDIC_HOST 31726104Ssteve else if (!binary_char(c) && c < 64) 31826104Ssteve sprintf(buf, "^%c", 31926104Ssteve /* 3201556Srgrimes * This array roughly inverts CONTROL() #defined in less.h, 32126104Ssteve * and should be kept in sync with CONTROL() and IBM-1047. 32217987Speter */ 32317987Speter "@ABC.I.?...KLMNO" 3241556Srgrimes "PQRS.JH.XY.." 3251556Srgrimes "\\]^_" 32626104Ssteve "......W[.....EFG" 3271556Srgrimes "..V....D....TU.Z"[c]); 3281556Srgrimes#else 3291556Srgrimes else if (c < 128 && !control_char(c ^ 0100)) 3301556Srgrimes sprintf(buf, "^%c", c ^ 0100); 3311556Srgrimes#endif 3321556Srgrimes else 3331556Srgrimes sprintf(buf, binfmt, c); 3341556Srgrimes return (buf); 3351556Srgrimes} 3361556Srgrimes