charset.c revision 89019
11556Srgrimes/*
21556Srgrimes * Copyright (C) 1984-2000  Mark Nudelman
31556Srgrimes *
41556Srgrimes * You may distribute under the terms of either the GNU General Public
51556Srgrimes * License or the Less License, as specified in the README file.
61556Srgrimes *
71556Srgrimes * For more information about less, or for information on how to
81556Srgrimes * contact the author, see the README file.
91556Srgrimes */
101556Srgrimes
111556Srgrimes
121556Srgrimes/*
131556Srgrimes * Functions to define the character set
141556Srgrimes * and do things specific to the character set.
151556Srgrimes */
161556Srgrimes
#include "less.h"
#include <stdio.h>
#if HAVE_LOCALE
#include <locale.h>
#include <ctype.h>
#endif
221556Srgrimes
231556Srgrimespublic int utf_mode = 0;
241556Srgrimes
251556Srgrimes/*
261556Srgrimes * Predefined character sets,
271556Srgrimes * selected by the LESSCHARSET environment variable.
281556Srgrimes */
291556Srgrimesstruct charset {
301556Srgrimes	char *name;
311556Srgrimes	int *p_flag;
321556Srgrimes	char *desc;
331556Srgrimes} charsets[] = {
341556Srgrimes	{ "ascii",	NULL,       "8bcccbcc18b95.b" },
351556Srgrimes	{ "dos",	NULL,       "8bcccbcc12bc5b223.b" },
361556Srgrimes	{ "ebcdic",	NULL,       "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
371556Srgrimes	{ "IBM-1047",	NULL,       "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
3836150Scharnier	{ "iso8859",	NULL,       "8bcccbcc18b95.33b." },
3936150Scharnier	{ "koi8-r",	NULL,       "8bcccbcc18b95.b128." },
4036150Scharnier	{ "next",	NULL,       "8bcccbcc18b95.bb125.bb" },
4136150Scharnier	{ "utf-8",	&utf_mode,  "8bcccbcc18b." },
4238521Scracauer	{ NULL, NULL, NULL }
431556Srgrimes};
441556Srgrimes
/*
 * Alternate charset names.
 *	name	the alias
 *	oname	the name in the charsets table that the alias stands for
 * An entry whose name is NULL terminates the table.
 */
struct cs_alias {
	char *name;
	char *oname;
};

struct cs_alias cs_aliases[] = {
	{ "latin1", "iso8859" },
	{ "latin9", "iso8859" },
	{ NULL,     NULL }
};
5317987Speter
5417987Speter#define	IS_BINARY_CHAR	01
5517987Speter#define	IS_CONTROL_CHAR	02
5618018Speter
5717987Speterstatic char chardef[256];
581556Srgrimesstatic char *binfmt = NULL;
591556Srgrimespublic int binattr = AT_STANDOUT;
6025222Ssteve
611556Srgrimes
6217987Speter/*
6317987Speter * Define a charset, given a description string.
6417987Speter * The string consists of 256 letters,
651556Srgrimes * one for each character in the charset.
661556Srgrimes * If the string is shorter than 256 letters, missing letters
6717987Speter * are taken to be identical to the last one.
6817987Speter * A decimal number followed by a letter is taken to be a
691556Srgrimes * repetition of the letter.
701556Srgrimes *
711556Srgrimes * Each letter is one of:
721556Srgrimes *	. normal character
731556Srgrimes *	b binary character
741556Srgrimes *	c control character
751556Srgrimes */
761556Srgrimes	static void
771556Srgrimesichardef(s)
781556Srgrimes	char *s;
791556Srgrimes{
801556Srgrimes	register char *cp;
811556Srgrimes	register int n;
821556Srgrimes	register char v;
831556Srgrimes
841556Srgrimes	n = 0;
8528346Ssteve	v = 0;
861556Srgrimes	cp = chardef;
871556Srgrimes	while (*s != '\0')
8828346Ssteve	{
891556Srgrimes		switch (*s++)
9038521Scracauer		{
9138521Scracauer		case '.':
921556Srgrimes			v = 0;
9320425Ssteve			break;
9417987Speter		case 'c':
9520425Ssteve			v = IS_CONTROL_CHAR;
9617987Speter			break;
9717987Speter		case 'b':
9817987Speter			v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
9920425Ssteve			break;
10038521Scracauer
10120425Ssteve		case '0': case '1': case '2': case '3': case '4':
10217987Speter		case '5': case '6': case '7': case '8': case '9':
10317987Speter			n = (10 * n) + (s[-1] - '0');
10417987Speter			continue;
1051556Srgrimes
1061556Srgrimes		default:
1071556Srgrimes			error("invalid chardef", NULL_PARG);
1081556Srgrimes			quit(QUIT_ERROR);
1091556Srgrimes			/*NOTREACHED*/
1101556Srgrimes		}
1111556Srgrimes
1121556Srgrimes		do
1131556Srgrimes		{
1141556Srgrimes			if (cp >= chardef + sizeof(chardef))
1151556Srgrimes			{
1161556Srgrimes				error("chardef longer than 256", NULL_PARG);
11720425Ssteve				quit(QUIT_ERROR);
1181556Srgrimes				/*NOTREACHED*/
11920425Ssteve			}
12017987Speter			*cp++ = v;
12117987Speter		} while (--n > 0);
1221556Srgrimes		n = 0;
1231556Srgrimes	}
1241556Srgrimes
1251556Srgrimes	while (cp < chardef + sizeof(chardef))
1261556Srgrimes		*cp++ = v;
1271556Srgrimes}
1281556Srgrimes
1291556Srgrimes/*
13020425Ssteve * Define a charset, given a charset name.
1311556Srgrimes * The valid charset names are listed in the "charsets" array.
13220425Ssteve */
13320425Ssteve	static int
13420425Ssteveicharset(name)
13520425Ssteve	register char *name;
1361556Srgrimes{
1371556Srgrimes	register struct charset *p;
1381556Srgrimes	register struct cs_alias *a;
1391556Srgrimes
1401556Srgrimes	if (name == NULL || *name == '\0')
14117987Speter		return (0);
14217987Speter
1431556Srgrimes	/* First see if the name is an alias. */
1441556Srgrimes	for (a = cs_aliases;  a->name != NULL;  a++)
1451556Srgrimes	{
1461556Srgrimes		if (strcmp(name, a->name) == 0)
1471556Srgrimes		{
1481556Srgrimes			name = a->oname;
1491556Srgrimes			break;
1501556Srgrimes		}
1511556Srgrimes	}
1521556Srgrimes
1531556Srgrimes	for (p = charsets;  p->name != NULL;  p++)
1541556Srgrimes	{
1551556Srgrimes		if (strcmp(name, p->name) == 0)
1561556Srgrimes		{
15717987Speter			ichardef(p->desc);
15820425Ssteve			if (p->p_flag != NULL)
1591556Srgrimes				*(p->p_flag) = 1;
16020425Ssteve			return (1);
16120425Ssteve		}
16220425Ssteve	}
1631556Srgrimes
16417987Speter	error("invalid charset name", NULL_PARG);
16520425Ssteve	quit(QUIT_ERROR);
1661556Srgrimes	/*NOTREACHED*/
16720425Ssteve}
16820425Ssteve
16920425Ssteve#if HAVE_LOCALE
1701556Srgrimes/*
1711556Srgrimes * Define a charset, given a locale name.
1721556Srgrimes */
1731556Srgrimes	static void
1741556Srgrimesilocale()
1751556Srgrimes{
17620425Ssteve	register int c;
1771556Srgrimes
1781556Srgrimes	setlocale(LC_ALL, "");
1791556Srgrimes	for (c = 0;  c < (int) sizeof(chardef);  c++)
18028346Ssteve	{
18117987Speter		if (isprint(c))
1821556Srgrimes			chardef[c] = 0;
1831556Srgrimes		else if (iscntrl(c))
1841556Srgrimes			chardef[c] = IS_CONTROL_CHAR;
1851556Srgrimes		else
1861556Srgrimes			chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
1871556Srgrimes	}
1881556Srgrimes}
1891556Srgrimes#endif
1901556Srgrimes
1911556Srgrimes/*
1921556Srgrimes * Define the printing format for control chars.
1931556Srgrimes */
1941556Srgrimes   	public void
19517987Spetersetbinfmt(s)
19617987Speter	char *s;
19725905Ssteve{
19820425Ssteve	if (s == NULL || *s == '\0')
19917987Speter		s = "*s<%X>";
2001556Srgrimes	/*
2011556Srgrimes	 * Select the attributes if it starts with "*".
2021556Srgrimes	 */
2031556Srgrimes	if (*s == '*')
2041556Srgrimes	{
2051556Srgrimes		switch (s[1])
2061556Srgrimes		{
2071556Srgrimes		case 'd':  binattr = AT_BOLD;      break;
20820425Ssteve		case 'k':  binattr = AT_BLINK;     break;
2091556Srgrimes		case 's':  binattr = AT_STANDOUT;  break;
21020425Ssteve		case 'u':  binattr = AT_UNDERLINE; break;
21120425Ssteve		default:   binattr = AT_NORMAL;    break;
21220425Ssteve		}
2131556Srgrimes		s += 2;
2141556Srgrimes	}
2151556Srgrimes	binfmt = s;
2161556Srgrimes}
2171556Srgrimes
2181556Srgrimes/*
2191556Srgrimes * Initialize charset data structures.
2201556Srgrimes */
22117987Speter	public void
22217987Speterinit_charset()
22317987Speter{
22420425Ssteve	register char *s;
22517987Speter
2261556Srgrimes	s = lgetenv("LESSBINFMT");
2271556Srgrimes	setbinfmt(s);
2281556Srgrimes
2291556Srgrimes	/*
2301556Srgrimes	 * See if environment variable LESSCHARSET is defined.
2311556Srgrimes	 */
2321556Srgrimes	s = lgetenv("LESSCHARSET");
2331556Srgrimes	if (icharset(s))
2341556Srgrimes		return;
2351556Srgrimes	/*
2361556Srgrimes	 * LESSCHARSET is not defined: try LESSCHARDEF.
2371556Srgrimes	 */
2381556Srgrimes	s = lgetenv("LESSCHARDEF");
2391556Srgrimes	if (s != NULL && *s != '\0')
2401556Srgrimes	{
24117987Speter		ichardef(s);
2421556Srgrimes		return;
2431556Srgrimes	}
2441556Srgrimes
2451556Srgrimes#if HAVE_STRSTR
2461556Srgrimes	/*
2471556Srgrimes	 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
2481556Srgrimes	 */
2491556Srgrimes	if ((s = lgetenv("LC_ALL")) != NULL ||
25026104Ssteve	    (s = lgetenv("LC_CTYPE")) != NULL ||
2511556Srgrimes	    (s = lgetenv("LANG")) != NULL)
2521556Srgrimes	{
2531556Srgrimes		if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
2541556Srgrimes			if (icharset("utf-8"))
2551556Srgrimes				return;
2561556Srgrimes	}
2571556Srgrimes#endif
2581556Srgrimes
2591556Srgrimes#if HAVE_LOCALE
2601556Srgrimes	/*
26117987Speter	 * Use setlocale.
26225905Ssteve	 */
26325905Ssteve	ilocale();
26417987Speter#else
2651556Srgrimes#if MSDOS_COMPILER
2661556Srgrimes	/*
2671556Srgrimes	 * Default to "dos".
2681556Srgrimes	 */
2691556Srgrimes	(void) icharset("dos");
2701556Srgrimes#else
2711556Srgrimes	/*
2721556Srgrimes	 * Default to "latin1".
2731556Srgrimes	 */
2741556Srgrimes	(void) icharset("latin1");
2751556Srgrimes#endif
2761556Srgrimes#endif
2771556Srgrimes}
2781556Srgrimes
2791556Srgrimes/*
28020425Ssteve * Is a given character a "binary" character?
28117987Speter */
28217987Speter	public int
2831556Srgrimesbinary_char(c)
2841556Srgrimes	unsigned char c;
2851556Srgrimes{
2861556Srgrimes	c &= 0377;
2871556Srgrimes	return (chardef[c] & IS_BINARY_CHAR);
2881556Srgrimes}
2891556Srgrimes
2901556Srgrimes/*
2911556Srgrimes * Is a given character a "control" character?
2921556Srgrimes */
2931556Srgrimes	public int
2941556Srgrimescontrol_char(c)
2951556Srgrimes	int c;
2961556Srgrimes{
2971556Srgrimes	c &= 0377;
2981556Srgrimes	return (chardef[c] & IS_CONTROL_CHAR);
2991556Srgrimes}
3001556Srgrimes
3011556Srgrimes/*
3021556Srgrimes * Return the printable form of a character.
3031556Srgrimes * For example, in the "ascii" charset '\3' is printed as "^C".
3041556Srgrimes */
3051556Srgrimes	public char *
3061556Srgrimesprchar(c)
3071556Srgrimes	int c;
3081556Srgrimes{
3091556Srgrimes	static char buf[8];
3101556Srgrimes
3111556Srgrimes	c &= 0377;
3121556Srgrimes	if (!control_char(c))
31326104Ssteve		sprintf(buf, "%c", c);
31426104Ssteve	else if (c == ESC)
3151556Srgrimes		sprintf(buf, "ESC");
3161556Srgrimes#if IS_EBCDIC_HOST
31726104Ssteve	else if (!binary_char(c) && c < 64)
31826104Ssteve		sprintf(buf, "^%c",
31926104Ssteve		/*
3201556Srgrimes		 * This array roughly inverts CONTROL() #defined in less.h,
32126104Ssteve	 	 * and should be kept in sync with CONTROL() and IBM-1047.
32217987Speter 	 	 */
32317987Speter		"@ABC.I.?...KLMNO"
3241556Srgrimes		"PQRS.JH.XY.."
3251556Srgrimes		"\\]^_"
32626104Ssteve		"......W[.....EFG"
3271556Srgrimes		"..V....D....TU.Z"[c]);
3281556Srgrimes#else
3291556Srgrimes  	else if (c < 128 && !control_char(c ^ 0100))
3301556Srgrimes  		sprintf(buf, "^%c", c ^ 0100);
3311556Srgrimes#endif
3321556Srgrimes	else
3331556Srgrimes		sprintf(buf, binfmt, c);
3341556Srgrimes	return (buf);
3351556Srgrimes}
3361556Srgrimes