charset.c revision 60786
1/*
2 * Copyright (C) 1984-2000  Mark Nudelman
3 *
4 * You may distribute under the terms of either the GNU General Public
5 * License or the Less License, as specified in the README file.
6 *
7 * For more information about less, or for information on how to
8 * contact the author, see the README file.
9 */
10
11
12/*
13 * Functions to define the character set
14 * and do things specific to the character set.
15 */
16
17#include "less.h"
18#if HAVE_LOCALE
19#include <locale.h>
20#include <ctype.h>
21#endif
22
23public int utf_mode = 0;
24
25/*
26 * Predefined character sets,
27 * selected by the LESSCHARSET environment variable.
28 */
29struct charset {
30	char *name;
31	int *p_flag;
32	char *desc;
33} charsets[] = {
34	{ "ascii",	NULL,       "8bcccbcc18b95.b" },
35	{ "dos",	NULL,       "8bcccbcc12bc5b95.b." },
36	{ "ebcdic",	NULL,       "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
37	{ "iso8859",	NULL,       "8bcccbcc18b95.33b." },
38	{ "koi8-r",	NULL,       "8bcccbcc18b95.b128." },
39	{ "latin1",	NULL,       "8bcccbcc18b95.33b." },
40	{ "next",	NULL,       "8bcccbcc18b95.bb125.bb" },
41	{ "utf-8",	&utf_mode,  "8bcccbcc18b." },
42	{ NULL, NULL, NULL }
43};
44
45#define	IS_BINARY_CHAR	01
46#define	IS_CONTROL_CHAR	02
47
48static char chardef[256];
49static char *binfmt = NULL;
50public int binattr = AT_STANDOUT;
51
52
53/*
54 * Define a charset, given a description string.
55 * The string consists of 256 letters,
56 * one for each character in the charset.
57 * If the string is shorter than 256 letters, missing letters
58 * are taken to be identical to the last one.
59 * A decimal number followed by a letter is taken to be a
60 * repetition of the letter.
61 *
62 * Each letter is one of:
63 *	. normal character
64 *	b binary character
65 *	c control character
66 */
67	static void
68ichardef(s)
69	char *s;
70{
71	register char *cp;
72	register int n;
73	register char v;
74
75	n = 0;
76	v = 0;
77	cp = chardef;
78	while (*s != '\0')
79	{
80		switch (*s++)
81		{
82		case '.':
83			v = 0;
84			break;
85		case 'c':
86			v = IS_CONTROL_CHAR;
87			break;
88		case 'b':
89			v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
90			break;
91
92		case '0': case '1': case '2': case '3': case '4':
93		case '5': case '6': case '7': case '8': case '9':
94			n = (10 * n) + (s[-1] - '0');
95			continue;
96
97		default:
98			error("invalid chardef", NULL_PARG);
99			quit(QUIT_ERROR);
100			/*NOTREACHED*/
101		}
102
103		do
104		{
105			if (cp >= chardef + sizeof(chardef))
106			{
107				error("chardef longer than 256", NULL_PARG);
108				quit(QUIT_ERROR);
109				/*NOTREACHED*/
110			}
111			*cp++ = v;
112		} while (--n > 0);
113		n = 0;
114	}
115
116	while (cp < chardef + sizeof(chardef))
117		*cp++ = v;
118}
119
120/*
121 * Define a charset, given a charset name.
122 * The valid charset names are listed in the "charsets" array.
123 */
124	static int
125icharset(name)
126	register char *name;
127{
128	register struct charset *p;
129
130	if (name == NULL || *name == '\0')
131		return (0);
132
133	for (p = charsets;  p->name != NULL;  p++)
134	{
135		if (strcmp(name, p->name) == 0)
136		{
137			ichardef(p->desc);
138			if (p->p_flag != NULL)
139				*(p->p_flag) = 1;
140			return (1);
141		}
142	}
143
144	error("invalid charset name", NULL_PARG);
145	quit(QUIT_ERROR);
146	/*NOTREACHED*/
147}
148
149#if HAVE_LOCALE
150/*
151 * Define a charset, given a locale name.
152 */
153	static void
154ilocale()
155{
156	register int c;
157
158	setlocale(LC_ALL, "");
159	for (c = 0;  c < (int) sizeof(chardef);  c++)
160	{
161		if (isprint(c))
162			chardef[c] = 0;
163		else if (iscntrl(c))
164			chardef[c] = IS_CONTROL_CHAR;
165		else
166			chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
167	}
168}
169#endif
170
171/*
172 * Define the printing format for control chars.
173 */
174   	public void
175setbinfmt(s)
176	char *s;
177{
178	if (s == NULL || *s == '\0')
179		s = "*s<%X>";
180	/*
181	 * Select the attributes if it starts with "*".
182	 */
183	if (*s == '*')
184	{
185		switch (s[1])
186		{
187		case 'd':  binattr = AT_BOLD;      break;
188		case 'k':  binattr = AT_BLINK;     break;
189		case 's':  binattr = AT_STANDOUT;  break;
190		case 'u':  binattr = AT_UNDERLINE; break;
191		default:   binattr = AT_NORMAL;    break;
192		}
193		s += 2;
194	}
195	binfmt = s;
196}
197
198/*
199 * Initialize charset data structures.
200 */
201	public void
202init_charset()
203{
204	register char *s;
205
206	s = lgetenv("LESSBINFMT");
207	setbinfmt(s);
208
209	/*
210	 * See if environment variable LESSCHARSET is defined.
211	 */
212	s = lgetenv("LESSCHARSET");
213	if (icharset(s))
214		return;
215	/*
216	 * LESSCHARSET is not defined: try LESSCHARDEF.
217	 */
218	s = lgetenv("LESSCHARDEF");
219	if (s != NULL && *s != '\0')
220	{
221		ichardef(s);
222		return;
223	}
224
225#if HAVE_STRSTR
226	/*
227	 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
228	 */
229	if ((s = lgetenv("LC_ALL")) != NULL ||
230	    (s = lgetenv("LC_CTYPE")) != NULL ||
231	    (s = lgetenv("LANG")) != NULL)
232	{
233		if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
234			if (icharset("utf-8"))
235				return;
236	}
237#endif
238
239#if HAVE_LOCALE
240	/*
241	 * Use setlocale.
242	 */
243	ilocale();
244#else
245	/*
246	 * Default to "latin1".
247	 */
248	(void) icharset("latin1");
249#endif
250}
251
252/*
253 * Is a given character a "binary" character?
254 */
255	public int
256binary_char(c)
257	unsigned char c;
258{
259	c &= 0377;
260	return (chardef[c] & IS_BINARY_CHAR);
261}
262
263/*
264 * Is a given character a "control" character?
265 */
266	public int
267control_char(c)
268	int c;
269{
270	c &= 0377;
271	return (chardef[c] & IS_CONTROL_CHAR);
272}
273
274/*
275 * Return the printable form of a character.
276 * For example, in the "ascii" charset '\3' is printed as "^C".
277 */
278	public char *
279prchar(c)
280	int c;
281{
282	static char buf[8];
283
284	c &= 0377;
285	if (!control_char(c))
286		sprintf(buf, "%c", c);
287	else if (c == ESC)
288		sprintf(buf, "ESC");
289	else if (c < 128 && !control_char(c ^ 0100))
290		sprintf(buf, "^%c", c ^ 0100);
291	else
292		sprintf(buf, binfmt, c);
293	return (buf);
294}
295