charset.c revision 89019
1/*
2 * Copyright (C) 1984-2000  Mark Nudelman
3 *
4 * You may distribute under the terms of either the GNU General Public
5 * License or the Less License, as specified in the README file.
6 *
7 * For more information about less, or for information on how to
8 * contact the author, see the README file.
9 */
10
11
12/*
13 * Functions to define the character set
14 * and do things specific to the character set.
15 */
16
17#include "less.h"
18#if HAVE_LOCALE
19#include <locale.h>
20#include <ctype.h>
21#endif
22
/*
 * Set to 1 by icharset() when the selected charset's table entry
 * points its p_flag at this variable; in the charsets table that is
 * the "utf-8" entry only.
 */
public int utf_mode = 0;
24
/*
 * Predefined character sets,
 * selected by the LESSCHARSET environment variable.
 *
 * Each entry has:
 *	name	the charset name, matched exactly (strcmp) by icharset().
 *	p_flag	if not NULL, an int which icharset() sets to 1 when
 *		this charset is selected.
 *	desc	the description string handed to ichardef(); see the
 *		comment on ichardef() for its syntax.
 * The table is terminated by an entry whose name is NULL.
 */
struct charset {
	char *name;
	int *p_flag;
	char *desc;
} charsets[] = {
	{ "ascii",	NULL,       "8bcccbcc18b95.b" },
	{ "dos",	NULL,       "8bcccbcc12bc5b223.b" },
	{ "ebcdic",	NULL,       "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
	{ "IBM-1047",	NULL,       "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
	{ "iso8859",	NULL,       "8bcccbcc18b95.33b." },
	{ "koi8-r",	NULL,       "8bcccbcc18b95.b128." },
	{ "next",	NULL,       "8bcccbcc18b95.bb125.bb" },
	{ "utf-8",	&utf_mode,  "8bcccbcc18b." },
	{ NULL, NULL, NULL }
};
44
/*
 * Alternate names for predefined character sets.
 * icharset() replaces a requested name that equals "name" with "oname"
 * before searching the charsets table.
 * The table is terminated by an entry whose name is NULL.
 */
struct cs_alias {
	char *name;	/* Alias as given by the user */
	char *oname;	/* Name to look up in charsets[] instead */
} cs_aliases[] = {
	{ "latin1",	"iso8859" },
	{ "latin9",	"iso8859" },
	{ NULL, NULL }
};
53
/*
 * Flag bits stored in chardef[].
 * A "binary" character is stored with both bits set (see ichardef
 * and ilocale); a normal character has neither bit set.
 */
#define	IS_BINARY_CHAR	01
#define	IS_CONTROL_CHAR	02

/* Flags for each of the 256 possible character values. */
static char chardef[256];
/* printf-style format used by prchar(); set by setbinfmt(). */
static char *binfmt = NULL;
/* Display attribute selected by setbinfmt(). */
public int binattr = AT_STANDOUT;
60
61
62/*
63 * Define a charset, given a description string.
64 * The string consists of 256 letters,
65 * one for each character in the charset.
66 * If the string is shorter than 256 letters, missing letters
67 * are taken to be identical to the last one.
68 * A decimal number followed by a letter is taken to be a
69 * repetition of the letter.
70 *
71 * Each letter is one of:
72 *	. normal character
73 *	b binary character
74 *	c control character
75 */
76	static void
77ichardef(s)
78	char *s;
79{
80	register char *cp;
81	register int n;
82	register char v;
83
84	n = 0;
85	v = 0;
86	cp = chardef;
87	while (*s != '\0')
88	{
89		switch (*s++)
90		{
91		case '.':
92			v = 0;
93			break;
94		case 'c':
95			v = IS_CONTROL_CHAR;
96			break;
97		case 'b':
98			v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
99			break;
100
101		case '0': case '1': case '2': case '3': case '4':
102		case '5': case '6': case '7': case '8': case '9':
103			n = (10 * n) + (s[-1] - '0');
104			continue;
105
106		default:
107			error("invalid chardef", NULL_PARG);
108			quit(QUIT_ERROR);
109			/*NOTREACHED*/
110		}
111
112		do
113		{
114			if (cp >= chardef + sizeof(chardef))
115			{
116				error("chardef longer than 256", NULL_PARG);
117				quit(QUIT_ERROR);
118				/*NOTREACHED*/
119			}
120			*cp++ = v;
121		} while (--n > 0);
122		n = 0;
123	}
124
125	while (cp < chardef + sizeof(chardef))
126		*cp++ = v;
127}
128
129/*
130 * Define a charset, given a charset name.
131 * The valid charset names are listed in the "charsets" array.
132 */
133	static int
134icharset(name)
135	register char *name;
136{
137	register struct charset *p;
138	register struct cs_alias *a;
139
140	if (name == NULL || *name == '\0')
141		return (0);
142
143	/* First see if the name is an alias. */
144	for (a = cs_aliases;  a->name != NULL;  a++)
145	{
146		if (strcmp(name, a->name) == 0)
147		{
148			name = a->oname;
149			break;
150		}
151	}
152
153	for (p = charsets;  p->name != NULL;  p++)
154	{
155		if (strcmp(name, p->name) == 0)
156		{
157			ichardef(p->desc);
158			if (p->p_flag != NULL)
159				*(p->p_flag) = 1;
160			return (1);
161		}
162	}
163
164	error("invalid charset name", NULL_PARG);
165	quit(QUIT_ERROR);
166	/*NOTREACHED*/
167}
168
#if HAVE_LOCALE
/*
 * Build the character table from the C library's character
 * classification, after selecting the locale named by the environment.
 * Printable characters become normal characters, other characters
 * which the library calls control characters become control characters,
 * and everything else becomes a binary character.
 */
	static void
ilocale()
{
	register int ch;
	register char *entry;

	setlocale(LC_ALL, "");
	ch = 0;
	for (entry = chardef;  entry < chardef + sizeof(chardef);  entry++)
	{
		if (isprint(ch))
			*entry = 0;
		else
			*entry = iscntrl(ch) ?
				IS_CONTROL_CHAR :
				(IS_BINARY_CHAR|IS_CONTROL_CHAR);
		ch++;
	}
}
#endif
190
191/*
192 * Define the printing format for control chars.
193 */
194   	public void
195setbinfmt(s)
196	char *s;
197{
198	if (s == NULL || *s == '\0')
199		s = "*s<%X>";
200	/*
201	 * Select the attributes if it starts with "*".
202	 */
203	if (*s == '*')
204	{
205		switch (s[1])
206		{
207		case 'd':  binattr = AT_BOLD;      break;
208		case 'k':  binattr = AT_BLINK;     break;
209		case 's':  binattr = AT_STANDOUT;  break;
210		case 'u':  binattr = AT_UNDERLINE; break;
211		default:   binattr = AT_NORMAL;    break;
212		}
213		s += 2;
214	}
215	binfmt = s;
216}
217
218/*
219 * Initialize charset data structures.
220 */
221	public void
222init_charset()
223{
224	register char *s;
225
226	s = lgetenv("LESSBINFMT");
227	setbinfmt(s);
228
229	/*
230	 * See if environment variable LESSCHARSET is defined.
231	 */
232	s = lgetenv("LESSCHARSET");
233	if (icharset(s))
234		return;
235	/*
236	 * LESSCHARSET is not defined: try LESSCHARDEF.
237	 */
238	s = lgetenv("LESSCHARDEF");
239	if (s != NULL && *s != '\0')
240	{
241		ichardef(s);
242		return;
243	}
244
245#if HAVE_STRSTR
246	/*
247	 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
248	 */
249	if ((s = lgetenv("LC_ALL")) != NULL ||
250	    (s = lgetenv("LC_CTYPE")) != NULL ||
251	    (s = lgetenv("LANG")) != NULL)
252	{
253		if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
254			if (icharset("utf-8"))
255				return;
256	}
257#endif
258
259#if HAVE_LOCALE
260	/*
261	 * Use setlocale.
262	 */
263	ilocale();
264#else
265#if MSDOS_COMPILER
266	/*
267	 * Default to "dos".
268	 */
269	(void) icharset("dos");
270#else
271	/*
272	 * Default to "latin1".
273	 */
274	(void) icharset("latin1");
275#endif
276#endif
277}
278
279/*
280 * Is a given character a "binary" character?
281 */
282	public int
283binary_char(c)
284	unsigned char c;
285{
286	c &= 0377;
287	return (chardef[c] & IS_BINARY_CHAR);
288}
289
290/*
291 * Is a given character a "control" character?
292 */
293	public int
294control_char(c)
295	int c;
296{
297	c &= 0377;
298	return (chardef[c] & IS_CONTROL_CHAR);
299}
300
301/*
302 * Return the printable form of a character.
303 * For example, in the "ascii" charset '\3' is printed as "^C".
304 */
305	public char *
306prchar(c)
307	int c;
308{
309	static char buf[8];
310
311	c &= 0377;
312	if (!control_char(c))
313		sprintf(buf, "%c", c);
314	else if (c == ESC)
315		sprintf(buf, "ESC");
316#if IS_EBCDIC_HOST
317	else if (!binary_char(c) && c < 64)
318		sprintf(buf, "^%c",
319		/*
320		 * This array roughly inverts CONTROL() #defined in less.h,
321	 	 * and should be kept in sync with CONTROL() and IBM-1047.
322 	 	 */
323		"@ABC.I.?...KLMNO"
324		"PQRS.JH.XY.."
325		"\\]^_"
326		"......W[.....EFG"
327		"..V....D....TU.Z"[c]);
328#else
329  	else if (c < 128 && !control_char(c ^ 0100))
330  		sprintf(buf, "^%c", c ^ 0100);
331#endif
332	else
333		sprintf(buf, binfmt, c);
334	return (buf);
335}
336