charset.c revision 128345
115920Speter/*
2139969Simp * Copyright (C) 1984-2002  Mark Nudelman
3139969Simp *
415920Speter * You may distribute under the terms of either the GNU General Public
515920Speter * License or the Less License, as specified in the README file.
615920Speter *
715920Speter * For more information about less, or for information on how to
815920Speter * contact the author, see the README file.
915920Speter */
1015920Speter
1115920Speter
1215920Speter/*
1315920Speter * Functions to define the character set
1415920Speter * and do things specific to the character set.
1515920Speter */
1615920Speter
#include "less.h"
#include <stdio.h>
#if HAVE_LOCALE
#include <locale.h>
#include <ctype.h>
#endif
2215920Speter
2315920Speterpublic int utf_mode = 0;
2415920Speter
2515920Speter/*
2615920Speter * Predefined character sets,
2715920Speter * selected by the LESSCHARSET environment variable.
2815920Speter */
2915920Speterstruct charset {
3015920Speter	char *name;
3115920Speter	int *p_flag;
3215920Speter	char *desc;
3315920Speter} charsets[] = {
3415920Speter	{ "ascii",	NULL,       "8bcccbcc18b95.b" },
3550471Speter	{ "dos",	NULL,       "8bcccbcc12bc5b223.b" },
3615920Speter	{ "ebcdic",	NULL,       "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
3715920Speter	{ "IBM-1047",	NULL,       "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
3815920Speter	{ "iso8859",	NULL,       "8bcccbcc18b95.33b." },
39157413Sstefanf	{ "koi8-r",	NULL,       "8bcccbcc18b95.b128." },
40157413Sstefanf	{ "next",	NULL,       "8bcccbcc18b95.bb125.bb" },
41157413Sstefanf	{ "utf-8",	&utf_mode,  "8bcccbcc18b." },
42157413Sstefanf	{ NULL, NULL, NULL }
43157413Sstefanf};
44157413Sstefanf
/*
 * Alternate names for the predefined character sets.
 *	name	the alias
 *	oname	the name of the charset the alias stands for
 * The table ends with an entry whose name is NULL.
 */
struct cs_alias {
	char *name;
	char *oname;
};

struct cs_alias cs_aliases[] = {
	{ "latin1", "iso8859" },
	{ "latin9", "iso8859" },
	{ NULL,     NULL      }
};
5315920Speter
54153094Sstefanf#define	IS_BINARY_CHAR	01
55157413Sstefanf#define	IS_CONTROL_CHAR	02
5615920Speter
57153094Sstefanfstatic char chardef[256];
58157413Sstefanfstatic char *binfmt = NULL;
5915920Speterpublic int binattr = AT_STANDOUT;
60157413Sstefanf
61157413Sstefanf
62157413Sstefanf/*
6315920Speter * Define a charset, given a description string.
64157413Sstefanf * The string consists of 256 letters,
6586505Sknu * one for each character in the charset.
6615920Speter * If the string is shorter than 256 letters, missing letters
6715920Speter * are taken to be identical to the last one.
6815920Speter * A decimal number followed by a letter is taken to be a
6915920Speter * repetition of the letter.
70153094Sstefanf *
7115920Speter * Each letter is one of:
7215920Speter *	. normal character
7315920Speter *	b binary character
74215520Sjilles *	c control character
7515920Speter */
7615920Speter	static void
77157413Sstefanfichardef(s)
78157413Sstefanf	char *s;
7915920Speter{
80157413Sstefanf	register char *cp;
81153094Sstefanf	register int n;
82157413Sstefanf	register char v;
83157413Sstefanf
84157413Sstefanf	n = 0;
8525235Ssteve	v = 0;
86153094Sstefanf	cp = chardef;
8715920Speter	while (*s != '\0')
8815920Speter	{
89157413Sstefanf		switch (*s++)
9015920Speter		{
91108286Stjr		case '.':
92			v = 0;
93			break;
94		case 'c':
95			v = IS_CONTROL_CHAR;
96			break;
97		case 'b':
98			v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
99			break;
100
101		case '0': case '1': case '2': case '3': case '4':
102		case '5': case '6': case '7': case '8': case '9':
103			n = (10 * n) + (s[-1] - '0');
104			continue;
105
106		default:
107			error("invalid chardef", NULL_PARG);
108			quit(QUIT_ERROR);
109			/*NOTREACHED*/
110		}
111
112		do
113		{
114			if (cp >= chardef + sizeof(chardef))
115			{
116				error("chardef longer than 256", NULL_PARG);
117				quit(QUIT_ERROR);
118				/*NOTREACHED*/
119			}
120			*cp++ = v;
121		} while (--n > 0);
122		n = 0;
123	}
124
125	while (cp < chardef + sizeof(chardef))
126		*cp++ = v;
127}
128
129/*
130 * Define a charset, given a charset name.
131 * The valid charset names are listed in the "charsets" array.
132 */
133	static int
134icharset(name)
135	register char *name;
136{
137	register struct charset *p;
138	register struct cs_alias *a;
139
140	if (name == NULL || *name == '\0')
141		return (0);
142
143	/* First see if the name is an alias. */
144	for (a = cs_aliases;  a->name != NULL;  a++)
145	{
146		if (strcmp(name, a->name) == 0)
147		{
148			name = a->oname;
149			break;
150		}
151	}
152
153	for (p = charsets;  p->name != NULL;  p++)
154	{
155		if (strcmp(name, p->name) == 0)
156		{
157			ichardef(p->desc);
158			if (p->p_flag != NULL)
159				*(p->p_flag) = 1;
160			return (1);
161		}
162	}
163
164	error("invalid charset name", NULL_PARG);
165	quit(QUIT_ERROR);
166	/*NOTREACHED*/
167	return (0);
168}
169
#if HAVE_LOCALE
/*
 * Define a charset from the locale.
 * Calls setlocale(LC_ALL, "") and then classifies every character code
 * with isprint() and iscntrl():
 *	printable		no flags
 *	control			control flag only
 *	anything else		binary and control flags
 */
	static void
ilocale()
{
	register int ch;
	register char flags;

	setlocale(LC_ALL, "");
	ch = 0;
	while (ch < (int) sizeof(chardef))
	{
		if (isprint(ch))
			flags = 0;
		else if (iscntrl(ch))
			flags = IS_CONTROL_CHAR;
		else
			flags = IS_BINARY_CHAR|IS_CONTROL_CHAR;
		chardef[ch++] = flags;
	}
}
#endif
191
192/*
193 * Define the printing format for control chars.
194 */
195   	public void
196setbinfmt(s)
197	char *s;
198{
199	if (s == NULL || *s == '\0')
200		s = "*s<%X>";
201	/*
202	 * Select the attributes if it starts with "*".
203	 */
204	if (*s == '*')
205	{
206		switch (s[1])
207		{
208		case 'd':  binattr = AT_BOLD;      break;
209		case 'k':  binattr = AT_BLINK;     break;
210		case 's':  binattr = AT_STANDOUT;  break;
211		case 'u':  binattr = AT_UNDERLINE; break;
212		default:   binattr = AT_NORMAL;    break;
213		}
214		s += 2;
215	}
216	binfmt = s;
217}
218
219/*
220 * Initialize charset data structures.
221 */
222	public void
223init_charset()
224{
225	register char *s;
226
227	s = lgetenv("LESSBINFMT");
228	setbinfmt(s);
229
230	/*
231	 * See if environment variable LESSCHARSET is defined.
232	 */
233	s = lgetenv("LESSCHARSET");
234	if (icharset(s))
235		return;
236	/*
237	 * LESSCHARSET is not defined: try LESSCHARDEF.
238	 */
239	s = lgetenv("LESSCHARDEF");
240	if (s != NULL && *s != '\0')
241	{
242		ichardef(s);
243		return;
244	}
245
246#if HAVE_STRSTR
247	/*
248	 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
249	 */
250	if ((s = lgetenv("LC_ALL")) != NULL ||
251	    (s = lgetenv("LC_CTYPE")) != NULL ||
252	    (s = lgetenv("LANG")) != NULL)
253	{
254		if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
255			if (icharset("utf-8"))
256				return;
257	}
258#endif
259
260#if HAVE_LOCALE
261	/*
262	 * Use setlocale.
263	 */
264	ilocale();
265#else
266#if MSDOS_COMPILER
267	/*
268	 * Default to "dos".
269	 */
270	(void) icharset("dos");
271#else
272	/*
273	 * Default to "latin1".
274	 */
275	(void) icharset("latin1");
276#endif
277#endif
278}
279
280/*
281 * Is a given character a "binary" character?
282 */
283	public int
284binary_char(c)
285	unsigned char c;
286{
287	c &= 0377;
288	return (chardef[c] & IS_BINARY_CHAR);
289}
290
291/*
292 * Is a given character a "control" character?
293 */
294	public int
295control_char(c)
296	int c;
297{
298	c &= 0377;
299	return (chardef[c] & IS_CONTROL_CHAR);
300}
301
302/*
303 * Return the printable form of a character.
304 * For example, in the "ascii" charset '\3' is printed as "^C".
305 */
306	public char *
307prchar(c)
308	int c;
309{
310	static char buf[8];
311
312	c &= 0377;
313	if (!control_char(c))
314		sprintf(buf, "%c", c);
315	else if (c == ESC)
316		sprintf(buf, "ESC");
317#if IS_EBCDIC_HOST
318	else if (!binary_char(c) && c < 64)
319		sprintf(buf, "^%c",
320		/*
321		 * This array roughly inverts CONTROL() #defined in less.h,
322	 	 * and should be kept in sync with CONTROL() and IBM-1047.
323 	 	 */
324		"@ABC.I.?...KLMNO"
325		"PQRS.JH.XY.."
326		"\\]^_"
327		"......W[.....EFG"
328		"..V....D....TU.Z"[c]);
329#else
330  	else if (c < 128 && !control_char(c ^ 0100))
331  		sprintf(buf, "^%c", c ^ 0100);
332#endif
333	else
334		sprintf(buf, binfmt, c);
335	return (buf);
336}
337