1191739Sobrien/*
2191739Sobrien * Copyright (c) Ian F. Darwin 1986-1995.
3191739Sobrien * Software written by Ian F. Darwin and others;
4191739Sobrien * maintained 1995-present by Christos Zoulas and others.
5191739Sobrien *
6191739Sobrien * Redistribution and use in source and binary forms, with or without
7191739Sobrien * modification, are permitted provided that the following conditions
8191739Sobrien * are met:
9191739Sobrien * 1. Redistributions of source code must retain the above copyright
10191739Sobrien *    notice immediately at the beginning of the file, without modification,
11191739Sobrien *    this list of conditions, and the following disclaimer.
12191739Sobrien * 2. Redistributions in binary form must reproduce the above copyright
13191739Sobrien *    notice, this list of conditions and the following disclaimer in the
14191739Sobrien *    documentation and/or other materials provided with the distribution.
15191739Sobrien *
16191739Sobrien * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17191739Sobrien * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18191739Sobrien * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19191739Sobrien * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
20191739Sobrien * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21191739Sobrien * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22191739Sobrien * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23191739Sobrien * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24191739Sobrien * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25191739Sobrien * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26191739Sobrien * SUCH DAMAGE.
27191739Sobrien */
28191739Sobrien/*
29191739Sobrien * Encoding -- determine the character encoding of a text file.
30191739Sobrien *
31191739Sobrien * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
32191739Sobrien * international characters.
33191739Sobrien */
34191739Sobrien
35191739Sobrien#include "file.h"
36191739Sobrien
37191739Sobrien#ifndef	lint
38234449SobrienFILE_RCSID("@(#)$File: encoding.c,v 1.7 2012/01/24 19:02:02 christos Exp $")
39191739Sobrien#endif	/* lint */
40191739Sobrien
41191739Sobrien#include "magic.h"
42191739Sobrien#include <string.h>
43191739Sobrien#include <memory.h>
44191739Sobrien#include <stdlib.h>
45191739Sobrien
46191739Sobrien
47191739Sobrienprivate int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
48191739Sobrienprivate int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
49191739Sobrien    size_t *);
50191739Sobrienprivate int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
51191739Sobrienprivate int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
52191739Sobrienprivate int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
53191739Sobrienprivate void from_ebcdic(const unsigned char *, size_t, unsigned char *);
54191739Sobrien
55234449Sobrien#ifdef DEBUG_ENCODING
56234449Sobrien#define DPRINTF(a) printf a
57234449Sobrien#else
58234449Sobrien#define DPRINTF(a)
59234449Sobrien#endif
60234449Sobrien
61191739Sobrien/*
62191739Sobrien * Try to determine whether text is in some character code we can
63191739Sobrien * identify.  Each of these tests, if it succeeds, will leave
64191739Sobrien * the text converted into one-unichar-per-character Unicode in
65191739Sobrien * ubuf, and the number of characters converted in ulen.
66191739Sobrien */
67191739Sobrienprotected int
68191739Sobrienfile_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type)
69191739Sobrien{
70191739Sobrien	size_t mlen;
71191739Sobrien	int rv = 1, ucs_type;
72191739Sobrien	unsigned char *nbuf = NULL;
73191739Sobrien
74234449Sobrien	*type = "text";
75191739Sobrien	mlen = (nbytes + 1) * sizeof(nbuf[0]);
76191739Sobrien	if ((nbuf = CAST(unsigned char *, calloc((size_t)1, mlen))) == NULL) {
77191739Sobrien		file_oomem(ms, mlen);
78191739Sobrien		goto done;
79191739Sobrien	}
80191739Sobrien	mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
81191739Sobrien	if ((*ubuf = CAST(unichar *, calloc((size_t)1, mlen))) == NULL) {
82191739Sobrien		file_oomem(ms, mlen);
83191739Sobrien		goto done;
84191739Sobrien	}
85191739Sobrien
86191739Sobrien	if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
87234449Sobrien		DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
88191739Sobrien		*code = "ASCII";
89191739Sobrien		*code_mime = "us-ascii";
90191739Sobrien	} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
91234449Sobrien		DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
92191739Sobrien		*code = "UTF-8 Unicode (with BOM)";
93191739Sobrien		*code_mime = "utf-8";
94191739Sobrien	} else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
95234449Sobrien		DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
96234449Sobrien		*code = "UTF-8 Unicode (with BOM)";
97191739Sobrien		*code = "UTF-8 Unicode";
98191739Sobrien		*code_mime = "utf-8";
99191739Sobrien	} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
100191739Sobrien		if (ucs_type == 1) {
101191739Sobrien			*code = "Little-endian UTF-16 Unicode";
102191739Sobrien			*code_mime = "utf-16le";
103191739Sobrien		} else {
104191739Sobrien			*code = "Big-endian UTF-16 Unicode";
105191739Sobrien			*code_mime = "utf-16be";
106191739Sobrien		}
107234449Sobrien		DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen));
108191739Sobrien	} else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
109234449Sobrien		DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen));
110191739Sobrien		*code = "ISO-8859";
111191739Sobrien		*code_mime = "iso-8859-1";
112191739Sobrien	} else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
113234449Sobrien		DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen));
114191739Sobrien		*code = "Non-ISO extended-ASCII";
115191739Sobrien		*code_mime = "unknown-8bit";
116191739Sobrien	} else {
117191739Sobrien		from_ebcdic(buf, nbytes, nbuf);
118191739Sobrien
119191739Sobrien		if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
120234449Sobrien			DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen));
121191739Sobrien			*code = "EBCDIC";
122191739Sobrien			*code_mime = "ebcdic";
123191739Sobrien		} else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
124234449Sobrien			DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n",
125234449Sobrien			    *ulen));
126191739Sobrien			*code = "International EBCDIC";
127191739Sobrien			*code_mime = "ebcdic";
128191739Sobrien		} else { /* Doesn't look like text at all */
129234449Sobrien			DPRINTF(("binary\n"));
130191739Sobrien			rv = 0;
131191739Sobrien			*type = "binary";
132191739Sobrien		}
133191739Sobrien	}
134191739Sobrien
135191739Sobrien done:
136234449Sobrien	free(nbuf);
137191739Sobrien
138191739Sobrien	return rv;
139191739Sobrien}
140191739Sobrien
141191739Sobrien/*
142191739Sobrien * This table reflects a particular philosophy about what constitutes
143191739Sobrien * "text," and there is room for disagreement about it.
144191739Sobrien *
145191739Sobrien * Version 3.31 of the file command considered a file to be ASCII if
146191739Sobrien * each of its characters was approved by either the isascii() or
147191739Sobrien * isalpha() function.  On most systems, this would mean that any
148191739Sobrien * file consisting only of characters in the range 0x00 ... 0x7F
149191739Sobrien * would be called ASCII text, but many systems might reasonably
150191739Sobrien * consider some characters outside this range to be alphabetic,
151191739Sobrien * so the file command would call such characters ASCII.  It might
152191739Sobrien * have been more accurate to call this "considered textual on the
153191739Sobrien * local system" than "ASCII."
154191739Sobrien *
155191739Sobrien * It considered a file to be "International language text" if each
156191739Sobrien * of its characters was either an ASCII printing character (according
157191739Sobrien * to the real ASCII standard, not the above test), a character in
158191739Sobrien * the range 0x80 ... 0xFF, or one of the following control characters:
159191739Sobrien * backspace, tab, line feed, vertical tab, form feed, carriage return,
160191739Sobrien * escape.  No attempt was made to determine the language in which files
161191739Sobrien * of this type were written.
162191739Sobrien *
163191739Sobrien *
164191739Sobrien * The table below considers a file to be ASCII if all of its characters
165191739Sobrien * are either ASCII printing characters (again, according to the X3.4
166191739Sobrien * standard, not isascii()) or any of the following controls: bell,
167191739Sobrien * backspace, tab, line feed, form feed, carriage return, esc, nextline.
168191739Sobrien *
169191739Sobrien * I include bell because some programs (particularly shell scripts)
170191739Sobrien * use it literally, even though it is rare in normal text.  I exclude
171191739Sobrien * vertical tab because it never seems to be used in real text.  I also
172191739Sobrien * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
173191739Sobrien * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
174191739Sobrien * character to.  It might be more appropriate to include it in the 8859
175191739Sobrien * set instead of the ASCII set, but it's got to be included in *something*
176191739Sobrien * we recognize or EBCDIC files aren't going to be considered textual.
177191739Sobrien * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
178191739Sobrien * and Latin characters, so these should possibly be allowed.  But they
179191739Sobrien * make a real mess on VT100-style displays if they're not paired properly,
180191739Sobrien * so we are probably better off not calling them text.
181191739Sobrien *
182191739Sobrien * A file is considered to be ISO-8859 text if its characters are all
183191739Sobrien * either ASCII, according to the above definition, or printing characters
184191739Sobrien * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
185191739Sobrien *
186191739Sobrien * Finally, a file is considered to be international text from some other
187191739Sobrien * character code if its characters are all either ISO-8859 (according to
188191739Sobrien * the above definition) or characters in the range 0x80 ... 0x9F, which
189191739Sobrien * ISO-8859 considers to be control characters but the IBM PC and Macintosh
190191739Sobrien * consider to be printing characters.
191191739Sobrien */
192191739Sobrien
193191739Sobrien#define F 0   /* character never appears in text */
194191739Sobrien#define T 1   /* character appears in plain ASCII text */
195191739Sobrien#define I 2   /* character appears in ISO-8859 text */
196191739Sobrien#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
197191739Sobrien
198191739Sobrienprivate char text_chars[256] = {
199191739Sobrien	/*                  BEL BS HT LF    FF CR    */
200191739Sobrien	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
201191739Sobrien	/*                              ESC          */
202191739Sobrien	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
203191739Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
204191739Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
205191739Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
206191739Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
207191739Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
208191739Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
209191739Sobrien	/*            NEL                            */
210191739Sobrien	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
211191739Sobrien	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
212191739Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
213191739Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
214191739Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
215191739Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
216191739Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
217191739Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
218191739Sobrien};
219191739Sobrien
220191739Sobrienprivate int
221191739Sobrienlooks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
222191739Sobrien    size_t *ulen)
223191739Sobrien{
224191739Sobrien	size_t i;
225191739Sobrien
226191739Sobrien	*ulen = 0;
227191739Sobrien
228191739Sobrien	for (i = 0; i < nbytes; i++) {
229191739Sobrien		int t = text_chars[buf[i]];
230191739Sobrien
231191739Sobrien		if (t != T)
232191739Sobrien			return 0;
233191739Sobrien
234191739Sobrien		ubuf[(*ulen)++] = buf[i];
235191739Sobrien	}
236191739Sobrien
237191739Sobrien	return 1;
238191739Sobrien}
239191739Sobrien
240191739Sobrienprivate int
241191739Sobrienlooks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
242191739Sobrien{
243191739Sobrien	size_t i;
244191739Sobrien
245191739Sobrien	*ulen = 0;
246191739Sobrien
247191739Sobrien	for (i = 0; i < nbytes; i++) {
248191739Sobrien		int t = text_chars[buf[i]];
249191739Sobrien
250191739Sobrien		if (t != T && t != I)
251191739Sobrien			return 0;
252191739Sobrien
253191739Sobrien		ubuf[(*ulen)++] = buf[i];
254191739Sobrien	}
255191739Sobrien
256191739Sobrien	return 1;
257191739Sobrien}
258191739Sobrien
259191739Sobrienprivate int
260191739Sobrienlooks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
261191739Sobrien    size_t *ulen)
262191739Sobrien{
263191739Sobrien	size_t i;
264191739Sobrien
265191739Sobrien	*ulen = 0;
266191739Sobrien
267191739Sobrien	for (i = 0; i < nbytes; i++) {
268191739Sobrien		int t = text_chars[buf[i]];
269191739Sobrien
270191739Sobrien		if (t != T && t != I && t != X)
271191739Sobrien			return 0;
272191739Sobrien
273191739Sobrien		ubuf[(*ulen)++] = buf[i];
274191739Sobrien	}
275191739Sobrien
276191739Sobrien	return 1;
277191739Sobrien}
278191739Sobrien
279191739Sobrien/*
280191739Sobrien * Decide whether some text looks like UTF-8. Returns:
281191739Sobrien *
282191739Sobrien *     -1: invalid UTF-8
283191739Sobrien *      0: uses odd control characters, so doesn't look like text
284191739Sobrien *      1: 7-bit text
285191739Sobrien *      2: definitely UTF-8 text (valid high-bit set bytes)
286191739Sobrien *
287191739Sobrien * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
288191739Sobrien * ubuf must be big enough!
289191739Sobrien */
290191739Sobrienprotected int
291191739Sobrienfile_looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
292191739Sobrien{
293191739Sobrien	size_t i;
294191739Sobrien	int n;
295191739Sobrien	unichar c;
296191739Sobrien	int gotone = 0, ctrl = 0;
297191739Sobrien
298191739Sobrien	if (ubuf)
299191739Sobrien		*ulen = 0;
300191739Sobrien
301191739Sobrien	for (i = 0; i < nbytes; i++) {
302191739Sobrien		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
303191739Sobrien			/*
304191739Sobrien			 * Even if the whole file is valid UTF-8 sequences,
305191739Sobrien			 * still reject it if it uses weird control characters.
306191739Sobrien			 */
307191739Sobrien
308191739Sobrien			if (text_chars[buf[i]] != T)
309191739Sobrien				ctrl = 1;
310191739Sobrien
311191739Sobrien			if (ubuf)
312191739Sobrien				ubuf[(*ulen)++] = buf[i];
313191739Sobrien		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
314191739Sobrien			return -1;
315191739Sobrien		} else {			   /* 11xxxxxx begins UTF-8 */
316191739Sobrien			int following;
317191739Sobrien
318191739Sobrien			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
319191739Sobrien				c = buf[i] & 0x1f;
320191739Sobrien				following = 1;
321191739Sobrien			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
322191739Sobrien				c = buf[i] & 0x0f;
323191739Sobrien				following = 2;
324191739Sobrien			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
325191739Sobrien				c = buf[i] & 0x07;
326191739Sobrien				following = 3;
327191739Sobrien			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
328191739Sobrien				c = buf[i] & 0x03;
329191739Sobrien				following = 4;
330191739Sobrien			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
331191739Sobrien				c = buf[i] & 0x01;
332191739Sobrien				following = 5;
333191739Sobrien			} else
334191739Sobrien				return -1;
335191739Sobrien
336191739Sobrien			for (n = 0; n < following; n++) {
337191739Sobrien				i++;
338191739Sobrien				if (i >= nbytes)
339191739Sobrien					goto done;
340191739Sobrien
341191739Sobrien				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
342191739Sobrien					return -1;
343191739Sobrien
344191739Sobrien				c = (c << 6) + (buf[i] & 0x3f);
345191739Sobrien			}
346191739Sobrien
347191739Sobrien			if (ubuf)
348191739Sobrien				ubuf[(*ulen)++] = c;
349191739Sobrien			gotone = 1;
350191739Sobrien		}
351191739Sobrien	}
352191739Sobriendone:
353191739Sobrien	return ctrl ? 0 : (gotone ? 2 : 1);
354191739Sobrien}
355191739Sobrien
356191739Sobrien/*
357191739Sobrien * Decide whether some text looks like UTF-8 with BOM. If there is no
358191739Sobrien * BOM, return -1; otherwise return the result of looks_utf8 on the
359191739Sobrien * rest of the text.
360191739Sobrien */
361191739Sobrienprivate int
362191739Sobrienlooks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf,
363191739Sobrien    size_t *ulen)
364191739Sobrien{
365191739Sobrien	if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
366191739Sobrien		return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
367191739Sobrien	else
368191739Sobrien		return -1;
369191739Sobrien}
370191739Sobrien
371191739Sobrienprivate int
372191739Sobrienlooks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
373191739Sobrien    size_t *ulen)
374191739Sobrien{
375191739Sobrien	int bigend;
376191739Sobrien	size_t i;
377191739Sobrien
378191739Sobrien	if (nbytes < 2)
379191739Sobrien		return 0;
380191739Sobrien
381191739Sobrien	if (buf[0] == 0xff && buf[1] == 0xfe)
382191739Sobrien		bigend = 0;
383191739Sobrien	else if (buf[0] == 0xfe && buf[1] == 0xff)
384191739Sobrien		bigend = 1;
385191739Sobrien	else
386191739Sobrien		return 0;
387191739Sobrien
388191739Sobrien	*ulen = 0;
389191739Sobrien
390191739Sobrien	for (i = 2; i + 1 < nbytes; i += 2) {
391191739Sobrien		/* XXX fix to properly handle chars > 65536 */
392191739Sobrien
393191739Sobrien		if (bigend)
394191739Sobrien			ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
395191739Sobrien		else
396191739Sobrien			ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
397191739Sobrien
398191739Sobrien		if (ubuf[*ulen - 1] == 0xfffe)
399191739Sobrien			return 0;
400191739Sobrien		if (ubuf[*ulen - 1] < 128 &&
401191739Sobrien		    text_chars[(size_t)ubuf[*ulen - 1]] != T)
402191739Sobrien			return 0;
403191739Sobrien	}
404191739Sobrien
405191739Sobrien	return 1 + bigend;
406191739Sobrien}
407191739Sobrien
408191739Sobrien#undef F
409191739Sobrien#undef T
410191739Sobrien#undef I
411191739Sobrien#undef X
412191739Sobrien
413191739Sobrien/*
414191739Sobrien * This table maps each EBCDIC character to an (8-bit extended) ASCII
415191739Sobrien * character, as specified in the rationale for the dd(1) command in
416191739Sobrien * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
417191739Sobrien *
418191739Sobrien * Unfortunately it does not seem to correspond exactly to any of the
419191739Sobrien * five variants of EBCDIC documented in IBM's _Enterprise Systems
420191739Sobrien * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
421191739Sobrien * Edition, July, 1999, pp. I-1 - I-4.
422191739Sobrien *
423191739Sobrien * Fortunately, though, all versions of EBCDIC, including this one, agree
424191739Sobrien * on most of the printing characters that also appear in (7-bit) ASCII.
425191739Sobrien * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
426191739Sobrien *
427191739Sobrien * Fortunately too, there is general agreement that codes 0x00 through
428191739Sobrien * 0x3F represent control characters, 0x41 a nonbreaking space, and the
429191739Sobrien * remainder printing characters.
430191739Sobrien *
431191739Sobrien * This is sufficient to allow us to identify EBCDIC text and to distinguish
432191739Sobrien * between old-style and internationalized examples of text.
433191739Sobrien */
434191739Sobrien
435191739Sobrienprivate unsigned char ebcdic_to_ascii[] = {
436191739Sobrien  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
437191739Sobrien 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
438191739Sobrien128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
439191739Sobrien144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
440191739Sobrien' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
441191739Sobrien'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
442191739Sobrien'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
443191739Sobrien186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
444191739Sobrien195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
445191739Sobrien202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
446191739Sobrien209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
447191739Sobrien216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
448191739Sobrien'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
449191739Sobrien'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
450191739Sobrien'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
451191739Sobrien'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
452191739Sobrien};
453191739Sobrien
454191739Sobrien#ifdef notdef
455191739Sobrien/*
456191739Sobrien * The following EBCDIC-to-ASCII table may relate more closely to reality,
457191739Sobrien * or at least to modern reality.  It comes from
458191739Sobrien *
459191739Sobrien *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
460191739Sobrien *
461191739Sobrien * and maps the characters of EBCDIC code page 1047 (the code used for
462191739Sobrien * Unix-derived software on IBM's 390 systems) to the corresponding
463191739Sobrien * characters from ISO 8859-1.
464191739Sobrien *
465191739Sobrien * If this table is used instead of the above one, some of the special
466191739Sobrien * cases for the NEL character can be taken out of the code.
467191739Sobrien */
468191739Sobrien
/*
 * Alternative translation table, indexed by EBCDIC byte value.  It sits
 * inside "#ifdef notdef" and nothing in this file refers to it.
 */
private unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
487191739Sobrien#endif
488191739Sobrien
489191739Sobrien/*
490191739Sobrien * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
491191739Sobrien */
492191739Sobrienprivate void
493191739Sobrienfrom_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
494191739Sobrien{
495191739Sobrien	size_t i;
496191739Sobrien
497191739Sobrien	for (i = 0; i < nbytes; i++) {
498191739Sobrien		out[i] = ebcdic_to_ascii[buf[i]];
499191739Sobrien	}
500191739Sobrien}
501