1/*
2 * Copyright (c) Ian F. Darwin 1986-1995.
3 * Software written by Ian F. Darwin and others;
4 * maintained 1995-present by Christos Zoulas and others.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice immediately at the beginning of the file, without modification,
11 *    this list of conditions, and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28/*
29 * Encoding -- determine the character encoding of a text file.
30 *
31 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
32 * international characters.
33 */
34
35#include "file.h"
36
37#ifndef	lint
38FILE_RCSID("@(#)$File: encoding.c,v 1.42 2022/12/26 17:31:14 christos Exp $")
39#endif	/* lint */
40
41#include "magic.h"
42#include <string.h>
43#include <stdlib.h>
44
45
46file_private int looks_ascii(const unsigned char *, size_t, file_unichar_t *,
47    size_t *);
48file_private int looks_utf8_with_BOM(const unsigned char *, size_t, file_unichar_t *,
49    size_t *);
50file_private int looks_utf7(const unsigned char *, size_t, file_unichar_t *,
51    size_t *);
52file_private int looks_ucs16(const unsigned char *, size_t, file_unichar_t *,
53    size_t *);
54file_private int looks_ucs32(const unsigned char *, size_t, file_unichar_t *,
55    size_t *);
56file_private int looks_latin1(const unsigned char *, size_t, file_unichar_t *,
57    size_t *);
58file_private int looks_extended(const unsigned char *, size_t, file_unichar_t *,
59    size_t *);
60file_private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
61
/*
 * Debug tracing.  Call as DPRINTF((fmt, ...)) -- note the double
 * parentheses: the whole argument list is pasted after printf when
 * DEBUG_ENCODING is defined, and the macro expands to nothing otherwise.
 */
#ifndef DEBUG_ENCODING
#define DPRINTF(a)
#else
#define DPRINTF(a) printf a
#endif
67
68/*
69 * Try to determine whether text is in some character code we can
70 * identify.  Each of these tests, if it succeeds, will leave
71 * the text converted into one-file_unichar_t-per-character Unicode in
72 * ubuf, and the number of characters converted in ulen.
73 */
74file_protected int
75file_encoding(struct magic_set *ms, const struct buffer *b,
76    file_unichar_t **ubuf, size_t *ulen, const char **code,
77    const char **code_mime, const char **type)
78{
79	const unsigned char *buf = CAST(const unsigned char *, b->fbuf);
80	size_t nbytes = b->flen;
81	size_t mlen;
82	int rv = 1, ucs_type;
83	file_unichar_t *udefbuf;
84	size_t udeflen;
85
86	if (ubuf == NULL)
87		ubuf = &udefbuf;
88	if (ulen == NULL)
89		ulen = &udeflen;
90
91	*type = "text";
92	*ulen = 0;
93	*code = "unknown";
94	*code_mime = "binary";
95
96	if (nbytes > ms->encoding_max)
97		nbytes = ms->encoding_max;
98
99	mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
100	*ubuf = CAST(file_unichar_t *, calloc(CAST(size_t, 1), mlen));
101	if (*ubuf == NULL) {
102		file_oomem(ms, mlen);
103		goto done;
104	}
105	if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
106		if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) {
107			DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen));
108			*code = "Unicode text, UTF-7";
109			*code_mime = "utf-7";
110		} else {
111			DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
112			*code = "ASCII";
113			*code_mime = "us-ascii";
114		}
115	} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
116		DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
117		*code = "Unicode text, UTF-8 (with BOM)";
118		*code_mime = "utf-8";
119	} else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
120		DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
121		*code = "Unicode text, UTF-8";
122		*code_mime = "utf-8";
123	} else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
124		if (ucs_type == 1) {
125			*code = "Unicode text, UTF-32, little-endian";
126			*code_mime = "utf-32le";
127		} else {
128			*code = "Unicode text, UTF-32, big-endian";
129			*code_mime = "utf-32be";
130		}
131		DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
132	} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
133		if (ucs_type == 1) {
134			*code = "Unicode text, UTF-16, little-endian";
135			*code_mime = "utf-16le";
136		} else {
137			*code = "Unicode text, UTF-16, big-endian";
138			*code_mime = "utf-16be";
139		}
140		DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen));
141	} else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
142		DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen));
143		*code = "ISO-8859";
144		*code_mime = "iso-8859-1";
145	} else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
146		DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen));
147		*code = "Non-ISO extended-ASCII";
148		*code_mime = "unknown-8bit";
149	} else {
150		unsigned char *nbuf;
151
152		mlen = (nbytes + 1) * sizeof(nbuf[0]);
153		if ((nbuf = CAST(unsigned char *, malloc(mlen))) == NULL) {
154			file_oomem(ms, mlen);
155			goto done;
156		}
157		from_ebcdic(buf, nbytes, nbuf);
158
159		if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
160			DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen));
161			*code = "EBCDIC";
162			*code_mime = "ebcdic";
163		} else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
164			DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n",
165			    *ulen));
166			*code = "International EBCDIC";
167			*code_mime = "ebcdic";
168		} else { /* Doesn't look like text at all */
169			DPRINTF(("binary\n"));
170			rv = 0;
171			*type = "binary";
172		}
173		free(nbuf);
174	}
175
176 done:
177	if (ubuf == &udefbuf)
178		free(udefbuf);
179
180	return rv;
181}
182
183/*
184 * This table reflects a particular philosophy about what constitutes
185 * "text," and there is room for disagreement about it.
186 *
187 * Version 3.31 of the file command considered a file to be ASCII if
188 * each of its characters was approved by either the isascii() or
189 * isalpha() function.  On most systems, this would mean that any
190 * file consisting only of characters in the range 0x00 ... 0x7F
191 * would be called ASCII text, but many systems might reasonably
192 * consider some characters outside this range to be alphabetic,
193 * so the file command would call such characters ASCII.  It might
194 * have been more accurate to call this "considered textual on the
195 * local system" than "ASCII."
196 *
197 * It considered a file to be "International language text" if each
198 * of its characters was either an ASCII printing character (according
199 * to the real ASCII standard, not the above test), a character in
200 * the range 0x80 ... 0xFF, or one of the following control characters:
201 * backspace, tab, line feed, vertical tab, form feed, carriage return,
202 * escape.  No attempt was made to determine the language in which files
203 * of this type were written.
204 *
205 *
206 * The table below considers a file to be ASCII if all of its characters
207 * are either ASCII printing characters (again, according to the X3.4
208 * standard, not isascii()) or any of the following controls: bell,
209 * backspace, tab, line feed, form feed, carriage return, esc, nextline.
210 *
211 * I include bell because some programs (particularly shell scripts)
212 * use it literally, even though it is rare in normal text.  I exclude
213 * vertical tab because it never seems to be used in real text.  I also
214 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
215 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
216 * character to.  It might be more appropriate to include it in the 8859
217 * set instead of the ASCII set, but it's got to be included in *something*
218 * we recognize or EBCDIC files aren't going to be considered textual.
219 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
220 * and Latin characters, so these should possibly be allowed.  But they
221 * make a real mess on VT100-style displays if they're not paired properly,
222 * so we are probably better off not calling them text.
223 *
224 * A file is considered to be ISO-8859 text if its characters are all
225 * either ASCII, according to the above definition, or printing characters
226 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
227 *
228 * Finally, a file is considered to be international text from some other
229 * character code if its characters are all either ISO-8859 (according to
230 * the above definition) or characters in the range 0x80 ... 0x9F, which
231 * ISO-8859 considers to be control characters but the IBM PC and Macintosh
232 * consider to be printing characters.
233 */
234
235#define F 0   /* character never appears in text */
236#define T 1   /* character appears in plain ASCII text */
237#define I 2   /* character appears in ISO-8859 text */
238#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
239
240file_private char text_chars[256] = {
241	/*                  BEL BS HT LF VT FF CR    */
242	F, F, F, F, F, F, F, T, T, T, T, T, T, T, F, F,  /* 0x0X */
243	/*                              ESC          */
244	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
245	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
246	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
247	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
248	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
249	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
250	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
251	/*            NEL                            */
252	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
253	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
254	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
255	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
256	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
257	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
258	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
259	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
260};
261
262#define LOOKS(NAME, COND) \
263file_private int \
264looks_ ## NAME(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, \
265    size_t *ulen) \
266{ \
267	size_t i; \
268\
269	*ulen = 0; \
270\
271	for (i = 0; i < nbytes; i++) { \
272		int t = text_chars[buf[i]]; \
273\
274		if (COND) \
275			return 0; \
276\
277		ubuf[(*ulen)++] = buf[i]; \
278	} \
279	return 1; \
280}
281
282LOOKS(ascii, t != T)
283LOOKS(latin1, t != T && t != I)
284LOOKS(extended, t != T && t != I && t != X)
285
286/*
287 * Decide whether some text looks like UTF-8. Returns:
288 *
289 *     -1: invalid UTF-8
290 *      0: uses odd control characters, so doesn't look like text
291 *      1: 7-bit text
292 *      2: definitely UTF-8 text (valid high-bit set bytes)
293 *
294 * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
295 * ubuf must be big enough!
296 */
297
// from: https://golang.org/src/unicode/utf8/utf8.go
//
// Each entry of first[] packs two fields into one byte: the high nibble is
// an index into accept_ranges[] and the low nibble is the total length of
// the sequence in bytes.

#define	XX 0xF1 // invalid: size 1
#define	AS 0xF0 // ASCII: size 1
#define	S1 0x02 // accept 0, size 2
#define	S2 0x13 // accept 1, size 3
#define	S3 0x03 // accept 0, size 3
#define	S4 0x23 // accept 2, size 3
#define	S5 0x34 // accept 3, size 4
#define	S6 0x04 // accept 0, size 4
#define	S7 0x44 // accept 4, size 4

#define LOCB 0x80
#define HICB 0xBF

// first is information about the first byte in a UTF-8 sequence.
static const uint8_t first[] = {
    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F
    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF
    XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF
    S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF
    S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF
    S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF
};

// acceptRange gives the range of valid values for the second byte in a UTF-8
// sequence.
//
// The table is read-only and private to this file, so it is declared
// static const (like first[] above) rather than as a writable global.
static const struct accept_range {
	uint8_t lo; // lowest value for second byte.
	uint8_t hi; // highest value for second byte.
} accept_ranges[16] = {
// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
// Entries past the fifth are zero-initialised and correspond to no valid
// lead byte.
	{ LOCB, HICB },
	{ 0xA0, HICB },
	{ LOCB, 0x9F },
	{ 0x90, HICB },
	{ LOCB, 0x8F },
};
348
349file_protected int
350file_looks_utf8(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf,
351    size_t *ulen)
352{
353	size_t i;
354	int n;
355	file_unichar_t c;
356	int gotone = 0, ctrl = 0;
357
358	if (ubuf)
359		*ulen = 0;
360
361	for (i = 0; i < nbytes; i++) {
362		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
363			/*
364			 * Even if the whole file is valid UTF-8 sequences,
365			 * still reject it if it uses weird control characters.
366			 */
367
368			if (text_chars[buf[i]] != T)
369				ctrl = 1;
370
371			if (ubuf)
372				ubuf[(*ulen)++] = buf[i];
373		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
374			return -1;
375		} else {			   /* 11xxxxxx begins UTF-8 */
376			int following;
377			uint8_t x = first[buf[i]];
378			const struct accept_range *ar =
379			    &accept_ranges[(unsigned int)x >> 4];
380			if (x == XX)
381				return -1;
382
383			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
384				c = buf[i] & 0x1f;
385				following = 1;
386			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
387				c = buf[i] & 0x0f;
388				following = 2;
389			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
390				c = buf[i] & 0x07;
391				following = 3;
392			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
393				c = buf[i] & 0x03;
394				following = 4;
395			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
396				c = buf[i] & 0x01;
397				following = 5;
398			} else
399				return -1;
400
401			for (n = 0; n < following; n++) {
402				i++;
403				if (i >= nbytes)
404					goto done;
405
406				if (n == 0 &&
407				     (buf[i] < ar->lo || buf[i] > ar->hi))
408					return -1;
409
410				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
411					return -1;
412
413				c = (c << 6) + (buf[i] & 0x3f);
414			}
415
416			if (ubuf)
417				ubuf[(*ulen)++] = c;
418			gotone = 1;
419		}
420	}
421done:
422	return ctrl ? 0 : (gotone ? 2 : 1);
423}
424
425/*
426 * Decide whether some text looks like UTF-8 with BOM. If there is no
427 * BOM, return -1; otherwise return the result of looks_utf8 on the
428 * rest of the text.
429 */
430file_private int
431looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes,
432    file_unichar_t *ubuf, size_t *ulen)
433{
434	if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
435		return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
436	else
437		return -1;
438}
439
440file_private int
441looks_utf7(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf,
442    size_t *ulen)
443{
444	if (nbytes > 4 && buf[0] == '+' && buf[1] == '/' && buf[2] == 'v')
445		switch (buf[3]) {
446		case '8':
447		case '9':
448		case '+':
449		case '/':
450			if (ubuf)
451				*ulen = 0;
452			return 1;
453		default:
454			return -1;
455		}
456	else
457		return -1;
458}
459
460#define UCS16_NOCHAR(c) ((c) >= 0xfdd0 && (c) <= 0xfdef)
461#define UCS16_HISURR(c) ((c) >= 0xd800 && (c) <= 0xdbff)
462#define UCS16_LOSURR(c) ((c) >= 0xdc00 && (c) <= 0xdfff)
463
464file_private int
465looks_ucs16(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf,
466    size_t *ulen)
467{
468	int bigend;
469	uint32_t hi;
470	size_t i;
471
472	if (nbytes < 2)
473		return 0;
474
475	if (bf[0] == 0xff && bf[1] == 0xfe)
476		bigend = 0;
477	else if (bf[0] == 0xfe && bf[1] == 0xff)
478		bigend = 1;
479	else
480		return 0;
481
482	*ulen = 0;
483	hi = 0;
484
485	for (i = 2; i + 1 < nbytes; i += 2) {
486		uint32_t uc;
487
488		if (bigend)
489			uc = CAST(uint32_t,
490			    bf[i + 1] | (CAST(file_unichar_t, bf[i]) << 8));
491		else
492			uc = CAST(uint32_t,
493			    bf[i] | (CAST(file_unichar_t, bf[i + 1]) << 8));
494
495		uc &= 0xffff;
496
497		switch (uc) {
498		case 0xfffe:
499		case 0xffff:
500			return 0;
501		default:
502			if (UCS16_NOCHAR(uc))
503				return 0;
504			break;
505		}
506		if (hi) {
507			if (!UCS16_LOSURR(uc))
508				return 0;
509			uc = 0x10000 + 0x400 * (hi - 1) + (uc - 0xdc00);
510			hi = 0;
511		}
512		if (uc < 128 && text_chars[CAST(size_t, uc)] != T)
513			return 0;
514		ubf[(*ulen)++] = uc;
515		if (UCS16_HISURR(uc))
516			hi = uc - 0xd800 + 1;
517		if (UCS16_LOSURR(uc))
518			return 0;
519	}
520
521	return 1 + bigend;
522}
523
524file_private int
525looks_ucs32(const unsigned char *bf, size_t nbytes, file_unichar_t *ubf,
526    size_t *ulen)
527{
528	int bigend;
529	size_t i;
530
531	if (nbytes < 4)
532		return 0;
533
534	if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
535		bigend = 0;
536	else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
537		bigend = 1;
538	else
539		return 0;
540
541	*ulen = 0;
542
543	for (i = 4; i + 3 < nbytes; i += 4) {
544		/* XXX fix to properly handle chars > 65536 */
545
546		if (bigend)
547			ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 3])
548			    | (CAST(file_unichar_t, bf[i + 2]) << 8)
549			    | (CAST(file_unichar_t, bf[i + 1]) << 16)
550			    | (CAST(file_unichar_t, bf[i]) << 24);
551		else
552			ubf[(*ulen)++] = CAST(file_unichar_t, bf[i + 0])
553			    | (CAST(file_unichar_t, bf[i + 1]) << 8)
554			    | (CAST(file_unichar_t, bf[i + 2]) << 16)
555			    | (CAST(file_unichar_t, bf[i + 3]) << 24);
556
557		if (ubf[*ulen - 1] == 0xfffe)
558			return 0;
559		if (ubf[*ulen - 1] < 128 &&
560		    text_chars[CAST(size_t, ubf[*ulen - 1])] != T)
561			return 0;
562	}
563
564	return 1 + bigend;
565}
566#undef F
567#undef T
568#undef I
569#undef X
570
571/*
572 * This table maps each EBCDIC character to an (8-bit extended) ASCII
573 * character, as specified in the rationale for the dd(1) command in
574 * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
575 *
576 * Unfortunately it does not seem to correspond exactly to any of the
577 * five variants of EBCDIC documented in IBM's _Enterprise Systems
578 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
579 * Edition, July, 1999, pp. I-1 - I-4.
580 *
581 * Fortunately, though, all versions of EBCDIC, including this one, agree
582 * on most of the printing characters that also appear in (7-bit) ASCII.
583 * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
584 *
585 * Fortunately too, there is general agreement that codes 0x00 through
586 * 0x3F represent control characters, 0x41 a nonbreaking space, and the
587 * remainder printing characters.
588 *
589 * This is sufficient to allow us to identify EBCDIC text and to distinguish
590 * between old-style and internationalized examples of text.
591 */
592
593file_private unsigned char ebcdic_to_ascii[] = {
594  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
595 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
596128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
597144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
598' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
599'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
600'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
601186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
602195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
603202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
604209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
605216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
606'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
607'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
608'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
609'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
610};
611
612#ifdef notdef
613/*
614 * The following EBCDIC-to-ASCII table may relate more closely to reality,
615 * or at least to modern reality.  It comes from
616 *
617 *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
618 *
619 * and maps the characters of EBCDIC code page 1047 (the code used for
620 * Unix-derived software on IBM's 390 systems) to the corresponding
621 * characters from ISO 8859-1.
622 *
623 * If this table is used instead of the above one, some of the special
624 * cases for the NEL character can be taken out of the code.
625 */
626
627file_private unsigned char ebcdic_1047_to_8859[] = {
6280x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
6290x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
6300x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
6310x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
6320x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
6330x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
6340x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
6350xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
6360xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
6370xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
6380xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
6390xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
6400x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
6410x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
6420x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
6430x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
644};
645#endif
646
647/*
648 * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
649 */
650file_private void
651from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
652{
653	size_t i;
654
655	for (i = 0; i < nbytes; i++) {
656		out[i] = ebcdic_to_ascii[buf[i]];
657	}
658}
659