1191739Sobrien/*
2191739Sobrien * Copyright (c) Ian F. Darwin 1986-1995.
3191739Sobrien * Software written by Ian F. Darwin and others;
4191739Sobrien * maintained 1995-present by Christos Zoulas and others.
5191739Sobrien *
6191739Sobrien * Redistribution and use in source and binary forms, with or without
7191739Sobrien * modification, are permitted provided that the following conditions
8191739Sobrien * are met:
9191739Sobrien * 1. Redistributions of source code must retain the above copyright
10191739Sobrien *    notice immediately at the beginning of the file, without modification,
11191739Sobrien *    this list of conditions, and the following disclaimer.
12191739Sobrien * 2. Redistributions in binary form must reproduce the above copyright
13191739Sobrien *    notice, this list of conditions and the following disclaimer in the
14191739Sobrien *    documentation and/or other materials provided with the distribution.
15191739Sobrien *
16191739Sobrien * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17191739Sobrien * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18191739Sobrien * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19191739Sobrien * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
20191739Sobrien * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21191739Sobrien * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22191739Sobrien * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23191739Sobrien * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24191739Sobrien * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25191739Sobrien * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26191739Sobrien * SUCH DAMAGE.
27191739Sobrien */
28191739Sobrien/*
29191739Sobrien * Encoding -- determine the character encoding of a text file.
30191739Sobrien *
31191739Sobrien * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
32191739Sobrien * international characters.
33191739Sobrien */
34191739Sobrien
35191739Sobrien#include "file.h"
36191739Sobrien
37191739Sobrien#ifndef	lint
38284778SdelphijFILE_RCSID("@(#)$File: encoding.c,v 1.13 2015/06/04 19:16:28 christos Exp $")
39191739Sobrien#endif	/* lint */
40191739Sobrien
41191739Sobrien#include "magic.h"
42191739Sobrien#include <string.h>
43191739Sobrien#include <memory.h>
44191739Sobrien#include <stdlib.h>
45191739Sobrien
46191739Sobrien
47191739Sobrienprivate int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
48191739Sobrienprivate int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
49191739Sobrien    size_t *);
50284778Sdelphijprivate int looks_utf7(const unsigned char *, size_t, unichar *, size_t *);
51191739Sobrienprivate int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
52191739Sobrienprivate int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
53191739Sobrienprivate int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
54191739Sobrienprivate void from_ebcdic(const unsigned char *, size_t, unsigned char *);
55191739Sobrien
56226048Sobrien#ifdef DEBUG_ENCODING
57226048Sobrien#define DPRINTF(a) printf a
58226048Sobrien#else
59226048Sobrien#define DPRINTF(a)
60226048Sobrien#endif
61226048Sobrien
62191739Sobrien/*
63191739Sobrien * Try to determine whether text is in some character code we can
64191739Sobrien * identify.  Each of these tests, if it succeeds, will leave
65191739Sobrien * the text converted into one-unichar-per-character Unicode in
66191739Sobrien * ubuf, and the number of characters converted in ulen.
67191739Sobrien */
68191739Sobrienprotected int
69191739Sobrienfile_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type)
70191739Sobrien{
71191739Sobrien	size_t mlen;
72191739Sobrien	int rv = 1, ucs_type;
73191739Sobrien	unsigned char *nbuf = NULL;
74191739Sobrien
75234250Sobrien	*type = "text";
76267843Sdelphij	*ulen = 0;
77267843Sdelphij	*code = "unknown";
78267843Sdelphij	*code_mime = "binary";
79267843Sdelphij
80267843Sdelphij	mlen = (nbytes + 1) * sizeof((*ubuf)[0]);
81267843Sdelphij	if ((*ubuf = CAST(unichar *, calloc((size_t)1, mlen))) == NULL) {
82191739Sobrien		file_oomem(ms, mlen);
83191739Sobrien		goto done;
84191739Sobrien	}
85267843Sdelphij	mlen = (nbytes + 1) * sizeof(nbuf[0]);
86267843Sdelphij	if ((nbuf = CAST(unsigned char *, calloc((size_t)1, mlen))) == NULL) {
87191739Sobrien		file_oomem(ms, mlen);
88191739Sobrien		goto done;
89191739Sobrien	}
90191739Sobrien
91191739Sobrien	if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
92284778Sdelphij		if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) {
93284778Sdelphij			DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen));
94284778Sdelphij			*code = "UTF-7 Unicode";
95284778Sdelphij			*code_mime = "utf-7";
96284778Sdelphij		} else {
97284778Sdelphij			DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
98284778Sdelphij			*code = "ASCII";
99284778Sdelphij			*code_mime = "us-ascii";
100284778Sdelphij		}
101191739Sobrien	} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
102226048Sobrien		DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
103191739Sobrien		*code = "UTF-8 Unicode (with BOM)";
104191739Sobrien		*code_mime = "utf-8";
105191739Sobrien	} else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
106226048Sobrien		DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
107191739Sobrien		*code = "UTF-8 Unicode";
108191739Sobrien		*code_mime = "utf-8";
109191739Sobrien	} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
110191739Sobrien		if (ucs_type == 1) {
111191739Sobrien			*code = "Little-endian UTF-16 Unicode";
112191739Sobrien			*code_mime = "utf-16le";
113191739Sobrien		} else {
114191739Sobrien			*code = "Big-endian UTF-16 Unicode";
115191739Sobrien			*code_mime = "utf-16be";
116191739Sobrien		}
117226048Sobrien		DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen));
118191739Sobrien	} else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
119226048Sobrien		DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen));
120191739Sobrien		*code = "ISO-8859";
121191739Sobrien		*code_mime = "iso-8859-1";
122191739Sobrien	} else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
123226048Sobrien		DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen));
124191739Sobrien		*code = "Non-ISO extended-ASCII";
125191739Sobrien		*code_mime = "unknown-8bit";
126191739Sobrien	} else {
127191739Sobrien		from_ebcdic(buf, nbytes, nbuf);
128191739Sobrien
129191739Sobrien		if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
130226048Sobrien			DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen));
131191739Sobrien			*code = "EBCDIC";
132191739Sobrien			*code_mime = "ebcdic";
133191739Sobrien		} else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
134226048Sobrien			DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n",
135226048Sobrien			    *ulen));
136191739Sobrien			*code = "International EBCDIC";
137191739Sobrien			*code_mime = "ebcdic";
138191739Sobrien		} else { /* Doesn't look like text at all */
139226048Sobrien			DPRINTF(("binary\n"));
140191739Sobrien			rv = 0;
141191739Sobrien			*type = "binary";
142191739Sobrien		}
143191739Sobrien	}
144191739Sobrien
145191739Sobrien done:
146234250Sobrien	free(nbuf);
147191739Sobrien
148191739Sobrien	return rv;
149191739Sobrien}
150191739Sobrien
151191739Sobrien/*
152191739Sobrien * This table reflects a particular philosophy about what constitutes
153191739Sobrien * "text," and there is room for disagreement about it.
154191739Sobrien *
155191739Sobrien * Version 3.31 of the file command considered a file to be ASCII if
156191739Sobrien * each of its characters was approved by either the isascii() or
157191739Sobrien * isalpha() function.  On most systems, this would mean that any
158191739Sobrien * file consisting only of characters in the range 0x00 ... 0x7F
159191739Sobrien * would be called ASCII text, but many systems might reasonably
160191739Sobrien * consider some characters outside this range to be alphabetic,
161191739Sobrien * so the file command would call such characters ASCII.  It might
162191739Sobrien * have been more accurate to call this "considered textual on the
163191739Sobrien * local system" than "ASCII."
164191739Sobrien *
165191739Sobrien * It considered a file to be "International language text" if each
166191739Sobrien * of its characters was either an ASCII printing character (according
167191739Sobrien * to the real ASCII standard, not the above test), a character in
168191739Sobrien * the range 0x80 ... 0xFF, or one of the following control characters:
169191739Sobrien * backspace, tab, line feed, vertical tab, form feed, carriage return,
170191739Sobrien * escape.  No attempt was made to determine the language in which files
171191739Sobrien * of this type were written.
172191739Sobrien *
173191739Sobrien *
 * The table below considers a file to be ASCII if all of its characters
 * are either ASCII printing characters (again, according to the X3.4
 * standard, not isascii()) or any of the following controls: bell,
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * esc, nextline.
 *
 * I include bell because some programs (particularly shell scripts)
 * use it literally, even though it is rare in normal text.  Vertical
 * tab is rare in real text too, but the table accepts it as well.  I also
182191739Sobrien * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
183191739Sobrien * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
184191739Sobrien * character to.  It might be more appropriate to include it in the 8859
185191739Sobrien * set instead of the ASCII set, but it's got to be included in *something*
186191739Sobrien * we recognize or EBCDIC files aren't going to be considered textual.
187191739Sobrien * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
188191739Sobrien * and Latin characters, so these should possibly be allowed.  But they
189191739Sobrien * make a real mess on VT100-style displays if they're not paired properly,
190191739Sobrien * so we are probably better off not calling them text.
191191739Sobrien *
192191739Sobrien * A file is considered to be ISO-8859 text if its characters are all
193191739Sobrien * either ASCII, according to the above definition, or printing characters
194191739Sobrien * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
195191739Sobrien *
196191739Sobrien * Finally, a file is considered to be international text from some other
197191739Sobrien * character code if its characters are all either ISO-8859 (according to
198191739Sobrien * the above definition) or characters in the range 0x80 ... 0x9F, which
199191739Sobrien * ISO-8859 considers to be control characters but the IBM PC and Macintosh
200191739Sobrien * consider to be printing characters.
201191739Sobrien */
202191739Sobrien
203191739Sobrien#define F 0   /* character never appears in text */
204191739Sobrien#define T 1   /* character appears in plain ASCII text */
205191739Sobrien#define I 2   /* character appears in ISO-8859 text */
206191739Sobrien#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
207191739Sobrien
208191739Sobrienprivate char text_chars[256] = {
209284778Sdelphij	/*                  BEL BS HT LF VT FF CR    */
210284778Sdelphij	F, F, F, F, F, F, F, T, T, T, T, T, T, T, F, F,  /* 0x0X */
211191739Sobrien	/*                              ESC          */
212191739Sobrien	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
213191739Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
214191739Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
215191739Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
216191739Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
217191739Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
218191739Sobrien	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
219191739Sobrien	/*            NEL                            */
220191739Sobrien	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
221191739Sobrien	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
222191739Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
223191739Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
224191739Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
225191739Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
226191739Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
227191739Sobrien	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
228191739Sobrien};
229191739Sobrien
230191739Sobrienprivate int
231191739Sobrienlooks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
232191739Sobrien    size_t *ulen)
233191739Sobrien{
234191739Sobrien	size_t i;
235191739Sobrien
236191739Sobrien	*ulen = 0;
237191739Sobrien
238191739Sobrien	for (i = 0; i < nbytes; i++) {
239191739Sobrien		int t = text_chars[buf[i]];
240191739Sobrien
241191739Sobrien		if (t != T)
242191739Sobrien			return 0;
243191739Sobrien
244191739Sobrien		ubuf[(*ulen)++] = buf[i];
245191739Sobrien	}
246191739Sobrien
247191739Sobrien	return 1;
248191739Sobrien}
249191739Sobrien
250191739Sobrienprivate int
251191739Sobrienlooks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
252191739Sobrien{
253191739Sobrien	size_t i;
254191739Sobrien
255191739Sobrien	*ulen = 0;
256191739Sobrien
257191739Sobrien	for (i = 0; i < nbytes; i++) {
258191739Sobrien		int t = text_chars[buf[i]];
259191739Sobrien
260191739Sobrien		if (t != T && t != I)
261191739Sobrien			return 0;
262191739Sobrien
263191739Sobrien		ubuf[(*ulen)++] = buf[i];
264191739Sobrien	}
265191739Sobrien
266191739Sobrien	return 1;
267191739Sobrien}
268191739Sobrien
269191739Sobrienprivate int
270191739Sobrienlooks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
271191739Sobrien    size_t *ulen)
272191739Sobrien{
273191739Sobrien	size_t i;
274191739Sobrien
275191739Sobrien	*ulen = 0;
276191739Sobrien
277191739Sobrien	for (i = 0; i < nbytes; i++) {
278191739Sobrien		int t = text_chars[buf[i]];
279191739Sobrien
280191739Sobrien		if (t != T && t != I && t != X)
281191739Sobrien			return 0;
282191739Sobrien
283191739Sobrien		ubuf[(*ulen)++] = buf[i];
284191739Sobrien	}
285191739Sobrien
286191739Sobrien	return 1;
287191739Sobrien}
288191739Sobrien
289191739Sobrien/*
290191739Sobrien * Decide whether some text looks like UTF-8. Returns:
291191739Sobrien *
292191739Sobrien *     -1: invalid UTF-8
293191739Sobrien *      0: uses odd control characters, so doesn't look like text
294191739Sobrien *      1: 7-bit text
295191739Sobrien *      2: definitely UTF-8 text (valid high-bit set bytes)
296191739Sobrien *
297191739Sobrien * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
298191739Sobrien * ubuf must be big enough!
299191739Sobrien */
300191739Sobrienprotected int
301191739Sobrienfile_looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
302191739Sobrien{
303191739Sobrien	size_t i;
304191739Sobrien	int n;
305191739Sobrien	unichar c;
306191739Sobrien	int gotone = 0, ctrl = 0;
307191739Sobrien
308191739Sobrien	if (ubuf)
309191739Sobrien		*ulen = 0;
310191739Sobrien
311191739Sobrien	for (i = 0; i < nbytes; i++) {
312191739Sobrien		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
313191739Sobrien			/*
314191739Sobrien			 * Even if the whole file is valid UTF-8 sequences,
315191739Sobrien			 * still reject it if it uses weird control characters.
316191739Sobrien			 */
317191739Sobrien
318191739Sobrien			if (text_chars[buf[i]] != T)
319191739Sobrien				ctrl = 1;
320191739Sobrien
321191739Sobrien			if (ubuf)
322191739Sobrien				ubuf[(*ulen)++] = buf[i];
323191739Sobrien		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
324191739Sobrien			return -1;
325191739Sobrien		} else {			   /* 11xxxxxx begins UTF-8 */
326191739Sobrien			int following;
327191739Sobrien
328191739Sobrien			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
329191739Sobrien				c = buf[i] & 0x1f;
330191739Sobrien				following = 1;
331191739Sobrien			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
332191739Sobrien				c = buf[i] & 0x0f;
333191739Sobrien				following = 2;
334191739Sobrien			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
335191739Sobrien				c = buf[i] & 0x07;
336191739Sobrien				following = 3;
337191739Sobrien			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
338191739Sobrien				c = buf[i] & 0x03;
339191739Sobrien				following = 4;
340191739Sobrien			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
341191739Sobrien				c = buf[i] & 0x01;
342191739Sobrien				following = 5;
343191739Sobrien			} else
344191739Sobrien				return -1;
345191739Sobrien
346191739Sobrien			for (n = 0; n < following; n++) {
347191739Sobrien				i++;
348191739Sobrien				if (i >= nbytes)
349191739Sobrien					goto done;
350191739Sobrien
351191739Sobrien				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
352191739Sobrien					return -1;
353191739Sobrien
354191739Sobrien				c = (c << 6) + (buf[i] & 0x3f);
355191739Sobrien			}
356191739Sobrien
357191739Sobrien			if (ubuf)
358191739Sobrien				ubuf[(*ulen)++] = c;
359191739Sobrien			gotone = 1;
360191739Sobrien		}
361191739Sobrien	}
362191739Sobriendone:
363191739Sobrien	return ctrl ? 0 : (gotone ? 2 : 1);
364191739Sobrien}
365191739Sobrien
366191739Sobrien/*
367191739Sobrien * Decide whether some text looks like UTF-8 with BOM. If there is no
368191739Sobrien * BOM, return -1; otherwise return the result of looks_utf8 on the
369191739Sobrien * rest of the text.
370191739Sobrien */
371191739Sobrienprivate int
372191739Sobrienlooks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf,
373191739Sobrien    size_t *ulen)
374191739Sobrien{
375191739Sobrien	if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
376191739Sobrien		return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
377191739Sobrien	else
378191739Sobrien		return -1;
379191739Sobrien}
380191739Sobrien
381191739Sobrienprivate int
382284778Sdelphijlooks_utf7(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
383284778Sdelphij{
384284778Sdelphij	if (nbytes > 4 && buf[0] == '+' && buf[1] == '/' && buf[2] == 'v')
385284778Sdelphij		switch (buf[3]) {
386284778Sdelphij		case '8':
387284778Sdelphij		case '9':
388284778Sdelphij		case '+':
389284778Sdelphij		case '/':
390284778Sdelphij			if (ubuf)
391284778Sdelphij				*ulen = 0;
392284778Sdelphij			return 1;
393284778Sdelphij		default:
394284778Sdelphij			return -1;
395284778Sdelphij		}
396284778Sdelphij	else
397284778Sdelphij		return -1;
398284778Sdelphij}
399284778Sdelphij
400284778Sdelphijprivate int
401191739Sobrienlooks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
402191739Sobrien    size_t *ulen)
403191739Sobrien{
404191739Sobrien	int bigend;
405191739Sobrien	size_t i;
406191739Sobrien
407191739Sobrien	if (nbytes < 2)
408191739Sobrien		return 0;
409191739Sobrien
410191739Sobrien	if (buf[0] == 0xff && buf[1] == 0xfe)
411191739Sobrien		bigend = 0;
412191739Sobrien	else if (buf[0] == 0xfe && buf[1] == 0xff)
413191739Sobrien		bigend = 1;
414191739Sobrien	else
415191739Sobrien		return 0;
416191739Sobrien
417191739Sobrien	*ulen = 0;
418191739Sobrien
419191739Sobrien	for (i = 2; i + 1 < nbytes; i += 2) {
420191739Sobrien		/* XXX fix to properly handle chars > 65536 */
421191739Sobrien
422191739Sobrien		if (bigend)
423191739Sobrien			ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
424191739Sobrien		else
425191739Sobrien			ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
426191739Sobrien
427191739Sobrien		if (ubuf[*ulen - 1] == 0xfffe)
428191739Sobrien			return 0;
429191739Sobrien		if (ubuf[*ulen - 1] < 128 &&
430191739Sobrien		    text_chars[(size_t)ubuf[*ulen - 1]] != T)
431191739Sobrien			return 0;
432191739Sobrien	}
433191739Sobrien
434191739Sobrien	return 1 + bigend;
435191739Sobrien}
436191739Sobrien
437191739Sobrien#undef F
438191739Sobrien#undef T
439191739Sobrien#undef I
440191739Sobrien#undef X
441191739Sobrien
442191739Sobrien/*
443191739Sobrien * This table maps each EBCDIC character to an (8-bit extended) ASCII
444191739Sobrien * character, as specified in the rationale for the dd(1) command in
445191739Sobrien * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
446191739Sobrien *
447191739Sobrien * Unfortunately it does not seem to correspond exactly to any of the
448191739Sobrien * five variants of EBCDIC documented in IBM's _Enterprise Systems
449191739Sobrien * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
450191739Sobrien * Edition, July, 1999, pp. I-1 - I-4.
451191739Sobrien *
452191739Sobrien * Fortunately, though, all versions of EBCDIC, including this one, agree
453191739Sobrien * on most of the printing characters that also appear in (7-bit) ASCII.
454191739Sobrien * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
455191739Sobrien *
456191739Sobrien * Fortunately too, there is general agreement that codes 0x00 through
457191739Sobrien * 0x3F represent control characters, 0x41 a nonbreaking space, and the
458191739Sobrien * remainder printing characters.
459191739Sobrien *
460191739Sobrien * This is sufficient to allow us to identify EBCDIC text and to distinguish
461191739Sobrien * between old-style and internationalized examples of text.
462191739Sobrien */
463191739Sobrien
464191739Sobrienprivate unsigned char ebcdic_to_ascii[] = {
465191739Sobrien  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
466191739Sobrien 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
467191739Sobrien128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
468191739Sobrien144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
469191739Sobrien' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
470191739Sobrien'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
471191739Sobrien'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
472191739Sobrien186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
473191739Sobrien195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
474191739Sobrien202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
475191739Sobrien209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
476191739Sobrien216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
477191739Sobrien'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
478191739Sobrien'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
479191739Sobrien'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
480191739Sobrien'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
481191739Sobrien};
482191739Sobrien
483191739Sobrien#ifdef notdef
484191739Sobrien/*
485191739Sobrien * The following EBCDIC-to-ASCII table may relate more closely to reality,
486191739Sobrien * or at least to modern reality.  It comes from
487191739Sobrien *
488191739Sobrien *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
489191739Sobrien *
490191739Sobrien * and maps the characters of EBCDIC code page 1047 (the code used for
491191739Sobrien * Unix-derived software on IBM's 390 systems) to the corresponding
492191739Sobrien * characters from ISO 8859-1.
493191739Sobrien *
494191739Sobrien * If this table is used instead of the above one, some of the special
495191739Sobrien * cases for the NEL character can be taken out of the code.
496191739Sobrien */
497191739Sobrien
498191739Sobrienprivate unsigned char ebcdic_1047_to_8859[] = {
499191739Sobrien0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
500191739Sobrien0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
501191739Sobrien0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
502191739Sobrien0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
503191739Sobrien0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
504191739Sobrien0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
505191739Sobrien0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
506191739Sobrien0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
507191739Sobrien0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
508191739Sobrien0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
509191739Sobrien0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
510191739Sobrien0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
511191739Sobrien0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
512191739Sobrien0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
513191739Sobrien0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
514191739Sobrien0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
515191739Sobrien};
516191739Sobrien#endif
517191739Sobrien
518191739Sobrien/*
519191739Sobrien * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
520191739Sobrien */
521191739Sobrienprivate void
522191739Sobrienfrom_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
523191739Sobrien{
524191739Sobrien	size_t i;
525191739Sobrien
526191739Sobrien	for (i = 0; i < nbytes; i++) {
527191739Sobrien		out[i] = ebcdic_to_ascii[buf[i]];
528191739Sobrien	}
529191739Sobrien}
530