1/*
2 * Copyright 2006, Ingo Weinhold <bonefish@cs.tu-berlin.de>.
3 * All rights reserved. Distributed under the terms of the MIT License.
4 */
5
6#include "TextSnifferAddon.h"
7
8#include <MimeType.h>
9
10
11static int file_ascmagic(const unsigned char *buf, size_t nbytes,
12	BMimeType* mimeType);
13
14
15// constructor
16TextSnifferAddon::TextSnifferAddon()
17{
18}
19
20// destructor
21TextSnifferAddon::~TextSnifferAddon()
22{
23}
24
25// MinimalBufferSize
26size_t
27TextSnifferAddon::MinimalBufferSize()
28{
29	return 512;
30}
31
32// GuessMimeType
33float
34TextSnifferAddon::GuessMimeType(const char* fileName, BMimeType* type)
35{
36	// we check content only
37	return -1;
38}
39
40// GuessMimeType
41float
42TextSnifferAddon::GuessMimeType(BFile* file, const void* buffer, int32 length,
43	BMimeType* type)
44{
45	if (file_ascmagic((const unsigned char*)buffer, length, type)) {
46		// If the buffer is very short, we return a lower priority. Maybe
47		// someone else knows better.
48		if (length < 20)
49			return .0f;
50		return 0.25f;
51	}
52
53	return -1;
54}
55
56
57// #pragma mark - ascmagic.c from the BSD file tool
58/*
59 * The following code has been taken from version 4.17 of the BSD file tool,
60 * file ascmagic.c, modified for our purpose.
61 */
62
63/*
64 * Copyright (c) Ian F. Darwin 1986-1995.
65 * Software written by Ian F. Darwin and others;
66 * maintained 1995-present by Christos Zoulas and others.
67 *
68 * Redistribution and use in source and binary forms, with or without
69 * modification, are permitted provided that the following conditions
70 * are met:
71 * 1. Redistributions of source code must retain the above copyright
72 *    notice immediately at the beginning of the file, without modification,
73 *    this list of conditions, and the following disclaimer.
74 * 2. Redistributions in binary form must reproduce the above copyright
75 *    notice, this list of conditions and the following disclaimer in the
76 *    documentation and/or other materials provided with the distribution.
77 *
78 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
79 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
80 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
81 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
82 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
83 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
84 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
85 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
86 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
87 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
88 * SUCH DAMAGE.
89 */
90/*
91 * ASCII magic -- file types that we know based on keywords
92 * that can appear anywhere in the file.
93 *
94 * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
95 * to handle character codes other than ASCII on a unified basis.
96 *
97 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
98 * international characters, now subsumed into this file.
99 */
100
101#include <stdio.h>
102#include <string.h>
103#include <memory.h>
104#include <ctype.h>
105#include <stdlib.h>
106#include <unistd.h>
107#include "names.h"
108
/* One decoded character; every looks_*() test emits one of these per char. */
typedef unsigned long my_unichar;

/* longest sane line length */
#define MAXLINELEN 300

/*
 * Whitespace test used on both raw bytes and decoded characters. Besides the
 * usual blanks it accepts 0x85 (the X3.64/ECMA-43 "next line" control).
 * The argument is evaluated several times, so it must be free of side
 * effects.
 */
#define ISSPC(x) \
	((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
	 || (x) == 0x85 || (x) == '\f')

/*
 * Character-set probes. Each one decodes buf[0 ... nbytes-1] into ubuf and
 * stores the number of characters written in *ulen.
 */
static int looks_ascii(const unsigned char *buf, size_t nbytes,
	my_unichar *ubuf, size_t *ulen);
static int looks_utf8(const unsigned char *buf, size_t nbytes,
	my_unichar *ubuf, size_t *ulen);
static int looks_unicode(const unsigned char *buf, size_t nbytes,
	my_unichar *ubuf, size_t *ulen);
static int looks_latin1(const unsigned char *buf, size_t nbytes,
	my_unichar *ubuf, size_t *ulen);
static int looks_extended(const unsigned char *buf, size_t nbytes,
	my_unichar *ubuf, size_t *ulen);

/* EBCDIC -> ASCII translation of nbytes bytes from buf into out. */
static void from_ebcdic(const unsigned char *buf, size_t nbytes,
	unsigned char *out);

/* Exact comparison of a NUL-terminated name against ulen decoded chars. */
static int ascmatch(const unsigned char *s, const my_unichar *us,
	size_t ulen);
122
123
124static int
125file_ascmagic(const unsigned char *buf, size_t nbytes, BMimeType* mimeType)
126{
127	size_t i;
128	unsigned char *nbuf = NULL;
129	my_unichar *ubuf = NULL;
130	size_t ulen;
131	struct names *p;
132	int rv = -1;
133
134	const char *code = NULL;
135	const char *code_mime = NULL;
136	const char *type = NULL;
137	const char *subtype = NULL;
138	const char *subtypeMimeGeneric = NULL;
139	const char *subtypeMimeSpecific = NULL;
140
141	int has_escapes = 0;
142	int has_backspace = 0;
143	int seen_cr = 0;
144
145	int n_crlf = 0;
146	int n_lf = 0;
147	int n_cr = 0;
148	int n_nel = 0;
149
150	int last_line_end = -1;
151	int has_long_lines = 0;
152
153	if ((nbuf = (unsigned char*)malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL)
154		goto done;
155	if ((ubuf = (my_unichar*)malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL)
156		goto done;
157
158	/*
159	 * Then try to determine whether it's any character code we can
160	 * identify.  Each of these tests, if it succeeds, will leave
161	 * the text converted into one-my_unichar-per-character Unicode in
162	 * ubuf, and the number of characters converted in ulen.
163	 */
164	if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
165		code = "ASCII";
166		code_mime = "us-ascii";
167		type = "text";
168	} else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
169		code = "UTF-8 Unicode";
170		code_mime = "utf-8";
171		type = "text";
172	} else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
173		if (i == 1)
174			code = "Little-endian UTF-16 Unicode";
175		else
176			code = "Big-endian UTF-16 Unicode";
177
178		type = "character data";
179		code_mime = "utf-16";    /* is this defined? */
180	} else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
181		code = "ISO-8859";
182		type = "text";
183		code_mime = "iso-8859-1";
184	} else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
185		code = "Non-ISO extended-ASCII";
186		type = "text";
187		code_mime = "unknown";
188	} else {
189		from_ebcdic(buf, nbytes, nbuf);
190
191		if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
192			code = "EBCDIC";
193			type = "character data";
194			code_mime = "ebcdic";
195		} else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
196			code = "International EBCDIC";
197			type = "character data";
198			code_mime = "ebcdic";
199		} else {
200			rv = 0;
201			goto done;  /* doesn't look like text at all */
202		}
203	}
204
205	if (nbytes <= 1) {
206		rv = 0;
207		goto done;
208	}
209
210	/*
211	 * for troff, look for . + letter + letter or .\";
212	 * this must be done to disambiguate tar archives' ./file
213	 * and other trash from real troff input.
214	 *
215	 * I believe Plan 9 troff allows non-ASCII characters in the names
216	 * of macros, so this test might possibly fail on such a file.
217	 */
218	if (*ubuf == '.') {
219		my_unichar *tp = ubuf + 1;
220
221		while (ISSPC(*tp))
222			++tp;	/* skip leading whitespace */
223		if ((tp[0] == '\\' && tp[1] == '\"') ||
224		    (isascii((unsigned char)tp[0]) &&
225		     isalnum((unsigned char)tp[0]) &&
226		     isascii((unsigned char)tp[1]) &&
227		     isalnum((unsigned char)tp[1]) &&
228		     ISSPC(tp[2]))) {
229		    subtypeMimeGeneric = "text/x-source-code";
230			subtypeMimeSpecific = "text/troff";
231			subtype = "troff or preprocessor input";
232			goto subtype_identified;
233		}
234	}
235
236	if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
237		subtypeMimeGeneric = "text/x-source-code";
238		subtypeMimeSpecific = "text/fortran";
239		subtype = "fortran program";
240		goto subtype_identified;
241	}
242
243	/* look for tokens from names.h - this is expensive! */
244
245	i = 0;
246	while (i < ulen) {
247		size_t end;
248
249		/*
250		 * skip past any leading space
251		 */
252		while (i < ulen && ISSPC(ubuf[i]))
253			i++;
254		if (i >= ulen)
255			break;
256
257		/*
258		 * find the next whitespace
259		 */
260		for (end = i + 1; end < nbytes; end++)
261			if (ISSPC(ubuf[end]))
262				break;
263
264		/*
265		 * compare the word thus isolated against the token list
266		 */
267		for (p = names; p < names + NNAMES; p++) {
268			if (ascmatch((const unsigned char *)p->name, ubuf + i,
269			    end - i)) {
270				subtype = types[p->type].human;
271				subtypeMimeGeneric = types[p->type].generic_mime;
272				subtypeMimeSpecific = types[p->type].specific_mime;
273				goto subtype_identified;
274			}
275		}
276
277		i = end;
278	}
279
280subtype_identified:
281
282	/*
283	 * Now try to discover other details about the file.
284	 */
285	for (i = 0; i < ulen; i++) {
286		if (ubuf[i] == '\n') {
287			if (seen_cr)
288				n_crlf++;
289			else
290				n_lf++;
291			last_line_end = i;
292		} else if (seen_cr)
293			n_cr++;
294
295		seen_cr = (ubuf[i] == '\r');
296		if (seen_cr)
297			last_line_end = i;
298
299		if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
300			n_nel++;
301			last_line_end = i;
302		}
303
304		/* If this line is _longer_ than MAXLINELEN, remember it. */
305		if ((int)i > last_line_end + MAXLINELEN)
306			has_long_lines = 1;
307
308		if (ubuf[i] == '\033')
309			has_escapes = 1;
310		if (ubuf[i] == '\b')
311			has_backspace = 1;
312	}
313
314	rv = 1;
315done:
316	if (nbuf)
317		free(nbuf);
318	if (ubuf)
319		free(ubuf);
320
321	if (rv) {
322		// If we have identified the subtype, return it, otherwise just
323		// text/plain.
324
325		bool found = false;
326		if (subtypeMimeSpecific != NULL) {
327			mimeType->SetTo(subtypeMimeSpecific);
328			if (mimeType->IsInstalled())
329				found = true;
330		}
331		if (!found && subtypeMimeGeneric != NULL) {
332			mimeType->SetTo(subtypeMimeGeneric);
333			if (mimeType->IsInstalled())
334				found = true;
335		}
336		if (!found)
337			mimeType->SetTo("text/plain");
338	}
339
340	return rv;
341}
342
343static int
344ascmatch(const unsigned char *s, const my_unichar *us, size_t ulen)
345{
346	size_t i;
347
348	for (i = 0; i < ulen; i++) {
349		if (s[i] != us[i])
350			return 0;
351	}
352
353	if (s[i])
354		return 0;
355	else
356		return 1;
357}
358
359/*
360 * This table reflects a particular philosophy about what constitutes
361 * "text," and there is room for disagreement about it.
362 *
363 * Version 3.31 of the file command considered a file to be ASCII if
364 * each of its characters was approved by either the isascii() or
365 * isalpha() function.  On most systems, this would mean that any
366 * file consisting only of characters in the range 0x00 ... 0x7F
367 * would be called ASCII text, but many systems might reasonably
368 * consider some characters outside this range to be alphabetic,
369 * so the file command would call such characters ASCII.  It might
370 * have been more accurate to call this "considered textual on the
371 * local system" than "ASCII."
372 *
373 * It considered a file to be "International language text" if each
374 * of its characters was either an ASCII printing character (according
375 * to the real ASCII standard, not the above test), a character in
376 * the range 0x80 ... 0xFF, or one of the following control characters:
377 * backspace, tab, line feed, vertical tab, form feed, carriage return,
378 * escape.  No attempt was made to determine the language in which files
379 * of this type were written.
380 *
381 *
382 * The table below considers a file to be ASCII if all of its characters
383 * are either ASCII printing characters (again, according to the X3.4
384 * standard, not isascii()) or any of the following controls: bell,
385 * backspace, tab, line feed, form feed, carriage return, esc, nextline.
386 *
387 * I include bell because some programs (particularly shell scripts)
388 * use it literally, even though it is rare in normal text.  I exclude
389 * vertical tab because it never seems to be used in real text.  I also
390 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
391 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
392 * character to.  It might be more appropriate to include it in the 8859
393 * set instead of the ASCII set, but it's got to be included in *something*
394 * we recognize or EBCDIC files aren't going to be considered textual.
395 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
396 * and Latin characters, so these should possibly be allowed.  But they
397 * make a real mess on VT100-style displays if they're not paired properly,
398 * so we are probably better off not calling them text.
399 *
400 * A file is considered to be ISO-8859 text if its characters are all
401 * either ASCII, according to the above definition, or printing characters
402 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
403 *
404 * Finally, a file is considered to be international text from some other
405 * character code if its characters are all either ISO-8859 (according to
406 * the above definition) or characters in the range 0x80 ... 0x9F, which
407 * ISO-8859 considers to be control characters but the IBM PC and Macintosh
408 * consider to be printing characters.
409 */
410
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * Classification of every byte value as F, T, I or X. The table is only
 * ever read, hence const.
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
437
438static int
439looks_ascii(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
440    size_t *ulen)
441{
442	int i;
443
444	*ulen = 0;
445
446	for (i = 0; i < (int)nbytes; i++) {
447		int t = text_chars[buf[i]];
448
449		if (t != T)
450			return 0;
451
452		ubuf[(*ulen)++] = buf[i];
453	}
454
455	return 1;
456}
457
458static int
459looks_latin1(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
460{
461	int i;
462
463	*ulen = 0;
464
465	for (i = 0; i < (int)nbytes; i++) {
466		int t = text_chars[buf[i]];
467
468		if (t != T && t != I)
469			return 0;
470
471		ubuf[(*ulen)++] = buf[i];
472	}
473
474	return 1;
475}
476
477static int
478looks_extended(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
479    size_t *ulen)
480{
481	int i;
482
483	*ulen = 0;
484
485	for (i = 0; i < (int)nbytes; i++) {
486		int t = text_chars[buf[i]];
487
488		if (t != T && t != I && t != X)
489			return 0;
490
491		ubuf[(*ulen)++] = buf[i];
492	}
493
494	return 1;
495}
496
497static int
498looks_utf8(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
499{
500	int i, n;
501	my_unichar c;
502	int gotone = 0;
503
504	*ulen = 0;
505
506	for (i = 0; i < (int)nbytes; i++) {
507		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
508			/*
509			 * Even if the whole file is valid UTF-8 sequences,
510			 * still reject it if it uses weird control characters.
511			 */
512
513			if (text_chars[buf[i]] != T)
514				return 0;
515
516			ubuf[(*ulen)++] = buf[i];
517		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
518			return 0;
519		} else {			   /* 11xxxxxx begins UTF-8 */
520			int following;
521
522			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
523				c = buf[i] & 0x1f;
524				following = 1;
525			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
526				c = buf[i] & 0x0f;
527				following = 2;
528			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
529				c = buf[i] & 0x07;
530				following = 3;
531			} else if ((buf[i] & 0x04) == 0) {	/* 111110xx */
532				c = buf[i] & 0x03;
533				following = 4;
534			} else if ((buf[i] & 0x02) == 0) {	/* 1111110x */
535				c = buf[i] & 0x01;
536				following = 5;
537			} else
538				return 0;
539
540			for (n = 0; n < following; n++) {
541				i++;
542				if (i >= (int)nbytes)
543					goto done;
544
545				if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
546					return 0;
547
548				c = (c << 6) + (buf[i] & 0x3f);
549			}
550
551			ubuf[(*ulen)++] = c;
552			gotone = 1;
553		}
554	}
555done:
556	return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
557}
558
559static int
560looks_unicode(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
561    size_t *ulen)
562{
563	int bigend;
564	int i;
565
566	if (nbytes < 2)
567		return 0;
568
569	if (buf[0] == 0xff && buf[1] == 0xfe)
570		bigend = 0;
571	else if (buf[0] == 0xfe && buf[1] == 0xff)
572		bigend = 1;
573	else
574		return 0;
575
576	*ulen = 0;
577
578	for (i = 2; i + 1 < (int)nbytes; i += 2) {
579		/* XXX fix to properly handle chars > 65536 */
580
581		if (bigend)
582			ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
583		else
584			ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
585
586		if (ubuf[*ulen - 1] == 0xfffe)
587			return 0;
588		if (ubuf[*ulen - 1] < 128 &&
589		    text_chars[(size_t)ubuf[*ulen - 1]] != T)
590			return 0;
591	}
592
593	return 1 + bigend;
594}
595
596#undef F
597#undef T
598#undef I
599#undef X
600
601/*
602 * This table maps each EBCDIC character to an (8-bit extended) ASCII
603 * character, as specified in the rationale for the dd(1) command in
604 * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
605 *
606 * Unfortunately it does not seem to correspond exactly to any of the
607 * five variants of EBCDIC documented in IBM's _Enterprise Systems
608 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
609 * Edition, July, 1999, pp. I-1 - I-4.
610 *
611 * Fortunately, though, all versions of EBCDIC, including this one, agree
612 * on most of the printing characters that also appear in (7-bit) ASCII.
613 * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
614 *
615 * Fortunately too, there is general agreement that codes 0x00 through
616 * 0x3F represent control characters, 0x41 a nonbreaking space, and the
617 * remainder printing characters.
618 *
619 * This is sufficient to allow us to identify EBCDIC text and to distinguish
620 * between old-style and internationalized examples of text.
621 */
622
/* Indexed by EBCDIC byte value; read-only lookup table, hence const. */
static const unsigned char ebcdic_to_ascii[] = {
  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
};
641
642#ifdef notdef
643/*
644 * The following EBCDIC-to-ASCII table may relate more closely to reality,
645 * or at least to modern reality.  It comes from
646 *
647 *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
648 *
649 * and maps the characters of EBCDIC code page 1047 (the code used for
650 * Unix-derived software on IBM's 390 systems) to the corresponding
651 * characters from ISO 8859-1.
652 *
653 * If this table is used instead of the above one, some of the special
654 * cases for the NEL character can be taken out of the code.
655 */
656
/* Indexed by EBCDIC code page 1047 byte value; read-only, hence const. */
static const unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
675#endif
676
677/*
678 * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
679 */
680static void
681from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
682{
683	int i;
684
685	for (i = 0; i < (int)nbytes; i++) {
686		out[i] = ebcdic_to_ascii[buf[i]];
687	}
688}
689