1191739Sobrien/* 2191739Sobrien * Copyright (c) Ian F. Darwin 1986-1995. 3191739Sobrien * Software written by Ian F. Darwin and others; 4191739Sobrien * maintained 1995-present by Christos Zoulas and others. 5191739Sobrien * 6191739Sobrien * Redistribution and use in source and binary forms, with or without 7191739Sobrien * modification, are permitted provided that the following conditions 8191739Sobrien * are met: 9191739Sobrien * 1. Redistributions of source code must retain the above copyright 10191739Sobrien * notice immediately at the beginning of the file, without modification, 11191739Sobrien * this list of conditions, and the following disclaimer. 12191739Sobrien * 2. Redistributions in binary form must reproduce the above copyright 13191739Sobrien * notice, this list of conditions and the following disclaimer in the 14191739Sobrien * documentation and/or other materials provided with the distribution. 15191739Sobrien * 16191739Sobrien * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17191739Sobrien * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18191739Sobrien * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19191739Sobrien * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR 20191739Sobrien * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21191739Sobrien * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22191739Sobrien * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23191739Sobrien * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24191739Sobrien * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25191739Sobrien * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26191739Sobrien * SUCH DAMAGE. 27191739Sobrien */ 28191739Sobrien/* 29191739Sobrien * Encoding -- determine the character encoding of a text file. 30191739Sobrien * 31191739Sobrien * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit 32191739Sobrien * international characters. 33191739Sobrien */ 34191739Sobrien 35191739Sobrien#include "file.h" 36191739Sobrien 37191739Sobrien#ifndef lint 38234449SobrienFILE_RCSID("@(#)$File: encoding.c,v 1.7 2012/01/24 19:02:02 christos Exp $") 39191739Sobrien#endif /* lint */ 40191739Sobrien 41191739Sobrien#include "magic.h" 42191739Sobrien#include <string.h> 43191739Sobrien#include <memory.h> 44191739Sobrien#include <stdlib.h> 45191739Sobrien 46191739Sobrien 47191739Sobrienprivate int looks_ascii(const unsigned char *, size_t, unichar *, size_t *); 48191739Sobrienprivate int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *, 49191739Sobrien size_t *); 50191739Sobrienprivate int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *); 51191739Sobrienprivate int looks_latin1(const unsigned char *, size_t, unichar *, size_t *); 52191739Sobrienprivate int looks_extended(const unsigned char *, size_t, unichar *, size_t *); 53191739Sobrienprivate void from_ebcdic(const unsigned char *, size_t, unsigned char *); 54191739Sobrien 55234449Sobrien#ifdef DEBUG_ENCODING 56234449Sobrien#define DPRINTF(a) printf a 57234449Sobrien#else 58234449Sobrien#define DPRINTF(a) 59234449Sobrien#endif 60234449Sobrien 61191739Sobrien/* 62191739Sobrien * Try to determine whether text is in some character code we can 63191739Sobrien * identify. Each of these tests, if it succeeds, will leave 64191739Sobrien * the text converted into one-unichar-per-character Unicode in 65191739Sobrien * ubuf, and the number of characters converted in ulen. 66191739Sobrien */ 67191739Sobrienprotected int 68191739Sobrienfile_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type) 69191739Sobrien{ 70191739Sobrien size_t mlen; 71191739Sobrien int rv = 1, ucs_type; 72191739Sobrien unsigned char *nbuf = NULL; 73191739Sobrien 74234449Sobrien *type = "text"; 75191739Sobrien mlen = (nbytes + 1) * sizeof(nbuf[0]); 76191739Sobrien if ((nbuf = CAST(unsigned char *, calloc((size_t)1, mlen))) == NULL) { 77191739Sobrien file_oomem(ms, mlen); 78191739Sobrien goto done; 79191739Sobrien } 80191739Sobrien mlen = (nbytes + 1) * sizeof((*ubuf)[0]); 81191739Sobrien if ((*ubuf = CAST(unichar *, calloc((size_t)1, mlen))) == NULL) { 82191739Sobrien file_oomem(ms, mlen); 83191739Sobrien goto done; 84191739Sobrien } 85191739Sobrien 86191739Sobrien if (looks_ascii(buf, nbytes, *ubuf, ulen)) { 87234449Sobrien DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen)); 88191739Sobrien *code = "ASCII"; 89191739Sobrien *code_mime = "us-ascii"; 90191739Sobrien } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) { 91234449Sobrien DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen)); 92191739Sobrien *code = "UTF-8 Unicode (with BOM)"; 93191739Sobrien *code_mime = "utf-8"; 94191739Sobrien } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) { 95234449Sobrien DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen)); 96234449Sobrien *code = "UTF-8 Unicode (with BOM)"; 97191739Sobrien *code = "UTF-8 Unicode"; 98191739Sobrien *code_mime = "utf-8"; 99191739Sobrien } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) { 100191739Sobrien if (ucs_type == 1) { 101191739Sobrien *code = "Little-endian UTF-16 Unicode"; 102191739Sobrien *code_mime = "utf-16le"; 103191739Sobrien } else { 104191739Sobrien *code = "Big-endian UTF-16 Unicode"; 105191739Sobrien *code_mime = "utf-16be"; 106191739Sobrien } 107234449Sobrien DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen)); 108191739Sobrien } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) { 109234449Sobrien DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen)); 110191739Sobrien *code = "ISO-8859"; 111191739Sobrien *code_mime = "iso-8859-1"; 112191739Sobrien } else if (looks_extended(buf, nbytes, *ubuf, ulen)) { 113234449Sobrien DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen)); 114191739Sobrien *code = "Non-ISO extended-ASCII"; 115191739Sobrien *code_mime = "unknown-8bit"; 116191739Sobrien } else { 117191739Sobrien from_ebcdic(buf, nbytes, nbuf); 118191739Sobrien 119191739Sobrien if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) { 120234449Sobrien DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen)); 121191739Sobrien *code = "EBCDIC"; 122191739Sobrien *code_mime = "ebcdic"; 123191739Sobrien } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) { 124234449Sobrien DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n", 125234449Sobrien *ulen)); 126191739Sobrien *code = "International EBCDIC"; 127191739Sobrien *code_mime = "ebcdic"; 128191739Sobrien } else { /* Doesn't look like text at all */ 129234449Sobrien DPRINTF(("binary\n")); 130191739Sobrien rv = 0; 131191739Sobrien *type = "binary"; 132191739Sobrien } 133191739Sobrien } 134191739Sobrien 135191739Sobrien done: 136234449Sobrien free(nbuf); 137191739Sobrien 138191739Sobrien return rv; 139191739Sobrien} 140191739Sobrien 141191739Sobrien/* 142191739Sobrien * This table reflects a particular philosophy about what constitutes 143191739Sobrien * "text," and there is room for disagreement about it. 144191739Sobrien * 145191739Sobrien * Version 3.31 of the file command considered a file to be ASCII if 146191739Sobrien * each of its characters was approved by either the isascii() or 147191739Sobrien * isalpha() function. On most systems, this would mean that any 148191739Sobrien * file consisting only of characters in the range 0x00 ... 0x7F 149191739Sobrien * would be called ASCII text, but many systems might reasonably 150191739Sobrien * consider some characters outside this range to be alphabetic, 151191739Sobrien * so the file command would call such characters ASCII. It might 152191739Sobrien * have been more accurate to call this "considered textual on the 153191739Sobrien * local system" than "ASCII." 154191739Sobrien * 155191739Sobrien * It considered a file to be "International language text" if each 156191739Sobrien * of its characters was either an ASCII printing character (according 157191739Sobrien * to the real ASCII standard, not the above test), a character in 158191739Sobrien * the range 0x80 ... 0xFF, or one of the following control characters: 159191739Sobrien * backspace, tab, line feed, vertical tab, form feed, carriage return, 160191739Sobrien * escape. No attempt was made to determine the language in which files 161191739Sobrien * of this type were written. 162191739Sobrien * 163191739Sobrien * 164191739Sobrien * The table below considers a file to be ASCII if all of its characters 165191739Sobrien * are either ASCII printing characters (again, according to the X3.4 166191739Sobrien * standard, not isascii()) or any of the following controls: bell, 167191739Sobrien * backspace, tab, line feed, form feed, carriage return, esc, nextline. 168191739Sobrien * 169191739Sobrien * I include bell because some programs (particularly shell scripts) 170191739Sobrien * use it literally, even though it is rare in normal text. I exclude 171191739Sobrien * vertical tab because it never seems to be used in real text. I also 172191739Sobrien * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85), 173191739Sobrien * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline 174191739Sobrien * character to. It might be more appropriate to include it in the 8859 175191739Sobrien * set instead of the ASCII set, but it's got to be included in *something* 176191739Sobrien * we recognize or EBCDIC files aren't going to be considered textual. 177191739Sobrien * Some old Unix source files use SO/SI (^N/^O) to shift between Greek 178191739Sobrien * and Latin characters, so these should possibly be allowed. But they 179191739Sobrien * make a real mess on VT100-style displays if they're not paired properly, 180191739Sobrien * so we are probably better off not calling them text. 181191739Sobrien * 182191739Sobrien * A file is considered to be ISO-8859 text if its characters are all 183191739Sobrien * either ASCII, according to the above definition, or printing characters 184191739Sobrien * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF. 185191739Sobrien * 186191739Sobrien * Finally, a file is considered to be international text from some other 187191739Sobrien * character code if its characters are all either ISO-8859 (according to 188191739Sobrien * the above definition) or characters in the range 0x80 ... 0x9F, which 189191739Sobrien * ISO-8859 considers to be control characters but the IBM PC and Macintosh 190191739Sobrien * consider to be printing characters. 191191739Sobrien */ 192191739Sobrien 193191739Sobrien#define F 0 /* character never appears in text */ 194191739Sobrien#define T 1 /* character appears in plain ASCII text */ 195191739Sobrien#define I 2 /* character appears in ISO-8859 text */ 196191739Sobrien#define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ 197191739Sobrien 198191739Sobrienprivate char text_chars[256] = { 199191739Sobrien /* BEL BS HT LF FF CR */ 200191739Sobrien F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */ 201191739Sobrien /* ESC */ 202191739Sobrien F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ 203191739Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ 204191739Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ 205191739Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */ 206191739Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */ 207191739Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */ 208191739Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */ 209191739Sobrien /* NEL */ 210191739Sobrien X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */ 211191739Sobrien X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */ 212191739Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */ 213191739Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */ 214191739Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */ 215191739Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */ 216191739Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */ 217191739Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ 218191739Sobrien}; 219191739Sobrien 220191739Sobrienprivate int 221191739Sobrienlooks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf, 222191739Sobrien size_t *ulen) 223191739Sobrien{ 224191739Sobrien size_t i; 225191739Sobrien 226191739Sobrien *ulen = 0; 227191739Sobrien 228191739Sobrien for (i = 0; i < nbytes; i++) { 229191739Sobrien int t = text_chars[buf[i]]; 230191739Sobrien 231191739Sobrien if (t != T) 232191739Sobrien return 0; 233191739Sobrien 234191739Sobrien ubuf[(*ulen)++] = buf[i]; 235191739Sobrien } 236191739Sobrien 237191739Sobrien return 1; 238191739Sobrien} 239191739Sobrien 240191739Sobrienprivate int 241191739Sobrienlooks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) 242191739Sobrien{ 243191739Sobrien size_t i; 244191739Sobrien 245191739Sobrien *ulen = 0; 246191739Sobrien 247191739Sobrien for (i = 0; i < nbytes; i++) { 248191739Sobrien int t = text_chars[buf[i]]; 249191739Sobrien 250191739Sobrien if (t != T && t != I) 251191739Sobrien return 0; 252191739Sobrien 253191739Sobrien ubuf[(*ulen)++] = buf[i]; 254191739Sobrien } 255191739Sobrien 256191739Sobrien return 1; 257191739Sobrien} 258191739Sobrien 259191739Sobrienprivate int 260191739Sobrienlooks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf, 261191739Sobrien size_t *ulen) 262191739Sobrien{ 263191739Sobrien size_t i; 264191739Sobrien 265191739Sobrien *ulen = 0; 266191739Sobrien 267191739Sobrien for (i = 0; i < nbytes; i++) { 268191739Sobrien int t = text_chars[buf[i]]; 269191739Sobrien 270191739Sobrien if (t != T && t != I && t != X) 271191739Sobrien return 0; 272191739Sobrien 273191739Sobrien ubuf[(*ulen)++] = buf[i]; 274191739Sobrien } 275191739Sobrien 276191739Sobrien return 1; 277191739Sobrien} 278191739Sobrien 279191739Sobrien/* 280191739Sobrien * Decide whether some text looks like UTF-8. Returns: 281191739Sobrien * 282191739Sobrien * -1: invalid UTF-8 283191739Sobrien * 0: uses odd control characters, so doesn't look like text 284191739Sobrien * 1: 7-bit text 285191739Sobrien * 2: definitely UTF-8 text (valid high-bit set bytes) 286191739Sobrien * 287191739Sobrien * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen; 288191739Sobrien * ubuf must be big enough! 289191739Sobrien */ 290191739Sobrienprotected int 291191739Sobrienfile_looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) 292191739Sobrien{ 293191739Sobrien size_t i; 294191739Sobrien int n; 295191739Sobrien unichar c; 296191739Sobrien int gotone = 0, ctrl = 0; 297191739Sobrien 298191739Sobrien if (ubuf) 299191739Sobrien *ulen = 0; 300191739Sobrien 301191739Sobrien for (i = 0; i < nbytes; i++) { 302191739Sobrien if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ 303191739Sobrien /* 304191739Sobrien * Even if the whole file is valid UTF-8 sequences, 305191739Sobrien * still reject it if it uses weird control characters. 306191739Sobrien */ 307191739Sobrien 308191739Sobrien if (text_chars[buf[i]] != T) 309191739Sobrien ctrl = 1; 310191739Sobrien 311191739Sobrien if (ubuf) 312191739Sobrien ubuf[(*ulen)++] = buf[i]; 313191739Sobrien } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */ 314191739Sobrien return -1; 315191739Sobrien } else { /* 11xxxxxx begins UTF-8 */ 316191739Sobrien int following; 317191739Sobrien 318191739Sobrien if ((buf[i] & 0x20) == 0) { /* 110xxxxx */ 319191739Sobrien c = buf[i] & 0x1f; 320191739Sobrien following = 1; 321191739Sobrien } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */ 322191739Sobrien c = buf[i] & 0x0f; 323191739Sobrien following = 2; 324191739Sobrien } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */ 325191739Sobrien c = buf[i] & 0x07; 326191739Sobrien following = 3; 327191739Sobrien } else if ((buf[i] & 0x04) == 0) { /* 111110xx */ 328191739Sobrien c = buf[i] & 0x03; 329191739Sobrien following = 4; 330191739Sobrien } else if ((buf[i] & 0x02) == 0) { /* 1111110x */ 331191739Sobrien c = buf[i] & 0x01; 332191739Sobrien following = 5; 333191739Sobrien } else 334191739Sobrien return -1; 335191739Sobrien 336191739Sobrien for (n = 0; n < following; n++) { 337191739Sobrien i++; 338191739Sobrien if (i >= nbytes) 339191739Sobrien goto done; 340191739Sobrien 341191739Sobrien if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40)) 342191739Sobrien return -1; 343191739Sobrien 344191739Sobrien c = (c << 6) + (buf[i] & 0x3f); 345191739Sobrien } 346191739Sobrien 347191739Sobrien if (ubuf) 348191739Sobrien ubuf[(*ulen)++] = c; 349191739Sobrien gotone = 1; 350191739Sobrien } 351191739Sobrien } 352191739Sobriendone: 353191739Sobrien return ctrl ? 0 : (gotone ? 2 : 1); 354191739Sobrien} 355191739Sobrien 356191739Sobrien/* 357191739Sobrien * Decide whether some text looks like UTF-8 with BOM. If there is no 358191739Sobrien * BOM, return -1; otherwise return the result of looks_utf8 on the 359191739Sobrien * rest of the text. 360191739Sobrien */ 361191739Sobrienprivate int 362191739Sobrienlooks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf, 363191739Sobrien size_t *ulen) 364191739Sobrien{ 365191739Sobrien if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf) 366191739Sobrien return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen); 367191739Sobrien else 368191739Sobrien return -1; 369191739Sobrien} 370191739Sobrien 371191739Sobrienprivate int 372191739Sobrienlooks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf, 373191739Sobrien size_t *ulen) 374191739Sobrien{ 375191739Sobrien int bigend; 376191739Sobrien size_t i; 377191739Sobrien 378191739Sobrien if (nbytes < 2) 379191739Sobrien return 0; 380191739Sobrien 381191739Sobrien if (buf[0] == 0xff && buf[1] == 0xfe) 382191739Sobrien bigend = 0; 383191739Sobrien else if (buf[0] == 0xfe && buf[1] == 0xff) 384191739Sobrien bigend = 1; 385191739Sobrien else 386191739Sobrien return 0; 387191739Sobrien 388191739Sobrien *ulen = 0; 389191739Sobrien 390191739Sobrien for (i = 2; i + 1 < nbytes; i += 2) { 391191739Sobrien /* XXX fix to properly handle chars > 65536 */ 392191739Sobrien 393191739Sobrien if (bigend) 394191739Sobrien ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i]; 395191739Sobrien else 396191739Sobrien ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1]; 397191739Sobrien 398191739Sobrien if (ubuf[*ulen - 1] == 0xfffe) 399191739Sobrien return 0; 400191739Sobrien if (ubuf[*ulen - 1] < 128 && 401191739Sobrien text_chars[(size_t)ubuf[*ulen - 1]] != T) 402191739Sobrien return 0; 403191739Sobrien } 404191739Sobrien 405191739Sobrien return 1 + bigend; 406191739Sobrien} 407191739Sobrien 408191739Sobrien#undef F 409191739Sobrien#undef T 410191739Sobrien#undef I 411191739Sobrien#undef X 412191739Sobrien 413191739Sobrien/* 414191739Sobrien * This table maps each EBCDIC character to an (8-bit extended) ASCII 415191739Sobrien * character, as specified in the rationale for the dd(1) command in 416191739Sobrien * draft 11.2 (September, 1991) of the POSIX P1003.2 standard. 417191739Sobrien * 418191739Sobrien * Unfortunately it does not seem to correspond exactly to any of the 419191739Sobrien * five variants of EBCDIC documented in IBM's _Enterprise Systems 420191739Sobrien * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh 421191739Sobrien * Edition, July, 1999, pp. I-1 - I-4. 422191739Sobrien * 423191739Sobrien * Fortunately, though, all versions of EBCDIC, including this one, agree 424191739Sobrien * on most of the printing characters that also appear in (7-bit) ASCII. 425191739Sobrien * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all. 426191739Sobrien * 427191739Sobrien * Fortunately too, there is general agreement that codes 0x00 through 428191739Sobrien * 0x3F represent control characters, 0x41 a nonbreaking space, and the 429191739Sobrien * remainder printing characters. 430191739Sobrien * 431191739Sobrien * This is sufficient to allow us to identify EBCDIC text and to distinguish 432191739Sobrien * between old-style and internationalized examples of text. 433191739Sobrien */ 434191739Sobrien 435191739Sobrienprivate unsigned char ebcdic_to_ascii[] = { 436191739Sobrien 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 437191739Sobrien 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, 438191739Sobrien128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, 439191739Sobrien144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26, 440191739Sobrien' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|', 441191739Sobrien'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~', 442191739Sobrien'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?', 443191739Sobrien186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"', 444191739Sobrien195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201, 445191739Sobrien202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208, 446191739Sobrien209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215, 447191739Sobrien216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231, 448191739Sobrien'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237, 449191739Sobrien'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243, 450191739Sobrien'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249, 451191739Sobrien'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255 452191739Sobrien}; 453191739Sobrien 454191739Sobrien#ifdef notdef 455191739Sobrien/* 456191739Sobrien * The following EBCDIC-to-ASCII table may relate more closely to reality, 457191739Sobrien * or at least to modern reality. It comes from 458191739Sobrien * 459191739Sobrien * http://ftp.s390.ibm.com/products/oe/bpxqp9.html 460191739Sobrien * 461191739Sobrien * and maps the characters of EBCDIC code page 1047 (the code used for 462191739Sobrien * Unix-derived software on IBM's 390 systems) to the corresponding 463191739Sobrien * characters from ISO 8859-1. 464191739Sobrien * 465191739Sobrien * If this table is used instead of the above one, some of the special 466191739Sobrien * cases for the NEL character can be taken out of the code. 467191739Sobrien */ 468191739Sobrien 469191739Sobrienprivate unsigned char ebcdic_1047_to_8859[] = { 470191739Sobrien0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F, 471191739Sobrien0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F, 472191739Sobrien0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07, 473191739Sobrien0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A, 474191739Sobrien0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C, 475191739Sobrien0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E, 476191739Sobrien0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F, 477191739Sobrien0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22, 478191739Sobrien0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1, 479191739Sobrien0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4, 480191739Sobrien0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE, 481191739Sobrien0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7, 482191739Sobrien0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5, 483191739Sobrien0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF, 484191739Sobrien0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5, 485191739Sobrien0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F 486191739Sobrien}; 487191739Sobrien#endif 488191739Sobrien 489191739Sobrien/* 490191739Sobrien * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII. 491191739Sobrien */ 492191739Sobrienprivate void 493191739Sobrienfrom_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out) 494191739Sobrien{ 495191739Sobrien size_t i; 496191739Sobrien 497191739Sobrien for (i = 0; i < nbytes; i++) { 498191739Sobrien out[i] = ebcdic_to_ascii[buf[i]]; 499191739Sobrien } 500191739Sobrien} 501