1191739Sobrien/* 2191739Sobrien * Copyright (c) Ian F. Darwin 1986-1995. 3191739Sobrien * Software written by Ian F. Darwin and others; 4191739Sobrien * maintained 1995-present by Christos Zoulas and others. 5191739Sobrien * 6191739Sobrien * Redistribution and use in source and binary forms, with or without 7191739Sobrien * modification, are permitted provided that the following conditions 8191739Sobrien * are met: 9191739Sobrien * 1. Redistributions of source code must retain the above copyright 10191739Sobrien * notice immediately at the beginning of the file, without modification, 11191739Sobrien * this list of conditions, and the following disclaimer. 12191739Sobrien * 2. Redistributions in binary form must reproduce the above copyright 13191739Sobrien * notice, this list of conditions and the following disclaimer in the 14191739Sobrien * documentation and/or other materials provided with the distribution. 15191739Sobrien * 16191739Sobrien * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17191739Sobrien * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18191739Sobrien * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19191739Sobrien * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR 20191739Sobrien * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21191739Sobrien * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22191739Sobrien * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23191739Sobrien * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24191739Sobrien * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25191739Sobrien * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26191739Sobrien * SUCH DAMAGE. 27191739Sobrien */ 28191739Sobrien/* 29191739Sobrien * Encoding -- determine the character encoding of a text file. 
30191739Sobrien * 31191739Sobrien * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit 32191739Sobrien * international characters. 33191739Sobrien */ 34191739Sobrien 35191739Sobrien#include "file.h" 36191739Sobrien 37191739Sobrien#ifndef lint 38284778SdelphijFILE_RCSID("@(#)$File: encoding.c,v 1.13 2015/06/04 19:16:28 christos Exp $") 39191739Sobrien#endif /* lint */ 40191739Sobrien 41191739Sobrien#include "magic.h" 42191739Sobrien#include <string.h> 43191739Sobrien#include <memory.h> 44191739Sobrien#include <stdlib.h> 45191739Sobrien 46191739Sobrien 47191739Sobrienprivate int looks_ascii(const unsigned char *, size_t, unichar *, size_t *); 48191739Sobrienprivate int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *, 49191739Sobrien size_t *); 50284778Sdelphijprivate int looks_utf7(const unsigned char *, size_t, unichar *, size_t *); 51191739Sobrienprivate int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *); 52191739Sobrienprivate int looks_latin1(const unsigned char *, size_t, unichar *, size_t *); 53191739Sobrienprivate int looks_extended(const unsigned char *, size_t, unichar *, size_t *); 54191739Sobrienprivate void from_ebcdic(const unsigned char *, size_t, unsigned char *); 55191739Sobrien 56226048Sobrien#ifdef DEBUG_ENCODING 57226048Sobrien#define DPRINTF(a) printf a 58226048Sobrien#else 59226048Sobrien#define DPRINTF(a) 60226048Sobrien#endif 61226048Sobrien 62191739Sobrien/* 63191739Sobrien * Try to determine whether text is in some character code we can 64191739Sobrien * identify. Each of these tests, if it succeeds, will leave 65191739Sobrien * the text converted into one-unichar-per-character Unicode in 66191739Sobrien * ubuf, and the number of characters converted in ulen. 
67191739Sobrien */ 68191739Sobrienprotected int 69191739Sobrienfile_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, unichar **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type) 70191739Sobrien{ 71191739Sobrien size_t mlen; 72191739Sobrien int rv = 1, ucs_type; 73191739Sobrien unsigned char *nbuf = NULL; 74191739Sobrien 75234250Sobrien *type = "text"; 76267843Sdelphij *ulen = 0; 77267843Sdelphij *code = "unknown"; 78267843Sdelphij *code_mime = "binary"; 79267843Sdelphij 80267843Sdelphij mlen = (nbytes + 1) * sizeof((*ubuf)[0]); 81267843Sdelphij if ((*ubuf = CAST(unichar *, calloc((size_t)1, mlen))) == NULL) { 82191739Sobrien file_oomem(ms, mlen); 83191739Sobrien goto done; 84191739Sobrien } 85267843Sdelphij mlen = (nbytes + 1) * sizeof(nbuf[0]); 86267843Sdelphij if ((nbuf = CAST(unsigned char *, calloc((size_t)1, mlen))) == NULL) { 87191739Sobrien file_oomem(ms, mlen); 88191739Sobrien goto done; 89191739Sobrien } 90191739Sobrien 91191739Sobrien if (looks_ascii(buf, nbytes, *ubuf, ulen)) { 92284778Sdelphij if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) { 93284778Sdelphij DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen)); 94284778Sdelphij *code = "UTF-7 Unicode"; 95284778Sdelphij *code_mime = "utf-7"; 96284778Sdelphij } else { 97284778Sdelphij DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen)); 98284778Sdelphij *code = "ASCII"; 99284778Sdelphij *code_mime = "us-ascii"; 100284778Sdelphij } 101191739Sobrien } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) { 102226048Sobrien DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen)); 103191739Sobrien *code = "UTF-8 Unicode (with BOM)"; 104191739Sobrien *code_mime = "utf-8"; 105191739Sobrien } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) { 106226048Sobrien DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen)); 107191739Sobrien *code = "UTF-8 Unicode"; 108191739Sobrien *code_mime = "utf-8"; 109191739Sobrien } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, 
ulen)) != 0) { 110191739Sobrien if (ucs_type == 1) { 111191739Sobrien *code = "Little-endian UTF-16 Unicode"; 112191739Sobrien *code_mime = "utf-16le"; 113191739Sobrien } else { 114191739Sobrien *code = "Big-endian UTF-16 Unicode"; 115191739Sobrien *code_mime = "utf-16be"; 116191739Sobrien } 117226048Sobrien DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen)); 118191739Sobrien } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) { 119226048Sobrien DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen)); 120191739Sobrien *code = "ISO-8859"; 121191739Sobrien *code_mime = "iso-8859-1"; 122191739Sobrien } else if (looks_extended(buf, nbytes, *ubuf, ulen)) { 123226048Sobrien DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen)); 124191739Sobrien *code = "Non-ISO extended-ASCII"; 125191739Sobrien *code_mime = "unknown-8bit"; 126191739Sobrien } else { 127191739Sobrien from_ebcdic(buf, nbytes, nbuf); 128191739Sobrien 129191739Sobrien if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) { 130226048Sobrien DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen)); 131191739Sobrien *code = "EBCDIC"; 132191739Sobrien *code_mime = "ebcdic"; 133191739Sobrien } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) { 134226048Sobrien DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n", 135226048Sobrien *ulen)); 136191739Sobrien *code = "International EBCDIC"; 137191739Sobrien *code_mime = "ebcdic"; 138191739Sobrien } else { /* Doesn't look like text at all */ 139226048Sobrien DPRINTF(("binary\n")); 140191739Sobrien rv = 0; 141191739Sobrien *type = "binary"; 142191739Sobrien } 143191739Sobrien } 144191739Sobrien 145191739Sobrien done: 146234250Sobrien free(nbuf); 147191739Sobrien 148191739Sobrien return rv; 149191739Sobrien} 150191739Sobrien 151191739Sobrien/* 152191739Sobrien * This table reflects a particular philosophy about what constitutes 153191739Sobrien * "text," and there is room for disagreement about it. 
 *
 * Version 3.31 of the file command considered a file to be ASCII if
 * each of its characters was approved by either the isascii() or
 * isalpha() function.  On most systems, this would mean that any
 * file consisting only of characters in the range 0x00 ... 0x7F
 * would be called ASCII text, but many systems might reasonably
 * consider some characters outside this range to be alphabetic,
 * so the file command would call such characters ASCII.  It might
 * have been more accurate to call this "considered textual on the
 * local system" than "ASCII."
 *
 * It considered a file to be "International language text" if each
 * of its characters was either an ASCII printing character (according
 * to the real ASCII standard, not the above test), a character in
 * the range 0x80 ... 0xFF, or one of the following control characters:
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * escape.  No attempt was made to determine the language in which files
 * of this type were written.
 *
 *
 * The table below considers a file to be ASCII if all of its characters
 * are either ASCII printing characters (again, according to the X3.4
 * standard, not isascii()) or any of the following controls: bell,
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * esc, nextline.
 *
 * I include bell because some programs (particularly shell scripts)
 * use it literally, even though it is rare in normal text.  Vertical
 * tab is accepted as well, although it is rare in real text.  I also
 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
 * character to.  It might be more appropriate to include it in the 8859
 * set instead of the ASCII set, but it's got to be included in *something*
 * we recognize or EBCDIC files aren't going to be considered textual.
 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
 * and Latin characters, so these should possibly be allowed.  But they
 * make a real mess on VT100-style displays if they're not paired properly,
 * so we are probably better off not calling them text.
 *
 * A file is considered to be ISO-8859 text if its characters are all
 * either ASCII, according to the above definition, or printing characters
 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
 *
 * Finally, a file is considered to be international text from some other
 * character code if its characters are all either ISO-8859 (according to
 * the above definition) or characters in the range 0x80 ... 0x9F, which
 * ISO-8859 considers to be control characters but the IBM PC and Macintosh
 * consider to be printing characters.
201191739Sobrien */ 202191739Sobrien 203191739Sobrien#define F 0 /* character never appears in text */ 204191739Sobrien#define T 1 /* character appears in plain ASCII text */ 205191739Sobrien#define I 2 /* character appears in ISO-8859 text */ 206191739Sobrien#define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ 207191739Sobrien 208191739Sobrienprivate char text_chars[256] = { 209284778Sdelphij /* BEL BS HT LF VT FF CR */ 210284778Sdelphij F, F, F, F, F, F, F, T, T, T, T, T, T, T, F, F, /* 0x0X */ 211191739Sobrien /* ESC */ 212191739Sobrien F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ 213191739Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ 214191739Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ 215191739Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */ 216191739Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */ 217191739Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */ 218191739Sobrien T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */ 219191739Sobrien /* NEL */ 220191739Sobrien X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */ 221191739Sobrien X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */ 222191739Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */ 223191739Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */ 224191739Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */ 225191739Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */ 226191739Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */ 227191739Sobrien I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ 228191739Sobrien}; 229191739Sobrien 230191739Sobrienprivate int 231191739Sobrienlooks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf, 232191739Sobrien size_t *ulen) 233191739Sobrien{ 234191739Sobrien size_t i; 235191739Sobrien 236191739Sobrien *ulen = 0; 
237191739Sobrien 238191739Sobrien for (i = 0; i < nbytes; i++) { 239191739Sobrien int t = text_chars[buf[i]]; 240191739Sobrien 241191739Sobrien if (t != T) 242191739Sobrien return 0; 243191739Sobrien 244191739Sobrien ubuf[(*ulen)++] = buf[i]; 245191739Sobrien } 246191739Sobrien 247191739Sobrien return 1; 248191739Sobrien} 249191739Sobrien 250191739Sobrienprivate int 251191739Sobrienlooks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) 252191739Sobrien{ 253191739Sobrien size_t i; 254191739Sobrien 255191739Sobrien *ulen = 0; 256191739Sobrien 257191739Sobrien for (i = 0; i < nbytes; i++) { 258191739Sobrien int t = text_chars[buf[i]]; 259191739Sobrien 260191739Sobrien if (t != T && t != I) 261191739Sobrien return 0; 262191739Sobrien 263191739Sobrien ubuf[(*ulen)++] = buf[i]; 264191739Sobrien } 265191739Sobrien 266191739Sobrien return 1; 267191739Sobrien} 268191739Sobrien 269191739Sobrienprivate int 270191739Sobrienlooks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf, 271191739Sobrien size_t *ulen) 272191739Sobrien{ 273191739Sobrien size_t i; 274191739Sobrien 275191739Sobrien *ulen = 0; 276191739Sobrien 277191739Sobrien for (i = 0; i < nbytes; i++) { 278191739Sobrien int t = text_chars[buf[i]]; 279191739Sobrien 280191739Sobrien if (t != T && t != I && t != X) 281191739Sobrien return 0; 282191739Sobrien 283191739Sobrien ubuf[(*ulen)++] = buf[i]; 284191739Sobrien } 285191739Sobrien 286191739Sobrien return 1; 287191739Sobrien} 288191739Sobrien 289191739Sobrien/* 290191739Sobrien * Decide whether some text looks like UTF-8. Returns: 291191739Sobrien * 292191739Sobrien * -1: invalid UTF-8 293191739Sobrien * 0: uses odd control characters, so doesn't look like text 294191739Sobrien * 1: 7-bit text 295191739Sobrien * 2: definitely UTF-8 text (valid high-bit set bytes) 296191739Sobrien * 297191739Sobrien * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen; 298191739Sobrien * ubuf must be big enough! 
299191739Sobrien */ 300191739Sobrienprotected int 301191739Sobrienfile_looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) 302191739Sobrien{ 303191739Sobrien size_t i; 304191739Sobrien int n; 305191739Sobrien unichar c; 306191739Sobrien int gotone = 0, ctrl = 0; 307191739Sobrien 308191739Sobrien if (ubuf) 309191739Sobrien *ulen = 0; 310191739Sobrien 311191739Sobrien for (i = 0; i < nbytes; i++) { 312191739Sobrien if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ 313191739Sobrien /* 314191739Sobrien * Even if the whole file is valid UTF-8 sequences, 315191739Sobrien * still reject it if it uses weird control characters. 316191739Sobrien */ 317191739Sobrien 318191739Sobrien if (text_chars[buf[i]] != T) 319191739Sobrien ctrl = 1; 320191739Sobrien 321191739Sobrien if (ubuf) 322191739Sobrien ubuf[(*ulen)++] = buf[i]; 323191739Sobrien } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */ 324191739Sobrien return -1; 325191739Sobrien } else { /* 11xxxxxx begins UTF-8 */ 326191739Sobrien int following; 327191739Sobrien 328191739Sobrien if ((buf[i] & 0x20) == 0) { /* 110xxxxx */ 329191739Sobrien c = buf[i] & 0x1f; 330191739Sobrien following = 1; 331191739Sobrien } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */ 332191739Sobrien c = buf[i] & 0x0f; 333191739Sobrien following = 2; 334191739Sobrien } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */ 335191739Sobrien c = buf[i] & 0x07; 336191739Sobrien following = 3; 337191739Sobrien } else if ((buf[i] & 0x04) == 0) { /* 111110xx */ 338191739Sobrien c = buf[i] & 0x03; 339191739Sobrien following = 4; 340191739Sobrien } else if ((buf[i] & 0x02) == 0) { /* 1111110x */ 341191739Sobrien c = buf[i] & 0x01; 342191739Sobrien following = 5; 343191739Sobrien } else 344191739Sobrien return -1; 345191739Sobrien 346191739Sobrien for (n = 0; n < following; n++) { 347191739Sobrien i++; 348191739Sobrien if (i >= nbytes) 349191739Sobrien goto done; 350191739Sobrien 351191739Sobrien if ((buf[i] & 0x80) 
== 0 || (buf[i] & 0x40)) 352191739Sobrien return -1; 353191739Sobrien 354191739Sobrien c = (c << 6) + (buf[i] & 0x3f); 355191739Sobrien } 356191739Sobrien 357191739Sobrien if (ubuf) 358191739Sobrien ubuf[(*ulen)++] = c; 359191739Sobrien gotone = 1; 360191739Sobrien } 361191739Sobrien } 362191739Sobriendone: 363191739Sobrien return ctrl ? 0 : (gotone ? 2 : 1); 364191739Sobrien} 365191739Sobrien 366191739Sobrien/* 367191739Sobrien * Decide whether some text looks like UTF-8 with BOM. If there is no 368191739Sobrien * BOM, return -1; otherwise return the result of looks_utf8 on the 369191739Sobrien * rest of the text. 370191739Sobrien */ 371191739Sobrienprivate int 372191739Sobrienlooks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf, 373191739Sobrien size_t *ulen) 374191739Sobrien{ 375191739Sobrien if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf) 376191739Sobrien return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen); 377191739Sobrien else 378191739Sobrien return -1; 379191739Sobrien} 380191739Sobrien 381191739Sobrienprivate int 382284778Sdelphijlooks_utf7(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) 383284778Sdelphij{ 384284778Sdelphij if (nbytes > 4 && buf[0] == '+' && buf[1] == '/' && buf[2] == 'v') 385284778Sdelphij switch (buf[3]) { 386284778Sdelphij case '8': 387284778Sdelphij case '9': 388284778Sdelphij case '+': 389284778Sdelphij case '/': 390284778Sdelphij if (ubuf) 391284778Sdelphij *ulen = 0; 392284778Sdelphij return 1; 393284778Sdelphij default: 394284778Sdelphij return -1; 395284778Sdelphij } 396284778Sdelphij else 397284778Sdelphij return -1; 398284778Sdelphij} 399284778Sdelphij 400284778Sdelphijprivate int 401191739Sobrienlooks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf, 402191739Sobrien size_t *ulen) 403191739Sobrien{ 404191739Sobrien int bigend; 405191739Sobrien size_t i; 406191739Sobrien 407191739Sobrien if (nbytes < 2) 408191739Sobrien return 0; 409191739Sobrien 
410191739Sobrien if (buf[0] == 0xff && buf[1] == 0xfe) 411191739Sobrien bigend = 0; 412191739Sobrien else if (buf[0] == 0xfe && buf[1] == 0xff) 413191739Sobrien bigend = 1; 414191739Sobrien else 415191739Sobrien return 0; 416191739Sobrien 417191739Sobrien *ulen = 0; 418191739Sobrien 419191739Sobrien for (i = 2; i + 1 < nbytes; i += 2) { 420191739Sobrien /* XXX fix to properly handle chars > 65536 */ 421191739Sobrien 422191739Sobrien if (bigend) 423191739Sobrien ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i]; 424191739Sobrien else 425191739Sobrien ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1]; 426191739Sobrien 427191739Sobrien if (ubuf[*ulen - 1] == 0xfffe) 428191739Sobrien return 0; 429191739Sobrien if (ubuf[*ulen - 1] < 128 && 430191739Sobrien text_chars[(size_t)ubuf[*ulen - 1]] != T) 431191739Sobrien return 0; 432191739Sobrien } 433191739Sobrien 434191739Sobrien return 1 + bigend; 435191739Sobrien} 436191739Sobrien 437191739Sobrien#undef F 438191739Sobrien#undef T 439191739Sobrien#undef I 440191739Sobrien#undef X 441191739Sobrien 442191739Sobrien/* 443191739Sobrien * This table maps each EBCDIC character to an (8-bit extended) ASCII 444191739Sobrien * character, as specified in the rationale for the dd(1) command in 445191739Sobrien * draft 11.2 (September, 1991) of the POSIX P1003.2 standard. 446191739Sobrien * 447191739Sobrien * Unfortunately it does not seem to correspond exactly to any of the 448191739Sobrien * five variants of EBCDIC documented in IBM's _Enterprise Systems 449191739Sobrien * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh 450191739Sobrien * Edition, July, 1999, pp. I-1 - I-4. 451191739Sobrien * 452191739Sobrien * Fortunately, though, all versions of EBCDIC, including this one, agree 453191739Sobrien * on most of the printing characters that also appear in (7-bit) ASCII. 454191739Sobrien * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all. 
455191739Sobrien * 456191739Sobrien * Fortunately too, there is general agreement that codes 0x00 through 457191739Sobrien * 0x3F represent control characters, 0x41 a nonbreaking space, and the 458191739Sobrien * remainder printing characters. 459191739Sobrien * 460191739Sobrien * This is sufficient to allow us to identify EBCDIC text and to distinguish 461191739Sobrien * between old-style and internationalized examples of text. 462191739Sobrien */ 463191739Sobrien 464191739Sobrienprivate unsigned char ebcdic_to_ascii[] = { 465191739Sobrien 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 466191739Sobrien 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, 467191739Sobrien128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, 468191739Sobrien144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26, 469191739Sobrien' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|', 470191739Sobrien'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~', 471191739Sobrien'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?', 472191739Sobrien186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"', 473191739Sobrien195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201, 474191739Sobrien202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208, 475191739Sobrien209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215, 476191739Sobrien216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231, 477191739Sobrien'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237, 478191739Sobrien'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243, 479191739Sobrien'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249, 480191739Sobrien'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 
250, 251, 252, 253, 254, 255 481191739Sobrien}; 482191739Sobrien 483191739Sobrien#ifdef notdef 484191739Sobrien/* 485191739Sobrien * The following EBCDIC-to-ASCII table may relate more closely to reality, 486191739Sobrien * or at least to modern reality. It comes from 487191739Sobrien * 488191739Sobrien * http://ftp.s390.ibm.com/products/oe/bpxqp9.html 489191739Sobrien * 490191739Sobrien * and maps the characters of EBCDIC code page 1047 (the code used for 491191739Sobrien * Unix-derived software on IBM's 390 systems) to the corresponding 492191739Sobrien * characters from ISO 8859-1. 493191739Sobrien * 494191739Sobrien * If this table is used instead of the above one, some of the special 495191739Sobrien * cases for the NEL character can be taken out of the code. 496191739Sobrien */ 497191739Sobrien 498191739Sobrienprivate unsigned char ebcdic_1047_to_8859[] = { 499191739Sobrien0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F, 500191739Sobrien0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F, 501191739Sobrien0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07, 502191739Sobrien0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A, 503191739Sobrien0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C, 504191739Sobrien0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E, 505191739Sobrien0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F, 506191739Sobrien0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22, 507191739Sobrien0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1, 508191739Sobrien0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4, 509191739Sobrien0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE, 
510191739Sobrien0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7, 511191739Sobrien0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5, 512191739Sobrien0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF, 513191739Sobrien0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5, 514191739Sobrien0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F 515191739Sobrien}; 516191739Sobrien#endif 517191739Sobrien 518191739Sobrien/* 519191739Sobrien * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII. 520191739Sobrien */ 521191739Sobrienprivate void 522191739Sobrienfrom_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out) 523191739Sobrien{ 524191739Sobrien size_t i; 525191739Sobrien 526191739Sobrien for (i = 0; i < nbytes; i++) { 527191739Sobrien out[i] = ebcdic_to_ascii[buf[i]]; 528191739Sobrien } 529191739Sobrien} 530