1284194Sdelphij/* 2284194Sdelphij * Copyright (c) Ian F. Darwin 1986-1995. 3284194Sdelphij * Software written by Ian F. Darwin and others; 4284194Sdelphij * maintained 1995-present by Christos Zoulas and others. 5284194Sdelphij * 6284194Sdelphij * Redistribution and use in source and binary forms, with or without 7284194Sdelphij * modification, are permitted provided that the following conditions 8284194Sdelphij * are met: 9284194Sdelphij * 1. Redistributions of source code must retain the above copyright 10284194Sdelphij * notice immediately at the beginning of the file, without modification, 11284194Sdelphij * this list of conditions, and the following disclaimer. 12284194Sdelphij * 2. Redistributions in binary form must reproduce the above copyright 13284194Sdelphij * notice, this list of conditions and the following disclaimer in the 14284194Sdelphij * documentation and/or other materials provided with the distribution. 15284194Sdelphij * 16284194Sdelphij * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17284194Sdelphij * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18284194Sdelphij * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19284194Sdelphij * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR 20284194Sdelphij * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21284194Sdelphij * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22284194Sdelphij * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23284194Sdelphij * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24284194Sdelphij * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25284194Sdelphij * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26284194Sdelphij * SUCH DAMAGE. 27284194Sdelphij */ 28284194Sdelphij/* 29284194Sdelphij * ASCII magic -- try to detect text encoding. 30284194Sdelphij * 31284194Sdelphij * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000, 32284194Sdelphij * to handle character codes other than ASCII on a unified basis. 33284194Sdelphij */ 34284194Sdelphij 35284194Sdelphij#include "file.h" 36284194Sdelphij 37284194Sdelphij#ifndef lint 38284194SdelphijFILE_RCSID("@(#)$File: ascmagic.c,v 1.91 2014/11/28 02:46:39 christos Exp $") 39284194Sdelphij#endif /* lint */ 40284194Sdelphij 41284194Sdelphij#include "magic.h" 42284194Sdelphij#include <string.h> 43284194Sdelphij#include <memory.h> 44284194Sdelphij#include <ctype.h> 45284194Sdelphij#include <stdlib.h> 46284194Sdelphij#ifdef HAVE_UNISTD_H 47284194Sdelphij#include <unistd.h> 48284194Sdelphij#endif 49284194Sdelphij 50284194Sdelphij#define MAXLINELEN 300 /* longest sane line length */ 51284194Sdelphij#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \ 52284194Sdelphij || (x) == 0x85 || (x) == '\f') 53284194Sdelphij 54284194Sdelphijprivate unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t); 55284194Sdelphijprivate size_t trim_nuls(const unsigned char *, size_t); 56284194Sdelphij 57284194Sdelphij/* 58284194Sdelphij * Undo the NUL-termination kindly provided by process() 59284194Sdelphij * but leave at least one byte to look at 60284194Sdelphij */ 61284194Sdelphijprivate size_t 62284194Sdelphijtrim_nuls(const unsigned char *buf, size_t nbytes) 63284194Sdelphij{ 64284194Sdelphij while (nbytes > 1 && buf[nbytes - 1] == '\0') 65284194Sdelphij nbytes--; 66284194Sdelphij 67284194Sdelphij return nbytes; 68284194Sdelphij} 69284194Sdelphij 70284194Sdelphijprotected int 71284194Sdelphijfile_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes, 72284194Sdelphij int text) 73284194Sdelphij{ 74284194Sdelphij unichar *ubuf = NULL; 75284194Sdelphij size_t ulen = 0; 76284194Sdelphij int rv = 1; 77284194Sdelphij 78284194Sdelphij const char *code = NULL; 79284194Sdelphij const char *code_mime = NULL; 80284194Sdelphij const char *type = NULL; 81284194Sdelphij 82284194Sdelphij if (ms->flags & MAGIC_APPLE) 83284194Sdelphij return 0; 84284194Sdelphij 85284194Sdelphij nbytes = trim_nuls(buf, nbytes); 86284194Sdelphij 87284194Sdelphij /* If file doesn't look like any sort of text, give up. */ 88284194Sdelphij if (file_encoding(ms, buf, nbytes, &ubuf, &ulen, &code, &code_mime, 89284194Sdelphij &type) == 0) 90284194Sdelphij rv = 0; 91284194Sdelphij else 92284194Sdelphij rv = file_ascmagic_with_encoding(ms, buf, nbytes, ubuf, ulen, code, 93284194Sdelphij type, text); 94284194Sdelphij 95284194Sdelphij free(ubuf); 96284194Sdelphij 97284194Sdelphij return rv; 98284194Sdelphij} 99284194Sdelphij 100284194Sdelphijprotected int 101284194Sdelphijfile_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf, 102284194Sdelphij size_t nbytes, unichar *ubuf, size_t ulen, const char *code, 103284194Sdelphij const char *type, int text) 104284194Sdelphij{ 105284194Sdelphij unsigned char *utf8_buf = NULL, *utf8_end; 106284194Sdelphij size_t mlen, i; 107284194Sdelphij int rv = -1; 108284194Sdelphij int mime = ms->flags & MAGIC_MIME; 109284194Sdelphij 110284194Sdelphij const char *subtype = NULL; 111284194Sdelphij const char *subtype_mime = NULL; 112284194Sdelphij 113284194Sdelphij int has_escapes = 0; 114284194Sdelphij int has_backspace = 0; 115284194Sdelphij int seen_cr = 0; 116284194Sdelphij 117284194Sdelphij int n_crlf = 0; 118284194Sdelphij int n_lf = 0; 119284194Sdelphij int n_cr = 0; 120284194Sdelphij int n_nel = 0; 121284194Sdelphij int executable = 0; 122284194Sdelphij 123284194Sdelphij size_t last_line_end = (size_t)-1; 124284194Sdelphij int has_long_lines = 0; 125284194Sdelphij 126284194Sdelphij if (ms->flags & MAGIC_APPLE) 127284194Sdelphij return 0; 128284194Sdelphij 129284194Sdelphij nbytes = trim_nuls(buf, nbytes); 130284194Sdelphij 131284194Sdelphij /* If we have fewer than 2 bytes, give up. */ 132284194Sdelphij if (nbytes <= 1) { 133284194Sdelphij rv = 0; 134284194Sdelphij goto done; 135284194Sdelphij } 136284194Sdelphij 137284194Sdelphij if (ulen > 0 && (ms->flags & MAGIC_NO_CHECK_SOFT) == 0) { 138284194Sdelphij /* Convert ubuf to UTF-8 and try text soft magic */ 139284194Sdelphij /* malloc size is a conservative overestimate; could be 140284194Sdelphij improved, or at least realloced after conversion. */ 141284194Sdelphij mlen = ulen * 6; 142284194Sdelphij if ((utf8_buf = CAST(unsigned char *, malloc(mlen))) == NULL) { 143284194Sdelphij file_oomem(ms, mlen); 144284194Sdelphij goto done; 145284194Sdelphij } 146284194Sdelphij if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen)) 147284194Sdelphij == NULL) 148284194Sdelphij goto done; 149284194Sdelphij if ((rv = file_softmagic(ms, utf8_buf, 150284194Sdelphij (size_t)(utf8_end - utf8_buf), 0, NULL, 151284194Sdelphij TEXTTEST, text)) == 0) 152284194Sdelphij rv = -1; 153284194Sdelphij } 154284194Sdelphij 155284194Sdelphij /* Now try to discover other details about the file. */ 156284194Sdelphij for (i = 0; i < ulen; i++) { 157284194Sdelphij if (ubuf[i] == '\n') { 158284194Sdelphij if (seen_cr) 159284194Sdelphij n_crlf++; 160284194Sdelphij else 161284194Sdelphij n_lf++; 162284194Sdelphij last_line_end = i; 163284194Sdelphij } else if (seen_cr) 164284194Sdelphij n_cr++; 165284194Sdelphij 166284194Sdelphij seen_cr = (ubuf[i] == '\r'); 167284194Sdelphij if (seen_cr) 168284194Sdelphij last_line_end = i; 169284194Sdelphij 170284194Sdelphij if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */ 171284194Sdelphij n_nel++; 172284194Sdelphij last_line_end = i; 173284194Sdelphij } 174284194Sdelphij 175284194Sdelphij /* If this line is _longer_ than MAXLINELEN, remember it. */ 176284194Sdelphij if (i > last_line_end + MAXLINELEN) 177284194Sdelphij has_long_lines = 1; 178284194Sdelphij 179284194Sdelphij if (ubuf[i] == '\033') 180284194Sdelphij has_escapes = 1; 181284194Sdelphij if (ubuf[i] == '\b') 182284194Sdelphij has_backspace = 1; 183284194Sdelphij } 184284194Sdelphij 185284194Sdelphij /* Beware, if the data has been truncated, the final CR could have 186284194Sdelphij been followed by a LF. If we have HOWMANY bytes, it indicates 187284194Sdelphij that the data might have been truncated, probably even before 188284194Sdelphij this function was called. */ 189284194Sdelphij if (seen_cr && nbytes < HOWMANY) 190284194Sdelphij n_cr++; 191284194Sdelphij 192284194Sdelphij if (strcmp(type, "binary") == 0) { 193284194Sdelphij rv = 0; 194284194Sdelphij goto done; 195284194Sdelphij } 196284194Sdelphij if (mime) { 197284194Sdelphij if (!file_printedlen(ms) && (mime & MAGIC_MIME_TYPE) != 0) { 198284194Sdelphij if (subtype_mime) { 199284194Sdelphij if (file_printf(ms, "%s", subtype_mime) == -1) 200284194Sdelphij goto done; 201284194Sdelphij } else { 202284194Sdelphij if (file_printf(ms, "text/plain") == -1) 203284194Sdelphij goto done; 204284194Sdelphij } 205284194Sdelphij } 206284194Sdelphij } else { 207284194Sdelphij if (file_printedlen(ms)) { 208284194Sdelphij switch (file_replace(ms, " text$", ", ")) { 209284194Sdelphij case 0: 210284194Sdelphij switch (file_replace(ms, " text executable$", 211284194Sdelphij ", ")) { 212284194Sdelphij case 0: 213284194Sdelphij if (file_printf(ms, ", ") == -1) 214284194Sdelphij goto done; 215284194Sdelphij break; 216284194Sdelphij case -1: 217284194Sdelphij goto done; 218284194Sdelphij default: 219284194Sdelphij executable = 1; 220284194Sdelphij break; 221284194Sdelphij } 222284194Sdelphij break; 223284194Sdelphij case -1: 224284194Sdelphij goto done; 225284194Sdelphij default: 226284194Sdelphij break; 227284194Sdelphij } 228284194Sdelphij } 229284194Sdelphij 230284194Sdelphij if (file_printf(ms, "%s", code) == -1) 231284194Sdelphij goto done; 232284194Sdelphij 233284194Sdelphij if (subtype) { 234284194Sdelphij if (file_printf(ms, " %s", subtype) == -1) 235284194Sdelphij goto done; 236284194Sdelphij } 237284194Sdelphij 238284194Sdelphij if (file_printf(ms, " %s", type) == -1) 239284194Sdelphij goto done; 240284194Sdelphij 241284194Sdelphij if (executable) 242284194Sdelphij if (file_printf(ms, " executable") == -1) 243284194Sdelphij goto done; 244284194Sdelphij 245284194Sdelphij if (has_long_lines) 246284194Sdelphij if (file_printf(ms, ", with very long lines") == -1) 247284194Sdelphij goto done; 248284194Sdelphij 249284194Sdelphij /* 250284194Sdelphij * Only report line terminators if we find one other than LF, 251284194Sdelphij * or if we find none at all. 252284194Sdelphij */ 253284194Sdelphij if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) || 254284194Sdelphij (n_crlf != 0 || n_cr != 0 || n_nel != 0)) { 255284194Sdelphij if (file_printf(ms, ", with") == -1) 256284194Sdelphij goto done; 257284194Sdelphij 258284194Sdelphij if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) { 259284194Sdelphij if (file_printf(ms, " no") == -1) 260284194Sdelphij goto done; 261284194Sdelphij } else { 262284194Sdelphij if (n_crlf) { 263284194Sdelphij if (file_printf(ms, " CRLF") == -1) 264284194Sdelphij goto done; 265284194Sdelphij if (n_cr || n_lf || n_nel) 266284194Sdelphij if (file_printf(ms, ",") == -1) 267284194Sdelphij goto done; 268284194Sdelphij } 269284194Sdelphij if (n_cr) { 270284194Sdelphij if (file_printf(ms, " CR") == -1) 271284194Sdelphij goto done; 272284194Sdelphij if (n_lf || n_nel) 273284194Sdelphij if (file_printf(ms, ",") == -1) 274284194Sdelphij goto done; 275284194Sdelphij } 276284194Sdelphij if (n_lf) { 277284194Sdelphij if (file_printf(ms, " LF") == -1) 278284194Sdelphij goto done; 279284194Sdelphij if (n_nel) 280284194Sdelphij if (file_printf(ms, ",") == -1) 281284194Sdelphij goto done; 282284194Sdelphij } 283284194Sdelphij if (n_nel) 284284194Sdelphij if (file_printf(ms, " NEL") == -1) 285284194Sdelphij goto done; 286284194Sdelphij } 287284194Sdelphij 288284194Sdelphij if (file_printf(ms, " line terminators") == -1) 289284194Sdelphij goto done; 290284194Sdelphij } 291284194Sdelphij 292284194Sdelphij if (has_escapes) 293284194Sdelphij if (file_printf(ms, ", with escape sequences") == -1) 294284194Sdelphij goto done; 295284194Sdelphij if (has_backspace) 296284194Sdelphij if (file_printf(ms, ", with overstriking") == -1) 297284194Sdelphij goto done; 298284194Sdelphij } 299284194Sdelphij rv = 1; 300284194Sdelphijdone: 301284194Sdelphij free(utf8_buf); 302284194Sdelphij 303284194Sdelphij return rv; 304284194Sdelphij} 305284194Sdelphij 306284194Sdelphij/* 307284194Sdelphij * Encode Unicode string as UTF-8, returning pointer to character 308284194Sdelphij * after end of string, or NULL if an invalid character is found. 309284194Sdelphij */ 310284194Sdelphijprivate unsigned char * 311284194Sdelphijencode_utf8(unsigned char *buf, size_t len, unichar *ubuf, size_t ulen) 312284194Sdelphij{ 313284194Sdelphij size_t i; 314284194Sdelphij unsigned char *end = buf + len; 315284194Sdelphij 316284194Sdelphij for (i = 0; i < ulen; i++) { 317284194Sdelphij if (ubuf[i] <= 0x7f) { 318284194Sdelphij if (end - buf < 1) 319284194Sdelphij return NULL; 320284194Sdelphij *buf++ = (unsigned char)ubuf[i]; 321284194Sdelphij } else if (ubuf[i] <= 0x7ff) { 322284194Sdelphij if (end - buf < 2) 323284194Sdelphij return NULL; 324284194Sdelphij *buf++ = (unsigned char)((ubuf[i] >> 6) + 0xc0); 325284194Sdelphij *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); 326284194Sdelphij } else if (ubuf[i] <= 0xffff) { 327284194Sdelphij if (end - buf < 3) 328284194Sdelphij return NULL; 329284194Sdelphij *buf++ = (unsigned char)((ubuf[i] >> 12) + 0xe0); 330284194Sdelphij *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80); 331284194Sdelphij *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); 332284194Sdelphij } else if (ubuf[i] <= 0x1fffff) { 333284194Sdelphij if (end - buf < 4) 334284194Sdelphij return NULL; 335284194Sdelphij *buf++ = (unsigned char)((ubuf[i] >> 18) + 0xf0); 336284194Sdelphij *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80); 337284194Sdelphij *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80); 338284194Sdelphij *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); 339284194Sdelphij } else if (ubuf[i] <= 0x3ffffff) { 340284194Sdelphij if (end - buf < 5) 341284194Sdelphij return NULL; 342284194Sdelphij *buf++ = (unsigned char)((ubuf[i] >> 24) + 0xf8); 343284194Sdelphij *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80); 344284194Sdelphij *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80); 345284194Sdelphij *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80); 346284194Sdelphij *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); 347284194Sdelphij } else if (ubuf[i] <= 0x7fffffff) { 348284194Sdelphij if (end - buf < 6) 349284194Sdelphij return NULL; 350284194Sdelphij *buf++ = (unsigned char)((ubuf[i] >> 30) + 0xfc); 351284194Sdelphij *buf++ = (unsigned char)(((ubuf[i] >> 24) & 0x3f) + 0x80); 352284194Sdelphij *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80); 353284194Sdelphij *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80); 354284194Sdelphij *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80); 355284194Sdelphij *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); 356284194Sdelphij } else /* Invalid character */ 357284194Sdelphij return NULL; 358284194Sdelphij } 359284194Sdelphij 360284194Sdelphij return buf; 361284194Sdelphij} 362