168349Sobrien/*
2133359Sobrien * Copyright (c) Ian F. Darwin 1986-1995.
3133359Sobrien * Software written by Ian F. Darwin and others;
4133359Sobrien * maintained 1995-present by Christos Zoulas and others.
5191736Sobrien *
6133359Sobrien * Redistribution and use in source and binary forms, with or without
7133359Sobrien * modification, are permitted provided that the following conditions
8133359Sobrien * are met:
9133359Sobrien * 1. Redistributions of source code must retain the above copyright
10133359Sobrien *    notice immediately at the beginning of the file, without modification,
11133359Sobrien *    this list of conditions, and the following disclaimer.
12133359Sobrien * 2. Redistributions in binary form must reproduce the above copyright
13133359Sobrien *    notice, this list of conditions and the following disclaimer in the
14133359Sobrien *    documentation and/or other materials provided with the distribution.
15191736Sobrien *
16133359Sobrien * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17133359Sobrien * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18133359Sobrien * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19133359Sobrien * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
20133359Sobrien * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21133359Sobrien * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22133359Sobrien * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23133359Sobrien * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24133359Sobrien * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25133359Sobrien * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26133359Sobrien * SUCH DAMAGE.
27133359Sobrien */
28133359Sobrien/*
29226048Sobrien * ASCII magic -- try to detect text encoding.
3068349Sobrien *
3168349Sobrien * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
3268349Sobrien * to handle character codes other than ASCII on a unified basis.
3368349Sobrien */
3468349Sobrien
3568349Sobrien#include "file.h"
36191736Sobrien
37191736Sobrien#ifndef	lint
38330569SgordonFILE_RCSID("@(#)$File: ascmagic.c,v 1.97 2016/06/27 20:56:25 christos Exp $")
39191736Sobrien#endif	/* lint */
40191736Sobrien
41133359Sobrien#include "magic.h"
4268349Sobrien#include <string.h>
4368349Sobrien#include <memory.h>
4468349Sobrien#include <ctype.h>
4568349Sobrien#include <stdlib.h>
4668349Sobrien#ifdef HAVE_UNISTD_H
4768349Sobrien#include <unistd.h>
4868349Sobrien#endif
4968349Sobrien
5068349Sobrien#define MAXLINELEN 300	/* longest sane line length */
5168349Sobrien#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
5268349Sobrien		  || (x) == 0x85 || (x) == '\f')
5368349Sobrien
54186690Sobrienprivate unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t);
55191736Sobrienprivate size_t trim_nuls(const unsigned char *, size_t);
5668349Sobrien
/*
 * Strip trailing NUL bytes from the logical length of buf.
 *
 * Walks backwards from the end of the buffer and stops at the first
 * non-NUL byte, or when only a single byte would remain, so a length
 * of one or zero is returned unchanged.  The buffer itself is never
 * modified; only the reduced byte count is returned.
 */
private size_t
trim_nuls(const unsigned char *buf, size_t nbytes)
{
	size_t keep;

	for (keep = nbytes; keep > 1; keep--) {
		if (buf[keep - 1] != '\0')
			break;
	}

	return keep;
}
69191736Sobrien
70133359Sobrienprotected int
71234250Sobrienfile_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes,
72234250Sobrien	int text)
7368349Sobrien{
74191736Sobrien	unichar *ubuf = NULL;
75267843Sdelphij	size_t ulen = 0;
76191736Sobrien	int rv = 1;
77191736Sobrien
78191736Sobrien	const char *code = NULL;
79191736Sobrien	const char *code_mime = NULL;
80191736Sobrien	const char *type = NULL;
81191736Sobrien
82191736Sobrien	nbytes = trim_nuls(buf, nbytes);
83191736Sobrien
84191736Sobrien	/* If file doesn't look like any sort of text, give up. */
85191736Sobrien	if (file_encoding(ms, buf, nbytes, &ubuf, &ulen, &code, &code_mime,
86234250Sobrien	    &type) == 0)
87191736Sobrien		rv = 0;
88234250Sobrien        else
89234250Sobrien		rv = file_ascmagic_with_encoding(ms, buf, nbytes, ubuf, ulen, code,
90234250Sobrien						 type, text);
91191736Sobrien
92234250Sobrien	free(ubuf);
93191736Sobrien
94191736Sobrien	return rv;
95191736Sobrien}
96191736Sobrien
97191736Sobrienprotected int
98191736Sobrienfile_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
99191736Sobrien    size_t nbytes, unichar *ubuf, size_t ulen, const char *code,
100234250Sobrien    const char *type, int text)
101191736Sobrien{
102191736Sobrien	unsigned char *utf8_buf = NULL, *utf8_end;
103191736Sobrien	size_t mlen, i;
104159764Sobrien	int rv = -1;
105175296Sobrien	int mime = ms->flags & MAGIC_MIME;
10668349Sobrien
107133359Sobrien	const char *subtype = NULL;
108133359Sobrien	const char *subtype_mime = NULL;
10968349Sobrien
11068349Sobrien	int has_escapes = 0;
11168349Sobrien	int has_backspace = 0;
112159764Sobrien	int seen_cr = 0;
11368349Sobrien
11468349Sobrien	int n_crlf = 0;
11568349Sobrien	int n_lf = 0;
11668349Sobrien	int n_cr = 0;
11768349Sobrien	int n_nel = 0;
118234250Sobrien	int executable = 0;
11968349Sobrien
120169962Sobrien	size_t last_line_end = (size_t)-1;
12168349Sobrien	int has_long_lines = 0;
12268349Sobrien
123191736Sobrien	nbytes = trim_nuls(buf, nbytes);
124133359Sobrien
125191736Sobrien	/* If we have fewer than 2 bytes, give up. */
126159764Sobrien	if (nbytes <= 1) {
127159764Sobrien		rv = 0;
128159764Sobrien		goto done;
129159764Sobrien	}
130159764Sobrien
131267843Sdelphij	if (ulen > 0 && (ms->flags & MAGIC_NO_CHECK_SOFT) == 0) {
132226048Sobrien		/* Convert ubuf to UTF-8 and try text soft magic */
133226048Sobrien		/* malloc size is a conservative overestimate; could be
134226048Sobrien		   improved, or at least realloced after conversion. */
135226048Sobrien		mlen = ulen * 6;
136226048Sobrien		if ((utf8_buf = CAST(unsigned char *, malloc(mlen))) == NULL) {
137226048Sobrien			file_oomem(ms, mlen);
138226048Sobrien			goto done;
139226048Sobrien		}
140226048Sobrien		if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen))
141226048Sobrien		    == NULL)
142226048Sobrien			goto done;
143226048Sobrien		if ((rv = file_softmagic(ms, utf8_buf,
144330569Sgordon		    (size_t)(utf8_end - utf8_buf), NULL, NULL,
145276415Sdelphij		    TEXTTEST, text)) == 0)
146226048Sobrien			rv = -1;
147330569Sgordon		if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION))) {
148330569Sgordon			rv = rv == -1 ? 0 : 1;
149330569Sgordon			goto done;
150330569Sgordon		}
15168349Sobrien	}
152330569Sgordon	if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)))
153330569Sgordon		return 0;
15468349Sobrien
155186690Sobrien	/* Now try to discover other details about the file. */
15668349Sobrien	for (i = 0; i < ulen; i++) {
157159764Sobrien		if (ubuf[i] == '\n') {
158159764Sobrien			if (seen_cr)
159159764Sobrien				n_crlf++;
160159764Sobrien			else
161159764Sobrien				n_lf++;
162159764Sobrien			last_line_end = i;
163159764Sobrien		} else if (seen_cr)
164159764Sobrien			n_cr++;
165159764Sobrien
166159764Sobrien		seen_cr = (ubuf[i] == '\r');
167159764Sobrien		if (seen_cr)
168159764Sobrien			last_line_end = i;
169159764Sobrien
170159764Sobrien		if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
171159764Sobrien			n_nel++;
172159764Sobrien			last_line_end = i;
173159764Sobrien		}
174159764Sobrien
175159764Sobrien		/* If this line is _longer_ than MAXLINELEN, remember it. */
17668349Sobrien		if (i > last_line_end + MAXLINELEN)
17768349Sobrien			has_long_lines = 1;
17868349Sobrien
17968349Sobrien		if (ubuf[i] == '\033')
18068349Sobrien			has_escapes = 1;
18168349Sobrien		if (ubuf[i] == '\b')
18268349Sobrien			has_backspace = 1;
18368349Sobrien	}
18468349Sobrien
185159764Sobrien	/* Beware, if the data has been truncated, the final CR could have
186330569Sgordon	   been followed by a LF.  If we have ms->bytes_max bytes, it indicates
187159764Sobrien	   that the data might have been truncated, probably even before
188159764Sobrien	   this function was called. */
189330569Sgordon	if (seen_cr && nbytes < ms->bytes_max)
190159764Sobrien		n_cr++;
191159764Sobrien
192191736Sobrien	if (strcmp(type, "binary") == 0) {
193191736Sobrien		rv = 0;
194191736Sobrien		goto done;
195191736Sobrien	}
196175296Sobrien	if (mime) {
197226048Sobrien		if (!file_printedlen(ms) && (mime & MAGIC_MIME_TYPE) != 0) {
198175296Sobrien			if (subtype_mime) {
199191736Sobrien				if (file_printf(ms, "%s", subtype_mime) == -1)
200175296Sobrien					goto done;
201175296Sobrien			} else {
202175296Sobrien				if (file_printf(ms, "text/plain") == -1)
203175296Sobrien					goto done;
204175296Sobrien			}
205133359Sobrien		}
20668349Sobrien	} else {
207226048Sobrien		if (file_printedlen(ms)) {
208226048Sobrien			switch (file_replace(ms, " text$", ", ")) {
209226048Sobrien			case 0:
210226048Sobrien				switch (file_replace(ms, " text executable$",
211226048Sobrien				    ", ")) {
212226048Sobrien				case 0:
213226048Sobrien					if (file_printf(ms, ", ") == -1)
214226048Sobrien						goto done;
215267843Sdelphij					break;
216226048Sobrien				case -1:
217226048Sobrien					goto done;
218226048Sobrien				default:
219226048Sobrien					executable = 1;
220226048Sobrien					break;
221226048Sobrien				}
222226048Sobrien				break;
223226048Sobrien			case -1:
224226048Sobrien				goto done;
225226048Sobrien			default:
226226048Sobrien				break;
227226048Sobrien			}
228226048Sobrien		}
229226048Sobrien
230191736Sobrien		if (file_printf(ms, "%s", code) == -1)
231159764Sobrien			goto done;
23268349Sobrien
23368349Sobrien		if (subtype) {
234191736Sobrien			if (file_printf(ms, " %s", subtype) == -1)
235159764Sobrien				goto done;
23668349Sobrien		}
23768349Sobrien
238191736Sobrien		if (file_printf(ms, " %s", type) == -1)
239159764Sobrien			goto done;
24068349Sobrien
241226048Sobrien		if (executable)
242226048Sobrien			if (file_printf(ms, " executable") == -1)
243226048Sobrien				goto done;
244226048Sobrien
24568349Sobrien		if (has_long_lines)
246133359Sobrien			if (file_printf(ms, ", with very long lines") == -1)
247159764Sobrien				goto done;
24868349Sobrien
24968349Sobrien		/*
25068349Sobrien		 * Only report line terminators if we find one other than LF,
25168349Sobrien		 * or if we find none at all.
25268349Sobrien		 */
25368349Sobrien		if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
25468349Sobrien		    (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
255133359Sobrien			if (file_printf(ms, ", with") == -1)
256159764Sobrien				goto done;
25768349Sobrien
258191736Sobrien			if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) {
259133359Sobrien				if (file_printf(ms, " no") == -1)
260159764Sobrien					goto done;
261133359Sobrien			} else {
26268349Sobrien				if (n_crlf) {
263133359Sobrien					if (file_printf(ms, " CRLF") == -1)
264159764Sobrien						goto done;
26568349Sobrien					if (n_cr || n_lf || n_nel)
266133359Sobrien						if (file_printf(ms, ",") == -1)
267159764Sobrien							goto done;
26868349Sobrien				}
26968349Sobrien				if (n_cr) {
270133359Sobrien					if (file_printf(ms, " CR") == -1)
271159764Sobrien						goto done;
27268349Sobrien					if (n_lf || n_nel)
273133359Sobrien						if (file_printf(ms, ",") == -1)
274159764Sobrien							goto done;
27568349Sobrien				}
27668349Sobrien				if (n_lf) {
277133359Sobrien					if (file_printf(ms, " LF") == -1)
278159764Sobrien						goto done;
27968349Sobrien					if (n_nel)
280133359Sobrien						if (file_printf(ms, ",") == -1)
281159764Sobrien							goto done;
28268349Sobrien				}
28368349Sobrien				if (n_nel)
284133359Sobrien					if (file_printf(ms, " NEL") == -1)
285159764Sobrien						goto done;
28668349Sobrien			}
28768349Sobrien
288133359Sobrien			if (file_printf(ms, " line terminators") == -1)
289159764Sobrien				goto done;
29068349Sobrien		}
29168349Sobrien
29268349Sobrien		if (has_escapes)
293133359Sobrien			if (file_printf(ms, ", with escape sequences") == -1)
294159764Sobrien				goto done;
29568349Sobrien		if (has_backspace)
296133359Sobrien			if (file_printf(ms, ", with overstriking") == -1)
297159764Sobrien				goto done;
29868349Sobrien	}
299159764Sobrien	rv = 1;
300159764Sobriendone:
301234250Sobrien	free(utf8_buf);
30268349Sobrien
303159764Sobrien	return rv;
30468349Sobrien}
30568349Sobrien
/*
 * Encode the ulen characters in ubuf as UTF-8 into buf, which has room
 * for len bytes.
 *
 * Each character is written as one to six bytes, using the extended
 * forms for values up to 0x7fffffff.  Returns a pointer just past the
 * last byte written, or NULL if a character is larger than 0x7fffffff
 * or if its encoding would not fit in the space that remains.
 * Characters preceding the failing one have already been written when
 * NULL is returned.
 */
private unsigned char *
encode_utf8(unsigned char *buf, size_t len, unichar *ubuf, size_t ulen)
{
	unsigned char *const limit = buf + len;
	unsigned char *out = buf;
	size_t idx;

	for (idx = 0; idx < ulen; idx++) {
		const unichar c = ubuf[idx];
		unsigned char lead;	/* marker bits of the first byte */
		int trail;		/* number of continuation bytes */

		if (c <= 0x7f) {
			lead = 0x00;
			trail = 0;
		} else if (c <= 0x7ff) {
			lead = 0xc0;
			trail = 1;
		} else if (c <= 0xffff) {
			lead = 0xe0;
			trail = 2;
		} else if (c <= 0x1fffff) {
			lead = 0xf0;
			trail = 3;
		} else if (c <= 0x3ffffff) {
			lead = 0xf8;
			trail = 4;
		} else if (c <= 0x7fffffff) {
			lead = 0xfc;
			trail = 5;
		} else {
			/* Invalid character */
			return NULL;
		}

		/* Refuse to start a sequence that cannot be completed. */
		if (limit - out < trail + 1)
			return NULL;

		/* First byte: marker plus the bits above all trail bytes. */
		*out++ = (unsigned char)((c >> (6 * trail)) + lead);

		/* Continuation bytes, six bits each, high group first. */
		while (trail-- > 0)
			*out++ = (unsigned char)(((c >> (6 * trail)) & 0x3f)
			    + 0x80);
	}

	return out;
}
362