168349Sobrien/*
2133359Sobrien * Copyright (c) Ian F. Darwin 1986-1995.
3133359Sobrien * Software written by Ian F. Darwin and others;
4133359Sobrien * maintained 1995-present by Christos Zoulas and others.
5191736Sobrien *
6133359Sobrien * Redistribution and use in source and binary forms, with or without
7133359Sobrien * modification, are permitted provided that the following conditions
8133359Sobrien * are met:
9133359Sobrien * 1. Redistributions of source code must retain the above copyright
10133359Sobrien *    notice immediately at the beginning of the file, without modification,
11133359Sobrien *    this list of conditions, and the following disclaimer.
12133359Sobrien * 2. Redistributions in binary form must reproduce the above copyright
13133359Sobrien *    notice, this list of conditions and the following disclaimer in the
14133359Sobrien *    documentation and/or other materials provided with the distribution.
15191736Sobrien *
16133359Sobrien * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17133359Sobrien * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18133359Sobrien * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19133359Sobrien * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
20133359Sobrien * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21133359Sobrien * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22133359Sobrien * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23133359Sobrien * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24133359Sobrien * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25133359Sobrien * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26133359Sobrien * SUCH DAMAGE.
27133359Sobrien */
28133359Sobrien/*
29226048Sobrien * ASCII magic -- try to detect text encoding.
3068349Sobrien *
3168349Sobrien * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
3268349Sobrien * to handle character codes other than ASCII on a unified basis.
3368349Sobrien */
3468349Sobrien
3568349Sobrien#include "file.h"
36191736Sobrien
37191736Sobrien#ifndef	lint
38362844SdelphijFILE_RCSID("@(#)$File: ascmagic.c,v 1.107 2020/06/08 19:58:36 christos Exp $")
39191736Sobrien#endif	/* lint */
40191736Sobrien
41133359Sobrien#include "magic.h"
4268349Sobrien#include <string.h>
4368349Sobrien#include <ctype.h>
4468349Sobrien#include <stdlib.h>
4568349Sobrien#ifdef HAVE_UNISTD_H
4668349Sobrien#include <unistd.h>
4768349Sobrien#endif
4868349Sobrien
/* A line with more characters than this is reported as "very long". */
#define MAXLINELEN 300
/* True for blank, tab, CR, LF, NEL (0x85) and form feed. */
#define ISSPC(x) \
	((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' || \
	 (x) == 0x85 || (x) == '\f')

private unsigned char *encode_utf8(unsigned char *buf, size_t len,
    unichar *ubuf, size_t ulen);
private size_t trim_nuls(const unsigned char *buf, size_t nbytes);
5568349Sobrien
/*
 * Return nbytes with any run of trailing NUL bytes in buf discounted.
 * This undoes the NUL-termination added by process(), but the result
 * never drops below one byte so there is always something to look at.
 */
private size_t
trim_nuls(const unsigned char *buf, size_t nbytes)
{
	size_t remaining;

	for (remaining = nbytes; remaining > 1; remaining--) {
		if (buf[remaining - 1] != '\0')
			break;
	}

	return remaining;
}
68191736Sobrien
69133359Sobrienprotected int
70337827Seadlerfile_ascmagic(struct magic_set *ms, const struct buffer *b, int text)
7168349Sobrien{
72191736Sobrien	unichar *ubuf = NULL;
73267843Sdelphij	size_t ulen = 0;
74191736Sobrien	int rv = 1;
75337827Seadler	struct buffer bb;
76191736Sobrien
77191736Sobrien	const char *code = NULL;
78191736Sobrien	const char *code_mime = NULL;
79191736Sobrien	const char *type = NULL;
80191736Sobrien
81337827Seadler	bb = *b;
82354939Sdelphij	bb.flen = trim_nuls(CAST(const unsigned char *, b->fbuf), b->flen);
83354939Sdelphij	/*
84354939Sdelphij	 * Avoid trimming at an odd byte if the original buffer was evenly
85354939Sdelphij	 * sized; this avoids losing the last character on UTF-16 LE text
86354939Sdelphij	 */
87354939Sdelphij	if ((bb.flen & 1) && !(b->flen & 1))
88354939Sdelphij		bb.flen++;
89191736Sobrien
90191736Sobrien	/* If file doesn't look like any sort of text, give up. */
91337827Seadler	if (file_encoding(ms, &bb, &ubuf, &ulen, &code, &code_mime,
92234250Sobrien	    &type) == 0)
93191736Sobrien		rv = 0;
94234250Sobrien        else
95337827Seadler		rv = file_ascmagic_with_encoding(ms, &bb,
96337827Seadler		    ubuf, ulen, code, type, text);
97191736Sobrien
98234250Sobrien	free(ubuf);
99191736Sobrien
100191736Sobrien	return rv;
101191736Sobrien}
102191736Sobrien
103191736Sobrienprotected int
104354939Sdelphijfile_ascmagic_with_encoding(struct magic_set *ms,
105337827Seadler    const struct buffer *b, unichar *ubuf, size_t ulen, const char *code,
106234250Sobrien    const char *type, int text)
107191736Sobrien{
108337827Seadler	struct buffer bb;
109354939Sdelphij	const unsigned char *buf = CAST(const unsigned char *, b->fbuf);
110337827Seadler	size_t nbytes = b->flen;
111191736Sobrien	unsigned char *utf8_buf = NULL, *utf8_end;
112354939Sdelphij	size_t mlen, i, len;
113159764Sobrien	int rv = -1;
114175296Sobrien	int mime = ms->flags & MAGIC_MIME;
115354939Sdelphij	int need_separator = 0;
11668349Sobrien
117133359Sobrien	const char *subtype = NULL;
11868349Sobrien
11968349Sobrien	int has_escapes = 0;
12068349Sobrien	int has_backspace = 0;
121159764Sobrien	int seen_cr = 0;
12268349Sobrien
12368349Sobrien	int n_crlf = 0;
12468349Sobrien	int n_lf = 0;
12568349Sobrien	int n_cr = 0;
12668349Sobrien	int n_nel = 0;
127234250Sobrien	int executable = 0;
12868349Sobrien
129354939Sdelphij	size_t last_line_end = CAST(size_t, -1);
13068349Sobrien	int has_long_lines = 0;
13168349Sobrien
132191736Sobrien	nbytes = trim_nuls(buf, nbytes);
133133359Sobrien
134191736Sobrien	/* If we have fewer than 2 bytes, give up. */
135159764Sobrien	if (nbytes <= 1) {
136159764Sobrien		rv = 0;
137159764Sobrien		goto done;
138159764Sobrien	}
139159764Sobrien
140267843Sdelphij	if (ulen > 0 && (ms->flags & MAGIC_NO_CHECK_SOFT) == 0) {
141226048Sobrien		/* Convert ubuf to UTF-8 and try text soft magic */
142226048Sobrien		/* malloc size is a conservative overestimate; could be
143226048Sobrien		   improved, or at least realloced after conversion. */
144226048Sobrien		mlen = ulen * 6;
145226048Sobrien		if ((utf8_buf = CAST(unsigned char *, malloc(mlen))) == NULL) {
146226048Sobrien			file_oomem(ms, mlen);
147226048Sobrien			goto done;
148226048Sobrien		}
149226048Sobrien		if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen))
150226048Sobrien		    == NULL)
151226048Sobrien			goto done;
152354939Sdelphij		buffer_init(&bb, b->fd, &b->st, utf8_buf,
153354939Sdelphij		    CAST(size_t, utf8_end - utf8_buf));
154337827Seadler
155337827Seadler		if ((rv = file_softmagic(ms, &bb, NULL, NULL,
156275698Sdelphij		    TEXTTEST, text)) == 0)
157226048Sobrien			rv = -1;
158354939Sdelphij		else
159354939Sdelphij			need_separator = 1;
160337827Seadler		buffer_fini(&bb);
161309847Sdelphij		if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION))) {
162309847Sdelphij			rv = rv == -1 ? 0 : 1;
163309847Sdelphij			goto done;
164309847Sdelphij		}
16568349Sobrien	}
16668349Sobrien
167362844Sdelphij	if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION))) {
168362844Sdelphij		rv = 0;
169362844Sdelphij		goto done;
170362844Sdelphij	}
171362844Sdelphij
172186690Sobrien	/* Now try to discover other details about the file. */
17368349Sobrien	for (i = 0; i < ulen; i++) {
174159764Sobrien		if (ubuf[i] == '\n') {
175159764Sobrien			if (seen_cr)
176159764Sobrien				n_crlf++;
177159764Sobrien			else
178159764Sobrien				n_lf++;
179159764Sobrien			last_line_end = i;
180159764Sobrien		} else if (seen_cr)
181159764Sobrien			n_cr++;
182159764Sobrien
183159764Sobrien		seen_cr = (ubuf[i] == '\r');
184159764Sobrien		if (seen_cr)
185159764Sobrien			last_line_end = i;
186159764Sobrien
187159764Sobrien		if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
188159764Sobrien			n_nel++;
189159764Sobrien			last_line_end = i;
190159764Sobrien		}
191159764Sobrien
192159764Sobrien		/* If this line is _longer_ than MAXLINELEN, remember it. */
19368349Sobrien		if (i > last_line_end + MAXLINELEN)
19468349Sobrien			has_long_lines = 1;
19568349Sobrien
19668349Sobrien		if (ubuf[i] == '\033')
19768349Sobrien			has_escapes = 1;
19868349Sobrien		if (ubuf[i] == '\b')
19968349Sobrien			has_backspace = 1;
20068349Sobrien	}
20168349Sobrien
202159764Sobrien	/* Beware, if the data has been truncated, the final CR could have
203298192Sdelphij	   been followed by a LF.  If we have ms->bytes_max bytes, it indicates
204159764Sobrien	   that the data might have been truncated, probably even before
205159764Sobrien	   this function was called. */
206298192Sdelphij	if (seen_cr && nbytes < ms->bytes_max)
207159764Sobrien		n_cr++;
208159764Sobrien
209191736Sobrien	if (strcmp(type, "binary") == 0) {
210191736Sobrien		rv = 0;
211191736Sobrien		goto done;
212191736Sobrien	}
213354939Sdelphij	len = file_printedlen(ms);
214175296Sobrien	if (mime) {
215354939Sdelphij		if ((mime & MAGIC_MIME_TYPE) != 0) {
216354939Sdelphij			if (len) {
217354939Sdelphij				/*
218354939Sdelphij				 * Softmagic printed something, we
219354939Sdelphij				 * are either done, or we need a separator
220354939Sdelphij				 */
221354939Sdelphij				if ((ms->flags & MAGIC_CONTINUE) == 0) {
222354939Sdelphij					rv = 1;
223354939Sdelphij					goto done;
224354939Sdelphij				}
225354939Sdelphij				if (need_separator && file_separator(ms) == -1)
226354939Sdelphij					goto done;
227175296Sobrien			} else {
228175296Sobrien				if (file_printf(ms, "text/plain") == -1)
229175296Sobrien					goto done;
230175296Sobrien			}
231133359Sobrien		}
23268349Sobrien	} else {
233354939Sdelphij		if (len) {
234226048Sobrien			switch (file_replace(ms, " text$", ", ")) {
235226048Sobrien			case 0:
236226048Sobrien				switch (file_replace(ms, " text executable$",
237226048Sobrien				    ", ")) {
238226048Sobrien				case 0:
239226048Sobrien					if (file_printf(ms, ", ") == -1)
240226048Sobrien						goto done;
241267843Sdelphij					break;
242226048Sobrien				case -1:
243226048Sobrien					goto done;
244226048Sobrien				default:
245226048Sobrien					executable = 1;
246226048Sobrien					break;
247226048Sobrien				}
248226048Sobrien				break;
249226048Sobrien			case -1:
250226048Sobrien				goto done;
251226048Sobrien			default:
252226048Sobrien				break;
253226048Sobrien			}
254226048Sobrien		}
255226048Sobrien
256191736Sobrien		if (file_printf(ms, "%s", code) == -1)
257159764Sobrien			goto done;
25868349Sobrien
25968349Sobrien		if (subtype) {
260191736Sobrien			if (file_printf(ms, " %s", subtype) == -1)
261159764Sobrien				goto done;
26268349Sobrien		}
26368349Sobrien
264191736Sobrien		if (file_printf(ms, " %s", type) == -1)
265159764Sobrien			goto done;
26668349Sobrien
267226048Sobrien		if (executable)
268226048Sobrien			if (file_printf(ms, " executable") == -1)
269226048Sobrien				goto done;
270226048Sobrien
27168349Sobrien		if (has_long_lines)
272133359Sobrien			if (file_printf(ms, ", with very long lines") == -1)
273159764Sobrien				goto done;
27468349Sobrien
27568349Sobrien		/*
27668349Sobrien		 * Only report line terminators if we find one other than LF,
27768349Sobrien		 * or if we find none at all.
27868349Sobrien		 */
27968349Sobrien		if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
28068349Sobrien		    (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
281133359Sobrien			if (file_printf(ms, ", with") == -1)
282159764Sobrien				goto done;
28368349Sobrien
284191736Sobrien			if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) {
285133359Sobrien				if (file_printf(ms, " no") == -1)
286159764Sobrien					goto done;
287133359Sobrien			} else {
28868349Sobrien				if (n_crlf) {
289133359Sobrien					if (file_printf(ms, " CRLF") == -1)
290159764Sobrien						goto done;
29168349Sobrien					if (n_cr || n_lf || n_nel)
292133359Sobrien						if (file_printf(ms, ",") == -1)
293159764Sobrien							goto done;
29468349Sobrien				}
29568349Sobrien				if (n_cr) {
296133359Sobrien					if (file_printf(ms, " CR") == -1)
297159764Sobrien						goto done;
29868349Sobrien					if (n_lf || n_nel)
299133359Sobrien						if (file_printf(ms, ",") == -1)
300159764Sobrien							goto done;
30168349Sobrien				}
30268349Sobrien				if (n_lf) {
303133359Sobrien					if (file_printf(ms, " LF") == -1)
304159764Sobrien						goto done;
30568349Sobrien					if (n_nel)
306133359Sobrien						if (file_printf(ms, ",") == -1)
307159764Sobrien							goto done;
30868349Sobrien				}
30968349Sobrien				if (n_nel)
310133359Sobrien					if (file_printf(ms, " NEL") == -1)
311159764Sobrien						goto done;
31268349Sobrien			}
31368349Sobrien
314133359Sobrien			if (file_printf(ms, " line terminators") == -1)
315159764Sobrien				goto done;
31668349Sobrien		}
31768349Sobrien
31868349Sobrien		if (has_escapes)
319133359Sobrien			if (file_printf(ms, ", with escape sequences") == -1)
320159764Sobrien				goto done;
32168349Sobrien		if (has_backspace)
322133359Sobrien			if (file_printf(ms, ", with overstriking") == -1)
323159764Sobrien				goto done;
32468349Sobrien	}
325159764Sobrien	rv = 1;
326159764Sobriendone:
327234250Sobrien	free(utf8_buf);
32868349Sobrien
329159764Sobrien	return rv;
33068349Sobrien}
33168349Sobrien
/*
 * Write the ulen characters of ubuf into buf as UTF-8, using no more than
 * len bytes.  Sequences of one to six bytes are produced, which covers
 * every value up to 0x7fffffff.  Returns a pointer to the byte after the
 * last one written, or NULL if a character is larger than that or the
 * output does not fit.
 */
private unsigned char *
encode_utf8(unsigned char *buf, size_t len, unichar *ubuf, size_t ulen)
{
	unsigned char *out = buf;
	unsigned char *const limit = buf + len;
	size_t idx;

	for (idx = 0; idx < ulen; idx++) {
		const unichar ch = ubuf[idx];
		unsigned int lead;	/* marker bits of the first byte */
		int extra;		/* number of continuation bytes */

		if (ch <= 0x7f) {
			lead = 0x00;
			extra = 0;
		} else if (ch <= 0x7ff) {
			lead = 0xc0;
			extra = 1;
		} else if (ch <= 0xffff) {
			lead = 0xe0;
			extra = 2;
		} else if (ch <= 0x1fffff) {
			lead = 0xf0;
			extra = 3;
		} else if (ch <= 0x3ffffff) {
			lead = 0xf8;
			extra = 4;
		} else if (ch <= 0x7fffffff) {
			lead = 0xfc;
			extra = 5;
		} else {
			/* Invalid character */
			return NULL;
		}

		/* Make sure the whole sequence fits before writing any of it. */
		if (limit - out < extra + 1)
			return NULL;

		/* First byte: marker plus the most significant payload bits. */
		*out++ = CAST(unsigned char, (ch >> (6 * extra)) + lead);
		/* Then six payload bits per continuation byte, high to low. */
		while (extra-- > 0) {
			*out++ = CAST(unsigned char,
			    ((ch >> (6 * extra)) & 0x3f) + 0x80);
		}
	}

	return out;
}
388