1/*-
2 * Copyright (c) 2018 Christos Zoulas
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
15 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
18 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24 * POSSIBILITY OF SUCH DAMAGE.
25 */
26
27/*
28 * Parse JSON object serialization format (RFC-7159)
29 */
30
31#ifndef TEST
32#include "file.h"
33
34#ifndef lint
35FILE_RCSID("@(#)$File: is_json.c,v 1.30 2022/09/27 19:12:40 christos Exp $")
36#endif
37
38#include "magic.h"
39#else
40#include <stdio.h>
41#include <stddef.h>
42#endif
43#include <string.h>
44
#ifdef DEBUG
#include <stdio.h>
/*
 * Trace helper used by the parser functions.
 *	a: message prefix
 *	b: current position
 *	c: start of the span to print
 * The expansion also relies on `lvl' (used as the indentation width) and
 * `ue' (end of the input) being in scope at the call site.
 *
 * The byte at b is only read while b is still inside the buffer, and the
 * "%.*s" precision is clamped at 0: a negative precision is treated by
 * printf as "no precision", which would make it read until a NUL byte.
 */
#define DPRINTF(a, b, c)	\
    printf("%*s%s [%.2x/%c] %.*s\n", (int)lvl, "", (a), \
	(unsigned int)((b) < ue ? *(b) : 0), \
	(int)((b) < ue ? *(b) : '?'), \
	(int)((b) > (c) ? (b) - (c) : 0), (const char *)(c))
/* In debug builds `lvl' is really used, so no attribute is needed. */
#define __file_debugused
#else
/* Tracing disabled: expand to an empty statement; arguments are not evaluated. */
#define DPRINTF(a, b, c)	do { } while (/*CONSTCOND*/0)
/* `lvl' is only consumed by DPRINTF; silence unused-parameter warnings. */
#define __file_debugused __attribute__((__unused__))
#endif
55
/*
 * Indices into the size_t counter array (`st') that the parser updates.
 * JSON_MAX is the number of slots, i.e. the array length.
 */
enum {
	JSON_ARRAY	= 0,
	JSON_CONSTANT	= 1,
	JSON_NUMBER	= 2,
	JSON_OBJECT	= 3,
	JSON_STRING	= 4,
	JSON_ARRAYN	= 5,
	JSON_MAX	= 6
};
63
64/*
65 * if JSON_COUNT != 0:
66 *	count all the objects, require that we have the whole data file
67 * otherwise:
68 *	stop if we find an object or an array
69 */
70#ifndef JSON_COUNT
71#define JSON_COUNT 0
72#endif
73
74static int json_parse(const unsigned char **, const unsigned char *, size_t *,
75	size_t);
76
/*
 * Return 1 when uc is one of the four bytes this parser skips as
 * whitespace (space, line feed, carriage return, tab), 0 otherwise.
 */
static int
json_isspace(const unsigned char uc)
{
	return uc == ' ' || uc == '\n' || uc == '\r' || uc == '\t';
}
90
/*
 * Return 1 when uc is a decimal digit '0'..'9', 0 otherwise.
 * The C standard guarantees that the digit characters are contiguous.
 */
static int
json_isdigit(unsigned char uc)
{
	return uc >= '0' && uc <= '9';
}
102
/*
 * Return 1 when uc is a hexadecimal digit (0-9, a-f, A-F), 0 otherwise.
 */
static int
json_isxdigit(unsigned char uc)
{
	if (json_isdigit(uc))
		return 1;
	if (uc >= 'a' && uc <= 'f')
		return 1;
	if (uc >= 'A' && uc <= 'F')
		return 1;
	return 0;
}
116
/*
 * Advance past any run of whitespace bytes starting at uc, never moving
 * beyond ue.  Returns the first non-whitespace position, or ue if the
 * rest of the buffer is whitespace.
 */
static const unsigned char *
json_skip_space(const unsigned char *uc, const unsigned char *ue)
{
	for (; uc < ue; uc++) {
		if (!json_isspace(*uc))
			break;
	}
	return uc;
}
124
125/*ARGSUSED*/
126static int
127json_parse_string(const unsigned char **ucp, const unsigned char *ue,
128    size_t lvl __file_debugused)
129{
130	const unsigned char *uc = *ucp;
131	size_t i;
132
133	DPRINTF("Parse string: ", uc, *ucp);
134	while (uc < ue) {
135		switch (*uc++) {
136		case '\0':
137			goto out;
138		case '\\':
139			if (uc == ue)
140				goto out;
141			switch (*uc++) {
142			case '\0':
143				goto out;
144			case '"':
145			case '\\':
146			case '/':
147			case 'b':
148			case 'f':
149			case 'n':
150			case 'r':
151			case 't':
152				continue;
153			case 'u':
154				if (ue - uc < 4) {
155					uc = ue;
156					goto out;
157				}
158				for (i = 0; i < 4; i++)
159					if (!json_isxdigit(*uc++))
160						goto out;
161				continue;
162			default:
163				goto out;
164			}
165		case '"':
166			DPRINTF("Good string: ", uc, *ucp);
167			*ucp = uc;
168			return 1;
169		default:
170			continue;
171		}
172	}
173out:
174	DPRINTF("Bad string: ", uc, *ucp);
175	*ucp = uc;
176	return 0;
177}
178
179static int
180json_parse_array(const unsigned char **ucp, const unsigned char *ue,
181	size_t *st, size_t lvl)
182{
183	const unsigned char *uc = *ucp;
184
185	DPRINTF("Parse array: ", uc, *ucp);
186	while (uc < ue) {
187		uc = json_skip_space(uc, ue);
188		if (uc == ue)
189			goto out;
190		if (*uc == ']')
191			goto done;
192		if (!json_parse(&uc, ue, st, lvl + 1))
193			goto out;
194		if (uc == ue)
195			goto out;
196		switch (*uc) {
197		case ',':
198			uc++;
199			continue;
200		case ']':
201		done:
202			st[JSON_ARRAYN]++;
203			DPRINTF("Good array: ", uc, *ucp);
204			*ucp = uc + 1;
205			return 1;
206		default:
207			goto out;
208		}
209	}
210out:
211	DPRINTF("Bad array: ", uc,  *ucp);
212	*ucp = uc;
213	return 0;
214}
215
/*
 * Parse the members of an object; *ucp must point just past the '{'.
 *
 * Every member must be a quoted name, a ':' and a value (parsed through
 * json_parse()), with members separated by ','.  Returns 1 with *ucp just
 * past the closing '}' on success, otherwise 0 with *ucp at the position
 * where parsing stopped.  A '}' is accepted wherever a member name could
 * start, so "{}" and a trailing comma before '}' are accepted.
 */
static int
json_parse_object(const unsigned char **ucp, const unsigned char *ue,
	size_t *st, size_t lvl)
{
	const unsigned char *cur = *ucp;
	unsigned char sep;

	DPRINTF("Parse object: ", cur, *ucp);
	while (cur < ue) {
		cur = json_skip_space(cur, ue);
		if (cur == ue)
			break;
		if (*cur == '}') {
			cur++;
			DPRINTF("Good object: ", cur, *ucp);
			*ucp = cur;
			return 1;
		}

		/* Member name. */
		if (*cur++ != '"') {
			DPRINTF("not string", cur, *ucp);
			break;
		}
		DPRINTF("next field", cur, *ucp);
		if (!json_parse_string(&cur, ue, lvl)) {
			DPRINTF("not string", cur, *ucp);
			break;
		}

		/* Name/value separator. */
		cur = json_skip_space(cur, ue);
		if (cur == ue)
			break;
		if (*cur++ != ':') {
			DPRINTF("not colon", cur, *ucp);
			break;
		}

		/* Member value. */
		if (!json_parse(&cur, ue, st, lvl + 1)) {
			DPRINTF("not json", cur, *ucp);
			break;
		}
		if (cur == ue)
			break;

		sep = *cur++;
		if (sep == ',')
			continue;
		if (sep == '}') { /* { */
			DPRINTF("Good object: ", cur, *ucp);
			*ucp = cur;
			return 1;
		}
		DPRINTF("not more", cur, *ucp);
		/*
		 * NOTE(review): this store is overwritten by the failure exit
		 * just below; it only affects what the debug trace prints.
		 */
		*ucp = cur - 1;
		break;
	}
	DPRINTF("Bad object: ", cur, *ucp);
	*ucp = cur;
	return 0;
}
271
272/*ARGSUSED*/
273static int
274json_parse_number(const unsigned char **ucp, const unsigned char *ue,
275    size_t lvl __file_debugused)
276{
277	const unsigned char *uc = *ucp;
278	int got = 0;
279
280	DPRINTF("Parse number: ", uc, *ucp);
281	if (uc == ue)
282		return 0;
283	if (*uc == '-')
284		uc++;
285
286	for (; uc < ue; uc++) {
287		if (!json_isdigit(*uc))
288			break;
289		got = 1;
290	}
291	if (uc == ue)
292		goto out;
293	if (*uc == '.')
294		uc++;
295	for (; uc < ue; uc++) {
296		if (!json_isdigit(*uc))
297			break;
298		got = 1;
299	}
300	if (uc == ue)
301		goto out;
302	if (got && (*uc == 'e' || *uc == 'E')) {
303		uc++;
304		got = 0;
305		if (uc == ue)
306			goto out;
307		if (*uc == '+' || *uc == '-')
308			uc++;
309		for (; uc < ue; uc++) {
310			if (!json_isdigit(*uc))
311				break;
312			got = 1;
313		}
314	}
315out:
316	if (!got)
317		DPRINTF("Bad number: ", uc, *ucp);
318	else
319		DPRINTF("Good number: ", uc, *ucp);
320	*ucp = uc;
321	return got;
322}
323
324/*ARGSUSED*/
325static int
326json_parse_const(const unsigned char **ucp, const unsigned char *ue,
327    const char *str, size_t len, size_t lvl __file_debugused)
328{
329	const unsigned char *uc = *ucp;
330
331	DPRINTF("Parse const: ", uc, *ucp);
332	*ucp += --len - 1;
333	if (*ucp > ue)
334		*ucp = ue;
335	for (; uc < ue && --len;) {
336		if (*uc++ != *++str) {
337			DPRINTF("Bad const: ", uc, *ucp);
338			return 0;
339		}
340	}
341	DPRINTF("Good const: ", uc, *ucp);
342	return 1;
343}
344
345static int
346json_parse(const unsigned char **ucp, const unsigned char *ue,
347    size_t *st, size_t lvl)
348{
349	const unsigned char *uc, *ouc;
350	int rv = 0;
351	int t;
352
353	ouc = uc = json_skip_space(*ucp, ue);
354	if (uc == ue)
355		goto out;
356
357	// Avoid recursion
358	if (lvl > 500) {
359		DPRINTF("Too many levels", uc, *ucp);
360		return 0;
361	}
362#if JSON_COUNT
363	/* bail quickly if not counting */
364	if (lvl > 1 && (st[JSON_OBJECT] || st[JSON_ARRAYN]))
365		return 1;
366#endif
367
368	DPRINTF("Parse general: ", uc, *ucp);
369	switch (*uc++) {
370	case '"':
371		rv = json_parse_string(&uc, ue, lvl + 1);
372		t = JSON_STRING;
373		break;
374	case '[':
375		rv = json_parse_array(&uc, ue, st, lvl + 1);
376		t = JSON_ARRAY;
377		break;
378	case '{': /* '}' */
379		rv = json_parse_object(&uc, ue, st, lvl + 1);
380		t = JSON_OBJECT;
381		break;
382	case 't':
383		rv = json_parse_const(&uc, ue, "true", sizeof("true"), lvl + 1);
384		t = JSON_CONSTANT;
385		break;
386	case 'f':
387		rv = json_parse_const(&uc, ue, "false", sizeof("false"),
388		    lvl + 1);
389		t = JSON_CONSTANT;
390		break;
391	case 'n':
392		rv = json_parse_const(&uc, ue, "null", sizeof("null"), lvl + 1);
393		t = JSON_CONSTANT;
394		break;
395	default:
396		--uc;
397		rv = json_parse_number(&uc, ue, lvl + 1);
398		t = JSON_NUMBER;
399		break;
400	}
401	if (rv)
402		st[t]++;
403	uc = json_skip_space(uc, ue);
404out:
405	DPRINTF("End general: ", uc, *ucp);
406	*ucp = uc;
407	if (lvl == 0) {
408		if (!rv)
409			return 0;
410		if (uc == ue)
411			return (st[JSON_ARRAYN] || st[JSON_OBJECT]) ? 1 : 0;
412		if (*ouc == *uc && json_parse(&uc, ue, st, 1))
413			return (st[JSON_ARRAYN] || st[JSON_OBJECT]) ? 2 : 0;
414		else
415			return 0;
416	}
417	return rv;
418}
419
420#ifndef TEST
421int
422file_is_json(struct magic_set *ms, const struct buffer *b)
423{
424	const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
425	const unsigned char *ue = uc + b->flen;
426	size_t st[JSON_MAX];
427	int mime = ms->flags & MAGIC_MIME;
428	int jt;
429
430
431	if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
432		return 0;
433
434	memset(st, 0, sizeof(st));
435
436	if ((jt = json_parse(&uc, ue, st, 0)) == 0)
437		return 0;
438
439	if (mime == MAGIC_MIME_ENCODING)
440		return 1;
441	if (mime) {
442		if (file_printf(ms, "application/%s",
443		    jt == 1 ? "json" : "x-ndjson") == -1)
444			return -1;
445		return 1;
446	}
447	if (file_printf(ms, "%sJSON text data",
448	    jt == 1 ? "" : "New Line Delimited ") == -1)
449		return -1;
450#if JSON_COUNT
451#define P(n) st[n], st[n] > 1 ? "s" : ""
452	if (file_printf(ms, " (%" SIZE_T_FORMAT "u object%s, %" SIZE_T_FORMAT
453	    "u array%s, %" SIZE_T_FORMAT "u string%s, %" SIZE_T_FORMAT
454	    "u constant%s, %" SIZE_T_FORMAT "u number%s, %" SIZE_T_FORMAT
455	    "u >1array%s)",
456	    P(JSON_OBJECT), P(JSON_ARRAY), P(JSON_STRING), P(JSON_CONSTANT),
457	    P(JSON_NUMBER), P(JSON_ARRAYN))
458	    == -1)
459		return -1;
460#endif
461	return 1;
462}
463
464#else
465
466#include <sys/types.h>
467#include <sys/stat.h>
468#include <stdio.h>
469#include <fcntl.h>
470#include <unistd.h>
471#include <stdlib.h>
472#include <stdint.h>
473#include <err.h>
474
475int
476main(int argc, char *argv[])
477{
478	int fd;
479	struct stat st;
480	unsigned char *p;
481	size_t stats[JSON_MAX];
482
483	if ((fd = open(argv[1], O_RDONLY)) == -1)
484		err(EXIT_FAILURE, "Can't open `%s'", argv[1]);
485
486	if (fstat(fd, &st) == -1)
487		err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);
488
489	if ((p = CAST(char *, malloc(st.st_size))) == NULL)
490		err(EXIT_FAILURE, "Can't allocate %jd bytes",
491		    (intmax_t)st.st_size);
492	if (read(fd, p, st.st_size) != st.st_size)
493		err(EXIT_FAILURE, "Can't read %jd bytes",
494		    (intmax_t)st.st_size);
495	memset(stats, 0, sizeof(stats));
496	printf("is json %d\n", json_parse((const unsigned char **)&p,
497	    p + st.st_size, stats, 0));
498	return 0;
499}
500#endif
501