1/*-
2 * Copyright (c) 2019 Christos Zoulas
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
15 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
18 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24 * POSSIBILITY OF SUCH DAMAGE.
25 */
26
27/*
28 * Parse CSV object serialization format (RFC-4180, RFC-7111)
29 */
30
31#ifndef TEST
32#include "file.h"
33
34#ifndef lint
35FILE_RCSID("@(#)$File: is_csv.c,v 1.4 2019/06/26 20:31:31 christos Exp $")
36#endif
37
38#include <string.h>
39#include "magic.h"
40#else
41#include <sys/types.h>
42#endif
43
44
45#ifdef DEBUG
46#include <stdio.h>
47#define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
48#else
49#define DPRINTF(fmt, ...)
50#endif
51
/*
 * if CSV_LINES == 0:
 *	check all the lines in the buffer
 * otherwise:
 *	check only up to the number of lines specified
 *
 * the field count of the last line is always ignored if that line does
 * not end in a newline (LF, which also covers CRLF)
 */
60#ifndef CSV_LINES
61#define CSV_LINES 10
62#endif
63
64static int csv_parse(const unsigned char *, const unsigned char *);
65
/*
 * Skip the remainder of a quoted field.
 *
 * uc points just past the opening quote and ue is the end of the buffer.
 * Returns a pointer to the first byte following the closing quote, or ue
 * if the buffer ends before such a byte is found.  A doubled quote ("")
 * is an escaped quote and does not terminate the field.
 */
static const unsigned char *
eatquote(const unsigned char *uc, const unsigned char *ue)
{
	const unsigned char *p = uc;

	for (;;) {
		// Skip ordinary bytes up to the next quote character
		while (p < ue && *p != '"')
			p++;
		if (p >= ue)
			return ue;

		// Step over the quote we stopped on
		p++;
		if (p >= ue)
			return ue;

		// Anything but a second quote means the field is closed
		if (*p != '"')
			return p;

		// quote-quote escape: swallow the second quote and go on
		p++;
	}
}
90
/*
 * Heuristically decide whether the bytes in [uc, ue) look like CSV.
 *
 * Commas outside quoted fields are counted per newline-terminated line;
 * the count of the first line becomes the reference every later line
 * must match.  Returns 1 if the data looks like CSV, 0 otherwise.
 */
static int
csv_parse(const unsigned char *uc, const unsigned char *ue)
{
	size_t commas = 0;	/* separators seen on the current line */
	size_t expected = 0;	/* separators per line, set by the first line */
	size_t lines = 0;	/* newline-terminated lines seen so far */

	while (uc < ue) {
		const unsigned char ch = *uc++;

		if (ch == '"') {
			// Jump past the quoted field
			uc = eatquote(uc, ue);
			continue;
		}
		if (ch == ',') {
			commas++;
			continue;
		}
		if (ch != '\n')
			continue;

		// End of a line: compare it against the reference count
		DPRINTF("%zu %zu %zu\n", lines, commas, expected);
		lines++;
#if CSV_LINES
		// Line limit reached: decide on what we have seen so far
		if (lines == CSV_LINES)
			return expected != 0 && expected == commas;
#endif
		if (expected == 0) {
			// First line without any separator, not CSV
			if (commas == 0)
				return 0;
			// First line fixes the reference count
			expected = commas;
		} else if (expected != commas) {
			// Separator count differs from the first line
			return 0;
		}
		commas = 0;
	}
	// Buffer exhausted: need a reference count and more than 2 lines
	return expected && lines > 2;
}
131
132#ifndef TEST
133int
134file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text)
135{
136	const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
137	const unsigned char *ue = uc + b->flen;
138	int mime = ms->flags & MAGIC_MIME;
139
140	if (!looks_text)
141		return 0;
142
143	if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
144		return 0;
145
146	if (!csv_parse(uc, ue))
147		return 0;
148
149	if (mime == MAGIC_MIME_ENCODING)
150		return 1;
151
152	if (mime) {
153		if (file_printf(ms, "application/csv") == -1)
154			return -1;
155		return 1;
156	}
157
158	if (file_printf(ms, "CSV text") == -1)
159		return -1;
160
161	return 1;
162}
163
164#else
165
166#include <sys/types.h>
167#include <sys/stat.h>
168#include <stdio.h>
169#include <fcntl.h>
170#include <unistd.h>
171#include <stdlib.h>
172#include <stdint.h>
173#include <err.h>
174
/*
 * Stand-alone test driver: read the file named by argv[1] into memory
 * and print the verdict of csv_parse() on its contents.
 *
 * Exits with EXIT_FAILURE if the argument is missing or the file cannot
 * be opened, sized, or read completely; returns 0 otherwise.
 */
int
main(int argc, char *argv[])
{
	int fd;
	struct stat st;
	unsigned char *p;
	size_t len, off;

	// argv[1] is only valid when an argument was actually given
	if (argc < 2)
		errx(EXIT_FAILURE, "Missing file argument");

	if ((fd = open(argv[1], O_RDONLY)) == -1)
		err(EXIT_FAILURE, "Can't open `%s'", argv[1]);

	if (fstat(fd, &st) == -1)
		err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);

	// The size must be representable as a size_t before we allocate
	if (st.st_size < 0 || (uintmax_t)st.st_size > SIZE_MAX)
		errx(EXIT_FAILURE, "Can't handle %jd bytes",
		    (intmax_t)st.st_size);
	len = (size_t)st.st_size;

	// malloc(0) may return NULL, so ask for at least one byte
	if ((p = malloc(len != 0 ? len : 1)) == NULL)
		err(EXIT_FAILURE, "Can't allocate %jd bytes",
		    (intmax_t)st.st_size);

	// read() may return fewer bytes than requested; loop until done
	for (off = 0; off < len; ) {
		ssize_t n = read(fd, p + off, len - off);
		if (n == -1)
			err(EXIT_FAILURE, "Can't read %jd bytes",
			    (intmax_t)st.st_size);
		// End of file before st_size bytes: errno is not meaningful
		if (n == 0)
			errx(EXIT_FAILURE, "Can't read %jd bytes",
			    (intmax_t)st.st_size);
		off += (size_t)n;
	}
	(void)close(fd);

	printf("is csv %d\n", csv_parse(p, p + len));
	free(p);
	return 0;
}
197#endif
198