1/*	$NetBSD: is_csv.c,v 1.5 2023/08/18 19:00:11 christos Exp $	*/
2
3/*-
4 * Copyright (c) 2019 Christos Zoulas
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/*
30 * Parse CSV object serialization format (RFC-4180, RFC-7111)
31 */
32
33#ifndef TEST
34#include "file.h"
35
36#ifndef lint
37#if 0
38FILE_RCSID("@(#)$File: is_csv.c,v 1.13 2023/07/17 16:08:17 christos Exp $")
39#else
40__RCSID("$NetBSD: is_csv.c,v 1.5 2023/08/18 19:00:11 christos Exp $");
41#endif
42#endif
43
44#include <string.h>
45#include "magic.h"
46#else
47#include <sys/types.h>
48#endif
49
50
51#ifdef DEBUG
52#include <stdio.h>
53#define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
54#else
55#define DPRINTF(fmt, ...)
56#endif
57
/*
 * if CSV_LINES == 0:
 *	check all the lines in the buffer
 * otherwise:
 *	check only up to the number of lines specified
 *
 * the last line is always ignored if it is not terminated by a newline
 */
66#ifndef CSV_LINES
67#define CSV_LINES 10
68#endif
69
70static int csv_parse(const unsigned char *, const unsigned char *);
71
/*
 * Skip the remainder of a quoted field.
 *
 * csv_parse() calls this with uc pointing just past an opening double
 * quote.  Scanning continues until a double quote is followed by a
 * character that is not a double quote; a pair of adjacent double quotes
 * is an escaped quote and does not terminate the field.
 *
 * Returns a pointer to the first character after the closing quote, or
 * ue if the buffer ends before such a character is seen.
 */
static const unsigned char *
eatquote(const unsigned char *uc, const unsigned char *ue)
{
	const unsigned char *p;
	int pending = 0;	// set while the previous char was an unpaired '"'

	for (p = uc; p < ue; p++) {
		if (*p == '"') {
			// A first quote arms the flag, an adjacent second
			// quote (the "" escape) disarms it again.
			pending = !pending;
		} else if (pending) {
			// The quote before this character closed the field.
			return p;
		}
	}
	return ue;
}
96
/*
 * Decide whether the bytes in [uc, ue) look like CSV.
 *
 * Commas outside quoted fields are counted per newline-terminated line.
 * The first line fixes the expected comma count, which must be non-zero;
 * every later line must match it.  When CSV_LINES is non-zero the verdict
 * is given as soon as that many lines have been seen.
 *
 * Returns 1 if the data looks like CSV, 0 otherwise.
 */
static int
csv_parse(const unsigned char *uc, const unsigned char *ue)
{
	size_t commas = 0;	// commas seen on the current line
	size_t expected = 0;	// commas per line, fixed by the first line
	size_t lines = 0;	// newline-terminated lines seen so far

	while (uc < ue) {
		const unsigned char ch = *uc++;

		if (ch == '"') {
			// Skip the quoted field; commas inside do not count
			uc = eatquote(uc, ue);
			continue;
		}
		if (ch == ',') {
			commas++;
			continue;
		}
		if (ch != '\n')
			continue;

		// End of line: compare this line against the first one
		DPRINTF("%zu %zu %zu\n", lines, commas, expected);
		lines++;
#if CSV_LINES
		if (lines == CSV_LINES)
			return expected != 0 && expected == commas;
#endif
		if (expected == 0) {
			// First line without any separator, give up
			if (commas == 0)
				return 0;
			// First line, remember its separator count
			expected = commas;
		} else if (expected != commas) {
			// Separator count mismatch, not CSV
			return 0;
		}
		commas = 0;
	}
	// Need an established count and at least two complete lines
	return expected && lines >= 2;
}
136
137#ifndef TEST
138int
139file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text,
140    const char *code)
141{
142	const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
143	const unsigned char *ue = uc + b->flen;
144	int mime = ms->flags & MAGIC_MIME;
145
146	if (!looks_text)
147		return 0;
148
149	if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
150		return 0;
151
152	if (!csv_parse(uc, ue))
153		return 0;
154
155	if (mime == MAGIC_MIME_ENCODING)
156		return 1;
157
158	if (mime) {
159		if (file_printf(ms, "text/csv") == -1)
160			return -1;
161		return 1;
162	}
163
164	if (file_printf(ms, "CSV %s%stext", code ? code : "",
165	    code ? " " : "") == -1)
166		return -1;
167
168	return 1;
169}
170
171#else
172
173#include <sys/types.h>
174#include <sys/stat.h>
175#include <stdio.h>
176#include <fcntl.h>
177#include <unistd.h>
178#include <stdlib.h>
179#include <stdint.h>
180#include <err.h>
181
/*
 * Stand-alone test driver (compiled when TEST is defined): read the file
 * named by argv[1] into memory and print the verdict of csv_parse() on it.
 *
 * Exits with EXIT_FAILURE if no file name is given or if the file cannot
 * be opened, stat'ed, allocated for, or read completely.
 */
int
main(int argc, char *argv[])
{
	int fd;
	struct stat st;
	unsigned char *p;
	size_t len, off;
	ssize_t nr;

	if (argc < 2) {
		fprintf(stderr, "Usage: %s file\n",
		    argc > 0 ? argv[0] : "is_csv");
		return EXIT_FAILURE;
	}

	if ((fd = open(argv[1], O_RDONLY)) == -1)
		err(EXIT_FAILURE, "Can't open `%s'", argv[1]);

	if (fstat(fd, &st) == -1)
		err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);

	len = (size_t)st.st_size;
	// malloc(0) may return NULL, so always request at least one byte;
	// no CAST() here because file.h is not included in the TEST build
	if ((p = malloc(len != 0 ? len : 1)) == NULL)
		err(EXIT_FAILURE, "Can't allocate %jd bytes",
		    (intmax_t)st.st_size);

	// read() may return fewer bytes than requested; loop until done
	for (off = 0; off < len; off += (size_t)nr) {
		nr = read(fd, p + off, len - off);
		if (nr == -1)
			err(EXIT_FAILURE, "Can't read %jd bytes",
			    (intmax_t)st.st_size);
		if (nr == 0)
			errx(EXIT_FAILURE, "Short read: got %zu of %jd bytes",
			    off, (intmax_t)st.st_size);
	}
	(void)close(fd);

	printf("is csv %d\n", csv_parse(p, p + len));
	free(p);
	return 0;
}
204#endif
205