1357712Sdelphij/*-
2357712Sdelphij * Copyright (c) 2019 Christos Zoulas
3357712Sdelphij * All rights reserved.
4357712Sdelphij *
5357712Sdelphij * Redistribution and use in source and binary forms, with or without
6357712Sdelphij * modification, are permitted provided that the following conditions
7357712Sdelphij * are met:
8357712Sdelphij * 1. Redistributions of source code must retain the above copyright
9357712Sdelphij *    notice, this list of conditions and the following disclaimer.
10357712Sdelphij * 2. Redistributions in binary form must reproduce the above copyright
11357712Sdelphij *    notice, this list of conditions and the following disclaimer in the
12357712Sdelphij *    documentation and/or other materials provided with the distribution.
13357712Sdelphij *
14357712Sdelphij * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
15357712Sdelphij * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16357712Sdelphij * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17357712Sdelphij * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
18357712Sdelphij * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19357712Sdelphij * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20357712Sdelphij * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21357712Sdelphij * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22357712Sdelphij * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23357712Sdelphij * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24357712Sdelphij * POSSIBILITY OF SUCH DAMAGE.
25357712Sdelphij */
26357712Sdelphij
27357712Sdelphij/*
28357712Sdelphij * Parse CSV object serialization format (RFC-4180, RFC-7111)
29357712Sdelphij */
30357712Sdelphij
31357712Sdelphij#ifndef TEST
32357712Sdelphij#include "file.h"
33357712Sdelphij
34357712Sdelphij#ifndef lint
35357712SdelphijFILE_RCSID("@(#)$File: is_csv.c,v 1.4 2019/06/26 20:31:31 christos Exp $")
36357712Sdelphij#endif
37357712Sdelphij
38357712Sdelphij#include <string.h>
39357712Sdelphij#include "magic.h"
40357712Sdelphij#else
41357712Sdelphij#include <sys/types.h>
42357712Sdelphij#endif
43357712Sdelphij
44357712Sdelphij
45357712Sdelphij#ifdef DEBUG
46357712Sdelphij#include <stdio.h>
47357712Sdelphij#define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
48357712Sdelphij#else
49357712Sdelphij#define DPRINTF(fmt, ...)
50357712Sdelphij#endif
51357712Sdelphij
/*
 * if CSV_LINES == 0:
 *	check all the lines in the buffer
 * otherwise:
 *	check only up to the number of lines specified
 *
 * a trailing line that is not terminated by a newline ('\n') is never
 * counted or checked
 */
60357712Sdelphij#ifndef CSV_LINES
61357712Sdelphij#define CSV_LINES 10
62357712Sdelphij#endif
63357712Sdelphij
64357712Sdelphijstatic int csv_parse(const unsigned char *, const unsigned char *);
65357712Sdelphij
/*
 * Skip the remainder of a double-quoted section.
 *
 * uc points just past the opening quote and ue is the end of the buffer.
 * A pair of consecutive quotes is treated as an escaped quote and skipped.
 * Returns a pointer to the first byte following the closing quote, or ue
 * if the buffer ends before such a byte is seen.
 */
static const unsigned char *
eatquote(const unsigned char *uc, const unsigned char *ue)
{
	const unsigned char *p;
	int pending = 0;	/* set while the previous byte was an unpaired quote */

	for (p = uc; p < ue; p++) {
		if (*p == '"') {
			/* A second quote in a row cancels the first (escape). */
			pending = !pending;
		} else if (pending) {
			/* Anything else after a lone quote: the section is closed. */
			return p;
		}
	}
	return ue;
}
90357712Sdelphij
/*
 * Heuristically decide whether the bytes in [uc, ue) look like CSV.
 *
 * Commas outside double-quoted sections are counted for each
 * newline-terminated line (quoted sections, including any newlines in
 * them, are skipped with eatquote()).  The first line must contain at
 * least one comma and every following line must contain the same number.
 *
 * If CSV_LINES is non-zero the verdict is given as soon as that many lines
 * have been seen; otherwise, or when the buffer holds fewer lines, more
 * than two complete lines are required.
 *
 * Returns non-zero if the buffer looks like CSV, 0 otherwise.
 */
static int
csv_parse(const unsigned char *uc, const unsigned char *ue)
{
	/* nf: commas on the current line, tf: commas on the first line,
	 * nl: number of complete lines seen so far */
	size_t nf = 0, tf = 0, nl = 0;

	while (uc < ue) {
		switch (*uc++) {
		case '"':
			// Eat until the matching quote
			uc = eatquote(uc, ue);
			break;
		case ',':
			nf++;
			break;
		case '\n':
			DPRINTF("%zu %zu %zu\n", nl, nf, tf);
			nl++;
#if CSV_LINES
			// Enough lines sampled: decide on this line's count
			if (nl == CSV_LINES)
				return tf != 0 && tf == nf;
#endif
			if (tf == 0) {
				// First time and no fields, give up
				if (nf == 0)
					return 0;
				// First time, set the number of fields
				tf = nf;
			} else if (tf != nf) {
				// Field number mismatch, we are done.
				return 0;
			}
			nf = 0;
			break;
		default:
			break;
		}
	}
	// A trailing line without a newline is neither counted nor checked
	return tf && nl > 2;
}
131357712Sdelphij
132357712Sdelphij#ifndef TEST
133357712Sdelphijint
134357712Sdelphijfile_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text)
135357712Sdelphij{
136357712Sdelphij	const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
137357712Sdelphij	const unsigned char *ue = uc + b->flen;
138357712Sdelphij	int mime = ms->flags & MAGIC_MIME;
139357712Sdelphij
140357712Sdelphij	if (!looks_text)
141357712Sdelphij		return 0;
142357712Sdelphij
143357712Sdelphij	if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
144357712Sdelphij		return 0;
145357712Sdelphij
146357712Sdelphij	if (!csv_parse(uc, ue))
147357712Sdelphij		return 0;
148357712Sdelphij
149357712Sdelphij	if (mime == MAGIC_MIME_ENCODING)
150357712Sdelphij		return 1;
151357712Sdelphij
152357712Sdelphij	if (mime) {
153357712Sdelphij		if (file_printf(ms, "application/csv") == -1)
154357712Sdelphij			return -1;
155357712Sdelphij		return 1;
156357712Sdelphij	}
157357712Sdelphij
158357712Sdelphij	if (file_printf(ms, "CSV text") == -1)
159357712Sdelphij		return -1;
160357712Sdelphij
161357712Sdelphij	return 1;
162357712Sdelphij}
163357712Sdelphij
164357712Sdelphij#else
165357712Sdelphij
166357712Sdelphij#include <sys/types.h>
167357712Sdelphij#include <sys/stat.h>
168357712Sdelphij#include <stdio.h>
169357712Sdelphij#include <fcntl.h>
170357712Sdelphij#include <unistd.h>
171357712Sdelphij#include <stdlib.h>
172357712Sdelphij#include <stdint.h>
173357712Sdelphij#include <err.h>
174357712Sdelphij
/*
 * Standalone test driver (built with -DTEST): read the file named by
 * argv[1] into memory and print the result of csv_parse() on it.
 */
int
main(int argc, char *argv[])
{
	int fd;
	struct stat st;
	unsigned char *p;
	size_t len;

	// Without this check argv[1] may be NULL
	if (argc < 2)
		errx(EXIT_FAILURE, "Usage: %s <file>",
		    argc > 0 ? argv[0] : "is_csv");

	if ((fd = open(argv[1], O_RDONLY)) == -1)
		err(EXIT_FAILURE, "Can't open `%s'", argv[1]);

	if (fstat(fd, &st) == -1)
		err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);

	len = (size_t)st.st_size;

	// malloc(0) may legitimately return NULL; always ask for >= 1 byte
	if ((p = malloc(len != 0 ? len : 1)) == NULL)
		err(EXIT_FAILURE, "Can't allocate %jd bytes",
		    (intmax_t)st.st_size);
	if (read(fd, p, len) != (ssize_t)len)
		err(EXIT_FAILURE, "Can't read %jd bytes",
		    (intmax_t)st.st_size);
	(void)close(fd);

	printf("is csv %d\n", csv_parse(p, p + len));
	free(p);
	return 0;
}
197357712Sdelphij#endif
198