1357712Sdelphij/*- 2357712Sdelphij * Copyright (c) 2019 Christos Zoulas 3357712Sdelphij * All rights reserved. 4357712Sdelphij * 5357712Sdelphij * Redistribution and use in source and binary forms, with or without 6357712Sdelphij * modification, are permitted provided that the following conditions 7357712Sdelphij * are met: 8357712Sdelphij * 1. Redistributions of source code must retain the above copyright 9357712Sdelphij * notice, this list of conditions and the following disclaimer. 10357712Sdelphij * 2. Redistributions in binary form must reproduce the above copyright 11357712Sdelphij * notice, this list of conditions and the following disclaimer in the 12357712Sdelphij * documentation and/or other materials provided with the distribution. 13357712Sdelphij * 14357712Sdelphij * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 15357712Sdelphij * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 16357712Sdelphij * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17357712Sdelphij * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 18357712Sdelphij * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 19357712Sdelphij * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 20357712Sdelphij * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 21357712Sdelphij * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 22357712Sdelphij * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23357712Sdelphij * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24357712Sdelphij * POSSIBILITY OF SUCH DAMAGE. 25357712Sdelphij */ 26357712Sdelphij 27357712Sdelphij/* 28357712Sdelphij * Parse CSV object serialization format (RFC-4180, RFC-7111) 29357712Sdelphij */ 30357712Sdelphij 31357712Sdelphij#ifndef TEST 32357712Sdelphij#include "file.h" 33357712Sdelphij 34357712Sdelphij#ifndef lint 35357712SdelphijFILE_RCSID("@(#)$File: is_csv.c,v 1.4 2019/06/26 20:31:31 christos Exp $") 36357712Sdelphij#endif 37357712Sdelphij 38357712Sdelphij#include <string.h> 39357712Sdelphij#include "magic.h" 40357712Sdelphij#else 41357712Sdelphij#include <sys/types.h> 42357712Sdelphij#endif 43357712Sdelphij 44357712Sdelphij 45357712Sdelphij#ifdef DEBUG 46357712Sdelphij#include <stdio.h> 47357712Sdelphij#define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__) 48357712Sdelphij#else 49357712Sdelphij#define DPRINTF(fmt, ...) 50357712Sdelphij#endif 51357712Sdelphij 52357712Sdelphij/* 53357712Sdelphij * if CSV_LINES == 0: 54357712Sdelphij * check all the lines in the buffer 55357712Sdelphij * otherwise: 56357712Sdelphij * check only up-to the number of lines specified 57357712Sdelphij * 58357712Sdelphij * the last line count is always ignored if it does not end in CRLF 59357712Sdelphij */ 60357712Sdelphij#ifndef CSV_LINES 61357712Sdelphij#define CSV_LINES 10 62357712Sdelphij#endif 63357712Sdelphij 64357712Sdelphijstatic int csv_parse(const unsigned char *, const unsigned char *); 65357712Sdelphij 66357712Sdelphijstatic const unsigned char * 67357712Sdelphijeatquote(const unsigned char *uc, const unsigned char *ue) 68357712Sdelphij{ 69357712Sdelphij int quote = 0; 70357712Sdelphij 71357712Sdelphij while (uc < ue) { 72357712Sdelphij unsigned char c = *uc++; 73357712Sdelphij if (c != '"') { 74357712Sdelphij // We already got one, done. 75357712Sdelphij if (quote) { 76357712Sdelphij return --uc; 77357712Sdelphij } 78357712Sdelphij continue; 79357712Sdelphij } 80357712Sdelphij if (quote) { 81357712Sdelphij // quote-quote escapes 82357712Sdelphij quote = 0; 83357712Sdelphij continue; 84357712Sdelphij } 85357712Sdelphij // first quote 86357712Sdelphij quote = 1; 87357712Sdelphij } 88357712Sdelphij return ue; 89357712Sdelphij} 90357712Sdelphij 91357712Sdelphijstatic int 92357712Sdelphijcsv_parse(const unsigned char *uc, const unsigned char *ue) 93357712Sdelphij{ 94357712Sdelphij size_t nf = 0, tf = 0, nl = 0; 95357712Sdelphij 96357712Sdelphij while (uc < ue) { 97357712Sdelphij unsigned char c; 98357712Sdelphij switch (c = *uc++) { 99357712Sdelphij case '"': 100357712Sdelphij // Eat until the matching quote 101357712Sdelphij uc = eatquote(uc, ue); 102357712Sdelphij break; 103357712Sdelphij case ',': 104357712Sdelphij nf++; 105357712Sdelphij break; 106357712Sdelphij case '\n': 107357712Sdelphij DPRINTF("%zu %zu %zu\n", nl, nf, tf); 108357712Sdelphij nl++; 109357712Sdelphij#if CSV_LINES 110357712Sdelphij if (nl == CSV_LINES) 111357712Sdelphij return tf != 0 && tf == nf; 112357712Sdelphij#endif 113357712Sdelphij if (tf == 0) { 114357712Sdelphij // First time and no fields, give up 115357712Sdelphij if (nf == 0) 116357712Sdelphij return 0; 117357712Sdelphij // First time, set the number of fields 118357712Sdelphij tf = nf; 119357712Sdelphij } else if (tf != nf) { 120357712Sdelphij // Field number mismatch, we are done. 121357712Sdelphij return 0; 122357712Sdelphij } 123357712Sdelphij nf = 0; 124357712Sdelphij break; 125357712Sdelphij default: 126357712Sdelphij break; 127357712Sdelphij } 128357712Sdelphij } 129357712Sdelphij return tf && nl > 2; 130357712Sdelphij} 131357712Sdelphij 132357712Sdelphij#ifndef TEST 133357712Sdelphijint 134357712Sdelphijfile_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text) 135357712Sdelphij{ 136357712Sdelphij const unsigned char *uc = CAST(const unsigned char *, b->fbuf); 137357712Sdelphij const unsigned char *ue = uc + b->flen; 138357712Sdelphij int mime = ms->flags & MAGIC_MIME; 139357712Sdelphij 140357712Sdelphij if (!looks_text) 141357712Sdelphij return 0; 142357712Sdelphij 143357712Sdelphij if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0) 144357712Sdelphij return 0; 145357712Sdelphij 146357712Sdelphij if (!csv_parse(uc, ue)) 147357712Sdelphij return 0; 148357712Sdelphij 149357712Sdelphij if (mime == MAGIC_MIME_ENCODING) 150357712Sdelphij return 1; 151357712Sdelphij 152357712Sdelphij if (mime) { 153357712Sdelphij if (file_printf(ms, "application/csv") == -1) 154357712Sdelphij return -1; 155357712Sdelphij return 1; 156357712Sdelphij } 157357712Sdelphij 158357712Sdelphij if (file_printf(ms, "CSV text") == -1) 159357712Sdelphij return -1; 160357712Sdelphij 161357712Sdelphij return 1; 162357712Sdelphij} 163357712Sdelphij 164357712Sdelphij#else 165357712Sdelphij 166357712Sdelphij#include <sys/types.h> 167357712Sdelphij#include <sys/stat.h> 168357712Sdelphij#include <stdio.h> 169357712Sdelphij#include <fcntl.h> 170357712Sdelphij#include <unistd.h> 171357712Sdelphij#include <stdlib.h> 172357712Sdelphij#include <stdint.h> 173357712Sdelphij#include <err.h> 174357712Sdelphij 175357712Sdelphijint 176357712Sdelphijmain(int argc, char *argv[]) 177357712Sdelphij{ 178357712Sdelphij int fd, rv; 179357712Sdelphij struct stat st; 180357712Sdelphij unsigned char *p; 181357712Sdelphij 182357712Sdelphij if ((fd = open(argv[1], O_RDONLY)) == -1) 183357712Sdelphij err(EXIT_FAILURE, "Can't open `%s'", argv[1]); 184357712Sdelphij 185357712Sdelphij if (fstat(fd, &st) == -1) 186357712Sdelphij err(EXIT_FAILURE, "Can't stat `%s'", argv[1]); 187357712Sdelphij 188357712Sdelphij if ((p = malloc(st.st_size)) == NULL) 189357712Sdelphij err(EXIT_FAILURE, "Can't allocate %jd bytes", 190357712Sdelphij (intmax_t)st.st_size); 191357712Sdelphij if (read(fd, p, st.st_size) != st.st_size) 192357712Sdelphij err(EXIT_FAILURE, "Can't read %jd bytes", 193357712Sdelphij (intmax_t)st.st_size); 194357712Sdelphij printf("is csv %d\n", csv_parse(p, p + st.st_size)); 195357712Sdelphij return 0; 196357712Sdelphij} 197357712Sdelphij#endif 198