1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2005,2008 Oracle.  All rights reserved.
5 *
6 * $Id: load.c,v 1.12 2008/01/08 20:58:23 bostic Exp $
7 */
8
9#include "csv.h"
10#include "csv_local.h"
11#include "csv_extern.h"
12
13typedef enum { GL_OK, GL_EOF, GL_FAIL } getline_status;
14
15static int input_field_count(const char *, size_t, u_int32_t *);
16static getline_status
17	   input_getline(char **, size_t *, size_t *);
18static int input_put_alloc(u_int32_t **, size_t *, size_t, u_int32_t);
19static int input_set_offset(u_int32_t *, char *, size_t, u_int32_t);
20
21static input_fmt ifmt;			/* Input format. */
22static u_long	 record_count = 0;	/* Input record count for errors. */
23static u_long	 version;		/* Version we're loading. */
24
25/*
26 * input_load --
27 *	Read the input file and load new records into the database.
28 */
29int
30input_load(input_fmt ifmt_arg, u_long version_arg)
31{
32	getline_status gtl_status;
33	DBT key, data;
34	DBC *cursor;
35	u_int32_t field_count, primary_key, *put_line;
36	size_t input_len, len, put_len;
37	int is_first, ret;
38	char *input_line;
39
40	field_count = 0;			/* Shut the compiler up. */
41
42	/* ifmt and version are global to this file. */
43	ifmt = ifmt_arg;
44	version = version_arg;
45
46	/*
47	 * The primary key for the database is a unique number.  Find out the
48	 * last unique number allocated in this database by opening a cursor
49	 * and fetching the last record.
50	 */
51	if ((ret = db->cursor(db, NULL, &cursor, 0)) != 0) {
52		dbenv->err(dbenv, ret, "DB->cursor");
53		return (1);
54	}
55	memset(&key, 0, sizeof(key));
56	memset(&data, 0, sizeof(data));
57	if ((ret = cursor->c_get(cursor, &key, &data, DB_LAST)) != 0)
58		if (ret == DB_NOTFOUND)
59			primary_key = 0;
60		else {
61			dbenv->err(dbenv, ret, "DB->cursor: DB_LAST");
62			return (1);
63		}
64	else
65		memcpy(&primary_key, key.data, sizeof(primary_key));
66	if ((ret = cursor->c_close(cursor)) != 0) {
67		dbenv->err(dbenv, ret, "DBC->close");
68		return (1);
69	}
70	if (verbose)
71		dbenv->errx(dbenv,
72		    "maximum existing record in the database is %lu",
73		    (u_long)primary_key);
74
75	key.data = &primary_key;
76	key.size = sizeof(primary_key);
77	input_line = NULL;
78	put_line = NULL;
79	input_len = put_len = 0;
80
81	/*
82	 * See the README file for a description of the file input format.
83	 */
84	for (is_first = 1; (gtl_status =
85	    input_getline(&input_line, &input_len, &len)) == GL_OK;) {
86		++record_count;
87		if (verbose > 1)
88			dbenv->errx(dbenv, "reading %lu", (u_long)record_count);
89
90		/* The first non-blank line of the input is a column map. */
91		if (is_first) {
92			is_first = 0;
93
94			/* Count the fields we're expecting in the input. */
95			if (input_field_count(
96			    input_line, len, &field_count) != 0)
97				return (1);
98
99		}
100
101		/* Allocate room for the table of offsets. */
102		if (input_put_alloc(
103		    &put_line, &put_len, len, field_count) != 0)
104			return (1);
105
106		/*
107		 * Build the offset table and create the record we're
108		 * going to store.
109		 */
110		if (input_set_offset(put_line,
111		    input_line, len, field_count) != 0)
112			return (1);
113
114		++primary_key;
115
116		memcpy(put_line + (field_count + 2), input_line, len);
117		data.data = put_line;
118		data.size = (field_count + 2) * sizeof(u_int32_t) + len;
119
120		if (verbose > 1)
121			(void)entry_print(
122			    data.data, data.size, field_count);
123
124		/* Load the key/data pair into the database. */
125		if ((ret = db->put(db, NULL, &key, &data, 0)) != 0) {
126			dbenv->err(dbenv, ret,
127			    "DB->put: %lu", (u_long)primary_key);
128			return (1);
129		}
130	}
131
132	if (gtl_status != GL_EOF)
133		return (1);
134
135	if (verbose)
136		dbenv->errx(dbenv,
137		    "%lu records read from the input file into the database",
138		    record_count);
139
140	/*
141	 * This program isn't transactional, limit the window for corruption.
142	 */
143	if ((ret = db->sync(db, 0)) != 0) {
144		dbenv->err(dbenv, ret, "DB->sync");
145		return (1);
146	}
147
148	return (0);
149}
150
151/*
152 * input_getline --
153 *	Read in a line of input into a buffer.
154 */
155static getline_status
156input_getline(char **input_linep, size_t *input_lenp, size_t *lenp)
157{
158	size_t input_len, len;
159	int ch;
160	char *input_line, *p, *endp;
161
162	input_line = *input_linep;
163	input_len = *input_lenp;
164
165	p = input_line;
166	endp = input_line + input_len;
167
168	for (len = 0; (ch = getchar()) != EOF;) {
169		if (ch == '\0')		/* Strip <nul> (\000) bytes. */
170			continue;
171		switch (ifmt) {
172		case FORMAT_NL:
173			if (ch == '\n')
174				goto end;
175			break;
176		case FORMAT_EXCEL:
177			/* Strip <nl> (\012) bytes. */
178			if (ch == '\n')
179				continue;
180			/*
181			 * <cr> (\015) bytes terminate lines.
182			 * Skip blank lines.
183			 */
184			if (ch == '\015') {
185				if (len == 0)
186					continue;
187				goto end;
188			}
189		}
190		if (input_line == endp) {
191			input_len += 256;
192			input_len *= 2;
193			if ((input_line =
194			    realloc(input_line, input_len)) == NULL) {
195				dbenv->err(dbenv, errno,
196				    "unable to allocate %lu bytes for record",
197				    (u_long)input_len);
198				return (GL_FAIL);
199			}
200			p = input_line;
201			endp = p + input_len;
202		}
203
204		if (isprint(ch)) {	/* Strip unprintables. */
205			*p++ = (char)ch;
206			++len;
207		}
208	}
209
210end:	if (len == 0)
211		return (GL_EOF);
212
213	*lenp = len;
214	*input_linep = input_line;
215	*input_lenp = input_len;
216
217	return (GL_OK);
218}
219
220/*
221 * input_field_count --
222 *	Count the fields in the line.
223 */
224static int
225input_field_count(const char *line, size_t len, u_int32_t *field_countp)
226{
227	u_int32_t field_count;
228	int quoted;
229
230	field_count = 1;
231
232	/*
233	 * There are N-1 separators for N fields, that is, "a,b,c" is three
234	 * fields, with two comma separators.
235	 */
236	switch (ifmt) {
237	case FORMAT_EXCEL:
238		quoted = 0;
239		for (field_count = 1; len > 0; ++line, --len)
240			if (*line == '"')
241				quoted = !quoted;
242			else if (*line == ',' && !quoted)
243				++field_count;
244		break;
245	case FORMAT_NL:
246		for (field_count = 1; len > 0; ++line, --len)
247			if (*line == ',')
248				++field_count;
249		break;
250	}
251	*field_countp = field_count;
252
253	if (verbose)
254		dbenv->errx(dbenv,
255		    "input file made up of %lu fields", (u_int)field_count);
256
257	return (0);
258}
259
260/*
261 * input_put_alloc --
262 *	Allocate room for the offset table plus the input.
263 */
264static int
265input_put_alloc(u_int32_t **put_linep,
266    size_t *put_lenp, size_t len, u_int32_t field_count)
267{
268	size_t total;
269
270	total = (field_count + 2) * sizeof(u_int32_t) + len;
271	if (total > *put_lenp &&
272	    (*put_linep = realloc(*put_linep, *put_lenp += total)) == NULL) {
273		dbenv->err(dbenv, errno,
274		    "unable to allocate %lu bytes for record",
275		    (u_long)*put_lenp);
276		return (1);
277	}
278	return (0);
279}
280
281/*
282 * input_set_offset --
283 *	Build an offset table and record combination.
284 */
285static int
286input_set_offset(u_int32_t *put_line,
287    char *input_line, size_t len, u_int32_t field_count)
288{
289	u_int32_t *op;
290	int quoted;
291	char *p, *endp;
292
293	op = put_line;
294
295	/* The first field is the version number. */
296	*op++ = version;
297
298	/*
299	 * Walk the input line, looking for comma separators.  It's an error
300	 * to have too many or too few fields.
301	 */
302	*op++ = 0;
303	quoted = 0;
304	for (p = input_line, endp = input_line + len;; ++p) {
305		if (ifmt == FORMAT_EXCEL && p < endp) {
306			if (*p == '"')
307				quoted = !quoted;
308			if (quoted)
309				continue;
310		}
311		if (*p == ',' || p == endp) {
312			if (field_count == 0) {
313				dbenv->errx(dbenv,
314				    "record %lu: too many fields in the record",
315				    record_count);
316				return (1);
317			}
318			--field_count;
319
320			*op++ = (u_int32_t)(p - input_line) + 1;
321
322			if (verbose > 1)
323				dbenv->errx(dbenv,
324				    "offset %lu: {%.*s}", op[-1],
325				    OFFSET_LEN(op, -2), input_line + op[-2]);
326
327			/*
328			 * Don't insert a new field if the input lines ends
329			 * in a comma.
330			 */
331			if (p == endp || p + 1 == endp)
332				break;
333		}
334	}
335	*op++ = (u_int32_t)(p - input_line);
336
337	if (field_count != 0) {
338		dbenv->errx(dbenv,
339		    "record %lu: not enough fields in the record",
340		    record_count);
341		return (1);
342	}
343	memcpy(op, input_line, len);
344
345	return (0);
346}
347