1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 2005,2008 Oracle. All rights reserved. 5 * 6 * $Id: load.c,v 1.12 2008/01/08 20:58:23 bostic Exp $ 7 */ 8 9#include "csv.h" 10#include "csv_local.h" 11#include "csv_extern.h" 12 13typedef enum { GL_OK, GL_EOF, GL_FAIL } getline_status; 14 15static int input_field_count(const char *, size_t, u_int32_t *); 16static getline_status 17 input_getline(char **, size_t *, size_t *); 18static int input_put_alloc(u_int32_t **, size_t *, size_t, u_int32_t); 19static int input_set_offset(u_int32_t *, char *, size_t, u_int32_t); 20 21static input_fmt ifmt; /* Input format. */ 22static u_long record_count = 0; /* Input record count for errors. */ 23static u_long version; /* Version we're loading. */ 24 25/* 26 * input_load -- 27 * Read the input file and load new records into the database. 28 */ 29int 30input_load(input_fmt ifmt_arg, u_long version_arg) 31{ 32 getline_status gtl_status; 33 DBT key, data; 34 DBC *cursor; 35 u_int32_t field_count, primary_key, *put_line; 36 size_t input_len, len, put_len; 37 int is_first, ret; 38 char *input_line; 39 40 field_count = 0; /* Shut the compiler up. */ 41 42 /* ifmt and version are global to this file. */ 43 ifmt = ifmt_arg; 44 version = version_arg; 45 46 /* 47 * The primary key for the database is a unique number. Find out the 48 * last unique number allocated in this database by opening a cursor 49 * and fetching the last record. 50 */ 51 if ((ret = db->cursor(db, NULL, &cursor, 0)) != 0) { 52 dbenv->err(dbenv, ret, "DB->cursor"); 53 return (1); 54 } 55 memset(&key, 0, sizeof(key)); 56 memset(&data, 0, sizeof(data)); 57 if ((ret = cursor->c_get(cursor, &key, &data, DB_LAST)) != 0) 58 if (ret == DB_NOTFOUND) 59 primary_key = 0; 60 else { 61 dbenv->err(dbenv, ret, "DB->cursor: DB_LAST"); 62 return (1); 63 } 64 else 65 memcpy(&primary_key, key.data, sizeof(primary_key)); 66 if ((ret = cursor->c_close(cursor)) != 0) { 67 dbenv->err(dbenv, ret, "DBC->close"); 68 return (1); 69 } 70 if (verbose) 71 dbenv->errx(dbenv, 72 "maximum existing record in the database is %lu", 73 (u_long)primary_key); 74 75 key.data = &primary_key; 76 key.size = sizeof(primary_key); 77 input_line = NULL; 78 put_line = NULL; 79 input_len = put_len = 0; 80 81 /* 82 * See the README file for a description of the file input format. 83 */ 84 for (is_first = 1; (gtl_status = 85 input_getline(&input_line, &input_len, &len)) == GL_OK;) { 86 ++record_count; 87 if (verbose > 1) 88 dbenv->errx(dbenv, "reading %lu", (u_long)record_count); 89 90 /* The first non-blank line of the input is a column map. */ 91 if (is_first) { 92 is_first = 0; 93 94 /* Count the fields we're expecting in the input. */ 95 if (input_field_count( 96 input_line, len, &field_count) != 0) 97 return (1); 98 99 } 100 101 /* Allocate room for the table of offsets. */ 102 if (input_put_alloc( 103 &put_line, &put_len, len, field_count) != 0) 104 return (1); 105 106 /* 107 * Build the offset table and create the record we're 108 * going to store. 109 */ 110 if (input_set_offset(put_line, 111 input_line, len, field_count) != 0) 112 return (1); 113 114 ++primary_key; 115 116 memcpy(put_line + (field_count + 2), input_line, len); 117 data.data = put_line; 118 data.size = (field_count + 2) * sizeof(u_int32_t) + len; 119 120 if (verbose > 1) 121 (void)entry_print( 122 data.data, data.size, field_count); 123 124 /* Load the key/data pair into the database. */ 125 if ((ret = db->put(db, NULL, &key, &data, 0)) != 0) { 126 dbenv->err(dbenv, ret, 127 "DB->put: %lu", (u_long)primary_key); 128 return (1); 129 } 130 } 131 132 if (gtl_status != GL_EOF) 133 return (1); 134 135 if (verbose) 136 dbenv->errx(dbenv, 137 "%lu records read from the input file into the database", 138 record_count); 139 140 /* 141 * This program isn't transactional, limit the window for corruption. 142 */ 143 if ((ret = db->sync(db, 0)) != 0) { 144 dbenv->err(dbenv, ret, "DB->sync"); 145 return (1); 146 } 147 148 return (0); 149} 150 151/* 152 * input_getline -- 153 * Read in a line of input into a buffer. 154 */ 155static getline_status 156input_getline(char **input_linep, size_t *input_lenp, size_t *lenp) 157{ 158 size_t input_len, len; 159 int ch; 160 char *input_line, *p, *endp; 161 162 input_line = *input_linep; 163 input_len = *input_lenp; 164 165 p = input_line; 166 endp = input_line + input_len; 167 168 for (len = 0; (ch = getchar()) != EOF;) { 169 if (ch == '\0') /* Strip <nul> (\000) bytes. */ 170 continue; 171 switch (ifmt) { 172 case FORMAT_NL: 173 if (ch == '\n') 174 goto end; 175 break; 176 case FORMAT_EXCEL: 177 /* Strip <nl> (\012) bytes. */ 178 if (ch == '\n') 179 continue; 180 /* 181 * <cr> (\015) bytes terminate lines. 182 * Skip blank lines. 183 */ 184 if (ch == '\015') { 185 if (len == 0) 186 continue; 187 goto end; 188 } 189 } 190 if (input_line == endp) { 191 input_len += 256; 192 input_len *= 2; 193 if ((input_line = 194 realloc(input_line, input_len)) == NULL) { 195 dbenv->err(dbenv, errno, 196 "unable to allocate %lu bytes for record", 197 (u_long)input_len); 198 return (GL_FAIL); 199 } 200 p = input_line; 201 endp = p + input_len; 202 } 203 204 if (isprint(ch)) { /* Strip unprintables. */ 205 *p++ = (char)ch; 206 ++len; 207 } 208 } 209 210end: if (len == 0) 211 return (GL_EOF); 212 213 *lenp = len; 214 *input_linep = input_line; 215 *input_lenp = input_len; 216 217 return (GL_OK); 218} 219 220/* 221 * input_field_count -- 222 * Count the fields in the line. 223 */ 224static int 225input_field_count(const char *line, size_t len, u_int32_t *field_countp) 226{ 227 u_int32_t field_count; 228 int quoted; 229 230 field_count = 1; 231 232 /* 233 * There are N-1 separators for N fields, that is, "a,b,c" is three 234 * fields, with two comma separators. 235 */ 236 switch (ifmt) { 237 case FORMAT_EXCEL: 238 quoted = 0; 239 for (field_count = 1; len > 0; ++line, --len) 240 if (*line == '"') 241 quoted = !quoted; 242 else if (*line == ',' && !quoted) 243 ++field_count; 244 break; 245 case FORMAT_NL: 246 for (field_count = 1; len > 0; ++line, --len) 247 if (*line == ',') 248 ++field_count; 249 break; 250 } 251 *field_countp = field_count; 252 253 if (verbose) 254 dbenv->errx(dbenv, 255 "input file made up of %lu fields", (u_int)field_count); 256 257 return (0); 258} 259 260/* 261 * input_put_alloc -- 262 * Allocate room for the offset table plus the input. 263 */ 264static int 265input_put_alloc(u_int32_t **put_linep, 266 size_t *put_lenp, size_t len, u_int32_t field_count) 267{ 268 size_t total; 269 270 total = (field_count + 2) * sizeof(u_int32_t) + len; 271 if (total > *put_lenp && 272 (*put_linep = realloc(*put_linep, *put_lenp += total)) == NULL) { 273 dbenv->err(dbenv, errno, 274 "unable to allocate %lu bytes for record", 275 (u_long)*put_lenp); 276 return (1); 277 } 278 return (0); 279} 280 281/* 282 * input_set_offset -- 283 * Build an offset table and record combination. 284 */ 285static int 286input_set_offset(u_int32_t *put_line, 287 char *input_line, size_t len, u_int32_t field_count) 288{ 289 u_int32_t *op; 290 int quoted; 291 char *p, *endp; 292 293 op = put_line; 294 295 /* The first field is the version number. */ 296 *op++ = version; 297 298 /* 299 * Walk the input line, looking for comma separators. It's an error 300 * to have too many or too few fields. 301 */ 302 *op++ = 0; 303 quoted = 0; 304 for (p = input_line, endp = input_line + len;; ++p) { 305 if (ifmt == FORMAT_EXCEL && p < endp) { 306 if (*p == '"') 307 quoted = !quoted; 308 if (quoted) 309 continue; 310 } 311 if (*p == ',' || p == endp) { 312 if (field_count == 0) { 313 dbenv->errx(dbenv, 314 "record %lu: too many fields in the record", 315 record_count); 316 return (1); 317 } 318 --field_count; 319 320 *op++ = (u_int32_t)(p - input_line) + 1; 321 322 if (verbose > 1) 323 dbenv->errx(dbenv, 324 "offset %lu: {%.*s}", op[-1], 325 OFFSET_LEN(op, -2), input_line + op[-2]); 326 327 /* 328 * Don't insert a new field if the input lines ends 329 * in a comma. 330 */ 331 if (p == endp || p + 1 == endp) 332 break; 333 } 334 } 335 *op++ = (u_int32_t)(p - input_line); 336 337 if (field_count != 0) { 338 dbenv->errx(dbenv, 339 "record %lu: not enough fields in the record", 340 record_count); 341 return (1); 342 } 343 memcpy(op, input_line, len); 344 345 return (0); 346} 347