backend.c revision 5496:cee79a909683
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28/*
29 * sqlite is not compatible with _FILE_OFFSET_BITS=64, but we need to
30 * be able to statvfs(2) possibly large systems.  This define gives us
31 * access to the transitional interfaces.  See lfcompile64(5) for how
32 * _LARGEFILE64_SOURCE works.
33 */
34#define	_LARGEFILE64_SOURCE
35
36#include <assert.h>
37#include <door.h>
38#include <dirent.h>
39#include <errno.h>
40#include <fcntl.h>
41#include <limits.h>
42#include <pthread.h>
43#include <stdarg.h>
44#include <stdio.h>
45#include <stdlib.h>
46#include <string.h>
47#include <sys/stat.h>
48#include <sys/statvfs.h>
49#include <unistd.h>
50#include <zone.h>
51
52#include "configd.h"
53#include "repcache_protocol.h"
54
55#include <sqlite.h>
56#include <sqlite-misc.h>
57
58/*
59 * This file has two purposes:
60 *
61 * 1. It contains the database schema, and the code for setting up our backend
62 *    databases, including installing said schema.
63 *
64 * 2. It provides a simplified interface to the SQL database library, and
65 *    synchronizes MT access to the database.
66 */
67
68typedef struct backend_spent {
69	uint64_t bs_count;
70	hrtime_t bs_time;
71	hrtime_t bs_vtime;
72} backend_spent_t;
73
74typedef struct backend_totals {
75	backend_spent_t	bt_lock;	/* waiting for lock */
76	backend_spent_t	bt_exec;	/* time spent executing SQL */
77} backend_totals_t;
78
79typedef struct sqlite_backend {
80	pthread_mutex_t	be_lock;
81	pthread_t	be_thread;	/* thread holding lock */
82	struct sqlite	*be_db;
83	const char	*be_path;	/* path to db */
84	int		be_readonly;	/* readonly at start, and still is */
85	int		be_writing;	/* held for writing */
86	backend_type_t	be_type;	/* type of db */
87	hrtime_t	be_lastcheck;	/* time of last read-only check */
88	backend_totals_t be_totals[2];	/* one for reading, one for writing */
89} sqlite_backend_t;
90
91struct backend_tx {
92	sqlite_backend_t	*bt_be;
93	int			bt_readonly;
94	int			bt_type;
95	int			bt_full;	/* SQLITE_FULL during tx */
96};
97
/*
 * UPDATE_TOTALS_WR(sb, writing, field, ts, vts)
 *
 * Fold one sample into sb->be_totals[].field: bump the count and add the
 * real (gethrtime()) and virtual (gethrvtime()) time elapsed since the
 * caller-supplied start stamps ts and vts.  Slot 1 is used when writing is
 * nonzero, slot 0 otherwise.
 *
 * ts and vts are parenthesized so that an expression argument (say,
 * "base + delta") is subtracted as a whole.  The expansion is a bare
 * compound statement; invoke it as a full statement of its own.
 */
#define	UPDATE_TOTALS_WR(sb, writing, field, ts, vts) { \
	backend_spent_t *__bsp = &(sb)->be_totals[!!(writing)].field; \
	__bsp->bs_count++;						\
	__bsp->bs_time += (gethrtime() - (ts));				\
	__bsp->bs_vtime += (gethrvtime() - (vts));			\
}

/*
 * As UPDATE_TOTALS_WR(), using the backend's current be_writing state to
 * pick the slot.
 */
#define	UPDATE_TOTALS(sb, field, ts, vts) \
	UPDATE_TOTALS_WR(sb, (sb)->be_writing, field, ts, vts)
107
/*
 * A query under construction.  The routines that manage it are not in
 * this part of the file; presumably bq_buf is the query text and bq_size
 * its size in bytes -- confirm against the backend_query_*() code.
 */
struct backend_query {
	char	*bq_buf;
	size_t	bq_size;
};
112
/*
 * One table of the schema.  The tables below are arrays of these,
 * terminated by an entry whose members are all NULL.
 */
struct backend_tbl_info {
	const char *bti_name;	/* table name */
	const char *bti_cols;	/* column definitions, comma-separated */
};
117
/*
 * One index of the schema.  The index lists below are arrays of these,
 * terminated by an entry whose members are all NULL.
 */
struct backend_idx_info {
	const char *bxi_tbl;	/* table being indexed */
	const char *bxi_idx;	/* short name for the index */
	const char *bxi_cols;	/* indexed columns, comma-separated */
};
123
/*
 * Panic coordination.  backend_panic_thread is zero until some thread
 * enters backend_panic(); it is set under backend_panic_lock.  Threads
 * which arrive after that park themselves on backend_panic_cv.
 */
static pthread_mutex_t backend_panic_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t backend_panic_cv = PTHREAD_COND_INITIALIZER;
pthread_t backend_panic_thread = 0;

/*
 * Debugging switches.
 */
int backend_do_trace = 0;		/* invoke tracing callback */
int backend_print_trace = 0;		/* tracing callback prints SQL */
int backend_panic_abort = 0;		/* abort when panicking */

/* interval between read-only checks while starting up */
#define	BACKEND_READONLY_CHECK_INTERVAL	(2 * (hrtime_t)NANOSEC)

/*
 * Any change to the below schema should bump the version number
 */
#define	BACKEND_SCHEMA_VERSION		5
139
140static struct backend_tbl_info tbls_normal[] = { /* BACKEND_TYPE_NORMAL */
141	/*
142	 * service_tbl holds all services.  svc_id is the identifier of the
143	 * service.
144	 */
145	{
146		"service_tbl",
147		"svc_id          INTEGER PRIMARY KEY,"
148		"svc_name        CHAR(256) NOT NULL"
149	},
150
151	/*
152	 * instance_tbl holds all of the instances.  The parent service id
153	 * is instance_svc.
154	 */
155	{
156		"instance_tbl",
157		"instance_id     INTEGER PRIMARY KEY,"
158		"instance_name   CHAR(256) NOT NULL,"
159		"instance_svc    INTEGER NOT NULL"
160	},
161
162	/*
163	 * snapshot_lnk_tbl links (instance, snapshot name) with snapshots.
164	 */
165	{
166		"snapshot_lnk_tbl",
167		"lnk_id          INTEGER PRIMARY KEY,"
168		"lnk_inst_id     INTEGER NOT NULL,"
169		"lnk_snap_name   CHAR(256) NOT NULL,"
170		"lnk_snap_id     INTEGER NOT NULL"
171	},
172
173	/*
174	 * snaplevel_tbl maps a snapshot id to a set of named, ordered
175	 * snaplevels.
176	 */
177	{
178		"snaplevel_tbl",
179		"snap_id                 INTEGER NOT NULL,"
180		"snap_level_num          INTEGER NOT NULL,"
181		"snap_level_id           INTEGER NOT NULL,"
182		"snap_level_service_id   INTEGER NOT NULL,"
183		"snap_level_service      CHAR(256) NOT NULL,"
184		"snap_level_instance_id  INTEGER NULL,"
185		"snap_level_instance     CHAR(256) NULL"
186	},
187
188	/*
189	 * snaplevel_lnk_tbl links snaplevels to property groups.
190	 * snaplvl_pg_* is identical to the original property group,
191	 * and snaplvl_gen_id overrides the generation number.
192	 * The service/instance ids are as in the snaplevel.
193	 */
194	{
195		"snaplevel_lnk_tbl",
196		"snaplvl_level_id INTEGER NOT NULL,"
197		"snaplvl_pg_id    INTEGER NOT NULL,"
198		"snaplvl_pg_name  CHAR(256) NOT NULL,"
199		"snaplvl_pg_type  CHAR(256) NOT NULL,"
200		"snaplvl_pg_flags INTEGER NOT NULL,"
201		"snaplvl_gen_id   INTEGER NOT NULL"
202	},
203
204	{ NULL, NULL }
205};
206
207static struct backend_idx_info idxs_normal[] = { /* BACKEND_TYPE_NORMAL */
208	{ "service_tbl",	"name",	"svc_name" },
209	{ "instance_tbl",	"name",	"instance_svc, instance_name" },
210	{ "snapshot_lnk_tbl",	"name",	"lnk_inst_id, lnk_snap_name" },
211	{ "snapshot_lnk_tbl",	"snapid", "lnk_snap_id" },
212	{ "snaplevel_tbl",	"id",	"snap_id" },
213	{ "snaplevel_lnk_tbl",	"id",	"snaplvl_pg_id" },
214	{ "snaplevel_lnk_tbl",	"level", "snaplvl_level_id" },
215	{ NULL, NULL, NULL }
216};
217
218static struct backend_tbl_info tbls_np[] = { /* BACKEND_TYPE_NONPERSIST */
219	{ NULL, NULL }
220};
221
222static struct backend_idx_info idxs_np[] = {	/* BACKEND_TYPE_NONPERSIST */
223	{ NULL, NULL, NULL }
224};
225
226static struct backend_tbl_info tbls_common[] = { /* all backend types */
227	/*
228	 * pg_tbl defines property groups.  They are associated with a single
229	 * service or instance.  The pg_gen_id links them with the latest
230	 * "edited" version of its properties.
231	 */
232	{
233		"pg_tbl",
234		"pg_id           INTEGER PRIMARY KEY,"
235		"pg_parent_id    INTEGER NOT NULL,"
236		"pg_name         CHAR(256) NOT NULL,"
237		"pg_type         CHAR(256) NOT NULL,"
238		"pg_flags        INTEGER NOT NULL,"
239		"pg_gen_id       INTEGER NOT NULL"
240	},
241
242	/*
243	 * prop_lnk_tbl links a particular pg_id and gen_id to a set of
244	 * (prop_name, prop_type, val_id) trios.
245	 */
246	{
247		"prop_lnk_tbl",
248		"lnk_prop_id     INTEGER PRIMARY KEY,"
249		"lnk_pg_id       INTEGER NOT NULL,"
250		"lnk_gen_id      INTEGER NOT NULL,"
251		"lnk_prop_name   CHAR(256) NOT NULL,"
252		"lnk_prop_type   CHAR(2) NOT NULL,"
253		"lnk_val_id      INTEGER"
254	},
255
256	/*
257	 * value_tbl maps a value_id to a set of values.  For any given
258	 * value_id, value_type is constant.
259	 */
260	{
261		"value_tbl",
262		"value_id        INTEGER NOT NULL,"
263		"value_type      CHAR(1) NOT NULL,"
264		"value_value     VARCHAR NOT NULL"
265	},
266
267	/*
268	 * id_tbl has one row per id space
269	 */
270	{
271		"id_tbl",
272		"id_name         STRING NOT NULL,"
273		"id_next         INTEGER NOT NULL"
274	},
275
276	/*
277	 * schema_version has a single row, which contains
278	 * BACKEND_SCHEMA_VERSION at the time of creation.
279	 */
280	{
281		"schema_version",
282		"schema_version  INTEGER"
283	},
284	{ NULL, NULL }
285};
286
287static struct backend_idx_info idxs_common[] = { /* all backend types */
288	{ "pg_tbl",		"parent", "pg_parent_id" },
289	{ "pg_tbl",		"name",	"pg_parent_id, pg_name" },
290	{ "pg_tbl",		"type",	"pg_parent_id, pg_type" },
291	{ "prop_lnk_tbl",	"base",	"lnk_pg_id, lnk_gen_id" },
292	{ "prop_lnk_tbl",	"val",	"lnk_val_id" },
293	{ "value_tbl",		"id",	"value_id" },
294	{ "id_tbl",		"id",	"id_name" },
295	{ NULL, NULL, NULL }
296};
297
/*
 * Argument block for run_single_int_callback().
 */
struct run_single_int_info {
	uint32_t	*rs_out;	/* where the parsed value is stored */
	int		rs_result;	/* set to _SUCCESS once stored */
};
302
303/*ARGSUSED*/
304static int
305run_single_int_callback(void *arg, int columns, char **vals, char **names)
306{
307	struct run_single_int_info *info = arg;
308	uint32_t val;
309
310	char *endptr = vals[0];
311
312	assert(info->rs_result != REP_PROTOCOL_SUCCESS);
313	assert(columns == 1);
314
315	if (vals[0] == NULL)
316		return (BACKEND_CALLBACK_CONTINUE);
317
318	errno = 0;
319	val = strtoul(vals[0], &endptr, 10);
320	if ((val == 0 && endptr == vals[0]) || *endptr != 0 || errno != 0)
321		backend_panic("malformed integer \"%20s\"", vals[0]);
322
323	*info->rs_out = val;
324	info->rs_result = REP_PROTOCOL_SUCCESS;
325	return (BACKEND_CALLBACK_CONTINUE);
326}
327
328/*ARGSUSED*/
329int
330backend_fail_if_seen(void *arg, int columns, char **vals, char **names)
331{
332	return (BACKEND_CALLBACK_ABORT);
333}
334
335/*
336 * check to see if we can successfully start a transaction;  if not, the
337 * filesystem is mounted read-only.
338 */
339static int
340backend_is_readonly(struct sqlite *db, const char *path)
341{
342	int r;
343	statvfs64_t stat;
344
345	if (statvfs64(path, &stat) == 0 && (stat.f_flag & ST_RDONLY))
346		return (SQLITE_READONLY);
347
348	r = sqlite_exec(db,
349	    "BEGIN TRANSACTION; "
350	    "UPDATE schema_version SET schema_version = schema_version; ",
351	    NULL, NULL, NULL);
352	(void) sqlite_exec(db, "ROLLBACK TRANSACTION", NULL, NULL, NULL);
353	return (r);
354}
355
356static void
357backend_trace_sql(void *arg, const char *sql)
358{
359	sqlite_backend_t *be = arg;
360
361	if (backend_print_trace) {
362		(void) fprintf(stderr, "%d: %s\n", be->be_type, sql);
363	}
364}
365
366static sqlite_backend_t be_info[BACKEND_TYPE_TOTAL];
367static sqlite_backend_t *bes[BACKEND_TYPE_TOTAL];
368
369#define	BACKEND_PANIC_TIMEOUT	(50 * MILLISEC)
370/*
371 * backend_panic() -- some kind of database problem or corruption has been hit.
372 * We attempt to quiesce the other database users -- all of the backend sql
373 * entry points will call backend_panic(NULL) if a panic is in progress, as
374 * will any attempt to start a transaction.
375 *
376 * We give threads holding a backend lock 50ms (BACKEND_PANIC_TIMEOUT) to
377 * either drop the lock or call backend_panic().  If they don't respond in
378 * time, we'll just exit anyway.
379 */
380void
381backend_panic(const char *format, ...)
382{
383	int i;
384	va_list args;
385	int failed = 0;
386
387	(void) pthread_mutex_lock(&backend_panic_lock);
388	if (backend_panic_thread != 0) {
389		(void) pthread_mutex_unlock(&backend_panic_lock);
390		/*
391		 * first, drop any backend locks we're holding, then
392		 * sleep forever on the panic_cv.
393		 */
394		for (i = 0; i < BACKEND_TYPE_TOTAL; i++) {
395			if (bes[i] != NULL &&
396			    bes[i]->be_thread == pthread_self())
397				(void) pthread_mutex_unlock(&bes[i]->be_lock);
398		}
399		(void) pthread_mutex_lock(&backend_panic_lock);
400		for (;;)
401			(void) pthread_cond_wait(&backend_panic_cv,
402			    &backend_panic_lock);
403	}
404	backend_panic_thread = pthread_self();
405	(void) pthread_mutex_unlock(&backend_panic_lock);
406
407	for (i = 0; i < BACKEND_TYPE_TOTAL; i++) {
408		if (bes[i] != NULL && bes[i]->be_thread == pthread_self())
409			(void) pthread_mutex_unlock(&bes[i]->be_lock);
410	}
411
412	va_start(args, format);
413	configd_vcritical(format, args);
414	va_end(args);
415
416	for (i = 0; i < BACKEND_TYPE_TOTAL; i++) {
417		timespec_t rel;
418
419		rel.tv_sec = 0;
420		rel.tv_nsec = BACKEND_PANIC_TIMEOUT;
421
422		if (bes[i] != NULL && bes[i]->be_thread != pthread_self()) {
423			if (pthread_mutex_reltimedlock_np(&bes[i]->be_lock,
424			    &rel) != 0)
425				failed++;
426		}
427	}
428	if (failed) {
429		configd_critical("unable to quiesce database\n");
430	}
431
432	if (backend_panic_abort)
433		abort();
434
435	exit(CONFIGD_EXIT_DATABASE_BAD);
436}
437
438/*
439 * Returns
440 *   _SUCCESS
441 *   _DONE - callback aborted query
442 *   _NO_RESOURCES - out of memory (_FULL & _TOOBIG?)
443 */
444static int
445backend_error(sqlite_backend_t *be, int error, char *errmsg)
446{
447	if (error == SQLITE_OK)
448		return (REP_PROTOCOL_SUCCESS);
449
450	switch (error) {
451	case SQLITE_ABORT:
452		free(errmsg);
453		return (REP_PROTOCOL_DONE);
454
455	case SQLITE_NOMEM:
456	case SQLITE_FULL:
457	case SQLITE_TOOBIG:
458		free(errmsg);
459		return (REP_PROTOCOL_FAIL_NO_RESOURCES);
460
461	default:
462		backend_panic("%s: db error: %s", be->be_path, errmsg);
463		/*NOTREACHED*/
464	}
465}
466
/*
 * Free a list built by backend_backup_get_prev(): each of the first
 * out_sz strings, then the array itself.  A NULL array with a
 * non-positive count is fine.
 */
static void
backend_backup_cleanup(const char **out_arg, ssize_t out_sz)
{
	char **list = (char **)out_arg;
	ssize_t i;

	for (i = 0; i < out_sz; i++)
		free(list[i]);
	free(list);
}
476
/*
 * Returns nonzero iff stamp is exactly a "YYYYMMDD_HHMMSS" time stamp:
 * eight digits, an underscore, six digits, and then the end of the
 * string.  Scanning stops at the first character which does not fit, so
 * we never read past the terminator of a short string.
 */
static int
backend_backup_is_stamp(const char *stamp)
{
	/*
	 * year, month, day, hour, min, sec, plus an '_'.
	 */
	const size_t stamplen = 4 + 5*2 + 1;
	const size_t baroffset = 4 + 2*2;
	size_t i;

	for (i = 0; i < stamplen; i++) {
		char c = stamp[i];

		if (i == baroffset) {
			if (c != '_')
				return (0);
		} else if (c < '0' || c > '9') {
			return (0);
		}
	}
	return (stamp[stamplen] == '\0');
}

/*
 * Builds an array of the names of the existing backup files, newest time
 * stamp first.  The path is a single buffer, of which only the first
 * pathlen bytes are considered:
 *
 *	/this/is/a/full/path/to/repository-name-YYYYMMDD_HHMMSS
 *	^pathname		^	       ^(pathname+pathlen)
 *				basename
 *
 * The directory searched is everything up to basename, or "." if the
 * path contains no '/'.  An entry matches if it is named
 * "<basename>-YYYYMMDD_HHMMSS".  The buffer is modified while we work,
 * but is restored before we return, on success and on failure.
 *
 * On success, *out_arg is the array (NULL if it is empty), holding
 * strdup()ed entry names without any directory part; release it with
 * backend_backup_cleanup().
 *
 * Returns the number of elements in the array, 0 if there are no previous
 * backups, or -1 on error (in which case *out_arg is NULL).
 */
static ssize_t
backend_backup_get_prev(char *pathname, size_t pathlen, const char ***out_arg)
{
	char **list = NULL;
	size_t nlist = 0;

	char *pathend = pathname + pathlen;
	char saved_end = *pathend;
	char saved_start;

	char *dirname, *basename;
	size_t baselen;

	struct dirent *ent;
	DIR *dir;

	/* cut the buffer down to the first pathlen bytes */
	*pathend = '\0';

	basename = strrchr(pathname, '/');
	if (basename == NULL) {
		basename = pathname;
		dirname = ".";
	} else {
		assert(pathend > pathname && basename < pathend);
		basename++;
		dirname = pathname;
	}

	baselen = strlen(basename);

	/*
	 * Chop the path off at basename just long enough for the
	 * opendir(), then put it back.
	 */
	saved_start = basename[0];
	basename[0] = '\0';
	dir = opendir(dirname);
	basename[0] = saved_start;

	if (dir == NULL)
		goto fail;

	while ((ent = readdir(dir)) != NULL) {
		char *cand, *cand_stamp;
		size_t pos;

		/*
		 * Anything which is not basename-YYYYMMDD_HHMMSS is
		 * ignored.
		 */
		if (strncmp(ent->d_name, basename, baselen) != 0 ||
		    ent->d_name[baselen] != '-' ||
		    !backend_backup_is_stamp(ent->d_name + baselen + 1))
			continue;

		cand = strdup(ent->d_name);
		if (cand == NULL)
			goto fail_closedir;
		cand_stamp = strrchr(cand, '-');

		/*
		 * Insertion sort, largest stamp first.  Once the candidate
		 * outranks a slot it takes that slot, and the entry it
		 * displaced becomes the candidate for the slots after it.
		 * Whatever we are still carrying at the end is appended.
		 */
		for (pos = 0; pos < nlist; pos++) {
			char *cur = list[pos];
			char *cur_stamp = strrchr(cur, '-');
			int cmp = strcmp(cand_stamp, cur_stamp);

			if (cmp == 0)
				cmp = strcmp(cand, cur);

			if (cmp == 0) {
				/* already have it; drop the duplicate */
				free(cand);
				cand = NULL;
				break;
			}
			if (cmp > 0) {
				list[pos] = cand;
				cand = cur;
				cand_stamp = cur_stamp;
			}
		}

		if (pos != nlist) {
			/* we left the loop early, via the duplicate case */
			assert(cand == NULL);
		} else {
			char **grown = realloc(list,
			    (nlist + 1) * sizeof (*list));

			if (grown == NULL) {
				free(cand);
				goto fail_closedir;
			}

			list = grown;
			list[nlist++] = cand;
		}
	}
	(void) closedir(dir);

	/* basename + baselen is pathend; restore the byte we zeroed */
	basename[baselen] = saved_end;

	*out_arg = (const char **)list;
	return (nlist);

fail_closedir:
	(void) closedir(dir);
fail:
	basename[0] = saved_start;
	*pathend = saved_end;

	backend_backup_cleanup((const char **)list, nlist);

	*out_arg = NULL;
	return (-1);
}
627
628/*
629 * Copies the repository path into out, a buffer of out_len bytes,
630 * removes the ".db" (or whatever) extension, and, if name is non-NULL,
631 * appends "-name" to it.  If name is non-NULL, it can fail with:
632 *
633 *	_TRUNCATED	will not fit in buffer.
634 *	_BAD_REQUEST	name is not a valid identifier
635 */
636static rep_protocol_responseid_t
637backend_backup_base(sqlite_backend_t *be, const char *name,
638    char *out, size_t out_len)
639{
640	char *p, *q;
641	size_t len;
642
643	/*
644	 * for paths of the form /path/to/foo.db, we truncate at the final
645	 * '.'.
646	 */
647	(void) strlcpy(out, be->be_path, out_len);
648
649	p = strrchr(out, '/');
650	q = strrchr(out, '.');
651
652	if (p != NULL && q != NULL && q > p)
653		*q = 0;
654
655	if (name != NULL) {
656		len = strlen(out);
657		assert(len < out_len);
658
659		out += len;
660		out_len -= len;
661
662		len = strlen(name);
663
664		/*
665		 * verify that the name tag is entirely alphabetic,
666		 * non-empty, and not too long.
667		 */
668		if (len == 0 || len >= REP_PROTOCOL_NAME_LEN ||
669		    uu_check_name(name, UU_NAME_DOMAIN) < 0)
670			return (REP_PROTOCOL_FAIL_BAD_REQUEST);
671
672		if (snprintf(out, out_len, "-%s", name) >= out_len)
673			return (REP_PROTOCOL_FAIL_TRUNCATED);
674	}
675
676	return (REP_PROTOCOL_SUCCESS);
677}
678
/*
 * See if a backup is needed.  We do a backup unless both files are
 * byte-for-byte identical.
 *
 * Returns 0 only if rep_name and backup_name are two distinct files with
 * identical contents.  Returns 1 in every other case, including when
 * either file cannot be opened, stat()ed or read.
 *
 * Descriptor ownership:  once fdopen() succeeds, the stream owns the
 * descriptor and fclose() will close it, so we forget our copy of the
 * number.  Closing it a second time would at best fail with EBADF; in a
 * multi-threaded process it could close a descriptor which another
 * thread has just been handed.
 */
static int
backend_check_backup_needed(const char *rep_name, const char *backup_name)
{
	int repfd = open(rep_name, O_RDONLY);
	int fd = open(backup_name, O_RDONLY);
	struct stat s_rep, s_backup;
	int c1, c2;
	int needed = 1;

	FILE *f_rep = NULL;
	FILE *f_backup = NULL;

	if (repfd < 0 || fd < 0)
		goto out;

	if (fstat(repfd, &s_rep) < 0 || fstat(fd, &s_backup) < 0)
		goto out;

	/*
	 * if they are the same file, we need to do a backup to break the
	 * hard link or symlink involved.
	 */
	if (s_rep.st_ino == s_backup.st_ino && s_rep.st_dev == s_backup.st_dev)
		goto out;

	if (s_rep.st_size != s_backup.st_size)
		goto out;

	if ((f_rep = fdopen(repfd, "r")) == NULL)
		goto out;
	repfd = -1;		/* now owned by f_rep */

	if ((f_backup = fdopen(fd, "r")) == NULL)
		goto out;
	fd = -1;		/* now owned by f_backup */

	do {
		c1 = getc(f_rep);
		c2 = getc(f_backup);
		if (c1 != c2)
			goto out;
	} while (c1 != EOF);

	/* a read error also shows up as EOF; only a clean EOF counts */
	if (!ferror(f_rep) && !ferror(f_backup))
		needed = 0;

out:
	if (f_rep != NULL)
		(void) fclose(f_rep);
	if (f_backup != NULL)
		(void) fclose(f_backup);
	if (repfd >= 0)
		(void) close(repfd);
	if (fd >= 0)
		(void) close(fd);
	return (needed);
}
740
741/*
742 * Can return:
743 *	_BAD_REQUEST		name is not valid
744 *	_TRUNCATED		name is too long for current repository path
745 *	_UNKNOWN		failed for unknown reason (details written to
746 *				console)
747 *	_BACKEND_READONLY	backend is not writable
748 *
749 *	_SUCCESS		Backup completed successfully.
750 */
751static rep_protocol_responseid_t
752backend_create_backup_locked(sqlite_backend_t *be, const char *name)
753{
754	const char **old_list;
755	ssize_t old_sz;
756	ssize_t old_max = max_repository_backups;
757	ssize_t cur;
758
759	char *finalname;
760
761	char finalpath[PATH_MAX];
762	char tmppath[PATH_MAX];
763	char buf[8192];
764	int infd, outfd;
765	size_t len;
766	off_t inlen, outlen, offset;
767
768	time_t now;
769	struct tm now_tm;
770
771	rep_protocol_responseid_t result;
772
773	if (be->be_readonly)
774		return (REP_PROTOCOL_FAIL_BACKEND_READONLY);
775
776	result = backend_backup_base(be, name, finalpath, sizeof (finalpath));
777	if (result != REP_PROTOCOL_SUCCESS)
778		return (result);
779
780	if (!backend_check_backup_needed(be->be_path, finalpath)) {
781		return (REP_PROTOCOL_SUCCESS);
782	}
783
784	/*
785	 * remember the original length, and the basename location
786	 */
787	len = strlen(finalpath);
788	finalname = strrchr(finalpath, '/');
789	if (finalname != NULL)
790		finalname++;
791	else
792		finalname = finalpath;
793
794	(void) strlcpy(tmppath, finalpath, sizeof (tmppath));
795	if (strlcat(tmppath, "-tmpXXXXXX", sizeof (tmppath)) >=
796	    sizeof (tmppath))
797		return (REP_PROTOCOL_FAIL_TRUNCATED);
798
799	now = time(NULL);
800	if (localtime_r(&now, &now_tm) == NULL) {
801		configd_critical(
802		    "\"%s\" backup failed: localtime(3C) failed: %s\n", name,
803		    be->be_path, strerror(errno));
804		return (REP_PROTOCOL_FAIL_UNKNOWN);
805	}
806
807	if (strftime(finalpath + len, sizeof (finalpath) - len,
808	    "-%Y""%m""%d""_""%H""%M""%S", &now_tm) >=
809	    sizeof (finalpath) - len) {
810		return (REP_PROTOCOL_FAIL_TRUNCATED);
811	}
812
813	infd = open(be->be_path, O_RDONLY);
814	if (infd < 0) {
815		configd_critical("\"%s\" backup failed: opening %s: %s\n", name,
816		    be->be_path, strerror(errno));
817		return (REP_PROTOCOL_FAIL_UNKNOWN);
818	}
819
820	outfd = mkstemp(tmppath);
821	if (outfd < 0) {
822		configd_critical("\"%s\" backup failed: mkstemp(%s): %s\n",
823		    name, tmppath, strerror(errno));
824		(void) close(infd);
825		return (REP_PROTOCOL_FAIL_UNKNOWN);
826	}
827
828	for (;;) {
829		do {
830			inlen = read(infd, buf, sizeof (buf));
831		} while (inlen < 0 && errno == EINTR);
832
833		if (inlen <= 0)
834			break;
835
836		for (offset = 0; offset < inlen; offset += outlen) {
837			do {
838				outlen = write(outfd, buf + offset,
839				    inlen - offset);
840			} while (outlen < 0 && errno == EINTR);
841
842			if (outlen >= 0)
843				continue;
844
845			configd_critical(
846			    "\"%s\" backup failed: write to %s: %s\n",
847			    name, tmppath, strerror(errno));
848			result = REP_PROTOCOL_FAIL_UNKNOWN;
849			goto fail;
850		}
851	}
852
853	if (inlen < 0) {
854		configd_critical(
855		    "\"%s\" backup failed: read from %s: %s\n",
856		    name, be->be_path, strerror(errno));
857		goto fail;
858	}
859
860	/*
861	 * grab the old list before doing our re-name.
862	 */
863	if (old_max > 0)
864		old_sz = backend_backup_get_prev(finalpath, len, &old_list);
865
866	if (rename(tmppath, finalpath) < 0) {
867		configd_critical(
868		    "\"%s\" backup failed: rename(%s, %s): %s\n",
869		    name, tmppath, finalpath, strerror(errno));
870		result = REP_PROTOCOL_FAIL_UNKNOWN;
871		goto fail;
872	}
873
874	tmppath[len] = 0;	/* strip -XXXXXX, for reference symlink */
875
876	(void) unlink(tmppath);
877	if (symlink(finalname, tmppath) < 0) {
878		configd_critical(
879		    "\"%s\" backup completed, but updating "
880		    "\"%s\" symlink to \"%s\" failed: %s\n",
881		    name, tmppath, finalname, strerror(errno));
882	}
883
884	if (old_max > 0 && old_sz > 0) {
885		/* unlink all but the first (old_max - 1) files */
886		for (cur = old_max - 1; cur < old_sz; cur++) {
887			(void) strlcpy(finalname, old_list[cur],
888			    sizeof (finalpath) - (finalname - finalpath));
889			if (unlink(finalpath) < 0)
890				configd_critical(
891				    "\"%s\" backup completed, but removing old "
892				    "file \"%s\" failed: %s\n",
893				    name, finalpath, strerror(errno));
894		}
895
896		backend_backup_cleanup(old_list, old_sz);
897	}
898
899	result = REP_PROTOCOL_SUCCESS;
900
901fail:
902	(void) close(infd);
903	(void) close(outfd);
904	if (result != REP_PROTOCOL_SUCCESS)
905		(void) unlink(tmppath);
906
907	return (result);
908}
909
910static int
911backend_check_readonly(sqlite_backend_t *be, int writing, hrtime_t t)
912{
913	char *errp;
914	struct sqlite *new;
915	int r;
916
917	assert(be->be_readonly);
918	assert(be == bes[BACKEND_TYPE_NORMAL]);
919
920	/*
921	 * If we don't *need* to be writable, only check every once in a
922	 * while.
923	 */
924	if (!writing) {
925		if ((uint64_t)(t - be->be_lastcheck) <
926		    BACKEND_READONLY_CHECK_INTERVAL)
927			return (REP_PROTOCOL_SUCCESS);
928		be->be_lastcheck = t;
929	}
930
931	new = sqlite_open(be->be_path, 0600, &errp);
932	if (new == NULL) {
933		backend_panic("reopening %s: %s\n", be->be_path, errp);
934		/*NOTREACHED*/
935	}
936	r = backend_is_readonly(new, be->be_path);
937
938	if (r != SQLITE_OK) {
939		sqlite_close(new);
940		if (writing)
941			return (REP_PROTOCOL_FAIL_BACKEND_READONLY);
942		return (REP_PROTOCOL_SUCCESS);
943	}
944
945	/*
946	 * We can write!  Swap the db handles, mark ourself writable,
947	 * and make a backup.
948	 */
949	sqlite_close(be->be_db);
950	be->be_db = new;
951	be->be_readonly = 0;
952
953	if (backend_create_backup_locked(be, REPOSITORY_BOOT_BACKUP) !=
954	    REP_PROTOCOL_SUCCESS) {
955		configd_critical(
956		    "unable to create \"%s\" backup of \"%s\"\n",
957		    REPOSITORY_BOOT_BACKUP, be->be_path);
958	}
959
960	return (REP_PROTOCOL_SUCCESS);
961}
962
963/*
964 * If t is not BACKEND_TYPE_NORMAL, can fail with
965 *   _BACKEND_ACCESS - backend does not exist
966 *
967 * If writing is nonzero, can also fail with
968 *   _BACKEND_READONLY - backend is read-only
969 */
970static int
971backend_lock(backend_type_t t, int writing, sqlite_backend_t **bep)
972{
973	sqlite_backend_t *be = NULL;
974	hrtime_t ts, vts;
975
976	*bep = NULL;
977
978	assert(t == BACKEND_TYPE_NORMAL ||
979	    t == BACKEND_TYPE_NONPERSIST);
980
981	be = bes[t];
982	if (t == BACKEND_TYPE_NORMAL)
983		assert(be != NULL);		/* should always be there */
984
985	if (be == NULL)
986		return (REP_PROTOCOL_FAIL_BACKEND_ACCESS);
987
988	if (backend_panic_thread != 0)
989		backend_panic(NULL);		/* don't proceed */
990
991	ts = gethrtime();
992	vts = gethrvtime();
993	(void) pthread_mutex_lock(&be->be_lock);
994	UPDATE_TOTALS_WR(be, writing, bt_lock, ts, vts);
995
996	if (backend_panic_thread != 0) {
997		(void) pthread_mutex_unlock(&be->be_lock);
998		backend_panic(NULL);		/* don't proceed */
999	}
1000	be->be_thread = pthread_self();
1001
1002	if (be->be_readonly) {
1003		int r;
1004		assert(t == BACKEND_TYPE_NORMAL);
1005
1006		r = backend_check_readonly(be, writing, ts);
1007		if (r != REP_PROTOCOL_SUCCESS) {
1008			be->be_thread = 0;
1009			(void) pthread_mutex_unlock(&be->be_lock);
1010			return (r);
1011		}
1012	}
1013
1014	if (backend_do_trace)
1015		(void) sqlite_trace(be->be_db, backend_trace_sql, be);
1016	else
1017		(void) sqlite_trace(be->be_db, NULL, NULL);
1018
1019	be->be_writing = writing;
1020	*bep = be;
1021	return (REP_PROTOCOL_SUCCESS);
1022}
1023
1024static void
1025backend_unlock(sqlite_backend_t *be)
1026{
1027	be->be_writing = 0;
1028	be->be_thread = 0;
1029	(void) pthread_mutex_unlock(&be->be_lock);
1030}
1031
1032static void
1033backend_destroy(sqlite_backend_t *be)
1034{
1035	if (be->be_db != NULL) {
1036		sqlite_close(be->be_db);
1037		be->be_db = NULL;
1038	}
1039	be->be_thread = 0;
1040	(void) pthread_mutex_unlock(&be->be_lock);
1041	(void) pthread_mutex_destroy(&be->be_lock);
1042}
1043
1044static void
1045backend_create_finish(backend_type_t backend_id, sqlite_backend_t *be)
1046{
1047	assert(MUTEX_HELD(&be->be_lock));
1048	assert(be == &be_info[backend_id]);
1049
1050	bes[backend_id] = be;
1051	(void) pthread_mutex_unlock(&be->be_lock);
1052}
1053
/*
 * Write all of the NUL-terminated string mess to fd.  Short writes are
 * continued, and a write interrupted by a signal (EINTR) is retried, as
 * in the backup copy loop elsewhere in this file.
 *
 * The length and the per-call byte count are kept in size_t and ssize_t,
 * which are what strlen() and write() deal in, rather than being narrowed
 * to int.
 *
 * Returns 0 once everything has been written (at once, for an empty
 * string), or -1 if write() fails for any other reason, with errno as
 * write() left it.
 */
static int
backend_fd_write(int fd, const char *mess)
{
	size_t len = strlen(mess);
	ssize_t written;

	while (len > 0) {
		written = write(fd, mess, len);
		if (written < 0) {
			if (errno == EINTR)
				continue;
			return (-1);
		}
		mess += written;
		len -= (size_t)written;
	}
	return (0);
}
1068
1069/*
1070 * Can return:
1071 *	_BAD_REQUEST		name is not valid
1072 *	_TRUNCATED		name is too long for current repository path
1073 *	_UNKNOWN		failed for unknown reason (details written to
1074 *				console)
1075 *	_BACKEND_READONLY	backend is not writable
1076 *
1077 *	_SUCCESS		Backup completed successfully.
1078 */
1079rep_protocol_responseid_t
1080backend_create_backup(const char *name)
1081{
1082	rep_protocol_responseid_t result;
1083	sqlite_backend_t *be;
1084
1085	result = backend_lock(BACKEND_TYPE_NORMAL, 0, &be);
1086	if (result != REP_PROTOCOL_SUCCESS)
1087		return (result);
1088
1089	result = backend_create_backup_locked(be, name);
1090	backend_unlock(be);
1091
1092	return (result);
1093}
1094
1095/*ARGSUSED*/
1096static int
1097backend_integrity_callback(void *private, int narg, char **vals, char **cols)
1098{
1099	char **out = private;
1100	char *old = *out;
1101	char *new;
1102	const char *info;
1103	size_t len;
1104	int x;
1105
1106	for (x = 0; x < narg; x++) {
1107		if ((info = vals[x]) != NULL &&
1108		    strcmp(info, "ok") != 0) {
1109			len = (old == NULL)? 0 : strlen(old);
1110			len += strlen(info) + 2;	/* '\n' + '\0' */
1111
1112			new = realloc(old, len);
1113			if (new == NULL)
1114				return (BACKEND_CALLBACK_ABORT);
1115			if (old == NULL)
1116				new[0] = 0;
1117			old = *out = new;
1118			(void) strlcat(new, info, len);
1119			(void) strlcat(new, "\n", len);
1120		}
1121	}
1122	return (BACKEND_CALLBACK_CONTINUE);
1123}
1124
/*
 * Result codes of backend_create().  The one-line descriptions are read
 * from the names and from the returns visible at the top of
 * backend_create(); confirm against the whole function.
 */
#define	BACKEND_CREATE_LOCKED		-2	/* sqlite: busy or locked */
#define	BACKEND_CREATE_FAIL		-1
#define	BACKEND_CREATE_SUCCESS		0
#define	BACKEND_CREATE_READONLY		1
#define	BACKEND_CREATE_NEED_INIT	2	/* no schema in the db yet */
1130static int
1131backend_create(backend_type_t backend_id, const char *db_file,
1132    sqlite_backend_t **bep)
1133{
1134	char *errp;
1135	char *integrity_results = NULL;
1136	sqlite_backend_t *be;
1137	int r;
1138	uint32_t val = -1UL;
1139	struct run_single_int_info info;
1140	int fd;
1141
1142	assert(backend_id >= 0 && backend_id < BACKEND_TYPE_TOTAL);
1143
1144	be = &be_info[backend_id];
1145	assert(be->be_db == NULL);
1146
1147	(void) pthread_mutex_init(&be->be_lock, NULL);
1148	(void) pthread_mutex_lock(&be->be_lock);
1149
1150	be->be_type = backend_id;
1151	be->be_path = strdup(db_file);
1152	if (be->be_path == NULL) {
1153		perror("malloc");
1154		goto fail;
1155	}
1156
1157	be->be_db = sqlite_open(be->be_path, 0600, &errp);
1158
1159	if (be->be_db == NULL) {
1160		if (strstr(errp, "out of memory") != NULL) {
1161			configd_critical("%s: %s\n", db_file, errp);
1162			free(errp);
1163
1164			goto fail;
1165		}
1166
1167		/* report it as an integrity failure */
1168		integrity_results = errp;
1169		errp = NULL;
1170		goto integrity_fail;
1171	}
1172
1173	/*
1174	 * check if we are inited and of the correct schema version
1175	 *
1176	 * Eventually, we'll support schema upgrade here.
1177	 */
1178	info.rs_out = &val;
1179	info.rs_result = REP_PROTOCOL_FAIL_NOT_FOUND;
1180
1181	r = sqlite_exec(be->be_db, "SELECT schema_version FROM schema_version;",
1182	    run_single_int_callback, &info, &errp);
1183	if (r == SQLITE_ERROR &&
1184	    strcmp("no such table: schema_version", errp) == 0) {
1185		free(errp);
1186		/*
1187		 * Could be an empty repository, could be pre-schema_version
1188		 * schema.  Check for id_tbl, which has always been there.
1189		 */
1190		r = sqlite_exec(be->be_db, "SELECT count() FROM id_tbl;",
1191		    NULL, NULL, &errp);
1192		if (r == SQLITE_ERROR &&
1193		    strcmp("no such table: id_tbl", errp) == 0) {
1194			free(errp);
1195			*bep = be;
1196			return (BACKEND_CREATE_NEED_INIT);
1197		}
1198
1199		configd_critical("%s: schema version mismatch\n", db_file);
1200		goto fail;
1201	}
1202	if (r == SQLITE_BUSY || r == SQLITE_LOCKED) {
1203		free(errp);
1204		*bep = NULL;
1205		backend_destroy(be);
1206		return (BACKEND_CREATE_LOCKED);
1207	}
1208	if (r == SQLITE_OK) {
1209		if (info.rs_result == REP_PROTOCOL_FAIL_NOT_FOUND ||
1210		    val != BACKEND_SCHEMA_VERSION) {
1211			configd_critical("%s: schema version mismatch\n",
1212			    db_file);
1213			goto fail;
1214		}
1215	}
1216
1217	/*
1218	 * pull in the whole database sequentially.
1219	 */
1220	if ((fd = open(db_file, O_RDONLY)) >= 0) {
1221		size_t sz = 64 * 1024;
1222		char *buffer = malloc(sz);
1223		if (buffer != NULL) {
1224			while (read(fd, buffer, sz) > 0)
1225				;
1226			free(buffer);
1227		}
1228		(void) close(fd);
1229	}
1230
1231	/*
1232	 * run an integrity check
1233	 */
1234	r = sqlite_exec(be->be_db, "PRAGMA integrity_check;",
1235	    backend_integrity_callback, &integrity_results, &errp);
1236
1237	if (r == SQLITE_BUSY || r == SQLITE_LOCKED) {
1238		free(errp);
1239		*bep = NULL;
1240		backend_destroy(be);
1241		return (BACKEND_CREATE_LOCKED);
1242	}
1243	if (r == SQLITE_ABORT) {
1244		free(errp);
1245		errp = NULL;
1246		integrity_results = "out of memory running integrity check\n";
1247	} else if (r != SQLITE_OK && integrity_results == NULL) {
1248		integrity_results = errp;
1249		errp = NULL;
1250	}
1251
1252integrity_fail:
1253	if (integrity_results != NULL) {
1254		const char *fname = "/etc/svc/volatile/db_errors";
1255		if ((fd = open(fname, O_CREAT|O_WRONLY|O_APPEND, 0600)) < 0) {
1256			fname = NULL;
1257		} else {
1258			if (backend_fd_write(fd, "\n\n") < 0 ||
1259			    backend_fd_write(fd, db_file) < 0 ||
1260			    backend_fd_write(fd,
1261			    ": PRAGMA integrity_check; failed.  Results:\n") <
1262			    0 || backend_fd_write(fd, integrity_results) < 0 ||
1263			    backend_fd_write(fd, "\n\n") < 0) {
1264				fname = NULL;
1265			}
1266			(void) close(fd);
1267		}
1268
1269		if (!is_main_repository ||
1270		    backend_id == BACKEND_TYPE_NONPERSIST) {
1271			if (fname != NULL)
1272				configd_critical(
1273				    "%s: integrity check failed. Details in "
1274				    "%s\n", db_file, fname);
1275			else
1276				configd_critical(
1277				    "%s: integrity check failed.\n",
1278				    db_file);
1279		} else {
1280			(void) fprintf(stderr,
1281"\n"
1282"svc.configd: smf(5) database integrity check of:\n"
1283"\n"
1284"    %s\n"
1285"\n"
1286"  failed. The database might be damaged or a media error might have\n"
1287"  prevented it from being verified.  Additional information useful to\n"
1288"  your service provider%s%s\n"
1289"\n"
1290"  The system will not be able to boot until you have restored a working\n"
1291"  database.  svc.startd(1M) will provide a sulogin(1M) prompt for recovery\n"
1292"  purposes.  The command:\n"
1293"\n"
1294"    /lib/svc/bin/restore_repository\n"
1295"\n"
1296"  can be run to restore a backup version of your repository.  See\n"
1297"  http://sun.com/msg/SMF-8000-MY for more information.\n"
1298"\n",
1299			    db_file,
1300			    (fname == NULL)? ":\n\n" : " is in:\n\n    ",
1301			    (fname == NULL)? integrity_results : fname);
1302		}
1303		free(errp);
1304		goto fail;
1305	}
1306
1307	/*
1308	 * check if we are writable
1309	 */
1310	r = backend_is_readonly(be->be_db, be->be_path);
1311
1312	if (r == SQLITE_BUSY || r == SQLITE_LOCKED) {
1313		free(errp);
1314		*bep = NULL;
1315		backend_destroy(be);
1316		return (BACKEND_CREATE_LOCKED);
1317	}
1318	if (r != SQLITE_OK && r != SQLITE_FULL) {
1319		free(errp);
1320		be->be_readonly = 1;
1321		*bep = be;
1322		return (BACKEND_CREATE_READONLY);
1323	}
1324	*bep = be;
1325	return (BACKEND_CREATE_SUCCESS);
1326
1327fail:
1328	*bep = NULL;
1329	backend_destroy(be);
1330	return (BACKEND_CREATE_FAIL);
1331}
1332
/*
 * Round arg up to the nearest power of two (a value that already is a
 * power of two is returned unchanged).
 *
 * In twos-complement arithmetic (arg & -arg) isolates the lowest set bit
 * of arg.  Adding that bit back in carries it upward, so repeating the
 * step until only a single bit remains yields the next power of two.
 */
static size_t
round_up_to_p2(size_t arg)
{
	size_t lowest;

	/*
	 * Zero has no power-of-two ceiling, and a value with the top bit
	 * set would carry out of the type and produce zero.
	 */
	assert(arg > 0 && ((ssize_t)arg > 0));

	for (;;) {
		lowest = arg & -arg;
		if (lowest == arg)
			break;			/* exactly one bit left */
		arg += lowest;
	}

	return (arg);
}
1350
1351/*
1352 * Returns
1353 *   _NO_RESOURCES - out of memory
1354 *   _BACKEND_ACCESS - backend type t (other than _NORMAL) doesn't exist
1355 *   _DONE - callback aborted query
1356 *   _SUCCESS
1357 */
1358int
1359backend_run(backend_type_t t, backend_query_t *q,
1360    backend_run_callback_f *cb, void *data)
1361{
1362	char *errmsg = NULL;
1363	int ret;
1364	sqlite_backend_t *be;
1365	hrtime_t ts, vts;
1366
1367	if (q == NULL || q->bq_buf == NULL)
1368		return (REP_PROTOCOL_FAIL_NO_RESOURCES);
1369
1370	if ((ret = backend_lock(t, 0, &be)) != REP_PROTOCOL_SUCCESS)
1371		return (ret);
1372
1373	ts = gethrtime();
1374	vts = gethrvtime();
1375	ret = sqlite_exec(be->be_db, q->bq_buf, cb, data, &errmsg);
1376	UPDATE_TOTALS(be, bt_exec, ts, vts);
1377	ret = backend_error(be, ret, errmsg);
1378	backend_unlock(be);
1379
1380	return (ret);
1381}
1382
1383/*
1384 * Starts a "read-only" transaction -- i.e., locks out writers as long
1385 * as it is active.
1386 *
1387 * Fails with
1388 *   _NO_RESOURCES - out of memory
1389 *
1390 * If t is not _NORMAL, can also fail with
1391 *   _BACKEND_ACCESS - backend does not exist
1392 *
1393 * If writable is true, can also fail with
1394 *   _BACKEND_READONLY
1395 */
1396static int
1397backend_tx_begin_common(backend_type_t t, backend_tx_t **txp, int writable)
1398{
1399	backend_tx_t *ret;
1400	sqlite_backend_t *be;
1401	int r;
1402
1403	*txp = NULL;
1404
1405	ret = uu_zalloc(sizeof (*ret));
1406	if (ret == NULL)
1407		return (REP_PROTOCOL_FAIL_NO_RESOURCES);
1408
1409	if ((r = backend_lock(t, writable, &be)) != REP_PROTOCOL_SUCCESS) {
1410		uu_free(ret);
1411		return (r);
1412	}
1413
1414	ret->bt_be = be;
1415	ret->bt_readonly = !writable;
1416	ret->bt_type = t;
1417	ret->bt_full = 0;
1418
1419	*txp = ret;
1420	return (REP_PROTOCOL_SUCCESS);
1421}
1422
1423int
1424backend_tx_begin_ro(backend_type_t t, backend_tx_t **txp)
1425{
1426	return (backend_tx_begin_common(t, txp, 0));
1427}
1428
1429static void
1430backend_tx_end(backend_tx_t *tx)
1431{
1432	sqlite_backend_t *be;
1433
1434	be = tx->bt_be;
1435
1436	if (tx->bt_full) {
1437		struct sqlite *new;
1438
1439		/*
1440		 * sqlite tends to be sticky with SQLITE_FULL, so we try
1441		 * to get a fresh database handle if we got a FULL warning
1442		 * along the way.  If that fails, no harm done.
1443		 */
1444		new = sqlite_open(be->be_path, 0600, NULL);
1445		if (new != NULL) {
1446			sqlite_close(be->be_db);
1447			be->be_db = new;
1448		}
1449	}
1450	backend_unlock(be);
1451	tx->bt_be = NULL;
1452	uu_free(tx);
1453}
1454
1455void
1456backend_tx_end_ro(backend_tx_t *tx)
1457{
1458	assert(tx->bt_readonly);
1459	backend_tx_end(tx);
1460}
1461
1462/*
1463 * Fails with
1464 *   _NO_RESOURCES - out of memory
1465 *   _BACKEND_ACCESS
1466 *   _BACKEND_READONLY
1467 */
1468int
1469backend_tx_begin(backend_type_t t, backend_tx_t **txp)
1470{
1471	int r;
1472	char *errmsg;
1473	hrtime_t ts, vts;
1474
1475	r = backend_tx_begin_common(t, txp, 1);
1476	if (r != REP_PROTOCOL_SUCCESS)
1477		return (r);
1478
1479	ts = gethrtime();
1480	vts = gethrvtime();
1481	r = sqlite_exec((*txp)->bt_be->be_db, "BEGIN TRANSACTION", NULL, NULL,
1482	    &errmsg);
1483	UPDATE_TOTALS((*txp)->bt_be, bt_exec, ts, vts);
1484	if (r == SQLITE_FULL)
1485		(*txp)->bt_full = 1;
1486	r = backend_error((*txp)->bt_be, r, errmsg);
1487
1488	if (r != REP_PROTOCOL_SUCCESS) {
1489		assert(r != REP_PROTOCOL_DONE);
1490		(void) sqlite_exec((*txp)->bt_be->be_db,
1491		    "ROLLBACK TRANSACTION", NULL, NULL, NULL);
1492		backend_tx_end(*txp);
1493		*txp = NULL;
1494		return (r);
1495	}
1496
1497	(*txp)->bt_readonly = 0;
1498
1499	return (REP_PROTOCOL_SUCCESS);
1500}
1501
1502void
1503backend_tx_rollback(backend_tx_t *tx)
1504{
1505	int r;
1506	char *errmsg;
1507	sqlite_backend_t *be;
1508	hrtime_t ts, vts;
1509
1510	assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
1511	be = tx->bt_be;
1512
1513	ts = gethrtime();
1514	vts = gethrvtime();
1515	r = sqlite_exec(be->be_db, "ROLLBACK TRANSACTION", NULL, NULL,
1516	    &errmsg);
1517	UPDATE_TOTALS(be, bt_exec, ts, vts);
1518	if (r == SQLITE_FULL)
1519		tx->bt_full = 1;
1520	(void) backend_error(be, r, errmsg);
1521
1522	backend_tx_end(tx);
1523}
1524
1525/*
1526 * Fails with
1527 *   _NO_RESOURCES - out of memory
1528 */
1529int
1530backend_tx_commit(backend_tx_t *tx)
1531{
1532	int r, r2;
1533	char *errmsg;
1534	sqlite_backend_t *be;
1535	hrtime_t ts, vts;
1536
1537	assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
1538	be = tx->bt_be;
1539	ts = gethrtime();
1540	vts = gethrvtime();
1541	r = sqlite_exec(be->be_db, "COMMIT TRANSACTION", NULL, NULL,
1542	    &errmsg);
1543	UPDATE_TOTALS(be, bt_exec, ts, vts);
1544	if (r == SQLITE_FULL)
1545		tx->bt_full = 1;
1546
1547	r = backend_error(be, r, errmsg);
1548	assert(r != REP_PROTOCOL_DONE);
1549
1550	if (r != REP_PROTOCOL_SUCCESS) {
1551		r2 = sqlite_exec(be->be_db, "ROLLBACK TRANSACTION", NULL, NULL,
1552		    &errmsg);
1553		r2 = backend_error(be, r2, errmsg);
1554		if (r2 != REP_PROTOCOL_SUCCESS)
1555			backend_panic("cannot rollback failed commit");
1556
1557		backend_tx_end(tx);
1558		return (r);
1559	}
1560	backend_tx_end(tx);
1561	return (REP_PROTOCOL_SUCCESS);
1562}
1563
1564static const char *
1565id_space_to_name(enum id_space id)
1566{
1567	switch (id) {
1568	case BACKEND_ID_SERVICE_INSTANCE:
1569		return ("SI");
1570	case BACKEND_ID_PROPERTYGRP:
1571		return ("PG");
1572	case BACKEND_ID_GENERATION:
1573		return ("GEN");
1574	case BACKEND_ID_PROPERTY:
1575		return ("PROP");
1576	case BACKEND_ID_VALUE:
1577		return ("VAL");
1578	case BACKEND_ID_SNAPNAME:
1579		return ("SNAME");
1580	case BACKEND_ID_SNAPSHOT:
1581		return ("SHOT");
1582	case BACKEND_ID_SNAPLEVEL:
1583		return ("SLVL");
1584	default:
1585		abort();
1586		/*NOTREACHED*/
1587	}
1588}
1589
1590/*
1591 * Returns a new id or 0 if the id argument is invalid or the query fails.
1592 */
1593uint32_t
1594backend_new_id(backend_tx_t *tx, enum id_space id)
1595{
1596	struct run_single_int_info info;
1597	uint32_t new_id = 0;
1598	const char *name = id_space_to_name(id);
1599	char *errmsg;
1600	int ret;
1601	sqlite_backend_t *be;
1602	hrtime_t ts, vts;
1603
1604	assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
1605	be = tx->bt_be;
1606
1607	info.rs_out = &new_id;
1608	info.rs_result = REP_PROTOCOL_FAIL_NOT_FOUND;
1609
1610	ts = gethrtime();
1611	vts = gethrvtime();
1612	ret = sqlite_exec_printf(be->be_db,
1613	    "SELECT id_next FROM id_tbl WHERE (id_name = '%q');"
1614	    "UPDATE id_tbl SET id_next = id_next + 1 WHERE (id_name = '%q');",
1615	    run_single_int_callback, &info, &errmsg, name, name);
1616	UPDATE_TOTALS(be, bt_exec, ts, vts);
1617	if (ret == SQLITE_FULL)
1618		tx->bt_full = 1;
1619
1620	ret = backend_error(be, ret, errmsg);
1621
1622	if (ret != REP_PROTOCOL_SUCCESS) {
1623		return (0);
1624	}
1625
1626	return (new_id);
1627}
1628
1629/*
1630 * Returns
1631 *   _NO_RESOURCES - out of memory
1632 *   _DONE - callback aborted query
1633 *   _SUCCESS
1634 */
1635int
1636backend_tx_run(backend_tx_t *tx, backend_query_t *q,
1637    backend_run_callback_f *cb, void *data)
1638{
1639	char *errmsg = NULL;
1640	int ret;
1641	sqlite_backend_t *be;
1642	hrtime_t ts, vts;
1643
1644	assert(tx != NULL && tx->bt_be != NULL);
1645	be = tx->bt_be;
1646
1647	if (q == NULL || q->bq_buf == NULL)
1648		return (REP_PROTOCOL_FAIL_NO_RESOURCES);
1649
1650	ts = gethrtime();
1651	vts = gethrvtime();
1652	ret = sqlite_exec(be->be_db, q->bq_buf, cb, data, &errmsg);
1653	UPDATE_TOTALS(be, bt_exec, ts, vts);
1654	if (ret == SQLITE_FULL)
1655		tx->bt_full = 1;
1656	ret = backend_error(be, ret, errmsg);
1657
1658	return (ret);
1659}
1660
1661/*
1662 * Returns
1663 *   _NO_RESOURCES - out of memory
1664 *   _NOT_FOUND - the query returned no results
1665 *   _SUCCESS - the query returned a single integer
1666 */
1667int
1668backend_tx_run_single_int(backend_tx_t *tx, backend_query_t *q, uint32_t *buf)
1669{
1670	struct run_single_int_info info;
1671	int ret;
1672
1673	info.rs_out = buf;
1674	info.rs_result = REP_PROTOCOL_FAIL_NOT_FOUND;
1675
1676	ret = backend_tx_run(tx, q, run_single_int_callback, &info);
1677	assert(ret != REP_PROTOCOL_DONE);
1678
1679	if (ret != REP_PROTOCOL_SUCCESS)
1680		return (ret);
1681
1682	return (info.rs_result);
1683}
1684
1685/*
1686 * Fails with
1687 *   _NO_RESOURCES - out of memory
1688 */
1689int
1690backend_tx_run_update(backend_tx_t *tx, const char *format, ...)
1691{
1692	va_list a;
1693	char *errmsg;
1694	int ret;
1695	sqlite_backend_t *be;
1696	hrtime_t ts, vts;
1697
1698	assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
1699	be = tx->bt_be;
1700
1701	va_start(a, format);
1702	ts = gethrtime();
1703	vts = gethrvtime();
1704	ret = sqlite_exec_vprintf(be->be_db, format, NULL, NULL, &errmsg, a);
1705	UPDATE_TOTALS(be, bt_exec, ts, vts);
1706	if (ret == SQLITE_FULL)
1707		tx->bt_full = 1;
1708	va_end(a);
1709	ret = backend_error(be, ret, errmsg);
1710	assert(ret != REP_PROTOCOL_DONE);
1711
1712	return (ret);
1713}
1714
1715/*
1716 * returns REP_PROTOCOL_FAIL_NOT_FOUND if no changes occured
1717 */
1718int
1719backend_tx_run_update_changed(backend_tx_t *tx, const char *format, ...)
1720{
1721	va_list a;
1722	char *errmsg;
1723	int ret;
1724	sqlite_backend_t *be;
1725	hrtime_t ts, vts;
1726
1727	assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
1728	be = tx->bt_be;
1729
1730	va_start(a, format);
1731	ts = gethrtime();
1732	vts = gethrvtime();
1733	ret = sqlite_exec_vprintf(be->be_db, format, NULL, NULL, &errmsg, a);
1734	UPDATE_TOTALS(be, bt_exec, ts, vts);
1735	if (ret == SQLITE_FULL)
1736		tx->bt_full = 1;
1737	va_end(a);
1738
1739	ret = backend_error(be, ret, errmsg);
1740
1741	return (ret);
1742}
1743
1744#define	BACKEND_ADD_SCHEMA(be, file, tbls, idxs) \
1745	(backend_add_schema((be), (file), \
1746	    (tbls), sizeof (tbls) / sizeof (*(tbls)), \
1747	    (idxs), sizeof (idxs) / sizeof (*(idxs))))
1748
1749static int
1750backend_add_schema(sqlite_backend_t *be, const char *file,
1751    struct backend_tbl_info *tbls, int tbl_count,
1752    struct backend_idx_info *idxs, int idx_count)
1753{
1754	int i;
1755	char *errmsg;
1756	int ret;
1757
1758	/*
1759	 * Create the tables.
1760	 */
1761	for (i = 0; i < tbl_count; i++) {
1762		if (tbls[i].bti_name == NULL) {
1763			assert(i + 1 == tbl_count);
1764			break;
1765		}
1766		ret = sqlite_exec_printf(be->be_db,
1767		    "CREATE TABLE %s (%s);\n",
1768		    NULL, NULL, &errmsg, tbls[i].bti_name, tbls[i].bti_cols);
1769
1770		if (ret != SQLITE_OK) {
1771			configd_critical(
1772			    "%s: %s table creation fails: %s\n", file,
1773			    tbls[i].bti_name, errmsg);
1774			free(errmsg);
1775			return (-1);
1776		}
1777	}
1778
1779	/*
1780	 * Make indices on key tables and columns.
1781	 */
1782	for (i = 0; i < idx_count; i++) {
1783		if (idxs[i].bxi_tbl == NULL) {
1784			assert(i + 1 == idx_count);
1785			break;
1786		}
1787
1788		ret = sqlite_exec_printf(be->be_db,
1789		    "CREATE INDEX %s_%s ON %s (%s);\n",
1790		    NULL, NULL, &errmsg, idxs[i].bxi_tbl, idxs[i].bxi_idx,
1791		    idxs[i].bxi_tbl, idxs[i].bxi_cols);
1792
1793		if (ret != SQLITE_OK) {
1794			configd_critical(
1795			    "%s: %s_%s index creation fails: %s\n", file,
1796			    idxs[i].bxi_tbl, idxs[i].bxi_idx, errmsg);
1797			free(errmsg);
1798			return (-1);
1799		}
1800	}
1801	return (0);
1802}
1803
1804static int
1805backend_init_schema(sqlite_backend_t *be, const char *db_file, backend_type_t t)
1806{
1807	int i;
1808	char *errmsg;
1809	int ret;
1810
1811	assert(t == BACKEND_TYPE_NORMAL || t == BACKEND_TYPE_NONPERSIST);
1812
1813	if (t == BACKEND_TYPE_NORMAL) {
1814		ret = BACKEND_ADD_SCHEMA(be, db_file, tbls_normal, idxs_normal);
1815	} else if (t == BACKEND_TYPE_NONPERSIST) {
1816		ret = BACKEND_ADD_SCHEMA(be, db_file, tbls_np, idxs_np);
1817	} else {
1818		abort();		/* can't happen */
1819	}
1820
1821	if (ret < 0) {
1822		return (ret);
1823	}
1824
1825	ret = BACKEND_ADD_SCHEMA(be, db_file, tbls_common, idxs_common);
1826	if (ret < 0) {
1827		return (ret);
1828	}
1829
1830	/*
1831	 * Add the schema version to the table
1832	 */
1833	ret = sqlite_exec_printf(be->be_db,
1834	    "INSERT INTO schema_version (schema_version) VALUES (%d)",
1835	    NULL, NULL, &errmsg, BACKEND_SCHEMA_VERSION);
1836	if (ret != SQLITE_OK) {
1837		configd_critical(
1838		    "setting schema version fails: %s\n", errmsg);
1839		free(errmsg);
1840	}
1841
1842	/*
1843	 * Populate id_tbl with initial IDs.
1844	 */
1845	for (i = 0; i < BACKEND_ID_INVALID; i++) {
1846		const char *name = id_space_to_name(i);
1847
1848		ret = sqlite_exec_printf(be->be_db,
1849		    "INSERT INTO id_tbl (id_name, id_next) "
1850		    "VALUES ('%q', %d);", NULL, NULL, &errmsg, name, 1);
1851		if (ret != SQLITE_OK) {
1852			configd_critical(
1853			    "id insertion for %s fails: %s\n", name, errmsg);
1854			free(errmsg);
1855			return (-1);
1856		}
1857	}
1858	/*
1859	 * Set the persistance of the database.  The normal database is marked
1860	 * "synchronous", so that all writes are synchronized to stable storage
1861	 * before proceeding.
1862	 */
1863	ret = sqlite_exec_printf(be->be_db,
1864	    "PRAGMA default_synchronous = %s; PRAGMA synchronous = %s;",
1865	    NULL, NULL, &errmsg,
1866	    (t == BACKEND_TYPE_NORMAL)? "ON" : "OFF",
1867	    (t == BACKEND_TYPE_NORMAL)? "ON" : "OFF");
1868	if (ret != SQLITE_OK) {
1869		configd_critical("pragma setting fails: %s\n", errmsg);
1870		free(errmsg);
1871		return (-1);
1872	}
1873
1874	return (0);
1875}
1876
1877int
1878backend_init(const char *db_file, const char *npdb_file, int have_np)
1879{
1880	sqlite_backend_t *be;
1881	int r;
1882	int writable_persist = 1;
1883
1884	/* set up our temporary directory */
1885	sqlite_temp_directory = "/etc/svc/volatile";
1886
1887	if (strcmp(SQLITE_VERSION, sqlite_version) != 0) {
1888		configd_critical("Mismatched link!  (%s should be %s)\n",
1889		    sqlite_version, SQLITE_VERSION);
1890		return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
1891	}
1892	if (db_file == NULL)
1893		db_file = REPOSITORY_DB;
1894
1895	r = backend_create(BACKEND_TYPE_NORMAL, db_file, &be);
1896	switch (r) {
1897	case BACKEND_CREATE_FAIL:
1898		return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
1899	case BACKEND_CREATE_LOCKED:
1900		return (CONFIGD_EXIT_DATABASE_LOCKED);
1901	case BACKEND_CREATE_SUCCESS:
1902		break;		/* success */
1903	case BACKEND_CREATE_READONLY:
1904		writable_persist = 0;
1905		break;
1906	case BACKEND_CREATE_NEED_INIT:
1907		if (backend_init_schema(be, db_file, BACKEND_TYPE_NORMAL)) {
1908			backend_destroy(be);
1909			return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
1910		}
1911		break;
1912	default:
1913		abort();
1914		/*NOTREACHED*/
1915	}
1916	backend_create_finish(BACKEND_TYPE_NORMAL, be);
1917
1918	if (have_np) {
1919		if (npdb_file == NULL)
1920			npdb_file = NONPERSIST_DB;
1921
1922		r = backend_create(BACKEND_TYPE_NONPERSIST, npdb_file, &be);
1923		switch (r) {
1924		case BACKEND_CREATE_SUCCESS:
1925			break;		/* success */
1926		case BACKEND_CREATE_FAIL:
1927			return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
1928		case BACKEND_CREATE_LOCKED:
1929			return (CONFIGD_EXIT_DATABASE_LOCKED);
1930		case BACKEND_CREATE_READONLY:
1931			configd_critical("%s: unable to write\n", npdb_file);
1932			return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
1933		case BACKEND_CREATE_NEED_INIT:
1934			if (backend_init_schema(be, db_file,
1935			    BACKEND_TYPE_NONPERSIST)) {
1936				backend_destroy(be);
1937				return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
1938			}
1939			break;
1940		default:
1941			abort();
1942			/*NOTREACHED*/
1943		}
1944		backend_create_finish(BACKEND_TYPE_NONPERSIST, be);
1945
1946		/*
1947		 * If we started up with a writable filesystem, but the
1948		 * non-persistent database needed initialization, we
1949		 * are booting a non-global zone, so do a backup.
1950		 */
1951		if (r == BACKEND_CREATE_NEED_INIT && writable_persist &&
1952		    backend_lock(BACKEND_TYPE_NORMAL, 0, &be) ==
1953		    REP_PROTOCOL_SUCCESS) {
1954			if (backend_create_backup_locked(be,
1955			    REPOSITORY_BOOT_BACKUP) != REP_PROTOCOL_SUCCESS) {
1956				configd_critical(
1957				    "unable to create \"%s\" backup of "
1958				    "\"%s\"\n", REPOSITORY_BOOT_BACKUP,
1959				    be->be_path);
1960			}
1961			backend_unlock(be);
1962		}
1963	}
1964	return (CONFIGD_EXIT_OKAY);
1965}
1966
1967/*
1968 * quiesce all database activity prior to exiting
1969 */
1970void
1971backend_fini(void)
1972{
1973	sqlite_backend_t *be_normal, *be_np;
1974
1975	(void) backend_lock(BACKEND_TYPE_NORMAL, 1, &be_normal);
1976	(void) backend_lock(BACKEND_TYPE_NONPERSIST, 1, &be_np);
1977}
1978
1979#define	QUERY_BASE	128
1980backend_query_t *
1981backend_query_alloc(void)
1982{
1983	backend_query_t *q;
1984	q = calloc(1, sizeof (backend_query_t));
1985	if (q != NULL) {
1986		q->bq_size = QUERY_BASE;
1987		q->bq_buf = calloc(1, q->bq_size);
1988		if (q->bq_buf == NULL) {
1989			q->bq_size = 0;
1990		}
1991
1992	}
1993	return (q);
1994}
1995
1996void
1997backend_query_append(backend_query_t *q, const char *value)
1998{
1999	char *alloc;
2000	int count;
2001	size_t size, old_len;
2002
2003	if (q == NULL) {
2004		/* We'll discover the error when we try to run the query. */
2005		return;
2006	}
2007
2008	while (q->bq_buf != NULL) {
2009		old_len = strlen(q->bq_buf);
2010		size = q->bq_size;
2011		count = strlcat(q->bq_buf, value, size);
2012
2013		if (count < size)
2014			break;				/* success */
2015
2016		q->bq_buf[old_len] = 0;
2017		size = round_up_to_p2(count + 1);
2018
2019		assert(size > q->bq_size);
2020		alloc = realloc(q->bq_buf, size);
2021		if (alloc == NULL) {
2022			free(q->bq_buf);
2023			q->bq_buf = NULL;
2024			break;				/* can't grow */
2025		}
2026
2027		q->bq_buf = alloc;
2028		q->bq_size = size;
2029	}
2030}
2031
2032void
2033backend_query_add(backend_query_t *q, const char *format, ...)
2034{
2035	va_list args;
2036	char *new;
2037
2038	if (q == NULL || q->bq_buf == NULL)
2039		return;
2040
2041	va_start(args, format);
2042	new = sqlite_vmprintf(format, args);
2043	va_end(args);
2044
2045	if (new == NULL) {
2046		free(q->bq_buf);
2047		q->bq_buf = NULL;
2048		return;
2049	}
2050
2051	backend_query_append(q, new);
2052
2053	free(new);
2054}
2055
2056void
2057backend_query_free(backend_query_t *q)
2058{
2059	if (q != NULL) {
2060		if (q->bq_buf != NULL) {
2061			free(q->bq_buf);
2062		}
2063		free(q);
2064	}
2065}
2066