backend.c revision 5405:f7a026c6d133
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29/*
30 * sqlite is not compatible with _FILE_OFFSET_BITS=64, but we need to
31 * be able to statvfs(2) possibly large systems.  This define gives us
32 * access to the transitional interfaces.  See lfcompile64(5) for how
33 * _LARGEFILE64_SOURCE works.
34 */
35#define	_LARGEFILE64_SOURCE
36
37#include <assert.h>
38#include <door.h>
39#include <dirent.h>
40#include <errno.h>
41#include <fcntl.h>
42#include <limits.h>
43#include <pthread.h>
44#include <stdarg.h>
45#include <stdio.h>
46#include <stdlib.h>
47#include <string.h>
48#include <sys/stat.h>
49#include <sys/statvfs.h>
50#include <unistd.h>
51#include <zone.h>
52
53#include "configd.h"
54#include "repcache_protocol.h"
55
56#include <sqlite.h>
57#include <sqlite-misc.h>
58
59/*
60 * This file has two purposes:
61 *
62 * 1. It contains the database schema, and the code for setting up our backend
63 *    databases, including installing said schema.
64 *
65 * 2. It provides a simplified interface to the SQL database library, and
66 *    synchronizes MT access to the database.
67 */
68
69typedef struct backend_spent {
70	uint64_t bs_count;
71	hrtime_t bs_time;
72	hrtime_t bs_vtime;
73} backend_spent_t;
74
75typedef struct backend_totals {
76	backend_spent_t	bt_lock;	/* waiting for lock */
77	backend_spent_t	bt_exec;	/* time spent executing SQL */
78} backend_totals_t;
79
80typedef struct sqlite_backend {
81	pthread_mutex_t	be_lock;
82	pthread_t	be_thread;	/* thread holding lock */
83	struct sqlite	*be_db;
84	const char	*be_path;	/* path to db */
85	int		be_readonly;	/* readonly at start, and still is */
86	int		be_writing;	/* held for writing */
87	backend_type_t	be_type;	/* type of db */
88	hrtime_t	be_lastcheck;	/* time of last read-only check */
89	backend_totals_t be_totals[2];	/* one for reading, one for writing */
90} sqlite_backend_t;
91
92struct backend_tx {
93	sqlite_backend_t	*bt_be;
94	int			bt_readonly;
95	int			bt_type;
96	int			bt_full;	/* SQLITE_FULL during tx */
97};
98
/*
 * UPDATE_TOTALS_WR(sb, writing, field, ts, vts)
 *
 * Record one sample in sb->be_totals[writing ? 1 : 0].field:  bump the
 * count and add the wall-clock (gethrtime()) and virtual (gethrvtime())
 * time elapsed since ts and vts respectively.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (safe in an unbraced if/else), and the time arguments
 * are parenthesized so that expressions may be passed for them.
 */
#define	UPDATE_TOTALS_WR(sb, writing, field, ts, vts) do {		\
	backend_spent_t *__bsp = &(sb)->be_totals[!!(writing)].field;	\
	__bsp->bs_count++;						\
	__bsp->bs_time += (gethrtime() - (ts));				\
	__bsp->bs_vtime += (gethrvtime() - (vts));			\
	/*CONSTCOND*/							\
} while (0)

/*
 * As UPDATE_TOTALS_WR(), charging the reader or writer totals according
 * to how the backend is currently held (sb->be_writing).
 */
#define	UPDATE_TOTALS(sb, field, ts, vts) \
	UPDATE_TOTALS_WR(sb, (sb)->be_writing, field, ts, vts)
108
/*
 * A query under construction: a buffer and a size.
 */
struct backend_query {
	char	*bq_buf;	/* query text buffer */
	size_t	bq_size;	/* presumably the size of bq_buf; confirm */
};
113
/*
 * One table of the schema.  Tables of these are terminated by an entry
 * whose members are all NULL.
 */
struct backend_tbl_info {
	const char *bti_name;	/* table name */
	const char *bti_cols;	/* SQL column definitions */
};
118
/*
 * One index of the schema.  Tables of these are terminated by an entry
 * whose members are all NULL.
 */
struct backend_idx_info {
	const char *bxi_tbl;	/* table being indexed */
	const char *bxi_idx;	/* name of the index */
	const char *bxi_cols;	/* comma-separated indexed columns */
};
124
/*
 * Panic coordination.  backend_panic_thread is set (under
 * backend_panic_lock) by the first thread to enter backend_panic(); later
 * arrivals park forever on backend_panic_cv.
 */
static pthread_mutex_t backend_panic_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t backend_panic_cv = PTHREAD_COND_INITIALIZER;
pthread_t backend_panic_thread = 0;	/* nonzero once a panic has begun */
128
/* Debugging knobs; all default to off. */
int backend_do_trace = 0;	/* install the SQL tracing callback */
int backend_print_trace = 0;	/* have the tracing callback print SQL */
int backend_panic_abort = 0;	/* abort() rather than exit() on panic */
132
/*
 * While the repository is still read-only, readers re-probe for
 * writability no more often than this (two seconds, in hrtime_t
 * nanoseconds).
 */
#define	BACKEND_READONLY_CHECK_INTERVAL	(2 * (hrtime_t)NANOSEC)
135
/*
 * Version of the schema defined by the tables below.  Bump it whenever
 * that schema changes in any way.
 */
#define	BACKEND_SCHEMA_VERSION		5
140
141static struct backend_tbl_info tbls_normal[] = { /* BACKEND_TYPE_NORMAL */
142	/*
143	 * service_tbl holds all services.  svc_id is the identifier of the
144	 * service.
145	 */
146	{
147		"service_tbl",
148		"svc_id          INTEGER PRIMARY KEY,"
149		"svc_name        CHAR(256) NOT NULL"
150	},
151
152	/*
153	 * instance_tbl holds all of the instances.  The parent service id
154	 * is instance_svc.
155	 */
156	{
157		"instance_tbl",
158		"instance_id     INTEGER PRIMARY KEY,"
159		"instance_name   CHAR(256) NOT NULL,"
160		"instance_svc    INTEGER NOT NULL"
161	},
162
163	/*
164	 * snapshot_lnk_tbl links (instance, snapshot name) with snapshots.
165	 */
166	{
167		"snapshot_lnk_tbl",
168		"lnk_id          INTEGER PRIMARY KEY,"
169		"lnk_inst_id     INTEGER NOT NULL,"
170		"lnk_snap_name   CHAR(256) NOT NULL,"
171		"lnk_snap_id     INTEGER NOT NULL"
172	},
173
174	/*
175	 * snaplevel_tbl maps a snapshot id to a set of named, ordered
176	 * snaplevels.
177	 */
178	{
179		"snaplevel_tbl",
180		"snap_id                 INTEGER NOT NULL,"
181		"snap_level_num          INTEGER NOT NULL,"
182		"snap_level_id           INTEGER NOT NULL,"
183		"snap_level_service_id   INTEGER NOT NULL,"
184		"snap_level_service      CHAR(256) NOT NULL,"
185		"snap_level_instance_id  INTEGER NULL,"
186		"snap_level_instance     CHAR(256) NULL"
187	},
188
189	/*
190	 * snaplevel_lnk_tbl links snaplevels to property groups.
191	 * snaplvl_pg_* is identical to the original property group,
192	 * and snaplvl_gen_id overrides the generation number.
193	 * The service/instance ids are as in the snaplevel.
194	 */
195	{
196		"snaplevel_lnk_tbl",
197		"snaplvl_level_id INTEGER NOT NULL,"
198		"snaplvl_pg_id    INTEGER NOT NULL,"
199		"snaplvl_pg_name  CHAR(256) NOT NULL,"
200		"snaplvl_pg_type  CHAR(256) NOT NULL,"
201		"snaplvl_pg_flags INTEGER NOT NULL,"
202		"snaplvl_gen_id   INTEGER NOT NULL"
203	},
204
205	{ NULL, NULL }
206};
207
208static struct backend_idx_info idxs_normal[] = { /* BACKEND_TYPE_NORMAL */
209	{ "service_tbl",	"name",	"svc_name" },
210	{ "instance_tbl",	"name",	"instance_svc, instance_name" },
211	{ "snapshot_lnk_tbl",	"name",	"lnk_inst_id, lnk_snap_name" },
212	{ "snapshot_lnk_tbl",	"snapid", "lnk_snap_id" },
213	{ "snaplevel_tbl",	"id",	"snap_id" },
214	{ "snaplevel_lnk_tbl",	"id",	"snaplvl_pg_id" },
215	{ "snaplevel_lnk_tbl",	"level", "snaplvl_level_id" },
216	{ NULL, NULL, NULL }
217};
218
219static struct backend_tbl_info tbls_np[] = { /* BACKEND_TYPE_NONPERSIST */
220	{ NULL, NULL }
221};
222
223static struct backend_idx_info idxs_np[] = {	/* BACKEND_TYPE_NONPERSIST */
224	{ NULL, NULL, NULL }
225};
226
227static struct backend_tbl_info tbls_common[] = { /* all backend types */
228	/*
229	 * pg_tbl defines property groups.  They are associated with a single
230	 * service or instance.  The pg_gen_id links them with the latest
231	 * "edited" version of its properties.
232	 */
233	{
234		"pg_tbl",
235		"pg_id           INTEGER PRIMARY KEY,"
236		"pg_parent_id    INTEGER NOT NULL,"
237		"pg_name         CHAR(256) NOT NULL,"
238		"pg_type         CHAR(256) NOT NULL,"
239		"pg_flags        INTEGER NOT NULL,"
240		"pg_gen_id       INTEGER NOT NULL"
241	},
242
243	/*
244	 * prop_lnk_tbl links a particular pg_id and gen_id to a set of
245	 * (prop_name, prop_type, val_id) trios.
246	 */
247	{
248		"prop_lnk_tbl",
249		"lnk_prop_id     INTEGER PRIMARY KEY,"
250		"lnk_pg_id       INTEGER NOT NULL,"
251		"lnk_gen_id      INTEGER NOT NULL,"
252		"lnk_prop_name   CHAR(256) NOT NULL,"
253		"lnk_prop_type   CHAR(2) NOT NULL,"
254		"lnk_val_id      INTEGER"
255	},
256
257	/*
258	 * value_tbl maps a value_id to a set of values.  For any given
259	 * value_id, value_type is constant.
260	 */
261	{
262		"value_tbl",
263		"value_id        INTEGER NOT NULL,"
264		"value_type      CHAR(1) NOT NULL,"
265		"value_value     VARCHAR NOT NULL"
266	},
267
268	/*
269	 * id_tbl has one row per id space
270	 */
271	{
272		"id_tbl",
273		"id_name         STRING NOT NULL,"
274		"id_next         INTEGER NOT NULL"
275	},
276
277	/*
278	 * schema_version has a single row, which contains
279	 * BACKEND_SCHEMA_VERSION at the time of creation.
280	 */
281	{
282		"schema_version",
283		"schema_version  INTEGER"
284	},
285	{ NULL, NULL }
286};
287
288static struct backend_idx_info idxs_common[] = { /* all backend types */
289	{ "pg_tbl",		"parent", "pg_parent_id" },
290	{ "pg_tbl",		"name",	"pg_parent_id, pg_name" },
291	{ "pg_tbl",		"type",	"pg_parent_id, pg_type" },
292	{ "prop_lnk_tbl",	"base",	"lnk_pg_id, lnk_gen_id" },
293	{ "prop_lnk_tbl",	"val",	"lnk_val_id" },
294	{ "value_tbl",		"id",	"value_id" },
295	{ "id_tbl",		"id",	"id_name" },
296	{ NULL, NULL, NULL }
297};
298
/*
 * Argument block for run_single_int_callback().
 */
struct run_single_int_info {
	uint32_t	*rs_out;	/* where to store the parsed value */
	int		rs_result;	/* set to REP_PROTOCOL_SUCCESS once */
					/* a value has been stored */
};
303
304/*ARGSUSED*/
305static int
306run_single_int_callback(void *arg, int columns, char **vals, char **names)
307{
308	struct run_single_int_info *info = arg;
309	uint32_t val;
310
311	char *endptr = vals[0];
312
313	assert(info->rs_result != REP_PROTOCOL_SUCCESS);
314	assert(columns == 1);
315
316	if (vals[0] == NULL)
317		return (BACKEND_CALLBACK_CONTINUE);
318
319	errno = 0;
320	val = strtoul(vals[0], &endptr, 10);
321	if ((val == 0 && endptr == vals[0]) || *endptr != 0 || errno != 0)
322		backend_panic("malformed integer \"%20s\"", vals[0]);
323
324	*info->rs_out = val;
325	info->rs_result = REP_PROTOCOL_SUCCESS;
326	return (BACKEND_CALLBACK_CONTINUE);
327}
328
329/*ARGSUSED*/
330int
331backend_fail_if_seen(void *arg, int columns, char **vals, char **names)
332{
333	return (BACKEND_CALLBACK_ABORT);
334}
335
336/*
337 * check to see if we can successfully start a transaction;  if not, the
338 * filesystem is mounted read-only.
339 */
340static int
341backend_is_readonly(struct sqlite *db, const char *path)
342{
343	int r;
344	statvfs64_t stat;
345
346	if (statvfs64(path, &stat) == 0 && (stat.f_flag & ST_RDONLY))
347		return (SQLITE_READONLY);
348
349	r = sqlite_exec(db,
350	    "BEGIN TRANSACTION; "
351	    "UPDATE schema_version SET schema_version = schema_version; ",
352	    NULL, NULL, NULL);
353	(void) sqlite_exec(db, "ROLLBACK TRANSACTION", NULL, NULL, NULL);
354	return (r);
355}
356
357static void
358backend_trace_sql(void *arg, const char *sql)
359{
360	sqlite_backend_t *be = arg;
361
362	if (backend_print_trace) {
363		(void) fprintf(stderr, "%d: %s\n", be->be_type, sql);
364	}
365}
366
367static sqlite_backend_t be_info[BACKEND_TYPE_TOTAL];
368static sqlite_backend_t *bes[BACKEND_TYPE_TOTAL];
369
370#define	BACKEND_PANIC_TIMEOUT	(50 * MILLISEC)
371/*
372 * backend_panic() -- some kind of database problem or corruption has been hit.
373 * We attempt to quiesce the other database users -- all of the backend sql
374 * entry points will call backend_panic(NULL) if a panic is in progress, as
375 * will any attempt to start a transaction.
376 *
377 * We give threads holding a backend lock 50ms (BACKEND_PANIC_TIMEOUT) to
378 * either drop the lock or call backend_panic().  If they don't respond in
379 * time, we'll just exit anyway.
380 */
381void
382backend_panic(const char *format, ...)
383{
384	int i;
385	va_list args;
386	int failed = 0;
387
388	(void) pthread_mutex_lock(&backend_panic_lock);
389	if (backend_panic_thread != 0) {
390		(void) pthread_mutex_unlock(&backend_panic_lock);
391		/*
392		 * first, drop any backend locks we're holding, then
393		 * sleep forever on the panic_cv.
394		 */
395		for (i = 0; i < BACKEND_TYPE_TOTAL; i++) {
396			if (bes[i] != NULL &&
397			    bes[i]->be_thread == pthread_self())
398				(void) pthread_mutex_unlock(&bes[i]->be_lock);
399		}
400		(void) pthread_mutex_lock(&backend_panic_lock);
401		for (;;)
402			(void) pthread_cond_wait(&backend_panic_cv,
403			    &backend_panic_lock);
404	}
405	backend_panic_thread = pthread_self();
406	(void) pthread_mutex_unlock(&backend_panic_lock);
407
408	for (i = 0; i < BACKEND_TYPE_TOTAL; i++) {
409		if (bes[i] != NULL && bes[i]->be_thread == pthread_self())
410			(void) pthread_mutex_unlock(&bes[i]->be_lock);
411	}
412
413	va_start(args, format);
414	configd_vcritical(format, args);
415	va_end(args);
416
417	for (i = 0; i < BACKEND_TYPE_TOTAL; i++) {
418		timespec_t rel;
419
420		rel.tv_sec = 0;
421		rel.tv_nsec = BACKEND_PANIC_TIMEOUT;
422
423		if (bes[i] != NULL && bes[i]->be_thread != pthread_self()) {
424			if (pthread_mutex_reltimedlock_np(&bes[i]->be_lock,
425			    &rel) != 0)
426				failed++;
427		}
428	}
429	if (failed) {
430		configd_critical("unable to quiesce database\n");
431	}
432
433	if (backend_panic_abort)
434		abort();
435
436	exit(CONFIGD_EXIT_DATABASE_BAD);
437}
438
439/*
440 * Returns
441 *   _SUCCESS
442 *   _DONE - callback aborted query
443 *   _NO_RESOURCES - out of memory (_FULL & _TOOBIG?)
444 */
445static int
446backend_error(sqlite_backend_t *be, int error, char *errmsg)
447{
448	if (error == SQLITE_OK)
449		return (REP_PROTOCOL_SUCCESS);
450
451	switch (error) {
452	case SQLITE_ABORT:
453		free(errmsg);
454		return (REP_PROTOCOL_DONE);
455
456	case SQLITE_NOMEM:
457	case SQLITE_FULL:
458	case SQLITE_TOOBIG:
459		free(errmsg);
460		return (REP_PROTOCOL_FAIL_NO_RESOURCES);
461
462	default:
463		backend_panic("%s: db error: %s", be->be_path, errmsg);
464		/*NOTREACHED*/
465	}
466}
467
/*
 * Free a list produced by backend_backup_get_prev():  each of the first
 * out_sz strings, then the array itself.  A NULL array with a size of
 * zero is fine.
 */
static void
backend_backup_cleanup(const char **out_arg, ssize_t out_sz)
{
	char **list = (char **)out_arg;
	ssize_t i;

	for (i = 0; i < out_sz; i++)
		free(list[i]);

	free(list);
}
477
/*
 * Builds a newest-first array of the existing backup files for a
 * repository.  The path is a single buffer, and the pointers look like:
 *
 *	/this/is/a/full/path/to/repository-name-YYYYMMDD_HHMMSS
 *	^pathname		^	       ^(pathname+pathlen)
 *				base name
 *
 * The directory scanned is the leading part of pathname, or "." when
 * pathname contains no '/'.  The buffer is modified temporarily while we
 * work and put back before returning.
 *
 * On success, *out_arg is set to an array of malloc()ed file names (not
 * full paths; NULL if there are none) to be released with
 * backend_backup_cleanup(), and the number of elements is returned (0 if
 * there are no previous backups).  On error, *out_arg is set to NULL and
 * -1 is returned.
 */
static ssize_t
backend_backup_get_prev(char *pathname, size_t pathlen, const char ***out_arg)
{
	/*
	 * The timestamp is year, month, day, hour, min, sec, plus an '_'
	 * between the date and the time.
	 */
	const size_t stamp_len = 4 + 5*2 + 1;
	const size_t sep_pos = 4 + 2*2;

	char **list = NULL;
	size_t nlist = 0;

	char *path_end = pathname + pathlen;
	char saved_end = *path_end;
	char saved_first;
	char *base, *dir_path;
	size_t base_len;
	size_t i;

	struct dirent *de;
	DIR *dirp;

	/* cut the buffer off just after the base name */
	*path_end = '\0';

	base = strrchr(pathname, '/');
	if (base == NULL) {
		base = pathname;
		dir_path = ".";
	} else {
		assert(path_end > pathname && base < path_end);
		base++;
		dir_path = pathname;
	}

	base_len = strlen(base);

	/*
	 * Chop the base name off just long enough to opendir() its
	 * directory, then put it back.
	 */
	saved_first = base[0];

	base[0] = '\0';
	dirp = opendir(dir_path);
	base[0] = saved_first;

	if (dirp == NULL)
		goto fail;

	while ((de = readdir(dirp)) != NULL) {
		const char *stamp;
		char *cand, *cand_stamp;
		char **grown;

		/*
		 * Only names of the form
		 *	<base>-YYYYMMDD_HHMMSS
		 * are of interest.
		 */
		if (strncmp(de->d_name, base, base_len) != 0 ||
		    de->d_name[base_len] != '-')
			continue;

		stamp = de->d_name + base_len + 1;

		for (i = 0; i < stamp_len; i++) {
			char c = stamp[i];

			if (i == sep_pos) {
				if (c != '_')
					break;
			} else if (c < '0' || c > '9') {
				break;
			}
		}
		if (i != stamp_len || stamp[i] != '\0')
			continue;

		/*
		 * It's a backup.  Insertion-sort a copy of its name into
		 * the list, ordering by timestamp and then by whole name.
		 */
		cand = strdup(de->d_name);
		if (cand == NULL)
			goto fail_closedir;
		cand_stamp = strrchr(cand, '-');

		for (i = 0; i < nlist; i++) {
			char *cur = list[i];
			char *cur_stamp = strrchr(cur, '-');
			int order = strcmp(cand_stamp, cur_stamp);

			if (order == 0)
				order = strcmp(cand, cur);

			if (order == 0) {
				/* already have it */
				free(cand);
				cand = NULL;
				break;
			}
			if (order > 0) {
				/* take this slot; carry the old entry on */
				list[i] = cand;
				cand = cur;
				cand_stamp = cur_stamp;
			}
		}

		if (i < nlist) {
			/* duplicate; nothing left to append */
			assert(cand == NULL);
			continue;
		}

		grown = realloc(list, (nlist + 1) * sizeof (*list));
		if (grown == NULL) {
			free(cand);
			goto fail_closedir;
		}

		list = grown;
		list[nlist++] = cand;
	}
	(void) closedir(dirp);

	base[base_len] = saved_end;

	*out_arg = (const char **)list;
	return (nlist);

fail_closedir:
	(void) closedir(dirp);
fail:
	base[0] = saved_first;
	*path_end = saved_end;

	backend_backup_cleanup((const char **)list, nlist);

	*out_arg = NULL;
	return (-1);
}
628
629/*
630 * Copies the repository path into out, a buffer of out_len bytes,
631 * removes the ".db" (or whatever) extension, and, if name is non-NULL,
632 * appends "-name" to it.  If name is non-NULL, it can fail with:
633 *
634 *	_TRUNCATED	will not fit in buffer.
635 *	_BAD_REQUEST	name is not a valid identifier
636 */
637static rep_protocol_responseid_t
638backend_backup_base(sqlite_backend_t *be, const char *name,
639    char *out, size_t out_len)
640{
641	char *p, *q;
642	size_t len;
643
644	/*
645	 * for paths of the form /path/to/foo.db, we truncate at the final
646	 * '.'.
647	 */
648	(void) strlcpy(out, be->be_path, out_len);
649
650	p = strrchr(out, '/');
651	q = strrchr(out, '.');
652
653	if (p != NULL && q != NULL && q > p)
654		*q = 0;
655
656	if (name != NULL) {
657		len = strlen(out);
658		assert(len < out_len);
659
660		out += len;
661		out_len -= len;
662
663		len = strlen(name);
664
665		/*
666		 * verify that the name tag is entirely alphabetic,
667		 * non-empty, and not too long.
668		 */
669		if (len == 0 || len >= REP_PROTOCOL_NAME_LEN ||
670		    uu_check_name(name, UU_NAME_DOMAIN) < 0)
671			return (REP_PROTOCOL_FAIL_BAD_REQUEST);
672
673		if (snprintf(out, out_len, "-%s", name) >= out_len)
674			return (REP_PROTOCOL_FAIL_TRUNCATED);
675	}
676
677	return (REP_PROTOCOL_SUCCESS);
678}
679
/*
 * See if a backup is needed.  We do a backup unless both files are
 * byte-for-byte identical.
 *
 * Returns 0 if rep_name and backup_name are distinct files with identical
 * contents, and 1 otherwise -- including when either file cannot be
 * opened or read, and when both names refer to the same file.
 *
 * Each descriptor has exactly one owner:  once fdopen() succeeds the
 * stream owns it and fclose() closes it, so close() is used only for a
 * descriptor which never became a stream.  (Closing it a second time
 * could close an unrelated descriptor opened by another thread.)
 */
static int
backend_check_backup_needed(const char *rep_name, const char *backup_name)
{
	int repfd = open(rep_name, O_RDONLY);
	int fd = open(backup_name, O_RDONLY);
	struct stat s_rep, s_backup;
	int c1, c2;
	int needed = 1;

	FILE *f_rep = NULL;
	FILE *f_backup = NULL;

	if (repfd < 0 || fd < 0)
		goto out;

	if (fstat(repfd, &s_rep) < 0 || fstat(fd, &s_backup) < 0)
		goto out;

	/*
	 * if they are the same file, we need to do a backup to break the
	 * hard link or symlink involved.
	 */
	if (s_rep.st_ino == s_backup.st_ino && s_rep.st_dev == s_backup.st_dev)
		goto out;

	if (s_rep.st_size != s_backup.st_size)
		goto out;

	if ((f_rep = fdopen(repfd, "r")) == NULL ||
	    (f_backup = fdopen(fd, "r")) == NULL)
		goto out;

	do {
		c1 = getc(f_rep);
		c2 = getc(f_backup);
		if (c1 != c2)
			goto out;
	} while (c1 != EOF);

	/* EOF on both; identical only if neither EOF was really an error */
	if (!ferror(f_rep) && !ferror(f_backup))
		needed = 0;

out:
	if (f_rep != NULL)
		(void) fclose(f_rep);
	else if (repfd >= 0)
		(void) close(repfd);

	if (f_backup != NULL)
		(void) fclose(f_backup);
	else if (fd >= 0)
		(void) close(fd);

	return (needed);
}
741
742/*
743 * Can return:
744 *	_BAD_REQUEST		name is not valid
745 *	_TRUNCATED		name is too long for current repository path
746 *	_UNKNOWN		failed for unknown reason (details written to
747 *				console)
748 *	_BACKEND_READONLY	backend is not writable
749 *
750 *	_SUCCESS		Backup completed successfully.
751 */
752static rep_protocol_responseid_t
753backend_create_backup_locked(sqlite_backend_t *be, const char *name)
754{
755	const char **old_list;
756	ssize_t old_sz;
757	ssize_t old_max = max_repository_backups;
758	ssize_t cur;
759
760	char *finalname;
761
762	char finalpath[PATH_MAX];
763	char tmppath[PATH_MAX];
764	char buf[8192];
765	int infd, outfd;
766	size_t len;
767	off_t inlen, outlen, offset;
768
769	time_t now;
770	struct tm now_tm;
771
772	rep_protocol_responseid_t result;
773
774	if (be->be_readonly)
775		return (REP_PROTOCOL_FAIL_BACKEND_READONLY);
776
777	result = backend_backup_base(be, name, finalpath, sizeof (finalpath));
778	if (result != REP_PROTOCOL_SUCCESS)
779		return (result);
780
781	if (!backend_check_backup_needed(be->be_path, finalpath)) {
782		return (REP_PROTOCOL_SUCCESS);
783	}
784
785	/*
786	 * remember the original length, and the basename location
787	 */
788	len = strlen(finalpath);
789	finalname = strrchr(finalpath, '/');
790	if (finalname != NULL)
791		finalname++;
792	else
793		finalname = finalpath;
794
795	(void) strlcpy(tmppath, finalpath, sizeof (tmppath));
796	if (strlcat(tmppath, "-tmpXXXXXX", sizeof (tmppath)) >=
797	    sizeof (tmppath))
798		return (REP_PROTOCOL_FAIL_TRUNCATED);
799
800	now = time(NULL);
801	if (localtime_r(&now, &now_tm) == NULL) {
802		configd_critical(
803		    "\"%s\" backup failed: localtime(3C) failed: %s\n", name,
804		    be->be_path, strerror(errno));
805		return (REP_PROTOCOL_FAIL_UNKNOWN);
806	}
807
808	if (strftime(finalpath + len, sizeof (finalpath) - len,
809	    "-%Y""%m""%d""_""%H""%M""%S", &now_tm) >=
810	    sizeof (finalpath) - len) {
811		return (REP_PROTOCOL_FAIL_TRUNCATED);
812	}
813
814	infd = open(be->be_path, O_RDONLY);
815	if (infd < 0) {
816		configd_critical("\"%s\" backup failed: opening %s: %s\n", name,
817		    be->be_path, strerror(errno));
818		return (REP_PROTOCOL_FAIL_UNKNOWN);
819	}
820
821	outfd = mkstemp(tmppath);
822	if (outfd < 0) {
823		configd_critical("\"%s\" backup failed: mkstemp(%s): %s\n",
824		    name, tmppath, strerror(errno));
825		(void) close(infd);
826		return (REP_PROTOCOL_FAIL_UNKNOWN);
827	}
828
829	for (;;) {
830		do {
831			inlen = read(infd, buf, sizeof (buf));
832		} while (inlen < 0 && errno == EINTR);
833
834		if (inlen <= 0)
835			break;
836
837		for (offset = 0; offset < inlen; offset += outlen) {
838			do {
839				outlen = write(outfd, buf + offset,
840				    inlen - offset);
841			} while (outlen < 0 && errno == EINTR);
842
843			if (outlen >= 0)
844				continue;
845
846			configd_critical(
847			    "\"%s\" backup failed: write to %s: %s\n",
848			    name, tmppath, strerror(errno));
849			result = REP_PROTOCOL_FAIL_UNKNOWN;
850			goto fail;
851		}
852	}
853
854	if (inlen < 0) {
855		configd_critical(
856		    "\"%s\" backup failed: read from %s: %s\n",
857		    name, be->be_path, strerror(errno));
858		goto fail;
859	}
860
861	/*
862	 * grab the old list before doing our re-name.
863	 */
864	if (old_max > 0)
865		old_sz = backend_backup_get_prev(finalpath, len, &old_list);
866
867	if (rename(tmppath, finalpath) < 0) {
868		configd_critical(
869		    "\"%s\" backup failed: rename(%s, %s): %s\n",
870		    name, tmppath, finalpath, strerror(errno));
871		result = REP_PROTOCOL_FAIL_UNKNOWN;
872		goto fail;
873	}
874
875	tmppath[len] = 0;	/* strip -XXXXXX, for reference symlink */
876
877	(void) unlink(tmppath);
878	if (symlink(finalname, tmppath) < 0) {
879		configd_critical(
880		    "\"%s\" backup completed, but updating "
881		    "\"%s\" symlink to \"%s\" failed: %s\n",
882		    name, tmppath, finalname, strerror(errno));
883	}
884
885	if (old_max > 0 && old_sz > 0) {
886		/* unlink all but the first (old_max - 1) files */
887		for (cur = old_max - 1; cur < old_sz; cur++) {
888			(void) strlcpy(finalname, old_list[cur],
889			    sizeof (finalpath) - (finalname - finalpath));
890			if (unlink(finalpath) < 0)
891				configd_critical(
892				    "\"%s\" backup completed, but removing old "
893				    "file \"%s\" failed: %s\n",
894				    name, finalpath, strerror(errno));
895		}
896
897		backend_backup_cleanup(old_list, old_sz);
898	}
899
900	result = REP_PROTOCOL_SUCCESS;
901
902fail:
903	(void) close(infd);
904	(void) close(outfd);
905	if (result != REP_PROTOCOL_SUCCESS)
906		(void) unlink(tmppath);
907
908	return (result);
909}
910
911static int
912backend_check_readonly(sqlite_backend_t *be, int writing, hrtime_t t)
913{
914	char *errp;
915	struct sqlite *new;
916	int r;
917
918	assert(be->be_readonly);
919	assert(be == bes[BACKEND_TYPE_NORMAL]);
920
921	/*
922	 * If we don't *need* to be writable, only check every once in a
923	 * while.
924	 */
925	if (!writing) {
926		if ((uint64_t)(t - be->be_lastcheck) <
927		    BACKEND_READONLY_CHECK_INTERVAL)
928			return (REP_PROTOCOL_SUCCESS);
929		be->be_lastcheck = t;
930	}
931
932	new = sqlite_open(be->be_path, 0600, &errp);
933	if (new == NULL) {
934		backend_panic("reopening %s: %s\n", be->be_path, errp);
935		/*NOTREACHED*/
936	}
937	r = backend_is_readonly(new, be->be_path);
938
939	if (r != SQLITE_OK) {
940		sqlite_close(new);
941		if (writing)
942			return (REP_PROTOCOL_FAIL_BACKEND_READONLY);
943		return (REP_PROTOCOL_SUCCESS);
944	}
945
946	/*
947	 * We can write!  Swap the db handles, mark ourself writable,
948	 * and make a backup.
949	 */
950	sqlite_close(be->be_db);
951	be->be_db = new;
952	be->be_readonly = 0;
953
954	if (backend_create_backup_locked(be, REPOSITORY_BOOT_BACKUP) !=
955	    REP_PROTOCOL_SUCCESS) {
956		configd_critical(
957		    "unable to create \"%s\" backup of \"%s\"\n",
958		    REPOSITORY_BOOT_BACKUP, be->be_path);
959	}
960
961	return (REP_PROTOCOL_SUCCESS);
962}
963
964/*
965 * If t is not BACKEND_TYPE_NORMAL, can fail with
966 *   _BACKEND_ACCESS - backend does not exist
967 *
968 * If writing is nonzero, can also fail with
969 *   _BACKEND_READONLY - backend is read-only
970 */
971static int
972backend_lock(backend_type_t t, int writing, sqlite_backend_t **bep)
973{
974	sqlite_backend_t *be = NULL;
975	hrtime_t ts, vts;
976
977	*bep = NULL;
978
979	assert(t == BACKEND_TYPE_NORMAL ||
980	    t == BACKEND_TYPE_NONPERSIST);
981
982	be = bes[t];
983	if (t == BACKEND_TYPE_NORMAL)
984		assert(be != NULL);		/* should always be there */
985
986	if (be == NULL)
987		return (REP_PROTOCOL_FAIL_BACKEND_ACCESS);
988
989	if (backend_panic_thread != 0)
990		backend_panic(NULL);		/* don't proceed */
991
992	ts = gethrtime();
993	vts = gethrvtime();
994	(void) pthread_mutex_lock(&be->be_lock);
995	UPDATE_TOTALS_WR(be, writing, bt_lock, ts, vts);
996
997	if (backend_panic_thread != 0) {
998		(void) pthread_mutex_unlock(&be->be_lock);
999		backend_panic(NULL);		/* don't proceed */
1000	}
1001	be->be_thread = pthread_self();
1002
1003	if (be->be_readonly) {
1004		int r;
1005		assert(t == BACKEND_TYPE_NORMAL);
1006
1007		r = backend_check_readonly(be, writing, ts);
1008		if (r != REP_PROTOCOL_SUCCESS) {
1009			be->be_thread = 0;
1010			(void) pthread_mutex_unlock(&be->be_lock);
1011			return (r);
1012		}
1013	}
1014
1015	if (backend_do_trace)
1016		(void) sqlite_trace(be->be_db, backend_trace_sql, be);
1017	else
1018		(void) sqlite_trace(be->be_db, NULL, NULL);
1019
1020	be->be_writing = writing;
1021	*bep = be;
1022	return (REP_PROTOCOL_SUCCESS);
1023}
1024
1025static void
1026backend_unlock(sqlite_backend_t *be)
1027{
1028	be->be_writing = 0;
1029	be->be_thread = 0;
1030	(void) pthread_mutex_unlock(&be->be_lock);
1031}
1032
1033static void
1034backend_destroy(sqlite_backend_t *be)
1035{
1036	if (be->be_db != NULL) {
1037		sqlite_close(be->be_db);
1038		be->be_db = NULL;
1039	}
1040	be->be_thread = 0;
1041	(void) pthread_mutex_unlock(&be->be_lock);
1042	(void) pthread_mutex_destroy(&be->be_lock);
1043}
1044
1045static void
1046backend_create_finish(backend_type_t backend_id, sqlite_backend_t *be)
1047{
1048	assert(MUTEX_HELD(&be->be_lock));
1049	assert(be == &be_info[backend_id]);
1050
1051	bes[backend_id] = be;
1052	(void) pthread_mutex_unlock(&be->be_lock);
1053}
1054
/*
 * Write all of the NUL-terminated string mess to fd, continuing after
 * short writes and retrying writes interrupted by a signal.
 *
 * Returns 0 once everything has been written, or -1 if write(2) fails or
 * makes no progress.
 */
static int
backend_fd_write(int fd, const char *mess)
{
	size_t left = strlen(mess);
	ssize_t written;

	while (left > 0) {
		written = write(fd, mess, left);
		if (written < 0) {
			if (errno == EINTR)
				continue;
			return (-1);
		}
		if (written == 0)
			return (-1);	/* no progress; don't spin */

		mess += written;
		left -= (size_t)written;
	}
	return (0);
}
1069
1070/*
1071 * Can return:
1072 *	_BAD_REQUEST		name is not valid
1073 *	_TRUNCATED		name is too long for current repository path
1074 *	_UNKNOWN		failed for unknown reason (details written to
1075 *				console)
1076 *	_BACKEND_READONLY	backend is not writable
1077 *
1078 *	_SUCCESS		Backup completed successfully.
1079 */
1080rep_protocol_responseid_t
1081backend_create_backup(const char *name)
1082{
1083	rep_protocol_responseid_t result;
1084	sqlite_backend_t *be;
1085
1086	result = backend_lock(BACKEND_TYPE_NORMAL, 0, &be);
1087	if (result != REP_PROTOCOL_SUCCESS)
1088		return (result);
1089
1090	result = backend_create_backup_locked(be, name);
1091	backend_unlock(be);
1092
1093	return (result);
1094}
1095
1096/*ARGSUSED*/
1097static int
1098backend_integrity_callback(void *private, int narg, char **vals, char **cols)
1099{
1100	char **out = private;
1101	char *old = *out;
1102	char *new;
1103	const char *info;
1104	size_t len;
1105	int x;
1106
1107	for (x = 0; x < narg; x++) {
1108		if ((info = vals[x]) != NULL &&
1109		    strcmp(info, "ok") != 0) {
1110			len = (old == NULL)? 0 : strlen(old);
1111			len += strlen(info) + 2;	/* '\n' + '\0' */
1112
1113			new = realloc(old, len);
1114			if (new == NULL)
1115				return (BACKEND_CALLBACK_ABORT);
1116			if (old == NULL)
1117				new[0] = 0;
1118			old = *out = new;
1119			(void) strlcat(new, info, len);
1120			(void) strlcat(new, "\n", len);
1121		}
1122	}
1123	return (BACKEND_CALLBACK_CONTINUE);
1124}
1125
/*
 * Results of backend_create().  _LOCKED is returned when sqlite reports
 * the database as busy or locked; _NEED_INIT when it holds neither a
 * schema_version table nor an id_tbl.
 */
#define	BACKEND_CREATE_LOCKED		-2
#define	BACKEND_CREATE_FAIL		-1
#define	BACKEND_CREATE_SUCCESS		0
#define	BACKEND_CREATE_READONLY		1
#define	BACKEND_CREATE_NEED_INIT	2
1131static int
1132backend_create(backend_type_t backend_id, const char *db_file,
1133    sqlite_backend_t **bep)
1134{
1135	char *errp;
1136	char *integrity_results = NULL;
1137	sqlite_backend_t *be;
1138	int r;
1139	uint32_t val = -1UL;
1140	struct run_single_int_info info;
1141	int fd;
1142
1143	assert(backend_id >= 0 && backend_id < BACKEND_TYPE_TOTAL);
1144
1145	be = &be_info[backend_id];
1146	assert(be->be_db == NULL);
1147
1148	(void) pthread_mutex_init(&be->be_lock, NULL);
1149	(void) pthread_mutex_lock(&be->be_lock);
1150
1151	be->be_type = backend_id;
1152	be->be_path = strdup(db_file);
1153	if (be->be_path == NULL) {
1154		perror("malloc");
1155		goto fail;
1156	}
1157
1158	be->be_db = sqlite_open(be->be_path, 0600, &errp);
1159
1160	if (be->be_db == NULL) {
1161		if (strstr(errp, "out of memory") != NULL) {
1162			configd_critical("%s: %s\n", db_file, errp);
1163			free(errp);
1164
1165			goto fail;
1166		}
1167
1168		/* report it as an integrity failure */
1169		integrity_results = errp;
1170		errp = NULL;
1171		goto integrity_fail;
1172	}
1173
1174	/*
1175	 * check if we are inited and of the correct schema version
1176	 *
1177	 * Eventually, we'll support schema upgrade here.
1178	 */
1179	info.rs_out = &val;
1180	info.rs_result = REP_PROTOCOL_FAIL_NOT_FOUND;
1181
1182	r = sqlite_exec(be->be_db, "SELECT schema_version FROM schema_version;",
1183	    run_single_int_callback, &info, &errp);
1184	if (r == SQLITE_ERROR &&
1185	    strcmp("no such table: schema_version", errp) == 0) {
1186		free(errp);
1187		/*
1188		 * Could be an empty repository, could be pre-schema_version
1189		 * schema.  Check for id_tbl, which has always been there.
1190		 */
1191		r = sqlite_exec(be->be_db, "SELECT count() FROM id_tbl;",
1192		    NULL, NULL, &errp);
1193		if (r == SQLITE_ERROR &&
1194		    strcmp("no such table: id_tbl", errp) == 0) {
1195			free(errp);
1196			*bep = be;
1197			return (BACKEND_CREATE_NEED_INIT);
1198		}
1199
1200		configd_critical("%s: schema version mismatch\n", db_file);
1201		goto fail;
1202	}
1203	if (r == SQLITE_BUSY || r == SQLITE_LOCKED) {
1204		free(errp);
1205		*bep = NULL;
1206		backend_destroy(be);
1207		return (BACKEND_CREATE_LOCKED);
1208	}
1209	if (r == SQLITE_OK) {
1210		if (info.rs_result == REP_PROTOCOL_FAIL_NOT_FOUND ||
1211		    val != BACKEND_SCHEMA_VERSION) {
1212			configd_critical("%s: schema version mismatch\n",
1213			    db_file);
1214			goto fail;
1215		}
1216	}
1217
1218	/*
1219	 * pull in the whole database sequentially.
1220	 */
1221	if ((fd = open(db_file, O_RDONLY)) >= 0) {
1222		size_t sz = 64 * 1024;
1223		char *buffer = malloc(sz);
1224		if (buffer != NULL) {
1225			while (read(fd, buffer, sz) > 0)
1226				;
1227			free(buffer);
1228		}
1229		(void) close(fd);
1230	}
1231
1232	/*
1233	 * run an integrity check
1234	 */
1235	r = sqlite_exec(be->be_db, "PRAGMA integrity_check;",
1236	    backend_integrity_callback, &integrity_results, &errp);
1237
1238	if (r == SQLITE_BUSY || r == SQLITE_LOCKED) {
1239		free(errp);
1240		*bep = NULL;
1241		backend_destroy(be);
1242		return (BACKEND_CREATE_LOCKED);
1243	}
1244	if (r == SQLITE_ABORT) {
1245		free(errp);
1246		errp = NULL;
1247		integrity_results = "out of memory running integrity check\n";
1248	} else if (r != SQLITE_OK && integrity_results == NULL) {
1249		integrity_results = errp;
1250		errp = NULL;
1251	}
1252
1253integrity_fail:
1254	if (integrity_results != NULL) {
1255		const char *fname = "/etc/svc/volatile/db_errors";
1256		if ((fd = open(fname, O_CREAT|O_WRONLY|O_APPEND, 0600)) < 0) {
1257			fname = NULL;
1258		} else {
1259			if (backend_fd_write(fd, "\n\n") < 0 ||
1260			    backend_fd_write(fd, db_file) < 0 ||
1261			    backend_fd_write(fd,
1262			    ": PRAGMA integrity_check; failed.  Results:\n") <
1263			    0 || backend_fd_write(fd, integrity_results) < 0 ||
1264			    backend_fd_write(fd, "\n\n") < 0) {
1265				fname = NULL;
1266			}
1267			(void) close(fd);
1268		}
1269
1270		if (!is_main_repository ||
1271		    backend_id == BACKEND_TYPE_NONPERSIST) {
1272			if (fname != NULL)
1273				configd_critical(
1274				    "%s: integrity check failed. Details in "
1275				    "%s\n", db_file, fname);
1276			else
1277				configd_critical(
1278				    "%s: integrity check failed.\n",
1279				    db_file);
1280		} else {
1281			(void) fprintf(stderr,
1282"\n"
1283"svc.configd: smf(5) database integrity check of:\n"
1284"\n"
1285"    %s\n"
1286"\n"
1287"  failed. The database might be damaged or a media error might have\n"
1288"  prevented it from being verified.  Additional information useful to\n"
1289"  your service provider%s%s\n"
1290"\n"
1291"  The system will not be able to boot until you have restored a working\n"
1292"  database.  svc.startd(1M) will provide a sulogin(1M) prompt for recovery\n"
1293"  purposes.  The command:\n"
1294"\n"
1295"    /lib/svc/bin/restore_repository\n"
1296"\n"
1297"  can be run to restore a backup version of your repository.  See\n"
1298"  http://sun.com/msg/SMF-8000-MY for more information.\n"
1299"\n",
1300			    db_file,
1301			    (fname == NULL)? ":\n\n" : " is in:\n\n    ",
1302			    (fname == NULL)? integrity_results : fname);
1303		}
1304		free(errp);
1305		goto fail;
1306	}
1307
1308	/*
1309	 * check if we are writable
1310	 */
1311	r = backend_is_readonly(be->be_db, be->be_path);
1312
1313	if (r == SQLITE_BUSY || r == SQLITE_LOCKED) {
1314		free(errp);
1315		*bep = NULL;
1316		backend_destroy(be);
1317		return (BACKEND_CREATE_LOCKED);
1318	}
1319	if (r != SQLITE_OK && r != SQLITE_FULL) {
1320		free(errp);
1321		be->be_readonly = 1;
1322		*bep = be;
1323		return (BACKEND_CREATE_READONLY);
1324	}
1325	*bep = be;
1326	return (BACKEND_CREATE_SUCCESS);
1327
1328fail:
1329	*bep = NULL;
1330	backend_destroy(be);
1331	return (BACKEND_CREATE_FAIL);
1332}
1333
/*
 * Round arg up to the nearest power of two; a value that already is a
 * power of two comes back unchanged.
 *
 * In twos-complement arithmetic (arg & -arg) isolates the lowest set bit
 * of arg.  Adding that bit back in clears it and carries upward, so the
 * step is repeated until only a single bit remains.
 */
static size_t
round_up_to_p2(size_t arg)
{
	size_t lowbit;

	/*
	 * Zero must never be returned, so reject a zero argument as well as
	 * anything that is negative when viewed as an ssize_t.
	 */
	assert(arg > 0 && ((ssize_t)arg > 0));

	for (;;) {
		lowbit = arg & -arg;
		if (arg == lowbit)
			break;		/* exactly one bit is set */
		arg += lowbit;
	}

	return (arg);
}
1351
1352/*
1353 * Returns
1354 *   _NO_RESOURCES - out of memory
1355 *   _BACKEND_ACCESS - backend type t (other than _NORMAL) doesn't exist
1356 *   _DONE - callback aborted query
1357 *   _SUCCESS
1358 */
1359int
1360backend_run(backend_type_t t, backend_query_t *q,
1361    backend_run_callback_f *cb, void *data)
1362{
1363	char *errmsg = NULL;
1364	int ret;
1365	sqlite_backend_t *be;
1366	hrtime_t ts, vts;
1367
1368	if (q == NULL || q->bq_buf == NULL)
1369		return (REP_PROTOCOL_FAIL_NO_RESOURCES);
1370
1371	if ((ret = backend_lock(t, 0, &be)) != REP_PROTOCOL_SUCCESS)
1372		return (ret);
1373
1374	ts = gethrtime();
1375	vts = gethrvtime();
1376	ret = sqlite_exec(be->be_db, q->bq_buf, cb, data, &errmsg);
1377	UPDATE_TOTALS(be, bt_exec, ts, vts);
1378	ret = backend_error(be, ret, errmsg);
1379	backend_unlock(be);
1380
1381	return (ret);
1382}
1383
1384/*
1385 * Starts a "read-only" transaction -- i.e., locks out writers as long
1386 * as it is active.
1387 *
1388 * Fails with
1389 *   _NO_RESOURCES - out of memory
1390 *
1391 * If t is not _NORMAL, can also fail with
1392 *   _BACKEND_ACCESS - backend does not exist
1393 *
1394 * If writable is true, can also fail with
1395 *   _BACKEND_READONLY
1396 */
1397static int
1398backend_tx_begin_common(backend_type_t t, backend_tx_t **txp, int writable)
1399{
1400	backend_tx_t *ret;
1401	sqlite_backend_t *be;
1402	int r;
1403
1404	*txp = NULL;
1405
1406	ret = uu_zalloc(sizeof (*ret));
1407	if (ret == NULL)
1408		return (REP_PROTOCOL_FAIL_NO_RESOURCES);
1409
1410	if ((r = backend_lock(t, writable, &be)) != REP_PROTOCOL_SUCCESS) {
1411		uu_free(ret);
1412		return (r);
1413	}
1414
1415	ret->bt_be = be;
1416	ret->bt_readonly = !writable;
1417	ret->bt_type = t;
1418	ret->bt_full = 0;
1419
1420	*txp = ret;
1421	return (REP_PROTOCOL_SUCCESS);
1422}
1423
1424int
1425backend_tx_begin_ro(backend_type_t t, backend_tx_t **txp)
1426{
1427	return (backend_tx_begin_common(t, txp, 0));
1428}
1429
1430static void
1431backend_tx_end(backend_tx_t *tx)
1432{
1433	sqlite_backend_t *be;
1434
1435	be = tx->bt_be;
1436
1437	if (tx->bt_full) {
1438		struct sqlite *new;
1439
1440		/*
1441		 * sqlite tends to be sticky with SQLITE_FULL, so we try
1442		 * to get a fresh database handle if we got a FULL warning
1443		 * along the way.  If that fails, no harm done.
1444		 */
1445		new = sqlite_open(be->be_path, 0600, NULL);
1446		if (new != NULL) {
1447			sqlite_close(be->be_db);
1448			be->be_db = new;
1449		}
1450	}
1451	backend_unlock(be);
1452	tx->bt_be = NULL;
1453	uu_free(tx);
1454}
1455
1456void
1457backend_tx_end_ro(backend_tx_t *tx)
1458{
1459	assert(tx->bt_readonly);
1460	backend_tx_end(tx);
1461}
1462
1463/*
1464 * Fails with
1465 *   _NO_RESOURCES - out of memory
1466 *   _BACKEND_ACCESS
1467 *   _BACKEND_READONLY
1468 */
1469int
1470backend_tx_begin(backend_type_t t, backend_tx_t **txp)
1471{
1472	int r;
1473	char *errmsg;
1474	hrtime_t ts, vts;
1475
1476	r = backend_tx_begin_common(t, txp, 1);
1477	if (r != REP_PROTOCOL_SUCCESS)
1478		return (r);
1479
1480	ts = gethrtime();
1481	vts = gethrvtime();
1482	r = sqlite_exec((*txp)->bt_be->be_db, "BEGIN TRANSACTION", NULL, NULL,
1483	    &errmsg);
1484	UPDATE_TOTALS((*txp)->bt_be, bt_exec, ts, vts);
1485	if (r == SQLITE_FULL)
1486		(*txp)->bt_full = 1;
1487	r = backend_error((*txp)->bt_be, r, errmsg);
1488
1489	if (r != REP_PROTOCOL_SUCCESS) {
1490		assert(r != REP_PROTOCOL_DONE);
1491		(void) sqlite_exec((*txp)->bt_be->be_db,
1492		    "ROLLBACK TRANSACTION", NULL, NULL, NULL);
1493		backend_tx_end(*txp);
1494		*txp = NULL;
1495		return (r);
1496	}
1497
1498	(*txp)->bt_readonly = 0;
1499
1500	return (REP_PROTOCOL_SUCCESS);
1501}
1502
1503void
1504backend_tx_rollback(backend_tx_t *tx)
1505{
1506	int r;
1507	char *errmsg;
1508	sqlite_backend_t *be;
1509	hrtime_t ts, vts;
1510
1511	assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
1512	be = tx->bt_be;
1513
1514	ts = gethrtime();
1515	vts = gethrvtime();
1516	r = sqlite_exec(be->be_db, "ROLLBACK TRANSACTION", NULL, NULL,
1517	    &errmsg);
1518	UPDATE_TOTALS(be, bt_exec, ts, vts);
1519	if (r == SQLITE_FULL)
1520		tx->bt_full = 1;
1521	(void) backend_error(be, r, errmsg);
1522
1523	backend_tx_end(tx);
1524}
1525
1526/*
1527 * Fails with
1528 *   _NO_RESOURCES - out of memory
1529 */
1530int
1531backend_tx_commit(backend_tx_t *tx)
1532{
1533	int r, r2;
1534	char *errmsg;
1535	sqlite_backend_t *be;
1536	hrtime_t ts, vts;
1537
1538	assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
1539	be = tx->bt_be;
1540	ts = gethrtime();
1541	vts = gethrvtime();
1542	r = sqlite_exec(be->be_db, "COMMIT TRANSACTION", NULL, NULL,
1543	    &errmsg);
1544	UPDATE_TOTALS(be, bt_exec, ts, vts);
1545	if (r == SQLITE_FULL)
1546		tx->bt_full = 1;
1547
1548	r = backend_error(be, r, errmsg);
1549	assert(r != REP_PROTOCOL_DONE);
1550
1551	if (r != REP_PROTOCOL_SUCCESS) {
1552		r2 = sqlite_exec(be->be_db, "ROLLBACK TRANSACTION", NULL, NULL,
1553		    &errmsg);
1554		r2 = backend_error(be, r2, errmsg);
1555		if (r2 != REP_PROTOCOL_SUCCESS)
1556			backend_panic("cannot rollback failed commit");
1557
1558		backend_tx_end(tx);
1559		return (r);
1560	}
1561	backend_tx_end(tx);
1562	return (REP_PROTOCOL_SUCCESS);
1563}
1564
1565static const char *
1566id_space_to_name(enum id_space id)
1567{
1568	switch (id) {
1569	case BACKEND_ID_SERVICE_INSTANCE:
1570		return ("SI");
1571	case BACKEND_ID_PROPERTYGRP:
1572		return ("PG");
1573	case BACKEND_ID_GENERATION:
1574		return ("GEN");
1575	case BACKEND_ID_PROPERTY:
1576		return ("PROP");
1577	case BACKEND_ID_VALUE:
1578		return ("VAL");
1579	case BACKEND_ID_SNAPNAME:
1580		return ("SNAME");
1581	case BACKEND_ID_SNAPSHOT:
1582		return ("SHOT");
1583	case BACKEND_ID_SNAPLEVEL:
1584		return ("SLVL");
1585	default:
1586		abort();
1587		/*NOTREACHED*/
1588	}
1589}
1590
1591/*
1592 * Returns a new id or 0 if the id argument is invalid or the query fails.
1593 */
1594uint32_t
1595backend_new_id(backend_tx_t *tx, enum id_space id)
1596{
1597	struct run_single_int_info info;
1598	uint32_t new_id = 0;
1599	const char *name = id_space_to_name(id);
1600	char *errmsg;
1601	int ret;
1602	sqlite_backend_t *be;
1603	hrtime_t ts, vts;
1604
1605	assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
1606	be = tx->bt_be;
1607
1608	info.rs_out = &new_id;
1609	info.rs_result = REP_PROTOCOL_FAIL_NOT_FOUND;
1610
1611	ts = gethrtime();
1612	vts = gethrvtime();
1613	ret = sqlite_exec_printf(be->be_db,
1614	    "SELECT id_next FROM id_tbl WHERE (id_name = '%q');"
1615	    "UPDATE id_tbl SET id_next = id_next + 1 WHERE (id_name = '%q');",
1616	    run_single_int_callback, &info, &errmsg, name, name);
1617	UPDATE_TOTALS(be, bt_exec, ts, vts);
1618	if (ret == SQLITE_FULL)
1619		tx->bt_full = 1;
1620
1621	ret = backend_error(be, ret, errmsg);
1622
1623	if (ret != REP_PROTOCOL_SUCCESS) {
1624		return (0);
1625	}
1626
1627	return (new_id);
1628}
1629
1630/*
1631 * Returns
1632 *   _NO_RESOURCES - out of memory
1633 *   _DONE - callback aborted query
1634 *   _SUCCESS
1635 */
1636int
1637backend_tx_run(backend_tx_t *tx, backend_query_t *q,
1638    backend_run_callback_f *cb, void *data)
1639{
1640	char *errmsg = NULL;
1641	int ret;
1642	sqlite_backend_t *be;
1643	hrtime_t ts, vts;
1644
1645	assert(tx != NULL && tx->bt_be != NULL);
1646	be = tx->bt_be;
1647
1648	if (q == NULL || q->bq_buf == NULL)
1649		return (REP_PROTOCOL_FAIL_NO_RESOURCES);
1650
1651	ts = gethrtime();
1652	vts = gethrvtime();
1653	ret = sqlite_exec(be->be_db, q->bq_buf, cb, data, &errmsg);
1654	UPDATE_TOTALS(be, bt_exec, ts, vts);
1655	if (ret == SQLITE_FULL)
1656		tx->bt_full = 1;
1657	ret = backend_error(be, ret, errmsg);
1658
1659	return (ret);
1660}
1661
1662/*
1663 * Returns
1664 *   _NO_RESOURCES - out of memory
1665 *   _NOT_FOUND - the query returned no results
1666 *   _SUCCESS - the query returned a single integer
1667 */
1668int
1669backend_tx_run_single_int(backend_tx_t *tx, backend_query_t *q, uint32_t *buf)
1670{
1671	struct run_single_int_info info;
1672	int ret;
1673
1674	info.rs_out = buf;
1675	info.rs_result = REP_PROTOCOL_FAIL_NOT_FOUND;
1676
1677	ret = backend_tx_run(tx, q, run_single_int_callback, &info);
1678	assert(ret != REP_PROTOCOL_DONE);
1679
1680	if (ret != REP_PROTOCOL_SUCCESS)
1681		return (ret);
1682
1683	return (info.rs_result);
1684}
1685
1686/*
1687 * Fails with
1688 *   _NO_RESOURCES - out of memory
1689 */
1690int
1691backend_tx_run_update(backend_tx_t *tx, const char *format, ...)
1692{
1693	va_list a;
1694	char *errmsg;
1695	int ret;
1696	sqlite_backend_t *be;
1697	hrtime_t ts, vts;
1698
1699	assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
1700	be = tx->bt_be;
1701
1702	va_start(a, format);
1703	ts = gethrtime();
1704	vts = gethrvtime();
1705	ret = sqlite_exec_vprintf(be->be_db, format, NULL, NULL, &errmsg, a);
1706	UPDATE_TOTALS(be, bt_exec, ts, vts);
1707	if (ret == SQLITE_FULL)
1708		tx->bt_full = 1;
1709	va_end(a);
1710	ret = backend_error(be, ret, errmsg);
1711	assert(ret != REP_PROTOCOL_DONE);
1712
1713	return (ret);
1714}
1715
1716/*
1717 * returns REP_PROTOCOL_FAIL_NOT_FOUND if no changes occured
1718 */
1719int
1720backend_tx_run_update_changed(backend_tx_t *tx, const char *format, ...)
1721{
1722	va_list a;
1723	char *errmsg;
1724	int ret;
1725	sqlite_backend_t *be;
1726	hrtime_t ts, vts;
1727
1728	assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
1729	be = tx->bt_be;
1730
1731	va_start(a, format);
1732	ts = gethrtime();
1733	vts = gethrvtime();
1734	ret = sqlite_exec_vprintf(be->be_db, format, NULL, NULL, &errmsg, a);
1735	UPDATE_TOTALS(be, bt_exec, ts, vts);
1736	if (ret == SQLITE_FULL)
1737		tx->bt_full = 1;
1738	va_end(a);
1739
1740	ret = backend_error(be, ret, errmsg);
1741
1742	return (ret);
1743}
1744
1745#define	BACKEND_ADD_SCHEMA(be, file, tbls, idxs) \
1746	(backend_add_schema((be), (file), \
1747	    (tbls), sizeof (tbls) / sizeof (*(tbls)), \
1748	    (idxs), sizeof (idxs) / sizeof (*(idxs))))
1749
1750static int
1751backend_add_schema(sqlite_backend_t *be, const char *file,
1752    struct backend_tbl_info *tbls, int tbl_count,
1753    struct backend_idx_info *idxs, int idx_count)
1754{
1755	int i;
1756	char *errmsg;
1757	int ret;
1758
1759	/*
1760	 * Create the tables.
1761	 */
1762	for (i = 0; i < tbl_count; i++) {
1763		if (tbls[i].bti_name == NULL) {
1764			assert(i + 1 == tbl_count);
1765			break;
1766		}
1767		ret = sqlite_exec_printf(be->be_db,
1768		    "CREATE TABLE %s (%s);\n",
1769		    NULL, NULL, &errmsg, tbls[i].bti_name, tbls[i].bti_cols);
1770
1771		if (ret != SQLITE_OK) {
1772			configd_critical(
1773			    "%s: %s table creation fails: %s\n", file,
1774			    tbls[i].bti_name, errmsg);
1775			free(errmsg);
1776			return (-1);
1777		}
1778	}
1779
1780	/*
1781	 * Make indices on key tables and columns.
1782	 */
1783	for (i = 0; i < idx_count; i++) {
1784		if (idxs[i].bxi_tbl == NULL) {
1785			assert(i + 1 == idx_count);
1786			break;
1787		}
1788
1789		ret = sqlite_exec_printf(be->be_db,
1790		    "CREATE INDEX %s_%s ON %s (%s);\n",
1791		    NULL, NULL, &errmsg, idxs[i].bxi_tbl, idxs[i].bxi_idx,
1792		    idxs[i].bxi_tbl, idxs[i].bxi_cols);
1793
1794		if (ret != SQLITE_OK) {
1795			configd_critical(
1796			    "%s: %s_%s index creation fails: %s\n", file,
1797			    idxs[i].bxi_tbl, idxs[i].bxi_idx, errmsg);
1798			free(errmsg);
1799			return (-1);
1800		}
1801	}
1802	return (0);
1803}
1804
1805static int
1806backend_init_schema(sqlite_backend_t *be, const char *db_file, backend_type_t t)
1807{
1808	int i;
1809	char *errmsg;
1810	int ret;
1811
1812	assert(t == BACKEND_TYPE_NORMAL || t == BACKEND_TYPE_NONPERSIST);
1813
1814	if (t == BACKEND_TYPE_NORMAL) {
1815		ret = BACKEND_ADD_SCHEMA(be, db_file, tbls_normal, idxs_normal);
1816	} else if (t == BACKEND_TYPE_NONPERSIST) {
1817		ret = BACKEND_ADD_SCHEMA(be, db_file, tbls_np, idxs_np);
1818	} else {
1819		abort();		/* can't happen */
1820	}
1821
1822	if (ret < 0) {
1823		return (ret);
1824	}
1825
1826	ret = BACKEND_ADD_SCHEMA(be, db_file, tbls_common, idxs_common);
1827	if (ret < 0) {
1828		return (ret);
1829	}
1830
1831	/*
1832	 * Add the schema version to the table
1833	 */
1834	ret = sqlite_exec_printf(be->be_db,
1835	    "INSERT INTO schema_version (schema_version) VALUES (%d)",
1836	    NULL, NULL, &errmsg, BACKEND_SCHEMA_VERSION);
1837	if (ret != SQLITE_OK) {
1838		configd_critical(
1839		    "setting schema version fails: %s\n", errmsg);
1840		free(errmsg);
1841	}
1842
1843	/*
1844	 * Populate id_tbl with initial IDs.
1845	 */
1846	for (i = 0; i < BACKEND_ID_INVALID; i++) {
1847		const char *name = id_space_to_name(i);
1848
1849		ret = sqlite_exec_printf(be->be_db,
1850		    "INSERT INTO id_tbl (id_name, id_next) "
1851		    "VALUES ('%q', %d);", NULL, NULL, &errmsg, name, 1);
1852		if (ret != SQLITE_OK) {
1853			configd_critical(
1854			    "id insertion for %s fails: %s\n", name, errmsg);
1855			free(errmsg);
1856			return (-1);
1857		}
1858	}
1859	/*
1860	 * Set the persistance of the database.  The normal database is marked
1861	 * "synchronous", so that all writes are synchronized to stable storage
1862	 * before proceeding.
1863	 */
1864	ret = sqlite_exec_printf(be->be_db,
1865	    "PRAGMA default_synchronous = %s; PRAGMA synchronous = %s;",
1866	    NULL, NULL, &errmsg,
1867	    (t == BACKEND_TYPE_NORMAL)? "ON" : "OFF",
1868	    (t == BACKEND_TYPE_NORMAL)? "ON" : "OFF");
1869	if (ret != SQLITE_OK) {
1870		configd_critical("pragma setting fails: %s\n", errmsg);
1871		free(errmsg);
1872		return (-1);
1873	}
1874
1875	return (0);
1876}
1877
1878int
1879backend_init(const char *db_file, const char *npdb_file, int have_np)
1880{
1881	sqlite_backend_t *be;
1882	int r;
1883	int writable_persist = 1;
1884
1885	/* set up our temporary directory */
1886	sqlite_temp_directory = "/etc/svc/volatile";
1887
1888	if (strcmp(SQLITE_VERSION, sqlite_version) != 0) {
1889		configd_critical("Mismatched link!  (%s should be %s)\n",
1890		    sqlite_version, SQLITE_VERSION);
1891		return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
1892	}
1893	if (db_file == NULL)
1894		db_file = REPOSITORY_DB;
1895	if (strcmp(db_file, REPOSITORY_DB) != 0) {
1896		is_main_repository = 0;
1897	}
1898
1899	r = backend_create(BACKEND_TYPE_NORMAL, db_file, &be);
1900	switch (r) {
1901	case BACKEND_CREATE_FAIL:
1902		return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
1903	case BACKEND_CREATE_LOCKED:
1904		return (CONFIGD_EXIT_DATABASE_LOCKED);
1905	case BACKEND_CREATE_SUCCESS:
1906		break;		/* success */
1907	case BACKEND_CREATE_READONLY:
1908		writable_persist = 0;
1909		break;
1910	case BACKEND_CREATE_NEED_INIT:
1911		if (backend_init_schema(be, db_file, BACKEND_TYPE_NORMAL)) {
1912			backend_destroy(be);
1913			return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
1914		}
1915		break;
1916	default:
1917		abort();
1918		/*NOTREACHED*/
1919	}
1920	backend_create_finish(BACKEND_TYPE_NORMAL, be);
1921
1922	if (have_np) {
1923		if (npdb_file == NULL)
1924			npdb_file = NONPERSIST_DB;
1925
1926		r = backend_create(BACKEND_TYPE_NONPERSIST, npdb_file, &be);
1927		switch (r) {
1928		case BACKEND_CREATE_SUCCESS:
1929			break;		/* success */
1930		case BACKEND_CREATE_FAIL:
1931			return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
1932		case BACKEND_CREATE_LOCKED:
1933			return (CONFIGD_EXIT_DATABASE_LOCKED);
1934		case BACKEND_CREATE_READONLY:
1935			configd_critical("%s: unable to write\n", npdb_file);
1936			return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
1937		case BACKEND_CREATE_NEED_INIT:
1938			if (backend_init_schema(be, db_file,
1939			    BACKEND_TYPE_NONPERSIST)) {
1940				backend_destroy(be);
1941				return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
1942			}
1943			break;
1944		default:
1945			abort();
1946			/*NOTREACHED*/
1947		}
1948		backend_create_finish(BACKEND_TYPE_NONPERSIST, be);
1949
1950		/*
1951		 * If we started up with a writable filesystem, but the
1952		 * non-persistent database needed initialization, we
1953		 * are booting a non-global zone, so do a backup.
1954		 */
1955		if (r == BACKEND_CREATE_NEED_INIT && writable_persist &&
1956		    backend_lock(BACKEND_TYPE_NORMAL, 0, &be) ==
1957		    REP_PROTOCOL_SUCCESS) {
1958			if (backend_create_backup_locked(be,
1959			    REPOSITORY_BOOT_BACKUP) != REP_PROTOCOL_SUCCESS) {
1960				configd_critical(
1961				    "unable to create \"%s\" backup of "
1962				    "\"%s\"\n", REPOSITORY_BOOT_BACKUP,
1963				    be->be_path);
1964			}
1965			backend_unlock(be);
1966		}
1967	}
1968	return (CONFIGD_EXIT_OKAY);
1969}
1970
1971/*
1972 * quiesce all database activity prior to exiting
1973 */
1974void
1975backend_fini(void)
1976{
1977	sqlite_backend_t *be_normal, *be_np;
1978
1979	(void) backend_lock(BACKEND_TYPE_NORMAL, 1, &be_normal);
1980	(void) backend_lock(BACKEND_TYPE_NONPERSIST, 1, &be_np);
1981}
1982
1983#define	QUERY_BASE	128
1984backend_query_t *
1985backend_query_alloc(void)
1986{
1987	backend_query_t *q;
1988	q = calloc(1, sizeof (backend_query_t));
1989	if (q != NULL) {
1990		q->bq_size = QUERY_BASE;
1991		q->bq_buf = calloc(1, q->bq_size);
1992		if (q->bq_buf == NULL) {
1993			q->bq_size = 0;
1994		}
1995
1996	}
1997	return (q);
1998}
1999
2000void
2001backend_query_append(backend_query_t *q, const char *value)
2002{
2003	char *alloc;
2004	int count;
2005	size_t size, old_len;
2006
2007	if (q == NULL) {
2008		/* We'll discover the error when we try to run the query. */
2009		return;
2010	}
2011
2012	while (q->bq_buf != NULL) {
2013		old_len = strlen(q->bq_buf);
2014		size = q->bq_size;
2015		count = strlcat(q->bq_buf, value, size);
2016
2017		if (count < size)
2018			break;				/* success */
2019
2020		q->bq_buf[old_len] = 0;
2021		size = round_up_to_p2(count + 1);
2022
2023		assert(size > q->bq_size);
2024		alloc = realloc(q->bq_buf, size);
2025		if (alloc == NULL) {
2026			free(q->bq_buf);
2027			q->bq_buf = NULL;
2028			break;				/* can't grow */
2029		}
2030
2031		q->bq_buf = alloc;
2032		q->bq_size = size;
2033	}
2034}
2035
2036void
2037backend_query_add(backend_query_t *q, const char *format, ...)
2038{
2039	va_list args;
2040	char *new;
2041
2042	if (q == NULL || q->bq_buf == NULL)
2043		return;
2044
2045	va_start(args, format);
2046	new = sqlite_vmprintf(format, args);
2047	va_end(args);
2048
2049	if (new == NULL) {
2050		free(q->bq_buf);
2051		q->bq_buf = NULL;
2052		return;
2053	}
2054
2055	backend_query_append(q, new);
2056
2057	free(new);
2058}
2059
2060void
2061backend_query_free(backend_query_t *q)
2062{
2063	if (q != NULL) {
2064		if (q->bq_buf != NULL) {
2065			free(q->bq_buf);
2066		}
2067		free(q);
2068	}
2069}
2070