1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2001,2008 Oracle.  All rights reserved.
5 *
6 * $Id: fop_basic.c,v 12.30 2008/01/11 20:50:00 bostic Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/db_page.h"
13#include "dbinc/fop.h"
14#include "dbinc/log.h"
15#include "dbinc/mp.h"
16#include "dbinc/txn.h"
17#include "dbinc/db_am.h"
18
19/*
20 * The transactional guarantees Berkeley DB provides for file
21 * system level operations (database physical file create, delete,
22 * rename) are based on our understanding of current file system
23 * semantics; a system that does not provide these semantics and
24 * guarantees could be in danger.
25 *
26 * First, as in standard database changes, fsync and fdatasync must
27 * work: when applied to the log file, the records written into the
28 * log must be transferred to stable storage.
29 *
30 * Second, it must not be possible for the log file to be removed
31 * without previous file system level operations being flushed to
32 * stable storage.  Berkeley DB applications write log records
33 * describing file system operations into the log, then perform the
34 * file system operation, then commit the enclosing transaction
35 * (which flushes the log file to stable storage).  Subsequently,
36 * a database environment checkpoint may make it possible for the
37 * application to remove the log file containing the record of the
38 * file system operation.  DB's transactional guarantees for file
39 * system operations require the log file removal not succeed until
40 * all previous filesystem operations have been flushed to stable
41 * storage.  In other words, the flush of the log file, or the
42 * removal of the log file, must block until all previous
43 * filesystem operations have been flushed to stable storage.  This
44 * semantic is not, as far as we know, required by any existing
45 * standards document, but we have never seen a filesystem where
46 * it does not apply.
47 */
48
49/*
50 * __fop_create --
51 * Create a (transactionally protected) file system object.  This is used
52 * to create DB files now, potentially blobs, queue extents and anything
53 * else you wish to store in a file system object.
54 *
55 * PUBLIC: int __fop_create __P((ENV *,
56 * PUBLIC:     DB_TXN *, DB_FH **, const char *, APPNAME, int, u_int32_t));
57 */
58int
59__fop_create(env, txn, fhpp, name, appname, mode, flags)
60	ENV *env;
61	DB_TXN *txn;
62	DB_FH **fhpp;
63	const char *name;
64	APPNAME appname;
65	int mode;
66	u_int32_t flags;
67{
68	DBT data;
69	DB_FH *fhp;
70	DB_LSN lsn;
71	int ret;
72	char *real_name;
73
74	real_name = NULL;
75	fhp = NULL;
76
77	if ((ret =
78	    __db_appname(env, appname, name, 0, NULL, &real_name)) != 0)
79		return (ret);
80
81	if (mode == 0)
82		mode = DB_MODE_600;
83
84	if (DBENV_LOGGING(env)
85#if !defined(DEBUG_WOP)
86	    && txn != NULL
87#endif
88	    ) {
89		DB_INIT_DBT(data, name, strlen(name) + 1);
90		if ((ret = __fop_create_log(env, txn, &lsn,
91		    flags | DB_FLUSH,
92		    &data, (u_int32_t)appname, (u_int32_t)mode)) != 0)
93			goto err;
94	}
95
96	DB_ENV_TEST_RECOVERY(env, DB_TEST_POSTLOG, ret, name);
97
98	if (fhpp == NULL)
99		fhpp = &fhp;
100	ret = __os_open(
101	    env, real_name, 0, DB_OSO_CREATE | DB_OSO_EXCL, mode, fhpp);
102
103err:
104DB_TEST_RECOVERY_LABEL
105	if (fhpp == &fhp && fhp != NULL)
106		(void)__os_closehandle(env, fhp);
107	if (real_name != NULL)
108		__os_free(env, real_name);
109	return (ret);
110}
111
112/*
113 * __fop_remove --
114 *	Remove a file system object.
115 *
116 * PUBLIC: int __fop_remove __P((ENV *,
117 * PUBLIC:     DB_TXN *, u_int8_t *, const char *, APPNAME, u_int32_t));
118 */
119int
120__fop_remove(env, txn, fileid, name, appname, flags)
121	ENV *env;
122	DB_TXN *txn;
123	u_int8_t *fileid;
124	const char *name;
125	APPNAME appname;
126	u_int32_t flags;
127{
128	DBT fdbt, ndbt;
129	DB_LSN lsn;
130	char *real_name;
131	int ret;
132
133	real_name = NULL;
134
135	if ((ret =
136	    __db_appname(env, appname, name, 0, NULL, &real_name)) != 0)
137		goto err;
138
139	if (!IS_REAL_TXN(txn)) {
140		if (fileid != NULL && (ret = __memp_nameop(
141		    env, fileid, NULL, real_name, NULL, 0)) != 0)
142			goto err;
143	} else {
144		if (DBENV_LOGGING(env)
145#if !defined(DEBUG_WOP)
146		    && txn != NULL
147#endif
148		) {
149			memset(&fdbt, 0, sizeof(ndbt));
150			fdbt.data = fileid;
151			fdbt.size = fileid == NULL ? 0 : DB_FILE_ID_LEN;
152			DB_INIT_DBT(ndbt, name, strlen(name) + 1);
153			if ((ret = __fop_remove_log(env, txn, &lsn,
154			    flags, &ndbt, &fdbt, (u_int32_t)appname)) != 0)
155				goto err;
156		}
157		ret = __txn_remevent(env, txn, real_name, fileid, 0);
158	}
159
160err:	if (real_name != NULL)
161		__os_free(env, real_name);
162	return (ret);
163}
164
165/*
166 * __fop_write
167 *
168 * Write "size" bytes from "buf" to file "name" beginning at offset "off."
169 * If the file is open, supply a handle in fhp.  Istmp indicate if this is
170 * an operation that needs to be undone in the face of failure (i.e., if
171 * this is a write to a temporary file, we're simply going to remove the
172 * file, so don't worry about undoing the write).
173 *
174 * Currently, we *only* use this with istmp true.  If we need more general
175 * handling, then we'll have to zero out regions on abort (and possibly
176 * log the before image of the data in the log record).
177 *
178 * PUBLIC: int __fop_write __P((ENV *,
179 * PUBLIC:     DB_TXN *, const char *, APPNAME, DB_FH *, u_int32_t, db_pgno_t,
180 * PUBLIC:     u_int32_t, void *, u_int32_t, u_int32_t, u_int32_t));
181 */
182int
183__fop_write(env,
184    txn, name, appname, fhp, pgsize, pageno, off, buf, size, istmp, flags)
185	ENV *env;
186	DB_TXN *txn;
187	const char *name;
188	APPNAME appname;
189	DB_FH *fhp;
190	u_int32_t pgsize;
191	db_pgno_t pageno;
192	u_int32_t off;
193	void *buf;
194	u_int32_t size, istmp, flags;
195{
196	DBT data, namedbt;
197	DB_LSN lsn;
198	size_t nbytes;
199	int local_open, ret, t_ret;
200	char *real_name;
201
202	DB_ASSERT(env, istmp != 0);
203
204	ret = local_open = 0;
205	real_name = NULL;
206
207	if ((ret =
208	    __db_appname(env, appname, name, 0, NULL, &real_name)) != 0)
209		return (ret);
210
211	if (DBENV_LOGGING(env)
212#if !defined(DEBUG_WOP)
213	    && txn != NULL
214#endif
215	    ) {
216		memset(&data, 0, sizeof(data));
217		data.data = buf;
218		data.size = size;
219		DB_INIT_DBT(namedbt, name, strlen(name) + 1);
220		if ((ret = __fop_write_log(env, txn,
221		    &lsn, flags, &namedbt, (u_int32_t)appname,
222		    pgsize, pageno, off, &data, istmp)) != 0)
223			goto err;
224	}
225
226	if (fhp == NULL) {
227		/* File isn't open; we need to reopen it. */
228		if ((ret = __os_open(env, real_name, 0, 0, 0, &fhp)) != 0)
229			goto err;
230		local_open = 1;
231	}
232
233	/* Seek to offset. */
234	if ((ret = __os_seek(env, fhp, pageno, pgsize, off)) != 0)
235		goto err;
236
237	/* Now do the write. */
238	if ((ret = __os_write(env, fhp, buf, size, &nbytes)) != 0)
239		goto err;
240
241err:	if (local_open &&
242	    (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
243			ret = t_ret;
244
245	if (real_name != NULL)
246		__os_free(env, real_name);
247	return (ret);
248}
249
250/*
251 * __fop_rename --
252 *	Change a file's name.
253 *
254 * PUBLIC: int __fop_rename __P((ENV *, DB_TXN *, const char *,
255 * PUBLIC:      const char *, u_int8_t *, APPNAME, int, u_int32_t));
256 */
257int
258__fop_rename(env, txn, oldname, newname, fid, appname, with_undo, flags)
259	ENV *env;
260	DB_TXN *txn;
261	const char *oldname;
262	const char *newname;
263	u_int8_t *fid;
264	APPNAME appname;
265	int with_undo;
266	u_int32_t flags;
267{
268	DBT fiddbt, new, old;
269	DB_LSN lsn;
270	int ret;
271	char *n, *o;
272
273	o = n = NULL;
274	if ((ret = __db_appname(env, appname, oldname, 0, NULL, &o)) != 0)
275		goto err;
276	if ((ret = __db_appname(env, appname, newname, 0, NULL, &n)) != 0)
277		goto err;
278
279	if (DBENV_LOGGING(env)
280#if !defined(DEBUG_WOP)
281	    && txn != NULL
282#endif
283	    ) {
284		DB_INIT_DBT(old, oldname, strlen(oldname) + 1);
285		DB_INIT_DBT(new, newname, strlen(newname) + 1);
286		memset(&fiddbt, 0, sizeof(fiddbt));
287		fiddbt.data = fid;
288		fiddbt.size = DB_FILE_ID_LEN;
289		if (with_undo)
290			ret = __fop_rename_log(env,
291			    txn, &lsn, flags | DB_FLUSH,
292			    &old, &new, &fiddbt, (u_int32_t)appname);
293		else
294			ret = __fop_rename_noundo_log(env,
295			    txn, &lsn, flags | DB_FLUSH,
296			    &old, &new, &fiddbt, (u_int32_t)appname);
297		if (ret != 0)
298			goto err;
299	}
300
301	ret = __memp_nameop(env, fid, newname, o, n, 0);
302
303err:	if (o != NULL)
304		__os_free(env, o);
305	if (n != NULL)
306		__os_free(env, n);
307	return (ret);
308}
309