1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2001,2008 Oracle.  All rights reserved.
5 *
6 * $Id: fop_rec.c,v 12.27 2008/01/31 18:40:43 bostic Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/db_page.h"
13#include "dbinc/fop.h"
14#include "dbinc/db_am.h"
15#include "dbinc/mp.h"
16#include "dbinc/txn.h"
17
18static int __fop_rename_recover_int
19    __P((ENV *, DBT *, DB_LSN *, db_recops, void *, int));
20
21/*
22 * The transactional guarantees Berkeley DB provides for file
23 * system level operations (database physical file create, delete,
24 * rename) are based on our understanding of current file system
25 * semantics; a system that does not provide these semantics and
26 * guarantees could be in danger.
27 *
28 * First, as in standard database changes, fsync and fdatasync must
29 * work: when applied to the log file, the records written into the
30 * log must be transferred to stable storage.
31 *
32 * Second, it must not be possible for the log file to be removed
33 * without previous file system level operations being flushed to
34 * stable storage.  Berkeley DB applications write log records
35 * describing file system operations into the log, then perform the
36 * file system operation, then commit the enclosing transaction
37 * (which flushes the log file to stable storage).  Subsequently,
38 * a database environment checkpoint may make it possible for the
39 * application to remove the log file containing the record of the
40 * file system operation.  DB's transactional guarantees for file
41 * system operations require the log file removal not succeed until
42 * all previous filesystem operations have been flushed to stable
43 * storage.  In other words, the flush of the log file, or the
44 * removal of the log file, must block until all previous
45 * filesystem operations have been flushed to stable storage.  This
46 * semantic is not, as far as we know, required by any existing
47 * standards document, but we have never seen a filesystem where
48 * it does not apply.
49 */
50
51/*
52 * __fop_create_recover --
53 *	Recovery function for create.
54 *
55 * PUBLIC: int __fop_create_recover
56 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
57 */
58int
59__fop_create_recover(env, dbtp, lsnp, op, info)
60	ENV *env;
61	DBT *dbtp;
62	DB_LSN *lsnp;
63	db_recops op;
64	void *info;
65{
66	__fop_create_args *argp;
67	DB_FH *fhp;
68	DBMETA *meta;
69	u_int8_t mbuf[DBMETASIZE];
70	int ret;
71	char *real_name;
72
73	COMPQUIET(info, NULL);
74
75	real_name = NULL;
76	REC_PRINT(__fop_create_print);
77	REC_NOOP_INTRO(__fop_create_read);
78	meta = (DBMETA *)mbuf;
79
80	if ((ret = __db_appname(env, (APPNAME)argp->appname,
81	    (const char *)argp->name.data, 0, NULL, &real_name)) != 0)
82		goto out;
83
84	if (DB_UNDO(op)) {
85		/*
86		 * If the file was opened in mpool, we must mark it as
87		 * dead via nameop which will also unlink the file.
88		 */
89		if (__os_open(env, real_name, 0, 0, 0, &fhp) == 0) {
90			if (__fop_read_meta(env,
91			    real_name, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 &&
92			    __db_chk_meta(env, NULL, meta, 1) == 0) {
93				if ((ret = __memp_nameop(env,
94				    meta->uid, NULL, real_name, NULL, 0)) != 0)
95					goto out;
96			} else
97				goto do_unlink;
98			(void)__os_closehandle(env, fhp);
99		} else
100do_unlink:		(void)__os_unlink(env, real_name, 0);
101	} else if (DB_REDO(op)) {
102		if ((ret = __os_open(env, real_name, 0,
103		    DB_OSO_CREATE, (int)argp->mode, &fhp)) == 0)
104			(void)__os_closehandle(env, fhp);
105		else
106			goto out;
107	}
108
109	*lsnp = argp->prev_lsn;
110
111out: if (real_name != NULL)
112		__os_free(env, real_name);
113
114	REC_NOOP_CLOSE;
115}
116
117/*
118 * __fop_remove_recover --
119 *	Recovery function for remove.
120 *
121 * PUBLIC: int __fop_remove_recover
122 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
123 */
124int
125__fop_remove_recover(env, dbtp, lsnp, op, info)
126	ENV *env;
127	DBT *dbtp;
128	DB_LSN *lsnp;
129	db_recops op;
130	void *info;
131{
132	__fop_remove_args *argp;
133	int ret;
134	char *real_name;
135
136	COMPQUIET(info, NULL);
137
138	real_name = NULL;
139	REC_PRINT(__fop_remove_print);
140	REC_NOOP_INTRO(__fop_remove_read);
141
142	if ((ret = __db_appname(env, (APPNAME)argp->appname,
143	    (const char *)argp->name.data, 0, NULL, &real_name)) != 0)
144		goto out;
145
146	/* Its ok if the file is not there. */
147	if (DB_REDO(op))
148		(void)__memp_nameop(env,
149		    (u_int8_t *)argp->fid.data, NULL, real_name, NULL, 0);
150
151	*lsnp = argp->prev_lsn;
152out:	if (real_name != NULL)
153		__os_free(env, real_name);
154	REC_NOOP_CLOSE;
155}
156
157/*
158 * __fop_write_recover --
159 *	Recovery function for writechunk.
160 *
161 * PUBLIC: int __fop_write_recover
162 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
163 */
164int
165__fop_write_recover(env, dbtp, lsnp, op, info)
166	ENV *env;
167	DBT *dbtp;
168	DB_LSN *lsnp;
169	db_recops op;
170	void *info;
171{
172	__fop_write_args *argp;
173	int ret;
174
175	COMPQUIET(info, NULL);
176
177	REC_PRINT(__fop_write_print);
178	REC_NOOP_INTRO(__fop_write_read);
179
180	ret = 0;
181	if (DB_UNDO(op))
182		DB_ASSERT(env, argp->flag != 0);
183	else if (DB_REDO(op))
184		ret = __fop_write(env,
185		    argp->txnp, argp->name.data, (APPNAME)argp->appname,
186		    NULL, argp->pgsize, argp->pageno, argp->offset,
187		    argp->page.data, argp->page.size, argp->flag, 0);
188
189	if (ret == 0)
190		*lsnp = argp->prev_lsn;
191	REC_NOOP_CLOSE;
192}
193
194/*
195 * __fop_rename_recover --
196 *	Recovery functions for rename.  There are two variants that
197 * both use the same utility function.  Had we known about this on day
198 * one, we would have simply added a parameter.  However, since we need
199 * to retain old records for backward compatibility (online-upgrade)
200 * wrapping the two seems like the right solution.
201 *
202 * PUBLIC: int __fop_rename_recover
203 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
204 *
205 * PUBLIC: int __fop_rename_noundo_recover
206 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
207 */
208int
209__fop_rename_recover(env, dbtp, lsnp, op, info)
210	ENV *env;
211	DBT *dbtp;
212	DB_LSN *lsnp;
213	db_recops op;
214	void *info;
215{
216	return (__fop_rename_recover_int(env, dbtp, lsnp, op, info, 1));
217}
218
219int
220__fop_rename_noundo_recover(env, dbtp, lsnp, op, info)
221	ENV *env;
222	DBT *dbtp;
223	DB_LSN *lsnp;
224	db_recops op;
225	void *info;
226{
227	return (__fop_rename_recover_int(env, dbtp, lsnp, op, info, 0));
228}
229
230static int
231__fop_rename_recover_int(env, dbtp, lsnp, op, info, undo)
232	ENV *env;
233	DBT *dbtp;
234	DB_LSN *lsnp;
235	db_recops op;
236	void *info;
237	int undo;
238{
239	__fop_rename_args *argp;
240	DB_FH *fhp;
241	DBMETA *meta;
242	u_int8_t *fileid, mbuf[DBMETASIZE];
243	int ret;
244	char *real_new, *real_old, *src;
245
246	COMPQUIET(info, NULL);
247
248	fhp = NULL;
249	meta = (DBMETA *)&mbuf[0];
250	ret = 0;
251	real_new = real_old = NULL;
252
253	REC_PRINT(__fop_rename_print);
254	REC_NOOP_INTRO(__fop_rename_read);
255	fileid = argp->fileid.data;
256
257	if ((ret = __db_appname(env, (APPNAME)argp->appname,
258	    (const char *)argp->newname.data, 0, NULL, &real_new)) != 0)
259		goto out;
260	if ((ret = __db_appname(env, (APPNAME)argp->appname,
261	    (const char *)argp->oldname.data, 0, NULL, &real_old)) != 0)
262		goto out;
263
264	/*
265	 * Verify that we are manipulating the correct file.  We should always
266	 * be OK on an ABORT or an APPLY, but during recovery, we have to
267	 * check.
268	 */
269	if (op != DB_TXN_ABORT && op != DB_TXN_APPLY) {
270		src = DB_UNDO(op) ? real_new : real_old;
271		/*
272		 * Interpret any error as meaning that the file either doesn't
273		 * exist, doesn't have a meta-data page, or is in some other
274		 * way, shape or form, incorrect, so that we should not restore
275		 * it.
276		 */
277		if (__os_open(env, src, 0, 0, 0, &fhp) != 0)
278			goto done;
279		if (__fop_read_meta(env,
280		    src, mbuf, DBMETASIZE, fhp, 1, NULL) != 0)
281			goto done;
282		if (__db_chk_meta(env, NULL, meta, 1) != 0)
283			goto done;
284		if (memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0)
285			goto done;
286		(void)__os_closehandle(env, fhp);
287		fhp = NULL;
288		if (DB_REDO(op)) {
289			/*
290			 * Check to see if the target file exists.  If it
291			 * does and it does not have the proper id then
292			 * it is a later version.  We just remove the source
293			 * file since the state of the world is beyond this
294			 * point.
295			 */
296			if (__os_open(env, real_new, 0, 0, 0, &fhp) == 0 &&
297			    __fop_read_meta(env, src, mbuf,
298			    DBMETASIZE, fhp, 1, NULL) == 0 &&
299			    __db_chk_meta(env, NULL, meta, 1) == 0 &&
300			    memcmp(argp->fileid.data,
301			    meta->uid, DB_FILE_ID_LEN) != 0) {
302				(void)__memp_nameop(env,
303				    fileid, NULL, real_old, NULL, 0);
304				goto done;
305			}
306		}
307	}
308
309	if (undo && DB_UNDO(op))
310		(void)__memp_nameop(env, fileid,
311		    (const char *)argp->oldname.data, real_new, real_old, 0);
312	if (DB_REDO(op))
313		(void)__memp_nameop(env, fileid,
314		    (const char *)argp->newname.data, real_old, real_new, 0);
315
316done:	*lsnp = argp->prev_lsn;
317out:	if (real_new != NULL)
318		__os_free(env, real_new);
319	if (real_old != NULL)
320		__os_free(env, real_old);
321	if (fhp != NULL)
322		(void)__os_closehandle(env, fhp);
323
324	REC_NOOP_CLOSE;
325}
326
327/*
328 * __fop_file_remove_recover --
329 *	Recovery function for file_remove.  On the REDO pass, we need to
330 * make sure no one recreated the file while we weren't looking.  On an
331 * undo pass must check if the file we are interested in is the one that
332 * exists and then set the status of the child transaction depending on
333 * what we find out.
334 *
335 * PUBLIC: int __fop_file_remove_recover
336 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
337 */
338int
339__fop_file_remove_recover(env, dbtp, lsnp, op, info)
340	ENV *env;
341	DBT *dbtp;
342	DB_LSN *lsnp;
343	db_recops op;
344	void *info;
345{
346	__fop_file_remove_args *argp;
347	DBMETA *meta;
348	DB_FH *fhp;
349	size_t len;
350	u_int8_t mbuf[DBMETASIZE];
351	u_int32_t cstat, ret_stat;
352	int is_real, is_tmp, ret;
353	char *real_name;
354
355	fhp = NULL;
356	meta = (DBMETA *)&mbuf[0];
357	is_real = is_tmp = 0;
358	real_name = NULL;
359	REC_PRINT(__fop_file_remove_print);
360	REC_NOOP_INTRO(__fop_file_remove_read);
361
362	/*
363	 * This record is only interesting on the backward, forward, and
364	 * apply phases.
365	 */
366	if (op != DB_TXN_BACKWARD_ROLL &&
367	    op != DB_TXN_FORWARD_ROLL && op != DB_TXN_APPLY)
368		goto done;
369
370	if ((ret = __db_appname(env,
371	    (APPNAME)argp->appname, argp->name.data, 0, NULL, &real_name)) != 0)
372		goto out;
373
374	/* Verify that we are manipulating the correct file.  */
375	len = 0;
376	if (__os_open(env, real_name, 0, 0, 0, &fhp) != 0 ||
377	    (ret = __fop_read_meta(env, real_name,
378	    mbuf, DBMETASIZE, fhp, 1, &len)) != 0) {
379		/*
380		 * If len is non-zero, then the file exists and has something
381		 * in it, but that something isn't a full meta-data page, so
382		 * this is very bad.  Bail out!
383		 */
384		if (len != 0)
385			goto out;
386
387		/* File does not exist. */
388		cstat = TXN_EXPECTED;
389	} else {
390		/*
391		 * We can ignore errors here since we'll simply fail the
392		 * checks below and assume this is the wrong file.
393		 */
394		(void)__db_chk_meta(env, NULL, meta, 1);
395		is_real =
396		    memcmp(argp->real_fid.data, meta->uid, DB_FILE_ID_LEN) == 0;
397		is_tmp =
398		    memcmp(argp->tmp_fid.data, meta->uid, DB_FILE_ID_LEN) == 0;
399
400		if (!is_real && !is_tmp)
401			/* File exists, but isn't what we were removing. */
402			cstat = TXN_IGNORE;
403		else
404			/* File exists and is the one that we were removing. */
405			cstat = TXN_COMMIT;
406	}
407	if (fhp != NULL) {
408		(void)__os_closehandle(env, fhp);
409		fhp = NULL;
410	}
411
412	if (DB_UNDO(op)) {
413		/* On the backward pass, we leave a note for the child txn. */
414		if ((ret = __db_txnlist_update(env,
415		    info, argp->child, cstat, NULL, &ret_stat, 1)) != 0)
416			goto out;
417	} else if (DB_REDO(op)) {
418		/*
419		 * On the forward pass, check if someone recreated the
420		 * file while we weren't looking.
421		 */
422		if (cstat == TXN_COMMIT)
423			(void)__memp_nameop(env,
424			    is_real ? argp->real_fid.data : argp->tmp_fid.data,
425			    NULL, real_name, NULL, 0);
426	}
427
428done:	*lsnp = argp->prev_lsn;
429	ret = 0;
430
431out:	if (real_name != NULL)
432		__os_free(env, real_name);
433	if (fhp != NULL)
434		(void)__os_closehandle(env, fhp);
435	REC_NOOP_CLOSE;
436}
437