1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2001-2009 Oracle.  All rights reserved.
5 *
6 * $Id$
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/db_page.h"
13#include "dbinc/fop.h"
14#include "dbinc/db_am.h"
15#include "dbinc/mp.h"
16#include "dbinc/txn.h"
17
18static int __fop_rename_recover_int
19    __P((ENV *, DBT *, DB_LSN *, db_recops, void *, int));
20static int __fop_rename_42_recover_int
21    __P((ENV *, DBT *, DB_LSN *, db_recops, void *, int));
22
23/*
24 * The transactional guarantees Berkeley DB provides for file
25 * system level operations (database physical file create, delete,
26 * rename) are based on our understanding of current file system
27 * semantics; a system that does not provide these semantics and
28 * guarantees could be in danger.
29 *
30 * First, as in standard database changes, fsync and fdatasync must
31 * work: when applied to the log file, the records written into the
32 * log must be transferred to stable storage.
33 *
34 * Second, it must not be possible for the log file to be removed
35 * without previous file system level operations being flushed to
36 * stable storage.  Berkeley DB applications write log records
37 * describing file system operations into the log, then perform the
38 * file system operation, then commit the enclosing transaction
39 * (which flushes the log file to stable storage).  Subsequently,
40 * a database environment checkpoint may make it possible for the
41 * application to remove the log file containing the record of the
42 * file system operation.  DB's transactional guarantees for file
43 * system operations require the log file removal not succeed until
44 * all previous filesystem operations have been flushed to stable
45 * storage.  In other words, the flush of the log file, or the
46 * removal of the log file, must block until all previous
47 * filesystem operations have been flushed to stable storage.  This
48 * semantic is not, as far as we know, required by any existing
49 * standards document, but we have never seen a filesystem where
50 * it does not apply.
51 */
52
53/*
54 * __fop_create_recover --
55 *	Recovery function for create.
56 *
57 * PUBLIC: int __fop_create_recover
58 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
59 */
60int
61__fop_create_recover(env, dbtp, lsnp, op, info)
62	ENV *env;
63	DBT *dbtp;
64	DB_LSN *lsnp;
65	db_recops op;
66	void *info;
67{
68	__fop_create_args *argp;
69	DB_FH *fhp;
70	DBMETA *meta;
71	u_int8_t mbuf[DBMETASIZE];
72	int ret;
73	char *real_name;
74	const char *dirname;
75
76	COMPQUIET(info, NULL);
77
78	real_name = NULL;
79	REC_PRINT(__fop_create_print);
80	REC_NOOP_INTRO(__fop_create_read);
81	meta = (DBMETA *)mbuf;
82
83	if (argp->dirname.size == 0)
84		dirname = NULL;
85	else
86		dirname = (const char *)argp->dirname.data;
87
88	if ((ret = __db_appname(env, (APPNAME)argp->appname == DB_APP_DATA ?
89	    DB_APP_RECOVER : (APPNAME)argp->appname,
90	    (const char *)argp->name.data, &dirname, &real_name)) != 0)
91		goto out;
92
93	if (DB_UNDO(op)) {
94		/*
95		 * If the file was opened in mpool, we must mark it as
96		 * dead via nameop which will also unlink the file.
97		 */
98		if (__os_open(env, real_name, 0, 0, 0, &fhp) == 0) {
99			if (__fop_read_meta(env,
100			    real_name, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 &&
101			    __db_chk_meta(env, NULL, meta, 1) == 0) {
102				if ((ret = __memp_nameop(env,
103				    meta->uid, NULL, real_name, NULL, 0)) != 0)
104					goto out;
105			} else {
106				(void)__os_closehandle(env, fhp);
107				goto do_unlink;
108			}
109			(void)__os_closehandle(env, fhp);
110		} else
111do_unlink:		(void)__os_unlink(env, real_name, 0);
112	} else if (DB_REDO(op)) {
113		if ((ret = __os_open(env, real_name, 0,
114		    DB_OSO_CREATE, (int)argp->mode, &fhp)) == 0)
115			(void)__os_closehandle(env, fhp);
116		else
117			goto out;
118	}
119
120	*lsnp = argp->prev_lsn;
121
122out: if (real_name != NULL)
123		__os_free(env, real_name);
124
125	REC_NOOP_CLOSE;
126}
127
128/*
129 * __fop_create_42_recover --
130 *	Recovery function for create.
131 *
132 * PUBLIC: int __fop_create_42_recover
133 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
134 */
135int
136__fop_create_42_recover(env, dbtp, lsnp, op, info)
137	ENV *env;
138	DBT *dbtp;
139	DB_LSN *lsnp;
140	db_recops op;
141	void *info;
142{
143	__fop_create_args *argp;
144	DB_FH *fhp;
145	DBMETA *meta;
146	u_int8_t mbuf[DBMETASIZE];
147	int ret;
148	char *real_name;
149
150	COMPQUIET(info, NULL);
151
152	real_name = NULL;
153	REC_PRINT(__fop_create_print);
154	REC_NOOP_INTRO(__fop_create_read);
155	meta = (DBMETA *)mbuf;
156
157	if ((ret = __db_appname(env, (APPNAME)argp->appname,
158	    (const char *)argp->name.data, NULL, &real_name)) != 0)
159		goto out;
160
161	if (DB_UNDO(op)) {
162		/*
163		 * If the file was opened in mpool, we must mark it as
164		 * dead via nameop which will also unlink the file.
165		 */
166		if (__os_open(env, real_name, 0, 0, 0, &fhp) == 0) {
167			if (__fop_read_meta(env,
168			    real_name, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 &&
169			    __db_chk_meta(env, NULL, meta, 1) == 0) {
170				if ((ret = __memp_nameop(env,
171				    meta->uid, NULL, real_name, NULL, 0)) != 0)
172					goto out;
173			} else
174				goto do_unlink;
175			(void)__os_closehandle(env, fhp);
176		} else
177do_unlink:		(void)__os_unlink(env, real_name, 0);
178	} else if (DB_REDO(op)) {
179		if ((ret = __os_open(env, real_name, 0,
180		    DB_OSO_CREATE, (int)argp->mode, &fhp)) == 0)
181			(void)__os_closehandle(env, fhp);
182		else
183			goto out;
184	}
185
186	*lsnp = argp->prev_lsn;
187
188out: if (real_name != NULL)
189		__os_free(env, real_name);
190
191	REC_NOOP_CLOSE;
192}
193
194/*
195 * __fop_remove_recover --
196 *	Recovery function for remove.
197 *
198 * PUBLIC: int __fop_remove_recover
199 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
200 */
201int
202__fop_remove_recover(env, dbtp, lsnp, op, info)
203	ENV *env;
204	DBT *dbtp;
205	DB_LSN *lsnp;
206	db_recops op;
207	void *info;
208{
209	__fop_remove_args *argp;
210	int ret;
211	char *real_name;
212
213	COMPQUIET(info, NULL);
214
215	real_name = NULL;
216	REC_PRINT(__fop_remove_print);
217	REC_NOOP_INTRO(__fop_remove_read);
218
219	if ((ret = __db_appname(env, (APPNAME)argp->appname,
220	    (const char *)argp->name.data, NULL, &real_name)) != 0)
221		goto out;
222
223	/* Its ok if the file is not there. */
224	if (DB_REDO(op))
225		(void)__memp_nameop(env,
226		    (u_int8_t *)argp->fid.data, NULL, real_name, NULL, 0);
227
228	*lsnp = argp->prev_lsn;
229out:	if (real_name != NULL)
230		__os_free(env, real_name);
231	REC_NOOP_CLOSE;
232}
233
234/*
235 * __fop_write_recover --
236 *	Recovery function for writechunk.
237 *
238 * PUBLIC: int __fop_write_recover
239 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
240 */
241int
242__fop_write_recover(env, dbtp, lsnp, op, info)
243	ENV *env;
244	DBT *dbtp;
245	DB_LSN *lsnp;
246	db_recops op;
247	void *info;
248{
249	__fop_write_args *argp;
250	int ret;
251
252	COMPQUIET(info, NULL);
253
254	REC_PRINT(__fop_write_print);
255	REC_NOOP_INTRO(__fop_write_read);
256
257	ret = 0;
258	if (DB_UNDO(op))
259		DB_ASSERT(env, argp->flag != 0);
260	else if (DB_REDO(op))
261		ret = __fop_write(env,
262		    argp->txnp, argp->name.data,
263		    argp->dirname.size == 0 ? NULL : argp->dirname.data,
264		    (APPNAME)argp->appname == DB_APP_DATA ? DB_APP_RECOVER :
265		    (APPNAME)argp->appname,
266		    NULL, argp->pgsize, argp->pageno, argp->offset,
267		    argp->page.data, argp->page.size, argp->flag, 0);
268
269	if (ret == 0)
270		*lsnp = argp->prev_lsn;
271	REC_NOOP_CLOSE;
272}
273
274/*
275 * __fop_write_42_recover --
276 *	Recovery function for writechunk.
277 *
278 * PUBLIC: int __fop_write_42_recover
279 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
280 */
281int
282__fop_write_42_recover(env, dbtp, lsnp, op, info)
283	ENV *env;
284	DBT *dbtp;
285	DB_LSN *lsnp;
286	db_recops op;
287	void *info;
288{
289	__fop_write_args *argp;
290	int ret;
291
292	COMPQUIET(info, NULL);
293
294	REC_PRINT(__fop_write_print);
295	REC_NOOP_INTRO(__fop_write_read);
296
297	ret = 0;
298	if (DB_UNDO(op))
299		DB_ASSERT(env, argp->flag != 0);
300	else if (DB_REDO(op))
301		ret = __fop_write(env,
302		    argp->txnp, argp->name.data, NULL, (APPNAME)argp->appname,
303		    NULL, argp->pgsize, argp->pageno, argp->offset,
304		    argp->page.data, argp->page.size, argp->flag, 0);
305
306	if (ret == 0)
307		*lsnp = argp->prev_lsn;
308	REC_NOOP_CLOSE;
309}
310
311/*
312 * __fop_rename_recover --
313 *	Recovery functions for rename.  There are two variants that
314 * both use the same utility function.  Had we known about this on day
315 * one, we would have simply added a parameter.  However, since we need
316 * to retain old records for backward compatibility (online-upgrade)
317 * wrapping the two seems like the right solution.
318 *
319 * PUBLIC: int __fop_rename_recover
320 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
321 *
322 * PUBLIC: int __fop_rename_noundo_recover
323 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
324 */
325int
326__fop_rename_recover(env, dbtp, lsnp, op, info)
327	ENV *env;
328	DBT *dbtp;
329	DB_LSN *lsnp;
330	db_recops op;
331	void *info;
332{
333	return (__fop_rename_recover_int(env, dbtp, lsnp, op, info, 1));
334}
335
336int
337__fop_rename_noundo_recover(env, dbtp, lsnp, op, info)
338	ENV *env;
339	DBT *dbtp;
340	DB_LSN *lsnp;
341	db_recops op;
342	void *info;
343{
344	return (__fop_rename_recover_int(env, dbtp, lsnp, op, info, 0));
345}
346
347static int
348__fop_rename_recover_int(env, dbtp, lsnp, op, info, undo)
349	ENV *env;
350	DBT *dbtp;
351	DB_LSN *lsnp;
352	db_recops op;
353	void *info;
354	int undo;
355{
356	__fop_rename_args *argp;
357	APPNAME appname;
358	DB_FH *fhp;
359	DBMETA *meta;
360	u_int8_t *fileid, mbuf[DBMETASIZE];
361	int ret;
362	char *real_new, *real_old, *src;
363	const char *dirname;
364
365	COMPQUIET(info, NULL);
366
367	fhp = NULL;
368	meta = (DBMETA *)&mbuf[0];
369	ret = 0;
370	real_new = real_old = NULL;
371
372	REC_PRINT(__fop_rename_print);
373	REC_NOOP_INTRO(__fop_rename_read);
374	fileid = argp->fileid.data;
375
376	if (argp->dirname.size == 0)
377		dirname = NULL;
378	else
379		dirname = (const char *)argp->dirname.data;
380
381	if ((APPNAME)argp->appname == DB_APP_DATA)
382		appname = DB_APP_RECOVER;
383	else
384		appname = (APPNAME)argp->appname;
385
386	if ((ret = __db_appname(env, appname, (const char *)argp->newname.data,
387	    &dirname, &real_new)) != 0)
388		goto out;
389	if ((ret = __db_appname(env, appname, (const char *)argp->oldname.data,
390	    &dirname, &real_old)) != 0)
391		goto out;
392
393	/*
394	 * Verify that we are manipulating the correct file.  We should always
395	 * be OK on an ABORT or an APPLY, but during recovery, we have to
396	 * check.
397	 */
398	if (op != DB_TXN_ABORT && op != DB_TXN_APPLY) {
399		src = DB_UNDO(op) ? real_new : real_old;
400		/*
401		 * Interpret any error as meaning that the file either doesn't
402		 * exist, doesn't have a meta-data page, or is in some other
403		 * way, shape or form, incorrect, so that we should not restore
404		 * it.
405		 */
406		if (__os_open(env, src, 0, 0, 0, &fhp) != 0)
407			goto done;
408		if (__fop_read_meta(env,
409		    src, mbuf, DBMETASIZE, fhp, 1, NULL) != 0)
410			goto done;
411		if (__db_chk_meta(env, NULL, meta, 1) != 0)
412			goto done;
413		if (memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0)
414			goto done;
415		(void)__os_closehandle(env, fhp);
416		fhp = NULL;
417		if (DB_REDO(op)) {
418			/*
419			 * Check to see if the target file exists.  If it
420			 * does and it does not have the proper id then
421			 * it is a later version.  We just remove the source
422			 * file since the state of the world is beyond this
423			 * point.
424			 */
425			if (__os_open(env, real_new, 0, 0, 0, &fhp) == 0 &&
426			    __fop_read_meta(env, src, mbuf,
427			    DBMETASIZE, fhp, 1, NULL) == 0 &&
428			    __db_chk_meta(env, NULL, meta, 1) == 0 &&
429			    memcmp(argp->fileid.data,
430			    meta->uid, DB_FILE_ID_LEN) != 0) {
431				(void)__memp_nameop(env,
432				    fileid, NULL, real_old, NULL, 0);
433				goto done;
434			}
435		}
436	}
437
438	if (undo && DB_UNDO(op))
439		(void)__memp_nameop(env, fileid,
440		    (const char *)argp->oldname.data, real_new, real_old, 0);
441	if (DB_REDO(op))
442		(void)__memp_nameop(env, fileid,
443		    (const char *)argp->newname.data, real_old, real_new, 0);
444
445done:	*lsnp = argp->prev_lsn;
446out:	if (real_new != NULL)
447		__os_free(env, real_new);
448	if (real_old != NULL)
449		__os_free(env, real_old);
450	if (fhp != NULL)
451		(void)__os_closehandle(env, fhp);
452
453	REC_NOOP_CLOSE;
454}
455/*
456 * __fop_rename_42_recover --
457 *	Recovery functions for rename.  There are two variants that
458 * both use the same utility function.  Had we known about this on day
459 * one, we would have simply added a parameter.  However, since we need
460 * to retain old records for backward compatibility (online-upgrade)
461 * wrapping the two seems like the right solution.
462 *
463 * PUBLIC: int __fop_rename_42_recover
464 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
465 *
466 * PUBLIC: int __fop_rename_noundo_46_recover
467 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
468 */
469int
470__fop_rename_42_recover(env, dbtp, lsnp, op, info)
471	ENV *env;
472	DBT *dbtp;
473	DB_LSN *lsnp;
474	db_recops op;
475	void *info;
476{
477	return (__fop_rename_42_recover_int(env, dbtp, lsnp, op, info, 1));
478}
479
480int
481__fop_rename_noundo_46_recover(env, dbtp, lsnp, op, info)
482	ENV *env;
483	DBT *dbtp;
484	DB_LSN *lsnp;
485	db_recops op;
486	void *info;
487{
488	return (__fop_rename_42_recover_int(env, dbtp, lsnp, op, info, 0));
489}
490
491static int
492__fop_rename_42_recover_int(env, dbtp, lsnp, op, info, undo)
493	ENV *env;
494	DBT *dbtp;
495	DB_LSN *lsnp;
496	db_recops op;
497	void *info;
498	int undo;
499{
500	__fop_rename_args *argp;
501	DB_FH *fhp;
502	DBMETA *meta;
503	u_int8_t *fileid, mbuf[DBMETASIZE];
504	int ret;
505	char *real_new, *real_old, *src;
506
507	COMPQUIET(info, NULL);
508
509	fhp = NULL;
510	meta = (DBMETA *)&mbuf[0];
511	ret = 0;
512	real_new = real_old = NULL;
513
514	REC_PRINT(__fop_rename_print);
515	REC_NOOP_INTRO(__fop_rename_read);
516	fileid = argp->fileid.data;
517
518	if ((ret = __db_appname(env, (APPNAME)argp->appname,
519	    (const char *)argp->newname.data, NULL, &real_new)) != 0)
520		goto out;
521	if ((ret = __db_appname(env, (APPNAME)argp->appname,
522	    (const char *)argp->oldname.data, NULL, &real_old)) != 0)
523		goto out;
524
525	/*
526	 * Verify that we are manipulating the correct file.  We should always
527	 * be OK on an ABORT or an APPLY, but during recovery, we have to
528	 * check.
529	 */
530	if (op != DB_TXN_ABORT && op != DB_TXN_APPLY) {
531		src = DB_UNDO(op) ? real_new : real_old;
532		/*
533		 * Interpret any error as meaning that the file either doesn't
534		 * exist, doesn't have a meta-data page, or is in some other
535		 * way, shape or form, incorrect, so that we should not restore
536		 * it.
537		 */
538		if (__os_open(env, src, 0, 0, 0, &fhp) != 0)
539			goto done;
540		if (__fop_read_meta(env,
541		    src, mbuf, DBMETASIZE, fhp, 1, NULL) != 0)
542			goto done;
543		if (__db_chk_meta(env, NULL, meta, 1) != 0)
544			goto done;
545		if (memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0)
546			goto done;
547		(void)__os_closehandle(env, fhp);
548		fhp = NULL;
549		if (DB_REDO(op)) {
550			/*
551			 * Check to see if the target file exists.  If it
552			 * does and it does not have the proper id then
553			 * it is a later version.  We just remove the source
554			 * file since the state of the world is beyond this
555			 * point.
556			 */
557			if (__os_open(env, real_new, 0, 0, 0, &fhp) == 0 &&
558			    __fop_read_meta(env, src, mbuf,
559			    DBMETASIZE, fhp, 1, NULL) == 0 &&
560			    __db_chk_meta(env, NULL, meta, 1) == 0 &&
561			    memcmp(argp->fileid.data,
562			    meta->uid, DB_FILE_ID_LEN) != 0) {
563				(void)__memp_nameop(env,
564				    fileid, NULL, real_old, NULL, 0);
565				goto done;
566			}
567		}
568	}
569
570	if (undo && DB_UNDO(op))
571		(void)__memp_nameop(env, fileid,
572		    (const char *)argp->oldname.data, real_new, real_old, 0);
573	if (DB_REDO(op))
574		(void)__memp_nameop(env, fileid,
575		    (const char *)argp->newname.data, real_old, real_new, 0);
576
577done:	*lsnp = argp->prev_lsn;
578out:	if (real_new != NULL)
579		__os_free(env, real_new);
580	if (real_old != NULL)
581		__os_free(env, real_old);
582	if (fhp != NULL)
583		(void)__os_closehandle(env, fhp);
584
585	REC_NOOP_CLOSE;
586}
587
588/*
589 * __fop_file_remove_recover --
590 *	Recovery function for file_remove.  On the REDO pass, we need to
591 * make sure no one recreated the file while we weren't looking.  On an
592 * undo pass must check if the file we are interested in is the one that
593 * exists and then set the status of the child transaction depending on
594 * what we find out.
595 *
596 * PUBLIC: int __fop_file_remove_recover
597 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
598 */
599int
600__fop_file_remove_recover(env, dbtp, lsnp, op, info)
601	ENV *env;
602	DBT *dbtp;
603	DB_LSN *lsnp;
604	db_recops op;
605	void *info;
606{
607	__fop_file_remove_args *argp;
608	DBMETA *meta;
609	DB_FH *fhp;
610	size_t len;
611	u_int8_t mbuf[DBMETASIZE];
612	u_int32_t cstat, ret_stat;
613	int is_real, is_tmp, ret;
614	char *real_name;
615
616	fhp = NULL;
617	meta = (DBMETA *)&mbuf[0];
618	is_real = is_tmp = 0;
619	real_name = NULL;
620	REC_PRINT(__fop_file_remove_print);
621	REC_NOOP_INTRO(__fop_file_remove_read);
622
623	/*
624	 * This record is only interesting on the backward, forward, and
625	 * apply phases.
626	 */
627	if (op != DB_TXN_BACKWARD_ROLL &&
628	    op != DB_TXN_FORWARD_ROLL && op != DB_TXN_APPLY)
629		goto done;
630
631	if ((ret = __db_appname(env, (APPNAME)argp->appname,
632	    argp->name.data, NULL, &real_name)) != 0)
633		goto out;
634
635	/* Verify that we are manipulating the correct file.  */
636	len = 0;
637	if (__os_open(env, real_name, 0, 0, 0, &fhp) != 0 ||
638	    (ret = __fop_read_meta(env, real_name,
639	    mbuf, DBMETASIZE, fhp, 1, &len)) != 0) {
640		/*
641		 * If len is non-zero, then the file exists and has something
642		 * in it, but that something isn't a full meta-data page, so
643		 * this is very bad.  Bail out!
644		 */
645		if (len != 0)
646			goto out;
647
648		/* File does not exist. */
649		cstat = TXN_EXPECTED;
650	} else {
651		/*
652		 * We can ignore errors here since we'll simply fail the
653		 * checks below and assume this is the wrong file.
654		 */
655		(void)__db_chk_meta(env, NULL, meta, 1);
656		is_real =
657		    memcmp(argp->real_fid.data, meta->uid, DB_FILE_ID_LEN) == 0;
658		is_tmp =
659		    memcmp(argp->tmp_fid.data, meta->uid, DB_FILE_ID_LEN) == 0;
660
661		if (!is_real && !is_tmp)
662			/* File exists, but isn't what we were removing. */
663			cstat = TXN_IGNORE;
664		else
665			/* File exists and is the one that we were removing. */
666			cstat = TXN_COMMIT;
667	}
668	if (fhp != NULL) {
669		(void)__os_closehandle(env, fhp);
670		fhp = NULL;
671	}
672
673	if (DB_UNDO(op)) {
674		/* On the backward pass, we leave a note for the child txn. */
675		if ((ret = __db_txnlist_update(env,
676		    info, argp->child, cstat, NULL, &ret_stat, 1)) != 0)
677			goto out;
678	} else if (DB_REDO(op)) {
679		/*
680		 * On the forward pass, check if someone recreated the
681		 * file while we weren't looking.
682		 */
683		if (cstat == TXN_COMMIT)
684			(void)__memp_nameop(env,
685			    is_real ? argp->real_fid.data : argp->tmp_fid.data,
686			    NULL, real_name, NULL, 0);
687	}
688
689done:	*lsnp = argp->prev_lsn;
690	ret = 0;
691
692out:	if (real_name != NULL)
693		__os_free(env, real_name);
694	if (fhp != NULL)
695		(void)__os_closehandle(env, fhp);
696	REC_NOOP_CLOSE;
697}
698