1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2001, 2010 Oracle and/or its affiliates.  All rights reserved.
5 *
6 * $Id$
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/db_page.h"
13#include "dbinc/lock.h"
14#include "dbinc/mp.h"
15#include "dbinc/txn.h"
16#include "dbinc/log.h"
17#include "dbinc/db_am.h"
18
/*
 * TXN_EVENT --
 *	One deferred action queued on a transaction's event list.  The op
 * field says what kind of event this is and therefore which arm of the
 * union is meaningful.
 */
typedef struct __txn_event TXN_EVENT;
struct __txn_event {
	/* Event type: TXN_CLOSE, TXN_REMOVE, TXN_TRADE or TXN_TRADED. */
	TXN_EVENT_T op;
	/* Linkage on a DB_TXN "events" tail queue. */
	TAILQ_ENTRY(__txn_event) links;
	union {
		struct {
			/* Delayed close. */
			/* Handle to be closed (used when op is TXN_CLOSE). */
			DB *dbp;
		} c;
		struct {
			/* Delayed remove. */
			/* Heap copy of the name to remove; freed with the event. */
			char *name;
			/*
			 * Heap copy of a DB_FILE_ID_LEN byte file id, or NULL
			 * when the creator supplied none.
			 */
			u_int8_t *fileid;
			/* Caller's "inmem" flag, handed to __memp_nameop. */
			int inmem;
		} r;
		struct {
			/* Lock event. */
			/* Copy of the lock that is to be traded. */
			DB_LOCK lock;
			/* Locker that receives the lock when there is no parent. */
			DB_LOCKER *locker;
			/* Handle whose cur_txn/cur_locker track the trade. */
			DB *dbp;
		} t;
	} u;
};
42
/*
 * TXN_TOP_PARENT --
 *	Walk the parent links starting at txn until the outermost
 * transaction (the one whose parent is NULL) is reached.
 *
 * The argument is overwritten in place, so it must be a modifiable lvalue
 * of pointer type.  Every use of the argument is parenthesized so that
 * lvalue expressions such as *txnp expand correctly.
 */
#define	TXN_TOP_PARENT(txn) do {					\
	while ((txn)->parent != NULL)					\
		(txn) = (txn)->parent;					\
} while (0)
47
48/*
49 * __txn_closeevent --
50 *
51 * Creates a close event that can be added to the [so-called] commit list, so
52 * that we can redo a failed DB handle close once we've aborted the transaction.
53 *
54 * PUBLIC: int __txn_closeevent __P((ENV *, DB_TXN *, DB *));
55 */
56int
57__txn_closeevent(env, txn, dbp)
58	ENV *env;
59	DB_TXN *txn;
60	DB *dbp;
61{
62	int ret;
63	TXN_EVENT *e;
64
65	e = NULL;
66	if ((ret = __os_calloc(env, 1, sizeof(TXN_EVENT), &e)) != 0)
67		return (ret);
68
69	e->u.c.dbp = dbp;
70	e->op = TXN_CLOSE;
71	TXN_TOP_PARENT(txn);
72	TAILQ_INSERT_TAIL(&txn->events, e, links);
73
74	return (0);
75}
76
77/*
78 * __txn_remevent --
79 *
80 * Creates a remove event that can be added to the commit list.
81 *
82 * PUBLIC: int __txn_remevent __P((ENV *,
83 * PUBLIC:       DB_TXN *, const char *, u_int8_t *, int));
84 */
85int
86__txn_remevent(env, txn, name, fileid, inmem)
87	ENV *env;
88	DB_TXN *txn;
89	const char *name;
90	u_int8_t *fileid;
91	int inmem;
92{
93	int ret;
94	TXN_EVENT *e;
95
96	e = NULL;
97	if ((ret = __os_calloc(env, 1, sizeof(TXN_EVENT), &e)) != 0)
98		return (ret);
99
100	if ((ret = __os_strdup(env, name, &e->u.r.name)) != 0)
101		goto err;
102
103	if (fileid != NULL) {
104		if ((ret = __os_calloc(env,
105		    1, DB_FILE_ID_LEN, &e->u.r.fileid)) != 0)
106			return (ret);
107		memcpy(e->u.r.fileid, fileid, DB_FILE_ID_LEN);
108	}
109
110	e->u.r.inmem = inmem;
111	e->op = TXN_REMOVE;
112	TXN_TOP_PARENT(txn);
113	TAILQ_INSERT_TAIL(&txn->events, e, links);
114
115	return (0);
116
117err:	if (e != NULL)
118		__os_free(env, e);
119
120	return (ret);
121}
122
123/*
124 * __txn_remrem --
125 *	Remove a remove event because the remove has been superceeded,
126 * by a create of the same name, for example.
127 *
128 * PUBLIC: void __txn_remrem __P((ENV *, DB_TXN *, const char *));
129 */
130void
131__txn_remrem(env, txn, name)
132	ENV *env;
133	DB_TXN *txn;
134	const char *name;
135{
136	TXN_EVENT *e, *next_e;
137
138	TXN_TOP_PARENT(txn);
139	for (e = TAILQ_FIRST(&txn->events); e != NULL; e = next_e) {
140		next_e = TAILQ_NEXT(e, links);
141		if (e->op != TXN_REMOVE || strcmp(name, e->u.r.name) != 0)
142			continue;
143		TAILQ_REMOVE(&txn->events, e, links);
144		__os_free(env, e->u.r.name);
145		if (e->u.r.fileid != NULL)
146			__os_free(env, e->u.r.fileid);
147		__os_free(env, e);
148	}
149
150	return;
151}
152
153/*
154 * __txn_lockevent --
155 *
156 * Add a lockevent to the commit-queue.  The lock event indicates a locker
157 * trade.
158 *
159 * PUBLIC: int __txn_lockevent __P((ENV *,
160 * PUBLIC:     DB_TXN *, DB *, DB_LOCK *, DB_LOCKER *));
161 */
162int
163__txn_lockevent(env, txn, dbp, lock, locker)
164	ENV *env;
165	DB_TXN *txn;
166	DB *dbp;
167	DB_LOCK *lock;
168	DB_LOCKER *locker;
169{
170	int ret;
171	TXN_EVENT *e;
172
173	if (!LOCKING_ON(env))
174		return (0);
175
176	e = NULL;
177	if ((ret = __os_calloc(env, 1, sizeof(TXN_EVENT), &e)) != 0)
178		return (ret);
179
180	e->u.t.locker = locker;
181	e->u.t.lock = *lock;
182	e->u.t.dbp = dbp;
183	e->op = TXN_TRADE;
184	/* This event goes on the current transaction, not its parent. */
185	TAILQ_INSERT_TAIL(&txn->events, e, links);
186	dbp->cur_txn = txn;
187
188	return (0);
189}
190
191/*
192 * __txn_remlock --
193 *	Remove a lock event because the locker is going away.  We can remove
194 * by lock (using offset) or by locker_id (or by both).
195 *
196 * PUBLIC: void __txn_remlock __P((ENV *, DB_TXN *, DB_LOCK *, DB_LOCKER *));
197 */
198void
199__txn_remlock(env, txn, lock, locker)
200	ENV *env;
201	DB_TXN *txn;
202	DB_LOCK *lock;
203	DB_LOCKER *locker;
204{
205	TXN_EVENT *e, *next_e;
206
207	for (e = TAILQ_FIRST(&txn->events); e != NULL; e = next_e) {
208		next_e = TAILQ_NEXT(e, links);
209		if ((e->op != TXN_TRADE && e->op != TXN_TRADED) ||
210		    (e->u.t.lock.off != lock->off && e->u.t.locker != locker))
211			continue;
212		TAILQ_REMOVE(&txn->events, e, links);
213		__os_free(env, e);
214	}
215
216	return;
217}
218
/*
 * __txn_doevents --
 * Process the list of events associated with a transaction.  On commit,
 * apply the events; on abort, just toss the entries.
 *
 * When preprocess is non-zero only TXN_TRADE events whose lock is not a
 * write lock are handled: the trade is performed and, for a child
 * transaction, the event is moved to the head of the parent's list.  No
 * event is freed in that phase.
 *
 * Otherwise the list is drained.  Each event is applied (unless opcode is
 * TXN_ABORT, in which case only TXN_CLOSE events are applied) and then
 * freed; the exception is a TXN_TRADE event of a child transaction, which
 * is handed to the parent instead of being freed.
 *
 * Returns 0, or the first error reported while applying events; later
 * events are still processed after an error.
 *
 * PUBLIC: int __txn_doevents __P((ENV *, DB_TXN *, int, int));
 */

/*
 * DO_TRADE --
 *	Issue a DB_LOCK_TRADE request for the current event's lock.  The
 * lock goes to the parent's locker when txn has a parent, otherwise to the
 * locker stored in the event.
 *
 * The macro relies on these names from the enclosing function: env, txn,
 * e, req, t_ret and ret.  On success the DB handle's cur_txn/cur_locker
 * are updated; when there is no parent the event is also re-tagged
 * TXN_TRADED so the trade is not attempted a second time.  DB_NOTFOUND
 * from __lock_vec is treated as success.  Any other error is stored in
 * ret if ret does not already hold an error.
 */
#define	DO_TRADE do {							\
	memset(&req, 0, sizeof(req));					\
	req.lock = e->u.t.lock;						\
	req.op = DB_LOCK_TRADE;						\
	t_ret = __lock_vec(env, txn->parent ?				\
	    txn->parent->locker : e->u.t.locker, 0, &req, 1, NULL);	\
	if (t_ret == 0)	{						\
		if (txn->parent != NULL) {				\
			e->u.t.dbp->cur_txn = txn->parent;		\
			e->u.t.dbp->cur_locker = txn->parent->locker;	\
		} else {						\
			e->op = TXN_TRADED;				\
			e->u.t.dbp->cur_locker = e->u.t.locker;		\
			e->u.t.dbp->cur_txn = NULL;			\
		}							\
	} else if (t_ret == DB_NOTFOUND)				\
		t_ret = 0;						\
	if (t_ret != 0 && ret == 0)					\
		ret = t_ret;						\
} while (0)

int
__txn_doevents(env, txn, opcode, preprocess)
	ENV *env;
	DB_TXN *txn;
	int opcode, preprocess;
{
	DB_LOCKREQ req;
	TXN_EVENT *e, *enext;
	int ret, t_ret;

	ret = 0;

	/*
	 * This phase only gets called if we have a phase where we
	 * release read locks.  Since not all paths will call this
	 * phase, we have to check for it below as well.  So, when
	 * we do the trade, we update the opcode of the entry so that
	 * we don't try the trade again.
	 */
	if (preprocess) {
		for (e = TAILQ_FIRST(&txn->events);
		    e != NULL; e = enext) {
			/* Save the successor: e may be moved to the parent. */
			enext = TAILQ_NEXT(e, links);
			/* Only non-write-lock trades are done in this phase. */
			if (e->op != TXN_TRADE ||
			    IS_WRITELOCK(e->u.t.lock.mode))
				continue;
			DO_TRADE;
			/* A child's trade event is handed on to its parent. */
			if (txn->parent != NULL) {
				TAILQ_REMOVE(&txn->events, e, links);
				TAILQ_INSERT_HEAD(
				     &txn->parent->events, e, links);
			}
		}
		return (ret);
	}

	/*
	 * Prepare should only cause a preprocess, since the transaction
	 * isn't over.
	 */
	DB_ASSERT(env, opcode != TXN_PREPARE);
	/* Drain the list: each pass unlinks the current head. */
	while ((e = TAILQ_FIRST(&txn->events)) != NULL) {
		TAILQ_REMOVE(&txn->events, e, links);
		/*
		 * Most deferred events should only happen on
		 * commits, not aborts or prepares.  The one exception
		 * is a close which gets done on commit and abort, but
		 * not prepare. If we're not doing operations, then we
		 * can just go free resources.
		 */
		if (opcode == TXN_ABORT && e->op != TXN_CLOSE)
			goto dofree;
		switch (e->op) {
		case TXN_CLOSE:
			if ((t_ret = __db_close(e->u.c.dbp,
			    NULL, DB_NOSYNC)) != 0 && ret == 0)
				ret = t_ret;
			break;
		case TXN_REMOVE:
			/*
			 * With a file id the remove goes through
			 * __memp_nameop; without one the name is unlinked
			 * directly.
			 */
			if (e->u.r.fileid != NULL) {
				if ((t_ret = __memp_nameop(env,
				    e->u.r.fileid, NULL, e->u.r.name,
				    NULL, e->u.r.inmem)) != 0 && ret == 0)
					ret = t_ret;
			} else if ((t_ret =
			    __os_unlink(env, e->u.r.name, 0)) != 0 && ret == 0)
				ret = t_ret;
			break;
		case TXN_TRADE:
			DO_TRADE;
			/*
			 * For a child transaction the event now belongs to
			 * the parent, so skip the free below.
			 */
			if (txn->parent != NULL) {
				TAILQ_INSERT_HEAD(
				     &txn->parent->events, e, links);
				continue;
			}
			/* Fall through */
		case TXN_TRADED:
			/* Downgrade the lock. */
			if ((t_ret = __lock_downgrade(env,
			    &e->u.t.lock, DB_LOCK_READ, 0)) != 0 && ret == 0)
				ret = t_ret;
			break;
		default:
			/* This had better never happen. */
			DB_ASSERT(env, 0);
		}
dofree:
		/* Free resources here. */
		switch (e->op) {
		case TXN_REMOVE:
			if (e->u.r.fileid != NULL)
				__os_free(env, e->u.r.fileid);
			__os_free(env, e->u.r.name);
			break;
		case TXN_TRADE:
			/*
			 * An aborted, untraded lock event: the handle no
			 * longer has a current transaction.
			 */
			if (opcode == TXN_ABORT)
				e->u.t.dbp->cur_txn = NULL;
			break;
		case TXN_CLOSE:
		case TXN_TRADED:
		default:
			break;
		}
		__os_free(env, e);
	}

	return (ret);
}
355
356/*
357 * PUBLIC: int __txn_record_fname __P((ENV *, DB_TXN *, FNAME *));
358 */
359int
360__txn_record_fname(env, txn, fname)
361	ENV *env;
362	DB_TXN *txn;
363	FNAME *fname;
364{
365	DB_LOG *dblp;
366	DB_TXNMGR *mgr;
367	TXN_DETAIL *td;
368	roff_t fname_off;
369	roff_t *np, *ldbs;
370	u_int32_t i;
371	int ret;
372
373	if ((td = txn->td) == NULL)
374		return (0);
375	mgr = env->tx_handle;
376	dblp = env->lg_handle;
377	fname_off = R_OFFSET(&dblp->reginfo, fname);
378
379	/* See if we already have a ref to this DB handle. */
380	ldbs = R_ADDR(&mgr->reginfo, td->log_dbs);
381	for (i = 0, np = ldbs; i < td->nlog_dbs; i++, np++)
382		if (*np == fname_off)
383			return (0);
384
385	if (td->nlog_slots <= td->nlog_dbs) {
386		TXN_SYSTEM_LOCK(env);
387		if ((ret = __env_alloc(&mgr->reginfo,
388		    sizeof(roff_t) * (td->nlog_slots << 1), &np)) != 0) {
389			TXN_SYSTEM_UNLOCK(env);
390			return (ret);
391		}
392
393		memcpy(np, ldbs, td->nlog_dbs * sizeof(roff_t));
394		if (td->nlog_slots > TXN_NSLOTS)
395			__env_alloc_free(&mgr->reginfo, ldbs);
396
397		TXN_SYSTEM_UNLOCK(env);
398		td->log_dbs = R_OFFSET(&mgr->reginfo, np);
399		ldbs = np;
400		td->nlog_slots = td->nlog_slots << 1;
401	}
402
403	ldbs[td->nlog_dbs] = fname_off;
404	td->nlog_dbs++;
405	fname->txn_ref++;
406
407	return (0);
408}
409
410/*
411 * __txn_dref_fnam --
412 *	Either pass the fname to our parent txn or decrement the refcount
413 * and close the fileid if it goes to zero.
414 *
415 * PUBLIC: int __txn_dref_fname __P((ENV *, DB_TXN *));
416 */
417int
418__txn_dref_fname(env, txn)
419	ENV *env;
420	DB_TXN *txn;
421{
422	DB_LOG *dblp;
423	DB_TXNMGR *mgr;
424	FNAME *fname;
425	roff_t *np;
426	TXN_DETAIL *ptd, *td;
427	u_int32_t i;
428	int ret;
429
430	td = txn->td;
431
432	if (td->nlog_dbs == 0)
433		return (0);
434
435	mgr = env->tx_handle;
436	dblp = env->lg_handle;
437	ret = 0;
438
439	ptd = txn->parent != NULL ? txn->parent->td : NULL;
440
441	np = R_ADDR(&mgr->reginfo, td->log_dbs);
442	for (i = 0; i < td->nlog_dbs; i++, np++) {
443		fname = R_ADDR(&dblp->reginfo, *np);
444		MUTEX_LOCK(env, fname->mutex);
445		if (ptd != NULL) {
446			ret = __txn_record_fname(env, txn->parent, fname);
447			fname->txn_ref--;
448			MUTEX_UNLOCK(env, fname->mutex);
449		} else if (fname->txn_ref == 1) {
450			MUTEX_UNLOCK(env, fname->mutex);
451			DB_ASSERT(env, fname->txn_ref != 0);
452			ret = __dbreg_close_id_int(
453			    env, fname, DBREG_CLOSE, 0);
454		} else {
455			fname->txn_ref--;
456			MUTEX_UNLOCK(env, fname->mutex);
457		}
458		if (ret != 0)
459			break;
460	}
461
462	return (ret);
463}
464