/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
 *
 * $Id: mp_fput.c,v 12.46 2008/04/28 02:59:57 alexg Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"

static int __memp_reset_lru __P((ENV *, REGINFO *));

/*
 * __memp_fput_pp --
 *	DB_MPOOLFILE->put pre/post processing.
 *
 * PUBLIC: int __memp_fput_pp
 * PUBLIC:     __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t));
 */
int
__memp_fput_pp(dbmfp, pgaddr, priority, flags)
	DB_MPOOLFILE *dbmfp;
	void *pgaddr;
	DB_CACHE_PRIORITY priority;
	u_int32_t flags;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int ret, t_ret;

	env = dbmfp->env;

	if (flags != 0)
		return (__db_ferr(env, "DB_MPOOLFILE->put", 0));

	MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->put");

	ENV_ENTER(env, ip);

	ret = __memp_fput(dbmfp, ip, pgaddr, priority);
	if (IS_ENV_REPLICATED(env) &&
	    (t_ret = __op_rep_exit(env)) != 0 && ret == 0)
		ret = t_ret;

	ENV_LEAVE(env, ip);
	return (ret);
}
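
/*
 * Illustrative only: a minimal sketch of the pin/unpin cycle as seen from
 * application code, assuming an already-opened DB_MPOOLFILE handle "mpf"
 * and a valid page number "pgno":
 *
 *	void *addr;
 *	if ((ret = mpf->get(mpf, &pgno, NULL, 0, &addr)) == 0) {
 *		... use the page through addr ...
 *		ret = mpf->put(mpf, addr, DB_PRIORITY_UNCHANGED, 0);
 *	}
 *
 * Every successful get must be matched by exactly one put; the reference
 * counting below panics the environment if a page is returned twice.
 */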

/*
 * __memp_fput --
 *	DB_MPOOLFILE->put.
 *
 * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *,
 * PUBLIC:      DB_THREAD_INFO *, void *, DB_CACHE_PRIORITY));
 */
int
__memp_fput(dbmfp, ip, pgaddr, priority)
	DB_MPOOLFILE *dbmfp;
	DB_THREAD_INFO *ip;
	void *pgaddr;
	DB_CACHE_PRIORITY priority;
{
	BH *bhp;
	DB_ENV *dbenv;
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	ENV *env;
	MPOOL *c_mp;
	MPOOLFILE *mfp;
	PIN_LIST *list, *lp;
	REGINFO *infop, *reginfo;
	roff_t b_ref;
	int region;
	int adjust, pfactor, ret, t_ret;
	char buf[DB_THREADID_STRLEN];

	env = dbmfp->env;
	dbenv = env->dbenv;
	dbmp = env->mp_handle;
	mfp = dbmfp->mfp;
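
	/*
	 * The address the application holds is the buf[] data area at the
	 * end of the buffer header; back up over the header to recover the
	 * BH itself (SSZA(BH, buf) is the offset of buf within BH).
	 */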
	bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
	ret = 0;

	/*
	 * If this is marked dummy, we are using it to unpin a buffer for
	 * another thread.
	 */
	if (F_ISSET(dbmfp, MP_DUMMY))
		goto unpin;

	/*
	 * If we're mapping the file, there's nothing to do.  Because we can
	 * stop mapping the file at any time, we have to check on each buffer
	 * to see if the address we gave the application was part of the map
	 * region.
	 */
	if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr &&
	    (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len)
		return (0);

#ifdef DIAGNOSTIC
	/*
	 * Decrement the per-file pinned buffer count (mapped pages aren't
	 * counted).
	 */
	MPOOL_SYSTEM_LOCK(env);
	if (dbmfp->pinref == 0) {
		MPOOL_SYSTEM_UNLOCK(env);
		__db_errx(env,
		    "%s: more pages returned than retrieved", __memp_fn(dbmfp));
		return (__env_panic(env, EACCES));
	}
	--dbmfp->pinref;
	MPOOL_SYSTEM_UNLOCK(env);
#endif

unpin:
	/* Convert a page address to a buffer header and hash bucket. */
	MP_GET_BUCKET(env, mfp, bhp->pgno, &infop, hp, ret);
	if (ret != 0)
		return (ret);
	c_mp = infop->primary;

	/*
	 * Check for a reference count going to zero.  This can happen if the
	 * application returns a page twice.
	 */
	if (bhp->ref == 0) {
		__db_errx(env, "%s: page %lu: unpinned page returned",
		    __memp_fn(dbmfp), (u_long)bhp->pgno);
		DB_ASSERT(env, bhp->ref != 0);
		MUTEX_UNLOCK(env, hp->mtx_hash);
		return (__env_panic(env, EACCES));
	}

	/* Note the activity so allocation won't decide to quit. */
	++c_mp->put_counter;

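	/*
	 * Clear this buffer's entry in the thread's pin list.  The same
	 * list is walked by __memp_unpin_buffers, below, to release pins
	 * on behalf of a thread that can no longer do it itself.
	 */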
	if (ip != NULL) {
		reginfo = env->reginfo;
		list = R_ADDR(reginfo, ip->dbth_pinlist);
		region = (int)(infop - dbmp->reginfo);
		b_ref = R_OFFSET(infop, bhp);
		for (lp = list; lp < &list[ip->dbth_pinmax]; lp++)
			if (lp->b_ref == b_ref && lp->region == region)
				break;

		if (lp == &list[ip->dbth_pinmax]) {
			__db_errx(env,
		    "__memp_fput: pinned buffer not found for thread %s",
			    dbenv->thread_id_string(dbenv,
			    ip->dbth_pid, ip->dbth_tid, buf));
			return (__env_panic(env, EINVAL));
		}

		lp->b_ref = INVALID_ROFF;
		ip->dbth_pincount--;
	}

	/*
	 * Mark the file written if the buffer came back dirty.  Only the
	 * newest version in a buffer's MVCC chain may be dirty, which the
	 * assertion verifies.
	 */
	if (F_ISSET(bhp, BH_DIRTY)) {
		mfp->file_written = 1;

		DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
	}

	/*
	 * If there is more than one reference to the page, or the single
	 * remaining reference belongs to something other than a thread
	 * waiting to flush the buffer to disk, we're done.  Ignore the
	 * discard flags (for now) and leave the buffer's priority alone.
	 */
	if (--bhp->ref > 1 || (bhp->ref == 1 && !F_ISSET(bhp, BH_LOCKED))) {
		MUTEX_UNLOCK(env, hp->mtx_hash);
		return (0);
	}

	/* The buffer should not be accessed again. */
	MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, 0);

	/* Update priority values. */
	if (priority == DB_PRIORITY_VERY_LOW ||
	    mfp->priority == MPOOL_PRI_VERY_LOW)
		bhp->priority = 0;
	else {
		/*
		 * We don't lock the LRU counter or the stat.st_pages field;
		 * if we get garbage (which won't happen on a 32-bit machine),
		 * it only means a buffer has the wrong priority.
		 */
		bhp->priority = c_mp->lru_count;

		switch (priority) {
		default:
		case DB_PRIORITY_UNCHANGED:
			pfactor = mfp->priority;
			break;
		case DB_PRIORITY_VERY_LOW:
			pfactor = MPOOL_PRI_VERY_LOW;
			break;
		case DB_PRIORITY_LOW:
			pfactor = MPOOL_PRI_LOW;
			break;
		case DB_PRIORITY_DEFAULT:
			pfactor = MPOOL_PRI_DEFAULT;
			break;
		case DB_PRIORITY_HIGH:
			pfactor = MPOOL_PRI_HIGH;
			break;
		case DB_PRIORITY_VERY_HIGH:
			pfactor = MPOOL_PRI_VERY_HIGH;
			break;
		}

		adjust = 0;
		if (pfactor != 0)
			adjust = (int)c_mp->stat.st_pages / pfactor;

		if (F_ISSET(bhp, BH_DIRTY))
			adjust += (int)c_mp->stat.st_pages / MPOOL_PRI_DIRTY;

		if (adjust > 0) {
			if (UINT32_MAX - bhp->priority >= (u_int32_t)adjust)
				bhp->priority += adjust;
		} else if (adjust < 0)
			if (bhp->priority > (u_int32_t)-adjust)
				bhp->priority += adjust;
	}
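
	/*
	 * A worked example, assuming the priority factors from dbinc/mp.h
	 * (e.g., MPOOL_PRI_LOW == -2, MPOOL_PRI_VERY_HIGH == 1 and
	 * MPOOL_PRI_DIRTY == 10) and a cache of 1000 pages: a clean
	 * DB_PRIORITY_LOW buffer gets adjust = 1000 / -2 = -500, aging it
	 * by half the cache, while a dirty DB_PRIORITY_VERY_HIGH buffer
	 * gets adjust = 1000 / 1 + 1000 / 10 = 1100, pushing it well past
	 * the current LRU count so it is evicted last.
	 */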

	/*
	 * The sync code has a separate counter for buffers on which it waits.
	 * It reads that value without holding a lock so we update it as the
	 * last thing we do.  Once that value goes to 0, we won't see another
	 * reference to that buffer being returned to the cache until the sync
	 * code has finished, so we're safe as long as we don't let the value
	 * go to 0 before we finish with the buffer.
	 */
	if (F_ISSET(bhp, BH_LOCKED) && bhp->ref_sync != 0)
		--bhp->ref_sync;

	MUTEX_UNLOCK(env, hp->mtx_hash);

	/*
	 * On every buffer put we update the buffer generation number and check
	 * for wraparound.
	 */
	if (++c_mp->lru_count == UINT32_MAX)
		if ((t_ret =
		    __memp_reset_lru(env, dbmp->reginfo)) != 0 && ret == 0)
			ret = t_ret;

	return (ret);
}

/*
 * __memp_reset_lru --
 *	Reset the cache LRU counter.
 */
static int
__memp_reset_lru(env, infop)
	ENV *env;
	REGINFO *infop;
{
	BH *bhp, *tbhp;
	DB_MPOOL_HASH *hp;
	MPOOL *c_mp;
	u_int32_t bucket, priority;

	c_mp = infop->primary;
	/*
	 * Update the counter so all future allocations will start at the
	 * bottom.
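	 *
	 * Subtracting MPOOL_BASE_DECREMENT (defined in dbinc/mp.h as roughly
	 * three-quarters of the 32-bit range) preserves the relative order
	 * of every priority it adjusts.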
	 */
	c_mp->lru_count -= MPOOL_BASE_DECREMENT;

	/* Adjust the priority of every buffer in the system. */
	for (hp = R_ADDR(infop, c_mp->htab),
	    bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
		/*
		 * Skip empty buckets.
		 *
		 * We can check for empty buckets before locking as we
		 * only care if the pointer is zero or non-zero.
		 */
		if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) {
			c_mp->lru_reset++;
			continue;
		}

		MUTEX_LOCK(env, hp->mtx_hash);
		c_mp->lru_reset++;
		/*
		 * We need to take a little care that the bucket does
		 * not become unsorted.  This is highly unlikely but
		 * possible.
		 */
		priority = 0;
		SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
			for (tbhp = bhp; tbhp != NULL;
			    tbhp = SH_CHAIN_PREV(tbhp, vc, __bh)) {
				if (tbhp->priority != UINT32_MAX &&
				    tbhp->priority > MPOOL_BASE_DECREMENT) {
					tbhp->priority -= MPOOL_BASE_DECREMENT;
					if (tbhp->priority < priority)
						tbhp->priority = priority;
				}
			}
			priority = bhp->priority;
		}
		MUTEX_UNLOCK(env, hp->mtx_hash);
	}
	c_mp->lru_reset = 0;

	COMPQUIET(env, NULL);
	return (0);
}

/*
 * __memp_unpin_buffers --
 *	Unpin buffers pinned by a thread.
 *
 * PUBLIC: int __memp_unpin_buffers __P((ENV *, DB_THREAD_INFO *));
 */
int
__memp_unpin_buffers(env, ip)
	ENV *env;
	DB_THREAD_INFO *ip;
{
	BH *bhp;
	DB_MPOOL *dbmp;
	DB_MPOOLFILE dbmf;
	PIN_LIST *list, *lp;
	REGINFO *rinfop, *reginfo;
	int ret;

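	/*
	 * Build a dummy DB_MPOOLFILE handle: the MP_DUMMY flag tells
	 * __memp_fput to skip the per-handle checks and simply release
	 * the pin on behalf of the thread that acquired it.
	 */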
	memset(&dbmf, 0, sizeof(dbmf));
	dbmf.env = env;
	dbmf.flags = MP_DUMMY;
	dbmp = env->mp_handle;
	reginfo = env->reginfo;

	list = R_ADDR(reginfo, ip->dbth_pinlist);
	for (lp = list; lp < &list[ip->dbth_pinmax]; lp++) {
		if (lp->b_ref == INVALID_ROFF)
			continue;
		rinfop = &dbmp->reginfo[lp->region];
		bhp = R_ADDR(rinfop, lp->b_ref);
		dbmf.mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
		if ((ret = __memp_fput(&dbmf, ip,
		    (u_int8_t *)bhp + SSZA(BH, buf),
		    DB_PRIORITY_UNCHANGED)) != 0)
			return (ret);
	}
	return (0);
}