/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2009 Oracle.  All rights reserved.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"

static int __memp_reset_lru __P((ENV *, REGINFO *));

/*
 * __memp_fput_pp --
 *	DB_MPOOLFILE->put pre/post processing.
 *
 * PUBLIC: int __memp_fput_pp
 * PUBLIC:     __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t));
 */
int
__memp_fput_pp(dbmfp, pgaddr, priority, flags)
	DB_MPOOLFILE *dbmfp;
	void *pgaddr;
	DB_CACHE_PRIORITY priority;
	u_int32_t flags;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int ret, t_ret;

	env = dbmfp->env;

	if (flags != 0)
		return (__db_ferr(env, "DB_MPOOLFILE->put", 0));

	MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->put");

	ENV_ENTER(env, ip);

	ret = __memp_fput(dbmfp, ip, pgaddr, priority);
	if (IS_ENV_REPLICATED(env) &&
	    (t_ret = __op_rep_exit(env)) != 0 && ret == 0)
		ret = t_ret;

	ENV_LEAVE(env, ip);
	return (ret);
}
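
/*
 * Illustration only, not part of the original source: a minimal sketch
 * of the caller-side get/put pairing that this function services.  The
 * guard macro and the example function below are hypothetical; the get
 * and put method pointers are the public DB_MPOOLFILE entry points
 * that resolve to __memp_fget_pp and __memp_fput_pp.
 */
#ifdef MEMP_FPUT_EXAMPLE
static int
__memp_fput_example(dbmfp, pgno)
	DB_MPOOLFILE *dbmfp;
	db_pgno_t pgno;
{
	void *pgaddr;
	int ret;

	/* Pin the page; pgaddr is valid only while the pin is held. */
	if ((ret = dbmfp->get(dbmfp, &pgno, NULL, 0, &pgaddr)) != 0)
		return (ret);

	/* ... examine the page contents ... */

	/* Unpin the page, leaving its cache priority unchanged. */
	return (dbmfp->put(dbmfp, pgaddr, DB_PRIORITY_UNCHANGED, 0));
}
#endif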

/*
 * __memp_fput --
 *	DB_MPOOLFILE->put.
 *
 * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *,
 * PUBLIC:      DB_THREAD_INFO *, void *, DB_CACHE_PRIORITY));
 */
int
__memp_fput(dbmfp, ip, pgaddr, priority)
	DB_MPOOLFILE *dbmfp;
	DB_THREAD_INFO *ip;
	void *pgaddr;
	DB_CACHE_PRIORITY priority;
{
	BH *bhp;
	DB_ENV *dbenv;
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	ENV *env;
	MPOOL *c_mp;
	MPOOLFILE *mfp;
	PIN_LIST *list, *lp;
	REGINFO *infop, *reginfo;
	roff_t b_ref;
	int region;
	int adjust, pfactor, ret, t_ret;
	char buf[DB_THREADID_STRLEN];

	env = dbmfp->env;
	dbenv = env->dbenv;
	dbmp = env->mp_handle;
	mfp = dbmfp->mfp;
	/* The buffer header (BH) immediately precedes the page in memory. */
	bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
	ret = 0;

	/*
	 * If the handle is marked as a dummy, we are unpinning a buffer
	 * on behalf of another thread.
	 */
	if (F_ISSET(dbmfp, MP_DUMMY))
		goto unpin;

	/*
	 * If we're mapping the file, there's nothing to do.  Because we can
	 * stop mapping the file at any time, we have to check on each buffer
	 * to see if the address we gave the application was part of the map
	 * region.
	 */
	if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr &&
	    (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len)
		return (0);

#ifdef DIAGNOSTIC
	/*
	 * Decrement the per-file pinned buffer count (mapped pages aren't
	 * counted).
	 */
	MPOOL_SYSTEM_LOCK(env);
	if (dbmfp->pinref == 0) {
		MPOOL_SYSTEM_UNLOCK(env);
		__db_errx(env,
		    "%s: more pages returned than retrieved", __memp_fn(dbmfp));
		return (__env_panic(env, EACCES));
	}
	--dbmfp->pinref;
	MPOOL_SYSTEM_UNLOCK(env);
#endif

unpin:
	infop = &dbmp->reginfo[bhp->region];
	c_mp = infop->primary;
	hp = R_ADDR(infop, c_mp->htab);
	hp = &hp[bhp->bucket];

	/*
	 * Check for a reference count that has already gone to zero.  This
	 * can happen if the application returns the same page twice.
	 */
	if (atomic_read(&bhp->ref) == 0) {
		__db_errx(env, "%s: page %lu: unpinned page returned",
		    __memp_fn(dbmfp), (u_long)bhp->pgno);
		DB_ASSERT(env, atomic_read(&bhp->ref) != 0);
		return (__env_panic(env, EACCES));
	}

	/* Note the activity so allocation won't decide to quit. */
	++c_mp->put_counter;

	/*
	 * Find and clear this buffer's entry in the thread's list of
	 * pinned buffers.
	 */
	if (ip != NULL) {
		reginfo = env->reginfo;
		list = R_ADDR(reginfo, ip->dbth_pinlist);
		region = (int)(infop - dbmp->reginfo);
		b_ref = R_OFFSET(infop, bhp);
		for (lp = list; lp < &list[ip->dbth_pinmax]; lp++)
			if (lp->b_ref == b_ref && lp->region == region)
				break;

		if (lp == &list[ip->dbth_pinmax]) {
			__db_errx(env,
		    "__memp_fput: pinned buffer not found for thread %s",
			    dbenv->thread_id_string(dbenv,
			    ip->dbth_pid, ip->dbth_tid, buf));
			return (__env_panic(env, EINVAL));
		}

		lp->b_ref = INVALID_ROFF;
		ip->dbth_pincount--;
	}

	/*
	 * If we hold the buffer exclusively and it is dirty, this thread
	 * wrote the page: mark the file as written.
	 */
	if (F_ISSET(bhp, BH_EXCLUSIVE) && F_ISSET(bhp, BH_DIRTY)) {
		DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0);
		mfp->file_written = 1;
	}

	/*
	 * If there is more than one reference to the page, we're done.
	 * Ignore the discard flags (for now) and leave the buffer's
	 * priority alone.  We do this a little early because the remaining
	 * reference may or may not belong to a write-behind thread.  If it
	 * does, we set the priority here; if not, it will be set again
	 * later.  We might race and miss setting the priority, which would
	 * leave it wrong for a while.
	 */
	DB_ASSERT(env, atomic_read(&bhp->ref) != 0);
	if (atomic_dec(env, &bhp->ref) > 1 || (atomic_read(&bhp->ref) == 1 &&
	    !F_ISSET(bhp, BH_DIRTY))) {
		/*
		 * __memp_pgwrite only has a shared lock while it clears
		 * the BH_DIRTY bit.  If we only have a shared latch then
		 * we can't touch the flags bits.
		 */
		if (F_ISSET(bhp, BH_EXCLUSIVE))
			F_CLR(bhp, BH_EXCLUSIVE);
		MUTEX_UNLOCK(env, bhp->mtx_buf);
		return (0);
	}

	/* The buffer should not be accessed again. */
	if (BH_REFCOUNT(bhp) == 0)
		MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, 0);

	/* Update priority values. */
	if (priority == DB_PRIORITY_VERY_LOW ||
	    mfp->priority == MPOOL_PRI_VERY_LOW)
		bhp->priority = 0;
	else {
		/*
		 * We don't lock the LRU counter or the stat.st_pages field;
		 * if we read garbage (which won't happen on a 32-bit
		 * machine), it only means a buffer gets the wrong priority.
		 */
		bhp->priority = c_mp->lru_count;

		switch (priority) {
		default:
		case DB_PRIORITY_UNCHANGED:
			pfactor = mfp->priority;
			break;
		case DB_PRIORITY_VERY_LOW:
			pfactor = MPOOL_PRI_VERY_LOW;
			break;
		case DB_PRIORITY_LOW:
			pfactor = MPOOL_PRI_LOW;
			break;
		case DB_PRIORITY_DEFAULT:
			pfactor = MPOOL_PRI_DEFAULT;
			break;
		case DB_PRIORITY_HIGH:
			pfactor = MPOOL_PRI_HIGH;
			break;
		case DB_PRIORITY_VERY_HIGH:
			pfactor = MPOOL_PRI_VERY_HIGH;
			break;
		}

		adjust = 0;
		if (pfactor != 0)
			adjust = (int)c_mp->stat.st_pages / pfactor;

		if (F_ISSET(bhp, BH_DIRTY))
			adjust += (int)c_mp->stat.st_pages / MPOOL_PRI_DIRTY;

		if (adjust > 0) {
			if (UINT32_MAX - bhp->priority >= (u_int32_t)adjust)
				bhp->priority += adjust;
		} else if (adjust < 0)
			if (bhp->priority > (u_int32_t)-adjust)
				bhp->priority += adjust;
	}

	/*
	 * __memp_pgwrite only has a shared lock while it clears the
	 * BH_DIRTY bit.  If we only have a shared latch then we can't
	 * touch the flags bits.
	 */
	if (F_ISSET(bhp, BH_EXCLUSIVE))
		F_CLR(bhp, BH_EXCLUSIVE);
	MUTEX_UNLOCK(env, bhp->mtx_buf);

	/*
	 * On every buffer put we update the buffer generation number and check
	 * for wraparound.
	 */
	if (++c_mp->lru_count == UINT32_MAX)
		if ((t_ret =
		    __memp_reset_lru(env, dbmp->reginfo)) != 0 && ret == 0)
			ret = t_ret;

	return (ret);
}
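
/*
 * Illustration only, not part of the original source: the priority
 * computation from __memp_fput, isolated under a hypothetical guard
 * macro.  A negative priority factor lowers the resulting priority
 * below the current LRU count (making the buffer a better eviction
 * candidate), a positive factor raises it, and a dirty buffer gets an
 * extra boost of st_pages / MPOOL_PRI_DIRTY.
 */
#ifdef MEMP_PRIORITY_EXAMPLE
static u_int32_t
__memp_priority_example(lru_count, st_pages, pfactor, is_dirty)
	u_int32_t lru_count, st_pages;
	int pfactor, is_dirty;
{
	u_int32_t priority;
	int adjust;

	/* Start from the cache's current LRU generation. */
	priority = lru_count;

	adjust = 0;
	if (pfactor != 0)
		adjust = (int)st_pages / pfactor;
	if (is_dirty)
		adjust += (int)st_pages / MPOOL_PRI_DIRTY;

	/* Clamp rather than letting the priority wrap around. */
	if (adjust > 0) {
		if (UINT32_MAX - priority >= (u_int32_t)adjust)
			priority += adjust;
	} else if (adjust < 0)
		if (priority > (u_int32_t)-adjust)
			priority += adjust;

	return (priority);
}
#endif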

/*
 * __memp_reset_lru --
 *	Reset the cache LRU counter.
 */
static int
__memp_reset_lru(env, infop)
	ENV *env;
	REGINFO *infop;
{
	BH *bhp, *tbhp;
	DB_MPOOL_HASH *hp;
	MPOOL *c_mp;
	u_int32_t bucket, priority;

	c_mp = infop->primary;
	/*
	 * Update the counter so all future allocations will start at the
	 * bottom.
	 */
	c_mp->lru_count -= MPOOL_BASE_DECREMENT;

	/* Adjust the priority of every buffer in this cache region. */
	for (hp = R_ADDR(infop, c_mp->htab),
	    bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
		/*
		 * Skip empty buckets.
		 *
		 * We can check for empty buckets before locking as we
		 * only care if the pointer is zero or non-zero.
		 */
		if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) {
			c_mp->lru_reset++;
			continue;
		}

		MUTEX_LOCK(env, hp->mtx_hash);
		c_mp->lru_reset++;
		/*
		 * We need to take a little care that the bucket does
		 * not become unsorted.  This is highly unlikely but
		 * possible.
		 */
		priority = 0;
		SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
			for (tbhp = bhp; tbhp != NULL;
			    tbhp = SH_CHAIN_PREV(tbhp, vc, __bh)) {
				if (tbhp->priority != UINT32_MAX &&
				    tbhp->priority > MPOOL_BASE_DECREMENT) {
					tbhp->priority -= MPOOL_BASE_DECREMENT;
					if (tbhp->priority < priority)
						tbhp->priority = priority;
				}
			}
			priority = bhp->priority;
		}
		MUTEX_UNLOCK(env, hp->mtx_hash);
	}
	c_mp->lru_reset = 0;

	COMPQUIET(env, NULL);
	return (0);
}
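
/*
 * Illustration only, not part of the original source: the reset above
 * subtracts the same constant from the counter and from every buffer
 * priority, so the relative eviction order of buffers is preserved
 * while the counter regains headroom.  The guard macro and function
 * below are hypothetical.
 */
#ifdef MEMP_RESET_LRU_EXAMPLE
static void
__memp_reset_lru_example()
{
	u_int32_t newer, older;

	older = UINT32_MAX - 10;	/* Less recently used. */
	newer = UINT32_MAX - 1;		/* More recently used. */

	older -= MPOOL_BASE_DECREMENT;
	newer -= MPOOL_BASE_DECREMENT;

	/* older < newer still holds: eviction order is unchanged. */
}
#endif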

/*
 * __memp_unpin_buffers --
 *	Unpin buffers pinned by a thread.
 *
 * PUBLIC: int __memp_unpin_buffers __P((ENV *, DB_THREAD_INFO *));
 */
int
__memp_unpin_buffers(env, ip)
	ENV *env;
	DB_THREAD_INFO *ip;
{
	BH *bhp;
	DB_MPOOL *dbmp;
	DB_MPOOLFILE dbmf;
	PIN_LIST *list, *lp;
	REGINFO *rinfop, *reginfo;
	int ret;

	memset(&dbmf, 0, sizeof(dbmf));
	dbmf.env = env;
	dbmf.flags = MP_DUMMY;
	dbmp = env->mp_handle;
	reginfo = env->reginfo;

	list = R_ADDR(reginfo, ip->dbth_pinlist);
	for (lp = list; lp < &list[ip->dbth_pinmax]; lp++) {
		if (lp->b_ref == INVALID_ROFF)
			continue;
		rinfop = &dbmp->reginfo[lp->region];
		bhp = R_ADDR(rinfop, lp->b_ref);
		dbmf.mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
		if ((ret = __memp_fput(&dbmf, ip,
		    (u_int8_t *)bhp + SSZA(BH, buf),
		    DB_PRIORITY_UNCHANGED)) != 0)
			return (ret);
	}
	return (0);
}