/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
 *
 * $Id: mp_alloc.c,v 12.43 2008/04/21 14:39:57 carol Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"

static void __memp_bad_buffer __P((DB_MPOOL_HASH *));

/*
 * __memp_alloc --
 *	Allocate some space from a cache region.
 *
 * PUBLIC: int __memp_alloc __P((DB_MPOOL *,
 * PUBLIC:     REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
 */
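/*
 * (Usage sketch, not part of the original file: callers normally pass
 * the buffer's MPOOLFILE so the allocation is sized from the underlying
 * page size, along the lines of the call in __memp_fget:
 *
 *	if ((ret = __memp_alloc(dbmp,
 *	    infop, mfp, 0, NULL, &alloc_bhp)) != 0)
 *		goto err;
 *
 * With a non-NULL mfp the len argument is recomputed below; a NULL mfp
 * with an explicit len is used for raw region allocations.)
 */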
int
__memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
	DB_MPOOL *dbmp;
	REGINFO *infop;
	MPOOLFILE *mfp;
	size_t len;
	roff_t *offsetp;
	void *retp;
{
	BH *bhp, *mvcc_bhp, *t1bhp, *t2bhp, *t3bhp;
	BH_FROZEN_PAGE *frozen_bhp;
	DB_LSN vlsn;
	DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_saved, *hp_tmp;
	ENV *env;
	MPOOL *c_mp;
	MPOOLFILE *bh_mfp;
	size_t freed_space;
	u_int32_t buckets, buffers, high_priority, priority, priority_saved;
	u_int32_t put_counter, total_buckets;
	int aggressive, alloc_freeze, giveup, got_oldest, ret;
	u_int8_t *endp;
	void *p;

	env = dbmp->env;
	c_mp = infop->primary;
	dbht = R_ADDR(infop, c_mp->htab);
	hp_end = &dbht[c_mp->htab_buckets];
	hp_saved = NULL;
	priority_saved = 0;

	buckets = buffers = put_counter = total_buckets = 0;
	aggressive = alloc_freeze = giveup = got_oldest = 0;

	STAT(c_mp->stat.st_alloc++);

	/*
	 * If we're allocating a buffer, and the one we're discarding is the
	 * same size, we don't want to waste the time to re-integrate it into
	 * the shared memory free list.  If the MPOOLFILE argument isn't
	 * NULL, we compare the underlying page sizes of the two buffers
	 * before freeing and re-allocating buffers.
	 */
	if (mfp != NULL) {
		len = SSZA(BH, buf) + mfp->stat.st_pagesize;
		/* Add space for alignment padding for MVCC diagnostics. */
		MVCC_BHSIZE(mfp, len);
	}

	MPOOL_REGION_LOCK(env, infop);

	/*
	 * Anything newer than 1/10th of the buffer pool is ignored during
	 * allocation (unless allocation starts failing).
	 */
	high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;
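	/*
	 * (Illustrative arithmetic, not in the original source: with
	 * lru_count == 5000 and st_pages == 1000, high_priority is 4900,
	 * so the newest tenth of the pool -- buffers with priority 4900
	 * or above -- is skipped until the search turns aggressive.)
	 */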

	/*
	 * First we try to allocate from free memory.  If that fails, scan the
	 * buffer pool to find buffers with low priorities.  We consider small
	 * sets of hash buckets each time to limit the amount of work needing
	 * to be done.  This approximates LRU, but not very well.  We either
	 * find a buffer of the same size to use, or we will free 3 times what
	 * we need in the hopes it will coalesce into a contiguous chunk of the
	 * right size.  In the latter case we branch back here and try again.
	 */
alloc:	if ((ret = __env_alloc(infop, len, &p)) == 0) {
		if (mfp != NULL)
			c_mp->stat.st_pages++;
		MPOOL_REGION_UNLOCK(env, infop);
		/*
		 * For MVCC diagnostics, align the pointer so that the buffer
		 * starts on a page boundary.
		 */
		MVCC_BHALIGN(mfp, p);

found:		if (offsetp != NULL)
			*offsetp = R_OFFSET(infop, p);
		*(void **)retp = p;

		/*
		 * Update the search statistics.
		 *
		 * We're not holding the region locked here, so these
		 * statistics can't be trusted.
		 */
#ifdef HAVE_STATISTICS
		total_buckets += buckets;
		if (total_buckets != 0) {
			if (total_buckets > c_mp->stat.st_alloc_max_buckets)
				c_mp->stat.st_alloc_max_buckets = total_buckets;
			c_mp->stat.st_alloc_buckets += total_buckets;
		}
		if (buffers != 0) {
			if (buffers > c_mp->stat.st_alloc_max_pages)
				c_mp->stat.st_alloc_max_pages = buffers;
			c_mp->stat.st_alloc_pages += buffers;
		}
#endif
		return (0);
	} else if (giveup || c_mp->stat.st_pages == 0) {
		MPOOL_REGION_UNLOCK(env, infop);

		__db_errx(env,
		    "unable to allocate space from the buffer cache");
		return (ret);
	}
	ret = 0;

	/*
	 * We re-attempt the allocation every time we've freed 3 times what
	 * we need.  Reset our free-space counter.
	 */
	freed_space = 0;
	total_buckets += buckets;
	buckets = 0;

	/*
	 * Walk the hash buckets and find the next two with potentially useful
	 * buffers.  Free the buffer with the lowest priority from the buckets'
	 * chains.
	 */
	for (;;) {
		/* All pages have been freed, make one last try. */
		if (c_mp->stat.st_pages == 0)
			goto alloc;

		/* Check for wrap around. */
		hp = &dbht[c_mp->last_checked++];
		if (hp >= hp_end) {
			c_mp->last_checked = 0;
			hp = &dbht[c_mp->last_checked++];
		}

		/*
		 * The failure mode is when there are too many buffers we can't
		 * write or there's not enough memory in the system to support
		 * the number of pinned buffers.
		 *
		 * Get aggressive if we've reviewed the entire cache without
		 * freeing the needed space.  (The code resets "aggressive"
		 * when we free any space.)  Aggressive means:
		 *
		 * a: set a flag to attempt to flush high priority buffers as
		 *    well as other buffers.
		 * b: sync the mpool to force out queue extent pages.  While we
		 *    might not have enough space for what we want and flushing
		 *    is expensive, why not?
		 * c: look at a buffer in every hash bucket rather than choose
		 *    the more preferable of two.
		 * d: start to think about giving up.
		 *
		 * If we get here twice, sleep for a second, hopefully someone
		 * else will run and free up some memory.
		 *
		 * Always try to allocate memory too, in case some other thread
		 * returns its memory to the region.
		 *
		 * We have no way of knowing that an allocation can never
		 * succeed.  Fail if no pages are returned to the cache after
		 * we've been trying for a relatively long time.
		 *
		 * !!!
		 * This test ignores pathological cases like no buffers in the
		 * system -- we check for that early on, so it isn't possible.
		 */
		if (buckets++ == c_mp->htab_buckets) {
			if (freed_space > 0)
				goto alloc;
			MPOOL_REGION_UNLOCK(env, infop);

			switch (++aggressive) {
			case 1:
				/* Just turn aggressive; retry the search. */
				break;
			case 2:
				/*
				 * Remember the pool's put counter so we can
				 * later tell whether any buffers came back.
				 */
				put_counter = c_mp->put_counter;
				/* FALLTHROUGH */
			case 3:
			case 4:
			case 5:
			case 6:
				/* Flush the pool, then sleep for a second. */
				(void)__memp_sync_int(
				    env, NULL, 0, DB_SYNC_ALLOC, NULL, NULL);

				__os_yield(env, 1, 0);
				break;
			default:
				/*
				 * If no buffers were returned to the cache
				 * while we were trying, give up.
				 */
				aggressive = 1;
				if (put_counter == c_mp->put_counter)
					giveup = 1;
				break;
			}

			MPOOL_REGION_LOCK(env, infop);
			goto alloc;
		}

		/*
		 * Skip empty buckets.
		 *
		 * We can check for empty buckets before locking the hash
		 * bucket as we only care if the pointer is zero or non-zero.
		 */
		if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
			continue;

		/* Unlock the region and lock the hash bucket. */
		MPOOL_REGION_UNLOCK(env, infop);
		MUTEX_LOCK(env, hp->mtx_hash);

		/*
		 * Find a buffer we can use.
		 *
		 * We don't want to free a buffer out of the middle of an MVCC
		 * chain (that requires I/O).  So, walk the buffers, looking
		 * for, in order of preference:
		 *	an obsolete buffer at the end of an MVCC chain,
		 *	the lowest-LRU singleton buffer, and
		 *	the lowest-LRU buffer of all.
		 * We use an obsolete buffer at the end of a chain as soon as
		 * we find one.  We use the lowest-LRU singleton buffer if we
		 * find one and it's better than the result of another hash
		 * bucket we've reviewed.  We use the lowest-LRU buffer we find
		 * if it's lower than another hash bucket we've reviewed and
		 * we're being aggressive.
		 *
		 * Ignore referenced buffers, we can't get rid of them.
		 */
retry_search:	bhp = mvcc_bhp = NULL;
		SH_TAILQ_FOREACH(t1bhp, &hp->hash_bucket, hq, __bh) {
			/*
			 * It's a single buffer (not an MVCC chain).
			 *
			 * If the buffer is not in use, its LRU is one we'll
			 * consider at this point in our search, and it's a
			 * better LRU than we've found so far, remember it.
			 */
			if (SH_CHAIN_SINGLETON(t1bhp, vc)) {
				if (t1bhp->ref == 0 &&
				    (aggressive ||
				    t1bhp->priority < high_priority) &&
				    (bhp == NULL ||
				    bhp->priority > t1bhp->priority))
					bhp = t1bhp;
				continue;
			}

			/*
			 * It's an MVCC chain.
			 */
			t2bhp = t1bhp;
			do {
				t3bhp = t2bhp;

				/*
				 * If the buffer is not in use, its LRU is one
				 * we'll consider at this point, and it's a
				 * better LRU than we've found so far, remember
				 * it.  The "LRU is OK" check is simpler here
				 * because we'll only consider an MVCC buffer
				 * if we're being aggressive.
				 */
				if (t2bhp->ref == 0 &&
				    aggressive &&
				    (mvcc_bhp == NULL ||
				    mvcc_bhp->priority > t2bhp->priority))
					mvcc_bhp = t2bhp;
			} while
			    ((t2bhp = SH_CHAIN_PREV(t2bhp, vc, __bh)) != NULL);

			/*
			 * t3bhp is the last buffer on the MVCC chain, and
			 * an obsolete buffer at the end of the MVCC chain
			 * gets used without further search.
			 *
			 * If the buffer isn't obsolete with respect to the
			 * cached old reader LSN, recalculate the oldest
			 * reader LSN and check again.
			 */
retry_obsolete:		if (BH_OBSOLETE(t3bhp, hp->old_reader, vlsn)) {
				bhp = t3bhp;
				goto this_buffer;
			}
			if (!got_oldest) {
				if ((ret = __txn_oldest_reader(
				    env, &hp->old_reader)) != 0)
					return (ret);
				got_oldest = 1;
				goto retry_obsolete;
			}
		}

		/*
		 * bhp is either NULL or the lowest-LRU singleton buffer.
		 * mvcc_bhp is either NULL or the lowest-LRU MVCC buffer.
		 * In both cases, we'll use the chosen buffer only if we
		 * have compared its LRU against the chosen LRU of another
		 * hash bucket.
		 */
		if (bhp == NULL) {
			if (mvcc_bhp == NULL)
				goto next_hb;
			bhp = mvcc_bhp;
		}

		/* Adjust the priority if the bucket has not been reset. */
		priority = bhp->priority;
		if (c_mp->lru_reset != 0 && c_mp->lru_reset <= hp - dbht)
			priority -= MPOOL_BASE_DECREMENT;

		/*
		 * Compare two hash buckets and select the one with the lowest
		 * priority.  Performance testing shows looking at two improves
		 * the LRU-ness and looking at more only does a little better.
		 */
		if (hp_saved == NULL) {
			hp_saved = hp;
			priority_saved = priority;
			goto next_hb;
		}

		/*
		 * If the buffer we just found is a better choice than our
		 * previous choice, use it.
		 *
		 * If the previous choice was better, pretend we're moving
		 * from this hash bucket to the previous one and re-do the
		 * search.
		 *
		 * We don't worry about simply swapping between two buckets
		 * because that could only happen if a buffer was removed
		 * from the chain, or its priority updated.  If a buffer
		 * is removed from the chain, some other thread has managed
		 * to discard a buffer, so we're moving forward.  Updating
		 * a buffer's priority will make it a high-priority buffer,
		 * so we'll ignore it when we search again, and so we will
		 * eventually zero in on a buffer to use, or we'll decide
		 * there are no buffers we can use.
		 *
		 * If there's only a single hash bucket with buffers, we'll
		 * search the bucket once, choose a buffer, walk the entire
		 * list of buckets and search it again.  In the case of a
		 * system that's busy, it's possible to imagine a case where
		 * we'd loop for a long while.  For that reason, and because
		 * the test is easy, we special case and test for it.
		 */
		if (priority > priority_saved && hp != hp_saved) {
			MUTEX_UNLOCK(env, hp->mtx_hash);
			hp_tmp = hp_saved;
			hp_saved = hp;
			hp = hp_tmp;
			priority_saved = priority;
			MUTEX_LOCK(env, hp->mtx_hash);
			goto retry_search;
		}

this_buffer:	buffers++;

		/*
		 * Discard any previously remembered hash bucket, we've got
		 * a winner.
		 */
		hp_saved = NULL;

		/* Find the associated MPOOLFILE. */
		bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);

		/* If the page is dirty, pin it and write it. */
		ret = 0;
		if (F_ISSET(bhp, BH_DIRTY)) {
			++bhp->ref;
			ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
			--bhp->ref;
#ifdef HAVE_STATISTICS
			if (ret == 0)
				++c_mp->stat.st_rw_evict;
#endif
		}
#ifdef HAVE_STATISTICS
		else
			++c_mp->stat.st_ro_evict;
#endif

		/*
		 * Freeze this buffer, if necessary.  That is, if the buffer
		 * itself or the next version created could be read by the
		 * oldest reader in the system.
		 */
		if (ret == 0 && bh_mfp->multiversion) {
			/*
			 * Before freezing, double-check that we have an
			 * up-to-date old_reader LSN.
			 */
			if (!got_oldest && !SH_CHAIN_HASPREV(bhp, vc) &&
			    !BH_OBSOLETE(bhp, hp->old_reader, vlsn)) {
				(void)__txn_oldest_reader(env,
				    &hp->old_reader);
				got_oldest = 1;
			}
			if (SH_CHAIN_HASPREV(bhp, vc) ||
			    !BH_OBSOLETE(bhp, hp->old_reader, vlsn)) {
				if (!aggressive ||
				    F_ISSET(bhp, BH_FROZEN) || bhp->ref != 0)
					goto next_hb;
				ret = __memp_bh_freeze(dbmp,
				    infop, hp, bhp, &alloc_freeze);
			}
		}

		/*
		 * If a write fails for any reason, we can't proceed.
		 *
		 * We released the hash bucket lock while doing I/O, so another
		 * thread may have acquired this buffer and incremented the ref
		 * count after we wrote it, in which case we can't have it.
		 *
		 * If there's a write error and we're having problems finding
		 * something to allocate, avoid selecting this buffer again
		 * by making it the bucket's least-desirable buffer.
		 */
		if (ret != 0 || bhp->ref != 0) {
			if (ret != 0 && aggressive)
				__memp_bad_buffer(hp);
			goto next_hb;
		}

		/*
		 * If the buffer is frozen, thaw it and look for another one
		 * we can use.
		 */
		if (F_ISSET(bhp, BH_FROZEN)) {
			++bhp->ref;
			if ((ret = __memp_bh_thaw(dbmp, infop, hp,
			    bhp, NULL)) != 0) {
				MUTEX_UNLOCK(env, hp->mtx_hash);
				return (ret);
			}
			alloc_freeze = 0;
			goto retry_search;
		}

		/*
		 * If we need some empty buffer headers for freezing, turn the
		 * buffer we've found into frozen headers and put them on the
		 * free list.  Only reset alloc_freeze if we've actually
		 * allocated some frozen buffer headers.
		 */
		if (alloc_freeze) {
			if ((ret = __memp_bhfree(dbmp, infop, hp, bhp, 0)) != 0)
				return (ret);
			MVCC_MPROTECT(bhp->buf, bh_mfp->stat.st_pagesize,
			    PROT_READ | PROT_WRITE | PROT_EXEC);

			MPOOL_REGION_LOCK(env, infop);
			SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen,
			    (BH_FROZEN_ALLOC *)bhp, links);
			frozen_bhp = (BH_FROZEN_PAGE *)
			    ((BH_FROZEN_ALLOC *)bhp + 1);
			endp = (u_int8_t *)bhp->buf + bh_mfp->stat.st_pagesize;
			/* Carve as many headers as fit out of the buffer. */
			while ((u_int8_t *)(frozen_bhp + 1) < endp) {
				SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
				    (BH *)frozen_bhp, hq);
				frozen_bhp++;
			}
			alloc_freeze = 0;
			continue;
		}

		/*
		 * Check to see if the buffer is the size we're looking for.
		 * If so, we can simply reuse it.  Otherwise, free the buffer
		 * and its space and keep looking.
		 */
		if (mfp != NULL &&
		    mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) {
			if ((ret = __memp_bhfree(dbmp, infop, hp, bhp, 0)) != 0)
				return (ret);
			p = bhp;
			goto found;
		}

		freed_space += sizeof(*bhp) + bh_mfp->stat.st_pagesize;
		if ((ret =
		    __memp_bhfree(dbmp, infop, hp, bhp, BH_FREE_FREEMEM)) != 0)
			return (ret);

		/* Reset "aggressive" if we free any space. */
		if (aggressive > 1)
			aggressive = 1;

		/*
		 * Unlock this hash bucket and re-acquire the region lock.  If
		 * we're reaching here as a result of calling __memp_bhfree,
		 * the hash bucket lock has already been discarded.
		 */
		if (0) {
next_hb:		MUTEX_UNLOCK(env, hp->mtx_hash);
		}
		MPOOL_REGION_LOCK(env, infop);

		/*
		 * Retry the allocation as soon as we've freed up sufficient
		 * space.  We're likely to have to coalesce chunks of memory
		 * to satisfy the request; don't try until it's likely
		 * (possible?) that we'll succeed.
		 */
		if (freed_space >= 3 * len)
			goto alloc;
	}
	/* NOTREACHED */
}

/*
 * __memp_free --
 *	Free some space from a cache region.
 *
 * PUBLIC: void __memp_free __P((REGINFO *, MPOOLFILE *, void *));
 */
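/*
 * (Usage sketch, not in the original source: this is the inverse of
 * __memp_alloc for memory that is being discarded, e.g.
 *
 *	__memp_free(infop, mfp, bhp);
 *
 * where bhp was returned by __memp_alloc for the MPOOLFILE mfp.)
 */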
void
__memp_free(infop, mfp, buf)
	REGINFO *infop;
	MPOOLFILE *mfp;
	void *buf;
{
	/* Undo any MVCC alignment before returning the memory. */
	MVCC_BHUNALIGN(mfp, buf);
	COMPQUIET(mfp, NULL);
	__env_alloc_free(infop, buf);
}

/*
 * __memp_bad_buffer --
 *	Make the first buffer in a hash bucket the least desirable buffer.
 */
static void
__memp_bad_buffer(hp)
	DB_MPOOL_HASH *hp;
{
	BH *bhp, *last_bhp;
	u_int32_t priority;

	/*
	 * Get the first buffer from the bucket.  If it is also the last buffer
	 * (in other words, it is the only buffer in the bucket), we're done.
	 */
	bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
	last_bhp = SH_TAILQ_LASTP(&hp->hash_bucket, hq, __bh);
	if (bhp == last_bhp)
		return;

	/* There are multiple buffers in the bucket, remove the first one. */
	SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);

	/*
	 * Find the highest priority buffer in the bucket.  Buffers are
	 * sorted by priority, so it's the last one in the bucket.
	 */
	priority = BH_PRIORITY(last_bhp);

	/*
	 * Append our buffer to the bucket and set its priority (and the
	 * priority of every version on its MVCC chain) to be just as bad.
	 */
	SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
	for (; bhp != NULL; bhp = SH_CHAIN_PREV(bhp, vc, __bh))
		bhp->priority = priority;
}