/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2006,2008 Oracle.  All rights reserved.
 *
 * $Id: mp_resize.c,v 12.14 2008/03/13 15:21:21 mbrey Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"

static int __memp_add_bucket __P((DB_MPOOL *));
static int __memp_add_region __P((DB_MPOOL *));
static int __memp_map_regions __P((DB_MPOOL *));
static int __memp_merge_buckets
    __P((DB_MPOOL *, u_int32_t, u_int32_t, u_int32_t));
static int __memp_remove_bucket __P((DB_MPOOL *));
static int __memp_remove_region __P((DB_MPOOL *));

/*
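 * __memp_get_bucket --
 *	Lock and return the hash bucket containing the given page, first
 *	remapping cache regions if another process has resized the pool.
 *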
 * PUBLIC: int __memp_get_bucket __P((ENV *,
 * PUBLIC:     MPOOLFILE *, db_pgno_t, REGINFO **, DB_MPOOL_HASH **));
 */
int
__memp_get_bucket(env, mfp, pgno, infopp, hpp)
	ENV *env;
	MPOOLFILE *mfp;
	db_pgno_t pgno;
	REGINFO **infopp;
	DB_MPOOL_HASH **hpp;
{
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *c_mp, *mp;
	REGINFO *infop;
	roff_t mf_offset;
	u_int32_t bucket, nbuckets, new_bucket, new_nbuckets, region;
	u_int32_t *regids;
	int ret;

	dbmp = env->mp_handle;
	mf_offset = R_OFFSET(dbmp->reginfo, mfp);
	mp = dbmp->reginfo[0].primary;
	ret = 0;

	for (;;) {
		nbuckets = mp->nbuckets;
		MP_BUCKET(mf_offset, pgno, nbuckets, bucket);

		/*
		 * Once we know which region the bucket belongs to, check
		 * that we have that region mapped and that our mapping's
		 * version matches the ID in the main mpool region.  If
		 * not, map in any regions that don't match and retry.
		 */
		region = NREGION(mp, bucket);
		regids = R_ADDR(dbmp->reginfo, mp->regids);

		for (;;) {
			infop = *infopp = &dbmp->reginfo[region];
			c_mp = infop->primary;

			/* If we have the correct region mapped, we're done. */
			if (c_mp != NULL && regids[region] == infop->id)
				break;
			if ((ret = __memp_map_regions(dbmp)) != 0)
				return (ret);
		}

		/* If our caller wants the hash bucket, lock it here. */
		if (hpp != NULL) {
			hp = R_ADDR(infop, c_mp->htab);
			hp = &hp[bucket - region * mp->htab_buckets];

			MUTEX_LOCK(env, hp->mtx_hash);

			/*
			 * Check that we still have the correct region mapped.
			 */
			if (regids[region] != infop->id) {
				MUTEX_UNLOCK(env, hp->mtx_hash);
				continue;
			}

			/*
			 * Now that the bucket is locked, we need to check that
			 * the cache has not been resized while we waited.
			 */
			new_nbuckets = mp->nbuckets;
			if (nbuckets != new_nbuckets) {
				MP_BUCKET(mf_offset, pgno, new_nbuckets,
				    new_bucket);

				if (new_bucket != bucket) {
					MUTEX_UNLOCK(env, hp->mtx_hash);
					continue;
				}
			}

			*hpp = hp;
		}

		break;
	}

	return (ret);
}

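/*
 * __memp_merge_buckets --
 *	Copy the buffers that hash to the new bucket from an old hash
 *	bucket into a new one, as part of growing or shrinking the hash
 *	table.  The two buckets may live in different cache regions, so
 *	buffers are duplicated rather than relinked.
 */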
static int
__memp_merge_buckets(dbmp, new_nbuckets, old_bucket, new_bucket)
	DB_MPOOL *dbmp;
	u_int32_t new_nbuckets, old_bucket, new_bucket;
{
	BH *alloc_bhp, *bhp, *current_bhp, *new_bhp, *next_bhp;
	DB_LSN vlsn;
	DB_MPOOL_HASH *new_hp, *old_hp;
	ENV *env;
	MPOOL *mp, *new_mp, *old_mp;
	MPOOLFILE *mfp;
	REGINFO *new_infop, *old_infop;
	u_int32_t bucket, high_mask, new_region, old_region;
	int ret;

	env = dbmp->env;
	mp = dbmp->reginfo[0].primary;
	new_bhp = NULL;
	ret = 0;

	MP_MASK(new_nbuckets, high_mask);

	old_region = NREGION(mp, old_bucket);
	old_infop = &dbmp->reginfo[old_region];
	old_mp = old_infop->primary;
	old_hp = R_ADDR(old_infop, old_mp->htab);
	old_hp = &old_hp[old_bucket - old_region * mp->htab_buckets];

	new_region = NREGION(mp, new_bucket);
	new_infop = &dbmp->reginfo[new_region];
	new_mp = new_infop->primary;
	new_hp = R_ADDR(new_infop, new_mp->htab);
	new_hp = &new_hp[new_bucket - new_region * mp->htab_buckets];

	/*
	 * Before merging, we need to check that there are no old buffers left
	 * in the target hash bucket after a previous split.
	 */
free_old:
	MUTEX_LOCK(env, new_hp->mtx_hash);
	SH_TAILQ_FOREACH(bhp, &new_hp->hash_bucket, hq, __bh) {
		MP_BUCKET(bhp->mf_offset, bhp->pgno, mp->nbuckets, bucket);

		if (bucket != new_bucket) {
			/*
			 * There is no way that an old buffer can be locked
			 * after a split, since everyone will look for it in
			 * the new hash bucket.
			 */
			DB_ASSERT(env, !F_ISSET(bhp, BH_LOCKED | BH_DIRTY) &&
			    bhp->ref == 0);
			if ((ret = __memp_bhfree(dbmp,
			    new_infop, new_hp, bhp, BH_FREE_FREEMEM)) != 0) {
				MUTEX_UNLOCK(env, new_hp->mtx_hash);
				return (ret);
			}

			/*
			 * The free has modified the list of buffers and
			 * dropped the mutex.  We need to start again.
			 */
			goto free_old;
		}
	}
	MUTEX_UNLOCK(env, new_hp->mtx_hash);

	/*
	 * Before we begin, make sure that all of the buffers we care about are
	 * not in use and not frozen.  We do this because we can't drop the old
	 * hash bucket mutex once we start moving buffers around.
	 */
retry:	MUTEX_LOCK(env, old_hp->mtx_hash);
	SH_TAILQ_FOREACH(bhp, &old_hp->hash_bucket, hq, __bh) {
		MP_HASH_BUCKET(MP_HASH(bhp->mf_offset, bhp->pgno),
		    new_nbuckets, high_mask, bucket);

		if (bucket == new_bucket &&
		    (F_ISSET(bhp, BH_LOCKED) || bhp->ref != 0)) {
			MUTEX_UNLOCK(env, old_hp->mtx_hash);
			__os_yield(env, 0, 0);
			goto retry;
		} else if (bucket == new_bucket && F_ISSET(bhp, BH_FROZEN)) {
			++bhp->ref;
			if (BH_OBSOLETE(bhp, old_hp->old_reader, vlsn))
				alloc_bhp = NULL;
			else {
				mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
				MUTEX_UNLOCK(env, old_hp->mtx_hash);
				if ((ret = __memp_alloc(dbmp,
				    old_infop, mfp, 0, NULL, &alloc_bhp)) != 0)
					return (ret);
				MUTEX_LOCK(env, old_hp->mtx_hash);
			}
			if ((ret = __memp_bh_thaw(dbmp,
			    old_infop, old_hp, bhp, alloc_bhp)) != 0) {
				MUTEX_UNLOCK(env, old_hp->mtx_hash);
				return (ret);
			}

			/*
			 * We've dropped the mutex in order to thaw, so we need
			 * to go back to the beginning and check that all of
			 * the buffers we care about are still unlocked and
			 * unreferenced.
			 */
			MUTEX_UNLOCK(env, old_hp->mtx_hash);
			goto retry;
		}
	}

	/*
	 * We now know that all of the buffers we care about are unlocked and
	 * unreferenced.  Go ahead and copy them.
	 */
	SH_TAILQ_FOREACH(bhp, &old_hp->hash_bucket, hq, __bh) {
		MP_HASH_BUCKET(MP_HASH(bhp->mf_offset, bhp->pgno),
		    new_nbuckets, high_mask, bucket);
		mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);

		/*
		 * We ignore buffers that don't hash to the new bucket.  We
		 * also skip clean buffers that are not part of a multiversion
		 * chain as long as they have a backing file, since those
		 * pages can simply be re-read from disk on demand.
		 */
		if (bucket != new_bucket || (!F_ISSET(bhp, BH_DIRTY) &&
		    SH_CHAIN_SINGLETON(bhp, vc) && !mfp->no_backing_file))
			continue;

		for (current_bhp = bhp, next_bhp = NULL;
		    current_bhp != NULL;
		    current_bhp = SH_CHAIN_PREV(current_bhp, vc, __bh),
		    next_bhp = alloc_bhp) {
			if ((ret = __memp_alloc(dbmp,
			    new_infop, mfp, 0, NULL, &alloc_bhp)) != 0)
				break;

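			/* Duplicate the buffer header of this version. */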
			alloc_bhp->ref = current_bhp->ref;
			alloc_bhp->ref_sync = current_bhp->ref_sync;
			alloc_bhp->priority = current_bhp->priority;
			alloc_bhp->pgno = current_bhp->pgno;
			alloc_bhp->mf_offset = current_bhp->mf_offset;
			alloc_bhp->flags = current_bhp->flags;
			alloc_bhp->td_off = current_bhp->td_off;

			/*
			 * We've duplicated the buffer, so now we need to
			 * update reference counts, including the counts in the
			 * per-MPOOLFILE and the transaction detail (for MVCC
			 * buffers).
			 */
			MUTEX_LOCK(env, mfp->mutex);
			++mfp->block_cnt;
			MUTEX_UNLOCK(env, mfp->mutex);

			if (alloc_bhp->td_off != INVALID_ROFF &&
			    (ret = __txn_add_buffer(env,
			    R_ADDR(&env->tx_handle->reginfo,
			    alloc_bhp->td_off))) != 0)
				break;

			memcpy(alloc_bhp->buf,
			    current_bhp->buf, mfp->stat.st_pagesize);

			/*
			 * We build up the MVCC chain first, then insert the
			 * head (stored in new_bhp) once.
			 */
			if (next_bhp == NULL) {
				SH_CHAIN_INIT(alloc_bhp, vc);
				new_bhp = alloc_bhp;
			} else
				SH_CHAIN_INSERT_BEFORE(
				    next_bhp, alloc_bhp, vc, __bh);
		}

		MUTEX_LOCK(env, new_hp->mtx_hash);
		SH_TAILQ_INSERT_TAIL(&new_hp->hash_bucket, new_bhp, hq);
		if (F_ISSET(new_bhp, BH_DIRTY))
			++new_hp->hash_page_dirty;

		MUTEX_UNLOCK(env, new_hp->mtx_hash);

		if (F_ISSET(bhp, BH_DIRTY)) {
			F_CLR(bhp, BH_DIRTY);
			--old_hp->hash_page_dirty;
		}
	}

	if (ret == 0)
		mp->nbuckets = new_nbuckets;
	MUTEX_UNLOCK(env, old_hp->mtx_hash);

	return (ret);
}

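/*
 * __memp_add_bucket --
 *	Grow the hash table by one bucket, copying buffers from the
 *	bucket being split into the new last bucket.
 */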
static int
__memp_add_bucket(dbmp)
	DB_MPOOL *dbmp;
{
	ENV *env;
	MPOOL *mp;
	u_int32_t high_mask, new_bucket, old_bucket;

	env = dbmp->env;
	mp = dbmp->reginfo[0].primary;

	new_bucket = mp->nbuckets;
	/* We should always be adding buckets to the last region. */
	DB_ASSERT(env, NREGION(mp, new_bucket) == mp->nreg - 1);
	MP_MASK(mp->nbuckets, high_mask);
	old_bucket = new_bucket & (high_mask >> 1);

	/*
	 * With fixed-size regions, the new region is always smaller than the
	 * existing total cache size, so buffers always need to be copied.  If
	 * we implement variable region sizes, it's possible that we will be
	 * splitting a hash bucket in the new region.  Catch that here.
	 */
	DB_ASSERT(env, NREGION(mp, old_bucket) != NREGION(mp, new_bucket));

	return (__memp_merge_buckets(dbmp, mp->nbuckets + 1,
	    old_bucket, new_bucket));
}

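/*
 * __memp_add_region --
 *	Attach and initialize a new cache region, then split buckets
 *	into it until it holds its share of the hash table.
 */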
static int
__memp_add_region(dbmp)
	DB_MPOOL *dbmp;
{
	ENV *env;
	MPOOL *mp;
	REGINFO *infop;
	int ret;
	roff_t reg_size;
	u_int i;
	u_int32_t *regids;

	env = dbmp->env;
	mp = dbmp->reginfo[0].primary;
	/* All cache regions are the same size. */
	reg_size = dbmp->reginfo[0].rp->size;
	ret = 0;

	infop = &dbmp->reginfo[mp->nreg];
	infop->env = env;
	infop->type = REGION_TYPE_MPOOL;
	infop->id = INVALID_REGION_ID;
	infop->flags = REGION_CREATE_OK;
	if ((ret = __env_region_attach(env, infop, reg_size)) != 0)
		return (ret);
	if ((ret = __memp_init(env,
	    dbmp, mp->nreg, mp->htab_buckets, mp->max_nreg)) != 0)
		return (ret);
	regids = R_ADDR(dbmp->reginfo, mp->regids);
	regids[mp->nreg++] = infop->id;

	for (i = 0; i < mp->htab_buckets; i++)
		if ((ret = __memp_add_bucket(dbmp)) != 0)
			break;

	return (ret);
}

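/*
 * __memp_remove_bucket --
 *	Shrink the hash table by one bucket, merging the last bucket's
 *	buffers back into the bucket it was split from.
 */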
static int
__memp_remove_bucket(dbmp)
	DB_MPOOL *dbmp;
{
	ENV *env;
	MPOOL *mp;
	u_int32_t high_mask, new_bucket, old_bucket;

	env = dbmp->env;
	mp = dbmp->reginfo[0].primary;

	old_bucket = mp->nbuckets - 1;

	/* We should always be removing buckets from the last region. */
	DB_ASSERT(env, NREGION(mp, old_bucket) == mp->nreg - 1);
	MP_MASK(mp->nbuckets - 1, high_mask);
	new_bucket = old_bucket & (high_mask >> 1);

	return (__memp_merge_buckets(dbmp, mp->nbuckets - 1,
	    old_bucket, new_bucket));
}

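/*
 * __memp_remove_region --
 *	Drain the last cache region bucket by bucket, then detach from
 *	the region and destroy it.
 */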
static int
__memp_remove_region(dbmp)
	DB_MPOOL *dbmp;
{
	ENV *env;
	MPOOL *mp;
	REGINFO *infop;
	int ret;
	u_int i;

	env = dbmp->env;
	mp = dbmp->reginfo[0].primary;
	ret = 0;

	if (mp->nreg == 1) {
		__db_errx(env, "cannot remove the last cache");
		return (EINVAL);
	}

	for (i = 0; i < mp->htab_buckets; i++)
		if ((ret = __memp_remove_bucket(dbmp)) != 0)
			return (ret);

	/* Detach from the region then destroy it. */
	infop = &dbmp->reginfo[--mp->nreg];
	return (__env_region_detach(env, infop, 1));
}

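/*
 * __memp_map_regions --
 *	Bring this process's mappings up to date: map any regions that
 *	are missing or stale, and detach from regions beyond the current
 *	set.
 */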
static int
__memp_map_regions(dbmp)
	DB_MPOOL *dbmp;
{
	ENV *env;
	MPOOL *mp;
	int ret;
	u_int i;
	u_int32_t *regids;

	env = dbmp->env;
	mp = dbmp->reginfo[0].primary;
	regids = R_ADDR(dbmp->reginfo, mp->regids);
	ret = 0;

	for (i = 1; i < mp->nreg; ++i) {
		if (dbmp->reginfo[i].primary != NULL &&
		    dbmp->reginfo[i].id == regids[i])
			continue;

		if (dbmp->reginfo[i].primary != NULL)
			ret = __env_region_detach(env, &dbmp->reginfo[i], 0);

		dbmp->reginfo[i].env = env;
		dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
		dbmp->reginfo[i].id = regids[i];
		dbmp->reginfo[i].flags = REGION_JOIN_OK;
		if ((ret =
		    __env_region_attach(env, &dbmp->reginfo[i], 0)) != 0)
			return (ret);
		dbmp->reginfo[i].primary = R_ADDR(&dbmp->reginfo[i],
		    dbmp->reginfo[i].rp->primary);
	}

	for (; i < mp->max_nreg; i++)
		if (dbmp->reginfo[i].primary != NULL &&
		    (ret = __env_region_detach(env,
		    &dbmp->reginfo[i], 0)) != 0)
			break;

	return (ret);
}

/*
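 * __memp_resize --
 *	Resize the cache to the given total size by adding or removing
 *	one region at a time under the resize mutex.
 *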
 * PUBLIC: int __memp_resize __P((DB_MPOOL *, u_int32_t, u_int32_t));
 */
int
__memp_resize(dbmp, gbytes, bytes)
	DB_MPOOL *dbmp;
	u_int32_t gbytes, bytes;
{
	ENV *env;
	MPOOL *mp;
	int ret;
	u_int32_t ncache;
	roff_t reg_size, total_size;

	env = dbmp->env;
	mp = dbmp->reginfo[0].primary;
	reg_size = dbmp->reginfo[0].rp->size;
	total_size = (roff_t)gbytes * GIGABYTE + bytes;
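	/* Round to the nearest whole number of fixed-size regions. */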
	ncache = (u_int32_t)((total_size + reg_size / 2) / reg_size);

	if (ncache < 1)
		ncache = 1;
	else if (ncache > mp->max_nreg) {
		__db_errx(env,
		    "cannot resize to %lu cache regions: maximum is %lu",
		    (u_long)ncache, (u_long)mp->max_nreg);
		return (EINVAL);
	}

	ret = 0;
	MUTEX_LOCK(env, mp->mtx_resize);
	while (mp->nreg != ncache)
		if ((ret = (mp->nreg < ncache ?
		    __memp_add_region(dbmp) :
		    __memp_remove_region(dbmp))) != 0)
			break;
	MUTEX_UNLOCK(env, mp->mtx_resize);

	return (ret);
}

/*
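 * __memp_get_cache_max --
 *	Return the maximum cache size, from the region if the pool is
 *	open, otherwise from the DB_ENV handle.
 *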
 * PUBLIC: int __memp_get_cache_max __P((DB_ENV *, u_int32_t *, u_int32_t *));
 */
int
__memp_get_cache_max(dbenv, max_gbytesp, max_bytesp)
	DB_ENV *dbenv;
	u_int32_t *max_gbytesp, *max_bytesp;
{
	DB_MPOOL *dbmp;
	ENV *env;
	MPOOL *mp;
	roff_t reg_size, max_size;

	env = dbenv->env;

	ENV_NOT_CONFIGURED(env,
	    env->mp_handle, "DB_ENV->get_cache_max", DB_INIT_MPOOL);

	if (MPOOL_ON(env)) {
		/* Cannot be set after open, no lock required to read. */
		dbmp = env->mp_handle;
		mp = dbmp->reginfo[0].primary;
		reg_size = dbmp->reginfo[0].rp->size;
		max_size = mp->max_nreg * reg_size;
		*max_gbytesp = (u_int32_t)(max_size / GIGABYTE);
		*max_bytesp = (u_int32_t)(max_size % GIGABYTE);
	} else {
		*max_gbytesp = dbenv->mp_max_gbytes;
		*max_bytesp = dbenv->mp_max_bytes;
	}

	return (0);
}

/*
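 * __memp_set_cache_max --
 *	Set the maximum cache size; only legal before the environment
 *	is opened.
 *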
 * PUBLIC: int __memp_set_cache_max __P((DB_ENV *, u_int32_t, u_int32_t));
 */
int
__memp_set_cache_max(dbenv, max_gbytes, max_bytes)
	DB_ENV *dbenv;
	u_int32_t max_gbytes, max_bytes;
{
	ENV *env;

	env = dbenv->env;

	ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_cache_max");
	dbenv->mp_max_gbytes = max_gbytes;
	dbenv->mp_max_bytes = max_bytes;

	return (0);
}