1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 *
6 * $Id: mp_region.c,v 12.39 2008/05/08 03:15:38 mjc Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/mp.h"
13
14static int	__memp_init_config __P((ENV *, MPOOL *));
15static void	__memp_region_size __P((ENV *, roff_t *, u_int32_t *));
16
17/*
18 * __memp_open --
19 *	Internal version of memp_open: only called from ENV->open.
20 *
21 * PUBLIC: int __memp_open __P((ENV *, int));
22 */
int
__memp_open(env, create_ok)
	ENV *env;
	int create_ok;
{
	DB_ENV *dbenv;
	DB_MPOOL *dbmp;		/* Per-process mpool handle being built. */
	MPOOL *mp;		/* Primary shared mpool structure. */
	REGINFO reginfo;	/* Local info for the first region. */
	roff_t reg_size;
	u_int i, max_nreg;
	u_int32_t htab_buckets, *regids;
	int ret;

	dbenv = env->dbenv;

	/* Calculate the region size and hash bucket count. */
	__memp_region_size(env, &reg_size, &htab_buckets);

	/* Create and initialize the DB_MPOOL structure. */
	if ((ret = __os_calloc(env, 1, sizeof(*dbmp), &dbmp)) != 0)
		return (ret);
	LIST_INIT(&dbmp->dbregq);
	TAILQ_INIT(&dbmp->dbmfq);
	dbmp->env = env;

	/*
	 * Join/create the first mpool region.  REGION_CREATE_OK is set only
	 * when the caller permits creation; otherwise we may only join an
	 * existing region.
	 */
	memset(&reginfo, 0, sizeof(REGINFO));
	reginfo.env = env;
	reginfo.type = REGION_TYPE_MPOOL;
	reginfo.id = INVALID_REGION_ID;
	reginfo.flags = REGION_JOIN_OK;
	if (create_ok)
		F_SET(&reginfo, REGION_CREATE_OK);
	if ((ret = __env_region_attach(env, &reginfo, reg_size)) != 0)
		goto err;

	/*
	 * If we created the region, initialize it.  Create or join any
	 * additional regions.
	 */
	if (F_ISSET(&reginfo, REGION_CREATE)) {
		/*
		 * We define how many regions there are going to be, allocate
		 * the REGINFO structures and create them.  Make sure we don't
		 * clear the wrong entries on error.
		 */
		max_nreg = __memp_max_regions(env);
		/*
		 * NOTE(review): if this calloc fails, the first region is
		 * already attached but dbmp->reginfo is still NULL, so the
		 * err path below cannot detach it -- looks like a leaked
		 * region reference on OOM; confirm against upstream.
		 */
		if ((ret = __os_calloc(env,
		    max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
			goto err;
		/* Make sure we don't clear the wrong entries on error. */
		dbmp->reginfo[0] = reginfo;
		for (i = 1; i < max_nreg; ++i)
			dbmp->reginfo[i].id = INVALID_REGION_ID;

		/* Initialize the first region. */
		if ((ret = __memp_init(env, dbmp,
		    0, htab_buckets, max_nreg)) != 0)
			goto err;

		/*
		 * Create/initialize remaining regions and copy their IDs into
		 * the first region.
		 */
		mp = R_ADDR(dbmp->reginfo, dbmp->reginfo[0].rp->primary);
		regids = R_ADDR(dbmp->reginfo, mp->regids);
		regids[0] = dbmp->reginfo[0].id;
		for (i = 1; i < dbenv->mp_ncache; ++i) {
			dbmp->reginfo[i].env = env;
			dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
			dbmp->reginfo[i].id = INVALID_REGION_ID;
			dbmp->reginfo[i].flags = REGION_CREATE_OK;
			if ((ret = __env_region_attach(
			    env, &dbmp->reginfo[i], reg_size)) != 0)
				goto err;
			if ((ret = __memp_init(env, dbmp,
			    i, htab_buckets, max_nreg)) != 0)
				goto err;

			regids[i] = dbmp->reginfo[i].id;
		}
	} else {
		/*
		 * Determine how many regions there are going to be, allocate
		 * the REGINFO structures and fill in local copies of that
		 * information.  The creator recorded the region count in the
		 * shared MPOOL; override any locally configured cache count.
		 */
		mp = R_ADDR(&reginfo, reginfo.rp->primary);
		dbenv->mp_ncache = mp->nreg;
		if ((ret = __os_calloc(env,
		    mp->max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
			goto err;
		/* Make sure we don't clear the wrong entries on error. */
		for (i = 0; i < dbenv->mp_ncache; ++i)
			dbmp->reginfo[i].id = INVALID_REGION_ID;
		dbmp->reginfo[0] = reginfo;

		/* Join remaining regions, using the creator's region IDs. */
		regids = R_ADDR(dbmp->reginfo, mp->regids);
		for (i = 1; i < dbenv->mp_ncache; ++i) {
			dbmp->reginfo[i].env = env;
			dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
			dbmp->reginfo[i].id = regids[i];
			dbmp->reginfo[i].flags = REGION_JOIN_OK;
			if ((ret = __env_region_attach(
			    env, &dbmp->reginfo[i], 0)) != 0)
				goto err;
		}
	}

	/* Set the local addresses for the regions. */
	for (i = 0; i < dbenv->mp_ncache; ++i)
		dbmp->reginfo[i].primary =
		    R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary);

	/* If the region is threaded, allocate a mutex to lock the handles. */
	if ((ret = __mutex_alloc(env,
	    MTX_MPOOL_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbmp->mutex)) != 0)
		goto err;

	env->mp_handle = dbmp;

	/* A process joining the region may reset the mpool configuration. */
	if ((ret = __memp_init_config(env, mp)) != 0)
		return (ret);

	return (0);

	/*
	 * Error cleanup: detach any regions that were successfully attached
	 * (those with a valid id), then release local allocations.
	 */
err:	env->mp_handle = NULL;
	if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) {
		for (i = 0; i < dbenv->mp_ncache; ++i)
			if (dbmp->reginfo[i].id != INVALID_REGION_ID)
				(void)__env_region_detach(
				    env, &dbmp->reginfo[i], 0);
		__os_free(env, dbmp->reginfo);
	}

	(void)__mutex_free(env, &dbmp->mutex);
	__os_free(env, dbmp);
	return (ret);
}
165
166/*
167 * __memp_init --
168 *	Initialize a MPOOL structure in shared memory.
169 *
170 * PUBLIC: int	__memp_init
171 * PUBLIC:     __P((ENV *, DB_MPOOL *, u_int, u_int32_t, u_int));
172 */
int
__memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg)
	ENV *env;
	DB_MPOOL *dbmp;
	u_int reginfo_off, max_nreg;
	u_int32_t htab_buckets;
{
	BH *frozen_bhp;
	BH_FROZEN_ALLOC *frozen;
	DB_ENV *dbenv;
	DB_MPOOL_HASH *htab, *hp;
	MPOOL *mp, *main_mp;
	REGINFO *infop;
	db_mutex_t mtx_base, mtx_discard, mtx_prev;
	u_int32_t i;
	int ret;
	void *p;

	dbenv = env->dbenv;

	/* Allocate and link in the shared MPOOL structure for this region. */
	infop = &dbmp->reginfo[reginfo_off];
	if ((ret = __env_alloc(infop, sizeof(MPOOL), &infop->primary)) != 0)
		goto mem_err;
	infop->rp->primary = R_OFFSET(infop, infop->primary);
	mp = infop->primary;
	memset(mp, 0, sizeof(*mp));

	if ((ret =
	    __mutex_alloc(env, MTX_MPOOL_REGION, 0, &mp->mtx_region)) != 0)
		return (ret);

	/*
	 * The first region (reginfo_off == 0) owns the environment-wide
	 * state: region-ID array, file table and all hash-bucket mutexes.
	 */
	if (reginfo_off == 0) {
		ZERO_LSN(mp->lsn);

		mp->nreg = dbenv->mp_ncache;
		mp->max_nreg = max_nreg;
		/* Region-ID array sized for the maximum possible regions. */
		if ((ret = __env_alloc(&dbmp->reginfo[0],
		    max_nreg * sizeof(u_int32_t), &p)) != 0)
			goto mem_err;
		mp->regids = R_OFFSET(dbmp->reginfo, p);
		mp->nbuckets = dbenv->mp_ncache * htab_buckets;

		/* Allocate file table space and initialize it. */
		if ((ret = __env_alloc(infop,
		    MPOOL_FILE_BUCKETS * sizeof(DB_MPOOL_HASH), &htab)) != 0)
			goto mem_err;
		mp->ftab = R_OFFSET(infop, htab);
		for (i = 0; i < MPOOL_FILE_BUCKETS; i++) {
			if ((ret = __mutex_alloc(env,
			     MTX_MPOOL_FILE_BUCKET, 0, &htab[i].mtx_hash)) != 0)
				return (ret);
			SH_TAILQ_INIT(&htab[i].hash_bucket);
			htab[i].hash_page_dirty = 0;
		}

		/*
		 * Allocate all of the hash bucket mutexes up front.  We do
		 * this so that we don't need to free and reallocate mutexes as
		 * the cache is resized.
		 *
		 * Two mutexes are allocated per bucket (a hash mutex and a
		 * self-blocking I/O mutex), for every bucket of every
		 * possible region.  The asserts check the allocator hands
		 * them out consecutively; the later per-bucket assignment
		 * relies on that contiguity.  The `mtx_discard - 1' trick
		 * on the first iteration just makes the first assert hold.
		 * If mutexes are not configured, mtx_base stays
		 * MUTEX_INVALID and the asserts are vacuously satisfied.
		 */
		mtx_base = mtx_prev = MUTEX_INVALID;
		for (i = 0; i < mp->max_nreg * htab_buckets; i++) {
			if ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET,
			    0, &mtx_discard)) != 0)
				return (ret);
			if (i == 0) {
				mtx_base = mtx_discard;
				mtx_prev = mtx_discard - 1;
			}
			DB_ASSERT(env, mtx_discard == mtx_prev + 1 ||
			    mtx_base == MUTEX_INVALID);
			mtx_prev = mtx_discard;
			if ((ret = __mutex_alloc(env, MTX_MPOOL_IO,
			    DB_MUTEX_SELF_BLOCK, &mtx_discard)) != 0)
				return (ret);
			DB_ASSERT(env, mtx_discard == mtx_prev + 1 ||
			    mtx_base == MUTEX_INVALID);
			mtx_prev = mtx_discard;
		}
	} else {
		/* Secondary regions pick up the block base from region 0. */
		main_mp = dbmp->reginfo[0].primary;
		htab = R_ADDR(&dbmp->reginfo[0], main_mp->htab);
		mtx_base = htab[0].mtx_hash;
	}

	/*
	 * We preallocated all of the mutexes in a block, so for regions after
	 * the first, we skip mutexes in use in earlier regions.  Each region
	 * has the same number of buckets and there are two mutexes per hash
	 * bucket (the bucket mutex and the I/O mutex).
	 */
	if (mtx_base != MUTEX_INVALID)
		mtx_base += reginfo_off * htab_buckets * 2;

	/* Allocate hash table space and initialize it. */
	if ((ret = __env_alloc(infop,
	    htab_buckets * sizeof(DB_MPOOL_HASH), &htab)) != 0)
		goto mem_err;
	mp->htab = R_OFFSET(infop, htab);
	for (i = 0; i < htab_buckets; i++) {
		hp = &htab[i];
		/* Consecutive (hash, I/O) mutex pair for each bucket. */
		hp->mtx_hash = (mtx_base == MUTEX_INVALID) ? MUTEX_INVALID :
		    mtx_base + i * 2;
		hp->mtx_io = (mtx_base == MUTEX_INVALID) ? MUTEX_INVALID :
		    mtx_base + i * 2 + 1;
		SH_TAILQ_INIT(&hp->hash_bucket);
		hp->hash_page_dirty = 0;
#ifdef HAVE_STATISTICS
		hp->hash_io_wait = 0;
		hp->hash_frozen = hp->hash_thawed = hp->hash_frozen_freed = 0;
#endif
		hp->flags = 0;
		ZERO_LSN(hp->old_reader);
	}
	mp->htab_buckets = htab_buckets;
#ifdef HAVE_STATISTICS
	mp->stat.st_hash_buckets = htab_buckets;
#endif

	SH_TAILQ_INIT(&mp->free_frozen);
	SH_TAILQ_INIT(&mp->alloc_frozen);

	/*
	 * Pre-allocate one frozen buffer header.  This avoids situations where
	 * the cache becomes full of pages and we don't even have the 28 bytes
	 * (or so) available to allocate a frozen buffer header.
	 */
	if ((ret = __env_alloc(infop,
	    sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE), &frozen)) != 0)
		goto mem_err;
	/* The BH lives immediately after the BH_FROZEN_ALLOC header. */
	frozen_bhp = (BH *)(frozen + 1);
	SH_TAILQ_INSERT_TAIL(&mp->alloc_frozen, frozen, links);
	SH_TAILQ_INSERT_TAIL(&mp->free_frozen, frozen_bhp, hq);

	/*
	 * Only the environment creator knows the total cache size, fill in
	 * those statistics now.
	 */
	mp->stat.st_gbytes = dbenv->mp_gbytes;
	mp->stat.st_bytes = dbenv->mp_bytes;
	return (0);

mem_err:__db_errx(env, "Unable to allocate memory for mpool region");
	return (ret);
}
318
319/*
320 * PUBLIC: u_int32_t __memp_max_regions __P((ENV *));
321 */
322u_int32_t
323__memp_max_regions(env)
324	ENV *env;
325{
326	DB_ENV *dbenv;
327	roff_t reg_size, max_size;
328	size_t max_nreg;
329
330	dbenv = env->dbenv;
331
332	__memp_region_size(env, &reg_size, NULL);
333	max_size =
334	    (roff_t)dbenv->mp_max_gbytes * GIGABYTE + dbenv->mp_max_bytes;
335	max_nreg = (max_size + reg_size / 2) / reg_size;
336
337	/* Sanity check that the number of regions fits in 32 bits. */
338	DB_ASSERT(env, max_nreg == (u_int32_t)max_nreg);
339
340	if (max_nreg <= dbenv->mp_ncache)
341		max_nreg = dbenv->mp_ncache;
342	return ((u_int32_t)max_nreg);
343}
344
345/*
346 * __memp_region_size --
347 *	Size the region and figure out how many hash buckets we'll have.
348 */
349static void
350__memp_region_size(env, reg_sizep, htab_bucketsp)
351	ENV *env;
352	roff_t *reg_sizep;
353	u_int32_t *htab_bucketsp;
354{
355	DB_ENV *dbenv;
356	roff_t reg_size, cache_size;
357
358	dbenv = env->dbenv;
359
360	/*
361	 * Figure out how big each cache region is.  Cast an operand to roff_t
362	 * so we do 64-bit arithmetic as appropriate.
363	 */
364	cache_size = (roff_t)dbenv->mp_gbytes * GIGABYTE + dbenv->mp_bytes;
365	reg_size = cache_size / dbenv->mp_ncache;
366	if (reg_sizep != NULL)
367		*reg_sizep = reg_size;
368
369	/*
370	 * Figure out how many hash buckets each region will have.  Assume we
371	 * want to keep the hash chains with under 10 pages on each chain.  We
372	 * don't know the pagesize in advance, and it may differ for different
373	 * files.  Use a pagesize of 1K for the calculation -- we walk these
374	 * chains a lot, they must be kept short.
375	 *
376	 * XXX
377	 * Cache sizes larger than 10TB would cause 32-bit wrapping in the
378	 * calculation of the number of hash buckets.  This probably isn't
379	 * something we need to worry about right now, but is checked when the
380	 * cache size is set.
381	 */
382	if (htab_bucketsp != NULL)
383		*htab_bucketsp =
384		    __db_tablesize((u_int32_t)(reg_size / (10 * 1024)));
385}
386
387/*
388 * __memp_region_mutex_count --
389 *	Return the number of mutexes the mpool region will need.
390 *
391 * PUBLIC: u_int32_t __memp_region_mutex_count __P((ENV *));
392 */
393u_int32_t
394__memp_region_mutex_count(env)
395	ENV *env;
396{
397	DB_ENV *dbenv;
398	u_int32_t htab_buckets;
399
400	dbenv = env->dbenv;
401
402	__memp_region_size(env, NULL, &htab_buckets);
403
404	/*
405	 * We need a couple of mutexes for the region itself, one for each
406	 * file handle (MPOOLFILE) the application allocates, one for each
407	 * of the MPOOL_FILE_BUCKETS, and each cache has two mutexes per
408	 * hash bucket.
409	 */
410	return (dbenv->mp_ncache * htab_buckets * 2 + 50 + MPOOL_FILE_BUCKETS);
411}
412
413/*
414 * __memp_init_config --
415 *	Initialize shared configuration information.
416 */
417static int
418__memp_init_config(env, mp)
419	ENV *env;
420	MPOOL *mp;
421{
422	DB_ENV *dbenv;
423
424	dbenv = env->dbenv;
425
426	MPOOL_SYSTEM_LOCK(env);
427	if (dbenv->mp_mmapsize != 0)
428		mp->mp_mmapsize = dbenv->mp_mmapsize;
429	if (dbenv->mp_maxopenfd != 0)
430		mp->mp_maxopenfd = dbenv->mp_maxopenfd;
431	if (dbenv->mp_maxwrite != 0)
432		mp->mp_maxwrite = dbenv->mp_maxwrite;
433	if (dbenv->mp_maxwrite_sleep != 0)
434		mp->mp_maxwrite_sleep = dbenv->mp_maxwrite_sleep;
435	MPOOL_SYSTEM_UNLOCK(env);
436
437	return (0);
438}
439
440/*
441 * __memp_env_refresh --
442 *	Clean up after the mpool system on a close or failed open.
443 *
444 * PUBLIC: int __memp_env_refresh __P((ENV *));
445 */
int
__memp_env_refresh(env)
	ENV *env;
{
	BH *bhp;
	BH_FROZEN_ALLOC *frozen_alloc;
	DB_MPOOL *dbmp;
	DB_MPOOLFILE *dbmfp;
	DB_MPOOL_HASH *hp;
	DB_MPREG *mpreg;
	MPOOL *mp, *c_mp;
	REGINFO *infop;
	db_mutex_t mtx_base, mtx;
	u_int32_t bucket, htab_buckets, i, max_nreg, nreg;
	int ret, t_ret;

	/*
	 * Snapshot the shared-region values we need before we start
	 * tearing structures down.
	 */
	ret = 0;
	dbmp = env->mp_handle;
	mp = dbmp->reginfo[0].primary;
	htab_buckets = mp->htab_buckets;
	nreg = mp->nreg;
	max_nreg = mp->max_nreg;
	hp = R_ADDR(&dbmp->reginfo[0], mp->htab);
	/* Base of the contiguous hash/I/O mutex block (see __memp_init). */
	mtx_base = hp->mtx_hash;

	/*
	 * If a private region, return the memory to the heap.  Not needed for
	 * filesystem-backed or system shared memory regions, that memory isn't
	 * owned by any particular process.
	 */
	if (!F_ISSET(env, ENV_PRIVATE))
		goto not_priv;

	/* Discard buffers. */
	for (i = 0; i < nreg; ++i) {
		infop = &dbmp->reginfo[i];
		c_mp = infop->primary;
		for (hp = R_ADDR(infop, c_mp->htab), bucket = 0;
		    bucket < c_mp->htab_buckets; ++hp, ++bucket) {
			while ((bhp = SH_TAILQ_FIRST(
			    &hp->hash_bucket, __bh)) != NULL)
				/*
				 * Frozen headers are only unlinked here;
				 * their memory is released below with the
				 * alloc_frozen list.
				 */
				if (F_ISSET(bhp, BH_FROZEN))
					SH_TAILQ_REMOVE(
					    &hp->hash_bucket, bhp,
					    hq, __bh);
				else {
					if (F_ISSET(bhp, BH_DIRTY)) {
						--hp->hash_page_dirty;
						F_CLR(bhp,
						    BH_DIRTY | BH_DIRTY_CREATE);
					}
					/* Free, remembering first error. */
					if ((t_ret = __memp_bhfree(
					    dbmp, infop, hp, bhp,
					    BH_FREE_FREEMEM |
					    BH_FREE_UNLOCKED)) != 0 && ret == 0)
						ret = t_ret;
				}
		}
		/* Release the frozen-buffer-header allocations. */
		while ((frozen_alloc = SH_TAILQ_FIRST(
		    &c_mp->alloc_frozen, __bh_frozen_a)) != NULL) {
			SH_TAILQ_REMOVE(&c_mp->alloc_frozen, frozen_alloc,
			    links, __bh_frozen_a);
			__env_alloc_free(infop, frozen_alloc);
		}
	}

	/*
	 * Discard hash bucket mutexes.  They were allocated as one
	 * contiguous block of 2 * max_nreg * htab_buckets mutexes
	 * starting at mtx_base (see __memp_init).
	 */
	if (mtx_base != MUTEX_INVALID)
		for (i = 0; i < 2 * max_nreg * htab_buckets; ++i) {
			mtx = mtx_base + i;
			if ((t_ret = __mutex_free(env, &mtx)) != 0 &&
			    ret == 0)
				ret = t_ret;
		}

not_priv:
	/* Discard DB_MPOOLFILEs. */
	while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL)
		if ((t_ret = __memp_fclose(dbmfp, 0)) != 0 && ret == 0)
			ret = t_ret;

	/* Discard DB_MPREGs. */
	if (dbmp->pg_inout != NULL)
		__os_free(env, dbmp->pg_inout);
	while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) {
		LIST_REMOVE(mpreg, q);
		__os_free(env, mpreg);
	}

	/* Discard the DB_MPOOL thread mutex. */
	if ((t_ret = __mutex_free(env, &dbmp->mutex)) != 0 && ret == 0)
		ret = t_ret;

	if (F_ISSET(env, ENV_PRIVATE)) {
		/* Discard REGION IDs. */
		infop = &dbmp->reginfo[0];
		__memp_free(infop, NULL, R_ADDR(infop, mp->regids));

		/* Discard the File table. */
		__memp_free(infop, NULL, R_ADDR(infop, mp->ftab));

		/* Discard Hash tables. */
		for (i = 0; i < nreg; ++i) {
			infop = &dbmp->reginfo[i];
			c_mp = infop->primary;
			__memp_free(infop, NULL, R_ADDR(infop, c_mp->htab));
		}
	}

	/* Detach from the region. */
	for (i = 0; i < nreg; ++i) {
		infop = &dbmp->reginfo[i];
		if ((t_ret =
		    __env_region_detach(env, infop, 0)) != 0 && ret == 0)
			ret = t_ret;
	}

	/* Discard DB_MPOOL. */
	__os_free(env, dbmp->reginfo);
	__os_free(env, dbmp);

	env->mp_handle = NULL;
	return (ret);
}
570