1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 * 6 * $Id: mp_region.c,v 12.39 2008/05/08 03:15:38 mjc Exp $ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12#include "dbinc/mp.h" 13 14static int __memp_init_config __P((ENV *, MPOOL *)); 15static void __memp_region_size __P((ENV *, roff_t *, u_int32_t *)); 16 17/* 18 * __memp_open -- 19 * Internal version of memp_open: only called from ENV->open. 20 * 21 * PUBLIC: int __memp_open __P((ENV *, int)); 22 */ 23int 24__memp_open(env, create_ok) 25 ENV *env; 26 int create_ok; 27{ 28 DB_ENV *dbenv; 29 DB_MPOOL *dbmp; 30 MPOOL *mp; 31 REGINFO reginfo; 32 roff_t reg_size; 33 u_int i, max_nreg; 34 u_int32_t htab_buckets, *regids; 35 int ret; 36 37 dbenv = env->dbenv; 38 39 /* Calculate the region size and hash bucket count. */ 40 __memp_region_size(env, ®_size, &htab_buckets); 41 42 /* Create and initialize the DB_MPOOL structure. */ 43 if ((ret = __os_calloc(env, 1, sizeof(*dbmp), &dbmp)) != 0) 44 return (ret); 45 LIST_INIT(&dbmp->dbregq); 46 TAILQ_INIT(&dbmp->dbmfq); 47 dbmp->env = env; 48 49 /* Join/create the first mpool region. */ 50 memset(®info, 0, sizeof(REGINFO)); 51 reginfo.env = env; 52 reginfo.type = REGION_TYPE_MPOOL; 53 reginfo.id = INVALID_REGION_ID; 54 reginfo.flags = REGION_JOIN_OK; 55 if (create_ok) 56 F_SET(®info, REGION_CREATE_OK); 57 if ((ret = __env_region_attach(env, ®info, reg_size)) != 0) 58 goto err; 59 60 /* 61 * If we created the region, initialize it. Create or join any 62 * additional regions. 63 */ 64 if (F_ISSET(®info, REGION_CREATE)) { 65 /* 66 * We define how many regions there are going to be, allocate 67 * the REGINFO structures and create them. Make sure we don't 68 * clear the wrong entries on error. 69 */ 70 max_nreg = __memp_max_regions(env); 71 if ((ret = __os_calloc(env, 72 max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0) 73 goto err; 74 /* Make sure we don't clear the wrong entries on error. */ 75 dbmp->reginfo[0] = reginfo; 76 for (i = 1; i < max_nreg; ++i) 77 dbmp->reginfo[i].id = INVALID_REGION_ID; 78 79 /* Initialize the first region. */ 80 if ((ret = __memp_init(env, dbmp, 81 0, htab_buckets, max_nreg)) != 0) 82 goto err; 83 84 /* 85 * Create/initialize remaining regions and copy their IDs into 86 * the first region. 87 */ 88 mp = R_ADDR(dbmp->reginfo, dbmp->reginfo[0].rp->primary); 89 regids = R_ADDR(dbmp->reginfo, mp->regids); 90 regids[0] = dbmp->reginfo[0].id; 91 for (i = 1; i < dbenv->mp_ncache; ++i) { 92 dbmp->reginfo[i].env = env; 93 dbmp->reginfo[i].type = REGION_TYPE_MPOOL; 94 dbmp->reginfo[i].id = INVALID_REGION_ID; 95 dbmp->reginfo[i].flags = REGION_CREATE_OK; 96 if ((ret = __env_region_attach( 97 env, &dbmp->reginfo[i], reg_size)) != 0) 98 goto err; 99 if ((ret = __memp_init(env, dbmp, 100 i, htab_buckets, max_nreg)) != 0) 101 goto err; 102 103 regids[i] = dbmp->reginfo[i].id; 104 } 105 } else { 106 /* 107 * Determine how many regions there are going to be, allocate 108 * the REGINFO structures and fill in local copies of that 109 * information. 110 */ 111 mp = R_ADDR(®info, reginfo.rp->primary); 112 dbenv->mp_ncache = mp->nreg; 113 if ((ret = __os_calloc(env, 114 mp->max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0) 115 goto err; 116 /* Make sure we don't clear the wrong entries on error. */ 117 for (i = 0; i < dbenv->mp_ncache; ++i) 118 dbmp->reginfo[i].id = INVALID_REGION_ID; 119 dbmp->reginfo[0] = reginfo; 120 121 /* Join remaining regions. */ 122 regids = R_ADDR(dbmp->reginfo, mp->regids); 123 for (i = 1; i < dbenv->mp_ncache; ++i) { 124 dbmp->reginfo[i].env = env; 125 dbmp->reginfo[i].type = REGION_TYPE_MPOOL; 126 dbmp->reginfo[i].id = regids[i]; 127 dbmp->reginfo[i].flags = REGION_JOIN_OK; 128 if ((ret = __env_region_attach( 129 env, &dbmp->reginfo[i], 0)) != 0) 130 goto err; 131 } 132 } 133 134 /* Set the local addresses for the regions. */ 135 for (i = 0; i < dbenv->mp_ncache; ++i) 136 dbmp->reginfo[i].primary = 137 R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary); 138 139 /* If the region is threaded, allocate a mutex to lock the handles. */ 140 if ((ret = __mutex_alloc(env, 141 MTX_MPOOL_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbmp->mutex)) != 0) 142 goto err; 143 144 env->mp_handle = dbmp; 145 146 /* A process joining the region may reset the mpool configuration. */ 147 if ((ret = __memp_init_config(env, mp)) != 0) 148 return (ret); 149 150 return (0); 151 152err: env->mp_handle = NULL; 153 if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) { 154 for (i = 0; i < dbenv->mp_ncache; ++i) 155 if (dbmp->reginfo[i].id != INVALID_REGION_ID) 156 (void)__env_region_detach( 157 env, &dbmp->reginfo[i], 0); 158 __os_free(env, dbmp->reginfo); 159 } 160 161 (void)__mutex_free(env, &dbmp->mutex); 162 __os_free(env, dbmp); 163 return (ret); 164} 165 166/* 167 * __memp_init -- 168 * Initialize a MPOOL structure in shared memory. 169 * 170 * PUBLIC: int __memp_init 171 * PUBLIC: __P((ENV *, DB_MPOOL *, u_int, u_int32_t, u_int)); 172 */ 173int 174__memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg) 175 ENV *env; 176 DB_MPOOL *dbmp; 177 u_int reginfo_off, max_nreg; 178 u_int32_t htab_buckets; 179{ 180 BH *frozen_bhp; 181 BH_FROZEN_ALLOC *frozen; 182 DB_ENV *dbenv; 183 DB_MPOOL_HASH *htab, *hp; 184 MPOOL *mp, *main_mp; 185 REGINFO *infop; 186 db_mutex_t mtx_base, mtx_discard, mtx_prev; 187 u_int32_t i; 188 int ret; 189 void *p; 190 191 dbenv = env->dbenv; 192 193 infop = &dbmp->reginfo[reginfo_off]; 194 if ((ret = __env_alloc(infop, sizeof(MPOOL), &infop->primary)) != 0) 195 goto mem_err; 196 infop->rp->primary = R_OFFSET(infop, infop->primary); 197 mp = infop->primary; 198 memset(mp, 0, sizeof(*mp)); 199 200 if ((ret = 201 __mutex_alloc(env, MTX_MPOOL_REGION, 0, &mp->mtx_region)) != 0) 202 return (ret); 203 204 if (reginfo_off == 0) { 205 ZERO_LSN(mp->lsn); 206 207 mp->nreg = dbenv->mp_ncache; 208 mp->max_nreg = max_nreg; 209 if ((ret = __env_alloc(&dbmp->reginfo[0], 210 max_nreg * sizeof(u_int32_t), &p)) != 0) 211 goto mem_err; 212 mp->regids = R_OFFSET(dbmp->reginfo, p); 213 mp->nbuckets = dbenv->mp_ncache * htab_buckets; 214 215 /* Allocate file table space and initialize it. */ 216 if ((ret = __env_alloc(infop, 217 MPOOL_FILE_BUCKETS * sizeof(DB_MPOOL_HASH), &htab)) != 0) 218 goto mem_err; 219 mp->ftab = R_OFFSET(infop, htab); 220 for (i = 0; i < MPOOL_FILE_BUCKETS; i++) { 221 if ((ret = __mutex_alloc(env, 222 MTX_MPOOL_FILE_BUCKET, 0, &htab[i].mtx_hash)) != 0) 223 return (ret); 224 SH_TAILQ_INIT(&htab[i].hash_bucket); 225 htab[i].hash_page_dirty = 0; 226 } 227 228 /* 229 * Allocate all of the hash bucket mutexes up front. We do 230 * this so that we don't need to free and reallocate mutexes as 231 * the cache is resized. 232 */ 233 mtx_base = mtx_prev = MUTEX_INVALID; 234 for (i = 0; i < mp->max_nreg * htab_buckets; i++) { 235 if ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET, 236 0, &mtx_discard)) != 0) 237 return (ret); 238 if (i == 0) { 239 mtx_base = mtx_discard; 240 mtx_prev = mtx_discard - 1; 241 } 242 DB_ASSERT(env, mtx_discard == mtx_prev + 1 || 243 mtx_base == MUTEX_INVALID); 244 mtx_prev = mtx_discard; 245 if ((ret = __mutex_alloc(env, MTX_MPOOL_IO, 246 DB_MUTEX_SELF_BLOCK, &mtx_discard)) != 0) 247 return (ret); 248 DB_ASSERT(env, mtx_discard == mtx_prev + 1 || 249 mtx_base == MUTEX_INVALID); 250 mtx_prev = mtx_discard; 251 } 252 } else { 253 main_mp = dbmp->reginfo[0].primary; 254 htab = R_ADDR(&dbmp->reginfo[0], main_mp->htab); 255 mtx_base = htab[0].mtx_hash; 256 } 257 258 /* 259 * We preallocated all of the mutexes in a block, so for regions after 260 * the first, we skip mutexes in use in earlier regions. Each region 261 * has the same number of buckets and there are two mutexes per hash 262 * bucket (the bucket mutex and the I/O mutex). 263 */ 264 if (mtx_base != MUTEX_INVALID) 265 mtx_base += reginfo_off * htab_buckets * 2; 266 267 /* Allocate hash table space and initialize it. */ 268 if ((ret = __env_alloc(infop, 269 htab_buckets * sizeof(DB_MPOOL_HASH), &htab)) != 0) 270 goto mem_err; 271 mp->htab = R_OFFSET(infop, htab); 272 for (i = 0; i < htab_buckets; i++) { 273 hp = &htab[i]; 274 hp->mtx_hash = (mtx_base == MUTEX_INVALID) ? MUTEX_INVALID : 275 mtx_base + i * 2; 276 hp->mtx_io = (mtx_base == MUTEX_INVALID) ? MUTEX_INVALID : 277 mtx_base + i * 2 + 1; 278 SH_TAILQ_INIT(&hp->hash_bucket); 279 hp->hash_page_dirty = 0; 280#ifdef HAVE_STATISTICS 281 hp->hash_io_wait = 0; 282 hp->hash_frozen = hp->hash_thawed = hp->hash_frozen_freed = 0; 283#endif 284 hp->flags = 0; 285 ZERO_LSN(hp->old_reader); 286 } 287 mp->htab_buckets = htab_buckets; 288#ifdef HAVE_STATISTICS 289 mp->stat.st_hash_buckets = htab_buckets; 290#endif 291 292 SH_TAILQ_INIT(&mp->free_frozen); 293 SH_TAILQ_INIT(&mp->alloc_frozen); 294 295 /* 296 * Pre-allocate one frozen buffer header. This avoids situations where 297 * the cache becomes full of pages and we don't even have the 28 bytes 298 * (or so) available to allocate a frozen buffer header. 299 */ 300 if ((ret = __env_alloc(infop, 301 sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE), &frozen)) != 0) 302 goto mem_err; 303 frozen_bhp = (BH *)(frozen + 1); 304 SH_TAILQ_INSERT_TAIL(&mp->alloc_frozen, frozen, links); 305 SH_TAILQ_INSERT_TAIL(&mp->free_frozen, frozen_bhp, hq); 306 307 /* 308 * Only the environment creator knows the total cache size, fill in 309 * those statistics now. 310 */ 311 mp->stat.st_gbytes = dbenv->mp_gbytes; 312 mp->stat.st_bytes = dbenv->mp_bytes; 313 return (0); 314 315mem_err:__db_errx(env, "Unable to allocate memory for mpool region"); 316 return (ret); 317} 318 319/* 320 * PUBLIC: u_int32_t __memp_max_regions __P((ENV *)); 321 */ 322u_int32_t 323__memp_max_regions(env) 324 ENV *env; 325{ 326 DB_ENV *dbenv; 327 roff_t reg_size, max_size; 328 size_t max_nreg; 329 330 dbenv = env->dbenv; 331 332 __memp_region_size(env, ®_size, NULL); 333 max_size = 334 (roff_t)dbenv->mp_max_gbytes * GIGABYTE + dbenv->mp_max_bytes; 335 max_nreg = (max_size + reg_size / 2) / reg_size; 336 337 /* Sanity check that the number of regions fits in 32 bits. */ 338 DB_ASSERT(env, max_nreg == (u_int32_t)max_nreg); 339 340 if (max_nreg <= dbenv->mp_ncache) 341 max_nreg = dbenv->mp_ncache; 342 return ((u_int32_t)max_nreg); 343} 344 345/* 346 * __memp_region_size -- 347 * Size the region and figure out how many hash buckets we'll have. 348 */ 349static void 350__memp_region_size(env, reg_sizep, htab_bucketsp) 351 ENV *env; 352 roff_t *reg_sizep; 353 u_int32_t *htab_bucketsp; 354{ 355 DB_ENV *dbenv; 356 roff_t reg_size, cache_size; 357 358 dbenv = env->dbenv; 359 360 /* 361 * Figure out how big each cache region is. Cast an operand to roff_t 362 * so we do 64-bit arithmetic as appropriate. 363 */ 364 cache_size = (roff_t)dbenv->mp_gbytes * GIGABYTE + dbenv->mp_bytes; 365 reg_size = cache_size / dbenv->mp_ncache; 366 if (reg_sizep != NULL) 367 *reg_sizep = reg_size; 368 369 /* 370 * Figure out how many hash buckets each region will have. Assume we 371 * want to keep the hash chains with under 10 pages on each chain. We 372 * don't know the pagesize in advance, and it may differ for different 373 * files. Use a pagesize of 1K for the calculation -- we walk these 374 * chains a lot, they must be kept short. 375 * 376 * XXX 377 * Cache sizes larger than 10TB would cause 32-bit wrapping in the 378 * calculation of the number of hash buckets. This probably isn't 379 * something we need to worry about right now, but is checked when the 380 * cache size is set. 381 */ 382 if (htab_bucketsp != NULL) 383 *htab_bucketsp = 384 __db_tablesize((u_int32_t)(reg_size / (10 * 1024))); 385} 386 387/* 388 * __memp_region_mutex_count -- 389 * Return the number of mutexes the mpool region will need. 390 * 391 * PUBLIC: u_int32_t __memp_region_mutex_count __P((ENV *)); 392 */ 393u_int32_t 394__memp_region_mutex_count(env) 395 ENV *env; 396{ 397 DB_ENV *dbenv; 398 u_int32_t htab_buckets; 399 400 dbenv = env->dbenv; 401 402 __memp_region_size(env, NULL, &htab_buckets); 403 404 /* 405 * We need a couple of mutexes for the region itself, one for each 406 * file handle (MPOOLFILE) the application allocates, one for each 407 * of the MPOOL_FILE_BUCKETS, and each cache has two mutexes per 408 * hash bucket. 409 */ 410 return (dbenv->mp_ncache * htab_buckets * 2 + 50 + MPOOL_FILE_BUCKETS); 411} 412 413/* 414 * __memp_init_config -- 415 * Initialize shared configuration information. 416 */ 417static int 418__memp_init_config(env, mp) 419 ENV *env; 420 MPOOL *mp; 421{ 422 DB_ENV *dbenv; 423 424 dbenv = env->dbenv; 425 426 MPOOL_SYSTEM_LOCK(env); 427 if (dbenv->mp_mmapsize != 0) 428 mp->mp_mmapsize = dbenv->mp_mmapsize; 429 if (dbenv->mp_maxopenfd != 0) 430 mp->mp_maxopenfd = dbenv->mp_maxopenfd; 431 if (dbenv->mp_maxwrite != 0) 432 mp->mp_maxwrite = dbenv->mp_maxwrite; 433 if (dbenv->mp_maxwrite_sleep != 0) 434 mp->mp_maxwrite_sleep = dbenv->mp_maxwrite_sleep; 435 MPOOL_SYSTEM_UNLOCK(env); 436 437 return (0); 438} 439 440/* 441 * __memp_env_refresh -- 442 * Clean up after the mpool system on a close or failed open. 443 * 444 * PUBLIC: int __memp_env_refresh __P((ENV *)); 445 */ 446int 447__memp_env_refresh(env) 448 ENV *env; 449{ 450 BH *bhp; 451 BH_FROZEN_ALLOC *frozen_alloc; 452 DB_MPOOL *dbmp; 453 DB_MPOOLFILE *dbmfp; 454 DB_MPOOL_HASH *hp; 455 DB_MPREG *mpreg; 456 MPOOL *mp, *c_mp; 457 REGINFO *infop; 458 db_mutex_t mtx_base, mtx; 459 u_int32_t bucket, htab_buckets, i, max_nreg, nreg; 460 int ret, t_ret; 461 462 ret = 0; 463 dbmp = env->mp_handle; 464 mp = dbmp->reginfo[0].primary; 465 htab_buckets = mp->htab_buckets; 466 nreg = mp->nreg; 467 max_nreg = mp->max_nreg; 468 hp = R_ADDR(&dbmp->reginfo[0], mp->htab); 469 mtx_base = hp->mtx_hash; 470 471 /* 472 * If a private region, return the memory to the heap. Not needed for 473 * filesystem-backed or system shared memory regions, that memory isn't 474 * owned by any particular process. 475 */ 476 if (!F_ISSET(env, ENV_PRIVATE)) 477 goto not_priv; 478 479 /* Discard buffers. */ 480 for (i = 0; i < nreg; ++i) { 481 infop = &dbmp->reginfo[i]; 482 c_mp = infop->primary; 483 for (hp = R_ADDR(infop, c_mp->htab), bucket = 0; 484 bucket < c_mp->htab_buckets; ++hp, ++bucket) { 485 while ((bhp = SH_TAILQ_FIRST( 486 &hp->hash_bucket, __bh)) != NULL) 487 if (F_ISSET(bhp, BH_FROZEN)) 488 SH_TAILQ_REMOVE( 489 &hp->hash_bucket, bhp, 490 hq, __bh); 491 else { 492 if (F_ISSET(bhp, BH_DIRTY)) { 493 --hp->hash_page_dirty; 494 F_CLR(bhp, 495 BH_DIRTY | BH_DIRTY_CREATE); 496 } 497 if ((t_ret = __memp_bhfree( 498 dbmp, infop, hp, bhp, 499 BH_FREE_FREEMEM | 500 BH_FREE_UNLOCKED)) != 0 && ret == 0) 501 ret = t_ret; 502 } 503 } 504 while ((frozen_alloc = SH_TAILQ_FIRST( 505 &c_mp->alloc_frozen, __bh_frozen_a)) != NULL) { 506 SH_TAILQ_REMOVE(&c_mp->alloc_frozen, frozen_alloc, 507 links, __bh_frozen_a); 508 __env_alloc_free(infop, frozen_alloc); 509 } 510 } 511 512 /* Discard hash bucket mutexes. */ 513 if (mtx_base != MUTEX_INVALID) 514 for (i = 0; i < 2 * max_nreg * htab_buckets; ++i) { 515 mtx = mtx_base + i; 516 if ((t_ret = __mutex_free(env, &mtx)) != 0 && 517 ret == 0) 518 ret = t_ret; 519 } 520 521not_priv: 522 /* Discard DB_MPOOLFILEs. */ 523 while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL) 524 if ((t_ret = __memp_fclose(dbmfp, 0)) != 0 && ret == 0) 525 ret = t_ret; 526 527 /* Discard DB_MPREGs. */ 528 if (dbmp->pg_inout != NULL) 529 __os_free(env, dbmp->pg_inout); 530 while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) { 531 LIST_REMOVE(mpreg, q); 532 __os_free(env, mpreg); 533 } 534 535 /* Discard the DB_MPOOL thread mutex. */ 536 if ((t_ret = __mutex_free(env, &dbmp->mutex)) != 0 && ret == 0) 537 ret = t_ret; 538 539 if (F_ISSET(env, ENV_PRIVATE)) { 540 /* Discard REGION IDs. */ 541 infop = &dbmp->reginfo[0]; 542 __memp_free(infop, NULL, R_ADDR(infop, mp->regids)); 543 544 /* Discard the File table. */ 545 __memp_free(infop, NULL, R_ADDR(infop, mp->ftab)); 546 547 /* Discard Hash tables. */ 548 for (i = 0; i < nreg; ++i) { 549 infop = &dbmp->reginfo[i]; 550 c_mp = infop->primary; 551 __memp_free(infop, NULL, R_ADDR(infop, c_mp->htab)); 552 } 553 } 554 555 /* Detach from the region. */ 556 for (i = 0; i < nreg; ++i) { 557 infop = &dbmp->reginfo[i]; 558 if ((t_ret = 559 __env_region_detach(env, infop, 0)) != 0 && ret == 0) 560 ret = t_ret; 561 } 562 563 /* Discard DB_MPOOL. */ 564 __os_free(env, dbmp->reginfo); 565 __os_free(env, dbmp); 566 567 env->mp_handle = NULL; 568 return (ret); 569} 570