/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2009 Oracle.  All rights reserved.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"

static int __memp_reset_lru __P((ENV *, REGINFO *));

/*
 * __memp_fput_pp --
 *	DB_MPOOLFILE->put pre/post processing.
 *
 *	Validates the call (no flags are legal, the file must be open),
 *	enters the environment, delegates to __memp_fput, and performs
 *	the replication exit bookkeeping on the way out.
 *
 * PUBLIC: int __memp_fput_pp
 * PUBLIC:     __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t));
 */
int
__memp_fput_pp(dbmfp, pgaddr, priority, flags)
	DB_MPOOLFILE *dbmfp;
	void *pgaddr;
	DB_CACHE_PRIORITY priority;
	u_int32_t flags;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int ret, t_ret;

	env = dbmfp->env;

	/* No flag values are currently legal for DB_MPOOLFILE->put. */
	if (flags != 0)
		return (__db_ferr(env, "DB_MPOOLFILE->put", 0));

	MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->put");

	ENV_ENTER(env, ip);

	ret = __memp_fput(dbmfp, ip, pgaddr, priority);
	/* Keep the first error: t_ret only replaces ret when ret is 0. */
	if (IS_ENV_REPLICATED(env) &&
	    (t_ret = __op_rep_exit(env)) != 0 && ret == 0)
		ret = t_ret;

	ENV_LEAVE(env, ip);
	return (ret);
}

/*
 * __memp_fput --
 *	DB_MPOOLFILE->put.
 *
 *	Return a page to the cache: drop the buffer's reference count,
 *	remove it from the calling thread's pin list, recompute its LRU
 *	priority, and release the buffer latch.
 *
 * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *,
 * PUBLIC:      DB_THREAD_INFO *, void *, DB_CACHE_PRIORITY));
 */
int
__memp_fput(dbmfp, ip, pgaddr, priority)
	DB_MPOOLFILE *dbmfp;
	DB_THREAD_INFO *ip;
	void *pgaddr;
	DB_CACHE_PRIORITY priority;
{
	BH *bhp;
	DB_ENV *dbenv;
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	ENV *env;
	MPOOL *c_mp;
	MPOOLFILE *mfp;
	PIN_LIST *list, *lp;
	REGINFO *infop, *reginfo;
	roff_t b_ref;
	int region;
	int adjust, pfactor, ret, t_ret;
	char buf[DB_THREADID_STRLEN];

	env = dbmfp->env;
	dbenv = env->dbenv;
	dbmp = env->mp_handle;
	mfp = dbmfp->mfp;
	/*
	 * Recover the buffer header: the page data handed to the
	 * application sits at offset SSZA(BH, buf) inside the BH.
	 */
	bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
	ret = 0;

	/*
	 * If this is marked dummy, we are using it to unpin a buffer for
	 * another thread.
	 */
	if (F_ISSET(dbmfp, MP_DUMMY))
		goto unpin;

	/*
	 * If we're mapping the file, there's nothing to do.  Because we can
	 * stop mapping the file at any time, we have to check on each buffer
	 * to see if the address we gave the application was part of the map
	 * region.
	 */
	if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr &&
	    (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len)
		return (0);

#ifdef DIAGNOSTIC
	/*
	 * Decrement the per-file pinned buffer count (mapped pages aren't
	 * counted).
	 */
	MPOOL_SYSTEM_LOCK(env);
	if (dbmfp->pinref == 0) {
		MPOOL_SYSTEM_UNLOCK(env);
		__db_errx(env,
		    "%s: more pages returned than retrieved", __memp_fn(dbmfp));
		return (__env_panic(env, EACCES));
	}
	--dbmfp->pinref;
	MPOOL_SYSTEM_UNLOCK(env);
#endif

unpin:
	/* Locate the cache region and hash bucket holding this buffer. */
	infop = &dbmp->reginfo[bhp->region];
	c_mp = infop->primary;
	hp = R_ADDR(infop, c_mp->htab);
	hp = &hp[bhp->bucket];

	/*
	 * Check for a reference count going to zero.  This can happen if the
	 * application returns a page twice.
	 */
	if (atomic_read(&bhp->ref) == 0) {
		__db_errx(env, "%s: page %lu: unpinned page returned",
		    __memp_fn(dbmfp), (u_long)bhp->pgno);
		DB_ASSERT(env, atomic_read(&bhp->ref) != 0);
		return (__env_panic(env, EACCES));
	}

	/* Note the activity so allocation won't decide to quit. */
	++c_mp->put_counter;

	/*
	 * Remove the buffer from the calling thread's pin list; it is a
	 * panic-worthy inconsistency if the buffer was never recorded there.
	 */
	if (ip != NULL) {
		reginfo = env->reginfo;
		list = R_ADDR(reginfo, ip->dbth_pinlist);
		region = (int)(infop - dbmp->reginfo);
		b_ref = R_OFFSET(infop, bhp);
		for (lp = list; lp < &list[ip->dbth_pinmax]; lp++)
			if (lp->b_ref == b_ref && lp->region == region)
				break;

		if (lp == &list[ip->dbth_pinmax]) {
			__db_errx(env,
		    "__memp_fput: pinned buffer not found for thread %s",
			    dbenv->thread_id_string(dbenv,
			    ip->dbth_pid, ip->dbth_tid, buf));
			return (__env_panic(env, EINVAL));
		}

		lp->b_ref = INVALID_ROFF;
		ip->dbth_pincount--;
	}

	/*
	 * Mark the file dirty.
	 */
	if (F_ISSET(bhp, BH_EXCLUSIVE) && F_ISSET(bhp, BH_DIRTY)) {
		DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0);
		mfp->file_written = 1;
	}

	/*
	 * If more than one reference to the page we're done.  Ignore the
	 * discard flags (for now) and leave the buffer's priority alone.
	 * We are doing this a little early as the remaining ref may or
	 * may not be a write behind.  If it is we set the priority
	 * here, if not it will get set again later.  We might race
	 * and miss setting the priority which would leave it wrong
	 * for a while.
	 */
	DB_ASSERT(env, atomic_read(&bhp->ref) != 0);
	if (atomic_dec(env, &bhp->ref) > 1 || (atomic_read(&bhp->ref) == 1 &&
	    !F_ISSET(bhp, BH_DIRTY))) {
		/*
		 * __memp_pgwrite only has a shared lock while it clears
		 * the BH_DIRTY bit.  If we only have a shared latch then
		 * we can't touch the flags bits.
		 */
		if (F_ISSET(bhp, BH_EXCLUSIVE))
			F_CLR(bhp, BH_EXCLUSIVE);
		MUTEX_UNLOCK(env, bhp->mtx_buf);
		return (0);
	}

	/* The buffer should not be accessed again. */
	if (BH_REFCOUNT(bhp) == 0)
		MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, 0);

	/* Update priority values. */
	if (priority == DB_PRIORITY_VERY_LOW ||
	    mfp->priority == MPOOL_PRI_VERY_LOW)
		bhp->priority = 0;
	else {
		/*
		 * We don't lock the LRU counter or the stat.st_pages field, if
		 * we get garbage (which won't happen on a 32-bit machine), it
		 * only means a buffer has the wrong priority.
		 */
		bhp->priority = c_mp->lru_count;

		/* Map the caller's request to a per-file priority factor. */
		switch (priority) {
		default:
		case DB_PRIORITY_UNCHANGED:
			pfactor = mfp->priority;
			break;
		case DB_PRIORITY_VERY_LOW:
			pfactor = MPOOL_PRI_VERY_LOW;
			break;
		case DB_PRIORITY_LOW:
			pfactor = MPOOL_PRI_LOW;
			break;
		case DB_PRIORITY_DEFAULT:
			pfactor = MPOOL_PRI_DEFAULT;
			break;
		case DB_PRIORITY_HIGH:
			pfactor = MPOOL_PRI_HIGH;
			break;
		case DB_PRIORITY_VERY_HIGH:
			pfactor = MPOOL_PRI_VERY_HIGH;
			break;
		}

		adjust = 0;
		if (pfactor != 0)
			adjust = (int)c_mp->stat.st_pages / pfactor;

		if (F_ISSET(bhp, BH_DIRTY))
			adjust += (int)c_mp->stat.st_pages / MPOOL_PRI_DIRTY;

		/* Clamp the adjustment so the u_int32_t priority can't wrap. */
		if (adjust > 0) {
			if (UINT32_MAX - bhp->priority >= (u_int32_t)adjust)
				bhp->priority += adjust;
		} else if (adjust < 0)
			if (bhp->priority > (u_int32_t)-adjust)
				bhp->priority += adjust;
	}

	/*
	 * __memp_pgwrite only has a shared lock while it clears the
	 * BH_DIRTY bit.  If we only have a shared latch then we can't
	 * touch the flags bits.
	 */
	if (F_ISSET(bhp, BH_EXCLUSIVE))
		F_CLR(bhp, BH_EXCLUSIVE);
	MUTEX_UNLOCK(env, bhp->mtx_buf);

	/*
	 * On every buffer put we update the buffer generation number and check
	 * for wraparound.
	 */
	if (++c_mp->lru_count == UINT32_MAX)
		if ((t_ret =
		    __memp_reset_lru(env, dbmp->reginfo)) != 0 && ret == 0)
			ret = t_ret;

	return (ret);
}

/*
 * __memp_reset_lru --
 *	Reset the cache LRU counter.
 */
static int
__memp_reset_lru(env, infop)
	ENV *env;
	REGINFO *infop;
{
	BH *bhp, *tbhp;
	DB_MPOOL_HASH *hp;
	MPOOL *c_mp;
	u_int32_t bucket, priority;

	c_mp = infop->primary;
	/*
	 * Update the counter so all future allocations will start at the
	 * bottom.
	 */
	c_mp->lru_count -= MPOOL_BASE_DECREMENT;

	/* Adjust the priority of every buffer in the system. */
	for (hp = R_ADDR(infop, c_mp->htab),
	    bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
		/*
		 * Skip empty buckets.
		 *
		 * We can check for empty buckets before locking as we
		 * only care if the pointer is zero or non-zero.
		 */
		if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) {
			/*
			 * NOTE(review): lru_reset looks like a progress
			 * marker for this sweep (it is zeroed below) --
			 * confirm against the allocator's reader of it.
			 */
			c_mp->lru_reset++;
			continue;
		}

		MUTEX_LOCK(env, hp->mtx_hash);
		c_mp->lru_reset++;
		/*
		 * We need to take a little care that the bucket does
		 * not become unsorted.  This is highly unlikely but
		 * possible.
		 */
		priority = 0;
		SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
			/* Rescale every version on the buffer's chain too. */
			for (tbhp = bhp; tbhp != NULL;
			    tbhp = SH_CHAIN_PREV(tbhp, vc, __bh)) {
				if (tbhp->priority != UINT32_MAX &&
				    tbhp->priority > MPOOL_BASE_DECREMENT) {
					tbhp->priority -= MPOOL_BASE_DECREMENT;
					/*
					 * Never let a rescaled priority fall
					 * below the previous list element's,
					 * keeping the bucket sorted.
					 */
					if (tbhp->priority < priority)
						tbhp->priority = priority;
				}
			}
			priority = bhp->priority;
		}
		MUTEX_UNLOCK(env, hp->mtx_hash);
	}
	/* Sweep complete; clear the progress marker. */
	c_mp->lru_reset = 0;

	/* env may be otherwise unused in some builds; quiet the compiler. */
	COMPQUIET(env, NULL);
	return (0);
}

/*
 * __memp_unpin_buffers --
 *	Unpin buffers pinned by a thread.
333 * 334 * PUBLIC: int __memp_unpin_buffers __P((ENV *, DB_THREAD_INFO *)); 335 */ 336int 337__memp_unpin_buffers(env, ip) 338 ENV *env; 339 DB_THREAD_INFO *ip; 340{ 341 BH *bhp; 342 DB_MPOOL *dbmp; 343 DB_MPOOLFILE dbmf; 344 PIN_LIST *list, *lp; 345 REGINFO *rinfop, *reginfo; 346 int ret; 347 348 memset(&dbmf, 0, sizeof(dbmf)); 349 dbmf.env = env; 350 dbmf.flags = MP_DUMMY; 351 dbmp = env->mp_handle; 352 reginfo = env->reginfo; 353 354 list = R_ADDR(reginfo, ip->dbth_pinlist); 355 for (lp = list; lp < &list[ip->dbth_pinmax]; lp++) { 356 if (lp->b_ref == INVALID_ROFF) 357 continue; 358 rinfop = &dbmp->reginfo[lp->region]; 359 bhp = R_ADDR(rinfop, lp->b_ref); 360 dbmf.mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); 361 if ((ret = __memp_fput(&dbmf, ip, 362 (u_int8_t *)bhp + SSZA(BH, buf), 363 DB_PRIORITY_UNCHANGED)) != 0) 364 return (ret); 365 } 366 return (0); 367} 368