vm_page.c revision 216317
11590Srgrimes/*- 21590Srgrimes * Copyright (c) 1991 Regents of the University of California. 31590Srgrimes * All rights reserved. 41590Srgrimes * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. 51590Srgrimes * 61590Srgrimes * This code is derived from software contributed to Berkeley by 71590Srgrimes * The Mach Operating System project at Carnegie-Mellon University. 81590Srgrimes * 91590Srgrimes * Redistribution and use in source and binary forms, with or without 101590Srgrimes * modification, are permitted provided that the following conditions 111590Srgrimes * are met: 121590Srgrimes * 1. Redistributions of source code must retain the above copyright 131590Srgrimes * notice, this list of conditions and the following disclaimer. 141590Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 151590Srgrimes * notice, this list of conditions and the following disclaimer in the 161590Srgrimes * documentation and/or other materials provided with the distribution. 171590Srgrimes * 4. Neither the name of the University nor the names of its contributors 181590Srgrimes * may be used to endorse or promote products derived from this software 191590Srgrimes * without specific prior written permission. 201590Srgrimes * 211590Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 221590Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 231590Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 241590Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 251590Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 261590Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 271590Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 281590Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 291590Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 301590Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 311590Srgrimes * SUCH DAMAGE. 321590Srgrimes * 331590Srgrimes * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 341590Srgrimes */ 3527753Scharnier 361590Srgrimes/*- 3727753Scharnier * Copyright (c) 1987, 1990 Carnegie-Mellon University. 3827753Scharnier * All rights reserved. 3937453Sbde * 401590Srgrimes * Authors: Avadis Tevanian, Jr., Michael Wayne Young 411590Srgrimes * 421590Srgrimes * Permission to use, copy, modify and distribute this software and 431590Srgrimes * its documentation is hereby granted, provided that both the copyright 441590Srgrimes * notice and this permission notice appear in all copies of the 451590Srgrimes * software, derivative works or modified versions, and any portions 4614543Sdg * thereof, and that both notices appear in supporting documentation. 471590Srgrimes * 481590Srgrimes * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 491590Srgrimes * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 501590Srgrimes * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
511590Srgrimes * 521590Srgrimes * Carnegie Mellon requests users of this software to return to 531590Srgrimes * 541590Srgrimes * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 551590Srgrimes * School of Computer Science 561590Srgrimes * Carnegie Mellon University 5736080Swollman * Pittsburgh PA 15213-3890 5836103Swollman * 591590Srgrimes * any improvements or extensions that they make and grant Carnegie the 601590Srgrimes * rights to redistribute these changes. 6136080Swollman */ 621590Srgrimes 631590Srgrimes/* 6436080Swollman * GENERAL RULES ON VM_PAGE MANIPULATION 651590Srgrimes * 6636080Swollman * - a pageq mutex is required when adding or removing a page from a 6736080Swollman * page queue (vm_page_queue[]), regardless of other mutexes or the 681590Srgrimes * busy state of a page. 691590Srgrimes * 7036080Swollman * - a hash chain mutex is required when associating or disassociating 711590Srgrimes * a page from the VM PAGE CACHE hash table (vm_page_buckets), 7236080Swollman * regardless of other mutexes or the busy state of a page. 7336080Swollman * 7436080Swollman * - either a hash chain mutex OR a busied page is required in order 7536080Swollman * to modify the page flags. A hash chain mutex must be obtained in 7636080Swollman * order to busy a page. A page's flags cannot be modified by a 7736080Swollman * hash chain mutex if the page is marked busy. 7836080Swollman * 791590Srgrimes * - The object memq mutex is held when inserting or removing 8036080Swollman * pages from an object (vm_page_insert() or vm_page_remove()). This 8136080Swollman * is different from the object's main mutex. 8236080Swollman * 8336080Swollman * Generally speaking, you have to be aware of side effects when running 8436080Swollman * vm_page ops. A vm_page_lookup() will return with the hash chain 8536080Swollman * locked, whether it was able to lookup the page or not. vm_page_free(), 8636080Swollman * vm_page_cache(), vm_page_activate(), and a number of other routines 871590Srgrimes * will release the hash chain mutex for you. Intermediate manipulation 8836080Swollman * routines such as vm_page_flag_set() expect the hash chain to be held 8936080Swollman * on entry and the hash chain will remain held on return. 9036080Swollman * 9136080Swollman * pageq scanning can only occur with the pageq in question locked. 9236080Swollman * We have a known bottleneck with the active queue, but the cache 9336080Swollman * and free queues are actually arrays already. 9436080Swollman */ 9536080Swollman 9636080Swollman/* 9736080Swollman * Resident memory management module. 
9836080Swollman */ 9936080Swollman 10036080Swollman#include <sys/cdefs.h> 10136080Swollman__FBSDID("$FreeBSD: head/sys/vm/vm_page.c 216317 2010-12-09 06:54:06Z jchandra $"); 10236080Swollman 10336080Swollman#include "opt_msgbuf.h" 10436080Swollman#include "opt_vm.h" 10536080Swollman 10636080Swollman#include <sys/param.h> 10736080Swollman#include <sys/systm.h> 10836080Swollman#include <sys/lock.h> 10936080Swollman#include <sys/kernel.h> 11036080Swollman#include <sys/limits.h> 11136080Swollman#include <sys/malloc.h> 11236080Swollman#include <sys/msgbuf.h> 11336080Swollman#include <sys/mutex.h> 11436080Swollman#include <sys/proc.h> 11536080Swollman#include <sys/sysctl.h> 11636080Swollman#include <sys/vmmeter.h> 11736080Swollman#include <sys/vnode.h> 11836080Swollman 11936080Swollman#include <vm/vm.h> 12036080Swollman#include <vm/pmap.h> 12136080Swollman#include <vm/vm_param.h> 12236080Swollman#include <vm/vm_kern.h> 12336080Swollman#include <vm/vm_object.h> 1241590Srgrimes#include <vm/vm_page.h> 1251590Srgrimes#include <vm/vm_pageout.h> 1261590Srgrimes#include <vm/vm_pager.h> 1271590Srgrimes#include <vm/vm_phys.h> 12836080Swollman#include <vm/vm_reserv.h> 12936080Swollman#include <vm/vm_extern.h> 13036080Swollman#include <vm/uma.h> 1311590Srgrimes#include <vm/uma_int.h> 13236080Swollman 13336080Swollman#include <machine/md_var.h> 1341590Srgrimes 1351590Srgrimes#if defined(__amd64__) || defined (__i386__) 13636080Swollmanextern struct sysctl_oid_list sysctl__vm_pmap_children; 13736080Swollman#else 13836080SwollmanSYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); 13936080Swollman#endif 14028726Swollman 14136080Swollmanstatic uint64_t pmap_tryrelock_calls; 1421590SrgrimesSYSCTL_QUAD(_vm_pmap, OID_AUTO, tryrelock_calls, CTLFLAG_RD, 1431590Srgrimes &pmap_tryrelock_calls, 0, "Number of tryrelock calls"); 1441590Srgrimes 1451590Srgrimesstatic int pmap_tryrelock_restart; 1461590SrgrimesSYSCTL_INT(_vm_pmap, OID_AUTO, tryrelock_restart, CTLFLAG_RD, 1471590Srgrimes &pmap_tryrelock_restart, 0, "Number of tryrelock restarts"); 1481590Srgrimes 1491590Srgrimesstatic int pmap_tryrelock_race; 15036080SwollmanSYSCTL_INT(_vm_pmap, OID_AUTO, tryrelock_race, CTLFLAG_RD, 15136080Swollman &pmap_tryrelock_race, 0, "Number of tryrelock pmap race cases"); 15236080Swollman 15336080Swollman/* 15436080Swollman * Associated with page of user-allocatable memory is a 15528726Swollman * page structure. 15636091Sache */ 15737453Sbde 15836091Sachestruct vpgqueues vm_page_queues[PQ_COUNT]; 1591590Srgrimesstruct vpglocks vm_page_queue_lock; 1601590Srgrimesstruct vpglocks vm_page_queue_free_lock; 161 162struct vpglocks pa_lock[PA_LOCK_COUNT] __aligned(CACHE_LINE_SIZE); 163 164vm_page_t vm_page_array = 0; 165int vm_page_array_size = 0; 166long first_page = 0; 167int vm_page_zero_count = 0; 168 169static int boot_pages = UMA_BOOT_PAGES; 170TUNABLE_INT("vm.boot_pages", &boot_pages); 171SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0, 172 "number of pages allocated for bootstrapping the VM system"); 173 174static void vm_page_clear_dirty_mask(vm_page_t m, int pagebits); 175static void vm_page_queue_remove(int queue, vm_page_t m); 176static void vm_page_enqueue(int queue, vm_page_t m); 177 178/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */ 179#if PAGE_SIZE == 32768 180#ifdef CTASSERT 181CTASSERT(sizeof(u_long) >= 8); 182#endif 183#endif 184 185/* 186 * Try to acquire a physical address lock while a pmap is locked. 
If we 187 * fail to trylock we unlock and lock the pmap directly and cache the 188 * locked pa in *locked. The caller should then restart their loop in case 189 * the virtual to physical mapping has changed. 190 */ 191int 192vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked) 193{ 194 vm_paddr_t lockpa; 195 uint32_t gen_count; 196 197 gen_count = pmap->pm_gen_count; 198 atomic_add_long((volatile long *)&pmap_tryrelock_calls, 1); 199 lockpa = *locked; 200 *locked = pa; 201 if (lockpa) { 202 PA_LOCK_ASSERT(lockpa, MA_OWNED); 203 if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa)) 204 return (0); 205 PA_UNLOCK(lockpa); 206 } 207 if (PA_TRYLOCK(pa)) 208 return (0); 209 PMAP_UNLOCK(pmap); 210 atomic_add_int((volatile int *)&pmap_tryrelock_restart, 1); 211 PA_LOCK(pa); 212 PMAP_LOCK(pmap); 213 214 if (pmap->pm_gen_count != gen_count + 1) { 215 pmap->pm_retries++; 216 atomic_add_int((volatile int *)&pmap_tryrelock_race, 1); 217 return (EAGAIN); 218 } 219 return (0); 220} 221 222/* 223 * vm_set_page_size: 224 * 225 * Sets the page size, perhaps based upon the memory 226 * size. Must be called before any use of page-size 227 * dependent functions. 228 */ 229void 230vm_set_page_size(void) 231{ 232 if (cnt.v_page_size == 0) 233 cnt.v_page_size = PAGE_SIZE; 234 if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0) 235 panic("vm_set_page_size: page size not a power of two"); 236} 237 238/* 239 * vm_page_blacklist_lookup: 240 * 241 * See if a physical address in this page has been listed 242 * in the blacklist tunable. Entries in the tunable are 243 * separated by spaces or commas. If an invalid integer is 244 * encountered then the rest of the string is skipped. 245 */ 246static int 247vm_page_blacklist_lookup(char *list, vm_paddr_t pa) 248{ 249 vm_paddr_t bad; 250 char *cp, *pos; 251 252 for (pos = list; *pos != '\0'; pos = cp) { 253 bad = strtoq(pos, &cp, 0); 254 if (*cp != '\0') { 255 if (*cp == ' ' || *cp == ',') { 256 cp++; 257 if (cp == pos) 258 continue; 259 } else 260 break; 261 } 262 if (pa == trunc_page(bad)) 263 return (1); 264 } 265 return (0); 266} 267 268/* 269 * vm_page_startup: 270 * 271 * Initializes the resident memory module. 272 * 273 * Allocates memory for the page cells, and 274 * for the object/offset-to-page hash table headers. 275 * Each page cell is initialized and placed on the free list. 276 */ 277vm_offset_t 278vm_page_startup(vm_offset_t vaddr) 279{ 280 vm_offset_t mapped; 281 vm_paddr_t page_range; 282 vm_paddr_t new_end; 283 int i; 284 vm_paddr_t pa; 285 vm_paddr_t last_pa; 286 char *list; 287 288 /* the biggest memory array is the second group of pages */ 289 vm_paddr_t end; 290 vm_paddr_t biggestsize; 291 vm_paddr_t low_water, high_water; 292 int biggestone; 293 294 biggestsize = 0; 295 biggestone = 0; 296 vaddr = round_page(vaddr); 297 298 for (i = 0; phys_avail[i + 1]; i += 2) { 299 phys_avail[i] = round_page(phys_avail[i]); 300 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); 301 } 302 303 low_water = phys_avail[0]; 304 high_water = phys_avail[1]; 305 306 for (i = 0; phys_avail[i + 1]; i += 2) { 307 vm_paddr_t size = phys_avail[i + 1] - phys_avail[i]; 308 309 if (size > biggestsize) { 310 biggestone = i; 311 biggestsize = size; 312 } 313 if (phys_avail[i] < low_water) 314 low_water = phys_avail[i]; 315 if (phys_avail[i + 1] > high_water) 316 high_water = phys_avail[i + 1]; 317 } 318 319#ifdef XEN 320 low_water = 0; 321#endif 322 323 end = phys_avail[biggestone+1]; 324 325 /* 326 * Initialize the locks. 
327 */ 328 mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF | 329 MTX_RECURSE); 330 mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL, 331 MTX_DEF); 332 333 /* Setup page locks. */ 334 for (i = 0; i < PA_LOCK_COUNT; i++) 335 mtx_init(&pa_lock[i].data, "page lock", NULL, 336 MTX_DEF | MTX_RECURSE | MTX_DUPOK); 337 338 /* 339 * Initialize the queue headers for the hold queue, the active queue, 340 * and the inactive queue. 341 */ 342 for (i = 0; i < PQ_COUNT; i++) 343 TAILQ_INIT(&vm_page_queues[i].pl); 344 vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count; 345 vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count; 346 vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count; 347 348 /* 349 * Allocate memory for use when boot strapping the kernel memory 350 * allocator. 351 */ 352 new_end = end - (boot_pages * UMA_SLAB_SIZE); 353 new_end = trunc_page(new_end); 354 mapped = pmap_map(&vaddr, new_end, end, 355 VM_PROT_READ | VM_PROT_WRITE); 356 bzero((void *)mapped, end - new_end); 357 uma_startup((void *)mapped, boot_pages); 358 359#if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \ 360 defined(__mips__) 361 /* 362 * Allocate a bitmap to indicate that a random physical page 363 * needs to be included in a minidump. 364 * 365 * The amd64 port needs this to indicate which direct map pages 366 * need to be dumped, via calls to dump_add_page()/dump_drop_page(). 367 * 368 * However, i386 still needs this workspace internally within the 369 * minidump code. In theory, they are not needed on i386, but are 370 * included should the sf_buf code decide to use them. 371 */ 372 last_pa = 0; 373 for (i = 0; dump_avail[i + 1] != 0; i += 2) 374 if (dump_avail[i + 1] > last_pa) 375 last_pa = dump_avail[i + 1]; 376 page_range = last_pa / PAGE_SIZE; 377 vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY); 378 new_end -= vm_page_dump_size; 379 vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, 380 new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); 381 bzero((void *)vm_page_dump, vm_page_dump_size); 382#endif 383#ifdef __amd64__ 384 /* 385 * Request that the physical pages underlying the message buffer be 386 * included in a crash dump. Since the message buffer is accessed 387 * through the direct map, they are not automatically included. 388 */ 389 pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr); 390 last_pa = pa + round_page(MSGBUF_SIZE); 391 while (pa < last_pa) { 392 dump_add_page(pa); 393 pa += PAGE_SIZE; 394 } 395#endif 396 /* 397 * Compute the number of pages of memory that will be available for 398 * use (taking into account the overhead of a page structure per 399 * page). 400 */ 401 first_page = low_water / PAGE_SIZE; 402#ifdef VM_PHYSSEG_SPARSE 403 page_range = 0; 404 for (i = 0; phys_avail[i + 1] != 0; i += 2) 405 page_range += atop(phys_avail[i + 1] - phys_avail[i]); 406#elif defined(VM_PHYSSEG_DENSE) 407 page_range = high_water / PAGE_SIZE - first_page; 408#else 409#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." 410#endif 411 end = new_end; 412 413 /* 414 * Reserve an unmapped guard page to trap access to vm_page_array[-1]. 415 */ 416 vaddr += PAGE_SIZE; 417 418 /* 419 * Initialize the mem entry structures now, and put them in the free 420 * queue. 
421 */ 422 new_end = trunc_page(end - page_range * sizeof(struct vm_page)); 423 mapped = pmap_map(&vaddr, new_end, end, 424 VM_PROT_READ | VM_PROT_WRITE); 425 vm_page_array = (vm_page_t) mapped; 426#if VM_NRESERVLEVEL > 0 427 /* 428 * Allocate memory for the reservation management system's data 429 * structures. 430 */ 431 new_end = vm_reserv_startup(&vaddr, new_end, high_water); 432#endif 433#if defined(__amd64__) 434 /* 435 * pmap_map on amd64 and mips can come out of the direct-map, not kvm 436 * like i386, so the pages must be tracked for a crashdump to include 437 * this data. This includes the vm_page_array and the early UMA 438 * bootstrap pages. 439 */ 440 for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE) 441 dump_add_page(pa); 442#endif 443 phys_avail[biggestone + 1] = new_end; 444 445 /* 446 * Clear all of the page structures 447 */ 448 bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page)); 449 for (i = 0; i < page_range; i++) 450 vm_page_array[i].order = VM_NFREEORDER; 451 vm_page_array_size = page_range; 452 453 /* 454 * Initialize the physical memory allocator. 455 */ 456 vm_phys_init(); 457 458 /* 459 * Add every available physical page that is not blacklisted to 460 * the free lists. 461 */ 462 cnt.v_page_count = 0; 463 cnt.v_free_count = 0; 464 list = getenv("vm.blacklist"); 465 for (i = 0; phys_avail[i + 1] != 0; i += 2) { 466 pa = phys_avail[i]; 467 last_pa = phys_avail[i + 1]; 468 while (pa < last_pa) { 469 if (list != NULL && 470 vm_page_blacklist_lookup(list, pa)) 471 printf("Skipping page with pa 0x%jx\n", 472 (uintmax_t)pa); 473 else 474 vm_phys_add_page(pa); 475 pa += PAGE_SIZE; 476 } 477 } 478 freeenv(list); 479#if VM_NRESERVLEVEL > 0 480 /* 481 * Initialize the reservation management system. 482 */ 483 vm_reserv_init(); 484#endif 485 return (vaddr); 486} 487 488void 489vm_page_flag_set(vm_page_t m, unsigned short bits) 490{ 491 492 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 493 /* 494 * The PG_WRITEABLE flag can only be set if the page is managed and 495 * VPO_BUSY. Currently, this flag is only set by pmap_enter(). 496 */ 497 KASSERT((bits & PG_WRITEABLE) == 0 || 498 ((m->flags & (PG_UNMANAGED | PG_FICTITIOUS)) == 0 && 499 (m->oflags & VPO_BUSY) != 0), ("PG_WRITEABLE and !VPO_BUSY")); 500 m->flags |= bits; 501} 502 503void 504vm_page_flag_clear(vm_page_t m, unsigned short bits) 505{ 506 507 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 508 /* 509 * The PG_REFERENCED flag can only be cleared if the object 510 * containing the page is locked. 511 */ 512 KASSERT((bits & PG_REFERENCED) == 0 || VM_OBJECT_LOCKED(m->object), 513 ("PG_REFERENCED and !VM_OBJECT_LOCKED")); 514 m->flags &= ~bits; 515} 516 517void 518vm_page_busy(vm_page_t m) 519{ 520 521 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 522 KASSERT((m->oflags & VPO_BUSY) == 0, 523 ("vm_page_busy: page already busy!!!")); 524 m->oflags |= VPO_BUSY; 525} 526 527/* 528 * vm_page_flash: 529 * 530 * wakeup anyone waiting for the page. 531 */ 532void 533vm_page_flash(vm_page_t m) 534{ 535 536 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 537 if (m->oflags & VPO_WANTED) { 538 m->oflags &= ~VPO_WANTED; 539 wakeup(m); 540 } 541} 542 543/* 544 * vm_page_wakeup: 545 * 546 * clear the VPO_BUSY flag and wakeup anyone waiting for the 547 * page. 
548 * 549 */ 550void 551vm_page_wakeup(vm_page_t m) 552{ 553 554 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 555 KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!")); 556 m->oflags &= ~VPO_BUSY; 557 vm_page_flash(m); 558} 559 560void 561vm_page_io_start(vm_page_t m) 562{ 563 564 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 565 m->busy++; 566} 567 568void 569vm_page_io_finish(vm_page_t m) 570{ 571 572 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 573 m->busy--; 574 if (m->busy == 0) 575 vm_page_flash(m); 576} 577 578/* 579 * Keep page from being freed by the page daemon 580 * much of the same effect as wiring, except much lower 581 * overhead and should be used only for *very* temporary 582 * holding ("wiring"). 583 */ 584void 585vm_page_hold(vm_page_t mem) 586{ 587 588 vm_page_lock_assert(mem, MA_OWNED); 589 mem->hold_count++; 590} 591 592void 593vm_page_unhold(vm_page_t mem) 594{ 595 596 vm_page_lock_assert(mem, MA_OWNED); 597 --mem->hold_count; 598 KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!")); 599 if (mem->hold_count == 0 && mem->queue == PQ_HOLD) 600 vm_page_free_toq(mem); 601} 602 603/* 604 * vm_page_free: 605 * 606 * Free a page. 607 */ 608void 609vm_page_free(vm_page_t m) 610{ 611 612 m->flags &= ~PG_ZERO; 613 vm_page_free_toq(m); 614} 615 616/* 617 * vm_page_free_zero: 618 * 619 * Free a page to the zerod-pages queue 620 */ 621void 622vm_page_free_zero(vm_page_t m) 623{ 624 625 m->flags |= PG_ZERO; 626 vm_page_free_toq(m); 627} 628 629/* 630 * vm_page_sleep: 631 * 632 * Sleep and release the page and page queues locks. 633 * 634 * The object containing the given page must be locked. 635 */ 636void 637vm_page_sleep(vm_page_t m, const char *msg) 638{ 639 640 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 641 if (mtx_owned(&vm_page_queue_mtx)) 642 vm_page_unlock_queues(); 643 if (mtx_owned(vm_page_lockptr(m))) 644 vm_page_unlock(m); 645 646 /* 647 * It's possible that while we sleep, the page will get 648 * unbusied and freed. If we are holding the object 649 * lock, we will assume we hold a reference to the object 650 * such that even if m->object changes, we can re-lock 651 * it. 652 */ 653 m->oflags |= VPO_WANTED; 654 msleep(m, VM_OBJECT_MTX(m->object), PVM, msg, 0); 655} 656 657/* 658 * vm_page_dirty: 659 * 660 * make page all dirty 661 */ 662void 663vm_page_dirty(vm_page_t m) 664{ 665 666 KASSERT((m->flags & PG_CACHED) == 0, 667 ("vm_page_dirty: page in cache!")); 668 KASSERT(!VM_PAGE_IS_FREE(m), 669 ("vm_page_dirty: page is free!")); 670 KASSERT(m->valid == VM_PAGE_BITS_ALL, 671 ("vm_page_dirty: page is invalid!")); 672 m->dirty = VM_PAGE_BITS_ALL; 673} 674 675/* 676 * vm_page_splay: 677 * 678 * Implements Sleator and Tarjan's top-down splay algorithm. Returns 679 * the vm_page containing the given pindex. If, however, that 680 * pindex is not found in the vm_object, returns a vm_page that is 681 * adjacent to the pindex, coming before or after it. 682 */ 683vm_page_t 684vm_page_splay(vm_pindex_t pindex, vm_page_t root) 685{ 686 struct vm_page dummy; 687 vm_page_t lefttreemax, righttreemin, y; 688 689 if (root == NULL) 690 return (root); 691 lefttreemax = righttreemin = &dummy; 692 for (;; root = y) { 693 if (pindex < root->pindex) { 694 if ((y = root->left) == NULL) 695 break; 696 if (pindex < y->pindex) { 697 /* Rotate right. */ 698 root->left = y->right; 699 y->right = root; 700 root = y; 701 if ((y = root->left) == NULL) 702 break; 703 } 704 /* Link into the new root's right tree. 
*/ 705 righttreemin->left = root; 706 righttreemin = root; 707 } else if (pindex > root->pindex) { 708 if ((y = root->right) == NULL) 709 break; 710 if (pindex > y->pindex) { 711 /* Rotate left. */ 712 root->right = y->left; 713 y->left = root; 714 root = y; 715 if ((y = root->right) == NULL) 716 break; 717 } 718 /* Link into the new root's left tree. */ 719 lefttreemax->right = root; 720 lefttreemax = root; 721 } else 722 break; 723 } 724 /* Assemble the new root. */ 725 lefttreemax->right = root->left; 726 righttreemin->left = root->right; 727 root->left = dummy.right; 728 root->right = dummy.left; 729 return (root); 730} 731 732/* 733 * vm_page_insert: [ internal use only ] 734 * 735 * Inserts the given mem entry into the object and object list. 736 * 737 * The pagetables are not updated but will presumably fault the page 738 * in if necessary, or if a kernel page the caller will at some point 739 * enter the page into the kernel's pmap. We are not allowed to block 740 * here so we *can't* do this anyway. 741 * 742 * The object and page must be locked. 743 * This routine may not block. 744 */ 745void 746vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) 747{ 748 vm_page_t root; 749 750 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 751 if (m->object != NULL) 752 panic("vm_page_insert: page already inserted"); 753 754 /* 755 * Record the object/offset pair in this page 756 */ 757 m->object = object; 758 m->pindex = pindex; 759 760 /* 761 * Now link into the object's ordered list of backed pages. 762 */ 763 root = object->root; 764 if (root == NULL) { 765 m->left = NULL; 766 m->right = NULL; 767 TAILQ_INSERT_TAIL(&object->memq, m, listq); 768 } else { 769 root = vm_page_splay(pindex, root); 770 if (pindex < root->pindex) { 771 m->left = root->left; 772 m->right = root; 773 root->left = NULL; 774 TAILQ_INSERT_BEFORE(root, m, listq); 775 } else if (pindex == root->pindex) 776 panic("vm_page_insert: offset already allocated"); 777 else { 778 m->right = root->right; 779 m->left = root; 780 root->right = NULL; 781 TAILQ_INSERT_AFTER(&object->memq, root, m, listq); 782 } 783 } 784 object->root = m; 785 object->generation++; 786 787 /* 788 * show that the object has one more resident page. 789 */ 790 object->resident_page_count++; 791 /* 792 * Hold the vnode until the last page is released. 793 */ 794 if (object->resident_page_count == 1 && object->type == OBJT_VNODE) 795 vhold((struct vnode *)object->handle); 796 797 /* 798 * Since we are inserting a new and possibly dirty page, 799 * update the object's OBJ_MIGHTBEDIRTY flag. 800 */ 801 if (m->flags & PG_WRITEABLE) 802 vm_object_set_writeable_dirty(object); 803} 804 805/* 806 * vm_page_remove: 807 * NOTE: used by device pager as well -wfj 808 * 809 * Removes the given mem entry from the object/offset-page 810 * table and the object page list, but do not invalidate/terminate 811 * the backing store. 812 * 813 * The object and page must be locked. 814 * The underlying pmap entry (if any) is NOT removed here. 815 * This routine may not block. 816 */ 817void 818vm_page_remove(vm_page_t m) 819{ 820 vm_object_t object; 821 vm_page_t root; 822 823 if ((m->flags & PG_UNMANAGED) == 0) 824 vm_page_lock_assert(m, MA_OWNED); 825 if ((object = m->object) == NULL) 826 return; 827 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 828 if (m->oflags & VPO_BUSY) { 829 m->oflags &= ~VPO_BUSY; 830 vm_page_flash(m); 831 } 832 833 /* 834 * Now remove from the object's list of backed pages. 
835 */ 836 if (m != object->root) 837 vm_page_splay(m->pindex, object->root); 838 if (m->left == NULL) 839 root = m->right; 840 else { 841 root = vm_page_splay(m->pindex, m->left); 842 root->right = m->right; 843 } 844 object->root = root; 845 TAILQ_REMOVE(&object->memq, m, listq); 846 847 /* 848 * And show that the object has one fewer resident page. 849 */ 850 object->resident_page_count--; 851 /* 852 * The vnode may now be recycled. 853 */ 854 if (object->resident_page_count == 0 && object->type == OBJT_VNODE) 855 vdrop((struct vnode *)object->handle); 856 857 m->object = NULL; 858} 859 860/* 861 * vm_page_lookup: 862 * 863 * Returns the page associated with the object/offset 864 * pair specified; if none is found, NULL is returned. 865 * 866 * The object must be locked. 867 * This routine may not block. 868 * This is a critical path routine 869 */ 870vm_page_t 871vm_page_lookup(vm_object_t object, vm_pindex_t pindex) 872{ 873 vm_page_t m; 874 875 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 876 if ((m = object->root) != NULL && m->pindex != pindex) { 877 m = vm_page_splay(pindex, m); 878 if ((object->root = m)->pindex != pindex) 879 m = NULL; 880 } 881 return (m); 882} 883 884/* 885 * vm_page_find_least: 886 * 887 * Returns the page associated with the object with least pindex 888 * greater than or equal to the parameter pindex, or NULL. 889 * 890 * The object must be locked. 891 * The routine may not block. 892 */ 893vm_page_t 894vm_page_find_least(vm_object_t object, vm_pindex_t pindex) 895{ 896 vm_page_t m; 897 898 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 899 if ((m = TAILQ_FIRST(&object->memq)) != NULL) { 900 if (m->pindex < pindex) { 901 m = vm_page_splay(pindex, object->root); 902 if ((object->root = m)->pindex < pindex) 903 m = TAILQ_NEXT(m, listq); 904 } 905 } 906 return (m); 907} 908 909/* 910 * Returns the given page's successor (by pindex) within the object if it is 911 * resident; if none is found, NULL is returned. 912 * 913 * The object must be locked. 914 */ 915vm_page_t 916vm_page_next(vm_page_t m) 917{ 918 vm_page_t next; 919 920 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 921 if ((next = TAILQ_NEXT(m, listq)) != NULL && 922 next->pindex != m->pindex + 1) 923 next = NULL; 924 return (next); 925} 926 927/* 928 * Returns the given page's predecessor (by pindex) within the object if it is 929 * resident; if none is found, NULL is returned. 930 * 931 * The object must be locked. 932 */ 933vm_page_t 934vm_page_prev(vm_page_t m) 935{ 936 vm_page_t prev; 937 938 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 939 if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL && 940 prev->pindex != m->pindex - 1) 941 prev = NULL; 942 return (prev); 943} 944 945/* 946 * vm_page_rename: 947 * 948 * Move the given memory entry from its 949 * current object to the specified target object/offset. 950 * 951 * The object must be locked. 952 * This routine may not block. 953 * 954 * Note: swap associated with the page must be invalidated by the move. We 955 * have to do this for several reasons: (1) we aren't freeing the 956 * page, (2) we are dirtying the page, (3) the VM system is probably 957 * moving the page from object A to B, and will then later move 958 * the backing store from A to B and we can't have a conflict. 959 * 960 * Note: we *always* dirty the page. It is necessary both for the 961 * fact that we moved it, and because we may be invalidating 962 * swap. If the page is on the cache, we have to deactivate it 963 * or vm_page_dirty() will panic. 
Dirty pages are not allowed 964 * on the cache. 965 */ 966void 967vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) 968{ 969 970 vm_page_remove(m); 971 vm_page_insert(m, new_object, new_pindex); 972 vm_page_dirty(m); 973} 974 975/* 976 * Convert all of the given object's cached pages that have a 977 * pindex within the given range into free pages. If the value 978 * zero is given for "end", then the range's upper bound is 979 * infinity. If the given object is backed by a vnode and it 980 * transitions from having one or more cached pages to none, the 981 * vnode's hold count is reduced. 982 */ 983void 984vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end) 985{ 986 vm_page_t m, m_next; 987 boolean_t empty; 988 989 mtx_lock(&vm_page_queue_free_mtx); 990 if (__predict_false(object->cache == NULL)) { 991 mtx_unlock(&vm_page_queue_free_mtx); 992 return; 993 } 994 m = object->cache = vm_page_splay(start, object->cache); 995 if (m->pindex < start) { 996 if (m->right == NULL) 997 m = NULL; 998 else { 999 m_next = vm_page_splay(start, m->right); 1000 m_next->left = m; 1001 m->right = NULL; 1002 m = object->cache = m_next; 1003 } 1004 } 1005 1006 /* 1007 * At this point, "m" is either (1) a reference to the page 1008 * with the least pindex that is greater than or equal to 1009 * "start" or (2) NULL. 1010 */ 1011 for (; m != NULL && (m->pindex < end || end == 0); m = m_next) { 1012 /* 1013 * Find "m"'s successor and remove "m" from the 1014 * object's cache. 1015 */ 1016 if (m->right == NULL) { 1017 object->cache = m->left; 1018 m_next = NULL; 1019 } else { 1020 m_next = vm_page_splay(start, m->right); 1021 m_next->left = m->left; 1022 object->cache = m_next; 1023 } 1024 /* Convert "m" to a free page. */ 1025 m->object = NULL; 1026 m->valid = 0; 1027 /* Clear PG_CACHED and set PG_FREE. */ 1028 m->flags ^= PG_CACHED | PG_FREE; 1029 KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE, 1030 ("vm_page_cache_free: page %p has inconsistent flags", m)); 1031 cnt.v_cache_count--; 1032 cnt.v_free_count++; 1033 } 1034 empty = object->cache == NULL; 1035 mtx_unlock(&vm_page_queue_free_mtx); 1036 if (object->type == OBJT_VNODE && empty) 1037 vdrop(object->handle); 1038} 1039 1040/* 1041 * Returns the cached page that is associated with the given 1042 * object and offset. If, however, none exists, returns NULL. 1043 * 1044 * The free page queue must be locked. 1045 */ 1046static inline vm_page_t 1047vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex) 1048{ 1049 vm_page_t m; 1050 1051 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); 1052 if ((m = object->cache) != NULL && m->pindex != pindex) { 1053 m = vm_page_splay(pindex, m); 1054 if ((object->cache = m)->pindex != pindex) 1055 m = NULL; 1056 } 1057 return (m); 1058} 1059 1060/* 1061 * Remove the given cached page from its containing object's 1062 * collection of cached pages. 1063 * 1064 * The free page queue must be locked. 
1065 */ 1066void 1067vm_page_cache_remove(vm_page_t m) 1068{ 1069 vm_object_t object; 1070 vm_page_t root; 1071 1072 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); 1073 KASSERT((m->flags & PG_CACHED) != 0, 1074 ("vm_page_cache_remove: page %p is not cached", m)); 1075 object = m->object; 1076 if (m != object->cache) { 1077 root = vm_page_splay(m->pindex, object->cache); 1078 KASSERT(root == m, 1079 ("vm_page_cache_remove: page %p is not cached in object %p", 1080 m, object)); 1081 } 1082 if (m->left == NULL) 1083 root = m->right; 1084 else if (m->right == NULL) 1085 root = m->left; 1086 else { 1087 root = vm_page_splay(m->pindex, m->left); 1088 root->right = m->right; 1089 } 1090 object->cache = root; 1091 m->object = NULL; 1092 cnt.v_cache_count--; 1093} 1094 1095/* 1096 * Transfer all of the cached pages with offset greater than or 1097 * equal to 'offidxstart' from the original object's cache to the 1098 * new object's cache. However, any cached pages with offset 1099 * greater than or equal to the new object's size are kept in the 1100 * original object. Initially, the new object's cache must be 1101 * empty. Offset 'offidxstart' in the original object must 1102 * correspond to offset zero in the new object. 1103 * 1104 * The new object must be locked. 1105 */ 1106void 1107vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart, 1108 vm_object_t new_object) 1109{ 1110 vm_page_t m, m_next; 1111 1112 /* 1113 * Insertion into an object's collection of cached pages 1114 * requires the object to be locked. In contrast, removal does 1115 * not. 1116 */ 1117 VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED); 1118 KASSERT(new_object->cache == NULL, 1119 ("vm_page_cache_transfer: object %p has cached pages", 1120 new_object)); 1121 mtx_lock(&vm_page_queue_free_mtx); 1122 if ((m = orig_object->cache) != NULL) { 1123 /* 1124 * Transfer all of the pages with offset greater than or 1125 * equal to 'offidxstart' from the original object's 1126 * cache to the new object's cache. 1127 */ 1128 m = vm_page_splay(offidxstart, m); 1129 if (m->pindex < offidxstart) { 1130 orig_object->cache = m; 1131 new_object->cache = m->right; 1132 m->right = NULL; 1133 } else { 1134 orig_object->cache = m->left; 1135 new_object->cache = m; 1136 m->left = NULL; 1137 } 1138 while ((m = new_object->cache) != NULL) { 1139 if ((m->pindex - offidxstart) >= new_object->size) { 1140 /* 1141 * Return all of the cached pages with 1142 * offset greater than or equal to the 1143 * new object's size to the original 1144 * object's cache. 1145 */ 1146 new_object->cache = m->left; 1147 m->left = orig_object->cache; 1148 orig_object->cache = m; 1149 break; 1150 } 1151 m_next = vm_page_splay(m->pindex, m->right); 1152 /* Update the page's object and offset. */ 1153 m->object = new_object; 1154 m->pindex -= offidxstart; 1155 if (m_next == NULL) 1156 break; 1157 m->right = NULL; 1158 m_next->left = m; 1159 new_object->cache = m_next; 1160 } 1161 KASSERT(new_object->cache == NULL || 1162 new_object->type == OBJT_SWAP, 1163 ("vm_page_cache_transfer: object %p's type is incompatible" 1164 " with cached pages", new_object)); 1165 } 1166 mtx_unlock(&vm_page_queue_free_mtx); 1167} 1168 1169/* 1170 * vm_page_alloc: 1171 * 1172 * Allocate and return a memory cell associated 1173 * with this VM object/offset pair. 1174 * 1175 * The caller must always specify an allocation class. 
1176 * 1177 * allocation classes: 1178 * VM_ALLOC_NORMAL normal process request 1179 * VM_ALLOC_SYSTEM system *really* needs a page 1180 * VM_ALLOC_INTERRUPT interrupt time request 1181 * 1182 * optional allocation flags: 1183 * VM_ALLOC_ZERO prefer a zeroed page 1184 * VM_ALLOC_WIRED wire the allocated page 1185 * VM_ALLOC_NOOBJ page is not associated with a vm object 1186 * VM_ALLOC_NOBUSY do not set the page busy 1187 * VM_ALLOC_IFCACHED return page only if it is cached 1188 * VM_ALLOC_IFNOTCACHED return NULL, do not reactivate if the page 1189 * is cached 1190 * 1191 * This routine may not sleep. 1192 */ 1193vm_page_t 1194vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) 1195{ 1196 struct vnode *vp = NULL; 1197 vm_object_t m_object; 1198 vm_page_t m; 1199 int flags, page_req; 1200 1201 page_req = req & VM_ALLOC_CLASS_MASK; 1202 KASSERT(curthread->td_intr_nesting_level == 0 || 1203 page_req == VM_ALLOC_INTERRUPT, 1204 ("vm_page_alloc(NORMAL|SYSTEM) in interrupt context")); 1205 1206 if ((req & VM_ALLOC_NOOBJ) == 0) { 1207 KASSERT(object != NULL, 1208 ("vm_page_alloc: NULL object.")); 1209 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 1210 } 1211 1212 /* 1213 * The pager is allowed to eat deeper into the free page list. 1214 */ 1215 if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) { 1216 page_req = VM_ALLOC_SYSTEM; 1217 }; 1218 1219 mtx_lock(&vm_page_queue_free_mtx); 1220 if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved || 1221 (page_req == VM_ALLOC_SYSTEM && 1222 cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) || 1223 (page_req == VM_ALLOC_INTERRUPT && 1224 cnt.v_free_count + cnt.v_cache_count > 0)) { 1225 /* 1226 * Allocate from the free queue if the number of free pages 1227 * exceeds the minimum for the request class. 1228 */ 1229 if (object != NULL && 1230 (m = vm_page_cache_lookup(object, pindex)) != NULL) { 1231 if ((req & VM_ALLOC_IFNOTCACHED) != 0) { 1232 mtx_unlock(&vm_page_queue_free_mtx); 1233 return (NULL); 1234 } 1235 if (vm_phys_unfree_page(m)) 1236 vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0); 1237#if VM_NRESERVLEVEL > 0 1238 else if (!vm_reserv_reactivate_page(m)) 1239#else 1240 else 1241#endif 1242 panic("vm_page_alloc: cache page %p is missing" 1243 " from the free queue", m); 1244 } else if ((req & VM_ALLOC_IFCACHED) != 0) { 1245 mtx_unlock(&vm_page_queue_free_mtx); 1246 return (NULL); 1247#if VM_NRESERVLEVEL > 0 1248 } else if (object == NULL || object->type == OBJT_DEVICE || 1249 object->type == OBJT_SG || 1250 (object->flags & OBJ_COLORED) == 0 || 1251 (m = vm_reserv_alloc_page(object, pindex)) == NULL) { 1252#else 1253 } else { 1254#endif 1255 m = vm_phys_alloc_pages(object != NULL ? 1256 VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); 1257#if VM_NRESERVLEVEL > 0 1258 if (m == NULL && vm_reserv_reclaim_inactive()) { 1259 m = vm_phys_alloc_pages(object != NULL ? 1260 VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 1261 0); 1262 } 1263#endif 1264 } 1265 } else { 1266 /* 1267 * Not allocatable, give up. 1268 */ 1269 mtx_unlock(&vm_page_queue_free_mtx); 1270 atomic_add_int(&vm_pageout_deficit, 1271 MAX((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); 1272 pagedaemon_wakeup(); 1273 return (NULL); 1274 } 1275 1276 /* 1277 * At this point we had better have found a good page. 
1278 */ 1279 1280 KASSERT(m != NULL, ("vm_page_alloc: missing page")); 1281 KASSERT(m->queue == PQ_NONE, 1282 ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue)); 1283 KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m)); 1284 KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m)); 1285 KASSERT(m->busy == 0, ("vm_page_alloc: page %p is busy", m)); 1286 KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m)); 1287 KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, 1288 ("vm_page_alloc: page %p has unexpected memattr %d", m, 1289 pmap_page_get_memattr(m))); 1290 if ((m->flags & PG_CACHED) != 0) { 1291 KASSERT(m->valid != 0, 1292 ("vm_page_alloc: cached page %p is invalid", m)); 1293 if (m->object == object && m->pindex == pindex) 1294 cnt.v_reactivated++; 1295 else 1296 m->valid = 0; 1297 m_object = m->object; 1298 vm_page_cache_remove(m); 1299 if (m_object->type == OBJT_VNODE && m_object->cache == NULL) 1300 vp = m_object->handle; 1301 } else { 1302 KASSERT(VM_PAGE_IS_FREE(m), 1303 ("vm_page_alloc: page %p is not free", m)); 1304 KASSERT(m->valid == 0, 1305 ("vm_page_alloc: free page %p is valid", m)); 1306 cnt.v_free_count--; 1307 } 1308 1309 /* 1310 * Initialize structure. Only the PG_ZERO flag is inherited. 1311 */ 1312 flags = 0; 1313 if (m->flags & PG_ZERO) { 1314 vm_page_zero_count--; 1315 if (req & VM_ALLOC_ZERO) 1316 flags = PG_ZERO; 1317 } 1318 if (object == NULL || object->type == OBJT_PHYS) 1319 flags |= PG_UNMANAGED; 1320 m->flags = flags; 1321 if (req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ)) 1322 m->oflags = 0; 1323 else 1324 m->oflags = VPO_BUSY; 1325 if (req & VM_ALLOC_WIRED) { 1326 atomic_add_int(&cnt.v_wire_count, 1); 1327 m->wire_count = 1; 1328 } 1329 m->act_count = 0; 1330 mtx_unlock(&vm_page_queue_free_mtx); 1331 1332 if (object != NULL) { 1333 /* Ignore device objects; the pager sets "memattr" for them. */ 1334 if (object->memattr != VM_MEMATTR_DEFAULT && 1335 object->type != OBJT_DEVICE && object->type != OBJT_SG) 1336 pmap_page_set_memattr(m, object->memattr); 1337 vm_page_insert(m, object, pindex); 1338 } else 1339 m->pindex = pindex; 1340 1341 /* 1342 * The following call to vdrop() must come after the above call 1343 * to vm_page_insert() in case both affect the same object and 1344 * vnode. Otherwise, the affected vnode's hold count could 1345 * temporarily become zero. 1346 */ 1347 if (vp != NULL) 1348 vdrop(vp); 1349 1350 /* 1351 * Don't wakeup too often - wakeup the pageout daemon when 1352 * we would be nearly out of memory. 1353 */ 1354 if (vm_paging_needed()) 1355 pagedaemon_wakeup(); 1356 1357 return (m); 1358} 1359 1360/* 1361 * Initialize a page that has been freshly dequeued from a freelist. 1362 * The caller has to drop the vnode returned, if it is not NULL. 1363 * 1364 * To be called with vm_page_queue_free_mtx held. 
1365 */ 1366struct vnode * 1367vm_page_alloc_init(vm_page_t m) 1368{ 1369 struct vnode *drop; 1370 vm_object_t m_object; 1371 1372 KASSERT(m->queue == PQ_NONE, 1373 ("vm_page_alloc_init: page %p has unexpected queue %d", 1374 m, m->queue)); 1375 KASSERT(m->wire_count == 0, 1376 ("vm_page_alloc_init: page %p is wired", m)); 1377 KASSERT(m->hold_count == 0, 1378 ("vm_page_alloc_init: page %p is held", m)); 1379 KASSERT(m->busy == 0, 1380 ("vm_page_alloc_init: page %p is busy", m)); 1381 KASSERT(m->dirty == 0, 1382 ("vm_page_alloc_init: page %p is dirty", m)); 1383 KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, 1384 ("vm_page_alloc_init: page %p has unexpected memattr %d", 1385 m, pmap_page_get_memattr(m))); 1386 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); 1387 drop = NULL; 1388 if ((m->flags & PG_CACHED) != 0) { 1389 m->valid = 0; 1390 m_object = m->object; 1391 vm_page_cache_remove(m); 1392 if (m_object->type == OBJT_VNODE && 1393 m_object->cache == NULL) 1394 drop = m_object->handle; 1395 } else { 1396 KASSERT(VM_PAGE_IS_FREE(m), 1397 ("vm_page_alloc_init: page %p is not free", m)); 1398 KASSERT(m->valid == 0, 1399 ("vm_page_alloc_init: free page %p is valid", m)); 1400 cnt.v_free_count--; 1401 } 1402 if (m->flags & PG_ZERO) 1403 vm_page_zero_count--; 1404 /* Don't clear the PG_ZERO flag; we'll need it later. */ 1405 m->flags = PG_UNMANAGED | (m->flags & PG_ZERO); 1406 m->oflags = 0; 1407 /* Unmanaged pages don't use "act_count". */ 1408 return (drop); 1409} 1410 1411/* 1412 * vm_page_alloc_freelist: 1413 * 1414 * Allocate a page from the specified freelist. 1415 * Only the ALLOC_CLASS values in req are honored, other request flags 1416 * are ignored. 1417 */ 1418vm_page_t 1419vm_page_alloc_freelist(int flind, int req) 1420{ 1421 struct vnode *drop; 1422 vm_page_t m; 1423 int page_req; 1424 1425 m = NULL; 1426 page_req = req & VM_ALLOC_CLASS_MASK; 1427 mtx_lock(&vm_page_queue_free_mtx); 1428 /* 1429 * Do not allocate reserved pages unless the req has asked for it. 1430 */ 1431 if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved || 1432 (page_req == VM_ALLOC_SYSTEM && 1433 cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) || 1434 (page_req == VM_ALLOC_INTERRUPT && 1435 cnt.v_free_count + cnt.v_cache_count > 0)) { 1436 m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0); 1437 } 1438 if (m == NULL) { 1439 mtx_unlock(&vm_page_queue_free_mtx); 1440 return (NULL); 1441 } 1442 drop = vm_page_alloc_init(m); 1443 mtx_unlock(&vm_page_queue_free_mtx); 1444 if (drop) 1445 vdrop(drop); 1446 return (m); 1447} 1448 1449/* 1450 * vm_wait: (also see VM_WAIT macro) 1451 * 1452 * Block until free pages are available for allocation 1453 * - Called in various places before memory allocations. 1454 */ 1455void 1456vm_wait(void) 1457{ 1458 1459 mtx_lock(&vm_page_queue_free_mtx); 1460 if (curproc == pageproc) { 1461 vm_pageout_pages_needed = 1; 1462 msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx, 1463 PDROP | PSWP, "VMWait", 0); 1464 } else { 1465 if (!vm_pages_needed) { 1466 vm_pages_needed = 1; 1467 wakeup(&vm_pages_needed); 1468 } 1469 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM, 1470 "vmwait", 0); 1471 } 1472} 1473 1474/* 1475 * vm_waitpfault: (also see VM_WAITPFAULT macro) 1476 * 1477 * Block until free pages are available for allocation 1478 * - Called only in vm_fault so that processes page faulting 1479 * can be easily tracked. 
1480 * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing 1481 * processes will be able to grab memory first. Do not change 1482 * this balance without careful testing first. 1483 */ 1484void 1485vm_waitpfault(void) 1486{ 1487 1488 mtx_lock(&vm_page_queue_free_mtx); 1489 if (!vm_pages_needed) { 1490 vm_pages_needed = 1; 1491 wakeup(&vm_pages_needed); 1492 } 1493 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER, 1494 "pfault", 0); 1495} 1496 1497/* 1498 * vm_page_requeue: 1499 * 1500 * Move the given page to the tail of its present page queue. 1501 * 1502 * The page queues must be locked. 1503 */ 1504void 1505vm_page_requeue(vm_page_t m) 1506{ 1507 struct vpgqueues *vpq; 1508 int queue; 1509 1510 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1511 queue = m->queue; 1512 KASSERT(queue != PQ_NONE, 1513 ("vm_page_requeue: page %p is not queued", m)); 1514 vpq = &vm_page_queues[queue]; 1515 TAILQ_REMOVE(&vpq->pl, m, pageq); 1516 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq); 1517} 1518 1519/* 1520 * vm_page_queue_remove: 1521 * 1522 * Remove the given page from the specified queue. 1523 * 1524 * The page and page queues must be locked. 1525 */ 1526static __inline void 1527vm_page_queue_remove(int queue, vm_page_t m) 1528{ 1529 struct vpgqueues *pq; 1530 1531 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1532 vm_page_lock_assert(m, MA_OWNED); 1533 pq = &vm_page_queues[queue]; 1534 TAILQ_REMOVE(&pq->pl, m, pageq); 1535 (*pq->cnt)--; 1536} 1537 1538/* 1539 * vm_pageq_remove: 1540 * 1541 * Remove a page from its queue. 1542 * 1543 * The given page must be locked. 1544 * This routine may not block. 1545 */ 1546void 1547vm_pageq_remove(vm_page_t m) 1548{ 1549 int queue; 1550 1551 vm_page_lock_assert(m, MA_OWNED); 1552 if ((queue = m->queue) != PQ_NONE) { 1553 vm_page_lock_queues(); 1554 m->queue = PQ_NONE; 1555 vm_page_queue_remove(queue, m); 1556 vm_page_unlock_queues(); 1557 } 1558} 1559 1560/* 1561 * vm_page_enqueue: 1562 * 1563 * Add the given page to the specified queue. 1564 * 1565 * The page queues must be locked. 1566 */ 1567static void 1568vm_page_enqueue(int queue, vm_page_t m) 1569{ 1570 struct vpgqueues *vpq; 1571 1572 vpq = &vm_page_queues[queue]; 1573 m->queue = queue; 1574 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq); 1575 ++*vpq->cnt; 1576} 1577 1578/* 1579 * vm_page_activate: 1580 * 1581 * Put the specified page on the active list (if appropriate). 1582 * Ensure that act_count is at least ACT_INIT but do not otherwise 1583 * mess with it. 1584 * 1585 * The page must be locked. 1586 * This routine may not block. 1587 */ 1588void 1589vm_page_activate(vm_page_t m) 1590{ 1591 int queue; 1592 1593 vm_page_lock_assert(m, MA_OWNED); 1594 if ((queue = m->queue) != PQ_ACTIVE) { 1595 if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) { 1596 if (m->act_count < ACT_INIT) 1597 m->act_count = ACT_INIT; 1598 vm_page_lock_queues(); 1599 if (queue != PQ_NONE) 1600 vm_page_queue_remove(queue, m); 1601 vm_page_enqueue(PQ_ACTIVE, m); 1602 vm_page_unlock_queues(); 1603 } else 1604 KASSERT(queue == PQ_NONE, 1605 ("vm_page_activate: wired page %p is queued", m)); 1606 } else { 1607 if (m->act_count < ACT_INIT) 1608 m->act_count = ACT_INIT; 1609 } 1610} 1611 1612/* 1613 * vm_page_free_wakeup: 1614 * 1615 * Helper routine for vm_page_free_toq() and vm_page_cache(). This 1616 * routine is called when a page has been added to the cache or free 1617 * queues. 1618 * 1619 * The page queues must be locked. 1620 * This routine may not block. 
1621 */ 1622static inline void 1623vm_page_free_wakeup(void) 1624{ 1625 1626 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); 1627 /* 1628 * if pageout daemon needs pages, then tell it that there are 1629 * some free. 1630 */ 1631 if (vm_pageout_pages_needed && 1632 cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) { 1633 wakeup(&vm_pageout_pages_needed); 1634 vm_pageout_pages_needed = 0; 1635 } 1636 /* 1637 * wakeup processes that are waiting on memory if we hit a 1638 * high water mark. And wakeup scheduler process if we have 1639 * lots of memory. this process will swapin processes. 1640 */ 1641 if (vm_pages_needed && !vm_page_count_min()) { 1642 vm_pages_needed = 0; 1643 wakeup(&cnt.v_free_count); 1644 } 1645} 1646 1647/* 1648 * vm_page_free_toq: 1649 * 1650 * Returns the given page to the free list, 1651 * disassociating it with any VM object. 1652 * 1653 * Object and page must be locked prior to entry. 1654 * This routine may not block. 1655 */ 1656 1657void 1658vm_page_free_toq(vm_page_t m) 1659{ 1660 1661 if ((m->flags & PG_UNMANAGED) == 0) { 1662 vm_page_lock_assert(m, MA_OWNED); 1663 KASSERT(!pmap_page_is_mapped(m), 1664 ("vm_page_free_toq: freeing mapped page %p", m)); 1665 } 1666 PCPU_INC(cnt.v_tfree); 1667 1668 if (VM_PAGE_IS_FREE(m)) 1669 panic("vm_page_free: freeing free page %p", m); 1670 else if (m->busy != 0) 1671 panic("vm_page_free: freeing busy page %p", m); 1672 1673 /* 1674 * unqueue, then remove page. Note that we cannot destroy 1675 * the page here because we do not want to call the pager's 1676 * callback routine until after we've put the page on the 1677 * appropriate free queue. 1678 */ 1679 if ((m->flags & PG_UNMANAGED) == 0) 1680 vm_pageq_remove(m); 1681 vm_page_remove(m); 1682 1683 /* 1684 * If fictitious remove object association and 1685 * return, otherwise delay object association removal. 1686 */ 1687 if ((m->flags & PG_FICTITIOUS) != 0) { 1688 return; 1689 } 1690 1691 m->valid = 0; 1692 vm_page_undirty(m); 1693 1694 if (m->wire_count != 0) 1695 panic("vm_page_free: freeing wired page %p", m); 1696 if (m->hold_count != 0) { 1697 m->flags &= ~PG_ZERO; 1698 vm_page_lock_queues(); 1699 vm_page_enqueue(PQ_HOLD, m); 1700 vm_page_unlock_queues(); 1701 } else { 1702 /* 1703 * Restore the default memory attribute to the page. 1704 */ 1705 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) 1706 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); 1707 1708 /* 1709 * Insert the page into the physical memory allocator's 1710 * cache/free page queues. 1711 */ 1712 mtx_lock(&vm_page_queue_free_mtx); 1713 m->flags |= PG_FREE; 1714 cnt.v_free_count++; 1715#if VM_NRESERVLEVEL > 0 1716 if (!vm_reserv_free_page(m)) 1717#else 1718 if (TRUE) 1719#endif 1720 vm_phys_free_pages(m, 0); 1721 if ((m->flags & PG_ZERO) != 0) 1722 ++vm_page_zero_count; 1723 else 1724 vm_page_zero_idle_wakeup(); 1725 vm_page_free_wakeup(); 1726 mtx_unlock(&vm_page_queue_free_mtx); 1727 } 1728} 1729 1730/* 1731 * vm_page_wire: 1732 * 1733 * Mark this page as wired down by yet 1734 * another map, removing it from paging queues 1735 * as necessary. 1736 * 1737 * If the page is fictitious, then its wire count must remain one. 1738 * 1739 * The page must be locked. 1740 * This routine may not block. 1741 */ 1742void 1743vm_page_wire(vm_page_t m) 1744{ 1745 1746 /* 1747 * Only bump the wire statistics if the page is not already wired, 1748 * and only unqueue the page if it is on some queue (if it is unmanaged 1749 * it is already off the queues). 
1750 */ 1751 vm_page_lock_assert(m, MA_OWNED); 1752 if ((m->flags & PG_FICTITIOUS) != 0) { 1753 KASSERT(m->wire_count == 1, 1754 ("vm_page_wire: fictitious page %p's wire count isn't one", 1755 m)); 1756 return; 1757 } 1758 if (m->wire_count == 0) { 1759 if ((m->flags & PG_UNMANAGED) == 0) 1760 vm_pageq_remove(m); 1761 atomic_add_int(&cnt.v_wire_count, 1); 1762 } 1763 m->wire_count++; 1764 KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m)); 1765} 1766 1767/* 1768 * vm_page_unwire: 1769 * 1770 * Release one wiring of the specified page, potentially enabling it to be 1771 * paged again. If paging is enabled, then the value of the parameter 1772 * "activate" determines to which queue the page is added. If "activate" is 1773 * non-zero, then the page is added to the active queue. Otherwise, it is 1774 * added to the inactive queue. 1775 * 1776 * However, unless the page belongs to an object, it is not enqueued because 1777 * it cannot be paged out. 1778 * 1779 * If a page is fictitious, then its wire count must alway be one. 1780 * 1781 * A managed page must be locked. 1782 */ 1783void 1784vm_page_unwire(vm_page_t m, int activate) 1785{ 1786 1787 if ((m->flags & PG_UNMANAGED) == 0) 1788 vm_page_lock_assert(m, MA_OWNED); 1789 if ((m->flags & PG_FICTITIOUS) != 0) { 1790 KASSERT(m->wire_count == 1, 1791 ("vm_page_unwire: fictitious page %p's wire count isn't one", m)); 1792 return; 1793 } 1794 if (m->wire_count > 0) { 1795 m->wire_count--; 1796 if (m->wire_count == 0) { 1797 atomic_subtract_int(&cnt.v_wire_count, 1); 1798 if ((m->flags & PG_UNMANAGED) != 0 || 1799 m->object == NULL) 1800 return; 1801 vm_page_lock_queues(); 1802 if (activate) 1803 vm_page_enqueue(PQ_ACTIVE, m); 1804 else { 1805 vm_page_flag_clear(m, PG_WINATCFLS); 1806 vm_page_enqueue(PQ_INACTIVE, m); 1807 } 1808 vm_page_unlock_queues(); 1809 } 1810 } else 1811 panic("vm_page_unwire: page %p's wire count is zero", m); 1812} 1813 1814/* 1815 * Move the specified page to the inactive queue. 1816 * 1817 * Many pages placed on the inactive queue should actually go 1818 * into the cache, but it is difficult to figure out which. What 1819 * we do instead, if the inactive target is well met, is to put 1820 * clean pages at the head of the inactive queue instead of the tail. 1821 * This will cause them to be moved to the cache more quickly and 1822 * if not actively re-referenced, reclaimed more quickly. If we just 1823 * stick these pages at the end of the inactive queue, heavy filesystem 1824 * meta-data accesses can cause an unnecessary paging load on memory bound 1825 * processes. This optimization causes one-time-use metadata to be 1826 * reused more quickly. 1827 * 1828 * Normally athead is 0 resulting in LRU operation. athead is set 1829 * to 1 if we want this page to be 'as if it were placed in the cache', 1830 * except without unmapping it from the process address space. 1831 * 1832 * This routine may not block. 1833 */ 1834static inline void 1835_vm_page_deactivate(vm_page_t m, int athead) 1836{ 1837 int queue; 1838 1839 vm_page_lock_assert(m, MA_OWNED); 1840 1841 /* 1842 * Ignore if already inactive. 
1843 */ 1844 if ((queue = m->queue) == PQ_INACTIVE) 1845 return; 1846 if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) { 1847 vm_page_lock_queues(); 1848 vm_page_flag_clear(m, PG_WINATCFLS); 1849 if (queue != PQ_NONE) 1850 vm_page_queue_remove(queue, m); 1851 if (athead) 1852 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, 1853 pageq); 1854 else 1855 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, 1856 pageq); 1857 m->queue = PQ_INACTIVE; 1858 cnt.v_inactive_count++; 1859 vm_page_unlock_queues(); 1860 } 1861} 1862 1863/* 1864 * Move the specified page to the inactive queue. 1865 * 1866 * The page must be locked. 1867 */ 1868void 1869vm_page_deactivate(vm_page_t m) 1870{ 1871 1872 _vm_page_deactivate(m, 0); 1873} 1874 1875/* 1876 * vm_page_try_to_cache: 1877 * 1878 * Returns 0 on failure, 1 on success 1879 */ 1880int 1881vm_page_try_to_cache(vm_page_t m) 1882{ 1883 1884 vm_page_lock_assert(m, MA_OWNED); 1885 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 1886 if (m->dirty || m->hold_count || m->busy || m->wire_count || 1887 (m->oflags & VPO_BUSY) || (m->flags & PG_UNMANAGED)) 1888 return (0); 1889 pmap_remove_all(m); 1890 if (m->dirty) 1891 return (0); 1892 vm_page_cache(m); 1893 return (1); 1894} 1895 1896/* 1897 * vm_page_try_to_free() 1898 * 1899 * Attempt to free the page. If we cannot free it, we do nothing. 1900 * 1 is returned on success, 0 on failure. 1901 */ 1902int 1903vm_page_try_to_free(vm_page_t m) 1904{ 1905 1906 vm_page_lock_assert(m, MA_OWNED); 1907 if (m->object != NULL) 1908 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 1909 if (m->dirty || m->hold_count || m->busy || m->wire_count || 1910 (m->oflags & VPO_BUSY) || (m->flags & PG_UNMANAGED)) 1911 return (0); 1912 pmap_remove_all(m); 1913 if (m->dirty) 1914 return (0); 1915 vm_page_free(m); 1916 return (1); 1917} 1918 1919/* 1920 * vm_page_cache 1921 * 1922 * Put the specified page onto the page cache queue (if appropriate). 1923 * 1924 * This routine may not block. 1925 */ 1926void 1927vm_page_cache(vm_page_t m) 1928{ 1929 vm_object_t object; 1930 vm_page_t root; 1931 1932 vm_page_lock_assert(m, MA_OWNED); 1933 object = m->object; 1934 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 1935 if ((m->flags & PG_UNMANAGED) || (m->oflags & VPO_BUSY) || m->busy || 1936 m->hold_count || m->wire_count) 1937 panic("vm_page_cache: attempting to cache busy page"); 1938 pmap_remove_all(m); 1939 if (m->dirty != 0) 1940 panic("vm_page_cache: page %p is dirty", m); 1941 if (m->valid == 0 || object->type == OBJT_DEFAULT || 1942 (object->type == OBJT_SWAP && 1943 !vm_pager_has_page(object, m->pindex, NULL, NULL))) { 1944 /* 1945 * Hypothesis: A cache-elgible page belonging to a 1946 * default object or swap object but without a backing 1947 * store must be zero filled. 1948 */ 1949 vm_page_free(m); 1950 return; 1951 } 1952 KASSERT((m->flags & PG_CACHED) == 0, 1953 ("vm_page_cache: page %p is already cached", m)); 1954 PCPU_INC(cnt.v_tcached); 1955 1956 /* 1957 * Remove the page from the paging queues. 1958 */ 1959 vm_pageq_remove(m); 1960 1961 /* 1962 * Remove the page from the object's collection of resident 1963 * pages. 1964 */ 1965 if (m != object->root) 1966 vm_page_splay(m->pindex, object->root); 1967 if (m->left == NULL) 1968 root = m->right; 1969 else { 1970 root = vm_page_splay(m->pindex, m->left); 1971 root->right = m->right; 1972 } 1973 object->root = root; 1974 TAILQ_REMOVE(&object->memq, m, listq); 1975 object->resident_page_count--; 1976 1977 /* 1978 * Restore the default memory attribute to the page. 

/*
 * vm_page_cache
 *
 * Put the specified page onto the page cache queue (if appropriate).
 *
 * This routine may not block.
 */
void
vm_page_cache(vm_page_t m)
{
        vm_object_t object;
        vm_page_t root;

        vm_page_lock_assert(m, MA_OWNED);
        object = m->object;
        VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
        if ((m->flags & PG_UNMANAGED) || (m->oflags & VPO_BUSY) || m->busy ||
            m->hold_count || m->wire_count)
                panic("vm_page_cache: attempting to cache busy page");
        pmap_remove_all(m);
        if (m->dirty != 0)
                panic("vm_page_cache: page %p is dirty", m);
        if (m->valid == 0 || object->type == OBJT_DEFAULT ||
            (object->type == OBJT_SWAP &&
            !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
                /*
                 * Hypothesis: A cache-eligible page belonging to a
                 * default object or swap object but without a backing
                 * store must be zero filled.
                 */
                vm_page_free(m);
                return;
        }
        KASSERT((m->flags & PG_CACHED) == 0,
            ("vm_page_cache: page %p is already cached", m));
        PCPU_INC(cnt.v_tcached);

        /*
         * Remove the page from the paging queues.
         */
        vm_pageq_remove(m);

        /*
         * Remove the page from the object's collection of resident
         * pages.
         */
        if (m != object->root)
                vm_page_splay(m->pindex, object->root);
        if (m->left == NULL)
                root = m->right;
        else {
                root = vm_page_splay(m->pindex, m->left);
                root->right = m->right;
        }
        object->root = root;
        TAILQ_REMOVE(&object->memq, m, listq);
        object->resident_page_count--;

        /*
         * Restore the default memory attribute to the page.
         */
        if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
                pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);

        /*
         * Insert the page into the object's collection of cached pages
         * and the physical memory allocator's cache/free page queues.
         */
        m->flags &= ~PG_ZERO;
        mtx_lock(&vm_page_queue_free_mtx);
        m->flags |= PG_CACHED;
        cnt.v_cache_count++;
        root = object->cache;
        if (root == NULL) {
                m->left = NULL;
                m->right = NULL;
        } else {
                root = vm_page_splay(m->pindex, root);
                if (m->pindex < root->pindex) {
                        m->left = root->left;
                        m->right = root;
                        root->left = NULL;
                } else if (__predict_false(m->pindex == root->pindex))
                        panic("vm_page_cache: offset already cached");
                else {
                        m->right = root->right;
                        m->left = root;
                        root->right = NULL;
                }
        }
        object->cache = m;
#if VM_NRESERVLEVEL > 0
        if (!vm_reserv_free_page(m)) {
#else
        if (TRUE) {
#endif
                vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
                vm_phys_free_pages(m, 0);
        }
        vm_page_free_wakeup();
        mtx_unlock(&vm_page_queue_free_mtx);

        /*
         * Increment the vnode's hold count if this is the object's only
         * cached page.  Decrement the vnode's hold count if this was
         * the object's only resident page.
         */
        if (object->type == OBJT_VNODE) {
                if (root == NULL && object->resident_page_count != 0)
                        vhold(object->handle);
                else if (root != NULL && object->resident_page_count == 0)
                        vdrop(object->handle);
        }
}

/*
 * vm_page_dontneed
 *
 * Cache, deactivate, or do nothing as appropriate.  This routine
 * is typically used by madvise() MADV_DONTNEED.
 *
 * Generally speaking we want to move the page into the cache so
 * it gets reused quickly.  However, this can backfire if the page is
 * recycled too quickly: small objects will never become fully cached.
 * On the other hand, if we move the page to the inactive queue,
 * very large objects can unnecessarily blow away our inactive and
 * cache queues.
 *
 * The solution is to move the pages based on a fixed weighting.  We
 * either leave them alone, deactivate them, or move them to the cache,
 * where moving them to the cache has the highest weighting.
 * By forcing some pages into other queues we eventually force the
 * system to balance the queues, potentially recovering other unrelated
 * space from the active queue.  The idea is to not force this to happen
 * too often.
 */
void
vm_page_dontneed(vm_page_t m)
{
        int dnw;
        int head;

        vm_page_lock_assert(m, MA_OWNED);
        VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
        dnw = PCPU_GET(dnweight);
        PCPU_INC(dnweight);

        /*
         * Occasionally leave the page alone.
         */
        if ((dnw & 0x01F0) == 0 || m->queue == PQ_INACTIVE) {
                if (m->act_count >= ACT_INIT)
                        --m->act_count;
                return;
        }

        /*
         * Clear any references to the page.  Otherwise, the page daemon will
         * immediately reactivate the page.
         *
         * Perform the pmap_clear_reference() first.  Otherwise, a concurrent
         * pmap operation, such as pmap_remove(), could clear a reference in
         * the pmap and set PG_REFERENCED on the page before the
         * pmap_clear_reference() had completed.  Consequently, the page would
         * appear referenced based upon an old reference that occurred before
         * this function ran.
         */
        pmap_clear_reference(m);
        vm_page_lock_queues();
        vm_page_flag_clear(m, PG_REFERENCED);
        vm_page_unlock_queues();

        if (m->dirty == 0 && pmap_is_modified(m))
                vm_page_dirty(m);

        if (m->dirty || (dnw & 0x0070) == 0) {
                /*
                 * Deactivate the page 3 times out of 32.
                 */
                head = 0;
        } else {
                /*
                 * Cache the page 28 times out of every 32.  Note that
                 * the page is deactivated instead of cached, but placed
                 * at the head of the queue instead of the tail.
                 */
                head = 1;
        }
        _vm_page_deactivate(m, head);
}
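
/*
 * Caller sketch (illustrative only): the madvise(MADV_DONTNEED) path walks
 * the affected range and applies vm_page_dontneed() to each resident page.
 * The loop below is a simplified, hypothetical rendition of that caller.
 *
 *      VM_OBJECT_LOCK(object);
 *      for (pindex = start; pindex < end; pindex++) {
 *              if ((m = vm_page_lookup(object, pindex)) == NULL)
 *                      continue;
 *              vm_page_lock(m);
 *              vm_page_dontneed(m);
 *              vm_page_unlock(m);
 *      }
 *      VM_OBJECT_UNLOCK(object);
 */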

/*
 * Grab a page, waiting until we are woken up due to the page changing state.
 * We keep on waiting if the page continues to be in the object but is busy.
 * If the page doesn't exist, it is first allocated and then conditionally
 * zeroed.
 *
 * The caller must always specify the VM_ALLOC_RETRY flag.  This is intended
 * to facilitate its eventual removal.
 *
 * This routine may block.
 */
vm_page_t
vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
{
        vm_page_t m;

        VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
        KASSERT((allocflags & VM_ALLOC_RETRY) != 0,
            ("vm_page_grab: VM_ALLOC_RETRY is required"));
retrylookup:
        if ((m = vm_page_lookup(object, pindex)) != NULL) {
                if ((m->oflags & VPO_BUSY) != 0 ||
                    ((allocflags & VM_ALLOC_IGN_SBUSY) == 0 && m->busy != 0)) {
                        /*
                         * Reference the page before unlocking and
                         * sleeping so that the page daemon is less
                         * likely to reclaim it.
                         */
                        vm_page_lock_queues();
                        vm_page_flag_set(m, PG_REFERENCED);
                        vm_page_sleep(m, "pgrbwt");
                        goto retrylookup;
                } else {
                        if ((allocflags & VM_ALLOC_WIRED) != 0) {
                                vm_page_lock(m);
                                vm_page_wire(m);
                                vm_page_unlock(m);
                        }
                        if ((allocflags & VM_ALLOC_NOBUSY) == 0)
                                vm_page_busy(m);
                        return (m);
                }
        }
        m = vm_page_alloc(object, pindex, allocflags & ~(VM_ALLOC_RETRY |
            VM_ALLOC_IGN_SBUSY));
        if (m == NULL) {
                VM_OBJECT_UNLOCK(object);
                VM_WAIT;
                VM_OBJECT_LOCK(object);
                goto retrylookup;
        } else if (m->valid != 0)
                return (m);
        if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
                pmap_zero_page(m);
        return (m);
}
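
/*
 * Caller sketch (illustrative only): obtaining a busied, wired page at a
 * given index that is zero-filled if it had to be newly allocated.
 * VM_ALLOC_RETRY is mandatory, per the KASSERT above; the remaining flags
 * are one plausible combination, not the only valid one.
 *
 *      VM_OBJECT_LOCK(object);
 *      m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_RETRY |
 *          VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 *      ... the page is returned busied (VPO_BUSY); use it, then ...
 *      vm_page_wakeup(m);
 *      VM_OBJECT_UNLOCK(object);
 */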

/*
 * Mapping function for valid bits or for dirty bits in
 * a page.  May not block.
 *
 * Inputs are required to range within a page.
 */
int
vm_page_bits(int base, int size)
{
        int first_bit;
        int last_bit;

        KASSERT(base + size <= PAGE_SIZE,
            ("vm_page_bits: illegal base/size %d/%d", base, size));

        if (size == 0)          /* handle degenerate case */
                return (0);

        first_bit = base >> DEV_BSHIFT;
        last_bit = (base + size - 1) >> DEV_BSHIFT;

        return ((2 << last_bit) - (1 << first_bit));
}

/*
 * vm_page_set_valid:
 *
 * Sets portions of a page valid.  The arguments are expected
 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
 * of any partial chunks touched by the range.  The invalid portion of
 * such chunks will be zeroed.
 *
 * (base + size) must be less than or equal to PAGE_SIZE.
 */
void
vm_page_set_valid(vm_page_t m, int base, int size)
{
        int endoff, frag;

        VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
        if (size == 0)          /* handle degenerate case */
                return;

        /*
         * If the base is not DEV_BSIZE aligned and the valid
         * bit is clear, we have to zero out a portion of the
         * first block.
         */
        if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
            (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
                pmap_zero_page_area(m, frag, base - frag);

        /*
         * If the ending offset is not DEV_BSIZE aligned and the
         * valid bit is clear, we have to zero out a portion of
         * the last block.
         */
        endoff = base + size;
        if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
            (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
                pmap_zero_page_area(m, endoff,
                    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));

        /*
         * Assert that no previously invalid block that is now being validated
         * is already dirty.
         */
        KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
            ("vm_page_set_valid: page %p is dirty", m));

        /*
         * Set valid bits inclusive of any overlap.
         */
        m->valid |= vm_page_bits(base, size);
}
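
/*
 * Worked example (illustrative only), assuming PAGE_SIZE is 4096 and
 * DEV_BSIZE is 512, i.e. eight disk blocks per page and one valid/dirty
 * bit per block:
 *
 *      vm_page_bits(0, 4096)   == 0xff  (all eight blocks)
 *      vm_page_bits(512, 1024) == 0x06  (blocks 1 and 2)
 *      vm_page_bits(100, 300)  == 0x01  (unaligned range, still only block 0)
 *
 * Consequently, vm_page_set_valid(m, 100, 300) sets bit 0 of m->valid and,
 * if block 0 was previously invalid, first zeroes bytes 0-99 and 400-511 so
 * that the whole block holds defined data.
 */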

/*
 * Clear the given bits from the specified page's dirty field.
 */
static __inline void
vm_page_clear_dirty_mask(vm_page_t m, int pagebits)
{

        /*
         * If the object is locked and the page is neither VPO_BUSY nor
         * PG_WRITEABLE, then the page's dirty field cannot possibly be
         * modified by a concurrent pmap operation.
         */
        VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
        if ((m->oflags & VPO_BUSY) == 0 && (m->flags & PG_WRITEABLE) == 0)
                m->dirty &= ~pagebits;
        else {
                vm_page_lock_queues();
                m->dirty &= ~pagebits;
                vm_page_unlock_queues();
        }
}

/*
 * vm_page_set_validclean:
 *
 * Sets portions of a page valid and clean.  The arguments are expected
 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
 * of any partial chunks touched by the range.  The invalid portion of
 * such chunks will be zeroed.
 *
 * This routine may not block.
 *
 * (base + size) must be less than or equal to PAGE_SIZE.
 */
void
vm_page_set_validclean(vm_page_t m, int base, int size)
{
        u_long oldvalid;
        int endoff, frag, pagebits;

        VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
        if (size == 0)          /* handle degenerate case */
                return;

        /*
         * If the base is not DEV_BSIZE aligned and the valid
         * bit is clear, we have to zero out a portion of the
         * first block.
         */
        if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
            (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
                pmap_zero_page_area(m, frag, base - frag);

        /*
         * If the ending offset is not DEV_BSIZE aligned and the
         * valid bit is clear, we have to zero out a portion of
         * the last block.
         */
        endoff = base + size;
        if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
            (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
                pmap_zero_page_area(m, endoff,
                    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));

        /*
         * Set valid, clear dirty bits.  If validating the entire
         * page we can safely clear the pmap modify bit.  We also
         * use this opportunity to clear the VPO_NOSYNC flag.  If a process
         * takes a write fault on a MAP_NOSYNC memory area the flag will
         * be set again.
         *
         * We set valid bits inclusive of any overlap, but we can only
         * clear dirty bits for DEV_BSIZE chunks that are fully within
         * the range.
         */
        oldvalid = m->valid;
        pagebits = vm_page_bits(base, size);
        m->valid |= pagebits;
#if 0   /* NOT YET */
        if ((frag = base & (DEV_BSIZE - 1)) != 0) {
                frag = DEV_BSIZE - frag;
                base += frag;
                size -= frag;
                if (size < 0)
                        size = 0;
        }
        pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
#endif
        if (base == 0 && size == PAGE_SIZE) {
                /*
                 * The page can only be modified within the pmap if it is
                 * mapped, and it can only be mapped if it was previously
                 * fully valid.
                 */
                if (oldvalid == VM_PAGE_BITS_ALL)
                        /*
                         * Perform the pmap_clear_modify() first.  Otherwise,
                         * a concurrent pmap operation, such as
                         * pmap_protect(), could clear a modification in the
                         * pmap and set the dirty field on the page before
                         * pmap_clear_modify() had begun and after the dirty
                         * field was cleared here.
                         */
                        pmap_clear_modify(m);
                m->dirty = 0;
                m->oflags &= ~VPO_NOSYNC;
        } else if (oldvalid != VM_PAGE_BITS_ALL)
                m->dirty &= ~pagebits;
        else
                vm_page_clear_dirty_mask(m, pagebits);
}

void
vm_page_clear_dirty(vm_page_t m, int base, int size)
{

        vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
}

/*
 * vm_page_set_invalid:
 *
 * Invalidates DEV_BSIZE'd chunks within a page.  Both the
 * valid and dirty bits for the affected areas are cleared.
 *
 * May not block.
 */
void
vm_page_set_invalid(vm_page_t m, int base, int size)
{
        int bits;

        VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
        KASSERT((m->oflags & VPO_BUSY) == 0,
            ("vm_page_set_invalid: page %p is busy", m));
        bits = vm_page_bits(base, size);
        if (m->valid == VM_PAGE_BITS_ALL && bits != 0)
                pmap_remove_all(m);
        KASSERT(!pmap_page_is_mapped(m),
            ("vm_page_set_invalid: page %p is mapped", m));
        m->valid &= ~bits;
        m->dirty &= ~bits;
}

/*
 * vm_page_zero_invalid()
 *
 * The kernel assumes that the invalid portions of a page contain
 * garbage, but such pages can be mapped into memory by user code.
 * When this occurs, we must zero out the non-valid portions of the
 * page so user code sees what it expects.
 *
 * Pages are most often semi-valid when the end of a file is mapped
 * into memory and the file's size is not page aligned.
 */
void
vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
{
        int b;
        int i;

        VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
        /*
         * Scan the valid bits looking for invalid sections that
         * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
         * valid bit may be set) have already been zeroed by
         * vm_page_set_validclean().
         */
        for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
                if (i == (PAGE_SIZE / DEV_BSIZE) ||
                    (m->valid & (1 << i)) != 0) {
                        if (i > b) {
                                pmap_zero_page_area(m,
                                    b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
                        }
                        b = i + 1;
                }
        }

        /*
         * setvalid is TRUE when we can safely set the zeroed areas
         * as being valid.  We can do this if there are no cache consistency
         * issues, e.g. it is ok to do with UFS, but not ok to do with NFS.
         */
        if (setvalid)
                m->valid = VM_PAGE_BITS_ALL;
}
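
/*
 * Caller sketch (illustrative only): a pager completing a read that filled
 * only the first "validbytes" bytes of a page, e.g. the last page of a
 * file whose size is not page aligned.  "validbytes" is hypothetical, and
 * passing TRUE for setvalid assumes a UFS-like consumer with no cache
 * consistency concerns, as described above.
 *
 *      VM_OBJECT_LOCK(object);
 *      vm_page_set_validclean(m, 0, validbytes);
 *      vm_page_zero_invalid(m, TRUE);  (zero the tail, then mark it valid)
 *      VM_OBJECT_UNLOCK(object);
 */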

/*
 * vm_page_is_valid:
 *
 * Is the (partial) page valid?  Note that when size == 0 this returns
 * FALSE if the page is entirely invalid, and TRUE otherwise.
 *
 * May not block.
 */
int
vm_page_is_valid(vm_page_t m, int base, int size)
{
        int bits = vm_page_bits(base, size);

        VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
        if (m->valid && ((m->valid & bits) == bits))
                return 1;
        else
                return 0;
}

/*
 * Update dirty bits from pmap/mmu.  May not block.
 */
void
vm_page_test_dirty(vm_page_t m)
{

        VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
        if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
                vm_page_dirty(m);
}

int so_zerocp_fullpage = 0;

/*
 * Replace the given page with a copy.  The copied page assumes
 * the portion of the given page's "wire_count" that is not the
 * responsibility of this copy-on-write mechanism.
 *
 * The object containing the given page must have a non-zero
 * paging-in-progress count and be locked.
 */
void
vm_page_cowfault(vm_page_t m)
{
        vm_page_t mnew;
        vm_object_t object;
        vm_pindex_t pindex;

        mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);
        vm_page_lock_assert(m, MA_OWNED);
        object = m->object;
        VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
        KASSERT(object->paging_in_progress != 0,
            ("vm_page_cowfault: object %p's paging-in-progress count is zero.",
            object));
        pindex = m->pindex;

 retry_alloc:
        pmap_remove_all(m);
        vm_page_remove(m);
        mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
        if (mnew == NULL) {
                vm_page_insert(m, object, pindex);
                vm_page_unlock(m);
                VM_OBJECT_UNLOCK(object);
                VM_WAIT;
                VM_OBJECT_LOCK(object);
                if (m == vm_page_lookup(object, pindex)) {
                        vm_page_lock(m);
                        goto retry_alloc;
                } else {
                        /*
                         * Page disappeared during the wait.
                         */
                        return;
                }
        }

        if (m->cow == 0) {
                /*
                 * Check to see if we raced with an xmit complete when
                 * waiting to allocate a page.  If so, put things back
                 * the way they were.
                 */
                vm_page_unlock(m);
                vm_page_lock(mnew);
                vm_page_free(mnew);
                vm_page_unlock(mnew);
                vm_page_insert(m, object, pindex);
        } else { /* clear COW & copy page */
                if (!so_zerocp_fullpage)
                        pmap_copy_page(m, mnew);
                mnew->valid = VM_PAGE_BITS_ALL;
                vm_page_dirty(mnew);
                mnew->wire_count = m->wire_count - m->cow;
                m->wire_count = m->cow;
                vm_page_unlock(m);
        }
}

void
vm_page_cowclear(vm_page_t m)
{

        vm_page_lock_assert(m, MA_OWNED);
        if (m->cow) {
                m->cow--;
                /*
                 * Let vm_fault add back write permission lazily.
                 */
        }
        /*
         * sf_buf_free() will free the page, so we needn't do it here.
         */
}

int
vm_page_cowsetup(vm_page_t m)
{

        vm_page_lock_assert(m, MA_OWNED);
        if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0 ||
            m->cow == USHRT_MAX - 1 || !VM_OBJECT_TRYLOCK(m->object))
                return (EBUSY);
        m->cow++;
        pmap_remove_write(m);
        VM_OBJECT_UNLOCK(m->object);
        return (0);
}
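
/*
 * Caller sketch (illustrative only): the zero-copy socket send path is the
 * intended consumer of this copy-on-write machinery.  It arms COW before
 * lending a user page to the network stack and disarms it on transmit
 * completion; vm_fault() invokes vm_page_cowfault() if the process writes
 * the page while it is still lent out.  The error handling shown is
 * hypothetical.
 *
 *      vm_page_lock(m);
 *      error = vm_page_cowsetup(m);
 *      vm_page_unlock(m);
 *      if (error != 0)
 *              ... fall back to copying the data ...
 *      ...
 *      (later, from the transmit-completion handler)
 *      vm_page_lock(m);
 *      vm_page_cowclear(m);
 *      vm_page_unlock(m);
 */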

#include "opt_ddb.h"
#ifdef DDB
#include <sys/kernel.h>

#include <ddb/ddb.h>

DB_SHOW_COMMAND(page, vm_page_print_page_info)
{
        db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
        db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
        db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
        db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
        db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
        db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
        db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
        db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
        db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
        db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
}

DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
{

        db_printf("PQ_FREE:");
        db_printf(" %d", cnt.v_free_count);
        db_printf("\n");

        db_printf("PQ_CACHE:");
        db_printf(" %d", cnt.v_cache_count);
        db_printf("\n");

        db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
            *vm_page_queues[PQ_ACTIVE].cnt,
            *vm_page_queues[PQ_INACTIVE].cnt);
}
#endif /* DDB */
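
/*
 * Usage note: from the in-kernel debugger the commands above are invoked as
 * "show page" and "show pageq".  The former dumps the global vmmeter
 * counters printed by vm_page_print_page_info(); the latter summarizes the
 * free, cache, active and inactive queue counts.
 */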