vm_page.c (228287) → vm_page.c (230623)
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * The Mach Operating System project at Carnegie-Mellon University.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91
34 */
35
36/*-
37 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
38 * All rights reserved.
39 *
40 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
41 *
42 * Permission to use, copy, modify and distribute this software and
43 * its documentation is hereby granted, provided that both the copyright
44 * notice and this permission notice appear in all copies of the
45 * software, derivative works or modified versions, and any portions
46 * thereof, and that both notices appear in supporting documentation.
47 *
48 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51 *
52 * Carnegie Mellon requests users of this software to return to
53 *
54 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
55 * School of Computer Science
56 * Carnegie Mellon University
57 * Pittsburgh PA 15213-3890
58 *
59 * any improvements or extensions that they make and grant Carnegie the
60 * rights to redistribute these changes.
61 */
62
63/*
64 * GENERAL RULES ON VM_PAGE MANIPULATION
65 *
66 * - a pageq mutex is required when adding or removing a page from a
67 * page queue (vm_page_queue[]), regardless of other mutexes or the
68 * busy state of a page.
69 *
70 * - The object mutex is held when inserting or removing
71 * pages from an object (vm_page_insert() or vm_page_remove()).
72 *
73 */
74
75/*
76 * Resident memory management module.
77 */
78
79#include <sys/cdefs.h>
80__FBSDID("$FreeBSD: head/sys/vm/vm_page.c 228287 2011-12-05 18:29:25Z alc $");
80__FBSDID("$FreeBSD: head/sys/vm/vm_page.c 230623 2012-01-27 20:18:31Z kmacy $");
81
82#include "opt_vm.h"
83
84#include <sys/param.h>
85#include <sys/systm.h>
86#include <sys/lock.h>
87#include <sys/kernel.h>
88#include <sys/limits.h>
89#include <sys/malloc.h>
90#include <sys/msgbuf.h>
91#include <sys/mutex.h>
92#include <sys/proc.h>
93#include <sys/sysctl.h>
94#include <sys/vmmeter.h>
95#include <sys/vnode.h>
96
97#include <vm/vm.h>
98#include <vm/pmap.h>
99#include <vm/vm_param.h>
100#include <vm/vm_kern.h>
101#include <vm/vm_object.h>
102#include <vm/vm_page.h>
103#include <vm/vm_pageout.h>
104#include <vm/vm_pager.h>
105#include <vm/vm_phys.h>
106#include <vm/vm_reserv.h>
107#include <vm/vm_extern.h>
108#include <vm/uma.h>
109#include <vm/uma_int.h>
110
111#include <machine/md_var.h>
112
113/*
114 * Associated with each page of user-allocatable memory is a
115 * page structure.
116 */
117
118struct vpgqueues vm_page_queues[PQ_COUNT];
119struct vpglocks vm_page_queue_lock;
120struct vpglocks vm_page_queue_free_lock;
121
122struct vpglocks pa_lock[PA_LOCK_COUNT];
123
124vm_page_t vm_page_array = 0;
125int vm_page_array_size = 0;
126long first_page = 0;
127int vm_page_zero_count = 0;
128
129static int boot_pages = UMA_BOOT_PAGES;
130TUNABLE_INT("vm.boot_pages", &boot_pages);
131SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0,
132 "number of pages allocated for bootstrapping the VM system");
133
134static int pa_tryrelock_restart;
135SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
136 &pa_tryrelock_restart, 0, "Number of tryrelock restarts");
137
138static uma_zone_t fakepg_zone;
139
140static struct vnode *vm_page_alloc_init(vm_page_t m);
141static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
142static void vm_page_queue_remove(int queue, vm_page_t m);
143static void vm_page_enqueue(int queue, vm_page_t m);
144static void vm_page_init_fakepg(void *dummy);
145
146SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
147
148static void
149vm_page_init_fakepg(void *dummy)
150{
151
152 fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
153 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
154}
155
156/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
157#if PAGE_SIZE == 32768
158#ifdef CTASSERT
159CTASSERT(sizeof(u_long) >= 8);
160#endif
161#endif
162
163/*
164 * Try to acquire a physical address lock while a pmap is locked. If we
165 * fail to trylock we unlock and lock the pmap directly and cache the
166 * locked pa in *locked. The caller should then restart their loop in case
167 * the virtual to physical mapping has changed.
168 */
169int
170vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked)
171{
172 vm_paddr_t lockpa;
173
174 lockpa = *locked;
175 *locked = pa;
176 if (lockpa) {
177 PA_LOCK_ASSERT(lockpa, MA_OWNED);
178 if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa))
179 return (0);
180 PA_UNLOCK(lockpa);
181 }
182 if (PA_TRYLOCK(pa))
183 return (0);
184 PMAP_UNLOCK(pmap);
185 atomic_add_int(&pa_tryrelock_restart, 1);
186 PA_LOCK(pa);
187 PMAP_LOCK(pmap);
188 return (EAGAIN);
189}
190
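#ifdef EXAMPLE_ONLY
/*
 * Illustrative sketch (an editor's example, not part of vm_page.c): the
 * caller-side retry loop that vm_page_pa_tryrelock() expects, loosely
 * modeled on pmap_extract_and_hold()-style routines.  The names
 * example_extract_and_hold() and example_pmap_va_to_pa() are hypothetical;
 * the latter stands in for the PTE walk a real pmap performs while its
 * own lock is held.
 */
static vm_paddr_t example_pmap_va_to_pa(pmap_t pmap, vm_offset_t va);

static vm_page_t
example_extract_and_hold(pmap_t pmap, vm_offset_t va)
{
	vm_paddr_t locked_pa, pa;
	vm_page_t m;

	locked_pa = 0;
	m = NULL;
	PMAP_LOCK(pmap);
retry:
	pa = example_pmap_va_to_pa(pmap, va);
	if (pa != 0) {
		/*
		 * EAGAIN means the pmap lock was dropped and reacquired,
		 * so the virtual-to-physical translation must be redone.
		 */
		if (vm_page_pa_tryrelock(pmap, pa, &locked_pa))
			goto retry;
		/* The pa lock is this page's page lock, so holding is safe. */
		m = PHYS_TO_VM_PAGE(pa);
		vm_page_hold(m);
	}
	PA_UNLOCK_COND(locked_pa);
	PMAP_UNLOCK(pmap);
	return (m);
}
#endif /* EXAMPLE_ONLY */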
191/*
192 * vm_set_page_size:
193 *
194 * Sets the page size, perhaps based upon the memory
195 * size. Must be called before any use of page-size
196 * dependent functions.
197 */
198void
199vm_set_page_size(void)
200{
201 if (cnt.v_page_size == 0)
202 cnt.v_page_size = PAGE_SIZE;
203 if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0)
204 panic("vm_set_page_size: page size not a power of two");
205}
206
207/*
208 * vm_page_blacklist_lookup:
209 *
210 * See if a physical address in this page has been listed
211 * in the blacklist tunable. Entries in the tunable are
212 * separated by spaces or commas. If an invalid integer is
213 * encountered then the rest of the string is skipped.
214 */
215static int
216vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
217{
218 vm_paddr_t bad;
219 char *cp, *pos;
220
221 for (pos = list; *pos != '\0'; pos = cp) {
222 bad = strtoq(pos, &cp, 0);
223 if (*cp != '\0') {
224 if (*cp == ' ' || *cp == ',') {
225 cp++;
226 if (cp == pos)
227 continue;
228 } else
229 break;
230 }
231 if (pa == trunc_page(bad))
232 return (1);
233 }
234 return (0);
235}
236
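/*
 * Illustrative example (an editor's addition, not part of vm_page.c): the
 * list parsed above comes from the "vm.blacklist" kernel environment
 * variable, normally set from loader.conf.  Because strtoq() is called
 * with base 0, entries may be given in hex, octal, or decimal, separated
 * by spaces or commas, and each address is truncated to a page boundary
 * before comparison.  A hypothetical setting:
 *
 *	vm.blacklist="0x7c321000,0x7c322000 2084569088"
 */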
237/*
238 * vm_page_startup:
239 *
240 * Initializes the resident memory module.
241 *
242 * Allocates memory for the page cells, and
243 * for the object/offset-to-page hash table headers.
244 * Each page cell is initialized and placed on the free list.
245 */
246vm_offset_t
247vm_page_startup(vm_offset_t vaddr)
248{
249 vm_offset_t mapped;
250 vm_paddr_t page_range;
251 vm_paddr_t new_end;
252 int i;
253 vm_paddr_t pa;
254 vm_paddr_t last_pa;
255 char *list;
256
257 /* the biggest memory array is the second group of pages */
258 vm_paddr_t end;
259 vm_paddr_t biggestsize;
260 vm_paddr_t low_water, high_water;
261 int biggestone;
262
263 biggestsize = 0;
264 biggestone = 0;
265 vaddr = round_page(vaddr);
266
267 for (i = 0; phys_avail[i + 1]; i += 2) {
268 phys_avail[i] = round_page(phys_avail[i]);
269 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
270 }
271
272 low_water = phys_avail[0];
273 high_water = phys_avail[1];
274
275 for (i = 0; phys_avail[i + 1]; i += 2) {
276 vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
277
278 if (size > biggestsize) {
279 biggestone = i;
280 biggestsize = size;
281 }
282 if (phys_avail[i] < low_water)
283 low_water = phys_avail[i];
284 if (phys_avail[i + 1] > high_water)
285 high_water = phys_avail[i + 1];
286 }
287
288#ifdef XEN
289 low_water = 0;
290#endif
291
292 end = phys_avail[biggestone+1];
293
294 /*
295 * Initialize the locks.
296 */
297 mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF |
298 MTX_RECURSE);
299 mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL,
300 MTX_DEF);
301
302 /* Setup page locks. */
303 for (i = 0; i < PA_LOCK_COUNT; i++)
304 mtx_init(&pa_lock[i].data, "page lock", NULL, MTX_DEF);
305
306 /*
307 * Initialize the queue headers for the hold queue, the active queue,
308 * and the inactive queue.
309 */
310 for (i = 0; i < PQ_COUNT; i++)
311 TAILQ_INIT(&vm_page_queues[i].pl);
312 vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
313 vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
314 vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count;
315
316 /*
317 * Allocate memory for use when bootstrapping the kernel memory
318 * allocator.
319 */
320 new_end = end - (boot_pages * UMA_SLAB_SIZE);
321 new_end = trunc_page(new_end);
322 mapped = pmap_map(&vaddr, new_end, end,
323 VM_PROT_READ | VM_PROT_WRITE);
324 bzero((void *)mapped, end - new_end);
325 uma_startup((void *)mapped, boot_pages);
326
327#if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \
328 defined(__mips__)
329 /*
330 * Allocate a bitmap to indicate that a random physical page
331 * needs to be included in a minidump.
332 *
333 * The amd64 port needs this to indicate which direct map pages
334 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
335 *
336 * However, i386 still needs this workspace internally within the
337 * minidump code. In theory, they are not needed on i386, but are
338 * included should the sf_buf code decide to use them.
339 */
340 last_pa = 0;
341 for (i = 0; dump_avail[i + 1] != 0; i += 2)
342 if (dump_avail[i + 1] > last_pa)
343 last_pa = dump_avail[i + 1];
344 page_range = last_pa / PAGE_SIZE;
345 vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
346 new_end -= vm_page_dump_size;
347 vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
348 new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
349 bzero((void *)vm_page_dump, vm_page_dump_size);
350#endif
351#ifdef __amd64__
352 /*
353 * Request that the physical pages underlying the message buffer be
354 * included in a crash dump. Since the message buffer is accessed
355 * through the direct map, they are not automatically included.
356 */
357 pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
358 last_pa = pa + round_page(msgbufsize);
359 while (pa < last_pa) {
360 dump_add_page(pa);
361 pa += PAGE_SIZE;
362 }
363#endif
364 /*
365 * Compute the number of pages of memory that will be available for
366 * use (taking into account the overhead of a page structure per
367 * page).
368 */
369 first_page = low_water / PAGE_SIZE;
370#ifdef VM_PHYSSEG_SPARSE
371 page_range = 0;
372 for (i = 0; phys_avail[i + 1] != 0; i += 2)
373 page_range += atop(phys_avail[i + 1] - phys_avail[i]);
374#elif defined(VM_PHYSSEG_DENSE)
375 page_range = high_water / PAGE_SIZE - first_page;
376#else
377#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
378#endif
379 end = new_end;
380
381 /*
382 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
383 */
384 vaddr += PAGE_SIZE;
385
386 /*
387 * Initialize the mem entry structures now, and put them in the free
388 * queue.
389 */
390 new_end = trunc_page(end - page_range * sizeof(struct vm_page));
391 mapped = pmap_map(&vaddr, new_end, end,
392 VM_PROT_READ | VM_PROT_WRITE);
393 vm_page_array = (vm_page_t) mapped;
394#if VM_NRESERVLEVEL > 0
395 /*
396 * Allocate memory for the reservation management system's data
397 * structures.
398 */
399 new_end = vm_reserv_startup(&vaddr, new_end, high_water);
400#endif
401#if defined(__amd64__) || defined(__mips__)
402 /*
403 * pmap_map on amd64 and mips can come out of the direct-map, not kvm
404 * like i386, so the pages must be tracked for a crashdump to include
405 * this data. This includes the vm_page_array and the early UMA
406 * bootstrap pages.
407 */
408 for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
409 dump_add_page(pa);
410#endif
411 phys_avail[biggestone + 1] = new_end;
412
413 /*
414 * Clear all of the page structures
415 */
416 bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
417 for (i = 0; i < page_range; i++)
418 vm_page_array[i].order = VM_NFREEORDER;
419 vm_page_array_size = page_range;
420
421 /*
422 * Initialize the physical memory allocator.
423 */
424 vm_phys_init();
425
426 /*
427 * Add every available physical page that is not blacklisted to
428 * the free lists.
429 */
430 cnt.v_page_count = 0;
431 cnt.v_free_count = 0;
432 list = getenv("vm.blacklist");
433 for (i = 0; phys_avail[i + 1] != 0; i += 2) {
434 pa = phys_avail[i];
435 last_pa = phys_avail[i + 1];
436 while (pa < last_pa) {
437 if (list != NULL &&
438 vm_page_blacklist_lookup(list, pa))
439 printf("Skipping page with pa 0x%jx\n",
440 (uintmax_t)pa);
441 else
442 vm_phys_add_page(pa);
443 pa += PAGE_SIZE;
444 }
445 }
446 freeenv(list);
447#if VM_NRESERVLEVEL > 0
448 /*
449 * Initialize the reservation management system.
450 */
451 vm_reserv_init();
452#endif
453 return (vaddr);
454}
455
456
457CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0);
458
459void
460vm_page_aflag_set(vm_page_t m, uint8_t bits)
461{
462 uint32_t *addr, val;
463
464 /*
465 * The PGA_WRITEABLE flag can only be set if the page is managed and
466 * VPO_BUSY. Currently, this flag is only set by pmap_enter().
467 */
468 KASSERT((bits & PGA_WRITEABLE) == 0 ||
469 (m->oflags & (VPO_UNMANAGED | VPO_BUSY)) == VPO_BUSY,
470 ("PGA_WRITEABLE and !VPO_BUSY"));
471
472 /*
473 * We want to use atomic updates for m->aflags, which is a
474 * byte wide. Not all architectures provide atomic operations
475 * on the single-byte destination. Punt and access the whole
476 * 4-byte word with an atomic update. Parallel non-atomic
477 * updates to the fields included in the update by proximity
478 * are handled properly by atomics.
479 */
480 addr = (void *)&m->aflags;
481 MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0);
482 val = bits;
483#if BYTE_ORDER == BIG_ENDIAN
484 val <<= 24;
485#endif
486 atomic_set_32(addr, val);
487}
488
489void
490vm_page_aflag_clear(vm_page_t m, uint8_t bits)
491{
492 uint32_t *addr, val;
493
494 /*
495 * The PGA_REFERENCED flag can only be cleared if the object
496 * containing the page is locked.
497 */
498 KASSERT((bits & PGA_REFERENCED) == 0 || VM_OBJECT_LOCKED(m->object),
499 ("PGA_REFERENCED and !VM_OBJECT_LOCKED"));
500
501 /*
502 * See the comment in vm_page_aflag_set().
503 */
504 addr = (void *)&m->aflags;
505 MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0);
506 val = bits;
507#if BYTE_ORDER == BIG_ENDIAN
508 val <<= 24;
509#endif
510 atomic_clear_32(addr, val);
511}
512
513void
514vm_page_reference(vm_page_t m)
515{
516
517 vm_page_aflag_set(m, PGA_REFERENCED);
518}
519
520void
521vm_page_busy(vm_page_t m)
522{
523
524 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
525 KASSERT((m->oflags & VPO_BUSY) == 0,
526 ("vm_page_busy: page already busy!!!"));
527 m->oflags |= VPO_BUSY;
528}
529
530/*
531 * vm_page_flash:
532 *
533 * wakeup anyone waiting for the page.
534 */
535void
536vm_page_flash(vm_page_t m)
537{
538
539 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
540 if (m->oflags & VPO_WANTED) {
541 m->oflags &= ~VPO_WANTED;
542 wakeup(m);
543 }
544}
545
546/*
547 * vm_page_wakeup:
548 *
549 * clear the VPO_BUSY flag and wakeup anyone waiting for the
550 * page.
551 *
552 */
553void
554vm_page_wakeup(vm_page_t m)
555{
556
557 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
558 KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!"));
559 m->oflags &= ~VPO_BUSY;
560 vm_page_flash(m);
561}
562
563void
564vm_page_io_start(vm_page_t m)
565{
566
567 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
568 m->busy++;
569}
570
571void
572vm_page_io_finish(vm_page_t m)
573{
574
575 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
576 KASSERT(m->busy > 0, ("vm_page_io_finish: page %p is not busy", m));
577 m->busy--;
578 if (m->busy == 0)
579 vm_page_flash(m);
580}
581
582/*
583 * Keep the page from being freed by the page daemon; this has
584 * much the same effect as wiring, but with much lower
585 * overhead, and should be used only for *very* temporary
586 * holding ("wiring").
587 */
588void
589vm_page_hold(vm_page_t mem)
590{
591
592 vm_page_lock_assert(mem, MA_OWNED);
593 mem->hold_count++;
594}
595
596void
597vm_page_unhold(vm_page_t mem)
598{
599
600 vm_page_lock_assert(mem, MA_OWNED);
601 --mem->hold_count;
602 KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
603 if (mem->hold_count == 0 && mem->queue == PQ_HOLD)
604 vm_page_free_toq(mem);
605}
606
607/*
608 * vm_page_unhold_pages:
609 *
610 * Unhold each of the pages that is referenced by the given array.
611 */
612void
613vm_page_unhold_pages(vm_page_t *ma, int count)
614{
615 struct mtx *mtx, *new_mtx;
616
617 mtx = NULL;
618 for (; count != 0; count--) {
619 /*
620 * Avoid releasing and reacquiring the same page lock.
621 */
622 new_mtx = vm_page_lockptr(*ma);
623 if (mtx != new_mtx) {
624 if (mtx != NULL)
625 mtx_unlock(mtx);
626 mtx = new_mtx;
627 mtx_lock(mtx);
628 }
629 vm_page_unhold(*ma);
630 ma++;
631 }
632 if (mtx != NULL)
633 mtx_unlock(mtx);
634}
635
636/*
637 * vm_page_getfake:
638 *
639 * Create a fictitious page with the specified physical address and
640 * memory attribute.  The memory attribute is the only machine-
641 * dependent aspect of a fictitious page that must be initialized.
642 */
643vm_page_t
644vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
645{
646 vm_page_t m;
647
648 m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
649 m->phys_addr = paddr;
650 m->queue = PQ_NONE;
651 /* Fictitious pages don't use "segind". */
652 m->flags = PG_FICTITIOUS;
653 /* Fictitious pages don't use "order" or "pool". */
654 m->oflags = VPO_BUSY | VPO_UNMANAGED;
655 m->wire_count = 1;
656 pmap_page_set_memattr(m, memattr);
657 return (m);
658}
659
660/*
661 * vm_page_putfake:
662 *
663 * Release a fictitious page.
664 */
665void
666vm_page_putfake(vm_page_t m)
667{
668
669 KASSERT((m->flags & PG_FICTITIOUS) != 0,
670 ("vm_page_putfake: bad page %p", m));
671 uma_zfree(fakepg_zone, m);
672}
673
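#ifdef EXAMPLE_ONLY
/*
 * Illustrative sketch (an editor's example, not part of vm_page.c):
 * pairing vm_page_getfake() with vm_page_putfake(), roughly as a device
 * pager might for a page of device memory.  The function name and the
 * physical address are hypothetical; VM_MEMATTR_DEFAULT is used only to
 * keep the example machine-independent.
 */
static void
example_fictitious_page(void)
{
	vm_page_t m;

	m = vm_page_getfake((vm_paddr_t)0xd0000000, VM_MEMATTR_DEFAULT);
	/* ... hand "m" to a pager or map it while it is needed ... */
	vm_page_putfake(m);
}
#endif /* EXAMPLE_ONLY */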
674/*
675 * vm_page_updatefake:
676 *
677 * Update the given fictitious page to the specified physical address and
678 * memory attribute.
679 */
680void
681vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
682{
683
684 KASSERT((m->flags & PG_FICTITIOUS) != 0,
685 ("vm_page_updatefake: bad page %p", m));
686 m->phys_addr = paddr;
687 pmap_page_set_memattr(m, memattr);
688}
689
690/*
691 * vm_page_free:
692 *
693 * Free a page.
694 */
695void
696vm_page_free(vm_page_t m)
697{
698
699 m->flags &= ~PG_ZERO;
700 vm_page_free_toq(m);
701}
702
703/*
704 * vm_page_free_zero:
705 *
706 * Free a page to the zeroed-pages queue.
707 */
708void
709vm_page_free_zero(vm_page_t m)
710{
711
712 m->flags |= PG_ZERO;
713 vm_page_free_toq(m);
714}
715
716/*
717 * vm_page_sleep:
718 *
719 * Sleep and release the page and page queues locks.
720 *
721 * The object containing the given page must be locked.
722 */
723void
724vm_page_sleep(vm_page_t m, const char *msg)
725{
726
727 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
728 if (mtx_owned(&vm_page_queue_mtx))
729 vm_page_unlock_queues();
730 if (mtx_owned(vm_page_lockptr(m)))
731 vm_page_unlock(m);
732
733 /*
734 * It's possible that while we sleep, the page will get
735 * unbusied and freed. If we are holding the object
736 * lock, we will assume we hold a reference to the object
737 * such that even if m->object changes, we can re-lock
738 * it.
739 */
740 m->oflags |= VPO_WANTED;
741 msleep(m, VM_OBJECT_MTX(m->object), PVM, msg, 0);
742}
743
744/*
745 * vm_page_dirty:
746 *
747 * Set all bits in the page's dirty field.
748 *
749 * The object containing the specified page must be locked if the
750 * call is made from the machine-independent layer.
751 *
752 * See vm_page_clear_dirty_mask().
753 */
754void
755vm_page_dirty(vm_page_t m)
756{
757
758 KASSERT((m->flags & PG_CACHED) == 0,
759 ("vm_page_dirty: page in cache!"));
760 KASSERT(!VM_PAGE_IS_FREE(m),
761 ("vm_page_dirty: page is free!"));
762 KASSERT(m->valid == VM_PAGE_BITS_ALL,
763 ("vm_page_dirty: page is invalid!"));
764 m->dirty = VM_PAGE_BITS_ALL;
765}
766
767/*
768 * vm_page_splay:
769 *
770 * Implements Sleator and Tarjan's top-down splay algorithm. Returns
771 * the vm_page containing the given pindex. If, however, that
772 * pindex is not found in the vm_object, returns a vm_page that is
773 * adjacent to the pindex, coming before or after it.
774 */
775vm_page_t
776vm_page_splay(vm_pindex_t pindex, vm_page_t root)
777{
778 struct vm_page dummy;
779 vm_page_t lefttreemax, righttreemin, y;
780
781 if (root == NULL)
782 return (root);
783 lefttreemax = righttreemin = &dummy;
784 for (;; root = y) {
785 if (pindex < root->pindex) {
786 if ((y = root->left) == NULL)
787 break;
788 if (pindex < y->pindex) {
789 /* Rotate right. */
790 root->left = y->right;
791 y->right = root;
792 root = y;
793 if ((y = root->left) == NULL)
794 break;
795 }
796 /* Link into the new root's right tree. */
797 righttreemin->left = root;
798 righttreemin = root;
799 } else if (pindex > root->pindex) {
800 if ((y = root->right) == NULL)
801 break;
802 if (pindex > y->pindex) {
803 /* Rotate left. */
804 root->right = y->left;
805 y->left = root;
806 root = y;
807 if ((y = root->right) == NULL)
808 break;
809 }
810 /* Link into the new root's left tree. */
811 lefttreemax->right = root;
812 lefttreemax = root;
813 } else
814 break;
815 }
816 /* Assemble the new root. */
817 lefttreemax->right = root->left;
818 righttreemin->left = root->right;
819 root->left = dummy.right;
820 root->right = dummy.left;
821 return (root);
822}
823
824/*
825 * vm_page_insert: [ internal use only ]
826 *
827 * Inserts the given mem entry into the object and object list.
828 *
829 * The pagetables are not updated but will presumably fault the page
830 * in if necessary, or if a kernel page the caller will at some point
831 * enter the page into the kernel's pmap. We are not allowed to block
832 * here so we *can't* do this anyway.
833 *
834 * The object and page must be locked.
835 * This routine may not block.
836 */
837void
838vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
839{
840 vm_page_t root;
841
842 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
843 if (m->object != NULL)
844 panic("vm_page_insert: page already inserted");
845
846 /*
847 * Record the object/offset pair in this page
848 */
849 m->object = object;
850 m->pindex = pindex;
851
852 /*
853 * Now link into the object's ordered list of backed pages.
854 */
855 root = object->root;
856 if (root == NULL) {
857 m->left = NULL;
858 m->right = NULL;
859 TAILQ_INSERT_TAIL(&object->memq, m, listq);
860 } else {
861 root = vm_page_splay(pindex, root);
862 if (pindex < root->pindex) {
863 m->left = root->left;
864 m->right = root;
865 root->left = NULL;
866 TAILQ_INSERT_BEFORE(root, m, listq);
867 } else if (pindex == root->pindex)
868 panic("vm_page_insert: offset already allocated");
869 else {
870 m->right = root->right;
871 m->left = root;
872 root->right = NULL;
873 TAILQ_INSERT_AFTER(&object->memq, root, m, listq);
874 }
875 }
876 object->root = m;
877
878 /*
879 * show that the object has one more resident page.
880 */
881 object->resident_page_count++;
882 /*
883 * Hold the vnode until the last page is released.
884 */
885 if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
886 vhold((struct vnode *)object->handle);
887
888 /*
889 * Since we are inserting a new and possibly dirty page,
890 * update the object's OBJ_MIGHTBEDIRTY flag.
891 */
892 if (m->aflags & PGA_WRITEABLE)
893 vm_object_set_writeable_dirty(object);
894}
895
896/*
897 * vm_page_remove:
898 * NOTE: used by device pager as well -wfj
899 *
900 * Removes the given mem entry from the object/offset-page
901 * table and the object page list, but does not invalidate/terminate
902 * the backing store.
903 *
904 * The object and page must be locked.
905 * The underlying pmap entry (if any) is NOT removed here.
906 * This routine may not block.
907 */
908void
909vm_page_remove(vm_page_t m)
910{
911 vm_object_t object;
912 vm_page_t next, prev, root;
913
914 if ((m->oflags & VPO_UNMANAGED) == 0)
915 vm_page_lock_assert(m, MA_OWNED);
916 if ((object = m->object) == NULL)
917 return;
918 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
919 if (m->oflags & VPO_BUSY) {
920 m->oflags &= ~VPO_BUSY;
921 vm_page_flash(m);
922 }
923
924 /*
925 * Now remove from the object's list of backed pages.
926 */
927 if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
928 /*
929 * Since the page's successor in the list is also its parent
930 * in the tree, its right subtree must be empty.
931 */
932 next->left = m->left;
933 KASSERT(m->right == NULL,
934 ("vm_page_remove: page %p has right child", m));
935 } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
936 prev->right == m) {
937 /*
938 * Since the page's predecessor in the list is also its parent
939 * in the tree, its left subtree must be empty.
940 */
941 KASSERT(m->left == NULL,
942 ("vm_page_remove: page %p has left child", m));
943 prev->right = m->right;
944 } else {
945 if (m != object->root)
946 vm_page_splay(m->pindex, object->root);
947 if (m->left == NULL)
948 root = m->right;
949 else if (m->right == NULL)
950 root = m->left;
951 else {
952 /*
953 * Move the page's successor to the root, because
954 * pages are usually removed in ascending order.
955 */
956 if (m->right != next)
957 vm_page_splay(m->pindex, m->right);
958 next->left = m->left;
959 root = next;
960 }
961 object->root = root;
962 }
963 TAILQ_REMOVE(&object->memq, m, listq);
964
965 /*
966 * And show that the object has one fewer resident page.
967 */
968 object->resident_page_count--;
969 /*
970 * The vnode may now be recycled.
971 */
972 if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
973 vdrop((struct vnode *)object->handle);
974
975 m->object = NULL;
976}
977
978/*
979 * vm_page_lookup:
980 *
981 * Returns the page associated with the object/offset
982 * pair specified; if none is found, NULL is returned.
983 *
984 * The object must be locked.
985 * This routine may not block.
986 * This is a critical path routine
987 */
988vm_page_t
989vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
990{
991 vm_page_t m;
992
993 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
994 if ((m = object->root) != NULL && m->pindex != pindex) {
995 m = vm_page_splay(pindex, m);
996 if ((object->root = m)->pindex != pindex)
997 m = NULL;
998 }
999 return (m);
1000}
1001
1002/*
1003 * vm_page_find_least:
1004 *
1005 * Returns the page associated with the object with least pindex
1006 * greater than or equal to the parameter pindex, or NULL.
1007 *
1008 * The object must be locked.
1009 * The routine may not block.
1010 */
1011vm_page_t
1012vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
1013{
1014 vm_page_t m;
1015
1016 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1017 if ((m = TAILQ_FIRST(&object->memq)) != NULL) {
1018 if (m->pindex < pindex) {
1019 m = vm_page_splay(pindex, object->root);
1020 if ((object->root = m)->pindex < pindex)
1021 m = TAILQ_NEXT(m, listq);
1022 }
1023 }
1024 return (m);
1025}
1026
1027/*
1028 * Returns the given page's successor (by pindex) within the object if it is
1029 * resident; if none is found, NULL is returned.
1030 *
1031 * The object must be locked.
1032 */
1033vm_page_t
1034vm_page_next(vm_page_t m)
1035{
1036 vm_page_t next;
1037
1038 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1039 if ((next = TAILQ_NEXT(m, listq)) != NULL &&
1040 next->pindex != m->pindex + 1)
1041 next = NULL;
1042 return (next);
1043}
1044
1045/*
1046 * Returns the given page's predecessor (by pindex) within the object if it is
1047 * resident; if none is found, NULL is returned.
1048 *
1049 * The object must be locked.
1050 */
1051vm_page_t
1052vm_page_prev(vm_page_t m)
1053{
1054 vm_page_t prev;
1055
1056 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1057 if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
1058 prev->pindex != m->pindex - 1)
1059 prev = NULL;
1060 return (prev);
1061}
1062
1063/*
1064 * vm_page_rename:
1065 *
1066 * Move the given memory entry from its
1067 * current object to the specified target object/offset.
1068 *
1069 * The object must be locked.
1070 * This routine may not block.
1071 *
1072 * Note: swap associated with the page must be invalidated by the move. We
1073 * have to do this for several reasons: (1) we aren't freeing the
1074 * page, (2) we are dirtying the page, (3) the VM system is probably
1075 * moving the page from object A to B, and will then later move
1076 * the backing store from A to B and we can't have a conflict.
1077 *
1078 * Note: we *always* dirty the page. It is necessary both for the
1079 * fact that we moved it, and because we may be invalidating
1080 * swap. If the page is on the cache, we have to deactivate it
1081 * or vm_page_dirty() will panic. Dirty pages are not allowed
1082 * on the cache.
1083 */
1084void
1085vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1086{
1087
1088 vm_page_remove(m);
1089 vm_page_insert(m, new_object, new_pindex);
1090 vm_page_dirty(m);
1091}
1092
1093/*
1094 * Convert all of the given object's cached pages that have a
1095 * pindex within the given range into free pages. If the value
1096 * zero is given for "end", then the range's upper bound is
1097 * infinity. If the given object is backed by a vnode and it
1098 * transitions from having one or more cached pages to none, the
1099 * vnode's hold count is reduced.
1100 */
1101void
1102vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1103{
1104 vm_page_t m, m_next;
1105 boolean_t empty;
1106
1107 mtx_lock(&vm_page_queue_free_mtx);
1108 if (__predict_false(object->cache == NULL)) {
1109 mtx_unlock(&vm_page_queue_free_mtx);
1110 return;
1111 }
1112 m = object->cache = vm_page_splay(start, object->cache);
1113 if (m->pindex < start) {
1114 if (m->right == NULL)
1115 m = NULL;
1116 else {
1117 m_next = vm_page_splay(start, m->right);
1118 m_next->left = m;
1119 m->right = NULL;
1120 m = object->cache = m_next;
1121 }
1122 }
1123
1124 /*
1125 * At this point, "m" is either (1) a reference to the page
1126 * with the least pindex that is greater than or equal to
1127 * "start" or (2) NULL.
1128 */
1129 for (; m != NULL && (m->pindex < end || end == 0); m = m_next) {
1130 /*
1131 * Find "m"'s successor and remove "m" from the
1132 * object's cache.
1133 */
1134 if (m->right == NULL) {
1135 object->cache = m->left;
1136 m_next = NULL;
1137 } else {
1138 m_next = vm_page_splay(start, m->right);
1139 m_next->left = m->left;
1140 object->cache = m_next;
1141 }
1142 /* Convert "m" to a free page. */
1143 m->object = NULL;
1144 m->valid = 0;
1145 /* Clear PG_CACHED and set PG_FREE. */
1146 m->flags ^= PG_CACHED | PG_FREE;
1147 KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
1148 ("vm_page_cache_free: page %p has inconsistent flags", m));
1149 cnt.v_cache_count--;
1150 cnt.v_free_count++;
1151 }
1152 empty = object->cache == NULL;
1153 mtx_unlock(&vm_page_queue_free_mtx);
1154 if (object->type == OBJT_VNODE && empty)
1155 vdrop(object->handle);
1156}
1157
1158/*
1159 * Returns the cached page that is associated with the given
1160 * object and offset. If, however, none exists, returns NULL.
1161 *
1162 * The free page queue must be locked.
1163 */
1164static inline vm_page_t
1165vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
1166{
1167 vm_page_t m;
1168
1169 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1170 if ((m = object->cache) != NULL && m->pindex != pindex) {
1171 m = vm_page_splay(pindex, m);
1172 if ((object->cache = m)->pindex != pindex)
1173 m = NULL;
1174 }
1175 return (m);
1176}
1177
1178/*
1179 * Remove the given cached page from its containing object's
1180 * collection of cached pages.
1181 *
1182 * The free page queue must be locked.
1183 */
1184void
1185vm_page_cache_remove(vm_page_t m)
1186{
1187 vm_object_t object;
1188 vm_page_t root;
1189
1190 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1191 KASSERT((m->flags & PG_CACHED) != 0,
1192 ("vm_page_cache_remove: page %p is not cached", m));
1193 object = m->object;
1194 if (m != object->cache) {
1195 root = vm_page_splay(m->pindex, object->cache);
1196 KASSERT(root == m,
1197 ("vm_page_cache_remove: page %p is not cached in object %p",
1198 m, object));
1199 }
1200 if (m->left == NULL)
1201 root = m->right;
1202 else if (m->right == NULL)
1203 root = m->left;
1204 else {
1205 root = vm_page_splay(m->pindex, m->left);
1206 root->right = m->right;
1207 }
1208 object->cache = root;
1209 m->object = NULL;
1210 cnt.v_cache_count--;
1211}
1212
1213/*
1214 * Transfer all of the cached pages with offset greater than or
1215 * equal to 'offidxstart' from the original object's cache to the
1216 * new object's cache. However, any cached pages with offset
1217 * greater than or equal to the new object's size are kept in the
1218 * original object. Initially, the new object's cache must be
1219 * empty. Offset 'offidxstart' in the original object must
1220 * correspond to offset zero in the new object.
1221 *
1222 * The new object must be locked.
1223 */
1224void
1225vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
1226 vm_object_t new_object)
1227{
1228 vm_page_t m, m_next;
1229
1230 /*
1231 * Insertion into an object's collection of cached pages
1232 * requires the object to be locked. In contrast, removal does
1233 * not.
1234 */
1235 VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
1236 KASSERT(new_object->cache == NULL,
1237 ("vm_page_cache_transfer: object %p has cached pages",
1238 new_object));
1239 mtx_lock(&vm_page_queue_free_mtx);
1240 if ((m = orig_object->cache) != NULL) {
1241 /*
1242 * Transfer all of the pages with offset greater than or
1243 * equal to 'offidxstart' from the original object's
1244 * cache to the new object's cache.
1245 */
1246 m = vm_page_splay(offidxstart, m);
1247 if (m->pindex < offidxstart) {
1248 orig_object->cache = m;
1249 new_object->cache = m->right;
1250 m->right = NULL;
1251 } else {
1252 orig_object->cache = m->left;
1253 new_object->cache = m;
1254 m->left = NULL;
1255 }
1256 while ((m = new_object->cache) != NULL) {
1257 if ((m->pindex - offidxstart) >= new_object->size) {
1258 /*
1259 * Return all of the cached pages with
1260 * offset greater than or equal to the
1261 * new object's size to the original
1262 * object's cache.
1263 */
1264 new_object->cache = m->left;
1265 m->left = orig_object->cache;
1266 orig_object->cache = m;
1267 break;
1268 }
1269 m_next = vm_page_splay(m->pindex, m->right);
1270 /* Update the page's object and offset. */
1271 m->object = new_object;
1272 m->pindex -= offidxstart;
1273 if (m_next == NULL)
1274 break;
1275 m->right = NULL;
1276 m_next->left = m;
1277 new_object->cache = m_next;
1278 }
1279 KASSERT(new_object->cache == NULL ||
1280 new_object->type == OBJT_SWAP,
1281 ("vm_page_cache_transfer: object %p's type is incompatible"
1282 " with cached pages", new_object));
1283 }
1284 mtx_unlock(&vm_page_queue_free_mtx);
1285}
1286
1287/*
1288 * vm_page_alloc:
1289 *
1290 * Allocate and return a page that is associated with the specified
1291 * object and offset pair. By default, this page has the flag VPO_BUSY
1292 * set.
1293 *
1294 * The caller must always specify an allocation class.
1295 *
1296 * allocation classes:
1297 * VM_ALLOC_NORMAL normal process request
1298 * VM_ALLOC_SYSTEM system *really* needs a page
1299 * VM_ALLOC_INTERRUPT interrupt time request
1300 *
1301 * optional allocation flags:
1302 * VM_ALLOC_COUNT(number) the number of additional pages that the caller
1303 * intends to allocate
1304 * VM_ALLOC_IFCACHED return page only if it is cached
1305 * VM_ALLOC_IFNOTCACHED return NULL, do not reactivate if the page
1306 * is cached
1307 * VM_ALLOC_NOBUSY do not set the flag VPO_BUSY on the page
81
82#include "opt_vm.h"
83
84#include <sys/param.h>
85#include <sys/systm.h>
86#include <sys/lock.h>
87#include <sys/kernel.h>
88#include <sys/limits.h>
89#include <sys/malloc.h>
90#include <sys/msgbuf.h>
91#include <sys/mutex.h>
92#include <sys/proc.h>
93#include <sys/sysctl.h>
94#include <sys/vmmeter.h>
95#include <sys/vnode.h>
96
97#include <vm/vm.h>
98#include <vm/pmap.h>
99#include <vm/vm_param.h>
100#include <vm/vm_kern.h>
101#include <vm/vm_object.h>
102#include <vm/vm_page.h>
103#include <vm/vm_pageout.h>
104#include <vm/vm_pager.h>
105#include <vm/vm_phys.h>
106#include <vm/vm_reserv.h>
107#include <vm/vm_extern.h>
108#include <vm/uma.h>
109#include <vm/uma_int.h>
110
111#include <machine/md_var.h>
112
113/*
114 * Associated with page of user-allocatable memory is a
115 * page structure.
116 */
117
118struct vpgqueues vm_page_queues[PQ_COUNT];
119struct vpglocks vm_page_queue_lock;
120struct vpglocks vm_page_queue_free_lock;
121
122struct vpglocks pa_lock[PA_LOCK_COUNT];
123
124vm_page_t vm_page_array = 0;
125int vm_page_array_size = 0;
126long first_page = 0;
127int vm_page_zero_count = 0;
128
129static int boot_pages = UMA_BOOT_PAGES;
130TUNABLE_INT("vm.boot_pages", &boot_pages);
131SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0,
132 "number of pages allocated for bootstrapping the VM system");
133
134static int pa_tryrelock_restart;
135SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
136 &pa_tryrelock_restart, 0, "Number of tryrelock restarts");
137
138static uma_zone_t fakepg_zone;
139
140static struct vnode *vm_page_alloc_init(vm_page_t m);
141static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
142static void vm_page_queue_remove(int queue, vm_page_t m);
143static void vm_page_enqueue(int queue, vm_page_t m);
144static void vm_page_init_fakepg(void *dummy);
145
146SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
147
148static void
149vm_page_init_fakepg(void *dummy)
150{
151
152 fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
153 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
154}
155
156/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
157#if PAGE_SIZE == 32768
158#ifdef CTASSERT
159CTASSERT(sizeof(u_long) >= 8);
160#endif
161#endif
162
163/*
164 * Try to acquire a physical address lock while a pmap is locked. If we
165 * fail to trylock we unlock and lock the pmap directly and cache the
166 * locked pa in *locked. The caller should then restart their loop in case
167 * the virtual to physical mapping has changed.
168 */
169int
170vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked)
171{
172 vm_paddr_t lockpa;
173
174 lockpa = *locked;
175 *locked = pa;
176 if (lockpa) {
177 PA_LOCK_ASSERT(lockpa, MA_OWNED);
178 if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa))
179 return (0);
180 PA_UNLOCK(lockpa);
181 }
182 if (PA_TRYLOCK(pa))
183 return (0);
184 PMAP_UNLOCK(pmap);
185 atomic_add_int(&pa_tryrelock_restart, 1);
186 PA_LOCK(pa);
187 PMAP_LOCK(pmap);
188 return (EAGAIN);
189}
190
191/*
192 * vm_set_page_size:
193 *
194 * Sets the page size, perhaps based upon the memory
195 * size. Must be called before any use of page-size
196 * dependent functions.
197 */
198void
199vm_set_page_size(void)
200{
201 if (cnt.v_page_size == 0)
202 cnt.v_page_size = PAGE_SIZE;
203 if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0)
204 panic("vm_set_page_size: page size not a power of two");
205}
206
207/*
208 * vm_page_blacklist_lookup:
209 *
210 * See if a physical address in this page has been listed
211 * in the blacklist tunable. Entries in the tunable are
212 * separated by spaces or commas. If an invalid integer is
213 * encountered then the rest of the string is skipped.
214 */
215static int
216vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
217{
218 vm_paddr_t bad;
219 char *cp, *pos;
220
221 for (pos = list; *pos != '\0'; pos = cp) {
222 bad = strtoq(pos, &cp, 0);
223 if (*cp != '\0') {
224 if (*cp == ' ' || *cp == ',') {
225 cp++;
226 if (cp == pos)
227 continue;
228 } else
229 break;
230 }
231 if (pa == trunc_page(bad))
232 return (1);
233 }
234 return (0);
235}
236
237/*
238 * vm_page_startup:
239 *
240 * Initializes the resident memory module.
241 *
242 * Allocates memory for the page cells, and
243 * for the object/offset-to-page hash table headers.
244 * Each page cell is initialized and placed on the free list.
245 */
246vm_offset_t
247vm_page_startup(vm_offset_t vaddr)
248{
249 vm_offset_t mapped;
250 vm_paddr_t page_range;
251 vm_paddr_t new_end;
252 int i;
253 vm_paddr_t pa;
254 vm_paddr_t last_pa;
255 char *list;
256
257 /* the biggest memory array is the second group of pages */
258 vm_paddr_t end;
259 vm_paddr_t biggestsize;
260 vm_paddr_t low_water, high_water;
261 int biggestone;
262
263 biggestsize = 0;
264 biggestone = 0;
265 vaddr = round_page(vaddr);
266
267 for (i = 0; phys_avail[i + 1]; i += 2) {
268 phys_avail[i] = round_page(phys_avail[i]);
269 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
270 }
271
272 low_water = phys_avail[0];
273 high_water = phys_avail[1];
274
275 for (i = 0; phys_avail[i + 1]; i += 2) {
276 vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
277
278 if (size > biggestsize) {
279 biggestone = i;
280 biggestsize = size;
281 }
282 if (phys_avail[i] < low_water)
283 low_water = phys_avail[i];
284 if (phys_avail[i + 1] > high_water)
285 high_water = phys_avail[i + 1];
286 }
287
288#ifdef XEN
289 low_water = 0;
290#endif
291
292 end = phys_avail[biggestone+1];
293
294 /*
295 * Initialize the locks.
296 */
297 mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF |
298 MTX_RECURSE);
299 mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL,
300 MTX_DEF);
301
302 /* Setup page locks. */
303 for (i = 0; i < PA_LOCK_COUNT; i++)
304 mtx_init(&pa_lock[i].data, "page lock", NULL, MTX_DEF);
305
306 /*
307 * Initialize the queue headers for the hold queue, the active queue,
308 * and the inactive queue.
309 */
310 for (i = 0; i < PQ_COUNT; i++)
311 TAILQ_INIT(&vm_page_queues[i].pl);
312 vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
313 vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
314 vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count;
315
316 /*
317 * Allocate memory for use when boot strapping the kernel memory
318 * allocator.
319 */
320 new_end = end - (boot_pages * UMA_SLAB_SIZE);
321 new_end = trunc_page(new_end);
322 mapped = pmap_map(&vaddr, new_end, end,
323 VM_PROT_READ | VM_PROT_WRITE);
324 bzero((void *)mapped, end - new_end);
325 uma_startup((void *)mapped, boot_pages);
326
327#if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \
328 defined(__mips__)
329 /*
330 * Allocate a bitmap to indicate that a random physical page
331 * needs to be included in a minidump.
332 *
333 * The amd64 port needs this to indicate which direct map pages
334 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
335 *
336 * However, i386 still needs this workspace internally within the
337 * minidump code. In theory, they are not needed on i386, but are
338 * included should the sf_buf code decide to use them.
339 */
340 last_pa = 0;
341 for (i = 0; dump_avail[i + 1] != 0; i += 2)
342 if (dump_avail[i + 1] > last_pa)
343 last_pa = dump_avail[i + 1];
344 page_range = last_pa / PAGE_SIZE;
345 vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
346 new_end -= vm_page_dump_size;
347 vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
348 new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
349 bzero((void *)vm_page_dump, vm_page_dump_size);
350#endif
351#ifdef __amd64__
352 /*
353 * Request that the physical pages underlying the message buffer be
354 * included in a crash dump. Since the message buffer is accessed
355 * through the direct map, they are not automatically included.
356 */
357 pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
358 last_pa = pa + round_page(msgbufsize);
359 while (pa < last_pa) {
360 dump_add_page(pa);
361 pa += PAGE_SIZE;
362 }
363#endif
364 /*
365 * Compute the number of pages of memory that will be available for
366 * use (taking into account the overhead of a page structure per
367 * page).
368 */
369 first_page = low_water / PAGE_SIZE;
370#ifdef VM_PHYSSEG_SPARSE
371 page_range = 0;
372 for (i = 0; phys_avail[i + 1] != 0; i += 2)
373 page_range += atop(phys_avail[i + 1] - phys_avail[i]);
374#elif defined(VM_PHYSSEG_DENSE)
375 page_range = high_water / PAGE_SIZE - first_page;
376#else
377#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
378#endif
379 end = new_end;
380
381 /*
382 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
383 */
384 vaddr += PAGE_SIZE;
385
386 /*
387 * Initialize the mem entry structures now, and put them in the free
388 * queue.
389 */
390 new_end = trunc_page(end - page_range * sizeof(struct vm_page));
391 mapped = pmap_map(&vaddr, new_end, end,
392 VM_PROT_READ | VM_PROT_WRITE);
393 vm_page_array = (vm_page_t) mapped;
394#if VM_NRESERVLEVEL > 0
395 /*
396 * Allocate memory for the reservation management system's data
397 * structures.
398 */
399 new_end = vm_reserv_startup(&vaddr, new_end, high_water);
400#endif
401#if defined(__amd64__) || defined(__mips__)
402 /*
403 * pmap_map on amd64 and mips can come out of the direct-map, not kvm
404 * like i386, so the pages must be tracked for a crashdump to include
405 * this data. This includes the vm_page_array and the early UMA
406 * bootstrap pages.
407 */
408 for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
409 dump_add_page(pa);
410#endif
411 phys_avail[biggestone + 1] = new_end;
412
413 /*
414 * Clear all of the page structures
415 */
416 bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
417 for (i = 0; i < page_range; i++)
418 vm_page_array[i].order = VM_NFREEORDER;
419 vm_page_array_size = page_range;
420
421 /*
422 * Initialize the physical memory allocator.
423 */
424 vm_phys_init();
425
426 /*
427 * Add every available physical page that is not blacklisted to
428 * the free lists.
429 */
430 cnt.v_page_count = 0;
431 cnt.v_free_count = 0;
432 list = getenv("vm.blacklist");
433 for (i = 0; phys_avail[i + 1] != 0; i += 2) {
434 pa = phys_avail[i];
435 last_pa = phys_avail[i + 1];
436 while (pa < last_pa) {
437 if (list != NULL &&
438 vm_page_blacklist_lookup(list, pa))
439 printf("Skipping page with pa 0x%jx\n",
440 (uintmax_t)pa);
441 else
442 vm_phys_add_page(pa);
443 pa += PAGE_SIZE;
444 }
445 }
446 freeenv(list);
447#if VM_NRESERVLEVEL > 0
448 /*
449 * Initialize the reservation management system.
450 */
451 vm_reserv_init();
452#endif
453 return (vaddr);
454}
455
456
457CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0);
458
459void
460vm_page_aflag_set(vm_page_t m, uint8_t bits)
461{
462 uint32_t *addr, val;
463
464 /*
465 * The PGA_WRITEABLE flag can only be set if the page is managed and
466 * VPO_BUSY. Currently, this flag is only set by pmap_enter().
467 */
468 KASSERT((bits & PGA_WRITEABLE) == 0 ||
469 (m->oflags & (VPO_UNMANAGED | VPO_BUSY)) == VPO_BUSY,
470 ("PGA_WRITEABLE and !VPO_BUSY"));
471
472 /*
473 * We want to use atomic updates for m->aflags, which is a
474 * byte wide. Not all architectures provide atomic operations
475 * on the single-byte destination. Punt and access the whole
476 * 4-byte word with an atomic update. Parallel non-atomic
477 * updates to the fields included in the update by proximity
478 * are handled properly by atomics.
479 */
480 addr = (void *)&m->aflags;
481 MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0);
482 val = bits;
483#if BYTE_ORDER == BIG_ENDIAN
484 val <<= 24;
485#endif
486 atomic_set_32(addr, val);
487}
488
489void
490vm_page_aflag_clear(vm_page_t m, uint8_t bits)
491{
492 uint32_t *addr, val;
493
494 /*
495 * The PGA_REFERENCED flag can only be cleared if the object
496 * containing the page is locked.
497 */
498 KASSERT((bits & PGA_REFERENCED) == 0 || VM_OBJECT_LOCKED(m->object),
499 ("PGA_REFERENCED and !VM_OBJECT_LOCKED"));
500
501 /*
502 * See the comment in vm_page_aflag_set().
503 */
504 addr = (void *)&m->aflags;
505 MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0);
506 val = bits;
507#if BYTE_ORDER == BIG_ENDIAN
508 val <<= 24;
509#endif
510 atomic_clear_32(addr, val);
511}
512
513void
514vm_page_reference(vm_page_t m)
515{
516
517 vm_page_aflag_set(m, PGA_REFERENCED);
518}
519
520void
521vm_page_busy(vm_page_t m)
522{
523
524 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
525 KASSERT((m->oflags & VPO_BUSY) == 0,
526 ("vm_page_busy: page already busy!!!"));
527 m->oflags |= VPO_BUSY;
528}
529
530/*
531 * vm_page_flash:
532 *
533 * wakeup anyone waiting for the page.
534 */
535void
536vm_page_flash(vm_page_t m)
537{
538
539 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
540 if (m->oflags & VPO_WANTED) {
541 m->oflags &= ~VPO_WANTED;
542 wakeup(m);
543 }
544}
545
546/*
547 * vm_page_wakeup:
548 *
549 * clear the VPO_BUSY flag and wakeup anyone waiting for the
550 * page.
551 *
552 */
553void
554vm_page_wakeup(vm_page_t m)
555{
556
557 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
558 KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!"));
559 m->oflags &= ~VPO_BUSY;
560 vm_page_flash(m);
561}
562
563void
564vm_page_io_start(vm_page_t m)
565{
566
567 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
568 m->busy++;
569}
570
571void
572vm_page_io_finish(vm_page_t m)
573{
574
575 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
576 KASSERT(m->busy > 0, ("vm_page_io_finish: page %p is not busy", m));
577 m->busy--;
578 if (m->busy == 0)
579 vm_page_flash(m);
580}
581
582/*
583 * Keep page from being freed by the page daemon
584 * much of the same effect as wiring, except much lower
585 * overhead and should be used only for *very* temporary
586 * holding ("wiring").
587 */
588void
589vm_page_hold(vm_page_t mem)
590{
591
592 vm_page_lock_assert(mem, MA_OWNED);
593 mem->hold_count++;
594}
595
596void
597vm_page_unhold(vm_page_t mem)
598{
599
600 vm_page_lock_assert(mem, MA_OWNED);
601 --mem->hold_count;
602 KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
603 if (mem->hold_count == 0 && mem->queue == PQ_HOLD)
604 vm_page_free_toq(mem);
605}
606
607/*
608 * vm_page_unhold_pages:
609 *
610 * Unhold each of the pages that is referenced by the given array.
611 */
612void
613vm_page_unhold_pages(vm_page_t *ma, int count)
614{
615 struct mtx *mtx, *new_mtx;
616
617 mtx = NULL;
618 for (; count != 0; count--) {
619 /*
620 * Avoid releasing and reacquiring the same page lock.
621 */
622 new_mtx = vm_page_lockptr(*ma);
623 if (mtx != new_mtx) {
624 if (mtx != NULL)
625 mtx_unlock(mtx);
626 mtx = new_mtx;
627 mtx_lock(mtx);
628 }
629 vm_page_unhold(*ma);
630 ma++;
631 }
632 if (mtx != NULL)
633 mtx_unlock(mtx);
634}
635
636/*
637 * vm_page_getfake:
638 *
639 * Create a fictitious page with the specified physical address and
640 * memory attribute. The memory attribute is the only the machine-
641 * dependent aspect of a fictitious page that must be initialized.
642 */
643vm_page_t
644vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
645{
646 vm_page_t m;
647
648 m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
649 m->phys_addr = paddr;
650 m->queue = PQ_NONE;
651 /* Fictitious pages don't use "segind". */
652 m->flags = PG_FICTITIOUS;
653 /* Fictitious pages don't use "order" or "pool". */
654 m->oflags = VPO_BUSY | VPO_UNMANAGED;
655 m->wire_count = 1;
656 pmap_page_set_memattr(m, memattr);
657 return (m);
658}
659
660/*
661 * vm_page_putfake:
662 *
663 * Release a fictitious page.
664 */
665void
666vm_page_putfake(vm_page_t m)
667{
668
669 KASSERT((m->flags & PG_FICTITIOUS) != 0,
670 ("vm_page_putfake: bad page %p", m));
671 uma_zfree(fakepg_zone, m);
672}
673
674/*
675 * vm_page_updatefake:
676 *
677 * Update the given fictitious page to the specified physical address and
678 * memory attribute.
679 */
680void
681vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
682{
683
684 KASSERT((m->flags & PG_FICTITIOUS) != 0,
685 ("vm_page_updatefake: bad page %p", m));
686 m->phys_addr = paddr;
687 pmap_page_set_memattr(m, memattr);
688}
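
/*
 * Illustrative sketch (not part of the original file): wrapping a raw
 * physical address in a fictitious page and releasing it again, in the
 * style of a device-pager consumer.  The physical address is assumed to
 * come from the caller; VM_MEMATTR_DEFAULT keeps the example machine-
 * independent.
 */
static void
example_fake_page(vm_paddr_t paddr)
{
	vm_page_t m;

	m = vm_page_getfake(paddr, VM_MEMATTR_DEFAULT);
	/* ... hand "m" to code that expects a vm_page_t ... */
	vm_page_putfake(m);
}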
689
690/*
691 * vm_page_free:
692 *
693 * Free a page.
694 */
695void
696vm_page_free(vm_page_t m)
697{
698
699 m->flags &= ~PG_ZERO;
700 vm_page_free_toq(m);
701}
702
703/*
704 * vm_page_free_zero:
705 *
706 * Free a page to the zeroed-pages queue.
707 */
708void
709vm_page_free_zero(vm_page_t m)
710{
711
712 m->flags |= PG_ZERO;
713 vm_page_free_toq(m);
714}
715
716/*
717 * vm_page_sleep:
718 *
719 * Sleep and release the page and page queues locks.
720 *
721 * The object containing the given page must be locked.
722 */
723void
724vm_page_sleep(vm_page_t m, const char *msg)
725{
726
727 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
728 if (mtx_owned(&vm_page_queue_mtx))
729 vm_page_unlock_queues();
730 if (mtx_owned(vm_page_lockptr(m)))
731 vm_page_unlock(m);
732
733 /*
734 * It's possible that while we sleep, the page will get
735 * unbusied and freed. If we are holding the object
736 * lock, we will assume we hold a reference to the object
737 * such that even if m->object changes, we can re-lock
738 * it.
739 */
740 m->oflags |= VPO_WANTED;
741 msleep(m, VM_OBJECT_MTX(m->object), PVM, msg, 0);
742}
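
/*
 * Illustrative sketch (not part of the original file): the usual caller
 * pattern around vm_page_sleep().  Because the page may be freed while
 * we sleep, the lookup is redone after each wakeup; once the page is no
 * longer busy it is busied for the caller before the object is unlocked.
 */
static vm_page_t
example_lookup_and_busy(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	VM_OBJECT_LOCK(object);
	while ((m = vm_page_lookup(object, pindex)) != NULL &&
	    ((m->oflags & VPO_BUSY) != 0 || m->busy != 0))
		vm_page_sleep(m, "expgsl");
	if (m != NULL)
		vm_page_busy(m);
	VM_OBJECT_UNLOCK(object);
	return (m);
}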
743
744/*
745 * vm_page_dirty:
746 *
747 * Set all bits in the page's dirty field.
748 *
749 * The object containing the specified page must be locked if the
750 * call is made from the machine-independent layer.
751 *
752 * See vm_page_clear_dirty_mask().
753 */
754void
755vm_page_dirty(vm_page_t m)
756{
757
758 KASSERT((m->flags & PG_CACHED) == 0,
759 ("vm_page_dirty: page in cache!"));
760 KASSERT(!VM_PAGE_IS_FREE(m),
761 ("vm_page_dirty: page is free!"));
762 KASSERT(m->valid == VM_PAGE_BITS_ALL,
763 ("vm_page_dirty: page is invalid!"));
764 m->dirty = VM_PAGE_BITS_ALL;
765}
766
767/*
768 * vm_page_splay:
769 *
770 * Implements Sleator and Tarjan's top-down splay algorithm. Returns
771 * the vm_page containing the given pindex. If, however, that
772 * pindex is not found in the vm_object, returns a vm_page that is
773 * adjacent to the pindex, coming before or after it.
774 */
775vm_page_t
776vm_page_splay(vm_pindex_t pindex, vm_page_t root)
777{
778 struct vm_page dummy;
779 vm_page_t lefttreemax, righttreemin, y;
780
781 if (root == NULL)
782 return (root);
783 lefttreemax = righttreemin = &dummy;
784 for (;; root = y) {
785 if (pindex < root->pindex) {
786 if ((y = root->left) == NULL)
787 break;
788 if (pindex < y->pindex) {
789 /* Rotate right. */
790 root->left = y->right;
791 y->right = root;
792 root = y;
793 if ((y = root->left) == NULL)
794 break;
795 }
796 /* Link into the new root's right tree. */
797 righttreemin->left = root;
798 righttreemin = root;
799 } else if (pindex > root->pindex) {
800 if ((y = root->right) == NULL)
801 break;
802 if (pindex > y->pindex) {
803 /* Rotate left. */
804 root->right = y->left;
805 y->left = root;
806 root = y;
807 if ((y = root->right) == NULL)
808 break;
809 }
810 /* Link into the new root's left tree. */
811 lefttreemax->right = root;
812 lefttreemax = root;
813 } else
814 break;
815 }
816 /* Assemble the new root. */
817 lefttreemax->right = root->left;
818 righttreemin->left = root->right;
819 root->left = dummy.right;
820 root->right = dummy.left;
821 return (root);
822}
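
/*
 * Illustrative sketch (not part of the original file): the caller-side
 * pattern for vm_page_splay().  The splay moves the nearest page to the
 * root, so the caller must store the returned page back into the tree's
 * root pointer before testing whether it is an exact match; this mirrors
 * vm_page_lookup() below.
 */
static vm_page_t
example_splay_lookup(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	if ((m = object->root) != NULL && m->pindex != pindex) {
		m = vm_page_splay(pindex, m);
		if ((object->root = m)->pindex != pindex)
			m = NULL;
	}
	return (m);
}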
823
824/*
825 * vm_page_insert: [ internal use only ]
826 *
827 * Inserts the given mem entry into the object and object list.
828 *
829 * The page tables are not updated; the page will presumably be
830 * faulted in if necessary, or, for a kernel page, the caller will at
831 * some point enter it into the kernel's pmap. We are not allowed to
832 * block here, so we could not update the pmap anyway.
833 *
834 * The object and page must be locked.
835 * This routine may not block.
836 */
837void
838vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
839{
840 vm_page_t root;
841
842 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
843 if (m->object != NULL)
844 panic("vm_page_insert: page already inserted");
845
846 /*
847 * Record the object/offset pair in this page
848 */
849 m->object = object;
850 m->pindex = pindex;
851
852 /*
853 * Now link into the object's ordered list of backed pages.
854 */
855 root = object->root;
856 if (root == NULL) {
857 m->left = NULL;
858 m->right = NULL;
859 TAILQ_INSERT_TAIL(&object->memq, m, listq);
860 } else {
861 root = vm_page_splay(pindex, root);
862 if (pindex < root->pindex) {
863 m->left = root->left;
864 m->right = root;
865 root->left = NULL;
866 TAILQ_INSERT_BEFORE(root, m, listq);
867 } else if (pindex == root->pindex)
868 panic("vm_page_insert: offset already allocated");
869 else {
870 m->right = root->right;
871 m->left = root;
872 root->right = NULL;
873 TAILQ_INSERT_AFTER(&object->memq, root, m, listq);
874 }
875 }
876 object->root = m;
877
878 /*
879 * show that the object has one more resident page.
880 */
881 object->resident_page_count++;
882 /*
883 * Hold the vnode until the last page is released.
884 */
885 if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
886 vhold((struct vnode *)object->handle);
887
888 /*
889 * Since we are inserting a new and possibly dirty page,
890 * update the object's OBJ_MIGHTBEDIRTY flag.
891 */
892 if (m->aflags & PGA_WRITEABLE)
893 vm_object_set_writeable_dirty(object);
894}
895
896/*
897 * vm_page_remove:
898 * NOTE: used by device pager as well -wfj
899 *
900 * Removes the given mem entry from the object/offset-page
901 * table and the object page list, but does not invalidate/terminate
902 * the backing store.
903 *
904 * The object and page must be locked.
905 * The underlying pmap entry (if any) is NOT removed here.
906 * This routine may not block.
907 */
908void
909vm_page_remove(vm_page_t m)
910{
911 vm_object_t object;
912 vm_page_t next, prev, root;
913
914 if ((m->oflags & VPO_UNMANAGED) == 0)
915 vm_page_lock_assert(m, MA_OWNED);
916 if ((object = m->object) == NULL)
917 return;
918 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
919 if (m->oflags & VPO_BUSY) {
920 m->oflags &= ~VPO_BUSY;
921 vm_page_flash(m);
922 }
923
924 /*
925 * Now remove from the object's list of backed pages.
926 */
927 if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
928 /*
929 * Since the page's successor in the list is also its parent
930 * in the tree, its right subtree must be empty.
931 */
932 next->left = m->left;
933 KASSERT(m->right == NULL,
934 ("vm_page_remove: page %p has right child", m));
935 } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
936 prev->right == m) {
937 /*
938 * Since the page's predecessor in the list is also its parent
939 * in the tree, its left subtree must be empty.
940 */
941 KASSERT(m->left == NULL,
942 ("vm_page_remove: page %p has left child", m));
943 prev->right = m->right;
944 } else {
945 if (m != object->root)
946 vm_page_splay(m->pindex, object->root);
947 if (m->left == NULL)
948 root = m->right;
949 else if (m->right == NULL)
950 root = m->left;
951 else {
952 /*
953 * Move the page's successor to the root, because
954 * pages are usually removed in ascending order.
955 */
956 if (m->right != next)
957 vm_page_splay(m->pindex, m->right);
958 next->left = m->left;
959 root = next;
960 }
961 object->root = root;
962 }
963 TAILQ_REMOVE(&object->memq, m, listq);
964
965 /*
966 * And show that the object has one fewer resident page.
967 */
968 object->resident_page_count--;
969 /*
970 * The vnode may now be recycled.
971 */
972 if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
973 vdrop((struct vnode *)object->handle);
974
975 m->object = NULL;
976}
977
978/*
979 * vm_page_lookup:
980 *
981 * Returns the page associated with the object/offset
982 * pair specified; if none is found, NULL is returned.
983 *
984 * The object must be locked.
985 * This routine may not block.
986 * This is a critical path routine.
987 */
988vm_page_t
989vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
990{
991 vm_page_t m;
992
993 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
994 if ((m = object->root) != NULL && m->pindex != pindex) {
995 m = vm_page_splay(pindex, m);
996 if ((object->root = m)->pindex != pindex)
997 m = NULL;
998 }
999 return (m);
1000}
1001
1002/*
1003 * vm_page_find_least:
1004 *
1005 * Returns the page associated with the object with least pindex
1006 * greater than or equal to the parameter pindex, or NULL.
1007 *
1008 * The object must be locked.
1009 * The routine may not block.
1010 */
1011vm_page_t
1012vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
1013{
1014 vm_page_t m;
1015
1016 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1017 if ((m = TAILQ_FIRST(&object->memq)) != NULL) {
1018 if (m->pindex < pindex) {
1019 m = vm_page_splay(pindex, object->root);
1020 if ((object->root = m)->pindex < pindex)
1021 m = TAILQ_NEXT(m, listq);
1022 }
1023 }
1024 return (m);
1025}
1026
1027/*
1028 * Returns the given page's successor (by pindex) within the object if it is
1029 * resident; if none is found, NULL is returned.
1030 *
1031 * The object must be locked.
1032 */
1033vm_page_t
1034vm_page_next(vm_page_t m)
1035{
1036 vm_page_t next;
1037
1038 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1039 if ((next = TAILQ_NEXT(m, listq)) != NULL &&
1040 next->pindex != m->pindex + 1)
1041 next = NULL;
1042 return (next);
1043}
1044
1045/*
1046 * Returns the given page's predecessor (by pindex) within the object if it is
1047 * resident; if none is found, NULL is returned.
1048 *
1049 * The object must be locked.
1050 */
1051vm_page_t
1052vm_page_prev(vm_page_t m)
1053{
1054 vm_page_t prev;
1055
1056 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1057 if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
1058 prev->pindex != m->pindex - 1)
1059 prev = NULL;
1060 return (prev);
1061}
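
/*
 * Illustrative sketch (not part of the original file): measuring the
 * length of a run of consecutively indexed resident pages starting at
 * "m" by walking vm_page_next(), e.g. to size a read-ahead cluster.
 * The caller is assumed to hold the object lock and to pass max >= 1.
 */
static int
example_run_length(vm_page_t m, int max)
{
	int run;

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	for (run = 1; run < max && (m = vm_page_next(m)) != NULL; run++)
		continue;
	return (run);
}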
1062
1063/*
1064 * vm_page_rename:
1065 *
1066 * Move the given memory entry from its
1067 * current object to the specified target object/offset.
1068 *
1069 * The object must be locked.
1070 * This routine may not block.
1071 *
1072 * Note: swap associated with the page must be invalidated by the move. We
1073 * have to do this for several reasons: (1) we aren't freeing the
1074 * page, (2) we are dirtying the page, (3) the VM system is probably
1075 * moving the page from object A to B, and will then later move
1076 * the backing store from A to B and we can't have a conflict.
1077 *
1078 * Note: we *always* dirty the page. It is necessary both for the
1079 * fact that we moved it, and because we may be invalidating
1080 * swap. If the page is in the cache, we have to deactivate it
1081 * or vm_page_dirty() will panic. Dirty pages are not allowed
1082 * in the cache.
1083 */
1084void
1085vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1086{
1087
1088 vm_page_remove(m);
1089 vm_page_insert(m, new_object, new_pindex);
1090 vm_page_dirty(m);
1091}
1092
1093/*
1094 * Convert all of the given object's cached pages that have a
1095 * pindex within the given range into free pages. If the value
1096 * zero is given for "end", then the range's upper bound is
1097 * infinity. If the given object is backed by a vnode and it
1098 * transitions from having one or more cached pages to none, the
1099 * vnode's hold count is reduced.
1100 */
1101void
1102vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1103{
1104 vm_page_t m, m_next;
1105 boolean_t empty;
1106
1107 mtx_lock(&vm_page_queue_free_mtx);
1108 if (__predict_false(object->cache == NULL)) {
1109 mtx_unlock(&vm_page_queue_free_mtx);
1110 return;
1111 }
1112 m = object->cache = vm_page_splay(start, object->cache);
1113 if (m->pindex < start) {
1114 if (m->right == NULL)
1115 m = NULL;
1116 else {
1117 m_next = vm_page_splay(start, m->right);
1118 m_next->left = m;
1119 m->right = NULL;
1120 m = object->cache = m_next;
1121 }
1122 }
1123
1124 /*
1125 * At this point, "m" is either (1) a reference to the page
1126 * with the least pindex that is greater than or equal to
1127 * "start" or (2) NULL.
1128 */
1129 for (; m != NULL && (m->pindex < end || end == 0); m = m_next) {
1130 /*
1131 * Find "m"'s successor and remove "m" from the
1132 * object's cache.
1133 */
1134 if (m->right == NULL) {
1135 object->cache = m->left;
1136 m_next = NULL;
1137 } else {
1138 m_next = vm_page_splay(start, m->right);
1139 m_next->left = m->left;
1140 object->cache = m_next;
1141 }
1142 /* Convert "m" to a free page. */
1143 m->object = NULL;
1144 m->valid = 0;
1145 /* Clear PG_CACHED and set PG_FREE. */
1146 m->flags ^= PG_CACHED | PG_FREE;
1147 KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
1148 ("vm_page_cache_free: page %p has inconsistent flags", m));
1149 cnt.v_cache_count--;
1150 cnt.v_free_count++;
1151 }
1152 empty = object->cache == NULL;
1153 mtx_unlock(&vm_page_queue_free_mtx);
1154 if (object->type == OBJT_VNODE && empty)
1155 vdrop(object->handle);
1156}
1157
1158/*
1159 * Returns the cached page that is associated with the given
1160 * object and offset. If, however, none exists, returns NULL.
1161 *
1162 * The free page queue must be locked.
1163 */
1164static inline vm_page_t
1165vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
1166{
1167 vm_page_t m;
1168
1169 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1170 if ((m = object->cache) != NULL && m->pindex != pindex) {
1171 m = vm_page_splay(pindex, m);
1172 if ((object->cache = m)->pindex != pindex)
1173 m = NULL;
1174 }
1175 return (m);
1176}
1177
1178/*
1179 * Remove the given cached page from its containing object's
1180 * collection of cached pages.
1181 *
1182 * The free page queue must be locked.
1183 */
1184void
1185vm_page_cache_remove(vm_page_t m)
1186{
1187 vm_object_t object;
1188 vm_page_t root;
1189
1190 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1191 KASSERT((m->flags & PG_CACHED) != 0,
1192 ("vm_page_cache_remove: page %p is not cached", m));
1193 object = m->object;
1194 if (m != object->cache) {
1195 root = vm_page_splay(m->pindex, object->cache);
1196 KASSERT(root == m,
1197 ("vm_page_cache_remove: page %p is not cached in object %p",
1198 m, object));
1199 }
1200 if (m->left == NULL)
1201 root = m->right;
1202 else if (m->right == NULL)
1203 root = m->left;
1204 else {
1205 root = vm_page_splay(m->pindex, m->left);
1206 root->right = m->right;
1207 }
1208 object->cache = root;
1209 m->object = NULL;
1210 cnt.v_cache_count--;
1211}
1212
1213/*
1214 * Transfer all of the cached pages with offset greater than or
1215 * equal to 'offidxstart' from the original object's cache to the
1216 * new object's cache. However, any cached pages with offset
1217 * greater than or equal to the new object's size are kept in the
1218 * original object. Initially, the new object's cache must be
1219 * empty. Offset 'offidxstart' in the original object must
1220 * correspond to offset zero in the new object.
1221 *
1222 * The new object must be locked.
1223 */
1224void
1225vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
1226 vm_object_t new_object)
1227{
1228 vm_page_t m, m_next;
1229
1230 /*
1231 * Insertion into an object's collection of cached pages
1232 * requires the object to be locked. In contrast, removal does
1233 * not.
1234 */
1235 VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
1236 KASSERT(new_object->cache == NULL,
1237 ("vm_page_cache_transfer: object %p has cached pages",
1238 new_object));
1239 mtx_lock(&vm_page_queue_free_mtx);
1240 if ((m = orig_object->cache) != NULL) {
1241 /*
1242 * Transfer all of the pages with offset greater than or
1243 * equal to 'offidxstart' from the original object's
1244 * cache to the new object's cache.
1245 */
1246 m = vm_page_splay(offidxstart, m);
1247 if (m->pindex < offidxstart) {
1248 orig_object->cache = m;
1249 new_object->cache = m->right;
1250 m->right = NULL;
1251 } else {
1252 orig_object->cache = m->left;
1253 new_object->cache = m;
1254 m->left = NULL;
1255 }
1256 while ((m = new_object->cache) != NULL) {
1257 if ((m->pindex - offidxstart) >= new_object->size) {
1258 /*
1259 * Return all of the cached pages with
1260 * offset greater than or equal to the
1261 * new object's size to the original
1262 * object's cache.
1263 */
1264 new_object->cache = m->left;
1265 m->left = orig_object->cache;
1266 orig_object->cache = m;
1267 break;
1268 }
1269 m_next = vm_page_splay(m->pindex, m->right);
1270 /* Update the page's object and offset. */
1271 m->object = new_object;
1272 m->pindex -= offidxstart;
1273 if (m_next == NULL)
1274 break;
1275 m->right = NULL;
1276 m_next->left = m;
1277 new_object->cache = m_next;
1278 }
1279 KASSERT(new_object->cache == NULL ||
1280 new_object->type == OBJT_SWAP,
1281 ("vm_page_cache_transfer: object %p's type is incompatible"
1282 " with cached pages", new_object));
1283 }
1284 mtx_unlock(&vm_page_queue_free_mtx);
1285}
1286
1287/*
1288 * vm_page_alloc:
1289 *
1290 * Allocate and return a page that is associated with the specified
1291 * object and offset pair. By default, this page has the flag VPO_BUSY
1292 * set.
1293 *
1294 * The caller must always specify an allocation class.
1295 *
1296 * allocation classes:
1297 * VM_ALLOC_NORMAL normal process request
1298 * VM_ALLOC_SYSTEM system *really* needs a page
1299 * VM_ALLOC_INTERRUPT interrupt time request
1300 *
1301 * optional allocation flags:
1302 * VM_ALLOC_COUNT(number) the number of additional pages that the caller
1303 * intends to allocate
1304 * VM_ALLOC_IFCACHED return page only if it is cached
1305 * VM_ALLOC_IFNOTCACHED return NULL, do not reactivate if the page
1306 * is cached
1307 * VM_ALLOC_NOBUSY do not set the flag VPO_BUSY on the page
1308 * VM_ALLOC_NODUMP do not include the page in a kernel core dump
1309 * VM_ALLOC_NOOBJ page is not associated with an object and
1310 * should not have the flag VPO_BUSY set
1311 * VM_ALLOC_WIRED wire the allocated page
1312 * VM_ALLOC_ZERO prefer a zeroed page
1313 *
1314 * This routine may not sleep.
1315 */
1316vm_page_t
1317vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
1318{
1319 struct vnode *vp = NULL;
1320 vm_object_t m_object;
1321 vm_page_t m;
1322 int flags, req_class;
1323
1324 KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0),
1325 ("vm_page_alloc: inconsistent object/req"));
1326 if (object != NULL)
1327 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1328
1329 req_class = req & VM_ALLOC_CLASS_MASK;
1330
1331 /*
1332 * The page daemon is allowed to dig deeper into the free page list.
1333 */
1334 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1335 req_class = VM_ALLOC_SYSTEM;
1336
1337 mtx_lock(&vm_page_queue_free_mtx);
1338 if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
1339 (req_class == VM_ALLOC_SYSTEM &&
1340 cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
1341 (req_class == VM_ALLOC_INTERRUPT &&
1342 cnt.v_free_count + cnt.v_cache_count > 0)) {
1343 /*
1344 * Allocate from the free queue if the number of free pages
1345 * exceeds the minimum for the request class.
1346 */
1347 if (object != NULL &&
1348 (m = vm_page_cache_lookup(object, pindex)) != NULL) {
1349 if ((req & VM_ALLOC_IFNOTCACHED) != 0) {
1350 mtx_unlock(&vm_page_queue_free_mtx);
1351 return (NULL);
1352 }
1353 if (vm_phys_unfree_page(m))
1354 vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0);
1355#if VM_NRESERVLEVEL > 0
1356 else if (!vm_reserv_reactivate_page(m))
1357#else
1358 else
1359#endif
1360 panic("vm_page_alloc: cache page %p is missing"
1361 " from the free queue", m);
1362 } else if ((req & VM_ALLOC_IFCACHED) != 0) {
1363 mtx_unlock(&vm_page_queue_free_mtx);
1364 return (NULL);
1365#if VM_NRESERVLEVEL > 0
1366 } else if (object == NULL || object->type == OBJT_DEVICE ||
1367 object->type == OBJT_SG ||
1368 (object->flags & OBJ_COLORED) == 0 ||
1369 (m = vm_reserv_alloc_page(object, pindex)) == NULL) {
1370#else
1371 } else {
1372#endif
1373 m = vm_phys_alloc_pages(object != NULL ?
1374 VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
1375#if VM_NRESERVLEVEL > 0
1376 if (m == NULL && vm_reserv_reclaim_inactive()) {
1377 m = vm_phys_alloc_pages(object != NULL ?
1378 VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT,
1379 0);
1380 }
1381#endif
1382 }
1383 } else {
1384 /*
1385 * Not allocatable, give up.
1386 */
1387 mtx_unlock(&vm_page_queue_free_mtx);
1388 atomic_add_int(&vm_pageout_deficit,
1389 max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
1390 pagedaemon_wakeup();
1391 return (NULL);
1392 }
1393
1394 /*
1395 * At this point we had better have found a good page.
1396 */
1397 KASSERT(m != NULL, ("vm_page_alloc: missing page"));
1398 KASSERT(m->queue == PQ_NONE,
1399 ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue));
1400 KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m));
1401 KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m));
1402 KASSERT(m->busy == 0, ("vm_page_alloc: page %p is busy", m));
1403 KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m));
1404 KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
1405 ("vm_page_alloc: page %p has unexpected memattr %d", m,
1406 pmap_page_get_memattr(m)));
1407 if ((m->flags & PG_CACHED) != 0) {
1408 KASSERT((m->flags & PG_ZERO) == 0,
1409 ("vm_page_alloc: cached page %p is PG_ZERO", m));
1410 KASSERT(m->valid != 0,
1411 ("vm_page_alloc: cached page %p is invalid", m));
1412 if (m->object == object && m->pindex == pindex)
1413 cnt.v_reactivated++;
1414 else
1415 m->valid = 0;
1416 m_object = m->object;
1417 vm_page_cache_remove(m);
1418 if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
1419 vp = m_object->handle;
1420 } else {
1421 KASSERT(VM_PAGE_IS_FREE(m),
1422 ("vm_page_alloc: page %p is not free", m));
1423 KASSERT(m->valid == 0,
1424 ("vm_page_alloc: free page %p is valid", m));
1425 cnt.v_free_count--;
1426 }
1427
1428 /*
1429 * Only the PG_ZERO flag is inherited. The PG_CACHED or PG_FREE flag
1430 * must be cleared before the free page queues lock is released.
1431 */
1432 flags = 0;
1433 if (req & VM_ALLOC_NODUMP)
1434 flags |= PG_NODUMP;
1435 if (m->flags & PG_ZERO) {
1436 vm_page_zero_count--;
1437 if (req & VM_ALLOC_ZERO)
1438 flags = PG_ZERO;
1439 }
1440 m->flags = flags;
1441 mtx_unlock(&vm_page_queue_free_mtx);
1442 m->aflags = 0;
1443 if (object == NULL || object->type == OBJT_PHYS)
1444 m->oflags = VPO_UNMANAGED;
1445 else
1446 m->oflags = 0;
1447 if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ)) == 0)
1448 m->oflags |= VPO_BUSY;
1449 if (req & VM_ALLOC_WIRED) {
1450 /*
1451 * The page lock is not required for wiring a page until that
1452 * page is inserted into the object.
1453 */
1454 atomic_add_int(&cnt.v_wire_count, 1);
1455 m->wire_count = 1;
1456 }
1457 m->act_count = 0;
1458
1459 if (object != NULL) {
1460 /* Ignore device objects; the pager sets "memattr" for them. */
1461 if (object->memattr != VM_MEMATTR_DEFAULT &&
1462 object->type != OBJT_DEVICE && object->type != OBJT_SG)
1463 pmap_page_set_memattr(m, object->memattr);
1464 vm_page_insert(m, object, pindex);
1465 } else
1466 m->pindex = pindex;
1467
1468 /*
1469 * The following call to vdrop() must come after the above call
1470 * to vm_page_insert() in case both affect the same object and
1471 * vnode. Otherwise, the affected vnode's hold count could
1472 * temporarily become zero.
1473 */
1474 if (vp != NULL)
1475 vdrop(vp);
1476
1477 /*
1478 * Don't wake up the pageout daemon too often - only when
1479 * we would be nearly out of memory.
1480 */
1481 if (vm_paging_needed())
1482 pagedaemon_wakeup();
1483
1484 return (m);
1485}
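
/*
 * Illustrative sketch (not part of the original file): the common
 * allocate-or-wait loop built on vm_page_alloc().  It assumes the given
 * pindex is not already resident in the object.  Because vm_page_alloc()
 * may not sleep, the object lock is dropped and vm_wait() (see also the
 * VM_WAIT macro) is used to block until pages become available.
 */
static vm_page_t
example_alloc_wait(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	VM_OBJECT_LOCK(object);
	while ((m = vm_page_alloc(object, pindex,
	    VM_ALLOC_NORMAL | VM_ALLOC_ZERO)) == NULL) {
		VM_OBJECT_UNLOCK(object);
		vm_wait();
		VM_OBJECT_LOCK(object);
	}
	VM_OBJECT_UNLOCK(object);
	return (m);	/* Returned with VPO_BUSY set by default. */
}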
1486
1487/*
1488 * vm_page_alloc_contig:
1489 *
1490 * Allocate a contiguous set of physical pages of the given size "npages"
1491 * from the free lists. All of the physical pages must be at or above
1492 * the given physical address "low" and below the given physical address
1493 * "high". The given value "alignment" determines the alignment of the
1494 * first physical page in the set. If the given value "boundary" is
1495 * non-zero, then the set of physical pages cannot cross any physical
1496 * address boundary that is a multiple of that value. Both "alignment"
1497 * and "boundary" must be powers of two.
1498 *
1499 * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT,
1500 * then the memory attribute setting for the physical pages is configured
1501 * to the object's memory attribute setting. Otherwise, the memory
1502 * attribute setting for the physical pages is configured to "memattr",
1503 * overriding the object's memory attribute setting. However, if the
1504 * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the
1505 * memory attribute setting for the physical pages cannot be configured
1506 * to VM_MEMATTR_DEFAULT.
1507 *
1508 * The caller must always specify an allocation class.
1509 *
1510 * allocation classes:
1511 * VM_ALLOC_NORMAL normal process request
1512 * VM_ALLOC_SYSTEM system *really* needs a page
1513 * VM_ALLOC_INTERRUPT interrupt time request
1514 *
1515 * optional allocation flags:
1516 * VM_ALLOC_NOBUSY do not set the flag VPO_BUSY on the page
1517 * VM_ALLOC_NOOBJ page is not associated with an object and
1518 * should not have the flag VPO_BUSY set
1519 * VM_ALLOC_WIRED wire the allocated page
1520 * VM_ALLOC_ZERO prefer a zeroed page
1521 *
1522 * This routine may not sleep.
1523 */
1524vm_page_t
1525vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
1526 u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
1527 vm_paddr_t boundary, vm_memattr_t memattr)
1528{
1529 struct vnode *drop;
1530 vm_page_t deferred_vdrop_list, m, m_ret;
1531 u_int flags, oflags;
1532 int req_class;
1533
1534 KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0),
1535 ("vm_page_alloc_contig: inconsistent object/req"));
1536 if (object != NULL) {
1537 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1538 KASSERT(object->type == OBJT_PHYS,
1539 ("vm_page_alloc_contig: object %p isn't OBJT_PHYS",
1540 object));
1541 }
1542 KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
1543 req_class = req & VM_ALLOC_CLASS_MASK;
1544
1545 /*
1546 * The page daemon is allowed to dig deeper into the free page list.
1547 */
1548 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1549 req_class = VM_ALLOC_SYSTEM;
1550
1551 deferred_vdrop_list = NULL;
1552 mtx_lock(&vm_page_queue_free_mtx);
1553 if (cnt.v_free_count + cnt.v_cache_count >= npages +
1554 cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM &&
1555 cnt.v_free_count + cnt.v_cache_count >= npages +
1556 cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT &&
1557 cnt.v_free_count + cnt.v_cache_count >= npages)) {
1558#if VM_NRESERVLEVEL > 0
1559retry:
1560 if (object == NULL || (object->flags & OBJ_COLORED) == 0 ||
1561 (m_ret = vm_reserv_alloc_contig(object, pindex, npages,
1562 low, high, alignment, boundary)) == NULL)
1563#endif
1564 m_ret = vm_phys_alloc_contig(npages, low, high,
1565 alignment, boundary);
1566 } else {
1567 mtx_unlock(&vm_page_queue_free_mtx);
1568 atomic_add_int(&vm_pageout_deficit, npages);
1569 pagedaemon_wakeup();
1570 return (NULL);
1571 }
1572 if (m_ret != NULL)
1573 for (m = m_ret; m < &m_ret[npages]; m++) {
1574 drop = vm_page_alloc_init(m);
1575 if (drop != NULL) {
1576 /*
1577 * Enqueue the vnode for deferred vdrop().
1578 *
1579 * Once the pages are removed from the free
1580 * page list, "pageq" can be safely abused to
1581 * construct a short-lived list of vnodes.
1582 */
1583 m->pageq.tqe_prev = (void *)drop;
1584 m->pageq.tqe_next = deferred_vdrop_list;
1585 deferred_vdrop_list = m;
1586 }
1587 }
1588 else {
1589#if VM_NRESERVLEVEL > 0
1590 if (vm_reserv_reclaim_contig(npages, low, high, alignment,
1591 boundary))
1592 goto retry;
1593#endif
1594 }
1595 mtx_unlock(&vm_page_queue_free_mtx);
1596 if (m_ret == NULL)
1597 return (NULL);
1598
1599 /*
1600 * Initialize the pages. Only the PG_ZERO flag is inherited.
1601 */
1602 flags = 0;
1603 if ((req & VM_ALLOC_ZERO) != 0)
1604 flags = PG_ZERO;
1605 if ((req & VM_ALLOC_NODUMP) != 0)
1606 flags |= PG_NODUMP;
1602 if ((req & VM_ALLOC_WIRED) != 0)
1603 atomic_add_int(&cnt.v_wire_count, npages);
1604 oflags = VPO_UNMANAGED;
1605 if (object != NULL) {
1606 if ((req & VM_ALLOC_NOBUSY) == 0)
1607 oflags |= VPO_BUSY;
1608 if (object->memattr != VM_MEMATTR_DEFAULT &&
1609 memattr == VM_MEMATTR_DEFAULT)
1610 memattr = object->memattr;
1611 }
1612 for (m = m_ret; m < &m_ret[npages]; m++) {
1613 m->aflags = 0;
1614 m->flags &= flags;
1615 if ((req & VM_ALLOC_WIRED) != 0)
1616 m->wire_count = 1;
1617 /* Unmanaged pages don't use "act_count". */
1618 m->oflags = oflags;
1619 if (memattr != VM_MEMATTR_DEFAULT)
1620 pmap_page_set_memattr(m, memattr);
1621 if (object != NULL)
1622 vm_page_insert(m, object, pindex);
1623 else
1624 m->pindex = pindex;
1625 pindex++;
1626 }
1627 while (deferred_vdrop_list != NULL) {
1628 vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
1629 deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next;
1630 }
1631 if (vm_paging_needed())
1632 pagedaemon_wakeup();
1633 return (m_ret);
1634}
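
/*
 * Illustrative sketch (not part of the original file): allocating a
 * small, wired, physically contiguous run with no backing object, in the
 * style of a DMA buffer allocator.  The address limits, alignment and
 * boundary values are hypothetical; NULL may be returned if the request
 * cannot be satisfied.
 */
static vm_page_t
example_alloc_contig(u_long npages)
{

	return (vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL |
	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO, npages,
	    0, ~(vm_paddr_t)0, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT));
}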
1635
1636/*
1637 * Initialize a page that has been freshly dequeued from a freelist.
1638 * The caller has to drop the vnode returned, if it is not NULL.
1639 *
1640 * This function may only be used to initialize unmanaged pages.
1641 *
1642 * To be called with vm_page_queue_free_mtx held.
1643 */
1644static struct vnode *
1645vm_page_alloc_init(vm_page_t m)
1646{
1647 struct vnode *drop;
1648 vm_object_t m_object;
1649
1650 KASSERT(m->queue == PQ_NONE,
1651 ("vm_page_alloc_init: page %p has unexpected queue %d",
1652 m, m->queue));
1653 KASSERT(m->wire_count == 0,
1654 ("vm_page_alloc_init: page %p is wired", m));
1655 KASSERT(m->hold_count == 0,
1656 ("vm_page_alloc_init: page %p is held", m));
1657 KASSERT(m->busy == 0,
1658 ("vm_page_alloc_init: page %p is busy", m));
1659 KASSERT(m->dirty == 0,
1660 ("vm_page_alloc_init: page %p is dirty", m));
1661 KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
1662 ("vm_page_alloc_init: page %p has unexpected memattr %d",
1663 m, pmap_page_get_memattr(m)));
1664 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1665 drop = NULL;
1666 if ((m->flags & PG_CACHED) != 0) {
1667 KASSERT((m->flags & PG_ZERO) == 0,
1668 ("vm_page_alloc_init: cached page %p is PG_ZERO", m));
1669 m->valid = 0;
1670 m_object = m->object;
1671 vm_page_cache_remove(m);
1672 if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
1673 drop = m_object->handle;
1674 } else {
1675 KASSERT(VM_PAGE_IS_FREE(m),
1676 ("vm_page_alloc_init: page %p is not free", m));
1677 KASSERT(m->valid == 0,
1678 ("vm_page_alloc_init: free page %p is valid", m));
1679 cnt.v_free_count--;
1680 if ((m->flags & PG_ZERO) != 0)
1681 vm_page_zero_count--;
1682 }
1683 /* Don't clear the PG_ZERO flag; we'll need it later. */
1684 m->flags &= PG_ZERO;
1685 return (drop);
1686}
1687
1688/*
1689 * vm_page_alloc_freelist:
1690 *
1691 * Allocate a physical page from the specified free page list.
1692 *
1693 * The caller must always specify an allocation class.
1694 *
1695 * allocation classes:
1696 * VM_ALLOC_NORMAL normal process request
1697 * VM_ALLOC_SYSTEM system *really* needs a page
1698 * VM_ALLOC_INTERRUPT interrupt time request
1699 *
1700 * optional allocation flags:
1701 * VM_ALLOC_COUNT(number) the number of additional pages that the caller
1702 * intends to allocate
1703 * VM_ALLOC_WIRED wire the allocated page
1704 * VM_ALLOC_ZERO prefer a zeroed page
1705 *
1706 * This routine may not sleep.
1707 */
1708vm_page_t
1709vm_page_alloc_freelist(int flind, int req)
1710{
1711 struct vnode *drop;
1712 vm_page_t m;
1713 u_int flags;
1714 int req_class;
1715
1716 req_class = req & VM_ALLOC_CLASS_MASK;
1717
1718 /*
1719 * The page daemon is allowed to dig deeper into the free page list.
1720 */
1721 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1722 req_class = VM_ALLOC_SYSTEM;
1723
1724 /*
1725 * Do not allocate reserved pages unless the req has asked for it.
1726 */
1727 mtx_lock(&vm_page_queue_free_mtx);
1728 if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
1729 (req_class == VM_ALLOC_SYSTEM &&
1730 cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
1731 (req_class == VM_ALLOC_INTERRUPT &&
1732 cnt.v_free_count + cnt.v_cache_count > 0))
1733 m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
1734 else {
1735 mtx_unlock(&vm_page_queue_free_mtx);
1736 atomic_add_int(&vm_pageout_deficit,
1737 max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
1738 pagedaemon_wakeup();
1739 return (NULL);
1740 }
1741 if (m == NULL) {
1742 mtx_unlock(&vm_page_queue_free_mtx);
1743 return (NULL);
1744 }
1745 drop = vm_page_alloc_init(m);
1746 mtx_unlock(&vm_page_queue_free_mtx);
1747
1748 /*
1749 * Initialize the page. Only the PG_ZERO flag is inherited.
1750 */
1751 m->aflags = 0;
1752 flags = 0;
1753 if ((req & VM_ALLOC_ZERO) != 0)
1754 flags = PG_ZERO;
1755 m->flags &= flags;
1756 if ((req & VM_ALLOC_WIRED) != 0) {
1757 /*
1758 * The page lock is not required for wiring a page that does
1759 * not belong to an object.
1760 */
1761 atomic_add_int(&cnt.v_wire_count, 1);
1762 m->wire_count = 1;
1763 }
1764 /* Unmanaged pages don't use "act_count". */
1765 m->oflags = VPO_UNMANAGED;
1766 if (drop != NULL)
1767 vdrop(drop);
1768 if (vm_paging_needed())
1769 pagedaemon_wakeup();
1770 return (m);
1771}
1772
1773/*
1774 * vm_wait: (also see VM_WAIT macro)
1775 *
1776 * Block until free pages are available for allocation
1777 * - Called in various places before memory allocations.
1778 */
1779void
1780vm_wait(void)
1781{
1782
1783 mtx_lock(&vm_page_queue_free_mtx);
1784 if (curproc == pageproc) {
1785 vm_pageout_pages_needed = 1;
1786 msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
1787 PDROP | PSWP, "VMWait", 0);
1788 } else {
1789 if (!vm_pages_needed) {
1790 vm_pages_needed = 1;
1791 wakeup(&vm_pages_needed);
1792 }
1793 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
1794 "vmwait", 0);
1795 }
1796}
1797
1798/*
1799 * vm_waitpfault: (also see VM_WAITPFAULT macro)
1800 *
1801 * Block until free pages are available for allocation
1802 * - Called only in vm_fault so that processes page faulting
1803 * can be easily tracked.
1804 * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing
1805 * processes will be able to grab memory first. Do not change
1806 * this balance without careful testing first.
1807 */
1808void
1809vm_waitpfault(void)
1810{
1811
1812 mtx_lock(&vm_page_queue_free_mtx);
1813 if (!vm_pages_needed) {
1814 vm_pages_needed = 1;
1815 wakeup(&vm_pages_needed);
1816 }
1817 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
1818 "pfault", 0);
1819}
1820
1821/*
1822 * vm_page_requeue:
1823 *
1824 * Move the given page to the tail of its present page queue.
1825 *
1826 * The page queues must be locked.
1827 */
1828void
1829vm_page_requeue(vm_page_t m)
1830{
1831 struct vpgqueues *vpq;
1832 int queue;
1833
1834 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1835 queue = m->queue;
1836 KASSERT(queue != PQ_NONE,
1837 ("vm_page_requeue: page %p is not queued", m));
1838 vpq = &vm_page_queues[queue];
1839 TAILQ_REMOVE(&vpq->pl, m, pageq);
1840 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
1841}
1842
1843/*
1844 * vm_page_queue_remove:
1845 *
1846 * Remove the given page from the specified queue.
1847 *
1848 * The page and page queues must be locked.
1849 */
1850static __inline void
1851vm_page_queue_remove(int queue, vm_page_t m)
1852{
1853 struct vpgqueues *pq;
1854
1855 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1856 vm_page_lock_assert(m, MA_OWNED);
1857 pq = &vm_page_queues[queue];
1858 TAILQ_REMOVE(&pq->pl, m, pageq);
1859 (*pq->cnt)--;
1860}
1861
1862/*
1863 * vm_pageq_remove:
1864 *
1865 * Remove a page from its queue.
1866 *
1867 * The given page must be locked.
1868 * This routine may not block.
1869 */
1870void
1871vm_pageq_remove(vm_page_t m)
1872{
1873 int queue;
1874
1875 vm_page_lock_assert(m, MA_OWNED);
1876 if ((queue = m->queue) != PQ_NONE) {
1877 vm_page_lock_queues();
1878 m->queue = PQ_NONE;
1879 vm_page_queue_remove(queue, m);
1880 vm_page_unlock_queues();
1881 }
1882}
1883
1884/*
1885 * vm_page_enqueue:
1886 *
1887 * Add the given page to the specified queue.
1888 *
1889 * The page queues must be locked.
1890 */
1891static void
1892vm_page_enqueue(int queue, vm_page_t m)
1893{
1894 struct vpgqueues *vpq;
1895
1896 vpq = &vm_page_queues[queue];
1897 m->queue = queue;
1898 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
1899 ++*vpq->cnt;
1900}
1901
1902/*
1903 * vm_page_activate:
1904 *
1905 * Put the specified page on the active list (if appropriate).
1906 * Ensure that act_count is at least ACT_INIT but do not otherwise
1907 * mess with it.
1908 *
1909 * The page must be locked.
1910 * This routine may not block.
1911 */
1912void
1913vm_page_activate(vm_page_t m)
1914{
1915 int queue;
1916
1917 vm_page_lock_assert(m, MA_OWNED);
1918 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1919 if ((queue = m->queue) != PQ_ACTIVE) {
1920 if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
1921 if (m->act_count < ACT_INIT)
1922 m->act_count = ACT_INIT;
1923 vm_page_lock_queues();
1924 if (queue != PQ_NONE)
1925 vm_page_queue_remove(queue, m);
1926 vm_page_enqueue(PQ_ACTIVE, m);
1927 vm_page_unlock_queues();
1928 } else
1929 KASSERT(queue == PQ_NONE,
1930 ("vm_page_activate: wired page %p is queued", m));
1931 } else {
1932 if (m->act_count < ACT_INIT)
1933 m->act_count = ACT_INIT;
1934 }
1935}
1936
1937/*
1938 * vm_page_free_wakeup:
1939 *
1940 * Helper routine for vm_page_free_toq() and vm_page_cache(). This
1941 * routine is called when a page has been added to the cache or free
1942 * queues.
1943 *
1944 * The page queues must be locked.
1945 * This routine may not block.
1946 */
1947static inline void
1948vm_page_free_wakeup(void)
1949{
1950
1951 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1952 /*
1953 * If the pageout daemon needs pages, then tell it that there are
1954 * some free.
1955 */
1956 if (vm_pageout_pages_needed &&
1957 cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
1958 wakeup(&vm_pageout_pages_needed);
1959 vm_pageout_pages_needed = 0;
1960 }
1961 /*
1962 * Wake up processes that are waiting on memory if we hit a
1963 * high water mark, and wake up the scheduler process if we have
1964 * lots of memory; that process will swap in other processes.
1965 */
1966 if (vm_pages_needed && !vm_page_count_min()) {
1967 vm_pages_needed = 0;
1968 wakeup(&cnt.v_free_count);
1969 }
1970}
1971
1972/*
1973 * vm_page_free_toq:
1974 *
1975 * Returns the given page to the free list,
1976 * disassociating it with any VM object.
1977 *
1978 * Object and page must be locked prior to entry.
1979 * This routine may not block.
1980 */
1981
1982void
1983vm_page_free_toq(vm_page_t m)
1984{
1985
1986 if ((m->oflags & VPO_UNMANAGED) == 0) {
1987 vm_page_lock_assert(m, MA_OWNED);
1988 KASSERT(!pmap_page_is_mapped(m),
1989 ("vm_page_free_toq: freeing mapped page %p", m));
1990 }
1991 PCPU_INC(cnt.v_tfree);
1992
1993 if (VM_PAGE_IS_FREE(m))
1994 panic("vm_page_free: freeing free page %p", m);
1995 else if (m->busy != 0)
1996 panic("vm_page_free: freeing busy page %p", m);
1997
1998 /*
1999 * unqueue, then remove page. Note that we cannot destroy
2000 * the page here because we do not want to call the pager's
2001 * callback routine until after we've put the page on the
2002 * appropriate free queue.
2003 */
2004 if ((m->oflags & VPO_UNMANAGED) == 0)
2005 vm_pageq_remove(m);
2006 vm_page_remove(m);
2007
2008 /*
2009 * If the page is fictitious, there is nothing more to do: the
2010 * object association was removed above, so just return.
2011 */
2012 if ((m->flags & PG_FICTITIOUS) != 0) {
2013 return;
2014 }
2015
2016 m->valid = 0;
2017 vm_page_undirty(m);
2018
2019 if (m->wire_count != 0)
2020 panic("vm_page_free: freeing wired page %p", m);
2021 if (m->hold_count != 0) {
2022 m->flags &= ~PG_ZERO;
2023 vm_page_lock_queues();
2024 vm_page_enqueue(PQ_HOLD, m);
2025 vm_page_unlock_queues();
2026 } else {
2027 /*
2028 * Restore the default memory attribute to the page.
2029 */
2030 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
2031 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
2032
2033 /*
2034 * Insert the page into the physical memory allocator's
2035 * cache/free page queues.
2036 */
2037 mtx_lock(&vm_page_queue_free_mtx);
2038 m->flags |= PG_FREE;
2039 cnt.v_free_count++;
2040#if VM_NRESERVLEVEL > 0
2041 if (!vm_reserv_free_page(m))
2042#else
2043 if (TRUE)
2044#endif
2045 vm_phys_free_pages(m, 0);
2046 if ((m->flags & PG_ZERO) != 0)
2047 ++vm_page_zero_count;
2048 else
2049 vm_page_zero_idle_wakeup();
2050 vm_page_free_wakeup();
2051 mtx_unlock(&vm_page_queue_free_mtx);
2052 }
2053}
2054
2055/*
2056 * vm_page_wire:
2057 *
2058 * Mark this page as wired down by yet
2059 * another map, removing it from paging queues
2060 * as necessary.
2061 *
2062 * If the page is fictitious, then its wire count must remain one.
2063 *
2064 * The page must be locked.
2065 * This routine may not block.
2066 */
2067void
2068vm_page_wire(vm_page_t m)
2069{
2070
2071 /*
2072 * Only bump the wire statistics if the page is not already wired,
2073 * and only unqueue the page if it is on some queue (if it is unmanaged
2074 * it is already off the queues).
2075 */
2076 vm_page_lock_assert(m, MA_OWNED);
2077 if ((m->flags & PG_FICTITIOUS) != 0) {
2078 KASSERT(m->wire_count == 1,
2079 ("vm_page_wire: fictitious page %p's wire count isn't one",
2080 m));
2081 return;
2082 }
2083 if (m->wire_count == 0) {
2084 if ((m->oflags & VPO_UNMANAGED) == 0)
2085 vm_pageq_remove(m);
2086 atomic_add_int(&cnt.v_wire_count, 1);
2087 }
2088 m->wire_count++;
2089 KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
2090}
2091
2092/*
2093 * vm_page_unwire:
2094 *
2095 * Release one wiring of the specified page, potentially enabling it to be
2096 * paged again. If paging is enabled, then the value of the parameter
2097 * "activate" determines to which queue the page is added. If "activate" is
2098 * non-zero, then the page is added to the active queue. Otherwise, it is
2099 * added to the inactive queue.
2100 *
2101 * However, unless the page belongs to an object, it is not enqueued because
2102 * it cannot be paged out.
2103 *
2104 * If a page is fictitious, then its wire count must always be one.
2105 *
2106 * A managed page must be locked.
2107 */
2108void
2109vm_page_unwire(vm_page_t m, int activate)
2110{
2111
2112 if ((m->oflags & VPO_UNMANAGED) == 0)
2113 vm_page_lock_assert(m, MA_OWNED);
2114 if ((m->flags & PG_FICTITIOUS) != 0) {
2115 KASSERT(m->wire_count == 1,
2116 ("vm_page_unwire: fictitious page %p's wire count isn't one", m));
2117 return;
2118 }
2119 if (m->wire_count > 0) {
2120 m->wire_count--;
2121 if (m->wire_count == 0) {
2122 atomic_subtract_int(&cnt.v_wire_count, 1);
2123 if ((m->oflags & VPO_UNMANAGED) != 0 ||
2124 m->object == NULL)
2125 return;
2126 vm_page_lock_queues();
2127 if (activate)
2128 vm_page_enqueue(PQ_ACTIVE, m);
2129 else {
2130 m->flags &= ~PG_WINATCFLS;
2131 vm_page_enqueue(PQ_INACTIVE, m);
2132 }
2133 vm_page_unlock_queues();
2134 }
2135 } else
2136 panic("vm_page_unwire: page %p's wire count is zero", m);
2137}
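
/*
 * Illustrative sketch (not part of the original file): wiring a page
 * across an operation that must not see it paged out or freed, and then
 * releasing the wiring so that the page returns to the inactive queue.
 */
static void
example_wire_window(vm_page_t m)
{

	vm_page_lock(m);
	vm_page_wire(m);
	vm_page_unlock(m);

	/* ... the page cannot be paged out or freed here ... */

	vm_page_lock(m);
	vm_page_unwire(m, 0);	/* 0: enqueue on the inactive queue. */
	vm_page_unlock(m);
}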
2138
2139/*
2140 * Move the specified page to the inactive queue.
2141 *
2142 * Many pages placed on the inactive queue should actually go
2143 * into the cache, but it is difficult to figure out which. What
2144 * we do instead, if the inactive target is well met, is to put
2145 * clean pages at the head of the inactive queue instead of the tail.
2146 * This will cause them to be moved to the cache more quickly and
2147 * if not actively re-referenced, reclaimed more quickly. If we just
2148 * stick these pages at the end of the inactive queue, heavy filesystem
2149 * meta-data accesses can cause an unnecessary paging load on memory bound
2150 * processes. This optimization causes one-time-use metadata to be
2151 * reused more quickly.
2152 *
2153 * Normally athead is 0 resulting in LRU operation. athead is set
2154 * to 1 if we want this page to be 'as if it were placed in the cache',
2155 * except without unmapping it from the process address space.
2156 *
2157 * This routine may not block.
2158 */
2159static inline void
2160_vm_page_deactivate(vm_page_t m, int athead)
2161{
2162 int queue;
2163
2164 vm_page_lock_assert(m, MA_OWNED);
2165
2166 /*
2167 * Ignore if already inactive.
2168 */
2169 if ((queue = m->queue) == PQ_INACTIVE)
2170 return;
2171 if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
2172 vm_page_lock_queues();
2173 m->flags &= ~PG_WINATCFLS;
2174 if (queue != PQ_NONE)
2175 vm_page_queue_remove(queue, m);
2176 if (athead)
2177 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m,
2178 pageq);
2179 else
2180 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m,
2181 pageq);
2182 m->queue = PQ_INACTIVE;
2183 cnt.v_inactive_count++;
2184 vm_page_unlock_queues();
2185 }
2186}
2187
2188/*
2189 * Move the specified page to the inactive queue.
2190 *
2191 * The page must be locked.
2192 */
2193void
2194vm_page_deactivate(vm_page_t m)
2195{
2196
2197 _vm_page_deactivate(m, 0);
2198}
2199
2200/*
2201 * vm_page_try_to_cache:
2202 *
2203 * Returns 0 on failure, 1 on success
2204 */
2205int
2206vm_page_try_to_cache(vm_page_t m)
2207{
2208
2209 vm_page_lock_assert(m, MA_OWNED);
2210 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2211 if (m->dirty || m->hold_count || m->busy || m->wire_count ||
2212 (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
2213 return (0);
2214 pmap_remove_all(m);
2215 if (m->dirty)
2216 return (0);
2217 vm_page_cache(m);
2218 return (1);
2219}
2220
2221/*
2222 * vm_page_try_to_free()
2223 *
2224 * Attempt to free the page. If we cannot free it, we do nothing.
2225 * 1 is returned on success, 0 on failure.
2226 */
2227int
2228vm_page_try_to_free(vm_page_t m)
2229{
2230
2231 vm_page_lock_assert(m, MA_OWNED);
2232 if (m->object != NULL)
2233 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2234 if (m->dirty || m->hold_count || m->busy || m->wire_count ||
2235 (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
2236 return (0);
2237 pmap_remove_all(m);
2238 if (m->dirty)
2239 return (0);
2240 vm_page_free(m);
2241 return (1);
2242}
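
/*
 * Illustrative sketch (not part of the original file): the opportunistic
 * reclaim pattern built on vm_page_try_to_free().  The caller is assumed
 * to hold the object lock; releasing the page lock after a successful
 * free is assumed safe here because page locks are keyed by physical
 * address rather than embedded in the page structure.
 */
static int
example_try_reclaim(vm_page_t m)
{
	int freed;

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	vm_page_lock(m);
	freed = vm_page_try_to_free(m);
	vm_page_unlock(m);
	return (freed);
}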
2243
2244/*
2245 * vm_page_cache
2246 *
2247 * Put the specified page onto the page cache queue (if appropriate).
2248 *
2249 * This routine may not block.
2250 */
2251void
2252vm_page_cache(vm_page_t m)
2253{
2254 vm_object_t object;
2255 vm_page_t next, prev, root;
2256
2257 vm_page_lock_assert(m, MA_OWNED);
2258 object = m->object;
2259 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2260 if ((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) || m->busy ||
2261 m->hold_count || m->wire_count)
2262 panic("vm_page_cache: attempting to cache busy page");
2263 pmap_remove_all(m);
2264 if (m->dirty != 0)
2265 panic("vm_page_cache: page %p is dirty", m);
2266 if (m->valid == 0 || object->type == OBJT_DEFAULT ||
2267 (object->type == OBJT_SWAP &&
2268 !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
2269 /*
2270	 * Hypothesis: A cache-eligible page belonging to a
2271 * default object or swap object but without a backing
2272 * store must be zero filled.
2273 */
2274 vm_page_free(m);
2275 return;
2276 }
2277 KASSERT((m->flags & PG_CACHED) == 0,
2278 ("vm_page_cache: page %p is already cached", m));
2279 PCPU_INC(cnt.v_tcached);
2280
2281 /*
2282 * Remove the page from the paging queues.
2283 */
2284 vm_pageq_remove(m);
2285
2286 /*
2287 * Remove the page from the object's collection of resident
2288 * pages.
2289 */
2290 if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
2291 /*
2292 * Since the page's successor in the list is also its parent
2293 * in the tree, its right subtree must be empty.
2294 */
2295 next->left = m->left;
2296 KASSERT(m->right == NULL,
2297 ("vm_page_cache: page %p has right child", m));
2298 } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
2299 prev->right == m) {
2300 /*
2301 * Since the page's predecessor in the list is also its parent
2302 * in the tree, its left subtree must be empty.
2303 */
2304 KASSERT(m->left == NULL,
2305 ("vm_page_cache: page %p has left child", m));
2306 prev->right = m->right;
2307 } else {
2308 if (m != object->root)
2309 vm_page_splay(m->pindex, object->root);
2310 if (m->left == NULL)
2311 root = m->right;
2312 else if (m->right == NULL)
2313 root = m->left;
2314 else {
2315 /*
2316 * Move the page's successor to the root, because
2317 * pages are usually removed in ascending order.
2318 */
2319 if (m->right != next)
2320 vm_page_splay(m->pindex, m->right);
2321 next->left = m->left;
2322 root = next;
2323 }
2324 object->root = root;
2325 }
2326 TAILQ_REMOVE(&object->memq, m, listq);
2327 object->resident_page_count--;
2328
2329 /*
2330 * Restore the default memory attribute to the page.
2331 */
2332 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
2333 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
2334
2335 /*
2336 * Insert the page into the object's collection of cached pages
2337 * and the physical memory allocator's cache/free page queues.
2338 */
2339 m->flags &= ~PG_ZERO;
2340 mtx_lock(&vm_page_queue_free_mtx);
2341 m->flags |= PG_CACHED;
2342 cnt.v_cache_count++;
2343 root = object->cache;
2344 if (root == NULL) {
2345 m->left = NULL;
2346 m->right = NULL;
2347 } else {
2348 root = vm_page_splay(m->pindex, root);
2349 if (m->pindex < root->pindex) {
2350 m->left = root->left;
2351 m->right = root;
2352 root->left = NULL;
2353 } else if (__predict_false(m->pindex == root->pindex))
2354 panic("vm_page_cache: offset already cached");
2355 else {
2356 m->right = root->right;
2357 m->left = root;
2358 root->right = NULL;
2359 }
2360 }
2361 object->cache = m;
2362#if VM_NRESERVLEVEL > 0
2363 if (!vm_reserv_free_page(m)) {
2364#else
2365 if (TRUE) {
2366#endif
2367 vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
2368 vm_phys_free_pages(m, 0);
2369 }
2370 vm_page_free_wakeup();
2371 mtx_unlock(&vm_page_queue_free_mtx);
2372
2373 /*
2374 * Increment the vnode's hold count if this is the object's only
2375 * cached page. Decrement the vnode's hold count if this was
2376 * the object's only resident page.
2377 */
2378 if (object->type == OBJT_VNODE) {
2379 if (root == NULL && object->resident_page_count != 0)
2380 vhold(object->handle);
2381 else if (root != NULL && object->resident_page_count == 0)
2382 vdrop(object->handle);
2383 }
2384}
2385
2386/*
2387 * vm_page_dontneed
2388 *
2389 * Cache, deactivate, or do nothing as appropriate. This routine
2390 * is typically used by madvise() MADV_DONTNEED.
2391 *
2392 * Generally speaking we want to move the page into the cache so
2393 * it gets reused quickly. However, this can result in a silly syndrome
2394 * due to the page recycling too quickly. Small objects will not be
2395 * fully cached.  On the other hand, if we move the page to the inactive
2396 * queue we wind up with a problem whereby very large objects
2397 * unnecessarily blow away our inactive and cache queues.
2398 *
2399 * The solution is to move the pages based on a fixed weighting. We
2400 * either leave them alone, deactivate them, or move them to the cache,
2401 * where moving them to the cache has the highest weighting.
2402 * By forcing some pages into other queues we eventually force the
2403 * system to balance the queues, potentially recovering other unrelated
2404 * space from active. The idea is to not force this to happen too
2405 * often.
2406 */
2407void
2408vm_page_dontneed(vm_page_t m)
2409{
2410 int dnw;
2411 int head;
2412
2413 vm_page_lock_assert(m, MA_OWNED);
2414 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2415 dnw = PCPU_GET(dnweight);
2416 PCPU_INC(dnweight);
2417
2418 /*
2419 * Occasionally leave the page alone.
2420 */
2421 if ((dnw & 0x01F0) == 0 || m->queue == PQ_INACTIVE) {
2422 if (m->act_count >= ACT_INIT)
2423 --m->act_count;
2424 return;
2425 }
2426
2427 /*
2428 * Clear any references to the page. Otherwise, the page daemon will
2429 * immediately reactivate the page.
2430 *
2431 * Perform the pmap_clear_reference() first. Otherwise, a concurrent
2432 * pmap operation, such as pmap_remove(), could clear a reference in
2433 * the pmap and set PGA_REFERENCED on the page before the
2434 * pmap_clear_reference() had completed. Consequently, the page would
2435 * appear referenced based upon an old reference that occurred before
2436 * this function ran.
2437 */
2438 pmap_clear_reference(m);
2439 vm_page_aflag_clear(m, PGA_REFERENCED);
2440
2441 if (m->dirty == 0 && pmap_is_modified(m))
2442 vm_page_dirty(m);
2443
2444 if (m->dirty || (dnw & 0x0070) == 0) {
2445 /*
2446 * Deactivate the page 3 times out of 32.
2447 */
2448 head = 0;
2449 } else {
2450 /*
2451 * Cache the page 28 times out of every 32. Note that
2452 * the page is deactivated instead of cached, but placed
2453 * at the head of the queue instead of the tail.
2454 */
2455 head = 1;
2456 }
2457 _vm_page_deactivate(m, head);
2458}
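/*
 * A worked example of the weighting above, assuming only that the per-CPU
 * "dnweight" counter increments by one per call: the mask 0x01F0 covers
 * bits 4-8, so (dnw & 0x01F0) == 0 holds for 16 of every 512 values
 * (1 in 32) and the page is left alone; the mask 0x0070 covers bits 4-6,
 * so (dnw & 0x0070) == 0 holds 4 times in 32, one of which is already the
 * leave-alone case.  That leaves 3 in 32 tail deactivations and 28 in 32
 * head ("cache-like") placements for clean pages, matching the comments
 * in the branches above.
 */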
2459
2460/*
2461 * Grab a page, waiting until we are woken up due to the page
2462 * changing state.  We keep on waiting as long as the page continues
2463 * to be in the object.  If the page doesn't exist, first allocate it
2464 * and then conditionally zero it.
2465 *
2466 * The caller must always specify the VM_ALLOC_RETRY flag. This is intended
2467 * to facilitate its eventual removal.
2468 *
2469 * This routine may block.
2470 */
2471vm_page_t
2472vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
2473{
2474 vm_page_t m;
2475
2476 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2477 KASSERT((allocflags & VM_ALLOC_RETRY) != 0,
2478 ("vm_page_grab: VM_ALLOC_RETRY is required"));
2479retrylookup:
2480 if ((m = vm_page_lookup(object, pindex)) != NULL) {
2481 if ((m->oflags & VPO_BUSY) != 0 ||
2482 ((allocflags & VM_ALLOC_IGN_SBUSY) == 0 && m->busy != 0)) {
2483 /*
2484 * Reference the page before unlocking and
2485 * sleeping so that the page daemon is less
2486 * likely to reclaim it.
2487 */
2488 vm_page_aflag_set(m, PGA_REFERENCED);
2489 vm_page_sleep(m, "pgrbwt");
2490 goto retrylookup;
2491 } else {
2492 if ((allocflags & VM_ALLOC_WIRED) != 0) {
2493 vm_page_lock(m);
2494 vm_page_wire(m);
2495 vm_page_unlock(m);
2496 }
2497 if ((allocflags & VM_ALLOC_NOBUSY) == 0)
2498 vm_page_busy(m);
2499 return (m);
2500 }
2501 }
2502 m = vm_page_alloc(object, pindex, allocflags & ~(VM_ALLOC_RETRY |
2503 VM_ALLOC_IGN_SBUSY));
2504 if (m == NULL) {
2505 VM_OBJECT_UNLOCK(object);
2506 VM_WAIT;
2507 VM_OBJECT_LOCK(object);
2508 goto retrylookup;
2509 } else if (m->valid != 0)
2510 return (m);
2511 if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
2512 pmap_zero_page(m);
2513 return (m);
2514}
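/*
 * A minimal usage sketch (illustration only: "object", "pindex", and the
 * choice of flags are hypothetical, and vm_page_wakeup() is assumed to be
 * the usual helper for clearing VPO_BUSY when the caller is done):
 */
#if 0
	VM_OBJECT_LOCK(object);
	m = vm_page_grab(object, pindex,
	    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_ZERO);
	/* ... fill in or inspect the busied page ... */
	vm_page_wakeup(m);
	VM_OBJECT_UNLOCK(object);
#endif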
2515
2516/*
2517 * Mapping function for valid bits or for dirty bits in
2518 * a page. May not block.
2519 *
2520 * Inputs are required to range within a page.
2521 */
2522vm_page_bits_t
2523vm_page_bits(int base, int size)
2524{
2525 int first_bit;
2526 int last_bit;
2527
2528 KASSERT(
2529 base + size <= PAGE_SIZE,
2530 ("vm_page_bits: illegal base/size %d/%d", base, size)
2531 );
2532
2533 if (size == 0) /* handle degenerate case */
2534 return (0);
2535
2536 first_bit = base >> DEV_BSHIFT;
2537 last_bit = (base + size - 1) >> DEV_BSHIFT;
2538
2539 return (((vm_page_bits_t)2 << last_bit) -
2540 ((vm_page_bits_t)1 << first_bit));
2541}
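/*
 * For example, assuming DEV_BSIZE is 512 (DEV_BSHIFT is 9): base = 512 and
 * size = 1024 give first_bit = 1 and last_bit = (512 + 1024 - 1) >> 9 = 2,
 * so the function returns (2 << 2) - (1 << 1) = 0x6, i.e. the bits for
 * blocks 1 and 2 of the page.
 */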
2542
2543/*
2544 * vm_page_set_valid_range:
2545 *
2546 * Sets portions of a page valid. The arguments are expected
2547 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
2548 * of any partial chunks touched by the range. The invalid portion of
2549 * such chunks will be zeroed.
2550 *
2551 * (base + size) must be less than or equal to PAGE_SIZE.
2552 */
2553void
2554vm_page_set_valid_range(vm_page_t m, int base, int size)
2555{
2556 int endoff, frag;
2557
2558 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2559 if (size == 0) /* handle degenerate case */
2560 return;
2561
2562 /*
2563 * If the base is not DEV_BSIZE aligned and the valid
2564 * bit is clear, we have to zero out a portion of the
2565 * first block.
2566 */
2567 if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
2568 (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
2569 pmap_zero_page_area(m, frag, base - frag);
2570
2571 /*
2572 * If the ending offset is not DEV_BSIZE aligned and the
2573 * valid bit is clear, we have to zero out a portion of
2574 * the last block.
2575 */
2576 endoff = base + size;
2577 if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
2578 (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
2579 pmap_zero_page_area(m, endoff,
2580 DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
2581
2582 /*
2583 * Assert that no previously invalid block that is now being validated
2584 * is already dirty.
2585 */
2586 KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
2587 ("vm_page_set_valid_range: page %p is dirty", m));
2588
2589 /*
2590 * Set valid bits inclusive of any overlap.
2591 */
2592 m->valid |= vm_page_bits(base, size);
2593}
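/*
 * For example, with DEV_BSIZE == 512, a call with base = 100 and size = 300
 * touches only block 0.  If that block's valid bit was clear, bytes
 * [0, 100) and [400, 512) are zeroed first, so the whole block has defined
 * contents once its valid bit is set.
 */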
2594
2595/*
2596 * Clear the given bits from the specified page's dirty field.
2597 */
2598static __inline void
2599vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits)
2600{
2601 uintptr_t addr;
2602#if PAGE_SIZE < 16384
2603 int shift;
2604#endif
2605
2606 /*
2607 * If the object is locked and the page is neither VPO_BUSY nor
2608 * PGA_WRITEABLE, then the page's dirty field cannot possibly be
2609 * set by a concurrent pmap operation.
2610 */
2611 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2612 if ((m->oflags & VPO_BUSY) == 0 && (m->aflags & PGA_WRITEABLE) == 0)
2613 m->dirty &= ~pagebits;
2614 else {
2615 /*
2616 * The pmap layer can call vm_page_dirty() without
2617 * holding a distinguished lock. The combination of
2618		 * the object's lock and an atomic operation suffices
2619 * to guarantee consistency of the page dirty field.
2620 *
2621		 * For the PAGE_SIZE == 32768 case, the compiler already
2622		 * properly aligns the dirty field, so no forcible
2623		 * alignment is needed.  Only the existence of
2624		 * atomic_clear_64 is required when the page size is 32768.
2625 */
2626 addr = (uintptr_t)&m->dirty;
2627#if PAGE_SIZE == 32768
2628 atomic_clear_64((uint64_t *)addr, pagebits);
2629#elif PAGE_SIZE == 16384
2630 atomic_clear_32((uint32_t *)addr, pagebits);
2631#else /* PAGE_SIZE <= 8192 */
2632 /*
2633 * Use a trick to perform a 32-bit atomic on the
2634 * containing aligned word, to not depend on the existence
2635 * of atomic_clear_{8, 16}.
2636 */
2637 shift = addr & (sizeof(uint32_t) - 1);
2638#if BYTE_ORDER == BIG_ENDIAN
2639 shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY;
2640#else
2641 shift *= NBBY;
2642#endif
2643 addr &= ~(sizeof(uint32_t) - 1);
2644 atomic_clear_32((uint32_t *)addr, pagebits << shift);
2645#endif /* PAGE_SIZE */
2646 }
2647}
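/*
 * A concrete little-endian example of the containment trick above, assuming
 * an 8KB page so that the dirty field is 16 bits wide: if "dirty" starts at
 * byte offset 2 within its 32-bit word, shift becomes 2 * NBBY = 16, addr is
 * rounded down to the word boundary, and atomic_clear_32(addr, pagebits << 16)
 * clears exactly the requested dirty bits without touching the other half of
 * the word.
 */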
2648
2649/*
2650 * vm_page_set_validclean:
2651 *
2652 * Sets portions of a page valid and clean. The arguments are expected
2653 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
2654 * of any partial chunks touched by the range. The invalid portion of
2655 * such chunks will be zero'd.
2656 *
2657 * This routine may not block.
2658 *
2659 * (base + size) must be less than or equal to PAGE_SIZE.
2660 */
2661void
2662vm_page_set_validclean(vm_page_t m, int base, int size)
2663{
2664 vm_page_bits_t oldvalid, pagebits;
2665 int endoff, frag;
2666
2667 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2668 if (size == 0) /* handle degenerate case */
2669 return;
2670
2671 /*
2672 * If the base is not DEV_BSIZE aligned and the valid
2673 * bit is clear, we have to zero out a portion of the
2674 * first block.
2675 */
2676 if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
2677 (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
2678 pmap_zero_page_area(m, frag, base - frag);
2679
2680 /*
2681 * If the ending offset is not DEV_BSIZE aligned and the
2682 * valid bit is clear, we have to zero out a portion of
2683 * the last block.
2684 */
2685 endoff = base + size;
2686 if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
2687 (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
2688 pmap_zero_page_area(m, endoff,
2689 DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
2690
2691 /*
2692 * Set valid, clear dirty bits. If validating the entire
2693 * page we can safely clear the pmap modify bit. We also
2694 * use this opportunity to clear the VPO_NOSYNC flag. If a process
2695 * takes a write fault on a MAP_NOSYNC memory area the flag will
2696 * be set again.
2697 *
2698 * We set valid bits inclusive of any overlap, but we can only
2699 * clear dirty bits for DEV_BSIZE chunks that are fully within
2700 * the range.
2701 */
2702 oldvalid = m->valid;
2703 pagebits = vm_page_bits(base, size);
2704 m->valid |= pagebits;
2705#if 0 /* NOT YET */
2706 if ((frag = base & (DEV_BSIZE - 1)) != 0) {
2707 frag = DEV_BSIZE - frag;
2708 base += frag;
2709 size -= frag;
2710 if (size < 0)
2711 size = 0;
2712 }
2713 pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
2714#endif
2715 if (base == 0 && size == PAGE_SIZE) {
2716 /*
2717 * The page can only be modified within the pmap if it is
2718 * mapped, and it can only be mapped if it was previously
2719 * fully valid.
2720 */
2721 if (oldvalid == VM_PAGE_BITS_ALL)
2722 /*
2723 * Perform the pmap_clear_modify() first. Otherwise,
2724 * a concurrent pmap operation, such as
2725 * pmap_protect(), could clear a modification in the
2726 * pmap and set the dirty field on the page before
2727 * pmap_clear_modify() had begun and after the dirty
2728 * field was cleared here.
2729 */
2730 pmap_clear_modify(m);
2731 m->dirty = 0;
2732 m->oflags &= ~VPO_NOSYNC;
2733 } else if (oldvalid != VM_PAGE_BITS_ALL)
2734 m->dirty &= ~pagebits;
2735 else
2736 vm_page_clear_dirty_mask(m, pagebits);
2737}
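/*
 * For example, with DEV_BSIZE == 512 and a page that was only partially
 * valid beforehand, a call with base = 0 and size = 1024 marks blocks 0 and
 * 1 valid and clears their dirty bits directly (a partially valid page
 * cannot have been mapped, so no atomic update is needed), while the dirty
 * bits of the remaining blocks are left untouched.
 */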
2738
2739void
2740vm_page_clear_dirty(vm_page_t m, int base, int size)
2741{
2742
2743 vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
2744}
2745
2746/*
2747 * vm_page_set_invalid:
2748 *
2749 * Invalidates DEV_BSIZE'd chunks within a page. Both the
2750 * valid and dirty bits for the affected areas are cleared.
2751 *
2752 * May not block.
2753 */
2754void
2755vm_page_set_invalid(vm_page_t m, int base, int size)
2756{
2757 vm_page_bits_t bits;
2758
2759 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2760 KASSERT((m->oflags & VPO_BUSY) == 0,
2761 ("vm_page_set_invalid: page %p is busy", m));
2762 bits = vm_page_bits(base, size);
2763 if (m->valid == VM_PAGE_BITS_ALL && bits != 0)
2764 pmap_remove_all(m);
2765 KASSERT(!pmap_page_is_mapped(m),
2766 ("vm_page_set_invalid: page %p is mapped", m));
2767 m->valid &= ~bits;
2768 m->dirty &= ~bits;
2769}
2770
2771/*
2772 * vm_page_zero_invalid()
2773 *
2774 * The kernel assumes that the invalid portions of a page contain
2775 * garbage, but such pages can be mapped into memory by user code.
2776 * When this occurs, we must zero out the non-valid portions of the
2777 * page so user code sees what it expects.
2778 *
2779 * Pages are most often semi-valid when the end of a file is mapped
2780 * into memory and the file's size is not page aligned.
2781 */
2782void
2783vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
2784{
2785 int b;
2786 int i;
2787
2788 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2789 /*
2790 * Scan the valid bits looking for invalid sections that
2791	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
2792	 * valid bit may be set) have already been zeroed by
2793 * vm_page_set_validclean().
2794 */
2795 for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
2796 if (i == (PAGE_SIZE / DEV_BSIZE) ||
2797 (m->valid & ((vm_page_bits_t)1 << i))) {
2798 if (i > b) {
2799 pmap_zero_page_area(m,
2800 b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
2801 }
2802 b = i + 1;
2803 }
2804 }
2805
2806 /*
2807 * setvalid is TRUE when we can safely set the zero'd areas
2808	 * as being valid.  We can do this if there are no cache consistency
2809	 * issues, e.g., it is ok to do with UFS, but not ok to do with NFS.
2810 */
2811 if (setvalid)
2812 m->valid = VM_PAGE_BITS_ALL;
2813}
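/*
 * For example, with PAGE_SIZE == 4096 and DEV_BSIZE == 512 (8 blocks), a
 * page whose valid mask is 0x0f has blocks 0-3 valid; the scan above issues
 * a single pmap_zero_page_area(m, 2048, 2048) call to clear blocks 4-7.
 */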
2814
2815/*
2816 * vm_page_is_valid:
2817 *
2818 * Is (partial) page valid?  Note that in the degenerate case where
2819 * size == 0, this returns FALSE if the page is entirely invalid,
2820 * and TRUE otherwise.
2821 *
2822 * May not block.
2823 */
2824int
2825vm_page_is_valid(vm_page_t m, int base, int size)
2826{
2827 vm_page_bits_t bits;
2828
2829 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2830 bits = vm_page_bits(base, size);
2831 if (m->valid && ((m->valid & bits) == bits))
2832 return 1;
2833 else
2834 return 0;
2835}
2836
2837/*
2838 * update dirty bits from pmap/mmu. May not block.
2839 */
2840void
2841vm_page_test_dirty(vm_page_t m)
2842{
2843
2844 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2845 if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
2846 vm_page_dirty(m);
2847}
2848
2849void
2850vm_page_lock_KBI(vm_page_t m, const char *file, int line)
2851{
2852
2853 mtx_lock_flags_(vm_page_lockptr(m), 0, file, line);
2854}
2855
2856void
2857vm_page_unlock_KBI(vm_page_t m, const char *file, int line)
2858{
2859
2860 mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line);
2861}
2862
2863int
2864vm_page_trylock_KBI(vm_page_t m, const char *file, int line)
2865{
2866
2867 return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line));
2868}
2869
2870#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
2871void
2872vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
2873{
2874
2875 mtx_assert_(vm_page_lockptr(m), a, file, line);
2876}
2877#endif
2878
2879int so_zerocp_fullpage = 0;
2880
2881/*
2882 * Replace the given page with a copy. The copied page assumes
2883 * the portion of the given page's "wire_count" that is not the
2884 * responsibility of this copy-on-write mechanism.
2885 *
2886 * The object containing the given page must have a non-zero
2887 * paging-in-progress count and be locked.
2888 */
2889void
2890vm_page_cowfault(vm_page_t m)
2891{
2892 vm_page_t mnew;
2893 vm_object_t object;
2894 vm_pindex_t pindex;
2895
2896 mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);
2897 vm_page_lock_assert(m, MA_OWNED);
2898 object = m->object;
2899 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2900 KASSERT(object->paging_in_progress != 0,
2901 ("vm_page_cowfault: object %p's paging-in-progress count is zero.",
2902 object));
2903 pindex = m->pindex;
2904
2905 retry_alloc:
2906 pmap_remove_all(m);
2907 vm_page_remove(m);
2908 mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
2909 if (mnew == NULL) {
2910 vm_page_insert(m, object, pindex);
2911 vm_page_unlock(m);
2912 VM_OBJECT_UNLOCK(object);
2913 VM_WAIT;
2914 VM_OBJECT_LOCK(object);
2915 if (m == vm_page_lookup(object, pindex)) {
2916 vm_page_lock(m);
2917 goto retry_alloc;
2918 } else {
2919 /*
2920 * Page disappeared during the wait.
2921 */
2922 return;
2923 }
2924 }
2925
2926 if (m->cow == 0) {
2927 /*
2928 * check to see if we raced with an xmit complete when
2929 * waiting to allocate a page. If so, put things back
2930 * the way they were
2931 */
2932 vm_page_unlock(m);
2933 vm_page_lock(mnew);
2934 vm_page_free(mnew);
2935 vm_page_unlock(mnew);
2936 vm_page_insert(m, object, pindex);
2937 } else { /* clear COW & copy page */
2938 if (!so_zerocp_fullpage)
2939 pmap_copy_page(m, mnew);
2940 mnew->valid = VM_PAGE_BITS_ALL;
2941 vm_page_dirty(mnew);
2942 mnew->wire_count = m->wire_count - m->cow;
2943 m->wire_count = m->cow;
2944 vm_page_unlock(m);
2945 }
2946}
2947
2948void
2949vm_page_cowclear(vm_page_t m)
2950{
2951
2952 vm_page_lock_assert(m, MA_OWNED);
2953 if (m->cow) {
2954 m->cow--;
2955 /*
2956 * let vm_fault add back write permission lazily
2957 */
2958 }
2959 /*
2960 * sf_buf_free() will free the page, so we needn't do it here
2961 */
2962}
2963
2964int
2965vm_page_cowsetup(vm_page_t m)
2966{
2967
2968 vm_page_lock_assert(m, MA_OWNED);
2969 if ((m->flags & PG_FICTITIOUS) != 0 ||
2970 (m->oflags & VPO_UNMANAGED) != 0 ||
2971 m->cow == USHRT_MAX - 1 || !VM_OBJECT_TRYLOCK(m->object))
2972 return (EBUSY);
2973 m->cow++;
2974 pmap_remove_write(m);
2975 VM_OBJECT_UNLOCK(m->object);
2976 return (0);
2977}
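/*
 * These cow* routines support the zero-copy socket send path (hence
 * so_zerocp_fullpage above and the "xmit complete" wording in
 * vm_page_cowfault()): vm_page_cowsetup() write-protects a page that has
 * been lent to the network stack, vm_page_cowfault() gives the faulting
 * process a private copy if it writes to the page before the transmit
 * completes, and vm_page_cowclear() drops the COW reference once the
 * transmit is done.
 */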
2978
2979#ifdef INVARIANTS
2980void
2981vm_page_object_lock_assert(vm_page_t m)
2982{
2983
2984 /*
2985 * Certain of the page's fields may only be modified by the
2986 * holder of the containing object's lock or the setter of the
2987 * page's VPO_BUSY flag. Unfortunately, the setter of the
2988 * VPO_BUSY flag is not recorded, and thus cannot be checked
2989 * here.
2990 */
2991 if (m->object != NULL && (m->oflags & VPO_BUSY) == 0)
2992 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2993}
2994#endif
2995
2996#include "opt_ddb.h"
2997#ifdef DDB
2998#include <sys/kernel.h>
2999
3000#include <ddb/ddb.h>
3001
3002DB_SHOW_COMMAND(page, vm_page_print_page_info)
3003{
3004 db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
3005 db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
3006 db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
3007 db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
3008 db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
3009 db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
3010 db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
3011 db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
3012 db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
3013 db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
3014}
3015
3016DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
3017{
3018
3019 db_printf("PQ_FREE:");
3020 db_printf(" %d", cnt.v_free_count);
3021 db_printf("\n");
3022
3023 db_printf("PQ_CACHE:");
3024 db_printf(" %d", cnt.v_cache_count);
3025 db_printf("\n");
3026
3027 db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
3028 *vm_page_queues[PQ_ACTIVE].cnt,
3029 *vm_page_queues[PQ_INACTIVE].cnt);
3030}
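/*
 * From the ddb prompt these are invoked as "show page" and "show pageq",
 * respectively.
 */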
3031#endif /* DDB */
1607 if ((req & VM_ALLOC_WIRED) != 0)
1608 atomic_add_int(&cnt.v_wire_count, npages);
1609 oflags = VPO_UNMANAGED;
1610 if (object != NULL) {
1611 if ((req & VM_ALLOC_NOBUSY) == 0)
1612 oflags |= VPO_BUSY;
1613 if (object->memattr != VM_MEMATTR_DEFAULT &&
1614 memattr == VM_MEMATTR_DEFAULT)
1615 memattr = object->memattr;
1616 }
1617 for (m = m_ret; m < &m_ret[npages]; m++) {
1618 m->aflags = 0;
1619 m->flags &= flags;
1620 if ((req & VM_ALLOC_WIRED) != 0)
1621 m->wire_count = 1;
1622 /* Unmanaged pages don't use "act_count". */
1623 m->oflags = oflags;
1624 if (memattr != VM_MEMATTR_DEFAULT)
1625 pmap_page_set_memattr(m, memattr);
1626 if (object != NULL)
1627 vm_page_insert(m, object, pindex);
1628 else
1629 m->pindex = pindex;
1630 pindex++;
1631 }
1632 while (deferred_vdrop_list != NULL) {
1633 vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
1634 deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next;
1635 }
1636 if (vm_paging_needed())
1637 pagedaemon_wakeup();
1638 return (m_ret);
1639}
1640
1641/*
1642 * Initialize a page that has been freshly dequeued from a freelist.
1643 * The caller has to drop the vnode returned, if it is not NULL.
1644 *
1645 * This function may only be used to initialize unmanaged pages.
1646 *
1647 * To be called with vm_page_queue_free_mtx held.
1648 */
1649static struct vnode *
1650vm_page_alloc_init(vm_page_t m)
1651{
1652 struct vnode *drop;
1653 vm_object_t m_object;
1654
1655 KASSERT(m->queue == PQ_NONE,
1656 ("vm_page_alloc_init: page %p has unexpected queue %d",
1657 m, m->queue));
1658 KASSERT(m->wire_count == 0,
1659 ("vm_page_alloc_init: page %p is wired", m));
1660 KASSERT(m->hold_count == 0,
1661 ("vm_page_alloc_init: page %p is held", m));
1662 KASSERT(m->busy == 0,
1663 ("vm_page_alloc_init: page %p is busy", m));
1664 KASSERT(m->dirty == 0,
1665 ("vm_page_alloc_init: page %p is dirty", m));
1666 KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
1667 ("vm_page_alloc_init: page %p has unexpected memattr %d",
1668 m, pmap_page_get_memattr(m)));
1669 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1670 drop = NULL;
1671 if ((m->flags & PG_CACHED) != 0) {
1672 KASSERT((m->flags & PG_ZERO) == 0,
1673 ("vm_page_alloc_init: cached page %p is PG_ZERO", m));
1674 m->valid = 0;
1675 m_object = m->object;
1676 vm_page_cache_remove(m);
1677 if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
1678 drop = m_object->handle;
1679 } else {
1680 KASSERT(VM_PAGE_IS_FREE(m),
1681 ("vm_page_alloc_init: page %p is not free", m));
1682 KASSERT(m->valid == 0,
1683 ("vm_page_alloc_init: free page %p is valid", m));
1684 cnt.v_free_count--;
1685 if ((m->flags & PG_ZERO) != 0)
1686 vm_page_zero_count--;
1687 }
1688 /* Don't clear the PG_ZERO flag; we'll need it later. */
1689 m->flags &= PG_ZERO;
1690 return (drop);
1691}
1692
1693/*
1694 * vm_page_alloc_freelist:
1695 *
1696 * Allocate a physical page from the specified free page list.
1697 *
1698 * The caller must always specify an allocation class.
1699 *
1700 * allocation classes:
1701 * VM_ALLOC_NORMAL normal process request
1702 * VM_ALLOC_SYSTEM system *really* needs a page
1703 * VM_ALLOC_INTERRUPT interrupt time request
1704 *
1705 * optional allocation flags:
1706 * VM_ALLOC_COUNT(number) the number of additional pages that the caller
1707 * intends to allocate
1708 * VM_ALLOC_WIRED wire the allocated page
1709 * VM_ALLOC_ZERO prefer a zeroed page
1710 *
1711 * This routine may not sleep.
1712 */
1713vm_page_t
1714vm_page_alloc_freelist(int flind, int req)
1715{
1716 struct vnode *drop;
1717 vm_page_t m;
1718 u_int flags;
1719 int req_class;
1720
1721 req_class = req & VM_ALLOC_CLASS_MASK;
1722
1723 /*
1724 * The page daemon is allowed to dig deeper into the free page list.
1725 */
1726 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1727 req_class = VM_ALLOC_SYSTEM;
1728
1729 /*
1730	 * Do not allocate reserved pages unless the request asks for them.
1731 */
1732 mtx_lock(&vm_page_queue_free_mtx);
1733 if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
1734 (req_class == VM_ALLOC_SYSTEM &&
1735 cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
1736 (req_class == VM_ALLOC_INTERRUPT &&
1737 cnt.v_free_count + cnt.v_cache_count > 0))
1738 m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
1739 else {
1740 mtx_unlock(&vm_page_queue_free_mtx);
1741 atomic_add_int(&vm_pageout_deficit,
1742 max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
1743 pagedaemon_wakeup();
1744 return (NULL);
1745 }
1746 if (m == NULL) {
1747 mtx_unlock(&vm_page_queue_free_mtx);
1748 return (NULL);
1749 }
1750 drop = vm_page_alloc_init(m);
1751 mtx_unlock(&vm_page_queue_free_mtx);
1752
1753 /*
1754 * Initialize the page. Only the PG_ZERO flag is inherited.
1755 */
1756 m->aflags = 0;
1757 flags = 0;
1758 if ((req & VM_ALLOC_ZERO) != 0)
1759 flags = PG_ZERO;
1760 m->flags &= flags;
1761 if ((req & VM_ALLOC_WIRED) != 0) {
1762 /*
1763 * The page lock is not required for wiring a page that does
1764 * not belong to an object.
1765 */
1766 atomic_add_int(&cnt.v_wire_count, 1);
1767 m->wire_count = 1;
1768 }
1769 /* Unmanaged pages don't use "act_count". */
1770 m->oflags = VPO_UNMANAGED;
1771 if (drop != NULL)
1772 vdrop(drop);
1773 if (vm_paging_needed())
1774 pagedaemon_wakeup();
1775 return (m);
1776}
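/*
 * A minimal usage sketch (the freelist index 0 and the error handling are
 * illustrative only):
 */
#if 0
	m = vm_page_alloc_freelist(0, VM_ALLOC_NORMAL | VM_ALLOC_WIRED |
	    VM_ALLOC_ZERO);
	if (m == NULL)
		return (ENOMEM);	/* or block in VM_WAIT and retry */
#endif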
1777
1778/*
1779 * vm_wait: (also see VM_WAIT macro)
1780 *
1781 * Block until free pages are available for allocation
1782 * - Called in various places before memory allocations.
1783 */
1784void
1785vm_wait(void)
1786{
1787
1788 mtx_lock(&vm_page_queue_free_mtx);
1789 if (curproc == pageproc) {
1790 vm_pageout_pages_needed = 1;
1791 msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
1792 PDROP | PSWP, "VMWait", 0);
1793 } else {
1794 if (!vm_pages_needed) {
1795 vm_pages_needed = 1;
1796 wakeup(&vm_pages_needed);
1797 }
1798 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
1799 "vmwait", 0);
1800 }
1801}
1802
1803/*
1804 * vm_waitpfault: (also see VM_WAITPFAULT macro)
1805 *
1806 * Block until free pages are available for allocation
1807 * - Called only in vm_fault so that processes page faulting
1808 * can be easily tracked.
1809 * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing
1810 * processes will be able to grab memory first. Do not change
1811 * this balance without careful testing first.
1812 */
1813void
1814vm_waitpfault(void)
1815{
1816
1817 mtx_lock(&vm_page_queue_free_mtx);
1818 if (!vm_pages_needed) {
1819 vm_pages_needed = 1;
1820 wakeup(&vm_pages_needed);
1821 }
1822 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
1823 "pfault", 0);
1824}
1825
1826/*
1827 * vm_page_requeue:
1828 *
1829 * Move the given page to the tail of its present page queue.
1830 *
1831 * The page queues must be locked.
1832 */
1833void
1834vm_page_requeue(vm_page_t m)
1835{
1836 struct vpgqueues *vpq;
1837 int queue;
1838
1839 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1840 queue = m->queue;
1841 KASSERT(queue != PQ_NONE,
1842 ("vm_page_requeue: page %p is not queued", m));
1843 vpq = &vm_page_queues[queue];
1844 TAILQ_REMOVE(&vpq->pl, m, pageq);
1845 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
1846}
1847
1848/*
1849 * vm_page_queue_remove:
1850 *
1851 * Remove the given page from the specified queue.
1852 *
1853 * The page and page queues must be locked.
1854 */
1855static __inline void
1856vm_page_queue_remove(int queue, vm_page_t m)
1857{
1858 struct vpgqueues *pq;
1859
1860 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1861 vm_page_lock_assert(m, MA_OWNED);
1862 pq = &vm_page_queues[queue];
1863 TAILQ_REMOVE(&pq->pl, m, pageq);
1864 (*pq->cnt)--;
1865}
1866
1867/*
1868 * vm_pageq_remove:
1869 *
1870 * Remove a page from its queue.
1871 *
1872 * The given page must be locked.
1873 * This routine may not block.
1874 */
1875void
1876vm_pageq_remove(vm_page_t m)
1877{
1878 int queue;
1879
1880 vm_page_lock_assert(m, MA_OWNED);
1881 if ((queue = m->queue) != PQ_NONE) {
1882 vm_page_lock_queues();
1883 m->queue = PQ_NONE;
1884 vm_page_queue_remove(queue, m);
1885 vm_page_unlock_queues();
1886 }
1887}
1888
1889/*
1890 * vm_page_enqueue:
1891 *
1892 * Add the given page to the specified queue.
1893 *
1894 * The page queues must be locked.
1895 */
1896static void
1897vm_page_enqueue(int queue, vm_page_t m)
1898{
1899 struct vpgqueues *vpq;
1900
1901 vpq = &vm_page_queues[queue];
1902 m->queue = queue;
1903 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
1904 ++*vpq->cnt;
1905}
1906
1907/*
1908 * vm_page_activate:
1909 *
1910 * Put the specified page on the active list (if appropriate).
1911 * Ensure that act_count is at least ACT_INIT but do not otherwise
1912 * mess with it.
1913 *
1914 * The page must be locked.
1915 * This routine may not block.
1916 */
1917void
1918vm_page_activate(vm_page_t m)
1919{
1920 int queue;
1921
1922 vm_page_lock_assert(m, MA_OWNED);
1923 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1924 if ((queue = m->queue) != PQ_ACTIVE) {
1925 if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
1926 if (m->act_count < ACT_INIT)
1927 m->act_count = ACT_INIT;
1928 vm_page_lock_queues();
1929 if (queue != PQ_NONE)
1930 vm_page_queue_remove(queue, m);
1931 vm_page_enqueue(PQ_ACTIVE, m);
1932 vm_page_unlock_queues();
1933 } else
1934 KASSERT(queue == PQ_NONE,
1935 ("vm_page_activate: wired page %p is queued", m));
1936 } else {
1937 if (m->act_count < ACT_INIT)
1938 m->act_count = ACT_INIT;
1939 }
1940}
1941
1942/*
1943 * vm_page_free_wakeup:
1944 *
1945 * Helper routine for vm_page_free_toq() and vm_page_cache(). This
1946 * routine is called when a page has been added to the cache or free
1947 * queues.
1948 *
1949 * The page queues must be locked.
1950 * This routine may not block.
1951 */
1952static inline void
1953vm_page_free_wakeup(void)
1954{
1955
1956 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1957 /*
1958	 * If the pageout daemon needs pages, then tell it that there
1959	 * are some free.
1960 */
1961 if (vm_pageout_pages_needed &&
1962 cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
1963 wakeup(&vm_pageout_pages_needed);
1964 vm_pageout_pages_needed = 0;
1965 }
1966 /*
1967	 * Wake up processes that are waiting on memory if we hit a
1968	 * high water mark, and wake up the scheduler process if we
1969	 * have lots of memory; that process will swap in processes.
1970 */
1971 if (vm_pages_needed && !vm_page_count_min()) {
1972 vm_pages_needed = 0;
1973 wakeup(&cnt.v_free_count);
1974 }
1975}
1976
1977/*
1978 * vm_page_free_toq:
1979 *
1980 * Returns the given page to the free list,
1981 * disassociating it with any VM object.
1982 *
1983 * Object and page must be locked prior to entry.
1984 * This routine may not block.
1985 */
1986
1987void
1988vm_page_free_toq(vm_page_t m)
1989{
1990
1991 if ((m->oflags & VPO_UNMANAGED) == 0) {
1992 vm_page_lock_assert(m, MA_OWNED);
1993 KASSERT(!pmap_page_is_mapped(m),
1994 ("vm_page_free_toq: freeing mapped page %p", m));
1995 }
1996 PCPU_INC(cnt.v_tfree);
1997
1998 if (VM_PAGE_IS_FREE(m))
1999 panic("vm_page_free: freeing free page %p", m);
2000 else if (m->busy != 0)
2001 panic("vm_page_free: freeing busy page %p", m);
2002
2003 /*
2004 * unqueue, then remove page. Note that we cannot destroy
2005 * the page here because we do not want to call the pager's
2006 * callback routine until after we've put the page on the
2007 * appropriate free queue.
2008 */
2009 if ((m->oflags & VPO_UNMANAGED) == 0)
2010 vm_pageq_remove(m);
2011 vm_page_remove(m);
2012
2013 /*
2014	 * If fictitious, remove the object association and
2015	 * return; otherwise, delay the object association removal.
2016 */
2017 if ((m->flags & PG_FICTITIOUS) != 0) {
2018 return;
2019 }
2020
2021 m->valid = 0;
2022 vm_page_undirty(m);
2023
2024 if (m->wire_count != 0)
2025 panic("vm_page_free: freeing wired page %p", m);
2026 if (m->hold_count != 0) {
2027 m->flags &= ~PG_ZERO;
2028 vm_page_lock_queues();
2029 vm_page_enqueue(PQ_HOLD, m);
2030 vm_page_unlock_queues();
2031 } else {
2032 /*
2033 * Restore the default memory attribute to the page.
2034 */
2035 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
2036 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
2037
2038 /*
2039 * Insert the page into the physical memory allocator's
2040 * cache/free page queues.
2041 */
2042 mtx_lock(&vm_page_queue_free_mtx);
2043 m->flags |= PG_FREE;
2044 cnt.v_free_count++;
2045#if VM_NRESERVLEVEL > 0
2046 if (!vm_reserv_free_page(m))
2047#else
2048 if (TRUE)
2049#endif
2050 vm_phys_free_pages(m, 0);
2051 if ((m->flags & PG_ZERO) != 0)
2052 ++vm_page_zero_count;
2053 else
2054 vm_page_zero_idle_wakeup();
2055 vm_page_free_wakeup();
2056 mtx_unlock(&vm_page_queue_free_mtx);
2057 }
2058}
2059
2060/*
2061 * vm_page_wire:
2062 *
2063 * Mark this page as wired down by yet
2064 * another map, removing it from paging queues
2065 * as necessary.
2066 *
2067 * If the page is fictitious, then its wire count must remain one.
2068 *
2069 * The page must be locked.
2070 * This routine may not block.
2071 */
2072void
2073vm_page_wire(vm_page_t m)
2074{
2075
2076 /*
2077 * Only bump the wire statistics if the page is not already wired,
2078 * and only unqueue the page if it is on some queue (if it is unmanaged
2079 * it is already off the queues).
2080 */
2081 vm_page_lock_assert(m, MA_OWNED);
2082 if ((m->flags & PG_FICTITIOUS) != 0) {
2083 KASSERT(m->wire_count == 1,
2084 ("vm_page_wire: fictitious page %p's wire count isn't one",
2085 m));
2086 return;
2087 }
2088 if (m->wire_count == 0) {
2089 if ((m->oflags & VPO_UNMANAGED) == 0)
2090 vm_pageq_remove(m);
2091 atomic_add_int(&cnt.v_wire_count, 1);
2092 }
2093 m->wire_count++;
2094 KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
2095}
2096
2097/*
2098 * vm_page_unwire:
2099 *
2100 * Release one wiring of the specified page, potentially enabling it to be
2101 * paged again. If paging is enabled, then the value of the parameter
2102 * "activate" determines to which queue the page is added. If "activate" is
2103 * non-zero, then the page is added to the active queue. Otherwise, it is
2104 * added to the inactive queue.
2105 *
2106 * However, unless the page belongs to an object, it is not enqueued because
2107 * it cannot be paged out.
2108 *
2109 * If a page is fictitious, then its wire count must always be one.
2110 *
2111 * A managed page must be locked.
2112 */
2113void
2114vm_page_unwire(vm_page_t m, int activate)
2115{
2116
2117 if ((m->oflags & VPO_UNMANAGED) == 0)
2118 vm_page_lock_assert(m, MA_OWNED);
2119 if ((m->flags & PG_FICTITIOUS) != 0) {
2120 KASSERT(m->wire_count == 1,
2121 ("vm_page_unwire: fictitious page %p's wire count isn't one", m));
2122 return;
2123 }
2124 if (m->wire_count > 0) {
2125 m->wire_count--;
2126 if (m->wire_count == 0) {
2127 atomic_subtract_int(&cnt.v_wire_count, 1);
2128 if ((m->oflags & VPO_UNMANAGED) != 0 ||
2129 m->object == NULL)
2130 return;
2131 vm_page_lock_queues();
2132 if (activate)
2133 vm_page_enqueue(PQ_ACTIVE, m);
2134 else {
2135 m->flags &= ~PG_WINATCFLS;
2136 vm_page_enqueue(PQ_INACTIVE, m);
2137 }
2138 vm_page_unlock_queues();
2139 }
2140 } else
2141 panic("vm_page_unwire: page %p's wire count is zero", m);
2142}
2143
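/*
 * A typical wire/unwire pairing, following the pattern used by
 * vm_page_grab() above (illustration only):
 */
#if 0
	vm_page_lock(m);
	vm_page_wire(m);		/* pin the page; it cannot be paged out */
	vm_page_unlock(m);
	/* ... access the page's contents ... */
	vm_page_lock(m);
	vm_page_unwire(m, 0);		/* 0: requeue to the inactive queue */
	vm_page_unlock(m);
#endif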
2144/*
2145 * Move the specified page to the inactive queue.
2146 *
2147 * Many pages placed on the inactive queue should actually go
2148 * into the cache, but it is difficult to figure out which. What
2149 * we do instead, if the inactive target is well met, is to put
2150 * clean pages at the head of the inactive queue instead of the tail.
2151 * This will cause them to be moved to the cache more quickly and
2152 * if not actively re-referenced, reclaimed more quickly. If we just
2153 * stick these pages at the end of the inactive queue, heavy filesystem
2154 * meta-data accesses can cause an unnecessary paging load on memory bound
2155 * processes. This optimization causes one-time-use metadata to be
2156 * reused more quickly.
2157 *
2158 * Normally athead is 0 resulting in LRU operation. athead is set
2159 * to 1 if we want this page to be 'as if it were placed in the cache',
2160 * except without unmapping it from the process address space.
2161 *
2162 * This routine may not block.
2163 */
2164static inline void
2165_vm_page_deactivate(vm_page_t m, int athead)
2166{
2167 int queue;
2168
2169 vm_page_lock_assert(m, MA_OWNED);
2170
2171 /*
2172 * Ignore if already inactive.
2173 */
2174 if ((queue = m->queue) == PQ_INACTIVE)
2175 return;
2176 if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
2177 vm_page_lock_queues();
2178 m->flags &= ~PG_WINATCFLS;
2179 if (queue != PQ_NONE)
2180 vm_page_queue_remove(queue, m);
2181 if (athead)
2182 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m,
2183 pageq);
2184 else
2185 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m,
2186 pageq);
2187 m->queue = PQ_INACTIVE;
2188 cnt.v_inactive_count++;
2189 vm_page_unlock_queues();
2190 }
2191}
2192
2193/*
2194 * Move the specified page to the inactive queue.
2195 *
2196 * The page must be locked.
2197 */
2198void
2199vm_page_deactivate(vm_page_t m)
2200{
2201
2202 _vm_page_deactivate(m, 0);
2203}
2204
2205/*
2206 * vm_page_try_to_cache:
2207 *
2208 * Returns 0 on failure, 1 on success
2209 */
2210int
2211vm_page_try_to_cache(vm_page_t m)
2212{
2213
2214 vm_page_lock_assert(m, MA_OWNED);
2215 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2216 if (m->dirty || m->hold_count || m->busy || m->wire_count ||
2217 (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
2218 return (0);
2219 pmap_remove_all(m);
2220 if (m->dirty)
2221 return (0);
2222 vm_page_cache(m);
2223 return (1);
2224}
2225
2226/*
2227 * vm_page_try_to_free()
2228 *
2229 * Attempt to free the page. If we cannot free it, we do nothing.
2230 * 1 is returned on success, 0 on failure.
2231 */
2232int
2233vm_page_try_to_free(vm_page_t m)
2234{
2235
2236 vm_page_lock_assert(m, MA_OWNED);
2237 if (m->object != NULL)
2238 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2239 if (m->dirty || m->hold_count || m->busy || m->wire_count ||
2240 (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
2241 return (0);
2242 pmap_remove_all(m);
2243 if (m->dirty)
2244 return (0);
2245 vm_page_free(m);
2246 return (1);
2247}
2248
2249/*
2250 * vm_page_cache
2251 *
2252 * Put the specified page onto the page cache queue (if appropriate).
2253 *
2254 * This routine may not block.
2255 */
2256void
2257vm_page_cache(vm_page_t m)
2258{
2259 vm_object_t object;
2260 vm_page_t next, prev, root;
2261
2262 vm_page_lock_assert(m, MA_OWNED);
2263 object = m->object;
2264 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2265 if ((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) || m->busy ||
2266 m->hold_count || m->wire_count)
2267 panic("vm_page_cache: attempting to cache busy page");
2268 pmap_remove_all(m);
2269 if (m->dirty != 0)
2270 panic("vm_page_cache: page %p is dirty", m);
2271 if (m->valid == 0 || object->type == OBJT_DEFAULT ||
2272 (object->type == OBJT_SWAP &&
2273 !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
2274 /*
2275 * Hypothesis: A cache-elgible page belonging to a
2276 * default object or swap object but without a backing
2277 * store must be zero filled.
2278 */
2279 vm_page_free(m);
2280 return;
2281 }
2282 KASSERT((m->flags & PG_CACHED) == 0,
2283 ("vm_page_cache: page %p is already cached", m));
2284 PCPU_INC(cnt.v_tcached);
2285
2286 /*
2287 * Remove the page from the paging queues.
2288 */
2289 vm_pageq_remove(m);
2290
2291 /*
2292 * Remove the page from the object's collection of resident
2293 * pages.
2294 */
2295 if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
2296 /*
2297 * Since the page's successor in the list is also its parent
2298 * in the tree, its right subtree must be empty.
2299 */
2300 next->left = m->left;
2301 KASSERT(m->right == NULL,
2302 ("vm_page_cache: page %p has right child", m));
2303 } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
2304 prev->right == m) {
2305 /*
2306 * Since the page's predecessor in the list is also its parent
2307 * in the tree, its left subtree must be empty.
2308 */
2309 KASSERT(m->left == NULL,
2310 ("vm_page_cache: page %p has left child", m));
2311 prev->right = m->right;
2312 } else {
2313 if (m != object->root)
2314 vm_page_splay(m->pindex, object->root);
2315 if (m->left == NULL)
2316 root = m->right;
2317 else if (m->right == NULL)
2318 root = m->left;
2319 else {
2320 /*
2321 * Move the page's successor to the root, because
2322 * pages are usually removed in ascending order.
2323 */
2324 if (m->right != next)
2325 vm_page_splay(m->pindex, m->right);
2326 next->left = m->left;
2327 root = next;
2328 }
2329 object->root = root;
2330 }
2331 TAILQ_REMOVE(&object->memq, m, listq);
2332 object->resident_page_count--;
2333
2334 /*
2335 * Restore the default memory attribute to the page.
2336 */
2337 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
2338 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
2339
2340 /*
2341 * Insert the page into the object's collection of cached pages
2342 * and the physical memory allocator's cache/free page queues.
2343 */
2344 m->flags &= ~PG_ZERO;
2345 mtx_lock(&vm_page_queue_free_mtx);
2346 m->flags |= PG_CACHED;
2347 cnt.v_cache_count++;
2348 root = object->cache;
2349 if (root == NULL) {
2350 m->left = NULL;
2351 m->right = NULL;
2352 } else {
2353 root = vm_page_splay(m->pindex, root);
2354 if (m->pindex < root->pindex) {
2355 m->left = root->left;
2356 m->right = root;
2357 root->left = NULL;
2358 } else if (__predict_false(m->pindex == root->pindex))
2359 panic("vm_page_cache: offset already cached");
2360 else {
2361 m->right = root->right;
2362 m->left = root;
2363 root->right = NULL;
2364 }
2365 }
2366 object->cache = m;
2367#if VM_NRESERVLEVEL > 0
2368 if (!vm_reserv_free_page(m)) {
2369#else
2370 if (TRUE) {
2371#endif
2372 vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
2373 vm_phys_free_pages(m, 0);
2374 }
2375 vm_page_free_wakeup();
2376 mtx_unlock(&vm_page_queue_free_mtx);
2377
2378 /*
2379 * Increment the vnode's hold count if this is the object's only
2380 * cached page. Decrement the vnode's hold count if this was
2381 * the object's only resident page.
2382 */
2383 if (object->type == OBJT_VNODE) {
2384 if (root == NULL && object->resident_page_count != 0)
2385 vhold(object->handle);
2386 else if (root != NULL && object->resident_page_count == 0)
2387 vdrop(object->handle);
2388 }
2389}
2390
2391/*
2392 * vm_page_dontneed
2393 *
2394 * Cache, deactivate, or do nothing as appropriate. This routine
2395 * is typically used by madvise() MADV_DONTNEED.
2396 *
2397 * Generally speaking we want to move the page into the cache so
2398 * it gets reused quickly. However, this can result in a silly syndrome
2399 * due to the page recycling too quickly. Small objects will not be
2400 * fully cached. On the otherhand, if we move the page to the inactive
2401 * queue we wind up with a problem whereby very large objects
2402 * unnecessarily blow away our inactive and cache queues.
2403 *
2404 * The solution is to move the pages based on a fixed weighting. We
2405 * either leave them alone, deactivate them, or move them to the cache,
2406 * where moving them to the cache has the highest weighting.
2407 * By forcing some pages into other queues we eventually force the
2408 * system to balance the queues, potentially recovering other unrelated
2409 * space from active. The idea is to not force this to happen too
2410 * often.
2411 */
2412void
2413vm_page_dontneed(vm_page_t m)
2414{
2415 int dnw;
2416 int head;
2417
2418 vm_page_lock_assert(m, MA_OWNED);
2419 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2420 dnw = PCPU_GET(dnweight);
2421 PCPU_INC(dnweight);
2422
2423 /*
2424 * Occasionally leave the page alone.
2425 */
2426 if ((dnw & 0x01F0) == 0 || m->queue == PQ_INACTIVE) {
2427 if (m->act_count >= ACT_INIT)
2428 --m->act_count;
2429 return;
2430 }
2431
2432 /*
2433 * Clear any references to the page. Otherwise, the page daemon will
2434 * immediately reactivate the page.
2435 *
2436 * Perform the pmap_clear_reference() first. Otherwise, a concurrent
2437 * pmap operation, such as pmap_remove(), could clear a reference in
2438 * the pmap and set PGA_REFERENCED on the page before the
2439 * pmap_clear_reference() had completed. Consequently, the page would
2440 * appear referenced based upon an old reference that occurred before
2441 * this function ran.
2442 */
2443 pmap_clear_reference(m);
2444 vm_page_aflag_clear(m, PGA_REFERENCED);
2445
2446 if (m->dirty == 0 && pmap_is_modified(m))
2447 vm_page_dirty(m);
2448
2449 if (m->dirty || (dnw & 0x0070) == 0) {
2450 /*
2451 * Deactivate the page 3 times out of 32.
2452 */
2453 head = 0;
2454 } else {
2455 /*
2456 * Cache the page 28 times out of every 32. Note that
2457 * the page is deactivated instead of cached, but placed
2458 * at the head of the queue instead of the tail.
2459 */
2460 head = 1;
2461 }
2462 _vm_page_deactivate(m, head);
2463}
2464
2465/*
2466 * Grab a page, waiting until we are waken up due to the page
2467 * changing state. We keep on waiting, if the page continues
2468 * to be in the object. If the page doesn't exist, first allocate it
2469 * and then conditionally zero it.
2470 *
2471 * The caller must always specify the VM_ALLOC_RETRY flag. This is intended
2472 * to facilitate its eventual removal.
2473 *
2474 * This routine may block.
2475 */
2476vm_page_t
2477vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
2478{
2479 vm_page_t m;
2480
2481 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2482 KASSERT((allocflags & VM_ALLOC_RETRY) != 0,
2483 ("vm_page_grab: VM_ALLOC_RETRY is required"));
2484retrylookup:
2485 if ((m = vm_page_lookup(object, pindex)) != NULL) {
2486 if ((m->oflags & VPO_BUSY) != 0 ||
2487 ((allocflags & VM_ALLOC_IGN_SBUSY) == 0 && m->busy != 0)) {
2488 /*
2489 * Reference the page before unlocking and
2490 * sleeping so that the page daemon is less
2491 * likely to reclaim it.
2492 */
2493 vm_page_aflag_set(m, PGA_REFERENCED);
2494 vm_page_sleep(m, "pgrbwt");
2495 goto retrylookup;
2496 } else {
2497 if ((allocflags & VM_ALLOC_WIRED) != 0) {
2498 vm_page_lock(m);
2499 vm_page_wire(m);
2500 vm_page_unlock(m);
2501 }
2502 if ((allocflags & VM_ALLOC_NOBUSY) == 0)
2503 vm_page_busy(m);
2504 return (m);
2505 }
2506 }
2507 m = vm_page_alloc(object, pindex, allocflags & ~(VM_ALLOC_RETRY |
2508 VM_ALLOC_IGN_SBUSY));
2509 if (m == NULL) {
2510 VM_OBJECT_UNLOCK(object);
2511 VM_WAIT;
2512 VM_OBJECT_LOCK(object);
2513 goto retrylookup;
2514 } else if (m->valid != 0)
2515 return (m);
2516 if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
2517 pmap_zero_page(m);
2518 return (m);
2519}
2520
2521/*
2522 * Mapping function for valid bits or for dirty bits in
2523 * a page. May not block.
2524 *
2525 * Inputs are required to range within a page.
2526 */
2527vm_page_bits_t
2528vm_page_bits(int base, int size)
2529{
2530 int first_bit;
2531 int last_bit;
2532
2533 KASSERT(
2534 base + size <= PAGE_SIZE,
2535 ("vm_page_bits: illegal base/size %d/%d", base, size)
2536 );
2537
2538 if (size == 0) /* handle degenerate case */
2539 return (0);
2540
2541 first_bit = base >> DEV_BSHIFT;
2542 last_bit = (base + size - 1) >> DEV_BSHIFT;
2543
2544 return (((vm_page_bits_t)2 << last_bit) -
2545 ((vm_page_bits_t)1 << first_bit));
2546}
2547
2548/*
2549 * vm_page_set_valid_range:
2550 *
2551 * Sets portions of a page valid. The arguments are expected
2552 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
2553 * of any partial chunks touched by the range. The invalid portion of
2554 * such chunks will be zeroed.
2555 *
2556 * (base + size) must be less then or equal to PAGE_SIZE.
2557 */
2558void
2559vm_page_set_valid_range(vm_page_t m, int base, int size)
2560{
2561 int endoff, frag;
2562
2563 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2564 if (size == 0) /* handle degenerate case */
2565 return;
2566
2567 /*
2568 * If the base is not DEV_BSIZE aligned and the valid
2569 * bit is clear, we have to zero out a portion of the
2570 * first block.
2571 */
2572 if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
2573 (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
2574 pmap_zero_page_area(m, frag, base - frag);
2575
2576 /*
2577 * If the ending offset is not DEV_BSIZE aligned and the
2578 * valid bit is clear, we have to zero out a portion of
2579 * the last block.
2580 */
2581 endoff = base + size;
2582 if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
2583 (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
2584 pmap_zero_page_area(m, endoff,
2585 DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
2586
2587 /*
2588 * Assert that no previously invalid block that is now being validated
2589 * is already dirty.
2590 */
2591 KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
2592 ("vm_page_set_valid_range: page %p is dirty", m));
2593
2594 /*
2595 * Set valid bits inclusive of any overlap.
2596 */
2597 m->valid |= vm_page_bits(base, size);
2598}
2599
2600/*
2601 * Clear the given bits from the specified page's dirty field.
2602 */
2603static __inline void
2604vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits)
2605{
2606 uintptr_t addr;
2607#if PAGE_SIZE < 16384
2608 int shift;
2609#endif
2610
2611 /*
2612 * If the object is locked and the page is neither VPO_BUSY nor
2613 * PGA_WRITEABLE, then the page's dirty field cannot possibly be
2614 * set by a concurrent pmap operation.
2615 */
2616 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2617 if ((m->oflags & VPO_BUSY) == 0 && (m->aflags & PGA_WRITEABLE) == 0)
2618 m->dirty &= ~pagebits;
2619 else {
2620 /*
2621 * The pmap layer can call vm_page_dirty() without
2622 * holding a distinguished lock. The combination of
2623 * the object's lock and an atomic operation suffice
2624 * to guarantee consistency of the page dirty field.
2625 *
2626 * For PAGE_SIZE == 32768 case, compiler already
2627 * properly aligns the dirty field, so no forcible
2628 * alignment is needed. Only require existence of
2629 * atomic_clear_64 when page size is 32768.
2630 */
2631 addr = (uintptr_t)&m->dirty;
2632#if PAGE_SIZE == 32768
2633 atomic_clear_64((uint64_t *)addr, pagebits);
2634#elif PAGE_SIZE == 16384
2635 atomic_clear_32((uint32_t *)addr, pagebits);
2636#else /* PAGE_SIZE <= 8192 */
2637 /*
2638 * Use a trick to perform a 32-bit atomic on the
2639 * containing aligned word, to not depend on the existence
2640 * of atomic_clear_{8, 16}.
2641 */
2642 shift = addr & (sizeof(uint32_t) - 1);
2643#if BYTE_ORDER == BIG_ENDIAN
2644 shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY;
2645#else
2646 shift *= NBBY;
2647#endif
2648 addr &= ~(sizeof(uint32_t) - 1);
2649 atomic_clear_32((uint32_t *)addr, pagebits << shift);
2650#endif /* PAGE_SIZE */
2651 }
2652}
2653
2654/*
2655 * vm_page_set_validclean:
2656 *
2657 * Sets portions of a page valid and clean. The arguments are expected
2658 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
2659 * of any partial chunks touched by the range. The invalid portion of
2660 * such chunks will be zero'd.
2661 *
2662 * This routine may not block.
2663 *
2664 * (base + size) must be less then or equal to PAGE_SIZE.
2665 */
2666void
2667vm_page_set_validclean(vm_page_t m, int base, int size)
2668{
2669 vm_page_bits_t oldvalid, pagebits;
2670 int endoff, frag;
2671
2672 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2673 if (size == 0) /* handle degenerate case */
2674 return;
2675
2676 /*
2677 * If the base is not DEV_BSIZE aligned and the valid
2678 * bit is clear, we have to zero out a portion of the
2679 * first block.
2680 */
2681 if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
2682 (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
2683 pmap_zero_page_area(m, frag, base - frag);
2684
2685 /*
2686 * If the ending offset is not DEV_BSIZE aligned and the
2687 * valid bit is clear, we have to zero out a portion of
2688 * the last block.
2689 */
2690 endoff = base + size;
2691 if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
2692 (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
2693 pmap_zero_page_area(m, endoff,
2694 DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
2695
2696 /*
2697 * Set valid, clear dirty bits. If validating the entire
2698 * page we can safely clear the pmap modify bit. We also
2699 * use this opportunity to clear the VPO_NOSYNC flag. If a process
2700 * takes a write fault on a MAP_NOSYNC memory area the flag will
2701 * be set again.
2702 *
2703 * We set valid bits inclusive of any overlap, but we can only
2704 * clear dirty bits for DEV_BSIZE chunks that are fully within
2705 * the range.
2706 */
2707 oldvalid = m->valid;
2708 pagebits = vm_page_bits(base, size);
2709 m->valid |= pagebits;
2710#if 0 /* NOT YET */
2711 if ((frag = base & (DEV_BSIZE - 1)) != 0) {
2712 frag = DEV_BSIZE - frag;
2713 base += frag;
2714 size -= frag;
2715 if (size < 0)
2716 size = 0;
2717 }
2718 pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
2719#endif
2720 if (base == 0 && size == PAGE_SIZE) {
2721 /*
2722 * The page can only be modified within the pmap if it is
2723 * mapped, and it can only be mapped if it was previously
2724 * fully valid.
2725 */
2726 if (oldvalid == VM_PAGE_BITS_ALL)
2727 /*
2728 * Perform the pmap_clear_modify() first. Otherwise,
2729 * a concurrent pmap operation, such as
2730 * pmap_protect(), could clear a modification in the
2731 * pmap and set the dirty field on the page before
2732 * pmap_clear_modify() had begun and after the dirty
2733 * field was cleared here.
2734 */
2735 pmap_clear_modify(m);
2736 m->dirty = 0;
2737 m->oflags &= ~VPO_NOSYNC;
2738 } else if (oldvalid != VM_PAGE_BITS_ALL)
2739 m->dirty &= ~pagebits;
2740 else
2741 vm_page_clear_dirty_mask(m, pagebits);
2742}
2743
2744void
2745vm_page_clear_dirty(vm_page_t m, int base, int size)
2746{
2747
2748 vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
2749}
2750
2751/*
2752 * vm_page_set_invalid:
2753 *
2754 * Invalidates DEV_BSIZE'd chunks within a page. Both the
2755 * valid and dirty bits for the affected areas are cleared.
2756 *
2757 * May not block.
2758 */
2759void
2760vm_page_set_invalid(vm_page_t m, int base, int size)
2761{
2762 vm_page_bits_t bits;
2763
2764 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2765 KASSERT((m->oflags & VPO_BUSY) == 0,
2766 ("vm_page_set_invalid: page %p is busy", m));
2767 bits = vm_page_bits(base, size);
2768 if (m->valid == VM_PAGE_BITS_ALL && bits != 0)
2769 pmap_remove_all(m);
2770 KASSERT(!pmap_page_is_mapped(m),
2771 ("vm_page_set_invalid: page %p is mapped", m));
2772 m->valid &= ~bits;
2773 m->dirty &= ~bits;
2774}
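/*
 * Sketch of a plausible use (illustrative only; "newsize" is a
 * hypothetical byte offset within the page at which valid data now
 * ends, e.g. after a file truncation):
 *
 *	VM_OBJECT_LOCK(object);
 *	vm_page_set_invalid(m, newsize, PAGE_SIZE - newsize);
 *	VM_OBJECT_UNLOCK(object);
 *
 * Every DEV_BSIZE chunk touched by [newsize, PAGE_SIZE) has its valid
 * and dirty bits cleared, and if the page was fully valid (and hence
 * possibly mapped) all of its mappings are removed first.
 */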
2775
2776/*
2777 * vm_page_zero_invalid()
2778 *
2779 * The kernel assumes that the invalid portions of a page contain
2780 * garbage, but such pages can be mapped into memory by user code.
2781 * When this occurs, we must zero out the non-valid portions of the
2782 * page so user code sees what it expects.
2783 *
2784 * Pages are most often semi-valid when the end of a file is mapped
2785 * into memory and the file's size is not page aligned.
2786 */
2787void
2788vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
2789{
2790 int b;
2791 int i;
2792
2793 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2794 /*
2795 * Scan the valid bits looking for invalid sections that
2796 * must be zeroed. Invalid sub-DEV_BSIZE'd areas (where the
2797 * valid bit may be set) have already been zeroed by
2798 * vm_page_set_validclean().
2799 */
2800 for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
2801 if (i == (PAGE_SIZE / DEV_BSIZE) ||
2802 (m->valid & ((vm_page_bits_t)1 << i))) {
2803 if (i > b) {
2804 pmap_zero_page_area(m,
2805 b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
2806 }
2807 b = i + 1;
2808 }
2809 }
2810
2811 /*
2812 * setvalid is TRUE when we can safely set the zeroed areas
2813 * as being valid. We can do this if there are no cache consistency
2814 * issues, e.g., it is OK to do so with UFS, but not with NFS.
2815 */
2816 if (setvalid)
2817 m->valid = VM_PAGE_BITS_ALL;
2818}
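/*
 * Illustrative walk-through of the scan above (the numbers assume
 * PAGE_SIZE is 4096, so there are 8 DEV_BSIZE blocks per page): if
 * only blocks 0-3 are valid (m->valid == 0x0f), the loop keeps
 * advancing "b" past the valid blocks, and when "i" reaches
 * PAGE_SIZE / DEV_BSIZE it emits a single call
 *
 *	pmap_zero_page_area(m, 4 << DEV_BSHIFT, 4 << DEV_BSHIFT);
 *
 * zeroing bytes [2048, 4096). Runs of invalid blocks are thus
 * coalesced into one zeroing call each.
 */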
2819
2820/*
2821 * vm_page_is_valid:
2822 *
2823 * Is (partial) page valid? Note that in the degenerate case where
2824 * size == 0, this returns FALSE if the page is entirely invalid
2825 * and TRUE otherwise.
2826 *
2827 * May not block.
2828 */
2829int
2830vm_page_is_valid(vm_page_t m, int base, int size)
2831{
2832 vm_page_bits_t bits;
2833
2834 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2835 bits = vm_page_bits(base, size);
2836 if (m->valid && ((m->valid & bits) == bits))
2837 return 1;
2838 else
2839 return 0;
2840}
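/*
 * Note on the degenerate case (illustrative): vm_page_bits() returns 0
 * for a zero size, so the test above reduces to "m->valid != 0"; a
 * size of 0 therefore asks merely whether any part of the page is
 * valid.
 */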
2841
2842/*
2843 * update dirty bits from pmap/mmu. May not block.
2844 */
2845void
2846vm_page_test_dirty(vm_page_t m)
2847{
2848
2849 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2850 if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
2851 vm_page_dirty(m);
2852}
2853
2854void
2855vm_page_lock_KBI(vm_page_t m, const char *file, int line)
2856{
2857
2858 mtx_lock_flags_(vm_page_lockptr(m), 0, file, line);
2859}
2860
2861void
2862vm_page_unlock_KBI(vm_page_t m, const char *file, int line)
2863{
2864
2865 mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line);
2866}
2867
2868int
2869vm_page_trylock_KBI(vm_page_t m, const char *file, int line)
2870{
2871
2872 return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line));
2873}
2874
2875#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
2876void
2877vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
2878{
2879
2880 mtx_assert_(vm_page_lockptr(m), a, file, line);
2881}
2882#endif
2883
2884int so_zerocp_fullpage = 0;
2885
2886/*
2887 * Replace the given page with a copy. The copied page assumes
2888 * the portion of the given page's "wire_count" that is not the
2889 * responsibility of this copy-on-write mechanism.
2890 *
2891 * The object containing the given page must have a non-zero
2892 * paging-in-progress count and be locked.
2893 */
2894void
2895vm_page_cowfault(vm_page_t m)
2896{
2897 vm_page_t mnew;
2898 vm_object_t object;
2899 vm_pindex_t pindex;
2900
2901 mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);
2902 vm_page_lock_assert(m, MA_OWNED);
2903 object = m->object;
2904 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2905 KASSERT(object->paging_in_progress != 0,
2906 ("vm_page_cowfault: object %p's paging-in-progress count is zero.",
2907 object));
2908 pindex = m->pindex;
2909
2910 retry_alloc:
2911 pmap_remove_all(m);
2912 vm_page_remove(m);
2913 mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
2914 if (mnew == NULL) {
2915 vm_page_insert(m, object, pindex);
2916 vm_page_unlock(m);
2917 VM_OBJECT_UNLOCK(object);
2918 VM_WAIT;
2919 VM_OBJECT_LOCK(object);
2920 if (m == vm_page_lookup(object, pindex)) {
2921 vm_page_lock(m);
2922 goto retry_alloc;
2923 } else {
2924 /*
2925 * Page disappeared during the wait.
2926 */
2927 return;
2928 }
2929 }
2930
2931 if (m->cow == 0) {
2932 /*
2933 * Check to see whether we raced with a transmit completion
2934 * while waiting to allocate a page. If so, put things back
2935 * the way they were.
2936 */
2937 vm_page_unlock(m);
2938 vm_page_lock(mnew);
2939 vm_page_free(mnew);
2940 vm_page_unlock(mnew);
2941 vm_page_insert(m, object, pindex);
2942 } else { /* clear COW & copy page */
2943 if (!so_zerocp_fullpage)
2944 pmap_copy_page(m, mnew);
2945 mnew->valid = VM_PAGE_BITS_ALL;
2946 vm_page_dirty(mnew);
2947 mnew->wire_count = m->wire_count - m->cow;
2948 m->wire_count = m->cow;
2949 vm_page_unlock(m);
2950 }
2951}
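/*
 * Illustrative accounting example for the copy path above: if the
 * original page was wired three times in total and two of those
 * references belong to in-flight zero-copy sends (m->cow == 2), the
 * replacement page ends up with mnew->wire_count == 1 (the wirings
 * that are not the COW mechanism's responsibility) while the original
 * keeps m->wire_count == 2 until the in-flight sends complete.
 */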
2952
2953void
2954vm_page_cowclear(vm_page_t m)
2955{
2956
2957 vm_page_lock_assert(m, MA_OWNED);
2958 if (m->cow) {
2959 m->cow--;
2960 /*
2961 * let vm_fault add back write permission lazily
2962 */
2963 }
2964 /*
2965 * sf_buf_free() will free the page, so we needn't do it here
2966 */
2967}
2968
2969int
2970vm_page_cowsetup(vm_page_t m)
2971{
2972
2973 vm_page_lock_assert(m, MA_OWNED);
2974 if ((m->flags & PG_FICTITIOUS) != 0 ||
2975 (m->oflags & VPO_UNMANAGED) != 0 ||
2976 m->cow == USHRT_MAX - 1 || !VM_OBJECT_TRYLOCK(m->object))
2977 return (EBUSY);
2978 m->cow++;
2979 pmap_remove_write(m);
2980 VM_OBJECT_UNLOCK(m->object);
2981 return (0);
2982}
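/*
 * Rough lifecycle sketch for the three COW routines above
 * (illustrative; the zero-copy socket send path is the expected
 * caller):
 *
 *	vm_page_cowsetup(m)	- at transmit time: bump m->cow and
 *				  write-protect the page's mappings.
 *	vm_page_cowfault(m)	- if the process writes to the page
 *				  before transmission finishes, give it
 *				  a private copy and leave the original
 *				  to the pending sends.
 *	vm_page_cowclear(m)	- at transmit completion: drop one COW
 *				  reference; write access comes back
 *				  lazily via vm_fault().
 */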
2983
2984#ifdef INVARIANTS
2985void
2986vm_page_object_lock_assert(vm_page_t m)
2987{
2988
2989 /*
2990 * Certain of the page's fields may only be modified by the
2991 * holder of the containing object's lock or the setter of the
2992 * page's VPO_BUSY flag. Unfortunately, the setter of the
2993 * VPO_BUSY flag is not recorded, and thus cannot be checked
2994 * here.
2995 */
2996 if (m->object != NULL && (m->oflags & VPO_BUSY) == 0)
2997 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2998}
2999#endif
3000
3001#include "opt_ddb.h"
3002#ifdef DDB
3003#include <sys/kernel.h>
3004
3005#include <ddb/ddb.h>
3006
3007DB_SHOW_COMMAND(page, vm_page_print_page_info)
3008{
3009 db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
3010 db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
3011 db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
3012 db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
3013 db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
3014 db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
3015 db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
3016 db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
3017 db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
3018 db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
3019}
3020
3021DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
3022{
3023
3024 db_printf("PQ_FREE:");
3025 db_printf(" %d", cnt.v_free_count);
3026 db_printf("\n");
3027
3028 db_printf("PQ_CACHE:");
3029 db_printf(" %d", cnt.v_cache_count);
3030 db_printf("\n");
3031
3032 db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
3033 *vm_page_queues[PQ_ACTIVE].cnt,
3034 *vm_page_queues[PQ_INACTIVE].cnt);
3035}
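/*
 * Example usage from the in-kernel debugger prompt (illustrative; the
 * counts printed are whatever the running system reports):
 *
 *	db> show page
 *	cnt.v_free_count: ...
 *	db> show pageq
 *	PQ_FREE: ...
 */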
3036#endif /* DDB */