/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
 */

/*-
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 *	GENERAL RULES ON VM_PAGE MANIPULATION
 *
 *	- a pageq mutex is required when adding or removing a page from a
 *	  page queue (vm_page_queue[]), regardless of other mutexes or the
 *	  busy state of a page.
 *
 *	- a hash chain mutex is required when associating or disassociating
 *	  a page from the VM PAGE CACHE hash table (vm_page_buckets),
 *	  regardless of other mutexes or the busy state of a page.
 *
 *	- either a hash chain mutex OR a busied page is required in order
 *	  to modify the page flags.  A hash chain mutex must be obtained in
 *	  order to busy a page.  A page's flags cannot be modified by a
 *	  hash chain mutex if the page is marked busy.
 *
 *	- The object memq mutex is held when inserting or removing
 *	  pages from an object (vm_page_insert() or vm_page_remove()).  This
 *	  is different from the object's main mutex.
 *
 *	Generally speaking, you have to be aware of side effects when running
 *	vm_page ops.  A vm_page_lookup() will return with the hash chain
 *	locked, whether it was able to lookup the page or not.  vm_page_free(),
 *	vm_page_cache(), vm_page_activate(), and a number of other routines
 *	will release the hash chain mutex for you.  Intermediate manipulation
 *	routines such as vm_page_flag_set() expect the hash chain to be held
 *	on entry and the hash chain will remain held on return.
 *
 *	pageq scanning can only occur with the pageq in question locked.
 *	We have a known bottleneck with the active queue, but the cache
 *	and free queues are actually arrays already.
 */

/*
 *	Resident memory management module.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/vm/vm_page.c 224746 2011-08-09 21:01:36Z kib $");

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_reserv.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <vm/uma_int.h>

#include <machine/md_var.h>

/*
 *	Associated with page of user-allocatable memory is a
 *	page structure.
 */

struct vpgqueues vm_page_queues[PQ_COUNT];
struct vpglocks vm_page_queue_lock;
struct vpglocks vm_page_queue_free_lock;

struct vpglocks	pa_lock[PA_LOCK_COUNT];

vm_page_t vm_page_array = 0;
int vm_page_array_size = 0;
long first_page = 0;
int vm_page_zero_count = 0;

static int boot_pages = UMA_BOOT_PAGES;
TUNABLE_INT("vm.boot_pages", &boot_pages);
SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0,
	"number of pages allocated for bootstrapping the VM system");

static int pa_tryrelock_restart;
SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
    &pa_tryrelock_restart, 0, "Number of tryrelock restarts");

static uma_zone_t fakepg_zone;

static void vm_page_clear_dirty_mask(vm_page_t m, int pagebits);
static void vm_page_queue_remove(int queue, vm_page_t m);
static void vm_page_enqueue(int queue, vm_page_t m);
static void vm_page_init_fakepg(void *dummy);

SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);

static void
vm_page_init_fakepg(void *dummy)
{

	fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
}

/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
#if PAGE_SIZE == 32768
#ifdef CTASSERT
CTASSERT(sizeof(u_long) >= 8);
#endif
#endif

/*
 * Try to acquire a physical address lock while a pmap is locked.  If we
 * fail to trylock we unlock and lock the pmap directly and cache the
 * locked pa in *locked.  The caller should then restart their loop in case
 * the virtual to physical mapping has changed.
 */
int
vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked)
{
	vm_paddr_t lockpa;

	lockpa = *locked;
	*locked = pa;
	if (lockpa) {
		PA_LOCK_ASSERT(lockpa, MA_OWNED);
		if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa))
			return (0);
		PA_UNLOCK(lockpa);
	}
	if (PA_TRYLOCK(pa))
		return (0);
	PMAP_UNLOCK(pmap);
	atomic_add_int(&pa_tryrelock_restart, 1);
	PA_LOCK(pa);
	PMAP_LOCK(pmap);
	return (EAGAIN);
}
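
/*
 * Illustrative sketch (not part of the original source): pmap code that
 * walks virtual-to-physical mappings typically calls the function above in
 * a retry loop, restarting whenever the pmap lock had to be dropped:
 *
 *	vm_paddr_t locked_pa = 0;
 * retry:
 *	pa = ...;				// look the mapping up again
 *	if (vm_page_pa_tryrelock(pmap, pa, &locked_pa))
 *		goto retry;			// mapping may have changed
 *	...					// pa lock and pmap lock held
 */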

/*
 *	vm_set_page_size:
 *
 *	Sets the page size, perhaps based upon the memory
 *	size.  Must be called before any use of page-size
 *	dependent functions.
 */
void
vm_set_page_size(void)
{
	if (cnt.v_page_size == 0)
		cnt.v_page_size = PAGE_SIZE;
	if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0)
		panic("vm_set_page_size: page size not a power of two");
}

/*
 *	vm_page_blacklist_lookup:
 *
 *	See if a physical address in this page has been listed
 *	in the blacklist tunable.  Entries in the tunable are
 *	separated by spaces or commas.  If an invalid integer is
 *	encountered then the rest of the string is skipped.
 */
static int
vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
{
	vm_paddr_t bad;
	char *cp, *pos;

	for (pos = list; *pos != '\0'; pos = cp) {
		bad = strtoq(pos, &cp, 0);
		if (*cp != '\0') {
			if (*cp == ' ' || *cp == ',') {
				cp++;
				if (cp == pos)
					continue;
			} else
				break;
		}
		if (pa == trunc_page(bad))
			return (1);
	}
	return (0);
}

/*
 *	vm_page_startup:
 *
 *	Initializes the resident memory module.
 *
 *	Allocates memory for the page cells, and
 *	for the object/offset-to-page hash table headers.
 *	Each page cell is initialized and placed on the free list.
 */
vm_offset_t
vm_page_startup(vm_offset_t vaddr)
{
	vm_offset_t mapped;
	vm_paddr_t page_range;
	vm_paddr_t new_end;
	int i;
	vm_paddr_t pa;
	vm_paddr_t last_pa;
	char *list;

	/* the biggest memory array is the second group of pages */
	vm_paddr_t end;
	vm_paddr_t biggestsize;
	vm_paddr_t low_water, high_water;
	int biggestone;

	biggestsize = 0;
	biggestone = 0;
	vaddr = round_page(vaddr);

	for (i = 0; phys_avail[i + 1]; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}

	low_water = phys_avail[0];
	high_water = phys_avail[1];

	for (i = 0; phys_avail[i + 1]; i += 2) {
		vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];

		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
		if (phys_avail[i] < low_water)
			low_water = phys_avail[i];
		if (phys_avail[i + 1] > high_water)
			high_water = phys_avail[i + 1];
	}

#ifdef XEN
	low_water = 0;
#endif

	end = phys_avail[biggestone+1];

	/*
	 * Initialize the locks.
	 */
	mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF |
	    MTX_RECURSE);
	mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL,
	    MTX_DEF);

	/* Setup page locks. */
	for (i = 0; i < PA_LOCK_COUNT; i++)
		mtx_init(&pa_lock[i].data, "page lock", NULL, MTX_DEF);

	/*
	 * Initialize the queue headers for the hold queue, the active queue,
	 * and the inactive queue.
	 */
	for (i = 0; i < PQ_COUNT; i++)
		TAILQ_INIT(&vm_page_queues[i].pl);
	vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
	vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
	vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count;

	/*
	 * Allocate memory for use when boot strapping the kernel memory
	 * allocator.
	 */
	new_end = end - (boot_pages * UMA_SLAB_SIZE);
	new_end = trunc_page(new_end);
	mapped = pmap_map(&vaddr, new_end, end,
	    VM_PROT_READ | VM_PROT_WRITE);
	bzero((void *)mapped, end - new_end);
	uma_startup((void *)mapped, boot_pages);

#if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \
    defined(__mips__)
	/*
	 * Allocate a bitmap to indicate that a random physical page
	 * needs to be included in a minidump.
	 *
	 * The amd64 port needs this to indicate which direct map pages
	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
	 *
	 * However, i386 still needs this workspace internally within the
	 * minidump code.  In theory, they are not needed on i386, but are
	 * included should the sf_buf code decide to use them.
	 */
	last_pa = 0;
	for (i = 0; dump_avail[i + 1] != 0; i += 2)
		if (dump_avail[i + 1] > last_pa)
			last_pa = dump_avail[i + 1];
	page_range = last_pa / PAGE_SIZE;
	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
	new_end -= vm_page_dump_size;
	vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
	    new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
	bzero((void *)vm_page_dump, vm_page_dump_size);
#endif
#ifdef __amd64__
	/*
	 * Request that the physical pages underlying the message buffer be
	 * included in a crash dump.  Since the message buffer is accessed
	 * through the direct map, they are not automatically included.
	 */
	pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
	last_pa = pa + round_page(msgbufsize);
	while (pa < last_pa) {
		dump_add_page(pa);
		pa += PAGE_SIZE;
	}
#endif
	/*
	 * Compute the number of pages of memory that will be available for
	 * use (taking into account the overhead of a page structure per
	 * page).
	 */
	first_page = low_water / PAGE_SIZE;
#ifdef VM_PHYSSEG_SPARSE
	page_range = 0;
	for (i = 0; phys_avail[i + 1] != 0; i += 2)
		page_range += atop(phys_avail[i + 1] - phys_avail[i]);
#elif defined(VM_PHYSSEG_DENSE)
	page_range = high_water / PAGE_SIZE - first_page;
#else
#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
#endif
	end = new_end;
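
	/*
	 * Illustrative note (not part of the original source): with 4 KB
	 * pages, 4 GB of managed RAM gives a page_range of roughly one
	 * million entries, so the vm_page_array allocated below costs
	 * page_range * sizeof(struct vm_page) bytes -- a few percent of
	 * physical memory.
	 */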

	/*
	 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
	 */
	vaddr += PAGE_SIZE;

	/*
	 * Initialize the mem entry structures now, and put them in the free
	 * queue.
	 */
	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
	mapped = pmap_map(&vaddr, new_end, end,
	    VM_PROT_READ | VM_PROT_WRITE);
	vm_page_array = (vm_page_t) mapped;
#if VM_NRESERVLEVEL > 0
	/*
	 * Allocate memory for the reservation management system's data
	 * structures.
	 */
	new_end = vm_reserv_startup(&vaddr, new_end, high_water);
#endif
#if defined(__amd64__) || defined(__mips__)
	/*
	 * pmap_map on amd64 and mips can come out of the direct-map, not kvm
	 * like i386, so the pages must be tracked for a crashdump to include
	 * this data.  This includes the vm_page_array and the early UMA
	 * bootstrap pages.
	 */
	for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
		dump_add_page(pa);
#endif
	phys_avail[biggestone + 1] = new_end;

	/*
	 * Clear all of the page structures
	 */
	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
	for (i = 0; i < page_range; i++)
		vm_page_array[i].order = VM_NFREEORDER;
	vm_page_array_size = page_range;

	/*
	 * Initialize the physical memory allocator.
	 */
	vm_phys_init();

	/*
	 * Add every available physical page that is not blacklisted to
	 * the free lists.
	 */
	cnt.v_page_count = 0;
	cnt.v_free_count = 0;
	list = getenv("vm.blacklist");
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		pa = phys_avail[i];
		last_pa = phys_avail[i + 1];
		while (pa < last_pa) {
			if (list != NULL &&
			    vm_page_blacklist_lookup(list, pa))
				printf("Skipping page with pa 0x%jx\n",
				    (uintmax_t)pa);
			else
				vm_phys_add_page(pa);
			pa += PAGE_SIZE;
		}
	}
	freeenv(list);
#if VM_NRESERVLEVEL > 0
	/*
	 * Initialize the reservation management system.
	 */
	vm_reserv_init();
#endif
	return (vaddr);
}
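
/*
 * Illustrative note (not part of the original source): the "vm.blacklist"
 * tunable consumed above is a list of physical addresses separated by
 * spaces or commas, for example in loader.conf (addresses hypothetical):
 *
 *	vm.blacklist="0x12345000,0x7fe9000"
 *
 * Any page whose address matches an entry after trunc_page() is skipped
 * and never added to the free lists.
 */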

void
vm_page_flag_set(vm_page_t m, unsigned short bits)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	/*
	 * The PG_WRITEABLE flag can only be set if the page is managed and
	 * VPO_BUSY.  Currently, this flag is only set by pmap_enter().
	 */
	KASSERT((bits & PG_WRITEABLE) == 0 ||
	    (m->oflags & (VPO_UNMANAGED | VPO_BUSY)) == VPO_BUSY,
	    ("PG_WRITEABLE and !VPO_BUSY"));
	m->flags |= bits;
}

void
vm_page_flag_clear(vm_page_t m, unsigned short bits)
{

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	/*
	 * The PG_REFERENCED flag can only be cleared if the object
	 * containing the page is locked.
	 */
	KASSERT((bits & PG_REFERENCED) == 0 || VM_OBJECT_LOCKED(m->object),
	    ("PG_REFERENCED and !VM_OBJECT_LOCKED"));
	m->flags &= ~bits;
}

void
vm_page_busy(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	KASSERT((m->oflags & VPO_BUSY) == 0,
	    ("vm_page_busy: page already busy!!!"));
	m->oflags |= VPO_BUSY;
}

/*
 *	vm_page_flash:
 *
 *	wakeup anyone waiting for the page.
 */
void
vm_page_flash(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if (m->oflags & VPO_WANTED) {
		m->oflags &= ~VPO_WANTED;
		wakeup(m);
	}
}

/*
 *	vm_page_wakeup:
 *
 *	clear the VPO_BUSY flag and wakeup anyone waiting for the
 *	page.
 *
 */
void
vm_page_wakeup(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!"));
	m->oflags &= ~VPO_BUSY;
	vm_page_flash(m);
}

void
vm_page_io_start(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	m->busy++;
}

void
vm_page_io_finish(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	KASSERT(m->busy > 0, ("vm_page_io_finish: page %p is not busy", m));
	m->busy--;
	if (m->busy == 0)
		vm_page_flash(m);
}

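/*
 * Illustrative sketch (not part of the original source): callers such as
 * the buffer cache bracket page I/O with the busy counter while holding
 * the object lock, and vm_page_io_finish() wakes any thread sleeping in
 * vm_page_sleep() once the count returns to zero:
 *
 *	VM_OBJECT_LOCK(object);
 *	vm_page_io_start(m);		// I/O now in progress on the page
 *	VM_OBJECT_UNLOCK(object);
 *	... perform the I/O ...
 *	VM_OBJECT_LOCK(object);
 *	vm_page_io_finish(m);		// wakes sleepers when busy hits 0
 *	VM_OBJECT_UNLOCK(object);
 */
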
/*
 *	Keep page from being freed by the page daemon
 *	much of the same effect as wiring, except much lower
 *	overhead and should be used only for *very* temporary
 *	holding ("wiring").
 */
void
vm_page_hold(vm_page_t mem)
{

	vm_page_lock_assert(mem, MA_OWNED);
	mem->hold_count++;
}

void
vm_page_unhold(vm_page_t mem)
{

	vm_page_lock_assert(mem, MA_OWNED);
	--mem->hold_count;
	KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
	if (mem->hold_count == 0 && mem->queue == PQ_HOLD)
		vm_page_free_toq(mem);
}

/*
 *	vm_page_unhold_pages:
 *
 *	Unhold each of the pages that is referenced by the given array.
 */
void
vm_page_unhold_pages(vm_page_t *ma, int count)
{
	struct mtx *mtx, *new_mtx;

	mtx = NULL;
	for (; count != 0; count--) {
		/*
		 * Avoid releasing and reacquiring the same page lock.
		 */
		new_mtx = vm_page_lockptr(*ma);
		if (mtx != new_mtx) {
			if (mtx != NULL)
				mtx_unlock(mtx);
			mtx = new_mtx;
			mtx_lock(mtx);
		}
		vm_page_unhold(*ma);
		ma++;
	}
	if (mtx != NULL)
		mtx_unlock(mtx);
}

/*
 *	vm_page_getfake:
 *
 *	Create a fictitious page with the specified physical address and
 *	memory attribute.  The memory attribute is the only machine-
 *	dependent aspect of a fictitious page that must be initialized.
 */
vm_page_t
vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
{
	vm_page_t m;

	m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
	m->phys_addr = paddr;
	m->queue = PQ_NONE;
	/* Fictitious pages don't use "segind". */
	m->flags = PG_FICTITIOUS;
	/* Fictitious pages don't use "order" or "pool". */
	m->oflags = VPO_BUSY | VPO_UNMANAGED;
	m->wire_count = 1;
	pmap_page_set_memattr(m, memattr);
	return (m);
}

/*
 *	vm_page_putfake:
 *
 *	Release a fictitious page.
 */
void
vm_page_putfake(vm_page_t m)
{

	KASSERT((m->flags & PG_FICTITIOUS) != 0,
	    ("vm_page_putfake: bad page %p", m));
	uma_zfree(fakepg_zone, m);
}

/*
 *	vm_page_updatefake:
 *
 *	Update the given fictitious page to the specified physical address and
 *	memory attribute.
 */
void
vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
{

	KASSERT((m->flags & PG_FICTITIOUS) != 0,
	    ("vm_page_updatefake: bad page %p", m));
	m->phys_addr = paddr;
	pmap_page_set_memattr(m, memattr);
}

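/*
 * Illustrative sketch (not part of the original source): a driver that maps
 * device memory through a device pager might wrap a physical address in a
 * fictitious page for the lifetime of a mapping (address hypothetical):
 *
 *	vm_page_t m;
 *
 *	m = vm_page_getfake(0xd0000000, VM_MEMATTR_UNCACHEABLE);
 *	... hand "m" to the fault handler / insert it into an object ...
 *	vm_page_putfake(m);		// when the mapping goes away
 */
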
/*
 *	vm_page_free:
 *
 *	Free a page.
 */
void
vm_page_free(vm_page_t m)
{

	m->flags &= ~PG_ZERO;
	vm_page_free_toq(m);
}

/*
 *	vm_page_free_zero:
 *
 *	Free a page to the zeroed-pages queue
 */
void
vm_page_free_zero(vm_page_t m)
{

	m->flags |= PG_ZERO;
	vm_page_free_toq(m);
}

/*
 *	vm_page_sleep:
 *
 *	Sleep and release the page and page queues locks.
 *
 *	The object containing the given page must be locked.
 */
void
vm_page_sleep(vm_page_t m, const char *msg)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if (mtx_owned(&vm_page_queue_mtx))
		vm_page_unlock_queues();
	if (mtx_owned(vm_page_lockptr(m)))
		vm_page_unlock(m);

	/*
	 * It's possible that while we sleep, the page will get
	 * unbusied and freed.  If we are holding the object
	 * lock, we will assume we hold a reference to the object
	 * such that even if m->object changes, we can re-lock
	 * it.
	 */
	m->oflags |= VPO_WANTED;
	msleep(m, VM_OBJECT_MTX(m->object), PVM, msg, 0);
}

/*
 *	vm_page_dirty:
 *
 *	Set all bits in the page's dirty field.
 *
 *	The object containing the specified page must be locked if the call is
 *	made from the machine-independent layer.  If, however, the call is
 *	made from the pmap layer, then the page queues lock may be required.
 *	See vm_page_clear_dirty_mask().
 */
void
vm_page_dirty(vm_page_t m)
{

	KASSERT((m->flags & PG_CACHED) == 0,
	    ("vm_page_dirty: page in cache!"));
	KASSERT(!VM_PAGE_IS_FREE(m),
	    ("vm_page_dirty: page is free!"));
	KASSERT(m->valid == VM_PAGE_BITS_ALL,
	    ("vm_page_dirty: page is invalid!"));
	m->dirty = VM_PAGE_BITS_ALL;
}

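/*
 * Illustrative note (not part of the original source): vm_page_splay()
 * below is the classic top-down splay.  A single "rotate right" step,
 * taken when the target pindex sorts before both the root and its left
 * child, restructures the tree like this (y becomes the new search root):
 *
 *	      root                  y
 *	      /  \                 / \
 *	     y    C      ==>      A   root
 *	    / \                       /  \
 *	   A   B                     B    C
 */
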
/*
 *	vm_page_splay:
 *
 *	Implements Sleator and Tarjan's top-down splay algorithm.  Returns
 *	the vm_page containing the given pindex.  If, however, that
 *	pindex is not found in the vm_object, returns a vm_page that is
 *	adjacent to the pindex, coming before or after it.
 */
vm_page_t
vm_page_splay(vm_pindex_t pindex, vm_page_t root)
{
	struct vm_page dummy;
	vm_page_t lefttreemax, righttreemin, y;

	if (root == NULL)
		return (root);
	lefttreemax = righttreemin = &dummy;
	for (;; root = y) {
		if (pindex < root->pindex) {
			if ((y = root->left) == NULL)
				break;
			if (pindex < y->pindex) {
				/* Rotate right. */
				root->left = y->right;
				y->right = root;
				root = y;
				if ((y = root->left) == NULL)
					break;
			}
			/* Link into the new root's right tree. */
			righttreemin->left = root;
			righttreemin = root;
		} else if (pindex > root->pindex) {
			if ((y = root->right) == NULL)
				break;
			if (pindex > y->pindex) {
				/* Rotate left. */
				root->right = y->left;
				y->left = root;
				root = y;
				if ((y = root->right) == NULL)
					break;
			}
			/* Link into the new root's left tree. */
			lefttreemax->right = root;
			lefttreemax = root;
		} else
			break;
	}
	/* Assemble the new root. */
	lefttreemax->right = root->left;
	righttreemin->left = root->right;
	root->left = dummy.right;
	root->right = dummy.left;
	return (root);
}

/*
 *	vm_page_insert:		[ internal use only ]
 *
 *	Inserts the given mem entry into the object and object list.
 *
 *	The pagetables are not updated but will presumably fault the page
 *	in if necessary, or if a kernel page the caller will at some point
 *	enter the page into the kernel's pmap.  We are not allowed to block
 *	here so we *can't* do this anyway.
 *
 *	The object and page must be locked.
 *	This routine may not block.
 */
void
vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t root;

	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	if (m->object != NULL)
		panic("vm_page_insert: page already inserted");

	/*
	 * Record the object/offset pair in this page
	 */
	m->object = object;
	m->pindex = pindex;

	/*
	 * Now link into the object's ordered list of backed pages.
	 */
	root = object->root;
	if (root == NULL) {
		m->left = NULL;
		m->right = NULL;
		TAILQ_INSERT_TAIL(&object->memq, m, listq);
	} else {
		root = vm_page_splay(pindex, root);
		if (pindex < root->pindex) {
			m->left = root->left;
			m->right = root;
			root->left = NULL;
			TAILQ_INSERT_BEFORE(root, m, listq);
		} else if (pindex == root->pindex)
			panic("vm_page_insert: offset already allocated");
		else {
			m->right = root->right;
			m->left = root;
			root->right = NULL;
			TAILQ_INSERT_AFTER(&object->memq, root, m, listq);
		}
	}
	object->root = m;

	/*
	 * show that the object has one more resident page.
	 */
	object->resident_page_count++;
	/*
	 * Hold the vnode until the last page is released.
	 */
	if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
		vhold((struct vnode *)object->handle);

	/*
	 * Since we are inserting a new and possibly dirty page,
	 * update the object's OBJ_MIGHTBEDIRTY flag.
	 */
	if (m->flags & PG_WRITEABLE)
		vm_object_set_writeable_dirty(object);
}

/*
 *	vm_page_remove:
 *				NOTE: used by device pager as well -wfj
 *
 *	Removes the given mem entry from the object/offset-page
 *	table and the object page list, but do not invalidate/terminate
 *	the backing store.
 *
 *	The object and page must be locked.
 *	The underlying pmap entry (if any) is NOT removed here.
 *	This routine may not block.
 */
void
vm_page_remove(vm_page_t m)
{
	vm_object_t object;
	vm_page_t root;

	if ((m->oflags & VPO_UNMANAGED) == 0)
		vm_page_lock_assert(m, MA_OWNED);
	if ((object = m->object) == NULL)
		return;
	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	if (m->oflags & VPO_BUSY) {
		m->oflags &= ~VPO_BUSY;
		vm_page_flash(m);
	}

	/*
	 * Now remove from the object's list of backed pages.
	 */
	if (m != object->root)
		vm_page_splay(m->pindex, object->root);
	if (m->left == NULL)
		root = m->right;
	else {
		root = vm_page_splay(m->pindex, m->left);
		root->right = m->right;
	}
	object->root = root;
	TAILQ_REMOVE(&object->memq, m, listq);

	/*
	 * And show that the object has one fewer resident page.
	 */
	object->resident_page_count--;
	/*
	 * The vnode may now be recycled.
	 */
	if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
		vdrop((struct vnode *)object->handle);

	m->object = NULL;
}

/*
 *	vm_page_lookup:
 *
 *	Returns the page associated with the object/offset
 *	pair specified; if none is found, NULL is returned.
 *
 *	The object must be locked.
 *	This routine may not block.
 *	This is a critical path routine
 */
vm_page_t
vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	if ((m = object->root) != NULL && m->pindex != pindex) {
		m = vm_page_splay(pindex, m);
		if ((object->root = m)->pindex != pindex)
			m = NULL;
	}
	return (m);
}

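/*
 * Illustrative sketch (not part of the original source): a typical caller
 * holds the object lock across the lookup and retries if another thread
 * has the page busied (the wait-channel name is hypothetical):
 *
 *	VM_OBJECT_LOCK(object);
 * retry:
 *	m = vm_page_lookup(object, pindex);
 *	if (m != NULL && (m->oflags & VPO_BUSY) != 0) {
 *		vm_page_sleep(m, "pglkup");	// object lock reacquired on wakeup
 *		goto retry;
 *	}
 *	...
 *	VM_OBJECT_UNLOCK(object);
 */
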
/*
 *	vm_page_find_least:
 *
 *	Returns the page associated with the object with least pindex
 *	greater than or equal to the parameter pindex, or NULL.
 *
 *	The object must be locked.
 *	The routine may not block.
 */
vm_page_t
vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	if ((m = TAILQ_FIRST(&object->memq)) != NULL) {
		if (m->pindex < pindex) {
			m = vm_page_splay(pindex, object->root);
			if ((object->root = m)->pindex < pindex)
				m = TAILQ_NEXT(m, listq);
		}
	}
	return (m);
}

/*
 * Returns the given page's successor (by pindex) within the object if it is
 * resident; if none is found, NULL is returned.
 *
 * The object must be locked.
 */
vm_page_t
vm_page_next(vm_page_t m)
{
	vm_page_t next;

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if ((next = TAILQ_NEXT(m, listq)) != NULL &&
	    next->pindex != m->pindex + 1)
		next = NULL;
	return (next);
}

/*
 * Returns the given page's predecessor (by pindex) within the object if it is
 * resident; if none is found, NULL is returned.
 *
 * The object must be locked.
 */
vm_page_t
vm_page_prev(vm_page_t m)
{
	vm_page_t prev;

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
	    prev->pindex != m->pindex - 1)
		prev = NULL;
	return (prev);
}

/*
 *	vm_page_rename:
 *
 *	Move the given memory entry from its
 *	current object to the specified target object/offset.
 *
 *	The object must be locked.
 *	This routine may not block.
 *
 *	Note: swap associated with the page must be invalidated by the move.  We
 *	      have to do this for several reasons:  (1) we aren't freeing the
 *	      page, (2) we are dirtying the page, (3) the VM system is probably
 *	      moving the page from object A to B, and will then later move
 *	      the backing store from A to B and we can't have a conflict.
 *
 *	Note: we *always* dirty the page.  It is necessary both for the
 *	      fact that we moved it, and because we may be invalidating
 *	      swap.  If the page is on the cache, we have to deactivate it
 *	      or vm_page_dirty() will panic.  Dirty pages are not allowed
 *	      on the cache.
 */
void
vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
{

	vm_page_remove(m);
	vm_page_insert(m, new_object, new_pindex);
	vm_page_dirty(m);
}

/*
 *	Convert all of the given object's cached pages that have a
 *	pindex within the given range into free pages.  If the value
 *	zero is given for "end", then the range's upper bound is
 *	infinity.  If the given object is backed by a vnode and it
 *	transitions from having one or more cached pages to none, the
 *	vnode's hold count is reduced.
 */
void
vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
{
	vm_page_t m, m_next;
	boolean_t empty;

	mtx_lock(&vm_page_queue_free_mtx);
	if (__predict_false(object->cache == NULL)) {
		mtx_unlock(&vm_page_queue_free_mtx);
		return;
	}
	m = object->cache = vm_page_splay(start, object->cache);
	if (m->pindex < start) {
		if (m->right == NULL)
			m = NULL;
		else {
			m_next = vm_page_splay(start, m->right);
			m_next->left = m;
			m->right = NULL;
			m = object->cache = m_next;
		}
	}

	/*
	 * At this point, "m" is either (1) a reference to the page
	 * with the least pindex that is greater than or equal to
	 * "start" or (2) NULL.
	 */
	for (; m != NULL && (m->pindex < end || end == 0); m = m_next) {
		/*
		 * Find "m"'s successor and remove "m" from the
		 * object's cache.
		 */
		if (m->right == NULL) {
			object->cache = m->left;
			m_next = NULL;
		} else {
			m_next = vm_page_splay(start, m->right);
			m_next->left = m->left;
			object->cache = m_next;
		}
		/* Convert "m" to a free page. */
		m->object = NULL;
		m->valid = 0;
		/* Clear PG_CACHED and set PG_FREE. */
		m->flags ^= PG_CACHED | PG_FREE;
		KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
		    ("vm_page_cache_free: page %p has inconsistent flags", m));
		cnt.v_cache_count--;
		cnt.v_free_count++;
	}
	empty = object->cache == NULL;
	mtx_unlock(&vm_page_queue_free_mtx);
	if (object->type == OBJT_VNODE && empty)
		vdrop(object->handle);
}

/*
 *	Returns the cached page that is associated with the given
 *	object and offset.  If, however, none exists, returns NULL.
 *
 *	The free page queue must be locked.
 */
static inline vm_page_t
vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	if ((m = object->cache) != NULL && m->pindex != pindex) {
		m = vm_page_splay(pindex, m);
		if ((object->cache = m)->pindex != pindex)
			m = NULL;
	}
	return (m);
}

/*
 *	Remove the given cached page from its containing object's
 *	collection of cached pages.
 *
 *	The free page queue must be locked.
 */
void
vm_page_cache_remove(vm_page_t m)
{
	vm_object_t object;
	vm_page_t root;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	KASSERT((m->flags & PG_CACHED) != 0,
	    ("vm_page_cache_remove: page %p is not cached", m));
	object = m->object;
	if (m != object->cache) {
		root = vm_page_splay(m->pindex, object->cache);
		KASSERT(root == m,
		    ("vm_page_cache_remove: page %p is not cached in object %p",
		    m, object));
	}
	if (m->left == NULL)
		root = m->right;
	else if (m->right == NULL)
		root = m->left;
	else {
		root = vm_page_splay(m->pindex, m->left);
		root->right = m->right;
	}
	object->cache = root;
	m->object = NULL;
	cnt.v_cache_count--;
}

/*
 *	Transfer all of the cached pages with offset greater than or
 *	equal to 'offidxstart' from the original object's cache to the
 *	new object's cache.  However, any cached pages with offset
 *	greater than or equal to the new object's size are kept in the
 *	original object.  Initially, the new object's cache must be
 *	empty.  Offset 'offidxstart' in the original object must
 *	correspond to offset zero in the new object.
 *
 *	The new object must be locked.
 */
void
vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
    vm_object_t new_object)
{
	vm_page_t m, m_next;

	/*
	 * Insertion into an object's collection of cached pages
	 * requires the object to be locked.  In contrast, removal does
	 * not.
	 */
	VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
	KASSERT(new_object->cache == NULL,
	    ("vm_page_cache_transfer: object %p has cached pages",
	    new_object));
	mtx_lock(&vm_page_queue_free_mtx);
	if ((m = orig_object->cache) != NULL) {
		/*
		 * Transfer all of the pages with offset greater than or
		 * equal to 'offidxstart' from the original object's
		 * cache to the new object's cache.
		 */
		m = vm_page_splay(offidxstart, m);
		if (m->pindex < offidxstart) {
			orig_object->cache = m;
			new_object->cache = m->right;
			m->right = NULL;
		} else {
			orig_object->cache = m->left;
			new_object->cache = m;
			m->left = NULL;
		}
		while ((m = new_object->cache) != NULL) {
			if ((m->pindex - offidxstart) >= new_object->size) {
				/*
				 * Return all of the cached pages with
				 * offset greater than or equal to the
				 * new object's size to the original
				 * object's cache.
				 */
				new_object->cache = m->left;
				m->left = orig_object->cache;
				orig_object->cache = m;
				break;
			}
			m_next = vm_page_splay(m->pindex, m->right);
			/* Update the page's object and offset. */
			m->object = new_object;
			m->pindex -= offidxstart;
			if (m_next == NULL)
				break;
			m->right = NULL;
			m_next->left = m;
			new_object->cache = m_next;
		}
		KASSERT(new_object->cache == NULL ||
		    new_object->type == OBJT_SWAP,
		    ("vm_page_cache_transfer: object %p's type is incompatible"
		    " with cached pages", new_object));
	}
	mtx_unlock(&vm_page_queue_free_mtx);
}

/*
 *	vm_page_alloc:
 *
 *	Allocate and return a memory cell associated
 *	with this VM object/offset pair.
 *
 *	The caller must always specify an allocation class.
 *
 *	allocation classes:
 *	VM_ALLOC_NORMAL		normal process request
 *	VM_ALLOC_SYSTEM		system *really* needs a page
 *	VM_ALLOC_INTERRUPT	interrupt time request
 *
 *	optional allocation flags:
 *	VM_ALLOC_ZERO		prefer a zeroed page
 *	VM_ALLOC_WIRED		wire the allocated page
 *	VM_ALLOC_NOOBJ		page is not associated with a vm object
 *	VM_ALLOC_NOBUSY		do not set the page busy
 *	VM_ALLOC_IFCACHED	return page only if it is cached
 *	VM_ALLOC_IFNOTCACHED	return NULL, do not reactivate if the page
 *				is cached
 *
 *	This routine may not sleep.
 */
vm_page_t
vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
{
	struct vnode *vp = NULL;
	vm_object_t m_object;
	vm_page_t m;
	int flags, page_req;

	if ((req & VM_ALLOC_NOOBJ) == 0) {
		KASSERT(object != NULL,
		    ("vm_page_alloc: NULL object."));
		VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	}

	page_req = req & VM_ALLOC_CLASS_MASK;

	/*
	 * The pager is allowed to eat deeper into the free page list.
	 */
	if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT))
		page_req = VM_ALLOC_SYSTEM;

	mtx_lock(&vm_page_queue_free_mtx);
	if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
	    (page_req == VM_ALLOC_SYSTEM &&
	    cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
	    (page_req == VM_ALLOC_INTERRUPT &&
	    cnt.v_free_count + cnt.v_cache_count > 0)) {
		/*
		 * Allocate from the free queue if the number of free pages
		 * exceeds the minimum for the request class.
		 */
		if (object != NULL &&
		    (m = vm_page_cache_lookup(object, pindex)) != NULL) {
			if ((req & VM_ALLOC_IFNOTCACHED) != 0) {
				mtx_unlock(&vm_page_queue_free_mtx);
				return (NULL);
			}
			if (vm_phys_unfree_page(m))
				vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0);
#if VM_NRESERVLEVEL > 0
			else if (!vm_reserv_reactivate_page(m))
#else
			else
#endif
				panic("vm_page_alloc: cache page %p is missing"
				    " from the free queue", m);
		} else if ((req & VM_ALLOC_IFCACHED) != 0) {
			mtx_unlock(&vm_page_queue_free_mtx);
			return (NULL);
#if VM_NRESERVLEVEL > 0
		} else if (object == NULL || object->type == OBJT_DEVICE ||
		    object->type == OBJT_SG ||
		    (object->flags & OBJ_COLORED) == 0 ||
		    (m = vm_reserv_alloc_page(object, pindex)) == NULL) {
#else
		} else {
#endif
			m = vm_phys_alloc_pages(object != NULL ?
			    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
#if VM_NRESERVLEVEL > 0
			if (m == NULL && vm_reserv_reclaim_inactive()) {
				m = vm_phys_alloc_pages(object != NULL ?
				    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT,
				    0);
			}
#endif
		}
	} else {
		/*
		 * Not allocatable, give up.
		 */
		mtx_unlock(&vm_page_queue_free_mtx);
		atomic_add_int(&vm_pageout_deficit,
		    MAX((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
		pagedaemon_wakeup();
		return (NULL);
	}

	/*
	 *  At this point we had better have found a good page.
	 */

	KASSERT(m != NULL, ("vm_page_alloc: missing page"));
	KASSERT(m->queue == PQ_NONE,
	    ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue));
	KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m));
	KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m));
	KASSERT(m->busy == 0, ("vm_page_alloc: page %p is busy", m));
	KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m));
	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
	    ("vm_page_alloc: page %p has unexpected memattr %d", m,
	    pmap_page_get_memattr(m)));
	if ((m->flags & PG_CACHED) != 0) {
		KASSERT(m->valid != 0,
		    ("vm_page_alloc: cached page %p is invalid", m));
		if (m->object == object && m->pindex == pindex)
			cnt.v_reactivated++;
		else
			m->valid = 0;
		m_object = m->object;
		vm_page_cache_remove(m);
		if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
			vp = m_object->handle;
	} else {
		KASSERT(VM_PAGE_IS_FREE(m),
		    ("vm_page_alloc: page %p is not free", m));
		KASSERT(m->valid == 0,
		    ("vm_page_alloc: free page %p is valid", m));
		cnt.v_free_count--;
	}

	/*
	 * Only the PG_ZERO flag is inherited.  The PG_CACHED or PG_FREE flag
	 * must be cleared before the free page queues lock is released.
	 */
	flags = 0;
	if (m->flags & PG_ZERO) {
		vm_page_zero_count--;
		if (req & VM_ALLOC_ZERO)
			flags = PG_ZERO;
	}
	m->flags = flags;
	mtx_unlock(&vm_page_queue_free_mtx);
	if (object == NULL || object->type == OBJT_PHYS)
		m->oflags = VPO_UNMANAGED;
	else
		m->oflags = 0;
	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ)) == 0)
		m->oflags |= VPO_BUSY;
	if (req & VM_ALLOC_WIRED) {
		/*
		 * The page lock is not required for wiring a page until that
		 * page is inserted into the object.
		 */
		atomic_add_int(&cnt.v_wire_count, 1);
		m->wire_count = 1;
	}
	m->act_count = 0;

	if (object != NULL) {
		/* Ignore device objects; the pager sets "memattr" for them. */
		if (object->memattr != VM_MEMATTR_DEFAULT &&
		    object->type != OBJT_DEVICE && object->type != OBJT_SG)
			pmap_page_set_memattr(m, object->memattr);
		vm_page_insert(m, object, pindex);
	} else
		m->pindex = pindex;

	/*
	 * The following call to vdrop() must come after the above call
	 * to vm_page_insert() in case both affect the same object and
	 * vnode.  Otherwise, the affected vnode's hold count could
	 * temporarily become zero.
	 */
	if (vp != NULL)
		vdrop(vp);

	/*
	 * Don't wakeup too often - wakeup the pageout daemon when
	 * we would be nearly out of memory.
	 */
	if (vm_paging_needed())
		pagedaemon_wakeup();

	return (m);
}

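/*
 * Illustrative sketch (not part of the original source): since
 * vm_page_alloc() never sleeps, callers that may block usually retry after
 * waiting for the pageout daemon, and zero the page themselves when
 * VM_ALLOC_ZERO did not find a pre-zeroed page:
 *
 *	VM_OBJECT_LOCK(object);
 *	while ((m = vm_page_alloc(object, pindex,
 *	    VM_ALLOC_NORMAL | VM_ALLOC_ZERO)) == NULL) {
 *		VM_OBJECT_UNLOCK(object);
 *		VM_WAIT;			// see vm_wait() below
 *		VM_OBJECT_LOCK(object);
 *	}
 *	if ((m->flags & PG_ZERO) == 0)
 *		pmap_zero_page(m);
 *	VM_OBJECT_UNLOCK(object);
 */
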
1403 */ 1404 atomic_add_int(&cnt.v_wire_count, 1); 1405 m->wire_count = 1; 1406 } 1407 m->act_count = 0; 1408 1409 if (object != NULL) { 1410 /* Ignore device objects; the pager sets "memattr" for them. */ 1411 if (object->memattr != VM_MEMATTR_DEFAULT && 1412 object->type != OBJT_DEVICE && object->type != OBJT_SG) 1413 pmap_page_set_memattr(m, object->memattr); 1414 vm_page_insert(m, object, pindex); 1415 } else 1416 m->pindex = pindex; 1417 1418 /* 1419 * The following call to vdrop() must come after the above call 1420 * to vm_page_insert() in case both affect the same object and 1421 * vnode. Otherwise, the affected vnode's hold count could 1422 * temporarily become zero. 1423 */ 1424 if (vp != NULL) 1425 vdrop(vp); 1426 1427 /* 1428 * Don't wakeup too often - wakeup the pageout daemon when 1429 * we would be nearly out of memory. 1430 */ 1431 if (vm_paging_needed()) 1432 pagedaemon_wakeup(); 1433 1434 return (m); 1435} 1436 1437/* 1438 * Initialize a page that has been freshly dequeued from a freelist. 1439 * The caller has to drop the vnode returned, if it is not NULL. 1440 * 1441 * To be called with vm_page_queue_free_mtx held. 1442 */ 1443struct vnode * 1444vm_page_alloc_init(vm_page_t m) 1445{ 1446 struct vnode *drop; 1447 vm_object_t m_object; 1448 1449 KASSERT(m->queue == PQ_NONE, 1450 ("vm_page_alloc_init: page %p has unexpected queue %d", 1451 m, m->queue)); 1452 KASSERT(m->wire_count == 0, 1453 ("vm_page_alloc_init: page %p is wired", m)); 1454 KASSERT(m->hold_count == 0, 1455 ("vm_page_alloc_init: page %p is held", m)); 1456 KASSERT(m->busy == 0, 1457 ("vm_page_alloc_init: page %p is busy", m)); 1458 KASSERT(m->dirty == 0, 1459 ("vm_page_alloc_init: page %p is dirty", m)); 1460 KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, 1461 ("vm_page_alloc_init: page %p has unexpected memattr %d", 1462 m, pmap_page_get_memattr(m))); 1463 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); 1464 drop = NULL; 1465 if ((m->flags & PG_CACHED) != 0) { 1466 m->valid = 0; 1467 m_object = m->object; 1468 vm_page_cache_remove(m); 1469 if (m_object->type == OBJT_VNODE && 1470 m_object->cache == NULL) 1471 drop = m_object->handle; 1472 } else { 1473 KASSERT(VM_PAGE_IS_FREE(m), 1474 ("vm_page_alloc_init: page %p is not free", m)); 1475 KASSERT(m->valid == 0, 1476 ("vm_page_alloc_init: free page %p is valid", m)); 1477 cnt.v_free_count--; 1478 } 1479 if (m->flags & PG_ZERO) 1480 vm_page_zero_count--; 1481 /* Don't clear the PG_ZERO flag; we'll need it later. */ 1482 m->flags &= PG_ZERO; 1483 m->oflags = VPO_UNMANAGED; 1484 /* Unmanaged pages don't use "act_count". */ 1485 return (drop); 1486} 1487 1488/* 1489 * vm_page_alloc_freelist: 1490 * 1491 * Allocate a page from the specified freelist. 1492 * Only the ALLOC_CLASS values in req are honored, other request flags 1493 * are ignored. 1494 */ 1495vm_page_t 1496vm_page_alloc_freelist(int flind, int req) 1497{ 1498 struct vnode *drop; 1499 vm_page_t m; 1500 int page_req; 1501 1502 m = NULL; 1503 page_req = req & VM_ALLOC_CLASS_MASK; 1504 mtx_lock(&vm_page_queue_free_mtx); 1505 /* 1506 * Do not allocate reserved pages unless the req has asked for it. 
1507 */ 1508 if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved || 1509 (page_req == VM_ALLOC_SYSTEM && 1510 cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) || 1511 (page_req == VM_ALLOC_INTERRUPT && 1512 cnt.v_free_count + cnt.v_cache_count > 0)) { 1513 m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0); 1514 } 1515 if (m == NULL) { 1516 mtx_unlock(&vm_page_queue_free_mtx); 1517 return (NULL); 1518 } 1519 drop = vm_page_alloc_init(m); 1520 mtx_unlock(&vm_page_queue_free_mtx); 1521 if (drop) 1522 vdrop(drop); 1523 return (m); 1524} 1525 1526/* 1527 * vm_wait: (also see VM_WAIT macro) 1528 * 1529 * Block until free pages are available for allocation 1530 * - Called in various places before memory allocations. 1531 */ 1532void 1533vm_wait(void) 1534{ 1535 1536 mtx_lock(&vm_page_queue_free_mtx); 1537 if (curproc == pageproc) { 1538 vm_pageout_pages_needed = 1; 1539 msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx, 1540 PDROP | PSWP, "VMWait", 0); 1541 } else { 1542 if (!vm_pages_needed) { 1543 vm_pages_needed = 1; 1544 wakeup(&vm_pages_needed); 1545 } 1546 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM, 1547 "vmwait", 0); 1548 } 1549} 1550 1551/* 1552 * vm_waitpfault: (also see VM_WAITPFAULT macro) 1553 * 1554 * Block until free pages are available for allocation 1555 * - Called only in vm_fault so that processes page faulting 1556 * can be easily tracked. 1557 * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing 1558 * processes will be able to grab memory first. Do not change 1559 * this balance without careful testing first. 1560 */ 1561void 1562vm_waitpfault(void) 1563{ 1564 1565 mtx_lock(&vm_page_queue_free_mtx); 1566 if (!vm_pages_needed) { 1567 vm_pages_needed = 1; 1568 wakeup(&vm_pages_needed); 1569 } 1570 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER, 1571 "pfault", 0); 1572} 1573 1574/* 1575 * vm_page_requeue: 1576 * 1577 * Move the given page to the tail of its present page queue. 1578 * 1579 * The page queues must be locked. 1580 */ 1581void 1582vm_page_requeue(vm_page_t m) 1583{ 1584 struct vpgqueues *vpq; 1585 int queue; 1586 1587 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1588 queue = m->queue; 1589 KASSERT(queue != PQ_NONE, 1590 ("vm_page_requeue: page %p is not queued", m)); 1591 vpq = &vm_page_queues[queue]; 1592 TAILQ_REMOVE(&vpq->pl, m, pageq); 1593 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq); 1594} 1595 1596/* 1597 * vm_page_queue_remove: 1598 * 1599 * Remove the given page from the specified queue. 1600 * 1601 * The page and page queues must be locked. 1602 */ 1603static __inline void 1604vm_page_queue_remove(int queue, vm_page_t m) 1605{ 1606 struct vpgqueues *pq; 1607 1608 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1609 vm_page_lock_assert(m, MA_OWNED); 1610 pq = &vm_page_queues[queue]; 1611 TAILQ_REMOVE(&pq->pl, m, pageq); 1612 (*pq->cnt)--; 1613} 1614 1615/* 1616 * vm_pageq_remove: 1617 * 1618 * Remove a page from its queue. 1619 * 1620 * The given page must be locked. 1621 * This routine may not block. 1622 */ 1623void 1624vm_pageq_remove(vm_page_t m) 1625{ 1626 int queue; 1627 1628 vm_page_lock_assert(m, MA_OWNED); 1629 if ((queue = m->queue) != PQ_NONE) { 1630 vm_page_lock_queues(); 1631 m->queue = PQ_NONE; 1632 vm_page_queue_remove(queue, m); 1633 vm_page_unlock_queues(); 1634 } 1635} 1636 1637/* 1638 * vm_page_enqueue: 1639 * 1640 * Add the given page to the specified queue. 1641 * 1642 * The page queues must be locked. 
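 *
 * For example, vm_page_activate() below removes a page from its old
 * queue, if any, with vm_page_queue_remove() and then calls
 * vm_page_enqueue(PQ_ACTIVE, m) while holding the page queues lock.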
1643 */ 1644static void 1645vm_page_enqueue(int queue, vm_page_t m) 1646{ 1647 struct vpgqueues *vpq; 1648 1649 vpq = &vm_page_queues[queue]; 1650 m->queue = queue; 1651 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq); 1652 ++*vpq->cnt; 1653} 1654 1655/* 1656 * vm_page_activate: 1657 * 1658 * Put the specified page on the active list (if appropriate). 1659 * Ensure that act_count is at least ACT_INIT but do not otherwise 1660 * mess with it. 1661 * 1662 * The page must be locked. 1663 * This routine may not block. 1664 */ 1665void 1666vm_page_activate(vm_page_t m) 1667{ 1668 int queue; 1669 1670 vm_page_lock_assert(m, MA_OWNED); 1671 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 1672 if ((queue = m->queue) != PQ_ACTIVE) { 1673 if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { 1674 if (m->act_count < ACT_INIT) 1675 m->act_count = ACT_INIT; 1676 vm_page_lock_queues(); 1677 if (queue != PQ_NONE) 1678 vm_page_queue_remove(queue, m); 1679 vm_page_enqueue(PQ_ACTIVE, m); 1680 vm_page_unlock_queues(); 1681 } else 1682 KASSERT(queue == PQ_NONE, 1683 ("vm_page_activate: wired page %p is queued", m)); 1684 } else { 1685 if (m->act_count < ACT_INIT) 1686 m->act_count = ACT_INIT; 1687 } 1688} 1689 1690/* 1691 * vm_page_free_wakeup: 1692 * 1693 * Helper routine for vm_page_free_toq() and vm_page_cache(). This 1694 * routine is called when a page has been added to the cache or free 1695 * queues. 1696 * 1697 * The page queues must be locked. 1698 * This routine may not block. 1699 */ 1700static inline void 1701vm_page_free_wakeup(void) 1702{ 1703 1704 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); 1705 /* 1706 * if pageout daemon needs pages, then tell it that there are 1707 * some free. 1708 */ 1709 if (vm_pageout_pages_needed && 1710 cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) { 1711 wakeup(&vm_pageout_pages_needed); 1712 vm_pageout_pages_needed = 0; 1713 } 1714 /* 1715 * wakeup processes that are waiting on memory if we hit a 1716 * high water mark. And wakeup scheduler process if we have 1717 * lots of memory. this process will swapin processes. 1718 */ 1719 if (vm_pages_needed && !vm_page_count_min()) { 1720 vm_pages_needed = 0; 1721 wakeup(&cnt.v_free_count); 1722 } 1723} 1724 1725/* 1726 * vm_page_free_toq: 1727 * 1728 * Returns the given page to the free list, 1729 * disassociating it with any VM object. 1730 * 1731 * Object and page must be locked prior to entry. 1732 * This routine may not block. 1733 */ 1734 1735void 1736vm_page_free_toq(vm_page_t m) 1737{ 1738 1739 if ((m->oflags & VPO_UNMANAGED) == 0) { 1740 vm_page_lock_assert(m, MA_OWNED); 1741 KASSERT(!pmap_page_is_mapped(m), 1742 ("vm_page_free_toq: freeing mapped page %p", m)); 1743 } 1744 PCPU_INC(cnt.v_tfree); 1745 1746 if (VM_PAGE_IS_FREE(m)) 1747 panic("vm_page_free: freeing free page %p", m); 1748 else if (m->busy != 0) 1749 panic("vm_page_free: freeing busy page %p", m); 1750 1751 /* 1752 * unqueue, then remove page. Note that we cannot destroy 1753 * the page here because we do not want to call the pager's 1754 * callback routine until after we've put the page on the 1755 * appropriate free queue. 1756 */ 1757 if ((m->oflags & VPO_UNMANAGED) == 0) 1758 vm_pageq_remove(m); 1759 vm_page_remove(m); 1760 1761 /* 1762 * If fictitious remove object association and 1763 * return, otherwise delay object association removal. 
	 */
	if ((m->flags & PG_FICTITIOUS) != 0) {
		return;
	}

	m->valid = 0;
	vm_page_undirty(m);

	if (m->wire_count != 0)
		panic("vm_page_free: freeing wired page %p", m);
	if (m->hold_count != 0) {
		m->flags &= ~PG_ZERO;
		vm_page_lock_queues();
		vm_page_enqueue(PQ_HOLD, m);
		vm_page_unlock_queues();
	} else {
		/*
		 * Restore the default memory attribute to the page.
		 */
		if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
			pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);

		/*
		 * Insert the page into the physical memory allocator's
		 * cache/free page queues.
		 */
		mtx_lock(&vm_page_queue_free_mtx);
		m->flags |= PG_FREE;
		cnt.v_free_count++;
#if VM_NRESERVLEVEL > 0
		if (!vm_reserv_free_page(m))
#else
		if (TRUE)
#endif
			vm_phys_free_pages(m, 0);
		if ((m->flags & PG_ZERO) != 0)
			++vm_page_zero_count;
		else
			vm_page_zero_idle_wakeup();
		vm_page_free_wakeup();
		mtx_unlock(&vm_page_queue_free_mtx);
	}
}

/*
 * vm_page_wire:
 *
 * Mark this page as wired down by yet another map, removing it from
 * the paging queues as necessary.
 *
 * If the page is fictitious, then its wire count must remain one.
 *
 * The page must be locked.
 * This routine may not block.
 */
void
vm_page_wire(vm_page_t m)
{

	/*
	 * Only bump the wire statistics if the page is not already wired,
	 * and only unqueue the page if it is on some queue (if it is unmanaged
	 * it is already off the queues).
	 */
	vm_page_lock_assert(m, MA_OWNED);
	if ((m->flags & PG_FICTITIOUS) != 0) {
		KASSERT(m->wire_count == 1,
		    ("vm_page_wire: fictitious page %p's wire count isn't one",
		    m));
		return;
	}
	if (m->wire_count == 0) {
		if ((m->oflags & VPO_UNMANAGED) == 0)
			vm_pageq_remove(m);
		atomic_add_int(&cnt.v_wire_count, 1);
	}
	m->wire_count++;
	KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
}

/*
 * vm_page_unwire:
 *
 * Release one wiring of the specified page, potentially enabling it to be
 * paged again.  If paging is enabled, then the value of the parameter
 * "activate" determines to which queue the page is added.  If "activate" is
 * non-zero, then the page is added to the active queue.  Otherwise, it is
 * added to the inactive queue.
 *
 * However, unless the page belongs to an object, it is not enqueued because
 * it cannot be paged out.
 *
 * If a page is fictitious, then its wire count must always be one.
 *
 * A managed page must be locked.
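 *
 * A minimal caller sketch (illustrative only): drop one wiring and let a
 * managed page that still belongs to an object migrate to the inactive
 * queue:
 *
 *	vm_page_lock(m);
 *	vm_page_unwire(m, 0);
 *	vm_page_unlock(m);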
1860 */ 1861void 1862vm_page_unwire(vm_page_t m, int activate) 1863{ 1864 1865 if ((m->oflags & VPO_UNMANAGED) == 0) 1866 vm_page_lock_assert(m, MA_OWNED); 1867 if ((m->flags & PG_FICTITIOUS) != 0) { 1868 KASSERT(m->wire_count == 1, 1869 ("vm_page_unwire: fictitious page %p's wire count isn't one", m)); 1870 return; 1871 } 1872 if (m->wire_count > 0) { 1873 m->wire_count--; 1874 if (m->wire_count == 0) { 1875 atomic_subtract_int(&cnt.v_wire_count, 1); 1876 if ((m->oflags & VPO_UNMANAGED) != 0 || 1877 m->object == NULL) 1878 return; 1879 vm_page_lock_queues(); 1880 if (activate) 1881 vm_page_enqueue(PQ_ACTIVE, m); 1882 else { 1883 vm_page_flag_clear(m, PG_WINATCFLS); 1884 vm_page_enqueue(PQ_INACTIVE, m); 1885 } 1886 vm_page_unlock_queues(); 1887 } 1888 } else 1889 panic("vm_page_unwire: page %p's wire count is zero", m); 1890} 1891 1892/* 1893 * Move the specified page to the inactive queue. 1894 * 1895 * Many pages placed on the inactive queue should actually go 1896 * into the cache, but it is difficult to figure out which. What 1897 * we do instead, if the inactive target is well met, is to put 1898 * clean pages at the head of the inactive queue instead of the tail. 1899 * This will cause them to be moved to the cache more quickly and 1900 * if not actively re-referenced, reclaimed more quickly. If we just 1901 * stick these pages at the end of the inactive queue, heavy filesystem 1902 * meta-data accesses can cause an unnecessary paging load on memory bound 1903 * processes. This optimization causes one-time-use metadata to be 1904 * reused more quickly. 1905 * 1906 * Normally athead is 0 resulting in LRU operation. athead is set 1907 * to 1 if we want this page to be 'as if it were placed in the cache', 1908 * except without unmapping it from the process address space. 1909 * 1910 * This routine may not block. 1911 */ 1912static inline void 1913_vm_page_deactivate(vm_page_t m, int athead) 1914{ 1915 int queue; 1916 1917 vm_page_lock_assert(m, MA_OWNED); 1918 1919 /* 1920 * Ignore if already inactive. 1921 */ 1922 if ((queue = m->queue) == PQ_INACTIVE) 1923 return; 1924 if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { 1925 vm_page_lock_queues(); 1926 vm_page_flag_clear(m, PG_WINATCFLS); 1927 if (queue != PQ_NONE) 1928 vm_page_queue_remove(queue, m); 1929 if (athead) 1930 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, 1931 pageq); 1932 else 1933 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, 1934 pageq); 1935 m->queue = PQ_INACTIVE; 1936 cnt.v_inactive_count++; 1937 vm_page_unlock_queues(); 1938 } 1939} 1940 1941/* 1942 * Move the specified page to the inactive queue. 1943 * 1944 * The page must be locked. 1945 */ 1946void 1947vm_page_deactivate(vm_page_t m) 1948{ 1949 1950 _vm_page_deactivate(m, 0); 1951} 1952 1953/* 1954 * vm_page_try_to_cache: 1955 * 1956 * Returns 0 on failure, 1 on success 1957 */ 1958int 1959vm_page_try_to_cache(vm_page_t m) 1960{ 1961 1962 vm_page_lock_assert(m, MA_OWNED); 1963 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 1964 if (m->dirty || m->hold_count || m->busy || m->wire_count || 1965 (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0) 1966 return (0); 1967 pmap_remove_all(m); 1968 if (m->dirty) 1969 return (0); 1970 vm_page_cache(m); 1971 return (1); 1972} 1973 1974/* 1975 * vm_page_try_to_free() 1976 * 1977 * Attempt to free the page. If we cannot free it, we do nothing. 1978 * 1 is returned on success, 0 on failure. 
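 *
 * This mirrors vm_page_try_to_cache() above, except that a page passing
 * the same checks is freed outright instead of being moved to the cache
 * queue.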
 */
int
vm_page_try_to_free(vm_page_t m)
{

	vm_page_lock_assert(m, MA_OWNED);
	if (m->object != NULL)
		VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
	    (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
		return (0);
	pmap_remove_all(m);
	if (m->dirty)
		return (0);
	vm_page_free(m);
	return (1);
}

/*
 * vm_page_cache
 *
 * Put the specified page onto the page cache queue (if appropriate).
 *
 * This routine may not block.
 */
void
vm_page_cache(vm_page_t m)
{
	vm_object_t object;
	vm_page_t root;

	vm_page_lock_assert(m, MA_OWNED);
	object = m->object;
	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	if ((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) || m->busy ||
	    m->hold_count || m->wire_count)
		panic("vm_page_cache: attempting to cache busy page");
	pmap_remove_all(m);
	if (m->dirty != 0)
		panic("vm_page_cache: page %p is dirty", m);
	if (m->valid == 0 || object->type == OBJT_DEFAULT ||
	    (object->type == OBJT_SWAP &&
	    !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
		/*
		 * Hypothesis: A cache-eligible page belonging to a
		 * default object or swap object but without a backing
		 * store must be zero filled.
		 */
		vm_page_free(m);
		return;
	}
	KASSERT((m->flags & PG_CACHED) == 0,
	    ("vm_page_cache: page %p is already cached", m));
	PCPU_INC(cnt.v_tcached);

	/*
	 * Remove the page from the paging queues.
	 */
	vm_pageq_remove(m);

	/*
	 * Remove the page from the object's collection of resident
	 * pages.
	 */
	if (m != object->root)
		vm_page_splay(m->pindex, object->root);
	if (m->left == NULL)
		root = m->right;
	else {
		root = vm_page_splay(m->pindex, m->left);
		root->right = m->right;
	}
	object->root = root;
	TAILQ_REMOVE(&object->memq, m, listq);
	object->resident_page_count--;

	/*
	 * Restore the default memory attribute to the page.
	 */
	if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
		pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);

	/*
	 * Insert the page into the object's collection of cached pages
	 * and the physical memory allocator's cache/free page queues.
	 */
	m->flags &= ~PG_ZERO;
	mtx_lock(&vm_page_queue_free_mtx);
	m->flags |= PG_CACHED;
	cnt.v_cache_count++;
	root = object->cache;
	if (root == NULL) {
		m->left = NULL;
		m->right = NULL;
	} else {
		root = vm_page_splay(m->pindex, root);
		if (m->pindex < root->pindex) {
			m->left = root->left;
			m->right = root;
			root->left = NULL;
		} else if (__predict_false(m->pindex == root->pindex))
			panic("vm_page_cache: offset already cached");
		else {
			m->right = root->right;
			m->left = root;
			root->right = NULL;
		}
	}
	object->cache = m;
#if VM_NRESERVLEVEL > 0
	if (!vm_reserv_free_page(m)) {
#else
	if (TRUE) {
#endif
		vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
		vm_phys_free_pages(m, 0);
	}
	vm_page_free_wakeup();
	mtx_unlock(&vm_page_queue_free_mtx);

	/*
	 * Increment the vnode's hold count if this is the object's only
	 * cached page.  Decrement the vnode's hold count if this was
	 * the object's only resident page.
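	 *
	 * A NULL "root" here means the object's cache splay tree was empty
	 * before this page was inserted into it.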
	 */
	if (object->type == OBJT_VNODE) {
		if (root == NULL && object->resident_page_count != 0)
			vhold(object->handle);
		else if (root != NULL && object->resident_page_count == 0)
			vdrop(object->handle);
	}
}

/*
 * vm_page_dontneed
 *
 * Cache, deactivate, or do nothing as appropriate.  This routine
 * is typically used by madvise() MADV_DONTNEED.
 *
 * Generally speaking we want to move the page into the cache so
 * it gets reused quickly.  However, this can result in a silly syndrome
 * due to the page recycling too quickly.  Small objects will not be
 * fully cached.  On the other hand, if we move the page to the inactive
 * queue we wind up with a problem whereby very large objects
 * unnecessarily blow away our inactive and cache queues.
 *
 * The solution is to move the pages based on a fixed weighting.  We
 * either leave them alone, deactivate them, or move them to the cache,
 * where moving them to the cache has the highest weighting.
 * By forcing some pages into other queues we eventually force the
 * system to balance the queues, potentially recovering other unrelated
 * space from active.  The idea is to not force this to happen too
 * often.
 */
void
vm_page_dontneed(vm_page_t m)
{
	int dnw;
	int head;

	vm_page_lock_assert(m, MA_OWNED);
	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	dnw = PCPU_GET(dnweight);
	PCPU_INC(dnweight);

	/*
	 * Occasionally leave the page alone.
	 */
	if ((dnw & 0x01F0) == 0 || m->queue == PQ_INACTIVE) {
		if (m->act_count >= ACT_INIT)
			--m->act_count;
		return;
	}

	/*
	 * Clear any references to the page.  Otherwise, the page daemon will
	 * immediately reactivate the page.
	 *
	 * Perform the pmap_clear_reference() first.  Otherwise, a concurrent
	 * pmap operation, such as pmap_remove(), could clear a reference in
	 * the pmap and set PG_REFERENCED on the page before the
	 * pmap_clear_reference() had completed.  Consequently, the page would
	 * appear referenced based upon an old reference that occurred before
	 * this function ran.
	 */
	pmap_clear_reference(m);
	vm_page_lock_queues();
	vm_page_flag_clear(m, PG_REFERENCED);
	vm_page_unlock_queues();

	if (m->dirty == 0 && pmap_is_modified(m))
		vm_page_dirty(m);

	if (m->dirty || (dnw & 0x0070) == 0) {
		/*
		 * Deactivate the page 3 times out of 32.
		 */
		head = 0;
	} else {
		/*
		 * Cache the page 28 times out of every 32.  Note that
		 * the page is deactivated instead of cached, but placed
		 * at the head of the queue instead of the tail.
		 */
		head = 1;
	}
	_vm_page_deactivate(m, head);
}

/*
 * Grab a page, waiting until we are woken up due to the page
 * changing state.  We keep on waiting if the page continues
 * to be in the object.  If the page doesn't exist, first allocate it
 * and then conditionally zero it.
 *
 * The caller must always specify the VM_ALLOC_RETRY flag.  This is intended
 * to facilitate its eventual removal.
 *
 * This routine may block.
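 *
 * A typical calling pattern looks roughly like the following sketch
 * (illustrative only; real callers usually do more between the calls):
 *
 *	VM_OBJECT_LOCK(object);
 *	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 *	...
 *	vm_page_wakeup(m);
 *	VM_OBJECT_UNLOCK(object);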
 */
vm_page_t
vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
{
	vm_page_t m;

	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	KASSERT((allocflags & VM_ALLOC_RETRY) != 0,
	    ("vm_page_grab: VM_ALLOC_RETRY is required"));
retrylookup:
	if ((m = vm_page_lookup(object, pindex)) != NULL) {
		if ((m->oflags & VPO_BUSY) != 0 ||
		    ((allocflags & VM_ALLOC_IGN_SBUSY) == 0 && m->busy != 0)) {
			/*
			 * Reference the page before unlocking and
			 * sleeping so that the page daemon is less
			 * likely to reclaim it.
			 */
			vm_page_lock_queues();
			vm_page_flag_set(m, PG_REFERENCED);
			vm_page_sleep(m, "pgrbwt");
			goto retrylookup;
		} else {
			if ((allocflags & VM_ALLOC_WIRED) != 0) {
				vm_page_lock(m);
				vm_page_wire(m);
				vm_page_unlock(m);
			}
			if ((allocflags & VM_ALLOC_NOBUSY) == 0)
				vm_page_busy(m);
			return (m);
		}
	}
	m = vm_page_alloc(object, pindex, allocflags & ~(VM_ALLOC_RETRY |
	    VM_ALLOC_IGN_SBUSY));
	if (m == NULL) {
		VM_OBJECT_UNLOCK(object);
		VM_WAIT;
		VM_OBJECT_LOCK(object);
		goto retrylookup;
	} else if (m->valid != 0)
		return (m);
	if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);
	return (m);
}

/*
 * Mapping function for valid bits or for dirty bits in
 * a page.  May not block.
 *
 * Inputs are required to range within a page.
 */
int
vm_page_bits(int base, int size)
{
	int first_bit;
	int last_bit;

	KASSERT(
	    base + size <= PAGE_SIZE,
	    ("vm_page_bits: illegal base/size %d/%d", base, size)
	);

	if (size == 0)		/* handle degenerate case */
		return (0);

	first_bit = base >> DEV_BSHIFT;
	last_bit = (base + size - 1) >> DEV_BSHIFT;

	return ((2 << last_bit) - (1 << first_bit));
}

/*
 * vm_page_set_valid:
 *
 * Sets portions of a page valid.  The arguments are expected
 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
 * of any partial chunks touched by the range.  The invalid portion of
 * such chunks will be zeroed.
 *
 * (base + size) must be less than or equal to PAGE_SIZE.
 */
void
vm_page_set_valid(vm_page_t m, int base, int size)
{
	int endoff, frag;

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if (size == 0)		/* handle degenerate case */
		return;

	/*
	 * If the base is not DEV_BSIZE aligned and the valid
	 * bit is clear, we have to zero out a portion of the
	 * first block.
	 */
	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
		pmap_zero_page_area(m, frag, base - frag);

	/*
	 * If the ending offset is not DEV_BSIZE aligned and the
	 * valid bit is clear, we have to zero out a portion of
	 * the last block.
	 */
	endoff = base + size;
	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
		pmap_zero_page_area(m, endoff,
		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));

	/*
	 * Assert that no previously invalid block that is now being validated
	 * is already dirty.
	 */
	KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
	    ("vm_page_set_valid: page %p is dirty", m));

	/*
	 * Set valid bits inclusive of any overlap.
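	 *
	 * For example, with the usual DEV_BSIZE of 512, a call with base 512
	 * and size 1024 covers blocks 1 and 2, and vm_page_bits() returns
	 * (2 << 2) - (1 << 1) == 0x6.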
	 */
	m->valid |= vm_page_bits(base, size);
}

/*
 * Clear the given bits from the specified page's dirty field.
 */
static __inline void
vm_page_clear_dirty_mask(vm_page_t m, int pagebits)
{

	/*
	 * If the object is locked and the page is neither VPO_BUSY nor
	 * PG_WRITEABLE, then the page's dirty field cannot possibly be
	 * set by a concurrent pmap operation.
	 */
	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if ((m->oflags & VPO_BUSY) == 0 && (m->flags & PG_WRITEABLE) == 0)
		m->dirty &= ~pagebits;
	else {
#if defined(__amd64__) || defined(__i386__) || defined(__ia64__)
		/*
		 * On the aforementioned architectures, the page queues lock
		 * is not required by the following read-modify-write
		 * operation.  The combination of the object's lock and an
		 * atomic operation suffice.  Moreover, the pmap layer on
		 * these architectures can call vm_page_dirty() without
		 * holding the page queues lock.
		 */
#if PAGE_SIZE == 4096
		atomic_clear_char(&m->dirty, pagebits);
#elif PAGE_SIZE == 8192
		atomic_clear_short(&m->dirty, pagebits);
#elif PAGE_SIZE == 16384
		atomic_clear_int(&m->dirty, pagebits);
#else
#error "PAGE_SIZE is not supported."
#endif
#else
		/*
		 * Otherwise, the page queues lock is required to ensure that
		 * a concurrent pmap operation does not set the page's dirty
		 * field during the following read-modify-write operation.
		 */
		vm_page_lock_queues();
		m->dirty &= ~pagebits;
		vm_page_unlock_queues();
#endif
	}
}

/*
 * vm_page_set_validclean:
 *
 * Sets portions of a page valid and clean.  The arguments are expected
 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
 * of any partial chunks touched by the range.  The invalid portion of
 * such chunks will be zeroed.
 *
 * This routine may not block.
 *
 * (base + size) must be less than or equal to PAGE_SIZE.
 */
void
vm_page_set_validclean(vm_page_t m, int base, int size)
{
	u_long oldvalid;
	int endoff, frag, pagebits;

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if (size == 0)		/* handle degenerate case */
		return;

	/*
	 * If the base is not DEV_BSIZE aligned and the valid
	 * bit is clear, we have to zero out a portion of the
	 * first block.
	 */
	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
		pmap_zero_page_area(m, frag, base - frag);

	/*
	 * If the ending offset is not DEV_BSIZE aligned and the
	 * valid bit is clear, we have to zero out a portion of
	 * the last block.
	 */
	endoff = base + size;
	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
		pmap_zero_page_area(m, endoff,
		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));

	/*
	 * Set valid, clear dirty bits.  If validating the entire
	 * page we can safely clear the pmap modify bit.  We also
	 * use this opportunity to clear the VPO_NOSYNC flag.  If a process
	 * takes a write fault on a MAP_NOSYNC memory area the flag will
	 * be set again.
	 *
	 * We set valid bits inclusive of any overlap, but we can only
	 * clear dirty bits for DEV_BSIZE chunks that are fully within
	 * the range.
	 */
	oldvalid = m->valid;
	pagebits = vm_page_bits(base, size);
	m->valid |= pagebits;
#if 0	/* NOT YET */
	if ((frag = base & (DEV_BSIZE - 1)) != 0) {
		frag = DEV_BSIZE - frag;
		base += frag;
		size -= frag;
		if (size < 0)
			size = 0;
	}
	pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
#endif
	if (base == 0 && size == PAGE_SIZE) {
		/*
		 * The page can only be modified within the pmap if it is
		 * mapped, and it can only be mapped if it was previously
		 * fully valid.
		 */
		if (oldvalid == VM_PAGE_BITS_ALL)
			/*
			 * Perform the pmap_clear_modify() first.  Otherwise,
			 * a concurrent pmap operation, such as
			 * pmap_protect(), could clear a modification in the
			 * pmap and set the dirty field on the page before
			 * pmap_clear_modify() had begun and after the dirty
			 * field was cleared here.
			 */
			pmap_clear_modify(m);
		m->dirty = 0;
		m->oflags &= ~VPO_NOSYNC;
	} else if (oldvalid != VM_PAGE_BITS_ALL)
		m->dirty &= ~pagebits;
	else
		vm_page_clear_dirty_mask(m, pagebits);
}

void
vm_page_clear_dirty(vm_page_t m, int base, int size)
{

	vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
}

/*
 * vm_page_set_invalid:
 *
 * Invalidates DEV_BSIZE'd chunks within a page.  Both the
 * valid and dirty bits for the affected areas are cleared.
 *
 * May not block.
 */
void
vm_page_set_invalid(vm_page_t m, int base, int size)
{
	int bits;

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	KASSERT((m->oflags & VPO_BUSY) == 0,
	    ("vm_page_set_invalid: page %p is busy", m));
	bits = vm_page_bits(base, size);
	if (m->valid == VM_PAGE_BITS_ALL && bits != 0)
		pmap_remove_all(m);
	KASSERT(!pmap_page_is_mapped(m),
	    ("vm_page_set_invalid: page %p is mapped", m));
	m->valid &= ~bits;
	m->dirty &= ~bits;
}

/*
 * vm_page_zero_invalid()
 *
 * The kernel assumes that the invalid portions of a page contain
 * garbage, but such pages can be mapped into memory by user code.
 * When this occurs, we must zero out the non-valid portions of the
 * page so user code sees what it expects.
 *
 * Pages are most often semi-valid when the end of a file is mapped
 * into memory and the file's size is not page aligned.
 */
void
vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
{
	int b;
	int i;

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	/*
	 * Scan the valid bits looking for invalid sections that
	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
	 * valid bit may be set) have already been zeroed by
	 * vm_page_set_validclean().
	 */
	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
		if (i == (PAGE_SIZE / DEV_BSIZE) ||
		    (m->valid & (1 << i))) {
			if (i > b) {
				pmap_zero_page_area(m,
				    b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
			}
			b = i + 1;
		}
	}

	/*
	 * setvalid is TRUE when we can safely set the zeroed areas
	 * as being valid.  We can do this if there are no cache consistency
	 * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
	 */
	if (setvalid)
		m->valid = VM_PAGE_BITS_ALL;
}

/*
 * vm_page_is_valid:
 *
 * Is (partial) page valid?
Note that the case where size == 0 2541 * will return FALSE in the degenerate case where the page is 2542 * entirely invalid, and TRUE otherwise. 2543 * 2544 * May not block. 2545 */ 2546int 2547vm_page_is_valid(vm_page_t m, int base, int size) 2548{ 2549 int bits = vm_page_bits(base, size); 2550 2551 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2552 if (m->valid && ((m->valid & bits) == bits)) 2553 return 1; 2554 else 2555 return 0; 2556} 2557 2558/* 2559 * update dirty bits from pmap/mmu. May not block. 2560 */ 2561void 2562vm_page_test_dirty(vm_page_t m) 2563{ 2564 2565 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2566 if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) 2567 vm_page_dirty(m); 2568} 2569 2570int so_zerocp_fullpage = 0; 2571 2572/* 2573 * Replace the given page with a copy. The copied page assumes 2574 * the portion of the given page's "wire_count" that is not the 2575 * responsibility of this copy-on-write mechanism. 2576 * 2577 * The object containing the given page must have a non-zero 2578 * paging-in-progress count and be locked. 2579 */ 2580void 2581vm_page_cowfault(vm_page_t m) 2582{ 2583 vm_page_t mnew; 2584 vm_object_t object; 2585 vm_pindex_t pindex; 2586 2587 mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED); 2588 vm_page_lock_assert(m, MA_OWNED); 2589 object = m->object; 2590 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 2591 KASSERT(object->paging_in_progress != 0, 2592 ("vm_page_cowfault: object %p's paging-in-progress count is zero.", 2593 object)); 2594 pindex = m->pindex; 2595 2596 retry_alloc: 2597 pmap_remove_all(m); 2598 vm_page_remove(m); 2599 mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); 2600 if (mnew == NULL) { 2601 vm_page_insert(m, object, pindex); 2602 vm_page_unlock(m); 2603 VM_OBJECT_UNLOCK(object); 2604 VM_WAIT; 2605 VM_OBJECT_LOCK(object); 2606 if (m == vm_page_lookup(object, pindex)) { 2607 vm_page_lock(m); 2608 goto retry_alloc; 2609 } else { 2610 /* 2611 * Page disappeared during the wait. 2612 */ 2613 return; 2614 } 2615 } 2616 2617 if (m->cow == 0) { 2618 /* 2619 * check to see if we raced with an xmit complete when 2620 * waiting to allocate a page. If so, put things back 2621 * the way they were 2622 */ 2623 vm_page_unlock(m); 2624 vm_page_lock(mnew); 2625 vm_page_free(mnew); 2626 vm_page_unlock(mnew); 2627 vm_page_insert(m, object, pindex); 2628 } else { /* clear COW & copy page */ 2629 if (!so_zerocp_fullpage) 2630 pmap_copy_page(m, mnew); 2631 mnew->valid = VM_PAGE_BITS_ALL; 2632 vm_page_dirty(mnew); 2633 mnew->wire_count = m->wire_count - m->cow; 2634 m->wire_count = m->cow; 2635 vm_page_unlock(m); 2636 } 2637} 2638 2639void 2640vm_page_cowclear(vm_page_t m) 2641{ 2642 2643 vm_page_lock_assert(m, MA_OWNED); 2644 if (m->cow) { 2645 m->cow--; 2646 /* 2647 * let vm_fault add back write permission lazily 2648 */ 2649 } 2650 /* 2651 * sf_buf_free() will free the page, so we needn't do it here 2652 */ 2653} 2654 2655int 2656vm_page_cowsetup(vm_page_t m) 2657{ 2658 2659 vm_page_lock_assert(m, MA_OWNED); 2660 if ((m->flags & PG_FICTITIOUS) != 0 || 2661 (m->oflags & VPO_UNMANAGED) != 0 || 2662 m->cow == USHRT_MAX - 1 || !VM_OBJECT_TRYLOCK(m->object)) 2663 return (EBUSY); 2664 m->cow++; 2665 pmap_remove_write(m); 2666 VM_OBJECT_UNLOCK(m->object); 2667 return (0); 2668} 2669 2670#ifdef INVARIANTS 2671void 2672vm_page_object_lock_assert(vm_page_t m) 2673{ 2674 2675 /* 2676 * Certain of the page's fields may only be modified by the 2677 * holder of the containing object's lock or the setter of the 2678 * page's VPO_BUSY flag. 
Unfortunately, the setter of the 2679 * VPO_BUSY flag is not recorded, and thus cannot be checked 2680 * here. 2681 */ 2682 if (m->object != NULL && (m->oflags & VPO_BUSY) == 0) 2683 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2684} 2685#endif 2686 2687#include "opt_ddb.h" 2688#ifdef DDB 2689#include <sys/kernel.h> 2690 2691#include <ddb/ddb.h> 2692 2693DB_SHOW_COMMAND(page, vm_page_print_page_info) 2694{ 2695 db_printf("cnt.v_free_count: %d\n", cnt.v_free_count); 2696 db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count); 2697 db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count); 2698 db_printf("cnt.v_active_count: %d\n", cnt.v_active_count); 2699 db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count); 2700 db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved); 2701 db_printf("cnt.v_free_min: %d\n", cnt.v_free_min); 2702 db_printf("cnt.v_free_target: %d\n", cnt.v_free_target); 2703 db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min); 2704 db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target); 2705} 2706 2707DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) 2708{ 2709 2710 db_printf("PQ_FREE:"); 2711 db_printf(" %d", cnt.v_free_count); 2712 db_printf("\n"); 2713 2714 db_printf("PQ_CACHE:"); 2715 db_printf(" %d", cnt.v_cache_count); 2716 db_printf("\n"); 2717 2718 db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n", 2719 *vm_page_queues[PQ_ACTIVE].cnt, 2720 *vm_page_queues[PQ_INACTIVE].cnt); 2721} 2722#endif /* DDB */ 2723