vm_page.c (228287) → vm_page.c (230623)
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * The Mach Operating System project at Carnegie-Mellon University.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91
34 */
35
36/*-
37 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
38 * All rights reserved.
39 *
40 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
41 *
42 * Permission to use, copy, modify and distribute this software and
43 * its documentation is hereby granted, provided that both the copyright
44 * notice and this permission notice appear in all copies of the
45 * software, derivative works or modified versions, and any portions
46 * thereof, and that both notices appear in supporting documentation.
47 *
48 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51 *
52 * Carnegie Mellon requests users of this software to return to
53 *
54 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
55 * School of Computer Science
56 * Carnegie Mellon University
57 * Pittsburgh PA 15213-3890
58 *
59 * any improvements or extensions that they make and grant Carnegie the
60 * rights to redistribute these changes.
61 */
62
63/*
64 * GENERAL RULES ON VM_PAGE MANIPULATION
65 *
66 * - a pageq mutex is required when adding or removing a page from a
67 * page queue (vm_page_queue[]), regardless of other mutexes or the
68 * busy state of a page.
69 *
70 * - The object mutex is held when inserting or removing
71 * pages from an object (vm_page_insert() or vm_page_remove()).
72 *
73 */
74
75/*
76 * Resident memory management module.
77 */
78
79#include <sys/cdefs.h>
80__FBSDID("$FreeBSD: head/sys/vm/vm_page.c 228287 2011-12-05 18:29:25Z alc $");
80__FBSDID("$FreeBSD: head/sys/vm/vm_page.c 230623 2012-01-27 20:18:31Z kmacy $");
81
82#include "opt_vm.h"
83
84#include <sys/param.h>
85#include <sys/systm.h>
86#include <sys/lock.h>
87#include <sys/kernel.h>
88#include <sys/limits.h>
89#include <sys/malloc.h>
90#include <sys/msgbuf.h>
91#include <sys/mutex.h>
92#include <sys/proc.h>
93#include <sys/sysctl.h>
94#include <sys/vmmeter.h>
95#include <sys/vnode.h>
96
97#include <vm/vm.h>
98#include <vm/pmap.h>
99#include <vm/vm_param.h>
100#include <vm/vm_kern.h>
101#include <vm/vm_object.h>
102#include <vm/vm_page.h>
103#include <vm/vm_pageout.h>
104#include <vm/vm_pager.h>
105#include <vm/vm_phys.h>
106#include <vm/vm_reserv.h>
107#include <vm/vm_extern.h>
108#include <vm/uma.h>
109#include <vm/uma_int.h>
110
111#include <machine/md_var.h>
112
113/*
114 * Associated with each page of user-allocatable memory is a
115 * page structure.
116 */
117
118struct vpgqueues vm_page_queues[PQ_COUNT];
119struct vpglocks vm_page_queue_lock;
120struct vpglocks vm_page_queue_free_lock;
121
122struct vpglocks pa_lock[PA_LOCK_COUNT];
123
124vm_page_t vm_page_array = 0;
125int vm_page_array_size = 0;
126long first_page = 0;
127int vm_page_zero_count = 0;
128
129static int boot_pages = UMA_BOOT_PAGES;
130TUNABLE_INT("vm.boot_pages", &boot_pages);
131SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0,
132 "number of pages allocated for bootstrapping the VM system");
133
134static int pa_tryrelock_restart;
135SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
136 &pa_tryrelock_restart, 0, "Number of tryrelock restarts");
137
138static uma_zone_t fakepg_zone;
139
140static struct vnode *vm_page_alloc_init(vm_page_t m);
141static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
142static void vm_page_queue_remove(int queue, vm_page_t m);
143static void vm_page_enqueue(int queue, vm_page_t m);
144static void vm_page_init_fakepg(void *dummy);
145
146SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
147
148static void
149vm_page_init_fakepg(void *dummy)
150{
151
152 fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
153 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
154}
155
156/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
157#if PAGE_SIZE == 32768
158#ifdef CTASSERT
159CTASSERT(sizeof(u_long) >= 8);
160#endif
161#endif
162
163/*
164 * Try to acquire a physical address lock while a pmap is locked. If we
165 * fail to trylock we unlock and lock the pmap directly and cache the
166 * locked pa in *locked. The caller should then restart their loop in case
167 * the virtual to physical mapping has changed.
168 */
169int
170vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked)
171{
172 vm_paddr_t lockpa;
173
174 lockpa = *locked;
175 *locked = pa;
176 if (lockpa) {
177 PA_LOCK_ASSERT(lockpa, MA_OWNED);
178 if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa))
179 return (0);
180 PA_UNLOCK(lockpa);
181 }
182 if (PA_TRYLOCK(pa))
183 return (0);
184 PMAP_UNLOCK(pmap);
185 atomic_add_int(&pa_tryrelock_restart, 1);
186 PA_LOCK(pa);
187 PMAP_LOCK(pmap);
188 return (EAGAIN);
189}
190
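#ifdef EXAMPLE_ONLY
/*
 * Illustrative sketch (an editor's example, not part of vm_page.c): the
 * caller-side retry loop that vm_page_pa_tryrelock() expects, loosely
 * modeled on pmap_extract_and_hold()-style routines.  The names
 * example_extract_and_hold() and example_pmap_va_to_pa() are hypothetical;
 * the latter stands in for the PTE walk a real pmap performs while its
 * own lock is held.
 */
static vm_paddr_t example_pmap_va_to_pa(pmap_t pmap, vm_offset_t va);

static vm_page_t
example_extract_and_hold(pmap_t pmap, vm_offset_t va)
{
	vm_paddr_t locked_pa, pa;
	vm_page_t m;

	locked_pa = 0;
	m = NULL;
	PMAP_LOCK(pmap);
retry:
	pa = example_pmap_va_to_pa(pmap, va);
	if (pa != 0) {
		/*
		 * EAGAIN means the pmap lock was dropped and reacquired,
		 * so the virtual-to-physical translation must be redone.
		 */
		if (vm_page_pa_tryrelock(pmap, pa, &locked_pa))
			goto retry;
		/* The pa lock is this page's page lock, so holding is safe. */
		m = PHYS_TO_VM_PAGE(pa);
		vm_page_hold(m);
	}
	PA_UNLOCK_COND(locked_pa);
	PMAP_UNLOCK(pmap);
	return (m);
}
#endif /* EXAMPLE_ONLY */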
191/*
192 * vm_set_page_size:
193 *
194 * Sets the page size, perhaps based upon the memory
195 * size. Must be called before any use of page-size
196 * dependent functions.
197 */
198void
199vm_set_page_size(void)
200{
201 if (cnt.v_page_size == 0)
202 cnt.v_page_size = PAGE_SIZE;
203 if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0)
204 panic("vm_set_page_size: page size not a power of two");
205}
206
207/*
208 * vm_page_blacklist_lookup:
209 *
210 * See if a physical address in this page has been listed
211 * in the blacklist tunable. Entries in the tunable are
212 * separated by spaces or commas. If an invalid integer is
213 * encountered then the rest of the string is skipped.
214 */
215static int
216vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
217{
218 vm_paddr_t bad;
219 char *cp, *pos;
220
221 for (pos = list; *pos != '\0'; pos = cp) {
222 bad = strtoq(pos, &cp, 0);
223 if (*cp != '\0') {
224 if (*cp == ' ' || *cp == ',') {
225 cp++;
226 if (cp == pos)
227 continue;
228 } else
229 break;
230 }
231 if (pa == trunc_page(bad))
232 return (1);
233 }
234 return (0);
235}
236
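/*
 * Illustrative example (an editor's addition, not part of vm_page.c): the
 * list parsed above comes from the "vm.blacklist" kernel environment
 * variable, normally set from loader.conf.  Because strtoq() is called
 * with base 0, entries may be given in hex, octal, or decimal, separated
 * by spaces or commas, and each address is truncated to a page boundary
 * before comparison.  A hypothetical setting:
 *
 *	vm.blacklist="0x7c321000,0x7c322000 2084569088"
 */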
237/*
238 * vm_page_startup:
239 *
240 * Initializes the resident memory module.
241 *
242 * Allocates memory for the page cells, and
243 * for the object/offset-to-page hash table headers.
244 * Each page cell is initialized and placed on the free list.
245 */
246vm_offset_t
247vm_page_startup(vm_offset_t vaddr)
248{
249 vm_offset_t mapped;
250 vm_paddr_t page_range;
251 vm_paddr_t new_end;
252 int i;
253 vm_paddr_t pa;
254 vm_paddr_t last_pa;
255 char *list;
256
257 /* the biggest memory array is the second group of pages */
258 vm_paddr_t end;
259 vm_paddr_t biggestsize;
260 vm_paddr_t low_water, high_water;
261 int biggestone;
262
263 biggestsize = 0;
264 biggestone = 0;
265 vaddr = round_page(vaddr);
266
267 for (i = 0; phys_avail[i + 1]; i += 2) {
268 phys_avail[i] = round_page(phys_avail[i]);
269 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
270 }
271
272 low_water = phys_avail[0];
273 high_water = phys_avail[1];
274
275 for (i = 0; phys_avail[i + 1]; i += 2) {
276 vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
277
278 if (size > biggestsize) {
279 biggestone = i;
280 biggestsize = size;
281 }
282 if (phys_avail[i] < low_water)
283 low_water = phys_avail[i];
284 if (phys_avail[i + 1] > high_water)
285 high_water = phys_avail[i + 1];
286 }
287
288#ifdef XEN
289 low_water = 0;
290#endif
291
292 end = phys_avail[biggestone+1];
293
294 /*
295 * Initialize the locks.
296 */
297 mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF |
298 MTX_RECURSE);
299 mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL,
300 MTX_DEF);
301
302 /* Setup page locks. */
303 for (i = 0; i < PA_LOCK_COUNT; i++)
304 mtx_init(&pa_lock[i].data, "page lock", NULL, MTX_DEF);
305
306 /*
307 * Initialize the queue headers for the hold queue, the active queue,
308 * and the inactive queue.
309 */
310 for (i = 0; i < PQ_COUNT; i++)
311 TAILQ_INIT(&vm_page_queues[i].pl);
312 vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
313 vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
314 vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count;
315
316 /*
317 * Allocate memory for use when bootstrapping the kernel memory
318 * allocator.
319 */
320 new_end = end - (boot_pages * UMA_SLAB_SIZE);
321 new_end = trunc_page(new_end);
322 mapped = pmap_map(&vaddr, new_end, end,
323 VM_PROT_READ | VM_PROT_WRITE);
324 bzero((void *)mapped, end - new_end);
325 uma_startup((void *)mapped, boot_pages);
326
327#if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \
328 defined(__mips__)
329 /*
330 * Allocate a bitmap to indicate that a random physical page
331 * needs to be included in a minidump.
332 *
333 * The amd64 port needs this to indicate which direct map pages
334 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
335 *
336 * However, i386 still needs this workspace internally within the
337 * minidump code. In theory, they are not needed on i386, but are
338 * included should the sf_buf code decide to use them.
339 */
340 last_pa = 0;
341 for (i = 0; dump_avail[i + 1] != 0; i += 2)
342 if (dump_avail[i + 1] > last_pa)
343 last_pa = dump_avail[i + 1];
344 page_range = last_pa / PAGE_SIZE;
345 vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
346 new_end -= vm_page_dump_size;
347 vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
348 new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
349 bzero((void *)vm_page_dump, vm_page_dump_size);
350#endif
351#ifdef __amd64__
352 /*
353 * Request that the physical pages underlying the message buffer be
354 * included in a crash dump. Since the message buffer is accessed
355 * through the direct map, they are not automatically included.
356 */
357 pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
358 last_pa = pa + round_page(msgbufsize);
359 while (pa < last_pa) {
360 dump_add_page(pa);
361 pa += PAGE_SIZE;
362 }
363#endif
364 /*
365 * Compute the number of pages of memory that will be available for
366 * use (taking into account the overhead of a page structure per
367 * page).
368 */
369 first_page = low_water / PAGE_SIZE;
370#ifdef VM_PHYSSEG_SPARSE
371 page_range = 0;
372 for (i = 0; phys_avail[i + 1] != 0; i += 2)
373 page_range += atop(phys_avail[i + 1] - phys_avail[i]);
374#elif defined(VM_PHYSSEG_DENSE)
375 page_range = high_water / PAGE_SIZE - first_page;
376#else
377#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
378#endif
379 end = new_end;
380
381 /*
382 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
383 */
384 vaddr += PAGE_SIZE;
385
386 /*
387 * Initialize the mem entry structures now, and put them in the free
388 * queue.
389 */
390 new_end = trunc_page(end - page_range * sizeof(struct vm_page));
391 mapped = pmap_map(&vaddr, new_end, end,
392 VM_PROT_READ | VM_PROT_WRITE);
393 vm_page_array = (vm_page_t) mapped;
394#if VM_NRESERVLEVEL > 0
395 /*
396 * Allocate memory for the reservation management system's data
397 * structures.
398 */
399 new_end = vm_reserv_startup(&vaddr, new_end, high_water);
400#endif
401#if defined(__amd64__) || defined(__mips__)
402 /*
403 * pmap_map on amd64 and mips can come out of the direct-map, not kvm
404 * like i386, so the pages must be tracked for a crashdump to include
405 * this data. This includes the vm_page_array and the early UMA
406 * bootstrap pages.
407 */
408 for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
409 dump_add_page(pa);
410#endif
411 phys_avail[biggestone + 1] = new_end;
412
413 /*
414 * Clear all of the page structures
415 */
416 bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
417 for (i = 0; i < page_range; i++)
418 vm_page_array[i].order = VM_NFREEORDER;
419 vm_page_array_size = page_range;
420
421 /*
422 * Initialize the physical memory allocator.
423 */
424 vm_phys_init();
425
426 /*
427 * Add every available physical page that is not blacklisted to
428 * the free lists.
429 */
430 cnt.v_page_count = 0;
431 cnt.v_free_count = 0;
432 list = getenv("vm.blacklist");
433 for (i = 0; phys_avail[i + 1] != 0; i += 2) {
434 pa = phys_avail[i];
435 last_pa = phys_avail[i + 1];
436 while (pa < last_pa) {
437 if (list != NULL &&
438 vm_page_blacklist_lookup(list, pa))
439 printf("Skipping page with pa 0x%jx\n",
440 (uintmax_t)pa);
441 else
442 vm_phys_add_page(pa);
443 pa += PAGE_SIZE;
444 }
445 }
446 freeenv(list);
447#if VM_NRESERVLEVEL > 0
448 /*
449 * Initialize the reservation management system.
450 */
451 vm_reserv_init();
452#endif
453 return (vaddr);
454}
455
456
457CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0);
458
459void
460vm_page_aflag_set(vm_page_t m, uint8_t bits)
461{
462 uint32_t *addr, val;
463
464 /*
465 * The PGA_WRITEABLE flag can only be set if the page is managed and
466 * VPO_BUSY. Currently, this flag is only set by pmap_enter().
467 */
468 KASSERT((bits & PGA_WRITEABLE) == 0 ||
469 (m->oflags & (VPO_UNMANAGED | VPO_BUSY)) == VPO_BUSY,
470 ("PGA_WRITEABLE and !VPO_BUSY"));
471
472 /*
473 * We want to use atomic updates for m->aflags, which is a
474 * byte wide. Not all architectures provide atomic operations
475 * on the single-byte destination. Punt and access the whole
476 * 4-byte word with an atomic update. Parallel non-atomic
477 * updates to the fields included in the update by proximity
478 * are handled properly by atomics.
479 */
480 addr = (void *)&m->aflags;
481 MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0);
482 val = bits;
483#if BYTE_ORDER == BIG_ENDIAN
484 val <<= 24;
485#endif
486 atomic_set_32(addr, val);
487}
488
489void
490vm_page_aflag_clear(vm_page_t m, uint8_t bits)
491{
492 uint32_t *addr, val;
493
494 /*
495 * The PGA_REFERENCED flag can only be cleared if the object
496 * containing the page is locked.
497 */
498 KASSERT((bits & PGA_REFERENCED) == 0 || VM_OBJECT_LOCKED(m->object),
499 ("PGA_REFERENCED and !VM_OBJECT_LOCKED"));
500
501 /*
502 * See the comment in vm_page_aflag_set().
503 */
504 addr = (void *)&m->aflags;
505 MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0);
506 val = bits;
507#if BYTE_ORDER == BIG_ENDIAN
508 val <<= 24;
509#endif
510 atomic_clear_32(addr, val);
511}
512
513void
514vm_page_reference(vm_page_t m)
515{
516
517 vm_page_aflag_set(m, PGA_REFERENCED);
518}
519
520void
521vm_page_busy(vm_page_t m)
522{
523
524 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
525 KASSERT((m->oflags & VPO_BUSY) == 0,
526 ("vm_page_busy: page already busy!!!"));
527 m->oflags |= VPO_BUSY;
528}
529
530/*
531 * vm_page_flash:
532 *
533 * wakeup anyone waiting for the page.
534 */
535void
536vm_page_flash(vm_page_t m)
537{
538
539 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
540 if (m->oflags & VPO_WANTED) {
541 m->oflags &= ~VPO_WANTED;
542 wakeup(m);
543 }
544}
545
546/*
547 * vm_page_wakeup:
548 *
549 * clear the VPO_BUSY flag and wakeup anyone waiting for the
550 * page.
551 *
552 */
553void
554vm_page_wakeup(vm_page_t m)
555{
556
557 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
558 KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!"));
559 m->oflags &= ~VPO_BUSY;
560 vm_page_flash(m);
561}
562
563void
564vm_page_io_start(vm_page_t m)
565{
566
567 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
568 m->busy++;
569}
570
571void
572vm_page_io_finish(vm_page_t m)
573{
574
575 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
576 KASSERT(m->busy > 0, ("vm_page_io_finish: page %p is not busy", m));
577 m->busy--;
578 if (m->busy == 0)
579 vm_page_flash(m);
580}
581
582/*
583 * Keep the page from being freed by the page daemon; this has
584 * much the same effect as wiring, but with much lower
585 * overhead, and should be used only for *very* temporary
586 * holding ("wiring").
587 */
588void
589vm_page_hold(vm_page_t mem)
590{
591
592 vm_page_lock_assert(mem, MA_OWNED);
593 mem->hold_count++;
594}
595
596void
597vm_page_unhold(vm_page_t mem)
598{
599
600 vm_page_lock_assert(mem, MA_OWNED);
601 --mem->hold_count;
602 KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
603 if (mem->hold_count == 0 && mem->queue == PQ_HOLD)
604 vm_page_free_toq(mem);
605}
606
607/*
608 * vm_page_unhold_pages:
609 *
610 * Unhold each of the pages that is referenced by the given array.
611 */
612void
613vm_page_unhold_pages(vm_page_t *ma, int count)
614{
615 struct mtx *mtx, *new_mtx;
616
617 mtx = NULL;
618 for (; count != 0; count--) {
619 /*
620 * Avoid releasing and reacquiring the same page lock.
621 */
622 new_mtx = vm_page_lockptr(*ma);
623 if (mtx != new_mtx) {
624 if (mtx != NULL)
625 mtx_unlock(mtx);
626 mtx = new_mtx;
627 mtx_lock(mtx);
628 }
629 vm_page_unhold(*ma);
630 ma++;
631 }
632 if (mtx != NULL)
633 mtx_unlock(mtx);
634}
635
636/*
637 * vm_page_getfake:
638 *
639 * Create a fictitious page with the specified physical address and
640 * memory attribute.  The memory attribute is the only machine-
641 * dependent aspect of a fictitious page that must be initialized.
642 */
643vm_page_t
644vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
645{
646 vm_page_t m;
647
648 m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
649 m->phys_addr = paddr;
650 m->queue = PQ_NONE;
651 /* Fictitious pages don't use "segind". */
652 m->flags = PG_FICTITIOUS;
653 /* Fictitious pages don't use "order" or "pool". */
654 m->oflags = VPO_BUSY | VPO_UNMANAGED;
655 m->wire_count = 1;
656 pmap_page_set_memattr(m, memattr);
657 return (m);
658}
659
660/*
661 * vm_page_putfake:
662 *
663 * Release a fictitious page.
664 */
665void
666vm_page_putfake(vm_page_t m)
667{
668
669 KASSERT((m->flags & PG_FICTITIOUS) != 0,
670 ("vm_page_putfake: bad page %p", m));
671 uma_zfree(fakepg_zone, m);
672}
673
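#ifdef EXAMPLE_ONLY
/*
 * Illustrative sketch (an editor's example, not part of vm_page.c):
 * pairing vm_page_getfake() with vm_page_putfake(), roughly as a device
 * pager might for a page of device memory.  The function name and the
 * physical address are hypothetical; VM_MEMATTR_DEFAULT is used only to
 * keep the example machine-independent.
 */
static void
example_fictitious_page(void)
{
	vm_page_t m;

	m = vm_page_getfake((vm_paddr_t)0xd0000000, VM_MEMATTR_DEFAULT);
	/* ... hand "m" to a pager or map it while it is needed ... */
	vm_page_putfake(m);
}
#endif /* EXAMPLE_ONLY */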
674/*
675 * vm_page_updatefake:
676 *
677 * Update the given fictitious page to the specified physical address and
678 * memory attribute.
679 */
680void
681vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
682{
683
684 KASSERT((m->flags & PG_FICTITIOUS) != 0,
685 ("vm_page_updatefake: bad page %p", m));
686 m->phys_addr = paddr;
687 pmap_page_set_memattr(m, memattr);
688}
689
690/*
691 * vm_page_free:
692 *
693 * Free a page.
694 */
695void
696vm_page_free(vm_page_t m)
697{
698
699 m->flags &= ~PG_ZERO;
700 vm_page_free_toq(m);
701}
702
703/*
704 * vm_page_free_zero:
705 *
706 * Free a page to the zeroed-pages queue.
707 */
708void
709vm_page_free_zero(vm_page_t m)
710{
711
712 m->flags |= PG_ZERO;
713 vm_page_free_toq(m);
714}
715
716/*
717 * vm_page_sleep:
718 *
719 * Sleep and release the page and page queues locks.
720 *
721 * The object containing the given page must be locked.
722 */
723void
724vm_page_sleep(vm_page_t m, const char *msg)
725{
726
727 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
728 if (mtx_owned(&vm_page_queue_mtx))
729 vm_page_unlock_queues();
730 if (mtx_owned(vm_page_lockptr(m)))
731 vm_page_unlock(m);
732
733 /*
734 * It's possible that while we sleep, the page will get
735 * unbusied and freed. If we are holding the object
736 * lock, we will assume we hold a reference to the object
737 * such that even if m->object changes, we can re-lock
738 * it.
739 */
740 m->oflags |= VPO_WANTED;
741 msleep(m, VM_OBJECT_MTX(m->object), PVM, msg, 0);
742}
743
744/*
745 * vm_page_dirty:
746 *
747 * Set all bits in the page's dirty field.
748 *
749 * The object containing the specified page must be locked if the
750 * call is made from the machine-independent layer.
751 *
752 * See vm_page_clear_dirty_mask().
753 */
754void
755vm_page_dirty(vm_page_t m)
756{
757
758 KASSERT((m->flags & PG_CACHED) == 0,
759 ("vm_page_dirty: page in cache!"));
760 KASSERT(!VM_PAGE_IS_FREE(m),
761 ("vm_page_dirty: page is free!"));
762 KASSERT(m->valid == VM_PAGE_BITS_ALL,
763 ("vm_page_dirty: page is invalid!"));
764 m->dirty = VM_PAGE_BITS_ALL;
765}
766
767/*
768 * vm_page_splay:
769 *
770 * Implements Sleator and Tarjan's top-down splay algorithm. Returns
771 * the vm_page containing the given pindex. If, however, that
772 * pindex is not found in the vm_object, returns a vm_page that is
773 * adjacent to the pindex, coming before or after it.
774 */
775vm_page_t
776vm_page_splay(vm_pindex_t pindex, vm_page_t root)
777{
778 struct vm_page dummy;
779 vm_page_t lefttreemax, righttreemin, y;
780
781 if (root == NULL)
782 return (root);
783 lefttreemax = righttreemin = &dummy;
784 for (;; root = y) {
785 if (pindex < root->pindex) {
786 if ((y = root->left) == NULL)
787 break;
788 if (pindex < y->pindex) {
789 /* Rotate right. */
790 root->left = y->right;
791 y->right = root;
792 root = y;
793 if ((y = root->left) == NULL)
794 break;
795 }
796 /* Link into the new root's right tree. */
797 righttreemin->left = root;
798 righttreemin = root;
799 } else if (pindex > root->pindex) {
800 if ((y = root->right) == NULL)
801 break;
802 if (pindex > y->pindex) {
803 /* Rotate left. */
804 root->right = y->left;
805 y->left = root;
806 root = y;
807 if ((y = root->right) == NULL)
808 break;
809 }
810 /* Link into the new root's left tree. */
811 lefttreemax->right = root;
812 lefttreemax = root;
813 } else
814 break;
815 }
816 /* Assemble the new root. */
817 lefttreemax->right = root->left;
818 righttreemin->left = root->right;
819 root->left = dummy.right;
820 root->right = dummy.left;
821 return (root);
822}
823
824/*
825 * vm_page_insert: [ internal use only ]
826 *
827 * Inserts the given mem entry into the object and object list.
828 *
829 * The pagetables are not updated but will presumably fault the page
830 * in if necessary, or if a kernel page the caller will at some point
831 * enter the page into the kernel's pmap. We are not allowed to block
832 * here so we *can't* do this anyway.
833 *
834 * The object and page must be locked.
835 * This routine may not block.
836 */
837void
838vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
839{
840 vm_page_t root;
841
842 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
843 if (m->object != NULL)
844 panic("vm_page_insert: page already inserted");
845
846 /*
847 * Record the object/offset pair in this page
848 */
849 m->object = object;
850 m->pindex = pindex;
851
852 /*
853 * Now link into the object's ordered list of backed pages.
854 */
855 root = object->root;
856 if (root == NULL) {
857 m->left = NULL;
858 m->right = NULL;
859 TAILQ_INSERT_TAIL(&object->memq, m, listq);
860 } else {
861 root = vm_page_splay(pindex, root);
862 if (pindex < root->pindex) {
863 m->left = root->left;
864 m->right = root;
865 root->left = NULL;
866 TAILQ_INSERT_BEFORE(root, m, listq);
867 } else if (pindex == root->pindex)
868 panic("vm_page_insert: offset already allocated");
869 else {
870 m->right = root->right;
871 m->left = root;
872 root->right = NULL;
873 TAILQ_INSERT_AFTER(&object->memq, root, m, listq);
874 }
875 }
876 object->root = m;
877
878 /*
879 * show that the object has one more resident page.
880 */
881 object->resident_page_count++;
882 /*
883 * Hold the vnode until the last page is released.
884 */
885 if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
886 vhold((struct vnode *)object->handle);
887
888 /*
889 * Since we are inserting a new and possibly dirty page,
890 * update the object's OBJ_MIGHTBEDIRTY flag.
891 */
892 if (m->aflags & PGA_WRITEABLE)
893 vm_object_set_writeable_dirty(object);
894}
895
896/*
897 * vm_page_remove:
898 * NOTE: used by device pager as well -wfj
899 *
900 * Removes the given mem entry from the object/offset-page
901 * table and the object page list, but does not invalidate/terminate
902 * the backing store.
903 *
904 * The object and page must be locked.
905 * The underlying pmap entry (if any) is NOT removed here.
906 * This routine may not block.
907 */
908void
909vm_page_remove(vm_page_t m)
910{
911 vm_object_t object;
912 vm_page_t next, prev, root;
913
914 if ((m->oflags & VPO_UNMANAGED) == 0)
915 vm_page_lock_assert(m, MA_OWNED);
916 if ((object = m->object) == NULL)
917 return;
918 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
919 if (m->oflags & VPO_BUSY) {
920 m->oflags &= ~VPO_BUSY;
921 vm_page_flash(m);
922 }
923
924 /*
925 * Now remove from the object's list of backed pages.
926 */
927 if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
928 /*
929 * Since the page's successor in the list is also its parent
930 * in the tree, its right subtree must be empty.
931 */
932 next->left = m->left;
933 KASSERT(m->right == NULL,
934 ("vm_page_remove: page %p has right child", m));
935 } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
936 prev->right == m) {
937 /*
938 * Since the page's predecessor in the list is also its parent
939 * in the tree, its left subtree must be empty.
940 */
941 KASSERT(m->left == NULL,
942 ("vm_page_remove: page %p has left child", m));
943 prev->right = m->right;
944 } else {
945 if (m != object->root)
946 vm_page_splay(m->pindex, object->root);
947 if (m->left == NULL)
948 root = m->right;
949 else if (m->right == NULL)
950 root = m->left;
951 else {
952 /*
953 * Move the page's successor to the root, because
954 * pages are usually removed in ascending order.
955 */
956 if (m->right != next)
957 vm_page_splay(m->pindex, m->right);
958 next->left = m->left;
959 root = next;
960 }
961 object->root = root;
962 }
963 TAILQ_REMOVE(&object->memq, m, listq);
964
965 /*
966 * And show that the object has one fewer resident page.
967 */
968 object->resident_page_count--;
969 /*
970 * The vnode may now be recycled.
971 */
972 if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
973 vdrop((struct vnode *)object->handle);
974
975 m->object = NULL;
976}
977
978/*
979 * vm_page_lookup:
980 *
981 * Returns the page associated with the object/offset
982 * pair specified; if none is found, NULL is returned.
983 *
984 * The object must be locked.
985 * This routine may not block.
986 * This is a critical path routine
987 */
988vm_page_t
989vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
990{
991 vm_page_t m;
992
993 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
994 if ((m = object->root) != NULL && m->pindex != pindex) {
995 m = vm_page_splay(pindex, m);
996 if ((object->root = m)->pindex != pindex)
997 m = NULL;
998 }
999 return (m);
1000}
1001
1002/*
1003 * vm_page_find_least:
1004 *
1005 * Returns the page associated with the object with least pindex
1006 * greater than or equal to the parameter pindex, or NULL.
1007 *
1008 * The object must be locked.
1009 * The routine may not block.
1010 */
1011vm_page_t
1012vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
1013{
1014 vm_page_t m;
1015
1016 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1017 if ((m = TAILQ_FIRST(&object->memq)) != NULL) {
1018 if (m->pindex < pindex) {
1019 m = vm_page_splay(pindex, object->root);
1020 if ((object->root = m)->pindex < pindex)
1021 m = TAILQ_NEXT(m, listq);
1022 }
1023 }
1024 return (m);
1025}
1026
1027/*
1028 * Returns the given page's successor (by pindex) within the object if it is
1029 * resident; if none is found, NULL is returned.
1030 *
1031 * The object must be locked.
1032 */
1033vm_page_t
1034vm_page_next(vm_page_t m)
1035{
1036 vm_page_t next;
1037
1038 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1039 if ((next = TAILQ_NEXT(m, listq)) != NULL &&
1040 next->pindex != m->pindex + 1)
1041 next = NULL;
1042 return (next);
1043}
1044
1045/*
1046 * Returns the given page's predecessor (by pindex) within the object if it is
1047 * resident; if none is found, NULL is returned.
1048 *
1049 * The object must be locked.
1050 */
1051vm_page_t
1052vm_page_prev(vm_page_t m)
1053{
1054 vm_page_t prev;
1055
1056 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1057 if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
1058 prev->pindex != m->pindex - 1)
1059 prev = NULL;
1060 return (prev);
1061}
1062
1063/*
1064 * vm_page_rename:
1065 *
1066 * Move the given memory entry from its
1067 * current object to the specified target object/offset.
1068 *
1069 * The object must be locked.
1070 * This routine may not block.
1071 *
1072 * Note: swap associated with the page must be invalidated by the move. We
1073 * have to do this for several reasons: (1) we aren't freeing the
1074 * page, (2) we are dirtying the page, (3) the VM system is probably
1075 * moving the page from object A to B, and will then later move
1076 * the backing store from A to B and we can't have a conflict.
1077 *
1078 * Note: we *always* dirty the page. It is necessary both for the
1079 * fact that we moved it, and because we may be invalidating
1080 * swap. If the page is on the cache, we have to deactivate it
1081 * or vm_page_dirty() will panic. Dirty pages are not allowed
1082 * on the cache.
1083 */
1084void
1085vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1086{
1087
1088 vm_page_remove(m);
1089 vm_page_insert(m, new_object, new_pindex);
1090 vm_page_dirty(m);
1091}
1092
1093/*
1094 * Convert all of the given object's cached pages that have a
1095 * pindex within the given range into free pages. If the value
1096 * zero is given for "end", then the range's upper bound is
1097 * infinity. If the given object is backed by a vnode and it
1098 * transitions from having one or more cached pages to none, the
1099 * vnode's hold count is reduced.
1100 */
1101void
1102vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1103{
1104 vm_page_t m, m_next;
1105 boolean_t empty;
1106
1107 mtx_lock(&vm_page_queue_free_mtx);
1108 if (__predict_false(object->cache == NULL)) {
1109 mtx_unlock(&vm_page_queue_free_mtx);
1110 return;
1111 }
1112 m = object->cache = vm_page_splay(start, object->cache);
1113 if (m->pindex < start) {
1114 if (m->right == NULL)
1115 m = NULL;
1116 else {
1117 m_next = vm_page_splay(start, m->right);
1118 m_next->left = m;
1119 m->right = NULL;
1120 m = object->cache = m_next;
1121 }
1122 }
1123
1124 /*
1125 * At this point, "m" is either (1) a reference to the page
1126 * with the least pindex that is greater than or equal to
1127 * "start" or (2) NULL.
1128 */
1129 for (; m != NULL && (m->pindex < end || end == 0); m = m_next) {
1130 /*
1131 * Find "m"'s successor and remove "m" from the
1132 * object's cache.
1133 */
1134 if (m->right == NULL) {
1135 object->cache = m->left;
1136 m_next = NULL;
1137 } else {
1138 m_next = vm_page_splay(start, m->right);
1139 m_next->left = m->left;
1140 object->cache = m_next;
1141 }
1142 /* Convert "m" to a free page. */
1143 m->object = NULL;
1144 m->valid = 0;
1145 /* Clear PG_CACHED and set PG_FREE. */
1146 m->flags ^= PG_CACHED | PG_FREE;
1147 KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
1148 ("vm_page_cache_free: page %p has inconsistent flags", m));
1149 cnt.v_cache_count--;
1150 cnt.v_free_count++;
1151 }
1152 empty = object->cache == NULL;
1153 mtx_unlock(&vm_page_queue_free_mtx);
1154 if (object->type == OBJT_VNODE && empty)
1155 vdrop(object->handle);
1156}
1157
1158/*
1159 * Returns the cached page that is associated with the given
1160 * object and offset. If, however, none exists, returns NULL.
1161 *
1162 * The free page queue must be locked.
1163 */
1164static inline vm_page_t
1165vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
1166{
1167 vm_page_t m;
1168
1169 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1170 if ((m = object->cache) != NULL && m->pindex != pindex) {
1171 m = vm_page_splay(pindex, m);
1172 if ((object->cache = m)->pindex != pindex)
1173 m = NULL;
1174 }
1175 return (m);
1176}
1177
1178/*
1179 * Remove the given cached page from its containing object's
1180 * collection of cached pages.
1181 *
1182 * The free page queue must be locked.
1183 */
1184void
1185vm_page_cache_remove(vm_page_t m)
1186{
1187 vm_object_t object;
1188 vm_page_t root;
1189
1190 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1191 KASSERT((m->flags & PG_CACHED) != 0,
1192 ("vm_page_cache_remove: page %p is not cached", m));
1193 object = m->object;
1194 if (m != object->cache) {
1195 root = vm_page_splay(m->pindex, object->cache);
1196 KASSERT(root == m,
1197 ("vm_page_cache_remove: page %p is not cached in object %p",
1198 m, object));
1199 }
1200 if (m->left == NULL)
1201 root = m->right;
1202 else if (m->right == NULL)
1203 root = m->left;
1204 else {
1205 root = vm_page_splay(m->pindex, m->left);
1206 root->right = m->right;
1207 }
1208 object->cache = root;
1209 m->object = NULL;
1210 cnt.v_cache_count--;
1211}
1212
1213/*
1214 * Transfer all of the cached pages with offset greater than or
1215 * equal to 'offidxstart' from the original object's cache to the
1216 * new object's cache. However, any cached pages with offset
1217 * greater than or equal to the new object's size are kept in the
1218 * original object. Initially, the new object's cache must be
1219 * empty. Offset 'offidxstart' in the original object must
1220 * correspond to offset zero in the new object.
1221 *
1222 * The new object must be locked.
1223 */
1224void
1225vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
1226 vm_object_t new_object)
1227{
1228 vm_page_t m, m_next;
1229
1230 /*
1231 * Insertion into an object's collection of cached pages
1232 * requires the object to be locked. In contrast, removal does
1233 * not.
1234 */
1235 VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
1236 KASSERT(new_object->cache == NULL,
1237 ("vm_page_cache_transfer: object %p has cached pages",
1238 new_object));
1239 mtx_lock(&vm_page_queue_free_mtx);
1240 if ((m = orig_object->cache) != NULL) {
1241 /*
1242 * Transfer all of the pages with offset greater than or
1243 * equal to 'offidxstart' from the original object's
1244 * cache to the new object's cache.
1245 */
1246 m = vm_page_splay(offidxstart, m);
1247 if (m->pindex < offidxstart) {
1248 orig_object->cache = m;
1249 new_object->cache = m->right;
1250 m->right = NULL;
1251 } else {
1252 orig_object->cache = m->left;
1253 new_object->cache = m;
1254 m->left = NULL;
1255 }
1256 while ((m = new_object->cache) != NULL) {
1257 if ((m->pindex - offidxstart) >= new_object->size) {
1258 /*
1259 * Return all of the cached pages with
1260 * offset greater than or equal to the
1261 * new object's size to the original
1262 * object's cache.
1263 */
1264 new_object->cache = m->left;
1265 m->left = orig_object->cache;
1266 orig_object->cache = m;
1267 break;
1268 }
1269 m_next = vm_page_splay(m->pindex, m->right);
1270 /* Update the page's object and offset. */
1271 m->object = new_object;
1272 m->pindex -= offidxstart;
1273 if (m_next == NULL)
1274 break;
1275 m->right = NULL;
1276 m_next->left = m;
1277 new_object->cache = m_next;
1278 }
1279 KASSERT(new_object->cache == NULL ||
1280 new_object->type == OBJT_SWAP,
1281 ("vm_page_cache_transfer: object %p's type is incompatible"
1282 " with cached pages", new_object));
1283 }
1284 mtx_unlock(&vm_page_queue_free_mtx);
1285}
1286
1287/*
1288 * vm_page_alloc:
1289 *
1290 * Allocate and return a page that is associated with the specified
1291 * object and offset pair. By default, this page has the flag VPO_BUSY
1292 * set.
1293 *
1294 * The caller must always specify an allocation class.
1295 *
1296 * allocation classes:
1297 * VM_ALLOC_NORMAL normal process request
1298 * VM_ALLOC_SYSTEM system *really* needs a page
1299 * VM_ALLOC_INTERRUPT interrupt time request
1300 *
1301 * optional allocation flags:
1302 * VM_ALLOC_COUNT(number) the number of additional pages that the caller
1303 * intends to allocate
1304 * VM_ALLOC_IFCACHED return page only if it is cached
1305 * VM_ALLOC_IFNOTCACHED return NULL, do not reactivate if the page
1306 * is cached
1307 * VM_ALLOC_NOBUSY do not set the flag VPO_BUSY on the page
81
82#include "opt_vm.h"
83
84#include <sys/param.h>
85#include <sys/systm.h>
86#include <sys/lock.h>
87#include <sys/kernel.h>
88#include <sys/limits.h>
89#include <sys/malloc.h>
90#include <sys/msgbuf.h>
91#include <sys/mutex.h>
92#include <sys/proc.h>
93#include <sys/sysctl.h>
94#include <sys/vmmeter.h>
95#include <sys/vnode.h>
96
97#include <vm/vm.h>
98#include <vm/pmap.h>
99#include <vm/vm_param.h>
100#include <vm/vm_kern.h>
101#include <vm/vm_object.h>
102#include <vm/vm_page.h>
103#include <vm/vm_pageout.h>
104#include <vm/vm_pager.h>
105#include <vm/vm_phys.h>
106#include <vm/vm_reserv.h>
107#include <vm/vm_extern.h>
108#include <vm/uma.h>
109#include <vm/uma_int.h>
110
111#include <machine/md_var.h>
112
113/*
114 * Associated with page of user-allocatable memory is a
115 * page structure.
116 */
117
118struct vpgqueues vm_page_queues[PQ_COUNT];
119struct vpglocks vm_page_queue_lock;
120struct vpglocks vm_page_queue_free_lock;
121
122struct vpglocks pa_lock[PA_LOCK_COUNT];
123
124vm_page_t vm_page_array = 0;
125int vm_page_array_size = 0;
126long first_page = 0;
127int vm_page_zero_count = 0;
128
129static int boot_pages = UMA_BOOT_PAGES;
130TUNABLE_INT("vm.boot_pages", &boot_pages);
131SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0,
132 "number of pages allocated for bootstrapping the VM system");
133
134static int pa_tryrelock_restart;
135SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
136 &pa_tryrelock_restart, 0, "Number of tryrelock restarts");
137
138static uma_zone_t fakepg_zone;
139
140static struct vnode *vm_page_alloc_init(vm_page_t m);
141static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
142static void vm_page_queue_remove(int queue, vm_page_t m);
143static void vm_page_enqueue(int queue, vm_page_t m);
144static void vm_page_init_fakepg(void *dummy);
145
146SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
147
148static void
149vm_page_init_fakepg(void *dummy)
150{
151
152 fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
153 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
154}
155
156/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
157#if PAGE_SIZE == 32768
158#ifdef CTASSERT
159CTASSERT(sizeof(u_long) >= 8);
160#endif
161#endif
162
163/*
164 * Try to acquire a physical address lock while a pmap is locked. If we
165 * fail to trylock we unlock and lock the pmap directly and cache the
166 * locked pa in *locked. The caller should then restart their loop in case
167 * the virtual to physical mapping has changed.
168 */
169int
170vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked)
171{
172 vm_paddr_t lockpa;
173
174 lockpa = *locked;
175 *locked = pa;
176 if (lockpa) {
177 PA_LOCK_ASSERT(lockpa, MA_OWNED);
178 if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa))
179 return (0);
180 PA_UNLOCK(lockpa);
181 }
182 if (PA_TRYLOCK(pa))
183 return (0);
184 PMAP_UNLOCK(pmap);
185 atomic_add_int(&pa_tryrelock_restart, 1);
186 PA_LOCK(pa);
187 PMAP_LOCK(pmap);
188 return (EAGAIN);
189}
190
191/*
192 * vm_set_page_size:
193 *
194 * Sets the page size, perhaps based upon the memory
195 * size. Must be called before any use of page-size
196 * dependent functions.
197 */
198void
199vm_set_page_size(void)
200{
201 if (cnt.v_page_size == 0)
202 cnt.v_page_size = PAGE_SIZE;
203 if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0)
204 panic("vm_set_page_size: page size not a power of two");
205}
206
207/*
208 * vm_page_blacklist_lookup:
209 *
210 * See if a physical address in this page has been listed
211 * in the blacklist tunable. Entries in the tunable are
212 * separated by spaces or commas. If an invalid integer is
213 * encountered then the rest of the string is skipped.
214 */
215static int
216vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
217{
218 vm_paddr_t bad;
219 char *cp, *pos;
220
221 for (pos = list; *pos != '\0'; pos = cp) {
222 bad = strtoq(pos, &cp, 0);
223 if (*cp != '\0') {
224 if (*cp == ' ' || *cp == ',') {
225 cp++;
226 if (cp == pos)
227 continue;
228 } else
229 break;
230 }
231 if (pa == trunc_page(bad))
232 return (1);
233 }
234 return (0);
235}
236
237/*
238 * vm_page_startup:
239 *
240 * Initializes the resident memory module.
241 *
242 * Allocates memory for the page cells, and
243 * for the object/offset-to-page hash table headers.
244 * Each page cell is initialized and placed on the free list.
245 */
246vm_offset_t
247vm_page_startup(vm_offset_t vaddr)
248{
249 vm_offset_t mapped;
250 vm_paddr_t page_range;
251 vm_paddr_t new_end;
252 int i;
253 vm_paddr_t pa;
254 vm_paddr_t last_pa;
255 char *list;
256
257 /* the biggest memory array is the second group of pages */
258 vm_paddr_t end;
259 vm_paddr_t biggestsize;
260 vm_paddr_t low_water, high_water;
261 int biggestone;
262
263 biggestsize = 0;
264 biggestone = 0;
265 vaddr = round_page(vaddr);
266
267 for (i = 0; phys_avail[i + 1]; i += 2) {
268 phys_avail[i] = round_page(phys_avail[i]);
269 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
270 }
271
272 low_water = phys_avail[0];
273 high_water = phys_avail[1];
274
275 for (i = 0; phys_avail[i + 1]; i += 2) {
276 vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
277
278 if (size > biggestsize) {
279 biggestone = i;
280 biggestsize = size;
281 }
282 if (phys_avail[i] < low_water)
283 low_water = phys_avail[i];
284 if (phys_avail[i + 1] > high_water)
285 high_water = phys_avail[i + 1];
286 }
287
288#ifdef XEN
289 low_water = 0;
290#endif
291
292 end = phys_avail[biggestone+1];
293
294 /*
295 * Initialize the locks.
296 */
297 mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF |
298 MTX_RECURSE);
299 mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL,
300 MTX_DEF);
301
302 /* Setup page locks. */
303 for (i = 0; i < PA_LOCK_COUNT; i++)
304 mtx_init(&pa_lock[i].data, "page lock", NULL, MTX_DEF);
305
306 /*
307 * Initialize the queue headers for the hold queue, the active queue,
308 * and the inactive queue.
309 */
310 for (i = 0; i < PQ_COUNT; i++)
311 TAILQ_INIT(&vm_page_queues[i].pl);
312 vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
313 vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
314 vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count;
315
316 /*
317 * Allocate memory for use when boot strapping the kernel memory
318 * allocator.
319 */
320 new_end = end - (boot_pages * UMA_SLAB_SIZE);
321 new_end = trunc_page(new_end);
322 mapped = pmap_map(&vaddr, new_end, end,
323 VM_PROT_READ | VM_PROT_WRITE);
324 bzero((void *)mapped, end - new_end);
325 uma_startup((void *)mapped, boot_pages);
326
327#if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \
328 defined(__mips__)
329 /*
330 * Allocate a bitmap to indicate that a random physical page
331 * needs to be included in a minidump.
332 *
333 * The amd64 port needs this to indicate which direct map pages
334 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
335 *
336 * However, i386 still needs this workspace internally within the
337 * minidump code. In theory, they are not needed on i386, but are
338 * included should the sf_buf code decide to use them.
339 */
340 last_pa = 0;
341 for (i = 0; dump_avail[i + 1] != 0; i += 2)
342 if (dump_avail[i + 1] > last_pa)
343 last_pa = dump_avail[i + 1];
344 page_range = last_pa / PAGE_SIZE;
345 vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
346 new_end -= vm_page_dump_size;
347 vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
348 new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
349 bzero((void *)vm_page_dump, vm_page_dump_size);
350#endif
351#ifdef __amd64__
352 /*
353 * Request that the physical pages underlying the message buffer be
354 * included in a crash dump. Since the message buffer is accessed
355 * through the direct map, they are not automatically included.
356 */
357 pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
358 last_pa = pa + round_page(msgbufsize);
359 while (pa < last_pa) {
360 dump_add_page(pa);
361 pa += PAGE_SIZE;
362 }
363#endif
364 /*
365 * Compute the number of pages of memory that will be available for
366 * use (taking into account the overhead of a page structure per
367 * page).
368 */
369 first_page = low_water / PAGE_SIZE;
370#ifdef VM_PHYSSEG_SPARSE
371 page_range = 0;
372 for (i = 0; phys_avail[i + 1] != 0; i += 2)
373 page_range += atop(phys_avail[i + 1] - phys_avail[i]);
374#elif defined(VM_PHYSSEG_DENSE)
375 page_range = high_water / PAGE_SIZE - first_page;
376#else
377#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
378#endif
379 end = new_end;
380
381 /*
382 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
383 */
384 vaddr += PAGE_SIZE;
385
386 /*
387 * Initialize the mem entry structures now, and put them in the free
388 * queue.
389 */
390 new_end = trunc_page(end - page_range * sizeof(struct vm_page));
391 mapped = pmap_map(&vaddr, new_end, end,
392 VM_PROT_READ | VM_PROT_WRITE);
393 vm_page_array = (vm_page_t) mapped;
394#if VM_NRESERVLEVEL > 0
395 /*
396 * Allocate memory for the reservation management system's data
397 * structures.
398 */
399 new_end = vm_reserv_startup(&vaddr, new_end, high_water);
400#endif
401#if defined(__amd64__) || defined(__mips__)
402 /*
403 * pmap_map on amd64 and mips can come out of the direct-map, not kvm
404 * like i386, so the pages must be tracked for a crashdump to include
405 * this data. This includes the vm_page_array and the early UMA
406 * bootstrap pages.
407 */
408 for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
409 dump_add_page(pa);
410#endif
411 phys_avail[biggestone + 1] = new_end;
412
413 /*
414 * Clear all of the page structures
415 */
416 bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
417 for (i = 0; i < page_range; i++)
418 vm_page_array[i].order = VM_NFREEORDER;
419 vm_page_array_size = page_range;
420
421 /*
422 * Initialize the physical memory allocator.
423 */
424 vm_phys_init();
425
426 /*
427 * Add every available physical page that is not blacklisted to
428 * the free lists.
429 */
430 cnt.v_page_count = 0;
431 cnt.v_free_count = 0;
432 list = getenv("vm.blacklist");
433 for (i = 0; phys_avail[i + 1] != 0; i += 2) {
434 pa = phys_avail[i];
435 last_pa = phys_avail[i + 1];
436 while (pa < last_pa) {
437 if (list != NULL &&
438 vm_page_blacklist_lookup(list, pa))
439 printf("Skipping page with pa 0x%jx\n",
440 (uintmax_t)pa);
441 else
442 vm_phys_add_page(pa);
443 pa += PAGE_SIZE;
444 }
445 }
446 freeenv(list);
447#if VM_NRESERVLEVEL > 0
448 /*
449 * Initialize the reservation management system.
450 */
451 vm_reserv_init();
452#endif
453 return (vaddr);
454}
455
456
457CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0);
458
459void
460vm_page_aflag_set(vm_page_t m, uint8_t bits)
461{
462 uint32_t *addr, val;
463
464 /*
465 * The PGA_WRITEABLE flag can only be set if the page is managed and
466 * VPO_BUSY. Currently, this flag is only set by pmap_enter().
467 */
468 KASSERT((bits & PGA_WRITEABLE) == 0 ||
469 (m->oflags & (VPO_UNMANAGED | VPO_BUSY)) == VPO_BUSY,
470 ("PGA_WRITEABLE and !VPO_BUSY"));
471
472 /*
473 * We want to use atomic updates for m->aflags, which is a
474 * byte wide. Not all architectures provide atomic operations
475 * on the single-byte destination. Punt and access the whole
476 * 4-byte word with an atomic update. Parallel non-atomic
477 * updates to the fields included in the update by proximity
478 * are handled properly by atomics.
479 */
480 addr = (void *)&m->aflags;
481 MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0);
482 val = bits;
483#if BYTE_ORDER == BIG_ENDIAN
484 val <<= 24;
485#endif
486 atomic_set_32(addr, val);
487}
488
489void
490vm_page_aflag_clear(vm_page_t m, uint8_t bits)
491{
492 uint32_t *addr, val;
493
494 /*
495 * The PGA_REFERENCED flag can only be cleared if the object
496 * containing the page is locked.
497 */
498 KASSERT((bits & PGA_REFERENCED) == 0 || VM_OBJECT_LOCKED(m->object),
499 ("PGA_REFERENCED and !VM_OBJECT_LOCKED"));
500
501 /*
502 * See the comment in vm_page_aflag_set().
503 */
504 addr = (void *)&m->aflags;
505 MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0);
506 val = bits;
507#if BYTE_ORDER == BIG_ENDIAN
508 val <<= 24;
509#endif
510 atomic_clear_32(addr, val);
511}
512
513void
514vm_page_reference(vm_page_t m)
515{
516
517 vm_page_aflag_set(m, PGA_REFERENCED);
518}
519
520void
521vm_page_busy(vm_page_t m)
522{
523
524 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
525 KASSERT((m->oflags & VPO_BUSY) == 0,
526 ("vm_page_busy: page already busy!!!"));
527 m->oflags |= VPO_BUSY;
528}
529
530/*
531 * vm_page_flash:
532 *
533 * wakeup anyone waiting for the page.
534 */
535void
536vm_page_flash(vm_page_t m)
537{
538
539 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
540 if (m->oflags & VPO_WANTED) {
541 m->oflags &= ~VPO_WANTED;
542 wakeup(m);
543 }
544}
545
546/*
547 * vm_page_wakeup:
548 *
549 * clear the VPO_BUSY flag and wakeup anyone waiting for the
550 * page.
551 *
552 */
553void
554vm_page_wakeup(vm_page_t m)
555{
556
557 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
558 KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!"));
559 m->oflags &= ~VPO_BUSY;
560 vm_page_flash(m);
561}
562
563void
564vm_page_io_start(vm_page_t m)
565{
566
567 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
568 m->busy++;
569}
570
571void
572vm_page_io_finish(vm_page_t m)
573{
574
575 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
576 KASSERT(m->busy > 0, ("vm_page_io_finish: page %p is not busy", m));
577 m->busy--;
578 if (m->busy == 0)
579 vm_page_flash(m);
580}
581
582/*
583 * Keep page from being freed by the page daemon
584 * much of the same effect as wiring, except much lower
585 * overhead and should be used only for *very* temporary
586 * holding ("wiring").
587 */
588void
589vm_page_hold(vm_page_t mem)
590{
591
592 vm_page_lock_assert(mem, MA_OWNED);
593 mem->hold_count++;
594}
595
596void
597vm_page_unhold(vm_page_t mem)
598{
599
600 vm_page_lock_assert(mem, MA_OWNED);
601 --mem->hold_count;
602 KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
603 if (mem->hold_count == 0 && mem->queue == PQ_HOLD)
604 vm_page_free_toq(mem);
605}
606
607/*
608 * vm_page_unhold_pages:
609 *
610 * Unhold each of the pages that is referenced by the given array.
611 */
612void
613vm_page_unhold_pages(vm_page_t *ma, int count)
614{
615 struct mtx *mtx, *new_mtx;
616
617 mtx = NULL;
618 for (; count != 0; count--) {
619 /*
620 * Avoid releasing and reacquiring the same page lock.
621 */
622 new_mtx = vm_page_lockptr(*ma);
623 if (mtx != new_mtx) {
624 if (mtx != NULL)
625 mtx_unlock(mtx);
626 mtx = new_mtx;
627 mtx_lock(mtx);
628 }
629 vm_page_unhold(*ma);
630 ma++;
631 }
632 if (mtx != NULL)
633 mtx_unlock(mtx);
634}
635
636/*
637 * vm_page_getfake:
638 *
639 * Create a fictitious page with the specified physical address and
640 * memory attribute. The memory attribute is the only the machine-
641 * dependent aspect of a fictitious page that must be initialized.
642 */
643vm_page_t
644vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
645{
646 vm_page_t m;
647
648 m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
649 m->phys_addr = paddr;
650 m->queue = PQ_NONE;
651 /* Fictitious pages don't use "segind". */
652 m->flags = PG_FICTITIOUS;
653 /* Fictitious pages don't use "order" or "pool". */
654 m->oflags = VPO_BUSY | VPO_UNMANAGED;
655 m->wire_count = 1;
656 pmap_page_set_memattr(m, memattr);
657 return (m);
658}
659
660/*
661 * vm_page_putfake:
662 *
663 * Release a fictitious page.
664 */
665void
666vm_page_putfake(vm_page_t m)
667{
668
669 KASSERT((m->flags & PG_FICTITIOUS) != 0,
670 ("vm_page_putfake: bad page %p", m));
671 uma_zfree(fakepg_zone, m);
672}
673
674/*
675 * vm_page_updatefake:
676 *
677 * Update the given fictitious page to the specified physical address and
678 * memory attribute.
679 */
680void
681vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
682{
683
684 KASSERT((m->flags & PG_FICTITIOUS) != 0,
685 ("vm_page_updatefake: bad page %p", m));
686 m->phys_addr = paddr;
687 pmap_page_set_memattr(m, memattr);
688}
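
/*
 * Illustrative sketch (not part of the original file): wrapping a raw
 * physical address in a fictitious page and releasing it again, in the
 * style of a device-pager consumer.  The physical address is assumed to
 * come from the caller; VM_MEMATTR_DEFAULT keeps the example machine-
 * independent.
 */
static void
example_fake_page(vm_paddr_t paddr)
{
	vm_page_t m;

	m = vm_page_getfake(paddr, VM_MEMATTR_DEFAULT);
	/* ... hand "m" to code that expects a vm_page_t ... */
	vm_page_putfake(m);
}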
689
690/*
691 * vm_page_free:
692 *
693 * Free a page.
694 */
695void
696vm_page_free(vm_page_t m)
697{
698
699 m->flags &= ~PG_ZERO;
700 vm_page_free_toq(m);
701}
702
703/*
704 * vm_page_free_zero:
705 *
706 * Free a page to the zeroed-pages queue.
707 */
708void
709vm_page_free_zero(vm_page_t m)
710{
711
712 m->flags |= PG_ZERO;
713 vm_page_free_toq(m);
714}
715
716/*
717 * vm_page_sleep:
718 *
719 * Sleep and release the page and page queues locks.
720 *
721 * The object containing the given page must be locked.
722 */
723void
724vm_page_sleep(vm_page_t m, const char *msg)
725{
726
727 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
728 if (mtx_owned(&vm_page_queue_mtx))
729 vm_page_unlock_queues();
730 if (mtx_owned(vm_page_lockptr(m)))
731 vm_page_unlock(m);
732
733 /*
734 * It's possible that while we sleep, the page will get
735 * unbusied and freed. If we are holding the object
736 * lock, we will assume we hold a reference to the object
737 * such that even if m->object changes, we can re-lock
738 * it.
739 */
740 m->oflags |= VPO_WANTED;
741 msleep(m, VM_OBJECT_MTX(m->object), PVM, msg, 0);
742}
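
/*
 * Illustrative sketch (not part of the original file): the usual caller
 * pattern around vm_page_sleep().  Because the page may be freed while
 * we sleep, the lookup is redone after each wakeup; once the page is no
 * longer busy it is busied for the caller before the object is unlocked.
 */
static vm_page_t
example_lookup_and_busy(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	VM_OBJECT_LOCK(object);
	while ((m = vm_page_lookup(object, pindex)) != NULL &&
	    ((m->oflags & VPO_BUSY) != 0 || m->busy != 0))
		vm_page_sleep(m, "expgsl");
	if (m != NULL)
		vm_page_busy(m);
	VM_OBJECT_UNLOCK(object);
	return (m);
}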
743
744/*
745 * vm_page_dirty:
746 *
747 * Set all bits in the page's dirty field.
748 *
749 * The object containing the specified page must be locked if the
750 * call is made from the machine-independent layer.
751 *
752 * See vm_page_clear_dirty_mask().
753 */
754void
755vm_page_dirty(vm_page_t m)
756{
757
758 KASSERT((m->flags & PG_CACHED) == 0,
759 ("vm_page_dirty: page in cache!"));
760 KASSERT(!VM_PAGE_IS_FREE(m),
761 ("vm_page_dirty: page is free!"));
762 KASSERT(m->valid == VM_PAGE_BITS_ALL,
763 ("vm_page_dirty: page is invalid!"));
764 m->dirty = VM_PAGE_BITS_ALL;
765}
766
767/*
768 * vm_page_splay:
769 *
770 * Implements Sleator and Tarjan's top-down splay algorithm. Returns
771 * the vm_page containing the given pindex. If, however, that
772 * pindex is not found in the vm_object, returns a vm_page that is
773 * adjacent to the pindex, coming before or after it.
774 */
775vm_page_t
776vm_page_splay(vm_pindex_t pindex, vm_page_t root)
777{
778 struct vm_page dummy;
779 vm_page_t lefttreemax, righttreemin, y;
780
781 if (root == NULL)
782 return (root);
783 lefttreemax = righttreemin = &dummy;
784 for (;; root = y) {
785 if (pindex < root->pindex) {
786 if ((y = root->left) == NULL)
787 break;
788 if (pindex < y->pindex) {
789 /* Rotate right. */
790 root->left = y->right;
791 y->right = root;
792 root = y;
793 if ((y = root->left) == NULL)
794 break;
795 }
796 /* Link into the new root's right tree. */
797 righttreemin->left = root;
798 righttreemin = root;
799 } else if (pindex > root->pindex) {
800 if ((y = root->right) == NULL)
801 break;
802 if (pindex > y->pindex) {
803 /* Rotate left. */
804 root->right = y->left;
805 y->left = root;
806 root = y;
807 if ((y = root->right) == NULL)
808 break;
809 }
810 /* Link into the new root's left tree. */
811 lefttreemax->right = root;
812 lefttreemax = root;
813 } else
814 break;
815 }
816 /* Assemble the new root. */
817 lefttreemax->right = root->left;
818 righttreemin->left = root->right;
819 root->left = dummy.right;
820 root->right = dummy.left;
821 return (root);
822}
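
/*
 * Illustrative sketch (not part of the original file): the caller-side
 * pattern for vm_page_splay().  The splay moves the nearest page to the
 * root, so the caller must store the returned page back into the tree's
 * root pointer before testing whether it is an exact match; this mirrors
 * vm_page_lookup() below.
 */
static vm_page_t
example_splay_lookup(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	if ((m = object->root) != NULL && m->pindex != pindex) {
		m = vm_page_splay(pindex, m);
		if ((object->root = m)->pindex != pindex)
			m = NULL;
	}
	return (m);
}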
823
824/*
825 * vm_page_insert: [ internal use only ]
826 *
827 * Inserts the given mem entry into the object and object list.
828 *
829 * The page tables are not updated; the page will presumably be
830 * faulted in if necessary, or, for a kernel page, the caller will at
831 * some point enter it into the kernel's pmap. We are not allowed to
832 * block here, so we could not update the pmap anyway.
833 *
834 * The object and page must be locked.
835 * This routine may not block.
836 */
837void
838vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
839{
840 vm_page_t root;
841
842 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
843 if (m->object != NULL)
844 panic("vm_page_insert: page already inserted");
845
846 /*
847 * Record the object/offset pair in this page
848 */
849 m->object = object;
850 m->pindex = pindex;
851
852 /*
853 * Now link into the object's ordered list of backed pages.
854 */
855 root = object->root;
856 if (root == NULL) {
857 m->left = NULL;
858 m->right = NULL;
859 TAILQ_INSERT_TAIL(&object->memq, m, listq);
860 } else {
861 root = vm_page_splay(pindex, root);
862 if (pindex < root->pindex) {
863 m->left = root->left;
864 m->right = root;
865 root->left = NULL;
866 TAILQ_INSERT_BEFORE(root, m, listq);
867 } else if (pindex == root->pindex)
868 panic("vm_page_insert: offset already allocated");
869 else {
870 m->right = root->right;
871 m->left = root;
872 root->right = NULL;
873 TAILQ_INSERT_AFTER(&object->memq, root, m, listq);
874 }
875 }
876 object->root = m;
877
878 /*
879 * show that the object has one more resident page.
880 */
881 object->resident_page_count++;
882 /*
883 * Hold the vnode until the last page is released.
884 */
885 if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
886 vhold((struct vnode *)object->handle);
887
888 /*
889 * Since we are inserting a new and possibly dirty page,
890 * update the object's OBJ_MIGHTBEDIRTY flag.
891 */
892 if (m->aflags & PGA_WRITEABLE)
893 vm_object_set_writeable_dirty(object);
894}
895
896/*
897 * vm_page_remove:
898 * NOTE: used by device pager as well -wfj
899 *
900 * Removes the given mem entry from the object/offset-page
901 * table and the object page list, but does not invalidate/terminate
902 * the backing store.
903 *
904 * The object and page must be locked.
905 * The underlying pmap entry (if any) is NOT removed here.
906 * This routine may not block.
907 */
908void
909vm_page_remove(vm_page_t m)
910{
911 vm_object_t object;
912 vm_page_t next, prev, root;
913
914 if ((m->oflags & VPO_UNMANAGED) == 0)
915 vm_page_lock_assert(m, MA_OWNED);
916 if ((object = m->object) == NULL)
917 return;
918 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
919 if (m->oflags & VPO_BUSY) {
920 m->oflags &= ~VPO_BUSY;
921 vm_page_flash(m);
922 }
923
924 /*
925 * Now remove from the object's list of backed pages.
926 */
927 if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
928 /*
929 * Since the page's successor in the list is also its parent
930 * in the tree, its right subtree must be empty.
931 */
932 next->left = m->left;
933 KASSERT(m->right == NULL,
934 ("vm_page_remove: page %p has right child", m));
935 } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
936 prev->right == m) {
937 /*
938 * Since the page's predecessor in the list is also its parent
939 * in the tree, its left subtree must be empty.
940 */
941 KASSERT(m->left == NULL,
942 ("vm_page_remove: page %p has left child", m));
943 prev->right = m->right;
944 } else {
945 if (m != object->root)
946 vm_page_splay(m->pindex, object->root);
947 if (m->left == NULL)
948 root = m->right;
949 else if (m->right == NULL)
950 root = m->left;
951 else {
952 /*
953 * Move the page's successor to the root, because
954 * pages are usually removed in ascending order.
955 */
956 if (m->right != next)
957 vm_page_splay(m->pindex, m->right);
958 next->left = m->left;
959 root = next;
960 }
961 object->root = root;
962 }
963 TAILQ_REMOVE(&object->memq, m, listq);
964
965 /*
966 * And show that the object has one fewer resident page.
967 */
968 object->resident_page_count--;
969 /*
970 * The vnode may now be recycled.
971 */
972 if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
973 vdrop((struct vnode *)object->handle);
974
975 m->object = NULL;
976}
977
978/*
979 * vm_page_lookup:
980 *
981 * Returns the page associated with the object/offset
982 * pair specified; if none is found, NULL is returned.
983 *
984 * The object must be locked.
985 * This routine may not block.
986 * This is a critical path routine.
987 */
988vm_page_t
989vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
990{
991 vm_page_t m;
992
993 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
994 if ((m = object->root) != NULL && m->pindex != pindex) {
995 m = vm_page_splay(pindex, m);
996 if ((object->root = m)->pindex != pindex)
997 m = NULL;
998 }
999 return (m);
1000}
1001
1002/*
1003 * vm_page_find_least:
1004 *
1005 * Returns the page associated with the object with least pindex
1006 * greater than or equal to the parameter pindex, or NULL.
1007 *
1008 * The object must be locked.
1009 * The routine may not block.
1010 */
1011vm_page_t
1012vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
1013{
1014 vm_page_t m;
1015
1016 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1017 if ((m = TAILQ_FIRST(&object->memq)) != NULL) {
1018 if (m->pindex < pindex) {
1019 m = vm_page_splay(pindex, object->root);
1020 if ((object->root = m)->pindex < pindex)
1021 m = TAILQ_NEXT(m, listq);
1022 }
1023 }
1024 return (m);
1025}
1026
1027/*
1028 * Returns the given page's successor (by pindex) within the object if it is
1029 * resident; if none is found, NULL is returned.
1030 *
1031 * The object must be locked.
1032 */
1033vm_page_t
1034vm_page_next(vm_page_t m)
1035{
1036 vm_page_t next;
1037
1038 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1039 if ((next = TAILQ_NEXT(m, listq)) != NULL &&
1040 next->pindex != m->pindex + 1)
1041 next = NULL;
1042 return (next);
1043}
1044
1045/*
1046 * Returns the given page's predecessor (by pindex) within the object if it is
1047 * resident; if none is found, NULL is returned.
1048 *
1049 * The object must be locked.
1050 */
1051vm_page_t
1052vm_page_prev(vm_page_t m)
1053{
1054 vm_page_t prev;
1055
1056 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1057 if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
1058 prev->pindex != m->pindex - 1)
1059 prev = NULL;
1060 return (prev);
1061}
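
/*
 * Illustrative sketch (not part of the original file): measuring the
 * length of a run of consecutively indexed resident pages starting at
 * "m" by walking vm_page_next(), e.g. to size a read-ahead cluster.
 * The caller is assumed to hold the object lock and to pass max >= 1.
 */
static int
example_run_length(vm_page_t m, int max)
{
	int run;

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	for (run = 1; run < max && (m = vm_page_next(m)) != NULL; run++)
		continue;
	return (run);
}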
1062
1063/*
1064 * vm_page_rename:
1065 *
1066 * Move the given memory entry from its
1067 * current object to the specified target object/offset.
1068 *
1069 * The object must be locked.
1070 * This routine may not block.
1071 *
1072 * Note: swap associated with the page must be invalidated by the move. We
1073 * have to do this for several reasons: (1) we aren't freeing the
1074 * page, (2) we are dirtying the page, (3) the VM system is probably
1075 * moving the page from object A to B, and will then later move
1076 * the backing store from A to B and we can't have a conflict.
1077 *
1078 * Note: we *always* dirty the page. It is necessary both for the
1079 * fact that we moved it, and because we may be invalidating
1080 * swap. If the page is in the cache, we have to deactivate it
1081 * or vm_page_dirty() will panic. Dirty pages are not allowed
1082 * in the cache.
1083 */
1084void
1085vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1086{
1087
1088 vm_page_remove(m);
1089 vm_page_insert(m, new_object, new_pindex);
1090 vm_page_dirty(m);
1091}
1092
1093/*
1094 * Convert all of the given object's cached pages that have a
1095 * pindex within the given range into free pages. If the value
1096 * zero is given for "end", then the range's upper bound is
1097 * infinity. If the given object is backed by a vnode and it
1098 * transitions from having one or more cached pages to none, the
1099 * vnode's hold count is reduced.
1100 */
1101void
1102vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1103{
1104 vm_page_t m, m_next;
1105 boolean_t empty;
1106
1107 mtx_lock(&vm_page_queue_free_mtx);
1108 if (__predict_false(object->cache == NULL)) {
1109 mtx_unlock(&vm_page_queue_free_mtx);
1110 return;
1111 }
1112 m = object->cache = vm_page_splay(start, object->cache);
1113 if (m->pindex < start) {
1114 if (m->right == NULL)
1115 m = NULL;
1116 else {
1117 m_next = vm_page_splay(start, m->right);
1118 m_next->left = m;
1119 m->right = NULL;
1120 m = object->cache = m_next;
1121 }
1122 }
1123
1124 /*
1125 * At this point, "m" is either (1) a reference to the page
1126 * with the least pindex that is greater than or equal to
1127 * "start" or (2) NULL.
1128 */
1129 for (; m != NULL && (m->pindex < end || end == 0); m = m_next) {
1130 /*
1131 * Find "m"'s successor and remove "m" from the
1132 * object's cache.
1133 */
1134 if (m->right == NULL) {
1135 object->cache = m->left;
1136 m_next = NULL;
1137 } else {
1138 m_next = vm_page_splay(start, m->right);
1139 m_next->left = m->left;
1140 object->cache = m_next;
1141 }
1142 /* Convert "m" to a free page. */
1143 m->object = NULL;
1144 m->valid = 0;
1145 /* Clear PG_CACHED and set PG_FREE. */
1146 m->flags ^= PG_CACHED | PG_FREE;
1147 KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
1148 ("vm_page_cache_free: page %p has inconsistent flags", m));
1149 cnt.v_cache_count--;
1150 cnt.v_free_count++;
1151 }
1152 empty = object->cache == NULL;
1153 mtx_unlock(&vm_page_queue_free_mtx);
1154 if (object->type == OBJT_VNODE && empty)
1155 vdrop(object->handle);
1156}
1157
1158/*
1159 * Returns the cached page that is associated with the given
1160 * object and offset. If, however, none exists, returns NULL.
1161 *
1162 * The free page queue must be locked.
1163 */
1164static inline vm_page_t
1165vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
1166{
1167 vm_page_t m;
1168
1169 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1170 if ((m = object->cache) != NULL && m->pindex != pindex) {
1171 m = vm_page_splay(pindex, m);
1172 if ((object->cache = m)->pindex != pindex)
1173 m = NULL;
1174 }
1175 return (m);
1176}
1177
1178/*
1179 * Remove the given cached page from its containing object's
1180 * collection of cached pages.
1181 *
1182 * The free page queue must be locked.
1183 */
1184void
1185vm_page_cache_remove(vm_page_t m)
1186{
1187 vm_object_t object;
1188 vm_page_t root;
1189
1190 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1191 KASSERT((m->flags & PG_CACHED) != 0,
1192 ("vm_page_cache_remove: page %p is not cached", m));
1193 object = m->object;
1194 if (m != object->cache) {
1195 root = vm_page_splay(m->pindex, object->cache);
1196 KASSERT(root == m,
1197 ("vm_page_cache_remove: page %p is not cached in object %p",
1198 m, object));
1199 }
1200 if (m->left == NULL)
1201 root = m->right;
1202 else if (m->right == NULL)
1203 root = m->left;
1204 else {
1205 root = vm_page_splay(m->pindex, m->left);
1206 root->right = m->right;
1207 }
1208 object->cache = root;
1209 m->object = NULL;
1210 cnt.v_cache_count--;
1211}
1212
1213/*
1214 * Transfer all of the cached pages with offset greater than or
1215 * equal to 'offidxstart' from the original object's cache to the
1216 * new object's cache. However, any cached pages with offset
1217 * greater than or equal to the new object's size are kept in the
1218 * original object. Initially, the new object's cache must be
1219 * empty. Offset 'offidxstart' in the original object must
1220 * correspond to offset zero in the new object.
1221 *
1222 * The new object must be locked.
1223 */
1224void
1225vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
1226 vm_object_t new_object)
1227{
1228 vm_page_t m, m_next;
1229
1230 /*
1231 * Insertion into an object's collection of cached pages
1232 * requires the object to be locked. In contrast, removal does
1233 * not.
1234 */
1235 VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
1236 KASSERT(new_object->cache == NULL,
1237 ("vm_page_cache_transfer: object %p has cached pages",
1238 new_object));
1239 mtx_lock(&vm_page_queue_free_mtx);
1240 if ((m = orig_object->cache) != NULL) {
1241 /*
1242 * Transfer all of the pages with offset greater than or
1243 * equal to 'offidxstart' from the original object's
1244 * cache to the new object's cache.
1245 */
1246 m = vm_page_splay(offidxstart, m);
1247 if (m->pindex < offidxstart) {
1248 orig_object->cache = m;
1249 new_object->cache = m->right;
1250 m->right = NULL;
1251 } else {
1252 orig_object->cache = m->left;
1253 new_object->cache = m;
1254 m->left = NULL;
1255 }
1256 while ((m = new_object->cache) != NULL) {
1257 if ((m->pindex - offidxstart) >= new_object->size) {
1258 /*
1259 * Return all of the cached pages with
1260 * offset greater than or equal to the
1261 * new object's size to the original
1262 * object's cache.
1263 */
1264 new_object->cache = m->left;
1265 m->left = orig_object->cache;
1266 orig_object->cache = m;
1267 break;
1268 }
1269 m_next = vm_page_splay(m->pindex, m->right);
1270 /* Update the page's object and offset. */
1271 m->object = new_object;
1272 m->pindex -= offidxstart;
1273 if (m_next == NULL)
1274 break;
1275 m->right = NULL;
1276 m_next->left = m;
1277 new_object->cache = m_next;
1278 }
1279 KASSERT(new_object->cache == NULL ||
1280 new_object->type == OBJT_SWAP,
1281 ("vm_page_cache_transfer: object %p's type is incompatible"
1282 " with cached pages", new_object));
1283 }
1284 mtx_unlock(&vm_page_queue_free_mtx);
1285}
1286
1287/*
1288 * vm_page_alloc:
1289 *
1290 * Allocate and return a page that is associated with the specified
1291 * object and offset pair. By default, this page has the flag VPO_BUSY
1292 * set.
1293 *
1294 * The caller must always specify an allocation class.
1295 *
1296 * allocation classes:
1297 * VM_ALLOC_NORMAL normal process request
1298 * VM_ALLOC_SYSTEM system *really* needs a page
1299 * VM_ALLOC_INTERRUPT interrupt time request
1300 *
1301 * optional allocation flags:
1302 * VM_ALLOC_COUNT(number) the number of additional pages that the caller
1303 * intends to allocate
1304 * VM_ALLOC_IFCACHED return page only if it is cached
1305 * VM_ALLOC_IFNOTCACHED return NULL, do not reactivate if the page
1306 * is cached
1307 * VM_ALLOC_NOBUSY do not set the flag VPO_BUSY on the page
1308 * VM_ALLOC_NODUMP do not include the page in a kernel core dump
1309 * VM_ALLOC_NOOBJ page is not associated with an object and
1310 * should not have the flag VPO_BUSY set
1311 * VM_ALLOC_WIRED wire the allocated page
1312 * VM_ALLOC_ZERO prefer a zeroed page
1313 *
1314 * This routine may not sleep.
1315 */
1316vm_page_t
1317vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
1318{
1319 struct vnode *vp = NULL;
1320 vm_object_t m_object;
1321 vm_page_t m;
1322 int flags, req_class;
1323
1324 KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0),
1325 ("vm_page_alloc: inconsistent object/req"));
1326 if (object != NULL)
1327 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1328
1329 req_class = req & VM_ALLOC_CLASS_MASK;
1330
1331 /*
1332 * The page daemon is allowed to dig deeper into the free page list.
1333 */
1334 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1335 req_class = VM_ALLOC_SYSTEM;
1336
1337 mtx_lock(&vm_page_queue_free_mtx);
1338 if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
1339 (req_class == VM_ALLOC_SYSTEM &&
1340 cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
1341 (req_class == VM_ALLOC_INTERRUPT &&
1342 cnt.v_free_count + cnt.v_cache_count > 0)) {
1343 /*
1344 * Allocate from the free queue if the number of free pages
1345 * exceeds the minimum for the request class.
1346 */
1347 if (object != NULL &&
1348 (m = vm_page_cache_lookup(object, pindex)) != NULL) {
1349 if ((req & VM_ALLOC_IFNOTCACHED) != 0) {
1350 mtx_unlock(&vm_page_queue_free_mtx);
1351 return (NULL);
1352 }
1353 if (vm_phys_unfree_page(m))
1354 vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0);
1355#if VM_NRESERVLEVEL > 0
1356 else if (!vm_reserv_reactivate_page(m))
1357#else
1358 else
1359#endif
1360 panic("vm_page_alloc: cache page %p is missing"
1361 " from the free queue", m);
1362 } else if ((req & VM_ALLOC_IFCACHED) != 0) {
1363 mtx_unlock(&vm_page_queue_free_mtx);
1364 return (NULL);
1365#if VM_NRESERVLEVEL > 0
1366 } else if (object == NULL || object->type == OBJT_DEVICE ||
1367 object->type == OBJT_SG ||
1368 (object->flags & OBJ_COLORED) == 0 ||
1369 (m = vm_reserv_alloc_page(object, pindex)) == NULL) {
1370#else
1371 } else {
1372#endif
1373 m = vm_phys_alloc_pages(object != NULL ?
1374 VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
1375#if VM_NRESERVLEVEL > 0
1376 if (m == NULL && vm_reserv_reclaim_inactive()) {
1377 m = vm_phys_alloc_pages(object != NULL ?
1378 VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT,
1379 0);
1380 }
1381#endif
1382 }
1383 } else {
1384 /*
1385 * Not allocatable, give up.
1386 */
1387 mtx_unlock(&vm_page_queue_free_mtx);
1388 atomic_add_int(&vm_pageout_deficit,
1389 max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
1390 pagedaemon_wakeup();
1391 return (NULL);
1392 }
1393
1394 /*
1395 * At this point we had better have found a good page.
1396 */
1397 KASSERT(m != NULL, ("vm_page_alloc: missing page"));
1398 KASSERT(m->queue == PQ_NONE,
1399 ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue));
1400 KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m));
1401 KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m));
1402 KASSERT(m->busy == 0, ("vm_page_alloc: page %p is busy", m));
1403 KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m));
1404 KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
1405 ("vm_page_alloc: page %p has unexpected memattr %d", m,
1406 pmap_page_get_memattr(m)));
1407 if ((m->flags & PG_CACHED) != 0) {
1408 KASSERT((m->flags & PG_ZERO) == 0,
1409 ("vm_page_alloc: cached page %p is PG_ZERO", m));
1410 KASSERT(m->valid != 0,
1411 ("vm_page_alloc: cached page %p is invalid", m));
1412 if (m->object == object && m->pindex == pindex)
1413 cnt.v_reactivated++;
1414 else
1415 m->valid = 0;
1416 m_object = m->object;
1417 vm_page_cache_remove(m);
1418 if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
1419 vp = m_object->handle;
1420 } else {
1421 KASSERT(VM_PAGE_IS_FREE(m),
1422 ("vm_page_alloc: page %p is not free", m));
1423 KASSERT(m->valid == 0,
1424 ("vm_page_alloc: free page %p is valid", m));
1425 cnt.v_free_count--;
1426 }
1427
1428 /*
1429 * Only the PG_ZERO flag is inherited. The PG_CACHED or PG_FREE flag
1430 * must be cleared before the free page queues lock is released.
1431 */
1432 flags = 0;
1433 if (req & VM_ALLOC_NODUMP)
1434 flags |= PG_NODUMP;
1435 if (m->flags & PG_ZERO) {
1436 vm_page_zero_count--;
1437 if (req & VM_ALLOC_ZERO)
1438 flags = PG_ZERO;
1439 }
1440 m->flags = flags;
1441 mtx_unlock(&vm_page_queue_free_mtx);
1442 m->aflags = 0;
1443 if (object == NULL || object->type == OBJT_PHYS)
1444 m->oflags = VPO_UNMANAGED;
1445 else
1446 m->oflags = 0;
1447 if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ)) == 0)
1448 m->oflags |= VPO_BUSY;
1449 if (req & VM_ALLOC_WIRED) {
1450 /*
1451 * The page lock is not required for wiring a page until that
1452 * page is inserted into the object.
1453 */
1454 atomic_add_int(&cnt.v_wire_count, 1);
1455 m->wire_count = 1;
1456 }
1457 m->act_count = 0;
1458
1459 if (object != NULL) {
1460 /* Ignore device objects; the pager sets "memattr" for them. */
1461 if (object->memattr != VM_MEMATTR_DEFAULT &&
1462 object->type != OBJT_DEVICE && object->type != OBJT_SG)
1463 pmap_page_set_memattr(m, object->memattr);
1464 vm_page_insert(m, object, pindex);
1465 } else
1466 m->pindex = pindex;
1467
1468 /*
1469 * The following call to vdrop() must come after the above call
1470 * to vm_page_insert() in case both affect the same object and
1471 * vnode. Otherwise, the affected vnode's hold count could
1472 * temporarily become zero.
1473 */
1474 if (vp != NULL)
1475 vdrop(vp);
1476
1477 /*
1478 * Don't wake up the pageout daemon too often - only when
1479 * we would be nearly out of memory.
1480 */
1481 if (vm_paging_needed())
1482 pagedaemon_wakeup();
1483
1484 return (m);
1485}
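
/*
 * Illustrative sketch (not part of the original file): the common
 * allocate-or-wait loop built on vm_page_alloc().  It assumes the given
 * pindex is not already resident in the object.  Because vm_page_alloc()
 * may not sleep, the object lock is dropped and vm_wait() (see also the
 * VM_WAIT macro) is used to block until pages become available.
 */
static vm_page_t
example_alloc_wait(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	VM_OBJECT_LOCK(object);
	while ((m = vm_page_alloc(object, pindex,
	    VM_ALLOC_NORMAL | VM_ALLOC_ZERO)) == NULL) {
		VM_OBJECT_UNLOCK(object);
		vm_wait();
		VM_OBJECT_LOCK(object);
	}
	VM_OBJECT_UNLOCK(object);
	return (m);	/* Returned with VPO_BUSY set by default. */
}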
1486
1487/*
1488 * vm_page_alloc_contig:
1489 *
1490 * Allocate a contiguous set of physical pages of the given size "npages"
1491 * from the free lists. All of the physical pages must be at or above
1492 * the given physical address "low" and below the given physical address
1493 * "high". The given value "alignment" determines the alignment of the
1494 * first physical page in the set. If the given value "boundary" is
1495 * non-zero, then the set of physical pages cannot cross any physical
1496 * address boundary that is a multiple of that value. Both "alignment"
1497 * and "boundary" must be powers of two.
1498 *
1499 * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT,
1500 * then the memory attribute setting for the physical pages is configured
1501 * to the object's memory attribute setting. Otherwise, the memory
1502 * attribute setting for the physical pages is configured to "memattr",
1503 * overriding the object's memory attribute setting. However, if the
1504 * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the
1505 * memory attribute setting for the physical pages cannot be configured
1506 * to VM_MEMATTR_DEFAULT.
1507 *
1508 * The caller must always specify an allocation class.
1509 *
1510 * allocation classes:
1511 * VM_ALLOC_NORMAL normal process request
1512 * VM_ALLOC_SYSTEM system *really* needs a page
1513 * VM_ALLOC_INTERRUPT interrupt time request
1514 *
1515 * optional allocation flags:
1516 * VM_ALLOC_NOBUSY do not set the flag VPO_BUSY on the page
1517 * VM_ALLOC_NOOBJ page is not associated with an object and
1518 * should not have the flag VPO_BUSY set
1519 * VM_ALLOC_WIRED wire the allocated page
1520 * VM_ALLOC_ZERO prefer a zeroed page
1521 *
1522 * This routine may not sleep.
1523 */
1524vm_page_t
1525vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
1526 u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
1527 vm_paddr_t boundary, vm_memattr_t memattr)
1528{
1529 struct vnode *drop;
1530 vm_page_t deferred_vdrop_list, m, m_ret;
1531 u_int flags, oflags;
1532 int req_class;
1533
1534 KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0),
1535 ("vm_page_alloc_contig: inconsistent object/req"));
1536 if (object != NULL) {
1537 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1538 KASSERT(object->type == OBJT_PHYS,
1539 ("vm_page_alloc_contig: object %p isn't OBJT_PHYS",
1540 object));
1541 }
1542 KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
1543 req_class = req & VM_ALLOC_CLASS_MASK;
1544
1545 /*
1546 * The page daemon is allowed to dig deeper into the free page list.
1547 */
1548 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1549 req_class = VM_ALLOC_SYSTEM;
1550
1551 deferred_vdrop_list = NULL;
1552 mtx_lock(&vm_page_queue_free_mtx);
1553 if (cnt.v_free_count + cnt.v_cache_count >= npages +
1554 cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM &&
1555 cnt.v_free_count + cnt.v_cache_count >= npages +
1556 cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT &&
1557 cnt.v_free_count + cnt.v_cache_count >= npages)) {
1558#if VM_NRESERVLEVEL > 0
1559retry:
1560 if (object == NULL || (object->flags & OBJ_COLORED) == 0 ||
1561 (m_ret = vm_reserv_alloc_contig(object, pindex, npages,
1562 low, high, alignment, boundary)) == NULL)
1563#endif
1564 m_ret = vm_phys_alloc_contig(npages, low, high,
1565 alignment, boundary);
1566 } else {
1567 mtx_unlock(&vm_page_queue_free_mtx);
1568 atomic_add_int(&vm_pageout_deficit, npages);
1569 pagedaemon_wakeup();
1570 return (NULL);
1571 }
1572 if (m_ret != NULL)
1573 for (m = m_ret; m < &m_ret[npages]; m++) {
1574 drop = vm_page_alloc_init(m);
1575 if (drop != NULL) {
1576 /*
1577 * Enqueue the vnode for deferred vdrop().
1578 *
1579 * Once the pages are removed from the free
1580 * page list, "pageq" can be safely abused to
1581 * construct a short-lived list of vnodes.
1582 */
1583 m->pageq.tqe_prev = (void *)drop;
1584 m->pageq.tqe_next = deferred_vdrop_list;
1585 deferred_vdrop_list = m;
1586 }
1587 }
1588 else {
1589#if VM_NRESERVLEVEL > 0
1590 if (vm_reserv_reclaim_contig(npages, low, high, alignment,
1591 boundary))
1592 goto retry;
1593#endif
1594 }
1595 mtx_unlock(&vm_page_queue_free_mtx);
1596 if (m_ret == NULL)
1597 return (NULL);
1598
1599 /*
1600 * Initialize the pages. Only the PG_ZERO flag is inherited.
1601 */
1602 flags = 0;
1603 if ((req & VM_ALLOC_ZERO) != 0)
1604 flags = PG_ZERO;
1605 if ((req & VM_ALLOC_NODUMP) != 0)
1606 flags |= PG_NODUMP;
1602 if ((req & VM_ALLOC_WIRED) != 0)
1603 atomic_add_int(&cnt.v_wire_count, npages);
1604 oflags = VPO_UNMANAGED;
1605 if (object != NULL) {
1606 if ((req & VM_ALLOC_NOBUSY) == 0)
1607 oflags |= VPO_BUSY;
1608 if (object->memattr != VM_MEMATTR_DEFAULT &&
1609 memattr == VM_MEMATTR_DEFAULT)
1610 memattr = object->memattr;
1611 }
1612 for (m = m_ret; m < &m_ret[npages]; m++) {
1613 m->aflags = 0;
1614 m->flags &= flags;
1615 if ((req & VM_ALLOC_WIRED) != 0)
1616 m->wire_count = 1;
1617 /* Unmanaged pages don't use "act_count". */
1618 m->oflags = oflags;
1619 if (memattr != VM_MEMATTR_DEFAULT)
1620 pmap_page_set_memattr(m, memattr);
1621 if (object != NULL)
1622 vm_page_insert(m, object, pindex);
1623 else
1624 m->pindex = pindex;
1625 pindex++;
1626 }
1627 while (deferred_vdrop_list != NULL) {
1628 vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
1629 deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next;
1630 }
1631 if (vm_paging_needed())
1632 pagedaemon_wakeup();
1633 return (m_ret);
1634}
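
/*
 * Illustrative sketch (not part of the original file): allocating a
 * small, wired, physically contiguous run with no backing object, in the
 * style of a DMA buffer allocator.  The address limits, alignment and
 * boundary values are hypothetical; NULL may be returned if the request
 * cannot be satisfied.
 */
static vm_page_t
example_alloc_contig(u_long npages)
{

	return (vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL |
	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO, npages,
	    0, ~(vm_paddr_t)0, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT));
}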
1635
1636/*
1637 * Initialize a page that has been freshly dequeued from a freelist.
1638 * The caller has to drop the vnode returned, if it is not NULL.
1639 *
1640 * This function may only be used to initialize unmanaged pages.
1641 *
1642 * To be called with vm_page_queue_free_mtx held.
1643 */
1644static struct vnode *
1645vm_page_alloc_init(vm_page_t m)
1646{
1647 struct vnode *drop;
1648 vm_object_t m_object;
1649
1650 KASSERT(m->queue == PQ_NONE,
1651 ("vm_page_alloc_init: page %p has unexpected queue %d",
1652 m, m->queue));
1653 KASSERT(m->wire_count == 0,
1654 ("vm_page_alloc_init: page %p is wired", m));
1655 KASSERT(m->hold_count == 0,
1656 ("vm_page_alloc_init: page %p is held", m));
1657 KASSERT(m->busy == 0,
1658 ("vm_page_alloc_init: page %p is busy", m));
1659 KASSERT(m->dirty == 0,
1660 ("vm_page_alloc_init: page %p is dirty", m));
1661 KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
1662 ("vm_page_alloc_init: page %p has unexpected memattr %d",
1663 m, pmap_page_get_memattr(m)));
1664 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1665 drop = NULL;
1666 if ((m->flags & PG_CACHED) != 0) {
1667 KASSERT((m->flags & PG_ZERO) == 0,
1668 ("vm_page_alloc_init: cached page %p is PG_ZERO", m));
1669 m->valid = 0;
1670 m_object = m->object;
1671 vm_page_cache_remove(m);
1672 if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
1673 drop = m_object->handle;
1674 } else {
1675 KASSERT(VM_PAGE_IS_FREE(m),
1676 ("vm_page_alloc_init: page %p is not free", m));
1677 KASSERT(m->valid == 0,
1678 ("vm_page_alloc_init: free page %p is valid", m));
1679 cnt.v_free_count--;
1680 if ((m->flags & PG_ZERO) != 0)
1681 vm_page_zero_count--;
1682 }
1683 /* Don't clear the PG_ZERO flag; we'll need it later. */
1684 m->flags &= PG_ZERO;
1685 return (drop);
1686}
1687
1688/*
1689 * vm_page_alloc_freelist:
1690 *
1691 * Allocate a physical page from the specified free page list.
1692 *
1693 * The caller must always specify an allocation class.
1694 *
1695 * allocation classes:
1696 * VM_ALLOC_NORMAL normal process request
1697 * VM_ALLOC_SYSTEM system *really* needs a page
1698 * VM_ALLOC_INTERRUPT interrupt time request
1699 *
1700 * optional allocation flags:
1701 * VM_ALLOC_COUNT(number) the number of additional pages that the caller
1702 * intends to allocate
1703 * VM_ALLOC_WIRED wire the allocated page
1704 * VM_ALLOC_ZERO prefer a zeroed page
1705 *
1706 * This routine may not sleep.
1707 */
1708vm_page_t
1709vm_page_alloc_freelist(int flind, int req)
1710{
1711 struct vnode *drop;
1712 vm_page_t m;
1713 u_int flags;
1714 int req_class;
1715
1716 req_class = req & VM_ALLOC_CLASS_MASK;
1717
1718 /*
1719 * The page daemon is allowed to dig deeper into the free page list.
1720 */
1721 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1722 req_class = VM_ALLOC_SYSTEM;
1723
1724 /*
1725 * Do not allocate reserved pages unless the req has asked for it.
1726 */
1727 mtx_lock(&vm_page_queue_free_mtx);
1728 if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
1729 (req_class == VM_ALLOC_SYSTEM &&
1730 cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
1731 (req_class == VM_ALLOC_INTERRUPT &&
1732 cnt.v_free_count + cnt.v_cache_count > 0))
1733 m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
1734 else {
1735 mtx_unlock(&vm_page_queue_free_mtx);
1736 atomic_add_int(&vm_pageout_deficit,
1737 max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
1738 pagedaemon_wakeup();
1739 return (NULL);
1740 }
1741 if (m == NULL) {
1742 mtx_unlock(&vm_page_queue_free_mtx);
1743 return (NULL);
1744 }
1745 drop = vm_page_alloc_init(m);
1746 mtx_unlock(&vm_page_queue_free_mtx);
1747
1748 /*
1749 * Initialize the page. Only the PG_ZERO flag is inherited.
1750 */
1751 m->aflags = 0;
1752 flags = 0;
1753 if ((req & VM_ALLOC_ZERO) != 0)
1754 flags = PG_ZERO;
1755 m->flags &= flags;
1756 if ((req & VM_ALLOC_WIRED) != 0) {
1757 /*
1758 * The page lock is not required for wiring a page that does
1759 * not belong to an object.
1760 */
1761 atomic_add_int(&cnt.v_wire_count, 1);
1762 m->wire_count = 1;
1763 }
1764 /* Unmanaged pages don't use "act_count". */
1765 m->oflags = VPO_UNMANAGED;
1766 if (drop != NULL)
1767 vdrop(drop);
1768 if (vm_paging_needed())
1769 pagedaemon_wakeup();
1770 return (m);
1771}
1772
1773/*
1774 * vm_wait: (also see VM_WAIT macro)
1775 *
1776 * Block until free pages are available for allocation
1777 * - Called in various places before memory allocations.
1778 */
1779void
1780vm_wait(void)
1781{
1782
1783 mtx_lock(&vm_page_queue_free_mtx);
1784 if (curproc == pageproc) {
1785 vm_pageout_pages_needed = 1;
1786 msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
1787 PDROP | PSWP, "VMWait", 0);
1788 } else {
1789 if (!vm_pages_needed) {
1790 vm_pages_needed = 1;
1791 wakeup(&vm_pages_needed);
1792 }
1793 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
1794 "vmwait", 0);
1795 }
1796}
1797
1798/*
1799 * vm_waitpfault: (also see VM_WAITPFAULT macro)
1800 *
1801 * Block until free pages are available for allocation
1802 * - Called only in vm_fault so that processes page faulting
1803 * can be easily tracked.
1804 * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing
1805 * processes will be able to grab memory first. Do not change
1806 * this balance without careful testing first.
1807 */
1808void
1809vm_waitpfault(void)
1810{
1811
1812 mtx_lock(&vm_page_queue_free_mtx);
1813 if (!vm_pages_needed) {
1814 vm_pages_needed = 1;
1815 wakeup(&vm_pages_needed);
1816 }
1817 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
1818 "pfault", 0);
1819}
1820
1821/*
1822 * vm_page_requeue:
1823 *
1824 * Move the given page to the tail of its present page queue.
1825 *
1826 * The page queues must be locked.
1827 */
1828void
1829vm_page_requeue(vm_page_t m)
1830{
1831 struct vpgqueues *vpq;
1832 int queue;
1833
1834 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1835 queue = m->queue;
1836 KASSERT(queue != PQ_NONE,
1837 ("vm_page_requeue: page %p is not queued", m));
1838 vpq = &vm_page_queues[queue];
1839 TAILQ_REMOVE(&vpq->pl, m, pageq);
1840 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
1841}
1842
1843/*
1844 * vm_page_queue_remove:
1845 *
1846 * Remove the given page from the specified queue.
1847 *
1848 * The page and page queues must be locked.
1849 */
1850static __inline void
1851vm_page_queue_remove(int queue, vm_page_t m)
1852{
1853 struct vpgqueues *pq;
1854
1855 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1856 vm_page_lock_assert(m, MA_OWNED);
1857 pq = &vm_page_queues[queue];
1858 TAILQ_REMOVE(&pq->pl, m, pageq);
1859 (*pq->cnt)--;
1860}
1861
1862/*
1863 * vm_pageq_remove:
1864 *
1865 * Remove a page from its queue.
1866 *
1867 * The given page must be locked.
1868 * This routine may not block.
1869 */
1870void
1871vm_pageq_remove(vm_page_t m)
1872{
1873 int queue;
1874
1875 vm_page_lock_assert(m, MA_OWNED);
1876 if ((queue = m->queue) != PQ_NONE) {
1877 vm_page_lock_queues();
1878 m->queue = PQ_NONE;
1879 vm_page_queue_remove(queue, m);
1880 vm_page_unlock_queues();
1881 }
1882}
1883
1884/*
1885 * vm_page_enqueue:
1886 *
1887 * Add the given page to the specified queue.
1888 *
1889 * The page queues must be locked.
1890 */
1891static void
1892vm_page_enqueue(int queue, vm_page_t m)
1893{
1894 struct vpgqueues *vpq;
1895
1896 vpq = &vm_page_queues[queue];
1897 m->queue = queue;
1898 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
1899 ++*vpq->cnt;
1900}
1901
1902/*
1903 * vm_page_activate:
1904 *
1905 * Put the specified page on the active list (if appropriate).
1906 * Ensure that act_count is at least ACT_INIT but do not otherwise
1907 * mess with it.
1908 *
1909 * The page must be locked.
1910 * This routine may not block.
1911 */
1912void
1913vm_page_activate(vm_page_t m)
1914{
1915 int queue;
1916
1917 vm_page_lock_assert(m, MA_OWNED);
1918 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1919 if ((queue = m->queue) != PQ_ACTIVE) {
1920 if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
1921 if (m->act_count < ACT_INIT)
1922 m->act_count = ACT_INIT;
1923 vm_page_lock_queues();
1924 if (queue != PQ_NONE)
1925 vm_page_queue_remove(queue, m);
1926 vm_page_enqueue(PQ_ACTIVE, m);
1927 vm_page_unlock_queues();
1928 } else
1929 KASSERT(queue == PQ_NONE,
1930 ("vm_page_activate: wired page %p is queued", m));
1931 } else {
1932 if (m->act_count < ACT_INIT)
1933 m->act_count = ACT_INIT;
1934 }
1935}
1936
1937/*
1938 * vm_page_free_wakeup:
1939 *
1940 * Helper routine for vm_page_free_toq() and vm_page_cache(). This
1941 * routine is called when a page has been added to the cache or free
1942 * queues.
1943 *
1944 * The page queues must be locked.
1945 * This routine may not block.
1946 */
1947static inline void
1948vm_page_free_wakeup(void)
1949{
1950
1951 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1952 /*
1953 * If the pageout daemon needs pages, then tell it that there are
1954 * some free.
1955 */
1956 if (vm_pageout_pages_needed &&
1957 cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
1958 wakeup(&vm_pageout_pages_needed);
1959 vm_pageout_pages_needed = 0;
1960 }
1961 /*
1962 * Wake up processes that are waiting on memory if we hit a
1963 * high water mark, and wake up the scheduler process if we have
1964 * lots of memory; that process will swap in other processes.
1965 */
1966 if (vm_pages_needed && !vm_page_count_min()) {
1967 vm_pages_needed = 0;
1968 wakeup(&cnt.v_free_count);
1969 }
1970}
1971
1972/*
1973 * vm_page_free_toq:
1974 *
1975 * Returns the given page to the free list,
1976 * disassociating it with any VM object.
1977 *
1978 * Object and page must be locked prior to entry.
1979 * This routine may not block.
1980 */
1981
1982void
1983vm_page_free_toq(vm_page_t m)
1984{
1985
1986 if ((m->oflags & VPO_UNMANAGED) == 0) {
1987 vm_page_lock_assert(m, MA_OWNED);
1988 KASSERT(!pmap_page_is_mapped(m),
1989 ("vm_page_free_toq: freeing mapped page %p", m));
1990 }
1991 PCPU_INC(cnt.v_tfree);
1992
1993 if (VM_PAGE_IS_FREE(m))
1994 panic("vm_page_free: freeing free page %p", m);
1995 else if (m->busy != 0)
1996 panic("vm_page_free: freeing busy page %p", m);
1997
1998 /*
1999 * unqueue, then remove page. Note that we cannot destroy
2000 * the page here because we do not want to call the pager's
2001 * callback routine until after we've put the page on the
2002 * appropriate free queue.
2003 */
2004 if ((m->oflags & VPO_UNMANAGED) == 0)
2005 vm_pageq_remove(m);
2006 vm_page_remove(m);
2007
2008 /*
2009 * If the page is fictitious, there is nothing more to do: the
2010 * object association was removed above, so just return.
2011 */
2012 if ((m->flags & PG_FICTITIOUS) != 0) {
2013 return;
2014 }
2015
2016 m->valid = 0;
2017 vm_page_undirty(m);
2018
2019 if (m->wire_count != 0)
2020 panic("vm_page_free: freeing wired page %p", m);
2021 if (m->hold_count != 0) {
2022 m->flags &= ~PG_ZERO;
2023 vm_page_lock_queues();
2024 vm_page_enqueue(PQ_HOLD, m);
2025 vm_page_unlock_queues();
2026 } else {
2027 /*
2028 * Restore the default memory attribute to the page.
2029 */
2030 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
2031 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
2032
2033 /*
2034 * Insert the page into the physical memory allocator's
2035 * cache/free page queues.
2036 */
2037 mtx_lock(&vm_page_queue_free_mtx);
2038 m->flags |= PG_FREE;
2039 cnt.v_free_count++;
2040#if VM_NRESERVLEVEL > 0
2041 if (!vm_reserv_free_page(m))
2042#else
2043 if (TRUE)
2044#endif
2045 vm_phys_free_pages(m, 0);
2046 if ((m->flags & PG_ZERO) != 0)
2047 ++vm_page_zero_count;
2048 else
2049 vm_page_zero_idle_wakeup();
2050 vm_page_free_wakeup();
2051 mtx_unlock(&vm_page_queue_free_mtx);
2052 }
2053}
2054
2055/*
2056 * vm_page_wire:
2057 *
2058 * Mark this page as wired down by yet
2059 * another map, removing it from paging queues
2060 * as necessary.
2061 *
2062 * If the page is fictitious, then its wire count must remain one.
2063 *
2064 * The page must be locked.
2065 * This routine may not block.
2066 */
2067void
2068vm_page_wire(vm_page_t m)
2069{
2070
2071 /*
2072 * Only bump the wire statistics if the page is not already wired,
2073 * and only unqueue the page if it is on some queue (if it is unmanaged
2074 * it is already off the queues).
2075 */
2076 vm_page_lock_assert(m, MA_OWNED);
2077 if ((m->flags & PG_FICTITIOUS) != 0) {
2078 KASSERT(m->wire_count == 1,
2079 ("vm_page_wire: fictitious page %p's wire count isn't one",
2080 m));
2081 return;
2082 }
2083 if (m->wire_count == 0) {
2084 if ((m->oflags & VPO_UNMANAGED) == 0)
2085 vm_pageq_remove(m);
2086 atomic_add_int(&cnt.v_wire_count, 1);
2087 }
2088 m->wire_count++;
2089 KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
2090}
2091
2092/*
2093 * vm_page_unwire:
2094 *
2095 * Release one wiring of the specified page, potentially enabling it to be
2096 * paged again. If paging is enabled, then the value of the parameter
2097 * "activate" determines to which queue the page is added. If "activate" is
2098 * non-zero, then the page is added to the active queue. Otherwise, it is
2099 * added to the inactive queue.
2100 *
2101 * However, unless the page belongs to an object, it is not enqueued because
2102 * it cannot be paged out.
2103 *
2104 * If a page is fictitious, then its wire count must always be one.
2105 *
2106 * A managed page must be locked.
2107 */
2108void
2109vm_page_unwire(vm_page_t m, int activate)
2110{
2111
2112 if ((m->oflags & VPO_UNMANAGED) == 0)
2113 vm_page_lock_assert(m, MA_OWNED);
2114 if ((m->flags & PG_FICTITIOUS) != 0) {
2115 KASSERT(m->wire_count == 1,
2116 ("vm_page_unwire: fictitious page %p's wire count isn't one", m));
2117 return;
2118 }
2119 if (m->wire_count > 0) {
2120 m->wire_count--;
2121 if (m->wire_count == 0) {
2122 atomic_subtract_int(&cnt.v_wire_count, 1);
2123 if ((m->oflags & VPO_UNMANAGED) != 0 ||
2124 m->object == NULL)
2125 return;
2126 vm_page_lock_queues();
2127 if (activate)
2128 vm_page_enqueue(PQ_ACTIVE, m);
2129 else {
2130 m->flags &= ~PG_WINATCFLS;
2131 vm_page_enqueue(PQ_INACTIVE, m);
2132 }
2133 vm_page_unlock_queues();
2134 }
2135 } else
2136 panic("vm_page_unwire: page %p's wire count is zero", m);
2137}
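
/*
 * Illustrative sketch (not part of the original file): wiring a page
 * across an operation that must not see it paged out or freed, and then
 * releasing the wiring so that the page returns to the inactive queue.
 */
static void
example_wire_window(vm_page_t m)
{

	vm_page_lock(m);
	vm_page_wire(m);
	vm_page_unlock(m);

	/* ... the page cannot be paged out or freed here ... */

	vm_page_lock(m);
	vm_page_unwire(m, 0);	/* 0: enqueue on the inactive queue. */
	vm_page_unlock(m);
}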
2138
2139/*
2140 * Move the specified page to the inactive queue.
2141 *
2142 * Many pages placed on the inactive queue should actually go
2143 * into the cache, but it is difficult to figure out which. What
2144 * we do instead, if the inactive target is well met, is to put
2145 * clean pages at the head of the inactive queue instead of the tail.
2146 * This will cause them to be moved to the cache more quickly and
2147 * if not actively re-referenced, reclaimed more quickly. If we just
2148 * stick these pages at the end of the inactive queue, heavy filesystem
2149 * meta-data accesses can cause an unnecessary paging load on memory bound
2150 * processes. This optimization causes one-time-use metadata to be
2151 * reused more quickly.
2152 *
2153 * Normally athead is 0 resulting in LRU operation. athead is set
2154 * to 1 if we want this page to be 'as if it were placed in the cache',
2155 * except without unmapping it from the process address space.
2156 *
2157 * This routine may not block.
2158 */
2159static inline void
2160_vm_page_deactivate(vm_page_t m, int athead)
2161{
2162 int queue;
2163
2164 vm_page_lock_assert(m, MA_OWNED);
2165
2166 /*
2167 * Ignore if already inactive.
2168 */
2169 if ((queue = m->queue) == PQ_INACTIVE)
2170 return;
2171 if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
2172 vm_page_lock_queues();
2173 m->flags &= ~PG_WINATCFLS;
2174 if (queue != PQ_NONE)
2175 vm_page_queue_remove(queue, m);
2176 if (athead)
2177 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m,
2178 pageq);
2179 else
2180 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m,
2181 pageq);
2182 m->queue = PQ_INACTIVE;
2183 cnt.v_inactive_count++;
2184 vm_page_unlock_queues();
2185 }
2186}
2187
2188/*
2189 * Move the specified page to the inactive queue.
2190 *
2191 * The page must be locked.
2192 */
2193void
2194vm_page_deactivate(vm_page_t m)
2195{
2196
2197 _vm_page_deactivate(m, 0);
2198}
2199
2200/*
2201 * vm_page_try_to_cache:
2202 *
2203 * Returns 0 on failure, 1 on success
2204 */
2205int
2206vm_page_try_to_cache(vm_page_t m)
2207{
2208
2209 vm_page_lock_assert(m, MA_OWNED);
2210 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2211 if (m->dirty || m->hold_count || m->busy || m->wire_count ||
2212 (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
2213 return (0);
2214 pmap_remove_all(m);
2215 if (m->dirty)
2216 return (0);
2217 vm_page_cache(m);
2218 return (1);
2219}
2220
2221/*
2222 * vm_page_try_to_free()
2223 *
2224 * Attempt to free the page. If we cannot free it, we do nothing.
2225 * 1 is returned on success, 0 on failure.
2226 */
2227int
2228vm_page_try_to_free(vm_page_t m)
2229{
2230
2231 vm_page_lock_assert(m, MA_OWNED);
2232 if (m->object != NULL)
2233 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2234 if (m->dirty || m->hold_count || m->busy || m->wire_count ||
2235 (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
2236 return (0);
2237 pmap_remove_all(m);
2238 if (m->dirty)
2239 return (0);
2240 vm_page_free(m);
2241 return (1);
2242}
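
/*
 * Illustrative sketch (not part of the original file): the opportunistic
 * reclaim pattern built on vm_page_try_to_free().  The caller is assumed
 * to hold the object lock; releasing the page lock after a successful
 * free is assumed safe here because page locks are keyed by physical
 * address rather than embedded in the page structure.
 */
static int
example_try_reclaim(vm_page_t m)
{
	int freed;

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	vm_page_lock(m);
	freed = vm_page_try_to_free(m);
	vm_page_unlock(m);
	return (freed);
}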
2243
2244/*
2245 * vm_page_cache
2246 *
2247 * Put the specified page onto the page cache queue (if appropriate).
2248 *
2249 * This routine may not block.
2250 */
2251void
2252vm_page_cache(vm_page_t m)
2253{
2254 vm_object_t object;
2255 vm_page_t next, prev, root;
2256
2257 vm_page_lock_assert(m, MA_OWNED);
2258 object = m->object;
2259 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2260 if ((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) || m->busy ||
2261 m->hold_count || m->wire_count)
2262 panic("vm_page_cache: attempting to cache busy page");
2263 pmap_remove_all(m);
2264 if (m->dirty != 0)
2265 panic("vm_page_cache: page %p is dirty", m);
2266 if (m->valid == 0 || object->type == OBJT_DEFAULT ||
2267 (object->type == OBJT_SWAP &&
2268 !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
2269 /*
2270	 * Hypothesis: A cache-eligible page belonging to a
2271 * default object or swap object but without a backing
2272 * store must be zero filled.
2273 */
2274 vm_page_free(m);
2275 return;
2276 }
2277 KASSERT((m->flags & PG_CACHED) == 0,
2278 ("vm_page_cache: page %p is already cached", m));
2279 PCPU_INC(cnt.v_tcached);
2280
2281 /*
2282 * Remove the page from the paging queues.
2283 */
2284 vm_pageq_remove(m);
2285
2286 /*
2287 * Remove the page from the object's collection of resident
2288 * pages.
2289 */
2290 if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
2291 /*
2292 * Since the page's successor in the list is also its parent
2293 * in the tree, its right subtree must be empty.
2294 */
2295 next->left = m->left;
2296 KASSERT(m->right == NULL,
2297 ("vm_page_cache: page %p has right child", m));
2298 } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
2299 prev->right == m) {
2300 /*
2301 * Since the page's predecessor in the list is also its parent
2302 * in the tree, its left subtree must be empty.
2303 */
2304 KASSERT(m->left == NULL,
2305 ("vm_page_cache: page %p has left child", m));
2306 prev->right = m->right;
2307 } else {
2308 if (m != object->root)
2309 vm_page_splay(m->pindex, object->root);
2310 if (m->left == NULL)
2311 root = m->right;
2312 else if (m->right == NULL)
2313 root = m->left;
2314 else {
2315 /*
2316 * Move the page's successor to the root, because
2317 * pages are usually removed in ascending order.
2318 */
2319 if (m->right != next)
2320 vm_page_splay(m->pindex, m->right);
2321 next->left = m->left;
2322 root = next;
2323 }
2324 object->root = root;
2325 }
2326 TAILQ_REMOVE(&object->memq, m, listq);
2327 object->resident_page_count--;
2328
2329 /*
2330 * Restore the default memory attribute to the page.
2331 */
2332 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
2333 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
2334
2335 /*
2336 * Insert the page into the object's collection of cached pages
2337 * and the physical memory allocator's cache/free page queues.
2338 */
2339 m->flags &= ~PG_ZERO;
2340 mtx_lock(&vm_page_queue_free_mtx);
2341 m->flags |= PG_CACHED;
2342 cnt.v_cache_count++;
2343 root = object->cache;
2344 if (root == NULL) {
2345 m->left = NULL;
2346 m->right = NULL;
2347 } else {
2348 root = vm_page_splay(m->pindex, root);
2349 if (m->pindex < root->pindex) {
2350 m->left = root->left;
2351 m->right = root;
2352 root->left = NULL;
2353 } else if (__predict_false(m->pindex == root->pindex))
2354 panic("vm_page_cache: offset already cached");
2355 else {
2356 m->right = root->right;
2357 m->left = root;
2358 root->right = NULL;
2359 }
2360 }
2361 object->cache = m;
2362#if VM_NRESERVLEVEL > 0
2363 if (!vm_reserv_free_page(m)) {
2364#else
2365 if (TRUE) {
2366#endif
2367 vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
2368 vm_phys_free_pages(m, 0);
2369 }
2370 vm_page_free_wakeup();
2371 mtx_unlock(&vm_page_queue_free_mtx);
2372
2373 /*
2374 * Increment the vnode's hold count if this is the object's only
2375 * cached page. Decrement the vnode's hold count if this was
2376 * the object's only resident page.
2377 */
2378 if (object->type == OBJT_VNODE) {
2379 if (root == NULL && object->resident_page_count != 0)
2380 vhold(object->handle);
2381 else if (root != NULL && object->resident_page_count == 0)
2382 vdrop(object->handle);
2383 }
2384}
2385
2386/*
2387 * vm_page_dontneed
2388 *
2389 * Cache, deactivate, or do nothing as appropriate. This routine
2390 * is typically used by madvise() MADV_DONTNEED.
2391 *
2392 * Generally speaking we want to move the page into the cache so
2393 * it gets reused quickly. However, this can result in a silly syndrome
2394 * due to the page recycling too quickly. Small objects will not be
2395 * fully cached.  On the other hand, if we move the page to the inactive
2396 * queue we wind up with a problem whereby very large objects
2397 * unnecessarily blow away our inactive and cache queues.
2398 *
2399 * The solution is to move the pages based on a fixed weighting. We
2400 * either leave them alone, deactivate them, or move them to the cache,
2401 * where moving them to the cache has the highest weighting.
2402 * By forcing some pages into other queues we eventually force the
2403 * system to balance the queues, potentially recovering other unrelated
2404 * space from active. The idea is to not force this to happen too
2405 * often.
2406 */
2407void
2408vm_page_dontneed(vm_page_t m)
2409{
2410 int dnw;
2411 int head;
2412
2413 vm_page_lock_assert(m, MA_OWNED);
2414 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2415 dnw = PCPU_GET(dnweight);
2416 PCPU_INC(dnweight);
2417
2418 /*
2419 * Occasionally leave the page alone.
2420 */
2421 if ((dnw & 0x01F0) == 0 || m->queue == PQ_INACTIVE) {
2422 if (m->act_count >= ACT_INIT)
2423 --m->act_count;
2424 return;
2425 }
2426
2427 /*
2428 * Clear any references to the page. Otherwise, the page daemon will
2429 * immediately reactivate the page.
2430 *
2431 * Perform the pmap_clear_reference() first. Otherwise, a concurrent
2432 * pmap operation, such as pmap_remove(), could clear a reference in
2433 * the pmap and set PGA_REFERENCED on the page before the
2434 * pmap_clear_reference() had completed. Consequently, the page would
2435 * appear referenced based upon an old reference that occurred before
2436 * this function ran.
2437 */
2438 pmap_clear_reference(m);
2439 vm_page_aflag_clear(m, PGA_REFERENCED);
2440
2441 if (m->dirty == 0 && pmap_is_modified(m))
2442 vm_page_dirty(m);
2443
2444 if (m->dirty || (dnw & 0x0070) == 0) {
2445 /*
2446 * Deactivate the page 3 times out of 32.
2447 */
2448 head = 0;
2449 } else {
2450 /*
2451 * Cache the page 28 times out of every 32. Note that
2452 * the page is deactivated instead of cached, but placed
2453 * at the head of the queue instead of the tail.
2454 */
2455 head = 1;
2456 }
2457 _vm_page_deactivate(m, head);
2458}
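/*
 * A worked example of the weighting above, assuming only that the per-CPU
 * "dnweight" counter increments by one per call: the mask 0x01F0 covers
 * bits 4-8, so (dnw & 0x01F0) == 0 holds for 16 of every 512 values
 * (1 in 32) and the page is left alone; the mask 0x0070 covers bits 4-6,
 * so (dnw & 0x0070) == 0 holds 4 times in 32, one of which is already the
 * leave-alone case.  That leaves 3 in 32 tail deactivations and 28 in 32
 * head ("cache-like") placements for clean pages, matching the comments
 * in the branches above.
 */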
2459
2460/*
2461 * Grab a page, waiting until we are woken up due to the page
2462 * changing state.  We keep on waiting as long as the page continues
2463 * to be in the object.  If the page doesn't exist, first allocate it
2464 * and then conditionally zero it.
2465 *
2466 * The caller must always specify the VM_ALLOC_RETRY flag. This is intended
2467 * to facilitate its eventual removal.
2468 *
2469 * This routine may block.
2470 */
2471vm_page_t
2472vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
2473{
2474 vm_page_t m;
2475
2476 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2477 KASSERT((allocflags & VM_ALLOC_RETRY) != 0,
2478 ("vm_page_grab: VM_ALLOC_RETRY is required"));
2479retrylookup:
2480 if ((m = vm_page_lookup(object, pindex)) != NULL) {
2481 if ((m->oflags & VPO_BUSY) != 0 ||
2482 ((allocflags & VM_ALLOC_IGN_SBUSY) == 0 && m->busy != 0)) {
2483 /*
2484 * Reference the page before unlocking and
2485 * sleeping so that the page daemon is less
2486 * likely to reclaim it.
2487 */
2488 vm_page_aflag_set(m, PGA_REFERENCED);
2489 vm_page_sleep(m, "pgrbwt");
2490 goto retrylookup;
2491 } else {
2492 if ((allocflags & VM_ALLOC_WIRED) != 0) {
2493 vm_page_lock(m);
2494 vm_page_wire(m);
2495 vm_page_unlock(m);
2496 }
2497 if ((allocflags & VM_ALLOC_NOBUSY) == 0)
2498 vm_page_busy(m);
2499 return (m);
2500 }
2501 }
2502 m = vm_page_alloc(object, pindex, allocflags & ~(VM_ALLOC_RETRY |
2503 VM_ALLOC_IGN_SBUSY));
2504 if (m == NULL) {
2505 VM_OBJECT_UNLOCK(object);
2506 VM_WAIT;
2507 VM_OBJECT_LOCK(object);
2508 goto retrylookup;
2509 } else if (m->valid != 0)
2510 return (m);
2511 if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
2512 pmap_zero_page(m);
2513 return (m);
2514}
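/*
 * A minimal usage sketch (illustration only: "object", "pindex", and the
 * choice of flags are hypothetical, and vm_page_wakeup() is assumed to be
 * the usual helper for clearing VPO_BUSY when the caller is done):
 */
#if 0
	VM_OBJECT_LOCK(object);
	m = vm_page_grab(object, pindex,
	    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_ZERO);
	/* ... fill in or inspect the busied page ... */
	vm_page_wakeup(m);
	VM_OBJECT_UNLOCK(object);
#endif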
2515
2516/*
2517 * Mapping function for valid bits or for dirty bits in
2518 * a page. May not block.
2519 *
2520 * Inputs are required to range within a page.
2521 */
2522vm_page_bits_t
2523vm_page_bits(int base, int size)
2524{
2525 int first_bit;
2526 int last_bit;
2527
2528 KASSERT(
2529 base + size <= PAGE_SIZE,
2530 ("vm_page_bits: illegal base/size %d/%d", base, size)
2531 );
2532
2533 if (size == 0) /* handle degenerate case */
2534 return (0);
2535
2536 first_bit = base >> DEV_BSHIFT;
2537 last_bit = (base + size - 1) >> DEV_BSHIFT;
2538
2539 return (((vm_page_bits_t)2 << last_bit) -
2540 ((vm_page_bits_t)1 << first_bit));
2541}
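/*
 * For example, assuming DEV_BSIZE is 512 (DEV_BSHIFT is 9): base = 512 and
 * size = 1024 give first_bit = 1 and last_bit = (512 + 1024 - 1) >> 9 = 2,
 * so the function returns (2 << 2) - (1 << 1) = 0x6, i.e. the bits for
 * blocks 1 and 2 of the page.
 */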
2542
2543/*
2544 * vm_page_set_valid_range:
2545 *
2546 * Sets portions of a page valid. The arguments are expected
2547 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
2548 * of any partial chunks touched by the range. The invalid portion of
2549 * such chunks will be zeroed.
2550 *
2551 * (base + size) must be less than or equal to PAGE_SIZE.
2552 */
2553void
2554vm_page_set_valid_range(vm_page_t m, int base, int size)
2555{
2556 int endoff, frag;
2557
2558 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2559 if (size == 0) /* handle degenerate case */
2560 return;
2561
2562 /*
2563 * If the base is not DEV_BSIZE aligned and the valid
2564 * bit is clear, we have to zero out a portion of the
2565 * first block.
2566 */
2567 if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
2568 (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
2569 pmap_zero_page_area(m, frag, base - frag);
2570
2571 /*
2572 * If the ending offset is not DEV_BSIZE aligned and the
2573 * valid bit is clear, we have to zero out a portion of
2574 * the last block.
2575 */
2576 endoff = base + size;
2577 if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
2578 (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
2579 pmap_zero_page_area(m, endoff,
2580 DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
2581
2582 /*
2583 * Assert that no previously invalid block that is now being validated
2584 * is already dirty.
2585 */
2586 KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
2587 ("vm_page_set_valid_range: page %p is dirty", m));
2588
2589 /*
2590 * Set valid bits inclusive of any overlap.
2591 */
2592 m->valid |= vm_page_bits(base, size);
2593}
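/*
 * For example, with DEV_BSIZE == 512, a call with base = 100 and size = 300
 * touches only block 0.  If that block's valid bit was clear, bytes
 * [0, 100) and [400, 512) are zeroed first, so the whole block has defined
 * contents once its valid bit is set.
 */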
2594
2595/*
2596 * Clear the given bits from the specified page's dirty field.
2597 */
2598static __inline void
2599vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits)
2600{
2601 uintptr_t addr;
2602#if PAGE_SIZE < 16384
2603 int shift;
2604#endif
2605
2606 /*
2607 * If the object is locked and the page is neither VPO_BUSY nor
2608 * PGA_WRITEABLE, then the page's dirty field cannot possibly be
2609 * set by a concurrent pmap operation.
2610 */
2611 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2612 if ((m->oflags & VPO_BUSY) == 0 && (m->aflags & PGA_WRITEABLE) == 0)
2613 m->dirty &= ~pagebits;
2614 else {
2615 /*
2616 * The pmap layer can call vm_page_dirty() without
2617 * holding a distinguished lock. The combination of
2618		 * the object's lock and an atomic operation suffices
2619 * to guarantee consistency of the page dirty field.
2620 *
2621		 * For the PAGE_SIZE == 32768 case, the compiler already
2622		 * properly aligns the dirty field, so no forcible
2623		 * alignment is needed.  Only the existence of
2624		 * atomic_clear_64 is required when the page size is 32768.
2625 */
2626 addr = (uintptr_t)&m->dirty;
2627#if PAGE_SIZE == 32768
2628 atomic_clear_64((uint64_t *)addr, pagebits);
2629#elif PAGE_SIZE == 16384
2630 atomic_clear_32((uint32_t *)addr, pagebits);
2631#else /* PAGE_SIZE <= 8192 */
2632 /*
2633 * Use a trick to perform a 32-bit atomic on the
2634 * containing aligned word, to not depend on the existence
2635 * of atomic_clear_{8, 16}.
2636 */
2637 shift = addr & (sizeof(uint32_t) - 1);
2638#if BYTE_ORDER == BIG_ENDIAN
2639 shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY;
2640#else
2641 shift *= NBBY;
2642#endif
2643 addr &= ~(sizeof(uint32_t) - 1);
2644 atomic_clear_32((uint32_t *)addr, pagebits << shift);
2645#endif /* PAGE_SIZE */
2646 }
2647}
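/*
 * A concrete little-endian example of the containment trick above, assuming
 * an 8KB page so that the dirty field is 16 bits wide: if "dirty" starts at
 * byte offset 2 within its 32-bit word, shift becomes 2 * NBBY = 16, addr is
 * rounded down to the word boundary, and atomic_clear_32(addr, pagebits << 16)
 * clears exactly the requested dirty bits without touching the other half of
 * the word.
 */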
2648
2649/*
2650 * vm_page_set_validclean:
2651 *
2652 * Sets portions of a page valid and clean. The arguments are expected
2653 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
2654 * of any partial chunks touched by the range. The invalid portion of
2655 * such chunks will be zero'd.
2656 *
2657 * This routine may not block.
2658 *
2659 * (base + size) must be less than or equal to PAGE_SIZE.
2660 */
2661void
2662vm_page_set_validclean(vm_page_t m, int base, int size)
2663{
2664 vm_page_bits_t oldvalid, pagebits;
2665 int endoff, frag;
2666
2667 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2668 if (size == 0) /* handle degenerate case */
2669 return;
2670
2671 /*
2672 * If the base is not DEV_BSIZE aligned and the valid
2673 * bit is clear, we have to zero out a portion of the
2674 * first block.
2675 */
2676 if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
2677 (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
2678 pmap_zero_page_area(m, frag, base - frag);
2679
2680 /*
2681 * If the ending offset is not DEV_BSIZE aligned and the
2682 * valid bit is clear, we have to zero out a portion of
2683 * the last block.
2684 */
2685 endoff = base + size;
2686 if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
2687 (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
2688 pmap_zero_page_area(m, endoff,
2689 DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
2690
2691 /*
2692 * Set valid, clear dirty bits. If validating the entire
2693 * page we can safely clear the pmap modify bit. We also
2694 * use this opportunity to clear the VPO_NOSYNC flag. If a process
2695 * takes a write fault on a MAP_NOSYNC memory area the flag will
2696 * be set again.
2697 *
2698 * We set valid bits inclusive of any overlap, but we can only
2699 * clear dirty bits for DEV_BSIZE chunks that are fully within
2700 * the range.
2701 */
2702 oldvalid = m->valid;
2703 pagebits = vm_page_bits(base, size);
2704 m->valid |= pagebits;
2705#if 0 /* NOT YET */
2706 if ((frag = base & (DEV_BSIZE - 1)) != 0) {
2707 frag = DEV_BSIZE - frag;
2708 base += frag;
2709 size -= frag;
2710 if (size < 0)
2711 size = 0;
2712 }
2713 pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
2714#endif
2715 if (base == 0 && size == PAGE_SIZE) {
2716 /*
2717 * The page can only be modified within the pmap if it is
2718 * mapped, and it can only be mapped if it was previously
2719 * fully valid.
2720 */
2721 if (oldvalid == VM_PAGE_BITS_ALL)
2722 /*
2723 * Perform the pmap_clear_modify() first. Otherwise,
2724 * a concurrent pmap operation, such as
2725 * pmap_protect(), could clear a modification in the
2726 * pmap and set the dirty field on the page before
2727 * pmap_clear_modify() had begun and after the dirty
2728 * field was cleared here.
2729 */
2730 pmap_clear_modify(m);
2731 m->dirty = 0;
2732 m->oflags &= ~VPO_NOSYNC;
2733 } else if (oldvalid != VM_PAGE_BITS_ALL)
2734 m->dirty &= ~pagebits;
2735 else
2736 vm_page_clear_dirty_mask(m, pagebits);
2737}
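/*
 * For example, with DEV_BSIZE == 512 and a page that was only partially
 * valid beforehand, a call with base = 0 and size = 1024 marks blocks 0 and
 * 1 valid and clears their dirty bits directly (a partially valid page
 * cannot have been mapped, so no atomic update is needed), while the dirty
 * bits of the remaining blocks are left untouched.
 */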
2738
2739void
2740vm_page_clear_dirty(vm_page_t m, int base, int size)
2741{
2742
2743 vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
2744}
2745
2746/*
2747 * vm_page_set_invalid:
2748 *
2749 * Invalidates DEV_BSIZE'd chunks within a page. Both the
2750 * valid and dirty bits for the affected areas are cleared.
2751 *
2752 * May not block.
2753 */
2754void
2755vm_page_set_invalid(vm_page_t m, int base, int size)
2756{
2757 vm_page_bits_t bits;
2758
2759 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2760 KASSERT((m->oflags & VPO_BUSY) == 0,
2761 ("vm_page_set_invalid: page %p is busy", m));
2762 bits = vm_page_bits(base, size);
2763 if (m->valid == VM_PAGE_BITS_ALL && bits != 0)
2764 pmap_remove_all(m);
2765 KASSERT(!pmap_page_is_mapped(m),
2766 ("vm_page_set_invalid: page %p is mapped", m));
2767 m->valid &= ~bits;
2768 m->dirty &= ~bits;
2769}
2770
2771/*
2772 * vm_page_zero_invalid()
2773 *
2774 * The kernel assumes that the invalid portions of a page contain
2775 * garbage, but such pages can be mapped into memory by user code.
2776 * When this occurs, we must zero out the non-valid portions of the
2777 * page so user code sees what it expects.
2778 *
2779 * Pages are most often semi-valid when the end of a file is mapped
2780 * into memory and the file's size is not page aligned.
2781 */
2782void
2783vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
2784{
2785 int b;
2786 int i;
2787
2788 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2789 /*
2790 * Scan the valid bits looking for invalid sections that
2791	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
2792	 * valid bit may be set) have already been zeroed by
2793 * vm_page_set_validclean().
2794 */
2795 for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
2796 if (i == (PAGE_SIZE / DEV_BSIZE) ||
2797 (m->valid & ((vm_page_bits_t)1 << i))) {
2798 if (i > b) {
2799 pmap_zero_page_area(m,
2800 b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
2801 }
2802 b = i + 1;
2803 }
2804 }
2805
2806 /*
2807 * setvalid is TRUE when we can safely set the zero'd areas
2808	 * as being valid.  We can do this if there are no cache consistency
2809	 * issues, e.g., it is ok to do with UFS, but not ok to do with NFS.
2810 */
2811 if (setvalid)
2812 m->valid = VM_PAGE_BITS_ALL;
2813}
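/*
 * For example, with PAGE_SIZE == 4096 and DEV_BSIZE == 512 (8 blocks), a
 * page whose valid mask is 0x0f has blocks 0-3 valid; the scan above issues
 * a single pmap_zero_page_area(m, 2048, 2048) call to clear blocks 4-7.
 */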
2814
2815/*
2816 * vm_page_is_valid:
2817 *
2818 * Is (partial) page valid?  Note that in the degenerate case where
2819 * size == 0, this returns FALSE if the page is entirely invalid,
2820 * and TRUE otherwise.
2821 *
2822 * May not block.
2823 */
2824int
2825vm_page_is_valid(vm_page_t m, int base, int size)
2826{
2827 vm_page_bits_t bits;
2828
2829 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2830 bits = vm_page_bits(base, size);
2831 if (m->valid && ((m->valid & bits) == bits))
2832 return 1;
2833 else
2834 return 0;
2835}
2836
2837/*
2838 * update dirty bits from pmap/mmu. May not block.
2839 */
2840void
2841vm_page_test_dirty(vm_page_t m)
2842{
2843
2844 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2845 if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
2846 vm_page_dirty(m);
2847}
2848
2849void
2850vm_page_lock_KBI(vm_page_t m, const char *file, int line)
2851{
2852
2853 mtx_lock_flags_(vm_page_lockptr(m), 0, file, line);
2854}
2855
2856void
2857vm_page_unlock_KBI(vm_page_t m, const char *file, int line)
2858{
2859
2860 mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line);
2861}
2862
2863int
2864vm_page_trylock_KBI(vm_page_t m, const char *file, int line)
2865{
2866
2867 return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line));
2868}
2869
2870#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
2871void
2872vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
2873{
2874
2875 mtx_assert_(vm_page_lockptr(m), a, file, line);
2876}
2877#endif
2878
2879int so_zerocp_fullpage = 0;
2880
2881/*
2882 * Replace the given page with a copy. The copied page assumes
2883 * the portion of the given page's "wire_count" that is not the
2884 * responsibility of this copy-on-write mechanism.
2885 *
2886 * The object containing the given page must have a non-zero
2887 * paging-in-progress count and be locked.
2888 */
2889void
2890vm_page_cowfault(vm_page_t m)
2891{
2892 vm_page_t mnew;
2893 vm_object_t object;
2894 vm_pindex_t pindex;
2895
2896 mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);
2897 vm_page_lock_assert(m, MA_OWNED);
2898 object = m->object;
2899 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2900 KASSERT(object->paging_in_progress != 0,
2901 ("vm_page_cowfault: object %p's paging-in-progress count is zero.",
2902 object));
2903 pindex = m->pindex;
2904
2905 retry_alloc:
2906 pmap_remove_all(m);
2907 vm_page_remove(m);
2908 mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
2909 if (mnew == NULL) {
2910 vm_page_insert(m, object, pindex);
2911 vm_page_unlock(m);
2912 VM_OBJECT_UNLOCK(object);
2913 VM_WAIT;
2914 VM_OBJECT_LOCK(object);
2915 if (m == vm_page_lookup(object, pindex)) {
2916 vm_page_lock(m);
2917 goto retry_alloc;
2918 } else {
2919 /*
2920 * Page disappeared during the wait.
2921 */
2922 return;
2923 }
2924 }
2925
2926 if (m->cow == 0) {
2927 /*
2928 * check to see if we raced with an xmit complete when
2929 * waiting to allocate a page. If so, put things back
2930 * the way they were
2931 */
2932 vm_page_unlock(m);
2933 vm_page_lock(mnew);
2934 vm_page_free(mnew);
2935 vm_page_unlock(mnew);
2936 vm_page_insert(m, object, pindex);
2937 } else { /* clear COW & copy page */
2938 if (!so_zerocp_fullpage)
2939 pmap_copy_page(m, mnew);
2940 mnew->valid = VM_PAGE_BITS_ALL;
2941 vm_page_dirty(mnew);
2942 mnew->wire_count = m->wire_count - m->cow;
2943 m->wire_count = m->cow;
2944 vm_page_unlock(m);
2945 }
2946}
2947
2948void
2949vm_page_cowclear(vm_page_t m)
2950{
2951
2952 vm_page_lock_assert(m, MA_OWNED);
2953 if (m->cow) {
2954 m->cow--;
2955 /*
2956 * let vm_fault add back write permission lazily
2957 */
2958 }
2959 /*
2960 * sf_buf_free() will free the page, so we needn't do it here
2961 */
2962}
2963
2964int
2965vm_page_cowsetup(vm_page_t m)
2966{
2967
2968 vm_page_lock_assert(m, MA_OWNED);
2969 if ((m->flags & PG_FICTITIOUS) != 0 ||
2970 (m->oflags & VPO_UNMANAGED) != 0 ||
2971 m->cow == USHRT_MAX - 1 || !VM_OBJECT_TRYLOCK(m->object))
2972 return (EBUSY);
2973 m->cow++;
2974 pmap_remove_write(m);
2975 VM_OBJECT_UNLOCK(m->object);
2976 return (0);
2977}
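/*
 * These cow* routines support the zero-copy socket send path (hence
 * so_zerocp_fullpage above and the "xmit complete" wording in
 * vm_page_cowfault()): vm_page_cowsetup() write-protects a page that has
 * been lent to the network stack, vm_page_cowfault() gives the faulting
 * process a private copy if it writes to the page before the transmit
 * completes, and vm_page_cowclear() drops the COW reference once the
 * transmit is done.
 */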
2978
2979#ifdef INVARIANTS
2980void
2981vm_page_object_lock_assert(vm_page_t m)
2982{
2983
2984 /*
2985 * Certain of the page's fields may only be modified by the
2986 * holder of the containing object's lock or the setter of the
2987 * page's VPO_BUSY flag. Unfortunately, the setter of the
2988 * VPO_BUSY flag is not recorded, and thus cannot be checked
2989 * here.
2990 */
2991 if (m->object != NULL && (m->oflags & VPO_BUSY) == 0)
2992 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2993}
2994#endif
2995
2996#include "opt_ddb.h"
2997#ifdef DDB
2998#include <sys/kernel.h>
2999
3000#include <ddb/ddb.h>
3001
3002DB_SHOW_COMMAND(page, vm_page_print_page_info)
3003{
3004 db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
3005 db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
3006 db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
3007 db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
3008 db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
3009 db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
3010 db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
3011 db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
3012 db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
3013 db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
3014}
3015
3016DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
3017{
3018
3019 db_printf("PQ_FREE:");
3020 db_printf(" %d", cnt.v_free_count);
3021 db_printf("\n");
3022
3023 db_printf("PQ_CACHE:");
3024 db_printf(" %d", cnt.v_cache_count);
3025 db_printf("\n");
3026
3027 db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
3028 *vm_page_queues[PQ_ACTIVE].cnt,
3029 *vm_page_queues[PQ_INACTIVE].cnt);
3030}
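/*
 * From the ddb prompt these are invoked as "show page" and "show pageq",
 * respectively.
 */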
3031#endif /* DDB */
1607 if ((req & VM_ALLOC_WIRED) != 0)
1608 atomic_add_int(&cnt.v_wire_count, npages);
1609 oflags = VPO_UNMANAGED;
1610 if (object != NULL) {
1611 if ((req & VM_ALLOC_NOBUSY) == 0)
1612 oflags |= VPO_BUSY;
1613 if (object->memattr != VM_MEMATTR_DEFAULT &&
1614 memattr == VM_MEMATTR_DEFAULT)
1615 memattr = object->memattr;
1616 }
1617 for (m = m_ret; m < &m_ret[npages]; m++) {
1618 m->aflags = 0;
1619 m->flags &= flags;
1620 if ((req & VM_ALLOC_WIRED) != 0)
1621 m->wire_count = 1;
1622 /* Unmanaged pages don't use "act_count". */
1623 m->oflags = oflags;
1624 if (memattr != VM_MEMATTR_DEFAULT)
1625 pmap_page_set_memattr(m, memattr);
1626 if (object != NULL)
1627 vm_page_insert(m, object, pindex);
1628 else
1629 m->pindex = pindex;
1630 pindex++;
1631 }
1632 while (deferred_vdrop_list != NULL) {
1633 vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
1634 deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next;
1635 }
1636 if (vm_paging_needed())
1637 pagedaemon_wakeup();
1638 return (m_ret);
1639}
1640
1641/*
1642 * Initialize a page that has been freshly dequeued from a freelist.
1643 * The caller has to drop the vnode returned, if it is not NULL.
1644 *
1645 * This function may only be used to initialize unmanaged pages.
1646 *
1647 * To be called with vm_page_queue_free_mtx held.
1648 */
1649static struct vnode *
1650vm_page_alloc_init(vm_page_t m)
1651{
1652 struct vnode *drop;
1653 vm_object_t m_object;
1654
1655 KASSERT(m->queue == PQ_NONE,
1656 ("vm_page_alloc_init: page %p has unexpected queue %d",
1657 m, m->queue));
1658 KASSERT(m->wire_count == 0,
1659 ("vm_page_alloc_init: page %p is wired", m));
1660 KASSERT(m->hold_count == 0,
1661 ("vm_page_alloc_init: page %p is held", m));
1662 KASSERT(m->busy == 0,
1663 ("vm_page_alloc_init: page %p is busy", m));
1664 KASSERT(m->dirty == 0,
1665 ("vm_page_alloc_init: page %p is dirty", m));
1666 KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
1667 ("vm_page_alloc_init: page %p has unexpected memattr %d",
1668 m, pmap_page_get_memattr(m)));
1669 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1670 drop = NULL;
1671 if ((m->flags & PG_CACHED) != 0) {
1672 KASSERT((m->flags & PG_ZERO) == 0,
1673 ("vm_page_alloc_init: cached page %p is PG_ZERO", m));
1674 m->valid = 0;
1675 m_object = m->object;
1676 vm_page_cache_remove(m);
1677 if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
1678 drop = m_object->handle;
1679 } else {
1680 KASSERT(VM_PAGE_IS_FREE(m),
1681 ("vm_page_alloc_init: page %p is not free", m));
1682 KASSERT(m->valid == 0,
1683 ("vm_page_alloc_init: free page %p is valid", m));
1684 cnt.v_free_count--;
1685 if ((m->flags & PG_ZERO) != 0)
1686 vm_page_zero_count--;
1687 }
1688 /* Don't clear the PG_ZERO flag; we'll need it later. */
1689 m->flags &= PG_ZERO;
1690 return (drop);
1691}
1692
1693/*
1694 * vm_page_alloc_freelist:
1695 *
1696 * Allocate a physical page from the specified free page list.
1697 *
1698 * The caller must always specify an allocation class.
1699 *
1700 * allocation classes:
1701 * VM_ALLOC_NORMAL normal process request
1702 * VM_ALLOC_SYSTEM system *really* needs a page
1703 * VM_ALLOC_INTERRUPT interrupt time request
1704 *
1705 * optional allocation flags:
1706 * VM_ALLOC_COUNT(number) the number of additional pages that the caller
1707 * intends to allocate
1708 * VM_ALLOC_WIRED wire the allocated page
1709 * VM_ALLOC_ZERO prefer a zeroed page
1710 *
1711 * This routine may not sleep.
1712 */
1713vm_page_t
1714vm_page_alloc_freelist(int flind, int req)
1715{
1716 struct vnode *drop;
1717 vm_page_t m;
1718 u_int flags;
1719 int req_class;
1720
1721 req_class = req & VM_ALLOC_CLASS_MASK;
1722
1723 /*
1724 * The page daemon is allowed to dig deeper into the free page list.
1725 */
1726 if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1727 req_class = VM_ALLOC_SYSTEM;
1728
1729 /*
1730	 * Do not allocate reserved pages unless the request asks for them.
1731 */
1732 mtx_lock(&vm_page_queue_free_mtx);
1733 if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
1734 (req_class == VM_ALLOC_SYSTEM &&
1735 cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
1736 (req_class == VM_ALLOC_INTERRUPT &&
1737 cnt.v_free_count + cnt.v_cache_count > 0))
1738 m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
1739 else {
1740 mtx_unlock(&vm_page_queue_free_mtx);
1741 atomic_add_int(&vm_pageout_deficit,
1742 max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
1743 pagedaemon_wakeup();
1744 return (NULL);
1745 }
1746 if (m == NULL) {
1747 mtx_unlock(&vm_page_queue_free_mtx);
1748 return (NULL);
1749 }
1750 drop = vm_page_alloc_init(m);
1751 mtx_unlock(&vm_page_queue_free_mtx);
1752
1753 /*
1754 * Initialize the page. Only the PG_ZERO flag is inherited.
1755 */
1756 m->aflags = 0;
1757 flags = 0;
1758 if ((req & VM_ALLOC_ZERO) != 0)
1759 flags = PG_ZERO;
1760 m->flags &= flags;
1761 if ((req & VM_ALLOC_WIRED) != 0) {
1762 /*
1763 * The page lock is not required for wiring a page that does
1764 * not belong to an object.
1765 */
1766 atomic_add_int(&cnt.v_wire_count, 1);
1767 m->wire_count = 1;
1768 }
1769 /* Unmanaged pages don't use "act_count". */
1770 m->oflags = VPO_UNMANAGED;
1771 if (drop != NULL)
1772 vdrop(drop);
1773 if (vm_paging_needed())
1774 pagedaemon_wakeup();
1775 return (m);
1776}
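/*
 * A minimal usage sketch (the freelist index 0 and the error handling are
 * illustrative only):
 */
#if 0
	m = vm_page_alloc_freelist(0, VM_ALLOC_NORMAL | VM_ALLOC_WIRED |
	    VM_ALLOC_ZERO);
	if (m == NULL)
		return (ENOMEM);	/* or block in VM_WAIT and retry */
#endif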
1777
1778/*
1779 * vm_wait: (also see VM_WAIT macro)
1780 *
1781 * Block until free pages are available for allocation
1782 * - Called in various places before memory allocations.
1783 */
1784void
1785vm_wait(void)
1786{
1787
1788 mtx_lock(&vm_page_queue_free_mtx);
1789 if (curproc == pageproc) {
1790 vm_pageout_pages_needed = 1;
1791 msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
1792 PDROP | PSWP, "VMWait", 0);
1793 } else {
1794 if (!vm_pages_needed) {
1795 vm_pages_needed = 1;
1796 wakeup(&vm_pages_needed);
1797 }
1798 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
1799 "vmwait", 0);
1800 }
1801}
1802
1803/*
1804 * vm_waitpfault: (also see VM_WAITPFAULT macro)
1805 *
1806 * Block until free pages are available for allocation
1807 * - Called only in vm_fault so that processes page faulting
1808 * can be easily tracked.
1809 * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing
1810 * processes will be able to grab memory first. Do not change
1811 * this balance without careful testing first.
1812 */
1813void
1814vm_waitpfault(void)
1815{
1816
1817 mtx_lock(&vm_page_queue_free_mtx);
1818 if (!vm_pages_needed) {
1819 vm_pages_needed = 1;
1820 wakeup(&vm_pages_needed);
1821 }
1822 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
1823 "pfault", 0);
1824}
1825
1826/*
1827 * vm_page_requeue:
1828 *
1829 * Move the given page to the tail of its present page queue.
1830 *
1831 * The page queues must be locked.
1832 */
1833void
1834vm_page_requeue(vm_page_t m)
1835{
1836 struct vpgqueues *vpq;
1837 int queue;
1838
1839 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1840 queue = m->queue;
1841 KASSERT(queue != PQ_NONE,
1842 ("vm_page_requeue: page %p is not queued", m));
1843 vpq = &vm_page_queues[queue];
1844 TAILQ_REMOVE(&vpq->pl, m, pageq);
1845 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
1846}
1847
1848/*
1849 * vm_page_queue_remove:
1850 *
1851 * Remove the given page from the specified queue.
1852 *
1853 * The page and page queues must be locked.
1854 */
1855static __inline void
1856vm_page_queue_remove(int queue, vm_page_t m)
1857{
1858 struct vpgqueues *pq;
1859
1860 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1861 vm_page_lock_assert(m, MA_OWNED);
1862 pq = &vm_page_queues[queue];
1863 TAILQ_REMOVE(&pq->pl, m, pageq);
1864 (*pq->cnt)--;
1865}
1866
1867/*
1868 * vm_pageq_remove:
1869 *
1870 * Remove a page from its queue.
1871 *
1872 * The given page must be locked.
1873 * This routine may not block.
1874 */
1875void
1876vm_pageq_remove(vm_page_t m)
1877{
1878 int queue;
1879
1880 vm_page_lock_assert(m, MA_OWNED);
1881 if ((queue = m->queue) != PQ_NONE) {
1882 vm_page_lock_queues();
1883 m->queue = PQ_NONE;
1884 vm_page_queue_remove(queue, m);
1885 vm_page_unlock_queues();
1886 }
1887}
1888
1889/*
1890 * vm_page_enqueue:
1891 *
1892 * Add the given page to the specified queue.
1893 *
1894 * The page queues must be locked.
1895 */
1896static void
1897vm_page_enqueue(int queue, vm_page_t m)
1898{
1899 struct vpgqueues *vpq;
1900
1901 vpq = &vm_page_queues[queue];
1902 m->queue = queue;
1903 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
1904 ++*vpq->cnt;
1905}
1906
1907/*
1908 * vm_page_activate:
1909 *
1910 * Put the specified page on the active list (if appropriate).
1911 * Ensure that act_count is at least ACT_INIT but do not otherwise
1912 * mess with it.
1913 *
1914 * The page must be locked.
1915 * This routine may not block.
1916 */
1917void
1918vm_page_activate(vm_page_t m)
1919{
1920 int queue;
1921
1922 vm_page_lock_assert(m, MA_OWNED);
1923 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1924 if ((queue = m->queue) != PQ_ACTIVE) {
1925 if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
1926 if (m->act_count < ACT_INIT)
1927 m->act_count = ACT_INIT;
1928 vm_page_lock_queues();
1929 if (queue != PQ_NONE)
1930 vm_page_queue_remove(queue, m);
1931 vm_page_enqueue(PQ_ACTIVE, m);
1932 vm_page_unlock_queues();
1933 } else
1934 KASSERT(queue == PQ_NONE,
1935 ("vm_page_activate: wired page %p is queued", m));
1936 } else {
1937 if (m->act_count < ACT_INIT)
1938 m->act_count = ACT_INIT;
1939 }
1940}
1941
1942/*
1943 * vm_page_free_wakeup:
1944 *
1945 * Helper routine for vm_page_free_toq() and vm_page_cache(). This
1946 * routine is called when a page has been added to the cache or free
1947 * queues.
1948 *
1949 * The page queues must be locked.
1950 * This routine may not block.
1951 */
1952static inline void
1953vm_page_free_wakeup(void)
1954{
1955
1956 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1957 /*
1958	 * If the pageout daemon needs pages, then tell it that there
1959	 * are some free.
1960 */
1961 if (vm_pageout_pages_needed &&
1962 cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
1963 wakeup(&vm_pageout_pages_needed);
1964 vm_pageout_pages_needed = 0;
1965 }
1966 /*
1967	 * Wake up processes that are waiting on memory if we hit a
1968	 * high water mark, and wake up the scheduler process if we
1969	 * have lots of memory; that process will swap in processes.
1970 */
1971 if (vm_pages_needed && !vm_page_count_min()) {
1972 vm_pages_needed = 0;
1973 wakeup(&cnt.v_free_count);
1974 }
1975}
1976
1977/*
1978 * vm_page_free_toq:
1979 *
1980 * Returns the given page to the free list,
1981 * disassociating it with any VM object.
1982 *
1983 * Object and page must be locked prior to entry.
1984 * This routine may not block.
1985 */
1986
1987void
1988vm_page_free_toq(vm_page_t m)
1989{
1990
1991 if ((m->oflags & VPO_UNMANAGED) == 0) {
1992 vm_page_lock_assert(m, MA_OWNED);
1993 KASSERT(!pmap_page_is_mapped(m),
1994 ("vm_page_free_toq: freeing mapped page %p", m));
1995 }
1996 PCPU_INC(cnt.v_tfree);
1997
1998 if (VM_PAGE_IS_FREE(m))
1999 panic("vm_page_free: freeing free page %p", m);
2000 else if (m->busy != 0)
2001 panic("vm_page_free: freeing busy page %p", m);
2002
2003 /*
2004 * unqueue, then remove page. Note that we cannot destroy
2005 * the page here because we do not want to call the pager's
2006 * callback routine until after we've put the page on the
2007 * appropriate free queue.
2008 */
2009 if ((m->oflags & VPO_UNMANAGED) == 0)
2010 vm_pageq_remove(m);
2011 vm_page_remove(m);
2012
2013 /*
2014	 * If fictitious, remove the object association and
2015	 * return; otherwise, delay the object association removal.
2016 */
2017 if ((m->flags & PG_FICTITIOUS) != 0) {
2018 return;
2019 }
2020
2021 m->valid = 0;
2022 vm_page_undirty(m);
2023
2024 if (m->wire_count != 0)
2025 panic("vm_page_free: freeing wired page %p", m);
2026 if (m->hold_count != 0) {
2027 m->flags &= ~PG_ZERO;
2028 vm_page_lock_queues();
2029 vm_page_enqueue(PQ_HOLD, m);
2030 vm_page_unlock_queues();
2031 } else {
2032 /*
2033 * Restore the default memory attribute to the page.
2034 */
2035 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
2036 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
2037
2038 /*
2039 * Insert the page into the physical memory allocator's
2040 * cache/free page queues.
2041 */
2042 mtx_lock(&vm_page_queue_free_mtx);
2043 m->flags |= PG_FREE;
2044 cnt.v_free_count++;
2045#if VM_NRESERVLEVEL > 0
2046 if (!vm_reserv_free_page(m))
2047#else
2048 if (TRUE)
2049#endif
2050 vm_phys_free_pages(m, 0);
2051 if ((m->flags & PG_ZERO) != 0)
2052 ++vm_page_zero_count;
2053 else
2054 vm_page_zero_idle_wakeup();
2055 vm_page_free_wakeup();
2056 mtx_unlock(&vm_page_queue_free_mtx);
2057 }
2058}
2059
2060/*
2061 * vm_page_wire:
2062 *
2063 * Mark this page as wired down by yet
2064 * another map, removing it from paging queues
2065 * as necessary.
2066 *
2067 * If the page is fictitious, then its wire count must remain one.
2068 *
2069 * The page must be locked.
2070 * This routine may not block.
2071 */
2072void
2073vm_page_wire(vm_page_t m)
2074{
2075
2076 /*
2077 * Only bump the wire statistics if the page is not already wired,
2078 * and only unqueue the page if it is on some queue (if it is unmanaged
2079 * it is already off the queues).
2080 */
2081 vm_page_lock_assert(m, MA_OWNED);
2082 if ((m->flags & PG_FICTITIOUS) != 0) {
2083 KASSERT(m->wire_count == 1,
2084 ("vm_page_wire: fictitious page %p's wire count isn't one",
2085 m));
2086 return;
2087 }
2088 if (m->wire_count == 0) {
2089 if ((m->oflags & VPO_UNMANAGED) == 0)
2090 vm_pageq_remove(m);
2091 atomic_add_int(&cnt.v_wire_count, 1);
2092 }
2093 m->wire_count++;
2094 KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
2095}
2096
2097/*
2098 * vm_page_unwire:
2099 *
2100 * Release one wiring of the specified page, potentially enabling it to be
2101 * paged again. If paging is enabled, then the value of the parameter
2102 * "activate" determines to which queue the page is added. If "activate" is
2103 * non-zero, then the page is added to the active queue. Otherwise, it is
2104 * added to the inactive queue.
2105 *
2106 * However, unless the page belongs to an object, it is not enqueued because
2107 * it cannot be paged out.
2108 *
2109 * If a page is fictitious, then its wire count must always be one.
2110 *
2111 * A managed page must be locked.
2112 */
2113void
2114vm_page_unwire(vm_page_t m, int activate)
2115{
2116
2117 if ((m->oflags & VPO_UNMANAGED) == 0)
2118 vm_page_lock_assert(m, MA_OWNED);
2119 if ((m->flags & PG_FICTITIOUS) != 0) {
2120 KASSERT(m->wire_count == 1,
2121 ("vm_page_unwire: fictitious page %p's wire count isn't one", m));
2122 return;
2123 }
2124 if (m->wire_count > 0) {
2125 m->wire_count--;
2126 if (m->wire_count == 0) {
2127 atomic_subtract_int(&cnt.v_wire_count, 1);
2128 if ((m->oflags & VPO_UNMANAGED) != 0 ||
2129 m->object == NULL)
2130 return;
2131 vm_page_lock_queues();
2132 if (activate)
2133 vm_page_enqueue(PQ_ACTIVE, m);
2134 else {
2135 m->flags &= ~PG_WINATCFLS;
2136 vm_page_enqueue(PQ_INACTIVE, m);
2137 }
2138 vm_page_unlock_queues();
2139 }
2140 } else
2141 panic("vm_page_unwire: page %p's wire count is zero", m);
2142}
2143
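/*
 * A typical wire/unwire pairing, following the pattern used by
 * vm_page_grab() above (illustration only):
 */
#if 0
	vm_page_lock(m);
	vm_page_wire(m);		/* pin the page; it cannot be paged out */
	vm_page_unlock(m);
	/* ... access the page's contents ... */
	vm_page_lock(m);
	vm_page_unwire(m, 0);		/* 0: requeue to the inactive queue */
	vm_page_unlock(m);
#endif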
2144/*
2145 * Move the specified page to the inactive queue.
2146 *
2147 * Many pages placed on the inactive queue should actually go
2148 * into the cache, but it is difficult to figure out which. What
2149 * we do instead, if the inactive target is well met, is to put
2150 * clean pages at the head of the inactive queue instead of the tail.
2151 * This will cause them to be moved to the cache more quickly and
2152 * if not actively re-referenced, reclaimed more quickly. If we just
2153 * stick these pages at the end of the inactive queue, heavy filesystem
2154 * meta-data accesses can cause an unnecessary paging load on memory bound
2155 * processes. This optimization causes one-time-use metadata to be
2156 * reused more quickly.
2157 *
2158 * Normally athead is 0 resulting in LRU operation. athead is set
2159 * to 1 if we want this page to be 'as if it were placed in the cache',
2160 * except without unmapping it from the process address space.
2161 *
2162 * This routine may not block.
2163 */
2164static inline void
2165_vm_page_deactivate(vm_page_t m, int athead)
2166{
2167 int queue;
2168
2169 vm_page_lock_assert(m, MA_OWNED);
2170
2171 /*
2172 * Ignore if already inactive.
2173 */
2174 if ((queue = m->queue) == PQ_INACTIVE)
2175 return;
2176 if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
2177 vm_page_lock_queues();
2178 m->flags &= ~PG_WINATCFLS;
2179 if (queue != PQ_NONE)
2180 vm_page_queue_remove(queue, m);
2181 if (athead)
2182 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m,
2183 pageq);
2184 else
2185 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m,
2186 pageq);
2187 m->queue = PQ_INACTIVE;
2188 cnt.v_inactive_count++;
2189 vm_page_unlock_queues();
2190 }
2191}
2192
2193/*
2194 * Move the specified page to the inactive queue.
2195 *
2196 * The page must be locked.
2197 */
2198void
2199vm_page_deactivate(vm_page_t m)
2200{
2201
2202 _vm_page_deactivate(m, 0);
2203}
2204
2205/*
2206 * vm_page_try_to_cache:
2207 *
2208 * Returns 0 on failure, 1 on success
2209 */
2210int
2211vm_page_try_to_cache(vm_page_t m)
2212{
2213
2214 vm_page_lock_assert(m, MA_OWNED);
2215 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2216 if (m->dirty || m->hold_count || m->busy || m->wire_count ||
2217 (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
2218 return (0);
2219 pmap_remove_all(m);
2220 if (m->dirty)
2221 return (0);
2222 vm_page_cache(m);
2223 return (1);
2224}
2225
2226/*
2227 * vm_page_try_to_free()
2228 *
2229 * Attempt to free the page. If we cannot free it, we do nothing.
2230 * 1 is returned on success, 0 on failure.
2231 */
2232int
2233vm_page_try_to_free(vm_page_t m)
2234{
2235
2236 vm_page_lock_assert(m, MA_OWNED);
2237 if (m->object != NULL)
2238 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2239 if (m->dirty || m->hold_count || m->busy || m->wire_count ||
2240 (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
2241 return (0);
2242 pmap_remove_all(m);
2243 if (m->dirty)
2244 return (0);
2245 vm_page_free(m);
2246 return (1);
2247}
2248
2249/*
2250 * vm_page_cache
2251 *
2252 * Put the specified page onto the page cache queue (if appropriate).
2253 *
2254 * This routine may not block.
2255 */
2256void
2257vm_page_cache(vm_page_t m)
2258{
2259 vm_object_t object;
2260 vm_page_t next, prev, root;
2261
2262 vm_page_lock_assert(m, MA_OWNED);
2263 object = m->object;
2264 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2265 if ((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) || m->busy ||
2266 m->hold_count || m->wire_count)
2267 panic("vm_page_cache: attempting to cache busy page");
2268 pmap_remove_all(m);
2269 if (m->dirty != 0)
2270 panic("vm_page_cache: page %p is dirty", m);
2271 if (m->valid == 0 || object->type == OBJT_DEFAULT ||
2272 (object->type == OBJT_SWAP &&
2273 !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
2274 /*
2275 * Hypothesis: A cache-elgible page belonging to a
2276 * default object or swap object but without a backing
2277 * store must be zero filled.
2278 */
2279 vm_page_free(m);
2280 return;
2281 }
2282 KASSERT((m->flags & PG_CACHED) == 0,
2283 ("vm_page_cache: page %p is already cached", m));
2284 PCPU_INC(cnt.v_tcached);
2285
2286 /*
2287 * Remove the page from the paging queues.
2288 */
2289 vm_pageq_remove(m);
2290
2291 /*
2292 * Remove the page from the object's collection of resident
2293 * pages.
2294 */
2295 if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
2296 /*
2297 * Since the page's successor in the list is also its parent
2298 * in the tree, its right subtree must be empty.
2299 */
2300 next->left = m->left;
2301 KASSERT(m->right == NULL,
2302 ("vm_page_cache: page %p has right child", m));
2303 } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
2304 prev->right == m) {
2305 /*
2306 * Since the page's predecessor in the list is also its parent
2307 * in the tree, its left subtree must be empty.
2308 */
2309 KASSERT(m->left == NULL,
2310 ("vm_page_cache: page %p has left child", m));
2311 prev->right = m->right;
2312 } else {
2313 if (m != object->root)
2314 vm_page_splay(m->pindex, object->root);
2315 if (m->left == NULL)
2316 root = m->right;
2317 else if (m->right == NULL)
2318 root = m->left;
2319 else {
2320 /*
2321 * Move the page's successor to the root, because
2322 * pages are usually removed in ascending order.
2323 */
2324 if (m->right != next)
2325 vm_page_splay(m->pindex, m->right);
2326 next->left = m->left;
2327 root = next;
2328 }
2329 object->root = root;
2330 }
2331 TAILQ_REMOVE(&object->memq, m, listq);
2332 object->resident_page_count--;
2333
2334 /*
2335 * Restore the default memory attribute to the page.
2336 */
2337 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
2338 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
2339
2340 /*
2341 * Insert the page into the object's collection of cached pages
2342 * and the physical memory allocator's cache/free page queues.
2343 */
2344 m->flags &= ~PG_ZERO;
2345 mtx_lock(&vm_page_queue_free_mtx);
2346 m->flags |= PG_CACHED;
2347 cnt.v_cache_count++;
2348 root = object->cache;
2349 if (root == NULL) {
2350 m->left = NULL;
2351 m->right = NULL;
2352 } else {
2353 root = vm_page_splay(m->pindex, root);
2354 if (m->pindex < root->pindex) {
2355 m->left = root->left;
2356 m->right = root;
2357 root->left = NULL;
2358 } else if (__predict_false(m->pindex == root->pindex))
2359 panic("vm_page_cache: offset already cached");
2360 else {
2361 m->right = root->right;
2362 m->left = root;
2363 root->right = NULL;
2364 }
2365 }
2366 object->cache = m;
2367#if VM_NRESERVLEVEL > 0
2368 if (!vm_reserv_free_page(m)) {
2369#else
2370 if (TRUE) {
2371#endif
2372 vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
2373 vm_phys_free_pages(m, 0);
2374 }
2375 vm_page_free_wakeup();
2376 mtx_unlock(&vm_page_queue_free_mtx);
2377
2378 /*
2379 * Increment the vnode's hold count if this is the object's only
2380 * cached page. Decrement the vnode's hold count if this was
2381 * the object's only resident page.
2382 */
2383 if (object->type == OBJT_VNODE) {
2384 if (root == NULL && object->resident_page_count != 0)
2385 vhold(object->handle);
2386 else if (root != NULL && object->resident_page_count == 0)
2387 vdrop(object->handle);
2388 }
2389}
2390
2391/*
2392 * vm_page_dontneed
2393 *
2394 * Cache, deactivate, or do nothing as appropriate. This routine
2395 * is typically used by madvise() MADV_DONTNEED.
2396 *
2397 * Generally speaking we want to move the page into the cache so
2398 * it gets reused quickly. However, this can result in a silly syndrome
2399 * due to the page recycling too quickly. Small objects will not be
2400 * fully cached. On the otherhand, if we move the page to the inactive
2401 * queue we wind up with a problem whereby very large objects
2402 * unnecessarily blow away our inactive and cache queues.
2403 *
2404 * The solution is to move the pages based on a fixed weighting. We
2405 * either leave them alone, deactivate them, or move them to the cache,
2406 * where moving them to the cache has the highest weighting.
2407 * By forcing some pages into other queues we eventually force the
2408 * system to balance the queues, potentially recovering other unrelated
2409 * space from active. The idea is to not force this to happen too
2410 * often.
2411 */
2412void
2413vm_page_dontneed(vm_page_t m)
2414{
2415 int dnw;
2416 int head;
2417
2418 vm_page_lock_assert(m, MA_OWNED);
2419 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2420 dnw = PCPU_GET(dnweight);
2421 PCPU_INC(dnweight);
2422
2423 /*
2424 * Occasionally leave the page alone.
2425 */
2426 if ((dnw & 0x01F0) == 0 || m->queue == PQ_INACTIVE) {
2427 if (m->act_count >= ACT_INIT)
2428 --m->act_count;
2429 return;
2430 }
2431
2432 /*
2433 * Clear any references to the page. Otherwise, the page daemon will
2434 * immediately reactivate the page.
2435 *
2436 * Perform the pmap_clear_reference() first. Otherwise, a concurrent
2437 * pmap operation, such as pmap_remove(), could clear a reference in
2438 * the pmap and set PGA_REFERENCED on the page before the
2439 * pmap_clear_reference() had completed. Consequently, the page would
2440 * appear referenced based upon an old reference that occurred before
2441 * this function ran.
2442 */
2443 pmap_clear_reference(m);
2444 vm_page_aflag_clear(m, PGA_REFERENCED);
2445
2446 if (m->dirty == 0 && pmap_is_modified(m))
2447 vm_page_dirty(m);
2448
2449 if (m->dirty || (dnw & 0x0070) == 0) {
2450 /*
2451 * Deactivate the page 3 times out of 32.
2452 */
2453 head = 0;
2454 } else {
2455 /*
2456 * Cache the page 28 times out of every 32. Note that
2457 * the page is deactivated instead of cached, but placed
2458 * at the head of the queue instead of the tail.
2459 */
2460 head = 1;
2461 }
2462 _vm_page_deactivate(m, head);
2463}
2464
2465/*
2466 * Grab a page, waiting until we are waken up due to the page
2467 * changing state. We keep on waiting, if the page continues
2468 * to be in the object. If the page doesn't exist, first allocate it
2469 * and then conditionally zero it.
2470 *
2471 * The caller must always specify the VM_ALLOC_RETRY flag. This is intended
2472 * to facilitate its eventual removal.
2473 *
2474 * This routine may block.
2475 */
2476vm_page_t
2477vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
2478{
2479 vm_page_t m;
2480
2481 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2482 KASSERT((allocflags & VM_ALLOC_RETRY) != 0,
2483 ("vm_page_grab: VM_ALLOC_RETRY is required"));
2484retrylookup:
2485 if ((m = vm_page_lookup(object, pindex)) != NULL) {
2486 if ((m->oflags & VPO_BUSY) != 0 ||
2487 ((allocflags & VM_ALLOC_IGN_SBUSY) == 0 && m->busy != 0)) {
2488 /*
2489 * Reference the page before unlocking and
2490 * sleeping so that the page daemon is less
2491 * likely to reclaim it.
2492 */
2493 vm_page_aflag_set(m, PGA_REFERENCED);
2494 vm_page_sleep(m, "pgrbwt");
2495 goto retrylookup;
2496 } else {
2497 if ((allocflags & VM_ALLOC_WIRED) != 0) {
2498 vm_page_lock(m);
2499 vm_page_wire(m);
2500 vm_page_unlock(m);
2501 }
2502 if ((allocflags & VM_ALLOC_NOBUSY) == 0)
2503 vm_page_busy(m);
2504 return (m);
2505 }
2506 }
2507 m = vm_page_alloc(object, pindex, allocflags & ~(VM_ALLOC_RETRY |
2508 VM_ALLOC_IGN_SBUSY));
2509 if (m == NULL) {
2510 VM_OBJECT_UNLOCK(object);
2511 VM_WAIT;
2512 VM_OBJECT_LOCK(object);
2513 goto retrylookup;
2514 } else if (m->valid != 0)
2515 return (m);
2516 if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
2517 pmap_zero_page(m);
2518 return (m);
2519}
2520
2521/*
2522 * Mapping function for valid bits or for dirty bits in
2523 * a page. May not block.
2524 *
2525 * Inputs are required to range within a page.
2526 */
2527vm_page_bits_t
2528vm_page_bits(int base, int size)
2529{
2530 int first_bit;
2531 int last_bit;
2532
2533 KASSERT(
2534 base + size <= PAGE_SIZE,
2535 ("vm_page_bits: illegal base/size %d/%d", base, size)
2536 );
2537
2538 if (size == 0) /* handle degenerate case */
2539 return (0);
2540
2541 first_bit = base >> DEV_BSHIFT;
2542 last_bit = (base + size - 1) >> DEV_BSHIFT;
2543
2544 return (((vm_page_bits_t)2 << last_bit) -
2545 ((vm_page_bits_t)1 << first_bit));
2546}
2547
2548/*
2549 * vm_page_set_valid_range:
2550 *
2551 * Sets portions of a page valid. The arguments are expected
2552 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
2553 * of any partial chunks touched by the range. The invalid portion of
2554 * such chunks will be zeroed.
2555 *
2556 * (base + size) must be less then or equal to PAGE_SIZE.
2557 */
2558void
2559vm_page_set_valid_range(vm_page_t m, int base, int size)
2560{
2561 int endoff, frag;
2562
2563 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2564 if (size == 0) /* handle degenerate case */
2565 return;
2566
2567 /*
2568 * If the base is not DEV_BSIZE aligned and the valid
2569 * bit is clear, we have to zero out a portion of the
2570 * first block.
2571 */
2572 if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
2573 (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
2574 pmap_zero_page_area(m, frag, base - frag);
2575
2576 /*
2577 * If the ending offset is not DEV_BSIZE aligned and the
2578 * valid bit is clear, we have to zero out a portion of
2579 * the last block.
2580 */
2581 endoff = base + size;
2582 if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
2583 (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
2584 pmap_zero_page_area(m, endoff,
2585 DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
2586
2587 /*
2588 * Assert that no previously invalid block that is now being validated
2589 * is already dirty.
2590 */
2591 KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
2592 ("vm_page_set_valid_range: page %p is dirty", m));
2593
2594 /*
2595 * Set valid bits inclusive of any overlap.
2596 */
2597 m->valid |= vm_page_bits(base, size);
2598}
2599
2600/*
2601 * Clear the given bits from the specified page's dirty field.
2602 */
2603static __inline void
2604vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits)
2605{
2606 uintptr_t addr;
2607#if PAGE_SIZE < 16384
2608 int shift;
2609#endif
2610
2611 /*
2612 * If the object is locked and the page is neither VPO_BUSY nor
2613 * PGA_WRITEABLE, then the page's dirty field cannot possibly be
2614 * set by a concurrent pmap operation.
2615 */
2616 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2617 if ((m->oflags & VPO_BUSY) == 0 && (m->aflags & PGA_WRITEABLE) == 0)
2618 m->dirty &= ~pagebits;
2619 else {
2620 /*
2621 * The pmap layer can call vm_page_dirty() without
2622 * holding a distinguished lock. The combination of
2623 * the object's lock and an atomic operation suffice
2624 * to guarantee consistency of the page dirty field.
2625 *
2626 * For PAGE_SIZE == 32768 case, compiler already
2627 * properly aligns the dirty field, so no forcible
2628 * alignment is needed. Only require existence of
2629 * atomic_clear_64 when page size is 32768.
2630 */
2631 addr = (uintptr_t)&m->dirty;
2632#if PAGE_SIZE == 32768
2633 atomic_clear_64((uint64_t *)addr, pagebits);
2634#elif PAGE_SIZE == 16384
2635 atomic_clear_32((uint32_t *)addr, pagebits);
2636#else /* PAGE_SIZE <= 8192 */
2637 /*
2638 * Use a trick to perform a 32-bit atomic on the
2639 * containing aligned word, to not depend on the existence
2640 * of atomic_clear_{8, 16}.
2641 */
2642 shift = addr & (sizeof(uint32_t) - 1);
2643#if BYTE_ORDER == BIG_ENDIAN
2644 shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY;
2645#else
2646 shift *= NBBY;
2647#endif
2648 addr &= ~(sizeof(uint32_t) - 1);
2649 atomic_clear_32((uint32_t *)addr, pagebits << shift);
2650#endif /* PAGE_SIZE */
2651 }
2652}
2653
2654/*
2655 * vm_page_set_validclean:
2656 *
2657 * Sets portions of a page valid and clean. The arguments are expected
2658 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
2659 * of any partial chunks touched by the range. The invalid portion of
2660 * such chunks will be zero'd.
2661 *
2662 * This routine may not block.
2663 *
2664 * (base + size) must be less then or equal to PAGE_SIZE.
2665 */
2666void
2667vm_page_set_validclean(vm_page_t m, int base, int size)
2668{
2669 vm_page_bits_t oldvalid, pagebits;
2670 int endoff, frag;
2671
2672 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2673 if (size == 0) /* handle degenerate case */
2674 return;
2675
2676 /*
2677 * If the base is not DEV_BSIZE aligned and the valid
2678 * bit is clear, we have to zero out a portion of the
2679 * first block.
2680 */
2681 if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
2682 (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
2683 pmap_zero_page_area(m, frag, base - frag);
2684
2685 /*
2686 * If the ending offset is not DEV_BSIZE aligned and the
2687 * valid bit is clear, we have to zero out a portion of
2688 * the last block.
2689 */
2690 endoff = base + size;
2691 if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
2692 (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
2693 pmap_zero_page_area(m, endoff,
2694 DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
2695
2696 /*
2697 * Set valid, clear dirty bits. If validating the entire
2698 * page we can safely clear the pmap modify bit. We also
2699 * use this opportunity to clear the VPO_NOSYNC flag. If a process
2700 * takes a write fault on a MAP_NOSYNC memory area the flag will
2701 * be set again.
2702 *
2703 * We set valid bits inclusive of any overlap, but we can only
2704 * clear dirty bits for DEV_BSIZE chunks that are fully within
2705 * the range.
2706 */
2707 oldvalid = m->valid;
2708 pagebits = vm_page_bits(base, size);
2709 m->valid |= pagebits;
2710#if 0 /* NOT YET */
2711 if ((frag = base & (DEV_BSIZE - 1)) != 0) {
2712 frag = DEV_BSIZE - frag;
2713 base += frag;
2714 size -= frag;
2715 if (size < 0)
2716 size = 0;
2717 }
2718 pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
2719#endif
2720 if (base == 0 && size == PAGE_SIZE) {
2721 /*
2722 * The page can only be modified within the pmap if it is
2723 * mapped, and it can only be mapped if it was previously
2724 * fully valid.
2725 */
2726 if (oldvalid == VM_PAGE_BITS_ALL)
2727 /*
2728 * Perform the pmap_clear_modify() first. Otherwise,
2729 * a concurrent pmap operation, such as
2730 * pmap_protect(), could clear a modification in the
2731 * pmap and set the dirty field on the page before
2732 * pmap_clear_modify() had begun and after the dirty
2733 * field was cleared here.
2734 */
2735 pmap_clear_modify(m);
2736 m->dirty = 0;
2737 m->oflags &= ~VPO_NOSYNC;
2738 } else if (oldvalid != VM_PAGE_BITS_ALL)
2739 m->dirty &= ~pagebits;
2740 else
2741 vm_page_clear_dirty_mask(m, pagebits);
2742}
2743
2744void
2745vm_page_clear_dirty(vm_page_t m, int base, int size)
2746{
2747
2748 vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
2749}
2750
2751/*
2752 * vm_page_set_invalid:
2753 *
2754 * Invalidates DEV_BSIZE'd chunks within a page. Both the
2755 * valid and dirty bits for the affected areas are cleared.
2756 *
2757 * May not block.
2758 */
2759void
2760vm_page_set_invalid(vm_page_t m, int base, int size)
2761{
2762 vm_page_bits_t bits;
2763
2764 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2765 KASSERT((m->oflags & VPO_BUSY) == 0,
2766 ("vm_page_set_invalid: page %p is busy", m));
2767 bits = vm_page_bits(base, size);
2768 if (m->valid == VM_PAGE_BITS_ALL && bits != 0)
2769 pmap_remove_all(m);
2770 KASSERT(!pmap_page_is_mapped(m),
2771 ("vm_page_set_invalid: page %p is mapped", m));
2772 m->valid &= ~bits;
2773 m->dirty &= ~bits;
2774}
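/*
 * Sketch of a plausible use (illustrative only; "newsize" is a
 * hypothetical byte offset within the page at which valid data now
 * ends, e.g. after a file truncation):
 *
 *	VM_OBJECT_LOCK(object);
 *	vm_page_set_invalid(m, newsize, PAGE_SIZE - newsize);
 *	VM_OBJECT_UNLOCK(object);
 *
 * Every DEV_BSIZE chunk touched by [newsize, PAGE_SIZE) has its valid
 * and dirty bits cleared, and if the page was fully valid (and hence
 * possibly mapped) all of its mappings are removed first.
 */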
2775
2776/*
2777 * vm_page_zero_invalid()
2778 *
2779 * The kernel assumes that the invalid portions of a page contain
2780 * garbage, but such pages can be mapped into memory by user code.
2781 * When this occurs, we must zero out the non-valid portions of the
2782 * page so user code sees what it expects.
2783 *
2784 * Pages are most often semi-valid when the end of a file is mapped
2785 * into memory and the file's size is not page aligned.
2786 */
2787void
2788vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
2789{
2790 int b;
2791 int i;
2792
2793 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2794 /*
2795 * Scan the valid bits looking for invalid sections that
2796 * must be zeroed. Invalid sub-DEV_BSIZE'd areas (where the
2797 * valid bit may be set) have already been zeroed by
2798 * vm_page_set_validclean().
2799 */
2800 for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
2801 if (i == (PAGE_SIZE / DEV_BSIZE) ||
2802 (m->valid & ((vm_page_bits_t)1 << i))) {
2803 if (i > b) {
2804 pmap_zero_page_area(m,
2805 b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
2806 }
2807 b = i + 1;
2808 }
2809 }
2810
2811 /*
2812 * setvalid is TRUE when we can safely set the zeroed areas
2813 * as being valid. We can do this if there are no cache consistency
2814 * issues, e.g., it is OK to do so with UFS, but not with NFS.
2815 */
2816 if (setvalid)
2817 m->valid = VM_PAGE_BITS_ALL;
2818}
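/*
 * Illustrative walk-through of the scan above (the numbers assume
 * PAGE_SIZE is 4096, so there are 8 DEV_BSIZE blocks per page): if
 * only blocks 0-3 are valid (m->valid == 0x0f), the loop keeps
 * advancing "b" past the valid blocks, and when "i" reaches
 * PAGE_SIZE / DEV_BSIZE it emits a single call
 *
 *	pmap_zero_page_area(m, 4 << DEV_BSHIFT, 4 << DEV_BSHIFT);
 *
 * zeroing bytes [2048, 4096). Runs of invalid blocks are thus
 * coalesced into one zeroing call each.
 */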
2819
2820/*
2821 * vm_page_is_valid:
2822 *
2823 * Is (partial) page valid? Note that in the degenerate case where
2824 * size == 0, this returns FALSE if the page is entirely invalid
2825 * and TRUE otherwise.
2826 *
2827 * May not block.
2828 */
2829int
2830vm_page_is_valid(vm_page_t m, int base, int size)
2831{
2832 vm_page_bits_t bits;
2833
2834 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2835 bits = vm_page_bits(base, size);
2836 if (m->valid && ((m->valid & bits) == bits))
2837 return 1;
2838 else
2839 return 0;
2840}
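/*
 * Note on the degenerate case (illustrative): vm_page_bits() returns 0
 * for a zero size, so the test above reduces to "m->valid != 0"; a
 * size of 0 therefore asks merely whether any part of the page is
 * valid.
 */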
2841
2842/*
2843 * update dirty bits from pmap/mmu. May not block.
2844 */
2845void
2846vm_page_test_dirty(vm_page_t m)
2847{
2848
2849 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2850 if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
2851 vm_page_dirty(m);
2852}
2853
2854void
2855vm_page_lock_KBI(vm_page_t m, const char *file, int line)
2856{
2857
2858 mtx_lock_flags_(vm_page_lockptr(m), 0, file, line);
2859}
2860
2861void
2862vm_page_unlock_KBI(vm_page_t m, const char *file, int line)
2863{
2864
2865 mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line);
2866}
2867
2868int
2869vm_page_trylock_KBI(vm_page_t m, const char *file, int line)
2870{
2871
2872 return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line));
2873}
2874
2875#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
2876void
2877vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
2878{
2879
2880 mtx_assert_(vm_page_lockptr(m), a, file, line);
2881}
2882#endif
2883
2884int so_zerocp_fullpage = 0;
2885
2886/*
2887 * Replace the given page with a copy. The copied page assumes
2888 * the portion of the given page's "wire_count" that is not the
2889 * responsibility of this copy-on-write mechanism.
2890 *
2891 * The object containing the given page must have a non-zero
2892 * paging-in-progress count and be locked.
2893 */
2894void
2895vm_page_cowfault(vm_page_t m)
2896{
2897 vm_page_t mnew;
2898 vm_object_t object;
2899 vm_pindex_t pindex;
2900
2901 mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);
2902 vm_page_lock_assert(m, MA_OWNED);
2903 object = m->object;
2904 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2905 KASSERT(object->paging_in_progress != 0,
2906 ("vm_page_cowfault: object %p's paging-in-progress count is zero.",
2907 object));
2908 pindex = m->pindex;
2909
2910 retry_alloc:
2911 pmap_remove_all(m);
2912 vm_page_remove(m);
2913 mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
2914 if (mnew == NULL) {
2915 vm_page_insert(m, object, pindex);
2916 vm_page_unlock(m);
2917 VM_OBJECT_UNLOCK(object);
2918 VM_WAIT;
2919 VM_OBJECT_LOCK(object);
2920 if (m == vm_page_lookup(object, pindex)) {
2921 vm_page_lock(m);
2922 goto retry_alloc;
2923 } else {
2924 /*
2925 * Page disappeared during the wait.
2926 */
2927 return;
2928 }
2929 }
2930
2931 if (m->cow == 0) {
2932 /*
2933 * Check to see whether we raced with a transmit completion
2934 * while waiting to allocate a page. If so, put things back
2935 * the way they were.
2936 */
2937 vm_page_unlock(m);
2938 vm_page_lock(mnew);
2939 vm_page_free(mnew);
2940 vm_page_unlock(mnew);
2941 vm_page_insert(m, object, pindex);
2942 } else { /* clear COW & copy page */
2943 if (!so_zerocp_fullpage)
2944 pmap_copy_page(m, mnew);
2945 mnew->valid = VM_PAGE_BITS_ALL;
2946 vm_page_dirty(mnew);
2947 mnew->wire_count = m->wire_count - m->cow;
2948 m->wire_count = m->cow;
2949 vm_page_unlock(m);
2950 }
2951}
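/*
 * Illustrative accounting example for the copy path above: if the
 * original page was wired three times in total and two of those
 * references belong to in-flight zero-copy sends (m->cow == 2), the
 * replacement page ends up with mnew->wire_count == 1 (the wirings
 * that are not the COW mechanism's responsibility) while the original
 * keeps m->wire_count == 2 until the in-flight sends complete.
 */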
2952
2953void
2954vm_page_cowclear(vm_page_t m)
2955{
2956
2957 vm_page_lock_assert(m, MA_OWNED);
2958 if (m->cow) {
2959 m->cow--;
2960 /*
2961 * let vm_fault add back write permission lazily
2962 */
2963 }
2964 /*
2965 * sf_buf_free() will free the page, so we needn't do it here
2966 */
2967}
2968
2969int
2970vm_page_cowsetup(vm_page_t m)
2971{
2972
2973 vm_page_lock_assert(m, MA_OWNED);
2974 if ((m->flags & PG_FICTITIOUS) != 0 ||
2975 (m->oflags & VPO_UNMANAGED) != 0 ||
2976 m->cow == USHRT_MAX - 1 || !VM_OBJECT_TRYLOCK(m->object))
2977 return (EBUSY);
2978 m->cow++;
2979 pmap_remove_write(m);
2980 VM_OBJECT_UNLOCK(m->object);
2981 return (0);
2982}
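/*
 * Rough lifecycle sketch for the three COW routines above
 * (illustrative; the zero-copy socket send path is the expected
 * caller):
 *
 *	vm_page_cowsetup(m)	- at transmit time: bump m->cow and
 *				  write-protect the page's mappings.
 *	vm_page_cowfault(m)	- if the process writes to the page
 *				  before transmission finishes, give it
 *				  a private copy and leave the original
 *				  to the pending sends.
 *	vm_page_cowclear(m)	- at transmit completion: drop one COW
 *				  reference; write access comes back
 *				  lazily via vm_fault().
 */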
2983
2984#ifdef INVARIANTS
2985void
2986vm_page_object_lock_assert(vm_page_t m)
2987{
2988
2989 /*
2990 * Certain of the page's fields may only be modified by the
2991 * holder of the containing object's lock or the setter of the
2992 * page's VPO_BUSY flag. Unfortunately, the setter of the
2993 * VPO_BUSY flag is not recorded, and thus cannot be checked
2994 * here.
2995 */
2996 if (m->object != NULL && (m->oflags & VPO_BUSY) == 0)
2997 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2998}
2999#endif
3000
3001#include "opt_ddb.h"
3002#ifdef DDB
3003#include <sys/kernel.h>
3004
3005#include <ddb/ddb.h>
3006
3007DB_SHOW_COMMAND(page, vm_page_print_page_info)
3008{
3009 db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
3010 db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
3011 db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
3012 db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
3013 db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
3014 db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
3015 db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
3016 db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
3017 db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
3018 db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
3019}
3020
3021DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
3022{
3023
3024 db_printf("PQ_FREE:");
3025 db_printf(" %d", cnt.v_free_count);
3026 db_printf("\n");
3027
3028 db_printf("PQ_CACHE:");
3029 db_printf(" %d", cnt.v_cache_count);
3030 db_printf("\n");
3031
3032 db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
3033 *vm_page_queues[PQ_ACTIVE].cnt,
3034 *vm_page_queues[PQ_INACTIVE].cnt);
3035}
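/*
 * Example usage from the in-kernel debugger prompt (illustrative; the
 * counts printed are whatever the running system reports):
 *
 *	db> show page
 *	cnt.v_free_count: ...
 *	db> show pageq
 *	PQ_FREE: ...
 */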
3036#endif /* DDB */