vm_phys.c revision 226928
/*-
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/vm/vm_phys.c 226928 2011-10-30 05:06:14Z alc $");

#include "opt_ddb.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_reserv.h>

/*
 * VM_FREELIST_DEFAULT is split into VM_NDOMAIN lists, one for each
 * domain.  These extra lists are stored at the end of the regular
 * free lists starting with VM_NFREELIST.
 */
#define VM_RAW_NFREELIST	(VM_NFREELIST + VM_NDOMAIN - 1)
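
/*
 * For example, with hypothetical values VM_NFREELIST == 2 and
 * VM_NDOMAIN == 3, VM_RAW_NFREELIST is 2 + 3 - 1 == 4: the two
 * regular free lists, plus one extra default list for each of the
 * two non-zero domains.
 */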

struct vm_freelist {
	struct pglist pl;
	int lcnt;
};

struct vm_phys_seg {
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
	int		domain;
	struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
};

struct mem_affinity *mem_affinity;

static struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];

static int vm_phys_nsegs;

static struct vm_freelist
    vm_phys_free_queues[VM_RAW_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
static struct vm_freelist
(*vm_phys_lookup_lists[VM_NDOMAIN][VM_RAW_NFREELIST])[VM_NFREEPOOL][VM_NFREEORDER];
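
/*
 * Each vm_phys_lookup_lists[domain] entry points at one of the raw
 * free queues above.  vm_phys_init() orders each domain's entries so
 * that an allocation tries that domain's own default queue first.
 */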

static int vm_nfreelists = VM_FREELIST_DEFAULT + 1;

static int cnt_prezero;
SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
    &cnt_prezero, 0, "The number of physical pages prezeroed at idle time");

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");

#if VM_NDOMAIN > 1
static int sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_lookup_lists, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_lookup_lists, "A", "Phys Lookup Lists");
#endif

static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind,
    int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind);
static int vm_phys_paddr_to_segind(vm_paddr_t pa);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order);

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int error, flind, oind, pind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (flind = 0; flind < vm_nfreelists; flind++) {
		sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
		    "\n  ORDER (SIZE)  |  NUMBER"
		    "\n              ", flind);
		for (pind = 0; pind < VM_NFREEPOOL; pind++)
			sbuf_printf(&sbuf, "  |  POOL %d", pind);
		sbuf_printf(&sbuf, "\n--            ");
		for (pind = 0; pind < VM_NFREEPOOL; pind++)
			sbuf_printf(&sbuf, "-- --      ");
		sbuf_printf(&sbuf, "--\n");
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
			sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
			    1 << (PAGE_SHIFT - 10 + oind));
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[flind][pind];
				sbuf_printf(&sbuf, "  |  %6d", fl[oind].lcnt);
			}
			sbuf_printf(&sbuf, "\n");
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

#if VM_NDOMAIN > 1
/*
 * Outputs the set of free list lookup lists.
 */
static int
sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	int domain, error, flind, ndomains;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	ndomains = vm_nfreelists - VM_NFREELIST + 1;
	for (domain = 0; domain < ndomains; domain++) {
		sbuf_printf(&sbuf, "\nDOMAIN %d:\n\n", domain);
		for (flind = 0; flind < vm_nfreelists; flind++)
			sbuf_printf(&sbuf, "  [%d]:\t%p\n", flind,
			    vm_phys_lookup_lists[domain][flind]);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
#endif

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind, int domain)
{
	struct vm_phys_seg *seg;
#ifdef VM_PHYSSEG_SPARSE
	long pages;
	int segind;

	pages = 0;
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		pages += atop(seg->end - seg->start);
	}
#endif
	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
	seg = &vm_phys_segs[vm_phys_nsegs++];
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
#ifdef VM_PHYSSEG_SPARSE
	seg->first_page = &vm_page_array[pages];
#else
	seg->first_page = PHYS_TO_VM_PAGE(start);
#endif
#if VM_NDOMAIN > 1
	if (flind == VM_FREELIST_DEFAULT && domain != 0) {
		flind = VM_NFREELIST + (domain - 1);
		if (flind >= vm_nfreelists)
			vm_nfreelists = flind + 1;
	}
#endif
	seg->free_queues = &vm_phys_free_queues[flind];
}

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
{
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, flind, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			_vm_phys_create_seg(start, end, flind,
			    mem_affinity[i].domain);
			break;
		}
		_vm_phys_create_seg(start, mem_affinity[i].end, flind,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
}

/*
 * Initialize the physical memory allocator.
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	int flind, i, oind, pind;
#if VM_NDOMAIN > 1
	int ndomains, j;
#endif

	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
#ifdef	VM_FREELIST_ISADMA
		if (phys_avail[i] < 16777216) {
			if (phys_avail[i + 1] > 16777216) {
				vm_phys_create_seg(phys_avail[i], 16777216,
				    VM_FREELIST_ISADMA);
				vm_phys_create_seg(16777216, phys_avail[i + 1],
				    VM_FREELIST_DEFAULT);
			} else {
				vm_phys_create_seg(phys_avail[i],
				    phys_avail[i + 1], VM_FREELIST_ISADMA);
			}
			if (VM_FREELIST_ISADMA >= vm_nfreelists)
				vm_nfreelists = VM_FREELIST_ISADMA + 1;
		} else
#endif
#ifdef	VM_FREELIST_HIGHMEM
		if (phys_avail[i + 1] > VM_HIGHMEM_ADDRESS) {
			if (phys_avail[i] < VM_HIGHMEM_ADDRESS) {
				vm_phys_create_seg(phys_avail[i],
				    VM_HIGHMEM_ADDRESS, VM_FREELIST_DEFAULT);
				vm_phys_create_seg(VM_HIGHMEM_ADDRESS,
				    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
			} else {
				vm_phys_create_seg(phys_avail[i],
				    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
			}
			if (VM_FREELIST_HIGHMEM >= vm_nfreelists)
				vm_nfreelists = VM_FREELIST_HIGHMEM + 1;
		} else
#endif
		vm_phys_create_seg(phys_avail[i], phys_avail[i + 1],
		    VM_FREELIST_DEFAULT);
	}
	for (flind = 0; flind < vm_nfreelists; flind++) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			fl = vm_phys_free_queues[flind][pind];
			for (oind = 0; oind < VM_NFREEORDER; oind++)
				TAILQ_INIT(&fl[oind].pl);
		}
	}
#if VM_NDOMAIN > 1
	/*
	 * Build a free list lookup list for each domain.  All of the
	 * memory domain lists are inserted at the VM_FREELIST_DEFAULT
	 * index in a round-robin order starting with the current
	 * domain.
	 */
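	/*
	 * For example, assuming hypothetical values VM_FREELIST_DEFAULT == 0
	 * and VM_NFREELIST == 2 with two domains, the raw queues are
	 * [0] = domain 0's default list, [1] = the extra regular list, and
	 * [2] = domain 1's default list, and the loops below produce
	 *
	 *	lookup_lists[0] = { raw 0, raw 2, raw 1 }
	 *	lookup_lists[1] = { raw 2, raw 0, raw 1 }
	 *
	 * so that each domain tries its own default queue first.
	 */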
	ndomains = vm_nfreelists - VM_NFREELIST + 1;
	for (flind = 0; flind < VM_FREELIST_DEFAULT; flind++)
		for (i = 0; i < ndomains; i++)
			vm_phys_lookup_lists[i][flind] =
			    &vm_phys_free_queues[flind];
	for (i = 0; i < ndomains; i++)
		for (j = 0; j < ndomains; j++) {
			flind = (i + j) % ndomains;
			if (flind == 0)
				flind = VM_FREELIST_DEFAULT;
			else
				flind += VM_NFREELIST - 1;
			vm_phys_lookup_lists[i][VM_FREELIST_DEFAULT + j] =
			    &vm_phys_free_queues[flind];
		}
	for (flind = VM_FREELIST_DEFAULT + 1; flind < VM_NFREELIST;
	     flind++)
		for (i = 0; i < ndomains; i++)
			vm_phys_lookup_lists[i][flind + ndomains - 1] =
			    &vm_phys_free_queues[flind];
#else
	for (flind = 0; flind < vm_nfreelists; flind++)
		vm_phys_lookup_lists[0][flind] = &vm_phys_free_queues[flind];
#endif
}

/*
 * Split a contiguous, power of two-sized set of physical pages.
 */
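/*
 * For example, splitting an order 3 block to satisfy an order 0 request
 * returns the buddy blocks of orders 2, 1, and 0 to "fl" and leaves the
 * caller holding the first page "m".
 */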
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
{
	vm_page_t m_buddy;

	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		m_buddy->order = oind;
		TAILQ_INSERT_HEAD(&fl[oind].pl, m_buddy, pageq);
		fl[oind].lcnt++;
	}
}

/*
 * Initialize a physical page and add it to the free lists.
 */
void
vm_phys_add_page(vm_paddr_t pa)
{
	vm_page_t m;

	cnt.v_page_count++;
	m = vm_phys_paddr_to_vm_page(pa);
	m->phys_addr = pa;
	m->queue = PQ_NONE;
	m->segind = vm_phys_paddr_to_segind(pa);
	m->flags = PG_FREE;
	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_add_page: page %p has unexpected order %d",
	    m, m->order));
	m->pool = VM_FREEPOOL_DEFAULT;
	pmap_page_init(m);
	mtx_lock(&vm_page_queue_free_mtx);
	cnt.v_free_count++;
	vm_phys_free_pages(m, 0);
	mtx_unlock(&vm_page_queue_free_mtx);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int pool, int order)
{
	vm_page_t m;
	int flind;

	for (flind = 0; flind < vm_nfreelists; flind++) {
		m = vm_phys_alloc_freelist_pages(flind, pool, order);
		if (m != NULL)
			return (m);
	}
	return (NULL);
}

/*
 * Find and dequeue a free page on the given free list, with the
 * specified pool and order.
 */
vm_page_t
vm_phys_alloc_freelist_pages(int flind, int pool, int order)
{
	struct vm_freelist *fl;
	struct vm_freelist *alt;
	int domain, oind, pind;
	vm_page_t m;

	KASSERT(flind < VM_NFREELIST,
	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range", flind));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

#if VM_NDOMAIN > 1
	domain = PCPU_GET(domain);
#else
	domain = 0;
#endif
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	fl = (*vm_phys_lookup_lists[domain][flind])[pool];
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		m = TAILQ_FIRST(&fl[oind].pl);
		if (m != NULL) {
			TAILQ_REMOVE(&fl[oind].pl, m, pageq);
			fl[oind].lcnt--;
			m->order = VM_NFREEORDER;
			vm_phys_split_pages(m, oind, fl, order);
			return (m);
		}
	}

	/*
	 * The given pool was empty.  Find the largest
	 * contiguous, power-of-two-sized set of pages in any
	 * pool.  Transfer these pages to the given pool, and
	 * use them to satisfy the allocation.
	 */
	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			alt = (*vm_phys_lookup_lists[domain][flind])[pind];
			m = TAILQ_FIRST(&alt[oind].pl);
			if (m != NULL) {
				TAILQ_REMOVE(&alt[oind].pl, m, pageq);
				alt[oind].lcnt--;
				m->order = VM_NFREEORDER;
				vm_phys_set_pool(pool, m, oind);
				vm_phys_split_pages(m, oind, fl, order);
				return (m);
			}
		}
	}
	return (NULL);
}

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;
	int segind;

	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (pa >= seg->start && pa < seg->end)
			return (&seg->first_page[atop(pa - seg->start)]);
	}
	return (NULL);
}

/*
 * Find the segment containing the given physical address.
 */
static int
vm_phys_paddr_to_segind(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;
	int segind;

	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (pa >= seg->start && pa < seg->end)
			return (segind);
	}
	panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment",
	    (uintmax_t)pa);
}

/*
 * Free a contiguous, power of two-sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_pages(vm_page_t m, int order)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa;
	vm_page_t m_buddy;

	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_free_pages: page %p has unexpected order %d",
	    m, m->order));
	KASSERT(m->pool < VM_NFREEPOOL,
	    ("vm_phys_free_pages: page %p has unexpected pool %d",
	    m, m->pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_free_pages: order %d is out of range", order));
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	seg = &vm_phys_segs[m->segind];
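	/*
	 * Coalesce with free buddies: the buddy of a 2^order page block
	 * at physical address "pa" lies at pa ^ (2^order * PAGE_SIZE),
	 * so toggling that single address bit moves between a block and
	 * its buddy.
	 */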
	if (order < VM_NFREEORDER - 1) {
		pa = VM_PAGE_TO_PHYS(m);
		do {
			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
			if (pa < seg->start || pa >= seg->end)
				break;
			m_buddy = &seg->first_page[atop(pa - seg->start)];
			if (m_buddy->order != order)
				break;
			fl = (*seg->free_queues)[m_buddy->pool];
			TAILQ_REMOVE(&fl[order].pl, m_buddy, pageq);
			fl[order].lcnt--;
			m_buddy->order = VM_NFREEORDER;
			if (m_buddy->pool != m->pool)
				vm_phys_set_pool(m->pool, m_buddy, order);
			order++;
			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
			m = &seg->first_page[atop(pa - seg->start)];
		} while (order < VM_NFREEORDER - 1);
	}
	m->order = order;
	fl = (*seg->free_queues)[m->pool];
	TAILQ_INSERT_TAIL(&fl[order].pl, m, pageq);
	fl[order].lcnt++;
}

/*
 * Free a contiguous, arbitrarily sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_contig(vm_page_t m, u_long npages)
{
	u_int n;
	int order;

	/*
	 * Avoid unnecessary coalescing by freeing the pages in the largest
	 * possible power-of-two-sized subsets.
	 */
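	/*
	 * For example, freeing 11 pages whose first page frame number is 4
	 * frees one order 2 block (the alignment then allows order 3, but
	 * only 7 pages remain), and the second loop below frees the residue
	 * as order 2, 1, and 0 blocks.
	 */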
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	for (;; npages -= n) {
		/*
		 * Unsigned "min" is used here so that "order" is assigned
		 * "VM_NFREEORDER - 1" when "m"'s physical address is zero
		 * or the low-order bits of its physical address are zero
		 * because the size of a physical address exceeds the size of
		 * a long.
		 */
		order = min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
		    VM_NFREEORDER - 1);
		n = 1 << order;
		if (npages < n)
			break;
		vm_phys_free_pages(m, order);
		m += n;
	}
	/* The residual "npages" is less than "1 << (VM_NFREEORDER - 1)". */
	for (; npages > 0; npages -= n) {
		order = flsl(npages) - 1;
		n = 1 << order;
		vm_phys_free_pages(m, order);
		m += n;
	}
}

/*
 * Set the pool for a contiguous, power of two-sized set of physical pages.
 */
void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
	vm_page_t m_tmp;

	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
		m_tmp->pool = pool;
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
 * FALSE, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
boolean_t
vm_phys_unfree_page(vm_page_t m)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa, pa_half;
	vm_page_t m_set, m_tmp;
	int order;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);

	/*
	 * First, find the contiguous, power of two-sized set of free
	 * physical pages containing the given physical page "m" and
	 * assign it to "m_set".
	 */
	seg = &vm_phys_segs[m->segind];
	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
	    order < VM_NFREEORDER - 1; ) {
		order++;
		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
		if (pa >= seg->start)
			m_set = &seg->first_page[atop(pa - seg->start)];
		else
			return (FALSE);
	}
	if (m_set->order < order)
		return (FALSE);
	if (m_set->order == VM_NFREEORDER)
		return (FALSE);
	KASSERT(m_set->order < VM_NFREEORDER,
	    ("vm_phys_unfree_page: page %p has unexpected order %d",
	    m_set, m_set->order));

	/*
	 * Next, remove "m_set" from the free lists.  Finally, extract
	 * "m" from "m_set" using an iterative algorithm: While "m_set"
	 * is larger than a page, shrink "m_set" by returning the half
	 * of "m_set" that does not contain "m" to the free lists.
	 */
	fl = (*seg->free_queues)[m_set->pool];
	order = m_set->order;
	TAILQ_REMOVE(&fl[order].pl, m_set, pageq);
	fl[order].lcnt--;
	m_set->order = VM_NFREEORDER;
	while (order > 0) {
		order--;
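		/*
		 * "m_set" is aligned to the old order, so the bit
		 * toggled below is clear and "pa_half" is the physical
		 * address of the upper half of "m_set".
		 */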
		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
		if (m->phys_addr < pa_half)
			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
		else {
			m_tmp = m_set;
			m_set = &seg->first_page[atop(pa_half - seg->start)];
		}
		m_tmp->order = order;
		TAILQ_INSERT_HEAD(&fl[order].pl, m_tmp, pageq);
		fl[order].lcnt++;
	}
	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
	return (TRUE);
}

/*
 * Try to zero one physical page.  Used by an idle priority thread.
 */
boolean_t
vm_phys_zero_pages_idle(void)
{
	static struct vm_freelist *fl = vm_phys_free_queues[0][0];
	static int flind, oind, pind;
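	/*
	 * The static variables above act as a cursor, preserving the
	 * scan position across calls so that successive invocations
	 * resume where the previous one left off.
	 */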
	vm_page_t m, m_tmp;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	for (;;) {
		TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, pageq) {
			for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
				if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
					vm_phys_unfree_page(m_tmp);
					cnt.v_free_count--;
					mtx_unlock(&vm_page_queue_free_mtx);
					pmap_zero_page_idle(m_tmp);
					m_tmp->flags |= PG_ZERO;
					mtx_lock(&vm_page_queue_free_mtx);
					cnt.v_free_count++;
					vm_phys_free_pages(m_tmp, 0);
					vm_page_zero_count++;
					cnt_prezero++;
					return (TRUE);
				}
			}
		}
		oind++;
		if (oind == VM_NFREEORDER) {
			oind = 0;
			pind++;
			if (pind == VM_NFREEPOOL) {
				pind = 0;
				flind++;
				if (flind == vm_nfreelists)
					flind = 0;
			}
			fl = vm_phys_free_queues[flind][pind];
		}
	}
}

/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 */
vm_page_t
vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	struct vnode *vp;
	vm_paddr_t pa, pa_last, size;
	vm_page_t deferred_vdrop_list, m, m_ret;
	u_long npages_end;
	int domain, flind, i, oind, order, pind;

#if VM_NDOMAIN > 1
	domain = PCPU_GET(domain);
#else
	domain = 0;
#endif
	size = npages << PAGE_SHIFT;
	KASSERT(size != 0,
	    ("vm_phys_alloc_contig: size must not be 0"));
	KASSERT((alignment & (alignment - 1)) == 0,
	    ("vm_phys_alloc_contig: alignment must be a power of 2"));
	KASSERT((boundary & (boundary - 1)) == 0,
	    ("vm_phys_alloc_contig: boundary must be a power of 2"));
	deferred_vdrop_list = NULL;
	/* Compute the queue that is the best fit for npages. */
	for (order = 0; (1 << order) < npages; order++);
	mtx_lock(&vm_page_queue_free_mtx);
#if VM_NRESERVLEVEL > 0
retry:
#endif
	for (flind = 0; flind < vm_nfreelists; flind++) {
		for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = (*vm_phys_lookup_lists[domain][flind])
				    [pind];
				TAILQ_FOREACH(m_ret, &fl[oind].pl, pageq) {
					/*
					 * A free list may contain physical pages
					 * from one or more segments.
					 */
					seg = &vm_phys_segs[m_ret->segind];
					if (seg->start > high ||
					    low >= seg->end)
						continue;

					/*
					 * Is the size of this allocation request
					 * larger than the largest block size?
					 */
					if (order >= VM_NFREEORDER) {
						/*
						 * Determine if a sufficient number
						 * of subsequent blocks to satisfy
						 * the allocation request are free.
						 */
						pa = VM_PAGE_TO_PHYS(m_ret);
						pa_last = pa + size;
						for (;;) {
							pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1);
							if (pa >= pa_last)
								break;
							if (pa < seg->start ||
							    pa >= seg->end)
								break;
							m = &seg->first_page[atop(pa - seg->start)];
							if (m->order != VM_NFREEORDER - 1)
								break;
						}
						/* If not, continue to the next block. */
						if (pa < pa_last)
							continue;
					}

					/*
					 * Determine if the blocks are within the given range,
					 * satisfy the given alignment, and do not cross the
					 * given boundary.
					 */
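					/*
					 * The boundary test relies on "pa" and
					 * "pa + size - 1" lying in the same
					 * "boundary"-sized region exactly when
					 * their high-order bits, selected by
					 * ~(boundary - 1), match; a zero
					 * "boundary" yields a zero mask and
					 * disables the check.
					 */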
					pa = VM_PAGE_TO_PHYS(m_ret);
					if (pa >= low &&
					    pa + size <= high &&
					    (pa & (alignment - 1)) == 0 &&
					    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0)
						goto done;
				}
			}
		}
	}
#if VM_NRESERVLEVEL > 0
	if (vm_reserv_reclaim_contig(size, low, high, alignment, boundary))
		goto retry;
#endif
	mtx_unlock(&vm_page_queue_free_mtx);
	return (NULL);
done:
	for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
		fl = (*seg->free_queues)[m->pool];
		TAILQ_REMOVE(&fl[m->order].pl, m, pageq);
		fl[m->order].lcnt--;
		m->order = VM_NFREEORDER;
	}
	if (m_ret->pool != VM_FREEPOOL_DEFAULT)
		vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
	fl = (*seg->free_queues)[m_ret->pool];
	vm_phys_split_pages(m_ret, oind, fl, order);
	for (i = 0; i < npages; i++) {
		m = &m_ret[i];
		vp = vm_page_alloc_init(m);
		if (vp != NULL) {
			/*
			 * Enqueue the vnode for deferred vdrop().
			 *
			 * Unmanaged pages don't use "pageq", so it
			 * can be safely abused to construct a short-
			 * lived queue of vnodes.
			 */
			m->pageq.tqe_prev = (void *)vp;
			m->pageq.tqe_next = deferred_vdrop_list;
			deferred_vdrop_list = m;
		}
	}
	/* Return excess pages to the free lists. */
	npages_end = roundup2(npages, 1 << imin(oind, order));
	if (npages < npages_end)
		vm_phys_free_contig(&m_ret[npages], npages_end - npages);
	mtx_unlock(&vm_page_queue_free_mtx);
	while (deferred_vdrop_list != NULL) {
		vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
		deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next;
	}
	return (m_ret);
}

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND(freepages, db_show_freepages)
{
	struct vm_freelist *fl;
	int flind, oind, pind;

	for (flind = 0; flind < vm_nfreelists; flind++) {
		db_printf("FREE LIST %d:\n"
		    "\n  ORDER (SIZE)  |  NUMBER"
		    "\n              ", flind);
		for (pind = 0; pind < VM_NFREEPOOL; pind++)
			db_printf("  |  POOL %d", pind);
		db_printf("\n--            ");
		for (pind = 0; pind < VM_NFREEPOOL; pind++)
			db_printf("-- --      ");
		db_printf("--\n");
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
			db_printf("  %2.2d (%6.6dK)", oind,
			    1 << (PAGE_SHIFT - 10 + oind));
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[flind][pind];
				db_printf("  |  %6.6d", fl[oind].lcnt);
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif