vm_phys.c revision 217916
1170477Salc/*-
2170477Salc * Copyright (c) 2002-2006 Rice University
3170477Salc * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
4170477Salc * All rights reserved.
5170477Salc *
6170477Salc * This software was developed for the FreeBSD Project by Alan L. Cox,
7170477Salc * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
8170477Salc *
9170477Salc * Redistribution and use in source and binary forms, with or without
10170477Salc * modification, are permitted provided that the following conditions
11170477Salc * are met:
12170477Salc * 1. Redistributions of source code must retain the above copyright
13170477Salc *    notice, this list of conditions and the following disclaimer.
14170477Salc * 2. Redistributions in binary form must reproduce the above copyright
15170477Salc *    notice, this list of conditions and the following disclaimer in the
16170477Salc *    documentation and/or other materials provided with the distribution.
17170477Salc *
18170477Salc * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19170477Salc * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20170477Salc * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21170477Salc * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
22170477Salc * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23170477Salc * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24170477Salc * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
25170477Salc * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26170477Salc * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27170477Salc * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
28170477Salc * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29170477Salc * POSSIBILITY OF SUCH DAMAGE.
30170477Salc */
31170477Salc
32170477Salc#include <sys/cdefs.h>
33170477Salc__FBSDID("$FreeBSD: head/sys/vm/vm_phys.c 217916 2011-01-27 00:34:12Z mdf $");
34170477Salc
35170477Salc#include "opt_ddb.h"
36170477Salc
37170477Salc#include <sys/param.h>
38170477Salc#include <sys/systm.h>
39170477Salc#include <sys/lock.h>
40170477Salc#include <sys/kernel.h>
41170477Salc#include <sys/malloc.h>
42170477Salc#include <sys/mutex.h>
43170477Salc#include <sys/queue.h>
44170477Salc#include <sys/sbuf.h>
45170477Salc#include <sys/sysctl.h>
46170477Salc#include <sys/vmmeter.h>
47172317Salc#include <sys/vnode.h>
48170477Salc
49170477Salc#include <ddb/ddb.h>
50170477Salc
51170477Salc#include <vm/vm.h>
52170477Salc#include <vm/vm_param.h>
53170477Salc#include <vm/vm_kern.h>
54170477Salc#include <vm/vm_object.h>
55170477Salc#include <vm/vm_page.h>
56170477Salc#include <vm/vm_phys.h>
57177956Salc#include <vm/vm_reserv.h>
58170477Salc
59210550Sjhb/*
60210550Sjhb * VM_FREELIST_DEFAULT is split into VM_NDOMAIN lists, one for each
61210550Sjhb * domain.  These extra lists are stored at the end of the regular
62210550Sjhb * free lists starting with VM_NFREELIST.
63210550Sjhb */
64210550Sjhb#define VM_RAW_NFREELIST	(VM_NFREELIST + VM_NDOMAIN - 1)
65210550Sjhb
/*
 * A single free queue.  The allocator keeps one of these for every
 * (free list, pool, order) combination.
 */
66170477Salcstruct vm_freelist {
67170477Salc	struct pglist pl;	/* queued free blocks, linked through "pageq" */
68170477Salc	int lcnt;		/* number of blocks currently on "pl" */
69170477Salc};
70170477Salc
/*
 * A physical memory segment: the address range [start, end) together
 * with the free queues that its pages are returned to.
 */
71170477Salcstruct vm_phys_seg {
72170477Salc	vm_paddr_t	start;		/* first physical address (inclusive) */
73170477Salc	vm_paddr_t	end;		/* last physical address (exclusive) */
74170477Salc	vm_page_t	first_page;	/* vm_page describing "start" */
75210550Sjhb	int		domain;		/* memory domain given at creation */
	/* [pool][order] queues of the free list backing this segment */
76170477Salc	struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
77170477Salc};
78170477Salc
/*
 * Optional memory affinity table consulted by vm_phys_create_seg().
 * When it is NULL every segment is created in domain 0.  The table is
 * scanned until an entry whose "end" is 0 is found.
 */
79210550Sjhbstruct mem_affinity *mem_affinity;
80210550Sjhb
/* Segment table; only the first vm_phys_nsegs entries are in use. */
81170477Salcstatic struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
82170477Salc
/* Number of segments created so far by _vm_phys_create_seg(). */
83170477Salcstatic int vm_phys_nsegs;
84170477Salc
/* The free queues themselves, indexed by [free list][pool][order]. */
85170477Salcstatic struct vm_freelist
86210550Sjhb    vm_phys_free_queues[VM_RAW_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
/*
 * Per-domain lookup table: entry [domain][n] points at the n-th free list
 * (a [pool][order] array within vm_phys_free_queues) that allocations made
 * on behalf of "domain" should try.  Filled in by vm_phys_init().
 */
87210550Sjhbstatic struct vm_freelist
88210550Sjhb(*vm_phys_lookup_lists[VM_NDOMAIN][VM_RAW_NFREELIST])[VM_NFREEPOOL][VM_NFREEORDER];
89170477Salc
/*
 * Number of free lists in use.  Grows during initialization as segments
 * are assigned to higher-numbered free lists.
 */
90170477Salcstatic int vm_nfreelists = VM_FREELIST_DEFAULT + 1;
91170477Salc
/* Incremented by vm_phys_zero_pages_idle() for every page it zeroes. */
92170477Salcstatic int cnt_prezero;
93170477SalcSYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
94170477Salc    &cnt_prezero, 0, "The number of physical pages prezeroed at idle time");
95170477Salc
/* vm.phys_free: read-only text dump of the per-order free queue counts. */
96170477Salcstatic int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
97170477SalcSYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
98170477Salc    NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");
99170477Salc
/* vm.phys_segs: read-only text dump of the physical segment table. */
100170477Salcstatic int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
101170477SalcSYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
102170477Salc    NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
103170477Salc
104210550Sjhb#if VM_NDOMAIN > 1
/* vm.phys_lookup_lists: read-only dump of the per-domain lookup lists. */
105210550Sjhbstatic int sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS);
106210550SjhbSYSCTL_OID(_vm, OID_AUTO, phys_lookup_lists, CTLTYPE_STRING | CTLFLAG_RD,
107210550Sjhb    NULL, 0, sysctl_vm_phys_lookup_lists, "A", "Phys Lookup Lists");
108210550Sjhb#endif
109210550Sjhb
/* Forward declarations of file-local helpers. */
110210550Sjhbstatic void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind,
111210550Sjhb    int domain);
112170477Salcstatic void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind);
113170477Salcstatic int vm_phys_paddr_to_segind(vm_paddr_t pa);
114170477Salcstatic void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
115170477Salc    int order);
116170477Salc
117170477Salc/*
118170477Salc * Outputs the state of the physical memory allocator, specifically,
119170477Salc * the amount of physical memory in each free list.
120170477Salc */
121170477Salcstatic int
122170477Salcsysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
123170477Salc{
124170477Salc	struct sbuf sbuf;
125170477Salc	struct vm_freelist *fl;
126170477Salc	int error, flind, oind, pind;
127170477Salc
128217916Smdf	error = sysctl_wire_old_buffer(req, 0);
129217916Smdf	if (error != 0)
130217916Smdf		return (error);
131212750Smdf	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
132170477Salc	for (flind = 0; flind < vm_nfreelists; flind++) {
133170477Salc		sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
134170477Salc		    "\n  ORDER (SIZE)  |  NUMBER"
135170477Salc		    "\n              ", flind);
136170477Salc		for (pind = 0; pind < VM_NFREEPOOL; pind++)
137170477Salc			sbuf_printf(&sbuf, "  |  POOL %d", pind);
138170477Salc		sbuf_printf(&sbuf, "\n--            ");
139170477Salc		for (pind = 0; pind < VM_NFREEPOOL; pind++)
140170477Salc			sbuf_printf(&sbuf, "-- --      ");
141170477Salc		sbuf_printf(&sbuf, "--\n");
142170477Salc		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
143214564Salc			sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
144170477Salc			    1 << (PAGE_SHIFT - 10 + oind));
145170477Salc			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
146170477Salc				fl = vm_phys_free_queues[flind][pind];
147214564Salc				sbuf_printf(&sbuf, "  |  %6d", fl[oind].lcnt);
148170477Salc			}
149170477Salc			sbuf_printf(&sbuf, "\n");
150170477Salc		}
151170477Salc	}
152212750Smdf	error = sbuf_finish(&sbuf);
153170477Salc	sbuf_delete(&sbuf);
154170477Salc	return (error);
155170477Salc}
156170477Salc
157170477Salc/*
158170477Salc * Outputs the set of physical memory segments.
159170477Salc */
160170477Salcstatic int
161170477Salcsysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
162170477Salc{
163170477Salc	struct sbuf sbuf;
164170477Salc	struct vm_phys_seg *seg;
165170477Salc	int error, segind;
166170477Salc
167217916Smdf	error = sysctl_wire_old_buffer(req, 0);
168217916Smdf	if (error != 0)
169217916Smdf		return (error);
170212750Smdf	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
171170477Salc	for (segind = 0; segind < vm_phys_nsegs; segind++) {
172170477Salc		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
173170477Salc		seg = &vm_phys_segs[segind];
174170477Salc		sbuf_printf(&sbuf, "start:     %#jx\n",
175170477Salc		    (uintmax_t)seg->start);
176170477Salc		sbuf_printf(&sbuf, "end:       %#jx\n",
177170477Salc		    (uintmax_t)seg->end);
178210550Sjhb		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
179170477Salc		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
180170477Salc	}
181212750Smdf	error = sbuf_finish(&sbuf);
182170477Salc	sbuf_delete(&sbuf);
183170477Salc	return (error);
184170477Salc}
185170477Salc
#if VM_NDOMAIN > 1
/*
 * Sysctl handler that prints, for each memory domain, the ordered list
 * of free list pointers stored in vm_phys_lookup_lists[].
 *
 * Returns 0 on success or the error reported by the sysctl/sbuf layer.
 */
static int
sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sb;
	int dom, ndom, rc, slot;

	rc = sysctl_wire_old_buffer(req, 0);
	if (rc != 0)
		return (rc);
	sbuf_new_for_sysctl(&sb, NULL, 128, req);
	ndom = vm_nfreelists - VM_NFREELIST + 1;
	for (dom = 0; dom < ndom; dom++) {
		sbuf_printf(&sb, "\nDOMAIN %d:\n\n", dom);
		slot = 0;
		while (slot < vm_nfreelists) {
			sbuf_printf(&sb, "  [%d]:\t%p\n", slot,
			    vm_phys_lookup_lists[dom][slot]);
			slot++;
		}
	}
	rc = sbuf_finish(&sb);
	sbuf_delete(&sb);
	return (rc);
}
#endif
212210550Sjhb
213210550Sjhb/*
214170477Salc * Create a physical memory segment.
215170477Salc */
216170477Salcstatic void
217210550Sjhb_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind, int domain)
218170477Salc{
219170477Salc	struct vm_phys_seg *seg;
220170477Salc#ifdef VM_PHYSSEG_SPARSE
221170477Salc	long pages;
222170477Salc	int segind;
223170477Salc
224170477Salc	pages = 0;
225170477Salc	for (segind = 0; segind < vm_phys_nsegs; segind++) {
226170477Salc		seg = &vm_phys_segs[segind];
227170477Salc		pages += atop(seg->end - seg->start);
228170477Salc	}
229170477Salc#endif
230170477Salc	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
231170477Salc	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
232170477Salc	seg = &vm_phys_segs[vm_phys_nsegs++];
233170477Salc	seg->start = start;
234170477Salc	seg->end = end;
235210550Sjhb	seg->domain = domain;
236170477Salc#ifdef VM_PHYSSEG_SPARSE
237170477Salc	seg->first_page = &vm_page_array[pages];
238170477Salc#else
239170477Salc	seg->first_page = PHYS_TO_VM_PAGE(start);
240170477Salc#endif
241210550Sjhb#if VM_NDOMAIN > 1
242210550Sjhb	if (flind == VM_FREELIST_DEFAULT && domain != 0) {
243210550Sjhb		flind = VM_NFREELIST + (domain - 1);
244210550Sjhb		if (flind >= vm_nfreelists)
245210550Sjhb			vm_nfreelists = flind + 1;
246210550Sjhb	}
247210550Sjhb#endif
248170477Salc	seg->free_queues = &vm_phys_free_queues[flind];
249170477Salc}
250170477Salc
251210550Sjhbstatic void
252210550Sjhbvm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
253210550Sjhb{
254210550Sjhb	int i;
255210550Sjhb
256210550Sjhb	if (mem_affinity == NULL) {
257210550Sjhb		_vm_phys_create_seg(start, end, flind, 0);
258210550Sjhb		return;
259210550Sjhb	}
260210550Sjhb
261210550Sjhb	for (i = 0;; i++) {
262210550Sjhb		if (mem_affinity[i].end == 0)
263210550Sjhb			panic("Reached end of affinity info");
264210550Sjhb		if (mem_affinity[i].end <= start)
265210550Sjhb			continue;
266210550Sjhb		if (mem_affinity[i].start > start)
267210550Sjhb			panic("No affinity info for start %jx",
268210550Sjhb			    (uintmax_t)start);
269210550Sjhb		if (mem_affinity[i].end >= end) {
270210550Sjhb			_vm_phys_create_seg(start, end, flind,
271210550Sjhb			    mem_affinity[i].domain);
272210550Sjhb			break;
273210550Sjhb		}
274210550Sjhb		_vm_phys_create_seg(start, mem_affinity[i].end, flind,
275210550Sjhb		    mem_affinity[i].domain);
276210550Sjhb		start = mem_affinity[i].end;
277210550Sjhb	}
278210550Sjhb}
279210550Sjhb
280170477Salc/*
281170477Salc * Initialize the physical memory allocator.
282170477Salc */
283170477Salcvoid
284170477Salcvm_phys_init(void)
285170477Salc{
286170477Salc	struct vm_freelist *fl;
287170477Salc	int flind, i, oind, pind;
288210550Sjhb#if VM_NDOMAIN > 1
289210550Sjhb	int ndomains, j;
290210550Sjhb#endif
291170477Salc
292170477Salc	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
293170477Salc#ifdef	VM_FREELIST_ISADMA
294170477Salc		if (phys_avail[i] < 16777216) {
295170477Salc			if (phys_avail[i + 1] > 16777216) {
296170477Salc				vm_phys_create_seg(phys_avail[i], 16777216,
297170477Salc				    VM_FREELIST_ISADMA);
298170477Salc				vm_phys_create_seg(16777216, phys_avail[i + 1],
299170477Salc				    VM_FREELIST_DEFAULT);
300170477Salc			} else {
301170477Salc				vm_phys_create_seg(phys_avail[i],
302170477Salc				    phys_avail[i + 1], VM_FREELIST_ISADMA);
303170477Salc			}
304170477Salc			if (VM_FREELIST_ISADMA >= vm_nfreelists)
305170477Salc				vm_nfreelists = VM_FREELIST_ISADMA + 1;
306170477Salc		} else
307170477Salc#endif
308170477Salc#ifdef	VM_FREELIST_HIGHMEM
309170477Salc		if (phys_avail[i + 1] > VM_HIGHMEM_ADDRESS) {
310170477Salc			if (phys_avail[i] < VM_HIGHMEM_ADDRESS) {
311170477Salc				vm_phys_create_seg(phys_avail[i],
312170477Salc				    VM_HIGHMEM_ADDRESS, VM_FREELIST_DEFAULT);
313170477Salc				vm_phys_create_seg(VM_HIGHMEM_ADDRESS,
314170477Salc				    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
315170477Salc			} else {
316170477Salc				vm_phys_create_seg(phys_avail[i],
317170477Salc				    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
318170477Salc			}
319170477Salc			if (VM_FREELIST_HIGHMEM >= vm_nfreelists)
320170477Salc				vm_nfreelists = VM_FREELIST_HIGHMEM + 1;
321170477Salc		} else
322170477Salc#endif
323170477Salc		vm_phys_create_seg(phys_avail[i], phys_avail[i + 1],
324170477Salc		    VM_FREELIST_DEFAULT);
325170477Salc	}
326170477Salc	for (flind = 0; flind < vm_nfreelists; flind++) {
327170477Salc		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
328170477Salc			fl = vm_phys_free_queues[flind][pind];
329170477Salc			for (oind = 0; oind < VM_NFREEORDER; oind++)
330170477Salc				TAILQ_INIT(&fl[oind].pl);
331170477Salc		}
332170477Salc	}
333210550Sjhb#if VM_NDOMAIN > 1
334210550Sjhb	/*
335210550Sjhb	 * Build a free list lookup list for each domain.  All of the
336210550Sjhb	 * memory domain lists are inserted at the VM_FREELIST_DEFAULT
337210550Sjhb	 * index in a round-robin order starting with the current
338210550Sjhb	 * domain.
339210550Sjhb	 */
340210550Sjhb	ndomains = vm_nfreelists - VM_NFREELIST + 1;
341210550Sjhb	for (flind = 0; flind < VM_FREELIST_DEFAULT; flind++)
342210550Sjhb		for (i = 0; i < ndomains; i++)
343210550Sjhb			vm_phys_lookup_lists[i][flind] =
344210550Sjhb			    &vm_phys_free_queues[flind];
345210550Sjhb	for (i = 0; i < ndomains; i++)
346210550Sjhb		for (j = 0; j < ndomains; j++) {
347210550Sjhb			flind = (i + j) % ndomains;
348210550Sjhb			if (flind == 0)
349210550Sjhb				flind = VM_FREELIST_DEFAULT;
350210550Sjhb			else
351210550Sjhb				flind += VM_NFREELIST - 1;
352210550Sjhb			vm_phys_lookup_lists[i][VM_FREELIST_DEFAULT + j] =
353210550Sjhb			    &vm_phys_free_queues[flind];
354210550Sjhb		}
355210550Sjhb	for (flind = VM_FREELIST_DEFAULT + 1; flind < VM_NFREELIST;
356210550Sjhb	     flind++)
357210550Sjhb		for (i = 0; i < ndomains; i++)
358210550Sjhb			vm_phys_lookup_lists[i][flind + ndomains - 1] =
359210550Sjhb			    &vm_phys_free_queues[flind];
360210550Sjhb#else
361210550Sjhb	for (flind = 0; flind < vm_nfreelists; flind++)
362210550Sjhb		vm_phys_lookup_lists[0][flind] = &vm_phys_free_queues[flind];
363210550Sjhb#endif
364170477Salc}
365170477Salc
366170477Salc/*
367170477Salc * Split a contiguous, power of two-sized set of physical pages.
368170477Salc */
369170477Salcstatic __inline void
370170477Salcvm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
371170477Salc{
372170477Salc	vm_page_t m_buddy;
373170477Salc
374170477Salc	while (oind > order) {
375170477Salc		oind--;
376170477Salc		m_buddy = &m[1 << oind];
377170477Salc		KASSERT(m_buddy->order == VM_NFREEORDER,
378170477Salc		    ("vm_phys_split_pages: page %p has unexpected order %d",
379170477Salc		    m_buddy, m_buddy->order));
380170477Salc		m_buddy->order = oind;
381170477Salc		TAILQ_INSERT_HEAD(&fl[oind].pl, m_buddy, pageq);
382170477Salc		fl[oind].lcnt++;
383170477Salc        }
384170477Salc}
385170477Salc
386170477Salc/*
387170477Salc * Initialize a physical page and add it to the free lists.
388170477Salc */
389170477Salcvoid
390170477Salcvm_phys_add_page(vm_paddr_t pa)
391170477Salc{
392170477Salc	vm_page_t m;
393170477Salc
394170477Salc	cnt.v_page_count++;
395170477Salc	m = vm_phys_paddr_to_vm_page(pa);
396170477Salc	m->phys_addr = pa;
397217508Salc	m->queue = PQ_NONE;
398170477Salc	m->segind = vm_phys_paddr_to_segind(pa);
399170477Salc	m->flags = PG_FREE;
400170477Salc	KASSERT(m->order == VM_NFREEORDER,
401170477Salc	    ("vm_phys_add_page: page %p has unexpected order %d",
402170477Salc	    m, m->order));
403170477Salc	m->pool = VM_FREEPOOL_DEFAULT;
404170477Salc	pmap_page_init(m);
405171451Salc	mtx_lock(&vm_page_queue_free_mtx);
406172317Salc	cnt.v_free_count++;
407170477Salc	vm_phys_free_pages(m, 0);
408171451Salc	mtx_unlock(&vm_page_queue_free_mtx);
409170477Salc}
410170477Salc
411170477Salc/*
412170477Salc * Allocate a contiguous, power of two-sized set of physical pages
413170477Salc * from the free lists.
414171451Salc *
415171451Salc * The free page queues must be locked.
416170477Salc */
417170477Salcvm_page_t
418170477Salcvm_phys_alloc_pages(int pool, int order)
419170477Salc{
420210327Sjchandra	vm_page_t m;
421210327Sjchandra	int flind;
422210327Sjchandra
423210327Sjchandra	for (flind = 0; flind < vm_nfreelists; flind++) {
424210327Sjchandra		m = vm_phys_alloc_freelist_pages(flind, pool, order);
425210327Sjchandra		if (m != NULL)
426210327Sjchandra			return (m);
427210327Sjchandra	}
428210327Sjchandra	return (NULL);
429210327Sjchandra}
430210327Sjchandra
431210327Sjchandra/*
432210327Sjchandra * Find and dequeue a free page on the given free list, with the
433210327Sjchandra * specified pool and order
434210327Sjchandra */
435210327Sjchandravm_page_t
436210327Sjchandravm_phys_alloc_freelist_pages(int flind, int pool, int order)
437210327Sjchandra{
438170477Salc	struct vm_freelist *fl;
439170477Salc	struct vm_freelist *alt;
440210550Sjhb	int domain, oind, pind;
441170477Salc	vm_page_t m;
442170477Salc
443210327Sjchandra	KASSERT(flind < VM_NFREELIST,
444210327Sjchandra	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range", flind));
445170477Salc	KASSERT(pool < VM_NFREEPOOL,
446210327Sjchandra	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
447170477Salc	KASSERT(order < VM_NFREEORDER,
448210327Sjchandra	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
449210550Sjhb
450210550Sjhb#if VM_NDOMAIN > 1
451210550Sjhb	domain = PCPU_GET(domain);
452210550Sjhb#else
453210550Sjhb	domain = 0;
454210550Sjhb#endif
455170477Salc	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
456210550Sjhb	fl = (*vm_phys_lookup_lists[domain][flind])[pool];
457210327Sjchandra	for (oind = order; oind < VM_NFREEORDER; oind++) {
458210327Sjchandra		m = TAILQ_FIRST(&fl[oind].pl);
459210327Sjchandra		if (m != NULL) {
460210327Sjchandra			TAILQ_REMOVE(&fl[oind].pl, m, pageq);
461210327Sjchandra			fl[oind].lcnt--;
462210327Sjchandra			m->order = VM_NFREEORDER;
463210327Sjchandra			vm_phys_split_pages(m, oind, fl, order);
464210327Sjchandra			return (m);
465210327Sjchandra		}
466210327Sjchandra	}
467210327Sjchandra
468210327Sjchandra	/*
469210327Sjchandra	 * The given pool was empty.  Find the largest
470210327Sjchandra	 * contiguous, power-of-two-sized set of pages in any
471210327Sjchandra	 * pool.  Transfer these pages to the given pool, and
472210327Sjchandra	 * use them to satisfy the allocation.
473210327Sjchandra	 */
474210327Sjchandra	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
475210327Sjchandra		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
476210550Sjhb			alt = (*vm_phys_lookup_lists[domain][flind])[pind];
477210327Sjchandra			m = TAILQ_FIRST(&alt[oind].pl);
478170477Salc			if (m != NULL) {
479210327Sjchandra				TAILQ_REMOVE(&alt[oind].pl, m, pageq);
480210327Sjchandra				alt[oind].lcnt--;
481170477Salc				m->order = VM_NFREEORDER;
482210327Sjchandra				vm_phys_set_pool(pool, m, oind);
483170477Salc				vm_phys_split_pages(m, oind, fl, order);
484170477Salc				return (m);
485170477Salc			}
486170477Salc		}
487170477Salc	}
488170477Salc	return (NULL);
489170477Salc}
490170477Salc
491170477Salc/*
492170477Salc * Allocate physical memory from phys_avail[].
493170477Salc */
494170477Salcvm_paddr_t
495170477Salcvm_phys_bootstrap_alloc(vm_size_t size, unsigned long alignment)
496170477Salc{
497170477Salc	vm_paddr_t pa;
498170477Salc	int i;
499170477Salc
500170477Salc	size = round_page(size);
501170477Salc	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
502170477Salc		if (phys_avail[i + 1] - phys_avail[i] < size)
503170477Salc			continue;
504170477Salc		pa = phys_avail[i];
505170477Salc		phys_avail[i] += size;
506170477Salc		return (pa);
507170477Salc	}
508170477Salc	panic("vm_phys_bootstrap_alloc");
509170477Salc}
510170477Salc
511170477Salc/*
512170477Salc * Find the vm_page corresponding to the given physical address.
513170477Salc */
514170477Salcvm_page_t
515170477Salcvm_phys_paddr_to_vm_page(vm_paddr_t pa)
516170477Salc{
517170477Salc	struct vm_phys_seg *seg;
518170477Salc	int segind;
519170477Salc
520170477Salc	for (segind = 0; segind < vm_phys_nsegs; segind++) {
521170477Salc		seg = &vm_phys_segs[segind];
522170477Salc		if (pa >= seg->start && pa < seg->end)
523170477Salc			return (&seg->first_page[atop(pa - seg->start)]);
524170477Salc	}
525194459Sthompsa	return (NULL);
526170477Salc}
527170477Salc
528170477Salc/*
529170477Salc * Find the segment containing the given physical address.
530170477Salc */
531170477Salcstatic int
532170477Salcvm_phys_paddr_to_segind(vm_paddr_t pa)
533170477Salc{
534170477Salc	struct vm_phys_seg *seg;
535170477Salc	int segind;
536170477Salc
537170477Salc	for (segind = 0; segind < vm_phys_nsegs; segind++) {
538170477Salc		seg = &vm_phys_segs[segind];
539170477Salc		if (pa >= seg->start && pa < seg->end)
540170477Salc			return (segind);
541170477Salc	}
542170477Salc	panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment" ,
543170477Salc	    (uintmax_t)pa);
544170477Salc}
545170477Salc
546170477Salc/*
547170477Salc * Free a contiguous, power of two-sized set of physical pages.
548171451Salc *
549171451Salc * The free page queues must be locked.
550170477Salc */
551170477Salcvoid
552170477Salcvm_phys_free_pages(vm_page_t m, int order)
553170477Salc{
554170477Salc	struct vm_freelist *fl;
555170477Salc	struct vm_phys_seg *seg;
556170477Salc	vm_paddr_t pa, pa_buddy;
557170477Salc	vm_page_t m_buddy;
558170477Salc
559170477Salc	KASSERT(m->order == VM_NFREEORDER,
560171451Salc	    ("vm_phys_free_pages: page %p has unexpected order %d",
561170477Salc	    m, m->order));
562170477Salc	KASSERT(m->pool < VM_NFREEPOOL,
563171451Salc	    ("vm_phys_free_pages: page %p has unexpected pool %d",
564170477Salc	    m, m->pool));
565170477Salc	KASSERT(order < VM_NFREEORDER,
566171451Salc	    ("vm_phys_free_pages: order %d is out of range", order));
567170477Salc	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
568170477Salc	pa = VM_PAGE_TO_PHYS(m);
569170477Salc	seg = &vm_phys_segs[m->segind];
570170477Salc	while (order < VM_NFREEORDER - 1) {
571170477Salc		pa_buddy = pa ^ (1 << (PAGE_SHIFT + order));
572170477Salc		if (pa_buddy < seg->start ||
573170477Salc		    pa_buddy >= seg->end)
574170477Salc			break;
575170477Salc		m_buddy = &seg->first_page[atop(pa_buddy - seg->start)];
576170477Salc		if (m_buddy->order != order)
577170477Salc			break;
578170477Salc		fl = (*seg->free_queues)[m_buddy->pool];
579170477Salc		TAILQ_REMOVE(&fl[m_buddy->order].pl, m_buddy, pageq);
580170477Salc		fl[m_buddy->order].lcnt--;
581170477Salc		m_buddy->order = VM_NFREEORDER;
582170477Salc		if (m_buddy->pool != m->pool)
583170477Salc			vm_phys_set_pool(m->pool, m_buddy, order);
584170477Salc		order++;
585170477Salc		pa &= ~((1 << (PAGE_SHIFT + order)) - 1);
586170477Salc		m = &seg->first_page[atop(pa - seg->start)];
587170477Salc	}
588170477Salc	m->order = order;
589170477Salc	fl = (*seg->free_queues)[m->pool];
590170477Salc	TAILQ_INSERT_TAIL(&fl[order].pl, m, pageq);
591170477Salc	fl[order].lcnt++;
592170477Salc}
593170477Salc
594170477Salc/*
595170477Salc * Set the pool for a contiguous, power of two-sized set of physical pages.
596170477Salc */
597172317Salcvoid
598170477Salcvm_phys_set_pool(int pool, vm_page_t m, int order)
599170477Salc{
600170477Salc	vm_page_t m_tmp;
601170477Salc
602170477Salc	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
603170477Salc		m_tmp->pool = pool;
604170477Salc}
605170477Salc
606170477Salc/*
607174825Salc * Search for the given physical page "m" in the free lists.  If the search
608174825Salc * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
609174825Salc * FALSE, indicating that "m" is not in the free lists.
610172317Salc *
611172317Salc * The free page queues must be locked.
612170477Salc */
613174821Salcboolean_t
614172317Salcvm_phys_unfree_page(vm_page_t m)
615172317Salc{
616172317Salc	struct vm_freelist *fl;
617172317Salc	struct vm_phys_seg *seg;
618172317Salc	vm_paddr_t pa, pa_half;
619172317Salc	vm_page_t m_set, m_tmp;
620172317Salc	int order;
621172317Salc
622172317Salc	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
623172317Salc
624172317Salc	/*
625172317Salc	 * First, find the contiguous, power of two-sized set of free
626172317Salc	 * physical pages containing the given physical page "m" and
627172317Salc	 * assign it to "m_set".
628172317Salc	 */
629172317Salc	seg = &vm_phys_segs[m->segind];
630172317Salc	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
631174799Salc	    order < VM_NFREEORDER - 1; ) {
632172317Salc		order++;
633172317Salc		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
634177932Salc		if (pa >= seg->start)
635174821Salc			m_set = &seg->first_page[atop(pa - seg->start)];
636174821Salc		else
637174821Salc			return (FALSE);
638172317Salc	}
639174821Salc	if (m_set->order < order)
640174821Salc		return (FALSE);
641174821Salc	if (m_set->order == VM_NFREEORDER)
642174821Salc		return (FALSE);
643172317Salc	KASSERT(m_set->order < VM_NFREEORDER,
644172317Salc	    ("vm_phys_unfree_page: page %p has unexpected order %d",
645172317Salc	    m_set, m_set->order));
646172317Salc
647172317Salc	/*
648172317Salc	 * Next, remove "m_set" from the free lists.  Finally, extract
649172317Salc	 * "m" from "m_set" using an iterative algorithm: While "m_set"
650172317Salc	 * is larger than a page, shrink "m_set" by returning the half
651172317Salc	 * of "m_set" that does not contain "m" to the free lists.
652172317Salc	 */
653172317Salc	fl = (*seg->free_queues)[m_set->pool];
654172317Salc	order = m_set->order;
655172317Salc	TAILQ_REMOVE(&fl[order].pl, m_set, pageq);
656172317Salc	fl[order].lcnt--;
657172317Salc	m_set->order = VM_NFREEORDER;
658172317Salc	while (order > 0) {
659172317Salc		order--;
660172317Salc		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
661172317Salc		if (m->phys_addr < pa_half)
662172317Salc			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
663172317Salc		else {
664172317Salc			m_tmp = m_set;
665172317Salc			m_set = &seg->first_page[atop(pa_half - seg->start)];
666172317Salc		}
667172317Salc		m_tmp->order = order;
668172317Salc		TAILQ_INSERT_HEAD(&fl[order].pl, m_tmp, pageq);
669172317Salc		fl[order].lcnt++;
670172317Salc	}
671172317Salc	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
672174821Salc	return (TRUE);
673172317Salc}
674172317Salc
675172317Salc/*
676172317Salc * Try to zero one physical page.  Used by an idle priority thread.
677172317Salc */
678170477Salcboolean_t
679170477Salcvm_phys_zero_pages_idle(void)
680170477Salc{
681172317Salc	static struct vm_freelist *fl = vm_phys_free_queues[0][0];
682172317Salc	static int flind, oind, pind;
683170477Salc	vm_page_t m, m_tmp;
684170477Salc
685170477Salc	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
686172317Salc	for (;;) {
687172317Salc		TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, pageq) {
688172317Salc			for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
689172317Salc				if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
690172317Salc					vm_phys_unfree_page(m_tmp);
691172317Salc					cnt.v_free_count--;
692172317Salc					mtx_unlock(&vm_page_queue_free_mtx);
693172317Salc					pmap_zero_page_idle(m_tmp);
694172317Salc					m_tmp->flags |= PG_ZERO;
695172317Salc					mtx_lock(&vm_page_queue_free_mtx);
696172317Salc					cnt.v_free_count++;
697172317Salc					vm_phys_free_pages(m_tmp, 0);
698172317Salc					vm_page_zero_count++;
699172317Salc					cnt_prezero++;
700172317Salc					return (TRUE);
701170477Salc				}
702170477Salc			}
703170477Salc		}
704172317Salc		oind++;
705172317Salc		if (oind == VM_NFREEORDER) {
706172317Salc			oind = 0;
707172317Salc			pind++;
708172317Salc			if (pind == VM_NFREEPOOL) {
709172317Salc				pind = 0;
710172317Salc				flind++;
711172317Salc				if (flind == vm_nfreelists)
712172317Salc					flind = 0;
713172317Salc			}
714172317Salc			fl = vm_phys_free_queues[flind][pind];
715172317Salc		}
716170477Salc	}
717170477Salc}
718170477Salc
719170477Salc/*
720170818Salc * Allocate a contiguous set of physical pages of the given size
721170818Salc * "npages" from the free lists.  All of the physical pages must be at
722170818Salc * or above the given physical address "low" and below the given
723170818Salc * physical address "high".  The given value "alignment" determines the
724170818Salc * alignment of the first physical page in the set.  If the given value
725170818Salc * "boundary" is non-zero, then the set of physical pages cannot cross
726170818Salc * any physical address boundary that is a multiple of that value.  Both
727170477Salc * "alignment" and "boundary" must be a power of two.
728170477Salc */
729170477Salcvm_page_t
730170477Salcvm_phys_alloc_contig(unsigned long npages, vm_paddr_t low, vm_paddr_t high,
731195649Salc    unsigned long alignment, unsigned long boundary)
732170477Salc{
733170477Salc	struct vm_freelist *fl;
734170477Salc	struct vm_phys_seg *seg;
735210327Sjchandra	struct vnode *vp;
736170477Salc	vm_paddr_t pa, pa_last, size;
737194607Salc	vm_page_t deferred_vdrop_list, m, m_ret;
738210550Sjhb	int domain, flind, i, oind, order, pind;
739170477Salc
740210550Sjhb#if VM_NDOMAIN > 1
741210550Sjhb	domain = PCPU_GET(domain);
742210550Sjhb#else
743210550Sjhb	domain = 0;
744210550Sjhb#endif
745170477Salc	size = npages << PAGE_SHIFT;
746170477Salc	KASSERT(size != 0,
747170477Salc	    ("vm_phys_alloc_contig: size must not be 0"));
748170477Salc	KASSERT((alignment & (alignment - 1)) == 0,
749170477Salc	    ("vm_phys_alloc_contig: alignment must be a power of 2"));
750170477Salc	KASSERT((boundary & (boundary - 1)) == 0,
751170477Salc	    ("vm_phys_alloc_contig: boundary must be a power of 2"));
752194607Salc	deferred_vdrop_list = NULL;
753170477Salc	/* Compute the queue that is the best fit for npages. */
754170477Salc	for (order = 0; (1 << order) < npages; order++);
755170477Salc	mtx_lock(&vm_page_queue_free_mtx);
756177956Salc#if VM_NRESERVLEVEL > 0
757177956Salcretry:
758177956Salc#endif
759170477Salc	for (flind = 0; flind < vm_nfreelists; flind++) {
760170477Salc		for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
761170477Salc			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
762210550Sjhb				fl = (*vm_phys_lookup_lists[domain][flind])
763210550Sjhb				    [pind];
764170477Salc				TAILQ_FOREACH(m_ret, &fl[oind].pl, pageq) {
765170477Salc					/*
766170477Salc					 * A free list may contain physical pages
767170477Salc					 * from one or more segments.
768170477Salc					 */
769170477Salc					seg = &vm_phys_segs[m_ret->segind];
770170477Salc					if (seg->start > high ||
771170477Salc					    low >= seg->end)
772170477Salc						continue;
773170477Salc
774170477Salc					/*
775170477Salc					 * Is the size of this allocation request
776170477Salc					 * larger than the largest block size?
777170477Salc					 */
778170477Salc					if (order >= VM_NFREEORDER) {
779170477Salc						/*
780170477Salc						 * Determine if a sufficient number
781170477Salc						 * of subsequent blocks to satisfy
782170477Salc						 * the allocation request are free.
783170477Salc						 */
784170477Salc						pa = VM_PAGE_TO_PHYS(m_ret);
785170477Salc						pa_last = pa + size;
786170477Salc						for (;;) {
787170477Salc							pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1);
788170477Salc							if (pa >= pa_last)
789170477Salc								break;
790170477Salc							if (pa < seg->start ||
791170477Salc							    pa >= seg->end)
792170477Salc								break;
793170477Salc							m = &seg->first_page[atop(pa - seg->start)];
794170477Salc							if (m->order != VM_NFREEORDER - 1)
795170477Salc								break;
796170477Salc						}
797170477Salc						/* If not, continue to the next block. */
798170477Salc						if (pa < pa_last)
799170477Salc							continue;
800170477Salc					}
801170477Salc
802170477Salc					/*
803170477Salc					 * Determine if the blocks are within the given range,
804170477Salc					 * satisfy the given alignment, and do not cross the
805170477Salc					 * given boundary.
806170477Salc					 */
807170477Salc					pa = VM_PAGE_TO_PHYS(m_ret);
808170477Salc					if (pa >= low &&
809170477Salc					    pa + size <= high &&
810170477Salc					    (pa & (alignment - 1)) == 0 &&
811170477Salc					    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0)
812170477Salc						goto done;
813170477Salc				}
814170477Salc			}
815170477Salc		}
816170477Salc	}
817177956Salc#if VM_NRESERVLEVEL > 0
818177956Salc	if (vm_reserv_reclaim_contig(size, low, high, alignment, boundary))
819177956Salc		goto retry;
820177956Salc#endif
821170477Salc	mtx_unlock(&vm_page_queue_free_mtx);
822170477Salc	return (NULL);
823170477Salcdone:
824170477Salc	for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
825170477Salc		fl = (*seg->free_queues)[m->pool];
826170477Salc		TAILQ_REMOVE(&fl[m->order].pl, m, pageq);
827170477Salc		fl[m->order].lcnt--;
828170477Salc		m->order = VM_NFREEORDER;
829170477Salc	}
830170477Salc	if (m_ret->pool != VM_FREEPOOL_DEFAULT)
831170477Salc		vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
832170477Salc	fl = (*seg->free_queues)[m_ret->pool];
833170477Salc	vm_phys_split_pages(m_ret, oind, fl, order);
834170477Salc	for (i = 0; i < npages; i++) {
835170477Salc		m = &m_ret[i];
836210327Sjchandra		vp = vm_page_alloc_init(m);
837210327Sjchandra		if (vp != NULL) {
838210327Sjchandra			/*
839210327Sjchandra			 * Enqueue the vnode for deferred vdrop().
840210327Sjchandra			 *
841210327Sjchandra			 * Unmanaged pages don't use "pageq", so it
842210327Sjchandra			 * can be safely abused to construct a short-
843210327Sjchandra			 * lived queue of vnodes.
844210327Sjchandra			 */
845210327Sjchandra			m->pageq.tqe_prev = (void *)vp;
846210327Sjchandra			m->pageq.tqe_next = deferred_vdrop_list;
847210327Sjchandra			deferred_vdrop_list = m;
848172317Salc		}
849170477Salc	}
850170477Salc	for (; i < roundup2(npages, 1 << imin(oind, order)); i++) {
851170477Salc		m = &m_ret[i];
852170477Salc		KASSERT(m->order == VM_NFREEORDER,
853170477Salc		    ("vm_phys_alloc_contig: page %p has unexpected order %d",
854170477Salc		    m, m->order));
855171451Salc		vm_phys_free_pages(m, 0);
856170477Salc	}
857170477Salc	mtx_unlock(&vm_page_queue_free_mtx);
858194607Salc	while (deferred_vdrop_list != NULL) {
859194607Salc		vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
860194607Salc		deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next;
861194607Salc	}
862170477Salc	return (m_ret);
863170477Salc}
864170477Salc
#ifdef DDB
/*
 * Debugger command "freepages" (registered through DB_SHOW_COMMAND).
 *
 * For every free list, print a table with one row per block order
 * (largest order first) and one column per free pool.  Each row is
 * labelled with the order and the matching block size in kilobytes;
 * each cell holds the "lcnt" field of that order's queue.
 */
DB_SHOW_COMMAND(freepages, db_show_freepages)
{
	struct vm_freelist *entry;
	int list, order, pool;

	list = 0;
	while (list < vm_nfreelists) {
		/* Caption plus the start of the column-heading row. */
		db_printf("FREE LIST %d:\n"
		    "\n  ORDER (SIZE)  |  NUMBER"
		    "\n              ", list);
		pool = 0;
		while (pool < VM_NFREEPOOL) {
			db_printf("  |  POOL %d", pool);
			pool++;
		}

		/* Separator line between the headings and the data rows. */
		db_printf("\n--            ");
		pool = 0;
		while (pool < VM_NFREEPOOL) {
			db_printf("-- --      ");
			pool++;
		}
		db_printf("--\n");

		/* One row per order, walking from the largest down to 0. */
		order = VM_NFREEORDER;
		while (order-- > 0) {
			/* (page size << order), expressed in kilobytes. */
			db_printf("  %2.2d (%6.6dK)", order,
			    1 << (PAGE_SHIFT - 10 + order));
			pool = 0;
			while (pool < VM_NFREEPOOL) {
				entry = &vm_phys_free_queues[list][pool][order];
				db_printf("  |  %6.6d", entry->lcnt);
				pool++;
			}
			db_printf("\n");
		}
		db_printf("\n");
		list++;
	}
}
#endif
897