vm_phys.c revision 217508
/*-
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/vm/vm_phys.c 217508 2011-01-17 19:17:26Z alc $");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_reserv.h>

/*
 * VM_FREELIST_DEFAULT is split into VM_NDOMAIN lists, one for each
 * domain.  These extra lists are stored at the end of the regular
 * free lists starting with VM_NFREELIST.
 */
#define VM_RAW_NFREELIST	(VM_NFREELIST + VM_NDOMAIN - 1)
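/*
 * For example, with two memory domains one extra queue set exists at
 * index VM_NFREELIST; it holds the VM_FREELIST_DEFAULT memory that
 * belongs to domain 1, while domain 0's default memory remains at
 * index VM_FREELIST_DEFAULT.
 */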

struct vm_freelist {
	struct pglist pl;
	int lcnt;
};

struct vm_phys_seg {
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
	int		domain;
	struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
};

struct mem_affinity *mem_affinity;

static struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];

static int vm_phys_nsegs;

static struct vm_freelist
    vm_phys_free_queues[VM_RAW_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
static struct vm_freelist
(*vm_phys_lookup_lists[VM_NDOMAIN][VM_RAW_NFREELIST])[VM_NFREEPOOL][VM_NFREEORDER];

static int vm_nfreelists = VM_FREELIST_DEFAULT + 1;

static int cnt_prezero;
SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
    &cnt_prezero, 0, "The number of physical pages prezeroed at idle time");

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");

#if VM_NDOMAIN > 1
static int sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_lookup_lists, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_lookup_lists, "A", "Phys Lookup Lists");
#endif

static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind,
    int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind);
static int vm_phys_paddr_to_segind(vm_paddr_t pa);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order);

/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int error, flind, oind, pind;

	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (flind = 0; flind < vm_nfreelists; flind++) {
		sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
		    "\n  ORDER (SIZE)  |  NUMBER"
		    "\n              ", flind);
		for (pind = 0; pind < VM_NFREEPOOL; pind++)
			sbuf_printf(&sbuf, "  |  POOL %d", pind);
		sbuf_printf(&sbuf, "\n--            ");
		for (pind = 0; pind < VM_NFREEPOOL; pind++)
			sbuf_printf(&sbuf, "-- --      ");
		sbuf_printf(&sbuf, "--\n");
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
			sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
			    1 << (PAGE_SHIFT - 10 + oind));
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[flind][pind];
				sbuf_printf(&sbuf, "  |  %6d", fl[oind].lcnt);
			}
			sbuf_printf(&sbuf, "\n");
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

#if VM_NDOMAIN > 1
/*
 * Outputs the set of free list lookup lists.
 */
static int
sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	int domain, error, flind, ndomains;

	ndomains = vm_nfreelists - VM_NFREELIST + 1;
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (domain = 0; domain < ndomains; domain++) {
		sbuf_printf(&sbuf, "\nDOMAIN %d:\n\n", domain);
		for (flind = 0; flind < vm_nfreelists; flind++)
			sbuf_printf(&sbuf, "  [%d]:\t%p\n", flind,
			    vm_phys_lookup_lists[domain][flind]);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
#endif

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind, int domain)
{
	struct vm_phys_seg *seg;
#ifdef VM_PHYSSEG_SPARSE
	long pages;
	int segind;

	pages = 0;
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		pages += atop(seg->end - seg->start);
	}
#endif
	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
	seg = &vm_phys_segs[vm_phys_nsegs++];
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
#ifdef VM_PHYSSEG_SPARSE
	seg->first_page = &vm_page_array[pages];
#else
	seg->first_page = PHYS_TO_VM_PAGE(start);
#endif
#if VM_NDOMAIN > 1
	if (flind == VM_FREELIST_DEFAULT && domain != 0) {
		flind = VM_NFREELIST + (domain - 1);
		if (flind >= vm_nfreelists)
			vm_nfreelists = flind + 1;
	}
#endif
	seg->free_queues = &vm_phys_free_queues[flind];
}

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
{
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, flind, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			_vm_phys_create_seg(start, end, flind,
			    mem_affinity[i].domain);
			break;
		}
		_vm_phys_create_seg(start, mem_affinity[i].end, flind,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
}

/*
 * Initialize the physical memory allocator.
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	int flind, i, oind, pind;
#if VM_NDOMAIN > 1
	int ndomains, j;
#endif

	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
#ifdef	VM_FREELIST_ISADMA
		if (phys_avail[i] < 16777216) {
			if (phys_avail[i + 1] > 16777216) {
				vm_phys_create_seg(phys_avail[i], 16777216,
				    VM_FREELIST_ISADMA);
				vm_phys_create_seg(16777216, phys_avail[i + 1],
				    VM_FREELIST_DEFAULT);
			} else {
				vm_phys_create_seg(phys_avail[i],
				    phys_avail[i + 1], VM_FREELIST_ISADMA);
			}
			if (VM_FREELIST_ISADMA >= vm_nfreelists)
				vm_nfreelists = VM_FREELIST_ISADMA + 1;
		} else
#endif
#ifdef	VM_FREELIST_HIGHMEM
		if (phys_avail[i + 1] > VM_HIGHMEM_ADDRESS) {
			if (phys_avail[i] < VM_HIGHMEM_ADDRESS) {
				vm_phys_create_seg(phys_avail[i],
				    VM_HIGHMEM_ADDRESS, VM_FREELIST_DEFAULT);
				vm_phys_create_seg(VM_HIGHMEM_ADDRESS,
				    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
			} else {
				vm_phys_create_seg(phys_avail[i],
				    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
			}
			if (VM_FREELIST_HIGHMEM >= vm_nfreelists)
				vm_nfreelists = VM_FREELIST_HIGHMEM + 1;
		} else
#endif
		vm_phys_create_seg(phys_avail[i], phys_avail[i + 1],
		    VM_FREELIST_DEFAULT);
	}
	for (flind = 0; flind < vm_nfreelists; flind++) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			fl = vm_phys_free_queues[flind][pind];
			for (oind = 0; oind < VM_NFREEORDER; oind++)
				TAILQ_INIT(&fl[oind].pl);
		}
	}
#if VM_NDOMAIN > 1
	/*
	 * Build a free list lookup list for each domain.  All of the
	 * memory domain lists are inserted at the VM_FREELIST_DEFAULT
	 * index in a round-robin order starting with the current
	 * domain.
	 */
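	/*
	 * For example, with two domains, domain 0's lookup list tries
	 * its own default queues first and then domain 1's, while
	 * domain 1's lookup list tries the same two queue sets in the
	 * opposite order.  The remaining freelists keep a single entry
	 * per domain.
	 */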
	ndomains = vm_nfreelists - VM_NFREELIST + 1;
	for (flind = 0; flind < VM_FREELIST_DEFAULT; flind++)
		for (i = 0; i < ndomains; i++)
			vm_phys_lookup_lists[i][flind] =
			    &vm_phys_free_queues[flind];
	for (i = 0; i < ndomains; i++)
		for (j = 0; j < ndomains; j++) {
			flind = (i + j) % ndomains;
			if (flind == 0)
				flind = VM_FREELIST_DEFAULT;
			else
				flind += VM_NFREELIST - 1;
			vm_phys_lookup_lists[i][VM_FREELIST_DEFAULT + j] =
			    &vm_phys_free_queues[flind];
		}
	for (flind = VM_FREELIST_DEFAULT + 1; flind < VM_NFREELIST;
	     flind++)
		for (i = 0; i < ndomains; i++)
			vm_phys_lookup_lists[i][flind + ndomains - 1] =
			    &vm_phys_free_queues[flind];
#else
	for (flind = 0; flind < vm_nfreelists; flind++)
		vm_phys_lookup_lists[0][flind] = &vm_phys_free_queues[flind];
#endif
}

/*
 * Split a contiguous, power of two-sized set of physical pages.
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
{
	vm_page_t m_buddy;

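	/*
	 * Peel buddies off the high end of the block until it has been
	 * reduced to the requested order.  For example, when an order 3
	 * block satisfies an order 1 request, the upper order 2 buddy
	 * and then the upper order 1 buddy are returned to the free
	 * lists, leaving the first two pages for the caller.
	 */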
	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		m_buddy->order = oind;
		TAILQ_INSERT_HEAD(&fl[oind].pl, m_buddy, pageq);
		fl[oind].lcnt++;
	}
}

/*
 * Initialize a physical page and add it to the free lists.
 */
void
vm_phys_add_page(vm_paddr_t pa)
{
	vm_page_t m;

	cnt.v_page_count++;
	m = vm_phys_paddr_to_vm_page(pa);
	m->phys_addr = pa;
	m->queue = PQ_NONE;
	m->segind = vm_phys_paddr_to_segind(pa);
	m->flags = PG_FREE;
	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_add_page: page %p has unexpected order %d",
	    m, m->order));
	m->pool = VM_FREEPOOL_DEFAULT;
	pmap_page_init(m);
	mtx_lock(&vm_page_queue_free_mtx);
	cnt.v_free_count++;
	vm_phys_free_pages(m, 0);
	mtx_unlock(&vm_page_queue_free_mtx);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int pool, int order)
{
	vm_page_t m;
	int flind;

	for (flind = 0; flind < vm_nfreelists; flind++) {
		m = vm_phys_alloc_freelist_pages(flind, pool, order);
		if (m != NULL)
			return (m);
	}
	return (NULL);
}

/*
 * Find and dequeue a free page on the given free list, with the
 * specified pool and order
 */
vm_page_t
vm_phys_alloc_freelist_pages(int flind, int pool, int order)
{
	struct vm_freelist *fl;
	struct vm_freelist *alt;
	int domain, oind, pind;
	vm_page_t m;

	KASSERT(flind < VM_NFREELIST,
	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range", flind));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

#if VM_NDOMAIN > 1
	domain = PCPU_GET(domain);
#else
	domain = 0;
#endif
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
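	/*
	 * Indirect through this domain's lookup list so that, for the
	 * default freelist indices, queues holding memory local to the
	 * executing CPU's domain are searched before those of remote
	 * domains.
	 */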
	fl = (*vm_phys_lookup_lists[domain][flind])[pool];
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		m = TAILQ_FIRST(&fl[oind].pl);
		if (m != NULL) {
			TAILQ_REMOVE(&fl[oind].pl, m, pageq);
			fl[oind].lcnt--;
			m->order = VM_NFREEORDER;
			vm_phys_split_pages(m, oind, fl, order);
			return (m);
		}
	}

	/*
	 * The given pool was empty.  Find the largest
	 * contiguous, power-of-two-sized set of pages in any
	 * pool.  Transfer these pages to the given pool, and
	 * use them to satisfy the allocation.
	 */
	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			alt = (*vm_phys_lookup_lists[domain][flind])[pind];
			m = TAILQ_FIRST(&alt[oind].pl);
			if (m != NULL) {
				TAILQ_REMOVE(&alt[oind].pl, m, pageq);
				alt[oind].lcnt--;
				m->order = VM_NFREEORDER;
				vm_phys_set_pool(pool, m, oind);
				vm_phys_split_pages(m, oind, fl, order);
				return (m);
			}
		}
	}
	return (NULL);
}

/*
 * Allocate physical memory from phys_avail[].
 */
vm_paddr_t
vm_phys_bootstrap_alloc(vm_size_t size, unsigned long alignment)
{
	vm_paddr_t pa;
	int i;

	size = round_page(size);
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		if (phys_avail[i + 1] - phys_avail[i] < size)
			continue;
		pa = phys_avail[i];
		phys_avail[i] += size;
		return (pa);
	}
	panic("vm_phys_bootstrap_alloc");
}

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;
	int segind;

	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (pa >= seg->start && pa < seg->end)
			return (&seg->first_page[atop(pa - seg->start)]);
	}
	return (NULL);
}

/*
 * Find the segment containing the given physical address.
 */
static int
vm_phys_paddr_to_segind(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;
	int segind;

	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (pa >= seg->start && pa < seg->end)
			return (segind);
	}
	panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment",
	    (uintmax_t)pa);
}

/*
 * Free a contiguous, power of two-sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_pages(vm_page_t m, int order)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa, pa_buddy;
	vm_page_t m_buddy;

	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_free_pages: page %p has unexpected order %d",
	    m, m->order));
	KASSERT(m->pool < VM_NFREEPOOL,
	    ("vm_phys_free_pages: page %p has unexpected pool %d",
	    m, m->pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_free_pages: order %d is out of range", order));
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	pa = VM_PAGE_TO_PHYS(m);
	seg = &vm_phys_segs[m->segind];
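	/*
	 * Coalesce with free buddies of increasing order.  A block and
	 * its buddy differ only in bit (PAGE_SHIFT + order) of their
	 * physical addresses, so the XOR below yields the buddy's
	 * address; e.g., with 4KB pages the order 0 buddy of the page
	 * at 0x2000 is the page at 0x3000.
	 */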
	while (order < VM_NFREEORDER - 1) {
		pa_buddy = pa ^ (1 << (PAGE_SHIFT + order));
		if (pa_buddy < seg->start ||
		    pa_buddy >= seg->end)
			break;
		m_buddy = &seg->first_page[atop(pa_buddy - seg->start)];
		if (m_buddy->order != order)
			break;
		fl = (*seg->free_queues)[m_buddy->pool];
		TAILQ_REMOVE(&fl[m_buddy->order].pl, m_buddy, pageq);
		fl[m_buddy->order].lcnt--;
		m_buddy->order = VM_NFREEORDER;
		if (m_buddy->pool != m->pool)
			vm_phys_set_pool(m->pool, m_buddy, order);
		order++;
		pa &= ~((1 << (PAGE_SHIFT + order)) - 1);
		m = &seg->first_page[atop(pa - seg->start)];
	}
	m->order = order;
	fl = (*seg->free_queues)[m->pool];
	TAILQ_INSERT_TAIL(&fl[order].pl, m, pageq);
	fl[order].lcnt++;
}

/*
 * Set the pool for a contiguous, power of two-sized set of physical pages.
 */
void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
	vm_page_t m_tmp;

	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
		m_tmp->pool = pool;
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
 * FALSE, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
boolean_t
vm_phys_unfree_page(vm_page_t m)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa, pa_half;
	vm_page_t m_set, m_tmp;
	int order;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);

	/*
	 * First, find the contiguous, power of two-sized set of free
	 * physical pages containing the given physical page "m" and
	 * assign it to "m_set".
	 */
	seg = &vm_phys_segs[m->segind];
	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
	    order < VM_NFREEORDER - 1; ) {
		order++;
		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
		if (pa >= seg->start)
			m_set = &seg->first_page[atop(pa - seg->start)];
		else
			return (FALSE);
	}
	if (m_set->order < order)
		return (FALSE);
	if (m_set->order == VM_NFREEORDER)
		return (FALSE);
	KASSERT(m_set->order < VM_NFREEORDER,
	    ("vm_phys_unfree_page: page %p has unexpected order %d",
	    m_set, m_set->order));

	/*
	 * Next, remove "m_set" from the free lists.  Finally, extract
	 * "m" from "m_set" using an iterative algorithm: While "m_set"
	 * is larger than a page, shrink "m_set" by returning the half
	 * of "m_set" that does not contain "m" to the free lists.
	 */
	fl = (*seg->free_queues)[m_set->pool];
	order = m_set->order;
	TAILQ_REMOVE(&fl[order].pl, m_set, pageq);
	fl[order].lcnt--;
	m_set->order = VM_NFREEORDER;
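	/*
	 * At each step "pa_half" is the address of the upper half of
	 * the current block; whichever half does not contain "m" is
	 * returned to the free lists at the reduced order, and "m_set"
	 * becomes the remaining half.
	 */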
	while (order > 0) {
		order--;
		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
		if (m->phys_addr < pa_half)
			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
		else {
			m_tmp = m_set;
			m_set = &seg->first_page[atop(pa_half - seg->start)];
		}
		m_tmp->order = order;
		TAILQ_INSERT_HEAD(&fl[order].pl, m_tmp, pageq);
		fl[order].lcnt++;
	}
	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
	return (TRUE);
}

/*
 * Try to zero one physical page.  Used by an idle priority thread.
 */
boolean_t
vm_phys_zero_pages_idle(void)
{
	static struct vm_freelist *fl = vm_phys_free_queues[0][0];
	static int flind, oind, pind;
	vm_page_t m, m_tmp;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
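	/*
	 * The free list, pool, and order indices are static so that
	 * successive calls resume scanning where the previous call
	 * left off, spreading the zeroing work across all of the free
	 * queues over time.
	 */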
	for (;;) {
		TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, pageq) {
			for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
				if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
					vm_phys_unfree_page(m_tmp);
					cnt.v_free_count--;
					mtx_unlock(&vm_page_queue_free_mtx);
					pmap_zero_page_idle(m_tmp);
					m_tmp->flags |= PG_ZERO;
					mtx_lock(&vm_page_queue_free_mtx);
					cnt.v_free_count++;
					vm_phys_free_pages(m_tmp, 0);
					vm_page_zero_count++;
					cnt_prezero++;
					return (TRUE);
				}
			}
		}
		oind++;
		if (oind == VM_NFREEORDER) {
			oind = 0;
			pind++;
			if (pind == VM_NFREEPOOL) {
				pind = 0;
				flind++;
				if (flind == vm_nfreelists)
					flind = 0;
			}
			fl = vm_phys_free_queues[flind][pind];
		}
	}
}

/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 */
vm_page_t
vm_phys_alloc_contig(unsigned long npages, vm_paddr_t low, vm_paddr_t high,
    unsigned long alignment, unsigned long boundary)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	struct vnode *vp;
	vm_paddr_t pa, pa_last, size;
	vm_page_t deferred_vdrop_list, m, m_ret;
	int domain, flind, i, oind, order, pind;

#if VM_NDOMAIN > 1
	domain = PCPU_GET(domain);
#else
	domain = 0;
#endif
	size = npages << PAGE_SHIFT;
	KASSERT(size != 0,
	    ("vm_phys_alloc_contig: size must not be 0"));
	KASSERT((alignment & (alignment - 1)) == 0,
	    ("vm_phys_alloc_contig: alignment must be a power of 2"));
	KASSERT((boundary & (boundary - 1)) == 0,
	    ("vm_phys_alloc_contig: boundary must be a power of 2"));
	deferred_vdrop_list = NULL;
	/* Compute the queue that is the best fit for npages. */
	for (order = 0; (1 << order) < npages; order++);
	mtx_lock(&vm_page_queue_free_mtx);
#if VM_NRESERVLEVEL > 0
retry:
#endif
	for (flind = 0; flind < vm_nfreelists; flind++) {
		for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = (*vm_phys_lookup_lists[domain][flind])
				    [pind];
				TAILQ_FOREACH(m_ret, &fl[oind].pl, pageq) {
					/*
					 * A free list may contain physical pages
					 * from one or more segments.
					 */
					seg = &vm_phys_segs[m_ret->segind];
					if (seg->start > high ||
					    low >= seg->end)
						continue;

					/*
					 * Is the size of this allocation request
					 * larger than the largest block size?
					 */
					if (order >= VM_NFREEORDER) {
						/*
						 * Determine if a sufficient number
						 * of subsequent blocks to satisfy
						 * the allocation request are free.
						 */
						pa = VM_PAGE_TO_PHYS(m_ret);
						pa_last = pa + size;
						for (;;) {
							pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1);
							if (pa >= pa_last)
								break;
							if (pa < seg->start ||
							    pa >= seg->end)
								break;
							m = &seg->first_page[atop(pa - seg->start)];
							if (m->order != VM_NFREEORDER - 1)
								break;
						}
						/* If not, continue to the next block. */
						if (pa < pa_last)
							continue;
					}

					/*
					 * Determine if the blocks are within the given range,
					 * satisfy the given alignment, and do not cross the
					 * given boundary.
					 */
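					/*
					 * The run crosses a "boundary"
					 * multiple only if the first and
					 * last byte addresses differ in a
					 * bit at or above log2(boundary),
					 * so their XOR masked with
					 * ~(boundary - 1) must be zero.
					 */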
					pa = VM_PAGE_TO_PHYS(m_ret);
					if (pa >= low &&
					    pa + size <= high &&
					    (pa & (alignment - 1)) == 0 &&
					    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0)
						goto done;
				}
			}
		}
	}
#if VM_NRESERVLEVEL > 0
	if (vm_reserv_reclaim_contig(size, low, high, alignment, boundary))
		goto retry;
#endif
	mtx_unlock(&vm_page_queue_free_mtx);
	return (NULL);
done:
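	/*
	 * Remove the blocks that make up the run from their free
	 * queues; any pages beyond the requested "npages" are returned
	 * to the free lists below.
	 */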
	for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
		fl = (*seg->free_queues)[m->pool];
		TAILQ_REMOVE(&fl[m->order].pl, m, pageq);
		fl[m->order].lcnt--;
		m->order = VM_NFREEORDER;
	}
	if (m_ret->pool != VM_FREEPOOL_DEFAULT)
		vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
	fl = (*seg->free_queues)[m_ret->pool];
	vm_phys_split_pages(m_ret, oind, fl, order);
	for (i = 0; i < npages; i++) {
		m = &m_ret[i];
		vp = vm_page_alloc_init(m);
		if (vp != NULL) {
			/*
			 * Enqueue the vnode for deferred vdrop().
			 *
			 * Unmanaged pages don't use "pageq", so it
			 * can be safely abused to construct a short-
			 * lived queue of vnodes.
			 */
			m->pageq.tqe_prev = (void *)vp;
			m->pageq.tqe_next = deferred_vdrop_list;
			deferred_vdrop_list = m;
		}
	}
	for (; i < roundup2(npages, 1 << imin(oind, order)); i++) {
		m = &m_ret[i];
		KASSERT(m->order == VM_NFREEORDER,
		    ("vm_phys_alloc_contig: page %p has unexpected order %d",
		    m, m->order));
		vm_phys_free_pages(m, 0);
	}
	mtx_unlock(&vm_page_queue_free_mtx);
	while (deferred_vdrop_list != NULL) {
		vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
		deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next;
	}
	return (m_ret);
}

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND(freepages, db_show_freepages)
{
	struct vm_freelist *fl;
	int flind, oind, pind;

	for (flind = 0; flind < vm_nfreelists; flind++) {
		db_printf("FREE LIST %d:\n"
		    "\n  ORDER (SIZE)  |  NUMBER"
		    "\n              ", flind);
		for (pind = 0; pind < VM_NFREEPOOL; pind++)
			db_printf("  |  POOL %d", pind);
		db_printf("\n--            ");
		for (pind = 0; pind < VM_NFREEPOOL; pind++)
			db_printf("-- --      ");
		db_printf("--\n");
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
			db_printf("  %2.2d (%6.6dK)", oind,
			    1 << (PAGE_SHIFT - 10 + oind));
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[flind][pind];
				db_printf("  |  %6.6d", fl[oind].lcnt);
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif