vm_phys.c revision 246805
/*-
 * Copyright (c) 2002-2006 Rice University
 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Alan L. Cox,
 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 *	Physical memory system implementation
 *
 * Any external functions defined by this module are only to be used by the
 * virtual memory system.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/vm/vm_phys.c 246805 2013-02-14 19:38:04Z jhb $");

#include "opt_ddb.h"
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>

#include <ddb/ddb.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
/*
 * VM_FREELIST_DEFAULT is split into VM_NDOMAIN lists, one for each
 * domain.  These extra lists are stored at the end of the regular
 * free lists starting with VM_NFREELIST.
 */
#define VM_RAW_NFREELIST	(VM_NFREELIST + VM_NDOMAIN - 1)
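
/*
 * For illustration: in a hypothetical configuration with VM_NFREELIST == 2
 * and VM_NDOMAIN == 2, VM_RAW_NFREELIST is 3.  Queues 0 and 1 back the two
 * regular free lists, and queue 2 (index VM_NFREELIST) holds the
 * VM_FREELIST_DEFAULT memory that belongs to domain 1.
 */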

struct vm_freelist {
	struct pglist pl;
	int lcnt;
};

struct vm_phys_seg {
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
	int		domain;
	struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
};

struct mem_affinity *mem_affinity;

static struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];

static int vm_phys_nsegs;

#define VM_PHYS_FICTITIOUS_NSEGS	8
static struct vm_phys_fictitious_seg {
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
} vm_phys_fictitious_segs[VM_PHYS_FICTITIOUS_NSEGS];
static struct mtx vm_phys_fictitious_reg_mtx;
MALLOC_DEFINE(M_FICT_PAGES, "", "");

static struct vm_freelist
    vm_phys_free_queues[VM_RAW_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
static struct vm_freelist
(*vm_phys_lookup_lists[VM_NDOMAIN][VM_RAW_NFREELIST])[VM_NFREEPOOL][VM_NFREEORDER];

static int vm_nfreelists = VM_FREELIST_DEFAULT + 1;

static int cnt_prezero;
SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
    &cnt_prezero, 0, "The number of physical pages prezeroed at idle time");

static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info");

static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");

#if VM_NDOMAIN > 1
static int sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_lookup_lists, CTLTYPE_STRING | CTLFLAG_RD,
    NULL, 0, sysctl_vm_phys_lookup_lists, "A", "Phys Lookup Lists");
#endif

static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind,
    int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind);
static int vm_phys_paddr_to_segind(vm_paddr_t pa);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order);
/*
 * Outputs the state of the physical memory allocator, specifically,
 * the amount of physical memory in each free list.
 */
static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_freelist *fl;
	int error, flind, oind, pind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (flind = 0; flind < vm_nfreelists; flind++) {
		sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
		    "\n  ORDER (SIZE)  |  NUMBER"
		    "\n              ", flind);
		for (pind = 0; pind < VM_NFREEPOOL; pind++)
			sbuf_printf(&sbuf, "  |  POOL %d", pind);
		sbuf_printf(&sbuf, "\n--            ");
		for (pind = 0; pind < VM_NFREEPOOL; pind++)
			sbuf_printf(&sbuf, "-- --      ");
		sbuf_printf(&sbuf, "--\n");
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
			sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
			    1 << (PAGE_SHIFT - 10 + oind));
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[flind][pind];
				sbuf_printf(&sbuf, "  |  %6d", fl[oind].lcnt);
			}
			sbuf_printf(&sbuf, "\n");
		}
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Outputs the set of physical memory segments.
 */
static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	struct vm_phys_seg *seg;
	int error, segind;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
		seg = &vm_phys_segs[segind];
		sbuf_printf(&sbuf, "start:     %#jx\n",
		    (uintmax_t)seg->start);
		sbuf_printf(&sbuf, "end:       %#jx\n",
		    (uintmax_t)seg->end);
		sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
		sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

#if VM_NDOMAIN > 1
/*
 * Outputs the set of free list lookup lists.
 */
static int
sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sbuf;
	int domain, error, flind, ndomains;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	ndomains = vm_nfreelists - VM_NFREELIST + 1;
	for (domain = 0; domain < ndomains; domain++) {
		sbuf_printf(&sbuf, "\nDOMAIN %d:\n\n", domain);
		for (flind = 0; flind < vm_nfreelists; flind++)
			sbuf_printf(&sbuf, "  [%d]:\t%p\n", flind,
			    vm_phys_lookup_lists[domain][flind]);
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}
#endif

/*
 * Create a physical memory segment.
 */
static void
_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind, int domain)
{
	struct vm_phys_seg *seg;
#ifdef VM_PHYSSEG_SPARSE
	long pages;
	int segind;

	pages = 0;
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		pages += atop(seg->end - seg->start);
	}
#endif
	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
	seg = &vm_phys_segs[vm_phys_nsegs++];
	seg->start = start;
	seg->end = end;
	seg->domain = domain;
#ifdef VM_PHYSSEG_SPARSE
	seg->first_page = &vm_page_array[pages];
#else
	seg->first_page = PHYS_TO_VM_PAGE(start);
#endif
#if VM_NDOMAIN > 1
	if (flind == VM_FREELIST_DEFAULT && domain != 0) {
		flind = VM_NFREELIST + (domain - 1);
		if (flind >= vm_nfreelists)
			vm_nfreelists = flind + 1;
	}
#endif
	seg->free_queues = &vm_phys_free_queues[flind];
}

static void
vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
{
	int i;

	if (mem_affinity == NULL) {
		_vm_phys_create_seg(start, end, flind, 0);
		return;
	}

	for (i = 0;; i++) {
		if (mem_affinity[i].end == 0)
			panic("Reached end of affinity info");
		if (mem_affinity[i].end <= start)
			continue;
		if (mem_affinity[i].start > start)
			panic("No affinity info for start %jx",
			    (uintmax_t)start);
		if (mem_affinity[i].end >= end) {
			_vm_phys_create_seg(start, end, flind,
			    mem_affinity[i].domain);
			break;
		}
		_vm_phys_create_seg(start, mem_affinity[i].end, flind,
		    mem_affinity[i].domain);
		start = mem_affinity[i].end;
	}
}

/*
 * Initialize the physical memory allocator.
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	int flind, i, oind, pind;
#if VM_NDOMAIN > 1
	int ndomains, j;
#endif

	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
#ifdef	VM_FREELIST_ISADMA
		if (phys_avail[i] < 16777216) {
			if (phys_avail[i + 1] > 16777216) {
				vm_phys_create_seg(phys_avail[i], 16777216,
				    VM_FREELIST_ISADMA);
				vm_phys_create_seg(16777216, phys_avail[i + 1],
				    VM_FREELIST_DEFAULT);
			} else {
				vm_phys_create_seg(phys_avail[i],
				    phys_avail[i + 1], VM_FREELIST_ISADMA);
			}
			if (VM_FREELIST_ISADMA >= vm_nfreelists)
				vm_nfreelists = VM_FREELIST_ISADMA + 1;
		} else
#endif
#ifdef	VM_FREELIST_HIGHMEM
		if (phys_avail[i + 1] > VM_HIGHMEM_ADDRESS) {
			if (phys_avail[i] < VM_HIGHMEM_ADDRESS) {
				vm_phys_create_seg(phys_avail[i],
				    VM_HIGHMEM_ADDRESS, VM_FREELIST_DEFAULT);
				vm_phys_create_seg(VM_HIGHMEM_ADDRESS,
				    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
			} else {
				vm_phys_create_seg(phys_avail[i],
				    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
			}
			if (VM_FREELIST_HIGHMEM >= vm_nfreelists)
				vm_nfreelists = VM_FREELIST_HIGHMEM + 1;
		} else
#endif
		vm_phys_create_seg(phys_avail[i], phys_avail[i + 1],
		    VM_FREELIST_DEFAULT);
	}
	for (flind = 0; flind < vm_nfreelists; flind++) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			fl = vm_phys_free_queues[flind][pind];
			for (oind = 0; oind < VM_NFREEORDER; oind++)
				TAILQ_INIT(&fl[oind].pl);
		}
	}
#if VM_NDOMAIN > 1
	/*
	 * Build a free list lookup list for each domain.  All of the
	 * memory domain lists are inserted at the VM_FREELIST_DEFAULT
	 * index in a round-robin order starting with the current
	 * domain.
	 */
	ndomains = vm_nfreelists - VM_NFREELIST + 1;
	for (flind = 0; flind < VM_FREELIST_DEFAULT; flind++)
		for (i = 0; i < ndomains; i++)
			vm_phys_lookup_lists[i][flind] =
			    &vm_phys_free_queues[flind];
	for (i = 0; i < ndomains; i++)
		for (j = 0; j < ndomains; j++) {
			flind = (i + j) % ndomains;
			if (flind == 0)
				flind = VM_FREELIST_DEFAULT;
			else
				flind += VM_NFREELIST - 1;
			vm_phys_lookup_lists[i][VM_FREELIST_DEFAULT + j] =
			    &vm_phys_free_queues[flind];
		}
	for (flind = VM_FREELIST_DEFAULT + 1; flind < VM_NFREELIST;
	     flind++)
		for (i = 0; i < ndomains; i++)
			vm_phys_lookup_lists[i][flind + ndomains - 1] =
			    &vm_phys_free_queues[flind];
#else
	for (flind = 0; flind < vm_nfreelists; flind++)
		vm_phys_lookup_lists[0][flind] = &vm_phys_free_queues[flind];
#endif

	mtx_init(&vm_phys_fictitious_reg_mtx, "vmfctr", NULL, MTX_DEF);
}
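
/*
 * For illustration, assuming VM_FREELIST_DEFAULT == 0, VM_NFREELIST == 2,
 * and VM_NDOMAIN == 2, the loops above produce the following lookup lists
 * (queues[i] abbreviates &vm_phys_free_queues[i]):
 *
 *	vm_phys_lookup_lists[0] = { queues[0], queues[2], queues[1] }
 *	vm_phys_lookup_lists[1] = { queues[2], queues[0], queues[1] }
 *
 * Queue 0 holds domain 0's default memory and queue 2 holds domain 1's,
 * so each domain searches its own default memory first, then the other
 * domain's, and finally the remaining regular free list.
 */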

/*
 * Split a contiguous, power of two-sized set of physical pages.
 */
static __inline void
vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order)
{
	vm_page_t m_buddy;

	while (oind > order) {
		oind--;
		m_buddy = &m[1 << oind];
		KASSERT(m_buddy->order == VM_NFREEORDER,
		    ("vm_phys_split_pages: page %p has unexpected order %d",
		    m_buddy, m_buddy->order));
		m_buddy->order = oind;
		TAILQ_INSERT_HEAD(&fl[oind].pl, m_buddy, pageq);
		fl[oind].lcnt++;
	}
}
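
/*
 * For illustration: satisfying an order 0 request from an order 3 block
 * "m" calls vm_phys_split_pages(m, 3, fl, 0), which frees the upper
 * halves in decreasing order and leaves m itself as the result:
 *
 *	&m[4] is queued as a free order 2 block,
 *	&m[2] is queued as a free order 1 block, and
 *	&m[1] is queued as a free order 0 block.
 */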

/*
 * Initialize a physical page and add it to the free lists.
 */
void
vm_phys_add_page(vm_paddr_t pa)
{
	vm_page_t m;

	cnt.v_page_count++;
	m = vm_phys_paddr_to_vm_page(pa);
	m->phys_addr = pa;
	m->queue = PQ_NONE;
	m->segind = vm_phys_paddr_to_segind(pa);
	m->flags = PG_FREE;
	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_add_page: page %p has unexpected order %d",
	    m, m->order));
	m->pool = VM_FREEPOOL_DEFAULT;
	pmap_page_init(m);
	mtx_lock(&vm_page_queue_free_mtx);
	cnt.v_free_count++;
	vm_phys_free_pages(m, 0);
	mtx_unlock(&vm_page_queue_free_mtx);
}

/*
 * Allocate a contiguous, power of two-sized set of physical pages
 * from the free lists.
 *
 * The free page queues must be locked.
 */
vm_page_t
vm_phys_alloc_pages(int pool, int order)
{
	vm_page_t m;
	int flind;

	for (flind = 0; flind < vm_nfreelists; flind++) {
		m = vm_phys_alloc_freelist_pages(flind, pool, order);
		if (m != NULL)
			return (m);
	}
	return (NULL);
}

/*
 * Find and dequeue a free page on the given free list, with the
 * specified pool and order.
 */
vm_page_t
vm_phys_alloc_freelist_pages(int flind, int pool, int order)
{
	struct vm_freelist *fl;
	struct vm_freelist *alt;
	int domain, oind, pind;
	vm_page_t m;

	KASSERT(flind < VM_NFREELIST,
	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range", flind));
	KASSERT(pool < VM_NFREEPOOL,
	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));

#if VM_NDOMAIN > 1
	domain = PCPU_GET(domain);
#else
	domain = 0;
#endif
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	fl = (*vm_phys_lookup_lists[domain][flind])[pool];
	for (oind = order; oind < VM_NFREEORDER; oind++) {
		m = TAILQ_FIRST(&fl[oind].pl);
		if (m != NULL) {
			TAILQ_REMOVE(&fl[oind].pl, m, pageq);
			fl[oind].lcnt--;
			m->order = VM_NFREEORDER;
			vm_phys_split_pages(m, oind, fl, order);
			return (m);
		}
	}

	/*
	 * The given pool was empty.  Find the largest
	 * contiguous, power-of-two-sized set of pages in any
	 * pool.  Transfer these pages to the given pool, and
	 * use them to satisfy the allocation.
	 */
	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
			alt = (*vm_phys_lookup_lists[domain][flind])[pind];
			m = TAILQ_FIRST(&alt[oind].pl);
			if (m != NULL) {
				TAILQ_REMOVE(&alt[oind].pl, m, pageq);
				alt[oind].lcnt--;
				m->order = VM_NFREEORDER;
				vm_phys_set_pool(pool, m, oind);
				vm_phys_split_pages(m, oind, fl, order);
				return (m);
			}
		}
	}
	return (NULL);
}
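
/*
 * For illustration: an order 0 request for VM_FREEPOOL_DEFAULT that finds
 * that pool empty may instead take, say, an order 9 block from another
 * pool's queues on the same free list.  The fallback prefers the largest
 * available block (the outer loop runs from VM_NFREEORDER - 1 downward),
 * relabels it with vm_phys_set_pool(), and splits it down to the
 * requested order as usual.
 */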

/*
 * Find the vm_page corresponding to the given physical address.
 */
vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;
	int segind;

	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (pa >= seg->start && pa < seg->end)
			return (&seg->first_page[atop(pa - seg->start)]);
	}
	return (NULL);
}

vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
{
	struct vm_phys_fictitious_seg *seg;
	vm_page_t m;
	int segind;

	m = NULL;
	for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
		seg = &vm_phys_fictitious_segs[segind];
		if (pa >= seg->start && pa < seg->end) {
			m = &seg->first_page[atop(pa - seg->start)];
			KASSERT((m->flags & PG_FICTITIOUS) != 0,
			    ("%p not fictitious", m));
			break;
		}
	}
	return (m);
}

int
vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
    vm_memattr_t memattr)
{
	struct vm_phys_fictitious_seg *seg;
	vm_page_t fp;
	long i, page_count;
	int segind;
#ifdef VM_PHYSSEG_DENSE
	long pi;
	boolean_t malloced;
#endif

	page_count = (end - start) / PAGE_SIZE;

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
	if (pi >= first_page && atop(end) < vm_page_array_size) {
		fp = &vm_page_array[pi - first_page];
		malloced = FALSE;
	} else
#endif
	{
		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
		    M_WAITOK | M_ZERO);
#ifdef VM_PHYSSEG_DENSE
		malloced = TRUE;
#endif
	}
	for (i = 0; i < page_count; i++) {
		vm_page_initfake(&fp[i], start + PAGE_SIZE * i, memattr);
		pmap_page_init(&fp[i]);
		fp[i].oflags &= ~(VPO_BUSY | VPO_UNMANAGED);
	}
	mtx_lock(&vm_phys_fictitious_reg_mtx);
	for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
		seg = &vm_phys_fictitious_segs[segind];
		if (seg->start == 0 && seg->end == 0) {
			seg->start = start;
			seg->end = end;
			seg->first_page = fp;
			mtx_unlock(&vm_phys_fictitious_reg_mtx);
			return (0);
		}
	}
	mtx_unlock(&vm_phys_fictitious_reg_mtx);
#ifdef VM_PHYSSEG_DENSE
	if (malloced)
#endif
		free(fp, M_FICT_PAGES);
	return (EBUSY);
}

void
vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
{
	struct vm_phys_fictitious_seg *seg;
	vm_page_t fp;
	int segind;
#ifdef VM_PHYSSEG_DENSE
	long pi;
#endif

#ifdef VM_PHYSSEG_DENSE
	pi = atop(start);
#endif

	mtx_lock(&vm_phys_fictitious_reg_mtx);
	for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
		seg = &vm_phys_fictitious_segs[segind];
		if (seg->start == start && seg->end == end) {
			seg->start = seg->end = 0;
			fp = seg->first_page;
			seg->first_page = NULL;
			mtx_unlock(&vm_phys_fictitious_reg_mtx);
#ifdef VM_PHYSSEG_DENSE
			if (pi < first_page || atop(end) >= vm_page_array_size)
#endif
				free(fp, M_FICT_PAGES);
			return;
		}
	}
	mtx_unlock(&vm_phys_fictitious_reg_mtx);
	KASSERT(0, ("Unregistering not registered fictitious range"));
}

/*
 * Find the segment containing the given physical address.
 */
static int
vm_phys_paddr_to_segind(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;
	int segind;

	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
		if (pa >= seg->start && pa < seg->end)
			return (segind);
	}
	panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment",
	    (uintmax_t)pa);
}

/*
 * Free a contiguous, power of two-sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_pages(vm_page_t m, int order)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa;
	vm_page_t m_buddy;

	KASSERT(m->order == VM_NFREEORDER,
	    ("vm_phys_free_pages: page %p has unexpected order %d",
	    m, m->order));
	KASSERT(m->pool < VM_NFREEPOOL,
	    ("vm_phys_free_pages: page %p has unexpected pool %d",
	    m, m->pool));
	KASSERT(order < VM_NFREEORDER,
	    ("vm_phys_free_pages: order %d is out of range", order));
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	seg = &vm_phys_segs[m->segind];
	if (order < VM_NFREEORDER - 1) {
		pa = VM_PAGE_TO_PHYS(m);
		do {
			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
			if (pa < seg->start || pa >= seg->end)
				break;
			m_buddy = &seg->first_page[atop(pa - seg->start)];
			if (m_buddy->order != order)
				break;
			fl = (*seg->free_queues)[m_buddy->pool];
			TAILQ_REMOVE(&fl[order].pl, m_buddy, pageq);
			fl[order].lcnt--;
			m_buddy->order = VM_NFREEORDER;
			if (m_buddy->pool != m->pool)
				vm_phys_set_pool(m->pool, m_buddy, order);
			order++;
			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
			m = &seg->first_page[atop(pa - seg->start)];
		} while (order < VM_NFREEORDER - 1);
	}
	m->order = order;
	fl = (*seg->free_queues)[m->pool];
	TAILQ_INSERT_TAIL(&fl[order].pl, m, pageq);
	fl[order].lcnt++;
}
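
/*
 * Worked example of the coalescing loop above, assuming PAGE_SHIFT == 12
 * (4 KB pages): freeing the order 0 page at physical address 0x5000
 * probes its buddy at 0x5000 ^ 0x1000 == 0x4000.  If that page is a free
 * order 0 block, the two merge into an order 1 block at 0x4000 (the
 * address masked down to a 0x2000 boundary), and the loop continues with
 * the order 1 buddy at 0x4000 ^ 0x2000 == 0x6000, and so on.
 */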

/*
 * Free a contiguous, arbitrarily sized set of physical pages.
 *
 * The free page queues must be locked.
 */
void
vm_phys_free_contig(vm_page_t m, u_long npages)
{
	u_int n;
	int order;

	/*
	 * Avoid unnecessary coalescing by freeing the pages in the largest
	 * possible power-of-two-sized subsets.
	 */
	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	for (;; npages -= n) {
		/*
		 * Unsigned "min" is used here so that "order" is assigned
		 * "VM_NFREEORDER - 1" when "m"'s physical address is zero
		 * or the low-order bits of its physical address are zero
		 * because the size of a physical address exceeds the size of
		 * a long.
		 */
		order = min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
		    VM_NFREEORDER - 1);
		n = 1 << order;
		if (npages < n)
			break;
		vm_phys_free_pages(m, order);
		m += n;
	}
	/* The residual "npages" is less than "1 << (VM_NFREEORDER - 1)". */
	for (; npages > 0; npages -= n) {
		order = flsl(npages) - 1;
		n = 1 << order;
		vm_phys_free_pages(m, order);
		m += n;
	}
}
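
/*
 * Worked example, assuming 4 KB pages: freeing npages == 11 starting at
 * physical address 0x3000 releases an order 0 block at 0x3000 and an
 * order 2 block at 0x4000 from the first loop, after which npages == 6
 * is smaller than the next aligned block size.  The residual loop then
 * frees an order 2 block at 0x8000 and an order 1 block at 0xc000.
 */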

/*
 * Set the pool for a contiguous, power of two-sized set of physical pages.
 */
void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
	vm_page_t m_tmp;

	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
		m_tmp->pool = pool;
}

/*
 * Search for the given physical page "m" in the free lists.  If the search
 * succeeds, remove "m" from the free lists and return TRUE.  Otherwise, return
 * FALSE, indicating that "m" is not in the free lists.
 *
 * The free page queues must be locked.
 */
boolean_t
vm_phys_unfree_page(vm_page_t m)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa, pa_half;
	vm_page_t m_set, m_tmp;
	int order;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);

	/*
	 * First, find the contiguous, power of two-sized set of free
	 * physical pages containing the given physical page "m" and
	 * assign it to "m_set".
	 */
	seg = &vm_phys_segs[m->segind];
	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
	    order < VM_NFREEORDER - 1; ) {
		order++;
		pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
		if (pa >= seg->start)
			m_set = &seg->first_page[atop(pa - seg->start)];
		else
			return (FALSE);
	}
	if (m_set->order < order)
		return (FALSE);
	if (m_set->order == VM_NFREEORDER)
		return (FALSE);
	KASSERT(m_set->order < VM_NFREEORDER,
	    ("vm_phys_unfree_page: page %p has unexpected order %d",
	    m_set, m_set->order));

	/*
	 * Next, remove "m_set" from the free lists.  Finally, extract
	 * "m" from "m_set" using an iterative algorithm: While "m_set"
	 * is larger than a page, shrink "m_set" by returning the half
	 * of "m_set" that does not contain "m" to the free lists.
	 */
	fl = (*seg->free_queues)[m_set->pool];
	order = m_set->order;
	TAILQ_REMOVE(&fl[order].pl, m_set, pageq);
	fl[order].lcnt--;
	m_set->order = VM_NFREEORDER;
	while (order > 0) {
		order--;
		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
		if (m->phys_addr < pa_half)
			m_tmp = &seg->first_page[atop(pa_half - seg->start)];
		else {
			m_tmp = m_set;
			m_set = &seg->first_page[atop(pa_half - seg->start)];
		}
		m_tmp->order = order;
		TAILQ_INSERT_HEAD(&fl[order].pl, m_tmp, pageq);
		fl[order].lcnt++;
	}
	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
	return (TRUE);
}
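
/*
 * Worked example of the extraction above, assuming 4 KB pages: unfreeing
 * page 0x6000 from a free order 2 block at 0x4000 removes the whole
 * block and then returns the half not containing the page at each step:
 *
 *	order 1: pa_half == 0x6000; the half at 0x4000 is freed and
 *	    "m_set" becomes the page at 0x6000.
 *	order 0: pa_half == 0x7000; the page at 0x7000 is freed and
 *	    "m_set", still at 0x6000, now equals "m".
 */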

/*
 * Try to zero one physical page.  Used by an idle priority thread.
 */
boolean_t
vm_phys_zero_pages_idle(void)
{
	static struct vm_freelist *fl = vm_phys_free_queues[0][0];
	static int flind, oind, pind;
	vm_page_t m, m_tmp;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	for (;;) {
		TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, pageq) {
			for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
				if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
					vm_phys_unfree_page(m_tmp);
					cnt.v_free_count--;
					mtx_unlock(&vm_page_queue_free_mtx);
					pmap_zero_page_idle(m_tmp);
					m_tmp->flags |= PG_ZERO;
					mtx_lock(&vm_page_queue_free_mtx);
					cnt.v_free_count++;
					vm_phys_free_pages(m_tmp, 0);
					vm_page_zero_count++;
					cnt_prezero++;
					return (TRUE);
				}
			}
		}
		oind++;
		if (oind == VM_NFREEORDER) {
			oind = 0;
			pind++;
			if (pind == VM_NFREEPOOL) {
				pind = 0;
				flind++;
				if (flind == vm_nfreelists)
					flind = 0;
			}
			fl = vm_phys_free_queues[flind][pind];
		}
	}
}

/*
 * Allocate a contiguous set of physical pages of the given size
 * "npages" from the free lists.  All of the physical pages must be at
 * or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross
 * any physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 */
vm_page_t
vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *seg;
	vm_paddr_t pa, pa_last, size;
	vm_page_t m, m_ret;
	u_long npages_end;
	int domain, flind, oind, order, pind;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
#if VM_NDOMAIN > 1
	domain = PCPU_GET(domain);
#else
	domain = 0;
#endif
	size = npages << PAGE_SHIFT;
	KASSERT(size != 0,
	    ("vm_phys_alloc_contig: size must not be 0"));
	KASSERT((alignment & (alignment - 1)) == 0,
	    ("vm_phys_alloc_contig: alignment must be a power of 2"));
	KASSERT((boundary & (boundary - 1)) == 0,
	    ("vm_phys_alloc_contig: boundary must be a power of 2"));
	/* Compute the queue that is the best fit for npages. */
	for (order = 0; (1 << order) < npages; order++);
	for (flind = 0; flind < vm_nfreelists; flind++) {
		for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = (*vm_phys_lookup_lists[domain][flind])
				    [pind];
				TAILQ_FOREACH(m_ret, &fl[oind].pl, pageq) {
					/*
					 * A free list may contain physical pages
					 * from one or more segments.
					 */
					seg = &vm_phys_segs[m_ret->segind];
					if (seg->start > high ||
					    low >= seg->end)
						continue;

					/*
					 * Is the size of this allocation request
					 * larger than the largest block size?
					 */
					if (order >= VM_NFREEORDER) {
						/*
						 * Determine if a sufficient number
						 * of subsequent blocks to satisfy
						 * the allocation request are free.
						 */
						pa = VM_PAGE_TO_PHYS(m_ret);
						pa_last = pa + size;
						for (;;) {
							pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1);
							if (pa >= pa_last)
								break;
							if (pa < seg->start ||
							    pa >= seg->end)
								break;
							m = &seg->first_page[atop(pa - seg->start)];
							if (m->order != VM_NFREEORDER - 1)
								break;
						}
						/* If not, continue to the next block. */
						if (pa < pa_last)
							continue;
					}

					/*
					 * Determine if the blocks are within the given range,
					 * satisfy the given alignment, and do not cross the
					 * given boundary.
					 */
					pa = VM_PAGE_TO_PHYS(m_ret);
					if (pa >= low &&
					    pa + size <= high &&
					    (pa & (alignment - 1)) == 0 &&
					    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0)
						goto done;
				}
			}
		}
	}
	return (NULL);
done:
	for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
		fl = (*seg->free_queues)[m->pool];
		TAILQ_REMOVE(&fl[m->order].pl, m, pageq);
		fl[m->order].lcnt--;
		m->order = VM_NFREEORDER;
	}
	if (m_ret->pool != VM_FREEPOOL_DEFAULT)
		vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
	fl = (*seg->free_queues)[m_ret->pool];
	vm_phys_split_pages(m_ret, oind, fl, order);
	/* Return excess pages to the free lists. */
	npages_end = roundup2(npages, 1 << imin(oind, order));
	if (npages < npages_end)
		vm_phys_free_contig(&m_ret[npages], npages_end - npages);
	return (m_ret);
}
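
/*
 * Worked example of the boundary test above, assuming 4 KB pages: for
 * npages == 4 (size 0x4000) and boundary == 0x10000, a candidate block at
 * 0xe000 is rejected because its pages span 0xe000-0x11fff and therefore
 * cross a 0x10000 boundary: (0xe000 ^ 0x11fff) & ~0xffff == 0x10000.  A
 * block at 0x10000 passes, since 0x10000 and 0x13fff agree in every bit
 * above the boundary mask.
 */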

#ifdef DDB
/*
 * Show the number of physical pages in each of the free lists.
 */
DB_SHOW_COMMAND(freepages, db_show_freepages)
{
	struct vm_freelist *fl;
	int flind, oind, pind;

	for (flind = 0; flind < vm_nfreelists; flind++) {
		db_printf("FREE LIST %d:\n"
		    "\n  ORDER (SIZE)  |  NUMBER"
		    "\n              ", flind);
		for (pind = 0; pind < VM_NFREEPOOL; pind++)
			db_printf("  |  POOL %d", pind);
		db_printf("\n--            ");
		for (pind = 0; pind < VM_NFREEPOOL; pind++)
			db_printf("-- --      ");
		db_printf("--\n");
		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
			db_printf("  %2.2d (%6.6dK)", oind,
			    1 << (PAGE_SHIFT - 10 + oind));
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[flind][pind];
				db_printf("  |  %6.6d", fl[oind].lcnt);
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif