1/*	$OpenBSD: uvm_pmemrange.c,v 1.66 2024/05/01 12:54:27 mpi Exp $	*/
2
3/*
4 * Copyright (c) 2024 Martin Pieuchot <mpi@openbsd.org>
5 * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl>
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
19
20#include <sys/param.h>
21#include <sys/systm.h>
22#include <uvm/uvm.h>
23#include <sys/malloc.h>
24#include <sys/kernel.h>
25#include <sys/proc.h>
26#include <sys/mount.h>
27
28/*
29 * 2 trees: addr tree and size tree.
30 *
31 * The allocator keeps chunks of free pages (called a range).
32 * Two pages are part of the same range if:
33 * - all pages in between are part of that range,
34 * - they are of the same memory type (zeroed or non-zeroed),
35 * - they are part of the same pmemrange.
36 * A pmemrange is a range of memory which is part of the same vm_physseg
37 * and has a use-count.
38 *
39 * addr tree is vm_page[0].objt
40 * size tree is vm_page[1].objt
41 *
42 * The size tree is not used for memory ranges of 1 page; such ranges are
43 * instead kept on a queue, linked through vm_page[0].pageq
44 *
45 * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges
46 * are joined, unless:
47 * - they have pages in between them which are not free
48 * - they belong to different memtypes (zeroed vs dirty memory)
49 * - they are in different pmemrange areas (ISA vs non-ISA memory for instance)
50 * - they are not a continuation of the same array
51 * The latter issue is caused by vm_physseg ordering and splitting from the
52 * MD initialization machinery. The MD code is dependent on freelists and
53 * happens to split ISA memory from non-ISA memory.
54 * (Note: freelists die die die!)
55 *
56 * uvm_page_init guarantees that every vm_physseg contains an array of
57 * struct vm_page. Also, uvm_page_physload allocates an array of struct
58 * vm_page. This code depends on that array. The array may break across
59 * vm_physseg boundaries.
60 */
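
/*
 * Illustrative sketch (not a new invariant, just the rules above applied to
 * an example): a free range of 3 pages, backed by consecutive array entries
 * pg[0..2], is represented as:
 *
 *	pg[0].fpgsz = 3		length is stored in the first page only
 *	pg[0].objt		node in the addr tree (pmr->addr)
 *	pg[1].objt		node in the size tree (pmr->size[memtype])
 *	pg[2]			carries no tree linkage of its own
 *
 * A 1-page range is kept on pmr->single[memtype] via pg[0].pageq instead of
 * the size tree.
 */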
61
62/*
63 * Validate the flags of the page. (Used in asserts.)
64 * Any free page must have the PQ_FREE flag set.
65 * Free pages may be zeroed.
66 * Pmap flags are left untouched.
67 *
68 * The PQ_FREE flag is not required here: by not requiring it, this check
69 * can also be used on pages that are in the process of being freed.
70 */
71#define VALID_FLAGS(pg_flags)						\
72	(((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0)
73
74/* Tree comparators. */
75int	uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *,
76	    const struct uvm_pmemrange *);
77int	uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *);
78int	uvm_pmr_pg_to_memtype(struct vm_page *);
79
80#ifdef DDB
81void	uvm_pmr_print(void);
82#endif
83
84/*
85 * Memory types. The page flags are used to derive what the current memory
86 * type of a page is.
87 */
88int
89uvm_pmr_pg_to_memtype(struct vm_page *pg)
90{
91	if (pg->pg_flags & PG_ZERO)
92		return UVM_PMR_MEMTYPE_ZERO;
93	/* Default: dirty memory. */
94	return UVM_PMR_MEMTYPE_DIRTY;
95}
96
97/* Trees. */
98RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp);
99RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp);
100RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr,
101    uvm_pmemrange_addr_cmp);
102
103/* Validation. */
104#ifdef DEBUG
105void	uvm_pmr_assertvalid(struct uvm_pmemrange *pmr);
106#else
107#define uvm_pmr_assertvalid(pmr)	do {} while (0)
108#endif
109
110psize_t			 uvm_pmr_get1page(psize_t, int, struct pglist *,
111			    paddr_t, paddr_t, int);
112
113struct uvm_pmemrange	*uvm_pmr_allocpmr(void);
114struct vm_page		*uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int);
115struct vm_page		*uvm_pmr_nextsz(struct uvm_pmemrange *,
116			    struct vm_page *, int);
117void			 uvm_pmr_pnaddr(struct uvm_pmemrange *pmr,
118			    struct vm_page *pg, struct vm_page **pg_prev,
119			    struct vm_page **pg_next);
120struct vm_page		*uvm_pmr_findnextsegment(struct uvm_pmemrange *,
121			    struct vm_page *, paddr_t);
122struct vm_page		*uvm_pmr_findprevsegment(struct uvm_pmemrange *,
123			    struct vm_page *, paddr_t);
124psize_t			 uvm_pmr_remove_1strange(struct pglist *, paddr_t,
125			    struct vm_page **, int);
126psize_t			 uvm_pmr_remove_1strange_reverse(struct pglist *,
127    			    paddr_t *);
128void			 uvm_pmr_split(paddr_t);
129struct uvm_pmemrange	*uvm_pmemrange_find(paddr_t);
130struct uvm_pmemrange	*uvm_pmemrange_use_insert(struct uvm_pmemrange_use *,
131			    struct uvm_pmemrange *);
132psize_t			 pow2divide(psize_t, psize_t);
133struct vm_page		*uvm_pmr_rootupdate(struct uvm_pmemrange *,
134			    struct vm_page *, paddr_t, paddr_t, int);
135
136/*
137 * Computes num/denom and rounds it up to the next power-of-2.
138 *
139 * This is a division function which calculates an approximation of
140 * num/denom, with result =~ num/denom. It is meant to be fast and doesn't
141 * have to be accurate.
142 *
143 * A result that is too large makes the allocator slightly faster, at the
144 * risk of hitting the failure case more often. A result that is too small
145 * makes the allocator a bit slower, but less likely to hit a failure case.
146 */
147psize_t
148pow2divide(psize_t num, psize_t denom)
149{
150	int rshift;
151
152	for (rshift = 0; num > denom; rshift++, denom <<= 1)
153		;
154	return (paddr_t)1 << rshift;
155}
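
/*
 * Worked example (illustrative): pow2divide(100, 7) doubles denom
 * 7 -> 14 -> 28 -> 56 -> 112 while incrementing rshift to 4, and returns
 * 1 << 4 = 16, a power-of-2 overestimate of 100/7 (~14.3).
 */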
156
157/*
158 * Predicate: lhs is a subrange of rhs.
159 *
160 * If rhs_low == 0: don't care about lower bound.
161 * If rhs_high == 0: don't care about upper bound.
162 */
163#define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high)	\
164	(((rhs_low) == 0 || (lhs_low) >= (rhs_low)) &&			\
165	((rhs_high) == 0 || (lhs_high) <= (rhs_high)))
166
167/*
168 * Predicate: lhs intersects with rhs.
169 *
170 * If rhs_low == 0: don't care about lower bound.
171 * If rhs_high == 0: don't care about upper bound.
172 * Ranges don't intersect if they have no page in common; since the upper
173 * bounds are exclusive (array semantics), < instead of <= is used here.
174 */
175#define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high)	\
176	(((rhs_low) == 0 || (rhs_low) < (lhs_high)) &&			\
177	((rhs_high) == 0 || (lhs_low) < (rhs_high)))
178
179/*
180 * Align to power-of-2 alignment.
181 */
182#define PMR_ALIGN(pgno, align)						\
183	(((pgno) + ((align) - 1)) & ~((align) - 1))
184#define PMR_ALIGN_DOWN(pgno, align)					\
185	((pgno) & ~((align) - 1))
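
/*
 * Illustrative examples of the helpers above (page numbers made up):
 *
 *	PMR_IS_SUBRANGE_OF(0x100, 0x200, 0x100, 0x400)	-> true
 *	PMR_IS_SUBRANGE_OF(0x100, 0x200, 0x180, 0)	-> false (low bound)
 *	PMR_INTERSECTS_WITH(0x100, 0x200, 0x1ff, 0x400)	-> true (share 0x1ff)
 *	PMR_INTERSECTS_WITH(0x100, 0x200, 0x200, 0x400)	-> false (exclusive)
 *	PMR_ALIGN(13, 4) == 16, PMR_ALIGN_DOWN(13, 4) == 12
 */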
186
187
188/*
189 * Comparator: sort by address ascending.
190 */
191int
192uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs,
193    const struct uvm_pmemrange *rhs)
194{
195	return lhs->low < rhs->low ? -1 : lhs->low > rhs->low;
196}
197
198/*
199 * Comparator: sort by use ascending.
200 *
201 * The higher the use value of a range, the more devices need memory in
202 * this range. Therefore allocate from the range with the lowest use first.
203 */
204int
205uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs)
206{
207	int result;
208
209	result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use;
210	if (result == 0)
211		result = uvm_pmemrange_addr_cmp(lhs, rhs);
212	return result;
213}
214
215int
216uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
217{
218	paddr_t lhs_addr, rhs_addr;
219
220	lhs_addr = VM_PAGE_TO_PHYS(lhs);
221	rhs_addr = VM_PAGE_TO_PHYS(rhs);
222
223	return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr);
224}
225
226int
227uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs)
228{
229	psize_t lhs_size, rhs_size;
230	int cmp;
231
232	/* Using second tree, so we receive pg[1] instead of pg[0]. */
233	lhs_size = (lhs - 1)->fpgsz;
234	rhs_size = (rhs - 1)->fpgsz;
235
236	cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size);
237	if (cmp == 0)
238		cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1);
239	return cmp;
240}
241
242/*
243 * Find the first range of free pages that is at least sz pages long.
244 */
245struct vm_page *
246uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti)
247{
248	struct	vm_page *node, *best;
249
250	KASSERT(sz >= 1);
251
252	if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti]))
253		return TAILQ_FIRST(&pmr->single[mti]);
254
255	node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]);
256	best = NULL;
257	while (node != NULL) {
258		if ((node - 1)->fpgsz >= sz) {
259			best = (node - 1);
260			node = RBT_LEFT(uvm_objtree, node);
261		} else
262			node = RBT_RIGHT(uvm_objtree, node);
263	}
264	return best;
265}
266
267/*
268 * Finds the next range. The next range has a size >= pg->fpgsz.
269 * Returns NULL if no more ranges are available.
270 */
271struct vm_page *
272uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt)
273{
274	struct vm_page *npg;
275
276	KASSERT(pmr != NULL && pg != NULL);
277	if (pg->fpgsz == 1) {
278		if (TAILQ_NEXT(pg, pageq) != NULL)
279			return TAILQ_NEXT(pg, pageq);
280		else
281			npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]);
282	} else
283		npg = RBT_NEXT(uvm_pmr_size, pg + 1);
284
285	return npg == NULL ? NULL : npg - 1;
286}
287
288/*
289 * Finds the previous and next ranges relative to the (uninserted) pg range.
290 *
291 * *pg_prev == NULL if no previous range that can join with pg is
292 * 	available.
293 * *pg_next == NULL if no next range that can join with pg is
294 * 	available.
295 */
296void
297uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg,
298    struct vm_page **pg_prev, struct vm_page **pg_next)
299{
300	KASSERT(pg_prev != NULL && pg_next != NULL);
301
302	*pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
303	if (*pg_next == NULL)
304		*pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
305	else
306		*pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next);
307
308	KDASSERT(*pg_next == NULL ||
309	    VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg));
310	KDASSERT(*pg_prev == NULL ||
311	    VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg));
312
313	/* Reset if not contig. */
314	if (*pg_prev != NULL &&
315	    (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz
316	    != atop(VM_PAGE_TO_PHYS(pg)) ||
317	    *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */
318	    uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg)))
319		*pg_prev = NULL;
320	if (*pg_next != NULL &&
321	    (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz
322	    != atop(VM_PAGE_TO_PHYS(*pg_next)) ||
323	    pg + pg->fpgsz != *pg_next || /* Array broke. */
324	    uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg)))
325		*pg_next = NULL;
326	return;
327}
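
/*
 * Illustrative example (made-up page frame numbers): if pg describes
 * pfns 105-109 and the address tree holds a free range covering pfns
 * 100-104 of the same memtype whose struct vm_page entries are contiguous
 * in the same array, that range is returned in *pg_prev so the caller can
 * merge the two into one range covering pfns 100-109. If the neighbour
 * differs in memtype, is not physically adjacent, or lives in another
 * array, the corresponding pointer is reset to NULL instead.
 */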
328
329/*
330 * Remove a range from the address tree.
331 * Address tree maintains pmr counters.
332 */
333void
334uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg)
335{
336	KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
337	KDASSERT(pg->pg_flags & PQ_FREE);
338	RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg);
339
340	pmr->nsegs--;
341}
342/*
343 * Remove a range from the size tree.
344 */
345void
346uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
347{
348	int memtype;
349#ifdef DEBUG
350	struct vm_page *i;
351#endif
352
353	KDASSERT(pg->fpgsz >= 1);
354	KDASSERT(pg->pg_flags & PQ_FREE);
355	memtype = uvm_pmr_pg_to_memtype(pg);
356
357	if (pg->fpgsz == 1) {
358#ifdef DEBUG
359		TAILQ_FOREACH(i, &pmr->single[memtype], pageq) {
360			if (i == pg)
361				break;
362		}
363		KDASSERT(i == pg);
364#endif
365		TAILQ_REMOVE(&pmr->single[memtype], pg, pageq);
366	} else {
367		KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype],
368		    pg + 1) == pg + 1);
369		RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1);
370	}
371}
372/* Remove from both trees. */
373void
374uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg)
375{
376	uvm_pmr_assertvalid(pmr);
377	uvm_pmr_remove_size(pmr, pg);
378	uvm_pmr_remove_addr(pmr, pg);
379	uvm_pmr_assertvalid(pmr);
380}
381
382/*
383 * Insert the range described in pg.
384 * Returns the range thus created (which may be joined with the previous and
385 * next ranges).
386 * If no_join, the caller guarantees that the range cannot possibly join
387 * with adjacent ranges.
388 */
389struct vm_page *
390uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
391{
392	struct vm_page *prev, *next;
393
394#ifdef DEBUG
395	struct vm_page *i;
396	int mt;
397#endif
398
399	KDASSERT(pg->pg_flags & PQ_FREE);
400	KDASSERT(pg->fpgsz >= 1);
401
402#ifdef DEBUG
403	for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
404		TAILQ_FOREACH(i, &pmr->single[mt], pageq)
405			KDASSERT(i != pg);
406		if (pg->fpgsz > 1) {
407			KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt],
408			    pg + 1) == NULL);
409		}
410		KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL);
411	}
412#endif
413
414	if (!no_join) {
415		uvm_pmr_pnaddr(pmr, pg, &prev, &next);
416		if (next != NULL) {
417			uvm_pmr_remove_size(pmr, next);
418			uvm_pmr_remove_addr(pmr, next);
419			pg->fpgsz += next->fpgsz;
420			next->fpgsz = 0;
421		}
422		if (prev != NULL) {
423			uvm_pmr_remove_size(pmr, prev);
424			prev->fpgsz += pg->fpgsz;
425			pg->fpgsz = 0;
426			return prev;
427		}
428	}
429
430	RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg);
431
432	pmr->nsegs++;
433
434	return pg;
435}
436/*
437 * Insert the range described in pg into the size tree, or onto the
438 * single-page queue if the range is only one page long.
439 * Joining with neighbouring ranges is done by uvm_pmr_insert_addr();
440 * the page must already be in the address tree.
441 */
442void
443uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg)
444{
445	int memtype;
446#ifdef DEBUG
447	struct vm_page *i;
448	int mti;
449#endif
450
451	KDASSERT(pg->fpgsz >= 1);
452	KDASSERT(pg->pg_flags & PQ_FREE);
453
454	memtype = uvm_pmr_pg_to_memtype(pg);
455#ifdef DEBUG
456	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
457		TAILQ_FOREACH(i, &pmr->single[mti], pageq)
458			KDASSERT(i != pg);
459		if (pg->fpgsz > 1) {
460			KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti],
461			    pg + 1) == NULL);
462		}
463		KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg);
464	}
465	for (i = pg; i < pg + pg->fpgsz; i++)
466		KASSERT(uvm_pmr_pg_to_memtype(i) == memtype);
467#endif
468
469	if (pg->fpgsz == 1)
470		TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq);
471	else
472		RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1);
473}
474/* Insert in both trees. */
475struct vm_page *
476uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join)
477{
478	uvm_pmr_assertvalid(pmr);
479	pg = uvm_pmr_insert_addr(pmr, pg, no_join);
480	uvm_pmr_insert_size(pmr, pg);
481	uvm_pmr_assertvalid(pmr);
482	return pg;
483}
484
485/*
486 * Find the last page that is part of this segment.
487 * => pg: the range at which to start the search.
488 * => boundary: the page number boundary specification (0 = no boundary).
489 * => pmr: the pmemrange of the page.
490 *
491 * This function returns the page just before the next range (i.e. the last
492 * page of this segment); to reach the next range itself, run
493 * TAILQ_NEXT(result, pageq) on the result. This way, the length of the
494 * segment is easily calculated using: atop(result) - atop(pg) + 1.
495 * Hence this function also never returns NULL.
496 */
497struct vm_page *
498uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr,
499    struct vm_page *pg, paddr_t boundary)
500{
501	paddr_t	first_boundary;
502	struct	vm_page *next;
503	struct	vm_page *prev;
504
505	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
506	    pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
507	if (boundary != 0) {
508		first_boundary =
509		    PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary);
510	} else
511		first_boundary = 0;
512
513	/*
514	 * Increase next until it hits the first page of the next segment.
515	 *
516	 * While loop checks the following:
517	 * - next != NULL	we have not reached the end of pgl
518	 * - boundary == 0 || next < first_boundary
519	 *			we do not cross a boundary
520	 * - atop(prev) + 1 == atop(next)
521	 *			still in the same segment
522	 * - low <= last
523	 * - high > last	still in the same memory range
524	 * - memtype is equal	allocator is unable to view different memtypes
525	 *			as part of the same segment
526	 * - prev + 1 == next	no array breakage occurs
527	 */
528	prev = pg;
529	next = TAILQ_NEXT(prev, pageq);
530	while (next != NULL &&
531	    (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) &&
532	    atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) &&
533	    pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
534	    pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
535	    uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
536	    prev + 1 == next) {
537		prev = next;
538		next = TAILQ_NEXT(prev, pageq);
539	}
540
541	/*
542	 * End of this segment.
543	 */
544	return prev;
545}
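
/*
 * Illustrative example: if the pageq links visit pages with pfns
 * 10, 11, 12, 15 (same memtype, same pmemrange) and boundary == 0, then
 * calling this with pg = the page at pfn 10 returns the page at pfn 12;
 * the segment length is 12 - 10 + 1 = 3 and TAILQ_NEXT() of the result
 * is the page at pfn 15, the start of the next segment.
 */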
546
547/*
548 * Find the first page that is part of this segment.
549 * => pg: the range at which to start the search.
550 * => boundary: the page number boundary specification (0 = no boundary).
551 * => pmr: the pmemrange of the page.
552 *
553 * This function returns the page just after the previous range (i.e. the
554 * first page of this segment); to reach the previous range itself, run
555 * TAILQ_NEXT(result, pageq) on the result. This way, the length of the
556 * segment is easily calculated using: atop(pg) - atop(result) + 1.
557 * Hence this function also never returns NULL.
558 */
559struct vm_page *
560uvm_pmr_findprevsegment(struct uvm_pmemrange *pmr,
561    struct vm_page *pg, paddr_t boundary)
562{
563	paddr_t	first_boundary;
564	struct	vm_page *next;
565	struct	vm_page *prev;
566
567	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) &&
568	    pmr->high > atop(VM_PAGE_TO_PHYS(pg)));
569	if (boundary != 0) {
570		first_boundary =
571		    PMR_ALIGN_DOWN(atop(VM_PAGE_TO_PHYS(pg)), boundary);
572	} else
573		first_boundary = 0;
574
575	/*
576	 * Advance next until it hits the first page of the previous segment.
577	 *
578	 * While loop checks the following:
579	 * - next != NULL	we have not reached the end of pgl
580	 * - boundary == 0 || next >= first_boundary
581	 *			we do not cross a boundary
582	 * - atop(prev) - 1 == atop(next)
583	 *			still in the same segment
584	 * - low <= last
585	 * - high > last	still in the same memory range
586	 * - memtype is equal	allocator is unable to view different memtypes
587	 *			as part of the same segment
588	 * - prev - 1 == next	no array breakage occurs
589	 */
590	prev = pg;
591	next = TAILQ_NEXT(prev, pageq);
592	while (next != NULL &&
593	    (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) >= first_boundary) &&
594	    atop(VM_PAGE_TO_PHYS(prev)) - 1 == atop(VM_PAGE_TO_PHYS(next)) &&
595	    pmr->low <= atop(VM_PAGE_TO_PHYS(next)) &&
596	    pmr->high > atop(VM_PAGE_TO_PHYS(next)) &&
597	    uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) &&
598	    prev - 1 == next) {
599		prev = next;
600		next = TAILQ_NEXT(prev, pageq);
601	}
602
603	/*
604	 * Start of this segment.
605	 */
606	return prev;
607}
608
609/*
610 * Remove the first segment of contiguous pages from pgl.
611 * A segment ends if it crosses boundary (unless boundary = 0) or
612 * if it would enter a different uvm_pmemrange.
613 *
614 * Work: the page range that the caller is currently working with.
615 * May be NULL.
616 *
617 * If is_desperate is non-zero, the smallest segment is erased. Otherwise,
618 * the first segment is erased (which, if called by uvm_pmr_getpages(),
619 * probably is the smallest or very close to it).
620 */
621psize_t
622uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary,
623    struct vm_page **work, int is_desperate)
624{
625	struct vm_page *start, *end, *iter, *iter_end, *inserted, *lowest;
626	psize_t count;
627	struct uvm_pmemrange *pmr, *pmr_iter;
628
629	KASSERT(!TAILQ_EMPTY(pgl));
630
631	/*
632	 * Initialize to first page.
633	 * Unless desperate scan finds a better candidate, this is what'll be
634	 * erased.
635	 */
636	start = TAILQ_FIRST(pgl);
637	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
638	end = uvm_pmr_findnextsegment(pmr, start, boundary);
639
640	/*
641	 * If we are desperate, we _really_ want to get rid of the smallest
642	 * element (rather than a close match to the smallest element).
643	 */
644	if (is_desperate) {
645		/* Linear search for smallest segment. */
646		pmr_iter = pmr;
647		for (iter = TAILQ_NEXT(end, pageq);
648		    iter != NULL && start != end;
649		    iter = TAILQ_NEXT(iter_end, pageq)) {
650			/*
651			 * Only update pmr if it doesn't match current
652			 * iteration.
653			 */
654			if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) ||
655			    pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) {
656				pmr_iter = uvm_pmemrange_find(atop(
657				    VM_PAGE_TO_PHYS(iter)));
658			}
659
660			iter_end = uvm_pmr_findnextsegment(pmr_iter, iter,
661			    boundary);
662
663			/*
664			 * Current iteration is smaller than best match so
665			 * far; update.
666			 */
667			if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) <
668			    VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) {
669				start = iter;
670				end = iter_end;
671				pmr = pmr_iter;
672			}
673		}
674	}
675
676	/*
677	 * Calculate count and end of the list.
678	 */
679	count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1;
680	lowest = start;
681	end = TAILQ_NEXT(end, pageq);
682
683	/*
684	 * Actually remove the range of pages.
685	 *
686	 * Sadly, this cannot be done using pointer iteration:
687	 * vm_physseg is not guaranteed to be sorted on address, hence
688	 * uvm_page_init() may not have initialized its array sorted by
689	 * page number.
690	 */
691	for (iter = start; iter != end; iter = iter_end) {
692		iter_end = TAILQ_NEXT(iter, pageq);
693		TAILQ_REMOVE(pgl, iter, pageq);
694	}
695
696	lowest->fpgsz = count;
697	inserted = uvm_pmr_insert(pmr, lowest, 0);
698
699	/*
700	 * If the caller was working on a range and this function modified
701	 * that range, update the pointer.
702	 */
703	if (work != NULL && *work != NULL &&
704	    atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) &&
705	    atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz >
706	    atop(VM_PAGE_TO_PHYS(*work)))
707		*work = inserted;
708	return count;
709}
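
/*
 * Illustrative example: with pgl containing pages at pfns 10, 11, 12, 50,
 * 51 (all in one pmemrange and memtype), boundary == 0 and
 * is_desperate == 0, the first segment {10, 11, 12} is unlinked from pgl,
 * re-inserted into its pmemrange as one free range with fpgsz == 3, and
 * 3 is returned; pgl is left holding the pages at pfns 50 and 51.
 */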
710
711/*
712 * Remove the first segment of contiguous pages from a pgl
713 * with the list elements in reverse order of physaddr.
714 *
715 * A segment ends if it would enter a different uvm_pmemrange.
716 *
717 * Stores starting physical address of the segment in pstart.
718 */
719psize_t
720uvm_pmr_remove_1strange_reverse(struct pglist *pgl, paddr_t *pstart)
721{
722	struct vm_page *start, *end, *iter, *iter_end, *lowest;
723	psize_t count;
724	struct uvm_pmemrange *pmr;
725
726	KASSERT(!TAILQ_EMPTY(pgl));
727
728	start = TAILQ_FIRST(pgl);
729	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start)));
730	end = uvm_pmr_findprevsegment(pmr, start, 0);
731
732	KASSERT(end <= start);
733
734	/*
735	 * Calculate count and end of the list.
736	 */
737	count = atop(VM_PAGE_TO_PHYS(start) - VM_PAGE_TO_PHYS(end)) + 1;
738	lowest = end;
739	end = TAILQ_NEXT(end, pageq);
740
741	/*
742	 * Actually remove the range of pages.
743	 *
744	 * Sadly, this cannot be done using pointer iteration:
745	 * vm_physseg is not guaranteed to be sorted on address, hence
746	 * uvm_page_init() may not have initialized its array sorted by
747	 * page number.
748	 */
749	for (iter = start; iter != end; iter = iter_end) {
750		iter_end = TAILQ_NEXT(iter, pageq);
751		TAILQ_REMOVE(pgl, iter, pageq);
752	}
753
754	lowest->fpgsz = count;
755	(void) uvm_pmr_insert(pmr, lowest, 0);
756
757	*pstart = VM_PAGE_TO_PHYS(lowest);
758	return count;
759}
760
761/*
762 * Extract a number of pages from a segment of free pages.
763 * Called by uvm_pmr_getpages.
764 *
765 * Returns the segment that was created from pages left over at the tail
766 * of the removed set of pages, or NULL if no pages were left at the tail.
767 */
768struct vm_page *
769uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg,
770    paddr_t start, paddr_t end, struct pglist *result)
771{
772	struct vm_page *after, *pg_i;
773	psize_t before_sz, after_sz;
774#ifdef DEBUG
775	psize_t i;
776#endif
777
778	KDASSERT(end > start);
779	KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)));
780	KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz);
781	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start);
782	KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end);
783
784	before_sz = start - atop(VM_PAGE_TO_PHYS(pg));
785	after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end;
786	KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz);
787	uvm_pmr_assertvalid(pmr);
788
789	uvm_pmr_remove_size(pmr, pg);
790	if (before_sz == 0)
791		uvm_pmr_remove_addr(pmr, pg);
792	after = pg + before_sz + (end - start);
793
794	/* Add selected pages to result. */
795	for (pg_i = pg + before_sz; pg_i != after; pg_i++) {
796		KASSERT(pg_i->pg_flags & PQ_FREE);
797		pg_i->fpgsz = 0;
798		TAILQ_INSERT_TAIL(result, pg_i, pageq);
799	}
800
801	/* Before handling. */
802	if (before_sz > 0) {
803		pg->fpgsz = before_sz;
804		uvm_pmr_insert_size(pmr, pg);
805	}
806
807	/* After handling. */
808	if (after_sz > 0) {
809#ifdef DEBUG
810		for (i = 0; i < after_sz; i++) {
811			KASSERT(!uvm_pmr_isfree(after + i));
812		}
813#endif
814		KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end);
815		after->fpgsz = after_sz;
816		after = uvm_pmr_insert_addr(pmr, after, 1);
817		uvm_pmr_insert_size(pmr, after);
818	}
819
820	uvm_pmr_assertvalid(pmr);
821	return (after_sz > 0 ? after : NULL);
822}
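
/*
 * Worked example (made-up numbers): if pg is a free range of fpgsz == 50
 * pages starting at pfn 100 and the caller extracts [start, end) =
 * [110, 120), then before_sz == 10 and after_sz == 30: pages 110-119 are
 * appended to result, pg keeps pfns 100-109 with fpgsz == 10, and a new
 * range "after" covering pfns 120-149 with fpgsz == 30 is inserted and
 * returned.
 */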
823
824/*
825 * Indicate to the page daemon that a nowait call failed and it should
826 * recover at least some memory in the most restricted region (assumed
827 * to be dma_constraint).
828 */
829extern volatile int uvm_nowait_failed;
830
831/*
832 * Acquire a number of pages.
833 *
834 * count:	the number of pages returned
835 * start:	lowest page number
836 * end:		highest page number +1
837 * 		(start = end = 0: no limitation)
838 * align:	power-of-2 alignment constraint (align = 1: no alignment)
839 * boundary:	power-of-2 boundary (boundary = 0: no boundary)
840 * maxseg:	maximum number of segments to return
841 * flags:	UVM_PLA_* flags
842 * result:	returned pages storage (uses pageq)
843 */
844int
845uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align,
846    paddr_t boundary, int maxseg, int flags, struct pglist *result)
847{
848	struct	uvm_pmemrange *pmr;	/* Iterate memory ranges. */
849	struct	vm_page *found, *f_next; /* Iterate chunks. */
850	psize_t	fcount;			/* Current found pages. */
851	int	fnsegs;			/* Current segment counter. */
852	int	try, start_try;
853	psize_t	search[3];
854	paddr_t	fstart, fend;		/* Pages to be taken from found. */
855	int	memtype;		/* Requested memtype. */
856	int	memtype_init;		/* Best memtype. */
857	int	desperate;		/* True if allocation failed. */
858#ifdef DIAGNOSTIC
859	struct	vm_page *diag_prev;	/* Used during validation. */
860#endif /* DIAGNOSTIC */
861
862	/*
863	 * Validate arguments.
864	 */
865	KASSERT(count > 0);
866	KASSERT(start == 0 || end == 0 || start < end);
867	KASSERT(align >= 1);
868	KASSERT(powerof2(align));
869	KASSERT(maxseg > 0);
870	KASSERT(boundary == 0 || powerof2(boundary));
871	KASSERT(boundary == 0 || maxseg * boundary >= count);
872	KASSERT(TAILQ_EMPTY(result));
873	KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT));
874
875	/*
876	 * TRYCONTIG is a noop if you only want a single segment.
877	 * Remove it if that's the case: otherwise it'll deny the fast
878	 * allocation.
879	 */
880	if (maxseg == 1 || count == 1)
881		flags &= ~UVM_PLA_TRYCONTIG;
882
883	/*
884	 * Configure search.
885	 *
886	 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case.
887	 * search[1] is multiple segments, chosen to fulfill the search in
888	 *   approximately even-sized segments.
889	 *   This is a good trade-off between slightly reduced allocation speed
890	 *   and less fragmentation.
891	 * search[2] is the worst case, in which all segments are evaluated.
892	 *   This provides the least fragmentation, but makes the search
893	 *   possibly longer (although in the case it is selected, that no
894	 *   longer matters much).
895	 *
896	 * The exception is when maxseg == 1: since we can only fulfill that
897	 * with one segment of size pages, only a single search type has to
898	 * be attempted.
899	 */
900	if (maxseg == 1 || count == 1) {
901		start_try = 2;
902		search[2] = count;
903	} else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) {
904		start_try = 2;
905		search[2] = 1;
906	} else {
907		start_try = 0;
908		search[0] = count;
909		search[1] = pow2divide(count, maxseg);
910		search[2] = 1;
911		if ((flags & UVM_PLA_TRYCONTIG) == 0)
912			start_try = 1;
913		if (search[1] >= search[0]) {
914			search[1] = search[0];
915			start_try = 1;
916		}
917		if (search[2] >= search[start_try]) {
918			start_try = 2;
919		}
920	}
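
	/*
	 * Worked example (illustrative, made-up values): for count == 32,
	 * maxseg == 4 and UVM_PLA_TRYCONTIG set, the block above yields
	 * search[0] == 32, search[1] == pow2divide(32, 4) == 8 and
	 * search[2] == 1, with start_try == 0; without UVM_PLA_TRYCONTIG,
	 * start_try would be 1 and the one-segment search[0] would be
	 * skipped.
	 */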
921
922	/*
923	 * Memory type: if zeroed memory is requested, traverse the zero set.
924	 * Otherwise, traverse the dirty set.
925	 *
926	 * The memtype iterator is reinitialized to memtype_init on entrance
927	 * of a pmemrange.
928	 */
929	if (flags & UVM_PLA_ZERO)
930		memtype_init = UVM_PMR_MEMTYPE_ZERO;
931	else
932		memtype_init = UVM_PMR_MEMTYPE_DIRTY;
933
934	/*
935	 * Initially, we're not desperate.
936	 *
937	 * Note that if we return from a sleep, we are still desperate.
938	 * Chances are that memory pressure is still high, so resetting
939	 * seems over-optimistic to me.
940	 */
941	desperate = 0;
942
943again:
944	uvm_lock_fpageq();
945
946	/*
947	 * Check to see if we need to generate some free pages by waking
948	 * the pagedaemon.
949	 */
950	if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin ||
951	    ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg &&
952	    (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg))
953		wakeup(&uvm.pagedaemon);
954
955	/*
956	 * fail if any of these conditions is true:
957	 * [1]  there really are no free pages, or
958	 * [2]  only kernel "reserved" pages remain and
959	 *        the UVM_PLA_USERESERVE flag wasn't used.
960	 * [3]  only pagedaemon "reserved" pages remain and
961	 *        the requestor isn't the pagedaemon nor the syncer.
962	 */
963	if ((uvmexp.free <= (uvmexp.reserve_kernel + count)) &&
964	    !(flags & UVM_PLA_USERESERVE)) {
965		uvm_unlock_fpageq();
966		return ENOMEM;
967	}
968
969	if ((uvmexp.free <= (uvmexp.reserve_pagedaemon + count)) &&
970	    (curproc != uvm.pagedaemon_proc) && (curproc != syncerproc)) {
971		uvm_unlock_fpageq();
972		if (flags & UVM_PLA_WAITOK) {
973			uvm_wait("uvm_pmr_getpages");
974			goto again;
975		}
976		return ENOMEM;
977	}
978
979retry:		/* Return point after sleeping. */
980	fcount = 0;
981	fnsegs = 0;
982
983retry_desperate:
984	/*
985	 * If we just want any page(s), go for the really fast option.
986	 */
987	if (count <= maxseg && align == 1 && boundary == 0 &&
988	    (flags & UVM_PLA_TRYCONTIG) == 0) {
989		fcount += uvm_pmr_get1page(count - fcount, memtype_init,
990		    result, start, end, 0);
991
992		/*
993		 * If we found sufficient pages, go to the success exit code.
994		 *
995		 * Otherwise, go immediately to fail, since we collected
996		 * all we could anyway.
997		 */
998		if (fcount == count)
999			goto out;
1000		else
1001			goto fail;
1002	}
1003
1004	/*
1005	 * The heart of the contig case.
1006	 *
1007	 * The code actually looks like this:
1008	 *
1009	 * foreach (struct pmemrange) {
1010	 *	foreach (memtype) {
1011	 *		foreach(try) {
1012	 *			foreach (free range of memtype in pmemrange,
1013	 *			    starting at search[try]) {
1014	 *				while (range has space left)
1015	 *					take from range
1016	 *			}
1017	 *		}
1018	 *	}
1019	 *
1020	 *	if next pmemrange has higher usecount than current:
1021	 *		enter desperate case (which will drain the pmemranges
1022	 *		until empty prior to moving to the next one)
1023	 * }
1024	 *
1025	 * When desperate is activated, try always starts at the highest
1026	 * value. The memtype loop is using a goto ReScanMemtype.
1027	 * The try loop is using a goto ReScan.
1028	 * The 'range has space left' loop uses label DrainFound.
1029	 *
1030	 * Writing them all as loops would take up a lot of screen space in
1031	 * the form of indentation and some parts are easier to express
1032	 * using the labels.
1033	 */
1034
1035	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1036		/* Empty range. */
1037		if (pmr->nsegs == 0)
1038			continue;
1039
1040		/* Outside requested range. */
1041		if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1042			continue;
1043
1044		memtype = memtype_init;
1045
1046rescan_memtype:	/* Return point at memtype++. */
1047		try = start_try;
1048
1049rescan:		/* Return point at try++. */
1050		for (found = uvm_pmr_nfindsz(pmr, search[try], memtype);
1051		    found != NULL;
1052		    found = f_next) {
1053			f_next = uvm_pmr_nextsz(pmr, found, memtype);
1054
1055			fstart = atop(VM_PAGE_TO_PHYS(found));
1056			if (start != 0)
1057				fstart = MAX(start, fstart);
1058drain_found:
1059			/*
1060			 * Throw away the first segment if fnsegs == maxseg
1061			 *
1062			 * Note that f_next is still valid after this call,
1063			 * since we only allocated from entries before f_next.
1064			 * We don't revisit the entries we already extracted
1065			 * from unless we entered the desperate case.
1066			 */
1067			if (fnsegs == maxseg) {
1068				fnsegs--;
1069				fcount -=
1070				    uvm_pmr_remove_1strange(result, boundary,
1071				    &found, desperate);
1072			}
1073
1074			fstart = PMR_ALIGN(fstart, align);
1075			fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz;
1076			if (end != 0)
1077				fend = MIN(end, fend);
1078			if (boundary != 0) {
1079				fend =
1080				    MIN(fend, PMR_ALIGN(fstart + 1, boundary));
1081			}
1082			if (fstart >= fend)
1083				continue;
1084			if (fend - fstart > count - fcount)
1085				fend = fstart + (count - fcount);
1086
1087			fcount += fend - fstart;
1088			fnsegs++;
1089			found = uvm_pmr_extract_range(pmr, found,
1090			    fstart, fend, result);
1091
1092			if (fcount == count)
1093				goto out;
1094
1095			/*
1096			 * If there's still space left in found, try to
1097			 * fully drain it prior to continuing.
1098			 */
1099			if (found != NULL) {
1100				fstart = fend;
1101				goto drain_found;
1102			}
1103		}
1104
1105		/* Try a smaller search now. */
1106		if (++try < nitems(search))
1107			goto rescan;
1108
1109		/*
1110		 * Exhaust all memory types prior to going to the next memory
1111		 * segment.
1112		 * This means that zero-vs-dirty are eaten prior to moving
1113		 * to a pmemrange with a higher use-count.
1114		 *
1115		 * Code is basically a difficult way of writing:
1116		 * memtype = memtype_init;
1117		 * do {
1118		 *	...;
1119		 *	memtype += 1;
1120		 *	memtype %= MEMTYPE_MAX;
1121		 * } while (memtype != memtype_init);
1122		 */
1123		memtype += 1;
1124		if (memtype == UVM_PMR_MEMTYPE_MAX)
1125			memtype = 0;
1126		if (memtype != memtype_init)
1127			goto rescan_memtype;
1128
1129		/*
1130		 * If not desperate, enter desperate case prior to eating all
1131		 * the good stuff in the next range.
1132		 */
1133		if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL &&
1134		    TAILQ_NEXT(pmr, pmr_use)->use != pmr->use)
1135			break;
1136	}
1137
1138	/*
1139	 * Not enough memory of the requested type available. Fall back to
1140	 * less good memory that we'll clean up better later.
1141	 *
1142	 * This algorithm is not very smart though: it just starts scanning
1143	 * a differently typed range, but the nicer ranges of the previous
1144	 * iteration may fall out. Hence there is a small chance of a false
1145	 * negative.
1146	 *
1147	 * When desperate: scan all sizes starting at the smallest
1148	 * (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may
1149	 * allow us to hit the fast path now).
1150	 *
1151	 * Also, because we will revisit entries we scanned before, we need
1152	 * to reset the page queue, or we may end up releasing entries in
1153	 * such a way as to invalidate f_next.
1154	 */
1155	if (!desperate) {
1156		desperate = 1;
1157		start_try = nitems(search) - 1;
1158		flags &= ~UVM_PLA_TRYCONTIG;
1159
1160		while (!TAILQ_EMPTY(result))
1161			uvm_pmr_remove_1strange(result, 0, NULL, 0);
1162		fnsegs = 0;
1163		fcount = 0;
1164		goto retry_desperate;
1165	}
1166
1167fail:
1168	/* Allocation failed. */
1169	/* XXX: claim from memory reserve here */
1170
1171	while (!TAILQ_EMPTY(result))
1172		uvm_pmr_remove_1strange(result, 0, NULL, 0);
1173
1174	if (flags & UVM_PLA_WAITOK) {
1175		if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count),
1176		    flags & UVM_PLA_FAILOK) == 0)
1177			goto retry;
1178		KASSERT(flags & UVM_PLA_FAILOK);
1179	} else {
1180		if (!(flags & UVM_PLA_NOWAKE)) {
1181			uvm_nowait_failed = 1;
1182			wakeup(&uvm.pagedaemon);
1183		}
1184	}
1185	uvm_unlock_fpageq();
1186
1187	return ENOMEM;
1188
1189out:
1190	/* Allocation successful. */
1191	uvmexp.free -= fcount;
1192
1193	uvm_unlock_fpageq();
1194
1195	/* Update statistics and zero pages if UVM_PLA_ZERO. */
1196#ifdef DIAGNOSTIC
1197	fnsegs = 0;
1198	fcount = 0;
1199	diag_prev = NULL;
1200#endif /* DIAGNOSTIC */
1201	TAILQ_FOREACH(found, result, pageq) {
1202		atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK);
1203
1204		if (found->pg_flags & PG_ZERO) {
1205			uvm_lock_fpageq();
1206			uvmexp.zeropages--;
1207			if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1208				wakeup(&uvmexp.zeropages);
1209			uvm_unlock_fpageq();
1210		}
1211		if (flags & UVM_PLA_ZERO) {
1212			if (found->pg_flags & PG_ZERO)
1213				uvmexp.pga_zerohit++;
1214			else {
1215				uvmexp.pga_zeromiss++;
1216				uvm_pagezero(found);
1217			}
1218		}
1219		atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE);
1220
1221		found->uobject = NULL;
1222		found->uanon = NULL;
1223		found->pg_version++;
1224
1225		/*
1226		 * Validate that the page matches the range criteria.
1227		 */
1228		KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start);
1229		KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end);
1230
1231#ifdef DIAGNOSTIC
1232		/*
1233		 * Update fcount (# found pages) and
1234		 * fnsegs (# found segments) counters.
1235		 */
1236		if (diag_prev == NULL ||
1237		    /* new segment if it contains a hole */
1238		    atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 !=
1239		    atop(VM_PAGE_TO_PHYS(found)) ||
1240		    /* new segment if it crosses boundary */
1241		    (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) !=
1242		    (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1)))
1243			fnsegs++;
1244		fcount++;
1245
1246		diag_prev = found;
1247#endif /* DIAGNOSTIC */
1248	}
1249
1250#ifdef DIAGNOSTIC
1251	/*
1252	 * Panic on algorithm failure.
1253	 */
1254	if (fcount != count || fnsegs > maxseg) {
1255		panic("pmemrange allocation error: "
1256		    "allocated %ld pages in %d segments, "
1257		    "but request was %ld pages in %d segments",
1258		    fcount, fnsegs, count, maxseg);
1259	}
1260#endif /* DIAGNOSTIC */
1261
1262	return 0;
1263}
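
/*
 * Usage sketch for uvm_pmr_getpages() (illustrative only; the constraint
 * values are made up): allocate 4 pages out of the first 16MB of physical
 * memory, in at most 2 segments, and give them back afterwards.
 *
 *	struct pglist pgl;
 *	int error;
 *
 *	TAILQ_INIT(&pgl);
 *	error = uvm_pmr_getpages(4, 0, atop(16 * 1024 * 1024), 1, 0, 2,
 *	    UVM_PLA_WAITOK, &pgl);
 *	if (error == 0)
 *		uvm_pmr_freepageq(&pgl);
 *
 * Most callers do not use this interface directly, but go through
 * uvm_pglistalloc()/uvm_pglistfree(), which take byte addresses and
 * end up calling into this code.
 */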
1264
1265/*
1266 * Acquire a single page.
1267 *
1268 * flags:	UVM_PLA_* flags
1269 * result:	returned page.
1270 */
1271struct vm_page *
1272uvm_pmr_getone(int flags)
1273{
1274	struct vm_page *pg;
1275	struct pglist pgl;
1276
1277	TAILQ_INIT(&pgl);
1278	if (uvm_pmr_getpages(1, 0, 0, 1, 0, 1, flags, &pgl) != 0)
1279		return NULL;
1280
1281	pg = TAILQ_FIRST(&pgl);
1282	KASSERT(pg != NULL && TAILQ_NEXT(pg, pageq) == NULL);
1283
1284	return pg;
1285}
1286
1287/*
1288 * Free a number of contig pages (invoked by uvm_page_init).
1289 */
1290void
1291uvm_pmr_freepages(struct vm_page *pg, psize_t count)
1292{
1293	struct uvm_pmemrange *pmr;
1294	psize_t i, pmr_count;
1295	struct vm_page *firstpg = pg;
1296
1297	for (i = 0; i < count; i++) {
1298		KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) ==
1299		    atop(VM_PAGE_TO_PHYS(pg)) + i);
1300
1301		if (!((pg[i].pg_flags & PQ_FREE) == 0 &&
1302		    VALID_FLAGS(pg[i].pg_flags))) {
1303			printf("Flags: 0x%x, will panic now.\n",
1304			    pg[i].pg_flags);
1305		}
1306		KASSERT((pg[i].pg_flags & PQ_FREE) == 0 &&
1307		    VALID_FLAGS(pg[i].pg_flags));
1308		atomic_setbits_int(&pg[i].pg_flags, PQ_FREE);
1309		atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
1310	}
1311
1312	uvm_lock_fpageq();
1313
1314	for (i = count; i > 0; i -= pmr_count) {
1315		pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1316		KASSERT(pmr != NULL);
1317
1318		pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg)));
1319		pg->fpgsz = pmr_count;
1320		uvm_pmr_insert(pmr, pg, 0);
1321
1322		uvmexp.free += pmr_count;
1323		pg += pmr_count;
1324	}
1325	wakeup(&uvmexp.free);
1326	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1327		wakeup(&uvmexp.zeropages);
1328
1329	uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count));
1330
1331	uvm_unlock_fpageq();
1332}
1333
1334/*
1335 * Free all pages in the queue.
1336 */
1337void
1338uvm_pmr_freepageq(struct pglist *pgl)
1339{
1340	struct vm_page *pg;
1341	paddr_t pstart;
1342	psize_t plen;
1343
1344	TAILQ_FOREACH(pg, pgl, pageq) {
1345		if (!((pg->pg_flags & PQ_FREE) == 0 &&
1346		    VALID_FLAGS(pg->pg_flags))) {
1347			printf("Flags: 0x%x, will panic now.\n",
1348			    pg->pg_flags);
1349		}
1350		KASSERT((pg->pg_flags & PQ_FREE) == 0 &&
1351		    VALID_FLAGS(pg->pg_flags));
1352		atomic_setbits_int(&pg->pg_flags, PQ_FREE);
1353		atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
1354	}
1355
1356	uvm_lock_fpageq();
1357	while (!TAILQ_EMPTY(pgl)) {
1358		pg = TAILQ_FIRST(pgl);
1359		if (pg == TAILQ_NEXT(pg, pageq) + 1) {
1360			/*
1361			 * If pg is one behind the position of the
1362			 * next page in the list in the page array,
1363			 * try going backwards instead of forward.
1364			 */
1365			plen = uvm_pmr_remove_1strange_reverse(pgl, &pstart);
1366		} else {
1367			pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl));
1368			plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0);
1369		}
1370		uvmexp.free += plen;
1371
1372		uvm_wakeup_pla(pstart, ptoa(plen));
1373	}
1374	wakeup(&uvmexp.free);
1375	if (uvmexp.zeropages < UVM_PAGEZERO_TARGET)
1376		wakeup(&uvmexp.zeropages);
1377	uvm_unlock_fpageq();
1378
1379	return;
1380}
1381
1382/*
1383 * Store a pmemrange in the list.
1384 *
1385 * The list is sorted by use.
1386 */
1387struct uvm_pmemrange *
1388uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq,
1389    struct uvm_pmemrange *pmr)
1390{
1391	struct uvm_pmemrange *iter;
1392	int cmp = 1;
1393
1394	TAILQ_FOREACH(iter, useq, pmr_use) {
1395		cmp = uvm_pmemrange_use_cmp(pmr, iter);
1396		if (cmp == 0)
1397			return iter;
1398		if (cmp == -1)
1399			break;
1400	}
1401
1402	if (iter == NULL)
1403		TAILQ_INSERT_TAIL(useq, pmr, pmr_use);
1404	else
1405		TAILQ_INSERT_BEFORE(iter, pmr, pmr_use);
1406	return NULL;
1407}
1408
1409#ifdef DEBUG
1410/*
1411 * Validation of the whole pmemrange.
1412 * Called with fpageq locked.
1413 */
1414void
1415uvm_pmr_assertvalid(struct uvm_pmemrange *pmr)
1416{
1417	struct vm_page *prev, *next, *i, *xref;
1418	int lcv, mti;
1419
1420	/* Empty range */
1421	if (pmr->nsegs == 0)
1422		return;
1423
1424	/* Validate address tree. */
1425	RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) {
1426		/* Validate the range. */
1427		KASSERT(i->fpgsz > 0);
1428		KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low);
1429		KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz
1430		    <= pmr->high);
1431
1432		/* Validate each page in this range. */
1433		for (lcv = 0; lcv < i->fpgsz; lcv++) {
1434			/*
1435			 * Only the first page has a size specification.
1436			 * Rest is size 0.
1437			 */
1438			KASSERT(lcv == 0 || i[lcv].fpgsz == 0);
1439			/*
1440			 * Flag check.
1441			 */
1442			KASSERT(VALID_FLAGS(i[lcv].pg_flags) &&
1443			    (i[lcv].pg_flags & PQ_FREE) == PQ_FREE);
1444			/*
1445			 * Free pages are:
1446			 * - not wired
1447			 * - have no vm_anon
1448			 * - have no uvm_object
1449			 */
1450			KASSERT(i[lcv].wire_count == 0);
1451			KASSERT(i[lcv].uanon == (void*)0xdeadbeef ||
1452			    i[lcv].uanon == NULL);
1453			KASSERT(i[lcv].uobject == (void*)0xdeadbeef ||
1454			    i[lcv].uobject == NULL);
1455			/*
1456			 * Pages in a single range always have the same
1457			 * memtype.
1458			 */
1459			KASSERT(uvm_pmr_pg_to_memtype(&i[0]) ==
1460			    uvm_pmr_pg_to_memtype(&i[lcv]));
1461		}
1462
1463		/* Check that it shouldn't be joined with its predecessor. */
1464		prev = RBT_PREV(uvm_pmr_addr, i);
1465		if (prev != NULL) {
1466			KASSERT(uvm_pmr_pg_to_memtype(i) !=
1467			    uvm_pmr_pg_to_memtype(prev) ||
1468			    atop(VM_PAGE_TO_PHYS(i)) >
1469			    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz ||
1470			    prev + prev->fpgsz != i);
1471		}
1472
1473		/* Assert i is in the size tree as well. */
1474		if (i->fpgsz == 1) {
1475			TAILQ_FOREACH(xref,
1476			    &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) {
1477				if (xref == i)
1478					break;
1479			}
1480			KASSERT(xref == i);
1481		} else {
1482			KASSERT(RBT_FIND(uvm_pmr_size,
1483			    &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) ==
1484			    i + 1);
1485		}
1486	}
1487
1488	/* Validate size tree. */
1489	for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) {
1490		for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) {
1491			next = uvm_pmr_nextsz(pmr, i, mti);
1492			if (next != NULL) {
1493				KASSERT(i->fpgsz <=
1494				    next->fpgsz);
1495			}
1496
1497			/* Assert i is in the addr tree as well. */
1498			KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i);
1499
1500			/* Assert i is of the correct memory type. */
1501			KASSERT(uvm_pmr_pg_to_memtype(i) == mti);
1502		}
1503	}
1504
1505	/* Validate nsegs statistic. */
1506	lcv = 0;
1507	RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr)
1508		lcv++;
1509	KASSERT(pmr->nsegs == lcv);
1510}
1511#endif /* DEBUG */
1512
1513/*
1514 * Split pmr at split point pageno.
1515 * Called with fpageq unlocked.
1516 *
1517 * Split is only applied if a pmemrange spans pageno.
1518 */
1519void
1520uvm_pmr_split(paddr_t pageno)
1521{
1522	struct uvm_pmemrange *pmr, *drain;
1523	struct vm_page *rebuild, *prev, *next;
1524	psize_t prev_sz;
1525
1526	uvm_lock_fpageq();
1527	pmr = uvm_pmemrange_find(pageno);
1528	if (pmr == NULL || !(pmr->low < pageno)) {
1529		/* No split required. */
1530		uvm_unlock_fpageq();
1531		return;
1532	}
1533
1534	KASSERT(pmr->low < pageno);
1535	KASSERT(pmr->high > pageno);
1536
1537	/*
1538	 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into
1539	 * uvm_kmemalloc which calls into pmemrange, making the locking
1540	 * a bit hard, so we just race!
1541	 */
1542	uvm_unlock_fpageq();
1543	drain = uvm_pmr_allocpmr();
1544	uvm_lock_fpageq();
1545	pmr = uvm_pmemrange_find(pageno);
1546	if (pmr == NULL || !(pmr->low < pageno)) {
1547		/*
1548		 * We lost the race since someone else ran this or a related
1549		 * function; however, this should be triggered very rarely, so
1550		 * we just leak the pmr.
1551		 */
1552		printf("uvm_pmr_split: lost one pmr\n");
1553		uvm_unlock_fpageq();
1554		return;
1555	}
1556
1557	drain->low = pageno;
1558	drain->high = pmr->high;
1559	drain->use = pmr->use;
1560
1561	uvm_pmr_assertvalid(pmr);
1562	uvm_pmr_assertvalid(drain);
1563	KASSERT(drain->nsegs == 0);
1564
1565	RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
1566		if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
1567			break;
1568	}
1569	if (rebuild == NULL)
1570		prev = RBT_MAX(uvm_pmr_addr, &pmr->addr);
1571	else
1572		prev = RBT_PREV(uvm_pmr_addr, rebuild);
1573	KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1574
1575	/*
1576	 * Handle free chunk that spans the split point.
1577	 */
1578	if (prev != NULL &&
1579	    atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) {
1580		psize_t before, after;
1581
1582		KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);
1583
1584		uvm_pmr_remove(pmr, prev);
1585		prev_sz = prev->fpgsz;
1586		before = pageno - atop(VM_PAGE_TO_PHYS(prev));
1587		after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;
1588
1589		KASSERT(before > 0);
1590		KASSERT(after > 0);
1591
1592		prev->fpgsz = before;
1593		uvm_pmr_insert(pmr, prev, 1);
1594		(prev + before)->fpgsz = after;
1595		uvm_pmr_insert(drain, prev + before, 1);
1596	}
1597
1598	/* Move free chunks that no longer fall in the range. */
1599	for (; rebuild != NULL; rebuild = next) {
1600		next = RBT_NEXT(uvm_pmr_addr, rebuild);
1601
1602		uvm_pmr_remove(pmr, rebuild);
1603		uvm_pmr_insert(drain, rebuild, 1);
1604	}
1605
1606	pmr->high = pageno;
1607	uvm_pmr_assertvalid(pmr);
1608	uvm_pmr_assertvalid(drain);
1609
1610	RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
1611	uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
1612	uvm_unlock_fpageq();
1613}
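
/*
 * Illustrative example (made-up numbers): splitting a pmemrange covering
 * pfns [0, 0x40000) at pageno 0x1000 shrinks it to [0, 0x1000) and moves
 * everything else into a new range [0x1000, 0x40000) with the same use
 * count. A free chunk spanning the split point, e.g. pfns [0xff0, 0x1010),
 * is divided into [0xff0, 0x1000) in the old range and [0x1000, 0x1010)
 * in the new one.
 */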
1614
1615/*
1616 * Increase the usage counter for the given range of memory.
1617 *
1618 * The higher the use count of a given range of memory, the harder the
1619 * allocator will try to avoid allocating from it.
1620 *
1621 * Addresses here are in paddr_t, not page-numbers.
1622 * The lowest and highest allowed address are specified.
1623 */
1624void
1625uvm_pmr_use_inc(paddr_t low, paddr_t high)
1626{
1627	struct uvm_pmemrange *pmr;
1628	paddr_t sz;
1629
1630	/* pmr uses page numbers, translate low and high. */
1631	high++;
1632	high = atop(trunc_page(high));
1633	low = atop(round_page(low));
1634	uvm_pmr_split(low);
1635	uvm_pmr_split(high);
1636
1637	sz = 0;
1638	uvm_lock_fpageq();
1639	/* Increase use count on segments in range. */
1640	RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) {
1641		if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) {
1642			TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use);
1643			pmr->use++;
1644			sz += pmr->high - pmr->low;
1645			uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr);
1646		}
1647		uvm_pmr_assertvalid(pmr);
1648	}
1649	uvm_unlock_fpageq();
1650
1651	KASSERT(sz >= high - low);
1652}
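
/*
 * Illustrative example: uvm_pmr_use_inc(0, 0x00ffffff) splits off the
 * first 16MB of physical memory into its own pmemrange (if it was not
 * already separate) and bumps its use counter, so that ordinary
 * allocations prefer memory above 16MB and the low range is kept for
 * callers that need it. uvm_pmr_init() below does exactly this for every
 * uvm_md_constraints[] entry.
 */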
1653
1654/*
1655 * Allocate a pmemrange.
1656 *
1657 * If called from uvm_page_init, uvm_pageboot_alloc is used.
1658 * If called after uvm_init, malloc is used.
1659 * (And if called in between, you're dead.)
1660 */
1661struct uvm_pmemrange *
1662uvm_pmr_allocpmr(void)
1663{
1664	struct uvm_pmemrange *nw;
1665	int i;
1666
1667	/* We're only ever hitting the !uvm.page_init_done case for now. */
1668	if (!uvm.page_init_done) {
1669		nw = (struct uvm_pmemrange *)
1670		    uvm_pageboot_alloc(sizeof(struct uvm_pmemrange));
1671	} else {
1672		nw = malloc(sizeof(struct uvm_pmemrange),
1673		    M_VMMAP, M_NOWAIT);
1674	}
1675	KASSERT(nw != NULL);
1676	memset(nw, 0, sizeof(struct uvm_pmemrange));
1677	RBT_INIT(uvm_pmr_addr, &nw->addr);
1678	for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) {
1679		RBT_INIT(uvm_pmr_size, &nw->size[i]);
1680		TAILQ_INIT(&nw->single[i]);
1681	}
1682	return nw;
1683}
1684
1685/*
1686 * Initialization of pmr.
1687 * Called by uvm_page_init.
1688 *
1689 * Sets up pmemranges.
1690 */
1691void
1692uvm_pmr_init(void)
1693{
1694	struct uvm_pmemrange *new_pmr;
1695	int i;
1696
1697	TAILQ_INIT(&uvm.pmr_control.use);
1698	RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
1699	TAILQ_INIT(&uvm.pmr_control.allocs);
1700
1701	/* By default, one range for the entire address space. */
1702	new_pmr = uvm_pmr_allocpmr();
1703	new_pmr->low = 0;
1704	new_pmr->high = atop((paddr_t)-1) + 1;
1705
1706	RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr);
1707	uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr);
1708
1709	for (i = 0; uvm_md_constraints[i] != NULL; i++) {
1710		uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low,
1711	    	    uvm_md_constraints[i]->ucr_high);
1712	}
1713}
1714
1715/*
1716 * Find the pmemrange that contains the given page number.
1717 *
1718 * (Manually traverses the binary tree, because that is cheaper on stack
1719 * usage.)
1720 */
1721struct uvm_pmemrange *
1722uvm_pmemrange_find(paddr_t pageno)
1723{
1724	struct uvm_pmemrange *pmr;
1725
1726	pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr);
1727	while (pmr != NULL) {
1728		if (pmr->low > pageno)
1729			pmr = RBT_LEFT(uvm_pmemrange_addr, pmr);
1730		else if (pmr->high <= pageno)
1731			pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr);
1732		else
1733			break;
1734	}
1735
1736	return pmr;
1737}
1738
1739#if defined(DDB) || defined(DEBUG)
1740/*
1741 * Return true if the given page is in any of the free lists.
1742 * Used by uvm_page_printit.
1743 * This function is safe, even if the page is not on the freeq.
1744 * Note: does not apply locking, only called from ddb.
1745 */
1746int
1747uvm_pmr_isfree(struct vm_page *pg)
1748{
1749	struct vm_page *r;
1750	struct uvm_pmemrange *pmr;
1751
1752	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
1753	if (pmr == NULL)
1754		return 0;
1755	r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg);
1756	if (r == NULL)
1757		r = RBT_MAX(uvm_pmr_addr, &pmr->addr);
1758	else if (r != pg)
1759		r = RBT_PREV(uvm_pmr_addr, r);
1760	if (r == NULL)
1761		return 0; /* Empty tree. */
1762
1763	KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg)));
1764	return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz >
1765	    atop(VM_PAGE_TO_PHYS(pg));
1766}
1767#endif /* DDB || DEBUG */
1768
1769/*
1770 * Given a root of a tree, find a range which intersects start, end and
1771 * is of the same memtype.
1772 *
1773 * Page must be in the address tree.
1774 */
1775struct vm_page*
1776uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root,
1777    paddr_t start, paddr_t end, int memtype)
1778{
1779	int	direction;
1780	struct	vm_page *root;
1781	struct	vm_page *high, *high_next;
1782	struct	vm_page *low, *low_next;
1783
1784	KDASSERT(pmr != NULL && init_root != NULL);
1785	root = init_root;
1786
1787	/* Which direction to use for searching. */
1788	if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start)
1789		direction =  1;
1790	else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end)
1791		direction = -1;
1792	else /* nothing to do */
1793		return root;
1794
1795	/* First, update root to fall within the chosen range. */
1796	while (root && !PMR_INTERSECTS_WITH(
1797	    atop(VM_PAGE_TO_PHYS(root)),
1798	    atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz,
1799	    start, end)) {
1800		if (direction == 1)
1801			root = RBT_RIGHT(uvm_objtree, root);
1802		else
1803			root = RBT_LEFT(uvm_objtree, root);
1804	}
1805	if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype)
1806		return root;
1807
1808	/*
1809	 * Root is valid, but of the wrong memtype.
1810	 *
1811	 * Try to find a range that has the given memtype in the subtree
1812	 * (memtype mismatches are costly, either because the conversion
1813	 * is expensive, or a later allocation will need to do the opposite
1814	 * conversion, which will be expensive).
1815	 *
1816	 *
1817	 * First, simply increase address until we hit something we can use.
1818	 * Cache the upper page, so we can page-walk later.
1819	 */
1820	high = root;
1821	high_next = RBT_RIGHT(uvm_objtree, high);
1822	while (high_next != NULL && PMR_INTERSECTS_WITH(
1823	    atop(VM_PAGE_TO_PHYS(high_next)),
1824	    atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz,
1825	    start, end)) {
1826		high = high_next;
1827		if (uvm_pmr_pg_to_memtype(high) == memtype)
1828			return high;
1829		high_next = RBT_RIGHT(uvm_objtree, high);
1830	}
1831
1832	/*
1833	 * Second, decrease the address until we hit something we can use.
1834	 * Cache the lower page, so we can page-walk later.
1835	 */
1836	low = root;
1837	low_next = RBT_LEFT(uvm_objtree, low);
1838	while (low_next != NULL && PMR_INTERSECTS_WITH(
1839	    atop(VM_PAGE_TO_PHYS(low_next)),
1840	    atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz,
1841	    start, end)) {
1842		low = low_next;
1843		if (uvm_pmr_pg_to_memtype(low) == memtype)
1844			return low;
1845		low_next = RBT_LEFT(uvm_objtree, low);
1846	}
1847
1848	if (low == high)
1849		return NULL;
1850
1851	/* No hits. Walk the address tree until we find something usable. */
1852	for (low = RBT_NEXT(uvm_pmr_addr, low);
1853	    low != high;
1854	    low = RBT_NEXT(uvm_pmr_addr, low)) {
1855		KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)),
1856	    	    atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz,
1857	    	    start, end));
1858		if (uvm_pmr_pg_to_memtype(low) == memtype)
1859			return low;
1860	}
1861
1862	/* Nothing found. */
1863	return NULL;
1864}
1865
1866/*
1867 * Allocate any page, the fastest way. Page number constraints only.
1868 */
1869psize_t
1870uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result,
1871    paddr_t start, paddr_t end, int memtype_only)
1872{
1873	struct	uvm_pmemrange *pmr;
1874	struct	vm_page *found, *splitpg;
1875	psize_t	fcount;
1876	int	memtype;
1877
1878	fcount = 0;
1879	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
1880		/* We're done. */
1881		if (fcount == count)
1882			break;
1883
1884		/* Outside requested range. */
1885		if (!(start == 0 && end == 0) &&
1886		    !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end))
1887			continue;
1888
1889		/* Range is empty. */
1890		if (pmr->nsegs == 0)
1891			continue;
1892
1893		/* Loop over all memtypes, starting at memtype_init. */
1894		memtype = memtype_init;
1895		while (fcount != count) {
1896			found = TAILQ_FIRST(&pmr->single[memtype]);
1897			/*
1898			 * If found is outside the range, walk the list
1899			 * until we find something that intersects with
1900			 * boundaries.
1901			 */
1902			while (found && !PMR_INTERSECTS_WITH(
1903			    atop(VM_PAGE_TO_PHYS(found)),
1904			    atop(VM_PAGE_TO_PHYS(found)) + 1,
1905			    start, end))
1906				found = TAILQ_NEXT(found, pageq);
1907
1908			if (found == NULL) {
1909				/*
1910				 * Check if the size tree contains a range
1911				 * that intersects with the boundaries. As the
1912				 * allocation is for any page, try the smallest
1913				 * range so that large ranges are preserved for
1914				 * more constrained cases. Only one entry is
1915				 * checked here, to avoid a brute-force search.
1916				 *
1917				 * Note that a size tree gives pg[1] instead of
1918				 * pg[0].
1919				 */
1920				found = RBT_MIN(uvm_pmr_size,
1921				    &pmr->size[memtype]);
1922				if (found != NULL) {
1923					found--;
1924					if (!PMR_INTERSECTS_WITH(
1925					    atop(VM_PAGE_TO_PHYS(found)),
1926					    atop(VM_PAGE_TO_PHYS(found)) +
1927					    found->fpgsz, start, end))
1928						found = NULL;
1929				}
1930			}
1931			if (found == NULL) {
1932				/*
1933				 * Try address-guided search to meet the page
1934				 * number constraints.
1935				 */
1936				found = RBT_ROOT(uvm_pmr_addr, &pmr->addr);
1937				if (found != NULL) {
1938					found = uvm_pmr_rootupdate(pmr, found,
1939					    start, end, memtype);
1940				}
1941			}
1942			if (found != NULL) {
1943				uvm_pmr_assertvalid(pmr);
1944				uvm_pmr_remove_size(pmr, found);
1945
1946				/*
1947				 * If the page intersects the end, then it'll
1948				 * need splitting.
1949				 *
1950				 * Note that we don't need to split if the page
1951				 * intersects start: the drain function will
1952				 * simply stop on hitting start.
1953				 */
1954				if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) +
1955				    found->fpgsz > end) {
1956					psize_t splitsz =
1957					    atop(VM_PAGE_TO_PHYS(found)) +
1958					    found->fpgsz - end;
1959
1960					uvm_pmr_remove_addr(pmr, found);
1961					uvm_pmr_assertvalid(pmr);
1962					found->fpgsz -= splitsz;
1963					splitpg = found + found->fpgsz;
1964					splitpg->fpgsz = splitsz;
1965					uvm_pmr_insert(pmr, splitpg, 1);
1966
1967					/*
1968					 * At this point, splitpg and found
1969					 * should actually be joined, but
1970					 * joining is explicitly suppressed
1971					 * here because we are about to
1972					 * start taking pages from found.
1973					 */
1974					KASSERT(start == 0 ||
1975					    atop(VM_PAGE_TO_PHYS(found)) +
1976					    found->fpgsz > start);
1977					uvm_pmr_insert_addr(pmr, found, 1);
1978				}
1979
1980				/*
1981				 * Fetch pages from the end.
1982				 * If the range is larger than the requested
1983				 * number of pages, this saves us an addr-tree
1984				 * update.
1985				 *
1986				 * Since we take from the end and insert at
1987				 * the head, ranges stay intact in the result.
1988				 */
1989				while (found->fpgsz > 0 && fcount < count &&
1990				    (start == 0 ||
1991				    atop(VM_PAGE_TO_PHYS(found)) +
1992				    found->fpgsz > start)) {
1993					found->fpgsz--;
1994					fcount++;
1995					TAILQ_INSERT_HEAD(result,
1996					    &found[found->fpgsz], pageq);
1997				}
1998				if (found->fpgsz > 0) {
1999					uvm_pmr_insert_size(pmr, found);
2000					KDASSERT(fcount == count);
2001					uvm_pmr_assertvalid(pmr);
2002					return fcount;
2003				}
2004
2005				/*
2006				 * Delayed addr-tree removal.
2007				 */
2008				uvm_pmr_remove_addr(pmr, found);
2009				uvm_pmr_assertvalid(pmr);
2010			} else {
2011				if (memtype_only)
2012					break;
2013				/*
2014				 * Skip to the next memtype.
2015				 */
2016				memtype += 1;
2017				if (memtype == UVM_PMR_MEMTYPE_MAX)
2018					memtype = 0;
2019				if (memtype == memtype_init)
2020					break;
2021			}
2022		}
2023	}
2024
2025	/*
2026	 * Search finished.
2027	 *
2028	 * Ran out of ranges before enough pages were gathered, or we hit the
2029	 * case where found->fpgsz == count - fcount, in which case the
2030	 * above exit condition didn't trigger.
2031	 *
2032	 * On failure, caller will free the pages.
2033	 */
2034	return fcount;
2035}
2036
2037#ifdef DDB
2038/*
2039 * Print information about the pmemranges.
2040 * Does not do locking (so either call it from DDB or acquire the fpageq
2041 * lock before invoking).
2042 */
2043void
2044uvm_pmr_print(void)
2045{
2046	struct	uvm_pmemrange *pmr;
2047	struct	vm_page *pg;
2048	psize_t	size[UVM_PMR_MEMTYPE_MAX];
2049	psize_t	free;
2050	int	useq_len;
2051	int	mt;
2052
2053	printf("Ranges, use queue:\n");
2054	useq_len = 0;
2055	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
2056		useq_len++;
2057		free = 0;
2058		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
2059			pg = RBT_MAX(uvm_pmr_size, &pmr->size[mt]);
2060			if (pg != NULL)
2061				pg--;
2062			else
2063				pg = TAILQ_FIRST(&pmr->single[mt]);
2064			size[mt] = (pg == NULL ? 0 : pg->fpgsz);
2065
2066			RBT_FOREACH(pg, uvm_pmr_addr, &pmr->addr)
2067				free += pg->fpgsz;
2068		}
2069
2070		printf("* [0x%lx-0x%lx] use=%d nsegs=%ld",
2071		    (unsigned long)pmr->low, (unsigned long)pmr->high,
2072		    pmr->use, (unsigned long)pmr->nsegs);
2073		for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) {
2074			printf(" maxsegsz[%d]=0x%lx", mt,
2075			    (unsigned long)size[mt]);
2076		}
2077		printf(" free=0x%lx\n", (unsigned long)free);
2078	}
2079	printf("#ranges = %d\n", useq_len);
2080}
2081#endif
2082
2083/*
2084 * uvm_wait_pla: wait (sleep) for the page daemon to free some pages
2085 * in a specific physmem area.
2086 *
2087 * Returns ENOMEM if the pagedaemon failed to free any pages.
2088 * If not failok, the request is retried until it succeeds.
2089 *
2090 * Must be called with fpageq locked.
2091 */
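/*
 * The request is queued on uvm.pmr_control.allocs as a struct uvm_pmalloc
 * with UVM_PMA_LINKED set.  Once memory in the constrained range has been
 * freed, uvm_wakeup_pla() marks the request UVM_PMA_FREED, unlinks it and
 * wakes the sleeper; failure is reported through UVM_PMA_FAIL.
 */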
2092int
2093uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok)
2094{
2095	struct uvm_pmalloc pma;
2096	const char *wmsg = "pmrwait";
2097
2098	if (curproc == uvm.pagedaemon_proc) {
2099		/*
2100		 * This is not that uncommon when the pagedaemon is trying
2101		 * to flush out a large mmapped file. VOP_WRITE will circle
2102		 * back through the buffer cache and try to get more memory.
2103		 * The pagedaemon starts by calling bufbackoff, but we can
2104		 * easily use up that reserve in a single scan iteration.
2105		 */
2106		uvm_unlock_fpageq();
2107		if (bufbackoff(NULL, atop(size)) == 0) {
2108			uvm_lock_fpageq();
2109			return 0;
2110		}
2111		uvm_lock_fpageq();
2112
2113		/*
2114		 * XXX detect pagedaemon deadlock - see comment in
2115		 * uvm_wait(), as this is exactly the same issue.
2116		 */
2117		printf("pagedaemon: wait_pla deadlock detected!\n");
2118		msleep_nsec(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg,
2119		    MSEC_TO_NSEC(125));
2120#if defined(DEBUG)
2121		/* DEBUG: panic so we can debug it */
2122		panic("wait_pla pagedaemon deadlock");
2123#endif
2124		return 0;
2125	}
2126
2127	for (;;) {
2128		pma.pm_constraint.ucr_low = low;
2129		pma.pm_constraint.ucr_high = high;
2130		pma.pm_size = size;
2131		pma.pm_flags = UVM_PMA_LINKED;
2132		TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq);
2133
2134		wakeup(&uvm.pagedaemon);		/* wake the daemon! */
2135		while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY))
2136			msleep_nsec(&pma, &uvm.fpageqlock, PVM, wmsg, INFSLP);
2137
2138		if (!(pma.pm_flags & UVM_PMA_FREED) &&
2139		    pma.pm_flags & UVM_PMA_FAIL) {
2140			if (failok)
2141				return ENOMEM;
2142			printf("uvm_wait_pla: failed to free %ld pages "
2143			    "between 0x%lx-0x%lx\n", atop(size), low, high);
2144		} else
2145			return 0;
2146	}
2147	/* UNREACHABLE */
2148}
2149
2150/*
2151 * Wake up uvm_pmalloc sleepers.
2152 */
2153void
2154uvm_wakeup_pla(paddr_t low, psize_t len)
2155{
2156	struct uvm_pmalloc *pma, *pma_next;
2157	paddr_t high;
2158
2159	high = low + len;
2160
2161	/* Wake specific allocations waiting for this memory. */
2162	for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL;
2163	    pma = pma_next) {
2164		pma_next = TAILQ_NEXT(pma, pmq);
2165
2166		if (low < pma->pm_constraint.ucr_high &&
2167		    high > pma->pm_constraint.ucr_low) {
2168			pma->pm_flags |= UVM_PMA_FREED;
2169			if (!(pma->pm_flags & UVM_PMA_BUSY)) {
2170				pma->pm_flags &= ~UVM_PMA_LINKED;
2171				TAILQ_REMOVE(&uvm.pmr_control.allocs, pma,
2172				    pmq);
2173				wakeup(pma);
2174			}
2175		}
2176	}
2177}
2178
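/*
 * Kernel thread that pre-zeroes free pages.
 *
 * While uvmexp.zeropages is below UVM_PAGEZERO_TARGET, pull batches of
 * 16 dirty pages off the free ranges, zero them, mark them PG_ZERO and
 * put them back as zeroed memory.
 */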
2179void
2180uvm_pagezero_thread(void *arg)
2181{
2182	struct pglist pgl;
2183	struct vm_page *pg;
2184	int count;
2185
2186	/* Run at the lowest possible priority. */
2187	curproc->p_p->ps_nice = NZERO + PRIO_MAX;
2188
2189	KERNEL_UNLOCK();
2190
2191	TAILQ_INIT(&pgl);
2192	for (;;) {
2193		uvm_lock_fpageq();
2194		while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET ||
2195		    (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY,
2196		     &pgl, 0, 0, 1)) == 0) {
2197			msleep_nsec(&uvmexp.zeropages, &uvm.fpageqlock,
2198			    MAXPRI, "pgzero", INFSLP);
2199		}
2200		uvm_unlock_fpageq();
2201
2202		TAILQ_FOREACH(pg, &pgl, pageq) {
2203			uvm_pagezero(pg);
2204			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
2205		}
2206
2207		uvm_lock_fpageq();
2208		while (!TAILQ_EMPTY(&pgl))
2209			uvm_pmr_remove_1strange(&pgl, 0, NULL, 0);
2210		uvmexp.zeropages += count;
2211		uvm_unlock_fpageq();
2212
2213		yield();
2214	}
2215}
2216
2217#if defined(MULTIPROCESSOR) && defined(__HAVE_UVM_PERCPU)
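/*
 * Per-CPU page cache: each CPU keeps two magazines of at most
 * UVM_PMR_CACHEMAGSZ single pages, so that single page allocations and
 * frees can usually be served at splvm() without taking the fpageq lock.
 *
 * Fill the given (empty) magazine with UVM_PMR_CACHEMAGSZ pages.
 * Returns 0 on success, -1 if the pages could not be allocated without
 * sleeping.
 */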
2218int
2219uvm_pmr_cache_alloc(struct uvm_pmr_cache_item *upci)
2220{
2221	struct vm_page *pg;
2222	struct pglist pgl;
2223	int flags = UVM_PLA_NOWAIT|UVM_PLA_NOWAKE;
2224	int npages = UVM_PMR_CACHEMAGSZ;
2225
2226	splassert(IPL_VM);
2227	KASSERT(upci->upci_npages == 0);
2228
2229	TAILQ_INIT(&pgl);
2230	if (uvm_pmr_getpages(npages, 0, 0, 1, 0, npages, flags, &pgl))
2231		return -1;
2232
2233	while ((pg = TAILQ_FIRST(&pgl)) != NULL) {
2234		TAILQ_REMOVE(&pgl, pg, pageq);
2235		upci->upci_pages[upci->upci_npages] = pg;
2236		upci->upci_npages++;
2237	}
2238	atomic_add_int(&uvmexp.percpucaches, npages);
2239
2240	return 0;
2241}
2242
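/*
 * Allocate a single page from the local CPU cache.
 *
 * If the active magazine is empty, switch to the other one, refilling it
 * first if it is empty too.  If no pages can be obtained without sleeping,
 * fall back to uvm_pmr_getone().  The page is zeroed here if UVM_PLA_ZERO
 * is requested.
 */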
2243struct vm_page *
2244uvm_pmr_cache_get(int flags)
2245{
2246	struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
2247	struct uvm_pmr_cache_item *upci;
2248	struct vm_page *pg;
2249	int s;
2250
2251	s = splvm();
2252	upci = &upc->upc_magz[upc->upc_actv];
2253	if (upci->upci_npages == 0) {
2254		unsigned int prev;
2255
2256		prev = (upc->upc_actv == 0) ?  1 : 0;
2257		upci = &upc->upc_magz[prev];
2258		if (upci->upci_npages == 0) {
2259			atomic_inc_int(&uvmexp.pcpmiss);
2260			if (uvm_pmr_cache_alloc(upci)) {
2261				splx(s);
2262				return uvm_pmr_getone(flags);
2263			}
2264		}
2265		/* Swap magazines */
2266		upc->upc_actv = prev;
2267	} else {
2268		atomic_inc_int(&uvmexp.pcphit);
2269	}
2270
2271	atomic_dec_int(&uvmexp.percpucaches);
2272	upci->upci_npages--;
2273	pg = upci->upci_pages[upci->upci_npages];
2274	splx(s);
2275
2276	if (flags & UVM_PLA_ZERO)
2277		uvm_pagezero(pg);
2278
2279	return pg;
2280}
2281
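/*
 * Give all pages held in a magazine back to the global allocator and
 * empty it.
 */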
2282void
2283uvm_pmr_cache_free(struct uvm_pmr_cache_item *upci)
2284{
2285	struct pglist pgl;
2286	unsigned int i;
2287
2288	splassert(IPL_VM);
2289
2290	TAILQ_INIT(&pgl);
2291	for (i = 0; i < upci->upci_npages; i++)
2292		TAILQ_INSERT_TAIL(&pgl, upci->upci_pages[i], pageq);
2293
2294	uvm_pmr_freepageq(&pgl);
2295
2296	atomic_sub_int(&uvmexp.percpucaches, upci->upci_npages);
2297	upci->upci_npages = 0;
2298	memset(upci->upci_pages, 0, sizeof(upci->upci_pages));
2299}
2300
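/*
 * Free a single page into the local CPU cache.
 *
 * If the active magazine is full, switch to the other one, flushing it
 * back to the global allocator first if it still holds pages.
 */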
2301void
2302uvm_pmr_cache_put(struct vm_page *pg)
2303{
2304	struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
2305	struct uvm_pmr_cache_item *upci;
2306	int s;
2307
2308	s = splvm();
2309	upci = &upc->upc_magz[upc->upc_actv];
2310	if (upci->upci_npages >= UVM_PMR_CACHEMAGSZ) {
2311		unsigned int prev;
2312
2313		prev = (upc->upc_actv == 0) ?  1 : 0;
2314		upci = &upc->upc_magz[prev];
2315		if (upci->upci_npages > 0)
2316			uvm_pmr_cache_free(upci);
2317
2318		/* Swap magazines */
2319		upc->upc_actv = prev;
2320		KASSERT(upci->upci_npages == 0);
2321	}
2322
2323	upci->upci_pages[upci->upci_npages] = pg;
2324	upci->upci_npages++;
2325	atomic_inc_int(&uvmexp.percpucaches);
2326	splx(s);
2327}
2328
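/*
 * Flush both magazines of the local CPU back to the global allocator.
 */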
2329void
2330uvm_pmr_cache_drain(void)
2331{
2332	struct uvm_pmr_cache *upc = &curcpu()->ci_uvm;
2333	int s;
2334
2335	s = splvm();
2336	uvm_pmr_cache_free(&upc->upc_magz[0]);
2337	uvm_pmr_cache_free(&upc->upc_magz[1]);
2338	splx(s);
2339}
2340
2341#else /* !(MULTIPROCESSOR && __HAVE_UVM_PERCPU) */
2342
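/*
 * Without per-CPU caches, go straight to the global allocator.
 */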
2343struct vm_page *
2344uvm_pmr_cache_get(int flags)
2345{
2346	return uvm_pmr_getone(flags);
2347}
2348
2349void
2350uvm_pmr_cache_put(struct vm_page *pg)
2351{
2352	uvm_pmr_freepages(pg, 1);
2353}
2354
2355void
2356uvm_pmr_cache_drain(void)
2357{
2358}
2359#endif
2360