1/* $OpenBSD: uvm_pmemrange.c,v 1.66 2024/05/01 12:54:27 mpi Exp $ */ 2 3/* 4 * Copyright (c) 2024 Martin Pieuchot <mpi@openbsd.org> 5 * Copyright (c) 2009, 2010 Ariane van der Steldt <ariane@stack.nl> 6 * 7 * Permission to use, copy, modify, and distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 */ 19 20#include <sys/param.h> 21#include <sys/systm.h> 22#include <uvm/uvm.h> 23#include <sys/malloc.h> 24#include <sys/kernel.h> 25#include <sys/proc.h> 26#include <sys/mount.h> 27 28/* 29 * 2 trees: addr tree and size tree. 30 * 31 * The allocator keeps chunks of free pages (called a range). 32 * Two pages are part of the same range if: 33 * - all pages in between are part of that range, 34 * - they are of the same memory type (zeroed or non-zeroed), 35 * - they are part of the same pmemrange. 36 * A pmemrange is a range of memory which is part of the same vm_physseg 37 * and has a use-count. 38 * 39 * addr tree is vm_page[0].objt 40 * size tree is vm_page[1].objt 41 * 42 * The size tree is not used for memory ranges of 1 page, instead, 43 * single queue is vm_page[0].pageq 44 * 45 * vm_page[0].fpgsz describes the length of a free range. Two adjacent ranges 46 * are joined, unless: 47 * - they have pages in between them which are not free 48 * - they belong to different memtypes (zeroed vs dirty memory) 49 * - they are in different pmemrange areas (ISA vs non-ISA memory for instance) 50 * - they are not a continuation of the same array 51 * The latter issue is caused by vm_physseg ordering and splitting from the 52 * MD initialization machinery. The MD code is dependant on freelists and 53 * happens to split ISA memory from non-ISA memory. 54 * (Note: freelists die die die!) 55 * 56 * uvm_page_init guarantees that every vm_physseg contains an array of 57 * struct vm_page. Also, uvm_page_physload allocates an array of struct 58 * vm_page. This code depends on that array. The array may break across 59 * vm_physsegs boundaries. 60 */ 61 62/* 63 * Validate the flags of the page. (Used in asserts.) 64 * Any free page must have the PQ_FREE flag set. 65 * Free pages may be zeroed. 66 * Pmap flags are left untouched. 67 * 68 * The PQ_FREE flag is not checked here: by not checking, we can easily use 69 * this check in pages which are freed. 70 */ 71#define VALID_FLAGS(pg_flags) \ 72 (((pg_flags) & ~(PQ_FREE|PG_ZERO|PG_PMAPMASK)) == 0x0) 73 74/* Tree comparators. */ 75int uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *, 76 const struct uvm_pmemrange *); 77int uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *); 78int uvm_pmr_pg_to_memtype(struct vm_page *); 79 80#ifdef DDB 81void uvm_pmr_print(void); 82#endif 83 84/* 85 * Memory types. The page flags are used to derive what the current memory 86 * type of a page is. 87 */ 88int 89uvm_pmr_pg_to_memtype(struct vm_page *pg) 90{ 91 if (pg->pg_flags & PG_ZERO) 92 return UVM_PMR_MEMTYPE_ZERO; 93 /* Default: dirty memory. */ 94 return UVM_PMR_MEMTYPE_DIRTY; 95} 96 97/* Trees. */ 98RBT_GENERATE(uvm_pmr_addr, vm_page, objt, uvm_pmr_addr_cmp); 99RBT_GENERATE(uvm_pmr_size, vm_page, objt, uvm_pmr_size_cmp); 100RBT_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr, 101 uvm_pmemrange_addr_cmp); 102 103/* Validation. */ 104#ifdef DEBUG 105void uvm_pmr_assertvalid(struct uvm_pmemrange *pmr); 106#else 107#define uvm_pmr_assertvalid(pmr) do {} while (0) 108#endif 109 110psize_t uvm_pmr_get1page(psize_t, int, struct pglist *, 111 paddr_t, paddr_t, int); 112 113struct uvm_pmemrange *uvm_pmr_allocpmr(void); 114struct vm_page *uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int); 115struct vm_page *uvm_pmr_nextsz(struct uvm_pmemrange *, 116 struct vm_page *, int); 117void uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, 118 struct vm_page *pg, struct vm_page **pg_prev, 119 struct vm_page **pg_next); 120struct vm_page *uvm_pmr_findnextsegment(struct uvm_pmemrange *, 121 struct vm_page *, paddr_t); 122struct vm_page *uvm_pmr_findprevsegment(struct uvm_pmemrange *, 123 struct vm_page *, paddr_t); 124psize_t uvm_pmr_remove_1strange(struct pglist *, paddr_t, 125 struct vm_page **, int); 126psize_t uvm_pmr_remove_1strange_reverse(struct pglist *, 127 paddr_t *); 128void uvm_pmr_split(paddr_t); 129struct uvm_pmemrange *uvm_pmemrange_find(paddr_t); 130struct uvm_pmemrange *uvm_pmemrange_use_insert(struct uvm_pmemrange_use *, 131 struct uvm_pmemrange *); 132psize_t pow2divide(psize_t, psize_t); 133struct vm_page *uvm_pmr_rootupdate(struct uvm_pmemrange *, 134 struct vm_page *, paddr_t, paddr_t, int); 135 136/* 137 * Computes num/denom and rounds it up to the next power-of-2. 138 * 139 * This is a division function which calculates an approximation of 140 * num/denom, with result =~ num/denom. It is meant to be fast and doesn't 141 * have to be accurate. 142 * 143 * Providing too large a value makes the allocator slightly faster, at the 144 * risk of hitting the failure case more often. Providing too small a value 145 * makes the allocator a bit slower, but less likely to hit a failure case. 146 */ 147psize_t 148pow2divide(psize_t num, psize_t denom) 149{ 150 int rshift; 151 152 for (rshift = 0; num > denom; rshift++, denom <<= 1) 153 ; 154 return (paddr_t)1 << rshift; 155} 156 157/* 158 * Predicate: lhs is a subrange or rhs. 159 * 160 * If rhs_low == 0: don't care about lower bound. 161 * If rhs_high == 0: don't care about upper bound. 162 */ 163#define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high) \ 164 (((rhs_low) == 0 || (lhs_low) >= (rhs_low)) && \ 165 ((rhs_high) == 0 || (lhs_high) <= (rhs_high))) 166 167/* 168 * Predicate: lhs intersects with rhs. 169 * 170 * If rhs_low == 0: don't care about lower bound. 171 * If rhs_high == 0: don't care about upper bound. 172 * Ranges don't intersect if they don't have any page in common, array 173 * semantics mean that < instead of <= should be used here. 174 */ 175#define PMR_INTERSECTS_WITH(lhs_low, lhs_high, rhs_low, rhs_high) \ 176 (((rhs_low) == 0 || (rhs_low) < (lhs_high)) && \ 177 ((rhs_high) == 0 || (lhs_low) < (rhs_high))) 178 179/* 180 * Align to power-of-2 alignment. 181 */ 182#define PMR_ALIGN(pgno, align) \ 183 (((pgno) + ((align) - 1)) & ~((align) - 1)) 184#define PMR_ALIGN_DOWN(pgno, align) \ 185 ((pgno) & ~((align) - 1)) 186 187 188/* 189 * Comparator: sort by address ascending. 190 */ 191int 192uvm_pmemrange_addr_cmp(const struct uvm_pmemrange *lhs, 193 const struct uvm_pmemrange *rhs) 194{ 195 return lhs->low < rhs->low ? -1 : lhs->low > rhs->low; 196} 197 198/* 199 * Comparator: sort by use ascending. 200 * 201 * The higher the use value of a range, the more devices need memory in 202 * this range. Therefore allocate from the range with the lowest use first. 203 */ 204int 205uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs) 206{ 207 int result; 208 209 result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use; 210 if (result == 0) 211 result = uvm_pmemrange_addr_cmp(lhs, rhs); 212 return result; 213} 214 215int 216uvm_pmr_addr_cmp(const struct vm_page *lhs, const struct vm_page *rhs) 217{ 218 paddr_t lhs_addr, rhs_addr; 219 220 lhs_addr = VM_PAGE_TO_PHYS(lhs); 221 rhs_addr = VM_PAGE_TO_PHYS(rhs); 222 223 return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr); 224} 225 226int 227uvm_pmr_size_cmp(const struct vm_page *lhs, const struct vm_page *rhs) 228{ 229 psize_t lhs_size, rhs_size; 230 int cmp; 231 232 /* Using second tree, so we receive pg[1] instead of pg[0]. */ 233 lhs_size = (lhs - 1)->fpgsz; 234 rhs_size = (rhs - 1)->fpgsz; 235 236 cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size); 237 if (cmp == 0) 238 cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1); 239 return cmp; 240} 241 242/* 243 * Find the first range of free pages that is at least sz pages long. 244 */ 245struct vm_page * 246uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti) 247{ 248 struct vm_page *node, *best; 249 250 KASSERT(sz >= 1); 251 252 if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti])) 253 return TAILQ_FIRST(&pmr->single[mti]); 254 255 node = RBT_ROOT(uvm_pmr_size, &pmr->size[mti]); 256 best = NULL; 257 while (node != NULL) { 258 if ((node - 1)->fpgsz >= sz) { 259 best = (node - 1); 260 node = RBT_LEFT(uvm_objtree, node); 261 } else 262 node = RBT_RIGHT(uvm_objtree, node); 263 } 264 return best; 265} 266 267/* 268 * Finds the next range. The next range has a size >= pg->fpgsz. 269 * Returns NULL if no more ranges are available. 270 */ 271struct vm_page * 272uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt) 273{ 274 struct vm_page *npg; 275 276 KASSERT(pmr != NULL && pg != NULL); 277 if (pg->fpgsz == 1) { 278 if (TAILQ_NEXT(pg, pageq) != NULL) 279 return TAILQ_NEXT(pg, pageq); 280 else 281 npg = RBT_MIN(uvm_pmr_size, &pmr->size[mt]); 282 } else 283 npg = RBT_NEXT(uvm_pmr_size, pg + 1); 284 285 return npg == NULL ? NULL : npg - 1; 286} 287 288/* 289 * Finds the previous and next ranges relative to the (uninserted) pg range. 290 * 291 * *pg_prev == NULL if no previous range is available, that can join with 292 * pg. 293 * *pg_next == NULL if no next range is available, that can join with 294 * pg. 295 */ 296void 297uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg, 298 struct vm_page **pg_prev, struct vm_page **pg_next) 299{ 300 KASSERT(pg_prev != NULL && pg_next != NULL); 301 302 *pg_next = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg); 303 if (*pg_next == NULL) 304 *pg_prev = RBT_MAX(uvm_pmr_addr, &pmr->addr); 305 else 306 *pg_prev = RBT_PREV(uvm_pmr_addr, *pg_next); 307 308 KDASSERT(*pg_next == NULL || 309 VM_PAGE_TO_PHYS(*pg_next) > VM_PAGE_TO_PHYS(pg)); 310 KDASSERT(*pg_prev == NULL || 311 VM_PAGE_TO_PHYS(*pg_prev) < VM_PAGE_TO_PHYS(pg)); 312 313 /* Reset if not contig. */ 314 if (*pg_prev != NULL && 315 (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fpgsz 316 != atop(VM_PAGE_TO_PHYS(pg)) || 317 *pg_prev + (*pg_prev)->fpgsz != pg || /* Array broke. */ 318 uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg))) 319 *pg_prev = NULL; 320 if (*pg_next != NULL && 321 (atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz 322 != atop(VM_PAGE_TO_PHYS(*pg_next)) || 323 pg + pg->fpgsz != *pg_next || /* Array broke. */ 324 uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg))) 325 *pg_next = NULL; 326 return; 327} 328 329/* 330 * Remove a range from the address tree. 331 * Address tree maintains pmr counters. 332 */ 333void 334uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg) 335{ 336 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); 337 KDASSERT(pg->pg_flags & PQ_FREE); 338 RBT_REMOVE(uvm_pmr_addr, &pmr->addr, pg); 339 340 pmr->nsegs--; 341} 342/* 343 * Remove a range from the size tree. 344 */ 345void 346uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg) 347{ 348 int memtype; 349#ifdef DEBUG 350 struct vm_page *i; 351#endif 352 353 KDASSERT(pg->fpgsz >= 1); 354 KDASSERT(pg->pg_flags & PQ_FREE); 355 memtype = uvm_pmr_pg_to_memtype(pg); 356 357 if (pg->fpgsz == 1) { 358#ifdef DEBUG 359 TAILQ_FOREACH(i, &pmr->single[memtype], pageq) { 360 if (i == pg) 361 break; 362 } 363 KDASSERT(i == pg); 364#endif 365 TAILQ_REMOVE(&pmr->single[memtype], pg, pageq); 366 } else { 367 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[memtype], 368 pg + 1) == pg + 1); 369 RBT_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1); 370 } 371} 372/* Remove from both trees. */ 373void 374uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg) 375{ 376 uvm_pmr_assertvalid(pmr); 377 uvm_pmr_remove_size(pmr, pg); 378 uvm_pmr_remove_addr(pmr, pg); 379 uvm_pmr_assertvalid(pmr); 380} 381 382/* 383 * Insert the range described in pg. 384 * Returns the range thus created (which may be joined with the previous and 385 * next ranges). 386 * If no_join, the caller guarantees that the range cannot possibly join 387 * with adjacent ranges. 388 */ 389struct vm_page * 390uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) 391{ 392 struct vm_page *prev, *next; 393 394#ifdef DEBUG 395 struct vm_page *i; 396 int mt; 397#endif 398 399 KDASSERT(pg->pg_flags & PQ_FREE); 400 KDASSERT(pg->fpgsz >= 1); 401 402#ifdef DEBUG 403 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 404 TAILQ_FOREACH(i, &pmr->single[mt], pageq) 405 KDASSERT(i != pg); 406 if (pg->fpgsz > 1) { 407 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mt], 408 pg + 1) == NULL); 409 } 410 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL); 411 } 412#endif 413 414 if (!no_join) { 415 uvm_pmr_pnaddr(pmr, pg, &prev, &next); 416 if (next != NULL) { 417 uvm_pmr_remove_size(pmr, next); 418 uvm_pmr_remove_addr(pmr, next); 419 pg->fpgsz += next->fpgsz; 420 next->fpgsz = 0; 421 } 422 if (prev != NULL) { 423 uvm_pmr_remove_size(pmr, prev); 424 prev->fpgsz += pg->fpgsz; 425 pg->fpgsz = 0; 426 return prev; 427 } 428 } 429 430 RBT_INSERT(uvm_pmr_addr, &pmr->addr, pg); 431 432 pmr->nsegs++; 433 434 return pg; 435} 436/* 437 * Insert the range described in pg. 438 * Returns the range thus created (which may be joined with the previous and 439 * next ranges). 440 * Page must already be in the address tree. 441 */ 442void 443uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg) 444{ 445 int memtype; 446#ifdef DEBUG 447 struct vm_page *i; 448 int mti; 449#endif 450 451 KDASSERT(pg->fpgsz >= 1); 452 KDASSERT(pg->pg_flags & PQ_FREE); 453 454 memtype = uvm_pmr_pg_to_memtype(pg); 455#ifdef DEBUG 456 for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) { 457 TAILQ_FOREACH(i, &pmr->single[mti], pageq) 458 KDASSERT(i != pg); 459 if (pg->fpgsz > 1) { 460 KDASSERT(RBT_FIND(uvm_pmr_size, &pmr->size[mti], 461 pg + 1) == NULL); 462 } 463 KDASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); 464 } 465 for (i = pg; i < pg + pg->fpgsz; i++) 466 KASSERT(uvm_pmr_pg_to_memtype(i) == memtype); 467#endif 468 469 if (pg->fpgsz == 1) 470 TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq); 471 else 472 RBT_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1); 473} 474/* Insert in both trees. */ 475struct vm_page * 476uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) 477{ 478 uvm_pmr_assertvalid(pmr); 479 pg = uvm_pmr_insert_addr(pmr, pg, no_join); 480 uvm_pmr_insert_size(pmr, pg); 481 uvm_pmr_assertvalid(pmr); 482 return pg; 483} 484 485/* 486 * Find the last page that is part of this segment. 487 * => pg: the range at which to start the search. 488 * => boundary: the page number boundary specification (0 = no boundary). 489 * => pmr: the pmemrange of the page. 490 * 491 * This function returns 1 before the next range, so if you want to have the 492 * next range, you need to run TAILQ_NEXT(result, pageq) after calling. 493 * The reason is that this way, the length of the segment is easily 494 * calculated using: atop(result) - atop(pg) + 1. 495 * Hence this function also never returns NULL. 496 */ 497struct vm_page * 498uvm_pmr_findnextsegment(struct uvm_pmemrange *pmr, 499 struct vm_page *pg, paddr_t boundary) 500{ 501 paddr_t first_boundary; 502 struct vm_page *next; 503 struct vm_page *prev; 504 505 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) && 506 pmr->high > atop(VM_PAGE_TO_PHYS(pg))); 507 if (boundary != 0) { 508 first_boundary = 509 PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary); 510 } else 511 first_boundary = 0; 512 513 /* 514 * Increase next until it hits the first page of the next segment. 515 * 516 * While loop checks the following: 517 * - next != NULL we have not reached the end of pgl 518 * - boundary == 0 || next < first_boundary 519 * we do not cross a boundary 520 * - atop(prev) + 1 == atop(next) 521 * still in the same segment 522 * - low <= last 523 * - high > last still in the same memory range 524 * - memtype is equal allocator is unable to view different memtypes 525 * as part of the same segment 526 * - prev + 1 == next no array breakage occurs 527 */ 528 prev = pg; 529 next = TAILQ_NEXT(prev, pageq); 530 while (next != NULL && 531 (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) < first_boundary) && 532 atop(VM_PAGE_TO_PHYS(prev)) + 1 == atop(VM_PAGE_TO_PHYS(next)) && 533 pmr->low <= atop(VM_PAGE_TO_PHYS(next)) && 534 pmr->high > atop(VM_PAGE_TO_PHYS(next)) && 535 uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) && 536 prev + 1 == next) { 537 prev = next; 538 next = TAILQ_NEXT(prev, pageq); 539 } 540 541 /* 542 * End of this segment. 543 */ 544 return prev; 545} 546 547/* 548 * Find the first page that is part of this segment. 549 * => pg: the range at which to start the search. 550 * => boundary: the page number boundary specification (0 = no boundary). 551 * => pmr: the pmemrange of the page. 552 * 553 * This function returns 1 after the previous range, so if you want to have the 554 * previous range, you need to run TAILQ_NEXT(result, pageq) after calling. 555 * The reason is that this way, the length of the segment is easily 556 * calculated using: atop(pg) - atop(result) + 1. 557 * Hence this function also never returns NULL. 558 */ 559struct vm_page * 560uvm_pmr_findprevsegment(struct uvm_pmemrange *pmr, 561 struct vm_page *pg, paddr_t boundary) 562{ 563 paddr_t first_boundary; 564 struct vm_page *next; 565 struct vm_page *prev; 566 567 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg)) && 568 pmr->high > atop(VM_PAGE_TO_PHYS(pg))); 569 if (boundary != 0) { 570 first_boundary = 571 PMR_ALIGN_DOWN(atop(VM_PAGE_TO_PHYS(pg)), boundary); 572 } else 573 first_boundary = 0; 574 575 /* 576 * Increase next until it hits the first page of the previous segment. 577 * 578 * While loop checks the following: 579 * - next != NULL we have not reached the end of pgl 580 * - boundary == 0 || next >= first_boundary 581 * we do not cross a boundary 582 * - atop(prev) - 1 == atop(next) 583 * still in the same segment 584 * - low <= last 585 * - high > last still in the same memory range 586 * - memtype is equal allocator is unable to view different memtypes 587 * as part of the same segment 588 * - prev - 1 == next no array breakage occurs 589 */ 590 prev = pg; 591 next = TAILQ_NEXT(prev, pageq); 592 while (next != NULL && 593 (boundary == 0 || atop(VM_PAGE_TO_PHYS(next)) >= first_boundary) && 594 atop(VM_PAGE_TO_PHYS(prev)) - 1 == atop(VM_PAGE_TO_PHYS(next)) && 595 pmr->low <= atop(VM_PAGE_TO_PHYS(next)) && 596 pmr->high > atop(VM_PAGE_TO_PHYS(next)) && 597 uvm_pmr_pg_to_memtype(prev) == uvm_pmr_pg_to_memtype(next) && 598 prev - 1 == next) { 599 prev = next; 600 next = TAILQ_NEXT(prev, pageq); 601 } 602 603 /* 604 * Start of this segment. 605 */ 606 return prev; 607} 608 609/* 610 * Remove the first segment of contiguous pages from pgl. 611 * A segment ends if it crosses boundary (unless boundary = 0) or 612 * if it would enter a different uvm_pmemrange. 613 * 614 * Work: the page range that the caller is currently working with. 615 * May be null. 616 * 617 * If is_desperate is non-zero, the smallest segment is erased. Otherwise, 618 * the first segment is erased (which, if called by uvm_pmr_getpages(), 619 * probably is the smallest or very close to it). 620 */ 621psize_t 622uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary, 623 struct vm_page **work, int is_desperate) 624{ 625 struct vm_page *start, *end, *iter, *iter_end, *inserted, *lowest; 626 psize_t count; 627 struct uvm_pmemrange *pmr, *pmr_iter; 628 629 KASSERT(!TAILQ_EMPTY(pgl)); 630 631 /* 632 * Initialize to first page. 633 * Unless desperate scan finds a better candidate, this is what'll be 634 * erased. 635 */ 636 start = TAILQ_FIRST(pgl); 637 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start))); 638 end = uvm_pmr_findnextsegment(pmr, start, boundary); 639 640 /* 641 * If we are desperate, we _really_ want to get rid of the smallest 642 * element (rather than a close match to the smallest element). 643 */ 644 if (is_desperate) { 645 /* Linear search for smallest segment. */ 646 pmr_iter = pmr; 647 for (iter = TAILQ_NEXT(end, pageq); 648 iter != NULL && start != end; 649 iter = TAILQ_NEXT(iter_end, pageq)) { 650 /* 651 * Only update pmr if it doesn't match current 652 * iteration. 653 */ 654 if (pmr->low > atop(VM_PAGE_TO_PHYS(iter)) || 655 pmr->high <= atop(VM_PAGE_TO_PHYS(iter))) { 656 pmr_iter = uvm_pmemrange_find(atop( 657 VM_PAGE_TO_PHYS(iter))); 658 } 659 660 iter_end = uvm_pmr_findnextsegment(pmr_iter, iter, 661 boundary); 662 663 /* 664 * Current iteration is smaller than best match so 665 * far; update. 666 */ 667 if (VM_PAGE_TO_PHYS(iter_end) - VM_PAGE_TO_PHYS(iter) < 668 VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) { 669 start = iter; 670 end = iter_end; 671 pmr = pmr_iter; 672 } 673 } 674 } 675 676 /* 677 * Calculate count and end of the list. 678 */ 679 count = atop(VM_PAGE_TO_PHYS(end) - VM_PAGE_TO_PHYS(start)) + 1; 680 lowest = start; 681 end = TAILQ_NEXT(end, pageq); 682 683 /* 684 * Actually remove the range of pages. 685 * 686 * Sadly, this cannot be done using pointer iteration: 687 * vm_physseg is not guaranteed to be sorted on address, hence 688 * uvm_page_init() may not have initialized its array sorted by 689 * page number. 690 */ 691 for (iter = start; iter != end; iter = iter_end) { 692 iter_end = TAILQ_NEXT(iter, pageq); 693 TAILQ_REMOVE(pgl, iter, pageq); 694 } 695 696 lowest->fpgsz = count; 697 inserted = uvm_pmr_insert(pmr, lowest, 0); 698 699 /* 700 * If the caller was working on a range and this function modified 701 * that range, update the pointer. 702 */ 703 if (work != NULL && *work != NULL && 704 atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) && 705 atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fpgsz > 706 atop(VM_PAGE_TO_PHYS(*work))) 707 *work = inserted; 708 return count; 709} 710 711/* 712 * Remove the first segment of contiguous pages from a pgl 713 * with the list elements in reverse order of physaddr. 714 * 715 * A segment ends if it would enter a different uvm_pmemrange. 716 * 717 * Stores starting physical address of the segment in pstart. 718 */ 719psize_t 720uvm_pmr_remove_1strange_reverse(struct pglist *pgl, paddr_t *pstart) 721{ 722 struct vm_page *start, *end, *iter, *iter_end, *lowest; 723 psize_t count; 724 struct uvm_pmemrange *pmr; 725 726 KASSERT(!TAILQ_EMPTY(pgl)); 727 728 start = TAILQ_FIRST(pgl); 729 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(start))); 730 end = uvm_pmr_findprevsegment(pmr, start, 0); 731 732 KASSERT(end <= start); 733 734 /* 735 * Calculate count and end of the list. 736 */ 737 count = atop(VM_PAGE_TO_PHYS(start) - VM_PAGE_TO_PHYS(end)) + 1; 738 lowest = end; 739 end = TAILQ_NEXT(end, pageq); 740 741 /* 742 * Actually remove the range of pages. 743 * 744 * Sadly, this cannot be done using pointer iteration: 745 * vm_physseg is not guaranteed to be sorted on address, hence 746 * uvm_page_init() may not have initialized its array sorted by 747 * page number. 748 */ 749 for (iter = start; iter != end; iter = iter_end) { 750 iter_end = TAILQ_NEXT(iter, pageq); 751 TAILQ_REMOVE(pgl, iter, pageq); 752 } 753 754 lowest->fpgsz = count; 755 (void) uvm_pmr_insert(pmr, lowest, 0); 756 757 *pstart = VM_PAGE_TO_PHYS(lowest); 758 return count; 759} 760 761/* 762 * Extract a number of pages from a segment of free pages. 763 * Called by uvm_pmr_getpages. 764 * 765 * Returns the segment that was created from pages left over at the tail 766 * of the remove set of pages, or NULL if no pages were left at the tail. 767 */ 768struct vm_page * 769uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg, 770 paddr_t start, paddr_t end, struct pglist *result) 771{ 772 struct vm_page *after, *pg_i; 773 psize_t before_sz, after_sz; 774#ifdef DEBUG 775 psize_t i; 776#endif 777 778 KDASSERT(end > start); 779 KDASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg))); 780 KDASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz); 781 KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start); 782 KDASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz >= end); 783 784 before_sz = start - atop(VM_PAGE_TO_PHYS(pg)); 785 after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fpgsz - end; 786 KDASSERT(before_sz + after_sz + (end - start) == pg->fpgsz); 787 uvm_pmr_assertvalid(pmr); 788 789 uvm_pmr_remove_size(pmr, pg); 790 if (before_sz == 0) 791 uvm_pmr_remove_addr(pmr, pg); 792 after = pg + before_sz + (end - start); 793 794 /* Add selected pages to result. */ 795 for (pg_i = pg + before_sz; pg_i != after; pg_i++) { 796 KASSERT(pg_i->pg_flags & PQ_FREE); 797 pg_i->fpgsz = 0; 798 TAILQ_INSERT_TAIL(result, pg_i, pageq); 799 } 800 801 /* Before handling. */ 802 if (before_sz > 0) { 803 pg->fpgsz = before_sz; 804 uvm_pmr_insert_size(pmr, pg); 805 } 806 807 /* After handling. */ 808 if (after_sz > 0) { 809#ifdef DEBUG 810 for (i = 0; i < after_sz; i++) { 811 KASSERT(!uvm_pmr_isfree(after + i)); 812 } 813#endif 814 KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end); 815 after->fpgsz = after_sz; 816 after = uvm_pmr_insert_addr(pmr, after, 1); 817 uvm_pmr_insert_size(pmr, after); 818 } 819 820 uvm_pmr_assertvalid(pmr); 821 return (after_sz > 0 ? after : NULL); 822} 823 824/* 825 * Indicate to the page daemon that a nowait call failed and it should 826 * recover at least some memory in the most restricted region (assumed 827 * to be dma_constraint). 828 */ 829extern volatile int uvm_nowait_failed; 830 831/* 832 * Acquire a number of pages. 833 * 834 * count: the number of pages returned 835 * start: lowest page number 836 * end: highest page number +1 837 * (start = end = 0: no limitation) 838 * align: power-of-2 alignment constraint (align = 1: no alignment) 839 * boundary: power-of-2 boundary (boundary = 0: no boundary) 840 * maxseg: maximum number of segments to return 841 * flags: UVM_PLA_* flags 842 * result: returned pages storage (uses pageq) 843 */ 844int 845uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align, 846 paddr_t boundary, int maxseg, int flags, struct pglist *result) 847{ 848 struct uvm_pmemrange *pmr; /* Iterate memory ranges. */ 849 struct vm_page *found, *f_next; /* Iterate chunks. */ 850 psize_t fcount; /* Current found pages. */ 851 int fnsegs; /* Current segment counter. */ 852 int try, start_try; 853 psize_t search[3]; 854 paddr_t fstart, fend; /* Pages to be taken from found. */ 855 int memtype; /* Requested memtype. */ 856 int memtype_init; /* Best memtype. */ 857 int desperate; /* True if allocation failed. */ 858#ifdef DIAGNOSTIC 859 struct vm_page *diag_prev; /* Used during validation. */ 860#endif /* DIAGNOSTIC */ 861 862 /* 863 * Validate arguments. 864 */ 865 KASSERT(count > 0); 866 KASSERT(start == 0 || end == 0 || start < end); 867 KASSERT(align >= 1); 868 KASSERT(powerof2(align)); 869 KASSERT(maxseg > 0); 870 KASSERT(boundary == 0 || powerof2(boundary)); 871 KASSERT(boundary == 0 || maxseg * boundary >= count); 872 KASSERT(TAILQ_EMPTY(result)); 873 KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT)); 874 875 /* 876 * TRYCONTIG is a noop if you only want a single segment. 877 * Remove it if that's the case: otherwise it'll deny the fast 878 * allocation. 879 */ 880 if (maxseg == 1 || count == 1) 881 flags &= ~UVM_PLA_TRYCONTIG; 882 883 /* 884 * Configure search. 885 * 886 * search[0] is one segment, only used in UVM_PLA_TRYCONTIG case. 887 * search[1] is multiple segments, chosen to fulfill the search in 888 * approximately even-sized segments. 889 * This is a good trade-off between slightly reduced allocation speed 890 * and less fragmentation. 891 * search[2] is the worst case, in which all segments are evaluated. 892 * This provides the least fragmentation, but makes the search 893 * possibly longer (although in the case it is selected, that no 894 * longer matters most). 895 * 896 * The exception is when maxseg == 1: since we can only fulfill that 897 * with one segment of size pages, only a single search type has to 898 * be attempted. 899 */ 900 if (maxseg == 1 || count == 1) { 901 start_try = 2; 902 search[2] = count; 903 } else if (maxseg >= count && (flags & UVM_PLA_TRYCONTIG) == 0) { 904 start_try = 2; 905 search[2] = 1; 906 } else { 907 start_try = 0; 908 search[0] = count; 909 search[1] = pow2divide(count, maxseg); 910 search[2] = 1; 911 if ((flags & UVM_PLA_TRYCONTIG) == 0) 912 start_try = 1; 913 if (search[1] >= search[0]) { 914 search[1] = search[0]; 915 start_try = 1; 916 } 917 if (search[2] >= search[start_try]) { 918 start_try = 2; 919 } 920 } 921 922 /* 923 * Memory type: if zeroed memory is requested, traverse the zero set. 924 * Otherwise, traverse the dirty set. 925 * 926 * The memtype iterator is reinitialized to memtype_init on entrance 927 * of a pmemrange. 928 */ 929 if (flags & UVM_PLA_ZERO) 930 memtype_init = UVM_PMR_MEMTYPE_ZERO; 931 else 932 memtype_init = UVM_PMR_MEMTYPE_DIRTY; 933 934 /* 935 * Initially, we're not desperate. 936 * 937 * Note that if we return from a sleep, we are still desperate. 938 * Chances are that memory pressure is still high, so resetting 939 * seems over-optimistic to me. 940 */ 941 desperate = 0; 942 943again: 944 uvm_lock_fpageq(); 945 946 /* 947 * check to see if we need to generate some free pages waking 948 * the pagedaemon. 949 */ 950 if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freemin || 951 ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg && 952 (uvmexp.inactive + BUFPAGES_INACT) < uvmexp.inactarg)) 953 wakeup(&uvm.pagedaemon); 954 955 /* 956 * fail if any of these conditions is true: 957 * [1] there really are no free pages, or 958 * [2] only kernel "reserved" pages remain and 959 * the UVM_PLA_USERESERVE flag wasn't used. 960 * [3] only pagedaemon "reserved" pages remain and 961 * the requestor isn't the pagedaemon nor the syncer. 962 */ 963 if ((uvmexp.free <= (uvmexp.reserve_kernel + count)) && 964 !(flags & UVM_PLA_USERESERVE)) { 965 uvm_unlock_fpageq(); 966 return ENOMEM; 967 } 968 969 if ((uvmexp.free <= (uvmexp.reserve_pagedaemon + count)) && 970 (curproc != uvm.pagedaemon_proc) && (curproc != syncerproc)) { 971 uvm_unlock_fpageq(); 972 if (flags & UVM_PLA_WAITOK) { 973 uvm_wait("uvm_pmr_getpages"); 974 goto again; 975 } 976 return ENOMEM; 977 } 978 979retry: /* Return point after sleeping. */ 980 fcount = 0; 981 fnsegs = 0; 982 983retry_desperate: 984 /* 985 * If we just want any page(s), go for the really fast option. 986 */ 987 if (count <= maxseg && align == 1 && boundary == 0 && 988 (flags & UVM_PLA_TRYCONTIG) == 0) { 989 fcount += uvm_pmr_get1page(count - fcount, memtype_init, 990 result, start, end, 0); 991 992 /* 993 * If we found sufficient pages, go to the success exit code. 994 * 995 * Otherwise, go immediately to fail, since we collected 996 * all we could anyway. 997 */ 998 if (fcount == count) 999 goto out; 1000 else 1001 goto fail; 1002 } 1003 1004 /* 1005 * The heart of the contig case. 1006 * 1007 * The code actually looks like this: 1008 * 1009 * foreach (struct pmemrange) { 1010 * foreach (memtype) { 1011 * foreach(try) { 1012 * foreach (free range of memtype in pmemrange, 1013 * starting at search[try]) { 1014 * while (range has space left) 1015 * take from range 1016 * } 1017 * } 1018 * } 1019 * 1020 * if next pmemrange has higher usecount than current: 1021 * enter desperate case (which will drain the pmemranges 1022 * until empty prior to moving to the next one) 1023 * } 1024 * 1025 * When desperate is activated, try always starts at the highest 1026 * value. The memtype loop is using a goto ReScanMemtype. 1027 * The try loop is using a goto ReScan. 1028 * The 'range has space left' loop uses label DrainFound. 1029 * 1030 * Writing them all as loops would take up a lot of screen space in 1031 * the form of indentation and some parts are easier to express 1032 * using the labels. 1033 */ 1034 1035 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 1036 /* Empty range. */ 1037 if (pmr->nsegs == 0) 1038 continue; 1039 1040 /* Outside requested range. */ 1041 if (!PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end)) 1042 continue; 1043 1044 memtype = memtype_init; 1045 1046rescan_memtype: /* Return point at memtype++. */ 1047 try = start_try; 1048 1049rescan: /* Return point at try++. */ 1050 for (found = uvm_pmr_nfindsz(pmr, search[try], memtype); 1051 found != NULL; 1052 found = f_next) { 1053 f_next = uvm_pmr_nextsz(pmr, found, memtype); 1054 1055 fstart = atop(VM_PAGE_TO_PHYS(found)); 1056 if (start != 0) 1057 fstart = MAX(start, fstart); 1058drain_found: 1059 /* 1060 * Throw away the first segment if fnsegs == maxseg 1061 * 1062 * Note that f_next is still valid after this call, 1063 * since we only allocated from entries before f_next. 1064 * We don't revisit the entries we already extracted 1065 * from unless we entered the desperate case. 1066 */ 1067 if (fnsegs == maxseg) { 1068 fnsegs--; 1069 fcount -= 1070 uvm_pmr_remove_1strange(result, boundary, 1071 &found, desperate); 1072 } 1073 1074 fstart = PMR_ALIGN(fstart, align); 1075 fend = atop(VM_PAGE_TO_PHYS(found)) + found->fpgsz; 1076 if (end != 0) 1077 fend = MIN(end, fend); 1078 if (boundary != 0) { 1079 fend = 1080 MIN(fend, PMR_ALIGN(fstart + 1, boundary)); 1081 } 1082 if (fstart >= fend) 1083 continue; 1084 if (fend - fstart > count - fcount) 1085 fend = fstart + (count - fcount); 1086 1087 fcount += fend - fstart; 1088 fnsegs++; 1089 found = uvm_pmr_extract_range(pmr, found, 1090 fstart, fend, result); 1091 1092 if (fcount == count) 1093 goto out; 1094 1095 /* 1096 * If there's still space left in found, try to 1097 * fully drain it prior to continuing. 1098 */ 1099 if (found != NULL) { 1100 fstart = fend; 1101 goto drain_found; 1102 } 1103 } 1104 1105 /* Try a smaller search now. */ 1106 if (++try < nitems(search)) 1107 goto rescan; 1108 1109 /* 1110 * Exhaust all memory types prior to going to the next memory 1111 * segment. 1112 * This means that zero-vs-dirty are eaten prior to moving 1113 * to a pmemrange with a higher use-count. 1114 * 1115 * Code is basically a difficult way of writing: 1116 * memtype = memtype_init; 1117 * do { 1118 * ...; 1119 * memtype += 1; 1120 * memtype %= MEMTYPE_MAX; 1121 * } while (memtype != memtype_init); 1122 */ 1123 memtype += 1; 1124 if (memtype == UVM_PMR_MEMTYPE_MAX) 1125 memtype = 0; 1126 if (memtype != memtype_init) 1127 goto rescan_memtype; 1128 1129 /* 1130 * If not desperate, enter desperate case prior to eating all 1131 * the good stuff in the next range. 1132 */ 1133 if (!desperate && TAILQ_NEXT(pmr, pmr_use) != NULL && 1134 TAILQ_NEXT(pmr, pmr_use)->use != pmr->use) 1135 break; 1136 } 1137 1138 /* 1139 * Not enough memory of the requested type available. Fall back to 1140 * less good memory that we'll clean up better later. 1141 * 1142 * This algorithm is not very smart though, it just starts scanning 1143 * a different typed range, but the nicer ranges of the previous 1144 * iteration may fall out. Hence there is a small chance of a false 1145 * negative. 1146 * 1147 * When desperate: scan all sizes starting at the smallest 1148 * (start_try = 1) and do not consider UVM_PLA_TRYCONTIG (which may 1149 * allow us to hit the fast path now). 1150 * 1151 * Also, because we will revisit entries we scanned before, we need 1152 * to reset the page queue, or we may end up releasing entries in 1153 * such a way as to invalidate f_next. 1154 */ 1155 if (!desperate) { 1156 desperate = 1; 1157 start_try = nitems(search) - 1; 1158 flags &= ~UVM_PLA_TRYCONTIG; 1159 1160 while (!TAILQ_EMPTY(result)) 1161 uvm_pmr_remove_1strange(result, 0, NULL, 0); 1162 fnsegs = 0; 1163 fcount = 0; 1164 goto retry_desperate; 1165 } 1166 1167fail: 1168 /* Allocation failed. */ 1169 /* XXX: claim from memory reserve here */ 1170 1171 while (!TAILQ_EMPTY(result)) 1172 uvm_pmr_remove_1strange(result, 0, NULL, 0); 1173 1174 if (flags & UVM_PLA_WAITOK) { 1175 if (uvm_wait_pla(ptoa(start), ptoa(end) - 1, ptoa(count), 1176 flags & UVM_PLA_FAILOK) == 0) 1177 goto retry; 1178 KASSERT(flags & UVM_PLA_FAILOK); 1179 } else { 1180 if (!(flags & UVM_PLA_NOWAKE)) { 1181 uvm_nowait_failed = 1; 1182 wakeup(&uvm.pagedaemon); 1183 } 1184 } 1185 uvm_unlock_fpageq(); 1186 1187 return ENOMEM; 1188 1189out: 1190 /* Allocation successful. */ 1191 uvmexp.free -= fcount; 1192 1193 uvm_unlock_fpageq(); 1194 1195 /* Update statistics and zero pages if UVM_PLA_ZERO. */ 1196#ifdef DIAGNOSTIC 1197 fnsegs = 0; 1198 fcount = 0; 1199 diag_prev = NULL; 1200#endif /* DIAGNOSTIC */ 1201 TAILQ_FOREACH(found, result, pageq) { 1202 atomic_clearbits_int(&found->pg_flags, PG_PMAPMASK); 1203 1204 if (found->pg_flags & PG_ZERO) { 1205 uvm_lock_fpageq(); 1206 uvmexp.zeropages--; 1207 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1208 wakeup(&uvmexp.zeropages); 1209 uvm_unlock_fpageq(); 1210 } 1211 if (flags & UVM_PLA_ZERO) { 1212 if (found->pg_flags & PG_ZERO) 1213 uvmexp.pga_zerohit++; 1214 else { 1215 uvmexp.pga_zeromiss++; 1216 uvm_pagezero(found); 1217 } 1218 } 1219 atomic_clearbits_int(&found->pg_flags, PG_ZERO|PQ_FREE); 1220 1221 found->uobject = NULL; 1222 found->uanon = NULL; 1223 found->pg_version++; 1224 1225 /* 1226 * Validate that the page matches range criterium. 1227 */ 1228 KDASSERT(start == 0 || atop(VM_PAGE_TO_PHYS(found)) >= start); 1229 KDASSERT(end == 0 || atop(VM_PAGE_TO_PHYS(found)) < end); 1230 1231#ifdef DIAGNOSTIC 1232 /* 1233 * Update fcount (# found pages) and 1234 * fnsegs (# found segments) counters. 1235 */ 1236 if (diag_prev == NULL || 1237 /* new segment if it contains a hole */ 1238 atop(VM_PAGE_TO_PHYS(diag_prev)) + 1 != 1239 atop(VM_PAGE_TO_PHYS(found)) || 1240 /* new segment if it crosses boundary */ 1241 (atop(VM_PAGE_TO_PHYS(diag_prev)) & ~(boundary - 1)) != 1242 (atop(VM_PAGE_TO_PHYS(found)) & ~(boundary - 1))) 1243 fnsegs++; 1244 fcount++; 1245 1246 diag_prev = found; 1247#endif /* DIAGNOSTIC */ 1248 } 1249 1250#ifdef DIAGNOSTIC 1251 /* 1252 * Panic on algorithm failure. 1253 */ 1254 if (fcount != count || fnsegs > maxseg) { 1255 panic("pmemrange allocation error: " 1256 "allocated %ld pages in %d segments, " 1257 "but request was %ld pages in %d segments", 1258 fcount, fnsegs, count, maxseg); 1259 } 1260#endif /* DIAGNOSTIC */ 1261 1262 return 0; 1263} 1264 1265/* 1266 * Acquire a single page. 1267 * 1268 * flags: UVM_PLA_* flags 1269 * result: returned page. 1270 */ 1271struct vm_page * 1272uvm_pmr_getone(int flags) 1273{ 1274 struct vm_page *pg; 1275 struct pglist pgl; 1276 1277 TAILQ_INIT(&pgl); 1278 if (uvm_pmr_getpages(1, 0, 0, 1, 0, 1, flags, &pgl) != 0) 1279 return NULL; 1280 1281 pg = TAILQ_FIRST(&pgl); 1282 KASSERT(pg != NULL && TAILQ_NEXT(pg, pageq) == NULL); 1283 1284 return pg; 1285} 1286 1287/* 1288 * Free a number of contig pages (invoked by uvm_page_init). 1289 */ 1290void 1291uvm_pmr_freepages(struct vm_page *pg, psize_t count) 1292{ 1293 struct uvm_pmemrange *pmr; 1294 psize_t i, pmr_count; 1295 struct vm_page *firstpg = pg; 1296 1297 for (i = 0; i < count; i++) { 1298 KASSERT(atop(VM_PAGE_TO_PHYS(&pg[i])) == 1299 atop(VM_PAGE_TO_PHYS(pg)) + i); 1300 1301 if (!((pg[i].pg_flags & PQ_FREE) == 0 && 1302 VALID_FLAGS(pg[i].pg_flags))) { 1303 printf("Flags: 0x%x, will panic now.\n", 1304 pg[i].pg_flags); 1305 } 1306 KASSERT((pg[i].pg_flags & PQ_FREE) == 0 && 1307 VALID_FLAGS(pg[i].pg_flags)); 1308 atomic_setbits_int(&pg[i].pg_flags, PQ_FREE); 1309 atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO); 1310 } 1311 1312 uvm_lock_fpageq(); 1313 1314 for (i = count; i > 0; i -= pmr_count) { 1315 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); 1316 KASSERT(pmr != NULL); 1317 1318 pmr_count = MIN(i, pmr->high - atop(VM_PAGE_TO_PHYS(pg))); 1319 pg->fpgsz = pmr_count; 1320 uvm_pmr_insert(pmr, pg, 0); 1321 1322 uvmexp.free += pmr_count; 1323 pg += pmr_count; 1324 } 1325 wakeup(&uvmexp.free); 1326 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1327 wakeup(&uvmexp.zeropages); 1328 1329 uvm_wakeup_pla(VM_PAGE_TO_PHYS(firstpg), ptoa(count)); 1330 1331 uvm_unlock_fpageq(); 1332} 1333 1334/* 1335 * Free all pages in the queue. 1336 */ 1337void 1338uvm_pmr_freepageq(struct pglist *pgl) 1339{ 1340 struct vm_page *pg; 1341 paddr_t pstart; 1342 psize_t plen; 1343 1344 TAILQ_FOREACH(pg, pgl, pageq) { 1345 if (!((pg->pg_flags & PQ_FREE) == 0 && 1346 VALID_FLAGS(pg->pg_flags))) { 1347 printf("Flags: 0x%x, will panic now.\n", 1348 pg->pg_flags); 1349 } 1350 KASSERT((pg->pg_flags & PQ_FREE) == 0 && 1351 VALID_FLAGS(pg->pg_flags)); 1352 atomic_setbits_int(&pg->pg_flags, PQ_FREE); 1353 atomic_clearbits_int(&pg->pg_flags, PG_ZERO); 1354 } 1355 1356 uvm_lock_fpageq(); 1357 while (!TAILQ_EMPTY(pgl)) { 1358 pg = TAILQ_FIRST(pgl); 1359 if (pg == TAILQ_NEXT(pg, pageq) + 1) { 1360 /* 1361 * If pg is one behind the position of the 1362 * next page in the list in the page array, 1363 * try going backwards instead of forward. 1364 */ 1365 plen = uvm_pmr_remove_1strange_reverse(pgl, &pstart); 1366 } else { 1367 pstart = VM_PAGE_TO_PHYS(TAILQ_FIRST(pgl)); 1368 plen = uvm_pmr_remove_1strange(pgl, 0, NULL, 0); 1369 } 1370 uvmexp.free += plen; 1371 1372 uvm_wakeup_pla(pstart, ptoa(plen)); 1373 } 1374 wakeup(&uvmexp.free); 1375 if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) 1376 wakeup(&uvmexp.zeropages); 1377 uvm_unlock_fpageq(); 1378 1379 return; 1380} 1381 1382/* 1383 * Store a pmemrange in the list. 1384 * 1385 * The list is sorted by use. 1386 */ 1387struct uvm_pmemrange * 1388uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq, 1389 struct uvm_pmemrange *pmr) 1390{ 1391 struct uvm_pmemrange *iter; 1392 int cmp = 1; 1393 1394 TAILQ_FOREACH(iter, useq, pmr_use) { 1395 cmp = uvm_pmemrange_use_cmp(pmr, iter); 1396 if (cmp == 0) 1397 return iter; 1398 if (cmp == -1) 1399 break; 1400 } 1401 1402 if (iter == NULL) 1403 TAILQ_INSERT_TAIL(useq, pmr, pmr_use); 1404 else 1405 TAILQ_INSERT_BEFORE(iter, pmr, pmr_use); 1406 return NULL; 1407} 1408 1409#ifdef DEBUG 1410/* 1411 * Validation of the whole pmemrange. 1412 * Called with fpageq locked. 1413 */ 1414void 1415uvm_pmr_assertvalid(struct uvm_pmemrange *pmr) 1416{ 1417 struct vm_page *prev, *next, *i, *xref; 1418 int lcv, mti; 1419 1420 /* Empty range */ 1421 if (pmr->nsegs == 0) 1422 return; 1423 1424 /* Validate address tree. */ 1425 RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) { 1426 /* Validate the range. */ 1427 KASSERT(i->fpgsz > 0); 1428 KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low); 1429 KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fpgsz 1430 <= pmr->high); 1431 1432 /* Validate each page in this range. */ 1433 for (lcv = 0; lcv < i->fpgsz; lcv++) { 1434 /* 1435 * Only the first page has a size specification. 1436 * Rest is size 0. 1437 */ 1438 KASSERT(lcv == 0 || i[lcv].fpgsz == 0); 1439 /* 1440 * Flag check. 1441 */ 1442 KASSERT(VALID_FLAGS(i[lcv].pg_flags) && 1443 (i[lcv].pg_flags & PQ_FREE) == PQ_FREE); 1444 /* 1445 * Free pages are: 1446 * - not wired 1447 * - have no vm_anon 1448 * - have no uvm_object 1449 */ 1450 KASSERT(i[lcv].wire_count == 0); 1451 KASSERT(i[lcv].uanon == (void*)0xdeadbeef || 1452 i[lcv].uanon == NULL); 1453 KASSERT(i[lcv].uobject == (void*)0xdeadbeef || 1454 i[lcv].uobject == NULL); 1455 /* 1456 * Pages in a single range always have the same 1457 * memtype. 1458 */ 1459 KASSERT(uvm_pmr_pg_to_memtype(&i[0]) == 1460 uvm_pmr_pg_to_memtype(&i[lcv])); 1461 } 1462 1463 /* Check that it shouldn't be joined with its predecessor. */ 1464 prev = RBT_PREV(uvm_pmr_addr, i); 1465 if (prev != NULL) { 1466 KASSERT(uvm_pmr_pg_to_memtype(i) != 1467 uvm_pmr_pg_to_memtype(prev) || 1468 atop(VM_PAGE_TO_PHYS(i)) > 1469 atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz || 1470 prev + prev->fpgsz != i); 1471 } 1472 1473 /* Assert i is in the size tree as well. */ 1474 if (i->fpgsz == 1) { 1475 TAILQ_FOREACH(xref, 1476 &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) { 1477 if (xref == i) 1478 break; 1479 } 1480 KASSERT(xref == i); 1481 } else { 1482 KASSERT(RBT_FIND(uvm_pmr_size, 1483 &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) == 1484 i + 1); 1485 } 1486 } 1487 1488 /* Validate size tree. */ 1489 for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) { 1490 for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) { 1491 next = uvm_pmr_nextsz(pmr, i, mti); 1492 if (next != NULL) { 1493 KASSERT(i->fpgsz <= 1494 next->fpgsz); 1495 } 1496 1497 /* Assert i is in the addr tree as well. */ 1498 KASSERT(RBT_FIND(uvm_pmr_addr, &pmr->addr, i) == i); 1499 1500 /* Assert i is of the correct memory type. */ 1501 KASSERT(uvm_pmr_pg_to_memtype(i) == mti); 1502 } 1503 } 1504 1505 /* Validate nsegs statistic. */ 1506 lcv = 0; 1507 RBT_FOREACH(i, uvm_pmr_addr, &pmr->addr) 1508 lcv++; 1509 KASSERT(pmr->nsegs == lcv); 1510} 1511#endif /* DEBUG */ 1512 1513/* 1514 * Split pmr at split point pageno. 1515 * Called with fpageq unlocked. 1516 * 1517 * Split is only applied if a pmemrange spans pageno. 1518 */ 1519void 1520uvm_pmr_split(paddr_t pageno) 1521{ 1522 struct uvm_pmemrange *pmr, *drain; 1523 struct vm_page *rebuild, *prev, *next; 1524 psize_t prev_sz; 1525 1526 uvm_lock_fpageq(); 1527 pmr = uvm_pmemrange_find(pageno); 1528 if (pmr == NULL || !(pmr->low < pageno)) { 1529 /* No split required. */ 1530 uvm_unlock_fpageq(); 1531 return; 1532 } 1533 1534 KASSERT(pmr->low < pageno); 1535 KASSERT(pmr->high > pageno); 1536 1537 /* 1538 * uvm_pmr_allocpmr() calls into malloc() which in turn calls into 1539 * uvm_kmemalloc which calls into pmemrange, making the locking 1540 * a bit hard, so we just race! 1541 */ 1542 uvm_unlock_fpageq(); 1543 drain = uvm_pmr_allocpmr(); 1544 uvm_lock_fpageq(); 1545 pmr = uvm_pmemrange_find(pageno); 1546 if (pmr == NULL || !(pmr->low < pageno)) { 1547 /* 1548 * We lost the race since someone else ran this or a related 1549 * function, however this should be triggered very rarely so 1550 * we just leak the pmr. 1551 */ 1552 printf("uvm_pmr_split: lost one pmr\n"); 1553 uvm_unlock_fpageq(); 1554 return; 1555 } 1556 1557 drain->low = pageno; 1558 drain->high = pmr->high; 1559 drain->use = pmr->use; 1560 1561 uvm_pmr_assertvalid(pmr); 1562 uvm_pmr_assertvalid(drain); 1563 KASSERT(drain->nsegs == 0); 1564 1565 RBT_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) { 1566 if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno) 1567 break; 1568 } 1569 if (rebuild == NULL) 1570 prev = RBT_MAX(uvm_pmr_addr, &pmr->addr); 1571 else 1572 prev = RBT_PREV(uvm_pmr_addr, rebuild); 1573 KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno); 1574 1575 /* 1576 * Handle free chunk that spans the split point. 1577 */ 1578 if (prev != NULL && 1579 atop(VM_PAGE_TO_PHYS(prev)) + prev->fpgsz > pageno) { 1580 psize_t before, after; 1581 1582 KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno); 1583 1584 uvm_pmr_remove(pmr, prev); 1585 prev_sz = prev->fpgsz; 1586 before = pageno - atop(VM_PAGE_TO_PHYS(prev)); 1587 after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno; 1588 1589 KASSERT(before > 0); 1590 KASSERT(after > 0); 1591 1592 prev->fpgsz = before; 1593 uvm_pmr_insert(pmr, prev, 1); 1594 (prev + before)->fpgsz = after; 1595 uvm_pmr_insert(drain, prev + before, 1); 1596 } 1597 1598 /* Move free chunks that no longer fall in the range. */ 1599 for (; rebuild != NULL; rebuild = next) { 1600 next = RBT_NEXT(uvm_pmr_addr, rebuild); 1601 1602 uvm_pmr_remove(pmr, rebuild); 1603 uvm_pmr_insert(drain, rebuild, 1); 1604 } 1605 1606 pmr->high = pageno; 1607 uvm_pmr_assertvalid(pmr); 1608 uvm_pmr_assertvalid(drain); 1609 1610 RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain); 1611 uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain); 1612 uvm_unlock_fpageq(); 1613} 1614 1615/* 1616 * Increase the usage counter for the given range of memory. 1617 * 1618 * The more usage counters a given range of memory has, the more will be 1619 * attempted not to allocate from it. 1620 * 1621 * Addresses here are in paddr_t, not page-numbers. 1622 * The lowest and highest allowed address are specified. 1623 */ 1624void 1625uvm_pmr_use_inc(paddr_t low, paddr_t high) 1626{ 1627 struct uvm_pmemrange *pmr; 1628 paddr_t sz; 1629 1630 /* pmr uses page numbers, translate low and high. */ 1631 high++; 1632 high = atop(trunc_page(high)); 1633 low = atop(round_page(low)); 1634 uvm_pmr_split(low); 1635 uvm_pmr_split(high); 1636 1637 sz = 0; 1638 uvm_lock_fpageq(); 1639 /* Increase use count on segments in range. */ 1640 RBT_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) { 1641 if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) { 1642 TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use); 1643 pmr->use++; 1644 sz += pmr->high - pmr->low; 1645 uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr); 1646 } 1647 uvm_pmr_assertvalid(pmr); 1648 } 1649 uvm_unlock_fpageq(); 1650 1651 KASSERT(sz >= high - low); 1652} 1653 1654/* 1655 * Allocate a pmemrange. 1656 * 1657 * If called from uvm_page_init, the uvm_pageboot_alloc is used. 1658 * If called after uvm_init, malloc is used. 1659 * (And if called in between, you're dead.) 1660 */ 1661struct uvm_pmemrange * 1662uvm_pmr_allocpmr(void) 1663{ 1664 struct uvm_pmemrange *nw; 1665 int i; 1666 1667 /* We're only ever hitting the !uvm.page_init_done case for now. */ 1668 if (!uvm.page_init_done) { 1669 nw = (struct uvm_pmemrange *) 1670 uvm_pageboot_alloc(sizeof(struct uvm_pmemrange)); 1671 } else { 1672 nw = malloc(sizeof(struct uvm_pmemrange), 1673 M_VMMAP, M_NOWAIT); 1674 } 1675 KASSERT(nw != NULL); 1676 memset(nw, 0, sizeof(struct uvm_pmemrange)); 1677 RBT_INIT(uvm_pmr_addr, &nw->addr); 1678 for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) { 1679 RBT_INIT(uvm_pmr_size, &nw->size[i]); 1680 TAILQ_INIT(&nw->single[i]); 1681 } 1682 return nw; 1683} 1684 1685/* 1686 * Initialization of pmr. 1687 * Called by uvm_page_init. 1688 * 1689 * Sets up pmemranges. 1690 */ 1691void 1692uvm_pmr_init(void) 1693{ 1694 struct uvm_pmemrange *new_pmr; 1695 int i; 1696 1697 TAILQ_INIT(&uvm.pmr_control.use); 1698 RBT_INIT(uvm_pmemrange_addr, &uvm.pmr_control.addr); 1699 TAILQ_INIT(&uvm.pmr_control.allocs); 1700 1701 /* By default, one range for the entire address space. */ 1702 new_pmr = uvm_pmr_allocpmr(); 1703 new_pmr->low = 0; 1704 new_pmr->high = atop((paddr_t)-1) + 1; 1705 1706 RBT_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr); 1707 uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr); 1708 1709 for (i = 0; uvm_md_constraints[i] != NULL; i++) { 1710 uvm_pmr_use_inc(uvm_md_constraints[i]->ucr_low, 1711 uvm_md_constraints[i]->ucr_high); 1712 } 1713} 1714 1715/* 1716 * Find the pmemrange that contains the given page number. 1717 * 1718 * (Manually traverses the binary tree, because that is cheaper on stack 1719 * usage.) 1720 */ 1721struct uvm_pmemrange * 1722uvm_pmemrange_find(paddr_t pageno) 1723{ 1724 struct uvm_pmemrange *pmr; 1725 1726 pmr = RBT_ROOT(uvm_pmemrange_addr, &uvm.pmr_control.addr); 1727 while (pmr != NULL) { 1728 if (pmr->low > pageno) 1729 pmr = RBT_LEFT(uvm_pmemrange_addr, pmr); 1730 else if (pmr->high <= pageno) 1731 pmr = RBT_RIGHT(uvm_pmemrange_addr, pmr); 1732 else 1733 break; 1734 } 1735 1736 return pmr; 1737} 1738 1739#if defined(DDB) || defined(DEBUG) 1740/* 1741 * Return true if the given page is in any of the free lists. 1742 * Used by uvm_page_printit. 1743 * This function is safe, even if the page is not on the freeq. 1744 * Note: does not apply locking, only called from ddb. 1745 */ 1746int 1747uvm_pmr_isfree(struct vm_page *pg) 1748{ 1749 struct vm_page *r; 1750 struct uvm_pmemrange *pmr; 1751 1752 pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); 1753 if (pmr == NULL) 1754 return 0; 1755 r = RBT_NFIND(uvm_pmr_addr, &pmr->addr, pg); 1756 if (r == NULL) 1757 r = RBT_MAX(uvm_pmr_addr, &pmr->addr); 1758 else if (r != pg) 1759 r = RBT_PREV(uvm_pmr_addr, r); 1760 if (r == NULL) 1761 return 0; /* Empty tree. */ 1762 1763 KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg))); 1764 return atop(VM_PAGE_TO_PHYS(r)) + r->fpgsz > 1765 atop(VM_PAGE_TO_PHYS(pg)); 1766} 1767#endif /* DEBUG */ 1768 1769/* 1770 * Given a root of a tree, find a range which intersects start, end and 1771 * is of the same memtype. 1772 * 1773 * Page must be in the address tree. 1774 */ 1775struct vm_page* 1776uvm_pmr_rootupdate(struct uvm_pmemrange *pmr, struct vm_page *init_root, 1777 paddr_t start, paddr_t end, int memtype) 1778{ 1779 int direction; 1780 struct vm_page *root; 1781 struct vm_page *high, *high_next; 1782 struct vm_page *low, *low_next; 1783 1784 KDASSERT(pmr != NULL && init_root != NULL); 1785 root = init_root; 1786 1787 /* Which direction to use for searching. */ 1788 if (start != 0 && atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz <= start) 1789 direction = 1; 1790 else if (end != 0 && atop(VM_PAGE_TO_PHYS(root)) >= end) 1791 direction = -1; 1792 else /* nothing to do */ 1793 return root; 1794 1795 /* First, update root to fall within the chosen range. */ 1796 while (root && !PMR_INTERSECTS_WITH( 1797 atop(VM_PAGE_TO_PHYS(root)), 1798 atop(VM_PAGE_TO_PHYS(root)) + root->fpgsz, 1799 start, end)) { 1800 if (direction == 1) 1801 root = RBT_RIGHT(uvm_objtree, root); 1802 else 1803 root = RBT_LEFT(uvm_objtree, root); 1804 } 1805 if (root == NULL || uvm_pmr_pg_to_memtype(root) == memtype) 1806 return root; 1807 1808 /* 1809 * Root is valid, but of the wrong memtype. 1810 * 1811 * Try to find a range that has the given memtype in the subtree 1812 * (memtype mismatches are costly, either because the conversion 1813 * is expensive, or a later allocation will need to do the opposite 1814 * conversion, which will be expensive). 1815 * 1816 * 1817 * First, simply increase address until we hit something we can use. 1818 * Cache the upper page, so we can page-walk later. 1819 */ 1820 high = root; 1821 high_next = RBT_RIGHT(uvm_objtree, high); 1822 while (high_next != NULL && PMR_INTERSECTS_WITH( 1823 atop(VM_PAGE_TO_PHYS(high_next)), 1824 atop(VM_PAGE_TO_PHYS(high_next)) + high_next->fpgsz, 1825 start, end)) { 1826 high = high_next; 1827 if (uvm_pmr_pg_to_memtype(high) == memtype) 1828 return high; 1829 high_next = RBT_RIGHT(uvm_objtree, high); 1830 } 1831 1832 /* 1833 * Second, decrease the address until we hit something we can use. 1834 * Cache the lower page, so we can page-walk later. 1835 */ 1836 low = root; 1837 low_next = RBT_LEFT(uvm_objtree, low); 1838 while (low_next != NULL && PMR_INTERSECTS_WITH( 1839 atop(VM_PAGE_TO_PHYS(low_next)), 1840 atop(VM_PAGE_TO_PHYS(low_next)) + low_next->fpgsz, 1841 start, end)) { 1842 low = low_next; 1843 if (uvm_pmr_pg_to_memtype(low) == memtype) 1844 return low; 1845 low_next = RBT_LEFT(uvm_objtree, low); 1846 } 1847 1848 if (low == high) 1849 return NULL; 1850 1851 /* No hits. Walk the address tree until we find something usable. */ 1852 for (low = RBT_NEXT(uvm_pmr_addr, low); 1853 low != high; 1854 low = RBT_NEXT(uvm_pmr_addr, low)) { 1855 KDASSERT(PMR_IS_SUBRANGE_OF(atop(VM_PAGE_TO_PHYS(low)), 1856 atop(VM_PAGE_TO_PHYS(low)) + low->fpgsz, 1857 start, end)); 1858 if (uvm_pmr_pg_to_memtype(low) == memtype) 1859 return low; 1860 } 1861 1862 /* Nothing found. */ 1863 return NULL; 1864} 1865 1866/* 1867 * Allocate any page, the fastest way. Page number constraints only. 1868 */ 1869psize_t 1870uvm_pmr_get1page(psize_t count, int memtype_init, struct pglist *result, 1871 paddr_t start, paddr_t end, int memtype_only) 1872{ 1873 struct uvm_pmemrange *pmr; 1874 struct vm_page *found, *splitpg; 1875 psize_t fcount; 1876 int memtype; 1877 1878 fcount = 0; 1879 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 1880 /* We're done. */ 1881 if (fcount == count) 1882 break; 1883 1884 /* Outside requested range. */ 1885 if (!(start == 0 && end == 0) && 1886 !PMR_INTERSECTS_WITH(pmr->low, pmr->high, start, end)) 1887 continue; 1888 1889 /* Range is empty. */ 1890 if (pmr->nsegs == 0) 1891 continue; 1892 1893 /* Loop over all memtypes, starting at memtype_init. */ 1894 memtype = memtype_init; 1895 while (fcount != count) { 1896 found = TAILQ_FIRST(&pmr->single[memtype]); 1897 /* 1898 * If found is outside the range, walk the list 1899 * until we find something that intersects with 1900 * boundaries. 1901 */ 1902 while (found && !PMR_INTERSECTS_WITH( 1903 atop(VM_PAGE_TO_PHYS(found)), 1904 atop(VM_PAGE_TO_PHYS(found)) + 1, 1905 start, end)) 1906 found = TAILQ_NEXT(found, pageq); 1907 1908 if (found == NULL) { 1909 /* 1910 * Check if the size tree contains a range 1911 * that intersects with the boundaries. As the 1912 * allocation is for any page, try the smallest 1913 * range so that large ranges are preserved for 1914 * more constrained cases. Only one entry is 1915 * checked here, to avoid a brute-force search. 1916 * 1917 * Note that a size tree gives pg[1] instead of 1918 * pg[0]. 1919 */ 1920 found = RBT_MIN(uvm_pmr_size, 1921 &pmr->size[memtype]); 1922 if (found != NULL) { 1923 found--; 1924 if (!PMR_INTERSECTS_WITH( 1925 atop(VM_PAGE_TO_PHYS(found)), 1926 atop(VM_PAGE_TO_PHYS(found)) + 1927 found->fpgsz, start, end)) 1928 found = NULL; 1929 } 1930 } 1931 if (found == NULL) { 1932 /* 1933 * Try address-guided search to meet the page 1934 * number constraints. 1935 */ 1936 found = RBT_ROOT(uvm_pmr_addr, &pmr->addr); 1937 if (found != NULL) { 1938 found = uvm_pmr_rootupdate(pmr, found, 1939 start, end, memtype); 1940 } 1941 } 1942 if (found != NULL) { 1943 uvm_pmr_assertvalid(pmr); 1944 uvm_pmr_remove_size(pmr, found); 1945 1946 /* 1947 * If the page intersects the end, then it'll 1948 * need splitting. 1949 * 1950 * Note that we don't need to split if the page 1951 * intersects start: the drain function will 1952 * simply stop on hitting start. 1953 */ 1954 if (end != 0 && atop(VM_PAGE_TO_PHYS(found)) + 1955 found->fpgsz > end) { 1956 psize_t splitsz = 1957 atop(VM_PAGE_TO_PHYS(found)) + 1958 found->fpgsz - end; 1959 1960 uvm_pmr_remove_addr(pmr, found); 1961 uvm_pmr_assertvalid(pmr); 1962 found->fpgsz -= splitsz; 1963 splitpg = found + found->fpgsz; 1964 splitpg->fpgsz = splitsz; 1965 uvm_pmr_insert(pmr, splitpg, 1); 1966 1967 /* 1968 * At this point, splitpg and found 1969 * actually should be joined. 1970 * But we explicitly disable that, 1971 * because we will start subtracting 1972 * from found. 1973 */ 1974 KASSERT(start == 0 || 1975 atop(VM_PAGE_TO_PHYS(found)) + 1976 found->fpgsz > start); 1977 uvm_pmr_insert_addr(pmr, found, 1); 1978 } 1979 1980 /* 1981 * Fetch pages from the end. 1982 * If the range is larger than the requested 1983 * number of pages, this saves us an addr-tree 1984 * update. 1985 * 1986 * Since we take from the end and insert at 1987 * the head, any ranges keep preserved. 1988 */ 1989 while (found->fpgsz > 0 && fcount < count && 1990 (start == 0 || 1991 atop(VM_PAGE_TO_PHYS(found)) + 1992 found->fpgsz > start)) { 1993 found->fpgsz--; 1994 fcount++; 1995 TAILQ_INSERT_HEAD(result, 1996 &found[found->fpgsz], pageq); 1997 } 1998 if (found->fpgsz > 0) { 1999 uvm_pmr_insert_size(pmr, found); 2000 KDASSERT(fcount == count); 2001 uvm_pmr_assertvalid(pmr); 2002 return fcount; 2003 } 2004 2005 /* 2006 * Delayed addr-tree removal. 2007 */ 2008 uvm_pmr_remove_addr(pmr, found); 2009 uvm_pmr_assertvalid(pmr); 2010 } else { 2011 if (memtype_only) 2012 break; 2013 /* 2014 * Skip to the next memtype. 2015 */ 2016 memtype += 1; 2017 if (memtype == UVM_PMR_MEMTYPE_MAX) 2018 memtype = 0; 2019 if (memtype == memtype_init) 2020 break; 2021 } 2022 } 2023 } 2024 2025 /* 2026 * Search finished. 2027 * 2028 * Ran out of ranges before enough pages were gathered, or we hit the 2029 * case where found->fpgsz == count - fcount, in which case the 2030 * above exit condition didn't trigger. 2031 * 2032 * On failure, caller will free the pages. 2033 */ 2034 return fcount; 2035} 2036 2037#ifdef DDB 2038/* 2039 * Print information about pmemrange. 2040 * Does not do locking (so either call it from DDB or acquire fpageq lock 2041 * before invoking. 2042 */ 2043void 2044uvm_pmr_print(void) 2045{ 2046 struct uvm_pmemrange *pmr; 2047 struct vm_page *pg; 2048 psize_t size[UVM_PMR_MEMTYPE_MAX]; 2049 psize_t free; 2050 int useq_len; 2051 int mt; 2052 2053 printf("Ranges, use queue:\n"); 2054 useq_len = 0; 2055 TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { 2056 useq_len++; 2057 free = 0; 2058 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 2059 pg = RBT_MAX(uvm_pmr_size, &pmr->size[mt]); 2060 if (pg != NULL) 2061 pg--; 2062 else 2063 pg = TAILQ_FIRST(&pmr->single[mt]); 2064 size[mt] = (pg == NULL ? 0 : pg->fpgsz); 2065 2066 RBT_FOREACH(pg, uvm_pmr_addr, &pmr->addr) 2067 free += pg->fpgsz; 2068 } 2069 2070 printf("* [0x%lx-0x%lx] use=%d nsegs=%ld", 2071 (unsigned long)pmr->low, (unsigned long)pmr->high, 2072 pmr->use, (unsigned long)pmr->nsegs); 2073 for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { 2074 printf(" maxsegsz[%d]=0x%lx", mt, 2075 (unsigned long)size[mt]); 2076 } 2077 printf(" free=0x%lx\n", (unsigned long)free); 2078 } 2079 printf("#ranges = %d\n", useq_len); 2080} 2081#endif 2082 2083/* 2084 * uvm_wait_pla: wait (sleep) for the page daemon to free some pages 2085 * in a specific physmem area. 2086 * 2087 * Returns ENOMEM if the pagedaemon failed to free any pages. 2088 * If not failok, failure will lead to panic. 2089 * 2090 * Must be called with fpageq locked. 2091 */ 2092int 2093uvm_wait_pla(paddr_t low, paddr_t high, paddr_t size, int failok) 2094{ 2095 struct uvm_pmalloc pma; 2096 const char *wmsg = "pmrwait"; 2097 2098 if (curproc == uvm.pagedaemon_proc) { 2099 /* 2100 * This is not that uncommon when the pagedaemon is trying 2101 * to flush out a large mmapped file. VOP_WRITE will circle 2102 * back through the buffer cache and try to get more memory. 2103 * The pagedaemon starts by calling bufbackoff, but we can 2104 * easily use up that reserve in a single scan iteration. 2105 */ 2106 uvm_unlock_fpageq(); 2107 if (bufbackoff(NULL, atop(size)) == 0) { 2108 uvm_lock_fpageq(); 2109 return 0; 2110 } 2111 uvm_lock_fpageq(); 2112 2113 /* 2114 * XXX detect pagedaemon deadlock - see comment in 2115 * uvm_wait(), as this is exactly the same issue. 2116 */ 2117 printf("pagedaemon: wait_pla deadlock detected!\n"); 2118 msleep_nsec(&uvmexp.free, &uvm.fpageqlock, PVM, wmsg, 2119 MSEC_TO_NSEC(125)); 2120#if defined(DEBUG) 2121 /* DEBUG: panic so we can debug it */ 2122 panic("wait_pla pagedaemon deadlock"); 2123#endif 2124 return 0; 2125 } 2126 2127 for (;;) { 2128 pma.pm_constraint.ucr_low = low; 2129 pma.pm_constraint.ucr_high = high; 2130 pma.pm_size = size; 2131 pma.pm_flags = UVM_PMA_LINKED; 2132 TAILQ_INSERT_TAIL(&uvm.pmr_control.allocs, &pma, pmq); 2133 2134 wakeup(&uvm.pagedaemon); /* wake the daemon! */ 2135 while (pma.pm_flags & (UVM_PMA_LINKED | UVM_PMA_BUSY)) 2136 msleep_nsec(&pma, &uvm.fpageqlock, PVM, wmsg, INFSLP); 2137 2138 if (!(pma.pm_flags & UVM_PMA_FREED) && 2139 pma.pm_flags & UVM_PMA_FAIL) { 2140 if (failok) 2141 return ENOMEM; 2142 printf("uvm_wait: failed to free %ld pages between " 2143 "0x%lx-0x%lx\n", atop(size), low, high); 2144 } else 2145 return 0; 2146 } 2147 /* UNREACHABLE */ 2148} 2149 2150/* 2151 * Wake up uvm_pmalloc sleepers. 2152 */ 2153void 2154uvm_wakeup_pla(paddr_t low, psize_t len) 2155{ 2156 struct uvm_pmalloc *pma, *pma_next; 2157 paddr_t high; 2158 2159 high = low + len; 2160 2161 /* Wake specific allocations waiting for this memory. */ 2162 for (pma = TAILQ_FIRST(&uvm.pmr_control.allocs); pma != NULL; 2163 pma = pma_next) { 2164 pma_next = TAILQ_NEXT(pma, pmq); 2165 2166 if (low < pma->pm_constraint.ucr_high && 2167 high > pma->pm_constraint.ucr_low) { 2168 pma->pm_flags |= UVM_PMA_FREED; 2169 if (!(pma->pm_flags & UVM_PMA_BUSY)) { 2170 pma->pm_flags &= ~UVM_PMA_LINKED; 2171 TAILQ_REMOVE(&uvm.pmr_control.allocs, pma, 2172 pmq); 2173 wakeup(pma); 2174 } 2175 } 2176 } 2177} 2178 2179void 2180uvm_pagezero_thread(void *arg) 2181{ 2182 struct pglist pgl; 2183 struct vm_page *pg; 2184 int count; 2185 2186 /* Run at the lowest possible priority. */ 2187 curproc->p_p->ps_nice = NZERO + PRIO_MAX; 2188 2189 KERNEL_UNLOCK(); 2190 2191 TAILQ_INIT(&pgl); 2192 for (;;) { 2193 uvm_lock_fpageq(); 2194 while (uvmexp.zeropages >= UVM_PAGEZERO_TARGET || 2195 (count = uvm_pmr_get1page(16, UVM_PMR_MEMTYPE_DIRTY, 2196 &pgl, 0, 0, 1)) == 0) { 2197 msleep_nsec(&uvmexp.zeropages, &uvm.fpageqlock, 2198 MAXPRI, "pgzero", INFSLP); 2199 } 2200 uvm_unlock_fpageq(); 2201 2202 TAILQ_FOREACH(pg, &pgl, pageq) { 2203 uvm_pagezero(pg); 2204 atomic_setbits_int(&pg->pg_flags, PG_ZERO); 2205 } 2206 2207 uvm_lock_fpageq(); 2208 while (!TAILQ_EMPTY(&pgl)) 2209 uvm_pmr_remove_1strange(&pgl, 0, NULL, 0); 2210 uvmexp.zeropages += count; 2211 uvm_unlock_fpageq(); 2212 2213 yield(); 2214 } 2215} 2216 2217#if defined(MULTIPROCESSOR) && defined(__HAVE_UVM_PERCPU) 2218int 2219uvm_pmr_cache_alloc(struct uvm_pmr_cache_item *upci) 2220{ 2221 struct vm_page *pg; 2222 struct pglist pgl; 2223 int flags = UVM_PLA_NOWAIT|UVM_PLA_NOWAKE; 2224 int npages = UVM_PMR_CACHEMAGSZ; 2225 2226 splassert(IPL_VM); 2227 KASSERT(upci->upci_npages == 0); 2228 2229 TAILQ_INIT(&pgl); 2230 if (uvm_pmr_getpages(npages, 0, 0, 1, 0, npages, flags, &pgl)) 2231 return -1; 2232 2233 while ((pg = TAILQ_FIRST(&pgl)) != NULL) { 2234 TAILQ_REMOVE(&pgl, pg, pageq); 2235 upci->upci_pages[upci->upci_npages] = pg; 2236 upci->upci_npages++; 2237 } 2238 atomic_add_int(&uvmexp.percpucaches, npages); 2239 2240 return 0; 2241} 2242 2243struct vm_page * 2244uvm_pmr_cache_get(int flags) 2245{ 2246 struct uvm_pmr_cache *upc = &curcpu()->ci_uvm; 2247 struct uvm_pmr_cache_item *upci; 2248 struct vm_page *pg; 2249 int s; 2250 2251 s = splvm(); 2252 upci = &upc->upc_magz[upc->upc_actv]; 2253 if (upci->upci_npages == 0) { 2254 unsigned int prev; 2255 2256 prev = (upc->upc_actv == 0) ? 1 : 0; 2257 upci = &upc->upc_magz[prev]; 2258 if (upci->upci_npages == 0) { 2259 atomic_inc_int(&uvmexp.pcpmiss); 2260 if (uvm_pmr_cache_alloc(upci)) { 2261 splx(s); 2262 return uvm_pmr_getone(flags); 2263 } 2264 } 2265 /* Swap magazines */ 2266 upc->upc_actv = prev; 2267 } else { 2268 atomic_inc_int(&uvmexp.pcphit); 2269 } 2270 2271 atomic_dec_int(&uvmexp.percpucaches); 2272 upci->upci_npages--; 2273 pg = upci->upci_pages[upci->upci_npages]; 2274 splx(s); 2275 2276 if (flags & UVM_PLA_ZERO) 2277 uvm_pagezero(pg); 2278 2279 return pg; 2280} 2281 2282void 2283uvm_pmr_cache_free(struct uvm_pmr_cache_item *upci) 2284{ 2285 struct pglist pgl; 2286 unsigned int i; 2287 2288 splassert(IPL_VM); 2289 2290 TAILQ_INIT(&pgl); 2291 for (i = 0; i < upci->upci_npages; i++) 2292 TAILQ_INSERT_TAIL(&pgl, upci->upci_pages[i], pageq); 2293 2294 uvm_pmr_freepageq(&pgl); 2295 2296 atomic_sub_int(&uvmexp.percpucaches, upci->upci_npages); 2297 upci->upci_npages = 0; 2298 memset(upci->upci_pages, 0, sizeof(upci->upci_pages)); 2299} 2300 2301void 2302uvm_pmr_cache_put(struct vm_page *pg) 2303{ 2304 struct uvm_pmr_cache *upc = &curcpu()->ci_uvm; 2305 struct uvm_pmr_cache_item *upci; 2306 int s; 2307 2308 s = splvm(); 2309 upci = &upc->upc_magz[upc->upc_actv]; 2310 if (upci->upci_npages >= UVM_PMR_CACHEMAGSZ) { 2311 unsigned int prev; 2312 2313 prev = (upc->upc_actv == 0) ? 1 : 0; 2314 upci = &upc->upc_magz[prev]; 2315 if (upci->upci_npages > 0) 2316 uvm_pmr_cache_free(upci); 2317 2318 /* Swap magazines */ 2319 upc->upc_actv = prev; 2320 KASSERT(upci->upci_npages == 0); 2321 } 2322 2323 upci->upci_pages[upci->upci_npages] = pg; 2324 upci->upci_npages++; 2325 atomic_inc_int(&uvmexp.percpucaches); 2326 splx(s); 2327} 2328 2329void 2330uvm_pmr_cache_drain(void) 2331{ 2332 struct uvm_pmr_cache *upc = &curcpu()->ci_uvm; 2333 int s; 2334 2335 s = splvm(); 2336 uvm_pmr_cache_free(&upc->upc_magz[0]); 2337 uvm_pmr_cache_free(&upc->upc_magz[1]); 2338 splx(s); 2339} 2340 2341#else /* !(MULTIPROCESSOR && __HAVE_UVM_PERCPU) */ 2342 2343struct vm_page * 2344uvm_pmr_cache_get(int flags) 2345{ 2346 return uvm_pmr_getone(flags); 2347} 2348 2349void 2350uvm_pmr_cache_put(struct vm_page *pg) 2351{ 2352 uvm_pmr_freepages(pg, 1); 2353} 2354 2355void 2356uvm_pmr_cache_drain(void) 2357{ 2358} 2359#endif 2360