1/* 2 * linux/mm/vmalloc.c 3 * 4 * Copyright (C) 1993 Linus Torvalds 5 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 6 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 7 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 8 * Numa awareness, Christoph Lameter, SGI, June 2005 9 */ 10 11#include <linux/vmalloc.h> 12#include <linux/mm.h> 13#include <linux/module.h> 14#include <linux/highmem.h> 15#include <linux/sched.h> 16#include <linux/slab.h> 17#include <linux/spinlock.h> 18#include <linux/interrupt.h> 19#include <linux/proc_fs.h> 20#include <linux/seq_file.h> 21#include <linux/debugobjects.h> 22#include <linux/kallsyms.h> 23#include <linux/list.h> 24#include <linux/rbtree.h> 25#include <linux/radix-tree.h> 26#include <linux/rcupdate.h> 27#include <linux/pfn.h> 28#include <linux/kmemleak.h> 29#include <asm/atomic.h> 30#include <asm/uaccess.h> 31#include <asm/tlbflush.h> 32#include <asm/shmparam.h> 33 34bool vmap_lazy_unmap __read_mostly = true; 35 36/*** Page table manipulation functions ***/ 37 38static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) 39{ 40 pte_t *pte; 41 42 pte = pte_offset_kernel(pmd, addr); 43 do { 44 pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); 45 WARN_ON(!pte_none(ptent) && !pte_present(ptent)); 46 } while (pte++, addr += PAGE_SIZE, addr != end); 47} 48 49static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) 50{ 51 pmd_t *pmd; 52 unsigned long next; 53 54 pmd = pmd_offset(pud, addr); 55 do { 56 next = pmd_addr_end(addr, end); 57 if (pmd_none_or_clear_bad(pmd)) 58 continue; 59 vunmap_pte_range(pmd, addr, next); 60 } while (pmd++, addr = next, addr != end); 61} 62 63static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) 64{ 65 pud_t *pud; 66 unsigned long next; 67 68 pud = pud_offset(pgd, addr); 69 do { 70 next = pud_addr_end(addr, end); 71 if (pud_none_or_clear_bad(pud)) 72 continue; 73 vunmap_pmd_range(pud, addr, next); 74 } while (pud++, addr = next, addr != end); 75} 76 77static void vunmap_page_range(unsigned long addr, unsigned long end) 78{ 79 pgd_t *pgd; 80 unsigned long next; 81 82 BUG_ON(addr >= end); 83 pgd = pgd_offset_k(addr); 84 do { 85 next = pgd_addr_end(addr, end); 86 if (pgd_none_or_clear_bad(pgd)) 87 continue; 88 vunmap_pud_range(pgd, addr, next); 89 } while (pgd++, addr = next, addr != end); 90} 91 92static int vmap_pte_range(pmd_t *pmd, unsigned long addr, 93 unsigned long end, pgprot_t prot, struct page **pages, int *nr) 94{ 95 pte_t *pte; 96 97 /* 98 * nr is a running index into the array which helps higher level 99 * callers keep track of where we're up to. 100 */ 101 102 pte = pte_alloc_kernel(pmd, addr); 103 if (!pte) 104 return -ENOMEM; 105 do { 106 struct page *page = pages[*nr]; 107 108 if (WARN_ON(!pte_none(*pte))) 109 return -EBUSY; 110 if (WARN_ON(!page)) 111 return -ENOMEM; 112 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); 113 (*nr)++; 114 } while (pte++, addr += PAGE_SIZE, addr != end); 115 return 0; 116} 117 118static int vmap_pmd_range(pud_t *pud, unsigned long addr, 119 unsigned long end, pgprot_t prot, struct page **pages, int *nr) 120{ 121 pmd_t *pmd; 122 unsigned long next; 123 124 pmd = pmd_alloc(&init_mm, pud, addr); 125 if (!pmd) 126 return -ENOMEM; 127 do { 128 next = pmd_addr_end(addr, end); 129 if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) 130 return -ENOMEM; 131 } while (pmd++, addr = next, addr != end); 132 return 0; 133} 134 135static int vmap_pud_range(pgd_t *pgd, unsigned long addr, 136 unsigned long end, pgprot_t prot, struct page **pages, int *nr) 137{ 138 pud_t *pud; 139 unsigned long next; 140 141 pud = pud_alloc(&init_mm, pgd, addr); 142 if (!pud) 143 return -ENOMEM; 144 do { 145 next = pud_addr_end(addr, end); 146 if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) 147 return -ENOMEM; 148 } while (pud++, addr = next, addr != end); 149 return 0; 150} 151 152/* 153 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and 154 * will have pfns corresponding to the "pages" array. 155 * 156 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] 157 */ 158static int vmap_page_range_noflush(unsigned long start, unsigned long end, 159 pgprot_t prot, struct page **pages) 160{ 161 pgd_t *pgd; 162 unsigned long next; 163 unsigned long addr = start; 164 int err = 0; 165 int nr = 0; 166 167 BUG_ON(addr >= end); 168 pgd = pgd_offset_k(addr); 169 do { 170 next = pgd_addr_end(addr, end); 171 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); 172 if (err) 173 return err; 174 } while (pgd++, addr = next, addr != end); 175 176 return nr; 177} 178 179static int vmap_page_range(unsigned long start, unsigned long end, 180 pgprot_t prot, struct page **pages) 181{ 182 int ret; 183 184 ret = vmap_page_range_noflush(start, end, prot, pages); 185 flush_cache_vmap(start, end); 186 return ret; 187} 188 189int is_vmalloc_or_module_addr(const void *x) 190{ 191 /* 192 * ARM, x86-64 and sparc64 put modules in a special place, 193 * and fall back on vmalloc() if that fails. Others 194 * just put it in the vmalloc space. 195 */ 196#if defined(CONFIG_MODULES) && defined(MODULES_VADDR) 197 unsigned long addr = (unsigned long)x; 198 if (addr >= MODULES_VADDR && addr < MODULES_END) 199 return 1; 200#endif 201 return is_vmalloc_addr(x); 202} 203 204/* 205 * Walk a vmap address to the struct page it maps. 206 */ 207struct page *vmalloc_to_page(const void *vmalloc_addr) 208{ 209 unsigned long addr = (unsigned long) vmalloc_addr; 210 struct page *page = NULL; 211 pgd_t *pgd = pgd_offset_k(addr); 212 213 VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); 214 215 if (!pgd_none(*pgd)) { 216 pud_t *pud = pud_offset(pgd, addr); 217 if (!pud_none(*pud)) { 218 pmd_t *pmd = pmd_offset(pud, addr); 219 if (!pmd_none(*pmd)) { 220 pte_t *ptep, pte; 221 222 ptep = pte_offset_map(pmd, addr); 223 pte = *ptep; 224 if (pte_present(pte)) 225 page = pte_page(pte); 226 pte_unmap(ptep); 227 } 228 } 229 } 230 return page; 231} 232EXPORT_SYMBOL(vmalloc_to_page); 233 234/* 235 * Map a vmalloc()-space virtual address to the physical page frame number. 236 */ 237unsigned long vmalloc_to_pfn(const void *vmalloc_addr) 238{ 239 return page_to_pfn(vmalloc_to_page(vmalloc_addr)); 240} 241EXPORT_SYMBOL(vmalloc_to_pfn); 242 243 244/*** Global kva allocator ***/ 245 246#define VM_LAZY_FREE 0x01 247#define VM_LAZY_FREEING 0x02 248#define VM_VM_AREA 0x04 249 250struct vmap_area { 251 unsigned long va_start; 252 unsigned long va_end; 253 unsigned long flags; 254 struct rb_node rb_node; /* address sorted rbtree */ 255 struct list_head list; /* address sorted list */ 256 struct list_head purge_list; /* "lazy purge" list */ 257 void *private; 258 struct rcu_head rcu_head; 259}; 260 261static DEFINE_SPINLOCK(vmap_area_lock); 262static struct rb_root vmap_area_root = RB_ROOT; 263static LIST_HEAD(vmap_area_list); 264static unsigned long vmap_area_pcpu_hole; 265 266static struct vmap_area *__find_vmap_area(unsigned long addr) 267{ 268 struct rb_node *n = vmap_area_root.rb_node; 269 270 while (n) { 271 struct vmap_area *va; 272 273 va = rb_entry(n, struct vmap_area, rb_node); 274 if (addr < va->va_start) 275 n = n->rb_left; 276 else if (addr > va->va_start) 277 n = n->rb_right; 278 else 279 return va; 280 } 281 282 return NULL; 283} 284 285static void __insert_vmap_area(struct vmap_area *va) 286{ 287 struct rb_node **p = &vmap_area_root.rb_node; 288 struct rb_node *parent = NULL; 289 struct rb_node *tmp; 290 291 while (*p) { 292 struct vmap_area *tmp; 293 294 parent = *p; 295 tmp = rb_entry(parent, struct vmap_area, rb_node); 296 if (va->va_start < tmp->va_end) 297 p = &(*p)->rb_left; 298 else if (va->va_end > tmp->va_start) 299 p = &(*p)->rb_right; 300 else 301 BUG(); 302 } 303 304 rb_link_node(&va->rb_node, parent, p); 305 rb_insert_color(&va->rb_node, &vmap_area_root); 306 307 /* address-sort this list so it is usable like the vmlist */ 308 tmp = rb_prev(&va->rb_node); 309 if (tmp) { 310 struct vmap_area *prev; 311 prev = rb_entry(tmp, struct vmap_area, rb_node); 312 list_add_rcu(&va->list, &prev->list); 313 } else 314 list_add_rcu(&va->list, &vmap_area_list); 315} 316 317static void purge_vmap_area_lazy(void); 318 319/* 320 * Allocate a region of KVA of the specified size and alignment, within the 321 * vstart and vend. 322 */ 323static struct vmap_area *alloc_vmap_area(unsigned long size, 324 unsigned long align, 325 unsigned long vstart, unsigned long vend, 326 int node, gfp_t gfp_mask) 327{ 328 struct vmap_area *va; 329 struct rb_node *n; 330 unsigned long addr; 331 int purged = 0; 332 333 BUG_ON(!size); 334 BUG_ON(size & ~PAGE_MASK); 335 336 va = kmalloc_node(sizeof(struct vmap_area), 337 gfp_mask & GFP_RECLAIM_MASK, node); 338 if (unlikely(!va)) 339 return ERR_PTR(-ENOMEM); 340 341retry: 342 addr = ALIGN(vstart, align); 343 344 spin_lock(&vmap_area_lock); 345 if (addr + size - 1 < addr) 346 goto overflow; 347 348 n = vmap_area_root.rb_node; 349 if (n) { 350 struct vmap_area *first = NULL; 351 352 do { 353 struct vmap_area *tmp; 354 tmp = rb_entry(n, struct vmap_area, rb_node); 355 if (tmp->va_end >= addr) { 356 if (!first && tmp->va_start < addr + size) 357 first = tmp; 358 n = n->rb_left; 359 } else { 360 first = tmp; 361 n = n->rb_right; 362 } 363 } while (n); 364 365 if (!first) 366 goto found; 367 368 if (first->va_end < addr) { 369 n = rb_next(&first->rb_node); 370 if (n) 371 first = rb_entry(n, struct vmap_area, rb_node); 372 else 373 goto found; 374 } 375 376 while (addr + size > first->va_start && addr + size <= vend) { 377 addr = ALIGN(first->va_end + PAGE_SIZE, align); 378 if (addr + size - 1 < addr) 379 goto overflow; 380 381 n = rb_next(&first->rb_node); 382 if (n) 383 first = rb_entry(n, struct vmap_area, rb_node); 384 else 385 goto found; 386 } 387 } 388found: 389 if (addr + size > vend) { 390overflow: 391 spin_unlock(&vmap_area_lock); 392 if (!purged) { 393 purge_vmap_area_lazy(); 394 purged = 1; 395 goto retry; 396 } 397 if (printk_ratelimit()) 398 printk(KERN_WARNING 399 "vmap allocation for size %lu failed: " 400 "use vmalloc=<size> to increase size.\n", size); 401 kfree(va); 402 return ERR_PTR(-EBUSY); 403 } 404 405 BUG_ON(addr & (align-1)); 406 407 va->va_start = addr; 408 va->va_end = addr + size; 409 va->flags = 0; 410 __insert_vmap_area(va); 411 spin_unlock(&vmap_area_lock); 412 413 return va; 414} 415 416static void rcu_free_va(struct rcu_head *head) 417{ 418 struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); 419 420 kfree(va); 421} 422 423static void __free_vmap_area(struct vmap_area *va) 424{ 425 BUG_ON(RB_EMPTY_NODE(&va->rb_node)); 426 rb_erase(&va->rb_node, &vmap_area_root); 427 RB_CLEAR_NODE(&va->rb_node); 428 list_del_rcu(&va->list); 429 430 /* 431 * Track the highest possible candidate for pcpu area 432 * allocation. Areas outside of vmalloc area can be returned 433 * here too, consider only end addresses which fall inside 434 * vmalloc area proper. 435 */ 436 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) 437 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); 438 439 call_rcu(&va->rcu_head, rcu_free_va); 440} 441 442/* 443 * Free a region of KVA allocated by alloc_vmap_area 444 */ 445static void free_vmap_area(struct vmap_area *va) 446{ 447 spin_lock(&vmap_area_lock); 448 __free_vmap_area(va); 449 spin_unlock(&vmap_area_lock); 450} 451 452/* 453 * Clear the pagetable entries of a given vmap_area 454 */ 455static void unmap_vmap_area(struct vmap_area *va) 456{ 457 vunmap_page_range(va->va_start, va->va_end); 458} 459 460static void vmap_debug_free_range(unsigned long start, unsigned long end) 461{ 462 /* 463 * Unmap page tables and force a TLB flush immediately if 464 * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free 465 * bugs similarly to those in linear kernel virtual address 466 * space after a page has been freed. 467 * 468 * All the lazy freeing logic is still retained, in order to 469 * minimise intrusiveness of this debugging feature. 470 * 471 * This is going to be *slow* (linear kernel virtual address 472 * debugging doesn't do a broadcast TLB flush so it is a lot 473 * faster). 474 */ 475#ifdef CONFIG_DEBUG_PAGEALLOC 476 vunmap_page_range(start, end); 477 flush_tlb_kernel_range(start, end); 478#endif 479} 480 481/* 482 * lazy_max_pages is the maximum amount of virtual address space we gather up 483 * before attempting to purge with a TLB flush. 484 * 485 * There is a tradeoff here: a larger number will cover more kernel page tables 486 * and take slightly longer to purge, but it will linearly reduce the number of 487 * global TLB flushes that must be performed. It would seem natural to scale 488 * this number up linearly with the number of CPUs (because vmapping activity 489 * could also scale linearly with the number of CPUs), however it is likely 490 * that in practice, workloads might be constrained in other ways that mean 491 * vmap activity will not scale linearly with CPUs. Also, I want to be 492 * conservative and not introduce a big latency on huge systems, so go with 493 * a less aggressive log scale. It will still be an improvement over the old 494 * code, and it will be simple to change the scale factor if we find that it 495 * becomes a problem on bigger systems. 496 */ 497static unsigned long lazy_max_pages(void) 498{ 499 unsigned int log; 500 501 if (!vmap_lazy_unmap) 502 return 0; 503 504 log = fls(num_online_cpus()); 505 506 return log * (32UL * 1024 * 1024 / PAGE_SIZE); 507} 508 509static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); 510 511/* for per-CPU blocks */ 512static void purge_fragmented_blocks_allcpus(void); 513 514/* 515 * called before a call to iounmap() if the caller wants vm_area_struct's 516 * immediately freed. 517 */ 518void set_iounmap_nonlazy(void) 519{ 520 atomic_set(&vmap_lazy_nr, lazy_max_pages()+1); 521} 522 523/* 524 * Purges all lazily-freed vmap areas. 525 * 526 * If sync is 0 then don't purge if there is already a purge in progress. 527 * If force_flush is 1, then flush kernel TLBs between *start and *end even 528 * if we found no lazy vmap areas to unmap (callers can use this to optimise 529 * their own TLB flushing). 530 * Returns with *start = min(*start, lowest purged address) 531 * *end = max(*end, highest purged address) 532 */ 533static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, 534 int sync, int force_flush) 535{ 536 static DEFINE_SPINLOCK(purge_lock); 537 LIST_HEAD(valist); 538 struct vmap_area *va; 539 struct vmap_area *n_va; 540 int nr = 0; 541 542 /* 543 * If sync is 0 but force_flush is 1, we'll go sync anyway but callers 544 * should not expect such behaviour. This just simplifies locking for 545 * the case that isn't actually used at the moment anyway. 546 */ 547 if (!sync && !force_flush) { 548 if (!spin_trylock(&purge_lock)) 549 return; 550 } else 551 spin_lock(&purge_lock); 552 553 if (sync) 554 purge_fragmented_blocks_allcpus(); 555 556 rcu_read_lock(); 557 list_for_each_entry_rcu(va, &vmap_area_list, list) { 558 if (va->flags & VM_LAZY_FREE) { 559 if (va->va_start < *start) 560 *start = va->va_start; 561 if (va->va_end > *end) 562 *end = va->va_end; 563 nr += (va->va_end - va->va_start) >> PAGE_SHIFT; 564 unmap_vmap_area(va); 565 list_add_tail(&va->purge_list, &valist); 566 va->flags |= VM_LAZY_FREEING; 567 va->flags &= ~VM_LAZY_FREE; 568 } 569 } 570 rcu_read_unlock(); 571 572 if (nr) 573 atomic_sub(nr, &vmap_lazy_nr); 574 575 if (nr || force_flush) 576 flush_tlb_kernel_range(*start, *end); 577 578 if (nr) { 579 spin_lock(&vmap_area_lock); 580 list_for_each_entry_safe(va, n_va, &valist, purge_list) 581 __free_vmap_area(va); 582 spin_unlock(&vmap_area_lock); 583 } 584 spin_unlock(&purge_lock); 585} 586 587/* 588 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody 589 * is already purging. 590 */ 591static void try_purge_vmap_area_lazy(void) 592{ 593 unsigned long start = ULONG_MAX, end = 0; 594 595 __purge_vmap_area_lazy(&start, &end, 0, 0); 596} 597 598/* 599 * Kick off a purge of the outstanding lazy areas. 600 */ 601static void purge_vmap_area_lazy(void) 602{ 603 unsigned long start = ULONG_MAX, end = 0; 604 605 __purge_vmap_area_lazy(&start, &end, 1, 0); 606} 607 608/* 609 * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been 610 * called for the correct range previously. 611 */ 612static void free_unmap_vmap_area_noflush(struct vmap_area *va) 613{ 614 va->flags |= VM_LAZY_FREE; 615 atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); 616 if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) 617 try_purge_vmap_area_lazy(); 618} 619 620/* 621 * Free and unmap a vmap area 622 */ 623static void free_unmap_vmap_area(struct vmap_area *va) 624{ 625 flush_cache_vunmap(va->va_start, va->va_end); 626 free_unmap_vmap_area_noflush(va); 627} 628 629static struct vmap_area *find_vmap_area(unsigned long addr) 630{ 631 struct vmap_area *va; 632 633 spin_lock(&vmap_area_lock); 634 va = __find_vmap_area(addr); 635 spin_unlock(&vmap_area_lock); 636 637 return va; 638} 639 640static void free_unmap_vmap_area_addr(unsigned long addr) 641{ 642 struct vmap_area *va; 643 644 va = find_vmap_area(addr); 645 BUG_ON(!va); 646 free_unmap_vmap_area(va); 647} 648 649 650/*** Per cpu kva allocator ***/ 651 652/* 653 * vmap space is limited especially on 32 bit architectures. Ensure there is 654 * room for at least 16 percpu vmap blocks per CPU. 655 */ 656/* 657 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able 658 * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess 659 * instead (we just need a rough idea) 660 */ 661#if BITS_PER_LONG == 32 662#define VMALLOC_SPACE (128UL*1024*1024) 663#else 664#define VMALLOC_SPACE (128UL*1024*1024*1024) 665#endif 666 667#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) 668#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ 669#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ 670#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) 671#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ 672#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ 673#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ 674 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ 675 VMALLOC_PAGES / NR_CPUS / 16)) 676 677#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) 678 679static bool vmap_initialized __read_mostly = false; 680 681struct vmap_block_queue { 682 spinlock_t lock; 683 struct list_head free; 684}; 685 686struct vmap_block { 687 spinlock_t lock; 688 struct vmap_area *va; 689 struct vmap_block_queue *vbq; 690 unsigned long free, dirty; 691 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); 692 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 693 struct list_head free_list; 694 struct rcu_head rcu_head; 695 struct list_head purge; 696}; 697 698/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ 699static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); 700 701/* 702 * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block 703 * in the free path. Could get rid of this if we change the API to return a 704 * "cookie" from alloc, to be passed to free. But no big deal yet. 705 */ 706static DEFINE_SPINLOCK(vmap_block_tree_lock); 707static RADIX_TREE(vmap_block_tree, GFP_ATOMIC); 708 709/* 710 * We should probably have a fallback mechanism to allocate virtual memory 711 * out of partially filled vmap blocks. However vmap block sizing should be 712 * fairly reasonable according to the vmalloc size, so it shouldn't be a 713 * big problem. 714 */ 715 716static unsigned long addr_to_vb_idx(unsigned long addr) 717{ 718 addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); 719 addr /= VMAP_BLOCK_SIZE; 720 return addr; 721} 722 723static struct vmap_block *new_vmap_block(gfp_t gfp_mask) 724{ 725 struct vmap_block_queue *vbq; 726 struct vmap_block *vb; 727 struct vmap_area *va; 728 unsigned long vb_idx; 729 int node, err; 730 731 node = numa_node_id(); 732 733 vb = kmalloc_node(sizeof(struct vmap_block), 734 gfp_mask & GFP_RECLAIM_MASK, node); 735 if (unlikely(!vb)) 736 return ERR_PTR(-ENOMEM); 737 738 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, 739 VMALLOC_START, VMALLOC_END, 740 node, gfp_mask); 741 if (unlikely(IS_ERR(va))) { 742 kfree(vb); 743 return ERR_CAST(va); 744 } 745 746 err = radix_tree_preload(gfp_mask); 747 if (unlikely(err)) { 748 kfree(vb); 749 free_vmap_area(va); 750 return ERR_PTR(err); 751 } 752 753 spin_lock_init(&vb->lock); 754 vb->va = va; 755 vb->free = VMAP_BBMAP_BITS; 756 vb->dirty = 0; 757 bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); 758 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); 759 INIT_LIST_HEAD(&vb->free_list); 760 761 vb_idx = addr_to_vb_idx(va->va_start); 762 spin_lock(&vmap_block_tree_lock); 763 err = radix_tree_insert(&vmap_block_tree, vb_idx, vb); 764 spin_unlock(&vmap_block_tree_lock); 765 BUG_ON(err); 766 radix_tree_preload_end(); 767 768 vbq = &get_cpu_var(vmap_block_queue); 769 vb->vbq = vbq; 770 spin_lock(&vbq->lock); 771 list_add_rcu(&vb->free_list, &vbq->free); 772 spin_unlock(&vbq->lock); 773 put_cpu_var(vmap_block_queue); 774 775 return vb; 776} 777 778static void rcu_free_vb(struct rcu_head *head) 779{ 780 struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); 781 782 kfree(vb); 783} 784 785static void free_vmap_block(struct vmap_block *vb) 786{ 787 struct vmap_block *tmp; 788 unsigned long vb_idx; 789 790 vb_idx = addr_to_vb_idx(vb->va->va_start); 791 spin_lock(&vmap_block_tree_lock); 792 tmp = radix_tree_delete(&vmap_block_tree, vb_idx); 793 spin_unlock(&vmap_block_tree_lock); 794 BUG_ON(tmp != vb); 795 796 free_unmap_vmap_area_noflush(vb->va); 797 call_rcu(&vb->rcu_head, rcu_free_vb); 798} 799 800static void purge_fragmented_blocks(int cpu) 801{ 802 LIST_HEAD(purge); 803 struct vmap_block *vb; 804 struct vmap_block *n_vb; 805 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); 806 807 rcu_read_lock(); 808 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 809 810 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS)) 811 continue; 812 813 spin_lock(&vb->lock); 814 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { 815 vb->free = 0; /* prevent further allocs after releasing lock */ 816 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ 817 bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS); 818 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); 819 spin_lock(&vbq->lock); 820 list_del_rcu(&vb->free_list); 821 spin_unlock(&vbq->lock); 822 spin_unlock(&vb->lock); 823 list_add_tail(&vb->purge, &purge); 824 } else 825 spin_unlock(&vb->lock); 826 } 827 rcu_read_unlock(); 828 829 list_for_each_entry_safe(vb, n_vb, &purge, purge) { 830 list_del(&vb->purge); 831 free_vmap_block(vb); 832 } 833} 834 835static void purge_fragmented_blocks_thiscpu(void) 836{ 837 purge_fragmented_blocks(smp_processor_id()); 838} 839 840static void purge_fragmented_blocks_allcpus(void) 841{ 842 int cpu; 843 844 for_each_possible_cpu(cpu) 845 purge_fragmented_blocks(cpu); 846} 847 848static void *vb_alloc(unsigned long size, gfp_t gfp_mask) 849{ 850 struct vmap_block_queue *vbq; 851 struct vmap_block *vb; 852 unsigned long addr = 0; 853 unsigned int order; 854 int purge = 0; 855 856 BUG_ON(size & ~PAGE_MASK); 857 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 858 order = get_order(size); 859 860again: 861 rcu_read_lock(); 862 vbq = &get_cpu_var(vmap_block_queue); 863 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 864 int i; 865 866 spin_lock(&vb->lock); 867 if (vb->free < 1UL << order) 868 goto next; 869 870 i = bitmap_find_free_region(vb->alloc_map, 871 VMAP_BBMAP_BITS, order); 872 873 if (i < 0) { 874 if (vb->free + vb->dirty == VMAP_BBMAP_BITS) { 875 /* fragmented and no outstanding allocations */ 876 BUG_ON(vb->dirty != VMAP_BBMAP_BITS); 877 purge = 1; 878 } 879 goto next; 880 } 881 addr = vb->va->va_start + (i << PAGE_SHIFT); 882 BUG_ON(addr_to_vb_idx(addr) != 883 addr_to_vb_idx(vb->va->va_start)); 884 vb->free -= 1UL << order; 885 if (vb->free == 0) { 886 spin_lock(&vbq->lock); 887 list_del_rcu(&vb->free_list); 888 spin_unlock(&vbq->lock); 889 } 890 spin_unlock(&vb->lock); 891 break; 892next: 893 spin_unlock(&vb->lock); 894 } 895 896 if (purge) 897 purge_fragmented_blocks_thiscpu(); 898 899 put_cpu_var(vmap_block_queue); 900 rcu_read_unlock(); 901 902 if (!addr) { 903 vb = new_vmap_block(gfp_mask); 904 if (IS_ERR(vb)) 905 return vb; 906 goto again; 907 } 908 909 return (void *)addr; 910} 911 912static void vb_free(const void *addr, unsigned long size) 913{ 914 unsigned long offset; 915 unsigned long vb_idx; 916 unsigned int order; 917 struct vmap_block *vb; 918 919 BUG_ON(size & ~PAGE_MASK); 920 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 921 922 flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); 923 924 order = get_order(size); 925 926 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); 927 928 vb_idx = addr_to_vb_idx((unsigned long)addr); 929 rcu_read_lock(); 930 vb = radix_tree_lookup(&vmap_block_tree, vb_idx); 931 rcu_read_unlock(); 932 BUG_ON(!vb); 933 934 spin_lock(&vb->lock); 935 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); 936 937 vb->dirty += 1UL << order; 938 if (vb->dirty == VMAP_BBMAP_BITS) { 939 BUG_ON(vb->free); 940 spin_unlock(&vb->lock); 941 free_vmap_block(vb); 942 } else 943 spin_unlock(&vb->lock); 944} 945 946/** 947 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer 948 * 949 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily 950 * to amortize TLB flushing overheads. What this means is that any page you 951 * have now, may, in a former life, have been mapped into kernel virtual 952 * address by the vmap layer and so there might be some CPUs with TLB entries 953 * still referencing that page (additional to the regular 1:1 kernel mapping). 954 * 955 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can 956 * be sure that none of the pages we have control over will have any aliases 957 * from the vmap layer. 958 */ 959void vm_unmap_aliases(void) 960{ 961 unsigned long start = ULONG_MAX, end = 0; 962 int cpu; 963 int flush = 0; 964 965 if (unlikely(!vmap_initialized)) 966 return; 967 968 for_each_possible_cpu(cpu) { 969 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); 970 struct vmap_block *vb; 971 972 rcu_read_lock(); 973 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 974 int i; 975 976 spin_lock(&vb->lock); 977 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); 978 while (i < VMAP_BBMAP_BITS) { 979 unsigned long s, e; 980 int j; 981 j = find_next_zero_bit(vb->dirty_map, 982 VMAP_BBMAP_BITS, i); 983 984 s = vb->va->va_start + (i << PAGE_SHIFT); 985 e = vb->va->va_start + (j << PAGE_SHIFT); 986 vunmap_page_range(s, e); 987 flush = 1; 988 989 if (s < start) 990 start = s; 991 if (e > end) 992 end = e; 993 994 i = j; 995 i = find_next_bit(vb->dirty_map, 996 VMAP_BBMAP_BITS, i); 997 } 998 spin_unlock(&vb->lock); 999 } 1000 rcu_read_unlock(); 1001 } 1002 1003 __purge_vmap_area_lazy(&start, &end, 1, flush); 1004} 1005EXPORT_SYMBOL_GPL(vm_unmap_aliases); 1006 1007/** 1008 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram 1009 * @mem: the pointer returned by vm_map_ram 1010 * @count: the count passed to that vm_map_ram call (cannot unmap partial) 1011 */ 1012void vm_unmap_ram(const void *mem, unsigned int count) 1013{ 1014 unsigned long size = count << PAGE_SHIFT; 1015 unsigned long addr = (unsigned long)mem; 1016 1017 BUG_ON(!addr); 1018 BUG_ON(addr < VMALLOC_START); 1019 BUG_ON(addr > VMALLOC_END); 1020 BUG_ON(addr & (PAGE_SIZE-1)); 1021 1022 debug_check_no_locks_freed(mem, size); 1023 vmap_debug_free_range(addr, addr+size); 1024 1025 if (likely(count <= VMAP_MAX_ALLOC)) 1026 vb_free(mem, size); 1027 else 1028 free_unmap_vmap_area_addr(addr); 1029} 1030EXPORT_SYMBOL(vm_unmap_ram); 1031 1032/** 1033 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) 1034 * @pages: an array of pointers to the pages to be mapped 1035 * @count: number of pages 1036 * @node: prefer to allocate data structures on this node 1037 * @prot: memory protection to use. PAGE_KERNEL for regular RAM 1038 * 1039 * Returns: a pointer to the address that has been mapped, or %NULL on failure 1040 */ 1041void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) 1042{ 1043 unsigned long size = count << PAGE_SHIFT; 1044 unsigned long addr; 1045 void *mem; 1046 1047 if (likely(count <= VMAP_MAX_ALLOC)) { 1048 mem = vb_alloc(size, GFP_KERNEL); 1049 if (IS_ERR(mem)) 1050 return NULL; 1051 addr = (unsigned long)mem; 1052 } else { 1053 struct vmap_area *va; 1054 va = alloc_vmap_area(size, PAGE_SIZE, 1055 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); 1056 if (IS_ERR(va)) 1057 return NULL; 1058 1059 addr = va->va_start; 1060 mem = (void *)addr; 1061 } 1062 if (vmap_page_range(addr, addr + size, prot, pages) < 0) { 1063 vm_unmap_ram(mem, count); 1064 return NULL; 1065 } 1066 return mem; 1067} 1068EXPORT_SYMBOL(vm_map_ram); 1069 1070/** 1071 * vm_area_register_early - register vmap area early during boot 1072 * @vm: vm_struct to register 1073 * @align: requested alignment 1074 * 1075 * This function is used to register kernel vm area before 1076 * vmalloc_init() is called. @vm->size and @vm->flags should contain 1077 * proper values on entry and other fields should be zero. On return, 1078 * vm->addr contains the allocated address. 1079 * 1080 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. 1081 */ 1082void __init vm_area_register_early(struct vm_struct *vm, size_t align) 1083{ 1084 static size_t vm_init_off __initdata; 1085 unsigned long addr; 1086 1087 addr = ALIGN(VMALLOC_START + vm_init_off, align); 1088 vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; 1089 1090 vm->addr = (void *)addr; 1091 1092 vm->next = vmlist; 1093 vmlist = vm; 1094} 1095 1096void __init vmalloc_init(void) 1097{ 1098 struct vmap_area *va; 1099 struct vm_struct *tmp; 1100 int i; 1101 1102 for_each_possible_cpu(i) { 1103 struct vmap_block_queue *vbq; 1104 1105 vbq = &per_cpu(vmap_block_queue, i); 1106 spin_lock_init(&vbq->lock); 1107 INIT_LIST_HEAD(&vbq->free); 1108 } 1109 1110 /* Import existing vmlist entries. */ 1111 for (tmp = vmlist; tmp; tmp = tmp->next) { 1112 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); 1113 va->flags = tmp->flags | VM_VM_AREA; 1114 va->va_start = (unsigned long)tmp->addr; 1115 va->va_end = va->va_start + tmp->size; 1116 __insert_vmap_area(va); 1117 } 1118 1119 vmap_area_pcpu_hole = VMALLOC_END; 1120 1121 vmap_initialized = true; 1122} 1123 1124/** 1125 * map_kernel_range_noflush - map kernel VM area with the specified pages 1126 * @addr: start of the VM area to map 1127 * @size: size of the VM area to map 1128 * @prot: page protection flags to use 1129 * @pages: pages to map 1130 * 1131 * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size 1132 * specify should have been allocated using get_vm_area() and its 1133 * friends. 1134 * 1135 * NOTE: 1136 * This function does NOT do any cache flushing. The caller is 1137 * responsible for calling flush_cache_vmap() on to-be-mapped areas 1138 * before calling this function. 1139 * 1140 * RETURNS: 1141 * The number of pages mapped on success, -errno on failure. 1142 */ 1143int map_kernel_range_noflush(unsigned long addr, unsigned long size, 1144 pgprot_t prot, struct page **pages) 1145{ 1146 return vmap_page_range_noflush(addr, addr + size, prot, pages); 1147} 1148 1149/** 1150 * unmap_kernel_range_noflush - unmap kernel VM area 1151 * @addr: start of the VM area to unmap 1152 * @size: size of the VM area to unmap 1153 * 1154 * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size 1155 * specify should have been allocated using get_vm_area() and its 1156 * friends. 1157 * 1158 * NOTE: 1159 * This function does NOT do any cache flushing. The caller is 1160 * responsible for calling flush_cache_vunmap() on to-be-mapped areas 1161 * before calling this function and flush_tlb_kernel_range() after. 1162 */ 1163void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) 1164{ 1165 vunmap_page_range(addr, addr + size); 1166} 1167 1168/** 1169 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB 1170 * @addr: start of the VM area to unmap 1171 * @size: size of the VM area to unmap 1172 * 1173 * Similar to unmap_kernel_range_noflush() but flushes vcache before 1174 * the unmapping and tlb after. 1175 */ 1176void unmap_kernel_range(unsigned long addr, unsigned long size) 1177{ 1178 unsigned long end = addr + size; 1179 1180 flush_cache_vunmap(addr, end); 1181 vunmap_page_range(addr, end); 1182 flush_tlb_kernel_range(addr, end); 1183} 1184 1185int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) 1186{ 1187 unsigned long addr = (unsigned long)area->addr; 1188 unsigned long end = addr + area->size - PAGE_SIZE; 1189 int err; 1190 1191 err = vmap_page_range(addr, end, prot, *pages); 1192 if (err > 0) { 1193 *pages += err; 1194 err = 0; 1195 } 1196 1197 return err; 1198} 1199EXPORT_SYMBOL_GPL(map_vm_area); 1200 1201/*** Old vmalloc interfaces ***/ 1202DEFINE_RWLOCK(vmlist_lock); 1203struct vm_struct *vmlist; 1204 1205static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1206 unsigned long flags, void *caller) 1207{ 1208 struct vm_struct *tmp, **p; 1209 1210 vm->flags = flags; 1211 vm->addr = (void *)va->va_start; 1212 vm->size = va->va_end - va->va_start; 1213 vm->caller = caller; 1214 va->private = vm; 1215 va->flags |= VM_VM_AREA; 1216 1217 write_lock(&vmlist_lock); 1218 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { 1219 if (tmp->addr >= vm->addr) 1220 break; 1221 } 1222 vm->next = *p; 1223 *p = vm; 1224 write_unlock(&vmlist_lock); 1225} 1226 1227static struct vm_struct *__get_vm_area_node(unsigned long size, 1228 unsigned long align, unsigned long flags, unsigned long start, 1229 unsigned long end, int node, gfp_t gfp_mask, void *caller) 1230{ 1231 static struct vmap_area *va; 1232 struct vm_struct *area; 1233 1234 BUG_ON(in_interrupt()); 1235 if (flags & VM_IOREMAP) { 1236 int bit = fls(size); 1237 1238 if (bit > IOREMAP_MAX_ORDER) 1239 bit = IOREMAP_MAX_ORDER; 1240 else if (bit < PAGE_SHIFT) 1241 bit = PAGE_SHIFT; 1242 1243 align = 1ul << bit; 1244 } 1245 1246 size = PAGE_ALIGN(size); 1247 if (unlikely(!size)) 1248 return NULL; 1249 1250 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 1251 if (unlikely(!area)) 1252 return NULL; 1253 1254 /* 1255 * We always allocate a guard page. 1256 */ 1257 size += PAGE_SIZE; 1258 1259 va = alloc_vmap_area(size, align, start, end, node, gfp_mask); 1260 if (IS_ERR(va)) { 1261 kfree(area); 1262 return NULL; 1263 } 1264 1265 insert_vmalloc_vm(area, va, flags, caller); 1266 return area; 1267} 1268 1269struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 1270 unsigned long start, unsigned long end) 1271{ 1272 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, 1273 __builtin_return_address(0)); 1274} 1275EXPORT_SYMBOL_GPL(__get_vm_area); 1276 1277struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, 1278 unsigned long start, unsigned long end, 1279 void *caller) 1280{ 1281 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, 1282 caller); 1283} 1284 1285/** 1286 * get_vm_area - reserve a contiguous kernel virtual area 1287 * @size: size of the area 1288 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 1289 * 1290 * Search an area of @size in the kernel virtual mapping area, 1291 * and reserved it for out purposes. Returns the area descriptor 1292 * on success or %NULL on failure. 1293 */ 1294struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 1295{ 1296 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1297 -1, GFP_KERNEL, __builtin_return_address(0)); 1298} 1299 1300struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 1301 void *caller) 1302{ 1303 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1304 -1, GFP_KERNEL, caller); 1305} 1306 1307struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, 1308 int node, gfp_t gfp_mask) 1309{ 1310 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1311 node, gfp_mask, __builtin_return_address(0)); 1312} 1313 1314static struct vm_struct *find_vm_area(const void *addr) 1315{ 1316 struct vmap_area *va; 1317 1318 va = find_vmap_area((unsigned long)addr); 1319 if (va && va->flags & VM_VM_AREA) 1320 return va->private; 1321 1322 return NULL; 1323} 1324 1325/** 1326 * remove_vm_area - find and remove a continuous kernel virtual area 1327 * @addr: base address 1328 * 1329 * Search for the kernel VM area starting at @addr, and remove it. 1330 * This function returns the found VM area, but using it is NOT safe 1331 * on SMP machines, except for its size or flags. 1332 */ 1333struct vm_struct *remove_vm_area(const void *addr) 1334{ 1335 struct vmap_area *va; 1336 1337 va = find_vmap_area((unsigned long)addr); 1338 if (va && va->flags & VM_VM_AREA) { 1339 struct vm_struct *vm = va->private; 1340 struct vm_struct *tmp, **p; 1341 /* 1342 * remove from list and disallow access to this vm_struct 1343 * before unmap. (address range confliction is maintained by 1344 * vmap.) 1345 */ 1346 write_lock(&vmlist_lock); 1347 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) 1348 ; 1349 *p = tmp->next; 1350 write_unlock(&vmlist_lock); 1351 1352 vmap_debug_free_range(va->va_start, va->va_end); 1353 free_unmap_vmap_area(va); 1354 vm->size -= PAGE_SIZE; 1355 1356 return vm; 1357 } 1358 return NULL; 1359} 1360 1361static void __vunmap(const void *addr, int deallocate_pages) 1362{ 1363 struct vm_struct *area; 1364 1365 if (!addr) 1366 return; 1367 1368 if ((PAGE_SIZE-1) & (unsigned long)addr) { 1369 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); 1370 return; 1371 } 1372 1373 area = remove_vm_area(addr); 1374 if (unlikely(!area)) { 1375 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 1376 addr); 1377 return; 1378 } 1379 1380 debug_check_no_locks_freed(addr, area->size); 1381 debug_check_no_obj_freed(addr, area->size); 1382 1383 if (deallocate_pages) { 1384 int i; 1385 1386 for (i = 0; i < area->nr_pages; i++) { 1387 struct page *page = area->pages[i]; 1388 1389 BUG_ON(!page); 1390 __free_page(page); 1391 } 1392 1393 if (area->flags & VM_VPAGES) 1394 vfree(area->pages); 1395 else 1396 kfree(area->pages); 1397 } 1398 1399 kfree(area); 1400 return; 1401} 1402 1403/** 1404 * vfree - release memory allocated by vmalloc() 1405 * @addr: memory base address 1406 * 1407 * Free the virtually continuous memory area starting at @addr, as 1408 * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is 1409 * NULL, no operation is performed. 1410 * 1411 * Must not be called in interrupt context. 1412 */ 1413void vfree(const void *addr) 1414{ 1415 BUG_ON(in_interrupt()); 1416 1417 kmemleak_free(addr); 1418 1419 __vunmap(addr, 1); 1420} 1421EXPORT_SYMBOL(vfree); 1422 1423/** 1424 * vunmap - release virtual mapping obtained by vmap() 1425 * @addr: memory base address 1426 * 1427 * Free the virtually contiguous memory area starting at @addr, 1428 * which was created from the page array passed to vmap(). 1429 * 1430 * Must not be called in interrupt context. 1431 */ 1432void vunmap(const void *addr) 1433{ 1434 BUG_ON(in_interrupt()); 1435 might_sleep(); 1436 __vunmap(addr, 0); 1437} 1438EXPORT_SYMBOL(vunmap); 1439 1440/** 1441 * vmap - map an array of pages into virtually contiguous space 1442 * @pages: array of page pointers 1443 * @count: number of pages to map 1444 * @flags: vm_area->flags 1445 * @prot: page protection for the mapping 1446 * 1447 * Maps @count pages from @pages into contiguous kernel virtual 1448 * space. 1449 */ 1450void *vmap(struct page **pages, unsigned int count, 1451 unsigned long flags, pgprot_t prot) 1452{ 1453 struct vm_struct *area; 1454 1455 might_sleep(); 1456 1457 if (count > totalram_pages) 1458 return NULL; 1459 1460 area = get_vm_area_caller((count << PAGE_SHIFT), flags, 1461 __builtin_return_address(0)); 1462 if (!area) 1463 return NULL; 1464 1465 if (map_vm_area(area, prot, &pages)) { 1466 vunmap(area->addr); 1467 return NULL; 1468 } 1469 1470 return area->addr; 1471} 1472EXPORT_SYMBOL(vmap); 1473 1474static void *__vmalloc_node(unsigned long size, unsigned long align, 1475 gfp_t gfp_mask, pgprot_t prot, 1476 int node, void *caller); 1477static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1478 pgprot_t prot, int node, void *caller) 1479{ 1480 struct page **pages; 1481 unsigned int nr_pages, array_size, i; 1482 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 1483 1484 nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; 1485 array_size = (nr_pages * sizeof(struct page *)); 1486 1487 area->nr_pages = nr_pages; 1488 /* Please note that the recursion is strictly bounded. */ 1489 if (array_size > PAGE_SIZE) { 1490 pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, 1491 PAGE_KERNEL, node, caller); 1492 area->flags |= VM_VPAGES; 1493 } else { 1494 pages = kmalloc_node(array_size, nested_gfp, node); 1495 } 1496 area->pages = pages; 1497 area->caller = caller; 1498 if (!area->pages) { 1499 remove_vm_area(area->addr); 1500 kfree(area); 1501 return NULL; 1502 } 1503 1504 for (i = 0; i < area->nr_pages; i++) { 1505 struct page *page; 1506 1507 if (node < 0) 1508 page = alloc_page(gfp_mask); 1509 else 1510 page = alloc_pages_node(node, gfp_mask, 0); 1511 1512 if (unlikely(!page)) { 1513 /* Successfully allocated i pages, free them in __vunmap() */ 1514 area->nr_pages = i; 1515 goto fail; 1516 } 1517 area->pages[i] = page; 1518 } 1519 1520 if (map_vm_area(area, prot, &pages)) 1521 goto fail; 1522 return area->addr; 1523 1524fail: 1525 vfree(area->addr); 1526 return NULL; 1527} 1528 1529void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) 1530{ 1531 void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, 1532 __builtin_return_address(0)); 1533 1534 /* 1535 * A ref_count = 3 is needed because the vm_struct and vmap_area 1536 * structures allocated in the __get_vm_area_node() function contain 1537 * references to the virtual address of the vmalloc'ed block. 1538 */ 1539 kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); 1540 1541 return addr; 1542} 1543 1544/** 1545 * __vmalloc_node - allocate virtually contiguous memory 1546 * @size: allocation size 1547 * @align: desired alignment 1548 * @gfp_mask: flags for the page level allocator 1549 * @prot: protection mask for the allocated pages 1550 * @node: node to use for allocation or -1 1551 * @caller: caller's return address 1552 * 1553 * Allocate enough pages to cover @size from the page level 1554 * allocator with @gfp_mask flags. Map them into contiguous 1555 * kernel virtual space, using a pagetable protection of @prot. 1556 */ 1557static void *__vmalloc_node(unsigned long size, unsigned long align, 1558 gfp_t gfp_mask, pgprot_t prot, 1559 int node, void *caller) 1560{ 1561 struct vm_struct *area; 1562 void *addr; 1563 unsigned long real_size = size; 1564 1565 size = PAGE_ALIGN(size); 1566 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1567 return NULL; 1568 1569 area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, 1570 VMALLOC_END, node, gfp_mask, caller); 1571 1572 if (!area) 1573 return NULL; 1574 1575 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1576 1577 /* 1578 * A ref_count = 3 is needed because the vm_struct and vmap_area 1579 * structures allocated in the __get_vm_area_node() function contain 1580 * references to the virtual address of the vmalloc'ed block. 1581 */ 1582 kmemleak_alloc(addr, real_size, 3, gfp_mask); 1583 1584 return addr; 1585} 1586 1587void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1588{ 1589 return __vmalloc_node(size, 1, gfp_mask, prot, -1, 1590 __builtin_return_address(0)); 1591} 1592EXPORT_SYMBOL(__vmalloc); 1593 1594/** 1595 * vmalloc - allocate virtually contiguous memory 1596 * @size: allocation size 1597 * Allocate enough pages to cover @size from the page level 1598 * allocator and map them into contiguous kernel virtual space. 1599 * 1600 * For tight control over page level allocator and protection flags 1601 * use __vmalloc() instead. 1602 */ 1603void *vmalloc(unsigned long size) 1604{ 1605 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1606 -1, __builtin_return_address(0)); 1607} 1608EXPORT_SYMBOL(vmalloc); 1609 1610/** 1611 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace 1612 * @size: allocation size 1613 * 1614 * The resulting memory area is zeroed so it can be mapped to userspace 1615 * without leaking data. 1616 */ 1617void *vmalloc_user(unsigned long size) 1618{ 1619 struct vm_struct *area; 1620 void *ret; 1621 1622 ret = __vmalloc_node(size, SHMLBA, 1623 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, 1624 PAGE_KERNEL, -1, __builtin_return_address(0)); 1625 if (ret) { 1626 area = find_vm_area(ret); 1627 area->flags |= VM_USERMAP; 1628 } 1629 return ret; 1630} 1631EXPORT_SYMBOL(vmalloc_user); 1632 1633/** 1634 * vmalloc_node - allocate memory on a specific node 1635 * @size: allocation size 1636 * @node: numa node 1637 * 1638 * Allocate enough pages to cover @size from the page level 1639 * allocator and map them into contiguous kernel virtual space. 1640 * 1641 * For tight control over page level allocator and protection flags 1642 * use __vmalloc() instead. 1643 */ 1644void *vmalloc_node(unsigned long size, int node) 1645{ 1646 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1647 node, __builtin_return_address(0)); 1648} 1649EXPORT_SYMBOL(vmalloc_node); 1650 1651#ifndef PAGE_KERNEL_EXEC 1652# define PAGE_KERNEL_EXEC PAGE_KERNEL 1653#endif 1654 1655/** 1656 * vmalloc_exec - allocate virtually contiguous, executable memory 1657 * @size: allocation size 1658 * 1659 * Kernel-internal function to allocate enough pages to cover @size 1660 * the page level allocator and map them into contiguous and 1661 * executable kernel virtual space. 1662 * 1663 * For tight control over page level allocator and protection flags 1664 * use __vmalloc() instead. 1665 */ 1666 1667void *vmalloc_exec(unsigned long size) 1668{ 1669 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, 1670 -1, __builtin_return_address(0)); 1671} 1672 1673#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 1674#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL 1675#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) 1676#define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL 1677#else 1678#define GFP_VMALLOC32 GFP_KERNEL 1679#endif 1680 1681/** 1682 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 1683 * @size: allocation size 1684 * 1685 * Allocate enough 32bit PA addressable pages to cover @size from the 1686 * page level allocator and map them into contiguous kernel virtual space. 1687 */ 1688void *vmalloc_32(unsigned long size) 1689{ 1690 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, 1691 -1, __builtin_return_address(0)); 1692} 1693EXPORT_SYMBOL(vmalloc_32); 1694 1695/** 1696 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory 1697 * @size: allocation size 1698 * 1699 * The resulting memory area is 32bit addressable and zeroed so it can be 1700 * mapped to userspace without leaking data. 1701 */ 1702void *vmalloc_32_user(unsigned long size) 1703{ 1704 struct vm_struct *area; 1705 void *ret; 1706 1707 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 1708 -1, __builtin_return_address(0)); 1709 if (ret) { 1710 area = find_vm_area(ret); 1711 area->flags |= VM_USERMAP; 1712 } 1713 return ret; 1714} 1715EXPORT_SYMBOL(vmalloc_32_user); 1716 1717/* 1718 * small helper routine , copy contents to buf from addr. 1719 * If the page is not present, fill zero. 1720 */ 1721 1722static int aligned_vread(char *buf, char *addr, unsigned long count) 1723{ 1724 struct page *p; 1725 int copied = 0; 1726 1727 while (count) { 1728 unsigned long offset, length; 1729 1730 offset = (unsigned long)addr & ~PAGE_MASK; 1731 length = PAGE_SIZE - offset; 1732 if (length > count) 1733 length = count; 1734 p = vmalloc_to_page(addr); 1735 /* 1736 * To do safe access to this _mapped_ area, we need 1737 * lock. But adding lock here means that we need to add 1738 * overhead of vmalloc()/vfree() calles for this _debug_ 1739 * interface, rarely used. Instead of that, we'll use 1740 * kmap() and get small overhead in this access function. 1741 */ 1742 if (p) { 1743 /* 1744 * we can expect USER0 is not used (see vread/vwrite's 1745 * function description) 1746 */ 1747 void *map = kmap_atomic(p, KM_USER0); 1748 memcpy(buf, map + offset, length); 1749 kunmap_atomic(map, KM_USER0); 1750 } else 1751 memset(buf, 0, length); 1752 1753 addr += length; 1754 buf += length; 1755 copied += length; 1756 count -= length; 1757 } 1758 return copied; 1759} 1760 1761static int aligned_vwrite(char *buf, char *addr, unsigned long count) 1762{ 1763 struct page *p; 1764 int copied = 0; 1765 1766 while (count) { 1767 unsigned long offset, length; 1768 1769 offset = (unsigned long)addr & ~PAGE_MASK; 1770 length = PAGE_SIZE - offset; 1771 if (length > count) 1772 length = count; 1773 p = vmalloc_to_page(addr); 1774 /* 1775 * To do safe access to this _mapped_ area, we need 1776 * lock. But adding lock here means that we need to add 1777 * overhead of vmalloc()/vfree() calles for this _debug_ 1778 * interface, rarely used. Instead of that, we'll use 1779 * kmap() and get small overhead in this access function. 1780 */ 1781 if (p) { 1782 /* 1783 * we can expect USER0 is not used (see vread/vwrite's 1784 * function description) 1785 */ 1786 void *map = kmap_atomic(p, KM_USER0); 1787 memcpy(map + offset, buf, length); 1788 kunmap_atomic(map, KM_USER0); 1789 } 1790 addr += length; 1791 buf += length; 1792 copied += length; 1793 count -= length; 1794 } 1795 return copied; 1796} 1797 1798/** 1799 * vread() - read vmalloc area in a safe way. 1800 * @buf: buffer for reading data 1801 * @addr: vm address. 1802 * @count: number of bytes to be read. 1803 * 1804 * Returns # of bytes which addr and buf should be increased. 1805 * (same number to @count). Returns 0 if [addr...addr+count) doesn't 1806 * includes any intersect with alive vmalloc area. 1807 * 1808 * This function checks that addr is a valid vmalloc'ed area, and 1809 * copy data from that area to a given buffer. If the given memory range 1810 * of [addr...addr+count) includes some valid address, data is copied to 1811 * proper area of @buf. If there are memory holes, they'll be zero-filled. 1812 * IOREMAP area is treated as memory hole and no copy is done. 1813 * 1814 * If [addr...addr+count) doesn't includes any intersects with alive 1815 * vm_struct area, returns 0. 1816 * @buf should be kernel's buffer. Because this function uses KM_USER0, 1817 * the caller should guarantee KM_USER0 is not used. 1818 * 1819 * Note: In usual ops, vread() is never necessary because the caller 1820 * should know vmalloc() area is valid and can use memcpy(). 1821 * This is for routines which have to access vmalloc area without 1822 * any informaion, as /dev/kmem. 1823 * 1824 */ 1825 1826long vread(char *buf, char *addr, unsigned long count) 1827{ 1828 struct vm_struct *tmp; 1829 char *vaddr, *buf_start = buf; 1830 unsigned long buflen = count; 1831 unsigned long n; 1832 1833 /* Don't allow overflow */ 1834 if ((unsigned long) addr + count < count) 1835 count = -(unsigned long) addr; 1836 1837 read_lock(&vmlist_lock); 1838 for (tmp = vmlist; count && tmp; tmp = tmp->next) { 1839 vaddr = (char *) tmp->addr; 1840 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1841 continue; 1842 while (addr < vaddr) { 1843 if (count == 0) 1844 goto finished; 1845 *buf = '\0'; 1846 buf++; 1847 addr++; 1848 count--; 1849 } 1850 n = vaddr + tmp->size - PAGE_SIZE - addr; 1851 if (n > count) 1852 n = count; 1853 if (!(tmp->flags & VM_IOREMAP)) 1854 aligned_vread(buf, addr, n); 1855 else /* IOREMAP area is treated as memory hole */ 1856 memset(buf, 0, n); 1857 buf += n; 1858 addr += n; 1859 count -= n; 1860 } 1861finished: 1862 read_unlock(&vmlist_lock); 1863 1864 if (buf == buf_start) 1865 return 0; 1866 /* zero-fill memory holes */ 1867 if (buf != buf_start + buflen) 1868 memset(buf, 0, buflen - (buf - buf_start)); 1869 1870 return buflen; 1871} 1872 1873/** 1874 * vwrite() - write vmalloc area in a safe way. 1875 * @buf: buffer for source data 1876 * @addr: vm address. 1877 * @count: number of bytes to be read. 1878 * 1879 * Returns # of bytes which addr and buf should be incresed. 1880 * (same number to @count). 1881 * If [addr...addr+count) doesn't includes any intersect with valid 1882 * vmalloc area, returns 0. 1883 * 1884 * This function checks that addr is a valid vmalloc'ed area, and 1885 * copy data from a buffer to the given addr. If specified range of 1886 * [addr...addr+count) includes some valid address, data is copied from 1887 * proper area of @buf. If there are memory holes, no copy to hole. 1888 * IOREMAP area is treated as memory hole and no copy is done. 1889 * 1890 * If [addr...addr+count) doesn't includes any intersects with alive 1891 * vm_struct area, returns 0. 1892 * @buf should be kernel's buffer. Because this function uses KM_USER0, 1893 * the caller should guarantee KM_USER0 is not used. 1894 * 1895 * Note: In usual ops, vwrite() is never necessary because the caller 1896 * should know vmalloc() area is valid and can use memcpy(). 1897 * This is for routines which have to access vmalloc area without 1898 * any informaion, as /dev/kmem. 1899 * 1900 * The caller should guarantee KM_USER1 is not used. 1901 */ 1902 1903long vwrite(char *buf, char *addr, unsigned long count) 1904{ 1905 struct vm_struct *tmp; 1906 char *vaddr; 1907 unsigned long n, buflen; 1908 int copied = 0; 1909 1910 /* Don't allow overflow */ 1911 if ((unsigned long) addr + count < count) 1912 count = -(unsigned long) addr; 1913 buflen = count; 1914 1915 read_lock(&vmlist_lock); 1916 for (tmp = vmlist; count && tmp; tmp = tmp->next) { 1917 vaddr = (char *) tmp->addr; 1918 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1919 continue; 1920 while (addr < vaddr) { 1921 if (count == 0) 1922 goto finished; 1923 buf++; 1924 addr++; 1925 count--; 1926 } 1927 n = vaddr + tmp->size - PAGE_SIZE - addr; 1928 if (n > count) 1929 n = count; 1930 if (!(tmp->flags & VM_IOREMAP)) { 1931 aligned_vwrite(buf, addr, n); 1932 copied++; 1933 } 1934 buf += n; 1935 addr += n; 1936 count -= n; 1937 } 1938finished: 1939 read_unlock(&vmlist_lock); 1940 if (!copied) 1941 return 0; 1942 return buflen; 1943} 1944 1945/** 1946 * remap_vmalloc_range - map vmalloc pages to userspace 1947 * @vma: vma to cover (map full range of vma) 1948 * @addr: vmalloc memory 1949 * @pgoff: number of pages into addr before first page to map 1950 * 1951 * Returns: 0 for success, -Exxx on failure 1952 * 1953 * This function checks that addr is a valid vmalloc'ed area, and 1954 * that it is big enough to cover the vma. Will return failure if 1955 * that criteria isn't met. 1956 * 1957 * Similar to remap_pfn_range() (see mm/memory.c) 1958 */ 1959int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 1960 unsigned long pgoff) 1961{ 1962 struct vm_struct *area; 1963 unsigned long uaddr = vma->vm_start; 1964 unsigned long usize = vma->vm_end - vma->vm_start; 1965 1966 if ((PAGE_SIZE-1) & (unsigned long)addr) 1967 return -EINVAL; 1968 1969 area = find_vm_area(addr); 1970 if (!area) 1971 return -EINVAL; 1972 1973 if (!(area->flags & VM_USERMAP)) 1974 return -EINVAL; 1975 1976 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) 1977 return -EINVAL; 1978 1979 addr += pgoff << PAGE_SHIFT; 1980 do { 1981 struct page *page = vmalloc_to_page(addr); 1982 int ret; 1983 1984 ret = vm_insert_page(vma, uaddr, page); 1985 if (ret) 1986 return ret; 1987 1988 uaddr += PAGE_SIZE; 1989 addr += PAGE_SIZE; 1990 usize -= PAGE_SIZE; 1991 } while (usize > 0); 1992 1993 /* Prevent "things" like memory migration? VM_flags need a cleanup... */ 1994 vma->vm_flags |= VM_RESERVED; 1995 1996 return 0; 1997} 1998EXPORT_SYMBOL(remap_vmalloc_range); 1999 2000/* 2001 * Implement a stub for vmalloc_sync_all() if the architecture chose not to 2002 * have one. 2003 */ 2004void __attribute__((weak)) vmalloc_sync_all(void) 2005{ 2006} 2007 2008 2009static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) 2010{ 2011 /* apply_to_page_range() does all the hard work. */ 2012 return 0; 2013} 2014 2015/** 2016 * alloc_vm_area - allocate a range of kernel address space 2017 * @size: size of the area 2018 * 2019 * Returns: NULL on failure, vm_struct on success 2020 * 2021 * This function reserves a range of kernel address space, and 2022 * allocates pagetables to map that range. No actual mappings 2023 * are created. If the kernel address space is not shared 2024 * between processes, it syncs the pagetable across all 2025 * processes. 2026 */ 2027struct vm_struct *alloc_vm_area(size_t size) 2028{ 2029 struct vm_struct *area; 2030 2031 area = get_vm_area_caller(size, VM_IOREMAP, 2032 __builtin_return_address(0)); 2033 if (area == NULL) 2034 return NULL; 2035 2036 /* 2037 * This ensures that page tables are constructed for this region 2038 * of kernel virtual address space and mapped into init_mm. 2039 */ 2040 if (apply_to_page_range(&init_mm, (unsigned long)area->addr, 2041 area->size, f, NULL)) { 2042 free_vm_area(area); 2043 return NULL; 2044 } 2045 2046 /* Make sure the pagetables are constructed in process kernel 2047 mappings */ 2048 vmalloc_sync_all(); 2049 2050 return area; 2051} 2052EXPORT_SYMBOL_GPL(alloc_vm_area); 2053 2054void free_vm_area(struct vm_struct *area) 2055{ 2056 struct vm_struct *ret; 2057 ret = remove_vm_area(area->addr); 2058 BUG_ON(ret != area); 2059 kfree(area); 2060} 2061EXPORT_SYMBOL_GPL(free_vm_area); 2062 2063static struct vmap_area *node_to_va(struct rb_node *n) 2064{ 2065 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; 2066} 2067 2068/** 2069 * pvm_find_next_prev - find the next and prev vmap_area surrounding @end 2070 * @end: target address 2071 * @pnext: out arg for the next vmap_area 2072 * @pprev: out arg for the previous vmap_area 2073 * 2074 * Returns: %true if either or both of next and prev are found, 2075 * %false if no vmap_area exists 2076 * 2077 * Find vmap_areas end addresses of which enclose @end. ie. if not 2078 * NULL, *pnext->va_end > @end and *pprev->va_end <= @end. 2079 */ 2080static bool pvm_find_next_prev(unsigned long end, 2081 struct vmap_area **pnext, 2082 struct vmap_area **pprev) 2083{ 2084 struct rb_node *n = vmap_area_root.rb_node; 2085 struct vmap_area *va = NULL; 2086 2087 while (n) { 2088 va = rb_entry(n, struct vmap_area, rb_node); 2089 if (end < va->va_end) 2090 n = n->rb_left; 2091 else if (end > va->va_end) 2092 n = n->rb_right; 2093 else 2094 break; 2095 } 2096 2097 if (!va) 2098 return false; 2099 2100 if (va->va_end > end) { 2101 *pnext = va; 2102 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); 2103 } else { 2104 *pprev = va; 2105 *pnext = node_to_va(rb_next(&(*pprev)->rb_node)); 2106 } 2107 return true; 2108} 2109 2110/** 2111 * pvm_determine_end - find the highest aligned address between two vmap_areas 2112 * @pnext: in/out arg for the next vmap_area 2113 * @pprev: in/out arg for the previous vmap_area 2114 * @align: alignment 2115 * 2116 * Returns: determined end address 2117 * 2118 * Find the highest aligned address between *@pnext and *@pprev below 2119 * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned 2120 * down address is between the end addresses of the two vmap_areas. 2121 * 2122 * Please note that the address returned by this function may fall 2123 * inside *@pnext vmap_area. The caller is responsible for checking 2124 * that. 2125 */ 2126static unsigned long pvm_determine_end(struct vmap_area **pnext, 2127 struct vmap_area **pprev, 2128 unsigned long align) 2129{ 2130 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 2131 unsigned long addr; 2132 2133 if (*pnext) 2134 addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end); 2135 else 2136 addr = vmalloc_end; 2137 2138 while (*pprev && (*pprev)->va_end > addr) { 2139 *pnext = *pprev; 2140 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); 2141 } 2142 2143 return addr; 2144} 2145 2146/** 2147 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator 2148 * @offsets: array containing offset of each area 2149 * @sizes: array containing size of each area 2150 * @nr_vms: the number of areas to allocate 2151 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 2152 * @gfp_mask: allocation mask 2153 * 2154 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 2155 * vm_structs on success, %NULL on failure 2156 * 2157 * Percpu allocator wants to use congruent vm areas so that it can 2158 * maintain the offsets among percpu areas. This function allocates 2159 * congruent vmalloc areas for it. These areas tend to be scattered 2160 * pretty far, distance between two areas easily going up to 2161 * gigabytes. To avoid interacting with regular vmallocs, these areas 2162 * are allocated from top. 2163 * 2164 * Despite its complicated look, this allocator is rather simple. It 2165 * does everything top-down and scans areas from the end looking for 2166 * matching slot. While scanning, if any of the areas overlaps with 2167 * existing vmap_area, the base address is pulled down to fit the 2168 * area. Scanning is repeated till all the areas fit and then all 2169 * necessary data structres are inserted and the result is returned. 2170 */ 2171struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 2172 const size_t *sizes, int nr_vms, 2173 size_t align, gfp_t gfp_mask) 2174{ 2175 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 2176 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 2177 struct vmap_area **vas, *prev, *next; 2178 struct vm_struct **vms; 2179 int area, area2, last_area, term_area; 2180 unsigned long base, start, end, last_end; 2181 bool purged = false; 2182 2183 gfp_mask &= GFP_RECLAIM_MASK; 2184 2185 /* verify parameters and allocate data structures */ 2186 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); 2187 for (last_area = 0, area = 0; area < nr_vms; area++) { 2188 start = offsets[area]; 2189 end = start + sizes[area]; 2190 2191 /* is everything aligned properly? */ 2192 BUG_ON(!IS_ALIGNED(offsets[area], align)); 2193 BUG_ON(!IS_ALIGNED(sizes[area], align)); 2194 2195 /* detect the area with the highest address */ 2196 if (start > offsets[last_area]) 2197 last_area = area; 2198 2199 for (area2 = 0; area2 < nr_vms; area2++) { 2200 unsigned long start2 = offsets[area2]; 2201 unsigned long end2 = start2 + sizes[area2]; 2202 2203 if (area2 == area) 2204 continue; 2205 2206 BUG_ON(start2 >= start && start2 < end); 2207 BUG_ON(end2 <= end && end2 > start); 2208 } 2209 } 2210 last_end = offsets[last_area] + sizes[last_area]; 2211 2212 if (vmalloc_end - vmalloc_start < last_end) { 2213 WARN_ON(true); 2214 return NULL; 2215 } 2216 2217 vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); 2218 vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); 2219 if (!vas || !vms) 2220 goto err_free; 2221 2222 for (area = 0; area < nr_vms; area++) { 2223 vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); 2224 vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); 2225 if (!vas[area] || !vms[area]) 2226 goto err_free; 2227 } 2228retry: 2229 spin_lock(&vmap_area_lock); 2230 2231 /* start scanning - we scan from the top, begin with the last area */ 2232 area = term_area = last_area; 2233 start = offsets[area]; 2234 end = start + sizes[area]; 2235 2236 if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) { 2237 base = vmalloc_end - last_end; 2238 goto found; 2239 } 2240 base = pvm_determine_end(&next, &prev, align) - end; 2241 2242 while (true) { 2243 BUG_ON(next && next->va_end <= base + end); 2244 BUG_ON(prev && prev->va_end > base + end); 2245 2246 /* 2247 * base might have underflowed, add last_end before 2248 * comparing. 2249 */ 2250 if (base + last_end < vmalloc_start + last_end) { 2251 spin_unlock(&vmap_area_lock); 2252 if (!purged) { 2253 purge_vmap_area_lazy(); 2254 purged = true; 2255 goto retry; 2256 } 2257 goto err_free; 2258 } 2259 2260 /* 2261 * If next overlaps, move base downwards so that it's 2262 * right below next and then recheck. 2263 */ 2264 if (next && next->va_start < base + end) { 2265 base = pvm_determine_end(&next, &prev, align) - end; 2266 term_area = area; 2267 continue; 2268 } 2269 2270 /* 2271 * If prev overlaps, shift down next and prev and move 2272 * base so that it's right below new next and then 2273 * recheck. 2274 */ 2275 if (prev && prev->va_end > base + start) { 2276 next = prev; 2277 prev = node_to_va(rb_prev(&next->rb_node)); 2278 base = pvm_determine_end(&next, &prev, align) - end; 2279 term_area = area; 2280 continue; 2281 } 2282 2283 /* 2284 * This area fits, move on to the previous one. If 2285 * the previous one is the terminal one, we're done. 2286 */ 2287 area = (area + nr_vms - 1) % nr_vms; 2288 if (area == term_area) 2289 break; 2290 start = offsets[area]; 2291 end = start + sizes[area]; 2292 pvm_find_next_prev(base + end, &next, &prev); 2293 } 2294found: 2295 /* we've found a fitting base, insert all va's */ 2296 for (area = 0; area < nr_vms; area++) { 2297 struct vmap_area *va = vas[area]; 2298 2299 va->va_start = base + offsets[area]; 2300 va->va_end = va->va_start + sizes[area]; 2301 __insert_vmap_area(va); 2302 } 2303 2304 vmap_area_pcpu_hole = base + offsets[last_area]; 2305 2306 spin_unlock(&vmap_area_lock); 2307 2308 /* insert all vm's */ 2309 for (area = 0; area < nr_vms; area++) 2310 insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, 2311 pcpu_get_vm_areas); 2312 2313 kfree(vas); 2314 return vms; 2315 2316err_free: 2317 for (area = 0; area < nr_vms; area++) { 2318 if (vas) 2319 kfree(vas[area]); 2320 if (vms) 2321 kfree(vms[area]); 2322 } 2323 kfree(vas); 2324 kfree(vms); 2325 return NULL; 2326} 2327 2328/** 2329 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator 2330 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() 2331 * @nr_vms: the number of allocated areas 2332 * 2333 * Free vm_structs and the array allocated by pcpu_get_vm_areas(). 2334 */ 2335void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) 2336{ 2337 int i; 2338 2339 for (i = 0; i < nr_vms; i++) 2340 free_vm_area(vms[i]); 2341 kfree(vms); 2342} 2343 2344#ifdef CONFIG_PROC_FS 2345static void *s_start(struct seq_file *m, loff_t *pos) 2346{ 2347 loff_t n = *pos; 2348 struct vm_struct *v; 2349 2350 read_lock(&vmlist_lock); 2351 v = vmlist; 2352 while (n > 0 && v) { 2353 n--; 2354 v = v->next; 2355 } 2356 if (!n) 2357 return v; 2358 2359 return NULL; 2360 2361} 2362 2363static void *s_next(struct seq_file *m, void *p, loff_t *pos) 2364{ 2365 struct vm_struct *v = p; 2366 2367 ++*pos; 2368 return v->next; 2369} 2370 2371static void s_stop(struct seq_file *m, void *p) 2372{ 2373 read_unlock(&vmlist_lock); 2374} 2375 2376static void show_numa_info(struct seq_file *m, struct vm_struct *v) 2377{ 2378 if (NUMA_BUILD) { 2379 unsigned int nr, *counters = m->private; 2380 2381 if (!counters) 2382 return; 2383 2384 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 2385 2386 for (nr = 0; nr < v->nr_pages; nr++) 2387 counters[page_to_nid(v->pages[nr])]++; 2388 2389 for_each_node_state(nr, N_HIGH_MEMORY) 2390 if (counters[nr]) 2391 seq_printf(m, " N%u=%u", nr, counters[nr]); 2392 } 2393} 2394 2395static int s_show(struct seq_file *m, void *p) 2396{ 2397 struct vm_struct *v = p; 2398 2399 seq_printf(m, "0x%p-0x%p %7ld", 2400 v->addr, v->addr + v->size, v->size); 2401 2402 if (v->caller) { 2403 char buff[KSYM_SYMBOL_LEN]; 2404 2405 seq_putc(m, ' '); 2406 sprint_symbol(buff, (unsigned long)v->caller); 2407 seq_puts(m, buff); 2408 } 2409 2410 if (v->nr_pages) 2411 seq_printf(m, " pages=%d", v->nr_pages); 2412 2413 if (v->phys_addr) 2414 seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); 2415 2416 if (v->flags & VM_IOREMAP) 2417 seq_printf(m, " ioremap"); 2418 2419 if (v->flags & VM_ALLOC) 2420 seq_printf(m, " vmalloc"); 2421 2422 if (v->flags & VM_MAP) 2423 seq_printf(m, " vmap"); 2424 2425 if (v->flags & VM_USERMAP) 2426 seq_printf(m, " user"); 2427 2428 if (v->flags & VM_VPAGES) 2429 seq_printf(m, " vpages"); 2430 2431 show_numa_info(m, v); 2432 seq_putc(m, '\n'); 2433 return 0; 2434} 2435 2436static const struct seq_operations vmalloc_op = { 2437 .start = s_start, 2438 .next = s_next, 2439 .stop = s_stop, 2440 .show = s_show, 2441}; 2442 2443static int vmalloc_open(struct inode *inode, struct file *file) 2444{ 2445 unsigned int *ptr = NULL; 2446 int ret; 2447 2448 if (NUMA_BUILD) { 2449 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); 2450 if (ptr == NULL) 2451 return -ENOMEM; 2452 } 2453 ret = seq_open(file, &vmalloc_op); 2454 if (!ret) { 2455 struct seq_file *m = file->private_data; 2456 m->private = ptr; 2457 } else 2458 kfree(ptr); 2459 return ret; 2460} 2461 2462static const struct file_operations proc_vmalloc_operations = { 2463 .open = vmalloc_open, 2464 .read = seq_read, 2465 .llseek = seq_lseek, 2466 .release = seq_release_private, 2467}; 2468 2469static int __init proc_vmalloc_init(void) 2470{ 2471 proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations); 2472 return 0; 2473} 2474module_init(proc_vmalloc_init); 2475#endif 2476