/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel does not always handle that gracefully.
   could replace all the switch()es with a mempolicy_ops structure.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/rmap.h>
#include <linux/security.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
#define MPOL_MF_STATS        (MPOL_MF_INTERNAL << 2)	/* Gather statistics */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
	int empty = nodes_empty(*nodes);

	switch (mode) {
	case MPOL_DEFAULT:
		if (!empty)
			return -EINVAL;
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
		break;
	}
	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}

/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
	struct zonelist *zl;
	int num, max, nd;
	enum zone_type k;

	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
	max++;			/* space for zlcache_ptr (see mmzone.h) */
	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
	if (!zl)
		return ERR_PTR(-ENOMEM);
	zl->zlcache_ptr = NULL;
	num = 0;
	/* First put in the highest zones from all nodes, then all the next
	   lower zones etc. Avoid empty zones because the memory allocator
	   doesn't like them. If you implement node hot removal you
	   have to fix that. */
	k = policy_zone;
	while (1) {
		for_each_node_mask(nd, *nodes) {
			struct zone *z = &NODE_DATA(nd)->node_zones[k];
			if (z->present_pages > 0)
				zl->zones[num++] = z;
		}
		if (k == 0)
			break;
		k--;
	}
	if (num == 0) {
		kfree(zl);
		return ERR_PTR(-EINVAL);
	}
	zl->zones[num] = NULL;
	return zl;
}
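
/*
 * Illustration (not covering every configuration): with nodes = {0,2} and
 * policy_zone == ZONE_NORMAL on a machine where both nodes also have a
 * populated ZONE_DMA, bind_zonelist() above produces
 *
 *	zones[] = { node0/Normal, node2/Normal, node0/DMA, node2/DMA, NULL }
 *
 * i.e. all requested nodes at the highest policied zone first, then the
 * same nodes at each lower zone, with empty zones skipped.
 */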

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		policy->v.nodes = *nodes;
		if (nodes_weight(*nodes) == 0) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-EINVAL);
		}
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = first_node(*nodes);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (IS_ERR(policy->v.zonelist)) {
			void *error_code = policy->v.zonelist;
			kmem_cache_free(policy_cache, policy);
			return error_code;
		}
		break;
	}
	policy->policy = mode;
	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
	return policy;
}

static void gather_stats(struct page *, void *, int pte_dirty);
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * The check for PageReserved here is important to avoid
		 * handling zero pages and other pages that may have been
		 * marked special by the system.
		 *
		 * If PageReserved were not checked here then, e.g., the
		 * location of the zero page could have an influence
		 * on MPOL_MF_STRICT, zero pages would be counted for
		 * the per node stats, and there would be useless attempts
		 * to put zero pages on the migration list.
		 */
		if (PageReserved(page))
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
			continue;

		if (flags & MPOL_MF_STATS)
			gather_stats(page, private, pte_dirty(*pte));
		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
			migrate_page_add(page, private, flags);
		else
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags, void *private)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

		err = migrate_prep();
		if (err)
			return ERR_PTR(err);
	}

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return ERR_PTR(-EFAULT);
			if (prev && prev->vm_end < vma->vm_start)
				return ERR_PTR(-EFAULT);
		}
		if (!is_vm_hugetlb_page(vma) &&
		    ((flags & MPOL_MF_STRICT) ||
		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
		      vma_migratable(vma)))) {
			unsigned long endvma = vma->vm_end;

			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes,
						flags, private);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}

static int contextualize_policy(int mode, nodemask_t *nodes)
{
	if (!nodes)
		return 0;

	cpuset_update_task_memory_state();
	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
		return -EINVAL;
	return mpol_check_policy(mode, nodes);
}

/*
 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 * mempolicy.  Allows more rapid checking of this (combined perhaps
 * with other PF_* flag bits) on memory allocation hot code paths.
 *
 * If called from outside this file, the task 'p' should -only- be
 * a newly forked child not yet visible on the task list, because
 * manipulating the task flags of a visible task is not safe.
 *
 * The above limitation is why this routine has the funny name
 * mpol_fix_fork_child_flag().
 *
 * It is also safe to call this with a task pointer of current,
 * which the static wrapper mpol_set_task_struct_flag() does,
 * for use within this file.
 */

void mpol_fix_fork_child_flag(struct task_struct *p)
{
	if (p->mempolicy)
		p->flags |= PF_MEMPOLICY;
	else
		p->flags &= ~PF_MEMPOLICY;
}

static void mpol_set_task_struct_flag(void)
{
	mpol_fix_fork_child_flag(current);
}

/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
	struct mempolicy *new;

	if (contextualize_policy(mode, nodes))
		return -EINVAL;
	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	mpol_set_task_struct_flag();
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = first_node(new->v.nodes);
	return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
	int i;

	nodes_clear(*nodes);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			node_set(zone_to_nid(p->v.zonelist->zones[i]),
				 *nodes);
		break;
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			*nodes = node_online_map;
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
		      unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	cpuset_update_task_memory_state();
	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		*policy = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_zonemask(pol, nmask);

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

#ifdef CONFIG_MIGRATION
/*
 * page migration
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
		isolate_lru_page(page, pagelist);
}

static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
	return alloc_pages_node(node, GFP_HIGHUSER, 0);
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
{
	nodemask_t nmask;
	LIST_HEAD(pagelist);
	int err = 0;

	nodes_clear(nmask);
	node_set(source, nmask);

	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist))
		err = migrate_pages(&pagelist, new_node_page, dest);

	return err;
}
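
/*
 * Worked example (illustrative only) of the <source, dest> pair selection
 * done by do_migrate_pages() below: with from_nodes = {0,1} and
 * to_nodes = {1,2}, node_remap() maps 0 -> 1 and 1 -> 2.  The scan of
 * tmp = {0,1} first remembers <0,1>, but node 1 is still set in tmp, so the
 * scan continues and finds <1,2>, whose destination is outside tmp; that
 * pair is migrated first.  After clearing node 1 from tmp, the next pass
 * migrates <0,1>.  Emptying node 1 before moving node 0's pages onto it is
 * exactly the "dest not already occupied" behaviour described in the long
 * comment inside the function.
 */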

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	LIST_HEAD(pagelist);
	int busy = 0;
	int err = 0;
	nodemask_t tmp;

	down_read(&mm->mmap_sem);

	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
	if (err)
		goto out;

/*
 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 * bit in 'tmp', and return that <source, dest> pair for migration.
 * The pair of nodemasks 'to' and 'from' define the map.
 *
 * If no pair of bits is found that way, fallback to picking some
 * pair of 'source' and 'dest' bits that are not the same.  If the
 * 'source' and 'dest' bits are the same, this represents a node
 * that will be migrating to itself, so no pages need move.
 *
 * If no bits are left in 'tmp', or if all remaining bits left
 * in 'tmp' correspond to the same bit in 'to', return false
 * (nothing left to migrate).
 *
 * This lets us pick a pair of nodes to migrate between, such that
 * if possible the dest node is not already occupied by some other
 * source node, minimizing the risk of overloading the memory on a
 * node that would happen if we migrated incoming memory to a node
 * before migrating outgoing memory from that same node.
 *
 * A single scan of tmp is sufficient.  As we go, we remember the
 * most recent <s, d> pair that moved (s != d).  If we find a pair
 * that not only moved, but what's better, moved to an empty slot
 * (d is not set in tmp), then we break out then, with that pair.
 * Otherwise when we finish scanning tmp, we at least have the
 * most recent <s, d> pair that moved.  If we get all the way through
 * the scan of tmp without finding any node that moved, much less
 * moved to an empty node, then there is nothing left worth migrating.
 */

	tmp = *from_nodes;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = -1;
		int dest = 0;

		for_each_node_mask(s, tmp) {
			d = node_remap(s, *from_nodes, *to_nodes);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == -1)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
out:
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;

}

static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	struct vm_area_struct *vma = (struct vm_area_struct *)private;

	return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
}
#else

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
}

int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	return -ENOSYS;
}

static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	return NULL;
}
#endif

long do_mbind(unsigned long start, unsigned long len,
		unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
	    || mode > MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	if (mpol_check_policy(mode, nmask))
		return -EINVAL;

	new = mpol_new(mode, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
		 mode, nodes_addr(*nmask)[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	err = PTR_ERR(vma);
	if (!IS_ERR(vma)) {
		int nr_failed = 0;

		err = mbind_range(vma, start, end, new);

		if (!list_empty(&pagelist))
			nr_failed = migrate_pages(&pagelist, new_vma_page,
						(unsigned long)vma);

		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	}

	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */
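
/*
 * For orientation only, a sketch of how the syscalls below are typically
 * reached from user space (via the libnuma <numaif.h> wrappers or
 * syscall(2)); addr, len and mode are placeholders, and the exact maxnode
 * convention is the one described in get_nodes():
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 2);	-- nodes 0 and 2
 *
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
 *	mbind(addr, len, MPOL_BIND, &mask, 8 * sizeof(mask), MPOL_MF_STRICT);
 *
 * This is an illustrative sketch, not part of the interface definition.
 */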

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			unsigned long mode,
			unsigned long __user *nmask, unsigned long maxnode,
			unsigned flags)
{
	nodemask_t nodes;
	int err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
#ifdef CONFIG_CPUSETS
	/* Restrict the nodes to the allowed nodes in the cpuset */
	nodes_and(nodes, nodes, current->mems_allowed);
#endif
	return do_mbind(start, len, mode, &nodes, flags);
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
		unsigned long maxnode)
{
	int err;
	nodemask_t nodes;

	if (mode < 0 || mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, &nodes);
}

asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
		const unsigned long __user *old_nodes,
		const unsigned long __user *new_nodes)
{
	struct mm_struct *mm;
	struct task_struct *task;
	nodemask_t old;
	nodemask_t new;
	nodemask_t task_nodes;
	int err;

	err = get_nodes(&old, old_nodes, maxnode);
	if (err)
		return err;

	err = get_nodes(&new, new_nodes, maxnode);
	if (err)
		return err;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_pid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	if ((current->euid != task->suid) && (current->euid != task->uid) &&
	    (current->uid != task->suid) && (current->uid != task->uid) &&
	    !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out;
	}

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out;
	}

	err = security_task_movememory(task);
	if (err)
		goto out;

	err = do_migrate_pages(mm, &old, &new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
	mmput(mm);
	return err;
}


/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr, unsigned long flags)
{
	int err, pval;
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA */
static struct mempolicy *get_vma_policy(struct task_struct *task,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		if (gfp_zone(gfp) >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	me->il_next = next;
	return nid;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 */
unsigned slab_node(struct mempolicy *policy)
{
	int pol = policy ? policy->policy : MPOL_DEFAULT;

	switch (pol) {
	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_BIND:
		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
		return zone_to_nid(policy->v.zonelist->zones[0]);

	case MPOL_PREFERRED:
		if (policy->v.preferred_node >= 0)
			return policy->v.preferred_node;
		/* Fall through */

	default:
		return numa_node_id();
	}
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}

/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
		 struct vm_area_struct *vma, unsigned long addr, int shift)
{
	if (vma) {
		unsigned long off;

		/*
		 * for small pages, there is no difference between
		 * shift and PAGE_SHIFT, so the bit-shift is safe.
		 * for huge pages, since vm_pgoff is in units of small
		 * pages, we need to shift off the always 0 bits to get
		 * a useful offset.
		 */
		BUG_ON(shift < PAGE_SHIFT);
		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
		off += (addr - vma->vm_start) >> shift;
		return offset_il_node(pol, vma, off);
	} else
		return interleave_nodes(pol);
}
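
/*
 * Worked example (illustrative): for an interleave policy over nodes {0,2,3}
 * (weight 3) and a small-page mapping with vm_pgoff = 0, the page at
 * vm_start + 7 * PAGE_SIZE has off = 7, target = 7 % 3 = 1, and
 * offset_il_node() returns the second set node, i.e. node 2.  The chosen
 * node is a pure function of the mapping offset, which keeps the layout
 * stable across faults and across processes sharing the mapping.
 */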

#ifdef CONFIG_HUGETLBFS
/* Return a zonelist suitable for a huge page allocation. */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	if (pol->policy == MPOL_INTERLEAVE) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
	}
	return zonelist_policy(GFP_HIGHUSER, pol);
}
#endif

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0])
		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
	return page;
}

/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocations,
 *	%GFP_HIGHMEM highmem/user allocations,
 *	%GFP_FS      allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL the caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away. Should be used for
 * all allocations for pages that will be mapped into
 * user space. Returns NULL when no page can be allocated.
 *
 * Should be called with the mmap_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	cpuset_update_task_memory_state();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
		return alloc_page_interleave(gfp, 0, nid);
	}
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
1237 */ 1238struct page *alloc_pages_current(gfp_t gfp, unsigned order) 1239{ 1240 struct mempolicy *pol = current->mempolicy; 1241 1242 if ((gfp & __GFP_WAIT) && !in_interrupt()) 1243 cpuset_update_task_memory_state(); 1244 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1245 pol = &default_policy; 1246 if (pol->policy == MPOL_INTERLEAVE) 1247 return alloc_page_interleave(gfp, order, interleave_nodes(pol)); 1248 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol)); 1249} 1250EXPORT_SYMBOL(alloc_pages_current); 1251 1252/* 1253 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it 1254 * rebinds the mempolicy its copying by calling mpol_rebind_policy() 1255 * with the mems_allowed returned by cpuset_mems_allowed(). This 1256 * keeps mempolicies cpuset relative after its cpuset moves. See 1257 * further kernel/cpuset.c update_nodemask(). 1258 */ 1259void *cpuset_being_rebound; 1260 1261/* Slow path of a mempolicy copy */ 1262struct mempolicy *__mpol_copy(struct mempolicy *old) 1263{ 1264 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 1265 1266 if (!new) 1267 return ERR_PTR(-ENOMEM); 1268 if (current_cpuset_is_being_rebound()) { 1269 nodemask_t mems = cpuset_mems_allowed(current); 1270 mpol_rebind_policy(old, &mems); 1271 } 1272 *new = *old; 1273 atomic_set(&new->refcnt, 1); 1274 if (new->policy == MPOL_BIND) { 1275 int sz = ksize(old->v.zonelist); 1276 new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL); 1277 if (!new->v.zonelist) { 1278 kmem_cache_free(policy_cache, new); 1279 return ERR_PTR(-ENOMEM); 1280 } 1281 } 1282 return new; 1283} 1284 1285/* Slow path of a mempolicy comparison */ 1286int __mpol_equal(struct mempolicy *a, struct mempolicy *b) 1287{ 1288 if (!a || !b) 1289 return 0; 1290 if (a->policy != b->policy) 1291 return 0; 1292 switch (a->policy) { 1293 case MPOL_DEFAULT: 1294 return 1; 1295 case MPOL_INTERLEAVE: 1296 return nodes_equal(a->v.nodes, b->v.nodes); 1297 case MPOL_PREFERRED: 1298 return a->v.preferred_node == b->v.preferred_node; 1299 case MPOL_BIND: { 1300 int i; 1301 for (i = 0; a->v.zonelist->zones[i]; i++) 1302 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i]) 1303 return 0; 1304 return b->v.zonelist->zones[i] == NULL; 1305 } 1306 default: 1307 BUG(); 1308 return 0; 1309 } 1310} 1311 1312/* Slow path of a mpol destructor. */ 1313void __mpol_free(struct mempolicy *p) 1314{ 1315 if (!atomic_dec_and_test(&p->refcnt)) 1316 return; 1317 if (p->policy == MPOL_BIND) 1318 kfree(p->v.zonelist); 1319 p->policy = MPOL_DEFAULT; 1320 kmem_cache_free(policy_cache, p); 1321} 1322 1323/* 1324 * Shared memory backing store policy support. 1325 * 1326 * Remember policies even when nobody has shared memory mapped. 1327 * The policies are kept in Red-Black tree linked from the inode. 1328 * They are protected by the sp->lock spinlock, which should be held 1329 * for any accesses to the tree. 
1330 */ 1331 1332/* lookup first element intersecting start-end */ 1333/* Caller holds sp->lock */ 1334static struct sp_node * 1335sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 1336{ 1337 struct rb_node *n = sp->root.rb_node; 1338 1339 while (n) { 1340 struct sp_node *p = rb_entry(n, struct sp_node, nd); 1341 1342 if (start >= p->end) 1343 n = n->rb_right; 1344 else if (end <= p->start) 1345 n = n->rb_left; 1346 else 1347 break; 1348 } 1349 if (!n) 1350 return NULL; 1351 for (;;) { 1352 struct sp_node *w = NULL; 1353 struct rb_node *prev = rb_prev(n); 1354 if (!prev) 1355 break; 1356 w = rb_entry(prev, struct sp_node, nd); 1357 if (w->end <= start) 1358 break; 1359 n = prev; 1360 } 1361 return rb_entry(n, struct sp_node, nd); 1362} 1363 1364/* Insert a new shared policy into the list. */ 1365/* Caller holds sp->lock */ 1366static void sp_insert(struct shared_policy *sp, struct sp_node *new) 1367{ 1368 struct rb_node **p = &sp->root.rb_node; 1369 struct rb_node *parent = NULL; 1370 struct sp_node *nd; 1371 1372 while (*p) { 1373 parent = *p; 1374 nd = rb_entry(parent, struct sp_node, nd); 1375 if (new->start < nd->start) 1376 p = &(*p)->rb_left; 1377 else if (new->end > nd->end) 1378 p = &(*p)->rb_right; 1379 else 1380 BUG(); 1381 } 1382 rb_link_node(&new->nd, parent, p); 1383 rb_insert_color(&new->nd, &sp->root); 1384 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end, 1385 new->policy ? new->policy->policy : 0); 1386} 1387 1388/* Find shared policy intersecting idx */ 1389struct mempolicy * 1390mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) 1391{ 1392 struct mempolicy *pol = NULL; 1393 struct sp_node *sn; 1394 1395 if (!sp->root.rb_node) 1396 return NULL; 1397 spin_lock(&sp->lock); 1398 sn = sp_lookup(sp, idx, idx+1); 1399 if (sn) { 1400 mpol_get(sn->policy); 1401 pol = sn->policy; 1402 } 1403 spin_unlock(&sp->lock); 1404 return pol; 1405} 1406 1407static void sp_delete(struct shared_policy *sp, struct sp_node *n) 1408{ 1409 PDprintk("deleting %lx-l%x\n", n->start, n->end); 1410 rb_erase(&n->nd, &sp->root); 1411 mpol_free(n->policy); 1412 kmem_cache_free(sn_cache, n); 1413} 1414 1415struct sp_node * 1416sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) 1417{ 1418 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 1419 1420 if (!n) 1421 return NULL; 1422 n->start = start; 1423 n->end = end; 1424 mpol_get(pol); 1425 n->policy = pol; 1426 return n; 1427} 1428 1429/* Replace a policy range. */ 1430static int shared_policy_replace(struct shared_policy *sp, unsigned long start, 1431 unsigned long end, struct sp_node *new) 1432{ 1433 struct sp_node *n, *new2 = NULL; 1434 1435restart: 1436 spin_lock(&sp->lock); 1437 n = sp_lookup(sp, start, end); 1438 /* Take care of old policies in the same range. */ 1439 while (n && n->start < end) { 1440 struct rb_node *next = rb_next(&n->nd); 1441 if (n->start >= start) { 1442 if (n->end <= end) 1443 sp_delete(sp, n); 1444 else 1445 n->start = end; 1446 } else { 1447 /* Old policy spanning whole new range. 

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}

void mpol_shared_policy_init(struct shared_policy *info, int policy,
				nodemask_t *policy_nodes)
{
	info->root = RB_ROOT;
	spin_lock_init(&info->lock);

	if (policy != MPOL_DEFAULT) {
		struct mempolicy *newpol;

		/* Falls back to MPOL_DEFAULT on any error */
		newpol = mpol_new(policy, policy_nodes);
		if (!IS_ERR(newpol)) {
			/* Create pseudo-vma that contains just the policy */
			struct vm_area_struct pvma;

			memset(&pvma, 0, sizeof(struct vm_area_struct));
			/* Policy covers entire file */
			pvma.vm_end = TASK_SIZE;
			mpol_set_shared_policy(info, &pvma, newpol);
			mpol_free(newpol);
		}
	}
}

int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */

	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
		printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, NULL);
}
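
/*
 * Illustrative example for mpol_rebind_policy() below: an MPOL_INTERLEAVE
 * policy built over nodes {0,1} while the cpuset allowed {0,1} is remapped,
 * when the cpuset moves to {4,5}, to interleave over {4,5}; nodes keep
 * their relative position within the allowed set rather than their
 * absolute ids.
 */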

/* Migrate a policy to a different set of nodes */
void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
	nodemask_t *mpolmask;
	nodemask_t tmp;

	if (!pol)
		return;
	mpolmask = &pol->cpuset_mems_allowed;
	if (nodes_equal(*mpolmask, *newmask))
		return;

	switch (pol->policy) {
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
		pol->v.nodes = tmp;
		current->il_next = node_remap(current->il_next,
						*mpolmask, *newmask);
		*mpolmask = *newmask;
		break;
	case MPOL_PREFERRED:
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						*mpolmask, *newmask);
		*mpolmask = *newmask;
		break;
	case MPOL_BIND: {
		nodemask_t nodes;
		struct zone **z;
		struct zonelist *zonelist;

		nodes_clear(nodes);
		for (z = pol->v.zonelist->zones; *z; z++)
			node_set(zone_to_nid(*z), nodes);
		nodes_remap(tmp, nodes, *mpolmask, *newmask);
		nodes = tmp;

		zonelist = bind_zonelist(&nodes);

		/* If there is no memory, bind_zonelist() fails and we keep
		 * the old zonelist.  If that old zonelist has no remaining
		 * mems_allowed nodes, then zonelist_policy() will "FALL
		 * THROUGH" to MPOL_DEFAULT.
		 */

		if (!IS_ERR(zonelist)) {
			/* Good - got mem - substitute new zonelist */
			kfree(pol->v.zonelist);
			pol->v.zonelist = zonelist;
		}
		*mpolmask = *newmask;
		break;
	}
	default:
		BUG();
		break;
	}
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		mpol_rebind_policy(vma->vm_policy, new);
	up_write(&mm->mmap_sem);
}

/*
 * Display pages allocated per node and memory policy via /proc.
 */

static const char * const policy_types[] =
	{ "default", "prefer", "bind", "interleave" };

/*
 * Convert a mempolicy into a string.
 * Returns the number of characters in buffer (if positive)
 * or an error (negative)
 */
static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
	char *p = buffer;
	int l;
	nodemask_t nodes;
	int mode = pol ? pol->policy : MPOL_DEFAULT;

	switch (mode) {
	case MPOL_DEFAULT:
		nodes_clear(nodes);
		break;

	case MPOL_PREFERRED:
		nodes_clear(nodes);
		node_set(pol->v.preferred_node, nodes);
		break;

	case MPOL_BIND:
		get_zonemask(pol, &nodes);
		break;

	case MPOL_INTERLEAVE:
		nodes = pol->v.nodes;
		break;

	default:
		BUG();
		return -EFAULT;
	}

	l = strlen(policy_types[mode]);
	if (buffer + maxlen < p + l + 1)
		return -ENOSPC;

	strcpy(p, policy_types[mode]);
	p += l;

	if (!nodes_empty(nodes)) {
		if (buffer + maxlen < p + 2)
			return -ENOSPC;
		*p++ = '=';
		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
	}
	return p - buffer;
}

struct numa_maps {
	unsigned long pages;
	unsigned long anon;
	unsigned long active;
	unsigned long writeback;
	unsigned long mapcount_max;
	unsigned long dirty;
	unsigned long swapcache;
	unsigned long node[MAX_NUMNODES];
};

static void gather_stats(struct page *page, void *private, int pte_dirty)
{
	struct numa_maps *md = private;
	int count = page_mapcount(page);

	md->pages++;
	if (pte_dirty || PageDirty(page))
		md->dirty++;

	if (PageSwapCache(page))
		md->swapcache++;

	if (PageActive(page))
		md->active++;

	if (PageWriteback(page))
		md->writeback++;

	if (PageAnon(page))
		md->anon++;

	if (count > md->mapcount_max)
		md->mapcount_max = count;

	md->node[page_to_nid(page)]++;
}

#ifdef CONFIG_HUGETLB_PAGE
static void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
	unsigned long addr;
	struct page *page;

	for (addr = start; addr < end; addr += HPAGE_SIZE) {
		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
		pte_t pte;

		if (!ptep)
			continue;

		pte = *ptep;
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (!page)
			continue;

		gather_stats(page, md, pte_dirty(*ptep));
	}
}
#else
static inline void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
}
#endif
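
/*
 * For reference, show_numa_map() below emits one line per VMA into
 * /proc/<pid>/numa_maps, roughly of the form (illustrative values):
 *
 *	00400000 default file=/bin/cat mapped=7 mapmax=2 N0=5 N2=2
 *
 * i.e. start address, the policy string from mpol_to_str(), an optional
 * file/heap/stack/huge tag, the non-zero counters gathered above, and a
 * per-node page count for every node that has pages mapped here.
 */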

int show_numa_map(struct seq_file *m, void *v)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *vma = v;
	struct numa_maps *md;
	struct file *file = vma->vm_file;
	struct mm_struct *mm = vma->vm_mm;
	int n;
	char buffer[50];

	if (!mm)
		return 0;

	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
	if (!md)
		return 0;

	mpol_to_str(buffer, sizeof(buffer),
			get_vma_policy(priv->task, vma, vma->vm_start));

	seq_printf(m, "%08lx %s", vma->vm_start, buffer);

	if (file) {
		seq_printf(m, " file=");
		seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
		seq_printf(m, " heap");
	} else if (vma->vm_start <= mm->start_stack &&
			vma->vm_end >= mm->start_stack) {
		seq_printf(m, " stack");
	}

	if (is_vm_hugetlb_page(vma)) {
		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
		seq_printf(m, " huge");
	} else {
		check_pgd_range(vma, vma->vm_start, vma->vm_end,
				&node_online_map, MPOL_MF_STATS, md);
	}

	if (!md->pages)
		goto out;

	if (md->anon)
		seq_printf(m, " anon=%lu", md->anon);

	if (md->dirty)
		seq_printf(m, " dirty=%lu", md->dirty);

	if (md->pages != md->anon && md->pages != md->dirty)
		seq_printf(m, " mapped=%lu", md->pages);

	if (md->mapcount_max > 1)
		seq_printf(m, " mapmax=%lu", md->mapcount_max);

	if (md->swapcache)
		seq_printf(m, " swapcache=%lu", md->swapcache);

	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
		seq_printf(m, " active=%lu", md->active);

	if (md->writeback)
		seq_printf(m, " writeback=%lu", md->writeback);

	for_each_online_node(n)
		if (md->node[n])
			seq_printf(m, " N%d=%lu", n, md->node[n]);
out:
	seq_putc(m, '\n');
	kfree(md);

	if (m->count < m->size)
		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
	return 0;
}