/*
 * Memory Migration functionality - linux/mm/migration.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter
 */

#include <linux/migrate.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/pagevec.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/memcontrol.h>
#include <linux/syscalls.h>
#include <linux/gfp.h>

#include "internal.h"

#include <asm/tlbflush.h>

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

/*
 * migrate_prep() needs to be called before we start compiling a list of pages
 * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
 * undesirable, use migrate_prep_local()
 */
int migrate_prep(void)
{
	/*
	 * Clear the LRU lists so pages can be isolated.
	 * Note that pages may be moved off the LRU after we have
	 * drained them. Those pages will fail to migrate like other
	 * pages that may be busy.
	 */
	lru_add_drain_all();

	return 0;
}

/* Do the necessary work of migrate_prep but not if it involves other CPUs */
int migrate_prep_local(void)
{
	lru_add_drain();

	return 0;
}

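/*
 * Typical in-kernel use of these helpers, sketched here for orientation
 * (the move_pages() path later in this file is one real example): call
 * migrate_prep(), isolate the pages to be moved onto a private list with
 * isolate_lru_page(), then hand the list to migrate_pages() together with
 * an allocation callback:
 *
 *	migrate_prep();
 *	...
 *	if (!isolate_lru_page(page)) {
 *		list_add_tail(&page->lru, &pagelist);
 *		inc_zone_page_state(page, NR_ISOLATED_ANON +
 *					page_is_file_cache(page));
 *	}
 *	...
 *	err = migrate_pages(&pagelist, alloc_target_page, private, 0);
 *
 * Pages that could not be migrated are returned to the LRU by
 * putback_lru_pages()/putback_lru_page(). alloc_target_page stands for
 * any new_page_t allocation callback (new_page_node() below is one
 * example); it is not a real symbol.
 */
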
/*
 * Add isolated pages on the list back to the LRU under page lock
 * to avoid leaking evictable pages back onto unevictable list.
 */
void putback_lru_pages(struct list_head *l)
{
	struct page *page;
	struct page *page2;

	list_for_each_entry_safe(page, page2, l, lru) {
		list_del(&page->lru);
		dec_zone_page_state(page, NR_ISOLATED_ANON +
				page_is_file_cache(page));
		putback_lru_page(page);
	}
}

/*
 * Restore a potential migration pte to a working pte entry
 */
static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
				 unsigned long addr, void *old)
{
	struct mm_struct *mm = vma->vm_mm;
	swp_entry_t entry;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		goto out;

	ptep = pte_offset_map(pmd, addr);

	if (!is_swap_pte(*ptep)) {
		pte_unmap(ptep);
		goto out;
	}

	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto unlock;

	entry = pte_to_swp_entry(pte);

	if (!is_migration_entry(entry) ||
	    migration_entry_to_page(entry) != old)
		goto unlock;

	get_page(new);
	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
	if (is_write_migration_entry(entry))
		pte = pte_mkwrite(pte);
	flush_cache_page(vma, addr, pte_pfn(pte));
	set_pte_at(mm, addr, ptep, pte);

	if (PageAnon(new))
		page_add_anon_rmap(new, vma, addr);
	else
		page_add_file_rmap(new);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, addr, ptep);
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return SWAP_AGAIN;
}

/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
static void remove_migration_ptes(struct page *old, struct page *new)
{
	rmap_walk(new, remove_migration_pte, old);
}

/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 *
 * This function is called from do_swap_page().
 */
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
				unsigned long address)
{
	pte_t *ptep, pte;
	spinlock_t *ptl;
	swp_entry_t entry;
	struct page *page;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);
	if (!is_migration_entry(entry))
		goto out;

	page = migration_entry_to_page(entry);

	/*
	 * Once the radix-tree replacement step of page migration has started,
	 * page_count *must* be zero. And we don't want to call
	 * wait_on_page_locked() against a page without holding a reference
	 * from get_page(), so we use get_page_unless_zero() here. Even if it
	 * fails, the page fault will simply occur again.
	 */
	if (!get_page_unless_zero(page))
		goto out;
	pte_unmap_unlock(ptep, ptl);
	wait_on_page_locked(page);
	put_page(page);
	return;
out:
	pte_unmap_unlock(ptep, ptl);
}

/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 * 1 for anonymous pages without a mapping
 * 2 for pages with a mapping
 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 */
static int migrate_page_move_mapping(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	int expected_count;
	void **pslot;

	if (!mapping) {
		/* Anonymous page without mapping */
		if (page_count(page) != 1)
			return -EAGAIN;
		return 0;
	}

	spin_lock_irq(&mapping->tree_lock);

	pslot = radix_tree_lookup_slot(&mapping->page_tree,
					page_index(page));

	expected_count = 2 + page_has_private(page);
	if (page_count(page) != expected_count ||
	    (struct page *)radix_tree_deref_slot(pslot) != page) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	if (!page_freeze_refs(page, expected_count)) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	/*
	 * Now we know that no one else is looking at the page.
	 */
	get_page(newpage);	/* add cache reference */
	if (PageSwapCache(page)) {
		SetPageSwapCache(newpage);
		set_page_private(newpage, page_private(page));
	}

	radix_tree_replace_slot(pslot, newpage);

	page_unfreeze_refs(page, expected_count);
	/*
	 * Drop cache reference from old page.
	 * We know this isn't the last reference.
	 */
	__put_page(page);

	/*
	 * If moved to a different zone then also account
	 * the page for that zone. Other VM counters will be
	 * taken care of when we establish references to the
	 * new page and drop references to the old page.
	 *
	 * Note that anonymous pages are accounted for
	 * via NR_FILE_PAGES and NR_ANON_PAGES if they
	 * are mapped to swap space.
	 */
	__dec_zone_page_state(page, NR_FILE_PAGES);
	__inc_zone_page_state(newpage, NR_FILE_PAGES);
	if (PageSwapBacked(page)) {
		__dec_zone_page_state(page, NR_SHMEM);
		__inc_zone_page_state(newpage, NR_SHMEM);
	}
	spin_unlock_irq(&mapping->tree_lock);

	return 0;
}

/*
 * Copy the page to its new location
 */
static void migrate_page_copy(struct page *newpage, struct page *page)
{
	copy_highpage(newpage, page);

	if (PageError(page))
		SetPageError(newpage);
	if (PageReferenced(page))
		SetPageReferenced(newpage);
	if (PageUptodate(page))
		SetPageUptodate(newpage);
	if (TestClearPageActive(page)) {
		VM_BUG_ON(PageUnevictable(page));
		SetPageActive(newpage);
	} else if (TestClearPageUnevictable(page))
		SetPageUnevictable(newpage);
	if (PageChecked(page))
		SetPageChecked(newpage);
	if (PageMappedToDisk(page))
		SetPageMappedToDisk(newpage);

	if (PageDirty(page)) {
		clear_page_dirty_for_io(page);
		/*
		 * Want to mark the page and the radix tree as dirty, and
		 * redo the accounting that clear_page_dirty_for_io undid,
		 * but we can't use set_page_dirty because that function
		 * is actually a signal that all of the page has become dirty.
		 * Whereas only part of our page may be dirty.
		 */
		__set_page_dirty_nobuffers(newpage);
	}

	mlock_migrate_page(newpage, page);
	ksm_migrate_page(newpage, page);

	ClearPageSwapCache(page);
	ClearPagePrivate(page);
	set_page_private(page, 0);
	page->mapping = NULL;

	/*
	 * If any waiters have accumulated on the new page then
	 * wake them up.
	 */
	if (PageWriteback(newpage))
		end_page_writeback(newpage);
}

/************************************************************
 * Migration functions
 ***********************************************************/

/* Always fail migration. Used for mappings that are not movable */
int fail_migrate_page(struct address_space *mapping,
			struct page *newpage, struct page *page)
{
	return -EIO;
}
EXPORT_SYMBOL(fail_migrate_page);

/*
 * Common logic to directly migrate a single page suitable for
 * pages that do not use PagePrivate/PagePrivate2.
 *
 * Pages are locked upon entry and exit.
 */
int migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	int rc;

	BUG_ON(PageWriteback(page));	/* Writeback must be complete */

	rc = migrate_page_move_mapping(mapping, newpage, page);

	if (rc)
		return rc;

	migrate_page_copy(newpage, page);
	return 0;
}
EXPORT_SYMBOL(migrate_page);

#ifdef CONFIG_BLOCK
/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist.
 */
int buffer_migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	struct buffer_head *bh, *head;
	int rc;

	if (!page_has_buffers(page))
		return migrate_page(mapping, newpage, page);

	head = page_buffers(page);

	rc = migrate_page_move_mapping(mapping, newpage, page);

	if (rc)
		return rc;

	bh = head;
	do {
		get_bh(bh);
		lock_buffer(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	ClearPagePrivate(page);
	set_page_private(newpage, page_private(page));
	set_page_private(page, 0);
	put_page(page);
	get_page(newpage);

	bh = head;
	do {
		set_bh_page(bh, newpage, bh_offset(bh));
		bh = bh->b_this_page;

	} while (bh != head);

	SetPagePrivate(newpage);

	migrate_page_copy(newpage, page);

	bh = head;
	do {
		unlock_buffer(bh);
		put_bh(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	return 0;
}
EXPORT_SYMBOL(buffer_migrate_page);
#endif

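/*
 * A filesystem opts in to direct migration by pointing its
 * address_space_operations at one of the exported helpers above. For a
 * block-device backed filesystem that keeps buffer_heads attached to its
 * pagecache pages this would typically be buffer_migrate_page. Illustrative
 * sketch only; "example_aops", "example_readpage" and "example_writepage"
 * are not real symbols:
 *
 *	static const struct address_space_operations example_aops = {
 *		.readpage	= example_readpage,
 *		.writepage	= example_writepage,
 *		...
 *		.migratepage	= buffer_migrate_page,
 *	};
 *
 * move_to_new_page() below calls mapping->a_ops->migratepage() when it is
 * set and falls back to fallback_migrate_page() otherwise.
 */
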
/*
 * Writeback a page to clean the dirty state
 */
static int writeout(struct address_space *mapping, struct page *page)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.nr_to_write = 1,
		.range_start = 0,
		.range_end = LLONG_MAX,
		.nonblocking = 1,
		.for_reclaim = 1
	};
	int rc;

	if (!mapping->a_ops->writepage)
		/* No write method for the address space */
		return -EINVAL;

	if (!clear_page_dirty_for_io(page))
		/* Someone else already triggered a write */
		return -EAGAIN;

	/*
	 * A dirty page may imply that the underlying filesystem has
	 * the page on some queue. So the page must be clean for
	 * migration. Writeout may mean we lose the lock and the
	 * page state is no longer what we checked for earlier.
	 * At this point we know that the migration attempt cannot
	 * be successful.
	 */
	remove_migration_ptes(page, page);

	rc = mapping->a_ops->writepage(page, &wbc);

	if (rc != AOP_WRITEPAGE_ACTIVATE)
		/* unlocked. Relock */
		lock_page(page);

	return (rc < 0) ? -EIO : -EAGAIN;
}

/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_page(struct address_space *mapping,
	struct page *newpage, struct page *page)
{
	if (PageDirty(page))
		return writeout(mapping, page);

	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (page_has_private(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return -EAGAIN;

	return migrate_page(mapping, newpage, page);
}

/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  == 0 - success
 */
static int move_to_new_page(struct page *newpage, struct page *page,
					int remap_swapcache)
{
	struct address_space *mapping;
	int rc;

	/*
	 * Block others from accessing the page when we get around to
	 * establishing additional references. We are the only one
	 * holding a reference to the new page at this point.
	 */
	if (!trylock_page(newpage))
		BUG();

	/* Prepare mapping for the new page.*/
	newpage->index = page->index;
	newpage->mapping = page->mapping;
	if (PageSwapBacked(page))
		SetPageSwapBacked(newpage);

	mapping = page_mapping(page);
	if (!mapping)
		rc = migrate_page(mapping, newpage, page);
	else if (mapping->a_ops->migratepage)
		/*
		 * Most pages have a mapping and most filesystems
		 * should provide a migration function. Anonymous
		 * pages are part of swap space which also has its
		 * own migration function. This is the most common
		 * path for page migration.
		 */
		rc = mapping->a_ops->migratepage(mapping,
						newpage, page);
	else
		rc = fallback_migrate_page(mapping, newpage, page);

	if (rc) {
		newpage->mapping = NULL;
	} else {
		if (remap_swapcache)
			remove_migration_ptes(page, newpage);
	}

	unlock_page(newpage);

	return rc;
}

/*
 * Obtain the lock on page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 */
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
			struct page *page, int force, int offlining)
{
	int rc = 0;
	int *result = NULL;
	struct page *newpage = get_new_page(page, private, &result);
	int remap_swapcache = 1;
	int charge = 0;
	struct mem_cgroup *mem = NULL;
	struct anon_vma *anon_vma = NULL;

	if (!newpage)
		return -ENOMEM;

	if (page_count(page) == 1) {
		/* page was freed from under us. So we are done. */
		goto move_newpage;
	}

	/* prepare cgroup just returns 0 or -ENOMEM */
	rc = -EAGAIN;

	if (!trylock_page(page)) {
		if (!force)
			goto move_newpage;
		lock_page(page);
	}

	/*
	 * Only memory hotplug's offline_pages() caller has locked out KSM,
	 * and can safely migrate a KSM page. The other cases have skipped
	 * PageKsm along with PageReserved - but it is only now when we have
	 * the page lock that we can be certain it will not go KSM beneath us
	 * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
	 * its pagecount raised, but only here do we take the page lock which
	 * serializes that).
	 */
	if (PageKsm(page) && !offlining) {
		rc = -EBUSY;
		goto unlock;
	}

	/* charge against new page */
	charge = mem_cgroup_prepare_migration(page, newpage, &mem);
	if (charge == -ENOMEM) {
		rc = -ENOMEM;
		goto unlock;
	}
	BUG_ON(charge);

	if (PageWriteback(page)) {
		if (!force)
			goto uncharge;
		wait_on_page_writeback(page);
	}
	/*
	 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
	 * we cannot notice that anon_vma is freed while we migrate a page.
	 * This get_anon_vma() delays freeing the anon_vma pointer until the
	 * end of migration. File cache pages are no problem because of
	 * page_lock(); file caches may use write_page() or lock_page() in
	 * migration, so only anonymous pages need special care here.
	 */
	if (PageAnon(page)) {
		/*
		 * Only page_lock_anon_vma() understands the subtleties of
		 * getting a hold on an anon_vma from outside one of its mms.
		 */
		anon_vma = page_lock_anon_vma(page);
		if (anon_vma) {
			/*
			 * Take a reference count on the anon_vma if the
			 * page is mapped so that it is guaranteed to
			 * exist when the page is remapped later
			 */
			get_anon_vma(anon_vma);
			page_unlock_anon_vma(anon_vma);
		} else if (PageSwapCache(page)) {
			/*
			 * We cannot be sure that the anon_vma of an unmapped
			 * swapcache page is safe to use because we don't
			 * know in advance if the VMA that this page belonged
			 * to still exists. If the VMA and others sharing the
			 * data have been freed, then the anon_vma could
			 * already be invalid.
			 *
			 * To avoid this possibility, swapcache pages get
			 * migrated but are not remapped when migration
			 * completes
			 */
			remap_swapcache = 0;
		} else {
			goto uncharge;
		}
	}

	/*
	 * Corner case handling:
	 * 1. When a new swap-cache page is read in, it is added to the LRU
	 * and treated as swapcache but it has no rmap yet.
	 * Calling try_to_unmap() against a page->mapping==NULL page will
	 * trigger a BUG. So handle it here.
	 * 2. An orphaned page (see truncate_complete_page) might have
	 * fs-private metadata. The page can be picked up due to memory
	 * offlining. Everywhere else except page reclaim, the page is
	 * invisible to the vm, so the page can not be migrated. So try to
	 * free the metadata, so the page can be freed.
	 */
	if (!page->mapping) {
		VM_BUG_ON(PageAnon(page));
		if (page_has_private(page)) {
			try_to_free_buffers(page);
			goto uncharge;
		}
		goto skip_unmap;
	}

	/* Establish migration ptes or remove ptes */
	try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);

skip_unmap:
	if (!page_mapped(page))
		rc = move_to_new_page(newpage, page, remap_swapcache);

	if (rc && remap_swapcache)
		remove_migration_ptes(page, page);

	/* Drop an anon_vma reference if we took one */
	if (anon_vma)
		drop_anon_vma(anon_vma);

uncharge:
	if (!charge)
		mem_cgroup_end_migration(mem, page, newpage);
unlock:
	unlock_page(page);

	if (rc != -EAGAIN) {
		/*
		 * A page that has been migrated has all references
		 * removed and will be freed. A page that has not been
		 * migrated will have kept its references and be
		 * restored.
		 */
		list_del(&page->lru);
		dec_zone_page_state(page, NR_ISOLATED_ANON +
				page_is_file_cache(page));
		putback_lru_page(page);
	}

move_newpage:

	/*
	 * Move the new page to the LRU. If migration was not successful
	 * then this will free the page.
	 */
	putback_lru_page(newpage);

	if (result) {
		if (rc)
			*result = rc;
		else
			*result = page_to_nid(newpage);
	}
	return rc;
}

/*
 * migrate_pages
 *
 * The function takes one list of pages to migrate and a function
 * that determines from the page to be migrated and the private data
 * the target of the move and allocates the page.
 *
 * The function returns after 10 attempts or if no pages
 * are movable anymore because the list has become empty
 * or no retryable pages exist anymore. All pages will be
 * returned to the LRU or freed.
 *
 * Return: Number of pages not migrated or error code.
 */
int migrate_pages(struct list_head *from,
		new_page_t get_new_page, unsigned long private, int offlining)
{
	int retry = 1;
	int nr_failed = 0;
	int pass = 0;
	struct page *page;
	struct page *page2;
	int swapwrite = current->flags & PF_SWAPWRITE;
	int rc;

	if (!swapwrite)
		current->flags |= PF_SWAPWRITE;

	for (pass = 0; pass < 10 && retry; pass++) {
		retry = 0;

		list_for_each_entry_safe(page, page2, from, lru) {
			cond_resched();

			rc = unmap_and_move(get_new_page, private,
						page, pass > 2, offlining);

			switch (rc) {
			case -ENOMEM:
				goto out;
			case -EAGAIN:
				retry++;
				break;
			case 0:
				break;
			default:
				/* Permanent failure */
				nr_failed++;
				break;
			}
		}
	}
	rc = 0;
out:
	if (!swapwrite)
		current->flags &= ~PF_SWAPWRITE;

	putback_lru_pages(from);

	if (rc)
		return rc;

	return nr_failed + retry;
}

#ifdef CONFIG_NUMA
/*
 * Move a list of individual pages
 */
struct page_to_node {
	unsigned long addr;
	struct page *page;
	int node;
	int status;
};

static struct page *new_page_node(struct page *p, unsigned long private,
		int **result)
{
	struct page_to_node *pm = (struct page_to_node *)private;

	while (pm->node != MAX_NUMNODES && pm->page != p)
		pm++;

	if (pm->node == MAX_NUMNODES)
		return NULL;

	*result = &pm->status;

	return alloc_pages_exact_node(pm->node,
				GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}

/*
 * Move a set of pages as indicated in the pm array. The addr
 * field must be set to the virtual address of the page to be moved
 * and the node number must contain a valid target node.
 * The pm array ends with node = MAX_NUMNODES.
 */
static int do_move_page_to_node_array(struct mm_struct *mm,
				      struct page_to_node *pm,
				      int migrate_all)
{
	int err;
	struct page_to_node *pp;
	LIST_HEAD(pagelist);

	down_read(&mm->mmap_sem);

	/*
	 * Build a list of pages to migrate
	 */
	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
		struct vm_area_struct *vma;
		struct page *page;

		err = -EFAULT;
		vma = find_vma(mm, pp->addr);
		if (!vma || !vma_migratable(vma))
			goto set_status;

		page = follow_page(vma, pp->addr, FOLL_GET);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = -ENOENT;
		if (!page)
			goto set_status;

		/* Use PageReserved to check for the zero page */
		if (PageReserved(page) || PageKsm(page))
			goto put_and_set;

		pp->page = page;
		err = page_to_nid(page);

		if (err == pp->node)
			/*
			 * Node already in the right place
			 */
			goto put_and_set;

		err = -EACCES;
		if (page_mapcount(page) > 1 &&
				!migrate_all)
			goto put_and_set;

		err = isolate_lru_page(page);
		if (!err) {
			list_add_tail(&page->lru, &pagelist);
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));
		}
put_and_set:
		/*
		 * Either remove the duplicate refcount from
		 * isolate_lru_page() or drop the page ref if it was
		 * not isolated.
		 */
		put_page(page);
set_status:
		pp->status = err;
	}

	err = 0;
	if (!list_empty(&pagelist))
		err = migrate_pages(&pagelist, new_page_node,
				(unsigned long)pm, 0);

	up_read(&mm->mmap_sem);
	return err;
}

/*
 * Migrate an array of page addresses onto an array of nodes and fill
 * the corresponding array of status.
 */
static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
			 unsigned long nr_pages,
			 const void __user * __user *pages,
			 const int __user *nodes,
			 int __user *status, int flags)
{
	struct page_to_node *pm;
	nodemask_t task_nodes;
	unsigned long chunk_nr_pages;
	unsigned long chunk_start;
	int err;

	task_nodes = cpuset_mems_allowed(task);

	err = -ENOMEM;
	pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
	if (!pm)
		goto out;

	migrate_prep();

	/*
	 * Store a chunk of page_to_node array in a page,
	 * but keep the last one as a marker
	 */
	chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;

	for (chunk_start = 0;
	     chunk_start < nr_pages;
	     chunk_start += chunk_nr_pages) {
		int j;

		if (chunk_start + chunk_nr_pages > nr_pages)
			chunk_nr_pages = nr_pages - chunk_start;

		/* fill the chunk pm with addrs and nodes from user space */
		for (j = 0; j < chunk_nr_pages; j++) {
			const void __user *p;
			int node;

			err = -EFAULT;
			if (get_user(p, pages + j + chunk_start))
				goto out_pm;
			pm[j].addr = (unsigned long) p;

			if (get_user(node, nodes + j + chunk_start))
				goto out_pm;

			err = -ENODEV;
			if (node < 0 || node >= MAX_NUMNODES)
				goto out_pm;

			if (!node_state(node, N_HIGH_MEMORY))
				goto out_pm;

			err = -EACCES;
			if (!node_isset(node, task_nodes))
				goto out_pm;

			pm[j].node = node;
		}

		/* End marker for this chunk */
		pm[chunk_nr_pages].node = MAX_NUMNODES;

		/* Migrate this chunk */
		err = do_move_page_to_node_array(mm, pm,
						 flags & MPOL_MF_MOVE_ALL);
		if (err < 0)
			goto out_pm;

		/* Return status information */
		for (j = 0; j < chunk_nr_pages; j++)
			if (put_user(pm[j].status, status + j + chunk_start)) {
				err = -EFAULT;
				goto out_pm;
			}
	}
	err = 0;

out_pm:
	free_page((unsigned long)pm);
out:
	return err;
}

/*
 * Determine the node of each page in an array of pages and store the
 * results in an array of status.
 */
static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
				const void __user **pages, int *status)
{
	unsigned long i;

	down_read(&mm->mmap_sem);

	for (i = 0; i < nr_pages; i++) {
		unsigned long addr = (unsigned long)(*pages);
		struct vm_area_struct *vma;
		struct page *page;
		int err = -EFAULT;

		vma = find_vma(mm, addr);
		if (!vma)
			goto set_status;

		page = follow_page(vma, addr, 0);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = -ENOENT;
		/* Use PageReserved to check for the zero page */
		if (!page || PageReserved(page) || PageKsm(page))
			goto set_status;

		err = page_to_nid(page);
set_status:
		*status = err;

		pages++;
		status++;
	}

	up_read(&mm->mmap_sem);
}

/*
 * Determine the node of each page in a user array of pages and store the
 * results in a user array of status.
 */
static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
			 const void __user * __user *pages,
			 int __user *status)
{
#define DO_PAGES_STAT_CHUNK_NR 16
	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
	int chunk_status[DO_PAGES_STAT_CHUNK_NR];

	while (nr_pages) {
		unsigned long chunk_nr;

		chunk_nr = nr_pages;
		if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
			chunk_nr = DO_PAGES_STAT_CHUNK_NR;

		if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
			break;

		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);

		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
			break;

		pages += chunk_nr;
		status += chunk_nr;
		nr_pages -= chunk_nr;
	}
	return nr_pages ? -EFAULT : 0;
}

/*
 * Move a list of pages in the address space of the process identified by
 * pid (or of the current process if pid is 0).
 */
SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
		const void __user * __user *, pages,
		const int __user *, nodes,
		int __user *, status, int, flags)
{
	const struct cred *cred = current_cred(), *tcred;
	struct task_struct *task;
	struct mm_struct *mm;
	int err;

	/* Check flags */
	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
		return -EINVAL;

	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	rcu_read_lock();
	tcred = __task_cred(task);
	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
	    !capable(CAP_SYS_NICE)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out;
	}
	rcu_read_unlock();

	err = security_task_movememory(task);
	if (err)
		goto out;

	if (nodes) {
		err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
				    flags);
	} else {
		err = do_pages_stat(mm, nr_pages, pages, status);
	}

out:
	mmput(mm);
	return err;
}

/*
 * Call migration functions in the vma_ops that may prepare
 * memory in a vm for migration. migration functions may perform
 * the migration for vmas that do not have an underlying page struct.
 */
int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
	const nodemask_t *from, unsigned long flags)
{
	struct vm_area_struct *vma;
	int err = 0;

	for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
		if (vma->vm_ops && vma->vm_ops->migrate) {
			err = vma->vm_ops->migrate(vma, to, from, flags);
			if (err)
				break;
		}
	}
	return err;
}
#endif

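/*
 * For reference, the move_pages() system call above is normally reached
 * from userspace through the numaif.h wrapper shipped with libnuma
 * (link with -lnuma). Sketch only; "some_addr" and "target_node" are
 * placeholders, and all error handling is omitted:
 *
 *	#include <numaif.h>
 *
 *	void *pages[1] = { some_addr };
 *	int nodes[1] = { target_node };
 *	int status[1];
 *
 *	move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE);
 *
 * Passing a NULL "nodes" array turns the call into a pure status query,
 * which ends up in do_pages_stat() above.
 */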