1/* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * Memory thresholds 10 * Copyright (C) 2009 Nokia Corporation 11 * Author: Kirill A. Shutemov 12 * 13 * This program is free software; you can redistribute it and/or modify 14 * it under the terms of the GNU General Public License as published by 15 * the Free Software Foundation; either version 2 of the License, or 16 * (at your option) any later version. 17 * 18 * This program is distributed in the hope that it will be useful, 19 * but WITHOUT ANY WARRANTY; without even the implied warranty of 20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 * GNU General Public License for more details. 22 */ 23 24#include <linux/res_counter.h> 25#include <linux/memcontrol.h> 26#include <linux/cgroup.h> 27#include <linux/mm.h> 28#include <linux/hugetlb.h> 29#include <linux/pagemap.h> 30#include <linux/smp.h> 31#include <linux/page-flags.h> 32#include <linux/backing-dev.h> 33#include <linux/bit_spinlock.h> 34#include <linux/rcupdate.h> 35#include <linux/limits.h> 36#include <linux/mutex.h> 37#include <linux/rbtree.h> 38#include <linux/slab.h> 39#include <linux/swap.h> 40#include <linux/swapops.h> 41#include <linux/spinlock.h> 42#include <linux/eventfd.h> 43#include <linux/sort.h> 44#include <linux/fs.h> 45#include <linux/seq_file.h> 46#include <linux/vmalloc.h> 47#include <linux/mm_inline.h> 48#include <linux/page_cgroup.h> 49#include <linux/cpu.h> 50#include <linux/oom.h> 51#include "internal.h" 52 53#include <asm/uaccess.h> 54 55#include <trace/events/vmscan.h> 56 57struct cgroup_subsys mem_cgroup_subsys __read_mostly; 58#define MEM_CGROUP_RECLAIM_RETRIES 5 59struct mem_cgroup *root_mem_cgroup __read_mostly; 60 61#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 62/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 63int do_swap_account __read_mostly; 64static int really_do_swap_account __initdata = 1; /* for remember boot option*/ 65#else 66#define do_swap_account (0) 67#endif 68 69/* 70 * Per memcg event counter is incremented at every pagein/pageout. This counter 71 * is used for trigger some periodic events. This is straightforward and better 72 * than using jiffies etc. to handle periodic memcg event. 73 * 74 * These values will be used as !((event) & ((1 <<(thresh)) - 1)) 75 */ 76#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ 77#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ 78 79/* 80 * Statistics for memory cgroup. 81 */ 82enum mem_cgroup_stat_index { 83 /* 84 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 85 */ 86 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 87 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 88 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 89 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 90 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 91 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 92 MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ 93 94 MEM_CGROUP_STAT_NSTATS, 95}; 96 97struct mem_cgroup_stat_cpu { 98 s64 count[MEM_CGROUP_STAT_NSTATS]; 99}; 100 101/* 102 * per-zone information in memory controller. 
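 * Each memcg keeps one of these per (node, zone) pair (see struct
 * mem_cgroup_per_node below): lists[] are the memcg-private LRU lists,
 * count[] is the number of pages on each list (read via MEM_CGROUP_ZSTAT),
 * and tree_node/usage_in_excess/on_tree link the zone into the soft-limit
 * RB-tree once the cgroup exceeds its soft limit.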
103 */ 104struct mem_cgroup_per_zone { 105 /* 106 * spin_lock to protect the per cgroup LRU 107 */ 108 struct list_head lists[NR_LRU_LISTS]; 109 unsigned long count[NR_LRU_LISTS]; 110 111 struct zone_reclaim_stat reclaim_stat; 112 struct rb_node tree_node; /* RB tree node */ 113 unsigned long long usage_in_excess;/* Set to the value by which */ 114 /* the soft limit is exceeded*/ 115 bool on_tree; 116 struct mem_cgroup *mem; /* Back pointer, we cannot */ 117 /* use container_of */ 118}; 119/* Macro for accessing counter */ 120#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 121 122struct mem_cgroup_per_node { 123 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 124}; 125 126struct mem_cgroup_lru_info { 127 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 128}; 129 130/* 131 * Cgroups above their limits are maintained in a RB-Tree, independent of 132 * their hierarchy representation 133 */ 134 135struct mem_cgroup_tree_per_zone { 136 struct rb_root rb_root; 137 spinlock_t lock; 138}; 139 140struct mem_cgroup_tree_per_node { 141 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 142}; 143 144struct mem_cgroup_tree { 145 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 146}; 147 148static struct mem_cgroup_tree soft_limit_tree __read_mostly; 149 150struct mem_cgroup_threshold { 151 struct eventfd_ctx *eventfd; 152 u64 threshold; 153}; 154 155/* For threshold */ 156struct mem_cgroup_threshold_ary { 157 /* An array index points to threshold just below usage. */ 158 int current_threshold; 159 /* Size of entries[] */ 160 unsigned int size; 161 /* Array of thresholds */ 162 struct mem_cgroup_threshold entries[0]; 163}; 164 165struct mem_cgroup_thresholds { 166 /* Primary thresholds array */ 167 struct mem_cgroup_threshold_ary *primary; 168 /* 169 * Spare threshold array. 170 * This is needed to make mem_cgroup_unregister_event() "never fail". 171 * It must be able to store at least primary->size - 1 entries. 172 */ 173 struct mem_cgroup_threshold_ary *spare; 174}; 175 176/* for OOM */ 177struct mem_cgroup_eventfd_list { 178 struct list_head list; 179 struct eventfd_ctx *eventfd; 180}; 181 182static void mem_cgroup_threshold(struct mem_cgroup *mem); 183static void mem_cgroup_oom_notify(struct mem_cgroup *mem); 184 185/* 186 * The memory controller data structure. The memory controller controls both 187 * page cache and RSS per cgroup. We would eventually like to provide 188 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 189 * to help the administrator determine what knobs to tune. 190 * 191 * TODO: Add a water mark for the memory controller. Reclaim will begin when 192 * we hit the water mark. May be even add a low water mark, such that 193 * no reclaim occurs from a cgroup at it's low water mark, this is 194 * a feature that will be implemented much later in the future. 195 */ 196struct mem_cgroup { 197 struct cgroup_subsys_state css; 198 /* 199 * the counter to account for memory usage 200 */ 201 struct res_counter res; 202 /* 203 * the counter to account for mem+swap usage. 204 */ 205 struct res_counter memsw; 206 /* 207 * Per cgroup active and inactive list, similar to the 208 * per zone LRU lists. 209 */ 210 struct mem_cgroup_lru_info info; 211 212 /* 213 protect against reclaim related member. 214 */ 215 spinlock_t reclaim_param_lock; 216 217 /* 218 * While reclaiming in a hierarchy, we cache the last child we 219 * reclaimed from. 
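 * last_scanned_child holds the css ID of that child; the next call to
 * mem_cgroup_select_victim() resumes the scan at ID + 1, giving a simple
 * round-robin walk over the subtree.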
220 */ 221 int last_scanned_child; 222 /* 223 * Should the accounting and control be hierarchical, per subtree? 224 */ 225 bool use_hierarchy; 226 atomic_t oom_lock; 227 atomic_t refcnt; 228 229 unsigned int swappiness; 230 /* OOM-Killer disable */ 231 int oom_kill_disable; 232 233 /* set when res.limit == memsw.limit */ 234 bool memsw_is_minimum; 235 236 /* protect arrays of thresholds */ 237 struct mutex thresholds_lock; 238 239 /* thresholds for memory usage. RCU-protected */ 240 struct mem_cgroup_thresholds thresholds; 241 242 /* thresholds for mem+swap usage. RCU-protected */ 243 struct mem_cgroup_thresholds memsw_thresholds; 244 245 /* For oom notifier event fd */ 246 struct list_head oom_notify; 247 248 /* 249 * Should we move charges of a task when a task is moved into this 250 * mem_cgroup ? And what type of charges should we move ? 251 */ 252 unsigned long move_charge_at_immigrate; 253 /* 254 * percpu counter. 255 */ 256 struct mem_cgroup_stat_cpu *stat; 257}; 258 259/* Stuffs for move charges at task migration. */ 260/* 261 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a 262 * left-shifted bitmap of these types. 263 */ 264enum move_type { 265 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 266 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 267 NR_MOVE_TYPE, 268}; 269 270/* "mc" and its members are protected by cgroup_mutex */ 271static struct move_charge_struct { 272 spinlock_t lock; /* for from, to */ 273 struct mem_cgroup *from; 274 struct mem_cgroup *to; 275 unsigned long precharge; 276 unsigned long moved_charge; 277 unsigned long moved_swap; 278 struct task_struct *moving_task; /* a task moving charges */ 279 struct mm_struct *mm; 280 wait_queue_head_t waitq; /* a waitq for other context */ 281} mc = { 282 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 283 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 284}; 285 286static bool move_anon(void) 287{ 288 return test_bit(MOVE_CHARGE_TYPE_ANON, 289 &mc.to->move_charge_at_immigrate); 290} 291 292static bool move_file(void) 293{ 294 return test_bit(MOVE_CHARGE_TYPE_FILE, 295 &mc.to->move_charge_at_immigrate); 296} 297 298/* 299 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 300 * limit reclaim to prevent infinite loops, if they ever occur. 301 */ 302#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) 303#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) 304 305enum charge_type { 306 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 307 MEM_CGROUP_CHARGE_TYPE_MAPPED, 308 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 309 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 310 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 311 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 312 NR_CHARGE_TYPE, 313}; 314 315/* only for here (for easy reading.) 
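 * Each PCGF_* value below is the bit mask built from the corresponding
 * PCG_* bit number used in struct page_cgroup's flags.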
*/ 316#define PCGF_CACHE (1UL << PCG_CACHE) 317#define PCGF_USED (1UL << PCG_USED) 318#define PCGF_LOCK (1UL << PCG_LOCK) 319/* Not used, but added here for completeness */ 320#define PCGF_ACCT (1UL << PCG_ACCT) 321 322/* for encoding cft->private value on file */ 323#define _MEM (0) 324#define _MEMSWAP (1) 325#define _OOM_TYPE (2) 326#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 327#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 328#define MEMFILE_ATTR(val) ((val) & 0xffff) 329/* Used for OOM nofiier */ 330#define OOM_CONTROL (0) 331 332/* 333 * Reclaim flags for mem_cgroup_hierarchical_reclaim 334 */ 335#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 336#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 337#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 338#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 339#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 340#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) 341 342static void mem_cgroup_get(struct mem_cgroup *mem); 343static void mem_cgroup_put(struct mem_cgroup *mem); 344static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 345static void drain_all_stock_async(void); 346 347static struct mem_cgroup_per_zone * 348mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 349{ 350 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 351} 352 353struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) 354{ 355 return &mem->css; 356} 357 358static struct mem_cgroup_per_zone * 359page_cgroup_zoneinfo(struct page_cgroup *pc) 360{ 361 struct mem_cgroup *mem = pc->mem_cgroup; 362 int nid = page_cgroup_nid(pc); 363 int zid = page_cgroup_zid(pc); 364 365 if (!mem) 366 return NULL; 367 368 return mem_cgroup_zoneinfo(mem, nid, zid); 369} 370 371static struct mem_cgroup_tree_per_zone * 372soft_limit_tree_node_zone(int nid, int zid) 373{ 374 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 375} 376 377static struct mem_cgroup_tree_per_zone * 378soft_limit_tree_from_page(struct page *page) 379{ 380 int nid = page_to_nid(page); 381 int zid = page_zonenum(page); 382 383 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 384} 385 386static void 387__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, 388 struct mem_cgroup_per_zone *mz, 389 struct mem_cgroup_tree_per_zone *mctz, 390 unsigned long long new_usage_in_excess) 391{ 392 struct rb_node **p = &mctz->rb_root.rb_node; 393 struct rb_node *parent = NULL; 394 struct mem_cgroup_per_zone *mz_node; 395 396 if (mz->on_tree) 397 return; 398 399 mz->usage_in_excess = new_usage_in_excess; 400 if (!mz->usage_in_excess) 401 return; 402 while (*p) { 403 parent = *p; 404 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 405 tree_node); 406 if (mz->usage_in_excess < mz_node->usage_in_excess) 407 p = &(*p)->rb_left; 408 /* 409 * We can't avoid mem cgroups that are over their soft 410 * limit by the same amount 411 */ 412 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 413 p = &(*p)->rb_right; 414 } 415 rb_link_node(&mz->tree_node, parent, p); 416 rb_insert_color(&mz->tree_node, &mctz->rb_root); 417 mz->on_tree = true; 418} 419 420static void 421__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 422 struct mem_cgroup_per_zone *mz, 423 struct mem_cgroup_tree_per_zone *mctz) 424{ 425 if (!mz->on_tree) 426 return; 427 rb_erase(&mz->tree_node, &mctz->rb_root); 428 mz->on_tree = false; 429} 430 431static void 432mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 433 struct mem_cgroup_per_zone 
*mz, 434 struct mem_cgroup_tree_per_zone *mctz) 435{ 436 spin_lock(&mctz->lock); 437 __mem_cgroup_remove_exceeded(mem, mz, mctz); 438 spin_unlock(&mctz->lock); 439} 440 441 442static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) 443{ 444 unsigned long long excess; 445 struct mem_cgroup_per_zone *mz; 446 struct mem_cgroup_tree_per_zone *mctz; 447 int nid = page_to_nid(page); 448 int zid = page_zonenum(page); 449 mctz = soft_limit_tree_from_page(page); 450 451 /* 452 * Necessary to update all ancestors when hierarchy is used. 453 * because their event counter is not touched. 454 */ 455 for (; mem; mem = parent_mem_cgroup(mem)) { 456 mz = mem_cgroup_zoneinfo(mem, nid, zid); 457 excess = res_counter_soft_limit_excess(&mem->res); 458 /* 459 * We have to update the tree if mz is on RB-tree or 460 * mem is over its softlimit. 461 */ 462 if (excess || mz->on_tree) { 463 spin_lock(&mctz->lock); 464 /* if on-tree, remove it */ 465 if (mz->on_tree) 466 __mem_cgroup_remove_exceeded(mem, mz, mctz); 467 /* 468 * Insert again. mz->usage_in_excess will be updated. 469 * If excess is 0, no tree ops. 470 */ 471 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); 472 spin_unlock(&mctz->lock); 473 } 474 } 475} 476 477static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) 478{ 479 int node, zone; 480 struct mem_cgroup_per_zone *mz; 481 struct mem_cgroup_tree_per_zone *mctz; 482 483 for_each_node_state(node, N_POSSIBLE) { 484 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 485 mz = mem_cgroup_zoneinfo(mem, node, zone); 486 mctz = soft_limit_tree_node_zone(node, zone); 487 mem_cgroup_remove_exceeded(mem, mz, mctz); 488 } 489 } 490} 491 492static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) 493{ 494 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; 495} 496 497static struct mem_cgroup_per_zone * 498__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 499{ 500 struct rb_node *rightmost = NULL; 501 struct mem_cgroup_per_zone *mz; 502 503retry: 504 mz = NULL; 505 rightmost = rb_last(&mctz->rb_root); 506 if (!rightmost) 507 goto done; /* Nothing to reclaim from */ 508 509 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 510 /* 511 * Remove the node now but someone else can add it back, 512 * we will to add it back at the end of reclaim to its correct 513 * position in the tree. 514 */ 515 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 516 if (!res_counter_soft_limit_excess(&mz->mem->res) || 517 !css_tryget(&mz->mem->css)) 518 goto retry; 519done: 520 return mz; 521} 522 523static struct mem_cgroup_per_zone * 524mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 525{ 526 struct mem_cgroup_per_zone *mz; 527 528 spin_lock(&mctz->lock); 529 mz = __mem_cgroup_largest_soft_limit_node(mctz); 530 spin_unlock(&mctz->lock); 531 return mz; 532} 533 534static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, 535 enum mem_cgroup_stat_index idx) 536{ 537 int cpu; 538 s64 val = 0; 539 540 for_each_possible_cpu(cpu) 541 val += per_cpu(mem->stat->count[idx], cpu); 542 return val; 543} 544 545static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) 546{ 547 s64 ret; 548 549 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 550 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 551 return ret; 552} 553 554static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 555 bool charge) 556{ 557 int val = (charge) ? 
1 : -1; 558 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 559} 560 561static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 562 struct page_cgroup *pc, 563 bool charge) 564{ 565 int val = (charge) ? 1 : -1; 566 567 preempt_disable(); 568 569 if (PageCgroupCache(pc)) 570 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); 571 else 572 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); 573 574 if (charge) 575 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); 576 else 577 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); 578 __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); 579 580 preempt_enable(); 581} 582 583static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 584 enum lru_list idx) 585{ 586 int nid, zid; 587 struct mem_cgroup_per_zone *mz; 588 u64 total = 0; 589 590 for_each_online_node(nid) 591 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 592 mz = mem_cgroup_zoneinfo(mem, nid, zid); 593 total += MEM_CGROUP_ZSTAT(mz, idx); 594 } 595 return total; 596} 597 598static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) 599{ 600 s64 val; 601 602 val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); 603 604 return !(val & ((1 << event_mask_shift) - 1)); 605} 606 607/* 608 * Check events in order. 609 * 610 */ 611static void memcg_check_events(struct mem_cgroup *mem, struct page *page) 612{ 613 /* threshold event is triggered in finer grain than soft limit */ 614 if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { 615 mem_cgroup_threshold(mem); 616 if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) 617 mem_cgroup_update_tree(mem, page); 618 } 619} 620 621static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 622{ 623 return container_of(cgroup_subsys_state(cont, 624 mem_cgroup_subsys_id), struct mem_cgroup, 625 css); 626} 627 628struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 629{ 630 /* 631 * mm_update_next_owner() may clear mm->owner to NULL 632 * if it races with swapoff, page migration, etc. 633 * So this can be called with p == NULL. 634 */ 635 if (unlikely(!p)) 636 return NULL; 637 638 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 639 struct mem_cgroup, css); 640} 641 642static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 643{ 644 struct mem_cgroup *mem = NULL; 645 646 if (!mm) 647 return NULL; 648 /* 649 * Because we have no locks, mm->owner's may be being moved to other 650 * cgroup. We use css_tryget() here even if this looks 651 * pessimistic (rather than adding locks here). 652 */ 653 rcu_read_lock(); 654 do { 655 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 656 if (unlikely(!mem)) 657 break; 658 } while (!css_tryget(&mem->css)); 659 rcu_read_unlock(); 660 return mem; 661} 662 663/* 664 * Call callback function against all cgroup under hierarchy tree. 
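 * The walk is driven by css IDs: css_get_next() returns the next css under
 * @root, css_tryget() pins it while @func runs, and the loop stops early as
 * soon as @func returns non-zero.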
665 */ 666static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, 667 int (*func)(struct mem_cgroup *, void *)) 668{ 669 int found, ret, nextid; 670 struct cgroup_subsys_state *css; 671 struct mem_cgroup *mem; 672 673 if (!root->use_hierarchy) 674 return (*func)(root, data); 675 676 nextid = 1; 677 do { 678 ret = 0; 679 mem = NULL; 680 681 rcu_read_lock(); 682 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, 683 &found); 684 if (css && css_tryget(css)) 685 mem = container_of(css, struct mem_cgroup, css); 686 rcu_read_unlock(); 687 688 if (mem) { 689 ret = (*func)(mem, data); 690 css_put(&mem->css); 691 } 692 nextid = found + 1; 693 } while (!ret && css); 694 695 return ret; 696} 697 698static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 699{ 700 return (mem == root_mem_cgroup); 701} 702 703/* 704 * Following LRU functions are allowed to be used without PCG_LOCK. 705 * Operations are called by routine of global LRU independently from memcg. 706 * What we have to take care of here is validness of pc->mem_cgroup. 707 * 708 * Changes to pc->mem_cgroup happens when 709 * 1. charge 710 * 2. moving account 711 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. 712 * It is added to LRU before charge. 713 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 714 * When moving account, the page is not on LRU. It's isolated. 715 */ 716 717void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 718{ 719 struct page_cgroup *pc; 720 struct mem_cgroup_per_zone *mz; 721 722 if (mem_cgroup_disabled()) 723 return; 724 pc = lookup_page_cgroup(page); 725 /* can happen while we handle swapcache. */ 726 if (!TestClearPageCgroupAcctLRU(pc)) 727 return; 728 VM_BUG_ON(!pc->mem_cgroup); 729 /* 730 * We don't check PCG_USED bit. It's cleared when the "page" is finally 731 * removed from global LRU. 732 */ 733 mz = page_cgroup_zoneinfo(pc); 734 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 735 if (mem_cgroup_is_root(pc->mem_cgroup)) 736 return; 737 VM_BUG_ON(list_empty(&pc->lru)); 738 list_del_init(&pc->lru); 739 return; 740} 741 742void mem_cgroup_del_lru(struct page *page) 743{ 744 mem_cgroup_del_lru_list(page, page_lru(page)); 745} 746 747void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 748{ 749 struct mem_cgroup_per_zone *mz; 750 struct page_cgroup *pc; 751 752 if (mem_cgroup_disabled()) 753 return; 754 755 pc = lookup_page_cgroup(page); 756 /* 757 * Used bit is set without atomic ops but after smp_wmb(). 758 * For making pc->mem_cgroup visible, insert smp_rmb() here. 759 */ 760 smp_rmb(); 761 /* unused or root page is not rotated. */ 762 if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) 763 return; 764 mz = page_cgroup_zoneinfo(pc); 765 list_move(&pc->lru, &mz->lists[lru]); 766} 767 768void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) 769{ 770 struct page_cgroup *pc; 771 struct mem_cgroup_per_zone *mz; 772 773 if (mem_cgroup_disabled()) 774 return; 775 pc = lookup_page_cgroup(page); 776 VM_BUG_ON(PageCgroupAcctLRU(pc)); 777 /* 778 * Used bit is set without atomic ops but after smp_wmb(). 779 * For making pc->mem_cgroup visible, insert smp_rmb() here. 
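         * This smp_rmb() pairs with the smp_wmb() in
         * __mem_cgroup_commit_charge(), which publishes pc->mem_cgroup
         * before setting the Used bit.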
         */
        smp_rmb();
        if (!PageCgroupUsed(pc))
                return;

        mz = page_cgroup_zoneinfo(pc);
        MEM_CGROUP_ZSTAT(mz, lru) += 1;
        SetPageCgroupAcctLRU(pc);
        if (mem_cgroup_is_root(pc->mem_cgroup))
                return;
        list_add(&pc->lru, &mz->lists[lru]);
}

/*
 * While handling SwapCache, pc->mem_cgroup may be changed while it's linked
 * to the LRU, because the page may be reused after it's fully uncharged
 * (because of SwapCache behavior). To handle that, unlink the page_cgroup
 * from the LRU when charging it again. This function is only used to charge
 * SwapCache. It's done under lock_page and expects that zone->lru_lock is
 * never held.
 */
static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
{
        unsigned long flags;
        struct zone *zone = page_zone(page);
        struct page_cgroup *pc = lookup_page_cgroup(page);

        spin_lock_irqsave(&zone->lru_lock, flags);
        /*
         * Forget the old LRU when this page_cgroup is *not* used. The Used
         * bit is guarded by lock_page() because the page is SwapCache.
         */
        if (!PageCgroupUsed(pc))
                mem_cgroup_del_lru_list(page, page_lru(page));
        spin_unlock_irqrestore(&zone->lru_lock, flags);
}

static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
{
        unsigned long flags;
        struct zone *zone = page_zone(page);
        struct page_cgroup *pc = lookup_page_cgroup(page);

        spin_lock_irqsave(&zone->lru_lock, flags);
        /* link when the page is linked to LRU but page_cgroup isn't */
        if (PageLRU(page) && !PageCgroupAcctLRU(pc))
                mem_cgroup_add_lru_list(page, page_lru(page));
        spin_unlock_irqrestore(&zone->lru_lock, flags);
}


void mem_cgroup_move_lists(struct page *page,
                           enum lru_list from, enum lru_list to)
{
        if (mem_cgroup_disabled())
                return;
        mem_cgroup_del_lru_list(page, from);
        mem_cgroup_add_lru_list(page, to);
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
        int ret;
        struct mem_cgroup *curr = NULL;
        struct task_struct *p;

        p = find_lock_task_mm(task);
        if (!p)
                return 0;
        curr = try_get_mem_cgroup_from_mm(p->mm);
        task_unlock(p);
        if (!curr)
                return 0;
        /*
         * We should check use_hierarchy of "mem", not "curr". Checking
         * use_hierarchy of "curr" here would make this function return true
         * if hierarchy is enabled in "curr" and "curr" is a child of "mem"
         * in the *cgroup* hierarchy (even if use_hierarchy is disabled in
         * "mem").
857 */ 858 if (mem->use_hierarchy) 859 ret = css_is_ancestor(&curr->css, &mem->css); 860 else 861 ret = (curr == mem); 862 css_put(&curr->css); 863 return ret; 864} 865 866static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 867{ 868 unsigned long active; 869 unsigned long inactive; 870 unsigned long gb; 871 unsigned long inactive_ratio; 872 873 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); 874 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); 875 876 gb = (inactive + active) >> (30 - PAGE_SHIFT); 877 if (gb) 878 inactive_ratio = int_sqrt(10 * gb); 879 else 880 inactive_ratio = 1; 881 882 if (present_pages) { 883 present_pages[0] = inactive; 884 present_pages[1] = active; 885 } 886 887 return inactive_ratio; 888} 889 890int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) 891{ 892 unsigned long active; 893 unsigned long inactive; 894 unsigned long present_pages[2]; 895 unsigned long inactive_ratio; 896 897 inactive_ratio = calc_inactive_ratio(memcg, present_pages); 898 899 inactive = present_pages[0]; 900 active = present_pages[1]; 901 902 if (inactive * inactive_ratio < active) 903 return 1; 904 905 return 0; 906} 907 908int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) 909{ 910 unsigned long active; 911 unsigned long inactive; 912 913 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); 914 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); 915 916 return (active > inactive); 917} 918 919unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 920 struct zone *zone, 921 enum lru_list lru) 922{ 923 int nid = zone_to_nid(zone); 924 int zid = zone_idx(zone); 925 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 926 927 return MEM_CGROUP_ZSTAT(mz, lru); 928} 929 930struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 931 struct zone *zone) 932{ 933 int nid = zone_to_nid(zone); 934 int zid = zone_idx(zone); 935 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 936 937 return &mz->reclaim_stat; 938} 939 940struct zone_reclaim_stat * 941mem_cgroup_get_reclaim_stat_from_page(struct page *page) 942{ 943 struct page_cgroup *pc; 944 struct mem_cgroup_per_zone *mz; 945 946 if (mem_cgroup_disabled()) 947 return NULL; 948 949 pc = lookup_page_cgroup(page); 950 /* 951 * Used bit is set without atomic ops but after smp_wmb(). 952 * For making pc->mem_cgroup visible, insert smp_rmb() here. 
953 */ 954 smp_rmb(); 955 if (!PageCgroupUsed(pc)) 956 return NULL; 957 958 mz = page_cgroup_zoneinfo(pc); 959 if (!mz) 960 return NULL; 961 962 return &mz->reclaim_stat; 963} 964 965unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 966 struct list_head *dst, 967 unsigned long *scanned, int order, 968 int mode, struct zone *z, 969 struct mem_cgroup *mem_cont, 970 int active, int file) 971{ 972 unsigned long nr_taken = 0; 973 struct page *page; 974 unsigned long scan; 975 LIST_HEAD(pc_list); 976 struct list_head *src; 977 struct page_cgroup *pc, *tmp; 978 int nid = zone_to_nid(z); 979 int zid = zone_idx(z); 980 struct mem_cgroup_per_zone *mz; 981 int lru = LRU_FILE * file + active; 982 int ret; 983 984 BUG_ON(!mem_cont); 985 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 986 src = &mz->lists[lru]; 987 988 scan = 0; 989 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 990 if (scan >= nr_to_scan) 991 break; 992 993 page = pc->page; 994 if (unlikely(!PageCgroupUsed(pc))) 995 continue; 996 if (unlikely(!PageLRU(page))) 997 continue; 998 999 scan++; 1000 ret = __isolate_lru_page(page, mode, file); 1001 switch (ret) { 1002 case 0: 1003 list_move(&page->lru, dst); 1004 mem_cgroup_del_lru(page); 1005 nr_taken++; 1006 break; 1007 case -EBUSY: 1008 /* we don't affect global LRU but rotate in our LRU */ 1009 mem_cgroup_rotate_lru_list(page, page_lru(page)); 1010 break; 1011 default: 1012 break; 1013 } 1014 } 1015 1016 *scanned = scan; 1017 1018 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, 1019 0, 0, 0, mode); 1020 1021 return nr_taken; 1022} 1023 1024#define mem_cgroup_from_res_counter(counter, member) \ 1025 container_of(counter, struct mem_cgroup, member) 1026 1027static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 1028{ 1029 if (do_swap_account) { 1030 if (res_counter_check_under_limit(&mem->res) && 1031 res_counter_check_under_limit(&mem->memsw)) 1032 return true; 1033 } else 1034 if (res_counter_check_under_limit(&mem->res)) 1035 return true; 1036 return false; 1037} 1038 1039static unsigned int get_swappiness(struct mem_cgroup *memcg) 1040{ 1041 struct cgroup *cgrp = memcg->css.cgroup; 1042 unsigned int swappiness; 1043 1044 /* root ? */ 1045 if (cgrp->parent == NULL) 1046 return vm_swappiness; 1047 1048 spin_lock(&memcg->reclaim_param_lock); 1049 swappiness = memcg->swappiness; 1050 spin_unlock(&memcg->reclaim_param_lock); 1051 1052 return swappiness; 1053} 1054 1055/* A routine for testing mem is not under move_account */ 1056 1057static bool mem_cgroup_under_move(struct mem_cgroup *mem) 1058{ 1059 struct mem_cgroup *from; 1060 struct mem_cgroup *to; 1061 bool ret = false; 1062 /* 1063 * Unlike task_move routines, we access mc.to, mc.from not under 1064 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1065 */ 1066 spin_lock(&mc.lock); 1067 from = mc.from; 1068 to = mc.to; 1069 if (!from) 1070 goto unlock; 1071 if (from == mem || to == mem 1072 || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) 1073 || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) 1074 ret = true; 1075unlock: 1076 spin_unlock(&mc.lock); 1077 return ret; 1078} 1079 1080static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) 1081{ 1082 if (mc.moving_task && current != mc.moving_task) { 1083 if (mem_cgroup_under_move(mem)) { 1084 DEFINE_WAIT(wait); 1085 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1086 /* moving charge context might have finished. 
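 * Re-check mc.moving_task after prepare_to_wait() so that we do not go to
 * sleep when the move has already completed; a wakeup in between is not
 * lost.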
*/ 1087 if (mc.moving_task) 1088 schedule(); 1089 finish_wait(&mc.waitq, &wait); 1090 return true; 1091 } 1092 } 1093 return false; 1094} 1095 1096static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) 1097{ 1098 int *val = data; 1099 (*val)++; 1100 return 0; 1101} 1102 1103/** 1104 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1105 * @memcg: The memory cgroup that went over limit 1106 * @p: Task that is going to be killed 1107 * 1108 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1109 * enabled 1110 */ 1111void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1112{ 1113 struct cgroup *task_cgrp; 1114 struct cgroup *mem_cgrp; 1115 /* 1116 * Need a buffer in BSS, can't rely on allocations. The code relies 1117 * on the assumption that OOM is serialized for memory controller. 1118 * If this assumption is broken, revisit this code. 1119 */ 1120 static char memcg_name[PATH_MAX]; 1121 int ret; 1122 1123 if (!memcg || !p) 1124 return; 1125 1126 1127 rcu_read_lock(); 1128 1129 mem_cgrp = memcg->css.cgroup; 1130 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1131 1132 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1133 if (ret < 0) { 1134 /* 1135 * Unfortunately, we are unable to convert to a useful name 1136 * But we'll still print out the usage information 1137 */ 1138 rcu_read_unlock(); 1139 goto done; 1140 } 1141 rcu_read_unlock(); 1142 1143 printk(KERN_INFO "Task in %s killed", memcg_name); 1144 1145 rcu_read_lock(); 1146 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1147 if (ret < 0) { 1148 rcu_read_unlock(); 1149 goto done; 1150 } 1151 rcu_read_unlock(); 1152 1153 /* 1154 * Continues from above, so we don't need an KERN_ level 1155 */ 1156 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1157done: 1158 1159 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1160 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1161 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1162 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1163 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1164 "failcnt %llu\n", 1165 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1166 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1167 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1168} 1169 1170/* 1171 * This function returns the number of memcg under hierarchy tree. Returns 1172 * 1(self count) if no children. 1173 */ 1174static int mem_cgroup_count_children(struct mem_cgroup *mem) 1175{ 1176 int num = 0; 1177 mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); 1178 return num; 1179} 1180 1181/* 1182 * Return the memory (and swap, if configured) limit for a memcg. 1183 */ 1184u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1185{ 1186 u64 limit; 1187 u64 memsw; 1188 1189 limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + 1190 total_swap_pages; 1191 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1192 /* 1193 * If memsw is finite and limits the amount of swap space available 1194 * to this memcg, return that limit. 1195 */ 1196 return min(limit, memsw); 1197} 1198 1199/* 1200 * Visit the first child (need not be the first child as per the ordering 1201 * of the cgroup list, since we track last_scanned_child) of @mem and use 1202 * that to reclaim free pages from. 
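 * The returned memcg has an elevated css reference count; the caller is
 * expected to drop it with css_put() when done reclaiming from it.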
 */
static struct mem_cgroup *
mem_cgroup_select_victim(struct mem_cgroup *root_mem)
{
        struct mem_cgroup *ret = NULL;
        struct cgroup_subsys_state *css;
        int nextid, found;

        if (!root_mem->use_hierarchy) {
                css_get(&root_mem->css);
                ret = root_mem;
        }

        while (!ret) {
                rcu_read_lock();
                nextid = root_mem->last_scanned_child + 1;
                css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
                                   &found);
                if (css && css_tryget(css))
                        ret = container_of(css, struct mem_cgroup, css);

                rcu_read_unlock();
                /* Updates scanning parameter */
                spin_lock(&root_mem->reclaim_param_lock);
                if (!css) {
                        /* this means start scan from ID:1 */
                        root_mem->last_scanned_child = 0;
                } else
                        root_mem->last_scanned_child = found;
                spin_unlock(&root_mem->reclaim_param_lock);
        }

        return ret;
}

/*
 * Scan the hierarchy if needed to reclaim memory. We remember the last child
 * we reclaimed from, so that we don't end up penalizing one child extensively
 * based on its position in the children list.
 *
 * root_mem is the original ancestor that we've been reclaiming from.
 *
 * We give up and return to the caller when we visit root_mem twice.
 * (other groups can be removed while we're walking....)
 *
 * If shrink==true, this returns immediately, to avoid freeing too much.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
                                           struct zone *zone,
                                           gfp_t gfp_mask,
                                           unsigned long reclaim_options)
{
        struct mem_cgroup *victim;
        int ret, total = 0;
        int loop = 0;
        bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
        bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
        bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
        unsigned long excess = mem_cgroup_get_excess(root_mem);

        /* If memsw_is_minimum==1, swap-out is of no use. */
        if (root_mem->memsw_is_minimum)
                noswap = true;

        while (1) {
                victim = mem_cgroup_select_victim(root_mem);
                if (victim == root_mem) {
                        loop++;
                        if (loop >= 1)
                                drain_all_stock_async();
                        if (loop >= 2) {
                                /*
                                 * If we have not been able to reclaim
                                 * anything, it might be because there are
                                 * no reclaimable pages under this hierarchy.
                                 */
                                if (!check_soft || !total) {
                                        css_put(&victim->css);
                                        break;
                                }
                                /*
                                 * We want to do more targeted reclaim.
                                 * excess >> 2 is not so excessive that we
                                 * reclaim too much, nor so small that we keep
                                 * coming back to reclaim from this cgroup.
                                 */
                                if (total >= (excess >> 2) ||
                                        (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
                                        css_put(&victim->css);
                                        break;
                                }
                        }
                }
                if (!mem_cgroup_local_usage(victim)) {
                        /* this cgroup's local usage == 0 */
                        css_put(&victim->css);
                        continue;
                }
                /* we use swappiness of local cgroup */
                if (check_soft)
                        ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
                                noswap, get_swappiness(victim), zone);
                else
                        ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
                                                noswap, get_swappiness(victim));
                css_put(&victim->css);
                /*
                 * At shrinking usage, we can't check whether we should stop
                 * here or reclaim more; that depends on the caller.
                 * last_scanned_child is enough to keep fairness under the
                 * tree.
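                 * For soft-limit reclaim (check_soft), the caller eventually
                 * gets back the total number of pages reclaimed; otherwise a
                 * positive value is returned as soon as usage is back under
                 * the hard limit.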
1313 */ 1314 if (shrink) 1315 return ret; 1316 total += ret; 1317 if (check_soft) { 1318 if (res_counter_check_under_soft_limit(&root_mem->res)) 1319 return total; 1320 } else if (mem_cgroup_check_under_limit(root_mem)) 1321 return 1 + total; 1322 } 1323 return total; 1324} 1325 1326static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data) 1327{ 1328 int *val = (int *)data; 1329 int x; 1330 /* 1331 * Logically, we can stop scanning immediately when we find 1332 * a memcg is already locked. But condidering unlock ops and 1333 * creation/removal of memcg, scan-all is simple operation. 1334 */ 1335 x = atomic_inc_return(&mem->oom_lock); 1336 *val = max(x, *val); 1337 return 0; 1338} 1339/* 1340 * Check OOM-Killer is already running under our hierarchy. 1341 * If someone is running, return false. 1342 */ 1343static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1344{ 1345 int lock_count = 0; 1346 1347 mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); 1348 1349 if (lock_count == 1) 1350 return true; 1351 return false; 1352} 1353 1354static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) 1355{ 1356 /* 1357 * When a new child is created while the hierarchy is under oom, 1358 * mem_cgroup_oom_lock() may not be called. We have to use 1359 * atomic_add_unless() here. 1360 */ 1361 atomic_add_unless(&mem->oom_lock, -1, 0); 1362 return 0; 1363} 1364 1365static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) 1366{ 1367 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb); 1368} 1369 1370static DEFINE_MUTEX(memcg_oom_mutex); 1371static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1372 1373struct oom_wait_info { 1374 struct mem_cgroup *mem; 1375 wait_queue_t wait; 1376}; 1377 1378static int memcg_oom_wake_function(wait_queue_t *wait, 1379 unsigned mode, int sync, void *arg) 1380{ 1381 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; 1382 struct oom_wait_info *oom_wait_info; 1383 1384 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1385 1386 if (oom_wait_info->mem == wake_mem) 1387 goto wakeup; 1388 /* if no hierarchy, no match */ 1389 if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) 1390 return 0; 1391 /* 1392 * Both of oom_wait_info->mem and wake_mem are stable under us. 1393 * Then we can use css_is_ancestor without taking care of RCU. 1394 */ 1395 if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && 1396 !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) 1397 return 0; 1398 1399wakeup: 1400 return autoremove_wake_function(wait, mode, sync, arg); 1401} 1402 1403static void memcg_wakeup_oom(struct mem_cgroup *mem) 1404{ 1405 /* for filtering, pass "mem" as argument. */ 1406 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); 1407} 1408 1409static void memcg_oom_recover(struct mem_cgroup *mem) 1410{ 1411 if (mem && atomic_read(&mem->oom_lock)) 1412 memcg_wakeup_oom(mem); 1413} 1414 1415/* 1416 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 
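 * Only one context in the hierarchy wins the oom_lock; the winner calls
 * mem_cgroup_out_of_memory() (unless oom_kill_disable is set) while the
 * others sleep on memcg_oom_waitq until the hierarchy is woken up again.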
1417 */ 1418bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) 1419{ 1420 struct oom_wait_info owait; 1421 bool locked, need_to_kill; 1422 1423 owait.mem = mem; 1424 owait.wait.flags = 0; 1425 owait.wait.func = memcg_oom_wake_function; 1426 owait.wait.private = current; 1427 INIT_LIST_HEAD(&owait.wait.task_list); 1428 need_to_kill = true; 1429 /* At first, try to OOM lock hierarchy under mem.*/ 1430 mutex_lock(&memcg_oom_mutex); 1431 locked = mem_cgroup_oom_lock(mem); 1432 /* 1433 * Even if signal_pending(), we can't quit charge() loop without 1434 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 1435 * under OOM is always welcomed, use TASK_KILLABLE here. 1436 */ 1437 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1438 if (!locked || mem->oom_kill_disable) 1439 need_to_kill = false; 1440 if (locked) 1441 mem_cgroup_oom_notify(mem); 1442 mutex_unlock(&memcg_oom_mutex); 1443 1444 if (need_to_kill) { 1445 finish_wait(&memcg_oom_waitq, &owait.wait); 1446 mem_cgroup_out_of_memory(mem, mask); 1447 } else { 1448 schedule(); 1449 finish_wait(&memcg_oom_waitq, &owait.wait); 1450 } 1451 mutex_lock(&memcg_oom_mutex); 1452 mem_cgroup_oom_unlock(mem); 1453 memcg_wakeup_oom(mem); 1454 mutex_unlock(&memcg_oom_mutex); 1455 1456 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 1457 return false; 1458 /* Give chance to dying process */ 1459 schedule_timeout(1); 1460 return true; 1461} 1462 1463/* 1464 * Currently used to update mapped file statistics, but the routine can be 1465 * generalized to update other statistics as well. 1466 */ 1467void mem_cgroup_update_file_mapped(struct page *page, int val) 1468{ 1469 struct mem_cgroup *mem; 1470 struct page_cgroup *pc; 1471 1472 pc = lookup_page_cgroup(page); 1473 if (unlikely(!pc)) 1474 return; 1475 1476 lock_page_cgroup(pc); 1477 mem = pc->mem_cgroup; 1478 if (!mem || !PageCgroupUsed(pc)) 1479 goto done; 1480 1481 /* 1482 * Preemption is already disabled. We can use __this_cpu_xxx 1483 */ 1484 if (val > 0) { 1485 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1486 SetPageCgroupFileMapped(pc); 1487 } else { 1488 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1489 ClearPageCgroupFileMapped(pc); 1490 } 1491 1492done: 1493 unlock_page_cgroup(pc); 1494} 1495 1496/* 1497 * size of first charge trial. "32" comes from vmscan.c's magic value. 1498 * TODO: maybe necessary to use big numbers in big irons. 1499 */ 1500#define CHARGE_SIZE (32 * PAGE_SIZE) 1501struct memcg_stock_pcp { 1502 struct mem_cgroup *cached; /* this never be root cgroup */ 1503 int charge; 1504 struct work_struct work; 1505}; 1506static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1507static atomic_t memcg_drain_count; 1508 1509/* 1510 * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed 1511 * from local stock and true is returned. If the stock is 0 or charges from a 1512 * cgroup which is not current target, returns false. This stock will be 1513 * refilled. 1514 */ 1515static bool consume_stock(struct mem_cgroup *mem) 1516{ 1517 struct memcg_stock_pcp *stock; 1518 bool ret = true; 1519 1520 stock = &get_cpu_var(memcg_stock); 1521 if (mem == stock->cached && stock->charge) 1522 stock->charge -= PAGE_SIZE; 1523 else /* need to call res_counter_charge */ 1524 ret = false; 1525 put_cpu_var(memcg_stock); 1526 return ret; 1527} 1528 1529/* 1530 * Returns stocks cached in percpu to res_counter and reset cached information. 
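 * Any remaining pre-charge goes back to the memory res_counter and, with
 * swap accounting enabled, to the mem+swap res_counter of the cached memcg.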
1531 */ 1532static void drain_stock(struct memcg_stock_pcp *stock) 1533{ 1534 struct mem_cgroup *old = stock->cached; 1535 1536 if (stock->charge) { 1537 res_counter_uncharge(&old->res, stock->charge); 1538 if (do_swap_account) 1539 res_counter_uncharge(&old->memsw, stock->charge); 1540 } 1541 stock->cached = NULL; 1542 stock->charge = 0; 1543} 1544 1545/* 1546 * This must be called under preempt disabled or must be called by 1547 * a thread which is pinned to local cpu. 1548 */ 1549static void drain_local_stock(struct work_struct *dummy) 1550{ 1551 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 1552 drain_stock(stock); 1553} 1554 1555/* 1556 * Cache charges(val) which is from res_counter, to local per_cpu area. 1557 * This will be consumed by consume_stock() function, later. 1558 */ 1559static void refill_stock(struct mem_cgroup *mem, int val) 1560{ 1561 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 1562 1563 if (stock->cached != mem) { /* reset if necessary */ 1564 drain_stock(stock); 1565 stock->cached = mem; 1566 } 1567 stock->charge += val; 1568 put_cpu_var(memcg_stock); 1569} 1570 1571/* 1572 * Tries to drain stocked charges in other cpus. This function is asynchronous 1573 * and just put a work per cpu for draining localy on each cpu. Caller can 1574 * expects some charges will be back to res_counter later but cannot wait for 1575 * it. 1576 */ 1577static void drain_all_stock_async(void) 1578{ 1579 int cpu; 1580 /* This function is for scheduling "drain" in asynchronous way. 1581 * The result of "drain" is not directly handled by callers. Then, 1582 * if someone is calling drain, we don't have to call drain more. 1583 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if 1584 * there is a race. We just do loose check here. 1585 */ 1586 if (atomic_read(&memcg_drain_count)) 1587 return; 1588 /* Notify other cpus that system-wide "drain" is running */ 1589 atomic_inc(&memcg_drain_count); 1590 get_online_cpus(); 1591 for_each_online_cpu(cpu) { 1592 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 1593 schedule_work_on(cpu, &stock->work); 1594 } 1595 put_online_cpus(); 1596 atomic_dec(&memcg_drain_count); 1597 /* We don't wait for flush_work */ 1598} 1599 1600/* This is a synchronous drain interface. */ 1601static void drain_all_stock_sync(void) 1602{ 1603 /* called when force_empty is called */ 1604 atomic_inc(&memcg_drain_count); 1605 schedule_on_each_cpu(drain_local_stock); 1606 atomic_dec(&memcg_drain_count); 1607} 1608 1609static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, 1610 unsigned long action, 1611 void *hcpu) 1612{ 1613 int cpu = (unsigned long)hcpu; 1614 struct memcg_stock_pcp *stock; 1615 1616 if (action != CPU_DEAD) 1617 return NOTIFY_OK; 1618 stock = &per_cpu(memcg_stock, cpu); 1619 drain_stock(stock); 1620 return NOTIFY_OK; 1621} 1622 1623 1624/* See __mem_cgroup_try_charge() for details */ 1625enum { 1626 CHARGE_OK, /* success */ 1627 CHARGE_RETRY, /* need to retry but retry is not bad */ 1628 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 1629 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. 
 */
        CHARGE_OOM_DIE,         /* the current is killed because of OOM */
};

static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
                                int csize, bool oom_check)
{
        struct mem_cgroup *mem_over_limit;
        struct res_counter *fail_res;
        unsigned long flags = 0;
        int ret;

        ret = res_counter_charge(&mem->res, csize, &fail_res);

        if (likely(!ret)) {
                if (!do_swap_account)
                        return CHARGE_OK;
                ret = res_counter_charge(&mem->memsw, csize, &fail_res);
                if (likely(!ret))
                        return CHARGE_OK;

                res_counter_uncharge(&mem->res, csize);
                mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
                flags |= MEM_CGROUP_RECLAIM_NOSWAP;
        } else
                mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);

        if (csize > PAGE_SIZE) /* change csize and retry */
                return CHARGE_RETRY;

        if (!(gfp_mask & __GFP_WAIT))
                return CHARGE_WOULDBLOCK;

        ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
                                        gfp_mask, flags);
        /*
         * try_to_free_mem_cgroup_pages() might not give us a full
         * picture of reclaim. Some pages are reclaimed and might be
         * moved to swap cache or just unmapped from the cgroup.
         * Check the limit again to see if the reclaim reduced the
         * current usage of the cgroup before giving up.
         */
        if (ret || mem_cgroup_check_under_limit(mem_over_limit))
                return CHARGE_RETRY;

        /*
         * At task move, charge accounts can be doubly counted. So, it's
         * better to wait until the end of task_move if something is going on.
         */
        if (mem_cgroup_wait_acct_move(mem_over_limit))
                return CHARGE_RETRY;

        /* If we don't need to call the oom-killer at all, return immediately */
        if (!oom_check)
                return CHARGE_NOMEM;
        /* check OOM */
        if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
                return CHARGE_OOM_DIE;

        return CHARGE_RETRY;
}

/*
 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
 * the oom-killer can be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
                gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
{
        int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
        struct mem_cgroup *mem = NULL;
        int ret;
        int csize = CHARGE_SIZE;

        /*
         * Unlike the global VM's OOM-kill, we're not in a memory shortage
         * at the system level. So, allow dying processes to go ahead, in
         * addition to MEMDIE processes.
         */
        if (unlikely(test_thread_flag(TIF_MEMDIE)
                     || fatal_signal_pending(current)))
                goto bypass;

        /*
         * We always charge the cgroup the mm_struct belongs to.
         * The mm_struct's mem_cgroup changes on task migration if the
         * thread group leader migrates. It's possible that mm is not
         * set, if so charge the init_mm (happens for pagecache usage).
         */
        if (!*memcg && !mm)
                goto bypass;
again:
        if (*memcg) { /* css should be a valid one */
                mem = *memcg;
                VM_BUG_ON(css_is_removed(&mem->css));
                if (mem_cgroup_is_root(mem))
                        goto done;
                if (consume_stock(mem))
                        goto done;
                css_get(&mem->css);
        } else {
                struct task_struct *p;

                rcu_read_lock();
                p = rcu_dereference(mm->owner);
                /*
                 * Because we don't have task_lock(), "p" can exit.
                 * In that case, "mem" can point to root or p can be NULL with
                 * a race against swapoff. Then, we have a small risk of
                 * mis-accounting. But such mis-accounting due to races always
                 * happens because we don't have cgroup_mutex(). Preventing it
                 * would be overkill, so we allow that small race here.
                 * (*) swapoff et al. will charge against the mm_struct, not
                 * against the task_struct. So, mm->owner can be NULL.
                 */
                mem = mem_cgroup_from_task(p);
                if (!mem || mem_cgroup_is_root(mem)) {
                        rcu_read_unlock();
                        goto done;
                }
                if (consume_stock(mem)) {
                        /*
                         * It seems dangerous to access memcg without
                         * css_get(). But considering how consume_stock()
                         * works, it's not necessary. If consume_stock()
                         * succeeds, some charges from this memcg are cached
                         * on this cpu. So, we don't need to call
                         * css_get()/css_tryget() before calling
                         * consume_stock().
                         */
                        rcu_read_unlock();
                        goto done;
                }
                /* after here, we may be blocked. we need to get refcnt */
                if (!css_tryget(&mem->css)) {
                        rcu_read_unlock();
                        goto again;
                }
                rcu_read_unlock();
        }

        do {
                bool oom_check;

                /* If killed, bypass charge */
                if (fatal_signal_pending(current)) {
                        css_put(&mem->css);
                        goto bypass;
                }

                oom_check = false;
                if (oom && !nr_oom_retries) {
                        oom_check = true;
                        nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
                }

                ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);

                switch (ret) {
                case CHARGE_OK:
                        break;
                case CHARGE_RETRY: /* not in OOM situation but retry */
                        csize = PAGE_SIZE;
                        css_put(&mem->css);
                        mem = NULL;
                        goto again;
                case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
                        css_put(&mem->css);
                        goto nomem;
                case CHARGE_NOMEM: /* OOM routine works */
                        if (!oom) {
                                css_put(&mem->css);
                                goto nomem;
                        }
                        /* If oom, we never return -ENOMEM */
                        nr_oom_retries--;
                        break;
                case CHARGE_OOM_DIE: /* Killed by OOM Killer */
                        css_put(&mem->css);
                        goto bypass;
                }
        } while (ret != CHARGE_OK);

        if (csize > PAGE_SIZE)
                refill_stock(mem, csize - PAGE_SIZE);
        css_put(&mem->css);
done:
        *memcg = mem;
        return 0;
nomem:
        *memcg = NULL;
        return -ENOMEM;
bypass:
        *memcg = NULL;
        return 0;
}

/*
 * Sometimes we have to undo a charge we got by try_charge().
 * This function is for that: it uncharges and puts the css refcnt gotten
 * by try_charge().
 */
static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
                                                        unsigned long count)
{
        if (!mem_cgroup_is_root(mem)) {
                res_counter_uncharge(&mem->res, PAGE_SIZE * count);
                if (do_swap_account)
                        res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
        }
}

static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
{
        __mem_cgroup_cancel_charge(mem, 1);
}

/*
 * A helper function to get a mem_cgroup from an ID. Must be called under
 * rcu_read_lock(). The caller must check css_is_removed() or similar if that
 * is a concern. (Dropping the refcnt from swap can be called against a
 * removed memcg.)
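 * The ID is the css ID stored in the swap_cgroup map (see
 * lookup_swap_cgroup()); ID 0 means no cgroup was recorded and yields NULL.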
1850 */ 1851static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 1852{ 1853 struct cgroup_subsys_state *css; 1854 1855 /* ID 0 is unused ID */ 1856 if (!id) 1857 return NULL; 1858 css = css_lookup(&mem_cgroup_subsys, id); 1859 if (!css) 1860 return NULL; 1861 return container_of(css, struct mem_cgroup, css); 1862} 1863 1864struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 1865{ 1866 struct mem_cgroup *mem = NULL; 1867 struct page_cgroup *pc; 1868 unsigned short id; 1869 swp_entry_t ent; 1870 1871 VM_BUG_ON(!PageLocked(page)); 1872 1873 pc = lookup_page_cgroup(page); 1874 lock_page_cgroup(pc); 1875 if (PageCgroupUsed(pc)) { 1876 mem = pc->mem_cgroup; 1877 if (mem && !css_tryget(&mem->css)) 1878 mem = NULL; 1879 } else if (PageSwapCache(page)) { 1880 ent.val = page_private(page); 1881 id = lookup_swap_cgroup(ent); 1882 rcu_read_lock(); 1883 mem = mem_cgroup_lookup(id); 1884 if (mem && !css_tryget(&mem->css)) 1885 mem = NULL; 1886 rcu_read_unlock(); 1887 } 1888 unlock_page_cgroup(pc); 1889 return mem; 1890} 1891 1892/* 1893 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be 1894 * USED state. If already USED, uncharge and return. 1895 */ 1896 1897static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 1898 struct page_cgroup *pc, 1899 enum charge_type ctype) 1900{ 1901 /* try_charge() can return NULL to *memcg, taking care of it. */ 1902 if (!mem) 1903 return; 1904 1905 lock_page_cgroup(pc); 1906 if (unlikely(PageCgroupUsed(pc))) { 1907 unlock_page_cgroup(pc); 1908 mem_cgroup_cancel_charge(mem); 1909 return; 1910 } 1911 1912 pc->mem_cgroup = mem; 1913 /* 1914 * We access a page_cgroup asynchronously without lock_page_cgroup(). 1915 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 1916 * is accessed after testing USED bit. To make pc->mem_cgroup visible 1917 * before USED bit, we need memory barrier here. 1918 * See mem_cgroup_add_lru_list(), etc. 1919 */ 1920 smp_wmb(); 1921 switch (ctype) { 1922 case MEM_CGROUP_CHARGE_TYPE_CACHE: 1923 case MEM_CGROUP_CHARGE_TYPE_SHMEM: 1924 SetPageCgroupCache(pc); 1925 SetPageCgroupUsed(pc); 1926 break; 1927 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 1928 ClearPageCgroupCache(pc); 1929 SetPageCgroupUsed(pc); 1930 break; 1931 default: 1932 break; 1933 } 1934 1935 mem_cgroup_charge_statistics(mem, pc, true); 1936 1937 unlock_page_cgroup(pc); 1938 /* 1939 * "charge_statistics" updated event counter. Then, check it. 1940 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 1941 * if they exceeds softlimit. 1942 */ 1943 memcg_check_events(mem, pc->page); 1944} 1945 1946/** 1947 * __mem_cgroup_move_account - move account of the page 1948 * @pc: page_cgroup of the page. 1949 * @from: mem_cgroup which the page is moved from. 1950 * @to: mem_cgroup which the page is moved to. @from != @to. 1951 * @uncharge: whether we should call uncharge and css_put against @from. 1952 * 1953 * The caller must confirm following. 1954 * - page is not on LRU (isolate_page() is useful.) 1955 * - the pc is locked, used, and ->mem_cgroup points to @from. 1956 * 1957 * This function doesn't do "charge" nor css_get to new cgroup. It should be 1958 * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is 1959 * true, this function does "uncharge" from old cgroup, but it doesn't if 1960 * @uncharge is false, so a caller should do "uncharge". 
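 * Statistics (FILE_MAPPED and the charge counters) are moved from @from to
 * @to together with the page.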
 */

static void __mem_cgroup_move_account(struct page_cgroup *pc,
	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
	VM_BUG_ON(from == to);
	VM_BUG_ON(PageLRU(pc->page));
	VM_BUG_ON(!PageCgroupLocked(pc));
	VM_BUG_ON(!PageCgroupUsed(pc));
	VM_BUG_ON(pc->mem_cgroup != from);

	if (PageCgroupFileMapped(pc)) {
		/* Update mapped_file data for mem_cgroup */
		preempt_disable();
		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		preempt_enable();
	}
	mem_cgroup_charge_statistics(from, pc, false);
	if (uncharge)
		/* This is not "cancel", but cancel_charge does all we need. */
		mem_cgroup_cancel_charge(from);

	/* caller should have done css_get */
	pc->mem_cgroup = to;
	mem_cgroup_charge_statistics(to, pc, true);
	/*
	 * We charge against "to" which may not have any tasks. Then, "to"
	 * can be under rmdir(). But in the current implementation, the
	 * callers of this function are just force_empty() and move charge,
	 * so it's guaranteed that "to" is never removed. So, we don't check
	 * rmdir status here.
	 */
}

/*
 * Check whether @pc is valid for moving the account and, if so, call
 * __mem_cgroup_move_account().
 */
static int mem_cgroup_move_account(struct page_cgroup *pc,
		struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
	int ret = -EINVAL;
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
		__mem_cgroup_move_account(pc, from, to, uncharge);
		ret = 0;
	}
	unlock_page_cgroup(pc);
	/*
	 * check events
	 */
	memcg_check_events(to, pc->page);
	memcg_check_events(from, pc->page);
	return ret;
}

/*
 * Move charges to the parent.
 */

static int mem_cgroup_move_parent(struct page_cgroup *pc,
				  struct mem_cgroup *child,
				  gfp_t gfp_mask)
{
	struct page *page = pc->page;
	struct cgroup *cg = child->css.cgroup;
	struct cgroup *pcg = cg->parent;
	struct mem_cgroup *parent;
	int ret;

	/* Is ROOT ? */
	if (!pcg)
		return -EINVAL;

	ret = -EBUSY;
	if (!get_page_unless_zero(page))
		goto out;
	if (isolate_lru_page(page))
		goto put;

	parent = mem_cgroup_from_cont(pcg);
	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
	if (ret || !parent)
		goto put_back;

	ret = mem_cgroup_move_account(pc, child, parent, true);
	if (ret)
		mem_cgroup_cancel_charge(parent);
put_back:
	putback_lru_page(page);
put:
	put_page(page);
out:
	return ret;
}

/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype)
{
	struct mem_cgroup *mem = NULL;
	struct page_cgroup *pc;
	int ret;

	pc = lookup_page_cgroup(page);
	/* can happen at boot */
	if (unlikely(!pc))
		return 0;
	prefetchw(pc);

	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
	if (ret || !mem)
		return ret;

	__mem_cgroup_commit_charge(mem, pc, ctype);
	return 0;
}

int mem_cgroup_newpage_charge(struct page *page,
			      struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_disabled())
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * If already mapped, we don't have to account.
	 * If it is page cache, page->mapping has an address_space.
	 * But page->mapping may hold a stale anon_vma pointer;
	 * detect that with the PageAnon() check. A newly-mapped anon page's
	 * page->mapping is NULL.
	 */
	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
					enum charge_type ctype);

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	int ret;

	if (mem_cgroup_disabled())
		return 0;
	if (PageCompound(page))
		return 0;
	/*
	 * Corner case handling. This is usually called from
	 * add_to_page_cache(). But some FS (shmem) precharges this page
	 * before calling it and calls add_to_page_cache() with GFP_NOWAIT.
	 *
	 * For the GFP_NOWAIT case, the page may be pre-charged before calling
	 * add_to_page_cache(). (See shmem.c.) Check it here and avoid
	 * charging twice. (It works but has to pay a slightly larger cost.)
	 * And when the page is SwapCache, it should take swap information
	 * into account. This is under lock_page() now.
	 */
	if (!(gfp_mask & __GFP_WAIT)) {
		struct page_cgroup *pc;

		pc = lookup_page_cgroup(page);
		if (!pc)
			return 0;
		lock_page_cgroup(pc);
		if (PageCgroupUsed(pc)) {
			unlock_page_cgroup(pc);
			return 0;
		}
		unlock_page_cgroup(pc);
	}

	if (unlikely(!mm))
		mm = &init_mm;

	if (page_is_file_cache(page))
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_CACHE);

	/* shmem */
	if (PageSwapCache(page)) {
		struct mem_cgroup *mem = NULL;

		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
		if (!ret)
			__mem_cgroup_commit_charge_swapin(page, mem,
					MEM_CGROUP_CHARGE_TYPE_SHMEM);
	} else
		ret = mem_cgroup_charge_common(page, mm, gfp_mask,
					MEM_CGROUP_CHARGE_TYPE_SHMEM);

	return ret;
}

/*
 * While swap-in, try_charge -> commit or cancel, the page is locked.
 * And when try_charge() successfully returns, one refcnt to the memcg without
 * a struct page_cgroup is acquired. This refcnt will be consumed by
 * "commit()" or removed by "cancel()".
 */
int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
				 struct page *page,
				 gfp_t mask, struct mem_cgroup **ptr)
{
	struct mem_cgroup *mem;
	int ret;

	if (mem_cgroup_disabled())
		return 0;

	if (!do_swap_account)
		goto charge_cur_mm;
	/*
	 * A racing thread's fault, or swapoff, may have already updated
	 * the pte, and even removed page from swap cache: in those cases
	 * do_swap_page()'s pte_same() test will fail; but there's also a
	 * KSM case which does need to charge the page.
	 */
	if (!PageSwapCache(page))
		goto charge_cur_mm;
	mem = try_get_mem_cgroup_from_page(page);
	if (!mem)
		goto charge_cur_mm;
	*ptr = mem;
	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
	css_put(&mem->css);
	return ret;
charge_cur_mm:
	if (unlikely(!mm))
		mm = &init_mm;
	return __mem_cgroup_try_charge(mm, mask, ptr, true);
}

static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
					enum charge_type ctype)
{
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;
	if (!ptr)
		return;
	cgroup_exclude_rmdir(&ptr->css);
	pc = lookup_page_cgroup(page);
	mem_cgroup_lru_del_before_commit_swapcache(page);
	__mem_cgroup_commit_charge(ptr, pc, ctype);
	mem_cgroup_lru_add_after_commit_swapcache(page);
	/*
	 * Now the swap is on-memory. This means this page may be
	 * counted both as mem and swap, i.e. double-counted.
	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
	 * may call delete_from_swap_cache() before we reach here.
	 */
	if (do_swap_account && PageSwapCache(page)) {
		swp_entry_t ent = {.val = page_private(page)};
		unsigned short id;
		struct mem_cgroup *memcg;

		id = swap_cgroup_record(ent, 0);
		rcu_read_lock();
		memcg = mem_cgroup_lookup(id);
		if (memcg) {
			/*
			 * This recorded memcg can be an obsolete one. So,
			 * avoid calling css_tryget().
			 */
			if (!mem_cgroup_is_root(memcg))
				res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
			mem_cgroup_swap_statistics(memcg, false);
			mem_cgroup_put(memcg);
		}
		rcu_read_unlock();
	}
	/*
	 * At swapin, we may charge a cgroup which has no tasks.
	 * So, rmdir()->pre_destroy() can be called while we do this charge.
	 * In that case, we need to call pre_destroy() again. Check it here.
	 */
	cgroup_release_and_wakeup_rmdir(&ptr->css);
}

void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
	__mem_cgroup_commit_charge_swapin(page, ptr,
					MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
	if (mem_cgroup_disabled())
		return;
	if (!mem)
		return;
	mem_cgroup_cancel_charge(mem);
}

static void
__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
{
	struct memcg_batch_info *batch = NULL;
	bool uncharge_memsw = true;
	/* If swapout, usage of swap doesn't decrease */
	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
		uncharge_memsw = false;

	batch = &current->memcg_batch;
	/*
	 * Usually, we do css_get() when we remember a memcg pointer.
	 * But in this case, we keep res->usage until the end of a series of
	 * uncharges. Then, it's OK to ignore the memcg's refcnt.
	 */
	if (!batch->memcg)
		batch->memcg = mem;
	/*
	 * do_batch > 0 when unmapping pages or doing inode invalidate/truncate.
	 * In those cases, all pages freed continuously can be expected to be
	 * in the same cgroup and we have a chance to coalesce uncharges.
	 * But we do uncharge one by one if this task is killed by OOM
	 * (TIF_MEMDIE) because we want to do the uncharge as soon as possible.
	 */

	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
		goto direct_uncharge;

	/*
	 * In the typical case, batch->memcg == mem. This means we can
	 * merge a series of uncharges into one uncharge of the res_counter.
	 * If not, we uncharge the res_counter one by one.
	 */
	if (batch->memcg != mem)
		goto direct_uncharge;
	/* remember freed charge and uncharge it later */
	batch->bytes += PAGE_SIZE;
	if (uncharge_memsw)
		batch->memsw_bytes += PAGE_SIZE;
	return;
direct_uncharge:
	res_counter_uncharge(&mem->res, PAGE_SIZE);
	if (uncharge_memsw)
		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
	if (unlikely(batch->memcg != mem))
		memcg_oom_recover(mem);
	return;
}

/*
 * uncharge if !page_mapped(page)
 */
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (PageSwapCache(page))
		return NULL;

	/*
	 * Check if our page_cgroup is valid
	 */
	pc = lookup_page_cgroup(page);
	if (unlikely(!pc || !PageCgroupUsed(pc)))
		return NULL;

	lock_page_cgroup(pc);

	mem = pc->mem_cgroup;

	if (!PageCgroupUsed(pc))
		goto unlock_out;

	switch (ctype) {
	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
	case MEM_CGROUP_CHARGE_TYPE_DROP:
		/* See mem_cgroup_prepare_migration() */
		if (page_mapped(page) || PageCgroupMigration(pc))
			goto unlock_out;
		break;
	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
		if (!PageAnon(page)) {	/* Shared memory */
			if (page->mapping && !page_is_file_cache(page))
				goto unlock_out;
		} else if (page_mapped(page)) /* Anon */
			goto unlock_out;
		break;
	default:
		break;
	}

	mem_cgroup_charge_statistics(mem, pc, false);

	ClearPageCgroupUsed(pc);
	/*
	 * pc->mem_cgroup is not cleared here. It will be accessed when it's
	 * freed from the LRU. This is safe because an uncharged page is
	 * expected not to be reused (it is freed soon). The exception is
	 * SwapCache, which is handled by special functions.
	 */

	unlock_page_cgroup(pc);
	/*
	 * Even after unlock, we have mem->res.usage here and this memcg
	 * will never be freed.
	 */
	memcg_check_events(mem, page);
	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
		mem_cgroup_swap_statistics(mem, true);
		mem_cgroup_get(mem);
	}
	if (!mem_cgroup_is_root(mem))
		__do_uncharge(mem, ctype);

	return mem;

unlock_out:
	unlock_page_cgroup(pc);
	return NULL;
}

void mem_cgroup_uncharge_page(struct page *page)
{
	/* early check. */
	if (page_mapped(page))
		return;
	if (page->mapping && !PageAnon(page))
		return;
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_uncharge_cache_page(struct page *page)
{
	VM_BUG_ON(page_mapped(page));
	VM_BUG_ON(page->mapping);
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}

/*
 * mem_cgroup_uncharge_start()/end() are called when unmapping a range of
 * pages or during inode invalidate/truncate. In those cases, pages are
 * freed continuously and we can expect them to be in the same memcg.
 * Each of those callers itself limits the number of pages freed at once,
 * so uncharge_start/end() are called properly.
 * This may be called multiple (nested) times in a context.
 */
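/*
 * As a rough usage sketch (the unmap/truncate paths mentioned above are
 * the real callers), the batching works like:
 *
 *	mem_cgroup_uncharge_start();
 *	for each page being unmapped or truncated
 *		mem_cgroup_uncharge_page(page);	(or _cache_page())
 *	mem_cgroup_uncharge_end();
 *
 * so that, in the common case, the res_counter is decremented once per
 * batch rather than once per page, unless the pages turn out to belong
 * to different memcgs.
 */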

void mem_cgroup_uncharge_start(void)
{
	current->memcg_batch.do_batch++;
	/* We can do nest. */
	if (current->memcg_batch.do_batch == 1) {
		current->memcg_batch.memcg = NULL;
		current->memcg_batch.bytes = 0;
		current->memcg_batch.memsw_bytes = 0;
	}
}

void mem_cgroup_uncharge_end(void)
{
	struct memcg_batch_info *batch = &current->memcg_batch;

	if (!batch->do_batch)
		return;

	batch->do_batch--;
	if (batch->do_batch) /* If stacked, do nothing. */
		return;

	if (!batch->memcg)
		return;
	/*
	 * This "batch->memcg" is valid without any css_get/put, etc.,
	 * because we hide the charges behind us.
	 */
	if (batch->bytes)
		res_counter_uncharge(&batch->memcg->res, batch->bytes);
	if (batch->memsw_bytes)
		res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
	memcg_oom_recover(batch->memcg);
	/* forget this pointer (for sanity check) */
	batch->memcg = NULL;
}

#ifdef CONFIG_SWAP
/*
 * Called after __delete_from_swap_cache() to drop the "page" account.
 * The memcg information is recorded in the swap_cgroup of "ent".
 */
void
mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
{
	struct mem_cgroup *memcg;
	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;

	if (!swapout) /* this was a swap cache but the swap is unused ! */
		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;

	memcg = __mem_cgroup_uncharge_common(page, ctype);

	/*
	 * Record the memcg information; if swapout && memcg != NULL,
	 * mem_cgroup_get() was called in uncharge().
	 */
	if (do_swap_account && swapout && memcg)
		swap_cgroup_record(ent, css_id(&memcg->css));
}
#endif

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/*
 * Called from swap_entry_free(). Remove the record in swap_cgroup and
 * uncharge the "memsw" account.
 */
void mem_cgroup_uncharge_swap(swp_entry_t ent)
{
	struct mem_cgroup *memcg;
	unsigned short id;

	if (!do_swap_account)
		return;

	id = swap_cgroup_record(ent, 0);
	rcu_read_lock();
	memcg = mem_cgroup_lookup(id);
	if (memcg) {
		/*
		 * We uncharge this because the swap is freed.
		 * This memcg can be an obsolete one. We avoid calling
		 * css_tryget().
		 */
		if (!mem_cgroup_is_root(memcg))
			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
		mem_cgroup_swap_statistics(memcg, false);
		mem_cgroup_put(memcg);
	}
	rcu_read_unlock();
}

/**
 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
 * @entry: swap entry to be moved
 * @from: mem_cgroup which the entry is moved from
 * @to: mem_cgroup which the entry is moved to
 * @need_fixup: whether we should fixup res_counters and refcounts.
 *
 * It succeeds only when the swap_cgroup's record for this entry is the same
 * as the mem_cgroup's id of @from.
 *
 * Returns 0 on success, -EINVAL on failure.
 *
 * The caller must have charged to @to, IOW, called res_counter_charge() for
 * both res and memsw, and called css_get().
 */
static int mem_cgroup_move_swap_account(swp_entry_t entry,
		struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
{
	unsigned short old_id, new_id;

	old_id = css_id(&from->css);
	new_id = css_id(&to->css);

	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
		mem_cgroup_swap_statistics(from, false);
		mem_cgroup_swap_statistics(to, true);
		/*
		 * This function is only called from task migration context now.
		 * It postpones res_counter and refcount handling till the end
		 * of task migration (mem_cgroup_clear_mc()) for performance
		 * improvement. But we cannot postpone mem_cgroup_get(to)
		 * because if the process that has been moved to @to does
		 * swap-in, the refcount of @to might be decreased to 0.
		 */
		mem_cgroup_get(to);
		if (need_fixup) {
			if (!mem_cgroup_is_root(from))
				res_counter_uncharge(&from->memsw, PAGE_SIZE);
			mem_cgroup_put(from);
			/*
			 * We charged both to->res and to->memsw, so we should
			 * uncharge to->res.
			 */
			if (!mem_cgroup_is_root(to))
				res_counter_uncharge(&to->res, PAGE_SIZE);
		}
		return 0;
	}
	return -EINVAL;
}
#else
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
		struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
{
	return -EINVAL;
}
#endif

/*
 * Before starting migration, account PAGE_SIZE to the mem_cgroup that the
 * old page belongs to.
 */
int mem_cgroup_prepare_migration(struct page *page,
	struct page *newpage, struct mem_cgroup **ptr)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;
	enum charge_type ctype;
	int ret = 0;

	if (mem_cgroup_disabled())
		return 0;

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		mem = pc->mem_cgroup;
		css_get(&mem->css);
		/*
		 * When migrating an anonymous page, its mapcount goes down
		 * to 0 and uncharge() will be called. But, even if it's fully
		 * unmapped, migration may fail and this page would then have
		 * to be charged again. We set the MIGRATION flag here and
		 * delay the uncharge until end_migration() is called.
		 *
		 * Corner Case Thinking
		 * A)
		 * The old page was mapped as Anon and it was unmapped and
		 * freed while migration was ongoing.
		 * If unmap finds the old page, uncharge() of it will be
		 * delayed until end_migration(). If unmap finds the new page,
		 * it's uncharged when its mapcount goes from 1 to 0. If the
		 * unmap code finds a swap_migration_entry, the new page will
		 * not be mapped and end_migration() will find it
		 * (mapcount == 0).
		 *
		 * B)
		 * The old page was mapped but migration fails; the kernel
		 * remaps it. A charge for it is kept by the MIGRATION flag
		 * even if its mapcount goes down to 0. We can do the remap
		 * successfully without charging it again.
		 *
		 * C)
		 * The "old" page is under lock_page() until the end of
		 * migration, so the old page itself will not be swapped out.
		 * If the new page is swapped out before end_migration(), our
		 * hook into the usual swap-out path will catch the event.
		 */
		if (PageAnon(page))
			SetPageCgroupMigration(pc);
	}
	unlock_page_cgroup(pc);
	/*
	 * If the page is not charged at this point,
	 * we return here.
	 */
	if (!mem)
		return 0;

	*ptr = mem;
	ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
	css_put(&mem->css);	/* drop extra refcnt */
	if (ret || *ptr == NULL) {
		if (PageAnon(page)) {
			lock_page_cgroup(pc);
			ClearPageCgroupMigration(pc);
			unlock_page_cgroup(pc);
			/*
			 * The old page may be fully unmapped while we kept it.
			 */
			mem_cgroup_uncharge_page(page);
		}
		return -ENOMEM;
	}
	/*
	 * We charge the new page before it's used/mapped. So, even if
	 * unlock_page() is called before end_migration(), we can catch all
	 * events on this new page. In the case the new page is migrated but
	 * not remapped, the new page's mapcount will finally be 0 and we
	 * call uncharge in end_migration().
	 */
	pc = lookup_page_cgroup(newpage);
	if (PageAnon(page))
		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
	else if (page_is_file_cache(page))
		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
	else
		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
	__mem_cgroup_commit_charge(mem, pc, ctype);
	return ret;
}

/* remove redundant charge if migration failed */
void mem_cgroup_end_migration(struct mem_cgroup *mem,
	struct page *oldpage, struct page *newpage)
{
	struct page *used, *unused;
	struct page_cgroup *pc;

	if (!mem)
		return;
	/* blocks rmdir() */
	cgroup_exclude_rmdir(&mem->css);
	/* at migration success, oldpage->mapping is NULL. */
	if (oldpage->mapping) {
		used = oldpage;
		unused = newpage;
	} else {
		used = newpage;
		unused = oldpage;
	}
	/*
	 * We disallowed uncharging pages under migration because the mapcount
	 * of the page goes down to zero, temporarily.
	 * Clear the flag and check whether the page should still be charged.
	 */
	pc = lookup_page_cgroup(oldpage);
	lock_page_cgroup(pc);
	ClearPageCgroupMigration(pc);
	unlock_page_cgroup(pc);

	__mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);

	/*
	 * If a page is file cache, the radix-tree replacement is done
	 * atomically and we can skip this check. When it was an Anon page,
	 * its mapcount goes down to 0. But because we added the MIGRATION
	 * flag, it's not uncharged yet. There are several cases, but the
	 * page->mapcount check and the USED bit check in
	 * mem_cgroup_uncharge_page() will do enough checking.
	 * (see prepare_charge() also)
	 */
	if (PageAnon(used))
		mem_cgroup_uncharge_page(used);
	/*
	 * At migration, we may charge a cgroup which has no tasks.
	 * So, rmdir()->pre_destroy() can be called while we do this charge.
	 * In that case, we need to call pre_destroy() again. Check it here.
	 */
	cgroup_release_and_wakeup_rmdir(&mem->css);
}

/*
 * A call to try to shrink memory usage on charge failure at shmem's swapin.
 * Calling hierarchical_reclaim is not enough because we should update
 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
2717 * Moreover considering hierarchy, we should reclaim from the mem_over_limit, 2718 * not from the memcg which this page would be charged to. 2719 * try_charge_swapin does all of these works properly. 2720 */ 2721int mem_cgroup_shmem_charge_fallback(struct page *page, 2722 struct mm_struct *mm, 2723 gfp_t gfp_mask) 2724{ 2725 struct mem_cgroup *mem = NULL; 2726 int ret; 2727 2728 if (mem_cgroup_disabled()) 2729 return 0; 2730 2731 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2732 if (!ret) 2733 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ 2734 2735 return ret; 2736} 2737 2738static DEFINE_MUTEX(set_limit_mutex); 2739 2740static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 2741 unsigned long long val) 2742{ 2743 int retry_count; 2744 u64 memswlimit, memlimit; 2745 int ret = 0; 2746 int children = mem_cgroup_count_children(memcg); 2747 u64 curusage, oldusage; 2748 int enlarge; 2749 2750 /* 2751 * For keeping hierarchical_reclaim simple, how long we should retry 2752 * is depends on callers. We set our retry-count to be function 2753 * of # of children which we should visit in this loop. 2754 */ 2755 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 2756 2757 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2758 2759 enlarge = 0; 2760 while (retry_count) { 2761 if (signal_pending(current)) { 2762 ret = -EINTR; 2763 break; 2764 } 2765 /* 2766 * Rather than hide all in some function, I do this in 2767 * open coded manner. You see what this really does. 2768 * We have to guarantee mem->res.limit < mem->memsw.limit. 2769 */ 2770 mutex_lock(&set_limit_mutex); 2771 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2772 if (memswlimit < val) { 2773 ret = -EINVAL; 2774 mutex_unlock(&set_limit_mutex); 2775 break; 2776 } 2777 2778 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 2779 if (memlimit < val) 2780 enlarge = 1; 2781 2782 ret = res_counter_set_limit(&memcg->res, val); 2783 if (!ret) { 2784 if (memswlimit == val) 2785 memcg->memsw_is_minimum = true; 2786 else 2787 memcg->memsw_is_minimum = false; 2788 } 2789 mutex_unlock(&set_limit_mutex); 2790 2791 if (!ret) 2792 break; 2793 2794 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 2795 MEM_CGROUP_RECLAIM_SHRINK); 2796 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2797 /* Usage is reduced ? */ 2798 if (curusage >= oldusage) 2799 retry_count--; 2800 else 2801 oldusage = curusage; 2802 } 2803 if (!ret && enlarge) 2804 memcg_oom_recover(memcg); 2805 2806 return ret; 2807} 2808 2809static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 2810 unsigned long long val) 2811{ 2812 int retry_count; 2813 u64 memlimit, memswlimit, oldusage, curusage; 2814 int children = mem_cgroup_count_children(memcg); 2815 int ret = -EBUSY; 2816 int enlarge = 0; 2817 2818 /* see mem_cgroup_resize_res_limit */ 2819 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 2820 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2821 while (retry_count) { 2822 if (signal_pending(current)) { 2823 ret = -EINTR; 2824 break; 2825 } 2826 /* 2827 * Rather than hide all in some function, I do this in 2828 * open coded manner. You see what this really does. 2829 * We have to guarantee mem->res.limit < mem->memsw.limit. 
2830 */ 2831 mutex_lock(&set_limit_mutex); 2832 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 2833 if (memlimit > val) { 2834 ret = -EINVAL; 2835 mutex_unlock(&set_limit_mutex); 2836 break; 2837 } 2838 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2839 if (memswlimit < val) 2840 enlarge = 1; 2841 ret = res_counter_set_limit(&memcg->memsw, val); 2842 if (!ret) { 2843 if (memlimit == val) 2844 memcg->memsw_is_minimum = true; 2845 else 2846 memcg->memsw_is_minimum = false; 2847 } 2848 mutex_unlock(&set_limit_mutex); 2849 2850 if (!ret) 2851 break; 2852 2853 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 2854 MEM_CGROUP_RECLAIM_NOSWAP | 2855 MEM_CGROUP_RECLAIM_SHRINK); 2856 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2857 /* Usage is reduced ? */ 2858 if (curusage >= oldusage) 2859 retry_count--; 2860 else 2861 oldusage = curusage; 2862 } 2863 if (!ret && enlarge) 2864 memcg_oom_recover(memcg); 2865 return ret; 2866} 2867 2868unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 2869 gfp_t gfp_mask) 2870{ 2871 unsigned long nr_reclaimed = 0; 2872 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 2873 unsigned long reclaimed; 2874 int loop = 0; 2875 struct mem_cgroup_tree_per_zone *mctz; 2876 unsigned long long excess; 2877 2878 if (order > 0) 2879 return 0; 2880 2881 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 2882 /* 2883 * This loop can run a while, specially if mem_cgroup's continuously 2884 * keep exceeding their soft limit and putting the system under 2885 * pressure 2886 */ 2887 do { 2888 if (next_mz) 2889 mz = next_mz; 2890 else 2891 mz = mem_cgroup_largest_soft_limit_node(mctz); 2892 if (!mz) 2893 break; 2894 2895 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 2896 gfp_mask, 2897 MEM_CGROUP_RECLAIM_SOFT); 2898 nr_reclaimed += reclaimed; 2899 spin_lock(&mctz->lock); 2900 2901 /* 2902 * If we failed to reclaim anything from this memory cgroup 2903 * it is time to move on to the next cgroup 2904 */ 2905 next_mz = NULL; 2906 if (!reclaimed) { 2907 do { 2908 /* 2909 * Loop until we find yet another one. 2910 * 2911 * By the time we get the soft_limit lock 2912 * again, someone might have aded the 2913 * group back on the RB tree. Iterate to 2914 * make sure we get a different mem. 2915 * mem_cgroup_largest_soft_limit_node returns 2916 * NULL if no other cgroup is present on 2917 * the tree 2918 */ 2919 next_mz = 2920 __mem_cgroup_largest_soft_limit_node(mctz); 2921 if (next_mz == mz) { 2922 css_put(&next_mz->mem->css); 2923 next_mz = NULL; 2924 } else /* next_mz == NULL or other memcg */ 2925 break; 2926 } while (1); 2927 } 2928 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 2929 excess = res_counter_soft_limit_excess(&mz->mem->res); 2930 /* 2931 * One school of thought says that we should not add 2932 * back the node to the tree if reclaim returns 0. 2933 * But our reclaim could return 0, simply because due 2934 * to priority we are exposing a smaller subset of 2935 * memory to reclaim from. Consider this as a longer 2936 * term TODO. 2937 */ 2938 /* If excess == 0, no tree ops */ 2939 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 2940 spin_unlock(&mctz->lock); 2941 css_put(&mz->mem->css); 2942 loop++; 2943 /* 2944 * Could not reclaim anything and there are no more 2945 * mem cgroups to try or we seem to be looping without 2946 * reclaiming anything. 
2947 */ 2948 if (!nr_reclaimed && 2949 (next_mz == NULL || 2950 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 2951 break; 2952 } while (!nr_reclaimed); 2953 if (next_mz) 2954 css_put(&next_mz->mem->css); 2955 return nr_reclaimed; 2956} 2957 2958/* 2959 * This routine traverse page_cgroup in given list and drop them all. 2960 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 2961 */ 2962static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 2963 int node, int zid, enum lru_list lru) 2964{ 2965 struct zone *zone; 2966 struct mem_cgroup_per_zone *mz; 2967 struct page_cgroup *pc, *busy; 2968 unsigned long flags, loop; 2969 struct list_head *list; 2970 int ret = 0; 2971 2972 zone = &NODE_DATA(node)->node_zones[zid]; 2973 mz = mem_cgroup_zoneinfo(mem, node, zid); 2974 list = &mz->lists[lru]; 2975 2976 loop = MEM_CGROUP_ZSTAT(mz, lru); 2977 /* give some margin against EBUSY etc...*/ 2978 loop += 256; 2979 busy = NULL; 2980 while (loop--) { 2981 ret = 0; 2982 spin_lock_irqsave(&zone->lru_lock, flags); 2983 if (list_empty(list)) { 2984 spin_unlock_irqrestore(&zone->lru_lock, flags); 2985 break; 2986 } 2987 pc = list_entry(list->prev, struct page_cgroup, lru); 2988 if (busy == pc) { 2989 list_move(&pc->lru, list); 2990 busy = NULL; 2991 spin_unlock_irqrestore(&zone->lru_lock, flags); 2992 continue; 2993 } 2994 spin_unlock_irqrestore(&zone->lru_lock, flags); 2995 2996 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); 2997 if (ret == -ENOMEM) 2998 break; 2999 3000 if (ret == -EBUSY || ret == -EINVAL) { 3001 /* found lock contention or "pc" is obsolete. */ 3002 busy = pc; 3003 cond_resched(); 3004 } else 3005 busy = NULL; 3006 } 3007 3008 if (!ret && !list_empty(list)) 3009 return -EBUSY; 3010 return ret; 3011} 3012 3013/* 3014 * make mem_cgroup's charge to be 0 if there is no task. 3015 * This enables deleting this mem_cgroup. 3016 */ 3017static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 3018{ 3019 int ret; 3020 int node, zid, shrink; 3021 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3022 struct cgroup *cgrp = mem->css.cgroup; 3023 3024 css_get(&mem->css); 3025 3026 shrink = 0; 3027 /* should free all ? */ 3028 if (free_all) 3029 goto try_to_free; 3030move_account: 3031 do { 3032 ret = -EBUSY; 3033 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3034 goto out; 3035 ret = -EINTR; 3036 if (signal_pending(current)) 3037 goto out; 3038 /* This is for making all *used* pages to be on LRU. */ 3039 lru_add_drain_all(); 3040 drain_all_stock_sync(); 3041 ret = 0; 3042 for_each_node_state(node, N_HIGH_MEMORY) { 3043 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3044 enum lru_list l; 3045 for_each_lru(l) { 3046 ret = mem_cgroup_force_empty_list(mem, 3047 node, zid, l); 3048 if (ret) 3049 break; 3050 } 3051 } 3052 if (ret) 3053 break; 3054 } 3055 memcg_oom_recover(mem); 3056 /* it seems parent cgroup doesn't have enough mem */ 3057 if (ret == -ENOMEM) 3058 goto try_to_free; 3059 cond_resched(); 3060 /* "ret" should also be checked to ensure all lists are empty. */ 3061 } while (mem->res.usage > 0 || ret); 3062out: 3063 css_put(&mem->css); 3064 return ret; 3065 3066try_to_free: 3067 /* returns EBUSY if there is a task or if we come here twice. 
*/ 3068 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 3069 ret = -EBUSY; 3070 goto out; 3071 } 3072 /* we call try-to-free pages for make this cgroup empty */ 3073 lru_add_drain_all(); 3074 /* try to free all pages in this cgroup */ 3075 shrink = 1; 3076 while (nr_retries && mem->res.usage > 0) { 3077 int progress; 3078 3079 if (signal_pending(current)) { 3080 ret = -EINTR; 3081 goto out; 3082 } 3083 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 3084 false, get_swappiness(mem)); 3085 if (!progress) { 3086 nr_retries--; 3087 /* maybe some writeback is necessary */ 3088 congestion_wait(BLK_RW_ASYNC, HZ/10); 3089 } 3090 3091 } 3092 lru_add_drain(); 3093 /* try move_account...there may be some *locked* pages. */ 3094 goto move_account; 3095} 3096 3097int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3098{ 3099 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3100} 3101 3102 3103static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 3104{ 3105 return mem_cgroup_from_cont(cont)->use_hierarchy; 3106} 3107 3108static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 3109 u64 val) 3110{ 3111 int retval = 0; 3112 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3113 struct cgroup *parent = cont->parent; 3114 struct mem_cgroup *parent_mem = NULL; 3115 3116 if (parent) 3117 parent_mem = mem_cgroup_from_cont(parent); 3118 3119 cgroup_lock(); 3120 /* 3121 * If parent's use_hierarchy is set, we can't make any modifications 3122 * in the child subtrees. If it is unset, then the change can 3123 * occur, provided the current cgroup has no children. 3124 * 3125 * For the root cgroup, parent_mem is NULL, we allow value to be 3126 * set if there are no children. 
3127 */ 3128 if ((!parent_mem || !parent_mem->use_hierarchy) && 3129 (val == 1 || val == 0)) { 3130 if (list_empty(&cont->children)) 3131 mem->use_hierarchy = val; 3132 else 3133 retval = -EBUSY; 3134 } else 3135 retval = -EINVAL; 3136 cgroup_unlock(); 3137 3138 return retval; 3139} 3140 3141struct mem_cgroup_idx_data { 3142 s64 val; 3143 enum mem_cgroup_stat_index idx; 3144}; 3145 3146static int 3147mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 3148{ 3149 struct mem_cgroup_idx_data *d = data; 3150 d->val += mem_cgroup_read_stat(mem, d->idx); 3151 return 0; 3152} 3153 3154static void 3155mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 3156 enum mem_cgroup_stat_index idx, s64 *val) 3157{ 3158 struct mem_cgroup_idx_data d; 3159 d.idx = idx; 3160 d.val = 0; 3161 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); 3162 *val = d.val; 3163} 3164 3165static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3166{ 3167 u64 idx_val, val; 3168 3169 if (!mem_cgroup_is_root(mem)) { 3170 if (!swap) 3171 return res_counter_read_u64(&mem->res, RES_USAGE); 3172 else 3173 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3174 } 3175 3176 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); 3177 val = idx_val; 3178 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val); 3179 val += idx_val; 3180 3181 if (swap) { 3182 mem_cgroup_get_recursive_idx_stat(mem, 3183 MEM_CGROUP_STAT_SWAPOUT, &idx_val); 3184 val += idx_val; 3185 } 3186 3187 return val << PAGE_SHIFT; 3188} 3189 3190static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3191{ 3192 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3193 u64 val; 3194 int type, name; 3195 3196 type = MEMFILE_TYPE(cft->private); 3197 name = MEMFILE_ATTR(cft->private); 3198 switch (type) { 3199 case _MEM: 3200 if (name == RES_USAGE) 3201 val = mem_cgroup_usage(mem, false); 3202 else 3203 val = res_counter_read_u64(&mem->res, name); 3204 break; 3205 case _MEMSWAP: 3206 if (name == RES_USAGE) 3207 val = mem_cgroup_usage(mem, true); 3208 else 3209 val = res_counter_read_u64(&mem->memsw, name); 3210 break; 3211 default: 3212 BUG(); 3213 break; 3214 } 3215 return val; 3216} 3217/* 3218 * The user of this function is... 3219 * RES_LIMIT. 3220 */ 3221static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 3222 const char *buffer) 3223{ 3224 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3225 int type, name; 3226 unsigned long long val; 3227 int ret; 3228 3229 type = MEMFILE_TYPE(cft->private); 3230 name = MEMFILE_ATTR(cft->private); 3231 switch (name) { 3232 case RES_LIMIT: 3233 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3234 ret = -EINVAL; 3235 break; 3236 } 3237 /* This function does all necessary parse...reuse it */ 3238 ret = res_counter_memparse_write_strategy(buffer, &val); 3239 if (ret) 3240 break; 3241 if (type == _MEM) 3242 ret = mem_cgroup_resize_limit(memcg, val); 3243 else 3244 ret = mem_cgroup_resize_memsw_limit(memcg, val); 3245 break; 3246 case RES_SOFT_LIMIT: 3247 ret = res_counter_memparse_write_strategy(buffer, &val); 3248 if (ret) 3249 break; 3250 /* 3251 * For memsw, soft limits are hard to implement in terms 3252 * of semantics, for now, we support soft limits for 3253 * control without swap 3254 */ 3255 if (type == _MEM) 3256 ret = res_counter_set_soft_limit(&memcg->res, val); 3257 else 3258 ret = -EINVAL; 3259 break; 3260 default: 3261 ret = -EINVAL; /* should be BUG() ? 
*/ 3262 break; 3263 } 3264 return ret; 3265} 3266 3267static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 3268 unsigned long long *mem_limit, unsigned long long *memsw_limit) 3269{ 3270 struct cgroup *cgroup; 3271 unsigned long long min_limit, min_memsw_limit, tmp; 3272 3273 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3274 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3275 cgroup = memcg->css.cgroup; 3276 if (!memcg->use_hierarchy) 3277 goto out; 3278 3279 while (cgroup->parent) { 3280 cgroup = cgroup->parent; 3281 memcg = mem_cgroup_from_cont(cgroup); 3282 if (!memcg->use_hierarchy) 3283 break; 3284 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 3285 min_limit = min(min_limit, tmp); 3286 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3287 min_memsw_limit = min(min_memsw_limit, tmp); 3288 } 3289out: 3290 *mem_limit = min_limit; 3291 *memsw_limit = min_memsw_limit; 3292 return; 3293} 3294 3295static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3296{ 3297 struct mem_cgroup *mem; 3298 int type, name; 3299 3300 mem = mem_cgroup_from_cont(cont); 3301 type = MEMFILE_TYPE(event); 3302 name = MEMFILE_ATTR(event); 3303 switch (name) { 3304 case RES_MAX_USAGE: 3305 if (type == _MEM) 3306 res_counter_reset_max(&mem->res); 3307 else 3308 res_counter_reset_max(&mem->memsw); 3309 break; 3310 case RES_FAILCNT: 3311 if (type == _MEM) 3312 res_counter_reset_failcnt(&mem->res); 3313 else 3314 res_counter_reset_failcnt(&mem->memsw); 3315 break; 3316 } 3317 3318 return 0; 3319} 3320 3321static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 3322 struct cftype *cft) 3323{ 3324 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 3325} 3326 3327#ifdef CONFIG_MMU 3328static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3329 struct cftype *cft, u64 val) 3330{ 3331 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3332 3333 if (val >= (1 << NR_MOVE_TYPE)) 3334 return -EINVAL; 3335 /* 3336 * We check this value several times in both in can_attach() and 3337 * attach(), so we need cgroup lock to prevent this value from being 3338 * inconsistent. 
3339 */ 3340 cgroup_lock(); 3341 mem->move_charge_at_immigrate = val; 3342 cgroup_unlock(); 3343 3344 return 0; 3345} 3346#else 3347static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3348 struct cftype *cft, u64 val) 3349{ 3350 return -ENOSYS; 3351} 3352#endif 3353 3354 3355/* For read statistics */ 3356enum { 3357 MCS_CACHE, 3358 MCS_RSS, 3359 MCS_FILE_MAPPED, 3360 MCS_PGPGIN, 3361 MCS_PGPGOUT, 3362 MCS_SWAP, 3363 MCS_INACTIVE_ANON, 3364 MCS_ACTIVE_ANON, 3365 MCS_INACTIVE_FILE, 3366 MCS_ACTIVE_FILE, 3367 MCS_UNEVICTABLE, 3368 NR_MCS_STAT, 3369}; 3370 3371struct mcs_total_stat { 3372 s64 stat[NR_MCS_STAT]; 3373}; 3374 3375struct { 3376 char *local_name; 3377 char *total_name; 3378} memcg_stat_strings[NR_MCS_STAT] = { 3379 {"cache", "total_cache"}, 3380 {"rss", "total_rss"}, 3381 {"mapped_file", "total_mapped_file"}, 3382 {"pgpgin", "total_pgpgin"}, 3383 {"pgpgout", "total_pgpgout"}, 3384 {"swap", "total_swap"}, 3385 {"inactive_anon", "total_inactive_anon"}, 3386 {"active_anon", "total_active_anon"}, 3387 {"inactive_file", "total_inactive_file"}, 3388 {"active_file", "total_active_file"}, 3389 {"unevictable", "total_unevictable"} 3390}; 3391 3392 3393static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) 3394{ 3395 struct mcs_total_stat *s = data; 3396 s64 val; 3397 3398 /* per cpu stat */ 3399 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 3400 s->stat[MCS_CACHE] += val * PAGE_SIZE; 3401 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 3402 s->stat[MCS_RSS] += val * PAGE_SIZE; 3403 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 3404 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 3405 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); 3406 s->stat[MCS_PGPGIN] += val; 3407 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); 3408 s->stat[MCS_PGPGOUT] += val; 3409 if (do_swap_account) { 3410 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3411 s->stat[MCS_SWAP] += val * PAGE_SIZE; 3412 } 3413 3414 /* per zone stat */ 3415 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 3416 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 3417 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); 3418 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 3419 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); 3420 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 3421 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); 3422 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 3423 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 3424 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 3425 return 0; 3426} 3427 3428static void 3429mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 3430{ 3431 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); 3432} 3433 3434static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 3435 struct cgroup_map_cb *cb) 3436{ 3437 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 3438 struct mcs_total_stat mystat; 3439 int i; 3440 3441 memset(&mystat, 0, sizeof(mystat)); 3442 mem_cgroup_get_local_stat(mem_cont, &mystat); 3443 3444 for (i = 0; i < NR_MCS_STAT; i++) { 3445 if (i == MCS_SWAP && !do_swap_account) 3446 continue; 3447 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 3448 } 3449 3450 /* Hierarchical information */ 3451 { 3452 unsigned long long limit, memsw_limit; 3453 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 3454 cb->fill(cb, "hierarchical_memory_limit", limit); 3455 if 
(do_swap_account) 3456 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 3457 } 3458 3459 memset(&mystat, 0, sizeof(mystat)); 3460 mem_cgroup_get_total_stat(mem_cont, &mystat); 3461 for (i = 0; i < NR_MCS_STAT; i++) { 3462 if (i == MCS_SWAP && !do_swap_account) 3463 continue; 3464 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 3465 } 3466 3467#ifdef CONFIG_DEBUG_VM 3468 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 3469 3470 { 3471 int nid, zid; 3472 struct mem_cgroup_per_zone *mz; 3473 unsigned long recent_rotated[2] = {0, 0}; 3474 unsigned long recent_scanned[2] = {0, 0}; 3475 3476 for_each_online_node(nid) 3477 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3478 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 3479 3480 recent_rotated[0] += 3481 mz->reclaim_stat.recent_rotated[0]; 3482 recent_rotated[1] += 3483 mz->reclaim_stat.recent_rotated[1]; 3484 recent_scanned[0] += 3485 mz->reclaim_stat.recent_scanned[0]; 3486 recent_scanned[1] += 3487 mz->reclaim_stat.recent_scanned[1]; 3488 } 3489 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 3490 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 3491 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 3492 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 3493 } 3494#endif 3495 3496 return 0; 3497} 3498 3499static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 3500{ 3501 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3502 3503 return get_swappiness(memcg); 3504} 3505 3506static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 3507 u64 val) 3508{ 3509 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3510 struct mem_cgroup *parent; 3511 3512 if (val > 100) 3513 return -EINVAL; 3514 3515 if (cgrp->parent == NULL) 3516 return -EINVAL; 3517 3518 parent = mem_cgroup_from_cont(cgrp->parent); 3519 3520 cgroup_lock(); 3521 3522 /* If under hierarchy, only empty-root can set this value */ 3523 if ((parent->use_hierarchy) || 3524 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 3525 cgroup_unlock(); 3526 return -EINVAL; 3527 } 3528 3529 spin_lock(&memcg->reclaim_param_lock); 3530 memcg->swappiness = val; 3531 spin_unlock(&memcg->reclaim_param_lock); 3532 3533 cgroup_unlock(); 3534 3535 return 0; 3536} 3537 3538static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3539{ 3540 struct mem_cgroup_threshold_ary *t; 3541 u64 usage; 3542 int i; 3543 3544 rcu_read_lock(); 3545 if (!swap) 3546 t = rcu_dereference(memcg->thresholds.primary); 3547 else 3548 t = rcu_dereference(memcg->memsw_thresholds.primary); 3549 3550 if (!t) 3551 goto unlock; 3552 3553 usage = mem_cgroup_usage(memcg, swap); 3554 3555 /* 3556 * current_threshold points to threshold just below usage. 3557 * If it's not true, a threshold was crossed after last 3558 * call of __mem_cgroup_threshold(). 3559 */ 3560 i = t->current_threshold; 3561 3562 /* 3563 * Iterate backward over array of thresholds starting from 3564 * current_threshold and check if a threshold is crossed. 3565 * If none of thresholds below usage is crossed, we read 3566 * only one element of the array here. 3567 */ 3568 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3569 eventfd_signal(t->entries[i].eventfd, 1); 3570 3571 /* i = current_threshold + 1 */ 3572 i++; 3573 3574 /* 3575 * Iterate forward over array of thresholds starting from 3576 * current_threshold+1 and check if a threshold is crossed. 
3577 * If none of thresholds above usage is crossed, we read 3578 * only one element of the array here. 3579 */ 3580 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 3581 eventfd_signal(t->entries[i].eventfd, 1); 3582 3583 /* Update current_threshold */ 3584 t->current_threshold = i - 1; 3585unlock: 3586 rcu_read_unlock(); 3587} 3588 3589static void mem_cgroup_threshold(struct mem_cgroup *memcg) 3590{ 3591 while (memcg) { 3592 __mem_cgroup_threshold(memcg, false); 3593 if (do_swap_account) 3594 __mem_cgroup_threshold(memcg, true); 3595 3596 memcg = parent_mem_cgroup(memcg); 3597 } 3598} 3599 3600static int compare_thresholds(const void *a, const void *b) 3601{ 3602 const struct mem_cgroup_threshold *_a = a; 3603 const struct mem_cgroup_threshold *_b = b; 3604 3605 return _a->threshold - _b->threshold; 3606} 3607 3608static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) 3609{ 3610 struct mem_cgroup_eventfd_list *ev; 3611 3612 list_for_each_entry(ev, &mem->oom_notify, list) 3613 eventfd_signal(ev->eventfd, 1); 3614 return 0; 3615} 3616 3617static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 3618{ 3619 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); 3620} 3621 3622static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 3623 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 3624{ 3625 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3626 struct mem_cgroup_thresholds *thresholds; 3627 struct mem_cgroup_threshold_ary *new; 3628 int type = MEMFILE_TYPE(cft->private); 3629 u64 threshold, usage; 3630 int i, size, ret; 3631 3632 ret = res_counter_memparse_write_strategy(args, &threshold); 3633 if (ret) 3634 return ret; 3635 3636 mutex_lock(&memcg->thresholds_lock); 3637 3638 if (type == _MEM) 3639 thresholds = &memcg->thresholds; 3640 else if (type == _MEMSWAP) 3641 thresholds = &memcg->memsw_thresholds; 3642 else 3643 BUG(); 3644 3645 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 3646 3647 /* Check if a threshold crossed before adding a new one */ 3648 if (thresholds->primary) 3649 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3650 3651 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 3652 3653 /* Allocate memory for new array of thresholds */ 3654 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 3655 GFP_KERNEL); 3656 if (!new) { 3657 ret = -ENOMEM; 3658 goto unlock; 3659 } 3660 new->size = size; 3661 3662 /* Copy thresholds (if any) to new array */ 3663 if (thresholds->primary) { 3664 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 3665 sizeof(struct mem_cgroup_threshold)); 3666 } 3667 3668 /* Add new threshold */ 3669 new->entries[size - 1].eventfd = eventfd; 3670 new->entries[size - 1].threshold = threshold; 3671 3672 /* Sort thresholds. Registering of new threshold isn't time-critical */ 3673 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 3674 compare_thresholds, NULL); 3675 3676 /* Find current threshold */ 3677 new->current_threshold = -1; 3678 for (i = 0; i < size; i++) { 3679 if (new->entries[i].threshold < usage) { 3680 /* 3681 * new->current_threshold will not be used until 3682 * rcu_assign_pointer(), so it's safe to increment 3683 * it here. 
3684 */ 3685 ++new->current_threshold; 3686 } 3687 } 3688 3689 /* Free old spare buffer and save old primary buffer as spare */ 3690 kfree(thresholds->spare); 3691 thresholds->spare = thresholds->primary; 3692 3693 rcu_assign_pointer(thresholds->primary, new); 3694 3695 /* To be sure that nobody uses thresholds */ 3696 synchronize_rcu(); 3697 3698unlock: 3699 mutex_unlock(&memcg->thresholds_lock); 3700 3701 return ret; 3702} 3703 3704static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 3705 struct cftype *cft, struct eventfd_ctx *eventfd) 3706{ 3707 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3708 struct mem_cgroup_thresholds *thresholds; 3709 struct mem_cgroup_threshold_ary *new; 3710 int type = MEMFILE_TYPE(cft->private); 3711 u64 usage; 3712 int i, j, size; 3713 3714 mutex_lock(&memcg->thresholds_lock); 3715 if (type == _MEM) 3716 thresholds = &memcg->thresholds; 3717 else if (type == _MEMSWAP) 3718 thresholds = &memcg->memsw_thresholds; 3719 else 3720 BUG(); 3721 3722 /* 3723 * Something went wrong if we trying to unregister a threshold 3724 * if we don't have thresholds 3725 */ 3726 BUG_ON(!thresholds); 3727 3728 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 3729 3730 /* Check if a threshold crossed before removing */ 3731 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3732 3733 /* Calculate new number of threshold */ 3734 size = 0; 3735 for (i = 0; i < thresholds->primary->size; i++) { 3736 if (thresholds->primary->entries[i].eventfd != eventfd) 3737 size++; 3738 } 3739 3740 new = thresholds->spare; 3741 3742 /* Set thresholds array to NULL if we don't have thresholds */ 3743 if (!size) { 3744 kfree(new); 3745 new = NULL; 3746 goto swap_buffers; 3747 } 3748 3749 new->size = size; 3750 3751 /* Copy thresholds and find current threshold */ 3752 new->current_threshold = -1; 3753 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 3754 if (thresholds->primary->entries[i].eventfd == eventfd) 3755 continue; 3756 3757 new->entries[j] = thresholds->primary->entries[i]; 3758 if (new->entries[j].threshold < usage) { 3759 /* 3760 * new->current_threshold will not be used 3761 * until rcu_assign_pointer(), so it's safe to increment 3762 * it here. 3763 */ 3764 ++new->current_threshold; 3765 } 3766 j++; 3767 } 3768 3769swap_buffers: 3770 /* Swap primary and spare array */ 3771 thresholds->spare = thresholds->primary; 3772 rcu_assign_pointer(thresholds->primary, new); 3773 3774 /* To be sure that nobody uses thresholds */ 3775 synchronize_rcu(); 3776 3777 mutex_unlock(&memcg->thresholds_lock); 3778} 3779 3780static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 3781 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 3782{ 3783 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3784 struct mem_cgroup_eventfd_list *event; 3785 int type = MEMFILE_TYPE(cft->private); 3786 3787 BUG_ON(type != _OOM_TYPE); 3788 event = kmalloc(sizeof(*event), GFP_KERNEL); 3789 if (!event) 3790 return -ENOMEM; 3791 3792 mutex_lock(&memcg_oom_mutex); 3793 3794 event->eventfd = eventfd; 3795 list_add(&event->list, &memcg->oom_notify); 3796 3797 /* already in OOM ? 
*/ 3798 if (atomic_read(&memcg->oom_lock)) 3799 eventfd_signal(eventfd, 1); 3800 mutex_unlock(&memcg_oom_mutex); 3801 3802 return 0; 3803} 3804 3805static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 3806 struct cftype *cft, struct eventfd_ctx *eventfd) 3807{ 3808 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3809 struct mem_cgroup_eventfd_list *ev, *tmp; 3810 int type = MEMFILE_TYPE(cft->private); 3811 3812 BUG_ON(type != _OOM_TYPE); 3813 3814 mutex_lock(&memcg_oom_mutex); 3815 3816 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 3817 if (ev->eventfd == eventfd) { 3818 list_del(&ev->list); 3819 kfree(ev); 3820 } 3821 } 3822 3823 mutex_unlock(&memcg_oom_mutex); 3824} 3825 3826static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 3827 struct cftype *cft, struct cgroup_map_cb *cb) 3828{ 3829 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3830 3831 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 3832 3833 if (atomic_read(&mem->oom_lock)) 3834 cb->fill(cb, "under_oom", 1); 3835 else 3836 cb->fill(cb, "under_oom", 0); 3837 return 0; 3838} 3839 3840static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 3841 struct cftype *cft, u64 val) 3842{ 3843 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3844 struct mem_cgroup *parent; 3845 3846 /* cannot set to root cgroup and only 0 and 1 are allowed */ 3847 if (!cgrp->parent || !((val == 0) || (val == 1))) 3848 return -EINVAL; 3849 3850 parent = mem_cgroup_from_cont(cgrp->parent); 3851 3852 cgroup_lock(); 3853 /* oom-kill-disable is a flag for subhierarchy. */ 3854 if ((parent->use_hierarchy) || 3855 (mem->use_hierarchy && !list_empty(&cgrp->children))) { 3856 cgroup_unlock(); 3857 return -EINVAL; 3858 } 3859 mem->oom_kill_disable = val; 3860 if (!val) 3861 memcg_oom_recover(mem); 3862 cgroup_unlock(); 3863 return 0; 3864} 3865 3866static struct cftype mem_cgroup_files[] = { 3867 { 3868 .name = "usage_in_bytes", 3869 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 3870 .read_u64 = mem_cgroup_read, 3871 .register_event = mem_cgroup_usage_register_event, 3872 .unregister_event = mem_cgroup_usage_unregister_event, 3873 }, 3874 { 3875 .name = "max_usage_in_bytes", 3876 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 3877 .trigger = mem_cgroup_reset, 3878 .read_u64 = mem_cgroup_read, 3879 }, 3880 { 3881 .name = "limit_in_bytes", 3882 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 3883 .write_string = mem_cgroup_write, 3884 .read_u64 = mem_cgroup_read, 3885 }, 3886 { 3887 .name = "soft_limit_in_bytes", 3888 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 3889 .write_string = mem_cgroup_write, 3890 .read_u64 = mem_cgroup_read, 3891 }, 3892 { 3893 .name = "failcnt", 3894 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 3895 .trigger = mem_cgroup_reset, 3896 .read_u64 = mem_cgroup_read, 3897 }, 3898 { 3899 .name = "stat", 3900 .read_map = mem_control_stat_show, 3901 }, 3902 { 3903 .name = "force_empty", 3904 .trigger = mem_cgroup_force_empty_write, 3905 }, 3906 { 3907 .name = "use_hierarchy", 3908 .write_u64 = mem_cgroup_hierarchy_write, 3909 .read_u64 = mem_cgroup_hierarchy_read, 3910 }, 3911 { 3912 .name = "swappiness", 3913 .read_u64 = mem_cgroup_swappiness_read, 3914 .write_u64 = mem_cgroup_swappiness_write, 3915 }, 3916 { 3917 .name = "move_charge_at_immigrate", 3918 .read_u64 = mem_cgroup_move_charge_read, 3919 .write_u64 = mem_cgroup_move_charge_write, 3920 }, 3921 { 3922 .name = "oom_control", 3923 .read_map = mem_cgroup_oom_control_read, 3924 .write_u64 = mem_cgroup_oom_control_write, 3925 
		.register_event = mem_cgroup_oom_register_event,
		.unregister_event = mem_cgroup_oom_unregister_event,
		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
	},
};

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static struct cftype memsw_cgroup_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read,
		.register_event = mem_cgroup_usage_register_event,
		.unregister_event = mem_cgroup_usage_unregister_event,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
};

static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	if (!do_swap_account)
		return 0;
	return cgroup_add_files(cont, ss, memsw_cgroup_files,
				ARRAY_SIZE(memsw_cgroup_files));
}
#else
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	return 0;
}
#endif

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup_per_zone *mz;
	enum lru_list l;
	int zone, tmp = node;
	/*
	 * This routine is called against possible nodes, but it's a BUG to
	 * call kmalloc() against an offline node.
	 *
	 * TODO: this routine can waste a lot of memory for nodes which will
	 * never be onlined.  It would be better to use a memory hotplug
	 * callback function.
	 */
	if (!node_state(node, N_NORMAL_MEMORY))
		tmp = -1;
	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
	if (!pn)
		return 1;

	mem->info.nodeinfo[node] = pn;
	memset(pn, 0, sizeof(*pn));

	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
		for_each_lru(l)
			INIT_LIST_HEAD(&mz->lists[l]);
		mz->usage_in_excess = 0;
		mz->on_tree = false;
		mz->mem = mem;
	}
	return 0;
}

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	kfree(mem->info.nodeinfo[node]);
}

static struct mem_cgroup *mem_cgroup_alloc(void)
{
	struct mem_cgroup *mem;
	int size = sizeof(struct mem_cgroup);

	/* Can be very big if MAX_NUMNODES is very big */
	if (size < PAGE_SIZE)
		mem = kmalloc(size, GFP_KERNEL);
	else
		mem = vmalloc(size);

	if (!mem)
		return NULL;

	memset(mem, 0, size);
	mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
	if (!mem->stat) {
		if (size < PAGE_SIZE)
			kfree(mem);
		else
			vfree(mem);
		mem = NULL;
	}
	return mem;
}

/*
 * When a mem_cgroup is destroyed, references from swap_cgroup can remain
 * (scanning them all at force_empty would be too costly...).
 *
 * Instead of clearing all references at force_empty, we remember the
 * number of references from swap_cgroup and free the mem_cgroup when it
 * goes down to 0.
 *
 * Removal of the cgroup itself succeeds regardless of refs from swap.
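 *
 * mem_cgroup_get()/mem_cgroup_put() below implement that reference count:
 * each swap entry still charged to this group holds a reference, and
 * __mem_cgroup_free() only runs once the last reference is dropped.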
4048 */ 4049 4050static void __mem_cgroup_free(struct mem_cgroup *mem) 4051{ 4052 int node; 4053 4054 mem_cgroup_remove_from_trees(mem); 4055 free_css_id(&mem_cgroup_subsys, &mem->css); 4056 4057 for_each_node_state(node, N_POSSIBLE) 4058 free_mem_cgroup_per_zone_info(mem, node); 4059 4060 free_percpu(mem->stat); 4061 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4062 kfree(mem); 4063 else 4064 vfree(mem); 4065} 4066 4067static void mem_cgroup_get(struct mem_cgroup *mem) 4068{ 4069 atomic_inc(&mem->refcnt); 4070} 4071 4072static void __mem_cgroup_put(struct mem_cgroup *mem, int count) 4073{ 4074 if (atomic_sub_and_test(count, &mem->refcnt)) { 4075 struct mem_cgroup *parent = parent_mem_cgroup(mem); 4076 __mem_cgroup_free(mem); 4077 if (parent) 4078 mem_cgroup_put(parent); 4079 } 4080} 4081 4082static void mem_cgroup_put(struct mem_cgroup *mem) 4083{ 4084 __mem_cgroup_put(mem, 1); 4085} 4086 4087/* 4088 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 4089 */ 4090static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 4091{ 4092 if (!mem->res.parent) 4093 return NULL; 4094 return mem_cgroup_from_res_counter(mem->res.parent, res); 4095} 4096 4097#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4098static void __init enable_swap_cgroup(void) 4099{ 4100 if (!mem_cgroup_disabled() && really_do_swap_account) 4101 do_swap_account = 1; 4102} 4103#else 4104static void __init enable_swap_cgroup(void) 4105{ 4106} 4107#endif 4108 4109static int mem_cgroup_soft_limit_tree_init(void) 4110{ 4111 struct mem_cgroup_tree_per_node *rtpn; 4112 struct mem_cgroup_tree_per_zone *rtpz; 4113 int tmp, node, zone; 4114 4115 for_each_node_state(node, N_POSSIBLE) { 4116 tmp = node; 4117 if (!node_state(node, N_NORMAL_MEMORY)) 4118 tmp = -1; 4119 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 4120 if (!rtpn) 4121 return 1; 4122 4123 soft_limit_tree.rb_tree_per_node[node] = rtpn; 4124 4125 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4126 rtpz = &rtpn->rb_tree_per_zone[zone]; 4127 rtpz->rb_root = RB_ROOT; 4128 spin_lock_init(&rtpz->lock); 4129 } 4130 } 4131 return 0; 4132} 4133 4134static struct cgroup_subsys_state * __ref 4135mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 4136{ 4137 struct mem_cgroup *mem, *parent; 4138 long error = -ENOMEM; 4139 int node; 4140 4141 mem = mem_cgroup_alloc(); 4142 if (!mem) 4143 return ERR_PTR(error); 4144 4145 for_each_node_state(node, N_POSSIBLE) 4146 if (alloc_mem_cgroup_per_zone_info(mem, node)) 4147 goto free_out; 4148 4149 /* root ? */ 4150 if (cont->parent == NULL) { 4151 int cpu; 4152 enable_swap_cgroup(); 4153 parent = NULL; 4154 root_mem_cgroup = mem; 4155 if (mem_cgroup_soft_limit_tree_init()) 4156 goto free_out; 4157 for_each_possible_cpu(cpu) { 4158 struct memcg_stock_pcp *stock = 4159 &per_cpu(memcg_stock, cpu); 4160 INIT_WORK(&stock->work, drain_local_stock); 4161 } 4162 hotcpu_notifier(memcg_stock_cpu_callback, 0); 4163 } else { 4164 parent = mem_cgroup_from_cont(cont->parent); 4165 mem->use_hierarchy = parent->use_hierarchy; 4166 mem->oom_kill_disable = parent->oom_kill_disable; 4167 } 4168 4169 if (parent && parent->use_hierarchy) { 4170 res_counter_init(&mem->res, &parent->res); 4171 res_counter_init(&mem->memsw, &parent->memsw); 4172 /* 4173 * We increment refcnt of the parent to ensure that we can 4174 * safely access it on res_counter_charge/uncharge. 4175 * This refcnt will be decremented when freeing this 4176 * mem_cgroup(see mem_cgroup_put). 
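 * (The matching mem_cgroup_put(parent) happens in __mem_cgroup_put()
 * above, once this child's own refcount drops to zero.)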
4177 */ 4178 mem_cgroup_get(parent); 4179 } else { 4180 res_counter_init(&mem->res, NULL); 4181 res_counter_init(&mem->memsw, NULL); 4182 } 4183 mem->last_scanned_child = 0; 4184 spin_lock_init(&mem->reclaim_param_lock); 4185 INIT_LIST_HEAD(&mem->oom_notify); 4186 4187 if (parent) 4188 mem->swappiness = get_swappiness(parent); 4189 atomic_set(&mem->refcnt, 1); 4190 mem->move_charge_at_immigrate = 0; 4191 mutex_init(&mem->thresholds_lock); 4192 return &mem->css; 4193free_out: 4194 __mem_cgroup_free(mem); 4195 root_mem_cgroup = NULL; 4196 return ERR_PTR(error); 4197} 4198 4199static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 4200 struct cgroup *cont) 4201{ 4202 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4203 4204 return mem_cgroup_force_empty(mem, false); 4205} 4206 4207static void mem_cgroup_destroy(struct cgroup_subsys *ss, 4208 struct cgroup *cont) 4209{ 4210 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4211 4212 mem_cgroup_put(mem); 4213} 4214 4215static int mem_cgroup_populate(struct cgroup_subsys *ss, 4216 struct cgroup *cont) 4217{ 4218 int ret; 4219 4220 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 4221 ARRAY_SIZE(mem_cgroup_files)); 4222 4223 if (!ret) 4224 ret = register_memsw_files(cont, ss); 4225 return ret; 4226} 4227 4228#ifdef CONFIG_MMU 4229/* Handlers for move charge at task migration. */ 4230#define PRECHARGE_COUNT_AT_ONCE 256 4231static int mem_cgroup_do_precharge(unsigned long count) 4232{ 4233 int ret = 0; 4234 int batch_count = PRECHARGE_COUNT_AT_ONCE; 4235 struct mem_cgroup *mem = mc.to; 4236 4237 if (mem_cgroup_is_root(mem)) { 4238 mc.precharge += count; 4239 /* we don't need css_get for root */ 4240 return ret; 4241 } 4242 /* try to charge at once */ 4243 if (count > 1) { 4244 struct res_counter *dummy; 4245 /* 4246 * "mem" cannot be under rmdir() because we've already checked 4247 * by cgroup_lock_live_cgroup() that it is not removed and we 4248 * are still under the same cgroup_mutex. So we can postpone 4249 * css_get(). 4250 */ 4251 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) 4252 goto one_by_one; 4253 if (do_swap_account && res_counter_charge(&mem->memsw, 4254 PAGE_SIZE * count, &dummy)) { 4255 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 4256 goto one_by_one; 4257 } 4258 mc.precharge += count; 4259 return ret; 4260 } 4261one_by_one: 4262 /* fall back to one by one charge */ 4263 while (count--) { 4264 if (signal_pending(current)) { 4265 ret = -EINTR; 4266 break; 4267 } 4268 if (!batch_count--) { 4269 batch_count = PRECHARGE_COUNT_AT_ONCE; 4270 cond_resched(); 4271 } 4272 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 4273 if (ret || !mem) 4274 /* mem_cgroup_clear_mc() will do uncharge later */ 4275 return -ENOMEM; 4276 mc.precharge++; 4277 } 4278 return ret; 4279} 4280 4281/** 4282 * is_target_pte_for_mc - check a pte whether it is valid for move charge 4283 * @vma: the vma the pte to be checked belongs 4284 * @addr: the address corresponding to the pte to be checked 4285 * @ptent: the pte to be checked 4286 * @target: the pointer the target page or swap ent will be stored(can be NULL) 4287 * 4288 * Returns 4289 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 4290 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 4291 * move charge. if @target is not NULL, the page is stored in target->page 4292 * with extra refcnt got(Callers should handle it). 
4293 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 4294 * target for charge migration. if @target is not NULL, the entry is stored 4295 * in target->ent. 4296 * 4297 * Called with pte lock held. 4298 */ 4299union mc_target { 4300 struct page *page; 4301 swp_entry_t ent; 4302}; 4303 4304enum mc_target_type { 4305 MC_TARGET_NONE, /* not used */ 4306 MC_TARGET_PAGE, 4307 MC_TARGET_SWAP, 4308}; 4309 4310static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 4311 unsigned long addr, pte_t ptent) 4312{ 4313 struct page *page = vm_normal_page(vma, addr, ptent); 4314 4315 if (!page || !page_mapped(page)) 4316 return NULL; 4317 if (PageAnon(page)) { 4318 /* we don't move shared anon */ 4319 if (!move_anon() || page_mapcount(page) > 2) 4320 return NULL; 4321 } else if (!move_file()) 4322 /* we ignore mapcount for file pages */ 4323 return NULL; 4324 if (!get_page_unless_zero(page)) 4325 return NULL; 4326 4327 return page; 4328} 4329 4330static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4331 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4332{ 4333 int usage_count; 4334 struct page *page = NULL; 4335 swp_entry_t ent = pte_to_swp_entry(ptent); 4336 4337 if (!move_anon() || non_swap_entry(ent)) 4338 return NULL; 4339 usage_count = mem_cgroup_count_swap_user(ent, &page); 4340 if (usage_count > 1) { /* we don't move shared anon */ 4341 if (page) 4342 put_page(page); 4343 return NULL; 4344 } 4345 if (do_swap_account) 4346 entry->val = ent.val; 4347 4348 return page; 4349} 4350 4351static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 4352 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4353{ 4354 struct page *page = NULL; 4355 struct inode *inode; 4356 struct address_space *mapping; 4357 pgoff_t pgoff; 4358 4359 if (!vma->vm_file) /* anonymous vma */ 4360 return NULL; 4361 if (!move_file()) 4362 return NULL; 4363 4364 inode = vma->vm_file->f_path.dentry->d_inode; 4365 mapping = vma->vm_file->f_mapping; 4366 if (pte_none(ptent)) 4367 pgoff = linear_page_index(vma, addr); 4368 else /* pte_file(ptent) is true */ 4369 pgoff = pte_to_pgoff(ptent); 4370 4371 /* page is moved even if it's not RSS of this task(page-faulted). */ 4372 if (!mapping_cap_swap_backed(mapping)) { /* normal file */ 4373 page = find_get_page(mapping, pgoff); 4374 } else { /* shmem/tmpfs file. we should take account of swap too. */ 4375 swp_entry_t ent; 4376 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); 4377 if (do_swap_account) 4378 entry->val = ent.val; 4379 } 4380 4381 return page; 4382} 4383 4384static int is_target_pte_for_mc(struct vm_area_struct *vma, 4385 unsigned long addr, pte_t ptent, union mc_target *target) 4386{ 4387 struct page *page = NULL; 4388 struct page_cgroup *pc; 4389 int ret = 0; 4390 swp_entry_t ent = { .val = 0 }; 4391 4392 if (pte_present(ptent)) 4393 page = mc_handle_present_pte(vma, addr, ptent); 4394 else if (is_swap_pte(ptent)) 4395 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 4396 else if (pte_none(ptent) || pte_file(ptent)) 4397 page = mc_handle_file_pte(vma, addr, ptent, &ent); 4398 4399 if (!page && !ent.val) 4400 return 0; 4401 if (page) { 4402 pc = lookup_page_cgroup(page); 4403 /* 4404 * Do only loose check w/o page_cgroup lock. 4405 * mem_cgroup_move_account() checks the pc is valid or not under 4406 * the lock. 
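 * A stale match here is therefore harmless: mem_cgroup_move_account()
 * simply refuses the move when its locked re-check fails, and the
 * precharge is kept for the next candidate page.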
4407 */ 4408 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 4409 ret = MC_TARGET_PAGE; 4410 if (target) 4411 target->page = page; 4412 } 4413 if (!ret || !target) 4414 put_page(page); 4415 } 4416 /* There is a swap entry and a page doesn't exist or isn't charged */ 4417 if (ent.val && !ret && 4418 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { 4419 ret = MC_TARGET_SWAP; 4420 if (target) 4421 target->ent = ent; 4422 } 4423 return ret; 4424} 4425 4426static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 4427 unsigned long addr, unsigned long end, 4428 struct mm_walk *walk) 4429{ 4430 struct vm_area_struct *vma = walk->private; 4431 pte_t *pte; 4432 spinlock_t *ptl; 4433 4434 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4435 for (; addr != end; pte++, addr += PAGE_SIZE) 4436 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 4437 mc.precharge++; /* increment precharge temporarily */ 4438 pte_unmap_unlock(pte - 1, ptl); 4439 cond_resched(); 4440 4441 return 0; 4442} 4443 4444static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 4445{ 4446 unsigned long precharge; 4447 struct vm_area_struct *vma; 4448 4449 /* We've already held the mmap_sem */ 4450 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4451 struct mm_walk mem_cgroup_count_precharge_walk = { 4452 .pmd_entry = mem_cgroup_count_precharge_pte_range, 4453 .mm = mm, 4454 .private = vma, 4455 }; 4456 if (is_vm_hugetlb_page(vma)) 4457 continue; 4458 walk_page_range(vma->vm_start, vma->vm_end, 4459 &mem_cgroup_count_precharge_walk); 4460 } 4461 4462 precharge = mc.precharge; 4463 mc.precharge = 0; 4464 4465 return precharge; 4466} 4467 4468static int mem_cgroup_precharge_mc(struct mm_struct *mm) 4469{ 4470 return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); 4471} 4472 4473static void mem_cgroup_clear_mc(void) 4474{ 4475 struct mem_cgroup *from = mc.from; 4476 struct mem_cgroup *to = mc.to; 4477 4478 /* we must uncharge all the leftover precharges from mc.to */ 4479 if (mc.precharge) { 4480 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 4481 mc.precharge = 0; 4482 } 4483 /* 4484 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 4485 * we must uncharge here. 4486 */ 4487 if (mc.moved_charge) { 4488 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 4489 mc.moved_charge = 0; 4490 } 4491 /* we must fixup refcnts and charges */ 4492 if (mc.moved_swap) { 4493 /* uncharge swap account from the old cgroup */ 4494 if (!mem_cgroup_is_root(mc.from)) 4495 res_counter_uncharge(&mc.from->memsw, 4496 PAGE_SIZE * mc.moved_swap); 4497 __mem_cgroup_put(mc.from, mc.moved_swap); 4498 4499 if (!mem_cgroup_is_root(mc.to)) { 4500 /* 4501 * we charged both to->res and to->memsw, so we should 4502 * uncharge to->res. 
4503 */ 4504 res_counter_uncharge(&mc.to->res, 4505 PAGE_SIZE * mc.moved_swap); 4506 } 4507 /* we've already done mem_cgroup_get(mc.to) */ 4508 4509 mc.moved_swap = 0; 4510 } 4511 if (mc.mm) { 4512 up_read(&mc.mm->mmap_sem); 4513 mmput(mc.mm); 4514 } 4515 spin_lock(&mc.lock); 4516 mc.from = NULL; 4517 mc.to = NULL; 4518 spin_unlock(&mc.lock); 4519 mc.moving_task = NULL; 4520 mc.mm = NULL; 4521 memcg_oom_recover(from); 4522 memcg_oom_recover(to); 4523 wake_up_all(&mc.waitq); 4524} 4525 4526static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 4527 struct cgroup *cgroup, 4528 struct task_struct *p, 4529 bool threadgroup) 4530{ 4531 int ret = 0; 4532 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); 4533 4534 if (mem->move_charge_at_immigrate) { 4535 struct mm_struct *mm; 4536 struct mem_cgroup *from = mem_cgroup_from_task(p); 4537 4538 VM_BUG_ON(from == mem); 4539 4540 mm = get_task_mm(p); 4541 if (!mm) 4542 return 0; 4543 /* We move charges only when we move a owner of the mm */ 4544 if (mm->owner == p) { 4545 /* 4546 * We do all the move charge works under one mmap_sem to 4547 * avoid deadlock with down_write(&mmap_sem) 4548 * -> try_charge() -> if (mc.moving_task) -> sleep. 4549 */ 4550 down_read(&mm->mmap_sem); 4551 4552 VM_BUG_ON(mc.from); 4553 VM_BUG_ON(mc.to); 4554 VM_BUG_ON(mc.precharge); 4555 VM_BUG_ON(mc.moved_charge); 4556 VM_BUG_ON(mc.moved_swap); 4557 VM_BUG_ON(mc.moving_task); 4558 VM_BUG_ON(mc.mm); 4559 4560 spin_lock(&mc.lock); 4561 mc.from = from; 4562 mc.to = mem; 4563 mc.precharge = 0; 4564 mc.moved_charge = 0; 4565 mc.moved_swap = 0; 4566 spin_unlock(&mc.lock); 4567 mc.moving_task = current; 4568 mc.mm = mm; 4569 4570 ret = mem_cgroup_precharge_mc(mm); 4571 if (ret) 4572 mem_cgroup_clear_mc(); 4573 /* We call up_read() and mmput() in clear_mc(). */ 4574 } else 4575 mmput(mm); 4576 } 4577 return ret; 4578} 4579 4580static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 4581 struct cgroup *cgroup, 4582 struct task_struct *p, 4583 bool threadgroup) 4584{ 4585 mem_cgroup_clear_mc(); 4586} 4587 4588static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 4589 unsigned long addr, unsigned long end, 4590 struct mm_walk *walk) 4591{ 4592 int ret = 0; 4593 struct vm_area_struct *vma = walk->private; 4594 pte_t *pte; 4595 spinlock_t *ptl; 4596 4597retry: 4598 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4599 for (; addr != end; addr += PAGE_SIZE) { 4600 pte_t ptent = *(pte++); 4601 union mc_target target; 4602 int type; 4603 struct page *page; 4604 struct page_cgroup *pc; 4605 swp_entry_t ent; 4606 4607 if (!mc.precharge) 4608 break; 4609 4610 type = is_target_pte_for_mc(vma, addr, ptent, &target); 4611 switch (type) { 4612 case MC_TARGET_PAGE: 4613 page = target.page; 4614 if (isolate_lru_page(page)) 4615 goto put; 4616 pc = lookup_page_cgroup(page); 4617 if (!mem_cgroup_move_account(pc, 4618 mc.from, mc.to, false)) { 4619 mc.precharge--; 4620 /* we uncharge from mc.from later. */ 4621 mc.moved_charge++; 4622 } 4623 putback_lru_page(page); 4624put: /* is_target_pte_for_mc() gets the page */ 4625 put_page(page); 4626 break; 4627 case MC_TARGET_SWAP: 4628 ent = target.ent; 4629 if (!mem_cgroup_move_swap_account(ent, 4630 mc.from, mc.to, false)) { 4631 mc.precharge--; 4632 /* we fixup refcnts and charges later. */ 4633 mc.moved_swap++; 4634 } 4635 break; 4636 default: 4637 break; 4638 } 4639 } 4640 pte_unmap_unlock(pte - 1, ptl); 4641 cond_resched(); 4642 4643 if (addr != end) { 4644 /* 4645 * We have consumed all precharges we got in can_attach(). 
4646 * We try charge one by one, but don't do any additional 4647 * charges to mc.to if we have failed in charge once in attach() 4648 * phase. 4649 */ 4650 ret = mem_cgroup_do_precharge(1); 4651 if (!ret) 4652 goto retry; 4653 } 4654 4655 return ret; 4656} 4657 4658static void mem_cgroup_move_charge(struct mm_struct *mm) 4659{ 4660 struct vm_area_struct *vma; 4661 4662 lru_add_drain_all(); 4663 /* We've already held the mmap_sem */ 4664 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4665 int ret; 4666 struct mm_walk mem_cgroup_move_charge_walk = { 4667 .pmd_entry = mem_cgroup_move_charge_pte_range, 4668 .mm = mm, 4669 .private = vma, 4670 }; 4671 if (is_vm_hugetlb_page(vma)) 4672 continue; 4673 ret = walk_page_range(vma->vm_start, vma->vm_end, 4674 &mem_cgroup_move_charge_walk); 4675 if (ret) 4676 /* 4677 * means we have consumed all precharges and failed in 4678 * doing additional charge. Just abandon here. 4679 */ 4680 break; 4681 } 4682} 4683 4684static void mem_cgroup_move_task(struct cgroup_subsys *ss, 4685 struct cgroup *cont, 4686 struct cgroup *old_cont, 4687 struct task_struct *p, 4688 bool threadgroup) 4689{ 4690 if (!mc.mm) 4691 /* no need to move charge */ 4692 return; 4693 4694 mem_cgroup_move_charge(mc.mm); 4695 mem_cgroup_clear_mc(); 4696} 4697#else /* !CONFIG_MMU */ 4698static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 4699 struct cgroup *cgroup, 4700 struct task_struct *p, 4701 bool threadgroup) 4702{ 4703 return 0; 4704} 4705static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 4706 struct cgroup *cgroup, 4707 struct task_struct *p, 4708 bool threadgroup) 4709{ 4710} 4711static void mem_cgroup_move_task(struct cgroup_subsys *ss, 4712 struct cgroup *cont, 4713 struct cgroup *old_cont, 4714 struct task_struct *p, 4715 bool threadgroup) 4716{ 4717} 4718#endif 4719 4720struct cgroup_subsys mem_cgroup_subsys = { 4721 .name = "memory", 4722 .subsys_id = mem_cgroup_subsys_id, 4723 .create = mem_cgroup_create, 4724 .pre_destroy = mem_cgroup_pre_destroy, 4725 .destroy = mem_cgroup_destroy, 4726 .populate = mem_cgroup_populate, 4727 .can_attach = mem_cgroup_can_attach, 4728 .cancel_attach = mem_cgroup_cancel_attach, 4729 .attach = mem_cgroup_move_task, 4730 .early_init = 0, 4731 .use_id = 1, 4732}; 4733 4734#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4735 4736static int __init disable_swap_account(char *s) 4737{ 4738 really_do_swap_account = 0; 4739 return 1; 4740} 4741__setup("noswapaccount", disable_swap_account); 4742#endif 4743
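
/*
 * Illustrative userspace sketch (not part of memcontrol.c itself): how the
 * eventfd-based threshold interface implemented above is typically consumed.
 * The mount point /sys/fs/cgroup/memory, the group name "example" and the
 * 64M threshold are assumptions for the example only.  The string written to
 * cgroup.event_control has the form
 * "<event fd> <fd of memory.usage_in_bytes> <threshold in bytes>".
 */
#if 0	/* example only, never built as part of the kernel */
#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char cmd[64];
	uint64_t hits;
	int efd = eventfd(0, 0);	/* notification channel */
	int ufd = open("/sys/fs/cgroup/memory/example/memory.usage_in_bytes",
		       O_RDONLY);
	int cfd = open("/sys/fs/cgroup/memory/example/cgroup.event_control",
		       O_WRONLY);

	if (efd < 0 || ufd < 0 || cfd < 0)
		return 1;

	/* arm a notification at 64M of memory usage for this group */
	snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd, 64ULL << 20);
	if (write(cfd, cmd, strlen(cmd)) < 0)
		return 1;

	/* read() blocks until the group's usage crosses the threshold */
	if (read(efd, &hits, sizeof(hits)) == sizeof(hits))
		printf("memory.usage_in_bytes crossed the 64M threshold\n");

	close(cfd);
	close(ufd);
	close(efd);
	return 0;
}
#endif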