#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>

static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
{
	pc->flags = 0;
	pc->mem_cgroup = NULL;
	pc->page = pfn_to_page(pfn);
	INIT_LIST_HEAD(&pc->lru);
}

static unsigned long total_usage;

#if !defined(CONFIG_SPARSEMEM)

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
	pgdat->node_page_cgroup = NULL;
}

struct page_cgroup *lookup_page_cgroup(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long offset;
	struct page_cgroup *base;

	base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
	if (unlikely(!base))
		return NULL;

	offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
	return base + offset;
}

static int __init alloc_node_page_cgroup(int nid)
{
	struct page_cgroup *base, *pc;
	unsigned long table_size;
	unsigned long start_pfn, nr_pages, index;

	start_pfn = NODE_DATA(nid)->node_start_pfn;
	nr_pages = NODE_DATA(nid)->node_spanned_pages;

	if (!nr_pages)
		return 0;

	table_size = sizeof(struct page_cgroup) * nr_pages;

	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
	if (!base)
		return -ENOMEM;
	for (index = 0; index < nr_pages; index++) {
		pc = base + index;
		__init_page_cgroup(pc, start_pfn + index);
	}
	NODE_DATA(nid)->node_page_cgroup = base;
	total_usage += table_size;
	return 0;
}

void __init page_cgroup_init_flatmem(void)
{
	int nid, fail;

	if (mem_cgroup_disabled())
		return;

	for_each_online_node(nid) {
		fail = alloc_node_page_cgroup(nid);
		if (fail)
			goto fail;
	}
	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
		" don't want memory cgroups\n");
	return;
fail:
	printk(KERN_CRIT "allocation of page_cgroup failed.\n");
	printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
	panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */

struct page_cgroup *lookup_page_cgroup(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	struct mem_section *section = __pfn_to_section(pfn);

	if (!section->page_cgroup)
		return NULL;
	return section->page_cgroup + pfn;
}
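
/*
 * Illustrative sketch (assumed numbers) of the biased-pointer scheme used
 * by lookup_page_cgroup() above: init_section_page_cgroup() below stores
 * (base - start_pfn_of_section) in section->page_cgroup, so a lookup is a
 * single addition with the raw pfn:
 *
 *	base = <table for the section covering pfns 0x8000..0xffff>;
 *	section->page_cgroup = base - 0x8000;
 *	pc = section->page_cgroup + 0x8123;	(== base + 0x123)
 */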

/*
 * Called both at boot (from page_cgroup_init) and from the memory hotplug
 * callbacks below; by then the slab allocator is available.
 */
static int __init_refok init_section_page_cgroup(unsigned long pfn)
{
	struct mem_section *section = __pfn_to_section(pfn);
	struct page_cgroup *base, *pc;
	unsigned long table_size;
	int nid, index;

	if (!section->page_cgroup) {
		nid = page_to_nid(pfn_to_page(pfn));
		table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
		VM_BUG_ON(!slab_is_available());
		if (node_state(nid, N_HIGH_MEMORY)) {
			base = kmalloc_node(table_size,
					GFP_KERNEL | __GFP_NOWARN, nid);
			if (!base)
				base = vmalloc_node(table_size, nid);
		} else {
			base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
			if (!base)
				base = vmalloc(table_size);
		}
		/*
		 * The value stored in section->page_cgroup is (base - pfn)
		 * and it does not point to the memory block allocated above,
		 * causing kmemleak false positives.
		 */
		kmemleak_not_leak(base);
	} else {
		/*
		 * We don't have to allocate page_cgroup again, but the
		 * address of the memmap may have changed, so we have to
		 * initialize again.
		 */
		base = section->page_cgroup + pfn;
		table_size = 0;
		/* check whether the address of the memmap has changed */
		if (base->page == pfn_to_page(pfn))
			return 0;
	}

	if (!base) {
		printk(KERN_ERR "page cgroup allocation failure\n");
		return -ENOMEM;
	}

	for (index = 0; index < PAGES_PER_SECTION; index++) {
		pc = base + index;
		__init_page_cgroup(pc, pfn + index);
	}

	section->page_cgroup = base - pfn;
	total_usage += table_size;
	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void __free_page_cgroup(unsigned long pfn)
{
	struct mem_section *ms;
	struct page_cgroup *base;

	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_cgroup)
		return;
	base = ms->page_cgroup + pfn;
	if (is_vmalloc_addr(base)) {
		vfree(base);
		ms->page_cgroup = NULL;
	} else {
		struct page *page = virt_to_page(base);
		/* bootmem pages are marked reserved and must not be kfreed */
		if (!PageReserved(page)) {
			kfree(base);
			ms->page_cgroup = NULL;
		}
	}
}

int __meminit online_page_cgroup(unsigned long start_pfn,
			unsigned long nr_pages,
			int nid)
{
	unsigned long start, end, pfn;
	int fail = 0;

	start = start_pfn & ~(PAGES_PER_SECTION - 1);
	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);

	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
		if (!pfn_present(pfn))
			continue;
		fail = init_section_page_cgroup(pfn);
	}
	if (!fail)
		return 0;

	/* rollback */
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_cgroup(pfn);

	return -ENOMEM;
}

int __meminit offline_page_cgroup(unsigned long start_pfn,
		unsigned long nr_pages, int nid)
{
	unsigned long start, end, pfn;

	start = start_pfn & ~(PAGES_PER_SECTION - 1);
	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);

	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_cgroup(pfn);
	return 0;
}

static int __meminit page_cgroup_callback(struct notifier_block *self,
			       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;
	int ret = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		ret = online_page_cgroup(mn->start_pfn,
				   mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_OFFLINE:
		offline_page_cgroup(mn->start_pfn,
				mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_CANCEL_ONLINE:
	case MEM_GOING_OFFLINE:
		break;
	case MEM_ONLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}

	if (ret)
		ret = notifier_from_errno(ret);
	else
		ret = NOTIFY_OK;

	return ret;
}

#endif
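
/*
 * Boot-time flow, sketched under the assumption that memory cgroups are
 * enabled: page_cgroup_init() below walks every present section once and
 * then registers page_cgroup_callback() so that later hotplug events keep
 * the per-section tables in sync:
 *
 *	MEM_GOING_ONLINE -> online_page_cgroup()  (allocate and initialize)
 *	MEM_OFFLINE      -> offline_page_cgroup() (free)
 */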

void __init page_cgroup_init(void)
{
	unsigned long pfn;
	int fail = 0;

	if (mem_cgroup_disabled())
		return;

	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
		if (!pfn_present(pfn))
			continue;
		fail = init_section_page_cgroup(pfn);
	}
	if (fail) {
		printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
		panic("Out of memory");
	} else {
		hotplug_memory_notifier(page_cgroup_callback, 0);
	}
	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
		" want memory cgroups\n");
}

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
	return;
}

#endif

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

static DEFINE_MUTEX(swap_cgroup_mutex);

struct swap_cgroup_ctrl {
	struct page **map;
	unsigned long length;
	spinlock_t lock;
};

struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

struct swap_cgroup {
	unsigned short id;
};

#define SC_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))
#define SC_POS_MASK	(SC_PER_PAGE - 1)

/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, swap_cgroup entries are accessed via memcg's
 * charge/uncharge operations against SwapCache; at swap_free(), they are
 * accessed directly from the swap code.
 *
 * This means:
 *  - there is no race in "exchange" when we are accessed via SwapCache,
 *    because SwapCache (and its swp_entry) is under lock;
 *  - when called via swap_free(), there is no other user of the entry and
 *    hence no race.
 * Updates are nevertheless serialized with ctrl->lock below.
 *
 * TODO: these buffers could be pushed out to HIGHMEM.
 */

/*
 * Allocate the buffer pages for swap_cgroup.
 */
static int swap_cgroup_prepare(int type)
{
	struct page *page;
	struct swap_cgroup_ctrl *ctrl;
	unsigned long idx, max;

	ctrl = &swap_cgroup_ctrl[type];

	for (idx = 0; idx < ctrl->length; idx++) {
		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!page)
			goto not_enough_page;
		ctrl->map[idx] = page;
	}
	return 0;
not_enough_page:
	max = idx;
	for (idx = 0; idx < max; idx++)
		__free_page(ctrl->map[idx]);

	return -ENOMEM;
}

/**
 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 * @ent: swap entry to be cmpxchged
 * @old: old id
 * @new: new id
 *
 * Returns the old id on success, 0 on failure.
 * (No mem_cgroup uses 0 as its id.)
 */
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
					unsigned short old, unsigned short new)
{
	int type = swp_type(ent);
	unsigned long offset = swp_offset(ent);
	unsigned long idx = offset / SC_PER_PAGE;
	unsigned long pos = offset & SC_POS_MASK;
	struct swap_cgroup_ctrl *ctrl;
	struct page *mappage;
	struct swap_cgroup *sc;
	unsigned long flags;
	unsigned short retval;

	ctrl = &swap_cgroup_ctrl[type];

	mappage = ctrl->map[idx];
	sc = page_address(mappage);
	sc += pos;
	spin_lock_irqsave(&ctrl->lock, flags);
	retval = sc->id;
	if (retval == old)
		sc->id = new;
	else
		retval = 0;
	spin_unlock_irqrestore(&ctrl->lock, flags);
	return retval;
}
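
/*
 * Worked example (assumed numbers) of the index arithmetic shared by the
 * accessors in this file: with 4096-byte pages and a 2-byte struct
 * swap_cgroup, SC_PER_PAGE is 2048, so a swap offset of 5000 maps to
 *
 *	idx = 5000 / 2048 = 2;		third page in ctrl->map[]
 *	pos = 5000 & 2047 = 904;	entry within that page
 *	sc  = (struct swap_cgroup *)page_address(ctrl->map[2]) + 904;
 */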

/**
 * swap_cgroup_record - record mem_cgroup for this swp_entry.
 * @ent: swap entry to be recorded into
 * @id: mem_cgroup's css ID to be recorded
 *
 * Returns the previously recorded id.
 * (Of course, the old value can be 0.)
 */
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
{
	int type = swp_type(ent);
	unsigned long offset = swp_offset(ent);
	unsigned long idx = offset / SC_PER_PAGE;
	unsigned long pos = offset & SC_POS_MASK;
	struct swap_cgroup_ctrl *ctrl;
	struct page *mappage;
	struct swap_cgroup *sc;
	unsigned short old;
	unsigned long flags;

	ctrl = &swap_cgroup_ctrl[type];

	mappage = ctrl->map[idx];
	sc = page_address(mappage);
	sc += pos;
	spin_lock_irqsave(&ctrl->lock, flags);
	old = sc->id;
	sc->id = id;
	spin_unlock_irqrestore(&ctrl->lock, flags);

	return old;
}

/**
 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
 * @ent: swap entry to be looked up.
 *
 * Returns the CSS ID of the recorded mem_cgroup, or 0 if none.
 * (0 is an invalid ID.)
 */
unsigned short lookup_swap_cgroup(swp_entry_t ent)
{
	int type = swp_type(ent);
	unsigned long offset = swp_offset(ent);
	unsigned long idx = offset / SC_PER_PAGE;
	unsigned long pos = offset & SC_POS_MASK;
	struct swap_cgroup_ctrl *ctrl;
	struct page *mappage;
	struct swap_cgroup *sc;
	unsigned short ret;

	ctrl = &swap_cgroup_ctrl[type];
	mappage = ctrl->map[idx];
	sc = page_address(mappage);
	sc += pos;
	ret = sc->id;
	return ret;
}

int swap_cgroup_swapon(int type, unsigned long max_pages)
{
	void *array;
	unsigned long array_size;
	unsigned long length;
	struct swap_cgroup_ctrl *ctrl;

	if (!do_swap_account)
		return 0;

	length = ((max_pages/SC_PER_PAGE) + 1);
	array_size = length * sizeof(void *);

	array = vmalloc(array_size);
	if (!array)
		goto nomem;

	memset(array, 0, array_size);
	ctrl = &swap_cgroup_ctrl[type];
	mutex_lock(&swap_cgroup_mutex);
	ctrl->length = length;
	ctrl->map = array;
	spin_lock_init(&ctrl->lock);
	if (swap_cgroup_prepare(type)) {
		/* memory shortage */
		ctrl->map = NULL;
		ctrl->length = 0;
		vfree(array);
		mutex_unlock(&swap_cgroup_mutex);
		goto nomem;
	}
	mutex_unlock(&swap_cgroup_mutex);

	return 0;
nomem:
	printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
	printk(KERN_INFO
		"swap_cgroup can be disabled by noswapaccount boot option\n");
	return -ENOMEM;
}

void swap_cgroup_swapoff(int type)
{
	int i;
	struct swap_cgroup_ctrl *ctrl;

	if (!do_swap_account)
		return;

	mutex_lock(&swap_cgroup_mutex);
	ctrl = &swap_cgroup_ctrl[type];
	if (ctrl->map) {
		for (i = 0; i < ctrl->length; i++) {
			struct page *page = ctrl->map[i];
			if (page)
				__free_page(page);
		}
		vfree(ctrl->map);
		ctrl->map = NULL;
		ctrl->length = 0;
	}
	mutex_unlock(&swap_cgroup_mutex);
}

#endif
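
/*
 * Usage sketch for the swap_cgroup API above (hypothetical call sites;
 * 'type', 'ent' and 'id' stand for a valid swap type, swap entry and
 * mem_cgroup css ID):
 *
 *	swap_cgroup_swapon(type, max_pages);	at swapon: allocate the map
 *	old = swap_cgroup_record(ent, id);	at charge: record the owner
 *	id = lookup_swap_cgroup(ent);		at swapin/free: read it back
 *	swap_cgroup_swapoff(type);		at swapoff: free the map
 */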