/*
 * kernel/cpuset.c
 *
 * Processor and Memory placement constraints for sets of tasks.
 *
 * Copyright (C) 2003 BULL SA.
 * Copyright (C) 2004-2007 Silicon Graphics, Inc.
 * Copyright (C) 2006 Google, Inc
 *
 * Portions derived from Patrick Mochel's sysfs code.
 * sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 * 2003-10-10 Written by Simon Derr.
 * 2003-10-22 Updates by Stephen Hemminger.
 * 2004 May-July Rework by Paul Jackson.
 * 2006 Rework by Paul Menage to use generic cgroups
 * 2008 Rework of the scheduler domains and CPU hotplug handling
 *      by Max Krasnyansky
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file COPYING in the main directory of the Linux
 * distribution for more details.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>

#include <asm/uaccess.h>
#include <asm/atomic.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>

/*
 * Workqueue for cpuset related tasks.
 *
 * Using kevent workqueue may cause deadlock when memory_migrate
 * is set. So we create a separate workqueue thread for cpuset.
 */
static struct workqueue_struct *cpuset_wq;

/*
 * Tracks how many cpusets are currently defined in the system.
 * When there is only one cpuset (the root cpuset) we can
 * short circuit some hooks.
 */
int number_of_cpusets __read_mostly;

/* Forward declare cgroup structures */
struct cgroup_subsys cpuset_subsys;
struct cpuset;

/* See "Frequency meter" comments, below. */

struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */
	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */

	struct cpuset *parent;		/* my parent */

	struct fmeter fmeter;		/* memory_pressure filter */

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* used for walking a cpuset hierarchy */
	struct list_head stack_list;
};

/* Retrieve the cpuset for a cgroup */
static inline struct cpuset *cgroup_cs(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
			    struct cpuset, css);
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return container_of(task_subsys_state(task, cpuset_subsys_id),
			    struct cpuset, css);
}

/* bits in struct cpuset flags field */
typedef enum {
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
};

/*
 * There are two global mutexes guarding cpuset structures. The first
 * is the main control groups cgroup_mutex, accessed via
 * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific
 * callback_mutex, below. They can nest. It is ok to first take
 * cgroup_mutex, then nest callback_mutex. We also require taking
 * task_lock() when dereferencing a task's cpuset pointer. See "The
 * task_lock() exception", at the end of this comment.
 *
 * A task must hold both mutexes to modify cpusets. If a task
 * holds cgroup_mutex, then it blocks others wanting that mutex,
 * ensuring that it is the only task able to also acquire callback_mutex
 * and be able to modify cpusets. It can perform various checks on
 * the cpuset structure first, knowing nothing will change. It can
 * also allocate memory while just holding cgroup_mutex. While it is
 * performing these checks, various callback routines can briefly
 * acquire callback_mutex to query cpusets. Once it is ready to make
 * the changes, it takes callback_mutex, blocking everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_mutex, as that would risk double tripping on callback_mutex
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_mutex, then it has read-only
 * access to cpusets.
 *
 * The task_struct fields mems_allowed and mempolicy may be changed
 * by another task, so we use alloc_lock in the task_struct to protect
 * them.
 *
 * The cpuset_common_file_read() handlers only hold callback_mutex across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */

static DEFINE_MUTEX(callback_mutex);
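
/*
 * Illustrative nesting for a typical cpuset update (a sketch of what
 * e.g. update_cpumask() below ends up doing, not an additional rule):
 *
 *	cgroup_lock();			(via cgroup_lock_live_group())
 *	... validate and allocate ...
 *	mutex_lock(&callback_mutex);
 *	... publish the new masks/flags ...
 *	mutex_unlock(&callback_mutex);
 *	cgroup_unlock();
 */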

/*
 * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
 * buffers. They are statically allocated to prevent using excess stack
 * when calling cpuset_print_task_mems_allowed().
 */
#define CPUSET_NAME_LEN		(128)
#define CPUSET_NODELIST_LEN	(256)
static char cpuset_name[CPUSET_NAME_LEN];
static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock);

/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead
 */
static int cpuset_get_sb(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data, struct vfsmount *mnt)
{
	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
	int ret = -ENODEV;
	if (cgroup_fs) {
		char mountopts[] =
			"cpuset,noprefix,"
			"release_agent=/sbin/cpuset_release_agent";
		ret = cgroup_fs->get_sb(cgroup_fs, flags,
					unused_dev_name, mountopts, mnt);
		put_filesystem(cgroup_fs);
	}
	return ret;
}

static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.get_sb = cpuset_get_sb,
};
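
/*
 * For illustration: with the hook above,
 *	mount -t cpuset cpuset /dev/cpuset
 * is effectively equivalent to
 *	mount -t cgroup -o cpuset,noprefix,\
 *		release_agent=/sbin/cpuset_release_agent cpuset /dev/cpuset
 */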

/*
 * Return in pmask the portion of a cpuset's cpus_allowed that
 * are online. If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus. If we get
 * all the way to the top and still haven't found any online cpus,
 * return cpu_online_map. Or if passed a NULL cs from an exit'ing
 * task, return cpu_online_map.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_map.
 *
 * Call with callback_mutex held.
 */

static void guarantee_online_cpus(const struct cpuset *cs,
				  struct cpumask *pmask)
{
	while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
		cs = cs->parent;
	if (cs)
		cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
	else
		cpumask_copy(pmask, cpu_online_mask);
	BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory. If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems. If we get all the way to the top and still haven't
 * found any online mems, return node_states[N_HIGH_MEMORY].
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_HIGH_MEMORY].
 *
 * Call with callback_mutex held.
 */

static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
	while (cs && !nodes_intersects(cs->mems_allowed,
					node_states[N_HIGH_MEMORY]))
		cs = cs->parent;
	if (cs)
		nodes_and(*pmask, cs->mems_allowed,
				node_states[N_HIGH_MEMORY]);
	else
		*pmask = node_states[N_HIGH_MEMORY];
	BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
}

/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Called with callback_mutex/cgroup_mutex held
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (is_spread_page(cs))
		tsk->flags |= PF_SPREAD_PAGE;
	else
		tsk->flags &= ~PF_SPREAD_PAGE;
	if (is_spread_slab(cs))
		tsk->flags |= PF_SPREAD_SLAB;
	else
		tsk->flags &= ~PF_SPREAD_SLAB;
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set. Call holding cgroup_mutex.
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
		kfree(trial);
		return NULL;
	}
	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);

	return trial;
}

/**
 * free_trial_cpuset - free the trial cpuset
 * @trial: the trial cpuset to be freed
 */
static void free_trial_cpuset(struct cpuset *trial)
{
	free_cpumask_var(trial->cpus_allowed);
	kfree(trial);
}

/*
 * validate_change() - Used to validate that any proposed cpuset change
 * follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid? Presumes
 * cgroup_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset. Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */

static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
{
	struct cgroup *cont;
	struct cpuset *c, *par;

	/* Each of our child cpusets must be a subset of us */
	list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
		if (!is_cpuset_subset(cgroup_cs(cont), trial))
			return -EBUSY;
	}

	/* Remaining checks don't apply to root cpuset */
	if (cur == &top_cpuset)
		return 0;

	par = cur->parent;

	/* We must be a subset of our parent cpuset */
	if (!is_cpuset_subset(trial, par))
		return -EACCES;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
		c = cgroup_cs(cont);
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			return -EINVAL;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			return -EINVAL;
	}

	/* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
	if (cgroup_task_count(cur->css.cgroup)) {
		if (cpumask_empty(trialcs->cpus_allowed) ||
		    nodes_empty(trial->mems_allowed)) {
			return -ENOSPC;
		}
	}

	return 0;
}

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

static void
update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
{
	LIST_HEAD(q);

	list_add(&c->stack_list, &q);
	while (!list_empty(&q)) {
		struct cpuset *cp;
		struct cgroup *cont;
		struct cpuset *child;

		cp = list_first_entry(&q, struct cpuset, stack_list);
		list_del(q.next);

		if (cpumask_empty(cp->cpus_allowed))
			continue;

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);

		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &q);
		}
	}
}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to kernel/sched.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cgroup_lock held.
 *
 * The three key local variables below are:
 *    q  - a linked-list queue of cpuset pointers, used to implement a
 *	   top-down scan of all cpusets. This scan loads a pointer
 *	   to each cpuset marked is_sched_load_balance into the
 *	   array 'csa'. For our purposes, rebuilding the schedulers
 *	   sched domains, we can ignore !is_sched_load_balance cpusets.
 *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *	   that need to be load balanced, for convenient iterative
 *	   access by the subsequent code that finds the best partition,
 *	   i.e. the set of domains (subsets) of CPUs such that the
 *	   cpus_allowed of every cpuset marked is_sched_load_balance
 *	   is a subset of one of these domains, while there are as
 *	   many such domains as possible, each as small as possible.
 * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 *	   the kernel/sched.c routine partition_sched_domains() in a
 *	   convenient format, that can be easily compared to the prior
 *	   value to determine what partition elements (sched domains)
 *	   were changed (added or removed).
 *
 * Finding the best partition (set of domains):
 *	The triple nested loops below over i, j, k scan over the
 *	load balanced cpusets (using the array of cpuset pointers in
 *	csa[]) looking for pairs of cpusets that have overlapping
 *	cpus_allowed, but which don't have the same 'pn' partition
 *	number, and merges them into the same partition number. It
 *	keeps looping on the 'restart' label until it can no longer
 *	find any such pairs.
 *
 *	The union of the cpus_allowed masks from the set of
 *	all cpusets having the same 'pn' value then form the one
 *	element of the partition (one sched domain) to be passed to
 *	partition_sched_domains().
 */
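/*
 * Illustrative (hypothetical) example: three load-balanced cpusets with
 * cpus_allowed A = {0-3}, B = {2-5} and C = {8-11}. A and B overlap, so
 * the loops below merge them into one partition; C stays on its own.
 * The result is ndoms == 2 with doms[] = { {0-5}, {8-11} }.
 */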
static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	LIST_HEAD(q);		/* queue of cpusets to be scanned */
	struct cpuset *cp;	/* scans q */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
	if (is_sched_load_balance(&top_cpuset)) {
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_copy(doms[0], top_cpuset.cpus_allowed);

		goto done;
	}

	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	list_add(&top_cpuset.stack_list, &q);
	while (!list_empty(&q)) {
		struct cgroup *cont;
		struct cpuset *child;	/* scans child cpusets of cp */

		cp = list_first_entry(&q, struct cpuset, stack_list);
		list_del(q.next);

		if (cpumask_empty(cp->cpus_allowed))
			continue;

		/*
		 * All child cpusets contain a subset of the parent's cpus, so
		 * just skip them, and then we call update_domain_attr_tree()
		 * to calc relax_domain_level of the corresponding sched
		 * domain.
		 */
		if (is_sched_load_balance(cp)) {
			csa[csn++] = cp;
			continue;
		}

		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &q);
		}
	}

	for (i = 0; i < csn; i++)
		csa[i]->pn = i;
	ndoms = csn;

restart:
	/* Find the best partition (set of sched domains) */
	for (i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		int apn = a->pn;

		for (j = 0; j < csn; j++) {
			struct cpuset *b = csa[j];
			int bpn = b->pn;

			if (apn != bpn && cpusets_overlap(a, b)) {
				for (k = 0; k < csn; k++) {
					struct cpuset *c = csa[k];

					if (c->pn == bpn)
						c->pn = apn;
				}
				ndoms--;	/* one less element */
				goto restart;
			}
		}
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);

	for (nslot = 0, i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		struct cpumask *dp;
		int apn = a->pn;

		if (apn < 0) {
			/* Skip completed partitions */
			continue;
		}

		dp = doms[nslot];

		if (nslot == ndoms) {
			static int warnings = 10;
			if (warnings) {
				printk(KERN_WARNING
					"rebuild_sched_domains confused:"
					" nslot %d, ndoms %d, csn %d, i %d,"
					" apn %d\n",
					nslot, ndoms, csn, i, apn);
				warnings--;
			}
			continue;
		}

		cpumask_clear(dp);
		if (dattr)
			*(dattr + nslot) = SD_ATTR_INIT;
		for (j = i; j < csn; j++) {
			struct cpuset *b = csa[j];

			if (apn == b->pn) {
				cpumask_or(dp, dp, b->cpus_allowed);
				if (dattr)
					update_domain_attr_tree(dattr + nslot, b);

				/* Done with this partition */
				b->pn = -1;
			}
		}
		nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}

/*
 * Rebuild scheduler domains.
 *
 * Call with neither cgroup_mutex held nor within get_online_cpus().
 * Takes both cgroup_mutex and get_online_cpus().
 *
 * Cannot be directly called from cpuset code handling changes
 * to the cpuset pseudo-filesystem, because it cannot be called
 * from code that already holds cgroup_mutex.
 */
static void do_rebuild_sched_domains(struct work_struct *unused)
{
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	int ndoms;

	get_online_cpus();

	/* Generate domain masks and attrs */
	cgroup_lock();
	ndoms = generate_sched_domains(&doms, &attr);
	cgroup_unlock();

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);

	put_online_cpus();
}
#else /* !CONFIG_SMP */
static void do_rebuild_sched_domains(struct work_struct *unused)
{
}

static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	*domains = NULL;
	return 1;
}
#endif /* CONFIG_SMP */

static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);

/*
 * Rebuild scheduler domains, asynchronously via workqueue.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * The rebuild_sched_domains() and partition_sched_domains()
 * routines must nest cgroup_lock() inside get_online_cpus(),
 * but the cpuset code handling these user changes must nest
 * that locking the other way, holding cgroup_lock() for much
 * of the code.
 *
 * So in order to avoid an ABBA deadlock, the cpuset code handling
 * these user changes delegates the actual sched domain rebuilding
 * to a separate workqueue thread, which ends up processing the
 * above do_rebuild_sched_domains() function.
 */
static void async_rebuild_sched_domains(void)
{
	queue_work(cpuset_wq, &rebuild_sched_domains_work);
}

/*
 * Accomplishes the same scheduler domain rebuild as the above
 * async_rebuild_sched_domains(), however it directly calls the
 * rebuild routine synchronously rather than calling it via an
 * asynchronous work thread.
 *
 * This can only be called from code that is not holding
 * cgroup_mutex (not nested in a cgroup_lock() call.)
 */
void rebuild_sched_domains(void)
{
	do_rebuild_sched_domains(NULL);
}

/**
 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
 * @tsk: task to test
 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
 *
 * Call with cgroup_mutex held. May take callback_mutex during call.
 * Called for each task in a cgroup by cgroup_scan_tasks().
 * Return nonzero if this task's cpus_allowed mask should be changed (in other
 * words, if its mask is not equal to its cpuset's mask).
 */
static int cpuset_test_cpumask(struct task_struct *tsk,
			       struct cgroup_scanner *scan)
{
	return !cpumask_equal(&tsk->cpus_allowed,
			(cgroup_cs(scan->cg))->cpus_allowed);
}

/**
 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
 * @tsk: task to test
 * @scan: struct cgroup_scanner containing the cgroup of the task
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup whose
 * cpus_allowed mask needs to be changed.
 *
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * holding cgroup_lock() at this point.
 */
static void cpuset_change_cpumask(struct task_struct *tsk,
				  struct cgroup_scanner *scan)
{
	set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
}

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cgroup_mutex held
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 *
 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
 * if @heap != NULL.
 */
static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
{
	struct cgroup_scanner scan;

	scan.cg = cs->css.cgroup;
	scan.test_task = cpuset_test_cpumask;
	scan.process_task = cpuset_change_cpumask;
	scan.heap = heap;
	cgroup_scan_tasks(&scan);
}

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @buf: buffer of cpu numbers written to this cpuset
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			  const char *buf)
{
	struct ptr_heap heap;
	int retval;
	int is_load_balanced;

	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	/*
	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
	 * Since cpulist_parse() fails on an empty mask, we special case
	 * that parsing. The validate_change() call ensures that cpusets
	 * with tasks have cpus.
	 */
	if (!*buf) {
		cpumask_clear(trialcs->cpus_allowed);
	} else {
		retval = cpulist_parse(buf, trialcs->cpus_allowed);
		if (retval < 0)
			return retval;

		if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
			return -EINVAL;
	}
	retval = validate_change(cs, trialcs);
	if (retval < 0)
		return retval;

	/* Nothing to do if the cpus didn't change */
	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
		return 0;

	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval)
		return retval;

	is_load_balanced = is_sched_load_balance(trialcs);

	mutex_lock(&callback_mutex);
	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
	mutex_unlock(&callback_mutex);

	/*
	 * Scan tasks in the cpuset, and update the cpumasks of any
	 * that need an update.
	 */
	update_tasks_cpumask(cs, &heap);

	heap_free(&heap);

	if (is_load_balanced)
		async_rebuild_sched_domains();
	return 0;
}

/*
 * cpuset_migrate_mm
 *
 * Migrate memory region from one set of nodes to another.
 *
 * Temporarily set the task's mems_allowed to the target nodes of the
 * migration, so that the migration code can allocate pages on these nodes.
 *
 * Call holding cgroup_mutex, so current's cpuset won't change
 * during this call, as manage_mutex holds off any cpuset_attach()
 * calls. Therefore we don't need to take task_lock around the
 * call to guarantee_online_mems(), as we know no one is changing
 * our task's cpuset.
 *
 * While the mm_struct we are migrating is typically from some
 * other task, the task_struct mems_allowed that we are hacking
 * is for our current task, which must allocate new pages for that
 * migrating memory region.
 */

static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct task_struct *tsk = current;

	tsk->mems_allowed = *to;

	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
}

/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
 * we structure updates as setting all new allowed nodes, then clearing newly
 * disallowed ones.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
repeat:
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return;
	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return;

	task_lock(tsk);
	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);

	/*
	 * Ensure that ->mems_allowed_change_disable is checked after setting
	 * all the new allowed nodes.
	 *
	 * The read-side task can see a nodemask containing both the new and
	 * the old allowed nodes; if it allocates a page while the cpuset is
	 * clearing the newly disallowed ones, it still sees the new allowed
	 * bits.
	 *
	 * If instead we set the new allowed nodes after the check, setting
	 * the new nodes and clearing the newly disallowed ones would both
	 * happen after the read-side check, and the read-side task might
	 * find no node from which to allocate a page.
	 */
	smp_mb();

	/*
	 * Memory allocation is very fast, so we needn't sleep while waiting
	 * for the read-side.
	 */
	while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
		task_unlock(tsk);
		if (!task_curr(tsk))
			yield();
		goto repeat;
	}

	/*
	 * Ensure that ->mems_allowed_change_disable is checked before
	 * clearing the newly disallowed nodes.
	 *
	 * If we cleared the newly disallowed bits before the check, the
	 * read-side task might find no node from which to allocate a page.
	 */
	smp_mb();

	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
	tsk->mems_allowed = *newmems;
	task_unlock(tsk);
}

/*
 * Update a task's mems_allowed, rebind its mempolicy and its vmas'
 * mempolicies to the cpuset's new mems_allowed, and migrate its pages to
 * the new nodes if the memory_migrate flag is set. Called with
 * cgroup_mutex held.
 */
static void cpuset_change_nodemask(struct task_struct *p,
				   struct cgroup_scanner *scan)
{
	struct mm_struct *mm;
	struct cpuset *cs;
	int migrate;
	const nodemask_t *oldmem = scan->data;
	NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);

	if (!newmems)
		return;

	cs = cgroup_cs(scan->cg);
	guarantee_online_mems(cs, newmems);

	cpuset_change_task_nodemask(p, newmems);

	NODEMASK_FREE(newmems);

	mm = get_task_mm(p);
	if (!mm)
		return;

	migrate = is_memory_migrate(cs);

	mpol_rebind_mm(mm, &cs->mems_allowed);
	if (migrate)
		cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
	mmput(mm);
}

static void *cpuset_being_rebound;

/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 * @oldmem: old mems_allowed of cpuset cs
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cgroup_mutex held
 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
 * if @heap != NULL.
 */
static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
				 struct ptr_heap *heap)
{
	struct cgroup_scanner scan;

	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */

	scan.cg = cs->css.cgroup;
	scan.test_task = NULL;
	scan.process_task = cpuset_change_nodemask;
	scan.heap = heap;
	scan.data = (nodemask_t *)oldmem;

	/*
	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
	 * take while holding tasklist_lock. Forks can happen - the
	 * mpol_dup() cpuset_being_rebound check will catch such forks,
	 * and rebind their vma mempolicies too. Because we still hold
	 * the global cgroup_mutex, we know that no other rebind effort
	 * will be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent. Also migrate pages in each mm to new nodes.
	 */
	cgroup_scan_tasks(&scan);

	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
	cpuset_being_rebound = NULL;
}

/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset. Needs to validate the request, update the
 * cpusets mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind task's mempolicy and any vma
 * mempolicies and if the cpuset is marked 'memory_migrate',
 * migrate the tasks pages to the new memory.
 *
 * Call with cgroup_mutex held. May take callback_mutex during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpusets new mems_allowed.
 */
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
			   const char *buf)
{
	NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
	int retval;
	struct ptr_heap heap;

	if (!oldmem)
		return -ENOMEM;

	/*
	 * top_cpuset.mems_allowed tracks node_states[N_HIGH_MEMORY];
	 * it's read-only
	 */
	if (cs == &top_cpuset) {
		retval = -EACCES;
		goto done;
	}

	/*
	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
	 * Since nodelist_parse() fails on an empty mask, we special case
	 * that parsing. The validate_change() call ensures that cpusets
	 * with tasks have memory.
	 */
	if (!*buf) {
		nodes_clear(trialcs->mems_allowed);
	} else {
		retval = nodelist_parse(buf, trialcs->mems_allowed);
		if (retval < 0)
			goto done;

		if (!nodes_subset(trialcs->mems_allowed,
				node_states[N_HIGH_MEMORY])) {
			retval = -EINVAL;
			goto done;
		}
	}
	*oldmem = cs->mems_allowed;
	if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
	retval = validate_change(cs, trialcs);
	if (retval < 0)
		goto done;

	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval < 0)
		goto done;

	mutex_lock(&callback_mutex);
	cs->mems_allowed = trialcs->mems_allowed;
	mutex_unlock(&callback_mutex);

	update_tasks_nodemask(cs, oldmem, &heap);

	heap_free(&heap);
done:
	NODEMASK_FREE(oldmem);
	return retval;
}

int current_cpuset_is_being_rebound(void)
{
	return task_cs(current) == cpuset_being_rebound;
}

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val >= SD_LV_MAX)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			async_rebuild_sched_domains();
	}

	return 0;
}

/*
 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
 * @tsk: task to be updated
 * @scan: struct cgroup_scanner containing the cgroup of the task
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup.
 *
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * holding cgroup_lock() at this point.
 */
static void cpuset_change_flag(struct task_struct *tsk,
				struct cgroup_scanner *scan)
{
	cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
}

/*
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cgroup_mutex held
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 *
 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
 * if @heap != NULL.
 */
static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
{
	struct cgroup_scanner scan;

	scan.cg = cs->css.cgroup;
	scan.test_task = NULL;
	scan.process_task = cpuset_change_flag;
	scan.heap = heap;
	cgroup_scan_tasks(&scan);
}

/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on: 	whether the flag is being set or cleared
 *
 * Call with cgroup_mutex held.
 */

static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on)
{
	struct cpuset *trialcs;
	int balance_flag_changed;
	int spread_flag_changed;
	struct ptr_heap heap;
	int err;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs)
		return -ENOMEM;

	if (turning_on)
		set_bit(bit, &trialcs->flags);
	else
		clear_bit(bit, &trialcs->flags);

	err = validate_change(cs, trialcs);
	if (err < 0)
		goto out;

	err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (err < 0)
		goto out;

	balance_flag_changed = (is_sched_load_balance(cs) !=
				is_sched_load_balance(trialcs));

	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
			|| (is_spread_page(cs) != is_spread_page(trialcs)));

	mutex_lock(&callback_mutex);
	cs->flags = trialcs->flags;
	mutex_unlock(&callback_mutex);

	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
		async_rebuild_sched_domains();

	if (spread_flag_changed)
		update_tasks_flags(cs, &heap);
	heap_free(&heap);
out:
	free_trial_cpuset(trialcs);
	return err;
}

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter. There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR). The time unit
 * is 1 second. Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter. If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds. At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000. At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event. At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */
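
/*
 * Worked example of the decay arithmetic: each elapsed second scales
 * val by FM_COEF/FM_SCALE = 933/1000, and 0.933^10 is roughly 0.5,
 * which is where the 10 second half-life quoted above comes from.
 */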

#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */

/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time_t now = get_seconds();
	time_t ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}

/* Protected by cgroup_lock */
static cpumask_var_t cpus_attach;

/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
			     struct task_struct *tsk, bool threadgroup)
{
	int ret;
	struct cpuset *cs = cgroup_cs(cont);

	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
		return -ENOSPC;

	/*
	 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
	 * cannot change their cpu affinity and isolating such threads by their
	 * set of allowed nodes is unnecessary. Thus, cpusets are not
	 * applicable for such threads. This prevents checking for success of
	 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
	 * be changed.
	 */
	if (tsk->flags & PF_THREAD_BOUND)
		return -EINVAL;

	ret = security_task_setscheduler(tsk, 0, NULL);
	if (ret)
		return ret;
	if (threadgroup) {
		struct task_struct *c;

		rcu_read_lock();
		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
			ret = security_task_setscheduler(c, 0, NULL);
			if (ret) {
				rcu_read_unlock();
				return ret;
			}
		}
		rcu_read_unlock();
	}
	return 0;
}

static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
			       struct cpuset *cs)
{
	int err;
	/*
	 * can_attach beforehand should guarantee that this doesn't fail.
	 * TODO: have a better way to handle failure here
	 */
	err = set_cpus_allowed_ptr(tsk, cpus_attach);
	WARN_ON_ONCE(err);

	cpuset_change_task_nodemask(tsk, to);
	cpuset_update_task_spread_flag(cs, tsk);
}

static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
			  struct cgroup *oldcont, struct task_struct *tsk,
			  bool threadgroup)
{
	struct mm_struct *mm;
	struct cpuset *cs = cgroup_cs(cont);
	struct cpuset *oldcs = cgroup_cs(oldcont);
	NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
	NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);

	if (from == NULL || to == NULL)
		goto alloc_fail;

	if (cs == &top_cpuset) {
		cpumask_copy(cpus_attach, cpu_possible_mask);
	} else {
		guarantee_online_cpus(cs, cpus_attach);
	}
	guarantee_online_mems(cs, to);

	/* do per-task migration stuff possibly for each in the threadgroup */
	cpuset_attach_task(tsk, to, cs);
	if (threadgroup) {
		struct task_struct *c;
		rcu_read_lock();
		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
			cpuset_attach_task(c, to, cs);
		}
		rcu_read_unlock();
	}

	/* change mm; only needs to be done once even if threadgroup */
	*from = oldcs->mems_allowed;
	*to = cs->mems_allowed;
	mm = get_task_mm(tsk);
	if (mm) {
		mpol_rebind_mm(mm, to);
		if (is_memory_migrate(cs))
			cpuset_migrate_mm(mm, from, to);
		mmput(mm);
	}

alloc_fail:
	NODEMASK_FREE(from);
	NODEMASK_FREE(to);
}

/* The various types of files and directories in a cpuset file system */

typedef enum {
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,
	FILE_MEMLIST,
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_MEM_HARDWALL,
	FILE_SCHED_LOAD_BALANCE,
	FILE_SCHED_RELAX_DOMAIN_LEVEL,
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
} cpuset_filetype_t;

static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	cpuset_filetype_t type = cft->private;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		retval = update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_MEMORY_PRESSURE:
		retval = -EACCES;
		break;
	case FILE_SPREAD_PAGE:
		retval = update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		retval = update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
	cgroup_unlock();
	return retval;
}

static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	cpuset_filetype_t type = cft->private;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		retval = update_relax_domain_level(cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
	cgroup_unlock();
	return retval;
}

/*
 * Common handling for a write to a "cpus" or "mems" file.
 */
static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
				const char *buf)
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	struct cpuset *trialcs;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs)
		return -ENOMEM;

	switch (cft->private) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, trialcs, buf);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, trialcs, buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}

	free_trial_cpuset(trialcs);
	cgroup_unlock();
	return retval;
}

/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map. If read in smaller
 * chunks, there is no guarantee of atomicity. Since the display format
 * used, list of ranges of sequential numbers, is variable length,
 * and since these maps can change value dynamically, one could read
 * gibberish by doing partial reads while a list was changing.
 * A single large read to a buffer that crosses a page boundary is
 * ok, because the result being copied to user land is not recomputed
 * across a page fault.
 */

static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
{
	int ret;

	mutex_lock(&callback_mutex);
	ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
	mutex_unlock(&callback_mutex);

	return ret;
}

static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
	NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
	int retval;

	if (mask == NULL)
		return -ENOMEM;

	mutex_lock(&callback_mutex);
	*mask = cs->mems_allowed;
	mutex_unlock(&callback_mutex);

	retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);

	NODEMASK_FREE(mask);

	return retval;
}

static ssize_t cpuset_common_file_read(struct cgroup *cont,
				       struct cftype *cft,
				       struct file *file,
				       char __user *buf,
				       size_t nbytes, loff_t *ppos)
{
	struct cpuset *cs = cgroup_cs(cont);
	cpuset_filetype_t type = cft->private;
	char *page;
	ssize_t retval = 0;
	char *s;

	if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
		return -ENOMEM;

	s = page;

	switch (type) {
	case FILE_CPULIST:
		s += cpuset_sprintf_cpulist(s, cs);
		break;
	case FILE_MEMLIST:
		s += cpuset_sprintf_memlist(s, cs);
		break;
	default:
		retval = -EINVAL;
		goto out;
	}
	*s++ = '\n';

	retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
out:
	free_page((unsigned long)page);
	return retval;
}

static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
{
	struct cpuset *cs = cgroup_cs(cont);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
{
	struct cpuset *cs = cgroup_cs(cont);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		return cs->relax_domain_level;
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}


/*
 * for the common functions, 'private' gives the type of file
 */

static struct cftype files[] = {
	{
		.name = "cpus",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE,
		.mode = S_IRUGO,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},
};

static struct cftype cft_memory_pressure_enabled = {
	.name = "memory_pressure_enabled",
	.read_u64 = cpuset_read_u64,
	.write_u64 = cpuset_write_u64,
	.private = FILE_MEMORY_PRESSURE_ENABLED,
};

static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
{
	int err;

	err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
	if (err)
		return err;
	/* memory_pressure_enabled is in root cpuset only */
	if (!cont->parent)
		err = cgroup_add_file(cont, ss,
				      &cft_memory_pressure_enabled);
	return err;
}

/*
 * post_clone() is called at the end of cgroup_clone().
 * 'cgroup' was just created automatically as a result of
 * a cgroup_clone(), and the current task is about to
 * be moved into 'cgroup'.
 *
 * Currently we refuse to set up the cgroup - thereby
 * refusing the task to be entered, and as a result refusing
 * the sys_unshare() or clone() which initiated it - if any
 * sibling cpusets have exclusive cpus or mem.
 *
 * If this becomes a problem for some users who wish to
 * allow that scenario, then cpuset_post_clone() could be
 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
 * held.
 */
static void cpuset_post_clone(struct cgroup_subsys *ss,
			      struct cgroup *cgroup)
{
	struct cgroup *parent, *child;
	struct cpuset *cs, *parent_cs;

	parent = cgroup->parent;
	list_for_each_entry(child, &parent->children, sibling) {
		cs = cgroup_cs(child);
		if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
			return;
	}
	cs = cgroup_cs(cgroup);
	parent_cs = cgroup_cs(parent);

	cs->mems_allowed = parent_cs->mems_allowed;
	cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
	return;
}

/*
 * cpuset_create - create a cpuset
 * ss:	cpuset cgroup subsystem
 * cont:	control group that the new cpuset will be part of
 */

static struct cgroup_subsys_state *cpuset_create(
	struct cgroup_subsys *ss,
	struct cgroup *cont)
{
	struct cpuset *cs;
	struct cpuset *parent;

	if (!cont->parent) {
		return &top_cpuset.css;
	}
	parent = cgroup_cs(cont->parent);
	cs = kmalloc(sizeof(*cs), GFP_KERNEL);
	if (!cs)
		return ERR_PTR(-ENOMEM);
	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
		kfree(cs);
		return ERR_PTR(-ENOMEM);
	}

	cs->flags = 0;
	if (is_spread_page(parent))
		set_bit(CS_SPREAD_PAGE, &cs->flags);
	if (is_spread_slab(parent))
		set_bit(CS_SPREAD_SLAB, &cs->flags);
	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
	cpumask_clear(cs->cpus_allowed);
	nodes_clear(cs->mems_allowed);
	fmeter_init(&cs->fmeter);
	cs->relax_domain_level = -1;

	cs->parent = parent;
	number_of_cpusets++;
	return &cs->css;
}

/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call async_rebuild_sched_domains().
 */
1913 */ 1914 1915static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 1916{ 1917 struct cpuset *cs = cgroup_cs(cont); 1918 1919 if (is_sched_load_balance(cs)) 1920 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1921 1922 number_of_cpusets--; 1923 free_cpumask_var(cs->cpus_allowed); 1924 kfree(cs); 1925} 1926 1927struct cgroup_subsys cpuset_subsys = { 1928 .name = "cpuset", 1929 .create = cpuset_create, 1930 .destroy = cpuset_destroy, 1931 .can_attach = cpuset_can_attach, 1932 .attach = cpuset_attach, 1933 .populate = cpuset_populate, 1934 .post_clone = cpuset_post_clone, 1935 .subsys_id = cpuset_subsys_id, 1936 .early_init = 1, 1937}; 1938 1939/** 1940 * cpuset_init - initialize cpusets at system boot 1941 * 1942 * Description: Initialize top_cpuset and the cpuset internal file system, 1943 **/ 1944 1945int __init cpuset_init(void) 1946{ 1947 int err = 0; 1948 1949 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) 1950 BUG(); 1951 1952 cpumask_setall(top_cpuset.cpus_allowed); 1953 nodes_setall(top_cpuset.mems_allowed); 1954 1955 fmeter_init(&top_cpuset.fmeter); 1956 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 1957 top_cpuset.relax_domain_level = -1; 1958 1959 err = register_filesystem(&cpuset_fs_type); 1960 if (err < 0) 1961 return err; 1962 1963 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) 1964 BUG(); 1965 1966 number_of_cpusets = 1; 1967 return 0; 1968} 1969 1970/** 1971 * cpuset_do_move_task - move a given task to another cpuset 1972 * @tsk: pointer to task_struct the task to move 1973 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner 1974 * 1975 * Called by cgroup_scan_tasks() for each task in a cgroup. 1976 * Return nonzero to stop the walk through the tasks. 1977 */ 1978static void cpuset_do_move_task(struct task_struct *tsk, 1979 struct cgroup_scanner *scan) 1980{ 1981 struct cgroup *new_cgroup = scan->data; 1982 1983 cgroup_attach_task(new_cgroup, tsk); 1984} 1985 1986/** 1987 * move_member_tasks_to_cpuset - move tasks from one cpuset to another 1988 * @from: cpuset in which the tasks currently reside 1989 * @to: cpuset to which the tasks will be moved 1990 * 1991 * Called with cgroup_mutex held 1992 * callback_mutex must not be held, as cpuset_attach() will take it. 1993 * 1994 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 1995 * calling callback functions for each. 1996 */ 1997static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) 1998{ 1999 struct cgroup_scanner scan; 2000 2001 scan.cg = from->css.cgroup; 2002 scan.test_task = NULL; /* select all tasks in cgroup */ 2003 scan.process_task = cpuset_do_move_task; 2004 scan.heap = NULL; 2005 scan.data = to->css.cgroup; 2006 2007 if (cgroup_scan_tasks(&scan)) 2008 printk(KERN_ERR "move_member_tasks_to_cpuset: " 2009 "cgroup_scan_tasks failed\n"); 2010} 2011 2012/* 2013 * If CPU and/or memory hotplug handlers, below, unplug any CPUs 2014 * or memory nodes, we need to walk over the cpuset hierarchy, 2015 * removing that CPU or node from all cpusets. If this removes the 2016 * last CPU or node from a cpuset, then move the tasks in the empty 2017 * cpuset to its next-highest non-empty parent. 2018 * 2019 * Called with cgroup_mutex held 2020 * callback_mutex must not be held, as cpuset_attach() will take it. 
2021 */ 2022static void remove_tasks_in_empty_cpuset(struct cpuset *cs) 2023{ 2024 struct cpuset *parent; 2025 2026 /* 2027 * The cgroup's css_sets list is in use if there are tasks 2028 * in the cpuset; the list is empty if there are none; 2029 * the cs->css.refcnt seems always 0. 2030 */ 2031 if (list_empty(&cs->css.cgroup->css_sets)) 2032 return; 2033 2034 /* 2035 * Find its next-highest non-empty parent, (top cpuset 2036 * has online cpus, so can't be empty). 2037 */ 2038 parent = cs->parent; 2039 while (cpumask_empty(parent->cpus_allowed) || 2040 nodes_empty(parent->mems_allowed)) 2041 parent = parent->parent; 2042 2043 move_member_tasks_to_cpuset(cs, parent); 2044} 2045 2046/* 2047 * Walk the specified cpuset subtree and look for empty cpusets. 2048 * The tasks of such cpuset must be moved to a parent cpuset. 2049 * 2050 * Called with cgroup_mutex held. We take callback_mutex to modify 2051 * cpus_allowed and mems_allowed. 2052 * 2053 * This walk processes the tree from top to bottom, completing one layer 2054 * before dropping down to the next. It always processes a node before 2055 * any of its children. 2056 * 2057 * For now, since we lack memory hot unplug, we'll never see a cpuset 2058 * that has tasks along with an empty 'mems'. But if we did see such 2059 * a cpuset, we'd handle it just like we do if its 'cpus' was empty. 2060 */ 2061static void scan_for_empty_cpusets(struct cpuset *root) 2062{ 2063 LIST_HEAD(queue); 2064 struct cpuset *cp; /* scans cpusets being updated */ 2065 struct cpuset *child; /* scans child cpusets of cp */ 2066 struct cgroup *cont; 2067 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); 2068 2069 if (oldmems == NULL) 2070 return; 2071 2072 list_add_tail((struct list_head *)&root->stack_list, &queue); 2073 2074 while (!list_empty(&queue)) { 2075 cp = list_first_entry(&queue, struct cpuset, stack_list); 2076 list_del(queue.next); 2077 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 2078 child = cgroup_cs(cont); 2079 list_add_tail(&child->stack_list, &queue); 2080 } 2081 2082 /* Continue past cpusets with all cpus, mems online */ 2083 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && 2084 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2085 continue; 2086 2087 *oldmems = cp->mems_allowed; 2088 2089 /* Remove offline cpus and mems from this cpuset. */ 2090 mutex_lock(&callback_mutex); 2091 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2092 cpu_active_mask); 2093 nodes_and(cp->mems_allowed, cp->mems_allowed, 2094 node_states[N_HIGH_MEMORY]); 2095 mutex_unlock(&callback_mutex); 2096 2097 /* Move tasks from the empty cpuset to a parent */ 2098 if (cpumask_empty(cp->cpus_allowed) || 2099 nodes_empty(cp->mems_allowed)) 2100 remove_tasks_in_empty_cpuset(cp); 2101 else { 2102 update_tasks_cpumask(cp, NULL); 2103 update_tasks_nodemask(cp, oldmems, NULL); 2104 } 2105 } 2106 NODEMASK_FREE(oldmems); 2107} 2108 2109/* 2110 * The top_cpuset tracks what CPUs and Memory Nodes are online, 2111 * period. This is necessary in order to make cpusets transparent 2112 * (of no affect) on systems that are actively using CPU hotplug 2113 * but making no active use of cpusets. 2114 * 2115 * This routine ensures that top_cpuset.cpus_allowed tracks 2116 * cpu_active_mask on each CPU hotplug (cpuhp) event. 2117 * 2118 * Called within get_online_cpus(). Needs to call cgroup_lock() 2119 * before calling generate_sched_domains(). 
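 *
 * For example (tracing the code below): if CPU 3 is hot-unplugged,
 * cpu_active_mask is copied into top_cpuset.cpus_allowed under
 * callback_mutex, scan_for_empty_cpusets() then strips CPU 3 from every
 * descendant cpuset and moves the tasks of any cpuset left with no CPUs
 * to a non-empty ancestor, and partition_sched_domains() finally
 * rebuilds the scheduler domains from the surviving cpusets.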
2120 */ 2121void cpuset_update_active_cpus(void) 2122{ 2123 struct sched_domain_attr *attr; 2124 cpumask_var_t *doms; 2125 int ndoms; 2126 2127 cgroup_lock(); 2128 mutex_lock(&callback_mutex); 2129 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2130 mutex_unlock(&callback_mutex); 2131 scan_for_empty_cpusets(&top_cpuset); 2132 ndoms = generate_sched_domains(&doms, &attr); 2133 cgroup_unlock(); 2134 2135 /* Have scheduler rebuild the domains */ 2136 partition_sched_domains(ndoms, doms, attr); 2137} 2138 2139#ifdef CONFIG_MEMORY_HOTPLUG 2140/* 2141 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2142 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. 2143 * See also the previous routine cpuset_track_online_cpus(). 2144 */ 2145static int cpuset_track_online_nodes(struct notifier_block *self, 2146 unsigned long action, void *arg) 2147{ 2148 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); 2149 2150 if (oldmems == NULL) 2151 return NOTIFY_DONE; 2152 2153 cgroup_lock(); 2154 switch (action) { 2155 case MEM_ONLINE: 2156 *oldmems = top_cpuset.mems_allowed; 2157 mutex_lock(&callback_mutex); 2158 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2159 mutex_unlock(&callback_mutex); 2160 update_tasks_nodemask(&top_cpuset, oldmems, NULL); 2161 break; 2162 case MEM_OFFLINE: 2163 /* 2164 * needn't update top_cpuset.mems_allowed explicitly because 2165 * scan_for_empty_cpusets() will update it. 2166 */ 2167 scan_for_empty_cpusets(&top_cpuset); 2168 break; 2169 default: 2170 break; 2171 } 2172 cgroup_unlock(); 2173 2174 NODEMASK_FREE(oldmems); 2175 return NOTIFY_OK; 2176} 2177#endif 2178 2179/** 2180 * cpuset_init_smp - initialize cpus_allowed 2181 * 2182 * Description: Finish top cpuset after cpu, node maps are initialized 2183 **/ 2184 2185void __init cpuset_init_smp(void) 2186{ 2187 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2188 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2189 2190 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2191 2192 cpuset_wq = create_singlethread_workqueue("cpuset"); 2193 BUG_ON(!cpuset_wq); 2194} 2195 2196/** 2197 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 2198 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 2199 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. 2200 * 2201 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset 2202 * attached to the specified @tsk. Guaranteed to return some non-empty 2203 * subset of cpu_online_map, even if this means going outside the 2204 * tasks cpuset. 2205 **/ 2206 2207void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2208{ 2209 mutex_lock(&callback_mutex); 2210 task_lock(tsk); 2211 guarantee_online_cpus(task_cs(tsk), pmask); 2212 task_unlock(tsk); 2213 mutex_unlock(&callback_mutex); 2214} 2215 2216int cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2217{ 2218 const struct cpuset *cs; 2219 int cpu; 2220 2221 rcu_read_lock(); 2222 cs = task_cs(tsk); 2223 if (cs) 2224 cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed); 2225 rcu_read_unlock(); 2226 2227 /* 2228 * We own tsk->cpus_allowed, nobody can change it under us. 2229 * 2230 * But we used cs && cs->cpus_allowed lockless and thus can 2231 * race with cgroup_attach_task() or update_cpumask() and get 2232 * the wrong tsk->cpus_allowed. However, both cases imply the 2233 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr() 2234 * which takes task_rq_lock(). 
2235 * 2236 * If we are called after it dropped the lock we must see all 2237 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary 2238 * set any mask even if it is not right from task_cs() pov, 2239 * the pending set_cpus_allowed_ptr() will fix things. 2240 */ 2241 2242 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask); 2243 if (cpu >= nr_cpu_ids) { 2244 /* 2245 * Either tsk->cpus_allowed is wrong (see above) or it 2246 * is actually empty. The latter case is only possible 2247 * if we are racing with remove_tasks_in_empty_cpuset(). 2248 * Like above we can temporary set any mask and rely on 2249 * set_cpus_allowed_ptr() as synchronization point. 2250 */ 2251 cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask); 2252 cpu = cpumask_any(cpu_active_mask); 2253 } 2254 2255 return cpu; 2256} 2257 2258void cpuset_init_current_mems_allowed(void) 2259{ 2260 nodes_setall(current->mems_allowed); 2261} 2262 2263/** 2264 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. 2265 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. 2266 * 2267 * Description: Returns the nodemask_t mems_allowed of the cpuset 2268 * attached to the specified @tsk. Guaranteed to return some non-empty 2269 * subset of node_states[N_HIGH_MEMORY], even if this means going outside the 2270 * tasks cpuset. 2271 **/ 2272 2273nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 2274{ 2275 nodemask_t mask; 2276 2277 mutex_lock(&callback_mutex); 2278 task_lock(tsk); 2279 guarantee_online_mems(task_cs(tsk), &mask); 2280 task_unlock(tsk); 2281 mutex_unlock(&callback_mutex); 2282 2283 return mask; 2284} 2285 2286/** 2287 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed 2288 * @nodemask: the nodemask to be checked 2289 * 2290 * Are any of the nodes in the nodemask allowed in current->mems_allowed? 2291 */ 2292int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) 2293{ 2294 return nodes_intersects(*nodemask, current->mems_allowed); 2295} 2296 2297/* 2298 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or 2299 * mem_hardwall ancestor to the specified cpuset. Call holding 2300 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall 2301 * (an unusual configuration), then returns the root cpuset. 2302 */ 2303static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) 2304{ 2305 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) 2306 cs = cs->parent; 2307 return cs; 2308} 2309 2310/** 2311 * cpuset_node_allowed_softwall - Can we allocate on a memory node? 2312 * @node: is this an allowed node? 2313 * @gfp_mask: memory allocation flags 2314 * 2315 * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is 2316 * set, yes, we can always allocate. If node is in our task's mems_allowed, 2317 * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest 2318 * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been 2319 * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE 2320 * flag, yes. 2321 * Otherwise, no. 2322 * 2323 * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to 2324 * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall() 2325 * might sleep, and might allow a node from an enclosing cpuset. 2326 * 2327 * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall 2328 * cpusets, and never sleeps. 
2329 * 2330 * The __GFP_THISNODE placement logic is really handled elsewhere, 2331 * by forcibly using a zonelist starting at a specified node, and by 2332 * (in get_page_from_freelist()) refusing to consider the zones for 2333 * any node on the zonelist except the first. By the time any such 2334 * calls get to this routine, we should just shut up and say 'yes'. 2335 * 2336 * GFP_USER allocations are marked with the __GFP_HARDWALL bit, 2337 * and do not allow allocations outside the current tasks cpuset 2338 * unless the task has been OOM killed as is marked TIF_MEMDIE. 2339 * GFP_KERNEL allocations are not so marked, so can escape to the 2340 * nearest enclosing hardwalled ancestor cpuset. 2341 * 2342 * Scanning up parent cpusets requires callback_mutex. The 2343 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit 2344 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the 2345 * current tasks mems_allowed came up empty on the first pass over 2346 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the 2347 * cpuset are short of memory, might require taking the callback_mutex 2348 * mutex. 2349 * 2350 * The first call here from mm/page_alloc:get_page_from_freelist() 2351 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, 2352 * so no allocation on a node outside the cpuset is allowed (unless 2353 * in interrupt, of course). 2354 * 2355 * The second pass through get_page_from_freelist() doesn't even call 2356 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() 2357 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set 2358 * in alloc_flags. That logic and the checks below have the combined 2359 * affect that: 2360 * in_interrupt - any node ok (current task context irrelevant) 2361 * GFP_ATOMIC - any node ok 2362 * TIF_MEMDIE - any node ok 2363 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok 2364 * GFP_USER - only nodes in current tasks mems allowed ok. 2365 * 2366 * Rule: 2367 * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you 2368 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables 2369 * the code that might scan up ancestor cpusets and sleep. 2370 */ 2371int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) 2372{ 2373 const struct cpuset *cs; /* current cpuset ancestors */ 2374 int allowed; /* is allocation in zone z allowed? */ 2375 2376 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2377 return 1; 2378 might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); 2379 if (node_isset(node, current->mems_allowed)) 2380 return 1; 2381 /* 2382 * Allow tasks that have access to memory reserves because they have 2383 * been OOM killed to get memory anywhere. 2384 */ 2385 if (unlikely(test_thread_flag(TIF_MEMDIE))) 2386 return 1; 2387 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ 2388 return 0; 2389 2390 if (current->flags & PF_EXITING) /* Let dying task have memory */ 2391 return 1; 2392 2393 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2394 mutex_lock(&callback_mutex); 2395 2396 task_lock(current); 2397 cs = nearest_hardwall_ancestor(task_cs(current)); 2398 task_unlock(current); 2399 2400 allowed = node_isset(node, cs->mems_allowed); 2401 mutex_unlock(&callback_mutex); 2402 return allowed; 2403} 2404 2405/* 2406 * cpuset_node_allowed_hardwall - Can we allocate on a memory node? 2407 * @node: is this an allowed node? 2408 * @gfp_mask: memory allocation flags 2409 * 2410 * If we're in interrupt, yes, we can always allocate. 
If __GFP_THISNODE is 2411 * set, yes, we can always allocate. If node is in our task's mems_allowed, 2412 * yes. If the task has been OOM killed and has access to memory reserves as 2413 * specified by the TIF_MEMDIE flag, yes. 2414 * Otherwise, no. 2415 * 2416 * The __GFP_THISNODE placement logic is really handled elsewhere, 2417 * by forcibly using a zonelist starting at a specified node, and by 2418 * (in get_page_from_freelist()) refusing to consider the zones for 2419 * any node on the zonelist except the first. By the time any such 2420 * calls get to this routine, we should just shut up and say 'yes'. 2421 * 2422 * Unlike the cpuset_node_allowed_softwall() variant, above, 2423 * this variant requires that the node be in the current task's 2424 * mems_allowed or that we're in interrupt. It does not scan up the 2425 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. 2426 * It never sleeps. 2427 */ 2428int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) 2429{ 2430 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2431 return 1; 2432 if (node_isset(node, current->mems_allowed)) 2433 return 1; 2434 /* 2435 * Allow tasks that have access to memory reserves because they have 2436 * been OOM killed to get memory anywhere. 2437 */ 2438 if (unlikely(test_thread_flag(TIF_MEMDIE))) 2439 return 1; 2440 return 0; 2441} 2442 2443/** 2444 * cpuset_unlock - release lock on cpuset changes 2445 * 2446 * Undo the lock taken in a previous cpuset_lock() call. 2447 */ 2448 2449void cpuset_unlock(void) 2450{ 2451 mutex_unlock(&callback_mutex); 2452} 2453 2454/** 2455 * cpuset_mem_spread_node() - On which node to begin search for a file page 2456 * cpuset_slab_spread_node() - On which node to begin search for a slab page 2457 * 2458 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for 2459 * tasks in a cpuset with is_spread_page or is_spread_slab set), 2460 * and if the memory allocation used cpuset_mem_spread_node() 2461 * to determine on which node to start looking, as it will for 2462 * certain page cache or slab cache pages such as used for file 2463 * system buffers and inode caches, then instead of starting on the 2464 * local node to look for a free page, rather spread the starting 2465 * node around the tasks mems_allowed nodes. 2466 * 2467 * We don't have to worry about the returned node being offline 2468 * because "it can't happen", and even if it did, it would be ok. 2469 * 2470 * The routines calling guarantee_online_mems() are careful to 2471 * only set nodes in task->mems_allowed that are online. So it 2472 * should not be possible for the following code to return an 2473 * offline node. But if it did, that would be ok, as this routine 2474 * is not returning the node where the allocation must be, only 2475 * the node where the search should start. The zonelist passed to 2476 * __alloc_pages() will include all nodes. If the slab allocator 2477 * is passed an offline node, it will fall back to the local node. 2478 * See kmem_cache_alloc_node(). 
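 *
 * For example, with current->mems_allowed = 4-7 and the rotor last left
 * at node 6, successive calls return node 7, then wrap via first_node()
 * back to node 4, then 5, 6, 7, ..., spreading the starting node evenly
 * over the allowed nodes.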
2479 */ 2480 2481static int cpuset_spread_node(int *rotor) 2482{ 2483 int node; 2484 2485 node = next_node(*rotor, current->mems_allowed); 2486 if (node == MAX_NUMNODES) 2487 node = first_node(current->mems_allowed); 2488 *rotor = node; 2489 return node; 2490} 2491 2492int cpuset_mem_spread_node(void) 2493{ 2494 return cpuset_spread_node(&current->cpuset_mem_spread_rotor); 2495} 2496 2497int cpuset_slab_spread_node(void) 2498{ 2499 return cpuset_spread_node(&current->cpuset_slab_spread_rotor); 2500} 2501 2502EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); 2503 2504/** 2505 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? 2506 * @tsk1: pointer to task_struct of some task. 2507 * @tsk2: pointer to task_struct of some other task. 2508 * 2509 * Description: Return true if @tsk1's mems_allowed intersects the 2510 * mems_allowed of @tsk2. Used by the OOM killer to determine if 2511 * one of the tasks' memory usage might impact the memory available 2512 * to the other. 2513 **/ 2514 2515int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, 2516 const struct task_struct *tsk2) 2517{ 2518 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); 2519} 2520 2521/** 2522 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed 2523 * @tsk: pointer to task_struct of some task. 2524 * 2525 * Description: Prints @tsk's name, cpuset name, and cached copy of its 2526 * mems_allowed to the kernel log. Must hold task_lock(tsk) to allow 2527 * dereferencing task_cs(tsk). 2528 */ 2529void cpuset_print_task_mems_allowed(struct task_struct *tsk) 2530{ 2531 struct dentry *dentry; 2532 2533 dentry = task_cs(tsk)->css.cgroup->dentry; 2534 spin_lock(&cpuset_buffer_lock); 2535 snprintf(cpuset_name, CPUSET_NAME_LEN, "%s", 2536 dentry ? (const char *)dentry->d_name.name : "/"); 2537 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2538 tsk->mems_allowed); 2539 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", 2540 tsk->comm, cpuset_name, cpuset_nodelist); 2541 spin_unlock(&cpuset_buffer_lock); 2542} 2543 2544/* 2545 * Collection of memory_pressure is suppressed unless 2546 * this flag is enabled by writing "1" to the special 2547 * cpuset file 'memory_pressure_enabled' in the root cpuset. 2548 */ 2549 2550int cpuset_memory_pressure_enabled __read_mostly; 2551 2552/** 2553 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. 2554 * 2555 * Keep a running average of the rate of synchronous (direct) 2556 * page reclaim efforts initiated by tasks in each cpuset. 2557 * 2558 * This represents the rate at which some task in the cpuset 2559 * ran low on memory on all nodes it was allowed to use, and 2560 * had to enter the kernel's page reclaim code in an effort to 2561 * create more free memory by tossing clean pages or swapping 2562 * or writing dirty pages. 2563 * 2564 * Displayed to user space in the per-cpuset read-only file 2565 * "memory_pressure". The value displayed is an integer 2566 * representing the recent rate of entry into the synchronous 2567 * (direct) page reclaim by any task attached to the cpuset. 2568 **/ 2569 2570void __cpuset_memory_pressure_bump(void) 2571{ 2572 task_lock(current); 2573 fmeter_markevent(&task_cs(current)->fmeter); 2574 task_unlock(current); 2575} 2576 2577#ifdef CONFIG_PROC_PID_CPUSET 2578/* 2579 * proc_cpuset_show() 2580 * - Print task's cpuset path into seq_file. 2581 * - Used for /proc/<pid>/cpuset.
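 * - Example output (illustrative): a task in the top cpuset reads back
 *   "/", while a task attached to a child cpuset named "foo" reads back
 *   "/foo".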
2582 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2583 * doesn't really matter if tsk->cpuset changes after we read it, 2584 * and we take cgroup_mutex, keeping cpuset_attach() from changing it 2585 * anyway. 2586 */ 2587static int proc_cpuset_show(struct seq_file *m, void *unused_v) 2588{ 2589 struct pid *pid; 2590 struct task_struct *tsk; 2591 char *buf; 2592 struct cgroup_subsys_state *css; 2593 int retval; 2594 2595 retval = -ENOMEM; 2596 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 2597 if (!buf) 2598 goto out; 2599 2600 retval = -ESRCH; 2601 pid = m->private; 2602 tsk = get_pid_task(pid, PIDTYPE_PID); 2603 if (!tsk) 2604 goto out_free; 2605 2606 retval = -EINVAL; 2607 cgroup_lock(); 2608 css = task_subsys_state(tsk, cpuset_subsys_id); 2609 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2610 if (retval < 0) 2611 goto out_unlock; 2612 seq_puts(m, buf); 2613 seq_putc(m, '\n'); 2614out_unlock: 2615 cgroup_unlock(); 2616 put_task_struct(tsk); 2617out_free: 2618 kfree(buf); 2619out: 2620 return retval; 2621} 2622 2623static int cpuset_open(struct inode *inode, struct file *file) 2624{ 2625 struct pid *pid = PROC_I(inode)->pid; 2626 return single_open(file, proc_cpuset_show, pid); 2627} 2628 2629const struct file_operations proc_cpuset_operations = { 2630 .open = cpuset_open, 2631 .read = seq_read, 2632 .llseek = seq_lseek, 2633 .release = single_release, 2634}; 2635#endif /* CONFIG_PROC_PID_CPUSET */ 2636 2637/* Display task mems_allowed in /proc/<pid>/status file. */ 2638void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2639{ 2640 seq_printf(m, "Mems_allowed:\t"); 2641 seq_nodemask(m, &task->mems_allowed); 2642 seq_printf(m, "\n"); 2643 seq_printf(m, "Mems_allowed_list:\t"); 2644 seq_nodemask_list(m, &task->mems_allowed); 2645 seq_printf(m, "\n"); 2646} 2647
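
/*
 * Example: driving this interface from user space.
 *
 * The functions above are exercised through the cgroup filesystem and
 * /proc rather than by direct calls.  The sketch below is illustrative
 * only: it assumes the cpuset hierarchy is mounted at /dev/cpuset
 * (mount -t cpuset cpuset /dev/cpuset) and that a child cpuset named
 * "boot" already exists; neither name is mandated by this file.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		char buf[256];
 *		ssize_t n;
 *		int fd;
 *
 *		// Which cpuset am I in?  Served by proc_cpuset_show().
 *		fd = open("/proc/self/cpuset", O_RDONLY);
 *		if (fd >= 0) {
 *			n = read(fd, buf, sizeof(buf) - 1);
 *			if (n > 0) {
 *				buf[n] = '\0';
 *				printf("current cpuset: %s", buf);
 *			}
 *			close(fd);
 *		}
 *
 *		// Confine the "boot" cpuset to CPUs 0-1 and memory node 0.
 *		// Writes land in cpuset_write_resmask() via the "cpus" and
 *		// "mems" entries of the files[] table above.
 *		fd = open("/dev/cpuset/boot/cpus", O_WRONLY);
 *		if (fd >= 0) {
 *			write(fd, "0-1", strlen("0-1"));
 *			close(fd);
 *		}
 *		fd = open("/dev/cpuset/boot/mems", O_WRONLY);
 *		if (fd >= 0) {
 *			write(fd, "0", 1);
 *			close(fd);
 *		}
 *		return 0;
 *	}
 */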