/* Modified by Broadcom Corp. Portions Copyright (c) Broadcom Corp, 2012. */
/*
 *  kernel/sched.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *              make semaphores SMP safe
 *  1998-11-19  Implemented schedule_timeout() and related stuff
 *              by Andrea Arcangeli
 *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
 *              hybrid priority-list and round-robin design with
 *              an array-switch method of distributing timeslices
 *              and per-CPU runqueues.  Cleanups and useful suggestions
 *              by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03  Interactivity tuning by Con Kolivas.
 *  2004-04-02  Scheduler domains code by Nick Piggin
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
 *              fair scheduling design by Con Kolivas.
 *  2007-05-05  Load balancing (smp-nice) and other improvements
 *              by Peter Williams
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *              Thomas Gleixner, Mike Kravetz
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stop_machine.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>

#include <asm/tlb.h>
#include <asm/irq_regs.h>

#include "sched_cpupri.h"
#include "workqueue_sched.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

#if defined(CONFIG_BUZZZ)
#include <asm/buzzz.h>
/* These global variables are needed to hold the prev and next tasks for
 * logging the context switch, as the stack will be invalid after
 * context_switch. Per-cpu macros are not needed because these variables
 * are accessed only inside preemption-disabled code.
 */
#if defined(BUZZZ_KEVT_LVL) && (BUZZZ_KEVT_LVL >= 1)
struct task_struct *buzzz_prev[NR_CPUS];
struct task_struct *buzzz_next[NR_CPUS];
#endif	/* BUZZZ_KEVT_LVL */
#endif	/* CONFIG_BUZZZ */
/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
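/*
 * For example, by the macros below, NICE_TO_PRIO(-20) == MAX_RT_PRIO,
 * NICE_TO_PRIO(0) == MAX_RT_PRIO + 20 and NICE_TO_PRIO(19) ==
 * MAX_RT_PRIO + 39 == MAX_PRIO - 1, so the whole nice range maps onto
 * [ MAX_RT_PRIO ... MAX_PRIO-1 ], and PRIO_TO_NICE() is the exact inverse.
 */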
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

/*
 * Helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define NICE_0_SHIFT		SCHED_LOAD_SHIFT

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 * Timeslices get refilled after they expire.
 */
#define DEF_TIMESLICE		(100 * HZ / 1000)

/*
 * single value that denotes runtime == period, ie unlimited time.
 */
#define RUNTIME_INF	((u64)~0ULL)

static inline int rt_policy(int policy)
{
	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
		return 1;
	return 0;
}

static inline int task_has_rt_policy(struct task_struct *p)
{
	return rt_policy(p->policy);
}

/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
struct rt_prio_array {
	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_RT_PRIO];
};

struct rt_bandwidth {
	/* nests inside the rq lock: */
	raw_spinlock_t		rt_runtime_lock;
	ktime_t			rt_period;
	u64			rt_runtime;
	struct hrtimer		rt_period_timer;
};

static struct rt_bandwidth def_rt_bandwidth;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	ktime_t now;
	int overrun;
	int idle = 0;

	for (;;) {
		now = hrtimer_cb_get_time(timer);
		overrun = hrtimer_forward(timer, now, rt_b->rt_period);

		if (!overrun)
			break;

		idle = do_sched_rt_period_timer(rt_b, overrun);
	}

	return idle ?
		HRTIMER_NORESTART : HRTIMER_RESTART;
}

static
void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer,
			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static inline int rt_bandwidth_enabled(void)
{
	return sysctl_sched_rt_runtime >= 0;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	ktime_t now;

	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	if (hrtimer_active(&rt_b->rt_period_timer))
		return;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	for (;;) {
		unsigned long delta;
		ktime_t soft, hard;

		if (hrtimer_active(&rt_b->rt_period_timer))
			break;

		now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);

		soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
		hard = hrtimer_get_expires(&rt_b->rt_period_timer);
		delta = ktime_to_ns(ktime_sub(hard, soft));
		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
				HRTIMER_MODE_ABS_PINNED, 0);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}
#endif

/*
 * sched_domains_mutex serializes calls to arch_init_sched_domains,
 * detach_destroy_domains and partition_sched_domains.
 */
static DEFINE_MUTEX(sched_domains_mutex);

#ifdef CONFIG_CGROUP_SCHED

#include <linux/cgroup.h>

struct cfs_rq;

static LIST_HEAD(task_groups);

/* task group related information */
struct task_group {
	struct cgroup_subsys_state css;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* schedulable entities of this group on each cpu */
	struct sched_entity **se;
	/* runqueue "owned" by this group on each cpu */
	struct cfs_rq **cfs_rq;
	unsigned long shares;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity **rt_se;
	struct rt_rq **rt_rq;

	struct rt_bandwidth rt_bandwidth;
#endif

	struct rcu_head rcu;
	struct list_head list;

	struct task_group *parent;
	struct list_head siblings;
	struct list_head children;
};

#define root_task_group init_task_group

/* task_group_lock serializes add/remove of task groups and also changes to
 * a task group's cpu shares.
 */
static DEFINE_SPINLOCK(task_group_lock);

#ifdef CONFIG_FAIR_GROUP_SCHED

#ifdef CONFIG_SMP
static int root_task_group_empty(void)
{
	return list_empty(&root_task_group.children);
}
#endif

# define INIT_TASK_GROUP_LOAD	NICE_0_LOAD

/*
 * A weight of 0 or 1 can cause arithmetic problems.
 * The weight of a cfs_rq is the sum of the weights of the entities
 * queued on it, so the weight of an entity should not be too large,
 * and neither should the shares value of a task group.
 * (The default weight is 1024 - so there's no practical
 *  limitation from this.)
 */
#define MIN_SHARES	2
#define MAX_SHARES	(1UL << 18)

static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif

/* Default task group.
 *	Every task in the system belongs to this group at bootup.
 */
struct task_group init_task_group;

#endif	/* CONFIG_CGROUP_SCHED */

/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;
	unsigned long nr_running;

	u64 exec_clock;
	u64 min_vruntime;

	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;

	struct list_head tasks;
	struct list_head *balance_iterator;

	/*
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity *curr, *next, *last;

	unsigned int nr_spread_over;

#ifdef CONFIG_FAIR_GROUP_SCHED
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */

	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
	 * list is used during load balance.
	 */
	struct list_head leaf_cfs_rq_list;
	struct task_group *tg;	/* group that "owns" this runqueue */

#ifdef CONFIG_SMP
	/*
	 * the part of load.weight contributed by tasks
	 */
	unsigned long task_weight;

	/*
	 *   h_load = weight * f(tg)
	 *
	 * Where f(tg) is the recursive weight fraction assigned to
	 * this group.
	 */
	unsigned long h_load;

	/*
	 * this cpu's part of tg->shares
	 */
	unsigned long shares;

	/*
	 * load.weight at the time we set shares
	 */
	unsigned long rq_weight;
#endif
#endif
};

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
	struct rt_prio_array active;
	unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
	struct {
		int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
		int next; /* next highest */
#endif
	} highest_prio;
#endif
#ifdef CONFIG_SMP
	unsigned long rt_nr_migratory;
	unsigned long rt_nr_total;
	int overloaded;
	struct plist_head pushable_tasks;
#endif
	int rt_throttled;
	u64 rt_time;
	u64 rt_runtime;
	/* Nests inside the rq lock: */
	raw_spinlock_t rt_runtime_lock;

#ifdef CONFIG_RT_GROUP_SCHED
	unsigned long rt_nr_boosted;

	struct rq *rq;
	struct list_head leaf_rt_rq_list;
	struct task_group *tg;
#endif
};

#ifdef CONFIG_SMP

/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by
 * fully partitioning the member cpus from any other cpuset. Whenever a new
 * exclusive cpuset is created, we also create and attach a new root-domain
 * object.
 *
 */
struct root_domain {
	atomic_t refcount;
	cpumask_var_t span;
	cpumask_var_t online;

	/*
	 * The "RT overload" flag: it gets set if a CPU has more than
	 * one runnable RT task.
	 */
	cpumask_var_t rto_mask;
	atomic_t rto_count;
#ifdef CONFIG_SMP
	struct cpupri cpupri;
#endif
};

/*
 * By default the system creates a single root-domain with all cpus as
 * members (mimicking the global state we have today).
 */
static struct root_domain def_root_domain;

#endif

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: code that wants to lock multiple runqueues (such as
 * the load balancing or the thread migration code) must acquire the
 * locks in ascending &runqueue order.
 */
struct rq {
	/* runqueue lock: */
	raw_spinlock_t lock;

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	unsigned long nr_running;
	#define CPU_LOAD_IDX_MAX 5
	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
	unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ
	u64 nohz_stamp;
	unsigned char nohz_balance_kick;
#endif
	unsigned int skip_clock_update;

	/* capture load from *all* tasks on this cpu: */
	struct load_weight load;
	unsigned long nr_load_updates;
	u64 nr_switches;

	struct cfs_rq cfs;
	struct rt_rq rt;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* list of leaf cfs_rq on this cpu: */
	struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	struct list_head leaf_rt_rq_list;
#endif

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
	unsigned long nr_uninterruptible;

	struct task_struct *curr, *idle;
	unsigned long next_balance;
	struct mm_struct *prev_mm;

	u64 clock;

	atomic_t nr_iowait;

#ifdef CONFIG_SMP
	struct root_domain *rd;
	struct sched_domain *sd;

	unsigned long cpu_power;

	unsigned char idle_at_tick;
	/* For active balancing */
	int post_schedule;
	int active_balance;
	int push_cpu;
	struct cpu_stop_work active_balance_work;
	/* cpu of this runqueue: */
	int cpu;
	int online;

	unsigned long avg_load_per_task;

	u64 rt_avg;
	u64 age_stamp;
	u64 idle_stamp;
	u64 avg_idle;
#endif

	/* calc_load related fields */
	unsigned long calc_load_update;
	long calc_load_active;

#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
	int hrtick_csd_pending;
	struct call_single_data hrtick_csd;
#endif
	struct hrtimer hrtick_timer;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info rq_sched_info;
	unsigned long long rq_cpu_time;
	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

	/* sys_sched_yield() stats */
	unsigned int yld_count;

	/* schedule() stats */
	unsigned int sched_switch;
	unsigned int sched_count;
	unsigned int sched_goidle;

	/* try_to_wake_up() stats */
	unsigned int ttwu_count;
	unsigned int ttwu_local;

	/* BKL stats */
	unsigned int bkl_count;
#endif
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static inline
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	rq->curr->sched_class->check_preempt_curr(rq, p, flags);

	/*
	 * A queue event has occurred, and we're going to schedule.  In
	 * this case, we can save a useless back to back clock update.
	 */
	if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
		rq->skip_clock_update = 1;
}

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq->cpu;
#else
	return 0;
#endif
}

#define rcu_dereference_check_sched_domain(p) \
	rcu_dereference_check((p), \
			      rcu_read_lock_sched_held() || \
			      lockdep_is_held(&sched_domains_mutex))

/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		(&__get_cpu_var(runqueues))
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
#define raw_rq()		(&__raw_get_cpu_var(runqueues))

#ifdef CONFIG_CGROUP_SCHED

/*
 * Return the group to which this task belongs.
 *
 * We use task_subsys_state_check() and extend the RCU verification
 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
 * holds that lock for each task it moves into the cgroup. Therefore
 * by holding that lock, we pin the task to the current cgroup.
 */
static inline struct task_group *task_group(struct task_struct *p)
{
	struct cgroup_subsys_state *css;

	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
			lockdep_is_held(&task_rq(p)->lock));
	return container_of(css, struct task_group, css);
}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
	p->se.parent = task_group(p)->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
	p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}

#else /* CONFIG_CGROUP_SCHED */

static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
	return NULL;
}

#endif /* CONFIG_CGROUP_SCHED */

inline void update_rq_clock(struct rq *rq)
{
	if (!rq->skip_clock_update)
		rq->clock = sched_clock_cpu(cpu_of(rq));
}

/*
 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 */
#ifdef CONFIG_SCHED_DEBUG
# define const_debug __read_mostly
#else
# define const_debug static const
#endif

/**
 * runqueue_is_locked
 * @cpu: the processor in question.
 *
 * Returns true if the current cpu runqueue is locked.
 * This interface allows printk to be called with the runqueue lock
 * held and know whether or not it is OK to wake up the klogd.
 */
int runqueue_is_locked(int cpu)
{
	return raw_spin_is_locked(&cpu_rq(cpu)->lock);
}

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled)	\
	__SCHED_FEAT_##name ,

enum {
#include "sched_features.h"
};

#undef SCHED_FEAT

#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "sched_features.h"
	0;

#undef SCHED_FEAT

#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled)	\
	#name ,

static __read_mostly char *sched_feat_names[] = {
#include "sched_features.h"
	NULL
};

#undef SCHED_FEAT

static int sched_feat_show(struct seq_file *m, void *v)
{
	int i;

	for (i = 0; sched_feat_names[i]; i++) {
		if (!(sysctl_sched_features & (1UL << i)))
			seq_puts(m, "NO_");
		seq_printf(m, "%s ", sched_feat_names[i]);
	}
	seq_puts(m, "\n");

	return 0;
}

static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
		size_t cnt, loff_t *ppos)
{
	char buf[64];
	char *cmp;
	int neg = 0;
	int i;

	if (cnt > 63)
		cnt = 63;

	if (copy_from_user(&buf, ubuf, cnt))
		return -EFAULT;

	buf[cnt] = 0;
	cmp = strstrip(buf);

	if (strncmp(buf, "NO_", 3) == 0) {
		neg = 1;
		cmp += 3;
	}

	for (i = 0; sched_feat_names[i]; i++) {
		if (strcmp(cmp, sched_feat_names[i]) == 0) {
			if (neg)
				sysctl_sched_features &= ~(1UL << i);
			else
				sysctl_sched_features |= (1UL << i);
			break;
		}
	}

	if (!sched_feat_names[i])
		return -EINVAL;

	*ppos += cnt;

	return cnt;
}

static int sched_feat_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_feat_show, NULL);
}

static const struct file_operations sched_feat_fops = {
	.open		= sched_feat_open,
	.write		= sched_feat_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static __init int sched_init_debug(void)
{
	debugfs_create_file("sched_features", 0644, NULL, NULL,
			&sched_feat_fops);

	return 0;
}
late_initcall(sched_init_debug);

#endif

#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * ratelimit for updating the group shares.
 * default: 0.25ms
 */
unsigned int sysctl_sched_shares_ratelimit = 250000;
unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;

/*
 * Inject some fuzziness into changing the per-cpu group shares
 * this avoids remote rq-locks at the expense of fairness.
 * default: 4
 */
unsigned int sysctl_sched_shares_thresh = 4;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

static __read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

static inline u64 global_rt_period(void)
{
	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
}

static inline u64 global_rt_runtime(void)
{
	if (sysctl_sched_rt_runtime < 0)
		return RUNTIME_INF;

	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}

#ifndef prepare_arch_switch
# define prepare_arch_switch(next)	do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev)	do { } while (0)
#endif

static inline int task_current(struct rq *rq, struct task_struct *p)
{
	return rq->curr == p;
}

#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	return task_current(rq, p);
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
	/* this is a valid case when another task releases the spinlock */
	rq->lock.owner = current;
#endif
	/*
	 * If we are tracking spinlock dependencies then we have to
	 * fix up the runqueue lock - which gets 'carried over' from
	 * prev into current:
	 */
	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

	raw_spin_unlock_irq(&rq->lock);
}

#else /* __ARCH_WANT_UNLOCKED_CTXSW */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
	return p->oncpu;
#else
	return task_current(rq, p);
#endif
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * We can optimise this out completely for !SMP, because the
	 * SMP rebalancing from interrupt is the only thing that cares
	 * here.
	 */
	next->oncpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	raw_spin_unlock_irq(&rq->lock);
#else
	raw_spin_unlock(&rq->lock);
#endif
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * After ->oncpu is cleared, the task can be moved to a different CPU.
	 * We must ensure this doesn't happen until the switch is completely
	 * finished.
	 */
	smp_wmb();
	prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

/*
 * Check whether the task is waking, we use this to synchronize ->cpus_allowed
 * against ttwu().
 */
static inline int task_is_waking(struct task_struct *p)
{
	return unlikely(p->state == TASK_WAKING);
}

/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called with interrupts disabled.
 */
static inline struct rq *__task_rq_lock(struct task_struct *p)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		raw_spin_unlock(&rq->lock);
	}
}

/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		local_irq_save(*flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		raw_spin_unlock_irqrestore(&rq->lock, *flags);
	}
}

static void __task_rq_unlock(struct rq *rq)
	__releases(rq->lock)
{
	raw_spin_unlock(&rq->lock);
}

static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
	__releases(rq->lock)
{
	raw_spin_unlock_irqrestore(&rq->lock, *flags);
}

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	raw_spin_lock(&rq->lock);

	return rq;
}

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 *
 * It's all a bit involved since we cannot program an hrtimer while holding
 * the rq->lock. So what we do is store a state in rq->hrtick_* and ask for
 * a reschedule event.
 *
 * When we get rescheduled we reprogram the hrtick_timer outside of the
 * rq->lock.
 */

/*
 * Use hrtick when:
 *  - enabled by features
 *  - hrtimer is actually high res
 */
static inline int hrtick_enabled(struct rq *rq)
{
	if (!sched_feat(HRTICK))
		return 0;
	if (!cpu_active(cpu_of(rq)))
		return 0;
	return hrtimer_is_hres_active(&rq->hrtick_timer);
}

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

#if defined(BUZZZ_KEVT_LVL) && (BUZZZ_KEVT_LVL >= 2)
	buzzz_kevt_log0(BUZZZ_KEVT_ID_SCHED_HRTICK);
#endif	/* BUZZZ_KEVT_LVL */

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	raw_spin_unlock(&rq->lock);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP
/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;

	raw_spin_lock(&rq->lock);
	hrtimer_restart(&rq->hrtick_timer);
	rq->hrtick_csd_pending = 0;
	raw_spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
static void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		hrtimer_restart(timer);
	} else if (!rq->hrtick_csd_pending) {
		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
		rq->hrtick_csd_pending = 1;
	}
}

static int
hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (action) {
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		hrtick_clear(cpu_rq(cpu));
		return NOTIFY_OK;
	}

	return NOTIFY_DONE;
}

static __init void init_hrtick(void)
{
	hotcpu_notifier(hotplug_hrtick, 0);
}
#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
static void hrtick_start(struct rq *rq, u64 delay)
{
	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
			HRTIMER_MODE_REL_PINNED, 0);
}

static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */

static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;

	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif

	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rq->hrtick_timer.function = hrtick;
}
#else	/* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}

static inline void init_hrtick(void)
{
}
#endif	/* CONFIG_SCHED_HRTICK */

/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
#ifdef CONFIG_SMP

#ifndef tsk_is_polling
#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif

static void resched_task(struct task_struct *p)
{
	int cpu;

	assert_raw_spin_locked(&task_rq(p)->lock);

	if (test_tsk_need_resched(p))
		return;

	set_tsk_need_resched(p);

	cpu = task_cpu(p);
	if (cpu == smp_processor_id())
		return;

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(p))
		smp_send_reschedule(cpu);
}

static void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
		return;
	resched_task(cpu_curr(cpu));
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#ifdef CONFIG_NO_HZ
/*
 * In the semi idle case, use the nearest busy cpu for migrating timers
 * from an idle cpu. This is good for power-savings.
 *
 * We don't do a similar optimization for a completely idle system, as
 * selecting an idle cpu will add more delays to the timers than intended
 * (as that cpu's timer base may not be up to date wrt jiffies etc).
 */
int get_nohz_timer_target(void)
{
	int cpu = smp_processor_id();
	int i;
	struct sched_domain *sd;

	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd))
			if (!idle_cpu(i))
				return i;
	}
	return cpu;
}
/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	/*
	 * This is safe, as this function is called with the timer
	 * wheel base lock of (cpu) held. When the CPU is on the way
	 * to idle and has not yet set rq->curr to idle then it will
	 * be serialized on the timer wheel base lock and take the new
	 * timer into account automatically.
	 */
	if (rq->curr != rq->idle)
		return;

	/*
	 * We can set TIF_RESCHED on the idle task of the other CPU
	 * lockless. The worst case is that the other CPU runs the
	 * idle task through an additional NOOP schedule()
	 */
	set_tsk_need_resched(rq->idle);

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(rq->idle))
		smp_send_reschedule(cpu);
}

#endif /* CONFIG_NO_HZ */

static u64 sched_avg_period(void)
{
	return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}

static void sched_avg_update(struct rq *rq)
{
	s64 period = sched_avg_period();

	while ((s64)(rq->clock - rq->age_stamp) > period) {
		/*
		 * Inline assembly required to prevent the compiler
		 * optimising this loop into a divmod call.
		 * See __iter_div_u64_rem() for another example of this.
		 */
		asm("" : "+rm" (rq->age_stamp));
		rq->age_stamp += period;
		rq->rt_avg /= 2;
	}
}

static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
	rq->rt_avg += rt_delta;
	sched_avg_update(rq);
}

#else /* !CONFIG_SMP */
static void resched_task(struct task_struct *p)
{
	assert_raw_spin_locked(&task_rq(p)->lock);
	set_tsk_need_resched(p);
}

static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
}

static void sched_avg_update(struct rq *rq)
{
}
#endif /* CONFIG_SMP */

#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

/*
 * Shift right and round:
 */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))

/*
 * delta *= weight / lw
 */
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	if (!lw->inv_weight) {
		if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
			lw->inv_weight = 1;
		else
			lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
				/ (lw->weight+1);
	}

	tmp = (u64)delta_exec * weight;
	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST))
		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
			WMULT_SHIFT/2);
	else
		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}

static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

/*
 * To aid in avoiding the subversion of "niceness" due to uneven distribution
 * of tasks with abnormal "nice" values across CPUs the contribution that
 * each task makes to its run queue's load is weighted according to its
 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
 * scaled version of the new time slice allocation that they receive on time
 * slice expiry etc.
 */

#define WEIGHT_IDLEPRIO                3
#define WMULT_IDLEPRIO         1431655765

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
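/*
 * Worked example (illustration only, using entries from the prio_to_weight[]
 * table below): two CPU-bound tasks share one CPU, one at nice 0 (weight 1024)
 * and one at nice 1 (weight 820). CFS hands out CPU time in proportion to
 * weight, so they get 1024/1844 ~= 55.5% and 820/1844 ~= 44.5% respectively,
 * i.e. roughly the 10% relative step per nice level described above, realised
 * through the ~1.25 multiplier (1024/820 ~= 1.25).
 */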
static const int prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};

/*
 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetic by turning divisions
 * into multiplications:
 */
static const u32 prio_to_wmult[40] = {
 /* -20 */     48388,     59856,     76040,     92818,    118348,
 /* -15 */    147320,    184698,    229616,    287308,    360437,
 /* -10 */    449829,    563644,    704093,    875809,   1099582,
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};

/* Time spent by the tasks of the cpu accounting group executing in ... */
enum cpuacct_stat_index {
	CPUACCT_STAT_USER,	/* ... user mode */
	CPUACCT_STAT_SYSTEM,	/* ... kernel mode */

	CPUACCT_STAT_NSTATS,
};

#ifdef CONFIG_CGROUP_CPUACCT
static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
static void cpuacct_update_stats(struct task_struct *tsk,
		enum cpuacct_stat_index idx, cputime_t val);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
static inline void cpuacct_update_stats(struct task_struct *tsk,
		enum cpuacct_stat_index idx, cputime_t val) {}
#endif

static inline void inc_cpu_load(struct rq *rq, unsigned long load)
{
	update_load_add(&rq->load, load);
}

static inline void dec_cpu_load(struct rq *rq, unsigned long load)
{
	update_load_sub(&rq->load, load);
}

#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || \
		defined(CONFIG_RT_GROUP_SCHED)
typedef int (*tg_visitor)(struct task_group *, void *);

/*
 * Iterate the full tree, calling @down when first entering a node and @up when
 * leaving it for the final time.
 */
static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	rcu_read_lock();
	parent = &root_task_group;
down:
	ret = (*down)(parent, data);
	if (ret)
		goto out_unlock;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret)
		goto out_unlock;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out_unlock:
	rcu_read_unlock();

	return ret;
}

static int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

#ifdef CONFIG_SMP
/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
	return cpu_rq(cpu)->load.weight;
}

/*
 * Return a low guess at the load of a migration-source cpu weighted
 * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */
static unsigned long source_load(int cpu, int type)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long total = weighted_cpuload(cpu);

	if (type == 0 || !sched_feat(LB_BIAS))
		return total;

	return min(rq->cpu_load[type-1], total);
}

/*
 * Return a high guess at the load of a migration-target cpu weighted
 * according to the scheduling class and "nice" value.
 */
static unsigned long target_load(int cpu, int type)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long total = weighted_cpuload(cpu);

	if (type == 0 || !sched_feat(LB_BIAS))
		return total;

	return max(rq->cpu_load[type-1], total);
}

static unsigned long power_of(int cpu)
{
	return cpu_rq(cpu)->cpu_power;
}

static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);

static unsigned long cpu_avg_load_per_task(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);

	if (nr_running)
		rq->avg_load_per_task = rq->load.weight / nr_running;
	else
		rq->avg_load_per_task = 0;

	return rq->avg_load_per_task;
}

#ifdef CONFIG_FAIR_GROUP_SCHED

static __read_mostly unsigned long __percpu *update_shares_data;

static void __set_se_shares(struct sched_entity *se, unsigned long shares);

/*
 * Calculate and set the cpu's group shares.
 */
static void update_group_shares_cpu(struct task_group *tg, int cpu,
				    unsigned long sd_shares,
				    unsigned long sd_rq_weight,
				    unsigned long *usd_rq_weight)
{
	unsigned long shares, rq_weight;
	int boost = 0;

	rq_weight = usd_rq_weight[cpu];
	if (!rq_weight) {
		boost = 1;
		rq_weight = NICE_0_LOAD;
	}

	/*
	 *             \Sum_j shares_j * rq_weight_i
	 * shares_i =  -----------------------------
	 *                  \Sum_j rq_weight_j
	 */
	shares = (sd_shares * rq_weight) / sd_rq_weight;
	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);

	if (abs(shares - tg->se[cpu]->load.weight) >
			sysctl_sched_shares_thresh) {
		struct rq *rq = cpu_rq(cpu);
		unsigned long flags;

		raw_spin_lock_irqsave(&rq->lock, flags);
		tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
		__set_se_shares(tg->se[cpu], shares);
		raw_spin_unlock_irqrestore(&rq->lock, flags);
	}
}

/*
 * Re-compute the task group's per-cpu shares over the given domain.
 * This needs to be done in a bottom-up fashion because the rq weight of a
 * parent group depends on the shares of its child groups.
 */
static int tg_shares_up(struct task_group *tg, void *data)
{
	unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
	unsigned long *usd_rq_weight;
	struct sched_domain *sd = data;
	unsigned long flags;
	int i;

	if (!tg->se[0])
		return 0;

	local_irq_save(flags);
	usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());

	for_each_cpu(i, sched_domain_span(sd)) {
		weight = tg->cfs_rq[i]->load.weight;
		usd_rq_weight[i] = weight;

		rq_weight += weight;
		/*
		 * If there are currently no tasks on the cpu pretend there
		 * is one of average load so that when a new task gets to
		 * run here it will not get delayed by group starvation.
		 */
		if (!weight)
			weight = NICE_0_LOAD;

		sum_weight += weight;
		shares += tg->cfs_rq[i]->shares;
	}

	if (!rq_weight)
		rq_weight = sum_weight;

	if ((!shares && rq_weight) || shares > tg->shares)
		shares = tg->shares;

	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
		shares = tg->shares;

	for_each_cpu(i, sched_domain_span(sd))
		update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);

	local_irq_restore(flags);

	return 0;
}

/*
 * Compute the cpu's hierarchical load factor for each task group.
 * This needs to be done in a top-down fashion because the load of a child
 * group is a fraction of its parent's load.
 */
static int tg_load_down(struct task_group *tg, void *data)
{
	unsigned long load;
	long cpu = (long)data;

	if (!tg->parent) {
		load = cpu_rq(cpu)->load.weight;
	} else {
		load = tg->parent->cfs_rq[cpu]->h_load;
		load *= tg->cfs_rq[cpu]->shares;
		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
	}

	tg->cfs_rq[cpu]->h_load = load;

	return 0;
}

static void update_shares(struct sched_domain *sd)
{
	s64 elapsed;
	u64 now;

	if (root_task_group_empty())
		return;

	now = local_clock();
	elapsed = now - sd->last_update;

	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
		sd->last_update = now;
		walk_tg_tree(tg_nop, tg_shares_up, sd);
	}
}

static void update_h_load(long cpu)
{
	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}

#else

static inline void update_shares(struct sched_domain *sd)
{
}

#endif

#ifdef CONFIG_PREEMPT

static void double_rq_lock(struct rq *rq1, struct rq *rq2);

/*
 * fair double_lock_balance: Safely acquires both rq->locks in a fair
 * way at the expense of forcing extra atomic operations in all
 * invocations.  This assures that the double_lock is acquired using the
 * same underlying policy as the spinlock_t on this architecture, which
 * reduces latency compared to the unfair variant below.  However, it
 * also adds more overhead and therefore may reduce throughput.
 */
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(this_rq->lock)
	__acquires(busiest->lock)
	__acquires(this_rq->lock)
{
	raw_spin_unlock(&this_rq->lock);
	double_rq_lock(this_rq, busiest);

	return 1;
}

#else
/*
 * Unfair double_lock_balance: Optimizes throughput at the expense of
 * latency by eliminating extra atomic operations when the locks are
 * already in proper order on entry.  This favors lower cpu-ids and will
 * grant the double lock to lower cpus over higher ids under contention,
 * regardless of entry order into the function.
 */
static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(this_rq->lock)
	__acquires(busiest->lock)
	__acquires(this_rq->lock)
{
	int ret = 0;

	if (unlikely(!raw_spin_trylock(&busiest->lock))) {
		if (busiest < this_rq) {
			raw_spin_unlock(&this_rq->lock);
			raw_spin_lock(&busiest->lock);
			raw_spin_lock_nested(&this_rq->lock,
					      SINGLE_DEPTH_NESTING);
			ret = 1;
		} else
			raw_spin_lock_nested(&busiest->lock,
					      SINGLE_DEPTH_NESTING);
	}
	return ret;
}

#endif /* CONFIG_PREEMPT */

/*
 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 */
static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
	if (unlikely(!irqs_disabled())) {
		/* printk() doesn't work well under rq->lock */
		raw_spin_unlock(&this_rq->lock);
		BUG_ON(1);
	}

	return _double_lock_balance(this_rq, busiest);
}

static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(busiest->lock)
{
	raw_spin_unlock(&busiest->lock);
	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}

/*
 * double_rq_lock - safely lock two runqueues
 *
 * Note this does not disable interrupts like task_rq_lock,
 * you need to do so manually before calling.
 */
static void double_rq_lock(struct rq *rq1, struct rq *rq2)
	__acquires(rq1->lock)
	__acquires(rq2->lock)
{
	BUG_ON(!irqs_disabled());
	if (rq1 == rq2) {
		raw_spin_lock(&rq1->lock);
		__acquire(rq2->lock);	/* Fake it out ;) */
	} else {
		if (rq1 < rq2) {
			raw_spin_lock(&rq1->lock);
			raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
		} else {
			raw_spin_lock(&rq2->lock);
			raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
		}
	}
}

/*
 * double_rq_unlock - safely unlock two runqueues
 *
 * Note this does not restore interrupts like task_rq_unlock,
 * you need to do so manually after calling.
 */
static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
	__releases(rq1->lock)
	__releases(rq2->lock)
{
	raw_spin_unlock(&rq1->lock);
	if (rq1 != rq2)
		raw_spin_unlock(&rq2->lock);
	else
		__release(rq2->lock);
}

#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
{
#ifdef CONFIG_SMP
	cfs_rq->shares = shares;
#endif
}
#endif

static void calc_load_account_idle(struct rq *this_rq);
static void update_sysctl(void);
static int get_update_sysctl_factor(void);
static void update_cpu_load(struct rq *this_rq);

static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
	 * successfully executed on another CPU. We must ensure that updates of
	 * per-task data have been completed by this moment.
	 */
	smp_wmb();
	task_thread_info(p)->cpu = cpu;
#endif
}

static const struct sched_class rt_sched_class;

#define sched_class_highest (&rt_sched_class)
#define for_each_class(class) \
	for (class = sched_class_highest; class; class = class->next)

#include "sched_stats.h"

static void inc_nr_running(struct rq *rq)
{
	rq->nr_running++;
}

static void dec_nr_running(struct rq *rq)
{
	rq->nr_running--;
}

static void set_load_weight(struct task_struct *p)
{
	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (p->policy == SCHED_IDLE) {
		p->se.load.weight = WEIGHT_IDLEPRIO;
		p->se.load.inv_weight = WMULT_IDLEPRIO;
		return;
	}

	p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
	p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
}

static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_queued(p);
	p->sched_class->enqueue_task(rq, p, flags);
	p->se.on_rq = 1;
}

static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_dequeued(p);
	p->sched_class->dequeue_task(rq, p, flags);
	p->se.on_rq = 0;
}

/*
 * activate_task - move a task to the runqueue.
 */
static void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
	inc_nr_running(rq);
}

/*
 * deactivate_task - remove a task from the runqueue.
 */
static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
	dec_nr_running(rq);
}

#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
#endif

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers.
 * Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
				       int oldprio, int running)
{
	if (prev_class != p->sched_class) {
		if (prev_class->switched_from)
			prev_class->switched_from(rq, p, running);
		p->sched_class->switched_to(rq, p, running);
	} else
		p->sched_class->prio_changed(rq, p, oldprio, running);
}

#ifdef CONFIG_SMP
/*
 * Is this task likely cache-hot:
 */
static int
task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
{
	s64 delta;

	if (p->sched_class != &fair_sched_class)
		return 0;

	/*
	 * Buddy candidates are cache hot:
	 */
	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
			(&p->se == cfs_rq_of(&p->se)->next ||
			 &p->se == cfs_rq_of(&p->se)->last))
		return 1;

	if (sysctl_sched_migration_cost == -1)
		return 1;
	if (sysctl_sched_migration_cost == 0)
		return 0;

	delta = now - p->se.exec_start;

	return delta < (s64)sysctl_sched_migration_cost;
}

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
	/*
	 * We should never call set_task_cpu() on a blocked task,
	 * ttwu() will sort out the placement.
	 */
	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
#endif

	trace_sched_migrate_task(p, new_cpu);

	if (task_cpu(p) != new_cpu) {
		p->se.nr_migrations++;
		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
	}

	__set_task_cpu(p, new_cpu);
}

struct migration_arg {
	struct task_struct *task;
	int dest_cpu;
};

static int migration_cpu_stop(void *data);

/*
 * The task's runqueue lock must be held.
 * Returns true if you have to wait for migration thread.
 */
static bool migrate_task(struct task_struct *p, int dest_cpu)
{
	struct rq *rq = task_rq(p);

	/*
	 * If the task is not on a runqueue (and not running), then
	 * the next wake-up will properly place the task.
	 */
	return p->se.on_rq || task_running(rq, p);
}

/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change.  If it changes, i.e. @p might have woken up,
 * then return zero.  When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count).  If a second call
 * a short while later returns the same number, the caller can be sure that
 * @p has remained unscheduled the whole time.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
	unsigned long flags;
	int running, on_rq;
	unsigned long ncsw;
	struct rq *rq;

	for (;;) {
		/*
		 * We do the initial early heuristics without holding
		 * any task-queue locks at all. We'll only try to get
		 * the runqueue lock when things look like they will
		 * work out!
		 */
		rq = task_rq(p);

		/*
		 * If the task is actively running on another CPU
		 * still, just relax and busy-wait without holding
		 * any locks.
		 *
		 * NOTE! Since we don't hold any locks, it's not
		 * even sure that "rq" stays as the right runqueue!
		 * But we don't care, since "task_running()" will
		 * return false if the runqueue has changed and p
		 * is actually now running somewhere else!
		 */
		while (task_running(rq, p)) {
			if (match_state && unlikely(p->state != match_state))
				return 0;
			cpu_relax();
		}

		/*
		 * Ok, time to look more closely! We need the rq
		 * lock now, to be *sure*. If we're wrong, we'll
		 * just go back and repeat.
		 */
		rq = task_rq_lock(p, &flags);
		trace_sched_wait_task(p);
		running = task_running(rq, p);
		on_rq = p->se.on_rq;
		ncsw = 0;
		if (!match_state || p->state == match_state)
			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
		task_rq_unlock(rq, &flags);

		/*
		 * If it changed from the expected state, bail out now.
		 */
		if (unlikely(!ncsw))
			break;

		/*
		 * Was it really running after all now that we
		 * checked with the proper locks actually held?
		 *
		 * Oops. Go back and try again..
		 */
		if (unlikely(running)) {
			cpu_relax();
			continue;
		}

		/*
		 * It's not enough that it's not actively running,
		 * it must be off the runqueue _entirely_, and not
		 * preempted!
		 *
		 * So if it was still runnable (but just not actively
		 * running right now), it's preempted, and we should
		 * yield - it could be a while.
		 */
		if (unlikely(on_rq)) {
			schedule_timeout_uninterruptible(1);
			continue;
		}

		/*
		 * Ahh, all good. It wasn't running, and it wasn't
		 * runnable, which means that it will never become
		 * running in the future either. We're all done!
2176 */ 2177 break; 2178 } 2179 2180 return ncsw; 2181} 2182 2183/*** 2184 * kick_process - kick a running thread to enter/exit the kernel 2185 * @p: the to-be-kicked thread 2186 * 2187 * Cause a process which is running on another CPU to enter 2188 * kernel-mode, without any delay. (to get signals handled.) 2189 * 2190 * NOTE: this function doesn't have to take the runqueue lock, 2191 * because all it wants to ensure is that the remote task enters 2192 * the kernel. If the IPI races and the task has been migrated 2193 * to another CPU then no harm is done and the purpose has been 2194 * achieved as well. 2195 */ 2196void kick_process(struct task_struct *p) 2197{ 2198 int cpu; 2199 2200 preempt_disable(); 2201 cpu = task_cpu(p); 2202 if ((cpu != smp_processor_id()) && task_curr(p)) 2203 smp_send_reschedule(cpu); 2204 preempt_enable(); 2205} 2206EXPORT_SYMBOL_GPL(kick_process); 2207#endif /* CONFIG_SMP */ 2208 2209/** 2210 * task_oncpu_function_call - call a function on the cpu on which a task runs 2211 * @p: the task to evaluate 2212 * @func: the function to be called 2213 * @info: the function call argument 2214 * 2215 * Calls the function @func when the task is currently running. This might 2216 * be on the current CPU, which just calls the function directly. 2217 */ 2218void task_oncpu_function_call(struct task_struct *p, 2219 void (*func) (void *info), void *info) 2220{ 2221 int cpu; 2222 2223 preempt_disable(); 2224 cpu = task_cpu(p); 2225 if (task_curr(p)) 2226 smp_call_function_single(cpu, func, info, 1); 2227 preempt_enable(); 2228} 2229 2230#ifdef CONFIG_SMP 2231/* 2232 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2233 */ 2234static int select_fallback_rq(int cpu, struct task_struct *p) 2235{ 2236 int dest_cpu; 2237 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 2238 2239 /* Look for allowed, online CPU in same node. */ 2240 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 2241 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 2242 return dest_cpu; 2243 2244 /* Any allowed, online CPU? */ 2245 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); 2246 if (dest_cpu < nr_cpu_ids) 2247 return dest_cpu; 2248 2249 /* No more Mr. Nice Guy. */ 2250 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2251 dest_cpu = cpuset_cpus_allowed_fallback(p); 2252 /* 2253 * Don't tell them about moving exiting tasks or 2254 * kernel threads (both mm NULL), since they never 2255 * leave kernel. 2256 */ 2257 if (p->mm && printk_ratelimit()) { 2258 printk(KERN_INFO "process %d (%s) no " 2259 "longer affine to cpu%d\n", 2260 task_pid_nr(p), p->comm, cpu); 2261 } 2262 } 2263 2264 return dest_cpu; 2265} 2266 2267/* 2268 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2269 */ 2270static inline 2271int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2272{ 2273 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2274 2275 /* 2276 * In order not to call set_task_cpu() on a blocking task we need 2277 * to rely on ttwu() to place the task on a valid ->cpus_allowed 2278 * cpu. 2279 * 2280 * Since this is common to all placement strategies, this lives here.
2281 * 2282 * [ this allows ->select_task() to simply return task_cpu(p) and 2283 * not worry about this generic constraint ] 2284 */ 2285 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || 2286 !cpu_online(cpu))) 2287 cpu = select_fallback_rq(task_cpu(p), p); 2288 2289 return cpu; 2290} 2291 2292static void update_avg(u64 *avg, u64 sample) 2293{ 2294 s64 diff = sample - *avg; 2295 *avg += diff >> 3; 2296} 2297#endif 2298 2299static inline void ttwu_activate(struct task_struct *p, struct rq *rq, 2300 bool is_sync, bool is_migrate, bool is_local, 2301 unsigned long en_flags) 2302{ 2303 schedstat_inc(p, se.statistics.nr_wakeups); 2304 if (is_sync) 2305 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2306 if (is_migrate) 2307 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2308 if (is_local) 2309 schedstat_inc(p, se.statistics.nr_wakeups_local); 2310 else 2311 schedstat_inc(p, se.statistics.nr_wakeups_remote); 2312 2313 activate_task(rq, p, en_flags); 2314} 2315 2316static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, 2317 int wake_flags, bool success) 2318{ 2319 trace_sched_wakeup(p, success); 2320 check_preempt_curr(rq, p, wake_flags); 2321 2322 p->state = TASK_RUNNING; 2323#ifdef CONFIG_SMP 2324 if (p->sched_class->task_woken) 2325 p->sched_class->task_woken(rq, p); 2326 2327 if (unlikely(rq->idle_stamp)) { 2328 u64 delta = rq->clock - rq->idle_stamp; 2329 u64 max = 2*sysctl_sched_migration_cost; 2330 2331 if (delta > max) 2332 rq->avg_idle = max; 2333 else 2334 update_avg(&rq->avg_idle, delta); 2335 rq->idle_stamp = 0; 2336 } 2337#endif 2338 /* if a worker is waking up, notify workqueue */ 2339 if ((p->flags & PF_WQ_WORKER) && success) 2340 wq_worker_waking_up(p, cpu_of(rq)); 2341} 2342 2343/** 2344 * try_to_wake_up - wake up a thread 2345 * @p: the thread to be awakened 2346 * @state: the mask of task states that can be woken 2347 * @wake_flags: wake modifier flags (WF_*) 2348 * 2349 * Put it on the run-queue if it's not already there. The "current" 2350 * thread is always on the run-queue (except when the actual 2351 * re-schedule is in progress), and as such you're allowed to do 2352 * the simpler "current->state = TASK_RUNNING" to mark yourself 2353 * runnable without the overhead of this. 2354 * 2355 * Returns %true if @p was woken up, %false if it was already running 2356 * or @state didn't match @p's state. 2357 */ 2358static int try_to_wake_up(struct task_struct *p, unsigned int state, 2359 int wake_flags) 2360{ 2361 int cpu, orig_cpu, this_cpu, success = 0; 2362 unsigned long flags; 2363 unsigned long en_flags = ENQUEUE_WAKEUP; 2364 struct rq *rq; 2365 2366 this_cpu = get_cpu(); 2367 2368 smp_wmb(); 2369 rq = task_rq_lock(p, &flags); 2370 if (!(p->state & state)) 2371 goto out; 2372 2373 if (p->se.on_rq) 2374 goto out_running; 2375 2376 cpu = task_cpu(p); 2377 orig_cpu = cpu; 2378 2379#ifdef CONFIG_SMP 2380 if (unlikely(task_running(rq, p))) 2381 goto out_activate; 2382 2383 /* 2384 * In order to handle concurrent wakeups and release the rq->lock 2385 * we put the task in TASK_WAKING state. 
2386 * 2387 * First fix up the nr_uninterruptible count: 2388 */ 2389 if (task_contributes_to_load(p)) { 2390 if (likely(cpu_online(orig_cpu))) 2391 rq->nr_uninterruptible--; 2392 else 2393 this_rq()->nr_uninterruptible--; 2394 } 2395 p->state = TASK_WAKING; 2396 2397 if (p->sched_class->task_waking) { 2398 p->sched_class->task_waking(rq, p); 2399 en_flags |= ENQUEUE_WAKING; 2400 } 2401 2402 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); 2403 if (cpu != orig_cpu) 2404 set_task_cpu(p, cpu); 2405 __task_rq_unlock(rq); 2406 2407 rq = cpu_rq(cpu); 2408 raw_spin_lock(&rq->lock); 2409 2410 /* 2411 * We migrated the task without holding either rq->lock, however 2412 * since the task is not on the task list itself, nobody else 2413 * will try and migrate the task, hence the rq should match the 2414 * cpu we just moved it to. 2415 */ 2416 WARN_ON(task_cpu(p) != cpu); 2417 WARN_ON(p->state != TASK_WAKING); 2418 2419#ifdef CONFIG_SCHEDSTATS 2420 schedstat_inc(rq, ttwu_count); 2421 if (cpu == this_cpu) 2422 schedstat_inc(rq, ttwu_local); 2423 else { 2424 struct sched_domain *sd; 2425 for_each_domain(this_cpu, sd) { 2426 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 2427 schedstat_inc(sd, ttwu_wake_remote); 2428 break; 2429 } 2430 } 2431 } 2432#endif /* CONFIG_SCHEDSTATS */ 2433 2434out_activate: 2435#endif /* CONFIG_SMP */ 2436 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, 2437 cpu == this_cpu, en_flags); 2438 success = 1; 2439out_running: 2440 ttwu_post_activation(p, rq, wake_flags, success); 2441out: 2442 task_rq_unlock(rq, &flags); 2443 put_cpu(); 2444 2445 return success; 2446} 2447 2448/** 2449 * try_to_wake_up_local - try to wake up a local task with rq lock held 2450 * @p: the thread to be awakened 2451 * 2452 * Put @p on the run-queue if it's not alredy there. The caller must 2453 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2454 * the current task. this_rq() stays locked over invocation. 2455 */ 2456static void try_to_wake_up_local(struct task_struct *p) 2457{ 2458 struct rq *rq = task_rq(p); 2459 bool success = false; 2460 2461 BUG_ON(rq != this_rq()); 2462 BUG_ON(p == current); 2463 lockdep_assert_held(&rq->lock); 2464 2465 if (!(p->state & TASK_NORMAL)) 2466 return; 2467 2468 if (!p->se.on_rq) { 2469 if (likely(!task_running(rq, p))) { 2470 schedstat_inc(rq, ttwu_count); 2471 schedstat_inc(rq, ttwu_local); 2472 } 2473 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); 2474 success = true; 2475 } 2476 ttwu_post_activation(p, rq, 0, success); 2477} 2478 2479/** 2480 * wake_up_process - Wake up a specific process 2481 * @p: The process to be woken up. 2482 * 2483 * Attempt to wake up the nominated process and move it to the set of runnable 2484 * processes. Returns 1 if the process was woken up, 0 if it was already 2485 * running. 2486 * 2487 * It may be assumed that this function implies a write memory barrier before 2488 * changing the task state if and only if any tasks are woken up. 2489 */ 2490int wake_up_process(struct task_struct *p) 2491{ 2492 return try_to_wake_up(p, TASK_ALL, 0); 2493} 2494EXPORT_SYMBOL(wake_up_process); 2495 2496int wake_up_state(struct task_struct *p, unsigned int state) 2497{ 2498 return try_to_wake_up(p, state, 0); 2499} 2500 2501/* 2502 * Perform scheduler related setup for a newly forked process p. 2503 * p is forked by current. 
2504 * 2505 * __sched_fork() is basic setup used by init_idle() too: 2506 */ 2507static void __sched_fork(struct task_struct *p) 2508{ 2509 p->se.exec_start = 0; 2510 p->se.sum_exec_runtime = 0; 2511 p->se.prev_sum_exec_runtime = 0; 2512 p->se.nr_migrations = 0; 2513 2514#ifdef CONFIG_SCHEDSTATS 2515 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2516#endif 2517 2518 INIT_LIST_HEAD(&p->rt.run_list); 2519 p->se.on_rq = 0; 2520 INIT_LIST_HEAD(&p->se.group_node); 2521 2522#ifdef CONFIG_PREEMPT_NOTIFIERS 2523 INIT_HLIST_HEAD(&p->preempt_notifiers); 2524#endif 2525} 2526 2527/* 2528 * fork()/clone()-time setup: 2529 */ 2530void sched_fork(struct task_struct *p, int clone_flags) 2531{ 2532 int cpu = get_cpu(); 2533 2534 __sched_fork(p); 2535 /* 2536 * We mark the process as running here. This guarantees that 2537 * nobody will actually run it, and a signal or other external 2538 * event cannot wake it up and insert it on the runqueue either. 2539 */ 2540 p->state = TASK_RUNNING; 2541 2542 /* 2543 * Revert to default priority/policy on fork if requested. 2544 */ 2545 if (unlikely(p->sched_reset_on_fork)) { 2546 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 2547 p->policy = SCHED_NORMAL; 2548 p->normal_prio = p->static_prio; 2549 } 2550 2551 if (PRIO_TO_NICE(p->static_prio) < 0) { 2552 p->static_prio = NICE_TO_PRIO(0); 2553 p->normal_prio = p->static_prio; 2554 set_load_weight(p); 2555 } 2556 2557 /* 2558 * We don't need the reset flag anymore after the fork. It has 2559 * fulfilled its duty: 2560 */ 2561 p->sched_reset_on_fork = 0; 2562 } 2563 2564 /* 2565 * Make sure we do not leak PI boosting priority to the child. 2566 */ 2567 p->prio = current->normal_prio; 2568 2569 if (!rt_prio(p->prio)) 2570 p->sched_class = &fair_sched_class; 2571 2572 if (p->sched_class->task_fork) 2573 p->sched_class->task_fork(p); 2574 2575 /* 2576 * The child is not yet in the pid-hash so no cgroup attach races, 2577 * and the cgroup is pinned to this child due to cgroup_fork() 2578 * is ran before sched_fork(). 2579 * 2580 * Silence PROVE_RCU. 2581 */ 2582 rcu_read_lock(); 2583 set_task_cpu(p, cpu); 2584 rcu_read_unlock(); 2585 2586#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2587 if (likely(sched_info_on())) 2588 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2589#endif 2590#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2591 p->oncpu = 0; 2592#endif 2593#ifdef CONFIG_PREEMPT 2594 /* Want to start with kernel preemption disabled. */ 2595 task_thread_info(p)->preempt_count = 1; 2596#endif 2597 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2598 2599 put_cpu(); 2600} 2601 2602/* 2603 * wake_up_new_task - wake up a newly created task for the first time. 2604 * 2605 * This function will do some initial scheduler statistics housekeeping 2606 * that must be done for every newly created context, then puts the task 2607 * on the runqueue and wakes it. 2608 */ 2609void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2610{ 2611 unsigned long flags; 2612 struct rq *rq; 2613 int cpu __maybe_unused = get_cpu(); 2614 2615#ifdef CONFIG_SMP 2616 rq = task_rq_lock(p, &flags); 2617 p->state = TASK_WAKING; 2618 2619 /* 2620 * Fork balancing, do it here and not earlier because: 2621 * - cpus_allowed can change in the fork path 2622 * - any previously selected cpu might disappear through hotplug 2623 * 2624 * We set TASK_WAKING so that select_task_rq() can drop rq->lock 2625 * without people poking at ->cpus_allowed. 
2626 */ 2627 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2628 set_task_cpu(p, cpu); 2629 2630 p->state = TASK_RUNNING; 2631 task_rq_unlock(rq, &flags); 2632#endif 2633 2634 rq = task_rq_lock(p, &flags); 2635 activate_task(rq, p, 0); 2636 trace_sched_wakeup_new(p, 1); 2637 check_preempt_curr(rq, p, WF_FORK); 2638#ifdef CONFIG_SMP 2639 if (p->sched_class->task_woken) 2640 p->sched_class->task_woken(rq, p); 2641#endif 2642 task_rq_unlock(rq, &flags); 2643 put_cpu(); 2644} 2645 2646#ifdef CONFIG_PREEMPT_NOTIFIERS 2647 2648/** 2649 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2650 * @notifier: notifier struct to register 2651 */ 2652void preempt_notifier_register(struct preempt_notifier *notifier) 2653{ 2654 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); 2655} 2656EXPORT_SYMBOL_GPL(preempt_notifier_register); 2657 2658/** 2659 * preempt_notifier_unregister - no longer interested in preemption notifications 2660 * @notifier: notifier struct to unregister 2661 * 2662 * This is safe to call from within a preemption notifier. 2663 */ 2664void preempt_notifier_unregister(struct preempt_notifier *notifier) 2665{ 2666 hlist_del(¬ifier->link); 2667} 2668EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2669 2670static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2671{ 2672 struct preempt_notifier *notifier; 2673 struct hlist_node *node; 2674 2675 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2676 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2677} 2678 2679static void 2680fire_sched_out_preempt_notifiers(struct task_struct *curr, 2681 struct task_struct *next) 2682{ 2683 struct preempt_notifier *notifier; 2684 struct hlist_node *node; 2685 2686 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2687 notifier->ops->sched_out(notifier, next); 2688} 2689 2690#else /* !CONFIG_PREEMPT_NOTIFIERS */ 2691 2692static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2693{ 2694} 2695 2696static void 2697fire_sched_out_preempt_notifiers(struct task_struct *curr, 2698 struct task_struct *next) 2699{ 2700} 2701 2702#endif /* CONFIG_PREEMPT_NOTIFIERS */ 2703 2704/** 2705 * prepare_task_switch - prepare to switch tasks 2706 * @rq: the runqueue preparing to switch 2707 * @prev: the current task that is being switched out 2708 * @next: the task we are going to switch to. 2709 * 2710 * This is called with the rq lock held and interrupts off. It must 2711 * be paired with a subsequent finish_task_switch after the context 2712 * switch. 2713 * 2714 * prepare_task_switch sets up locking and calls architecture specific 2715 * hooks. 2716 */ 2717static inline void 2718prepare_task_switch(struct rq *rq, struct task_struct *prev, 2719 struct task_struct *next) 2720{ 2721 fire_sched_out_preempt_notifiers(prev, next); 2722 prepare_lock_switch(rq, next); 2723 prepare_arch_switch(next); 2724} 2725 2726/** 2727 * finish_task_switch - clean up after a task-switch 2728 * @rq: runqueue associated with task-switch 2729 * @prev: the thread we just switched away from. 2730 * 2731 * finish_task_switch must be called after the context switch, paired 2732 * with a prepare_task_switch call before the context switch. 2733 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2734 * and do any other architecture-specific cleanup actions. 2735 * 2736 * Note that we may have delayed dropping an mm in context_switch(). If 2737 * so, we finish that here outside of the runqueue lock. 
(Doing it 2738 * with the lock held can cause deadlocks; see schedule() for 2739 * details.) 2740 */ 2741static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2742 __releases(rq->lock) 2743{ 2744 struct mm_struct *mm = rq->prev_mm; 2745 long prev_state; 2746 2747 rq->prev_mm = NULL; 2748 2749 /* 2750 * A task struct has one reference for the use as "current". 2751 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2752 * schedule one last time. The schedule call will never return, and 2753 * the scheduled task must drop that reference. 2754 * The test for TASK_DEAD must occur while the runqueue locks are 2755 * still held, otherwise prev could be scheduled on another cpu, die 2756 * there before we look at prev->state, and then the reference would 2757 * be dropped twice. 2758 * Manfred Spraul <manfred@colorfullife.com> 2759 */ 2760 prev_state = prev->state; 2761 finish_arch_switch(prev); 2762#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2763 local_irq_disable(); 2764#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 2765 perf_event_task_sched_in(current); 2766#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2767 local_irq_enable(); 2768#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 2769 finish_lock_switch(rq, prev); 2770 2771 fire_sched_in_preempt_notifiers(current); 2772 if (mm) 2773 mmdrop(mm); 2774 if (unlikely(prev_state == TASK_DEAD)) { 2775 /* 2776 * Remove function-return probe instances associated with this 2777 * task and put them back on the free list. 2778 */ 2779 kprobe_flush_task(prev); 2780 put_task_struct(prev); 2781 } 2782} 2783 2784#ifdef CONFIG_SMP 2785 2786/* assumes rq->lock is held */ 2787static inline void pre_schedule(struct rq *rq, struct task_struct *prev) 2788{ 2789 if (prev->sched_class->pre_schedule) 2790 prev->sched_class->pre_schedule(rq, prev); 2791} 2792 2793/* rq->lock is NOT held, but preemption is disabled */ 2794static inline void post_schedule(struct rq *rq) 2795{ 2796 if (rq->post_schedule) { 2797 unsigned long flags; 2798 2799 raw_spin_lock_irqsave(&rq->lock, flags); 2800 if (rq->curr->sched_class->post_schedule) 2801 rq->curr->sched_class->post_schedule(rq); 2802 raw_spin_unlock_irqrestore(&rq->lock, flags); 2803 2804 rq->post_schedule = 0; 2805 } 2806} 2807 2808#else 2809 2810static inline void pre_schedule(struct rq *rq, struct task_struct *p) 2811{ 2812} 2813 2814static inline void post_schedule(struct rq *rq) 2815{ 2816} 2817 2818#endif 2819 2820/** 2821 * schedule_tail - first thing a freshly forked thread must call. 2822 * @prev: the thread we just switched away from. 2823 */ 2824asmlinkage void schedule_tail(struct task_struct *prev) 2825 __releases(rq->lock) 2826{ 2827 struct rq *rq = this_rq(); 2828 2829 finish_task_switch(rq, prev); 2830 2831 post_schedule(rq); 2832 2833#ifdef __ARCH_WANT_UNLOCKED_CTXSW 2834 /* In this case, finish_task_switch does not reenable preemption */ 2835 preempt_enable(); 2836#endif 2837 if (current->set_child_tid) 2838 put_user(task_pid_vnr(current), current->set_child_tid); 2839} 2840 2841/* 2842 * context_switch - switch to the new MM and the new 2843 * thread's register state. 
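 *
 * The mm handover below boils down to two cases (a summary of the code
 * that follows, not an extra rule):
 *
 *	next->mm == NULL (kernel thread): borrow prev->active_mm, pin it
 *	with atomic_inc(&oldmm->mm_count) and enter_lazy_tlb() instead of
 *	doing a switch_mm().
 *
 *	prev->mm == NULL (kernel thread): hand the borrowed active_mm over
 *	to rq->prev_mm, so finish_task_switch() can mmdrop() it once the
 *	switch has completed.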
2844 */ 2845static inline void 2846context_switch(struct rq *rq, struct task_struct *prev, 2847 struct task_struct *next) 2848{ 2849 struct mm_struct *mm, *oldmm; 2850 2851 prepare_task_switch(rq, prev, next); 2852 trace_sched_switch(prev, next); 2853 mm = next->mm; 2854 oldmm = prev->active_mm; 2855 /* 2856 * For paravirt, this is coupled with an exit in switch_to to 2857 * combine the page table reload and the switch backend into 2858 * one hypercall. 2859 */ 2860 arch_start_context_switch(prev); 2861 2862 if (likely(!mm)) { 2863 next->active_mm = oldmm; 2864 atomic_inc(&oldmm->mm_count); 2865 enter_lazy_tlb(oldmm, next); 2866 } else 2867 switch_mm(oldmm, mm, next); 2868 2869 if (likely(!prev->mm)) { 2870 prev->active_mm = NULL; 2871 rq->prev_mm = oldmm; 2872 } 2873 /* 2874 * Since the runqueue lock will be released by the next 2875 * task (which is an invalid locking op but in the case 2876 * of the scheduler it's an obvious special-case), so we 2877 * do an early lockdep release here: 2878 */ 2879#ifndef __ARCH_WANT_UNLOCKED_CTXSW 2880 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2881#endif 2882 2883 /* Here we just switch the register state and the stack. */ 2884 switch_to(prev, next, prev); 2885 2886 barrier(); 2887 /* 2888 * this_rq must be evaluated again because prev may have moved 2889 * CPUs since it called schedule(), thus the 'rq' on its stack 2890 * frame will be invalid. 2891 */ 2892 finish_task_switch(this_rq(), prev); 2893} 2894 2895/* 2896 * nr_running, nr_uninterruptible and nr_context_switches: 2897 * 2898 * externally visible scheduler statistics: current number of runnable 2899 * threads, current number of uninterruptible-sleeping threads, total 2900 * number of context switches performed since bootup. 2901 */ 2902unsigned long nr_running(void) 2903{ 2904 unsigned long i, sum = 0; 2905 2906 for_each_online_cpu(i) 2907 sum += cpu_rq(i)->nr_running; 2908 2909 return sum; 2910} 2911 2912unsigned long nr_uninterruptible(void) 2913{ 2914 unsigned long i, sum = 0; 2915 2916 for_each_possible_cpu(i) 2917 sum += cpu_rq(i)->nr_uninterruptible; 2918 2919 /* 2920 * Since we read the counters lockless, it might be slightly 2921 * inaccurate. 
Do not allow it to go below zero though: 2922 */ 2923 if (unlikely((long)sum < 0)) 2924 sum = 0; 2925 2926 return sum; 2927} 2928 2929unsigned long long nr_context_switches(void) 2930{ 2931 int i; 2932 unsigned long long sum = 0; 2933 2934 for_each_possible_cpu(i) 2935 sum += cpu_rq(i)->nr_switches; 2936 2937 return sum; 2938} 2939 2940unsigned long nr_iowait(void) 2941{ 2942 unsigned long i, sum = 0; 2943 2944 for_each_possible_cpu(i) 2945 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2946 2947 return sum; 2948} 2949 2950unsigned long nr_iowait_cpu(int cpu) 2951{ 2952 struct rq *this = cpu_rq(cpu); 2953 return atomic_read(&this->nr_iowait); 2954} 2955 2956unsigned long this_cpu_load(void) 2957{ 2958 struct rq *this = this_rq(); 2959 return this->cpu_load[0]; 2960} 2961 2962 2963/* Variables and functions for calc_load */ 2964static atomic_long_t calc_load_tasks; 2965static unsigned long calc_load_update; 2966unsigned long avenrun[3]; 2967EXPORT_SYMBOL(avenrun); 2968 2969static long calc_load_fold_active(struct rq *this_rq) 2970{ 2971 long nr_active, delta = 0; 2972 2973 nr_active = this_rq->nr_running; 2974 nr_active += (long) this_rq->nr_uninterruptible; 2975 2976 if (nr_active != this_rq->calc_load_active) { 2977 delta = nr_active - this_rq->calc_load_active; 2978 this_rq->calc_load_active = nr_active; 2979 } 2980 2981 return delta; 2982} 2983 2984static unsigned long 2985calc_load(unsigned long load, unsigned long exp, unsigned long active) 2986{ 2987 load *= exp; 2988 load += active * (FIXED_1 - exp); 2989 load += 1UL << (FSHIFT - 1); 2990 return load >> FSHIFT; 2991} 2992 2993#ifdef CONFIG_NO_HZ 2994/* 2995 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 2996 * 2997 * When making the ILB scale, we should try to pull this in as well. 2998 */ 2999static atomic_long_t calc_load_tasks_idle; 3000 3001static void calc_load_account_idle(struct rq *this_rq) 3002{ 3003 long delta; 3004 3005 delta = calc_load_fold_active(this_rq); 3006 if (delta) 3007 atomic_long_add(delta, &calc_load_tasks_idle); 3008} 3009 3010static long calc_load_fold_idle(void) 3011{ 3012 long delta = 0; 3013 3014 /* 3015 * Its got a race, we don't care... 3016 */ 3017 if (atomic_long_read(&calc_load_tasks_idle)) 3018 delta = atomic_long_xchg(&calc_load_tasks_idle, 0); 3019 3020 return delta; 3021} 3022 3023/** 3024 * fixed_power_int - compute: x^n, in O(log n) time 3025 * 3026 * @x: base of the power 3027 * @frac_bits: fractional bits of @x 3028 * @n: power to raise @x to. 3029 * 3030 * By exploiting the relation between the definition of the natural power 3031 * function: x^n := x*x*...*x (x multiplied by itself for n times), and 3032 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, 3033 * (where: n_i \elem {0, 1}, the binary vector representing n), 3034 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is 3035 * of course trivially computable in O(log_2 n), the length of our binary 3036 * vector. 
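 *
 * A small worked illustration: for n = 5 = 101 (binary) the loop below
 * consumes one bit per round,
 *
 *	bit 0 (set):   result := 1 * x          (x^1)
 *	bit 1 (clear): x := x^2, result kept
 *	bit 2 (set):   x := x^4, result := x^1 * x^4 = x^5
 *
 * i.e. two squarings plus two multiplies instead of four plain multiplies,
 * with every intermediate product rounded back to @frac_bits fixed-point.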
3037 */ 3038static unsigned long 3039fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) 3040{ 3041 unsigned long result = 1UL << frac_bits; 3042 3043 if (n) for (;;) { 3044 if (n & 1) { 3045 result *= x; 3046 result += 1UL << (frac_bits - 1); 3047 result >>= frac_bits; 3048 } 3049 n >>= 1; 3050 if (!n) 3051 break; 3052 x *= x; 3053 x += 1UL << (frac_bits - 1); 3054 x >>= frac_bits; 3055 } 3056 3057 return result; 3058} 3059 3060/* 3061 * a1 = a0 * e + a * (1 - e) 3062 * 3063 * a2 = a1 * e + a * (1 - e) 3064 * = (a0 * e + a * (1 - e)) * e + a * (1 - e) 3065 * = a0 * e^2 + a * (1 - e) * (1 + e) 3066 * 3067 * a3 = a2 * e + a * (1 - e) 3068 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) 3069 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) 3070 * 3071 * ... 3072 * 3073 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] 3074 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) 3075 * = a0 * e^n + a * (1 - e^n) 3076 * 3077 * [1] application of the geometric series: 3078 * 3079 * n 1 - x^(n+1) 3080 * S_n := \Sum x^i = ------------- 3081 * i=0 1 - x 3082 */ 3083static unsigned long 3084calc_load_n(unsigned long load, unsigned long exp, 3085 unsigned long active, unsigned int n) 3086{ 3087 3088 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); 3089} 3090 3091/* 3092 * NO_HZ can leave us missing all per-cpu ticks calling 3093 * calc_load_account_active(), but since an idle CPU folds its delta into 3094 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold 3095 * in the pending idle delta if our idle period crossed a load cycle boundary. 3096 * 3097 * Once we've updated the global active value, we need to apply the exponential 3098 * weights adjusted to the number of cycles missed. 3099 */ 3100static void calc_global_nohz(unsigned long ticks) 3101{ 3102 long delta, active, n; 3103 3104 if (time_before(jiffies, calc_load_update)) 3105 return; 3106 3107 /* 3108 * If we crossed a calc_load_update boundary, make sure to fold 3109 * any pending idle changes, the respective CPUs might have 3110 * missed the tick driven calc_load_account_active() update 3111 * due to NO_HZ. 3112 */ 3113 delta = calc_load_fold_idle(); 3114 if (delta) 3115 atomic_long_add(delta, &calc_load_tasks); 3116 3117 /* 3118 * If we were idle for multiple load cycles, apply them. 3119 */ 3120 if (ticks >= LOAD_FREQ) { 3121 n = ticks / LOAD_FREQ; 3122 3123 active = atomic_long_read(&calc_load_tasks); 3124 active = active > 0 ? active * FIXED_1 : 0; 3125 3126 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 3127 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 3128 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 3129 3130 calc_load_update += n * LOAD_FREQ; 3131 } 3132 3133 /* 3134 * Its possible the remainder of the above division also crosses 3135 * a LOAD_FREQ period, the regular check in calc_global_load() 3136 * which comes after this will take care of that. 3137 * 3138 * Consider us being 11 ticks before a cycle completion, and us 3139 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will 3140 * age us 4 cycles, and the test in calc_global_load() will 3141 * pick up the final one. 
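 *
 * In that example the block above ages each entry in a single step,
 *
 *	avenrun[i] = calc_load_n(avenrun[i], exp_i, active, 4);
 *
 * (exp_i standing for EXP_1, EXP_5 or EXP_15), which by the derivation
 * further up is the closed form a_4 = a_0 * e^4 + a * (1 - e^4): four
 * missed cycles cost one O(log n) fixed-point power, not four updates.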
3142 */ 3143} 3144#else 3145static void calc_load_account_idle(struct rq *this_rq) 3146{ 3147} 3148 3149static inline long calc_load_fold_idle(void) 3150{ 3151 return 0; 3152} 3153 3154static void calc_global_nohz(unsigned long ticks) 3155{ 3156} 3157#endif 3158 3159/** 3160 * get_avenrun - get the load average array 3161 * @loads: pointer to dest load array 3162 * @offset: offset to add 3163 * @shift: shift count to shift the result left 3164 * 3165 * These values are estimates at best, so no need for locking. 3166 */ 3167void get_avenrun(unsigned long *loads, unsigned long offset, int shift) 3168{ 3169 loads[0] = (avenrun[0] + offset) << shift; 3170 loads[1] = (avenrun[1] + offset) << shift; 3171 loads[2] = (avenrun[2] + offset) << shift; 3172} 3173 3174/* 3175 * calc_load - update the avenrun load estimates 10 ticks after the 3176 * CPUs have updated calc_load_tasks. 3177 */ 3178void calc_global_load(unsigned long ticks) 3179{ 3180 long active; 3181 3182 calc_global_nohz(ticks); 3183 3184 if (time_before(jiffies, calc_load_update + 10)) 3185 return; 3186 3187 active = atomic_long_read(&calc_load_tasks); 3188 active = active > 0 ? active * FIXED_1 : 0; 3189 3190 avenrun[0] = calc_load(avenrun[0], EXP_1, active); 3191 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 3192 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 3193 3194 calc_load_update += LOAD_FREQ; 3195} 3196 3197/* 3198 * Called from update_cpu_load() to periodically update this CPU's 3199 * active count. 3200 */ 3201static void calc_load_account_active(struct rq *this_rq) 3202{ 3203 long delta; 3204 3205 if (time_before(jiffies, this_rq->calc_load_update)) 3206 return; 3207 3208 delta = calc_load_fold_active(this_rq); 3209 delta += calc_load_fold_idle(); 3210 if (delta) 3211 atomic_long_add(delta, &calc_load_tasks); 3212 3213 this_rq->calc_load_update += LOAD_FREQ; 3214} 3215 3216/* 3217 * The exact cpuload at various idx values, calculated at every tick would be 3218 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 3219 * 3220 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called 3221 * on nth tick when cpu may be busy, then we have: 3222 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 3223 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load 3224 * 3225 * decay_load_missed() below does efficient calculation of 3226 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 3227 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load 3228 * 3229 * The calculation is approximated on a 128 point scale. 3230 * degrade_zero_ticks is the number of ticks after which load at any 3231 * particular idx is approximated to be zero. 3232 * degrade_factor is a precomputed table, a row for each load idx. 3233 * Each column corresponds to degradation factor for a power of two ticks, 3234 * based on 128 point scale. 3235 * Example: 3236 * row 2, col 3 (=12) says that the degradation at load idx 2 after 3237 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). 3238 * 3239 * With this power of 2 load factors, we can degrade the load n times 3240 * by looking at 1 bits in n and doing as many mult/shift instead of 3241 * n mult/shifts needed by the exact degradation. 
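 *
 * As a concrete (made-up) figure: a cpu_load[2] value of 1024 that missed
 * 8 ticks decays in one step to
 *
 *	(1024 * 12) >> DEGRADE_SHIFT = 96
 *
 * versus the exact (3/4)^8 * 1024 ~= 102, i.e. the 128-point table trades
 * a little precision for handling the whole backlog with a few mult/shifts.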
3242 */ 3243#define DEGRADE_SHIFT 7 3244static const unsigned char 3245 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; 3246static const unsigned char 3247 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { 3248 {0, 0, 0, 0, 0, 0, 0, 0}, 3249 {64, 32, 8, 0, 0, 0, 0, 0}, 3250 {96, 72, 40, 12, 1, 0, 0}, 3251 {112, 98, 75, 43, 15, 1, 0}, 3252 {120, 112, 98, 76, 45, 16, 2} }; 3253 3254/* 3255 * Update cpu_load for any missed ticks, due to tickless idle. The backlog 3256 * would be when CPU is idle and so we just decay the old load without 3257 * adding any new load. 3258 */ 3259static unsigned long 3260decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) 3261{ 3262 int j = 0; 3263 3264 if (!missed_updates) 3265 return load; 3266 3267 if (missed_updates >= degrade_zero_ticks[idx]) 3268 return 0; 3269 3270 if (idx == 1) 3271 return load >> missed_updates; 3272 3273 while (missed_updates) { 3274 if (missed_updates % 2) 3275 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; 3276 3277 missed_updates >>= 1; 3278 j++; 3279 } 3280 return load; 3281} 3282 3283/* 3284 * Update rq->cpu_load[] statistics. This function is usually called every 3285 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 3286 * every tick. We fix it up based on jiffies. 3287 */ 3288static void update_cpu_load(struct rq *this_rq) 3289{ 3290 unsigned long this_load = this_rq->load.weight; 3291 unsigned long curr_jiffies = jiffies; 3292 unsigned long pending_updates; 3293 int i, scale; 3294 3295 this_rq->nr_load_updates++; 3296 3297 /* Avoid repeated calls on same jiffy, when moving in and out of idle */ 3298 if (curr_jiffies == this_rq->last_load_update_tick) 3299 return; 3300 3301 pending_updates = curr_jiffies - this_rq->last_load_update_tick; 3302 this_rq->last_load_update_tick = curr_jiffies; 3303 3304 /* Update our load: */ 3305 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 3306 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3307 unsigned long old_load, new_load; 3308 3309 /* scale is effectively 1 << i now, and >> i divides by scale */ 3310 3311 old_load = this_rq->cpu_load[i]; 3312 old_load = decay_load_missed(old_load, pending_updates - 1, i); 3313 new_load = this_load; 3314 /* 3315 * Round up the averaging division if load is increasing. This 3316 * prevents us from getting stuck on 9 if the load is 10, for 3317 * example. 3318 */ 3319 if (new_load > old_load) 3320 new_load += scale - 1; 3321 3322 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; 3323 } 3324 3325 sched_avg_update(this_rq); 3326} 3327 3328static void update_cpu_load_active(struct rq *this_rq) 3329{ 3330 update_cpu_load(this_rq); 3331 3332 calc_load_account_active(this_rq); 3333} 3334 3335#ifdef CONFIG_SMP 3336 3337/* 3338 * sched_exec - execve() is a valuable balancing opportunity, because at 3339 * this point the task has the smallest effective memory and cache footprint. 
3340 */ 3341void sched_exec(void) 3342{ 3343 struct task_struct *p = current; 3344 unsigned long flags; 3345 struct rq *rq; 3346 int dest_cpu; 3347 3348 rq = task_rq_lock(p, &flags); 3349 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3350 if (dest_cpu == smp_processor_id()) 3351 goto unlock; 3352 3353 /* 3354 * select_task_rq() can race against ->cpus_allowed 3355 */ 3356 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3357 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { 3358 struct migration_arg arg = { p, dest_cpu }; 3359 3360 task_rq_unlock(rq, &flags); 3361 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 3362 return; 3363 } 3364unlock: 3365 task_rq_unlock(rq, &flags); 3366} 3367 3368#endif 3369 3370DEFINE_PER_CPU(struct kernel_stat, kstat); 3371 3372EXPORT_PER_CPU_SYMBOL(kstat); 3373 3374/* 3375 * Return any ns on the sched_clock that have not yet been accounted in 3376 * @p in case that task is currently running. 3377 * 3378 * Called with task_rq_lock() held on @rq. 3379 */ 3380static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 3381{ 3382 u64 ns = 0; 3383 3384 if (task_current(rq, p)) { 3385 update_rq_clock(rq); 3386 ns = rq->clock - p->se.exec_start; 3387 if ((s64)ns < 0) 3388 ns = 0; 3389 } 3390 3391 return ns; 3392} 3393 3394unsigned long long task_delta_exec(struct task_struct *p) 3395{ 3396 unsigned long flags; 3397 struct rq *rq; 3398 u64 ns = 0; 3399 3400 rq = task_rq_lock(p, &flags); 3401 ns = do_task_delta_exec(p, rq); 3402 task_rq_unlock(rq, &flags); 3403 3404 return ns; 3405} 3406 3407/* 3408 * Return accounted runtime for the task. 3409 * In case the task is currently running, return the runtime plus current's 3410 * pending runtime that have not been accounted yet. 3411 */ 3412unsigned long long task_sched_runtime(struct task_struct *p) 3413{ 3414 unsigned long flags; 3415 struct rq *rq; 3416 u64 ns = 0; 3417 3418 rq = task_rq_lock(p, &flags); 3419 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3420 task_rq_unlock(rq, &flags); 3421 3422 return ns; 3423} 3424 3425/* 3426 * Return sum_exec_runtime for the thread group. 3427 * In case the task is currently running, return the sum plus current's 3428 * pending runtime that have not been accounted yet. 3429 * 3430 * Note that the thread group might have other running tasks as well, 3431 * so the return value not includes other pending runtime that other 3432 * running tasks might have. 3433 */ 3434unsigned long long thread_group_sched_runtime(struct task_struct *p) 3435{ 3436 struct task_cputime totals; 3437 unsigned long flags; 3438 struct rq *rq; 3439 u64 ns; 3440 3441 rq = task_rq_lock(p, &flags); 3442 thread_group_cputime(p, &totals); 3443 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3444 task_rq_unlock(rq, &flags); 3445 3446 return ns; 3447} 3448 3449/* 3450 * Account user cpu time to a process. 3451 * @p: the process that the cpu time gets accounted to 3452 * @cputime: the cpu time spent in user space since the last update 3453 * @cputime_scaled: cputime scaled by cpu frequency 3454 */ 3455void account_user_time(struct task_struct *p, cputime_t cputime, 3456 cputime_t cputime_scaled) 3457{ 3458 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3459 cputime64_t tmp; 3460 3461 /* Add user time to process. */ 3462 p->utime = cputime_add(p->utime, cputime); 3463 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 3464 account_group_user_time(p, cputime); 3465 3466 /* Add user time to cpustat. 
*/ 3467 tmp = cputime_to_cputime64(cputime); 3468 if (TASK_NICE(p) > 0) 3469 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3470 else 3471 cpustat->user = cputime64_add(cpustat->user, tmp); 3472 3473 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); 3474 /* Account for user time used */ 3475 acct_update_integrals(p); 3476} 3477 3478/* 3479 * Account guest cpu time to a process. 3480 * @p: the process that the cpu time gets accounted to 3481 * @cputime: the cpu time spent in virtual machine since the last update 3482 * @cputime_scaled: cputime scaled by cpu frequency 3483 */ 3484static void account_guest_time(struct task_struct *p, cputime_t cputime, 3485 cputime_t cputime_scaled) 3486{ 3487 cputime64_t tmp; 3488 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3489 3490 tmp = cputime_to_cputime64(cputime); 3491 3492 /* Add guest time to process. */ 3493 p->utime = cputime_add(p->utime, cputime); 3494 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 3495 account_group_user_time(p, cputime); 3496 p->gtime = cputime_add(p->gtime, cputime); 3497 3498 /* Add guest time to cpustat. */ 3499 if (TASK_NICE(p) > 0) { 3500 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3501 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); 3502 } else { 3503 cpustat->user = cputime64_add(cpustat->user, tmp); 3504 cpustat->guest = cputime64_add(cpustat->guest, tmp); 3505 } 3506} 3507 3508/* 3509 * Account system cpu time to a process. 3510 * @p: the process that the cpu time gets accounted to 3511 * @hardirq_offset: the offset to subtract from hardirq_count() 3512 * @cputime: the cpu time spent in kernel space since the last update 3513 * @cputime_scaled: cputime scaled by cpu frequency 3514 */ 3515void account_system_time(struct task_struct *p, int hardirq_offset, 3516 cputime_t cputime, cputime_t cputime_scaled) 3517{ 3518 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3519 cputime64_t tmp; 3520 3521 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3522 account_guest_time(p, cputime, cputime_scaled); 3523 return; 3524 } 3525 3526 /* Add system time to process. */ 3527 p->stime = cputime_add(p->stime, cputime); 3528 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 3529 account_group_system_time(p, cputime); 3530 3531 /* Add system time to cpustat. */ 3532 tmp = cputime_to_cputime64(cputime); 3533 if (hardirq_count() - hardirq_offset) 3534 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3535 else if (softirq_count()) 3536 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3537 else 3538 cpustat->system = cputime64_add(cpustat->system, tmp); 3539 3540 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); 3541 3542 /* Account for system time used */ 3543 acct_update_integrals(p); 3544} 3545 3546/* 3547 * Account for involuntary wait time. 3548 * @steal: the cpu time spent in involuntary wait 3549 */ 3550void account_steal_time(cputime_t cputime) 3551{ 3552 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3553 cputime64_t cputime64 = cputime_to_cputime64(cputime); 3554 3555 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 3556} 3557 3558/* 3559 * Account for idle time. 
3560 * @cputime: the cpu time spent in idle wait 3561 */ 3562void account_idle_time(cputime_t cputime) 3563{ 3564 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3565 cputime64_t cputime64 = cputime_to_cputime64(cputime); 3566 struct rq *rq = this_rq(); 3567 3568 if (atomic_read(&rq->nr_iowait) > 0) 3569 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 3570 else 3571 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 3572} 3573 3574#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3575 3576/* 3577 * Account a single tick of cpu time. 3578 * @p: the process that the cpu time gets accounted to 3579 * @user_tick: indicates if the tick is a user or a system tick 3580 */ 3581void account_process_tick(struct task_struct *p, int user_tick) 3582{ 3583 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3584 struct rq *rq = this_rq(); 3585 3586 if (user_tick) 3587 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3588 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 3589 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 3590 one_jiffy_scaled); 3591 else 3592 account_idle_time(cputime_one_jiffy); 3593} 3594 3595/* 3596 * Account multiple ticks of steal time. 3597 * @p: the process from which the cpu time has been stolen 3598 * @ticks: number of stolen ticks 3599 */ 3600void account_steal_ticks(unsigned long ticks) 3601{ 3602 account_steal_time(jiffies_to_cputime(ticks)); 3603} 3604 3605/* 3606 * Account multiple ticks of idle time. 3607 * @ticks: number of stolen ticks 3608 */ 3609void account_idle_ticks(unsigned long ticks) 3610{ 3611 account_idle_time(jiffies_to_cputime(ticks)); 3612} 3613 3614#endif 3615 3616/* 3617 * Use precise platform statistics if available: 3618 */ 3619#ifdef CONFIG_VIRT_CPU_ACCOUNTING 3620void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3621{ 3622 *ut = p->utime; 3623 *st = p->stime; 3624} 3625 3626void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3627{ 3628 struct task_cputime cputime; 3629 3630 thread_group_cputime(p, &cputime); 3631 3632 *ut = cputime.utime; 3633 *st = cputime.stime; 3634} 3635#else 3636 3637#ifndef nsecs_to_cputime 3638# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) 3639#endif 3640 3641void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3642{ 3643 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); 3644 3645 /* 3646 * Use CFS's precise accounting: 3647 */ 3648 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3649 3650 if (total) { 3651 u64 temp = rtime; 3652 3653 temp *= utime; 3654 do_div(temp, total); 3655 utime = (cputime_t)temp; 3656 } else 3657 utime = rtime; 3658 3659 /* 3660 * Compare with previous values, to keep monotonicity: 3661 */ 3662 p->prev_utime = max(p->prev_utime, utime); 3663 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); 3664 3665 *ut = p->prev_utime; 3666 *st = p->prev_stime; 3667} 3668 3669/* 3670 * Must be called with siglock held. 
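 *
 * The rescaling mirrors task_times() above; with made-up numbers, say
 * cputime.utime = 6 and cputime.stime = 4 ticks but a CFS sum_exec_runtime
 * worth rtime = 8 ticks, the group times become
 *
 *	utime = 8 * 6 / 10 = 4	(integer division)
 *	stime = 8 - 4      = 4
 *
 * so utime + stime always matches the precisely accounted rtime, and the
 * max() against the prev_* fields only keeps both values monotonic.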
3671 */ 3672void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3673{ 3674 struct signal_struct *sig = p->signal; 3675 struct task_cputime cputime; 3676 cputime_t rtime, utime, total; 3677 3678 thread_group_cputime(p, &cputime); 3679 3680 total = cputime_add(cputime.utime, cputime.stime); 3681 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3682 3683 if (total) { 3684 u64 temp = rtime; 3685 3686 temp *= cputime.utime; 3687 do_div(temp, total); 3688 utime = (cputime_t)temp; 3689 } else 3690 utime = rtime; 3691 3692 sig->prev_utime = max(sig->prev_utime, utime); 3693 sig->prev_stime = max(sig->prev_stime, 3694 cputime_sub(rtime, sig->prev_utime)); 3695 3696 *ut = sig->prev_utime; 3697 *st = sig->prev_stime; 3698} 3699#endif 3700 3701/* 3702 * This function gets called by the timer code, with HZ frequency. 3703 * We call it with interrupts disabled. 3704 * 3705 * It also gets called by the fork code, when changing the parent's 3706 * timeslices. 3707 */ 3708void scheduler_tick(void) 3709{ 3710 int cpu = smp_processor_id(); 3711 struct rq *rq = cpu_rq(cpu); 3712 struct task_struct *curr = rq->curr; 3713 3714#if defined(BUZZZ_KEVT_LVL) && (BUZZZ_KEVT_LVL >= 1) 3715 buzzz_kevt_log1(BUZZZ_KEVT_ID_SCHED_TICK, jiffies); 3716#endif /* BUZZZ_KEVT_LVL */ 3717 3718 sched_clock_tick(); 3719 3720 raw_spin_lock(&rq->lock); 3721 update_rq_clock(rq); 3722 update_cpu_load_active(rq); 3723 curr->sched_class->task_tick(rq, curr, 0); 3724 raw_spin_unlock(&rq->lock); 3725 3726 perf_event_task_tick(curr); 3727 3728#ifdef CONFIG_SMP 3729 rq->idle_at_tick = idle_cpu(cpu); 3730 trigger_load_balance(rq, cpu); 3731#endif 3732} 3733 3734notrace unsigned long get_parent_ip(unsigned long addr) 3735{ 3736 if (in_lock_functions(addr)) { 3737 addr = CALLER_ADDR2; 3738 if (in_lock_functions(addr)) 3739 addr = CALLER_ADDR3; 3740 } 3741 return addr; 3742} 3743 3744#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 3745 defined(CONFIG_PREEMPT_TRACER)) 3746 3747void __kprobes add_preempt_count(int val) 3748{ 3749#ifdef CONFIG_DEBUG_PREEMPT 3750 /* 3751 * Underflow? 3752 */ 3753 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 3754 return; 3755#endif 3756 preempt_count() += val; 3757#ifdef CONFIG_DEBUG_PREEMPT 3758 /* 3759 * Spinlock count overflowing soon? 3760 */ 3761 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 3762 PREEMPT_MASK - 10); 3763#endif 3764 if (preempt_count() == val) 3765 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 3766} 3767EXPORT_SYMBOL(add_preempt_count); 3768 3769void __kprobes sub_preempt_count(int val) 3770{ 3771#ifdef CONFIG_DEBUG_PREEMPT 3772 /* 3773 * Underflow? 3774 */ 3775 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 3776 return; 3777 /* 3778 * Is the spinlock portion underflowing? 
3779 */ 3780 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 3781 !(preempt_count() & PREEMPT_MASK))) 3782 return; 3783#endif 3784 3785 if (preempt_count() == val) 3786 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 3787 preempt_count() -= val; 3788} 3789EXPORT_SYMBOL(sub_preempt_count); 3790 3791#endif 3792 3793/* 3794 * Print scheduling while atomic bug: 3795 */ 3796static noinline void __schedule_bug(struct task_struct *prev) 3797{ 3798 struct pt_regs *regs = get_irq_regs(); 3799 3800 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3801 prev->comm, prev->pid, preempt_count()); 3802 3803 debug_show_held_locks(prev); 3804 print_modules(); 3805 if (irqs_disabled()) 3806 print_irqtrace_events(prev); 3807 3808 if (regs) 3809 show_regs(regs); 3810 else 3811 dump_stack(); 3812} 3813 3814/* 3815 * Various schedule()-time debugging checks and statistics: 3816 */ 3817static inline void schedule_debug(struct task_struct *prev) 3818{ 3819 /* 3820 * Test if we are atomic. Since do_exit() needs to call into 3821 * schedule() atomically, we ignore that path for now. 3822 * Otherwise, whine if we are scheduling when we should not be. 3823 */ 3824 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 3825 __schedule_bug(prev); 3826 3827 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3828 3829 schedstat_inc(this_rq(), sched_count); 3830#ifdef CONFIG_SCHEDSTATS 3831 if (unlikely(prev->lock_depth >= 0)) { 3832 schedstat_inc(this_rq(), bkl_count); 3833 schedstat_inc(prev, sched_info.bkl_count); 3834 } 3835#endif 3836} 3837 3838static void put_prev_task(struct rq *rq, struct task_struct *prev) 3839{ 3840 if (prev->se.on_rq) 3841 update_rq_clock(rq); 3842 prev->sched_class->put_prev_task(rq, prev); 3843} 3844 3845/* 3846 * Pick up the highest-prio task: 3847 */ 3848static inline struct task_struct * 3849pick_next_task(struct rq *rq) 3850{ 3851 const struct sched_class *class; 3852 struct task_struct *p; 3853 3854 /* 3855 * Optimization: we know that if all tasks are in 3856 * the fair class we can call that function directly: 3857 */ 3858 if (likely(rq->nr_running == rq->cfs.nr_running)) { 3859 p = fair_sched_class.pick_next_task(rq); 3860 if (likely(p)) 3861 return p; 3862 } 3863 3864 class = sched_class_highest; 3865 for ( ; ; ) { 3866 p = class->pick_next_task(rq); 3867 if (p) 3868 return p; 3869 /* 3870 * Will never be NULL as the idle class always 3871 * returns a non-NULL p: 3872 */ 3873 class = class->next; 3874 } 3875} 3876 3877/* 3878 * schedule() is the main scheduler function. 3879 */ 3880asmlinkage void __sched schedule(void) 3881{ 3882 struct task_struct *prev, *next; 3883 unsigned long *switch_count; 3884 struct rq *rq; 3885 int cpu; 3886 3887need_resched: 3888 preempt_disable(); 3889 cpu = smp_processor_id(); 3890 rq = cpu_rq(cpu); 3891 rcu_note_context_switch(cpu); 3892 prev = rq->curr; 3893 3894 release_kernel_lock(prev); 3895need_resched_nonpreemptible: 3896 3897 schedule_debug(prev); 3898 3899 if (sched_feat(HRTICK)) 3900 hrtick_clear(rq); 3901 3902 raw_spin_lock_irq(&rq->lock); 3903 3904 switch_count = &prev->nivcsw; 3905 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3906 if (unlikely(signal_pending_state(prev->state, prev))) { 3907 prev->state = TASK_RUNNING; 3908 } else { 3909 /* 3910 * If a worker is going to sleep, notify and 3911 * ask workqueue whether it wants to wake up a 3912 * task to maintain concurrency. If so, wake 3913 * up the task. 
3914 */ 3915 if (prev->flags & PF_WQ_WORKER) { 3916 struct task_struct *to_wakeup; 3917 3918 to_wakeup = wq_worker_sleeping(prev, cpu); 3919 if (to_wakeup) 3920 try_to_wake_up_local(to_wakeup); 3921 } 3922 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3923 } 3924 switch_count = &prev->nvcsw; 3925 } 3926 3927 pre_schedule(rq, prev); 3928 3929 if (unlikely(!rq->nr_running)) 3930 idle_balance(cpu, rq); 3931 3932 put_prev_task(rq, prev); 3933 next = pick_next_task(rq); 3934 clear_tsk_need_resched(prev); 3935 rq->skip_clock_update = 0; 3936 3937 if (likely(prev != next)) { 3938 sched_info_switch(prev, next); 3939 perf_event_task_sched_out(prev, next); 3940 3941 rq->nr_switches++; 3942 rq->curr = next; 3943 ++*switch_count; 3944 3945#if defined(BUZZZ_KEVT_LVL) && (BUZZZ_KEVT_LVL >= 1) 3946 buzzz_prev[cpu] = prev; 3947 buzzz_next[cpu] = next; 3948#endif /* BUZZZ_KEVT_LVL */ 3949 3950 context_switch(rq, prev, next); /* unlocks the rq */ 3951 /* 3952 * The context switch have flipped the stack from under us 3953 * and restored the local variables which were saved when 3954 * this task called schedule() in the past. prev == current 3955 * is still correct, but it can be moved to another cpu/rq. 3956 */ 3957 cpu = smp_processor_id(); 3958 rq = cpu_rq(cpu); 3959 3960#if defined(BUZZZ_KEVT_LVL) && (BUZZZ_KEVT_LVL >= 1) 3961 buzzz_kevt_log2(BUZZZ_KEVT_ID_SCHEDULE, 3962 (uint32_t)buzzz_prev[cpu], (uint32_t)buzzz_next[cpu]); 3963#endif /* BUZZZ_KEVT_LVL */ 3964 3965 } else 3966 raw_spin_unlock_irq(&rq->lock); 3967 3968 post_schedule(rq); 3969 3970 if (unlikely(reacquire_kernel_lock(prev))) 3971 goto need_resched_nonpreemptible; 3972 3973 preempt_enable_no_resched(); 3974 if (need_resched()) 3975 goto need_resched; 3976} 3977EXPORT_SYMBOL(schedule); 3978 3979#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 3980/* 3981 * Look out! "owner" is an entirely speculative pointer 3982 * access and not reliable. 3983 */ 3984int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) 3985{ 3986 unsigned int cpu; 3987 struct rq *rq; 3988 3989 if (!sched_feat(OWNER_SPIN)) 3990 return 0; 3991 3992#ifdef CONFIG_DEBUG_PAGEALLOC 3993 /* 3994 * Need to access the cpu field knowing that 3995 * DEBUG_PAGEALLOC could have unmapped it if 3996 * the mutex owner just released it and exited. 3997 */ 3998 if (probe_kernel_address(&owner->cpu, cpu)) 3999 return 0; 4000#else 4001 cpu = owner->cpu; 4002#endif 4003 4004 /* 4005 * Even if the access succeeded (likely case), 4006 * the cpu field may no longer be valid. 4007 */ 4008 if (cpu >= nr_cpumask_bits) 4009 return 0; 4010 4011 /* 4012 * We need to validate that we can do a 4013 * get_cpu() and that we have the percpu area. 4014 */ 4015 if (!cpu_online(cpu)) 4016 return 0; 4017 4018 rq = cpu_rq(cpu); 4019 4020 for (;;) { 4021 /* 4022 * Owner changed, break to re-assess state. 4023 */ 4024 if (lock->owner != owner) { 4025 /* 4026 * If the lock has switched to a different owner, 4027 * we likely have heavy contention. Return 0 to quit 4028 * optimistic spinning and not contend further: 4029 */ 4030 if (lock->owner) 4031 return 0; 4032 break; 4033 } 4034 4035 /* 4036 * Is that owner really running on that cpu? 4037 */ 4038 if (task_thread_info(rq->curr) != owner || need_resched()) 4039 return 0; 4040 4041 cpu_relax(); 4042 } 4043 4044 return 1; 4045} 4046#endif 4047 4048#ifdef CONFIG_PREEMPT 4049/* 4050 * this is the entry point to schedule() from in-kernel preemption 4051 * off of preempt_enable. Kernel preemptions off return from interrupt 4052 * occur there and call schedule directly. 
4053 */ 4054asmlinkage void __sched notrace preempt_schedule(void) 4055{ 4056 struct thread_info *ti = current_thread_info(); 4057 4058 /* 4059 * If there is a non-zero preempt_count or interrupts are disabled, 4060 * we do not want to preempt the current task. Just return.. 4061 */ 4062 if (likely(ti->preempt_count || irqs_disabled())) 4063 return; 4064 4065 do { 4066 add_preempt_count_notrace(PREEMPT_ACTIVE); 4067 schedule(); 4068 sub_preempt_count_notrace(PREEMPT_ACTIVE); 4069 4070 /* 4071 * Check again in case we missed a preemption opportunity 4072 * between schedule and now. 4073 */ 4074 barrier(); 4075 } while (need_resched()); 4076} 4077EXPORT_SYMBOL(preempt_schedule); 4078 4079/* 4080 * this is the entry point to schedule() from kernel preemption 4081 * off of irq context. 4082 * Note, that this is called and return with irqs disabled. This will 4083 * protect us against recursive calling from irq. 4084 */ 4085asmlinkage void __sched preempt_schedule_irq(void) 4086{ 4087 struct thread_info *ti = current_thread_info(); 4088 4089 /* Catch callers which need to be fixed */ 4090 BUG_ON(ti->preempt_count || !irqs_disabled()); 4091 4092 do { 4093 add_preempt_count(PREEMPT_ACTIVE); 4094 local_irq_enable(); 4095 schedule(); 4096 local_irq_disable(); 4097 sub_preempt_count(PREEMPT_ACTIVE); 4098 4099 /* 4100 * Check again in case we missed a preemption opportunity 4101 * between schedule and now. 4102 */ 4103 barrier(); 4104 } while (need_resched()); 4105} 4106 4107#endif /* CONFIG_PREEMPT */ 4108 4109int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 4110 void *key) 4111{ 4112 return try_to_wake_up(curr->private, mode, wake_flags); 4113} 4114EXPORT_SYMBOL(default_wake_function); 4115 4116/* 4117 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 4118 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 4119 * number) then we wake all the non-exclusive tasks and one exclusive task. 4120 * 4121 * There are circumstances in which we can try to wake a task which has already 4122 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 4123 * zero in this (rare) case, and we handle it by continuing to scan the queue. 4124 */ 4125static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 4126 int nr_exclusive, int wake_flags, void *key) 4127{ 4128 wait_queue_t *curr, *next; 4129 4130 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 4131 unsigned flags = curr->flags; 4132 4133 if (curr->func(curr, mode, wake_flags, key) && 4134 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 4135 break; 4136 } 4137} 4138 4139/** 4140 * __wake_up - wake up threads blocked on a waitqueue. 4141 * @q: the waitqueue 4142 * @mode: which threads 4143 * @nr_exclusive: how many wake-one or wake-many threads to wake up 4144 * @key: is directly passed to the wakeup function 4145 * 4146 * It may be assumed that this function implies a write memory barrier before 4147 * changing the task state if and only if any tasks are woken up. 4148 */ 4149void __wake_up(wait_queue_head_t *q, unsigned int mode, 4150 int nr_exclusive, void *key) 4151{ 4152 unsigned long flags; 4153 4154 spin_lock_irqsave(&q->lock, flags); 4155 __wake_up_common(q, mode, nr_exclusive, 0, key); 4156 spin_unlock_irqrestore(&q->lock, flags); 4157} 4158EXPORT_SYMBOL(__wake_up); 4159 4160/* 4161 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 
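 *
 * For reference, the sleeper that any of these wakeup calls pairs with
 * typically looks like the following sketch ('q' and 'condition' stand
 * for whatever the caller uses):
 *
 *	DEFINE_WAIT(wait);
 *
 *	for (;;) {
 *		prepare_to_wait(&q, &wait, TASK_INTERRUPTIBLE);
 *		if (condition)
 *			break;
 *		schedule();
 *	}
 *	finish_wait(&q, &wait);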
4162 */ 4163void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 4164{ 4165 __wake_up_common(q, mode, 1, 0, NULL); 4166} 4167EXPORT_SYMBOL_GPL(__wake_up_locked); 4168 4169void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 4170{ 4171 __wake_up_common(q, mode, 1, 0, key); 4172} 4173 4174/** 4175 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 4176 * @q: the waitqueue 4177 * @mode: which threads 4178 * @nr_exclusive: how many wake-one or wake-many threads to wake up 4179 * @key: opaque value to be passed to wakeup targets 4180 * 4181 * The sync wakeup differs that the waker knows that it will schedule 4182 * away soon, so while the target thread will be woken up, it will not 4183 * be migrated to another CPU - ie. the two threads are 'synchronized' 4184 * with each other. This can prevent needless bouncing between CPUs. 4185 * 4186 * On UP it can prevent extra preemption. 4187 * 4188 * It may be assumed that this function implies a write memory barrier before 4189 * changing the task state if and only if any tasks are woken up. 4190 */ 4191void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 4192 int nr_exclusive, void *key) 4193{ 4194 unsigned long flags; 4195 int wake_flags = WF_SYNC; 4196 4197 if (unlikely(!q)) 4198 return; 4199 4200 if (unlikely(!nr_exclusive)) 4201 wake_flags = 0; 4202 4203 spin_lock_irqsave(&q->lock, flags); 4204 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 4205 spin_unlock_irqrestore(&q->lock, flags); 4206} 4207EXPORT_SYMBOL_GPL(__wake_up_sync_key); 4208 4209/* 4210 * __wake_up_sync - see __wake_up_sync_key() 4211 */ 4212void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 4213{ 4214 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 4215} 4216EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 4217 4218/** 4219 * complete: - signals a single thread waiting on this completion 4220 * @x: holds the state of this particular completion 4221 * 4222 * This will wake up a single thread waiting on this completion. Threads will be 4223 * awakened in the same order in which they were queued. 4224 * 4225 * See also complete_all(), wait_for_completion() and related routines. 4226 * 4227 * It may be assumed that this function implies a write memory barrier before 4228 * changing the task state if and only if any tasks are woken up. 4229 */ 4230void complete(struct completion *x) 4231{ 4232 unsigned long flags; 4233 4234 spin_lock_irqsave(&x->wait.lock, flags); 4235 x->done++; 4236 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 4237 spin_unlock_irqrestore(&x->wait.lock, flags); 4238} 4239EXPORT_SYMBOL(complete); 4240 4241/** 4242 * complete_all: - signals all threads waiting on this completion 4243 * @x: holds the state of this particular completion 4244 * 4245 * This will wake up all threads waiting on this particular completion event. 4246 * 4247 * It may be assumed that this function implies a write memory barrier before 4248 * changing the task state if and only if any tasks are woken up. 
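 *
 * Illustrative pairing with multiple waiters ('setup_done' is a
 * hypothetical completion):
 *
 *	DECLARE_COMPLETION(setup_done);
 *
 *	wait_for_completion(&setup_done);	(each of N waiter threads)
 *	...
 *	complete_all(&setup_done);		(single releaser, wakes all N)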
4249 */ 4250void complete_all(struct completion *x) 4251{ 4252 unsigned long flags; 4253 4254 spin_lock_irqsave(&x->wait.lock, flags); 4255 x->done += UINT_MAX/2; 4256 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 4257 spin_unlock_irqrestore(&x->wait.lock, flags); 4258} 4259EXPORT_SYMBOL(complete_all); 4260 4261static inline long __sched 4262do_wait_for_common(struct completion *x, long timeout, int state) 4263{ 4264 if (!x->done) { 4265 DECLARE_WAITQUEUE(wait, current); 4266 4267 __add_wait_queue_tail_exclusive(&x->wait, &wait); 4268 do { 4269 if (signal_pending_state(state, current)) { 4270 timeout = -ERESTARTSYS; 4271 break; 4272 } 4273 __set_current_state(state); 4274 spin_unlock_irq(&x->wait.lock); 4275 timeout = schedule_timeout(timeout); 4276 spin_lock_irq(&x->wait.lock); 4277 } while (!x->done && timeout); 4278 __remove_wait_queue(&x->wait, &wait); 4279 if (!x->done) 4280 return timeout; 4281 } 4282 x->done--; 4283 return timeout ?: 1; 4284} 4285 4286static long __sched 4287wait_for_common(struct completion *x, long timeout, int state) 4288{ 4289 might_sleep(); 4290 4291 spin_lock_irq(&x->wait.lock); 4292 timeout = do_wait_for_common(x, timeout, state); 4293 spin_unlock_irq(&x->wait.lock); 4294 return timeout; 4295} 4296 4297/** 4298 * wait_for_completion: - waits for completion of a task 4299 * @x: holds the state of this particular completion 4300 * 4301 * This waits to be signaled for completion of a specific task. It is NOT 4302 * interruptible and there is no timeout. 4303 * 4304 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout 4305 * and interrupt capability. Also see complete(). 4306 */ 4307void __sched wait_for_completion(struct completion *x) 4308{ 4309 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4310} 4311EXPORT_SYMBOL(wait_for_completion); 4312 4313/** 4314 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 4315 * @x: holds the state of this particular completion 4316 * @timeout: timeout value in jiffies 4317 * 4318 * This waits for either a completion of a specific task to be signaled or for a 4319 * specified timeout to expire. The timeout is in jiffies. It is not 4320 * interruptible. 4321 */ 4322unsigned long __sched 4323wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4324{ 4325 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 4326} 4327EXPORT_SYMBOL(wait_for_completion_timeout); 4328 4329/** 4330 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 4331 * @x: holds the state of this particular completion 4332 * 4333 * This waits for completion of a specific task to be signaled. It is 4334 * interruptible. 4335 */ 4336int __sched wait_for_completion_interruptible(struct completion *x) 4337{ 4338 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4339 if (t == -ERESTARTSYS) 4340 return t; 4341 return 0; 4342} 4343EXPORT_SYMBOL(wait_for_completion_interruptible); 4344 4345/** 4346 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 4347 * @x: holds the state of this particular completion 4348 * @timeout: timeout value in jiffies 4349 * 4350 * This waits for either a completion of a specific task to be signaled or for a 4351 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 
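 *
 * Typical return-value handling (sketch; 'done' is a hypothetical
 * completion):
 *
 *	long t = wait_for_completion_interruptible_timeout(&done, HZ);
 *
 *	if (t == -ERESTARTSYS)
 *		... interrupted by a signal ...
 *	else if (t == 0)
 *		... timed out ...
 *	else
 *		... completed, t holds the remaining jiffies (at least 1) ...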
4352 */ 4353unsigned long __sched 4354wait_for_completion_interruptible_timeout(struct completion *x, 4355 unsigned long timeout) 4356{ 4357 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 4358} 4359EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4360 4361/** 4362 * wait_for_completion_killable: - waits for completion of a task (killable) 4363 * @x: holds the state of this particular completion 4364 * 4365 * This waits to be signaled for completion of a specific task. It can be 4366 * interrupted by a kill signal. 4367 */ 4368int __sched wait_for_completion_killable(struct completion *x) 4369{ 4370 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4371 if (t == -ERESTARTSYS) 4372 return t; 4373 return 0; 4374} 4375EXPORT_SYMBOL(wait_for_completion_killable); 4376 4377/** 4378 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) 4379 * @x: holds the state of this particular completion 4380 * @timeout: timeout value in jiffies 4381 * 4382 * This waits for either a completion of a specific task to be 4383 * signaled or for a specified timeout to expire. It can be 4384 * interrupted by a kill signal. The timeout is in jiffies. 4385 */ 4386unsigned long __sched 4387wait_for_completion_killable_timeout(struct completion *x, 4388 unsigned long timeout) 4389{ 4390 return wait_for_common(x, timeout, TASK_KILLABLE); 4391} 4392EXPORT_SYMBOL(wait_for_completion_killable_timeout); 4393 4394/** 4395 * try_wait_for_completion - try to decrement a completion without blocking 4396 * @x: completion structure 4397 * 4398 * Returns: 0 if a decrement cannot be done without blocking 4399 * 1 if a decrement succeeded. 4400 * 4401 * If a completion is being used as a counting completion, 4402 * attempt to decrement the counter without blocking. This 4403 * enables us to avoid waiting if the resource the completion 4404 * is protecting is not available. 4405 */ 4406bool try_wait_for_completion(struct completion *x) 4407{ 4408 unsigned long flags; 4409 int ret = 1; 4410 4411 spin_lock_irqsave(&x->wait.lock, flags); 4412 if (!x->done) 4413 ret = 0; 4414 else 4415 x->done--; 4416 spin_unlock_irqrestore(&x->wait.lock, flags); 4417 return ret; 4418} 4419EXPORT_SYMBOL(try_wait_for_completion); 4420 4421/** 4422 * completion_done - Test to see if a completion has any waiters 4423 * @x: completion structure 4424 * 4425 * Returns: 0 if there are waiters (wait_for_completion() in progress) 4426 * 1 if there are no waiters. 
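 *
 * Illustrative non-blocking pattern together with try_wait_for_completion()
 * above ('ready' is a hypothetical counting completion):
 *
 *	if (!try_wait_for_completion(&ready))
 *		wait_for_completion(&ready);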
4427 * 4428 */ 4429bool completion_done(struct completion *x) 4430{ 4431 unsigned long flags; 4432 int ret = 1; 4433 4434 spin_lock_irqsave(&x->wait.lock, flags); 4435 if (!x->done) 4436 ret = 0; 4437 spin_unlock_irqrestore(&x->wait.lock, flags); 4438 return ret; 4439} 4440EXPORT_SYMBOL(completion_done); 4441 4442static long __sched 4443sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4444{ 4445 unsigned long flags; 4446 wait_queue_t wait; 4447 4448 init_waitqueue_entry(&wait, current); 4449 4450 __set_current_state(state); 4451 4452 spin_lock_irqsave(&q->lock, flags); 4453 __add_wait_queue(q, &wait); 4454 spin_unlock(&q->lock); 4455 timeout = schedule_timeout(timeout); 4456 spin_lock_irq(&q->lock); 4457 __remove_wait_queue(q, &wait); 4458 spin_unlock_irqrestore(&q->lock, flags); 4459 4460 return timeout; 4461} 4462 4463void __sched interruptible_sleep_on(wait_queue_head_t *q) 4464{ 4465 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4466} 4467EXPORT_SYMBOL(interruptible_sleep_on); 4468 4469long __sched 4470interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 4471{ 4472 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 4473} 4474EXPORT_SYMBOL(interruptible_sleep_on_timeout); 4475 4476void __sched sleep_on(wait_queue_head_t *q) 4477{ 4478 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4479} 4480EXPORT_SYMBOL(sleep_on); 4481 4482long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 4483{ 4484 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 4485} 4486EXPORT_SYMBOL(sleep_on_timeout); 4487 4488#ifdef CONFIG_RT_MUTEXES 4489 4490/* 4491 * rt_mutex_setprio - set the current priority of a task 4492 * @p: task 4493 * @prio: prio value (kernel-internal form) 4494 * 4495 * This function changes the 'effective' priority of a task. It does 4496 * not touch ->normal_prio like __setscheduler(). 4497 * 4498 * Used by the rt_mutex code to implement priority inheritance logic. 4499 */ 4500void rt_mutex_setprio(struct task_struct *p, int prio) 4501{ 4502 unsigned long flags; 4503 int oldprio, on_rq, running; 4504 struct rq *rq; 4505 const struct sched_class *prev_class; 4506 4507 BUG_ON(prio < 0 || prio > MAX_PRIO); 4508 4509 rq = task_rq_lock(p, &flags); 4510 4511 oldprio = p->prio; 4512 prev_class = p->sched_class; 4513 on_rq = p->se.on_rq; 4514 running = task_current(rq, p); 4515 if (on_rq) 4516 dequeue_task(rq, p, 0); 4517 if (running) 4518 p->sched_class->put_prev_task(rq, p); 4519 4520 if (rt_prio(prio)) 4521 p->sched_class = &rt_sched_class; 4522 else 4523 p->sched_class = &fair_sched_class; 4524 4525 p->prio = prio; 4526 4527 if (running) 4528 p->sched_class->set_curr_task(rq); 4529 if (on_rq) { 4530 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4531 4532 check_class_changed(rq, p, prev_class, oldprio, running); 4533 } 4534 task_rq_unlock(rq, &flags); 4535} 4536 4537#endif 4538 4539void set_user_nice(struct task_struct *p, long nice) 4540{ 4541 int old_prio, delta, on_rq; 4542 unsigned long flags; 4543 struct rq *rq; 4544 4545 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 4546 return; 4547 /* 4548 * We have to be careful, if called from sys_setpriority(), 4549 * the task might be in the middle of scheduling on another CPU. 
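 * Taking the task's runqueue lock below pins it to a stable runqueue while
 * we update it.  For reference (illustrative values with the default
 * MAX_RT_PRIO of 100), NICE_TO_PRIO() maps nice -20/0/+19 to static_prio
 * 100/120/139.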
4550 */ 4551 rq = task_rq_lock(p, &flags); 4552 /* 4553 * The RT priorities are set via sched_setscheduler(), but we still 4554 * allow the 'normal' nice value to be set - but as expected 4555 * it wont have any effect on scheduling until the task is 4556 * SCHED_FIFO/SCHED_RR: 4557 */ 4558 if (task_has_rt_policy(p)) { 4559 p->static_prio = NICE_TO_PRIO(nice); 4560 goto out_unlock; 4561 } 4562 on_rq = p->se.on_rq; 4563 if (on_rq) 4564 dequeue_task(rq, p, 0); 4565 4566 p->static_prio = NICE_TO_PRIO(nice); 4567 set_load_weight(p); 4568 old_prio = p->prio; 4569 p->prio = effective_prio(p); 4570 delta = p->prio - old_prio; 4571 4572 if (on_rq) { 4573 enqueue_task(rq, p, 0); 4574 /* 4575 * If the task increased its priority or is running and 4576 * lowered its priority, then reschedule its CPU: 4577 */ 4578 if (delta < 0 || (delta > 0 && task_running(rq, p))) 4579 resched_task(rq->curr); 4580 } 4581out_unlock: 4582 task_rq_unlock(rq, &flags); 4583} 4584EXPORT_SYMBOL(set_user_nice); 4585 4586/* 4587 * can_nice - check if a task can reduce its nice value 4588 * @p: task 4589 * @nice: nice value 4590 */ 4591int can_nice(const struct task_struct *p, const int nice) 4592{ 4593 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4594 int nice_rlim = 20 - nice; 4595 4596 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 4597 capable(CAP_SYS_NICE)); 4598} 4599 4600#ifdef __ARCH_WANT_SYS_NICE 4601 4602/* 4603 * sys_nice - change the priority of the current process. 4604 * @increment: priority increment 4605 * 4606 * sys_setpriority is a more generic, but much slower function that 4607 * does similar things. 4608 */ 4609SYSCALL_DEFINE1(nice, int, increment) 4610{ 4611 long nice, retval; 4612 4613 /* 4614 * Setpriority might change our priority at the same moment. 4615 * We don't have to worry. Conceptually one call occurs first 4616 * and we have a single winner. 4617 */ 4618 if (increment < -40) 4619 increment = -40; 4620 if (increment > 40) 4621 increment = 40; 4622 4623 nice = TASK_NICE(current) + increment; 4624 if (nice < -20) 4625 nice = -20; 4626 if (nice > 19) 4627 nice = 19; 4628 4629 if (increment < 0 && !can_nice(current, nice)) 4630 return -EPERM; 4631 4632 retval = security_task_setnice(current, nice); 4633 if (retval) 4634 return retval; 4635 4636 set_user_nice(current, nice); 4637 return 0; 4638} 4639 4640#endif 4641 4642/** 4643 * task_prio - return the priority value of a given task. 4644 * @p: the task in question. 4645 * 4646 * This is the priority value as seen by users in /proc. 4647 * RT tasks are offset by -200. Normal tasks are centered 4648 * around 0, value goes from -16 to +15. 4649 */ 4650int task_prio(const struct task_struct *p) 4651{ 4652 return p->prio - MAX_RT_PRIO; 4653} 4654 4655/** 4656 * task_nice - return the nice value of a given task. 4657 * @p: the task in question. 4658 */ 4659int task_nice(const struct task_struct *p) 4660{ 4661 return TASK_NICE(p); 4662} 4663EXPORT_SYMBOL(task_nice); 4664 4665/** 4666 * idle_cpu - is a given cpu idle currently? 4667 * @cpu: the processor in question. 4668 */ 4669int idle_cpu(int cpu) 4670{ 4671 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 4672} 4673 4674/** 4675 * idle_task - return the idle task for a given cpu. 4676 * @cpu: the processor in question. 4677 */ 4678struct task_struct *idle_task(int cpu) 4679{ 4680 return cpu_rq(cpu)->idle; 4681} 4682 4683/** 4684 * find_process_by_pid - find a process with a matching PID value. 4685 * @pid: the pid in question. 
4686 */ 4687static struct task_struct *find_process_by_pid(pid_t pid) 4688{ 4689 return pid ? find_task_by_vpid(pid) : current; 4690} 4691 4692/* Actually do priority change: must hold rq lock. */ 4693static void 4694__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4695{ 4696 BUG_ON(p->se.on_rq); 4697 4698 p->policy = policy; 4699 p->rt_priority = prio; 4700 p->normal_prio = normal_prio(p); 4701 /* we are holding p->pi_lock already */ 4702 p->prio = rt_mutex_getprio(p); 4703 if (rt_prio(p->prio)) 4704 p->sched_class = &rt_sched_class; 4705 else 4706 p->sched_class = &fair_sched_class; 4707 set_load_weight(p); 4708} 4709 4710/* 4711 * check the target process has a UID that matches the current process's 4712 */ 4713static bool check_same_owner(struct task_struct *p) 4714{ 4715 const struct cred *cred = current_cred(), *pcred; 4716 bool match; 4717 4718 rcu_read_lock(); 4719 pcred = __task_cred(p); 4720 match = (cred->euid == pcred->euid || 4721 cred->euid == pcred->uid); 4722 rcu_read_unlock(); 4723 return match; 4724} 4725 4726static int __sched_setscheduler(struct task_struct *p, int policy, 4727 struct sched_param *param, bool user) 4728{ 4729 int retval, oldprio, oldpolicy = -1, on_rq, running; 4730 unsigned long flags; 4731 const struct sched_class *prev_class; 4732 struct rq *rq; 4733 int reset_on_fork; 4734 4735 /* may grab non-irq protected spin_locks */ 4736 BUG_ON(in_interrupt()); 4737recheck: 4738 /* double check policy once rq lock held */ 4739 if (policy < 0) { 4740 reset_on_fork = p->sched_reset_on_fork; 4741 policy = oldpolicy = p->policy; 4742 } else { 4743 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 4744 policy &= ~SCHED_RESET_ON_FORK; 4745 4746 if (policy != SCHED_FIFO && policy != SCHED_RR && 4747 policy != SCHED_NORMAL && policy != SCHED_BATCH && 4748 policy != SCHED_IDLE) 4749 return -EINVAL; 4750 } 4751 4752 /* 4753 * Valid priorities for SCHED_FIFO and SCHED_RR are 4754 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 4755 * SCHED_BATCH and SCHED_IDLE is 0. 
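 * (Illustrative instance: with MAX_USER_RT_PRIO at its default of 100,
 *  SCHED_FIFO/SCHED_RR accept sched_priority 1..99, while the non-RT
 *  policies only accept sched_priority == 0.)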
4756 */ 4757 if (param->sched_priority < 0 || 4758 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4759 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4760 return -EINVAL; 4761 if (rt_policy(policy) != (param->sched_priority != 0)) 4762 return -EINVAL; 4763 4764 /* 4765 * Allow unprivileged RT tasks to decrease priority: 4766 */ 4767 if (user && !capable(CAP_SYS_NICE)) { 4768 if (rt_policy(policy)) { 4769 unsigned long rlim_rtprio = 4770 task_rlimit(p, RLIMIT_RTPRIO); 4771 4772 /* can't set/change the rt policy */ 4773 if (policy != p->policy && !rlim_rtprio) 4774 return -EPERM; 4775 4776 /* can't increase priority */ 4777 if (param->sched_priority > p->rt_priority && 4778 param->sched_priority > rlim_rtprio) 4779 return -EPERM; 4780 } 4781 /* 4782 * Like positive nice levels, dont allow tasks to 4783 * move out of SCHED_IDLE either: 4784 */ 4785 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 4786 return -EPERM; 4787 4788 /* can't change other user's priorities */ 4789 if (!check_same_owner(p)) 4790 return -EPERM; 4791 4792 /* Normal users shall not reset the sched_reset_on_fork flag */ 4793 if (p->sched_reset_on_fork && !reset_on_fork) 4794 return -EPERM; 4795 } 4796 4797 if (user) { 4798 retval = security_task_setscheduler(p, policy, param); 4799 if (retval) 4800 return retval; 4801 } 4802 4803 /* 4804 * make sure no PI-waiters arrive (or leave) while we are 4805 * changing the priority of the task: 4806 */ 4807 raw_spin_lock_irqsave(&p->pi_lock, flags); 4808 /* 4809 * To be able to change p->policy safely, the apropriate 4810 * runqueue lock must be held. 4811 */ 4812 rq = __task_rq_lock(p); 4813 4814#ifdef CONFIG_RT_GROUP_SCHED 4815 if (user) { 4816 /* 4817 * Do not allow realtime tasks into groups that have no runtime 4818 * assigned. 4819 */ 4820 if (rt_bandwidth_enabled() && rt_policy(policy) && 4821 task_group(p)->rt_bandwidth.rt_runtime == 0) { 4822 __task_rq_unlock(rq); 4823 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4824 return -EPERM; 4825 } 4826 } 4827#endif 4828 4829 /* recheck policy now with rq lock held */ 4830 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4831 policy = oldpolicy = -1; 4832 __task_rq_unlock(rq); 4833 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4834 goto recheck; 4835 } 4836 on_rq = p->se.on_rq; 4837 running = task_current(rq, p); 4838 if (on_rq) 4839 deactivate_task(rq, p, 0); 4840 if (running) 4841 p->sched_class->put_prev_task(rq, p); 4842 4843 p->sched_reset_on_fork = reset_on_fork; 4844 4845 oldprio = p->prio; 4846 prev_class = p->sched_class; 4847 __setscheduler(rq, p, policy, param->sched_priority); 4848 4849 if (running) 4850 p->sched_class->set_curr_task(rq); 4851 if (on_rq) { 4852 activate_task(rq, p, 0); 4853 4854 check_class_changed(rq, p, prev_class, oldprio, running); 4855 } 4856 __task_rq_unlock(rq); 4857 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4858 4859 rt_mutex_adjust_pi(p); 4860 4861 return 0; 4862} 4863 4864/** 4865 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4866 * @p: the task in question. 4867 * @policy: new policy. 4868 * @param: structure containing the new RT priority. 4869 * 4870 * NOTE that the task may be already dead. 
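 *
 * Minimal in-kernel usage sketch ('worker' is a hypothetical task pointer
 * the caller already holds a reference to):
 *
 *	struct sched_param sp = { .sched_priority = MAX_RT_PRIO - 1 };
 *
 *	sched_setscheduler(worker, SCHED_FIFO, &sp);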
4871 */ 4872int sched_setscheduler(struct task_struct *p, int policy, 4873 struct sched_param *param) 4874{ 4875 return __sched_setscheduler(p, policy, param, true); 4876} 4877EXPORT_SYMBOL_GPL(sched_setscheduler); 4878 4879/** 4880 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 4881 * @p: the task in question. 4882 * @policy: new policy. 4883 * @param: structure containing the new RT priority. 4884 * 4885 * Just like sched_setscheduler, only don't bother checking if the 4886 * current context has permission. For example, this is needed in 4887 * stop_machine(): we create temporary high priority worker threads, 4888 * but our caller might not have that capability. 4889 */ 4890int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4891 struct sched_param *param) 4892{ 4893 return __sched_setscheduler(p, policy, param, false); 4894} 4895 4896static int 4897do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4898{ 4899 struct sched_param lparam; 4900 struct task_struct *p; 4901 int retval; 4902 4903 if (!param || pid < 0) 4904 return -EINVAL; 4905 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4906 return -EFAULT; 4907 4908 rcu_read_lock(); 4909 retval = -ESRCH; 4910 p = find_process_by_pid(pid); 4911 if (p != NULL) 4912 retval = sched_setscheduler(p, policy, &lparam); 4913 rcu_read_unlock(); 4914 4915 return retval; 4916} 4917 4918/** 4919 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 4920 * @pid: the pid in question. 4921 * @policy: new policy. 4922 * @param: structure containing the new RT priority. 4923 */ 4924SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 4925 struct sched_param __user *, param) 4926{ 4927 /* negative values for policy are not valid */ 4928 if (policy < 0) 4929 return -EINVAL; 4930 4931 return do_sched_setscheduler(pid, policy, param); 4932} 4933 4934/** 4935 * sys_sched_setparam - set/change the RT priority of a thread 4936 * @pid: the pid in question. 4937 * @param: structure containing the new RT priority. 4938 */ 4939SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 4940{ 4941 return do_sched_setscheduler(pid, -1, param); 4942} 4943 4944/** 4945 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 4946 * @pid: the pid in question. 4947 */ 4948SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 4949{ 4950 struct task_struct *p; 4951 int retval; 4952 4953 if (pid < 0) 4954 return -EINVAL; 4955 4956 retval = -ESRCH; 4957 rcu_read_lock(); 4958 p = find_process_by_pid(pid); 4959 if (p) { 4960 retval = security_task_getscheduler(p); 4961 if (!retval) 4962 retval = p->policy 4963 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 4964 } 4965 rcu_read_unlock(); 4966 return retval; 4967} 4968 4969/** 4970 * sys_sched_getparam - get the RT priority of a thread 4971 * @pid: the pid in question. 4972 * @param: structure containing the RT priority. 
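 *
 * Userspace view (illustrative; pid 0 means the calling process):
 *
 *	struct sched_param sp;
 *
 *	if (sched_getparam(0, &sp) == 0)
 *		printf("rt_priority = %d\n", sp.sched_priority);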
4973 */ 4974SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 4975{ 4976 struct sched_param lp; 4977 struct task_struct *p; 4978 int retval; 4979 4980 if (!param || pid < 0) 4981 return -EINVAL; 4982 4983 rcu_read_lock(); 4984 p = find_process_by_pid(pid); 4985 retval = -ESRCH; 4986 if (!p) 4987 goto out_unlock; 4988 4989 retval = security_task_getscheduler(p); 4990 if (retval) 4991 goto out_unlock; 4992 4993 lp.sched_priority = p->rt_priority; 4994 rcu_read_unlock(); 4995 4996 /* 4997 * This one might sleep, we cannot do it with a spinlock held ... 4998 */ 4999 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 5000 5001 return retval; 5002 5003out_unlock: 5004 rcu_read_unlock(); 5005 return retval; 5006} 5007 5008long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 5009{ 5010 cpumask_var_t cpus_allowed, new_mask; 5011 struct task_struct *p; 5012 int retval; 5013 5014 get_online_cpus(); 5015 rcu_read_lock(); 5016 5017 p = find_process_by_pid(pid); 5018 if (!p) { 5019 rcu_read_unlock(); 5020 put_online_cpus(); 5021 return -ESRCH; 5022 } 5023 5024 /* Prevent p going away */ 5025 get_task_struct(p); 5026 rcu_read_unlock(); 5027 5028 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 5029 retval = -ENOMEM; 5030 goto out_put_task; 5031 } 5032 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 5033 retval = -ENOMEM; 5034 goto out_free_cpus_allowed; 5035 } 5036 retval = -EPERM; 5037 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5038 goto out_unlock; 5039 5040 retval = security_task_setscheduler(p, 0, NULL); 5041 if (retval) 5042 goto out_unlock; 5043 5044 cpuset_cpus_allowed(p, cpus_allowed); 5045 cpumask_and(new_mask, in_mask, cpus_allowed); 5046 again: 5047 retval = set_cpus_allowed_ptr(p, new_mask); 5048 5049 if (!retval) { 5050 cpuset_cpus_allowed(p, cpus_allowed); 5051 if (!cpumask_subset(new_mask, cpus_allowed)) { 5052 /* 5053 * We must have raced with a concurrent cpuset 5054 * update. Just reset the cpus_allowed to the 5055 * cpuset's cpus_allowed 5056 */ 5057 cpumask_copy(new_mask, cpus_allowed); 5058 goto again; 5059 } 5060 } 5061out_unlock: 5062 free_cpumask_var(new_mask); 5063out_free_cpus_allowed: 5064 free_cpumask_var(cpus_allowed); 5065out_put_task: 5066 put_task_struct(p); 5067 put_online_cpus(); 5068 return retval; 5069} 5070 5071static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 5072 struct cpumask *new_mask) 5073{ 5074 if (len < cpumask_size()) 5075 cpumask_clear(new_mask); 5076 else if (len > cpumask_size()) 5077 len = cpumask_size(); 5078 5079 return copy_from_user(new_mask, user_mask_ptr, len) ? 
-EFAULT : 0; 5080} 5081 5082/** 5083 * sys_sched_setaffinity - set the cpu affinity of a process 5084 * @pid: pid of the process 5085 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 5086 * @user_mask_ptr: user-space pointer to the new cpu mask 5087 */ 5088SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 5089 unsigned long __user *, user_mask_ptr) 5090{ 5091 cpumask_var_t new_mask; 5092 int retval; 5093 5094 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 5095 return -ENOMEM; 5096 5097 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 5098 if (retval == 0) 5099 retval = sched_setaffinity(pid, new_mask); 5100 free_cpumask_var(new_mask); 5101 return retval; 5102} 5103 5104long sched_getaffinity(pid_t pid, struct cpumask *mask) 5105{ 5106 struct task_struct *p; 5107 unsigned long flags; 5108 struct rq *rq; 5109 int retval; 5110 5111 get_online_cpus(); 5112 rcu_read_lock(); 5113 5114 retval = -ESRCH; 5115 p = find_process_by_pid(pid); 5116 if (!p) 5117 goto out_unlock; 5118 5119 retval = security_task_getscheduler(p); 5120 if (retval) 5121 goto out_unlock; 5122 5123 rq = task_rq_lock(p, &flags); 5124 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5125 task_rq_unlock(rq, &flags); 5126 5127out_unlock: 5128 rcu_read_unlock(); 5129 put_online_cpus(); 5130 5131 return retval; 5132} 5133 5134/** 5135 * sys_sched_getaffinity - get the cpu affinity of a process 5136 * @pid: pid of the process 5137 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 5138 * @user_mask_ptr: user-space pointer to hold the current cpu mask 5139 */ 5140SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 5141 unsigned long __user *, user_mask_ptr) 5142{ 5143 int ret; 5144 cpumask_var_t mask; 5145 5146 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 5147 return -EINVAL; 5148 if (len & (sizeof(unsigned long)-1)) 5149 return -EINVAL; 5150 5151 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 5152 return -ENOMEM; 5153 5154 ret = sched_getaffinity(pid, mask); 5155 if (ret == 0) { 5156 size_t retlen = min_t(size_t, len, cpumask_size()); 5157 5158 if (copy_to_user(user_mask_ptr, mask, retlen)) 5159 ret = -EFAULT; 5160 else 5161 ret = retlen; 5162 } 5163 free_cpumask_var(mask); 5164 5165 return ret; 5166} 5167 5168/** 5169 * sys_sched_yield - yield the current processor to other threads. 5170 * 5171 * This function yields the current CPU to other tasks. If there are no 5172 * other threads running on this CPU then this function will return. 
5173 */ 5174SYSCALL_DEFINE0(sched_yield) 5175{ 5176 struct rq *rq = this_rq_lock(); 5177 5178 schedstat_inc(rq, yld_count); 5179 current->sched_class->yield_task(rq); 5180 5181 /* 5182 * Since we are going to call schedule() anyway, there's 5183 * no need to preempt or enable interrupts: 5184 */ 5185 __release(rq->lock); 5186 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 5187 do_raw_spin_unlock(&rq->lock); 5188 preempt_enable_no_resched(); 5189 5190 schedule(); 5191 5192 return 0; 5193} 5194 5195static inline int should_resched(void) 5196{ 5197 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); 5198} 5199 5200static void __cond_resched(void) 5201{ 5202 add_preempt_count(PREEMPT_ACTIVE); 5203 schedule(); 5204 sub_preempt_count(PREEMPT_ACTIVE); 5205} 5206 5207int __sched _cond_resched(void) 5208{ 5209 if (should_resched()) { 5210 __cond_resched(); 5211 return 1; 5212 } 5213 return 0; 5214} 5215EXPORT_SYMBOL(_cond_resched); 5216 5217/* 5218 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 5219 * call schedule, and on return reacquire the lock. 5220 * 5221 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 5222 * operations here to prevent schedule() from being called twice (once via 5223 * spin_unlock(), once by hand). 5224 */ 5225int __cond_resched_lock(spinlock_t *lock) 5226{ 5227 int resched = should_resched(); 5228 int ret = 0; 5229 5230 lockdep_assert_held(lock); 5231 5232 if (spin_needbreak(lock) || resched) { 5233 spin_unlock(lock); 5234 if (resched) 5235 __cond_resched(); 5236 else 5237 cpu_relax(); 5238 ret = 1; 5239 spin_lock(lock); 5240 } 5241 return ret; 5242} 5243EXPORT_SYMBOL(__cond_resched_lock); 5244 5245int __sched __cond_resched_softirq(void) 5246{ 5247 BUG_ON(!in_softirq()); 5248 5249 if (should_resched()) { 5250 local_bh_enable(); 5251 __cond_resched(); 5252 local_bh_disable(); 5253 return 1; 5254 } 5255 return 0; 5256} 5257EXPORT_SYMBOL(__cond_resched_softirq); 5258 5259/** 5260 * yield - yield the current processor to other threads. 5261 * 5262 * This is a shortcut for kernel-space yielding - it marks the 5263 * thread runnable and calls sys_sched_yield(). 5264 */ 5265void __sched yield(void) 5266{ 5267 set_current_state(TASK_RUNNING); 5268 sys_sched_yield(); 5269} 5270EXPORT_SYMBOL(yield); 5271 5272/* 5273 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5274 * that process accounting knows that this is a task in IO wait state. 5275 */ 5276void __sched io_schedule(void) 5277{ 5278 struct rq *rq = raw_rq(); 5279 5280 delayacct_blkio_start(); 5281 atomic_inc(&rq->nr_iowait); 5282 current->in_iowait = 1; 5283 schedule(); 5284 current->in_iowait = 0; 5285 atomic_dec(&rq->nr_iowait); 5286 delayacct_blkio_end(); 5287} 5288EXPORT_SYMBOL(io_schedule); 5289 5290long __sched io_schedule_timeout(long timeout) 5291{ 5292 struct rq *rq = raw_rq(); 5293 long ret; 5294 5295 delayacct_blkio_start(); 5296 atomic_inc(&rq->nr_iowait); 5297 current->in_iowait = 1; 5298 ret = schedule_timeout(timeout); 5299 current->in_iowait = 0; 5300 atomic_dec(&rq->nr_iowait); 5301 delayacct_blkio_end(); 5302 return ret; 5303} 5304 5305/** 5306 * sys_sched_get_priority_max - return maximum RT priority. 5307 * @policy: scheduling class. 5308 * 5309 * this syscall returns the maximum rt_priority that can be used 5310 * by a given scheduling class. 
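 *
 * Illustrative results with the default MAX_USER_RT_PRIO of 100:
 *
 *	SCHED_FIFO, SCHED_RR			-> 99
 *	SCHED_NORMAL, SCHED_BATCH, SCHED_IDLE	-> 0
 *	anything else				-> -EINVAL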
5311 */ 5312SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 5313{ 5314 int ret = -EINVAL; 5315 5316 switch (policy) { 5317 case SCHED_FIFO: 5318 case SCHED_RR: 5319 ret = MAX_USER_RT_PRIO-1; 5320 break; 5321 case SCHED_NORMAL: 5322 case SCHED_BATCH: 5323 case SCHED_IDLE: 5324 ret = 0; 5325 break; 5326 } 5327 return ret; 5328} 5329 5330/** 5331 * sys_sched_get_priority_min - return minimum RT priority. 5332 * @policy: scheduling class. 5333 * 5334 * this syscall returns the minimum rt_priority that can be used 5335 * by a given scheduling class. 5336 */ 5337SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 5338{ 5339 int ret = -EINVAL; 5340 5341 switch (policy) { 5342 case SCHED_FIFO: 5343 case SCHED_RR: 5344 ret = 1; 5345 break; 5346 case SCHED_NORMAL: 5347 case SCHED_BATCH: 5348 case SCHED_IDLE: 5349 ret = 0; 5350 } 5351 return ret; 5352} 5353 5354/** 5355 * sys_sched_rr_get_interval - return the default timeslice of a process. 5356 * @pid: pid of the process. 5357 * @interval: userspace pointer to the timeslice value. 5358 * 5359 * this syscall writes the default timeslice value of a given process 5360 * into the user-space timespec buffer. A value of '0' means infinity. 5361 */ 5362SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 5363 struct timespec __user *, interval) 5364{ 5365 struct task_struct *p; 5366 unsigned int time_slice; 5367 unsigned long flags; 5368 struct rq *rq; 5369 int retval; 5370 struct timespec t; 5371 5372 if (pid < 0) 5373 return -EINVAL; 5374 5375 retval = -ESRCH; 5376 rcu_read_lock(); 5377 p = find_process_by_pid(pid); 5378 if (!p) 5379 goto out_unlock; 5380 5381 retval = security_task_getscheduler(p); 5382 if (retval) 5383 goto out_unlock; 5384 5385 rq = task_rq_lock(p, &flags); 5386 time_slice = p->sched_class->get_rr_interval(rq, p); 5387 task_rq_unlock(rq, &flags); 5388 5389 rcu_read_unlock(); 5390 jiffies_to_timespec(time_slice, &t); 5391 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 5392 return retval; 5393 5394out_unlock: 5395 rcu_read_unlock(); 5396 return retval; 5397} 5398 5399static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 5400 5401void sched_show_task(struct task_struct *p) 5402{ 5403 unsigned long free = 0; 5404 unsigned state; 5405 5406 state = p->state ? __ffs(p->state) + 1 : 0; 5407 printk(KERN_INFO "%-13.13s %c", p->comm, 5408 state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); 5409#if BITS_PER_LONG == 32 5410 if (state == TASK_RUNNING) 5411 printk(KERN_CONT " running "); 5412 else 5413 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 5414#else 5415 if (state == TASK_RUNNING) 5416 printk(KERN_CONT " running task "); 5417 else 5418 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 5419#endif 5420#ifdef CONFIG_DEBUG_STACK_USAGE 5421 free = stack_not_used(p); 5422#endif 5423 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 5424 task_pid_nr(p), task_pid_nr(p->real_parent), 5425 (unsigned long)task_thread_info(p)->flags); 5426 5427 show_stack(p, NULL); 5428} 5429 5430void show_state_filter(unsigned long state_filter) 5431{ 5432 struct task_struct *g, *p; 5433 5434#if BITS_PER_LONG == 32 5435 printk(KERN_INFO 5436 " task PC stack pid father\n"); 5437#else 5438 printk(KERN_INFO 5439 " task PC stack pid father\n"); 5440#endif 5441 read_lock(&tasklist_lock); 5442 do_each_thread(g, p) { 5443 /* 5444 * reset the NMI-timeout, listing all files on a slow 5445 * console might take alot of time: 5446 */ 5447 touch_nmi_watchdog(); 5448 if (!state_filter || (p->state & state_filter)) 5449 sched_show_task(p); 5450 } while_each_thread(g, p); 5451 5452 touch_all_softlockup_watchdogs(); 5453 5454#ifdef CONFIG_SCHED_DEBUG 5455 sysrq_sched_debug_show(); 5456#endif 5457 read_unlock(&tasklist_lock); 5458 /* 5459 * Only show locks if all tasks are dumped: 5460 */ 5461 if (!state_filter) 5462 debug_show_all_locks(); 5463} 5464 5465void __cpuinit init_idle_bootup_task(struct task_struct *idle) 5466{ 5467 idle->sched_class = &idle_sched_class; 5468} 5469 5470/** 5471 * init_idle - set up an idle thread for a given CPU 5472 * @idle: task in question 5473 * @cpu: cpu the idle task belongs to 5474 * 5475 * NOTE: this function does not set the idle thread's NEED_RESCHED 5476 * flag, to make booting more robust. 5477 */ 5478void __cpuinit init_idle(struct task_struct *idle, int cpu) 5479{ 5480 struct rq *rq = cpu_rq(cpu); 5481 unsigned long flags; 5482 5483 raw_spin_lock_irqsave(&rq->lock, flags); 5484 5485 __sched_fork(idle); 5486 idle->state = TASK_RUNNING; 5487 idle->se.exec_start = sched_clock(); 5488 5489 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5490 /* 5491 * We're having a chicken and egg problem, even though we are 5492 * holding rq->lock, the cpu isn't yet set to this cpu so the 5493 * lockdep check in task_group() will fail. 5494 * 5495 * Similar case to sched_fork(). / Alternatively we could 5496 * use task_rq_lock() here and obtain the other rq->lock. 5497 * 5498 * Silence PROVE_RCU 5499 */ 5500 rcu_read_lock(); 5501 __set_task_cpu(idle, cpu); 5502 rcu_read_unlock(); 5503 5504 rq->curr = rq->idle = idle; 5505#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5506 idle->oncpu = 1; 5507#endif 5508 raw_spin_unlock_irqrestore(&rq->lock, flags); 5509 5510 /* Set the preempt count _outside_ the spinlocks! */ 5511#if defined(CONFIG_PREEMPT) 5512 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 5513#else 5514 task_thread_info(idle)->preempt_count = 0; 5515#endif 5516 /* 5517 * The idle tasks have their own, simple scheduling class: 5518 */ 5519 idle->sched_class = &idle_sched_class; 5520 ftrace_graph_init_task(idle); 5521} 5522 5523/* 5524 * In a system that switches off the HZ timer nohz_cpu_mask 5525 * indicates which cpus entered this state. This is used 5526 * in the rcu update to wait only for active cpus. For system 5527 * which do not switch off the HZ timer nohz_cpu_mask should 5528 * always be CPU_BITS_NONE. 
5529 */ 5530cpumask_var_t nohz_cpu_mask; 5531 5532/* 5533 * Increase the granularity value when there are more CPUs, 5534 * because with more CPUs the 'effective latency' as visible 5535 * to users decreases. But the relationship is not linear, 5536 * so pick a second-best guess by going with the log2 of the 5537 * number of CPUs. 5538 * 5539 * This idea comes from the SD scheduler of Con Kolivas: 5540 */ 5541static int get_update_sysctl_factor(void) 5542{ 5543 unsigned int cpus = min_t(int, num_online_cpus(), 8); 5544 unsigned int factor; 5545 5546 switch (sysctl_sched_tunable_scaling) { 5547 case SCHED_TUNABLESCALING_NONE: 5548 factor = 1; 5549 break; 5550 case SCHED_TUNABLESCALING_LINEAR: 5551 factor = cpus; 5552 break; 5553 case SCHED_TUNABLESCALING_LOG: 5554 default: 5555 factor = 1 + ilog2(cpus); 5556 break; 5557 } 5558 5559 return factor; 5560} 5561 5562static void update_sysctl(void) 5563{ 5564 unsigned int factor = get_update_sysctl_factor(); 5565 5566#define SET_SYSCTL(name) \ 5567 (sysctl_##name = (factor) * normalized_sysctl_##name) 5568 SET_SYSCTL(sched_min_granularity); 5569 SET_SYSCTL(sched_latency); 5570 SET_SYSCTL(sched_wakeup_granularity); 5571 SET_SYSCTL(sched_shares_ratelimit); 5572#undef SET_SYSCTL 5573} 5574 5575static inline void sched_init_granularity(void) 5576{ 5577 update_sysctl(); 5578} 5579 5580#ifdef CONFIG_SMP 5581/* 5582 * This is how migration works: 5583 * 5584 * 1) we invoke migration_cpu_stop() on the target CPU using 5585 * stop_one_cpu(). 5586 * 2) stopper starts to run (implicitly forcing the migrated thread 5587 * off the CPU) 5588 * 3) it checks whether the migrated task is still in the wrong runqueue. 5589 * 4) if it's in the wrong runqueue then the migration thread removes 5590 * it and puts it into the right queue. 5591 * 5) stopper completes and stop_one_cpu() returns and the migration 5592 * is done. 5593 */ 5594 5595/* 5596 * Change a given task's CPU affinity. Migrate the thread to a 5597 * proper CPU and schedule it away if the CPU it's executing on 5598 * is removed from the allowed bitmask. 5599 * 5600 * NOTE: the caller must have a valid reference to the task, the 5601 * task must not exit() & deallocate itself prematurely. The 5602 * call is not atomic; no spinlocks may be held. 5603 */ 5604int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5605{ 5606 unsigned long flags; 5607 struct rq *rq; 5608 unsigned int dest_cpu; 5609 int ret = 0; 5610 5611 /* 5612 * Serialize against TASK_WAKING so that ttwu() and wunt() can 5613 * drop the rq->lock and still rely on ->cpus_allowed. 5614 */ 5615again: 5616 while (task_is_waking(p)) 5617 cpu_relax(); 5618 rq = task_rq_lock(p, &flags); 5619 if (task_is_waking(p)) { 5620 task_rq_unlock(rq, &flags); 5621 goto again; 5622 } 5623 5624 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5625 ret = -EINVAL; 5626 goto out; 5627 } 5628 5629 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5630 !cpumask_equal(&p->cpus_allowed, new_mask))) { 5631 ret = -EINVAL; 5632 goto out; 5633 } 5634 5635 if (p->sched_class->set_cpus_allowed) 5636 p->sched_class->set_cpus_allowed(p, new_mask); 5637 else { 5638 cpumask_copy(&p->cpus_allowed, new_mask); 5639 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5640 } 5641 5642 /* Can the task run on the task's current CPU? 
If so, we're done */ 5643 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5644 goto out; 5645 5646 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5647 if (migrate_task(p, dest_cpu)) { 5648 struct migration_arg arg = { p, dest_cpu }; 5649 /* Need help from migration thread: drop lock and wait. */ 5650 task_rq_unlock(rq, &flags); 5651 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 5652 tlb_migrate_finish(p->mm); 5653 return 0; 5654 } 5655out: 5656 task_rq_unlock(rq, &flags); 5657 5658 return ret; 5659} 5660EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 5661 5662/* 5663 * Move (not current) task off this cpu, onto dest cpu. We're doing 5664 * this because either it can't run here any more (set_cpus_allowed() 5665 * away from this CPU, or CPU going down), or because we're 5666 * attempting to rebalance this task on exec (sched_exec). 5667 * 5668 * So we race with normal scheduler movements, but that's OK, as long 5669 * as the task is no longer on this CPU. 5670 * 5671 * Returns non-zero if task was successfully migrated. 5672 */ 5673static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 5674{ 5675 struct rq *rq_dest, *rq_src; 5676 int ret = 0; 5677 5678 if (unlikely(!cpu_active(dest_cpu))) 5679 return ret; 5680 5681 rq_src = cpu_rq(src_cpu); 5682 rq_dest = cpu_rq(dest_cpu); 5683 5684 double_rq_lock(rq_src, rq_dest); 5685 /* Already moved. */ 5686 if (task_cpu(p) != src_cpu) 5687 goto done; 5688 /* Affinity changed (again). */ 5689 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 5690 goto fail; 5691 5692 /* 5693 * If we're not on a rq, the next wake-up will ensure we're 5694 * placed properly. 5695 */ 5696 if (p->se.on_rq) { 5697 deactivate_task(rq_src, p, 0); 5698 set_task_cpu(p, dest_cpu); 5699 activate_task(rq_dest, p, 0); 5700 check_preempt_curr(rq_dest, p, 0); 5701 } 5702done: 5703 ret = 1; 5704fail: 5705 double_rq_unlock(rq_src, rq_dest); 5706 return ret; 5707} 5708 5709/* 5710 * migration_cpu_stop - this will be executed by a highprio stopper thread 5711 * and performs thread migration by bumping thread off CPU then 5712 * 'pushing' onto another runqueue. 5713 */ 5714static int migration_cpu_stop(void *data) 5715{ 5716 struct migration_arg *arg = data; 5717 5718 /* 5719 * The original target cpu might have gone down and we might 5720 * be on another cpu but it doesn't matter. 5721 */ 5722 local_irq_disable(); 5723 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 5724 local_irq_enable(); 5725 return 0; 5726} 5727 5728#ifdef CONFIG_HOTPLUG_CPU 5729/* 5730 * Figure out where task on dead CPU should go, use force if necessary. 5731 */ 5732void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5733{ 5734 struct rq *rq = cpu_rq(dead_cpu); 5735 int needs_cpu, uninitialized_var(dest_cpu); 5736 unsigned long flags; 5737 5738 local_irq_save(flags); 5739 5740 raw_spin_lock(&rq->lock); 5741 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 5742 if (needs_cpu) 5743 dest_cpu = select_fallback_rq(dead_cpu, p); 5744 raw_spin_unlock(&rq->lock); 5745 /* 5746 * It can only fail if we race with set_cpus_allowed(), 5747 * in the racer should migrate the task anyway. 5748 */ 5749 if (needs_cpu) 5750 __migrate_task(p, dead_cpu, dest_cpu); 5751 local_irq_restore(flags); 5752} 5753 5754/* 5755 * While a dead CPU has no uninterruptible tasks queued at this point, 5756 * it might still have a nonzero ->nr_uninterruptible counter, because 5757 * for performance reasons the counter is not stricly tracking tasks to 5758 * their home CPUs. 
So we just add the counter to another CPU's counter, 5759 * to keep the global sum constant after CPU-down: 5760 */ 5761static void migrate_nr_uninterruptible(struct rq *rq_src) 5762{ 5763 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5764 unsigned long flags; 5765 5766 local_irq_save(flags); 5767 double_rq_lock(rq_src, rq_dest); 5768 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5769 rq_src->nr_uninterruptible = 0; 5770 double_rq_unlock(rq_src, rq_dest); 5771 local_irq_restore(flags); 5772} 5773 5774/* Run through task list and migrate tasks from the dead cpu. */ 5775static void migrate_live_tasks(int src_cpu) 5776{ 5777 struct task_struct *p, *t; 5778 5779 read_lock(&tasklist_lock); 5780 5781 do_each_thread(t, p) { 5782 if (p == current) 5783 continue; 5784 5785 if (task_cpu(p) == src_cpu) 5786 move_task_off_dead_cpu(src_cpu, p); 5787 } while_each_thread(t, p); 5788 5789 read_unlock(&tasklist_lock); 5790} 5791 5792/* 5793 * Schedules idle task to be the next runnable task on current CPU. 5794 * It does so by boosting its priority to highest possible. 5795 * Used by CPU offline code. 5796 */ 5797void sched_idle_next(void) 5798{ 5799 int this_cpu = smp_processor_id(); 5800 struct rq *rq = cpu_rq(this_cpu); 5801 struct task_struct *p = rq->idle; 5802 unsigned long flags; 5803 5804 /* cpu has to be offline */ 5805 BUG_ON(cpu_online(this_cpu)); 5806 5807 /* 5808 * Strictly not necessary since rest of the CPUs are stopped by now 5809 * and interrupts disabled on the current cpu. 5810 */ 5811 raw_spin_lock_irqsave(&rq->lock, flags); 5812 5813 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5814 5815 activate_task(rq, p, 0); 5816 5817 raw_spin_unlock_irqrestore(&rq->lock, flags); 5818} 5819 5820/* 5821 * Ensures that the idle task is using init_mm right before its cpu goes 5822 * offline. 5823 */ 5824void idle_task_exit(void) 5825{ 5826 struct mm_struct *mm = current->active_mm; 5827 5828 BUG_ON(cpu_online(smp_processor_id())); 5829 5830 if (mm != &init_mm) 5831 switch_mm(mm, &init_mm, current); 5832 mmdrop(mm); 5833} 5834 5835/* called under rq->lock with disabled interrupts */ 5836static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 5837{ 5838 struct rq *rq = cpu_rq(dead_cpu); 5839 5840 /* Must be exiting, otherwise would be on tasklist. */ 5841 BUG_ON(!p->exit_state); 5842 5843 /* Cannot have done final schedule yet: would have vanished. */ 5844 BUG_ON(p->state == TASK_DEAD); 5845 5846 get_task_struct(p); 5847 5848 /* 5849 * Drop lock around migration; if someone else moves it, 5850 * that's OK. No task can be added to this CPU, so iteration is 5851 * fine. 5852 */ 5853 raw_spin_unlock_irq(&rq->lock); 5854 move_task_off_dead_cpu(dead_cpu, p); 5855 raw_spin_lock_irq(&rq->lock); 5856 5857 put_task_struct(p); 5858} 5859 5860/* release_task() removes task from tasklist, so we won't find dead tasks. */ 5861static void migrate_dead_tasks(unsigned int dead_cpu) 5862{ 5863 struct rq *rq = cpu_rq(dead_cpu); 5864 struct task_struct *next; 5865 5866 for ( ; ; ) { 5867 if (!rq->nr_running) 5868 break; 5869 next = pick_next_task(rq); 5870 if (!next) 5871 break; 5872 next->sched_class->put_prev_task(rq, next); 5873 migrate_dead(dead_cpu, next); 5874 5875 } 5876} 5877 5878/* 5879 * remove the tasks which were accounted by rq from calc_load_tasks. 
5880 */ 5881static void calc_global_load_remove(struct rq *rq) 5882{ 5883 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 5884 rq->calc_load_active = 0; 5885} 5886#endif /* CONFIG_HOTPLUG_CPU */ 5887 5888#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5889 5890static struct ctl_table sd_ctl_dir[] = { 5891 { 5892 .procname = "sched_domain", 5893 .mode = 0555, 5894 }, 5895 {} 5896}; 5897 5898static struct ctl_table sd_ctl_root[] = { 5899 { 5900 .procname = "kernel", 5901 .mode = 0555, 5902 .child = sd_ctl_dir, 5903 }, 5904 {} 5905}; 5906 5907static struct ctl_table *sd_alloc_ctl_entry(int n) 5908{ 5909 struct ctl_table *entry = 5910 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 5911 5912 return entry; 5913} 5914 5915static void sd_free_ctl_entry(struct ctl_table **tablep) 5916{ 5917 struct ctl_table *entry; 5918 5919 /* 5920 * In the intermediate directories, both the child directory and 5921 * procname are dynamically allocated and could fail but the mode 5922 * will always be set. In the lowest directory the names are 5923 * static strings and all have proc handlers. 5924 */ 5925 for (entry = *tablep; entry->mode; entry++) { 5926 if (entry->child) 5927 sd_free_ctl_entry(&entry->child); 5928 if (entry->proc_handler == NULL) 5929 kfree(entry->procname); 5930 } 5931 5932 kfree(*tablep); 5933 *tablep = NULL; 5934} 5935 5936static void 5937set_table_entry(struct ctl_table *entry, 5938 const char *procname, void *data, int maxlen, 5939 mode_t mode, proc_handler *proc_handler) 5940{ 5941 entry->procname = procname; 5942 entry->data = data; 5943 entry->maxlen = maxlen; 5944 entry->mode = mode; 5945 entry->proc_handler = proc_handler; 5946} 5947 5948static struct ctl_table * 5949sd_alloc_ctl_domain_table(struct sched_domain *sd) 5950{ 5951 struct ctl_table *table = sd_alloc_ctl_entry(13); 5952 5953 if (table == NULL) 5954 return NULL; 5955 5956 set_table_entry(&table[0], "min_interval", &sd->min_interval, 5957 sizeof(long), 0644, proc_doulongvec_minmax); 5958 set_table_entry(&table[1], "max_interval", &sd->max_interval, 5959 sizeof(long), 0644, proc_doulongvec_minmax); 5960 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 5961 sizeof(int), 0644, proc_dointvec_minmax); 5962 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 5963 sizeof(int), 0644, proc_dointvec_minmax); 5964 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 5965 sizeof(int), 0644, proc_dointvec_minmax); 5966 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 5967 sizeof(int), 0644, proc_dointvec_minmax); 5968 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 5969 sizeof(int), 0644, proc_dointvec_minmax); 5970 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 5971 sizeof(int), 0644, proc_dointvec_minmax); 5972 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 5973 sizeof(int), 0644, proc_dointvec_minmax); 5974 set_table_entry(&table[9], "cache_nice_tries", 5975 &sd->cache_nice_tries, 5976 sizeof(int), 0644, proc_dointvec_minmax); 5977 set_table_entry(&table[10], "flags", &sd->flags, 5978 sizeof(int), 0644, proc_dointvec_minmax); 5979 set_table_entry(&table[11], "name", sd->name, 5980 CORENAME_MAX_SIZE, 0444, proc_dostring); 5981 /* &table[12] is terminator */ 5982 5983 return table; 5984} 5985 5986static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 5987{ 5988 struct ctl_table *entry, *table; 5989 struct sched_domain *sd; 5990 int domain_num = 0, i; 5991 char buf[32]; 5992 5993 for_each_domain(cpu, sd) 5994 domain_num++; 5995 entry = table = 
sd_alloc_ctl_entry(domain_num + 1); 5996 if (table == NULL) 5997 return NULL; 5998 5999 i = 0; 6000 for_each_domain(cpu, sd) { 6001 snprintf(buf, 32, "domain%d", i); 6002 entry->procname = kstrdup(buf, GFP_KERNEL); 6003 entry->mode = 0555; 6004 entry->child = sd_alloc_ctl_domain_table(sd); 6005 entry++; 6006 i++; 6007 } 6008 return table; 6009} 6010 6011static struct ctl_table_header *sd_sysctl_header; 6012static void register_sched_domain_sysctl(void) 6013{ 6014 int i, cpu_num = num_possible_cpus(); 6015 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 6016 char buf[32]; 6017 6018 WARN_ON(sd_ctl_dir[0].child); 6019 sd_ctl_dir[0].child = entry; 6020 6021 if (entry == NULL) 6022 return; 6023 6024 for_each_possible_cpu(i) { 6025 snprintf(buf, 32, "cpu%d", i); 6026 entry->procname = kstrdup(buf, GFP_KERNEL); 6027 entry->mode = 0555; 6028 entry->child = sd_alloc_ctl_cpu_table(i); 6029 entry++; 6030 } 6031 6032 WARN_ON(sd_sysctl_header); 6033 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 6034} 6035 6036/* may be called multiple times per register */ 6037static void unregister_sched_domain_sysctl(void) 6038{ 6039 if (sd_sysctl_header) 6040 unregister_sysctl_table(sd_sysctl_header); 6041 sd_sysctl_header = NULL; 6042 if (sd_ctl_dir[0].child) 6043 sd_free_ctl_entry(&sd_ctl_dir[0].child); 6044} 6045#else 6046static void register_sched_domain_sysctl(void) 6047{ 6048} 6049static void unregister_sched_domain_sysctl(void) 6050{ 6051} 6052#endif 6053 6054static void set_rq_online(struct rq *rq) 6055{ 6056 if (!rq->online) { 6057 const struct sched_class *class; 6058 6059 cpumask_set_cpu(rq->cpu, rq->rd->online); 6060 rq->online = 1; 6061 6062 for_each_class(class) { 6063 if (class->rq_online) 6064 class->rq_online(rq); 6065 } 6066 } 6067} 6068 6069static void set_rq_offline(struct rq *rq) 6070{ 6071 if (rq->online) { 6072 const struct sched_class *class; 6073 6074 for_each_class(class) { 6075 if (class->rq_offline) 6076 class->rq_offline(rq); 6077 } 6078 6079 cpumask_clear_cpu(rq->cpu, rq->rd->online); 6080 rq->online = 0; 6081 } 6082} 6083 6084/* 6085 * migration_call - callback that gets triggered when a CPU is added. 6086 * Here we can start up the necessary migration thread for the new CPU. 
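 *
 * Rough lifecycle sketch (illustrative, non-exhaustive):
 *
 *	CPU_UP_PREPARE	-> reset rq->calc_load_update for the new CPU
 *	CPU_ONLINE	-> attach the runqueue to its root domain (set_rq_online)
 *	CPU_DYING	-> detach it again (set_rq_offline)
 *	CPU_DEAD	-> migrate any remaining tasks off the dead runqueue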
6087 */ 6088static int __cpuinit 6089migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 6090{ 6091 int cpu = (long)hcpu; 6092 unsigned long flags; 6093 struct rq *rq = cpu_rq(cpu); 6094 6095 switch (action) { 6096 6097 case CPU_UP_PREPARE: 6098 case CPU_UP_PREPARE_FROZEN: 6099 rq->calc_load_update = calc_load_update; 6100 break; 6101 6102 case CPU_ONLINE: 6103 case CPU_ONLINE_FROZEN: 6104 /* Update our root-domain */ 6105 raw_spin_lock_irqsave(&rq->lock, flags); 6106 if (rq->rd) { 6107 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6108 6109 set_rq_online(rq); 6110 } 6111 raw_spin_unlock_irqrestore(&rq->lock, flags); 6112 break; 6113 6114#ifdef CONFIG_HOTPLUG_CPU 6115 case CPU_DEAD: 6116 case CPU_DEAD_FROZEN: 6117 migrate_live_tasks(cpu); 6118 /* Idle task back to normal (off runqueue, low prio) */ 6119 raw_spin_lock_irq(&rq->lock); 6120 deactivate_task(rq, rq->idle, 0); 6121 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 6122 rq->idle->sched_class = &idle_sched_class; 6123 migrate_dead_tasks(cpu); 6124 raw_spin_unlock_irq(&rq->lock); 6125 migrate_nr_uninterruptible(rq); 6126 BUG_ON(rq->nr_running != 0); 6127 calc_global_load_remove(rq); 6128 break; 6129 6130 case CPU_DYING: 6131 case CPU_DYING_FROZEN: 6132 /* Update our root-domain */ 6133 raw_spin_lock_irqsave(&rq->lock, flags); 6134 if (rq->rd) { 6135 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6136 set_rq_offline(rq); 6137 } 6138 raw_spin_unlock_irqrestore(&rq->lock, flags); 6139 break; 6140#endif 6141 } 6142 return NOTIFY_OK; 6143} 6144 6145/* 6146 * Register at high priority so that task migration (migrate_all_tasks) 6147 * happens before everything else. This has to be lower priority than 6148 * the notifier in the perf_event subsystem, though. 6149 */ 6150static struct notifier_block __cpuinitdata migration_notifier = { 6151 .notifier_call = migration_call, 6152 .priority = CPU_PRI_MIGRATION, 6153}; 6154 6155static int __cpuinit sched_cpu_active(struct notifier_block *nfb, 6156 unsigned long action, void *hcpu) 6157{ 6158 switch (action & ~CPU_TASKS_FROZEN) { 6159 case CPU_ONLINE: 6160 case CPU_DOWN_FAILED: 6161 set_cpu_active((long)hcpu, true); 6162 return NOTIFY_OK; 6163 default: 6164 return NOTIFY_DONE; 6165 } 6166} 6167 6168static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, 6169 unsigned long action, void *hcpu) 6170{ 6171 switch (action & ~CPU_TASKS_FROZEN) { 6172 case CPU_DOWN_PREPARE: 6173 set_cpu_active((long)hcpu, false); 6174 return NOTIFY_OK; 6175 default: 6176 return NOTIFY_DONE; 6177 } 6178} 6179 6180static int __init migration_init(void) 6181{ 6182 void *cpu = (void *)(long)smp_processor_id(); 6183 int err; 6184 6185 /* Initialize migration for the boot CPU */ 6186 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6187 BUG_ON(err == NOTIFY_BAD); 6188 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6189 register_cpu_notifier(&migration_notifier); 6190 6191 /* Register cpu active notifiers */ 6192 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 6193 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 6194 6195 return 0; 6196} 6197early_initcall(migration_init); 6198#endif 6199 6200#ifdef CONFIG_SMP 6201 6202#ifdef CONFIG_SCHED_DEBUG 6203 6204static __read_mostly int sched_domain_debug_enabled; 6205 6206static int __init sched_domain_debug_setup(char *str) 6207{ 6208 sched_domain_debug_enabled = 1; 6209 6210 return 0; 6211} 6212early_param("sched_debug", sched_domain_debug_setup); 6213 6214static int sched_domain_debug_one(struct sched_domain 
*sd, int cpu, int level, 6215 struct cpumask *groupmask) 6216{ 6217 struct sched_group *group = sd->groups; 6218 char str[256]; 6219 6220 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 6221 cpumask_clear(groupmask); 6222 6223 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6224 6225 if (!(sd->flags & SD_LOAD_BALANCE)) { 6226 printk("does not load-balance\n"); 6227 if (sd->parent) 6228 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 6229 " has parent"); 6230 return -1; 6231 } 6232 6233 printk(KERN_CONT "span %s level %s\n", str, sd->name); 6234 6235 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 6236 printk(KERN_ERR "ERROR: domain->span does not contain " 6237 "CPU%d\n", cpu); 6238 } 6239 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 6240 printk(KERN_ERR "ERROR: domain->groups does not contain" 6241 " CPU%d\n", cpu); 6242 } 6243 6244 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 6245 do { 6246 if (!group) { 6247 printk("\n"); 6248 printk(KERN_ERR "ERROR: group is NULL\n"); 6249 break; 6250 } 6251 6252 if (!group->cpu_power) { 6253 printk(KERN_CONT "\n"); 6254 printk(KERN_ERR "ERROR: domain->cpu_power not " 6255 "set\n"); 6256 break; 6257 } 6258 6259 if (!cpumask_weight(sched_group_cpus(group))) { 6260 printk(KERN_CONT "\n"); 6261 printk(KERN_ERR "ERROR: empty group\n"); 6262 break; 6263 } 6264 6265 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 6266 printk(KERN_CONT "\n"); 6267 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6268 break; 6269 } 6270 6271 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 6272 6273 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6274 6275 printk(KERN_CONT " %s", str); 6276 if (group->cpu_power != SCHED_LOAD_SCALE) { 6277 printk(KERN_CONT " (cpu_power = %d)", 6278 group->cpu_power); 6279 } 6280 6281 group = group->next; 6282 } while (group != sd->groups); 6283 printk(KERN_CONT "\n"); 6284 6285 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 6286 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6287 6288 if (sd->parent && 6289 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 6290 printk(KERN_ERR "ERROR: parent span is not a superset " 6291 "of domain->span\n"); 6292 return 0; 6293} 6294 6295static void sched_domain_debug(struct sched_domain *sd, int cpu) 6296{ 6297 cpumask_var_t groupmask; 6298 int level = 0; 6299 6300 if (!sched_domain_debug_enabled) 6301 return; 6302 6303 if (!sd) { 6304 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 6305 return; 6306 } 6307 6308 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6309 6310 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { 6311 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 6312 return; 6313 } 6314 6315 for (;;) { 6316 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6317 break; 6318 level++; 6319 sd = sd->parent; 6320 if (!sd) 6321 break; 6322 } 6323 free_cpumask_var(groupmask); 6324} 6325#else /* !CONFIG_SCHED_DEBUG */ 6326# define sched_domain_debug(sd, cpu) do { } while (0) 6327#endif /* CONFIG_SCHED_DEBUG */ 6328 6329static int sd_degenerate(struct sched_domain *sd) 6330{ 6331 if (cpumask_weight(sched_domain_span(sd)) == 1) 6332 return 1; 6333 6334 /* Following flags need at least 2 groups */ 6335 if (sd->flags & (SD_LOAD_BALANCE | 6336 SD_BALANCE_NEWIDLE | 6337 SD_BALANCE_FORK | 6338 SD_BALANCE_EXEC | 6339 SD_SHARE_CPUPOWER | 6340 SD_SHARE_PKG_RESOURCES)) { 6341 if (sd->groups != sd->groups->next) 6342 return 0; 6343 } 6344 6345 /* Following flags don't use groups 
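 * (SD_WAKE_AFFINE is meaningful even with a single group, so a domain
 * carrying it is not degenerate)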
*/ 6346 if (sd->flags & (SD_WAKE_AFFINE)) 6347 return 0; 6348 6349 return 1; 6350} 6351 6352static int 6353sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 6354{ 6355 unsigned long cflags = sd->flags, pflags = parent->flags; 6356 6357 if (sd_degenerate(parent)) 6358 return 1; 6359 6360 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 6361 return 0; 6362 6363 /* Flags needing groups don't count if only 1 group in parent */ 6364 if (parent->groups == parent->groups->next) { 6365 pflags &= ~(SD_LOAD_BALANCE | 6366 SD_BALANCE_NEWIDLE | 6367 SD_BALANCE_FORK | 6368 SD_BALANCE_EXEC | 6369 SD_SHARE_CPUPOWER | 6370 SD_SHARE_PKG_RESOURCES); 6371 if (nr_node_ids == 1) 6372 pflags &= ~SD_SERIALIZE; 6373 } 6374 if (~cflags & pflags) 6375 return 0; 6376 6377 return 1; 6378} 6379 6380static void free_rootdomain(struct root_domain *rd) 6381{ 6382 synchronize_sched(); 6383 6384 cpupri_cleanup(&rd->cpupri); 6385 6386 free_cpumask_var(rd->rto_mask); 6387 free_cpumask_var(rd->online); 6388 free_cpumask_var(rd->span); 6389 kfree(rd); 6390} 6391 6392static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6393{ 6394 struct root_domain *old_rd = NULL; 6395 unsigned long flags; 6396 6397 raw_spin_lock_irqsave(&rq->lock, flags); 6398 6399 if (rq->rd) { 6400 old_rd = rq->rd; 6401 6402 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 6403 set_rq_offline(rq); 6404 6405 cpumask_clear_cpu(rq->cpu, old_rd->span); 6406 6407 /* 6408 * If we don't want to free the old_rd yet then 6409 * set old_rd to NULL to skip the freeing later 6410 * in this function: 6411 */ 6412 if (!atomic_dec_and_test(&old_rd->refcount)) 6413 old_rd = NULL; 6414 } 6415 6416 atomic_inc(&rd->refcount); 6417 rq->rd = rd; 6418 6419 cpumask_set_cpu(rq->cpu, rd->span); 6420 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 6421 set_rq_online(rq); 6422 6423 raw_spin_unlock_irqrestore(&rq->lock, flags); 6424 6425 if (old_rd) 6426 free_rootdomain(old_rd); 6427} 6428 6429static int init_rootdomain(struct root_domain *rd) 6430{ 6431 memset(rd, 0, sizeof(*rd)); 6432 6433 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 6434 goto out; 6435 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 6436 goto free_span; 6437 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 6438 goto free_online; 6439 6440 if (cpupri_init(&rd->cpupri) != 0) 6441 goto free_rto_mask; 6442 return 0; 6443 6444free_rto_mask: 6445 free_cpumask_var(rd->rto_mask); 6446free_online: 6447 free_cpumask_var(rd->online); 6448free_span: 6449 free_cpumask_var(rd->span); 6450out: 6451 return -ENOMEM; 6452} 6453 6454static void init_defrootdomain(void) 6455{ 6456 init_rootdomain(&def_root_domain); 6457 6458 atomic_set(&def_root_domain.refcount, 1); 6459} 6460 6461static struct root_domain *alloc_rootdomain(void) 6462{ 6463 struct root_domain *rd; 6464 6465 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 6466 if (!rd) 6467 return NULL; 6468 6469 if (init_rootdomain(rd) != 0) { 6470 kfree(rd); 6471 return NULL; 6472 } 6473 6474 return rd; 6475} 6476 6477/* 6478 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6479 * hold the hotplug lock. 6480 */ 6481static void 6482cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 6483{ 6484 struct rq *rq = cpu_rq(cpu); 6485 struct sched_domain *tmp; 6486 6487 for (tmp = sd; tmp; tmp = tmp->parent) 6488 tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); 6489 6490 /* Remove the sched domains which do not contribute to scheduling.
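 * The loop below walks up the hierarchy: whenever sd_parent_degenerate()
 * reports that a parent adds nothing over its child (same span, no flags
 * that need more than one group), that parent is spliced out by linking
 * tmp->parent to the grandparent; a degenerate base domain is then dropped
 * as well.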
*/ 6491 for (tmp = sd; tmp; ) { 6492 struct sched_domain *parent = tmp->parent; 6493 if (!parent) 6494 break; 6495 6496 if (sd_parent_degenerate(tmp, parent)) { 6497 tmp->parent = parent->parent; 6498 if (parent->parent) 6499 parent->parent->child = tmp; 6500 } else 6501 tmp = tmp->parent; 6502 } 6503 6504 if (sd && sd_degenerate(sd)) { 6505 sd = sd->parent; 6506 if (sd) 6507 sd->child = NULL; 6508 } 6509 6510 sched_domain_debug(sd, cpu); 6511 6512 rq_attach_root(rq, rd); 6513 rcu_assign_pointer(rq->sd, sd); 6514} 6515 6516/* cpus with isolated domains */ 6517static cpumask_var_t cpu_isolated_map; 6518 6519/* Setup the mask of cpus configured for isolated domains */ 6520static int __init isolated_cpu_setup(char *str) 6521{ 6522 alloc_bootmem_cpumask_var(&cpu_isolated_map); 6523 cpulist_parse(str, cpu_isolated_map); 6524 return 1; 6525} 6526 6527__setup("isolcpus=", isolated_cpu_setup); 6528 6529/* 6530 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 6531 * to a function which identifies what group(along with sched group) a CPU 6532 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids 6533 * (due to the fact that we keep track of groups covered with a struct cpumask). 6534 * 6535 * init_sched_build_groups will build a circular linked list of the groups 6536 * covered by the given span, and will set each group's ->cpumask correctly, 6537 * and ->cpu_power to 0. 6538 */ 6539static void 6540init_sched_build_groups(const struct cpumask *span, 6541 const struct cpumask *cpu_map, 6542 int (*group_fn)(int cpu, const struct cpumask *cpu_map, 6543 struct sched_group **sg, 6544 struct cpumask *tmpmask), 6545 struct cpumask *covered, struct cpumask *tmpmask) 6546{ 6547 struct sched_group *first = NULL, *last = NULL; 6548 int i; 6549 6550 cpumask_clear(covered); 6551 6552 for_each_cpu(i, span) { 6553 struct sched_group *sg; 6554 int group = group_fn(i, cpu_map, &sg, tmpmask); 6555 int j; 6556 6557 if (cpumask_test_cpu(i, covered)) 6558 continue; 6559 6560 cpumask_clear(sched_group_cpus(sg)); 6561 sg->cpu_power = 0; 6562 6563 for_each_cpu(j, span) { 6564 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 6565 continue; 6566 6567 cpumask_set_cpu(j, covered); 6568 cpumask_set_cpu(j, sched_group_cpus(sg)); 6569 } 6570 if (!first) 6571 first = sg; 6572 if (last) 6573 last->next = sg; 6574 last = sg; 6575 } 6576 last->next = first; 6577} 6578 6579#define SD_NODES_PER_DOMAIN 16 6580 6581#ifdef CONFIG_NUMA 6582 6583/** 6584 * find_next_best_node - find the next node to include in a sched_domain 6585 * @node: node whose sched_domain we're building 6586 * @used_nodes: nodes already in the sched_domain 6587 * 6588 * Find the next node to include in a given scheduling domain. Simply 6589 * finds the closest node not already in the @used_nodes map. 6590 * 6591 * Should use nodemask_t. 
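 *
 * Illustrative example (hypothetical distances): starting from node 0 with
 * only node 0 in @used_nodes, the populated node with the smallest
 * node_distance(0, n) is returned and marked used, so repeated calls grow
 * the set outward by proximity.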
6592 */ 6593static int find_next_best_node(int node, nodemask_t *used_nodes) 6594{ 6595 int i, n, val, min_val, best_node = 0; 6596 6597 min_val = INT_MAX; 6598 6599 for (i = 0; i < nr_node_ids; i++) { 6600 /* Start at @node */ 6601 n = (node + i) % nr_node_ids; 6602 6603 if (!nr_cpus_node(n)) 6604 continue; 6605 6606 /* Skip already used nodes */ 6607 if (node_isset(n, *used_nodes)) 6608 continue; 6609 6610 /* Simple min distance search */ 6611 val = node_distance(node, n); 6612 6613 if (val < min_val) { 6614 min_val = val; 6615 best_node = n; 6616 } 6617 } 6618 6619 node_set(best_node, *used_nodes); 6620 return best_node; 6621} 6622 6623/** 6624 * sched_domain_node_span - get a cpumask for a node's sched_domain 6625 * @node: node whose cpumask we're constructing 6626 * @span: resulting cpumask 6627 * 6628 * Given a node, construct a good cpumask for its sched_domain to span. It 6629 * should be one that prevents unnecessary balancing, but also spreads tasks 6630 * out optimally. 6631 */ 6632static void sched_domain_node_span(int node, struct cpumask *span) 6633{ 6634 nodemask_t used_nodes; 6635 int i; 6636 6637 cpumask_clear(span); 6638 nodes_clear(used_nodes); 6639 6640 cpumask_or(span, span, cpumask_of_node(node)); 6641 node_set(node, used_nodes); 6642 6643 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6644 int next_node = find_next_best_node(node, &used_nodes); 6645 6646 cpumask_or(span, span, cpumask_of_node(next_node)); 6647 } 6648} 6649#endif /* CONFIG_NUMA */ 6650 6651int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6652 6653/* 6654 * The cpus mask in sched_group and sched_domain hangs off the end. 6655 * 6656 * ( See the the comments in include/linux/sched.h:struct sched_group 6657 * and struct sched_domain. ) 6658 */ 6659struct static_sched_group { 6660 struct sched_group sg; 6661 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); 6662}; 6663 6664struct static_sched_domain { 6665 struct sched_domain sd; 6666 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 6667}; 6668 6669struct s_data { 6670#ifdef CONFIG_NUMA 6671 int sd_allnodes; 6672 cpumask_var_t domainspan; 6673 cpumask_var_t covered; 6674 cpumask_var_t notcovered; 6675#endif 6676 cpumask_var_t nodemask; 6677 cpumask_var_t this_sibling_map; 6678 cpumask_var_t this_core_map; 6679 cpumask_var_t send_covered; 6680 cpumask_var_t tmpmask; 6681 struct sched_group **sched_group_nodes; 6682 struct root_domain *rd; 6683}; 6684 6685enum s_alloc { 6686 sa_sched_groups = 0, 6687 sa_rootdomain, 6688 sa_tmpmask, 6689 sa_send_covered, 6690 sa_this_core_map, 6691 sa_this_sibling_map, 6692 sa_nodemask, 6693 sa_sched_group_nodes, 6694#ifdef CONFIG_NUMA 6695 sa_notcovered, 6696 sa_covered, 6697 sa_domainspan, 6698#endif 6699 sa_none, 6700}; 6701 6702/* 6703 * SMT sched-domains: 6704 */ 6705#ifdef CONFIG_SCHED_SMT 6706static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 6707static DEFINE_PER_CPU(struct static_sched_group, sched_groups); 6708 6709static int 6710cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6711 struct sched_group **sg, struct cpumask *unused) 6712{ 6713 if (sg) 6714 *sg = &per_cpu(sched_groups, cpu).sg; 6715 return cpu; 6716} 6717#endif /* CONFIG_SCHED_SMT */ 6718 6719/* 6720 * multi-core sched-domains: 6721 */ 6722#ifdef CONFIG_SCHED_MC 6723static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6724static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 6725#endif /* CONFIG_SCHED_MC */ 6726 6727#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6728static int 6729cpu_to_core_group(int cpu, 
const struct cpumask *cpu_map, 6730 struct sched_group **sg, struct cpumask *mask) 6731{ 6732 int group; 6733 6734 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6735 group = cpumask_first(mask); 6736 if (sg) 6737 *sg = &per_cpu(sched_group_core, group).sg; 6738 return group; 6739} 6740#elif defined(CONFIG_SCHED_MC) 6741static int 6742cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6743 struct sched_group **sg, struct cpumask *unused) 6744{ 6745 if (sg) 6746 *sg = &per_cpu(sched_group_core, cpu).sg; 6747 return cpu; 6748} 6749#endif 6750 6751static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6752static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6753 6754static int 6755cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, 6756 struct sched_group **sg, struct cpumask *mask) 6757{ 6758 int group; 6759#ifdef CONFIG_SCHED_MC 6760 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 6761 group = cpumask_first(mask); 6762#elif defined(CONFIG_SCHED_SMT) 6763 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6764 group = cpumask_first(mask); 6765#else 6766 group = cpu; 6767#endif 6768 if (sg) 6769 *sg = &per_cpu(sched_group_phys, group).sg; 6770 return group; 6771} 6772 6773#ifdef CONFIG_NUMA 6774/* 6775 * The init_sched_build_groups can't handle what we want to do with node 6776 * groups, so roll our own. Now each node has its own list of groups which 6777 * gets dynamically allocated. 6778 */ 6779static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 6780static struct sched_group ***sched_group_nodes_bycpu; 6781 6782static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); 6783static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); 6784 6785static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, 6786 struct sched_group **sg, 6787 struct cpumask *nodemask) 6788{ 6789 int group; 6790 6791 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); 6792 group = cpumask_first(nodemask); 6793 6794 if (sg) 6795 *sg = &per_cpu(sched_group_allnodes, group).sg; 6796 return group; 6797} 6798 6799static void init_numa_sched_groups_power(struct sched_group *group_head) 6800{ 6801 struct sched_group *sg = group_head; 6802 int j; 6803 6804 if (!sg) 6805 return; 6806 do { 6807 for_each_cpu(j, sched_group_cpus(sg)) { 6808 struct sched_domain *sd; 6809 6810 sd = &per_cpu(phys_domains, j).sd; 6811 if (j != group_first_cpu(sd->groups)) { 6812 /* 6813 * Only add "power" once for each 6814 * physical package. 
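 * (every cpu whose physical group was already counted takes the
 * 'continue' below; only group_first_cpu() reaches the accumulation)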
6815 */ 6816 continue; 6817 } 6818 6819 sg->cpu_power += sd->groups->cpu_power; 6820 } 6821 sg = sg->next; 6822 } while (sg != group_head); 6823} 6824 6825static int build_numa_sched_groups(struct s_data *d, 6826 const struct cpumask *cpu_map, int num) 6827{ 6828 struct sched_domain *sd; 6829 struct sched_group *sg, *prev; 6830 int n, j; 6831 6832 cpumask_clear(d->covered); 6833 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); 6834 if (cpumask_empty(d->nodemask)) { 6835 d->sched_group_nodes[num] = NULL; 6836 goto out; 6837 } 6838 6839 sched_domain_node_span(num, d->domainspan); 6840 cpumask_and(d->domainspan, d->domainspan, cpu_map); 6841 6842 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 6843 GFP_KERNEL, num); 6844 if (!sg) { 6845 printk(KERN_WARNING "Can not alloc domain group for node %d\n", 6846 num); 6847 return -ENOMEM; 6848 } 6849 d->sched_group_nodes[num] = sg; 6850 6851 for_each_cpu(j, d->nodemask) { 6852 sd = &per_cpu(node_domains, j).sd; 6853 sd->groups = sg; 6854 } 6855 6856 sg->cpu_power = 0; 6857 cpumask_copy(sched_group_cpus(sg), d->nodemask); 6858 sg->next = sg; 6859 cpumask_or(d->covered, d->covered, d->nodemask); 6860 6861 prev = sg; 6862 for (j = 0; j < nr_node_ids; j++) { 6863 n = (num + j) % nr_node_ids; 6864 cpumask_complement(d->notcovered, d->covered); 6865 cpumask_and(d->tmpmask, d->notcovered, cpu_map); 6866 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); 6867 if (cpumask_empty(d->tmpmask)) 6868 break; 6869 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); 6870 if (cpumask_empty(d->tmpmask)) 6871 continue; 6872 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 6873 GFP_KERNEL, num); 6874 if (!sg) { 6875 printk(KERN_WARNING 6876 "Can not alloc domain group for node %d\n", j); 6877 return -ENOMEM; 6878 } 6879 sg->cpu_power = 0; 6880 cpumask_copy(sched_group_cpus(sg), d->tmpmask); 6881 sg->next = prev->next; 6882 cpumask_or(d->covered, d->covered, d->tmpmask); 6883 prev->next = sg; 6884 prev = sg; 6885 } 6886out: 6887 return 0; 6888} 6889#endif /* CONFIG_NUMA */ 6890 6891#ifdef CONFIG_NUMA 6892/* Free memory allocated for various sched_group structures */ 6893static void free_sched_groups(const struct cpumask *cpu_map, 6894 struct cpumask *nodemask) 6895{ 6896 int cpu, i; 6897 6898 for_each_cpu(cpu, cpu_map) { 6899 struct sched_group **sched_group_nodes 6900 = sched_group_nodes_bycpu[cpu]; 6901 6902 if (!sched_group_nodes) 6903 continue; 6904 6905 for (i = 0; i < nr_node_ids; i++) { 6906 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6907 6908 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 6909 if (cpumask_empty(nodemask)) 6910 continue; 6911 6912 if (sg == NULL) 6913 continue; 6914 sg = sg->next; 6915next_sg: 6916 oldsg = sg; 6917 sg = sg->next; 6918 kfree(oldsg); 6919 if (oldsg != sched_group_nodes[i]) 6920 goto next_sg; 6921 } 6922 kfree(sched_group_nodes); 6923 sched_group_nodes_bycpu[cpu] = NULL; 6924 } 6925} 6926#else /* !CONFIG_NUMA */ 6927static void free_sched_groups(const struct cpumask *cpu_map, 6928 struct cpumask *nodemask) 6929{ 6930} 6931#endif /* CONFIG_NUMA */ 6932 6933/* 6934 * Initialize sched groups cpu_power. 6935 * 6936 * cpu_power indicates the capacity of sched group, which is used while 6937 * distributing the load between different sched groups in a sched domain. 6938 * Typically cpu_power for all the groups in a sched domain will be same unless 6939 * there are asymmetries in the topology. 
If there are asymmetries, group 6940 * having more cpu_power will pickup more load compared to the group having 6941 * less cpu_power. 6942 */ 6943static void init_sched_groups_power(int cpu, struct sched_domain *sd) 6944{ 6945 struct sched_domain *child; 6946 struct sched_group *group; 6947 long power; 6948 int weight; 6949 6950 WARN_ON(!sd || !sd->groups); 6951 6952 if (cpu != group_first_cpu(sd->groups)) 6953 return; 6954 6955 child = sd->child; 6956 6957 sd->groups->cpu_power = 0; 6958 6959 if (!child) { 6960 power = SCHED_LOAD_SCALE; 6961 weight = cpumask_weight(sched_domain_span(sd)); 6962 /* 6963 * SMT siblings share the power of a single core. 6964 * Usually multiple threads get a better yield out of 6965 * that one core than a single thread would have, 6966 * reflect that in sd->smt_gain. 6967 */ 6968 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 6969 power *= sd->smt_gain; 6970 power /= weight; 6971 power >>= SCHED_LOAD_SHIFT; 6972 } 6973 sd->groups->cpu_power += power; 6974 return; 6975 } 6976 6977 /* 6978 * Add cpu_power of each child group to this groups cpu_power. 6979 */ 6980 group = child->groups; 6981 do { 6982 sd->groups->cpu_power += group->cpu_power; 6983 group = group->next; 6984 } while (group != child->groups); 6985} 6986 6987/* 6988 * Initializers for schedule domains 6989 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 6990 */ 6991 6992#ifdef CONFIG_SCHED_DEBUG 6993# define SD_INIT_NAME(sd, type) sd->name = #type 6994#else 6995# define SD_INIT_NAME(sd, type) do { } while (0) 6996#endif 6997 6998#define SD_INIT(sd, type) sd_init_##type(sd) 6999 7000#define SD_INIT_FUNC(type) \ 7001static noinline void sd_init_##type(struct sched_domain *sd) \ 7002{ \ 7003 memset(sd, 0, sizeof(*sd)); \ 7004 *sd = SD_##type##_INIT; \ 7005 sd->level = SD_LV_##type; \ 7006 SD_INIT_NAME(sd, type); \ 7007} 7008 7009SD_INIT_FUNC(CPU) 7010#ifdef CONFIG_NUMA 7011 SD_INIT_FUNC(ALLNODES) 7012 SD_INIT_FUNC(NODE) 7013#endif 7014#ifdef CONFIG_SCHED_SMT 7015 SD_INIT_FUNC(SIBLING) 7016#endif 7017#ifdef CONFIG_SCHED_MC 7018 SD_INIT_FUNC(MC) 7019#endif 7020 7021static int default_relax_domain_level = -1; 7022 7023static int __init setup_relax_domain_level(char *str) 7024{ 7025 unsigned long val; 7026 7027 val = simple_strtoul(str, NULL, 0); 7028 if (val < SD_LV_MAX) 7029 default_relax_domain_level = val; 7030 7031 return 1; 7032} 7033__setup("relax_domain_level=", setup_relax_domain_level); 7034 7035static void set_domain_attribute(struct sched_domain *sd, 7036 struct sched_domain_attr *attr) 7037{ 7038 int request; 7039 7040 if (!attr || attr->relax_domain_level < 0) { 7041 if (default_relax_domain_level < 0) 7042 return; 7043 else 7044 request = default_relax_domain_level; 7045 } else 7046 request = attr->relax_domain_level; 7047 if (request < sd->level) { 7048 /* turn off idle balance on this domain */ 7049 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 7050 } else { 7051 /* turn on idle balance on this domain */ 7052 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 7053 } 7054} 7055 7056static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7057 const struct cpumask *cpu_map) 7058{ 7059 switch (what) { 7060 case sa_sched_groups: 7061 free_sched_groups(cpu_map, d->tmpmask); /* fall through */ 7062 d->sched_group_nodes = NULL; 7063 case sa_rootdomain: 7064 free_rootdomain(d->rd); /* fall through */ 7065 case sa_tmpmask: 7066 free_cpumask_var(d->tmpmask); /* fall through */ 7067 case sa_send_covered: 7068 free_cpumask_var(d->send_covered); /* 
fall through */ 7069 case sa_this_core_map: 7070 free_cpumask_var(d->this_core_map); /* fall through */ 7071 case sa_this_sibling_map: 7072 free_cpumask_var(d->this_sibling_map); /* fall through */ 7073 case sa_nodemask: 7074 free_cpumask_var(d->nodemask); /* fall through */ 7075 case sa_sched_group_nodes: 7076#ifdef CONFIG_NUMA 7077 kfree(d->sched_group_nodes); /* fall through */ 7078 case sa_notcovered: 7079 free_cpumask_var(d->notcovered); /* fall through */ 7080 case sa_covered: 7081 free_cpumask_var(d->covered); /* fall through */ 7082 case sa_domainspan: 7083 free_cpumask_var(d->domainspan); /* fall through */ 7084#endif 7085 case sa_none: 7086 break; 7087 } 7088} 7089 7090static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7091 const struct cpumask *cpu_map) 7092{ 7093#ifdef CONFIG_NUMA 7094 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7095 return sa_none; 7096 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7097 return sa_domainspan; 7098 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7099 return sa_covered; 7100 /* Allocate the per-node list of sched groups */ 7101 d->sched_group_nodes = kcalloc(nr_node_ids, 7102 sizeof(struct sched_group *), GFP_KERNEL); 7103 if (!d->sched_group_nodes) { 7104 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7105 return sa_notcovered; 7106 } 7107 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; 7108#endif 7109 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) 7110 return sa_sched_group_nodes; 7111 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) 7112 return sa_nodemask; 7113 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 7114 return sa_this_sibling_map; 7115 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 7116 return sa_this_core_map; 7117 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 7118 return sa_send_covered; 7119 d->rd = alloc_rootdomain(); 7120 if (!d->rd) { 7121 printk(KERN_WARNING "Cannot alloc root domain\n"); 7122 return sa_tmpmask; 7123 } 7124 return sa_rootdomain; 7125} 7126 7127static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7128 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7129{ 7130 struct sched_domain *sd = NULL; 7131#ifdef CONFIG_NUMA 7132 struct sched_domain *parent; 7133 7134 d->sd_allnodes = 0; 7135 if (cpumask_weight(cpu_map) > 7136 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { 7137 sd = &per_cpu(allnodes_domains, i).sd; 7138 SD_INIT(sd, ALLNODES); 7139 set_domain_attribute(sd, attr); 7140 cpumask_copy(sched_domain_span(sd), cpu_map); 7141 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); 7142 d->sd_allnodes = 1; 7143 } 7144 parent = sd; 7145 7146 sd = &per_cpu(node_domains, i).sd; 7147 SD_INIT(sd, NODE); 7148 set_domain_attribute(sd, attr); 7149 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 7150 sd->parent = parent; 7151 if (parent) 7152 parent->child = sd; 7153 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); 7154#endif 7155 return sd; 7156} 7157 7158static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7159 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7160 struct sched_domain *parent, int i) 7161{ 7162 struct sched_domain *sd; 7163 sd = &per_cpu(phys_domains, i).sd; 7164 SD_INIT(sd, CPU); 7165 set_domain_attribute(sd, attr); 7166 cpumask_copy(sched_domain_span(sd), d->nodemask); 7167 sd->parent = parent; 7168 if (parent) 7169 parent->child = sd; 7170 cpu_to_phys_group(i, cpu_map, &sd->groups, 
d->tmpmask); 7171 return sd; 7172} 7173 7174static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7175 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7176 struct sched_domain *parent, int i) 7177{ 7178 struct sched_domain *sd = parent; 7179#ifdef CONFIG_SCHED_MC 7180 sd = &per_cpu(core_domains, i).sd; 7181 SD_INIT(sd, MC); 7182 set_domain_attribute(sd, attr); 7183 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); 7184 sd->parent = parent; 7185 parent->child = sd; 7186 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); 7187#endif 7188 return sd; 7189} 7190 7191static struct sched_domain *__build_smt_sched_domain(struct s_data *d, 7192 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7193 struct sched_domain *parent, int i) 7194{ 7195 struct sched_domain *sd = parent; 7196#ifdef CONFIG_SCHED_SMT 7197 sd = &per_cpu(cpu_domains, i).sd; 7198 SD_INIT(sd, SIBLING); 7199 set_domain_attribute(sd, attr); 7200 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); 7201 sd->parent = parent; 7202 parent->child = sd; 7203 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); 7204#endif 7205 return sd; 7206} 7207 7208static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7209 const struct cpumask *cpu_map, int cpu) 7210{ 7211 switch (l) { 7212#ifdef CONFIG_SCHED_SMT 7213 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7214 cpumask_and(d->this_sibling_map, cpu_map, 7215 topology_thread_cpumask(cpu)); 7216 if (cpu == cpumask_first(d->this_sibling_map)) 7217 init_sched_build_groups(d->this_sibling_map, cpu_map, 7218 &cpu_to_cpu_group, 7219 d->send_covered, d->tmpmask); 7220 break; 7221#endif 7222#ifdef CONFIG_SCHED_MC 7223 case SD_LV_MC: /* set up multi-core groups */ 7224 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); 7225 if (cpu == cpumask_first(d->this_core_map)) 7226 init_sched_build_groups(d->this_core_map, cpu_map, 7227 &cpu_to_core_group, 7228 d->send_covered, d->tmpmask); 7229 break; 7230#endif 7231 case SD_LV_CPU: /* set up physical groups */ 7232 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7233 if (!cpumask_empty(d->nodemask)) 7234 init_sched_build_groups(d->nodemask, cpu_map, 7235 &cpu_to_phys_group, 7236 d->send_covered, d->tmpmask); 7237 break; 7238#ifdef CONFIG_NUMA 7239 case SD_LV_ALLNODES: 7240 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7241 d->send_covered, d->tmpmask); 7242 break; 7243#endif 7244 default: 7245 break; 7246 } 7247} 7248 7249/* 7250 * Build sched domains for a given set of cpus and attach the sched domains 7251 * to the individual cpus 7252 */ 7253static int __build_sched_domains(const struct cpumask *cpu_map, 7254 struct sched_domain_attr *attr) 7255{ 7256 enum s_alloc alloc_state = sa_none; 7257 struct s_data d; 7258 struct sched_domain *sd; 7259 int i; 7260#ifdef CONFIG_NUMA 7261 d.sd_allnodes = 0; 7262#endif 7263 7264 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7265 if (alloc_state != sa_rootdomain) 7266 goto error; 7267 alloc_state = sa_sched_groups; 7268 7269 /* 7270 * Set up domains for cpus specified by the cpu_map. 
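 * For each cpu the per-level domains are built top-down (NUMA
 * allnodes/node -> physical -> multi-core -> SMT), then the groups for each
 * level are constructed, cpu_power is initialized bottom-up, and finally
 * the lowest-level domain is attached to the cpu's runqueue.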
7271 */ 7272 for_each_cpu(i, cpu_map) { 7273 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 7274 cpu_map); 7275 7276 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7277 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7278 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 7279 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 7280 } 7281 7282 for_each_cpu(i, cpu_map) { 7283 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7284 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7285 } 7286 7287 /* Set up physical groups */ 7288 for (i = 0; i < nr_node_ids; i++) 7289 build_sched_groups(&d, SD_LV_CPU, cpu_map, i); 7290 7291#ifdef CONFIG_NUMA 7292 /* Set up node groups */ 7293 if (d.sd_allnodes) 7294 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); 7295 7296 for (i = 0; i < nr_node_ids; i++) 7297 if (build_numa_sched_groups(&d, cpu_map, i)) 7298 goto error; 7299#endif 7300 7301 /* Calculate CPU power for physical packages and nodes */ 7302#ifdef CONFIG_SCHED_SMT 7303 for_each_cpu(i, cpu_map) { 7304 sd = &per_cpu(cpu_domains, i).sd; 7305 init_sched_groups_power(i, sd); 7306 } 7307#endif 7308#ifdef CONFIG_SCHED_MC 7309 for_each_cpu(i, cpu_map) { 7310 sd = &per_cpu(core_domains, i).sd; 7311 init_sched_groups_power(i, sd); 7312 } 7313#endif 7314 7315 for_each_cpu(i, cpu_map) { 7316 sd = &per_cpu(phys_domains, i).sd; 7317 init_sched_groups_power(i, sd); 7318 } 7319 7320#ifdef CONFIG_NUMA 7321 for (i = 0; i < nr_node_ids; i++) 7322 init_numa_sched_groups_power(d.sched_group_nodes[i]); 7323 7324 if (d.sd_allnodes) { 7325 struct sched_group *sg; 7326 7327 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7328 d.tmpmask); 7329 init_numa_sched_groups_power(sg); 7330 } 7331#endif 7332 7333 /* Attach the domains */ 7334 for_each_cpu(i, cpu_map) { 7335#ifdef CONFIG_SCHED_SMT 7336 sd = &per_cpu(cpu_domains, i).sd; 7337#elif defined(CONFIG_SCHED_MC) 7338 sd = &per_cpu(core_domains, i).sd; 7339#else 7340 sd = &per_cpu(phys_domains, i).sd; 7341#endif 7342 cpu_attach_domain(sd, d.rd, i); 7343 } 7344 7345 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7346 __free_domain_allocs(&d, sa_tmpmask, cpu_map); 7347 return 0; 7348 7349error: 7350 __free_domain_allocs(&d, alloc_state, cpu_map); 7351 return -ENOMEM; 7352} 7353 7354static int build_sched_domains(const struct cpumask *cpu_map) 7355{ 7356 return __build_sched_domains(cpu_map, NULL); 7357} 7358 7359static cpumask_var_t *doms_cur; /* current sched domains */ 7360static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7361static struct sched_domain_attr *dattr_cur; 7362 /* attribues of custom domains in 'doms_cur' */ 7363 7364/* 7365 * Special case: If a kmalloc of a doms_cur partition (array of 7366 * cpumask) fails, then fallback to a single sched domain, 7367 * as determined by the single cpumask fallback_doms. 7368 */ 7369static cpumask_var_t fallback_doms; 7370 7371/* 7372 * arch_update_cpu_topology lets virtualized architectures update the 7373 * cpu core maps. It is supposed to return 1 if the topology changed 7374 * or 0 if it stayed the same. 
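 * The weak default below never reports a change; architectures whose
 * topology can change at run time override it.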
7375 */ 7376int __attribute__((weak)) arch_update_cpu_topology(void) 7377{ 7378 return 0; 7379} 7380 7381cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 7382{ 7383 int i; 7384 cpumask_var_t *doms; 7385 7386 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 7387 if (!doms) 7388 return NULL; 7389 for (i = 0; i < ndoms; i++) { 7390 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 7391 free_sched_domains(doms, i); 7392 return NULL; 7393 } 7394 } 7395 return doms; 7396} 7397 7398void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 7399{ 7400 unsigned int i; 7401 for (i = 0; i < ndoms; i++) 7402 free_cpumask_var(doms[i]); 7403 kfree(doms); 7404} 7405 7406/* 7407 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7408 * For now this just excludes isolated cpus, but could be used to 7409 * exclude other special cases in the future. 7410 */ 7411static int arch_init_sched_domains(const struct cpumask *cpu_map) 7412{ 7413 int err; 7414 7415 arch_update_cpu_topology(); 7416 ndoms_cur = 1; 7417 doms_cur = alloc_sched_domains(ndoms_cur); 7418 if (!doms_cur) 7419 doms_cur = &fallback_doms; 7420 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7421 dattr_cur = NULL; 7422 err = build_sched_domains(doms_cur[0]); 7423 register_sched_domain_sysctl(); 7424 7425 return err; 7426} 7427 7428static void arch_destroy_sched_domains(const struct cpumask *cpu_map, 7429 struct cpumask *tmpmask) 7430{ 7431 free_sched_groups(cpu_map, tmpmask); 7432} 7433 7434/* 7435 * Detach sched domains from a group of cpus specified in cpu_map 7436 * These cpus will now be attached to the NULL domain 7437 */ 7438static void detach_destroy_domains(const struct cpumask *cpu_map) 7439{ 7440 /* Save because hotplug lock held. */ 7441 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); 7442 int i; 7443 7444 for_each_cpu(i, cpu_map) 7445 cpu_attach_domain(NULL, &def_root_domain, i); 7446 synchronize_sched(); 7447 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); 7448} 7449 7450/* handle null as "default" */ 7451static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 7452 struct sched_domain_attr *new, int idx_new) 7453{ 7454 struct sched_domain_attr tmp; 7455 7456 /* fast path */ 7457 if (!new && !cur) 7458 return 1; 7459 7460 tmp = SD_ATTR_INIT; 7461 return !memcmp(cur ? (cur + idx_cur) : &tmp, 7462 new ? (new + idx_new) : &tmp, 7463 sizeof(struct sched_domain_attr)); 7464} 7465 7466/* 7467 * Partition sched domains as specified by the 'ndoms_new' 7468 * cpumasks in the array doms_new[] of cpumasks. This compares 7469 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7470 * It destroys each deleted domain and builds each new domain. 7471 * 7472 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 7473 * The masks don't intersect (don't overlap.) We should setup one 7474 * sched domain for each mask. CPUs not in any of the cpumasks will 7475 * not be load balanced. If the same cpumask appears both in the 7476 * current 'doms_cur' domains and in the new 'doms_new', we can leave 7477 * it as it is. 7478 * 7479 * The passed in 'doms_new' should be allocated using 7480 * alloc_sched_domains. This routine takes ownership of it and will 7481 * free_sched_domains it when done with it. If the caller failed the 7482 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 7483 * and partition_sched_domains() will fallback to the single partition 7484 * 'fallback_doms', it also forces the domains to be rebuilt. 
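 *
 * Illustrative example: with doms_cur = { {0,1}, {2,3} } and
 * doms_new = { {2,3}, {4,5} }, the {0,1} domain is detached and destroyed,
 * {2,3} is left untouched because it matches an existing partition, and a
 * new domain is built for {4,5}.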
7485 * 7486 * If doms_new == NULL it will be replaced with cpu_online_mask. 7487 * ndoms_new == 0 is a special case for destroying existing domains, 7488 * and it will not create the default domain. 7489 * 7490 * Call with hotplug lock held 7491 */ 7492void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 7493 struct sched_domain_attr *dattr_new) 7494{ 7495 int i, j, n; 7496 int new_topology; 7497 7498 mutex_lock(&sched_domains_mutex); 7499 7500 /* always unregister in case we don't destroy any domains */ 7501 unregister_sched_domain_sysctl(); 7502 7503 /* Let architecture update cpu core mappings. */ 7504 new_topology = arch_update_cpu_topology(); 7505 7506 n = doms_new ? ndoms_new : 0; 7507 7508 /* Destroy deleted domains */ 7509 for (i = 0; i < ndoms_cur; i++) { 7510 for (j = 0; j < n && !new_topology; j++) { 7511 if (cpumask_equal(doms_cur[i], doms_new[j]) 7512 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7513 goto match1; 7514 } 7515 /* no match - a current sched domain not in new doms_new[] */ 7516 detach_destroy_domains(doms_cur[i]); 7517match1: 7518 ; 7519 } 7520 7521 if (doms_new == NULL) { 7522 ndoms_cur = 0; 7523 doms_new = &fallback_doms; 7524 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 7525 WARN_ON_ONCE(dattr_new); 7526 } 7527 7528 /* Build new domains */ 7529 for (i = 0; i < ndoms_new; i++) { 7530 for (j = 0; j < ndoms_cur && !new_topology; j++) { 7531 if (cpumask_equal(doms_new[i], doms_cur[j]) 7532 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7533 goto match2; 7534 } 7535 /* no match - add a new doms_new */ 7536 __build_sched_domains(doms_new[i], 7537 dattr_new ? dattr_new + i : NULL); 7538match2: 7539 ; 7540 } 7541 7542 /* Remember the new sched domains */ 7543 if (doms_cur != &fallback_doms) 7544 free_sched_domains(doms_cur, ndoms_cur); 7545 kfree(dattr_cur); /* kfree(NULL) is safe */ 7546 doms_cur = doms_new; 7547 dattr_cur = dattr_new; 7548 ndoms_cur = ndoms_new; 7549 7550 register_sched_domain_sysctl(); 7551 7552 mutex_unlock(&sched_domains_mutex); 7553} 7554 7555#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7556static void arch_reinit_sched_domains(void) 7557{ 7558 get_online_cpus(); 7559 7560 /* Destroy domains first to force the rebuild */ 7561 partition_sched_domains(0, NULL, NULL); 7562 7563 rebuild_sched_domains(); 7564 put_online_cpus(); 7565} 7566 7567static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 7568{ 7569 unsigned int level = 0; 7570 7571 if (sscanf(buf, "%u", &level) != 1) 7572 return -EINVAL; 7573 7574 /* 7575 * level is always be positive so don't check for 7576 * level < POWERSAVINGS_BALANCE_NONE which is 0 7577 * What happens on 0 or 1 byte write, 7578 * need to check for count as well? 
7579 */ 7580 7581 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) 7582 return -EINVAL; 7583 7584 if (smt) 7585 sched_smt_power_savings = level; 7586 else 7587 sched_mc_power_savings = level; 7588 7589 arch_reinit_sched_domains(); 7590 7591 return count; 7592} 7593 7594#ifdef CONFIG_SCHED_MC 7595static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7596 struct sysdev_class_attribute *attr, 7597 char *page) 7598{ 7599 return sprintf(page, "%u\n", sched_mc_power_savings); 7600} 7601static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7602 struct sysdev_class_attribute *attr, 7603 const char *buf, size_t count) 7604{ 7605 return sched_power_savings_store(buf, count, 0); 7606} 7607static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, 7608 sched_mc_power_savings_show, 7609 sched_mc_power_savings_store); 7610#endif 7611 7612#ifdef CONFIG_SCHED_SMT 7613static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7614 struct sysdev_class_attribute *attr, 7615 char *page) 7616{ 7617 return sprintf(page, "%u\n", sched_smt_power_savings); 7618} 7619static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7620 struct sysdev_class_attribute *attr, 7621 const char *buf, size_t count) 7622{ 7623 return sched_power_savings_store(buf, count, 1); 7624} 7625static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, 7626 sched_smt_power_savings_show, 7627 sched_smt_power_savings_store); 7628#endif 7629 7630int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 7631{ 7632 int err = 0; 7633 7634#ifdef CONFIG_SCHED_SMT 7635 if (smt_capable()) 7636 err = sysfs_create_file(&cls->kset.kobj, 7637 &attr_sched_smt_power_savings.attr); 7638#endif 7639#ifdef CONFIG_SCHED_MC 7640 if (!err && mc_capable()) 7641 err = sysfs_create_file(&cls->kset.kobj, 7642 &attr_sched_mc_power_savings.attr); 7643#endif 7644 return err; 7645} 7646#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7647 7648/* 7649 * Update cpusets according to cpu_active mask. If cpusets are 7650 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 7651 * around partition_sched_domains(). 
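 * The two notifiers below invoke it when a cpu comes online (or a failed
 * offline is rolled back) and just before a cpu goes down, so the sched
 * domains always track cpu_active_mask.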
7652 */ 7653static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 7654 void *hcpu) 7655{ 7656 switch (action & ~CPU_TASKS_FROZEN) { 7657 case CPU_ONLINE: 7658 case CPU_DOWN_FAILED: 7659 cpuset_update_active_cpus(); 7660 return NOTIFY_OK; 7661 default: 7662 return NOTIFY_DONE; 7663 } 7664} 7665 7666static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 7667 void *hcpu) 7668{ 7669 switch (action & ~CPU_TASKS_FROZEN) { 7670 case CPU_DOWN_PREPARE: 7671 cpuset_update_active_cpus(); 7672 return NOTIFY_OK; 7673 default: 7674 return NOTIFY_DONE; 7675 } 7676} 7677 7678static int update_runtime(struct notifier_block *nfb, 7679 unsigned long action, void *hcpu) 7680{ 7681 int cpu = (int)(long)hcpu; 7682 7683 switch (action) { 7684 case CPU_DOWN_PREPARE: 7685 case CPU_DOWN_PREPARE_FROZEN: 7686 disable_runtime(cpu_rq(cpu)); 7687 return NOTIFY_OK; 7688 7689 case CPU_DOWN_FAILED: 7690 case CPU_DOWN_FAILED_FROZEN: 7691 case CPU_ONLINE: 7692 case CPU_ONLINE_FROZEN: 7693 enable_runtime(cpu_rq(cpu)); 7694 return NOTIFY_OK; 7695 7696 default: 7697 return NOTIFY_DONE; 7698 } 7699} 7700 7701void __init sched_init_smp(void) 7702{ 7703 cpumask_var_t non_isolated_cpus; 7704 7705 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7706 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7707 7708#if defined(CONFIG_NUMA) 7709 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 7710 GFP_KERNEL); 7711 BUG_ON(sched_group_nodes_bycpu == NULL); 7712#endif 7713 get_online_cpus(); 7714 mutex_lock(&sched_domains_mutex); 7715 arch_init_sched_domains(cpu_active_mask); 7716 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7717 if (cpumask_empty(non_isolated_cpus)) 7718 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7719 mutex_unlock(&sched_domains_mutex); 7720 put_online_cpus(); 7721 7722 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 7723 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 7724 7725 /* RT runtime code needs to handle some hotplug events */ 7726 hotcpu_notifier(update_runtime, 0); 7727 7728 init_hrtick(); 7729 7730 /* Move init over to a non-isolated CPU */ 7731 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 7732 BUG(); 7733 sched_init_granularity(); 7734 free_cpumask_var(non_isolated_cpus); 7735 7736 init_sched_rt_class(); 7737} 7738#else 7739void __init sched_init_smp(void) 7740{ 7741 sched_init_granularity(); 7742} 7743#endif /* CONFIG_SMP */ 7744 7745const_debug unsigned int sysctl_timer_migration = 1; 7746 7747int in_sched_functions(unsigned long addr) 7748{ 7749 return in_lock_functions(addr) || 7750 (addr >= (unsigned long)__sched_text_start 7751 && addr < (unsigned long)__sched_text_end); 7752} 7753 7754static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 7755{ 7756 cfs_rq->tasks_timeline = RB_ROOT; 7757 INIT_LIST_HEAD(&cfs_rq->tasks); 7758#ifdef CONFIG_FAIR_GROUP_SCHED 7759 cfs_rq->rq = rq; 7760#endif 7761 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7762} 7763 7764static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 7765{ 7766 struct rt_prio_array *array; 7767 int i; 7768 7769 array = &rt_rq->active; 7770 for (i = 0; i < MAX_RT_PRIO; i++) { 7771 INIT_LIST_HEAD(array->queue + i); 7772 __clear_bit(i, array->bitmap); 7773 } 7774 /* delimiter for bitsearch: */ 7775 __set_bit(MAX_RT_PRIO, array->bitmap); 7776 7777#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 7778 rt_rq->highest_prio.curr = MAX_RT_PRIO; 7779#ifdef CONFIG_SMP 7780 rt_rq->highest_prio.next = 
MAX_RT_PRIO; 7781#endif 7782#endif 7783#ifdef CONFIG_SMP 7784 rt_rq->rt_nr_migratory = 0; 7785 rt_rq->overloaded = 0; 7786 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); 7787#endif 7788 7789 rt_rq->rt_time = 0; 7790 rt_rq->rt_throttled = 0; 7791 rt_rq->rt_runtime = 0; 7792 raw_spin_lock_init(&rt_rq->rt_runtime_lock); 7793 7794#ifdef CONFIG_RT_GROUP_SCHED 7795 rt_rq->rt_nr_boosted = 0; 7796 rt_rq->rq = rq; 7797#endif 7798} 7799 7800#ifdef CONFIG_FAIR_GROUP_SCHED 7801static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7802 struct sched_entity *se, int cpu, int add, 7803 struct sched_entity *parent) 7804{ 7805 struct rq *rq = cpu_rq(cpu); 7806 tg->cfs_rq[cpu] = cfs_rq; 7807 init_cfs_rq(cfs_rq, rq); 7808 cfs_rq->tg = tg; 7809 if (add) 7810 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7811 7812 tg->se[cpu] = se; 7813 /* se could be NULL for init_task_group */ 7814 if (!se) 7815 return; 7816 7817 if (!parent) 7818 se->cfs_rq = &rq->cfs; 7819 else 7820 se->cfs_rq = parent->my_q; 7821 7822 se->my_q = cfs_rq; 7823 se->load.weight = tg->shares; 7824 se->load.inv_weight = 0; 7825 se->parent = parent; 7826} 7827#endif 7828 7829#ifdef CONFIG_RT_GROUP_SCHED 7830static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7831 struct sched_rt_entity *rt_se, int cpu, int add, 7832 struct sched_rt_entity *parent) 7833{ 7834 struct rq *rq = cpu_rq(cpu); 7835 7836 tg->rt_rq[cpu] = rt_rq; 7837 init_rt_rq(rt_rq, rq); 7838 rt_rq->tg = tg; 7839 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7840 if (add) 7841 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7842 7843 tg->rt_se[cpu] = rt_se; 7844 if (!rt_se) 7845 return; 7846 7847 if (!parent) 7848 rt_se->rt_rq = &rq->rt; 7849 else 7850 rt_se->rt_rq = parent->my_q; 7851 7852 rt_se->my_q = rt_rq; 7853 rt_se->parent = parent; 7854 INIT_LIST_HEAD(&rt_se->run_list); 7855} 7856#endif 7857 7858void __init sched_init(void) 7859{ 7860 int i, j; 7861 unsigned long alloc_size = 0, ptr; 7862 7863#ifdef CONFIG_FAIR_GROUP_SCHED 7864 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7865#endif 7866#ifdef CONFIG_RT_GROUP_SCHED 7867 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7868#endif 7869#ifdef CONFIG_CPUMASK_OFFSTACK 7870 alloc_size += num_possible_cpus() * cpumask_size(); 7871#endif 7872 if (alloc_size) { 7873 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7874 7875#ifdef CONFIG_FAIR_GROUP_SCHED 7876 init_task_group.se = (struct sched_entity **)ptr; 7877 ptr += nr_cpu_ids * sizeof(void **); 7878 7879 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7880 ptr += nr_cpu_ids * sizeof(void **); 7881 7882#endif /* CONFIG_FAIR_GROUP_SCHED */ 7883#ifdef CONFIG_RT_GROUP_SCHED 7884 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7885 ptr += nr_cpu_ids * sizeof(void **); 7886 7887 init_task_group.rt_rq = (struct rt_rq **)ptr; 7888 ptr += nr_cpu_ids * sizeof(void **); 7889 7890#endif /* CONFIG_RT_GROUP_SCHED */ 7891#ifdef CONFIG_CPUMASK_OFFSTACK 7892 for_each_possible_cpu(i) { 7893 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 7894 ptr += cpumask_size(); 7895 } 7896#endif /* CONFIG_CPUMASK_OFFSTACK */ 7897 } 7898 7899#ifdef CONFIG_SMP 7900 init_defrootdomain(); 7901#endif 7902 7903 init_rt_bandwidth(&def_rt_bandwidth, 7904 global_rt_period(), global_rt_runtime()); 7905 7906#ifdef CONFIG_RT_GROUP_SCHED 7907 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7908 global_rt_period(), global_rt_runtime()); 7909#endif /* CONFIG_RT_GROUP_SCHED */ 7910 7911#ifdef CONFIG_CGROUP_SCHED 7912 
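 /* Register the root task group; groups later created through the
  * cgroup filesystem all descend from init_task_group. */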
list_add(&init_task_group.list, &task_groups); 7913 INIT_LIST_HEAD(&init_task_group.children); 7914 7915#endif /* CONFIG_CGROUP_SCHED */ 7916 7917#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7918 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7919 __alignof__(unsigned long)); 7920#endif 7921 for_each_possible_cpu(i) { 7922 struct rq *rq; 7923 7924 rq = cpu_rq(i); 7925 raw_spin_lock_init(&rq->lock); 7926 rq->nr_running = 0; 7927 rq->calc_load_active = 0; 7928 rq->calc_load_update = jiffies + LOAD_FREQ; 7929 init_cfs_rq(&rq->cfs, rq); 7930 init_rt_rq(&rq->rt, rq); 7931#ifdef CONFIG_FAIR_GROUP_SCHED 7932 init_task_group.shares = init_task_group_load; 7933 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7934#ifdef CONFIG_CGROUP_SCHED 7935 /* 7936 * How much cpu bandwidth does init_task_group get? 7937 * 7938 * In case of task-groups formed thr' the cgroup filesystem, it 7939 * gets 100% of the cpu resources in the system. This overall 7940 * system cpu resource is divided among the tasks of 7941 * init_task_group and its child task-groups in a fair manner, 7942 * based on each entity's (task or task-group's) weight 7943 * (se->load.weight). 7944 * 7945 * In other words, if init_task_group has 10 tasks of weight 7946 * 1024) and two child groups A0 and A1 (of weight 1024 each), 7947 * then A0's share of the cpu resource is: 7948 * 7949 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 7950 * 7951 * We achieve this by letting init_task_group's tasks sit 7952 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7953 */ 7954 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7955#endif 7956#endif /* CONFIG_FAIR_GROUP_SCHED */ 7957 7958 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7959#ifdef CONFIG_RT_GROUP_SCHED 7960 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7961#ifdef CONFIG_CGROUP_SCHED 7962 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7963#endif 7964#endif 7965 7966 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7967 rq->cpu_load[j] = 0; 7968 7969 rq->last_load_update_tick = jiffies; 7970 7971#ifdef CONFIG_SMP 7972 rq->sd = NULL; 7973 rq->rd = NULL; 7974 rq->cpu_power = SCHED_LOAD_SCALE; 7975 rq->post_schedule = 0; 7976 rq->active_balance = 0; 7977 rq->next_balance = jiffies; 7978 rq->push_cpu = 0; 7979 rq->cpu = i; 7980 rq->online = 0; 7981 rq->idle_stamp = 0; 7982 rq->avg_idle = 2*sysctl_sched_migration_cost; 7983 rq_attach_root(rq, &def_root_domain); 7984#ifdef CONFIG_NO_HZ 7985 rq->nohz_balance_kick = 0; 7986 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); 7987#endif 7988#endif 7989 init_rq_hrtick(rq); 7990 atomic_set(&rq->nr_iowait, 0); 7991 } 7992 7993 set_load_weight(&init_task); 7994 7995#ifdef CONFIG_PREEMPT_NOTIFIERS 7996 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 7997#endif 7998 7999#ifdef CONFIG_SMP 8000 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 8001#endif 8002 8003#ifdef CONFIG_RT_MUTEXES 8004 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); 8005#endif 8006 8007 /* 8008 * The boot idle thread does lazy MMU switching as well: 8009 */ 8010 atomic_inc(&init_mm.mm_count); 8011 enter_lazy_tlb(&init_mm, current); 8012 8013 /* 8014 * Make us the idle thread. Technically, schedule() should not be 8015 * called from this thread, however somewhere below it might be, 8016 * but because we are the idle thread, we just pick up running again 8017 * when this runqueue becomes "idle". 
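 * (init_idle() below records this thread as rq->idle for the boot cpu and
 * re-initialises its scheduling state accordingly)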
8018 */ 8019 init_idle(current, smp_processor_id()); 8020 8021 calc_load_update = jiffies + LOAD_FREQ; 8022 8023 /* 8024 * During early bootup we pretend to be a normal task: 8025 */ 8026 current->sched_class = &fair_sched_class; 8027 8028 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 8029 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 8030#ifdef CONFIG_SMP 8031#ifdef CONFIG_NO_HZ 8032 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 8033 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 8034 atomic_set(&nohz.load_balancer, nr_cpu_ids); 8035 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); 8036 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); 8037#endif 8038 /* May be allocated at isolcpus cmdline parse time */ 8039 if (cpu_isolated_map == NULL) 8040 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8041#endif /* SMP */ 8042 8043 perf_event_init(); 8044 8045 scheduler_running = 1; 8046} 8047 8048#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 8049static inline int preempt_count_equals(int preempt_offset) 8050{ 8051 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8052 8053 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 8054} 8055 8056void __might_sleep(const char *file, int line, int preempt_offset) 8057{ 8058#ifdef in_atomic 8059 static unsigned long prev_jiffy; /* ratelimiting */ 8060 8061 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 8062 system_state != SYSTEM_RUNNING || oops_in_progress) 8063 return; 8064 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8065 return; 8066 prev_jiffy = jiffies; 8067 8068 printk(KERN_ERR 8069 "BUG: sleeping function called from invalid context at %s:%d\n", 8070 file, line); 8071 printk(KERN_ERR 8072 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 8073 in_atomic(), irqs_disabled(), 8074 current->pid, current->comm); 8075 8076 debug_show_held_locks(current); 8077 if (irqs_disabled()) 8078 print_irqtrace_events(current); 8079 dump_stack(); 8080#endif 8081} 8082EXPORT_SYMBOL(__might_sleep); 8083#endif 8084 8085#ifdef CONFIG_MAGIC_SYSRQ 8086static void normalize_task(struct rq *rq, struct task_struct *p) 8087{ 8088 int on_rq; 8089 8090 on_rq = p->se.on_rq; 8091 if (on_rq) 8092 deactivate_task(rq, p, 0); 8093 __setscheduler(rq, p, SCHED_NORMAL, 0); 8094 if (on_rq) { 8095 activate_task(rq, p, 0); 8096 resched_task(rq->curr); 8097 } 8098} 8099 8100void normalize_rt_tasks(void) 8101{ 8102 struct task_struct *g, *p; 8103 unsigned long flags; 8104 struct rq *rq; 8105 8106 read_lock_irqsave(&tasklist_lock, flags); 8107 do_each_thread(g, p) { 8108 /* 8109 * Only normalize user tasks: 8110 */ 8111 if (!p->mm) 8112 continue; 8113 8114 p->se.exec_start = 0; 8115#ifdef CONFIG_SCHEDSTATS 8116 p->se.statistics.wait_start = 0; 8117 p->se.statistics.sleep_start = 0; 8118 p->se.statistics.block_start = 0; 8119#endif 8120 8121 if (!rt_task(p)) { 8122 /* 8123 * Renice negative nice level userspace 8124 * tasks back to 0: 8125 */ 8126 if (TASK_NICE(p) < 0 && p->mm) 8127 set_user_nice(p, 0); 8128 continue; 8129 } 8130 8131 raw_spin_lock(&p->pi_lock); 8132 rq = __task_rq_lock(p); 8133 8134 normalize_task(rq, p); 8135 8136 __task_rq_unlock(rq); 8137 raw_spin_unlock(&p->pi_lock); 8138 } while_each_thread(g, p); 8139 8140 read_unlock_irqrestore(&tasklist_lock, flags); 8141} 8142 8143#endif /* CONFIG_MAGIC_SYSRQ */ 8144 8145#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 8146/* 8147 * These functions are only useful for the IA64 MCA handling, or kdb. 
8148 * 8149 * They can only be called when the whole system has been 8150 * stopped - every CPU needs to be quiescent, and no scheduling 8151 * activity can take place. Using them for anything else would 8152 * be a serious bug, and as a result, they aren't even visible 8153 * under any other configuration. 8154 */ 8155 8156/** 8157 * curr_task - return the current task for a given cpu. 8158 * @cpu: the processor in question. 8159 * 8160 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 8161 */ 8162struct task_struct *curr_task(int cpu) 8163{ 8164 return cpu_curr(cpu); 8165} 8166 8167#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 8168 8169#ifdef CONFIG_IA64 8170/** 8171 * set_curr_task - set the current task for a given cpu. 8172 * @cpu: the processor in question. 8173 * @p: the task pointer to set. 8174 * 8175 * Description: This function must only be used when non-maskable interrupts 8176 * are serviced on a separate stack. It allows the architecture to switch the 8177 * notion of the current task on a cpu in a non-blocking manner. This function 8178 * must be called with all CPUs synchronized, and interrupts disabled, and the 8179 * caller must save the original value of the current task (see 8180 * curr_task() above) and restore that value before reenabling interrupts and 8181 * re-starting the system. 8182 * 8183 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 8184 */ 8185void set_curr_task(int cpu, struct task_struct *p) 8186{ 8187 cpu_curr(cpu) = p; 8188} 8189 8190#endif 8191 8192#ifdef CONFIG_FAIR_GROUP_SCHED 8193static void free_fair_sched_group(struct task_group *tg) 8194{ 8195 int i; 8196 8197 for_each_possible_cpu(i) { 8198 if (tg->cfs_rq) 8199 kfree(tg->cfs_rq[i]); 8200 if (tg->se) 8201 kfree(tg->se[i]); 8202 } 8203 8204 kfree(tg->cfs_rq); 8205 kfree(tg->se); 8206} 8207 8208static 8209int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8210{ 8211 struct cfs_rq *cfs_rq; 8212 struct sched_entity *se; 8213 struct rq *rq; 8214 int i; 8215 8216 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8217 if (!tg->cfs_rq) 8218 goto err; 8219 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); 8220 if (!tg->se) 8221 goto err; 8222 8223 tg->shares = NICE_0_LOAD; 8224 8225 for_each_possible_cpu(i) { 8226 rq = cpu_rq(i); 8227 8228 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8229 GFP_KERNEL, cpu_to_node(i)); 8230 if (!cfs_rq) 8231 goto err; 8232 8233 se = kzalloc_node(sizeof(struct sched_entity), 8234 GFP_KERNEL, cpu_to_node(i)); 8235 if (!se) 8236 goto err_free_rq; 8237 8238 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8239 } 8240 8241 return 1; 8242 8243 err_free_rq: 8244 kfree(cfs_rq); 8245 err: 8246 return 0; 8247} 8248 8249static inline void register_fair_sched_group(struct task_group *tg, int cpu) 8250{ 8251 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, 8252 &cpu_rq(cpu)->leaf_cfs_rq_list); 8253} 8254 8255static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8256{ 8257 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8258} 8259#else /* !CONFIG_FAIR_GROUP_SCHED */ 8260static inline void free_fair_sched_group(struct task_group *tg) 8261{ 8262} 8263 8264static inline 8265int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8266{ 8267 return 1; 8268} 8269 8270static inline void register_fair_sched_group(struct task_group *tg, int cpu) 8271{ 8272} 8273 8274static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8275{ 8276} 8277#endif /*
CONFIG_FAIR_GROUP_SCHED */ 8278 8279#ifdef CONFIG_RT_GROUP_SCHED 8280static void free_rt_sched_group(struct task_group *tg) 8281{ 8282 int i; 8283 8284 destroy_rt_bandwidth(&tg->rt_bandwidth); 8285 8286 for_each_possible_cpu(i) { 8287 if (tg->rt_rq) 8288 kfree(tg->rt_rq[i]); 8289 if (tg->rt_se) 8290 kfree(tg->rt_se[i]); 8291 } 8292 8293 kfree(tg->rt_rq); 8294 kfree(tg->rt_se); 8295} 8296 8297static 8298int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8299{ 8300 struct rt_rq *rt_rq; 8301 struct sched_rt_entity *rt_se; 8302 struct rq *rq; 8303 int i; 8304 8305 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8306 if (!tg->rt_rq) 8307 goto err; 8308 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); 8309 if (!tg->rt_se) 8310 goto err; 8311 8312 init_rt_bandwidth(&tg->rt_bandwidth, 8313 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8314 8315 for_each_possible_cpu(i) { 8316 rq = cpu_rq(i); 8317 8318 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8319 GFP_KERNEL, cpu_to_node(i)); 8320 if (!rt_rq) 8321 goto err; 8322 8323 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 8324 GFP_KERNEL, cpu_to_node(i)); 8325 if (!rt_se) 8326 goto err_free_rq; 8327 8328 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8329 } 8330 8331 return 1; 8332 8333 err_free_rq: 8334 kfree(rt_rq); 8335 err: 8336 return 0; 8337} 8338 8339static inline void register_rt_sched_group(struct task_group *tg, int cpu) 8340{ 8341 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, 8342 &cpu_rq(cpu)->leaf_rt_rq_list); 8343} 8344 8345static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8346{ 8347 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 8348} 8349#else /* !CONFIG_RT_GROUP_SCHED */ 8350static inline void free_rt_sched_group(struct task_group *tg) 8351{ 8352} 8353 8354static inline 8355int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8356{ 8357 return 1; 8358} 8359 8360static inline void register_rt_sched_group(struct task_group *tg, int cpu) 8361{ 8362} 8363 8364static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8365{ 8366} 8367#endif /* CONFIG_RT_GROUP_SCHED */ 8368 8369#ifdef CONFIG_CGROUP_SCHED 8370static void free_sched_group(struct task_group *tg) 8371{ 8372 free_fair_sched_group(tg); 8373 free_rt_sched_group(tg); 8374 kfree(tg); 8375} 8376 8377/* allocate runqueue etc for a new task group */ 8378struct task_group *sched_create_group(struct task_group *parent) 8379{ 8380 struct task_group *tg; 8381 unsigned long flags; 8382 int i; 8383 8384 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8385 if (!tg) 8386 return ERR_PTR(-ENOMEM); 8387 8388 if (!alloc_fair_sched_group(tg, parent)) 8389 goto err; 8390 8391 if (!alloc_rt_sched_group(tg, parent)) 8392 goto err; 8393 8394 spin_lock_irqsave(&task_group_lock, flags); 8395 for_each_possible_cpu(i) { 8396 register_fair_sched_group(tg, i); 8397 register_rt_sched_group(tg, i); 8398 } 8399 list_add_rcu(&tg->list, &task_groups); 8400 8401 WARN_ON(!parent); /* root should already exist */ 8402 8403 tg->parent = parent; 8404 INIT_LIST_HEAD(&tg->children); 8405 list_add_rcu(&tg->siblings, &parent->children); 8406 spin_unlock_irqrestore(&task_group_lock, flags); 8407 8408 return tg; 8409 8410err: 8411 free_sched_group(tg); 8412 return ERR_PTR(-ENOMEM); 8413} 8414 8415/* rcu callback to free various structures associated with a task group */ 8416static void free_sched_group_rcu(struct rcu_head *rhp) 8417{ 8418 /* now it should be safe to free those cfs_rqs */ 8419 
free_sched_group(container_of(rhp, struct task_group, rcu)); 8420} 8421 8422/* Destroy runqueue etc associated with a task group */ 8423void sched_destroy_group(struct task_group *tg) 8424{ 8425 unsigned long flags; 8426 int i; 8427 8428 spin_lock_irqsave(&task_group_lock, flags); 8429 for_each_possible_cpu(i) { 8430 unregister_fair_sched_group(tg, i); 8431 unregister_rt_sched_group(tg, i); 8432 } 8433 list_del_rcu(&tg->list); 8434 list_del_rcu(&tg->siblings); 8435 spin_unlock_irqrestore(&task_group_lock, flags); 8436 8437 /* wait for possible concurrent references to cfs_rqs to complete */ 8438 call_rcu(&tg->rcu, free_sched_group_rcu); 8439} 8440 8441/* Change a task's runqueue when it moves between groups. 8442 * The caller of this function should have put the task in its new group 8443 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 8444 * reflect its new group. 8445 */ 8446void sched_move_task(struct task_struct *tsk) 8447{ 8448 int on_rq, running; 8449 unsigned long flags; 8450 struct rq *rq; 8451 8452 rq = task_rq_lock(tsk, &flags); 8453 8454 running = task_current(rq, tsk); 8455 on_rq = tsk->se.on_rq; 8456 8457 if (on_rq) 8458 dequeue_task(rq, tsk, 0); 8459 if (unlikely(running)) 8460 tsk->sched_class->put_prev_task(rq, tsk); 8461 8462 set_task_rq(tsk, task_cpu(tsk)); 8463 8464#ifdef CONFIG_FAIR_GROUP_SCHED 8465 if (tsk->sched_class->moved_group) 8466 tsk->sched_class->moved_group(tsk, on_rq); 8467#endif 8468 8469 if (unlikely(running)) 8470 tsk->sched_class->set_curr_task(rq); 8471 if (on_rq) 8472 enqueue_task(rq, tsk, 0); 8473 8474 task_rq_unlock(rq, &flags); 8475} 8476#endif /* CONFIG_CGROUP_SCHED */ 8477 8478#ifdef CONFIG_FAIR_GROUP_SCHED 8479static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8480{ 8481 struct cfs_rq *cfs_rq = se->cfs_rq; 8482 int on_rq; 8483 8484 on_rq = se->on_rq; 8485 if (on_rq) 8486 dequeue_entity(cfs_rq, se, 0); 8487 8488 se->load.weight = shares; 8489 se->load.inv_weight = 0; 8490 8491 if (on_rq) 8492 enqueue_entity(cfs_rq, se, 0); 8493} 8494 8495static void set_se_shares(struct sched_entity *se, unsigned long shares) 8496{ 8497 struct cfs_rq *cfs_rq = se->cfs_rq; 8498 struct rq *rq = cfs_rq->rq; 8499 unsigned long flags; 8500 8501 raw_spin_lock_irqsave(&rq->lock, flags); 8502 __set_se_shares(se, shares); 8503 raw_spin_unlock_irqrestore(&rq->lock, flags); 8504} 8505 8506static DEFINE_MUTEX(shares_mutex); 8507 8508int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8509{ 8510 int i; 8511 unsigned long flags; 8512 8513 /* 8514 * We can't change the weight of the root cgroup. 8515 */ 8516 if (!tg->se[0]) 8517 return -EINVAL; 8518 8519 if (shares < MIN_SHARES) 8520 shares = MIN_SHARES; 8521 else if (shares > MAX_SHARES) 8522 shares = MAX_SHARES; 8523 8524 mutex_lock(&shares_mutex); 8525 if (tg->shares == shares) 8526 goto done; 8527 8528 spin_lock_irqsave(&task_group_lock, flags); 8529 for_each_possible_cpu(i) 8530 unregister_fair_sched_group(tg, i); 8531 list_del_rcu(&tg->siblings); 8532 spin_unlock_irqrestore(&task_group_lock, flags); 8533 8534 /* wait for any ongoing reference to this group to finish */ 8535 synchronize_sched(); 8536 8537 /* 8538 * Now we are free to modify the group's share on each cpu 8539 * w/o tripping rebalance_share or load_balance_fair. 
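 * (The synchronize_sched() above guarantees that any load-balancing pass
 * that found this group on a leaf_cfs_rq_list before it was unlinked has
 * finished, so nothing walks tg->cfs_rq[i] while its weight changes.)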
8540 */ 8541 tg->shares = shares; 8542 for_each_possible_cpu(i) { 8543 /* 8544 * force a rebalance 8545 */ 8546 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8547 set_se_shares(tg->se[i], shares); 8548 } 8549 8550 /* 8551 * Enable load balance activity on this group, by inserting it back on 8552 * each cpu's rq->leaf_cfs_rq_list. 8553 */ 8554 spin_lock_irqsave(&task_group_lock, flags); 8555 for_each_possible_cpu(i) 8556 register_fair_sched_group(tg, i); 8557 list_add_rcu(&tg->siblings, &tg->parent->children); 8558 spin_unlock_irqrestore(&task_group_lock, flags); 8559done: 8560 mutex_unlock(&shares_mutex); 8561 return 0; 8562} 8563 8564unsigned long sched_group_shares(struct task_group *tg) 8565{ 8566 return tg->shares; 8567} 8568#endif 8569 8570#ifdef CONFIG_RT_GROUP_SCHED 8571/* 8572 * Ensure that the real time constraints are schedulable. 8573 */ 8574static DEFINE_MUTEX(rt_constraints_mutex); 8575 8576static unsigned long to_ratio(u64 period, u64 runtime) 8577{ 8578 if (runtime == RUNTIME_INF) 8579 return 1ULL << 20; 8580 8581 return div64_u64(runtime << 20, period); 8582} 8583 8584/* Must be called with tasklist_lock held */ 8585static inline int tg_has_rt_tasks(struct task_group *tg) 8586{ 8587 struct task_struct *g, *p; 8588 8589 do_each_thread(g, p) { 8590 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8591 return 1; 8592 } while_each_thread(g, p); 8593 8594 return 0; 8595} 8596 8597struct rt_schedulable_data { 8598 struct task_group *tg; 8599 u64 rt_period; 8600 u64 rt_runtime; 8601}; 8602 8603static int tg_schedulable(struct task_group *tg, void *data) 8604{ 8605 struct rt_schedulable_data *d = data; 8606 struct task_group *child; 8607 unsigned long total, sum = 0; 8608 u64 period, runtime; 8609 8610 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8611 runtime = tg->rt_bandwidth.rt_runtime; 8612 8613 if (tg == d->tg) { 8614 period = d->rt_period; 8615 runtime = d->rt_runtime; 8616 } 8617 8618 /* 8619 * Cannot have more runtime than the period. 8620 */ 8621 if (runtime > period && runtime != RUNTIME_INF) 8622 return -EINVAL; 8623 8624 /* 8625 * Ensure we don't starve existing RT tasks. 8626 */ 8627 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 8628 return -EBUSY; 8629 8630 total = to_ratio(period, runtime); 8631 8632 /* 8633 * Nobody can have more than the global setting allows. 8634 */ 8635 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 8636 return -EINVAL; 8637 8638 /* 8639 * The sum of our children's runtime should not exceed our own. 
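 * For example (illustrative numbers, not defaults): a group allowed 500 ms
 * of every 1 s period contributes to_ratio() of roughly 0.50 * 2^20 (524288);
 * two children each allowed 300 ms of every 1 s would sum to roughly
 * 0.60 * 2^20 (629146), exceed the parent's share, and be rejected below
 * with -EINVAL.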
8640 */ 8641 list_for_each_entry_rcu(child, &tg->children, siblings) { 8642 period = ktime_to_ns(child->rt_bandwidth.rt_period); 8643 runtime = child->rt_bandwidth.rt_runtime; 8644 8645 if (child == d->tg) { 8646 period = d->rt_period; 8647 runtime = d->rt_runtime; 8648 } 8649 8650 sum += to_ratio(period, runtime); 8651 } 8652 8653 if (sum > total) 8654 return -EINVAL; 8655 8656 return 0; 8657} 8658 8659static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8660{ 8661 struct rt_schedulable_data data = { 8662 .tg = tg, 8663 .rt_period = period, 8664 .rt_runtime = runtime, 8665 }; 8666 8667 return walk_tg_tree(tg_schedulable, tg_nop, &data); 8668} 8669 8670static int tg_set_bandwidth(struct task_group *tg, 8671 u64 rt_period, u64 rt_runtime) 8672{ 8673 int i, err = 0; 8674 8675 mutex_lock(&rt_constraints_mutex); 8676 read_lock(&tasklist_lock); 8677 err = __rt_schedulable(tg, rt_period, rt_runtime); 8678 if (err) 8679 goto unlock; 8680 8681 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8682 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8683 tg->rt_bandwidth.rt_runtime = rt_runtime; 8684 8685 for_each_possible_cpu(i) { 8686 struct rt_rq *rt_rq = tg->rt_rq[i]; 8687 8688 raw_spin_lock(&rt_rq->rt_runtime_lock); 8689 rt_rq->rt_runtime = rt_runtime; 8690 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8691 } 8692 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8693 unlock: 8694 read_unlock(&tasklist_lock); 8695 mutex_unlock(&rt_constraints_mutex); 8696 8697 return err; 8698} 8699 8700int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 8701{ 8702 u64 rt_runtime, rt_period; 8703 8704 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8705 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 8706 if (rt_runtime_us < 0) 8707 rt_runtime = RUNTIME_INF; 8708 8709 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8710} 8711 8712long sched_group_rt_runtime(struct task_group *tg) 8713{ 8714 u64 rt_runtime_us; 8715 8716 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 8717 return -1; 8718 8719 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 8720 do_div(rt_runtime_us, NSEC_PER_USEC); 8721 return rt_runtime_us; 8722} 8723 8724int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 8725{ 8726 u64 rt_runtime, rt_period; 8727 8728 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 8729 rt_runtime = tg->rt_bandwidth.rt_runtime; 8730 8731 if (rt_period == 0) 8732 return -EINVAL; 8733 8734 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8735} 8736 8737long sched_group_rt_period(struct task_group *tg) 8738{ 8739 u64 rt_period_us; 8740 8741 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 8742 do_div(rt_period_us, NSEC_PER_USEC); 8743 return rt_period_us; 8744} 8745 8746static int sched_rt_global_constraints(void) 8747{ 8748 u64 runtime, period; 8749 int ret = 0; 8750 8751 if (sysctl_sched_rt_period <= 0) 8752 return -EINVAL; 8753 8754 runtime = global_rt_runtime(); 8755 period = global_rt_period(); 8756 8757 /* 8758 * Sanity check on the sysctl variables. 
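 * For example, with the usual defaults of sched_rt_period_us = 1000000 and
 * sched_rt_runtime_us = 950000, RT tasks may consume at most 0.95 s of every
 * 1 s; a runtime larger than the period (other than -1 for unlimited) is
 * rejected below.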
8759 */ 8760 if (runtime > period && runtime != RUNTIME_INF) 8761 return -EINVAL; 8762 8763 mutex_lock(&rt_constraints_mutex); 8764 read_lock(&tasklist_lock); 8765 ret = __rt_schedulable(NULL, 0, 0); 8766 read_unlock(&tasklist_lock); 8767 mutex_unlock(&rt_constraints_mutex); 8768 8769 return ret; 8770} 8771 8772int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 8773{ 8774 /* Don't accept realtime tasks when there is no way for them to run */ 8775 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 8776 return 0; 8777 8778 return 1; 8779} 8780 8781#else /* !CONFIG_RT_GROUP_SCHED */ 8782static int sched_rt_global_constraints(void) 8783{ 8784 unsigned long flags; 8785 int i; 8786 8787 if (sysctl_sched_rt_period <= 0) 8788 return -EINVAL; 8789 8790 /* 8791 * There's always some RT tasks in the root group 8792 * -- migration, kstopmachine etc.. 8793 */ 8794 if (sysctl_sched_rt_runtime == 0) 8795 return -EBUSY; 8796 8797 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 8798 for_each_possible_cpu(i) { 8799 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 8800 8801 raw_spin_lock(&rt_rq->rt_runtime_lock); 8802 rt_rq->rt_runtime = global_rt_runtime(); 8803 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8804 } 8805 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 8806 8807 return 0; 8808} 8809#endif /* CONFIG_RT_GROUP_SCHED */ 8810 8811int sched_rt_handler(struct ctl_table *table, int write, 8812 void __user *buffer, size_t *lenp, 8813 loff_t *ppos) 8814{ 8815 int ret; 8816 int old_period, old_runtime; 8817 static DEFINE_MUTEX(mutex); 8818 8819 mutex_lock(&mutex); 8820 old_period = sysctl_sched_rt_period; 8821 old_runtime = sysctl_sched_rt_runtime; 8822 8823 ret = proc_dointvec(table, write, buffer, lenp, ppos); 8824 8825 if (!ret && write) { 8826 ret = sched_rt_global_constraints(); 8827 if (ret) { 8828 sysctl_sched_rt_period = old_period; 8829 sysctl_sched_rt_runtime = old_runtime; 8830 } else { 8831 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 8832 def_rt_bandwidth.rt_period = 8833 ns_to_ktime(global_rt_period()); 8834 } 8835 } 8836 mutex_unlock(&mutex); 8837 8838 return ret; 8839} 8840 8841#ifdef CONFIG_CGROUP_SCHED 8842 8843/* return corresponding task_group object of a cgroup */ 8844static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 8845{ 8846 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 8847 struct task_group, css); 8848} 8849 8850static struct cgroup_subsys_state * 8851cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 8852{ 8853 struct task_group *tg, *parent; 8854 8855 if (!cgrp->parent) { 8856 /* This is early initialization for the top cgroup */ 8857 return &init_task_group.css; 8858 } 8859 8860 parent = cgroup_tg(cgrp->parent); 8861 tg = sched_create_group(parent); 8862 if (IS_ERR(tg)) 8863 return ERR_PTR(-ENOMEM); 8864 8865 return &tg->css; 8866} 8867 8868static void 8869cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 8870{ 8871 struct task_group *tg = cgroup_tg(cgrp); 8872 8873 sched_destroy_group(tg); 8874} 8875 8876static int 8877cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 8878{ 8879#ifdef CONFIG_RT_GROUP_SCHED 8880 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 8881 return -EINVAL; 8882#else 8883 /* We don't support RT-tasks being in separate groups */ 8884 if (tsk->sched_class != &fair_sched_class) 8885 return -EINVAL; 8886#endif 8887 return 0; 8888} 8889 8890static int 8891cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup 
*cgrp, 8892 struct task_struct *tsk, bool threadgroup) 8893{ 8894 int retval = cpu_cgroup_can_attach_task(cgrp, tsk); 8895 if (retval) 8896 return retval; 8897 if (threadgroup) { 8898 struct task_struct *c; 8899 rcu_read_lock(); 8900 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 8901 retval = cpu_cgroup_can_attach_task(cgrp, c); 8902 if (retval) { 8903 rcu_read_unlock(); 8904 return retval; 8905 } 8906 } 8907 rcu_read_unlock(); 8908 } 8909 return 0; 8910} 8911 8912static void 8913cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8914 struct cgroup *old_cont, struct task_struct *tsk, 8915 bool threadgroup) 8916{ 8917 sched_move_task(tsk); 8918 if (threadgroup) { 8919 struct task_struct *c; 8920 rcu_read_lock(); 8921 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 8922 sched_move_task(c); 8923 } 8924 rcu_read_unlock(); 8925 } 8926} 8927 8928#ifdef CONFIG_FAIR_GROUP_SCHED 8929static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8930 u64 shareval) 8931{ 8932 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 8933} 8934 8935static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 8936{ 8937 struct task_group *tg = cgroup_tg(cgrp); 8938 8939 return (u64) tg->shares; 8940} 8941#endif /* CONFIG_FAIR_GROUP_SCHED */ 8942 8943#ifdef CONFIG_RT_GROUP_SCHED 8944static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 8945 s64 val) 8946{ 8947 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 8948} 8949 8950static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 8951{ 8952 return sched_group_rt_runtime(cgroup_tg(cgrp)); 8953} 8954 8955static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 8956 u64 rt_period_us) 8957{ 8958 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 8959} 8960 8961static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 8962{ 8963 return sched_group_rt_period(cgroup_tg(cgrp)); 8964} 8965#endif /* CONFIG_RT_GROUP_SCHED */ 8966 8967static struct cftype cpu_files[] = { 8968#ifdef CONFIG_FAIR_GROUP_SCHED 8969 { 8970 .name = "shares", 8971 .read_u64 = cpu_shares_read_u64, 8972 .write_u64 = cpu_shares_write_u64, 8973 }, 8974#endif 8975#ifdef CONFIG_RT_GROUP_SCHED 8976 { 8977 .name = "rt_runtime_us", 8978 .read_s64 = cpu_rt_runtime_read, 8979 .write_s64 = cpu_rt_runtime_write, 8980 }, 8981 { 8982 .name = "rt_period_us", 8983 .read_u64 = cpu_rt_period_read_uint, 8984 .write_u64 = cpu_rt_period_write_uint, 8985 }, 8986#endif 8987}; 8988 8989static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 8990{ 8991 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); 8992} 8993 8994struct cgroup_subsys cpu_cgroup_subsys = { 8995 .name = "cpu", 8996 .create = cpu_cgroup_create, 8997 .destroy = cpu_cgroup_destroy, 8998 .can_attach = cpu_cgroup_can_attach, 8999 .attach = cpu_cgroup_attach, 9000 .populate = cpu_cgroup_populate, 9001 .subsys_id = cpu_cgroup_subsys_id, 9002 .early_init = 1, 9003}; 9004 9005#endif /* CONFIG_CGROUP_SCHED */ 9006 9007#ifdef CONFIG_CGROUP_CPUACCT 9008 9009/* 9010 * CPU accounting code for task groups. 9011 * 9012 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 9013 * (balbir@in.ibm.com). 
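 * The accumulated numbers are exported per cgroup through the
 * cpuacct.usage, cpuacct.usage_percpu and cpuacct.stat files defined in
 * the files[] table below.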
9014 */ 9015 9016/* track cpu usage of a group of tasks and its child groups */ 9017struct cpuacct { 9018 struct cgroup_subsys_state css; 9019 /* cpuusage holds pointer to a u64-type object on every cpu */ 9020 u64 __percpu *cpuusage; 9021 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 9022 struct cpuacct *parent; 9023}; 9024 9025struct cgroup_subsys cpuacct_subsys; 9026 9027/* return cpu accounting group corresponding to this container */ 9028static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) 9029{ 9030 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 9031 struct cpuacct, css); 9032} 9033 9034/* return cpu accounting group to which this task belongs */ 9035static inline struct cpuacct *task_ca(struct task_struct *tsk) 9036{ 9037 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 9038 struct cpuacct, css); 9039} 9040 9041/* create a new cpu accounting group */ 9042static struct cgroup_subsys_state *cpuacct_create( 9043 struct cgroup_subsys *ss, struct cgroup *cgrp) 9044{ 9045 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 9046 int i; 9047 9048 if (!ca) 9049 goto out; 9050 9051 ca->cpuusage = alloc_percpu(u64); 9052 if (!ca->cpuusage) 9053 goto out_free_ca; 9054 9055 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 9056 if (percpu_counter_init(&ca->cpustat[i], 0)) 9057 goto out_free_counters; 9058 9059 if (cgrp->parent) 9060 ca->parent = cgroup_ca(cgrp->parent); 9061 9062 return &ca->css; 9063 9064out_free_counters: 9065 while (--i >= 0) 9066 percpu_counter_destroy(&ca->cpustat[i]); 9067 free_percpu(ca->cpuusage); 9068out_free_ca: 9069 kfree(ca); 9070out: 9071 return ERR_PTR(-ENOMEM); 9072} 9073 9074/* destroy an existing cpu accounting group */ 9075static void 9076cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 9077{ 9078 struct cpuacct *ca = cgroup_ca(cgrp); 9079 int i; 9080 9081 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 9082 percpu_counter_destroy(&ca->cpustat[i]); 9083 free_percpu(ca->cpuusage); 9084 kfree(ca); 9085} 9086 9087static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 9088{ 9089 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 9090 u64 data; 9091 9092#ifndef CONFIG_64BIT 9093 /* 9094 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 9095 */ 9096 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 9097 data = *cpuusage; 9098 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 9099#else 9100 data = *cpuusage; 9101#endif 9102 9103 return data; 9104} 9105 9106static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 9107{ 9108 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 9109 9110#ifndef CONFIG_64BIT 9111 /* 9112 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 
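 * (A 64-bit store here takes two word-sized writes on a 32-bit platform;
 * taking rq->lock serializes it against the += in cpuacct_charge(), which
 * runs with rq->lock held, and against cpuacct_cpuusage_read().)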
9113 */ 9114 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 9115 *cpuusage = val; 9116 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 9117#else 9118 *cpuusage = val; 9119#endif 9120} 9121 9122/* return total cpu usage (in nanoseconds) of a group */ 9123static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 9124{ 9125 struct cpuacct *ca = cgroup_ca(cgrp); 9126 u64 totalcpuusage = 0; 9127 int i; 9128 9129 for_each_present_cpu(i) 9130 totalcpuusage += cpuacct_cpuusage_read(ca, i); 9131 9132 return totalcpuusage; 9133} 9134 9135static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 9136 u64 reset) 9137{ 9138 struct cpuacct *ca = cgroup_ca(cgrp); 9139 int err = 0; 9140 int i; 9141 9142 if (reset) { 9143 err = -EINVAL; 9144 goto out; 9145 } 9146 9147 for_each_present_cpu(i) 9148 cpuacct_cpuusage_write(ca, i, 0); 9149 9150out: 9151 return err; 9152} 9153 9154static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 9155 struct seq_file *m) 9156{ 9157 struct cpuacct *ca = cgroup_ca(cgroup); 9158 u64 percpu; 9159 int i; 9160 9161 for_each_present_cpu(i) { 9162 percpu = cpuacct_cpuusage_read(ca, i); 9163 seq_printf(m, "%llu ", (unsigned long long) percpu); 9164 } 9165 seq_printf(m, "\n"); 9166 return 0; 9167} 9168 9169static const char *cpuacct_stat_desc[] = { 9170 [CPUACCT_STAT_USER] = "user", 9171 [CPUACCT_STAT_SYSTEM] = "system", 9172}; 9173 9174static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 9175 struct cgroup_map_cb *cb) 9176{ 9177 struct cpuacct *ca = cgroup_ca(cgrp); 9178 int i; 9179 9180 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 9181 s64 val = percpu_counter_read(&ca->cpustat[i]); 9182 val = cputime64_to_clock_t(val); 9183 cb->fill(cb, cpuacct_stat_desc[i], val); 9184 } 9185 return 0; 9186} 9187 9188static struct cftype files[] = { 9189 { 9190 .name = "usage", 9191 .read_u64 = cpuusage_read, 9192 .write_u64 = cpuusage_write, 9193 }, 9194 { 9195 .name = "usage_percpu", 9196 .read_seq_string = cpuacct_percpu_seq_read, 9197 }, 9198 { 9199 .name = "stat", 9200 .read_map = cpuacct_stats_show, 9201 }, 9202}; 9203 9204static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 9205{ 9206 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); 9207} 9208 9209/* 9210 * charge this task's execution time to its accounting group. 9211 * 9212 * called with rq->lock held. 9213 */ 9214static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 9215{ 9216 struct cpuacct *ca; 9217 int cpu; 9218 9219 if (unlikely(!cpuacct_subsys.active)) 9220 return; 9221 9222 cpu = task_cpu(tsk); 9223 9224 rcu_read_lock(); 9225 9226 ca = task_ca(tsk); 9227 9228 for (; ca; ca = ca->parent) { 9229 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 9230 *cpuusage += cputime; 9231 } 9232 9233 rcu_read_unlock(); 9234} 9235 9236/* 9237 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large 9238 * in cputime_t units. As a result, cpuacct_update_stats calls 9239 * percpu_counter_add with values large enough to always overflow the 9240 * per cpu batch limit causing bad SMP scalability. 9241 * 9242 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we 9243 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled 9244 * and enabled. We cap it at INT_MAX which is the largest allowed batch value. 
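 * (With CONFIG_VIRT_CPU_ACCOUNTING disabled a jiffy is one cputime unit, so
 * the batch is simply percpu_counter_batch; with it enabled a single jiffy
 * can be worth millions of cputime units, and the multiplication keeps
 * roughly the same number of jiffies per batch, clamped to INT_MAX by the
 * min_t() below.)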
9245 */ 9246#ifdef CONFIG_SMP 9247#define CPUACCT_BATCH \ 9248 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) 9249#else 9250#define CPUACCT_BATCH 0 9251#endif 9252 9253/* 9254 * Charge the system/user time to the task's accounting group. 9255 */ 9256static void cpuacct_update_stats(struct task_struct *tsk, 9257 enum cpuacct_stat_index idx, cputime_t val) 9258{ 9259 struct cpuacct *ca; 9260 int batch = CPUACCT_BATCH; 9261 9262 if (unlikely(!cpuacct_subsys.active)) 9263 return; 9264 9265 rcu_read_lock(); 9266 ca = task_ca(tsk); 9267 9268 do { 9269 __percpu_counter_add(&ca->cpustat[idx], val, batch); 9270 ca = ca->parent; 9271 } while (ca); 9272 rcu_read_unlock(); 9273} 9274 9275struct cgroup_subsys cpuacct_subsys = { 9276 .name = "cpuacct", 9277 .create = cpuacct_create, 9278 .destroy = cpuacct_destroy, 9279 .populate = cpuacct_populate, 9280 .subsys_id = cpuacct_subsys_id, 9281}; 9282#endif /* CONFIG_CGROUP_CPUACCT */ 9283 9284#ifndef CONFIG_SMP 9285 9286void synchronize_sched_expedited(void) 9287{ 9288 barrier(); 9289} 9290EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 9291 9292#else /* #ifndef CONFIG_SMP */ 9293 9294static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); 9295 9296static int synchronize_sched_expedited_cpu_stop(void *data) 9297{ 9298 /* 9299 * There must be a full memory barrier on each affected CPU 9300 * between the time that try_stop_cpus() is called and the 9301 * time that it returns. 9302 * 9303 * In the current initial implementation of cpu_stop, the 9304 * above condition is already met when the control reaches 9305 * this point and the following smp_mb() is not strictly 9306 * necessary. Do smp_mb() anyway for documentation and 9307 * robustness against future implementation changes. 9308 */ 9309 smp_mb(); /* See above comment block. */ 9310 return 0; 9311} 9312 9313/* 9314 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 9315 * approach to force grace period to end quickly. This consumes 9316 * significant time on all CPUs, and is thus not recommended for 9317 * any sort of common-case code. 9318 * 9319 * Note that it is illegal to call this function while holding any 9320 * lock that is acquired by a CPU-hotplug notifier. Failing to 9321 * observe this restriction will result in deadlock. 9322 */ 9323void synchronize_sched_expedited(void) 9324{ 9325 int snap, trycount = 0; 9326 9327 smp_mb(); /* ensure prior mod happens before capturing snap. */ 9328 snap = atomic_read(&synchronize_sched_expedited_count) + 1; 9329 get_online_cpus(); 9330 while (try_stop_cpus(cpu_online_mask, 9331 synchronize_sched_expedited_cpu_stop, 9332 NULL) == -EAGAIN) { 9333 put_online_cpus(); 9334 if (trycount++ < 10) 9335 udelay(trycount * num_online_cpus()); 9336 else { 9337 synchronize_sched(); 9338 return; 9339 } 9340 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { 9341 smp_mb(); /* ensure test happens before caller kfree */ 9342 return; 9343 } 9344 get_online_cpus(); 9345 } 9346 atomic_inc(&synchronize_sched_expedited_count); 9347 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ 9348 put_online_cpus(); 9349} 9350EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 9351 9352#endif /* #else #ifndef CONFIG_SMP */ 9353
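/*
 * Example (hypothetical updater; gp, gp_lock, old and new are illustrative
 * names, not kernel symbols): code whose readers run in preempt-disabled
 * sections (sched-RCU, e.g. under rcu_read_lock_sched()) can publish a new
 * object and use the expedited grace period above before freeing the old
 * one:
 *
 *	spin_lock(&gp_lock);
 *	old = gp;
 *	rcu_assign_pointer(gp, new);
 *	spin_unlock(&gp_lock);
 *	synchronize_sched_expedited();	(every CPU has left its read side)
 *	kfree(old);
 */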