/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 * Interactivity improvements by Mike Galbraith
 * (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 * Various enhancements by Dmitry Adamushko.
 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 * Group scheduling enhancements by Srivatsa Vaddagiri
 * Copyright IBM Corporation, 2007
 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 * Scaled math optimizations by Thomas Gleixner
 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 */

#include <linux/latencytop.h>
#include <linux/sched.h>

/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 */
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
 *
 * Options are:
 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling
	= SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * sys_sched_yield() compat mode
 *
 * This option switches the aggressive yield implementation of the
 * old scheduler back on.
 */
unsigned int __read_mostly sysctl_sched_compat_yield;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
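 *
 * (with the 1 msec default, a newly woken task preempts the running
 *  one only once its vruntime trails by more than ~1 msec of nice-0
 *  weighted time - see wakeup_gran() and wakeup_preempt_entity() below)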
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

static const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!se->my_q)

static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!entity_is_task(se));
#endif
	return container_of(se, struct task_struct, se);
}

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}

/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
 * another cpu ('this_cpu')
 */
static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
	return cfs_rq->tg->cfs_rq[this_cpu];
}

/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group ? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return 1;

	return 0;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}

/* return depth at which a sched entity is present in the hierarchy */
static inline int depth_se(struct sched_entity *se)
{
	int depth = 0;

	for_each_sched_entity(se)
		depth++;

	return depth;
}

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * A preemption test can only be made between sibling entities that
	 * are in the same cfs_rq, i.e. that have a common parent. Walk up
	 * the hierarchy of both tasks until we find ancestors that are
	 * siblings of a common parent.
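	 *
	 * E.g. a task in group /A (depth 2, counting the task itself) and
	 * a task in group /B/C (depth 3) are first brought to equal depth,
	 * then walked up in lockstep until both point at entities queued
	 * on the same cfs_rq.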
	 */

	/* First walk up until both entities are at same depth */
	se_depth = depth_se(*se);
	pse_depth = depth_se(*pse);

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}

#else	/* !CONFIG_FAIR_GROUP_SCHED */

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
}

#define entity_is_task(se)	1

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
	return &cpu_rq(this_cpu)->cfs;
}

#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	return 1;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

#endif	/* CONFIG_FAIR_GROUP_SCHED */


/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta > 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline int entity_before(struct sched_entity *a,
				struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}

static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return se->vruntime - cfs_rq->min_vruntime;
}

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	u64 vruntime = cfs_rq->min_vruntime;

	if (cfs_rq->curr)
		vruntime = cfs_rq->curr->vruntime;

	if (cfs_rq->rb_leftmost) {
		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
						   struct sched_entity,
						   run_node);

		if (!cfs_rq->curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
}

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	s64 key = entity_key(cfs_rq, se);
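	/*
	 * The key orders entities by vruntime relative to min_vruntime;
	 * the leftmost (smallest key) node is the next one to run.
	 */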
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We don't care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (key < entity_key(cfs_rq, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = cfs_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}

static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

	if (!last)
		return NULL;

	return rb_entry(last, struct sched_entity, run_node);
}

/**************************************************************
 * Scheduling class statistics methods:
 */

#ifdef CONFIG_SCHED_DEBUG
int sched_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	int factor = get_update_sysctl_factor();

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
	WRT_SYSCTL(sched_shares_ratelimit);
#undef WRT_SYSCTL

	return 0;
}
#endif

/*
 * delta /= w
 */
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);

	return delta;
}

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (nr_running > sched_nr_latency) we have
 * to stretch this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	if (unlikely(nr_running > nr_latency)) {
		period = sysctl_sched_min_granularity;
		period *= nr_running;
	}

	return period;
}

/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
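 *
 * With the defaults above: two equally weighted tasks each get a 3ms
 * slice of the 6ms period; sixteen tasks stretch the period to
 * 16 * 0.75ms = 12ms, leaving a 0.75ms slice each.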
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = calc_delta_mine(slice, se->load.weight, load);
	}
	return slice;
}

/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;

	schedstat_set(curr->statistics.exec_max,
		      max((u64)delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);
	delta_exec_weighted = calc_delta_fair(delta_exec, curr);

	curr->vruntime += delta_exec_weighted;
	update_min_vruntime(cfs_rq);
}

static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_of(cfs_rq)->clock;
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	delta_exec = (unsigned long)(now - curr->exec_start);
	if (!delta_exec)
		return;

	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}
}

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
}

/*
 * Task is being enqueued - update stats:
 */
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Are we enqueueing a waiting task?
	 * (for current tasks a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start(cfs_rq, se);
}

static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
			rq_of(cfs_rq)->clock - se->statistics.wait_start));
	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
			rq_of(cfs_rq)->clock - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
	if (entity_is_task(se)) {
		trace_sched_stat_wait(task_of(se),
			rq_of(cfs_rq)->clock - se->statistics.wait_start);
	}
#endif
	schedstat_set(se->statistics.wait_start, 0);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end(cfs_rq, se);
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_of(cfs_rq)->clock;
}

/**************************************************
 * Scheduling class queueing methods:
 */

#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
static void
add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
{
	cfs_rq->task_weight += weight;
}
#else
static inline void
add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
{
}
#endif

static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_add(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
	if (entity_is_task(se)) {
		add_cfs_task_weight(cfs_rq, se->load.weight);
		list_add(&se->group_node, &cfs_rq->tasks);
	}
	cfs_rq->nr_running++;
	se->on_rq = 1;
}

static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_sub(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
	if (entity_is_task(se)) {
		add_cfs_task_weight(cfs_rq, -se->load.weight);
		list_del_init(&se->group_node);
	}
	cfs_rq->nr_running--;
	se->on_rq = 0;
}

static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	struct task_struct *tsk = NULL;

	if (entity_is_task(se))
		tsk = task_of(se);

	if (se->statistics.sleep_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.sleep_max))
			se->statistics.sleep_max = delta;

		se->statistics.sleep_start = 0;
		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			account_scheduler_latency(tsk, delta >> 10, 1);
			trace_sched_stat_sleep(tsk, delta);
		}
	}
	if (se->statistics.block_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.block_max))
			se->statistics.block_max = delta;

		se->statistics.block_start = 0;
		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			if (tsk->in_iowait) {
				se->statistics.iowait_sum += delta;
				se->statistics.iowait_count++;
				trace_sched_stat_iowait(tsk, delta);
			}

			/*
			 * Blocking time is in units of nanosecs, so shift by
			 * 20 to get a milliseconds-range estimation of the
			 * amount of time that the task spent sleeping:
			 */
			if (unlikely(prof_on == SLEEP_PROFILING)) {
				profile_hits(SLEEP_PROFILING,
						(void *)get_wchan(tsk),
						delta >> 20);
			}
			account_scheduler_latency(tsk, delta >> 10, 0);
		}
	}
#endif
}

static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 d = se->vruntime - cfs_rq->min_vruntime;

	if (d < 0)
		d = -d;

	if (d > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}

static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	u64 vruntime = cfs_rq->min_vruntime;

	/*
	 * The 'current' period is already promised to the current tasks,
	 * however the extra weight of the new task will slow them down a
	 * little; place the new task so that it fits in the slot that
	 * stays open at the end.
	 */
	if (initial && sched_feat(START_DEBIT))
		vruntime += sched_vslice(cfs_rq, se);

	/* sleeps up to a single latency don't count. */
	if (!initial) {
		unsigned long thresh = sysctl_sched_latency;

		/*
		 * Halve their sleep time's effect, to allow
		 * for a gentler effect of sleepers:
		 */
		if (sched_feat(GENTLE_FAIR_SLEEPERS))
			thresh >>= 1;

		vruntime -= thresh;
	}

	/* ensure we never gain time by being placed backwards. */
	vruntime = max_vruntime(se->vruntime, vruntime);

	se->vruntime = vruntime;
}

static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update the normalized vruntime before updating min_vruntime
	 * through calling update_curr().
	 */
	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
		se->vruntime += cfs_rq->min_vruntime;

	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
	account_entity_enqueue(cfs_rq, se);

	if (flags & ENQUEUE_WAKEUP) {
		place_entity(cfs_rq, se, 0);
		enqueue_sleeper(cfs_rq, se);
	}

	update_stats_enqueue(cfs_rq, se);
	check_spread(cfs_rq, se);
	if (se != cfs_rq->curr)
		__enqueue_entity(cfs_rq, se);
}

static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (!se || cfs_rq->last == se)
		cfs_rq->last = NULL;

	if (!se || cfs_rq->next == se)
		cfs_rq->next = NULL;
}

static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	for_each_sched_entity(se)
		__clear_buddies(cfs_rq_of(se), se);
}

static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	update_stats_dequeue(cfs_rq, se);
	if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
		if (entity_is_task(se)) {
			struct task_struct *tsk = task_of(se);

			if (tsk->state & TASK_INTERRUPTIBLE)
				se->statistics.sleep_start = rq_of(cfs_rq)->clock;
			if (tsk->state & TASK_UNINTERRUPTIBLE)
				se->statistics.block_start = rq_of(cfs_rq)->clock;
		}
#endif
	}

	clear_buddies(cfs_rq, se);

	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	account_entity_dequeue(cfs_rq, se);
	update_min_vruntime(cfs_rq);

	/*
	 * Normalize the entity after updating the min_vruntime because the
	 * update can refer to the ->curr item and we need to reflect this
	 * movement in our normalized position.
	 */
	if (!(flags & DEQUEUE_SLEEP))
		se->vruntime -= cfs_rq->min_vruntime;
}

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;

	ideal_runtime = sched_slice(cfs_rq, curr);
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	if (delta_exec > ideal_runtime) {
		resched_task(rq_of(cfs_rq)->curr);
		/*
		 * The current task ran long enough, ensure it doesn't get
		 * re-elected due to buddy favours.
		 */
		clear_buddies(cfs_rq, curr);
		return;
	}

	/*
	 * Ensure that a task that missed wakeup preemption by a
	 * narrow margin doesn't have to wait for a full slice.
	 * This also mitigates buddy induced latencies under load.
	 */
	if (!sched_feat(WAKEUP_PREEMPT))
		return;

	if (delta_exec < sysctl_sched_min_granularity)
		return;

	if (cfs_rq->nr_running > 1) {
		struct sched_entity *se = __pick_next_entity(cfs_rq);
		s64 delta = curr->vruntime - se->vruntime;

		if (delta > ideal_runtime)
			resched_task(rq_of(cfs_rq)->curr);
	}
}

static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* 'current' is not kept within the tree. */
	if (se->on_rq) {
		/*
		 * Any task has to be enqueued before it gets to execute on
		 * a CPU. So account for the time it spent waiting on the
		 * runqueue.
		 */
		update_stats_wait_end(cfs_rq, se);
		__dequeue_entity(cfs_rq, se);
	}

	update_stats_curr_start(cfs_rq, se);
	cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
	/*
	 * Track our maximum slice length, if the CPU's load is at
	 * least twice that of our own weight (i.e. don't track it
	 * when there are only lesser-weight tasks around):
	 */
	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
		se->statistics.slice_max = max(se->statistics.slice_max,
			se->sum_exec_runtime - se->prev_sum_exec_runtime);
	}
#endif
	se->prev_sum_exec_runtime = se->sum_exec_runtime;
}

static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);

static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct sched_entity *se = __pick_next_entity(cfs_rq);
	struct sched_entity *left = se;

	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
		se = cfs_rq->next;

	/*
	 * Prefer last buddy, try to return the CPU to a preempted task.
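	 * A buddy is only used when the leftmost entity could not preempt
	 * it right back, i.e. when wakeup_preempt_entity(buddy, left)
	 * returns less than 1.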
	 */
	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
		se = cfs_rq->last;

	clear_buddies(cfs_rq, se);

	return se;
}

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
	/*
	 * If still on the runqueue then deactivate_task()
	 * was not called and update_curr() has to be done:
	 */
	if (prev->on_rq)
		update_curr(cfs_rq);

	check_spread(cfs_rq, prev);
	if (prev->on_rq) {
		update_stats_wait_start(cfs_rq, prev);
		/* Put 'current' back into the tree. */
		__enqueue_entity(cfs_rq, prev);
	}
	cfs_rq->curr = NULL;
}

static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

#ifdef CONFIG_SCHED_HRTICK
	/*
	 * queued ticks are scheduled to match the slice, so don't bother
	 * validating it and just reschedule.
	 */
	if (queued) {
		resched_task(rq_of(cfs_rq)->curr);
		return;
	}
	/*
	 * don't let the period tick interfere with the hrtick preemption
	 */
	if (!sched_feat(DOUBLE_TICK) &&
			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
		return;
#endif

	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
		check_preempt_tick(cfs_rq, curr);
}

/**************************************************
 * CFS operations on tasks:
 */

#ifdef CONFIG_SCHED_HRTICK
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	WARN_ON(task_rq(p) != rq);

	if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
		u64 slice = sched_slice(cfs_rq, se);
		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
		s64 delta = slice - ran;

		if (delta < 0) {
			if (rq->curr == p)
				resched_task(p);
			return;
		}

		/*
		 * Don't schedule slices shorter than 10000ns, that just
		 * doesn't make sense. Rely on vruntime for fairness.
		 */
		if (rq->curr != p)
			delta = max_t(s64, 10000LL, delta);

		hrtick_start(rq, delta);
	}
}

/*
 * called from enqueue/dequeue and updates the hrtick when the
 * current task is from our class and nr_running is low enough
 * to matter.
 */
static void hrtick_update(struct rq *rq)
{
	struct task_struct *curr = rq->curr;

	if (curr->sched_class != &fair_sched_class)
		return;

	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
		hrtick_start_fair(rq, curr);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}

static inline void hrtick_update(struct rq *rq)
{
}
#endif

/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {
		if (se->on_rq)
			break;
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, flags);
		flags = ENQUEUE_WAKEUP;
	}

	hrtick_update(rq);
}

/*
 * The dequeue_task method is called before nr_running is
 * decreased.
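 * For group scheduling, walking up the hierarchy stops at the first
 * parent that still has other entities queued.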
 * We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		dequeue_entity(cfs_rq, se, flags);
		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight)
			break;
		flags |= DEQUEUE_SLEEP;
	}

	hrtick_update(rq);
}

/*
 * sched_yield() support is very simple - we dequeue and enqueue.
 *
 * If compat_yield is turned on then we requeue to the end of the tree.
 */
static void yield_task_fair(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
	struct sched_entity *rightmost, *se = &curr->se;

	/*
	 * Are we the only task in the tree?
	 */
	if (unlikely(cfs_rq->nr_running == 1))
		return;

	clear_buddies(cfs_rq, se);

	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
		update_rq_clock(rq);
		/*
		 * Update run-time statistics of the 'current'.
		 */
		update_curr(cfs_rq);

		return;
	}
	/*
	 * Find the rightmost entry in the rbtree:
	 */
	rightmost = __pick_last_entity(cfs_rq);
	/*
	 * Already in the rightmost position?
	 */
	if (unlikely(!rightmost || entity_before(rightmost, se)))
		return;

	/*
	 * Minimally necessary key value to be last in the tree:
	 * Upon rescheduling, sched_class::put_prev_task() will place
	 * 'current' within the tree based on its new key value.
	 */
	se->vruntime = rightmost->vruntime + 1;
}

#ifdef CONFIG_SMP

static void task_waking_fair(struct rq *rq, struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	se->vruntime -= cfs_rq->min_vruntime;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * effective_load() calculates the load change as seen from the root_task_group
 *
 * Adding load to a group doesn't make a group heavier, but can cause movement
 * of group shares between cpus. Assuming the shares were perfectly aligned one
 * can calculate the shift in shares.
 *
 * The problem is that perfectly aligning the shares is rather expensive, hence
 * we try to avoid doing that too often - see update_shares(), which ratelimits
 * this change.
 *
 * We compensate this by not only taking the current delta into account, but
 * also considering the delta between when the shares were last adjusted and
 * now.
 *
 * We still saw a performance dip; some tracing showed that between cgroup:/
 * and cgroup:/foo balancing, the number of affine wakeups increased
 * significantly. Therefore try to bias the error in the direction of failing
 * the affine wakeup.
 *
 */
static long effective_load(struct task_group *tg, int cpu,
		long wl, long wg)
{
	struct sched_entity *se = tg->se[cpu];

	if (!tg->parent)
		return wl;

	/*
	 * By not taking the decrease of shares on the other cpu into
	 * account our error leans towards reducing the affine wakeups.
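	 *
	 * Per hierarchy level the loop below computes, with S the group's
	 * total shares, s its shares on this cpu, rw this cpu's runqueue
	 * weight, wl the load delta and wg the group weight delta:
	 *
	 *   wl' = s*(S*(rw+wl) - (S*rw + s*wg)) / (S*rw + s*wg)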
	 */
	if (!wl && sched_feat(ASYM_EFF_LOAD))
		return wl;

	for_each_sched_entity(se) {
		long S, rw, s, a, b;
		long more_w;

		/*
		 * Instead of using this increment, also add the difference
		 * between when the shares were last updated and now.
		 */
		more_w = se->my_q->load.weight - se->my_q->rq_weight;
		wl += more_w;
		wg += more_w;

		S = se->my_q->tg->shares;
		s = se->my_q->shares;
		rw = se->my_q->rq_weight;

		a = S*(rw + wl);
		b = S*rw + s*wg;

		wl = s*(a-b);

		if (likely(b))
			wl /= b;

		/*
		 * Assume the group is already running and will
		 * thus already be accounted for in the weight.
		 *
		 * That is, moving shares between CPUs does not
		 * alter the group weight.
		 */
		wg = 0;
	}

	return wl;
}

#else

static inline unsigned long effective_load(struct task_group *tg, int cpu,
		unsigned long wl, unsigned long wg)
{
	return wl;
}

#endif

static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
	unsigned long this_load, load;
	int idx, this_cpu, prev_cpu;
	unsigned long tl_per_task;
	struct task_group *tg;
	unsigned long weight;
	int balanced;

	idx = sd->wake_idx;
	this_cpu = smp_processor_id();
	prev_cpu = task_cpu(p);
	load = source_load(prev_cpu, idx);
	this_load = target_load(this_cpu, idx);

	/*
	 * If sync wakeup then subtract the (maximum possible)
	 * effect of the currently running task from the load
	 * of the current CPU:
	 */
	rcu_read_lock();
	if (sync) {
		tg = task_group(current);
		weight = current->se.load.weight;

		this_load += effective_load(tg, this_cpu, -weight, -weight);
		load += effective_load(tg, prev_cpu, 0, -weight);
	}

	tg = task_group(p);
	weight = p->se.load.weight;

	/*
	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
	 * due to the sync cause above having dropped this_load to 0, we'll
	 * always have an imbalance, but there's really nothing you can do
	 * about that, so that's good too.
	 *
	 * Otherwise check if either cpu is near enough in load to allow this
	 * task to be woken on this_cpu.
	 */
	if (this_load) {
		unsigned long this_eff_load, prev_eff_load;

		this_eff_load = 100;
		this_eff_load *= power_of(prev_cpu);
		this_eff_load *= this_load +
			effective_load(tg, this_cpu, weight, weight);

		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
		prev_eff_load *= power_of(this_cpu);
		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);

		balanced = this_eff_load <= prev_eff_load;
	} else
		balanced = true;
	rcu_read_unlock();

	/*
	 * If the currently running task will sleep within
	 * a reasonable amount of time then attract this newly
	 * woken task:
	 */
	if (sync && balanced)
		return 1;

	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
	tl_per_task = cpu_avg_load_per_task(this_cpu);

	if (balanced ||
	    (this_load <= load &&
	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
		/*
		 * This domain has SD_WAKE_AFFINE and
		 * p is cache cold in this domain, and
		 * there is no bad imbalance.
		 */
		schedstat_inc(sd, ttwu_move_affine);
		schedstat_inc(p, se.statistics.nr_wakeups_affine);

		return 1;
	}
	return 0;
}

/*
 * find_idlest_group finds and returns the least busy CPU group within the
 * domain.
 */
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
		  int this_cpu, int load_idx)
{
	struct sched_group *idlest = NULL, *group = sd->groups;
	unsigned long min_load = ULONG_MAX, this_load = 0;
	int imbalance = 100 + (sd->imbalance_pct-100)/2;

	do {
		unsigned long load, avg_load;
		int local_group;
		int i;

		/* Skip over this group if it has no CPUs allowed */
		if (!cpumask_intersects(sched_group_cpus(group),
					&p->cpus_allowed))
			continue;

		local_group = cpumask_test_cpu(this_cpu,
					       sched_group_cpus(group));

		/* Tally up the load of all CPUs in the group */
		avg_load = 0;

		for_each_cpu(i, sched_group_cpus(group)) {
			/* Bias balancing toward cpus of our domain */
			if (local_group)
				load = source_load(i, load_idx);
			else
				load = target_load(i, load_idx);

			avg_load += load;
		}

		/* Adjust by relative CPU power of the group */
		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;

		if (local_group) {
			this_load = avg_load;
		} else if (avg_load < min_load) {
			min_load = avg_load;
			idlest = group;
		}
	} while (group = group->next, group != sd->groups);

	if (!idlest || 100*this_load < imbalance*min_load)
		return NULL;
	return idlest;
}

/*
 * find_idlest_cpu - find the idlest cpu among the cpus in group.
 */
static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
	unsigned long load, min_load = ULONG_MAX;
	int idlest = -1;
	int i;

	/* Traverse only the allowed CPUs */
	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
		load = weighted_cpuload(i);

		if (load < min_load || (load == min_load && i == this_cpu)) {
			min_load = load;
			idlest = i;
		}
	}

	return idlest;
}

/*
 * Try and locate an idle CPU in the sched_domain.
 */
static int select_idle_sibling(struct task_struct *p, int target)
{
	int cpu = smp_processor_id();
	int prev_cpu = task_cpu(p);
	struct sched_domain *sd;
	int i;

	/*
	 * If the task is going to be woken-up on this cpu and if it is
	 * already idle, then it is the right target.
	 */
	if (target == cpu && idle_cpu(cpu))
		return cpu;

	/*
	 * If the task is going to be woken-up on the cpu where it previously
	 * ran and if it is currently idle, then it is the right target.
	 */
	if (target == prev_cpu && idle_cpu(prev_cpu))
		return prev_cpu;

	/*
	 * Otherwise, iterate the domains and find an eligible idle cpu.
	 */
	for_each_domain(target, sd) {
		if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
			break;

		for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
			if (idle_cpu(i)) {
				target = i;
				break;
			}
		}

		/*
		 * Let's stop looking for an idle sibling when we reached
		 * the domain that spans the current cpu and prev_cpu.
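		 * At that point the search has already covered the
		 * cache-sharing siblings of both candidate cpus.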
		 */
		if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
		    cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
			break;
	}

	return target;
}

/*
 * select_task_rq_fair: balance the current task (running on cpu) in domains
 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
 * SD_BALANCE_EXEC.
 *
 * Balance, i.e. select the least loaded group.
 *
 * Returns the target CPU number, or the same CPU if no balancing is needed.
 *
 * preempt must be disabled.
 */
static int
select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
{
	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
	int cpu = smp_processor_id();
	int prev_cpu = task_cpu(p);
	int new_cpu = cpu;
	int want_affine = 0;
	int want_sd = 1;
	int sync = wake_flags & WF_SYNC;

	if (sd_flag & SD_BALANCE_WAKE) {
		if (cpumask_test_cpu(cpu, &p->cpus_allowed))
			want_affine = 1;
		new_cpu = prev_cpu;
	}

	for_each_domain(cpu, tmp) {
		if (!(tmp->flags & SD_LOAD_BALANCE))
			continue;

		/*
		 * If power savings logic is enabled for a domain, see if we
		 * are not overloaded; if so, don't balance wider.
		 */
		if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
			unsigned long power = 0;
			unsigned long nr_running = 0;
			unsigned long capacity;
			int i;

			for_each_cpu(i, sched_domain_span(tmp)) {
				power += power_of(i);
				nr_running += cpu_rq(i)->cfs.nr_running;
			}

			capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);

			if (tmp->flags & SD_POWERSAVINGS_BALANCE)
				nr_running /= 2;

			if (nr_running < capacity)
				want_sd = 0;
		}

		/*
		 * If both cpu and prev_cpu are part of this domain,
		 * cpu is a valid SD_WAKE_AFFINE target.
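		 * wake_affine() can then meaningfully compare waking the
		 * task here against leaving it on prev_cpu.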
		 */
		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
			affine_sd = tmp;
			want_affine = 0;
		}

		if (!want_sd && !want_affine)
			break;

		if (!(tmp->flags & sd_flag))
			continue;

		if (want_sd)
			sd = tmp;
	}

#ifdef CONFIG_FAIR_GROUP_SCHED
	if (sched_feat(LB_SHARES_UPDATE)) {
		/*
		 * Pick the largest domain to update shares over
		 */
		tmp = sd;
		if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
			tmp = affine_sd;

		if (tmp) {
			raw_spin_unlock(&rq->lock);
			update_shares(tmp);
			raw_spin_lock(&rq->lock);
		}
	}
#endif

	if (affine_sd) {
		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
			return select_idle_sibling(p, cpu);
		else
			return select_idle_sibling(p, prev_cpu);
	}

	while (sd) {
		int load_idx = sd->forkexec_idx;
		struct sched_group *group;
		int weight;

		if (!(sd->flags & sd_flag)) {
			sd = sd->child;
			continue;
		}

		if (sd_flag & SD_BALANCE_WAKE)
			load_idx = sd->wake_idx;

		group = find_idlest_group(sd, p, cpu, load_idx);
		if (!group) {
			sd = sd->child;
			continue;
		}

		new_cpu = find_idlest_cpu(group, p, cpu);
		if (new_cpu == -1 || new_cpu == cpu) {
			/* Now try balancing at a lower domain level of cpu */
			sd = sd->child;
			continue;
		}

		/* Now try balancing at a lower domain level of new_cpu */
		cpu = new_cpu;
		weight = sd->span_weight;
		sd = NULL;
		for_each_domain(cpu, tmp) {
			if (weight <= tmp->span_weight)
				break;
			if (tmp->flags & sd_flag)
				sd = tmp;
		}
		/* while loop will break here if sd == NULL */
	}

	return new_cpu;
}
#endif /* CONFIG_SMP */

static unsigned long
wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
{
	unsigned long gran = sysctl_sched_wakeup_granularity;

	/*
	 * Since it's curr running now, convert the gran from real-time
	 * to virtual-time in its units.
	 *
	 * By using 'se' instead of 'curr' we penalize light tasks, so
	 * they get preempted easier. That is, if 'se' < 'curr' then
	 * the resulting gran will be larger, therefore penalizing the
	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
	 * be smaller, again penalizing the lighter task.
	 *
	 * This is especially important for buddies when the leftmost
	 * task is higher priority than the buddy.
	 */
	if (unlikely(se->load.weight != NICE_0_LOAD))
		gran = calc_delta_fair(gran, se);

	return gran;
}

/*
 * Should 'se' preempt 'curr'.
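 *
 * Returns -1 if 'curr' is not ahead of 'se' in vruntime, 1 if 'curr'
 * leads 'se' by more than the (scaled) wakeup granularity 'g', and
 * 0 otherwise: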
 *
 *             |s1
 *        |s2
 *   |s3
 *       g
 *      |<--->|c
 *
 *  w(c, s1) = -1
 *  w(c, s2) =  0
 *  w(c, s3) =  1
 *
 */
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
	s64 gran, vdiff = curr->vruntime - se->vruntime;

	if (vdiff <= 0)
		return -1;

	gran = wakeup_gran(curr, se);
	if (vdiff > gran)
		return 1;

	return 0;
}

static void set_last_buddy(struct sched_entity *se)
{
	if (likely(task_of(se)->policy != SCHED_IDLE)) {
		for_each_sched_entity(se)
			cfs_rq_of(se)->last = se;
	}
}

static void set_next_buddy(struct sched_entity *se)
{
	if (likely(task_of(se)->policy != SCHED_IDLE)) {
		for_each_sched_entity(se)
			cfs_rq_of(se)->next = se;
	}
}

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
	struct task_struct *curr = rq->curr;
	struct sched_entity *se = &curr->se, *pse = &p->se;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
	int scale = cfs_rq->nr_running >= sched_nr_latency;

	if (unlikely(rt_prio(p->prio)))
		goto preempt;

	if (unlikely(p->sched_class != &fair_sched_class))
		return;

	if (unlikely(se == pse))
		return;

	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
		set_next_buddy(pse);

	/*
	 * We can come here with TIF_NEED_RESCHED already set from the new
	 * task wake up path.
	 */
	if (test_tsk_need_resched(curr))
		return;

	/*
	 * Batch and idle tasks do not preempt (their preemption is driven by
	 * the tick):
	 */
	if (unlikely(p->policy != SCHED_NORMAL))
		return;

	/* Idle tasks are by definition preempted by everybody. */
	if (unlikely(curr->policy == SCHED_IDLE))
		goto preempt;

	if (!sched_feat(WAKEUP_PREEMPT))
		return;

	update_curr(cfs_rq);
	find_matching_se(&se, &pse);
	BUG_ON(!pse);
	if (wakeup_preempt_entity(se, pse) == 1)
		goto preempt;

	return;

preempt:
	resched_task(curr);
	/*
	 * Only set the backward buddy when the current task is still
	 * on the rq. This can happen when a wakeup gets interleaved
	 * with schedule on the ->pre_schedule() or idle_balance()
	 * point, either of which can drop the rq lock.
	 *
	 * Also, during early boot the idle thread is in the fair class,
	 * for obvious reasons it's a bad idea to schedule back to it.
	 */
	if (unlikely(!se->on_rq || curr == rq->idle))
		return;

	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
		set_last_buddy(se);
}

static struct task_struct *pick_next_task_fair(struct rq *rq)
{
	struct task_struct *p;
	struct cfs_rq *cfs_rq = &rq->cfs;
	struct sched_entity *se;

	if (!cfs_rq->nr_running)
		return NULL;

	do {
		se = pick_next_entity(cfs_rq);
		set_next_entity(cfs_rq, se);
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	p = task_of(se);
	hrtick_start_fair(rq, p);

	return p;
}

/*
 * Account for a descheduled task:
 */
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
	struct sched_entity *se = &prev->se;
	struct cfs_rq *cfs_rq;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		put_prev_entity(cfs_rq, se);
	}
}

#ifdef CONFIG_SMP
/**************************************************
 * Fair scheduling class load-balancing methods:
 */

/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
		      struct rq *this_rq, int this_cpu)
{
	deactivate_task(src_rq, p, 0);
	set_task_cpu(p, this_cpu);
	activate_task(this_rq, p, 0);
	check_preempt_curr(this_rq, p, 0);
}

/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
static
int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
		     struct sched_domain *sd, enum cpu_idle_type idle,
		     int *all_pinned)
{
	int tsk_cache_hot = 0;
	/*
	 * We do not migrate tasks that are:
	 * 1) running (obviously), or
	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
	 * 3) are cache-hot on their current CPU.
	 */
	if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
		return 0;
	}
	*all_pinned = 0;

	if (task_running(rq, p)) {
		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
		return 0;
	}

	/*
	 * Aggressive migration if:
	 * 1) task is cache cold, or
	 * 2) too many balance attempts have failed.
	 */

	tsk_cache_hot = task_hot(p, rq->clock, sd);
	if (!tsk_cache_hot ||
		sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
		if (tsk_cache_hot) {
			schedstat_inc(sd, lb_hot_gained[idle]);
			schedstat_inc(p, se.statistics.nr_forced_migrations);
		}
#endif
		return 1;
	}

	if (tsk_cache_hot) {
		schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
		return 0;
	}
	return 1;
}

/*
 * move_one_task tries to move exactly one task from busiest to this_rq, as
 * part of active balancing operations within "domain".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
static int
move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
	      struct sched_domain *sd, enum cpu_idle_type idle)
{
	struct task_struct *p, *n;
	struct cfs_rq *cfs_rq;
	int pinned = 0;

	for_each_leaf_cfs_rq(busiest, cfs_rq) {
		list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {

			if (!can_migrate_task(p, busiest, this_cpu,
						sd, idle, &pinned))
				continue;

			pull_task(busiest, p, this_rq, this_cpu);
			/*
			 * Right now, this is only the second place pull_task()
			 * is called, so we can safely collect pull_task()
			 * stats here rather than inside pull_task().
			 */
			schedstat_inc(sd, lb_gained[idle]);
			return 1;
		}
	}

	return 0;
}

static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
	      unsigned long max_load_move, struct sched_domain *sd,
	      enum cpu_idle_type idle, int *all_pinned,
	      int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
{
	int loops = 0, pulled = 0, pinned = 0;
	long rem_load_move = max_load_move;
	struct task_struct *p, *n;

	if (max_load_move == 0)
		goto out;

	pinned = 1;

	list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
		if (loops++ > sysctl_sched_nr_migrate)
			break;

		if ((p->se.load.weight >> 1) > rem_load_move ||
		    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
			continue;

		pull_task(busiest, p, this_rq, this_cpu);
		pulled++;
		rem_load_move -= p->se.load.weight;

#ifdef CONFIG_PREEMPT
		/*
		 * NEWIDLE balancing is a source of latency, so preemptible
		 * kernels will stop after the first task is pulled to minimize
		 * the critical section.
		 */
		if (idle == CPU_NEWLY_IDLE)
			break;
#endif

		/*
		 * We only want to steal up to the prescribed amount of
		 * weighted load.
		 */
		if (rem_load_move <= 0)
			break;

		if (p->prio < *this_best_prio)
			*this_best_prio = p->prio;
	}
out:
	/*
	 * Right now, this is one of only two places pull_task() is called,
	 * so we can safely collect pull_task() stats here rather than
	 * inside pull_task().
	 */
	schedstat_add(sd, lb_gained[idle], pulled);

	if (all_pinned)
		*all_pinned = pinned;

	return max_load_move - rem_load_move;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
		  unsigned long max_load_move,
		  struct sched_domain *sd, enum cpu_idle_type idle,
		  int *all_pinned, int *this_best_prio)
{
	long rem_load_move = max_load_move;
	int busiest_cpu = cpu_of(busiest);
	struct task_group *tg;

	rcu_read_lock();
	update_h_load(busiest_cpu);

	list_for_each_entry_rcu(tg, &task_groups, list) {
		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
		u64 rem_load, moved_load;

		/*
		 * empty group
		 */
		if (!busiest_cfs_rq->task_weight)
			continue;

		rem_load = (u64)rem_load_move * busiest_weight;
		rem_load = div_u64(rem_load, busiest_h_load + 1);

		moved_load = balance_tasks(this_rq, this_cpu, busiest,
				rem_load, sd, idle, all_pinned, this_best_prio,
				busiest_cfs_rq);

		if (!moved_load)
			continue;

		moved_load *= busiest_h_load;
		moved_load = div_u64(moved_load, busiest_weight + 1);

		rem_load_move -= moved_load;
		if (rem_load_move < 0)
			break;
	}
	rcu_read_unlock();

	return max_load_move - rem_load_move;
}
#else
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
		  unsigned long max_load_move,
		  struct sched_domain *sd, enum cpu_idle_type idle,
		  int *all_pinned, int *this_best_prio)
{
	return balance_tasks(this_rq, this_cpu, busiest,
			max_load_move, sd, idle, all_pinned,
			this_best_prio, &busiest->cfs);
}
#endif

/*
 * move_tasks tries to move up to max_load_move weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
		      unsigned long max_load_move,
		      struct sched_domain *sd, enum cpu_idle_type idle,
		      int *all_pinned)
{
	unsigned long total_load_moved = 0, load_moved;
	int this_best_prio = this_rq->curr->prio;

	do {
		load_moved = load_balance_fair(this_rq, this_cpu, busiest,
				max_load_move - total_load_moved,
				sd, idle, all_pinned, &this_best_prio);

		total_load_moved += load_moved;

#ifdef CONFIG_PREEMPT
		/*
		 * NEWIDLE balancing is a source of latency, so preemptible
		 * kernels will stop after the first task is pulled to minimize
		 * the critical section.
		 */
		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
			break;

		if (raw_spin_is_contended(&this_rq->lock) ||
				raw_spin_is_contended(&busiest->lock))
			break;
#endif
	} while (load_moved && max_load_move > total_load_moved);

	return total_load_moved > 0;
}

/********** Helpers for find_busiest_group ************************/
/*
 * sd_lb_stats - Structure to store the statistics of a sched_domain
 *		during load balancing.
 */
struct sd_lb_stats {
	struct sched_group *busiest; /* Busiest group in this sd */
	struct sched_group *this;    /* Local group in this sd */
	unsigned long total_load;    /* Total load of all groups in sd */
	unsigned long total_pwr;     /* Total power of all groups in sd */
	unsigned long avg_load;	     /* Average load across all groups in sd */

	/** Statistics of this group */
	unsigned long this_load;
	unsigned long this_load_per_task;
	unsigned long this_nr_running;

	/* Statistics of the busiest group */
	unsigned long max_load;
	unsigned long busiest_load_per_task;
	unsigned long busiest_nr_running;
	unsigned long busiest_group_capacity;

	int group_imb; /* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
	int power_savings_balance; /* Is powersave balance needed for this sd */
	struct sched_group *group_min; /* Least loaded group in sd */
	struct sched_group *group_leader; /* Group which relieves group_min */
	unsigned long min_load_per_task; /* load_per_task in group_min */
	unsigned long leader_nr_running; /* Nr running of group_leader */
	unsigned long min_nr_running; /* Nr running of group_min */
#endif
};

/*
 * sg_lb_stats - stats of a sched_group required for load_balancing
 */
struct sg_lb_stats {
	unsigned long avg_load; /* Avg load across the CPUs of the group */
	unsigned long group_load; /* Total load over the CPUs of the group */
	unsigned long sum_nr_running; /* Nr tasks running in the group */
	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
	unsigned long group_capacity;
	int group_imb; /* Is there an imbalance in the group ? */
};

/**
 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
 * @group: The group whose first cpu is to be returned.
 */
static inline unsigned int group_first_cpu(struct sched_group *group)
{
	return cpumask_first(sched_group_cpus(group));
}

/**
 * get_sd_load_idx - Obtain the load index for a given sched domain.
 * @sd: The sched_domain whose load_idx is to be obtained.
 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
 */
static inline int get_sd_load_idx(struct sched_domain *sd,
					enum cpu_idle_type idle)
{
	int load_idx;

	switch (idle) {
	case CPU_NOT_IDLE:
		load_idx = sd->busy_idx;
		break;

	case CPU_NEWLY_IDLE:
		load_idx = sd->newidle_idx;
		break;
	default:
		load_idx = sd->idle_idx;
		break;
	}

	return load_idx;
}


#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
/**
 * init_sd_power_savings_stats - Initialize power savings statistics for
 * the given sched_domain, during load balancing.
 *
 * @sd: Sched domain whose power-savings statistics are to be initialized.
 * @sds: Variable containing the statistics for sd.
 * @idle: Idle status of the CPU at which we're performing load-balancing.
 */
static inline void init_sd_power_savings_stats(struct sched_domain *sd,
	struct sd_lb_stats *sds, enum cpu_idle_type idle)
{
	/*
	 * Busy processors will not participate in power savings
	 * balance.

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
/**
 * init_sd_power_savings_stats - Initialize power savings statistics for
 * the given sched_domain, during load balancing.
 *
 * @sd: Sched domain whose power-savings statistics are to be initialized.
 * @sds: Variable containing the statistics for sd.
 * @idle: Idle status of the CPU at which we're performing load-balancing.
 */
static inline void init_sd_power_savings_stats(struct sched_domain *sd,
	struct sd_lb_stats *sds, enum cpu_idle_type idle)
{
	/*
	 * Busy processors will not participate in power savings
	 * balance.
	 */
	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
		sds->power_savings_balance = 0;
	else {
		sds->power_savings_balance = 1;
		sds->min_nr_running = ULONG_MAX;
		sds->leader_nr_running = 0;
	}
}

/**
 * update_sd_power_savings_stats - Update the power saving stats for a
 * sched_domain while performing load balancing.
 *
 * @group: sched_group belonging to the sched_domain under consideration.
 * @sds: Variable containing the statistics of the sched_domain
 * @local_group: Does group contain the CPU for which we're performing
 *		load balancing?
 * @sgs: Variable containing the statistics of the group.
 */
static inline void update_sd_power_savings_stats(struct sched_group *group,
	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
	if (!sds->power_savings_balance)
		return;

	/*
	 * If the local group is idle or completely loaded
	 * no need to do power savings balance at this domain
	 */
	if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
			    !sds->this_nr_running))
		sds->power_savings_balance = 0;

	/*
	 * If a group is already running at full capacity or idle,
	 * don't include that group in power savings calculations
	 */
	if (!sds->power_savings_balance ||
	    sgs->sum_nr_running >= sgs->group_capacity ||
	    !sgs->sum_nr_running)
		return;

	/*
	 * Calculate the group which has the least non-idle load.
	 * This is the group from which we need to pick up the load
	 * for saving power
	 */
	if ((sgs->sum_nr_running < sds->min_nr_running) ||
	    (sgs->sum_nr_running == sds->min_nr_running &&
	     group_first_cpu(group) > group_first_cpu(sds->group_min))) {
		sds->group_min = group;
		sds->min_nr_running = sgs->sum_nr_running;
		sds->min_load_per_task = sgs->sum_weighted_load /
					 sgs->sum_nr_running;
	}

	/*
	 * Calculate the group which is nearly at its capacity but
	 * still has some room to pick up some load from other groups
	 * and save more power
	 */
	if (sgs->sum_nr_running + 1 > sgs->group_capacity)
		return;

	if (sgs->sum_nr_running > sds->leader_nr_running ||
	    (sgs->sum_nr_running == sds->leader_nr_running &&
	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
		sds->group_leader = group;
		sds->leader_nr_running = sgs->sum_nr_running;
	}
}
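/*
 * Illustration: update_sd_power_savings_stats() above keeps two running
 * picks, the group with the fewest non-zero tasks (group_min, to be drained)
 * and the fullest group that still has room (group_leader, to absorb them).
 * A minimal standalone sketch of that selection over toy groups follows;
 * it is not kernel code, the struct and the tie-break handling are invented
 * for the example, and the cpu-number tie-breaks are omitted.
 */
#if 0	/* standalone userspace sketch, not part of the kernel build */
#include <stdio.h>

/* Hypothetical toy group: first cpu number, running tasks, task capacity. */
struct toy_group { int first_cpu; unsigned long nr_running, capacity; };

int main(void)
{
	struct toy_group g[] = {
		{ 0, 4, 4 },	/* full: ignored for both roles */
		{ 4, 1, 4 },	/* lightly loaded: candidate group_min */
		{ 8, 3, 4 },	/* nearly full with room: candidate leader */
		{ 12, 0, 4 },	/* idle: ignored */
	};
	int i, min = -1, leader = -1;

	for (i = 0; i < 4; i++) {
		if (!g[i].nr_running || g[i].nr_running >= g[i].capacity)
			continue;	/* idle or full: skip, as above */
		if (min < 0 || g[i].nr_running < g[min].nr_running)
			min = i;
		if (g[i].nr_running + 1 <= g[i].capacity &&
		    (leader < 0 || g[i].nr_running > g[leader].nr_running))
			leader = i;
	}
	printf("group_min first_cpu=%d, group_leader first_cpu=%d\n",
	       g[min].first_cpu, g[leader].first_cpu);	/* 4 and 8 */
	return 0;
}
#endif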
/**
 * check_power_save_busiest_group - see if there is potential for some power-savings balance
 * @sds: Variable containing the statistics of the sched_domain
 *	under consideration.
 * @this_cpu: Cpu at which we're currently performing load-balancing.
 * @imbalance: Variable to store the imbalance.
 *
 * Description:
 * Check if we have potential to perform some power-savings balance.
 * If yes, set the busiest group to be the least loaded group in the
 * sched_domain, so that its CPUs can be put to idle.
 *
 * Returns 1 if there is potential to perform power-savings balance.
 * Else returns 0.
 */
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
					int this_cpu, unsigned long *imbalance)
{
	if (!sds->power_savings_balance)
		return 0;

	if (sds->this != sds->group_leader ||
	    sds->group_leader == sds->group_min)
		return 0;

	*imbalance = sds->min_load_per_task;
	sds->busiest = sds->group_min;

	return 1;
}
#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
static inline void init_sd_power_savings_stats(struct sched_domain *sd,
	struct sd_lb_stats *sds, enum cpu_idle_type idle)
{
	return;
}

static inline void update_sd_power_savings_stats(struct sched_group *group,
	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
	return;
}

static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
					int this_cpu, unsigned long *imbalance)
{
	return 0;
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */


unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
	return SCHED_LOAD_SCALE;
}

unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
{
	return default_scale_freq_power(sd, cpu);
}

unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
{
	unsigned long weight = sd->span_weight;
	unsigned long smt_gain = sd->smt_gain;

	smt_gain /= weight;

	return smt_gain;
}

unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
{
	return default_scale_smt_power(sd, cpu);
}

unsigned long scale_rt_power(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	u64 total, available;

	total = sched_avg_period() + (rq->clock - rq->age_stamp);
	available = total - rq->rt_avg;

	if (unlikely((s64)total < SCHED_LOAD_SCALE))
		total = SCHED_LOAD_SCALE;

	total >>= SCHED_LOAD_SHIFT;

	return div_u64(available, total);
}

static void update_cpu_power(struct sched_domain *sd, int cpu)
{
	unsigned long weight = sd->span_weight;
	unsigned long power = SCHED_LOAD_SCALE;
	struct sched_group *sdg = sd->groups;

	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
		if (sched_feat(ARCH_POWER))
			power *= arch_scale_smt_power(sd, cpu);
		else
			power *= default_scale_smt_power(sd, cpu);

		power >>= SCHED_LOAD_SHIFT;
	}

	sdg->cpu_power_orig = power;

	if (sched_feat(ARCH_POWER))
		power *= arch_scale_freq_power(sd, cpu);
	else
		power *= default_scale_freq_power(sd, cpu);

	power >>= SCHED_LOAD_SHIFT;

	power *= scale_rt_power(cpu);
	power >>= SCHED_LOAD_SHIFT;

	if (!power)
		power = 1;

	cpu_rq(cpu)->cpu_power = power;
	sdg->cpu_power = power;
}

static void update_group_power(struct sched_domain *sd, int cpu)
{
	struct sched_domain *child = sd->child;
	struct sched_group *group, *sdg = sd->groups;
	unsigned long power;

	if (!child) {
		update_cpu_power(sd, cpu);
		return;
	}

	power = 0;

	group = child->groups;
	do {
		power += group->cpu_power;
		group = group->next;
	} while (group != child->groups);

	sdg->cpu_power = power;
}
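/*
 * Illustration: update_cpu_power() above works in SCHED_LOAD_SCALE (1024)
 * fixed point; each factor (SMT gain spread over siblings, frequency, time
 * left over from RT tasks) is itself ~1024-scaled, so every multiply is
 * followed by a >> SCHED_LOAD_SHIFT to renormalize. A minimal standalone
 * sketch with invented TOY_* names follows; it is not kernel code.
 */
#if 0	/* standalone userspace sketch, not part of the kernel build */
#include <stdio.h>

#define TOY_LOAD_SHIFT	10			/* mirrors SCHED_LOAD_SHIFT */
#define TOY_LOAD_SCALE	(1UL << TOY_LOAD_SHIFT)	/* mirrors SCHED_LOAD_SCALE */

int main(void)
{
	unsigned long power = TOY_LOAD_SCALE;	/* 1024 = "one full cpu" */
	unsigned long smt = 589;	/* e.g. smt_gain 1178 over 2 siblings */
	unsigned long freq = TOY_LOAD_SCALE;	/* running at full frequency */
	unsigned long rt = 819;		/* ~20% of time taken by RT tasks */

	power = (power * smt) >> TOY_LOAD_SHIFT;
	power = (power * freq) >> TOY_LOAD_SHIFT;
	power = (power * rt) >> TOY_LOAD_SHIFT;
	if (!power)
		power = 1;		/* never let cpu_power reach zero */

	printf("cpu_power = %lu (of %lu)\n", power, TOY_LOAD_SCALE);
	return 0;
}
#endif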
/*
 * Try to fix up capacity for tiny siblings; this is needed when
 * things like SD_ASYM_PACKING need f_b_g to select another sibling
 * which on its own isn't powerful enough.
 *
 * See update_sd_pick_busiest() and check_asym_packing().
 */
static inline int
fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
{
	/*
	 * Only siblings can have significantly less than SCHED_LOAD_SCALE
	 */
	if (sd->level != SD_LV_SIBLING)
		return 0;

	/*
	 * If ~90% of the cpu_power is still there, we're good.
	 */
	if (group->cpu_power * 32 > group->cpu_power_orig * 29)
		return 1;

	return 0;
}

/**
 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 * @sd: The sched_domain whose statistics are to be updated.
 * @group: sched_group whose statistics are to be updated.
 * @this_cpu: Cpu for which load balance is currently performed.
 * @idle: Idle status of this_cpu
 * @load_idx: Load index of sched_domain of this_cpu for load calc.
 * @sd_idle: Idle status of the sched_domain containing group.
 * @local_group: Does group contain this_cpu.
 * @cpus: Set of cpus considered for load balancing.
 * @balance: Should we balance.
 * @sgs: variable to hold the statistics for this group.
 */
static inline void update_sg_lb_stats(struct sched_domain *sd,
			struct sched_group *group, int this_cpu,
			enum cpu_idle_type idle, int load_idx, int *sd_idle,
			int local_group, const struct cpumask *cpus,
			int *balance, struct sg_lb_stats *sgs)
{
	unsigned long load, max_cpu_load, min_cpu_load;
	int i;
	unsigned int balance_cpu = -1, first_idle_cpu = 0;
	unsigned long avg_load_per_task = 0;

	if (local_group)
		balance_cpu = group_first_cpu(group);

	/* Tally up the load of all CPUs in the group */
	max_cpu_load = 0;
	min_cpu_load = ~0UL;

	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
		struct rq *rq = cpu_rq(i);

		if (*sd_idle && rq->nr_running)
			*sd_idle = 0;

		/* Bias balancing toward cpus of our domain */
		if (local_group) {
			if (idle_cpu(i) && !first_idle_cpu) {
				first_idle_cpu = 1;
				balance_cpu = i;
			}

			load = target_load(i, load_idx);
		} else {
			load = source_load(i, load_idx);
			if (load > max_cpu_load)
				max_cpu_load = load;
			if (min_cpu_load > load)
				min_cpu_load = load;
		}

		sgs->group_load += load;
		sgs->sum_nr_running += rq->nr_running;
		sgs->sum_weighted_load += weighted_cpuload(i);
	}

	/*
	 * The first idle cpu or the first cpu (busiest) in this sched group
	 * is eligible for doing load balancing at this and above
	 * domains. In the newly idle case, we will allow all the CPUs
	 * to do the newly idle load balance.
	 */
	if (idle != CPU_NEWLY_IDLE && local_group) {
		if (balance_cpu != this_cpu) {
			*balance = 0;
			return;
		}
		update_group_power(sd, this_cpu);
	}

	/* Adjust by relative CPU power of the group */
	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;

	/*
	 * Consider the group unbalanced when the imbalance is larger
	 * than the average weight of two tasks.
	 *
	 * APZ: with cgroup the avg task weight can vary wildly and
	 * might not be a suitable number - should we keep a
	 * normalized nr_running number somewhere that negates
	 * the hierarchy?
	 */
	if (sgs->sum_nr_running)
		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;

	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
		sgs->group_imb = 1;

	sgs->group_capacity =
		DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
	if (!sgs->group_capacity)
		sgs->group_capacity = fix_small_capacity(sd, group);
}
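/*
 * Illustration: the two derived quantities at the tail of
 * update_sg_lb_stats() are the group-imbalance test (per-cpu load spread
 * larger than two average task weights) and the capacity rounded to whole
 * tasks. A minimal standalone worked example with invented TOY_* names
 * follows; it is not kernel code.
 */
#if 0	/* standalone userspace sketch, not part of the kernel build */
#include <stdio.h>

#define TOY_LOAD_SCALE	1024UL

int main(void)
{
	/* Per-cpu loads within one toy group, and its tasks' stats. */
	unsigned long max_cpu_load = 3072, min_cpu_load = 512;
	unsigned long sum_weighted_load = 4096, sum_nr_running = 4;
	unsigned long cpu_power = 2048;		/* two full cpus worth */
	unsigned long avg_load_per_task = sum_weighted_load / sum_nr_running;

	/* Unbalanced when the spread exceeds two average task weights. */
	int group_imb = (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task;

	/* Capacity in whole tasks, rounded to the closest integer. */
	unsigned long capacity =
		(cpu_power + TOY_LOAD_SCALE / 2) / TOY_LOAD_SCALE;

	printf("avg/task=%lu imb=%d capacity=%lu\n",
	       avg_load_per_task, group_imb, capacity);	/* 1024 1 2 */
	return 0;
}
#endif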
/**
 * update_sd_pick_busiest - return 1 on busiest group
 * @sd: sched_domain whose statistics are to be checked
 * @sds: sched_domain statistics
 * @sg: sched_group candidate to be checked for being the busiest
 * @sgs: sched_group statistics
 * @this_cpu: the current cpu
 *
 * Determine if @sg is a busier group than the previously selected
 * busiest group.
 */
static bool update_sd_pick_busiest(struct sched_domain *sd,
				   struct sd_lb_stats *sds,
				   struct sched_group *sg,
				   struct sg_lb_stats *sgs,
				   int this_cpu)
{
	if (sgs->avg_load <= sds->max_load)
		return false;

	if (sgs->sum_nr_running > sgs->group_capacity)
		return true;

	if (sgs->group_imb)
		return true;

	/*
	 * ASYM_PACKING needs to move all the work to the lowest
	 * numbered CPUs in the group, therefore mark all groups
	 * higher than ourselves as busy.
	 */
	if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
	    this_cpu < group_first_cpu(sg)) {
		if (!sds->busiest)
			return true;

		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
			return true;
	}

	return false;
}

/**
 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 * @sd: sched_domain whose statistics are to be updated.
 * @this_cpu: Cpu for which load balance is currently performed.
 * @idle: Idle status of this_cpu
 * @sd_idle: Idle status of the sched_domain containing sg.
 * @cpus: Set of cpus considered for load balancing.
 * @balance: Should we balance.
 * @sds: variable to hold the statistics for this sched_domain.
 */
static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
			enum cpu_idle_type idle, int *sd_idle,
			const struct cpumask *cpus, int *balance,
			struct sd_lb_stats *sds)
{
	struct sched_domain *child = sd->child;
	struct sched_group *sg = sd->groups;
	struct sg_lb_stats sgs;
	int load_idx, prefer_sibling = 0;

	if (child && child->flags & SD_PREFER_SIBLING)
		prefer_sibling = 1;

	init_sd_power_savings_stats(sd, sds, idle);
	load_idx = get_sd_load_idx(sd, idle);

	do {
		int local_group;

		local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
		memset(&sgs, 0, sizeof(sgs));
		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
				   local_group, cpus, balance, &sgs);

		if (local_group && !(*balance))
			return;

		sds->total_load += sgs.group_load;
		sds->total_pwr += sg->cpu_power;

		/*
		 * In case the child domain prefers tasks go to siblings
		 * first, lower the sg capacity to one so that we'll try
		 * and move all the excess tasks away.
		 */
		if (prefer_sibling)
			sgs.group_capacity = min(sgs.group_capacity, 1UL);

		if (local_group) {
			sds->this_load = sgs.avg_load;
			sds->this = sg;
			sds->this_nr_running = sgs.sum_nr_running;
			sds->this_load_per_task = sgs.sum_weighted_load;
		} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
			sds->max_load = sgs.avg_load;
			sds->busiest = sg;
			sds->busiest_nr_running = sgs.sum_nr_running;
			sds->busiest_group_capacity = sgs.group_capacity;
			sds->busiest_load_per_task = sgs.sum_weighted_load;
			sds->group_imb = sgs.group_imb;
		}

		update_sd_power_savings_stats(sg, sds, local_group, &sgs);
		sg = sg->next;
	} while (sg != sd->groups);
}
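/*
 * Illustration: update_sd_pick_busiest() above is a short-circuit chain:
 * a candidate must at least beat the current max load, and then qualifies
 * either by being over capacity or internally unbalanced. A minimal
 * standalone mirror of that order (ASYM_PACKING cases omitted) follows;
 * it is not kernel code and the toy struct is invented for the example.
 */
#if 0	/* standalone userspace sketch, not part of the kernel build */
#include <stdbool.h>
#include <stdio.h>

/* Toy mirror of the decision order in update_sd_pick_busiest(). */
struct toy_sgs { unsigned long avg_load, nr_running, capacity; int imb; };

static bool toy_pick_busiest(unsigned long cur_max, const struct toy_sgs *s)
{
	if (s->avg_load <= cur_max)
		return false;		/* not even the highest load */
	if (s->nr_running > s->capacity)
		return true;		/* over capacity always qualifies */
	if (s->imb)
		return true;		/* internally unbalanced group */
	return false;			/* (ASYM_PACKING cases omitted) */
}

int main(void)
{
	struct toy_sgs a = { 2000, 3, 4, 0 };	/* loaded, within capacity */
	struct toy_sgs b = { 2100, 5, 4, 0 };	/* over capacity */

	printf("a: %d, b: %d\n", toy_pick_busiest(1500, &a),
	       toy_pick_busiest(1500, &b));	/* a: 0, b: 1 */
	return 0;
}
#endif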
int __weak arch_sd_sibling_asym_packing(void)
{
	return 0*SD_ASYM_PACKING;
}

/**
 * check_asym_packing - Check to see if the group is packed into the
 *			sched domain.
 *
 * This is primarily intended to be used at the sibling level. Some
 * cores like POWER7 prefer to use lower numbered SMT threads. In the
 * case of POWER7, it can move to lower SMT modes only when higher
 * threads are idle. When in lower SMT modes, the threads will
 * perform better since they share less core resources. Hence when we
 * have idle threads, we want them to be the higher ones.
 *
 * This packing function is run on idle threads. It checks to see if
 * the busiest CPU in this domain (core in the P7 case) has a higher
 * CPU number than the packing function is being run on. Here we are
 * assuming a lower CPU number will be equivalent to a lower SMT
 * thread number.
 *
 * Returns 1 when packing is required and a task should be moved to
 * this CPU. The amount of the imbalance is returned in *imbalance.
 *
 * @sd: The sched_domain whose packing is to be checked.
 * @sds: Statistics of the sched_domain which is to be packed
 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
 * @imbalance: returns amount of imbalance due to packing.
 */
static int check_asym_packing(struct sched_domain *sd,
			      struct sd_lb_stats *sds,
			      int this_cpu, unsigned long *imbalance)
{
	int busiest_cpu;

	if (!(sd->flags & SD_ASYM_PACKING))
		return 0;

	if (!sds->busiest)
		return 0;

	busiest_cpu = group_first_cpu(sds->busiest);
	if (this_cpu > busiest_cpu)
		return 0;

	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
				       SCHED_LOAD_SCALE);
	return 1;
}
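/*
 * Illustration: the *imbalance computed above undoes the power scaling of
 * max_load (which is load * SCHED_LOAD_SCALE / cpu_power) to recover a
 * weighted-load amount to move, rounded to the closest integer the way
 * DIV_ROUND_CLOSEST() does. A minimal standalone worked example with
 * invented TOY_* names follows; it is not kernel code.
 */
#if 0	/* standalone userspace sketch, not part of the kernel build */
#include <stdio.h>

#define TOY_LOAD_SCALE	1024UL

int main(void)
{
	/* Busiest group's power-scaled average load and its raw cpu_power. */
	unsigned long max_load = 1536, busiest_power = 2048;

	/* Round-to-closest division, as DIV_ROUND_CLOSEST() does. */
	unsigned long imbalance =
		(max_load * busiest_power + TOY_LOAD_SCALE / 2) /
			TOY_LOAD_SCALE;

	printf("imbalance = %lu\n", imbalance);	/* 3072 */
	return 0;
}
#endif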
/**
 * fix_small_imbalance - Calculate the minor imbalance that exists
 *			amongst the groups of a sched_domain, during
 *			load balancing.
 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
 * @imbalance: Variable to store the imbalance.
 */
static inline void fix_small_imbalance(struct sd_lb_stats *sds,
				int this_cpu, unsigned long *imbalance)
{
	unsigned long tmp, pwr_now = 0, pwr_move = 0;
	unsigned int imbn = 2;
	unsigned long scaled_busy_load_per_task;

	if (sds->this_nr_running) {
		sds->this_load_per_task /= sds->this_nr_running;
		if (sds->busiest_load_per_task >
				sds->this_load_per_task)
			imbn = 1;
	} else
		sds->this_load_per_task =
			cpu_avg_load_per_task(this_cpu);

	scaled_busy_load_per_task = sds->busiest_load_per_task
					 * SCHED_LOAD_SCALE;
	scaled_busy_load_per_task /= sds->busiest->cpu_power;

	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
			(scaled_busy_load_per_task * imbn)) {
		*imbalance = sds->busiest_load_per_task;
		return;
	}

	/*
	 * OK, we don't have enough imbalance to justify moving tasks;
	 * however, we may be able to increase the total CPU power used by
	 * moving them.
	 */

	pwr_now += sds->busiest->cpu_power *
			min(sds->busiest_load_per_task, sds->max_load);
	pwr_now += sds->this->cpu_power *
			min(sds->this_load_per_task, sds->this_load);
	pwr_now /= SCHED_LOAD_SCALE;

	/* Amount of load we'd subtract */
	tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
		sds->busiest->cpu_power;
	if (sds->max_load > tmp)
		pwr_move += sds->busiest->cpu_power *
			min(sds->busiest_load_per_task, sds->max_load - tmp);

	/* Amount of load we'd add */
	if (sds->max_load * sds->busiest->cpu_power <
		sds->busiest_load_per_task * SCHED_LOAD_SCALE)
		tmp = (sds->max_load * sds->busiest->cpu_power) /
			sds->this->cpu_power;
	else
		tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
			sds->this->cpu_power;
	pwr_move += sds->this->cpu_power *
			min(sds->this_load_per_task, sds->this_load + tmp);
	pwr_move /= SCHED_LOAD_SCALE;

	/* Move if we gain throughput */
	if (pwr_move > pwr_now)
		*imbalance = sds->busiest_load_per_task;
}
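/*
 * Illustration: the pwr_now/pwr_move comparison above estimates total useful
 * CPU work before and after hypothetically moving one task, and moves only
 * on a throughput gain. A minimal standalone worked example follows; it is
 * not kernel code, all numbers and TOY_* names are invented, and the branch
 * that caps the added load for tiny remainders is omitted for brevity.
 */
#if 0	/* standalone userspace sketch, not part of the kernel build */
#include <stdio.h>

#define TOY_LOAD_SCALE	1024UL
#define TOY_MIN(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	/* One 512-weight task spread thin: worth moving for throughput? */
	unsigned long busiest_power = 1024, this_power = 1024;
	unsigned long busiest_load_per_task = 512, this_load_per_task = 512;
	unsigned long max_load = 768, this_load = 0;
	unsigned long tmp, pwr_now = 0, pwr_move = 0;

	pwr_now += busiest_power * TOY_MIN(busiest_load_per_task, max_load);
	pwr_now += this_power * TOY_MIN(this_load_per_task, this_load);
	pwr_now /= TOY_LOAD_SCALE;

	/* Amount of load we'd subtract from the busiest cpu. */
	tmp = busiest_load_per_task * TOY_LOAD_SCALE / busiest_power;
	if (max_load > tmp)
		pwr_move += busiest_power *
			TOY_MIN(busiest_load_per_task, max_load - tmp);

	/* Amount of load we'd add to this cpu (small-remainder branch omitted). */
	tmp = busiest_load_per_task * TOY_LOAD_SCALE / this_power;
	pwr_move += this_power *
		TOY_MIN(this_load_per_task, this_load + tmp);
	pwr_move /= TOY_LOAD_SCALE;

	printf("pwr_now=%lu pwr_move=%lu -> %s\n", pwr_now, pwr_move,
	       pwr_move > pwr_now ? "move" : "stay");	/* 512 768 move */
	return 0;
}
#endif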
/**
 * calculate_imbalance - Calculate the amount of imbalance present within the
 *			 groups of a given sched_domain during load balance.
 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 * @this_cpu: Cpu for which currently load balance is being performed.
 * @imbalance: The variable to store the imbalance.
 */
static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
		unsigned long *imbalance)
{
	unsigned long max_pull, load_above_capacity = ~0UL;

	sds->busiest_load_per_task /= sds->busiest_nr_running;
	if (sds->group_imb) {
		sds->busiest_load_per_task =
			min(sds->busiest_load_per_task, sds->avg_load);
	}

	/*
	 * In the presence of smp nice balancing, certain scenarios can have
	 * max load less than avg load (as we skip the groups at or below
	 * its cpu_power, while calculating max_load..)
	 */
	if (sds->max_load < sds->avg_load) {
		*imbalance = 0;
		return fix_small_imbalance(sds, this_cpu, imbalance);
	}

	if (!sds->group_imb) {
		/*
		 * Don't want to pull so many tasks that a group would go idle.
		 */
		load_above_capacity = (sds->busiest_nr_running -
					sds->busiest_group_capacity);

		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);

		load_above_capacity /= sds->busiest->cpu_power;
	}

	/*
	 * We're trying to get all the cpus to the average_load, so we don't
	 * want to push ourselves above the average load, nor do we wish to
	 * reduce the max loaded cpu below the average load. At the same time,
	 * we also don't want to reduce the group load below the group capacity
	 * (so that we can implement power-savings policies etc). Thus we look
	 * for the minimum possible imbalance.
	 * Be careful of negative numbers as they'll appear as very large values
	 * with unsigned longs.
	 */
	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);

	/* How much load to actually move to equalise the imbalance */
	*imbalance = min(max_pull * sds->busiest->cpu_power,
		(sds->avg_load - sds->this_load) * sds->this->cpu_power)
			/ SCHED_LOAD_SCALE;

	/*
	 * If *imbalance is less than the average load per runnable task
	 * there is no guarantee that any tasks will be moved, so we'll have
	 * a think about bumping its value to force at least one task to be
	 * moved.
	 */
	if (*imbalance < sds->busiest_load_per_task)
		return fix_small_imbalance(sds, this_cpu, imbalance);
}
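/*
 * Illustration: calculate_imbalance() above pulls the smaller of two bounds
 * (don't drag the busiest group below the average, don't drag it below its
 * capacity) and then converts the power-scaled result back to weighted
 * load. A minimal standalone worked example with invented TOY_* names
 * follows; it is not kernel code.
 */
#if 0	/* standalone userspace sketch, not part of the kernel build */
#include <stdio.h>

#define TOY_LOAD_SCALE	1024UL
#define TOY_MIN(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	/* Domain-wide power-scaled loads and the busiest group's headroom. */
	unsigned long max_load = 2048, avg_load = 1280, this_load = 768;
	unsigned long busiest_power = 1024, this_power = 1024;
	unsigned long nr_running = 3, capacity = 2;

	/* Load above capacity, in the same power-scaled units. */
	unsigned long load_above =
		(nr_running - capacity) * TOY_LOAD_SCALE * TOY_LOAD_SCALE /
			busiest_power;

	unsigned long max_pull = TOY_MIN(max_load - avg_load, load_above);

	unsigned long imbalance = TOY_MIN(max_pull * busiest_power,
			(avg_load - this_load) * this_power) / TOY_LOAD_SCALE;

	printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);
	return 0;		/* max_pull=768 imbalance=512 */
}
#endif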
/******* find_busiest_group() helpers end here *********************/

/**
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance. If there isn't an imbalance, and
 * the user has opted for power-savings, it returns a group whose
 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
 * such a group exists.
 *
 * Also calculates the amount of weighted load which should be moved
 * to restore balance.
 *
 * @sd: The sched_domain whose busiest group is to be returned.
 * @this_cpu: The cpu for which load balancing is currently being performed.
 * @imbalance: Variable which stores amount of weighted load which should
 *		be moved to restore balance/put a group to idle.
 * @idle: The idle status of this_cpu.
 * @sd_idle: The idleness of sd
 * @cpus: The set of CPUs under consideration for load-balancing.
 * @balance: Pointer to a variable indicating if this_cpu
 *	is the appropriate cpu to perform load balancing at this_level.
 *
 * Returns:	- the busiest group if imbalance exists.
 *		- If no imbalance and user has opted for power-savings balance,
 *		   return the least loaded group whose CPUs can be
 *		   put to idle by rebalancing its tasks onto our group.
 */
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
		   unsigned long *imbalance, enum cpu_idle_type idle,
		   int *sd_idle, const struct cpumask *cpus, int *balance)
{
	struct sd_lb_stats sds;

	memset(&sds, 0, sizeof(sds));

	/*
	 * Compute the various statistics relevant for load balancing at
	 * this level.
	 */
	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
			   balance, &sds);

	/* Cases where imbalance does not exist from POV of this_cpu */
	/* 1) this_cpu is not the appropriate cpu to perform load balancing
	 *    at this level.
	 * 2) There is no busy sibling group to pull from.
	 * 3) This group is the busiest group.
	 * 4) This group is more busy than the avg busyness at this
	 *    sched_domain.
	 * 5) The imbalance is within the specified limit.
	 */
	if (!(*balance))
		goto ret;

	if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
	    check_asym_packing(sd, &sds, this_cpu, imbalance))
		return sds.busiest;

	if (!sds.busiest || sds.busiest_nr_running == 0)
		goto out_balanced;

	if (sds.this_load >= sds.max_load)
		goto out_balanced;

	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;

	if (sds.this_load >= sds.avg_load)
		goto out_balanced;

	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
		goto out_balanced;

	/* Looks like there is an imbalance. Compute it */
	calculate_imbalance(&sds, this_cpu, imbalance);
	return sds.busiest;

out_balanced:
	/*
	 * There is no obvious imbalance. But check if we can do some balancing
	 * to save power.
	 */
	if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
		return sds.busiest;
ret:
	*imbalance = 0;
	return NULL;
}

/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
static struct rq *
find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
		   enum cpu_idle_type idle, unsigned long imbalance,
		   const struct cpumask *cpus)
{
	struct rq *busiest = NULL, *rq;
	unsigned long max_load = 0;
	int i;

	for_each_cpu(i, sched_group_cpus(group)) {
		unsigned long power = power_of(i);
		unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
		unsigned long wl;

		if (!capacity)
			capacity = fix_small_capacity(sd, group);

		if (!cpumask_test_cpu(i, cpus))
			continue;

		rq = cpu_rq(i);
		wl = weighted_cpuload(i);

		/*
		 * When comparing with imbalance, use weighted_cpuload()
		 * which is not scaled with the cpu power.
		 */
		if (capacity && rq->nr_running == 1 && wl > imbalance)
			continue;

		/*
		 * For the load comparisons with the other cpus, consider
		 * the weighted_cpuload() scaled with the cpu power, so that
		 * the load can be moved away from the cpu that is potentially
		 * running at a lower capacity.
		 */
		wl = (wl * SCHED_LOAD_SCALE) / power;

		if (wl > max_load) {
			max_load = wl;
			busiest = rq;
		}
	}

	return busiest;
}
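/*
 * Illustration: the wl scaling in find_busiest_queue() above makes the same
 * raw load look heavier on a weaker cpu, so the runqueue running at lower
 * capacity is drained first. A minimal standalone worked example with
 * invented TOY_* names follows; it is not kernel code.
 */
#if 0	/* standalone userspace sketch, not part of the kernel build */
#include <stdio.h>

#define TOY_LOAD_SCALE	1024UL

int main(void)
{
	/* The same raw load on a full-power cpu and on a throttled one. */
	unsigned long wl = 2048;
	unsigned long full_power = 1024, low_power = 512;

	printf("full: %lu, throttled: %lu\n",
	       wl * TOY_LOAD_SCALE / full_power,	/* 2048 */
	       wl * TOY_LOAD_SCALE / low_power);	/* 4096 */
	/* The weaker cpu looks twice as loaded, so it is drained first. */
	return 0;
}
#endif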
/*
 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
 * so long as it is large enough.
 */
#define MAX_PINNED_INTERVAL	512

/* Working cpumask for load_balance and load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);

static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
			       int busiest_cpu, int this_cpu)
{
	if (idle == CPU_NEWLY_IDLE) {

		/*
		 * ASYM_PACKING needs to force migrate tasks from busy but
		 * higher numbered CPUs in order to pack all tasks in the
		 * lowest numbered CPUs.
		 */
		if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
			return 1;

		/*
		 * The only task running in a non-idle cpu can be moved to this
		 * cpu in an attempt to completely free up the other CPU
		 * package.
		 *
		 * The package power saving logic comes from
		 * find_busiest_group(). If there is no imbalance, then
		 * f_b_g() will return NULL. However when sched_mc={1,2} then
		 * f_b_g() will select a group from which a running task may be
		 * pulled to this cpu in order to make the other package idle.
		 * If there is no opportunity to make a package idle and if
		 * there is no imbalance, then f_b_g() will return NULL and no
		 * action will be taken in load_balance_newidle().
		 *
		 * Under normal task pull operation due to imbalance, there
		 * will be more than one task in the source run queue and
		 * move_tasks() will succeed. ld_moved will be true and this
		 * active balance code will not be triggered.
		 */
		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
			return 0;

		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
			return 0;
	}

	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}

static int active_load_balance_cpu_stop(void *data);

/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *balance)
{
	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
	struct sched_group *group;
	unsigned long imbalance;
	struct rq *busiest;
	unsigned long flags;
	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

	cpumask_copy(cpus, cpu_active_mask);

	/*
	 * When power savings policy is enabled for the parent domain, an idle
	 * sibling can pick up load irrespective of busy siblings. In this case,
	 * let the state of the idle sibling percolate up as CPU_IDLE, instead
	 * of portraying it as CPU_NOT_IDLE.
	 */
	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		sd_idle = 1;

	schedstat_inc(sd, lb_count[idle]);

redo:
	update_shares(sd);
	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
				   cpus, balance);

	if (*balance == 0)
		goto out_balanced;

	if (!group) {
		schedstat_inc(sd, lb_nobusyg[idle]);
		goto out_balanced;
	}

	busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
	if (!busiest) {
		schedstat_inc(sd, lb_nobusyq[idle]);
		goto out_balanced;
	}

	BUG_ON(busiest == this_rq);

	schedstat_add(sd, lb_imbalance[idle], imbalance);

	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
		local_irq_save(flags);
		double_rq_lock(this_rq, busiest);
		ld_moved = move_tasks(this_rq, this_cpu, busiest,
				      imbalance, sd, idle, &all_pinned);
		double_rq_unlock(this_rq, busiest);
		local_irq_restore(flags);

		/*
		 * Some other cpu did the load balance for us.
		 */
		if (ld_moved && this_cpu != smp_processor_id())
			resched_cpu(this_cpu);

		/* All tasks on this runqueue were pinned by CPU affinity */
		if (unlikely(all_pinned)) {
			cpumask_clear_cpu(cpu_of(busiest), cpus);
			if (!cpumask_empty(cpus))
				goto redo;
			goto out_balanced;
		}
	}

	if (!ld_moved) {
		schedstat_inc(sd, lb_failed[idle]);
		sd->nr_balance_failed++;

		if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
					this_cpu)) {
			raw_spin_lock_irqsave(&busiest->lock, flags);

			/*
			 * Don't kick the active_load_balance_cpu_stop
			 * if the curr task on the busiest cpu can't be
			 * moved to this_cpu.
			 */
			if (!cpumask_test_cpu(this_cpu,
					      &busiest->curr->cpus_allowed)) {
				raw_spin_unlock_irqrestore(&busiest->lock,
							   flags);
				all_pinned = 1;
				goto out_one_pinned;
			}

			/*
			 * ->active_balance synchronizes accesses to
			 * ->active_balance_work. Once set, it's cleared
			 * only after active load balance is finished.
			 */
			if (!busiest->active_balance) {
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				active_balance = 1;
			}
			raw_spin_unlock_irqrestore(&busiest->lock, flags);

			if (active_balance)
				stop_one_cpu_nowait(cpu_of(busiest),
					active_load_balance_cpu_stop, busiest,
					&busiest->active_balance_work);

			/*
			 * We've kicked active balancing, so reset the failure
			 * counter.
			 */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
		}
	} else
		sd->nr_balance_failed = 0;

	if (likely(!active_balance)) {
		/* We were unbalanced, so reset the balancing interval */
		sd->balance_interval = sd->min_interval;
	} else {
		/*
		 * If we've begun active balancing, start to back off. This
		 * case may not be covered by the all_pinned logic if there
		 * is only 1 task on the busy runqueue (because we don't call
		 * move_tasks).
		 */
		if (sd->balance_interval < sd->max_interval)
			sd->balance_interval *= 2;
	}

	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		ld_moved = -1;

	goto out;

out_balanced:
	schedstat_inc(sd, lb_balanced[idle]);

	sd->nr_balance_failed = 0;

out_one_pinned:
	/* tune up the balancing interval */
	if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
	    (sd->balance_interval < sd->max_interval))
		sd->balance_interval *= 2;

	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		ld_moved = -1;
	else
		ld_moved = 0;
out:
	if (ld_moved)
		update_shares(sd);
	return ld_moved;
}
/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */
static void idle_balance(int this_cpu, struct rq *this_rq)
{
	struct sched_domain *sd;
	int pulled_task = 0;
	unsigned long next_balance = jiffies + HZ;

	this_rq->idle_stamp = this_rq->clock;

	if (this_rq->avg_idle < sysctl_sched_migration_cost)
		return;

	/*
	 * Drop the rq->lock, but keep IRQ/preempt disabled.
	 */
	raw_spin_unlock(&this_rq->lock);

	for_each_domain(this_cpu, sd) {
		unsigned long interval;
		int balance = 1;

		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		if (sd->flags & SD_BALANCE_NEWIDLE) {
			/* If we've pulled tasks over stop searching: */
			pulled_task = load_balance(this_cpu, this_rq,
						   sd, CPU_NEWLY_IDLE, &balance);
		}

		interval = msecs_to_jiffies(sd->balance_interval);
		if (time_after(next_balance, sd->last_balance + interval))
			next_balance = sd->last_balance + interval;
		if (pulled_task) {
			this_rq->idle_stamp = 0;
			break;
		}
	}

	raw_spin_lock(&this_rq->lock);

	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
		/*
		 * We are going idle. next_balance may be set based on
		 * a busy processor. So reset next_balance.
		 */
		this_rq->next_balance = next_balance;
	}
}
/*
 * active_load_balance_cpu_stop is run by the cpu stopper. It pushes
 * running tasks off the busiest CPU onto idle CPUs. It requires at
 * least 1 task to be running on each physical CPU where possible, and
 * avoids physical / logical imbalances.
 */
static int active_load_balance_cpu_stop(void *data)
{
	struct rq *busiest_rq = data;
	int busiest_cpu = cpu_of(busiest_rq);
	int target_cpu = busiest_rq->push_cpu;
	struct rq *target_rq = cpu_rq(target_cpu);
	struct sched_domain *sd;

	raw_spin_lock_irq(&busiest_rq->lock);

	/* make sure the requested cpu hasn't gone down in the meantime */
	if (unlikely(busiest_cpu != smp_processor_id() ||
		     !busiest_rq->active_balance))
		goto out_unlock;

	/* Is there any task to move? */
	if (busiest_rq->nr_running <= 1)
		goto out_unlock;

	/*
	 * This condition is "impossible"; if it occurs
	 * we need to fix it. Originally reported by
	 * Bjorn Helgaas on a 128-cpu setup.
	 */
	BUG_ON(busiest_rq == target_rq);

	/* move a task from busiest_rq to target_rq */
	double_lock_balance(busiest_rq, target_rq);

	/* Search for an sd spanning us and the target CPU. */
	for_each_domain(target_cpu, sd) {
		if ((sd->flags & SD_LOAD_BALANCE) &&
		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
			break;
	}

	if (likely(sd)) {
		schedstat_inc(sd, alb_count);

		if (move_one_task(target_rq, target_cpu, busiest_rq,
				  sd, CPU_IDLE))
			schedstat_inc(sd, alb_pushed);
		else
			schedstat_inc(sd, alb_failed);
	}
	double_unlock_balance(busiest_rq, target_rq);
out_unlock:
	busiest_rq->active_balance = 0;
	raw_spin_unlock_irq(&busiest_rq->lock);
	return 0;
}

#ifdef CONFIG_NO_HZ

static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);

static void trigger_sched_softirq(void *data)
{
	raise_softirq_irqoff(SCHED_SOFTIRQ);
}

static inline void init_sched_softirq_csd(struct call_single_data *csd)
{
	csd->func = trigger_sched_softirq;
	csd->info = NULL;
	csd->flags = 0;
	csd->priv = 0;
}
/*
 * idle load balancing details
 * - One of the idle CPUs nominates itself as the idle load_balancer, while
 *   entering idle.
 * - This idle load balancer CPU will also go into tickless mode when
 *   it is idle, just like all other idle CPUs.
 * - When one of the busy CPUs notices that there may be an idle rebalancing
 *   needed, it will kick the idle load balancer, which then does idle
 *   load balancing for all the idle CPUs.
 */
static struct {
	atomic_t load_balancer;
	atomic_t first_pick_cpu;
	atomic_t second_pick_cpu;
	cpumask_var_t idle_cpus_mask;
	cpumask_var_t grp_idle_mask;
	unsigned long next_balance;	/* in jiffy units */
} nohz ____cacheline_aligned;

int get_nohz_load_balancer(void)
{
	return atomic_read(&nohz.load_balancer);
}

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
/**
 * lowest_flag_domain - Return lowest sched_domain containing flag.
 * @cpu:	The cpu whose lowest level of sched domain is to
 *		be returned.
 * @flag:	The flag to check for the lowest sched_domain
 *		for the given cpu.
 *
 * Returns the lowest sched_domain of a cpu which contains the given flag.
 */
static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
{
	struct sched_domain *sd;

	for_each_domain(cpu, sd)
		if (sd && (sd->flags & flag))
			break;

	return sd;
}

/**
 * for_each_flag_domain - Iterates over sched_domains containing the flag.
 * @cpu:	The cpu whose domains we're iterating over.
 * @sd:		variable holding the value of the power_savings_sd
 *		for cpu.
 * @flag:	The flag to filter the sched_domains to be iterated.
 *
 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
 * set, starting from the lowest sched_domain to the highest.
 */
#define for_each_flag_domain(cpu, sd, flag) \
	for (sd = lowest_flag_domain(cpu, flag); \
		(sd && (sd->flags & flag)); sd = sd->parent)

/**
 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
 * @ilb_group:	group to be checked for semi-idleness
 *
 * Returns:	1 if the group is semi-idle. 0 otherwise.
 *
 * We define a sched_group to be semi idle if it has at least one idle CPU
 * and at least one non-idle CPU. This helper function checks if the given
 * sched_group is semi-idle or not.
 */
static inline int is_semi_idle_group(struct sched_group *ilb_group)
{
	cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
		    sched_group_cpus(ilb_group));

	/*
	 * A sched_group is semi-idle when it has at least one busy cpu
	 * and at least one idle cpu.
	 */
	if (cpumask_empty(nohz.grp_idle_mask))
		return 0;

	if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
		return 0;

	return 1;
}
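/*
 * Illustration: is_semi_idle_group() above reduces to two mask tests:
 * group AND idle must be neither empty (all busy) nor the whole group
 * (all idle). A minimal standalone sketch with cpumasks shrunk to plain
 * unsigned bitmasks follows; it is not kernel code and the names are
 * invented for the example.
 */
#if 0	/* standalone userspace sketch, not part of the kernel build */
#include <stdio.h>

/* Bit n set = cpu n is in the group / is idle. */
static int toy_is_semi_idle(unsigned int group_mask, unsigned int idle_mask)
{
	unsigned int grp_idle = group_mask & idle_mask;

	if (!grp_idle)
		return 0;		/* fully busy */
	if (grp_idle == group_mask)
		return 0;		/* fully idle */
	return 1;			/* some idle, some busy */
}

int main(void)
{
	printf("%d %d %d\n",
	       toy_is_semi_idle(0x0f, 0x00),	/* 0: all busy */
	       toy_is_semi_idle(0x0f, 0x0f),	/* 0: all idle */
	       toy_is_semi_idle(0x0f, 0x03));	/* 1: semi-idle */
	return 0;
}
#endif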
/**
 * find_new_ilb - Finds the optimum idle load balancer for nomination.
 * @cpu:	The cpu which is nominating a new idle_load_balancer.
 *
 * Returns:	Returns the id of the idle load balancer if it exists,
 *		Else, returns >= nr_cpu_ids.
 *
 * This algorithm picks the idle load balancer such that it belongs to a
 * semi-idle powersavings sched_domain. The idea is to try and avoid
 * completely idle packages/cores just for the purpose of idle load balancing
 * when there are other idle CPUs which are better suited for that job.
 */
static int find_new_ilb(int cpu)
{
	struct sched_domain *sd;
	struct sched_group *ilb_group;

	/*
	 * Have idle load balancer selection from semi-idle packages only
	 * when power-aware load balancing is enabled
	 */
	if (!(sched_smt_power_savings || sched_mc_power_savings))
		goto out_done;

	/*
	 * Optimize for the case when we have no idle CPUs or only one
	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
	 */
	if (cpumask_weight(nohz.idle_cpus_mask) < 2)
		goto out_done;

	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
		ilb_group = sd->groups;

		do {
			if (is_semi_idle_group(ilb_group))
				return cpumask_first(nohz.grp_idle_mask);

			ilb_group = ilb_group->next;

		} while (ilb_group != sd->groups);
	}

out_done:
	return nr_cpu_ids;
}
#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
static inline int find_new_ilb(int call_cpu)
{
	return nr_cpu_ids;
}
#endif

/*
 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
 * nohz_load_balancer CPU (if there is one) otherwise fall back to any idle
 * CPU (if there is one).
 */
static void nohz_balancer_kick(int cpu)
{
	int ilb_cpu;

	nohz.next_balance++;

	ilb_cpu = get_nohz_load_balancer();

	if (ilb_cpu >= nr_cpu_ids) {
		ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
		if (ilb_cpu >= nr_cpu_ids)
			return;
	}

	if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
		struct call_single_data *cp;

		cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
		cp = &per_cpu(remote_sched_softirq_cb, cpu);
		__smp_call_function_single(ilb_cpu, cp, 0);
	}
	return;
}
/*
 * This routine will try to nominate the ilb (idle load balancing)
 * owner among the cpus whose ticks are stopped. The ilb owner will do the
 * idle load balancing on behalf of all those cpus.
 *
 * When the ilb owner becomes busy, we will not have a new ilb owner until
 * some idle CPU wakes up and goes back to idle or some busy CPU tries to
 * kick idle load balancing by kicking one of the idle CPUs.
 *
 * Ticks are stopped for the ilb owner as well, with the busy CPU kicking
 * this ilb owner CPU in the future (when there is a need for idle load
 * balancing on behalf of all idle CPUs).
 */
void select_nohz_load_balancer(int stop_tick)
{
	int cpu = smp_processor_id();

	if (stop_tick) {
		if (!cpu_active(cpu)) {
			if (atomic_read(&nohz.load_balancer) != cpu)
				return;

			/*
			 * If we are going offline and still the leader,
			 * give up!
			 */
			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
					   nr_cpu_ids) != cpu)
				BUG();

			return;
		}

		cpumask_set_cpu(cpu, nohz.idle_cpus_mask);

		if (atomic_read(&nohz.first_pick_cpu) == cpu)
			atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
		if (atomic_read(&nohz.second_pick_cpu) == cpu)
			atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);

		if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
			int new_ilb;

			/* make me the ilb owner */
			if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
					   cpu) != nr_cpu_ids)
				return;

			/*
			 * Check to see if there is a more power-efficient
			 * ilb.
			 */
			new_ilb = find_new_ilb(cpu);
			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
				atomic_set(&nohz.load_balancer, nr_cpu_ids);
				resched_cpu(new_ilb);
				return;
			}
			return;
		}
	} else {
		if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
			return;

		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);

		if (atomic_read(&nohz.load_balancer) == cpu)
			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
					   nr_cpu_ids) != cpu)
				BUG();
	}
	return;
}
#endif

static DEFINE_SPINLOCK(balancing);

/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in arch_init_sched_domains.
 */
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
	int balance = 1;
	struct rq *rq = cpu_rq(cpu);
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	int need_serialize;

	for_each_domain(cpu, sd) {
		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		interval = sd->balance_interval;
		if (idle != CPU_IDLE)
			interval *= sd->busy_factor;

		/* scale ms to jiffies */
		interval = msecs_to_jiffies(interval);
		if (unlikely(!interval))
			interval = 1;
		if (interval > HZ*NR_CPUS/10)
			interval = HZ*NR_CPUS/10;

		need_serialize = sd->flags & SD_SERIALIZE;

		if (need_serialize) {
			if (!spin_trylock(&balancing))
				goto out;
		}

		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			if (load_balance(cpu, rq, sd, idle, &balance)) {
				/*
				 * We've pulled tasks over so either we're no
				 * longer idle, or one of our SMT siblings is
				 * not idle.
				 */
				idle = CPU_NOT_IDLE;
			}
			sd->last_balance = jiffies;
		}
		if (need_serialize)
			spin_unlock(&balancing);
out:
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}

		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing more
		 * actively.
		 */
		if (!balance)
			break;
	}

	/*
	 * next_balance will be updated only when there is a need.
	 * When the cpu is attached to a null domain, for example, it will
	 * not be updated.
	 */
	if (likely(update_next_balance))
		rq->next_balance = next_balance;
}
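/*
 * Illustration: the interval handling in rebalance_domains() above stretches
 * the per-domain interval by busy_factor on busy cpus, converts it to
 * jiffies, and clamps it to [1, HZ*NR_CPUS/10]. A minimal standalone worked
 * example follows; it is not kernel code and the TOY_* constants (HZ=1000,
 * 8 cpus) are assumptions made for the example.
 */
#if 0	/* standalone userspace sketch, not part of the kernel build */
#include <stdio.h>

#define TOY_HZ		1000	/* jiffies per second, assumed */
#define TOY_NR_CPUS	8

static unsigned long toy_interval(unsigned long interval_ms, int busy,
				  unsigned int busy_factor)
{
	unsigned long j;

	if (busy)
		interval_ms *= busy_factor;	/* balance less when busy */

	j = interval_ms * TOY_HZ / 1000;	/* "msecs_to_jiffies" */
	if (!j)
		j = 1;				/* never a zero interval */
	if (j > TOY_HZ * TOY_NR_CPUS / 10)	/* global upper clamp */
		j = TOY_HZ * TOY_NR_CPUS / 10;
	return j;
}

int main(void)
{
	printf("idle: %lu busy: %lu\n",
	       toy_interval(64, 0, 32),		/* 64 jiffies */
	       toy_interval(64, 1, 32));	/* 2048 -> clamped to 800 */
	return 0;
}
#endif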
#ifdef CONFIG_NO_HZ
/*
 * In the CONFIG_NO_HZ case, the idle balance kickee will do the
 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 */
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
{
	struct rq *this_rq = cpu_rq(this_cpu);
	struct rq *rq;
	int balance_cpu;

	if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
		return;

	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
		if (balance_cpu == this_cpu)
			continue;

		/*
		 * If this cpu gets work to do, stop the load balancing
		 * work being done for other cpus. The next load
		 * balancing owner will pick it up.
		 */
		if (need_resched()) {
			this_rq->nohz_balance_kick = 0;
			break;
		}

		raw_spin_lock_irq(&this_rq->lock);
		update_rq_clock(this_rq);
		update_cpu_load(this_rq);
		raw_spin_unlock_irq(&this_rq->lock);

		rebalance_domains(balance_cpu, CPU_IDLE);

		rq = cpu_rq(balance_cpu);
		if (time_after(this_rq->next_balance, rq->next_balance))
			this_rq->next_balance = rq->next_balance;
	}
	nohz.next_balance = this_rq->next_balance;
	this_rq->nohz_balance_kick = 0;
}

/*
 * Current heuristic for kicking the idle load balancer
 * - first_pick_cpu is one of the busy CPUs. It will kick the
 *   idle load balancer when it has more than one process active. This
 *   eliminates the need for idle load balancing altogether when we have
 *   only one running process in the system (common case).
 * - If there is more than one busy CPU, the idle load balancer may have
 *   to run for active_load_balance to happen (i.e., two busy CPUs are
 *   SMT or core siblings and can run better if they move to different
 *   physical CPUs). So, second_pick_cpu is the second of the busy CPUs,
 *   which will kick the idle load balancer as soon as it has any load.
 */
static inline int nohz_kick_needed(struct rq *rq, int cpu)
{
	unsigned long now = jiffies;
	int ret;
	int first_pick_cpu, second_pick_cpu;

	if (time_before(now, nohz.next_balance))
		return 0;

	if (rq->idle_at_tick)
		return 0;

	first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
	second_pick_cpu = atomic_read(&nohz.second_pick_cpu);

	if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
	    second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
		return 0;

	ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
	if (ret == nr_cpu_ids || ret == cpu) {
		atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
		if (rq->nr_running > 1)
			return 1;
	} else {
		ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
		if (ret == nr_cpu_ids || ret == cpu) {
			if (rq->nr_running)
				return 1;
		}
	}
	return 0;
}
#else
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
#endif

/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * Also triggered for nohz idle balancing (with nohz_balance_kick set).
 */
static void run_rebalance_domains(struct softirq_action *h)
{
	int this_cpu = smp_processor_id();
	struct rq *this_rq = cpu_rq(this_cpu);
	enum cpu_idle_type idle = this_rq->idle_at_tick ?
						CPU_IDLE : CPU_NOT_IDLE;

	rebalance_domains(this_cpu, idle);

	/*
	 * If this cpu has a pending nohz_balance_kick, then do the
	 * balancing on behalf of the other idle cpus whose ticks are
	 * stopped.
	 */
	nohz_idle_balance(this_cpu, idle);
}

static inline int on_null_domain(int cpu)
{
	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
}
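/*
 * Illustration: nohz_kick_needed() above claims the first_pick/second_pick
 * slots with cmpxchg against an "nr_cpu_ids" sentinel, so each slot is owned
 * by exactly one busy cpu at a time and a cpu recognizes its own earlier
 * claim. A minimal standalone sketch of the first-slot claim using C11
 * atomics follows; it is not kernel code and the names are invented.
 */
#if 0	/* standalone userspace sketch, not part of the kernel build */
#include <stdatomic.h>
#include <stdio.h>

#define TOY_NR_CPU_IDS	8	/* "no cpu" sentinel, like nr_cpu_ids */

static atomic_int first_pick = TOY_NR_CPU_IDS;

/* Mirror of the first-slot claim: cmpxchg the sentinel for our id. */
static int toy_claim_first(int cpu)
{
	int expected = TOY_NR_CPU_IDS;

	if (atomic_compare_exchange_strong(&first_pick, &expected, cpu))
		return 1;		/* slot was free: we own it now */
	return expected == cpu;		/* already ours from a prior tick */
}

int main(void)
{
	printf("cpu0: %d cpu1: %d cpu0 again: %d\n",
	       toy_claim_first(0),	/* 1: claims the slot */
	       toy_claim_first(1),	/* 0: slot busy */
	       toy_claim_first(0));	/* 1: still the owner */
	return 0;
}
#endif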
/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 */
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
	/* Don't need to rebalance while attached to NULL domain */
	if (time_after_eq(jiffies, rq->next_balance) &&
	    likely(!on_null_domain(cpu)))
		raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ
	else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
		nohz_balancer_kick(cpu);
#endif
}

static void rq_online_fair(struct rq *rq)
{
	update_sysctl();
}

static void rq_offline_fair(struct rq *rq)
{
	update_sysctl();
}

#else	/* CONFIG_SMP */

/*
 * on UP we do not need to balance between CPUs:
 */
static inline void idle_balance(int cpu, struct rq *rq)
{
}

#endif /* CONFIG_SMP */

/*
 * scheduler tick hitting a task of our scheduling class:
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se, queued);
	}
}

/*
 * called on fork with the child task as argument from the parent's context
 *  - child not yet on the tasklist
 *  - preemption disabled
 */
static void task_fork_fair(struct task_struct *p)
{
	struct cfs_rq *cfs_rq = task_cfs_rq(current);
	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
	int this_cpu = smp_processor_id();
	struct rq *rq = this_rq();
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	update_rq_clock(rq);

	if (unlikely(task_cpu(p) != this_cpu))
		__set_task_cpu(p, this_cpu);

	update_curr(cfs_rq);

	if (curr)
		se->vruntime = curr->vruntime;
	place_entity(cfs_rq, se, 1);

	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 */
		swap(curr->vruntime, se->vruntime);
		resched_task(rq->curr);
	}

	se->vruntime -= cfs_rq->min_vruntime;

	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

/*
 * Priority of the task has changed. Check to see if we preempt
 * the current task.
 */
static void prio_changed_fair(struct rq *rq, struct task_struct *p,
			      int oldprio, int running)
{
	/*
	 * Reschedule if we are currently running on this runqueue and
	 * our priority decreased, or if we are not currently running on
	 * this runqueue and our priority is higher than the current's.
	 */
	if (running) {
		if (p->prio > oldprio)
			resched_task(rq->curr);
	} else
		check_preempt_curr(rq, p, 0);
}

/*
 * We switched to the sched_fair class.
 */
static void switched_to_fair(struct rq *rq, struct task_struct *p,
			     int running)
{
	/*
	 * We were most likely switched from sched_rt, so
	 * kick off the schedule if running, otherwise just see
	 * if we can still preempt the current task.
	 */
	if (running)
		resched_task(rq->curr);
	else
		check_preempt_curr(rq, p, 0);
}
/*
 * Account for a task changing its policy or group.
 *
 * This routine is mostly called to set cfs_rq->curr field when a task
 * migrates between groups/classes.
 */
static void set_curr_task_fair(struct rq *rq)
{
	struct sched_entity *se = &rq->curr->se;

	for_each_sched_entity(se)
		set_next_entity(cfs_rq_of(se), se);
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static void moved_group_fair(struct task_struct *p, int on_rq)
{
	struct cfs_rq *cfs_rq = task_cfs_rq(p);

	update_curr(cfs_rq);
	if (!on_rq)
		place_entity(cfs_rq, &p->se, 1);
}
#endif

static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
{
	struct sched_entity *se = &task->se;
	unsigned int rr_interval = 0;

	/*
	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
	 * idle runqueue:
	 */
	if (rq->cfs.load.weight)
		rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));

	return rr_interval;
}

/*
 * All the scheduling class methods:
 */
static const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_fair,

	.rq_online		= rq_online_fair,
	.rq_offline		= rq_offline_fair,

	.task_waking		= task_waking_fair,
#endif

	.set_curr_task		= set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	.prio_changed		= prio_changed_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
	.moved_group		= moved_group_fair,
#endif
};

#ifdef CONFIG_SCHED_DEBUG
static void print_cfs_stats(struct seq_file *m, int cpu)
{
	struct cfs_rq *cfs_rq;

	rcu_read_lock();
	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
		print_cfs_rq(m, cpu, cfs_rq);
	rcu_read_unlock();
}
#endif