/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 *  Interactivity improvements by Mike Galbraith
 *  (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 *  Various enhancements by Dmitry Adamushko.
 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 *  Group scheduling enhancements by Srivatsa Vaddagiri
 *  Copyright IBM Corporation, 2007
 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 *  Scaled math optimizations by Thomas Gleixner
 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 */

/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 */
unsigned int sysctl_sched_latency = 20000000ULL;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 4000000ULL;

/*
 * This value is kept at sysctl_sched_latency / sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 5;

/*
 * After fork, the child runs first (the default). If set to 0 then
 * the parent will (try to) run first.
 */
const_debug unsigned int sysctl_sched_child_runs_first = 1;

/*
 * sys_sched_yield() compat mode
 *
 * This option switches the aggressive yield implementation of the
 * old scheduler back on.
 */
unsigned int __read_mostly sysctl_sched_compat_yield;

/*
 * SCHED_BATCH wake-up granularity.
 * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_wakeup_granularity = 10000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
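/*
 * Editor's note (illustrative, not from the original source): only the
 * base values appear in this file; the "(1 + ilog(ncpus))" factor quoted
 * in the comments above is applied elsewhere when the defaults are scaled
 * for the number of online CPUs.  Taking ilog() to be the integer base-2
 * logarithm, a 4-CPU machine would for example end up with a targeted
 * latency of 20 msec * (1 + 2) = 60 msec and a minimum granularity of
 * 4 msec * (1 + 2) = 12 msec.
 */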
/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!se->my_q)

#else	/* CONFIG_FAIR_GROUP_SCHED */

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
}

#define entity_is_task(se)	1

#endif	/* CONFIG_FAIR_GROUP_SCHED */

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}


/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta > 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return se->vruntime - cfs_rq->min_vruntime;
}
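/*
 * Editor's note: max_vruntime()/min_vruntime() above deliberately compare
 * through a signed 64-bit delta rather than comparing the unsigned values
 * directly, so the comparison keeps giving the intended answer even once
 * the vruntime counters wrap around.  entity_key() relies on the same
 * trick: rbtree ordering is by vruntime relative to min_vruntime, not by
 * absolute value.
 */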
/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	s64 key = entity_key(cfs_rq, se);
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We don't care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (key < entity_key(cfs_rq, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node)
		cfs_rq->rb_leftmost = rb_next(&se->run_node);

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rb_leftmost;
}

static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
	return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
}

static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct sched_entity *se = NULL;
	struct rb_node *parent;

	while (*link) {
		parent = *link;
		se = rb_entry(parent, struct sched_entity, run_node);
		link = &parent->rb_right;
	}

	return se;
}

/**************************************************************
 * Scheduling class statistics methods:
 */

#ifdef CONFIG_SCHED_DEBUG
int sched_nr_latency_handler(struct ctl_table *table, int write,
		struct file *filp, void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

	return 0;
}
#endif

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (more than sched_nr_latency) we have to
 * stretch this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	if (unlikely(nr_running > nr_latency)) {
		period *= nr_running;
		do_div(period, nr_latency);
	}

	return period;
}

/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*w/rw
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running);

	slice *= se->load.weight;
	do_div(slice, cfs_rq->load.weight);

	return slice;
}

/*
 * We calculate the vruntime slice.
 *
 * vs = s/w = p/rw
 */
static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
{
	u64 vslice = __sched_period(nr_running);

	vslice *= NICE_0_LOAD;
	do_div(vslice, rq_weight);

	return vslice;
}

static u64 sched_vslice(struct cfs_rq *cfs_rq)
{
	return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running);
}

static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return __sched_vslice(cfs_rq->load.weight + se->load.weight,
			cfs_rq->nr_running + 1);
}
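/*
 * Editor's worked example (illustrative, using the default tunables at
 * the top of this file): with a 20 msec latency and sched_nr_latency of
 * 5, 3 runnable tasks share a 20 msec period, while 10 runnable tasks
 * stretch it to 20 * 10 / 5 = 40 msec, i.e. roughly:
 *
 *	u64 period = 20000000ULL;			// l
 *	if (nr_running > 5)				// nl
 *		period = period * nr_running / 5;	// l*nr/nl
 *
 * sched_slice() then hands each entity a weight-proportional share of
 * that period: two entities of weight 2048 and 1024 on a 20 msec period
 * get wall-time slices of about 13.3 msec and 6.7 msec.  The vruntime
 * slice normalizes that share back to NICE_0_LOAD units, so over one
 * period every runnable entity's vruntime advances by the same amount,
 * regardless of weight.
 */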
/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;
	u64 vruntime;

	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);
	delta_exec_weighted = delta_exec;
	if (unlikely(curr->load.weight != NICE_0_LOAD)) {
		delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
							&curr->load);
	}
	curr->vruntime += delta_exec_weighted;

	/*
	 * maintain cfs_rq->min_vruntime to be a monotonically increasing
	 * value tracking the leftmost vruntime in the tree.
	 */
	if (first_fair(cfs_rq)) {
		vruntime = min_vruntime(curr->vruntime,
				__pick_next_entity(cfs_rq)->vruntime);
	} else
		vruntime = curr->vruntime;

	cfs_rq->min_vruntime =
		max_vruntime(cfs_rq->min_vruntime, vruntime);
}
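/*
 * Editor's note (illustrative): calc_delta_fair() scales the executed
 * time by NICE_0_LOAD relative to the entity's own weight, so heavier
 * entities accrue vruntime more slowly.  Assuming an entity of weight
 * 2048 (twice NICE_0_LOAD), running for 10 msec of wall time adds only
 * about 5 msec to its vruntime, while a nice-0 entity is credited 1:1.
 */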
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_of(cfs_rq)->clock;
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	delta_exec = (unsigned long)(now - curr->exec_start);

	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		cpuacct_charge(curtask, delta_exec);
	}
}

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
}

/*
 * Task is being enqueued - update stats:
 */
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start(cfs_rq, se);
}

static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->wait_max, max(se->wait_max,
			rq_of(cfs_rq)->clock - se->wait_start));
	schedstat_set(se->wait_start, 0);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end(cfs_rq, se);
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_of(cfs_rq)->clock;
}

/**************************************************
 * Scheduling class queueing methods:
 */

static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_add(&cfs_rq->load, se->load.weight);
	cfs_rq->nr_running++;
	se->on_rq = 1;
}

static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_sub(&cfs_rq->load, se->load.weight);
	cfs_rq->nr_running--;
	se->on_rq = 0;
}

static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	if (se->sleep_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->sleep_max))
			se->sleep_max = delta;

		se->sleep_start = 0;
		se->sum_sleep_runtime += delta;
	}
	if (se->block_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->block_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->block_max))
			se->block_max = delta;

		se->block_start = 0;
		se->sum_sleep_runtime += delta;

		/*
		 * Blocking time is in units of nanosecs, so shift by 20 to
		 * get a milliseconds-range estimation of the amount of
		 * time that the task spent sleeping:
		 */
		if (unlikely(prof_on == SLEEP_PROFILING)) {
			struct task_struct *tsk = task_of(se);

			profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
				     delta >> 20);
		}
	}
#endif
}

static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 d = se->vruntime - cfs_rq->min_vruntime;

	if (d < 0)
		d = -d;

	if (d > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}

static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	u64 vruntime;

	vruntime = cfs_rq->min_vruntime;

	if (sched_feat(TREE_AVG)) {
		struct sched_entity *last = __pick_last_entity(cfs_rq);
		if (last) {
			vruntime += last->vruntime;
			vruntime >>= 1;
		}
	} else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
		vruntime += sched_vslice(cfs_rq)/2;

	/*
	 * The 'current' period is already promised to the current tasks;
	 * however, the extra weight of the new task will slow them down a
	 * little, so place the new task in the slot that stays open at
	 * the end.
	 */
	if (initial && sched_feat(START_DEBIT))
		vruntime += sched_vslice_add(cfs_rq, se);

	if (!initial) {
		/* sleeps up to a single latency don't count. */
		if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se))
			vruntime -= sysctl_sched_latency;

		/* ensure we never gain time by being placed backwards. */
		vruntime = max_vruntime(se->vruntime, vruntime);
	}

	se->vruntime = vruntime;
}
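/*
 * Editor's worked example (illustrative): with START_DEBIT enabled, a
 * newly forked entity is placed one (would-be) vruntime slice *after*
 * min_vruntime, so it waits for the slot left open at the end of the
 * current period instead of preempting everybody right away.  A waking
 * sleeper, by contrast, may be placed up to one latency period (20 msec
 * with the defaults) *before* min_vruntime as sleeper credit, but its
 * vruntime is never moved backwards from the value it already had - so
 * sleeping can never be used to gain CPU time over steadily running
 * tasks.
 */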
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	if (wakeup) {
		place_entity(cfs_rq, se, 0);
		enqueue_sleeper(cfs_rq, se);
	}

	update_stats_enqueue(cfs_rq, se);
	check_spread(cfs_rq, se);
	if (se != cfs_rq->curr)
		__enqueue_entity(cfs_rq, se);
	account_entity_enqueue(cfs_rq, se);
}

static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	update_stats_dequeue(cfs_rq, se);
	if (sleep) {
#ifdef CONFIG_SCHEDSTATS
		if (entity_is_task(se)) {
			struct task_struct *tsk = task_of(se);

			if (tsk->state & TASK_INTERRUPTIBLE)
				se->sleep_start = rq_of(cfs_rq)->clock;
			if (tsk->state & TASK_UNINTERRUPTIBLE)
				se->block_start = rq_of(cfs_rq)->clock;
		}
#endif
	}

	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	account_entity_dequeue(cfs_rq, se);
}

/*
 * Preempt the current task if it has run beyond its fair share of
 * the period:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;

	ideal_runtime = sched_slice(cfs_rq, curr);
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	if (delta_exec > ideal_runtime)
		resched_task(rq_of(cfs_rq)->curr);
}
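/*
 * Editor's note (illustrative): prev_sum_exec_runtime is snapshotted in
 * set_next_entity() when an entity is picked, so delta_exec above is the
 * wall time run since it was last chosen.  With the example weights used
 * earlier (2048 vs 1024 on a 20 msec period), the heavier entity would be
 * marked for rescheduling after roughly 13.3 msec, the lighter one after
 * roughly 6.7 msec.
 */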
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* 'current' is not kept within the tree. */
	if (se->on_rq) {
		/*
		 * Any task has to be enqueued before it gets to execute on
		 * a CPU. So account for the time it spent waiting on the
		 * runqueue.
		 */
		update_stats_wait_end(cfs_rq, se);
		__dequeue_entity(cfs_rq, se);
	}

	update_stats_curr_start(cfs_rq, se);
	cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
	/*
	 * Track our maximum slice length, if the CPU's load is at
	 * least twice that of our own weight (i.e. don't track it
	 * when there are only lesser-weight tasks around):
	 */
	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
		se->slice_max = max(se->slice_max,
			se->sum_exec_runtime - se->prev_sum_exec_runtime);
	}
#endif
	se->prev_sum_exec_runtime = se->sum_exec_runtime;
}

static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct sched_entity *se = NULL;

	if (first_fair(cfs_rq)) {
		se = __pick_next_entity(cfs_rq);
		set_next_entity(cfs_rq, se);
	}

	return se;
}

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
	/*
	 * If still on the runqueue then deactivate_task()
	 * was not called and update_curr() has to be done:
	 */
	if (prev->on_rq)
		update_curr(cfs_rq);

	check_spread(cfs_rq, prev);
	if (prev->on_rq) {
		update_stats_wait_start(cfs_rq, prev);
		/* Put 'current' back into the tree. */
		__enqueue_entity(cfs_rq, prev);
	}
	cfs_rq->curr = NULL;
}

static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
		check_preempt_tick(cfs_rq, curr);
}

/**************************************************
 * CFS operations on tasks:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* Walk up the scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}

/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
 * another cpu ('this_cpu')
 */
static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
	return cfs_rq->tg->cfs_rq[this_cpu];
}

/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return 1;

	return 0;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}

#else	/* CONFIG_FAIR_GROUP_SCHED */

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
	return &cpu_rq(this_cpu)->cfs;
}

#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	return 1;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

#endif	/* CONFIG_FAIR_GROUP_SCHED */

/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {
		if (se->on_rq)
			break;
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, wakeup);
		wakeup = 1;
	}
}
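/*
 * Editor's note: under CONFIG_FAIR_GROUP_SCHED the loop above walks from
 * the task up through its group entities, stopping at the first ancestor
 * that is already enqueued - enqueueing a task inside an already-running
 * group only has to touch the levels that were previously empty.  Without
 * group scheduling, for_each_sched_entity() degenerates to a single
 * iteration over the task's own entity.
 */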
/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		dequeue_entity(cfs_rq, se, sleep);
		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight)
			break;
		sleep = 1;
	}
}

/*
 * sched_yield() support is very simple - we dequeue and enqueue.
 *
 * If compat_yield is turned on then we requeue to the end of the tree.
 */
static void yield_task_fair(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
	struct sched_entity *rightmost, *se = &curr->se;

	/*
	 * Are we the only task in the tree?
	 */
	if (unlikely(cfs_rq->nr_running == 1))
		return;

	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
		__update_rq_clock(rq);
		/*
		 * Update run-time statistics of the 'current'.
		 */
		update_curr(cfs_rq);

		return;
	}
	/*
	 * Find the rightmost entry in the rbtree:
	 */
	rightmost = __pick_last_entity(cfs_rq);
	/*
	 * Already in the rightmost position?
	 */
	if (unlikely(rightmost->vruntime < se->vruntime))
		return;

	/*
	 * Minimally necessary key value to be last in the tree:
	 * Upon rescheduling, sched_class::put_prev_task() will place
	 * 'current' within the tree based on its new key value.
	 */
	se->vruntime = rightmost->vruntime + 1;
}

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
{
	struct task_struct *curr = rq->curr;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
	struct sched_entity *se = &curr->se, *pse = &p->se;
	unsigned long gran;

	if (unlikely(rt_prio(p->prio))) {
		update_rq_clock(rq);
		update_curr(cfs_rq);
		resched_task(curr);
		return;
	}
	/*
	 * Batch tasks do not preempt (their preemption is driven by
	 * the tick):
	 */
	if (unlikely(p->policy == SCHED_BATCH))
		return;

	if (!sched_feat(WAKEUP_PREEMPT))
		return;

	while (!is_same_group(se, pse)) {
		se = parent_entity(se);
		pse = parent_entity(pse);
	}

	gran = sysctl_sched_wakeup_granularity;
	if (unlikely(se->load.weight != NICE_0_LOAD))
		gran = calc_delta_fair(gran, &se->load);

	if (pse->vruntime + gran < se->vruntime)
		resched_task(curr);
}
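/*
 * Editor's note (illustrative): the woken task only preempts when its
 * vruntime is more than one wakeup granularity behind the running task's
 * vruntime, which keeps a stream of wakeups from thrashing the current
 * task off the CPU.  With the 10 msec default and a nice-0 current task,
 * a waker whose vruntime trails by 5 msec is left waiting in the tree,
 * while one trailing by 15 msec triggers resched_task().
 */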
static struct task_struct *pick_next_task_fair(struct rq *rq)
{
	struct cfs_rq *cfs_rq = &rq->cfs;
	struct sched_entity *se;

	if (unlikely(!cfs_rq->nr_running))
		return NULL;

	do {
		se = pick_next_entity(cfs_rq);
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	return task_of(se);
}

/*
 * Account for a descheduled task:
 */
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
	struct sched_entity *se = &prev->se;
	struct cfs_rq *cfs_rq;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		put_prev_entity(cfs_rq, se);
	}
}

#ifdef CONFIG_SMP
/**************************************************
 * Fair scheduling class load-balancing methods:
 */

/*
 * Load-balancing iterator. Note: while the runqueue stays locked
 * during the whole iteration, the current task might be
 * dequeued so the iterator has to be dequeue-safe. Here we
 * achieve that by always pre-iterating before returning
 * the current task:
 */
static struct task_struct *
__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
{
	struct task_struct *p;

	if (!curr)
		return NULL;

	p = rb_entry(curr, struct task_struct, se.run_node);
	cfs_rq->rb_load_balance_curr = rb_next(curr);

	return p;
}

static struct task_struct *load_balance_start_fair(void *arg)
{
	struct cfs_rq *cfs_rq = arg;

	return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
}

static struct task_struct *load_balance_next_fair(void *arg)
{
	struct cfs_rq *cfs_rq = arg;

	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr;
	struct task_struct *p;

	if (!cfs_rq->nr_running)
		return MAX_PRIO;

	curr = cfs_rq->curr;
	if (!curr)
		curr = __pick_next_entity(cfs_rq);

	p = task_of(curr);

	return p->prio;
}
#endif

static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
		  unsigned long max_load_move,
		  struct sched_domain *sd, enum cpu_idle_type idle,
		  int *all_pinned, int *this_best_prio)
{
	struct cfs_rq *busy_cfs_rq;
	long rem_load_move = max_load_move;
	struct rq_iterator cfs_rq_iterator;

	cfs_rq_iterator.start = load_balance_start_fair;
	cfs_rq_iterator.next = load_balance_next_fair;

	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
#ifdef CONFIG_FAIR_GROUP_SCHED
		struct cfs_rq *this_cfs_rq;
		long imbalance;
		unsigned long maxload;

		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);

		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
		if (imbalance <= 0)
			continue;

		/* Don't pull more than imbalance/2 */
		imbalance /= 2;
		maxload = min(rem_load_move, imbalance);

		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
#else
# define maxload rem_load_move
#endif
		/*
		 * pass busy_cfs_rq argument into
		 * load_balance_[start|next]_fair iterators
		 */
		cfs_rq_iterator.arg = busy_cfs_rq;
		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
					       maxload, sd, idle, all_pinned,
					       this_best_prio,
					       &cfs_rq_iterator);

		if (rem_load_move <= 0)
			break;
	}

	return max_load_move - rem_load_move;
}
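/*
 * Editor's note: with group scheduling each leaf cfs_rq on the busiest
 * runqueue is balanced against its sibling cfs_rq on the destination CPU,
 * and at most half of the measured weight difference is pulled per pass -
 * pulling the full imbalance would simply reverse it and let the load
 * ping-pong between the two CPUs.
 */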
static int
move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
		   struct sched_domain *sd, enum cpu_idle_type idle)
{
	struct cfs_rq *busy_cfs_rq;
	struct rq_iterator cfs_rq_iterator;

	cfs_rq_iterator.start = load_balance_start_fair;
	cfs_rq_iterator.next = load_balance_next_fair;

	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
		/*
		 * pass busy_cfs_rq argument into
		 * load_balance_[start|next]_fair iterators
		 */
		cfs_rq_iterator.arg = busy_cfs_rq;
		if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
				       &cfs_rq_iterator))
			return 1;
	}

	return 0;
}
#endif

/*
 * scheduler tick hitting a task of our scheduling class:
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se);
	}
}

#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)

/*
 * Share the fairness runtime between parent and child, so the total
 * amount of pressure on the CPU stays the same - new tasks get a
 * chance to run but frequent forkers are not allowed to monopolize
 * the CPU. Note: the parent runqueue is locked, the child is not
 * running yet.
 */
static void task_new_fair(struct rq *rq, struct task_struct *p)
{
	struct cfs_rq *cfs_rq = task_cfs_rq(p);
	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
	int this_cpu = smp_processor_id();

	sched_info_queued(p);

	update_curr(cfs_rq);
	place_entity(cfs_rq, se, 1);

	/* 'curr' will be NULL if the child belongs to a different group */
	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
			curr && curr->vruntime < se->vruntime) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 */
		swap(curr->vruntime, se->vruntime);
	}

	enqueue_task_fair(rq, p, 0);
	resched_task(rq->curr);
}

/* Account for a task changing its policy or group.
 *
 * This routine is mostly called to set cfs_rq->curr field when a task
 * migrates between groups/classes.
 */
static void set_curr_task_fair(struct rq *rq)
{
	struct sched_entity *se = &rq->curr->se;

	for_each_sched_entity(se)
		set_next_entity(cfs_rq_of(se), se);
}

/*
 * All the scheduling class methods:
 */
static const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

#ifdef CONFIG_SMP
	.load_balance		= load_balance_fair,
	.move_one_task		= move_one_task_fair,
#endif

	.set_curr_task		= set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_new		= task_new_fair,
};

#ifdef CONFIG_SCHED_DEBUG
static void print_cfs_stats(struct seq_file *m, int cpu)
{
	struct cfs_rq *cfs_rq;

#ifdef CONFIG_FAIR_GROUP_SCHED
	print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
#endif
	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
		print_cfs_rq(m, cpu, cfs_rq);
}
#endif