/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 *  Interactivity improvements by Mike Galbraith
 *  (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 *  Various enhancements by Dmitry Adamushko.
 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 *  Group scheduling enhancements by Srivatsa Vaddagiri
 *  Copyright IBM Corporation, 2007
 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 *  Scaled math optimizations by Thomas Gleixner
 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 */

/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 20 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and there is no persistent notion of a timeslice as in
 * traditional, time-slice based scheduling.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 */
unsigned int sysctl_sched_latency = 20000000ULL;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 4000000ULL;

/*
 * sched_nr_latency is kept at
 * sysctl_sched_latency / sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 5;
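/*
 * For illustration only, using the default values above and leaving
 * the ilog(ncpus) scaling out: 20 msec / 4 msec = 5, i.e. up to five
 * runnable tasks fit into one unstretched latency period before
 * __sched_period() below has to stretch it.
 */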

/*
 * After fork, the child runs first (default). If set to 0 then the
 * parent will (try to) run first.
 */
const_debug unsigned int sysctl_sched_child_runs_first = 1;

/*
 * sys_sched_yield() compat mode
 *
 * This option switches the aggressive yield implementation of the
 * old scheduler back on.
 */
unsigned int __read_mostly sysctl_sched_compat_yield;

/*
 * SCHED_BATCH wake-up granularity.
 * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_wakeup_granularity = 10000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
        return cfs_rq->rq;
}

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)      (!se->my_q)

#else   /* CONFIG_FAIR_GROUP_SCHED */

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
        return container_of(cfs_rq, struct rq, cfs);
}

#define entity_is_task(se)      1

#endif  /* CONFIG_FAIR_GROUP_SCHED */

static inline struct task_struct *task_of(struct sched_entity *se)
{
        return container_of(se, struct task_struct, se);
}


/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
        s64 delta = (s64)(vruntime - min_vruntime);
        if (delta > 0)
                min_vruntime = vruntime;

        return min_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
        s64 delta = (s64)(vruntime - min_vruntime);
        if (delta < 0)
                min_vruntime = vruntime;

        return min_vruntime;
}
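
/*
 * For illustration: the signed delta makes both helpers safe against
 * u64 wrap-around. If min_vruntime sits just below ULLONG_MAX and
 * vruntime has already wrapped past 0, (s64)(vruntime - min_vruntime)
 * is a small positive number, so max_vruntime() still selects the
 * logically later of the two values.
 */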

static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        return se->vruntime - cfs_rq->min_vruntime;
}

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
        struct rb_node *parent = NULL;
        struct sched_entity *entry;
        s64 key = entity_key(cfs_rq, se);
        int leftmost = 1;

        /*
         * Find the right place in the rbtree:
         */
        while (*link) {
                parent = *link;
                entry = rb_entry(parent, struct sched_entity, run_node);
                /*
                 * We don't care about collisions. Nodes with
                 * the same key stay together.
                 */
                if (key < entity_key(cfs_rq, entry)) {
                        link = &parent->rb_left;
                } else {
                        link = &parent->rb_right;
                        leftmost = 0;
                }
        }

        /*
         * Maintain a cache of leftmost tree entries (it is frequently
         * used):
         */
        if (leftmost)
                cfs_rq->rb_leftmost = &se->run_node;

        rb_link_node(&se->run_node, parent, link);
        rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        if (cfs_rq->rb_leftmost == &se->run_node)
                cfs_rq->rb_leftmost = rb_next(&se->run_node);

        rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
{
        return cfs_rq->rb_leftmost;
}

static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
        return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
}

static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
        struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
        struct sched_entity *se = NULL;
        struct rb_node *parent;

        while (*link) {
                parent = *link;
                se = rb_entry(parent, struct sched_entity, run_node);
                link = &parent->rb_right;
        }

        return se;
}

/**************************************************************
 * Scheduling class statistics methods:
 */

#ifdef CONFIG_SCHED_DEBUG
int sched_nr_latency_handler(struct ctl_table *table, int write,
                struct file *filp, void __user *buffer, size_t *lenp,
                loff_t *ppos)
{
        int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);

        if (ret || !write)
                return ret;

        sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
                                        sysctl_sched_min_granularity);

        return 0;
}
#endif

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (more than sched_nr_latency) we have
 * to stretch this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
        u64 period = sysctl_sched_latency;
        unsigned long nr_latency = sched_nr_latency;

        if (unlikely(nr_running > nr_latency)) {
                period *= nr_running;
                do_div(period, nr_latency);
        }

        return period;
}
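
/*
 * For illustration, with the default 20 msec latency and
 * sched_nr_latency == 5 (ilog(ncpus) scaling left out):
 *
 *   nr_running == 3   ->  period = 20 msec
 *   nr_running == 10  ->  period = 20 msec * 10 / 5 = 40 msec
 *
 * so with equal weights each task still gets the 4 msec minimum
 * granularity.
 */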

/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*w/rw
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        u64 slice = __sched_period(cfs_rq->nr_running);

        slice *= se->load.weight;
        do_div(slice, cfs_rq->load.weight);

        return slice;
}
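
/*
 * For illustration, with two runnable nice-0 tasks (weight
 * NICE_0_LOAD == 1024 each) and a 20 msec period: each task's
 * wall-time slice is 20 msec * 1024 / 2048 = 10 msec. A heavier
 * (lower nice) task gets a proportionally larger part of the same
 * period.
 */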

/*
 * We calculate the vruntime slice.
 *
 * vs = s/w = p/rw
 */
static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
{
        u64 vslice = __sched_period(nr_running);

        vslice *= NICE_0_LOAD;
        do_div(vslice, rq_weight);

        return vslice;
}

static u64 sched_vslice(struct cfs_rq *cfs_rq)
{
        return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running);
}

static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        return __sched_vslice(cfs_rq->load.weight + se->load.weight,
                        cfs_rq->nr_running + 1);
}
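
/*
 * For illustration, continuing the two nice-0 task example: the
 * vruntime slice is 20 msec * NICE_0_LOAD / 2048 = 10 msec, the same
 * as the wall-time slice. The vruntime slice does not depend on the
 * individual weight, so every entity consumes the same amount of
 * vruntime per period even though heavier entities get more wall time.
 */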

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
              unsigned long delta_exec)
{
        unsigned long delta_exec_weighted;
        u64 vruntime;

        schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));

        curr->sum_exec_runtime += delta_exec;
        schedstat_add(cfs_rq, exec_clock, delta_exec);
        delta_exec_weighted = delta_exec;
        if (unlikely(curr->load.weight != NICE_0_LOAD)) {
                delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
                                                        &curr->load);
        }
        curr->vruntime += delta_exec_weighted;

        /*
         * maintain cfs_rq->min_vruntime to be a monotonically increasing
         * value tracking the leftmost vruntime in the tree.
         */
        if (first_fair(cfs_rq)) {
                vruntime = min_vruntime(curr->vruntime,
                                __pick_next_entity(cfs_rq)->vruntime);
        } else
                vruntime = curr->vruntime;

        cfs_rq->min_vruntime =
                max_vruntime(cfs_rq->min_vruntime, vruntime);
}
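
/*
 * For illustration: a nice-0 task (weight == NICE_0_LOAD) gains
 * vruntime at wall-clock speed, so 1 msec of execution adds 1 msec of
 * vruntime. A task of twice that weight gains only about 0.5 msec of
 * vruntime per msec executed, which is how heavier tasks receive more
 * CPU time while the tree stays ordered by fair runtime.
 */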

static void update_curr(struct cfs_rq *cfs_rq)
{
        struct sched_entity *curr = cfs_rq->curr;
        u64 now = rq_of(cfs_rq)->clock;
        unsigned long delta_exec;

        if (unlikely(!curr))
                return;

        /*
         * Get the amount of time the current task was running
         * since the last time we changed load (this cannot
         * overflow on 32 bits):
         */
        delta_exec = (unsigned long)(now - curr->exec_start);

        __update_curr(cfs_rq, curr, delta_exec);
        curr->exec_start = now;

        if (entity_is_task(curr)) {
                struct task_struct *curtask = task_of(curr);

                cpuacct_charge(curtask, delta_exec);
        }
}

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
}

/*
 * Task is being enqueued - update stats:
 */
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        /*
         * Are we enqueueing a waiting task? (for current tasks
         * a dequeue/enqueue event is a NOP)
         */
        if (se != cfs_rq->curr)
                update_stats_wait_start(cfs_rq, se);
}

static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        schedstat_set(se->wait_max, max(se->wait_max,
                        rq_of(cfs_rq)->clock - se->wait_start));
        schedstat_set(se->wait_start, 0);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        /*
         * Mark the end of the wait period if dequeueing a
         * waiting task:
         */
        if (se != cfs_rq->curr)
                update_stats_wait_end(cfs_rq, se);
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        /*
         * We are starting a new run period:
         */
        se->exec_start = rq_of(cfs_rq)->clock;
}

/**************************************************
 * Scheduling class queueing methods:
 */

static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        update_load_add(&cfs_rq->load, se->load.weight);
        cfs_rq->nr_running++;
        se->on_rq = 1;
}

static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        update_load_sub(&cfs_rq->load, se->load.weight);
        cfs_rq->nr_running--;
        se->on_rq = 0;
}

static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
        if (se->sleep_start) {
                u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;

                if ((s64)delta < 0)
                        delta = 0;

                if (unlikely(delta > se->sleep_max))
                        se->sleep_max = delta;

                se->sleep_start = 0;
                se->sum_sleep_runtime += delta;
        }
        if (se->block_start) {
                u64 delta = rq_of(cfs_rq)->clock - se->block_start;

                if ((s64)delta < 0)
                        delta = 0;

                if (unlikely(delta > se->block_max))
                        se->block_max = delta;

                se->block_start = 0;
                se->sum_sleep_runtime += delta;

                /*
                 * Blocking time is in units of nanosecs, so shift by 20 to
                 * get a milliseconds-range estimation of the amount of
                 * time that the task spent sleeping:
                 */
                if (unlikely(prof_on == SLEEP_PROFILING)) {
                        struct task_struct *tsk = task_of(se);

                        profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
                                     delta >> 20);
                }
        }
#endif
}

static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
        s64 d = se->vruntime - cfs_rq->min_vruntime;

        if (d < 0)
                d = -d;

        if (d > 3*sysctl_sched_latency)
                schedstat_inc(cfs_rq, nr_spread_over);
#endif
}

static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
        u64 vruntime;

        vruntime = cfs_rq->min_vruntime;

        if (sched_feat(TREE_AVG)) {
                struct sched_entity *last = __pick_last_entity(cfs_rq);
                if (last) {
                        vruntime += last->vruntime;
                        vruntime >>= 1;
                }
        } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
                vruntime += sched_vslice(cfs_rq)/2;

        /*
         * The 'current' period is already promised to the current tasks,
         * but the extra weight of the new task will slow them down a
         * little; place the new task so that it fits in the slot that
         * stays open at the end of the period.
         */
        if (initial && sched_feat(START_DEBIT))
                vruntime += sched_vslice_add(cfs_rq, se);

        if (!initial) {
                /* sleeps up to a single latency don't count. */
                if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se))
                        vruntime -= sysctl_sched_latency;

                /* ensure we never gain time by being placed backwards. */
                vruntime = max_vruntime(se->vruntime, vruntime);
        }

        se->vruntime = vruntime;
}
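
/*
 * For illustration, with min_vruntime at 100 msec and a 20 msec
 * latency: a freshly forked task with START_DEBIT set is placed one
 * (new) vruntime slice beyond 100 msec, so it cannot immediately
 * preempt the tasks that were promised the current period. With
 * NEW_FAIR_SLEEPERS enabled, a task waking from sleep is placed at
 * most 20 msec before 100 msec, and never before its own previous
 * vruntime, so a long sleep earns only a bounded amount of credit.
 */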

static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
{
        /*
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);

        if (wakeup) {
                place_entity(cfs_rq, se, 0);
                enqueue_sleeper(cfs_rq, se);
        }

        update_stats_enqueue(cfs_rq, se);
        check_spread(cfs_rq, se);
        if (se != cfs_rq->curr)
                __enqueue_entity(cfs_rq, se);
        account_entity_enqueue(cfs_rq, se);
}

static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
        /*
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);

        update_stats_dequeue(cfs_rq, se);
        if (sleep) {
#ifdef CONFIG_SCHEDSTATS
                if (entity_is_task(se)) {
                        struct task_struct *tsk = task_of(se);

                        if (tsk->state & TASK_INTERRUPTIBLE)
                                se->sleep_start = rq_of(cfs_rq)->clock;
                        if (tsk->state & TASK_UNINTERRUPTIBLE)
                                se->block_start = rq_of(cfs_rq)->clock;
                }
#endif
        }

        if (se != cfs_rq->curr)
                __dequeue_entity(cfs_rq, se);
        account_entity_dequeue(cfs_rq, se);
}

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
        unsigned long ideal_runtime, delta_exec;

        ideal_runtime = sched_slice(cfs_rq, curr);
        delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
        if (delta_exec > ideal_runtime)
                resched_task(rq_of(cfs_rq)->curr);
}
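
/*
 * For illustration: if sched_slice() works out to 10 msec for the
 * current task but it has already accumulated 12 msec of runtime
 * since it was last picked (sum_exec_runtime - prev_sum_exec_runtime),
 * the tick marks it for rescheduling and the next pick goes to the
 * leftmost entity in the tree.
 */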

static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        /* 'current' is not kept within the tree. */
        if (se->on_rq) {
                /*
                 * Any task has to be enqueued before it gets to execute on
                 * a CPU. So account for the time it spent waiting on the
                 * runqueue.
                 */
                update_stats_wait_end(cfs_rq, se);
                __dequeue_entity(cfs_rq, se);
        }

        update_stats_curr_start(cfs_rq, se);
        cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
        /*
         * Track our maximum slice length, if the CPU's load is at
         * least twice that of our own weight (i.e. don't track it
         * when there are only lesser-weight tasks around):
         */
        if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
                se->slice_max = max(se->slice_max,
                        se->sum_exec_runtime - se->prev_sum_exec_runtime);
        }
#endif
        se->prev_sum_exec_runtime = se->sum_exec_runtime;
}

static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
        struct sched_entity *se = NULL;

        if (first_fair(cfs_rq)) {
                se = __pick_next_entity(cfs_rq);
                set_next_entity(cfs_rq, se);
        }

        return se;
}

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
        /*
         * If still on the runqueue then deactivate_task()
         * was not called and update_curr() has to be done:
         */
        if (prev->on_rq)
                update_curr(cfs_rq);

        check_spread(cfs_rq, prev);
        if (prev->on_rq) {
                update_stats_wait_start(cfs_rq, prev);
                /* Put 'current' back into the tree. */
                __enqueue_entity(cfs_rq, prev);
        }
        cfs_rq->curr = NULL;
}

static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
        /*
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);

        if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
                check_preempt_tick(cfs_rq, curr);
}

/**************************************************
 * CFS operations on tasks:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
                for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
        return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
        return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
        return grp->my_q;
}

/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
 * another cpu ('this_cpu')
 */
static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
        return cfs_rq->tg->cfs_rq[this_cpu];
}

/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
        list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
        if (se->cfs_rq == pse->cfs_rq)
                return 1;

        return 0;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
        return se->parent;
}

#else   /* CONFIG_FAIR_GROUP_SCHED */

#define for_each_sched_entity(se) \
                for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
        return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
        struct task_struct *p = task_of(se);
        struct rq *rq = task_rq(p);

        return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
        return NULL;
}

static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
        return &cpu_rq(this_cpu)->cfs;
}

#define for_each_leaf_cfs_rq(rq, cfs_rq) \
                for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
        return 1;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
        return NULL;
}

#endif  /* CONFIG_FAIR_GROUP_SCHED */

/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
{
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;

        for_each_sched_entity(se) {
                if (se->on_rq)
                        break;
                cfs_rq = cfs_rq_of(se);
                enqueue_entity(cfs_rq, se, wakeup);
                wakeup = 1;
        }
}

/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
{
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;

        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                dequeue_entity(cfs_rq, se, sleep);
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight)
                        break;
                sleep = 1;
        }
}

/*
 * sched_yield() support is very simple - we dequeue and enqueue.
 *
 * If compat_yield is turned on then we requeue to the end of the tree.
 */
static void yield_task_fair(struct rq *rq)
{
        struct task_struct *curr = rq->curr;
        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
        struct sched_entity *rightmost, *se = &curr->se;

        /*
         * Are we the only task in the tree?
         */
        if (unlikely(cfs_rq->nr_running == 1))
                return;

        if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
                __update_rq_clock(rq);
                /*
                 * Update run-time statistics of the 'current'.
                 */
                update_curr(cfs_rq);

                return;
        }
        /*
         * Find the rightmost entry in the rbtree:
         */
        rightmost = __pick_last_entity(cfs_rq);
        /*
         * Already in the rightmost position?
         */
        if (unlikely(rightmost->vruntime < se->vruntime))
                return;

        /*
         * Minimally necessary key value to be last in the tree:
         * Upon rescheduling, sched_class::put_prev_task() will place
         * 'current' within the tree based on its new key value.
         */
        se->vruntime = rightmost->vruntime + 1;
}

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
{
        struct task_struct *curr = rq->curr;
        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
        struct sched_entity *se = &curr->se, *pse = &p->se;
        unsigned long gran;

        if (unlikely(rt_prio(p->prio))) {
                update_rq_clock(rq);
                update_curr(cfs_rq);
                resched_task(curr);
                return;
        }
        /*
         * Batch tasks do not preempt (their preemption is driven by
         * the tick):
         */
        if (unlikely(p->policy == SCHED_BATCH))
                return;

        if (!sched_feat(WAKEUP_PREEMPT))
                return;

        while (!is_same_group(se, pse)) {
                se = parent_entity(se);
                pse = parent_entity(pse);
        }

        gran = sysctl_sched_wakeup_granularity;
        if (unlikely(se->load.weight != NICE_0_LOAD))
                gran = calc_delta_fair(gran, &se->load);

        if (pse->vruntime + gran < se->vruntime)
                resched_task(curr);
}
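
/*
 * For illustration, with the default 10 msec wakeup granularity and a
 * nice-0 current task: the woken task preempts only if its vruntime
 * trails the current task's vruntime by more than 10 msec
 * (pse->vruntime + gran < se->vruntime). calc_delta_fair() converts
 * the granularity from wall time into the current task's vruntime
 * scale, so the threshold corresponds to roughly the same amount of
 * wall-clock time whatever the current task's weight.
 */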

static struct task_struct *pick_next_task_fair(struct rq *rq)
{
        struct cfs_rq *cfs_rq = &rq->cfs;
        struct sched_entity *se;

        if (unlikely(!cfs_rq->nr_running))
                return NULL;

        do {
                se = pick_next_entity(cfs_rq);
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);

        return task_of(se);
}

/*
 * Account for a descheduled task:
 */
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
        struct sched_entity *se = &prev->se;
        struct cfs_rq *cfs_rq;

        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                put_prev_entity(cfs_rq, se);
        }
}

#ifdef CONFIG_SMP
/**************************************************
 * Fair scheduling class load-balancing methods:
 */

/*
 * Load-balancing iterator. Note: while the runqueue stays locked
 * during the whole iteration, the current task might be
 * dequeued so the iterator has to be dequeue-safe. Here we
 * achieve that by always pre-iterating before returning
 * the current task:
 */
static struct task_struct *
__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
{
        struct task_struct *p;

        if (!curr)
                return NULL;

        p = rb_entry(curr, struct task_struct, se.run_node);
        cfs_rq->rb_load_balance_curr = rb_next(curr);

        return p;
}

static struct task_struct *load_balance_start_fair(void *arg)
{
        struct cfs_rq *cfs_rq = arg;

        return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
}

static struct task_struct *load_balance_next_fair(void *arg)
{
        struct cfs_rq *cfs_rq = arg;

        return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
{
        struct sched_entity *curr;
        struct task_struct *p;

        if (!cfs_rq->nr_running)
                return MAX_PRIO;

        curr = cfs_rq->curr;
        if (!curr)
                curr = __pick_next_entity(cfs_rq);

        p = task_of(curr);

        return p->prio;
}
#endif

static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  unsigned long max_load_move,
                  struct sched_domain *sd, enum cpu_idle_type idle,
                  int *all_pinned, int *this_best_prio)
{
        struct cfs_rq *busy_cfs_rq;
        long rem_load_move = max_load_move;
        struct rq_iterator cfs_rq_iterator;

        cfs_rq_iterator.start = load_balance_start_fair;
        cfs_rq_iterator.next = load_balance_next_fair;

        for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
#ifdef CONFIG_FAIR_GROUP_SCHED
                struct cfs_rq *this_cfs_rq;
                long imbalance;
                unsigned long maxload;

                this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);

                imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
                /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
                if (imbalance <= 0)
                        continue;

                /* Don't pull more than imbalance/2 */
                imbalance /= 2;
                maxload = min(rem_load_move, imbalance);

                *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
#else
# define maxload rem_load_move
#endif
                /*
                 * pass busy_cfs_rq argument into
                 * load_balance_[start|next]_fair iterators
                 */
                cfs_rq_iterator.arg = busy_cfs_rq;
                rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
                                               maxload, sd, idle, all_pinned,
                                               this_best_prio,
                                               &cfs_rq_iterator);

                if (rem_load_move <= 0)
                        break;
        }

        return max_load_move - rem_load_move;
}

static int
move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   struct sched_domain *sd, enum cpu_idle_type idle)
{
        struct cfs_rq *busy_cfs_rq;
        struct rq_iterator cfs_rq_iterator;

        cfs_rq_iterator.start = load_balance_start_fair;
        cfs_rq_iterator.next = load_balance_next_fair;

        for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
                /*
                 * pass busy_cfs_rq argument into
                 * load_balance_[start|next]_fair iterators
                 */
                cfs_rq_iterator.arg = busy_cfs_rq;
                if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
                                       &cfs_rq_iterator))
                        return 1;
        }

        return 0;
}
#endif

/*
 * scheduler tick hitting a task of our scheduling class:
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr)
{
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &curr->se;

        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                entity_tick(cfs_rq, se);
        }
}

#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)

/*
 * Share the fairness runtime between parent and child, so the total
 * amount of pressure on the CPU stays the same - new tasks get a
 * chance to run but frequent forkers are not allowed to monopolize
 * the CPU. Note: the parent runqueue is locked, the child is not
 * running yet.
 */
static void task_new_fair(struct rq *rq, struct task_struct *p)
{
        struct cfs_rq *cfs_rq = task_cfs_rq(p);
        struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
        int this_cpu = smp_processor_id();

        sched_info_queued(p);

        update_curr(cfs_rq);
        place_entity(cfs_rq, se, 1);

        /* 'curr' will be NULL if the child belongs to a different group */
        if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
                        curr && curr->vruntime < se->vruntime) {
                /*
                 * Upon rescheduling, sched_class::put_prev_task() will place
                 * 'current' within the tree based on its new key value.
                 */
                swap(curr->vruntime, se->vruntime);
        }

        enqueue_task_fair(rq, p, 0);
        resched_task(rq->curr);
}
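
/*
 * For illustration: place_entity(..., 1) puts the child at or beyond
 * min_vruntime (plus one slice when START_DEBIT is set), i.e. behind
 * the parent. If sysctl_sched_child_runs_first is set, the child stays
 * on this CPU and shares the parent's group, the vruntime swap above
 * hands the smaller key to the child so it is picked first after the
 * resched.
 */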

/*
 * Account for a task changing its policy or group.
 *
 * This routine is mostly called to set the cfs_rq->curr field when a
 * task migrates between groups/classes.
 */
static void set_curr_task_fair(struct rq *rq)
{
        struct sched_entity *se = &rq->curr->se;

        for_each_sched_entity(se)
                set_next_entity(cfs_rq_of(se), se);
}

/*
 * All the scheduling class methods:
 */
static const struct sched_class fair_sched_class = {
        .next                   = &idle_sched_class,
        .enqueue_task           = enqueue_task_fair,
        .dequeue_task           = dequeue_task_fair,
        .yield_task             = yield_task_fair,

        .check_preempt_curr     = check_preempt_wakeup,

        .pick_next_task         = pick_next_task_fair,
        .put_prev_task          = put_prev_task_fair,

#ifdef CONFIG_SMP
        .load_balance           = load_balance_fair,
        .move_one_task          = move_one_task_fair,
#endif

        .set_curr_task          = set_curr_task_fair,
        .task_tick              = task_tick_fair,
        .task_new               = task_new_fair,
};

#ifdef CONFIG_SCHED_DEBUG
static void print_cfs_stats(struct seq_file *m, int cpu)
{
        struct cfs_rq *cfs_rq;

#ifdef CONFIG_FAIR_GROUP_SCHED
        print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
#endif
        for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
                print_cfs_rq(m, cpu, cfs_rq);
}
#endif
