/*
 * Copyright (c) 2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach/mach_types.h>
#include <mach/machine.h>

#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>

#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/mach_param.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>

#include <sys/kdebug.h>

/*
 * Theory Statement
 *
 * How does the task scheduler work?
 *
 * It schedules threads across a few levels.
 *
 * RT threads are dealt with above us
 * Bound threads go into the per-processor runq
 * Non-bound threads are linked on their task's sched_group's runq
 * sched_groups' sched_entries are linked on the pset's runq
 *
 * TODO: make this explicit - bound threads should have a different enqueue fxn
 *
 * When we choose a new thread, we will decide whether to look at the bound runqueue, the global runqueue
 * or the current group's runqueue, then dequeue the next thread in that runqueue.
 *
 * We then manipulate the sched_entries to reflect the invariant that:
 * Each non-empty priority level in a group's runq is represented by one sched_entry enqueued in the global
 * runqueue.
 *
 * A sched_entry represents a chance at running - for each priority in each task, there is one chance of getting
 * to run.  This reduces the excess contention bonus given to processes which have work spread among many threads
 * as compared to processes which do the same amount of work under fewer threads.
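 *
 * For example, a group with runnable threads at priorities 31 and 80 contributes
 * exactly two sched_entries to the global runqueue - one at 31 and one at 80 -
 * regardless of how many threads are queued at each of those levels.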
 *
 * NOTE: Currently, the multiq scheduler only supports one pset.
 *
 * NOTE ABOUT thread->sched_pri:
 *
 * It can change after enqueue - it's changed without pset lock but with thread lock if thread->runq is 0.
 * Therefore we can only depend on it not changing during the enqueue and remove path, not the dequeue.
 *
 * TODO: Future features:
 *
 * Decouple the task priority from the sched_entry priority, allowing for:
 *      fast task priority change without having to iterate and re-dispatch all threads in the task.
 *              i.e. task-wide priority, task-wide boosting
 *      fancier group decay features
 *
 * Group (or task) decay:
 *      Decay is used for a few different things:
 *              Prioritizing latency-needing threads over throughput-needing threads for time-to-running
 *              Balancing work between threads in a process
 *              Balancing work done at the same priority between different processes
 *              Recovering from priority inversions between two threads in the same process
 *              Recovering from priority inversions between two threads in different processes
 *              Simulating a proportional share scheduler by allowing lower priority threads
 *                to run for a certain percentage of the time
 *
 *      Task decay lets us separately address the 'same process' and 'different process' needs,
 *      which will allow us to make smarter tradeoffs in different cases.
 *      For example, we could resolve priority inversion in the same process by reordering threads without dropping the
 *      process below low priority threads in other processes.
 *
 * One lock to rule them all (or at least all the runqueues) instead of the pset locks
 *
 * Shrink sched_entry size to the size of a queue_chain_t by inferring priority, group, and perhaps runq field.
 * The entries array is 5K currently so it'd be really great to reduce.
 * One way to get sched_group below 4K without a new runq structure would be to remove the extra queues above realtime.
 *
 * When preempting a processor, store a flag saying if the preemption
 * was from a thread in the same group or different group,
 * and tell choose_thread about it.
 *
 * When choosing a processor, bias towards those running in the same
 * group as I am running (at the same priority, or within a certain band?).
 *
 * Decide if we need to support psets.
 * Decide how to support psets - do we need duplicate entries for each pset,
 * or can we get away with putting the entry in either one or the other pset?
 *
 * Consider the right way to handle runq count - I don't want to iterate groups.
 * Perhaps keep a global counter.  sched_run_count will not work.
 * Alternate option - remove it from choose_processor. It doesn't add much value
 * now that we have a global runq.
 *
 * Need a better way of finding the group to target instead of looking at current_task.
 * Perhaps choose_thread could pass in the current thread?
 *
 * Consider unifying runq copy-pastes.
 *
 * Thoughts on having a group central quantum bucket:
 *
 * I see two algorithms to decide quanta:
 * A) Hand off only when switching thread to thread in the same group
 * B) Allocate and return quanta to the group's pool
 *
 * Issues:
 * If a task blocks completely, should it come back with the leftover quantum
 * or a brand new quantum?
 *
 * Should I put a flag saying 'zero out any quantum you grab when you're dispatched'?
 *
 * Resolution:
 * Handing off quanta between threads will help with jumping around in the current task
 * but will not help when a thread from a different task is involved.
 * Need an algorithm that works with round-robining between threads in different tasks.
 *
 * But wait - round-robining can only be triggered by quantum expire or blocking.
 * We need something that works with preemption or yielding - that's the more interesting idea.
 *
 * Existing algorithm - preemption doesn't reset the quantum, puts thread on head of runq.
 * Blocking or quantum expiration does reset the quantum, puts thread on tail of runq.
 *
 * New algorithm -
 * Hand off quanta when hopping between threads with the same sched_group.
 * Even if the thread was blocked, it uses the last thread's remaining quantum when it starts.
 *
 * If we use the 'only cycle the entry at quantum expiration' algorithm, then the quantum pool starts getting
 * interesting.
 *
 * A thought - perhaps the handoff approach doesn't work so well in the presence of
 * non-handoff wakeups, i.e. wake another thread, then wait, then block - it doesn't mean that the
 * woken thread will be what I switch to - another processor may have stolen it.
 * What do we do there?
 *
 * Conclusions:
 * We currently don't know of a scenario where quantum buckets on the task are beneficial.
 * We will instead hand off the quantum between threads in the task, and keep the quantum
 * on the preempted thread if it's preempted by something outside the task.
 *
 */

#if DEBUG || DEVELOPMENT
#define MULTIQ_SANITY_CHECK
#endif

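/*
 * One sched_entry exists per priority level per group; the entry is linked
 * onto the pset-wide entry queue whenever its group has runnable threads
 * at that level.
 */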
typedef struct sched_entry {
	queue_chain_t           links;
	int16_t                 sched_pri;      /* scheduled (current) priority */
	int16_t                 runq;
	int32_t                 pad;
} *sched_entry_t;

typedef run_queue_t entry_queue_t;                      /* A run queue that holds sched_entries instead of threads */
typedef run_queue_t group_runq_t;                       /* A run queue that is part of a sched_group */

#define SCHED_ENTRY_NULL        ((sched_entry_t) 0)
#define MULTIQ_ERUNQ            (-4)                    /* Indicates entry is on the main runq */

/* Each level in the run queue corresponds to one entry in the entries array */
struct sched_group {
	struct sched_entry      entries[NRQS];
	struct run_queue        runq;
	queue_chain_t           sched_groups;
};

/* TODO: Turn this into an attribute in the sched dispatch struct */
boolean_t               sched_groups_enabled = FALSE;

/*
 * Keep entry on the head of the runqueue while dequeueing threads.
 * Only cycle it to the end of the runqueue when a thread in the task
 * hits its quantum.
 */
static boolean_t        deep_drain = FALSE;

/*
 * Don't favor the task when an urgent thread is present.
 */
static boolean_t        drain_urgent_first = TRUE;

/* Verify the consistency of the runq before touching it */
static boolean_t        multiq_sanity_check = FALSE;

/*
 * Draining threads from the current task is preferred
 * when they're less than X steps below the current
 * global highest priority
 */
#define DEFAULT_DRAIN_BAND_LIMIT MAXPRI
static integer_t        drain_band_limit;

/*
 * Don't go below this priority level if there is something above it in another task
 */
#define DEFAULT_DRAIN_DEPTH_LIMIT MAXPRI_THROTTLE
static integer_t        drain_depth_limit;


static struct zone      *sched_group_zone;

static uint64_t         num_sched_groups = 0;
static queue_head_t     sched_groups;

static lck_attr_t       sched_groups_lock_attr;
static lck_grp_t        sched_groups_lock_grp;
static lck_grp_attr_t   sched_groups_lock_grp_attr;

static lck_mtx_t        sched_groups_lock;


static void
sched_multiq_init(void);

static thread_t
sched_multiq_steal_thread(processor_set_t pset);

static void
sched_multiq_thread_update_scan(void);

static boolean_t
sched_multiq_processor_enqueue(processor_t processor, thread_t thread, integer_t options);

static boolean_t
sched_multiq_processor_queue_remove(processor_t processor, thread_t thread);

void
sched_multiq_quantum_expire(thread_t thread);

static ast_t
sched_multiq_processor_csw_check(processor_t processor);

static boolean_t
sched_multiq_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);

static int
sched_multiq_runq_count(processor_t processor);

static boolean_t
sched_multiq_processor_queue_empty(processor_t processor);

static uint64_t
sched_multiq_runq_stats_count_sum(processor_t processor);

static int
sched_multiq_processor_bound_count(processor_t processor);

static void
sched_multiq_pset_init(processor_set_t pset);

static void
sched_multiq_processor_init(processor_t processor);

static thread_t
sched_multiq_choose_thread(processor_t processor, int priority, ast_t reason);

static void
sched_multiq_processor_queue_shutdown(processor_t processor);

static sched_mode_t
sched_multiq_initial_thread_sched_mode(task_t parent_task);

static boolean_t
sched_multiq_should_current_thread_rechoose_processor(processor_t processor);

const struct sched_dispatch_table sched_multiq_dispatch = {
	.init                                           = sched_multiq_init,
	.timebase_init                                  = sched_traditional_timebase_init,
	.processor_init                                 = sched_multiq_processor_init,
	.pset_init                                      = sched_multiq_pset_init,
	.maintenance_continuation                       = sched_traditional_maintenance_continue,
	.choose_thread                                  = sched_multiq_choose_thread,
	.steal_thread                                   = sched_multiq_steal_thread,
	.compute_priority                               = compute_priority,
	.choose_processor                               = choose_processor,
	.processor_enqueue                              = sched_multiq_processor_enqueue,
	.processor_queue_shutdown                       = sched_multiq_processor_queue_shutdown,
	.processor_queue_remove                         = sched_multiq_processor_queue_remove,
	.processor_queue_empty                          = sched_multiq_processor_queue_empty,
	.priority_is_urgent                             = priority_is_urgent,
	.processor_csw_check                            = sched_multiq_processor_csw_check,
	.processor_queue_has_priority                   = sched_multiq_processor_queue_has_priority,
	.initial_quantum_size                           = sched_traditional_initial_quantum_size,
	.initial_thread_sched_mode                      = sched_multiq_initial_thread_sched_mode,
	.can_update_priority                            = can_update_priority,
	.update_priority                                = update_priority,
	.lightweight_update_priority                    = lightweight_update_priority,
	.quantum_expire                                 = sched_multiq_quantum_expire,
	.should_current_thread_rechoose_processor       = sched_multiq_should_current_thread_rechoose_processor,
	.processor_runq_count                           = sched_multiq_runq_count,
	.processor_runq_stats_count_sum                 = sched_multiq_runq_stats_count_sum,
	.fairshare_init                                 = sched_traditional_fairshare_init,
	.fairshare_runq_count                           = sched_traditional_fairshare_runq_count,
	.fairshare_runq_stats_count_sum                 = sched_traditional_fairshare_runq_stats_count_sum,
	.fairshare_enqueue                              = sched_traditional_fairshare_enqueue,
	.fairshare_dequeue                              = sched_traditional_fairshare_dequeue,
	.fairshare_queue_remove                         = sched_traditional_fairshare_queue_remove,
	.processor_bound_count                          = sched_multiq_processor_bound_count,
	.thread_update_scan                             = sched_multiq_thread_update_scan,
	.direct_dispatch_to_idle_processors             = FALSE,
};


static void
sched_multiq_init(void)
{
	sched_groups_enabled = TRUE;

#if defined(MULTIQ_SANITY_CHECK)
	PE_parse_boot_argn("-multiq-sanity-check", &multiq_sanity_check, sizeof(multiq_sanity_check));
#endif

	PE_parse_boot_argn("-multiq-deep-drain", &deep_drain, sizeof(deep_drain));

	PE_parse_boot_argn("multiq_drain_urgent_first", &drain_urgent_first, sizeof(drain_urgent_first));

	if (!PE_parse_boot_argn("multiq_drain_depth_limit", &drain_depth_limit, sizeof(drain_depth_limit))) {
		drain_depth_limit = DEFAULT_DRAIN_DEPTH_LIMIT;
	}

	if (!PE_parse_boot_argn("multiq_drain_band_limit", &drain_band_limit, sizeof(drain_band_limit))) {
		drain_band_limit = DEFAULT_DRAIN_BAND_LIMIT;
	}

	printf("multiq scheduler config: deep-drain %d, urgent first %d, depth limit %d, band limit %d, sanity check %d\n",
	       deep_drain, drain_urgent_first, drain_depth_limit, drain_band_limit, multiq_sanity_check);

	sched_group_zone = zinit(
	                         sizeof(struct sched_group),
	                         task_max * sizeof(struct sched_group),
	                         PAGE_SIZE,
	                         "sched groups");

	zone_change(sched_group_zone, Z_NOENCRYPT, TRUE);
	zone_change(sched_group_zone, Z_NOCALLOUT, TRUE);

	queue_init(&sched_groups);

	lck_grp_attr_setdefault(&sched_groups_lock_grp_attr);
	lck_grp_init(&sched_groups_lock_grp, "sched_groups", &sched_groups_lock_grp_attr);
	lck_attr_setdefault(&sched_groups_lock_attr);
	lck_mtx_init(&sched_groups_lock, &sched_groups_lock_grp, &sched_groups_lock_attr);

	sched_traditional_init();
}

static void
sched_multiq_processor_init(processor_t processor)
{
	run_queue_init(&processor->runq);
}

static void
sched_multiq_pset_init(processor_set_t pset)
{
	run_queue_init(&pset->pset_runq);
}

static sched_mode_t
sched_multiq_initial_thread_sched_mode(task_t parent_task)
{
	if (parent_task == kernel_task)
		return TH_MODE_FIXED;
	else
		return TH_MODE_TIMESHARE;
}

sched_group_t
sched_group_create(void)
{
	sched_group_t       sched_group;

	if (!sched_groups_enabled)
		return SCHED_GROUP_NULL;

	sched_group = (sched_group_t)zalloc(sched_group_zone);

	bzero(sched_group, sizeof(struct sched_group));

	run_queue_init(&sched_group->runq);

	for (int i = 0; i < NRQS; i++) {
		sched_group->entries[i].runq = 0;
		sched_group->entries[i].sched_pri = i;
	}

	lck_mtx_lock(&sched_groups_lock);
	queue_enter(&sched_groups, sched_group, sched_group_t, sched_groups);
	num_sched_groups++;
	lck_mtx_unlock(&sched_groups_lock);

	return (sched_group);
}

void
sched_group_destroy(sched_group_t sched_group)
{
	if (!sched_groups_enabled) {
		assert(sched_group == SCHED_GROUP_NULL);
		return;
	}

	assert(sched_group != SCHED_GROUP_NULL);
	assert(sched_group->runq.count == 0);

	for (int i = 0; i < NRQS; i++) {
		assert(sched_group->entries[i].runq == 0);
		assert(sched_group->entries[i].sched_pri == i);
	}

	lck_mtx_lock(&sched_groups_lock);
	queue_remove(&sched_groups, sched_group, sched_group_t, sched_groups);
	num_sched_groups--;
	lck_mtx_unlock(&sched_groups_lock);

	zfree(sched_group_zone, sched_group);
}

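/* The pset-wide entry queue, holding sched_entries for every group with runnable non-bound threads */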
__attribute__((always_inline))
static inline entry_queue_t
multiq_main_entryq(processor_t processor)
{
	return (entry_queue_t)&processor->processor_set->pset_runq;
}

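/* The per-processor run queue, holding only threads bound to this processor */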
__attribute__((always_inline))
static inline run_queue_t
multiq_bound_runq(processor_t processor)
{
	return &processor->runq;
}

__attribute__((always_inline))
static inline sched_entry_t
group_entry_for_pri(sched_group_t group, integer_t pri)
{
	return &group->entries[pri];
}

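/*
 * Recover the owning group from one of its entries: 'entries' is the first
 * member of struct sched_group and entry->sched_pri is the entry's index,
 * so stepping back sched_pri elements lands on entries[0], which shares
 * the group's address.
 */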
__attribute__((always_inline))
static inline sched_group_t
group_for_entry(sched_entry_t entry)
{
	sched_group_t group = (sched_group_t)(entry - entry->sched_pri);
	return group;
}

/* Peek at the head of the runqueue */
static sched_entry_t
entry_queue_first_entry(entry_queue_t rq)
{
	assert(rq->count != 0);

	queue_t queue = rq->queues + rq->highq;

	sched_entry_t entry = (sched_entry_t)queue_first(queue);

	assert(entry->sched_pri == rq->highq);

	return entry;
}

#if defined(MULTIQ_SANITY_CHECK)

__attribute__((always_inline))
static inline boolean_t
queue_chain_linked(queue_chain_t* chain)
{
	if (chain->next != NULL) {
		assert(chain->prev != NULL);
		return TRUE;
	} else {
		assert(chain->prev == NULL);
		return FALSE;
	}
}

static thread_t
group_first_thread(sched_group_t group)
{
	group_runq_t rq = &group->runq;

	assert(rq->count != 0);

	queue_t queue = rq->queues + rq->highq;

	thread_t thread = (thread_t)(void*)queue_first(queue);

	assert(thread != THREAD_NULL);

	assert(thread->sched_group == group);

	/* TODO: May not be safe */
	assert(thread->sched_pri == rq->highq);

	return thread;
}

/* Asserts if entry is not in entry runq at pri */
static void
entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pri)
{
	queue_t q;
	sched_entry_t elem;

	assert(queue_chain_linked(&entry->links));
	assert(entry->runq == MULTIQ_ERUNQ);

	q = &runq->queues[expected_pri];

	queue_iterate(q, elem, sched_entry_t, links) {
		if (elem == entry)
			return;
	}

	panic("runq %p doesn't contain entry %p at pri %d", runq, entry, expected_pri);
}

/* Asserts if thread is not in group at its priority */
static void
sched_group_check_thread(sched_group_t group, thread_t thread)
{
	queue_t q;
	thread_t elem;
	int pri = thread->sched_pri;

	assert(thread->runq != PROCESSOR_NULL);

	q = &group->runq.queues[pri];

	queue_iterate(q, elem, thread_t, links) {
		if (elem == thread)
			return;
	}

	panic("group %p doesn't contain thread %p at pri %d", group, thread, pri);
}

static void
global_check_entry_queue(entry_queue_t main_entryq)
{
	if (main_entryq->count == 0)
		return;

	sched_entry_t entry = entry_queue_first_entry(main_entryq);

	assert(entry->runq == MULTIQ_ERUNQ);

	sched_group_t group = group_for_entry(entry);

	thread_t thread = group_first_thread(group);

	__assert_only sched_entry_t thread_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);

	assert(entry->sched_pri == group->runq.highq);

	assert(entry == thread_entry);
	assert(thread->runq != PROCESSOR_NULL);
}

static void
group_check_run_queue(entry_queue_t main_entryq, sched_group_t group)
{
	if (group->runq.count == 0)
		return;

	thread_t thread = group_first_thread(group);

	assert(thread->runq != PROCESSOR_NULL);

	sched_entry_t sched_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);

	entry_queue_check_entry(main_entryq, sched_entry, thread->sched_pri);

	assert(sched_entry->sched_pri == thread->sched_pri);
	assert(sched_entry->runq == MULTIQ_ERUNQ);
}

#endif /* defined(MULTIQ_SANITY_CHECK) */

/*
 * The run queue must not be empty.
 */
static sched_entry_t
entry_queue_dequeue_entry(entry_queue_t rq)
{
	sched_entry_t   sched_entry;
	queue_t         queue = rq->queues + rq->highq;

	assert(rq->count > 0);
	assert(!queue_empty(queue));

	sched_entry = (sched_entry_t)dequeue_head(queue);

	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	rq->count--;
	if (SCHED(priority_is_urgent)(rq->highq)) {
		rq->urgency--; assert(rq->urgency >= 0);
	}
	if (queue_empty(queue)) {
		if (rq->highq != IDLEPRI)
			clrbit(MAXPRI - rq->highq, rq->bitmap);
		rq->highq = MAXPRI - ffsbit(rq->bitmap);
	}

	sched_entry->runq = 0;

	return (sched_entry);
}

/*
 * The entry must not already be on a run queue.
 * Returns TRUE if this enqueue raised rq->highq.
 */
static boolean_t
entry_queue_enqueue_entry(
                          entry_queue_t rq,
                          sched_entry_t entry,
                          integer_t     options)
{
	int             sched_pri = entry->sched_pri;
	queue_t         queue = rq->queues + sched_pri;
	boolean_t       result = FALSE;

	assert(entry->runq == 0);

	if (queue_empty(queue)) {
		enqueue_tail(queue, (queue_entry_t)entry);

		setbit(MAXPRI - sched_pri, rq->bitmap);
		if (sched_pri > rq->highq) {
			rq->highq = sched_pri;
			result = TRUE;
		}
	} else {
		if (options & SCHED_TAILQ)
			enqueue_tail(queue, (queue_entry_t)entry);
		else
			enqueue_head(queue, (queue_entry_t)entry);
	}
	if (SCHED(priority_is_urgent)(sched_pri))
		rq->urgency++;
	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	rq->count++;

	entry->runq = MULTIQ_ERUNQ;

	return (result);
}

/*
 * The entry must be in this runqueue.
 */
static void
entry_queue_remove_entry(
                         entry_queue_t  rq,
                         sched_entry_t  entry)
{
	int sched_pri = entry->sched_pri;

#if defined(MULTIQ_SANITY_CHECK)
	if (multiq_sanity_check) {
		entry_queue_check_entry(rq, entry, sched_pri);
	}
#endif

	remqueue((queue_entry_t)entry);

	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	rq->count--;
	if (SCHED(priority_is_urgent)(sched_pri)) {
		rq->urgency--; assert(rq->urgency >= 0);
	}

	if (queue_empty(rq->queues + sched_pri)) {
		/* update run queue status */
		if (sched_pri != IDLEPRI)
			clrbit(MAXPRI - sched_pri, rq->bitmap);
		rq->highq = MAXPRI - ffsbit(rq->bitmap);
	}

	entry->runq = 0;
}

/*
 * The run queue must not be empty.
 *
 * sets queue_empty to TRUE if queue is now empty at thread_pri
 */
static thread_t
group_run_queue_dequeue_thread(
                         group_runq_t   rq,
                         integer_t     *thread_pri,
                         boolean_t     *queue_empty)
{
	thread_t        thread;
	queue_t         queue = rq->queues + rq->highq;

	assert(rq->count > 0);
	assert(!queue_empty(queue));

	*thread_pri = rq->highq;

	thread = (thread_t)(void*)dequeue_head(queue);

	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	rq->count--;
	if (SCHED(priority_is_urgent)(rq->highq)) {
		rq->urgency--; assert(rq->urgency >= 0);
	}
	if (queue_empty(queue)) {
		if (rq->highq != IDLEPRI)
			clrbit(MAXPRI - rq->highq, rq->bitmap);
		rq->highq = MAXPRI - ffsbit(rq->bitmap);
		*queue_empty = TRUE;
	} else {
		*queue_empty = FALSE;
	}

	return (thread);
}

/*
 * The thread must not already be on a run queue.
 * Returns TRUE if the queue was empty at thread_pri.
 */
static boolean_t
group_run_queue_enqueue_thread(
                         group_runq_t   rq,
                         thread_t       thread,
                         integer_t      thread_pri,
                         integer_t      options)
{
	queue_t         queue = rq->queues + thread_pri;
	boolean_t       result = FALSE;

	assert(thread->runq == PROCESSOR_NULL);

	if (queue_empty(queue)) {
		enqueue_tail(queue, (queue_entry_t)thread);

		setbit(MAXPRI - thread_pri, rq->bitmap);
		if (thread_pri > rq->highq) {
			rq->highq = thread_pri;
		}
		result = TRUE;
	} else {
		if (options & SCHED_TAILQ)
			enqueue_tail(queue, (queue_entry_t)thread);
		else
			enqueue_head(queue, (queue_entry_t)thread);
	}
	if (SCHED(priority_is_urgent)(thread_pri))
		rq->urgency++;
	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	rq->count++;

	return (result);
}

/*
 * The thread must be in this runqueue.
 * returns TRUE if queue is now empty at thread_pri
 */
static boolean_t
group_run_queue_remove_thread(
                        group_runq_t    rq,
                        thread_t        thread,
                        integer_t       thread_pri)
{
	boolean_t       result = FALSE;

	assert(thread->runq != PROCESSOR_NULL);

	remqueue((queue_entry_t)thread);

	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	rq->count--;
	if (SCHED(priority_is_urgent)(thread_pri)) {
		rq->urgency--; assert(rq->urgency >= 0);
	}

	if (queue_empty(rq->queues + thread_pri)) {
		/* update run queue status */
		if (thread_pri != IDLEPRI)
			clrbit(MAXPRI - thread_pri, rq->bitmap);
		rq->highq = MAXPRI - ffsbit(rq->bitmap);
		result = TRUE;
	}

	thread->runq = PROCESSOR_NULL;

	return result;
}

/*
 * A thread's sched pri may change out from under us because
 * we're clearing thread->runq here without the thread locked.
 * Do not rely on it to be the same as when we enqueued.
 */
static thread_t
sched_global_dequeue_thread(entry_queue_t main_entryq)
{
	boolean_t pri_level_empty = FALSE;
	sched_entry_t entry;
	group_runq_t group_runq;
	thread_t thread;
	integer_t thread_pri;
	sched_group_t group;

	assert(main_entryq->count > 0);

	entry = entry_queue_dequeue_entry(main_entryq);

	group = group_for_entry(entry);
	group_runq = &group->runq;

	thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

	thread->runq = PROCESSOR_NULL;

	if (!pri_level_empty) {
		entry_queue_enqueue_entry(main_entryq, entry, SCHED_TAILQ);
	}

	return thread;
}

/* Dequeue a thread from the global runq without moving the entry */
static thread_t
sched_global_deep_drain_dequeue_thread(entry_queue_t main_entryq)
{
	boolean_t pri_level_empty = FALSE;
	sched_entry_t entry;
	group_runq_t group_runq;
	thread_t thread;
	integer_t thread_pri;
	sched_group_t group;

	assert(main_entryq->count > 0);

	entry = entry_queue_first_entry(main_entryq);

	group = group_for_entry(entry);
	group_runq = &group->runq;

	thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

	thread->runq = PROCESSOR_NULL;

	if (pri_level_empty) {
		entry_queue_remove_entry(main_entryq, entry);
	}

	return thread;
}

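/*
 * Dequeue the highest-priority thread from the given group, removing the
 * group's entry from the main entry queue if that priority level is now empty.
 */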
static thread_t
sched_group_dequeue_thread(
                           entry_queue_t main_entryq,
                           sched_group_t group)
{
	group_runq_t group_runq = &group->runq;
	boolean_t pri_level_empty = FALSE;
	thread_t thread;
	integer_t thread_pri;

	thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

	thread->runq = PROCESSOR_NULL;

	if (pri_level_empty) {
		entry_queue_remove_entry(main_entryq, group_entry_for_pri(group, thread_pri));
	}

	return thread;
}

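/*
 * Remove a specific thread from its group's runq, removing the group's
 * entry from the main entry queue if that priority level is now empty.
 */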
static void
sched_group_remove_thread(
                          entry_queue_t main_entryq,
                          sched_group_t group,
                          thread_t thread)
{
	integer_t thread_pri = thread->sched_pri;
	sched_entry_t sched_entry = group_entry_for_pri(group, thread_pri);

#if defined(MULTIQ_SANITY_CHECK)
	if (multiq_sanity_check) {
		global_check_entry_queue(main_entryq);
		group_check_run_queue(main_entryq, group);

		sched_group_check_thread(group, thread);
		entry_queue_check_entry(main_entryq, sched_entry, thread_pri);
	}
#endif

	boolean_t pri_level_empty = group_run_queue_remove_thread(&group->runq, thread, thread_pri);

	if (pri_level_empty) {
		entry_queue_remove_entry(main_entryq, sched_entry);
	}

#if defined(MULTIQ_SANITY_CHECK)
	if (multiq_sanity_check) {
		global_check_entry_queue(main_entryq);
		group_check_run_queue(main_entryq, group);
	}
#endif
}

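/*
 * Enqueue a thread on its group's runq; if it is the first thread at that
 * priority level, publish the group's entry for the level on the main entry queue.
 */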
static void
sched_group_enqueue_thread(
                           entry_queue_t        main_entryq,
                           sched_group_t        group,
                           thread_t             thread,
                           integer_t            options)
{
#if defined(MULTIQ_SANITY_CHECK)
	if (multiq_sanity_check) {
		global_check_entry_queue(main_entryq);
		group_check_run_queue(main_entryq, group);
	}
#endif

	int sched_pri = thread->sched_pri;

	boolean_t pri_level_was_empty = group_run_queue_enqueue_thread(&group->runq, thread, sched_pri, options);

	if (pri_level_was_empty) {
		/*
		 * TODO: Need to figure out if passing options here is a good idea or not
		 * What effects would it have?
		 */
		entry_queue_enqueue_entry(main_entryq, &group->entries[sched_pri], options);
	}
}

/*
 *  Locate a thread to execute from the run queue and return it.
 *  Only choose a thread with greater or equal priority.
 *
 *  pset is locked, thread is not locked.
 *
 *  Returns THREAD_NULL if it cannot find a valid thread.
 *
 *  Note: we cannot rely on the value of thread->sched_pri in this path because
 *  we don't have the thread locked.
 *
 *  TODO: Remove tracepoints
 */
static thread_t
sched_multiq_choose_thread(
                           processor_t      processor,
                           int              priority,
                           ast_t            reason)
{
	entry_queue_t   main_entryq = multiq_main_entryq(processor);
	run_queue_t     bound_runq  = multiq_bound_runq(processor);

	boolean_t choose_bound_runq = FALSE;

	if (bound_runq->highq  < priority &&
	    main_entryq->highq < priority)
		return THREAD_NULL;

	if (bound_runq->count && main_entryq->count) {
		if (bound_runq->highq >= main_entryq->highq) {
			choose_bound_runq = TRUE;
		} else {
			/* Use main runq */
		}
	} else if (bound_runq->count) {
		choose_bound_runq = TRUE;
	} else if (main_entryq->count) {
		/* Use main runq */
	} else {
		return (THREAD_NULL);
	}

	if (choose_bound_runq) {
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
		    MACH_MULTIQ_BOUND, main_entryq->highq, bound_runq->highq, 0, 0);

		return run_queue_dequeue(bound_runq, SCHED_HEADQ);
	}

	sched_group_t group = current_thread()->sched_group;

#if defined(MULTIQ_SANITY_CHECK)
	if (multiq_sanity_check) {
		global_check_entry_queue(main_entryq);
		group_check_run_queue(main_entryq, group);
	}
#endif

	/*
	 * Determine if we should look at the group or the global queue
	 *
	 * TODO:
	 * Perhaps pass reason as a 'should look inside' argument to choose_thread
	 * Should YIELD AST override drain limit?
	 */
	if (group->runq.count != 0 && (reason & AST_PREEMPTION) == 0) {
		boolean_t   drain_limit_hit = FALSE;

		if (main_entryq->highq > group->runq.highq) {
			/*
			 * If there's something elsewhere above the depth limit,
			 * don't pick a thread below the limit.
			 */
			if (main_entryq->highq > drain_depth_limit &&
			    group->runq.highq <= drain_depth_limit)
				drain_limit_hit = TRUE;

			/*
			 * Don't go more than X steps below the global highest
			 */
			if ((main_entryq->highq - group->runq.highq) >= drain_band_limit)
				drain_limit_hit = TRUE;

			/* Don't favor the task when an urgent thread is present. */
			if (drain_urgent_first && main_entryq->urgency > 0)
				drain_limit_hit = TRUE;
		}

		if (!drain_limit_hit) {
			/* Pull from local runq */
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
			    MACH_MULTIQ_GROUP, main_entryq->highq, group->runq.highq, 0, 0);

			return sched_group_dequeue_thread(main_entryq, group);
		}
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
	    MACH_MULTIQ_GLOBAL, main_entryq->highq, group->runq.highq, 0, 0);

	/* Couldn't pull from local runq, pull from global runq instead */
	if (deep_drain) {
		return sched_global_deep_drain_dequeue_thread(main_entryq);
	} else {
		return sched_global_dequeue_thread(main_entryq);
	}
}


/*
 * Thread must be locked, and not already be on a run queue.
 * pset is locked.
 */
static boolean_t
sched_multiq_processor_enqueue(
                               processor_t      processor,
                               thread_t         thread,
                               integer_t        options)
{
	boolean_t       result;

	assert(processor == thread->chosen_processor);

	if (thread->bound_processor != PROCESSOR_NULL) {
		assert(thread->bound_processor == processor);

		result = run_queue_enqueue(multiq_bound_runq(processor), thread, options);
		thread->runq = processor;

		return result;
	}

	sched_group_enqueue_thread(multiq_main_entryq(processor),
	                           thread->sched_group,
	                           thread, options);

	thread->runq = processor;

	return (FALSE);
}

/*
 * Called in the context of thread with thread and pset unlocked,
 * after updating thread priority but before propagating that priority
 * to the processor
 */
void
sched_multiq_quantum_expire(thread_t thread)
{
	if (deep_drain) {
		/*
		 * Move the entry at this priority to the end of the queue,
		 * to allow the next task a shot at running.
		 */

		processor_t processor = thread->last_processor;
		processor_set_t pset = processor->processor_set;
		entry_queue_t entryq = multiq_main_entryq(processor);

		pset_lock(pset);

		sched_entry_t entry = group_entry_for_pri(thread->sched_group, processor->current_pri);

		if (entry->runq == MULTIQ_ERUNQ) {
			entry_queue_remove_entry(entryq, entry);
			entry_queue_enqueue_entry(entryq, entry, SCHED_TAILQ);
		}

		pset_unlock(pset);
	}
}

static boolean_t
sched_multiq_processor_queue_empty(processor_t processor)
{
	return multiq_main_entryq(processor)->count == 0 &&
	       multiq_bound_runq(processor)->count  == 0;
}

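/* Decide whether the runnable queues warrant preempting this processor's current thread */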
static ast_t
sched_multiq_processor_csw_check(processor_t processor)
{
	boolean_t       has_higher;
	int             pri;

	entry_queue_t main_entryq = multiq_main_entryq(processor);
	run_queue_t   bound_runq  = multiq_bound_runq(processor);

	assert(processor->active_thread != NULL);

	pri = MAX(main_entryq->highq, bound_runq->highq);

	if (first_timeslice(processor)) {
		has_higher = (pri > processor->current_pri);
	} else {
		has_higher = (pri >= processor->current_pri);
	}

	if (has_higher) {
		if (main_entryq->urgency > 0)
			return (AST_PREEMPT | AST_URGENT);

		if (bound_runq->urgency > 0)
			return (AST_PREEMPT | AST_URGENT);

		if (processor->active_thread && thread_eager_preemption(processor->active_thread))
			return (AST_PREEMPT | AST_URGENT);

		return AST_PREEMPT;
	}

	return AST_NONE;
}

static boolean_t
sched_multiq_processor_queue_has_priority(
                                          processor_t   processor,
                                          int           priority,
                                          boolean_t     gte)
{
	int qpri = MAX(multiq_main_entryq(processor)->highq, multiq_bound_runq(processor)->highq);

	if (gte)
		return qpri >= priority;
	else
		return qpri > priority;
}

static boolean_t
sched_multiq_should_current_thread_rechoose_processor(processor_t processor)
{
	return (processor->current_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor);
}

static int
sched_multiq_runq_count(processor_t processor)
{
	/*
	 *  TODO: Decide whether to keep a count of runnable threads in the pset
	 *  or just return something less than the true count.
	 *
	 *  This needs to be fast, so no iterating the whole runq.
	 *
	 *  Another possible decision is to remove this - with global runq
	 *  it doesn't make much sense.
	 */
	return multiq_main_entryq(processor)->count + multiq_bound_runq(processor)->count;
}

static uint64_t
sched_multiq_runq_stats_count_sum(processor_t processor)
{
	/*
	 * TODO: This one does need to go through all the runqueues, but it's only needed for
	 * the sched stats tool
	 */

	uint64_t bound_sum = multiq_bound_runq(processor)->runq_stats.count_sum;

	if (processor->cpu_id == processor->processor_set->cpu_set_low)
		return bound_sum + multiq_main_entryq(processor)->runq_stats.count_sum;
	else
		return bound_sum;
}

static int
sched_multiq_processor_bound_count(processor_t processor)
{
	return multiq_bound_runq(processor)->count;
}

static void
sched_multiq_processor_queue_shutdown(processor_t processor)
{
	processor_set_t pset = processor->processor_set;
	entry_queue_t   main_entryq = multiq_main_entryq(processor);
	thread_t        thread;
	queue_head_t    tqueue;

	/* We only need to migrate threads if this is the last active processor in the pset */
	if (pset->online_processor_count > 0) {
		pset_unlock(pset);
		return;
	}

	queue_init(&tqueue);

	/* Note that we do not remove bound threads from the queues here */

	while (main_entryq->count > 0) {
		thread = sched_global_dequeue_thread(main_entryq);
		enqueue_tail(&tqueue, (queue_entry_t)thread);
	}

	pset_unlock(pset);

	while ((thread = (thread_t)(void*)dequeue_head(&tqueue)) != THREAD_NULL) {
		thread_lock(thread);

		thread_setrun(thread, SCHED_TAILQ);

		thread_unlock(thread);
	}
}

/*
 * Thread is locked.
 *
 * This is why we can never read sched_pri unless we have the thread locked,
 * which we do in the enqueue and remove cases, but not the dequeue case.
 */
static boolean_t
sched_multiq_processor_queue_remove(
                                    processor_t processor,
                                    thread_t    thread)
{
	boolean_t removed = FALSE;

	processor_set_t pset = processor->processor_set;

	pset_lock(pset);

	if (thread->runq != PROCESSOR_NULL) {
		/*
		 * Thread is on a run queue and we have a lock on
		 * that run queue.
		 */

		assert(thread->runq == processor);

		if (thread->bound_processor != PROCESSOR_NULL) {
			assert(processor == thread->bound_processor);
			run_queue_remove(multiq_bound_runq(processor), thread);
			thread->runq = PROCESSOR_NULL;
		} else {
			sched_group_remove_thread(multiq_main_entryq(processor),
			                          thread->sched_group,
			                          thread);
		}

		removed = TRUE;
	}

	pset_unlock(pset);

	return removed;
}

/* pset is locked, returned unlocked */
static thread_t
sched_multiq_steal_thread(processor_set_t pset)
{
	pset_unlock(pset);
	return (THREAD_NULL);
}

/*
 * Scan the global queue for candidate groups, and scan those groups for
 * candidate threads.
 *
 * Returns TRUE if retry is needed.
 */
static boolean_t
group_scan(entry_queue_t runq)
{
	int             count;
	queue_t         q;
	sched_group_t   group;
	sched_entry_t   entry;

	if ((count = runq->count) > 0) {
		q = runq->queues + runq->highq;
		while (count > 0) {
			queue_iterate(q, entry, sched_entry_t, links) {
				group = group_for_entry(entry);
				if (group->runq.count > 0) {
					if (runq_scan(&group->runq))
						return (TRUE);
				}
				count--;
			}
			q--;
		}
	}

	return (FALSE);
}

static void
sched_multiq_thread_update_scan(void)
{
	boolean_t               restart_needed = FALSE;
	processor_t             processor = processor_list;
	processor_set_t         pset;
	thread_t                thread;
	spl_t                   s;

	/*
	 *  We update the threads associated with each processor (bound and idle threads)
	 *  and then update the threads in each pset runqueue.
	 */

	do {
		do {
			pset = processor->processor_set;

			s = splsched();
			pset_lock(pset);

			restart_needed = runq_scan(multiq_bound_runq(processor));

			pset_unlock(pset);
			splx(s);

			if (restart_needed)
				break;

			thread = processor->idle_thread;
			if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
				if (thread_update_add_thread(thread) == FALSE) {
					restart_needed = TRUE;
					break;
				}
			}
		} while ((processor = processor->processor_list) != NULL);

		/* Ok, we now have a collection of candidates -- fix them. */
		thread_update_process_threads();

	} while (restart_needed);

	pset = &pset0;

	do {
		do {
			s = splsched();
			pset_lock(pset);

			restart_needed = group_scan(&pset->pset_runq);

			pset_unlock(pset);
			splx(s);

			if (restart_needed)
				break;
		} while ((pset = pset->pset_list) != NULL);

		/* Ok, we now have a collection of candidates -- fix them. */
		thread_update_process_threads();

	} while (restart_needed);
}