1/*
2 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_FREE_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 *  School of Computer Science
50 *  Carnegie Mellon University
51 *  Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 *	File:	sched_prim.c
60 *	Author:	Avadis Tevanian, Jr.
61 *	Date:	1986
62 *
63 *	Scheduling primitives
64 *
65 */
66
67#include <debug.h>
68
69#include <mach/mach_types.h>
70#include <mach/machine.h>
71#include <mach/policy.h>
72#include <mach/sync_policy.h>
73#include <mach/thread_act.h>
74
75#include <machine/machine_routines.h>
76#include <machine/sched_param.h>
77#include <machine/machine_cpu.h>
78#include <machine/machlimits.h>
79
80#include <kern/kern_types.h>
81#include <kern/clock.h>
82#include <kern/counters.h>
83#include <kern/cpu_number.h>
84#include <kern/cpu_data.h>
85#include <kern/debug.h>
86#include <kern/lock.h>
87#include <kern/macro_help.h>
88#include <kern/machine.h>
89#include <kern/misc_protos.h>
90#include <kern/processor.h>
91#include <kern/queue.h>
92#include <kern/sched.h>
93#include <kern/sched_prim.h>
94#include <kern/syscall_subr.h>
95#include <kern/task.h>
96#include <kern/thread.h>
97#include <kern/wait_queue.h>
98#include <kern/ledger.h>
99
100#include <vm/pmap.h>
101#include <vm/vm_kern.h>
102#include <vm/vm_map.h>
103
104#include <mach/sdt.h>
105
106#include <sys/kdebug.h>
107
108#include <kern/pms.h>
109
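/*
 * Global run queues for real-time threads and, when one of the
 * compatible schedulers is configured, fairshare threads.  RT_RUNQ and
 * FS_RUNQ are sentinel "processor" values used to tag a thread that is
 * enqueued on one of these global queues rather than on a per-processor
 * run queue.
 */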
110struct rt_queue	rt_runq;
111#define RT_RUNQ		((processor_t)-1)
112decl_simple_lock_data(static,rt_lock);
113
114#if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_PROTO) || defined(CONFIG_SCHED_GRRR) || defined(CONFIG_SCHED_FIXEDPRIORITY)
115static struct fairshare_queue	fs_runq;
116#define FS_RUNQ		((processor_t)-2)
117decl_simple_lock_data(static,fs_lock);
118#endif
119
120#define		DEFAULT_PREEMPTION_RATE		100		/* (1/s) */
121int			default_preemption_rate = DEFAULT_PREEMPTION_RATE;
122
123#define		DEFAULT_BG_PREEMPTION_RATE	400		/* (1/s) */
124int			default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
125
126#define		MAX_UNSAFE_QUANTA			800
127int			max_unsafe_quanta = MAX_UNSAFE_QUANTA;
128
129#define		MAX_POLL_QUANTA				2
130int			max_poll_quanta = MAX_POLL_QUANTA;
131
132#define		SCHED_POLL_YIELD_SHIFT		4		/* 1/16 */
133int			sched_poll_yield_shift = SCHED_POLL_YIELD_SHIFT;
134
135uint64_t	max_poll_computation;
136
137uint64_t	max_unsafe_computation;
138uint64_t	sched_safe_duration;
139
140#if defined(CONFIG_SCHED_TRADITIONAL)
141
142uint32_t	std_quantum;
143uint32_t	min_std_quantum;
144uint32_t	bg_quantum;
145
146uint32_t	std_quantum_us;
147uint32_t	bg_quantum_us;
148
149#endif /* CONFIG_SCHED_TRADITIONAL */
150
151uint32_t	thread_depress_time;
152uint32_t	default_timeshare_computation;
153uint32_t	default_timeshare_constraint;
154
155uint32_t	max_rt_quantum;
156uint32_t	min_rt_quantum;
157
158#if defined(CONFIG_SCHED_TRADITIONAL)
159
160unsigned	sched_tick;
161uint32_t	sched_tick_interval;
162
163uint32_t	sched_pri_shift = INT8_MAX;
164uint32_t	sched_fixed_shift;
165
166static boolean_t sched_traditional_use_pset_runqueue = FALSE;
167
168/* Defaults for timer deadline profiling */
169#define TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT 2000000 /* Timers with deadlines
170                                                       * <= 2ms */
171#define TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT 5000000 /* Timers with deadlines
172                                                       * <= 5ms */
173uint64_t timer_deadline_tracking_bin_1;
174uint64_t timer_deadline_tracking_bin_2;
175
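/*
 * runq_for_processor:
 *
 * Return the run queue this processor dispatches from: the shared
 * per-pset queue when the pset-runqueue variant of the traditional
 * scheduler is selected, otherwise the processor-local queue.
 */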
176__attribute__((always_inline))
177static inline run_queue_t runq_for_processor(processor_t processor)
178{
179	if (sched_traditional_use_pset_runqueue)
180		return &processor->processor_set->pset_runq;
181	else
182		return &processor->runq;
183}
184
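/*
 * Maintain a count of processor-bound threads on the run queue (and on
 * the pset-wide queue when that variant is active) so that later
 * queue-empty and steal checks can account for threads that cannot be
 * migrated to another processor.
 */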
185__attribute__((always_inline))
186static inline void runq_consider_incr_bound_count(processor_t processor, thread_t thread)
187{
188	if (thread->bound_processor == PROCESSOR_NULL)
189		return;
190
191	assert(thread->bound_processor == processor);
192
193	if (sched_traditional_use_pset_runqueue)
194		processor->processor_set->pset_runq_bound_count++;
195
196	processor->runq_bound_count++;
197}
198
199__attribute__((always_inline))
200static inline void runq_consider_decr_bound_count(processor_t processor, thread_t thread)
201{
202	if (thread->bound_processor == PROCESSOR_NULL)
203		return;
204
205	assert(thread->bound_processor == processor);
206
207	if (sched_traditional_use_pset_runqueue)
208		processor->processor_set->pset_runq_bound_count--;
209
210	processor->runq_bound_count--;
211}
212
213#endif /* CONFIG_SCHED_TRADITIONAL */
214
215uint64_t	sched_one_second_interval;
216
217uint32_t	sched_run_count, sched_share_count;
218uint32_t	sched_load_average, sched_mach_factor;
219
220/* Forwards */
221
222#if defined(CONFIG_SCHED_TRADITIONAL)
223
224static void load_shift_init(void) __attribute__((section("__TEXT, initcode")));
225static void preempt_pri_init(void) __attribute__((section("__TEXT, initcode")));
226
227#endif /* CONFIG_SCHED_TRADITIONAL */
228
229static thread_t	thread_select(
230					thread_t			thread,
231					processor_t			processor);
232
233#if CONFIG_SCHED_IDLE_IN_PLACE
234static thread_t	thread_select_idle(
235					thread_t			thread,
236					processor_t			processor);
237#endif
238
239thread_t	processor_idle(
240					thread_t			thread,
241					processor_t			processor);
242
243#if defined(CONFIG_SCHED_TRADITIONAL)
244
245static thread_t	steal_thread(
246					processor_set_t		pset);
247
248static thread_t	steal_thread_disabled(
249					processor_set_t		pset) __attribute__((unused));
250
251
252static thread_t	steal_processor_thread(
253					processor_t			processor);
254
255static void		thread_update_scan(void);
256
257static void processor_setrun(
258				 processor_t			processor,
259				 thread_t			thread,
260				 integer_t			options);
261
262static boolean_t
263processor_enqueue(
264				  processor_t		processor,
265				  thread_t		thread,
266				  integer_t		options);
267
268static boolean_t
269processor_queue_remove(
270					   processor_t			processor,
271					   thread_t		thread);
272
273static boolean_t	processor_queue_empty(processor_t		processor);
274
275static boolean_t	priority_is_urgent(int priority);
276
277static ast_t		processor_csw_check(processor_t processor);
278
279static boolean_t	processor_queue_has_priority(processor_t		processor,
280											int				priority,
281											boolean_t		gte);
282
283static boolean_t	should_current_thread_rechoose_processor(processor_t			processor);
284
285static int     sched_traditional_processor_runq_count(processor_t   processor);
286
287static boolean_t	sched_traditional_with_pset_runqueue_processor_queue_empty(processor_t		processor);
288
289static uint64_t     sched_traditional_processor_runq_stats_count_sum(processor_t   processor);
290
291static uint64_t		sched_traditional_with_pset_runqueue_processor_runq_stats_count_sum(processor_t   processor);
292#endif
293
294
295#if defined(CONFIG_SCHED_TRADITIONAL)
296
297static void
298sched_traditional_init(void);
299
300static void
301sched_traditional_timebase_init(void);
302
303static void
304sched_traditional_processor_init(processor_t processor);
305
306static void
307sched_traditional_pset_init(processor_set_t pset);
308
309static void
310sched_traditional_with_pset_runqueue_init(void);
311
312#endif
313
314static void
315sched_realtime_init(void)  __attribute__((section("__TEXT, initcode")));
316
317static void
318sched_realtime_timebase_init(void);
319
320static void
321sched_timer_deadline_tracking_init(void);
322
323#if defined(CONFIG_SCHED_TRADITIONAL)
324static void
325sched_traditional_tick_continue(void);
326
327static uint32_t
328sched_traditional_initial_quantum_size(thread_t thread);
329
330static sched_mode_t
331sched_traditional_initial_thread_sched_mode(task_t parent_task);
332
333static boolean_t
334sched_traditional_supports_timeshare_mode(void);
335
336static thread_t
337sched_traditional_choose_thread(
338								processor_t		processor,
339								int				priority);
340
341#endif
342
343#if	DEBUG
344extern int debug_task;
345#define TLOG(a, fmt, args...) /* kprintf(fmt, ## args) */
346#else
347#define TLOG(a, fmt, args...) /* kprintf(fmt, ## args) */
348#endif
349
350#if	DEBUG
351static
352boolean_t	thread_runnable(
353				thread_t		thread);
354
355#endif	/*DEBUG*/
356
357/*
358 *	State machine
359 *
360 * states are combinations of:
361 *  R	running
362 *  W	waiting (or on wait queue)
363 *  N	non-interruptible
364 *  O	swapped out
365 *  I	being swapped in
366 *
367 * init	action
368 *	assert_wait thread_block    clear_wait 		swapout	swapin
369 *
370 * R	RW, RWN	    R;   setrun	    -	       		-
371 * RN	RWN	    RN;  setrun	    -	       		-
372 *
373 * RW		    W		    R	       		-
374 * RWN		    WN		    RN	       		-
375 *
376 * W				    R;   setrun		WO
377 * WN				    RN;  setrun		-
378 *
379 * RO				    -			-	R
380 *
381 */
382
383#if defined(CONFIG_SCHED_TRADITIONAL)
384int8_t		sched_load_shifts[NRQS];
385int		sched_preempt_pri[NRQBM];
386#endif
387
388
389#if defined(CONFIG_SCHED_TRADITIONAL)
390
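/*
 * Dispatch tables for the traditional scheduler.  The initializers are
 * positional and must remain in the same order as the function pointers
 * declared in struct sched_dispatch_table.  The pset-runqueue variant
 * below differs only in its init, queue-empty and run-queue statistics
 * entries, and in not dispatching directly to idle processors.
 */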
391const struct sched_dispatch_table sched_traditional_dispatch = {
392	sched_traditional_init,
393	sched_traditional_timebase_init,
394	sched_traditional_processor_init,
395	sched_traditional_pset_init,
396	sched_traditional_tick_continue,
397	sched_traditional_choose_thread,
398	steal_thread,
399	compute_priority,
400	choose_processor,
401	processor_enqueue,
402	processor_queue_shutdown,
403	processor_queue_remove,
404	processor_queue_empty,
405	priority_is_urgent,
406	processor_csw_check,
407	processor_queue_has_priority,
408	sched_traditional_initial_quantum_size,
409	sched_traditional_initial_thread_sched_mode,
410	sched_traditional_supports_timeshare_mode,
411	can_update_priority,
412	update_priority,
413	lightweight_update_priority,
414	sched_traditional_quantum_expire,
415	should_current_thread_rechoose_processor,
416	sched_traditional_processor_runq_count,
417	sched_traditional_processor_runq_stats_count_sum,
418	sched_traditional_fairshare_init,
419	sched_traditional_fairshare_runq_count,
420	sched_traditional_fairshare_runq_stats_count_sum,
421	sched_traditional_fairshare_enqueue,
422	sched_traditional_fairshare_dequeue,
423	sched_traditional_fairshare_queue_remove,
424	TRUE /* direct_dispatch_to_idle_processors */
425};
426
427const struct sched_dispatch_table sched_traditional_with_pset_runqueue_dispatch = {
428	sched_traditional_with_pset_runqueue_init,
429	sched_traditional_timebase_init,
430	sched_traditional_processor_init,
431	sched_traditional_pset_init,
432	sched_traditional_tick_continue,
433	sched_traditional_choose_thread,
434	steal_thread,
435	compute_priority,
436	choose_processor,
437	processor_enqueue,
438	processor_queue_shutdown,
439	processor_queue_remove,
440	sched_traditional_with_pset_runqueue_processor_queue_empty,
441	priority_is_urgent,
442	processor_csw_check,
443	processor_queue_has_priority,
444	sched_traditional_initial_quantum_size,
445	sched_traditional_initial_thread_sched_mode,
446	sched_traditional_supports_timeshare_mode,
447	can_update_priority,
448	update_priority,
449	lightweight_update_priority,
450	sched_traditional_quantum_expire,
451	should_current_thread_rechoose_processor,
452	sched_traditional_processor_runq_count,
453	sched_traditional_with_pset_runqueue_processor_runq_stats_count_sum,
454	sched_traditional_fairshare_init,
455	sched_traditional_fairshare_runq_count,
456	sched_traditional_fairshare_runq_stats_count_sum,
457	sched_traditional_fairshare_enqueue,
458	sched_traditional_fairshare_dequeue,
459	sched_traditional_fairshare_queue_remove,
460	FALSE /* direct_dispatch_to_idle_processors */
461};
462
463#endif
464
465const struct sched_dispatch_table *sched_current_dispatch = NULL;
466
467/*
468 * Statically allocate a buffer to hold the longest possible
469 * scheduler description string, as currently implemented.
470 * bsd/kern/kern_sysctl.c has a corresponding definition that is
471 * used to export the string to userspace via sysctl(3). If either
472 * version changes, update the other.
473 *
474 * Note that in addition to being an upper bound on the strings
475 * in the kernel, it's also an exact parameter to PE_get_default(),
476 * which interrogates the device tree on some platforms. That
477 * API requires the caller know the exact size of the device tree
478 * property, so we need both a legacy size (32) and the current size
479 * (48) to deal with old and new device trees. The device tree property
480 * is similarly padded to a fixed size so that the same kernel image
481 * can run on multiple devices with different schedulers configured
482 * in the device tree.
483 */
484#define SCHED_STRING_MAX_LENGTH (48)
485
486char sched_string[SCHED_STRING_MAX_LENGTH];
487static enum sched_enum _sched_enum = sched_enum_unknown;
488
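/*
 * Scheduler selection happens in two steps: a "sched" boot-arg wins if
 * present, otherwise the device tree's kern.sched property is consulted,
 * and only then does the code fall back to the compile-time default.
 * For example, booting with sched=<name>, where <name> matches one of
 * the kSched*String constants compiled into this kernel, selects the
 * corresponding dispatch table at runtime.
 */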
489void
490sched_init(void)
491{
492	char sched_arg[SCHED_STRING_MAX_LENGTH] = { '\0' };
493
494	/* Check for runtime selection of the scheduler algorithm */
495	if (!PE_parse_boot_argn("sched", sched_arg, sizeof (sched_arg))) {
496		/* If no boot-args override, look in device tree */
497		if (!PE_get_default("kern.sched", sched_arg,
498							SCHED_STRING_MAX_LENGTH)) {
499			sched_arg[0] = '\0';
500		}
501	}
502
503	if (strlen(sched_arg) > 0) {
504		if (0) {
505			/* Allow pattern below */
506#if defined(CONFIG_SCHED_TRADITIONAL)
507		} else if (0 == strcmp(sched_arg, kSchedTraditionalString)) {
508			sched_current_dispatch = &sched_traditional_dispatch;
509			_sched_enum = sched_enum_traditional;
510			strlcpy(sched_string, kSchedTraditionalString, sizeof(sched_string));
511			kprintf("Scheduler: Runtime selection of %s\n", kSchedTraditionalString);
512		} else if (0 == strcmp(sched_arg, kSchedTraditionalWithPsetRunqueueString)) {
513			sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
514			_sched_enum = sched_enum_traditional_with_pset_runqueue;
515			strlcpy(sched_string, kSchedTraditionalWithPsetRunqueueString, sizeof(sched_string));
516			kprintf("Scheduler: Runtime selection of %s\n", kSchedTraditionalWithPsetRunqueueString);
517#endif
518#if defined(CONFIG_SCHED_PROTO)
519		} else if (0 == strcmp(sched_arg, kSchedProtoString)) {
520			sched_current_dispatch = &sched_proto_dispatch;
521			_sched_enum = sched_enum_proto;
522			strlcpy(sched_string, kSchedProtoString, sizeof(sched_string));
523			kprintf("Scheduler: Runtime selection of %s\n", kSchedProtoString);
524#endif
525#if defined(CONFIG_SCHED_GRRR)
526		} else if (0 == strcmp(sched_arg, kSchedGRRRString)) {
527			sched_current_dispatch = &sched_grrr_dispatch;
528			_sched_enum = sched_enum_grrr;
529			strlcpy(sched_string, kSchedGRRRString, sizeof(sched_string));
530			kprintf("Scheduler: Runtime selection of %s\n", kSchedGRRRString);
531#endif
532#if defined(CONFIG_SCHED_FIXEDPRIORITY)
533		} else if (0 == strcmp(sched_arg, kSchedFixedPriorityString)) {
534			sched_current_dispatch = &sched_fixedpriority_dispatch;
535			_sched_enum = sched_enum_fixedpriority;
536			strlcpy(sched_string, kSchedFixedPriorityString, sizeof(sched_string));
537			kprintf("Scheduler: Runtime selection of %s\n", kSchedFixedPriorityString);
538		} else if (0 == strcmp(sched_arg, kSchedFixedPriorityWithPsetRunqueueString)) {
539			sched_current_dispatch = &sched_fixedpriority_with_pset_runqueue_dispatch;
540			_sched_enum = sched_enum_fixedpriority_with_pset_runqueue;
541			strlcpy(sched_string, kSchedFixedPriorityWithPsetRunqueueString, sizeof(sched_string));
542			kprintf("Scheduler: Runtime selection of %s\n", kSchedFixedPriorityWithPsetRunqueueString);
543#endif
544		} else {
545			panic("Unrecognized scheduler algorithm: %s", sched_arg);
546		}
547	} else {
548#if   defined(CONFIG_SCHED_TRADITIONAL)
549		sched_current_dispatch = &sched_traditional_dispatch;
550		_sched_enum = sched_enum_traditional;
551		strlcpy(sched_string, kSchedTraditionalString, sizeof(sched_string));
552		kprintf("Scheduler: Default of %s\n", kSchedTraditionalString);
553#elif defined(CONFIG_SCHED_PROTO)
554		sched_current_dispatch = &sched_proto_dispatch;
555		_sched_enum = sched_enum_proto;
556		strlcpy(sched_string, kSchedProtoString, sizeof(sched_string));
557		kprintf("Scheduler: Default of %s\n", kSchedProtoString);
558#elif defined(CONFIG_SCHED_GRRR)
559		sched_current_dispatch = &sched_grrr_dispatch;
560		_sched_enum = sched_enum_grrr;
561		strlcpy(sched_string, kSchedGRRRString, sizeof(sched_string));
562		kprintf("Scheduler: Default of %s\n", kSchedGRRRString);
563#elif defined(CONFIG_SCHED_FIXEDPRIORITY)
564		sched_current_dispatch = &sched_fixedpriority_dispatch;
565		_sched_enum = sched_enum_fixedpriority;
566		strlcpy(sched_string, kSchedFixedPriorityString, sizeof(sched_string));
567		kprintf("Scheduler: Default of %s\n", kSchedFixedPriorityString);
568#else
569#error No default scheduler implementation
570#endif
571	}
572
573	SCHED(init)();
574	SCHED(fairshare_init)();
575	sched_realtime_init();
576	ast_init();
577	sched_timer_deadline_tracking_init();
578
579	SCHED(pset_init)(&pset0);
580	SCHED(processor_init)(master_processor);
581}
582
583void
584sched_timebase_init(void)
585{
586	uint64_t	abstime;
587
588	clock_interval_to_absolutetime_interval(1, NSEC_PER_SEC, &abstime);
589	sched_one_second_interval = abstime;
590
591	SCHED(timebase_init)();
592	sched_realtime_timebase_init();
593}
594
595#if defined(CONFIG_SCHED_TRADITIONAL)
596
597static void
598sched_traditional_init(void)
599{
600	/*
601	 * Calculate the timeslicing quantum
602	 * in us.
603	 */
604	if (default_preemption_rate < 1)
605		default_preemption_rate = DEFAULT_PREEMPTION_RATE;
606	std_quantum_us = (1000 * 1000) / default_preemption_rate;
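	/* With the default preemption rate of 100 this is 10000 us (a 10 ms quantum). */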
607
608	printf("standard timeslicing quantum is %d us\n", std_quantum_us);
609
610	if (default_bg_preemption_rate < 1)
611		default_bg_preemption_rate = DEFAULT_BG_PREEMPTION_RATE;
612	bg_quantum_us = (1000 * 1000) / default_bg_preemption_rate;
613
614	printf("standard background quantum is %d us\n", bg_quantum_us);
615
616	load_shift_init();
617	preempt_pri_init();
618	sched_tick = 0;
619}
620
621static void
622sched_traditional_timebase_init(void)
623{
624	uint64_t	abstime;
625	uint32_t	shift;
626
627	/* standard timeslicing quantum */
628	clock_interval_to_absolutetime_interval(
629							std_quantum_us, NSEC_PER_USEC, &abstime);
630	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
631	std_quantum = (uint32_t)abstime;
632
633	/* smallest remaining quantum (250 us) */
634	clock_interval_to_absolutetime_interval(250, NSEC_PER_USEC, &abstime);
635	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
636	min_std_quantum = (uint32_t)abstime;
637
638	/* quantum for background tasks */
639	clock_interval_to_absolutetime_interval(
640							bg_quantum_us, NSEC_PER_USEC, &abstime);
641	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
642	bg_quantum = (uint32_t)abstime;
643
644	/* scheduler tick interval */
645	clock_interval_to_absolutetime_interval(USEC_PER_SEC >> SCHED_TICK_SHIFT,
646													NSEC_PER_USEC, &abstime);
647	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
648	sched_tick_interval = (uint32_t)abstime;
649
650	/*
651	 * Compute conversion factor from usage to
652	 * timesharing priorities with 5/8 ** n aging.
653	 */
654	abstime = (abstime * 5) / 3;
655	for (shift = 0; abstime > BASEPRI_DEFAULT; ++shift)
656		abstime >>= 1;
657	sched_fixed_shift = shift;
658
659	max_unsafe_computation = max_unsafe_quanta * std_quantum;
660	sched_safe_duration = 2 * max_unsafe_quanta * std_quantum;
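	/* With the defaults (800 quanta of 10 ms) the unsafe window is 8 s and the fail-safe duration 16 s. */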
661
662	max_poll_computation = max_poll_quanta * std_quantum;
663	thread_depress_time = 1 * std_quantum;
664	default_timeshare_computation = std_quantum / 2;
665	default_timeshare_constraint = std_quantum;
666
667}
668
669static void
670sched_traditional_processor_init(processor_t processor)
671{
672	if (!sched_traditional_use_pset_runqueue) {
673		run_queue_init(&processor->runq);
674	}
675	processor->runq_bound_count = 0;
676}
677
678static void
679sched_traditional_pset_init(processor_set_t pset)
680{
681	if (sched_traditional_use_pset_runqueue) {
682		run_queue_init(&pset->pset_runq);
683	}
684	pset->pset_runq_bound_count = 0;
685}
686
687static void
688sched_traditional_with_pset_runqueue_init(void)
689{
690	sched_traditional_init();
691	sched_traditional_use_pset_runqueue = TRUE;
692}
693
694#endif /* CONFIG_SCHED_TRADITIONAL */
695
696#if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_PROTO) || defined(CONFIG_SCHED_GRRR) || defined(CONFIG_SCHED_FIXEDPRIORITY)
697void
698sched_traditional_fairshare_init(void)
699{
700	simple_lock_init(&fs_lock, 0);
701
702	fs_runq.count = 0;
703	queue_init(&fs_runq.queue);
704}
705#endif
706
707static void
708sched_realtime_init(void)
709{
710	simple_lock_init(&rt_lock, 0);
711
712	rt_runq.count = 0;
713	queue_init(&rt_runq.queue);
714}
715
716static void
717sched_realtime_timebase_init(void)
718{
719	uint64_t abstime;
720
721	/* smallest rt computation (50 us) */
722	clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime);
723	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
724	min_rt_quantum = (uint32_t)abstime;
725
726	/* maximum rt computation (50 ms) */
727	clock_interval_to_absolutetime_interval(
728		50, 1000*NSEC_PER_USEC, &abstime);
729	assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
730	max_rt_quantum = (uint32_t)abstime;
731
732}
733
734#if defined(CONFIG_SCHED_TRADITIONAL)
735
736/*
737 * Set up values for timeshare
738 * loading factors.
739 */
740static void
741load_shift_init(void)
742{
743	int8_t		k, *p = sched_load_shifts;
744	uint32_t	i, j;
745
746	*p++ = INT8_MIN; *p++ = 0;
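	/* Entries 2-3 get 1, 4-7 get 2, 8-15 get 3, and so on: roughly log2 of the runnable count. */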
747
748	for (i = j = 2, k = 1; i < NRQS; ++k) {
749		for (j <<= 1; i < j; ++i)
750			*p++ = k;
751	}
752}
753
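/*
 * Mark the priority bands whose threads warrant immediate preemption
 * when they become runnable: everything above the foreground band and
 * below the kernel minimum, plus BASEPRI_PREEMPT and higher.
 */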
754static void
755preempt_pri_init(void)
756{
757	int		i, *p = sched_preempt_pri;
758
759	for (i = BASEPRI_FOREGROUND + 1; i < MINPRI_KERNEL; ++i)
760		setbit(i, p);
761
762	for (i = BASEPRI_PREEMPT; i <= MAXPRI; ++i)
763		setbit(i, p);
764}
765
766#endif /* CONFIG_SCHED_TRADITIONAL */
767
768/*
769 *	Thread wait timer expiration.
770 */
771void
772thread_timer_expire(
773	void			*p0,
774	__unused void	*p1)
775{
776	thread_t		thread = p0;
777	spl_t			s;
778
779	s = splsched();
780	thread_lock(thread);
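	/*
	 * wait_timer_active counts outstanding references to the wait timer;
	 * only the final expiration clears the wait, and only if the timer
	 * was not cancelled first.
	 */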
781	if (--thread->wait_timer_active == 0) {
782		if (thread->wait_timer_is_set) {
783			thread->wait_timer_is_set = FALSE;
784			clear_wait_internal(thread, THREAD_TIMED_OUT);
785		}
786	}
787	thread_unlock(thread);
788	splx(s);
789}
790
791#ifndef	__LP64__
792
793/*
794 *	thread_set_timer:
795 *
796 *	Set a timer for the current thread, if the thread
797 *	is ready to wait.  Must be called between assert_wait()
798 *	and thread_block().
799 */
800void
801thread_set_timer(
802	uint32_t		interval,
803	uint32_t		scale_factor)
804{
805	thread_t		thread = current_thread();
806	uint64_t		deadline;
807	spl_t			s;
808
809	s = splsched();
810	thread_lock(thread);
811	if ((thread->state & TH_WAIT) != 0) {
812		clock_interval_to_deadline(interval, scale_factor, &deadline);
813		if (!timer_call_enter(&thread->wait_timer, deadline, thread->sched_pri >= BASEPRI_RTQUEUES ? TIMER_CALL_CRITICAL : 0))
814			thread->wait_timer_active++;
815		thread->wait_timer_is_set = TRUE;
816	}
817	thread_unlock(thread);
818	splx(s);
819}
820
821void
822thread_set_timer_deadline(
823	uint64_t		deadline)
824{
825	thread_t		thread = current_thread();
826	spl_t			s;
827
828	s = splsched();
829	thread_lock(thread);
830	if ((thread->state & TH_WAIT) != 0) {
831		if (!timer_call_enter(&thread->wait_timer, deadline, thread->sched_pri >= BASEPRI_RTQUEUES ? TIMER_CALL_CRITICAL : 0))
832			thread->wait_timer_active++;
833		thread->wait_timer_is_set = TRUE;
834	}
835	thread_unlock(thread);
836	splx(s);
837}
838
839void
840thread_cancel_timer(void)
841{
842	thread_t		thread = current_thread();
843	spl_t			s;
844
845	s = splsched();
846	thread_lock(thread);
847	if (thread->wait_timer_is_set) {
848		if (timer_call_cancel(&thread->wait_timer))
849			thread->wait_timer_active--;
850		thread->wait_timer_is_set = FALSE;
851	}
852	thread_unlock(thread);
853	splx(s);
854}
855
856#endif	/* __LP64__ */
857
858/*
859 *	thread_unblock:
860 *
861 *	Unblock thread on wake up.
862 *
863 *	Returns TRUE if the thread is still running.
864 *
865 *	Thread must be locked.
866 */
867boolean_t
868thread_unblock(
869	thread_t		thread,
870	wait_result_t	wresult)
871{
872	boolean_t		result = FALSE;
873	thread_t		cthread = current_thread();
874
875	/*
876	 *	Set wait_result.
877	 */
878	thread->wait_result = wresult;
879
880	/*
881	 *	Cancel pending wait timer.
882	 */
883	if (thread->wait_timer_is_set) {
884		if (timer_call_cancel(&thread->wait_timer))
885			thread->wait_timer_active--;
886		thread->wait_timer_is_set = FALSE;
887	}
888
889	/*
890	 *	Update scheduling state: not waiting,
891	 *	set running.
892	 */
893	thread->state &= ~(TH_WAIT|TH_UNINT);
894
895	if (!(thread->state & TH_RUN)) {
896		thread->state |= TH_RUN;
897
898		(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
899
900		/*
901		 *	Update run counts.
902		 */
903		sched_run_incr();
904		if (thread->sched_mode == TH_MODE_TIMESHARE)
905			sched_share_incr();
906	}
907	else {
908		/*
909		 *	Signal if idling on another processor.
910		 */
911#if CONFIG_SCHED_IDLE_IN_PLACE
912		if (thread->state & TH_IDLE) {
913			processor_t		processor = thread->last_processor;
914
915			if (processor != current_processor())
916				machine_signal_idle(processor);
917		}
918#else
919		assert((thread->state & TH_IDLE) == 0);
920#endif
921
922		result = TRUE;
923	}
924
925	/*
926	 * Calculate deadline for real-time threads.
927	 */
928	if (thread->sched_mode == TH_MODE_REALTIME) {
929		thread->realtime.deadline = mach_absolute_time();
930		thread->realtime.deadline += thread->realtime.constraint;
931	}
932
933	/*
934	 * Clear old quantum, fail-safe computation, etc.
935	 */
936	thread->current_quantum = 0;
937	thread->computation_metered = 0;
938	thread->reason = AST_NONE;
939
940	/* Obtain power-relevant interrupt and "platform-idle exit" statistics.
941	 * We also account for "double hop" thread signaling via
942	 * the thread callout infrastructure.
943	 * DRK: consider removing the callout wakeup counters in the future;
944	 * they're present for verification at the moment.
945	 */
946	boolean_t aticontext, pidle;
947	ml_get_power_state(&aticontext, &pidle);
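	/*
	 * aticontext: this wakeup is being issued from interrupt context;
	 * pidle: the processor was exiting a platform idle state.  Credit the
	 * corresponding task ledgers and bin interrupt wakeups by the firing
	 * timer's time-to-deadline.
	 */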
948	if (__improbable(aticontext)) {
949		ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
950		uint64_t ttd = PROCESSOR_DATA(current_processor(), timer_call_ttd);
951		if (ttd) {
952			if (ttd <= timer_deadline_tracking_bin_1)
953				thread->thread_timer_wakeups_bin_1++;
954			else
955				if (ttd <= timer_deadline_tracking_bin_2)
956					thread->thread_timer_wakeups_bin_2++;
957		}
958		if (pidle) {
959			ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
960		}
961	} else if (thread_get_tag_internal(cthread) & THREAD_TAG_CALLOUT) {
962		if (cthread->callout_woken_from_icontext) {
963			ledger_credit(thread->t_ledger, task_ledgers.interrupt_wakeups, 1);
964			thread->thread_callout_interrupt_wakeups++;
965			if (cthread->callout_woken_from_platform_idle) {
966				ledger_credit(thread->t_ledger, task_ledgers.platform_idle_wakeups, 1);
967				thread->thread_callout_platform_idle_wakeups++;
968			}
969		}
970	}
971
972	if (thread_get_tag_internal(thread) & THREAD_TAG_CALLOUT) {
973			thread->callout_woken_from_icontext = aticontext;
974			thread->callout_woken_from_platform_idle = pidle;
975	}
976
977	/* Event should only be triggered if thread is not already running */
978	if (result == FALSE) {
979		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
980			MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
981			(uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result, 0, 0);
982	}
983
984	DTRACE_SCHED2(wakeup, struct thread *, thread, struct proc *, thread->task->bsd_info);
985
986	return (result);
987}
988
989/*
990 *	Routine:	thread_go
991 *	Purpose:
992 *		Unblock and dispatch thread.
993 *	Conditions:
994 *		thread lock held, IPC locks may be held.
995 *		thread must have been pulled from wait queue under same lock hold.
996 *  Returns:
997 *		KERN_SUCCESS - Thread was set running
998 *		KERN_NOT_WAITING - Thread was not waiting
999 */
1000kern_return_t
1001thread_go(
1002	thread_t		thread,
1003	wait_result_t	wresult)
1004{
1005	assert(thread->at_safe_point == FALSE);
1006	assert(thread->wait_event == NO_EVENT64);
1007	assert(thread->wait_queue == WAIT_QUEUE_NULL);
1008
1009	if ((thread->state & (TH_WAIT|TH_TERMINATE)) == TH_WAIT) {
1010		if (!thread_unblock(thread, wresult))
1011			thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
1012
1013		return (KERN_SUCCESS);
1014	}
1015
1016	return (KERN_NOT_WAITING);
1017}
1018
1019/*
1020 *	Routine:	thread_mark_wait_locked
1021 *	Purpose:
1022 *		Mark a thread as waiting.  If, given the circumstances,
1023 *		it doesn't want to wait (i.e. already aborted), then
1024 *		indicate that in the return value.
1025 *	Conditions:
1026 *		at splsched() and thread is locked.
1027 */
1028__private_extern__
1029wait_result_t
1030thread_mark_wait_locked(
1031	thread_t			thread,
1032	wait_interrupt_t 	interruptible)
1033{
1034	boolean_t		at_safe_point;
1035
1036	assert(thread == current_thread());
1037
1038	/*
1039	 *	The thread may have certain types of interrupts/aborts masked
1040	 *	off.  Even if the wait location says these types of interrupts
1041	 *	are OK, we have to honor mask settings (outer-scoped code may
1042	 *	not be able to handle aborts at the moment).
1043	 */
1044	if (interruptible > (thread->options & TH_OPT_INTMASK))
1045		interruptible = thread->options & TH_OPT_INTMASK;
1046
1047	at_safe_point = (interruptible == THREAD_ABORTSAFE);
1048
1049	if (	interruptible == THREAD_UNINT			||
1050			!(thread->sched_flags & TH_SFLAG_ABORT)	||
1051			(!at_safe_point &&
1052				(thread->sched_flags & TH_SFLAG_ABORTSAFELY))) {
1053
1054		if ( !(thread->state & TH_TERMINATE))
1055			DTRACE_SCHED(sleep);
1056
1057		thread->state |= (interruptible) ? TH_WAIT : (TH_WAIT | TH_UNINT);
1058		thread->at_safe_point = at_safe_point;
1059		return (thread->wait_result = THREAD_WAITING);
1060	}
1061	else
1062	if (thread->sched_flags & TH_SFLAG_ABORTSAFELY)
1063		thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
1064
1065	return (thread->wait_result = THREAD_INTERRUPTED);
1066}
1067
1068/*
1069 *	Routine:	thread_interrupt_level
1070 *	Purpose:
1071 *	        Set the maximum interruptible state for the
1072 *		current thread.  The effective value of any
1073 *		interruptible flag passed into assert_wait
1074 *		will never exceed this.
1075 *
1076 *		Useful for code that must not be interrupted,
1077 *		but which calls code that doesn't know that.
1078 *	Returns:
1079 *		The old interrupt level for the thread.
1080 */
1081__private_extern__
1082wait_interrupt_t
1083thread_interrupt_level(
1084	wait_interrupt_t new_level)
1085{
1086	thread_t thread = current_thread();
1087	wait_interrupt_t result = thread->options & TH_OPT_INTMASK;
1088
1089	thread->options = (thread->options & ~TH_OPT_INTMASK) | (new_level & TH_OPT_INTMASK);
1090
1091	return result;
1092}
1093
1094/*
1095 * Check to see if an assert wait is possible, without actually doing one.
1096 * This is used by debug code in locks and elsewhere to verify that it is
1097 * always OK to block when trying to take a blocking lock (since waiting
1098 * for the actual assert_wait to catch the case may make it hard to detect
1099 * this case).
1100 */
1101boolean_t
1102assert_wait_possible(void)
1103{
1104
1105	thread_t thread;
1106
1107#if	DEBUG
1108	if(debug_mode) return TRUE;		/* Always succeed in debug mode */
1109#endif
1110
1111	thread = current_thread();
1112
1113	return (thread == NULL || wait_queue_assert_possible(thread));
1114}
1115
1116/*
1117 *	assert_wait:
1118 *
1119 *	Assert that the current thread is about to go to
1120 *	sleep until the specified event occurs.
1121 */
1122wait_result_t
1123assert_wait(
1124	event_t				event,
1125	wait_interrupt_t	interruptible)
1126{
1127	register wait_queue_t	wq;
1128	register int		index;
1129
1130	assert(event != NO_EVENT);
1131
1132	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1133		MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1134		VM_KERNEL_UNSLIDE(event), 0, 0, 0, 0);
1135
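	/* Hash the event address into one of the global wait-queue buckets. */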
1136	index = wait_hash(event);
1137	wq = &wait_queues[index];
1138	return wait_queue_assert_wait(wq, event, interruptible, 0);
1139}
1140
1141wait_result_t
1142assert_wait_timeout(
1143	event_t				event,
1144	wait_interrupt_t	interruptible,
1145	uint32_t			interval,
1146	uint32_t			scale_factor)
1147{
1148	thread_t			thread = current_thread();
1149	wait_result_t		wresult;
1150	wait_queue_t		wqueue;
1151	uint64_t			deadline;
1152	spl_t				s;
1153
1154	assert(event != NO_EVENT);
1155	wqueue = &wait_queues[wait_hash(event)];
1156
1157	s = splsched();
1158	wait_queue_lock(wqueue);
1159	thread_lock(thread);
1160
1161	clock_interval_to_deadline(interval, scale_factor, &deadline);
1162
1163	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1164		MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1165		VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0);
1166
1167	wresult = wait_queue_assert_wait64_locked(wqueue, CAST_DOWN(event64_t, event),
1168													interruptible, deadline, thread);
1169
1170	thread_unlock(thread);
1171	wait_queue_unlock(wqueue);
1172	splx(s);
1173
1174	return (wresult);
1175}
1176
1177wait_result_t
1178assert_wait_deadline(
1179	event_t				event,
1180	wait_interrupt_t	interruptible,
1181	uint64_t			deadline)
1182{
1183	thread_t			thread = current_thread();
1184	wait_result_t		wresult;
1185	wait_queue_t		wqueue;
1186	spl_t				s;
1187
1188	assert(event != NO_EVENT);
1189	wqueue = &wait_queues[wait_hash(event)];
1190
1191	s = splsched();
1192	wait_queue_lock(wqueue);
1193	thread_lock(thread);
1194
1195	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1196		MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAIT)|DBG_FUNC_NONE,
1197		VM_KERNEL_UNSLIDE(event), interruptible, deadline, 0, 0);
1198
1199	wresult = wait_queue_assert_wait64_locked(wqueue, CAST_DOWN(event64_t,event),
1200													interruptible, deadline, thread);
1201
1202	thread_unlock(thread);
1203	wait_queue_unlock(wqueue);
1204	splx(s);
1205
1206	return (wresult);
1207}
1208
1209/*
1210 *	thread_sleep_fast_usimple_lock:
1211 *
1212 *	Cause the current thread to wait until the specified event
1213 *	occurs.  The specified simple_lock is unlocked before releasing
1214 *	the cpu and re-acquired as part of waking up.
1215 *
1216 *	This is the simple lock sleep interface for components that use a
1217 *	faster version of simple_lock() than is provided by usimple_lock().
1218 */
1219__private_extern__ wait_result_t
1220thread_sleep_fast_usimple_lock(
1221	event_t			event,
1222	simple_lock_t		lock,
1223	wait_interrupt_t	interruptible)
1224{
1225	wait_result_t res;
1226
1227	res = assert_wait(event, interruptible);
1228	if (res == THREAD_WAITING) {
1229		simple_unlock(lock);
1230		res = thread_block(THREAD_CONTINUE_NULL);
1231		simple_lock(lock);
1232	}
1233	return res;
1234}
1235
1236
1237/*
1238 *	thread_sleep_usimple_lock:
1239 *
1240 *	Cause the current thread to wait until the specified event
1241 *	occurs.  The specified usimple_lock is unlocked before releasing
1242 *	the cpu and re-acquired as part of waking up.
1243 *
1244 *	This is the simple lock sleep interface for components where
1245 *	simple_lock() is defined in terms of usimple_lock().
1246 */
1247wait_result_t
1248thread_sleep_usimple_lock(
1249	event_t			event,
1250	usimple_lock_t		lock,
1251	wait_interrupt_t	interruptible)
1252{
1253	wait_result_t res;
1254
1255	res = assert_wait(event, interruptible);
1256	if (res == THREAD_WAITING) {
1257		usimple_unlock(lock);
1258		res = thread_block(THREAD_CONTINUE_NULL);
1259		usimple_lock(lock);
1260	}
1261	return res;
1262}
1263
1264/*
1265 *	thread_sleep_lock_write:
1266 *
1267 *	Cause the current thread to wait until the specified event
1268 *	occurs.  The specified (write) lock is unlocked before releasing
1269 *	the cpu. The (write) lock will be re-acquired before returning.
1270 */
1271wait_result_t
1272thread_sleep_lock_write(
1273	event_t			event,
1274	lock_t			*lock,
1275	wait_interrupt_t interruptible)
1276{
1277	wait_result_t	res;
1278
1279	res = assert_wait(event, interruptible);
1280	if (res == THREAD_WAITING) {
1281		lock_write_done(lock);
1282		res = thread_block(THREAD_CONTINUE_NULL);
1283		lock_write(lock);
1284	}
1285	return res;
1286}
1287
1288/*
1289 * thread_stop:
1290 *
1291 * Force a preemption point for a thread and wait
1292 * for it to stop running.  Arbitrates access among
1293 * multiple stop requests. (released by unstop)
1294 *
1295 * The thread must enter a wait state and stop via a
1296 * separate means.
1297 *
1298 * Returns FALSE if interrupted.
1299 */
1300boolean_t
1301thread_stop(
1302	thread_t		thread)
1303{
1304	wait_result_t	wresult;
1305	spl_t			s = splsched();
1306
1307	wake_lock(thread);
1308	thread_lock(thread);
1309
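	/*
	 * wake_active asks whoever next changes this thread's run state to
	 * issue a wakeup on &thread->wake_active; sleep there until any
	 * existing stop request is released, then again until the thread
	 * actually stops running.
	 */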
1310	while (thread->state & TH_SUSP) {
1311		thread->wake_active = TRUE;
1312		thread_unlock(thread);
1313
1314		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1315		wake_unlock(thread);
1316		splx(s);
1317
1318		if (wresult == THREAD_WAITING)
1319			wresult = thread_block(THREAD_CONTINUE_NULL);
1320
1321		if (wresult != THREAD_AWAKENED)
1322			return (FALSE);
1323
1324		s = splsched();
1325		wake_lock(thread);
1326		thread_lock(thread);
1327	}
1328
1329	thread->state |= TH_SUSP;
1330
1331	while (thread->state & TH_RUN) {
1332		processor_t		processor = thread->last_processor;
1333
1334		if (processor != PROCESSOR_NULL && processor->active_thread == thread)
1335			cause_ast_check(processor);
1336
1337		thread->wake_active = TRUE;
1338		thread_unlock(thread);
1339
1340		wresult = assert_wait(&thread->wake_active, THREAD_ABORTSAFE);
1341		wake_unlock(thread);
1342		splx(s);
1343
1344		if (wresult == THREAD_WAITING)
1345			wresult = thread_block(THREAD_CONTINUE_NULL);
1346
1347		if (wresult != THREAD_AWAKENED) {
1348			thread_unstop(thread);
1349			return (FALSE);
1350		}
1351
1352		s = splsched();
1353		wake_lock(thread);
1354		thread_lock(thread);
1355	}
1356
1357	thread_unlock(thread);
1358	wake_unlock(thread);
1359	splx(s);
1360
1361	return (TRUE);
1362}
1363
1364/*
1365 * thread_unstop:
1366 *
1367 * Release a previous stop request and set
1368 * the thread running if appropriate.
1369 *
1370 * Use only after a successful stop operation.
1371 */
1372void
1373thread_unstop(
1374	thread_t	thread)
1375{
1376	spl_t		s = splsched();
1377
1378	wake_lock(thread);
1379	thread_lock(thread);
1380
1381	if ((thread->state & (TH_RUN|TH_WAIT|TH_SUSP)) == TH_SUSP) {
1382		thread->state &= ~TH_SUSP;
1383		thread_unblock(thread, THREAD_AWAKENED);
1384
1385		thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
1386	}
1387	else
1388	if (thread->state & TH_SUSP) {
1389		thread->state &= ~TH_SUSP;
1390
1391		if (thread->wake_active) {
1392			thread->wake_active = FALSE;
1393			thread_unlock(thread);
1394
1395			thread_wakeup(&thread->wake_active);
1396			wake_unlock(thread);
1397			splx(s);
1398
1399			return;
1400		}
1401	}
1402
1403	thread_unlock(thread);
1404	wake_unlock(thread);
1405	splx(s);
1406}
1407
1408/*
1409 * Thread locked, returns the same way
1410 */
1411static inline boolean_t
1412thread_isoncpu(thread_t thread)
1413{
1414	processor_t processor = thread->last_processor;
1415
1416	return ((processor != PROCESSOR_NULL) && (processor->active_thread == thread));
1417}
1418/*
1419 * thread_wait:
1420 *
1421 * Wait for a thread to stop running. (non-interruptible)
1422 *
1423 */
1424void
1425thread_wait(
1426	thread_t	thread,
1427	boolean_t	until_not_runnable)
1428{
1429	wait_result_t	wresult;
1430	boolean_t 	oncpu;
1431	processor_t	processor;
1432	spl_t		s = splsched();
1433
1434	wake_lock(thread);
1435	thread_lock(thread);
1436
1437	/*
1438	 * Wait until not running on a CPU.  If stronger requirement
1439	 * desired, wait until not runnable.  Assumption: if thread is
1440	 * on CPU, then TH_RUN is set, so we're not waiting in any case
1441	 * where the original, pure "TH_RUN" check would have let us
1442	 * finish.
1443	 */
1444	while ((oncpu = thread_isoncpu(thread)) ||
1445			(until_not_runnable && (thread->state & TH_RUN))) {
1446
1447		if (oncpu) {
1448			assert(thread->state & TH_RUN);
1449			processor = thread->last_processor;
1450			cause_ast_check(processor);
1451		}
1452
1453		thread->wake_active = TRUE;
1454		thread_unlock(thread);
1455
1456		wresult = assert_wait(&thread->wake_active, THREAD_UNINT);
1457		wake_unlock(thread);
1458		splx(s);
1459
1460		if (wresult == THREAD_WAITING)
1461			thread_block(THREAD_CONTINUE_NULL);
1462
1463		s = splsched();
1464		wake_lock(thread);
1465		thread_lock(thread);
1466	}
1467
1468	thread_unlock(thread);
1469	wake_unlock(thread);
1470	splx(s);
1471}
1472
1473/*
1474 *	Routine: clear_wait_internal
1475 *
1476 *		Clear the wait condition for the specified thread.
1477 *		Start the thread executing if that is appropriate.
1478 *	Arguments:
1479 *		thread		thread to awaken
1480 *		result		Wakeup result the thread should see
1481 *	Conditions:
1482 *		At splsched
1483 *		the thread is locked.
1484 *	Returns:
1485 *		KERN_SUCCESS		thread was rousted out of a wait
1486 *		KERN_FAILURE		thread was waiting but could not be rousted
1487 *		KERN_NOT_WAITING	thread was not waiting
1488 */
1489__private_extern__ kern_return_t
1490clear_wait_internal(
1491	thread_t		thread,
1492	wait_result_t	wresult)
1493{
1494	wait_queue_t	wq = thread->wait_queue;
1495	uint32_t	i = LockTimeOut;
1496
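	/*
	 * Pulling the thread off its wait queue needs the wait queue lock,
	 * which is ordered before the thread lock.  Since the thread lock is
	 * already held, try-lock the queue and, on failure, briefly drop the
	 * thread lock and retry, giving up after roughly LockTimeOut
	 * iterations.
	 */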
1497	do {
1498		if (wresult == THREAD_INTERRUPTED && (thread->state & TH_UNINT))
1499			return (KERN_FAILURE);
1500
1501		if (wq != WAIT_QUEUE_NULL) {
1502			if (wait_queue_lock_try(wq)) {
1503				wait_queue_pull_thread_locked(wq, thread, TRUE);
1504				/* wait queue unlocked, thread still locked */
1505			}
1506			else {
1507				thread_unlock(thread);
1508				delay(1);
1509
1510				thread_lock(thread);
1511				if (wq != thread->wait_queue)
1512					return (KERN_NOT_WAITING);
1513
1514				continue;
1515			}
1516		}
1517
1518		return (thread_go(thread, wresult));
1519	} while ((--i > 0) || machine_timeout_suspended());
1520
1521	panic("clear_wait_internal: deadlock: thread=%p, wq=%p, cpu=%d\n",
1522		  thread, wq, cpu_number());
1523
1524	return (KERN_FAILURE);
1525}
1526
1527
1528/*
1529 *	clear_wait:
1530 *
1531 *	Clear the wait condition for the specified thread.  Start the thread
1532 *	executing if that is appropriate.
1533 *
1534 *	parameters:
1535 *	  thread		thread to awaken
1536 *	  result		Wakeup result the thread should see
1537 */
1538kern_return_t
1539clear_wait(
1540	thread_t		thread,
1541	wait_result_t	result)
1542{
1543	kern_return_t ret;
1544	spl_t		s;
1545
1546	s = splsched();
1547	thread_lock(thread);
1548	ret = clear_wait_internal(thread, result);
1549	thread_unlock(thread);
1550	splx(s);
1551	return ret;
1552}
1553
1554
1555/*
1556 *	thread_wakeup_prim:
1557 *
1558 *	Common routine for thread_wakeup, thread_wakeup_with_result,
1559 *	and thread_wakeup_one.
1560 *
1561 */
1562kern_return_t
1563thread_wakeup_prim(
1564	event_t			event,
1565	boolean_t		one_thread,
1566	wait_result_t		result)
1567{
1568	return (thread_wakeup_prim_internal(event, one_thread, result, -1));
1569}
1570
1571
1572kern_return_t
1573thread_wakeup_prim_internal(
1574	event_t			event,
1575	boolean_t		one_thread,
1576	wait_result_t		result,
1577	int			priority)
1578{
1579	register wait_queue_t	wq;
1580	register int			index;
1581
1582	index = wait_hash(event);
1583	wq = &wait_queues[index];
1584	if (one_thread)
1585		return (wait_queue_wakeup_one(wq, event, result, priority));
1586	else
1587	    return (wait_queue_wakeup_all(wq, event, result));
1588}
1589
1590/*
1591 *	thread_bind:
1592 *
1593 *	Force the current thread to execute on the specified processor.
1594 *
1595 *	Returns the previous binding.  PROCESSOR_NULL means
1596 *	not bound.
1597 *
1598 *	XXX - DO NOT export this to users - XXX
1599 */
1600processor_t
1601thread_bind(
1602	processor_t		processor)
1603{
1604	thread_t		self = current_thread();
1605	processor_t		prev;
1606	spl_t			s;
1607
1608	s = splsched();
1609	thread_lock(self);
1610
1611	prev = self->bound_processor;
1612	self->bound_processor = processor;
1613
1614	thread_unlock(self);
1615	splx(s);
1616
1617	return (prev);
1618}
1619
1620/*
1621 *	thread_select:
1622 *
1623 *	Select a new thread for the current processor to execute.
1624 *
1625 *	May select the current thread, which must be locked.
1626 */
1627static thread_t
1628thread_select(
1629	thread_t			thread,
1630	processor_t			processor)
1631{
1632	processor_set_t		pset = processor->processor_set;
1633	thread_t			new_thread = THREAD_NULL;
1634	boolean_t			inactive_state;
1635
1636	assert(processor == current_processor());
1637
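	/*
	 * Prefer, in order: the current thread when it remains eligible to
	 * run here, pending real-time threads, the processor (or pset) run
	 * queue, the global fairshare queue, work stolen from other
	 * processors, and finally the idle thread.
	 */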
1638	do {
1639		/*
1640		 *	Update the priority.
1641		 */
1642		if (SCHED(can_update_priority)(thread))
1643			SCHED(update_priority)(thread);
1644
1645		processor->current_pri = thread->sched_pri;
1646		processor->current_thmode = thread->sched_mode;
1647
1648		pset_lock(pset);
1649
1650		assert(pset->low_count);
1651		assert(pset->low_pri);
1652
1653		inactive_state = processor->state != PROCESSOR_SHUTDOWN && machine_processor_is_inactive(processor);
1654
1655		simple_lock(&rt_lock);
1656
1657		/*
1658		 *	Test to see if the current thread should continue
1659		 *	to run on this processor.  Must be runnable, and not
1660		 *	bound to a different processor, nor be in the wrong
1661		 *	processor set.
1662		 */
1663		if (	((thread->state & ~TH_SUSP) == TH_RUN)					&&
1664				(thread->sched_pri >= BASEPRI_RTQUEUES		||
1665				 processor->processor_meta == PROCESSOR_META_NULL ||
1666				 processor->processor_meta->primary == processor)		&&
1667				(thread->bound_processor == PROCESSOR_NULL	||
1668				 thread->bound_processor == processor)			&&
1669				(thread->affinity_set == AFFINITY_SET_NULL	||
1670				 thread->affinity_set->aset_pset == pset)			) {
1671			if (	thread->sched_pri >= BASEPRI_RTQUEUES	&&
1672						first_timeslice(processor)				) {
1673				if (rt_runq.count > 0) {
1674					register queue_t		q;
1675
1676					q = &rt_runq.queue;
1677					if (((thread_t)q->next)->realtime.deadline <
1678													processor->deadline) {
1679						thread = (thread_t)dequeue_head(q);
1680						thread->runq = PROCESSOR_NULL;
1681						SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
1682						rt_runq.count--;
1683					}
1684				}
1685
1686				simple_unlock(&rt_lock);
1687
1688				processor->deadline = thread->realtime.deadline;
1689
1690				pset_unlock(pset);
1691
1692				return (thread);
1693			}
1694
1695			if (!inactive_state && (thread->sched_mode != TH_MODE_FAIRSHARE || SCHED(fairshare_runq_count)() == 0) && (rt_runq.count == 0 || BASEPRI_RTQUEUES < thread->sched_pri) &&
1696					(new_thread = SCHED(choose_thread)(processor, thread->sched_mode == TH_MODE_FAIRSHARE ? MINPRI : thread->sched_pri)) == THREAD_NULL) {
1697
1698				simple_unlock(&rt_lock);
1699
1700				/* I am the highest priority runnable (non-idle) thread */
1701
1702				pset_pri_hint(pset, processor, processor->current_pri);
1703
1704				pset_count_hint(pset, processor, SCHED(processor_runq_count)(processor));
1705
1706				processor->deadline = UINT64_MAX;
1707
1708				pset_unlock(pset);
1709
1710				return (thread);
1711			}
1712		}
1713
1714		if (new_thread != THREAD_NULL ||
1715				(SCHED(processor_queue_has_priority)(processor, rt_runq.count == 0 ? IDLEPRI : BASEPRI_RTQUEUES, TRUE) &&
1716					 (new_thread = SCHED(choose_thread)(processor, MINPRI)) != THREAD_NULL)) {
1717				simple_unlock(&rt_lock);
1718
1719				if (!inactive_state) {
1720					pset_pri_hint(pset, processor, new_thread->sched_pri);
1721
1722					pset_count_hint(pset, processor, SCHED(processor_runq_count)(processor));
1723				}
1724
1725				processor->deadline = UINT64_MAX;
1726				pset_unlock(pset);
1727
1728				return (new_thread);
1729		}
1730
1731		if (rt_runq.count > 0) {
1732			thread = (thread_t)dequeue_head(&rt_runq.queue);
1733
1734			thread->runq = PROCESSOR_NULL;
1735			SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
1736			rt_runq.count--;
1737
1738			simple_unlock(&rt_lock);
1739
1740			processor->deadline = thread->realtime.deadline;
1741			pset_unlock(pset);
1742
1743			return (thread);
1744		}
1745
1746		simple_unlock(&rt_lock);
1747
1748		/* No realtime threads and no normal threads on the per-processor
1749		 * runqueue. Finally check for global fairshare threads.
1750		 */
1751		if ((new_thread = SCHED(fairshare_dequeue)()) != THREAD_NULL) {
1752
1753			processor->deadline = UINT64_MAX;
1754			pset_unlock(pset);
1755
1756			return (new_thread);
1757		}
1758
1759		processor->deadline = UINT64_MAX;
1760
1761		/*
1762		 *	Set processor inactive based on
1763		 *	indication from the platform code.
1764		 */
1765		if (inactive_state) {
1766			if (processor->state == PROCESSOR_RUNNING)
1767				remqueue((queue_entry_t)processor);
1768			else
1769			if (processor->state == PROCESSOR_IDLE)
1770				remqueue((queue_entry_t)processor);
1771
1772			processor->state = PROCESSOR_INACTIVE;
1773
1774			pset_unlock(pset);
1775
1776			return (processor->idle_thread);
1777		}
1778
1779		/*
1780		 *	No runnable threads; attempt to steal
1781		 *	from other processors.
1782		 */
1783		new_thread = SCHED(steal_thread)(pset);
1784		if (new_thread != THREAD_NULL) {
1785			return (new_thread);
1786		}
1787
1788		/*
1789		 *	If other threads have appeared, shortcut
1790		 *	around again.
1791		 */
1792		if (!SCHED(processor_queue_empty)(processor) || rt_runq.count > 0 || SCHED(fairshare_runq_count)() > 0)
1793			continue;
1794
1795		pset_lock(pset);
1796
1797		/*
1798		 *	Nothing is runnable, so set this processor idle if it
1799		 *	was running.
1800		 */
1801		if (processor->state == PROCESSOR_RUNNING) {
1802			remqueue((queue_entry_t)processor);
1803			processor->state = PROCESSOR_IDLE;
1804
1805			if (processor->processor_meta == PROCESSOR_META_NULL || processor->processor_meta->primary == processor) {
1806				enqueue_head(&pset->idle_queue, (queue_entry_t)processor);
1807				pset_pri_init_hint(pset, processor);
1808				pset_count_init_hint(pset, processor);
1809			}
1810			else {
1811				enqueue_head(&processor->processor_meta->idle_queue, (queue_entry_t)processor);
1812				pset_unlock(pset);
1813				return (processor->idle_thread);
1814			}
1815		}
1816
1817		pset_unlock(pset);
1818
1819#if CONFIG_SCHED_IDLE_IN_PLACE
1820		/*
1821		 *	Choose idle thread if fast idle is not possible.
1822		 */
1823		if ((thread->state & (TH_IDLE|TH_TERMINATE|TH_SUSP)) || !(thread->state & TH_WAIT) || thread->wake_active || thread->sched_pri >= BASEPRI_RTQUEUES)
1824			return (processor->idle_thread);
1825
1826		/*
1827		 *	Perform idling activities directly without a
1828		 *	context switch.  Return dispatched thread,
1829		 *	else check again for a runnable thread.
1830		 */
1831		new_thread = thread_select_idle(thread, processor);
1832
1833#else /* !CONFIG_SCHED_IDLE_IN_PLACE */
1834
1835		/*
1836		 * Do a full context switch to idle so that the current
1837		 * thread can start running on another processor without
1838		 * waiting for the fast-idled processor to wake up.
1839		 */
1840		return (processor->idle_thread);
1841
1842#endif /* !CONFIG_SCHED_IDLE_IN_PLACE */
1843
1844	} while (new_thread == THREAD_NULL);
1845
1846	return (new_thread);
1847}
1848
1849#if CONFIG_SCHED_IDLE_IN_PLACE
1850/*
1851 *	thread_select_idle:
1852 *
1853 *	Idle the processor using the current thread context.
1854 *
1855 *	Called with thread locked, then dropped and relocked.
1856 */
1857static thread_t
1858thread_select_idle(
1859	thread_t		thread,
1860	processor_t		processor)
1861{
1862	thread_t		new_thread;
1863
1864	if (thread->sched_mode == TH_MODE_TIMESHARE)
1865		sched_share_decr();
1866	sched_run_decr();
1867
1868	thread->state |= TH_IDLE;
1869	processor->current_pri = IDLEPRI;
1870	processor->current_thmode = TH_MODE_NONE;
1871
1872	/* Reload precise timing global policy to thread-local policy */
1873	thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
1874
1875	thread_unlock(thread);
1876
1877	/*
1878	 *	Switch execution timing to processor idle thread.
1879	 */
1880	processor->last_dispatch = mach_absolute_time();
1881	thread->last_run_time = processor->last_dispatch;
1882	thread_timer_event(processor->last_dispatch, &processor->idle_thread->system_timer);
1883	PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer;
1884
1885	/*
1886	 *	Cancel the quantum timer while idling.
1887	 */
1888	timer_call_cancel(&processor->quantum_timer);
1889	processor->timeslice = 0;
1890
1891	(*thread->sched_call)(SCHED_CALL_BLOCK, thread);
1892
1893	thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0);
1894
1895	/*
1896	 *	Enable interrupts and perform idling activities.  No
1897	 *	preemption due to TH_IDLE being set.
1898	 */
1899	spllo(); new_thread = processor_idle(thread, processor);
1900
1901	/*
1902	 *	Return at splsched.
1903	 */
1904	(*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
1905
1906	thread_lock(thread);
1907
1908	/*
1909	 * If we idled in place, simulate a context switch back
1910	 * to the original priority of the thread so that the
1911	 * platform layer cannot distinguish this from a true
1912	 * switch to the idle thread.
1913	 */
1914	if (thread->sched_mode == TH_MODE_REALTIME)
1915		thread_tell_urgency(THREAD_URGENCY_REAL_TIME, thread->realtime.period, thread->realtime.deadline);
1916	/* Identify non-promoted threads which have requested a
1917	 * "background" priority.
1918	 */
1919	else if ((thread->sched_pri <= MAXPRI_THROTTLE) &&
1920			 (thread->priority <= MAXPRI_THROTTLE))
1921		thread_tell_urgency(THREAD_URGENCY_BACKGROUND, thread->sched_pri, thread->priority);
1922	else
1923		thread_tell_urgency(THREAD_URGENCY_NORMAL, thread->sched_pri, thread->priority);
1924
1925	/*
1926	 *	If awakened, switch to thread timer and start a new quantum.
1927	 *	Otherwise skip; we will context switch to another thread or return here.
1928	 */
1929	if (!(thread->state & TH_WAIT)) {
1930		processor->last_dispatch = mach_absolute_time();
1931		thread_timer_event(processor->last_dispatch, &thread->system_timer);
1932		PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
1933
1934		thread_quantum_init(thread);
1935		thread->last_quantum_refill_time = processor->last_dispatch;
1936
1937		processor->quantum_end = processor->last_dispatch + thread->current_quantum;
1938		timer_call_enter1(&processor->quantum_timer, thread, processor->quantum_end, TIMER_CALL_CRITICAL);
1939		processor->timeslice = 1;
1940
1941		thread->computation_epoch = processor->last_dispatch;
1942	}
1943
1944	thread->state &= ~TH_IDLE;
1945
1946	sched_run_incr();
1947	if (thread->sched_mode == TH_MODE_TIMESHARE)
1948		sched_share_incr();
1949
1950	return (new_thread);
1951}
1952#endif /* CONFIG_SCHED_IDLE_IN_PLACE */
1953
1954#if defined(CONFIG_SCHED_TRADITIONAL)
1955static thread_t
1956sched_traditional_choose_thread(
1957								processor_t		processor,
1958								int				priority)
1959{
1960	thread_t thread;
1961
1962	thread = choose_thread(processor, runq_for_processor(processor), priority);
1963	if (thread != THREAD_NULL) {
1964		runq_consider_decr_bound_count(processor, thread);
1965	}
1966
1967	return thread;
1968}
1969
1970#endif /* defined(CONFIG_SCHED_TRADITIONAL)  */
1971
1972#if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_FIXEDPRIORITY)
1973
1974/*
1975 *	choose_thread:
1976 *
1977 *	Locate a thread to execute from the processor run queue
1978 *	and return it.  Only choose a thread with greater or equal
1979 *	priority.
1980 *
1981 *	Associated pset must be locked.  Returns THREAD_NULL
1982 *	on failure.
1983 */
1984thread_t
1985choose_thread(
1986	processor_t		processor,
1987	run_queue_t		rq,
1988	int				priority)
1989{
1990	queue_t			queue = rq->queues + rq->highq;
1991	int				pri = rq->highq, count = rq->count;
1992	thread_t		thread;
1993
1994	while (count > 0 && pri >= priority) {
1995		thread = (thread_t)queue_first(queue);
1996		while (!queue_end(queue, (queue_entry_t)thread)) {
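			/*
			 *	Only take a thread that is unbound or bound to
			 *	this processor; others remain queued.
			 */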
1997			if (thread->bound_processor == PROCESSOR_NULL ||
1998							thread->bound_processor == processor) {
1999				remqueue((queue_entry_t)thread);
2000
2001				thread->runq = PROCESSOR_NULL;
2002				SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2003				rq->count--;
2004				if (SCHED(priority_is_urgent)(pri)) {
2005					rq->urgency--; assert(rq->urgency >= 0);
2006				}
2007				if (queue_empty(queue)) {
2008					if (pri != IDLEPRI)
2009						clrbit(MAXPRI - pri, rq->bitmap);
2010					rq->highq = MAXPRI - ffsbit(rq->bitmap);
2011				}
2012
2013				return (thread);
2014			}
2015			count--;
2016
2017			thread = (thread_t)queue_next((queue_entry_t)thread);
2018		}
2019
2020		queue--; pri--;
2021	}
2022
2023	return (THREAD_NULL);
2024}
2025
2026#endif /* defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_FIXEDPRIORITY) */
2027
2028/*
2029 *	Perform a context switch and start executing the new thread.
2030 *
2031 *	Returns FALSE on failure, and the thread is re-dispatched.
2032 *
2033 *	Called at splsched.
2034 */
2035
2036#define funnel_release_check(thread, debug)				\
2037MACRO_BEGIN												\
2038	if ((thread)->funnel_state & TH_FN_OWNED) {			\
2039		(thread)->funnel_state = TH_FN_REFUNNEL;		\
2040		KERNEL_DEBUG(0x603242c | DBG_FUNC_NONE,			\
2041			(thread)->funnel_lock, (debug), 0, 0, 0);	\
2042		funnel_unlock((thread)->funnel_lock);			\
2043	}													\
2044MACRO_END
2045
2046#define funnel_refunnel_check(thread, debug)				\
2047MACRO_BEGIN													\
2048	if ((thread)->funnel_state & TH_FN_REFUNNEL) {			\
2049		kern_return_t	result = (thread)->wait_result;		\
2050															\
2051		(thread)->funnel_state = 0;							\
2052		KERNEL_DEBUG(0x6032428 | DBG_FUNC_NONE,				\
2053			(thread)->funnel_lock, (debug), 0, 0, 0);		\
2054		funnel_lock((thread)->funnel_lock);					\
2055		KERNEL_DEBUG(0x6032430 | DBG_FUNC_NONE,				\
2056			(thread)->funnel_lock, (debug), 0, 0, 0);		\
2057		(thread)->funnel_state = TH_FN_OWNED;				\
2058		(thread)->wait_result = result;						\
2059	}														\
2060MACRO_END
2061
2062static boolean_t
2063thread_invoke(
2064	register thread_t	self,
2065	register thread_t	thread,
2066	ast_t				reason)
2067{
2068	thread_continue_t	continuation = self->continuation;
2069	void				*parameter = self->parameter;
2070	processor_t			processor;
2071
2072	if (get_preemption_level() != 0) {
2073		int pl = get_preemption_level();
2074		panic("thread_invoke: preemption_level %d, possible cause: %s",
2075		    pl, (pl < 0 ? "unlocking an unlocked mutex or spinlock" :
2076			"blocking while holding a spinlock, or within interrupt context"));
2077	}
2078
2079	assert(self == current_thread());
2080
2081	/*
2082	 * Mark thread interruptible.
2083	 */
2084	thread_lock(thread);
2085	thread->state &= ~TH_UNINT;
2086
2087#if DEBUG
2088	assert(thread_runnable(thread));
2089#endif
2090
2091	/* Reload precise timing global policy to thread-local policy */
2092	thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);
2093
2094	/*
2095	 * Allow time constraint threads to hang onto
2096	 * a stack.
2097	 */
2098	if ((self->sched_mode == TH_MODE_REALTIME) && !self->reserved_stack)
2099		self->reserved_stack = self->kernel_stack;
2100
2101	if (continuation != NULL) {
2102		if (!thread->kernel_stack) {
2103			/*
2104			 * If we are using a privileged stack,
2105			 * check to see whether we can exchange it with
2106			 * that of the other thread.
2107			 */
2108			if (self->kernel_stack == self->reserved_stack && !thread->reserved_stack)
2109				goto need_stack;
2110
2111			/*
2112			 * Context switch by performing a stack handoff.
2113			 */
2114			continuation = thread->continuation;
2115			parameter = thread->parameter;
2116
2117			processor = current_processor();
2118			processor->active_thread = thread;
2119			processor->current_pri = thread->sched_pri;
2120			processor->current_thmode = thread->sched_mode;
2121			if (thread->last_processor != processor && thread->last_processor != NULL) {
2122				if (thread->last_processor->processor_set != processor->processor_set)
2123					thread->ps_switch++;
2124				thread->p_switch++;
2125			}
2126			thread->last_processor = processor;
2127			thread->c_switch++;
2128			ast_context(thread);
2129			thread_unlock(thread);
2130
2131			self->reason = reason;
2132
2133			processor->last_dispatch = mach_absolute_time();
2134			self->last_run_time = processor->last_dispatch;
2135			thread_timer_event(processor->last_dispatch, &thread->system_timer);
2136			PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2137
2138			/*
2139			 * Since non-precise user/kernel time doesn't update the state timer
2140			 * during privilege transitions, synthesize an event now.
2141			 */
2142			if (!thread->precise_user_kernel_time) {
2143				timer_switch(PROCESSOR_DATA(processor, current_state),
2144							 processor->last_dispatch,
2145							 PROCESSOR_DATA(processor, current_state));
2146			}
2147
2148			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2149				MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE,
2150				self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2151
2152			if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
2153				KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2154						(uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2155			}
2156
2157			DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2158
2159			SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2160
2161			TLOG(1, "thread_invoke: calling stack_handoff\n");
2162			stack_handoff(self, thread);
2163
2164			DTRACE_SCHED(on__cpu);
2165
2166			thread_dispatch(self, thread);
2167
2168			thread->continuation = thread->parameter = NULL;
2169
2170			counter(c_thread_invoke_hits++);
2171
2172			funnel_refunnel_check(thread, 2);
2173			(void) spllo();
2174
2175			assert(continuation);
2176			call_continuation(continuation, parameter, thread->wait_result);
2177			/*NOTREACHED*/
2178		}
2179		else if (thread == self) {
2180			/* same thread but with continuation */
2181			ast_context(self);
2182			counter(++c_thread_invoke_same);
2183			thread_unlock(self);
2184
2185			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2186				MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2187				self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2188
2189			self->continuation = self->parameter = NULL;
2190
2191			funnel_refunnel_check(self, 3);
2192			(void) spllo();
2193
2194			call_continuation(continuation, parameter, self->wait_result);
2195			/*NOTREACHED*/
2196		}
2197	}
2198	else {
2199		/*
2200		 * Check that the other thread has a stack
2201		 */
2202		if (!thread->kernel_stack) {
2203need_stack:
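			/*
			 *	If a kernel stack cannot be obtained without
			 *	blocking, queue the thread for deferred stack
			 *	allocation and fail the switch; the caller will
			 *	select another thread.
			 */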
2204			if (!stack_alloc_try(thread)) {
2205				counter(c_thread_invoke_misses++);
2206				thread_unlock(thread);
2207				thread_stack_enqueue(thread);
2208				return (FALSE);
2209			}
2210		}
2211		else if (thread == self) {
2212			ast_context(self);
2213			counter(++c_thread_invoke_same);
2214			thread_unlock(self);
2215
2216			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2217				MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2218				self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2219
2220			return (TRUE);
2221		}
2222	}
2223
2224	/*
2225	 * Context switch by full context save.
2226	 */
2227	processor = current_processor();
2228	processor->active_thread = thread;
2229	processor->current_pri = thread->sched_pri;
2230	processor->current_thmode = thread->sched_mode;
2231	if (thread->last_processor != processor && thread->last_processor != NULL) {
2232		if (thread->last_processor->processor_set != processor->processor_set)
2233			thread->ps_switch++;
2234		thread->p_switch++;
2235	}
2236	thread->last_processor = processor;
2237	thread->c_switch++;
2238	ast_context(thread);
2239	thread_unlock(thread);
2240
2241	counter(c_thread_invoke_csw++);
2242
2243	assert(self->runq == PROCESSOR_NULL);
2244	self->reason = reason;
2245
2246	processor->last_dispatch = mach_absolute_time();
2247	self->last_run_time = processor->last_dispatch;
2248	thread_timer_event(processor->last_dispatch, &thread->system_timer);
2249	PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
2250
2251	/*
2252	 * Since non-precise user/kernel time doesn't update the state timer
2253	 * during privilege transitions, synthesize an event now.
2254	 */
2255	if (!thread->precise_user_kernel_time) {
2256		timer_switch(PROCESSOR_DATA(processor, current_state),
2257					 processor->last_dispatch,
2258					 PROCESSOR_DATA(processor, current_state));
2259	}
2260
2261
2262	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2263		MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED) | DBG_FUNC_NONE,
2264		self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
2265
2266	if ((thread->chosen_processor != processor) && (thread->chosen_processor != NULL)) {
2267		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_MOVED)|DBG_FUNC_NONE,
2268				(uintptr_t)thread_tid(thread), (uintptr_t)thread->chosen_processor->cpu_id, 0, 0, 0);
2269	}
2270
2271	DTRACE_SCHED2(off__cpu, struct thread *, thread, struct proc *, thread->task->bsd_info);
2272
2273	SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
2274
2275	/*
2276	 * This is where we actually switch register context,
2277	 * and address space if required.  We will next run
2278	 * as a result of a subsequent context switch.
2279	 */
2280	assert(continuation == self->continuation);
2281	thread = machine_switch_context(self, continuation, thread);
2282	assert(self == current_thread());
2283	TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
2284
2285	DTRACE_SCHED(on__cpu);
2286
2287	/*
2288	 * We have been resumed and are set to run.
2289	 */
2290	thread_dispatch(thread, self);
2291
2292	if (continuation) {
2293		self->continuation = self->parameter = NULL;
2294
2295		funnel_refunnel_check(self, 3);
2296		(void) spllo();
2297
2298		call_continuation(continuation, parameter, self->wait_result);
2299		/*NOTREACHED*/
2300	}
2301
2302	return (TRUE);
2303}
2304
2305/*
2306 *	thread_dispatch:
2307 *
2308 *	Handle threads at context switch.  Re-dispatch other thread
2309 *	if still running, otherwise update run state and perform
2310 *	special actions.  Update quantum for other thread and begin
2311 *	the quantum for ourselves.
2312 *
2313 *	Called at splsched.
2314 */
2315void
2316thread_dispatch(
2317	thread_t		thread,
2318	thread_t		self)
2319{
2320	processor_t		processor = self->last_processor;
2321
2322	if (thread != THREAD_NULL) {
2323		/*
2324		 *	If blocked at a continuation, discard
2325		 *	the stack.
2326		 */
2327		if (thread->continuation != NULL && thread->kernel_stack != 0)
2328			stack_free(thread);
2329
2330		if (!(thread->state & TH_IDLE)) {
2331			int64_t consumed;
2332			int64_t remainder = 0;
2333
2334			if (processor->quantum_end > processor->last_dispatch)
2335				remainder = processor->quantum_end -
2336				    processor->last_dispatch;
2337
2338			consumed = thread->current_quantum - remainder;
2339
			if ((thread->reason & AST_LEDGER) == 0) {
				/*
				 * Bill CPU time to both the individual thread
				 * and the task.
				 */
				ledger_credit(thread->t_ledger,
				    task_ledgers.cpu_time, consumed);
				ledger_credit(thread->t_threadledger,
				    thread_ledgers.cpu_time, consumed);
			}
2349
2350			wake_lock(thread);
2351			thread_lock(thread);
2352
2353			/*
2354			 *	Compute remainder of current quantum.
2355			 */
2356			if (first_timeslice(processor) &&
2357			    processor->quantum_end > processor->last_dispatch)
2358				thread->current_quantum = (uint32_t)remainder;
2359			else
2360				thread->current_quantum = 0;
2361
2362			if (thread->sched_mode == TH_MODE_REALTIME) {
2363				/*
2364				 *	Cancel the deadline if the thread has
2365				 *	consumed the entire quantum.
2366				 */
2367				if (thread->current_quantum == 0) {
2368					thread->realtime.deadline = UINT64_MAX;
2369					thread->reason |= AST_QUANTUM;
2370				}
2371			} else {
2372#if defined(CONFIG_SCHED_TRADITIONAL)
2373				/*
2374				 *	For non-realtime threads treat a tiny
2375				 *	remaining quantum as an expired quantum
2376				 *	but include what's left next time.
2377				 */
2378				if (thread->current_quantum < min_std_quantum) {
2379					thread->reason |= AST_QUANTUM;
2380					thread->current_quantum += SCHED(initial_quantum_size)(thread);
2381				}
2382#endif
2383			}
2384
2385			/*
2386			 *	If we are doing a direct handoff then
2387			 *	take the remainder of the quantum.
2388			 */
2389			if ((thread->reason & (AST_HANDOFF|AST_QUANTUM)) == AST_HANDOFF) {
2390				self->current_quantum = thread->current_quantum;
2391				thread->reason |= AST_QUANTUM;
2392				thread->current_quantum = 0;
2393			}
2394
2395			thread->computation_metered += (processor->last_dispatch - thread->computation_epoch);
2396
2397			if (!(thread->state & TH_WAIT)) {
2398				/*
2399				 *	Still running.
2400				 */
2401				if (thread->reason & AST_QUANTUM)
2402					thread_setrun(thread, SCHED_TAILQ);
2403				else
2404				if (thread->reason & AST_PREEMPT)
2405					thread_setrun(thread, SCHED_HEADQ);
2406				else
2407					thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
2408
2409				thread->reason = AST_NONE;
2410
2411				if (thread->wake_active) {
2412					thread->wake_active = FALSE;
2413					thread_unlock(thread);
2414
2415					thread_wakeup(&thread->wake_active);
2416				}
2417				else
2418					thread_unlock(thread);
2419
2420				wake_unlock(thread);
2421			}
2422			else {
2423				/*
2424				 *	Waiting.
2425				 */
2426				boolean_t should_terminate = FALSE;
2427
2428				/* Only the first call to thread_dispatch
2429				 * after explicit termination should add
2430				 * the thread to the termination queue
2431				 */
2432				if ((thread->state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) {
2433					should_terminate = TRUE;
2434					thread->state |= TH_TERMINATE2;
2435				}
2436
2437				thread->state &= ~TH_RUN;
2438
2439				if (thread->sched_mode == TH_MODE_TIMESHARE)
2440					sched_share_decr();
2441				sched_run_decr();
2442
2443				(*thread->sched_call)(SCHED_CALL_BLOCK, thread);
2444
2445				if (thread->wake_active) {
2446					thread->wake_active = FALSE;
2447					thread_unlock(thread);
2448
2449					thread_wakeup(&thread->wake_active);
2450				}
2451				else
2452					thread_unlock(thread);
2453
2454				wake_unlock(thread);
2455
2456				if (should_terminate)
2457					thread_terminate_enqueue(thread);
2458			}
2459		}
2460	}
2461
2462	if (!(self->state & TH_IDLE)) {
2463
2464		if (self->sched_mode == TH_MODE_REALTIME)
2465			thread_tell_urgency(THREAD_URGENCY_REAL_TIME, self->realtime.period, self->realtime.deadline);
2466		/* Identify non-promoted threads which have requested a
2467		 * "background" priority.
2468		 */
2469		else if ((self->sched_pri <= MAXPRI_THROTTLE) &&
2470			(self->priority <= MAXPRI_THROTTLE))
2471			thread_tell_urgency(THREAD_URGENCY_BACKGROUND, self->sched_pri, self->priority);
2472		else
2473			thread_tell_urgency(THREAD_URGENCY_NORMAL, self->sched_pri, self->priority);
2474		/*
2475		 *	Get a new quantum if none remaining.
2476		 */
2477		if (self->current_quantum == 0) {
2478			thread_quantum_init(self);
2479			self->last_quantum_refill_time = processor->last_dispatch;
2480		}
2481
2482		/*
2483		 *	Set up quantum timer and timeslice.
2484		 */
2485		processor->quantum_end = (processor->last_dispatch + self->current_quantum);
2486		timer_call_enter1(&processor->quantum_timer, self, processor->quantum_end, TIMER_CALL_CRITICAL);
2487
2488		processor->timeslice = 1;
2489
2490		self->computation_epoch = processor->last_dispatch;
2491	}
2492	else {
2493		timer_call_cancel(&processor->quantum_timer);
2494		processor->timeslice = 0;
2495
2496		thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0);
2497	}
2498}
2499
2500#include <libkern/OSDebug.h>
2501
2502uint32_t	kdebug_thread_block = 0;
2503
2504
2505/*
2506 *	thread_block_reason:
2507 *
2508 *	Forces a reschedule, blocking the caller if a wait
2509 *	has been asserted.
2510 *
2511 *	If a continuation is specified, then thread_invoke will
2512 *	attempt to discard the thread's kernel stack.  When the
2513 *	thread resumes, it will execute the continuation function
2514 *	on a new kernel stack.
2515 */
2516counter(mach_counter_t  c_thread_block_calls = 0;)
2517
2518wait_result_t
2519thread_block_reason(
2520	thread_continue_t	continuation,
2521	void				*parameter,
2522	ast_t				reason)
2523{
2524	register thread_t		self = current_thread();
2525	register processor_t	processor;
2526	register thread_t		new_thread;
2527	spl_t					s;
2528
2529	counter(++c_thread_block_calls);
2530
2531	s = splsched();
2532
	if (!(reason & AST_PREEMPT))
2534		funnel_release_check(self, 2);
2535
2536	processor = current_processor();
2537
2538	/* If we're explicitly yielding, force a subsequent quantum */
2539	if (reason & AST_YIELD)
2540		processor->timeslice = 0;
2541
2542	/* We're handling all scheduling AST's */
2543	ast_off(AST_SCHEDULING);
2544
2545	self->continuation = continuation;
2546	self->parameter = parameter;
2547
2548	if (__improbable(kdebug_thread_block && kdebug_enable && self->state != TH_RUN)) {
2549		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2550			MACHDBG_CODE(DBG_MACH_SCHED,MACH_BLOCK),
2551			reason, VM_KERNEL_UNSLIDE(continuation), 0, 0, 0);
2552	}
2553
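	/*
	 *	Select and attempt to switch until thread_invoke succeeds;
	 *	it can fail if the chosen thread has no kernel stack and
	 *	one cannot be allocated immediately.
	 */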
2554	do {
2555		thread_lock(self);
2556		new_thread = thread_select(self, processor);
2557		thread_unlock(self);
2558	} while (!thread_invoke(self, new_thread, reason));
2559
2560	funnel_refunnel_check(self, 5);
2561	splx(s);
2562
2563	return (self->wait_result);
2564}
2565
2566/*
2567 *	thread_block:
2568 *
2569 *	Block the current thread if a wait has been asserted.
2570 */
2571wait_result_t
2572thread_block(
2573	thread_continue_t	continuation)
2574{
2575	return thread_block_reason(continuation, NULL, AST_NONE);
2576}
2577
2578wait_result_t
2579thread_block_parameter(
2580	thread_continue_t	continuation,
2581	void				*parameter)
2582{
2583	return thread_block_reason(continuation, parameter, AST_NONE);
2584}
2585
2586/*
2587 *	thread_run:
2588 *
2589 *	Switch directly from the current thread to the
2590 *	new thread, handing off our quantum if appropriate.
2591 *
2592 *	New thread must be runnable, and not on a run queue.
2593 *
2594 *	Called at splsched.
2595 */
2596int
2597thread_run(
2598	thread_t			self,
2599	thread_continue_t	continuation,
2600	void				*parameter,
2601	thread_t			new_thread)
2602{
2603	ast_t		handoff = AST_HANDOFF;
2604
2605	funnel_release_check(self, 3);
2606
2607	self->continuation = continuation;
2608	self->parameter = parameter;
2609
2610	while (!thread_invoke(self, new_thread, handoff)) {
2611		processor_t		processor = current_processor();
2612
2613		thread_lock(self);
2614		new_thread = thread_select(self, processor);
2615		thread_unlock(self);
2616		handoff = AST_NONE;
2617	}
2618
2619	funnel_refunnel_check(self, 6);
2620
2621	return (self->wait_result);
2622}
2623
2624/*
2625 *	thread_continue:
2626 *
2627 *	Called at splsched when a thread first receives
2628 *	a new stack after a continuation.
2629 */
2630void
2631thread_continue(
2632	register thread_t	thread)
2633{
2634	register thread_t		self = current_thread();
2635	register thread_continue_t	continuation;
2636	register void			*parameter;
2637
2638	DTRACE_SCHED(on__cpu);
2639
2640	continuation = self->continuation;
2641	parameter = self->parameter;
2642
2643	thread_dispatch(thread, self);
2644
2645	self->continuation = self->parameter = NULL;
2646
2647	funnel_refunnel_check(self, 4);
2648
2649	if (thread != THREAD_NULL)
2650		(void)spllo();
2651
	TLOG(1, "thread_continue: calling call_continuation \n");
2653	call_continuation(continuation, parameter, self->wait_result);
2654	/*NOTREACHED*/
2655}
2656
2657void
2658thread_quantum_init(thread_t thread)
2659{
2660	if (thread->sched_mode == TH_MODE_REALTIME) {
2661		thread->current_quantum = thread->realtime.computation;
2662	} else {
2663		thread->current_quantum = SCHED(initial_quantum_size)(thread);
2664	}
2665}
2666
2667#if defined(CONFIG_SCHED_TRADITIONAL)
2668static uint32_t
2669sched_traditional_initial_quantum_size(thread_t thread)
2670{
2671	if ((thread == THREAD_NULL) || thread->priority > MAXPRI_THROTTLE)
2672		return std_quantum;
2673	else
2674		return bg_quantum;
2675}
2676
2677static sched_mode_t
2678sched_traditional_initial_thread_sched_mode(task_t parent_task)
2679{
2680	if (parent_task == kernel_task)
2681		return TH_MODE_FIXED;
2682	else
2683		return TH_MODE_TIMESHARE;
2684}
2685
2686static boolean_t
2687sched_traditional_supports_timeshare_mode(void)
2688{
2689	return TRUE;
2690}
2691
2692#endif /* CONFIG_SCHED_TRADITIONAL */
2693
2694/*
2695 *	run_queue_init:
2696 *
2697 *	Initialize a run queue before first use.
2698 */
2699void
2700run_queue_init(
2701	run_queue_t		rq)
2702{
2703	int				i;
2704
2705	rq->highq = IDLEPRI;
2706	for (i = 0; i < NRQBM; i++)
2707		rq->bitmap[i] = 0;
2708	setbit(MAXPRI - IDLEPRI, rq->bitmap);
2709	rq->urgency = rq->count = 0;
2710	for (i = 0; i < NRQS; i++)
2711		queue_init(&rq->queues[i]);
2712}
2713
2714#if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_PROTO) || defined(CONFIG_SCHED_GRRR) || defined(CONFIG_SCHED_FIXEDPRIORITY)
2715int
2716sched_traditional_fairshare_runq_count(void)
2717{
2718	return fs_runq.count;
2719}
2720
2721uint64_t
2722sched_traditional_fairshare_runq_stats_count_sum(void)
2723{
2724	return fs_runq.runq_stats.count_sum;
2725}
2726
2727void
2728sched_traditional_fairshare_enqueue(thread_t thread)
2729{
2730	queue_t				queue = &fs_runq.queue;
2731
2732	simple_lock(&fs_lock);
2733
2734	enqueue_tail(queue, (queue_entry_t)thread);
2735
2736	thread->runq = FS_RUNQ;
2737	SCHED_STATS_RUNQ_CHANGE(&fs_runq.runq_stats, fs_runq.count);
2738	fs_runq.count++;
2739
2740	simple_unlock(&fs_lock);
2741}
2742
2743thread_t
2744sched_traditional_fairshare_dequeue(void)
2745{
2746	thread_t thread;
2747
2748	simple_lock(&fs_lock);
2749	if (fs_runq.count > 0) {
2750		thread = (thread_t)dequeue_head(&fs_runq.queue);
2751
2752		thread->runq = PROCESSOR_NULL;
2753		SCHED_STATS_RUNQ_CHANGE(&fs_runq.runq_stats, fs_runq.count);
2754		fs_runq.count--;
2755
2756		simple_unlock(&fs_lock);
2757
2758		return (thread);
2759	}
2760	simple_unlock(&fs_lock);
2761
2762	return THREAD_NULL;
2763}
2764
2765boolean_t
2766sched_traditional_fairshare_queue_remove(thread_t thread)
2767{
2768	queue_t			q;
2769
2770	simple_lock(&fs_lock);
2771	q = &fs_runq.queue;
2772
2773	if (FS_RUNQ == thread->runq) {
2774		remqueue((queue_entry_t)thread);
2775		SCHED_STATS_RUNQ_CHANGE(&fs_runq.runq_stats, fs_runq.count);
2776		fs_runq.count--;
2777
2778		thread->runq = PROCESSOR_NULL;
2779		simple_unlock(&fs_lock);
2780		return (TRUE);
2781	}
2782	else {
2783		/*
2784		 *	The thread left the run queue before we could
2785		 * 	lock the run queue.
2786		 */
2787		assert(thread->runq == PROCESSOR_NULL);
2788		simple_unlock(&fs_lock);
2789		return (FALSE);
2790	}
2791}
2792
2793#endif /* defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_PROTO) || defined(CONFIG_SCHED_GRRR) || defined(CONFIG_SCHED_FIXEDPRIORITY) */
2794
2795/*
2796 *	run_queue_dequeue:
2797 *
2798 *	Perform a dequeue operation on a run queue,
2799 *	and return the resulting thread.
2800 *
2801 *	The run queue must be locked (see thread_run_queue_remove()
2802 *	for more info), and not empty.
2803 */
2804thread_t
2805run_queue_dequeue(
2806	run_queue_t		rq,
2807	integer_t		options)
2808{
2809	thread_t		thread;
2810	queue_t			queue = rq->queues + rq->highq;
2811
2812	if (options & SCHED_HEADQ) {
2813		thread = (thread_t)dequeue_head(queue);
2814	}
2815	else {
2816		thread = (thread_t)dequeue_tail(queue);
2817	}
2818
2819	thread->runq = PROCESSOR_NULL;
2820	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2821	rq->count--;
2822	if (SCHED(priority_is_urgent)(rq->highq)) {
2823		rq->urgency--; assert(rq->urgency >= 0);
2824	}
2825	if (queue_empty(queue)) {
2826		if (rq->highq != IDLEPRI)
2827			clrbit(MAXPRI - rq->highq, rq->bitmap);
2828		rq->highq = MAXPRI - ffsbit(rq->bitmap);
2829	}
2830
2831	return (thread);
2832}
2833
2834/*
2835 *	run_queue_enqueue:
2836 *
2837 *	Perform a enqueue operation on a run queue.
2838 *
2839 *	The run queue must be locked (see thread_run_queue_remove()
2840 *	for more info).
2841 */
2842boolean_t
2843run_queue_enqueue(
2844							  run_queue_t		rq,
2845							  thread_t			thread,
2846							  integer_t		options)
2847{
2848	queue_t			queue = rq->queues + thread->sched_pri;
2849	boolean_t		result = FALSE;
2850
2851	if (queue_empty(queue)) {
2852		enqueue_tail(queue, (queue_entry_t)thread);
2853
2854		setbit(MAXPRI - thread->sched_pri, rq->bitmap);
2855		if (thread->sched_pri > rq->highq) {
2856			rq->highq = thread->sched_pri;
2857			result = TRUE;
2858		}
2859	}
2860	else
2861		if (options & SCHED_TAILQ)
2862			enqueue_tail(queue, (queue_entry_t)thread);
2863		else
2864			enqueue_head(queue, (queue_entry_t)thread);
2865
2866	if (SCHED(priority_is_urgent)(thread->sched_pri))
2867		rq->urgency++;
2868	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2869	rq->count++;
2870
2871	return (result);
2872
2873}
2874
2875/*
2876 *	run_queue_remove:
2877 *
2878 *	Remove a specific thread from a runqueue.
2879 *
2880 *	The run queue must be locked.
2881 */
2882void
2883run_queue_remove(
2884				  run_queue_t		rq,
2885				  thread_t			thread)
2886{
2887
2888	remqueue((queue_entry_t)thread);
2889	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
2890	rq->count--;
2891	if (SCHED(priority_is_urgent)(thread->sched_pri)) {
2892		rq->urgency--; assert(rq->urgency >= 0);
2893	}
2894
2895	if (queue_empty(rq->queues + thread->sched_pri)) {
2896		/* update run queue status */
2897		if (thread->sched_pri != IDLEPRI)
2898			clrbit(MAXPRI - thread->sched_pri, rq->bitmap);
2899		rq->highq = MAXPRI - ffsbit(rq->bitmap);
2900	}
2901
2902	thread->runq = PROCESSOR_NULL;
2903}
2904
2905/*
2906 *	fairshare_setrun:
2907 *
2908 *	Dispatch a thread for round-robin execution.
2909 *
2910 *	Thread must be locked.  Associated pset must
2911 *	be locked, and is returned unlocked.
2912 */
2913static void
2914fairshare_setrun(
2915				  processor_t			processor,
2916				  thread_t			thread)
2917{
2918	processor_set_t		pset = processor->processor_set;
2919
2920	thread->chosen_processor = processor;
2921
2922	SCHED(fairshare_enqueue)(thread);
2923
2924	if (processor != current_processor())
2925		machine_signal_idle(processor);
2926
2927	pset_unlock(pset);
2928
2929}
2930
2931/*
2932 *	realtime_queue_insert:
2933 *
2934 *	Enqueue a thread for realtime execution.
2935 */
2936static boolean_t
2937realtime_queue_insert(
2938	thread_t			thread)
2939{
2940	queue_t				queue = &rt_runq.queue;
2941	uint64_t			deadline = thread->realtime.deadline;
2942	boolean_t			preempt = FALSE;
2943
2944	simple_lock(&rt_lock);
2945
2946	if (queue_empty(queue)) {
2947		enqueue_tail(queue, (queue_entry_t)thread);
2948		preempt = TRUE;
2949	}
2950	else {
2951		register thread_t	entry = (thread_t)queue_first(queue);
2952
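		/*
		 *	Keep the queue sorted by increasing deadline; preempt
		 *	only if the new thread ends up at the head.
		 */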
2953		while (TRUE) {
2954			if (	queue_end(queue, (queue_entry_t)entry)	||
2955						deadline < entry->realtime.deadline		) {
2956				entry = (thread_t)queue_prev((queue_entry_t)entry);
2957				break;
2958			}
2959
2960			entry = (thread_t)queue_next((queue_entry_t)entry);
2961		}
2962
2963		if ((queue_entry_t)entry == queue)
2964			preempt = TRUE;
2965
2966		insque((queue_entry_t)thread, (queue_entry_t)entry);
2967	}
2968
2969	thread->runq = RT_RUNQ;
2970	SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
2971	rt_runq.count++;
2972
2973	simple_unlock(&rt_lock);
2974
2975	return (preempt);
2976}
2977
2978/*
2979 *	realtime_setrun:
2980 *
2981 *	Dispatch a thread for realtime execution.
2982 *
2983 *	Thread must be locked.  Associated pset must
2984 *	be locked, and is returned unlocked.
2985 */
2986static void
2987realtime_setrun(
2988	processor_t			processor,
2989	thread_t			thread)
2990{
2991	processor_set_t		pset = processor->processor_set;
2992
2993	thread->chosen_processor = processor;
2994
2995	/*
2996	 *	Dispatch directly onto idle processor.
2997	 */
2998	if ( (thread->bound_processor == processor)
2999		&& processor->state == PROCESSOR_IDLE) {
3000		remqueue((queue_entry_t)processor);
3001		enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3002
3003		processor->next_thread = thread;
3004		processor->deadline = thread->realtime.deadline;
3005		processor->state = PROCESSOR_DISPATCHING;
3006		pset_unlock(pset);
3007
3008		if (processor != current_processor())
3009			machine_signal_idle(processor);
3010		return;
3011	}
3012
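	/*
	 *	Otherwise enqueue on the realtime run queue; if the thread
	 *	was inserted at the head, prod the chosen processor so it
	 *	re-evaluates preemption.
	 */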
3013	if (realtime_queue_insert(thread)) {
3014		int prstate = processor->state;
3015		if (processor == current_processor())
3016			ast_on(AST_PREEMPT | AST_URGENT);
3017		else if ((prstate == PROCESSOR_IDLE)  || (prstate == PROCESSOR_DISPATCHING))
3018			machine_signal_idle(processor);
3019		else
3020			cause_ast_check(processor);
3021	}
3022
3023	pset_unlock(pset);
3024}
3025
3026#if defined(CONFIG_SCHED_TRADITIONAL)
3027
3028static boolean_t
3029priority_is_urgent(int priority)
3030{
3031	return testbit(priority, sched_preempt_pri) ? TRUE : FALSE;
3032}
3033
3034/*
3035 *	processor_enqueue:
3036 *
3037 *	Enqueue thread on a processor run queue.  Thread must be locked,
3038 *	and not already be on a run queue.
3039 *
3040 *	Returns TRUE if a preemption is indicated based on the state
3041 *	of the run queue.
3042 *
3043 *	The run queue must be locked (see thread_run_queue_remove()
3044 *	for more info).
3045 */
3046static boolean_t
3047processor_enqueue(
3048	processor_t		processor,
3049	thread_t		thread,
3050	integer_t		options)
3051{
3052	run_queue_t		rq = runq_for_processor(processor);
3053	boolean_t		result;
3054
3055	result = run_queue_enqueue(rq, thread, options);
3056	thread->runq = processor;
3057	runq_consider_incr_bound_count(processor, thread);
3058
3059	return (result);
3060}
3061
3062#endif /* CONFIG_SCHED_TRADITIONAL */
3063
3064/*
3065 *	processor_setrun:
3066 *
3067 *	Dispatch a thread for execution on a
3068 *	processor.
3069 *
3070 *	Thread must be locked.  Associated pset must
3071 *	be locked, and is returned unlocked.
3072 */
3073static void
3074processor_setrun(
3075	processor_t			processor,
3076	thread_t			thread,
3077	integer_t			options)
3078{
3079	processor_set_t		pset = processor->processor_set;
3080	ast_t				preempt;
3081
3082	thread->chosen_processor = processor;
3083
3084	/*
3085	 *	Dispatch directly onto idle processor.
3086	 */
3087	if ( (SCHED(direct_dispatch_to_idle_processors) ||
3088		  thread->bound_processor == processor)
3089		&& processor->state == PROCESSOR_IDLE) {
3090		remqueue((queue_entry_t)processor);
3091		enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
3092
3093		processor->next_thread = thread;
3094		processor->deadline = UINT64_MAX;
3095		processor->state = PROCESSOR_DISPATCHING;
3096		pset_unlock(pset);
3097
3098		if (processor != current_processor())
3099			machine_signal_idle(processor);
3100		return;
3101	}
3102
3103	/*
3104	 *	Set preemption mode.
3105	 */
3106	if (SCHED(priority_is_urgent)(thread->sched_pri) && thread->sched_pri > processor->current_pri)
3107		preempt = (AST_PREEMPT | AST_URGENT);
	else if (processor->active_thread && thread_eager_preemption(processor->active_thread))
3109		preempt = (AST_PREEMPT | AST_URGENT);
3110	else
3111	if ((thread->sched_mode == TH_MODE_TIMESHARE) && thread->sched_pri < thread->priority)
3112		preempt = AST_NONE;
3113	else
3114		preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
3115
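	/*
	 *	Suppress the preemption if the enqueue did not raise the
	 *	highest priority in the run queue.
	 */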
3116	if (!SCHED(processor_enqueue)(processor, thread, options))
3117		preempt = AST_NONE;
3118
3119	if (preempt != AST_NONE) {
3120		if (processor == current_processor()) {
3121			if (csw_check(processor) != AST_NONE)
3122				ast_on(preempt);
3123		}
3124		else
3125		if (	processor->state == PROCESSOR_IDLE || processor->state == PROCESSOR_DISPATCHING) {
3126			machine_signal_idle(processor);
3127		}
3128		else
3129		if (	(processor->state == PROCESSOR_RUNNING		||
3130				 processor->state == PROCESSOR_SHUTDOWN)		&&
3131				(thread->sched_pri >= processor->current_pri	||
3132				processor->current_thmode == TH_MODE_FAIRSHARE)) {
3133			cause_ast_check(processor);
3134		}
3135	}
3136	else
3137	if (	processor->state == PROCESSOR_SHUTDOWN		&&
3138			thread->sched_pri >= processor->current_pri	) {
3139		cause_ast_check(processor);
3140	}
3141	else
3142	if (	processor->state == PROCESSOR_IDLE	&&
3143			processor != current_processor()	) {
3144				machine_signal_idle(processor);
3145	}
3146
3147	pset_unlock(pset);
3148}
3149
3150#if defined(CONFIG_SCHED_TRADITIONAL)
3151
3152static boolean_t
3153processor_queue_empty(processor_t		processor)
3154{
3155	return runq_for_processor(processor)->count == 0;
3156
3157}
3158
3159static boolean_t
3160sched_traditional_with_pset_runqueue_processor_queue_empty(processor_t		processor)
3161{
3162	processor_set_t pset = processor->processor_set;
3163	int count = runq_for_processor(processor)->count;
3164
3165	/*
3166	 * The pset runq contains the count of all runnable threads
3167	 * for all processors in the pset. However, for threads that
3168	 * are bound to another processor, the current "processor"
3169	 * is not eligible to execute the thread. So we only
	 * include bound threads that are bound to the current
3171	 * "processor". This allows the processor to idle when the
3172	 * count of eligible threads drops to 0, even if there's
3173	 * a runnable thread bound to a different processor in the
3174	 * shared runq.
3175	 */
3176
3177	count -= pset->pset_runq_bound_count;
3178	count += processor->runq_bound_count;
3179
3180	return count == 0;
3181}
3182
3183static ast_t
3184processor_csw_check(processor_t processor)
3185{
3186	run_queue_t		runq;
3187	boolean_t		has_higher;
3188
3189	assert(processor->active_thread != NULL);
3190
3191	runq = runq_for_processor(processor);
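	/*
	 *	Within the first timeslice only a strictly higher priority
	 *	preempts; afterwards an equal priority is sufficient.
	 */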
3192	if (first_timeslice(processor)) {
3193		has_higher = (runq->highq > processor->current_pri);
3194	} else {
3195		has_higher = (runq->highq >= processor->current_pri);
3196	}
3197	if (has_higher) {
3198		if (runq->urgency > 0)
3199			return (AST_PREEMPT | AST_URGENT);
3200
3201		if (processor->active_thread && thread_eager_preemption(processor->active_thread))
3202			return (AST_PREEMPT | AST_URGENT);
3203
3204		return AST_PREEMPT;
3205	}
3206
3207	return AST_NONE;
3208}
3209
3210static boolean_t
3211processor_queue_has_priority(processor_t		processor,
3212							 int				priority,
3213							 boolean_t			gte)
3214{
3215	if (gte)
3216		return runq_for_processor(processor)->highq >= priority;
3217	else
3218		return runq_for_processor(processor)->highq > priority;
3219}
3220
3221static boolean_t
3222should_current_thread_rechoose_processor(processor_t			processor)
3223{
3224	return (processor->current_pri < BASEPRI_RTQUEUES
3225			&& processor->processor_meta != PROCESSOR_META_NULL
3226			&& processor->processor_meta->primary != processor);
3227}
3228
3229static int
3230sched_traditional_processor_runq_count(processor_t   processor)
3231{
3232	return runq_for_processor(processor)->count;
3233}
3234
3235
3236static uint64_t
3237sched_traditional_processor_runq_stats_count_sum(processor_t   processor)
3238{
3239	return runq_for_processor(processor)->runq_stats.count_sum;
3240}
3241
3242static uint64_t
3243sched_traditional_with_pset_runqueue_processor_runq_stats_count_sum(processor_t   processor)
3244{
3245	if (processor->cpu_id == processor->processor_set->cpu_set_low)
3246		return runq_for_processor(processor)->runq_stats.count_sum;
3247	else
3248		return 0ULL;
3249}
3250
3251#endif /* CONFIG_SCHED_TRADITIONAL */
3252
3253#define next_pset(p)	(((p)->pset_list != PROCESSOR_SET_NULL)? (p)->pset_list: (p)->node->psets)
3254
3255/*
3256 *	choose_next_pset:
3257 *
3258 *	Return the next sibling pset containing
3259 *	available processors.
3260 *
3261 *	Returns the original pset if none other is
3262 *	suitable.
3263 */
3264static processor_set_t
3265choose_next_pset(
3266	processor_set_t		pset)
3267{
3268	processor_set_t		nset = pset;
3269
3270	do {
3271		nset = next_pset(nset);
3272	} while (nset->online_processor_count < 1 && nset != pset);
3273
3274	return (nset);
3275}
3276
3277/*
3278 *	choose_processor:
3279 *
3280 *	Choose a processor for the thread, beginning at
3281 *	the pset.  Accepts an optional processor hint in
3282 *	the pset.
3283 *
3284 *	Returns a processor, possibly from a different pset.
3285 *
3286 *	The thread must be locked.  The pset must be locked,
3287 *	and the resulting pset is locked on return.
3288 */
3289processor_t
3290choose_processor(
3291	processor_set_t		pset,
3292	processor_t			processor,
3293	thread_t			thread)
3294{
3295	processor_set_t		nset, cset = pset;
3296	processor_meta_t	pmeta = PROCESSOR_META_NULL;
3297	processor_t             mprocessor;
3298
3299	/*
3300	 *	Prefer the hinted processor, when appropriate.
3301	 */
3302
3303	if (processor != PROCESSOR_NULL) {
3304		if (processor->processor_meta != PROCESSOR_META_NULL)
3305			processor = processor->processor_meta->primary;
3306	}
3307
3308	mprocessor = machine_choose_processor(pset, processor);
3309	if (mprocessor != PROCESSOR_NULL)
3310		processor = mprocessor;
3311
3312	if (processor != PROCESSOR_NULL) {
3313		if (processor->processor_set != pset ||
3314		    processor->state == PROCESSOR_INACTIVE ||
3315		    processor->state == PROCESSOR_SHUTDOWN ||
3316		    processor->state == PROCESSOR_OFF_LINE)
3317			processor = PROCESSOR_NULL;
3318		else
3319			if (processor->state == PROCESSOR_IDLE ||
3320			    ((thread->sched_pri >= BASEPRI_RTQUEUES) &&
3321			    (processor->current_pri < BASEPRI_RTQUEUES)))
3322				return (processor);
3323	}
3324
3325	/*
3326	 *	Iterate through the processor sets to locate
3327	 *	an appropriate processor.
3328	 */
3329	do {
3330		/*
3331		 *	Choose an idle processor.
3332		 */
3333		if (!queue_empty(&cset->idle_queue))
3334			return ((processor_t)queue_first(&cset->idle_queue));
3335
3336		if (thread->sched_pri >= BASEPRI_RTQUEUES) {
3337			integer_t lowest_priority = MAXPRI + 1;
3338			integer_t lowest_unpaired = MAXPRI + 1;
3339			uint64_t  furthest_deadline = 1;
3340			processor_t lp_processor = PROCESSOR_NULL;
3341			processor_t lp_unpaired = PROCESSOR_NULL;
3342			processor_t fd_processor = PROCESSOR_NULL;
3343
3344			lp_processor = cset->low_pri;
3345			/* Consider hinted processor */
3346			if (lp_processor != PROCESSOR_NULL &&
3347			    ((lp_processor->processor_meta == PROCESSOR_META_NULL) ||
3348			    ((lp_processor == lp_processor->processor_meta->primary) &&
3349			    !queue_empty(&lp_processor->processor_meta->idle_queue))) &&
3350			    lp_processor->state != PROCESSOR_INACTIVE &&
3351			    lp_processor->state != PROCESSOR_SHUTDOWN &&
3352			    lp_processor->state != PROCESSOR_OFF_LINE &&
3353			    (lp_processor->current_pri < thread->sched_pri))
3354				return lp_processor;
3355
3356			processor = (processor_t)queue_first(&cset->active_queue);
3357			while (!queue_end(&cset->active_queue, (queue_entry_t)processor)) {
3358				/* Discover the processor executing the
3359				 * thread with the lowest priority within
3360				 * this pset, or the one with the furthest
3361				 * deadline
3362				 */
3363				integer_t cpri = processor->current_pri;
3364				if (cpri < lowest_priority) {
3365					lowest_priority = cpri;
3366					lp_processor = processor;
3367				}
3368
3369				if ((cpri >= BASEPRI_RTQUEUES) && (processor->deadline > furthest_deadline)) {
3370					furthest_deadline = processor->deadline;
3371					fd_processor = processor;
3372				}
3373
3374
3375				if (processor->processor_meta != PROCESSOR_META_NULL &&
3376				    !queue_empty(&processor->processor_meta->idle_queue)) {
3377					if (cpri < lowest_unpaired) {
3378						lowest_unpaired = cpri;
3379						lp_unpaired = processor;
3380						pmeta = processor->processor_meta;
3381					}
3382					else
3383						if (pmeta == PROCESSOR_META_NULL)
3384							pmeta = processor->processor_meta;
3385				}
3386				processor = (processor_t)queue_next((queue_entry_t)processor);
3387			}
3388
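			/*
			 *	Preference order: lp_unpaired (lowest priority with
			 *	an idle meta-sibling), an idle processor from that
			 *	meta group, lp_processor (lowest priority overall),
			 *	then fd_processor (furthest realtime deadline).
			 */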
3389			if (thread->sched_pri > lowest_unpaired)
3390				return lp_unpaired;
3391
3392			if (pmeta != PROCESSOR_META_NULL)
3393				return ((processor_t)queue_first(&pmeta->idle_queue));
3394			if (thread->sched_pri > lowest_priority)
3395				return lp_processor;
3396			if (thread->realtime.deadline < furthest_deadline)
3397				return fd_processor;
3398
3399			processor = PROCESSOR_NULL;
3400		}
3401		else {
3402			/*
3403			 *	Check any hinted processors in the processor set if available.
3404			 */
3405			if (cset->low_pri != PROCESSOR_NULL && cset->low_pri->state != PROCESSOR_INACTIVE &&
3406					cset->low_pri->state != PROCESSOR_SHUTDOWN && cset->low_pri->state != PROCESSOR_OFF_LINE &&
3407						(processor == PROCESSOR_NULL ||
3408							(thread->sched_pri > BASEPRI_DEFAULT && cset->low_pri->current_pri < thread->sched_pri))) {
3409				processor = cset->low_pri;
3410			}
3411			else
3412			if (cset->low_count != PROCESSOR_NULL && cset->low_count->state != PROCESSOR_INACTIVE &&
3413					cset->low_count->state != PROCESSOR_SHUTDOWN && cset->low_count->state != PROCESSOR_OFF_LINE &&
3414						(processor == PROCESSOR_NULL || (thread->sched_pri <= BASEPRI_DEFAULT &&
3415															SCHED(processor_runq_count)(cset->low_count) < SCHED(processor_runq_count)(processor)))) {
3416				processor = cset->low_count;
3417			}
3418
3419			/*
3420			 *	Otherwise, choose an available processor in the set.
3421			 */
3422			if (processor == PROCESSOR_NULL) {
3423				processor = (processor_t)dequeue_head(&cset->active_queue);
3424				if (processor != PROCESSOR_NULL)
3425					enqueue_tail(&cset->active_queue, (queue_entry_t)processor);
3426			}
3427
3428			if (processor != PROCESSOR_NULL && pmeta == PROCESSOR_META_NULL) {
3429				if (processor->processor_meta != PROCESSOR_META_NULL &&
3430											!queue_empty(&processor->processor_meta->idle_queue))
3431					pmeta = processor->processor_meta;
3432			}
3433		}
3434
3435		/*
3436		 *	Move onto the next processor set.
3437		 */
3438		nset = next_pset(cset);
3439
3440		if (nset != pset) {
3441			pset_unlock(cset);
3442
3443			cset = nset;
3444			pset_lock(cset);
3445		}
3446	} while (nset != pset);
3447
3448	/*
3449	 *	Make sure that we pick a running processor,
3450	 *	and that the correct processor set is locked.
3451	 */
3452	do {
3453		if (pmeta != PROCESSOR_META_NULL) {
3454			if (cset != pmeta->primary->processor_set) {
3455				pset_unlock(cset);
3456
3457				cset = pmeta->primary->processor_set;
3458				pset_lock(cset);
3459			}
3460
3461			if (!queue_empty(&pmeta->idle_queue))
3462				return ((processor_t)queue_first(&pmeta->idle_queue));
3463
3464			pmeta = PROCESSOR_META_NULL;
3465		}
3466
3467		/*
3468		 *	If we haven't been able to choose a processor,
3469		 *	pick the boot processor and return it.
3470		 */
3471		if (processor == PROCESSOR_NULL) {
3472			processor = master_processor;
3473
3474			/*
3475			 *	Check that the correct processor set is
3476			 *	returned locked.
3477			 */
3478			if (cset != processor->processor_set) {
3479				pset_unlock(cset);
3480
3481				cset = processor->processor_set;
3482				pset_lock(cset);
3483			}
3484
3485			return (processor);
3486		}
3487
3488		/*
3489		 *	Check that the processor set for the chosen
3490		 *	processor is locked.
3491		 */
3492		if (cset != processor->processor_set) {
3493			pset_unlock(cset);
3494
3495			cset = processor->processor_set;
3496			pset_lock(cset);
3497		}
3498
3499		/*
3500		 *	We must verify that the chosen processor is still available.
3501		 */
3502		if (processor->state == PROCESSOR_INACTIVE ||
3503					processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE)
3504			processor = PROCESSOR_NULL;
3505	} while (processor == PROCESSOR_NULL);
3506
3507	return (processor);
3508}
3509
3510/*
3511 *	thread_setrun:
3512 *
3513 *	Dispatch thread for execution, onto an idle
3514 *	processor or run queue, and signal a preemption
3515 *	as appropriate.
3516 *
3517 *	Thread must be locked.
3518 */
3519void
3520thread_setrun(
3521	thread_t			thread,
3522	integer_t			options)
3523{
3524	processor_t			processor;
3525	processor_set_t		pset;
3526
3527#if DEBUG
3528	assert(thread_runnable(thread));
3529#endif
3530
3531	/*
3532	 *	Update priority if needed.
3533	 */
3534	if (SCHED(can_update_priority)(thread))
3535		SCHED(update_priority)(thread);
3536
3537	assert(thread->runq == PROCESSOR_NULL);
3538
3539	if (thread->bound_processor == PROCESSOR_NULL) {
3540		/*
3541		 *	Unbound case.
3542		 */
3543		if (thread->affinity_set != AFFINITY_SET_NULL) {
3544			/*
3545			 * Use affinity set policy hint.
3546			 */
3547			pset = thread->affinity_set->aset_pset;
3548			pset_lock(pset);
3549
3550			processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
3551		}
3552		else
3553		if (thread->last_processor != PROCESSOR_NULL) {
3554			/*
3555			 *	Simple (last processor) affinity case.
3556			 */
3557			processor = thread->last_processor;
3558			pset = processor->processor_set;
3559			pset_lock(pset);
3560			processor = SCHED(choose_processor)(pset, processor, thread);
3561
3562			if ((thread->last_processor != processor) && (thread->last_processor != PROCESSOR_NULL)) {
3563				KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LPA_BROKEN)|DBG_FUNC_NONE,
3564									  (uintptr_t)thread_tid(thread), (uintptr_t)thread->last_processor->cpu_id, (uintptr_t)processor->cpu_id, thread->last_processor->state, 0);
3565			}
3566
3567		}
3568		else {
3569			/*
3570			 *	No Affinity case:
3571			 *
3572			 *	Utilitize a per task hint to spread threads
3573			 *	among the available processor sets.
3574			 */
3575			task_t		task = thread->task;
3576
3577			pset = task->pset_hint;
3578			if (pset == PROCESSOR_SET_NULL)
3579				pset = current_processor()->processor_set;
3580
3581			pset = choose_next_pset(pset);
3582			pset_lock(pset);
3583
3584			processor = SCHED(choose_processor)(pset, PROCESSOR_NULL, thread);
3585			task->pset_hint = processor->processor_set;
3586		}
3587	}
3588	else {
3589		/*
3590		 *	Bound case:
3591		 *
3592		 *	Unconditionally dispatch on the processor.
3593		 */
3594		processor = thread->bound_processor;
3595		pset = processor->processor_set;
3596		pset_lock(pset);
3597	}
3598
3599	/*
3600	 *	Dispatch the thread on the choosen processor.
3601	 */
3602	if (thread->sched_pri >= BASEPRI_RTQUEUES)
3603		realtime_setrun(processor, thread);
3604	else if (thread->sched_mode == TH_MODE_FAIRSHARE)
3605		fairshare_setrun(processor, thread);
3606	else
3607		processor_setrun(processor, thread, options);
3608}
3609
3610processor_set_t
3611task_choose_pset(
3612	task_t		task)
3613{
3614	processor_set_t		pset = task->pset_hint;
3615
3616	if (pset != PROCESSOR_SET_NULL)
3617		pset = choose_next_pset(pset);
3618
3619	return (pset);
3620}
3621
3622#if defined(CONFIG_SCHED_TRADITIONAL)
3623
3624/*
3625 *	processor_queue_shutdown:
3626 *
3627 *	Shutdown a processor run queue by
3628 *	re-dispatching non-bound threads.
3629 *
3630 *	Associated pset must be locked, and is
3631 *	returned unlocked.
3632 */
3633void
3634processor_queue_shutdown(
3635	processor_t			processor)
3636{
3637	processor_set_t		pset = processor->processor_set;
3638	run_queue_t			rq = runq_for_processor(processor);
3639	queue_t				queue = rq->queues + rq->highq;
3640	int					pri = rq->highq, count = rq->count;
3641	thread_t			next, thread;
3642	queue_head_t		tqueue;
3643
3644	queue_init(&tqueue);
3645
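	/*
	 *	Collect unbound threads onto a local queue; bound threads
	 *	stay on this run queue.  The collected threads are
	 *	re-dispatched once the pset lock is dropped.
	 */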
3646	while (count > 0) {
3647		thread = (thread_t)queue_first(queue);
3648		while (!queue_end(queue, (queue_entry_t)thread)) {
3649			next = (thread_t)queue_next((queue_entry_t)thread);
3650
3651			if (thread->bound_processor == PROCESSOR_NULL) {
3652				remqueue((queue_entry_t)thread);
3653
3654				thread->runq = PROCESSOR_NULL;
3655				SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3656				runq_consider_decr_bound_count(processor, thread);
3657				rq->count--;
3658				if (SCHED(priority_is_urgent)(pri)) {
3659					rq->urgency--; assert(rq->urgency >= 0);
3660				}
3661				if (queue_empty(queue)) {
3662					if (pri != IDLEPRI)
3663						clrbit(MAXPRI - pri, rq->bitmap);
3664					rq->highq = MAXPRI - ffsbit(rq->bitmap);
3665				}
3666
3667				enqueue_tail(&tqueue, (queue_entry_t)thread);
3668			}
3669			count--;
3670
3671			thread = next;
3672		}
3673
3674		queue--; pri--;
3675	}
3676
3677	pset_unlock(pset);
3678
3679	while ((thread = (thread_t)dequeue_head(&tqueue)) != THREAD_NULL) {
3680		thread_lock(thread);
3681
3682		thread_setrun(thread, SCHED_TAILQ);
3683
3684		thread_unlock(thread);
3685	}
3686}
3687
3688#endif /* CONFIG_SCHED_TRADITIONAL */
3689
3690/*
3691 *	Check for a preemption point in
3692 *	the current context.
3693 *
3694 *	Called at splsched.
3695 */
3696ast_t
3697csw_check(
3698	processor_t		processor)
3699{
3702	ast_t			result = AST_NONE;
3703	thread_t		thread = processor->active_thread;
3704
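	/*
	 *	A runnable realtime thread forces an urgent preemption during
	 *	the first timeslice; after that, only when the running thread
	 *	is at or below the realtime base priority.
	 */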
3705	if (first_timeslice(processor)) {
3706		if (rt_runq.count > 0)
3707			return (AST_PREEMPT | AST_URGENT);
3708	}
3709	else {
3710		if (rt_runq.count > 0 && BASEPRI_RTQUEUES >= processor->current_pri)
3711			return (AST_PREEMPT | AST_URGENT);
3712	}
3713
3714	result = SCHED(processor_csw_check)(processor);
3715	if (result != AST_NONE)
3716		return (result);
3717
3718	if (SCHED(should_current_thread_rechoose_processor)(processor))
3719		return (AST_PREEMPT);
3720
3721	if (machine_processor_is_inactive(processor))
3722		return (AST_PREEMPT);
3723
3724	if (thread->state & TH_SUSP)
3725		return (AST_PREEMPT);
3726
3727	return (AST_NONE);
3728}
3729
3730/*
3731 *	set_sched_pri:
3732 *
3733 *	Set the scheduled priority of the specified thread.
3734 *
3735 *	This may cause the thread to change queues.
3736 *
3737 *	Thread must be locked.
3738 */
3739void
3740set_sched_pri(
3741	thread_t		thread,
3742	int				priority)
3743{
3744	boolean_t		removed = thread_run_queue_remove(thread);
3745
3746	thread->sched_pri = priority;
3747	if (removed)
3748		thread_setrun(thread, SCHED_PREEMPT | SCHED_TAILQ);
3749	else
3750	if (thread->state & TH_RUN) {
3751		processor_t		processor = thread->last_processor;
3752
3753		if (thread == current_thread()) {
3754			ast_t			preempt;
3755
3756			processor->current_pri = priority;
3757			processor->current_thmode = thread->sched_mode;
3758			if ((preempt = csw_check(processor)) != AST_NONE)
3759				ast_on(preempt);
3760		}
3761		else
3762		if (	processor != PROCESSOR_NULL						&&
3763				processor->active_thread == thread	)
3764			cause_ast_check(processor);
3765	}
3766}
3767
3768#if		0
3769
3770static void
3771run_queue_check(
3772	run_queue_t		rq,
3773	thread_t		thread)
3774{
3775	queue_t			q;
3776	queue_entry_t	qe;
3777
3778	if (rq != thread->runq)
3779		panic("run_queue_check: thread runq");
3780
3781	if (thread->sched_pri > MAXPRI || thread->sched_pri < MINPRI)
3782		panic("run_queue_check: thread sched_pri");
3783
3784	q = &rq->queues[thread->sched_pri];
3785	qe = queue_first(q);
3786	while (!queue_end(q, qe)) {
3787		if (qe == (queue_entry_t)thread)
3788			return;
3789
3790		qe = queue_next(qe);
3791	}
3792
3793	panic("run_queue_check: end");
3794}
3795
#endif	/* 0 */
3797
3798#if defined(CONFIG_SCHED_TRADITIONAL)
3799
3800/* locks the runqueue itself */
3801
3802static boolean_t
3803processor_queue_remove(
3804					   processor_t			processor,
3805					   thread_t		thread)
3806{
3807	void *			rqlock;
3808	run_queue_t		rq;
3809
3810	rqlock = &processor->processor_set->sched_lock;
3811	rq = runq_for_processor(processor);
3812
3813	simple_lock(rqlock);
3814	if (processor == thread->runq) {
3815		/*
3816		 *	Thread is on a run queue and we have a lock on
3817		 *	that run queue.
3818		 */
3819		runq_consider_decr_bound_count(processor, thread);
3820		run_queue_remove(rq, thread);
3821	}
3822	else {
3823		/*
3824		 *	The thread left the run queue before we could
3825		 * 	lock the run queue.
3826		 */
3827		assert(thread->runq == PROCESSOR_NULL);
3828		processor = PROCESSOR_NULL;
3829	}
3830
3831	simple_unlock(rqlock);
3832
3833	return (processor != PROCESSOR_NULL);
3834}
3835
3836#endif /* CONFIG_SCHED_TRADITIONAL */
3837
3838/*
3839 *	thread_run_queue_remove:
3840 *
3841 *	Remove a thread from a current run queue and
3842 *	return TRUE if successful.
3843 *
3844 *	Thread must be locked.
3845 */
3846boolean_t
3847thread_run_queue_remove(
3848	thread_t		thread)
3849{
3850	processor_t		processor = thread->runq;
3851
3852	/*
3853	 *	If processor is PROCESSOR_NULL, the thread will stay out of the
3854	 *	run queues because the caller locked the thread.  Otherwise
3855	 *	the thread is on a run queue, but could be chosen for dispatch
3856	 *	and removed.
3857	 */
3858	if (processor != PROCESSOR_NULL) {
3859		queue_t			q;
3860
3861		/*
3862		 *	The processor run queues are locked by the
3863		 *	processor set.  Real-time priorities use a
3864		 *	global queue with a dedicated lock.
3865		 */
3866		if (thread->sched_mode == TH_MODE_FAIRSHARE) {
3867			return SCHED(fairshare_queue_remove)(thread);
3868		}
3869
3870		if (thread->sched_pri < BASEPRI_RTQUEUES) {
3871			return SCHED(processor_queue_remove)(processor, thread);
3872		}
3873
3874		simple_lock(&rt_lock);
3875		q = &rt_runq.queue;
3876
3877		if (processor == thread->runq) {
3878			/*
3879			 *	Thread is on a run queue and we have a lock on
3880			 *	that run queue.
3881			 */
3882			remqueue((queue_entry_t)thread);
3883			SCHED_STATS_RUNQ_CHANGE(&rt_runq.runq_stats, rt_runq.count);
3884			rt_runq.count--;
3885
3886			thread->runq = PROCESSOR_NULL;
3887		}
3888		else {
3889			/*
3890			 *	The thread left the run queue before we could
3891			 * 	lock the run queue.
3892			 */
3893			assert(thread->runq == PROCESSOR_NULL);
3894			processor = PROCESSOR_NULL;
3895		}
3896
3897		simple_unlock(&rt_lock);
3898	}
3899
3900	return (processor != PROCESSOR_NULL);
3901}
3902
3903#if defined(CONFIG_SCHED_TRADITIONAL)
3904
3905/*
3906 *	steal_processor_thread:
3907 *
3908 *	Locate a thread to steal from the processor and
3909 *	return it.
3910 *
3911 *	Associated pset must be locked.  Returns THREAD_NULL
3912 *	on failure.
3913 */
3914static thread_t
3915steal_processor_thread(
3916	processor_t		processor)
3917{
3918	run_queue_t		rq = runq_for_processor(processor);
3919	queue_t			queue = rq->queues + rq->highq;
3920	int				pri = rq->highq, count = rq->count;
3921	thread_t		thread;
3922
3923	while (count > 0) {
3924		thread = (thread_t)queue_first(queue);
3925		while (!queue_end(queue, (queue_entry_t)thread)) {
3926			if (thread->bound_processor == PROCESSOR_NULL) {
3927				remqueue((queue_entry_t)thread);
3928
3929				thread->runq = PROCESSOR_NULL;
3930				SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
3931				runq_consider_decr_bound_count(processor, thread);
3932				rq->count--;
3933				if (SCHED(priority_is_urgent)(pri)) {
3934					rq->urgency--; assert(rq->urgency >= 0);
3935				}
3936				if (queue_empty(queue)) {
3937					if (pri != IDLEPRI)
3938						clrbit(MAXPRI - pri, rq->bitmap);
3939					rq->highq = MAXPRI - ffsbit(rq->bitmap);
3940				}
3941
3942				return (thread);
3943			}
3944			count--;
3945
3946			thread = (thread_t)queue_next((queue_entry_t)thread);
3947		}
3948
3949		queue--; pri--;
3950	}
3951
3952	return (THREAD_NULL);
3953}
3954
3955/*
3956 *	Locate and steal a thread, beginning
3957 *	at the pset.
3958 *
3959 *	The pset must be locked, and is returned
3960 *	unlocked.
3961 *
3962 *	Returns the stolen thread, or THREAD_NULL on
3963 *	failure.
3964 */
3965static thread_t
3966steal_thread(
3967	processor_set_t		pset)
3968{
3969	processor_set_t		nset, cset = pset;
3970	processor_t			processor;
3971	thread_t			thread;
3972
3973	do {
3974		processor = (processor_t)queue_first(&cset->active_queue);
3975		while (!queue_end(&cset->active_queue, (queue_entry_t)processor)) {
3976			if (runq_for_processor(processor)->count > 0) {
3977				thread = steal_processor_thread(processor);
3978				if (thread != THREAD_NULL) {
3979					remqueue((queue_entry_t)processor);
3980					enqueue_tail(&cset->active_queue, (queue_entry_t)processor);
3981
3982					pset_unlock(cset);
3983
3984					return (thread);
3985				}
3986			}
3987
3988			processor = (processor_t)queue_next((queue_entry_t)processor);
3989		}
3990
3991		nset = next_pset(cset);
3992
3993		if (nset != pset) {
3994			pset_unlock(cset);
3995
3996			cset = nset;
3997			pset_lock(cset);
3998		}
3999	} while (nset != pset);
4000
4001	pset_unlock(cset);
4002
4003	return (THREAD_NULL);
4004}
4005
4006static thread_t	steal_thread_disabled(
4007					processor_set_t		pset)
4008{
4009	pset_unlock(pset);
4010
4011	return (THREAD_NULL);
4012}
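
/*
 * Illustrative sketch (not in the original source): the SCHED()
 * indirection used throughout this file resolves to the active
 * scheduler policy's dispatch table, which is how a policy that cannot
 * support work stealing can plug in steal_thread_disabled() instead of
 * steal_thread().  The structure below is a simplified, hypothetical
 * stand-in for such a table, not the real dispatch definition.
 */
#if 0
struct example_sched_ops {
	thread_t	(*steal_thread)(processor_set_t pset);
};

static const struct example_sched_ops example_stealing_ops = {
	.steal_thread	= steal_thread,
};

static const struct example_sched_ops example_non_stealing_ops = {
	.steal_thread	= steal_thread_disabled,
};
#endif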
4013
4014#endif /* CONFIG_SCHED_TRADITIONAL */
4015
4016
4017int
4018thread_get_urgency(uint64_t *rt_period, uint64_t *rt_deadline)
4019{
4020	processor_t	processor;
4021	thread_t	thread;
4022
4023	processor = current_processor();
4024
4025	thread = processor->next_thread;
4026
4027	if (thread != THREAD_NULL) {
4028		if (thread->sched_mode == TH_MODE_REALTIME) {
4029
4030			if (rt_period != NULL)
4031				*rt_period = thread->realtime.period;
4032			if (rt_deadline != NULL)
4033				*rt_deadline = thread->realtime.deadline;
4034
4035			KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_GET_URGENCY), THREAD_URGENCY_REAL_TIME, thread->realtime.period,
4036			    (thread->realtime.deadline >> 32), thread->realtime.deadline, 0);
4037
4038			return (THREAD_URGENCY_REAL_TIME);
4039		} else if ((thread->sched_pri <= MAXPRI_THROTTLE) &&
4040		    (thread->priority <= MAXPRI_THROTTLE)) {
4041			KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_GET_URGENCY), THREAD_URGENCY_BACKGROUND, thread->sched_pri, thread->priority, 0, 0);
4042			return (THREAD_URGENCY_BACKGROUND);
4043		}
4044		else
4045			KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_GET_URGENCY), THREAD_URGENCY_NORMAL, 0, 0, 0, 0);
4046
4047		return (THREAD_URGENCY_NORMAL);
4048	}
4049	else
4050		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_GET_URGENCY), THREAD_URGENCY_NONE, 0, 0, 0, 0);
4051	return (THREAD_URGENCY_NONE);
4052}
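
/*
 * Illustrative sketch (not in the original source): a hypothetical
 * consumer of thread_get_urgency(), for example code deciding how much
 * slack a timer may tolerate.  A real-time next thread exposes its
 * period and deadline; background work tolerates the most deferral.
 * The helper, its policy, and the slack figures are illustrative only.
 */
#if 0
static uint64_t
example_timer_slack_for_next_thread(void)
{
	uint64_t	rt_period = 0, rt_deadline = 0;
	int		urgency;

	urgency = thread_get_urgency(&rt_period, &rt_deadline);

	switch (urgency) {
	case THREAD_URGENCY_REAL_TIME:
		return (0);				/* no slack ahead of the deadline */
	case THREAD_URGENCY_BACKGROUND:
		return (10 * NSEC_PER_MSEC);		/* generous, hypothetical slack */
	default:
		return (NSEC_PER_MSEC);			/* modest, hypothetical slack */
	}
}
#endif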
4053
4054
4055/*
4056 *	This is the processor idle loop, which just looks for other threads
4057 *	to execute.  Processor idle threads invoke this without supplying a
4058 *	current thread, so the processor can idle without an asserted wait state.
4059 *
4060 *	Returns the next thread to execute if dispatched directly.
4061 */
4062
4063#if 0
4064#define IDLE_KERNEL_DEBUG_CONSTANT(...) KERNEL_DEBUG_CONSTANT(__VA_ARGS__)
4065#else
4066#define IDLE_KERNEL_DEBUG_CONSTANT(...) do { } while(0)
4067#endif
4068
4069thread_t
4070processor_idle(
4071	thread_t			thread,
4072	processor_t			processor)
4073{
4074	processor_set_t		pset = processor->processor_set;
4075	thread_t			new_thread;
4076	int					state;
4077	(void)splsched();
4078
4079	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4080		MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_START,
4081		(uintptr_t)thread_tid(thread), 0, 0, 0, 0);
4082
4083	SCHED_STATS_CPU_IDLE_START(processor);
4084
4085	timer_switch(&PROCESSOR_DATA(processor, system_state),
4086									mach_absolute_time(), &PROCESSOR_DATA(processor, idle_state));
4087	PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state);
4088
4089	while (processor->next_thread == THREAD_NULL && SCHED(processor_queue_empty)(processor) && rt_runq.count == 0 && SCHED(fairshare_runq_count)() == 0 &&
4090				(thread == THREAD_NULL || ((thread->state & (TH_WAIT|TH_SUSP)) == TH_WAIT && !thread->wake_active))) {
4091		IDLE_KERNEL_DEBUG_CONSTANT(
4092			MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -1, 0);
4093
4094		machine_track_platform_idle(TRUE);
4095
4096		machine_idle();
4097
4098		machine_track_platform_idle(FALSE);
4099
4100		(void)splsched();
4101
4102		IDLE_KERNEL_DEBUG_CONSTANT(
4103			MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq.count, SCHED(processor_runq_count)(processor), -2, 0);
4104
4105		if (processor->state == PROCESSOR_INACTIVE && !machine_processor_is_inactive(processor))
4106			break;
4107	}
4108
4109	timer_switch(&PROCESSOR_DATA(processor, idle_state),
4110									mach_absolute_time(), &PROCESSOR_DATA(processor, system_state));
4111	PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
4112
4113	pset_lock(pset);
4114
4115	state = processor->state;
4116	if (state == PROCESSOR_DISPATCHING) {
4117		/*
4118		 *	Common case -- cpu dispatched.
4119		 */
4120		new_thread = processor->next_thread;
4121		processor->next_thread = THREAD_NULL;
4122		processor->state = PROCESSOR_RUNNING;
4123
4124		if (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE) ||
4125		    (rt_runq.count > 0 && BASEPRI_RTQUEUES >= new_thread->sched_pri)) {
4126			processor->deadline = UINT64_MAX;
4127
4128			pset_unlock(pset);
4129
4130			thread_lock(new_thread);
4131			KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REDISPATCH), (uintptr_t)thread_tid(new_thread), new_thread->sched_pri, rt_runq.count, 0, 0);
4132			thread_setrun(new_thread, SCHED_HEADQ);
4133			thread_unlock(new_thread);
4134
4135			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4136				MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4137				(uintptr_t)thread_tid(thread), state, 0, 0, 0);
4138
4139			return (THREAD_NULL);
4140		}
4141
4142		pset_unlock(pset);
4143
4144		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4145			MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4146			(uintptr_t)thread_tid(thread), state, (uintptr_t)thread_tid(new_thread), 0, 0);
4147
4148		return (new_thread);
4149	}
4150	else
4151	if (state == PROCESSOR_IDLE) {
4152		remqueue((queue_entry_t)processor);
4153
4154		processor->state = PROCESSOR_RUNNING;
4155		enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
4156	}
4157	else
4158	if (state == PROCESSOR_INACTIVE) {
4159		processor->state = PROCESSOR_RUNNING;
4160		enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
4161	}
4162	else
4163	if (state == PROCESSOR_SHUTDOWN) {
4164		/*
4165		 *	Going off-line.  Force a
4166		 *	reschedule.
4167		 */
4168		if ((new_thread = processor->next_thread) != THREAD_NULL) {
4169			processor->next_thread = THREAD_NULL;
4170			processor->deadline = UINT64_MAX;
4171
4172			pset_unlock(pset);
4173
4174			thread_lock(new_thread);
4175			thread_setrun(new_thread, SCHED_HEADQ);
4176			thread_unlock(new_thread);
4177
4178			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4179				MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4180				(uintptr_t)thread_tid(thread), state, 0, 0, 0);
4181
4182			return (THREAD_NULL);
4183		}
4184	}
4185
4186	pset_unlock(pset);
4187
4188	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4189		MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_END,
4190		(uintptr_t)thread_tid(thread), state, 0, 0, 0);
4191
4192	return (THREAD_NULL);
4193}
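
/*
 * Illustrative sketch (not in the original source): the
 * PROCESSOR_DISPATCHING case above is the consumer side of a hand-off
 * in which a dispatcher parks a thread in processor->next_thread,
 * marks the processor dispatching, and wakes it.  The fragment below is
 * a hypothetical producer written only to mirror what processor_idle()
 * consumes; it assumes machine_signal_idle() as the wake-up primitive
 * and omits the idle-queue bookkeeping done by the real enqueue path.
 */
#if 0
static void
example_dispatch_to_idle(processor_t processor, thread_t thread)
{
	processor_set_t		pset = processor->processor_set;

	/* caller holds the thread locked at splsched */
	pset_lock(pset);

	if (processor->state == PROCESSOR_IDLE) {
		processor->next_thread = thread;
		processor->state = PROCESSOR_DISPATCHING;
	}

	pset_unlock(pset);

	machine_signal_idle(processor);
}
#endif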
4194
4195/*
4196 *	Each processor has a dedicated thread which
4197 *	executes the idle loop when there is no suitable
4198 *	previous context.
4199 */
4200void
4201idle_thread(void)
4202{
4203	processor_t		processor = current_processor();
4204	thread_t		new_thread;
4205
4206	new_thread = processor_idle(THREAD_NULL, processor);
4207	if (new_thread != THREAD_NULL) {
4208		thread_run(processor->idle_thread, (thread_continue_t)idle_thread, NULL, new_thread);
4209		/*NOTREACHED*/
4210	}
4211
4212	thread_block((thread_continue_t)idle_thread);
4213	/*NOTREACHED*/
4214}
4215
4216kern_return_t
4217idle_thread_create(
4218	processor_t		processor)
4219{
4220	kern_return_t	result;
4221	thread_t		thread;
4222	spl_t			s;
4223
4224	result = kernel_thread_create((thread_continue_t)idle_thread, NULL, MAXPRI_KERNEL, &thread);
4225	if (result != KERN_SUCCESS)
4226		return (result);
4227
4228	s = splsched();
4229	thread_lock(thread);
4230	thread->bound_processor = processor;
4231	processor->idle_thread = thread;
4232	thread->sched_pri = thread->priority = IDLEPRI;
4233	thread->state = (TH_RUN | TH_IDLE);
4234	thread_unlock(thread);
4235	splx(s);
4236
4237	thread_deallocate(thread);
4238
4239	return (KERN_SUCCESS);
4240}
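
/*
 * Illustrative sketch (not in the original source): processor bring-up
 * is expected to create the per-processor idle thread before the
 * processor is started.  The surrounding bring-up logic shown here is
 * hypothetical.
 */
#if 0
static kern_return_t
example_processor_bringup(processor_t processor)
{
	kern_return_t	result;

	if (processor->idle_thread == THREAD_NULL) {
		result = idle_thread_create(processor);
		if (result != KERN_SUCCESS)
			return (result);
	}

	/* ... continue with machine-dependent start-up ... */
	return (KERN_SUCCESS);
}
#endif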
4241
4242/*
4243 * sched_startup:
4244 *
4245 * Kicks off scheduler services.
4246 *
4247 * Called at splsched.
4248 */
4249void
4250sched_startup(void)
4251{
4252	kern_return_t	result;
4253	thread_t		thread;
4254
4255	result = kernel_thread_start_priority((thread_continue_t)sched_init_thread,
4256										  (void *)SCHED(maintenance_continuation),
4257										  MAXPRI_KERNEL, &thread);
4258	if (result != KERN_SUCCESS)
4259		panic("sched_startup");
4260
4261	thread_deallocate(thread);
4262
4263	/*
4264	 * Yield to the sched_init_thread once, to
4265	 * initialize our own thread after being switched
4266	 * back to.
4267	 *
4268	 * The current thread is the only other thread
4269	 * active at this point.
4270	 */
4271	thread_block(THREAD_CONTINUE_NULL);
4272}
4273
4274#if defined(CONFIG_SCHED_TRADITIONAL)
4275
4276static uint64_t			sched_tick_deadline = 0;
4277
4278/*
4279 *	sched_traditional_tick_continue:
4280 *
4281 *	Perform periodic bookkeeping functions about ten
4282 *	times per second.
4283 */
4284static void
4285sched_traditional_tick_continue(void)
4286{
4287	uint64_t			abstime = mach_absolute_time();
4288
4289	sched_tick++;
4290
4291	/*
4292	 *  Compute various averages.
4293	 */
4294	compute_averages();
4295
4296	/*
4297	 *  Scan the run queues for threads which
4298	 *  may need to be updated.
4299	 */
4300	thread_update_scan();
4301
4302	if (sched_tick_deadline == 0)
4303		sched_tick_deadline = abstime;
4304
4305	clock_deadline_for_periodic_event(sched_tick_interval, abstime,
4306														&sched_tick_deadline);
4307
4308	assert_wait_deadline((event_t)sched_traditional_tick_continue, THREAD_UNINT, sched_tick_deadline);
4309	thread_block((thread_continue_t)sched_traditional_tick_continue);
4310	/*NOTREACHED*/
4311}
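
/*
 * Illustrative sketch (not in the original source): the same
 * continuation-based pattern can drive any periodic kernel thread --
 * compute the next deadline, assert a wait against it, and block with a
 * continuation so no kernel stack is held while sleeping.  The worker
 * and its interval variable below are hypothetical.
 */
#if 0
static uint64_t		example_deadline = 0;
extern uint64_t		example_interval;	/* absolute-time period, initialized elsewhere */

static void
example_periodic_continue(void)
{
	uint64_t		abstime = mach_absolute_time();

	/* ... periodic work goes here ... */

	if (example_deadline == 0)
		example_deadline = abstime;

	clock_deadline_for_periodic_event(example_interval, abstime,
														&example_deadline);

	assert_wait_deadline((event_t)example_periodic_continue, THREAD_UNINT, example_deadline);
	thread_block((thread_continue_t)example_periodic_continue);
	/*NOTREACHED*/
}
#endif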
4312
4313#endif /* CONFIG_SCHED_TRADITIONAL */
4314
4315void
4316sched_init_thread(void (*continuation)(void))
4317{
4318	thread_block(THREAD_CONTINUE_NULL);
4319
4320	continuation();
4321
4322	/*NOTREACHED*/
4323}
4324
4325#if defined(CONFIG_SCHED_TRADITIONAL)
4326
4327/*
4328 *	thread_update_scan / runq_scan:
4329 *
4330 *	Scan the run queues to account for timesharing threads
4331 *	which need to be updated.
4332 *
4333 *	Scanner runs in two passes.  Pass one squirrels likely
4334 *	threads away in an array, pass two does the update.
4335 *
4336 *	This is necessary because the run queue is locked for
4337 *	the candidate scan, but	the thread is locked for the update.
4338 *
4339 *	Array should be sized to make forward progress, without
4340 *	disabling preemption for long periods.
4341 */
4342
4343#define	THREAD_UPDATE_SIZE		128
4344
4345static thread_t		thread_update_array[THREAD_UPDATE_SIZE];
4346static int			thread_update_count = 0;
4347
4348/*
4349 *	Scan a runq for candidate threads.
4350 *
4351 *	Returns TRUE if retry is needed.
4352 */
4353static boolean_t
4354runq_scan(
4355	run_queue_t				runq)
4356{
4357	register int			count;
4358	register queue_t		q;
4359	register thread_t		thread;
4360
4361	if ((count = runq->count) > 0) {
4362		q = runq->queues + runq->highq;
4363		while (count > 0) {
4364			queue_iterate(q, thread, thread_t, links) {
4365				if (		thread->sched_stamp != sched_tick		&&
4366						(thread->sched_mode == TH_MODE_TIMESHARE)	) {
4367					if (thread_update_count == THREAD_UPDATE_SIZE)
4368						return (TRUE);
4369
4370					thread_update_array[thread_update_count++] = thread;
4371					thread_reference_internal(thread);
4372				}
4373
4374				count--;
4375			}
4376
4377			q--;
4378		}
4379	}
4380
4381	return (FALSE);
4382}
4383
4384static void
4385thread_update_scan(void)
4386{
4387	boolean_t			restart_needed = FALSE;
4388	processor_t			processor = processor_list;
4389	processor_set_t		pset;
4390	thread_t			thread;
4391	spl_t				s;
4392
4393	do {
4394		do {
4395			pset = processor->processor_set;
4396
4397			s = splsched();
4398			pset_lock(pset);
4399
4400			restart_needed = runq_scan(runq_for_processor(processor));
4401
4402			pset_unlock(pset);
4403			splx(s);
4404
4405			if (restart_needed)
4406				break;
4407
4408			thread = processor->idle_thread;
4409			if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
4410				if (thread_update_count == THREAD_UPDATE_SIZE) {
4411					restart_needed = TRUE;
4412					break;
4413				}
4414
4415				thread_update_array[thread_update_count++] = thread;
4416				thread_reference_internal(thread);
4417			}
4418		} while ((processor = processor->processor_list) != NULL);
4419
4420	    /*
4421	     *	Ok, we now have a collection of candidates -- fix them.
4422	     */
4423	    while (thread_update_count > 0) {
4424			thread = thread_update_array[--thread_update_count];
4425			thread_update_array[thread_update_count] = THREAD_NULL;
4426
4427			s = splsched();
4428			thread_lock(thread);
4429			if (!(thread->state & TH_WAIT)) {
4430				if (SCHED(can_update_priority)(thread))
4431					SCHED(update_priority)(thread);
4432			}
4433			thread_unlock(thread);
4434			splx(s);
4435
4436			thread_deallocate(thread);
4437	    }
4438	} while (restart_needed);
4439}
4440
4441#endif /* CONFIG_SCHED_TRADITIONAL */
4442
4443boolean_t
4444thread_eager_preemption(thread_t thread)
4445{
4446	return ((thread->sched_flags & TH_SFLAG_EAGERPREEMPT) != 0);
4447}
4448
4449void
4450thread_set_eager_preempt(thread_t thread)
4451{
4452	spl_t x;
4453	processor_t p;
4454	ast_t ast = AST_NONE;
4455
4456	x = splsched();
4457	p = current_processor();
4458
4459	thread_lock(thread);
4460	thread->sched_flags |= TH_SFLAG_EAGERPREEMPT;
4461
4462	if (thread == current_thread()) {
4463		thread_unlock(thread);
4464
4465		ast = csw_check(p);
4466		if (ast != AST_NONE) {
4467			(void) thread_block_reason(THREAD_CONTINUE_NULL, NULL, ast);
4468		}
4469	} else {
4470		p = thread->last_processor;
4471
4472		if (p != PROCESSOR_NULL	&& p->state == PROCESSOR_RUNNING &&
4473			p->active_thread == thread) {
4474			cause_ast_check(p);
4475		}
4476
4477		thread_unlock(thread);
4478	}
4479
4480	splx(x);
4481}
4482
4483void
4484thread_clear_eager_preempt(thread_t thread)
4485{
4486	spl_t x;
4487
4488	x = splsched();
4489	thread_lock(thread);
4490
4491	thread->sched_flags &= ~TH_SFLAG_EAGERPREEMPT;
4492
4493	thread_unlock(thread);
4494	splx(x);
4495}
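
/*
 * Illustrative sketch (not in the original source): a hypothetical
 * latency-sensitive path brackets its time-critical interval with the
 * eager-preemption flag.  Setting it forces an immediate preemption
 * check (as the code above does when the target is the current thread);
 * clearing it restores normal behavior.
 */
#if 0
static void
example_latency_sensitive_work(void)
{
	thread_t	self = current_thread();

	thread_set_eager_preempt(self);

	/* ... time-critical work ... */

	thread_clear_eager_preempt(self);
}
#endif
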
4496/*
4497 * Scheduling statistics
4498 */
4499void
4500sched_stats_handle_csw(processor_t processor, int reasons, int selfpri, int otherpri)
4501{
4502	struct processor_sched_statistics *stats;
4503	boolean_t to_realtime = FALSE;
4504
4505	stats = &processor->processor_data.sched_stats;
4506	stats->csw_count++;
4507
4508	if (otherpri >= BASEPRI_REALTIME) {
4509		stats->rt_sched_count++;
4510		to_realtime = TRUE;
4511	}
4512
4513	if ((reasons & AST_PREEMPT) != 0) {
4514		stats->preempt_count++;
4515
4516		if (selfpri >= BASEPRI_REALTIME) {
4517			stats->preempted_rt_count++;
4518		}
4519
4520		if (to_realtime) {
4521			stats->preempted_by_rt_count++;
4522		}
4523
4524	}
4525}
4526
4527void
4528sched_stats_handle_runq_change(struct runq_stats *stats, int old_count)
4529{
4530	uint64_t timestamp = mach_absolute_time();
4531
4532	stats->count_sum += (timestamp - stats->last_change_timestamp) * old_count;
4533	stats->last_change_timestamp = timestamp;
4534}
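
/*
 * Illustrative sketch (not in the original source): callers record a
 * run-queue statistics change with the *old* count immediately before
 * modifying it, as the real-time dequeue path earlier in this file
 * does.  The hypothetical enqueue helper below omits the bitmap and
 * highq bookkeeping a real enqueue performs.
 */
#if 0
static void
example_runq_enqueue_tail(run_queue_t rq, queue_t queue, thread_t thread)
{
	/* caller holds the appropriate run-queue lock */
	SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
	enqueue_tail(queue, (queue_entry_t)thread);
	rq->count++;
}
#endif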
4535
4536/*
4537 *     For calls from assembly code
4538 */
4539#undef thread_wakeup
4540void
4541thread_wakeup(
4542       event_t         x);
4543
4544void
4545thread_wakeup(
4546       event_t         x)
4547{
4548       thread_wakeup_with_result(x, THREAD_AWAKENED);
4549}
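
/*
 * Illustrative sketch (not in the original source): the classic event
 * handshake that pairs with thread_wakeup().  A waiter asserts a wait
 * on an event and blocks; the waker posts the same event.  The event
 * and condition below are hypothetical, and real code must re-check the
 * condition under the lock that protects it to avoid a lost wakeup.
 */
#if 0
static int	example_condition;

static void
example_waiter(void)
{
	assert_wait((event_t)&example_condition, THREAD_UNINT);
	thread_block(THREAD_CONTINUE_NULL);
}

static void
example_waker(void)
{
	example_condition = 1;
	thread_wakeup((event_t)&example_condition);
}
#endif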
4550
4551boolean_t
4552preemption_enabled(void)
4553{
4554	return (get_preemption_level() == 0 && ml_get_interrupts_enabled());
4555}
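
/*
 * Illustrative sketch (not in the original source): code that may block
 * can sanity-check its context with preemption_enabled() before doing
 * so.  The helper below is hypothetical.
 */
#if 0
static void
example_may_block(void)
{
	assert(preemption_enabled());	/* not at splsched, no spinlocks held */

	thread_block(THREAD_CONTINUE_NULL);
}
#endif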
4556
4557#if	DEBUG
4558static boolean_t
4559thread_runnable(
4560	thread_t	thread)
4561{
4562	return ((thread->state & (TH_RUN|TH_WAIT)) == TH_RUN);
4563}
4564#endif	/* DEBUG */
4565
4566static void
4567sched_timer_deadline_tracking_init(void) {
4568	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_1_DEFAULT, &timer_deadline_tracking_bin_1);
4569	nanoseconds_to_absolutetime(TIMER_DEADLINE_TRACKING_BIN_2_DEFAULT, &timer_deadline_tracking_bin_2);
4570}
4571