1/*
2 * Copyright (c) 1993-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Timer interrupt callout module.
30 */
31
32#include <mach/mach_types.h>
33
34#include <kern/clock.h>
35#include <kern/processor.h>
36#include <kern/timer_call.h>
37#include <kern/timer_queue.h>
38#include <kern/call_entry.h>
39#include <kern/thread.h>
40
41#include <sys/kdebug.h>
42
43#if CONFIG_DTRACE
44#include <mach/sdt.h>
45#endif
46
47
48#if DEBUG
49#define TIMER_ASSERT	1
50#endif
51
52//#define TIMER_ASSERT	1
53//#define TIMER_DBG	1
54
55#if TIMER_DBG
56#define DBG(x...) kprintf("DBG: " x);
57#else
58#define DBG(x...)
59#endif
60
61#if TIMER_TRACE
62#define TIMER_KDEBUG_TRACE	KERNEL_DEBUG_CONSTANT_IST
63#else
64#define TIMER_KDEBUG_TRACE(x...)
65#endif
66
67
68lck_grp_t               timer_call_lck_grp;
69lck_attr_t              timer_call_lck_attr;
70lck_grp_attr_t          timer_call_lck_grp_attr;
71
72lck_grp_t               timer_longterm_lck_grp;
73lck_attr_t              timer_longterm_lck_attr;
74lck_grp_attr_t          timer_longterm_lck_grp_attr;
75
76
77#define timer_queue_lock_spin(queue)					\
78	lck_mtx_lock_spin_always(&queue->lock_data)
79
80#define timer_queue_unlock(queue)		\
81	lck_mtx_unlock_always(&queue->lock_data)
82
83
84#define QUEUE(x)	((queue_t)(x))
85#define MPQUEUE(x)	((mpqueue_head_t *)(x))
86#define TIMER_CALL(x)	((timer_call_t)(x))
87#define TCE(x)		(&(x->call_entry))
88/*
89 * The longterm timer object is a global structure holding all timers
90 * beyond the short-term, local timer queue threshold. The boot processor
91 * is responsible for moving each timer to its local timer queue
92 * if and when that timer becomes due within the threshold.
93 */
94#define TIMER_LONGTERM_NONE		EndOfAllTime
95#if defined(__x86_64__)
96#define	TIMER_LONGTERM_THRESHOLD	(1ULL * NSEC_PER_SEC)
97#else
98#define	TIMER_LONGTERM_THRESHOLD	TIMER_LONGTERM_NONE
99#endif
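
/*
 * Illustrative sketch only (not used by this file): a timer call is treated
 * as longterm when its deadline lies beyond "now + threshold interval" --
 * the same test performed by timer_longterm_enqueue_unlocked() below,
 * using names declared later in this file:
 *
 *	uint64_t now = mach_absolute_time();
 *	uint64_t longterm_threshold = now + timer_longterm.threshold.interval;
 *	boolean_t is_longterm =
 *	    (timer_longterm.threshold.interval != TIMER_LONGTERM_NONE) &&
 *	    ((callout_flags & TIMER_CALL_LOCAL) == 0) &&
 *	    (deadline > longterm_threshold);
 *
 * Only such timers are parked on the global longterm queue; the boot
 * processor escalates each one back to its local queue as it comes within
 * the threshold.
 */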
100
101typedef struct {
102	uint64_t	interval;	/* longterm timer interval */
	uint64_t	margin;		/* fudge factor (10% of interval) */
104	uint64_t	deadline;	/* first/soonest longterm deadline */
105	uint64_t	preempted;	/* sooner timer has pre-empted */
106	timer_call_t	call;		/* first/soonest longterm timer call */
	uint64_t	deadline_set;	/* deadline as actually programmed (adjusted) */
108	timer_call_data_t timer;	/* timer used by threshold management */
109					/* Stats: */
110	uint64_t	scans;		/*   num threshold timer scans */
111	uint64_t	preempts;	/*   num threshold reductions */
112	uint64_t	latency;	/*   average threshold latency */
113	uint64_t	latency_min;	/*   minimum threshold latency */
114	uint64_t	latency_max;	/*   maximum threshold latency */
115} threshold_t;
116
117typedef struct {
118	mpqueue_head_t	queue;		/* longterm timer list */
119	uint64_t	enqueues;	/* num timers queued */
120	uint64_t	dequeues;	/* num timers dequeued */
121	uint64_t	escalates;	/* num timers becoming shortterm */
122	uint64_t	scan_time;	/* last time the list was scanned */
123	threshold_t	threshold;	/* longterm timer threshold */
124} timer_longterm_t;
125
126timer_longterm_t		timer_longterm;
127
128static mpqueue_head_t		*timer_longterm_queue = NULL;
129
130static void			timer_longterm_init(void);
131static void			timer_longterm_callout(
132					timer_call_param_t	p0,
133					timer_call_param_t	p1);
134extern void			timer_longterm_scan(
135					timer_longterm_t	*tlp,
136					uint64_t		now);
137static void			timer_longterm_update(
138					timer_longterm_t *tlp);
139static void			timer_longterm_update_locked(
140					timer_longterm_t *tlp);
141static mpqueue_head_t *		timer_longterm_enqueue_unlocked(
142					timer_call_t		call,
143					uint64_t		now,
144					uint64_t		deadline,
145					mpqueue_head_t **	old_queue,
146					uint64_t		soft_deadline,
147					uint64_t		ttd,
148					timer_call_param_t	param1,
149					uint32_t		callout_flags);
150static void			timer_longterm_dequeued_locked(
151					timer_call_t		call);
152
153uint64_t past_deadline_timers;
154uint64_t past_deadline_deltas;
155uint64_t past_deadline_longest;
156uint64_t past_deadline_shortest = ~0ULL;
157enum {PAST_DEADLINE_TIMER_ADJUSTMENT_NS = 10 * 1000};
158
159uint64_t past_deadline_timer_adjustment;
160
161static boolean_t timer_call_enter_internal(timer_call_t call, timer_call_param_t param1, uint64_t deadline, uint64_t leeway, uint32_t flags, boolean_t ratelimited);
162boolean_t 	mach_timer_coalescing_enabled = TRUE;
163
164mpqueue_head_t	*timer_call_enqueue_deadline_unlocked(
165			timer_call_t		call,
166			mpqueue_head_t		*queue,
167			uint64_t		deadline,
168			uint64_t		soft_deadline,
169			uint64_t		ttd,
170			timer_call_param_t	param1,
171			uint32_t		flags);
172
173mpqueue_head_t	*timer_call_dequeue_unlocked(
174			timer_call_t 		call);
175
176timer_coalescing_priority_params_t tcoal_prio_params;
177
178#if TCOAL_PRIO_STATS
179int32_t nc_tcl, rt_tcl, bg_tcl, kt_tcl, fp_tcl, ts_tcl, qos_tcl;
180#define TCOAL_PRIO_STAT(x) (x++)
181#else
182#define TCOAL_PRIO_STAT(x)
183#endif
184
185static void
186timer_call_init_abstime(void)
187{
188	int i;
189	uint64_t result;
190	timer_coalescing_priority_params_ns_t * tcoal_prio_params_init = timer_call_get_priority_params();
191	nanoseconds_to_absolutetime(PAST_DEADLINE_TIMER_ADJUSTMENT_NS, &past_deadline_timer_adjustment);
192	nanoseconds_to_absolutetime(tcoal_prio_params_init->idle_entry_timer_processing_hdeadline_threshold_ns, &result);
193	tcoal_prio_params.idle_entry_timer_processing_hdeadline_threshold_abstime = (uint32_t)result;
194	nanoseconds_to_absolutetime(tcoal_prio_params_init->interrupt_timer_coalescing_ilat_threshold_ns, &result);
195	tcoal_prio_params.interrupt_timer_coalescing_ilat_threshold_abstime = (uint32_t)result;
196	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_resort_threshold_ns, &result);
197	tcoal_prio_params.timer_resort_threshold_abstime = (uint32_t)result;
198	tcoal_prio_params.timer_coalesce_rt_shift = tcoal_prio_params_init->timer_coalesce_rt_shift;
199	tcoal_prio_params.timer_coalesce_bg_shift = tcoal_prio_params_init->timer_coalesce_bg_shift;
200	tcoal_prio_params.timer_coalesce_kt_shift = tcoal_prio_params_init->timer_coalesce_kt_shift;
201	tcoal_prio_params.timer_coalesce_fp_shift = tcoal_prio_params_init->timer_coalesce_fp_shift;
202	tcoal_prio_params.timer_coalesce_ts_shift = tcoal_prio_params_init->timer_coalesce_ts_shift;
203
204	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_rt_ns_max,
205	    &tcoal_prio_params.timer_coalesce_rt_abstime_max);
206	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_bg_ns_max,
207	    &tcoal_prio_params.timer_coalesce_bg_abstime_max);
208	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_kt_ns_max,
209	    &tcoal_prio_params.timer_coalesce_kt_abstime_max);
210	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_fp_ns_max,
211	    &tcoal_prio_params.timer_coalesce_fp_abstime_max);
212	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_ts_ns_max,
213	    &tcoal_prio_params.timer_coalesce_ts_abstime_max);
214
215	for (i = 0; i < NUM_LATENCY_QOS_TIERS; i++) {
216		tcoal_prio_params.latency_qos_scale[i] = tcoal_prio_params_init->latency_qos_scale[i];
217		nanoseconds_to_absolutetime(tcoal_prio_params_init->latency_qos_ns_max[i],
218		    &tcoal_prio_params.latency_qos_abstime_max[i]);
219		tcoal_prio_params.latency_tier_rate_limited[i] = tcoal_prio_params_init->latency_tier_rate_limited[i];
220	}
221}
222
223
224void
225timer_call_init(void)
226{
227	lck_attr_setdefault(&timer_call_lck_attr);
228	lck_grp_attr_setdefault(&timer_call_lck_grp_attr);
229	lck_grp_init(&timer_call_lck_grp, "timer_call", &timer_call_lck_grp_attr);
230
231	timer_longterm_init();
232	timer_call_init_abstime();
233}
234
235
236void
237timer_call_queue_init(mpqueue_head_t *queue)
238{
239	DBG("timer_call_queue_init(%p)\n", queue);
240	mpqueue_init(queue, &timer_call_lck_grp, &timer_call_lck_attr);
241}
242
243
244void
245timer_call_setup(
246	timer_call_t			call,
247	timer_call_func_t		func,
248	timer_call_param_t		param0)
249{
250	DBG("timer_call_setup(%p,%p,%p)\n", call, func, param0);
251	call_entry_setup(TCE(call), func, param0);
252	simple_lock_init(&(call)->lock, 0);
253	call->async_dequeue = FALSE;
254}
255#if TIMER_ASSERT
256static __inline__ mpqueue_head_t *
257timer_call_entry_dequeue(
258	timer_call_t		entry)
259{
260        mpqueue_head_t	*old_queue = MPQUEUE(TCE(entry)->queue);
261
262	if (!hw_lock_held((hw_lock_t)&entry->lock))
263		panic("_call_entry_dequeue() "
264			"entry %p is not locked\n", entry);
265	/*
266	 * XXX The queue lock is actually a mutex in spin mode
267	 *     but there's no way to test for it being held
268	 *     so we pretend it's a spinlock!
269	 */
270	if (!hw_lock_held((hw_lock_t)&old_queue->lock_data))
271		panic("_call_entry_dequeue() "
272			"queue %p is not locked\n", old_queue);
273
274	call_entry_dequeue(TCE(entry));
275	old_queue->count--;
276
277	return (old_queue);
278}
279
280static __inline__ mpqueue_head_t *
281timer_call_entry_enqueue_deadline(
282	timer_call_t		entry,
283	mpqueue_head_t		*queue,
284	uint64_t		deadline)
285{
286	mpqueue_head_t	*old_queue = MPQUEUE(TCE(entry)->queue);
287
288	if (!hw_lock_held((hw_lock_t)&entry->lock))
289		panic("_call_entry_enqueue_deadline() "
290			"entry %p is not locked\n", entry);
291	/* XXX More lock pretense:  */
292	if (!hw_lock_held((hw_lock_t)&queue->lock_data))
293		panic("_call_entry_enqueue_deadline() "
294			"queue %p is not locked\n", queue);
295	if (old_queue != NULL && old_queue != queue)
296		panic("_call_entry_enqueue_deadline() "
297			"old_queue %p != queue", old_queue);
298
299	call_entry_enqueue_deadline(TCE(entry), QUEUE(queue), deadline);
300
301/* For efficiency, track the earliest soft deadline on the queue, so that
302 * fuzzy decisions can be made without lock acquisitions.
303 */
304	timer_call_t thead = (timer_call_t)queue_first(&queue->head);
305
306	queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;
307
308	if (old_queue)
309		old_queue->count--;
310	queue->count++;
311
312	return (old_queue);
313}
314
315#else
316
317static __inline__ mpqueue_head_t *
318timer_call_entry_dequeue(
319	timer_call_t		entry)
320{
321	mpqueue_head_t	*old_queue = MPQUEUE(TCE(entry)->queue);
322
323	call_entry_dequeue(TCE(entry));
324	old_queue->count--;
325
326	return old_queue;
327}
328
329static __inline__ mpqueue_head_t *
330timer_call_entry_enqueue_deadline(
331	timer_call_t			entry,
332	mpqueue_head_t			*queue,
333	uint64_t			deadline)
334{
335	mpqueue_head_t	*old_queue = MPQUEUE(TCE(entry)->queue);
336
337	call_entry_enqueue_deadline(TCE(entry), QUEUE(queue), deadline);
338
339	/* For efficiency, track the earliest soft deadline on the queue,
340	 * so that fuzzy decisions can be made without lock acquisitions.
341	 */
342
343	timer_call_t thead = (timer_call_t)queue_first(&queue->head);
344	queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;
345
346	if (old_queue)
347		old_queue->count--;
348	queue->count++;
349
350	return old_queue;
351}
352
353#endif
354
355static __inline__ void
356timer_call_entry_enqueue_tail(
357	timer_call_t			entry,
358	mpqueue_head_t			*queue)
359{
360	call_entry_enqueue_tail(TCE(entry), QUEUE(queue));
361	queue->count++;
362	return;
363}
364
365/*
366 * Remove timer entry from its queue but don't change the queue pointer
367 * and set the async_dequeue flag. This is locking case 2b.
368 */
369static __inline__ void
370timer_call_entry_dequeue_async(
371	timer_call_t		entry)
372{
373	mpqueue_head_t	*old_queue = MPQUEUE(TCE(entry)->queue);
374	if (old_queue) {
375		old_queue->count--;
376		(void) remque(qe(entry));
377		entry->async_dequeue = TRUE;
378	}
379	return;
380}
381
382#if TIMER_ASSERT
383unsigned timer_call_enqueue_deadline_unlocked_async1;
384unsigned timer_call_enqueue_deadline_unlocked_async2;
385#endif
386/*
387 * Assumes call_entry and queues unlocked, interrupts disabled.
388 */
389__inline__ mpqueue_head_t *
390timer_call_enqueue_deadline_unlocked(
391	timer_call_t 			call,
392	mpqueue_head_t			*queue,
393	uint64_t			deadline,
394	uint64_t			soft_deadline,
395	uint64_t			ttd,
396	timer_call_param_t		param1,
397	uint32_t			callout_flags)
398{
399	call_entry_t	entry = TCE(call);
400	mpqueue_head_t	*old_queue;
401
402	DBG("timer_call_enqueue_deadline_unlocked(%p,%p,)\n", call, queue);
403
404	simple_lock(&call->lock);
405
406	old_queue = MPQUEUE(entry->queue);
407
408	if (old_queue != NULL) {
409		timer_queue_lock_spin(old_queue);
410		if (call->async_dequeue) {
411			/* collision (1c): timer already dequeued, clear flag */
412#if TIMER_ASSERT
413			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
414				DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
415				call,
416				call->async_dequeue,
417				TCE(call)->queue,
418				0x1c, 0);
419			timer_call_enqueue_deadline_unlocked_async1++;
420#endif
421			call->async_dequeue = FALSE;
422			entry->queue = NULL;
423		} else if (old_queue != queue) {
424			timer_call_entry_dequeue(call);
425#if TIMER_ASSERT
426			timer_call_enqueue_deadline_unlocked_async2++;
427#endif
428		}
429		if (old_queue == timer_longterm_queue)
430			timer_longterm_dequeued_locked(call);
431		if (old_queue != queue) {
432			timer_queue_unlock(old_queue);
433			timer_queue_lock_spin(queue);
434		}
435	} else {
436		timer_queue_lock_spin(queue);
437	}
438
439	call->soft_deadline = soft_deadline;
440	call->flags = callout_flags;
441	TCE(call)->param1 = param1;
442	call->ttd = ttd;
443
444	timer_call_entry_enqueue_deadline(call, queue, deadline);
445	timer_queue_unlock(queue);
446	simple_unlock(&call->lock);
447
448	return (old_queue);
449}
450
451#if TIMER_ASSERT
452unsigned timer_call_dequeue_unlocked_async1;
453unsigned timer_call_dequeue_unlocked_async2;
454#endif
455mpqueue_head_t *
456timer_call_dequeue_unlocked(
457	timer_call_t 		call)
458{
459	call_entry_t	entry = TCE(call);
460	mpqueue_head_t	*old_queue;
461
462	DBG("timer_call_dequeue_unlocked(%p)\n", call);
463
464	simple_lock(&call->lock);
465	old_queue = MPQUEUE(entry->queue);
466#if TIMER_ASSERT
467	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
468		DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
469		call,
470		call->async_dequeue,
471		TCE(call)->queue,
472		0, 0);
473#endif
474	if (old_queue != NULL) {
475		timer_queue_lock_spin(old_queue);
476		if (call->async_dequeue) {
477			/* collision (1c): timer already dequeued, clear flag */
478#if TIMER_ASSERT
479			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
480				DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
481				call,
482				call->async_dequeue,
483				TCE(call)->queue,
484				0x1c, 0);
485			timer_call_dequeue_unlocked_async1++;
486#endif
487			call->async_dequeue = FALSE;
488			entry->queue = NULL;
489		} else {
490			timer_call_entry_dequeue(call);
491		}
492		if (old_queue == timer_longterm_queue)
493			timer_longterm_dequeued_locked(call);
494		timer_queue_unlock(old_queue);
495	}
496	simple_unlock(&call->lock);
497	return (old_queue);
498}
499
500
501/*
502 * Timer call entry locking model
503 * ==============================
504 *
505 * Timer call entries are linked on per-cpu timer queues which are protected
506 * by the queue lock and the call entry lock. The locking protocol is:
507 *
508 *  0) The canonical locking order is timer call entry followed by queue.
509 *
510 *  1) With only the entry lock held, entry.queue is valid:
511 *    1a) NULL: the entry is not queued, or
512 *    1b) non-NULL: this queue must be locked before the entry is modified.
513 *        After locking the queue, the call.async_dequeue flag must be checked:
514 *    1c) TRUE: the entry was removed from the queue by another thread
515 *	        and we must NULL the entry.queue and reset this flag, or
516 *    1d) FALSE: (ie. queued), the entry can be manipulated.
517 *
518 *  2) If a queue lock is obtained first, the queue is stable:
519 *    2a) If a try-lock of a queued entry succeeds, the call can be operated on
520 *	  and dequeued.
521 *    2b) If a try-lock fails, it indicates that another thread is attempting
522 *        to change the entry and move it to a different position in this queue
 *        or to a different queue. The entry can be dequeued but it should not be
524 *        operated upon since it is being changed. Furthermore, we don't null
525 *	  the entry.queue pointer (protected by the entry lock we don't own).
526 *	  Instead, we set the async_dequeue flag -- see (1c).
527 *    2c) Same as 2b but occurring when a longterm timer is matured.
528 *  3) A callout's parameters (deadline, flags, parameters, soft deadline &c.)
 *     should be manipulated with the appropriate timer queue lock held,
 *     so that queue traversals do not observe inconsistent updates to an
 *     in-flight callout.
532 */
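
/*
 * Minimal sketch of the two orderings above (illustrative only; it mirrors
 * timer_call_dequeue_unlocked() and timer_queue_shutdown() in this file):
 *
 *	// Case 1: entry lock first, then its queue (canonical order).
 *	simple_lock(&call->lock);
 *	old_queue = MPQUEUE(TCE(call)->queue);
 *	if (old_queue != NULL) {
 *		timer_queue_lock_spin(old_queue);
 *		if (call->async_dequeue) {	// case 1c: already removed
 *			call->async_dequeue = FALSE;
 *			TCE(call)->queue = NULL;
 *		} else {			// case 1d: still queued
 *			timer_call_entry_dequeue(call);
 *		}
 *		timer_queue_unlock(old_queue);
 *	}
 *	simple_unlock(&call->lock);
 *
 *	// Case 2b: queue lock already held, entry try-lock fails.
 *	if (!simple_lock_try(&call->lock)) {
 *		timer_call_entry_dequeue_async(call);	// unlinks the entry but
 *							// leaves entry.queue set and
 *							// marks async_dequeue for
 *							// later handling as case 1c
 *	}
 */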
533
534/*
535 * Inlines timer_call_entry_dequeue() and timer_call_entry_enqueue_deadline()
536 * cast between pointer types (mpqueue_head_t *) and (queue_t) so that
537 * we can use the call_entry_dequeue() and call_entry_enqueue_deadline()
538 * methods to operate on timer_call structs as if they are call_entry structs.
539 * These structures are identical except for their queue head pointer fields.
540 *
541 * In the debug case, we assert that the timer call locking protocol
542 * is being obeyed.
543 */
544
545static boolean_t
546timer_call_enter_internal(
547	timer_call_t 		call,
548	timer_call_param_t	param1,
549	uint64_t 		deadline,
550	uint64_t 		leeway,
551	uint32_t 		flags,
552	boolean_t		ratelimited)
553{
554	mpqueue_head_t		*queue = NULL;
555	mpqueue_head_t		*old_queue;
556	spl_t			s;
557	uint64_t 		slop;
558	uint32_t		urgency;
559	uint64_t		sdeadline, ttd;
560
561	s = splclock();
562
563	sdeadline = deadline;
564	uint64_t ctime = mach_absolute_time();
565
566	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
567        	DECR_TIMER_ENTER | DBG_FUNC_START,
568		call,
569		param1, deadline, flags, 0);
570
571	urgency = (flags & TIMER_CALL_URGENCY_MASK);
572
573	boolean_t slop_ratelimited = FALSE;
574	slop = timer_call_slop(deadline, ctime, urgency, current_thread(), &slop_ratelimited);
575
576	if ((flags & TIMER_CALL_LEEWAY) != 0 && leeway > slop)
577		slop = leeway;
578
579	if (UINT64_MAX - deadline <= slop) {
580		deadline = UINT64_MAX;
581	} else {
582		deadline += slop;
583	}
584
585	if (__improbable(deadline < ctime)) {
586		uint64_t delta = (ctime - deadline);
587
588		past_deadline_timers++;
589		past_deadline_deltas += delta;
590		if (delta > past_deadline_longest)
			past_deadline_longest = delta;
592		if (delta < past_deadline_shortest)
593			past_deadline_shortest = delta;
594
595		deadline = ctime + past_deadline_timer_adjustment;
596		sdeadline = deadline;
597	}
598
599	if (ratelimited || slop_ratelimited) {
600		flags |= TIMER_CALL_RATELIMITED;
601	} else {
602		flags &= ~TIMER_CALL_RATELIMITED;
603	}
604
605	ttd =  sdeadline - ctime;
606#if CONFIG_DTRACE
607	DTRACE_TMR7(callout__create, timer_call_func_t, TCE(call)->func,
608	timer_call_param_t, TCE(call)->param0, uint32_t, flags,
609	    (deadline - sdeadline),
610	    (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call);
611#endif
612
613	/* Program timer callout parameters under the appropriate per-CPU or
614	 * longterm queue lock. The callout may have been previously enqueued
615	 * and in-flight on this or another timer queue.
616	 */
617	if (!ratelimited && !slop_ratelimited) {
618		queue = timer_longterm_enqueue_unlocked(call, ctime, deadline, &old_queue, sdeadline, ttd, param1, flags);
619	}
620
621	if (queue == NULL) {
622		queue = timer_queue_assign(deadline);
623		old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline, sdeadline, ttd, param1, flags);
624	}
625
626#if TIMER_TRACE
627	TCE(call)->entry_time = ctime;
628#endif
629
630	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
631        	DECR_TIMER_ENTER | DBG_FUNC_END,
632		call,
633		(old_queue != NULL), deadline, queue->count, 0);
634
635	splx(s);
636
637	return (old_queue != NULL);
638}
639
640/*
641 * timer_call_*()
642 *	return boolean indicating whether the call was previously queued.
643 */
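
/*
 * Hedged usage sketch (a typical client pattern, not part of this module;
 * my_timer and my_timeout are hypothetical names):
 *
 *	static timer_call_data_t	my_timer;
 *	static void			my_timeout(timer_call_param_t p0,
 *						   timer_call_param_t p1);
 *
 *	timer_call_setup(&my_timer, my_timeout, NULL);
 *
 *	uint64_t deadline;
 *	clock_interval_to_deadline(100, NSEC_PER_MSEC, &deadline);
 *	(void) timer_call_enter(&my_timer, deadline, TIMER_CALL_SYS_NORMAL);
 *	...
 *	if (timer_call_cancel(&my_timer)) {
 *		// the callout was still pending and has been removed
 *	}
 */
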
644boolean_t
645timer_call_enter(
646	timer_call_t		call,
647	uint64_t		deadline,
648	uint32_t		flags)
649{
650	return timer_call_enter_internal(call, NULL, deadline, 0, flags, FALSE);
651}
652
653boolean_t
654timer_call_enter1(
655	timer_call_t		call,
656	timer_call_param_t	param1,
657	uint64_t		deadline,
658	uint32_t		flags)
659{
660	return timer_call_enter_internal(call, param1, deadline, 0, flags, FALSE);
661}
662
663boolean_t
664timer_call_enter_with_leeway(
665	timer_call_t		call,
666	timer_call_param_t	param1,
667	uint64_t		deadline,
668	uint64_t		leeway,
669	uint32_t		flags,
670	boolean_t		ratelimited)
671{
672	return timer_call_enter_internal(call, param1, deadline, leeway, flags, ratelimited);
673}
674
675boolean_t
676timer_call_cancel(
677	timer_call_t		call)
678{
679	mpqueue_head_t		*old_queue;
680	spl_t			s;
681
682	s = splclock();
683
684	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
685        	DECR_TIMER_CANCEL | DBG_FUNC_START,
686		call,
687		TCE(call)->deadline, call->soft_deadline, call->flags, 0);
688
689	old_queue = timer_call_dequeue_unlocked(call);
690
691	if (old_queue != NULL) {
692		timer_queue_lock_spin(old_queue);
693		if (!queue_empty(&old_queue->head)) {
694			timer_queue_cancel(old_queue, TCE(call)->deadline, CE(queue_first(&old_queue->head))->deadline);
695 			timer_call_t thead = (timer_call_t)queue_first(&old_queue->head);
696 			old_queue->earliest_soft_deadline = thead->flags & TIMER_CALL_RATELIMITED ? TCE(thead)->deadline : thead->soft_deadline;
697		}
698		else {
699			timer_queue_cancel(old_queue, TCE(call)->deadline, UINT64_MAX);
700			old_queue->earliest_soft_deadline = UINT64_MAX;
701		}
702		timer_queue_unlock(old_queue);
703	}
704	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
705        	DECR_TIMER_CANCEL | DBG_FUNC_END,
706		call,
707		old_queue,
708		TCE(call)->deadline - mach_absolute_time(),
709		TCE(call)->deadline - TCE(call)->entry_time, 0);
710	splx(s);
711
712#if CONFIG_DTRACE
713	DTRACE_TMR6(callout__cancel, timer_call_func_t, TCE(call)->func,
714	    timer_call_param_t, TCE(call)->param0, uint32_t, call->flags, 0,
715	    (call->ttd >> 32), (unsigned) (call->ttd & 0xFFFFFFFF));
716#endif
717
718	return (old_queue != NULL);
719}
720
721static uint32_t	timer_queue_shutdown_lock_skips;
722static uint32_t timer_queue_shutdown_discarded;
723
724void
725timer_queue_shutdown(
726	mpqueue_head_t		*queue)
727{
728	timer_call_t		call;
729	mpqueue_head_t		*new_queue;
730	spl_t			s;
731
732
733	DBG("timer_queue_shutdown(%p)\n", queue);
734
735	s = splclock();
736
737	/* Note comma operator in while expression re-locking each iteration */
738	while (timer_queue_lock_spin(queue), !queue_empty(&queue->head)) {
739		call = TIMER_CALL(queue_first(&queue->head));
740
741		if (!simple_lock_try(&call->lock)) {
742			/*
743			 * case (2b) lock order inversion, dequeue and skip
744			 * Don't change the call_entry queue back-pointer
745			 * but set the async_dequeue field.
746			 */
747			timer_queue_shutdown_lock_skips++;
748			timer_call_entry_dequeue_async(call);
749#if TIMER_ASSERT
750			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
751				DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
752				call,
753				call->async_dequeue,
754				TCE(call)->queue,
755				0x2b, 0);
756#endif
757			timer_queue_unlock(queue);
758			continue;
759		}
760
761		boolean_t call_local = ((call->flags & TIMER_CALL_LOCAL) != 0);
762
763		/* remove entry from old queue */
764		timer_call_entry_dequeue(call);
765		timer_queue_unlock(queue);
766
767		if (call_local == FALSE) {
768			/* and queue it on new, discarding LOCAL timers */
769			new_queue = timer_queue_assign(TCE(call)->deadline);
770			timer_queue_lock_spin(new_queue);
771			timer_call_entry_enqueue_deadline(
772				call, new_queue, TCE(call)->deadline);
773			timer_queue_unlock(new_queue);
774		} else {
775			timer_queue_shutdown_discarded++;
776		}
777
778		/* The only lingering LOCAL timer should be this thread's
779		 * quantum expiration timer.
780		 */
781		assert((call_local == FALSE) ||
782		    (TCE(call)->func == thread_quantum_expire));
783
784		simple_unlock(&call->lock);
785	}
786
787	timer_queue_unlock(queue);
788	splx(s);
789}
790
791static uint32_t	timer_queue_expire_lock_skips;
792uint64_t
793timer_queue_expire_with_options(
794	mpqueue_head_t		*queue,
795	uint64_t		deadline,
796	boolean_t		rescan)
797{
798	timer_call_t	call = NULL;
799	uint32_t tc_iterations = 0;
800	DBG("timer_queue_expire(%p,)\n", queue);
801
802	uint64_t cur_deadline = deadline;
803	timer_queue_lock_spin(queue);
804
805	while (!queue_empty(&queue->head)) {
806		/* Upon processing one or more timer calls, refresh the
807		 * deadline to account for time elapsed in the callout
808		 */
809		if (++tc_iterations > 1)
810			cur_deadline = mach_absolute_time();
811
812		if (call == NULL)
813			call = TIMER_CALL(queue_first(&queue->head));
814
815		if (call->soft_deadline <= cur_deadline) {
816			timer_call_func_t		func;
817			timer_call_param_t		param0, param1;
818
819			TCOAL_DEBUG(0xDDDD0000, queue->earliest_soft_deadline, call->soft_deadline, 0, 0, 0);
820			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
821				DECR_TIMER_EXPIRE | DBG_FUNC_NONE,
822				call,
823				call->soft_deadline,
824				TCE(call)->deadline,
825				TCE(call)->entry_time, 0);
826
827			if ((call->flags & TIMER_CALL_RATELIMITED) &&
828			    (TCE(call)->deadline > cur_deadline)) {
829				if (rescan == FALSE)
830					break;
831			}
832
833			if (!simple_lock_try(&call->lock)) {
834				/* case (2b) lock inversion, dequeue and skip */
835				timer_queue_expire_lock_skips++;
836				timer_call_entry_dequeue_async(call);
837				call = NULL;
838				continue;
839			}
840
841			timer_call_entry_dequeue(call);
842
843			func = TCE(call)->func;
844			param0 = TCE(call)->param0;
845			param1 = TCE(call)->param1;
846
847			simple_unlock(&call->lock);
848			timer_queue_unlock(queue);
849
850			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
851				DECR_TIMER_CALLOUT | DBG_FUNC_START,
852				call, VM_KERNEL_UNSLIDE(func), param0, param1, 0);
853
854#if CONFIG_DTRACE
855			DTRACE_TMR7(callout__start, timer_call_func_t, func,
856			    timer_call_param_t, param0, unsigned, call->flags,
857			    0, (call->ttd >> 32),
858			    (unsigned) (call->ttd & 0xFFFFFFFF), call);
859#endif
860			/* Maintain time-to-deadline in per-processor data
861			 * structure for thread wakeup deadline statistics.
862			 */
863			uint64_t *ttdp = &(PROCESSOR_DATA(current_processor(), timer_call_ttd));
864			*ttdp = call->ttd;
865			(*func)(param0, param1);
866			*ttdp = 0;
867#if CONFIG_DTRACE
868			DTRACE_TMR4(callout__end, timer_call_func_t, func,
869			    param0, param1, call);
870#endif
871
872			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
873				DECR_TIMER_CALLOUT | DBG_FUNC_END,
874				call, VM_KERNEL_UNSLIDE(func), param0, param1, 0);
875			call = NULL;
876			timer_queue_lock_spin(queue);
877		} else {
878			if (__probable(rescan == FALSE)) {
879				break;
880			} else {
881				int64_t skew = TCE(call)->deadline - call->soft_deadline;
882				assert(TCE(call)->deadline >= call->soft_deadline);
883
884				/* DRK: On a latency quality-of-service level change,
885				 * re-sort potentially rate-limited timers. The platform
886				 * layer determines which timers require
887				 * this. In the absence of the per-callout
888				 * synchronization requirement, a global resort could
889				 * be more efficient. The re-sort effectively
890				 * annuls all timer adjustments, i.e. the "soft
891				 * deadline" is the sort key.
892				 */
893
894				if (timer_resort_threshold(skew)) {
895					if (__probable(simple_lock_try(&call->lock))) {
896						timer_call_entry_dequeue(call);
897						timer_call_entry_enqueue_deadline(call, queue, call->soft_deadline);
898						simple_unlock(&call->lock);
899						call = NULL;
900					}
901				}
902				if (call) {
903					call = TIMER_CALL(queue_next(qe(call)));
904					if (queue_end(&queue->head, qe(call)))
905						break;
906				}
907			}
908		}
909	}
910
911	if (!queue_empty(&queue->head)) {
912		call = TIMER_CALL(queue_first(&queue->head));
913		cur_deadline = TCE(call)->deadline;
914		queue->earliest_soft_deadline = (call->flags & TIMER_CALL_RATELIMITED) ? TCE(call)->deadline: call->soft_deadline;
915	} else {
916		queue->earliest_soft_deadline = cur_deadline = UINT64_MAX;
917	}
918
919	timer_queue_unlock(queue);
920
921	return (cur_deadline);
922}
923
924uint64_t
925timer_queue_expire(
926	mpqueue_head_t		*queue,
927	uint64_t		deadline)
928{
929	return timer_queue_expire_with_options(queue, deadline, FALSE);
930}
931
932extern int serverperfmode;
933static uint32_t	timer_queue_migrate_lock_skips;
934/*
935 * timer_queue_migrate() is called by timer_queue_migrate_cpu()
936 * to move timer requests from the local processor (queue_from)
937 * to a target processor's (queue_to).
938 */
939int
940timer_queue_migrate(mpqueue_head_t *queue_from, mpqueue_head_t *queue_to)
941{
942	timer_call_t	call;
943	timer_call_t	head_to;
944	int		timers_migrated = 0;
945
946	DBG("timer_queue_migrate(%p,%p)\n", queue_from, queue_to);
947
948	assert(!ml_get_interrupts_enabled());
949	assert(queue_from != queue_to);
950
951	if (serverperfmode) {
952		/*
953		 * if we're running a high end server
954		 * avoid migrations... they add latency
955		 * and don't save us power under typical
956		 * server workloads
957		 */
958		return -4;
959	}
960
961	/*
962	 * Take both local (from) and target (to) timer queue locks while
963	 * moving the timers from the local queue to the target processor.
964	 * We assume that the target is always the boot processor.
965	 * But only move if all of the following is true:
966	 *  - the target queue is non-empty
967	 *  - the local queue is non-empty
968	 *  - the local queue's first deadline is later than the target's
	 *  - the local queue contains no non-migratable "local" calls,
	 *    so that the target need not resync.
971	 */
972
973        timer_queue_lock_spin(queue_to);
974
975	head_to = TIMER_CALL(queue_first(&queue_to->head));
976	if (queue_empty(&queue_to->head)) {
977		timers_migrated = -1;
978		goto abort1;
979	}
980
981        timer_queue_lock_spin(queue_from);
982
983	if (queue_empty(&queue_from->head)) {
984		timers_migrated = -2;
985		goto abort2;
986	}
987
988	call = TIMER_CALL(queue_first(&queue_from->head));
989	if (TCE(call)->deadline < TCE(head_to)->deadline) {
990		timers_migrated = 0;
991		goto abort2;
992	}
993
994	/* perform scan for non-migratable timers */
995	do {
996		if (call->flags & TIMER_CALL_LOCAL) {
997			timers_migrated = -3;
998			goto abort2;
999		}
1000		call = TIMER_CALL(queue_next(qe(call)));
1001	} while (!queue_end(&queue_from->head, qe(call)));
1002
1003	/* migration loop itself -- both queues are locked */
1004	while (!queue_empty(&queue_from->head)) {
1005		call = TIMER_CALL(queue_first(&queue_from->head));
1006		if (!simple_lock_try(&call->lock)) {
1007			/* case (2b) lock order inversion, dequeue only */
1008#ifdef TIMER_ASSERT
1009			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1010				DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
1011				call,
1012				TCE(call)->queue,
1013				call->lock.interlock.lock_data,
1014				0x2b, 0);
1015#endif
1016			timer_queue_migrate_lock_skips++;
1017			timer_call_entry_dequeue_async(call);
1018			continue;
1019		}
1020		timer_call_entry_dequeue(call);
1021		timer_call_entry_enqueue_deadline(
1022			call, queue_to, TCE(call)->deadline);
1023		timers_migrated++;
1024		simple_unlock(&call->lock);
1025	}
1026	queue_from->earliest_soft_deadline = UINT64_MAX;
1027abort2:
1028       	timer_queue_unlock(queue_from);
1029abort1:
1030       	timer_queue_unlock(queue_to);
1031
1032	return timers_migrated;
1033}
1034
1035void
1036timer_queue_trace_cpu(int ncpu)
1037{
1038	timer_call_nosync_cpu(
1039		ncpu,
1040		(void(*)())timer_queue_trace,
1041		(void*) timer_queue_cpu(ncpu));
1042}
1043
1044void
1045timer_queue_trace(
1046	mpqueue_head_t			*queue)
1047{
1048	timer_call_t	call;
1049	spl_t		s;
1050
1051	if (!kdebug_enable)
1052		return;
1053
1054	s = splclock();
1055	timer_queue_lock_spin(queue);
1056
1057	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1058        	DECR_TIMER_QUEUE | DBG_FUNC_START,
1059		queue->count, mach_absolute_time(), 0, 0, 0);
1060
1061	if (!queue_empty(&queue->head)) {
1062		call = TIMER_CALL(queue_first(&queue->head));
1063		do {
1064			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1065        			DECR_TIMER_QUEUE | DBG_FUNC_NONE,
1066				call->soft_deadline,
1067				TCE(call)->deadline,
1068				TCE(call)->entry_time,
1069				TCE(call)->func,
1070				0);
1071			call = TIMER_CALL(queue_next(qe(call)));
1072		} while (!queue_end(&queue->head, qe(call)));
1073	}
1074
1075	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1076        	DECR_TIMER_QUEUE | DBG_FUNC_END,
1077		queue->count, mach_absolute_time(), 0, 0, 0);
1078
1079	timer_queue_unlock(queue);
1080	splx(s);
1081}
1082
1083void
1084timer_longterm_dequeued_locked(timer_call_t call)
1085{
1086	timer_longterm_t	*tlp = &timer_longterm;
1087
1088	tlp->dequeues++;
1089	if (call == tlp->threshold.call)
1090		tlp->threshold.call = NULL;
1091}
1092
1093/*
1094 * Place a timer call in the longterm list
1095 * and adjust the next timer callout deadline if the new timer is first.
1096 */
1097mpqueue_head_t *
1098timer_longterm_enqueue_unlocked(timer_call_t	call,
1099				uint64_t	now,
1100				uint64_t	deadline,
1101				mpqueue_head_t	**old_queue,
1102				uint64_t	soft_deadline,
1103				uint64_t	ttd,
1104				timer_call_param_t	param1,
1105				uint32_t	callout_flags)
1106{
1107	timer_longterm_t	*tlp = &timer_longterm;
1108	boolean_t		update_required = FALSE;
1109	uint64_t		longterm_threshold;
1110
1111	longterm_threshold = now + tlp->threshold.interval;
1112
1113	/*
1114	 * Return NULL without doing anything if:
1115	 *  - this timer is local, or
1116	 *  - the longterm mechanism is disabled, or
1117	 *  - this deadline is too short.
1118	 */
1119	if ((callout_flags & TIMER_CALL_LOCAL) != 0 ||
1120	    (tlp->threshold.interval == TIMER_LONGTERM_NONE) ||
1121		(deadline <= longterm_threshold))
1122		return NULL;
1123
1124	/*
1125 	 * Remove timer from its current queue, if any.
1126	 */
1127	*old_queue = timer_call_dequeue_unlocked(call);
1128
1129	/*
1130	 * Lock the longterm queue, queue timer and determine
1131	 * whether an update is necessary.
1132	 */
1133	assert(!ml_get_interrupts_enabled());
1134	simple_lock(&call->lock);
1135	timer_queue_lock_spin(timer_longterm_queue);
1136	TCE(call)->deadline = deadline;
1137	TCE(call)->param1 = param1;
1138	call->ttd = ttd;
1139	call->soft_deadline = soft_deadline;
1140	call->flags = callout_flags;
1141	timer_call_entry_enqueue_tail(call, timer_longterm_queue);
1142
1143	tlp->enqueues++;
1144
1145	/*
1146	 * We'll need to update the currently set threshold timer
1147	 * if the new deadline is sooner and no sooner update is in flight.
1148	 */
1149	if (deadline < tlp->threshold.deadline &&
1150	    deadline < tlp->threshold.preempted) {
1151		tlp->threshold.preempted = deadline;
1152		tlp->threshold.call = call;
1153		update_required = TRUE;
1154	}
1155	timer_queue_unlock(timer_longterm_queue);
1156	simple_unlock(&call->lock);
1157
1158	if (update_required) {
1159		/*
1160		 * Note: this call expects that calling the master cpu
1161		 * alone does not involve locking the topo lock.
1162		 */
1163		timer_call_nosync_cpu(
1164			master_cpu,
1165			(void (*)(void *)) timer_longterm_update,
1166			(void *)tlp);
1167	}
1168
1169	return timer_longterm_queue;
1170}
1171
1172/*
1173 * Scan for timers below the longterm threshold.
1174 * Move these to the local timer queue (of the boot processor on which the
1175 * calling thread is running).
1176 * Both the local (boot) queue and the longterm queue are locked.
1177 * The scan is similar to the timer migrate sequence but is performed by
1178 * successively examining each timer on the longterm queue:
1179 *  - if within the short-term threshold
1180 *    - enter on the local queue (unless being deleted),
1181 *  - otherwise:
1182 *    - if sooner, deadline becomes the next threshold deadline.
1183 */
1184void
1185timer_longterm_scan(timer_longterm_t	*tlp,
1186		    uint64_t		now)
1187{
1188	queue_entry_t	qe;
1189	timer_call_t	call;
1190	uint64_t	threshold;
1191	uint64_t	deadline;
1192	mpqueue_head_t	*timer_master_queue;
1193
1194	assert(!ml_get_interrupts_enabled());
1195	assert(cpu_number() == master_cpu);
1196
1197	if (tlp->threshold.interval != TIMER_LONGTERM_NONE)
1198		threshold = now + tlp->threshold.interval;
1199	else
1200		threshold = TIMER_LONGTERM_NONE;
1201
1202	tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1203	tlp->threshold.call = NULL;
1204
1205	if (queue_empty(&timer_longterm_queue->head))
1206		return;
1207
1208	timer_master_queue = timer_queue_cpu(master_cpu);
1209	timer_queue_lock_spin(timer_master_queue);
1210
1211	qe = queue_first(&timer_longterm_queue->head);
1212	while (!queue_end(&timer_longterm_queue->head, qe)) {
1213		call = TIMER_CALL(qe);
1214		deadline = call->soft_deadline;
1215		qe = queue_next(qe);
1216		if (!simple_lock_try(&call->lock)) {
1217			/* case (2c) lock order inversion, dequeue only */
1218#ifdef TIMER_ASSERT
1219			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1220				DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
1221				call,
1222				TCE(call)->queue,
1223				call->lock.interlock.lock_data,
1224				0x2c, 0);
1225#endif
1226			timer_call_entry_dequeue_async(call);
1227			continue;
1228		}
1229		if (deadline < threshold) {
1230			/*
1231			 * This timer needs moving (escalating)
1232			 * to the local (boot) processor's queue.
1233			 */
1234#ifdef TIMER_ASSERT
1235			if (deadline < now)
1236				TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1237       		 			DECR_TIMER_OVERDUE | DBG_FUNC_NONE,
1238					call,
1239					deadline,
1240					now,
1241					threshold,
1242					0);
1243#endif
1244			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1245       	 			DECR_TIMER_ESCALATE | DBG_FUNC_NONE,
1246				call,
1247				TCE(call)->deadline,
1248				TCE(call)->entry_time,
1249				TCE(call)->func,
1250				0);
1251			tlp->escalates++;
1252			timer_call_entry_dequeue(call);
1253			timer_call_entry_enqueue_deadline(
1254				call, timer_master_queue, TCE(call)->deadline);
1255			/*
1256			 * A side-effect of the following call is to update
1257			 * the actual hardware deadline if required.
1258			 */
1259			(void) timer_queue_assign(deadline);
1260		} else {
1261			if (deadline < tlp->threshold.deadline) {
1262				tlp->threshold.deadline = deadline;
1263				tlp->threshold.call = call;
1264			}
1265		}
1266		simple_unlock(&call->lock);
1267	}
1268
1269	timer_queue_unlock(timer_master_queue);
1270}
1271
1272void
1273timer_longterm_callout(timer_call_param_t p0, __unused timer_call_param_t p1)
1274{
1275	timer_longterm_t	*tlp = (timer_longterm_t *) p0;
1276
1277	timer_longterm_update(tlp);
1278}
1279
1280void
1281timer_longterm_update_locked(timer_longterm_t *tlp)
1282{
1283	uint64_t	latency;
1284
1285	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1286		DECR_TIMER_UPDATE | DBG_FUNC_START,
1287		&tlp->queue,
1288		tlp->threshold.deadline,
1289		tlp->threshold.preempted,
1290		tlp->queue.count, 0);
1291
1292	tlp->scan_time = mach_absolute_time();
1293	if (tlp->threshold.preempted != TIMER_LONGTERM_NONE) {
1294		tlp->threshold.preempts++;
1295		tlp->threshold.deadline = tlp->threshold.preempted;
1296		tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1297		/*
1298		 * Note: in the unlikely event that a pre-empted timer has
1299		 * itself been cancelled, we'll simply re-scan later at the
1300		 * time of the preempted/cancelled timer.
1301		 */
1302	} else {
1303		tlp->threshold.scans++;
1304
1305		/*
1306		 * Maintain a moving average of our wakeup latency.
		 * Clamp latency to 0 and ignore samples above the threshold interval.
1308		 */
1309		if (tlp->scan_time > tlp->threshold.deadline_set)
1310			latency = tlp->scan_time - tlp->threshold.deadline_set;
1311		else
1312			latency = 0;
1313		if (latency < tlp->threshold.interval) {
1314			tlp->threshold.latency_min =
1315				MIN(tlp->threshold.latency_min, latency);
1316			tlp->threshold.latency_max =
1317				MAX(tlp->threshold.latency_max, latency);
1318			tlp->threshold.latency =
1319				(tlp->threshold.latency*99 + latency) / 100;
1320		}
1321
1322		timer_longterm_scan(tlp, tlp->scan_time);
1323	}
1324
1325	tlp->threshold.deadline_set = tlp->threshold.deadline;
1326	/* The next deadline timer to be set is adjusted */
1327	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
1328		tlp->threshold.deadline_set -= tlp->threshold.margin;
1329		tlp->threshold.deadline_set -= tlp->threshold.latency;
1330	}
1331
1332	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1333		DECR_TIMER_UPDATE | DBG_FUNC_END,
1334		&tlp->queue,
1335		tlp->threshold.deadline,
1336		tlp->threshold.scans,
1337		tlp->queue.count, 0);
1338}
1339
1340void
1341timer_longterm_update(timer_longterm_t *tlp)
1342{
1343	spl_t	s = splclock();
1344
1345	timer_queue_lock_spin(timer_longterm_queue);
1346
1347	if (cpu_number() != master_cpu)
		panic("timer_longterm_update() on non-boot cpu");
1349
1350	timer_longterm_update_locked(tlp);
1351
1352	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE)
1353		timer_call_enter(
1354			&tlp->threshold.timer,
1355			tlp->threshold.deadline_set,
1356			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
1357
1358	timer_queue_unlock(timer_longterm_queue);
1359	splx(s);
1360}
1361
1362void
1363timer_longterm_init(void)
1364{
1365	uint32_t		longterm;
1366	timer_longterm_t	*tlp = &timer_longterm;
1367
1368	DBG("timer_longterm_init() tlp: %p, queue: %p\n", tlp, &tlp->queue);
1369
1370	/*
	 * Set the longterm timer threshold. Defaults to TIMER_LONGTERM_THRESHOLD,
	 * or to TIMER_LONGTERM_NONE (disabled) in server performance mode;
	 * may be overridden by the "longterm" boot-arg (in milliseconds).
	 */
1375	tlp->threshold.interval = serverperfmode ? TIMER_LONGTERM_NONE
1376						 : TIMER_LONGTERM_THRESHOLD;
1377	if (PE_parse_boot_argn("longterm", &longterm, sizeof (longterm))) {
1378		tlp->threshold.interval = (longterm == 0) ?
1379						TIMER_LONGTERM_NONE :
1380						longterm * NSEC_PER_MSEC;
1381	}
1382	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1383		printf("Longterm timer threshold: %llu ms\n",
1384			tlp->threshold.interval / NSEC_PER_MSEC);
1385		kprintf("Longterm timer threshold: %llu ms\n",
1386			tlp->threshold.interval / NSEC_PER_MSEC);
1387		nanoseconds_to_absolutetime(tlp->threshold.interval,
1388					    &tlp->threshold.interval);
1389		tlp->threshold.margin = tlp->threshold.interval / 10;
1390		tlp->threshold.latency_min = EndOfAllTime;
1391		tlp->threshold.latency_max = 0;
1392	}
1393
1394	tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1395	tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1396
1397	lck_attr_setdefault(&timer_longterm_lck_attr);
1398	lck_grp_attr_setdefault(&timer_longterm_lck_grp_attr);
1399	lck_grp_init(&timer_longterm_lck_grp,
1400		     "timer_longterm", &timer_longterm_lck_grp_attr);
1401	mpqueue_init(&tlp->queue,
1402		     &timer_longterm_lck_grp, &timer_longterm_lck_attr);
1403
1404	timer_call_setup(&tlp->threshold.timer,
1405			 timer_longterm_callout, (timer_call_param_t) tlp);
1406
1407	timer_longterm_queue = &tlp->queue;
1408}
1409
1410enum {
1411	THRESHOLD, QCOUNT,
1412	ENQUEUES, DEQUEUES, ESCALATES, SCANS, PREEMPTS,
1413	LATENCY, LATENCY_MIN, LATENCY_MAX
1414};
1415uint64_t
1416timer_sysctl_get(int oid)
1417{
1418	timer_longterm_t	*tlp = &timer_longterm;
1419
1420	switch (oid) {
1421	case THRESHOLD:
1422		return (tlp->threshold.interval == TIMER_LONGTERM_NONE) ?
1423			0 : tlp->threshold.interval / NSEC_PER_MSEC;
1424	case QCOUNT:
1425		return tlp->queue.count;
1426	case ENQUEUES:
1427		return tlp->enqueues;
1428	case DEQUEUES:
1429		return tlp->dequeues;
1430	case ESCALATES:
1431		return tlp->escalates;
1432	case SCANS:
1433		return tlp->threshold.scans;
1434	case PREEMPTS:
1435		return tlp->threshold.preempts;
1436	case LATENCY:
1437		return tlp->threshold.latency;
1438	case LATENCY_MIN:
1439		return tlp->threshold.latency_min;
1440	case LATENCY_MAX:
1441		return tlp->threshold.latency_max;
1442	default:
1443		return 0;
1444	}
1445}
1446
1447/*
1448 * timer_master_scan() is the inverse of timer_longterm_scan()
1449 * since it un-escalates timers to the longterm queue.
1450 */
1451static void
1452timer_master_scan(timer_longterm_t	*tlp,
1453		  uint64_t		now)
1454{
1455	queue_entry_t	qe;
1456	timer_call_t	call;
1457	uint64_t	threshold;
1458	uint64_t	deadline;
1459	mpqueue_head_t	*timer_master_queue;
1460
1461	if (tlp->threshold.interval != TIMER_LONGTERM_NONE)
1462		threshold = now + tlp->threshold.interval;
1463	else
1464		threshold = TIMER_LONGTERM_NONE;
1465
1466	timer_master_queue = timer_queue_cpu(master_cpu);
1467	timer_queue_lock_spin(timer_master_queue);
1468
1469	qe = queue_first(&timer_master_queue->head);
1470	while (!queue_end(&timer_master_queue->head, qe)) {
1471		call = TIMER_CALL(qe);
1472		deadline = TCE(call)->deadline;
1473		qe = queue_next(qe);
1474		if ((call->flags & TIMER_CALL_LOCAL) != 0)
1475			continue;
1476		if (!simple_lock_try(&call->lock)) {
1477			/* case (2c) lock order inversion, dequeue only */
1478			timer_call_entry_dequeue_async(call);
1479			continue;
1480		}
1481		if (deadline > threshold) {
1482			/* move from master to longterm */
1483			timer_call_entry_dequeue(call);
1484			timer_call_entry_enqueue_tail(call, timer_longterm_queue);
1485			if (deadline < tlp->threshold.deadline) {
1486				tlp->threshold.deadline = deadline;
1487				tlp->threshold.call = call;
1488			}
1489		}
1490		simple_unlock(&call->lock);
1491	}
1492	timer_queue_unlock(timer_master_queue);
1493}
1494
1495static void
1496timer_sysctl_set_threshold(uint64_t value)
1497{
1498	timer_longterm_t	*tlp = &timer_longterm;
1499	spl_t			s = splclock();
1500	boolean_t		threshold_increase;
1501
1502	timer_queue_lock_spin(timer_longterm_queue);
1503
1504	timer_call_cancel(&tlp->threshold.timer);
1505
1506	/*
	 * Set the new threshold and note whether it's increasing.
1508	 */
1509	if (value == 0) {
1510		tlp->threshold.interval = TIMER_LONGTERM_NONE;
1511		threshold_increase = TRUE;
1512		timer_call_cancel(&tlp->threshold.timer);
1513	} else {
1514		uint64_t	old_interval = tlp->threshold.interval;
1515		tlp->threshold.interval = value * NSEC_PER_MSEC;
1516		nanoseconds_to_absolutetime(tlp->threshold.interval,
1517					    &tlp->threshold.interval);
1518		tlp->threshold.margin = tlp->threshold.interval / 10;
1519		if  (old_interval == TIMER_LONGTERM_NONE)
1520			threshold_increase = FALSE;
1521		else
1522			threshold_increase = (tlp->threshold.interval > old_interval);
1523	}
1524
1525	if (threshold_increase /* or removal */) {
1526		/* Escalate timers from the longterm queue */
1527		timer_longterm_scan(tlp, mach_absolute_time());
1528	} else /* decrease or addition  */ {
1529		/*
1530		 * We scan the local/master queue for timers now longterm.
1531		 * To be strictly correct, we should scan all processor queues
1532		 * but timer migration results in most timers gravitating to the
1533		 * master processor in any case.
1534		 */
1535		timer_master_scan(tlp, mach_absolute_time());
1536	}
1537
1538	/* Set new timer accordingly */
1539	tlp->threshold.deadline_set = tlp->threshold.deadline;
1540	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
1541		tlp->threshold.deadline_set -= tlp->threshold.margin;
1542		tlp->threshold.deadline_set -= tlp->threshold.latency;
1543		timer_call_enter(
1544			&tlp->threshold.timer,
1545			tlp->threshold.deadline_set,
1546			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
1547	}
1548
1549	/* Reset stats */
1550	tlp->enqueues = 0;
1551	tlp->dequeues = 0;
1552	tlp->escalates = 0;
1553	tlp->threshold.scans = 0;
1554	tlp->threshold.preempts = 0;
1555	tlp->threshold.latency = 0;
1556	tlp->threshold.latency_min = EndOfAllTime;
1557	tlp->threshold.latency_max = 0;
1558
1559	timer_queue_unlock(timer_longterm_queue);
1560	splx(s);
1561}
1562
1563int
1564timer_sysctl_set(int oid, uint64_t value)
1565{
1566	switch (oid) {
1567	case THRESHOLD:
1568		timer_call_cpu(
1569			master_cpu,
1570			(void (*)(void *)) timer_sysctl_set_threshold,
1571			(void *) value);
1572		return KERN_SUCCESS;
1573	default:
1574		return KERN_INVALID_ARGUMENT;
1575	}
1576}
1577
1578
1579/* Select timer coalescing window based on per-task quality-of-service hints */
1580static boolean_t tcoal_qos_adjust(thread_t t, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited) {
1581	uint32_t latency_qos;
1582	boolean_t adjusted = FALSE;
1583	task_t ctask = t->task;
1584
1585	if (ctask) {
1586		latency_qos = proc_get_effective_thread_policy(t, TASK_POLICY_LATENCY_QOS);
1587
1588		assert(latency_qos <= NUM_LATENCY_QOS_TIERS);
1589
1590		if (latency_qos) {
1591			*tshift = tcoal_prio_params.latency_qos_scale[latency_qos - 1];
1592			*tmax_abstime = tcoal_prio_params.latency_qos_abstime_max[latency_qos - 1];
1593			*pratelimited = tcoal_prio_params.latency_tier_rate_limited[latency_qos - 1];
1594			adjusted = TRUE;
1595		}
1596	}
1597	return adjusted;
1598}
1599
1600
1601/* Adjust timer deadlines based on priority of the thread and the
1602 * urgency value provided at timeout establishment. With this mechanism,
1603 * timers are no longer necessarily sorted in order of soft deadline
1604 * on a given timer queue, i.e. they may be differentially skewed.
1605 * In the current scheme, this could lead to fewer pending timers
1606 * processed than is technically possible when the HW deadline arrives.
1607 */
1608static void
1609timer_compute_leeway(thread_t cthread, int32_t urgency, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited) {
1610	int16_t tpri = cthread->sched_pri;
1611	if ((urgency & TIMER_CALL_USER_MASK) != 0) {
1612		if (tpri >= BASEPRI_RTQUEUES ||
1613		urgency == TIMER_CALL_USER_CRITICAL) {
1614			*tshift = tcoal_prio_params.timer_coalesce_rt_shift;
1615			*tmax_abstime = tcoal_prio_params.timer_coalesce_rt_abstime_max;
1616			TCOAL_PRIO_STAT(rt_tcl);
1617		} else if (proc_get_effective_thread_policy(cthread, TASK_POLICY_DARWIN_BG) ||
1618		(urgency == TIMER_CALL_USER_BACKGROUND)) {
1619			/* Determine if timer should be subjected to a lower QoS */
1620			if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
1621				if (*tmax_abstime > tcoal_prio_params.timer_coalesce_bg_abstime_max) {
1622					return;
1623				} else {
1624					*pratelimited = FALSE;
1625				}
1626			}
1627			*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
1628			*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
1629			TCOAL_PRIO_STAT(bg_tcl);
1630		} else if (tpri >= MINPRI_KERNEL) {
1631			*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
1632			*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
1633			TCOAL_PRIO_STAT(kt_tcl);
1634		} else if (cthread->sched_mode == TH_MODE_FIXED) {
1635			*tshift = tcoal_prio_params.timer_coalesce_fp_shift;
1636			*tmax_abstime = tcoal_prio_params.timer_coalesce_fp_abstime_max;
1637			TCOAL_PRIO_STAT(fp_tcl);
1638		} else if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
1639			TCOAL_PRIO_STAT(qos_tcl);
1640		} else if (cthread->sched_mode == TH_MODE_TIMESHARE) {
1641			*tshift = tcoal_prio_params.timer_coalesce_ts_shift;
1642			*tmax_abstime = tcoal_prio_params.timer_coalesce_ts_abstime_max;
1643			TCOAL_PRIO_STAT(ts_tcl);
1644		} else {
1645			TCOAL_PRIO_STAT(nc_tcl);
1646		}
1647	} else if (urgency == TIMER_CALL_SYS_BACKGROUND) {
1648		*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
1649		*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
1650		TCOAL_PRIO_STAT(bg_tcl);
1651	} else {
1652		*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
1653		*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
1654		TCOAL_PRIO_STAT(kt_tcl);
1655	}
1656}
1657
1658
1659int timer_user_idle_level;
1660
1661uint64_t
1662timer_call_slop(uint64_t deadline, uint64_t now, uint32_t flags, thread_t cthread, boolean_t *pratelimited)
1663{
1664	int32_t tcs_shift = 0;
1665	uint64_t tcs_max_abstime = 0;
1666	uint64_t adjval;
1667	uint32_t urgency = (flags & TIMER_CALL_URGENCY_MASK);
1668
1669	if (mach_timer_coalescing_enabled &&
1670	    (deadline > now) && (urgency != TIMER_CALL_SYS_CRITICAL)) {
1671		timer_compute_leeway(cthread, urgency, &tcs_shift, &tcs_max_abstime, pratelimited);
1672
1673		if (tcs_shift >= 0)
1674			adjval =  MIN((deadline - now) >> tcs_shift, tcs_max_abstime);
1675		else
1676			adjval =  MIN((deadline - now) << (-tcs_shift), tcs_max_abstime);
1677		/* Apply adjustments derived from "user idle level" heuristic */
1678		adjval += (adjval * timer_user_idle_level) >> 7;
1679		return adjval;
1680 	} else {
1681		return 0;
1682	}
1683}
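
/*
 * Worked example (illustrative; the shift and maximum below are assumptions,
 * since the real values come from the platform layer via
 * timer_call_get_priority_params()): with a coalescing shift of 3 and a
 * 100us cap, a timeout due 8ms from now yields
 * adjval = MIN(8ms >> 3, 100us) = MIN(1ms, 100us) = 100us of slop; at a
 * timer_user_idle_level of 128 this is further scaled by
 * adjval += (adjval * 128) >> 7, doubling it to 200us. A negative shift
 * widens the window instead, since the delta is shifted left.
 */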
1684
1685int
1686timer_get_user_idle_level(void) {
1687	return timer_user_idle_level;
1688}
1689
1690kern_return_t timer_set_user_idle_level(int ilevel) {
1691	boolean_t do_reeval = FALSE;
1692
1693	if ((ilevel < 0) || (ilevel > 128))
1694		return KERN_INVALID_ARGUMENT;
1695
1696	if (ilevel < timer_user_idle_level) {
1697		do_reeval = TRUE;
1698	}
1699
1700	timer_user_idle_level = ilevel;
1701
1702	if (do_reeval)
1703		ml_timer_evaluate();
1704
1705	return KERN_SUCCESS;
1706}
1707