1/*
2 * Copyright (c) 2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24#include <kern/kalloc.h>
25#include <kern/kern_types.h>
26#include <kern/locks.h>
27#include <kern/misc_protos.h>
28#include <kern/task.h>
29#include <kern/thread.h>
30#include <kern/zalloc.h>
31#include <machine/machine_cpu.h>
32
33#include <pmc/pmc.h>
34
35#include <libkern/OSAtomic.h>
36
37#if defined(__i386__) || defined(__x86_64__)
38#include <i386/mp.h>
39#endif
40
41#if CONFIG_COUNTERS
42
43/* various debug logging enable */
44#undef DEBUG_COUNTERS
45
46typedef uint8_t pmc_state_event_t;
47
48#define PMC_STATE_EVENT_START				0
49#define PMC_STATE_EVENT_STOP				1
50#define PMC_STATE_EVENT_FREE				2
51#define PMC_STATE_EVENT_INTERRUPT			3
52#define PMC_STATE_EVENT_END_OF_INTERRUPT	4
53#define PMC_STATE_EVENT_CONTEXT_IN			5
54#define PMC_STATE_EVENT_CONTEXT_OUT			6
55#define PMC_STATE_EVENT_LOAD_FINISHED		7
56#define PMC_STATE_EVENT_STORE_FINISHED		8
57
58/* PMC spin timeouts */
59#define PMC_SPIN_THRESHOLD	10	/* Number of spins to allow before checking mach_absolute_time() */
60#define PMC_SPIN_TIMEOUT_US	10	/* Time in microseconds before the spin causes an assert */
61
62uint64_t pmc_spin_timeout_count = 0;	/* Number of times where a PMC spin loop causes a timeout */
63
64#ifdef DEBUG_COUNTERS
65#	include <pexpert/pexpert.h>
66#	define COUNTER_DEBUG(...) \
67	do { \
68		kprintf("[%s:%s][%u] ", __FILE__, __PRETTY_FUNCTION__, cpu_number()); \
69		kprintf(__VA_ARGS__); \
70	} while(0)
71
72#	define PRINT_PERF_MON(x)	\
73	do { \
74		kprintf("perfmon: %p (obj: %p refCt: %u switchable: %u)\n", \
75			x, x->object, x->useCount, \
76			(x->methods.flags & PERFMON_FLAG_SUPPORTS_CONTEXT_SWITCHING) ? \
77			1 : 0); \
78	} while(0)
79
80static const char const * pmc_state_state_name(pmc_state_t state) {
81	switch (PMC_STATE_STATE(state)) {
82		case PMC_STATE_STATE_INVALID:
83			return "INVALID";
84		case PMC_STATE_STATE_STOP:
85			return "STOP";
86		case PMC_STATE_STATE_CAN_RUN:
87			return "CAN_RUN";
88		case PMC_STATE_STATE_LOAD:
89			return "LOAD";
90		case PMC_STATE_STATE_RUN:
91			return "RUN";
92		case PMC_STATE_STATE_STORE:
93			return "STORE";
94		case PMC_STATE_STATE_INTERRUPT:
95			return "INTERRUPT";
96		case PMC_STATE_STATE_DEALLOC:
97			return "DEALLOC";
98		default:
99			return "UNKNOWN";
100	}
101}
102
103static const char const * pmc_state_event_name(pmc_state_event_t event) {
104	switch (event) {
105		case PMC_STATE_EVENT_START:
106			return "START";
107		case PMC_STATE_EVENT_STOP:
108			return "STOP";
109		case PMC_STATE_EVENT_FREE:
110			return "FREE";
111		case PMC_STATE_EVENT_INTERRUPT:
112			return "INTERRUPT";
113		case PMC_STATE_EVENT_END_OF_INTERRUPT:
114			return "END OF INTERRUPT";
115		case PMC_STATE_EVENT_CONTEXT_IN:
116			return "CONTEXT IN";
117		case PMC_STATE_EVENT_CONTEXT_OUT:
118			return "CONTEXT OUT";
119		case PMC_STATE_EVENT_LOAD_FINISHED:
120			return "LOAD_FINISHED";
121		case PMC_STATE_EVENT_STORE_FINISHED:
122			return "STORE_FINISHED";
123		default:
124			return "UNKNOWN";
125	}
126}
127
128#	define PMC_STATE_FORMAT	"<%s, %u, %s%s%s>"
129#	define PMC_STATE_ARGS(x)	pmc_state_state_name(x), PMC_STATE_CONTEXT_COUNT(x), ((PMC_STATE_FLAGS(x) & PMC_STATE_FLAGS_INTERRUPTING) ? "I" : ""), \
130					((PMC_STATE_FLAGS(x) & PMC_STATE_FLAGS_STOPPING) ? "S" : ""), ((PMC_STATE_FLAGS(x) & PMC_STATE_FLAGS_DEALLOCING) ? "D" : "")
131#else
132#	define COUNTER_DEBUG(...)
133#	define PRINT_PERF_MON(x)
134#	define PMC_STATE_FORMAT
135#	define PMC_STATE_ARGS(x)
136#endif
137
138/*!struct
139 * pmc_config is the data behind a pmc_config_t.
140 * @member object A pointer to an instance of IOPerformanceCounterConfiguration
141 * @member method A pointer to a method to call to handle PMI.
142 * @member interrupt_after_value Cause a PMI after the counter counts this many
143 * events.
144 * @member refCon Passed to the @method method as the refCon argument.
145 */
146struct pmc_config {
147	pmc_config_object_t object;
148	volatile pmc_interrupt_method_t method;
149	uint64_t interrupt_after_value;
150	void *refCon;
151};
152
153/*
154 * Allocation Zones
155 *
156 * Two allocation zones - Perf zone small and Perf zone big.
157 * Each zone has associated maximums, defined below.
158 * The small zone is the max of the smallest allocation objects (all sizes on
159 * K64):
160 *	perf_monitor_t - 48 bytes
161 *		perf_monitor_methods_t - 28 bytes
162 *	pmc_reservation_t - 48 bytes
163 *  pmc_config_t - 32 bytes
164 * perf_small_zone unit size is (on K64) 48 bytes
165 * perf_small_zone max count must be max number of perf monitors, plus (max
166 * number of reservations * 2). The "*2" is because each reservation has a
167 * pmc_config_t within.
168 *
169 * Big zone is max of the larger allocation units
170 *	pmc_t - 144 bytes
171 *		pmc_methods_t - 116 bytes
172 * perf_big_zone unit size is (on K64) 144 bytes
173 * perf_big_zone max count is the max number of PMCs we support.
174 */
175
176static zone_t perf_small_zone = NULL;
177#define MAX_PERF_SMALLS		(256 + 8196 + 8196)
178#define PERF_SMALL_UNIT_SZ	(MAX(MAX(sizeof(struct perf_monitor), \
179	sizeof(struct pmc_reservation)), sizeof(struct pmc_config)))
180
181static zone_t perf_big_zone = NULL;
182#define MAX_PERF_BIGS		(1024)
183#define PERF_BIG_UNIT_SZ	(sizeof(struct pmc))
184
185/*
186 * Locks and Lock groups
187 */
188static lck_grp_t *pmc_lock_grp = LCK_GRP_NULL;
189static lck_grp_attr_t *pmc_lock_grp_attr;
190static lck_attr_t *pmc_lock_attr;
191
192/* PMC tracking queue locks */
193
194static lck_mtx_t  cpu_monitor_queue_mutex;   /* protects per-cpu queues at initialisation time */
195static lck_spin_t perf_monitor_queue_spin;   /* protects adding and removing from queue */
196static lck_spin_t perf_counters_queue_spin;  /* protects adding and removing from queue */
197
198/* Reservation tracking queues lock */
199static lck_spin_t reservations_spin;
200
201/*
202 * Tracking queues
203 *
204 * Keeps track of registered perf monitors and perf counters
205 */
206
207static queue_head_t **cpu_monitor_queues = NULL;
208
209static queue_head_t *perf_monitors_queue = NULL;
210static volatile uint32_t perf_monitors_count = 0U;
211
212static queue_head_t *perf_counters_queue = NULL;
213static volatile uint32_t perf_counters_count = 0U;
214
215/*
216 * Reservation queues
217 *
218 * Keeps track of all system, task, and thread-level reservations (both active and
219 * inactive).
220 *
221 * We track them all here (rather than in their respective task or thread only)
222 * so that we can inspect our tracking data directly (rather than peeking at
223 * every task and thread) to determine if/when a new reservation would
224 * constitute a conflict.
225 */
226
227static queue_head_t *system_reservations = NULL;
228static volatile uint32_t system_reservation_count = 0U;
229
230static queue_head_t *task_reservations = NULL;
231static volatile uint32_t task_reservation_count = 0U;
232
233static queue_head_t *thread_reservations = NULL;
234static volatile uint32_t thread_reservation_count = 0U;
235
236#if XNU_KERNEL_PRIVATE
237
238/*
239 * init_pmc_locks creates and initializes all the locks and lock groups and lock
240 * attributes required for the pmc sub-system.
241 */
242static void init_pmc_locks(void) {
243	pmc_lock_attr = lck_attr_alloc_init();
244	assert(pmc_lock_attr);
245
246	pmc_lock_grp_attr = lck_grp_attr_alloc_init();
247	assert(pmc_lock_grp_attr);
248
249	pmc_lock_grp = lck_grp_alloc_init("pmc", pmc_lock_grp_attr);
250	assert(pmc_lock_grp);
251
252	lck_spin_init(&perf_monitor_queue_spin, pmc_lock_grp, pmc_lock_attr);
253	lck_spin_init(&perf_counters_queue_spin, pmc_lock_grp, pmc_lock_attr);
254
255	lck_spin_init(&reservations_spin, pmc_lock_grp, pmc_lock_attr);
256
257	lck_mtx_init(&cpu_monitor_queue_mutex, pmc_lock_grp, pmc_lock_attr);
258}
259
260/*
261 * init_pmc_zones initializes the allocation zones used by the pmc subsystem
262 */
263static void init_pmc_zones(void) {
264	perf_small_zone = zinit(PERF_SMALL_UNIT_SZ,
265		MAX_PERF_SMALLS * PERF_SMALL_UNIT_SZ, MAX_PERF_SMALLS,
266		"pmc.small zone");
267
268	assert(perf_small_zone);
269
270	perf_big_zone = zinit(PERF_BIG_UNIT_SZ,
271		MAX_PERF_BIGS * PERF_BIG_UNIT_SZ, MAX_PERF_BIGS,
272		"pmc.big zone");
273
274	assert(perf_big_zone);
275}
276
277/*
278 * init_pmc_queues allocates and initializes the tracking queues for
279 * registering and reserving individual pmcs and perf monitors.
280 */
281static void init_pmc_queues(void) {
282
283	perf_monitors_queue = (queue_head_t*)kalloc(sizeof(queue_head_t));
284	assert(perf_monitors_queue);
285
286	queue_init(perf_monitors_queue);
287
288	perf_counters_queue = (queue_head_t*)kalloc(sizeof(queue_head_t));
289	assert(perf_counters_queue);
290
291	queue_init(perf_counters_queue);
292
293	system_reservations = (queue_head_t*)kalloc(sizeof(queue_t));
294	assert(system_reservations);
295
296	queue_init(system_reservations);
297
298	task_reservations = (queue_head_t*)kalloc(sizeof(queue_head_t));
299	assert(task_reservations);
300
301	queue_init(task_reservations);
302
303	thread_reservations = (queue_head_t*)kalloc(sizeof(queue_head_t));
304	assert(thread_reservations);
305
306	queue_init(thread_reservations);
307}
308
309/*
310 * pmc_bootstrap brings up all the necessary infrastructure required to use the
311 * pmc sub-system.
312 */
313__private_extern__
314void pmc_bootstrap(void) {
315	/* build our alloc zones */
316	init_pmc_zones();
317
318	/* build the locks */
319	init_pmc_locks();
320
321	/* build our tracking queues */
322	init_pmc_queues();
323}
324
325#endif /* XNU_KERNEL_PRIVATE */
326
327/*
328 * Perf Monitor Internals
329 */
330
331static perf_monitor_t perf_monitor_alloc(void) {
332	/* perf monitors come from the perf small zone */
333	return (perf_monitor_t)zalloc(perf_small_zone);
334}
335
336static void perf_monitor_free(void *pm) {
337	zfree(perf_small_zone, pm);
338}
339
340static void perf_monitor_init(perf_monitor_t pm, int cpu) {
341	assert(pm);
342
343	pm->object = NULL;
344
345	bzero(&(pm->methods), sizeof(perf_monitor_methods_t));
346
347	pm->useCount = 1;	/* initial retain count of 1, for caller */
348
349	pm->reservedCounters = 0;
350
351	pm->cpu = cpu;
352
353	pm->link.next = pm->link.prev = (queue_entry_t)NULL;
354	pm->cpu_link.next = pm->cpu_link.prev = (queue_entry_t)NULL;
355}
356
357/*
358 * perf_monitor_dequeue removes the given perf_monitor_t from the
359 * perf_monitor_queue, thereby unregistering it with the system.
360 */
361static void perf_monitor_dequeue(perf_monitor_t pm) {
362	lck_spin_lock(&perf_monitor_queue_spin);
363
364	if (pm->methods.flags & PERFMON_FLAG_REQUIRES_IDLE_NOTIFICATIONS) {
365		/* If this flag is set, the monitor is already validated to be
366		 * accessible from a single cpu only.
367		 */
368		queue_remove(cpu_monitor_queues[pm->cpu], pm, perf_monitor_t, cpu_link);
369	}
370
371	/*
372	 * remove the @pm object from the @perf_monitor_queue queue (it is of type
373	 * <perf_monitor_t> and has a field called @link that is the queue_link_t
374	 */
375	queue_remove(perf_monitors_queue, pm, perf_monitor_t, link);
376
377	perf_monitors_count--;
378
379	lck_spin_unlock(&perf_monitor_queue_spin);
380}
381
382/*
383 * perf_monitor_enqueue adds the given perf_monitor_t to the perf_monitor_queue,
384 * thereby registering it for use with the system.
385 */
386static void perf_monitor_enqueue(perf_monitor_t pm) {
387
388	lck_mtx_lock(&cpu_monitor_queue_mutex);
389	lck_spin_lock(&perf_monitor_queue_spin);
390
391	if (pm->cpu >= 0) {
392            	/* Deferred initialisation; saves memory and permits ml_get_max_cpus()
393            	 * to block until cpu initialisation is complete.
394            	 */
395            	if (!cpu_monitor_queues) {
396            		uint32_t max_cpus;
397            		queue_head_t **queues;
398            		uint32_t i;
399
400            		lck_spin_unlock(&perf_monitor_queue_spin);
401
402            		max_cpus = ml_get_max_cpus();
403
404            		queues = (queue_head_t**)kalloc(sizeof(queue_head_t*) * max_cpus);
405            		assert(queues);
406            		for (i = 0; i < max_cpus; i++) {
407            			queue_head_t *queue = (queue_head_t*)kalloc(sizeof(queue_head_t));
408            			assert(queue);
409            			queue_init(queue);
410            			queues[i] = queue;
411            		}
412
413            		lck_spin_lock(&perf_monitor_queue_spin);
414
415            		cpu_monitor_queues = queues;
416            	}
417
418		queue_enter(cpu_monitor_queues[pm->cpu], pm, perf_monitor_t, cpu_link);
419	}
420
421	queue_enter(perf_monitors_queue, pm, perf_monitor_t, link);
422	perf_monitors_count++;
423
424	lck_spin_unlock(&perf_monitor_queue_spin);
425	lck_mtx_unlock(&cpu_monitor_queue_mutex);
426}
427
428/*
429 * perf_monitor_reference increments the reference count for the given
430 * perf_monitor_t.
431 */
432static void perf_monitor_reference(perf_monitor_t pm) {
433	assert(pm);
434
435	OSIncrementAtomic(&(pm->useCount));
436}
437
438/*
439 * perf_monitor_deallocate decrements the reference count for the given
440 * perf_monitor_t.  If the reference count hits 0, the object is released back
441 * to the perf_small_zone via a call to perf_monitor_free().
442 */
443static void perf_monitor_deallocate(perf_monitor_t pm) {
444	assert(pm);
445
446	/* If we just removed the last reference count */
447	if(1 == OSDecrementAtomic(&(pm->useCount))) {
448		/* Free the object */
449		perf_monitor_free(pm);
450	}
451}
452
453/*
454 * perf_monitor_find attempts to find a perf_monitor_t that corresponds to the
455 * given C++ object pointer that was used when registering with the subsystem.
456 *
457 * If found, the method returns the perf_monitor_t with an extra reference
458 * placed on the object (or NULL if not
459 * found).
460 *
461 * NOTE: Caller must use perf_monitor_deallocate to remove the extra reference after
462 * calling perf_monitor_find.
463 */
464static perf_monitor_t perf_monitor_find(perf_monitor_object_t monitor) {
465	assert(monitor);
466	perf_monitor_t element = NULL;
467	perf_monitor_t found = NULL;
468
469	lck_spin_lock(&perf_monitor_queue_spin);
470
471	queue_iterate(perf_monitors_queue, element, perf_monitor_t, link) {
472 		if(element->object == monitor) {
473			perf_monitor_reference(element);
474			found = element;
475			break;
476		}
477	}
478
479	lck_spin_unlock(&perf_monitor_queue_spin);
480
481	return found;
482}
483
484/*
485 * perf_monitor_add_pmc adds a newly registered PMC to the perf monitor it is
486 * associated with.
487 */
488
489static void perf_monitor_add_pmc(perf_monitor_t pm, pmc_t pmc __unused) {
490	assert(pm);
491	assert(pmc);
492
493	/* Today, we merely add a reference count now that a new pmc is attached */
494	perf_monitor_reference(pm);
495}
496
497/*
498 * perf_monitor_remove_pmc removes a newly *un*registered PMC from the perf
499 * monitor it is associated with.
500 */
501static void perf_monitor_remove_pmc(perf_monitor_t pm, pmc_t pmc __unused) {
502	assert(pm);
503	assert(pmc);
504
505	/* Today, we merely remove a reference count now that the pmc is detached */
506	perf_monitor_deallocate(pm);
507}
508
509/*
510 * Perf Counter internals
511 */
512
513static pmc_t pmc_alloc(void) {
514	return (pmc_t)zalloc(perf_big_zone);
515}
516
517static void pmc_free(void *pmc) {
518	zfree(perf_big_zone, pmc);
519}
520
521/*
522 * pmc_init initializes a newly allocated pmc_t
523 */
524static void pmc_init(pmc_t pmc) {
525	assert(pmc);
526
527	pmc->object = NULL;
528	pmc->monitor = NULL;
529
530	bzero(&pmc->methods, sizeof(pmc_methods_t));
531
532	/* One reference for the caller */
533	pmc->useCount = 1;
534}
535
536/*
537 * pmc_reference increments the reference count of the given pmc_t
538 */
539static void pmc_reference(pmc_t pmc) {
540	assert(pmc);
541
542	OSIncrementAtomic(&(pmc->useCount));
543}
544
545/*
546 * pmc_deallocate decrements the reference count of the given pmc_t. If the
547 * reference count hits zero, the given pmc_t is deallocated and released back
548 * to the allocation zone.
549 */
550static void pmc_deallocate(pmc_t pmc) {
551	assert(pmc);
552
553	/* If we just removed the last reference count */
554	if(1 == OSDecrementAtomic(&(pmc->useCount))) {
555		/* Free the pmc */
556		pmc_free(pmc);
557	}
558}
559
560/*
561 * pmc_dequeue removes the given, newly *un*registered pmc from the
562 * perf_counters_queue.
563 */
564static void pmc_dequeue(pmc_t pmc) {
565	lck_spin_lock(&perf_counters_queue_spin);
566
567	queue_remove(perf_counters_queue, pmc, pmc_t, link);
568
569	perf_counters_count--;
570
571	lck_spin_unlock(&perf_counters_queue_spin);
572}
573
574/*
575 * pmc_enqueue adds the given, newly registered pmc to the perf_counters_queue
576 */
577static void pmc_enqueue(pmc_t pmc) {
578	lck_spin_lock(&perf_counters_queue_spin);
579
580	queue_enter(perf_counters_queue, pmc, pmc_t, link);
581
582	perf_counters_count++;
583
584	lck_spin_unlock(&perf_counters_queue_spin);
585}
586
587/*
588 * pmc_find attempts to locate a pmc_t that was registered with the given
589 * pmc_object_t pointer.  If found, it returns the pmc_t with an extra reference
590 * which must be dropped by the caller by calling pmc_deallocate().
591 */
592static pmc_t pmc_find(pmc_object_t object) {
593	assert(object);
594
595	lck_spin_lock(&perf_counters_queue_spin);
596
597	pmc_t element = NULL;
598	pmc_t found = NULL;
599
600	queue_iterate(perf_counters_queue, element, pmc_t, link) {
601		if(element->object == object) {
602			pmc_reference(element);
603			found = element;
604			break;
605		}
606	}
607
608	lck_spin_unlock(&perf_counters_queue_spin);
609
610	return found;
611}
612
613/*
614 * Config internals
615 */
616
617/* Allocate a pmc_config_t */
618static pmc_config_t pmc_config_alloc(pmc_t pmc __unused) {
619	return (pmc_config_t)zalloc(perf_small_zone);
620}
621
622/* Free a pmc_config_t, and underlying pmc_config_object_t (if needed) */
623static void pmc_config_free(pmc_t pmc, pmc_config_t config) {
624	assert(pmc);
625	assert(config);
626
627	if(config->object) {
628		pmc->methods.free_config(pmc->object, config->object);
629		config->object = NULL;
630	}
631
632	zfree(perf_small_zone, config);
633}
634
635static kern_return_t pmc_open(pmc_t pmc) {
636	assert(pmc);
637	assert(pmc->object);
638	assert(pmc->open_object);
639
640	return pmc->methods.open(pmc->object, pmc->open_object);
641}
642
643static kern_return_t pmc_close(pmc_t pmc) {
644	assert(pmc);
645	assert(pmc->object);
646	assert(pmc->open_object);
647
648	return pmc->methods.close(pmc->object, pmc->open_object);
649}
650
651/*
652 * Reservation Internals
653 */
654
655static kern_return_t pmc_internal_reservation_set_pmc(pmc_reservation_t resv, pmc_t pmc);
656static void pmc_internal_reservation_store(pmc_reservation_t reservation);
657static void pmc_internal_reservation_load(pmc_reservation_t reservation);
658
659static pmc_reservation_t reservation_alloc(void) {
660	/* pmc reservations come from the perf small zone */
661	return (pmc_reservation_t)zalloc(perf_small_zone);
662}
663
664/*
665 * reservation_free deallocates and releases all resources associated with the
666 * given pmc_reservation_t.  This includes freeing the config used to create the
667 * reservation, decrementing the reference count for the pmc used to create the
668 * reservation, and deallocating the reservation's memory.
669 */
670static void reservation_free(pmc_reservation_t resv) {
671	/* Free config */
672	if(resv->config) {
673		assert(resv->pmc);
674
675		pmc_free_config(resv->pmc, resv->config);
676
677		resv->config = NULL;
678	}
679
680	/* release PMC */
681	(void)pmc_internal_reservation_set_pmc(resv, NULL);
682
683	/* Free reservation */
684	zfree(perf_small_zone, resv);
685}
686
687/*
688 * reservation_init initializes a newly created reservation.
689 */
690static void reservation_init(pmc_reservation_t resv) {
691	assert(resv);
692
693	resv->pmc = NULL;
694	resv->config = NULL;
695	resv->value = 0ULL;
696
697	resv->flags = 0U;
698	resv->state = PMC_STATE(PMC_STATE_STATE_STOP, 0, 0);
699	resv->active_last_context_in = 0U;
700
701	/*
702	 * Since this member is a union, we only need to set either the task
703	 * or thread to NULL.
704	 */
705	resv->task = TASK_NULL;
706}
707
708/*
709 * pmc_internal_reservation_set_pmc sets the pmc associated with the reservation object. If
710 * there was one set already, it is deallocated (reference is dropped) before
711 * the new one is set.  This methods increases the reference count of the given
712 * pmc_t.
713 *
714 * NOTE: It is okay to pass NULL as the pmc_t - this will have the effect of
715 * dropping the reference on any previously set pmc, and setting the reservation
716 * to having no pmc set.
717 */
718static kern_return_t pmc_internal_reservation_set_pmc(pmc_reservation_t resv, pmc_t pmc) {
719	assert(resv);
720
721	if(resv->pmc) {
722		(void)pmc_close(resv->pmc);
723		pmc_deallocate(resv->pmc);
724		resv->pmc = NULL;
725	}
726
727	resv->pmc = pmc;
728
729	if(resv->pmc) {
730		pmc_reference(resv->pmc);
731		if(KERN_SUCCESS != pmc_open(resv->pmc)) {
732			pmc_deallocate(resv->pmc);
733			resv->pmc = NULL;
734
735			return KERN_FAILURE;
736		}
737	}
738
739	return KERN_SUCCESS;
740}
741
742/*
743 * Used to place reservation into one of the system, task, and thread queues
744 * Assumes the queue's spin lock is already held.
745 */
746static void pmc_internal_reservation_enqueue(queue_t queue, pmc_reservation_t resv) {
747	assert(queue);
748	assert(resv);
749
750	queue_enter(queue, resv, pmc_reservation_t, link);
751}
752
753static void pmc_internal_reservation_dequeue(queue_t queue, pmc_reservation_t resv) {
754	assert(queue);
755	assert(resv);
756
757	queue_remove(queue, resv, pmc_reservation_t, link);
758}
759
760/* Returns TRUE if the reservation applies to the current execution context */
761static boolean_t pmc_internal_reservation_matches_context(pmc_reservation_t resv) {
762	boolean_t ret = FALSE;
763	assert(resv);
764
765	if(PMC_FLAG_IS_SYSTEM_SCOPE(resv->flags)) {
766		ret = TRUE;
767	} else if(PMC_FLAG_IS_TASK_SCOPE(resv->flags)) {
768		if(current_task() == resv->task) {
769			ret = TRUE;
770		}
771	} else if(PMC_FLAG_IS_THREAD_SCOPE(resv->flags)) {
772		if(current_thread() == resv->thread) {
773			ret = TRUE;
774		}
775	}
776
777	return ret;
778}
779
780/*
781 * pmc_accessible_core_count returns the number of logical cores that can access
782 * a given @pmc.  0 means every core in the system.
783 */
784static uint32_t pmc_accessible_core_count(pmc_t pmc) {
785	assert(pmc);
786
787	uint32_t *cores = NULL;
788	size_t coreCt = 0UL;
789
790	if(KERN_SUCCESS != pmc->methods.accessible_cores(pmc->object,
791		&cores, &coreCt)) {
792		coreCt = 0U;
793	}
794
795	return (uint32_t)coreCt;
796}
797
798/* spin lock for the queue must already be held */
799/*
800 * This method will inspect the task/thread of the reservation to see if it
801 * matches the new incoming one (for thread/task reservations only).  Will only
802 * return TRUE if the task/thread matches.
803 */
804static boolean_t pmc_internal_reservation_queue_contains_pmc(queue_t queue, pmc_reservation_t resv) {
805	assert(queue);
806	assert(resv);
807
808	boolean_t ret = FALSE;
809	pmc_reservation_t tmp = NULL;
810
811	queue_iterate(queue, tmp, pmc_reservation_t, link) {
812		if(tmp->pmc == resv->pmc) {
813			/* PMC matches - make sure scope matches first */
814			switch(PMC_FLAG_SCOPE(tmp->flags)) {
815				case PMC_FLAG_SCOPE_SYSTEM:
816					/*
817					 * Found a reservation in system queue with same pmc - always a
818					 * conflict.
819					 */
820					ret = TRUE;
821					break;
822				case PMC_FLAG_SCOPE_THREAD:
823					/*
824					 * Found one in thread queue with the same PMC as the
825					 * argument. Only a conflict if argument scope isn't
826					 * thread or system, or the threads match.
827					 */
828					ret = (PMC_FLAG_SCOPE(resv->flags) != PMC_FLAG_SCOPE_THREAD) ||
829						(tmp->thread == resv->thread);
830
831					if(!ret) {
832						/*
833						 * so far, no conflict - check that the pmc that is
834						 * being reserved isn't accessible from more than
835						 * one core, if it is, we need to say it's already
836						 * taken.
837						 */
838						if(1 != pmc_accessible_core_count(tmp->pmc)) {
839							ret = TRUE;
840						}
841					}
842					break;
843				case PMC_FLAG_SCOPE_TASK:
844					/*
845					 * Follow similar semantics for task scope.
846					 */
847
848					ret = (PMC_FLAG_SCOPE(resv->flags) != PMC_FLAG_SCOPE_TASK) ||
849						(tmp->task == resv->task);
850					if(!ret) {
851						/*
852						 * so far, no conflict - check that the pmc that is
853						 * being reserved isn't accessible from more than
854						 * one core, if it is, we need to say it's already
855						 * taken.
856						 */
857						if(1 != pmc_accessible_core_count(tmp->pmc)) {
858							ret = TRUE;
859						}
860					}
861
862					break;
863			}
864
865			if(ret) break;
866		}
867	}
868
869	return ret;
870}
871
872/*
873 * pmc_internal_reservation_validate_for_pmc returns TRUE if the given reservation can be
874 * added to its target queue without creating conflicts (target queue is
875 * determined by the reservation's scope flags). Further, this method returns
876 * FALSE if any level contains a reservation for a PMC that can be accessed from
877 * more than just 1 core, and the given reservation also wants the same PMC.
878 */
879static boolean_t pmc_internal_reservation_validate_for_pmc(pmc_reservation_t resv) {
880	assert(resv);
881	boolean_t ret = TRUE;
882
883	if(pmc_internal_reservation_queue_contains_pmc(system_reservations, resv) ||
884		pmc_internal_reservation_queue_contains_pmc(task_reservations, resv) ||
885		pmc_internal_reservation_queue_contains_pmc(thread_reservations, resv)) {
886		ret = FALSE;
887	}
888
889	return ret;
890}
891
892static void pmc_internal_update_thread_flag(thread_t thread, boolean_t newFlag) {
893	assert(thread);
894
895	/* See if this thread needs it's PMC flag set */
896	pmc_reservation_t tmp = NULL;
897
898	if(!newFlag) {
899		/*
900		 * If the parent task just dropped its reservation, iterate the thread
901		 * reservations to see if we need to keep the pmc flag set for the given
902		 * thread or not.
903		 */
904		lck_spin_lock(&reservations_spin);
905
906		queue_iterate(thread_reservations, tmp, pmc_reservation_t, link) {
907			if(tmp->thread == thread) {
908				newFlag = TRUE;
909				break;
910			}
911		}
912
913		lck_spin_unlock(&reservations_spin);
914	}
915
916	if(newFlag) {
917		OSBitOrAtomic(THREAD_PMC_FLAG, &thread->t_chud);
918	} else {
919		OSBitAndAtomic(~(THREAD_PMC_FLAG), &thread->t_chud);
920	}
921}
922
923/*
924 * This operation is (worst case) O(N*M) where N is number of threads in the
925 * given task, and M is the number of thread reservations in our system.
926 */
927static void pmc_internal_update_task_flag(task_t task, boolean_t newFlag) {
928	assert(task);
929	thread_t thread = NULL;
930
931	if(newFlag) {
932		OSBitOrAtomic(TASK_PMC_FLAG, &task->t_chud);
933	} else {
934		OSBitAndAtomic(~(TASK_PMC_FLAG), &task->t_chud);
935	}
936
937	task_lock(task);
938
939	queue_iterate(&task->threads, thread, thread_t, task_threads) {
940		/* propagate the task's mask down to each thread  */
941		pmc_internal_update_thread_flag(thread, newFlag);
942	}
943
944	task_unlock(task);
945}
946
947/*
948 * pmc_internal_reservation_add adds a reservation to the global tracking queues after
949 * ensuring there are no reservation conflicts.  To do this, it takes all the
950 * spin locks for all the queue (to ensure no other core goes and adds a
951 * reservation for the same pmc to a queue that has already been checked).
952 */
953static boolean_t pmc_internal_reservation_add(pmc_reservation_t resv) {
954	assert(resv);
955
956	boolean_t ret = FALSE;
957
958	/* always lock all three in the same order */
959	lck_spin_lock(&reservations_spin);
960
961	/* Check if the reservation can be added without conflicts */
962	if(pmc_internal_reservation_validate_for_pmc(resv)) {
963
964		/* add reservation to appropriate scope */
965		switch(PMC_FLAG_SCOPE(resv->flags)) {
966		case PMC_FLAG_SCOPE_SYSTEM:
967			/* Simply add it to the system queue */
968			pmc_internal_reservation_enqueue(system_reservations, resv);
969			system_reservation_count++;
970
971			lck_spin_unlock(&reservations_spin);
972
973			break;
974
975		case PMC_FLAG_SCOPE_TASK:
976			assert(resv->task);
977
978			/* Not only do we enqueue it in our local queue for tracking */
979			pmc_internal_reservation_enqueue(task_reservations, resv);
980			task_reservation_count++;
981
982			lck_spin_unlock(&reservations_spin);
983
984			/* update the task mask, and propagate it to existing threads */
985			pmc_internal_update_task_flag(resv->task, TRUE);
986			break;
987
988		/* Thread-switched counter */
989		case PMC_FLAG_SCOPE_THREAD:
990			assert(resv->thread);
991
992			/*
993			 * Works the same as a task-switched counter, only at
994			 * thread-scope
995			 */
996
997			pmc_internal_reservation_enqueue(thread_reservations, resv);
998			thread_reservation_count++;
999
1000			lck_spin_unlock(&reservations_spin);
1001
1002			pmc_internal_update_thread_flag(resv->thread, TRUE);
1003			break;
1004		}
1005
1006		ret = TRUE;
1007	} else {
1008		lck_spin_unlock(&reservations_spin);
1009	}
1010
1011	return ret;
1012}
1013
1014static void pmc_internal_reservation_broadcast(pmc_reservation_t reservation, void (*action_func)(void *)) {
1015	uint32_t * cores;
1016	size_t core_cnt;
1017
1018	/* Get the list of accessible cores */
1019	if (KERN_SUCCESS == pmc_get_accessible_core_list(reservation->pmc, &cores, &core_cnt)) {
1020		boolean_t intrs_enabled = ml_set_interrupts_enabled(FALSE);
1021
1022		/* Fast case: the PMC is only accessible from one core and we happen to be on it */
1023		if (core_cnt == 1 && cores[0] == (uint32_t)cpu_number()) {
1024			action_func(reservation);
1025		} else {
1026			/* Call action_func on every accessible core */
1027#if defined(__i386__) || defined(__x86_64__)
1028			size_t ii;
1029			cpumask_t mask = 0;
1030
1031			/* Build a mask for the accessible cores */
1032			if (core_cnt > 0) {
1033				for (ii = 0; ii < core_cnt; ii++) {
1034					mask |= cpu_to_cpumask(cores[ii]);
1035				}
1036			} else {
1037				/* core_cnt = 0 really means all cpus */
1038				mask = CPUMASK_ALL;
1039			}
1040			mp_cpus_call(mask, ASYNC, action_func, reservation);
1041#else
1042#error pmc_reservation_interrupt needs an inter-processor method invocation mechanism for this architecture
1043#endif
1044		}
1045
1046		ml_set_interrupts_enabled(intrs_enabled);
1047	}
1048
1049}
1050
1051/*
1052 * pmc_internal_reservation_remove removes the given reservation from the appropriate
1053 * reservation queue according to its scope.
1054 *
1055 * NOTE: The scope flag must have been set for this method to function.
1056 */
1057static void pmc_internal_reservation_remove(pmc_reservation_t resv) {
1058	assert(resv);
1059
1060	/*
1061	 * Due to the way the macros are written, we can't just blindly queue-remove
1062	 * the reservation without knowing which queue it's in. We figure this out
1063	 * using the reservation's scope flags.
1064	 */
1065
1066	/* Lock the global spin lock */
1067	lck_spin_lock(&reservations_spin);
1068
1069	switch(PMC_FLAG_SCOPE(resv->flags)) {
1070
1071		case PMC_FLAG_SCOPE_SYSTEM:
1072			pmc_internal_reservation_dequeue(system_reservations, resv);
1073			system_reservation_count--;
1074
1075			lck_spin_unlock(&reservations_spin);
1076
1077			break;
1078
1079		case PMC_FLAG_SCOPE_TASK:
1080			/* remove from the global queue */
1081			pmc_internal_reservation_dequeue(task_reservations, resv);
1082			task_reservation_count--;
1083
1084			/* unlock the global */
1085			lck_spin_unlock(&reservations_spin);
1086
1087			/* Recalculate task's counter mask */
1088			pmc_internal_update_task_flag(resv->task, FALSE);
1089
1090			break;
1091
1092		case PMC_FLAG_SCOPE_THREAD:
1093			pmc_internal_reservation_dequeue(thread_reservations, resv);
1094			thread_reservation_count--;
1095
1096			lck_spin_unlock(&reservations_spin);
1097
1098			/* recalculate the thread's counter mask */
1099			pmc_internal_update_thread_flag(resv->thread, FALSE);
1100
1101			break;
1102	}
1103}
1104
1105/* Reservation State Machine
1106 *
1107 * The PMC subsystem uses a 3-tuple of state information packed into a 32-bit quantity and a
1108 * set of 9 events to provide MP-safe bookkeeping and control flow.  The 3-tuple is comprised
1109 * of a state, a count of active contexts, and a set of modifier flags.  A state machine defines
1110 * the possible transitions at each event point given the current 3-tuple.  Atomicity is handled
1111 * by reading the current 3-tuple, applying the transformations indicated by the state machine
1112 * and then attempting to OSCompareAndSwap the transformed value.  If the OSCompareAndSwap fails,
1113 * the process is repeated until either the OSCompareAndSwap succeeds or not valid transitions are
1114 * available.
1115 *
1116 * The state machine is described using tuple notation for the current state and a related notation
1117 * for describing the transformations.  For concisness, the flag and state names are abbreviated as
1118 * follows:
1119 *
1120 * states:
1121 * S = STOP
1122 * CR = CAN_RUN
1123 * L = LOAD
1124 * R = RUN
1125 * ST = STORE
1126 * I = INTERRUPT
1127 * D = DEALLOC
1128 *
1129 * flags:
1130 *
1131 * S = STOPPING
1132 * D = DEALLOCING
1133 * I = INTERRUPTING
1134 *
1135 * The tuple notation is formed from the following pattern:
1136 *
1137 * tuple = < state, active-context-count, flags >
1138 * state = S | CR | L | R | ST | I | D
1139 * active-context-count = 0 | >0 | 1 | >1
1140 * flags = flags flag | blank
1141 * flag = S | D | I
1142 *
1143 * The transform notation is similar, but only describes the modifications made to the current state.
1144 * The notation is formed from the following pattern:
1145 *
1146 * transform = < state, active-context-count, flags >
1147 * state = S | CR | L | R | ST | I | D
1148 * active-context-count = + | - | blank
1149 * flags = flags flag | flags !flag | blank
1150 * flag = S | D | I
1151 *
1152 * And now for the state machine:
1153 * State		Start		Stop		Free		Interrupt		End Interrupt		Context In		Context Out	Load Finished		Store Finished
1154 * <CR, 0, >				<S, , >		<D, , >			<L, +, >
1155 * <D, 0, >
1156 * <D, 1, D>									< , -, !D>
1157 * <D, >1, D>									< , -, >
1158 * <I, 0, D>									<D, , !D>
1159 * <I, 0, S>	< , , !S>				< , , !SD>		<S, , !S>
1160 * <I, 0, >					< , , S>	< , , D>	<CR, , >
1161 * <L, 1, D>									<ST, -, >
1162 * <L, 1, ID>									<ST, -, >
1163 * <L, 1, IS>							< , , !SD>	<ST, -, >
1164 * <L, 1, S>	< , , !S>				< , , !SD>		<ST, -, >
1165 * <L, 1, >					< , , S>	< , , D>	< , , IS>							< , +, >	<R, , >
1166 * <L, >1, D>									< , -, >		<R, -, >
1167 * <L, >1, ID>									< , -, >		<R, -, >
1168 * <L, >1, IS>							< , , !SD>	< , -, >		<R, -, >
1169 * <L, >1, S>	< , , !S>				< , , !SD>		< , -, >		<R, -, >
1170 * <L, >1, >				< , , S>	< , , D>	< , , IS>							< , +, >		< , -, >		<R, , >
1171 * <R, 1, D>									<ST, -, >
1172 * <R, 1, ID>									<ST, -, >
1173 * <R, 1, IS>							< , , !SD>	<ST, -, >
1174 * <R, 1, S>	< , , !S>				< , , !SD>		<ST, -, >
1175 * <R, 1, >					< , , S>	< , , D>	< , , IS>							< , +, >	<ST, -, >
1176 * <R, >1, D>									< , -, >
1177 * <R, >1, ID>									< , -, >
1178 * <R, >1, IS>							< , , !SD>	< , -, >
1179 * <R, >1, S>	< , , !S>				< , , !SD>		< , -, >
1180 * <R, >1, >				< , , S>	< , , D>	< , , IS>							< , +, >		< , -, >
1181 * <S, 0, >		<CR, , >				<D, , >
1182 * <S, 1, ID>									<I, -, !I>
1183 * <S, 1, IS>							< , , !SD>	<I, -, !I>
1184 * <S, 1, S>	< , , !S>				<D, , !SD>		< , -, !S>
1185 * <S, 1, >					< , , S>	<D, , D>	<L, +, >		<CR, -, >
1186 * <S, >1, ID>									< , -, >
1187 * <S, >1, IS>							< , , !SD>	< , -, >
1188 * <S, >1, S>	< , , !S>				<D, , !SD>		< , -, >
1189 * <S, >1, >				< , , S>	<D, , D>		<L, +, >		< , -, >
1190 * <ST, 0, D>									<D, , !D>
1191 * <ST, 0, ID>									<I, , !I>
1192 * <ST, 0, IS>							< , , !SD>	<I, , !I>
1193 * <ST, 0, S>	< , , !S>				< , , !SD>		<S, , !S>
1194 * <ST, 0, >				< , , S>	< , , D>	< , , IS>							< , +, >		<CR, , >
1195 * <ST, >0, D>									< , -, >							<D, , >
1196 * <ST, >0, ID>								< , -, >							<S, , >
1197 * <ST, >0, IS>							< , , !SD>										< , -, >			<S, , >
1198 * <ST, >0, S>	< , , !S>				< , , !SD>		< , -, >							<S, , >
1199 * <ST, >0, >				< , , S>	< , , D>	< , , IS>							< , +, >		< , -, >			<L, , >
1200 */
1201
1202static uint32_t pmc_internal_reservation_next_state(uint32_t current_state, pmc_state_event_t event) {
1203	uint32_t new_state = PMC_STATE(PMC_STATE_STATE_INVALID, 0, 0);
1204
1205	switch (event) {
1206		case PMC_STATE_EVENT_START:
1207			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1208				case PMC_STATE(PMC_STATE_STATE_INTERRUPT, 0, PMC_STATE_FLAGS_STOPPING):
1209				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_STOPPING):
1210				case PMC_STATE(PMC_STATE_STATE_RUN, 0, PMC_STATE_FLAGS_STOPPING):
1211				case PMC_STATE(PMC_STATE_STATE_STOP, 0, PMC_STATE_FLAGS_STOPPING):
1212				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_STOPPING):
1213					new_state = PMC_STATE_MODIFY(current_state, 0, 0, PMC_STATE_FLAGS_STOPPING);
1214					break;
1215				case PMC_STATE(PMC_STATE_STATE_STOP, 0, 0):
1216					if (PMC_STATE_CONTEXT_COUNT(current_state) == 0) {
1217						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_CAN_RUN, 0, 0, 0);
1218					}
1219					break;
1220			}
1221			break;
1222		case PMC_STATE_EVENT_STOP:
1223			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1224				case PMC_STATE(PMC_STATE_STATE_CAN_RUN, 0, 0):
1225					new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_STOP, 0, 0, 0);
1226					break;
1227				case PMC_STATE(PMC_STATE_STATE_INTERRUPT, 0, 0):
1228				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, 0):
1229				case PMC_STATE(PMC_STATE_STATE_RUN, 0, 0):
1230				case PMC_STATE(PMC_STATE_STATE_STORE, 0, 0):
1231					new_state = PMC_STATE_MODIFY(current_state, 0, PMC_STATE_FLAGS_STOPPING, 0);
1232					break;
1233				case PMC_STATE(PMC_STATE_STATE_STOP, 0, 0):
1234					if (PMC_STATE_CONTEXT_COUNT(current_state) > 0) {
1235						new_state = PMC_STATE_MODIFY(current_state, 0, PMC_STATE_FLAGS_STOPPING, 0);
1236					}
1237					break;
1238			}
1239			break;
1240		case PMC_STATE_EVENT_FREE:
1241			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1242				case PMC_STATE(PMC_STATE_STATE_CAN_RUN, 0, 0):
1243					new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_DEALLOC, 0, 0, 0);
1244					break;
1245				case PMC_STATE(PMC_STATE_STATE_INTERRUPT, 0, PMC_STATE_FLAGS_STOPPING):
1246				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1247				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_STOPPING):
1248				case PMC_STATE(PMC_STATE_STATE_RUN, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1249				case PMC_STATE(PMC_STATE_STATE_RUN, 0, PMC_STATE_FLAGS_STOPPING):
1250				case PMC_STATE(PMC_STATE_STATE_STOP, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1251				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1252				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_STOPPING):
1253					new_state = PMC_STATE_MODIFY(current_state, 0, PMC_STATE_FLAGS_DEALLOCING, PMC_STATE_FLAGS_STOPPING);
1254					break;
1255				case PMC_STATE(PMC_STATE_STATE_INTERRUPT, 0, 0):
1256				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, 0):
1257				case PMC_STATE(PMC_STATE_STATE_RUN, 0, 0):
1258				case PMC_STATE(PMC_STATE_STATE_STORE, 0, 0):
1259					new_state = PMC_STATE_MODIFY(current_state, 0, PMC_STATE_FLAGS_DEALLOCING, 0);
1260					break;
1261				case PMC_STATE(PMC_STATE_STATE_STOP, 0, PMC_STATE_FLAGS_STOPPING):
1262					new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_DEALLOC, 0, PMC_STATE_FLAGS_DEALLOCING, PMC_STATE_FLAGS_STOPPING);
1263					break;
1264				case PMC_STATE(PMC_STATE_STATE_STOP, 0, 0):
1265					if (PMC_STATE_CONTEXT_COUNT(current_state) > 0) {
1266						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_DEALLOC, 0, PMC_STATE_FLAGS_DEALLOCING, 0);
1267					} else {
1268						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_DEALLOC, 0, 0, 0);
1269					}
1270					break;
1271			}
1272			break;
1273		case PMC_STATE_EVENT_INTERRUPT:
1274			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1275				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, 0):
1276				case PMC_STATE(PMC_STATE_STATE_RUN, 0, 0):
1277				case PMC_STATE(PMC_STATE_STATE_STORE, 0, 0):
1278					new_state = PMC_STATE_MODIFY(current_state, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING, 0);
1279					break;
1280			}
1281			break;
1282		case PMC_STATE_EVENT_END_OF_INTERRUPT:
1283			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1284				case PMC_STATE(PMC_STATE_STATE_INTERRUPT, 0, PMC_STATE_FLAGS_DEALLOCING):
1285					new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_DEALLOC, 0, 0, PMC_STATE_FLAGS_DEALLOCING);
1286					break;
1287				case PMC_STATE(PMC_STATE_STATE_INTERRUPT, 0, PMC_STATE_FLAGS_STOPPING):
1288					new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_STOP, 0, 0, PMC_STATE_FLAGS_STOPPING);
1289					break;
1290				case PMC_STATE(PMC_STATE_STATE_INTERRUPT, 0, 0):
1291					new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_CAN_RUN, 0, 0, 0);
1292					break;
1293			}
1294			break;
1295		case PMC_STATE_EVENT_CONTEXT_IN:
1296			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1297				case PMC_STATE(PMC_STATE_STATE_CAN_RUN, 0, 0):
1298					new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_LOAD, 1, 0, 0);
1299					break;
1300				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, 0):
1301				case PMC_STATE(PMC_STATE_STATE_RUN, 0, 0):
1302				case PMC_STATE(PMC_STATE_STATE_STORE, 0, 0):
1303					new_state = PMC_STATE_MODIFY(current_state, 1, 0, 0);
1304					break;
1305				case PMC_STATE(PMC_STATE_STATE_STOP, 0, 0):
1306					if (PMC_STATE_CONTEXT_COUNT(current_state) > 0) {
1307						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_LOAD, 1, 0, 0);
1308					}
1309					break;
1310			}
1311			break;
1312		case PMC_STATE_EVENT_CONTEXT_OUT:
1313			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1314				case PMC_STATE(PMC_STATE_STATE_DEALLOC, 0, PMC_STATE_FLAGS_DEALLOCING):
1315					if (PMC_STATE_CONTEXT_COUNT(current_state) > 1) {
1316						new_state = PMC_STATE_MODIFY(current_state, -1, 0, PMC_STATE_FLAGS_DEALLOCING);
1317					} else {
1318						new_state = PMC_STATE_MODIFY(current_state, -1, 0, 0);
1319					}
1320					break;
1321				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_DEALLOCING):
1322				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_DEALLOCING):
1323				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1324				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_STOPPING):
1325				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, 0):
1326					if (PMC_STATE_CONTEXT_COUNT(current_state) > 1) {
1327						new_state = PMC_STATE_MODIFY(current_state, -1, 0, 0);
1328					}
1329					break;
1330				case PMC_STATE(PMC_STATE_STATE_RUN, 0, PMC_STATE_FLAGS_DEALLOCING):
1331				case PMC_STATE(PMC_STATE_STATE_RUN, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_DEALLOCING):
1332				case PMC_STATE(PMC_STATE_STATE_RUN, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1333				case PMC_STATE(PMC_STATE_STATE_RUN, 0, PMC_STATE_FLAGS_STOPPING):
1334				case PMC_STATE(PMC_STATE_STATE_RUN, 0, 0):
1335					if (PMC_STATE_CONTEXT_COUNT(current_state) == 1) {
1336						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_STORE, -1, 0, 0);
1337					} else {
1338						new_state = PMC_STATE_MODIFY(current_state, -1, 0, 0);
1339					}
1340					break;
1341				case PMC_STATE(PMC_STATE_STATE_STOP, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_DEALLOCING):
1342				case PMC_STATE(PMC_STATE_STATE_STOP, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1343					if (PMC_STATE_CONTEXT_COUNT(current_state) == 1) {
1344						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_INTERRUPT, -1, 0, PMC_STATE_FLAGS_INTERRUPTING);
1345					} else {
1346						new_state = PMC_STATE_MODIFY(current_state, -1, 0, 0);
1347					}
1348					break;
1349				case PMC_STATE(PMC_STATE_STATE_STOP, 0, PMC_STATE_FLAGS_STOPPING):
1350					if (PMC_STATE_CONTEXT_COUNT(current_state) == 1) {
1351						new_state = PMC_STATE_MODIFY(current_state, -1, 0, PMC_STATE_FLAGS_STOPPING);
1352					} else {
1353						new_state = PMC_STATE_MODIFY(current_state, -1, 0, 0);
1354					}
1355					break;
1356				case PMC_STATE(PMC_STATE_STATE_STOP, 0, 0):
1357					if (PMC_STATE_CONTEXT_COUNT(current_state) > 0) {
1358						if (PMC_STATE_CONTEXT_COUNT(current_state) == 1) {
1359							new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_CAN_RUN, -1, 0, 0);
1360						} else {
1361							new_state = PMC_STATE_MODIFY(current_state, -1, 0, 0);
1362						}
1363					}
1364					break;
1365				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_DEALLOCING):
1366				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_DEALLOCING):
1367				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1368				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_STOPPING):
1369				case PMC_STATE(PMC_STATE_STATE_STORE, 0, 0):
1370					if (PMC_STATE_CONTEXT_COUNT(current_state) > 0) {
1371						new_state = PMC_STATE_MODIFY(current_state, -1, 0, 0);
1372					}
1373					break;
1374			}
1375			break;
1376		case PMC_STATE_EVENT_LOAD_FINISHED:
1377			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1378				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_DEALLOCING):
1379				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_DEALLOCING):
1380				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1381				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_STOPPING):
1382					if (PMC_STATE_CONTEXT_COUNT(current_state) > 1) {
1383						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_RUN, -1, 0, 0);
1384					} else {
1385						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_STORE, -1, 0, 0);
1386					}
1387					break;
1388				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, 0):
1389					new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_RUN, 0, 0, 0);
1390					break;
1391			}
1392			break;
1393		case PMC_STATE_EVENT_STORE_FINISHED:
1394			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1395				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_DEALLOCING):
1396					if (PMC_STATE_CONTEXT_COUNT(current_state) == 0) {
1397						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_DEALLOC, 0, 0, PMC_STATE_FLAGS_DEALLOCING);
1398					} else {
1399						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_DEALLOC, 0, 0, 0);
1400					}
1401					break;
1402				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_DEALLOCING):
1403				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1404					if (PMC_STATE_CONTEXT_COUNT(current_state) == 0) {
1405						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_INTERRUPT, 0, 0, PMC_STATE_FLAGS_INTERRUPTING);
1406					} else {
1407						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_STOP, 0, 0, 0);
1408					}
1409					break;
1410				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_STOPPING):
1411					if (PMC_STATE_CONTEXT_COUNT(current_state) == 0) {
1412						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_STOP, 0, 0, PMC_STATE_FLAGS_STOPPING);
1413					} else {
1414						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_STOP, 0, 0, 0);
1415					}
1416					break;
1417				case PMC_STATE(PMC_STATE_STATE_STORE, 0, 0):
1418					if (PMC_STATE_CONTEXT_COUNT(current_state) == 0) {
1419						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_CAN_RUN, 0, 0, 0);
1420					} else {
1421						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_LOAD, 0, 0, 0);
1422					}
1423					break;
1424			}
1425			break;
1426	}
1427
1428	return new_state;
1429}
1430
1431static uint32_t pmc_internal_reservation_move_for_event(pmc_reservation_t reservation, pmc_state_event_t event, pmc_state_t *old_state_out) {
1432	pmc_state_t oldState;
1433	pmc_state_t newState;
1434
1435	assert(reservation);
1436
1437	/* Determine what state change, if any, we need to do.  Keep trying until either we succeed doing a transition
1438	 * or the there is no valid move.
1439	 */
1440	do {
1441		oldState = reservation->state;
1442		newState = pmc_internal_reservation_next_state(oldState, event);
1443	} while (newState != PMC_STATE_INVALID && !OSCompareAndSwap(oldState, newState, &(reservation->state)));
1444
1445	if (newState != PMC_STATE_INVALID) {
1446		COUNTER_DEBUG("Moved reservation %p from state "PMC_STATE_FORMAT" to state "PMC_STATE_FORMAT" for event %s\n", reservation, PMC_STATE_ARGS(oldState), PMC_STATE_ARGS(newState), pmc_state_event_name(event));
1447	} else {
1448		COUNTER_DEBUG("No valid moves for reservation %p in state "PMC_STATE_FORMAT" for event %s\n", reservation, PMC_STATE_ARGS(oldState), pmc_state_event_name(event));
1449	}
1450
1451	if (old_state_out != NULL) {
1452		*old_state_out = oldState;
1453	}
1454
1455	return newState;
1456}
1457
1458static void pmc_internal_reservation_context_out(pmc_reservation_t reservation) {
1459	assert(reservation);
1460	pmc_state_t newState;
1461	pmc_state_t oldState;
1462
1463	/* Clear that the this reservation was active when this cpu did its last context in */
1464	OSBitAndAtomic(~(1U << cpu_number()), &(reservation->active_last_context_in));
1465
1466	/* Move the state machine */
1467	if (PMC_STATE_INVALID == (newState = pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_CONTEXT_OUT, &oldState))) {
1468		return;
1469	}
1470
1471	/* Do any actions required based on the state change */
1472	if (PMC_STATE_STATE(newState) == PMC_STATE_STATE_STORE && PMC_STATE_STATE(oldState) != PMC_STATE_STATE_STORE) {
1473		/* Just moved into STORE, so store the reservation. */
1474		pmc_internal_reservation_store(reservation);
1475	} else if (PMC_STATE_STATE(newState) == PMC_STATE_STATE_DEALLOC && PMC_STATE_CONTEXT_COUNT(newState) == 0 && PMC_STATE_FLAGS(newState) == 0) {
1476		/* Wakeup any thread blocking for this reservation to hit <DEALLOC, 0, > */
1477		thread_wakeup((event_t)reservation);
1478	}
1479
1480}
1481
1482static void pmc_internal_reservation_context_in(pmc_reservation_t reservation) {
1483	assert(reservation);
1484	pmc_state_t oldState;
1485	pmc_state_t newState;
1486
1487	/* Move the state machine */
1488	if (PMC_STATE_INVALID == (newState = pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_CONTEXT_IN, &oldState))) {
1489		return;
1490	}
1491
1492	/* Mark that the reservation was active when this cpu did its last context in */
1493	OSBitOrAtomic(1U << cpu_number(), &(reservation->active_last_context_in));
1494
1495	/* Do any actions required based on the state change */
1496	if (PMC_STATE_STATE(newState) == PMC_STATE_STATE_LOAD && PMC_STATE_STATE(oldState) != PMC_STATE_STATE_LOAD) {
1497		/* Just moved into LOAD, so load the reservation. */
1498		pmc_internal_reservation_load(reservation);
1499	}
1500
1501}
1502
1503static void pmc_internal_reservation_store(pmc_reservation_t reservation) {
1504	assert(reservation);
1505	assert(PMC_STATE_STATE(reservation->state) == PMC_STATE_STATE_STORE);
1506
1507	assert(reservation->pmc);
1508	assert(reservation->config);
1509
1510	pmc_state_t newState;
1511	kern_return_t ret = KERN_SUCCESS;
1512
1513	pmc_t store_pmc = reservation->pmc;
1514	pmc_object_t store_pmc_obj = store_pmc->object;
1515	perf_monitor_t store_pm = store_pmc->monitor;
1516
1517	/*
1518	 * Instruct the Perf Monitor that contains this counter to turn
1519	 * off the global disable for this counter.
1520	 */
1521	ret = store_pm->methods.disable_counters(store_pm->object, &store_pmc_obj, 1);
1522	if(KERN_SUCCESS != ret) {
1523		COUNTER_DEBUG(" [error] disable_counters: 0x%x\n", ret);
1524		return;
1525	}
1526
1527	/* Instruct the counter to disable itself */
1528	ret = store_pmc->methods.disable(store_pmc_obj);
1529	if(KERN_SUCCESS != ret) {
1530		COUNTER_DEBUG("  [error] disable: 0x%x\n", ret);
1531	}
1532
1533	/* store the counter value into the reservation's stored count */
1534	ret = store_pmc->methods.get_count(store_pmc_obj, &reservation->value);
1535	if(KERN_SUCCESS != ret) {
1536		COUNTER_DEBUG("  [error] get_count: 0x%x\n", ret);
1537		return;
1538	}
1539
1540	/* Advance the state machine now that the STORE is finished */
1541	if (PMC_STATE_INVALID == (newState = pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_STORE_FINISHED, NULL))) {
1542		return;
1543	}
1544
1545	/* Do any actions required based on the state change */
1546	if (PMC_STATE_STATE(newState) == PMC_STATE_STATE_LOAD) {
1547		/* Just moved into LOAD, so load the reservation. */
1548		pmc_internal_reservation_load(reservation);
1549	} else if (PMC_STATE_STATE(newState) == PMC_STATE_STATE_DEALLOC && PMC_STATE_CONTEXT_COUNT(newState) == 0 && PMC_STATE_FLAGS(newState) == 0) {
1550		/* Wakeup any thread blocking for this reservation to hit <DEALLOC, 0, > */
1551		thread_wakeup((event_t)reservation);
1552	}
1553
1554}
1555
1556static void pmc_internal_reservation_load(pmc_reservation_t reservation) {
1557	assert(reservation);
1558	assert(PMC_STATE_STATE(reservation->state) == PMC_STATE_STATE_LOAD);
1559
1560	pmc_state_t newState;
1561	kern_return_t ret = KERN_SUCCESS;
1562
1563	assert(reservation->pmc);
1564	assert(reservation->config);
1565
1566	pmc_t load_pmc = reservation->pmc;
1567	pmc_object_t load_pmc_obj = load_pmc->object;
1568	perf_monitor_t load_pm = load_pmc->monitor;
1569
1570	/* Set the control register up with the stored configuration */
1571	ret = load_pmc->methods.set_config(load_pmc_obj, reservation->config->object);
1572	if(KERN_SUCCESS != ret) {
1573		COUNTER_DEBUG("  [error] set_config: 0x%x\n", ret);
1574		return;
1575	}
1576
1577	/* load the counter value */
1578	ret = load_pmc->methods.set_count(load_pmc_obj, reservation->value);
1579	if(KERN_SUCCESS != ret) {
1580		COUNTER_DEBUG("  [error] set_count: 0x%x\n", ret);
1581		return;
1582	}
1583
1584	/* Locally enable the counter */
1585	ret = load_pmc->methods.enable(load_pmc_obj);
1586	if(KERN_SUCCESS != ret) {
1587		COUNTER_DEBUG("  [error] enable: 0x%x\n", ret);
1588		return;
1589	}
1590
1591	/*
1592	 * Instruct the Perf Monitor containing the pmc to enable the
1593	 * counter.
1594	 */
1595	ret = load_pm->methods.enable_counters(load_pm->object, &load_pmc_obj, 1);
1596	if(KERN_SUCCESS != ret) {
1597		COUNTER_DEBUG("  [error] enable_counters: 0x%x\n", ret);
1598		/* not on the hardware. */
1599		return;
1600	}
1601
1602	/* Advance the state machine now that the STORE is finished */
1603	if (PMC_STATE_INVALID == (newState = pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_LOAD_FINISHED, NULL))) {
1604		return;
1605	}
1606
1607	/* Do any actions required based on the state change */
1608	if (PMC_STATE_STATE(newState) == PMC_STATE_STATE_STORE) {
1609		/* Just moved into STORE, so store the reservation. */
1610		pmc_internal_reservation_store(reservation);
1611	}
1612
1613}
1614
1615/*
1616 * pmc_accessible_from_core will return TRUE if the given @pmc is directly
1617 * (e.g., hardware) readable from the given logical core.
1618 *
1619 * NOTE: This method is interrupt safe.
1620 */
1621static inline boolean_t pmc_accessible_from_core(pmc_t pmc, uint32_t logicalCore) {
1622	boolean_t ret = FALSE;
1623
1624	assert(pmc);
1625
1626	ret = pmc->methods.accessible_from_core(pmc->object, logicalCore);
1627
1628	return ret;
1629}
1630
1631static void pmc_internal_reservation_start_cpu(void * arg) {
1632	pmc_reservation_t reservation = (pmc_reservation_t)arg;
1633
1634	assert(reservation);
1635
1636
1637	if (pmc_internal_reservation_matches_context(reservation)) {
1638		/* We are in context, but the reservation may have already had the context_in method run.  Attempt
1639		 * to set this cpu's bit in the active_last_context_in mask.  If we set it, call context_in.
1640		 */
1641		uint32_t oldMask = OSBitOrAtomic(1U << cpu_number(), &(reservation->active_last_context_in));
1642
1643		if ((oldMask & (1U << cpu_number())) == 0) {
1644			COUNTER_DEBUG("Starting already in-context reservation %p for cpu %d\n", reservation, cpu_number());
1645
1646			pmc_internal_reservation_context_in(reservation);
1647		}
1648	}
1649}
1650
1651static void pmc_internal_reservation_stop_cpu(void * arg) {
1652	pmc_reservation_t reservation = (pmc_reservation_t)arg;
1653
1654	assert(reservation);
1655
1656
1657	if (pmc_internal_reservation_matches_context(reservation)) {
1658		COUNTER_DEBUG("Stopping in-context reservation %p for cpu %d\n", reservation, cpu_number());
1659
1660		pmc_internal_reservation_context_out(reservation);
1661	}
1662}
1663
1664/*!fn
1665 * pmc_reservation_interrupt is called when a PMC reservation which was setup
1666 * with an interrupt threshold counts the requested number of events. When the
1667 * underlying counter hits the threshold, an interrupt is generated, and this
1668 * method is called. This method marks the reservation as stopped, and passes
1669 * control off to the user-registered callback method, along with the
1670 * reservation (so that the user can, for example, write a 0 to the counter, and
1671 * restart the reservation).
1672 * This method assumes the reservation has a valid pmc_config_t within.
1673 *
1674 * @param target The pmc_reservation_t that caused the interrupt.
1675 * @param refCon User specified reference constant.
1676 */
1677static void pmc_reservation_interrupt(void *target, void *refCon) {
1678	pmc_reservation_t reservation = (pmc_reservation_t)target;
1679	pmc_state_t newState;
1680	uint64_t timeout;
1681	uint32_t spins;
1682
1683	assert(reservation);
1684
1685	/* Move the state machine */
1686	if (PMC_STATE_INVALID == pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_INTERRUPT, NULL)) {
1687		return;
1688	}
1689
1690	/* A valid state move has been made, but won't be picked up until a context switch occurs.  To cause matching
1691	 * contexts that are currently running to update, we do an inter-processor message to run pmc_internal_reservation_stop_cpu
1692	 * on every cpu that can access the PMC.
1693	 */
1694	pmc_internal_reservation_broadcast(reservation, pmc_internal_reservation_stop_cpu);
1695
1696	/* Spin waiting for the state to turn to INTERRUPT */
1697	nanoseconds_to_absolutetime(PMC_SPIN_TIMEOUT_US * 1000, &timeout);
1698	timeout += mach_absolute_time();
1699	spins = 0;
1700	while (PMC_STATE_STATE(reservation->state) != PMC_STATE_STATE_INTERRUPT) {
1701		/* Assert if this takes longer than PMC_SPIN_TIMEOUT_US */
1702		if (++spins > PMC_SPIN_THRESHOLD) {
1703			if (mach_absolute_time() > timeout) {
1704				pmc_spin_timeout_count++;
1705				assert(0);
1706			}
1707		}
1708
1709		cpu_pause();
1710	}
1711
1712	assert(reservation->config);
1713	assert(reservation->config->method);
1714
1715	/* Call the registered callback handler */
1716#if DEBUG_COUNTERS
1717	uint64_t start = mach_absolute_time();
1718#endif /* DEBUG */
1719
1720	(void)reservation->config->method(reservation, refCon);
1721
1722#if DEBUG_COUNTERS
1723	uint64_t end = mach_absolute_time();
1724	if((end - start) > 5000ULL) {
1725		kprintf("%s - user method %p took %llu ns\n", __FUNCTION__,
1726				reservation->config->method, (end - start));
1727	}
1728#endif
1729
1730	/* Move the state machine */
1731	if (PMC_STATE_INVALID == (newState = pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_END_OF_INTERRUPT, NULL))) {
1732		return;
1733	}
1734
1735	/* Do any post-move actions necessary */
1736	if (PMC_STATE_STATE(newState) == PMC_STATE_STATE_CAN_RUN) {
1737		pmc_internal_reservation_broadcast(reservation, pmc_internal_reservation_start_cpu);
1738	} else if (PMC_STATE_STATE(newState) == PMC_STATE_STATE_DEALLOC && PMC_STATE_CONTEXT_COUNT(newState) == 0 && PMC_STATE_FLAGS(newState) == 0) {
1739		/* Wakeup any thread blocking for this reservation to hit <DEALLOC, 0, > */
1740		thread_wakeup((event_t)reservation);
1741	}
1742}
1743
1744/*
1745 * Apple-private KPI for Apple kext's (IOProfileFamily) only
1746 */
1747
1748#if 0
1749#pragma mark -
1750#pragma mark IOProfileFamily private KPI
1751#endif
1752
1753/*
1754 * perf_monitor_register registers a new Performance Monitor, and its associated
1755 * callback methods.  The given perf_monitor_object_t is the first argument to
1756 * each callback when they are called.
1757 */
1758kern_return_t perf_monitor_register(perf_monitor_object_t monitor,
1759	perf_monitor_methods_t *methods) {
1760	int cpu = -1;
1761
1762	COUNTER_DEBUG("registering perf monitor %p\n", monitor);
1763
1764	if(!monitor || !methods) {
1765		return KERN_INVALID_ARGUMENT;
1766	}
1767
1768	/* Protect against out-of-date driver kexts */
1769	if(MACH_PERFMON_METHODS_VERSION != methods->perf_monitor_methods_version) {
1770		return KERN_INVALID_ARGUMENT;
1771	}
1772
1773	/* If the monitor requires idle notifications, ensure that it is
1774	 * accessible from a single core only.
1775	 */
1776	if (methods->flags & PERFMON_FLAG_REQUIRES_IDLE_NOTIFICATIONS) {
1777		uint32_t *cores;
1778		size_t core_cnt;
1779
1780		if (KERN_SUCCESS == methods->accessible_cores(monitor, &cores, &core_cnt)) {
1781			/*
1782			 * Guard against disabled cores - monitors will always match and
1783			 * attempt registration, irrespective of 'cpus=x' boot-arg.
1784			 */
1785			if ((core_cnt == 1) && (cores[0] < (uint32_t)ml_get_max_cpus())) {
1786				cpu = cores[0];
1787			} else {
1788				return KERN_INVALID_ARGUMENT;
1789			}
1790		}
1791	}
1792
1793	/* All methods are required */
1794	if(!methods->accessible_cores |
1795	   !methods->enable_counters || !methods->disable_counters ||
1796	   !methods->on_idle || !methods->on_idle_exit) {
1797		return KERN_INVALID_ARGUMENT;
1798	}
1799
1800	/* prevent dupes. */
1801	perf_monitor_t dupe = perf_monitor_find(monitor);
1802	if(dupe) {
1803		COUNTER_DEBUG("Duplicate registration for %p\n", monitor);
1804		perf_monitor_deallocate(dupe);
1805		return KERN_FAILURE;
1806	}
1807
1808	perf_monitor_t pm = perf_monitor_alloc();
1809	if(!pm) {
1810		return KERN_RESOURCE_SHORTAGE;
1811	}
1812
1813	/* initialize the object */
1814	perf_monitor_init(pm, cpu);
1815
1816	/* copy in the registration info */
1817	pm->object = monitor;
1818	memcpy(&(pm->methods), methods, sizeof(perf_monitor_methods_t));
1819
1820	/* place it in the tracking queues */
1821	perf_monitor_enqueue(pm);
1822
1823	/* debug it */
1824	PRINT_PERF_MON(pm);
1825
1826	return KERN_SUCCESS;
1827}
1828
1829/*
1830 * perf_monitor_unregister unregisters a previously registered Perf Monitor,
1831 * looking it up by reference pointer (the same that was used in
1832 * perf_monitor_register()).
1833 */
1834kern_return_t perf_monitor_unregister(perf_monitor_object_t monitor) {
1835	kern_return_t ret = KERN_FAILURE;
1836
1837	COUNTER_DEBUG("unregistering perf monitor %p\n", monitor);
1838
1839	if(!monitor) {
1840		return KERN_INVALID_ARGUMENT;
1841	}
1842
1843	perf_monitor_t pm = perf_monitor_find(monitor);
1844	if(pm) {
1845		/* Remove it from the queues. */
1846		perf_monitor_dequeue(pm);
1847
1848		/* drop extra retain from find */
1849		perf_monitor_deallocate(pm);
1850
1851		/* and release the object */
1852		perf_monitor_deallocate(pm);
1853
1854		ret = KERN_SUCCESS;
1855	} else {
1856		COUNTER_DEBUG("could not find a registered pm that matches!\n");
1857	}
1858
1859	return ret;
1860}
1861
1862/*
1863 * pmc_register registers a new PMC for use with the pmc subsystem. Each PMC is
1864 * associated with a Perf Monitor.  Perf Monitors are looked up by the reference
1865 * pointer that was used to previously register them.
1866 *
1867 * PMCs are registered with a reference pointer (@pmc_object), and a set of
1868 * callback methods.  When the given callback methods are called from xnu, the
1869 * first argument will always be the reference pointer used to register the PMC.
1870 *
1871 * NOTE: @monitor must have been successfully registered via
1872 * perf_monitor_register before this method will succeed.
1873 */
1874kern_return_t pmc_register(perf_monitor_object_t monitor, pmc_object_t pmc_object,
1875	pmc_methods_t *methods, void *object) {
1876
1877	COUNTER_DEBUG("%p %p\n", monitor, pmc_object);
1878
1879	if(!monitor || !pmc_object || !methods || !object) {
1880		return KERN_INVALID_ARGUMENT;
1881	}
1882
1883	/* Prevent version mismatches */
1884	if(MACH_PMC_METHODS_VERSION != methods->pmc_methods_version) {
1885		COUNTER_DEBUG("version mismatch\n");
1886		return KERN_INVALID_ARGUMENT;
1887	}
1888
1889	/* All methods are required. */
1890	if(!methods->create_config ||
1891		!methods->free_config ||
1892		!methods->config_set_value ||
1893		!methods->config_set_threshold ||
1894		!methods->config_set_handler ||
1895		!methods->set_config ||
1896		!methods->get_monitor ||
1897		!methods->get_name ||
1898		!methods->accessible_from_core ||
1899		!methods->accessible_cores ||
1900		!methods->get_count ||
1901		!methods->set_count ||
1902		!methods->disable ||
1903		!methods->enable ||
1904		!methods->open ||
1905		!methods->close) {
1906		return KERN_INVALID_ARGUMENT;
1907	}
1908
1909	/* make sure this perf monitor object is already registered */
1910	/*
1911	 * NOTE: this adds a reference to the parent, so we'll have to drop it in
1912	 * any failure code paths from here on out.
1913	 */
1914	perf_monitor_t pm = perf_monitor_find(monitor);
1915	if(!pm) {
1916		COUNTER_DEBUG("Could not find perf monitor for %p\n", monitor);
1917		return KERN_INVALID_ARGUMENT;
1918	}
1919
1920	/* make a new pmc */
1921	pmc_t pmc = pmc_alloc();
1922	if(!pmc) {
1923		/* drop the extra reference from perf_monitor_find() */
1924		perf_monitor_deallocate(pm);
1925		return KERN_RESOURCE_SHORTAGE;
1926	}
1927
1928	/* init it */
1929	pmc_init(pmc);
1930
1931	pmc->object = pmc_object;
1932	pmc->open_object = object;
1933
1934	/* copy the callbacks in */
1935	memcpy(&(pmc->methods), methods, sizeof(pmc_methods_t));
1936
1937	pmc->monitor = pm;
1938
1939	perf_monitor_add_pmc(pmc->monitor, pmc);
1940
1941	/* enqueue it in our tracking queue */
1942	pmc_enqueue(pmc);
1943
1944	/* drop extra reference from perf_monitor_find() */
1945	perf_monitor_deallocate(pm);
1946
1947	return KERN_SUCCESS;
1948}
1949
1950/*
1951 * pmc_unregister unregisters a previously registered PMC, looking it up by
1952 * reference point to *both* the Perf Monitor it was created with, and the PMC's
1953 * reference pointer itself.
1954 */
1955kern_return_t pmc_unregister(perf_monitor_object_t monitor, pmc_object_t pmc_object) {
1956	COUNTER_DEBUG("%p %p\n", monitor, pmc_object);
1957
1958	if(!monitor || !pmc_object) {
1959		return KERN_INVALID_ARGUMENT;
1960	}
1961
1962	pmc_t pmc = pmc_find(pmc_object);
1963	if(!pmc) {
1964		COUNTER_DEBUG("Could not find a matching pmc.\n");
1965		return KERN_FAILURE;
1966	}
1967
1968	/* remove it from the global queue */
1969	pmc_dequeue(pmc);
1970
1971	perf_monitor_remove_pmc(pmc->monitor, pmc);
1972
1973	/* remove extra reference count from pmc_find() */
1974	pmc_deallocate(pmc);
1975
1976	/* dealloc the pmc */
1977	pmc_deallocate(pmc);
1978
1979	return KERN_SUCCESS;
1980}
1981
1982static void perf_monitor_reservation_add(perf_monitor_t monitor) {
1983    assert(monitor);
1984    OSIncrementAtomic(&(monitor->reservedCounters));
1985}
1986
1987static void perf_monitor_reservation_remove(perf_monitor_t monitor) {
1988    assert(monitor);
1989    OSDecrementAtomic(&(monitor->reservedCounters));
1990}
1991
1992#if 0
1993#pragma mark -
1994#pragma mark KPI
1995#endif
1996
1997/*
1998 * Begin in-kernel and in-kext KPI methods
1999 */
2000
2001/*
2002 * pmc_create_config creates a new configuration area from a given @pmc.
2003 *
2004 * NOTE: This method is not interrupt safe.
2005 */
2006kern_return_t pmc_create_config(pmc_t pmc, pmc_config_t *config) {
2007	pmc_config_t tmp = NULL;
2008
2009	if(!pmc || !config) {
2010		return KERN_INVALID_ARGUMENT;
2011	}
2012
2013	pmc_reference(pmc);
2014
2015	tmp = pmc_config_alloc(pmc);
2016	if(tmp) {
2017		tmp->object = pmc->methods.create_config(pmc->object);
2018
2019		if(!tmp->object) {
2020			pmc_config_free(pmc, tmp);
2021			tmp = NULL;
2022		} else {
2023			tmp->interrupt_after_value = 0ULL;
2024			tmp->method = NULL;
2025			tmp->refCon = NULL;
2026		}
2027	}
2028
2029	pmc_deallocate(pmc);
2030
2031	if(!tmp) {
2032		return KERN_RESOURCE_SHORTAGE;
2033	}
2034
2035	*config = tmp;
2036
2037	return KERN_SUCCESS;
2038}
2039
2040/*
2041 * pmc_free_config frees a configuration area created from a given @pmc
2042 *
2043 * NOTE: This method is not interrupt safe.
2044 */
2045void pmc_free_config(pmc_t pmc, pmc_config_t config) {
2046	assert(pmc);
2047	assert(config);
2048
2049	pmc_reference(pmc);
2050
2051	pmc_config_free(pmc, config);
2052
2053	pmc_deallocate(pmc);
2054}
2055
2056/*
2057 * pmc_config_set_value sets up configuration area key-value pairs.  These pairs
2058 * are to be either pre-known, or looked up via CoreProfile.framework.
2059 *
2060 * NOTE: This method is not interrupt safe.
2061 */
2062kern_return_t pmc_config_set_value(pmc_t pmc, pmc_config_t config,
2063	uint8_t id, uint64_t value) {
2064
2065	kern_return_t ret = KERN_INVALID_ARGUMENT;
2066
2067	if(!pmc || !config) {
2068		return ret;
2069	}
2070
2071	pmc_reference(pmc);
2072
2073	ret = pmc->methods.config_set_value(config->object, id, value);
2074
2075	pmc_deallocate(pmc);
2076
2077	return ret;
2078}
2079
2080/*
2081 * pmc_config_set_interrupt_threshold modifies a config object, instructing
2082 * the pmc that it should generate a call to the given pmc_interrupt_method_t
2083 * after the counter counts @threshold events.
2084 *
2085 * PMC Threshold handler methods will have the pmc_reservation_t that generated the interrupt
2086 * as the first argument when the interrupt handler is invoked, and the given
2087 * @refCon (which may be NULL) as the second.
2088 *
2089 * See pmc_interrupt_method_t.
2090 *
2091 * NOTE: This method is not interrupt safe.
2092 */
2093kern_return_t pmc_config_set_interrupt_threshold(pmc_t pmc, pmc_config_t config,
2094	uint64_t threshold, pmc_interrupt_method_t method, void *refCon) {
2095	kern_return_t ret = KERN_INVALID_ARGUMENT;
2096
2097	if(!config || !pmc) {
2098		return ret;
2099	}
2100
2101	assert(config);
2102	assert(pmc);
2103
2104	pmc_reference(pmc);
2105
2106	do {
2107		/*
2108		 * We have a minor annoyance to side-step here. The driver layer expects
2109		 * the config to never change once a reservation has been taken out with
2110		 * it.  However, in order to have the PMI method have the reservation as
2111		 * the first argument (in order to allow the user-method to, for
2112		 * example, write a 0 to it, and restart it), we need to create the
2113		 * pmc_reservation_t before setting it up in the config object.
2114		 * We overcome this by caching the method in the pmc_config_t stand-in,
2115		 * and mutating the pmc_config_object_t just before returning a
2116		 * reservation (in pmc_reserve() and friends, below).
2117		 */
2118
2119		/* might as well stash this away too. */
2120		config->interrupt_after_value = threshold;
2121		config->method = method;
2122		config->refCon = refCon;
2123
2124		ret = KERN_SUCCESS;
2125
2126	}while(0);
2127
2128	pmc_deallocate(pmc);
2129
2130	return ret;
2131}
2132
2133/*
2134 * pmc_get_pmc_list returns an allocated list of pmc_t's, as well as the number
2135 * of pmc_t's returned. Callers should free this list with a call to
2136 * pmc_free_pmc_list().
2137 *
2138 * NOTE: This method is not interrupt safe.
2139 */
2140kern_return_t pmc_get_pmc_list(pmc_t **pmcs, size_t *pmcCount) {
2141	pmc_t *array = NULL;
2142	pmc_t pmc = NULL;
2143	size_t count = 0UL;
2144
2145	do {
2146		/* Copy down (to the stack) the count of perf counters */
2147		vm_size_t size = perf_counters_count;
2148
2149		/* Allocate that sized chunk */
2150		array = (pmc_t *)kalloc(sizeof(pmc_t) * size);
2151		if(!array) {
2152			return KERN_RESOURCE_SHORTAGE;
2153		}
2154
2155		/* Take the spin lock */
2156		lck_spin_lock(&perf_counters_queue_spin);
2157
2158		/* verify the size didn't change while we were allocating */
2159		if(size != perf_counters_count) {
2160			/*
2161			 * queue size has changed between alloc and now - go back and
2162			 * make another pass.
2163			 */
2164
2165			/* drop the lock */
2166			lck_spin_unlock(&perf_counters_queue_spin);
2167
2168			/* free the block */
2169			kfree(array, sizeof(pmc_t) * size);
2170			array = NULL;
2171		}
2172
2173		/* if we get here, and array is NULL, we try again. */
2174	}while(!array);
2175
2176	/* copy the bits out */
2177	queue_iterate(perf_counters_queue, pmc, pmc_t, link) {
2178		/* copy out the pointer */
2179		array[count++] = pmc;
2180	}
2181
2182	lck_spin_unlock(&perf_counters_queue_spin);
2183
2184	/* return the list and the size */
2185	*pmcs = array;
2186	*pmcCount = count;
2187
2188	return KERN_SUCCESS;
2189}
2190
2191/*
2192 * pmc_free_pmc_list frees an array of pmc_t that has been returned from
2193 * pmc_get_pmc_list.
2194 *
2195 * NOTE: This method is not interrupt safe.
2196 */
2197void pmc_free_pmc_list(pmc_t *pmcs, size_t pmcCount) {
2198	if(pmcs && pmcCount) {
2199		COUNTER_DEBUG("pmcs: %p pmcCount: %lu\n", pmcs, pmcCount);
2200
2201		kfree(pmcs, pmcCount * sizeof(pmc_t));
2202	}
2203}
2204
2205kern_return_t pmc_find_by_name(const char *name, pmc_t **pmcs, size_t *pmcCount) {
2206	kern_return_t ret = KERN_INVALID_ARGUMENT;
2207
2208	if(!name || !pmcs || !pmcCount) {
2209		return ret;
2210	}
2211
2212	pmc_t *list = NULL;
2213	size_t count = 0UL;
2214
2215	if(KERN_SUCCESS == (ret = pmc_get_pmc_list(&list, &count))) {
2216		size_t matchCount = 0UL, ii = 0UL, swapPtr = 0UL;
2217		size_t len = strlen(name);
2218
2219		for(ii = 0UL; ii < count; ii++) {
2220			const char *pmcName = pmc_get_name(list[ii]);
2221
2222			if(strlen(pmcName) < len) {
2223				/*
2224				 * If the pmc name is shorter than the requested match, it's no
2225				 * match, as we're looking for the most specific match(es).
2226				 */
2227				continue;
2228			}
2229
2230			if(0 == strncmp(name, pmcName, len)) {
2231				pmc_t temp = list[ii];
2232
2233				// move matches to the head of the array.
2234				list[ii] = list[swapPtr];
2235				list[swapPtr] = temp;
2236				swapPtr++;
2237
2238				// keep a count of the matches
2239				matchCount++;
2240			}
2241		}
2242
2243		if(matchCount) {
2244			/*
2245			 * If we have matches, they are all at the head of the array, so
2246			 * just allocate enough space for @matchCount pmc_t's, and copy the
2247			 * head of the array to the new allocation.  Then free the old
2248			 * allocation.
2249			 */
2250
2251			pmc_t *result = (pmc_t *)kalloc(sizeof(pmc_t) * matchCount);
2252			if(result) {
2253				// copy the matches
2254				memcpy(result, list, sizeof(pmc_t) * matchCount);
2255
2256				ret = KERN_SUCCESS;
2257			}
2258
2259			pmc_free_pmc_list(list, count);
2260
2261			if(!result) {
2262				*pmcs = NULL;
2263				*pmcCount = 0UL;
2264				return KERN_RESOURCE_SHORTAGE;
2265			}
2266
2267			*pmcs = result;
2268			*pmcCount = matchCount;
2269		} else {
2270			*pmcs = NULL;
2271			*pmcCount = 0UL;
2272		}
2273	}
2274
2275	return ret;
2276}
2277
2278/*
2279 * pmc_get_name returns a pointer (not copied) to the human-readable name of the
2280 * given pmc.
2281 *
2282 * NOTE: Driver authors must take care to not allocate during this method, as
2283 * this method *IS* interrupt safe.
2284 */
2285const char *pmc_get_name(pmc_t pmc) {
2286	assert(pmc);
2287
2288	const char *name = pmc->methods.get_name(pmc->object);
2289
2290	return name;
2291}
2292
2293/*
2294 * pmc_get_accessible_core_list returns a pointer to an array of logical core
2295 * numbers (as well as the size of that array) that represent the local cores
2296 * (hardware threads) from which the given @pmc can be accessed directly.
2297 *
2298 * NOTE: This method is interrupt safe.
2299 */
2300kern_return_t pmc_get_accessible_core_list(pmc_t pmc, uint32_t **logicalCores,
2301	size_t *logicalCoreCt) {
2302
2303	kern_return_t ret = KERN_INVALID_ARGUMENT;
2304
2305	if(!pmc || !logicalCores || !logicalCoreCt) {
2306		return ret;
2307	}
2308
2309	ret = pmc->methods.accessible_cores(pmc->object, logicalCores, logicalCoreCt);
2310
2311	return ret;
2312}
2313
2314static boolean_t pmc_reservation_setup_pmi(pmc_reservation_t resv, pmc_config_t config) {
2315	assert(resv);
2316	assert(resv->pmc);
2317	assert(config);
2318	assert(config->object);
2319
2320	/* If there's no PMI to setup, return success */
2321	if(config->interrupt_after_value && config->method) {
2322
2323		/* set the threshold */
2324		kern_return_t ret = resv->pmc->methods.config_set_threshold(config->object,
2325			config->interrupt_after_value);
2326
2327		if(KERN_SUCCESS != ret) {
2328			/*
2329			 * This is the most useful error message here, as this only happens
2330			 * as a result of pmc_reserve*()
2331			 */
2332			COUNTER_DEBUG("Failed to set threshold for pmc %p\n", resv->pmc);
2333			return FALSE;
2334		}
2335
2336		if(KERN_SUCCESS != resv->pmc->methods.config_set_handler(config->object,
2337			(void *)resv, &pmc_reservation_interrupt, config->refCon)) {
2338
2339			COUNTER_DEBUG("Failed to set handler for pmc %p\n", resv->pmc);
2340			return FALSE;
2341		}
2342	}
2343
2344	return TRUE;
2345}
2346
2347/*
2348 * pmc_reserve will attempt to reserve the given @pmc, with a given
2349 * configuration object, for counting system-wide. This method will fail with
2350 * KERN_FAILURE if the given pmc is already reserved at any scope.
2351 *
2352 * This method consumes the given configuration object if it returns
2353 * KERN_SUCCESS. Any other return value indicates the caller
2354 * must free the config object via pmc_free_config().
2355 *
2356 * NOTE: This method is NOT interrupt safe.
2357 */
2358kern_return_t pmc_reserve(pmc_t pmc, pmc_config_t config,
2359	pmc_reservation_t *reservation) {
2360
2361	if(!pmc || !config || !reservation) {
2362		return KERN_INVALID_ARGUMENT;
2363	}
2364
2365	pmc_reservation_t resv = reservation_alloc();
2366	if(!resv) {
2367		return KERN_RESOURCE_SHORTAGE;
2368	}
2369
2370	reservation_init(resv);
2371
2372	resv->flags |= PMC_FLAG_SCOPE_SYSTEM;
2373	resv->config = config;
2374
2375	if(KERN_SUCCESS != pmc_internal_reservation_set_pmc(resv, pmc)) {
2376		resv->config = NULL;
2377		return KERN_FAILURE;
2378	}
2379
2380	/* enqueue reservation in proper place */
2381	if(!pmc_internal_reservation_add(resv) || !pmc_reservation_setup_pmi(resv, config)) {
2382		/* Prevent free of config object */
2383		resv->config = NULL;
2384
2385		reservation_free(resv);
2386		return KERN_FAILURE;
2387	}
2388
2389	perf_monitor_reservation_add(pmc->monitor);
2390
2391	*reservation = resv;
2392
2393	return KERN_SUCCESS;
2394}
2395
2396/*
2397 * pmc_reserve_task will attempt to reserve the given @pmc with a given
2398 * configuration object, for counting when the given @task is running on any
2399 * logical core that can directly access the given @pmc.  This method will fail
2400 * with KERN_FAILURE if the given pmc is already reserved at either system or
2401 * thread scope.
2402 *
2403 * This method consumes the given configuration object if it returns
2404 * KERN_SUCCESS. Any other return value indicates the caller
2405 * must free the config object via pmc_free_config().
2406 *
2407 * NOTE: You can reserve the same pmc for N different tasks concurrently.
2408 * NOTE: This method is NOT interrupt safe.
2409 */
2410kern_return_t pmc_reserve_task(pmc_t pmc, pmc_config_t config,
2411	task_t task, pmc_reservation_t *reservation) {
2412
2413	if(!pmc || !config || !reservation || !task) {
2414		return KERN_INVALID_ARGUMENT;
2415	}
2416
2417	if (!(pmc->monitor->methods.flags & PERFMON_FLAG_SUPPORTS_CONTEXT_SWITCHING)) {
2418		COUNTER_DEBUG("pmc %p cannot be context switched!\n", pmc);
2419		return KERN_INVALID_ARGUMENT;
2420	}
2421
2422	pmc_reservation_t resv = reservation_alloc();
2423	if(!resv) {
2424		return KERN_RESOURCE_SHORTAGE;
2425	}
2426
2427	reservation_init(resv);
2428
2429	resv->flags |= PMC_FLAG_SCOPE_TASK;
2430	resv->task = task;
2431
2432	resv->config = config;
2433
2434	if(KERN_SUCCESS != pmc_internal_reservation_set_pmc(resv, pmc)) {
2435		resv->config = NULL;
2436		return KERN_FAILURE;
2437	}
2438
2439	/* enqueue reservation in proper place */
2440	if(!pmc_internal_reservation_add(resv) || !pmc_reservation_setup_pmi(resv, config)) {
2441		/* Prevent free of config object */
2442		resv->config = NULL;
2443
2444		reservation_free(resv);
2445		return KERN_FAILURE;
2446	}
2447
2448	perf_monitor_reservation_add(pmc->monitor);
2449
2450	*reservation = resv;
2451
2452	return KERN_SUCCESS;
2453}
2454
2455/*
2456 * pmc_reserve_thread will attempt to reserve the given @pmc with a given
2457 * configuration object, for counting when the given @thread is running on any
2458 * logical core that can directly access the given @pmc.  This method will fail
2459 * with KERN_FAILURE if the given pmc is already reserved at either system or
2460 * task scope.
2461 *
2462 * This method consumes the given configuration object if it returns
2463 * KERN_SUCCESS. Any other return value indicates the caller
2464 * must free the config object via pmc_free_config().
2465 *
2466 * NOTE: You can reserve the same pmc for N different threads concurrently.
2467 * NOTE: This method is NOT interrupt safe.
2468 */
2469kern_return_t pmc_reserve_thread(pmc_t pmc, pmc_config_t config,
2470	thread_t thread, pmc_reservation_t *reservation) {
2471	if(!pmc || !config || !reservation || !thread) {
2472		return KERN_INVALID_ARGUMENT;
2473	}
2474
2475	if (!(pmc->monitor->methods.flags & PERFMON_FLAG_SUPPORTS_CONTEXT_SWITCHING)) {
2476		COUNTER_DEBUG("pmc %p cannot be context switched!\n", pmc);
2477		return KERN_INVALID_ARGUMENT;
2478	}
2479
2480	pmc_reservation_t resv = reservation_alloc();
2481	if(!resv) {
2482		return KERN_RESOURCE_SHORTAGE;
2483	}
2484
2485	reservation_init(resv);
2486
2487	resv->flags |= PMC_FLAG_SCOPE_THREAD;
2488	resv->thread = thread;
2489
2490	resv->config = config;
2491
2492	if(KERN_SUCCESS != pmc_internal_reservation_set_pmc(resv, pmc)) {
2493		resv->config = NULL;
2494		return KERN_FAILURE;
2495	}
2496
2497	/* enqueue reservation in proper place */
2498	if(!pmc_internal_reservation_add(resv) || !pmc_reservation_setup_pmi(resv, config)) {
2499		/* Prevent free of config object */
2500		resv->config = NULL;
2501
2502		reservation_free(resv);
2503		return KERN_FAILURE;
2504	}
2505
2506	perf_monitor_reservation_add(pmc->monitor);
2507
2508	*reservation = resv;
2509
2510	return KERN_SUCCESS;
2511}
2512
2513/*
2514 * pmc_reservation_start instructs the given reservation to start counting as
2515 * soon as possible.
2516 *
2517 * NOTE: This method is interrupt safe.
2518 */
2519kern_return_t pmc_reservation_start(pmc_reservation_t reservation) {
2520	pmc_state_t newState;
2521
2522	if(!reservation) {
2523		return KERN_INVALID_ARGUMENT;
2524	}
2525
2526	/* Move the state machine */
2527	if (PMC_STATE_INVALID == (newState = pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_START, NULL))) {
2528		return KERN_FAILURE;
2529	}
2530
2531	/* If we are currently in an interrupt, don't bother to broadcast since it won't do anything now and the interrupt will
2532	 * broadcast right before it leaves
2533	 */
2534	if (PMC_STATE_STATE(newState) != PMC_STATE_STATE_INTERRUPT) {
2535		/* A valid state move has been made, but won't be picked up until a context switch occurs.  To cause matching
2536		 * contexts that are currently running to update, we do an inter-processor message to run pmc_internal_reservation_start_cpu
2537		 * on every cpu that can access the PMC.
2538		 */
2539		pmc_internal_reservation_broadcast(reservation, pmc_internal_reservation_start_cpu);
2540	}
2541
2542	return KERN_SUCCESS;
2543}
2544
2545/*
2546 * pmc_reservation_stop instructs the given reservation to stop counting as
2547 * soon as possible.  When this method returns, the pmc will be marked as stopping
2548 * and subsequent calls to pmc_reservation_start will succeed.  This does not mean
2549 * that the pmc hardware has _actually_ stopped running.  Assuming no other changes
2550 * to the reservation state, the pmc hardware _will_ stop shortly.
2551 *
2552 */
2553kern_return_t pmc_reservation_stop(pmc_reservation_t reservation) {
2554	pmc_state_t newState;
2555
2556	if(!reservation) {
2557		return KERN_INVALID_ARGUMENT;
2558	}
2559
2560	/* Move the state machine */
2561	if (PMC_STATE_INVALID == (newState = pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_STOP, NULL))) {
2562		return KERN_FAILURE;
2563	}
2564
2565	/* If we are currently in an interrupt, don't bother to broadcast since it won't do anything now and the interrupt will
2566	 * broadcast right before it leaves.  Similarly, if we just moved directly to STOP, don't bother broadcasting.
2567	 */
2568	if (PMC_STATE_STATE(newState) != PMC_STATE_STATE_INTERRUPT && PMC_STATE_STATE(newState) != PMC_STATE_STATE_STOP) {
2569		/* A valid state move has been made, but won't be picked up until a context switch occurs.  To cause matching
2570			 * contexts that are currently running to update, we do an inter-processor message to run pmc_internal_reservation_stop_cpu
2571		 * on every cpu that can access the PMC.
2572		 */
2573
2574		pmc_internal_reservation_broadcast(reservation, pmc_internal_reservation_stop_cpu);
2575	}
2576
2577	return KERN_SUCCESS;
2578}
2579
2580/*
2581 * pmc_reservation_read will read the event count associated with a reservation.
2582 * If the caller is current executing in a context that both a) matches the
2583 * reservation's context, and b) can access the reservation's pmc directly, the
2584 * value will be read from hardware.  Otherwise, this returns the reservation's
2585 * stored value.
2586 *
2587 * NOTE: This method is interrupt safe.
2588 * NOTE: When not on the interrupt stack, this method may block.
2589 */
2590kern_return_t pmc_reservation_read(pmc_reservation_t reservation, uint64_t *value) {
2591	kern_return_t ret = KERN_FAILURE;
2592	uint64_t timeout;
2593	uint32_t spins;
2594
2595	if(!reservation || !value) {
2596		return KERN_INVALID_ARGUMENT;
2597	}
2598
2599	nanoseconds_to_absolutetime(PMC_SPIN_TIMEOUT_US * 1000, &timeout);
2600	timeout += mach_absolute_time();
2601	spins = 0;
2602	do {
2603		uint32_t state = reservation->state;
2604
2605		if((PMC_STATE_STATE(state) == PMC_STATE_STATE_RUN)) {
2606			/* Attempt read from hardware via drivers. */
2607
2608			assert(reservation->pmc);
2609
2610			ret = reservation->pmc->methods.get_count(reservation->pmc->object, value);
2611
2612			break;
2613		} else if ((PMC_STATE_STATE(state) == PMC_STATE_STATE_STORE) ||
2614				   (PMC_STATE_STATE(state) == PMC_STATE_STATE_LOAD)) {
2615			/* Spin */
2616			/* Assert if this takes longer than PMC_SPIN_TIMEOUT_US */
2617			if (++spins > PMC_SPIN_THRESHOLD) {
2618				if (mach_absolute_time() > timeout) {
2619					pmc_spin_timeout_count++;
2620					assert(0);
2621				}
2622			}
2623
2624			cpu_pause();
2625		} else {
2626			break;
2627		}
2628	} while (1);
2629
2630	/* If the direct hardware read failed (for whatever reason) */
2631	if(KERN_SUCCESS != ret) {
2632		/* Read stored value */
2633		*value = reservation->value;
2634	}
2635
2636	return KERN_SUCCESS;
2637}
2638
2639/*
2640 * pmc_reservation_write will write the event count associated with a reservation.
2641 * If the caller is current executing in a context that both a) matches the
2642 * reservation's context, and b) can access the reservation's pmc directly, the
2643 * value will be written to hardware.  Otherwise, this writes the reservation's
2644 * stored value.
2645 *
2646 * NOTE: This method is interrupt safe.
2647 * NOTE: When not on the interrupt stack, this method may block.
2648 */
2649kern_return_t pmc_reservation_write(pmc_reservation_t reservation, uint64_t value) {
2650	kern_return_t ret = KERN_FAILURE;
2651	uint64_t timeout;
2652	uint32_t spins;
2653
2654	if(!reservation) {
2655		return KERN_INVALID_ARGUMENT;
2656	}
2657
2658	nanoseconds_to_absolutetime(PMC_SPIN_TIMEOUT_US * 1000, &timeout);
2659	timeout += mach_absolute_time();
2660	spins = 0;
2661	do {
2662		uint32_t state = reservation->state;
2663
2664		if((PMC_STATE_STATE(state) == PMC_STATE_STATE_RUN)) {
2665				/* Write to hardware via drivers. */
2666			assert(reservation->pmc);
2667
2668			ret = reservation->pmc->methods.set_count(reservation->pmc->object, value);
2669			break;
2670		} else if ((PMC_STATE_STATE(state) == PMC_STATE_STATE_STORE) ||
2671				   (PMC_STATE_STATE(state) == PMC_STATE_STATE_LOAD)) {
2672			/* Spin */
2673			/* Assert if this takes longer than PMC_SPIN_TIMEOUT_US */
2674			if (++spins > PMC_SPIN_THRESHOLD) {
2675				if (mach_absolute_time() > timeout) {
2676					pmc_spin_timeout_count++;
2677					assert(0);
2678				}
2679			}
2680
2681			cpu_pause();
2682		} else {
2683			break;
2684		}
2685	} while (1);
2686
2687	if(KERN_SUCCESS != ret) {
2688		/* Write stored value */
2689		reservation->value = value;
2690	}
2691
2692	return KERN_SUCCESS;
2693}
2694
2695/*
2696 * pmc_reservation_free releases a reservation and all associated resources.
2697 *
2698 * NOTE: This method is NOT interrupt safe.
2699 */
2700kern_return_t pmc_reservation_free(pmc_reservation_t reservation) {
2701	pmc_state_t newState;
2702
2703	if(!reservation) {
2704		return KERN_INVALID_ARGUMENT;
2705	}
2706
2707	perf_monitor_reservation_remove(reservation->pmc->monitor);
2708
2709	/* Move the state machine */
2710	if (PMC_STATE_INVALID == (newState = pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_FREE, NULL))) {
2711		return KERN_FAILURE;
2712	}
2713
2714	/* If we didn't move directly to DEALLOC, help things along */
2715	if (PMC_STATE_STATE(newState) != PMC_STATE_STATE_DEALLOC) {
2716		/* A valid state move has been made, but won't be picked up until a context switch occurs.  To cause matching
2717		 * contexts that are currently running to update, we do an inter-processor message to run pmc_internal_reservation_stop_cpu
2718		 * on every cpu that can access the PMC.
2719		 */
2720		pmc_internal_reservation_broadcast(reservation, pmc_internal_reservation_stop_cpu);
2721	}
2722
2723	/* Block until the reservation hits the <DEALLOC, 0, > state */
2724	while (!(PMC_STATE_STATE(reservation->state) == PMC_STATE_STATE_DEALLOC && PMC_STATE_CONTEXT_COUNT(reservation->state) == 0 && PMC_STATE_FLAGS(reservation->state) == 0)) {
2725		assert_wait((event_t)reservation, THREAD_UNINT);
2726		thread_block(THREAD_CONTINUE_NULL);
2727	}
2728
2729	/* remove from queues */
2730	pmc_internal_reservation_remove(reservation);
2731
2732	/* free reservation */
2733	reservation_free(reservation);
2734
2735	return KERN_SUCCESS;
2736}
2737
2738/*
2739 * pmc_idle notifies eligible monitors of impending per-CPU idle, and can be used to save state.
2740 */
2741boolean_t pmc_idle(void) {
2742	perf_monitor_t monitor = NULL;
2743	queue_head_t *cpu_queue;
2744
2745	lck_spin_lock(&perf_monitor_queue_spin);
2746
2747	if (cpu_monitor_queues) {
2748		cpu_queue = cpu_monitor_queues[cpu_number()];
2749
2750		queue_iterate(cpu_queue, monitor, perf_monitor_t, cpu_link) {
2751			perf_monitor_methods_t *methods = &(monitor->methods);
2752			if ((methods->flags & PERFMON_FLAG_ALWAYS_ACTIVE) || (monitor->reservedCounters)) {
2753				methods->on_idle(monitor->object);
2754			}
2755		}
2756	}
2757
2758	lck_spin_unlock(&perf_monitor_queue_spin);
2759
2760	return TRUE;
2761}
2762
2763/*
2764 * pmc_idle_exit notifies eligible monitors of wake from idle; it can be used to restore state.
2765 */
2766boolean_t pmc_idle_exit(void) {
2767	perf_monitor_t monitor = NULL;
2768	queue_head_t *cpu_queue;
2769
2770	lck_spin_lock(&perf_monitor_queue_spin);
2771
2772	if (cpu_monitor_queues) {
2773		cpu_queue = cpu_monitor_queues[cpu_number()];
2774
2775		queue_iterate(cpu_queue, monitor, perf_monitor_t, cpu_link) {
2776			perf_monitor_methods_t *methods = &(monitor->methods);
2777			if ((methods->flags & PERFMON_FLAG_ALWAYS_ACTIVE) || (monitor->reservedCounters)) {
2778				methods->on_idle_exit(monitor->object);
2779			}
2780		}
2781	}
2782
2783	lck_spin_unlock(&perf_monitor_queue_spin);
2784
2785	return TRUE;
2786}
2787
2788/*
2789 * pmc_context_switch performs all context switching necessary to save all pmc
2790 * state associated with @oldThread (and the task to which @oldThread belongs),
2791 * as well as to restore all pmc state associated with @newThread (and the task
2792 * to which @newThread belongs).
2793 *
2794 * NOTE: This method IS interrupt safe.
2795 */
2796boolean_t pmc_context_switch(thread_t oldThread, thread_t newThread) {
2797	pmc_reservation_t resv = NULL;
2798	uint32_t cpuNum = cpu_number();
2799
2800	lck_spin_lock(&reservations_spin);
2801
2802	/* Save pmc states */
2803	if (thread_reservation_count) {
2804 		queue_iterate(thread_reservations, resv, pmc_reservation_t, link) {
2805			if ((oldThread == resv->thread) && pmc_accessible_from_core(resv->pmc, cpuNum)) {
2806				(void)pmc_internal_reservation_context_out(resv);
2807			}
2808		}
2809	}
2810
2811	if (task_reservation_count) {
2812		queue_iterate(task_reservations, resv, pmc_reservation_t, link) {
2813			if ((resv->task == oldThread->task) && pmc_accessible_from_core(resv->pmc, cpuNum)) {
2814    			(void)pmc_internal_reservation_context_out(resv);
2815			}
2816		}
2817	}
2818
2819	/* Restore */
2820	if (thread_reservation_count) {
2821		queue_iterate(thread_reservations, resv, pmc_reservation_t, link) {
2822			if ((resv->thread == newThread) && pmc_accessible_from_core(resv->pmc, cpuNum)) {
2823				(void)pmc_internal_reservation_context_in(resv);
2824			}
2825		}
2826	}
2827
2828	if (task_reservation_count) {
2829		queue_iterate(task_reservations, resv, pmc_reservation_t, link) {
2830			if ((resv->task == newThread->task) && pmc_accessible_from_core(resv->pmc, cpuNum)) {
2831				(void)pmc_internal_reservation_context_in(resv);
2832			}
2833		}
2834	}
2835
2836	lck_spin_unlock(&reservations_spin);
2837
2838	return TRUE;
2839}
2840
2841#else /* !CONFIG_COUNTERS */
2842
2843#if 0
2844#pragma mark -
2845#pragma mark Dummy functions
2846#endif
2847
2848/*
2849 * In the case that someone has chosen not to include the PMC KPI in some
2850 * configuration, we still have exports for kexts, so we'll need to define stub
2851 * methods that return failures.
2852 */
2853kern_return_t perf_monitor_register(perf_monitor_object_t monitor __unused,
2854	perf_monitor_methods_t *methods __unused) {
2855	return KERN_FAILURE;
2856}
2857
2858kern_return_t perf_monitor_unregister(perf_monitor_object_t monitor __unused) {
2859	return KERN_FAILURE;
2860}
2861
2862kern_return_t pmc_register(perf_monitor_object_t monitor __unused,
2863	pmc_object_t pmc __unused, pmc_methods_t *methods __unused, void *object __unused) {
2864	return KERN_FAILURE;
2865}
2866
2867kern_return_t pmc_unregister(perf_monitor_object_t monitor __unused,
2868	pmc_object_t pmc __unused) {
2869	return KERN_FAILURE;
2870}
2871
2872kern_return_t pmc_create_config(pmc_t pmc __unused,
2873	pmc_config_t *config __unused) {
2874	return KERN_FAILURE;
2875}
2876
2877void pmc_free_config(pmc_t pmc __unused, pmc_config_t config __unused) {
2878}
2879
2880kern_return_t pmc_config_set_value(pmc_t pmc __unused,
2881	pmc_config_t config __unused, uint8_t id __unused,
2882	uint64_t value __unused) {
2883	return KERN_FAILURE;
2884}
2885
2886kern_return_t pmc_config_set_interrupt_threshold(pmc_t pmc __unused,
2887	pmc_config_t config __unused, uint64_t threshold __unused,
2888	pmc_interrupt_method_t method __unused, void *refCon __unused) {
2889	return KERN_FAILURE;
2890}
2891
2892kern_return_t pmc_get_pmc_list(pmc_t **pmcs __unused, size_t *pmcCount __unused) {
2893	return KERN_FAILURE;
2894}
2895
2896void pmc_free_pmc_list(pmc_t *pmcs __unused, size_t pmcCount __unused) {
2897}
2898
2899kern_return_t pmc_find_by_name(const char *name __unused, pmc_t **pmcs __unused,
2900	size_t *pmcCount __unused) {
2901	return KERN_FAILURE;
2902}
2903
2904const char *pmc_get_name(pmc_t pmc __unused) {
2905	return "";
2906}
2907
2908kern_return_t pmc_get_accessible_core_list(pmc_t pmc __unused,
2909	uint32_t **logicalCores __unused, size_t *logicalCoreCt __unused) {
2910	return KERN_FAILURE;
2911}
2912
2913kern_return_t pmc_reserve(pmc_t pmc __unused,
2914	pmc_config_t config __unused, pmc_reservation_t *reservation __unused) {
2915	return KERN_FAILURE;
2916}
2917
2918kern_return_t pmc_reserve_task(pmc_t pmc __unused,
2919	pmc_config_t config __unused, task_t task __unused,
2920	pmc_reservation_t *reservation __unused) {
2921	return KERN_FAILURE;
2922}
2923
2924kern_return_t pmc_reserve_thread(pmc_t pmc __unused,
2925	pmc_config_t config __unused, thread_t thread __unused,
2926	pmc_reservation_t *reservation __unused) {
2927	return KERN_FAILURE;
2928}
2929
2930kern_return_t pmc_reservation_start(pmc_reservation_t reservation __unused) {
2931	return KERN_FAILURE;
2932}
2933
2934kern_return_t pmc_reservation_stop(pmc_reservation_t reservation __unused) {
2935	return KERN_FAILURE;
2936}
2937
2938kern_return_t pmc_reservation_read(pmc_reservation_t reservation __unused,
2939	uint64_t *value __unused) {
2940	return KERN_FAILURE;
2941}
2942
2943kern_return_t pmc_reservation_write(pmc_reservation_t reservation __unused,
2944	uint64_t value __unused) {
2945	return KERN_FAILURE;
2946}
2947
2948kern_return_t pmc_reservation_free(pmc_reservation_t reservation __unused) {
2949	return KERN_FAILURE;
2950}
2951
2952
2953#endif /* !CONFIG_COUNTERS */
2954