1/*
2 * Copyright (c) 2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24#include <kern/kalloc.h>
25#include <kern/kern_types.h>
26#include <kern/locks.h>
27#include <kern/misc_protos.h>
28#include <kern/task.h>
29#include <kern/thread.h>
30#include <kern/zalloc.h>
31#include <machine/machine_cpu.h>
32
33#include <pmc/pmc.h>
34
35#include <libkern/OSAtomic.h>
36
37#if defined(__i386__) || defined(__x86_64__)
38#include <i386/mp.h>
39#endif
40
41#if CONFIG_COUNTERS
42
43/* define to enable various debug logging */
44#undef DEBUG_COUNTERS
45
46typedef uint8_t pmc_state_event_t;
47
48#define PMC_STATE_EVENT_START				0
49#define PMC_STATE_EVENT_STOP				1
50#define PMC_STATE_EVENT_FREE				2
51#define PMC_STATE_EVENT_INTERRUPT			3
52#define PMC_STATE_EVENT_END_OF_INTERRUPT	4
53#define PMC_STATE_EVENT_CONTEXT_IN			5
54#define PMC_STATE_EVENT_CONTEXT_OUT			6
55#define PMC_STATE_EVENT_LOAD_FINISHED		7
56#define PMC_STATE_EVENT_STORE_FINISHED		8
57
58/* PMC spin timeouts */
59#define PMC_SPIN_THRESHOLD	10	/* Number of spins to allow before checking mach_absolute_time() */
60#define PMC_SPIN_TIMEOUT_US	10	/* Time in microseconds before the spin causes an assert */
61
62uint64_t pmc_spin_timeout_count = 0;	/* Number of times where a PMC spin loop causes a timeout */
63
64#ifdef DEBUG_COUNTERS
65#	include <pexpert/pexpert.h>
66#	define COUNTER_DEBUG(...) \
67	do { \
68		kprintf("[%s:%s][%u] ", __FILE__, __PRETTY_FUNCTION__, cpu_number()); \
69		kprintf(__VA_ARGS__); \
70	} while(0)
71
72#	define PRINT_PERF_MON(x)	\
73	do { \
74		kprintf("perfmon: %p (obj: %p refCt: %u switchable: %u)\n", \
75			x, x->object, x->useCount, \
76			(x->methods.flags & PERFMON_FLAG_SUPPORTS_CONTEXT_SWITCHING) ? \
77			1 : 0); \
78	} while(0)
79
80static const char * pmc_state_state_name(pmc_state_t state) {
81	switch (PMC_STATE_STATE(state)) {
82		case PMC_STATE_STATE_INVALID:
83			return "INVALID";
84		case PMC_STATE_STATE_STOP:
85			return "STOP";
86		case PMC_STATE_STATE_CAN_RUN:
87			return "CAN_RUN";
88		case PMC_STATE_STATE_LOAD:
89			return "LOAD";
90		case PMC_STATE_STATE_RUN:
91			return "RUN";
92		case PMC_STATE_STATE_STORE:
93			return "STORE";
94		case PMC_STATE_STATE_INTERRUPT:
95			return "INTERRUPT";
96		case PMC_STATE_STATE_DEALLOC:
97			return "DEALLOC";
98		default:
99			return "UNKNOWN";
100	}
101}
102
103static const char * pmc_state_event_name(pmc_state_event_t event) {
104	switch (event) {
105		case PMC_STATE_EVENT_START:
106			return "START";
107		case PMC_STATE_EVENT_STOP:
108			return "STOP";
109		case PMC_STATE_EVENT_FREE:
110			return "FREE";
111		case PMC_STATE_EVENT_INTERRUPT:
112			return "INTERRUPT";
113		case PMC_STATE_EVENT_END_OF_INTERRUPT:
114			return "END OF INTERRUPT";
115		case PMC_STATE_EVENT_CONTEXT_IN:
116			return "CONTEXT IN";
117		case PMC_STATE_EVENT_CONTEXT_OUT:
118			return "CONTEXT OUT";
119		case PMC_STATE_EVENT_LOAD_FINISHED:
120			return "LOAD_FINISHED";
121		case PMC_STATE_EVENT_STORE_FINISHED:
122			return "STORE_FINISHED";
123		default:
124			return "UNKNOWN";
125	}
126}
127
128#	define PMC_STATE_FORMAT	"<%s, %u, %s%s%s>"
129#	define PMC_STATE_ARGS(x)	pmc_state_state_name(x), PMC_STATE_CONTEXT_COUNT(x), ((PMC_STATE_FLAGS(x) & PMC_STATE_FLAGS_INTERRUPTING) ? "I" : ""), \
130					((PMC_STATE_FLAGS(x) & PMC_STATE_FLAGS_STOPPING) ? "S" : ""), ((PMC_STATE_FLAGS(x) & PMC_STATE_FLAGS_DEALLOCING) ? "D" : "")
131#else
132#	define COUNTER_DEBUG(...)
133#	define PRINT_PERF_MON(x)
134#	define PMC_STATE_FORMAT
135#	define PMC_STATE_ARGS(x)
136#endif
137
138/*!struct
139 * pmc_config is the data behind a pmc_config_t.
140 * @member object A pointer to an instance of IOPerformanceCounterConfiguration
141 * @member method A pointer to a method to call to handle PMI.
142 * @member interrupt_after_value Cause a PMI after the counter counts this many
143 * events.
144 * @member refCon Passed to the @method method as the refCon argument.
145 */
146struct pmc_config {
147	pmc_config_object_t object;
148	volatile pmc_interrupt_method_t method;
149	uint64_t interrupt_after_value;
150	void *refCon;
151};
152
153/*
154 * Allocation Zones
155 *
156 * Two allocation zones - Perf zone small and Perf zone big.
157 * Each zone has associated maximums, defined below.
158 * The small zone is the max of the smallest allocation objects (all sizes on
159 * K64):
160 *	perf_monitor_t - 48 bytes
161 *		perf_monitor_methods_t - 28 bytes
162 *	pmc_reservation_t - 48 bytes
163 *  pmc_config_t - 32 bytes
164 * perf_small_zone unit size is (on K64) 48 bytes
165 * perf_small_zone max count must be max number of perf monitors, plus (max
166 * number of reservations * 2). The "*2" is because each reservation has a
167 * pmc_config_t within.
168 *
169 * Big zone is max of the larger allocation units
170 *	pmc_t - 144 bytes
171 *		pmc_methods_t - 116 bytes
172 * perf_big_zone unit size is (on K64) 144 bytes
173 * perf_big_zone max count is the max number of PMCs we support.
174 */
175
176static zone_t perf_small_zone = NULL;
177#define MAX_PERF_SMALLS		(256 + 8196 + 8196)
178#define PERF_SMALL_UNIT_SZ	(MAX(MAX(sizeof(struct perf_monitor), \
179	sizeof(struct pmc_reservation)), sizeof(struct pmc_config)))
180
181static zone_t perf_big_zone = NULL;
182#define MAX_PERF_BIGS		(1024)
183#define PERF_BIG_UNIT_SZ	(sizeof(struct pmc))
184
185/*
186 * Locks and Lock groups
187 */
188static lck_grp_t *pmc_lock_grp = LCK_GRP_NULL;
189static lck_grp_attr_t *pmc_lock_grp_attr;
190static lck_attr_t *pmc_lock_attr;
191
192/* PMC tracking queue locks */
193
194static lck_mtx_t  cpu_monitor_queue_mutex;   /* protects per-cpu queues at initialisation time */
195static lck_spin_t perf_monitor_queue_spin;   /* protects adding and removing from queue */
196static lck_spin_t perf_counters_queue_spin;  /* protects adding and removing from queue */
197
198/* Reservation tracking queues lock */
199static lck_spin_t reservations_spin;
200
201/*
202 * Tracking queues
203 *
204 * Keeps track of registered perf monitors and perf counters
205 */
206
207static queue_head_t **cpu_monitor_queues = NULL;
208
209static queue_head_t *perf_monitors_queue = NULL;
210static volatile uint32_t perf_monitors_count = 0U;
211
212static queue_head_t *perf_counters_queue = NULL;
213static volatile uint32_t perf_counters_count = 0U;
214
215/*
216 * Reservation queues
217 *
218 * Keeps track of all system, task, and thread-level reservations (both active and
219 * inactive).
220 *
221 * We track them all here (rather than in their respective task or thread only)
222 * so that we can inspect our tracking data directly (rather than peeking at
223 * every task and thread) to determine if/when a new reservation would
224 * constitute a conflict.
225 */
226
227static queue_head_t *system_reservations = NULL;
228static volatile uint32_t system_reservation_count = 0U;
229
230static queue_head_t *task_reservations = NULL;
231static volatile uint32_t task_reservation_count = 0U;
232
233static queue_head_t *thread_reservations = NULL;
234static volatile uint32_t thread_reservation_count = 0U;
235
236#if XNU_KERNEL_PRIVATE
237
238/*
239 * init_pmc_locks creates and initializes all the locks and lock groups and lock
240 * attributes required for the pmc sub-system.
241 */
242static void init_pmc_locks(void) {
243	pmc_lock_attr = lck_attr_alloc_init();
244	assert(pmc_lock_attr);
245
246	pmc_lock_grp_attr = lck_grp_attr_alloc_init();
247	assert(pmc_lock_grp_attr);
248
249	pmc_lock_grp = lck_grp_alloc_init("pmc", pmc_lock_grp_attr);
250	assert(pmc_lock_grp);
251
252	lck_spin_init(&perf_monitor_queue_spin, pmc_lock_grp, pmc_lock_attr);
253	lck_spin_init(&perf_counters_queue_spin, pmc_lock_grp, pmc_lock_attr);
254
255	lck_spin_init(&reservations_spin, pmc_lock_grp, pmc_lock_attr);
256
257	lck_mtx_init(&cpu_monitor_queue_mutex, pmc_lock_grp, pmc_lock_attr);
258}
259
260/*
261 * init_pmc_zones initializes the allocation zones used by the pmc subsystem
262 */
263static void init_pmc_zones(void) {
264	perf_small_zone = zinit(PERF_SMALL_UNIT_SZ,
265		MAX_PERF_SMALLS * PERF_SMALL_UNIT_SZ, MAX_PERF_SMALLS,
266		"pmc.small zone");
267
268	assert(perf_small_zone);
269
270	perf_big_zone = zinit(PERF_BIG_UNIT_SZ,
271		MAX_PERF_BIGS * PERF_BIG_UNIT_SZ, MAX_PERF_BIGS,
272		"pmc.big zone");
273
274	assert(perf_big_zone);
275}
276
277/*
278 * init_pmc_queues allocates and initializes the tracking queues for
279 * registering and reserving individual pmcs and perf monitors.
280 */
281static void init_pmc_queues(void) {
282
283	perf_monitors_queue = (queue_head_t*)kalloc(sizeof(queue_head_t));
284	assert(perf_monitors_queue);
285
286	queue_init(perf_monitors_queue);
287
288	perf_counters_queue = (queue_head_t*)kalloc(sizeof(queue_head_t));
289	assert(perf_counters_queue);
290
291	queue_init(perf_counters_queue);
292
293	system_reservations = (queue_head_t*)kalloc(sizeof(queue_head_t));
294	assert(system_reservations);
295
296	queue_init(system_reservations);
297
298	task_reservations = (queue_head_t*)kalloc(sizeof(queue_head_t));
299	assert(task_reservations);
300
301	queue_init(task_reservations);
302
303	thread_reservations = (queue_head_t*)kalloc(sizeof(queue_head_t));
304	assert(thread_reservations);
305
306	queue_init(thread_reservations);
307}
308
309/*
310 * pmc_bootstrap brings up all the necessary infrastructure required to use the
311 * pmc sub-system.
312 */
313__private_extern__
314void pmc_bootstrap(void) {
315	/* build our alloc zones */
316	init_pmc_zones();
317
318	/* build the locks */
319	init_pmc_locks();
320
321	/* build our tracking queues */
322	init_pmc_queues();
323}
324
325#endif /* XNU_KERNEL_PRIVATE */
326
327/*
328 * Perf Monitor Internals
329 */
330
331static perf_monitor_t perf_monitor_alloc(void) {
332	/* perf monitors come from the perf small zone */
333	return (perf_monitor_t)zalloc(perf_small_zone);
334}
335
336static void perf_monitor_free(void *pm) {
337	zfree(perf_small_zone, pm);
338}
339
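/*
 * perf_monitor_init sets up a newly allocated perf_monitor_t: no object or
 * methods yet, a single reference held for the caller, no reserved counters,
 * and empty queue linkage.
 */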
340static void perf_monitor_init(perf_monitor_t pm, int cpu) {
341	assert(pm);
342
343	pm->object = NULL;
344
345	bzero(&(pm->methods), sizeof(perf_monitor_methods_t));
346
347	pm->useCount = 1;	/* initial retain count of 1, for caller */
348
349	pm->reservedCounters = 0;
350
351	pm->cpu = cpu;
352
353	pm->link.next = pm->link.prev = (queue_entry_t)NULL;
354	pm->cpu_link.next = pm->cpu_link.prev = (queue_entry_t)NULL;
355}
356
357/*
358 * perf_monitor_dequeue removes the given perf_monitor_t from the
359 * perf_monitor_queue, thereby unregistering it with the system.
360 */
361static void perf_monitor_dequeue(perf_monitor_t pm) {
362	lck_spin_lock(&perf_monitor_queue_spin);
363
364	if (pm->methods.flags & PERFMON_FLAG_REQUIRES_IDLE_NOTIFICATIONS) {
365		/* If this flag is set, the monitor is already validated to be
366		 * accessible from a single cpu only.
367		 */
368		queue_remove(cpu_monitor_queues[pm->cpu], pm, perf_monitor_t, cpu_link);
369	}
370
371	/*
372	 * remove the @pm object from the @perf_monitors_queue queue (it is of type
373	 * <perf_monitor_t> and has a field called @link that is the queue_link_t).
374	 */
375	queue_remove(perf_monitors_queue, pm, perf_monitor_t, link);
376
377	perf_monitors_count--;
378
379	lck_spin_unlock(&perf_monitor_queue_spin);
380}
381
382/*
383 * perf_monitor_enqueue adds the given perf_monitor_t to the perf_monitor_queue,
384 * thereby registering it for use with the system.
385 */
386static void perf_monitor_enqueue(perf_monitor_t pm) {
387
388	lck_mtx_lock(&cpu_monitor_queue_mutex);
389	lck_spin_lock(&perf_monitor_queue_spin);
390
391	if (pm->cpu >= 0) {
392		/* Deferred initialisation; saves memory and permits ml_get_max_cpus()
393		 * to block until cpu initialisation is complete.
394		 */
395		if (!cpu_monitor_queues) {
396			uint32_t max_cpus;
397			queue_head_t **queues;
398			uint32_t i;
399
400			lck_spin_unlock(&perf_monitor_queue_spin);
401
402			max_cpus = ml_get_max_cpus();
403
404			queues = (queue_head_t**)kalloc(sizeof(queue_head_t*) * max_cpus);
405			assert(queues);
406			for (i = 0; i < max_cpus; i++) {
407				queue_head_t *queue = (queue_head_t*)kalloc(sizeof(queue_head_t));
408				assert(queue);
409				queue_init(queue);
410				queues[i] = queue;
411			}
412
413			lck_spin_lock(&perf_monitor_queue_spin);
414
415			cpu_monitor_queues = queues;
416		}
417
418		queue_enter(cpu_monitor_queues[pm->cpu], pm, perf_monitor_t, cpu_link);
419	}
420
421	queue_enter(perf_monitors_queue, pm, perf_monitor_t, link);
422	perf_monitors_count++;
423
424	lck_spin_unlock(&perf_monitor_queue_spin);
425	lck_mtx_unlock(&cpu_monitor_queue_mutex);
426}
427
428/*
429 * perf_monitor_reference increments the reference count for the given
430 * perf_monitor_t.
431 */
432static void perf_monitor_reference(perf_monitor_t pm) {
433	assert(pm);
434
435	OSIncrementAtomic(&(pm->useCount));
436}
437
438/*
439 * perf_monitor_deallocate decrements the reference count for the given
440 * perf_monitor_t.  If the reference count hits 0, the object is released back
441 * to the perf_small_zone via a call to perf_monitor_free().
442 */
443static void perf_monitor_deallocate(perf_monitor_t pm) {
444	assert(pm);
445
446	/* If we just dropped the last reference */
447	if(1 == OSDecrementAtomic(&(pm->useCount))) {
448		/* Free the object */
449		perf_monitor_free(pm);
450	}
451}
452
453/*
454 * perf_monitor_find attempts to find a perf_monitor_t that corresponds to the
455 * given C++ object pointer that was used when registering with the subsystem.
456 *
457 * If found, the method returns the perf_monitor_t with an extra reference
458 * placed on the object (or NULL if not
459 * found).
460 *
461 * NOTE: Caller must use perf_monitor_deallocate to remove the extra reference after
462 * calling perf_monitor_find.
463 */
464static perf_monitor_t perf_monitor_find(perf_monitor_object_t monitor) {
465	assert(monitor);
466	perf_monitor_t element = NULL;
467	perf_monitor_t found = NULL;
468
469	lck_spin_lock(&perf_monitor_queue_spin);
470
471	queue_iterate(perf_monitors_queue, element, perf_monitor_t, link) {
472 		if(element->object == monitor) {
473			perf_monitor_reference(element);
474			found = element;
475			break;
476		}
477	}
478
479	lck_spin_unlock(&perf_monitor_queue_spin);
480
481	return found;
482}
483
484/*
485 * perf_monitor_add_pmc adds a newly registered PMC to the perf monitor it is
486 * associated with.
487 */
488
489static void perf_monitor_add_pmc(perf_monitor_t pm, pmc_t pmc __unused) {
490	assert(pm);
491	assert(pmc);
492
493	/* Today, we merely add a reference count now that a new pmc is attached */
494	perf_monitor_reference(pm);
495}
496
497/*
498 * perf_monitor_remove_pmc removes a newly *un*registered PMC from the perf
499 * monitor it is associated with.
500 */
501static void perf_monitor_remove_pmc(perf_monitor_t pm, pmc_t pmc __unused) {
502	assert(pm);
503	assert(pmc);
504
505	/* Today, we merely remove a reference count now that the pmc is detached */
506	perf_monitor_deallocate(pm);
507}
508
509/*
510 * Perf Counter internals
511 */
512
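/* pmc_t objects come from the perf big zone (see the zone sizing notes above) */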
513static pmc_t pmc_alloc(void) {
514	return (pmc_t)zalloc(perf_big_zone);
515}
516
517static void pmc_free(void *pmc) {
518	zfree(perf_big_zone, pmc);
519}
520
521/*
522 * pmc_init initializes a newly allocated pmc_t
523 */
524static void pmc_init(pmc_t pmc) {
525	assert(pmc);
526
527	pmc->object = NULL;
528	pmc->monitor = NULL;
529
530	bzero(&pmc->methods, sizeof(pmc_methods_t));
531
532	/* One reference for the caller */
533	pmc->useCount = 1;
534}
535
536/*
537 * pmc_reference increments the reference count of the given pmc_t
538 */
539static void pmc_reference(pmc_t pmc) {
540	assert(pmc);
541
542	OSIncrementAtomic(&(pmc->useCount));
543}
544
545/*
546 * pmc_deallocate decrements the reference count of the given pmc_t. If the
547 * reference count hits zero, the given pmc_t is deallocated and released back
548 * to the allocation zone.
549 */
550static void pmc_deallocate(pmc_t pmc) {
551	assert(pmc);
552
553	/* If we just dropped the last reference */
554	if(1 == OSDecrementAtomic(&(pmc->useCount))) {
555		/* Free the pmc */
556		pmc_free(pmc);
557	}
558}
559
560/*
561 * pmc_dequeue removes the given, newly *un*registered pmc from the
562 * perf_counters_queue.
563 */
564static void pmc_dequeue(pmc_t pmc) {
565	lck_spin_lock(&perf_counters_queue_spin);
566
567	queue_remove(perf_counters_queue, pmc, pmc_t, link);
568
569	perf_counters_count--;
570
571	lck_spin_unlock(&perf_counters_queue_spin);
572}
573
574/*
575 * pmc_enqueue adds the given, newly registered pmc to the perf_counters_queue
576 */
577static void pmc_enqueue(pmc_t pmc) {
578	lck_spin_lock(&perf_counters_queue_spin);
579
580	queue_enter(perf_counters_queue, pmc, pmc_t, link);
581
582	perf_counters_count++;
583
584	lck_spin_unlock(&perf_counters_queue_spin);
585}
586
587/*
588 * pmc_find attempts to locate a pmc_t that was registered with the given
589 * pmc_object_t pointer.  If found, it returns the pmc_t with an extra reference
590 * which must be dropped by the caller by calling pmc_deallocate().
591 */
592static pmc_t pmc_find(pmc_object_t object) {
593	assert(object);
594
595	lck_spin_lock(&perf_counters_queue_spin);
596
597	pmc_t element = NULL;
598	pmc_t found = NULL;
599
600	queue_iterate(perf_counters_queue, element, pmc_t, link) {
601		if(element->object == object) {
602			pmc_reference(element);
603			found = element;
604			break;
605		}
606	}
607
608	lck_spin_unlock(&perf_counters_queue_spin);
609
610	return found;
611}
612
613/*
614 * Config internals
615 */
616
617/* Allocate a pmc_config_t */
618static pmc_config_t pmc_config_alloc(pmc_t pmc __unused) {
619	return (pmc_config_t)zalloc(perf_small_zone);
620}
621
622/* Free a pmc_config_t, and underlying pmc_config_object_t (if needed) */
623static void pmc_config_free(pmc_t pmc, pmc_config_t config) {
624	assert(pmc);
625	assert(config);
626
627	if(config->object) {
628		pmc->methods.free_config(pmc->object, config->object);
629		config->object = NULL;
630	}
631
632	zfree(perf_small_zone, config);
633}
634
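/*
 * pmc_open and pmc_close call through to the driver-supplied open/close
 * methods for the underlying counter object.
 */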
635static kern_return_t pmc_open(pmc_t pmc) {
636	assert(pmc);
637	assert(pmc->object);
638	assert(pmc->open_object);
639
640	return pmc->methods.open(pmc->object, pmc->open_object);
641}
642
643static kern_return_t pmc_close(pmc_t pmc) {
644	assert(pmc);
645	assert(pmc->object);
646	assert(pmc->open_object);
647
648	return pmc->methods.close(pmc->object, pmc->open_object);
649}
650
651/*
652 * Reservation Internals
653 */
654
655static kern_return_t pmc_internal_reservation_set_pmc(pmc_reservation_t resv, pmc_t pmc);
656static void pmc_internal_reservation_store(pmc_reservation_t reservation);
657static void pmc_internal_reservation_load(pmc_reservation_t reservation);
658
659static pmc_reservation_t reservation_alloc(void) {
660	/* pmc reservations come from the perf small zone */
661	return (pmc_reservation_t)zalloc(perf_small_zone);
662}
663
664/*
665 * reservation_free deallocates and releases all resources associated with the
666 * given pmc_reservation_t.  This includes freeing the config used to create the
667 * reservation, decrementing the reference count for the pmc used to create the
668 * reservation, and deallocating the reservation's memory.
669 */
670static void reservation_free(pmc_reservation_t resv) {
671	/* Free config */
672	if(resv->config) {
673		assert(resv->pmc);
674
675		pmc_free_config(resv->pmc, resv->config);
676
677		resv->config = NULL;
678	}
679
680	/* release PMC */
681	(void)pmc_internal_reservation_set_pmc(resv, NULL);
682
683	/* Free reservation */
684	zfree(perf_small_zone, resv);
685}
686
687/*
688 * reservation_init initializes a newly created reservation.
689 */
690static void reservation_init(pmc_reservation_t resv) {
691	assert(resv);
692
693	resv->pmc = NULL;
694	resv->config = NULL;
695	resv->value = 0ULL;
696
697	resv->flags = 0U;
698	resv->state = PMC_STATE(PMC_STATE_STATE_STOP, 0, 0);
699	resv->active_last_context_in = 0U;
700
701	/*
702	 * Since this member is a union, we only need to set either the task
703	 * or thread to NULL.
704	 */
705	resv->task = TASK_NULL;
706}
707
708/*
709 * pmc_internal_reservation_set_pmc sets the pmc associated with the reservation object. If
710 * there was one set already, it is deallocated (reference is dropped) before
711 * the new one is set.  This method increases the reference count of the given
712 * pmc_t.
713 *
714 * NOTE: It is okay to pass NULL as the pmc_t - this will have the effect of
715 * dropping the reference on any previously set pmc, and setting the reservation
716 * to having no pmc set.
717 */
718static kern_return_t pmc_internal_reservation_set_pmc(pmc_reservation_t resv, pmc_t pmc) {
719	assert(resv);
720
721	if(resv->pmc) {
722		(void)pmc_close(resv->pmc);
723		pmc_deallocate(resv->pmc);
724		resv->pmc = NULL;
725	}
726
727	resv->pmc = pmc;
728
729	if(resv->pmc) {
730		pmc_reference(resv->pmc);
731		if(KERN_SUCCESS != pmc_open(resv->pmc)) {
732			pmc_deallocate(resv->pmc);
733			resv->pmc = NULL;
734
735			return KERN_FAILURE;
736		}
737	}
738
739	return KERN_SUCCESS;
740}
741
742/*
743 * Used to place a reservation into one of the system, task, or thread queues.
744 * Assumes the queue's spin lock is already held.
745 */
746static void pmc_internal_reservation_enqueue(queue_t queue, pmc_reservation_t resv) {
747	assert(queue);
748	assert(resv);
749
750	queue_enter(queue, resv, pmc_reservation_t, link);
751}
752
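/*
 * Used to remove a reservation from one of the system, task, or thread queues.
 * Assumes the queue's spin lock is already held.
 */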
753static void pmc_internal_reservation_dequeue(queue_t queue, pmc_reservation_t resv) {
754	assert(queue);
755	assert(resv);
756
757	queue_remove(queue, resv, pmc_reservation_t, link);
758}
759
760/* Returns TRUE if the reservation applies to the current execution context */
761static boolean_t pmc_internal_reservation_matches_context(pmc_reservation_t resv) {
762	boolean_t ret = FALSE;
763	assert(resv);
764
765	if(PMC_FLAG_IS_SYSTEM_SCOPE(resv->flags)) {
766		ret = TRUE;
767	} else if(PMC_FLAG_IS_TASK_SCOPE(resv->flags)) {
768		if(current_task() == resv->task) {
769			ret = TRUE;
770		}
771	} else if(PMC_FLAG_IS_THREAD_SCOPE(resv->flags)) {
772		if(current_thread() == resv->thread) {
773			ret = TRUE;
774		}
775	}
776
777	return ret;
778}
779
780/*
781 * pmc_accessible_core_count returns the number of logical cores that can access
782 * a given @pmc.  0 means every core in the system.
783 */
784static uint32_t pmc_accessible_core_count(pmc_t pmc) {
785	assert(pmc);
786
787	uint32_t *cores = NULL;
788	size_t coreCt = 0UL;
789
790	if(KERN_SUCCESS != pmc->methods.accessible_cores(pmc->object,
791		&cores, &coreCt)) {
792		coreCt = 0U;
793	}
794
795	return (uint32_t)coreCt;
796}
797
798/* spin lock for the queue must already be held */
799/*
800 * Returns TRUE if the given queue already holds a conflicting reservation for
801 * the same PMC: any system-scoped use, a task/thread-scoped use that overlaps
802 * the incoming scope, or any use of a PMC accessible from more than one core.
803 */
804static boolean_t pmc_internal_reservation_queue_contains_pmc(queue_t queue, pmc_reservation_t resv) {
805	assert(queue);
806	assert(resv);
807
808	boolean_t ret = FALSE;
809	pmc_reservation_t tmp = NULL;
810
811	queue_iterate(queue, tmp, pmc_reservation_t, link) {
812		if(tmp->pmc == resv->pmc) {
813			/* PMC matches - make sure scope matches first */
814			switch(PMC_FLAG_SCOPE(tmp->flags)) {
815				case PMC_FLAG_SCOPE_SYSTEM:
816					/*
817					 * Found a reservation in system queue with same pmc - always a
818					 * conflict.
819					 */
820					ret = TRUE;
821					break;
822				case PMC_FLAG_SCOPE_THREAD:
823					/*
824					 * Found one in thread queue with the same PMC as the
825					 * argument. Only a conflict if argument scope isn't
826					 * thread or system, or the threads match.
827					 */
828					ret = (PMC_FLAG_SCOPE(resv->flags) != PMC_FLAG_SCOPE_THREAD) ||
829						(tmp->thread == resv->thread);
830
831					if(!ret) {
832						/*
833						 * so far, no conflict - check that the pmc that is
834						 * being reserved isn't accessible from more than
835						 * one core, if it is, we need to say it's already
836						 * taken.
837						 */
838						if(1 != pmc_accessible_core_count(tmp->pmc)) {
839							ret = TRUE;
840						}
841					}
842					break;
843				case PMC_FLAG_SCOPE_TASK:
844					/*
845					 * Follow similar semantics for task scope.
846					 */
847
848					ret = (PMC_FLAG_SCOPE(resv->flags) != PMC_FLAG_SCOPE_TASK) ||
849						(tmp->task == resv->task);
850					if(!ret) {
851						/*
852						 * so far, no conflict - check that the pmc that is
853						 * being reserved isn't accessible from more than
854						 * one core, if it is, we need to say it's already
855						 * taken.
856						 */
857						if(1 != pmc_accessible_core_count(tmp->pmc)) {
858							ret = TRUE;
859						}
860					}
861
862					break;
863			}
864
865			if(ret) break;
866		}
867	}
868
869	return ret;
870}
871
872/*
873 * pmc_internal_reservation_validate_for_pmc returns TRUE if the given reservation can be
874 * added to its target queue without creating conflicts (target queue is
875 * determined by the reservation's scope flags). Further, this method returns
876 * FALSE if any level contains a reservation for a PMC that can be accessed from
877 * more than just 1 core, and the given reservation also wants the same PMC.
878 */
879static boolean_t pmc_internal_reservation_validate_for_pmc(pmc_reservation_t resv) {
880	assert(resv);
881	boolean_t ret = TRUE;
882
883	if(pmc_internal_reservation_queue_contains_pmc(system_reservations, resv) ||
884		pmc_internal_reservation_queue_contains_pmc(task_reservations, resv) ||
885		pmc_internal_reservation_queue_contains_pmc(thread_reservations, resv)) {
886		ret = FALSE;
887	}
888
889	return ret;
890}
891
892static void pmc_internal_update_thread_flag(thread_t thread, boolean_t newFlag) {
893	assert(thread);
894
895	/* See if this thread needs its PMC flag set */
896	pmc_reservation_t tmp = NULL;
897
898	if(!newFlag) {
899		/*
900		 * If the parent task just dropped its reservation, iterate the thread
901		 * reservations to see if we need to keep the pmc flag set for the given
902		 * thread or not.
903		 */
904		lck_spin_lock(&reservations_spin);
905
906		queue_iterate(thread_reservations, tmp, pmc_reservation_t, link) {
907			if(tmp->thread == thread) {
908				newFlag = TRUE;
909				break;
910			}
911		}
912
913		lck_spin_unlock(&reservations_spin);
914	}
915
916	if(newFlag) {
917		OSBitOrAtomic(THREAD_PMC_FLAG, &thread->t_chud);
918	} else {
919		OSBitAndAtomic(~(THREAD_PMC_FLAG), &thread->t_chud);
920	}
921}
922
923/*
924 * This operation is (worst case) O(N*M) where N is the number of threads in the
925 * given task, and M is the number of thread reservations in our system.
926 */
927static void pmc_internal_update_task_flag(task_t task, boolean_t newFlag) {
928	assert(task);
929	thread_t thread = NULL;
930
931	if(newFlag) {
932		OSBitOrAtomic(TASK_PMC_FLAG, &task->t_chud);
933	} else {
934		OSBitAndAtomic(~(TASK_PMC_FLAG), &task->t_chud);
935	}
936
937	task_lock(task);
938
939	queue_iterate(&task->threads, thread, thread_t, task_threads) {
940		/* propagate the task's mask down to each thread  */
941		pmc_internal_update_thread_flag(thread, newFlag);
942	}
943
944	task_unlock(task);
945}
946
947/*
948 * pmc_internal_reservation_add adds a reservation to the global tracking queues after
949 * ensuring there are no reservation conflicts.  To do this, it holds the
950 * reservations spin lock across both the conflict check and the insert (so no
951 * other core can add a reservation for the same pmc to a queue that has already been checked).
952 */
953static boolean_t pmc_internal_reservation_add(pmc_reservation_t resv) {
954	assert(resv);
955
956	boolean_t ret = FALSE;
957
958	/* a single spin lock guards all three reservation queues */
959	lck_spin_lock(&reservations_spin);
960
961	/* Check if the reservation can be added without conflicts */
962	if(pmc_internal_reservation_validate_for_pmc(resv)) {
963
964		/* add reservation to appropriate scope */
965		switch(PMC_FLAG_SCOPE(resv->flags)) {
966		case PMC_FLAG_SCOPE_SYSTEM:
967			/* Simply add it to the system queue */
968			pmc_internal_reservation_enqueue(system_reservations, resv);
969			system_reservation_count++;
970
971			lck_spin_unlock(&reservations_spin);
972
973			break;
974
975		case PMC_FLAG_SCOPE_TASK:
976			assert(resv->task);
977
978			/* Not only do we enqueue it in our local queue for tracking */
979			pmc_internal_reservation_enqueue(task_reservations, resv);
980			task_reservation_count++;
981
982			lck_spin_unlock(&reservations_spin);
983
984			/* update the task mask, and propagate it to existing threads */
985			pmc_internal_update_task_flag(resv->task, TRUE);
986			break;
987
988		/* Thread-switched counter */
989		case PMC_FLAG_SCOPE_THREAD:
990			assert(resv->thread);
991
992			/*
993			 * Works the same as a task-switched counter, only at
994			 * thread-scope
995			 */
996
997			pmc_internal_reservation_enqueue(thread_reservations, resv);
998			thread_reservation_count++;
999
1000			lck_spin_unlock(&reservations_spin);
1001
1002			pmc_internal_update_thread_flag(resv->thread, TRUE);
1003			break;
1004		}
1005
1006		ret = TRUE;
1007	} else {
1008		lck_spin_unlock(&reservations_spin);
1009	}
1010
1011	return ret;
1012}
1013
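/*
 * pmc_internal_reservation_broadcast invokes action_func(reservation), with
 * interrupts disabled, on every core that can access the reservation's PMC,
 * using an inter-processor call when the PMC is visible beyond the current
 * core.
 */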
1014static void pmc_internal_reservation_broadcast(pmc_reservation_t reservation, void (*action_func)(void *)) {
1015	uint32_t * cores;
1016	size_t core_cnt;
1017
1018	/* Get the list of accessible cores */
1019	if (KERN_SUCCESS == pmc_get_accessible_core_list(reservation->pmc, &cores, &core_cnt)) {
1020		boolean_t intrs_enabled = ml_set_interrupts_enabled(FALSE);
1021
1022		/* Fast case: the PMC is only accessible from one core and we happen to be on it */
1023		if (core_cnt == 1 && cores[0] == (uint32_t)cpu_number()) {
1024			action_func(reservation);
1025		} else {
1026			/* Call action_func on every accessible core */
1027#if defined(__i386__) || defined(__x86_64__)
1028			size_t ii;
1029			cpumask_t mask = 0;
1030
1031			/* Build a mask for the accessible cores */
1032			if (core_cnt > 0) {
1033				for (ii = 0; ii < core_cnt; ii++) {
1034					mask |= cpu_to_cpumask(cores[ii]);
1035				}
1036			} else {
1037				/* core_cnt = 0 really means all cpus */
1038				mask = CPUMASK_ALL;
1039			}
1040			mp_cpus_call(mask, ASYNC, action_func, reservation);
1041#elif defined(__arm__)
1042    panic("Please implement me: pmc_internal_reservation_broadcast");
1043#else
1044#error pmc_reservation_interrupt needs an inter-processor method invocation mechanism for this architecture
1045#endif
1046		}
1047
1048		ml_set_interrupts_enabled(intrs_enabled);
1049	}
1050
1051}
1052
1053/*
1054 * pmc_internal_reservation_remove removes the given reservation from the appropriate
1055 * reservation queue according to its scope.
1056 *
1057 * NOTE: The scope flag must have been set for this method to function.
1058 */
1059static void pmc_internal_reservation_remove(pmc_reservation_t resv) {
1060	assert(resv);
1061
1062	/*
1063	 * Due to the way the macros are written, we can't just blindly queue-remove
1064	 * the reservation without knowing which queue it's in. We figure this out
1065	 * using the reservation's scope flags.
1066	 */
1067
1068	/* Lock the global spin lock */
1069	lck_spin_lock(&reservations_spin);
1070
1071	switch(PMC_FLAG_SCOPE(resv->flags)) {
1072
1073		case PMC_FLAG_SCOPE_SYSTEM:
1074			pmc_internal_reservation_dequeue(system_reservations, resv);
1075			system_reservation_count--;
1076
1077			lck_spin_unlock(&reservations_spin);
1078
1079			break;
1080
1081		case PMC_FLAG_SCOPE_TASK:
1082			/* remove from the global queue */
1083			pmc_internal_reservation_dequeue(task_reservations, resv);
1084			task_reservation_count--;
1085
1086			/* unlock the global */
1087			lck_spin_unlock(&reservations_spin);
1088
1089			/* Recalculate task's counter mask */
1090			pmc_internal_update_task_flag(resv->task, FALSE);
1091
1092			break;
1093
1094		case PMC_FLAG_SCOPE_THREAD:
1095			pmc_internal_reservation_dequeue(thread_reservations, resv);
1096			thread_reservation_count--;
1097
1098			lck_spin_unlock(&reservations_spin);
1099
1100			/* recalculate the thread's counter mask */
1101			pmc_internal_update_thread_flag(resv->thread, FALSE);
1102
1103			break;
1104	}
1105}
1106
1107/* Reservation State Machine
1108 *
1109 * The PMC subsystem uses a 3-tuple of state information packed into a 32-bit quantity and a
1110 * set of 9 events to provide MP-safe bookkeeping and control flow.  The 3-tuple is comprised
1111 * of a state, a count of active contexts, and a set of modifier flags.  A state machine defines
1112 * the possible transitions at each event point given the current 3-tuple.  Atomicity is handled
1113 * by reading the current 3-tuple, applying the transformations indicated by the state machine
1114 * and then attempting to OSCompareAndSwap the transformed value.  If the OSCompareAndSwap fails,
1115 * the process is repeated until either the OSCompareAndSwap succeeds or no valid transitions are
1116 * available.
1117 *
1118 * The state machine is described using tuple notation for the current state and a related notation
1119 * for describing the transformations.  For conciseness, the flag and state names are abbreviated as
1120 * follows:
1121 *
1122 * states:
1123 * S = STOP
1124 * CR = CAN_RUN
1125 * L = LOAD
1126 * R = RUN
1127 * ST = STORE
1128 * I = INTERRUPT
1129 * D = DEALLOC
1130 *
1131 * flags:
1132 *
1133 * S = STOPPING
1134 * D = DEALLOCING
1135 * I = INTERRUPTING
1136 *
1137 * The tuple notation is formed from the following pattern:
1138 *
1139 * tuple = < state, active-context-count, flags >
1140 * state = S | CR | L | R | ST | I | D
1141 * active-context-count = 0 | >0 | 1 | >1
1142 * flags = flags flag | blank
1143 * flag = S | D | I
1144 *
1145 * The transform notation is similar, but only describes the modifications made to the current state.
1146 * The notation is formed from the following pattern:
1147 *
1148 * transform = < state, active-context-count, flags >
1149 * state = S | CR | L | R | ST | I | D
1150 * active-context-count = + | - | blank
1151 * flags = flags flag | flags !flag | blank
1152 * flag = S | D | I
1153 *
1154 * And now for the state machine:
1155 * State		Start		Stop		Free		Interrupt		End Interrupt		Context In		Context Out	Load Finished		Store Finished
1156 * <CR, 0, >				<S, , >		<D, , >			<L, +, >
1157 * <D, 0, >
1158 * <D, 1, D>									< , -, !D>
1159 * <D, >1, D>									< , -, >
1160 * <I, 0, D>									<D, , !D>
1161 * <I, 0, S>	< , , !S>				< , , !SD>		<S, , !S>
1162 * <I, 0, >					< , , S>	< , , D>	<CR, , >
1163 * <L, 1, D>									<ST, -, >
1164 * <L, 1, ID>									<ST, -, >
1165 * <L, 1, IS>							< , , !SD>	<ST, -, >
1166 * <L, 1, S>	< , , !S>				< , , !SD>		<ST, -, >
1167 * <L, 1, >					< , , S>	< , , D>	< , , IS>							< , +, >	<R, , >
1168 * <L, >1, D>									< , -, >		<R, -, >
1169 * <L, >1, ID>									< , -, >		<R, -, >
1170 * <L, >1, IS>							< , , !SD>	< , -, >		<R, -, >
1171 * <L, >1, S>	< , , !S>				< , , !SD>		< , -, >		<R, -, >
1172 * <L, >1, >				< , , S>	< , , D>	< , , IS>							< , +, >		< , -, >		<R, , >
1173 * <R, 1, D>									<ST, -, >
1174 * <R, 1, ID>									<ST, -, >
1175 * <R, 1, IS>							< , , !SD>	<ST, -, >
1176 * <R, 1, S>	< , , !S>				< , , !SD>		<ST, -, >
1177 * <R, 1, >					< , , S>	< , , D>	< , , IS>							< , +, >	<ST, -, >
1178 * <R, >1, D>									< , -, >
1179 * <R, >1, ID>									< , -, >
1180 * <R, >1, IS>							< , , !SD>	< , -, >
1181 * <R, >1, S>	< , , !S>				< , , !SD>		< , -, >
1182 * <R, >1, >				< , , S>	< , , D>	< , , IS>							< , +, >		< , -, >
1183 * <S, 0, >		<CR, , >				<D, , >
1184 * <S, 1, ID>									<I, -, !I>
1185 * <S, 1, IS>							< , , !SD>	<I, -, !I>
1186 * <S, 1, S>	< , , !S>				<D, , !SD>		< , -, !S>
1187 * <S, 1, >					< , , S>	<D, , D>	<L, +, >		<CR, -, >
1188 * <S, >1, ID>									< , -, >
1189 * <S, >1, IS>							< , , !SD>	< , -, >
1190 * <S, >1, S>	< , , !S>				<D, , !SD>		< , -, >
1191 * <S, >1, >				< , , S>	<D, , D>		<L, +, >		< , -, >
1192 * <ST, 0, D>									<D, , !D>
1193 * <ST, 0, ID>									<I, , !I>
1194 * <ST, 0, IS>							< , , !SD>	<I, , !I>
1195 * <ST, 0, S>	< , , !S>				< , , !SD>		<S, , !S>
1196 * <ST, 0, >				< , , S>	< , , D>	< , , IS>							< , +, >		<CR, , >
1197 * <ST, >0, D>									< , -, >							<D, , >
1198 * <ST, >0, ID>								< , -, >							<S, , >
1199 * <ST, >0, IS>							< , , !SD>										< , -, >			<S, , >
1200 * <ST, >0, S>	< , , !S>				< , , !SD>		< , -, >							<S, , >
1201 * <ST, >0, >				< , , S>	< , , D>	< , , IS>							< , +, >		< , -, >			<L, , >
1202 */
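/*
 * Worked example (this follows directly from the table above and the
 * transition code below): a reservation sitting in <CR, 0, > that receives a
 * Context In event moves to <L, +, >, i.e. LOAD with the active-context count
 * bumped to 1; when the hardware programming completes, Load Finished moves
 * <L, 1, > to <R, , >, and a later Context Out of the last active context
 * moves <R, 1, > to <ST, -, > so the counter value can be saved.
 */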
1203
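/*
 * pmc_internal_reservation_next_state implements the table above as a pure
 * function: given the current packed state and an event, it returns the new
 * packed state, or an INVALID state when the table defines no transition for
 * that combination.
 */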
1204static uint32_t pmc_internal_reservation_next_state(uint32_t current_state, pmc_state_event_t event) {
1205	uint32_t new_state = PMC_STATE(PMC_STATE_STATE_INVALID, 0, 0);
1206
1207	switch (event) {
1208		case PMC_STATE_EVENT_START:
1209			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1210				case PMC_STATE(PMC_STATE_STATE_INTERRUPT, 0, PMC_STATE_FLAGS_STOPPING):
1211				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_STOPPING):
1212				case PMC_STATE(PMC_STATE_STATE_RUN, 0, PMC_STATE_FLAGS_STOPPING):
1213				case PMC_STATE(PMC_STATE_STATE_STOP, 0, PMC_STATE_FLAGS_STOPPING):
1214				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_STOPPING):
1215					new_state = PMC_STATE_MODIFY(current_state, 0, 0, PMC_STATE_FLAGS_STOPPING);
1216					break;
1217				case PMC_STATE(PMC_STATE_STATE_STOP, 0, 0):
1218					if (PMC_STATE_CONTEXT_COUNT(current_state) == 0) {
1219						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_CAN_RUN, 0, 0, 0);
1220					}
1221					break;
1222			}
1223			break;
1224		case PMC_STATE_EVENT_STOP:
1225			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1226				case PMC_STATE(PMC_STATE_STATE_CAN_RUN, 0, 0):
1227					new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_STOP, 0, 0, 0);
1228					break;
1229				case PMC_STATE(PMC_STATE_STATE_INTERRUPT, 0, 0):
1230				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, 0):
1231				case PMC_STATE(PMC_STATE_STATE_RUN, 0, 0):
1232				case PMC_STATE(PMC_STATE_STATE_STORE, 0, 0):
1233					new_state = PMC_STATE_MODIFY(current_state, 0, PMC_STATE_FLAGS_STOPPING, 0);
1234					break;
1235				case PMC_STATE(PMC_STATE_STATE_STOP, 0, 0):
1236					if (PMC_STATE_CONTEXT_COUNT(current_state) > 0) {
1237						new_state = PMC_STATE_MODIFY(current_state, 0, PMC_STATE_FLAGS_STOPPING, 0);
1238					}
1239					break;
1240			}
1241			break;
1242		case PMC_STATE_EVENT_FREE:
1243			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1244				case PMC_STATE(PMC_STATE_STATE_CAN_RUN, 0, 0):
1245					new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_DEALLOC, 0, 0, 0);
1246					break;
1247				case PMC_STATE(PMC_STATE_STATE_INTERRUPT, 0, PMC_STATE_FLAGS_STOPPING):
1248				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1249				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_STOPPING):
1250				case PMC_STATE(PMC_STATE_STATE_RUN, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1251				case PMC_STATE(PMC_STATE_STATE_RUN, 0, PMC_STATE_FLAGS_STOPPING):
1252				case PMC_STATE(PMC_STATE_STATE_STOP, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1253				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1254				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_STOPPING):
1255					new_state = PMC_STATE_MODIFY(current_state, 0, PMC_STATE_FLAGS_DEALLOCING, PMC_STATE_FLAGS_STOPPING);
1256					break;
1257				case PMC_STATE(PMC_STATE_STATE_INTERRUPT, 0, 0):
1258				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, 0):
1259				case PMC_STATE(PMC_STATE_STATE_RUN, 0, 0):
1260				case PMC_STATE(PMC_STATE_STATE_STORE, 0, 0):
1261					new_state = PMC_STATE_MODIFY(current_state, 0, PMC_STATE_FLAGS_DEALLOCING, 0);
1262					break;
1263				case PMC_STATE(PMC_STATE_STATE_STOP, 0, PMC_STATE_FLAGS_STOPPING):
1264					new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_DEALLOC, 0, PMC_STATE_FLAGS_DEALLOCING, PMC_STATE_FLAGS_STOPPING);
1265					break;
1266				case PMC_STATE(PMC_STATE_STATE_STOP, 0, 0):
1267					if (PMC_STATE_CONTEXT_COUNT(current_state) > 0) {
1268						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_DEALLOC, 0, PMC_STATE_FLAGS_DEALLOCING, 0);
1269					} else {
1270						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_DEALLOC, 0, 0, 0);
1271					}
1272					break;
1273			}
1274			break;
1275		case PMC_STATE_EVENT_INTERRUPT:
1276			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1277				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, 0):
1278				case PMC_STATE(PMC_STATE_STATE_RUN, 0, 0):
1279				case PMC_STATE(PMC_STATE_STATE_STORE, 0, 0):
1280					new_state = PMC_STATE_MODIFY(current_state, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING, 0);
1281					break;
1282			}
1283			break;
1284		case PMC_STATE_EVENT_END_OF_INTERRUPT:
1285			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1286				case PMC_STATE(PMC_STATE_STATE_INTERRUPT, 0, PMC_STATE_FLAGS_DEALLOCING):
1287					new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_DEALLOC, 0, 0, PMC_STATE_FLAGS_DEALLOCING);
1288					break;
1289				case PMC_STATE(PMC_STATE_STATE_INTERRUPT, 0, PMC_STATE_FLAGS_STOPPING):
1290					new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_STOP, 0, 0, PMC_STATE_FLAGS_STOPPING);
1291					break;
1292				case PMC_STATE(PMC_STATE_STATE_INTERRUPT, 0, 0):
1293					new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_CAN_RUN, 0, 0, 0);
1294					break;
1295			}
1296			break;
1297		case PMC_STATE_EVENT_CONTEXT_IN:
1298			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1299				case PMC_STATE(PMC_STATE_STATE_CAN_RUN, 0, 0):
1300					new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_LOAD, 1, 0, 0);
1301					break;
1302				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, 0):
1303				case PMC_STATE(PMC_STATE_STATE_RUN, 0, 0):
1304				case PMC_STATE(PMC_STATE_STATE_STORE, 0, 0):
1305					new_state = PMC_STATE_MODIFY(current_state, 1, 0, 0);
1306					break;
1307				case PMC_STATE(PMC_STATE_STATE_STOP, 0, 0):
1308					if (PMC_STATE_CONTEXT_COUNT(current_state) > 0) {
1309						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_LOAD, 1, 0, 0);
1310					}
1311					break;
1312			}
1313			break;
1314		case PMC_STATE_EVENT_CONTEXT_OUT:
1315			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1316				case PMC_STATE(PMC_STATE_STATE_DEALLOC, 0, PMC_STATE_FLAGS_DEALLOCING):
1317					if (PMC_STATE_CONTEXT_COUNT(current_state) > 1) {
1318						new_state = PMC_STATE_MODIFY(current_state, -1, 0, PMC_STATE_FLAGS_DEALLOCING);
1319					} else {
1320						new_state = PMC_STATE_MODIFY(current_state, -1, 0, 0);
1321					}
1322					break;
1323				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_DEALLOCING):
1324				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_DEALLOCING):
1325				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1326				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_STOPPING):
1327				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, 0):
1328					if (PMC_STATE_CONTEXT_COUNT(current_state) > 1) {
1329						new_state = PMC_STATE_MODIFY(current_state, -1, 0, 0);
1330					}
1331					break;
1332				case PMC_STATE(PMC_STATE_STATE_RUN, 0, PMC_STATE_FLAGS_DEALLOCING):
1333				case PMC_STATE(PMC_STATE_STATE_RUN, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_DEALLOCING):
1334				case PMC_STATE(PMC_STATE_STATE_RUN, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1335				case PMC_STATE(PMC_STATE_STATE_RUN, 0, PMC_STATE_FLAGS_STOPPING):
1336				case PMC_STATE(PMC_STATE_STATE_RUN, 0, 0):
1337					if (PMC_STATE_CONTEXT_COUNT(current_state) == 1) {
1338						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_STORE, -1, 0, 0);
1339					} else {
1340						new_state = PMC_STATE_MODIFY(current_state, -1, 0, 0);
1341					}
1342					break;
1343				case PMC_STATE(PMC_STATE_STATE_STOP, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_DEALLOCING):
1344				case PMC_STATE(PMC_STATE_STATE_STOP, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1345					if (PMC_STATE_CONTEXT_COUNT(current_state) == 1) {
1346						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_INTERRUPT, -1, 0, PMC_STATE_FLAGS_INTERRUPTING);
1347					} else {
1348						new_state = PMC_STATE_MODIFY(current_state, -1, 0, 0);
1349					}
1350					break;
1351				case PMC_STATE(PMC_STATE_STATE_STOP, 0, PMC_STATE_FLAGS_STOPPING):
1352					if (PMC_STATE_CONTEXT_COUNT(current_state) == 1) {
1353						new_state = PMC_STATE_MODIFY(current_state, -1, 0, PMC_STATE_FLAGS_STOPPING);
1354					} else {
1355						new_state = PMC_STATE_MODIFY(current_state, -1, 0, 0);
1356					}
1357					break;
1358				case PMC_STATE(PMC_STATE_STATE_STOP, 0, 0):
1359					if (PMC_STATE_CONTEXT_COUNT(current_state) > 0) {
1360						if (PMC_STATE_CONTEXT_COUNT(current_state) == 1) {
1361							new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_CAN_RUN, -1, 0, 0);
1362						} else {
1363							new_state = PMC_STATE_MODIFY(current_state, -1, 0, 0);
1364						}
1365					}
1366					break;
1367				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_DEALLOCING):
1368				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_DEALLOCING):
1369				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1370				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_STOPPING):
1371				case PMC_STATE(PMC_STATE_STATE_STORE, 0, 0):
1372					if (PMC_STATE_CONTEXT_COUNT(current_state) > 0) {
1373						new_state = PMC_STATE_MODIFY(current_state, -1, 0, 0);
1374					}
1375					break;
1376			}
1377			break;
1378		case PMC_STATE_EVENT_LOAD_FINISHED:
1379			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1380				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_DEALLOCING):
1381				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_DEALLOCING):
1382				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1383				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, PMC_STATE_FLAGS_STOPPING):
1384					if (PMC_STATE_CONTEXT_COUNT(current_state) > 1) {
1385						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_RUN, -1, 0, 0);
1386					} else {
1387						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_STORE, -1, 0, 0);
1388					}
1389					break;
1390				case PMC_STATE(PMC_STATE_STATE_LOAD, 0, 0):
1391					new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_RUN, 0, 0, 0);
1392					break;
1393			}
1394			break;
1395		case PMC_STATE_EVENT_STORE_FINISHED:
1396			switch (current_state & ~(PMC_STATE_CONTEXT_COUNT_MASK)) {
1397				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_DEALLOCING):
1398					if (PMC_STATE_CONTEXT_COUNT(current_state) == 0) {
1399						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_DEALLOC, 0, 0, PMC_STATE_FLAGS_DEALLOCING);
1400					} else {
1401						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_DEALLOC, 0, 0, 0);
1402					}
1403					break;
1404				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_DEALLOCING):
1405				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_INTERRUPTING | PMC_STATE_FLAGS_STOPPING):
1406					if (PMC_STATE_CONTEXT_COUNT(current_state) == 0) {
1407						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_INTERRUPT, 0, 0, PMC_STATE_FLAGS_INTERRUPTING);
1408					} else {
1409						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_STOP, 0, 0, 0);
1410					}
1411					break;
1412				case PMC_STATE(PMC_STATE_STATE_STORE, 0, PMC_STATE_FLAGS_STOPPING):
1413					if (PMC_STATE_CONTEXT_COUNT(current_state) == 0) {
1414						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_STOP, 0, 0, PMC_STATE_FLAGS_STOPPING);
1415					} else {
1416						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_STOP, 0, 0, 0);
1417					}
1418					break;
1419				case PMC_STATE(PMC_STATE_STATE_STORE, 0, 0):
1420					if (PMC_STATE_CONTEXT_COUNT(current_state) == 0) {
1421						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_CAN_RUN, 0, 0, 0);
1422					} else {
1423						new_state = PMC_STATE_MOVE(current_state, PMC_STATE_STATE_LOAD, 0, 0, 0);
1424					}
1425					break;
1426			}
1427			break;
1428	}
1429
1430	return new_state;
1431}
1432
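/*
 * pmc_internal_reservation_move_for_event applies one event to a reservation's
 * state machine atomically: it computes the next state and publishes it with
 * OSCompareAndSwap, retrying if another cpu raced us.  It returns the new state
 * (PMC_STATE_INVALID if no transition exists) and, if old_state_out is
 * non-NULL, reports the state we transitioned from.
 */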
1433static uint32_t pmc_internal_reservation_move_for_event(pmc_reservation_t reservation, pmc_state_event_t event, pmc_state_t *old_state_out) {
1434	pmc_state_t oldState;
1435	pmc_state_t newState;
1436
1437	assert(reservation);
1438
1439	/* Determine what state change, if any, we need to do.  Keep trying until either we succeed in making a transition
1440	 * or there is no valid move.
1441	 */
1442	do {
1443		oldState = reservation->state;
1444		newState = pmc_internal_reservation_next_state(oldState, event);
1445	} while (newState != PMC_STATE_INVALID && !OSCompareAndSwap(oldState, newState, &(reservation->state)));
1446
1447	if (newState != PMC_STATE_INVALID) {
1448		COUNTER_DEBUG("Moved reservation %p from state "PMC_STATE_FORMAT" to state "PMC_STATE_FORMAT" for event %s\n", reservation, PMC_STATE_ARGS(oldState), PMC_STATE_ARGS(newState), pmc_state_event_name(event));
1449	} else {
1450		COUNTER_DEBUG("No valid moves for reservation %p in state "PMC_STATE_FORMAT" for event %s\n", reservation, PMC_STATE_ARGS(oldState), pmc_state_event_name(event));
1451	}
1452
1453	if (old_state_out != NULL) {
1454		*old_state_out = oldState;
1455	}
1456
1457	return newState;
1458}
1459
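/*
 * pmc_internal_reservation_context_out runs when a matching context (system,
 * task or thread scope) is switched off this cpu: it drives the CONTEXT_OUT
 * event and, if that moved the reservation into STORE, saves the counter.
 */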
1460static void pmc_internal_reservation_context_out(pmc_reservation_t reservation) {
1461	assert(reservation);
1462	pmc_state_t newState;
1463	pmc_state_t oldState;
1464
1465	/* Clear that this reservation was active when this cpu did its last context in */
1466	OSBitAndAtomic(~(1U << cpu_number()), &(reservation->active_last_context_in));
1467
1468	/* Move the state machine */
1469	if (PMC_STATE_INVALID == (newState = pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_CONTEXT_OUT, &oldState))) {
1470		return;
1471	}
1472
1473	/* Do any actions required based on the state change */
1474	if (PMC_STATE_STATE(newState) == PMC_STATE_STATE_STORE && PMC_STATE_STATE(oldState) != PMC_STATE_STATE_STORE) {
1475		/* Just moved into STORE, so store the reservation. */
1476		pmc_internal_reservation_store(reservation);
1477	} else if (PMC_STATE_STATE(newState) == PMC_STATE_STATE_DEALLOC && PMC_STATE_CONTEXT_COUNT(newState) == 0 && PMC_STATE_FLAGS(newState) == 0) {
1478		/* Wakeup any thread blocking for this reservation to hit <DEALLOC, 0, > */
1479		thread_wakeup((event_t)reservation);
1480	}
1481
1482}
1483
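/*
 * pmc_internal_reservation_context_in runs when a matching context is switched
 * onto this cpu: it drives the CONTEXT_IN event and, if that moved the
 * reservation into LOAD, programs the counter onto the hardware.
 */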
1484static void pmc_internal_reservation_context_in(pmc_reservation_t reservation) {
1485	assert(reservation);
1486	pmc_state_t oldState;
1487	pmc_state_t newState;
1488
1489	/* Move the state machine */
1490	if (PMC_STATE_INVALID == (newState = pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_CONTEXT_IN, &oldState))) {
1491		return;
1492	}
1493
1494	/* Mark that the reservation was active when this cpu did its last context in */
1495	OSBitOrAtomic(1U << cpu_number(), &(reservation->active_last_context_in));
1496
1497	/* Do any actions required based on the state change */
1498	if (PMC_STATE_STATE(newState) == PMC_STATE_STATE_LOAD && PMC_STATE_STATE(oldState) != PMC_STATE_STATE_LOAD) {
1499		/* Just moved into LOAD, so load the reservation. */
1500		pmc_internal_reservation_load(reservation);
1501	}
1502
1503}
1504
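/*
 * pmc_internal_reservation_store disables the counter (at both the perf
 * monitor and counter level), reads the current count back into
 * reservation->value, and then drives the STORE_FINISHED event, which may
 * trigger a reload or wake a thread waiting for <DEALLOC, 0, >.
 */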
1505static void pmc_internal_reservation_store(pmc_reservation_t reservation) {
1506	assert(reservation);
1507	assert(PMC_STATE_STATE(reservation->state) == PMC_STATE_STATE_STORE);
1508
1509	assert(reservation->pmc);
1510	assert(reservation->config);
1511
1512	pmc_state_t newState;
1513	kern_return_t ret = KERN_SUCCESS;
1514
1515	pmc_t store_pmc = reservation->pmc;
1516	pmc_object_t store_pmc_obj = store_pmc->object;
1517	perf_monitor_t store_pm = store_pmc->monitor;
1518
1519	/*
1520	 * Instruct the Perf Monitor that contains this counter to disable
1521	 * this counter at the monitor (global) level.
1522	 */
1523	ret = store_pm->methods.disable_counters(store_pm->object, &store_pmc_obj, 1);
1524	if(KERN_SUCCESS != ret) {
1525		COUNTER_DEBUG(" [error] disable_counters: 0x%x\n", ret);
1526		return;
1527	}
1528
1529	/* Instruct the counter to disable itself */
1530	ret = store_pmc->methods.disable(store_pmc_obj);
1531	if(KERN_SUCCESS != ret) {
1532		COUNTER_DEBUG("  [error] disable: 0x%x\n", ret);
1533	}
1534
1535	/* store the counter value into the reservation's stored count */
1536	ret = store_pmc->methods.get_count(store_pmc_obj, &reservation->value);
1537	if(KERN_SUCCESS != ret) {
1538		COUNTER_DEBUG("  [error] get_count: 0x%x\n", ret);
1539		return;
1540	}
1541
1542	/* Advance the state machine now that the STORE is finished */
1543	if (PMC_STATE_INVALID == (newState = pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_STORE_FINISHED, NULL))) {
1544		return;
1545	}
1546
1547	/* Do any actions required based on the state change */
1548	if (PMC_STATE_STATE(newState) == PMC_STATE_STATE_LOAD) {
1549		/* Just moved into LOAD, so load the reservation. */
1550		pmc_internal_reservation_load(reservation);
1551	} else if (PMC_STATE_STATE(newState) == PMC_STATE_STATE_DEALLOC && PMC_STATE_CONTEXT_COUNT(newState) == 0 && PMC_STATE_FLAGS(newState) == 0) {
1552		/* Wakeup any thread blocking for this reservation to hit <DEALLOC, 0, > */
1553		thread_wakeup((event_t)reservation);
1554	}
1555
1556}
1557
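/*
 * pmc_internal_reservation_load programs the saved config and count into the
 * counter, enables it (at both the counter and perf monitor level), and then
 * drives the LOAD_FINISHED event, which may trigger an immediate store.
 */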
1558static void pmc_internal_reservation_load(pmc_reservation_t reservation) {
1559	assert(reservation);
1560	assert(PMC_STATE_STATE(reservation->state) == PMC_STATE_STATE_LOAD);
1561
1562	pmc_state_t newState;
1563	kern_return_t ret = KERN_SUCCESS;
1564
1565	assert(reservation->pmc);
1566	assert(reservation->config);
1567
1568	pmc_t load_pmc = reservation->pmc;
1569	pmc_object_t load_pmc_obj = load_pmc->object;
1570	perf_monitor_t load_pm = load_pmc->monitor;
1571
1572	/* Set the control register up with the stored configuration */
1573	ret = load_pmc->methods.set_config(load_pmc_obj, reservation->config->object);
1574	if(KERN_SUCCESS != ret) {
1575		COUNTER_DEBUG("  [error] set_config: 0x%x\n", ret);
1576		return;
1577	}
1578
1579	/* load the counter value */
1580	ret = load_pmc->methods.set_count(load_pmc_obj, reservation->value);
1581	if(KERN_SUCCESS != ret) {
1582		COUNTER_DEBUG("  [error] set_count: 0x%x\n", ret);
1583		return;
1584	}
1585
1586	/* Locally enable the counter */
1587	ret = load_pmc->methods.enable(load_pmc_obj);
1588	if(KERN_SUCCESS != ret) {
1589		COUNTER_DEBUG("  [error] enable: 0x%x\n", ret);
1590		return;
1591	}
1592
1593	/*
1594	 * Instruct the Perf Monitor containing the pmc to enable the
1595	 * counter.
1596	 */
1597	ret = load_pm->methods.enable_counters(load_pm->object, &load_pmc_obj, 1);
1598	if(KERN_SUCCESS != ret) {
1599		COUNTER_DEBUG("  [error] enable_counters: 0x%x\n", ret);
1600		/* the counter is not counting on the hardware; bail out. */
1601		return;
1602	}
1603
1604	/* Advance the state machine now that the LOAD is finished */
1605	if (PMC_STATE_INVALID == (newState = pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_LOAD_FINISHED, NULL))) {
1606		return;
1607	}
1608
1609	/* Do any actions required based on the state change */
1610	if (PMC_STATE_STATE(newState) == PMC_STATE_STATE_STORE) {
1611		/* Just moved into STORE, so store the reservation. */
1612		pmc_internal_reservation_store(reservation);
1613	}
1614
1615}
1616
1617/*
1618 * pmc_accessible_from_core will return TRUE if the given @pmc is directly
1619 * (i.e., in hardware) readable from the given logical core.
1620 *
1621 * NOTE: This method is interrupt safe.
1622 */
1623static inline boolean_t pmc_accessible_from_core(pmc_t pmc, uint32_t logicalCore) {
1624	boolean_t ret = FALSE;
1625
1626	assert(pmc);
1627
1628	ret = pmc->methods.accessible_from_core(pmc->object, logicalCore);
1629
1630	return ret;
1631}
1632
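/*
 * pmc_internal_reservation_start_cpu runs on each CPU (via
 * pmc_internal_reservation_broadcast) and, if the reservation matches the
 * context currently running on this CPU, atomically claims this CPU's bit in
 * active_last_context_in and performs the context-in, so counting begins
 * without waiting for the next context switch.
 */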
1633static void pmc_internal_reservation_start_cpu(void * arg) {
1634	pmc_reservation_t reservation = (pmc_reservation_t)arg;
1635
1636	assert(reservation);
1637
1638
1639	if (pmc_internal_reservation_matches_context(reservation)) {
1640		/* We are in context, but the reservation may have already had the context_in method run.  Attempt
1641		 * to set this cpu's bit in the active_last_context_in mask.  If we set it, call context_in.
1642		 */
1643		uint32_t oldMask = OSBitOrAtomic(1U << cpu_number(), &(reservation->active_last_context_in));
1644
1645		if ((oldMask & (1U << cpu_number())) == 0) {
1646			COUNTER_DEBUG("Starting already in-context reservation %p for cpu %d\n", reservation, cpu_number());
1647
1648			pmc_internal_reservation_context_in(reservation);
1649		}
1650	}
1651}
1652
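/*
 * pmc_internal_reservation_stop_cpu is the stop-side counterpart: when run on
 * a CPU whose current context matches the reservation, it performs the
 * context-out so the counter is saved and stopped immediately.
 */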
1653static void pmc_internal_reservation_stop_cpu(void * arg) {
1654	pmc_reservation_t reservation = (pmc_reservation_t)arg;
1655
1656	assert(reservation);
1657
1658
1659	if (pmc_internal_reservation_matches_context(reservation)) {
1660		COUNTER_DEBUG("Stopping in-context reservation %p for cpu %d\n", reservation, cpu_number());
1661
1662		pmc_internal_reservation_context_out(reservation);
1663	}
1664}
1665
1666/*!fn
1667 * pmc_reservation_interrupt is called when a PMC reservation which was setup
1668 * with an interrupt threshold counts the requested number of events. When the
1669 * underlying counter hits the threshold, an interrupt is generated, and this
1670 * method is called. This method marks the reservation as stopped, and passes
1671 * control off to the user-registered callback method, along with the
1672 * reservation (so that the user can, for example, write a 0 to the counter, and
1673 * restart the reservation).
1674 * This method assumes the reservation has a valid pmc_config_t within.
1675 *
1676 * @param target The pmc_reservation_t that caused the interrupt.
1677 * @param refCon User specified reference constant.
1678 */
1679static void pmc_reservation_interrupt(void *target, void *refCon) {
1680	pmc_reservation_t reservation = (pmc_reservation_t)target;
1681	pmc_state_t newState;
1682	uint64_t timeout;
1683	uint32_t spins;
1684
1685	assert(reservation);
1686
1687	/* Move the state machine */
1688	if (PMC_STATE_INVALID == pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_INTERRUPT, NULL)) {
1689		return;
1690	}
1691
1692	/* A valid state move has been made, but won't be picked up until a context switch occurs.  To cause matching
1693	 * contexts that are currently running to update, we do an inter-processor message to run pmc_internal_reservation_stop_cpu
1694	 * on every cpu that can access the PMC.
1695	 */
1696	pmc_internal_reservation_broadcast(reservation, pmc_internal_reservation_stop_cpu);
1697
1698	/* Spin waiting for the state to turn to INTERRUPT */
1699	nanoseconds_to_absolutetime(PMC_SPIN_TIMEOUT_US * 1000, &timeout);
1700	timeout += mach_absolute_time();
1701	spins = 0;
1702	while (PMC_STATE_STATE(reservation->state) != PMC_STATE_STATE_INTERRUPT) {
1703		/* Assert if this takes longer than PMC_SPIN_TIMEOUT_US */
1704		if (++spins > PMC_SPIN_THRESHOLD) {
1705			if (mach_absolute_time() > timeout) {
1706				pmc_spin_timeout_count++;
1707				assert(0);
1708			}
1709		}
1710
1711		cpu_pause();
1712	}
1713
1714	assert(reservation->config);
1715	assert(reservation->config->method);
1716
1717	/* Call the registered callback handler */
1718#if DEBUG_COUNTERS
1719	uint64_t start = mach_absolute_time();
1720#endif /* DEBUG_COUNTERS */
1721
1722	(void)reservation->config->method(reservation, refCon);
1723
1724#if DEBUG_COUNTERS
1725	uint64_t end = mach_absolute_time();
1726	if((end - start) > 5000ULL) {
1727		kprintf("%s - user method %p took %llu ns\n", __FUNCTION__,
1728				reservation->config->method, (end - start));
1729	}
1730#endif
1731
1732	/* Move the state machine */
1733	if (PMC_STATE_INVALID == (newState = pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_END_OF_INTERRUPT, NULL))) {
1734		return;
1735	}
1736
1737	/* Do any post-move actions necessary */
1738	if (PMC_STATE_STATE(newState) == PMC_STATE_STATE_CAN_RUN) {
1739		pmc_internal_reservation_broadcast(reservation, pmc_internal_reservation_start_cpu);
1740	} else if (PMC_STATE_STATE(newState) == PMC_STATE_STATE_DEALLOC && PMC_STATE_CONTEXT_COUNT(newState) == 0 && PMC_STATE_FLAGS(newState) == 0) {
1741		/* Wake up any thread blocked waiting for this reservation to hit <DEALLOC, 0, > */
1742		thread_wakeup((event_t)reservation);
1743	}
1744}
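
/*
 * Example: a minimal sketch of a client PMI handler of the kind described
 * above.  my_pmi_handler and its (void *, void *) signature are illustrative
 * assumptions that match the way the handler is invoked here; the canonical
 * pmc_interrupt_method_t type is declared in <pmc/pmc.h>.  The handler rewinds
 * the counter and requests a restart, which takes effect once the
 * END_OF_INTERRUPT transition above completes.
 */
#if 0
static void my_pmi_handler(void *target, void *refCon)
{
	pmc_reservation_t reservation = (pmc_reservation_t)target;

	(void)refCon;

	/* Rewind the counter so another full threshold's worth of events is counted */
	(void)pmc_reservation_write(reservation, 0ULL);

	/* Ask for the reservation to be restarted once the interrupt unwinds */
	(void)pmc_reservation_start(reservation);
}
#endif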
1745
1746/*
1747 * Apple-private KPI for Apple kexts (IOProfileFamily) only
1748 */
1749
1750#if 0
1751#pragma mark -
1752#pragma mark IOProfileFamily private KPI
1753#endif
1754
1755/*
1756 * perf_monitor_register registers a new Performance Monitor, and its associated
1757 * callback methods.  The given perf_monitor_object_t is the first argument to
1758 * each callback when they are called.
1759 */
1760kern_return_t perf_monitor_register(perf_monitor_object_t monitor,
1761	perf_monitor_methods_t *methods) {
1762	int cpu = -1;
1763
1764	COUNTER_DEBUG("registering perf monitor %p\n", monitor);
1765
1766	if(!monitor || !methods) {
1767		return KERN_INVALID_ARGUMENT;
1768	}
1769
1770	/* Protect against out-of-date driver kexts */
1771	if(MACH_PERFMON_METHODS_VERSION != methods->perf_monitor_methods_version) {
1772		return KERN_INVALID_ARGUMENT;
1773	}
1774
1775	/* If the monitor requires idle notifications, ensure that it is
1776	 * accessible from a single core only.
1777	 */
1778	if (methods->flags & PERFMON_FLAG_REQUIRES_IDLE_NOTIFICATIONS) {
1779		uint32_t *cores;
1780		size_t core_cnt;
1781
1782		if (KERN_SUCCESS == methods->accessible_cores(monitor, &cores, &core_cnt)) {
1783			/*
1784			 * Guard against disabled cores - monitors will always match and
1785			 * attempt registration, irrespective of 'cpus=x' boot-arg.
1786			 */
1787			if ((core_cnt == 1) && (cores[0] < (uint32_t)ml_get_max_cpus())) {
1788				cpu = cores[0];
1789			} else {
1790				return KERN_INVALID_ARGUMENT;
1791			}
1792		}
1793	}
1794
1795	/* All methods are required */
1796	if(!methods->accessible_cores ||
1797	   !methods->enable_counters || !methods->disable_counters ||
1798	   !methods->on_idle || !methods->on_idle_exit) {
1799		return KERN_INVALID_ARGUMENT;
1800	}
1801
1802	/* prevent dupes. */
1803	perf_monitor_t dupe = perf_monitor_find(monitor);
1804	if(dupe) {
1805		COUNTER_DEBUG("Duplicate registration for %p\n", monitor);
1806		perf_monitor_deallocate(dupe);
1807		return KERN_FAILURE;
1808	}
1809
1810	perf_monitor_t pm = perf_monitor_alloc();
1811	if(!pm) {
1812		return KERN_RESOURCE_SHORTAGE;
1813	}
1814
1815	/* initialize the object */
1816	perf_monitor_init(pm, cpu);
1817
1818	/* copy in the registration info */
1819	pm->object = monitor;
1820	memcpy(&(pm->methods), methods, sizeof(perf_monitor_methods_t));
1821
1822	/* place it in the tracking queues */
1823	perf_monitor_enqueue(pm);
1824
1825	/* debug it */
1826	PRINT_PERF_MON(pm);
1827
1828	return KERN_SUCCESS;
1829}
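
/*
 * Example: a minimal sketch (not part of this file's KPI) of how a driver kext
 * might register a Perf Monitor, based only on the checks performed in
 * perf_monitor_register() above.  The my_monitor token and the my_pm_*
 * callbacks are illustrative; their exact prototypes are declared in
 * <pmc/pmc.h>, and perf_monitor_methods_t may contain members beyond the ones
 * referenced in this file.
 */
#if 0
static kern_return_t my_driver_register_monitor(perf_monitor_object_t my_monitor)
{
	perf_monitor_methods_t methods;

	bzero(&methods, sizeof(methods));

	/* An out-of-date version is rejected with KERN_INVALID_ARGUMENT above. */
	methods.perf_monitor_methods_version = MACH_PERFMON_METHODS_VERSION;
	methods.flags = PERFMON_FLAG_SUPPORTS_CONTEXT_SWITCHING;

	/* Every callback checked above must be non-NULL. */
	methods.accessible_cores = my_pm_accessible_cores;
	methods.enable_counters = my_pm_enable_counters;
	methods.disable_counters = my_pm_disable_counters;
	methods.on_idle = my_pm_on_idle;
	methods.on_idle_exit = my_pm_on_idle_exit;

	return perf_monitor_register(my_monitor, &methods);
}
#endif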
1830
1831/*
1832 * perf_monitor_unregister unregisters a previously registered Perf Monitor,
1833 * looking it up by reference pointer (the same that was used in
1834 * perf_monitor_register()).
1835 */
1836kern_return_t perf_monitor_unregister(perf_monitor_object_t monitor) {
1837	kern_return_t ret = KERN_FAILURE;
1838
1839	COUNTER_DEBUG("unregistering perf monitor %p\n", monitor);
1840
1841	if(!monitor) {
1842		return KERN_INVALID_ARGUMENT;
1843	}
1844
1845	perf_monitor_t pm = perf_monitor_find(monitor);
1846	if(pm) {
1847		/* Remove it from the queues. */
1848		perf_monitor_dequeue(pm);
1849
1850		/* drop extra retain from find */
1851		perf_monitor_deallocate(pm);
1852
1853		/* and release the object */
1854		perf_monitor_deallocate(pm);
1855
1856		ret = KERN_SUCCESS;
1857	} else {
1858		COUNTER_DEBUG("could not find a registered pm that matches!\n");
1859	}
1860
1861	return ret;
1862}
1863
1864/*
1865 * pmc_register registers a new PMC for use with the pmc subsystem. Each PMC is
1866 * associated with a Perf Monitor.  Perf Monitors are looked up by the reference
1867 * pointer that was used to previously register them.
1868 *
1869 * PMCs are registered with a reference pointer (@pmc_object), and a set of
1870 * callback methods.  When the given callback methods are called from xnu, the
1871 * first argument will always be the reference pointer used to register the PMC.
1872 *
1873 * NOTE: @monitor must have been successfully registered via
1874 * perf_monitor_register before this method will succeed.
1875 */
1876kern_return_t pmc_register(perf_monitor_object_t monitor, pmc_object_t pmc_object,
1877	pmc_methods_t *methods, void *object) {
1878
1879	COUNTER_DEBUG("%p %p\n", monitor, pmc_object);
1880
1881	if(!monitor || !pmc_object || !methods || !object) {
1882		return KERN_INVALID_ARGUMENT;
1883	}
1884
1885	/* Prevent version mismatches */
1886	if(MACH_PMC_METHODS_VERSION != methods->pmc_methods_version) {
1887		COUNTER_DEBUG("version mismatch\n");
1888		return KERN_INVALID_ARGUMENT;
1889	}
1890
1891	/* All methods are required. */
1892	if(!methods->create_config ||
1893		!methods->free_config ||
1894		!methods->config_set_value ||
1895		!methods->config_set_threshold ||
1896		!methods->config_set_handler ||
1897		!methods->set_config ||
1898		!methods->get_monitor ||
1899		!methods->get_name ||
1900		!methods->accessible_from_core ||
1901		!methods->accessible_cores ||
1902		!methods->get_count ||
1903		!methods->set_count ||
1904		!methods->disable ||
1905		!methods->enable ||
1906		!methods->open ||
1907		!methods->close) {
1908		return KERN_INVALID_ARGUMENT;
1909	}
1910
1911	/* make sure this perf monitor object is already registered */
1912	/*
1913	 * NOTE: this adds a reference to the parent, so we'll have to drop it in
1914	 * any failure code paths from here on out.
1915	 */
1916	perf_monitor_t pm = perf_monitor_find(monitor);
1917	if(!pm) {
1918		COUNTER_DEBUG("Could not find perf monitor for %p\n", monitor);
1919		return KERN_INVALID_ARGUMENT;
1920	}
1921
1922	/* make a new pmc */
1923	pmc_t pmc = pmc_alloc();
1924	if(!pmc) {
1925		/* drop the extra reference from perf_monitor_find() */
1926		perf_monitor_deallocate(pm);
1927		return KERN_RESOURCE_SHORTAGE;
1928	}
1929
1930	/* init it */
1931	pmc_init(pmc);
1932
1933	pmc->object = pmc_object;
1934	pmc->open_object = object;
1935
1936	/* copy the callbacks in */
1937	memcpy(&(pmc->methods), methods, sizeof(pmc_methods_t));
1938
1939	pmc->monitor = pm;
1940
1941	perf_monitor_add_pmc(pmc->monitor, pmc);
1942
1943	/* enqueue it in our tracking queue */
1944	pmc_enqueue(pmc);
1945
1946	/* drop extra reference from perf_monitor_find() */
1947	perf_monitor_deallocate(pm);
1948
1949	return KERN_SUCCESS;
1950}
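
/*
 * Example: a companion sketch for PMC registration.  The my_* names are
 * illustrative; every callback checked in pmc_register() above must be filled
 * in (with prototypes as declared in <pmc/pmc.h>) before the call can succeed.
 */
#if 0
static kern_return_t my_driver_register_pmc(perf_monitor_object_t my_monitor,
	pmc_object_t my_pmc_object, void *my_open_object)
{
	pmc_methods_t methods;

	bzero(&methods, sizeof(methods));
	methods.pmc_methods_version = MACH_PMC_METHODS_VERSION;

	/*
	 * Fill in create_config, free_config, config_set_value,
	 * config_set_threshold, config_set_handler, set_config, get_monitor,
	 * get_name, accessible_from_core, accessible_cores, get_count,
	 * set_count, disable, enable, open and close here; pmc_register()
	 * rejects the registration if any of them is NULL.
	 */

	return pmc_register(my_monitor, my_pmc_object, &methods, my_open_object);
}
#endif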
1951
1952/*
1953 * pmc_unregister unregisters a previously registered PMC, looking it up by
1954 * reference pointers to *both* the Perf Monitor it was created with, and the PMC's
1955 * reference pointer itself.
1956 */
1957kern_return_t pmc_unregister(perf_monitor_object_t monitor, pmc_object_t pmc_object) {
1958	COUNTER_DEBUG("%p %p\n", monitor, pmc_object);
1959
1960	if(!monitor || !pmc_object) {
1961		return KERN_INVALID_ARGUMENT;
1962	}
1963
1964	pmc_t pmc = pmc_find(pmc_object);
1965	if(!pmc) {
1966		COUNTER_DEBUG("Could not find a matching pmc.\n");
1967		return KERN_FAILURE;
1968	}
1969
1970	/* remove it from the global queue */
1971	pmc_dequeue(pmc);
1972
1973	perf_monitor_remove_pmc(pmc->monitor, pmc);
1974
1975	/* remove extra reference count from pmc_find() */
1976	pmc_deallocate(pmc);
1977
1978	/* dealloc the pmc */
1979	pmc_deallocate(pmc);
1980
1981	return KERN_SUCCESS;
1982}
1983
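/*
 * perf_monitor_reservation_add / perf_monitor_reservation_remove atomically
 * track the number of outstanding reservations against a Perf Monitor.  The
 * count is consulted by pmc_idle() and pmc_idle_exit() below to decide whether
 * a monitor should receive idle notifications.
 */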
1984static void perf_monitor_reservation_add(perf_monitor_t monitor) {
1985    assert(monitor);
1986    OSIncrementAtomic(&(monitor->reservedCounters));
1987}
1988
1989static void perf_monitor_reservation_remove(perf_monitor_t monitor) {
1990    assert(monitor);
1991    OSDecrementAtomic(&(monitor->reservedCounters));
1992}
1993
1994#if 0
1995#pragma mark -
1996#pragma mark KPI
1997#endif
1998
1999/*
2000 * Begin in-kernel and in-kext KPI methods
2001 */
2002
2003/*
2004 * pmc_create_config creates a new configuration area from a given @pmc.
2005 *
2006 * NOTE: This method is not interrupt safe.
2007 */
2008kern_return_t pmc_create_config(pmc_t pmc, pmc_config_t *config) {
2009	pmc_config_t tmp = NULL;
2010
2011	if(!pmc || !config) {
2012		return KERN_INVALID_ARGUMENT;
2013	}
2014
2015	pmc_reference(pmc);
2016
2017	tmp = pmc_config_alloc(pmc);
2018	if(tmp) {
2019		tmp->object = pmc->methods.create_config(pmc->object);
2020
2021		if(!tmp->object) {
2022			pmc_config_free(pmc, tmp);
2023			tmp = NULL;
2024		} else {
2025			tmp->interrupt_after_value = 0ULL;
2026			tmp->method = NULL;
2027			tmp->refCon = NULL;
2028		}
2029	}
2030
2031	pmc_deallocate(pmc);
2032
2033	if(!tmp) {
2034		return KERN_RESOURCE_SHORTAGE;
2035	}
2036
2037	*config = tmp;
2038
2039	return KERN_SUCCESS;
2040}
2041
2042/*
2043 * pmc_free_config frees a configuration area created from a given @pmc
2044 *
2045 * NOTE: This method is not interrupt safe.
2046 */
2047void pmc_free_config(pmc_t pmc, pmc_config_t config) {
2048	assert(pmc);
2049	assert(config);
2050
2051	pmc_reference(pmc);
2052
2053	pmc_config_free(pmc, config);
2054
2055	pmc_deallocate(pmc);
2056}
2057
2058/*
2059 * pmc_config_set_value sets up configuration area key-value pairs.  These pairs
2060 * are either known ahead of time, or looked up via CoreProfile.framework.
2061 *
2062 * NOTE: This method is not interrupt safe.
2063 */
2064kern_return_t pmc_config_set_value(pmc_t pmc, pmc_config_t config,
2065	uint8_t id, uint64_t value) {
2066
2067	kern_return_t ret = KERN_INVALID_ARGUMENT;
2068
2069	if(!pmc || !config) {
2070		return ret;
2071	}
2072
2073	pmc_reference(pmc);
2074
2075	ret = pmc->methods.config_set_value(config->object, id, value);
2076
2077	pmc_deallocate(pmc);
2078
2079	return ret;
2080}
2081
2082/*
2083 * pmc_config_set_interrupt_threshold modifies a config object, instructing
2084 * the pmc that it should generate a call to the given pmc_interrupt_method_t
2085 * after the counter counts @threshold events.
2086 *
2087 * PMC Threshold handler methods will have the pmc_reservation_t that generated the interrupt
2088 * as the first argument when the interrupt handler is invoked, and the given
2089 * @refCon (which may be NULL) as the second.
2090 *
2091 * See pmc_interrupt_method_t.
2092 *
2093 * NOTE: This method is not interrupt safe.
2094 */
2095kern_return_t pmc_config_set_interrupt_threshold(pmc_t pmc, pmc_config_t config,
2096	uint64_t threshold, pmc_interrupt_method_t method, void *refCon) {
2097	kern_return_t ret = KERN_INVALID_ARGUMENT;
2098
2099	if(!config || !pmc) {
2100		return ret;
2101	}
2102
2103	assert(config);
2104	assert(pmc);
2105
2106	pmc_reference(pmc);
2107
2108	do {
2109		/*
2110		 * We have a minor annoyance to side-step here. The driver layer expects
2111		 * the config to never change once a reservation has been taken out with
2112		 * it.  However, in order to have the PMI method have the reservation as
2113		 * the first argument (in order to allow the user-method to, for
2114		 * example, write a 0 to it, and restart it), we need to create the
2115		 * pmc_reservation_t before setting it up in the config object.
2116		 * We overcome this by caching the method in the pmc_config_t stand-in,
2117		 * and mutating the pmc_config_object_t just before returning a
2118		 * reservation (in pmc_reserve() and friends, below).
2119		 */
2120
2121		/* might as well stash this away too. */
2122		config->interrupt_after_value = threshold;
2123		config->method = method;
2124		config->refCon = refCon;
2125
2126		ret = KERN_SUCCESS;
2127
2128	}while(0);
2129
2130	pmc_deallocate(pmc);
2131
2132	return ret;
2133}
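
/*
 * Example: a minimal sketch tying the three config calls above together.  The
 * event id/value literals and the my_pmi_handler routine (sketched earlier)
 * are illustrative only; real key/value pairs are pre-known or obtained via
 * CoreProfile.framework as noted above.
 */
#if 0
static kern_return_t my_build_config(pmc_t pmc, pmc_config_t *out_config)
{
	pmc_config_t config = NULL;
	kern_return_t ret;

	ret = pmc_create_config(pmc, &config);
	if(KERN_SUCCESS != ret) {
		return ret;
	}

	/* Program one key/value pair (illustrative id and value). */
	ret = pmc_config_set_value(pmc, config, 0 /* id */, 0x1ULL /* value */);
	if(KERN_SUCCESS != ret) {
		pmc_free_config(pmc, config);
		return ret;
	}

	/* Optionally request a PMI after 100000 events. */
	ret = pmc_config_set_interrupt_threshold(pmc, config, 100000ULL,
		my_pmi_handler, NULL);
	if(KERN_SUCCESS != ret) {
		pmc_free_config(pmc, config);
		return ret;
	}

	*out_config = config;
	return KERN_SUCCESS;
}
#endif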
2134
2135/*
2136 * pmc_get_pmc_list returns an allocated list of pmc_t's, as well as the number
2137 * of pmc_t's returned. Callers should free this list with a call to
2138 * pmc_free_pmc_list().
2139 *
2140 * NOTE: This method is not interrupt safe.
2141 */
2142kern_return_t pmc_get_pmc_list(pmc_t **pmcs, size_t *pmcCount) {
2143	pmc_t *array = NULL;
2144	pmc_t pmc = NULL;
2145	size_t count = 0UL;
2146
2147	do {
2148		/* Copy down (to the stack) the count of perf counters */
2149		vm_size_t size = perf_counters_count;
2150
2151		/* Allocate that sized chunk */
2152		array = (pmc_t *)kalloc(sizeof(pmc_t) * size);
2153		if(!array) {
2154			return KERN_RESOURCE_SHORTAGE;
2155		}
2156
2157		/* Take the spin lock */
2158		lck_spin_lock(&perf_counters_queue_spin);
2159
2160		/* verify the size didn't change while we were allocating */
2161		if(size != perf_counters_count) {
2162			/*
2163			 * queue size has changed between alloc and now - go back and
2164			 * make another pass.
2165			 */
2166
2167			/* drop the lock */
2168			lck_spin_unlock(&perf_counters_queue_spin);
2169
2170			/* free the block */
2171			kfree(array, sizeof(pmc_t) * size);
2172			array = NULL;
2173		}
2174
2175		/* if we get here, and array is NULL, we try again. */
2176	}while(!array);
2177
2178	/* copy the bits out */
2179	queue_iterate(perf_counters_queue, pmc, pmc_t, link) {
2180		/* copy out the pointer */
2181		array[count++] = pmc;
2182	}
2183
2184	lck_spin_unlock(&perf_counters_queue_spin);
2185
2186	/* return the list and the size */
2187	*pmcs = array;
2188	*pmcCount = count;
2189
2190	return KERN_SUCCESS;
2191}
2192
2193/*
2194 * pmc_free_pmc_list frees an array of pmc_t that has been returned from
2195 * pmc_get_pmc_list.
2196 *
2197 * NOTE: This method is not interrupt safe.
2198 */
2199void pmc_free_pmc_list(pmc_t *pmcs, size_t pmcCount) {
2200	if(pmcs && pmcCount) {
2201		COUNTER_DEBUG("pmcs: %p pmcCount: %lu\n", pmcs, pmcCount);
2202
2203		kfree(pmcs, pmcCount * sizeof(pmc_t));
2204	}
2205}
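
/*
 * Example: a minimal sketch of enumerating the registered PMCs using the two
 * routines above together with pmc_get_name() (defined later in this file).
 * The my_dump_pmcs name is illustrative.
 */
#if 0
static void my_dump_pmcs(void)
{
	pmc_t *pmcs = NULL;
	size_t count = 0UL, ii;

	if(KERN_SUCCESS != pmc_get_pmc_list(&pmcs, &count)) {
		return;
	}

	for(ii = 0UL; ii < count; ii++) {
		kprintf("pmc[%lu]: %s\n", ii, pmc_get_name(pmcs[ii]));
	}

	/* The list was allocated on our behalf, so hand it back. */
	pmc_free_pmc_list(pmcs, count);
}
#endif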
2206
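/*
 * pmc_find_by_name returns a newly allocated array of all registered pmc_t's
 * whose names begin with the given @name prefix, along with the number of
 * matches.  The returned array must be released with pmc_free_pmc_list().
 *
 * NOTE: This method is not interrupt safe (it allocates).
 */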
2207kern_return_t pmc_find_by_name(const char *name, pmc_t **pmcs, size_t *pmcCount) {
2208	kern_return_t ret = KERN_INVALID_ARGUMENT;
2209
2210	if(!name || !pmcs || !pmcCount) {
2211		return ret;
2212	}
2213
2214	pmc_t *list = NULL;
2215	size_t count = 0UL;
2216
2217	if(KERN_SUCCESS == (ret = pmc_get_pmc_list(&list, &count))) {
2218		size_t matchCount = 0UL, ii = 0UL, swapPtr = 0UL;
2219		size_t len = strlen(name);
2220
2221		for(ii = 0UL; ii < count; ii++) {
2222			const char *pmcName = pmc_get_name(list[ii]);
2223
2224			if(strlen(pmcName) < len) {
2225				/*
2226				 * If the pmc name is shorter than the requested match, it's no
2227				 * match, as we're looking for the most specific match(es).
2228				 */
2229				continue;
2230			}
2231
2232			if(0 == strncmp(name, pmcName, len)) {
2233				pmc_t temp = list[ii];
2234
2235				// move matches to the head of the array.
2236				list[ii] = list[swapPtr];
2237				list[swapPtr] = temp;
2238				swapPtr++;
2239
2240				// keep a count of the matches
2241				matchCount++;
2242			}
2243		}
2244
2245		if(matchCount) {
2246			/*
2247			 * If we have matches, they are all at the head of the array, so
2248			 * just allocate enough space for @matchCount pmc_t's, and copy the
2249			 * head of the array to the new allocation.  Then free the old
2250			 * allocation.
2251			 */
2252
2253			pmc_t *result = (pmc_t *)kalloc(sizeof(pmc_t) * matchCount);
2254			if(result) {
2255				// copy the matches
2256				memcpy(result, list, sizeof(pmc_t) * matchCount);
2257
2258				ret = KERN_SUCCESS;
2259			}
2260
2261			pmc_free_pmc_list(list, count);
2262
2263			if(!result) {
2264				*pmcs = NULL;
2265				*pmcCount = 0UL;
2266				return KERN_RESOURCE_SHORTAGE;
2267			}
2268
2269			*pmcs = result;
2270			*pmcCount = matchCount;
2271		} else {
2272			*pmcs = NULL;
2273			*pmcCount = 0UL;
2274		}
2275	}
2276
2277	return ret;
2278}
2279
2280/*
2281 * pmc_get_name returns a pointer (not copied) to the human-readable name of the
2282 * given pmc.
2283 *
2284 * NOTE: Driver authors must take care to not allocate during this method, as
2285 * this method *IS* interrupt safe.
2286 */
2287const char *pmc_get_name(pmc_t pmc) {
2288	assert(pmc);
2289
2290	const char *name = pmc->methods.get_name(pmc->object);
2291
2292	return name;
2293}
2294
2295/*
2296 * pmc_get_accessible_core_list returns a pointer to an array of logical core
2297 * numbers (as well as the size of that array) that represent the logical cores
2298 * (hardware threads) from which the given @pmc can be accessed directly.
2299 *
2300 * NOTE: This method is interrupt safe.
2301 */
2302kern_return_t pmc_get_accessible_core_list(pmc_t pmc, uint32_t **logicalCores,
2303	size_t *logicalCoreCt) {
2304
2305	kern_return_t ret = KERN_INVALID_ARGUMENT;
2306
2307	if(!pmc || !logicalCores || !logicalCoreCt) {
2308		return ret;
2309	}
2310
2311	ret = pmc->methods.accessible_cores(pmc->object, logicalCores, logicalCoreCt);
2312
2313	return ret;
2314}
2315
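/*
 * pmc_reservation_setup_pmi applies any PMI request cached in the config (see
 * pmc_config_set_interrupt_threshold above) to the driver-level config object:
 * it sets the threshold and installs pmc_reservation_interrupt as the handler,
 * with the reservation itself as the handler's target.  Returns TRUE on
 * success, or when no PMI was requested.
 */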
2316static boolean_t pmc_reservation_setup_pmi(pmc_reservation_t resv, pmc_config_t config) {
2317	assert(resv);
2318	assert(resv->pmc);
2319	assert(config);
2320	assert(config->object);
2321
2322	/* Set up the PMI only if one was requested; if there is none, there is nothing to do and we return success */
2323	if(config->interrupt_after_value && config->method) {
2324
2325		/* set the threshold */
2326		kern_return_t ret = resv->pmc->methods.config_set_threshold(config->object,
2327			config->interrupt_after_value);
2328
2329		if(KERN_SUCCESS != ret) {
2330			/*
2331			 * This is the most useful error message here, as this only happens
2332			 * as a result of pmc_reserve*()
2333			 */
2334			COUNTER_DEBUG("Failed to set threshold for pmc %p\n", resv->pmc);
2335			return FALSE;
2336		}
2337
2338		if(KERN_SUCCESS != resv->pmc->methods.config_set_handler(config->object,
2339			(void *)resv, &pmc_reservation_interrupt, config->refCon)) {
2340
2341			COUNTER_DEBUG("Failed to set handler for pmc %p\n", resv->pmc);
2342			return FALSE;
2343		}
2344	}
2345
2346	return TRUE;
2347}
2348
2349/*
2350 * pmc_reserve will attempt to reserve the given @pmc, with a given
2351 * configuration object, for counting system-wide. This method will fail with
2352 * KERN_FAILURE if the given pmc is already reserved at any scope.
2353 *
2354 * This method consumes the given configuration object if it returns
2355 * KERN_SUCCESS. Any other return value indicates the caller
2356 * must free the config object via pmc_free_config().
2357 *
2358 * NOTE: This method is NOT interrupt safe.
2359 */
2360kern_return_t pmc_reserve(pmc_t pmc, pmc_config_t config,
2361	pmc_reservation_t *reservation) {
2362
2363	if(!pmc || !config || !reservation) {
2364		return KERN_INVALID_ARGUMENT;
2365	}
2366
2367	pmc_reservation_t resv = reservation_alloc();
2368	if(!resv) {
2369		return KERN_RESOURCE_SHORTAGE;
2370	}
2371
2372	reservation_init(resv);
2373
2374	resv->flags |= PMC_FLAG_SCOPE_SYSTEM;
2375	resv->config = config;
2376
2377	if(KERN_SUCCESS != pmc_internal_reservation_set_pmc(resv, pmc)) {
2378		resv->config = NULL;
2379		return KERN_FAILURE;
2380	}
2381
2382	/* enqueue reservation in proper place */
2383	if(!pmc_internal_reservation_add(resv) || !pmc_reservation_setup_pmi(resv, config)) {
2384		/* Prevent free of config object */
2385		resv->config = NULL;
2386
2387		reservation_free(resv);
2388		return KERN_FAILURE;
2389	}
2390
2391	perf_monitor_reservation_add(pmc->monitor);
2392
2393	*reservation = resv;
2394
2395	return KERN_SUCCESS;
2396}
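
/*
 * Example: a minimal sketch of the system-wide reservation lifecycle built on
 * the routine above.  my_build_config is the illustrative helper sketched
 * earlier; the error handling follows the rule that the config is consumed
 * only on KERN_SUCCESS.
 */
#if 0
static kern_return_t my_count_system_wide(pmc_t pmc)
{
	pmc_config_t config = NULL;
	pmc_reservation_t reservation = NULL;
	uint64_t value = 0ULL;
	kern_return_t ret;

	ret = my_build_config(pmc, &config);
	if(KERN_SUCCESS != ret) {
		return ret;
	}

	ret = pmc_reserve(pmc, config, &reservation);
	if(KERN_SUCCESS != ret) {
		/* The reservation failed, so we still own the config. */
		pmc_free_config(pmc, config);
		return ret;
	}

	(void)pmc_reservation_start(reservation);

	/* ... count for a while ... */

	(void)pmc_reservation_stop(reservation);
	(void)pmc_reservation_read(reservation, &value);

	/* Blocks until the counter is fully off the hardware, then frees it. */
	(void)pmc_reservation_free(reservation);

	return KERN_SUCCESS;
}
#endif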
2397
2398/*
2399 * pmc_reserve_task will attempt to reserve the given @pmc with a given
2400 * configuration object, for counting when the given @task is running on any
2401 * logical core that can directly access the given @pmc.  This method will fail
2402 * with KERN_FAILURE if the given pmc is already reserved at either system or
2403 * thread scope.
2404 *
2405 * This method consumes the given configuration object if it returns
2406 * KERN_SUCCESS. Any other return value indicates the caller
2407 * must free the config object via pmc_free_config().
2408 *
2409 * NOTE: You can reserve the same pmc for N different tasks concurrently.
2410 * NOTE: This method is NOT interrupt safe.
2411 */
2412kern_return_t pmc_reserve_task(pmc_t pmc, pmc_config_t config,
2413	task_t task, pmc_reservation_t *reservation) {
2414
2415	if(!pmc || !config || !reservation || !task) {
2416		return KERN_INVALID_ARGUMENT;
2417	}
2418
2419	if (!(pmc->monitor->methods.flags & PERFMON_FLAG_SUPPORTS_CONTEXT_SWITCHING)) {
2420		COUNTER_DEBUG("pmc %p cannot be context switched!\n", pmc);
2421		return KERN_INVALID_ARGUMENT;
2422	}
2423
2424	pmc_reservation_t resv = reservation_alloc();
2425	if(!resv) {
2426		return KERN_RESOURCE_SHORTAGE;
2427	}
2428
2429	reservation_init(resv);
2430
2431	resv->flags |= PMC_FLAG_SCOPE_TASK;
2432	resv->task = task;
2433
2434	resv->config = config;
2435
2436	if(KERN_SUCCESS != pmc_internal_reservation_set_pmc(resv, pmc)) {
2437		resv->config = NULL;
2438		return KERN_FAILURE;
2439	}
2440
2441	/* enqueue reservation in proper place */
2442	if(!pmc_internal_reservation_add(resv) || !pmc_reservation_setup_pmi(resv, config)) {
2443		/* Prevent free of config object */
2444		resv->config = NULL;
2445
2446		reservation_free(resv);
2447		return KERN_FAILURE;
2448	}
2449
2450	perf_monitor_reservation_add(pmc->monitor);
2451
2452	*reservation = resv;
2453
2454	return KERN_SUCCESS;
2455}
2456
2457/*
2458 * pmc_reserve_thread will attempt to reserve the given @pmc with a given
2459 * configuration object, for counting when the given @thread is running on any
2460 * logical core that can directly access the given @pmc.  This method will fail
2461 * with KERN_FAILURE if the given pmc is already reserved at either system or
2462 * task scope.
2463 *
2464 * This method consumes the given configuration object if it returns
2465 * KERN_SUCCESS. Any other return value indicates the caller
2466 * must free the config object via pmc_free_config().
2467 *
2468 * NOTE: You can reserve the same pmc for N different threads concurrently.
2469 * NOTE: This method is NOT interrupt safe.
2470 */
2471kern_return_t pmc_reserve_thread(pmc_t pmc, pmc_config_t config,
2472	thread_t thread, pmc_reservation_t *reservation) {
2473	if(!pmc || !config || !reservation || !thread) {
2474		return KERN_INVALID_ARGUMENT;
2475	}
2476
2477	if (!(pmc->monitor->methods.flags & PERFMON_FLAG_SUPPORTS_CONTEXT_SWITCHING)) {
2478		COUNTER_DEBUG("pmc %p cannot be context switched!\n", pmc);
2479		return KERN_INVALID_ARGUMENT;
2480	}
2481
2482	pmc_reservation_t resv = reservation_alloc();
2483	if(!resv) {
2484		return KERN_RESOURCE_SHORTAGE;
2485	}
2486
2487	reservation_init(resv);
2488
2489	resv->flags |= PMC_FLAG_SCOPE_THREAD;
2490	resv->thread = thread;
2491
2492	resv->config = config;
2493
2494	if(KERN_SUCCESS != pmc_internal_reservation_set_pmc(resv, pmc)) {
2495		resv->config = NULL;
2496		return KERN_FAILURE;
2497	}
2498
2499	/* enqueue reservation in proper place */
2500	if(!pmc_internal_reservation_add(resv) || !pmc_reservation_setup_pmi(resv, config)) {
2501		/* Prevent free of config object */
2502		resv->config = NULL;
2503
2504		reservation_free(resv);
2505		return KERN_FAILURE;
2506	}
2507
2508	perf_monitor_reservation_add(pmc->monitor);
2509
2510	*reservation = resv;
2511
2512	return KERN_SUCCESS;
2513}
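
/*
 * Example: the task- and thread-scoped variants follow the same pattern as the
 * system-wide case; a minimal sketch for the current thread, assuming a config
 * built as above.  The my_count_current_thread name is illustrative.
 */
#if 0
static kern_return_t my_count_current_thread(pmc_t pmc, pmc_config_t config,
	pmc_reservation_t *reservation)
{
	kern_return_t ret;

	ret = pmc_reserve_thread(pmc, config, current_thread(), reservation);
	if(KERN_SUCCESS != ret) {
		/* The caller still owns the config on failure. */
		return ret;
	}

	return pmc_reservation_start(*reservation);
}
#endif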
2514
2515/*
2516 * pmc_reservation_start instructs the given reservation to start counting as
2517 * soon as possible.
2518 *
2519 * NOTE: This method is interrupt safe.
2520 */
2521kern_return_t pmc_reservation_start(pmc_reservation_t reservation) {
2522	pmc_state_t newState;
2523
2524	if(!reservation) {
2525		return KERN_INVALID_ARGUMENT;
2526	}
2527
2528	/* Move the state machine */
2529	if (PMC_STATE_INVALID == (newState = pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_START, NULL))) {
2530		return KERN_FAILURE;
2531	}
2532
2533	/* If we are currently in an interrupt, don't bother to broadcast since it won't do anything now and the interrupt will
2534	 * broadcast right before it leaves
2535	 */
2536	if (PMC_STATE_STATE(newState) != PMC_STATE_STATE_INTERRUPT) {
2537		/* A valid state move has been made, but won't be picked up until a context switch occurs.  To cause matching
2538		 * contexts that are currently running to update, we do an inter-processor message to run pmc_internal_reservation_start_cpu
2539		 * on every cpu that can access the PMC.
2540		 */
2541		pmc_internal_reservation_broadcast(reservation, pmc_internal_reservation_start_cpu);
2542	}
2543
2544	return KERN_SUCCESS;
2545}
2546
2547/*
2548 * pmc_reservation_stop instructs the given reservation to stop counting as
2549 * soon as possible.  When this method returns, the pmc will be marked as stopping
2550 * and subsequent calls to pmc_reservation_start will succeed.  This does not mean
2551 * that the pmc hardware has _actually_ stopped running.  Assuming no other changes
2552 * to the reservation state, the pmc hardware _will_ stop shortly.
2553 *
2554 */
2555kern_return_t pmc_reservation_stop(pmc_reservation_t reservation) {
2556	pmc_state_t newState;
2557
2558	if(!reservation) {
2559		return KERN_INVALID_ARGUMENT;
2560	}
2561
2562	/* Move the state machine */
2563	if (PMC_STATE_INVALID == (newState = pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_STOP, NULL))) {
2564		return KERN_FAILURE;
2565	}
2566
2567	/* If we are currently in an interrupt, don't bother to broadcast since it won't do anything now and the interrupt will
2568	 * broadcast right before it leaves.  Similarly, if we just moved directly to STOP, don't bother broadcasting.
2569	 */
2570	if (PMC_STATE_STATE(newState) != PMC_STATE_STATE_INTERRUPT && PMC_STATE_STATE(newState) != PMC_STATE_STATE_STOP) {
2571		/* A valid state move has been made, but won't be picked up until a context switch occurs.  To cause matching
2572		 * contexts that are currently running to update, we do an inter-processor message to run pmc_internal_reservation_stop_cpu
2573		 * on every cpu that can access the PMC.
2574		 */
2575
2576		pmc_internal_reservation_broadcast(reservation, pmc_internal_reservation_stop_cpu);
2577	}
2578
2579	return KERN_SUCCESS;
2580}
2581
2582/*
2583 * pmc_reservation_read will read the event count associated with a reservation.
2584 * If the caller is currently executing in a context that both a) matches the
2585 * reservation's context, and b) can access the reservation's pmc directly, the
2586 * value will be read from hardware.  Otherwise, this returns the reservation's
2587 * stored value.
2588 *
2589 * NOTE: This method is interrupt safe.
2590 * NOTE: When not on the interrupt stack, this method may block.
2591 */
2592kern_return_t pmc_reservation_read(pmc_reservation_t reservation, uint64_t *value) {
2593	kern_return_t ret = KERN_FAILURE;
2594	uint64_t timeout;
2595	uint32_t spins;
2596
2597	if(!reservation || !value) {
2598		return KERN_INVALID_ARGUMENT;
2599	}
2600
2601	nanoseconds_to_absolutetime(PMC_SPIN_TIMEOUT_US * 1000, &timeout);
2602	timeout += mach_absolute_time();
2603	spins = 0;
2604	do {
2605		uint32_t state = reservation->state;
2606
2607		if((PMC_STATE_STATE(state) == PMC_STATE_STATE_RUN)) {
2608			/* Attempt read from hardware via drivers. */
2609
2610			assert(reservation->pmc);
2611
2612			ret = reservation->pmc->methods.get_count(reservation->pmc->object, value);
2613
2614			break;
2615		} else if ((PMC_STATE_STATE(state) == PMC_STATE_STATE_STORE) ||
2616				   (PMC_STATE_STATE(state) == PMC_STATE_STATE_LOAD)) {
2617			/* Spin */
2618			/* Assert if this takes longer than PMC_SPIN_TIMEOUT_US */
2619			if (++spins > PMC_SPIN_THRESHOLD) {
2620				if (mach_absolute_time() > timeout) {
2621					pmc_spin_timeout_count++;
2622					assert(0);
2623				}
2624			}
2625
2626			cpu_pause();
2627		} else {
2628			break;
2629		}
2630	} while (1);
2631
2632	/* If the direct hardware read failed (for whatever reason) */
2633	if(KERN_SUCCESS != ret) {
2634		/* Read stored value */
2635		*value = reservation->value;
2636	}
2637
2638	return KERN_SUCCESS;
2639}
2640
2641/*
2642 * pmc_reservation_write will write the event count associated with a reservation.
2643 * If the caller is currently executing in a context that both a) matches the
2644 * reservation's context, and b) can access the reservation's pmc directly, the
2645 * value will be written to hardware.  Otherwise, this writes the reservation's
2646 * stored value.
2647 *
2648 * NOTE: This method is interrupt safe.
2649 * NOTE: When not on the interrupt stack, this method may block.
2650 */
2651kern_return_t pmc_reservation_write(pmc_reservation_t reservation, uint64_t value) {
2652	kern_return_t ret = KERN_FAILURE;
2653	uint64_t timeout;
2654	uint32_t spins;
2655
2656	if(!reservation) {
2657		return KERN_INVALID_ARGUMENT;
2658	}
2659
2660	nanoseconds_to_absolutetime(PMC_SPIN_TIMEOUT_US * 1000, &timeout);
2661	timeout += mach_absolute_time();
2662	spins = 0;
2663	do {
2664		uint32_t state = reservation->state;
2665
2666		if((PMC_STATE_STATE(state) == PMC_STATE_STATE_RUN)) {
2667				/* Write to hardware via drivers. */
2668			assert(reservation->pmc);
2669
2670			ret = reservation->pmc->methods.set_count(reservation->pmc->object, value);
2671			break;
2672		} else if ((PMC_STATE_STATE(state) == PMC_STATE_STATE_STORE) ||
2673				   (PMC_STATE_STATE(state) == PMC_STATE_STATE_LOAD)) {
2674			/* Spin */
2675			/* Assert if this takes longer than PMC_SPIN_TIMEOUT_US */
2676			if (++spins > PMC_SPIN_THRESHOLD) {
2677				if (mach_absolute_time() > timeout) {
2678					pmc_spin_timeout_count++;
2679					assert(0);
2680				}
2681			}
2682
2683			cpu_pause();
2684		} else {
2685			break;
2686		}
2687	} while (1);
2688
2689	if(KERN_SUCCESS != ret) {
2690		/* Write stored value */
2691		reservation->value = value;
2692	}
2693
2694	return KERN_SUCCESS;
2695}
2696
2697/*
2698 * pmc_reservation_free releases a reservation and all associated resources.
2699 *
2700 * NOTE: This method is NOT interrupt safe.
2701 */
2702kern_return_t pmc_reservation_free(pmc_reservation_t reservation) {
2703	pmc_state_t newState;
2704
2705	if(!reservation) {
2706		return KERN_INVALID_ARGUMENT;
2707	}
2708
2709	perf_monitor_reservation_remove(reservation->pmc->monitor);
2710
2711	/* Move the state machine */
2712	if (PMC_STATE_INVALID == (newState = pmc_internal_reservation_move_for_event(reservation, PMC_STATE_EVENT_FREE, NULL))) {
2713		return KERN_FAILURE;
2714	}
2715
2716	/* If we didn't move directly to DEALLOC, help things along */
2717	if (PMC_STATE_STATE(newState) != PMC_STATE_STATE_DEALLOC) {
2718		/* A valid state move has been made, but won't be picked up until a context switch occurs.  To cause matching
2719		 * contexts that are currently running to update, we do an inter-processor message to run pmc_internal_reservation_stop_cpu
2720		 * on every cpu that can access the PMC.
2721		 */
2722		pmc_internal_reservation_broadcast(reservation, pmc_internal_reservation_stop_cpu);
2723	}
2724
2725	/* Block until the reservation hits the <DEALLOC, 0, > state */
2726	while (!(PMC_STATE_STATE(reservation->state) == PMC_STATE_STATE_DEALLOC && PMC_STATE_CONTEXT_COUNT(reservation->state) == 0 && PMC_STATE_FLAGS(reservation->state) == 0)) {
2727		assert_wait((event_t)reservation, THREAD_UNINT);
2728		thread_block(THREAD_CONTINUE_NULL);
2729	}
2730
2731	/* remove from queues */
2732	pmc_internal_reservation_remove(reservation);
2733
2734	/* free reservation */
2735	reservation_free(reservation);
2736
2737	return KERN_SUCCESS;
2738}
2739
2740/*
2741 * pmc_idle notifies eligible monitors of impending per-CPU idle, and can be used to save state.
2742 */
2743boolean_t pmc_idle(void) {
2744	perf_monitor_t monitor = NULL;
2745	queue_head_t *cpu_queue;
2746
2747	lck_spin_lock(&perf_monitor_queue_spin);
2748
2749	if (cpu_monitor_queues) {
2750		cpu_queue = cpu_monitor_queues[cpu_number()];
2751
2752		queue_iterate(cpu_queue, monitor, perf_monitor_t, cpu_link) {
2753			perf_monitor_methods_t *methods = &(monitor->methods);
2754			if ((methods->flags & PERFMON_FLAG_ALWAYS_ACTIVE) || (monitor->reservedCounters)) {
2755				methods->on_idle(monitor->object);
2756			}
2757		}
2758	}
2759
2760	lck_spin_unlock(&perf_monitor_queue_spin);
2761
2762	return TRUE;
2763}
2764
2765/*
2766 * pmc_idle_exit notifies eligible monitors of wake from idle; it can be used to restore state.
2767 */
2768boolean_t pmc_idle_exit(void) {
2769	perf_monitor_t monitor = NULL;
2770	queue_head_t *cpu_queue;
2771
2772	lck_spin_lock(&perf_monitor_queue_spin);
2773
2774	if (cpu_monitor_queues) {
2775		cpu_queue = cpu_monitor_queues[cpu_number()];
2776
2777		queue_iterate(cpu_queue, monitor, perf_monitor_t, cpu_link) {
2778			perf_monitor_methods_t *methods = &(monitor->methods);
2779			if ((methods->flags & PERFMON_FLAG_ALWAYS_ACTIVE) || (monitor->reservedCounters)) {
2780				methods->on_idle_exit(monitor->object);
2781			}
2782		}
2783	}
2784
2785	lck_spin_unlock(&perf_monitor_queue_spin);
2786
2787	return TRUE;
2788}
2789
2790/*
2791 * pmc_context_switch performs all context switching necessary to save all pmc
2792 * state associated with @oldThread (and the task to which @oldThread belongs),
2793 * as well as to restore all pmc state associated with @newThread (and the task
2794 * to which @newThread belongs).
2795 *
2796 * NOTE: This method IS interrupt safe.
2797 */
2798boolean_t pmc_context_switch(thread_t oldThread, thread_t newThread) {
2799	pmc_reservation_t resv = NULL;
2800	uint32_t cpuNum = cpu_number();
2801
2802	lck_spin_lock(&reservations_spin);
2803
2804	/* Save pmc states */
2805	if (thread_reservation_count) {
2806		queue_iterate(thread_reservations, resv, pmc_reservation_t, link) {
2807			if ((oldThread == resv->thread) && pmc_accessible_from_core(resv->pmc, cpuNum)) {
2808				(void)pmc_internal_reservation_context_out(resv);
2809			}
2810		}
2811	}
2812
2813	if (task_reservation_count) {
2814		queue_iterate(task_reservations, resv, pmc_reservation_t, link) {
2815			if ((resv->task == oldThread->task) && pmc_accessible_from_core(resv->pmc, cpuNum)) {
2816				(void)pmc_internal_reservation_context_out(resv);
2817			}
2818		}
2819	}
2820
2821	/* Restore */
2822	if (thread_reservation_count) {
2823		queue_iterate(thread_reservations, resv, pmc_reservation_t, link) {
2824			if ((resv->thread == newThread) && pmc_accessible_from_core(resv->pmc, cpuNum)) {
2825				(void)pmc_internal_reservation_context_in(resv);
2826			}
2827		}
2828	}
2829
2830	if (task_reservation_count) {
2831		queue_iterate(task_reservations, resv, pmc_reservation_t, link) {
2832			if ((resv->task == newThread->task) && pmc_accessible_from_core(resv->pmc, cpuNum)) {
2833				(void)pmc_internal_reservation_context_in(resv);
2834			}
2835		}
2836	}
2837
2838	lck_spin_unlock(&reservations_spin);
2839
2840	return TRUE;
2841}
2842
2843#else /* !CONFIG_COUNTERS */
2844
2845#if 0
2846#pragma mark -
2847#pragma mark Dummy functions
2848#endif
2849
2850/*
2851 * In the case that someone has chosen not to include the PMC KPI in some
2852 * configuration, we still have exports for kexts, so we'll need to define stub
2853 * methods that return failures.
2854 */
2855kern_return_t perf_monitor_register(perf_monitor_object_t monitor __unused,
2856	perf_monitor_methods_t *methods __unused) {
2857	return KERN_FAILURE;
2858}
2859
2860kern_return_t perf_monitor_unregister(perf_monitor_object_t monitor __unused) {
2861	return KERN_FAILURE;
2862}
2863
2864kern_return_t pmc_register(perf_monitor_object_t monitor __unused,
2865	pmc_object_t pmc __unused, pmc_methods_t *methods __unused, void *object __unused) {
2866	return KERN_FAILURE;
2867}
2868
2869kern_return_t pmc_unregister(perf_monitor_object_t monitor __unused,
2870	pmc_object_t pmc __unused) {
2871	return KERN_FAILURE;
2872}
2873
2874kern_return_t pmc_create_config(pmc_t pmc __unused,
2875	pmc_config_t *config __unused) {
2876	return KERN_FAILURE;
2877}
2878
2879void pmc_free_config(pmc_t pmc __unused, pmc_config_t config __unused) {
2880}
2881
2882kern_return_t pmc_config_set_value(pmc_t pmc __unused,
2883	pmc_config_t config __unused, uint8_t id __unused,
2884	uint64_t value __unused) {
2885	return KERN_FAILURE;
2886}
2887
2888kern_return_t pmc_config_set_interrupt_threshold(pmc_t pmc __unused,
2889	pmc_config_t config __unused, uint64_t threshold __unused,
2890	pmc_interrupt_method_t method __unused, void *refCon __unused) {
2891	return KERN_FAILURE;
2892}
2893
2894kern_return_t pmc_get_pmc_list(pmc_t **pmcs __unused, size_t *pmcCount __unused) {
2895	return KERN_FAILURE;
2896}
2897
2898void pmc_free_pmc_list(pmc_t *pmcs __unused, size_t pmcCount __unused) {
2899}
2900
2901kern_return_t pmc_find_by_name(const char *name __unused, pmc_t **pmcs __unused,
2902	size_t *pmcCount __unused) {
2903	return KERN_FAILURE;
2904}
2905
2906const char *pmc_get_name(pmc_t pmc __unused) {
2907	return "";
2908}
2909
2910kern_return_t pmc_get_accessible_core_list(pmc_t pmc __unused,
2911	uint32_t **logicalCores __unused, size_t *logicalCoreCt __unused) {
2912	return KERN_FAILURE;
2913}
2914
2915kern_return_t pmc_reserve(pmc_t pmc __unused,
2916	pmc_config_t config __unused, pmc_reservation_t *reservation __unused) {
2917	return KERN_FAILURE;
2918}
2919
2920kern_return_t pmc_reserve_task(pmc_t pmc __unused,
2921	pmc_config_t config __unused, task_t task __unused,
2922	pmc_reservation_t *reservation __unused) {
2923	return KERN_FAILURE;
2924}
2925
2926kern_return_t pmc_reserve_thread(pmc_t pmc __unused,
2927	pmc_config_t config __unused, thread_t thread __unused,
2928	pmc_reservation_t *reservation __unused) {
2929	return KERN_FAILURE;
2930}
2931
2932kern_return_t pmc_reservation_start(pmc_reservation_t reservation __unused) {
2933	return KERN_FAILURE;
2934}
2935
2936kern_return_t pmc_reservation_stop(pmc_reservation_t reservation __unused) {
2937	return KERN_FAILURE;
2938}
2939
2940kern_return_t pmc_reservation_read(pmc_reservation_t reservation __unused,
2941	uint64_t *value __unused) {
2942	return KERN_FAILURE;
2943}
2944
2945kern_return_t pmc_reservation_write(pmc_reservation_t reservation __unused,
2946	uint64_t value __unused) {
2947	return KERN_FAILURE;
2948}
2949
2950kern_return_t pmc_reservation_free(pmc_reservation_t reservation __unused) {
2951	return KERN_FAILURE;
2952}
2953
2954
2955#endif /* !CONFIG_COUNTERS */
2956