/*
 * Performance event support - powerpc architecture code
 *
 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/perf_event.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <asm/reg.h>
#include <asm/pmc.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/ptrace.h>

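/*
 * Per-CPU bookkeeping for the events currently scheduled on this CPU's
 * PMU.  (Descriptive summary added for clarity; the field meanings are
 * inferred from how they are used below.)  event[]/events[]/flags[] hold
 * the scheduled perf_events, their raw event codes and per-event flags;
 * mmcr[] caches the MMCR0/MMCR1/MMCRA values computed for that set;
 * limited_counter[]/limited_hwidx[] track events living on the "limited"
 * PMCs (PMC5/PMC6 on some processors); alternatives/amasks/avalues are
 * scratch space for the constraint solver in power_check_constraints().
 */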
struct cpu_hw_events {
	int n_events;
	int n_percpu;
	int disabled;
	int n_added;
	int n_limited;
	u8  pmcs_enabled;
	struct perf_event *event[MAX_HWEVENTS];
	u64 events[MAX_HWEVENTS];
	unsigned int flags[MAX_HWEVENTS];
	unsigned long mmcr[3];
	struct perf_event *limited_counter[MAX_LIMITED_HWCOUNTERS];
	u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
	u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
	unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
	unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];

	unsigned int group_flag;
	int n_txn_start;
};
DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);

struct power_pmu *ppmu;

/*
 * Normally, to ignore kernel events we set the FCS (freeze counters
 * in supervisor mode) bit in MMCR0, but if the kernel runs with the
 * hypervisor bit set in the MSR, or if we are running on a processor
 * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
 * then we need to use the FCHV bit to ignore kernel events.
 */
static unsigned int freeze_events_kernel = MMCR0_FCS;

/*
 * 32-bit doesn't have MMCRA but does have an MMCR2,
 * and a few other names are different.
 */
#ifdef CONFIG_PPC32

#define MMCR0_FCHV		0
#define MMCR0_PMCjCE		MMCR0_PMCnCE

#define SPRN_MMCRA		SPRN_MMCR2
#define MMCRA_SAMPLE_ENABLE	0

static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
{
	return 0;
}
static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { }
static inline u32 perf_get_misc_flags(struct pt_regs *regs)
{
	return 0;
}
static inline void perf_read_regs(struct pt_regs *regs) { }
static inline int perf_intr_is_nmi(struct pt_regs *regs)
{
	return 0;
}

#endif /* CONFIG_PPC32 */

/*
 * Things that are specific to 64-bit implementations.
 */
#ifdef CONFIG_PPC64

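/*
 * (Explanatory comment added; the behaviour described is inferred from
 * how the value is used, not stated in the original source.)  With
 * instruction sampling enabled, SIAR points at the first instruction of
 * the sampled dispatch group and the MMCRA slot field gives the sampled
 * instruction's position within that group, so the reported ip must be
 * advanced by 4 bytes per slot past the first, e.g. slot 3 means
 * SIAR + 8.
 */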
static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
{
	unsigned long mmcra = regs->dsisr;

	if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
		unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
		if (slot > 1)
			return 4 * (slot - 1);
	}
	return 0;
}

/*
 * The user wants a data address recorded.
 * If we're not doing instruction sampling, give them the SDAR
 * (sampled data address).  If we are doing instruction sampling, then
 * only give them the SDAR if it corresponds to the instruction
 * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC
 * bit in MMCRA.
 */
static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp)
{
	unsigned long mmcra = regs->dsisr;
	unsigned long sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
		POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;

	if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
		*addrp = mfspr(SPRN_SDAR);
}

static inline u32 perf_get_misc_flags(struct pt_regs *regs)
{
	unsigned long mmcra = regs->dsisr;
	unsigned long sihv = MMCRA_SIHV;
	unsigned long sipr = MMCRA_SIPR;

	if (TRAP(regs) != 0xf00)
		return 0;	/* not a PMU interrupt */

	if (ppmu->flags & PPMU_ALT_SIPR) {
		sihv = POWER6_MMCRA_SIHV;
		sipr = POWER6_MMCRA_SIPR;
	}

	/* PR has priority over HV, so order below is important */
	if (mmcra & sipr)
		return PERF_RECORD_MISC_USER;
	if ((mmcra & sihv) && (freeze_events_kernel != MMCR0_FCHV))
		return PERF_RECORD_MISC_HYPERVISOR;
	return PERF_RECORD_MISC_KERNEL;
}

/*
 * Overload regs->dsisr to store MMCRA so we only need to read it once
 * on each interrupt.
 */
static inline void perf_read_regs(struct pt_regs *regs)
{
	regs->dsisr = mfspr(SPRN_MMCRA);
}

/*
 * If interrupts were soft-disabled when a PMU interrupt occurs, treat
 * it as an NMI.
 */
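/*
 * (Note added for clarity:) on 64-bit, regs->softe records the
 * soft-enable state at the time the exception was taken, so a zero
 * value here means the interrupted code had interrupts soft-disabled.
 */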
static inline int perf_intr_is_nmi(struct pt_regs *regs)
{
	return !regs->softe;
}

#endif /* CONFIG_PPC64 */

static void perf_event_interrupt(struct pt_regs *regs);

void perf_event_print_debug(void)
{
}

/*
 * Read one performance monitor counter (PMC).
 */
static unsigned long read_pmc(int idx)
{
	unsigned long val;

	switch (idx) {
	case 1:
		val = mfspr(SPRN_PMC1);
		break;
	case 2:
		val = mfspr(SPRN_PMC2);
		break;
	case 3:
		val = mfspr(SPRN_PMC3);
		break;
	case 4:
		val = mfspr(SPRN_PMC4);
		break;
	case 5:
		val = mfspr(SPRN_PMC5);
		break;
	case 6:
		val = mfspr(SPRN_PMC6);
		break;
#ifdef CONFIG_PPC64
	case 7:
		val = mfspr(SPRN_PMC7);
		break;
	case 8:
		val = mfspr(SPRN_PMC8);
		break;
#endif /* CONFIG_PPC64 */
	default:
		printk(KERN_ERR "oops trying to read PMC%d\n", idx);
		val = 0;
	}
	return val;
}

/*
 * Write one PMC.
 */
static void write_pmc(int idx, unsigned long val)
{
	switch (idx) {
	case 1:
		mtspr(SPRN_PMC1, val);
		break;
	case 2:
		mtspr(SPRN_PMC2, val);
		break;
	case 3:
		mtspr(SPRN_PMC3, val);
		break;
	case 4:
		mtspr(SPRN_PMC4, val);
		break;
	case 5:
		mtspr(SPRN_PMC5, val);
		break;
	case 6:
		mtspr(SPRN_PMC6, val);
		break;
#ifdef CONFIG_PPC64
	case 7:
		mtspr(SPRN_PMC7, val);
		break;
	case 8:
		mtspr(SPRN_PMC8, val);
		break;
#endif /* CONFIG_PPC64 */
	default:
		printk(KERN_ERR "oops trying to write PMC%d\n", idx);
	}
}

/*
 * Check if a set of events can all go on the PMU at once.
 * If they can't, this will look at alternative codes for the events
 * and see if any combination of alternative codes is feasible.
 * The feasible set is returned in event_id[].
 */
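/*
 * (Added note, inferred from the arithmetic below; treat it as a rough
 * sketch rather than a specification.)  Each event's constraint is a
 * (mask, value) pair of packed bit-fields supplied by the per-processor
 * get_constraint() routine.  Events are accumulated field by field:
 * ppmu->add_fields marks fields whose users are counted by addition,
 * and ppmu->test_adder biases those fields so that over-committing a
 * resource overflows the field and shows up in the masked comparison.
 * When the first-choice codes clash, the loop below backtracks over
 * each event's alternative codes until a compatible set is found.
 */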
static int power_check_constraints(struct cpu_hw_events *cpuhw,
				   u64 event_id[], unsigned int cflags[],
				   int n_ev)
{
	unsigned long mask, value, nv;
	unsigned long smasks[MAX_HWEVENTS], svalues[MAX_HWEVENTS];
	int n_alt[MAX_HWEVENTS], choice[MAX_HWEVENTS];
	int i, j;
	unsigned long addf = ppmu->add_fields;
	unsigned long tadd = ppmu->test_adder;

	if (n_ev > ppmu->n_counter)
		return -1;

	/* First see if the events will go on as-is */
	for (i = 0; i < n_ev; ++i) {
		if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
		    && !ppmu->limited_pmc_event(event_id[i])) {
			ppmu->get_alternatives(event_id[i], cflags[i],
					       cpuhw->alternatives[i]);
			event_id[i] = cpuhw->alternatives[i][0];
		}
		if (ppmu->get_constraint(event_id[i], &cpuhw->amasks[i][0],
					 &cpuhw->avalues[i][0]))
			return -1;
	}
	value = mask = 0;
	for (i = 0; i < n_ev; ++i) {
		nv = (value | cpuhw->avalues[i][0]) +
			(value & cpuhw->avalues[i][0] & addf);
		if ((((nv + tadd) ^ value) & mask) != 0 ||
		    (((nv + tadd) ^ cpuhw->avalues[i][0]) &
		     cpuhw->amasks[i][0]) != 0)
			break;
		value = nv;
		mask |= cpuhw->amasks[i][0];
	}
	if (i == n_ev)
		return 0;	/* all OK */

	/* doesn't work, gather alternatives... */
	if (!ppmu->get_alternatives)
		return -1;
	for (i = 0; i < n_ev; ++i) {
		choice[i] = 0;
		n_alt[i] = ppmu->get_alternatives(event_id[i], cflags[i],
						  cpuhw->alternatives[i]);
		for (j = 1; j < n_alt[i]; ++j)
			ppmu->get_constraint(cpuhw->alternatives[i][j],
					     &cpuhw->amasks[i][j],
					     &cpuhw->avalues[i][j]);
	}

	/* enumerate all possibilities and see if any will work */
	i = 0;
	j = -1;
	value = mask = nv = 0;
	while (i < n_ev) {
		if (j >= 0) {
			/* we're backtracking, restore context */
			value = svalues[i];
			mask = smasks[i];
			j = choice[i];
		}
		/*
		 * See if any alternative k for event_id i,
		 * where k > j, will satisfy the constraints.
		 */
		while (++j < n_alt[i]) {
			nv = (value | cpuhw->avalues[i][j]) +
				(value & cpuhw->avalues[i][j] & addf);
			if ((((nv + tadd) ^ value) & mask) == 0 &&
			    (((nv + tadd) ^ cpuhw->avalues[i][j])
			     & cpuhw->amasks[i][j]) == 0)
				break;
		}
		if (j >= n_alt[i]) {
			/*
			 * No feasible alternative, backtrack
			 * to event_id i-1 and continue enumerating its
			 * alternatives from where we got up to.
			 */
			if (--i < 0)
				return -1;
		} else {
			/*
			 * Found a feasible alternative for event_id i,
			 * remember where we got up to with this event_id,
			 * go on to the next event_id, and start with
			 * the first alternative for it.
			 */
			choice[i] = j;
			svalues[i] = value;
			smasks[i] = mask;
			value = nv;
			mask |= cpuhw->amasks[i][j];
			++i;
			j = -1;
		}
	}

	/* OK, we have a feasible combination, tell the caller the solution */
	for (i = 0; i < n_ev; ++i)
		event_id[i] = cpuhw->alternatives[i][choice[i]];
	return 0;
}

/*
 * Check if newly-added events have consistent settings for
 * exclude_{user,kernel,hv} with each other and any previously
 * added events.
 */
static int check_excludes(struct perf_event **ctrs, unsigned int cflags[],
			  int n_prev, int n_new)
{
	int eu = 0, ek = 0, eh = 0;
	int i, n, first;
	struct perf_event *event;

	n = n_prev + n_new;
	if (n <= 1)
		return 0;

	first = 1;
	for (i = 0; i < n; ++i) {
		if (cflags[i] & PPMU_LIMITED_PMC_OK) {
			cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
			continue;
		}
		event = ctrs[i];
		if (first) {
			eu = event->attr.exclude_user;
			ek = event->attr.exclude_kernel;
			eh = event->attr.exclude_hv;
			first = 0;
		} else if (event->attr.exclude_user != eu ||
			   event->attr.exclude_kernel != ek ||
			   event->attr.exclude_hv != eh) {
			return -EAGAIN;
		}
	}

	if (eu || ek || eh)
		for (i = 0; i < n; ++i)
			if (cflags[i] & PPMU_LIMITED_PMC_OK)
				cflags[i] |= PPMU_LIMITED_PMC_REQD;

	return 0;
}

static void power_pmu_read(struct perf_event *event)
{
	s64 val, delta, prev;

	if (!event->hw.idx)
		return;
	/*
	 * Performance monitor interrupts come even when interrupts
	 * are soft-disabled, as long as interrupts are hard-enabled.
	 * Therefore we treat them like NMIs.
	 */
	do {
		prev = local64_read(&event->hw.prev_count);
		barrier();
		val = read_pmc(event->hw.idx);
	} while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev);

	/* The counters are only 32 bits wide */
	delta = (val - prev) & 0xfffffffful;
	local64_add(delta, &event->count);
	local64_sub(delta, &event->hw.period_left);
}

/*
 * On some machines, PMC5 and PMC6 can't be written, don't respect
 * the freeze conditions, and don't generate interrupts.  This tells
 * us if `event' is using such a PMC.
 */
static int is_limited_pmc(int pmcnum)
{
	return (ppmu->flags & PPMU_LIMITED_PMC5_6)
		&& (pmcnum == 5 || pmcnum == 6);
}

static void freeze_limited_counters(struct cpu_hw_events *cpuhw,
				    unsigned long pmc5, unsigned long pmc6)
{
	struct perf_event *event;
	u64 val, prev, delta;
	int i;

	for (i = 0; i < cpuhw->n_limited; ++i) {
		event = cpuhw->limited_counter[i];
		if (!event->hw.idx)
			continue;
		val = (event->hw.idx == 5) ? pmc5 : pmc6;
		prev = local64_read(&event->hw.prev_count);
		event->hw.idx = 0;
		delta = (val - prev) & 0xfffffffful;
		local64_add(delta, &event->count);
	}
}

static void thaw_limited_counters(struct cpu_hw_events *cpuhw,
				  unsigned long pmc5, unsigned long pmc6)
{
	struct perf_event *event;
	u64 val;
	int i;

	for (i = 0; i < cpuhw->n_limited; ++i) {
		event = cpuhw->limited_counter[i];
		event->hw.idx = cpuhw->limited_hwidx[i];
		val = (event->hw.idx == 5) ? pmc5 : pmc6;
		local64_set(&event->hw.prev_count, val);
		perf_event_update_userpage(event);
	}
}

/*
 * Since limited events don't respect the freeze conditions, we
 * have to read them immediately after freezing or unfreezing the
 * other events.  We try to keep the values from the limited
 * events as consistent as possible by keeping the delay (in
 * cycles and instructions) between freezing/unfreezing and reading
 * the limited events as small and consistent as possible.
 * Therefore, if any limited events are in use, we read them
 * both, and always in the same order, to minimize variability,
 * and do it inside the same asm that writes MMCR0.
 */
static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
{
	unsigned long pmc5, pmc6;

	if (!cpuhw->n_limited) {
		mtspr(SPRN_MMCR0, mmcr0);
		return;
	}

	/*
	 * Write MMCR0, then read PMC5 and PMC6 immediately.
	 * To ensure we don't get a performance monitor interrupt
	 * between writing MMCR0 and freezing/thawing the limited
	 * events, we first write MMCR0 with the event overflow
	 * interrupt enable bits turned off.
	 */
	asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
		     : "=&r" (pmc5), "=&r" (pmc6)
		     : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
		       "i" (SPRN_MMCR0),
		       "i" (SPRN_PMC5), "i" (SPRN_PMC6));
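	/*
	 * (Clarifying note:) the single asm above writes the masked MMCR0
	 * value (%2) to SPR %3 and then reads PMC5 and PMC6 (%4/%5) back
	 * to back, so no interrupt or other code can slip in between the
	 * freeze/thaw and the reads.
	 */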

	if (mmcr0 & MMCR0_FC)
		freeze_limited_counters(cpuhw, pmc5, pmc6);
	else
		thaw_limited_counters(cpuhw, pmc5, pmc6);

	/*
	 * Write the full MMCR0 including the event overflow interrupt
	 * enable bits, if necessary.
	 */
	if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
		mtspr(SPRN_MMCR0, mmcr0);
}

/*
 * Disable all events to prevent PMU interrupts and to allow
 * events to be added or removed.
 */
void hw_perf_disable(void)
{
	struct cpu_hw_events *cpuhw;
	unsigned long flags;

	if (!ppmu)
		return;
	local_irq_save(flags);
	cpuhw = &__get_cpu_var(cpu_hw_events);

	if (!cpuhw->disabled) {
		cpuhw->disabled = 1;
		cpuhw->n_added = 0;

		/*
		 * Check if we ever enabled the PMU on this cpu.
		 */
		if (!cpuhw->pmcs_enabled) {
			ppc_enable_pmcs();
			cpuhw->pmcs_enabled = 1;
		}

		/*
		 * Disable instruction sampling if it was enabled
		 */
		if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
			mtspr(SPRN_MMCRA,
			      cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
			mb();
		}

		/*
		 * Set the 'freeze counters' bit.
		 * The barrier is to make sure the mtspr has been
		 * executed and the PMU has frozen the events
		 * before we return.
		 */
		write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
		mb();
	}
	local_irq_restore(flags);
}

/*
 * Re-enable all events.
 * If we were previously disabled and events were added, then
 * put the new config on the PMU.
 */
void hw_perf_enable(void)
{
	struct perf_event *event;
	struct cpu_hw_events *cpuhw;
	unsigned long flags;
	long i;
	unsigned long val;
	s64 left;
	unsigned int hwc_index[MAX_HWEVENTS];
	int n_lim;
	int idx;

	if (!ppmu)
		return;
	local_irq_save(flags);
	cpuhw = &__get_cpu_var(cpu_hw_events);
	if (!cpuhw->disabled) {
		local_irq_restore(flags);
		return;
	}
	cpuhw->disabled = 0;

	/*
	 * If we didn't change anything, or only removed events,
	 * no need to recalculate MMCR* settings and reset the PMCs.
	 * Just reenable the PMU with the current MMCR* settings
	 * (possibly updated for removal of events).
	 */
	if (!cpuhw->n_added) {
		mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
		if (cpuhw->n_events == 0)
			ppc_set_pmu_inuse(0);
		goto out_enable;
	}

	/*
	 * Compute MMCR* values for the new set of events
	 */
	if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_events, hwc_index,
			       cpuhw->mmcr)) {
		/* shouldn't ever get here */
		printk(KERN_ERR "oops compute_mmcr failed\n");
		goto out;
	}

	/*
	 * Add in MMCR0 freeze bits corresponding to the
	 * attr.exclude_* bits for the first event.
	 * We have already checked that all events have the
	 * same values for these bits as the first event.
	 */
	event = cpuhw->event[0];
	if (event->attr.exclude_user)
		cpuhw->mmcr[0] |= MMCR0_FCP;
	if (event->attr.exclude_kernel)
		cpuhw->mmcr[0] |= freeze_events_kernel;
	if (event->attr.exclude_hv)
		cpuhw->mmcr[0] |= MMCR0_FCHV;

	/*
	 * Write the new configuration to MMCR* with the freeze
	 * bit set and set the hardware events to their initial values.
	 * Then unfreeze the events.
	 */
	ppc_set_pmu_inuse(1);
	mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
				| MMCR0_FC);

	/*
	 * Read off any pre-existing events that need to move
	 * to another PMC.
	 */
	for (i = 0; i < cpuhw->n_events; ++i) {
		event = cpuhw->event[i];
		if (event->hw.idx && event->hw.idx != hwc_index[i] + 1) {
			power_pmu_read(event);
			write_pmc(event->hw.idx, 0);
			event->hw.idx = 0;
		}
	}

	/*
	 * Initialize the PMCs for all the new and moved events.
	 */
	cpuhw->n_limited = n_lim = 0;
	for (i = 0; i < cpuhw->n_events; ++i) {
		event = cpuhw->event[i];
		if (event->hw.idx)
			continue;
		idx = hwc_index[i] + 1;
		if (is_limited_pmc(idx)) {
			cpuhw->limited_counter[n_lim] = event;
			cpuhw->limited_hwidx[n_lim] = idx;
			++n_lim;
			continue;
		}
		val = 0;
		if (event->hw.sample_period) {
			left = local64_read(&event->hw.period_left);
			if (left < 0x80000000L)
				val = 0x80000000L - left;
		}
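		/*
		 * (Note added for clarity; consistent with the (int)val < 0
		 * overflow test in perf_event_interrupt().)  The PMCs raise
		 * an exception when bit 31 becomes set, i.e. when the 32-bit
		 * count goes "negative", so programming 0x80000000 - left
		 * makes the counter overflow after another 'left' events.
		 */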
		local64_set(&event->hw.prev_count, val);
		event->hw.idx = idx;
		write_pmc(idx, val);
		perf_event_update_userpage(event);
	}
	cpuhw->n_limited = n_lim;
	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;

 out_enable:
	mb();
	write_mmcr0(cpuhw, cpuhw->mmcr[0]);

	/*
	 * Enable instruction sampling if necessary
	 */
	if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
		mb();
		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
	}

 out:
	local_irq_restore(flags);
}

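/*
 * (Descriptive comment added.)  Gather the group leader and those of its
 * siblings that need a hardware counter into ctrs[], with their raw codes
 * in events[] and per-event flags in flags[].  Software events and
 * siblings in the OFF state are skipped.  Returns the number collected,
 * or -1 if they would exceed max_count.
 */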
static int collect_events(struct perf_event *group, int max_count,
			  struct perf_event *ctrs[], u64 *events,
			  unsigned int *flags)
{
	int n = 0;
	struct perf_event *event;

	if (!is_software_event(group)) {
		if (n >= max_count)
			return -1;
		ctrs[n] = group;
		flags[n] = group->hw.event_base;
		events[n++] = group->hw.config;
	}
	list_for_each_entry(event, &group->sibling_list, group_entry) {
		if (!is_software_event(event) &&
		    event->state != PERF_EVENT_STATE_OFF) {
			if (n >= max_count)
				return -1;
			ctrs[n] = event;
			flags[n] = event->hw.event_base;
			events[n++] = event->hw.config;
		}
	}
	return n;
}

/*
 * Add an event to the PMU.
 * If all events are not already frozen, then we disable and
 * re-enable the PMU in order to get hw_perf_enable to do the
 * actual work of reconfiguring the PMU.
 */
static int power_pmu_enable(struct perf_event *event)
{
	struct cpu_hw_events *cpuhw;
	unsigned long flags;
	int n0;
	int ret = -EAGAIN;

	local_irq_save(flags);
	perf_disable();

	/*
	 * Add the event to the list (if there is room)
	 * and check whether the total set is still feasible.
	 */
	cpuhw = &__get_cpu_var(cpu_hw_events);
	n0 = cpuhw->n_events;
	if (n0 >= ppmu->n_counter)
		goto out;
	cpuhw->event[n0] = event;
	cpuhw->events[n0] = event->hw.config;
	cpuhw->flags[n0] = event->hw.event_base;

	/*
	 * If group events scheduling transaction was started,
	 * skip the schedulability test here, it will be performed
	 * at commit time (->commit_txn) as a whole
	 */
	if (cpuhw->group_flag & PERF_EVENT_TXN)
		goto nocheck;

	if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1))
		goto out;
	if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1))
		goto out;
	event->hw.config = cpuhw->events[n0];

nocheck:
	++cpuhw->n_events;
	++cpuhw->n_added;

	ret = 0;
 out:
	perf_enable();
	local_irq_restore(flags);
	return ret;
}

/*
 * Remove an event from the PMU.
 */
static void power_pmu_disable(struct perf_event *event)
{
	struct cpu_hw_events *cpuhw;
	long i;
	unsigned long flags;

	local_irq_save(flags);
	perf_disable();

	power_pmu_read(event);

	cpuhw = &__get_cpu_var(cpu_hw_events);
	for (i = 0; i < cpuhw->n_events; ++i) {
		if (event == cpuhw->event[i]) {
			while (++i < cpuhw->n_events) {
				cpuhw->event[i-1] = cpuhw->event[i];
				cpuhw->events[i-1] = cpuhw->events[i];
				cpuhw->flags[i-1] = cpuhw->flags[i];
			}
			--cpuhw->n_events;
			ppmu->disable_pmc(event->hw.idx - 1, cpuhw->mmcr);
			if (event->hw.idx) {
				write_pmc(event->hw.idx, 0);
				event->hw.idx = 0;
			}
			perf_event_update_userpage(event);
			break;
		}
	}
	for (i = 0; i < cpuhw->n_limited; ++i)
		if (event == cpuhw->limited_counter[i])
			break;
	if (i < cpuhw->n_limited) {
		while (++i < cpuhw->n_limited) {
			cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
			cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
		}
		--cpuhw->n_limited;
	}
	if (cpuhw->n_events == 0) {
		/* disable exceptions if no events are running */
		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
	}

	perf_enable();
	local_irq_restore(flags);
}

/*
 * Re-enable interrupts on an event after they were throttled
 * because they were coming too fast.
 */
static void power_pmu_unthrottle(struct perf_event *event)
{
	s64 val, left;
	unsigned long flags;

	if (!event->hw.idx || !event->hw.sample_period)
		return;
	local_irq_save(flags);
	perf_disable();
	power_pmu_read(event);
	left = event->hw.sample_period;
	event->hw.last_period = left;
	val = 0;
	if (left < 0x80000000L)
		val = 0x80000000L - left;
	write_pmc(event->hw.idx, val);
	local64_set(&event->hw.prev_count, val);
	local64_set(&event->hw.period_left, left);
	perf_event_update_userpage(event);
	perf_enable();
	local_irq_restore(flags);
}

/*
 * Start group events scheduling transaction
 * Set the flag to make pmu::enable() not perform the
 * schedulability test, it will be performed at commit time
 */
void power_pmu_start_txn(const struct pmu *pmu)
{
	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);

	cpuhw->group_flag |= PERF_EVENT_TXN;
	cpuhw->n_txn_start = cpuhw->n_events;
}

/*
 * Stop group events scheduling transaction
 * Clear the flag and pmu::enable() will perform the
 * schedulability test.
 */
void power_pmu_cancel_txn(const struct pmu *pmu)
{
	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);

	cpuhw->group_flag &= ~PERF_EVENT_TXN;
}

/*
 * Commit group events scheduling transaction
 * Perform the group schedulability test as a whole
 * Return 0 if success
 */
int power_pmu_commit_txn(const struct pmu *pmu)
{
	struct cpu_hw_events *cpuhw;
	long i, n;

	if (!ppmu)
		return -EAGAIN;
	cpuhw = &__get_cpu_var(cpu_hw_events);
	n = cpuhw->n_events;
	if (check_excludes(cpuhw->event, cpuhw->flags, 0, n))
		return -EAGAIN;
	i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n);
	if (i < 0)
		return -EAGAIN;

	for (i = cpuhw->n_txn_start; i < n; ++i)
		cpuhw->event[i]->hw.config = cpuhw->events[i];

	cpuhw->group_flag &= ~PERF_EVENT_TXN;
	return 0;
}

struct pmu power_pmu = {
	.enable		= power_pmu_enable,
	.disable	= power_pmu_disable,
	.read		= power_pmu_read,
	.unthrottle	= power_pmu_unthrottle,
	.start_txn	= power_pmu_start_txn,
	.cancel_txn	= power_pmu_cancel_txn,
	.commit_txn	= power_pmu_commit_txn,
};

/*
 * Return 1 if we might be able to put the event on a limited PMC,
 * or 0 if not.
 * An event can only go on a limited PMC if it counts something
 * that a limited PMC can count, doesn't require interrupts, and
 * doesn't exclude any processor mode.
 */
static int can_go_on_limited_pmc(struct perf_event *event, u64 ev,
				 unsigned int flags)
{
	int n;
	u64 alt[MAX_EVENT_ALTERNATIVES];

	if (event->attr.exclude_user
	    || event->attr.exclude_kernel
	    || event->attr.exclude_hv
	    || event->attr.sample_period)
		return 0;

	if (ppmu->limited_pmc_event(ev))
		return 1;

	/*
	 * The requested event_id isn't on a limited PMC already;
	 * see if any alternative code goes on a limited PMC.
	 */
	if (!ppmu->get_alternatives)
		return 0;

	flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
	n = ppmu->get_alternatives(ev, flags, alt);

	return n > 0;
}

/*
 * Find an alternative event_id that goes on a normal PMC, if possible,
 * and return the event_id code, or 0 if there is no such alternative.
 * (Note: event_id code 0 is "don't count" on all machines.)
 */
static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
{
	u64 alt[MAX_EVENT_ALTERNATIVES];
	int n;

	flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
	n = ppmu->get_alternatives(ev, flags, alt);
	if (!n)
		return 0;
	return alt[0];
}

/* Number of perf_events counting hardware events */
static atomic_t num_events;
/* Used to avoid races in calling reserve/release_pmc_hardware */
static DEFINE_MUTEX(pmc_reserve_mutex);

/*
 * Release the PMU if this is the last perf_event.
 */
static void hw_perf_event_destroy(struct perf_event *event)
{
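	/*
	 * (Clarifying note:) the fast path decrements num_events unless
	 * this is the last event; the final decrement is done under
	 * pmc_reserve_mutex so that releasing the PMC hardware cannot race
	 * with a concurrent reserve_pmc_hardware() in hw_perf_event_init().
	 */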
	if (!atomic_add_unless(&num_events, -1, 1)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_dec_return(&num_events) == 0)
			release_pmc_hardware();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

/*
 * Translate a generic cache event_id config to a raw event_id code.
 */
static int hw_perf_cache_event(u64 config, u64 *eventp)
{
	unsigned long type, op, result;
	int ev;

	if (!ppmu->cache_events)
		return -EINVAL;

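	/*
	 * (Illustrative note added.)  The generic cache event config packs
	 * three fields, matching the unpacking below:
	 *
	 *	config = type | (op << 8) | (result << 16)
	 *
	 * e.g. PERF_COUNT_HW_CACHE_L1D with PERF_COUNT_HW_CACHE_OP_READ and
	 * PERF_COUNT_HW_CACHE_RESULT_MISS selects L1 data-cache read misses.
	 */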
	/* unpack config */
	type = config & 0xff;
	op = (config >> 8) & 0xff;
	result = (config >> 16) & 0xff;

	if (type >= PERF_COUNT_HW_CACHE_MAX ||
	    op >= PERF_COUNT_HW_CACHE_OP_MAX ||
	    result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	ev = (*ppmu->cache_events)[type][op][result];
	if (ev == 0)
		return -EOPNOTSUPP;
	if (ev == -1)
		return -EINVAL;
	*eventp = ev;
	return 0;
}

const struct pmu *hw_perf_event_init(struct perf_event *event)
{
	u64 ev;
	unsigned long flags;
	struct perf_event *ctrs[MAX_HWEVENTS];
	u64 events[MAX_HWEVENTS];
	unsigned int cflags[MAX_HWEVENTS];
	int n;
	int err;
	struct cpu_hw_events *cpuhw;

	if (!ppmu)
		return ERR_PTR(-ENXIO);
	switch (event->attr.type) {
	case PERF_TYPE_HARDWARE:
		ev = event->attr.config;
		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
			return ERR_PTR(-EOPNOTSUPP);
		ev = ppmu->generic_events[ev];
		break;
	case PERF_TYPE_HW_CACHE:
		err = hw_perf_cache_event(event->attr.config, &ev);
		if (err)
			return ERR_PTR(err);
		break;
	case PERF_TYPE_RAW:
		ev = event->attr.config;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	event->hw.config_base = ev;
	event->hw.idx = 0;

	/*
	 * If we are not running on a hypervisor, force the
	 * exclude_hv bit to 0 so that we don't care what
	 * the user set it to.
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR))
		event->attr.exclude_hv = 0;

	flags = 0;
	if (event->ctx->task)
		flags |= PPMU_ONLY_COUNT_RUN;

	/*
	 * If this machine has limited events, check whether this
	 * event_id could go on a limited PMC.
	 */
	if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
		if (can_go_on_limited_pmc(event, ev, flags)) {
			flags |= PPMU_LIMITED_PMC_OK;
		} else if (ppmu->limited_pmc_event(ev)) {
			/*
			 * The requested event_id is on a limited PMC,
			 * but we can't use a limited PMC; see if any
			 * alternative goes on a normal PMC.
			 */
			ev = normal_pmc_alternative(ev, flags);
			if (!ev)
				return ERR_PTR(-EINVAL);
		}
	}

	/*
	 * If this is in a group, check if it can go on with all the
	 * other hardware events in the group.  We assume the event
	 * hasn't been linked into its leader's sibling list at this point.
	 */
	n = 0;
	if (event->group_leader != event) {
		n = collect_events(event->group_leader, ppmu->n_counter - 1,
				   ctrs, events, cflags);
		if (n < 0)
			return ERR_PTR(-EINVAL);
	}
	events[n] = ev;
	ctrs[n] = event;
	cflags[n] = flags;
	if (check_excludes(ctrs, cflags, n, 1))
		return ERR_PTR(-EINVAL);

	cpuhw = &get_cpu_var(cpu_hw_events);
	err = power_check_constraints(cpuhw, events, cflags, n + 1);
	put_cpu_var(cpu_hw_events);
	if (err)
		return ERR_PTR(-EINVAL);

	event->hw.config = events[n];
	event->hw.event_base = cflags[n];
	event->hw.last_period = event->hw.sample_period;
	local64_set(&event->hw.period_left, event->hw.last_period);

	/*
	 * See if we need to reserve the PMU.
	 * If no events are currently in use, then we have to take a
	 * mutex to ensure that we don't race with another task doing
	 * reserve_pmc_hardware or release_pmc_hardware.
	 */
	err = 0;
	if (!atomic_inc_not_zero(&num_events)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&num_events) == 0 &&
		    reserve_pmc_hardware(perf_event_interrupt))
			err = -EBUSY;
		else
			atomic_inc(&num_events);
		mutex_unlock(&pmc_reserve_mutex);
	}
	event->destroy = hw_perf_event_destroy;

	if (err)
		return ERR_PTR(err);
	return &power_pmu;
}

/*
 * A counter has overflowed; update its count and record
 * things if requested.  Note that interrupts are hard-disabled
 * here so there is no possibility of being interrupted.
 */
static void record_and_restart(struct perf_event *event, unsigned long val,
			       struct pt_regs *regs, int nmi)
{
	u64 period = event->hw.sample_period;
	s64 prev, delta, left;
	int record = 0;

	/* we don't have to worry about interrupts here */
	prev = local64_read(&event->hw.prev_count);
	delta = (val - prev) & 0xfffffffful;
	local64_add(delta, &event->count);

	/*
	 * See if the total period for this event has expired,
	 * and update for the next period.
	 */
	val = 0;
	left = local64_read(&event->hw.period_left) - delta;
	if (period) {
		if (left <= 0) {
			left += period;
			if (left <= 0)
				left = period;
			record = 1;
		}
		if (left < 0x80000000LL)
			val = 0x80000000LL - left;
	}

	/*
	 * Finally record data if requested.
	 */
	if (record) {
		struct perf_sample_data data;

		perf_sample_data_init(&data, ~0ULL);
		data.period = event->hw.last_period;

		if (event->attr.sample_type & PERF_SAMPLE_ADDR)
			perf_get_data_addr(regs, &data.addr);

		if (perf_event_overflow(event, nmi, &data, regs)) {
			/*
			 * Interrupts are coming too fast - throttle them
			 * by setting the event to 0, so it will be
			 * at least 2^30 cycles until the next interrupt
			 * (assuming each event counts at most 2 counts
			 * per cycle).
			 */
			val = 0;
			left = ~0ULL >> 1;
		}
	}

	write_pmc(event->hw.idx, val);
	local64_set(&event->hw.prev_count, val);
	local64_set(&event->hw.period_left, left);
	perf_event_update_userpage(event);
}

/*
 * Called from generic code to get the misc flags (i.e. processor mode)
 * for an event_id.
 */
unsigned long perf_misc_flags(struct pt_regs *regs)
{
	u32 flags = perf_get_misc_flags(regs);

	if (flags)
		return flags;
	return user_mode(regs) ? PERF_RECORD_MISC_USER :
		PERF_RECORD_MISC_KERNEL;
}

/*
 * Called from generic code to get the instruction pointer
 * for an event_id.
 */
unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
	unsigned long ip;

	if (TRAP(regs) != 0xf00)
		return regs->nip;	/* not a PMU interrupt */

	ip = mfspr(SPRN_SIAR) + perf_ip_adjust(regs);
	return ip;
}

/*
 * Performance monitor interrupt stuff
 */
static void perf_event_interrupt(struct pt_regs *regs)
{
	int i;
	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
	struct perf_event *event;
	unsigned long val;
	int found = 0;
	int nmi;

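	/*
	 * (Clarifying note:) limited PMCs do not stop when MMCR0[FC]
	 * freezes the other counters, so capture PMC5 and PMC6 as early as
	 * possible in the interrupt to keep their values consistent (see
	 * the comment above write_mmcr0()).
	 */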
	if (cpuhw->n_limited)
		freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
					mfspr(SPRN_PMC6));

	perf_read_regs(regs);

	nmi = perf_intr_is_nmi(regs);
	if (nmi)
		nmi_enter();
	else
		irq_enter();

	for (i = 0; i < cpuhw->n_events; ++i) {
		event = cpuhw->event[i];
		if (!event->hw.idx || is_limited_pmc(event->hw.idx))
			continue;
		val = read_pmc(event->hw.idx);
		if ((int)val < 0) {
			/* event has overflowed */
			found = 1;
			record_and_restart(event, val, regs, nmi);
		}
	}

	/*
	 * In case we didn't find and reset the event that caused
	 * the interrupt, scan all events and reset any that are
	 * negative, to avoid getting continual interrupts.
	 * Any that we processed in the previous loop will not be negative.
	 */
	if (!found) {
		for (i = 0; i < ppmu->n_counter; ++i) {
			if (is_limited_pmc(i + 1))
				continue;
			val = read_pmc(i + 1);
			if ((int)val < 0)
				write_pmc(i + 1, 0);
		}
	}

	write_mmcr0(cpuhw, cpuhw->mmcr[0]);

	if (nmi)
		nmi_exit();
	else
		irq_exit();
}

static void power_pmu_setup(int cpu)
{
	struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);

	if (!ppmu)
		return;
	memset(cpuhw, 0, sizeof(*cpuhw));
	cpuhw->mmcr[0] = MMCR0_FC;
}

static int __cpuinit
power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		power_pmu_setup(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

int register_power_pmu(struct power_pmu *pmu)
{
	if (ppmu)
		return -EBUSY;		/* something's already registered */

	ppmu = pmu;
	pr_info("%s performance monitor hardware support registered\n",
		pmu->name);

#ifdef MSR_HV
	/*
	 * Use FCHV to ignore kernel events if MSR.HV is set.
	 */
	if (mfmsr() & MSR_HV)
		freeze_events_kernel = MMCR0_FCHV;
#endif /* MSR_HV */

	perf_cpu_notifier(power_pmu_notifier);

	return 0;
}