1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Performance events core code:
4 *
5 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
6 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
7 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
8 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
9 */
10
11#include <linux/fs.h>
12#include <linux/mm.h>
13#include <linux/cpu.h>
14#include <linux/smp.h>
15#include <linux/idr.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/slab.h>
19#include <linux/hash.h>
20#include <linux/tick.h>
21#include <linux/sysfs.h>
22#include <linux/dcache.h>
23#include <linux/percpu.h>
24#include <linux/ptrace.h>
25#include <linux/reboot.h>
26#include <linux/vmstat.h>
27#include <linux/device.h>
28#include <linux/export.h>
29#include <linux/vmalloc.h>
30#include <linux/hardirq.h>
31#include <linux/hugetlb.h>
32#include <linux/rculist.h>
33#include <linux/uaccess.h>
34#include <linux/syscalls.h>
35#include <linux/anon_inodes.h>
36#include <linux/kernel_stat.h>
37#include <linux/cgroup.h>
38#include <linux/perf_event.h>
39#include <linux/trace_events.h>
40#include <linux/hw_breakpoint.h>
41#include <linux/mm_types.h>
42#include <linux/module.h>
43#include <linux/mman.h>
44#include <linux/compat.h>
45#include <linux/bpf.h>
46#include <linux/filter.h>
47#include <linux/namei.h>
48#include <linux/parser.h>
49#include <linux/sched/clock.h>
50#include <linux/sched/mm.h>
51#include <linux/proc_ns.h>
52#include <linux/mount.h>
53#include <linux/min_heap.h>
54#include <linux/highmem.h>
55#include <linux/pgtable.h>
56#include <linux/buildid.h>
57#include <linux/task_work.h>
58
59#include "internal.h"
60
61#include <asm/irq_regs.h>
62
63typedef int (*remote_function_f)(void *);
64
65struct remote_function_call {
66	struct task_struct	*p;
67	remote_function_f	func;
68	void			*info;
69	int			ret;
70};
71
72static void remote_function(void *data)
73{
74	struct remote_function_call *tfc = data;
75	struct task_struct *p = tfc->p;
76
77	if (p) {
78		/* -EAGAIN */
79		if (task_cpu(p) != smp_processor_id())
80			return;
81
82		/*
83		 * Now that we're on right CPU with IRQs disabled, we can test
84		 * if we hit the right task without races.
85		 */
86
87		tfc->ret = -ESRCH; /* No such (running) process */
88		if (p != current)
89			return;
90	}
91
92	tfc->ret = tfc->func(tfc->info);
93}
94
95/**
96 * task_function_call - call a function on the cpu on which a task runs
97 * @p:		the task to evaluate
98 * @func:	the function to be called
99 * @info:	the function call argument
100 *
101 * Calls the function @func when the task is currently running. This might
102 * be on the current CPU, which just calls the function directly.  This will
103 * retry due to any failures in smp_call_function_single(), such as if the
104 * task_cpu() goes offline concurrently.
105 *
106 * returns @func return value or -ESRCH or -ENXIO when the process isn't running
107 */
108static int
109task_function_call(struct task_struct *p, remote_function_f func, void *info)
110{
111	struct remote_function_call data = {
112		.p	= p,
113		.func	= func,
114		.info	= info,
115		.ret	= -EAGAIN,
116	};
117	int ret;
118
119	for (;;) {
120		ret = smp_call_function_single(task_cpu(p), remote_function,
121					       &data, 1);
122		if (!ret)
123			ret = data.ret;
124
125		if (ret != -EAGAIN)
126			break;
127
128		cond_resched();
129	}
130
131	return ret;
132}
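
/*
 * Illustrative sketch (not part of the kernel source): callers pair
 * task_function_call() with a remote_function_f callback. By the time
 * the callback runs, remote_function() has already verified, with IRQs
 * disabled on task_cpu(p), that @p is current on that CPU. Callback and
 * payload names below are hypothetical.
 *
 *	static int __poke_task(void *info)
 *	{
 *		struct my_payload *mp = info;	// hypothetical type
 *
 *		// Runs on the task's CPU with IRQs disabled.
 *		return mp->ok ? 0 : -EINVAL;
 *	}
 *
 *	err = task_function_call(p, __poke_task, &payload);
 *	// err is __poke_task()'s return value, or -ESRCH if @p stopped
 *	// running before the IPI could reach it.
 */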
133
134/**
135 * cpu_function_call - call a function on the cpu
136 * @cpu:	target cpu to queue this function
137 * @func:	the function to be called
138 * @info:	the function call argument
139 *
140 * Calls the function @func on the remote cpu.
141 *
142 * returns: @func return value or -ENXIO when the cpu is offline
143 */
144static int cpu_function_call(int cpu, remote_function_f func, void *info)
145{
146	struct remote_function_call data = {
147		.p	= NULL,
148		.func	= func,
149		.info	= info,
150		.ret	= -ENXIO, /* No such CPU */
151	};
152
153	smp_call_function_single(cpu, remote_function, &data, 1);
154
155	return data.ret;
156}
157
158static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
159			  struct perf_event_context *ctx)
160{
161	raw_spin_lock(&cpuctx->ctx.lock);
162	if (ctx)
163		raw_spin_lock(&ctx->lock);
164}
165
166static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
167			    struct perf_event_context *ctx)
168{
169	if (ctx)
170		raw_spin_unlock(&ctx->lock);
171	raw_spin_unlock(&cpuctx->ctx.lock);
172}
173
174#define TASK_TOMBSTONE ((void *)-1L)
175
176static bool is_kernel_event(struct perf_event *event)
177{
178	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
179}
180
181static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
182
183struct perf_event_context *perf_cpu_task_ctx(void)
184{
185	lockdep_assert_irqs_disabled();
186	return this_cpu_ptr(&perf_cpu_context)->task_ctx;
187}
188
189/*
190 * On task ctx scheduling...
191 *
192 * When !ctx->nr_events a task context will not be scheduled. This means
193 * we can disable the scheduler hooks (for performance) without leaving
194 * pending task ctx state.
195 *
196 * This however results in two special cases:
197 *
198 *  - removing the last event from a task ctx; this is relatively
199 *    straightforward and is done in __perf_remove_from_context.
200 *
201 *  - adding the first event to a task ctx; this is tricky because we cannot
202 *    rely on ctx->is_active and therefore cannot use event_function_call().
203 *    See perf_install_in_context().
204 *
205 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
206 */
207
208typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
209			struct perf_event_context *, void *);
210
211struct event_function_struct {
212	struct perf_event *event;
213	event_f func;
214	void *data;
215};
216
217static int event_function(void *info)
218{
219	struct event_function_struct *efs = info;
220	struct perf_event *event = efs->event;
221	struct perf_event_context *ctx = event->ctx;
222	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
223	struct perf_event_context *task_ctx = cpuctx->task_ctx;
224	int ret = 0;
225
226	lockdep_assert_irqs_disabled();
227
228	perf_ctx_lock(cpuctx, task_ctx);
229	/*
230	 * Since we do the IPI call without holding ctx->lock things can have
231	 * changed, double check we hit the task we set out to hit.
232	 */
233	if (ctx->task) {
234		if (ctx->task != current) {
235			ret = -ESRCH;
236			goto unlock;
237		}
238
239		/*
240		 * We only use event_function_call() on established contexts,
241		 * and event_function() is only ever called when active (or
242		 * rather, we'll have bailed in task_function_call() or the
243		 * above ctx->task != current test), therefore we must have
244		 * ctx->is_active here.
245		 */
246		WARN_ON_ONCE(!ctx->is_active);
247		/*
248		 * And since we have ctx->is_active, cpuctx->task_ctx must
249		 * match.
250		 */
251		WARN_ON_ONCE(task_ctx != ctx);
252	} else {
253		WARN_ON_ONCE(&cpuctx->ctx != ctx);
254	}
255
256	efs->func(event, cpuctx, ctx, efs->data);
257unlock:
258	perf_ctx_unlock(cpuctx, task_ctx);
259
260	return ret;
261}
262
263static void event_function_call(struct perf_event *event, event_f func, void *data)
264{
265	struct perf_event_context *ctx = event->ctx;
266	struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
267	struct event_function_struct efs = {
268		.event = event,
269		.func = func,
270		.data = data,
271	};
272
273	if (!event->parent) {
274		/*
275		 * If this is a !child event, we must hold ctx::mutex to
276		 * stabilize the event->ctx relation. See
277		 * perf_event_ctx_lock().
278		 */
279		lockdep_assert_held(&ctx->mutex);
280	}
281
282	if (!task) {
283		cpu_function_call(event->cpu, event_function, &efs);
284		return;
285	}
286
287	if (task == TASK_TOMBSTONE)
288		return;
289
290again:
291	if (!task_function_call(task, event_function, &efs))
292		return;
293
294	raw_spin_lock_irq(&ctx->lock);
295	/*
296	 * Reload the task pointer, it might have been changed by
297	 * a concurrent perf_event_context_sched_out().
298	 */
299	task = ctx->task;
300	if (task == TASK_TOMBSTONE) {
301		raw_spin_unlock_irq(&ctx->lock);
302		return;
303	}
304	if (ctx->is_active) {
305		raw_spin_unlock_irq(&ctx->lock);
306		goto again;
307	}
308	func(event, NULL, ctx, data);
309	raw_spin_unlock_irq(&ctx->lock);
310}
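
/*
 * Illustrative sketch (not part of the kernel source) of the event_f
 * contract used with event_function_call(): @func must cope with both
 * invocation paths, either via IPI from event_function() with IRQs
 * disabled and the relevant ctx locks held, or directly under ctx->lock
 * with a NULL @cpuctx when the task context is inactive. Names below
 * are hypothetical.
 *
 *	static void __perf_event_frob(struct perf_event *event,
 *				      struct perf_cpu_context *cpuctx,
 *				      struct perf_event_context *ctx,
 *				      void *data)
 *	{
 *		// @cpuctx may be NULL on the inactive-context path;
 *		// ctx->lock is held on both paths.
 *	}
 *
 *	event_function_call(event, __perf_event_frob, NULL);
 */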
311
312/*
313 * Similar to event_function_call() + event_function(), but hard assumes IRQs
314 * are already disabled and we're on the right CPU.
315 */
316static void event_function_local(struct perf_event *event, event_f func, void *data)
317{
318	struct perf_event_context *ctx = event->ctx;
319	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
320	struct task_struct *task = READ_ONCE(ctx->task);
321	struct perf_event_context *task_ctx = NULL;
322
323	lockdep_assert_irqs_disabled();
324
325	if (task) {
326		if (task == TASK_TOMBSTONE)
327			return;
328
329		task_ctx = ctx;
330	}
331
332	perf_ctx_lock(cpuctx, task_ctx);
333
334	task = ctx->task;
335	if (task == TASK_TOMBSTONE)
336		goto unlock;
337
338	if (task) {
339		/*
340		 * We must be either inactive or active and the right task,
341		 * otherwise we're screwed, since we cannot IPI to somewhere
342		 * else.
343		 */
344		if (ctx->is_active) {
345			if (WARN_ON_ONCE(task != current))
346				goto unlock;
347
348			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
349				goto unlock;
350		}
351	} else {
352		WARN_ON_ONCE(&cpuctx->ctx != ctx);
353	}
354
355	func(event, cpuctx, ctx, data);
356unlock:
357	perf_ctx_unlock(cpuctx, task_ctx);
358}
359
360#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
361		       PERF_FLAG_FD_OUTPUT  |\
362		       PERF_FLAG_PID_CGROUP |\
363		       PERF_FLAG_FD_CLOEXEC)
364
365/*
366 * branch priv levels that need permission checks
367 */
368#define PERF_SAMPLE_BRANCH_PERM_PLM \
369	(PERF_SAMPLE_BRANCH_KERNEL |\
370	 PERF_SAMPLE_BRANCH_HV)
371
372enum event_type_t {
373	EVENT_FLEXIBLE = 0x1,
374	EVENT_PINNED = 0x2,
375	EVENT_TIME = 0x4,
376	/* see ctx_resched() for details */
377	EVENT_CPU = 0x8,
378	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
379};
380
381/*
382 * perf_sched_events : >0 events exist
383 */
384
385static void perf_sched_delayed(struct work_struct *work);
386DEFINE_STATIC_KEY_FALSE(perf_sched_events);
387static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
388static DEFINE_MUTEX(perf_sched_mutex);
389static atomic_t perf_sched_count;
390
391static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
392
393static atomic_t nr_mmap_events __read_mostly;
394static atomic_t nr_comm_events __read_mostly;
395static atomic_t nr_namespaces_events __read_mostly;
396static atomic_t nr_task_events __read_mostly;
397static atomic_t nr_freq_events __read_mostly;
398static atomic_t nr_switch_events __read_mostly;
399static atomic_t nr_ksymbol_events __read_mostly;
400static atomic_t nr_bpf_events __read_mostly;
401static atomic_t nr_cgroup_events __read_mostly;
402static atomic_t nr_text_poke_events __read_mostly;
403static atomic_t nr_build_id_events __read_mostly;
404
405static LIST_HEAD(pmus);
406static DEFINE_MUTEX(pmus_lock);
407static struct srcu_struct pmus_srcu;
408static cpumask_var_t perf_online_mask;
409static struct kmem_cache *perf_event_cache;
410
411/*
412 * perf event paranoia level:
413 *  -1 - not paranoid at all
414 *   0 - disallow raw tracepoint access for unpriv
415 *   1 - disallow cpu events for unpriv
416 *   2 - disallow kernel profiling for unpriv
417 */
418int sysctl_perf_event_paranoid __read_mostly = 2;
419
420/* Minimum for 512 kiB + 1 user control page */
421int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
422
423/*
424 * max perf event sample rate
425 */
426#define DEFAULT_MAX_SAMPLE_RATE		100000
427#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
428#define DEFAULT_CPU_TIME_MAX_PERCENT	25
429
430int sysctl_perf_event_sample_rate __read_mostly	= DEFAULT_MAX_SAMPLE_RATE;
431
432static int max_samples_per_tick __read_mostly	= DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
433static int perf_sample_period_ns __read_mostly	= DEFAULT_SAMPLE_PERIOD_NS;
434
435static int perf_sample_allowed_ns __read_mostly =
436	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
437
438static void update_perf_cpu_limits(void)
439{
440	u64 tmp = perf_sample_period_ns;
441
442	tmp *= sysctl_perf_cpu_time_max_percent;
443	tmp = div_u64(tmp, 100);
444	if (!tmp)
445		tmp = 1;
446
447	WRITE_ONCE(perf_sample_allowed_ns, tmp);
448}
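
/*
 * Worked example with the defaults above: perf_sample_period_ns is
 * 10,000ns (100,000 samples/sec) and sysctl_perf_cpu_time_max_percent
 * is 25, so
 *
 *	perf_sample_allowed_ns = 10,000 * 25 / 100 = 2,500ns
 *
 * i.e. a sample may spend at most a quarter of the time between two
 * samples in the handler before perf_sample_event_took() throttles.
 */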
449
450static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
451
452int perf_proc_update_handler(struct ctl_table *table, int write,
453		void *buffer, size_t *lenp, loff_t *ppos)
454{
455	int ret;
456	int perf_cpu = sysctl_perf_cpu_time_max_percent;
457	/*
458	 * If throttling is disabled don't allow the write:
459	 */
460	if (write && (perf_cpu == 100 || perf_cpu == 0))
461		return -EINVAL;
462
463	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
464	if (ret || !write)
465		return ret;
466
467	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
468	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
469	update_perf_cpu_limits();
470
471	return 0;
472}
473
474int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
475
476int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
477		void *buffer, size_t *lenp, loff_t *ppos)
478{
479	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
480
481	if (ret || !write)
482		return ret;
483
484	if (sysctl_perf_cpu_time_max_percent == 100 ||
485	    sysctl_perf_cpu_time_max_percent == 0) {
486		printk(KERN_WARNING
487		       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
488		WRITE_ONCE(perf_sample_allowed_ns, 0);
489	} else {
490		update_perf_cpu_limits();
491	}
492
493	return 0;
494}
495
496/*
497 * perf samples are done in some very critical code paths (NMIs).
498 * If they take too much CPU time, the system can lock up and not
499 * get any real work done.  This will drop the sample rate when
500 * we detect that events are taking too long.
501 */
502#define NR_ACCUMULATED_SAMPLES 128
503static DEFINE_PER_CPU(u64, running_sample_length);
504
505static u64 __report_avg;
506static u64 __report_allowed;
507
508static void perf_duration_warn(struct irq_work *w)
509{
510	printk_ratelimited(KERN_INFO
511		"perf: interrupt took too long (%lld > %lld), lowering "
512		"kernel.perf_event_max_sample_rate to %d\n",
513		__report_avg, __report_allowed,
514		sysctl_perf_event_sample_rate);
515}
516
517static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
518
519void perf_sample_event_took(u64 sample_len_ns)
520{
521	u64 max_len = READ_ONCE(perf_sample_allowed_ns);
522	u64 running_len;
523	u64 avg_len;
524	u32 max;
525
526	if (max_len == 0)
527		return;
528
529	/* Decay the counter by 1 average sample. */
530	running_len = __this_cpu_read(running_sample_length);
531	running_len -= running_len/NR_ACCUMULATED_SAMPLES;
532	running_len += sample_len_ns;
533	__this_cpu_write(running_sample_length, running_len);
534
535	/*
536	 * Note: this will be biased artificially low until we have
537	 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
538	 * from having to maintain a count.
539	 */
540	avg_len = running_len/NR_ACCUMULATED_SAMPLES;
541	if (avg_len <= max_len)
542		return;
543
544	__report_avg = avg_len;
545	__report_allowed = max_len;
546
547	/*
548	 * Pad the average sample duration by 25% and derive the new limit from that.
549	 */
550	avg_len += avg_len / 4;
551	max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
552	if (avg_len < max)
553		max /= (u32)avg_len;
554	else
555		max = 1;
556
557	WRITE_ONCE(perf_sample_allowed_ns, avg_len);
558	WRITE_ONCE(max_samples_per_tick, max);
559
560	sysctl_perf_event_sample_rate = max * HZ;
561	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
562
563	if (!irq_work_queue(&perf_duration_work)) {
564		early_printk("perf: interrupt took too long (%lld > %lld), lowering "
565			     "kernel.perf_event_max_sample_rate to %d\n",
566			     __report_avg, __report_allowed,
567			     sysctl_perf_event_sample_rate);
568	}
569}
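
/*
 * Worked example of the throttle math above, assuming
 * sysctl_perf_cpu_time_max_percent = 25 and HZ = 1000 (TICK_NSEC =
 * 1,000,000ns): if the decayed average reaches avg_len = 5,000ns while
 * max_len = 2,500ns, then
 *
 *	avg_len += avg_len / 4;			// 6,250ns budget per sample
 *	max = (1,000,000 / 100) * 25;		// 250,000ns CPU budget per tick
 *	max /= 6,250;				// 40 samples per tick
 *
 * so perf_sample_allowed_ns becomes 6,250ns, max_samples_per_tick drops
 * to 40 and kernel.perf_event_max_sample_rate to 40 * HZ = 40,000.
 */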
570
571static atomic64_t perf_event_id;
572
573static void update_context_time(struct perf_event_context *ctx);
574static u64 perf_event_time(struct perf_event *event);
575
576void __weak perf_event_print_debug(void)	{ }
577
578static inline u64 perf_clock(void)
579{
580	return local_clock();
581}
582
583static inline u64 perf_event_clock(struct perf_event *event)
584{
585	return event->clock();
586}
587
588/*
589 * State based event timekeeping...
590 *
591 * The basic idea is to use event->state to determine which (if any) time
592 * fields to increment with the current delta. This means we only need to
593 * update timestamps when we change state or when they are explicitly requested
594 * (read).
595 *
596 * Event groups make things a little more complicated, but not terribly so. The
597 * rules for a group are that if the group leader is OFF the entire group is
598 * OFF, irrespective of what the group member states are. This results in
599 * __perf_effective_state().
600 *
601 * A further ramification is that when a group leader flips between OFF and
602 * !OFF, we need to update all group member times.
603 *
604 *
605 * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
606 * need to make sure the relevant context time is updated before we try and
607 * update our timestamps.
608 */
609
610static __always_inline enum perf_event_state
611__perf_effective_state(struct perf_event *event)
612{
613	struct perf_event *leader = event->group_leader;
614
615	if (leader->state <= PERF_EVENT_STATE_OFF)
616		return leader->state;
617
618	return event->state;
619}
620
621static __always_inline void
622__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
623{
624	enum perf_event_state state = __perf_effective_state(event);
625	u64 delta = now - event->tstamp;
626
627	*enabled = event->total_time_enabled;
628	if (state >= PERF_EVENT_STATE_INACTIVE)
629		*enabled += delta;
630
631	*running = event->total_time_running;
632	if (state >= PERF_EVENT_STATE_ACTIVE)
633		*running += delta;
634}
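
/*
 * Illustrative use, mirroring perf_event_update_time() below: a reader
 * that wants current totals without mutating the event can do
 *
 *	u64 enabled, running;
 *
 *	__perf_update_times(event, perf_event_time(event),
 *			    &enabled, &running);
 *
 * For an event that accumulated 2ms of enabled time while INACTIVE and
 * whose current delta is 3ms of ACTIVE time, this yields
 * enabled = 5ms and running = 3ms.
 */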
635
636static void perf_event_update_time(struct perf_event *event)
637{
638	u64 now = perf_event_time(event);
639
640	__perf_update_times(event, now, &event->total_time_enabled,
641					&event->total_time_running);
642	event->tstamp = now;
643}
644
645static void perf_event_update_sibling_time(struct perf_event *leader)
646{
647	struct perf_event *sibling;
648
649	for_each_sibling_event(sibling, leader)
650		perf_event_update_time(sibling);
651}
652
653static void
654perf_event_set_state(struct perf_event *event, enum perf_event_state state)
655{
656	if (event->state == state)
657		return;
658
659	perf_event_update_time(event);
660	/*
661	 * If a group leader gets enabled/disabled all its siblings
662	 * are affected too.
663	 */
664	if ((event->state < 0) ^ (state < 0))
665		perf_event_update_sibling_time(event);
666
667	WRITE_ONCE(event->state, state);
668}
669
670/*
671 * UP store-release, load-acquire
672 */
673
674#define __store_release(ptr, val)					\
675do {									\
676	barrier();							\
677	WRITE_ONCE(*(ptr), (val));					\
678} while (0)
679
680#define __load_acquire(ptr)						\
681({									\
682	__unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr));	\
683	barrier();							\
684	___p;								\
685})
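
/*
 * These are paired within this file, e.g. perf_cgroup_set_timestamp()
 * publishes with
 *
 *	WRITE_ONCE(info->timeoffset, ...);	// via __update_cgrp_time()
 *	__store_release(&info->active, 1);
 *
 * while perf_cgroup_event_time_now() consumes with
 *
 *	if (!__load_acquire(&t->active))
 *		return t->time;
 *	now += READ_ONCE(t->timeoffset);
 *
 * Only a compiler barrier() is involved because producer and consumer
 * run on the same CPU (the consumer possibly in NMI context), hence the
 * "UP" in the name; no SMP ordering is implied.
 */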
686
687static void perf_ctx_disable(struct perf_event_context *ctx)
688{
689	struct perf_event_pmu_context *pmu_ctx;
690
691	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
692		perf_pmu_disable(pmu_ctx->pmu);
693}
694
695static void perf_ctx_enable(struct perf_event_context *ctx)
696{
697	struct perf_event_pmu_context *pmu_ctx;
698
699	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
700		perf_pmu_enable(pmu_ctx->pmu);
701}
702
703static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
704static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
705
706#ifdef CONFIG_CGROUP_PERF
707
708static inline bool
709perf_cgroup_match(struct perf_event *event)
710{
711	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
712
713	/* @event doesn't care about cgroup */
714	if (!event->cgrp)
715		return true;
716
717	/* wants specific cgroup scope but @cpuctx isn't associated with any */
718	if (!cpuctx->cgrp)
719		return false;
720
721	/*
722	 * Cgroup scoping is recursive.  An event enabled for a cgroup is
723	 * also enabled for all its descendant cgroups.  If @cpuctx's
724	 * cgroup is a descendant of @event's (the test covers identity
725	 * case), it's a match.
726	 */
727	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
728				    event->cgrp->css.cgroup);
729}
730
731static inline void perf_detach_cgroup(struct perf_event *event)
732{
733	css_put(&event->cgrp->css);
734	event->cgrp = NULL;
735}
736
737static inline int is_cgroup_event(struct perf_event *event)
738{
739	return event->cgrp != NULL;
740}
741
742static inline u64 perf_cgroup_event_time(struct perf_event *event)
743{
744	struct perf_cgroup_info *t;
745
746	t = per_cpu_ptr(event->cgrp->info, event->cpu);
747	return t->time;
748}
749
750static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
751{
752	struct perf_cgroup_info *t;
753
754	t = per_cpu_ptr(event->cgrp->info, event->cpu);
755	if (!__load_acquire(&t->active))
756		return t->time;
757	now += READ_ONCE(t->timeoffset);
758	return now;
759}
760
761static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
762{
763	if (adv)
764		info->time += now - info->timestamp;
765	info->timestamp = now;
766	/*
767	 * see update_context_time()
768	 */
769	WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
770}
771
772static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
773{
774	struct perf_cgroup *cgrp = cpuctx->cgrp;
775	struct cgroup_subsys_state *css;
776	struct perf_cgroup_info *info;
777
778	if (cgrp) {
779		u64 now = perf_clock();
780
781		for (css = &cgrp->css; css; css = css->parent) {
782			cgrp = container_of(css, struct perf_cgroup, css);
783			info = this_cpu_ptr(cgrp->info);
784
785			__update_cgrp_time(info, now, true);
786			if (final)
787				__store_release(&info->active, 0);
788		}
789	}
790}
791
792static inline void update_cgrp_time_from_event(struct perf_event *event)
793{
794	struct perf_cgroup_info *info;
795
796	/*
797	 * ensure we access cgroup data only when needed and
798	 * when we know the cgroup is pinned (css_get)
799	 */
800	if (!is_cgroup_event(event))
801		return;
802
803	info = this_cpu_ptr(event->cgrp->info);
804	/*
805	 * Do not update time when cgroup is not active
806	 */
807	if (info->active)
808		__update_cgrp_time(info, perf_clock(), true);
809}
810
811static inline void
812perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
813{
814	struct perf_event_context *ctx = &cpuctx->ctx;
815	struct perf_cgroup *cgrp = cpuctx->cgrp;
816	struct perf_cgroup_info *info;
817	struct cgroup_subsys_state *css;
818
819	/*
820	 * ctx->lock held by caller
821	 * ensure we do not access cgroup data
822	 * unless we have the cgroup pinned (css_get)
823	 */
824	if (!cgrp)
825		return;
826
827	WARN_ON_ONCE(!ctx->nr_cgroups);
828
829	for (css = &cgrp->css; css; css = css->parent) {
830		cgrp = container_of(css, struct perf_cgroup, css);
831		info = this_cpu_ptr(cgrp->info);
832		__update_cgrp_time(info, ctx->timestamp, false);
833		__store_release(&info->active, 1);
834	}
835}
836
837/*
838 * reschedule events based on the cgroup constraint of task.
839 */
840static void perf_cgroup_switch(struct task_struct *task)
841{
842	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
843	struct perf_cgroup *cgrp;
844
845	/*
846	 * cpuctx->cgrp is set when the first cgroup event is enabled,
847	 * and is cleared when the last cgroup event is disabled.
848	 */
849	if (READ_ONCE(cpuctx->cgrp) == NULL)
850		return;
851
852	WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
853
854	cgrp = perf_cgroup_from_task(task, NULL);
855	if (READ_ONCE(cpuctx->cgrp) == cgrp)
856		return;
857
858	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
859	perf_ctx_disable(&cpuctx->ctx);
860
861	ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
862	/*
863	 * must not be done before ctxswout due
864	 * to update_cgrp_time_from_cpuctx() in
865	 * ctx_sched_out()
866	 */
867	cpuctx->cgrp = cgrp;
868	/*
869	 * set cgrp before ctxsw in to allow
870	 * perf_cgroup_set_timestamp() in ctx_sched_in()
871	 * to not have to pass task around
872	 */
873	ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
874
875	perf_ctx_enable(&cpuctx->ctx);
876	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
877}
878
879static int perf_cgroup_ensure_storage(struct perf_event *event,
880				struct cgroup_subsys_state *css)
881{
882	struct perf_cpu_context *cpuctx;
883	struct perf_event **storage;
884	int cpu, heap_size, ret = 0;
885
886	/*
887	 * Allow storage to have sufficient space for an iterator for each
888	 * possibly nested cgroup plus an iterator for events with no cgroup.
889	 */
890	for (heap_size = 1; css; css = css->parent)
891		heap_size++;
892
893	for_each_possible_cpu(cpu) {
894		cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
895		if (heap_size <= cpuctx->heap_size)
896			continue;
897
898		storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
899				       GFP_KERNEL, cpu_to_node(cpu));
900		if (!storage) {
901			ret = -ENOMEM;
902			break;
903		}
904
905		raw_spin_lock_irq(&cpuctx->ctx.lock);
906		if (cpuctx->heap_size < heap_size) {
907			swap(cpuctx->heap, storage);
908			if (storage == cpuctx->heap_default)
909				storage = NULL;
910			cpuctx->heap_size = heap_size;
911		}
912		raw_spin_unlock_irq(&cpuctx->ctx.lock);
913
914		kfree(storage);
915	}
916
917	return ret;
918}
919
920static inline int perf_cgroup_connect(int fd, struct perf_event *event,
921				      struct perf_event_attr *attr,
922				      struct perf_event *group_leader)
923{
924	struct perf_cgroup *cgrp;
925	struct cgroup_subsys_state *css;
926	struct fd f = fdget(fd);
927	int ret = 0;
928
929	if (!f.file)
930		return -EBADF;
931
932	css = css_tryget_online_from_dir(f.file->f_path.dentry,
933					 &perf_event_cgrp_subsys);
934	if (IS_ERR(css)) {
935		ret = PTR_ERR(css);
936		goto out;
937	}
938
939	ret = perf_cgroup_ensure_storage(event, css);
940	if (ret)
941		goto out;
942
943	cgrp = container_of(css, struct perf_cgroup, css);
944	event->cgrp = cgrp;
945
946	/*
947	 * all events in a group must monitor
948	 * the same cgroup because a task belongs
949	 * to only one perf cgroup at a time
950	 */
951	if (group_leader && group_leader->cgrp != cgrp) {
952		perf_detach_cgroup(event);
953		ret = -EINVAL;
954	}
955out:
956	fdput(f);
957	return ret;
958}
959
960static inline void
961perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
962{
963	struct perf_cpu_context *cpuctx;
964
965	if (!is_cgroup_event(event))
966		return;
967
968	/*
969	 * Because cgroup events are always per-cpu events,
970	 * @ctx == &cpuctx->ctx.
971	 */
972	cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
973
974	if (ctx->nr_cgroups++)
975		return;
976
977	cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
978}
979
980static inline void
981perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
982{
983	struct perf_cpu_context *cpuctx;
984
985	if (!is_cgroup_event(event))
986		return;
987
988	/*
989	 * Because cgroup events are always per-cpu events,
990	 * @ctx == &cpuctx->ctx.
991	 */
992	cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
993
994	if (--ctx->nr_cgroups)
995		return;
996
997	cpuctx->cgrp = NULL;
998}
999
1000#else /* !CONFIG_CGROUP_PERF */
1001
1002static inline bool
1003perf_cgroup_match(struct perf_event *event)
1004{
1005	return true;
1006}
1007
1008static inline void perf_detach_cgroup(struct perf_event *event)
1009{}
1010
1011static inline int is_cgroup_event(struct perf_event *event)
1012{
1013	return 0;
1014}
1015
1016static inline void update_cgrp_time_from_event(struct perf_event *event)
1017{
1018}
1019
1020static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
1021						bool final)
1022{
1023}
1024
1025static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1026				      struct perf_event_attr *attr,
1027				      struct perf_event *group_leader)
1028{
1029	return -EINVAL;
1030}
1031
1032static inline void
1033perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
1034{
1035}
1036
1037static inline u64 perf_cgroup_event_time(struct perf_event *event)
1038{
1039	return 0;
1040}
1041
1042static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
1043{
1044	return 0;
1045}
1046
1047static inline void
1048perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
1049{
1050}
1051
1052static inline void
1053perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1054{
1055}
1056
1057static void perf_cgroup_switch(struct task_struct *task)
1058{
1059}
1060#endif
1061
1062/*
1063 * set default to be dependent on timer tick just
1064 * like original code
1065 */
1066#define PERF_CPU_HRTIMER (1000 / HZ)
1067/*
1068 * function must be called with interrupts disabled
1069 */
1070static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1071{
1072	struct perf_cpu_pmu_context *cpc;
1073	bool rotations;
1074
1075	lockdep_assert_irqs_disabled();
1076
1077	cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer);
1078	rotations = perf_rotate_context(cpc);
1079
1080	raw_spin_lock(&cpc->hrtimer_lock);
1081	if (rotations)
1082		hrtimer_forward_now(hr, cpc->hrtimer_interval);
1083	else
1084		cpc->hrtimer_active = 0;
1085	raw_spin_unlock(&cpc->hrtimer_lock);
1086
1087	return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1088}
1089
1090static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
1091{
1092	struct hrtimer *timer = &cpc->hrtimer;
1093	struct pmu *pmu = cpc->epc.pmu;
1094	u64 interval;
1095
1096	/*
1097	 * check default is sane, if not set then force to
1098	 * default interval (1/tick)
1099	 */
1100	interval = pmu->hrtimer_interval_ms;
1101	if (interval < 1)
1102		interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1103
1104	cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1105
1106	raw_spin_lock_init(&cpc->hrtimer_lock);
1107	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
1108	timer->function = perf_mux_hrtimer_handler;
1109}
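
/*
 * Example: with HZ=250, PERF_CPU_HRTIMER evaluates to 4, so a PMU that
 * leaves hrtimer_interval_ms at 0 gets a 4ms multiplexing interval,
 * i.e. ns_to_ktime(4 * NSEC_PER_MSEC), one rotation per tick.
 */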
1110
1111static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
1112{
1113	struct hrtimer *timer = &cpc->hrtimer;
1114	unsigned long flags;
1115
1116	raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
1117	if (!cpc->hrtimer_active) {
1118		cpc->hrtimer_active = 1;
1119		hrtimer_forward_now(timer, cpc->hrtimer_interval);
1120		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
1121	}
1122	raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);
1123
1124	return 0;
1125}
1126
1127static int perf_mux_hrtimer_restart_ipi(void *arg)
1128{
1129	return perf_mux_hrtimer_restart(arg);
1130}
1131
1132void perf_pmu_disable(struct pmu *pmu)
1133{
1134	int *count = this_cpu_ptr(pmu->pmu_disable_count);
1135	if (!(*count)++)
1136		pmu->pmu_disable(pmu);
1137}
1138
1139void perf_pmu_enable(struct pmu *pmu)
1140{
1141	int *count = this_cpu_ptr(pmu->pmu_disable_count);
1142	if (!--(*count))
1143		pmu->pmu_enable(pmu);
1144}
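
/*
 * perf_pmu_disable()/perf_pmu_enable() nest via the per-CPU
 * pmu_disable_count, so the usual pattern in this file is simply:
 *
 *	perf_pmu_disable(pmu);
 *	... reprogram events on this PMU ...
 *	perf_pmu_enable(pmu);
 *
 * Only the outermost disable reaches pmu->pmu_disable(), and only the
 * matching outermost enable reaches pmu->pmu_enable().
 */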
1145
1146static void perf_assert_pmu_disabled(struct pmu *pmu)
1147{
1148	WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
1149}
1150
1151static void get_ctx(struct perf_event_context *ctx)
1152{
1153	refcount_inc(&ctx->refcount);
1154}
1155
1156static void *alloc_task_ctx_data(struct pmu *pmu)
1157{
1158	if (pmu->task_ctx_cache)
1159		return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
1160
1161	return NULL;
1162}
1163
1164static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
1165{
1166	if (pmu->task_ctx_cache && task_ctx_data)
1167		kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
1168}
1169
1170static void free_ctx(struct rcu_head *head)
1171{
1172	struct perf_event_context *ctx;
1173
1174	ctx = container_of(head, struct perf_event_context, rcu_head);
1175	kfree(ctx);
1176}
1177
1178static void put_ctx(struct perf_event_context *ctx)
1179{
1180	if (refcount_dec_and_test(&ctx->refcount)) {
1181		if (ctx->parent_ctx)
1182			put_ctx(ctx->parent_ctx);
1183		if (ctx->task && ctx->task != TASK_TOMBSTONE)
1184			put_task_struct(ctx->task);
1185		call_rcu(&ctx->rcu_head, free_ctx);
1186	}
1187}
1188
1189/*
1190 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1191 * perf_pmu_migrate_context() we need some magic.
1192 *
1193 * Those places that change perf_event::ctx will hold both
1194 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1195 *
1196 * Lock ordering is by mutex address. There are two other sites where
1197 * perf_event_context::mutex nests and those are:
1198 *
1199 *  - perf_event_exit_task_context()	[ child , 0 ]
1200 *      perf_event_exit_event()
1201 *        put_event()			[ parent, 1 ]
1202 *
1203 *  - perf_event_init_context()		[ parent, 0 ]
1204 *      inherit_task_group()
1205 *        inherit_group()
1206 *          inherit_event()
1207 *            perf_event_alloc()
1208 *              perf_init_event()
1209 *                perf_try_init_event()	[ child , 1 ]
1210 *
1211 * While it appears there is an obvious deadlock here -- the parent and child
1212 * nesting levels are inverted between the two. This is in fact safe because
1213 * life-time rules separate them. That is an exiting task cannot fork, and a
1214 * spawning task cannot (yet) exit.
1215 *
1216 * But remember that these are parent<->child context relations, and
1217 * migration does not affect children, therefore these two orderings should not
1218 * interact.
1219 *
1220 * The change in perf_event::ctx does not affect children (as claimed above)
1221 * because the sys_perf_event_open() case will install a new event and break
1222 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1223 * concerned with cpuctx and that doesn't have children.
1224 *
1225 * The places that change perf_event::ctx will issue:
1226 *
1227 *   perf_remove_from_context();
1228 *   synchronize_rcu();
1229 *   perf_install_in_context();
1230 *
1231 * to affect the change. The remove_from_context() + synchronize_rcu() should
1232 * quiesce the event, after which we can install it in the new location. This
1233 * means that only external vectors (perf_fops, prctl) can perturb the event
1234 * while in transit. Therefore all such accessors should also acquire
1235 * perf_event_context::mutex to serialize against this.
1236 *
1237 * However; because event->ctx can change while we're waiting to acquire
1238 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1239 * function.
1240 *
1241 * Lock order:
1242 *    exec_update_lock
1243 *	task_struct::perf_event_mutex
1244 *	  perf_event_context::mutex
1245 *	    perf_event::child_mutex;
1246 *	      perf_event_context::lock
1247 *	    perf_event::mmap_mutex
1248 *	    mmap_lock
1249 *	      perf_addr_filters_head::lock
1250 *
1251 *    cpu_hotplug_lock
1252 *      pmus_lock
1253 *	  cpuctx->mutex / perf_event_context::mutex
1254 */
1255static struct perf_event_context *
1256perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1257{
1258	struct perf_event_context *ctx;
1259
1260again:
1261	rcu_read_lock();
1262	ctx = READ_ONCE(event->ctx);
1263	if (!refcount_inc_not_zero(&ctx->refcount)) {
1264		rcu_read_unlock();
1265		goto again;
1266	}
1267	rcu_read_unlock();
1268
1269	mutex_lock_nested(&ctx->mutex, nesting);
1270	if (event->ctx != ctx) {
1271		mutex_unlock(&ctx->mutex);
1272		put_ctx(ctx);
1273		goto again;
1274	}
1275
1276	return ctx;
1277}
1278
1279static inline struct perf_event_context *
1280perf_event_ctx_lock(struct perf_event *event)
1281{
1282	return perf_event_ctx_lock_nested(event, 0);
1283}
1284
1285static void perf_event_ctx_unlock(struct perf_event *event,
1286				  struct perf_event_context *ctx)
1287{
1288	mutex_unlock(&ctx->mutex);
1289	put_ctx(ctx);
1290}
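
/*
 * Typical usage sketch for the two helpers above; callers elsewhere in
 * this file follow this shape:
 *
 *	ctx = perf_event_ctx_lock(event);
 *	... operate on @event; event->ctx is now stable ...
 *	perf_event_ctx_unlock(event, ctx);
 *
 * The returned ctx must be handed back to perf_event_ctx_unlock() so
 * that the reference taken in perf_event_ctx_lock_nested() is dropped.
 */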
1291
1292/*
1293 * This must be done under the ctx->lock, such as to serialize against
1294 * context_equiv(), therefore we cannot call put_ctx() since that might end up
1295 * calling scheduler related locks and ctx->lock nests inside those.
1296 */
1297static __must_check struct perf_event_context *
1298unclone_ctx(struct perf_event_context *ctx)
1299{
1300	struct perf_event_context *parent_ctx = ctx->parent_ctx;
1301
1302	lockdep_assert_held(&ctx->lock);
1303
1304	if (parent_ctx)
1305		ctx->parent_ctx = NULL;
1306	ctx->generation++;
1307
1308	return parent_ctx;
1309}
1310
1311static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1312				enum pid_type type)
1313{
1314	u32 nr;
1315	/*
1316	 * only top level events have the pid namespace they were created in
1317	 */
1318	if (event->parent)
1319		event = event->parent;
1320
1321	nr = __task_pid_nr_ns(p, type, event->ns);
1322	/* avoid -1 if it is idle thread or runs in another ns */
1323	if (!nr && !pid_alive(p))
1324		nr = -1;
1325	return nr;
1326}
1327
1328static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1329{
1330	return perf_event_pid_type(event, p, PIDTYPE_TGID);
1331}
1332
1333static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1334{
1335	return perf_event_pid_type(event, p, PIDTYPE_PID);
1336}
1337
1338/*
1339 * If we inherit events we want to return the parent event id
1340 * to userspace.
1341 */
1342static u64 primary_event_id(struct perf_event *event)
1343{
1344	u64 id = event->id;
1345
1346	if (event->parent)
1347		id = event->parent->id;
1348
1349	return id;
1350}
1351
1352/*
1353 * Get the perf_event_context for a task and lock it.
1354 *
1355 * This has to cope with the fact that until it is locked,
1356 * the context could get moved to another task.
1357 */
1358static struct perf_event_context *
1359perf_lock_task_context(struct task_struct *task, unsigned long *flags)
1360{
1361	struct perf_event_context *ctx;
1362
1363retry:
1364	/*
1365	 * One of the few rules of preemptible RCU is that one cannot do
1366	 * rcu_read_unlock() while holding a scheduler (or nested) lock when
1367	 * part of the read side critical section was irqs-enabled -- see
1368	 * rcu_read_unlock_special().
1369	 *
1370	 * Since ctx->lock nests under rq->lock we must ensure the entire read
1371	 * side critical section has interrupts disabled.
1372	 */
1373	local_irq_save(*flags);
1374	rcu_read_lock();
1375	ctx = rcu_dereference(task->perf_event_ctxp);
1376	if (ctx) {
1377		/*
1378		 * If this context is a clone of another, it might
1379		 * get swapped for another underneath us by
1380		 * perf_event_task_sched_out, though the
1381		 * rcu_read_lock() protects us from any context
1382		 * getting freed.  Lock the context and check if it
1383		 * got swapped before we could get the lock, and retry
1384		 * if so.  If we locked the right context, then it
1385		 * can't get swapped on us any more.
1386		 */
1387		raw_spin_lock(&ctx->lock);
1388		if (ctx != rcu_dereference(task->perf_event_ctxp)) {
1389			raw_spin_unlock(&ctx->lock);
1390			rcu_read_unlock();
1391			local_irq_restore(*flags);
1392			goto retry;
1393		}
1394
1395		if (ctx->task == TASK_TOMBSTONE ||
1396		    !refcount_inc_not_zero(&ctx->refcount)) {
1397			raw_spin_unlock(&ctx->lock);
1398			ctx = NULL;
1399		} else {
1400			WARN_ON_ONCE(ctx->task != task);
1401		}
1402	}
1403	rcu_read_unlock();
1404	if (!ctx)
1405		local_irq_restore(*flags);
1406	return ctx;
1407}
1408
1409/*
1410 * Get the context for a task and increment its pin_count so it
1411 * can't get swapped to another task.  This also increments its
1412 * reference count so that the context can't get freed.
1413 */
1414static struct perf_event_context *
1415perf_pin_task_context(struct task_struct *task)
1416{
1417	struct perf_event_context *ctx;
1418	unsigned long flags;
1419
1420	ctx = perf_lock_task_context(task, &flags);
1421	if (ctx) {
1422		++ctx->pin_count;
1423		raw_spin_unlock_irqrestore(&ctx->lock, flags);
1424	}
1425	return ctx;
1426}
1427
1428static void perf_unpin_context(struct perf_event_context *ctx)
1429{
1430	unsigned long flags;
1431
1432	raw_spin_lock_irqsave(&ctx->lock, flags);
1433	--ctx->pin_count;
1434	raw_spin_unlock_irqrestore(&ctx->lock, flags);
1435}
1436
1437/*
1438 * Update the record of the current time in a context.
1439 */
1440static void __update_context_time(struct perf_event_context *ctx, bool adv)
1441{
1442	u64 now = perf_clock();
1443
1444	lockdep_assert_held(&ctx->lock);
1445
1446	if (adv)
1447		ctx->time += now - ctx->timestamp;
1448	ctx->timestamp = now;
1449
1450	/*
1451	 * The above: time' = time + (now - timestamp), can be re-arranged
1452	 * into: time' = now + (time - timestamp), which gives a single value
1453	 * offset to compute future time without locks on.
1454	 *
1455	 * See perf_event_time_now(), which can be used from NMI context where
1456	 * it's (obviously) not possible to acquire ctx->lock in order to read
1457	 * both the above values in a consistent manner.
1458	 */
1459	WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
1460}
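
/*
 * Worked example of the offset trick: with ctx->time = 100us recorded
 * at ctx->timestamp = 1000us, timeoffset = -900us. A lockless reader at
 * now = 1250us computes now + timeoffset = 350us, exactly what
 * update_context_time() would produce (100us + 250us of new delta),
 * without taking ctx->lock; see perf_event_time_now().
 */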
1461
1462static void update_context_time(struct perf_event_context *ctx)
1463{
1464	__update_context_time(ctx, true);
1465}
1466
1467static u64 perf_event_time(struct perf_event *event)
1468{
1469	struct perf_event_context *ctx = event->ctx;
1470
1471	if (unlikely(!ctx))
1472		return 0;
1473
1474	if (is_cgroup_event(event))
1475		return perf_cgroup_event_time(event);
1476
1477	return ctx->time;
1478}
1479
1480static u64 perf_event_time_now(struct perf_event *event, u64 now)
1481{
1482	struct perf_event_context *ctx = event->ctx;
1483
1484	if (unlikely(!ctx))
1485		return 0;
1486
1487	if (is_cgroup_event(event))
1488		return perf_cgroup_event_time_now(event, now);
1489
1490	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
1491		return ctx->time;
1492
1493	now += READ_ONCE(ctx->timeoffset);
1494	return now;
1495}
1496
1497static enum event_type_t get_event_type(struct perf_event *event)
1498{
1499	struct perf_event_context *ctx = event->ctx;
1500	enum event_type_t event_type;
1501
1502	lockdep_assert_held(&ctx->lock);
1503
1504	/*
1505	 * It's 'group type', really, because if our group leader is
1506	 * pinned, so are we.
1507	 */
1508	if (event->group_leader != event)
1509		event = event->group_leader;
1510
1511	event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1512	if (!ctx->task)
1513		event_type |= EVENT_CPU;
1514
1515	return event_type;
1516}
1517
1518/*
1519 * Helper function to initialize event group nodes.
1520 */
1521static void init_event_group(struct perf_event *event)
1522{
1523	RB_CLEAR_NODE(&event->group_node);
1524	event->group_index = 0;
1525}
1526
1527/*
1528 * Extract pinned or flexible groups from the context
1529 * based on event attrs bits.
1530 */
1531static struct perf_event_groups *
1532get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1533{
1534	if (event->attr.pinned)
1535		return &ctx->pinned_groups;
1536	else
1537		return &ctx->flexible_groups;
1538}
1539
1540/*
1541 * Helper function to initialize perf_event_groups trees.
1542 */
1543static void perf_event_groups_init(struct perf_event_groups *groups)
1544{
1545	groups->tree = RB_ROOT;
1546	groups->index = 0;
1547}
1548
1549static inline struct cgroup *event_cgroup(const struct perf_event *event)
1550{
1551	struct cgroup *cgroup = NULL;
1552
1553#ifdef CONFIG_CGROUP_PERF
1554	if (event->cgrp)
1555		cgroup = event->cgrp->css.cgroup;
1556#endif
1557
1558	return cgroup;
1559}
1560
1561/*
1562 * Compare function for event groups;
1563 *
1564 * Implements a composite key that sorts by CPU, PMU and cgroup, and then by a
1565 * virtual index which provides ordering when rotating groups for the same CPU.
1566 */
1567static __always_inline int
1568perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
1569		      const struct cgroup *left_cgroup, const u64 left_group_index,
1570		      const struct perf_event *right)
1571{
1572	if (left_cpu < right->cpu)
1573		return -1;
1574	if (left_cpu > right->cpu)
1575		return 1;
1576
1577	if (left_pmu) {
1578		if (left_pmu < right->pmu_ctx->pmu)
1579			return -1;
1580		if (left_pmu > right->pmu_ctx->pmu)
1581			return 1;
1582	}
1583
1584#ifdef CONFIG_CGROUP_PERF
1585	{
1586		const struct cgroup *right_cgroup = event_cgroup(right);
1587
1588		if (left_cgroup != right_cgroup) {
1589			if (!left_cgroup) {
1590				/*
1591				 * Left has no cgroup but right does, no
1592				 * cgroups come first.
1593				 */
1594				return -1;
1595			}
1596			if (!right_cgroup) {
1597				/*
1598				 * Right has no cgroup but left does, no
1599				 * cgroups come first.
1600				 */
1601				return 1;
1602			}
1603			/* Two dissimilar cgroups, order by id. */
1604			if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
1605				return -1;
1606
1607			return 1;
1608		}
1609	}
1610#endif
1611
1612	if (left_group_index < right->group_index)
1613		return -1;
1614	if (left_group_index > right->group_index)
1615		return 1;
1616
1617	return 0;
1618}
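
/*
 * Example of the resulting order: for two events on the same PMU and in
 * the same cgroup, A = {cpu 0, group_index 7} sorts before
 * B = {cpu 1, group_index 3} because the CPU field dominates. Within a
 * single {cpu, pmu, cgroup} subtree the monotonically growing
 * group_index makes newly inserted events sort last, which is what
 * perf_event_groups_insert() below relies on for rotation order.
 */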
1619
1620#define __node_2_pe(node) \
1621	rb_entry((node), struct perf_event, group_node)
1622
1623static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
1624{
1625	struct perf_event *e = __node_2_pe(a);
1626	return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e),
1627				     e->group_index, __node_2_pe(b)) < 0;
1628}
1629
1630struct __group_key {
1631	int cpu;
1632	struct pmu *pmu;
1633	struct cgroup *cgroup;
1634};
1635
1636static inline int __group_cmp(const void *key, const struct rb_node *node)
1637{
1638	const struct __group_key *a = key;
1639	const struct perf_event *b = __node_2_pe(node);
1640
1641	/* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */
1642	return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b);
1643}
1644
1645static inline int
1646__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
1647{
1648	const struct __group_key *a = key;
1649	const struct perf_event *b = __node_2_pe(node);
1650
1651	/* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */
1652	return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b),
1653				     b->group_index, b);
1654}
1655
1656/*
1657 * Insert @event into @groups' tree; using
1658 *   {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index}
1659 * as key. This places it last inside the {cpu,pmu,cgroup} subtree.
1660 */
1661static void
1662perf_event_groups_insert(struct perf_event_groups *groups,
1663			 struct perf_event *event)
1664{
1665	event->group_index = ++groups->index;
1666
1667	rb_add(&event->group_node, &groups->tree, __group_less);
1668}
1669
1670/*
1671 * Helper function to insert event into the pinned or flexible groups.
1672 */
1673static void
1674add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1675{
1676	struct perf_event_groups *groups;
1677
1678	groups = get_event_groups(event, ctx);
1679	perf_event_groups_insert(groups, event);
1680}
1681
1682/*
1683 * Delete a group from a tree.
1684 */
1685static void
1686perf_event_groups_delete(struct perf_event_groups *groups,
1687			 struct perf_event *event)
1688{
1689	WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1690		     RB_EMPTY_ROOT(&groups->tree));
1691
1692	rb_erase(&event->group_node, &groups->tree);
1693	init_event_group(event);
1694}
1695
1696/*
1697 * Helper function to delete event from its groups.
1698 */
1699static void
1700del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1701{
1702	struct perf_event_groups *groups;
1703
1704	groups = get_event_groups(event, ctx);
1705	perf_event_groups_delete(groups, event);
1706}
1707
1708/*
1709 * Get the leftmost event in the {cpu,pmu,cgroup} subtree.
1710 */
1711static struct perf_event *
1712perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1713			struct pmu *pmu, struct cgroup *cgrp)
1714{
1715	struct __group_key key = {
1716		.cpu = cpu,
1717		.pmu = pmu,
1718		.cgroup = cgrp,
1719	};
1720	struct rb_node *node;
1721
1722	node = rb_find_first(&key, &groups->tree, __group_cmp);
1723	if (node)
1724		return __node_2_pe(node);
1725
1726	return NULL;
1727}
1728
1729static struct perf_event *
1730perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
1731{
1732	struct __group_key key = {
1733		.cpu = event->cpu,
1734		.pmu = pmu,
1735		.cgroup = event_cgroup(event),
1736	};
1737	struct rb_node *next;
1738
1739	next = rb_next_match(&key, &event->group_node, __group_cmp);
1740	if (next)
1741		return __node_2_pe(next);
1742
1743	return NULL;
1744}
1745
1746#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu)		\
1747	for (event = perf_event_groups_first(groups, cpu, pmu, NULL);	\
1748	     event; event = perf_event_groups_next(event, pmu))
1749
1750/*
1751 * Iterate through the whole groups tree.
1752 */
1753#define perf_event_groups_for_each(event, groups)			\
1754	for (event = rb_entry_safe(rb_first(&((groups)->tree)),		\
1755				typeof(*event), group_node); event;	\
1756		event = rb_entry_safe(rb_next(&event->group_node),	\
1757				typeof(*event), group_node))
1758
1759/*
1760 * Add an event to the lists for its context.
1761 * Must be called with ctx->mutex and ctx->lock held.
1762 */
1763static void
1764list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1765{
1766	lockdep_assert_held(&ctx->lock);
1767
1768	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1769	event->attach_state |= PERF_ATTACH_CONTEXT;
1770
1771	event->tstamp = perf_event_time(event);
1772
1773	/*
1774	 * If we're a stand alone event or group leader, we go to the context
1775	 * list, group events are kept attached to the group so that
1776	 * perf_group_detach can, at all times, locate all siblings.
1777	 */
1778	if (event->group_leader == event) {
1779		event->group_caps = event->event_caps;
1780		add_event_to_groups(event, ctx);
1781	}
1782
1783	list_add_rcu(&event->event_entry, &ctx->event_list);
1784	ctx->nr_events++;
1785	if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
1786		ctx->nr_user++;
1787	if (event->attr.inherit_stat)
1788		ctx->nr_stat++;
1789
1790	if (event->state > PERF_EVENT_STATE_OFF)
1791		perf_cgroup_event_enable(event, ctx);
1792
1793	ctx->generation++;
1794	event->pmu_ctx->nr_events++;
1795}
1796
1797/*
1798 * Initialize event state based on the perf_event_attr::disabled.
1799 */
1800static inline void perf_event__state_init(struct perf_event *event)
1801{
1802	event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1803					      PERF_EVENT_STATE_INACTIVE;
1804}
1805
1806static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1807{
1808	int entry = sizeof(u64); /* value */
1809	int size = 0;
1810	int nr = 1;
1811
1812	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1813		size += sizeof(u64);
1814
1815	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1816		size += sizeof(u64);
1817
1818	if (event->attr.read_format & PERF_FORMAT_ID)
1819		entry += sizeof(u64);
1820
1821	if (event->attr.read_format & PERF_FORMAT_LOST)
1822		entry += sizeof(u64);
1823
1824	if (event->attr.read_format & PERF_FORMAT_GROUP) {
1825		nr += nr_siblings;
1826		size += sizeof(u64);
1827	}
1828
1829	size += entry * nr;
1830	event->read_size = size;
1831}
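
/*
 * Worked example: read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID |
 * PERF_FORMAT_TOTAL_TIME_ENABLED with nr_siblings = 2 gives
 *
 *	entry = 8 (value) + 8 (id)        = 16 bytes
 *	size  = 8 (time_enabled) + 8 (nr) = 16 bytes
 *	nr    = 1 + 2                     = 3 entries
 *	read_size = 16 + 16 * 3           = 64 bytes
 *
 * which matches the read() format described in
 * include/uapi/linux/perf_event.h.
 */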
1832
1833static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1834{
1835	struct perf_sample_data *data;
1836	u16 size = 0;
1837
1838	if (sample_type & PERF_SAMPLE_IP)
1839		size += sizeof(data->ip);
1840
1841	if (sample_type & PERF_SAMPLE_ADDR)
1842		size += sizeof(data->addr);
1843
1844	if (sample_type & PERF_SAMPLE_PERIOD)
1845		size += sizeof(data->period);
1846
1847	if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
1848		size += sizeof(data->weight.full);
1849
1850	if (sample_type & PERF_SAMPLE_READ)
1851		size += event->read_size;
1852
1853	if (sample_type & PERF_SAMPLE_DATA_SRC)
1854		size += sizeof(data->data_src.val);
1855
1856	if (sample_type & PERF_SAMPLE_TRANSACTION)
1857		size += sizeof(data->txn);
1858
1859	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1860		size += sizeof(data->phys_addr);
1861
1862	if (sample_type & PERF_SAMPLE_CGROUP)
1863		size += sizeof(data->cgroup);
1864
1865	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
1866		size += sizeof(data->data_page_size);
1867
1868	if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
1869		size += sizeof(data->code_page_size);
1870
1871	event->header_size = size;
1872}
1873
1874/*
1875 * Called at perf_event creation and when events are attached/detached from a
1876 * group.
1877 */
1878static void perf_event__header_size(struct perf_event *event)
1879{
1880	__perf_event_read_size(event,
1881			       event->group_leader->nr_siblings);
1882	__perf_event_header_size(event, event->attr.sample_type);
1883}
1884
1885static void perf_event__id_header_size(struct perf_event *event)
1886{
1887	struct perf_sample_data *data;
1888	u64 sample_type = event->attr.sample_type;
1889	u16 size = 0;
1890
1891	if (sample_type & PERF_SAMPLE_TID)
1892		size += sizeof(data->tid_entry);
1893
1894	if (sample_type & PERF_SAMPLE_TIME)
1895		size += sizeof(data->time);
1896
1897	if (sample_type & PERF_SAMPLE_IDENTIFIER)
1898		size += sizeof(data->id);
1899
1900	if (sample_type & PERF_SAMPLE_ID)
1901		size += sizeof(data->id);
1902
1903	if (sample_type & PERF_SAMPLE_STREAM_ID)
1904		size += sizeof(data->stream_id);
1905
1906	if (sample_type & PERF_SAMPLE_CPU)
1907		size += sizeof(data->cpu_entry);
1908
1909	event->id_header_size = size;
1910}
1911
1912static bool perf_event_validate_size(struct perf_event *event)
1913{
1914	/*
1915	 * The values computed here will be over-written when we actually
1916	 * attach the event.
1917	 */
1918	__perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1919	__perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1920	perf_event__id_header_size(event);
1921
1922	/*
1923	 * Sum the lot; should not exceed the 64k limit we have on records.
1924	 * Conservative limit to allow for callchains and other variable fields.
1925	 */
1926	if (event->read_size + event->header_size +
1927	    event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1928		return false;
1929
1930	return true;
1931}
1932
1933static void perf_group_attach(struct perf_event *event)
1934{
1935	struct perf_event *group_leader = event->group_leader, *pos;
1936
1937	lockdep_assert_held(&event->ctx->lock);
1938
1939	/*
1940	 * We can have double attach due to group movement (move_group) in
1941	 * perf_event_open().
1942	 */
1943	if (event->attach_state & PERF_ATTACH_GROUP)
1944		return;
1945
1946	event->attach_state |= PERF_ATTACH_GROUP;
1947
1948	if (group_leader == event)
1949		return;
1950
1951	WARN_ON_ONCE(group_leader->ctx != event->ctx);
1952
1953	group_leader->group_caps &= event->event_caps;
1954
1955	list_add_tail(&event->sibling_list, &group_leader->sibling_list);
1956	group_leader->nr_siblings++;
1957
1958	perf_event__header_size(group_leader);
1959
1960	for_each_sibling_event(pos, group_leader)
1961		perf_event__header_size(pos);
1962}
1963
1964/*
1965 * Remove an event from the lists for its context.
1966 * Must be called with ctx->mutex and ctx->lock held.
1967 */
1968static void
1969list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1970{
1971	WARN_ON_ONCE(event->ctx != ctx);
1972	lockdep_assert_held(&ctx->lock);
1973
1974	/*
1975	 * We can have double detach due to exit/hot-unplug + close.
1976	 */
1977	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1978		return;
1979
1980	event->attach_state &= ~PERF_ATTACH_CONTEXT;
1981
1982	ctx->nr_events--;
1983	if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
1984		ctx->nr_user--;
1985	if (event->attr.inherit_stat)
1986		ctx->nr_stat--;
1987
1988	list_del_rcu(&event->event_entry);
1989
1990	if (event->group_leader == event)
1991		del_event_from_groups(event, ctx);
1992
1993	/*
1994	 * If event was in error state, then keep it
1995	 * that way, otherwise bogus counts will be
1996	 * returned on read(). The only way to get out
1997	 * of error state is by explicit re-enabling
1998	 * of the event
1999	 */
2000	if (event->state > PERF_EVENT_STATE_OFF) {
2001		perf_cgroup_event_disable(event, ctx);
2002		perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2003	}
2004
2005	ctx->generation++;
2006	event->pmu_ctx->nr_events--;
2007}
2008
2009static int
2010perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2011{
2012	if (!has_aux(aux_event))
2013		return 0;
2014
2015	if (!event->pmu->aux_output_match)
2016		return 0;
2017
2018	return event->pmu->aux_output_match(aux_event);
2019}
2020
2021static void put_event(struct perf_event *event);
2022static void event_sched_out(struct perf_event *event,
2023			    struct perf_event_context *ctx);
2024
2025static void perf_put_aux_event(struct perf_event *event)
2026{
2027	struct perf_event_context *ctx = event->ctx;
2028	struct perf_event *iter;
2029
2030	/*
2031	 * If event uses aux_event tear down the link
2032	 */
2033	if (event->aux_event) {
2034		iter = event->aux_event;
2035		event->aux_event = NULL;
2036		put_event(iter);
2037		return;
2038	}
2039
2040	/*
2041	 * If the event is an aux_event, tear down all links to
2042	 * it from other events.
2043	 */
2044	for_each_sibling_event(iter, event->group_leader) {
2045		if (iter->aux_event != event)
2046			continue;
2047
2048		iter->aux_event = NULL;
2049		put_event(event);
2050
2051		/*
2052		 * If it's ACTIVE, schedule it out and put it into ERROR
2053		 * state so that we don't try to schedule it again. Note
2054		 * that perf_event_enable() will clear the ERROR status.
2055		 */
2056		event_sched_out(iter, ctx);
2057		perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2058	}
2059}
2060
2061static bool perf_need_aux_event(struct perf_event *event)
2062{
2063	return !!event->attr.aux_output || !!event->attr.aux_sample_size;
2064}
2065
2066static int perf_get_aux_event(struct perf_event *event,
2067			      struct perf_event *group_leader)
2068{
2069	/*
2070	 * Our group leader must be an aux event if we want to be
2071	 * an aux_output. This way, the aux event will precede its
2072	 * aux_output events in the group, and therefore will always
2073	 * schedule first.
2074	 */
2075	if (!group_leader)
2076		return 0;
2077
2078	/*
2079	 * aux_output and aux_sample_size are mutually exclusive.
2080	 */
2081	if (event->attr.aux_output && event->attr.aux_sample_size)
2082		return 0;
2083
2084	if (event->attr.aux_output &&
2085	    !perf_aux_output_match(event, group_leader))
2086		return 0;
2087
2088	if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2089		return 0;
2090
2091	if (!atomic_long_inc_not_zero(&group_leader->refcount))
2092		return 0;
2093
2094	/*
2095	 * Link aux_outputs to their aux event; this is undone in
2096	 * perf_group_detach() by perf_put_aux_event(). When the
2097	 * group is torn down, the aux_output events lose their
2098	 * link to the aux_event and can't schedule any more.
2099	 */
2100	event->aux_event = group_leader;
2101
2102	return 1;
2103}
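
/*
 * Illustrative userspace setup that ends up here (hedged sketch; intel_pt
 * is one real AUX-capable PMU, all other values are hypothetical):
 *
 *	pt_fd = perf_event_open(&pt_attr, pid, cpu, -1, 0);	// aux event
 *	attr.aux_output = 1;
 *	fd = perf_event_open(&attr, pid, cpu, pt_fd, 0);	// aux_output sibling
 *
 * The second event reaches this function with @group_leader being the PT
 * event, takes a reference on it and records the link in ->aux_event.
 */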
2104
2105static inline struct list_head *get_event_list(struct perf_event *event)
2106{
2107	return event->attr.pinned ? &event->pmu_ctx->pinned_active :
2108				    &event->pmu_ctx->flexible_active;
2109}
2110
2111/*
2112 * Events that have PERF_EV_CAP_SIBLING require being part of a group and
2113 * cannot exist on their own, schedule them out and move them into the ERROR
2114 * state. Also see _perf_event_enable(), it will not be able to recover
2115 * this ERROR state.
2116 */
2117static inline void perf_remove_sibling_event(struct perf_event *event)
2118{
2119	event_sched_out(event, event->ctx);
2120	perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2121}
2122
2123static void perf_group_detach(struct perf_event *event)
2124{
2125	struct perf_event *leader = event->group_leader;
2126	struct perf_event *sibling, *tmp;
2127	struct perf_event_context *ctx = event->ctx;
2128
2129	lockdep_assert_held(&ctx->lock);
2130
2131	/*
2132	 * We can have double detach due to exit/hot-unplug + close.
2133	 */
2134	if (!(event->attach_state & PERF_ATTACH_GROUP))
2135		return;
2136
2137	event->attach_state &= ~PERF_ATTACH_GROUP;
2138
2139	perf_put_aux_event(event);
2140
2141	/*
2142	 * If this is a sibling, remove it from its group.
2143	 */
2144	if (leader != event) {
2145		list_del_init(&event->sibling_list);
2146		event->group_leader->nr_siblings--;
2147		goto out;
2148	}
2149
2150	/*
2151	 * If this was a group event with sibling events then
2152	 * upgrade the siblings to singleton events by adding them
2153	 * to whatever list we are on.
2154	 */
2155	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
2156
2157		if (sibling->event_caps & PERF_EV_CAP_SIBLING)
2158			perf_remove_sibling_event(sibling);
2159
2160		sibling->group_leader = sibling;
2161		list_del_init(&sibling->sibling_list);
2162
2163		/* Inherit group flags from the previous leader */
2164		sibling->group_caps = event->group_caps;
2165
2166		if (sibling->attach_state & PERF_ATTACH_CONTEXT) {
2167			add_event_to_groups(sibling, event->ctx);
2168
2169			if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2170				list_add_tail(&sibling->active_list, get_event_list(sibling));
2171		}
2172
2173		WARN_ON_ONCE(sibling->ctx != event->ctx);
2174	}
2175
2176out:
2177	for_each_sibling_event(tmp, leader)
2178		perf_event__header_size(tmp);
2179
2180	perf_event__header_size(leader);
2181}
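
/*
 * Illustrative outcome (hedged): detaching the leader of a group
 * { L, A, B } promotes A and B to singleton events: each becomes its own
 * group_leader and is re-added to the context's group trees, while any
 * PERF_EV_CAP_SIBLING events are instead moved to ERROR by
 * perf_remove_sibling_event() above.
 */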
2182
2183static void sync_child_event(struct perf_event *child_event);
2184
2185static void perf_child_detach(struct perf_event *event)
2186{
2187	struct perf_event *parent_event = event->parent;
2188
2189	if (!(event->attach_state & PERF_ATTACH_CHILD))
2190		return;
2191
2192	event->attach_state &= ~PERF_ATTACH_CHILD;
2193
2194	if (WARN_ON_ONCE(!parent_event))
2195		return;
2196
2197	lockdep_assert_held(&parent_event->child_mutex);
2198
2199	sync_child_event(event);
2200	list_del_init(&event->child_list);
2201}
2202
2203static bool is_orphaned_event(struct perf_event *event)
2204{
2205	return event->state == PERF_EVENT_STATE_DEAD;
2206}
2207
2208static inline int
2209event_filter_match(struct perf_event *event)
2210{
2211	return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
2212	       perf_cgroup_match(event);
2213}
2214
2215static void
2216event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
2217{
2218	struct perf_event_pmu_context *epc = event->pmu_ctx;
2219	struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
2220	enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
2221
2222	// XXX cpc serialization, probably per-cpu IRQ disabled
2223
2224	WARN_ON_ONCE(event->ctx != ctx);
2225	lockdep_assert_held(&ctx->lock);
2226
2227	if (event->state != PERF_EVENT_STATE_ACTIVE)
2228		return;
2229
2230	/*
2231	 * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
2232	 * we can schedule events _OUT_ individually through things like
2233	 * __perf_remove_from_context().
2234	 */
2235	list_del_init(&event->active_list);
2236
2237	perf_pmu_disable(event->pmu);
2238
2239	event->pmu->del(event, 0);
2240	event->oncpu = -1;
2241
2242	if (event->pending_disable) {
2243		event->pending_disable = 0;
2244		perf_cgroup_event_disable(event, ctx);
2245		state = PERF_EVENT_STATE_OFF;
2246	}
2247
2248	if (event->pending_sigtrap) {
2249		bool dec = true;
2250
2251		event->pending_sigtrap = 0;
2252		if (state != PERF_EVENT_STATE_OFF &&
2253		    !event->pending_work) {
2254			event->pending_work = 1;
2255			dec = false;
2256			WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
2257			task_work_add(current, &event->pending_task, TWA_RESUME);
2258		}
2259		if (dec)
2260			local_dec(&event->ctx->nr_pending);
2261	}
2262
2263	perf_event_set_state(event, state);
2264
2265	if (!is_software_event(event))
2266		cpc->active_oncpu--;
2267	if (event->attr.freq && event->attr.sample_freq)
2268		ctx->nr_freq--;
2269	if (event->attr.exclusive || !cpc->active_oncpu)
2270		cpc->exclusive = 0;
2271
2272	perf_pmu_enable(event->pmu);
2273}
2274
2275static void
2276group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
2277{
2278	struct perf_event *event;
2279
2280	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2281		return;
2282
2283	perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);
2284
2285	event_sched_out(group_event, ctx);
2286
2287	/*
2288	 * Schedule out siblings (if any):
2289	 */
2290	for_each_sibling_event(event, group_event)
2291		event_sched_out(event, ctx);
2292}
2293
2294#define DETACH_GROUP	0x01UL
2295#define DETACH_CHILD	0x02UL
2296#define DETACH_DEAD	0x04UL
2297
2298/*
2299 * Cross CPU call to remove a performance event
2300 *
2301 * We disable the event on the hardware level first. After that we
2302 * remove it from the context list.
2303 */
2304static void
2305__perf_remove_from_context(struct perf_event *event,
2306			   struct perf_cpu_context *cpuctx,
2307			   struct perf_event_context *ctx,
2308			   void *info)
2309{
2310	struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
2311	unsigned long flags = (unsigned long)info;
2312
2313	if (ctx->is_active & EVENT_TIME) {
2314		update_context_time(ctx);
2315		update_cgrp_time_from_cpuctx(cpuctx, false);
2316	}
2317
2318	/*
2319	 * Ensure event_sched_out() switches to OFF, at the very least
2320	 * this avoids raising perf_pending_task() at this time.
2321	 */
2322	if (flags & DETACH_DEAD)
2323		event->pending_disable = 1;
2324	event_sched_out(event, ctx);
2325	if (flags & DETACH_GROUP)
2326		perf_group_detach(event);
2327	if (flags & DETACH_CHILD)
2328		perf_child_detach(event);
2329	list_del_event(event, ctx);
2330	if (flags & DETACH_DEAD)
2331		event->state = PERF_EVENT_STATE_DEAD;
2332
2333	if (!pmu_ctx->nr_events) {
2334		pmu_ctx->rotate_necessary = 0;
2335
2336		if (ctx->task && ctx->is_active) {
2337			struct perf_cpu_pmu_context *cpc;
2338
2339			cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
2340			WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
2341			cpc->task_epc = NULL;
2342		}
2343	}
2344
2345	if (!ctx->nr_events && ctx->is_active) {
2346		if (ctx == &cpuctx->ctx)
2347			update_cgrp_time_from_cpuctx(cpuctx, true);
2348
2349		ctx->is_active = 0;
2350		if (ctx->task) {
2351			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2352			cpuctx->task_ctx = NULL;
2353		}
2354	}
2355}
2356
2357/*
2358 * Remove the event from a task's (or a CPU's) list of events.
2359 *
2360 * If event->ctx is a cloned context, callers must make sure that
2361 * every task struct that event->ctx->task could possibly point to
2362 * remains valid.  This is OK when called from perf_release since
2363 * that only calls us on the top-level context, which can't be a clone.
2364 * When called from perf_event_exit_task, it's OK because the
2365 * context has been detached from its task.
2366 */
2367static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2368{
2369	struct perf_event_context *ctx = event->ctx;
2370
2371	lockdep_assert_held(&ctx->mutex);
2372
2373	/*
2374	 * Because of perf_event_exit_task(), perf_remove_from_context() ought
2375	 * to work in the face of TASK_TOMBSTONE, unlike every other
2376	 * event_function_call() user.
2377	 */
2378	raw_spin_lock_irq(&ctx->lock);
2379	if (!ctx->is_active) {
2380		__perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
2381					   ctx, (void *)flags);
2382		raw_spin_unlock_irq(&ctx->lock);
2383		return;
2384	}
2385	raw_spin_unlock_irq(&ctx->lock);
2386
2387	event_function_call(event, __perf_remove_from_context, (void *)flags);
2388}
2389
2390/*
2391 * Cross CPU call to disable a performance event
2392 */
2393static void __perf_event_disable(struct perf_event *event,
2394				 struct perf_cpu_context *cpuctx,
2395				 struct perf_event_context *ctx,
2396				 void *info)
2397{
2398	if (event->state < PERF_EVENT_STATE_INACTIVE)
2399		return;
2400
2401	if (ctx->is_active & EVENT_TIME) {
2402		update_context_time(ctx);
2403		update_cgrp_time_from_event(event);
2404	}
2405
2406	perf_pmu_disable(event->pmu_ctx->pmu);
2407
2408	if (event == event->group_leader)
2409		group_sched_out(event, ctx);
2410	else
2411		event_sched_out(event, ctx);
2412
2413	perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2414	perf_cgroup_event_disable(event, ctx);
2415
2416	perf_pmu_enable(event->pmu_ctx->pmu);
2417}
2418
2419/*
2420 * Disable an event.
2421 *
2422 * If event->ctx is a cloned context, callers must make sure that
2423 * every task struct that event->ctx->task could possibly point to
2424 * remains valid.  This condition is satisfied when called through
2425 * perf_event_for_each_child or perf_event_for_each because they
2426 * hold the top-level event's child_mutex, so any descendant that
2427 * goes to exit will block in perf_event_exit_event().
2428 *
2429 * When called from perf_pending_irq it's OK because event->ctx
2430 * is the current context on this CPU and preemption is disabled,
2431 * hence we can't get into perf_event_task_sched_out for this context.
2432 */
2433static void _perf_event_disable(struct perf_event *event)
2434{
2435	struct perf_event_context *ctx = event->ctx;
2436
2437	raw_spin_lock_irq(&ctx->lock);
2438	if (event->state <= PERF_EVENT_STATE_OFF) {
2439		raw_spin_unlock_irq(&ctx->lock);
2440		return;
2441	}
2442	raw_spin_unlock_irq(&ctx->lock);
2443
2444	event_function_call(event, __perf_event_disable, NULL);
2445}
2446
2447void perf_event_disable_local(struct perf_event *event)
2448{
2449	event_function_local(event, __perf_event_disable, NULL);
2450}
2451
2452/*
2453 * Strictly speaking kernel users cannot create groups and therefore this
2454 * interface does not need the perf_event_ctx_lock() magic.
2455 */
2456void perf_event_disable(struct perf_event *event)
2457{
2458	struct perf_event_context *ctx;
2459
2460	ctx = perf_event_ctx_lock(event);
2461	_perf_event_disable(event);
2462	perf_event_ctx_unlock(event, ctx);
2463}
2464EXPORT_SYMBOL_GPL(perf_event_disable);
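
/*
 * Illustrative in-kernel usage (hedged sketch; the hardlockup watchdog is
 * one real caller, error handling is elided here):
 *
 *	struct perf_event *ev;
 *
 *	ev = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL);
 *	...
 *	perf_event_disable(ev);		// quiesce while reconfiguring
 *	...
 *	perf_event_enable(ev);		// resume counting
 */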
2465
2466void perf_event_disable_inatomic(struct perf_event *event)
2467{
2468	event->pending_disable = 1;
2469	irq_work_queue(&event->pending_irq);
2470}
2471
2472#define MAX_INTERRUPTS (~0ULL)
2473
2474static void perf_log_throttle(struct perf_event *event, int enable);
2475static void perf_log_itrace_start(struct perf_event *event);
2476
2477static int
2478event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
2479{
2480	struct perf_event_pmu_context *epc = event->pmu_ctx;
2481	struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
2482	int ret = 0;
2483
2484	WARN_ON_ONCE(event->ctx != ctx);
2485
2486	lockdep_assert_held(&ctx->lock);
2487
2488	if (event->state <= PERF_EVENT_STATE_OFF)
2489		return 0;
2490
2491	WRITE_ONCE(event->oncpu, smp_processor_id());
2492	/*
2493	 * Order event::oncpu write to happen before the ACTIVE state is
2494	 * visible. This allows perf_event_{stop,read}() to observe the correct
2495	 * ->oncpu if it sees ACTIVE.
2496	 */
2497	smp_wmb();
2498	perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2499
2500	/*
2501	 * Unthrottle events; since we just got scheduled in we might have
2502	 * missed several ticks already, and for a heavily scheduling task
2503	 * there is little guarantee it'll get a tick in a timely manner.
2504	 */
2505	if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2506		perf_log_throttle(event, 1);
2507		event->hw.interrupts = 0;
2508	}
2509
2510	perf_pmu_disable(event->pmu);
2511
2512	perf_log_itrace_start(event);
2513
2514	if (event->pmu->add(event, PERF_EF_START)) {
2515		perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2516		event->oncpu = -1;
2517		ret = -EAGAIN;
2518		goto out;
2519	}
2520
2521	if (!is_software_event(event))
2522		cpc->active_oncpu++;
2523	if (event->attr.freq && event->attr.sample_freq)
2524		ctx->nr_freq++;
2525
2526	if (event->attr.exclusive)
2527		cpc->exclusive = 1;
2528
2529out:
2530	perf_pmu_enable(event->pmu);
2531
2532	return ret;
2533}
2534
2535static int
2536group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
2537{
2538	struct perf_event *event, *partial_group = NULL;
2539	struct pmu *pmu = group_event->pmu_ctx->pmu;
2540
2541	if (group_event->state == PERF_EVENT_STATE_OFF)
2542		return 0;
2543
2544	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2545
2546	if (event_sched_in(group_event, ctx))
2547		goto error;
2548
2549	/*
2550	 * Schedule in siblings as one group (if any):
2551	 */
2552	for_each_sibling_event(event, group_event) {
2553		if (event_sched_in(event, ctx)) {
2554			partial_group = event;
2555			goto group_error;
2556		}
2557	}
2558
2559	if (!pmu->commit_txn(pmu))
2560		return 0;
2561
2562group_error:
2563	/*
2564	 * Groups can be scheduled in as one unit only, so undo any
2565	 * partial group before returning:
2566	 * The events up to the failed event are scheduled out normally.
2567	 */
2568	for_each_sibling_event(event, group_event) {
2569		if (event == partial_group)
2570			break;
2571
2572		event_sched_out(event, ctx);
2573	}
2574	event_sched_out(group_event, ctx);
2575
2576error:
2577	pmu->cancel_txn(pmu);
2578	return -EAGAIN;
2579}
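
/*
 * The PMU transaction protocol used above, spelled out (a hedged summary
 * of group_sched_in() itself, not an extra requirement on PMU drivers):
 *
 *	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
 *	event_sched_in(leader)		// pmu->add(leader, PERF_EF_START)
 *	event_sched_in(sibling)		// likewise for every sibling
 *	pmu->commit_txn(pmu)		// 0 => the whole group is ACTIVE
 *
 * Any failure schedules the already-added events back out and ends with
 * pmu->cancel_txn(pmu); the caller sees -EAGAIN.
 */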
2580
2581/*
2582 * Work out whether we can put this event group on the CPU now.
2583 */
2584static int group_can_go_on(struct perf_event *event, int can_add_hw)
2585{
2586	struct perf_event_pmu_context *epc = event->pmu_ctx;
2587	struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
2588
2589	/*
2590	 * Groups consisting entirely of software events can always go on.
2591	 */
2592	if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2593		return 1;
2594	/*
2595	 * If an exclusive group is already on, no other hardware
2596	 * events can go on.
2597	 */
2598	if (cpc->exclusive)
2599		return 0;
2600	/*
2601	 * If this group is exclusive and there are already
2602	 * events on the CPU, it can't go on.
2603	 */
2604	if (event->attr.exclusive && !list_empty(get_event_list(event)))
2605		return 0;
2606	/*
2607	 * Otherwise, try to add it if all previous groups were able
2608	 * to go on.
2609	 */
2610	return can_add_hw;
2611}
2612
2613static void add_event_to_ctx(struct perf_event *event,
2614			       struct perf_event_context *ctx)
2615{
2616	list_add_event(event, ctx);
2617	perf_group_attach(event);
2618}
2619
2620static void task_ctx_sched_out(struct perf_event_context *ctx,
2621				enum event_type_t event_type)
2622{
2623	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
2624
2625	if (!cpuctx->task_ctx)
2626		return;
2627
2628	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2629		return;
2630
2631	ctx_sched_out(ctx, event_type);
2632}
2633
2634static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2635				struct perf_event_context *ctx)
2636{
2637	ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
2638	if (ctx)
2639		 ctx_sched_in(ctx, EVENT_PINNED);
2640	ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
2641	if (ctx)
2642		 ctx_sched_in(ctx, EVENT_FLEXIBLE);
2643}
2644
2645/*
2646 * We want to maintain the following priority of scheduling:
2647 *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
2648 *  - task pinned (EVENT_PINNED)
2649 *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2650 *  - task flexible (EVENT_FLEXIBLE).
2651 *
2652 * In order to avoid unscheduling and scheduling back in everything every
2653 * time an event is added, only do it for the groups of equal priority and
2654 * below.
2655 *
2656 * This can be called after a batch operation on task events, in which case
2657 * event_type is a bit mask of the types of events involved. For CPU events,
2658 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2659 */
2660/*
2661 * XXX: ctx_resched() reschedules the entire perf_event_context when adding a
2662 * new event to the context or enabling an existing event in it. We could
2663 * probably optimize this by rescheduling only the affected pmu_ctx.
2664 */
2665static void ctx_resched(struct perf_cpu_context *cpuctx,
2666			struct perf_event_context *task_ctx,
2667			enum event_type_t event_type)
2668{
2669	bool cpu_event = !!(event_type & EVENT_CPU);
2670
2671	/*
2672	 * If pinned groups are involved, flexible groups also need to be
2673	 * scheduled out.
2674	 */
2675	if (event_type & EVENT_PINNED)
2676		event_type |= EVENT_FLEXIBLE;
2677
2678	event_type &= EVENT_ALL;
2679
2680	perf_ctx_disable(&cpuctx->ctx);
2681	if (task_ctx) {
2682		perf_ctx_disable(task_ctx);
2683		task_ctx_sched_out(task_ctx, event_type);
2684	}
2685
2686	/*
2687	 * Decide which cpu ctx groups to schedule out based on the types
2688	 * of events that caused rescheduling:
2689	 *  - EVENT_CPU: schedule out corresponding groups;
2690	 *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2691	 *  - otherwise, do nothing more.
2692	 */
2693	if (cpu_event)
2694		ctx_sched_out(&cpuctx->ctx, event_type);
2695	else if (event_type & EVENT_PINNED)
2696		ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
2697
2698	perf_event_sched_in(cpuctx, task_ctx);
2699
2700	perf_ctx_enable(&cpuctx->ctx);
2701	if (task_ctx)
2702		perf_ctx_enable(task_ctx);
2703}
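
/*
 * Worked example (hedged): enabling a task-pinned event ends up in
 * ctx_resched(cpuctx, task_ctx, EVENT_PINNED). EVENT_FLEXIBLE is OR-ed in
 * above, the task context plus the CPU context's flexible groups are
 * scheduled out, and perf_event_sched_in() then re-adds everything in
 * cpu-pinned, task-pinned, cpu-flexible, task-flexible order.
 */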
2704
2705void perf_pmu_resched(struct pmu *pmu)
2706{
2707	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
2708	struct perf_event_context *task_ctx = cpuctx->task_ctx;
2709
2710	perf_ctx_lock(cpuctx, task_ctx);
2711	ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2712	perf_ctx_unlock(cpuctx, task_ctx);
2713}
2714
2715/*
2716 * Cross CPU call to install and enable a performance event
2717 *
2718 * Very similar to remote_function() + event_function() but cannot assume that
2719 * things like ctx->is_active and cpuctx->task_ctx are set.
2720 */
2721static int  __perf_install_in_context(void *info)
2722{
2723	struct perf_event *event = info;
2724	struct perf_event_context *ctx = event->ctx;
2725	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
2726	struct perf_event_context *task_ctx = cpuctx->task_ctx;
2727	bool reprogram = true;
2728	int ret = 0;
2729
2730	raw_spin_lock(&cpuctx->ctx.lock);
2731	if (ctx->task) {
2732		raw_spin_lock(&ctx->lock);
2733		task_ctx = ctx;
2734
2735		reprogram = (ctx->task == current);
2736
2737		/*
2738		 * If the task is running, it must be running on this CPU,
2739		 * otherwise we cannot reprogram things.
2740		 *
2741		 * If it's not running, we don't care, ctx->lock will
2742		 * serialize against it becoming runnable.
2743		 */
2744		if (task_curr(ctx->task) && !reprogram) {
2745			ret = -ESRCH;
2746			goto unlock;
2747		}
2748
2749		WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2750	} else if (task_ctx) {
2751		raw_spin_lock(&task_ctx->lock);
2752	}
2753
2754#ifdef CONFIG_CGROUP_PERF
2755	if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
2756		/*
2757		 * If the current cgroup doesn't match the event's
2758		 * cgroup, we should not try to schedule it.
2759		 */
2760		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2761		reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2762					event->cgrp->css.cgroup);
2763	}
2764#endif
2765
2766	if (reprogram) {
2767		ctx_sched_out(ctx, EVENT_TIME);
2768		add_event_to_ctx(event, ctx);
2769		ctx_resched(cpuctx, task_ctx, get_event_type(event));
2770	} else {
2771		add_event_to_ctx(event, ctx);
2772	}
2773
2774unlock:
2775	perf_ctx_unlock(cpuctx, task_ctx);
2776
2777	return ret;
2778}
2779
2780static bool exclusive_event_installable(struct perf_event *event,
2781					struct perf_event_context *ctx);
2782
2783/*
2784 * Attach a performance event to a context.
2785 *
2786 * Very similar to event_function_call, see comment there.
2787 */
2788static void
2789perf_install_in_context(struct perf_event_context *ctx,
2790			struct perf_event *event,
2791			int cpu)
2792{
2793	struct task_struct *task = READ_ONCE(ctx->task);
2794
2795	lockdep_assert_held(&ctx->mutex);
2796
2797	WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
2798
2799	if (event->cpu != -1)
2800		WARN_ON_ONCE(event->cpu != cpu);
2801
2802	/*
2803	 * Ensures that if we can observe event->ctx, both the event and ctx
2804	 * will be 'complete'. See perf_iterate_sb_cpu().
2805	 */
2806	smp_store_release(&event->ctx, ctx);
2807
2808	/*
2809	 * perf_event_attr::disabled events will not run and can be initialized
2810	 * without IPI. Except when this is the first event for the context, in
2811	 * that case we need the magic of the IPI to set ctx->is_active.
2812	 *
2813	 * The IOC_ENABLE that is sure to follow the creation of a disabled
2814	 * event will issue the IPI and reprogram the hardware.
2815	 */
2816	if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
2817	    ctx->nr_events && !is_cgroup_event(event)) {
2818		raw_spin_lock_irq(&ctx->lock);
2819		if (ctx->task == TASK_TOMBSTONE) {
2820			raw_spin_unlock_irq(&ctx->lock);
2821			return;
2822		}
2823		add_event_to_ctx(event, ctx);
2824		raw_spin_unlock_irq(&ctx->lock);
2825		return;
2826	}
2827
2828	if (!task) {
2829		cpu_function_call(cpu, __perf_install_in_context, event);
2830		return;
2831	}
2832
2833	/*
2834	 * Should not happen, we validate the ctx is still alive before calling.
2835	 */
2836	if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2837		return;
2838
2839	/*
2840	 * Installing events is tricky because we cannot rely on ctx->is_active
2841	 * to be set in case this is the nr_events 0 -> 1 transition.
2842	 *
2843	 * Instead we use task_curr(), which tells us if the task is running.
2844	 * However, since we use task_curr() outside of rq::lock, we can race
2845	 * against the actual state. This means the result can be wrong.
2846	 *
2847	 * If we get a false positive, we retry, this is harmless.
2848	 *
2849	 * If we get a false negative, things are complicated. If we are after
2850	 * perf_event_context_sched_in() ctx::lock will serialize us, and the
2851	 * value must be correct. If we're before, it doesn't matter since
2852	 * perf_event_context_sched_in() will program the counter.
2853	 *
2854	 * However, this hinges on the remote context switch having observed
2855	 * our task->perf_event_ctxp[] store, such that it will in fact take
2856	 * ctx::lock in perf_event_context_sched_in().
2857	 *
2858	 * We do this by task_function_call(), if the IPI fails to hit the task
2859	 * we know any future context switch of task must see the
2860	 * perf_event_ctxp[] store.
2861	 */
2862
2863	/*
2864	 * This smp_mb() orders the task->perf_event_ctxp[] store with the
2865	 * task_cpu() load, such that if the IPI then does not find the task
2866	 * running, a future context switch of that task must observe the
2867	 * store.
2868	 */
2869	smp_mb();
2870again:
2871	if (!task_function_call(task, __perf_install_in_context, event))
2872		return;
2873
2874	raw_spin_lock_irq(&ctx->lock);
2875	task = ctx->task;
2876	if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2877		/*
2878		 * Cannot happen because we already checked above (which also
2879		 * cannot happen), and we hold ctx->mutex, which serializes us
2880		 * against perf_event_exit_task_context().
2881		 */
2882		raw_spin_unlock_irq(&ctx->lock);
2883		return;
2884	}
2885	/*
2886	 * If the task is not running, ctx->lock will avoid it becoming so,
2887	 * thus we can safely install the event.
2888	 */
2889	if (task_curr(task)) {
2890		raw_spin_unlock_irq(&ctx->lock);
2891		goto again;
2892	}
2893	add_event_to_ctx(event, ctx);
2894	raw_spin_unlock_irq(&ctx->lock);
2895}
2896
2897/*
2898 * Cross CPU call to enable a performance event
2899 */
2900static void __perf_event_enable(struct perf_event *event,
2901				struct perf_cpu_context *cpuctx,
2902				struct perf_event_context *ctx,
2903				void *info)
2904{
2905	struct perf_event *leader = event->group_leader;
2906	struct perf_event_context *task_ctx;
2907
2908	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2909	    event->state <= PERF_EVENT_STATE_ERROR)
2910		return;
2911
2912	if (ctx->is_active)
2913		ctx_sched_out(ctx, EVENT_TIME);
2914
2915	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2916	perf_cgroup_event_enable(event, ctx);
2917
2918	if (!ctx->is_active)
2919		return;
2920
2921	if (!event_filter_match(event)) {
2922		ctx_sched_in(ctx, EVENT_TIME);
2923		return;
2924	}
2925
2926	/*
2927	 * If the event is in a group and isn't the group leader,
2928	 * then don't put it on unless the group is on.
2929	 */
2930	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2931		ctx_sched_in(ctx, EVENT_TIME);
2932		return;
2933	}
2934
2935	task_ctx = cpuctx->task_ctx;
2936	if (ctx->task)
2937		WARN_ON_ONCE(task_ctx != ctx);
2938
2939	ctx_resched(cpuctx, task_ctx, get_event_type(event));
2940}
2941
2942/*
2943 * Enable an event.
2944 *
2945 * If event->ctx is a cloned context, callers must make sure that
2946 * every task struct that event->ctx->task could possibly point to
2947 * remains valid.  This condition is satisfied when called through
2948 * perf_event_for_each_child or perf_event_for_each as described
2949 * for perf_event_disable.
2950 */
2951static void _perf_event_enable(struct perf_event *event)
2952{
2953	struct perf_event_context *ctx = event->ctx;
2954
2955	raw_spin_lock_irq(&ctx->lock);
2956	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2957	    event->state <  PERF_EVENT_STATE_ERROR) {
2958out:
2959		raw_spin_unlock_irq(&ctx->lock);
2960		return;
2961	}
2962
2963	/*
2964	 * If the event is in error state, clear that first.
2965	 *
2966	 * That way, if we see the event in error state below, we know that it
2967	 * has gone back into error state, as distinct from the task having
2968	 * been scheduled away before the cross-call arrived.
2969	 */
2970	if (event->state == PERF_EVENT_STATE_ERROR) {
2971		/*
2972		 * Detached SIBLING events cannot leave ERROR state.
2973		 */
2974		if (event->event_caps & PERF_EV_CAP_SIBLING &&
2975		    event->group_leader == event)
2976			goto out;
2977
2978		event->state = PERF_EVENT_STATE_OFF;
2979	}
2980	raw_spin_unlock_irq(&ctx->lock);
2981
2982	event_function_call(event, __perf_event_enable, NULL);
2983}
2984
2985/*
2986 * See perf_event_disable();
2987 */
2988void perf_event_enable(struct perf_event *event)
2989{
2990	struct perf_event_context *ctx;
2991
2992	ctx = perf_event_ctx_lock(event);
2993	_perf_event_enable(event);
2994	perf_event_ctx_unlock(event, ctx);
2995}
2996EXPORT_SYMBOL_GPL(perf_event_enable);
2997
2998struct stop_event_data {
2999	struct perf_event	*event;
3000	unsigned int		restart;
3001};
3002
3003static int __perf_event_stop(void *info)
3004{
3005	struct stop_event_data *sd = info;
3006	struct perf_event *event = sd->event;
3007
3008	/* if it's already INACTIVE, do nothing */
3009	if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3010		return 0;
3011
3012	/* matches smp_wmb() in event_sched_in() */
3013	smp_rmb();
3014
3015	/*
3016	 * There is a window with interrupts enabled before we get here,
3017	 * so we need to check again lest we try to stop another CPU's event.
3018	 */
3019	if (READ_ONCE(event->oncpu) != smp_processor_id())
3020		return -EAGAIN;
3021
3022	event->pmu->stop(event, PERF_EF_UPDATE);
3023
3024	/*
3025	 * May race with the actual stop (through perf_pmu_output_stop()),
3026	 * but it is only used for events with AUX ring buffer, and such
3027	 * events will refuse to restart because of rb::aux_mmap_count==0,
3028	 * see comments in perf_aux_output_begin().
3029	 *
3030	 * Since this is happening on an event-local CPU, no trace is lost
3031	 * while restarting.
3032	 */
3033	if (sd->restart)
3034		event->pmu->start(event, 0);
3035
3036	return 0;
3037}
3038
3039static int perf_event_stop(struct perf_event *event, int restart)
3040{
3041	struct stop_event_data sd = {
3042		.event		= event,
3043		.restart	= restart,
3044	};
3045	int ret = 0;
3046
3047	do {
3048		if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3049			return 0;
3050
3051		/* matches smp_wmb() in event_sched_in() */
3052		smp_rmb();
3053
3054		/*
3055		 * We only want to restart ACTIVE events, so if the event goes
3056		 * inactive here (event->oncpu==-1), there's nothing more to do;
3057		 * fall through with ret==-ENXIO.
3058		 */
3059		ret = cpu_function_call(READ_ONCE(event->oncpu),
3060					__perf_event_stop, &sd);
3061	} while (ret == -EAGAIN);
3062
3063	return ret;
3064}
3065
3066/*
3067 * In order to contain the amount of racy and tricky code in the address
3068 * filter configuration management, it is a two-part process:
3069 *
3070 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
3071 *      we update the addresses of corresponding vmas in
3072 *	event::addr_filter_ranges array and bump the event::addr_filters_gen;
3073 * (p2) when an event is scheduled in (pmu::add), it calls
3074 *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
3075 *      if the generation has changed since the previous call.
3076 *
3077 * If (p1) happens while the event is active, we restart it to force (p2).
3078 *
3079 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
3080 *     pre-existing mappings, called once when new filters arrive via SET_FILTER
3081 *     ioctl;
3082 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
3083 *     registered mapping, called for every new mmap(), with mm::mmap_lock down
3084 *     for reading;
3085 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
3086 *     of exec.
3087 */
3088void perf_event_addr_filters_sync(struct perf_event *event)
3089{
3090	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
3091
3092	if (!has_addr_filter(event))
3093		return;
3094
3095	raw_spin_lock(&ifh->lock);
3096	if (event->addr_filters_gen != event->hw.addr_filters_gen) {
3097		event->pmu->addr_filters_sync(event);
3098		event->hw.addr_filters_gen = event->addr_filters_gen;
3099	}
3100	raw_spin_unlock(&ifh->lock);
3101}
3102EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
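
/*
 * Illustrative userspace usage feeding (1) above (hedged sketch; the
 * filter string follows the documented address-filter syntax, the values
 * are hypothetical):
 *
 *	// trace only 0x1000 bytes of text at offset 0x4000 of libfoo
 *	ioctl(fd, PERF_EVENT_IOC_SET_FILTER,
 *	      "filter 0x4000/0x1000@/usr/lib/libfoo.so");
 *
 * Later mmap()s of that object go through (2), and an active event is
 * restarted so that (p2) picks up the new offsets.
 */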
3103
3104static int _perf_event_refresh(struct perf_event *event, int refresh)
3105{
3106	/*
3107	 * not supported on inherited events
3108	 */
3109	if (event->attr.inherit || !is_sampling_event(event))
3110		return -EINVAL;
3111
3112	atomic_add(refresh, &event->event_limit);
3113	_perf_event_enable(event);
3114
3115	return 0;
3116}
3117
3118/*
3119 * See perf_event_disable()
3120 */
3121int perf_event_refresh(struct perf_event *event, int refresh)
3122{
3123	struct perf_event_context *ctx;
3124	int ret;
3125
3126	ctx = perf_event_ctx_lock(event);
3127	ret = _perf_event_refresh(event, refresh);
3128	perf_event_ctx_unlock(event, ctx);
3129
3130	return ret;
3131}
3132EXPORT_SYMBOL_GPL(perf_event_refresh);
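
/*
 * Illustrative userspace usage (hedged sketch): re-arm a signal-driven
 * event for one more overflow; once that overflow fires the event is
 * disabled again and POLL_HUP is signalled.
 *
 *	ioctl(fd, PERF_EVENT_IOC_REFRESH, 1);
 */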
3133
3134static int perf_event_modify_breakpoint(struct perf_event *bp,
3135					 struct perf_event_attr *attr)
3136{
3137	int err;
3138
3139	_perf_event_disable(bp);
3140
3141	err = modify_user_hw_breakpoint_check(bp, attr, true);
3142
3143	if (!bp->attr.disabled)
3144		_perf_event_enable(bp);
3145
3146	return err;
3147}
3148
3149/*
3150 * Copy event-type-independent attributes that may be modified.
3151 */
3152static void perf_event_modify_copy_attr(struct perf_event_attr *to,
3153					const struct perf_event_attr *from)
3154{
3155	to->sig_data = from->sig_data;
3156}
3157
3158static int perf_event_modify_attr(struct perf_event *event,
3159				  struct perf_event_attr *attr)
3160{
3161	int (*func)(struct perf_event *, struct perf_event_attr *);
3162	struct perf_event *child;
3163	int err;
3164
3165	if (event->attr.type != attr->type)
3166		return -EINVAL;
3167
3168	switch (event->attr.type) {
3169	case PERF_TYPE_BREAKPOINT:
3170		func = perf_event_modify_breakpoint;
3171		break;
3172	default:
3173		/* Place holder for future additions. */
3174		return -EOPNOTSUPP;
3175	}
3176
3177	WARN_ON_ONCE(event->ctx->parent_ctx);
3178
3179	mutex_lock(&event->child_mutex);
3180	/*
3181	 * Event-type-independent attributes must be copied before event-type
3182	 * modification, which will validate that final attributes match the
3183	 * source attributes after all relevant attributes have been copied.
3184	 */
3185	perf_event_modify_copy_attr(&event->attr, attr);
3186	err = func(event, attr);
3187	if (err)
3188		goto out;
3189	list_for_each_entry(child, &event->child_list, child_list) {
3190		perf_event_modify_copy_attr(&child->attr, attr);
3191		err = func(child, attr);
3192		if (err)
3193			goto out;
3194	}
3195out:
3196	mutex_unlock(&event->child_mutex);
3197	return err;
3198}
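
/*
 * Illustrative userspace usage (hedged sketch; PERF_TYPE_BREAKPOINT is the
 * only type accepted above, the field values are hypothetical):
 *
 *	struct perf_event_attr new_attr = old_attr;
 *
 *	new_attr.bp_addr = (__u64)&watched_variable;
 *	new_attr.bp_type = HW_BREAKPOINT_W;
 *	new_attr.bp_len  = HW_BREAKPOINT_LEN_4;
 *	ioctl(fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &new_attr);
 */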
3199
3200static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
3201				enum event_type_t event_type)
3202{
3203	struct perf_event_context *ctx = pmu_ctx->ctx;
3204	struct perf_event *event, *tmp;
3205	struct pmu *pmu = pmu_ctx->pmu;
3206
3207	if (ctx->task && !ctx->is_active) {
3208		struct perf_cpu_pmu_context *cpc;
3209
3210		cpc = this_cpu_ptr(pmu->cpu_pmu_context);
3211		WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
3212		cpc->task_epc = NULL;
3213	}
3214
3215	if (!event_type)
3216		return;
3217
3218	perf_pmu_disable(pmu);
3219	if (event_type & EVENT_PINNED) {
3220		list_for_each_entry_safe(event, tmp,
3221					 &pmu_ctx->pinned_active,
3222					 active_list)
3223			group_sched_out(event, ctx);
3224	}
3225
3226	if (event_type & EVENT_FLEXIBLE) {
3227		list_for_each_entry_safe(event, tmp,
3228					 &pmu_ctx->flexible_active,
3229					 active_list)
3230			group_sched_out(event, ctx);
3231		/*
3232		 * Since we cleared EVENT_FLEXIBLE, also clear
3233		 * rotate_necessary; it will be reset by
3234		 * ctx_flexible_sched_in() when needed.
3235		 */
3236		pmu_ctx->rotate_necessary = 0;
3237	}
3238	perf_pmu_enable(pmu);
3239}
3240
3241static void
3242ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
3243{
3244	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
3245	struct perf_event_pmu_context *pmu_ctx;
3246	int is_active = ctx->is_active;
3247
3248	lockdep_assert_held(&ctx->lock);
3249
3250	if (likely(!ctx->nr_events)) {
3251		/*
3252		 * See __perf_remove_from_context().
3253		 */
3254		WARN_ON_ONCE(ctx->is_active);
3255		if (ctx->task)
3256			WARN_ON_ONCE(cpuctx->task_ctx);
3257		return;
3258	}
3259
3260	/*
3261	 * Always update time if it was set; not only when it changes.
3262	 * Otherwise we can 'forget' to update time for any but the last
3263	 * context we sched out. For example:
3264	 *
3265	 *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
3266	 *   ctx_sched_out(.event_type = EVENT_PINNED)
3267	 *
3268	 * would only update time for the pinned events.
3269	 */
3270	if (is_active & EVENT_TIME) {
3271		/* update (and stop) ctx time */
3272		update_context_time(ctx);
3273		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
3274		/*
3275		 * CPU-release for the below ->is_active store,
3276		 * see __load_acquire() in perf_event_time_now()
3277		 */
3278		barrier();
3279	}
3280
3281	ctx->is_active &= ~event_type;
3282	if (!(ctx->is_active & EVENT_ALL))
3283		ctx->is_active = 0;
3284
3285	if (ctx->task) {
3286		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3287		if (!ctx->is_active)
3288			cpuctx->task_ctx = NULL;
3289	}
3290
3291	is_active ^= ctx->is_active; /* changed bits */
3292
3293	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
3294		__pmu_ctx_sched_out(pmu_ctx, is_active);
3295}
3296
3297/*
3298 * Test whether two contexts are equivalent, i.e. whether they have both been
3299 * cloned from the same version of the same context.
3300 *
3301 * Equivalence is measured using a generation number in the context that is
3302 * incremented on each modification to it; see unclone_ctx(), list_add_event()
3303 * and list_del_event().
3304 */
3305static int context_equiv(struct perf_event_context *ctx1,
3306			 struct perf_event_context *ctx2)
3307{
3308	lockdep_assert_held(&ctx1->lock);
3309	lockdep_assert_held(&ctx2->lock);
3310
3311	/* Pinning disables the swap optimization */
3312	if (ctx1->pin_count || ctx2->pin_count)
3313		return 0;
3314
3315	/* If ctx1 is the parent of ctx2 */
3316	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
3317		return 1;
3318
3319	/* If ctx2 is the parent of ctx1 */
3320	if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3321		return 1;
3322
3323	/*
3324	 * If ctx1 and ctx2 have the same parent; we flatten the parent
3325	 * hierarchy, see perf_event_init_context().
3326	 */
3327	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3328			ctx1->parent_gen == ctx2->parent_gen)
3329		return 1;
3330
3331	/* Unmatched */
3332	return 0;
3333}
3334
3335static void __perf_event_sync_stat(struct perf_event *event,
3336				     struct perf_event *next_event)
3337{
3338	u64 value;
3339
3340	if (!event->attr.inherit_stat)
3341		return;
3342
3343	/*
3344	 * Update the event value, we cannot use perf_event_read()
3345	 * because we're in the middle of a context switch and have IRQs
3346	 * disabled, which upsets smp_call_function_single(); however,
3347	 * we know the event must be on the current CPU, therefore we
3348	 * don't need to use it.
3349	 */
3350	if (event->state == PERF_EVENT_STATE_ACTIVE)
3351		event->pmu->read(event);
3352
3353	perf_event_update_time(event);
3354
3355	/*
3356	 * In order to keep per-task stats reliable we need to flip the event
3357	 * values when we flip the contexts.
3358	 */
3359	value = local64_read(&next_event->count);
3360	value = local64_xchg(&event->count, value);
3361	local64_set(&next_event->count, value);
3362
3363	swap(event->total_time_enabled, next_event->total_time_enabled);
3364	swap(event->total_time_running, next_event->total_time_running);
3365
3366	/*
3367	 * Since we swizzled the values, update the user visible data too.
3368	 */
3369	perf_event_update_userpage(event);
3370	perf_event_update_userpage(next_event);
3371}
3372
3373static void perf_event_sync_stat(struct perf_event_context *ctx,
3374				   struct perf_event_context *next_ctx)
3375{
3376	struct perf_event *event, *next_event;
3377
3378	if (!ctx->nr_stat)
3379		return;
3380
3381	update_context_time(ctx);
3382
3383	event = list_first_entry(&ctx->event_list,
3384				   struct perf_event, event_entry);
3385
3386	next_event = list_first_entry(&next_ctx->event_list,
3387					struct perf_event, event_entry);
3388
3389	while (&event->event_entry != &ctx->event_list &&
3390	       &next_event->event_entry != &next_ctx->event_list) {
3391
3392		__perf_event_sync_stat(event, next_event);
3393
3394		event = list_next_entry(event, event_entry);
3395		next_event = list_next_entry(next_event, event_entry);
3396	}
3397}
3398
3399#define double_list_for_each_entry(pos1, pos2, head1, head2, member)	\
3400	for (pos1 = list_first_entry(head1, typeof(*pos1), member),	\
3401	     pos2 = list_first_entry(head2, typeof(*pos2), member);	\
3402	     !list_entry_is_head(pos1, head1, member) &&		\
3403	     !list_entry_is_head(pos2, head2, member);			\
3404	     pos1 = list_next_entry(pos1, member),			\
3405	     pos2 = list_next_entry(pos2, member))
3406
3407static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
3408					  struct perf_event_context *next_ctx)
3409{
3410	struct perf_event_pmu_context *prev_epc, *next_epc;
3411
3412	if (!prev_ctx->nr_task_data)
3413		return;
3414
3415	double_list_for_each_entry(prev_epc, next_epc,
3416				   &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list,
3417				   pmu_ctx_entry) {
3418
3419		if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu))
3420			continue;
3421
3422		/*
3423		 * PMU specific parts of the task perf context can require
3424		 * additional synchronization. As an example of such
3425		 * synchronization, see the implementation details of Intel
3426		 * LBR call stack data profiling.
3427		 */
3428		if (prev_epc->pmu->swap_task_ctx)
3429			prev_epc->pmu->swap_task_ctx(prev_epc, next_epc);
3430		else
3431			swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
3432	}
3433}
3434
3435static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in)
3436{
3437	struct perf_event_pmu_context *pmu_ctx;
3438	struct perf_cpu_pmu_context *cpc;
3439
3440	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
3441		cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
3442
3443		if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
3444			pmu_ctx->pmu->sched_task(pmu_ctx, sched_in);
3445	}
3446}
3447
3448static void
3449perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
3450{
3451	struct perf_event_context *ctx = task->perf_event_ctxp;
3452	struct perf_event_context *next_ctx;
3453	struct perf_event_context *parent, *next_parent;
3454	int do_switch = 1;
3455
3456	if (likely(!ctx))
3457		return;
3458
3459	rcu_read_lock();
3460	next_ctx = rcu_dereference(next->perf_event_ctxp);
3461	if (!next_ctx)
3462		goto unlock;
3463
3464	parent = rcu_dereference(ctx->parent_ctx);
3465	next_parent = rcu_dereference(next_ctx->parent_ctx);
3466
3467	/* If neither context has a parent context, they cannot be clones. */
3468	if (!parent && !next_parent)
3469		goto unlock;
3470
3471	if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3472		/*
3473		 * Looks like the two contexts are clones, so we might be
3474		 * able to optimize the context switch.  We lock both
3475		 * contexts and check that they are clones under the
3476		 * lock (including re-checking that neither has been
3477		 * uncloned in the meantime).  It doesn't matter which
3478		 * order we take the locks because no other cpu could
3479		 * be trying to lock both of these tasks.
3480		 */
3481		raw_spin_lock(&ctx->lock);
3482		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3483		if (context_equiv(ctx, next_ctx)) {
3484
3485			perf_ctx_disable(ctx);
3486
3487			/* PMIs are disabled; ctx->nr_pending is stable. */
3488			if (local_read(&ctx->nr_pending) ||
3489			    local_read(&next_ctx->nr_pending)) {
3490				/*
3491				 * Must not swap out ctx when there's pending
3492				 * events that rely on the ctx->task relation.
3493				 */
3494				raw_spin_unlock(&next_ctx->lock);
3495				rcu_read_unlock();
3496				goto inside_switch;
3497			}
3498
3499			WRITE_ONCE(ctx->task, next);
3500			WRITE_ONCE(next_ctx->task, task);
3501
3502			perf_ctx_sched_task_cb(ctx, false);
3503			perf_event_swap_task_ctx_data(ctx, next_ctx);
3504
3505			perf_ctx_enable(ctx);
3506
3507			/*
3508			 * RCU_INIT_POINTER here is safe because we've not
3509			 * modified the ctx and the above modification of
3510			 * ctx->task and ctx->task_ctx_data are immaterial
3511			 * since those values are always verified under
3512			 * ctx->lock which we're now holding.
3513			 */
3514			RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
3515			RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
3516
3517			do_switch = 0;
3518
3519			perf_event_sync_stat(ctx, next_ctx);
3520		}
3521		raw_spin_unlock(&next_ctx->lock);
3522		raw_spin_unlock(&ctx->lock);
3523	}
3524unlock:
3525	rcu_read_unlock();
3526
3527	if (do_switch) {
3528		raw_spin_lock(&ctx->lock);
3529		perf_ctx_disable(ctx);
3530
3531inside_switch:
3532		perf_ctx_sched_task_cb(ctx, false);
3533		task_ctx_sched_out(ctx, EVENT_ALL);
3534
3535		perf_ctx_enable(ctx);
3536		raw_spin_unlock(&ctx->lock);
3537	}
3538}
3539
3540static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3541static DEFINE_PER_CPU(int, perf_sched_cb_usages);
3542
3543void perf_sched_cb_dec(struct pmu *pmu)
3544{
3545	struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
3546
3547	this_cpu_dec(perf_sched_cb_usages);
3548	barrier();
3549
3550	if (!--cpc->sched_cb_usage)
3551		list_del(&cpc->sched_cb_entry);
3552}
3553
3555void perf_sched_cb_inc(struct pmu *pmu)
3556{
3557	struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
3558
3559	if (!cpc->sched_cb_usage++)
3560		list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3561
3562	barrier();
3563	this_cpu_inc(perf_sched_cb_usages);
3564}
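
/*
 * Illustrative PMU-driver usage (hedged sketch; the x86 PEBS/LBR code is
 * the real user, my_pmu_add()/needs_sched_cb() are hypothetical names):
 *
 *	static void my_pmu_add(struct perf_event *event, int flags)
 *	{
 *		...
 *		if (needs_sched_cb(event))
 *			perf_sched_cb_inc(event->pmu);
 *	}
 *
 * with a matching perf_sched_cb_dec(event->pmu) in the pmu::del() path, so
 * the callback below only runs while such events are scheduled in.
 */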
3565
3566/*
3567 * This function provides the context switch callback to the lower code
3568 * layer. It is invoked ONLY when the context switch callback is enabled.
3569 *
3570 * This callback is relevant even to per-cpu events; for example multi event
3571 * PEBS requires this to provide PID/TID information. This requires we flush
3572 * all queued PEBS records before we context switch to a new task.
3573 */
3574static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in)
3575{
3576	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
3577	struct pmu *pmu;
3578
3579	pmu = cpc->epc.pmu;
3580
3581	/* software PMUs will not have sched_task */
3582	if (WARN_ON_ONCE(!pmu->sched_task))
3583		return;
3584
3585	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3586	perf_pmu_disable(pmu);
3587
3588	pmu->sched_task(cpc->task_epc, sched_in);
3589
3590	perf_pmu_enable(pmu);
3591	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3592}
3593
3594static void perf_pmu_sched_task(struct task_struct *prev,
3595				struct task_struct *next,
3596				bool sched_in)
3597{
3598	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
3599	struct perf_cpu_pmu_context *cpc;
3600
3601	/* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
3602	if (prev == next || cpuctx->task_ctx)
3603		return;
3604
3605	list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
3606		__perf_pmu_sched_task(cpc, sched_in);
3607}
3608
3609static void perf_event_switch(struct task_struct *task,
3610			      struct task_struct *next_prev, bool sched_in);
3611
3612/*
3613 * Called from scheduler to remove the events of the current task,
3614 * with interrupts disabled.
3615 *
3616 * We stop each event and update the event value in event->count.
3617 *
3618 * This does not protect us against NMI, but disable()
3619 * sets the disabled bit in the control field of event _before_
3620 * accessing the event control register. If an NMI hits, then it will
3621 * not restart the event.
3622 */
3623void __perf_event_task_sched_out(struct task_struct *task,
3624				 struct task_struct *next)
3625{
3626	if (__this_cpu_read(perf_sched_cb_usages))
3627		perf_pmu_sched_task(task, next, false);
3628
3629	if (atomic_read(&nr_switch_events))
3630		perf_event_switch(task, next, false);
3631
3632	perf_event_context_sched_out(task, next);
3633
3634	/*
3635	 * if cgroup events exist on this CPU, then we need
3636	 * to check if we have to switch out PMU state.
3637	 * cgroup event are system-wide mode only
3638	 */
3639	perf_cgroup_switch(next);
3640}
3641
3642static bool perf_less_group_idx(const void *l, const void *r)
3643{
3644	const struct perf_event *le = *(const struct perf_event **)l;
3645	const struct perf_event *re = *(const struct perf_event **)r;
3646
3647	return le->group_index < re->group_index;
3648}
3649
3650static void swap_ptr(void *l, void *r)
3651{
3652	void **lp = l, **rp = r;
3653
3654	swap(*lp, *rp);
3655}
3656
3657static const struct min_heap_callbacks perf_min_heap = {
3658	.elem_size = sizeof(struct perf_event *),
3659	.less = perf_less_group_idx,
3660	.swp = swap_ptr,
3661};
3662
3663static void __heap_add(struct min_heap *heap, struct perf_event *event)
3664{
3665	struct perf_event **itrs = heap->data;
3666
3667	if (event) {
3668		itrs[heap->nr] = event;
3669		heap->nr++;
3670	}
3671}
3672
3673static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
3674{
3675	struct perf_cpu_pmu_context *cpc;
3676
3677	if (!pmu_ctx->ctx->task)
3678		return;
3679
3680	cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
3681	WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
3682	cpc->task_epc = pmu_ctx;
3683}
3684
3685static noinline int visit_groups_merge(struct perf_event_context *ctx,
3686				struct perf_event_groups *groups, int cpu,
3687				struct pmu *pmu,
3688				int (*func)(struct perf_event *, void *),
3689				void *data)
3690{
3691#ifdef CONFIG_CGROUP_PERF
3692	struct cgroup_subsys_state *css = NULL;
3693#endif
3694	struct perf_cpu_context *cpuctx = NULL;
3695	/* Space for per CPU and/or any CPU event iterators. */
3696	struct perf_event *itrs[2];
3697	struct min_heap event_heap;
3698	struct perf_event **evt;
3699	int ret;
3700
3701	if (pmu->filter && pmu->filter(pmu, cpu))
3702		return 0;
3703
3704	if (!ctx->task) {
3705		cpuctx = this_cpu_ptr(&perf_cpu_context);
3706		event_heap = (struct min_heap){
3707			.data = cpuctx->heap,
3708			.nr = 0,
3709			.size = cpuctx->heap_size,
3710		};
3711
3712		lockdep_assert_held(&cpuctx->ctx.lock);
3713
3714#ifdef CONFIG_CGROUP_PERF
3715		if (cpuctx->cgrp)
3716			css = &cpuctx->cgrp->css;
3717#endif
3718	} else {
3719		event_heap = (struct min_heap){
3720			.data = itrs,
3721			.nr = 0,
3722			.size = ARRAY_SIZE(itrs),
3723		};
3724		/* Events not within a CPU context may be on any CPU. */
3725		__heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
3726	}
3727	evt = event_heap.data;
3728
3729	__heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));
3730
3731#ifdef CONFIG_CGROUP_PERF
3732	for (; css; css = css->parent)
3733		__heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
3734#endif
3735
3736	if (event_heap.nr) {
3737		__link_epc((*evt)->pmu_ctx);
3738		perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
3739	}
3740
3741	min_heapify_all(&event_heap, &perf_min_heap);
3742
3743	while (event_heap.nr) {
3744		ret = func(*evt, data);
3745		if (ret)
3746			return ret;
3747
3748		*evt = perf_event_groups_next(*evt, pmu);
3749		if (*evt)
3750			min_heapify(&event_heap, 0, &perf_min_heap);
3751		else
3752			min_heap_pop(&event_heap, &perf_min_heap);
3753	}
3754
3755	return 0;
3756}
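
/*
 * Illustrative walk (hedged): for a task context running on CPU1 the heap
 * starts with at most two iterators,
 *
 *	{ first group with cpu == -1, first group with cpu == 1 }
 *
 * ordered by group_index. Every iteration calls func() on the smallest
 * iterator and then advances or pops it, so the groups of all sub-trees
 * (plus one iterator per cgroup ancestor for CPU contexts) are visited in
 * ascending group_index order.
 */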
3757
3758/*
3759 * Because the userpage is strictly per-event (there is no concept of context,
3760 * so there cannot be a context indirection), every userpage must be updated
3761 * when context time starts :-(
3762 *
3763 * IOW, we must not miss EVENT_TIME edges.
3764 */
3765static inline bool event_update_userpage(struct perf_event *event)
3766{
3767	if (likely(!atomic_read(&event->mmap_count)))
3768		return false;
3769
3770	perf_event_update_time(event);
3771	perf_event_update_userpage(event);
3772
3773	return true;
3774}
3775
3776static inline void group_update_userpage(struct perf_event *group_event)
3777{
3778	struct perf_event *event;
3779
3780	if (!event_update_userpage(group_event))
3781		return;
3782
3783	for_each_sibling_event(event, group_event)
3784		event_update_userpage(event);
3785}
3786
3787static int merge_sched_in(struct perf_event *event, void *data)
3788{
3789	struct perf_event_context *ctx = event->ctx;
3790	int *can_add_hw = data;
3791
3792	if (event->state <= PERF_EVENT_STATE_OFF)
3793		return 0;
3794
3795	if (!event_filter_match(event))
3796		return 0;
3797
3798	if (group_can_go_on(event, *can_add_hw)) {
3799		if (!group_sched_in(event, ctx))
3800			list_add_tail(&event->active_list, get_event_list(event));
3801	}
3802
3803	if (event->state == PERF_EVENT_STATE_INACTIVE) {
3804		*can_add_hw = 0;
3805		if (event->attr.pinned) {
3806			perf_cgroup_event_disable(event, ctx);
3807			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3808		} else {
3809			struct perf_cpu_pmu_context *cpc;
3810
3811			event->pmu_ctx->rotate_necessary = 1;
3812			cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context);
3813			perf_mux_hrtimer_restart(cpc);
3814			group_update_userpage(event);
3815		}
3816	}
3817
3818	return 0;
3819}
3820
3821static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
3822{
3823	struct perf_event_pmu_context *pmu_ctx;
3824	int can_add_hw = 1;
3825
3826	if (pmu) {
3827		visit_groups_merge(ctx, &ctx->pinned_groups,
3828				   smp_processor_id(), pmu,
3829				   merge_sched_in, &can_add_hw);
3830	} else {
3831		list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
3832			can_add_hw = 1;
3833			visit_groups_merge(ctx, &ctx->pinned_groups,
3834					   smp_processor_id(), pmu_ctx->pmu,
3835					   merge_sched_in, &can_add_hw);
3836		}
3837	}
3838}
3839
3840static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
3841{
3842	struct perf_event_pmu_context *pmu_ctx;
3843	int can_add_hw = 1;
3844
3845	if (pmu) {
3846		visit_groups_merge(ctx, &ctx->flexible_groups,
3847				   smp_processor_id(), pmu,
3848				   merge_sched_in, &can_add_hw);
3849	} else {
3850		list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
3851			can_add_hw = 1;
3852			visit_groups_merge(ctx, &ctx->flexible_groups,
3853					   smp_processor_id(), pmu_ctx->pmu,
3854					   merge_sched_in, &can_add_hw);
3855		}
3856	}
3857}
3858
3859static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
3860{
3861	ctx_flexible_sched_in(ctx, pmu);
3862}
3863
3864static void
3865ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
3866{
3867	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
3868	int is_active = ctx->is_active;
3869
3870	lockdep_assert_held(&ctx->lock);
3871
3872	if (likely(!ctx->nr_events))
3873		return;
3874
3875	if (!(is_active & EVENT_TIME)) {
3876		/* start ctx time */
3877		__update_context_time(ctx, false);
3878		perf_cgroup_set_timestamp(cpuctx);
3879		/*
3880		 * CPU-release for the below ->is_active store,
3881		 * see __load_acquire() in perf_event_time_now()
3882		 */
3883		barrier();
3884	}
3885
3886	ctx->is_active |= (event_type | EVENT_TIME);
3887	if (ctx->task) {
3888		if (!is_active)
3889			cpuctx->task_ctx = ctx;
3890		else
3891			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3892	}
3893
3894	is_active ^= ctx->is_active; /* changed bits */
3895
3896	/*
3897	 * First go through the list and put on any pinned groups
3898	 * in order to give them the best chance of going on.
3899	 */
3900	if (is_active & EVENT_PINNED)
3901		ctx_pinned_sched_in(ctx, NULL);
3902
3903	/* Then walk through the lower prio flexible groups */
3904	if (is_active & EVENT_FLEXIBLE)
3905		ctx_flexible_sched_in(ctx, NULL);
3906}
3907
3908static void perf_event_context_sched_in(struct task_struct *task)
3909{
3910	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
3911	struct perf_event_context *ctx;
3912
3913	rcu_read_lock();
3914	ctx = rcu_dereference(task->perf_event_ctxp);
3915	if (!ctx)
3916		goto rcu_unlock;
3917
3918	if (cpuctx->task_ctx == ctx) {
3919		perf_ctx_lock(cpuctx, ctx);
3920		perf_ctx_disable(ctx);
3921
3922		perf_ctx_sched_task_cb(ctx, true);
3923
3924		perf_ctx_enable(ctx);
3925		perf_ctx_unlock(cpuctx, ctx);
3926		goto rcu_unlock;
3927	}
3928
3929	perf_ctx_lock(cpuctx, ctx);
3930	/*
3931	 * We must check ctx->nr_events while holding ctx->lock, such
3932	 * that we serialize against perf_install_in_context().
3933	 */
3934	if (!ctx->nr_events)
3935		goto unlock;
3936
3937	perf_ctx_disable(ctx);
3938	/*
3939	 * We want to keep the following priority order:
3940	 * cpu pinned (that don't need to move), task pinned,
3941	 * cpu flexible, task flexible.
3942	 *
3943	 * However, if task's ctx is not carrying any pinned
3944	 * events, no need to flip the cpuctx's events around.
3945	 */
3946	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
3947		perf_ctx_disable(&cpuctx->ctx);
3948		ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
3949	}
3950
3951	perf_event_sched_in(cpuctx, ctx);
3952
3953	perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
3954
3955	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3956		perf_ctx_enable(&cpuctx->ctx);
3957
3958	perf_ctx_enable(ctx);
3959
3960unlock:
3961	perf_ctx_unlock(cpuctx, ctx);
3962rcu_unlock:
3963	rcu_read_unlock();
3964}
3965
3966/*
3967 * Called from scheduler to add the events of the current task
3968 * with interrupts disabled.
3969 *
3970 * We restore the event value and then enable it.
3971 *
3972 * This does not protect us against NMI, but enable()
3973 * sets the enabled bit in the control field of the event _before_
3974 * accessing the event control register. If an NMI hits, then it will
3975 * keep the event running.
3976 */
3977void __perf_event_task_sched_in(struct task_struct *prev,
3978				struct task_struct *task)
3979{
3980	perf_event_context_sched_in(task);
3981
3982	if (atomic_read(&nr_switch_events))
3983		perf_event_switch(task, prev, true);
3984
3985	if (__this_cpu_read(perf_sched_cb_usages))
3986		perf_pmu_sched_task(prev, task, true);
3987}
3988
3989static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3990{
3991	u64 frequency = event->attr.sample_freq;
3992	u64 sec = NSEC_PER_SEC;
3993	u64 divisor, dividend;
3994
3995	int count_fls, nsec_fls, frequency_fls, sec_fls;
3996
3997	count_fls = fls64(count);
3998	nsec_fls = fls64(nsec);
3999	frequency_fls = fls64(frequency);
4000	sec_fls = 30;
4001
4002	/*
4003	 * We got @count in @nsec, with a target of sample_freq HZ
4004	 * the target period becomes:
4005	 *
4006	 *             @count * 10^9
4007	 * period = -------------------
4008	 *          @nsec * sample_freq
4009	 *
4010	 */
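
	/*
	 * For example (illustrative numbers): counting 1,000,000 events in
	 * 1,000,000 ns against a target sample_freq of 4000 HZ gives
	 *
	 *   period = 1,000,000 * 10^9 / (1,000,000 * 4000) = 250,000
	 *
	 * events per sample.
	 */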
4011
4012	/*
4013	 * Reduce accuracy by one bit such that @a and @b converge
4014	 * to a similar magnitude.
4015	 */
4016#define REDUCE_FLS(a, b)		\
4017do {					\
4018	if (a##_fls > b##_fls) {	\
4019		a >>= 1;		\
4020		a##_fls--;		\
4021	} else {			\
4022		b >>= 1;		\
4023		b##_fls--;		\
4024	}				\
4025} while (0)
4026
4027	/*
4028	 * Reduce accuracy until either term fits in a u64, then proceed with
4029	 * the other, so that finally we can do a u64/u64 division.
4030	 */
4031	while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
4032		REDUCE_FLS(nsec, frequency);
4033		REDUCE_FLS(sec, count);
4034	}
4035
4036	if (count_fls + sec_fls > 64) {
4037		divisor = nsec * frequency;
4038
4039		while (count_fls + sec_fls > 64) {
4040			REDUCE_FLS(count, sec);
4041			divisor >>= 1;
4042		}
4043
4044		dividend = count * sec;
4045	} else {
4046		dividend = count * sec;
4047
4048		while (nsec_fls + frequency_fls > 64) {
4049			REDUCE_FLS(nsec, frequency);
4050			dividend >>= 1;
4051		}
4052
4053		divisor = nsec * frequency;
4054	}
4055
4056	if (!divisor)
4057		return dividend;
4058
4059	return div64_u64(dividend, divisor);
4060}
4061
4062static DEFINE_PER_CPU(int, perf_throttled_count);
4063static DEFINE_PER_CPU(u64, perf_throttled_seq);
4064
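/*
 * Nudge hwc->sample_period towards the period that would have produced
 * @count events in @nsec at the requested sample_freq.  Only one eighth of
 * the difference is applied per call (a simple low-pass filter).  If the
 * remaining period_left is more than 8 new periods, reset it so the new
 * period takes effect without waiting for the stale one to expire.
 */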
4065static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
4066{
4067	struct hw_perf_event *hwc = &event->hw;
4068	s64 period, sample_period;
4069	s64 delta;
4070
4071	period = perf_calculate_period(event, nsec, count);
4072
4073	delta = (s64)(period - hwc->sample_period);
4074	delta = (delta + 7) / 8; /* low pass filter */
4075
4076	sample_period = hwc->sample_period + delta;
4077
4078	if (!sample_period)
4079		sample_period = 1;
4080
4081	hwc->sample_period = sample_period;
4082
4083	if (local64_read(&hwc->period_left) > 8*sample_period) {
4084		if (disable)
4085			event->pmu->stop(event, PERF_EF_UPDATE);
4086
4087		local64_set(&hwc->period_left, 0);
4088
4089		if (disable)
4090			event->pmu->start(event, PERF_EF_RELOAD);
4091	}
4092}
4093
4094/*
4095 * Combine frequency adjustment with unthrottling to avoid two passes over
4096 * the events. At the same time, make sure that having freq events does not
4097 * change the rate of unthrottling, as that would introduce bias.
4098 */
4099static void
4100perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
4101{
4102	struct perf_event *event;
4103	struct hw_perf_event *hwc;
4104	u64 now, period = TICK_NSEC;
4105	s64 delta;
4106
4107	/*
4108	 * We only need to iterate over all events if:
4109	 * - the context has events in frequency mode (needs freq adjust)
4110	 * - there are events to unthrottle on this CPU
4111	 */
4112	if (!(ctx->nr_freq || unthrottle))
4113		return;
4114
4115	raw_spin_lock(&ctx->lock);
4116
4117	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4118		if (event->state != PERF_EVENT_STATE_ACTIVE)
4119			continue;
4120
4121		// XXX use visit thingy to avoid the -1,cpu match
4122		if (!event_filter_match(event))
4123			continue;
4124
4125		perf_pmu_disable(event->pmu);
4126
4127		hwc = &event->hw;
4128
4129		if (hwc->interrupts == MAX_INTERRUPTS) {
4130			hwc->interrupts = 0;
4131			perf_log_throttle(event, 1);
4132			event->pmu->start(event, 0);
4133		}
4134
4135		if (!event->attr.freq || !event->attr.sample_freq)
4136			goto next;
4137
4138		/*
4139		 * stop the event and update event->count
4140		 */
4141		event->pmu->stop(event, PERF_EF_UPDATE);
4142
4143		now = local64_read(&event->count);
4144		delta = now - hwc->freq_count_stamp;
4145		hwc->freq_count_stamp = now;
4146
4147		/*
4148		 * Restart the event;
4149		 * reload only if the value has changed.
4150		 * We have already stopped the event, so tell
4151		 * perf_adjust_period() not to stop it
4152		 * a second time.
4153		 */
4154		if (delta > 0)
4155			perf_adjust_period(event, period, delta, false);
4156
4157		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
4158	next:
4159		perf_pmu_enable(event->pmu);
4160	}
4161
4162	raw_spin_unlock(&ctx->lock);
4163}
4164
4165/*
4166 * Move @event to the tail of @ctx's eligible events.
4167 */
4168static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
4169{
4170	/*
4171	 * Rotate the first entry last of non-pinned groups. Rotation might be
4172	 * disabled by the inheritance code.
4173	 */
4174	if (ctx->rotate_disable)
4175		return;
4176
4177	perf_event_groups_delete(&ctx->flexible_groups, event);
4178	perf_event_groups_insert(&ctx->flexible_groups, event);
4179}
4180
4181/* pick an event from the flexible_groups to rotate */
4182static inline struct perf_event *
4183ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
4184{
4185	struct perf_event *event;
4186	struct rb_node *node;
4187	struct rb_root *tree;
4188	struct __group_key key = {
4189		.pmu = pmu_ctx->pmu,
4190	};
4191
4192	/* pick the first active flexible event */
4193	event = list_first_entry_or_null(&pmu_ctx->flexible_active,
4194					 struct perf_event, active_list);
4195	if (event)
4196		goto out;
4197
4198	/* if no active flexible event, pick the first event */
4199	tree = &pmu_ctx->ctx->flexible_groups.tree;
4200
4201	if (!pmu_ctx->ctx->task) {
4202		key.cpu = smp_processor_id();
4203
4204		node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
4205		if (node)
4206			event = __node_2_pe(node);
4207		goto out;
4208	}
4209
4210	key.cpu = -1;
4211	node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
4212	if (node) {
4213		event = __node_2_pe(node);
4214		goto out;
4215	}
4216
4217	key.cpu = smp_processor_id();
4218	node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
4219	if (node)
4220		event = __node_2_pe(node);
4221
4222out:
4223	/*
4224	 * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
4225	 * finds there are unschedulable events, it will set it again.
4226	 */
4227	pmu_ctx->rotate_necessary = 0;
4228
4229	return event;
4230}
4231
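/*
 * Multiplexing rotation: if the CPU and/or task PMU context was marked
 * rotate_necessary (some flexible events could not be scheduled), pick an
 * event, schedule the flexible groups out, move that event to the tail of
 * its group tree and schedule back in, so that a different subset of events
 * gets PMU time.  Returns true if a rotation was performed.
 */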
4232static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
4233{
4234	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
4235	struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
4236	struct perf_event *cpu_event = NULL, *task_event = NULL;
4237	int cpu_rotate, task_rotate;
4238	struct pmu *pmu;
4239
4240	/*
4241	 * Since we run this from IRQ context, nobody can install new
4242	 * events, thus the event count values are stable.
4243	 */
4244
4245	cpu_epc = &cpc->epc;
4246	pmu = cpu_epc->pmu;
4247	task_epc = cpc->task_epc;
4248
4249	cpu_rotate = cpu_epc->rotate_necessary;
4250	task_rotate = task_epc ? task_epc->rotate_necessary : 0;
4251
4252	if (!(cpu_rotate || task_rotate))
4253		return false;
4254
4255	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
4256	perf_pmu_disable(pmu);
4257
4258	if (task_rotate)
4259		task_event = ctx_event_to_rotate(task_epc);
4260	if (cpu_rotate)
4261		cpu_event = ctx_event_to_rotate(cpu_epc);
4262
4263	/*
4264	 * As per the order given at ctx_resched(), first 'pop' the task flexible
4265	 * events and then, if needed, the CPU flexible ones.
4266	 */
4267	if (task_event || (task_epc && cpu_event)) {
4268		update_context_time(task_epc->ctx);
4269		__pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
4270	}
4271
4272	if (cpu_event) {
4273		update_context_time(&cpuctx->ctx);
4274		__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
4275		rotate_ctx(&cpuctx->ctx, cpu_event);
4276		__pmu_ctx_sched_in(&cpuctx->ctx, pmu);
4277	}
4278
4279	if (task_event)
4280		rotate_ctx(task_epc->ctx, task_event);
4281
4282	if (task_event || (task_epc && cpu_event))
4283		__pmu_ctx_sched_in(task_epc->ctx, pmu);
4284
4285	perf_pmu_enable(pmu);
4286	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
4287
4288	return true;
4289}
4290
4291void perf_event_task_tick(void)
4292{
4293	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
4294	struct perf_event_context *ctx;
4295	int throttled;
4296
4297	lockdep_assert_irqs_disabled();
4298
4299	__this_cpu_inc(perf_throttled_seq);
4300	throttled = __this_cpu_xchg(perf_throttled_count, 0);
4301	tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
4302
4303	perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);
4304
4305	rcu_read_lock();
4306	ctx = rcu_dereference(current->perf_event_ctxp);
4307	if (ctx)
4308		perf_adjust_freq_unthr_context(ctx, !!throttled);
4309	rcu_read_unlock();
4310}
4311
4312static int event_enable_on_exec(struct perf_event *event,
4313				struct perf_event_context *ctx)
4314{
4315	if (!event->attr.enable_on_exec)
4316		return 0;
4317
4318	event->attr.enable_on_exec = 0;
4319	if (event->state >= PERF_EVENT_STATE_INACTIVE)
4320		return 0;
4321
4322	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
4323
4324	return 1;
4325}
4326
4327/*
4328 * Enable all of a task's events that have been marked enable-on-exec.
4329 * This expects task == current.
4330 */
4331static void perf_event_enable_on_exec(struct perf_event_context *ctx)
4332{
4333	struct perf_event_context *clone_ctx = NULL;
4334	enum event_type_t event_type = 0;
4335	struct perf_cpu_context *cpuctx;
4336	struct perf_event *event;
4337	unsigned long flags;
4338	int enabled = 0;
4339
4340	local_irq_save(flags);
4341	if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
4342		goto out;
4343
4344	if (!ctx->nr_events)
4345		goto out;
4346
4347	cpuctx = this_cpu_ptr(&perf_cpu_context);
4348	perf_ctx_lock(cpuctx, ctx);
4349	ctx_sched_out(ctx, EVENT_TIME);
4350
4351	list_for_each_entry(event, &ctx->event_list, event_entry) {
4352		enabled |= event_enable_on_exec(event, ctx);
4353		event_type |= get_event_type(event);
4354	}
4355
4356	/*
4357	 * Unclone and reschedule this context if we enabled any event.
4358	 */
4359	if (enabled) {
4360		clone_ctx = unclone_ctx(ctx);
4361		ctx_resched(cpuctx, ctx, event_type);
4362	} else {
4363		ctx_sched_in(ctx, EVENT_TIME);
4364	}
4365	perf_ctx_unlock(cpuctx, ctx);
4366
4367out:
4368	local_irq_restore(flags);
4369
4370	if (clone_ctx)
4371		put_ctx(clone_ctx);
4372}
4373
4374static void perf_remove_from_owner(struct perf_event *event);
4375static void perf_event_exit_event(struct perf_event *event,
4376				  struct perf_event_context *ctx);
4377
4378/*
4379 * Removes all events from the current task that have been marked
4380 * remove-on-exec, and feeds their values back to parent events.
4381 */
4382static void perf_event_remove_on_exec(struct perf_event_context *ctx)
4383{
4384	struct perf_event_context *clone_ctx = NULL;
4385	struct perf_event *event, *next;
4386	unsigned long flags;
4387	bool modified = false;
4388
4389	mutex_lock(&ctx->mutex);
4390
4391	if (WARN_ON_ONCE(ctx->task != current))
4392		goto unlock;
4393
4394	list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
4395		if (!event->attr.remove_on_exec)
4396			continue;
4397
4398		if (!is_kernel_event(event))
4399			perf_remove_from_owner(event);
4400
4401		modified = true;
4402
4403		perf_event_exit_event(event, ctx);
4404	}
4405
4406	raw_spin_lock_irqsave(&ctx->lock, flags);
4407	if (modified)
4408		clone_ctx = unclone_ctx(ctx);
4409	raw_spin_unlock_irqrestore(&ctx->lock, flags);
4410
4411unlock:
4412	mutex_unlock(&ctx->mutex);
4413
4414	if (clone_ctx)
4415		put_ctx(clone_ctx);
4416}
4417
4418struct perf_read_data {
4419	struct perf_event *event;
4420	bool group;
4421	int ret;
4422};
4423
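/*
 * Events with PERF_EV_CAP_READ_ACTIVE_PKG can be read from any CPU in the
 * same package as the CPU they are active on; if that is the local package,
 * prefer the local CPU to avoid a cross-CPU IPI.
 */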
4424static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
4425{
4426	u16 local_pkg, event_pkg;
4427
4428	if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
4429		int local_cpu = smp_processor_id();
4430
4431		event_pkg = topology_physical_package_id(event_cpu);
4432		local_pkg = topology_physical_package_id(local_cpu);
4433
4434		if (event_pkg == local_pkg)
4435			return local_cpu;
4436	}
4437
4438	return event_cpu;
4439}
4440
4441/*
4442 * Cross CPU call to read the hardware event
4443 */
4444static void __perf_event_read(void *info)
4445{
4446	struct perf_read_data *data = info;
4447	struct perf_event *sub, *event = data->event;
4448	struct perf_event_context *ctx = event->ctx;
4449	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
4450	struct pmu *pmu = event->pmu;
4451
4452	/*
4453	 * If this is a task context, we need to check whether it is
4454	 * the current task context of this CPU.  If not, it has been
4455	 * scheduled out before the smp call arrived.  In that case
4456	 * event->count would have been updated to a recent sample
4457	 * when the event was scheduled out.
4458	 */
4459	if (ctx->task && cpuctx->task_ctx != ctx)
4460		return;
4461
4462	raw_spin_lock(&ctx->lock);
4463	if (ctx->is_active & EVENT_TIME) {
4464		update_context_time(ctx);
4465		update_cgrp_time_from_event(event);
4466	}
4467
4468	perf_event_update_time(event);
4469	if (data->group)
4470		perf_event_update_sibling_time(event);
4471
4472	if (event->state != PERF_EVENT_STATE_ACTIVE)
4473		goto unlock;
4474
4475	if (!data->group) {
4476		pmu->read(event);
4477		data->ret = 0;
4478		goto unlock;
4479	}
4480
4481	pmu->start_txn(pmu, PERF_PMU_TXN_READ);
4482
4483	pmu->read(event);
4484
4485	for_each_sibling_event(sub, event) {
4486		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
4487			/*
4488			 * Use the sibling's PMU rather than @event's, since the
4489			 * sibling could be on a different (e.g. software) PMU.
4490			 */
4491			sub->pmu->read(sub);
4492		}
4493	}
4494
4495	data->ret = pmu->commit_txn(pmu);
4496
4497unlock:
4498	raw_spin_unlock(&ctx->lock);
4499}
4500
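/*
 * Total value of the event: its own count plus the counts already folded
 * in from child (inherited) events that have exited.
 */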
4501static inline u64 perf_event_count(struct perf_event *event)
4502{
4503	return local64_read(&event->count) + atomic64_read(&event->child_count);
4504}
4505
4506static void calc_timer_values(struct perf_event *event,
4507				u64 *now,
4508				u64 *enabled,
4509				u64 *running)
4510{
4511	u64 ctx_time;
4512
4513	*now = perf_clock();
4514	ctx_time = perf_event_time_now(event, *now);
4515	__perf_update_times(event, ctx_time, enabled, running);
4516}
4517
4518/*
4519 * NMI-safe method to read a local event, that is an event
4520 * that:
4521 *   - is either for the current task, or for this CPU
4522 *   - does not have inherit set, as inherited task events
4523 *     will not be local and we cannot read them atomically
4524 *   - must not have a pmu::count method
4525 */
4526int perf_event_read_local(struct perf_event *event, u64 *value,
4527			  u64 *enabled, u64 *running)
4528{
4529	unsigned long flags;
4530	int ret = 0;
4531
4532	/*
4533	 * Disabling interrupts avoids all counter scheduling (context
4534	 * switches, timer based rotation and IPIs).
4535	 */
4536	local_irq_save(flags);
4537
4538	/*
4539	 * It must not be an event with inherit set; we cannot read
4540	 * all child counters from atomic context.
4541	 */
4542	if (event->attr.inherit) {
4543		ret = -EOPNOTSUPP;
4544		goto out;
4545	}
4546
4547	/* If this is a per-task event, it must be for current */
4548	if ((event->attach_state & PERF_ATTACH_TASK) &&
4549	    event->hw.target != current) {
4550		ret = -EINVAL;
4551		goto out;
4552	}
4553
4554	/* If this is a per-CPU event, it must be for this CPU */
4555	if (!(event->attach_state & PERF_ATTACH_TASK) &&
4556	    event->cpu != smp_processor_id()) {
4557		ret = -EINVAL;
4558		goto out;
4559	}
4560
4561	/* If this is a pinned event it must be running on this CPU */
4562	if (event->attr.pinned && event->oncpu != smp_processor_id()) {
4563		ret = -EBUSY;
4564		goto out;
4565	}
4566
4567	/*
4568	 * If the event is currently on this CPU, it's either a per-task event,
4569	 * or local to this CPU. Furthermore it means it's ACTIVE (otherwise
4570	 * oncpu == -1).
4571	 */
4572	if (event->oncpu == smp_processor_id())
4573		event->pmu->read(event);
4574
4575	*value = local64_read(&event->count);
4576	if (enabled || running) {
4577		u64 __enabled, __running, __now;
4578
4579		calc_timer_values(event, &__now, &__enabled, &__running);
4580		if (enabled)
4581			*enabled = __enabled;
4582		if (running)
4583			*running = __running;
4584	}
4585out:
4586	local_irq_restore(flags);
4587
4588	return ret;
4589}
4590
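/*
 * Update event->count (and, for @group, the sibling counts as well): if the
 * event is ACTIVE, send an IPI to the CPU it is running on and read the
 * hardware there; if it is merely INACTIVE, only fold in the pending time
 * deltas under ctx->lock.
 */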
4591static int perf_event_read(struct perf_event *event, bool group)
4592{
4593	enum perf_event_state state = READ_ONCE(event->state);
4594	int event_cpu, ret = 0;
4595
4596	/*
4597	 * If event is enabled and currently active on a CPU, update the
4598	 * value in the event structure:
4599	 */
4600again:
4601	if (state == PERF_EVENT_STATE_ACTIVE) {
4602		struct perf_read_data data;
4603
4604		/*
4605		 * Orders the ->state and ->oncpu loads such that if we see
4606		 * ACTIVE we must also see the right ->oncpu.
4607		 *
4608		 * Matches the smp_wmb() from event_sched_in().
4609		 */
4610		smp_rmb();
4611
4612		event_cpu = READ_ONCE(event->oncpu);
4613		if ((unsigned)event_cpu >= nr_cpu_ids)
4614			return 0;
4615
4616		data = (struct perf_read_data){
4617			.event = event,
4618			.group = group,
4619			.ret = 0,
4620		};
4621
4622		preempt_disable();
4623		event_cpu = __perf_event_read_cpu(event, event_cpu);
4624
4625		/*
4626		 * Purposely ignore the smp_call_function_single() return
4627		 * value.
4628		 *
4629		 * If event_cpu isn't a valid CPU it means the event got
4630		 * scheduled out and that will have updated the event count.
4631		 *
4632		 * Therefore, either way, we'll have an up-to-date event count
4633		 * after this.
4634		 */
4635		(void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4636		preempt_enable();
4637		ret = data.ret;
4638
4639	} else if (state == PERF_EVENT_STATE_INACTIVE) {
4640		struct perf_event_context *ctx = event->ctx;
4641		unsigned long flags;
4642
4643		raw_spin_lock_irqsave(&ctx->lock, flags);
4644		state = event->state;
4645		if (state != PERF_EVENT_STATE_INACTIVE) {
4646			raw_spin_unlock_irqrestore(&ctx->lock, flags);
4647			goto again;
4648		}
4649
4650		/*
4651		 * We may read while the context is not active (e.g., the thread
4652		 * is blocked); in that case we cannot update the context time.
4653		 */
4654		if (ctx->is_active & EVENT_TIME) {
4655			update_context_time(ctx);
4656			update_cgrp_time_from_event(event);
4657		}
4658
4659		perf_event_update_time(event);
4660		if (group)
4661			perf_event_update_sibling_time(event);
4662		raw_spin_unlock_irqrestore(&ctx->lock, flags);
4663	}
4664
4665	return ret;
4666}
4667
4668/*
4669 * Initialize the perf_event context in a task_struct:
4670 */
4671static void __perf_event_init_context(struct perf_event_context *ctx)
4672{
4673	raw_spin_lock_init(&ctx->lock);
4674	mutex_init(&ctx->mutex);
4675	INIT_LIST_HEAD(&ctx->pmu_ctx_list);
4676	perf_event_groups_init(&ctx->pinned_groups);
4677	perf_event_groups_init(&ctx->flexible_groups);
4678	INIT_LIST_HEAD(&ctx->event_list);
4679	refcount_set(&ctx->refcount, 1);
4680}
4681
4682static void
4683__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
4684{
4685	epc->pmu = pmu;
4686	INIT_LIST_HEAD(&epc->pmu_ctx_entry);
4687	INIT_LIST_HEAD(&epc->pinned_active);
4688	INIT_LIST_HEAD(&epc->flexible_active);
4689	atomic_set(&epc->refcount, 1);
4690}
4691
4692static struct perf_event_context *
4693alloc_perf_context(struct task_struct *task)
4694{
4695	struct perf_event_context *ctx;
4696
4697	ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4698	if (!ctx)
4699		return NULL;
4700
4701	__perf_event_init_context(ctx);
4702	if (task)
4703		ctx->task = get_task_struct(task);
4704
4705	return ctx;
4706}
4707
4708static struct task_struct *
4709find_lively_task_by_vpid(pid_t vpid)
4710{
4711	struct task_struct *task;
4712
4713	rcu_read_lock();
4714	if (!vpid)
4715		task = current;
4716	else
4717		task = find_task_by_vpid(vpid);
4718	if (task)
4719		get_task_struct(task);
4720	rcu_read_unlock();
4721
4722	if (!task)
4723		return ERR_PTR(-ESRCH);
4724
4725	return task;
4726}
4727
4728/*
4729 * Returns a matching context with refcount and pincount.
4730 */
4731static struct perf_event_context *
4732find_get_context(struct task_struct *task, struct perf_event *event)
4733{
4734	struct perf_event_context *ctx, *clone_ctx = NULL;
4735	struct perf_cpu_context *cpuctx;
4736	unsigned long flags;
4737	int err;
4738
4739	if (!task) {
4740		/* Must be root to operate on a CPU event: */
4741		err = perf_allow_cpu(&event->attr);
4742		if (err)
4743			return ERR_PTR(err);
4744
4745		cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
4746		ctx = &cpuctx->ctx;
4747		get_ctx(ctx);
4748		raw_spin_lock_irqsave(&ctx->lock, flags);
4749		++ctx->pin_count;
4750		raw_spin_unlock_irqrestore(&ctx->lock, flags);
4751
4752		return ctx;
4753	}
4754
4755	err = -EINVAL;
4756retry:
4757	ctx = perf_lock_task_context(task, &flags);
4758	if (ctx) {
4759		clone_ctx = unclone_ctx(ctx);
4760		++ctx->pin_count;
4761
4762		raw_spin_unlock_irqrestore(&ctx->lock, flags);
4763
4764		if (clone_ctx)
4765			put_ctx(clone_ctx);
4766	} else {
4767		ctx = alloc_perf_context(task);
4768		err = -ENOMEM;
4769		if (!ctx)
4770			goto errout;
4771
4772		err = 0;
4773		mutex_lock(&task->perf_event_mutex);
4774		/*
4775		 * If it has already passed perf_event_exit_task(),
4776		 * we must see PF_EXITING, as it takes this mutex too.
4777		 */
4778		if (task->flags & PF_EXITING)
4779			err = -ESRCH;
4780		else if (task->perf_event_ctxp)
4781			err = -EAGAIN;
4782		else {
4783			get_ctx(ctx);
4784			++ctx->pin_count;
4785			rcu_assign_pointer(task->perf_event_ctxp, ctx);
4786		}
4787		mutex_unlock(&task->perf_event_mutex);
4788
4789		if (unlikely(err)) {
4790			put_ctx(ctx);
4791
4792			if (err == -EAGAIN)
4793				goto retry;
4794			goto errout;
4795		}
4796	}
4797
4798	return ctx;
4799
4800errout:
4801	return ERR_PTR(err);
4802}
4803
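/*
 * Find (or create) the perf_event_pmu_context linking @pmu and @ctx, and
 * take a reference on it.  CPU contexts use the epc embedded in the per-CPU
 * perf_cpu_pmu_context; task contexts allocate one on demand and chain it
 * on ctx->pmu_ctx_list.
 */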
4804static struct perf_event_pmu_context *
4805find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
4806		     struct perf_event *event)
4807{
4808	struct perf_event_pmu_context *new = NULL, *epc;
4809	void *task_ctx_data = NULL;
4810
4811	if (!ctx->task) {
4812		struct perf_cpu_pmu_context *cpc;
4813
4814		cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
4815		epc = &cpc->epc;
4816		raw_spin_lock_irq(&ctx->lock);
4817		if (!epc->ctx) {
4818			atomic_set(&epc->refcount, 1);
4819			epc->embedded = 1;
4820			list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
4821			epc->ctx = ctx;
4822		} else {
4823			WARN_ON_ONCE(epc->ctx != ctx);
4824			atomic_inc(&epc->refcount);
4825		}
4826		raw_spin_unlock_irq(&ctx->lock);
4827		return epc;
4828	}
4829
4830	new = kzalloc(sizeof(*epc), GFP_KERNEL);
4831	if (!new)
4832		return ERR_PTR(-ENOMEM);
4833
4834	if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4835		task_ctx_data = alloc_task_ctx_data(pmu);
4836		if (!task_ctx_data) {
4837			kfree(new);
4838			return ERR_PTR(-ENOMEM);
4839		}
4840	}
4841
4842	__perf_init_event_pmu_context(new, pmu);
4843
4844	/*
4845	 * XXX
4846	 *
4847	 * lockdep_assert_held(&ctx->mutex);
4848	 *
4849	 * can't because perf_event_init_task() doesn't actually hold the
4850	 * child_ctx->mutex.
4851	 */
4852
4853	raw_spin_lock_irq(&ctx->lock);
4854	list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
4855		if (epc->pmu == pmu) {
4856			WARN_ON_ONCE(epc->ctx != ctx);
4857			atomic_inc(&epc->refcount);
4858			goto found_epc;
4859		}
4860	}
4861
4862	epc = new;
4863	new = NULL;
4864
4865	list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
4866	epc->ctx = ctx;
4867
4868found_epc:
4869	if (task_ctx_data && !epc->task_ctx_data) {
4870		epc->task_ctx_data = task_ctx_data;
4871		task_ctx_data = NULL;
4872		ctx->nr_task_data++;
4873	}
4874	raw_spin_unlock_irq(&ctx->lock);
4875
4876	free_task_ctx_data(pmu, task_ctx_data);
4877	kfree(new);
4878
4879	return epc;
4880}
4881
4882static void get_pmu_ctx(struct perf_event_pmu_context *epc)
4883{
4884	WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
4885}
4886
4887static void free_epc_rcu(struct rcu_head *head)
4888{
4889	struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
4890
4891	kfree(epc->task_ctx_data);
4892	kfree(epc);
4893}
4894
4895static void put_pmu_ctx(struct perf_event_pmu_context *epc)
4896{
4897	struct perf_event_context *ctx = epc->ctx;
4898	unsigned long flags;
4899
4900	/*
4901	 * XXX
4902	 *
4903	 * lockdep_assert_held(&ctx->mutex);
4904	 *
4905	 * can't because of the call-site in _free_event()/put_event()
4906	 * which isn't always called under ctx->mutex.
4907	 */
4908	if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags))
4909		return;
4910
4911	WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
4912
4913	list_del_init(&epc->pmu_ctx_entry);
4914	epc->ctx = NULL;
4915
4916	WARN_ON_ONCE(!list_empty(&epc->pinned_active));
4917	WARN_ON_ONCE(!list_empty(&epc->flexible_active));
4918
4919	raw_spin_unlock_irqrestore(&ctx->lock, flags);
4920
4921	if (epc->embedded)
4922		return;
4923
4924	call_rcu(&epc->rcu_head, free_epc_rcu);
4925}
4926
4927static void perf_event_free_filter(struct perf_event *event);
4928
4929static void free_event_rcu(struct rcu_head *head)
4930{
4931	struct perf_event *event = container_of(head, typeof(*event), rcu_head);
4932
4933	if (event->ns)
4934		put_pid_ns(event->ns);
4935	perf_event_free_filter(event);
4936	kmem_cache_free(perf_event_cache, event);
4937}
4938
4939static void ring_buffer_attach(struct perf_event *event,
4940			       struct perf_buffer *rb);
4941
4942static void detach_sb_event(struct perf_event *event)
4943{
4944	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4945
4946	raw_spin_lock(&pel->lock);
4947	list_del_rcu(&event->sb_list);
4948	raw_spin_unlock(&pel->lock);
4949}
4950
4951static bool is_sb_event(struct perf_event *event)
4952{
4953	struct perf_event_attr *attr = &event->attr;
4954
4955	if (event->parent)
4956		return false;
4957
4958	if (event->attach_state & PERF_ATTACH_TASK)
4959		return false;
4960
4961	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4962	    attr->comm || attr->comm_exec ||
4963	    attr->task || attr->ksymbol ||
4964	    attr->context_switch || attr->text_poke ||
4965	    attr->bpf_event)
4966		return true;
4967	return false;
4968}
4969
4970static void unaccount_pmu_sb_event(struct perf_event *event)
4971{
4972	if (is_sb_event(event))
4973		detach_sb_event(event);
4974}
4975
4976#ifdef CONFIG_NO_HZ_FULL
4977static DEFINE_SPINLOCK(nr_freq_lock);
4978#endif
4979
4980static void unaccount_freq_event_nohz(void)
4981{
4982#ifdef CONFIG_NO_HZ_FULL
4983	spin_lock(&nr_freq_lock);
4984	if (atomic_dec_and_test(&nr_freq_events))
4985		tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4986	spin_unlock(&nr_freq_lock);
4987#endif
4988}
4989
4990static void unaccount_freq_event(void)
4991{
4992	if (tick_nohz_full_enabled())
4993		unaccount_freq_event_nohz();
4994	else
4995		atomic_dec(&nr_freq_events);
4996}
4997
4998static void unaccount_event(struct perf_event *event)
4999{
5000	bool dec = false;
5001
5002	if (event->parent)
5003		return;
5004
5005	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
5006		dec = true;
5007	if (event->attr.mmap || event->attr.mmap_data)
5008		atomic_dec(&nr_mmap_events);
5009	if (event->attr.build_id)
5010		atomic_dec(&nr_build_id_events);
5011	if (event->attr.comm)
5012		atomic_dec(&nr_comm_events);
5013	if (event->attr.namespaces)
5014		atomic_dec(&nr_namespaces_events);
5015	if (event->attr.cgroup)
5016		atomic_dec(&nr_cgroup_events);
5017	if (event->attr.task)
5018		atomic_dec(&nr_task_events);
5019	if (event->attr.freq)
5020		unaccount_freq_event();
5021	if (event->attr.context_switch) {
5022		dec = true;
5023		atomic_dec(&nr_switch_events);
5024	}
5025	if (is_cgroup_event(event))
5026		dec = true;
5027	if (has_branch_stack(event))
5028		dec = true;
5029	if (event->attr.ksymbol)
5030		atomic_dec(&nr_ksymbol_events);
5031	if (event->attr.bpf_event)
5032		atomic_dec(&nr_bpf_events);
5033	if (event->attr.text_poke)
5034		atomic_dec(&nr_text_poke_events);
5035
5036	if (dec) {
5037		if (!atomic_add_unless(&perf_sched_count, -1, 1))
5038			schedule_delayed_work(&perf_sched_work, HZ);
5039	}
5040
5041	unaccount_pmu_sb_event(event);
5042}
5043
5044static void perf_sched_delayed(struct work_struct *work)
5045{
5046	mutex_lock(&perf_sched_mutex);
5047	if (atomic_dec_and_test(&perf_sched_count))
5048		static_branch_disable(&perf_sched_events);
5049	mutex_unlock(&perf_sched_mutex);
5050}
5051
5052/*
5053 * The following implement mutual exclusion of events on "exclusive" pmus
5054 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
5055 * at a time, so we disallow creating events that might conflict, namely:
5056 *
5057 *  1) cpu-wide events in the presence of per-task events,
5058 *  2) per-task events in the presence of cpu-wide events,
5059 *  3) two matching events on the same perf_event_context.
5060 *
5061 * The former two cases are handled in the allocation path (perf_event_alloc(),
5062 * _free_event()), the latter -- before the first perf_install_in_context().
5063 */
5064static int exclusive_event_init(struct perf_event *event)
5065{
5066	struct pmu *pmu = event->pmu;
5067
5068	if (!is_exclusive_pmu(pmu))
5069		return 0;
5070
5071	/*
5072	 * Prevent co-existence of per-task and cpu-wide events on the
5073	 * same exclusive pmu.
5074	 *
5075	 * Negative pmu::exclusive_cnt means there are cpu-wide
5076	 * events on this "exclusive" pmu, positive means there are
5077	 * per-task events.
5078	 *
5079	 * Since this is called in perf_event_alloc() path, event::ctx
5080	 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
5081	 * to mean "per-task event", because unlike other attach states it
5082	 * never gets cleared.
5083	 */
5084	if (event->attach_state & PERF_ATTACH_TASK) {
5085		if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
5086			return -EBUSY;
5087	} else {
5088		if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
5089			return -EBUSY;
5090	}
5091
5092	return 0;
5093}
5094
5095static void exclusive_event_destroy(struct perf_event *event)
5096{
5097	struct pmu *pmu = event->pmu;
5098
5099	if (!is_exclusive_pmu(pmu))
5100		return;
5101
5102	/* see comment in exclusive_event_init() */
5103	if (event->attach_state & PERF_ATTACH_TASK)
5104		atomic_dec(&pmu->exclusive_cnt);
5105	else
5106		atomic_inc(&pmu->exclusive_cnt);
5107}
5108
5109static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
5110{
5111	if ((e1->pmu == e2->pmu) &&
5112	    (e1->cpu == e2->cpu ||
5113	     e1->cpu == -1 ||
5114	     e2->cpu == -1))
5115		return true;
5116	return false;
5117}
5118
5119static bool exclusive_event_installable(struct perf_event *event,
5120					struct perf_event_context *ctx)
5121{
5122	struct perf_event *iter_event;
5123	struct pmu *pmu = event->pmu;
5124
5125	lockdep_assert_held(&ctx->mutex);
5126
5127	if (!is_exclusive_pmu(pmu))
5128		return true;
5129
5130	list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
5131		if (exclusive_event_match(iter_event, event))
5132			return false;
5133	}
5134
5135	return true;
5136}
5137
5138static void perf_addr_filters_splice(struct perf_event *event,
5139				       struct list_head *head);
5140
5141static void _free_event(struct perf_event *event)
5142{
5143	irq_work_sync(&event->pending_irq);
5144
5145	unaccount_event(event);
5146
5147	security_perf_event_free(event);
5148
5149	if (event->rb) {
5150		/*
5151		 * Can happen when we close an event with re-directed output.
5152		 *
5153		 * Since we have a 0 refcount, perf_mmap_close() will skip
5154		 * over us; possibly making our ring_buffer_put() the last.
5155		 */
5156		mutex_lock(&event->mmap_mutex);
5157		ring_buffer_attach(event, NULL);
5158		mutex_unlock(&event->mmap_mutex);
5159	}
5160
5161	if (is_cgroup_event(event))
5162		perf_detach_cgroup(event);
5163
5164	if (!event->parent) {
5165		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
5166			put_callchain_buffers();
5167	}
5168
5169	perf_event_free_bpf_prog(event);
5170	perf_addr_filters_splice(event, NULL);
5171	kfree(event->addr_filter_ranges);
5172
5173	if (event->destroy)
5174		event->destroy(event);
5175
5176	/*
5177	 * Must be after ->destroy(), due to uprobe_perf_close() using
5178	 * hw.target.
5179	 */
5180	if (event->hw.target)
5181		put_task_struct(event->hw.target);
5182
5183	if (event->pmu_ctx)
5184		put_pmu_ctx(event->pmu_ctx);
5185
5186	/*
5187	 * perf_event_free_task() relies on put_ctx() being 'last', in particular
5188	 * all task references must be cleaned up.
5189	 */
5190	if (event->ctx)
5191		put_ctx(event->ctx);
5192
5193	exclusive_event_destroy(event);
5194	module_put(event->pmu->module);
5195
5196	call_rcu(&event->rcu_head, free_event_rcu);
5197}
5198
5199/*
5200 * Used to free events which have a known refcount of 1, such as in error paths
5201 * where the event isn't exposed yet, and for inherited events.
5202 */
5203static void free_event(struct perf_event *event)
5204{
5205	if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
5206				"unexpected event refcount: %ld; ptr=%p\n",
5207				atomic_long_read(&event->refcount), event)) {
5208		/* leak to avoid use-after-free */
5209		return;
5210	}
5211
5212	_free_event(event);
5213}
5214
5215/*
5216 * Remove user event from the owner task.
5217 */
5218static void perf_remove_from_owner(struct perf_event *event)
5219{
5220	struct task_struct *owner;
5221
5222	rcu_read_lock();
5223	/*
5224	 * Matches the smp_store_release() in perf_event_exit_task(). If we
5225	 * observe !owner it means the list deletion is complete and we can
5226	 * indeed free this event, otherwise we need to serialize on
5227	 * owner->perf_event_mutex.
5228	 */
5229	owner = READ_ONCE(event->owner);
5230	if (owner) {
5231		/*
5232		 * Since delayed_put_task_struct() also drops the last
5233		 * task reference we can safely take a new reference
5234		 * while holding the rcu_read_lock().
5235		 */
5236		get_task_struct(owner);
5237	}
5238	rcu_read_unlock();
5239
5240	if (owner) {
5241		/*
5242		 * If we're here through perf_event_exit_task() we're already
5243		 * holding ctx->mutex which would be an inversion wrt. the
5244		 * normal lock order.
5245		 *
5246		 * However we can safely take this lock because it's the child
5247		 * ctx->mutex.
5248		 */
5249		mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
5250
5251		/*
5252		 * We have to re-check the event->owner field, if it is cleared
5253		 * we raced with perf_event_exit_task(), acquiring the mutex
5254		 * ensured they're done, and we can proceed with freeing the
5255		 * event.
5256		 */
5257		if (event->owner) {
5258			list_del_init(&event->owner_entry);
5259			smp_store_release(&event->owner, NULL);
5260		}
5261		mutex_unlock(&owner->perf_event_mutex);
5262		put_task_struct(owner);
5263	}
5264}
5265
5266static void put_event(struct perf_event *event)
5267{
5268	if (!atomic_long_dec_and_test(&event->refcount))
5269		return;
5270
5271	_free_event(event);
5272}
5273
5274/*
5275 * Kill an event dead; while event:refcount will preserve the event
5276 * object, it will not preserve its functionality. Once the last 'user'
5277 * gives up the object, we'll destroy the thing.
5278 */
5279int perf_event_release_kernel(struct perf_event *event)
5280{
5281	struct perf_event_context *ctx = event->ctx;
5282	struct perf_event *child, *tmp;
5283	LIST_HEAD(free_list);
5284
5285	/*
5286	 * If we got here through err_alloc: free_event(event); we will not
5287	 * have attached to a context yet.
5288	 */
5289	if (!ctx) {
5290		WARN_ON_ONCE(event->attach_state &
5291				(PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
5292		goto no_ctx;
5293	}
5294
5295	if (!is_kernel_event(event))
5296		perf_remove_from_owner(event);
5297
5298	ctx = perf_event_ctx_lock(event);
5299	WARN_ON_ONCE(ctx->parent_ctx);
5300
5301	/*
5302	 * Mark this event as STATE_DEAD, there is no external reference to it
5303	 * anymore.
5304	 *
5305	 * Anybody acquiring event->child_mutex after the below loop _must_
5306	 * also see this, most importantly inherit_event() which will avoid
5307	 * placing more children on the list.
5308	 *
5309	 * Thus this guarantees that we will in fact observe and kill _ALL_
5310	 * child events.
5311	 */
5312	perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
5313
5314	perf_event_ctx_unlock(event, ctx);
5315
5316again:
5317	mutex_lock(&event->child_mutex);
5318	list_for_each_entry(child, &event->child_list, child_list) {
5319
5320		/*
5321		 * Cannot change, child events are not migrated, see the
5322		 * comment with perf_event_ctx_lock_nested().
5323		 */
5324		ctx = READ_ONCE(child->ctx);
5325		/*
5326		 * Since child_mutex nests inside ctx::mutex, we must jump
5327		 * through hoops. We start by grabbing a reference on the ctx.
5328		 *
5329		 * Since the event cannot get freed while we hold the
5330		 * child_mutex, the context must also exist and have a !0
5331		 * reference count.
5332		 */
5333		get_ctx(ctx);
5334
5335		/*
5336		 * Now that we have a ctx ref, we can drop child_mutex, and
5337		 * acquire ctx::mutex without fear of it going away. Then we
5338		 * can re-acquire child_mutex.
5339		 */
5340		mutex_unlock(&event->child_mutex);
5341		mutex_lock(&ctx->mutex);
5342		mutex_lock(&event->child_mutex);
5343
5344		/*
5345		 * Now that we hold ctx::mutex and child_mutex, revalidate our
5346		 * state: if child is still the first entry, it didn't get freed
5347		 * and we can proceed with removing it.
5348		 */
5349		tmp = list_first_entry_or_null(&event->child_list,
5350					       struct perf_event, child_list);
5351		if (tmp == child) {
5352			perf_remove_from_context(child, DETACH_GROUP);
5353			list_move(&child->child_list, &free_list);
5354			/*
5355			 * This matches the refcount bump in inherit_event();
5356			 * this can't be the last reference.
5357			 */
5358			put_event(event);
5359		}
5360
5361		mutex_unlock(&event->child_mutex);
5362		mutex_unlock(&ctx->mutex);
5363		put_ctx(ctx);
5364		goto again;
5365	}
5366	mutex_unlock(&event->child_mutex);
5367
5368	list_for_each_entry_safe(child, tmp, &free_list, child_list) {
5369		void *var = &child->ctx->refcount;
5370
5371		list_del(&child->child_list);
5372		free_event(child);
5373
5374		/*
5375		 * Wake any perf_event_free_task() waiting for this event to be
5376		 * freed.
5377		 */
5378		smp_mb(); /* pairs with wait_var_event() */
5379		wake_up_var(var);
5380	}
5381
5382no_ctx:
5383	put_event(event); /* Must be the 'last' reference */
5384	return 0;
5385}
5386EXPORT_SYMBOL_GPL(perf_event_release_kernel);
5387
5388/*
5389 * Called when the last reference to the file is gone.
5390 */
5391static int perf_release(struct inode *inode, struct file *file)
5392{
5393	perf_event_release_kernel(file->private_data);
5394	return 0;
5395}
5396
5397static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5398{
5399	struct perf_event *child;
5400	u64 total = 0;
5401
5402	*enabled = 0;
5403	*running = 0;
5404
5405	mutex_lock(&event->child_mutex);
5406
5407	(void)perf_event_read(event, false);
5408	total += perf_event_count(event);
5409
5410	*enabled += event->total_time_enabled +
5411			atomic64_read(&event->child_total_time_enabled);
5412	*running += event->total_time_running +
5413			atomic64_read(&event->child_total_time_running);
5414
5415	list_for_each_entry(child, &event->child_list, child_list) {
5416		(void)perf_event_read(child, false);
5417		total += perf_event_count(child);
5418		*enabled += child->total_time_enabled;
5419		*running += child->total_time_running;
5420	}
5421	mutex_unlock(&event->child_mutex);
5422
5423	return total;
5424}
5425
5426u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5427{
5428	struct perf_event_context *ctx;
5429	u64 count;
5430
5431	ctx = perf_event_ctx_lock(event);
5432	count = __perf_event_read_value(event, enabled, running);
5433	perf_event_ctx_unlock(event, ctx);
5434
5435	return count;
5436}
5437EXPORT_SYMBOL_GPL(perf_event_read_value);
5438
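/*
 * Accumulate one PERF_FORMAT_GROUP read record for @leader and its siblings
 * into @values.  The count and time slots are added to (+=), so calling
 * this for the leader and then for each inherited child sums the whole
 * hierarchy; the id/lost slots are simply (re)written.
 */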
5439static int __perf_read_group_add(struct perf_event *leader,
5440					u64 read_format, u64 *values)
5441{
5442	struct perf_event_context *ctx = leader->ctx;
5443	struct perf_event *sub;
5444	unsigned long flags;
5445	int n = 1; /* skip @nr */
5446	int ret;
5447
5448	ret = perf_event_read(leader, true);
5449	if (ret)
5450		return ret;
5451
5452	raw_spin_lock_irqsave(&ctx->lock, flags);
5453
5454	/*
5455	 * Since we co-schedule groups, {enabled,running} times of siblings
5456	 * will be identical to those of the leader, so we only publish one
5457	 * set.
5458	 */
5459	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5460		values[n++] += leader->total_time_enabled +
5461			atomic64_read(&leader->child_total_time_enabled);
5462	}
5463
5464	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5465		values[n++] += leader->total_time_running +
5466			atomic64_read(&leader->child_total_time_running);
5467	}
5468
5469	/*
5470	 * Write {count,id} tuples for every sibling.
5471	 */
5472	values[n++] += perf_event_count(leader);
5473	if (read_format & PERF_FORMAT_ID)
5474		values[n++] = primary_event_id(leader);
5475	if (read_format & PERF_FORMAT_LOST)
5476		values[n++] = atomic64_read(&leader->lost_samples);
5477
5478	for_each_sibling_event(sub, leader) {
5479		values[n++] += perf_event_count(sub);
5480		if (read_format & PERF_FORMAT_ID)
5481			values[n++] = primary_event_id(sub);
5482		if (read_format & PERF_FORMAT_LOST)
5483			values[n++] = atomic64_read(&sub->lost_samples);
5484	}
5485
5486	raw_spin_unlock_irqrestore(&ctx->lock, flags);
5487	return 0;
5488}
5489
5490static int perf_read_group(struct perf_event *event,
5491				   u64 read_format, char __user *buf)
5492{
5493	struct perf_event *leader = event->group_leader, *child;
5494	struct perf_event_context *ctx = leader->ctx;
5495	int ret;
5496	u64 *values;
5497
5498	lockdep_assert_held(&ctx->mutex);
5499
5500	values = kzalloc(event->read_size, GFP_KERNEL);
5501	if (!values)
5502		return -ENOMEM;
5503
5504	values[0] = 1 + leader->nr_siblings;
5505
5506	/*
5507	 * By locking the child_mutex of the leader we effectively
5508	 * lock the child list of all siblings. XXX explain how.
5509	 */
5510	mutex_lock(&leader->child_mutex);
5511
5512	ret = __perf_read_group_add(leader, read_format, values);
5513	if (ret)
5514		goto unlock;
5515
5516	list_for_each_entry(child, &leader->child_list, child_list) {
5517		ret = __perf_read_group_add(child, read_format, values);
5518		if (ret)
5519			goto unlock;
5520	}
5521
5522	mutex_unlock(&leader->child_mutex);
5523
5524	ret = event->read_size;
5525	if (copy_to_user(buf, values, event->read_size))
5526		ret = -EFAULT;
5527	goto out;
5528
5529unlock:
5530	mutex_unlock(&leader->child_mutex);
5531out:
5532	kfree(values);
5533	return ret;
5534}
5535
5536static int perf_read_one(struct perf_event *event,
5537				 u64 read_format, char __user *buf)
5538{
5539	u64 enabled, running;
5540	u64 values[5];
5541	int n = 0;
5542
5543	values[n++] = __perf_event_read_value(event, &enabled, &running);
5544	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5545		values[n++] = enabled;
5546	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5547		values[n++] = running;
5548	if (read_format & PERF_FORMAT_ID)
5549		values[n++] = primary_event_id(event);
5550	if (read_format & PERF_FORMAT_LOST)
5551		values[n++] = atomic64_read(&event->lost_samples);
5552
5553	if (copy_to_user(buf, values, n * sizeof(u64)))
5554		return -EFAULT;
5555
5556	return n * sizeof(u64);
5557}
5558
5559static bool is_event_hup(struct perf_event *event)
5560{
5561	bool no_children;
5562
5563	if (event->state > PERF_EVENT_STATE_EXIT)
5564		return false;
5565
5566	mutex_lock(&event->child_mutex);
5567	no_children = list_empty(&event->child_list);
5568	mutex_unlock(&event->child_mutex);
5569	return no_children;
5570}
5571
5572/*
5573 * Read the performance event - simple non-blocking version for now
5574 */
5575static ssize_t
5576__perf_read(struct perf_event *event, char __user *buf, size_t count)
5577{
5578	u64 read_format = event->attr.read_format;
5579	int ret;
5580
5581	/*
5582	 * Return end-of-file for a read on an event that is in
5583	 * error state (i.e. because it was pinned but it couldn't be
5584	 * scheduled on to the CPU at some point).
5585	 */
5586	if (event->state == PERF_EVENT_STATE_ERROR)
5587		return 0;
5588
5589	if (count < event->read_size)
5590		return -ENOSPC;
5591
5592	WARN_ON_ONCE(event->ctx->parent_ctx);
5593	if (read_format & PERF_FORMAT_GROUP)
5594		ret = perf_read_group(event, read_format, buf);
5595	else
5596		ret = perf_read_one(event, read_format, buf);
5597
5598	return ret;
5599}
5600
5601static ssize_t
5602perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
5603{
5604	struct perf_event *event = file->private_data;
5605	struct perf_event_context *ctx;
5606	int ret;
5607
5608	ret = security_perf_event_read(event);
5609	if (ret)
5610		return ret;
5611
5612	ctx = perf_event_ctx_lock(event);
5613	ret = __perf_read(event, buf, count);
5614	perf_event_ctx_unlock(event, ctx);
5615
5616	return ret;
5617}
5618
5619static __poll_t perf_poll(struct file *file, poll_table *wait)
5620{
5621	struct perf_event *event = file->private_data;
5622	struct perf_buffer *rb;
5623	__poll_t events = EPOLLHUP;
5624
5625	poll_wait(file, &event->waitq, wait);
5626
5627	if (is_event_hup(event))
5628		return events;
5629
5630	/*
5631	 * Pin the event->rb by taking event->mmap_mutex; otherwise
5632	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
5633	 */
5634	mutex_lock(&event->mmap_mutex);
5635	rb = event->rb;
5636	if (rb)
5637		events = atomic_xchg(&rb->poll, 0);
5638	mutex_unlock(&event->mmap_mutex);
5639	return events;
5640}
5641
5642static void _perf_event_reset(struct perf_event *event)
5643{
5644	(void)perf_event_read(event, false);
5645	local64_set(&event->count, 0);
5646	perf_event_update_userpage(event);
5647}
5648
5649/* Assume it's not an event with inherit set. */
5650u64 perf_event_pause(struct perf_event *event, bool reset)
5651{
5652	struct perf_event_context *ctx;
5653	u64 count;
5654
5655	ctx = perf_event_ctx_lock(event);
5656	WARN_ON_ONCE(event->attr.inherit);
5657	_perf_event_disable(event);
5658	count = local64_read(&event->count);
5659	if (reset)
5660		local64_set(&event->count, 0);
5661	perf_event_ctx_unlock(event, ctx);
5662
5663	return count;
5664}
5665EXPORT_SYMBOL_GPL(perf_event_pause);
5666
5667/*
5668 * Holding the top-level event's child_mutex means that any
5669 * descendant process that has inherited this event will block
5670 * in perf_event_exit_event() if it goes to exit, thus satisfying the
5671 * task existence requirements of perf_event_enable/disable.
5672 */
5673static void perf_event_for_each_child(struct perf_event *event,
5674					void (*func)(struct perf_event *))
5675{
5676	struct perf_event *child;
5677
5678	WARN_ON_ONCE(event->ctx->parent_ctx);
5679
5680	mutex_lock(&event->child_mutex);
5681	func(event);
5682	list_for_each_entry(child, &event->child_list, child_list)
5683		func(child);
5684	mutex_unlock(&event->child_mutex);
5685}
5686
5687static void perf_event_for_each(struct perf_event *event,
5688				  void (*func)(struct perf_event *))
5689{
5690	struct perf_event_context *ctx = event->ctx;
5691	struct perf_event *sibling;
5692
5693	lockdep_assert_held(&ctx->mutex);
5694
5695	event = event->group_leader;
5696
5697	perf_event_for_each_child(event, func);
5698	for_each_sibling_event(sibling, event)
5699		perf_event_for_each_child(sibling, func);
5700}
5701
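/*
 * Cross-call target for updating the sample period / frequency.  An ACTIVE
 * event is stopped and restarted with PERF_EF_RELOAD so that the new period
 * is programmed right away; a pending throttle is cleared first so the tick
 * does not try to unthrottle an event we have just restarted.
 */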
5702static void __perf_event_period(struct perf_event *event,
5703				struct perf_cpu_context *cpuctx,
5704				struct perf_event_context *ctx,
5705				void *info)
5706{
5707	u64 value = *((u64 *)info);
5708	bool active;
5709
5710	if (event->attr.freq) {
5711		event->attr.sample_freq = value;
5712	} else {
5713		event->attr.sample_period = value;
5714		event->hw.sample_period = value;
5715	}
5716
5717	active = (event->state == PERF_EVENT_STATE_ACTIVE);
5718	if (active) {
5719		perf_pmu_disable(event->pmu);
5720		/*
5721		 * We could be throttled; unthrottle now to avoid the tick
5722		 * trying to unthrottle while we already re-started the event.
5723		 */
5724		if (event->hw.interrupts == MAX_INTERRUPTS) {
5725			event->hw.interrupts = 0;
5726			perf_log_throttle(event, 1);
5727		}
5728		event->pmu->stop(event, PERF_EF_UPDATE);
5729	}
5730
5731	local64_set(&event->hw.period_left, 0);
5732
5733	if (active) {
5734		event->pmu->start(event, PERF_EF_RELOAD);
5735		perf_pmu_enable(event->pmu);
5736	}
5737}
5738
5739static int perf_event_check_period(struct perf_event *event, u64 value)
5740{
5741	return event->pmu->check_period(event, value);
5742}
5743
5744static int _perf_event_period(struct perf_event *event, u64 value)
5745{
5746	if (!is_sampling_event(event))
5747		return -EINVAL;
5748
5749	if (!value)
5750		return -EINVAL;
5751
5752	if (event->attr.freq && value > sysctl_perf_event_sample_rate)
5753		return -EINVAL;
5754
5755	if (perf_event_check_period(event, value))
5756		return -EINVAL;
5757
5758	if (!event->attr.freq && (value & (1ULL << 63)))
5759		return -EINVAL;
5760
5761	event_function_call(event, __perf_event_period, &value);
5762
5763	return 0;
5764}
5765
5766int perf_event_period(struct perf_event *event, u64 value)
5767{
5768	struct perf_event_context *ctx;
5769	int ret;
5770
5771	ctx = perf_event_ctx_lock(event);
5772	ret = _perf_event_period(event, value);
5773	perf_event_ctx_unlock(event, ctx);
5774
5775	return ret;
5776}
5777EXPORT_SYMBOL_GPL(perf_event_period);
5778
5779static const struct file_operations perf_fops;
5780
5781static inline int perf_fget_light(int fd, struct fd *p)
5782{
5783	struct fd f = fdget(fd);
5784	if (!f.file)
5785		return -EBADF;
5786
5787	if (f.file->f_op != &perf_fops) {
5788		fdput(f);
5789		return -EBADF;
5790	}
5791	*p = f;
5792	return 0;
5793}
5794
5795static int perf_event_set_output(struct perf_event *event,
5796				 struct perf_event *output_event);
5797static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5798static int perf_copy_attr(struct perf_event_attr __user *uattr,
5799			  struct perf_event_attr *attr);
5800
5801static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5802{
5803	void (*func)(struct perf_event *);
5804	u32 flags = arg;
5805
5806	switch (cmd) {
5807	case PERF_EVENT_IOC_ENABLE:
5808		func = _perf_event_enable;
5809		break;
5810	case PERF_EVENT_IOC_DISABLE:
5811		func = _perf_event_disable;
5812		break;
5813	case PERF_EVENT_IOC_RESET:
5814		func = _perf_event_reset;
5815		break;
5816
5817	case PERF_EVENT_IOC_REFRESH:
5818		return _perf_event_refresh(event, arg);
5819
5820	case PERF_EVENT_IOC_PERIOD:
5821	{
5822		u64 value;
5823
5824		if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
5825			return -EFAULT;
5826
5827		return _perf_event_period(event, value);
5828	}
5829	case PERF_EVENT_IOC_ID:
5830	{
5831		u64 id = primary_event_id(event);
5832
5833		if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5834			return -EFAULT;
5835		return 0;
5836	}
5837
5838	case PERF_EVENT_IOC_SET_OUTPUT:
5839	{
5840		int ret;
5841		if (arg != -1) {
5842			struct perf_event *output_event;
5843			struct fd output;
5844			ret = perf_fget_light(arg, &output);
5845			if (ret)
5846				return ret;
5847			output_event = output.file->private_data;
5848			ret = perf_event_set_output(event, output_event);
5849			fdput(output);
5850		} else {
5851			ret = perf_event_set_output(event, NULL);
5852		}
5853		return ret;
5854	}
5855
5856	case PERF_EVENT_IOC_SET_FILTER:
5857		return perf_event_set_filter(event, (void __user *)arg);
5858
5859	case PERF_EVENT_IOC_SET_BPF:
5860	{
5861		struct bpf_prog *prog;
5862		int err;
5863
5864		prog = bpf_prog_get(arg);
5865		if (IS_ERR(prog))
5866			return PTR_ERR(prog);
5867
5868		err = perf_event_set_bpf_prog(event, prog, 0);
5869		if (err) {
5870			bpf_prog_put(prog);
5871			return err;
5872		}
5873
5874		return 0;
5875	}
5876
5877	case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5878		struct perf_buffer *rb;
5879
5880		rcu_read_lock();
5881		rb = rcu_dereference(event->rb);
5882		if (!rb || !rb->nr_pages) {
5883			rcu_read_unlock();
5884			return -EINVAL;
5885		}
5886		rb_toggle_paused(rb, !!arg);
5887		rcu_read_unlock();
5888		return 0;
5889	}
5890
5891	case PERF_EVENT_IOC_QUERY_BPF:
5892		return perf_event_query_prog_array(event, (void __user *)arg);
5893
5894	case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5895		struct perf_event_attr new_attr;
5896		int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5897					 &new_attr);
5898
5899		if (err)
5900			return err;
5901
5902		return perf_event_modify_attr(event,  &new_attr);
5903	}
5904	default:
5905		return -ENOTTY;
5906	}
5907
5908	if (flags & PERF_IOC_FLAG_GROUP)
5909		perf_event_for_each(event, func);
5910	else
5911		perf_event_for_each_child(event, func);
5912
5913	return 0;
5914}
5915
5916static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5917{
5918	struct perf_event *event = file->private_data;
5919	struct perf_event_context *ctx;
5920	long ret;
5921
5922	/* Treat ioctl like writes as it is likely a mutating operation. */
5923	ret = security_perf_event_write(event);
5924	if (ret)
5925		return ret;
5926
5927	ctx = perf_event_ctx_lock(event);
5928	ret = _perf_ioctl(event, cmd, arg);
5929	perf_event_ctx_unlock(event, ctx);
5930
5931	return ret;
5932}
5933
5934#ifdef CONFIG_COMPAT
5935static long perf_compat_ioctl(struct file *file, unsigned int cmd,
5936				unsigned long arg)
5937{
5938	switch (_IOC_NR(cmd)) {
5939	case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
5940	case _IOC_NR(PERF_EVENT_IOC_ID):
5941	case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
5942	case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
5943		/* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
5944		if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
5945			cmd &= ~IOCSIZE_MASK;
5946			cmd |= sizeof(void *) << IOCSIZE_SHIFT;
5947		}
5948		break;
5949	}
5950	return perf_ioctl(file, cmd, arg);
5951}
5952#else
5953# define perf_compat_ioctl NULL
5954#endif
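
/*
 * Concretely: PERF_EVENT_IOC_SET_FILTER is defined as _IOW('$', 6, char *),
 * so a 32-bit task encodes _IOC_SIZE(cmd) == sizeof(compat_uptr_t) == 4,
 * while the native 64-bit definition encodes sizeof(char *) == 8 and the
 * switch in _perf_ioctl() would not match without the fixup above.  The
 * same applies to the other pointer-carrying commands listed in the
 * compat switch.
 */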
5955
5956int perf_event_task_enable(void)
5957{
5958	struct perf_event_context *ctx;
5959	struct perf_event *event;
5960
5961	mutex_lock(&current->perf_event_mutex);
5962	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5963		ctx = perf_event_ctx_lock(event);
5964		perf_event_for_each_child(event, _perf_event_enable);
5965		perf_event_ctx_unlock(event, ctx);
5966	}
5967	mutex_unlock(&current->perf_event_mutex);
5968
5969	return 0;
5970}
5971
5972int perf_event_task_disable(void)
5973{
5974	struct perf_event_context *ctx;
5975	struct perf_event *event;
5976
5977	mutex_lock(&current->perf_event_mutex);
5978	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5979		ctx = perf_event_ctx_lock(event);
5980		perf_event_for_each_child(event, _perf_event_disable);
5981		perf_event_ctx_unlock(event, ctx);
5982	}
5983	mutex_unlock(&current->perf_event_mutex);
5984
5985	return 0;
5986}
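
/*
 * perf_event_task_enable()/perf_event_task_disable() back the
 * prctl(PR_TASK_PERF_EVENTS_ENABLE) and prctl(PR_TASK_PERF_EVENTS_DISABLE)
 * operations, which flip every counter owned by the calling task.
 * Illustrative userspace sketch (run_setup() and run_measured_region()
 * are hypothetical helpers):
 *
 *	#include <sys/prctl.h>
 *
 *	prctl(PR_TASK_PERF_EVENTS_DISABLE, 0, 0, 0, 0);
 *	run_setup();			// not counted
 *	prctl(PR_TASK_PERF_EVENTS_ENABLE, 0, 0, 0, 0);
 *	run_measured_region();		// counted
 */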
5987
5988static int perf_event_index(struct perf_event *event)
5989{
5990	if (event->hw.state & PERF_HES_STOPPED)
5991		return 0;
5992
5993	if (event->state != PERF_EVENT_STATE_ACTIVE)
5994		return 0;
5995
5996	return event->pmu->event_idx(event);
5997}
5998
5999static void perf_event_init_userpage(struct perf_event *event)
6000{
6001	struct perf_event_mmap_page *userpg;
6002	struct perf_buffer *rb;
6003
6004	rcu_read_lock();
6005	rb = rcu_dereference(event->rb);
6006	if (!rb)
6007		goto unlock;
6008
6009	userpg = rb->user_page;
6010
6011	/* Allow new userspace to detect that bit 0 is deprecated */
6012	userpg->cap_bit0_is_deprecated = 1;
6013	userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
6014	userpg->data_offset = PAGE_SIZE;
6015	userpg->data_size = perf_data_size(rb);
6016
6017unlock:
6018	rcu_read_unlock();
6019}
6020
6021void __weak arch_perf_update_userpage(
6022	struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
6023{
6024}
6025
6026/*
6027 * Callers need to ensure there can be no nesting of this function, otherwise
6028 * the seqlock logic goes bad. We cannot serialize this because the arch
6029 * code calls this from NMI context.
6030 */
6031void perf_event_update_userpage(struct perf_event *event)
6032{
6033	struct perf_event_mmap_page *userpg;
6034	struct perf_buffer *rb;
6035	u64 enabled, running, now;
6036
6037	rcu_read_lock();
6038	rb = rcu_dereference(event->rb);
6039	if (!rb)
6040		goto unlock;
6041
6042	/*
6043	 * Compute total_time_enabled and total_time_running
6044	 * based on the snapshot values taken when the event
6045	 * was last scheduled in.
6046	 *
6047	 * We cannot simply call update_context_time()
6048	 * because of locking issues, as we can be
6049	 * called in NMI context.
6050	 */
6051	calc_timer_values(event, &now, &enabled, &running);
6052
6053	userpg = rb->user_page;
6054	/*
6055	 * Disable preemption to guarantee consistent time stamps are stored to
6056	 * the user page.
6057	 */
6058	preempt_disable();
6059	++userpg->lock;
6060	barrier();
6061	userpg->index = perf_event_index(event);
6062	userpg->offset = perf_event_count(event);
6063	if (userpg->index)
6064		userpg->offset -= local64_read(&event->hw.prev_count);
6065
6066	userpg->time_enabled = enabled +
6067			atomic64_read(&event->child_total_time_enabled);
6068
6069	userpg->time_running = running +
6070			atomic64_read(&event->child_total_time_running);
6071
6072	arch_perf_update_userpage(event, userpg, now);
6073
6074	barrier();
6075	++userpg->lock;
6076	preempt_enable();
6077unlock:
6078	rcu_read_unlock();
6079}
6080EXPORT_SYMBOL_GPL(perf_event_update_userpage);
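
/*
 * The ->lock field above acts as an embedded sequence count: it is
 * incremented once before and once after the update, so a userspace
 * reader simply retries whenever the value changed underneath it.  A
 * minimal sketch of the reader side (illustrative, following the pattern
 * documented in include/uapi/linux/perf_event.h; 'pc' points at the
 * mmap()ed first page of the event, and read_pmc() stands in for an
 * arch-specific counter read such as x86 RDPMC):
 *
 *	struct perf_event_mmap_page *pc = mapped_user_page;
 *	u32 seq, idx;
 *	u64 count, enabled, running;
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *
 *		enabled = pc->time_enabled;
 *		running = pc->time_running;
 *		idx     = pc->index;
 *		count   = pc->offset;
 *		if (pc->cap_user_rdpmc && idx)
 *			count += read_pmc(idx - 1);
 *
 *		barrier();
 *	} while (pc->lock != seq);
 */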
6081
6082static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
6083{
6084	struct perf_event *event = vmf->vma->vm_file->private_data;
6085	struct perf_buffer *rb;
6086	vm_fault_t ret = VM_FAULT_SIGBUS;
6087
6088	if (vmf->flags & FAULT_FLAG_MKWRITE) {
6089		if (vmf->pgoff == 0)
6090			ret = 0;
6091		return ret;
6092	}
6093
6094	rcu_read_lock();
6095	rb = rcu_dereference(event->rb);
6096	if (!rb)
6097		goto unlock;
6098
6099	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
6100		goto unlock;
6101
6102	vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
6103	if (!vmf->page)
6104		goto unlock;
6105
6106	get_page(vmf->page);
6107	vmf->page->mapping = vmf->vma->vm_file->f_mapping;
6108	vmf->page->index   = vmf->pgoff;
6109
6110	ret = 0;
6111unlock:
6112	rcu_read_unlock();
6113
6114	return ret;
6115}
6116
6117static void ring_buffer_attach(struct perf_event *event,
6118			       struct perf_buffer *rb)
6119{
6120	struct perf_buffer *old_rb = NULL;
6121	unsigned long flags;
6122
6123	WARN_ON_ONCE(event->parent);
6124
6125	if (event->rb) {
6126		/*
6127		 * Should be impossible: we set this when removing
6128		 * event->rb_entry and wait/clear when adding event->rb_entry.
6129		 */
6130		WARN_ON_ONCE(event->rcu_pending);
6131
6132		old_rb = event->rb;
6133		spin_lock_irqsave(&old_rb->event_lock, flags);
6134		list_del_rcu(&event->rb_entry);
6135		spin_unlock_irqrestore(&old_rb->event_lock, flags);
6136
6137		event->rcu_batches = get_state_synchronize_rcu();
6138		event->rcu_pending = 1;
6139	}
6140
6141	if (rb) {
6142		if (event->rcu_pending) {
6143			cond_synchronize_rcu(event->rcu_batches);
6144			event->rcu_pending = 0;
6145		}
6146
6147		spin_lock_irqsave(&rb->event_lock, flags);
6148		list_add_rcu(&event->rb_entry, &rb->event_list);
6149		spin_unlock_irqrestore(&rb->event_lock, flags);
6150	}
6151
6152	/*
6153	 * Avoid racing with perf_mmap_close(AUX): stop the event
6154	 * before swizzling the event::rb pointer; if it's getting
6155	 * unmapped, its aux_mmap_count will be 0 and it won't
6156	 * restart. See the comment in __perf_pmu_output_stop().
6157	 *
6158	 * Data will inevitably be lost when set_output is done in
6159	 * mid-air, but then again, whoever does it like this is
6160	 * not in for the data anyway.
6161	 */
6162	if (has_aux(event))
6163		perf_event_stop(event, 0);
6164
6165	rcu_assign_pointer(event->rb, rb);
6166
6167	if (old_rb) {
6168		ring_buffer_put(old_rb);
6169		/*
6170		 * Since we detached the old rb before attaching the new one,
6171		 * we could have missed a wakeup.
6172		 * Provide it now.
6173		 */
6174		wake_up_all(&event->waitq);
6175	}
6176}
6177
6178static void ring_buffer_wakeup(struct perf_event *event)
6179{
6180	struct perf_buffer *rb;
6181
6182	if (event->parent)
6183		event = event->parent;
6184
6185	rcu_read_lock();
6186	rb = rcu_dereference(event->rb);
6187	if (rb) {
6188		list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
6189			wake_up_all(&event->waitq);
6190	}
6191	rcu_read_unlock();
6192}
6193
6194struct perf_buffer *ring_buffer_get(struct perf_event *event)
6195{
6196	struct perf_buffer *rb;
6197
6198	if (event->parent)
6199		event = event->parent;
6200
6201	rcu_read_lock();
6202	rb = rcu_dereference(event->rb);
6203	if (rb) {
6204		if (!refcount_inc_not_zero(&rb->refcount))
6205			rb = NULL;
6206	}
6207	rcu_read_unlock();
6208
6209	return rb;
6210}
6211
6212void ring_buffer_put(struct perf_buffer *rb)
6213{
6214	if (!refcount_dec_and_test(&rb->refcount))
6215		return;
6216
6217	WARN_ON_ONCE(!list_empty(&rb->event_list));
6218
6219	call_rcu(&rb->rcu_head, rb_free_rcu);
6220}
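
/*
 * ring_buffer_get()/ring_buffer_put() are an instance of the common
 * RCU + refcount lookup pattern: a lookup can race with the final put,
 * so it only takes a reference while the count is still non-zero, and
 * the buffer is freed via call_rcu() so that concurrent lookups inside
 * rcu_read_lock() never touch freed memory.  Generic sketch of the
 * pattern ('struct obj' and obj_free_rcu() are hypothetical):
 *
 *	struct obj *obj_get(struct obj __rcu **slot)
 *	{
 *		struct obj *obj;
 *
 *		rcu_read_lock();
 *		obj = rcu_dereference(*slot);
 *		if (obj && !refcount_inc_not_zero(&obj->ref))
 *			obj = NULL;
 *		rcu_read_unlock();
 *
 *		return obj;
 *	}
 *
 *	void obj_put(struct obj *obj)
 *	{
 *		if (refcount_dec_and_test(&obj->ref))
 *			call_rcu(&obj->rcu_head, obj_free_rcu);
 *	}
 */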
6221
6222static void perf_mmap_open(struct vm_area_struct *vma)
6223{
6224	struct perf_event *event = vma->vm_file->private_data;
6225
6226	atomic_inc(&event->mmap_count);
6227	atomic_inc(&event->rb->mmap_count);
6228
6229	if (vma->vm_pgoff)
6230		atomic_inc(&event->rb->aux_mmap_count);
6231
6232	if (event->pmu->event_mapped)
6233		event->pmu->event_mapped(event, vma->vm_mm);
6234}
6235
6236static void perf_pmu_output_stop(struct perf_event *event);
6237
6238/*
6239 * A buffer can be mmap()ed multiple times; either directly through the same
6240 * event, or through other events by use of perf_event_set_output().
6241 *
6242 * In order to undo the VM accounting done by perf_mmap() we need to destroy
6243 * the buffer here, where we still have a VM context. This means we need
6244 * to detach all events redirecting to us.
6245 */
6246static void perf_mmap_close(struct vm_area_struct *vma)
6247{
6248	struct perf_event *event = vma->vm_file->private_data;
6249	struct perf_buffer *rb = ring_buffer_get(event);
6250	struct user_struct *mmap_user = rb->mmap_user;
6251	int mmap_locked = rb->mmap_locked;
6252	unsigned long size = perf_data_size(rb);
6253	bool detach_rest = false;
6254
6255	if (event->pmu->event_unmapped)
6256		event->pmu->event_unmapped(event, vma->vm_mm);
6257
6258	/*
6259	 * rb->aux_mmap_count will always drop before rb->mmap_count and
6260	 * event->mmap_count, so it is ok to use event->mmap_mutex to
6261	 * serialize with perf_mmap here.
6262	 */
6263	if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
6264	    atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
6265		/*
6266		 * Stop all AUX events that are writing to this buffer,
6267		 * so that we can free its AUX pages and corresponding PMU
6268		 * data. Note that after rb::aux_mmap_count dropped to zero,
6269		 * they won't start any more (see perf_aux_output_begin()).
6270		 */
6271		perf_pmu_output_stop(event);
6272
6273		/* now it's safe to free the pages */
6274		atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
6275		atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
6276
6277		/* this has to be the last one */
6278		rb_free_aux(rb);
6279		WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
6280
6281		mutex_unlock(&event->mmap_mutex);
6282	}
6283
6284	if (atomic_dec_and_test(&rb->mmap_count))
6285		detach_rest = true;
6286
6287	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
6288		goto out_put;
6289
6290	ring_buffer_attach(event, NULL);
6291	mutex_unlock(&event->mmap_mutex);
6292
6293	/* If there's still other mmap()s of this buffer, we're done. */
6294	if (!detach_rest)
6295		goto out_put;
6296
6297	/*
6298	 * No other mmap()s, detach from all other events that might redirect
6299	 * into the now unreachable buffer. Somewhat complicated by the
6300	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
6301	 */
6302again:
6303	rcu_read_lock();
6304	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
6305		if (!atomic_long_inc_not_zero(&event->refcount)) {
6306			/*
6307			 * This event is en-route to free_event() which will
6308			 * detach it and remove it from the list.
6309			 */
6310			continue;
6311		}
6312		rcu_read_unlock();
6313
6314		mutex_lock(&event->mmap_mutex);
6315		/*
6316		 * Check we didn't race with perf_event_set_output() which can
6317		 * swizzle the rb from under us while we were waiting to
6318		 * acquire mmap_mutex.
6319		 *
6320		 * If we find a different rb, ignore this event; the next
6321		 * iteration will no longer find it on the list. We still
6322		 * have to restart the iteration to make sure we're not now
6323		 * iterating the wrong list.
6324		 */
6325		if (event->rb == rb)
6326			ring_buffer_attach(event, NULL);
6327
6328		mutex_unlock(&event->mmap_mutex);
6329		put_event(event);
6330
6331		/*
6332		 * Restart the iteration; either we're on the wrong list or
6333		 * we've destroyed its integrity by doing a deletion.
6334		 */
6335		goto again;
6336	}
6337	rcu_read_unlock();
6338
6339	/*
6340	 * It could be that there are still a few 0-ref events on the list; they'll
6341	 * get cleaned up by free_event() -- they'll also still have their
6342	 * ref on the rb and will free it whenever they are done with it.
6343	 *
6344	 * Aside from that, this buffer is 'fully' detached and unmapped,
6345	 * undo the VM accounting.
6346	 */
6347
6348	atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
6349			&mmap_user->locked_vm);
6350	atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
6351	free_uid(mmap_user);
6352
6353out_put:
6354	ring_buffer_put(rb); /* could be last */
6355}
6356
6357static const struct vm_operations_struct perf_mmap_vmops = {
6358	.open		= perf_mmap_open,
6359	.close		= perf_mmap_close, /* non mergeable */
6360	.fault		= perf_mmap_fault,
6361	.page_mkwrite	= perf_mmap_fault,
6362};
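
/*
 * Userspace view of the layout that perf_mmap() below validates: the
 * first mapping covers the struct perf_event_mmap_page plus 2^n data
 * pages at file offset 0; an optional AUX area is a second mmap() at an
 * offset above that, published through the user page first.  Illustrative
 * sketch ('fd' is a perf_event_open() fd, the sizes are arbitrary
 * power-of-two choices, error handling omitted):
 *
 *	size_t page = sysconf(_SC_PAGESIZE);
 *	size_t data = 64 * page;		// data area: 2^n pages
 *	struct perf_event_mmap_page *up;
 *	void *aux;
 *
 *	up = mmap(NULL, page + data, PROT_READ | PROT_WRITE,
 *		  MAP_SHARED, fd, 0);
 *
 *	up->aux_offset = page + data;		// must lie above the data area
 *	up->aux_size   = 16 * page;
 *	aux = mmap(NULL, up->aux_size, PROT_READ | PROT_WRITE,
 *		   MAP_SHARED, fd, up->aux_offset);
 */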
6363
6364static int perf_mmap(struct file *file, struct vm_area_struct *vma)
6365{
6366	struct perf_event *event = file->private_data;
6367	unsigned long user_locked, user_lock_limit;
6368	struct user_struct *user = current_user();
6369	struct perf_buffer *rb = NULL;
6370	unsigned long locked, lock_limit;
6371	unsigned long vma_size;
6372	unsigned long nr_pages;
6373	long user_extra = 0, extra = 0;
6374	int ret = 0, flags = 0;
6375
6376	/*
6377	 * Don't allow mmap() of inherited per-task counters. This would
6378	 * create a performance issue due to all children writing to the
6379	 * same rb.
6380	 */
6381	if (event->cpu == -1 && event->attr.inherit)
6382		return -EINVAL;
6383
6384	if (!(vma->vm_flags & VM_SHARED))
6385		return -EINVAL;
6386
6387	ret = security_perf_event_read(event);
6388	if (ret)
6389		return ret;
6390
6391	vma_size = vma->vm_end - vma->vm_start;
6392
6393	if (vma->vm_pgoff == 0) {
6394		nr_pages = (vma_size / PAGE_SIZE) - 1;
6395	} else {
6396		/*
6397		 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
6398		 * mapped; all subsequent mappings must have the same size
6399		 * and offset, and must be above the normal perf buffer.
6400		 */
6401		u64 aux_offset, aux_size;
6402
6403		if (!<