1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Performance events core code:
4 *
5 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
6 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
7 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
9 */
10
11#include <linux/fs.h>
12#include <linux/mm.h>
13#include <linux/cpu.h>
14#include <linux/smp.h>
15#include <linux/idr.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/slab.h>
19#include <linux/hash.h>
20#include <linux/tick.h>
21#include <linux/sysfs.h>
22#include <linux/dcache.h>
23#include <linux/percpu.h>
24#include <linux/ptrace.h>
25#include <linux/reboot.h>
26#include <linux/vmstat.h>
27#include <linux/device.h>
28#include <linux/export.h>
29#include <linux/vmalloc.h>
30#include <linux/hardirq.h>
31#include <linux/hugetlb.h>
32#include <linux/rculist.h>
33#include <linux/uaccess.h>
34#include <linux/syscalls.h>
35#include <linux/anon_inodes.h>
36#include <linux/kernel_stat.h>
37#include <linux/cgroup.h>
38#include <linux/perf_event.h>
39#include <linux/trace_events.h>
40#include <linux/hw_breakpoint.h>
41#include <linux/mm_types.h>
42#include <linux/module.h>
43#include <linux/mman.h>
44#include <linux/compat.h>
45#include <linux/bpf.h>
46#include <linux/filter.h>
47#include <linux/namei.h>
48#include <linux/parser.h>
49#include <linux/sched/clock.h>
50#include <linux/sched/mm.h>
51#include <linux/proc_ns.h>
52#include <linux/mount.h>
53#include <linux/min_heap.h>
54#include <linux/highmem.h>
55#include <linux/pgtable.h>
56#include <linux/buildid.h>
57#include <linux/task_work.h>
58
59#include "internal.h"
60
61#include <asm/irq_regs.h>
62
63typedef int (*remote_function_f)(void *);
64
65struct remote_function_call {
66	struct task_struct	*p;
67	remote_function_f	func;
68	void			*info;
69	int			ret;
70};
71
72static void remote_function(void *data)
73{
74	struct remote_function_call *tfc = data;
75	struct task_struct *p = tfc->p;
76
77	if (p) {
78		/* -EAGAIN */
79		if (task_cpu(p) != smp_processor_id())
80			return;
81
82		/*
83		 * Now that we're on right CPU with IRQs disabled, we can test
84		 * if we hit the right task without races.
85		 */
86
87		tfc->ret = -ESRCH; /* No such (running) process */
88		if (p != current)
89			return;
90	}
91
92	tfc->ret = tfc->func(tfc->info);
93}
94
95/**
96 * task_function_call - call a function on the cpu on which a task runs
97 * @p:		the task to evaluate
98 * @func:	the function to be called
99 * @info:	the function call argument
100 *
 * Calls the function @func on the CPU on which @p is currently running.
 * This might be the current CPU, in which case the function is just called
 * directly.  This will retry on failures in smp_call_function_single(),
 * such as if the task_cpu() goes offline concurrently.
 *
 * Returns @func's return value, or -ESRCH/-ENXIO when the process isn't
 * running.
107 */
108static int
109task_function_call(struct task_struct *p, remote_function_f func, void *info)
110{
111	struct remote_function_call data = {
112		.p	= p,
113		.func	= func,
114		.info	= info,
115		.ret	= -EAGAIN,
116	};
117	int ret;
118
119	for (;;) {
120		ret = smp_call_function_single(task_cpu(p), remote_function,
121					       &data, 1);
122		if (!ret)
123			ret = data.ret;
124
125		if (ret != -EAGAIN)
126			break;
127
128		cond_resched();
129	}
130
131	return ret;
132}
133
134/**
135 * cpu_function_call - call a function on the cpu
136 * @cpu:	target cpu to queue this function
137 * @func:	the function to be called
138 * @info:	the function call argument
139 *
140 * Calls the function @func on the remote cpu.
141 *
 * Returns @func's return value, or -ENXIO when the cpu is offline.
143 */
144static int cpu_function_call(int cpu, remote_function_f func, void *info)
145{
146	struct remote_function_call data = {
147		.p	= NULL,
148		.func	= func,
149		.info	= info,
150		.ret	= -ENXIO, /* No such CPU */
151	};
152
153	smp_call_function_single(cpu, remote_function, &data, 1);
154
155	return data.ret;
156}
157
158static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
159			  struct perf_event_context *ctx)
160{
161	raw_spin_lock(&cpuctx->ctx.lock);
162	if (ctx)
163		raw_spin_lock(&ctx->lock);
164}
165
166static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
167			    struct perf_event_context *ctx)
168{
169	if (ctx)
170		raw_spin_unlock(&ctx->lock);
171	raw_spin_unlock(&cpuctx->ctx.lock);
172}
173
174#define TASK_TOMBSTONE ((void *)-1L)
175
176static bool is_kernel_event(struct perf_event *event)
177{
178	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
179}
180
181static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
182
183struct perf_event_context *perf_cpu_task_ctx(void)
184{
185	lockdep_assert_irqs_disabled();
186	return this_cpu_ptr(&perf_cpu_context)->task_ctx;
187}
188
189/*
190 * On task ctx scheduling...
191 *
192 * When !ctx->nr_events a task context will not be scheduled. This means
193 * we can disable the scheduler hooks (for performance) without leaving
194 * pending task ctx state.
195 *
196 * This however results in two special cases:
197 *
 *  - removing the last event from a task ctx; this is relatively
 *    straightforward and is done in __perf_remove_from_context.
200 *
201 *  - adding the first event to a task ctx; this is tricky because we cannot
202 *    rely on ctx->is_active and therefore cannot use event_function_call().
203 *    See perf_install_in_context().
204 *
205 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
206 */
207
208typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
209			struct perf_event_context *, void *);
210
211struct event_function_struct {
212	struct perf_event *event;
213	event_f func;
214	void *data;
215};
216
217static int event_function(void *info)
218{
219	struct event_function_struct *efs = info;
220	struct perf_event *event = efs->event;
221	struct perf_event_context *ctx = event->ctx;
222	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
223	struct perf_event_context *task_ctx = cpuctx->task_ctx;
224	int ret = 0;
225
226	lockdep_assert_irqs_disabled();
227
228	perf_ctx_lock(cpuctx, task_ctx);
229	/*
230	 * Since we do the IPI call without holding ctx->lock things can have
231	 * changed, double check we hit the task we set out to hit.
232	 */
233	if (ctx->task) {
234		if (ctx->task != current) {
235			ret = -ESRCH;
236			goto unlock;
237		}
238
239		/*
240		 * We only use event_function_call() on established contexts,
241		 * and event_function() is only ever called when active (or
242		 * rather, we'll have bailed in task_function_call() or the
243		 * above ctx->task != current test), therefore we must have
244		 * ctx->is_active here.
245		 */
246		WARN_ON_ONCE(!ctx->is_active);
247		/*
248		 * And since we have ctx->is_active, cpuctx->task_ctx must
249		 * match.
250		 */
251		WARN_ON_ONCE(task_ctx != ctx);
252	} else {
253		WARN_ON_ONCE(&cpuctx->ctx != ctx);
254	}
255
256	efs->func(event, cpuctx, ctx, efs->data);
257unlock:
258	perf_ctx_unlock(cpuctx, task_ctx);
259
260	return ret;
261}
262
263static void event_function_call(struct perf_event *event, event_f func, void *data)
264{
265	struct perf_event_context *ctx = event->ctx;
266	struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
267	struct event_function_struct efs = {
268		.event = event,
269		.func = func,
270		.data = data,
271	};
272
273	if (!event->parent) {
274		/*
275		 * If this is a !child event, we must hold ctx::mutex to
276		 * stabilize the event->ctx relation. See
277		 * perf_event_ctx_lock().
278		 */
279		lockdep_assert_held(&ctx->mutex);
280	}
281
282	if (!task) {
283		cpu_function_call(event->cpu, event_function, &efs);
284		return;
285	}
286
287	if (task == TASK_TOMBSTONE)
288		return;
289
290again:
291	if (!task_function_call(task, event_function, &efs))
292		return;
293
294	raw_spin_lock_irq(&ctx->lock);
295	/*
296	 * Reload the task pointer, it might have been changed by
297	 * a concurrent perf_event_context_sched_out().
298	 */
299	task = ctx->task;
300	if (task == TASK_TOMBSTONE) {
301		raw_spin_unlock_irq(&ctx->lock);
302		return;
303	}
304	if (ctx->is_active) {
305		raw_spin_unlock_irq(&ctx->lock);
306		goto again;
307	}
308	func(event, NULL, ctx, data);
309	raw_spin_unlock_irq(&ctx->lock);
310}
311
312/*
313 * Similar to event_function_call() + event_function(), but hard assumes IRQs
314 * are already disabled and we're on the right CPU.
315 */
316static void event_function_local(struct perf_event *event, event_f func, void *data)
317{
318	struct perf_event_context *ctx = event->ctx;
319	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
320	struct task_struct *task = READ_ONCE(ctx->task);
321	struct perf_event_context *task_ctx = NULL;
322
323	lockdep_assert_irqs_disabled();
324
325	if (task) {
326		if (task == TASK_TOMBSTONE)
327			return;
328
329		task_ctx = ctx;
330	}
331
332	perf_ctx_lock(cpuctx, task_ctx);
333
334	task = ctx->task;
335	if (task == TASK_TOMBSTONE)
336		goto unlock;
337
338	if (task) {
339		/*
340		 * We must be either inactive or active and the right task,
341		 * otherwise we're screwed, since we cannot IPI to somewhere
342		 * else.
343		 */
344		if (ctx->is_active) {
345			if (WARN_ON_ONCE(task != current))
346				goto unlock;
347
348			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
349				goto unlock;
350		}
351	} else {
352		WARN_ON_ONCE(&cpuctx->ctx != ctx);
353	}
354
355	func(event, cpuctx, ctx, data);
356unlock:
357	perf_ctx_unlock(cpuctx, task_ctx);
358}
359
360#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
361		       PERF_FLAG_FD_OUTPUT  |\
362		       PERF_FLAG_PID_CGROUP |\
363		       PERF_FLAG_FD_CLOEXEC)
364
365/*
366 * branch priv levels that need permission checks
367 */
368#define PERF_SAMPLE_BRANCH_PERM_PLM \
369	(PERF_SAMPLE_BRANCH_KERNEL |\
370	 PERF_SAMPLE_BRANCH_HV)
371
372enum event_type_t {
373	EVENT_FLEXIBLE = 0x1,
374	EVENT_PINNED = 0x2,
375	EVENT_TIME = 0x4,
376	/* see ctx_resched() for details */
377	EVENT_CPU = 0x8,
378	EVENT_CGROUP = 0x10,
379	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
380};
381
382/*
383 * perf_sched_events : >0 events exist
384 */
385
386static void perf_sched_delayed(struct work_struct *work);
387DEFINE_STATIC_KEY_FALSE(perf_sched_events);
388static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
389static DEFINE_MUTEX(perf_sched_mutex);
390static atomic_t perf_sched_count;
391
392static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
393
394static atomic_t nr_mmap_events __read_mostly;
395static atomic_t nr_comm_events __read_mostly;
396static atomic_t nr_namespaces_events __read_mostly;
397static atomic_t nr_task_events __read_mostly;
398static atomic_t nr_freq_events __read_mostly;
399static atomic_t nr_switch_events __read_mostly;
400static atomic_t nr_ksymbol_events __read_mostly;
401static atomic_t nr_bpf_events __read_mostly;
402static atomic_t nr_cgroup_events __read_mostly;
403static atomic_t nr_text_poke_events __read_mostly;
404static atomic_t nr_build_id_events __read_mostly;
405
406static LIST_HEAD(pmus);
407static DEFINE_MUTEX(pmus_lock);
408static struct srcu_struct pmus_srcu;
409static cpumask_var_t perf_online_mask;
410static struct kmem_cache *perf_event_cache;
411
412/*
413 * perf event paranoia level:
414 *  -1 - not paranoid at all
415 *   0 - disallow raw tracepoint access for unpriv
416 *   1 - disallow cpu events for unpriv
417 *   2 - disallow kernel profiling for unpriv
418 */
419int sysctl_perf_event_paranoid __read_mostly = 2;
420
421/* Minimum for 512 kiB + 1 user control page */
422int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
423
424/*
425 * max perf event sample rate
426 */
427#define DEFAULT_MAX_SAMPLE_RATE		100000
428#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
429#define DEFAULT_CPU_TIME_MAX_PERCENT	25
430
431int sysctl_perf_event_sample_rate __read_mostly	= DEFAULT_MAX_SAMPLE_RATE;
432
433static int max_samples_per_tick __read_mostly	= DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
434static int perf_sample_period_ns __read_mostly	= DEFAULT_SAMPLE_PERIOD_NS;
435
436static int perf_sample_allowed_ns __read_mostly =
437	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
438
439static void update_perf_cpu_limits(void)
440{
441	u64 tmp = perf_sample_period_ns;
442
443	tmp *= sysctl_perf_cpu_time_max_percent;
444	tmp = div_u64(tmp, 100);
445	if (!tmp)
446		tmp = 1;
447
448	WRITE_ONCE(perf_sample_allowed_ns, tmp);
449}
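
/*
 * Illustrative numbers: with the defaults above (a 100000 Hz max sample
 * rate and a 25% CPU budget), perf_sample_period_ns is
 * NSEC_PER_SEC / 100000 = 10000ns and update_perf_cpu_limits() yields
 * perf_sample_allowed_ns = 10000 * 25 / 100 = 2500ns per sample.
 */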
450
451static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
452
453int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
454				       void *buffer, size_t *lenp, loff_t *ppos)
455{
456	int ret;
457	int perf_cpu = sysctl_perf_cpu_time_max_percent;
458	/*
459	 * If throttling is disabled don't allow the write:
460	 */
461	if (write && (perf_cpu == 100 || perf_cpu == 0))
462		return -EINVAL;
463
464	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
465	if (ret || !write)
466		return ret;
467
468	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
469	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
470	update_perf_cpu_limits();
471
472	return 0;
473}
474
475int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
476
477int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
478		void *buffer, size_t *lenp, loff_t *ppos)
479{
480	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
481
482	if (ret || !write)
483		return ret;
484
485	if (sysctl_perf_cpu_time_max_percent == 100 ||
486	    sysctl_perf_cpu_time_max_percent == 0) {
487		printk(KERN_WARNING
488		       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
489		WRITE_ONCE(perf_sample_allowed_ns, 0);
490	} else {
491		update_perf_cpu_limits();
492	}
493
494	return 0;
495}
496
497/*
498 * perf samples are done in some very critical code paths (NMIs).
499 * If they take too much CPU time, the system can lock up and not
500 * get any real work done.  This will drop the sample rate when
501 * we detect that events are taking too long.
502 */
503#define NR_ACCUMULATED_SAMPLES 128
504static DEFINE_PER_CPU(u64, running_sample_length);
505
506static u64 __report_avg;
507static u64 __report_allowed;
508
509static void perf_duration_warn(struct irq_work *w)
510{
511	printk_ratelimited(KERN_INFO
512		"perf: interrupt took too long (%lld > %lld), lowering "
513		"kernel.perf_event_max_sample_rate to %d\n",
514		__report_avg, __report_allowed,
515		sysctl_perf_event_sample_rate);
516}
517
518static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
519
520void perf_sample_event_took(u64 sample_len_ns)
521{
522	u64 max_len = READ_ONCE(perf_sample_allowed_ns);
523	u64 running_len;
524	u64 avg_len;
525	u32 max;
526
527	if (max_len == 0)
528		return;
529
530	/* Decay the counter by 1 average sample. */
531	running_len = __this_cpu_read(running_sample_length);
532	running_len -= running_len/NR_ACCUMULATED_SAMPLES;
533	running_len += sample_len_ns;
534	__this_cpu_write(running_sample_length, running_len);
535
536	/*
	 * Note: this will be biased artificially low until we have
538	 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
539	 * from having to maintain a count.
540	 */
541	avg_len = running_len/NR_ACCUMULATED_SAMPLES;
542	if (avg_len <= max_len)
543		return;
544
545	__report_avg = avg_len;
546	__report_allowed = max_len;
547
548	/*
549	 * Compute a throttle threshold 25% below the current duration.
550	 */
551	avg_len += avg_len / 4;
552	max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
553	if (avg_len < max)
554		max /= (u32)avg_len;
555	else
556		max = 1;
557
558	WRITE_ONCE(perf_sample_allowed_ns, avg_len);
559	WRITE_ONCE(max_samples_per_tick, max);
560
561	sysctl_perf_event_sample_rate = max * HZ;
562	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
563
564	if (!irq_work_queue(&perf_duration_work)) {
565		early_printk("perf: interrupt took too long (%lld > %lld), lowering "
566			     "kernel.perf_event_max_sample_rate to %d\n",
567			     __report_avg, __report_allowed,
568			     sysctl_perf_event_sample_rate);
569	}
570}
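
/*
 * Worked example of the throttling above (illustrative only, assuming
 * HZ=1000 so TICK_NSEC = 1000000, and the default 25% budget): the per-tick
 * budget is 250000ns.  If the decayed average sample cost is 10000ns, it is
 * padded to 12500ns, max becomes 250000 / 12500 = 20 samples per tick, and
 * kernel.perf_event_max_sample_rate is lowered to 20 * HZ = 20000.
 */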
571
572static atomic64_t perf_event_id;
573
574static void update_context_time(struct perf_event_context *ctx);
575static u64 perf_event_time(struct perf_event *event);
576
577void __weak perf_event_print_debug(void)	{ }
578
579static inline u64 perf_clock(void)
580{
581	return local_clock();
582}
583
584static inline u64 perf_event_clock(struct perf_event *event)
585{
586	return event->clock();
587}
588
589/*
590 * State based event timekeeping...
591 *
592 * The basic idea is to use event->state to determine which (if any) time
593 * fields to increment with the current delta. This means we only need to
594 * update timestamps when we change state or when they are explicitly requested
595 * (read).
596 *
597 * Event groups make things a little more complicated, but not terribly so. The
598 * rules for a group are that if the group leader is OFF the entire group is
 * OFF, irrespective of what the group member states are. This results in
600 * __perf_effective_state().
601 *
 * A further ramification is that when a group leader flips between OFF and
603 * !OFF, we need to update all group member times.
604 *
605 *
606 * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
607 * need to make sure the relevant context time is updated before we try and
608 * update our timestamps.
609 */
610
611static __always_inline enum perf_event_state
612__perf_effective_state(struct perf_event *event)
613{
614	struct perf_event *leader = event->group_leader;
615
616	if (leader->state <= PERF_EVENT_STATE_OFF)
617		return leader->state;
618
619	return event->state;
620}
621
622static __always_inline void
623__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
624{
625	enum perf_event_state state = __perf_effective_state(event);
626	u64 delta = now - event->tstamp;
627
628	*enabled = event->total_time_enabled;
629	if (state >= PERF_EVENT_STATE_INACTIVE)
630		*enabled += delta;
631
632	*running = event->total_time_running;
633	if (state >= PERF_EVENT_STATE_ACTIVE)
634		*running += delta;
635}
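
/*
 * Illustrative numbers: an event that is INACTIVE for 10ms after being
 * enabled and then ACTIVE for 5ms ends up with enabled = 15ms and
 * running = 5ms, because __perf_update_times() adds the delta since
 * ->tstamp to 'enabled' for any state >= INACTIVE but to 'running' only
 * for state >= ACTIVE.
 */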
636
637static void perf_event_update_time(struct perf_event *event)
638{
639	u64 now = perf_event_time(event);
640
641	__perf_update_times(event, now, &event->total_time_enabled,
642					&event->total_time_running);
643	event->tstamp = now;
644}
645
646static void perf_event_update_sibling_time(struct perf_event *leader)
647{
648	struct perf_event *sibling;
649
650	for_each_sibling_event(sibling, leader)
651		perf_event_update_time(sibling);
652}
653
654static void
655perf_event_set_state(struct perf_event *event, enum perf_event_state state)
656{
657	if (event->state == state)
658		return;
659
660	perf_event_update_time(event);
661	/*
662	 * If a group leader gets enabled/disabled all its siblings
663	 * are affected too.
664	 */
665	if ((event->state < 0) ^ (state < 0))
666		perf_event_update_sibling_time(event);
667
668	WRITE_ONCE(event->state, state);
669}
670
671/*
672 * UP store-release, load-acquire
673 */
674
675#define __store_release(ptr, val)					\
676do {									\
677	barrier();							\
678	WRITE_ONCE(*(ptr), (val));					\
679} while (0)
680
681#define __load_acquire(ptr)						\
682({									\
683	__unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr));	\
684	barrier();							\
685	___p;								\
686})
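
/*
 * A sketch of the intended pairing, as used below for
 * perf_cgroup_info::active and ::timeoffset:
 *
 *	publisher (under ctx->lock):
 *		WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
 *		__store_release(&info->active, 1);
 *
 *	consumer (possibly in NMI context):
 *		if (__load_acquire(&info->active))
 *			now += READ_ONCE(info->timeoffset);
 *
 * Plain compiler barriers suffice because the data is strictly per-CPU and
 * the only concurrency is with local interrupts/NMIs -- hence "UP" above.
 */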
687
688static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
689{
690	struct perf_event_pmu_context *pmu_ctx;
691
692	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
693		if (cgroup && !pmu_ctx->nr_cgroups)
694			continue;
695		perf_pmu_disable(pmu_ctx->pmu);
696	}
697}
698
699static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
700{
701	struct perf_event_pmu_context *pmu_ctx;
702
703	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
704		if (cgroup && !pmu_ctx->nr_cgroups)
705			continue;
706		perf_pmu_enable(pmu_ctx->pmu);
707	}
708}
709
710static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
711static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
712
713#ifdef CONFIG_CGROUP_PERF
714
715static inline bool
716perf_cgroup_match(struct perf_event *event)
717{
718	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
719
720	/* @event doesn't care about cgroup */
721	if (!event->cgrp)
722		return true;
723
724	/* wants specific cgroup scope but @cpuctx isn't associated with any */
725	if (!cpuctx->cgrp)
726		return false;
727
728	/*
729	 * Cgroup scoping is recursive.  An event enabled for a cgroup is
730	 * also enabled for all its descendant cgroups.  If @cpuctx's
731	 * cgroup is a descendant of @event's (the test covers identity
732	 * case), it's a match.
733	 */
734	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
735				    event->cgrp->css.cgroup);
736}
737
738static inline void perf_detach_cgroup(struct perf_event *event)
739{
740	css_put(&event->cgrp->css);
741	event->cgrp = NULL;
742}
743
744static inline int is_cgroup_event(struct perf_event *event)
745{
746	return event->cgrp != NULL;
747}
748
749static inline u64 perf_cgroup_event_time(struct perf_event *event)
750{
751	struct perf_cgroup_info *t;
752
753	t = per_cpu_ptr(event->cgrp->info, event->cpu);
754	return t->time;
755}
756
757static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
758{
759	struct perf_cgroup_info *t;
760
761	t = per_cpu_ptr(event->cgrp->info, event->cpu);
762	if (!__load_acquire(&t->active))
763		return t->time;
764	now += READ_ONCE(t->timeoffset);
765	return now;
766}
767
768static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
769{
770	if (adv)
771		info->time += now - info->timestamp;
772	info->timestamp = now;
773	/*
774	 * see update_context_time()
775	 */
776	WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
777}
778
779static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
780{
781	struct perf_cgroup *cgrp = cpuctx->cgrp;
782	struct cgroup_subsys_state *css;
783	struct perf_cgroup_info *info;
784
785	if (cgrp) {
786		u64 now = perf_clock();
787
788		for (css = &cgrp->css; css; css = css->parent) {
789			cgrp = container_of(css, struct perf_cgroup, css);
790			info = this_cpu_ptr(cgrp->info);
791
792			__update_cgrp_time(info, now, true);
793			if (final)
794				__store_release(&info->active, 0);
795		}
796	}
797}
798
799static inline void update_cgrp_time_from_event(struct perf_event *event)
800{
801	struct perf_cgroup_info *info;
802
803	/*
804	 * ensure we access cgroup data only when needed and
805	 * when we know the cgroup is pinned (css_get)
806	 */
807	if (!is_cgroup_event(event))
808		return;
809
810	info = this_cpu_ptr(event->cgrp->info);
811	/*
812	 * Do not update time when cgroup is not active
813	 */
814	if (info->active)
815		__update_cgrp_time(info, perf_clock(), true);
816}
817
818static inline void
819perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
820{
821	struct perf_event_context *ctx = &cpuctx->ctx;
822	struct perf_cgroup *cgrp = cpuctx->cgrp;
823	struct perf_cgroup_info *info;
824	struct cgroup_subsys_state *css;
825
826	/*
827	 * ctx->lock held by caller
828	 * ensure we do not access cgroup data
829	 * unless we have the cgroup pinned (css_get)
830	 */
831	if (!cgrp)
832		return;
833
834	WARN_ON_ONCE(!ctx->nr_cgroups);
835
836	for (css = &cgrp->css; css; css = css->parent) {
837		cgrp = container_of(css, struct perf_cgroup, css);
838		info = this_cpu_ptr(cgrp->info);
839		__update_cgrp_time(info, ctx->timestamp, false);
840		__store_release(&info->active, 1);
841	}
842}
843
844/*
845 * reschedule events based on the cgroup constraint of task.
846 */
847static void perf_cgroup_switch(struct task_struct *task)
848{
849	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
850	struct perf_cgroup *cgrp;
851
852	/*
	 * cpuctx->cgrp is set when the first cgroup event is enabled and
	 * cleared when the last cgroup event is disabled.
855	 */
856	if (READ_ONCE(cpuctx->cgrp) == NULL)
857		return;
858
859	WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
860
861	cgrp = perf_cgroup_from_task(task, NULL);
862	if (READ_ONCE(cpuctx->cgrp) == cgrp)
863		return;
864
865	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
866	perf_ctx_disable(&cpuctx->ctx, true);
867
868	ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
	/*
	 * Must not be done before the ctx sched-out above, because
	 * update_cgrp_time_from_cpuctx() in ctx_sched_out() still
	 * needs the old cpuctx->cgrp.
	 */
874	cpuctx->cgrp = cgrp;
	/*
	 * Set cgrp before the ctx sched-in, so that
	 * perf_cgroup_set_timestamp() in ctx_sched_in() does not
	 * have to pass the task around.
	 */
880	ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
881
882	perf_ctx_enable(&cpuctx->ctx, true);
883	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
884}
885
886static int perf_cgroup_ensure_storage(struct perf_event *event,
887				struct cgroup_subsys_state *css)
888{
889	struct perf_cpu_context *cpuctx;
890	struct perf_event **storage;
891	int cpu, heap_size, ret = 0;
892
893	/*
	 * Allow storage to have sufficient space for an iterator for each
895	 * possibly nested cgroup plus an iterator for events with no cgroup.
896	 */
897	for (heap_size = 1; css; css = css->parent)
898		heap_size++;
899
900	for_each_possible_cpu(cpu) {
901		cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
902		if (heap_size <= cpuctx->heap_size)
903			continue;
904
905		storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
906				       GFP_KERNEL, cpu_to_node(cpu));
907		if (!storage) {
908			ret = -ENOMEM;
909			break;
910		}
911
912		raw_spin_lock_irq(&cpuctx->ctx.lock);
913		if (cpuctx->heap_size < heap_size) {
914			swap(cpuctx->heap, storage);
915			if (storage == cpuctx->heap_default)
916				storage = NULL;
917			cpuctx->heap_size = heap_size;
918		}
919		raw_spin_unlock_irq(&cpuctx->ctx.lock);
920
921		kfree(storage);
922	}
923
924	return ret;
925}
926
927static inline int perf_cgroup_connect(int fd, struct perf_event *event,
928				      struct perf_event_attr *attr,
929				      struct perf_event *group_leader)
930{
931	struct perf_cgroup *cgrp;
932	struct cgroup_subsys_state *css;
933	struct fd f = fdget(fd);
934	int ret = 0;
935
936	if (!f.file)
937		return -EBADF;
938
939	css = css_tryget_online_from_dir(f.file->f_path.dentry,
940					 &perf_event_cgrp_subsys);
941	if (IS_ERR(css)) {
942		ret = PTR_ERR(css);
943		goto out;
944	}
945
946	ret = perf_cgroup_ensure_storage(event, css);
947	if (ret)
948		goto out;
949
950	cgrp = container_of(css, struct perf_cgroup, css);
951	event->cgrp = cgrp;
952
953	/*
954	 * all events in a group must monitor
955	 * the same cgroup because a task belongs
956	 * to only one perf cgroup at a time
957	 */
958	if (group_leader && group_leader->cgrp != cgrp) {
959		perf_detach_cgroup(event);
960		ret = -EINVAL;
961	}
962out:
963	fdput(f);
964	return ret;
965}
966
967static inline void
968perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
969{
970	struct perf_cpu_context *cpuctx;
971
972	if (!is_cgroup_event(event))
973		return;
974
975	event->pmu_ctx->nr_cgroups++;
976
977	/*
978	 * Because cgroup events are always per-cpu events,
979	 * @ctx == &cpuctx->ctx.
980	 */
981	cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
982
983	if (ctx->nr_cgroups++)
984		return;
985
986	cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
987}
988
989static inline void
990perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
991{
992	struct perf_cpu_context *cpuctx;
993
994	if (!is_cgroup_event(event))
995		return;
996
997	event->pmu_ctx->nr_cgroups--;
998
999	/*
1000	 * Because cgroup events are always per-cpu events,
1001	 * @ctx == &cpuctx->ctx.
1002	 */
1003	cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1004
1005	if (--ctx->nr_cgroups)
1006		return;
1007
1008	cpuctx->cgrp = NULL;
1009}
1010
1011#else /* !CONFIG_CGROUP_PERF */
1012
1013static inline bool
1014perf_cgroup_match(struct perf_event *event)
1015{
1016	return true;
1017}
1018
1019static inline void perf_detach_cgroup(struct perf_event *event)
1020{}
1021
1022static inline int is_cgroup_event(struct perf_event *event)
1023{
1024	return 0;
1025}
1026
1027static inline void update_cgrp_time_from_event(struct perf_event *event)
1028{
1029}
1030
1031static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
1032						bool final)
1033{
1034}
1035
1036static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1037				      struct perf_event_attr *attr,
1038				      struct perf_event *group_leader)
1039{
1040	return -EINVAL;
1041}
1042
1043static inline void
1044perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
1045{
1046}
1047
1048static inline u64 perf_cgroup_event_time(struct perf_event *event)
1049{
1050	return 0;
1051}
1052
1053static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
1054{
1055	return 0;
1056}
1057
1058static inline void
1059perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
1060{
1061}
1062
1063static inline void
1064perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1065{
1066}
1067
1068static void perf_cgroup_switch(struct task_struct *task)
1069{
1070}
1071#endif
1072
1073/*
1074 * set default to be dependent on timer tick just
1075 * like original code
1076 */
1077#define PERF_CPU_HRTIMER (1000 / HZ)
1078/*
1079 * function must be called with interrupts disabled
1080 */
1081static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1082{
1083	struct perf_cpu_pmu_context *cpc;
1084	bool rotations;
1085
1086	lockdep_assert_irqs_disabled();
1087
1088	cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer);
1089	rotations = perf_rotate_context(cpc);
1090
1091	raw_spin_lock(&cpc->hrtimer_lock);
1092	if (rotations)
1093		hrtimer_forward_now(hr, cpc->hrtimer_interval);
1094	else
1095		cpc->hrtimer_active = 0;
1096	raw_spin_unlock(&cpc->hrtimer_lock);
1097
1098	return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1099}
1100
1101static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
1102{
1103	struct hrtimer *timer = &cpc->hrtimer;
1104	struct pmu *pmu = cpc->epc.pmu;
1105	u64 interval;
1106
1107	/*
	 * Check that the default is sane; if not set, force it to the
	 * default interval (1/tick).
1110	 */
1111	interval = pmu->hrtimer_interval_ms;
1112	if (interval < 1)
1113		interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1114
1115	cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1116
1117	raw_spin_lock_init(&cpc->hrtimer_lock);
1118	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
1119	timer->function = perf_mux_hrtimer_handler;
1120}
1121
1122static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
1123{
1124	struct hrtimer *timer = &cpc->hrtimer;
1125	unsigned long flags;
1126
1127	raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
1128	if (!cpc->hrtimer_active) {
1129		cpc->hrtimer_active = 1;
1130		hrtimer_forward_now(timer, cpc->hrtimer_interval);
1131		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
1132	}
1133	raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);
1134
1135	return 0;
1136}
1137
1138static int perf_mux_hrtimer_restart_ipi(void *arg)
1139{
1140	return perf_mux_hrtimer_restart(arg);
1141}
1142
1143void perf_pmu_disable(struct pmu *pmu)
1144{
1145	int *count = this_cpu_ptr(pmu->pmu_disable_count);
1146	if (!(*count)++)
1147		pmu->pmu_disable(pmu);
1148}
1149
1150void perf_pmu_enable(struct pmu *pmu)
1151{
1152	int *count = this_cpu_ptr(pmu->pmu_disable_count);
1153	if (!--(*count))
1154		pmu->pmu_enable(pmu);
1155}
1156
1157static void perf_assert_pmu_disabled(struct pmu *pmu)
1158{
1159	WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
1160}
1161
1162static void get_ctx(struct perf_event_context *ctx)
1163{
1164	refcount_inc(&ctx->refcount);
1165}
1166
1167static void *alloc_task_ctx_data(struct pmu *pmu)
1168{
1169	if (pmu->task_ctx_cache)
1170		return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
1171
1172	return NULL;
1173}
1174
1175static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
1176{
1177	if (pmu->task_ctx_cache && task_ctx_data)
1178		kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
1179}
1180
1181static void free_ctx(struct rcu_head *head)
1182{
1183	struct perf_event_context *ctx;
1184
1185	ctx = container_of(head, struct perf_event_context, rcu_head);
1186	kfree(ctx);
1187}
1188
1189static void put_ctx(struct perf_event_context *ctx)
1190{
1191	if (refcount_dec_and_test(&ctx->refcount)) {
1192		if (ctx->parent_ctx)
1193			put_ctx(ctx->parent_ctx);
1194		if (ctx->task && ctx->task != TASK_TOMBSTONE)
1195			put_task_struct(ctx->task);
1196		call_rcu(&ctx->rcu_head, free_ctx);
1197	}
1198}
1199
1200/*
1201 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1202 * perf_pmu_migrate_context() we need some magic.
1203 *
1204 * Those places that change perf_event::ctx will hold both
1205 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1206 *
1207 * Lock ordering is by mutex address. There are two other sites where
1208 * perf_event_context::mutex nests and those are:
1209 *
1210 *  - perf_event_exit_task_context()	[ child , 0 ]
1211 *      perf_event_exit_event()
1212 *        put_event()			[ parent, 1 ]
1213 *
1214 *  - perf_event_init_context()		[ parent, 0 ]
1215 *      inherit_task_group()
1216 *        inherit_group()
1217 *          inherit_event()
1218 *            perf_event_alloc()
1219 *              perf_init_event()
1220 *                perf_try_init_event()	[ child , 1 ]
1221 *
 * While it appears there is an obvious deadlock here -- the parent and child
 * nesting levels are inverted between the two -- this is in fact safe because
 * life-time rules separate them. That is, an exiting task cannot fork, and a
 * spawning task cannot (yet) exit.
1226 *
1227 * But remember that these are parent<->child context relations, and
1228 * migration does not affect children, therefore these two orderings should not
1229 * interact.
1230 *
1231 * The change in perf_event::ctx does not affect children (as claimed above)
1232 * because the sys_perf_event_open() case will install a new event and break
1233 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1234 * concerned with cpuctx and that doesn't have children.
1235 *
1236 * The places that change perf_event::ctx will issue:
1237 *
1238 *   perf_remove_from_context();
1239 *   synchronize_rcu();
1240 *   perf_install_in_context();
1241 *
 * to effect the change. The remove_from_context() + synchronize_rcu() should
1243 * quiesce the event, after which we can install it in the new location. This
1244 * means that only external vectors (perf_fops, prctl) can perturb the event
1245 * while in transit. Therefore all such accessors should also acquire
1246 * perf_event_context::mutex to serialize against this.
1247 *
1248 * However; because event->ctx can change while we're waiting to acquire
1249 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1250 * function.
1251 *
1252 * Lock order:
1253 *    exec_update_lock
1254 *	task_struct::perf_event_mutex
1255 *	  perf_event_context::mutex
1256 *	    perf_event::child_mutex;
1257 *	      perf_event_context::lock
1258 *	    perf_event::mmap_mutex
1259 *	    mmap_lock
1260 *	      perf_addr_filters_head::lock
1261 *
1262 *    cpu_hotplug_lock
1263 *      pmus_lock
1264 *	  cpuctx->mutex / perf_event_context::mutex
1265 */
1266static struct perf_event_context *
1267perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1268{
1269	struct perf_event_context *ctx;
1270
1271again:
1272	rcu_read_lock();
1273	ctx = READ_ONCE(event->ctx);
1274	if (!refcount_inc_not_zero(&ctx->refcount)) {
1275		rcu_read_unlock();
1276		goto again;
1277	}
1278	rcu_read_unlock();
1279
1280	mutex_lock_nested(&ctx->mutex, nesting);
1281	if (event->ctx != ctx) {
1282		mutex_unlock(&ctx->mutex);
1283		put_ctx(ctx);
1284		goto again;
1285	}
1286
1287	return ctx;
1288}
1289
1290static inline struct perf_event_context *
1291perf_event_ctx_lock(struct perf_event *event)
1292{
1293	return perf_event_ctx_lock_nested(event, 0);
1294}
1295
1296static void perf_event_ctx_unlock(struct perf_event *event,
1297				  struct perf_event_context *ctx)
1298{
1299	mutex_unlock(&ctx->mutex);
1300	put_ctx(ctx);
1301}
1302
1303/*
 * This must be done under the ctx->lock, so as to serialize against
1305 * context_equiv(), therefore we cannot call put_ctx() since that might end up
1306 * calling scheduler related locks and ctx->lock nests inside those.
1307 */
1308static __must_check struct perf_event_context *
1309unclone_ctx(struct perf_event_context *ctx)
1310{
1311	struct perf_event_context *parent_ctx = ctx->parent_ctx;
1312
1313	lockdep_assert_held(&ctx->lock);
1314
1315	if (parent_ctx)
1316		ctx->parent_ctx = NULL;
1317	ctx->generation++;
1318
1319	return parent_ctx;
1320}
1321
1322static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1323				enum pid_type type)
1324{
1325	u32 nr;
1326	/*
1327	 * only top level events have the pid namespace they were created in
1328	 */
1329	if (event->parent)
1330		event = event->parent;
1331
1332	nr = __task_pid_nr_ns(p, type, event->ns);
1333	/* avoid -1 if it is idle thread or runs in another ns */
1334	if (!nr && !pid_alive(p))
1335		nr = -1;
1336	return nr;
1337}
1338
1339static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1340{
1341	return perf_event_pid_type(event, p, PIDTYPE_TGID);
1342}
1343
1344static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1345{
1346	return perf_event_pid_type(event, p, PIDTYPE_PID);
1347}
1348
1349/*
1350 * If we inherit events we want to return the parent event id
1351 * to userspace.
1352 */
1353static u64 primary_event_id(struct perf_event *event)
1354{
1355	u64 id = event->id;
1356
1357	if (event->parent)
1358		id = event->parent->id;
1359
1360	return id;
1361}
1362
1363/*
1364 * Get the perf_event_context for a task and lock it.
1365 *
1366 * This has to cope with the fact that until it is locked,
1367 * the context could get moved to another task.
1368 */
1369static struct perf_event_context *
1370perf_lock_task_context(struct task_struct *task, unsigned long *flags)
1371{
1372	struct perf_event_context *ctx;
1373
1374retry:
1375	/*
1376	 * One of the few rules of preemptible RCU is that one cannot do
1377	 * rcu_read_unlock() while holding a scheduler (or nested) lock when
1378	 * part of the read side critical section was irqs-enabled -- see
1379	 * rcu_read_unlock_special().
1380	 *
1381	 * Since ctx->lock nests under rq->lock we must ensure the entire read
1382	 * side critical section has interrupts disabled.
1383	 */
1384	local_irq_save(*flags);
1385	rcu_read_lock();
1386	ctx = rcu_dereference(task->perf_event_ctxp);
1387	if (ctx) {
1388		/*
1389		 * If this context is a clone of another, it might
1390		 * get swapped for another underneath us by
1391		 * perf_event_task_sched_out, though the
1392		 * rcu_read_lock() protects us from any context
1393		 * getting freed.  Lock the context and check if it
1394		 * got swapped before we could get the lock, and retry
1395		 * if so.  If we locked the right context, then it
1396		 * can't get swapped on us any more.
1397		 */
1398		raw_spin_lock(&ctx->lock);
1399		if (ctx != rcu_dereference(task->perf_event_ctxp)) {
1400			raw_spin_unlock(&ctx->lock);
1401			rcu_read_unlock();
1402			local_irq_restore(*flags);
1403			goto retry;
1404		}
1405
1406		if (ctx->task == TASK_TOMBSTONE ||
1407		    !refcount_inc_not_zero(&ctx->refcount)) {
1408			raw_spin_unlock(&ctx->lock);
1409			ctx = NULL;
1410		} else {
1411			WARN_ON_ONCE(ctx->task != task);
1412		}
1413	}
1414	rcu_read_unlock();
1415	if (!ctx)
1416		local_irq_restore(*flags);
1417	return ctx;
1418}
1419
1420/*
1421 * Get the context for a task and increment its pin_count so it
1422 * can't get swapped to another task.  This also increments its
1423 * reference count so that the context can't get freed.
1424 */
1425static struct perf_event_context *
1426perf_pin_task_context(struct task_struct *task)
1427{
1428	struct perf_event_context *ctx;
1429	unsigned long flags;
1430
1431	ctx = perf_lock_task_context(task, &flags);
1432	if (ctx) {
1433		++ctx->pin_count;
1434		raw_spin_unlock_irqrestore(&ctx->lock, flags);
1435	}
1436	return ctx;
1437}
1438
1439static void perf_unpin_context(struct perf_event_context *ctx)
1440{
1441	unsigned long flags;
1442
1443	raw_spin_lock_irqsave(&ctx->lock, flags);
1444	--ctx->pin_count;
1445	raw_spin_unlock_irqrestore(&ctx->lock, flags);
1446}
1447
1448/*
1449 * Update the record of the current time in a context.
1450 */
1451static void __update_context_time(struct perf_event_context *ctx, bool adv)
1452{
1453	u64 now = perf_clock();
1454
1455	lockdep_assert_held(&ctx->lock);
1456
1457	if (adv)
1458		ctx->time += now - ctx->timestamp;
1459	ctx->timestamp = now;
1460
1461	/*
	 * The above: time' = time + (now - timestamp), can be re-arranged
	 * into: time' = now + (time - timestamp), which gives a single value
	 * offset to compute future time without taking locks.
1465	 *
1466	 * See perf_event_time_now(), which can be used from NMI context where
1467	 * it's (obviously) not possible to acquire ctx->lock in order to read
1468	 * both the above values in a consistent manner.
1469	 */
1470	WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
1471}
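
/*
 * Illustrative numbers for the rearrangement above: with ctx->time = 700ns
 * and ctx->timestamp = 1000ns the stored ctx->timeoffset is -300ns, and a
 * lock-free reader at now = 1250ns computes 1250 + (-300) = 950ns -- the
 * same 700 + (1250 - 1000) it would have obtained under ctx->lock.
 */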
1472
1473static void update_context_time(struct perf_event_context *ctx)
1474{
1475	__update_context_time(ctx, true);
1476}
1477
1478static u64 perf_event_time(struct perf_event *event)
1479{
1480	struct perf_event_context *ctx = event->ctx;
1481
1482	if (unlikely(!ctx))
1483		return 0;
1484
1485	if (is_cgroup_event(event))
1486		return perf_cgroup_event_time(event);
1487
1488	return ctx->time;
1489}
1490
1491static u64 perf_event_time_now(struct perf_event *event, u64 now)
1492{
1493	struct perf_event_context *ctx = event->ctx;
1494
1495	if (unlikely(!ctx))
1496		return 0;
1497
1498	if (is_cgroup_event(event))
1499		return perf_cgroup_event_time_now(event, now);
1500
1501	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
1502		return ctx->time;
1503
1504	now += READ_ONCE(ctx->timeoffset);
1505	return now;
1506}
1507
1508static enum event_type_t get_event_type(struct perf_event *event)
1509{
1510	struct perf_event_context *ctx = event->ctx;
1511	enum event_type_t event_type;
1512
1513	lockdep_assert_held(&ctx->lock);
1514
1515	/*
1516	 * It's 'group type', really, because if our group leader is
1517	 * pinned, so are we.
1518	 */
1519	if (event->group_leader != event)
1520		event = event->group_leader;
1521
1522	event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1523	if (!ctx->task)
1524		event_type |= EVENT_CPU;
1525
1526	return event_type;
1527}
1528
1529/*
1530 * Helper function to initialize event group nodes.
1531 */
1532static void init_event_group(struct perf_event *event)
1533{
1534	RB_CLEAR_NODE(&event->group_node);
1535	event->group_index = 0;
1536}
1537
1538/*
1539 * Extract pinned or flexible groups from the context
1540 * based on event attrs bits.
1541 */
1542static struct perf_event_groups *
1543get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1544{
1545	if (event->attr.pinned)
1546		return &ctx->pinned_groups;
1547	else
1548		return &ctx->flexible_groups;
1549}
1550
1551/*
 * Helper function to initialize perf_event_groups trees.
1553 */
1554static void perf_event_groups_init(struct perf_event_groups *groups)
1555{
1556	groups->tree = RB_ROOT;
1557	groups->index = 0;
1558}
1559
1560static inline struct cgroup *event_cgroup(const struct perf_event *event)
1561{
1562	struct cgroup *cgroup = NULL;
1563
1564#ifdef CONFIG_CGROUP_PERF
1565	if (event->cgrp)
1566		cgroup = event->cgrp->css.cgroup;
1567#endif
1568
1569	return cgroup;
1570}
1571
1572/*
1573 * Compare function for event groups;
1574 *
 * Implements a composite key that sorts first by CPU, then by PMU and
 * cgroup, and finally by a virtual index which provides ordering when
 * rotating groups for the same CPU.
1577 */
1578static __always_inline int
1579perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
1580		      const struct cgroup *left_cgroup, const u64 left_group_index,
1581		      const struct perf_event *right)
1582{
1583	if (left_cpu < right->cpu)
1584		return -1;
1585	if (left_cpu > right->cpu)
1586		return 1;
1587
1588	if (left_pmu) {
1589		if (left_pmu < right->pmu_ctx->pmu)
1590			return -1;
1591		if (left_pmu > right->pmu_ctx->pmu)
1592			return 1;
1593	}
1594
1595#ifdef CONFIG_CGROUP_PERF
1596	{
1597		const struct cgroup *right_cgroup = event_cgroup(right);
1598
1599		if (left_cgroup != right_cgroup) {
1600			if (!left_cgroup) {
1601				/*
				 * Left has no cgroup but right does;
				 * events without a cgroup sort first.
1604				 */
1605				return -1;
1606			}
1607			if (!right_cgroup) {
1608				/*
				 * Right has no cgroup but left does;
				 * events without a cgroup sort first.
1611				 */
1612				return 1;
1613			}
1614			/* Two dissimilar cgroups, order by id. */
1615			if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
1616				return -1;
1617
1618			return 1;
1619		}
1620	}
1621#endif
1622
1623	if (left_group_index < right->group_index)
1624		return -1;
1625	if (left_group_index > right->group_index)
1626		return 1;
1627
1628	return 0;
1629}
1630
1631#define __node_2_pe(node) \
1632	rb_entry((node), struct perf_event, group_node)
1633
1634static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
1635{
1636	struct perf_event *e = __node_2_pe(a);
1637	return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e),
1638				     e->group_index, __node_2_pe(b)) < 0;
1639}
1640
1641struct __group_key {
1642	int cpu;
1643	struct pmu *pmu;
1644	struct cgroup *cgroup;
1645};
1646
1647static inline int __group_cmp(const void *key, const struct rb_node *node)
1648{
1649	const struct __group_key *a = key;
1650	const struct perf_event *b = __node_2_pe(node);
1651
1652	/* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */
1653	return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b);
1654}
1655
1656static inline int
1657__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
1658{
1659	const struct __group_key *a = key;
1660	const struct perf_event *b = __node_2_pe(node);
1661
1662	/* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */
1663	return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b),
1664				     b->group_index, b);
1665}
1666
1667/*
1668 * Insert @event into @groups' tree; using
1669 *   {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index}
1670 * as key. This places it last inside the {cpu,pmu,cgroup} subtree.
1671 */
1672static void
1673perf_event_groups_insert(struct perf_event_groups *groups,
1674			 struct perf_event *event)
1675{
1676	event->group_index = ++groups->index;
1677
1678	rb_add(&event->group_node, &groups->tree, __group_less);
1679}
1680
1681/*
1682 * Helper function to insert event into the pinned or flexible groups.
1683 */
1684static void
1685add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1686{
1687	struct perf_event_groups *groups;
1688
1689	groups = get_event_groups(event, ctx);
1690	perf_event_groups_insert(groups, event);
1691}
1692
1693/*
1694 * Delete a group from a tree.
1695 */
1696static void
1697perf_event_groups_delete(struct perf_event_groups *groups,
1698			 struct perf_event *event)
1699{
1700	WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1701		     RB_EMPTY_ROOT(&groups->tree));
1702
1703	rb_erase(&event->group_node, &groups->tree);
1704	init_event_group(event);
1705}
1706
1707/*
1708 * Helper function to delete event from its groups.
1709 */
1710static void
1711del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1712{
1713	struct perf_event_groups *groups;
1714
1715	groups = get_event_groups(event, ctx);
1716	perf_event_groups_delete(groups, event);
1717}
1718
1719/*
1720 * Get the leftmost event in the {cpu,pmu,cgroup} subtree.
1721 */
1722static struct perf_event *
1723perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1724			struct pmu *pmu, struct cgroup *cgrp)
1725{
1726	struct __group_key key = {
1727		.cpu = cpu,
1728		.pmu = pmu,
1729		.cgroup = cgrp,
1730	};
1731	struct rb_node *node;
1732
1733	node = rb_find_first(&key, &groups->tree, __group_cmp);
1734	if (node)
1735		return __node_2_pe(node);
1736
1737	return NULL;
1738}
1739
1740static struct perf_event *
1741perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
1742{
1743	struct __group_key key = {
1744		.cpu = event->cpu,
1745		.pmu = pmu,
1746		.cgroup = event_cgroup(event),
1747	};
1748	struct rb_node *next;
1749
1750	next = rb_next_match(&key, &event->group_node, __group_cmp);
1751	if (next)
1752		return __node_2_pe(next);
1753
1754	return NULL;
1755}
1756
1757#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu)		\
1758	for (event = perf_event_groups_first(groups, cpu, pmu, NULL);	\
1759	     event; event = perf_event_groups_next(event, pmu))
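
/*
 * A hypothetical sketch (not an actual caller): counting the non-cgroup
 * events of one PMU that are installed either for CPU @cpu or for "any
 * CPU" (-1):
 *
 *	struct perf_event *event;
 *	int nr = 0;
 *
 *	perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, -1, pmu)
 *		nr++;
 *	perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu)
 *		nr++;
 *
 * Two walks are needed because events bound to any CPU are keyed with
 * cpu == -1 and therefore live in a different subtree than the cpu-bound
 * ones; cgroup events likewise sit in their own {cpu, pmu, cgroup} subtrees
 * and are not visited when the cgroup key is NULL.
 */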
1760
1761/*
1762 * Iterate through the whole groups tree.
1763 */
1764#define perf_event_groups_for_each(event, groups)			\
1765	for (event = rb_entry_safe(rb_first(&((groups)->tree)),		\
1766				typeof(*event), group_node); event;	\
1767		event = rb_entry_safe(rb_next(&event->group_node),	\
1768				typeof(*event), group_node))
1769
1770/*
 * Add an event to the lists for its context.
1772 * Must be called with ctx->mutex and ctx->lock held.
1773 */
1774static void
1775list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1776{
1777	lockdep_assert_held(&ctx->lock);
1778
1779	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1780	event->attach_state |= PERF_ATTACH_CONTEXT;
1781
1782	event->tstamp = perf_event_time(event);
1783
1784	/*
	 * If we're a standalone event or group leader, we go to the context
	 * list; group events are kept attached to the group so that
	 * perf_group_detach() can, at all times, locate all siblings.
1788	 */
1789	if (event->group_leader == event) {
1790		event->group_caps = event->event_caps;
1791		add_event_to_groups(event, ctx);
1792	}
1793
1794	list_add_rcu(&event->event_entry, &ctx->event_list);
1795	ctx->nr_events++;
1796	if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
1797		ctx->nr_user++;
1798	if (event->attr.inherit_stat)
1799		ctx->nr_stat++;
1800
1801	if (event->state > PERF_EVENT_STATE_OFF)
1802		perf_cgroup_event_enable(event, ctx);
1803
1804	ctx->generation++;
1805	event->pmu_ctx->nr_events++;
1806}
1807
1808/*
1809 * Initialize event state based on the perf_event_attr::disabled.
1810 */
1811static inline void perf_event__state_init(struct perf_event *event)
1812{
1813	event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1814					      PERF_EVENT_STATE_INACTIVE;
1815}
1816
1817static int __perf_event_read_size(u64 read_format, int nr_siblings)
1818{
1819	int entry = sizeof(u64); /* value */
1820	int size = 0;
1821	int nr = 1;
1822
1823	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1824		size += sizeof(u64);
1825
1826	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1827		size += sizeof(u64);
1828
1829	if (read_format & PERF_FORMAT_ID)
1830		entry += sizeof(u64);
1831
1832	if (read_format & PERF_FORMAT_LOST)
1833		entry += sizeof(u64);
1834
1835	if (read_format & PERF_FORMAT_GROUP) {
1836		nr += nr_siblings;
1837		size += sizeof(u64);
1838	}
1839
1840	/*
1841	 * Since perf_event_validate_size() limits this to 16k and inhibits
1842	 * adding more siblings, this will never overflow.
1843	 */
1844	return size + nr * entry;
1845}
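
/*
 * Worked example (illustrative): a group leader with two siblings and
 * read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID has entry = 16 bytes
 * (value + id), nr = 3 and size = 8 for the nr field itself, giving a
 * read_size of 8 + 3 * 16 = 56 bytes -- matching the
 * { u64 nr; { u64 value, id; } cntr[nr]; } layout described for
 * PERF_FORMAT_GROUP in the uapi perf_event.h.
 */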
1846
1847static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1848{
1849	struct perf_sample_data *data;
1850	u16 size = 0;
1851
1852	if (sample_type & PERF_SAMPLE_IP)
1853		size += sizeof(data->ip);
1854
1855	if (sample_type & PERF_SAMPLE_ADDR)
1856		size += sizeof(data->addr);
1857
1858	if (sample_type & PERF_SAMPLE_PERIOD)
1859		size += sizeof(data->period);
1860
1861	if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
1862		size += sizeof(data->weight.full);
1863
1864	if (sample_type & PERF_SAMPLE_READ)
1865		size += event->read_size;
1866
1867	if (sample_type & PERF_SAMPLE_DATA_SRC)
1868		size += sizeof(data->data_src.val);
1869
1870	if (sample_type & PERF_SAMPLE_TRANSACTION)
1871		size += sizeof(data->txn);
1872
1873	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1874		size += sizeof(data->phys_addr);
1875
1876	if (sample_type & PERF_SAMPLE_CGROUP)
1877		size += sizeof(data->cgroup);
1878
1879	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
1880		size += sizeof(data->data_page_size);
1881
1882	if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
1883		size += sizeof(data->code_page_size);
1884
1885	event->header_size = size;
1886}
1887
1888/*
1889 * Called at perf_event creation and when events are attached/detached from a
1890 * group.
1891 */
1892static void perf_event__header_size(struct perf_event *event)
1893{
1894	event->read_size =
1895		__perf_event_read_size(event->attr.read_format,
1896				       event->group_leader->nr_siblings);
1897	__perf_event_header_size(event, event->attr.sample_type);
1898}
1899
1900static void perf_event__id_header_size(struct perf_event *event)
1901{
1902	struct perf_sample_data *data;
1903	u64 sample_type = event->attr.sample_type;
1904	u16 size = 0;
1905
1906	if (sample_type & PERF_SAMPLE_TID)
1907		size += sizeof(data->tid_entry);
1908
1909	if (sample_type & PERF_SAMPLE_TIME)
1910		size += sizeof(data->time);
1911
1912	if (sample_type & PERF_SAMPLE_IDENTIFIER)
1913		size += sizeof(data->id);
1914
1915	if (sample_type & PERF_SAMPLE_ID)
1916		size += sizeof(data->id);
1917
1918	if (sample_type & PERF_SAMPLE_STREAM_ID)
1919		size += sizeof(data->stream_id);
1920
1921	if (sample_type & PERF_SAMPLE_CPU)
1922		size += sizeof(data->cpu_entry);
1923
1924	event->id_header_size = size;
1925}
1926
1927/*
1928 * Check that adding an event to the group does not result in anybody
1929 * overflowing the 64k event limit imposed by the output buffer.
1930 *
1931 * Specifically, check that the read_size for the event does not exceed 16k,
 * read_size being the one term that grows with the group's size. Since read_size
1933 * depends on per-event read_format, also (re)check the existing events.
1934 *
1935 * This leaves 48k for the constant size fields and things like callchains,
1936 * branch stacks and register sets.
1937 */
1938static bool perf_event_validate_size(struct perf_event *event)
1939{
1940	struct perf_event *sibling, *group_leader = event->group_leader;
1941
1942	if (__perf_event_read_size(event->attr.read_format,
1943				   group_leader->nr_siblings + 1) > 16*1024)
1944		return false;
1945
1946	if (__perf_event_read_size(group_leader->attr.read_format,
1947				   group_leader->nr_siblings + 1) > 16*1024)
1948		return false;
1949
1950	/*
1951	 * When creating a new group leader, group_leader->ctx is initialized
1952	 * after the size has been validated, but we cannot safely use
1953	 * for_each_sibling_event() until group_leader->ctx is set. A new group
1954	 * leader cannot have any siblings yet, so we can safely skip checking
1955	 * the non-existent siblings.
1956	 */
1957	if (event == group_leader)
1958		return true;
1959
1960	for_each_sibling_event(sibling, group_leader) {
1961		if (__perf_event_read_size(sibling->attr.read_format,
1962					   group_leader->nr_siblings + 1) > 16*1024)
1963			return false;
1964	}
1965
1966	return true;
1967}
1968
1969static void perf_group_attach(struct perf_event *event)
1970{
1971	struct perf_event *group_leader = event->group_leader, *pos;
1972
1973	lockdep_assert_held(&event->ctx->lock);
1974
1975	/*
1976	 * We can have double attach due to group movement (move_group) in
1977	 * perf_event_open().
1978	 */
1979	if (event->attach_state & PERF_ATTACH_GROUP)
1980		return;
1981
1982	event->attach_state |= PERF_ATTACH_GROUP;
1983
1984	if (group_leader == event)
1985		return;
1986
1987	WARN_ON_ONCE(group_leader->ctx != event->ctx);
1988
1989	group_leader->group_caps &= event->event_caps;
1990
1991	list_add_tail(&event->sibling_list, &group_leader->sibling_list);
1992	group_leader->nr_siblings++;
1993	group_leader->group_generation++;
1994
1995	perf_event__header_size(group_leader);
1996
1997	for_each_sibling_event(pos, group_leader)
1998		perf_event__header_size(pos);
1999}
2000
2001/*
2002 * Remove an event from the lists for its context.
2003 * Must be called with ctx->mutex and ctx->lock held.
2004 */
2005static void
2006list_del_event(struct perf_event *event, struct perf_event_context *ctx)
2007{
2008	WARN_ON_ONCE(event->ctx != ctx);
2009	lockdep_assert_held(&ctx->lock);
2010
2011	/*
2012	 * We can have double detach due to exit/hot-unplug + close.
2013	 */
2014	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
2015		return;
2016
2017	event->attach_state &= ~PERF_ATTACH_CONTEXT;
2018
2019	ctx->nr_events--;
2020	if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
2021		ctx->nr_user--;
2022	if (event->attr.inherit_stat)
2023		ctx->nr_stat--;
2024
2025	list_del_rcu(&event->event_entry);
2026
2027	if (event->group_leader == event)
2028		del_event_from_groups(event, ctx);
2029
2030	/*
2031	 * If event was in error state, then keep it
2032	 * that way, otherwise bogus counts will be
2033	 * returned on read(). The only way to get out
2034	 * of error state is by explicit re-enabling
2035	 * of the event.
2036	 */
2037	if (event->state > PERF_EVENT_STATE_OFF) {
2038		perf_cgroup_event_disable(event, ctx);
2039		perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2040	}
2041
2042	ctx->generation++;
2043	event->pmu_ctx->nr_events--;
2044}
2045
2046static int
2047perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2048{
2049	if (!has_aux(aux_event))
2050		return 0;
2051
2052	if (!event->pmu->aux_output_match)
2053		return 0;
2054
2055	return event->pmu->aux_output_match(aux_event);
2056}
2057
2058static void put_event(struct perf_event *event);
2059static void event_sched_out(struct perf_event *event,
2060			    struct perf_event_context *ctx);
2061
2062static void perf_put_aux_event(struct perf_event *event)
2063{
2064	struct perf_event_context *ctx = event->ctx;
2065	struct perf_event *iter;
2066
2067	/*
2068	 * If the event uses an aux_event, tear down the link
2069	 */
2070	if (event->aux_event) {
2071		iter = event->aux_event;
2072		event->aux_event = NULL;
2073		put_event(iter);
2074		return;
2075	}
2076
2077	/*
2078	 * If the event is an aux_event, tear down all links to
2079	 * it from other events.
2080	 */
2081	for_each_sibling_event(iter, event->group_leader) {
2082		if (iter->aux_event != event)
2083			continue;
2084
2085		iter->aux_event = NULL;
2086		put_event(event);
2087
2088		/*
2089		 * If it's ACTIVE, schedule it out and put it into ERROR
2090		 * state so that we don't try to schedule it again. Note
2091		 * that perf_event_enable() will clear the ERROR status.
2092		 */
2093		event_sched_out(iter, ctx);
2094		perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2095	}
2096}
2097
2098static bool perf_need_aux_event(struct perf_event *event)
2099{
2100	return !!event->attr.aux_output || !!event->attr.aux_sample_size;
2101}
2102
2103static int perf_get_aux_event(struct perf_event *event,
2104			      struct perf_event *group_leader)
2105{
2106	/*
2107	 * Our group leader must be an aux event if we want to be
2108	 * an aux_output. This way, the aux event will precede its
2109	 * aux_output events in the group, and therefore will always
2110	 * schedule first.
2111	 */
2112	if (!group_leader)
2113		return 0;
2114
2115	/*
2116	 * aux_output and aux_sample_size are mutually exclusive.
2117	 */
2118	if (event->attr.aux_output && event->attr.aux_sample_size)
2119		return 0;
2120
2121	if (event->attr.aux_output &&
2122	    !perf_aux_output_match(event, group_leader))
2123		return 0;
2124
2125	if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2126		return 0;
2127
2128	if (!atomic_long_inc_not_zero(&group_leader->refcount))
2129		return 0;
2130
2131	/*
2132	 * Link aux_outputs to their aux event; this is undone in
2133	 * perf_group_detach() by perf_put_aux_event(). When the
2134	 * group is torn down, the aux_output events lose their
2135	 * link to the aux_event and can't schedule any more.
2136	 */
2137	event->aux_event = group_leader;
2138
2139	return 1;
2140}
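/*
 * Userspace sketch of the aux_output path above (illustrative; assumes an
 * aux-capable PMU such as Intel PT, error handling elided):
 *
 *	struct perf_event_attr aux = { .type = <aux PMU type>, ... };
 *	struct perf_event_attr pebs = { .type = PERF_TYPE_RAW, ... };
 *	int group_fd, fd;
 *
 *	group_fd = perf_event_open(&aux, pid, cpu, -1, 0);
 *	pebs.aux_output = 1;
 *	fd = perf_event_open(&pebs, pid, cpu, group_fd, 0);
 *
 * perf_get_aux_event() then links 'pebs' to its aux group leader so that
 * the leader is always scheduled first.
 */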
2141
2142static inline struct list_head *get_event_list(struct perf_event *event)
2143{
2144	return event->attr.pinned ? &event->pmu_ctx->pinned_active :
2145				    &event->pmu_ctx->flexible_active;
2146}
2147
2148/*
2149 * Events that have PERF_EV_CAP_SIBLING require being part of a group and
2150 * cannot exist on their own; schedule them out and move them into the ERROR
2151 * state. Also see _perf_event_enable(); it will not be able to recover
2152 * from this ERROR state.
2153 */
2154static inline void perf_remove_sibling_event(struct perf_event *event)
2155{
2156	event_sched_out(event, event->ctx);
2157	perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2158}
2159
2160static void perf_group_detach(struct perf_event *event)
2161{
2162	struct perf_event *leader = event->group_leader;
2163	struct perf_event *sibling, *tmp;
2164	struct perf_event_context *ctx = event->ctx;
2165
2166	lockdep_assert_held(&ctx->lock);
2167
2168	/*
2169	 * We can have double detach due to exit/hot-unplug + close.
2170	 */
2171	if (!(event->attach_state & PERF_ATTACH_GROUP))
2172		return;
2173
2174	event->attach_state &= ~PERF_ATTACH_GROUP;
2175
2176	perf_put_aux_event(event);
2177
2178	/*
2179	 * If this is a sibling, remove it from its group.
2180	 */
2181	if (leader != event) {
2182		list_del_init(&event->sibling_list);
2183		event->group_leader->nr_siblings--;
2184		event->group_leader->group_generation++;
2185		goto out;
2186	}
2187
2188	/*
2189	 * If this was a group event with sibling events then
2190	 * upgrade the siblings to singleton events by adding them
2191	 * to whatever list we are on.
2192	 */
2193	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
2194
2195		if (sibling->event_caps & PERF_EV_CAP_SIBLING)
2196			perf_remove_sibling_event(sibling);
2197
2198		sibling->group_leader = sibling;
2199		list_del_init(&sibling->sibling_list);
2200
2201		/* Inherit group flags from the previous leader */
2202		sibling->group_caps = event->group_caps;
2203
2204		if (sibling->attach_state & PERF_ATTACH_CONTEXT) {
2205			add_event_to_groups(sibling, event->ctx);
2206
2207			if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2208				list_add_tail(&sibling->active_list, get_event_list(sibling));
2209		}
2210
2211		WARN_ON_ONCE(sibling->ctx != event->ctx);
2212	}
2213
2214out:
2215	for_each_sibling_event(tmp, leader)
2216		perf_event__header_size(tmp);
2217
2218	perf_event__header_size(leader);
2219}
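/*
 * Illustration of the promotion above: detaching the leader L of a group
 * {L, s1, s2} leaves s1 and s2 as singleton events; each becomes its own
 * group_leader, inherits L's group_caps and is re-inserted into the
 * context's group trees (and active lists, if it was ACTIVE).
 */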
2220
2221static void sync_child_event(struct perf_event *child_event);
2222
2223static void perf_child_detach(struct perf_event *event)
2224{
2225	struct perf_event *parent_event = event->parent;
2226
2227	if (!(event->attach_state & PERF_ATTACH_CHILD))
2228		return;
2229
2230	event->attach_state &= ~PERF_ATTACH_CHILD;
2231
2232	if (WARN_ON_ONCE(!parent_event))
2233		return;
2234
2235	lockdep_assert_held(&parent_event->child_mutex);
2236
2237	sync_child_event(event);
2238	list_del_init(&event->child_list);
2239}
2240
2241static bool is_orphaned_event(struct perf_event *event)
2242{
2243	return event->state == PERF_EVENT_STATE_DEAD;
2244}
2245
2246static inline int
2247event_filter_match(struct perf_event *event)
2248{
2249	return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
2250	       perf_cgroup_match(event);
2251}
2252
2253static void
2254event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
2255{
2256	struct perf_event_pmu_context *epc = event->pmu_ctx;
2257	struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
2258	enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
2259
2260	// XXX cpc serialization, probably per-cpu IRQ disabled
2261
2262	WARN_ON_ONCE(event->ctx != ctx);
2263	lockdep_assert_held(&ctx->lock);
2264
2265	if (event->state != PERF_EVENT_STATE_ACTIVE)
2266		return;
2267
2268	/*
2269	 * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
2270	 * we can schedule events _OUT_ individually through things like
2271	 * __perf_remove_from_context().
2272	 */
2273	list_del_init(&event->active_list);
2274
2275	perf_pmu_disable(event->pmu);
2276
2277	event->pmu->del(event, 0);
2278	event->oncpu = -1;
2279
2280	if (event->pending_disable) {
2281		event->pending_disable = 0;
2282		perf_cgroup_event_disable(event, ctx);
2283		state = PERF_EVENT_STATE_OFF;
2284	}
2285
2286	if (event->pending_sigtrap) {
2287		bool dec = true;
2288
2289		event->pending_sigtrap = 0;
2290		if (state != PERF_EVENT_STATE_OFF &&
2291		    !event->pending_work) {
2292			event->pending_work = 1;
2293			dec = false;
2294			WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
2295			task_work_add(current, &event->pending_task, TWA_RESUME);
2296		}
2297		if (dec)
2298			local_dec(&event->ctx->nr_pending);
2299	}
2300
2301	perf_event_set_state(event, state);
2302
2303	if (!is_software_event(event))
2304		cpc->active_oncpu--;
2305	if (event->attr.freq && event->attr.sample_freq)
2306		ctx->nr_freq--;
2307	if (event->attr.exclusive || !cpc->active_oncpu)
2308		cpc->exclusive = 0;
2309
2310	perf_pmu_enable(event->pmu);
2311}
2312
2313static void
2314group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
2315{
2316	struct perf_event *event;
2317
2318	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2319		return;
2320
2321	perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);
2322
2323	event_sched_out(group_event, ctx);
2324
2325	/*
2326	 * Schedule out siblings (if any):
2327	 */
2328	for_each_sibling_event(event, group_event)
2329		event_sched_out(event, ctx);
2330}
2331
2332#define DETACH_GROUP	0x01UL
2333#define DETACH_CHILD	0x02UL
2334#define DETACH_DEAD	0x04UL
2335
2336/*
2337 * Cross CPU call to remove a performance event
2338 *
2339 * We disable the event on the hardware level first. After that we
2340 * remove it from the context list.
2341 */
2342static void
2343__perf_remove_from_context(struct perf_event *event,
2344			   struct perf_cpu_context *cpuctx,
2345			   struct perf_event_context *ctx,
2346			   void *info)
2347{
2348	struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
2349	unsigned long flags = (unsigned long)info;
2350
2351	if (ctx->is_active & EVENT_TIME) {
2352		update_context_time(ctx);
2353		update_cgrp_time_from_cpuctx(cpuctx, false);
2354	}
2355
2356	/*
2357	 * Ensure event_sched_out() switches to OFF, at the very least
2358	 * this avoids raising perf_pending_task() at this time.
2359	 */
2360	if (flags & DETACH_DEAD)
2361		event->pending_disable = 1;
2362	event_sched_out(event, ctx);
2363	if (flags & DETACH_GROUP)
2364		perf_group_detach(event);
2365	if (flags & DETACH_CHILD)
2366		perf_child_detach(event);
2367	list_del_event(event, ctx);
2368	if (flags & DETACH_DEAD)
2369		event->state = PERF_EVENT_STATE_DEAD;
2370
2371	if (!pmu_ctx->nr_events) {
2372		pmu_ctx->rotate_necessary = 0;
2373
2374		if (ctx->task && ctx->is_active) {
2375			struct perf_cpu_pmu_context *cpc;
2376
2377			cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
2378			WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
2379			cpc->task_epc = NULL;
2380		}
2381	}
2382
2383	if (!ctx->nr_events && ctx->is_active) {
2384		if (ctx == &cpuctx->ctx)
2385			update_cgrp_time_from_cpuctx(cpuctx, true);
2386
2387		ctx->is_active = 0;
2388		if (ctx->task) {
2389			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2390			cpuctx->task_ctx = NULL;
2391		}
2392	}
2393}
2394
2395/*
2396 * Remove the event from a task's (or a CPU's) list of events.
2397 *
2398 * If event->ctx is a cloned context, callers must make sure that
2399 * every task struct that event->ctx->task could possibly point to
2400 * remains valid.  This is OK when called from perf_release since
2401 * that only calls us on the top-level context, which can't be a clone.
2402 * When called from perf_event_exit_task, it's OK because the
2403 * context has been detached from its task.
2404 */
2405static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2406{
2407	struct perf_event_context *ctx = event->ctx;
2408
2409	lockdep_assert_held(&ctx->mutex);
2410
2411	/*
2412	 * Because of perf_event_exit_task(), perf_remove_from_context() ought
2413	 * to work in the face of TASK_TOMBSTONE, unlike every other
2414	 * event_function_call() user.
2415	 */
2416	raw_spin_lock_irq(&ctx->lock);
2417	if (!ctx->is_active) {
2418		__perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
2419					   ctx, (void *)flags);
2420		raw_spin_unlock_irq(&ctx->lock);
2421		return;
2422	}
2423	raw_spin_unlock_irq(&ctx->lock);
2424
2425	event_function_call(event, __perf_remove_from_context, (void *)flags);
2426}
2427
2428/*
2429 * Cross CPU call to disable a performance event
2430 */
2431static void __perf_event_disable(struct perf_event *event,
2432				 struct perf_cpu_context *cpuctx,
2433				 struct perf_event_context *ctx,
2434				 void *info)
2435{
2436	if (event->state < PERF_EVENT_STATE_INACTIVE)
2437		return;
2438
2439	if (ctx->is_active & EVENT_TIME) {
2440		update_context_time(ctx);
2441		update_cgrp_time_from_event(event);
2442	}
2443
2444	perf_pmu_disable(event->pmu_ctx->pmu);
2445
2446	if (event == event->group_leader)
2447		group_sched_out(event, ctx);
2448	else
2449		event_sched_out(event, ctx);
2450
2451	perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2452	perf_cgroup_event_disable(event, ctx);
2453
2454	perf_pmu_enable(event->pmu_ctx->pmu);
2455}
2456
2457/*
2458 * Disable an event.
2459 *
2460 * If event->ctx is a cloned context, callers must make sure that
2461 * every task struct that event->ctx->task could possibly point to
2462 * remains valid.  This condition is satisfied when called through
2463 * perf_event_for_each_child or perf_event_for_each because they
2464 * hold the top-level event's child_mutex, so any descendant that
2465 * goes to exit will block in perf_event_exit_event().
2466 *
2467 * When called from perf_pending_irq it's OK because event->ctx
2468 * is the current context on this CPU and preemption is disabled,
2469 * hence we can't get into perf_event_task_sched_out for this context.
2470 */
2471static void _perf_event_disable(struct perf_event *event)
2472{
2473	struct perf_event_context *ctx = event->ctx;
2474
2475	raw_spin_lock_irq(&ctx->lock);
2476	if (event->state <= PERF_EVENT_STATE_OFF) {
2477		raw_spin_unlock_irq(&ctx->lock);
2478		return;
2479	}
2480	raw_spin_unlock_irq(&ctx->lock);
2481
2482	event_function_call(event, __perf_event_disable, NULL);
2483}
2484
2485void perf_event_disable_local(struct perf_event *event)
2486{
2487	event_function_local(event, __perf_event_disable, NULL);
2488}
2489
2490/*
2491 * Strictly speaking kernel users cannot create groups and therefore this
2492 * interface does not need the perf_event_ctx_lock() magic.
2493 */
2494void perf_event_disable(struct perf_event *event)
2495{
2496	struct perf_event_context *ctx;
2497
2498	ctx = perf_event_ctx_lock(event);
2499	_perf_event_disable(event);
2500	perf_event_ctx_unlock(event, ctx);
2501}
2502EXPORT_SYMBOL_GPL(perf_event_disable);
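/*
 * Minimal in-kernel usage sketch for the exported enable/disable API
 * (illustrative only; error handling elided):
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.size		= sizeof(attr),
 *		.disabled	= 1,
 *	};
 *	struct perf_event *event;
 *
 *	event = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL);
 *	perf_event_enable(event);
 *	...
 *	perf_event_disable(event);
 *	perf_event_release_kernel(event);
 */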
2503
2504void perf_event_disable_inatomic(struct perf_event *event)
2505{
2506	event->pending_disable = 1;
2507	irq_work_queue(&event->pending_irq);
2508}
2509
2510#define MAX_INTERRUPTS (~0ULL)
2511
2512static void perf_log_throttle(struct perf_event *event, int enable);
2513static void perf_log_itrace_start(struct perf_event *event);
2514
2515static int
2516event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
2517{
2518	struct perf_event_pmu_context *epc = event->pmu_ctx;
2519	struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
2520	int ret = 0;
2521
2522	WARN_ON_ONCE(event->ctx != ctx);
2523
2524	lockdep_assert_held(&ctx->lock);
2525
2526	if (event->state <= PERF_EVENT_STATE_OFF)
2527		return 0;
2528
2529	WRITE_ONCE(event->oncpu, smp_processor_id());
2530	/*
2531	 * Order event::oncpu write to happen before the ACTIVE state is
2532	 * visible. This allows perf_event_{stop,read}() to observe the correct
2533	 * ->oncpu if it sees ACTIVE.
2534	 */
2535	smp_wmb();
2536	perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2537
2538	/*
2539	 * Unthrottle events: since we just scheduled, we might have missed
2540	 * several ticks already, and for a heavily scheduling task there is
2541	 * little guarantee it'll get a tick in a timely manner.
2542	 */
2543	if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2544		perf_log_throttle(event, 1);
2545		event->hw.interrupts = 0;
2546	}
2547
2548	perf_pmu_disable(event->pmu);
2549
2550	perf_log_itrace_start(event);
2551
2552	if (event->pmu->add(event, PERF_EF_START)) {
2553		perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2554		event->oncpu = -1;
2555		ret = -EAGAIN;
2556		goto out;
2557	}
2558
2559	if (!is_software_event(event))
2560		cpc->active_oncpu++;
2561	if (event->attr.freq && event->attr.sample_freq)
2562		ctx->nr_freq++;
2563
2564	if (event->attr.exclusive)
2565		cpc->exclusive = 1;
2566
2567out:
2568	perf_pmu_enable(event->pmu);
2569
2570	return ret;
2571}
2572
2573static int
2574group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
2575{
2576	struct perf_event *event, *partial_group = NULL;
2577	struct pmu *pmu = group_event->pmu_ctx->pmu;
2578
2579	if (group_event->state == PERF_EVENT_STATE_OFF)
2580		return 0;
2581
2582	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2583
2584	if (event_sched_in(group_event, ctx))
2585		goto error;
2586
2587	/*
2588	 * Schedule in siblings as one group (if any):
2589	 */
2590	for_each_sibling_event(event, group_event) {
2591		if (event_sched_in(event, ctx)) {
2592			partial_group = event;
2593			goto group_error;
2594		}
2595	}
2596
2597	if (!pmu->commit_txn(pmu))
2598		return 0;
2599
2600group_error:
2601	/*
2602	 * Groups can be scheduled in as one unit only, so undo any
2603	 * partial group before returning:
2604	 * The events up to the failed event are scheduled out normally.
2605	 */
2606	for_each_sibling_event(event, group_event) {
2607		if (event == partial_group)
2608			break;
2609
2610		event_sched_out(event, ctx);
2611	}
2612	event_sched_out(group_event, ctx);
2613
2614error:
2615	pmu->cancel_txn(pmu);
2616	return -EAGAIN;
2617}
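/*
 * The PMU transaction protocol used above, seen from the driver side
 * (a sketch; see the struct pmu callbacks):
 *
 *	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
 *	pmu->add(leader, PERF_EF_START);	// via event_sched_in()
 *	pmu->add(sibling, PERF_EF_START);	// once per sibling
 *	if (pmu->commit_txn(pmu))		// group doesn't fit
 *		// ... events are scheduled back out, then:
 *		pmu->cancel_txn(pmu);
 *
 * commit_txn() returning 0 means the whole group is now programmed on the
 * hardware as one unit.
 */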
2618
2619/*
2620 * Work out whether we can put this event group on the CPU now.
2621 */
2622static int group_can_go_on(struct perf_event *event, int can_add_hw)
2623{
2624	struct perf_event_pmu_context *epc = event->pmu_ctx;
2625	struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
2626
2627	/*
2628	 * Groups consisting entirely of software events can always go on.
2629	 */
2630	if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2631		return 1;
2632	/*
2633	 * If an exclusive group is already on, no other hardware
2634	 * events can go on.
2635	 */
2636	if (cpc->exclusive)
2637		return 0;
2638	/*
2639	 * If this group is exclusive and there are already
2640	 * events on the CPU, it can't go on.
2641	 */
2642	if (event->attr.exclusive && !list_empty(get_event_list(event)))
2643		return 0;
2644	/*
2645	 * Otherwise, try to add it if all previous groups were able
2646	 * to go on.
2647	 */
2648	return can_add_hw;
2649}
2650
2651static void add_event_to_ctx(struct perf_event *event,
2652			       struct perf_event_context *ctx)
2653{
2654	list_add_event(event, ctx);
2655	perf_group_attach(event);
2656}
2657
2658static void task_ctx_sched_out(struct perf_event_context *ctx,
2659				enum event_type_t event_type)
2660{
2661	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
2662
2663	if (!cpuctx->task_ctx)
2664		return;
2665
2666	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2667		return;
2668
2669	ctx_sched_out(ctx, event_type);
2670}
2671
2672static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2673				struct perf_event_context *ctx)
2674{
2675	ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
2676	if (ctx)
2677		 ctx_sched_in(ctx, EVENT_PINNED);
2678	ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
2679	if (ctx)
2680		 ctx_sched_in(ctx, EVENT_FLEXIBLE);
2681}
2682
2683/*
2684 * We want to maintain the following priority of scheduling:
2685 *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
2686 *  - task pinned (EVENT_PINNED)
2687 *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2688 *  - task flexible (EVENT_FLEXIBLE).
2689 *
2690 * In order to avoid unscheduling and scheduling back in everything every
2691 * time an event is added, only do it for the groups of equal priority and
2692 * below.
2693 *
2694 * This can be called after a batch operation on task events, in which case
2695 * event_type is a bit mask of the types of events involved. For CPU events,
2696 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2697 */
2698/*
2699 * XXX: ctx_resched() reschedules the entire perf_event_context when adding a
2700 * new event to the context or enabling an existing event in the context. We
2701 * can probably optimize it by rescheduling only the affected pmu_ctx.
2702 */
2703static void ctx_resched(struct perf_cpu_context *cpuctx,
2704			struct perf_event_context *task_ctx,
2705			enum event_type_t event_type)
2706{
2707	bool cpu_event = !!(event_type & EVENT_CPU);
2708
2709	/*
2710	 * If pinned groups are involved, flexible groups also need to be
2711	 * scheduled out.
2712	 */
2713	if (event_type & EVENT_PINNED)
2714		event_type |= EVENT_FLEXIBLE;
2715
2716	event_type &= EVENT_ALL;
2717
2718	perf_ctx_disable(&cpuctx->ctx, false);
2719	if (task_ctx) {
2720		perf_ctx_disable(task_ctx, false);
2721		task_ctx_sched_out(task_ctx, event_type);
2722	}
2723
2724	/*
2725	 * Decide which cpu ctx groups to schedule out based on the types
2726	 * of events that caused rescheduling:
2727	 *  - EVENT_CPU: schedule out corresponding groups;
2728	 *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2729	 *  - otherwise, do nothing more.
2730	 */
2731	if (cpu_event)
2732		ctx_sched_out(&cpuctx->ctx, event_type);
2733	else if (event_type & EVENT_PINNED)
2734		ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
2735
2736	perf_event_sched_in(cpuctx, task_ctx);
2737
2738	perf_ctx_enable(&cpuctx->ctx, false);
2739	if (task_ctx)
2740		perf_ctx_enable(task_ctx, false);
2741}
2742
2743void perf_pmu_resched(struct pmu *pmu)
2744{
2745	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
2746	struct perf_event_context *task_ctx = cpuctx->task_ctx;
2747
2748	perf_ctx_lock(cpuctx, task_ctx);
2749	ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2750	perf_ctx_unlock(cpuctx, task_ctx);
2751}
2752
2753/*
2754 * Cross CPU call to install and enable a performance event
2755 *
2756 * Very similar to remote_function() + event_function() but cannot assume that
2757 * things like ctx->is_active and cpuctx->task_ctx are set.
2758 */
2759static int  __perf_install_in_context(void *info)
2760{
2761	struct perf_event *event = info;
2762	struct perf_event_context *ctx = event->ctx;
2763	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
2764	struct perf_event_context *task_ctx = cpuctx->task_ctx;
2765	bool reprogram = true;
2766	int ret = 0;
2767
2768	raw_spin_lock(&cpuctx->ctx.lock);
2769	if (ctx->task) {
2770		raw_spin_lock(&ctx->lock);
2771		task_ctx = ctx;
2772
2773		reprogram = (ctx->task == current);
2774
2775		/*
2776		 * If the task is running, it must be running on this CPU,
2777		 * otherwise we cannot reprogram things.
2778		 *
2779		 * If it's not running, we don't care; ctx->lock will
2780		 * serialize against it becoming runnable.
2781		 */
2782		if (task_curr(ctx->task) && !reprogram) {
2783			ret = -ESRCH;
2784			goto unlock;
2785		}
2786
2787		WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2788	} else if (task_ctx) {
2789		raw_spin_lock(&task_ctx->lock);
2790	}
2791
2792#ifdef CONFIG_CGROUP_PERF
2793	if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
2794		/*
2795		 * If the current cgroup doesn't match the event's
2796		 * cgroup, we should not try to schedule it.
2797		 */
2798		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2799		reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2800					event->cgrp->css.cgroup);
2801	}
2802#endif
2803
2804	if (reprogram) {
2805		ctx_sched_out(ctx, EVENT_TIME);
2806		add_event_to_ctx(event, ctx);
2807		ctx_resched(cpuctx, task_ctx, get_event_type(event));
2808	} else {
2809		add_event_to_ctx(event, ctx);
2810	}
2811
2812unlock:
2813	perf_ctx_unlock(cpuctx, task_ctx);
2814
2815	return ret;
2816}
2817
2818static bool exclusive_event_installable(struct perf_event *event,
2819					struct perf_event_context *ctx);
2820
2821/*
2822 * Attach a performance event to a context.
2823 *
2824 * Very similar to event_function_call, see comment there.
2825 */
2826static void
2827perf_install_in_context(struct perf_event_context *ctx,
2828			struct perf_event *event,
2829			int cpu)
2830{
2831	struct task_struct *task = READ_ONCE(ctx->task);
2832
2833	lockdep_assert_held(&ctx->mutex);
2834
2835	WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
2836
2837	if (event->cpu != -1)
2838		WARN_ON_ONCE(event->cpu != cpu);
2839
2840	/*
2841	 * Ensures that if we can observe event->ctx, both the event and ctx
2842	 * will be 'complete'. See perf_iterate_sb_cpu().
2843	 */
2844	smp_store_release(&event->ctx, ctx);
2845
2846	/*
2847	 * perf_event_attr::disabled events will not run and can be initialized
2848	 * without IPI. Except when this is the first event for the context, in
2849	 * that case we need the magic of the IPI to set ctx->is_active.
2850	 *
2851	 * The IOC_ENABLE that is sure to follow the creation of a disabled
2852	 * event will issue the IPI and reprogram the hardware.
2853	 */
2854	if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
2855	    ctx->nr_events && !is_cgroup_event(event)) {
2856		raw_spin_lock_irq(&ctx->lock);
2857		if (ctx->task == TASK_TOMBSTONE) {
2858			raw_spin_unlock_irq(&ctx->lock);
2859			return;
2860		}
2861		add_event_to_ctx(event, ctx);
2862		raw_spin_unlock_irq(&ctx->lock);
2863		return;
2864	}
2865
2866	if (!task) {
2867		cpu_function_call(cpu, __perf_install_in_context, event);
2868		return;
2869	}
2870
2871	/*
2872	 * Should not happen; we validate the ctx is still alive before calling.
2873	 */
2874	if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2875		return;
2876
2877	/*
2878	 * Installing events is tricky because we cannot rely on ctx->is_active
2879	 * to be set in case this is the nr_events 0 -> 1 transition.
2880	 *
2881	 * Instead we use task_curr(), which tells us if the task is running.
2882	 * However, since we use task_curr() outside of rq::lock, we can race
2883	 * against the actual state. This means the result can be wrong.
2884	 *
2885	 * If we get a false positive, we retry, this is harmless.
2886	 *
2887	 * If we get a false negative, things are complicated. If we are after
2888	 * perf_event_context_sched_in(), ctx::lock will serialize us, and the
2889	 * value must be correct. If we're before, it doesn't matter since
2890	 * perf_event_context_sched_in() will program the counter.
2891	 *
2892	 * However, this hinges on the remote context switch having observed
2893	 * our task->perf_event_ctxp[] store, such that it will in fact take
2894	 * ctx::lock in perf_event_context_sched_in().
2895	 *
2896	 * We do this by task_function_call(); if the IPI fails to hit the task,
2897	 * we know any future context switch of the task must see the
2898	 * perf_event_ctxp[] store.
2899	 */
2900
2901	/*
2902	 * This smp_mb() orders the task->perf_event_ctxp[] store with the
2903	 * task_cpu() load, such that if the IPI then does not find the task
2904	 * running, a future context switch of that task must observe the
2905	 * store.
2906	 */
2907	smp_mb();
2908again:
2909	if (!task_function_call(task, __perf_install_in_context, event))
2910		return;
2911
2912	raw_spin_lock_irq(&ctx->lock);
2913	task = ctx->task;
2914	if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2915		/*
2916		 * Cannot happen because we already checked above (which also
2917		 * cannot happen), and we hold ctx->mutex, which serializes us
2918		 * against perf_event_exit_task_context().
2919		 */
2920		raw_spin_unlock_irq(&ctx->lock);
2921		return;
2922	}
2923	/*
2924	 * If the task is not running, ctx->lock will avoid it becoming so,
2925	 * thus we can safely install the event.
2926	 */
2927	if (task_curr(task)) {
2928		raw_spin_unlock_irq(&ctx->lock);
2929		goto again;
2930	}
2931	add_event_to_ctx(event, ctx);
2932	raw_spin_unlock_irq(&ctx->lock);
2933}
2934
2935/*
2936 * Cross CPU call to enable a performance event
2937 */
2938static void __perf_event_enable(struct perf_event *event,
2939				struct perf_cpu_context *cpuctx,
2940				struct perf_event_context *ctx,
2941				void *info)
2942{
2943	struct perf_event *leader = event->group_leader;
2944	struct perf_event_context *task_ctx;
2945
2946	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2947	    event->state <= PERF_EVENT_STATE_ERROR)
2948		return;
2949
2950	if (ctx->is_active)
2951		ctx_sched_out(ctx, EVENT_TIME);
2952
2953	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2954	perf_cgroup_event_enable(event, ctx);
2955
2956	if (!ctx->is_active)
2957		return;
2958
2959	if (!event_filter_match(event)) {
2960		ctx_sched_in(ctx, EVENT_TIME);
2961		return;
2962	}
2963
2964	/*
2965	 * If the event is in a group and isn't the group leader,
2966	 * then don't put it on unless the group is on.
2967	 */
2968	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2969		ctx_sched_in(ctx, EVENT_TIME);
2970		return;
2971	}
2972
2973	task_ctx = cpuctx->task_ctx;
2974	if (ctx->task)
2975		WARN_ON_ONCE(task_ctx != ctx);
2976
2977	ctx_resched(cpuctx, task_ctx, get_event_type(event));
2978}
2979
2980/*
2981 * Enable an event.
2982 *
2983 * If event->ctx is a cloned context, callers must make sure that
2984 * every task struct that event->ctx->task could possibly point to
2985 * remains valid.  This condition is satisfied when called through
2986 * perf_event_for_each_child or perf_event_for_each as described
2987 * for perf_event_disable.
2988 */
2989static void _perf_event_enable(struct perf_event *event)
2990{
2991	struct perf_event_context *ctx = event->ctx;
2992
2993	raw_spin_lock_irq(&ctx->lock);
2994	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2995	    event->state <  PERF_EVENT_STATE_ERROR) {
2996out:
2997		raw_spin_unlock_irq(&ctx->lock);
2998		return;
2999	}
3000
3001	/*
3002	 * If the event is in error state, clear that first.
3003	 *
3004	 * That way, if we see the event in error state below, we know that it
3005	 * has gone back into error state, as distinct from the task having
3006	 * been scheduled away before the cross-call arrived.
3007	 */
3008	if (event->state == PERF_EVENT_STATE_ERROR) {
3009		/*
3010		 * Detached SIBLING events cannot leave ERROR state.
3011		 */
3012		if (event->event_caps & PERF_EV_CAP_SIBLING &&
3013		    event->group_leader == event)
3014			goto out;
3015
3016		event->state = PERF_EVENT_STATE_OFF;
3017	}
3018	raw_spin_unlock_irq(&ctx->lock);
3019
3020	event_function_call(event, __perf_event_enable, NULL);
3021}
3022
3023/*
3024 * See perf_event_disable();
3025 */
3026void perf_event_enable(struct perf_event *event)
3027{
3028	struct perf_event_context *ctx;
3029
3030	ctx = perf_event_ctx_lock(event);
3031	_perf_event_enable(event);
3032	perf_event_ctx_unlock(event, ctx);
3033}
3034EXPORT_SYMBOL_GPL(perf_event_enable);
3035
3036struct stop_event_data {
3037	struct perf_event	*event;
3038	unsigned int		restart;
3039};
3040
3041static int __perf_event_stop(void *info)
3042{
3043	struct stop_event_data *sd = info;
3044	struct perf_event *event = sd->event;
3045
3046	/* if it's already INACTIVE, do nothing */
3047	if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3048		return 0;
3049
3050	/* matches smp_wmb() in event_sched_in() */
3051	smp_rmb();
3052
3053	/*
3054	 * There is a window with interrupts enabled before we get here,
3055	 * so we need to check again lest we try to stop another CPU's event.
3056	 */
3057	if (READ_ONCE(event->oncpu) != smp_processor_id())
3058		return -EAGAIN;
3059
3060	event->pmu->stop(event, PERF_EF_UPDATE);
3061
3062	/*
3063	 * May race with the actual stop (through perf_pmu_output_stop()),
3064	 * but it is only used for events with an AUX ring buffer, and such
3065	 * events will refuse to restart because of rb::aux_mmap_count==0,
3066	 * see comments in perf_aux_output_begin().
3067	 *
3068	 * Since this is happening on an event-local CPU, no trace is lost
3069	 * while restarting.
3070	 */
3071	if (sd->restart)
3072		event->pmu->start(event, 0);
3073
3074	return 0;
3075}
3076
3077static int perf_event_stop(struct perf_event *event, int restart)
3078{
3079	struct stop_event_data sd = {
3080		.event		= event,
3081		.restart	= restart,
3082	};
3083	int ret = 0;
3084
3085	do {
3086		if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3087			return 0;
3088
3089		/* matches smp_wmb() in event_sched_in() */
3090		smp_rmb();
3091
3092		/*
3093		 * We only want to restart ACTIVE events, so if the event goes
3094		 * inactive here (event->oncpu==-1), there's nothing more to do;
3095		 * fall through with ret==-ENXIO.
3096		 */
3097		ret = cpu_function_call(READ_ONCE(event->oncpu),
3098					__perf_event_stop, &sd);
3099	} while (ret == -EAGAIN);
3100
3101	return ret;
3102}
3103
3104/*
3105 * In order to contain the amount of raciness and trickiness in the address
3106 * filter configuration management, it is a two-part process:
3107 *
3108 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
3109 *      we update the addresses of corresponding vmas in
3110 *	event::addr_filter_ranges array and bump the event::addr_filters_gen;
3111 * (p2) when an event is scheduled in (pmu::add), it calls
3112 *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
3113 *      if the generation has changed since the previous call.
3114 *
3115 * If (p1) happens while the event is active, we restart it to force (p2).
3116 *
3117 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
3118 *     pre-existing mappings, called once when new filters arrive via SET_FILTER
3119 *     ioctl;
3120 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
3121 *     registered mapping, called for every new mmap(), with mm::mmap_lock down
3122 *     for reading;
3123 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
3124 *     of exec.
3125 */
3126void perf_event_addr_filters_sync(struct perf_event *event)
3127{
3128	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
3129
3130	if (!has_addr_filter(event))
3131		return;
3132
3133	raw_spin_lock(&ifh->lock);
3134	if (event->addr_filters_gen != event->hw.addr_filters_gen) {
3135		event->pmu->addr_filters_sync(event);
3136		event->hw.addr_filters_gen = event->addr_filters_gen;
3137	}
3138	raw_spin_unlock(&ifh->lock);
3139}
3140EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
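/*
 * Userspace installs these filters with PERF_EVENT_IOC_SET_FILTER; an
 * illustrative invocation (syntax as accepted by the filter parser):
 *
 *	ioctl(fd, PERF_EVENT_IOC_SET_FILTER,
 *	      "filter 0x1000/0x2000@/usr/bin/foo");
 *
 * i.e. trace 0x2000 bytes starting at object offset 0x1000 once the file
 * is mapped; steps (p1)/(p2) above then keep the hardware ranges in sync
 * as the mappings change.
 */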
3141
3142static int _perf_event_refresh(struct perf_event *event, int refresh)
3143{
3144	/*
3145	 * not supported on inherited events
3146	 */
3147	if (event->attr.inherit || !is_sampling_event(event))
3148		return -EINVAL;
3149
3150	atomic_add(refresh, &event->event_limit);
3151	_perf_event_enable(event);
3152
3153	return 0;
3154}
3155
3156/*
3157 * See perf_event_disable()
3158 */
3159int perf_event_refresh(struct perf_event *event, int refresh)
3160{
3161	struct perf_event_context *ctx;
3162	int ret;
3163
3164	ctx = perf_event_ctx_lock(event);
3165	ret = _perf_event_refresh(event, refresh);
3166	perf_event_ctx_unlock(event, ctx);
3167
3168	return ret;
3169}
3170EXPORT_SYMBOL_GPL(perf_event_refresh);
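/*
 * Sketch of the userspace pattern behind refresh (signal-driven
 * profiling; setup details elided):
 *
 *	fcntl(fd, F_SETOWN, getpid());
 *	fcntl(fd, F_SETFL, O_ASYNC);
 *	ioctl(fd, PERF_EVENT_IOC_REFRESH, 1);	// arm for one overflow
 *	// the signal handler re-arms with another IOC_REFRESH
 *
 * Each overflow decrements event_limit; when it reaches zero the event is
 * disabled until refreshed again.
 */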
3171
3172static int perf_event_modify_breakpoint(struct perf_event *bp,
3173					 struct perf_event_attr *attr)
3174{
3175	int err;
3176
3177	_perf_event_disable(bp);
3178
3179	err = modify_user_hw_breakpoint_check(bp, attr, true);
3180
3181	if (!bp->attr.disabled)
3182		_perf_event_enable(bp);
3183
3184	return err;
3185}
3186
3187/*
3188 * Copy event-type-independent attributes that may be modified.
3189 */
3190static void perf_event_modify_copy_attr(struct perf_event_attr *to,
3191					const struct perf_event_attr *from)
3192{
3193	to->sig_data = from->sig_data;
3194}
3195
3196static int perf_event_modify_attr(struct perf_event *event,
3197				  struct perf_event_attr *attr)
3198{
3199	int (*func)(struct perf_event *, struct perf_event_attr *);
3200	struct perf_event *child;
3201	int err;
3202
3203	if (event->attr.type != attr->type)
3204		return -EINVAL;
3205
3206	switch (event->attr.type) {
3207	case PERF_TYPE_BREAKPOINT:
3208		func = perf_event_modify_breakpoint;
3209		break;
3210	default:
3211		/* Place holder for future additions. */
3212		return -EOPNOTSUPP;
3213	}
3214
3215	WARN_ON_ONCE(event->ctx->parent_ctx);
3216
3217	mutex_lock(&event->child_mutex);
3218	/*
3219	 * Event-type-independent attributes must be copied before event-type
3220	 * modification, which will validate that final attributes match the
3221	 * source attributes after all relevant attributes have been copied.
3222	 */
3223	perf_event_modify_copy_attr(&event->attr, attr);
3224	err = func(event, attr);
3225	if (err)
3226		goto out;
3227	list_for_each_entry(child, &event->child_list, child_list) {
3228		perf_event_modify_copy_attr(&child->attr, attr);
3229		err = func(child, attr);
3230		if (err)
3231			goto out;
3232	}
3233out:
3234	mutex_unlock(&event->child_mutex);
3235	return err;
3236}
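/*
 * Example of the only current user of this path (a sketch): moving an
 * existing hardware breakpoint without re-opening it, via
 * PERF_EVENT_IOC_MODIFY_ATTRIBUTES from userspace:
 *
 *	struct perf_event_attr new_attr = old_attr;
 *
 *	new_attr.bp_addr = new_address;
 *	ioctl(fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &new_attr);
 *
 * The type-independent copy above (currently just sig_data) is applied to
 * the event and all of its inherited children alongside the breakpoint
 * specific update.
 */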
3237
3238static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
3239				enum event_type_t event_type)
3240{
3241	struct perf_event_context *ctx = pmu_ctx->ctx;
3242	struct perf_event *event, *tmp;
3243	struct pmu *pmu = pmu_ctx->pmu;
3244
3245	if (ctx->task && !ctx->is_active) {
3246		struct perf_cpu_pmu_context *cpc;
3247
3248		cpc = this_cpu_ptr(pmu->cpu_pmu_context);
3249		WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
3250		cpc->task_epc = NULL;
3251	}
3252
3253	if (!event_type)
3254		return;
3255
3256	perf_pmu_disable(pmu);
3257	if (event_type & EVENT_PINNED) {
3258		list_for_each_entry_safe(event, tmp,
3259					 &pmu_ctx->pinned_active,
3260					 active_list)
3261			group_sched_out(event, ctx);
3262	}
3263
3264	if (event_type & EVENT_FLEXIBLE) {
3265		list_for_each_entry_safe(event, tmp,
3266					 &pmu_ctx->flexible_active,
3267					 active_list)
3268			group_sched_out(event, ctx);
3269		/*
3270		 * Since we cleared EVENT_FLEXIBLE, also clear
3271		 * rotate_necessary; it will be reset by
3272		 * ctx_flexible_sched_in() when needed.
3273		 */
3274		pmu_ctx->rotate_necessary = 0;
3275	}
3276	perf_pmu_enable(pmu);
3277}
3278
3279static void
3280ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
3281{
3282	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
3283	struct perf_event_pmu_context *pmu_ctx;
3284	int is_active = ctx->is_active;
3285	bool cgroup = event_type & EVENT_CGROUP;
3286
3287	event_type &= ~EVENT_CGROUP;
3288
3289	lockdep_assert_held(&ctx->lock);
3290
3291	if (likely(!ctx->nr_events)) {
3292		/*
3293		 * See __perf_remove_from_context().
3294		 */
3295		WARN_ON_ONCE(ctx->is_active);
3296		if (ctx->task)
3297			WARN_ON_ONCE(cpuctx->task_ctx);
3298		return;
3299	}
3300
3301	/*
3302	 * Always update time if it was set, not only when it changes.
3303	 * Otherwise we can 'forget' to update time for any but the last
3304	 * context we sched out. For example:
3305	 *
3306	 *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
3307	 *   ctx_sched_out(.event_type = EVENT_PINNED)
3308	 *
3309	 * would only update time for the pinned events.
3310	 */
3311	if (is_active & EVENT_TIME) {
3312		/* update (and stop) ctx time */
3313		update_context_time(ctx);
3314		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
3315		/*
3316		 * CPU-release for the below ->is_active store,
3317		 * see __load_acquire() in perf_event_time_now()
3318		 */
3319		barrier();
3320	}
3321
3322	ctx->is_active &= ~event_type;
3323	if (!(ctx->is_active & EVENT_ALL))
3324		ctx->is_active = 0;
3325
3326	if (ctx->task) {
3327		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3328		if (!ctx->is_active)
3329			cpuctx->task_ctx = NULL;
3330	}
3331
3332	is_active ^= ctx->is_active; /* changed bits */
3333
3334	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
3335		if (cgroup && !pmu_ctx->nr_cgroups)
3336			continue;
3337		__pmu_ctx_sched_out(pmu_ctx, is_active);
3338	}
3339}
3340
3341/*
3342 * Test whether two contexts are equivalent, i.e. whether they have both been
3343 * cloned from the same version of the same context.
3344 *
3345 * Equivalence is measured using a generation number in the context that is
3346 * incremented on each modification to it; see unclone_ctx(), list_add_event()
3347 * and list_del_event().
3348 */
3349static int context_equiv(struct perf_event_context *ctx1,
3350			 struct perf_event_context *ctx2)
3351{
3352	lockdep_assert_held(&ctx1->lock);
3353	lockdep_assert_held(&ctx2->lock);
3354
3355	/* Pinning disables the swap optimization */
3356	if (ctx1->pin_count || ctx2->pin_count)
3357		return 0;
3358
3359	/* If ctx1 is the parent of ctx2 */
3360	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
3361		return 1;
3362
3363	/* If ctx2 is the parent of ctx1 */
3364	if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3365		return 1;
3366
3367	/*
3368	 * If ctx1 and ctx2 have the same parent, we flatten the parent
3369	 * hierarchy, see perf_event_init_context().
3370	 */
3371	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3372			ctx1->parent_gen == ctx2->parent_gen)
3373		return 1;
3374
3375	/* Unmatched */
3376	return 0;
3377}
3378
3379static void __perf_event_sync_stat(struct perf_event *event,
3380				     struct perf_event *next_event)
3381{
3382	u64 value;
3383
3384	if (!event->attr.inherit_stat)
3385		return;
3386
3387	/*
3388	 * Update the event value; we cannot use perf_event_read()
3389	 * because we're in the middle of a context switch and have IRQs
3390	 * disabled, which upsets smp_call_function_single(). However,
3391	 * we know the event must be on the current CPU, so we don't
3392	 * need to use it.
3393	 */
3394	if (event->state == PERF_EVENT_STATE_ACTIVE)
3395		event->pmu->read(event);
3396
3397	perf_event_update_time(event);
3398
3399	/*
3400	 * In order to keep per-task stats reliable we need to flip the event
3401	 * values when we flip the contexts.
3402	 */
3403	value = local64_read(&next_event->count);
3404	value = local64_xchg(&event->count, value);
3405	local64_set(&next_event->count, value);
3406
3407	swap(event->total_time_enabled, next_event->total_time_enabled);
3408	swap(event->total_time_running, next_event->total_time_running);
3409
3410	/*
3411	 * Since we swizzled the values, update the user visible data too.
3412	 */
3413	perf_event_update_userpage(event);
3414	perf_event_update_userpage(next_event);
3415}
3416
3417static void perf_event_sync_stat(struct perf_event_context *ctx,
3418				   struct perf_event_context *next_ctx)
3419{
3420	struct perf_event *event, *next_event;
3421
3422	if (!ctx->nr_stat)
3423		return;
3424
3425	update_context_time(ctx);
3426
3427	event = list_first_entry(&ctx->event_list,
3428				   struct perf_event, event_entry);
3429
3430	next_event = list_first_entry(&next_ctx->event_list,
3431					struct perf_event, event_entry);
3432
3433	while (&event->event_entry != &ctx->event_list &&
3434	       &next_event->event_entry != &next_ctx->event_list) {
3435
3436		__perf_event_sync_stat(event, next_event);
3437
3438		event = list_next_entry(event, event_entry);
3439		next_event = list_next_entry(next_event, event_entry);
3440	}
3441}
3442
3443#define double_list_for_each_entry(pos1, pos2, head1, head2, member)	\
3444	for (pos1 = list_first_entry(head1, typeof(*pos1), member),	\
3445	     pos2 = list_first_entry(head2, typeof(*pos2), member);	\
3446	     !list_entry_is_head(pos1, head1, member) &&		\
3447	     !list_entry_is_head(pos2, head2, member);			\
3448	     pos1 = list_next_entry(pos1, member),			\
3449	     pos2 = list_next_entry(pos2, member))
3450
3451static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
3452					  struct perf_event_context *next_ctx)
3453{
3454	struct perf_event_pmu_context *prev_epc, *next_epc;
3455
3456	if (!prev_ctx->nr_task_data)
3457		return;
3458
3459	double_list_for_each_entry(prev_epc, next_epc,
3460				   &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list,
3461				   pmu_ctx_entry) {
3462
3463		if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu))
3464			continue;
3465
3466		/*
3467		 * PMU specific parts of task perf context can require
3468		 * additional synchronization. As an example of such
3469		 * synchronization, see the implementation details of Intel
3470		 * LBR call stack data profiling.
3471		 */
3472		if (prev_epc->pmu->swap_task_ctx)
3473			prev_epc->pmu->swap_task_ctx(prev_epc, next_epc);
3474		else
3475			swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
3476	}
3477}
3478
3479static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in)
3480{
3481	struct perf_event_pmu_context *pmu_ctx;
3482	struct perf_cpu_pmu_context *cpc;
3483
3484	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
3485		cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
3486
3487		if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
3488			pmu_ctx->pmu->sched_task(pmu_ctx, sched_in);
3489	}
3490}
3491
3492static void
3493perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
3494{
3495	struct perf_event_context *ctx = task->perf_event_ctxp;
3496	struct perf_event_context *next_ctx;
3497	struct perf_event_context *parent, *next_parent;
3498	int do_switch = 1;
3499
3500	if (likely(!ctx))
3501		return;
3502
3503	rcu_read_lock();
3504	next_ctx = rcu_dereference(next->perf_event_ctxp);
3505	if (!next_ctx)
3506		goto unlock;
3507
3508	parent = rcu_dereference(ctx->parent_ctx);
3509	next_parent = rcu_dereference(next_ctx->parent_ctx);
3510
3511	/* If neither context has a parent context, they cannot be clones. */
3512	if (!parent && !next_parent)
3513		goto unlock;
3514
3515	if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3516		/*
3517		 * Looks like the two contexts are clones, so we might be
3518		 * able to optimize the context switch.  We lock both
3519		 * contexts and check that they are clones under the
3520		 * lock (including re-checking that neither has been
3521		 * uncloned in the meantime).  It doesn't matter which
3522		 * order we take the locks because no other cpu could
3523		 * be trying to lock both of these tasks.
3524		 */
3525		raw_spin_lock(&ctx->lock);
3526		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3527		if (context_equiv(ctx, next_ctx)) {
3528
3529			perf_ctx_disable(ctx, false);
3530
3531			/* PMIs are disabled; ctx->nr_pending is stable. */
3532			if (local_read(&ctx->nr_pending) ||
3533			    local_read(&next_ctx->nr_pending)) {
3534				/*
3535				 * Must not swap out ctx when there are pending
3536				 * events that rely on the ctx->task relation.
3537				 */
3538				raw_spin_unlock(&next_ctx->lock);
3539				rcu_read_unlock();
3540				goto inside_switch;
3541			}
3542
3543			WRITE_ONCE(ctx->task, next);
3544			WRITE_ONCE(next_ctx->task, task);
3545
3546			perf_ctx_sched_task_cb(ctx, false);
3547			perf_event_swap_task_ctx_data(ctx, next_ctx);
3548
3549			perf_ctx_enable(ctx, false);
3550
3551			/*
3552			 * RCU_INIT_POINTER here is safe because we've not
3553			 * modified the ctx and the above modifications of
3554			 * ctx->task and ctx->task_ctx_data are immaterial
3555			 * since those values are always verified under
3556			 * ctx->lock which we're now holding.
3557			 */
3558			RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
3559			RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
3560
3561			do_switch = 0;
3562
3563			perf_event_sync_stat(ctx, next_ctx);
3564		}
3565		raw_spin_unlock(&next_ctx->lock);
3566		raw_spin_unlock(&ctx->lock);
3567	}
3568unlock:
3569	rcu_read_unlock();
3570
3571	if (do_switch) {
3572		raw_spin_lock(&ctx->lock);
3573		perf_ctx_disable(ctx, false);
3574
3575inside_switch:
3576		perf_ctx_sched_task_cb(ctx, false);
3577		task_ctx_sched_out(ctx, EVENT_ALL);
3578
3579		perf_ctx_enable(ctx, false);
3580		raw_spin_unlock(&ctx->lock);
3581	}
3582}
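/*
 * Illustration of the optimization above: when parent P cloned its events
 * into child C (inheritance), a P -> C switch does not unschedule and
 * reschedule anything; the two equivalent contexts merely swap their
 * ctx->task pointers and perf_event_ctxp slots, so the PMU keeps counting
 * across the context switch.
 */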
3583
3584static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3585static DEFINE_PER_CPU(int, perf_sched_cb_usages);
3586
3587void perf_sched_cb_dec(struct pmu *pmu)
3588{
3589	struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
3590
3591	this_cpu_dec(perf_sched_cb_usages);
3592	barrier();
3593
3594	if (!--cpc->sched_cb_usage)
3595		list_del(&cpc->sched_cb_entry);
3596}
3597
3598
3599void perf_sched_cb_inc(struct pmu *pmu)
3600{
3601	struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
3602
3603	if (!cpc->sched_cb_usage++)
3604		list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3605
3606	barrier();
3607	this_cpu_inc(perf_sched_cb_usages);
3608}
3609
3610/*
3611 * This function provides the context switch callback to the lower code
3612 * layer. It is invoked ONLY when the context switch callback is enabled.
3613 *
3614 * This callback is relevant even to per-cpu events; for example, multi-event
3615 * PEBS requires this to provide PID/TID information. This requires we flush
3616 * all queued PEBS records before we context switch to a new task.
3617 */
3618static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in)
3619{
3620	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
3621	struct pmu *pmu;
3622
3623	pmu = cpc->epc.pmu;
3624
3625	/* software PMUs will not have sched_task */
3626	if (WARN_ON_ONCE(!pmu->sched_task))
3627		return;
3628
3629	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3630	perf_pmu_disable(pmu);
3631
3632	pmu->sched_task(cpc->task_epc, sched_in);
3633
3634	perf_pmu_enable(pmu);
3635	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3636}
3637
3638static void perf_pmu_sched_task(struct task_struct *prev,
3639				struct task_struct *next,
3640				bool sched_in)
3641{
3642	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
3643	struct perf_cpu_pmu_context *cpc;
3644
3645	/* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
3646	if (prev == next || cpuctx->task_ctx)
3647		return;
3648
3649	list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
3650		__perf_pmu_sched_task(cpc, sched_in);
3651}
3652
3653static void perf_event_switch(struct task_struct *task,
3654			      struct task_struct *next_prev, bool sched_in);
3655
3656/*
3657 * Called from scheduler to remove the events of the current task,
3658 * with interrupts disabled.
3659 *
3660 * We stop each event and update the event value in event->count.
3661 *
3662 * This does not protect us against NMIs, but disable()
3663 * sets the disabled bit in the control field of the event _before_
3664 * accessing the event control register. If an NMI hits, then it will
3665 * not restart the event.
3666 */
3667void __perf_event_task_sched_out(struct task_struct *task,
3668				 struct task_struct *next)
3669{
3670	if (__this_cpu_read(perf_sched_cb_usages))
3671		perf_pmu_sched_task(task, next, false);
3672
3673	if (atomic_read(&nr_switch_events))
3674		perf_event_switch(task, next, false);
3675
3676	perf_event_context_sched_out(task, next);
3677
3678	/*
3679	 * if cgroup events exist on this CPU, then we need
3680	 * to check if we have to switch out PMU state.
3681	 * cgroup events are system-wide mode only
3682	 */
3683	perf_cgroup_switch(next);
3684}
3685
3686static bool perf_less_group_idx(const void *l, const void *r)
3687{
3688	const struct perf_event *le = *(const struct perf_event **)l;
3689	const struct perf_event *re = *(const struct perf_event **)r;
3690
3691	return le->group_index < re->group_index;
3692}
3693
3694static void swap_ptr(void *l, void *r)
3695{
3696	void **lp = l, **rp = r;
3697
3698	swap(*lp, *rp);
3699}
3700
3701static const struct min_heap_callbacks perf_min_heap = {
3702	.elem_size = sizeof(struct perf_event *),
3703	.less = perf_less_group_idx,
3704	.swp = swap_ptr,
3705};
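/*
 * Sketch of how visit_groups_merge() below uses this heap: it pushes one
 * iterator per source tree (the any-CPU events, this CPU's events and, on
 * CPU contexts, one per cgroup ancestor) and repeatedly pops the smallest
 * group_index, yielding a single stream of groups ordered as they were
 * inserted, merged across all sources.
 */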
3706
3707static void __heap_add(struct min_heap *heap, struct perf_event *event)
3708{
3709	struct perf_event **itrs = heap->data;
3710
3711	if (event) {
3712		itrs[heap->nr] = event;
3713		heap->nr++;
3714	}
3715}
3716
3717static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
3718{
3719	struct perf_cpu_pmu_context *cpc;
3720
3721	if (!pmu_ctx->ctx->task)
3722		return;
3723
3724	cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
3725	WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
3726	cpc->task_epc = pmu_ctx;
3727}
3728
3729static noinline int visit_groups_merge(struct perf_event_context *ctx,
3730				struct perf_event_groups *groups, int cpu,
3731				struct pmu *pmu,
3732				int (*func)(struct perf_event *, void *),
3733				void *data)
3734{
3735#ifdef CONFIG_CGROUP_PERF
3736	struct cgroup_subsys_state *css = NULL;
3737#endif
3738	struct perf_cpu_context *cpuctx = NULL;
3739	/* Space for per CPU and/or any CPU event iterators. */
3740	struct perf_event *itrs[2];
3741	struct min_heap event_heap;
3742	struct perf_event **evt;
3743	int ret;
3744
3745	if (pmu->filter && pmu->filter(pmu, cpu))
3746		return 0;
3747
3748	if (!ctx->task) {
3749		cpuctx = this_cpu_ptr(&perf_cpu_context);
3750		event_heap = (struct min_heap){
3751			.data = cpuctx->heap,
3752			.nr = 0,
3753			.size = cpuctx->heap_size,
3754		};
3755
3756		lockdep_assert_held(&cpuctx->ctx.lock);
3757
3758#ifdef CONFIG_CGROUP_PERF
3759		if (cpuctx->cgrp)
3760			css = &cpuctx->cgrp->css;
3761#endif
3762	} else {
3763		event_heap = (struct min_heap){
3764			.data = itrs,
3765			.nr = 0,
3766			.size = ARRAY_SIZE(itrs),
3767		};
3768		/* Events not within a CPU context may be on any CPU. */
3769		__heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
3770	}
3771	evt = event_heap.data;
3772
3773	__heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));
3774
3775#ifdef CONFIG_CGROUP_PERF
3776	for (; css; css = css->parent)
3777		__heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
3778#endif
3779
3780	if (event_heap.nr) {
3781		__link_epc((*evt)->pmu_ctx);
3782		perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
3783	}
3784
3785	min_heapify_all(&event_heap, &perf_min_heap);
3786
3787	while (event_heap.nr) {
3788		ret = func(*evt, data);
3789		if (ret)
3790			return ret;
3791
3792		*evt = perf_event_groups_next(*evt, pmu);
3793		if (*evt)
3794			min_heapify(&event_heap, 0, &perf_min_heap);
3795		else
3796			min_heap_pop(&event_heap, &perf_min_heap);
3797	}
3798
3799	return 0;
3800}
3801
3802/*
3803 * Because the userpage is strictly per-event (there is no concept of context,
3804 * so there cannot be a context indirection), every userpage must be updated
3805 * when context time starts :-(
3806 *
3807 * IOW, we must not miss EVENT_TIME edges.
3808 */
3809static inline bool event_update_userpage(struct perf_event *event)
3810{
3811	if (likely(!atomic_read(&event->mmap_count)))
3812		return false;
3813
3814	perf_event_update_time(event);
3815	perf_event_update_userpage(event);
3816
3817	return true;
3818}
3819
3820static inline void group_update_userpage(struct perf_event *group_event)
3821{
3822	struct perf_event *event;
3823
3824	if (!event_update_userpage(group_event))
3825		return;
3826
3827	for_each_sibling_event(event, group_event)
3828		event_update_userpage(event);
3829}
3830
3831static int merge_sched_in(struct perf_event *event, void *data)
3832{
3833	struct perf_event_context *ctx = event->ctx;
3834	int *can_add_hw = data;
3835
3836	if (event->state <= PERF_EVENT_STATE_OFF)
3837		return 0;
3838
3839	if (!event_filter_match(event))
3840		return 0;
3841
3842	if (group_can_go_on(event, *can_add_hw)) {
3843		if (!group_sched_in(event, ctx))
3844			list_add_tail(&event->active_list, get_event_list(event));
3845	}
3846
3847	if (event->state == PERF_EVENT_STATE_INACTIVE) {
3848		*can_add_hw = 0;
3849		if (event->attr.pinned) {
3850			perf_cgroup_event_disable(event, ctx);
3851			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3852		} else {
3853			struct perf_cpu_pmu_context *cpc;
3854
3855			event->pmu_ctx->rotate_necessary = 1;
3856			cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context);
3857			perf_mux_hrtimer_restart(cpc);
3858			group_update_userpage(event);
3859		}
3860	}
3861
3862	return 0;
3863}
3864
3865static void pmu_groups_sched_in(struct perf_event_context *ctx,
3866				struct perf_event_groups *groups,
3867				struct pmu *pmu)
3868{
3869	int can_add_hw = 1;
3870	visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
3871			   merge_sched_in, &can_add_hw);
3872}
3873
3874static void ctx_groups_sched_in(struct perf_event_context *ctx,
3875				struct perf_event_groups *groups,
3876				bool cgroup)
3877{
3878	struct perf_event_pmu_context *pmu_ctx;
3879
3880	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
3881		if (cgroup && !pmu_ctx->nr_cgroups)
3882			continue;
3883		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
3884	}
3885}
3886
3887static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
3888			       struct pmu *pmu)
3889{
3890	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
3891}
3892
3893static void
3894ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
3895{
3896	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
3897	int is_active = ctx->is_active;
3898	bool cgroup = event_type & EVENT_CGROUP;
3899
3900	event_type &= ~EVENT_CGROUP;
3901
3902	lockdep_assert_held(&ctx->lock);
3903
3904	if (likely(!ctx->nr_events))
3905		return;
3906
3907	if (!(is_active & EVENT_TIME)) {
3908		/* start ctx time */
3909		__update_context_time(ctx, false);
3910		perf_cgroup_set_timestamp(cpuctx);
3911		/*
3912		 * CPU-release for the below ->is_active store,
3913		 * see __load_acquire() in perf_event_time_now()
3914		 */
3915		barrier();
3916	}
3917
3918	ctx->is_active |= (event_type | EVENT_TIME);
3919	if (ctx->task) {
3920		if (!is_active)
3921			cpuctx->task_ctx = ctx;
3922		else
3923			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3924	}
3925
3926	is_active ^= ctx->is_active; /* changed bits */
3927
3928	/*
3929	 * First go through the list and put on any pinned groups
3930	 * in order to give them the best chance of going on.
3931	 */
3932	if (is_active & EVENT_PINNED)
3933		ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
3934
3935	/* Then walk through the lower prio flexible groups */
3936	if (is_active & EVENT_FLEXIBLE)
3937		ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
3938}
3939
3940static void perf_event_context_sched_in(struct task_struct *task)
3941{
3942	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
3943	struct perf_event_context *ctx;
3944
3945	rcu_read_lock();
3946	ctx = rcu_dereference(task->perf_event_ctxp);
3947	if (!ctx)
3948		goto rcu_unlock;
3949
3950	if (cpuctx->task_ctx == ctx) {
3951		perf_ctx_lock(cpuctx, ctx);
3952		perf_ctx_disable(ctx, false);
3953
3954		perf_ctx_sched_task_cb(ctx, true);
3955
3956		perf_ctx_enable(ctx, false);
3957		perf_ctx_unlock(cpuctx, ctx);
3958		goto rcu_unlock;
3959	}
3960
3961	perf_ctx_lock(cpuctx, ctx);
3962	/*
3963	 * We must check ctx->nr_events while holding ctx->lock, such
3964	 * that we serialize against perf_install_in_context().
3965	 */
3966	if (!ctx->nr_events)
3967		goto unlock;
3968
3969	perf_ctx_disable(ctx, false);
3970	/*
3971	 * We want to keep the following priority order:
3972	 * cpu pinned (that don't need to move), task pinned,
3973	 * cpu flexible, task flexible.
3974	 *
3975	 * However, if task's ctx is not carrying any pinned
3976	 * events, no need to flip the cpuctx's events around.
3977	 */
3978	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
3979		perf_ctx_disable(&cpuctx->ctx, false);
3980		ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
3981	}
3982
3983	perf_event_sched_in(cpuctx, ctx);
3984
3985	perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
3986
3987	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3988		perf_ctx_enable(&cpuctx->ctx, false);
3989
3990	perf_ctx_enable(ctx, false);
3991
3992unlock:
3993	perf_ctx_unlock(cpuctx, ctx);
3994rcu_unlock:
3995	rcu_read_unlock();
3996}
3997
3998/*
3999 * Called from scheduler to add the events of the current task
4000 * with interrupts disabled.
4001 *
4002 * We restore the event value and then enable it.
4003 *
4004 * This does not protect us against NMI, but enable()
4005 * sets the enabled bit in the control field of event _before_
 * accessing the event control register. If an NMI hits, then it will
4007 * keep the event running.
4008 */
4009void __perf_event_task_sched_in(struct task_struct *prev,
4010				struct task_struct *task)
4011{
4012	perf_event_context_sched_in(task);
4013
4014	if (atomic_read(&nr_switch_events))
4015		perf_event_switch(task, prev, true);
4016
4017	if (__this_cpu_read(perf_sched_cb_usages))
4018		perf_pmu_sched_task(prev, task, true);
4019}
4020
4021static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
4022{
4023	u64 frequency = event->attr.sample_freq;
4024	u64 sec = NSEC_PER_SEC;
4025	u64 divisor, dividend;
4026
4027	int count_fls, nsec_fls, frequency_fls, sec_fls;
4028
4029	count_fls = fls64(count);
4030	nsec_fls = fls64(nsec);
4031	frequency_fls = fls64(frequency);
4032	sec_fls = 30;
4033
4034	/*
4035	 * We got @count in @nsec, with a target of sample_freq HZ
4036	 * the target period becomes:
4037	 *
4038	 *             @count * 10^9
4039	 * period = -------------------
4040	 *          @nsec * sample_freq
4041	 *
4042	 */
4043
4044	/*
4045	 * Reduce accuracy by one bit such that @a and @b converge
4046	 * to a similar magnitude.
4047	 */
4048#define REDUCE_FLS(a, b)		\
4049do {					\
4050	if (a##_fls > b##_fls) {	\
4051		a >>= 1;		\
4052		a##_fls--;		\
4053	} else {			\
4054		b >>= 1;		\
4055		b##_fls--;		\
4056	}				\
4057} while (0)
4058
4059	/*
4060	 * Reduce accuracy until either term fits in a u64, then proceed with
4061	 * the other, so that finally we can do a u64/u64 division.
4062	 */
4063	while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
4064		REDUCE_FLS(nsec, frequency);
4065		REDUCE_FLS(sec, count);
4066	}
4067
4068	if (count_fls + sec_fls > 64) {
4069		divisor = nsec * frequency;
4070
4071		while (count_fls + sec_fls > 64) {
4072			REDUCE_FLS(count, sec);
4073			divisor >>= 1;
4074		}
4075
4076		dividend = count * sec;
4077	} else {
4078		dividend = count * sec;
4079
4080		while (nsec_fls + frequency_fls > 64) {
4081			REDUCE_FLS(nsec, frequency);
4082			dividend >>= 1;
4083		}
4084
4085		divisor = nsec * frequency;
4086	}
4087
4088	if (!divisor)
4089		return dividend;
4090
4091	return div64_u64(dividend, divisor);
4092}
4093
4094static DEFINE_PER_CPU(int, perf_throttled_count);
4095static DEFINE_PER_CPU(u64, perf_throttled_seq);
4096
4097static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
4098{
4099	struct hw_perf_event *hwc = &event->hw;
4100	s64 period, sample_period;
4101	s64 delta;
4102
4103	period = perf_calculate_period(event, nsec, count);
4104
4105	delta = (s64)(period - hwc->sample_period);
4106	delta = (delta + 7) / 8; /* low pass filter */
4107
4108	sample_period = hwc->sample_period + delta;
4109
4110	if (!sample_period)
4111		sample_period = 1;
4112
4113	hwc->sample_period = sample_period;
4114
4115	if (local64_read(&hwc->period_left) > 8*sample_period) {
4116		if (disable)
4117			event->pmu->stop(event, PERF_EF_UPDATE);
4118
4119		local64_set(&hwc->period_left, 0);
4120
4121		if (disable)
4122			event->pmu->start(event, PERF_EF_RELOAD);
4123	}
4124}
4125
4126/*
 * Combine freq adjustment with unthrottling to avoid two passes over the
 * events. At the same time, make sure that having freq events does not
 * change the rate of unthrottling, as that would introduce bias.
4130 */
4131static void
4132perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
4133{
4134	struct perf_event *event;
4135	struct hw_perf_event *hwc;
4136	u64 now, period = TICK_NSEC;
4137	s64 delta;
4138
4139	/*
	 * We only need to iterate over all events if:
	 * - the context has events in frequency mode (needs freq adjust)
	 * - there are events to unthrottle on this CPU
4143	 */
4144	if (!(ctx->nr_freq || unthrottle))
4145		return;
4146
4147	raw_spin_lock(&ctx->lock);
4148
4149	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4150		if (event->state != PERF_EVENT_STATE_ACTIVE)
4151			continue;
4152
4153		// XXX use visit thingy to avoid the -1,cpu match
4154		if (!event_filter_match(event))
4155			continue;
4156
4157		perf_pmu_disable(event->pmu);
4158
4159		hwc = &event->hw;
4160
4161		if (hwc->interrupts == MAX_INTERRUPTS) {
4162			hwc->interrupts = 0;
4163			perf_log_throttle(event, 1);
4164			event->pmu->start(event, 0);
4165		}
4166
4167		if (!event->attr.freq || !event->attr.sample_freq)
4168			goto next;
4169
4170		/*
4171		 * stop the event and update event->count
4172		 */
4173		event->pmu->stop(event, PERF_EF_UPDATE);
4174
4175		now = local64_read(&event->count);
4176		delta = now - hwc->freq_count_stamp;
4177		hwc->freq_count_stamp = now;
4178
4179		/*
		 * Restart the event; reload only if the value has changed.
		 * We have already stopped the event, so tell
		 * perf_adjust_period() not to stop it a second time.
4185		 */
4186		if (delta > 0)
4187			perf_adjust_period(event, period, delta, false);
4188
4189		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
4190	next:
4191		perf_pmu_enable(event->pmu);
4192	}
4193
4194	raw_spin_unlock(&ctx->lock);
4195}
4196
4197/*
 * Move @event to the tail of @ctx's eligible events.
4199 */
4200static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
4201{
4202	/*
	 * Rotate the first entry of the non-pinned groups to the tail. Rotation
	 * might be disabled by the inheritance code.
4205	 */
4206	if (ctx->rotate_disable)
4207		return;
4208
4209	perf_event_groups_delete(&ctx->flexible_groups, event);
4210	perf_event_groups_insert(&ctx->flexible_groups, event);
4211}
4212
4213/* pick an event from the flexible_groups to rotate */
4214static inline struct perf_event *
4215ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
4216{
4217	struct perf_event *event;
4218	struct rb_node *node;
4219	struct rb_root *tree;
4220	struct __group_key key = {
4221		.pmu = pmu_ctx->pmu,
4222	};
4223
4224	/* pick the first active flexible event */
4225	event = list_first_entry_or_null(&pmu_ctx->flexible_active,
4226					 struct perf_event, active_list);
4227	if (event)
4228		goto out;
4229
4230	/* if no active flexible event, pick the first event */
4231	tree = &pmu_ctx->ctx->flexible_groups.tree;
4232
4233	if (!pmu_ctx->ctx->task) {
4234		key.cpu = smp_processor_id();
4235
4236		node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
4237		if (node)
4238			event = __node_2_pe(node);
4239		goto out;
4240	}
4241
4242	key.cpu = -1;
4243	node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
4244	if (node) {
4245		event = __node_2_pe(node);
4246		goto out;
4247	}
4248
4249	key.cpu = smp_processor_id();
4250	node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
4251	if (node)
4252		event = __node_2_pe(node);
4253
4254out:
4255	/*
4256	 * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
4257	 * finds there are unschedulable events, it will set it again.
4258	 */
4259	pmu_ctx->rotate_necessary = 0;
4260
4261	return event;
4262}
4263
4264static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
4265{
4266	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
4267	struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
4268	struct perf_event *cpu_event = NULL, *task_event = NULL;
4269	int cpu_rotate, task_rotate;
4270	struct pmu *pmu;
4271
4272	/*
4273	 * Since we run this from IRQ context, nobody can install new
4274	 * events, thus the event count values are stable.
4275	 */
4276
4277	cpu_epc = &cpc->epc;
4278	pmu = cpu_epc->pmu;
4279	task_epc = cpc->task_epc;
4280
4281	cpu_rotate = cpu_epc->rotate_necessary;
4282	task_rotate = task_epc ? task_epc->rotate_necessary : 0;
4283
4284	if (!(cpu_rotate || task_rotate))
4285		return false;
4286
4287	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
4288	perf_pmu_disable(pmu);
4289
4290	if (task_rotate)
4291		task_event = ctx_event_to_rotate(task_epc);
4292	if (cpu_rotate)
4293		cpu_event = ctx_event_to_rotate(cpu_epc);
4294
4295	/*
	 * As per the order given at ctx_resched(), first 'pop' task flexible
	 * and then, if needed, CPU flexible.
4298	 */
4299	if (task_event || (task_epc && cpu_event)) {
4300		update_context_time(task_epc->ctx);
4301		__pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
4302	}
4303
4304	if (cpu_event) {
4305		update_context_time(&cpuctx->ctx);
4306		__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
4307		rotate_ctx(&cpuctx->ctx, cpu_event);
4308		__pmu_ctx_sched_in(&cpuctx->ctx, pmu);
4309	}
4310
4311	if (task_event)
4312		rotate_ctx(task_epc->ctx, task_event);
4313
4314	if (task_event || (task_epc && cpu_event))
4315		__pmu_ctx_sched_in(task_epc->ctx, pmu);
4316
4317	perf_pmu_enable(pmu);
4318	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
4319
4320	return true;
4321}
4322
4323void perf_event_task_tick(void)
4324{
4325	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
4326	struct perf_event_context *ctx;
4327	int throttled;
4328
4329	lockdep_assert_irqs_disabled();
4330
4331	__this_cpu_inc(perf_throttled_seq);
4332	throttled = __this_cpu_xchg(perf_throttled_count, 0);
4333	tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
4334
4335	perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);
4336
4337	rcu_read_lock();
4338	ctx = rcu_dereference(current->perf_event_ctxp);
4339	if (ctx)
4340		perf_adjust_freq_unthr_context(ctx, !!throttled);
4341	rcu_read_unlock();
4342}
4343
4344static int event_enable_on_exec(struct perf_event *event,
4345				struct perf_event_context *ctx)
4346{
4347	if (!event->attr.enable_on_exec)
4348		return 0;
4349
4350	event->attr.enable_on_exec = 0;
4351	if (event->state >= PERF_EVENT_STATE_INACTIVE)
4352		return 0;
4353
4354	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
4355
4356	return 1;
4357}
4358
4359/*
4360 * Enable all of a task's events that have been marked enable-on-exec.
4361 * This expects task == current.
4362 */
4363static void perf_event_enable_on_exec(struct perf_event_context *ctx)
4364{
4365	struct perf_event_context *clone_ctx = NULL;
4366	enum event_type_t event_type = 0;
4367	struct perf_cpu_context *cpuctx;
4368	struct perf_event *event;
4369	unsigned long flags;
4370	int enabled = 0;
4371
4372	local_irq_save(flags);
4373	if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
4374		goto out;
4375
4376	if (!ctx->nr_events)
4377		goto out;
4378
4379	cpuctx = this_cpu_ptr(&perf_cpu_context);
4380	perf_ctx_lock(cpuctx, ctx);
4381	ctx_sched_out(ctx, EVENT_TIME);
4382
4383	list_for_each_entry(event, &ctx->event_list, event_entry) {
4384		enabled |= event_enable_on_exec(event, ctx);
4385		event_type |= get_event_type(event);
4386	}
4387
4388	/*
4389	 * Unclone and reschedule this context if we enabled any event.
4390	 */
4391	if (enabled) {
4392		clone_ctx = unclone_ctx(ctx);
4393		ctx_resched(cpuctx, ctx, event_type);
4394	} else {
4395		ctx_sched_in(ctx, EVENT_TIME);
4396	}
4397	perf_ctx_unlock(cpuctx, ctx);
4398
4399out:
4400	local_irq_restore(flags);
4401
4402	if (clone_ctx)
4403		put_ctx(clone_ctx);
4404}
4405
4406static void perf_remove_from_owner(struct perf_event *event);
4407static void perf_event_exit_event(struct perf_event *event,
4408				  struct perf_event_context *ctx);
4409
4410/*
4411 * Removes all events from the current task that have been marked
4412 * remove-on-exec, and feeds their values back to parent events.
4413 */
4414static void perf_event_remove_on_exec(struct perf_event_context *ctx)
4415{
4416	struct perf_event_context *clone_ctx = NULL;
4417	struct perf_event *event, *next;
4418	unsigned long flags;
4419	bool modified = false;
4420
4421	mutex_lock(&ctx->mutex);
4422
4423	if (WARN_ON_ONCE(ctx->task != current))
4424		goto unlock;
4425
4426	list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
4427		if (!event->attr.remove_on_exec)
4428			continue;
4429
4430		if (!is_kernel_event(event))
4431			perf_remove_from_owner(event);
4432
4433		modified = true;
4434
4435		perf_event_exit_event(event, ctx);
4436	}
4437
4438	raw_spin_lock_irqsave(&ctx->lock, flags);
4439	if (modified)
4440		clone_ctx = unclone_ctx(ctx);
4441	raw_spin_unlock_irqrestore(&ctx->lock, flags);
4442
4443unlock:
4444	mutex_unlock(&ctx->mutex);
4445
4446	if (clone_ctx)
4447		put_ctx(clone_ctx);
4448}
4449
4450struct perf_read_data {
4451	struct perf_event *event;
4452	bool group;
4453	int ret;
4454};
4455
4456static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
4457{
4458	u16 local_pkg, event_pkg;
4459
4460	if ((unsigned)event_cpu >= nr_cpu_ids)
4461		return event_cpu;
4462
4463	if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
4464		int local_cpu = smp_processor_id();
4465
4466		event_pkg = topology_physical_package_id(event_cpu);
4467		local_pkg = topology_physical_package_id(local_cpu);
4468
4469		if (event_pkg == local_pkg)
4470			return local_cpu;
4471	}
4472
4473	return event_cpu;
4474}
4475
4476/*
4477 * Cross CPU call to read the hardware event
4478 */
4479static void __perf_event_read(void *info)
4480{
4481	struct perf_read_data *data = info;
4482	struct perf_event *sub, *event = data->event;
4483	struct perf_event_context *ctx = event->ctx;
4484	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
4485	struct pmu *pmu = event->pmu;
4486
4487	/*
4488	 * If this is a task context, we need to check whether it is
4489	 * the current task context of this cpu.  If not it has been
4490	 * scheduled out before the smp call arrived.  In that case
4491	 * event->count would have been updated to a recent sample
4492	 * when the event was scheduled out.
4493	 */
4494	if (ctx->task && cpuctx->task_ctx != ctx)
4495		return;
4496
4497	raw_spin_lock(&ctx->lock);
4498	if (ctx->is_active & EVENT_TIME) {
4499		update_context_time(ctx);
4500		update_cgrp_time_from_event(event);
4501	}
4502
4503	perf_event_update_time(event);
4504	if (data->group)
4505		perf_event_update_sibling_time(event);
4506
4507	if (event->state != PERF_EVENT_STATE_ACTIVE)
4508		goto unlock;
4509
4510	if (!data->group) {
4511		pmu->read(event);
4512		data->ret = 0;
4513		goto unlock;
4514	}
4515
4516	pmu->start_txn(pmu, PERF_PMU_TXN_READ);
4517
4518	pmu->read(event);
4519
4520	for_each_sibling_event(sub, event) {
4521		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
4522			/*
			 * Use the sibling's PMU rather than @event's, since the
			 * sibling could be on a different (e.g. software) PMU.
4525			 */
4526			sub->pmu->read(sub);
4527		}
4528	}
4529
4530	data->ret = pmu->commit_txn(pmu);
4531
4532unlock:
4533	raw_spin_unlock(&ctx->lock);
4534}
4535
4536static inline u64 perf_event_count(struct perf_event *event)
4537{
4538	return local64_read(&event->count) + atomic64_read(&event->child_count);
4539}
4540
4541static void calc_timer_values(struct perf_event *event,
4542				u64 *now,
4543				u64 *enabled,
4544				u64 *running)
4545{
4546	u64 ctx_time;
4547
4548	*now = perf_clock();
4549	ctx_time = perf_event_time_now(event, *now);
4550	__perf_update_times(event, ctx_time, enabled, running);
4551}
4552
4553/*
 * NMI-safe method to read a local event, that is an event that:
 *   - is either for the current task, or for this CPU
 *   - does not have inherit set, because inherited task events
 *     will not be local and we cannot read them atomically
4559 *   - must not have a pmu::count method
4560 */
4561int perf_event_read_local(struct perf_event *event, u64 *value,
4562			  u64 *enabled, u64 *running)
4563{
4564	unsigned long flags;
4565	int event_oncpu;
4566	int event_cpu;
4567	int ret = 0;
4568
4569	/*
4570	 * Disabling interrupts avoids all counter scheduling (context
4571	 * switches, timer based rotation and IPIs).
4572	 */
4573	local_irq_save(flags);
4574
4575	/*
4576	 * It must not be an event with inherit set, we cannot read
4577	 * all child counters from atomic context.
4578	 */
4579	if (event->attr.inherit) {
4580		ret = -EOPNOTSUPP;
4581		goto out;
4582	}
4583
4584	/* If this is a per-task event, it must be for current */
4585	if ((event->attach_state & PERF_ATTACH_TASK) &&
4586	    event->hw.target != current) {
4587		ret = -EINVAL;
4588		goto out;
4589	}
4590
4591	/*
4592	 * Get the event CPU numbers, and adjust them to local if the event is
4593	 * a per-package event that can be read locally
4594	 */
4595	event_oncpu = __perf_event_read_cpu(event, event->oncpu);
4596	event_cpu = __perf_event_read_cpu(event, event->cpu);
4597
4598	/* If this is a per-CPU event, it must be for this CPU */
4599	if (!(event->attach_state & PERF_ATTACH_TASK) &&
4600	    event_cpu != smp_processor_id()) {
4601		ret = -EINVAL;
4602		goto out;
4603	}
4604
4605	/* If this is a pinned event it must be running on this CPU */
4606	if (event->attr.pinned && event_oncpu != smp_processor_id()) {
4607		ret = -EBUSY;
4608		goto out;
4609	}
4610
4611	/*
	 * If the event is currently on this CPU, it's either a per-task event,
	 * or local to this CPU. Furthermore it means it's ACTIVE (otherwise
4614	 * oncpu == -1).
4615	 */
4616	if (event_oncpu == smp_processor_id())
4617		event->pmu->read(event);
4618
4619	*value = local64_read(&event->count);
4620	if (enabled || running) {
4621		u64 __enabled, __running, __now;
4622
4623		calc_timer_values(event, &__now, &__enabled, &__running);
4624		if (enabled)
4625			*enabled = __enabled;
4626		if (running)
4627			*running = __running;
4628	}
4629out:
4630	local_irq_restore(flags);
4631
4632	return ret;
4633}
4634
4635static int perf_event_read(struct perf_event *event, bool group)
4636{
4637	enum perf_event_state state = READ_ONCE(event->state);
4638	int event_cpu, ret = 0;
4639
4640	/*
4641	 * If event is enabled and currently active on a CPU, update the
4642	 * value in the event structure:
4643	 */
4644again:
4645	if (state == PERF_EVENT_STATE_ACTIVE) {
4646		struct perf_read_data data;
4647
4648		/*
4649		 * Orders the ->state and ->oncpu loads such that if we see
4650		 * ACTIVE we must also see the right ->oncpu.
4651		 *
4652		 * Matches the smp_wmb() from event_sched_in().
4653		 */
4654		smp_rmb();
4655
4656		event_cpu = READ_ONCE(event->oncpu);
4657		if ((unsigned)event_cpu >= nr_cpu_ids)
4658			return 0;
4659
4660		data = (struct perf_read_data){
4661			.event = event,
4662			.group = group,
4663			.ret = 0,
4664		};
4665
4666		preempt_disable();
4667		event_cpu = __perf_event_read_cpu(event, event_cpu);
4668
4669		/*
4670		 * Purposely ignore the smp_call_function_single() return
4671		 * value.
4672		 *
4673		 * If event_cpu isn't a valid CPU it means the event got
4674		 * scheduled out and that will have updated the event count.
4675		 *
4676		 * Therefore, either way, we'll have an up-to-date event count
4677		 * after this.
4678		 */
4679		(void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4680		preempt_enable();
4681		ret = data.ret;
4682
4683	} else if (state == PERF_EVENT_STATE_INACTIVE) {
4684		struct perf_event_context *ctx = event->ctx;
4685		unsigned long flags;
4686
4687		raw_spin_lock_irqsave(&ctx->lock, flags);
4688		state = event->state;
4689		if (state != PERF_EVENT_STATE_INACTIVE) {
4690			raw_spin_unlock_irqrestore(&ctx->lock, flags);
4691			goto again;
4692		}
4693
4694		/*
		 * We may read while the context is not active (e.g., the thread
		 * is blocked); in that case we cannot update the context time.
4697		 */
4698		if (ctx->is_active & EVENT_TIME) {
4699			update_context_time(ctx);
4700			update_cgrp_time_from_event(event);
4701		}
4702
4703		perf_event_update_time(event);
4704		if (group)
4705			perf_event_update_sibling_time(event);
4706		raw_spin_unlock_irqrestore(&ctx->lock, flags);
4707	}
4708
4709	return ret;
4710}
4711
4712/*
4713 * Initialize the perf_event context in a task_struct:
4714 */
4715static void __perf_event_init_context(struct perf_event_context *ctx)
4716{
4717	raw_spin_lock_init(&ctx->lock);
4718	mutex_init(&ctx->mutex);
4719	INIT_LIST_HEAD(&ctx->pmu_ctx_list);
4720	perf_event_groups_init(&ctx->pinned_groups);
4721	perf_event_groups_init(&ctx->flexible_groups);
4722	INIT_LIST_HEAD(&ctx->event_list);
4723	refcount_set(&ctx->refcount, 1);
4724}
4725
4726static void
4727__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
4728{
4729	epc->pmu = pmu;
4730	INIT_LIST_HEAD(&epc->pmu_ctx_entry);
4731	INIT_LIST_HEAD(&epc->pinned_active);
4732	INIT_LIST_HEAD(&epc->flexible_active);
4733	atomic_set(&epc->refcount, 1);
4734}
4735
4736static struct perf_event_context *
4737alloc_perf_context(struct task_struct *task)
4738{
4739	struct perf_event_context *ctx;
4740
4741	ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4742	if (!ctx)
4743		return NULL;
4744
4745	__perf_event_init_context(ctx);
4746	if (task)
4747		ctx->task = get_task_struct(task);
4748
4749	return ctx;
4750}
4751
4752static struct task_struct *
4753find_lively_task_by_vpid(pid_t vpid)
4754{
4755	struct task_struct *task;
4756
4757	rcu_read_lock();
4758	if (!vpid)
4759		task = current;
4760	else
4761		task = find_task_by_vpid(vpid);
4762	if (task)
4763		get_task_struct(task);
4764	rcu_read_unlock();
4765
4766	if (!task)
4767		return ERR_PTR(-ESRCH);
4768
4769	return task;
4770}
4771
4772/*
4773 * Returns a matching context with refcount and pincount.
4774 */
4775static struct perf_event_context *
4776find_get_context(struct task_struct *task, struct perf_event *event)
4777{
4778	struct perf_event_context *ctx, *clone_ctx = NULL;
4779	struct perf_cpu_context *cpuctx;
4780	unsigned long flags;
4781	int err;
4782
4783	if (!task) {
4784		/* Must be root to operate on a CPU event: */
4785		err = perf_allow_cpu(&event->attr);
4786		if (err)
4787			return ERR_PTR(err);
4788
4789		cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
4790		ctx = &cpuctx->ctx;
4791		get_ctx(ctx);
4792		raw_spin_lock_irqsave(&ctx->lock, flags);
4793		++ctx->pin_count;
4794		raw_spin_unlock_irqrestore(&ctx->lock, flags);
4795
4796		return ctx;
4797	}
4798
4799	err = -EINVAL;
4800retry:
4801	ctx = perf_lock_task_context(task, &flags);
4802	if (ctx) {
4803		clone_ctx = unclone_ctx(ctx);
4804		++ctx->pin_count;
4805
4806		raw_spin_unlock_irqrestore(&ctx->lock, flags);
4807
4808		if (clone_ctx)
4809			put_ctx(clone_ctx);
4810	} else {
4811		ctx = alloc_perf_context(task);
4812		err = -ENOMEM;
4813		if (!ctx)
4814			goto errout;
4815
4816		err = 0;
4817		mutex_lock(&task->perf_event_mutex);
4818		/*
		 * If it has already passed perf_event_exit_task(),
		 * we must see PF_EXITING; it takes this mutex too.
4821		 */
4822		if (task->flags & PF_EXITING)
4823			err = -ESRCH;
4824		else if (task->perf_event_ctxp)
4825			err = -EAGAIN;
4826		else {
4827			get_ctx(ctx);
4828			++ctx->pin_count;
4829			rcu_assign_pointer(task->perf_event_ctxp, ctx);
4830		}
4831		mutex_unlock(&task->perf_event_mutex);
4832
4833		if (unlikely(err)) {
4834			put_ctx(ctx);
4835
4836			if (err == -EAGAIN)
4837				goto retry;
4838			goto errout;
4839		}
4840	}
4841
4842	return ctx;
4843
4844errout:
4845	return ERR_PTR(err);
4846}
4847
4848static struct perf_event_pmu_context *
4849find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
4850		     struct perf_event *event)
4851{
4852	struct perf_event_pmu_context *new = NULL, *epc;
4853	void *task_ctx_data = NULL;
4854
4855	if (!ctx->task) {
4856		/*
4857		 * perf_pmu_migrate_context() / __perf_pmu_install_event()
4858		 * relies on the fact that find_get_pmu_context() cannot fail
4859		 * for CPU contexts.
4860		 */
4861		struct perf_cpu_pmu_context *cpc;
4862
4863		cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
4864		epc = &cpc->epc;
4865		raw_spin_lock_irq(&ctx->lock);
4866		if (!epc->ctx) {
4867			atomic_set(&epc->refcount, 1);
4868			epc->embedded = 1;
4869			list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
4870			epc->ctx = ctx;
4871		} else {
4872			WARN_ON_ONCE(epc->ctx != ctx);
4873			atomic_inc(&epc->refcount);
4874		}
4875		raw_spin_unlock_irq(&ctx->lock);
4876		return epc;
4877	}
4878
4879	new = kzalloc(sizeof(*epc), GFP_KERNEL);
4880	if (!new)
4881		return ERR_PTR(-ENOMEM);
4882
4883	if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4884		task_ctx_data = alloc_task_ctx_data(pmu);
4885		if (!task_ctx_data) {
4886			kfree(new);
4887			return ERR_PTR(-ENOMEM);
4888		}
4889	}
4890
4891	__perf_init_event_pmu_context(new, pmu);
4892
4893	/*
4894	 * XXX
4895	 *
4896	 * lockdep_assert_held(&ctx->mutex);
4897	 *
4898	 * can't because perf_event_init_task() doesn't actually hold the
4899	 * child_ctx->mutex.
4900	 */
4901
4902	raw_spin_lock_irq(&ctx->lock);
4903	list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
4904		if (epc->pmu == pmu) {
4905			WARN_ON_ONCE(epc->ctx != ctx);
4906			atomic_inc(&epc->refcount);
4907			goto found_epc;
4908		}
4909	}
4910
4911	epc = new;
4912	new = NULL;
4913
4914	list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
4915	epc->ctx = ctx;
4916
4917found_epc:
4918	if (task_ctx_data && !epc->task_ctx_data) {
4919		epc->task_ctx_data = task_ctx_data;
4920		task_ctx_data = NULL;
4921		ctx->nr_task_data++;
4922	}
4923	raw_spin_unlock_irq(&ctx->lock);
4924
4925	free_task_ctx_data(pmu, task_ctx_data);
4926	kfree(new);
4927
4928	return epc;
4929}
4930
4931static void get_pmu_ctx(struct perf_event_pmu_context *epc)
4932{
4933	WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
4934}
4935
4936static void free_epc_rcu(struct rcu_head *head)
4937{
4938	struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
4939
4940	kfree(epc->task_ctx_data);
4941	kfree(epc);
4942}
4943
4944static void put_pmu_ctx(struct perf_event_pmu_context *epc)
4945{
4946	struct perf_event_context *ctx = epc->ctx;
4947	unsigned long flags;
4948
4949	/*
4950	 * XXX
4951	 *
4952	 * lockdep_assert_held(&ctx->mutex);
4953	 *
4954	 * can't because of the call-site in _free_event()/put_event()
4955	 * which isn't always called under ctx->mutex.
4956	 */
4957	if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags))
4958		return;
4959
4960	WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
4961
4962	list_del_init(&epc->pmu_ctx_entry);
4963	epc->ctx = NULL;
4964
4965	WARN_ON_ONCE(!list_empty(&epc->pinned_active));
4966	WARN_ON_ONCE(!list_empty(&epc->flexible_active));
4967
4968	raw_spin_unlock_irqrestore(&ctx->lock, flags);
4969
4970	if (epc->embedded)
4971		return;
4972
4973	call_rcu(&epc->rcu_head, free_epc_rcu);
4974}
4975
4976static void perf_event_free_filter(struct perf_event *event);
4977
4978static void free_event_rcu(struct rcu_head *head)
4979{
4980	struct perf_event *event = container_of(head, typeof(*event), rcu_head);
4981
4982	if (event->ns)
4983		put_pid_ns(event->ns);
4984	perf_event_free_filter(event);
4985	kmem_cache_free(perf_event_cache, event);
4986}
4987
4988static void ring_buffer_attach(struct perf_event *event,
4989			       struct perf_buffer *rb);
4990
4991static void detach_sb_event(struct perf_event *event)
4992{
4993	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4994
4995	raw_spin_lock(&pel->lock);
4996	list_del_rcu(&event->sb_list);
4997	raw_spin_unlock(&pel->lock);
4998}
4999
5000static bool is_sb_event(struct perf_event *event)
5001{
5002	struct perf_event_attr *attr = &event->attr;
5003
5004	if (event->parent)
5005		return false;
5006
5007	if (event->attach_state & PERF_ATTACH_TASK)
5008		return false;
5009
5010	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
5011	    attr->comm || attr->comm_exec ||
5012	    attr->task || attr->ksymbol ||
5013	    attr->context_switch || attr->text_poke ||
5014	    attr->bpf_event)
5015		return true;
5016	return false;
5017}
5018
5019static void unaccount_pmu_sb_event(struct perf_event *event)
5020{
5021	if (is_sb_event(event))
5022		detach_sb_event(event);
5023}
5024
5025#ifdef CONFIG_NO_HZ_FULL
5026static DEFINE_SPINLOCK(nr_freq_lock);
5027#endif
5028
5029static void unaccount_freq_event_nohz(void)
5030{
5031#ifdef CONFIG_NO_HZ_FULL
5032	spin_lock(&nr_freq_lock);
5033	if (atomic_dec_and_test(&nr_freq_events))
5034		tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
5035	spin_unlock(&nr_freq_lock);
5036#endif
5037}
5038
5039static void unaccount_freq_event(void)
5040{
5041	if (tick_nohz_full_enabled())
5042		unaccount_freq_event_nohz();
5043	else
5044		atomic_dec(&nr_freq_events);
5045}
5046
5047static void unaccount_event(struct perf_event *event)
5048{
5049	bool dec = false;
5050
5051	if (event->parent)
5052		return;
5053
5054	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
5055		dec = true;
5056	if (event->attr.mmap || event->attr.mmap_data)
5057		atomic_dec(&nr_mmap_events);
5058	if (event->attr.build_id)
5059		atomic_dec(&nr_build_id_events);
5060	if (event->attr.comm)
5061		atomic_dec(&nr_comm_events);
5062	if (event->attr.namespaces)
5063		atomic_dec(&nr_namespaces_events);
5064	if (event->attr.cgroup)
5065		atomic_dec(&nr_cgroup_events);
5066	if (event->attr.task)
5067		atomic_dec(&nr_task_events);
5068	if (event->attr.freq)
5069		unaccount_freq_event();
5070	if (event->attr.context_switch) {
5071		dec = true;
5072		atomic_dec(&nr_switch_events);
5073	}
5074	if (is_cgroup_event(event))
5075		dec = true;
5076	if (has_branch_stack(event))
5077		dec = true;
5078	if (event->attr.ksymbol)
5079		atomic_dec(&nr_ksymbol_events);
5080	if (event->attr.bpf_event)
5081		atomic_dec(&nr_bpf_events);
5082	if (event->attr.text_poke)
5083		atomic_dec(&nr_text_poke_events);
5084
5085	if (dec) {
5086		if (!atomic_add_unless(&perf_sched_count, -1, 1))
5087			schedule_delayed_work(&perf_sched_work, HZ);
5088	}
5089
5090	unaccount_pmu_sb_event(event);
5091}
5092
5093static void perf_sched_delayed(struct work_struct *work)
5094{
5095	mutex_lock(&perf_sched_mutex);
5096	if (atomic_dec_and_test(&perf_sched_count))
5097		static_branch_disable(&perf_sched_events);
5098	mutex_unlock(&perf_sched_mutex);
5099}
5100
5101/*
5102 * The following implement mutual exclusion of events on "exclusive" pmus
5103 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
5104 * at a time, so we disallow creating events that might conflict, namely:
5105 *
5106 *  1) cpu-wide events in the presence of per-task events,
5107 *  2) per-task events in the presence of cpu-wide events,
5108 *  3) two matching events on the same perf_event_context.
5109 *
5110 * The former two cases are handled in the allocation path (perf_event_alloc(),
5111 * _free_event()), the latter -- before the first perf_install_in_context().
5112 */
5113static int exclusive_event_init(struct perf_event *event)
5114{
5115	struct pmu *pmu = event->pmu;
5116
5117	if (!is_exclusive_pmu(pmu))
5118		return 0;
5119
5120	/*
5121	 * Prevent co-existence of per-task and cpu-wide events on the
5122	 * same exclusive pmu.
5123	 *
5124	 * Negative pmu::exclusive_cnt means there are cpu-wide
5125	 * events on this "exclusive" pmu, positive means there are
5126	 * per-task events.
5127	 *
5128	 * Since this is called in perf_event_alloc() path, event::ctx
5129	 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
5130	 * to mean "per-task event", because unlike other attach states it
5131	 * never gets cleared.
5132	 */
5133	if (event->attach_state & PERF_ATTACH_TASK) {
5134		if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
5135			return -EBUSY;
5136	} else {
5137		if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
5138			return -EBUSY;
5139	}
5140
5141	return 0;
5142}
5143
5144static void exclusive_event_destroy(struct perf_event *event)
5145{
5146	struct pmu *pmu = event->pmu;
5147
5148	if (!is_exclusive_pmu(pmu))
5149		return;
5150
5151	/* see comment in exclusive_event_init() */
5152	if (event->attach_state & PERF_ATTACH_TASK)
5153		atomic_dec(&pmu->exclusive_cnt);
5154	else
5155		atomic_inc(&pmu->exclusive_cnt);
5156}
5157
5158static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
5159{
5160	if ((e1->pmu == e2->pmu) &&
5161	    (e1->cpu == e2->cpu ||
5162	     e1->cpu == -1 ||
5163	     e2->cpu == -1))
5164		return true;
5165	return false;
5166}
5167
5168static bool exclusive_event_installable(struct perf_event *event,
5169					struct perf_event_context *ctx)
5170{
5171	struct perf_event *iter_event;
5172	struct pmu *pmu = event->pmu;
5173
5174	lockdep_assert_held(&ctx->mutex);
5175
5176	if (!is_exclusive_pmu(pmu))
5177		return true;
5178
5179	list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
5180		if (exclusive_event_match(iter_event, event))
5181			return false;
5182	}
5183
5184	return true;
5185}
5186
5187static void perf_addr_filters_splice(struct perf_event *event,
5188				       struct list_head *head);
5189
5190static void _free_event(struct perf_event *event)
5191{
5192	irq_work_sync(&event->pending_irq);
5193
5194	unaccount_event(event);
5195
5196	security_perf_event_free(event);
5197
5198	if (event->rb) {
5199		/*
5200		 * Can happen when we close an event with re-directed output.
5201		 *
5202		 * Since we have a 0 refcount, perf_mmap_close() will skip
5203		 * over us; possibly making our ring_buffer_put() the last.
5204		 */
5205		mutex_lock(&event->mmap_mutex);
5206		ring_buffer_attach(event, NULL);
5207		mutex_unlock(&event->mmap_mutex);
5208	}
5209
5210	if (is_cgroup_event(event))
5211		perf_detach_cgroup(event);
5212
5213	if (!event->parent) {
5214		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
5215			put_callchain_buffers();
5216	}
5217
5218	perf_event_free_bpf_prog(event);
5219	perf_addr_filters_splice(event, NULL);
5220	kfree(event->addr_filter_ranges);
5221
5222	if (event->destroy)
5223		event->destroy(event);
5224
5225	/*
5226	 * Must be after ->destroy(), due to uprobe_perf_close() using
5227	 * hw.target.
5228	 */
5229	if (event->hw.target)
5230		put_task_struct(event->hw.target);
5231
5232	if (event->pmu_ctx)
5233		put_pmu_ctx(event->pmu_ctx);
5234
5235	/*
5236	 * perf_event_free_task() relies on put_ctx() being 'last', in particular
5237	 * all task references must be cleaned up.
5238	 */
5239	if (event->ctx)
5240		put_ctx(event->ctx);
5241
5242	exclusive_event_destroy(event);
5243	module_put(event->pmu->module);
5244
5245	call_rcu(&event->rcu_head, free_event_rcu);
5246}
5247
5248/*
 * Used to free events with a known refcount of 1, such as in error paths
 * where the event isn't exposed yet, and for inherited events.
5251 */
5252static void free_event(struct perf_event *event)
5253{
5254	if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
5255				"unexpected event refcount: %ld; ptr=%p\n",
5256				atomic_long_read(&event->refcount), event)) {
5257		/* leak to avoid use-after-free */
5258		return;
5259	}
5260
5261	_free_event(event);
5262}
5263
5264/*
5265 * Remove user event from the owner task.
5266 */
5267static void perf_remove_from_owner(struct perf_event *event)
5268{
5269	struct task_struct *owner;
5270
5271	rcu_read_lock();
5272	/*
5273	 * Matches the smp_store_release() in perf_event_exit_task(). If we
5274	 * observe !owner it means the list deletion is complete and we can
5275	 * indeed free this event, otherwise we need to serialize on
5276	 * owner->perf_event_mutex.
5277	 */
5278	owner = READ_ONCE(event->owner);
5279	if (owner) {
5280		/*
5281		 * Since delayed_put_task_struct() also drops the last
5282		 * task reference we can safely take a new reference
5283		 * while holding the rcu_read_lock().
5284		 */
5285		get_task_struct(owner);
5286	}
5287	rcu_read_unlock();
5288
5289	if (owner) {
5290		/*
5291		 * If we're here through perf_event_exit_task() we're already
5292		 * holding ctx->mutex which would be an inversion wrt. the
5293		 * normal lock order.
5294		 *
		 * However we can safely take this lock because it's the child
5296		 * ctx->mutex.
5297		 */
5298		mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
5299
5300		/*
		 * We have to re-check the event->owner field; if it is cleared
		 * we raced with perf_event_exit_task(). Acquiring the mutex
		 * ensures they're done, and we can proceed with freeing the
		 * event.
5305		 */
5306		if (event->owner) {
5307			list_del_init(&event->owner_entry);
5308			smp_store_release(&event->owner, NULL);
5309		}
5310		mutex_unlock(&owner->perf_event_mutex);
5311		put_task_struct(owner);
5312	}
5313}
5314
5315static void put_event(struct perf_event *event)
5316{
5317	if (!atomic_long_dec_and_test(&event->refcount))
5318		return;
5319
5320	_free_event(event);
5321}
5322
5323/*
5324 * Kill an event dead; while event:refcount will preserve the event
5325 * object, it will not preserve its functionality. Once the last 'user'
5326 * gives up the object, we'll destroy the thing.
5327 */
5328int perf_event_release_kernel(struct perf_event *event)
5329{
5330	struct perf_event_context *ctx = event->ctx;
5331	struct perf_event *child, *tmp;
5332	LIST_HEAD(free_list);
5333
5334	/*
5335	 * If we got here through err_alloc: free_event(event); we will not
5336	 * have attached to a context yet.
5337	 */
5338	if (!ctx) {
5339		WARN_ON_ONCE(event->attach_state &
5340				(PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
5341		goto no_ctx;
5342	}
5343
5344	if (!is_kernel_event(event))
5345		perf_remove_from_owner(event);
5346
5347	ctx = perf_event_ctx_lock(event);
5348	WARN_ON_ONCE(ctx->parent_ctx);
5349
5350	/*
5351	 * Mark this event as STATE_DEAD, there is no external reference to it
5352	 * anymore.
5353	 *
5354	 * Anybody acquiring event->child_mutex after the below loop _must_
5355	 * also see this, most importantly inherit_event() which will avoid
5356	 * placing more children on the list.
5357	 *
5358	 * Thus this guarantees that we will in fact observe and kill _ALL_
5359	 * child events.
5360	 */
5361	perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
5362
5363	perf_event_ctx_unlock(event, ctx);
5364
5365again:
5366	mutex_lock(&event->child_mutex);
5367	list_for_each_entry(child, &event->child_list, child_list) {
5368
5369		/*
5370		 * Cannot change, child events are not migrated, see the
5371		 * comment with perf_event_ctx_lock_nested().
5372		 */
5373		ctx = READ_ONCE(child->ctx);
5374		/*
5375		 * Since child_mutex nests inside ctx::mutex, we must jump
5376		 * through hoops. We start by grabbing a reference on the ctx.
5377		 *
5378		 * Since the event cannot get freed while we hold the
5379		 * child_mutex, the context must also exist and have a !0
5380		 * reference count.
5381		 */
5382		get_ctx(ctx);
5383
5384		/*
5385		 * Now that we have a ctx ref, we can drop child_mutex, and
5386		 * acquire ctx::mutex without fear of it going away. Then we
5387		 * can re-acquire child_mutex.
5388		 */
5389		mutex_unlock(&event->child_mutex);
5390		mutex_lock(&ctx->mutex);
5391		mutex_lock(&event->child_mutex);
5392
5393		/*
5394		 * Now that we hold ctx::mutex and child_mutex, revalidate our
		 * state: if child is still the first entry, it didn't get freed
		 * and we can continue.
5397		 */
5398		tmp = list_first_entry_or_null(&event->child_list,
5399					       struct perf_event, child_list);
5400		if (tmp == child) {
5401			perf_remove_from_context(child, DETACH_GROUP);
5402			list_move(&child->child_list, &free_list);
5403			/*
5404			 * This matches the refcount bump in inherit_event();
5405			 * this can't be the last reference.
5406			 */
5407			put_event(event);
5408		}
5409
5410		mutex_unlock(&event->child_mutex);
5411		mutex_unlock(&ctx->mutex);
5412		put_ctx(ctx);
5413		goto again;
5414	}
5415	mutex_unlock(&event->child_mutex);
5416
5417	list_for_each_entry_safe(child, tmp, &free_list, child_list) {
5418		void *var = &child->ctx->refcount;
5419
5420		list_del(&child->child_list);
5421		free_event(child);
5422
5423		/*
5424		 * Wake any perf_event_free_task() waiting for this event to be
5425		 * freed.
5426		 */
5427		smp_mb(); /* pairs with wait_var_event() */
5428		wake_up_var(var);
5429	}
5430
5431no_ctx:
5432	put_event(event); /* Must be the 'last' reference */
5433	return 0;
5434}
5435EXPORT_SYMBOL_GPL(perf_event_release_kernel);
5436
5437/*
5438 * Called when the last reference to the file is gone.
5439 */
5440static int perf_release(struct inode *inode, struct file *file)
5441{
5442	perf_event_release_kernel(file->private_data);
5443	return 0;
5444}
5445
5446static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5447{
5448	struct perf_event *child;
5449	u64 total = 0;
5450
5451	*enabled = 0;
5452	*running = 0;
5453
5454	mutex_lock(&event->child_mutex);
5455
5456	(void)perf_event_read(event, false);
5457	total += perf_event_count(event);
5458
5459	*enabled += event->total_time_enabled +
5460			atomic64_read(&event->child_total_time_enabled);
5461	*running += event->total_time_running +
5462			atomic64_read(&event->child_total_time_running);
5463
5464	list_for_each_entry(child, &event->child_list, child_list) {
5465		(void)perf_event_read(child, false);
5466		total += perf_event_count(child);
5467		*enabled += child->total_time_enabled;
5468		*running += child->total_time_running;
5469	}
5470	mutex_unlock(&event->child_mutex);
5471
5472	return total;
5473}
5474
5475u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5476{
5477	struct perf_event_context *ctx;
5478	u64 count;
5479
5480	ctx = perf_event_ctx_lock(event);
5481	count = __perf_event_read_value(event, enabled, running);
5482	perf_event_ctx_unlock(event, ctx);
5483
5484	return count;
5485}
5486EXPORT_SYMBOL_GPL(perf_event_read_value);
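
/*
 * Minimal in-kernel usage sketch (illustrative; assumes an event from
 * perf_event_create_kernel_counter()):
 *
 *	u64 enabled, running;
 *	u64 count = perf_event_read_value(event, &enabled, &running);
 *
 * @count sums the event and all of its inherited children; @enabled and
 * @running let the caller scale for multiplexing (count * enabled /
 * running) when the event was not scheduled the whole time.
 */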
5487
5488static int __perf_read_group_add(struct perf_event *leader,
5489					u64 read_format, u64 *values)
5490{
5491	struct perf_event_context *ctx = leader->ctx;
5492	struct perf_event *sub, *parent;
5493	unsigned long flags;
5494	int n = 1; /* skip @nr */
5495	int ret;
5496
5497	ret = perf_event_read(leader, true);
5498	if (ret)
5499		return ret;
5500
5501	raw_spin_lock_irqsave(&ctx->lock, flags);
5502	/*
5503	 * Verify the grouping between the parent and child (inherited)
	 * events is still intact.
5505	 *
5506	 * Specifically:
5507	 *  - leader->ctx->lock pins leader->sibling_list
5508	 *  - parent->child_mutex pins parent->child_list
5509	 *  - parent->ctx->mutex pins parent->sibling_list
5510	 *
5511	 * Because parent->ctx != leader->ctx (and child_list nests inside
5512	 * ctx->mutex), group destruction is not atomic between children, also
5513	 * see perf_event_release_kernel(). Additionally, parent can grow the
5514	 * group.
5515	 *
5516	 * Therefore it is possible to have parent and child groups in a
	 * different configuration, and summing over such a beast makes no
	 * sense whatsoever.
5519	 *
5520	 * Reject this.
5521	 */
5522	parent = leader->parent;
5523	if (parent &&
5524	    (parent->group_generation != leader->group_generation ||
5525	     parent->nr_siblings != leader->nr_siblings)) {
5526		ret = -ECHILD;
5527		goto unlock;
5528	}
5529
5530	/*
5531	 * Since we co-schedule groups, {enabled,running} times of siblings
5532	 * will be identical to those of the leader, so we only publish one
5533	 * set.
5534	 */
5535	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5536		values[n++] += leader->total_time_enabled +
5537			atomic64_read(&leader->child_total_time_enabled);
5538	}
5539
5540	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5541		values[n++] += leader->total_time_running +
5542			atomic64_read(&leader->child_total_time_running);
5543	}
5544
5545	/*
5546	 * Write {count,id} tuples for every sibling.
5547	 */
5548	values[n++] += perf_event_count(leader);
5549	if (read_format & PERF_FORMAT_ID)
5550		values[n++] = primary_event_id(leader);
5551	if (read_format & PERF_FORMAT_LOST)
5552		values[n++] = atomic64_read(&leader->lost_samples);
5553
5554	for_each_sibling_event(sub, leader) {
5555		values[n++] += perf_event_count(sub);
5556		if (read_format & PERF_FORMAT_ID)
5557			values[n++] = primary_event_id(sub);
5558		if (read_format & PERF_FORMAT_LOST)
5559			values[n++] = atomic64_read(&sub->lost_samples);
5560	}
5561
5562unlock:
5563	raw_spin_unlock_irqrestore(&ctx->lock, flags);
5564	return ret;
5565}
5566
5567static int perf_read_group(struct perf_event *event,
5568				   u64 read_format, char __user *buf)
5569{
5570	struct perf_event *leader = event->group_leader, *child;
5571	struct perf_event_context *ctx = leader->ctx;
5572	int ret;
5573	u64 *values;
5574
5575	lockdep_assert_held(&ctx->mutex);
5576
5577	values = kzalloc(event->read_size, GFP_KERNEL);
5578	if (!values)
5579		return -ENOMEM;
5580
5581	values[0] = 1 + leader->nr_siblings;
5582
5583	mutex_lock(&leader->child_mutex);
5584
5585	ret = __perf_read_group_add(leader, read_format, values);
5586	if (ret)
5587		goto unlock;
5588
5589	list_for_each_entry(child, &leader->child_list, child_list) {
5590		ret = __perf_read_group_add(child, read_format, values);
5591		if (ret)
5592			goto unlock;
5593	}
5594
5595	mutex_unlock(&leader->child_mutex);
5596
5597	ret = event->read_size;
5598	if (copy_to_user(buf, values, event->read_size))
5599		ret = -EFAULT;
5600	goto out;
5601
5602unlock:
5603	mutex_unlock(&leader->child_mutex);
5604out:
5605	kfree(values);
5606	return ret;
5607}
5608
5609static int perf_read_one(struct perf_event *event,
5610				 u64 read_format, char __user *buf)
5611{
5612	u64 enabled, running;
5613	u64 values[5];
5614	int n = 0;
5615
5616	values[n++] = __perf_event_read_value(event, &enabled, &running);
5617	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5618		values[n++] = enabled;
5619	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5620		values[n++] = running;
5621	if (read_format & PERF_FORMAT_ID)
5622		values[n++] = primary_event_id(event);
5623	if (read_format & PERF_FORMAT_LOST)
5624		values[n++] = atomic64_read(&event->lost_samples);
5625
5626	if (copy_to_user(buf, values, n * sizeof(u64)))
5627		return -EFAULT;
5628
5629	return n * sizeof(u64);
5630}
5631
5632static bool is_event_hup(struct perf_event *event)
5633{
5634	bool no_children;
5635
5636	if (event->state > PERF_EVENT_STATE_EXIT)
5637		return false;
5638
5639	mutex_lock(&event->child_mutex);
5640	no_children = list_empty(&event->child_list);
5641	mutex_unlock(&event->child_mutex);
5642	return no_children;
5643}
5644
5645/*
5646 * Read the performance event - simple non blocking version for now
5647 */
5648static ssize_t
5649__perf_read(struct perf_event *event, char __user *buf, size_t count)
5650{
5651	u64 read_format = event->attr.read_format;
5652	int ret;
5653
5654	/*
5655	 * Return end-of-file for a read on an event that is in
5656	 * error state (i.e. because it was pinned but it couldn't be
5657	 * scheduled on to the CPU at some point).
5658	 */
5659	if (event->state == PERF_EVENT_STATE_ERROR)
5660		return 0;
5661
5662	if (count < event->read_size)
5663		return -ENOSPC;
5664
5665	WARN_ON_ONCE(event->ctx->parent_ctx);
5666	if (read_format & PERF_FORMAT_GROUP)
5667		ret = perf_read_group(event, read_format, buf);
5668	else
5669		ret = perf_read_one(event, read_format, buf);
5670
5671	return ret;
5672}
5673
5674static ssize_t
5675perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
5676{
5677	struct perf_event *event = file->private_data;
5678	struct perf_event_context *ctx;
5679	int ret;
5680
5681	ret = security_perf_event_read(event);
5682	if (ret)
5683		return ret;
5684
5685	ctx = perf_event_ctx_lock(event);
5686	ret = __perf_read(event, buf, count);
5687	perf_event_ctx_unlock(event, ctx);
5688
5689	return ret;
5690}
5691
5692static __poll_t perf_poll(struct file *file, poll_table *wait)
5693{
5694	struct perf_event *event = file->private_data;
5695	struct perf_buffer *rb;
5696	__poll_t events = EPOLLHUP;
5697
5698	poll_wait(file, &event->waitq, wait);
5699
5700	if (is_event_hup(event))
5701		return events;
5702
5703	/*
5704	 * Pin the event->rb by taking event->mmap_mutex; otherwise
5705	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
5706	 */
5707	mutex_lock(&event->mmap_mutex);
5708	rb = event->rb;
5709	if (rb)
5710		events = atomic_xchg(&rb->poll, 0);
5711	mutex_unlock(&event->mmap_mutex);
5712	return events;
5713}
5714
5715static void _perf_event_reset(struct perf_event *event)
5716{
5717	(void)perf_event_read(event, false);
5718	local64_set(&event->count, 0);
5719	perf_event_update_userpage(event);
5720}
5721
5722/* Assume it's not an event with inherit set. */
5723u64 perf_event_pause(struct perf_event *event, bool reset)
5724{
5725	struct perf_event_context *ctx;
5726	u64 count;
5727
5728	ctx = perf_event_ctx_lock(event);
5729	WARN_ON_ONCE(event->attr.inherit);
5730	_perf_event_disable(event);
5731	count = local64_read(&event->count);
5732	if (reset)
5733		local64_set(&event->count, 0);
5734	perf_event_ctx_unlock(event, ctx);
5735
5736	return count;
5737}
5738EXPORT_SYMBOL_GPL(perf_event_pause);
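
/*
 * Hypothetical in-kernel caller (illustrative only; assumes an event
 * created with perf_event_create_kernel_counter()):
 *
 *	u64 seen = perf_event_pause(event, true);	// stop, read, reset
 *	// ... consume 'seen' ...
 *	perf_event_enable(event);			// resume counting
 */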
5739
5740/*
5741 * Holding the top-level event's child_mutex means that any
5742 * descendant process that has inherited this event will block
5743 * in perf_event_exit_event() if it goes to exit, thus satisfying the
5744 * task existence requirements of perf_event_enable/disable.
5745 */
5746static void perf_event_for_each_child(struct perf_event *event,
5747					void (*func)(struct perf_event *))
5748{
5749	struct perf_event *child;
5750
5751	WARN_ON_ONCE(event->ctx->parent_ctx);
5752
5753	mutex_lock(&event->child_mutex);
5754	func(event);
5755	list_for_each_entry(child, &event->child_list, child_list)
5756		func(child);
5757	mutex_unlock(&event->child_mutex);
5758}
5759
5760static void perf_event_for_each(struct perf_event *event,
5761				  void (*func)(struct perf_event *))
5762{
5763	struct perf_event_context *ctx = event->ctx;
5764	struct perf_event *sibling;
5765
5766	lockdep_assert_held(&ctx->mutex);
5767
5768	event = event->group_leader;
5769
5770	perf_event_for_each_child(event, func);
5771	for_each_sibling_event(sibling, event)
5772		perf_event_for_each_child(sibling, func);
5773}
5774
5775static void __perf_event_period(struct perf_event *event,
5776				struct perf_cpu_context *cpuctx,
5777				struct perf_event_context *ctx,
5778				void *info)
5779{
5780	u64 value = *((u64 *)info);
5781	bool active;
5782
5783	if (event->attr.freq) {
5784		event->attr.sample_freq = value;
5785	} else {
5786		event->attr.sample_period = value;
5787		event->hw.sample_period = value;
5788	}
5789
5790	active = (event->state == PERF_EVENT_STATE_ACTIVE);
5791	if (active) {
5792		perf_pmu_disable(event->pmu);
5793		/*
5794		 * We could be throttled; unthrottle now to avoid the tick
5795		 * trying to unthrottle while we already re-started the event.
5796		 */
5797		if (event->hw.interrupts == MAX_INTERRUPTS) {
5798			event->hw.interrupts = 0;
5799			perf_log_throttle(event, 1);
5800		}
5801		event->pmu->stop(event, PERF_EF_UPDATE);
5802	}
5803
5804	local64_set(&event->hw.period_left, 0);
5805
5806	if (active) {
5807		event->pmu->start(event, PERF_EF_RELOAD);
5808		perf_pmu_enable(event->pmu);
5809	}
5810}
5811
5812static int perf_event_check_period(struct perf_event *event, u64 value)
5813{
5814	return event->pmu->check_period(event, value);
5815}
5816
5817static int _perf_event_period(struct perf_event *event, u64 value)
5818{
5819	if (!is_sampling_event(event))
5820		return -EINVAL;
5821
5822	if (!value)
5823		return -EINVAL;
5824
5825	if (event->attr.freq && value > sysctl_perf_event_sample_rate)
5826		return -EINVAL;
5827
5828	if (perf_event_check_period(event, value))
5829		return -EINVAL;
5830
5831	if (!event->attr.freq && (value & (1ULL << 63)))
5832		return -EINVAL;
5833
5834	event_function_call(event, __perf_event_period, &value);
5835
5836	return 0;
5837}
5838
5839int perf_event_period(struct perf_event *event, u64 value)
5840{
5841	struct perf_event_context *ctx;
5842	int ret;
5843
5844	ctx = perf_event_ctx_lock(event);
5845	ret = _perf_event_period(event, value);
5846	perf_event_ctx_unlock(event, ctx);
5847
5848	return ret;
5849}
5850EXPORT_SYMBOL_GPL(perf_event_period);
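
/*
 * Userspace reaches _perf_event_period() via the PERF_EVENT_IOC_PERIOD
 * ioctl below, roughly like this (illustrative, error checking omitted):
 *
 *	u64 new_period = 200000;
 *
 *	ioctl(fd, PERF_EVENT_IOC_PERIOD, &new_period);
 *
 * For freq-mode events the same call updates attr.sample_freq instead,
 * as handled by __perf_event_period().
 */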
5851
5852static const struct file_operations perf_fops;
5853
5854static inline int perf_fget_light(int fd, struct fd *p)
5855{
5856	struct fd f = fdget(fd);
5857	if (!f.file)
5858		return -EBADF;
5859
5860	if (f.file->f_op != &perf_fops) {
5861		fdput(f);
5862		return -EBADF;
5863	}
5864	*p = f;
5865	return 0;
5866}
5867
5868static int perf_event_set_output(struct perf_event *event,
5869				 struct perf_event *output_event);
5870static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5871static int perf_copy_attr(struct perf_event_attr __user *uattr,
5872			  struct perf_event_attr *attr);
5873
5874static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5875{
5876	void (*func)(struct perf_event *);
5877	u32 flags = arg;
5878
5879	switch (cmd) {
5880	case PERF_EVENT_IOC_ENABLE:
5881		func = _perf_event_enable;
5882		break;
5883	case PERF_EVENT_IOC_DISABLE:
5884		func = _perf_event_disable;
5885		break;
5886	case PERF_EVENT_IOC_RESET:
5887		func = _perf_event_reset;
5888		break;
5889
5890	case PERF_EVENT_IOC_REFRESH:
5891		return _perf_event_refresh(event, arg);
5892
5893	case PERF_EVENT_IOC_PERIOD:
5894	{
5895		u64 value;
5896
5897		if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
5898			return -EFAULT;
5899
5900		return _perf_event_period(event, value);
5901	}
5902	case PERF_EVENT_IOC_ID:
5903	{
5904		u64 id = primary_event_id(event);
5905
5906		if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5907			return -EFAULT;
5908		return 0;
5909	}
5910
5911	case PERF_EVENT_IOC_SET_OUTPUT:
5912	{
5913		int ret;
5914		if (arg != -1) {
5915			struct perf_event *output_event;
5916			struct fd output;
5917			ret = perf_fget_light(arg, &output);
5918			if (ret)
5919				return ret;
5920			output_event = output.file->private_data;
5921			ret = perf_event_set_output(event, output_event);
5922			fdput(output);
5923		} else {
5924			ret = perf_event_set_output(event, NULL);
5925		}
5926		return ret;
5927	}
5928
5929	case PERF_EVENT_IOC_SET_FILTER:
5930		return perf_event_set_filter(event, (void __user *)arg);
5931
5932	case PERF_EVENT_IOC_SET_BPF:
5933	{
5934		struct bpf_prog *prog;
5935		int err;
5936
5937		prog = bpf_prog_get(arg);
5938		if (IS_ERR(prog))
5939			return PTR_ERR(prog);
5940
5941		err = perf_event_set_bpf_prog(event, prog, 0);
5942		if (err) {
5943			bpf_prog_put(prog);
5944			return err;
5945		}
5946
5947		return 0;
5948	}
5949
5950	case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5951		struct perf_buffer *rb;
5952
5953		rcu_read_lock();
5954		rb = rcu_dereference(event->rb);
5955		if (!rb || !rb->nr_pages) {
5956			rcu_read_unlock();
5957			return -EINVAL;
5958		}
5959		rb_toggle_paused(rb, !!arg);
5960		rcu_read_unlock();
5961		return 0;
5962	}
5963
5964	case PERF_EVENT_IOC_QUERY_BPF:
5965		return perf_event_query_prog_array(event, (void __user *)arg);
5966
5967	case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5968		struct perf_event_attr new_attr;
5969		int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5970					 &new_attr);
5971
5972		if (err)
5973			return err;
5974
5975		return perf_event_modify_attr(event,  &new_attr);
5976	}
5977	default:
5978		return -ENOTTY;
5979	}
5980
5981	if (flags & PERF_IOC_FLAG_GROUP)
5982		perf_event_for_each(event, func);
5983	else
5984		perf_event_for_each_child(event, func);
5985
5986	return 0;
5987}
5988
5989static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5990{
5991	struct perf_event *event = file->private_data;
5992	struct perf_event_context *ctx;
5993	long ret;
5994
5995	/* Treat ioctl like writes as it is likely a mutating operation. */
5996	ret = security_perf_event_write(event);
5997	if (ret)
5998		return ret;
5999
6000	ctx = perf_event_ctx_lock(event);
6001	ret = _perf_ioctl(event, cmd, arg);
6002	perf_event_ctx_unlock(event, ctx);
6003
6004	return ret;
6005}
6006
6007#ifdef CONFIG_COMPAT
6008static long perf_compat_ioctl(struct file *file, unsigned int cmd,
6009				unsigned long arg)
6010{
6011	switch (_IOC_NR(cmd)) {
6012	case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
6013	case _IOC_NR(PERF_EVENT_IOC_ID):
6014	case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
6015	case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
6016		/* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case) */
6017		if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
6018			cmd &= ~IOCSIZE_MASK;
6019			cmd |= sizeof(void *) << IOCSIZE_SHIFT;
6020		}
6021		break;
6022	}
6023	return perf_ioctl(file, cmd, arg);
6024}
6025#else
6026# define perf_compat_ioctl NULL
6027#endif
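
/*
 * Userspace sketch of the PERF_EVENT_IOC_SET_OUTPUT path handled in
 * _perf_ioctl() above; 'fd_a' and 'fd_b' are assumed to be perf event fds,
 * with 'fd_a' owning an mmap()ed ring buffer:
 *
 *	if (ioctl(fd_b, PERF_EVENT_IOC_SET_OUTPUT, fd_a))
 *		err(1, "PERF_EVENT_IOC_SET_OUTPUT");
 *
 * After this, samples from fd_b land in fd_a's buffer; passing -1 instead
 * of fd_a undoes the redirection (perf_event_set_output(event, NULL)).
 */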
6028
6029int perf_event_task_enable(void)
6030{
6031	struct perf_event_context *ctx;
6032	struct perf_event *event;
6033
6034	mutex_lock(&current->perf_event_mutex);
6035	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
6036		ctx = perf_event_ctx_lock(event);
6037		perf_event_for_each_child(event, _perf_event_enable);
6038		perf_event_ctx_unlock(event, ctx);
6039	}
6040	mutex_unlock(&current->perf_event_mutex);
6041
6042	return 0;
6043}
6044
6045int perf_event_task_disable(void)
6046{
6047	struct perf_event_context *ctx;
6048	struct perf_event *event;
6049
6050	mutex_lock(&current->perf_event_mutex);
6051	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
6052		ctx = perf_event_ctx_lock(event);
6053		perf_event_for_each_child(event, _perf_event_disable);
6054		perf_event_ctx_unlock(event, ctx);
6055	}
6056	mutex_unlock(&current->perf_event_mutex);
6057
6058	return 0;
6059}
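
/*
 * Both helpers above back the PR_TASK_PERF_EVENTS_{EN,DIS}ABLE prctl()s;
 * a hedged userspace sketch bracketing a region that should not be counted
 * by any event the calling task created:
 *
 *	prctl(PR_TASK_PERF_EVENTS_DISABLE, 0, 0, 0, 0);
 *	run_uncounted_section();
 *	prctl(PR_TASK_PERF_EVENTS_ENABLE, 0, 0, 0, 0);
 *
 * run_uncounted_section() is a hypothetical placeholder.
 */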
6060
6061static int perf_event_index(struct perf_event *event)
6062{
6063	if (event->hw.state & PERF_HES_STOPPED)
6064		return 0;
6065
6066	if (event->state != PERF_EVENT_STATE_ACTIVE)
6067		return 0;
6068
6069	return event->pmu->event_idx(event);
6070}
6071
6072static void perf_event_init_userpage(struct perf_event *event)
6073{
6074	struct perf_event_mmap_page *userpg;
6075	struct perf_buffer *rb;
6076
6077	rcu_read_lock();
6078	rb = rcu_dereference(event->rb);
6079	if (!rb)
6080		goto unlock;
6081
6082	userpg = rb->user_page;
6083
6084	/* Allow new userspace to detect that bit 0 is deprecated */
6085	userpg->cap_bit0_is_deprecated = 1;
6086	userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
6087	userpg->data_offset = PAGE_SIZE;
6088	userpg->data_size = perf_data_size(rb);
6089
6090unlock:
6091	rcu_read_unlock();
6092}
6093
6094void __weak arch_perf_update_userpage(
6095	struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
6096{
6097}
6098
6099/*
6100 * Callers need to ensure there can be no nesting of this function, otherwise
6101 * the seqlock logic goes bad. We cannot serialize this because the arch
6102 * code calls this from NMI context.
6103 */
6104void perf_event_update_userpage(struct perf_event *event)
6105{
6106	struct perf_event_mmap_page *userpg;
6107	struct perf_buffer *rb;
6108	u64 enabled, running, now;
6109
6110	rcu_read_lock();
6111	rb = rcu_dereference(event->rb);
6112	if (!rb)
6113		goto unlock;
6114
6115	/*
6116	 * compute total_time_enabled, total_time_running
6117	 * based on snapshot values taken when the event
6118	 * was last scheduled in.
6119	 *
6120	 * we cannot simply call update_context_time()
6121	 * because of locking issues, as we can be called in
6122	 * NMI context
6123	 */
6124	calc_timer_values(event, &now, &enabled, &running);
6125
6126	userpg = rb->user_page;
6127	/*
6128	 * Disable preemption to guarantee consistent time stamps are stored to
6129	 * the user page.
6130	 */
6131	preempt_disable();
6132	++userpg->lock;
6133	barrier();
6134	userpg->index = perf_event_index(event);
6135	userpg->offset = perf_event_count(event);
6136	if (userpg->index)
6137		userpg->offset -= local64_read(&event->hw.prev_count);
6138
6139	userpg->time_enabled = enabled +
6140			atomic64_read(&event->child_total_time_enabled);
6141
6142	userpg->time_running = running +
6143			atomic64_read(&event->child_total_time_running);
6144
6145	arch_perf_update_userpage(event, userpg, now);
6146
6147	barrier();
6148	++userpg->lock;
6149	preempt_enable();
6150unlock:
6151	rcu_read_unlock();
6152}
6153EXPORT_SYMBOL_GPL(perf_event_update_userpage);
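
/*
 * The ->lock field toggled above acts as an embryonic seqcount; a sketch of
 * the retry loop a userspace reader of the mmap()ed page is expected to use,
 * with 'pc' pointing at the struct perf_event_mmap_page:
 *
 *	u32 seq;
 *	u64 enabled, running, offset;
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		enabled = pc->time_enabled;
 *		running = pc->time_running;
 *		offset  = pc->offset;
 *		barrier();
 *	} while (pc->lock != seq);
 */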
6154
6155static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
6156{
6157	struct perf_event *event = vmf->vma->vm_file->private_data;
6158	struct perf_buffer *rb;
6159	vm_fault_t ret = VM_FAULT_SIGBUS;
6160
6161	if (vmf->flags & FAULT_FLAG_MKWRITE) {
6162		if (vmf->pgoff == 0)
6163			ret = 0;
6164		return ret;
6165	}
6166
6167	rcu_read_lock();
6168	rb = rcu_dereference(event->rb);
6169	if (!rb)
6170		goto unlock;
6171
6172	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
6173		goto unlock;
6174
6175	vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
6176	if (!vmf->page)
6177		goto unlock;
6178
6179	get_page(vmf->page);
6180	vmf->page->mapping = vmf->vma->vm_file->f_mapping;
6181	vmf->page->index   = vmf->pgoff;
6182
6183	ret = 0;
6184unlock:
6185	rcu_read_unlock();
6186
6187	return ret;
6188}
6189
6190static void ring_buffer_attach(struct perf_event *event,
6191			       struct perf_buffer *rb)
6192{
6193	struct perf_buffer *old_rb = NULL;
6194	unsigned long flags;
6195
6196	WARN_ON_ONCE(event->parent);
6197
6198	if (event->rb) {
6199		/*
6200		 * Should be impossible: we set this when removing
6201		 * event->rb_entry and wait/clear when adding event->rb_entry.
6202		 */
6203		WARN_ON_ONCE(event->rcu_pending);
6204
6205		old_rb = event->rb;
6206		spin_lock_irqsave(&old_rb->event_lock, flags);
6207		list_del_rcu(&event->rb_entry);
6208		spin_unlock_irqrestore(&old_rb->event_lock, flags);
6209
6210		event->rcu_batches = get_state_synchronize_rcu();
6211		event->rcu_pending = 1;
6212	}
6213
6214	if (rb) {
6215		if (event->rcu_pending) {
6216			cond_synchronize_rcu(event->rcu_batches);
6217			event->rcu_pending = 0;
6218		}
6219
6220		spin_lock_irqsave(&rb->event_lock, flags);
6221		list_add_rcu(&event->rb_entry, &rb->event_list);
6222		spin_unlock_irqrestore(&rb->event_lock, flags);
6223	}
6224
6225	/*
6226	 * Avoid racing with perf_mmap_close(AUX): stop the event
6227	 * before swizzling the event::rb pointer; if it's getting
6228	 * unmapped, its aux_mmap_count will be 0 and it won't
6229	 * restart. See the comment in __perf_pmu_output_stop().
6230	 *
6231	 * Data will inevitably be lost when set_output is done in
6232	 * mid-air, but then again, whoever does it like this is
6233	 * not in for the data anyway.
6234	 */
6235	if (has_aux(event))
6236		perf_event_stop(event, 0);
6237
6238	rcu_assign_pointer(event->rb, rb);
6239
6240	if (old_rb) {
6241		ring_buffer_put(old_rb);
6242		/*
6243		 * Since we had to detach the old rb before we could attach
6244		 * the new one, we could have missed a wakeup.
6245		 * Provide it now.
6246		 */
6247		wake_up_all(&event->waitq);
6248	}
6249}
6250
6251static void ring_buffer_wakeup(struct perf_event *event)
6252{
6253	struct perf_buffer *rb;
6254
6255	if (event->parent)
6256		event = event->parent;
6257
6258	rcu_read_lock();
6259	rb = rcu_dereference(event->rb);
6260	if (rb) {
6261		list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
6262			wake_up_all(&event->waitq);
6263	}
6264	rcu_read_unlock();
6265}
6266
6267struct perf_buffer *ring_buffer_get(struct perf_event *event)
6268{
6269	struct perf_buffer *rb;
6270
6271	if (event->parent)
6272		event = event->parent;
6273
6274	rcu_read_lock();
6275	rb = rcu_dereference(event->rb);
6276	if (rb) {
6277		if (!refcount_inc_not_zero(&rb->refcount))
6278			rb = NULL;
6279	}
6280	rcu_read_unlock();
6281
6282	return rb;
6283}
6284
6285void ring_buffer_put(struct perf_buffer *rb)
6286{
6287	if (!refcount_dec_and_test(&rb->refcount))
6288		return;
6289
6290	WARN_ON_ONCE(!list_empty(&rb->event_list));
6291
6292	call_rcu(&rb->rcu_head, rb_free_rcu);
6293}
6294
6295static void perf_mmap_open(struct vm_area_struct *vma)
6296{
6297	struct perf_event *event = vma->vm_file->private_data;
6298
6299	atomic_inc(&event->mmap_count);
6300	atomic_inc(&event->rb->mmap_count);
6301
6302	if (vma->vm_pgoff)
6303		atomic_inc(&event->rb->aux_mmap_count);
6304
6305	if (event->pmu->event_mapped)
6306		event->pmu->event_mapped(event, vma->vm_mm);
6307}
6308
6309static void perf_pmu_output_stop(struct perf_event *event);
6310
6311/*
6312 * A buffer can be mmap()ed multiple times; either directly through the same
6313 * event, or through other events by use of perf_event_set_output().
6314 *
6315 * In order to undo the VM accounting done by perf_mmap() we need to destroy
6316 * the buffer here, where we still have a VM context. This means we need
6317 * to detach all events redirecting to us.
6318 */
6319static void perf_mmap_close(struct vm_area_struct *vma)
6320{
6321	struct perf_event *event = vma->vm_file->private_data;
6322	struct perf_buffer *rb = ring_buffer_get(event);
6323	struct user_struct *mmap_user = rb->mmap_user;
6324	int mmap_locked = rb->mmap_locked;
6325	unsigned long size = perf_data_size(rb);
6326	bool detach_rest = false;
6327
6328	if (event->pmu->event_unmapped)
6329		event->pmu->event_unmapped(event, vma->vm_mm);
6330
6331	/*
6332	 * rb->aux_mmap_count will always drop before rb->mmap_count and
6333	 * event->mmap_count, so it is ok to use event->mmap_mutex to
6334	 * serialize with perf_mmap here.
6335	 */
6336	if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
6337	    atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
6338		/*
6339		 * Stop all AUX events that are writing to this buffer,
6340		 * so that we can free its AUX pages and corresponding PMU
6341		 * data. Note that after rb::aux_mmap_count dropped to zero,
6342		 * they won't start any more (see perf_aux_output_begin()).
6343		 */
6344		perf_pmu_output_stop(event);
6345
6346		/* now it's safe to free the pages */
6347		atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
6348		atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
6349
6350		/* this has to be the last one */
6351		rb_free_aux(rb);
6352		WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
6353
6354		mutex_unlock(&event->mmap_mutex);
6355	}
6356
6357	if (atomic_dec_and_test(&rb->mmap_count))
6358		detach_rest = true;
6359
6360	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
6361		goto out_put;
6362
6363	ring_buffer_attach(event, NULL);
6364	mutex_unlock(&event->mmap_mutex);
6365
6366	/* If there's still other mmap()s of this buffer, we're done. */
6367	if (!detach_rest)
6368		goto out_put;
6369
6370	/*
6371	 * No other mmap()s, detach from all other events that might redirect
6372	 * into the now unreachable buffer. Somewhat complicated by the
6373	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
6374	 */
6375again:
6376	rcu_read_lock();
6377	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
6378		if (!atomic_long_inc_not_zero(&event->refcount)) {
6379			/*
6380			 * This event is en-route to free_event() which will
6381			 * detach it and remove it from the list.
6382			 */
6383			continue;
6384		}
6385		rcu_read_unlock();
6386
6387		mutex_lock(&event->mmap_mutex);
6388		/*
6389		 * Check we didn't race with perf_event_set_output() which can
6390		 * swizzle the rb from under us while we were waiting to
6391		 * acquire mmap_mutex.
6392		 *
6393		 * If we find a different rb, ignore this event; the next
6394		 * iteration will no longer find it on the list. We still have
6395		 * to restart the iteration to make sure we're not now
6396		 * iterating the wrong list.
6397		 */
6398		if (event->rb == rb)
6399			ring_buffer_attach(event, NULL);
6400
6401		mutex_unlock(&event->mmap_mutex);
6402		put_event(event);
6403
6404		/*
6405		 * Restart the iteration; either we're on the wrong list or
6406		 * we've destroyed its integrity by doing a deletion.
6407		 */
6408		goto again;
6409	}
6410	rcu_read_unlock();
6411
6412	/*
6413	 * It could be that there are still a few 0-ref events on the list; they'll
6414	 * get cleaned up by free_event() -- they'll also still have their
6415	 * ref on the rb and will free it whenever they are done with it.
6416	 *
6417	 * Aside from that, this buffer is 'fully' detached and unmapped,
6418	 * undo the VM accounting.
6419	 */
6420
6421	atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
6422			&mmap_user->locked_vm);
6423	atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
6424	free_uid(mmap_user);
6425
6426out_put:
6427	ring_buffer_put(rb); /* could be last */
6428}
6429
6430static const struct vm_operations_struct perf_mmap_vmops = {
6431	.open		= perf_mmap_open,
6432	.close		= perf_mmap_close, /* non mergeable */
6433	.fault		= perf_mmap_fault,
6434	.page_mkwrite	= perf_mmap_fault,
6435};
6436
6437static int perf_mmap(struct file *file, struct vm_area_struct *vma)
6438{
6439	struct perf_event *event = file->private_data;
6440	unsigned long user_locked, user_lock_limit;
6441	struct user_struct *user = current_user();
6442	struct perf_buffer *rb = NULL;
6443	unsigned long locked, lock_limit;
6444	unsigned long vma_size;
6445	unsigned long nr_pages;
6446	long user_extra = 0, extra = 0;
6447	int ret = 0, flags = 0;
6448
6449	/*
6450	 * Don't allow mmap() of inherited per-task counters. This would
6451	 * create a performance issue due to all children writing to the
6452	 * same rb.
6453	 */
6454	if (event->cpu == -1 && event->attr.inherit)
6455		return -EINVAL;
6456
6457	if (!(vma->vm_flags & VM_SHARED))
6458		return -EINVAL;
6459
6460	ret = security_perf_event_read(event);
6461	if (ret)
6462		return ret;
6463
6464	vma_size = vma->vm_end - vma->vm_start;
6465
6466	if (vma->vm_pgoff == 0) {
6467		nr_pages = (vma_size / PAGE_SIZE) - 1;
6468	} else {
6469		/*
6470		 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
6471		 * mapped; all subsequent mappings should have the same size
6472		 * and offset, and must be above the normal perf buffer.
6473		 */
6474		u64 aux_offset, aux_size;
6475
6476		if (!event->rb)
6477			return -EINVAL;
6478
6479		nr_pages = vma_size / PAGE_SIZE;
6480
6481		mutex_lock(&event->mmap_mutex);
6482		ret = -EINVAL;
6483
6484		rb = event->rb;
6485		if (!rb)
6486			goto aux_unlock;
6487
6488		aux_offset = READ_ONCE(rb->user_page->aux_offset);
6489		aux_size = READ_ONCE(rb->user_page->aux_size);
6490
6491		if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
6492			goto aux_unlock;
6493
6494		if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
6495			goto aux_unlock;
6496
6497		/* already mapped with a different offset */
6498		if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
6499			goto aux_unlock;
6500
6501		if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
6502			goto aux_unlock;
6503
6504		/* already mapped with a different size */
6505		if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
6506			goto aux_unlock;
6507
6508		if (!is_power_of_2(nr_pages))
6509			goto aux_unlock;
6510
6511		if (!atomic_inc_not_zero(&rb->mmap_count))
6512			goto aux_unlock;
6513
6514		if (rb_has_aux(rb)) {
6515			atomic_inc(&rb->aux_mmap_count);
6516			ret = 0;
6517			goto unlock;
6518		}
6519
6520		atomic_set(&rb->aux_mmap_count, 1);
6521		user_extra = nr_pages;
6522
6523		goto accounting;
6524	}
6525
6526	/*
6527	 * If we have rb pages ensure they're a power-of-two number, so we
6528	 * can do bitmasks instead of modulo.
6529	 */
6530	if (nr_pages != 0 && !is_power_of_2(nr_pages))
6531		return -EINVAL;
6532
6533	if (vma_size != PAGE_SIZE * (1 + nr_pages))
6534		return -EINVAL;
6535
6536	WARN_ON_ONCE(event->ctx->parent_ctx);
6537again:
6538	mutex_lock(&event->mmap_mutex);
6539	if (event->rb) {
6540		if (data_page_nr(event->rb) != nr_pages) {
6541			ret = -EINVAL;
6542			goto unlock;
6543		}
6544
6545		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
6546			/*
6547			 * Raced against perf_mmap_close(); remove the
6548			 * event and try again.
6549			 */
6550			ring_buffer_attach(event, NULL);
6551			mutex_unlock(&event->mmap_mutex);
6552			goto again;
6553		}
6554
6555		goto unlock;
6556	}
6557
6558	user_extra = nr_pages + 1;
6559
6560accounting:
6561	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
6562
6563	/*
6564	 * Increase the limit linearly with more CPUs:
6565	 */
6566	user_lock_limit *= num_online_cpus();
6567
6568	user_locked = atomic_long_read(&user->locked_vm);
6569
6570	/*
6571	 * sysctl_perf_event_mlock may have changed, so that
6572	 *     user->locked_vm > user_lock_limit
6573	 */
6574	if (user_locked > user_lock_limit)
6575		user_locked = user_lock_limit;
6576	user_locked += user_extra;
6577
6578	if (user_locked > user_lock_limit) {
6579		/*
6580		 * charge locked_vm until it hits user_lock_limit;
6581		 * charge the rest from pinned_vm
6582		 */
6583		extra = user_locked - user_lock_limit;
6584		user_extra -= extra;
6585	}
6586
6587	lock_limit = rlimit(RLIMIT_MEMLOCK);
6588	lock_limit >>= PAGE_SHIFT;
6589	locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
6590
6591	if ((locked > lock_limit) && perf_is_paranoid() &&
6592		!capable(CAP_IPC_LOCK)) {
6593		ret = -EPERM;
6594		goto unlock;
6595	}
6596
6597	WARN_ON(!rb && event->rb);
6598
6599	if (vma->vm_flags & VM_WRITE)
6600		flags |= RING_BUFFER_WRITABLE;
6601
6602	if (!rb) {
6603		rb = rb_alloc(nr_pages,
6604			      event->attr.watermark ? event->attr.wakeup_watermark : 0,
6605			      event->cpu, flags);
6606
6607		if (!rb) {
6608			ret = -ENOMEM;
6609			goto unlock;
6610		}
6611
6612		atomic_set(&rb->mmap_count, 1);
6613		rb->mmap_user = get_current_user();
6614		rb->mmap_locked = extra;
6615
6616		ring_buffer_attach(event, rb);
6617
6618		perf_event_update_time(event);
6619		perf_event_init_userpage(event);
6620		perf_event_update_userpage(event);
6621	} else {
6622		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
6623				   event->attr.aux_watermark, flags);
6624		if (!ret)
6625			rb->aux_mmap_locked = extra;
6626	}
6627
6628unlock:
6629	if (!ret) {
6630		atomic_long_add(user_extra, &user->locked_vm);
6631		atomic64_add(extra, &vma->vm_mm->pinned_vm);
6632
6633		atomic_inc(&event->mmap_count);
6634	} else if (rb) {
6635		atomic_dec(&rb->mmap_count);
6636	}
6637aux_unlock:
6638	mutex_unlock(&event->mmap_mutex);
6639
6640	/*
6641	 * Since pinned accounting is per vm, we cannot allow fork() to copy our
6642	 * vma.
6643	 */
6644	vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
6645	vma->vm_ops = &perf_mmap_vmops;
6646
6647	if (event->pmu->event_mapped)
6648		event->pmu->event_mapped(event, vma->vm_mm);
6649
6650	return ret;
6651}
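
/*
 * A hedged userspace sketch of the mapping rules enforced above: the data
 * mapping is one metadata page plus a power-of-two number of data pages at
 * offset 0, and the optional AUX area is mapped at the offset/size that was
 * first published in the metadata page:
 *
 *	long psz = sysconf(_SC_PAGESIZE);
 *	size_t data_len = (1 + 64) * psz;
 *	struct perf_event_mmap_page *pc;
 *	void *aux;
 *
 *	pc = mmap(NULL, data_len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		  perf_fd, 0);
 *
 *	pc->aux_offset = data_len;
 *	pc->aux_size   = 16 * psz;
 *	aux = mmap(NULL, pc->aux_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		   perf_fd, pc->aux_offset);
 *
 * Both the 64 data pages and the 16 AUX pages are powers of two, as
 * required by the checks in perf_mmap().
 */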
6652
6653static int perf_fasync(int fd, struct file *filp, int on)
6654{
6655	struct inode *inode = file_inode(filp);
6656	struct perf_event *event = filp->private_data;
6657	int retval;
6658
6659	inode_lock(inode);
6660	retval = fasync_helper(fd, filp, on, &event->fasync);
6661	inode_unlock(inode);
6662
6663	if (retval < 0)
6664		return retval;
6665
6666	return 0;
6667}
6668
6669static const struct file_operations perf_fops = {
6670	.llseek			= no_llseek,
6671	.release		= perf_release,
6672	.read			= perf_read,
6673	.poll			= perf_poll,
6674	.unlocked_ioctl		= perf_ioctl,
6675	.compat_ioctl		= perf_compat_ioctl,
6676	.mmap			= perf_mmap,
6677	.fasync			= perf_fasync,
6678};
6679
6680/*
6681 * Perf event wakeup
6682 *
6683 * If there's data, ensure we set the poll() state and publish everything
6684 * to user-space before waking everybody up.
6685 */
6686
6687static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
6688{
6689	/* only the parent has fasync state */
6690	if (event->parent)
6691		event = event->parent;
6692	return &event->fasync;
6693}
6694
6695void perf_event_wakeup(struct perf_event *event)
6696{
6697	ring_buffer_wakeup(event);
6698
6699	if (event->pending_kill) {
6700		kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
6701		event->pending_kill = 0;
6702	}
6703}
6704
6705static void perf_sigtrap(struct perf_event *event)
6706{
6707	/*
6708	 * We'd expect this to only occur if the irq_work is delayed and either
6709	 * ctx->task or current has changed in the meantime. This can be the
6710	 * case on architectures that do not implement arch_irq_work_raise().
6711	 */
6712	if (WARN_ON_ONCE(event->ctx->task != current))
6713		return;
6714
6715	/*
6716	 * Both perf_pending_task() and perf_pending_irq() can race with the
6717	 * task exiting.
6718	 */
6719	if (current->flags & PF_EXITING)
6720		return;
6721
6722	send_sig_perf((void __user *)event->pending_addr,
6723		      event->orig_type, event->attr.sig_data);
6724}
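
/*
 * perf_sigtrap() backs the attr.sigtrap mechanism; a hedged sketch of the
 * userspace receiving side (handle_sample() is a hypothetical consumer):
 *
 *	static void on_sigtrap(int sig, siginfo_t *si, void *ucontext)
 *	{
 *		if (si->si_code == TRAP_PERF)
 *			handle_sample(si->si_perf_data, si->si_addr);
 *	}
 *
 * installed with sigaction(SA_SIGINFO) for an event opened with
 * attr.sigtrap = 1 and attr.sig_data set.
 */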
6725
6726/*
6727 * Deliver the pending work in the event's context, or follow the event to the CPU it is running on.
6728 */
6729static void __perf_pending_irq(struct perf_event *event)
6730{
6731	int cpu = READ_ONCE(event->oncpu);
6732
6733	/*
6734	 * If the event isn't running, we're done; event_sched_out() will have
6735	 * taken care of things.
6736	 */
6737	if (cpu < 0)
6738		return;
6739
6740	/*
6741	 * Yay, we hit home and are in the context of the event.
6742	 */
6743	if (cpu == smp_processor_id()) {
6744		if (event->pending_sigtrap) {
6745			event->pending_sigtrap = 0;
6746			perf_sigtrap(event);
6747			local_dec(&event->ctx->nr_pending);
6748		}
6749		if (event->pending_disable) {
6750			event->pending_disable = 0;
6751			perf_event_disable_local(event);
6752		}
6753		return;
6754	}
6755
6756	/*
6757	 *  CPU-A			CPU-B
6758	 *
6759	 *  perf_event_disable_inatomic()
6760	 *    @pending_disable = CPU-A;
6761	 *    irq_work_queue();
6762	 *
6763	 *  sched-out
6764	 *    @pending_disable = -1;
6765	 *
6766	 *				sched-in
6767	 *				perf_event_disable_inatomic()
6768	 *				  @pending_disable = CPU-B;
6769	 *				  irq_work_queue(); // FAILS
6770	 *
6771	 *  irq_work_run()
6772	 *    perf_pending_irq()
6773	 *
6774	 * But the event runs on CPU-B and wants disabling there.
6775	 */
6776	irq_work_queue_on(&event->pending_irq, cpu);
6777}
6778
6779static void perf_pending_irq(struct irq_work *entry)
6780{
6781	struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
6782	int rctx;
6783
6784	/*
6785	 * If we 'fail' here, that's OK, it means recursion is already disabled
6786	 * and we won't recurse 'further'.
6787	 */
6788	rctx = perf_swevent_get_recursion_context();
6789
6790	/*
6791	 * The wakeup isn't bound to the context of the event -- it can happen
6792	 * irrespective of where the event is.
6793	 */
6794	if (event->pending_wakeup) {
6795		event->pending_wakeup = 0;
6796		perf_event_wakeup(event);
6797	}
6798
6799	__perf_pending_irq(event);
6800
6801	if (rctx >= 0)
6802		perf_swevent_put_recursion_context(rctx);
6803}
6804
6805static void perf_pending_task(struct callback_head *head)
6806{
6807	struct perf_event *event = container_of(head, struct perf_event, pending_task);
6808	int rctx;
6809
6810	/*
6811	 * If we 'fail' here, that's OK, it means recursion is already disabled
6812	 * and we won't recurse 'further'.
6813	 */
6814	preempt_disable_notrace();
6815	rctx = perf_swevent_get_recursion_context();
6816
6817	if (event->pending_work) {
6818		event->pending_work = 0;
6819		perf_sigtrap(event);
6820		local_dec(&event->ctx->nr_pending);
6821	}
6822
6823	if (rctx >= 0)
6824		perf_swevent_put_recursion_context(rctx);
6825	preempt_enable_notrace();
6826
6827	put_event(event);
6828}
6829
6830#ifdef CONFIG_GUEST_PERF_EVENTS
6831struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
6832
6833DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
6834DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
6835DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);
6836
6837void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6838{
6839	if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
6840		return;
6841
6842	rcu_assign_pointer(perf_guest_cbs, cbs);
6843	static_call_update(__perf_guest_state, cbs->state);
6844	static_call_update(__perf_guest_get_ip, cbs->get_ip);
6845
6846	/* Implementing ->handle_intel_pt_intr is optional. */
6847	if (cbs->handle_intel_pt_intr)
6848		static_call_update(__perf_guest_handle_intel_pt_intr,
6849				   cbs->handle_intel_pt_intr);
6850}
6851EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
6852
6853void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6854{
6855	if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
6856		return;
6857
6858	rcu_assign_pointer(perf_guest_cbs, NULL);
6859	static_call_update(__perf_guest_state, (void *)&__static_call_return0);
6860	static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
6861	static_call_update(__perf_guest_handle_intel_pt_intr,
6862			   (void *)&__static_call_return0);
6863	synchronize_rcu();
6864}
6865EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
6866#endif
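
/*
 * A hedged sketch of how a hypervisor module is expected to use the
 * registration API above (the callback names are illustrative only):
 *
 *	static struct perf_guest_info_callbacks my_guest_cbs = {
 *		.state	= my_guest_state,
 *		.get_ip	= my_guest_get_ip,
 *	};
 *
 *	perf_register_guest_info_callbacks(&my_guest_cbs);
 *	...
 *	perf_unregister_guest_info_callbacks(&my_guest_cbs);
 *
 * ->handle_intel_pt_intr may be left NULL, as noted above.
 */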
6867
6868static void
6869perf_output_sample_regs(struct perf_output_handle *handle,
6870			struct pt_regs *regs, u64 mask)
6871{
6872	int bit;
6873	DECLARE_BITMAP(_mask, 64);
6874
6875	bitmap_from_u64(_mask, mask);
6876	for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
6877		u64 val;
6878
6879		val = perf_reg_value(regs, bit);
6880		perf_output_put(handle, val);
6881	}
6882}
6883
6884static void perf_sample_regs_user(struct perf_regs *regs_user,
6885				  struct pt_regs *regs)
6886{
6887	if (user_mode(regs)) {
6888		regs_user->abi = perf_reg_abi(current);
6889		regs_user->regs = regs;
6890	} else if (!(current->flags & PF_KTHREAD)) {
6891		perf_get_regs_user(regs_user, regs);
6892	} else {
6893		regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
6894		regs_user->regs = NULL;
6895	}
6896}
6897
6898static void perf_sample_regs_intr(struct perf_regs *regs_intr,
6899				  struct pt_regs *regs)
6900{
6901	regs_intr->regs = regs;
6902	regs_intr->abi  = perf_reg_abi(current);
6903}
6904
6906/*
6907 * Get remaining task size from user stack pointer.
6908 *
6909 * It'd be better to take the stack vma map and limit this more
6910 * precisely, but there's no way to get it safely under interrupt,
6911 * so we use TASK_SIZE as the limit.
6912 */
6913static u64 perf_ustack_task_size(struct pt_regs *regs)
6914{
6915	unsigned long addr = perf_user_stack_pointer(regs);
6916
6917	if (!addr || addr >= TASK_SIZE)
6918		return 0;
6919
6920	return TASK_SIZE - addr;
6921}
6922
6923static u16
6924perf_sample_ustack_size(u16 stack_size, u16 header_size,
6925			struct pt_regs *regs)
6926{
6927	u64 task_size;
6928
6929	/* No regs, no stack pointer, no dump. */
6930	if (!regs)
6931		return 0;
6932
6933	/*
6934	 * Check whether the requested stack size fits within:
6935	 * - TASK_SIZE
6936	 *   If it doesn't, we limit the size to TASK_SIZE.
6937	 *
6938	 * - the remaining sample size
6939	 *   If it doesn't, we shrink the stack size to
6940	 *   fit into the remaining sample size.
6941	 */
6942
6943	task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
6944	stack_size = min(stack_size, (u16) task_size);
6945
6946	/* Current header size plus static size and dynamic size. */
6947	header_size += 2 * sizeof(u64);
6948
6949	/* Does the current stack dump size still fit (no u16 overflow)? */
6950	if ((u16) (header_size + stack_size) < header_size) {
6951		/*
6952		 * If we overflow the maximum size for the sample,
6953		 * we customize the stack dump size to fit in.
6954		 */
6955		stack_size = USHRT_MAX - header_size - sizeof(u64);
6956		stack_size = round_up(stack_size, sizeof(u64));
6957	}
6958
6959	return stack_size;
6960}
6961
6962static void
6963perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
6964			  struct pt_regs *regs)
6965{
6966	/* Case of a kernel thread, nothing to dump */
6967	if (!regs) {
6968		u64 size = 0;
6969		perf_output_put(handle, size);
6970	} else {
6971		unsigned long sp;
6972		unsigned int rem;
6973		u64 dyn_size;
6974
6975		/*
6976		 * We dump:
6977		 * static size
6978		 *   - the size requested by the user or the best one we can
6979		 *     fit into the sample max size
6980		 * data
6981		 *   - user stack dump data
6982		 * dynamic size
6983		 *   - the actual dumped size
6984		 */
6985
6986		/* Static size. */
6987		perf_output_put(handle, dump_size);
6988
6989		/* Data. */
6990		sp = perf_user_stack_pointer(regs);
6991		rem = __output_copy_user(handle, (void *) sp, dump_size);
6992		dyn_size = dump_size - rem;
6993
6994		perf_output_skip(handle, rem);
6995
6996		/* Dynamic size. */
6997		perf_output_put(handle, dyn_size);
6998	}
6999}
7000
7001static unsigned long perf_prepare_sample_aux(struct perf_event *event,
7002					  struct perf_sample_data *data,
7003					  size_t size)
7004{
7005	struct perf_event *sampler = event->aux_event;
7006	struct perf_buffer *rb;
7007
7008	data->aux_size = 0;
7009
7010	if (!sampler)
7011		goto out;
7012
7013	if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
7014		goto out;
7015
7016	if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
7017		goto out;
7018
7019	rb = ring_buffer_get(sampler);
7020	if (!rb)
7021		goto out;
7022
7023	/*
7024	 * If this is an NMI hit inside sampling code, don't take
7025	 * the sample. See also perf_aux_sample_output().
7026	 */
7027	if (READ_ONCE(rb->aux_in_sampling)) {
7028		data->aux_size = 0;
7029	} else {
7030		size = min_t(size_t, size, perf_aux_size(rb));
7031		data->aux_size = ALIGN(size, sizeof(u64));
7032	}
7033	ring_buffer_put(rb);
7034
7035out:
7036	return data->aux_size;
7037}
7038
7039static long perf_pmu_snapshot_aux(struct perf_buffer *rb,
7040                                 struct perf_event *event,
7041                                 struct perf_output_handle *handle,
7042                                 unsigned long size)
7043{
7044	unsigned long flags;
7045	long ret;
7046
7047	/*
7048	 * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
7049	 * paths. If we start calling them in NMI context, they may race with
7050	 * the IRQ ones, that is, for example, re-starting an event that's just
7051	 * been stopped, which is why we're using a separate callback that
7052	 * doesn't change the event state.
7053	 *
7054	 * IRQs need to be disabled to prevent IPIs from racing with us.
7055	 */
7056	local_irq_save(flags);
7057	/*
7058	 * Guard against NMI hits inside the critical section;
7059	 * see also perf_prepare_sample_aux().
7060	 */
7061	WRITE_ONCE(rb->aux_in_sampling, 1);
7062	barrier();
7063
7064	ret = event->pmu->snapshot_aux(event, handle, size);
7065
7066	barrier();
7067	WRITE_ONCE(rb->aux_in_sampling, 0);
7068	local_irq_restore(flags);
7069
7070	return ret;
7071}
7072
7073static void perf_aux_sample_output(struct perf_event *event,
7074				   struct perf_output_handle *handle,
7075				   struct perf_sample_data *data)
7076{
7077	struct perf_event *sampler = event->aux_event;
7078	struct perf_buffer *rb;
7079	unsigned long pad;
7080	long size;
7081
7082	if (WARN_ON_ONCE(!sampler || !data->aux_size))
7083		return;
7084
7085	rb = ring_buffer_get(sampler);
7086	if (!rb)
7087		return;
7088
7089	size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
7090
7091	/*
7092	 * An error here means that perf_output_copy() failed (returned a
7093	 * non-zero surplus that it didn't copy), which in its current
7094	 * enlightened implementation is not possible. If that changes, we'd
7095	 * like to know.
7096	 */
7097	if (WARN_ON_ONCE(size < 0))
7098		goto out_put;
7099
7100	/*
7101	 * The pad comes from ALIGN()ing data->aux_size up to u64 in
7102	 * perf_prepare_sample_aux(), so should not be more than that.
7103	 */
7104	pad = data->aux_size - size;
7105	if (WARN_ON_ONCE(pad >= sizeof(u64)))
7106		pad = 8;
7107
7108	if (pad) {
7109		u64 zero = 0;
7110		perf_output_copy(handle, &zero, pad);
7111	}
7112
7113out_put:
7114	ring_buffer_put(rb);
7115}
7116
7117/*
7118 * A set of common sample data types saved even for non-sample records
7119 * when event->attr.sample_id_all is set.
7120 */
7121#define PERF_SAMPLE_ID_ALL  (PERF_SAMPLE_TID | PERF_SAMPLE_TIME |	\
7122			     PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID |	\
7123			     PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER)
7124
7125static void __perf_event_header__init_id(struct perf_sample_data *data,
7126					 struct perf_event *event,
7127					 u64 sample_type)
7128{
7129	data->type = event->attr.sample_type;
7130	data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL;
7131
7132	if (sample_type & PERF_SAMPLE_TID) {
7133		/* namespace issues */
7134		data->tid_entry.pid = perf_event_pid(event, current);
7135		data->tid_entry.tid = perf_event_tid(event, current);
7136	}
7137
7138	if (sample_type & PERF_SAMPLE_TIME)
7139		data->time = perf_event_clock(event);
7140
7141	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
7142		data->id = primary_event_id(event);
7143
7144	if (sample_type & PERF_SAMPLE_STREAM_ID)
7145		data->stream_id = event->id;
7146
7147	if (sample_type & PERF_SAMPLE_CPU) {
7148		data->cpu_entry.cpu	 = raw_smp_processor_id();
7149		data->cpu_entry.reserved = 0;
7150	}
7151}
7152
7153void perf_event_header__init_id(struct perf_event_header *header,
7154				struct perf_sample_data *data,
7155				struct perf_event *event)
7156{
7157	if (event->attr.sample_id_all) {
7158		header->size += event->id_header_size;
7159		__perf_event_header__init_id(data, event, event->attr.sample_type);
7160	}
7161}
7162
7163static void __perf_event__output_id_sample(struct perf_output_handle *handle,
7164					   struct perf_sample_data *data)
7165{
7166	u64 sample_type = data->type;
7167
7168	if (sample_type & PERF_SAMPLE_TID)
7169		perf_output_put(handle, data->tid_entry);
7170
7171	if (sample_type & PERF_SAMPLE_TIME)
7172		perf_output_put(handle, data->time);
7173
7174	if (sample_type & PERF_SAMPLE_ID)
7175		perf_output_put(handle, data->id);
7176
7177	if (sample_type & PERF_SAMPLE_STREAM_ID)
7178		perf_output_put(handle, data->stream_id);
7179
7180	if (sample_type & PERF_SAMPLE_CPU)
7181		perf_output_put(handle, data->cpu_entry);
7182
7183	if (sample_type & PERF_SAMPLE_IDENTIFIER)
7184		perf_output_put(handle, data->id);
7185}
7186
7187void perf_event__output_id_sample(struct perf_event *event,
7188				  struct perf_output_handle *handle,
7189				  struct perf_sample_data *sample)
7190{
7191	if (event->attr.sample_id_all)
7192		__perf_event__output_id_sample(handle, sample);
7193}
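
/*
 * The helpers below emit the read_format block of a sample. For orientation,
 * the layout (as documented in the uapi header) is roughly:
 *
 *	{ u64 value;
 *	  { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
 *	  { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
 *	  { u64 id;           } && PERF_FORMAT_ID
 *	  { u64 lost;         } && PERF_FORMAT_LOST
 *	} && !PERF_FORMAT_GROUP
 *
 *	{ u64 nr;
 *	  { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
 *	  { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
 *	  { u64 value;
 *	    { u64 id;         } && PERF_FORMAT_ID
 *	    { u64 lost;       } && PERF_FORMAT_LOST
 *	  } cntr[nr];
 *	} && PERF_FORMAT_GROUP
 */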
7194
7195static void perf_output_read_one(struct perf_output_handle *handle,
7196				 struct perf_event *event,
7197				 u64 enabled, u64 running)
7198{
7199	u64 read_format = event->attr.read_format;
7200	u64 values[5];
7201	int n = 0;
7202
7203	values[n++] = perf_event_count(event);
7204	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
7205		values[n++] = enabled +
7206			atomic64_read(&event->child_total_time_enabled);
7207	}
7208	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
7209		values[n++] = running +
7210			atomic64_read(&event->child_total_time_running);
7211	}
7212	if (read_format & PERF_FORMAT_ID)
7213		values[n++] = primary_event_id(event);
7214	if (read_format & PERF_FORMAT_LOST)
7215		values[n++] = atomic64_read(&event->lost_samples);
7216
7217	__output_copy(handle, values, n * sizeof(u64));
7218}
7219
7220static void perf_output_read_group(struct perf_output_handle *handle,
7221			    struct perf_event *event,
7222			    u64 enabled, u64 running)
7223{
7224	struct perf_event *leader = event->group_leader, *sub;
7225	u64 read_format = event->attr.read_format;
7226	unsigned long flags;
7227	u64 values[6];
7228	int n = 0;
7229
7230	/*
7231	 * Disabling interrupts avoids all counter scheduling
7232	 * (context switches, timer based rotation and IPIs).
7233	 */
7234	local_irq_save(flags);
7235
7236	values[n++] = 1 + leader->nr_siblings;
7237
7238	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
7239		values[n++] = enabled;
7240
7241	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
7242		values[n++] = running;
7243
7244	if ((leader != event) &&
7245	    (leader->state == PERF_EVENT_STATE_ACTIVE))
7246		leader->pmu->read(leader);
7247
7248	values[n++] = perf_event_count(leader);
7249	if (read_format & PERF_FORMAT_ID)
7250		values[n++] = primary_event_id(leader);
7251	if (read_format & PERF_FORMAT_LOST)
7252		values[n++] = atomic64_read(&leader->lost_samples);
7253
7254	__output_copy(handle, values, n * sizeof(u64));
7255
7256	for_each_sibling_event(sub, leader) {
7257		n = 0;
7258
7259		if ((sub != event) &&
7260		    (sub->state == PERF_EVENT_STATE_ACTIVE))
7261			sub->pmu->read(sub);
7262
7263		values[n++] = perf_event_count(sub);
7264		if (read_format & PERF_FORMAT_ID)
7265			values[n++] = primary_event_id(sub);
7266		if (read_format & PERF_FORMAT_LOST)
7267			values[n++] = atomic64_read(&sub->lost_samples);
7268
7269		__output_copy(handle, values, n * sizeof(u64));
7270	}
7271
7272	local_irq_restore(flags);
7273}
7274
7275#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
7276				 PERF_FORMAT_TOTAL_TIME_RUNNING)
7277
7278/*
7279 * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
7280 *
7281 * The problem is that it's both hard and excessively expensive to iterate the
7282 * child list, not to mention that it's impossible to IPI the children running
7283 * on another CPU, from interrupt/NMI context.
7284 */
7285static void perf_output_read(struct perf_output_handle *handle,
7286			     struct perf_event *event)
7287{
7288	u64 enabled = 0, running = 0, now;
7289	u64 read_format = event->attr.read_format;
7290
7291	/*
7292	 * compute total_time_enabled, total_time_running
7293	 * based on snapshot values taken when the event
7294	 * was last scheduled in.
7295	 *
7296	 * we cannot simply call update_context_time()
7297	 * because of locking issues, as we are called in
7298	 * NMI context
7299	 */
7300	if (read_format & PERF_FORMAT_TOTAL_TIMES)
7301		calc_timer_values(event, &now, &enabled, &running);
7302
7303	if (event->attr.read_format & PERF_FORMAT_GROUP)
7304		perf_output_read_group(handle, event, enabled, running);
7305	else
7306		perf_output_read_one(handle, event, enabled, running);
7307}
7308
7309void perf_output_sample(struct perf_output_handle *handle,
7310			struct perf_event_header *header,
7311			struct perf_sample_data *data,
7312			struct perf_event *event)
7313{
7314	u64 sample_type = data->type;
7315
7316	perf_output_put(handle, *header);
7317
7318	if (sample_type & PERF_SAMPLE_IDENTIFIER)
7319		perf_output_put(handle, data->id);
7320
7321	if (sample_type & PERF_SAMPLE_IP)
7322		perf_output_put(handle, data->ip);
7323
7324	if (sample_type & PERF_SAMPLE_TID)
7325		perf_output_put(handle, data->tid_entry);
7326
7327	if (sample_type & PERF_SAMPLE_TIME)
7328		perf_output_put(handle, data->time);
7329
7330	if (sample_type & PERF_SAMPLE_ADDR)
7331		perf_output_put(handle, data->addr);
7332
7333	if (sample_type & PERF_SAMPLE_ID)
7334		perf_output_put(handle, data->id);
7335
7336	if (sample_type & PERF_SAMPLE_STREAM_ID)
7337		perf_output_put(handle, data->stream_id);
7338
7339	if (sample_type & PERF_SAMPLE_CPU)
7340		perf_output_put(handle, data->cpu_entry);
7341
7342	if (sample_type & PERF_SAMPLE_PERIOD)
7343		perf_output_put(handle, data->period);
7344
7345	if (sample_type & PERF_SAMPLE_READ)
7346		perf_output_read(handle, event);
7347
7348	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
7349		int size = 1;
7350
7351		size += data->callchain->nr;
7352		size *= sizeof(u64);
7353		__output_copy(handle, data->callchain, size);
7354	}
7355
7356	if (sample_type & PERF_SAMPLE_RAW) {
7357		struct perf_raw_record *raw = data->raw;
7358
7359		if (raw) {
7360			struct perf_raw_frag *frag = &raw->frag;
7361
7362			perf_output_put(handle, raw->size);
7363			do {
7364				if (frag->copy) {
7365					__output_custom(handle, frag->copy,
7366							frag->data, frag->size);
7367				} else {
7368					__output_copy(handle, frag->data,
7369						      frag->size);
7370				}
7371				if (perf_raw_frag_last(frag))
7372					break;
7373				frag = frag->next;
7374			} while (1);
7375			if (frag->pad)
7376				__output_skip(handle, NULL, frag->pad);
7377		} else {
7378			struct {
7379				u32	size;
7380				u32	data;
7381			} raw = {
7382				.size = sizeof(u32),
7383				.data = 0,
7384			};
7385			perf_output_put(handle, raw);
7386		}
7387	}
7388
7389	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
7390		if (data->br_stack) {
7391			size_t size;
7392
7393			size = data->br_stack->nr
7394			     * sizeof(struct perf_branch_entry);
7395
7396			perf_output_put(handle, data->br_stack->nr);
7397			if (branch_sample_hw_index(event))
7398				perf_output_put(handle, data->br_stack->hw_idx);
7399			perf_output_copy(handle, data->br_stack->entries, size);
7400			/*
7401			 * Add the extension space which is appended
7402			 * right after the struct perf_branch_stack.
7403			 */
7404			if (data->br_stack_cntr) {
7405				size = data->br_stack->nr * sizeof(u64);
7406				perf_output_copy(handle, data->br_stack_cntr, size);
7407			}
7408		} else {
7409			/*
7410			 * we always store at least the value of nr
7411			 */
7412			u64 nr = 0;
7413			perf_output_put(handle, nr);
7414		}
7415	}
7416
7417	if (sample_type & PERF_SAMPLE_REGS_USER) {
7418		u64 abi = data->regs_user.abi;
7419
7420		/*
7421		 * If there are no regs to dump, notice it through
7422		 * the first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
7423		 */
7424		perf_output_put(handle, abi);
7425
7426		if (abi) {
7427			u64 mask = event->attr.sample_regs_user;
7428			perf_output_sample_regs(handle,
7429						data->regs_user.regs,
7430						mask);
7431		}
7432	}
7433
7434	if (sample_type & PERF_SAMPLE_STACK_USER) {
7435		perf_output_sample_ustack(handle,
7436					  data->stack_user_size,
7437					  data->regs_user.regs);
7438	}
7439
7440	if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
7441		perf_output_put(handle, data->weight.full);
7442
7443	if (sample_type & PERF_SAMPLE_DATA_SRC)
7444		perf_output_put(handle, data->data_src.val);
7445
7446	if (sample_type & PERF_SAMPLE_TRANSACTION)
7447		perf_output_put(handle, data->txn);
7448
7449	if (sample_type & PERF_SAMPLE_REGS_INTR) {
7450		u64 abi = data->regs_intr.abi;
7451		/*
7452		 * If there are no regs to dump, notice it through
7453		 * the first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
7454		 */
7455		perf_output_put(handle, abi);
7456
7457		if (abi) {
7458			u64 mask = event->attr.sample_regs_intr;
7459
7460			perf_output_sample_regs(handle,
7461						data->regs_intr.regs,
7462						mask);
7463		}
7464	}
7465
7466	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
7467		perf_output_put(handle, data->phys_addr);
7468
7469	if (sample_type & PERF_SAMPLE_CGROUP)
7470		perf_output_put(handle, data->cgroup);
7471
7472	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
7473		perf_output_put(handle, data->data_page_size);
7474
7475	if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
7476		perf_output_put(handle, data->code_page_size);
7477
7478	if (sample_type & PERF_SAMPLE_AUX) {
7479		perf_output_put(handle, data->aux_size);
7480
7481		if (data->aux_size)
7482			perf_aux_sample_output(event, handle, data);
7483	}
7484
7485	if (!event->attr.watermark) {
7486		int wakeup_events = event->attr.wakeup_events;
7487
7488		if (wakeup_events) {
7489			struct perf_buffer *rb = handle->rb;
7490			int events = local_inc_return(&rb->events);
7491
7492			if (events >= wakeup_events) {
7493				local_sub(wakeup_events, &rb->events);
7494				local_inc(&rb->wakeup);
7495			}
7496		}
7497	}
7498}
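
/*
 * The wakeup_events accounting at the end of perf_output_sample() is what
 * lets a plain poll() loop on the event fd make progress; a hedged userspace
 * sketch (drain_ring_buffer() is a hypothetical consumer):
 *
 *	attr.sample_period = 10000;
 *	attr.wakeup_events = 32;
 *
 *	struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };
 *
 *	while (poll(&pfd, 1, -1) > 0)
 *		drain_ring_buffer();
 */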
7499
7500static u64 perf_virt_to_phys(u64 virt)
7501{
7502	u64 phys_addr = 0;
7503
7504	if (!virt)
7505		return 0;
7506
7507	if (virt >= TASK_SIZE) {
7508		/* If it's vmalloc()d memory, leave phys_addr as 0 */
7509		if (virt_addr_valid((void *)(uintptr_t)virt) &&
7510		    !(virt >= VMALLOC_START && virt < VMALLOC_END))
7511			phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
7512	} else {
7513		/*
7514		 * Walking the page tables for a user address.
7515		 * Interrupts are disabled, which prevents any teardown
7516		 * of the page tables.
7517		 * Try the IRQ-safe get_user_page_fast_only() first.
7518		 * If that fails, leave phys_addr as 0.
7519		 */
7520		if (current->mm != NULL) {
7521			struct page *p;
7522
7523			pagefault_disable();
7524			if (get_user_page_fast_only(virt, 0, &p)) {
7525				phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
7526				put_page(p);
7527			}
7528			pagefault_enable();
7529		}
7530	}
7531
7532	return phys_addr;
7533}
7534
7535/*
7536 * Return the pagetable size of a given virtual address.
7537 */
7538static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
7539{
7540	u64 size = 0;
7541
7542#ifdef CONFIG_HAVE_FAST_GUP
7543	pgd_t *pgdp, pgd;
7544	p4d_t *p4dp, p4d;
7545	pud_t *pudp, pud;
7546	pmd_t *pmdp, pmd;
7547	pte_t *ptep, pte;
7548
7549	pgdp = pgd_offset(mm, addr);
7550	pgd = READ_ONCE(*pgdp);
7551	if (pgd_none(pgd))
7552		return 0;
7553
7554	if (pgd_leaf(pgd))
7555		return pgd_leaf_size(pgd);
7556
7557	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
7558	p4d = READ_ONCE(*p4dp);
7559	if (!p4d_present(p4d))
7560		return 0;
7561
7562	if (p4d_leaf(p4d))
7563		return p4d_leaf_size(p4d);
7564
7565	pudp = pud_offset_lockless(p4dp, p4d, addr);
7566	pud = READ_ONCE(*pudp);
7567	if (!pud_present(pud))
7568		return 0;
7569
7570	if (pud_leaf(pud))
7571		return pud_leaf_size(pud);
7572
7573	pmdp = pmd_offset_lockless(pudp, pud, addr);
7574again:
7575	pmd = pmdp_get_lockless(pmdp);
7576	if (!pmd_present(pmd))
7577		return 0;
7578
7579	if (pmd_leaf(pmd))
7580		return pmd_leaf_size(pmd);
7581
7582	ptep = pte_offset_map(&pmd, addr);
7583	if (!ptep)
7584		goto again;
7585
7586	pte = ptep_get_lockless(ptep);
7587	if (pte_present(pte))
7588		size = pte_leaf_size(pte);
7589	pte_unmap(ptep);
7590#endif /* CONFIG_HAVE_FAST_GUP */
7591
7592	return size;
7593}
7594
7595static u64 perf_get_page_size(unsigned long addr)
7596{
7597	struct mm_struct *mm;
7598	unsigned long flags;
7599	u64 size;
7600
7601	if (!addr)
7602		return 0;
7603
7604	/*
7605	 * Software page-table walkers must disable IRQs,
7606	 * which prevents any tear down of the page tables.
7607	 */
7608	local_irq_save(flags);
7609
7610	mm = current->mm;
7611	if (!mm) {
7612		/*
7613		 * For kernel threads and the like, use init_mm so that
7614		 * we can find kernel memory.
7615		 */
7616		mm = &init_mm;
7617	}
7618
7619	size = perf_get_pgtable_size(mm, addr);
7620
7621	local_irq_restore(flags);
7622
7623	return size;
7624}
7625
7626static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
7627
7628struct perf_callchain_entry *
7629perf_callchain(struct perf_event *event, struct pt_regs *regs)
7630{
7631	bool kernel = !event->attr.exclude_callchain_kernel;
7632	bool user   = !event->attr.exclude_callchain_user;
7633	/* Disallow cross-task user callchains. */
7634	bool crosstask = event->ctx->task && event->ctx->task != current;
7635	const u32 max_stack = event->attr.sample_max_stack;
7636	struct perf_callchain_entry *callchain;
7637
7638	if (!kernel && !user)
7639		return &__empty_callchain;
7640
7641	callchain = get_perf_callchain(regs, 0, kernel, user,
7642				       max_stack, crosstask, true);
7643	return callchain ?: &__empty_callchain;
7644}
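
/*
 * Helper for perf_prepare_sample(): returns @d when any of the bits in @s
 * are set in @flags, and 0 otherwise. For example,
 *
 *	__cond_set(sample_type, PERF_SAMPLE_STACK_USER, PERF_SAMPLE_REGS_USER)
 *
 * yields PERF_SAMPLE_REGS_USER iff a user stack dump was requested, which is
 * how the implicit dependencies between sample types are expressed.
 */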
7645
7646static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d)
7647{
7648	return d * !!(flags & s);
7649}
7650
7651void perf_prepare_sample(struct perf_sample_data *data,
7652			 struct perf_event *event,
7653			 struct pt_regs *regs)
7654{
7655	u64 sample_type = event->attr.sample_type;
7656	u64 filtered_sample_type;
7657
7658	/*
7659	 * Add the sample flags that depend on others, and clear the
7660	 * sample flags that have already been handled by the PMU driver.
7661	 */
7662	filtered_sample_type = sample_type;
7663	filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE,
7664					   PERF_SAMPLE_IP);
7665	filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE |
7666					   PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
7667	filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER,
7668					   PERF_SAMPLE_REGS_USER);
7669	filtered_sample_type &= ~data->sample_flags;
7670
7671	if (filtered_sample_type == 0) {
7672		/* Make sure it has the correct data->type for output */
7673		data->type = event->attr.sample_type;
7674		return;
7675	}
7676
7677	__perf_event_header__init_id(data, event, filtered_sample_type);
7678
7679	if (filtered_sample_type & PERF_SAMPLE_IP) {
7680		data->ip = perf_instruction_pointer(regs);
7681		data->sample_flags |= PERF_SAMPLE_IP;
7682	}
7683
7684	if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
7685		perf_sample_save_callchain(data, event, regs);
7686
7687	if (filtered_sample_type & PERF_SAMPLE_RAW) {
7688		data->raw = NULL;
7689		data->dyn_size += sizeof(u64);
7690		data->sample_flags |= PERF_SAMPLE_RAW;
7691	}
7692
7693	if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) {
7694		data->br_stack = NULL;
7695		data->dyn_size += sizeof(u64);
7696		data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
7697	}
7698
7699	if (filtered_sample_type & PERF_SAMPLE_REGS_USER)
7700		perf_sample_regs_user(&data->regs_user, regs);
7701
7702	/*
7703	 * We cannot use filtered_sample_type here as REGS_USER can be set
7704	 * by STACK_USER (via __cond_set() above) and we don't want to update
7705	 * the dyn_size if it wasn't requested by the user.
7706	 */
7707	if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) {
7708		/* regs dump ABI info */
7709		int size = sizeof(u64);
7710
7711		if (data->regs_user.regs) {
7712			u64 mask = event->attr.sample_regs_user;
7713			size += hweight64(mask) * sizeof(u64);
7714		}
7715
7716		data->dyn_size += size;
7717		data->sample_flags |= PERF_SAMPLE_REGS_USER;
7718	}
7719
7720	if (filtered_sample_type & PERF_SAMPLE_STACK_USER) {
7721		/*
7722		 * The PERF_SAMPLE_STACK_USER bit either needs to always be
7723		 * processed last, or an additional check must be added when a
7724		 * new sample type is introduced, because we could eat
7725		 * up the rest of the sample size.
7726		 */
7727		u16 stack_size = event->attr.sample_stack_user;
7728		u16 header_size = perf_sample_data_size(data, event);
7729		u16 size = sizeof(u64);
7730
7731		stack_size = perf_sample_ustack_size(stack_size, header_size,
7732						     data->regs_user.regs);
7733
7734		/*
7735		 * If there is something to dump, add space for the dump
7736		 * itself and for the field that tells the dynamic size,
7737		 * which is how many bytes were actually dumped.
7738		 */
7739		if (stack_size)
7740			size += sizeof(u64) + stack_size;
7741
7742		data->stack_user_size = stack_size;
7743		data->dyn_size += size;
7744		data->sample_flags |= PERF_SAMPLE_STACK_USER;
7745	}
7746
7747	if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
7748		data->weight.full = 0;
7749		data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
7750	}
7751
7752	if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) {
7753		data->data_src.val = PERF_MEM_NA;
7754		data->sample_flags |= PERF_SAMPLE_DATA_SRC;
7755	}
7756
7757	if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) {
7758		data->txn = 0;
7759		data->sample_flags |= PERF_SAMPLE_TRANSACTION;
7760	}
7761
7762	if (filtered_sample_type & PERF_SAMPLE_ADDR) {
7763		data->addr = 0;
7764		data->sample_flags |= PERF_SAMPLE_ADDR;
7765	}
7766
7767	if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) {
7768		/* regs dump ABI info */
7769		int size = sizeof(u64);
7770
7771		perf_sample_regs_intr(&data->regs_intr, regs);
7772
7773		if (data->regs_intr.regs) {
7774			u64 mask = event->attr.sample_regs_intr;
7775
7776			size += hweight64(mask) * sizeof(u64);
7777		}
7778
7779		data->dyn_size += size;
7780		data->sample_flags |= PERF_SAMPLE_REGS_INTR;
7781	}
7782
7783	if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) {
7784		data->phys_addr = perf_virt_to_phys(data->addr);
7785		data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
7786	}
7787
7788#ifdef CONFIG_CGROUP_PERF
7789	if (filtered_sample_type & PERF_SAMPLE_CGROUP) {
7790		struct cgroup *cgrp;
7791
7792		/* protected by RCU */
7793		cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
7794		data->cgroup = cgroup_id(cgrp);
7795		data->sample_flags |= PERF_SAMPLE_CGROUP;
7796	}
7797#endif
7798
7799	/*
7800	 * PERF_SAMPLE_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user
7801	 * doesn't request PERF_SAMPLE_ADDR, the kernel implicitly retrieves
7802	 * data->addr, but the value will not be dumped to userspace.
7803	 */
7804	if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) {
7805		data->data_page_size = perf_get_page_size(data->addr);
7806		data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE;
7807	}
7808
7809	if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
7810		data->code_page_size = perf_get_page_size(data->ip);
7811		data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
7812	}
7813
7814	if (filtered_sample_type & PERF_SAMPLE_AUX) {
7815		u64 size;
7816		u16 header_size = perf_sample_data_size(data, event);
7817
7818		header_size += sizeof(u64); /* size */
7819
7820		/*
7821		 * Given the 16-bit nature of header::size, an AUX sample can
7822		 * easily overflow it, what with all the preceding sample bits.
7823		 * Make sure this doesn't happen by using up to U16_MAX bytes
7824		 * per sample in total (rounded down to an 8-byte boundary).
7825		 */
7826		size = min_t(size_t, U16_MAX - header_size,
7827			     event->attr.aux_sample_size);
7828		size = rounddown(size, 8);
7829		size = perf_prepare_sample_aux(event, data, size);
7830
7831		WARN_ON_ONCE(size + header_size > U16_MAX);
7832		data->dyn_size += size + sizeof(u64); /* size above */
7833		data->sample_flags |= PERF_SAMPLE_AUX;
7834	}
7835}
7836
7837void perf_prepare_header(struct perf_event_header *header,
7838			 struct perf_sample_data *data,
7839			 struct perf_event *event,
7840			 struct pt_regs *regs)
7841{
7842	header->type = PERF_RECORD_SAMPLE;
7843	header->size = perf_sample_data_size(data, event);
7844	header->misc = perf_misc_flags(regs);
7845
7846	/*
7847	 * If you're adding more sample types here, you likely need to do
7848	 * something about the overflowing header::size, like repurpose the
7849	 * lowest 3 bits of size, which should always be zero at the moment.
7850	 * This raises a more important question: do we really need 512k-sized
7851	 * samples, and why? So good argumentation is in order for whatever you
7852	 * do here next.
7853	 */
7854	WARN_ON_ONCE(header->size & 7);
7855}
7856
7857static __always_inline int
7858__perf_event_output(struct perf_event *event,
7859		    struct perf_sample_data *data,
7860		    struct pt_regs *regs,
7861		    int (*output_begin)(struct perf_output_handle *,
7862					struct perf_sample_data *,
7863					struct perf_event *,
7864					unsigned int))
7865{
7866	struct perf_output_handle handle;
7867	struct perf_event_header header;
7868	int err;
7869
7870	/* protect the callchain buffers */
7871	rcu_read_lock();
7872
7873	perf_prepare_sample(data, event, regs);
7874	perf_prepare_header(&header, data, event, regs);
7875
7876	err = output_begin(&handle, data, event, header.size);
7877	if (err)
7878		goto exit;
7879
7880	perf_output_sample(&handle, &header, data, event);
7881
7882	perf_output_end(&handle);
7883
7884exit:
7885	rcu_read_unlock();
7886	return err;
7887}
7888
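/*
 * Output wrappers that differ only in the ring-buffer write direction;
 * events created with attr.write_backward set use the backward variant.
 */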
7889void
7890perf_event_output_forward(struct perf_event *event,
7891			 struct perf_sample_data *data,
7892			 struct pt_regs *regs)
7893{
7894	__perf_event_output(event, data, regs, perf_output_begin_forward);
7895}
7896
7897void
7898perf_event_output_backward(struct perf_event *event,
7899			   struct perf_sample_data *data,
7900			   struct pt_regs *regs)
7901{
7902	__perf_event_output(event, data, regs, perf_output_begin_backward);
7903}
7904
7905int
7906perf_event_output(struct perf_event *event,
7907		  struct perf_sample_data *data,
7908		  struct pt_regs *regs)
7909{
7910	return __perf_event_output(event, data, regs, perf_output_begin);
7911}
7912
7913/*
7914 * read event_id
7915 */
7916
7917struct perf_read_event {
7918	struct perf_event_header	header;
7919
7920	u32				pid;
7921	u32				tid;
7922};
7923
7924static void
7925perf_event_read_event(struct perf_event *event,
7926			struct task_struct *task)
7927{
7928	struct perf_output_handle handle;
7929	struct perf_sample_data sample;
7930	struct perf_read_event read_event = {
7931		.header = {
7932			.type = PERF_RECORD_READ,
7933			.misc = 0,
7934			.size = sizeof(read_event) + event->read_size,
7935		},
7936		.pid = perf_event_pid(event, task),
7937		.tid = perf_event_tid(event, task),
7938	};
7939	int ret;
7940
7941	perf_event_header__init_id(&read_event.header, &sample, event);
7942	ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
7943	if (ret)
7944		return;
7945
7946	perf_output_put(&handle, read_event);
7947	perf_output_read(&handle, event);
7948	perf_event__output_id_sample(event, &handle, &sample);
7949
7950	perf_output_end(&handle);
7951}
7952
7953typedef void (perf_iterate_f)(struct perf_event *event, void *data);
7954
7955static void
7956perf_iterate_ctx(struct perf_event_context *ctx,
7957		   perf_iterate_f output,
7958		   void *data, bool all)
7959{
7960	struct perf_event *event;
7961
7962	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
7963		if (!all) {
7964			if (event->state < PERF_EVENT_STATE_INACTIVE)
7965				continue;
7966			if (!event_filter_match(event))
7967				continue;
7968		}
7969
7970		output(event, data);
7971	}
7972}
7973
7974static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
7975{
7976	struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
7977	struct perf_event *event;
7978
7979	list_for_each_entry_rcu(event, &pel->list, sb_list) {
7980		/*
7981		 * Skip events that are not fully formed yet; ensure that
7982		 * if we observe event->ctx, both event and ctx will be
7983		 * complete enough. See perf_install_in_context().
7984		 */
7985		if (!smp_load_acquire(&event->ctx))
7986			continue;
7987
7988		if (event->state < PERF_EVENT_STATE_INACTIVE)
7989			continue;
7990		if (!event_filter_match(event))
7991			continue;
7992		output(event, data);
7993	}
7994}
7995
7996/*
7997 * Iterate all events that need to receive side-band events.
7998 *
7999 * For new callers: ensure that account_pmu_sb_event() includes
8000 * your event, otherwise side-band records might not get delivered to it.
8001 */
8002static void
8003perf_iterate_sb(perf_iterate_f output, void *data,
8004	       struct perf_event_context *task_ctx)
8005{
8006	struct perf_event_context *ctx;
8007
8008	rcu_read_lock();
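	/*
	 * Preemption is disabled so that perf_iterate_sb_cpu() keeps using
	 * this CPU's pmu_sb_events list via this_cpu_ptr().
	 */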
8009	preempt_disable();
8010
8011	/*
8012	 * If we have task_ctx != NULL we only notify the task context itself.
8013	 * The task_ctx is set only for EXIT events before releasing task
8014	 * context.
8015	 */
8016	if (task_ctx) {
8017		perf_iterate_ctx(task_ctx, output, data, false);
8018		goto done;
8019	}
8020
8021	perf_iterate_sb_cpu(output, data);
8022
8023	ctx = rcu_dereference(current->perf_event_ctxp);
8024	if (ctx)
8025		perf_iterate_ctx(ctx, output, data, false);
8026done:
8027	preempt_enable();
8028	rcu_read_unlock();
8029}
8030
8031/*
8032 * Clear all file-based filters at exec, they'll have to be
8033 * re-instated when/if these objects are mmapped again.
8034 */
8035static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
8036{
8037	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8038	struct perf_addr_filter *filter;
8039	unsigned int restart = 0, count = 0;
8040	unsigned long flags;
8041
8042	if (!has_addr_filter(event))
8043		return;
8044
8045	raw_spin_lock_irqsave(&ifh->lock, flags);
8046	list_for_each_entry(filter, &ifh->list, entry) {
8047		if (filter->path.dentry) {
8048			event->addr_filter_ranges[count].start = 0;
8049			event->addr_filter_ranges[count].size = 0;
8050			restart++;
8051		}
8052
8053		count++;
8054	}
8055
8056	if (restart)
8057		event->addr_filters_gen++;
8058	raw_spin_unlock_irqrestore(&ifh->lock, flags);
8059
8060	if (restart)
8061		perf_event_stop(event, 1);
8062}
8063
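/*
 * Called on exec: apply enable_on_exec/remove_on_exec attributes and clear
 * file-based address filters for the exec'ing task.
 */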
8064void perf_event_exec(void)
8065{
8066	struct perf_event_context *ctx;
8067
8068	ctx = perf_pin_task_context(current);
8069	if (!ctx)
8070		return;
8071
8072	perf_event_enable_on_exec(ctx);
8073	perf_event_remove_on_exec(ctx);
8074	perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
8075
8076	perf_unpin_context(ctx);
8077	put_ctx(ctx);
8078}
8079
8080struct remote_output {
8081	struct perf_buffer	*rb;
8082	int			err;
8083};
8084
8085static void __perf_event_output_stop(struct perf_event *event, void *data)
8086{
8087	struct perf_event *parent = event->parent;
8088	struct remote_output *ro = data;
8089	struct perf_buffer *rb = ro->rb;
8090	struct stop_event_data sd = {
8091		.event	= event,
8092	};
8093
8094	if (!has_aux(event))
8095		return;
8096
8097	if (!parent)
8098		parent = event;
8099
8100	/*
8101	 * In case of inheritance, it will be the parent that links to the
8102	 * ring-buffer, but it will be the child that's actually using it.
8103	 *
8104	 * We are using event::rb to determine if the event should be stopped;
8105	 * however, this may race with ring_buffer_attach() (through set_output),
8106	 * which will make us skip the event that actually needs to be stopped.
8107	 * So ring_buffer_attach() has to stop an aux event before re-assigning
8108	 * its rb pointer.
8109	 */
8110	if (rcu_dereference(parent->rb) == rb)
8111		ro->err = __perf_event_stop(&sd);
8112}
8113
8114static int __perf_pmu_output_stop(void *info)
8115{
8116	struct perf_event *event = info;
8117	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
8118	struct remote_output ro = {
8119		.rb	= event->rb,
8120	};
8121
8122	rcu_read_lock();
8123	perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
8124	if (cpuctx->task_ctx)
8125		perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
8126				   &ro, false);
8127	rcu_read_unlock();
8128
8129	return ro.err;
8130}
8131
8132static void perf_pmu_output_stop(struct perf_event *event)
8133{
8134	struct perf_event *iter;
8135	int err, cpu;
8136
8137restart:
8138	rcu_read_lock();
8139	list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
8140		/*
8141		 * For per-CPU events, we need to make sure that neither they
8142		 * nor their children are running; for cpu==-1 events it's
8143		 * sufficient to stop the event itself if it's active, since
8144		 * it can't have children.
8145		 */
8146		cpu = iter->cpu;
8147		if (cpu == -1)
8148			cpu = READ_ONCE(iter->oncpu);
8149
8150		if (cpu == -1)
8151			continue;
8152
8153		err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
8154		if (err == -EAGAIN) {
8155			rcu_read_unlock();
8156			goto restart;
8157		}
8158	}
8159	rcu_read_unlock();
8160}
8161
8162/*
8163 * task tracking -- fork/exit
8164 *
8165 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
8166 */
8167
8168struct perf_task_event {
8169	struct task_struct		*task;
8170	struct perf_event_context	*task_ctx;
8171
8172	struct {
8173		struct perf_event_header	header;
8174
8175		u32				pid;
8176		u32				ppid;
8177		u32				tid;
8178		u32				ptid;
8179		u64				time;
8180	} event_id;
8181};
8182
8183static int perf_event_task_match(struct perf_event *event)
8184{
8185	return event->attr.comm  || event->attr.mmap ||
8186	       event->attr.mmap2 || event->attr.mmap_data ||
8187	       event->attr.task;
8188}
8189
8190static void perf_event_task_output(struct perf_event *event,
8191				   void *data)
8192{
8193	struct perf_task_event *task_event = data;
8194	struct perf_output_handle handle;
8195	struct perf_sample_data	sample;
8196	struct task_struct *task = task_event->task;
8197	int ret, size = task_event->event_id.header.size;
8198
8199	if (!perf_event_task_match(event))
8200		return;
8201
8202	perf_event_header__init_id(&task_event->event_id.header, &sample, event);
8203
8204	ret = perf_output_begin(&handle, &sample, event,
8205				task_event->event_id.header.size);
8206	if (ret)
8207		goto out;
8208
8209	task_event->event_id.pid = perf_event_pid(event, task);
8210	task_event->event_id.tid = perf_event_tid(event, task);
8211
8212	if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
8213		task_event->event_id.ppid = perf_event_pid(event,
8214							task->real_parent);
8215		task_event->event_id.ptid = perf_event_pid(event,
8216							task->real_parent);
8217	} else {  /* PERF_RECORD_FORK */
8218		task_event->event_id.ppid = perf_event_pid(event, current);
8219		task_event->event_id.ptid = perf_event_tid(event, current);
8220	}
8221
8222	task_event->event_id.time = perf_event_clock(event);
8223
8224	perf_output_put(&handle, task_event->event_id);
8225
8226	perf_event__output_id_sample(event, &handle, &sample);
8227
8228	perf_output_end(&handle);
8229out:
8230	task_event->event_id.header.size = size;
8231}
8232
8233static void perf_event_task(struct task_struct *task,
8234			      struct perf_event_context *task_ctx,
8235			      int new)
8236{
8237	struct perf_task_event task_event;
8238
8239	if (!atomic_read(&nr_comm_events) &&
8240	    !atomic_read(&nr_mmap_events) &&
8241	    !atomic_read(&nr_task_events))
8242		return;
8243
8244	task_event = (struct perf_task_event){
8245		.task	  = task,
8246		.task_ctx = task_ctx,
8247		.event_id    = {
8248			.header = {
8249				.type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
8250				.misc = 0,
8251				.size = sizeof(task_event.event_id),
8252			},
8253			/* .pid  */
8254			/* .ppid */
8255			/* .tid  */
8256			/* .ptid */
8257			/* .time */
8258		},
8259	};
8260
8261	perf_iterate_sb(perf_event_task_output,
8262		       &task_event,
8263		       task_ctx);
8264}
8265
8266void perf_event_fork(struct task_struct *task)
8267{
8268	perf_event_task(task, NULL, 1);
8269	perf_event_namespaces(task);
8270}
8271
8272/*
8273 * comm tracking
8274 */
8275
8276struct perf_comm_event {
8277	struct task_struct	*task;
8278	char			*comm;
8279	int			comm_size;
8280
8281	struct {
8282		struct perf_event_header	header;
8283
8284		u32				pid;
8285		u32				tid;
8286	} event_id;
8287};
8288
8289static int perf_event_comm_match(struct perf_event *event)
8290{
8291	return event->attr.comm;
8292}
8293
8294static void perf_event_comm_output(struct perf_event *event,
8295				   void *data)
8296{
8297	struct perf_comm_event *comm_event = data;
8298	struct perf_output_handle handle;
8299	struct perf_sample_data sample;
8300	int size = comm_event->event_id.header.size;
8301	int ret;
8302
8303	if (!perf_event_comm_match(event))
8304		return;
8305
8306	perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
8307	ret = perf_output_begin(&handle, &sample, event,
8308				comm_event->event_id.header.size);
8309
8310	if (ret)
8311		goto out;
8312
8313	comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
8314	comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
8315
8316	perf_output_put(&handle, comm_event->event_id);
8317	__output_copy(&handle, comm_event->comm,
8318				   comm_event->comm_size);
8319
8320	perf_event__output_id_sample(event, &handle, &sample);
8321
8322	perf_output_end(&handle);
8323out:
8324	comm_event->event_id.header.size = size;
8325}
8326
8327static void perf_event_comm_event(struct perf_comm_event *comm_event)
8328{
8329	char comm[TASK_COMM_LEN];
8330	unsigned int size;
8331
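	/*
	 * Zero the whole buffer first so the u64-aligned padding copied out
	 * to the ring buffer never leaks stack bytes to userspace.
	 */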
8332	memset(comm, 0, sizeof(comm));
8333	strscpy(comm, comm_event->task->comm, sizeof(comm));
8334	size = ALIGN(strlen(comm)+1, sizeof(u64));
8335
8336	comm_event->comm = comm;
8337	comm_event->comm_size = size;
8338
8339	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
8340
8341	perf_iterate_sb(perf_event_comm_output,
8342		       comm_event,
8343		       NULL);
8344}
8345
8346void perf_event_comm(struct task_struct *task, bool exec)
8347{
8348	struct perf_comm_event comm_event;
8349
8350	if (!atomic_read(&nr_comm_events))
8351		return;
8352
8353	comm_event = (struct perf_comm_event){
8354		.task	= task,
8355		/* .comm      */
8356		/* .comm_size */
8357		.event_id  = {
8358			.header = {
8359				.type = PERF_RECORD_COMM,
8360				.misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
8361				/* .size */
8362			},
8363			/* .pid */
8364			/* .tid */
8365		},
8366	};
8367
8368	perf_event_comm_event(&comm_event);
8369}
8370
8371/*
8372 * namespaces tracking
8373 */
8374
8375struct perf_namespaces_event {
8376	struct task_struct		*task;
8377
8378	struct {
8379		struct perf_event_header	header;
8380
8381		u32				pid;
8382		u32				tid;
8383		u64				nr_namespaces;
8384		struct perf_ns_link_info	link_info[NR_NAMESPACES];
8385	} event_id;
8386};
8387
8388static int perf_event_namespaces_match(struct perf_event *event)
8389{
8390	return event->attr.namespaces;
8391}
8392
8393static void perf_event_namespaces_output(struct perf_event *event,
8394					 void *data)
8395{
8396	struct perf_namespaces_event *namespaces_event = data;
8397	struct perf_output_handle handle;
8398	struct perf_sample_data sample;
8399	u16 header_size = namespaces_event->event_id.header.size;
8400	int ret;
8401
8402	if (!perf_event_namespaces_match(event))
8403		return;
8404
8405	perf_event_header__init_id(&namespaces_event->event_id.header,
8406				   &sample, event);
8407	ret = perf_output_begin(&handle, &sample, event,
8408				namespaces_event->event_id.header.size);
8409	if (ret)
8410		goto out;
8411
8412	namespaces_event->event_id.pid = perf_event_pid(event,
8413							namespaces_event->task);
8414	namespaces_event->event_id.tid = perf_event_tid(event,
8415							namespaces_event->task);
8416
8417	perf_output_put(&handle, namespaces_event->event_id);
8418
8419	perf_event__output_id_sample(event, &handle, &sample);
8420
8421	perf_output_end(&handle);
8422out:
8423	namespaces_event->event_id.header.size = header_size;
8424}
8425
8426static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
8427				   struct task_struct *task,
8428				   const struct proc_ns_operations *ns_ops)
8429{
8430	struct path ns_path;
8431	struct inode *ns_inode;
8432	int error;
8433
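	/* On failure, the entry stays as the caller initialized it (zeroed). */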
8434	error = ns_get_path(&ns_path, task, ns_ops);
8435	if (!error) {
8436		ns_inode = ns_path.dentry->d_inode;
8437		ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
8438		ns_link_info->ino = ns_inode->i_ino;
8439		path_put(&ns_path);
8440	}
8441}
8442
8443void perf_event_namespaces(struct task_struct *task)
8444{
8445	struct perf_namespaces_event namespaces_event;
8446	struct perf_ns_link_info *ns_link_info;
8447
8448	if (!atomic_read(&nr_namespaces_events))
8449		return;
8450
8451	namespaces_event = (struct perf_namespaces_event){
8452		.task	= task,
8453		.event_id  = {
8454			.header = {
8455				.type = PERF_RECORD_NAMESPACES,
8456				.misc = 0,
8457				.size = sizeof(namespaces_event.event_id),
8458			},
8459			/* .pid */
8460			/* .tid */
8461			.nr_namespaces = NR_NAMESPACES,
8462			/* .link_info[NR_NAMESPACES] */
8463		},
8464	};
8465
8466	ns_link_info = namespaces_event.event_id.link_info;
8467
8468	perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
8469			       task, &mntns_operations);
8470
8471#ifdef CONFIG_USER_NS
8472	perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
8473			       task, &userns_operations);
8474#endif
8475#ifdef CONFIG_NET_NS
8476	perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
8477			       task, &netns_operations);
8478#endif
8479#ifdef CONFIG_UTS_NS
8480	perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
8481			       task, &utsns_operations);
8482#endif
8483#ifdef CONFIG_IPC_NS
8484	perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
8485			       task, &ipcns_operations);
8486#endif
8487#ifdef CONFIG_PID_NS
8488	perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
8489			       task, &pidns_operations);
8490#endif
8491#ifdef CONFIG_CGROUPS
8492	perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
8493			       task, &cgroupns_operations);
8494#endif
8495
8496	perf_iterate_sb(perf_event_namespaces_output,
8497			&namespaces_event,
8498			NULL);
8499}
8500
8501/*
8502 * cgroup tracking
8503 */
8504#ifdef CONFIG_CGROUP_PERF
8505
8506struct perf_cgroup_event {
8507	char				*path;
8508	int				path_size;
8509	struct {
8510		struct perf_event_header	header;
8511		u64				id;
8512		char				path[];
8513	} event_id;
8514};
8515
8516static int perf_event_cgroup_match(struct perf_event *event)
8517{
8518	return event->attr.cgroup;
8519}
8520
8521static void perf_event_cgroup_output(struct perf_event *event, void *data)
8522{
8523	struct perf_cgroup_event *cgroup_event = data;
8524	struct perf_output_handle handle;
8525	struct perf_sample_data sample;
8526	u16 header_size = cgroup_event->event_id.header.size;
8527	int ret;
8528
8529	if (!perf_event_cgroup_match(event))
8530		return;
8531
8532	perf_event_header__init_id(&cgroup_event->event_id.header,
8533				   &sample, event);
8534	ret = perf_output_begin(&handle, &sample, event,
8535				cgroup_event->event_id.header.size);
8536	if (ret)
8537		goto out;
8538
8539	perf_output_put(&handle, cgroup_event->event_id);
8540	__output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
8541
8542	perf_event__output_id_sample(event, &handle, &sample);
8543
8544	perf_output_end(&handle);
8545out:
8546	cgroup_event->event_id.header.size = header_size;
8547}
8548
8549static void perf_event_cgroup(struct cgroup *cgrp)
8550{
8551	struct perf_cgroup_event cgroup_event;
8552	char path_enomem[16] = "//enomem";
8553	char *pathname;
8554	size_t size;
8555
8556	if (!atomic_read(&nr_cgroup_events))
8557		return;
8558
8559	cgroup_event = (struct perf_cgroup_event){
8560		.event_id  = {
8561			.header = {
8562				.type = PERF_RECORD_CGROUP,
8563				.misc = 0,
8564				.size = sizeof(cgroup_event.event_id),
8565			},
8566			.id = cgroup_id(cgrp),
8567		},
8568	};
8569
8570	pathname = kmalloc(PATH_MAX, GFP_KERNEL);
8571	if (pathname == NULL) {
8572		cgroup_event.path = path_enomem;
8573	} else {
8574		/* leave room for the u64 alignment padding added below */
8575		cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
8576		cgroup_event.path = pathname;
8577	}
8578
8579	/*
8580	 * Since our buffer works in 8 byte units we need to align our string
8581	 * size to a multiple of 8. However, we must guarantee the tail end is
8582	 * zero'd out to avoid leaking random bits to userspace.
8583	 */
8584	size = strlen(cgroup_event.path) + 1;
8585	while (!IS_ALIGNED(size, sizeof(u64)))
8586		cgroup_event.path[size++] = '\0';
8587
8588	cgroup_event.event_id.header.size += size;
8589	cgroup_event.path_size = size;
8590
8591	perf_iterate_sb(perf_event_cgroup_output,
8592			&cgroup_event,
8593			NULL);
8594
8595	kfree(pathname);
8596}
8597
8598#endif
8599
8600/*
8601 * mmap tracking
8602 */
8603
8604struct perf_mmap_event {
8605	struct vm_area_struct	*vma;
8606
8607	const char		*file_name;
8608	int			file_size;
8609	int			maj, min;
8610	u64			ino;
8611	u64			ino_generation;
8612	u32			prot, flags;
8613	u8			build_id[BUILD_ID_SIZE_MAX];
8614	u32			build_id_size;
8615
8616	struct {
8617		struct perf_event_header	header;
8618
8619		u32				pid;
8620		u32				tid;
8621		u64				start;
8622		u64				len;
8623		u64				pgoff;
8624	} event_id;
8625};
8626
8627static int perf_event_mmap_match(struct perf_event *event,
8628				 void *data)
8629{
8630	struct perf_mmap_event *mmap_event = data;
8631	struct vm_area_struct *vma = mmap_event->vma;
8632	int executable = vma->vm_flags & VM_EXEC;
8633
8634	return (!executable && event->attr.mmap_data) ||
8635	       (executable && (event->attr.mmap || event->attr.mmap2));
8636}
8637
8638static void perf_event_mmap_output(struct perf_event *event,
8639				   void *data)
8640{
8641	struct perf_mmap_event *mmap_event = data;
8642	struct perf_output_handle handle;
8643	struct perf_sample_data sample;
8644	int size = mmap_event->event_id.header.size;
8645	u32 type = mmap_event->event_id.header.type;
8646	bool use_build_id;
8647	int ret;
8648
8649	if (!perf_event_mmap_match(event, data))
8650		return;
8651
8652	if (event->attr.mmap2) {
8653		mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
8654		mmap_event->event_id.header.size += sizeof(mmap_event->maj);
8655		mmap_event->event_id.header.size += sizeof(mmap_event->min);
8656		mmap_event->event_id.header.size += sizeof(mmap_event->ino);
8657		mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
8658		mmap_event->event_id.header.size += sizeof(mmap_event->prot);
8659		mmap_event->event_id.header.size += sizeof(mmap_event->flags);
8660	}
8661
8662	perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
8663	ret = perf_output_begin(&handle, &sample, event,
8664				mmap_event->event_id.header.size);
8665	if (ret)
8666		goto out;
8667
8668	mmap_event->event_id.pid = perf_event_pid(event, current);
8669	mmap_event->event_id.tid = perf_event_tid(event, current);
8670
8671	use_build_id = event->attr.build_id && mmap_event->build_id_size;
8672
8673	if (event->attr.mmap2 && use_build_id)
8674		mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;
8675
8676	perf_output_put(&handle, mmap_event->event_id);
8677
8678	if (event->attr.mmap2) {
8679		if (use_build_id) {
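			/* build_id_size followed by three reserved (zero) bytes */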
8680			u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };
8681
8682			__output_copy(&handle, size, 4);
8683			__output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
8684		} else {
8685			perf_output_put(&handle, mmap_event->maj);
8686			perf_output_put(&handle, mmap_event->min);
8687			perf_output_put(&handle, mmap_event->ino);
8688			perf_output_put(&handle, mmap_event->ino_generation);
8689		}
8690		perf_output_put(&handle, mmap_event->prot);
8691		perf_output_put(&handle, mmap_event->flags);
8692	}
8693
8694	__output_copy(&handle, mmap_event->file_name,
8695				   mmap_event->file_size);
8696
8697	perf_event__output_id_sample(event, &handle, &sample);
8698
8699	perf_output_end(&handle);
8700out:
8701	mmap_event->event_id.header.size = size;
8702	mmap_event->event_id.header.type = type;
8703}
8704
8705static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
8706{
8707	struct vm_area_struct *vma = mmap_event->vma;
8708	struct file *file = vma->vm_file;
8709	int maj = 0, min = 0;
8710	u64 ino = 0, gen = 0;
8711	u32 prot = 0, flags = 0;
8712	unsigned int size;
8713	char tmp[16];
8714	char *buf = NULL;
8715	char *name = NULL;
8716
8717	if (vma->vm_flags & VM_READ)
8718		prot |= PROT_READ;
8719	if (vma->vm_flags & VM_WRITE)
8720		prot |= PROT_WRITE;
8721	if (vma->vm_flags & VM_EXEC)
8722		prot |= PROT_EXEC;
8723
8724	if (vma->vm_flags & VM_MAYSHARE)
8725		flags = MAP_SHARED;
8726	else
8727		flags = MAP_PRIVATE;
8728
8729	if (vma->vm_flags & VM_LOCKED)
8730		flags |= MAP_LOCKED;
8731	if (is_vm_hugetlb_page(vma))
8732		flags |= MAP_HUGETLB;
8733
8734	if (file) {
8735		struct inode *inode;
8736		dev_t dev;
8737
8738		buf = kmalloc(PATH_MAX, GFP_KERNEL);
8739		if (!buf) {
8740			name = "//enomem";
8741			goto cpy_name;
8742		}
8743		/*
8744		 * d_path() works from the end of the buffer backwards, so we
8745		 * need to add enough zero bytes after the string to handle
8746		 * the 64bit alignment we do later.
8747		 */
8748		name = file_path(file, buf, PATH_MAX - sizeof(u64));
8749		if (IS_ERR(name)) {
8750			name = "//toolong";
8751			goto cpy_name;
8752		}
8753		inode = file_inode(vma->vm_file);
8754		dev = inode->i_sb->s_dev;
8755		ino = inode->i_ino;
8756		gen = inode->i_generation;
8757		maj = MAJOR(dev);
8758		min = MINOR(dev);
8759
8760		goto got_name;
8761	} else {
8762		if (vma->vm_ops && vma->vm_ops->name)
8763			name = (char *) vma->vm_ops->name(vma);
8764		if (!name)
8765			name = (char *)arch_vma_name(vma);
8766		if (!name) {
8767			if (vma_is_initial_heap(vma))
8768				name = "[heap]";
8769			else if (vma_is_initial_stack(vma))
8770				name = "[stack]";
8771			else
8772				name = "//anon";
8773		}
8774	}
8775
8776cpy_name:
8777	strscpy(tmp, name, sizeof(tmp));
8778	name = tmp;
8779got_name:
8780	/*
8781	 * Since our buffer works in 8 byte units we need to align our string
8782	 * size to a multiple of 8. However, we must guarantee the tail end is
8783	 * zero'd out to avoid leaking random bits to userspace.
8784	 */
8785	size = strlen(name)+1;
8786	while (!IS_ALIGNED(size, sizeof(u64)))
8787		name[size++] = '\0';
8788
8789	mmap_event->file_name = name;
8790	mmap_event->file_size = size;
8791	mmap_event->maj = maj;
8792	mmap_event->min = min;
8793	mmap_event->ino = ino;
8794	mmap_event->ino_generation = gen;
8795	mmap_event->prot = prot;
8796	mmap_event->flags = flags;
8797
8798	if (!(vma->vm_flags & VM_EXEC))
8799		mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
8800
8801	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
8802
8803	if (atomic_read(&nr_build_id_events))
8804		build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
8805
8806	perf_iterate_sb(perf_event_mmap_output,
8807		       mmap_event,
8808		       NULL);
8809
8810	kfree(buf);
8811}
8812
8813/*
8814 * Check whether inode and address range match filter criteria.
8815 */
8816static bool perf_addr_filter_match(struct perf_addr_filter *filter,
8817				     struct file *file, unsigned long offset,
8818				     unsigned long size)
8819{
8820	/* d_inode(NULL) won't be equal to any mapped user-space file */
8821	if (!filter->path.dentry)
8822		return false;
8823
8824	if (d_inode(filter->path.dentry) != file_inode(file))
8825		return false;
8826
8827	if (filter->offset > offset + size)
8828		return false;
8829
8830	if (filter->offset + filter->size < offset)
8831		return false;
8832
8833	return true;
8834}
8835
8836static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
8837					struct vm_area_struct *vma,
8838					struct perf_addr_filter_range *fr)
8839{
8840	unsigned long vma_size = vma->vm_end - vma->vm_start;
8841	unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
8842	struct file *file = vma->vm_file;
8843
8844	if (!perf_addr_filter_match(filter, file, off, vma_size))
8845		return false;
8846
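	/*
	 * Intersect the filter's file range with what this vma maps and
	 * translate the result into virtual addresses.
	 */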
8847	if (filter->offset < off) {
8848		fr->start = vma->vm_start;
8849		fr->size = min(vma_size, filter->size - (off - filter->offset));
8850	} else {
8851		fr->start = vma->vm_start + filter->offset - off;
8852		fr->size = min(vma->vm_end - fr->start, filter->size);
8853	}
8854
8855	return true;
8856}
8857
8858static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
8859{
8860	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8861	struct vm_area_struct *vma = data;
8862	struct perf_addr_filter *filter;
8863	unsigned int restart = 0, count = 0;
8864	unsigned long flags;
8865
8866	if (!has_addr_filter(event))
8867		return;
8868
8869	if (!vma->vm_file)
8870		return;
8871
8872	raw_spin_lock_irqsave(&ifh->lock, flags);
8873	list_for_each_entry(filter, &ifh->list, entry) {
8874		if (perf_addr_filter_vma_adjust(filter, vma,
8875						&event->addr_filter_ranges[count]))
8876			restart++;
8877
8878		count++;
8879	}
8880
8881	if (restart)
8882		event->addr_filters_gen++;
8883	raw_spin_unlock_irqrestore(&ifh->lock, flags);
8884
8885	if (restart)
8886		perf_event_stop(event, 1);
8887}
8888
8889/*
8890 * Adjust the address filters of all the current task's events to the new vma
8891 */
8892static void perf_addr_filters_adjust(struct vm_area_struct *vma)
8893{
8894	struct perf_event_context *ctx;
8895
8896	/*
8897	 * Data tracing isn't supported yet and as such there is no need
8898	 * to keep track of anything that isn't related to executable code:
8899	 */
8900	if (!(vma->vm_flags & VM_EXEC))
8901		return;
8902
8903	rcu_read_lock();
8904	ctx = rcu_dereference(current->perf_event_ctxp);
8905	if (ctx)
8906		perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
8907	rcu_read_unlock();
8908}
8909
8910void perf_event_mmap(struct vm_area_struct *vma)
8911{
8912	struct perf_mmap_event mmap_event;
8913
8914	if (!atomic_read(&nr_mmap_events))
8915		return;
8916
8917	mmap_event = (struct perf_mmap_event){
8918		.vma	= vma,
8919		/* .file_name */
8920		/* .file_size */
8921		.event_id  = {
8922			.header = {
8923				.type = PERF_RECORD_MMAP,
8924				.misc = PERF_RECORD_MISC_USER,
8925				/* .size */
8926			},
8927			/* .pid */
8928			/* .tid */
8929			.start  = vma->vm_start,
8930			.len    = vma->vm_end - vma->vm_start,
8931			.pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
8932		},
8933		/* .maj (attr_mmap2 only) */
8934		/* .min (attr_mmap2 only) */
8935		/* .ino (attr_mmap2 only) */
8936		/* .ino_generation (attr_mmap2 only) */
8937		/* .prot (attr_mmap2 only) */
8938		/* .flags (attr_mmap2 only) */
8939	};
8940
8941	perf_addr_filters_adjust(vma);
8942	perf_event_mmap_event(&mmap_event);
8943}
8944
8945void perf_event_aux_event(struct perf_event *event, unsigned long head,
8946			  unsigned long size, u64 flags)
8947{
8948	struct perf_output_handle handle;
8949	struct perf_sample_data sample;
8950	struct perf_aux_event {
8951		struct perf_event_header	header;
8952		u64				offset;
8953		u64				size;
8954		u64				flags;
8955	} rec = {
8956		.header = {
8957			.type = PERF_RECORD_AUX,
8958			.misc = 0,
8959			.size = sizeof(rec),
8960		},
8961		.offset		= head,
8962		.size		= size,
8963		.flags		= flags,
8964	};
8965	int ret;
8966
8967	perf_event_header__init_id(&rec.header, &sample, event);
8968	ret = perf_output_begin(&handle, &sample, event, rec.header.size);
8969
8970	if (ret)
8971		return;
8972
8973	perf_output_put(&handle, rec);
8974	perf_event__output_id_sample(event, &handle, &sample);
8975
8976	perf_output_end(&handle);
8977}
8978
8979/*
8980 * Lost/dropped samples logging
8981 */
8982void perf_log_lost_samples(struct perf_event *event, u64 lost)
8983{
8984	struct perf_output_handle handle;
8985	struct perf_sample_data sample;
8986	int ret;
8987
8988	struct {
8989		struct perf_event_header	header;
8990		u64				lost;
8991	} lost_samples_event = {
8992		.header = {
8993			.type = PERF_RECORD_LOST_SAMPLES,
8994			.misc = 0,
8995			.size = sizeof(lost_samples_event),
8996		},
8997		.lost		= lost,
8998	};
8999
9000	perf_event_header__init_id(&lost_samples_event.header, &sample, event);
9001
9002	ret = perf_output_begin(&handle, &sample, event,
9003				lost_samples_event.header.size);
9004	if (ret)
9005		return;
9006
9007	perf_output_put(&handle, lost_samples_event);
9008	perf_event__output_id_sample(event, &handle, &sample);
9009	perf_output_end(&handle);
9010}
9011
9012/*
9013 * context_switch tracking
9014 */
9015
9016struct perf_switch_event {
9017	struct task_struct	*task;
9018	struct task_struct	*next_prev;
9019
9020	struct {
9021		struct perf_event_header	header;
9022		u32				next_prev_pid;
9023		u32				next_prev_tid;
9024	} event_id;
9025};
9026
9027static int perf_event_switch_match(struct perf_event *event)
9028{
9029	return event->attr.context_switch;
9030}
9031
9032static void perf_event_switch_output(struct perf_event *event, void *data)
9033{
9034	struct perf_switch_event *se = data;
9035	struct perf_output_handle handle;
9036	struct perf_sample_data sample;
9037	int ret;
9038
9039	if (!perf_event_switch_match(event))
9040		return;
9041
9042	/* Only CPU-wide events are allowed to see next/prev pid/tid */
9043	if (event->ctx->task) {
9044		se->event_id.header.type = PERF_RECORD_SWITCH;
9045		se->event_id.header.size = sizeof(se->event_id.header);
9046	} else {
9047		se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
9048		se->event_id.header.size = sizeof(se->event_id);
9049		se->event_id.next_prev_pid =
9050					perf_event_pid(event, se->next_prev);
9051		se->event_id.next_prev_tid =
9052					perf_event_tid(event, se->next_prev);
9053	}
9054
9055	perf_event_header__init_id(&se->event_id.header, &sample, event);
9056
9057	ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
9058	if (ret)
9059		return;
9060
9061	if (event->ctx->task)
9062		perf_output_put(&handle, se->event_id.header);
9063	else
9064		perf_output_put(&handle, se->event_id);
9065
9066	perf_event__output_id_sample(event, &handle, &sample);
9067
9068	perf_output_end(&handle);
9069}
9070
9071static void perf_event_switch(struct task_struct *task,
9072			      struct task_struct *next_prev, bool sched_in)
9073{
9074	struct perf_switch_event switch_event;
9075
9076	/* N.B. caller checks nr_switch_events != 0 */
9077
9078	switch_event = (struct perf_switch_event){
9079		.task		= task,
9080		.next_prev	= next_prev,
9081		.event_id	= {
9082			.header = {
9083				/* .type */
9084				.misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
9085				/* .size */
9086			},
9087			/* .next_prev_pid */
9088			/* .next_prev_tid */
9089		},
9090	};
9091
9092	if (!sched_in && task->on_rq) {
9093		switch_event.event_id.header.misc |=
9094				PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
9095	}
9096
9097	perf_iterate_sb(perf_event_switch_output, &switch_event, NULL);
9098}
9099
9100/*
9101 * IRQ throttle logging
9102 */
9103
9104static void perf_log_throttle(struct perf_event *event, int enable)
9105{
9106	struct perf_output_handle handle;
9107	struct perf_sample_data sample;
9108	int ret;
9109
9110	struct {
9111		struct perf_event_header	header;
9112		u64				time;
9113		u64				id;
9114		u64				stream_id;
9115	} throttle_event = {
9116		.header = {
9117			.type = PERF_RECORD_THROTTLE,
9118			.misc = 0,
9119			.size = sizeof(throttle_event),
9120		},
9121		.time		= perf_event_clock(event),
9122		.id		= primary_event_id(event),
9123		.stream_id	= event->id,
9124	};
9125
9126	if (enable)
9127		throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
9128
9129	perf_event_header__init_id(&throttle_event.header, &sample, event);
9130
9131	ret = perf_output_begin(&handle, &sample, event,
9132				throttle_event.header.size);
9133	if (ret)
9134		return;
9135
9136	perf_output_put(&handle, throttle_event);
9137	perf_event__output_id_sample(event, &handle, &sample);
9138	perf_output_end(&handle);
9139}
9140
9141/*
9142 * ksymbol register/unregister tracking
9143 */
9144
9145struct perf_ksymbol_event {
9146	const char	*name;
9147	int		name_len;
9148	struct {
9149		struct perf_event_header        header;
9150		u64				addr;
9151		u32				len;
9152		u16				ksym_type;
9153		u16				flags;
9154	} event_id;
9155};
9156
9157static int perf_event_ksymbol_match(struct perf_event *event)
9158{
9159	return event->attr.ksymbol;
9160}
9161
9162static void perf_event_ksymbol_output(struct perf_event *event, void *data)
9163{
9164	struct perf_ksymbol_event *ksymbol_event = data;
9165	struct perf_output_handle handle;
9166	struct perf_sample_data sample;
9167	int ret;
9168
9169	if (!perf_event_ksymbol_match(event))
9170		return;
9171
9172	perf_event_header__init_id(&ksymbol_event->event_id.header,
9173				   &sample, event);
9174	ret = perf_output_begin(&handle, &sample, event,
9175				ksymbol_event->event_id.header.size);
9176	if (ret)
9177		return;
9178
9179	perf_output_put(&handle, ksymbol_event->event_id);
9180	__output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
9181	perf_event__output_id_sample(event, &handle, &sample);
9182
9183	perf_output_end(&handle);
9184}
9185
9186void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
9187			const char *sym)
9188{
9189	struct perf_ksymbol_event ksymbol_event;
9190	char name[KSYM_NAME_LEN];
9191	u16 flags = 0;
9192	int name_len;
9193
9194	if (!atomic_read(&nr_ksymbol_events))
9195		return;
9196
9197	if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
9198	    ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
9199		goto err;
9200
9201	strscpy(name, sym, KSYM_NAME_LEN);
9202	name_len = strlen(name) + 1;
9203	while (!IS_ALIGNED(name_len, sizeof(u64)))
9204		name[name_len++] = '\0';
9205	BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
9206
9207	if (unregister)
9208		flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
9209
9210	ksymbol_event = (struct perf_ksymbol_event){
9211		.name = name,
9212		.name_len = name_len,
9213		.event_id = {
9214			.header = {
9215				.type = PERF_RECORD_KSYMBOL,
9216				.size = sizeof(ksymbol_event.event_id) +
9217					name_len,
9218			},
9219			.addr = addr,
9220			.len = len,
9221			.ksym_type = ksym_type,
9222			.flags = flags,
9223		},
9224	};
9225
9226	perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
9227	return;
9228err:
9229	WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
9230}
9231
9232/*
9233 * bpf program load/unload tracking
9234 */
9235
9236struct perf_bpf_event {
9237	struct bpf_prog	*prog;
9238	struct {
9239		struct perf_event_header        header;
9240		u16				type;
9241		u16				flags;
9242		u32				id;
9243		u8				tag[BPF_TAG_SIZE];
9244	} event_id;
9245};
9246
9247static int perf_event_bpf_match(struct perf_event *event)
9248{
9249	return event->attr.bpf_event;
9250}
9251
9252static void perf_event_bpf_output(struct perf_event *event, void *data)
9253{
9254	struct perf_bpf_event *bpf_event = data;
9255	struct perf_output_handle handle;
9256	struct perf_sample_data sample;
9257	int ret;
9258
9259	if (!perf_event_bpf_match(event))
9260		return;
9261
9262	perf_event_header__init_id(&bpf_event->event_id.header,
9263				   &sample, event);
9264	ret = perf_output_begin(&handle, &sample, event,
9265				bpf_event->event_id.header.size);
9266	if (ret)
9267		return;
9268
9269	perf_output_put(&handle, bpf_event->event_id);
9270	perf_event__output_id_sample(event, &handle, &sample);
9271
9272	perf_output_end(&handle);
9273}
9274
9275static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
9276					 enum perf_bpf_event_type type)
9277{
9278	bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
9279	int i;
9280
9281	if (prog->aux->func_cnt == 0) {
9282		perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
9283				   (u64)(unsigned long)prog->bpf_func,
9284				   prog->jited_len, unregister,
9285				   prog->aux->ksym.name);
9286	} else {
9287		for (i = 0; i < prog->aux->func_cnt; i++) {
9288			struct bpf_prog *subprog = prog->aux->func[i];
9289
9290			perf_event_ksymbol(
9291				PERF_RECORD_KSYMBOL_TYPE_BPF,
9292				(u64)(unsigned long)subprog->bpf_func,
9293				subprog->jited_len, unregister,
9294				subprog->aux->ksym.name);
9295		}
9296	}
9297}
9298
9299void perf_event_bpf_event(struct bpf_prog *prog,
9300			  enum perf_bpf_event_type type,
9301			  u16 flags)
9302{
9303	struct perf_bpf_event bpf_event;
9304
9305	switch (type) {
9306	case PERF_BPF_EVENT_PROG_LOAD:
9307	case PERF_BPF_EVENT_PROG_UNLOAD:
9308		if (atomic_read(&nr_ksymbol_events))
9309			perf_event_bpf_emit_ksymbols(prog, type);
9310		break;
9311	default:
9312		return;
9313	}
9314
9315	if (!atomic_read(&nr_bpf_events))
9316		return;
9317
9318	bpf_event = (struct perf_bpf_event){
9319		.prog = prog,
9320		.event_id = {
9321			.header = {
9322				.type = PERF_RECORD_BPF_EVENT,
9323				.size = sizeof(bpf_event.event_id),
9324			},
9325			.type = type,
9326			.flags = flags,
9327			.id = prog->aux->id,
9328		},
9329	};
9330
9331	BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
9332
9333	memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
9334	perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
9335}
9336
9337struct perf_text_poke_event {
9338	const void		*old_bytes;
9339	const void		*new_bytes;
9340	size_t			pad;
9341	u16			old_len;
9342	u16			new_len;
9343
9344	struct {
9345		struct perf_event_header	header;
9346
9347		u64				addr;
9348	} event_id;
9349};
9350
9351static int perf_event_text_poke_match(struct perf_event *event)
9352{
9353	return event->attr.text_poke;
9354}
9355
9356static void perf_event_text_poke_output(struct perf_event *event, void *data)
9357{
9358	struct perf_text_poke_event *text_poke_event = data;
9359	struct perf_output_handle handle;
9360	struct perf_sample_data sample;
9361	u64 padding = 0;
9362	int ret;
9363
9364	if (!perf_event_text_poke_match(event))
9365		return;
9366
9367	perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
9368
9369	ret = perf_output_begin(&handle, &sample, event,
9370				text_poke_event->event_id.header.size);
9371	if (ret)
9372		return;
9373
9374	perf_output_put(&handle, text_poke_event->event_id);
9375	perf_output_put(&handle, text_poke_event->old_len);
9376	perf_output_put(&handle, text_poke_event->new_len);
9377
9378	__output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
9379	__output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
9380
9381	if (text_poke_event->pad)
9382		__output_copy(&handle, &padding, text_poke_event->pad);
9383
9384	perf_event__output_id_sample(event, &handle, &sample);
9385
9386	perf_output_end(&handle);
9387}
9388
9389void perf_event_text_poke(const void *addr, const void *old_bytes,
9390			  size_t old_len, const void *new_bytes, size_t new_len)
9391{
9392	struct perf_text_poke_event text_poke_event;
9393	size_t tot, pad;
9394
9395	if (!atomic_read(&nr_text_poke_events))
9396		return;
9397
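	/* Pad the variable-size payload (u16 lengths plus bytes) to a u64 boundary. */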
9398	tot  = sizeof(text_poke_event.old_len) + old_len;
9399	tot += sizeof(text_poke_event.new_len) + new_len;
9400	pad  = ALIGN(tot, sizeof(u64)) - tot;
9401
9402	text_poke_event = (struct perf_text_poke_event){
9403		.old_bytes    = old_bytes,
9404		.new_bytes    = new_bytes,
9405		.pad          = pad,
9406		.old_len      = old_len,
9407		.new_len      = new_len,
9408		.event_id  = {
9409			.header = {
9410				.type = PERF_RECORD_TEXT_POKE,
9411				.misc = PERF_RECORD_MISC_KERNEL,
9412				.size = sizeof(text_poke_event.event_id) + tot + pad,
9413			},
9414			.addr = (unsigned long)addr,
9415		},
9416	};
9417
9418	perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
9419}
9420
9421void perf_event_itrace_started(struct perf_event *event)
9422{
9423	event->attach_state |= PERF_ATTACH_ITRACE;
9424}
9425
9426static void perf_log_itrace_start(struct perf_event *event)
9427{
9428	struct perf_output_handle handle;
9429	struct perf_sample_data sample;
9430	struct perf_aux_event {
9431		struct perf_event_header        header;
9432		u32				pid;
9433		u32				tid;
9434	} rec;
9435	int ret;
9436
9437	if (event->parent)
9438		event = event->parent;
9439
9440	if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
9441	    event->attach_state & PERF_ATTACH_ITRACE)
9442		return;
9443
9444	rec.header.type	= PERF_RECORD_ITRACE_START;
9445	rec.header.misc	= 0;
9446	rec.header.size	= sizeof(rec);
9447	rec.pid	= perf_event_pid(event, current);
9448	rec.tid	= perf_event_tid(event, current);
9449
9450	perf_event_header__init_id(&rec.header, &sample, event);
9451	ret = perf_output_begin(&handle, &sample, event, rec.header.size);
9452
9453	if (ret)
9454		return;
9455
9456	perf_output_put(&handle, rec);
9457	perf_event__output_id_sample(event, &handle, &sample);
9458
9459	perf_output_end(&handle);
9460}
9461
9462void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
9463{
9464	struct perf_output_handle handle;
9465	struct perf_sample_data sample;
9466	struct perf_aux_event {
9467		struct perf_event_header        header;
9468		u64				hw_id;
9469	} rec;
9470	int ret;
9471
9472	if (event->parent)
9473		event = event->parent;
9474
9475	rec.header.type	= PERF_RECORD_AUX_OUTPUT_HW_ID;
9476	rec.header.misc	= 0;
9477	rec.header.size	= sizeof(rec);
9478	rec.hw_id	= hw_id;
9479
9480	perf_event_header__init_id(&rec.header, &sample, event);
9481	ret = perf_output_begin(&handle, &sample, event, rec.header.size);
9482
9483	if (ret)
9484		return;
9485
9486	perf_output_put(&handle, rec);
9487	perf_event__output_id_sample(event, &handle, &sample);
9488
9489	perf_output_end(&handle);
9490}
9491EXPORT_SYMBOL_GPL(perf_report_aux_output_id);
9492
9493static int
9494__perf_event_account_interrupt(struct perf_event *event, int throttle)
9495{
9496	struct hw_perf_event *hwc = &event->hw;
9497	int ret = 0;
9498	u64 seq;
9499
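	/*
	 * perf_throttled_seq advances on each timer tick; restarting the
	 * interrupt count on a new tick bounds samples to max_samples_per_tick.
	 */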
9500	seq = __this_cpu_read(perf_throttled_seq);
9501	if (seq != hwc->interrupts_seq) {
9502		hwc->interrupts_seq = seq;
9503		hwc->interrupts = 1;
9504	} else {
9505		hwc->interrupts++;
9506		if (unlikely(throttle &&
9507			     hwc->interrupts > max_samples_per_tick)) {
9508			__this_cpu_inc(perf_throttled_count);
9509			tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
9510			hwc->interrupts = MAX_INTERRUPTS;
9511			perf_log_throttle(event, 0);
9512			ret = 1;
9513		}
9514	}
9515
9516	if (event->attr.freq) {
9517		u64 now = perf_clock();
9518		s64 delta = now - hwc->freq_time_stamp;
9519
9520		hwc->freq_time_stamp = now;
9521
9522		if (delta > 0 && delta < 2*TICK_NSEC)
9523			perf_adjust_period(event, delta, hwc->last_period, true);
9524	}
9525
9526	return ret;
9527}
9528
9529int perf_event_account_interrupt(struct perf_event *event)
9530{
9531	return __perf_event_account_interrupt(event, 1);
9532}
9533
9534static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
9535{
9536	/*
9537	 * Due to interrupt latency (AKA "skid"), we may enter the
9538	 * kernel before taking an overflow, even if the PMU is only
9539	 * counting user events.
9540	 */
9541	if (event->attr.exclude_kernel && !user_mode(regs))
9542		return false;
9543
9544	return true;
9545}
9546
9547/*
9548 * Generic event overflow handling, sampling.
9549 */
9550
9551static int __perf_event_overflow(struct perf_event *event,
9552				 int throttle, struct perf_sample_data *data,
9553				 struct pt_regs *regs)
9554{
9555	int events = atomic_read(&event->event_limit);
9556	int ret = 0;
9557
9558	/*
9559	 * Non-sampling counters might still use the PMI to fold short
9560	 * hardware counters; ignore those.
9561	 */
9562	if (unlikely(!is_sampling_event(event)))
9563		return 0;
9564
9565	ret = __perf_event_account_interrupt(event, throttle);
9566
9567	/*
9568	 * XXX event_limit might not quite work as expected on inherited
9569	 * events
9570	 */
9571
9572	event->pending_kill = POLL_IN;
9573	if (events && atomic_dec_and_test(&event->event_limit)) {
9574		ret = 1;
9575		event->pending_kill = POLL_HUP;
9576		perf_event_disable_inatomic(event);
9577	}
9578
9579	if (event->attr.sigtrap) {
9580		/*
9581		 * The desired behaviour of sigtrap vs. invalid samples is a bit
9582		 * tricky: on the one hand, one should not lose the SIGTRAP if
9583		 * it is the first event; on the other hand, we should also not
9584		 * trigger the WARN or override the data address.
9585		 */
9586		bool valid_sample = sample_is_allowed(event, regs);
9587		unsigned int pending_id = 1;
9588
9589		if (regs)
9590			pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
9591		if (!event->pending_sigtrap) {
9592			event->pending_sigtrap = pending_id;
9593			local_inc(&event->ctx->nr_pending);
9594		} else if (event->attr.exclude_kernel && valid_sample) {
9595			/*
9596			 * Should not be able to return to user space without
9597			 * consuming pending_sigtrap; with exceptions:
9598			 *
9599			 *  1. Where !exclude_kernel, events can overflow again
9600			 *     in the kernel without returning to user space.
9601			 *
9602			 *  2. Events that can overflow again before the IRQ-
9603			 *     work without user space progress (e.g. hrtimer).
9604			 *     To approximate progress (with false negatives),
9605			 *     check 32-bit hash of the current IP.
9606			 */
9607			WARN_ON_ONCE(event->pending_sigtrap != pending_id);
9608		}
9609
9610		event->pending_addr = 0;
9611		if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
9612			event->pending_addr = data->addr;
9613		irq_work_queue(&event->pending_irq);
9614	}
9615
9616	READ_ONCE(event->overflow_handler)(event, data, regs);
9617
9618	if (*perf_event_fasync(event) && event->pending_kill) {
9619		event->pending_wakeup = 1;
9620		irq_work_queue(&event->pending_irq);
9621	}
9622
9623	return ret;
9624}
9625
9626int perf_event_overflow(struct perf_event *event,
9627			struct perf_sample_data *data,
9628			struct pt_regs *regs)
9629{
9630	return __perf_event_overflow(event, 1, data, regs);
9631}
9632
9633/*
9634 * Generic software event infrastructure
9635 */
9636
9637struct swevent_htable {
9638	struct swevent_hlist		*swevent_hlist;
9639	struct mutex			hlist_mutex;
9640	int				hlist_refcount;
9641
9642	/* Recursion avoidance in each context */
9643	int				recursion[PERF_NR_CONTEXTS];
9644};
9645
9646static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
9647
9648/*
9649 * We directly increment event->count and keep a second value in
9650 * event->hw.period_left to count intervals. This period counter
9651 * is kept in the range [-sample_period, 0] so that we can use its
9652 * sign as the trigger.
9653 */
9654
9655u64 perf_swevent_set_period(struct perf_event *event)
9656{
9657	struct hw_perf_event *hwc = &event->hw;
9658	u64 period = hwc->last_period;
9659	u64 nr, offset;
9660	s64 old, val;
9661
9662	hwc->last_period = hwc->sample_period;
9663
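	/*
	 * Count how many whole periods have elapsed; the cmpxchg loop below
	 * retries if period_left is changed concurrently.
	 */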
9664	old = local64_read(&hwc->period_left);
9665	do {
9666		val = old;
9667		if (val < 0)
9668			return 0;
9669
9670		nr = div64_u64(period + val, period);
9671		offset = nr * period;
9672		val -= offset;
9673	} while (!local64_try_cmpxchg(&hwc->period_left, &old, val));
9674
9675	return nr;
9676}
9677
9678static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
9679				    struct perf_sample_data *data,
9680				    struct pt_regs *regs)
9681{
9682	struct hw_perf_event *hwc = &event->hw;
9683	int throttle = 0;
9684
9685	if (!overflow)
9686		overflow = perf_swevent_set_period(event);
9687
9688	if (hwc->interrupts == MAX_INTERRUPTS)
9689		return;
9690
9691	for (; overflow; overflow--) {
9692		if (__perf_event_overflow(event, throttle,
9693					    data, regs)) {
9694			/*
9695			 * We inhibit the overflow from happening when
9696			 * hwc->interrupts == MAX_INTERRUPTS.
9697			 */
9698			break;
9699		}
9700		throttle = 1;
9701	}
9702}
9703
9704static void perf_swevent_event(struct perf_event *event, u64 nr,
9705			       struct perf_sample_data *data,
9706			       struct pt_regs *regs)
9707{
9708	struct hw_perf_event *hwc = &event->hw;
9709
9710	local64_add(nr, &event->count);
9711
9712	if (!regs)
9713		return;
9714
9715	if (!is_sampling_event(event))
9716		return;
9717
9718	if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
9719		data->period = nr;
9720		return perf_swevent_overflow(event, 1, data, regs);
9721	} else
9722		data->period = event->hw.last_period;
9723
9724	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
9725		return perf_swevent_overflow(event, 1, data, regs);
9726
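	/* Still negative after adding nr: no period boundary crossed yet. */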
9727	if (local64_add_negative(nr, &hwc->period_left))
9728		return;
9729
9730	perf_swevent_overflow(event, 0, data, regs);
9731}
9732
9733static int perf_exclude_event(struct perf_event *event,
9734			      struct pt_regs *regs)
9735{
9736	if (event->hw.state & PERF_HES_STOPPED)
9737		return 1;
9738
9739	if (regs) {
9740		if (event->attr.exclude_user && user_mode(regs))
9741			return 1;
9742
9743		if (event->attr.exclude_kernel && !user_mode(regs))
9744			return 1;
9745	}
9746
9747	return 0;
9748}
9749
9750static int perf_swevent_match(struct perf_event *event,
9751				enum perf_type_id type,
9752				u32 event_id,
9753				struct perf_sample_data *data,
9754				struct pt_regs *regs)
9755{
9756	if (event->attr.type != type)
9757		return 0;
9758
9759	if (event->attr.config != event_id)
9760		return 0;
9761
9762	if (perf_exclude_event(event, regs))
9763		return 0;
9764
9765	return 1;
9766}
9767
9768static inline u64 swevent_hash(u64 type, u32 event_id)
9769{
9770	u64 val = event_id | (type << 32);
9771
9772	return hash_64(val, SWEVENT_HLIST_BITS);
9773}
9774
9775static inline struct hlist_head *
9776__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
9777{
9778	u64 hash = swevent_hash(type, event_id);
9779
9780	return &hlist->heads[hash];
9781}
9782
9783/* For the read side: look up events when they trigger */
9784static inline struct hlist_head *
9785find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
9786{
9787	struct swevent_hlist *hlist;
9788
9789	hlist = rcu_dereference(swhash->swevent_hlist);
9790	if (!hlist)
9791		return NULL;
9792
9793	return __find_swevent_head(hlist, type, event_id);
9794}
9795
9796/* For the event head insertion and removal in the hlist */
9797static inline struct hlist_head *
9798find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
9799{
9800	struct swevent_hlist *hlist;
9801	u32 event_id = event->attr.config;
9802	u64 type = event->attr.type;
9803
9804	/*
9805	 * Event scheduling is always serialized against hlist allocation
9806	 * and release, which makes the protected version suitable here.
9807	 * The context lock guarantees that.
9808	 */
9809	hlist = rcu_dereference_protected(swhash->swevent_hlist,
9810					  lockdep_is_held(&event->ctx->lock));
9811	if (!hlist)
9812		return NULL;
9813
9814	return __find_swevent_head(hlist, type, event_id);
9815}
9816
9817static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
9818				    u64 nr,
9819				    struct perf_sample_data *data,
9820				    struct pt_regs *regs)
9821{
9822	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9823	struct perf_event *event;
9824	struct hlist_head *head;
9825
9826	rcu_read_lock();
9827	head = find_swevent_head_rcu(swhash, type, event_id);
9828	if (!head)
9829		goto end;
9830
9831	hlist_for_each_entry_rcu(event, head, hlist_entry) {
9832		if (perf_swevent_match(event, type, event_id, data, regs))
9833			perf_swevent_event(event, nr, data, regs);
9834	}
9835end:
9836	rcu_read_unlock();
9837}
9838
9839DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
9840
9841int perf_swevent_get_recursion_context(void)
9842{
9843	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9844
9845	return get_recursion_context(swhash->recursion);
9846}
9847EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
9848
9849void perf_swevent_put_recursion_context(int rctx)
9850{
9851	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9852
9853	put_recursion_context(swhash->recursion, rctx);
9854}
9855
9856void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
9857{
9858	struct perf_sample_data data;
9859
9860	if (WARN_ON_ONCE(!regs))
9861		return;
9862
9863	perf_sample_data_init(&data, addr, 0);
9864	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
9865}
9866
9867void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
9868{
9869	int rctx;
9870
9871	preempt_disable_notrace();
9872	rctx = perf_swevent_get_recursion_context();
9873	if (unlikely(rctx < 0))
9874		goto fail;
9875
9876	___perf_sw_event(event_id, nr, regs, addr);
9877
9878	perf_swevent_put_recursion_context(rctx);
9879fail:
9880	preempt_enable_notrace();
9881}
9882
9883static void perf_swevent_read(struct perf_event *event)
9884{
9885}
9886
9887static int perf_swevent_add(struct perf_event *event, int flags)
9888{
9889	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9890	struct hw_perf_event *hwc = &event->hw;
9891	struct hlist_head *head;
9892
9893	if (is_sampling_event(event)) {
9894		hwc->last_period = hwc->sample_period;
9895		perf_swevent_set_period(event);
9896	}
9897
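	/* PERF_HES_STOPPED is bit 0: start stopped unless PERF_EF_START was passed. */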
9898	hwc->state = !(flags & PERF_EF_START);
9899
9900	head = find_swevent_head(swhash, event);
9901	if (WARN_ON_ONCE(!head))
9902		return -EINVAL;
9903
9904	hlist_add_head_rcu(&event->hlist_entry, head);
9905	perf_event_update_userpage(event);
9906
9907	return 0;
9908}
9909
9910static void perf_swevent_del(struct perf_event *event, int flags)
9911{
9912	hlist_del_rcu(&event->hlist_entry);
9913}
9914
9915static void perf_swevent_start(struct perf_event *event, int flags)
9916{
9917	event->hw.state = 0;
9918}
9919
9920static void perf_swevent_stop(struct perf_event *event, int flags)
9921{
9922	event->hw.state = PERF_HES_STOPPED;
9923}
9924
9925/* Deref the hlist from the update side */
9926static inline struct swevent_hlist *
9927swevent_hlist_deref(struct swevent_htable *swhash)
9928{
9929	return rcu_dereference_protected(swhash->swevent_hlist,
9930					 lockdep_is_held(&swhash->hlist_mutex));
9931}
9932
9933static void swevent_hlist_release(struct swevent_htable *swhash)
9934{
9935	struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
9936
9937	if (!hlist)
9938		return;
9939
9940	RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
9941	kfree_rcu(hlist, rcu_head);
9942}
9943
9944static void swevent_hlist_put_cpu(int cpu)
9945{
9946	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9947
9948	mutex_lock(&swhash->hlist_mutex);
9949
9950	if (!--swhash->hlist_refcount)
9951		swevent_hlist_release(swhash);
9952
9953	mutex_unlock(&swhash->hlist_mutex);
9954}
9955
9956static void swevent_hlist_put(void)
9957{
9958	int cpu;
9959
9960	for_each_possible_cpu(cpu)
9961		swevent_hlist_put_cpu(cpu);
9962}
9963
9964static int swevent_hlist_get_cpu(int cpu)
9965{
9966	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9967	int err = 0;
9968
9969	mutex_lock(&swhash->hlist_mutex);
9970	if (!swevent_hlist_deref(swhash) &&
9971	    cpumask_test_cpu(cpu, perf_online_mask)) {
9972		struct swevent_hlist *hlist;
9973
9974		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
9975		if (!hlist) {
9976			err = -ENOMEM;
9977			goto exit;
9978		}
9979		rcu_assign_pointer(swhash->swevent_hlist, hlist);
9980	}
9981	swhash->hlist_refcount++;
9982exit:
9983	mutex_unlock(&swhash->hlist_mutex);
9984
9985	return err;
9986}
9987
9988static int swevent_hlist_get(void)
9989{
9990	int err, cpu, failed_cpu;
9991
9992	mutex_lock(&pmus_lock);
9993	for_each_possible_cpu(cpu) {
9994		err = swevent_hlist_get_cpu(cpu);
9995		if (err) {
9996			failed_cpu = cpu;
9997			goto fail;
9998		}
9999	}
10000	mutex_unlock(&pmus_lock);
10001	return 0;
10002fail:
10003	for_each_possible_cpu(cpu) {
10004		if (cpu == failed_cpu)
10005			break;
10006		swevent_hlist_put_cpu(cpu);
10007	}
10008	mutex_unlock(&pmus_lock);
10009	return err;
10010}
10011
10012struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
10013
10014static void sw_perf_event_destroy(struct perf_event *event)
10015{
10016	u64 event_id = event->attr.config;
10017
10018	WARN_ON(event->parent);
10019
10020	static_key_slow_dec(&perf_swevent_enabled[event_id]);
10021	swevent_hlist_put();
10022}
10023
10024static struct pmu perf_cpu_clock; /* fwd declaration */
10025static struct pmu perf_task_clock;
10026
10027static int perf_swevent_init(struct perf_event *event)
10028{
10029	u64 event_id = event->attr.config;
10030
10031	if (event->attr.type != PERF_TYPE_SOFTWARE)
10032		return -ENOENT;
10033
10034	/*
10035	 * no branch sampling for software events
10036	 */
10037	if (has_branch_stack(event))
10038		return -EOPNOTSUPP;
10039
10040	switch (event_id) {
10041	case PERF_COUNT_SW_CPU_CLOCK:
10042		event->attr.type = perf_cpu_clock.type;
10043		return -ENOENT;
10044	case PERF_COUNT_SW_TASK_CLOCK:
10045		event->attr.type = perf_task_clock.type;
10046		return -ENOENT;
10047
10048	default:
10049		break;
10050	}
10051
10052	if (event_id >= PERF_COUNT_SW_MAX)
10053		return -ENOENT;
10054
10055	if (!event->parent) {
10056		int err;
10057
10058		err = swevent_hlist_get();
10059		if (err)
10060			return err;
10061
10062		static_key_slow_inc(&perf_swevent_enabled[event_id]);
10063		event->destroy = sw_perf_event_destroy;
10064	}
10065
10066	return 0;
10067}
10068
10069static struct pmu perf_swevent = {
10070	.task_ctx_nr	= perf_sw_context,
10071
10072	.capabilities	= PERF_PMU_CAP_NO_NMI,
10073
10074	.event_init	= perf_swevent_init,
10075	.add		= perf_swevent_add,
10076	.del		= perf_swevent_del,
10077	.start		= perf_swevent_start,
10078	.stop		= perf_swevent_stop,
10079	.read		= perf_swevent_read,
10080};
10081
10082#ifdef CONFIG_EVENT_TRACING
10083
10084static void tp_perf_event_destroy(struct perf_event *event)
10085{
10086	perf_trace_destroy(event);
10087}
10088
10089static int perf_tp_event_init(struct perf_event *event)
10090{
10091	int err;
10092
10093	if (event->attr.type != PERF_TYPE_TRACEPOINT)
10094		return -ENOENT;
10095
10096	/*
10097	 * no branch sampling for tracepoint events
10098	 */
10099	if (has_branch_stack(event))
10100		return -EOPNOTSUPP;
10101
10102	err = perf_trace_init(event);
10103	if (err)
10104		return err;
10105
10106	event->destroy = tp_perf_event_destroy;
10107
10108	return 0;
10109}
10110
10111static struct pmu perf_tracepoint = {
10112	.task_ctx_nr	= perf_sw_context,
10113
10114	.event_init	= perf_tp_event_init,
10115	.add		= perf_trace_add,
10116	.del		= perf_trace_del,
10117	.start		= perf_swevent_start,
10118	.stop		= perf_swevent_stop,
10119	.read		= perf_swevent_read,
10120};
10121
10122static int perf_tp_filter_match(struct perf_event *event,
10123				struct perf_sample_data *data)
10124{
10125	void *record = data->raw->frag.data;
10126
10127	/* only top level events have filters set */
10128	if (event->parent)
10129		event = event->parent;
10130
10131	if (likely(!event->filter) || filter_match_preds(event->filter, record))
10132		return 1;
10133	return 0;
10134}
10135
10136static int perf_tp_event_match(struct perf_event *event,
10137				struct perf_sample_data *data,
10138				struct pt_regs *regs)
10139{
10140	if (event->hw.state & PERF_HES_STOPPED)
10141		return 0;
10142	/*
10143	 * If exclude_kernel, only trace user-space tracepoints (uprobes)
10144	 */
10145	if (event->attr.exclude_kernel && !user_mode(regs))
10146		return 0;
10147
10148	if (!perf_tp_filter_match(event, data))
10149		return 0;
10150
10151	return 1;
10152}
10153
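/*
 * Entry point for tracepoints with a BPF program attached: the program sees
 * the raw record first and may filter the event out. If it does, or if no
 * perf events hang off this CPU's hlist, the recursion context is released
 * and nothing is emitted; otherwise the record is handed to perf_tp_event().
 */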
10154void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
10155			       struct trace_event_call *call, u64 count,
10156			       struct pt_regs *regs, struct hlist_head *head,
10157			       struct task_struct *task)
10158{
10159	if (bpf_prog_array_valid(call)) {
10160		*(struct pt_regs **)raw_data = regs;
10161		if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
10162			perf_swevent_put_recursion_context(rctx);
10163			return;
10164		}
10165	}
10166	perf_tp_event(call->event.type, count, raw_data, size, regs, head,
10167		      rctx, task);
10168}
10169EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
10170
10171static void __perf_tp_event_target_task(u64 count, void *record,
10172					struct pt_regs *regs,
10173					struct perf_sample_data *data,
10174					struct perf_event *event)
10175{
10176	struct trace_entry *entry = record;
10177
10178	if (event->attr.config != entry->type)
10179		return;
	/* Cannot deliver a synchronous signal to another task. */
10181	if (event->attr.sigtrap)
10182		return;
10183	if (perf_tp_event_match(event, data, regs))
10184		perf_swevent_event(event, count, data, regs);
10185}
10186
10187static void perf_tp_event_target_task(u64 count, void *record,
10188				      struct pt_regs *regs,
10189				      struct perf_sample_data *data,
10190				      struct perf_event_context *ctx)
10191{
10192	unsigned int cpu = smp_processor_id();
10193	struct pmu *pmu = &perf_tracepoint;
10194	struct perf_event *event, *sibling;
10195
10196	perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
10197		__perf_tp_event_target_task(count, record, regs, data, event);
10198		for_each_sibling_event(sibling, event)
10199			__perf_tp_event_target_task(count, record, regs, data, sibling);
10200	}
10201
10202	perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
10203		__perf_tp_event_target_task(count, record, regs, data, event);
10204		for_each_sibling_event(sibling, event)
10205			__perf_tp_event_target_task(count, record, regs, data, sibling);
10206	}
10207}
10208
10209void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
10210		   struct pt_regs *regs, struct hlist_head *head, int rctx,
10211		   struct task_struct *task)
10212{
10213	struct perf_sample_data data;
10214	struct perf_event *event;
10215
10216	struct perf_raw_record raw = {
10217		.frag = {
10218			.size = entry_size,
10219			.data = record,
10220		},
10221	};
10222
10223	perf_sample_data_init(&data, 0, 0);
10224	perf_sample_save_raw_data(&data, &raw);
10225
10226	perf_trace_buf_update(record, event_type);
10227
10228	hlist_for_each_entry_rcu(event, head, hlist_entry) {
10229		if (perf_tp_event_match(event, &data, regs)) {
10230			perf_swevent_event(event, count, &data, regs);
10231
			/*
			 * We reuse the same on-stack perf_sample_data here,
			 * but some of its members are event-specific and
			 * need to be re-computed for different swevents.
			 * Re-initialize data->sample_flags to avoid the next
			 * event skipping sample preparation because
			 * data->sample_flags is already set.
			 */
10240			perf_sample_data_init(&data, 0, 0);
10241			perf_sample_save_raw_data(&data, &raw);
10242		}
10243	}
10244
	/*
	 * If we were given a target task, also iterate its context and
	 * deliver this event there too.
	 */
10249	if (task && task != current) {
10250		struct perf_event_context *ctx;
10251
10252		rcu_read_lock();
10253		ctx = rcu_dereference(task->perf_event_ctxp);
10254		if (!ctx)
10255			goto unlock;
10256
10257		raw_spin_lock(&ctx->lock);
10258		perf_tp_event_target_task(count, record, regs, &data, ctx);
10259		raw_spin_unlock(&ctx->lock);
10260unlock:
10261		rcu_read_unlock();
10262	}
10263
10264	perf_swevent_put_recursion_context(rctx);
10265}
10266EXPORT_SYMBOL_GPL(perf_tp_event);
10267
10268#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
/*
 * Flags in config, used by the dynamic kprobe and uprobe PMUs.
 * The flags should match the following PMU_FORMAT_ATTR().
 *
 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
 *                               if not set, create kprobe/uprobe
 *
 * The following values specify a reference counter (or semaphore in the
 * terminology of tools like dtrace, systemtap, etc.) for Userspace
 * Statically Defined Tracepoints (USDT). Currently, we use 32 bits for
 * the offset.
 *
 * PERF_UPROBE_REF_CTR_OFFSET_BITS	# of bits in config used for the offset
 * PERF_UPROBE_REF_CTR_OFFSET_SHIFT	# of bits to shift left
 */
10283enum perf_probe_config {
10284	PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
10285	PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
10286	PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
10287};
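
/*
 * For example, to request a uretprobe whose USDT semaphore sits at
 * ref_ctr_offset bytes into the target object, userspace would (as an
 * illustrative sketch, ref_ctr_offset being a caller-chosen value) encode:
 *
 *	attr.config = PERF_PROBE_CONFIG_IS_RETPROBE |
 *		      ((__u64)ref_ctr_offset << PERF_UPROBE_REF_CTR_OFFSET_SHIFT);
 */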
10288
10289PMU_FORMAT_ATTR(retprobe, "config:0");
10290#endif
10291
10292#ifdef CONFIG_KPROBE_EVENTS
10293static struct attribute *kprobe_attrs[] = {
10294	&format_attr_retprobe.attr,
10295	NULL,
10296};
10297
10298static struct attribute_group kprobe_format_group = {
10299	.name = "format",
10300	.attrs = kprobe_attrs,
10301};
10302
10303static const struct attribute_group *kprobe_attr_groups[] = {
10304	&kprobe_format_group,
10305	NULL,
10306};
10307
10308static int perf_kprobe_event_init(struct perf_event *event);
10309static struct pmu perf_kprobe = {
10310	.task_ctx_nr	= perf_sw_context,
10311	.event_init	= perf_kprobe_event_init,
10312	.add		= perf_trace_add,
10313	.del		= perf_trace_del,
10314	.start		= perf_swevent_start,
10315	.stop		= perf_swevent_stop,
10316	.read		= perf_swevent_read,
10317	.attr_groups	= kprobe_attr_groups,
10318};
10319
10320static int perf_kprobe_event_init(struct perf_event *event)
10321{
10322	int err;
10323	bool is_retprobe;
10324
10325	if (event->attr.type != perf_kprobe.type)
10326		return -ENOENT;
10327
10328	if (!perfmon_capable())
10329		return -EACCES;
10330
10331	/*
10332	 * no branch sampling for probe events
10333	 */
10334	if (has_branch_stack(event))
10335		return -EOPNOTSUPP;
10336
10337	is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
10338	err = perf_kprobe_init(event, is_retprobe);
10339	if (err)
10340		return err;
10341
10342	event->destroy = perf_kprobe_destroy;
10343
10344	return 0;
10345}
10346#endif /* CONFIG_KPROBE_EVENTS */
10347
10348#ifdef CONFIG_UPROBE_EVENTS
10349PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
10350
10351static struct attribute *uprobe_attrs[] = {
10352	&format_attr_retprobe.attr,
10353	&format_attr_ref_ctr_offset.attr,
10354	NULL,
10355};
10356
10357static struct attribute_group uprobe_format_group = {
10358	.name = "format",
10359	.attrs = uprobe_attrs,
10360};
10361
10362static const struct attribute_group *uprobe_attr_groups[] = {
10363	&uprobe_format_group,
10364	NULL,
10365};
10366
10367static int perf_uprobe_event_init(struct perf_event *event);
10368static struct pmu perf_uprobe = {
10369	.task_ctx_nr	= perf_sw_context,
10370	.event_init	= perf_uprobe_event_init,
10371	.add		= perf_trace_add,
10372	.del		= perf_trace_del,
10373	.start		= perf_swevent_start,
10374	.stop		= perf_swevent_stop,
10375	.read		= perf_swevent_read,
10376	.attr_groups	= uprobe_attr_groups,
10377};
10378
10379static int perf_uprobe_event_init(struct perf_event *event)
10380{
10381	int err;
10382	unsigned long ref_ctr_offset;
10383	bool is_retprobe;
10384
10385	if (event->attr.type != perf_uprobe.type)
10386		return -ENOENT;
10387
10388	if (!perfmon_capable())
10389		return -EACCES;
10390
10391	/*
10392	 * no branch sampling for probe events
10393	 */
10394	if (has_branch_stack(event))
10395		return -EOPNOTSUPP;
10396
10397	is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
10398	ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
10399	err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
10400	if (err)
10401		return err;
10402
10403	event->destroy = perf_uprobe_destroy;
10404
10405	return 0;
10406}
10407#endif /* CONFIG_UPROBE_EVENTS */
10408
10409static inline void perf_tp_register(void)
10410{
10411	perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
10412#ifdef CONFIG_KPROBE_EVENTS
10413	perf_pmu_register(&perf_kprobe, "kprobe", -1);
10414#endif
10415#ifdef CONFIG_UPROBE_EVENTS
10416	perf_pmu_register(&perf_uprobe, "uprobe", -1);
10417#endif
10418}
10419
10420static void perf_event_free_filter(struct perf_event *event)
10421{
10422	ftrace_profile_free_filter(event);
10423}
10424
10425#ifdef CONFIG_BPF_SYSCALL
10426static void bpf_overflow_handler(struct perf_event *event,
10427				 struct perf_sample_data *data,
10428				 struct pt_regs *regs)
10429{
10430	struct bpf_perf_event_data_kern ctx = {
10431		.data = data,
10432		.event = event,
10433	};
10434	struct bpf_prog *prog;
10435	int ret = 0;
10436
10437	ctx.regs = perf_arch_bpf_user_pt_regs(regs);
10438	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
10439		goto out;
10440	rcu_read_lock();
10441	prog = READ_ONCE(event->prog);
10442	if (prog) {
10443		perf_prepare_sample(data, event, regs);
10444		ret = bpf_prog_run(prog, &ctx);
10445	}
10446	rcu_read_unlock();
10447out:
10448	__this_cpu_dec(bpf_prog_active);
10449	if (!ret)
10450		return;
10451
10452	event->orig_overflow_handler(event, data, regs);
10453}
10454
10455static int perf_event_set_bpf_handler(struct perf_event *event,
10456				      struct bpf_prog *prog,
10457				      u64 bpf_cookie)
10458{
10459	if (event->overflow_handler_context)
10460		/* hw breakpoint or kernel counter */
10461		return -EINVAL;
10462
10463	if (event->prog)
10464		return -EEXIST;
10465
10466	if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
10467		return -EINVAL;
10468
10469	if (event->attr.precise_ip &&
10470	    prog->call_get_stack &&
10471	    (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
10472	     event->attr.exclude_callchain_kernel ||
10473	     event->attr.exclude_callchain_user)) {
10474		/*
10475		 * On perf_event with precise_ip, calling bpf_get_stack()
10476		 * may trigger unwinder warnings and occasional crashes.
		 * bpf_get_[stack|stackid] works around this issue by using
		 * the callchain attached to perf_sample_data. If the
		 * perf_event does not have a full (kernel and user) callchain
		 * attached to perf_sample_data, do not allow attaching a BPF
		 * program that calls bpf_get_[stack|stackid].
10482		 */
10483		return -EPROTO;
10484	}
10485
10486	event->prog = prog;
10487	event->bpf_cookie = bpf_cookie;
10488	event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
10489	WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
10490	return 0;
10491}
10492
10493static void perf_event_free_bpf_handler(struct perf_event *event)
10494{
10495	struct bpf_prog *prog = event->prog;
10496
10497	if (!prog)
10498		return;
10499
10500	WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
10501	event->prog = NULL;
10502	bpf_prog_put(prog);
10503}
10504#else
10505static int perf_event_set_bpf_handler(struct perf_event *event,
10506				      struct bpf_prog *prog,
10507				      u64 bpf_cookie)
10508{
10509	return -EOPNOTSUPP;
10510}
10511static void perf_event_free_bpf_handler(struct perf_event *event)
10512{
10513}
10514#endif
10515
10516/*
 * Returns true if the event is a tracepoint, or a kprobe/uprobe created
10518 * with perf_event_open()
10519 */
10520static inline bool perf_event_is_tracing(struct perf_event *event)
10521{
10522	if (event->pmu == &perf_tracepoint)
10523		return true;
10524#ifdef CONFIG_KPROBE_EVENTS
10525	if (event->pmu == &perf_kprobe)
10526		return true;
10527#endif
10528#ifdef CONFIG_UPROBE_EVENTS
10529	if (event->pmu == &perf_uprobe)
10530		return true;
10531#endif
10532	return false;
10533}
10534
10535int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
10536			    u64 bpf_cookie)
10537{
10538	bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
10539
10540	if (!perf_event_is_tracing(event))
10541		return perf_event_set_bpf_handler(event, prog, bpf_cookie);
10542
10543	is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
10544	is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
10545	is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
10546	is_syscall_tp = is_syscall_trace_event(event->tp_event);
10547	if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp)
10548		/* bpf programs can only be attached to u/kprobe or tracepoint */
10549		return -EINVAL;
10550
10551	if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) ||
10552	    (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
10553	    (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
10554		return -EINVAL;
10555
10556	if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe)
10557		/* only uprobe programs are allowed to be sleepable */
10558		return -EINVAL;
10559
10560	/* Kprobe override only works for kprobes, not uprobes. */
10561	if (prog->kprobe_override && !is_kprobe)
10562		return -EINVAL;
10563
10564	if (is_tracepoint || is_syscall_tp) {
10565		int off = trace_event_get_offsets(event->tp_event);
10566
10567		if (prog->aux->max_ctx_offset > off)
10568			return -EACCES;
10569	}
10570
10571	return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
10572}
10573
10574void perf_event_free_bpf_prog(struct perf_event *event)
10575{
10576	if (!perf_event_is_tracing(event)) {
10577		perf_event_free_bpf_handler(event);
10578		return;
10579	}
10580	perf_event_detach_bpf_prog(event);
10581}
10582
10583#else
10584
10585static inline void perf_tp_register(void)
10586{
10587}
10588
10589static void perf_event_free_filter(struct perf_event *event)
10590{
10591}
10592
10593int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
10594			    u64 bpf_cookie)
10595{
10596	return -ENOENT;
10597}
10598
10599void perf_event_free_bpf_prog(struct perf_event *event)
10600{
10601}
10602#endif /* CONFIG_EVENT_TRACING */
10603
10604#ifdef CONFIG_HAVE_HW_BREAKPOINT
10605void perf_bp_event(struct perf_event *bp, void *data)
10606{
10607	struct perf_sample_data sample;
10608	struct pt_regs *regs = data;
10609
10610	perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
10611
10612	if (!bp->hw.state && !perf_exclude_event(bp, regs))
10613		perf_swevent_event(bp, 1, &sample, regs);
10614}
10615#endif
10616
10617/*
10618 * Allocate a new address filter
10619 */
10620static struct perf_addr_filter *
10621perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
10622{
10623	int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
10624	struct perf_addr_filter *filter;
10625
10626	filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
10627	if (!filter)
10628		return NULL;
10629
10630	INIT_LIST_HEAD(&filter->entry);
10631	list_add_tail(&filter->entry, filters);
10632
10633	return filter;
10634}
10635
10636static void free_filters_list(struct list_head *filters)
10637{
10638	struct perf_addr_filter *filter, *iter;
10639
10640	list_for_each_entry_safe(filter, iter, filters, entry) {
10641		path_put(&filter->path);
10642		list_del(&filter->entry);
10643		kfree(filter);
10644	}
10645}
10646
10647/*
10648 * Free existing address filters and optionally install new ones
10649 */
10650static void perf_addr_filters_splice(struct perf_event *event,
10651				     struct list_head *head)
10652{
10653	unsigned long flags;
10654	LIST_HEAD(list);
10655
10656	if (!has_addr_filter(event))
10657		return;
10658
10659	/* don't bother with children, they don't have their own filters */
10660	if (event->parent)
10661		return;
10662
10663	raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
10664
10665	list_splice_init(&event->addr_filters.list, &list);
10666	if (head)
10667		list_splice(head, &event->addr_filters.list);
10668
10669	raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
10670
10671	free_filters_list(&list);
10672}
10673
10674/*
10675 * Scan through mm's vmas and see if one of them matches the
10676 * @filter; if so, adjust filter's address range.
10677 * Called with mm::mmap_lock down for reading.
10678 */
10679static void perf_addr_filter_apply(struct perf_addr_filter *filter,
10680				   struct mm_struct *mm,
10681				   struct perf_addr_filter_range *fr)
10682{
10683	struct vm_area_struct *vma;
10684	VMA_ITERATOR(vmi, mm, 0);
10685
10686	for_each_vma(vmi, vma) {
10687		if (!vma->vm_file)
10688			continue;
10689
10690		if (perf_addr_filter_vma_adjust(filter, vma, fr))
10691			return;
10692	}
10693}
10694
10695/*
10696 * Update event's address range filters based on the
10697 * task's existing mappings, if any.
10698 */
10699static void perf_event_addr_filters_apply(struct perf_event *event)
10700{
10701	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
10702	struct task_struct *task = READ_ONCE(event->ctx->task);
10703	struct perf_addr_filter *filter;
10704	struct mm_struct *mm = NULL;
10705	unsigned int count = 0;
10706	unsigned long flags;
10707
10708	/*
10709	 * We may observe TASK_TOMBSTONE, which means that the event tear-down
	 * will stop on the parent's child_mutex that our caller is also holding.
10711	 */
10712	if (task == TASK_TOMBSTONE)
10713		return;
10714
10715	if (ifh->nr_file_filters) {
10716		mm = get_task_mm(task);
10717		if (!mm)
10718			goto restart;
10719
10720		mmap_read_lock(mm);
10721	}
10722
10723	raw_spin_lock_irqsave(&ifh->lock, flags);
10724	list_for_each_entry(filter, &ifh->list, entry) {
10725		if (filter->path.dentry) {
10726			/*
			 * Adjust base offset if the filter is associated with a
10728			 * binary that needs to be mapped:
10729			 */
10730			event->addr_filter_ranges[count].start = 0;
10731			event->addr_filter_ranges[count].size = 0;
10732
10733			perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
10734		} else {
10735			event->addr_filter_ranges[count].start = filter->offset;
10736			event->addr_filter_ranges[count].size  = filter->size;
10737		}
10738
10739		count++;
10740	}
10741
10742	event->addr_filters_gen++;
10743	raw_spin_unlock_irqrestore(&ifh->lock, flags);
10744
10745	if (ifh->nr_file_filters) {
10746		mmap_read_unlock(mm);
10747
10748		mmput(mm);
10749	}
10750
10751restart:
10752	perf_event_stop(event, 1);
10753}
10754
10755/*
10756 * Address range filtering: limiting the data to certain
10757 * instruction address ranges. Filters are ioctl()ed to us from
 * userspace as ASCII strings.
10759 *
10760 * Filter string format:
10761 *
10762 * ACTION RANGE_SPEC
 * where ACTION is one of:
10764 *  * "filter": limit the trace to this region
10765 *  * "start": start tracing from this address
10766 *  * "stop": stop tracing at this address/region;
10767 * RANGE_SPEC is
10768 *  * for kernel addresses: <start address>[/<size>]
10769 *  * for object files:     <start address>[/<size>]@</path/to/object/file>
10770 *
10771 * if <size> is not specified or is zero, the range is treated as a single
10772 * address; not valid for ACTION=="filter".
10773 */
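
/*
 * For example (addresses and path below are purely illustrative):
 *
 *	filter 0x42000/0x1000@/usr/lib/libfoo.so	trace only this range of the object
 *	start 0xffffffff81000000			start tracing at this kernel address
 *	stop 0xffffffff81200000				stop tracing at this kernel address
 */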
10774enum {
10775	IF_ACT_NONE = -1,
10776	IF_ACT_FILTER,
10777	IF_ACT_START,
10778	IF_ACT_STOP,
10779	IF_SRC_FILE,
10780	IF_SRC_KERNEL,
10781	IF_SRC_FILEADDR,
10782	IF_SRC_KERNELADDR,
10783};
10784
10785enum {
10786	IF_STATE_ACTION = 0,
10787	IF_STATE_SOURCE,
10788	IF_STATE_END,
10789};
10790
10791static const match_table_t if_tokens = {
10792	{ IF_ACT_FILTER,	"filter" },
10793	{ IF_ACT_START,		"start" },
10794	{ IF_ACT_STOP,		"stop" },
10795	{ IF_SRC_FILE,		"%u/%u@%s" },
10796	{ IF_SRC_KERNEL,	"%u/%u" },
10797	{ IF_SRC_FILEADDR,	"%u@%s" },
10798	{ IF_SRC_KERNELADDR,	"%u" },
10799	{ IF_ACT_NONE,		NULL },
10800};
10801
10802/*
10803 * Address filter string parser
10804 */
10805static int
10806perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
10807			     struct list_head *filters)
10808{
10809	struct perf_addr_filter *filter = NULL;
10810	char *start, *orig, *filename = NULL;
10811	substring_t args[MAX_OPT_ARGS];
10812	int state = IF_STATE_ACTION, token;
10813	unsigned int kernel = 0;
10814	int ret = -EINVAL;
10815
10816	orig = fstr = kstrdup(fstr, GFP_KERNEL);
10817	if (!fstr)
10818		return -ENOMEM;
10819
10820	while ((start = strsep(&fstr, " ,\n")) != NULL) {
10821		static const enum perf_addr_filter_action_t actions[] = {
10822			[IF_ACT_FILTER]	= PERF_ADDR_FILTER_ACTION_FILTER,
10823			[IF_ACT_START]	= PERF_ADDR_FILTER_ACTION_START,
10824			[IF_ACT_STOP]	= PERF_ADDR_FILTER_ACTION_STOP,
10825		};
10826		ret = -EINVAL;
10827
10828		if (!*start)
10829			continue;
10830
10831		/* filter definition begins */
10832		if (state == IF_STATE_ACTION) {
10833			filter = perf_addr_filter_new(event, filters);
10834			if (!filter)
10835				goto fail;
10836		}
10837
10838		token = match_token(start, if_tokens, args);
10839		switch (token) {
10840		case IF_ACT_FILTER:
10841		case IF_ACT_START:
10842		case IF_ACT_STOP:
10843			if (state != IF_STATE_ACTION)
10844				goto fail;
10845
10846			filter->action = actions[token];
10847			state = IF_STATE_SOURCE;
10848			break;
10849
10850		case IF_SRC_KERNELADDR:
10851		case IF_SRC_KERNEL:
10852			kernel = 1;
10853			fallthrough;
10854
10855		case IF_SRC_FILEADDR:
10856		case IF_SRC_FILE:
10857			if (state != IF_STATE_SOURCE)
10858				goto fail;
10859
10860			*args[0].to = 0;
10861			ret = kstrtoul(args[0].from, 0, &filter->offset);
10862			if (ret)
10863				goto fail;
10864
10865			if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
10866				*args[1].to = 0;
10867				ret = kstrtoul(args[1].from, 0, &filter->size);
10868				if (ret)
10869					goto fail;
10870			}
10871
10872			if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
10873				int fpos = token == IF_SRC_FILE ? 2 : 1;
10874
10875				kfree(filename);
10876				filename = match_strdup(&args[fpos]);
10877				if (!filename) {
10878					ret = -ENOMEM;
10879					goto fail;
10880				}
10881			}
10882
10883			state = IF_STATE_END;
10884			break;
10885
10886		default:
10887			goto fail;
10888		}
10889
10890		/*
10891		 * Filter definition is fully parsed, validate and install it.
10892		 * Make sure that it doesn't contradict itself or the event's
10893		 * attribute.
10894		 */
10895		if (state == IF_STATE_END) {
10896			ret = -EINVAL;
10897
10898			/*
10899			 * ACTION "filter" must have a non-zero length region
10900			 * specified.
10901			 */
10902			if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
10903			    !filter->size)
10904				goto fail;
10905
10906			if (!kernel) {
10907				if (!filename)
10908					goto fail;
10909
10910				/*
10911				 * For now, we only support file-based filters
10912				 * in per-task events; doing so for CPU-wide
10913				 * events requires additional context switching
10914				 * trickery, since same object code will be
10915				 * mapped at different virtual addresses in
10916				 * different processes.
10917				 */
10918				ret = -EOPNOTSUPP;
10919				if (!event->ctx->task)
10920					goto fail;
10921
10922				/* look up the path and grab its inode */
10923				ret = kern_path(filename, LOOKUP_FOLLOW,
10924						&filter->path);
10925				if (ret)
10926					goto fail;
10927
10928				ret = -EINVAL;
10929				if (!filter->path.dentry ||
10930				    !S_ISREG(d_inode(filter->path.dentry)
10931					     ->i_mode))
10932					goto fail;
10933
10934				event->addr_filters.nr_file_filters++;
10935			}
10936
10937			/* ready to consume more filters */
10938			kfree(filename);
10939			filename = NULL;
10940			state = IF_STATE_ACTION;
10941			filter = NULL;
10942			kernel = 0;
10943		}
10944	}
10945
10946	if (state != IF_STATE_ACTION)
10947		goto fail;
10948
10949	kfree(filename);
10950	kfree(orig);
10951
10952	return 0;
10953
10954fail:
10955	kfree(filename);
10956	free_filters_list(filters);
10957	kfree(orig);
10958
10959	return ret;
10960}
10961
10962static int
10963perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
10964{
10965	LIST_HEAD(filters);
10966	int ret;
10967
10968	/*
	 * Since this is called in the perf_ioctl() path, we're already holding
10970	 * ctx::mutex.
10971	 */
10972	lockdep_assert_held(&event->ctx->mutex);
10973
10974	if (WARN_ON_ONCE(event->parent))
10975		return -EINVAL;
10976
10977	ret = perf_event_parse_addr_filter(event, filter_str, &filters);
10978	if (ret)
10979		goto fail_clear_files;
10980
10981	ret = event->pmu->addr_filters_validate(&filters);
10982	if (ret)
10983		goto fail_free_filters;
10984
10985	/* remove existing filters, if any */
10986	perf_addr_filters_splice(event, &filters);
10987
10988	/* install new filters */
10989	perf_event_for_each_child(event, perf_event_addr_filters_apply);
10990
10991	return ret;
10992
10993fail_free_filters:
10994	free_filters_list(&filters);
10995
10996fail_clear_files:
10997	event->addr_filters.nr_file_filters = 0;
10998
10999	return ret;
11000}
11001
11002static int perf_event_set_filter(struct perf_event *event, void __user *arg)
11003{
11004	int ret = -EINVAL;
11005	char *filter_str;
11006
11007	filter_str = strndup_user(arg, PAGE_SIZE);
11008	if (IS_ERR(filter_str))
11009		return PTR_ERR(filter_str);
11010
11011#ifdef CONFIG_EVENT_TRACING
11012	if (perf_event_is_tracing(event)) {
11013		struct perf_event_context *ctx = event->ctx;
11014
11015		/*
11016		 * Beware, here be dragons!!
11017		 *
		 * The tracepoint muck will deadlock against ctx->mutex, but
		 * the tracepoint stuff does not actually need it. So
		 * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
		 * already have a reference on ctx.
		 *
		 * This can result in the event getting moved to a different ctx,
11024		 * but that does not affect the tracepoint state.
11025		 */
11026		mutex_unlock(&ctx->mutex);
11027		ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
11028		mutex_lock(&ctx->mutex);
11029	} else
11030#endif
11031	if (has_addr_filter(event))
11032		ret = perf_event_set_addr_filter(event, filter_str);
11033
11034	kfree(filter_str);
11035	return ret;
11036}
11037
11038/*
11039 * hrtimer based swevent callback
11040 */
11041
11042static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
11043{
11044	enum hrtimer_restart ret = HRTIMER_RESTART;
11045	struct perf_sample_data data;
11046	struct pt_regs *regs;
11047	struct perf_event *event;
11048	u64 period;
11049
11050	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
11051
11052	if (event->state != PERF_EVENT_STATE_ACTIVE)
11053		return HRTIMER_NORESTART;
11054
11055	event->pmu->read(event);
11056
11057	perf_sample_data_init(&data, 0, event->hw.last_period);
11058	regs = get_irq_regs();
11059
11060	if (regs && !perf_exclude_event(event, regs)) {
11061		if (!(event->attr.exclude_idle && is_idle_task(current)))
11062			if (__perf_event_overflow(event, 1, &data, regs))
11063				ret = HRTIMER_NORESTART;
11064	}
11065
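	/*
	 * Keep at least a 10 us period (the value here is in nanoseconds) so
	 * that a tiny or stale sample_period cannot re-arm the hrtimer at an
	 * excessive rate.
	 */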
11066	period = max_t(u64, 10000, event->hw.sample_period);
11067	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
11068
11069	return ret;
11070}
11071
11072static void perf_swevent_start_hrtimer(struct perf_event *event)
11073{
11074	struct hw_perf_event *hwc = &event->hw;
11075	s64 period;
11076
11077	if (!is_sampling_event(event))
11078		return;
11079
11080	period = local64_read(&hwc->period_left);
11081	if (period) {
11082		if (period < 0)
11083			period = 10000;
11084
11085		local64_set(&hwc->period_left, 0);
11086	} else {
11087		period = max_t(u64, 10000, hwc->sample_period);
11088	}
11089	hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
11090		      HRTIMER_MODE_REL_PINNED_HARD);
11091}
11092
11093static void perf_swevent_cancel_hrtimer(struct perf_event *event)
11094{
11095	struct hw_perf_event *hwc = &event->hw;
11096
11097	if (is_sampling_event(event)) {
11098		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
11099		local64_set(&hwc->period_left, ktime_to_ns(remaining));
11100
11101		hrtimer_cancel(&hwc->hrtimer);
11102	}
11103}
11104
11105static void perf_swevent_init_hrtimer(struct perf_event *event)
11106{
11107	struct hw_perf_event *hwc = &event->hw;
11108
11109	if (!is_sampling_event(event))
11110		return;
11111
11112	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
11113	hwc->hrtimer.function = perf_swevent_hrtimer;
11114
11115	/*
11116	 * Since hrtimers have a fixed rate, we can do a static freq->period
11117	 * mapping and avoid the whole period adjust feedback stuff.
11118	 */
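	/*
	 * E.g. attr.sample_freq == 1000 below yields a fixed period of
	 * NSEC_PER_SEC / 1000 == 1,000,000 ns between samples.
	 */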
11119	if (event->attr.freq) {
11120		long freq = event->attr.sample_freq;
11121
11122		event->attr.sample_period = NSEC_PER_SEC / freq;
11123		hwc->sample_period = event->attr.sample_period;
11124		local64_set(&hwc->period_left, hwc->sample_period);
11125		hwc->last_period = hwc->sample_period;
11126		event->attr.freq = 0;
11127	}
11128}
11129
11130/*
11131 * Software event: cpu wall time clock
11132 */
11133
11134static void cpu_clock_event_update(struct perf_event *event)
11135{
11136	s64 prev;
11137	u64 now;
11138
11139	now = local_clock();
11140	prev = local64_xchg(&event->hw.prev_count, now);
11141	local64_add(now - prev, &event->count);
11142}
11143
11144static void cpu_clock_event_start(struct perf_event *event, int flags)
11145{
11146	local64_set(&event->hw.prev_count, local_clock());
11147	perf_swevent_start_hrtimer(event);
11148}
11149
11150static void cpu_clock_event_stop(struct perf_event *event, int flags)
11151{
11152	perf_swevent_cancel_hrtimer(event);
11153	cpu_clock_event_update(event);
11154}
11155
11156static int cpu_clock_event_add(struct perf_event *event, int flags)
11157{
11158	if (flags & PERF_EF_START)
11159		cpu_clock_event_start(event, flags);
11160	perf_event_update_userpage(event);
11161
11162	return 0;
11163}
11164
11165static void cpu_clock_event_del(struct perf_event *event, int flags)
11166{
11167	cpu_clock_event_stop(event, flags);
11168}
11169
11170static void cpu_clock_event_read(struct perf_event *event)
11171{
11172	cpu_clock_event_update(event);
11173}
11174
11175static int cpu_clock_event_init(struct perf_event *event)
11176{
11177	if (event->attr.type != perf_cpu_clock.type)
11178		return -ENOENT;
11179
11180	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
11181		return -ENOENT;
11182
11183	/*
11184	 * no branch sampling for software events
11185	 */
11186	if (has_branch_stack(event))
11187		return -EOPNOTSUPP;
11188
11189	perf_swevent_init_hrtimer(event);
11190
11191	return 0;
11192}
11193
11194static struct pmu perf_cpu_clock = {
11195	.task_ctx_nr	= perf_sw_context,
11196
11197	.capabilities	= PERF_PMU_CAP_NO_NMI,
11198	.dev		= PMU_NULL_DEV,
11199
11200	.event_init	= cpu_clock_event_init,
11201	.add		= cpu_clock_event_add,
11202	.del		= cpu_clock_event_del,
11203	.start		= cpu_clock_event_start,
11204	.stop		= cpu_clock_event_stop,
11205	.read		= cpu_clock_event_read,
11206};
11207
11208/*
11209 * Software event: task time clock
11210 */
11211
11212static void task_clock_event_update(struct perf_event *event, u64 now)
11213{
11214	u64 prev;
11215	s64 delta;
11216
11217	prev = local64_xchg(&event->hw.prev_count, now);
11218	delta = now - prev;
11219	local64_add(delta, &event->count);
11220}
11221
11222static void task_clock_event_start(struct perf_event *event, int flags)
11223{
11224	local64_set(&event->hw.prev_count, event->ctx->time);
11225	perf_swevent_start_hrtimer(event);
11226}
11227
11228static void task_clock_event_stop(struct perf_event *event, int flags)
11229{
11230	perf_swevent_cancel_hrtimer(event);
11231	task_clock_event_update(event, event->ctx->time);
11232}
11233
11234static int task_clock_event_add(struct perf_event *event, int flags)
11235{
11236	if (flags & PERF_EF_START)
11237		task_clock_event_start(event, flags);
11238	perf_event_update_userpage(event);
11239
11240	return 0;
11241}
11242
11243static void task_clock_event_del(struct perf_event *event, int flags)
11244{
11245	task_clock_event_stop(event, PERF_EF_UPDATE);
11246}
11247
11248static void task_clock_event_read(struct perf_event *event)
11249{
11250	u64 now = perf_clock();
11251	u64 delta = now - event->ctx->timestamp;
11252	u64 time = event->ctx->time + delta;
11253
11254	task_clock_event_update(event, time);
11255}
11256
11257static int task_clock_event_init(struct perf_event *event)
11258{
11259	if (event->attr.type != perf_task_clock.type)
11260		return -ENOENT;
11261
11262	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
11263		return -ENOENT;
11264
11265	/*
11266	 * no branch sampling for software events
11267	 */
11268	if (has_branch_stack(event))
11269		return -EOPNOTSUPP;
11270
11271	perf_swevent_init_hrtimer(event);
11272
11273	return 0;
11274}
11275
11276static struct pmu perf_task_clock = {
11277	.task_ctx_nr	= perf_sw_context,
11278
11279	.capabilities	= PERF_PMU_CAP_NO_NMI,
11280	.dev		= PMU_NULL_DEV,
11281
11282	.event_init	= task_clock_event_init,
11283	.add		= task_clock_event_add,
11284	.del		= task_clock_event_del,
11285	.start		= task_clock_event_start,
11286	.stop		= task_clock_event_stop,
11287	.read		= task_clock_event_read,
11288};
11289
11290static void perf_pmu_nop_void(struct pmu *pmu)
11291{
11292}
11293
11294static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
11295{
11296}
11297
11298static int perf_pmu_nop_int(struct pmu *pmu)
11299{
11300	return 0;
11301}
11302
11303static int perf_event_nop_int(struct perf_event *event, u64 value)
11304{
11305	return 0;
11306}
11307
11308static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
11309
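/*
 * Default transaction stubs, installed by perf_pmu_register() for PMUs that
 * provide pmu_enable()/pmu_disable() but no transaction handling of their
 * own: only PERF_PMU_TXN_ADD transactions are batched by disabling the PMU
 * for their duration; any other transaction type just records the flags and
 * falls through.
 */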
11310static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
11311{
11312	__this_cpu_write(nop_txn_flags, flags);
11313
11314	if (flags & ~PERF_PMU_TXN_ADD)
11315		return;
11316
11317	perf_pmu_disable(pmu);
11318}
11319
11320static int perf_pmu_commit_txn(struct pmu *pmu)
11321{
11322	unsigned int flags = __this_cpu_read(nop_txn_flags);
11323
11324	__this_cpu_write(nop_txn_flags, 0);
11325
11326	if (flags & ~PERF_PMU_TXN_ADD)
11327		return 0;
11328
11329	perf_pmu_enable(pmu);
11330	return 0;
11331}
11332
11333static void perf_pmu_cancel_txn(struct pmu *pmu)
11334{
11335	unsigned int flags =  __this_cpu_read(nop_txn_flags);
11336
11337	__this_cpu_write(nop_txn_flags, 0);
11338
11339	if (flags & ~PERF_PMU_TXN_ADD)
11340		return;
11341
11342	perf_pmu_enable(pmu);
11343}
11344
11345static int perf_event_idx_default(struct perf_event *event)
11346{
11347	return 0;
11348}
11349
11350static void free_pmu_context(struct pmu *pmu)
11351{
11352	free_percpu(pmu->cpu_pmu_context);
11353}
11354
11355/*
11356 * Let userspace know that this PMU supports address range filtering:
11357 */
11358static ssize_t nr_addr_filters_show(struct device *dev,
11359				    struct device_attribute *attr,
11360				    char *page)
11361{
11362	struct pmu *pmu = dev_get_drvdata(dev);
11363
11364	return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
11365}
11366DEVICE_ATTR_RO(nr_addr_filters);
11367
11368static struct idr pmu_idr;
11369
11370static ssize_t
11371type_show(struct device *dev, struct device_attribute *attr, char *page)
11372{
11373	struct pmu *pmu = dev_get_drvdata(dev);
11374
11375	return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type);
11376}
11377static DEVICE_ATTR_RO(type);
11378
11379static ssize_t
11380perf_event_mux_interval_ms_show(struct device *dev,
11381				struct device_attribute *attr,
11382				char *page)
11383{
11384	struct pmu *pmu = dev_get_drvdata(dev);
11385
11386	return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms);
11387}
11388
11389static DEFINE_MUTEX(mux_interval_mutex);
11390
11391static ssize_t
11392perf_event_mux_interval_ms_store(struct device *dev,
11393				 struct device_attribute *attr,
11394				 const char *buf, size_t count)
11395{
11396	struct pmu *pmu = dev_get_drvdata(dev);
11397	int timer, cpu, ret;
11398
11399	ret = kstrtoint(buf, 0, &timer);
11400	if (ret)
11401		return ret;
11402
11403	if (timer < 1)
11404		return -EINVAL;
11405
	/* same value, nothing to do */
11407	if (timer == pmu->hrtimer_interval_ms)
11408		return count;
11409
11410	mutex_lock(&mux_interval_mutex);
11411	pmu->hrtimer_interval_ms = timer;
11412
11413	/* update all cpuctx for this PMU */
11414	cpus_read_lock();
11415	for_each_online_cpu(cpu) {
11416		struct perf_cpu_pmu_context *cpc;
11417		cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
11418		cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
11419
11420		cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
11421	}
11422	cpus_read_unlock();
11423	mutex_unlock(&mux_interval_mutex);
11424
11425	return count;
11426}
11427static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
11428
11429static struct attribute *pmu_dev_attrs[] = {
11430	&dev_attr_type.attr,
11431	&dev_attr_perf_event_mux_interval_ms.attr,
11432	&dev_attr_nr_addr_filters.attr,
11433	NULL,
11434};
11435
11436static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n)
11437{
11438	struct device *dev = kobj_to_dev(kobj);
11439	struct pmu *pmu = dev_get_drvdata(dev);
11440
11441	if (n == 2 && !pmu->nr_addr_filters)
11442		return 0;
11443
11444	return a->mode;
11445}
11446
11447static struct attribute_group pmu_dev_attr_group = {
11448	.is_visible = pmu_dev_is_visible,
11449	.attrs = pmu_dev_attrs,
11450};
11451
11452static const struct attribute_group *pmu_dev_groups[] = {
11453	&pmu_dev_attr_group,
11454	NULL,
11455};
11456
11457static int pmu_bus_running;
11458static struct bus_type pmu_bus = {
11459	.name		= "event_source",
11460	.dev_groups	= pmu_dev_groups,
11461};
11462
11463static void pmu_dev_release(struct device *dev)
11464{
11465	kfree(dev);
11466}
11467
11468static int pmu_dev_alloc(struct pmu *pmu)
11469{
11470	int ret = -ENOMEM;
11471
11472	pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
11473	if (!pmu->dev)
11474		goto out;
11475
11476	pmu->dev->groups = pmu->attr_groups;
11477	device_initialize(pmu->dev);
11478
11479	dev_set_drvdata(pmu->dev, pmu);
11480	pmu->dev->bus = &pmu_bus;
11481	pmu->dev->parent = pmu->parent;
11482	pmu->dev->release = pmu_dev_release;
11483
11484	ret = dev_set_name(pmu->dev, "%s", pmu->name);
11485	if (ret)
11486		goto free_dev;
11487
11488	ret = device_add(pmu->dev);
11489	if (ret)
11490		goto free_dev;
11491
11492	if (pmu->attr_update) {
11493		ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
11494		if (ret)
11495			goto del_dev;
11496	}
11497
11498out:
11499	return ret;
11500
11501del_dev:
11502	device_del(pmu->dev);
11503
11504free_dev:
11505	put_device(pmu->dev);
11506	goto out;
11507}
11508
11509static struct lock_class_key cpuctx_mutex;
11510static struct lock_class_key cpuctx_lock;
11511
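/*
 * Register a PMU with the core. A non-negative @type asks for that exact
 * index in pmu_idr (as the built-in PERF_TYPE_* PMUs do); a negative @type
 * lets the idr pick a free index above PERF_TYPE_MAX, as in
 * perf_pmu_register(&perf_kprobe, "kprobe", -1) above.
 */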
11512int perf_pmu_register(struct pmu *pmu, const char *name, int type)
11513{
11514	int cpu, ret, max = PERF_TYPE_MAX;
11515
11516	mutex_lock(&pmus_lock);
11517	ret = -ENOMEM;
11518	pmu->pmu_disable_count = alloc_percpu(int);
11519	if (!pmu->pmu_disable_count)
11520		goto unlock;
11521
11522	pmu->type = -1;
11523	if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) {
11524		ret = -EINVAL;
11525		goto free_pdc;
11526	}
11527
11528	pmu->name = name;
11529
11530	if (type >= 0)
11531		max = type;
11532
11533	ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
11534	if (ret < 0)
11535		goto free_pdc;
11536
11537	WARN_ON(type >= 0 && ret != type);
11538
11539	type = ret;
11540	pmu->type = type;
11541
11542	if (pmu_bus_running && !pmu->dev) {
11543		ret = pmu_dev_alloc(pmu);
11544		if (ret)
11545			goto free_idr;
11546	}
11547
11548	ret = -ENOMEM;
11549	pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
11550	if (!pmu->cpu_pmu_context)
11551		goto free_dev;
11552
11553	for_each_possible_cpu(cpu) {
11554		struct perf_cpu_pmu_context *cpc;
11555
11556		cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
11557		__perf_init_event_pmu_context(&cpc->epc, pmu);
11558		__perf_mux_hrtimer_init(cpc, cpu);
11559	}
11560
11561	if (!pmu->start_txn) {
11562		if (pmu->pmu_enable) {
11563			/*
11564			 * If we have pmu_enable/pmu_disable calls, install
			 * transaction stubs that use them to try and batch
11566			 * hardware accesses.
11567			 */
11568			pmu->start_txn  = perf_pmu_start_txn;
11569			pmu->commit_txn = perf_pmu_commit_txn;
11570			pmu->cancel_txn = perf_pmu_cancel_txn;
11571		} else {
11572			pmu->start_txn  = perf_pmu_nop_txn;
11573			pmu->commit_txn = perf_pmu_nop_int;
11574			pmu->cancel_txn = perf_pmu_nop_void;
11575		}
11576	}
11577
11578	if (!pmu->pmu_enable) {
11579		pmu->pmu_enable  = perf_pmu_nop_void;
11580		pmu->pmu_disable = perf_pmu_nop_void;
11581	}
11582
11583	if (!pmu->check_period)
11584		pmu->check_period = perf_event_nop_int;
11585
11586	if (!pmu->event_idx)
11587		pmu->event_idx = perf_event_idx_default;
11588
11589	list_add_rcu(&pmu->entry, &pmus);
11590	atomic_set(&pmu->exclusive_cnt, 0);
11591	ret = 0;
11592unlock:
11593	mutex_unlock(&pmus_lock);
11594
11595	return ret;
11596
11597free_dev:
11598	if (pmu->dev && pmu->dev != PMU_NULL_DEV) {
11599		device_del(pmu->dev);
11600		put_device(pmu->dev);
11601	}
11602
11603free_idr:
11604	idr_remove(&pmu_idr, pmu->type);
11605
11606free_pdc:
11607	free_percpu(pmu->pmu_disable_count);
11608	goto unlock;
11609}
11610EXPORT_SYMBOL_GPL(perf_pmu_register);
11611
11612void perf_pmu_unregister(struct pmu *pmu)
11613{
11614	mutex_lock(&pmus_lock);
11615	list_del_rcu(&pmu->entry);
11616
11617	/*
11618	 * We dereference the pmu list under both SRCU and regular RCU, so
11619	 * synchronize against both of those.
11620	 */
11621	synchronize_srcu(&pmus_srcu);
11622	synchronize_rcu();
11623
11624	free_percpu(pmu->pmu_disable_count);
11625	idr_remove(&pmu_idr, pmu->type);
11626	if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
11627		if (pmu->nr_addr_filters)
11628			device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
11629		device_del(pmu->dev);
11630		put_device(pmu->dev);
11631	}
11632	free_pmu_context(pmu);
11633	mutex_unlock(&pmus_lock);
11634}
11635EXPORT_SYMBOL_GPL(perf_pmu_unregister);
11636
11637static inline bool has_extended_regs(struct perf_event *event)
11638{
11639	return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
11640	       (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
11641}
11642
11643static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
11644{
11645	struct perf_event_context *ctx = NULL;
11646	int ret;
11647
11648	if (!try_module_get(pmu->module))
11649		return -ENODEV;
11650
11651	/*
11652	 * A number of pmu->event_init() methods iterate the sibling_list to,
11653	 * for example, validate if the group fits on the PMU. Therefore,
11654	 * if this is a sibling event, acquire the ctx->mutex to protect
11655	 * the sibling_list.
11656	 */
11657	if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
11658		/*
11659		 * This ctx->mutex can nest when we're called through
11660		 * inheritance. See the perf_event_ctx_lock_nested() comment.
11661		 */
11662		ctx = perf_event_ctx_lock_nested(event->group_leader,
11663						 SINGLE_DEPTH_NESTING);
11664		BUG_ON(!ctx);
11665	}
11666
11667	event->pmu = pmu;
11668	ret = pmu->event_init(event);
11669
11670	if (ctx)
11671		perf_event_ctx_unlock(event->group_leader, ctx);
11672
11673	if (!ret) {
11674		if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
11675		    has_extended_regs(event))
11676			ret = -EOPNOTSUPP;
11677
11678		if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
11679		    event_has_any_exclude_flag(event))
11680			ret = -EINVAL;
11681
11682		if (ret && event->destroy)
11683			event->destroy(event);
11684	}
11685
11686	if (ret)
11687		module_put(pmu->module);
11688
11689	return ret;
11690}
11691
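/*
 * Find a PMU willing to accept this event: try the parent's PMU first when
 * inheriting, then a direct idr lookup by (possibly extended) type, and
 * finally fall back to walking the full pmus list until an event_init()
 * call stops returning -ENOENT.
 */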
11692static struct pmu *perf_init_event(struct perf_event *event)
11693{
11694	bool extended_type = false;
11695	int idx, type, ret;
11696	struct pmu *pmu;
11697
11698	idx = srcu_read_lock(&pmus_srcu);
11699
11700	/*
	 * Save the original type before calling pmu->event_init(), since
	 * certain pmus overwrite event->attr.type to forward the event to
	 * another pmu.
11703	 */
11704	event->orig_type = event->attr.type;
11705
11706	/* Try parent's PMU first: */
11707	if (event->parent && event->parent->pmu) {
11708		pmu = event->parent->pmu;
11709		ret = perf_try_init_event(pmu, event);
11710		if (!ret)
11711			goto unlock;
11712	}
11713
11714	/*
11715	 * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
11716	 * are often aliases for PERF_TYPE_RAW.
11717	 */
11718	type = event->attr.type;
11719	if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) {
11720		type = event->attr.config >> PERF_PMU_TYPE_SHIFT;
11721		if (!type) {
11722			type = PERF_TYPE_RAW;
11723		} else {
11724			extended_type = true;
11725			event->attr.config &= PERF_HW_EVENT_MASK;
11726		}
11727	}
11728
11729again:
11730	rcu_read_lock();
11731	pmu = idr_find(&pmu_idr, type);
11732	rcu_read_unlock();
11733	if (pmu) {
11734		if (event->attr.type != type && type != PERF_TYPE_RAW &&
11735		    !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
11736			goto fail;
11737
11738		ret = perf_try_init_event(pmu, event);
11739		if (ret == -ENOENT && event->attr.type != type && !extended_type) {
11740			type = event->attr.type;
11741			goto again;
11742		}
11743
11744		if (ret)
11745			pmu = ERR_PTR(ret);
11746
11747		goto unlock;
11748	}
11749
11750	list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
11751		ret = perf_try_init_event(pmu, event);
11752		if (!ret)
11753			goto unlock;
11754
11755		if (ret != -ENOENT) {
11756			pmu = ERR_PTR(ret);
11757			goto unlock;
11758		}
11759	}
11760fail:
11761	pmu = ERR_PTR(-ENOENT);
11762unlock:
11763	srcu_read_unlock(&pmus_srcu, idx);
11764
11765	return pmu;
11766}
11767
11768static void attach_sb_event(struct perf_event *event)
11769{
11770	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
11771
11772	raw_spin_lock(&pel->lock);
11773	list_add_rcu(&event->sb_list, &pel->list);
11774	raw_spin_unlock(&pel->lock);
11775}
11776
11777/*
11778 * We keep a list of all !task (and therefore per-cpu) events
11779 * that need to receive side-band records.
11780 *
11781 * This avoids having to scan all the various PMU per-cpu contexts
11782 * looking for them.
11783 */
11784static void account_pmu_sb_event(struct perf_event *event)
11785{
11786	if (is_sb_event(event))
11787		attach_sb_event(event);
11788}
11789
11790/* Freq events need the tick to stay alive (see perf_event_task_tick). */
11791static void account_freq_event_nohz(void)
11792{
11793#ifdef CONFIG_NO_HZ_FULL
11794	/* Lock so we don't race with concurrent unaccount */
11795	spin_lock(&nr_freq_lock);
11796	if (atomic_inc_return(&nr_freq_events) == 1)
11797		tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
11798	spin_unlock(&nr_freq_lock);
11799#endif
11800}
11801
11802static void account_freq_event(void)
11803{
11804	if (tick_nohz_full_enabled())
11805		account_freq_event_nohz();
11806	else
11807		atomic_inc(&nr_freq_events);
11808}
11809
11810
11811static void account_event(struct perf_event *event)
11812{
11813	bool inc = false;
11814
11815	if (event->parent)
11816		return;
11817
11818	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
11819		inc = true;
11820	if (event->attr.mmap || event->attr.mmap_data)
11821		atomic_inc(&nr_mmap_events);
11822	if (event->attr.build_id)
11823		atomic_inc(&nr_build_id_events);
11824	if (event->attr.comm)
11825		atomic_inc(&nr_comm_events);
11826	if (event->attr.namespaces)
11827		atomic_inc(&nr_namespaces_events);
11828	if (event->attr.cgroup)
11829		atomic_inc(&nr_cgroup_events);
11830	if (event->attr.task)
11831		atomic_inc(&nr_task_events);
11832	if (event->attr.freq)
11833		account_freq_event();
11834	if (event->attr.context_switch) {
11835		atomic_inc(&nr_switch_events);
11836		inc = true;
11837	}
11838	if (has_branch_stack(event))
11839		inc = true;
11840	if (is_cgroup_event(event))
11841		inc = true;
11842	if (event->attr.ksymbol)
11843		atomic_inc(&nr_ksymbol_events);
11844	if (event->attr.bpf_event)
11845		atomic_inc(&nr_bpf_events);
11846	if (event->attr.text_poke)
11847		atomic_inc(&nr_text_poke_events);
11848
11849	if (inc) {
11850		/*
11851		 * We need the mutex here because static_branch_enable()
11852		 * must complete *before* the perf_sched_count increment
11853		 * becomes visible.
11854		 */
11855		if (atomic_inc_not_zero(&perf_sched_count))
11856			goto enabled;
11857
11858		mutex_lock(&perf_sched_mutex);
11859		if (!atomic_read(&perf_sched_count)) {
11860			static_branch_enable(&perf_sched_events);
11861			/*
			 * Guarantee that all CPUs observe the key change and
11863			 * call the perf scheduling hooks before proceeding to
11864			 * install events that need them.
11865			 */
11866			synchronize_rcu();
11867		}
11868		/*
		 * Now that we have waited for the synchronization above, allow
		 * further increments to bypass the mutex.
11871		 */
11872		atomic_inc(&perf_sched_count);
11873		mutex_unlock(&perf_sched_mutex);
11874	}
11875enabled:
11876
11877	account_pmu_sb_event(event);
11878}
11879
11880/*
11881 * Allocate and initialize an event structure
11882 */
11883static struct perf_event *
11884perf_event_alloc(struct perf_event_attr *attr, int cpu,
11885		 struct task_struct *task,
11886		 struct perf_event *group_leader,
11887		 struct perf_event *parent_event,
11888		 perf_overflow_handler_t overflow_handler,
11889		 void *context, int cgroup_fd)
11890{
11891	struct pmu *pmu;
11892	struct perf_event *event;
11893	struct hw_perf_event *hwc;
11894	long err = -EINVAL;
11895	int node;
11896
11897	if ((unsigned)cpu >= nr_cpu_ids) {
11898		if (!task || cpu != -1)
11899			return ERR_PTR(-EINVAL);
11900	}
11901	if (attr->sigtrap && !task) {
11902		/* Requires a task: avoid signalling random tasks. */
11903		return ERR_PTR(-EINVAL);
11904	}
11905
11906	node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
11907	event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO,
11908				      node);
11909	if (!event)
11910		return ERR_PTR(-ENOMEM);
11911
11912	/*
11913	 * Single events are their own group leaders, with an
11914	 * empty sibling list:
11915	 */
11916	if (!group_leader)
11917		group_leader = event;
11918
11919	mutex_init(&event->child_mutex);
11920	INIT_LIST_HEAD(&event->child_list);
11921
11922	INIT_LIST_HEAD(&event->event_entry);
11923	INIT_LIST_HEAD(&event->sibling_list);
11924	INIT_LIST_HEAD(&event->active_list);
11925	init_event_group(event);
11926	INIT_LIST_HEAD(&event->rb_entry);
11927	INIT_LIST_HEAD(&event->active_entry);
11928	INIT_LIST_HEAD(&event->addr_filters.list);
11929	INIT_HLIST_NODE(&event->hlist_entry);
11930
11931
11932	init_waitqueue_head(&event->waitq);
11933	init_irq_work(&event->pending_irq, perf_pending_irq);
11934	init_task_work(&event->pending_task, perf_pending_task);
11935
11936	mutex_init(&event->mmap_mutex);
11937	raw_spin_lock_init(&event->addr_filters.lock);
11938
11939	atomic_long_set(&event->refcount, 1);
11940	event->cpu		= cpu;
11941	event->attr		= *attr;
11942	event->group_leader	= group_leader;
11943	event->pmu		= NULL;
11944	event->oncpu		= -1;
11945
11946	event->parent		= parent_event;
11947
11948	event->ns		= get_pid_ns(task_active_pid_ns(current));
11949	event->id		= atomic64_inc_return(&perf_event_id);
11950
11951	event->state		= PERF_EVENT_STATE_INACTIVE;
11952
11953	if (parent_event)
11954		event->event_caps = parent_event->event_caps;
11955
11956	if (task) {
11957		event->attach_state = PERF_ATTACH_TASK;
11958		/*
11959		 * XXX pmu::event_init needs to know what task to account to
11960		 * and we cannot use the ctx information because we need the
11961		 * pmu before we get a ctx.
11962		 */
11963		event->hw.target = get_task_struct(task);
11964	}
11965
11966	event->clock = &local_clock;
11967	if (parent_event)
11968		event->clock = parent_event->clock;
11969
11970	if (!overflow_handler && parent_event) {
11971		overflow_handler = parent_event->overflow_handler;
11972		context = parent_event->overflow_handler_context;
11973#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
11974		if (overflow_handler == bpf_overflow_handler) {
11975			struct bpf_prog *prog = parent_event->prog;
11976
11977			bpf_prog_inc(prog);
11978			event->prog = prog;
11979			event->orig_overflow_handler =
11980				parent_event->orig_overflow_handler;
11981		}
11982#endif
11983	}
11984
11985	if (overflow_handler) {
11986		event->overflow_handler	= overflow_handler;
11987		event->overflow_handler_context = context;
11988	} else if (is_write_backward(event)) {
11989		event->overflow_handler = perf_event_output_backward;
11990		event->overflow_handler_context = NULL;
11991	} else {
11992		event->overflow_handler = perf_event_output_forward;
11993		event->overflow_handler_context = NULL;
11994	}
11995
11996	perf_event__state_init(event);
11997
11998	pmu = NULL;
11999
12000	hwc = &event->hw;
12001	hwc->sample_period = attr->sample_period;
12002	if (attr->freq && attr->sample_freq)
12003		hwc->sample_period = 1;
12004	hwc->last_period = hwc->sample_period;
12005
12006	local64_set(&hwc->period_left, hwc->sample_period);
12007
12008	/*
12009	 * We currently do not support PERF_SAMPLE_READ on inherited events.
12010	 * See perf_output_read().
12011	 */
12012	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
12013		goto err_ns;
12014
12015	if (!has_branch_stack(event))
12016		event->attr.branch_sample_type = 0;
12017
12018	pmu = perf_init_event(event);
12019	if (IS_ERR(pmu)) {
12020		err = PTR_ERR(pmu);
12021		goto err_ns;
12022	}
12023
12024	/*
12025	 * Disallow uncore-task events. Similarly, disallow uncore-cgroup
12026	 * events (they don't make sense as the cgroup will be different
12027	 * on other CPUs in the uncore mask).
12028	 */
12029	if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
12030		err = -EINVAL;
12031		goto err_pmu;
12032	}
12033
12034	if (event->attr.aux_output &&
12035	    !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
12036		err = -EOPNOTSUPP;
12037		goto err_pmu;
12038	}
12039
12040	if (cgroup_fd != -1) {
12041		err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
12042		if (err)
12043			goto err_pmu;
12044	}
12045
12046	err = exclusive_event_init(event);
12047	if (err)
12048		goto err_pmu;
12049
12050	if (has_addr_filter(event)) {
12051		event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
12052						    sizeof(struct perf_addr_filter_range),
12053						    GFP_KERNEL);
12054		if (!event->addr_filter_ranges) {
12055			err = -ENOMEM;
12056			goto err_per_task;
12057		}
12058
12059		/*
12060		 * Clone the parent's vma offsets: they are valid until exec()
12061		 * even if the mm is not shared with the parent.
12062		 */
12063		if (event->parent) {
12064			struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
12065
12066			raw_spin_lock_irq(&ifh->lock);
12067			memcpy(event->addr_filter_ranges,
12068			       event->parent->addr_filter_ranges,
12069			       pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
12070			raw_spin_unlock_irq(&ifh->lock);
12071		}
12072
12073		/* force hw sync on the address filters */
12074		event->addr_filters_gen = 1;
12075	}
12076
12077	if (!event->parent) {
12078		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
12079			err = get_callchain_buffers(attr->sample_max_stack);
12080			if (err)
12081				goto err_addr_filters;
12082		}
12083	}
12084
12085	err = security_perf_event_alloc(event);
12086	if (err)
12087		goto err_callchain_buffer;
12088
12089	/* symmetric to unaccount_event() in _free_event() */
12090	account_event(event);
12091
12092	return event;
12093
12094err_callchain_buffer:
12095	if (!event->parent) {
12096		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
12097			put_callchain_buffers();
12098	}
12099err_addr_filters:
12100	kfree(event->addr_filter_ranges);
12101
12102err_per_task:
12103	exclusive_event_destroy(event);
12104
12105err_pmu:
12106	if (is_cgroup_event(event))
12107		perf_detach_cgroup(event);
12108	if (event->destroy)
12109		event->destroy(event);
12110	module_put(pmu->module);
12111err_ns:
12112	if (event->hw.target)
12113		put_task_struct(event->hw.target);
12114	call_rcu(&event->rcu_head, free_event_rcu);
12115
12116	return ERR_PTR(err);
12117}
12118
12119static int perf_copy_attr(struct perf_event_attr __user *uattr,
12120			  struct perf_event_attr *attr)
12121{
12122	u32 size;
12123	int ret;
12124
12125	/* Zero the full structure, so that a short copy leaves the rest zeroed. */
12126	memset(attr, 0, sizeof(*attr));
12127
12128	ret = get_user(size, &uattr->size);
12129	if (ret)
12130		return ret;
12131
12132	/* ABI compatibility quirk: */
12133	if (!size)
12134		size = PERF_ATTR_SIZE_VER0;
12135	if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
12136		goto err_size;
12137
12138	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
12139	if (ret) {
12140		if (ret == -E2BIG)
12141			goto err_size;
12142		return ret;
12143	}
12144
12145	attr->size = size;
12146
12147	if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
12148		return -EINVAL;
12149
12150	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
12151		return -EINVAL;
12152
12153	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
12154		return -EINVAL;
12155
12156	if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
12157		u64 mask = attr->branch_sample_type;
12158
12159		/* only using defined bits */
12160		if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
12161			return -EINVAL;
12162
12163		/* at least one branch bit must be set */
12164		if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
12165			return -EINVAL;
12166
12167		/* propagate priv level, when not set for branch */
12168		if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
12169
12170			/* exclude_kernel checked on syscall entry */
12171			if (!attr->exclude_kernel)
12172				mask |= PERF_SAMPLE_BRANCH_KERNEL;
12173
12174			if (!attr->exclude_user)
12175				mask |= PERF_SAMPLE_BRANCH_USER;
12176
12177			if (!attr->exclude_hv)
12178				mask |= PERF_SAMPLE_BRANCH_HV;
12179			/*
12180			 * adjust user setting (for HW filter setup)
12181			 */
12182			attr->branch_sample_type = mask;
12183		}
12184		/* privileged levels capture (kernel, hv): check permissions */
12185		if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
12186			ret = perf_allow_kernel(attr);
12187			if (ret)
12188				return ret;
12189		}
12190	}
12191
12192	if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
12193		ret = perf_reg_validate(attr->sample_regs_user);
12194		if (ret)
12195			return ret;
12196	}
12197
12198	if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
12199		if (!arch_perf_have_user_stack_dump())
12200			return -ENOSYS;
12201
12202		/*
12203		 * We have __u32 type for the size, but so far
12204		 * we can only use __u16 as maximum due to the
12205		 * __u16 sample size limit.
12206		 */
12207		if (attr->sample_stack_user >= USHRT_MAX)
12208			return -EINVAL;
12209		else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
12210			return -EINVAL;
12211	}
12212
12213	if (!attr->sample_max_stack)
12214		attr->sample_max_stack = sysctl_perf_event_max_stack;
12215
12216	if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
12217		ret = perf_reg_validate(attr->sample_regs_intr);
12218
12219#ifndef CONFIG_CGROUP_PERF
12220	if (attr->sample_type & PERF_SAMPLE_CGROUP)
12221		return -EINVAL;
12222#endif
12223	if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
12224	    (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
12225		return -EINVAL;
12226
12227	if (!attr->inherit && attr->inherit_thread)
12228		return -EINVAL;
12229
12230	if (attr->remove_on_exec && attr->enable_on_exec)
12231		return -EINVAL;
12232
12233	if (attr->sigtrap && !attr->remove_on_exec)
12234		return -EINVAL;
12235
12236out:
12237	return ret;
12238
12239err_size:
12240	put_user(sizeof(*attr), &uattr->size);
12241	ret = -E2BIG;
12242	goto out;
12243}
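
/*
 * The size handshake above is what keeps the ABI extensible in both
 * directions. A hedged sketch of the user-space side (illustrative only):
 *
 *	struct perf_event_attr attr;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.size = sizeof(attr);	// the PERF_ATTR_SIZE_VERx we built with
 *
 * A larger-than-known attr is only accepted if the unknown tail is all
 * zeroes (copy_struct_from_user() returns -E2BIG otherwise), and a smaller
 * attr is zero-padded by the kernel before use.
 */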
12244
12245static void mutex_lock_double(struct mutex *a, struct mutex *b)
12246{
12247	if (b < a)
12248		swap(a, b);
12249
12250	mutex_lock(a);
12251	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
12252}
12253
12254static int
12255perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
12256{
12257	struct perf_buffer *rb = NULL;
12258	int ret = -EINVAL;
12259
12260	if (!output_event) {
12261		mutex_lock(&event->mmap_mutex);
12262		goto set;
12263	}
12264
12265	/* don't allow circular references */
12266	if (event == output_event)
12267		goto out;
12268
12269	/*
12270	 * Don't allow cross-cpu buffers
12271	 */
12272	if (output_event->cpu != event->cpu)
12273		goto out;
12274
12275	/*
12276	 * If it's not a per-cpu rb, it must be the same task.
12277	 */
12278	if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
12279		goto out;
12280
12281	/*
12282	 * Mixing clocks in the same buffer is trouble you don't need.
12283	 */
12284	if (output_event->clock != event->clock)
12285		goto out;
12286
12287	/*
12288	 * The ring buffer is written either from the beginning or from the
12289	 * end; mixing the two is not allowed.
12290	 */
12291	if (is_write_backward(output_event) != is_write_backward(event))
12292		goto out;
12293
12294	/*
12295	 * If both events generate aux data, they must be on the same PMU
12296	 */
12297	if (has_aux(event) && has_aux(output_event) &&
12298	    event->pmu != output_event->pmu)
12299		goto out;
12300
12301	/*
12302	 * Hold both mmap_mutex to serialize against perf_mmap_close().  Since
12303	 * output_event is already on rb->event_list, and the list iteration
12304	 * restarts after every removal, it is guaranteed this new event is
12305	 * observed *OR* if output_event is already removed, it's guaranteed we
12306	 * observe !rb->mmap_count.
12307	 */
12308	mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
12309set:
12310	/* Can't redirect output if we've got an active mmap() */
12311	if (atomic_read(&event->mmap_count))
12312		goto unlock;
12313
12314	if (output_event) {
12315		/* get the rb we want to redirect to */
12316		rb = ring_buffer_get(output_event);
12317		if (!rb)
12318			goto unlock;
12319
12320		/* did we race against perf_mmap_close() */
12321		if (!atomic_read(&rb->mmap_count)) {
12322			ring_buffer_put(rb);
12323			goto unlock;
12324		}
12325	}
12326
12327	ring_buffer_attach(event, rb);
12328
12329	ret = 0;
12330unlock:
12331	mutex_unlock(&event->mmap_mutex);
12332	if (output_event)
12333		mutex_unlock(&output_event->mmap_mutex);
12334
12335out:
12336	return ret;
12337}
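
/*
 * User space reaches perf_event_set_output() through the
 * PERF_EVENT_IOC_SET_OUTPUT ioctl; a minimal sketch (illustrative only,
 * error handling omitted), assuming fd and leader_fd are perf event fds
 * that satisfy the same-cpu/same-task/same-clock rules above:
 *
 *	// Redirect fd's records into leader_fd's ring buffer; only
 *	// leader_fd then needs to be mmap()ed. fd itself must not have
 *	// an active mmap() of its own, or this fails with -EINVAL.
 *	ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, leader_fd);
 *
 *	// Passing -1 removes the redirection again.
 *	ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, -1);
 */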
12338
12339static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
12340{
12341	bool nmi_safe = false;
12342
12343	switch (clk_id) {
12344	case CLOCK_MONOTONIC:
12345		event->clock = &ktime_get_mono_fast_ns;
12346		nmi_safe = true;
12347		break;
12348
12349	case CLOCK_MONOTONIC_RAW:
12350		event->clock = &ktime_get_raw_fast_ns;
12351		nmi_safe = true;
12352		break;
12353
12354	case CLOCK_REALTIME:
12355		event->clock = &ktime_get_real_ns;
12356		break;
12357
12358	case CLOCK_BOOTTIME:
12359		event->clock = &ktime_get_boottime_ns;
12360		break;
12361
12362	case CLOCK_TAI:
12363		event->clock = &ktime_get_clocktai_ns;
12364		break;
12365
12366	default:
12367		return -EINVAL;
12368	}
12369
12370	if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
12371		return -EINVAL;
12372
12373	return 0;
12374}
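
/*
 * The clock is selected by user space through perf_event_attr; a minimal
 * sketch (illustrative only):
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_SOFTWARE,
 *		.config		= PERF_COUNT_SW_TASK_CLOCK,
 *		.size		= sizeof(attr),
 *		.use_clockid	= 1,
 *		.clockid	= CLOCK_MONOTONIC_RAW,
 *	};
 *
 * Timestamps in the event stream then use the requested clock, making them
 * directly comparable to clock_gettime(CLOCK_MONOTONIC_RAW) readings taken
 * by the tool.
 */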
12375
12376static bool
12377perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
12378{
12379	unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
12380	bool is_capable = perfmon_capable();
12381
12382	if (attr->sigtrap) {
12383		/*
12384		 * perf_event_attr::sigtrap sends signals to the other task.
12385		 * Require the current task to also have CAP_KILL.
12386		 */
12387		rcu_read_lock();
12388		is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
12389		rcu_read_unlock();
12390
12391		/*
12392		 * If the required capabilities aren't available, fall back to the
12393		 * ptrace permission check: upgrade to ATTACH, since sending signals
12394		 * can effectively change the target task.
12395		 */
12396		ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
12397	}
12398
12399	/*
12400	 * Preserve ptrace permission check for backwards compatibility. The
12401	 * ptrace check also includes checks that the current task and other
12402	 * task have matching uids, and is therefore not done here explicitly.
12403	 */
12404	return is_capable || ptrace_may_access(task, ptrace_mode);
12405}
12406
12407/**
12408 * sys_perf_event_open - open a performance event, associate it to a task/cpu
12409 *
12410 * @attr_uptr:	event_id type attributes for monitoring/sampling
12411 * @pid:		target pid
12412 * @cpu:		target cpu
12413 * @group_fd:		group leader event fd
12414 * @flags:		perf event open flags
12415 */
12416SYSCALL_DEFINE5(perf_event_open,
12417		struct perf_event_attr __user *, attr_uptr,
12418		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
12419{
12420	struct perf_event *group_leader = NULL, *output_event = NULL;
12421	struct perf_event_pmu_context *pmu_ctx;
12422	struct perf_event *event, *sibling;
12423	struct perf_event_attr attr;
12424	struct perf_event_context *ctx;
12425	struct file *event_file = NULL;
12426	struct fd group = {NULL, 0};
12427	struct task_struct *task = NULL;
12428	struct pmu *pmu;
12429	int event_fd;
12430	int move_group = 0;
12431	int err;
12432	int f_flags = O_RDWR;
12433	int cgroup_fd = -1;
12434
12435	/* for future expandability... */
12436	if (flags & ~PERF_FLAG_ALL)
12437		return -EINVAL;
12438
12439	err = perf_copy_attr(attr_uptr, &attr);
12440	if (err)
12441		return err;
12442
12443	/* Do we allow access to perf_event_open(2) ? */
12444	err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
12445	if (err)
12446		return err;
12447
12448	if (!attr.exclude_kernel) {
12449		err = perf_allow_kernel(&attr);
12450		if (err)
12451			return err;
12452	}
12453
12454	if (attr.namespaces) {
12455		if (!perfmon_capable())
12456			return -EACCES;
12457	}
12458
12459	if (attr.freq) {
12460		if (attr.sample_freq > sysctl_perf_event_sample_rate)
12461			return -EINVAL;
12462	} else {
12463		if (attr.sample_period & (1ULL << 63))
12464			return -EINVAL;
12465	}
12466
12467	/* Only privileged users can get physical addresses */
12468	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
12469		err = perf_allow_kernel(&attr);
12470		if (err)
12471			return err;
12472	}
12473
12474	/* REGS_INTR can leak data, lockdown must prevent this */
12475	if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
12476		err = security_locked_down(LOCKDOWN_PERF);
12477		if (err)
12478			return err;
12479	}
12480
12481	/*
12482	 * In cgroup mode, the pid argument is used to pass the fd
12483	 * opened to the cgroup directory in cgroupfs. The cpu argument
12484	 * designates the cpu on which to monitor threads from that
12485	 * cgroup.
12486	 */
12487	if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
12488		return -EINVAL;
12489
12490	if (flags & PERF_FLAG_FD_CLOEXEC)
12491		f_flags |= O_CLOEXEC;
12492
12493	event_fd = get_unused_fd_flags(f_flags);
12494	if (event_fd < 0)
12495		return event_fd;
12496
12497	if (group_fd != -1) {
12498		err = perf_fget_light(group_fd, &group);
12499		if (err)
12500			goto err_fd;
12501		group_leader = group.file->private_data;
12502		if (flags & PERF_FLAG_FD_OUTPUT)
12503			output_event = group_leader;
12504		if (flags & PERF_FLAG_FD_NO_GROUP)
12505			group_leader = NULL;
12506	}
12507
12508	if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
12509		task = find_lively_task_by_vpid(pid);
12510		if (IS_ERR(task)) {
12511			err = PTR_ERR(task);
12512			goto err_group_fd;
12513		}
12514	}
12515
12516	if (task && group_leader &&
12517	    group_leader->attr.inherit != attr.inherit) {
12518		err = -EINVAL;
12519		goto err_task;
12520	}
12521
12522	if (flags & PERF_FLAG_PID_CGROUP)
12523		cgroup_fd = pid;
12524
12525	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
12526				 NULL, NULL, cgroup_fd);
12527	if (IS_ERR(event)) {
12528		err = PTR_ERR(event);
12529		goto err_task;
12530	}
12531
12532	if (is_sampling_event(event)) {
12533		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
12534			err = -EOPNOTSUPP;
12535			goto err_alloc;
12536		}
12537	}
12538
12539	/*
12540	 * Special case software events and allow them to be part of
12541	 * any hardware group.
12542	 */
12543	pmu = event->pmu;
12544
12545	if (attr.use_clockid) {
12546		err = perf_event_set_clock(event, attr.clockid);
12547		if (err)
12548			goto err_alloc;
12549	}
12550
12551	if (pmu->task_ctx_nr == perf_sw_context)
12552		event->event_caps |= PERF_EV_CAP_SOFTWARE;
12553
12554	if (task) {
12555		err = down_read_interruptible(&task->signal->exec_update_lock);
12556		if (err)
12557			goto err_alloc;
12558
12559		/*
12560		 * We must hold exec_update_lock across this and any potential
12561		 * perf_install_in_context() call for this new event to
12562		 * serialize against exec() altering our credentials (and the
12563		 * perf_event_exit_task() that could imply).
12564		 */
12565		err = -EACCES;
12566		if (!perf_check_permission(&attr, task))
12567			goto err_cred;
12568	}
12569
12570	/*
12571	 * Get the target context (task or percpu):
12572	 */
12573	ctx = find_get_context(task, event);
12574	if (IS_ERR(ctx)) {
12575		err = PTR_ERR(ctx);
12576		goto err_cred;
12577	}
12578
12579	mutex_lock(&ctx->mutex);
12580
12581	if (ctx->task == TASK_TOMBSTONE) {
12582		err = -ESRCH;
12583		goto err_locked;
12584	}
12585
12586	if (!task) {
12587		/*
12588		 * Check if the @cpu we're creating an event for is online.
12589		 *
12590		 * We use the perf_cpu_context::ctx::mutex to serialize against
12591		 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
12592		 */
12593		struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
12594
12595		if (!cpuctx->online) {
12596			err = -ENODEV;
12597			goto err_locked;
12598		}
12599	}
12600
12601	if (group_leader) {
12602		err = -EINVAL;
12603
12604		/*
12605		 * Do not allow a recursive hierarchy (this new sibling
12606		 * becoming part of another group-sibling):
12607		 */
12608		if (group_leader->group_leader != group_leader)
12609			goto err_locked;
12610
12611		/* All events in a group should have the same clock */
12612		if (group_leader->clock != event->clock)
12613			goto err_locked;
12614
12615		/*
12616		 * Make sure both events are for the same CPU;
12617		 * grouping events for different CPUs is broken, since
12618		 * you can never schedule them concurrently anyway.
12619		 */
12620		if (group_leader->cpu != event->cpu)
12621			goto err_locked;
12622
12623		/*
12624		 * Make sure both events are on the same context; either task or cpu.
12625		 */
12626		if (group_leader->ctx != ctx)
12627			goto err_locked;
12628
12629		/*
12630		 * Only a group leader can be exclusive or pinned
12631		 */
12632		if (attr.exclusive || attr.pinned)
12633			goto err_locked;
12634
12635		if (is_software_event(event) &&
12636		    !in_software_context(group_leader)) {
12637			/*
12638			 * The event is a software event, but the group_leader
12639			 * is in a hardware context.
12640			 *
12641			 * Allow the addition of software events to hw
12642			 * groups, this is safe because software events
12643			 * never fail to schedule.
12644			 *
12645			 * Note the comment that goes with struct
12646			 * perf_event_pmu_context.
12647			 */
12648			pmu = group_leader->pmu_ctx->pmu;
12649		} else if (!is_software_event(event)) {
12650			if (is_software_event(group_leader) &&
12651			    (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
12652				/*
12653				 * In case the group is a pure software group, and we
12654				 * try to add a hardware event, move the whole group to
12655				 * the hardware context.
12656				 */
12657				move_group = 1;
12658			}
12659
12660			/* Don't allow group of multiple hw events from different pmus */
12661			if (!in_software_context(group_leader) &&
12662			    group_leader->pmu_ctx->pmu != pmu)
12663				goto err_locked;
12664		}
12665	}
12666
12667	/*
12668	 * Now that we're certain of the pmu; find the pmu_ctx.
12669	 */
12670	pmu_ctx = find_get_pmu_context(pmu, ctx, event);
12671	if (IS_ERR(pmu_ctx)) {
12672		err = PTR_ERR(pmu_ctx);
12673		goto err_locked;
12674	}
12675	event->pmu_ctx = pmu_ctx;
12676
12677	if (output_event) {
12678		err = perf_event_set_output(event, output_event);
12679		if (err)
12680			goto err_context;
12681	}
12682
12683	if (!perf_event_validate_size(event)) {
12684		err = -E2BIG;
12685		goto err_context;
12686	}
12687
12688	if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
12689		err = -EINVAL;
12690		goto err_context;
12691	}
12692
12693	/*
12694	 * Must be under the same ctx::mutex as perf_install_in_context(),
12695	 * because we need to serialize with concurrent event creation.
12696	 */
12697	if (!exclusive_event_installable(event, ctx)) {
12698		err = -EBUSY;
12699		goto err_context;
12700	}
12701
12702	WARN_ON_ONCE(ctx->parent_ctx);
12703
12704	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
12705	if (IS_ERR(event_file)) {
12706		err = PTR_ERR(event_file);
12707		event_file = NULL;
12708		goto err_context;
12709	}
12710
12711	/*
12712	 * This is the point of no return; we cannot fail hereafter. This is
12713	 * where we start modifying current state.
12714	 */
12715
12716	if (move_group) {
12717		perf_remove_from_context(group_leader, 0);
12718		put_pmu_ctx(group_leader->pmu_ctx);
12719
12720		for_each_sibling_event(sibling, group_leader) {
12721			perf_remove_from_context(sibling, 0);
12722			put_pmu_ctx(sibling->pmu_ctx);
12723		}
12724
12725		/*
12726		 * Install the group siblings before the group leader.
12727		 *
12728		 * Because a group leader will try to install the entire group
12729		 * (through the sibling list, which is still intact), we can
12730		 * end up with siblings installed in the wrong context.
12731		 *
12732		 * Installing the siblings first is a no-op, because they are
12733		 * not yet reachable through the group lists.
12734		 */
12735		for_each_sibling_event(sibling, group_leader) {
12736			sibling->pmu_ctx = pmu_ctx;
12737			get_pmu_ctx(pmu_ctx);
12738			perf_event__state_init(sibling);
12739			perf_install_in_context(ctx, sibling, sibling->cpu);
12740		}
12741
12742		/*
12743		 * Removing from the context ends up with a disabled
12744		 * event. What we want here is an event in its initial
12745		 * startup state, ready to be added into the new context.
12746		 */
12747		group_leader->pmu_ctx = pmu_ctx;
12748		get_pmu_ctx(pmu_ctx);
12749		perf_event__state_init(group_leader);
12750		perf_install_in_context(ctx, group_leader, group_leader->cpu);
12751	}
12752
12753	/*
12754	 * Precalculate sample_data sizes; do while holding ctx::mutex such
12755	 * that we're serialized against further additions and before
12756	 * perf_install_in_context() which is the point the event is active and
12757	 * can use these values.
12758	 */
12759	perf_event__header_size(event);
12760	perf_event__id_header_size(event);
12761
12762	event->owner = current;
12763
12764	perf_install_in_context(ctx, event, event->cpu);
12765	perf_unpin_context(ctx);
12766
12767	mutex_unlock(&ctx->mutex);
12768
12769	if (task) {
12770		up_read(&task->signal->exec_update_lock);
12771		put_task_struct(task);
12772	}
12773
12774	mutex_lock(&current->perf_event_mutex);
12775	list_add_tail(&event->owner_entry, &current->perf_event_list);
12776	mutex_unlock(&current->perf_event_mutex);
12777
12778	/*
12779	 * Drop the reference on the group_event after placing the
12780	 * new event on the sibling_list. This ensures destruction
12781	 * of the group leader will find the pointer to itself in
12782	 * perf_group_detach().
12783	 */
12784	fdput(group);
12785	fd_install(event_fd, event_file);
12786	return event_fd;
12787
12788err_context:
12789	put_pmu_ctx(event->pmu_ctx);
12790	event->pmu_ctx = NULL; /* _free_event() */
12791err_locked:
12792	mutex_unlock(&ctx->mutex);
12793	perf_unpin_context(ctx);
12794	put_ctx(ctx);
12795err_cred:
12796	if (task)
12797		up_read(&task->signal->exec_update_lock);
12798err_alloc:
12799	free_event(event);
12800err_task:
12801	if (task)
12802		put_task_struct(task);
12803err_group_fd:
12804	fdput(group);
12805err_fd:
12806	put_unused_fd(event_fd);
12807	return err;
12808}
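
/*
 * There is typically no libc wrapper for this syscall; user space invokes it
 * directly. A minimal counting sketch (illustrative only, error handling
 * omitted):
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_INSTRUCTIONS,
 *		.size		= sizeof(attr),
 *		.disabled	= 1,
 *		.exclude_kernel	= 1,
 *	};
 *	__u64 count;
 *	int fd;
 *
 *	fd = syscall(__NR_perf_event_open, &attr,
 *		     0,		// pid: the calling task
 *		     -1,	// cpu: any cpu the task runs on
 *		     -1,	// group_fd: no group
 *		     PERF_FLAG_FD_CLOEXEC);
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *	// ... workload ...
 *	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *	read(fd, &count, sizeof(count));
 */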
12809
12810/**
12811 * perf_event_create_kernel_counter
12812 *
12813 * @attr: attributes of the counter to create
12814 * @cpu: cpu to which the counter is bound
12815 * @task: task to profile (NULL for percpu)
12816 * @overflow_handler: callback to trigger when we hit the event
12817 * @context: context data passed to the @overflow_handler callback
12818 */
12819struct perf_event *
12820perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
12821				 struct task_struct *task,
12822				 perf_overflow_handler_t overflow_handler,
12823				 void *context)
12824{
12825	struct perf_event_pmu_context *pmu_ctx;
12826	struct perf_event_context *ctx;
12827	struct perf_event *event;
12828	struct pmu *pmu;
12829	int err;
12830
12831	/*
12832	 * Grouping is not supported for kernel events, and neither is 'AUX';
12833	 * make sure the caller is not asking for either.
12834	 */
12835	if (attr->aux_output)
12836		return ERR_PTR(-EINVAL);
12837
12838	event = perf_event_alloc(attr, cpu, task, NULL, NULL,
12839				 overflow_handler, context, -1);
12840	if (IS_ERR(event)) {
12841		err = PTR_ERR(event);
12842		goto err;
12843	}
12844
12845	/* Mark the owner so we can distinguish it from user events. */
12846	event->owner = TASK_TOMBSTONE;
12847	pmu = event->pmu;
12848
12849	if (pmu->task_ctx_nr == perf_sw_context)
12850		event->event_caps |= PERF_EV_CAP_SOFTWARE;
12851
12852	/*
12853	 * Get the target context (task or percpu):
12854	 */
12855	ctx = find_get_context(task, event);
12856	if (IS_ERR(ctx)) {
12857		err = PTR_ERR(ctx);
12858		goto err_alloc;
12859	}
12860
12861	WARN_ON_ONCE(ctx->parent_ctx);
12862	mutex_lock(&ctx->mutex);
12863	if (ctx->task == TASK_TOMBSTONE) {
12864		err = -ESRCH;
12865		goto err_unlock;
12866	}
12867
12868	pmu_ctx = find_get_pmu_context(pmu, ctx, event);
12869	if (IS_ERR(pmu_ctx)) {
12870		err = PTR_ERR(pmu_ctx);
12871		goto err_unlock;
12872	}
12873	event->pmu_ctx = pmu_ctx;
12874
12875	if (!task) {
12876		/*
12877		 * Check if the @cpu we're creating an event for is online.
12878		 *
12879		 * We use the perf_cpu_context::ctx::mutex to serialize against
12880		 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
12881		 */
12882		struct perf_cpu_context *cpuctx =
12883			container_of(ctx, struct perf_cpu_context, ctx);
12884		if (!cpuctx->online) {
12885			err = -ENODEV;
12886			goto err_pmu_ctx;
12887		}
12888	}
12889
12890	if (!exclusive_event_installable(event, ctx)) {
12891		err = -EBUSY;
12892		goto err_pmu_ctx;
12893	}
12894
12895	perf_install_in_context(ctx, event, event->cpu);
12896	perf_unpin_context(ctx);
12897	mutex_unlock(&ctx->mutex);
12898
12899	return event;
12900
12901err_pmu_ctx:
12902	put_pmu_ctx(pmu_ctx);
12903	event->pmu_ctx = NULL; /* _free_event() */
12904err_unlock:
12905	mutex_unlock(&ctx->mutex);
12906	perf_unpin_context(ctx);
12907	put_ctx(ctx);
12908err_alloc:
12909	free_event(event);
12910err:
12911	return ERR_PTR(err);
12912}
12913EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
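
/*
 * A hedged sketch of in-kernel usage (the callback and attribute values are
 * illustrative only; in-tree users include the hardlockup detector):
 *
 *	static void example_overflow(struct perf_event *event,
 *				     struct perf_sample_data *data,
 *				     struct pt_regs *regs)
 *	{
 *		// called each time the counter crosses sample_period
 *	}
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.size		= sizeof(attr),
 *		.sample_period	= 1000000,
 *	};
 *	struct perf_event *ev;
 *
 *	ev = perf_event_create_kernel_counter(&attr, cpu, NULL,
 *					      example_overflow, NULL);
 *	if (IS_ERR(ev))
 *		return PTR_ERR(ev);
 *	// ... later ...
 *	perf_event_release_kernel(ev);
 */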
12914
12915static void __perf_pmu_remove(struct perf_event_context *ctx,
12916			      int cpu, struct pmu *pmu,
12917			      struct perf_event_groups *groups,
12918			      struct list_head *events)
12919{
12920	struct perf_event *event, *sibling;
12921
12922	perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
12923		perf_remove_from_context(event, 0);
12924		put_pmu_ctx(event->pmu_ctx);
12925		list_add(&event->migrate_entry, events);
12926
12927		for_each_sibling_event(sibling, event) {
12928			perf_remove_from_context(sibling, 0);
12929			put_pmu_ctx(sibling->pmu_ctx);
12930			list_add(&sibling->migrate_entry, events);
12931		}
12932	}
12933}
12934
12935static void __perf_pmu_install_event(struct pmu *pmu,
12936				     struct perf_event_context *ctx,
12937				     int cpu, struct perf_event *event)
12938{
12939	struct perf_event_pmu_context *epc;
12940	struct perf_event_context *old_ctx = event->ctx;
12941
12942	get_ctx(ctx); /* normally find_get_context() */
12943
12944	event->cpu = cpu;
12945	epc = find_get_pmu_context(pmu, ctx, event);
12946	event->pmu_ctx = epc;
12947
12948	if (event->state >= PERF_EVENT_STATE_OFF)
12949		event->state = PERF_EVENT_STATE_INACTIVE;
12950	perf_install_in_context(ctx, event, cpu);
12951
12952	/*
12953	 * Now that event->ctx is updated and visible, put the old ctx.
12954	 */
12955	put_ctx(old_ctx);
12956}
12957
12958static void __perf_pmu_install(struct perf_event_context *ctx,
12959			       int cpu, struct pmu *pmu, struct list_head *events)
12960{
12961	struct perf_event *event, *tmp;
12962
12963	/*
12964	 * Re-instate events in 2 passes.
12965	 *
12966	 * Skip over group leaders and only install siblings on this first
12967	 * pass: siblings will not get enabled without a leader, but a leader
12968	 * will enable its siblings, even if those are still on the old
12969	 * context.
12970	 */
12971	list_for_each_entry_safe(event, tmp, events, migrate_entry) {
12972		if (event->group_leader == event)
12973			continue;
12974
12975		list_del(&event->migrate_entry);
12976		__perf_pmu_install_event(pmu, ctx, cpu, event);
12977	}
12978
12979	/*
12980	 * Once all the siblings are setup properly, install the group leaders
12981	 * to make it go.
12982	 */
12983	list_for_each_entry_safe(event, tmp, events, migrate_entry) {
12984		list_del(&event->migrate_entry);
12985		__perf_pmu_install_event(pmu, ctx, cpu, event);
12986	}
12987}
12988
12989void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
12990{
12991	struct perf_event_context *src_ctx, *dst_ctx;
12992	LIST_HEAD(events);
12993
12994	/*
12995	 * Since per-cpu context is persistent, no need to grab an extra
12996	 * reference.
12997	 */
12998	src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
12999	dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
13000
13001	/*
13002	 * See perf_event_ctx_lock() for comments on the details
13003	 * of swizzling perf_event::ctx.
13004	 */
13005	mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
13006
13007	__perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
13008	__perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
13009
13010	if (!list_empty(&events)) {
13011		/*
13012		 * Wait for the events to quiesce before re-instating them.
13013		 */
13014		synchronize_rcu();
13015
13016		__perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
13017	}
13018
13019	mutex_unlock(&dst_ctx->mutex);
13020	mutex_unlock(&src_ctx->mutex);
13021}
13022EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
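
/*
 * Uncore-style PMU drivers call this from their CPU hotplug callbacks to move
 * events off a CPU that is going away. A hedged sketch (the driver state and
 * target-selection policy below are illustrative only):
 *
 *	static int example_pmu_offline_cpu(unsigned int cpu)
 *	{
 *		int target;
 *
 *		if (cpu != example_pmu_cpu)
 *			return 0;
 *
 *		target = cpumask_any_but(cpu_online_mask, cpu);
 *		if (target < nr_cpu_ids) {
 *			perf_pmu_migrate_context(&example_pmu, cpu, target);
 *			example_pmu_cpu = target;
 *		}
 *		return 0;
 *	}
 */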
13023
13024static void sync_child_event(struct perf_event *child_event)
13025{
13026	struct perf_event *parent_event = child_event->parent;
13027	u64 child_val;
13028
13029	if (child_event->attr.inherit_stat) {
13030		struct task_struct *task = child_event->ctx->task;
13031
13032		if (task && task != TASK_TOMBSTONE)
13033			perf_event_read_event(child_event, task);
13034	}
13035
13036	child_val = perf_event_count(child_event);
13037
13038	/*
13039	 * Add back the child's count to the parent's count:
13040	 */
13041	atomic64_add(child_val, &parent_event->child_count);
13042	atomic64_add(child_event->total_time_enabled,
13043		     &parent_event->child_total_time_enabled);
13044	atomic64_add(child_event->total_time_running,
13045		     &parent_event->child_total_time_running);
13046}
13047
13048static void
13049perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
13050{
13051	struct perf_event *parent_event = event->parent;
13052	unsigned long detach_flags = 0;
13053
13054	if (parent_event) {
13055		/*
13056		 * Do not destroy the 'original' grouping; because of the
13057		 * context switch optimization the original events could've
13058		 * ended up in a random child task.
13059		 *
13060		 * If we were to destroy the original group, all group related
13061		 * operations would cease to function properly after this
13062		 * random child dies.
13063		 *
13064		 * Do destroy all inherited groups, we don't care about those
13065		 * and being thorough is better.
13066		 */
13067		detach_flags = DETACH_GROUP | DETACH_CHILD;
13068		mutex_lock(&parent_event->child_mutex);
13069	}
13070
13071	perf_remove_from_context(event, detach_flags);
13072
13073	raw_spin_lock_irq(&ctx->lock);
13074	if (event->state > PERF_EVENT_STATE_EXIT)
13075		perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
13076	raw_spin_unlock_irq(&ctx->lock);
13077
13078	/*
13079	 * Child events can be freed.
13080	 */
13081	if (parent_event) {
13082		mutex_unlock(&parent_event->child_mutex);
13083		/*
13084		 * Kick perf_poll() for is_event_hup();
13085		 */
13086		perf_event_wakeup(parent_event);
13087		free_event(event);
13088		put_event(parent_event);
13089		return;
13090	}
13091
13092	/*
13093	 * Parent events are governed by their filedesc, retain them.
13094	 */
13095	perf_event_wakeup(event);
13096}
13097
13098static void perf_event_exit_task_context(struct task_struct *child)
13099{
13100	struct perf_event_context *child_ctx, *clone_ctx = NULL;
13101	struct perf_event *child_event, *next;
13102
13103	WARN_ON_ONCE(child != current);
13104
13105	child_ctx = perf_pin_task_context(child);
13106	if (!child_ctx)
13107		return;
13108
13109	/*
13110	 * In order to reduce the amount of trickery in ctx tear-down, we hold
13111	 * ctx::mutex over the entire thing. This serializes against almost
13112	 * everything that wants to access the ctx.
13113	 *
13114	 * The exception is sys_perf_event_open() /
13115	 * perf_event_create_kernel_counter() which does find_get_context()
13116	 * without ctx::mutex (it cannot because of the move_group double mutex
13117	 * lock thing). See the comments in perf_install_in_context().
13118	 */
13119	mutex_lock(&child_ctx->mutex);
13120
13121	/*
13122	 * In a single ctx::lock section, de-schedule the events and detach the
13123	 * context from the task such that we cannot ever get it scheduled back
13124	 * in.
13125	 */
13126	raw_spin_lock_irq(&child_ctx->lock);
13127	task_ctx_sched_out(child_ctx, EVENT_ALL);
13128
13129	/*
13130	 * Now that the context is inactive, destroy the task <-> ctx relation
13131	 * and mark the context dead.
13132	 */
13133	RCU_INIT_POINTER(child->perf_event_ctxp, NULL);
13134	put_ctx(child_ctx); /* cannot be last */
13135	WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
13136	put_task_struct(current); /* cannot be last */
13137
13138	clone_ctx = unclone_ctx(child_ctx);
13139	raw_spin_unlock_irq(&child_ctx->lock);
13140
13141	if (clone_ctx)
13142		put_ctx(clone_ctx);
13143
13144	/*
13145	 * Report the task dead after unscheduling the events so that we
13146	 * won't get any samples after PERF_RECORD_EXIT. We can however still
13147	 * get a few PERF_RECORD_READ events.
13148	 */
13149	perf_event_task(child, child_ctx, 0);
13150
13151	list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
13152		perf_event_exit_event(child_event, child_ctx);
13153
13154	mutex_unlock(&child_ctx->mutex);
13155
13156	put_ctx(child_ctx);
13157}
13158
13159/*
13160 * When a child task exits, feed back event values to parent events.
13161 *
13162 * Can be called with exec_update_lock held when called from
13163 * setup_new_exec().
13164 */
13165void perf_event_exit_task(struct task_struct *child)
13166{
13167	struct perf_event *event, *tmp;
13168
13169	mutex_lock(&child->perf_event_mutex);
13170	list_for_each_entry_safe(event, tmp, &child->perf_event_list,
13171				 owner_entry) {
13172		list_del_init(&event->owner_entry);
13173
13174		/*
13175		 * Ensure the list deletion is visible before we clear
13176		 * the owner, closes a race against perf_release() where
13177		 * we need to serialize on the owner->perf_event_mutex.
13178		 */
13179		smp_store_release(&event->owner, NULL);
13180	}
13181	mutex_unlock(&child->perf_event_mutex);
13182
13183	perf_event_exit_task_context(child);
13184
13185	/*
13186	 * The perf_event_exit_task_context calls perf_event_task
13187	 * with child's task_ctx, which generates EXIT events for
13188	 * child contexts and sets child->perf_event_ctxp to NULL.
13189	 * At this point we need to send EXIT events to cpu contexts.
13190	 */
13191	perf_event_task(child, NULL, 0);
13192}
13193
13194static void perf_free_event(struct perf_event *event,
13195			    struct perf_event_context *ctx)
13196{
13197	struct perf_event *parent = event->parent;
13198
13199	if (WARN_ON_ONCE(!parent))
13200		return;
13201
13202	mutex_lock(&parent->child_mutex);
13203	list_del_init(&event->child_list);
13204	mutex_unlock(&parent->child_mutex);
13205
13206	put_event(parent);
13207
13208	raw_spin_lock_irq(&ctx->lock);
13209	perf_group_detach(event);
13210	list_del_event(event, ctx);
13211	raw_spin_unlock_irq(&ctx->lock);
13212	free_event(event);
13213}
13214
13215/*
13216 * Free a context as created by inheritance by perf_event_init_task() below,
13217 * used by fork() in case of fail.
13218 *
13219 * Even though the task has never lived, the context and events have been
13220 * exposed through the child_list, so we must take care tearing it all down.
13221 */
13222void perf_event_free_task(struct task_struct *task)
13223{
13224	struct perf_event_context *ctx;
13225	struct perf_event *event, *tmp;
13226
13227	ctx = rcu_access_pointer(task->perf_event_ctxp);
13228	if (!ctx)
13229		return;
13230
13231	mutex_lock(&ctx->mutex);
13232	raw_spin_lock_irq(&ctx->lock);
13233	/*
13234	 * Destroy the task <-> ctx relation and mark the context dead.
13235	 *
13236	 * This is important because even though the task hasn't been
13237	 * exposed yet the context has been (through child_list).
13238	 */
13239	RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
13240	WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
13241	put_task_struct(task); /* cannot be last */
13242	raw_spin_unlock_irq(&ctx->lock);
13243
13244
13245	list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
13246		perf_free_event(event, ctx);
13247
13248	mutex_unlock(&ctx->mutex);
13249
13250	/*
13251	 * perf_event_release_kernel() could've stolen some of our
13252	 * child events and still have them on its free_list. In that
13253	 * case we must wait for these events to have been freed (in
13254	 * particular all their references to this task must've been
13255	 * dropped).
13256	 *
13257	 * Without this copy_process() will unconditionally free this
13258	 * task (irrespective of its reference count) and
13259	 * _free_event()'s put_task_struct(event->hw.target) will be a
13260	 * use-after-free.
13261	 *
13262	 * Wait for all events to drop their context reference.
13263	 */
13264	wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
13265	put_ctx(ctx); /* must be last */
13266}
13267
13268void perf_event_delayed_put(struct task_struct *task)
13269{
13270	WARN_ON_ONCE(task->perf_event_ctxp);
13271}
13272
13273struct file *perf_event_get(unsigned int fd)
13274{
13275	struct file *file = fget(fd);
13276	if (!file)
13277		return ERR_PTR(-EBADF);
13278
13279	if (file->f_op != &perf_fops) {
13280		fput(file);
13281		return ERR_PTR(-EBADF);
13282	}
13283
13284	return file;
13285}
13286
13287const struct perf_event *perf_get_event(struct file *file)
13288{
13289	if (file->f_op != &perf_fops)
13290		return ERR_PTR(-EINVAL);
13291
13292	return file->private_data;
13293}
13294
13295const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
13296{
13297	if (!event)
13298		return ERR_PTR(-EINVAL);
13299
13300	return &event->attr;
13301}
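
/*
 * Other subsystems (BPF, for instance) use the helpers above to resolve a
 * user-supplied perf event fd. A hedged sketch:
 *
 *	struct file *file = perf_event_get(fd);
 *	const struct perf_event *event;
 *
 *	if (IS_ERR(file))
 *		return PTR_ERR(file);
 *
 *	event = perf_get_event(file);
 *	if (IS_ERR(event)) {
 *		fput(file);
 *		return PTR_ERR(event);
 *	}
 *	// perf_event_attrs(event) is now valid; the file reference pins the
 *	// event, drop it with fput() when done.
 */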
13302
13303/*
13304 * Inherit an event from parent task to child task.
13305 *
13306 * Returns:
13307 *  - valid pointer on success
13308 *  - NULL for orphaned events
13309 *  - IS_ERR() on error
13310 */
13311static struct perf_event *
13312inherit_event(struct perf_event *parent_event,
13313	      struct task_struct *parent,
13314	      struct perf_event_context *parent_ctx,
13315	      struct task_struct *child,
13316	      struct perf_event *group_leader,
13317	      struct perf_event_context *child_ctx)
13318{
13319	enum perf_event_state parent_state = parent_event->state;
13320	struct perf_event_pmu_context *pmu_ctx;
13321	struct perf_event *child_event;
13322	unsigned long flags;
13323
13324	/*
13325	 * Instead of creating recursive hierarchies of events,
13326	 * we link inherited events back to the original parent,
13327	 * which has a filp for sure, which we use as the reference
13328	 * count:
13329	 */
13330	if (parent_event->parent)
13331		parent_event = parent_event->parent;
13332
13333	child_event = perf_event_alloc(&parent_event->attr,
13334					   parent_event->cpu,
13335					   child,
13336					   group_leader, parent_event,
13337					   NULL, NULL, -1);
13338	if (IS_ERR(child_event))
13339		return child_event;
13340
13341	pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
13342	if (IS_ERR(pmu_ctx)) {
13343		free_event(child_event);
13344		return ERR_CAST(pmu_ctx);
13345	}
13346	child_event->pmu_ctx = pmu_ctx;
13347
13348	/*
13349	 * is_orphaned_event() and list_add_tail(&parent_event->child_list)
13350	 * must be under the same lock in order to serialize against
13351	 * perf_event_release_kernel(), such that either we must observe
13352	 * is_orphaned_event() or they will observe us on the child_list.
13353	 */
13354	mutex_lock(&parent_event->child_mutex);
13355	if (is_orphaned_event(parent_event) ||
13356	    !atomic_long_inc_not_zero(&parent_event->refcount)) {
13357		mutex_unlock(&parent_event->child_mutex);
13358		/* task_ctx_data is freed with child_ctx */
13359		free_event(child_event);
13360		return NULL;
13361	}
13362
13363	get_ctx(child_ctx);
13364
13365	/*
13366	 * Make the child state follow the state of the parent event,
13367	 * not its attr.disabled bit.  We hold the parent's mutex,
13368	 * so we won't race with perf_event_{en, dis}able_family.
13369	 */
13370	if (parent_state >= PERF_EVENT_STATE_INACTIVE)
13371		child_event->state = PERF_EVENT_STATE_INACTIVE;
13372	else
13373		child_event->state = PERF_EVENT_STATE_OFF;
13374
13375	if (parent_event->attr.freq) {
13376		u64 sample_period = parent_event->hw.sample_period;
13377		struct hw_perf_event *hwc = &child_event->hw;
13378
13379		hwc->sample_period = sample_period;
13380		hwc->last_period   = sample_period;
13381
13382		local64_set(&hwc->period_left, sample_period);
13383	}
13384
13385	child_event->ctx = child_ctx;
13386	child_event->overflow_handler = parent_event->overflow_handler;
13387	child_event->overflow_handler_context
13388		= parent_event->overflow_handler_context;
13389
13390	/*
13391	 * Precalculate sample_data sizes
13392	 */
13393	perf_event__header_size(child_event);
13394	perf_event__id_header_size(child_event);
13395
13396	/*
13397	 * Link it up in the child's context:
13398	 */
13399	raw_spin_lock_irqsave(&child_ctx->lock, flags);
13400	add_event_to_ctx(child_event, child_ctx);
13401	child_event->attach_state |= PERF_ATTACH_CHILD;
13402	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
13403
13404	/*
13405	 * Link this into the parent event's child list
13406	 */
13407	list_add_tail(&child_event->child_list, &parent_event->child_list);
13408	mutex_unlock(&parent_event->child_mutex);
13409
13410	return child_event;
13411}
13412
13413/*
13414 * Inherits an event group.
13415 *
13416 * This will quietly suppress orphaned events; !inherit_event() is not an error.
13417 * This matches with perf_event_release_kernel() removing all child events.
13418 *
13419 * Returns:
13420 *  - 0 on success
13421 *  - <0 on error
13422 */
13423static int inherit_group(struct perf_event *parent_event,
13424	      struct task_struct *parent,
13425	      struct perf_event_context *parent_ctx,
13426	      struct task_struct *child,
13427	      struct perf_event_context *child_ctx)
13428{
13429	struct perf_event *leader;
13430	struct perf_event *sub;
13431	struct perf_event *child_ctr;
13432
13433	leader = inherit_event(parent_event, parent, parent_ctx,
13434				 child, NULL, child_ctx);
13435	if (IS_ERR(leader))
13436		return PTR_ERR(leader);
13437	/*
13438	 * @leader can be NULL here because of is_orphaned_event(). In this
13439	 * case inherit_event() will create individual events, similar to what
13440	 * perf_group_detach() would do anyway.
13441	 */
13442	for_each_sibling_event(sub, parent_event) {
13443		child_ctr = inherit_event(sub, parent, parent_ctx,
13444					    child, leader, child_ctx);
13445		if (IS_ERR(child_ctr))
13446			return PTR_ERR(child_ctr);
13447
13448		if (sub->aux_event == parent_event && child_ctr &&
13449		    !perf_get_aux_event(child_ctr, leader))
13450			return -EINVAL;
13451	}
13452	if (leader)
13453		leader->group_generation = parent_event->group_generation;
13454	return 0;
13455}
13456
13457/*
13458 * Creates the child task context and tries to inherit the event-group.
13459 *
13460 * Clears @inherited_all on !attr.inherit or error. Note that we'll leave
13461 * inherited_all set when we 'fail' to inherit an orphaned event; this is
13462 * consistent with perf_event_release_kernel() removing all child events.
13463 *
13464 * Returns:
13465 *  - 0 on success
13466 *  - <0 on error
13467 */
13468static int
13469inherit_task_group(struct perf_event *event, struct task_struct *parent,
13470		   struct perf_event_context *parent_ctx,
13471		   struct task_struct *child,
13472		   u64 clone_flags, int *inherited_all)
13473{
13474	struct perf_event_context *child_ctx;
13475	int ret;
13476
13477	if (!event->attr.inherit ||
13478	    (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
13479	    /* Do not inherit if sigtrap and signal handlers were cleared. */
13480	    (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) {
13481		*inherited_all = 0;
13482		return 0;
13483	}
13484
13485	child_ctx = child->perf_event_ctxp;
13486	if (!child_ctx) {
13487		/*
13488		 * This is executed from the parent task context, so
13489		 * inherit events that have been marked for cloning.
13490		 * First allocate and initialize a context for the
13491		 * child.
13492		 */
13493		child_ctx = alloc_perf_context(child);
13494		if (!child_ctx)
13495			return -ENOMEM;
13496
13497		child->perf_event_ctxp = child_ctx;
13498	}
13499
13500	ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
13501	if (ret)
13502		*inherited_all = 0;
13503
13504	return ret;
13505}
13506
13507/*
13508 * Initialize the perf_event context in task_struct
13509 */
13510static int perf_event_init_context(struct task_struct *child, u64 clone_flags)
13511{
13512	struct perf_event_context *child_ctx, *parent_ctx;
13513	struct perf_event_context *cloned_ctx;
13514	struct perf_event *event;
13515	struct task_struct *parent = current;
13516	int inherited_all = 1;
13517	unsigned long flags;
13518	int ret = 0;
13519
13520	if (likely(!parent->perf_event_ctxp))
13521		return 0;
13522
13523	/*
13524	 * If the parent's context is a clone, pin it so it won't get
13525	 * swapped under us.
13526	 */
13527	parent_ctx = perf_pin_task_context(parent);
13528	if (!parent_ctx)
13529		return 0;
13530
13531	/*
13532	 * No need to check if parent_ctx != NULL here; since we saw
13533	 * it non-NULL earlier, the only reason for it to become NULL
13534	 * is if we exit, and since we're currently in the middle of
13535	 * a fork we can't be exiting at the same time.
13536	 */
13537
13538	/*
13539	 * Lock the parent list. No need to lock the child - not PID
13540	 * hashed yet and not running, so nobody can access it.
13541	 */
13542	mutex_lock(&parent_ctx->mutex);
13543
13544	/*
13545	 * We don't have to disable NMIs - we are only looking at
13546	 * the list, not manipulating it:
13547	 */
13548	perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
13549		ret = inherit_task_group(event, parent, parent_ctx,
13550					 child, clone_flags, &inherited_all);
13551		if (ret)
13552			goto out_unlock;
13553	}
13554
13555	/*
13556	 * We can't hold ctx->lock when iterating the ->flexible_groups list due
13557	 * to allocations, but we need to prevent rotation because
13558	 * rotate_ctx() will change the list from interrupt context.
13559	 */
13560	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
13561	parent_ctx->rotate_disable = 1;
13562	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
13563
13564	perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
13565		ret = inherit_task_group(event, parent, parent_ctx,
13566					 child, clone_flags, &inherited_all);
13567		if (ret)
13568			goto out_unlock;
13569	}
13570
13571	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
13572	parent_ctx->rotate_disable = 0;
13573
13574	child_ctx = child->perf_event_ctxp;
13575
13576	if (child_ctx && inherited_all) {
13577		/*
13578		 * Mark the child context as a clone of the parent
13579		 * context, or of whatever the parent is a clone of.
13580		 *
13581		 * Note that if the parent is a clone, the holding of
13582		 * parent_ctx->lock avoids it from being uncloned.
13583		 */
13584		cloned_ctx = parent_ctx->parent_ctx;
13585		if (cloned_ctx) {
13586			child_ctx->parent_ctx = cloned_ctx;
13587			child_ctx->parent_gen = parent_ctx->parent_gen;
13588		} else {
13589			child_ctx->parent_ctx = parent_ctx;
13590			child_ctx->parent_gen = parent_ctx->generation;
13591		}
13592		get_ctx(child_ctx->parent_ctx);
13593	}
13594
13595	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
13596out_unlock:
13597	mutex_unlock(&parent_ctx->mutex);
13598
13599	perf_unpin_context(parent_ctx);
13600	put_ctx(parent_ctx);
13601
13602	return ret;
13603}
13604
13605/*
13606 * Initialize the perf_event context in task_struct
13607 */
13608int perf_event_init_task(struct task_struct *child, u64 clone_flags)
13609{
13610	int ret;
13611
13612	child->perf_event_ctxp = NULL;
13613	mutex_init(&child->perf_event_mutex);
13614	INIT_LIST_HEAD(&child->perf_event_list);
13615
13616	ret = perf_event_init_context(child, clone_flags);
13617	if (ret) {
13618		perf_event_free_task(child);
13619		return ret;
13620	}
13621
13622	return 0;
13623}
13624
13625static void __init perf_event_init_all_cpus(void)
13626{
13627	struct swevent_htable *swhash;
13628	struct perf_cpu_context *cpuctx;
13629	int cpu;
13630
13631	zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
13632
13633	for_each_possible_cpu(cpu) {
13634		swhash = &per_cpu(swevent_htable, cpu);
13635		mutex_init(&swhash->hlist_mutex);
13636
13637		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
13638		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
13639
13640		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
13641
13642		cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
13643		__perf_event_init_context(&cpuctx->ctx);
13644		lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
13645		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
13646		cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
13647		cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
13648		cpuctx->heap = cpuctx->heap_default;
13649	}
13650}
13651
13652static void perf_swevent_init_cpu(unsigned int cpu)
13653{
13654	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
13655
13656	mutex_lock(&swhash->hlist_mutex);
13657	if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
13658		struct swevent_hlist *hlist;
13659
13660		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
13661		WARN_ON(!hlist);
13662		rcu_assign_pointer(swhash->swevent_hlist, hlist);
13663	}
13664	mutex_unlock(&swhash->hlist_mutex);
13665}
13666
13667#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
13668static void __perf_event_exit_context(void *__info)
13669{
13670	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
13671	struct perf_event_context *ctx = __info;
13672	struct perf_event *event;
13673
13674	raw_spin_lock(&ctx->lock);
13675	ctx_sched_out(ctx, EVENT_TIME);
13676	list_for_each_entry(event, &ctx->event_list, event_entry)
13677		__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
13678	raw_spin_unlock(&ctx->lock);
13679}
13680
13681static void perf_event_exit_cpu_context(int cpu)
13682{
13683	struct perf_cpu_context *cpuctx;
13684	struct perf_event_context *ctx;
13685
13686	// XXX simplify cpuctx->online
13687	mutex_lock(&pmus_lock);
13688	cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
13689	ctx = &cpuctx->ctx;
13690
13691	mutex_lock(&ctx->mutex);
13692	smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
13693	cpuctx->online = 0;
13694	mutex_unlock(&ctx->mutex);
13695	cpumask_clear_cpu(cpu, perf_online_mask);
13696	mutex_unlock(&pmus_lock);
13697}
13698#else
13699
13700static void perf_event_exit_cpu_context(int cpu) { }
13701
13702#endif
13703
13704int perf_event_init_cpu(unsigned int cpu)
13705{
13706	struct perf_cpu_context *cpuctx;
13707	struct perf_event_context *ctx;
13708
13709	perf_swevent_init_cpu(cpu);
13710
13711	mutex_lock(&pmus_lock);
13712	cpumask_set_cpu(cpu, perf_online_mask);
13713	cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
13714	ctx = &cpuctx->ctx;
13715
13716	mutex_lock(&ctx->mutex);
13717	cpuctx->online = 1;
13718	mutex_unlock(&ctx->mutex);
13719	mutex_unlock(&pmus_lock);
13720
13721	return 0;
13722}
13723
13724int perf_event_exit_cpu(unsigned int cpu)
13725{
13726	perf_event_exit_cpu_context(cpu);
13727	return 0;
13728}
13729
13730static int
13731perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
13732{
13733	int cpu;
13734
13735	for_each_online_cpu(cpu)
13736		perf_event_exit_cpu(cpu);
13737
13738	return NOTIFY_OK;
13739}
13740
13741/*
13742 * Run the perf reboot notifier at the very last possible moment so that
13743 * the generic watchdog code runs as long as possible.
13744 */
13745static struct notifier_block perf_reboot_notifier = {
13746	.notifier_call = perf_reboot,
13747	.priority = INT_MIN,
13748};
13749
13750void __init perf_event_init(void)
13751{
13752	int ret;
13753
13754	idr_init(&pmu_idr);
13755
13756	perf_event_init_all_cpus();
13757	init_srcu_struct(&pmus_srcu);
13758	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
13759	perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1);
13760	perf_pmu_register(&perf_task_clock, "task_clock", -1);
13761	perf_tp_register();
13762	perf_event_init_cpu(smp_processor_id());
13763	register_reboot_notifier(&perf_reboot_notifier);
13764
13765	ret = init_hw_breakpoint();
13766	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
13767
13768	perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC);
13769
13770	/*
13771	 * Build time assertion that we keep the data_head at the intended
13772	 * location.  IOW, validation that we got the __reserved[] size right.
13773	 */
13774	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
13775		     != 1024);
13776}
13777
13778ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
13779			      char *page)
13780{
13781	struct perf_pmu_events_attr *pmu_attr =
13782		container_of(attr, struct perf_pmu_events_attr, attr);
13783
13784	if (pmu_attr->event_str)
13785		return sprintf(page, "%s\n", pmu_attr->event_str);
13786
13787	return 0;
13788}
13789EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
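
/*
 * PMU drivers typically wire this up as the show method of their sysfs event
 * attributes; a hedged sketch using struct perf_pmu_events_attr directly
 * (names are illustrative only):
 *
 *	static struct perf_pmu_events_attr example_attr_cycles = {
 *		.attr		= __ATTR(cycles, 0444, perf_event_sysfs_show, NULL),
 *		.event_str	= "event=0x3c",
 *	};
 *
 * Reading the resulting sysfs file then returns "event=0x3c" for tools to
 * parse.
 */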
13790
13791static int __init perf_event_sysfs_init(void)
13792{
13793	struct pmu *pmu;
13794	int ret;
13795
13796	mutex_lock(&pmus_lock);
13797
13798	ret = bus_register(&pmu_bus);
13799	if (ret)
13800		goto unlock;
13801
13802	list_for_each_entry(pmu, &pmus, entry) {
13803		if (pmu->dev)
13804			continue;
13805
13806		ret = pmu_dev_alloc(pmu);
13807		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
13808	}
13809	pmu_bus_running = 1;
13810	ret = 0;
13811
13812unlock:
13813	mutex_unlock(&pmus_lock);
13814
13815	return ret;
13816}
13817device_initcall(perf_event_sysfs_init);
13818
13819#ifdef CONFIG_CGROUP_PERF
13820static struct cgroup_subsys_state *
13821perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
13822{
13823	struct perf_cgroup *jc;
13824
13825	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
13826	if (!jc)
13827		return ERR_PTR(-ENOMEM);
13828
13829	jc->info = alloc_percpu(struct perf_cgroup_info);
13830	if (!jc->info) {
13831		kfree(jc);
13832		return ERR_PTR(-ENOMEM);
13833	}
13834
13835	return &jc->css;
13836}
13837
13838static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
13839{
13840	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
13841
13842	free_percpu(jc->info);
13843	kfree(jc);
13844}
13845
13846static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
13847{
13848	perf_event_cgroup(css->cgroup);
13849	return 0;
13850}
13851
13852static int __perf_cgroup_move(void *info)
13853{
13854	struct task_struct *task = info;
13855
13856	preempt_disable();
13857	perf_cgroup_switch(task);
13858	preempt_enable();
13859
13860	return 0;
13861}
13862
13863static void perf_cgroup_attach(struct cgroup_taskset *tset)
13864{
13865	struct task_struct *task;
13866	struct cgroup_subsys_state *css;
13867
13868	cgroup_taskset_for_each(task, css, tset)
13869		task_function_call(task, __perf_cgroup_move, task);
13870}
13871
13872struct cgroup_subsys perf_event_cgrp_subsys = {
13873	.css_alloc	= perf_cgroup_css_alloc,
13874	.css_free	= perf_cgroup_css_free,
13875	.css_online	= perf_cgroup_css_online,
13876	.attach		= perf_cgroup_attach,
13877	/*
13878	 * Implicitly enable on dfl hierarchy so that perf events can
13879	 * always be filtered by cgroup2 path as long as perf_event
13880	 * controller is not mounted on a legacy hierarchy.
13881	 */
13882	.implicit_on_dfl = true,
13883	.threaded	= true,
13884};
13885#endif /* CONFIG_CGROUP_PERF */
13886
13887DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
13888