1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Performance events core code:
4 *
5 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
6 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
7 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
9 */
10
11#include <linux/fs.h>
12#include <linux/mm.h>
13#include <linux/cpu.h>
14#include <linux/smp.h>
15#include <linux/idr.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/slab.h>
19#include <linux/hash.h>
20#include <linux/tick.h>
21#include <linux/sysfs.h>
22#include <linux/dcache.h>
23#include <linux/percpu.h>
24#include <linux/ptrace.h>
25#include <linux/reboot.h>
26#include <linux/vmstat.h>
27#include <linux/device.h>
28#include <linux/export.h>
29#include <linux/vmalloc.h>
30#include <linux/hardirq.h>
31#include <linux/hugetlb.h>
32#include <linux/rculist.h>
33#include <linux/uaccess.h>
34#include <linux/syscalls.h>
35#include <linux/anon_inodes.h>
36#include <linux/kernel_stat.h>
37#include <linux/cgroup.h>
38#include <linux/perf_event.h>
39#include <linux/trace_events.h>
40#include <linux/hw_breakpoint.h>
41#include <linux/mm_types.h>
42#include <linux/module.h>
43#include <linux/mman.h>
44#include <linux/compat.h>
45#include <linux/bpf.h>
46#include <linux/filter.h>
47#include <linux/namei.h>
48#include <linux/parser.h>
49#include <linux/sched/clock.h>
50#include <linux/sched/mm.h>
51#include <linux/proc_ns.h>
52#include <linux/mount.h>
53#include <linux/min_heap.h>
54#include <linux/highmem.h>
55#include <linux/pgtable.h>
56#include <linux/buildid.h>
57
58#include "internal.h"
59
60#include <asm/irq_regs.h>
61
62typedef int (*remote_function_f)(void *);
63
64struct remote_function_call {
65	struct task_struct	*p;
66	remote_function_f	func;
67	void			*info;
68	int			ret;
69};
70
71static void remote_function(void *data)
72{
73	struct remote_function_call *tfc = data;
74	struct task_struct *p = tfc->p;
75
76	if (p) {
77		/* -EAGAIN */
78		if (task_cpu(p) != smp_processor_id())
79			return;
80
81		/*
82		 * Now that we're on right CPU with IRQs disabled, we can test
83		 * if we hit the right task without races.
84		 */
85
86		tfc->ret = -ESRCH; /* No such (running) process */
87		if (p != current)
88			return;
89	}
90
91	tfc->ret = tfc->func(tfc->info);
92}
93
94/**
95 * task_function_call - call a function on the cpu on which a task runs
96 * @p:		the task to evaluate
97 * @func:	the function to be called
98 * @info:	the function call argument
99 *
100 * Calls the function @func when the task is currently running. This might
101 * be on the current CPU, which just calls the function directly.  This will
102 * retry due to any failures in smp_call_function_single(), such as if the
103 * task_cpu() goes offline concurrently.
104 *
 * Return: @func's return value, or -ESRCH/-ENXIO when the process isn't running.
106 */
107static int
108task_function_call(struct task_struct *p, remote_function_f func, void *info)
109{
110	struct remote_function_call data = {
111		.p	= p,
112		.func	= func,
113		.info	= info,
114		.ret	= -EAGAIN,
115	};
116	int ret;
117
118	for (;;) {
119		ret = smp_call_function_single(task_cpu(p), remote_function,
120					       &data, 1);
121		if (!ret)
122			ret = data.ret;
123
124		if (ret != -EAGAIN)
125			break;
126
127		cond_resched();
128	}
129
130	return ret;
131}
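
/*
 * Illustrative sketch (hypothetical callback, not part of this file): the
 * callback passed to task_function_call() runs on @p's CPU with IRQs
 * disabled, and only once we know @p is current there, so it can safely
 * inspect per-task PMU state:
 *
 *	static int poke_task(void *info)
 *	{
 *		struct my_cookie *c = info;	// hypothetical type
 *		...				// IRQs off, p == current here
 *		return 0;
 *	}
 *
 *	err = task_function_call(p, poke_task, &cookie);
 */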
132
133/**
134 * cpu_function_call - call a function on the cpu
135 * @cpu:	target cpu to queue this function
136 * @func:	the function to be called
137 * @info:	the function call argument
138 *
139 * Calls the function @func on the remote cpu.
140 *
 * Return: @func's return value, or -ENXIO when the cpu is offline.
142 */
143static int cpu_function_call(int cpu, remote_function_f func, void *info)
144{
145	struct remote_function_call data = {
146		.p	= NULL,
147		.func	= func,
148		.info	= info,
149		.ret	= -ENXIO, /* No such CPU */
150	};
151
152	smp_call_function_single(cpu, remote_function, &data, 1);
153
154	return data.ret;
155}
156
157static inline struct perf_cpu_context *
158__get_cpu_context(struct perf_event_context *ctx)
159{
160	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
161}
162
163static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
164			  struct perf_event_context *ctx)
165{
166	raw_spin_lock(&cpuctx->ctx.lock);
167	if (ctx)
168		raw_spin_lock(&ctx->lock);
169}
170
171static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
172			    struct perf_event_context *ctx)
173{
174	if (ctx)
175		raw_spin_unlock(&ctx->lock);
176	raw_spin_unlock(&cpuctx->ctx.lock);
177}
178
179#define TASK_TOMBSTONE ((void *)-1L)
180
181static bool is_kernel_event(struct perf_event *event)
182{
183	return READ_ONCE(event->owner) == TASK_TOMBSTONE;
184}
185
186/*
187 * On task ctx scheduling...
188 *
189 * When !ctx->nr_events a task context will not be scheduled. This means
190 * we can disable the scheduler hooks (for performance) without leaving
191 * pending task ctx state.
192 *
193 * This however results in two special cases:
194 *
 *  - removing the last event from a task ctx; this is relatively
 *    straightforward and is done in __perf_remove_from_context().
197 *
198 *  - adding the first event to a task ctx; this is tricky because we cannot
199 *    rely on ctx->is_active and therefore cannot use event_function_call().
200 *    See perf_install_in_context().
201 *
202 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
203 */
204
205typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
206			struct perf_event_context *, void *);
207
208struct event_function_struct {
209	struct perf_event *event;
210	event_f func;
211	void *data;
212};
213
214static int event_function(void *info)
215{
216	struct event_function_struct *efs = info;
217	struct perf_event *event = efs->event;
218	struct perf_event_context *ctx = event->ctx;
219	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
220	struct perf_event_context *task_ctx = cpuctx->task_ctx;
221	int ret = 0;
222
223	lockdep_assert_irqs_disabled();
224
225	perf_ctx_lock(cpuctx, task_ctx);
226	/*
227	 * Since we do the IPI call without holding ctx->lock things can have
228	 * changed, double check we hit the task we set out to hit.
229	 */
230	if (ctx->task) {
231		if (ctx->task != current) {
232			ret = -ESRCH;
233			goto unlock;
234		}
235
236		/*
237		 * We only use event_function_call() on established contexts,
238		 * and event_function() is only ever called when active (or
239		 * rather, we'll have bailed in task_function_call() or the
240		 * above ctx->task != current test), therefore we must have
241		 * ctx->is_active here.
242		 */
243		WARN_ON_ONCE(!ctx->is_active);
244		/*
245		 * And since we have ctx->is_active, cpuctx->task_ctx must
246		 * match.
247		 */
248		WARN_ON_ONCE(task_ctx != ctx);
249	} else {
250		WARN_ON_ONCE(&cpuctx->ctx != ctx);
251	}
252
253	efs->func(event, cpuctx, ctx, efs->data);
254unlock:
255	perf_ctx_unlock(cpuctx, task_ctx);
256
257	return ret;
258}
259
260static void event_function_call(struct perf_event *event, event_f func, void *data)
261{
262	struct perf_event_context *ctx = event->ctx;
263	struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
264	struct event_function_struct efs = {
265		.event = event,
266		.func = func,
267		.data = data,
268	};
269
270	if (!event->parent) {
271		/*
272		 * If this is a !child event, we must hold ctx::mutex to
273		 * stabilize the event->ctx relation. See
274		 * perf_event_ctx_lock().
275		 */
276		lockdep_assert_held(&ctx->mutex);
277	}
278
279	if (!task) {
280		cpu_function_call(event->cpu, event_function, &efs);
281		return;
282	}
283
284	if (task == TASK_TOMBSTONE)
285		return;
286
287again:
288	if (!task_function_call(task, event_function, &efs))
289		return;
290
291	raw_spin_lock_irq(&ctx->lock);
292	/*
293	 * Reload the task pointer, it might have been changed by
294	 * a concurrent perf_event_context_sched_out().
295	 */
296	task = ctx->task;
297	if (task == TASK_TOMBSTONE) {
298		raw_spin_unlock_irq(&ctx->lock);
299		return;
300	}
301	if (ctx->is_active) {
302		raw_spin_unlock_irq(&ctx->lock);
303		goto again;
304	}
305	func(event, NULL, ctx, data);
306	raw_spin_unlock_irq(&ctx->lock);
307}
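
/*
 * Illustrative sketch of an event_f callback (hypothetical, modelled on the
 * __perf_event_*() callbacks further down): it must handle both invocation
 * paths of event_function_call() -- via IPI with @cpuctx set and the context
 * active, or directly under ctx->lock with @cpuctx == NULL when the context
 * is inactive:
 *
 *	static void __do_thing(struct perf_event *event,
 *			       struct perf_cpu_context *cpuctx,
 *			       struct perf_event_context *ctx, void *data)
 *	{
 *		...	// ctx->lock is held in both cases
 *	}
 *
 *	event_function_call(event, __do_thing, NULL);
 */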
308
309/*
310 * Similar to event_function_call() + event_function(), but hard assumes IRQs
311 * are already disabled and we're on the right CPU.
312 */
313static void event_function_local(struct perf_event *event, event_f func, void *data)
314{
315	struct perf_event_context *ctx = event->ctx;
316	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
317	struct task_struct *task = READ_ONCE(ctx->task);
318	struct perf_event_context *task_ctx = NULL;
319
320	lockdep_assert_irqs_disabled();
321
322	if (task) {
323		if (task == TASK_TOMBSTONE)
324			return;
325
326		task_ctx = ctx;
327	}
328
329	perf_ctx_lock(cpuctx, task_ctx);
330
331	task = ctx->task;
332	if (task == TASK_TOMBSTONE)
333		goto unlock;
334
335	if (task) {
336		/*
337		 * We must be either inactive or active and the right task,
338		 * otherwise we're screwed, since we cannot IPI to somewhere
339		 * else.
340		 */
341		if (ctx->is_active) {
342			if (WARN_ON_ONCE(task != current))
343				goto unlock;
344
345			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
346				goto unlock;
347		}
348	} else {
349		WARN_ON_ONCE(&cpuctx->ctx != ctx);
350	}
351
352	func(event, cpuctx, ctx, data);
353unlock:
354	perf_ctx_unlock(cpuctx, task_ctx);
355}
356
357#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
358		       PERF_FLAG_FD_OUTPUT  |\
359		       PERF_FLAG_PID_CGROUP |\
360		       PERF_FLAG_FD_CLOEXEC)
361
362/*
363 * branch priv levels that need permission checks
364 */
365#define PERF_SAMPLE_BRANCH_PERM_PLM \
366	(PERF_SAMPLE_BRANCH_KERNEL |\
367	 PERF_SAMPLE_BRANCH_HV)
368
369enum event_type_t {
370	EVENT_FLEXIBLE = 0x1,
371	EVENT_PINNED = 0x2,
372	EVENT_TIME = 0x4,
373	/* see ctx_resched() for details */
374	EVENT_CPU = 0x8,
375	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
376};
377
378/*
379 * perf_sched_events : >0 events exist
380 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
381 */
382
383static void perf_sched_delayed(struct work_struct *work);
384DEFINE_STATIC_KEY_FALSE(perf_sched_events);
385static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
386static DEFINE_MUTEX(perf_sched_mutex);
387static atomic_t perf_sched_count;
388
389static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
390static DEFINE_PER_CPU(int, perf_sched_cb_usages);
391static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
392
393static atomic_t nr_mmap_events __read_mostly;
394static atomic_t nr_comm_events __read_mostly;
395static atomic_t nr_namespaces_events __read_mostly;
396static atomic_t nr_task_events __read_mostly;
397static atomic_t nr_freq_events __read_mostly;
398static atomic_t nr_switch_events __read_mostly;
399static atomic_t nr_ksymbol_events __read_mostly;
400static atomic_t nr_bpf_events __read_mostly;
401static atomic_t nr_cgroup_events __read_mostly;
402static atomic_t nr_text_poke_events __read_mostly;
403static atomic_t nr_build_id_events __read_mostly;
404
405static LIST_HEAD(pmus);
406static DEFINE_MUTEX(pmus_lock);
407static struct srcu_struct pmus_srcu;
408static cpumask_var_t perf_online_mask;
409static struct kmem_cache *perf_event_cache;
410
411/*
412 * perf event paranoia level:
413 *  -1 - not paranoid at all
414 *   0 - disallow raw tracepoint access for unpriv
415 *   1 - disallow cpu events for unpriv
416 *   2 - disallow kernel profiling for unpriv
417 */
418int sysctl_perf_event_paranoid __read_mostly = 2;
419
420/* Minimum for 512 kiB + 1 user control page */
421int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
422
423/*
424 * max perf event sample rate
425 */
426#define DEFAULT_MAX_SAMPLE_RATE		100000
427#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
428#define DEFAULT_CPU_TIME_MAX_PERCENT	25
429
430int sysctl_perf_event_sample_rate __read_mostly	= DEFAULT_MAX_SAMPLE_RATE;
431
432static int max_samples_per_tick __read_mostly	= DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
433static int perf_sample_period_ns __read_mostly	= DEFAULT_SAMPLE_PERIOD_NS;
434
435static int perf_sample_allowed_ns __read_mostly =
436	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
437
438static void update_perf_cpu_limits(void)
439{
440	u64 tmp = perf_sample_period_ns;
441
442	tmp *= sysctl_perf_cpu_time_max_percent;
443	tmp = div_u64(tmp, 100);
444	if (!tmp)
445		tmp = 1;
446
447	WRITE_ONCE(perf_sample_allowed_ns, tmp);
448}
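
/*
 * Worked example (illustrative): with the defaults above,
 * perf_sample_period_ns = NSEC_PER_SEC / 100000 = 10000ns and
 * sysctl_perf_cpu_time_max_percent = 25, so update_perf_cpu_limits()
 * yields perf_sample_allowed_ns = 10000 * 25 / 100 = 2500ns, i.e. each
 * sample may consume ~2500ns on average before the rate is lowered.
 */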
449
450static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
451
452int perf_proc_update_handler(struct ctl_table *table, int write,
453		void *buffer, size_t *lenp, loff_t *ppos)
454{
455	int ret;
456	int perf_cpu = sysctl_perf_cpu_time_max_percent;
457	/*
458	 * If throttling is disabled don't allow the write:
459	 */
460	if (write && (perf_cpu == 100 || perf_cpu == 0))
461		return -EINVAL;
462
463	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
464	if (ret || !write)
465		return ret;
466
467	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
468	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
469	update_perf_cpu_limits();
470
471	return 0;
472}
473
474int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
475
476int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
477		void *buffer, size_t *lenp, loff_t *ppos)
478{
479	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
480
481	if (ret || !write)
482		return ret;
483
484	if (sysctl_perf_cpu_time_max_percent == 100 ||
485	    sysctl_perf_cpu_time_max_percent == 0) {
486		printk(KERN_WARNING
487		       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
488		WRITE_ONCE(perf_sample_allowed_ns, 0);
489	} else {
490		update_perf_cpu_limits();
491	}
492
493	return 0;
494}
495
496/*
497 * perf samples are done in some very critical code paths (NMIs).
498 * If they take too much CPU time, the system can lock up and not
499 * get any real work done.  This will drop the sample rate when
500 * we detect that events are taking too long.
501 */
502#define NR_ACCUMULATED_SAMPLES 128
503static DEFINE_PER_CPU(u64, running_sample_length);
504
505static u64 __report_avg;
506static u64 __report_allowed;
507
508static void perf_duration_warn(struct irq_work *w)
509{
510	printk_ratelimited(KERN_INFO
511		"perf: interrupt took too long (%lld > %lld), lowering "
512		"kernel.perf_event_max_sample_rate to %d\n",
513		__report_avg, __report_allowed,
514		sysctl_perf_event_sample_rate);
515}
516
517static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
518
519void perf_sample_event_took(u64 sample_len_ns)
520{
521	u64 max_len = READ_ONCE(perf_sample_allowed_ns);
522	u64 running_len;
523	u64 avg_len;
524	u32 max;
525
526	if (max_len == 0)
527		return;
528
529	/* Decay the counter by 1 average sample. */
530	running_len = __this_cpu_read(running_sample_length);
531	running_len -= running_len/NR_ACCUMULATED_SAMPLES;
532	running_len += sample_len_ns;
533	__this_cpu_write(running_sample_length, running_len);
534
535	/*
	 * Note: this will be biased artificially low until we have
537	 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
538	 * from having to maintain a count.
539	 */
540	avg_len = running_len/NR_ACCUMULATED_SAMPLES;
541	if (avg_len <= max_len)
542		return;
543
544	__report_avg = avg_len;
545	__report_allowed = max_len;
546
547	/*
548	 * Compute a throttle threshold 25% below the current duration.
549	 */
550	avg_len += avg_len / 4;
551	max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
552	if (avg_len < max)
553		max /= (u32)avg_len;
554	else
555		max = 1;
556
557	WRITE_ONCE(perf_sample_allowed_ns, avg_len);
558	WRITE_ONCE(max_samples_per_tick, max);
559
560	sysctl_perf_event_sample_rate = max * HZ;
561	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
562
563	if (!irq_work_queue(&perf_duration_work)) {
564		early_printk("perf: interrupt took too long (%lld > %lld), lowering "
565			     "kernel.perf_event_max_sample_rate to %d\n",
566			     __report_avg, __report_allowed,
567			     sysctl_perf_event_sample_rate);
568	}
569}
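
/*
 * Worked example (illustrative, assuming HZ=1000 and the defaults above):
 * if the decayed average sample length reaches avg_len = 3000ns while
 * perf_sample_allowed_ns = 2500ns, the limit is exceeded.  The new budget
 * becomes avg_len += avg_len/4 = 3750ns, max = (TICK_NSEC/100) * 25 =
 * 250000ns per tick, so max_samples_per_tick = 250000 / 3750 = 66 and
 * kernel.perf_event_max_sample_rate drops to 66 * HZ = 66000.
 */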
570
571static atomic64_t perf_event_id;
572
573static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
574			      enum event_type_t event_type);
575
576static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
577			     enum event_type_t event_type);
578
579static void update_context_time(struct perf_event_context *ctx);
580static u64 perf_event_time(struct perf_event *event);
581
582void __weak perf_event_print_debug(void)	{ }
583
584static inline u64 perf_clock(void)
585{
586	return local_clock();
587}
588
589static inline u64 perf_event_clock(struct perf_event *event)
590{
591	return event->clock();
592}
593
594/*
595 * State based event timekeeping...
596 *
597 * The basic idea is to use event->state to determine which (if any) time
598 * fields to increment with the current delta. This means we only need to
599 * update timestamps when we change state or when they are explicitly requested
600 * (read).
601 *
602 * Event groups make things a little more complicated, but not terribly so. The
603 * rules for a group are that if the group leader is OFF the entire group is
 * OFF, irrespective of what the group member states are. This results in
605 * __perf_effective_state().
606 *
 * A further ramification is that when a group leader flips between OFF and
608 * !OFF, we need to update all group member times.
609 *
610 *
611 * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
612 * need to make sure the relevant context time is updated before we try and
613 * update our timestamps.
614 */
615
616static __always_inline enum perf_event_state
617__perf_effective_state(struct perf_event *event)
618{
619	struct perf_event *leader = event->group_leader;
620
621	if (leader->state <= PERF_EVENT_STATE_OFF)
622		return leader->state;
623
624	return event->state;
625}
626
627static __always_inline void
628__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
629{
630	enum perf_event_state state = __perf_effective_state(event);
631	u64 delta = now - event->tstamp;
632
633	*enabled = event->total_time_enabled;
634	if (state >= PERF_EVENT_STATE_INACTIVE)
635		*enabled += delta;
636
637	*running = event->total_time_running;
638	if (state >= PERF_EVENT_STATE_ACTIVE)
639		*running += delta;
640}
641
642static void perf_event_update_time(struct perf_event *event)
643{
644	u64 now = perf_event_time(event);
645
646	__perf_update_times(event, now, &event->total_time_enabled,
647					&event->total_time_running);
648	event->tstamp = now;
649}
650
651static void perf_event_update_sibling_time(struct perf_event *leader)
652{
653	struct perf_event *sibling;
654
655	for_each_sibling_event(sibling, leader)
656		perf_event_update_time(sibling);
657}
658
659static void
660perf_event_set_state(struct perf_event *event, enum perf_event_state state)
661{
662	if (event->state == state)
663		return;
664
665	perf_event_update_time(event);
666	/*
667	 * If a group leader gets enabled/disabled all its siblings
668	 * are affected too.
669	 */
670	if ((event->state < 0) ^ (state < 0))
671		perf_event_update_sibling_time(event);
672
673	WRITE_ONCE(event->state, state);
674}
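
/*
 * Worked example (illustrative): an event created and enabled at t=0 that is
 * only ACTIVE from t=10 to t=30 accumulates, through the state transitions
 * handled above, total_time_enabled = 30 and total_time_running = 20 by
 * t=30; the deltas are added lazily by __perf_update_times() using
 * event->tstamp at each state change.
 */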
675
676/*
677 * UP store-release, load-acquire
678 */
679
680#define __store_release(ptr, val)					\
681do {									\
682	barrier();							\
683	WRITE_ONCE(*(ptr), (val));					\
684} while (0)
685
686#define __load_acquire(ptr)						\
687({									\
688	__unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr));	\
689	barrier();							\
690	___p;								\
691})
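
/*
 * Illustrative pairing sketch: a writer publishes a time snapshot and only
 * then marks it usable, while a lockless reader checks the flag first:
 *
 *	// writer side (see __update_cgrp_time()/perf_cgroup_set_timestamp())
 *	WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
 *	__store_release(&info->active, 1);
 *
 *	// reader side (see perf_cgroup_event_time_now())
 *	if (!__load_acquire(&t->active))
 *		return t->time;
 *	now += READ_ONCE(t->timeoffset);
 *
 * Because readers and writers run on the same CPU, a compiler barrier() is
 * enough to provide the required ordering.
 */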
692
693#ifdef CONFIG_CGROUP_PERF
694
695static inline bool
696perf_cgroup_match(struct perf_event *event)
697{
698	struct perf_event_context *ctx = event->ctx;
699	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
700
701	/* @event doesn't care about cgroup */
702	if (!event->cgrp)
703		return true;
704
705	/* wants specific cgroup scope but @cpuctx isn't associated with any */
706	if (!cpuctx->cgrp)
707		return false;
708
709	/*
710	 * Cgroup scoping is recursive.  An event enabled for a cgroup is
711	 * also enabled for all its descendant cgroups.  If @cpuctx's
712	 * cgroup is a descendant of @event's (the test covers identity
713	 * case), it's a match.
714	 */
715	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
716				    event->cgrp->css.cgroup);
717}
718
719static inline void perf_detach_cgroup(struct perf_event *event)
720{
721	css_put(&event->cgrp->css);
722	event->cgrp = NULL;
723}
724
725static inline int is_cgroup_event(struct perf_event *event)
726{
727	return event->cgrp != NULL;
728}
729
730static inline u64 perf_cgroup_event_time(struct perf_event *event)
731{
732	struct perf_cgroup_info *t;
733
734	t = per_cpu_ptr(event->cgrp->info, event->cpu);
735	return t->time;
736}
737
738static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
739{
740	struct perf_cgroup_info *t;
741
742	t = per_cpu_ptr(event->cgrp->info, event->cpu);
743	if (!__load_acquire(&t->active))
744		return t->time;
745	now += READ_ONCE(t->timeoffset);
746	return now;
747}
748
749static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
750{
751	if (adv)
752		info->time += now - info->timestamp;
753	info->timestamp = now;
754	/*
755	 * see update_context_time()
756	 */
757	WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
758}
759
760static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
761{
762	struct perf_cgroup *cgrp = cpuctx->cgrp;
763	struct cgroup_subsys_state *css;
764	struct perf_cgroup_info *info;
765
766	if (cgrp) {
767		u64 now = perf_clock();
768
769		for (css = &cgrp->css; css; css = css->parent) {
770			cgrp = container_of(css, struct perf_cgroup, css);
771			info = this_cpu_ptr(cgrp->info);
772
773			__update_cgrp_time(info, now, true);
774			if (final)
775				__store_release(&info->active, 0);
776		}
777	}
778}
779
780static inline void update_cgrp_time_from_event(struct perf_event *event)
781{
782	struct perf_cgroup_info *info;
783
784	/*
785	 * ensure we access cgroup data only when needed and
786	 * when we know the cgroup is pinned (css_get)
787	 */
788	if (!is_cgroup_event(event))
789		return;
790
791	info = this_cpu_ptr(event->cgrp->info);
792	/*
793	 * Do not update time when cgroup is not active
794	 */
795	if (info->active)
796		__update_cgrp_time(info, perf_clock(), true);
797}
798
799static inline void
800perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
801{
802	struct perf_event_context *ctx = &cpuctx->ctx;
803	struct perf_cgroup *cgrp = cpuctx->cgrp;
804	struct perf_cgroup_info *info;
805	struct cgroup_subsys_state *css;
806
807	/*
808	 * ctx->lock held by caller
809	 * ensure we do not access cgroup data
810	 * unless we have the cgroup pinned (css_get)
811	 */
812	if (!cgrp)
813		return;
814
815	WARN_ON_ONCE(!ctx->nr_cgroups);
816
817	for (css = &cgrp->css; css; css = css->parent) {
818		cgrp = container_of(css, struct perf_cgroup, css);
819		info = this_cpu_ptr(cgrp->info);
820		__update_cgrp_time(info, ctx->timestamp, false);
821		__store_release(&info->active, 1);
822	}
823}
824
825static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
826
827/*
 * reschedule events based on the cgroup constraint of the task.
829 */
830static void perf_cgroup_switch(struct task_struct *task)
831{
832	struct perf_cgroup *cgrp;
833	struct perf_cpu_context *cpuctx, *tmp;
834	struct list_head *list;
835	unsigned long flags;
836
837	/*
	 * Disable interrupts and preemption to keep this CPU's
	 * cgrp_cpuctx_entry from changing under us.
840	 */
841	local_irq_save(flags);
842
843	cgrp = perf_cgroup_from_task(task, NULL);
844
845	list = this_cpu_ptr(&cgrp_cpuctx_list);
846	list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
847		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
848		if (READ_ONCE(cpuctx->cgrp) == cgrp)
849			continue;
850
851		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
852		perf_pmu_disable(cpuctx->ctx.pmu);
853
854		cpu_ctx_sched_out(cpuctx, EVENT_ALL);
855		/*
856		 * must not be done before ctxswout due
857		 * to update_cgrp_time_from_cpuctx() in
858		 * ctx_sched_out()
859		 */
860		cpuctx->cgrp = cgrp;
861		/*
862		 * set cgrp before ctxsw in to allow
863		 * perf_cgroup_set_timestamp() in ctx_sched_in()
864		 * to not have to pass task around
865		 */
866		cpu_ctx_sched_in(cpuctx, EVENT_ALL);
867
868		perf_pmu_enable(cpuctx->ctx.pmu);
869		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
870	}
871
872	local_irq_restore(flags);
873}
874
875static int perf_cgroup_ensure_storage(struct perf_event *event,
876				struct cgroup_subsys_state *css)
877{
878	struct perf_cpu_context *cpuctx;
879	struct perf_event **storage;
880	int cpu, heap_size, ret = 0;
881
882	/*
	 * Allow storage to have sufficient space for an iterator for each
884	 * possibly nested cgroup plus an iterator for events with no cgroup.
885	 */
886	for (heap_size = 1; css; css = css->parent)
887		heap_size++;
888
889	for_each_possible_cpu(cpu) {
890		cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
891		if (heap_size <= cpuctx->heap_size)
892			continue;
893
894		storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
895				       GFP_KERNEL, cpu_to_node(cpu));
896		if (!storage) {
897			ret = -ENOMEM;
898			break;
899		}
900
901		raw_spin_lock_irq(&cpuctx->ctx.lock);
902		if (cpuctx->heap_size < heap_size) {
903			swap(cpuctx->heap, storage);
904			if (storage == cpuctx->heap_default)
905				storage = NULL;
906			cpuctx->heap_size = heap_size;
907		}
908		raw_spin_unlock_irq(&cpuctx->ctx.lock);
909
910		kfree(storage);
911	}
912
913	return ret;
914}
915
916static inline int perf_cgroup_connect(int fd, struct perf_event *event,
917				      struct perf_event_attr *attr,
918				      struct perf_event *group_leader)
919{
920	struct perf_cgroup *cgrp;
921	struct cgroup_subsys_state *css;
922	struct fd f = fdget(fd);
923	int ret = 0;
924
925	if (!f.file)
926		return -EBADF;
927
928	css = css_tryget_online_from_dir(f.file->f_path.dentry,
929					 &perf_event_cgrp_subsys);
930	if (IS_ERR(css)) {
931		ret = PTR_ERR(css);
932		goto out;
933	}
934
935	ret = perf_cgroup_ensure_storage(event, css);
936	if (ret)
937		goto out;
938
939	cgrp = container_of(css, struct perf_cgroup, css);
940	event->cgrp = cgrp;
941
942	/*
943	 * all events in a group must monitor
944	 * the same cgroup because a task belongs
945	 * to only one perf cgroup at a time
946	 */
947	if (group_leader && group_leader->cgrp != cgrp) {
948		perf_detach_cgroup(event);
949		ret = -EINVAL;
950	}
951out:
952	fdput(f);
953	return ret;
954}
955
956static inline void
957perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
958{
959	struct perf_cpu_context *cpuctx;
960
961	if (!is_cgroup_event(event))
962		return;
963
964	/*
965	 * Because cgroup events are always per-cpu events,
966	 * @ctx == &cpuctx->ctx.
967	 */
968	cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
969
970	if (ctx->nr_cgroups++)
971		return;
972
973	cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
974	list_add(&cpuctx->cgrp_cpuctx_entry,
975			per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
976}
977
978static inline void
979perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
980{
981	struct perf_cpu_context *cpuctx;
982
983	if (!is_cgroup_event(event))
984		return;
985
986	/*
987	 * Because cgroup events are always per-cpu events,
988	 * @ctx == &cpuctx->ctx.
989	 */
990	cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
991
992	if (--ctx->nr_cgroups)
993		return;
994
995	cpuctx->cgrp = NULL;
996	list_del(&cpuctx->cgrp_cpuctx_entry);
997}
998
999#else /* !CONFIG_CGROUP_PERF */
1000
1001static inline bool
1002perf_cgroup_match(struct perf_event *event)
1003{
1004	return true;
1005}
1006
1007static inline void perf_detach_cgroup(struct perf_event *event)
1008{}
1009
1010static inline int is_cgroup_event(struct perf_event *event)
1011{
1012	return 0;
1013}
1014
1015static inline void update_cgrp_time_from_event(struct perf_event *event)
1016{
1017}
1018
1019static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
1020						bool final)
1021{
1022}
1023
1024static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1025				      struct perf_event_attr *attr,
1026				      struct perf_event *group_leader)
1027{
1028	return -EINVAL;
1029}
1030
1031static inline void
1032perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
1033{
1034}
1035
1036static inline u64 perf_cgroup_event_time(struct perf_event *event)
1037{
1038	return 0;
1039}
1040
1041static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
1042{
1043	return 0;
1044}
1045
1046static inline void
1047perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
1048{
1049}
1050
1051static inline void
1052perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1053{
1054}
1055
1056static void perf_cgroup_switch(struct task_struct *task)
1057{
1058}
1059#endif
1060
1061/*
1062 * set default to be dependent on timer tick just
1063 * like original code
1064 */
1065#define PERF_CPU_HRTIMER (1000 / HZ)
1066/*
1067 * function must be called with interrupts disabled
1068 */
1069static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1070{
1071	struct perf_cpu_context *cpuctx;
1072	bool rotations;
1073
1074	lockdep_assert_irqs_disabled();
1075
1076	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1077	rotations = perf_rotate_context(cpuctx);
1078
1079	raw_spin_lock(&cpuctx->hrtimer_lock);
1080	if (rotations)
1081		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1082	else
1083		cpuctx->hrtimer_active = 0;
1084	raw_spin_unlock(&cpuctx->hrtimer_lock);
1085
1086	return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1087}
1088
1089static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1090{
1091	struct hrtimer *timer = &cpuctx->hrtimer;
1092	struct pmu *pmu = cpuctx->ctx.pmu;
1093	u64 interval;
1094
1095	/* no multiplexing needed for SW PMU */
1096	if (pmu->task_ctx_nr == perf_sw_context)
1097		return;
1098
1099	/*
1100	 * check default is sane, if not set then force to
1101	 * default interval (1/tick)
1102	 */
1103	interval = pmu->hrtimer_interval_ms;
1104	if (interval < 1)
1105		interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1106
1107	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1108
1109	raw_spin_lock_init(&cpuctx->hrtimer_lock);
1110	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
1111	timer->function = perf_mux_hrtimer_handler;
1112}
1113
1114static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1115{
1116	struct hrtimer *timer = &cpuctx->hrtimer;
1117	struct pmu *pmu = cpuctx->ctx.pmu;
1118	unsigned long flags;
1119
1120	/* not for SW PMU */
1121	if (pmu->task_ctx_nr == perf_sw_context)
1122		return 0;
1123
1124	raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1125	if (!cpuctx->hrtimer_active) {
1126		cpuctx->hrtimer_active = 1;
1127		hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1128		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
1129	}
1130	raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1131
1132	return 0;
1133}
1134
1135void perf_pmu_disable(struct pmu *pmu)
1136{
1137	int *count = this_cpu_ptr(pmu->pmu_disable_count);
1138	if (!(*count)++)
1139		pmu->pmu_disable(pmu);
1140}
1141
1142void perf_pmu_enable(struct pmu *pmu)
1143{
1144	int *count = this_cpu_ptr(pmu->pmu_disable_count);
1145	if (!--(*count))
1146		pmu->pmu_enable(pmu);
1147}
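
/*
 * Illustrative usage sketch: perf_pmu_disable()/perf_pmu_enable() nest via a
 * per-cpu count, so only the outermost pair reaches the pmu driver.  The
 * typical pattern (see perf_cgroup_switch() above) is:
 *
 *	perf_pmu_disable(cpuctx->ctx.pmu);
 *	...			// reschedule events without partial PMU state
 *	perf_pmu_enable(cpuctx->ctx.pmu);
 */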
1148
1149static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1150
1151/*
1152 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
1153 * perf_event_task_tick() are fully serialized because they're strictly cpu
 * affine and perf_event_ctx_{activate,deactivate}() are called with IRQs
 * disabled, while perf_event_task_tick() is called from IRQ context.
1156 */
1157static void perf_event_ctx_activate(struct perf_event_context *ctx)
1158{
1159	struct list_head *head = this_cpu_ptr(&active_ctx_list);
1160
1161	lockdep_assert_irqs_disabled();
1162
1163	WARN_ON(!list_empty(&ctx->active_ctx_list));
1164
1165	list_add(&ctx->active_ctx_list, head);
1166}
1167
1168static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1169{
1170	lockdep_assert_irqs_disabled();
1171
1172	WARN_ON(list_empty(&ctx->active_ctx_list));
1173
1174	list_del_init(&ctx->active_ctx_list);
1175}
1176
1177static void get_ctx(struct perf_event_context *ctx)
1178{
1179	refcount_inc(&ctx->refcount);
1180}
1181
1182static void *alloc_task_ctx_data(struct pmu *pmu)
1183{
1184	if (pmu->task_ctx_cache)
1185		return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
1186
1187	return NULL;
1188}
1189
1190static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
1191{
1192	if (pmu->task_ctx_cache && task_ctx_data)
1193		kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
1194}
1195
1196static void free_ctx(struct rcu_head *head)
1197{
1198	struct perf_event_context *ctx;
1199
1200	ctx = container_of(head, struct perf_event_context, rcu_head);
1201	free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
1202	kfree(ctx);
1203}
1204
1205static void put_ctx(struct perf_event_context *ctx)
1206{
1207	if (refcount_dec_and_test(&ctx->refcount)) {
1208		if (ctx->parent_ctx)
1209			put_ctx(ctx->parent_ctx);
1210		if (ctx->task && ctx->task != TASK_TOMBSTONE)
1211			put_task_struct(ctx->task);
1212		call_rcu(&ctx->rcu_head, free_ctx);
1213	}
1214}
1215
1216/*
1217 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1218 * perf_pmu_migrate_context() we need some magic.
1219 *
1220 * Those places that change perf_event::ctx will hold both
1221 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1222 *
1223 * Lock ordering is by mutex address. There are two other sites where
1224 * perf_event_context::mutex nests and those are:
1225 *
1226 *  - perf_event_exit_task_context()	[ child , 0 ]
1227 *      perf_event_exit_event()
1228 *        put_event()			[ parent, 1 ]
1229 *
1230 *  - perf_event_init_context()		[ parent, 0 ]
1231 *      inherit_task_group()
1232 *        inherit_group()
1233 *          inherit_event()
1234 *            perf_event_alloc()
1235 *              perf_init_event()
1236 *                perf_try_init_event()	[ child , 1 ]
1237 *
1238 * While it appears there is an obvious deadlock here -- the parent and child
1239 * nesting levels are inverted between the two. This is in fact safe because
 * life-time rules separate them. That is, an exiting task cannot fork, and a
1241 * spawning task cannot (yet) exit.
1242 *
1243 * But remember that these are parent<->child context relations, and
1244 * migration does not affect children, therefore these two orderings should not
1245 * interact.
1246 *
1247 * The change in perf_event::ctx does not affect children (as claimed above)
1248 * because the sys_perf_event_open() case will install a new event and break
1249 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1250 * concerned with cpuctx and that doesn't have children.
1251 *
1252 * The places that change perf_event::ctx will issue:
1253 *
1254 *   perf_remove_from_context();
1255 *   synchronize_rcu();
1256 *   perf_install_in_context();
1257 *
 * to effect the change. The remove_from_context() + synchronize_rcu() should
1259 * quiesce the event, after which we can install it in the new location. This
1260 * means that only external vectors (perf_fops, prctl) can perturb the event
1261 * while in transit. Therefore all such accessors should also acquire
1262 * perf_event_context::mutex to serialize against this.
1263 *
1264 * However; because event->ctx can change while we're waiting to acquire
1265 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1266 * function.
1267 *
1268 * Lock order:
1269 *    exec_update_lock
1270 *	task_struct::perf_event_mutex
1271 *	  perf_event_context::mutex
1272 *	    perf_event::child_mutex;
1273 *	      perf_event_context::lock
1274 *	    perf_event::mmap_mutex
1275 *	    mmap_lock
1276 *	      perf_addr_filters_head::lock
1277 *
1278 *    cpu_hotplug_lock
1279 *      pmus_lock
1280 *	  cpuctx->mutex / perf_event_context::mutex
1281 */
1282static struct perf_event_context *
1283perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1284{
1285	struct perf_event_context *ctx;
1286
1287again:
1288	rcu_read_lock();
1289	ctx = READ_ONCE(event->ctx);
1290	if (!refcount_inc_not_zero(&ctx->refcount)) {
1291		rcu_read_unlock();
1292		goto again;
1293	}
1294	rcu_read_unlock();
1295
1296	mutex_lock_nested(&ctx->mutex, nesting);
1297	if (event->ctx != ctx) {
1298		mutex_unlock(&ctx->mutex);
1299		put_ctx(ctx);
1300		goto again;
1301	}
1302
1303	return ctx;
1304}
1305
1306static inline struct perf_event_context *
1307perf_event_ctx_lock(struct perf_event *event)
1308{
1309	return perf_event_ctx_lock_nested(event, 0);
1310}
1311
1312static void perf_event_ctx_unlock(struct perf_event *event,
1313				  struct perf_event_context *ctx)
1314{
1315	mutex_unlock(&ctx->mutex);
1316	put_ctx(ctx);
1317}
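
/*
 * Illustrative usage sketch of the lock/unlock pair above:
 *
 *	struct perf_event_context *ctx = perf_event_ctx_lock(event);
 *
 *	...	// event->ctx is stable here and ctx->mutex is held
 *
 *	perf_event_ctx_unlock(event, ctx);
 */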
1318
1319/*
 * This must be done under the ctx->lock, so as to serialize against
 * context_equiv(); therefore we cannot call put_ctx() since that might end up
 * taking scheduler related locks, and ctx->lock nests inside those.
1323 */
1324static __must_check struct perf_event_context *
1325unclone_ctx(struct perf_event_context *ctx)
1326{
1327	struct perf_event_context *parent_ctx = ctx->parent_ctx;
1328
1329	lockdep_assert_held(&ctx->lock);
1330
1331	if (parent_ctx)
1332		ctx->parent_ctx = NULL;
1333	ctx->generation++;
1334
1335	return parent_ctx;
1336}
1337
1338static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1339				enum pid_type type)
1340{
1341	u32 nr;
1342	/*
1343	 * only top level events have the pid namespace they were created in
1344	 */
1345	if (event->parent)
1346		event = event->parent;
1347
1348	nr = __task_pid_nr_ns(p, type, event->ns);
1349	/* avoid -1 if it is idle thread or runs in another ns */
1350	if (!nr && !pid_alive(p))
1351		nr = -1;
1352	return nr;
1353}
1354
1355static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1356{
1357	return perf_event_pid_type(event, p, PIDTYPE_TGID);
1358}
1359
1360static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1361{
1362	return perf_event_pid_type(event, p, PIDTYPE_PID);
1363}
1364
1365/*
1366 * If we inherit events we want to return the parent event id
1367 * to userspace.
1368 */
1369static u64 primary_event_id(struct perf_event *event)
1370{
1371	u64 id = event->id;
1372
1373	if (event->parent)
1374		id = event->parent->id;
1375
1376	return id;
1377}
1378
1379/*
1380 * Get the perf_event_context for a task and lock it.
1381 *
1382 * This has to cope with the fact that until it is locked,
1383 * the context could get moved to another task.
1384 */
1385static struct perf_event_context *
1386perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1387{
1388	struct perf_event_context *ctx;
1389
1390retry:
1391	/*
1392	 * One of the few rules of preemptible RCU is that one cannot do
1393	 * rcu_read_unlock() while holding a scheduler (or nested) lock when
1394	 * part of the read side critical section was irqs-enabled -- see
1395	 * rcu_read_unlock_special().
1396	 *
1397	 * Since ctx->lock nests under rq->lock we must ensure the entire read
1398	 * side critical section has interrupts disabled.
1399	 */
1400	local_irq_save(*flags);
1401	rcu_read_lock();
1402	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1403	if (ctx) {
1404		/*
1405		 * If this context is a clone of another, it might
1406		 * get swapped for another underneath us by
1407		 * perf_event_task_sched_out, though the
1408		 * rcu_read_lock() protects us from any context
1409		 * getting freed.  Lock the context and check if it
1410		 * got swapped before we could get the lock, and retry
1411		 * if so.  If we locked the right context, then it
1412		 * can't get swapped on us any more.
1413		 */
1414		raw_spin_lock(&ctx->lock);
1415		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1416			raw_spin_unlock(&ctx->lock);
1417			rcu_read_unlock();
1418			local_irq_restore(*flags);
1419			goto retry;
1420		}
1421
1422		if (ctx->task == TASK_TOMBSTONE ||
1423		    !refcount_inc_not_zero(&ctx->refcount)) {
1424			raw_spin_unlock(&ctx->lock);
1425			ctx = NULL;
1426		} else {
1427			WARN_ON_ONCE(ctx->task != task);
1428		}
1429	}
1430	rcu_read_unlock();
1431	if (!ctx)
1432		local_irq_restore(*flags);
1433	return ctx;
1434}
1435
1436/*
1437 * Get the context for a task and increment its pin_count so it
1438 * can't get swapped to another task.  This also increments its
1439 * reference count so that the context can't get freed.
1440 */
1441static struct perf_event_context *
1442perf_pin_task_context(struct task_struct *task, int ctxn)
1443{
1444	struct perf_event_context *ctx;
1445	unsigned long flags;
1446
1447	ctx = perf_lock_task_context(task, ctxn, &flags);
1448	if (ctx) {
1449		++ctx->pin_count;
1450		raw_spin_unlock_irqrestore(&ctx->lock, flags);
1451	}
1452	return ctx;
1453}
1454
1455static void perf_unpin_context(struct perf_event_context *ctx)
1456{
1457	unsigned long flags;
1458
1459	raw_spin_lock_irqsave(&ctx->lock, flags);
1460	--ctx->pin_count;
1461	raw_spin_unlock_irqrestore(&ctx->lock, flags);
1462}
1463
1464/*
1465 * Update the record of the current time in a context.
1466 */
1467static void __update_context_time(struct perf_event_context *ctx, bool adv)
1468{
1469	u64 now = perf_clock();
1470
1471	if (adv)
1472		ctx->time += now - ctx->timestamp;
1473	ctx->timestamp = now;
1474
1475	/*
	 * The above: time' = time + (now - timestamp), can be re-arranged
	 * into: time' = now + (time - timestamp), which gives a single
	 * offset value with which future time can be computed without locks.
1479	 *
1480	 * See perf_event_time_now(), which can be used from NMI context where
1481	 * it's (obviously) not possible to acquire ctx->lock in order to read
1482	 * both the above values in a consistent manner.
1483	 */
1484	WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
1485}
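
/*
 * Worked example (illustrative) of the rearrangement above: with
 * ctx->timestamp = 1000 and ctx->time = 400, timeoffset = -600.  A lockless
 * reader at now = 1500 computes 1500 + (-600) = 900, which equals
 * 400 + (1500 - 1000), i.e. exactly what update_context_time() would have
 * produced; this is what perf_event_time_now() relies on.
 */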
1486
1487static void update_context_time(struct perf_event_context *ctx)
1488{
1489	__update_context_time(ctx, true);
1490}
1491
1492static u64 perf_event_time(struct perf_event *event)
1493{
1494	struct perf_event_context *ctx = event->ctx;
1495
1496	if (unlikely(!ctx))
1497		return 0;
1498
1499	if (is_cgroup_event(event))
1500		return perf_cgroup_event_time(event);
1501
1502	return ctx->time;
1503}
1504
1505static u64 perf_event_time_now(struct perf_event *event, u64 now)
1506{
1507	struct perf_event_context *ctx = event->ctx;
1508
1509	if (unlikely(!ctx))
1510		return 0;
1511
1512	if (is_cgroup_event(event))
1513		return perf_cgroup_event_time_now(event, now);
1514
1515	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
1516		return ctx->time;
1517
1518	now += READ_ONCE(ctx->timeoffset);
1519	return now;
1520}
1521
1522static enum event_type_t get_event_type(struct perf_event *event)
1523{
1524	struct perf_event_context *ctx = event->ctx;
1525	enum event_type_t event_type;
1526
1527	lockdep_assert_held(&ctx->lock);
1528
1529	/*
1530	 * It's 'group type', really, because if our group leader is
1531	 * pinned, so are we.
1532	 */
1533	if (event->group_leader != event)
1534		event = event->group_leader;
1535
1536	event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1537	if (!ctx->task)
1538		event_type |= EVENT_CPU;
1539
1540	return event_type;
1541}
1542
1543/*
1544 * Helper function to initialize event group nodes.
1545 */
1546static void init_event_group(struct perf_event *event)
1547{
1548	RB_CLEAR_NODE(&event->group_node);
1549	event->group_index = 0;
1550}
1551
1552/*
1553 * Extract pinned or flexible groups from the context
1554 * based on event attrs bits.
1555 */
1556static struct perf_event_groups *
1557get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1558{
1559	if (event->attr.pinned)
1560		return &ctx->pinned_groups;
1561	else
1562		return &ctx->flexible_groups;
1563}
1564
1565/*
 * Helper function to initialize perf_event_groups trees.
1567 */
1568static void perf_event_groups_init(struct perf_event_groups *groups)
1569{
1570	groups->tree = RB_ROOT;
1571	groups->index = 0;
1572}
1573
1574static inline struct cgroup *event_cgroup(const struct perf_event *event)
1575{
1576	struct cgroup *cgroup = NULL;
1577
1578#ifdef CONFIG_CGROUP_PERF
1579	if (event->cgrp)
1580		cgroup = event->cgrp->css.cgroup;
1581#endif
1582
1583	return cgroup;
1584}
1585
1586/*
1587 * Compare function for event groups;
1588 *
 * Implements a composite key that sorts first by CPU, then (with
 * CONFIG_CGROUP_PERF) by cgroup id, and finally by a virtual index that
 * provides ordering when rotating groups for the same CPU.
1591 */
1592static __always_inline int
1593perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
1594		      const u64 left_group_index, const struct perf_event *right)
1595{
1596	if (left_cpu < right->cpu)
1597		return -1;
1598	if (left_cpu > right->cpu)
1599		return 1;
1600
1601#ifdef CONFIG_CGROUP_PERF
1602	{
1603		const struct cgroup *right_cgroup = event_cgroup(right);
1604
1605		if (left_cgroup != right_cgroup) {
1606			if (!left_cgroup) {
1607				/*
				 * Left has no cgroup but right does; events
				 * without a cgroup come first.
1610				 */
1611				return -1;
1612			}
1613			if (!right_cgroup) {
				 * Right has no cgroup but left does; events
				 * without a cgroup come first.
1616				 * cgroups come first.
1617				 */
1618				return 1;
1619			}
1620			/* Two dissimilar cgroups, order by id. */
1621			if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
1622				return -1;
1623
1624			return 1;
1625		}
1626	}
1627#endif
1628
1629	if (left_group_index < right->group_index)
1630		return -1;
1631	if (left_group_index > right->group_index)
1632		return 1;
1633
1634	return 0;
1635}
1636
1637#define __node_2_pe(node) \
1638	rb_entry((node), struct perf_event, group_node)
1639
1640static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
1641{
1642	struct perf_event *e = __node_2_pe(a);
1643	return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
1644				     __node_2_pe(b)) < 0;
1645}
1646
1647struct __group_key {
1648	int cpu;
1649	struct cgroup *cgroup;
1650};
1651
1652static inline int __group_cmp(const void *key, const struct rb_node *node)
1653{
1654	const struct __group_key *a = key;
1655	const struct perf_event *b = __node_2_pe(node);
1656
1657	/* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
1658	return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
1659}
1660
1661/*
1662 * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
 * key (see __group_less()). This places it last inside the CPU
1664 * subtree.
1665 */
1666static void
1667perf_event_groups_insert(struct perf_event_groups *groups,
1668			 struct perf_event *event)
1669{
1670	event->group_index = ++groups->index;
1671
1672	rb_add(&event->group_node, &groups->tree, __group_less);
1673}
1674
1675/*
1676 * Helper function to insert event into the pinned or flexible groups.
1677 */
1678static void
1679add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1680{
1681	struct perf_event_groups *groups;
1682
1683	groups = get_event_groups(event, ctx);
1684	perf_event_groups_insert(groups, event);
1685}
1686
1687/*
1688 * Delete a group from a tree.
1689 */
1690static void
1691perf_event_groups_delete(struct perf_event_groups *groups,
1692			 struct perf_event *event)
1693{
1694	WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1695		     RB_EMPTY_ROOT(&groups->tree));
1696
1697	rb_erase(&event->group_node, &groups->tree);
1698	init_event_group(event);
1699}
1700
1701/*
1702 * Helper function to delete event from its groups.
1703 */
1704static void
1705del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1706{
1707	struct perf_event_groups *groups;
1708
1709	groups = get_event_groups(event, ctx);
1710	perf_event_groups_delete(groups, event);
1711}
1712
1713/*
1714 * Get the leftmost event in the cpu/cgroup subtree.
1715 */
1716static struct perf_event *
1717perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1718			struct cgroup *cgrp)
1719{
1720	struct __group_key key = {
1721		.cpu = cpu,
1722		.cgroup = cgrp,
1723	};
1724	struct rb_node *node;
1725
1726	node = rb_find_first(&key, &groups->tree, __group_cmp);
1727	if (node)
1728		return __node_2_pe(node);
1729
1730	return NULL;
1731}
1732
1733/*
1734 * Like rb_entry_next_safe() for the @cpu subtree.
1735 */
1736static struct perf_event *
1737perf_event_groups_next(struct perf_event *event)
1738{
1739	struct __group_key key = {
1740		.cpu = event->cpu,
1741		.cgroup = event_cgroup(event),
1742	};
1743	struct rb_node *next;
1744
1745	next = rb_next_match(&key, &event->group_node, __group_cmp);
1746	if (next)
1747		return __node_2_pe(next);
1748
1749	return NULL;
1750}
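
/*
 * Illustrative iteration sketch: together the two helpers above walk all
 * events of one {cpu, cgroup} subtree in insertion (group_index) order:
 *
 *	struct perf_event *event;
 *
 *	for (event = perf_event_groups_first(groups, cpu, cgrp); event;
 *	     event = perf_event_groups_next(event))
 *		...
 */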
1751
1752/*
1753 * Iterate through the whole groups tree.
1754 */
1755#define perf_event_groups_for_each(event, groups)			\
1756	for (event = rb_entry_safe(rb_first(&((groups)->tree)),		\
1757				typeof(*event), group_node); event;	\
1758		event = rb_entry_safe(rb_next(&event->group_node),	\
1759				typeof(*event), group_node))
1760
1761/*
 * Add an event to the lists for its context.
1763 * Must be called with ctx->mutex and ctx->lock held.
1764 */
1765static void
1766list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1767{
1768	lockdep_assert_held(&ctx->lock);
1769
1770	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1771	event->attach_state |= PERF_ATTACH_CONTEXT;
1772
1773	event->tstamp = perf_event_time(event);
1774
1775	/*
1776	 * If we're a stand alone event or group leader, we go to the context
1777	 * list, group events are kept attached to the group so that
1778	 * perf_group_detach can, at all times, locate all siblings.
1779	 */
1780	if (event->group_leader == event) {
1781		event->group_caps = event->event_caps;
1782		add_event_to_groups(event, ctx);
1783	}
1784
1785	list_add_rcu(&event->event_entry, &ctx->event_list);
1786	ctx->nr_events++;
1787	if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
1788		ctx->nr_user++;
1789	if (event->attr.inherit_stat)
1790		ctx->nr_stat++;
1791
1792	if (event->state > PERF_EVENT_STATE_OFF)
1793		perf_cgroup_event_enable(event, ctx);
1794
1795	ctx->generation++;
1796}
1797
1798/*
1799 * Initialize event state based on the perf_event_attr::disabled.
1800 */
1801static inline void perf_event__state_init(struct perf_event *event)
1802{
1803	event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1804					      PERF_EVENT_STATE_INACTIVE;
1805}
1806
1807static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1808{
1809	int entry = sizeof(u64); /* value */
1810	int size = 0;
1811	int nr = 1;
1812
1813	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1814		size += sizeof(u64);
1815
1816	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1817		size += sizeof(u64);
1818
1819	if (event->attr.read_format & PERF_FORMAT_ID)
1820		entry += sizeof(u64);
1821
1822	if (event->attr.read_format & PERF_FORMAT_LOST)
1823		entry += sizeof(u64);
1824
1825	if (event->attr.read_format & PERF_FORMAT_GROUP) {
1826		nr += nr_siblings;
1827		size += sizeof(u64);
1828	}
1829
1830	size += entry * nr;
1831	event->read_size = size;
1832}
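
/*
 * Worked example (illustrative): for a group leader with two siblings and
 * read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING |
 * PERF_FORMAT_ID | PERF_FORMAT_GROUP, each entry is 8 (value) + 8 (id) = 16
 * bytes, the fixed part is 8 (nr) + 8 + 8 (times) = 24 bytes, and nr = 3,
 * so event->read_size = 24 + 3 * 16 = 72 bytes.
 */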
1833
1834static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1835{
1836	struct perf_sample_data *data;
1837	u16 size = 0;
1838
1839	if (sample_type & PERF_SAMPLE_IP)
1840		size += sizeof(data->ip);
1841
1842	if (sample_type & PERF_SAMPLE_ADDR)
1843		size += sizeof(data->addr);
1844
1845	if (sample_type & PERF_SAMPLE_PERIOD)
1846		size += sizeof(data->period);
1847
1848	if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
1849		size += sizeof(data->weight.full);
1850
1851	if (sample_type & PERF_SAMPLE_READ)
1852		size += event->read_size;
1853
1854	if (sample_type & PERF_SAMPLE_DATA_SRC)
1855		size += sizeof(data->data_src.val);
1856
1857	if (sample_type & PERF_SAMPLE_TRANSACTION)
1858		size += sizeof(data->txn);
1859
1860	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1861		size += sizeof(data->phys_addr);
1862
1863	if (sample_type & PERF_SAMPLE_CGROUP)
1864		size += sizeof(data->cgroup);
1865
1866	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
1867		size += sizeof(data->data_page_size);
1868
1869	if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
1870		size += sizeof(data->code_page_size);
1871
1872	event->header_size = size;
1873}
1874
1875/*
1876 * Called at perf_event creation and when events are attached/detached from a
1877 * group.
1878 */
1879static void perf_event__header_size(struct perf_event *event)
1880{
1881	__perf_event_read_size(event,
1882			       event->group_leader->nr_siblings);
1883	__perf_event_header_size(event, event->attr.sample_type);
1884}
1885
1886static void perf_event__id_header_size(struct perf_event *event)
1887{
1888	struct perf_sample_data *data;
1889	u64 sample_type = event->attr.sample_type;
1890	u16 size = 0;
1891
1892	if (sample_type & PERF_SAMPLE_TID)
1893		size += sizeof(data->tid_entry);
1894
1895	if (sample_type & PERF_SAMPLE_TIME)
1896		size += sizeof(data->time);
1897
1898	if (sample_type & PERF_SAMPLE_IDENTIFIER)
1899		size += sizeof(data->id);
1900
1901	if (sample_type & PERF_SAMPLE_ID)
1902		size += sizeof(data->id);
1903
1904	if (sample_type & PERF_SAMPLE_STREAM_ID)
1905		size += sizeof(data->stream_id);
1906
1907	if (sample_type & PERF_SAMPLE_CPU)
1908		size += sizeof(data->cpu_entry);
1909
1910	event->id_header_size = size;
1911}
1912
1913static bool perf_event_validate_size(struct perf_event *event)
1914{
1915	/*
1916	 * The values computed here will be over-written when we actually
1917	 * attach the event.
1918	 */
1919	__perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1920	__perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1921	perf_event__id_header_size(event);
1922
1923	/*
1924	 * Sum the lot; should not exceed the 64k limit we have on records.
1925	 * Conservative limit to allow for callchains and other variable fields.
1926	 */
1927	if (event->read_size + event->header_size +
1928	    event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1929		return false;
1930
1931	return true;
1932}
1933
1934static void perf_group_attach(struct perf_event *event)
1935{
1936	struct perf_event *group_leader = event->group_leader, *pos;
1937
1938	lockdep_assert_held(&event->ctx->lock);
1939
1940	/*
1941	 * We can have double attach due to group movement in perf_event_open.
1942	 */
1943	if (event->attach_state & PERF_ATTACH_GROUP)
1944		return;
1945
1946	event->attach_state |= PERF_ATTACH_GROUP;
1947
1948	if (group_leader == event)
1949		return;
1950
1951	WARN_ON_ONCE(group_leader->ctx != event->ctx);
1952
1953	group_leader->group_caps &= event->event_caps;
1954
1955	list_add_tail(&event->sibling_list, &group_leader->sibling_list);
1956	group_leader->nr_siblings++;
1957
1958	perf_event__header_size(group_leader);
1959
1960	for_each_sibling_event(pos, group_leader)
1961		perf_event__header_size(pos);
1962}
1963
1964/*
1965 * Remove an event from the lists for its context.
1966 * Must be called with ctx->mutex and ctx->lock held.
1967 */
1968static void
1969list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1970{
1971	WARN_ON_ONCE(event->ctx != ctx);
1972	lockdep_assert_held(&ctx->lock);
1973
1974	/*
1975	 * We can have double detach due to exit/hot-unplug + close.
1976	 */
1977	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1978		return;
1979
1980	event->attach_state &= ~PERF_ATTACH_CONTEXT;
1981
1982	ctx->nr_events--;
1983	if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
1984		ctx->nr_user--;
1985	if (event->attr.inherit_stat)
1986		ctx->nr_stat--;
1987
1988	list_del_rcu(&event->event_entry);
1989
1990	if (event->group_leader == event)
1991		del_event_from_groups(event, ctx);
1992
1993	/*
1994	 * If the event was in error state, then keep it
1995	 * that way; otherwise bogus counts will be
1996	 * returned on read(). The only way to get out
1997	 * of error state is by explicitly re-enabling
1998	 * the event.
1999	 */
2000	if (event->state > PERF_EVENT_STATE_OFF) {
2001		perf_cgroup_event_disable(event, ctx);
2002		perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2003	}
2004
2005	ctx->generation++;
2006}
2007
2008static int
2009perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2010{
2011	if (!has_aux(aux_event))
2012		return 0;
2013
2014	if (!event->pmu->aux_output_match)
2015		return 0;
2016
2017	return event->pmu->aux_output_match(aux_event);
2018}
2019
2020static void put_event(struct perf_event *event);
2021static void event_sched_out(struct perf_event *event,
2022			    struct perf_cpu_context *cpuctx,
2023			    struct perf_event_context *ctx);
2024
2025static void perf_put_aux_event(struct perf_event *event)
2026{
2027	struct perf_event_context *ctx = event->ctx;
2028	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2029	struct perf_event *iter;
2030
2031	/*
2032	 * If the event uses an aux_event, tear down the link.
2033	 */
2034	if (event->aux_event) {
2035		iter = event->aux_event;
2036		event->aux_event = NULL;
2037		put_event(iter);
2038		return;
2039	}
2040
2041	/*
2042	 * If the event is an aux_event, tear down all links to
2043	 * it from other events.
2044	 */
2045	for_each_sibling_event(iter, event->group_leader) {
2046		if (iter->aux_event != event)
2047			continue;
2048
2049		iter->aux_event = NULL;
2050		put_event(event);
2051
2052		/*
2053		 * If it's ACTIVE, schedule it out and put it into ERROR
2054		 * state so that we don't try to schedule it again. Note
2055		 * that perf_event_enable() will clear the ERROR status.
2056		 */
2057		event_sched_out(iter, cpuctx, ctx);
2058		perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2059	}
2060}
2061
2062static bool perf_need_aux_event(struct perf_event *event)
2063{
2064	return !!event->attr.aux_output || !!event->attr.aux_sample_size;
2065}
2066
2067static int perf_get_aux_event(struct perf_event *event,
2068			      struct perf_event *group_leader)
2069{
2070	/*
2071	 * Our group leader must be an aux event if we want to be
2072	 * an aux_output. This way, the aux event will precede its
2073	 * aux_output events in the group, and therefore will always
2074	 * schedule first.
2075	 */
2076	if (!group_leader)
2077		return 0;
2078
2079	/*
2080	 * aux_output and aux_sample_size are mutually exclusive.
2081	 */
2082	if (event->attr.aux_output && event->attr.aux_sample_size)
2083		return 0;
2084
2085	if (event->attr.aux_output &&
2086	    !perf_aux_output_match(event, group_leader))
2087		return 0;
2088
2089	if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2090		return 0;
2091
2092	if (!atomic_long_inc_not_zero(&group_leader->refcount))
2093		return 0;
2094
2095	/*
2096	 * Link aux_outputs to their aux event; this is undone in
2097	 * perf_group_detach() by perf_put_aux_event(). When the
2098	 * group is torn down, the aux_output events lose their
2099	 * link to the aux_event and can't schedule any more.
2100	 */
2101	event->aux_event = group_leader;
2102
2103	return 1;
2104}
2105
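/*
 * Return the active list (pinned or flexible) that @event belongs on once
 * it has been scheduled in; mirrors the split used by ctx_sched_in()/out().
 */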
2106static inline struct list_head *get_event_list(struct perf_event *event)
2107{
2108	struct perf_event_context *ctx = event->ctx;
2109	return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
2110}
2111
2112/*
2113 * Events that have PERF_EV_CAP_SIBLING require being part of a group and
2114 * cannot exist on their own; schedule them out and move them into the ERROR
2115 * state. Also see _perf_event_enable(); it will not be able to recover
2116 * from this ERROR state.
2117 */
2118static inline void perf_remove_sibling_event(struct perf_event *event)
2119{
2120	struct perf_event_context *ctx = event->ctx;
2121	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2122
2123	event_sched_out(event, cpuctx, ctx);
2124	perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2125}
2126
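/*
 * Detach @event from its group: drop any aux_event links, and either unlink
 * it from its leader (if it is a sibling) or promote the remaining siblings
 * to singleton events (if it is the leader).
 */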
2127static void perf_group_detach(struct perf_event *event)
2128{
2129	struct perf_event *leader = event->group_leader;
2130	struct perf_event *sibling, *tmp;
2131	struct perf_event_context *ctx = event->ctx;
2132
2133	lockdep_assert_held(&ctx->lock);
2134
2135	/*
2136	 * We can have double detach due to exit/hot-unplug + close.
2137	 */
2138	if (!(event->attach_state & PERF_ATTACH_GROUP))
2139		return;
2140
2141	event->attach_state &= ~PERF_ATTACH_GROUP;
2142
2143	perf_put_aux_event(event);
2144
2145	/*
2146	 * If this is a sibling, remove it from its group.
2147	 */
2148	if (leader != event) {
2149		list_del_init(&event->sibling_list);
2150		event->group_leader->nr_siblings--;
2151		goto out;
2152	}
2153
2154	/*
2155	 * If this was a group event with sibling events then
2156	 * upgrade the siblings to singleton events by adding them
2157	 * to whatever list we are on.
2158	 */
2159	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
2160
2161		if (sibling->event_caps & PERF_EV_CAP_SIBLING)
2162			perf_remove_sibling_event(sibling);
2163
2164		sibling->group_leader = sibling;
2165		list_del_init(&sibling->sibling_list);
2166
2167		/* Inherit group flags from the previous leader */
2168		sibling->group_caps = event->group_caps;
2169
2170		if (!RB_EMPTY_NODE(&event->group_node)) {
2171			add_event_to_groups(sibling, event->ctx);
2172
2173			if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2174				list_add_tail(&sibling->active_list, get_event_list(sibling));
2175		}
2176
2177		WARN_ON_ONCE(sibling->ctx != event->ctx);
2178	}
2179
2180out:
2181	for_each_sibling_event(tmp, leader)
2182		perf_event__header_size(tmp);
2183
2184	perf_event__header_size(leader);
2185}
2186
2187static void sync_child_event(struct perf_event *child_event);
2188
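/*
 * Detach an inherited (child) event from its parent: fold the child's count
 * into the parent and unlink it from the parent's child_list. The caller
 * holds the parent's child_mutex.
 */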
2189static void perf_child_detach(struct perf_event *event)
2190{
2191	struct perf_event *parent_event = event->parent;
2192
2193	if (!(event->attach_state & PERF_ATTACH_CHILD))
2194		return;
2195
2196	event->attach_state &= ~PERF_ATTACH_CHILD;
2197
2198	if (WARN_ON_ONCE(!parent_event))
2199		return;
2200
2201	lockdep_assert_held(&parent_event->child_mutex);
2202
2203	sync_child_event(event);
2204	list_del_init(&event->child_list);
2205}
2206
2207static bool is_orphaned_event(struct perf_event *event)
2208{
2209	return event->state == PERF_EVENT_STATE_DEAD;
2210}
2211
2212static inline int __pmu_filter_match(struct perf_event *event)
2213{
2214	struct pmu *pmu = event->pmu;
2215	return pmu->filter_match ? pmu->filter_match(event) : 1;
2216}
2217
2218/*
2219 * Check whether we should attempt to schedule an event group based on
2220 * PMU-specific filtering. An event group can consist of HW and SW events,
2221 * potentially with a SW leader, so we must check all the filters to
2222 * determine whether a group is schedulable.
2223 */
2224static inline int pmu_filter_match(struct perf_event *event)
2225{
2226	struct perf_event *sibling;
2227
2228	if (!__pmu_filter_match(event))
2229		return 0;
2230
2231	for_each_sibling_event(sibling, event) {
2232		if (!__pmu_filter_match(sibling))
2233			return 0;
2234	}
2235
2236	return 1;
2237}
2238
2239static inline int
2240event_filter_match(struct perf_event *event)
2241{
2242	return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
2243	       perf_cgroup_match(event) && pmu_filter_match(event);
2244}
2245
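/*
 * Stop a single event: remove it from its PMU, update the context's
 * activity accounting and set the final state (OFF if a lazy disable was
 * pending, INACTIVE otherwise).
 */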
2246static void
2247event_sched_out(struct perf_event *event,
2248		  struct perf_cpu_context *cpuctx,
2249		  struct perf_event_context *ctx)
2250{
2251	enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
2252
2253	WARN_ON_ONCE(event->ctx != ctx);
2254	lockdep_assert_held(&ctx->lock);
2255
2256	if (event->state != PERF_EVENT_STATE_ACTIVE)
2257		return;
2258
2259	/*
2260	 * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
2261	 * we can schedule events _OUT_ individually through things like
2262	 * __perf_remove_from_context().
2263	 */
2264	list_del_init(&event->active_list);
2265
2266	perf_pmu_disable(event->pmu);
2267
2268	event->pmu->del(event, 0);
2269	event->oncpu = -1;
2270
2271	if (READ_ONCE(event->pending_disable) >= 0) {
2272		WRITE_ONCE(event->pending_disable, -1);
2273		perf_cgroup_event_disable(event, ctx);
2274		state = PERF_EVENT_STATE_OFF;
2275	}
2276	perf_event_set_state(event, state);
2277
2278	if (!is_software_event(event))
2279		cpuctx->active_oncpu--;
2280	if (!--ctx->nr_active)
2281		perf_event_ctx_deactivate(ctx);
2282	if (event->attr.freq && event->attr.sample_freq)
2283		ctx->nr_freq--;
2284	if (event->attr.exclusive || !cpuctx->active_oncpu)
2285		cpuctx->exclusive = 0;
2286
2287	perf_pmu_enable(event->pmu);
2288}
2289
2290static void
2291group_sched_out(struct perf_event *group_event,
2292		struct perf_cpu_context *cpuctx,
2293		struct perf_event_context *ctx)
2294{
2295	struct perf_event *event;
2296
2297	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2298		return;
2299
2300	perf_pmu_disable(ctx->pmu);
2301
2302	event_sched_out(group_event, cpuctx, ctx);
2303
2304	/*
2305	 * Schedule out siblings (if any):
2306	 */
2307	for_each_sibling_event(event, group_event)
2308		event_sched_out(event, cpuctx, ctx);
2309
2310	perf_pmu_enable(ctx->pmu);
2311}
2312
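/* Flag bits for {,__}perf_remove_from_context(): */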
2313#define DETACH_GROUP	0x01UL
2314#define DETACH_CHILD	0x02UL
2315
2316/*
2317 * Cross CPU call to remove a performance event
2318 *
2319 * We disable the event on the hardware level first. After that we
2320 * remove it from the context list.
2321 */
2322static void
2323__perf_remove_from_context(struct perf_event *event,
2324			   struct perf_cpu_context *cpuctx,
2325			   struct perf_event_context *ctx,
2326			   void *info)
2327{
2328	unsigned long flags = (unsigned long)info;
2329
2330	if (ctx->is_active & EVENT_TIME) {
2331		update_context_time(ctx);
2332		update_cgrp_time_from_cpuctx(cpuctx, false);
2333	}
2334
2335	event_sched_out(event, cpuctx, ctx);
2336	if (flags & DETACH_GROUP)
2337		perf_group_detach(event);
2338	if (flags & DETACH_CHILD)
2339		perf_child_detach(event);
2340	list_del_event(event, ctx);
2341
2342	if (!ctx->nr_events && ctx->is_active) {
2343		if (ctx == &cpuctx->ctx)
2344			update_cgrp_time_from_cpuctx(cpuctx, true);
2345
2346		ctx->is_active = 0;
2347		ctx->rotate_necessary = 0;
2348		if (ctx->task) {
2349			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2350			cpuctx->task_ctx = NULL;
2351		}
2352	}
2353}
2354
2355/*
2356 * Remove the event from a task's (or a CPU's) list of events.
2357 *
2358 * If event->ctx is a cloned context, callers must make sure that
2359 * every task struct that event->ctx->task could possibly point to
2360 * remains valid.  This is OK when called from perf_release since
2361 * that only calls us on the top-level context, which can't be a clone.
2362 * When called from perf_event_exit_task, it's OK because the
2363 * context has been detached from its task.
2364 */
2365static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2366{
2367	struct perf_event_context *ctx = event->ctx;
2368
2369	lockdep_assert_held(&ctx->mutex);
2370
2371	/*
2372	 * Because of perf_event_exit_task(), perf_remove_from_context() ought
2373	 * to work in the face of TASK_TOMBSTONE, unlike every other
2374	 * event_function_call() user.
2375	 */
2376	raw_spin_lock_irq(&ctx->lock);
2377	/*
2378	 * Cgroup events are per-cpu events, and must IPI because of
2379	 * cgrp_cpuctx_list.
2380	 */
2381	if (!ctx->is_active && !is_cgroup_event(event)) {
2382		__perf_remove_from_context(event, __get_cpu_context(ctx),
2383					   ctx, (void *)flags);
2384		raw_spin_unlock_irq(&ctx->lock);
2385		return;
2386	}
2387	raw_spin_unlock_irq(&ctx->lock);
2388
2389	event_function_call(event, __perf_remove_from_context, (void *)flags);
2390}
2391
2392/*
2393 * Cross CPU call to disable a performance event
2394 */
2395static void __perf_event_disable(struct perf_event *event,
2396				 struct perf_cpu_context *cpuctx,
2397				 struct perf_event_context *ctx,
2398				 void *info)
2399{
2400	if (event->state < PERF_EVENT_STATE_INACTIVE)
2401		return;
2402
2403	if (ctx->is_active & EVENT_TIME) {
2404		update_context_time(ctx);
2405		update_cgrp_time_from_event(event);
2406	}
2407
2408	if (event == event->group_leader)
2409		group_sched_out(event, cpuctx, ctx);
2410	else
2411		event_sched_out(event, cpuctx, ctx);
2412
2413	perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2414	perf_cgroup_event_disable(event, ctx);
2415}
2416
2417/*
2418 * Disable an event.
2419 *
2420 * If event->ctx is a cloned context, callers must make sure that
2421 * every task struct that event->ctx->task could possibly point to
2422 * remains valid.  This condition is satisfied when called through
2423 * perf_event_for_each_child or perf_event_for_each because they
2424 * hold the top-level event's child_mutex, so any descendant that
2425 * goes to exit will block in perf_event_exit_event().
2426 *
2427 * When called from perf_pending_event it's OK because event->ctx
2428 * is the current context on this CPU and preemption is disabled,
2429 * hence we can't get into perf_event_task_sched_out for this context.
2430 */
2431static void _perf_event_disable(struct perf_event *event)
2432{
2433	struct perf_event_context *ctx = event->ctx;
2434
2435	raw_spin_lock_irq(&ctx->lock);
2436	if (event->state <= PERF_EVENT_STATE_OFF) {
2437		raw_spin_unlock_irq(&ctx->lock);
2438		return;
2439	}
2440	raw_spin_unlock_irq(&ctx->lock);
2441
2442	event_function_call(event, __perf_event_disable, NULL);
2443}
2444
2445void perf_event_disable_local(struct perf_event *event)
2446{
2447	event_function_local(event, __perf_event_disable, NULL);
2448}
2449
2450/*
2451 * Strictly speaking kernel users cannot create groups and therefore this
2452 * interface does not need the perf_event_ctx_lock() magic.
2453 */
2454void perf_event_disable(struct perf_event *event)
2455{
2456	struct perf_event_context *ctx;
2457
2458	ctx = perf_event_ctx_lock(event);
2459	_perf_event_disable(event);
2460	perf_event_ctx_unlock(event, ctx);
2461}
2462EXPORT_SYMBOL_GPL(perf_event_disable);
2463
2464void perf_event_disable_inatomic(struct perf_event *event)
2465{
2466	WRITE_ONCE(event->pending_disable, smp_processor_id());
2467	/* can fail, see perf_pending_event_disable() */
2468	irq_work_queue(&event->pending);
2469}
2470
2471#define MAX_INTERRUPTS (~0ULL)
2472
2473static void perf_log_throttle(struct perf_event *event, int enable);
2474static void perf_log_itrace_start(struct perf_event *event);
2475
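/*
 * Start a single event on this CPU: mark it ACTIVE, hand it to its PMU via
 * ->add(PERF_EF_START) and update the context's activity accounting.
 * Returns -EAGAIN when the PMU cannot accommodate the event.
 */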
2476static int
2477event_sched_in(struct perf_event *event,
2478		 struct perf_cpu_context *cpuctx,
2479		 struct perf_event_context *ctx)
2480{
2481	int ret = 0;
2482
2483	WARN_ON_ONCE(event->ctx != ctx);
2484
2485	lockdep_assert_held(&ctx->lock);
2486
2487	if (event->state <= PERF_EVENT_STATE_OFF)
2488		return 0;
2489
2490	WRITE_ONCE(event->oncpu, smp_processor_id());
2491	/*
2492	 * Order event::oncpu write to happen before the ACTIVE state is
2493	 * visible. This allows perf_event_{stop,read}() to observe the correct
2494	 * ->oncpu if it sees ACTIVE.
2495	 */
2496	smp_wmb();
2497	perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2498
2499	/*
2500	 * Unthrottle the event: since we were scheduled out, we might have
2501	 * missed several ticks already, and for a heavily scheduling task
2502	 * there is little guarantee it'll get a tick in a timely manner.
2503	 */
2504	if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2505		perf_log_throttle(event, 1);
2506		event->hw.interrupts = 0;
2507	}
2508
2509	perf_pmu_disable(event->pmu);
2510
2511	perf_log_itrace_start(event);
2512
2513	if (event->pmu->add(event, PERF_EF_START)) {
2514		perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2515		event->oncpu = -1;
2516		ret = -EAGAIN;
2517		goto out;
2518	}
2519
2520	if (!is_software_event(event))
2521		cpuctx->active_oncpu++;
2522	if (!ctx->nr_active++)
2523		perf_event_ctx_activate(ctx);
2524	if (event->attr.freq && event->attr.sample_freq)
2525		ctx->nr_freq++;
2526
2527	if (event->attr.exclusive)
2528		cpuctx->exclusive = 1;
2529
2530out:
2531	perf_pmu_enable(event->pmu);
2532
2533	return ret;
2534}
2535
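/*
 * Schedule in a whole group as one unit using a PMU transaction: either the
 * leader and all its siblings go on together, or the partially scheduled
 * group is rolled back and -EAGAIN is returned.
 */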
2536static int
2537group_sched_in(struct perf_event *group_event,
2538	       struct perf_cpu_context *cpuctx,
2539	       struct perf_event_context *ctx)
2540{
2541	struct perf_event *event, *partial_group = NULL;
2542	struct pmu *pmu = ctx->pmu;
2543
2544	if (group_event->state == PERF_EVENT_STATE_OFF)
2545		return 0;
2546
2547	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2548
2549	if (event_sched_in(group_event, cpuctx, ctx))
2550		goto error;
2551
2552	/*
2553	 * Schedule in siblings as one group (if any):
2554	 */
2555	for_each_sibling_event(event, group_event) {
2556		if (event_sched_in(event, cpuctx, ctx)) {
2557			partial_group = event;
2558			goto group_error;
2559		}
2560	}
2561
2562	if (!pmu->commit_txn(pmu))
2563		return 0;
2564
2565group_error:
2566	/*
2567	 * Groups can be scheduled in as one unit only, so undo any
2568	 * partial group before returning:
2569	 * The events up to the failed event are scheduled out normally.
2570	 */
2571	for_each_sibling_event(event, group_event) {
2572		if (event == partial_group)
2573			break;
2574
2575		event_sched_out(event, cpuctx, ctx);
2576	}
2577	event_sched_out(group_event, cpuctx, ctx);
2578
2579error:
2580	pmu->cancel_txn(pmu);
2581	return -EAGAIN;
2582}
2583
2584/*
2585 * Work out whether we can put this event group on the CPU now.
2586 */
2587static int group_can_go_on(struct perf_event *event,
2588			   struct perf_cpu_context *cpuctx,
2589			   int can_add_hw)
2590{
2591	/*
2592	 * Groups consisting entirely of software events can always go on.
2593	 */
2594	if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2595		return 1;
2596	/*
2597	 * If an exclusive group is already on, no other hardware
2598	 * events can go on.
2599	 */
2600	if (cpuctx->exclusive)
2601		return 0;
2602	/*
2603	 * If this group is exclusive and there are already
2604	 * events on the CPU, it can't go on.
2605	 */
2606	if (event->attr.exclusive && !list_empty(get_event_list(event)))
2607		return 0;
2608	/*
2609	 * Otherwise, try to add it if all previous groups were able
2610	 * to go on.
2611	 */
2612	return can_add_hw;
2613}
2614
2615static void add_event_to_ctx(struct perf_event *event,
2616			       struct perf_event_context *ctx)
2617{
2618	list_add_event(event, ctx);
2619	perf_group_attach(event);
2620}
2621
2622static void ctx_sched_out(struct perf_event_context *ctx,
2623			  struct perf_cpu_context *cpuctx,
2624			  enum event_type_t event_type);
2625static void
2626ctx_sched_in(struct perf_event_context *ctx,
2627	     struct perf_cpu_context *cpuctx,
2628	     enum event_type_t event_type);
2629
2630static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2631			       struct perf_event_context *ctx,
2632			       enum event_type_t event_type)
2633{
2634	if (!cpuctx->task_ctx)
2635		return;
2636
2637	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2638		return;
2639
2640	ctx_sched_out(ctx, cpuctx, event_type);
2641}
2642
2643static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2644				struct perf_event_context *ctx)
2645{
2646	cpu_ctx_sched_in(cpuctx, EVENT_PINNED);
2647	if (ctx)
2648		ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
2649	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
2650	if (ctx)
2651		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
2652}
2653
2654/*
2655 * We want to maintain the following priority of scheduling:
2656 *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
2657 *  - task pinned (EVENT_PINNED)
2658 *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2659 *  - task flexible (EVENT_FLEXIBLE).
2660 *
2661 * In order to avoid unscheduling and scheduling back in everything every
2662 * time an event is added, only do it for the groups of equal priority and
2663 * below.
2664 *
2665 * This can be called after a batch operation on task events, in which case
2666 * event_type is a bit mask of the types of events involved. For CPU events,
2667 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2668 */
2669static void ctx_resched(struct perf_cpu_context *cpuctx,
2670			struct perf_event_context *task_ctx,
2671			enum event_type_t event_type)
2672{
2673	enum event_type_t ctx_event_type;
2674	bool cpu_event = !!(event_type & EVENT_CPU);
2675
2676	/*
2677	 * If pinned groups are involved, flexible groups also need to be
2678	 * scheduled out.
2679	 */
2680	if (event_type & EVENT_PINNED)
2681		event_type |= EVENT_FLEXIBLE;
2682
2683	ctx_event_type = event_type & EVENT_ALL;
2684
2685	perf_pmu_disable(cpuctx->ctx.pmu);
2686	if (task_ctx)
2687		task_ctx_sched_out(cpuctx, task_ctx, event_type);
2688
2689	/*
2690	 * Decide which cpu ctx groups to schedule out based on the types
2691	 * of events that caused rescheduling:
2692	 *  - EVENT_CPU: schedule out corresponding groups;
2693	 *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2694	 *  - otherwise, do nothing more.
2695	 */
2696	if (cpu_event)
2697		cpu_ctx_sched_out(cpuctx, ctx_event_type);
2698	else if (ctx_event_type & EVENT_PINNED)
2699		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2700
2701	perf_event_sched_in(cpuctx, task_ctx);
2702	perf_pmu_enable(cpuctx->ctx.pmu);
2703}
2704
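/*
 * Force a reschedule of all event types on this CPU, for both the CPU
 * context of @pmu and its current task context (if any).
 */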
2705void perf_pmu_resched(struct pmu *pmu)
2706{
2707	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2708	struct perf_event_context *task_ctx = cpuctx->task_ctx;
2709
2710	perf_ctx_lock(cpuctx, task_ctx);
2711	ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2712	perf_ctx_unlock(cpuctx, task_ctx);
2713}
2714
2715/*
2716 * Cross CPU call to install and enable a performance event
2717 *
2718 * Very similar to remote_function() + event_function() but cannot assume that
2719 * things like ctx->is_active and cpuctx->task_ctx are set.
2720 */
2721static int  __perf_install_in_context(void *info)
2722{
2723	struct perf_event *event = info;
2724	struct perf_event_context *ctx = event->ctx;
2725	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2726	struct perf_event_context *task_ctx = cpuctx->task_ctx;
2727	bool reprogram = true;
2728	int ret = 0;
2729
2730	raw_spin_lock(&cpuctx->ctx.lock);
2731	if (ctx->task) {
2732		raw_spin_lock(&ctx->lock);
2733		task_ctx = ctx;
2734
2735		reprogram = (ctx->task == current);
2736
2737		/*
2738		 * If the task is running, it must be running on this CPU,
2739		 * otherwise we cannot reprogram things.
2740		 *
2741		 * If it's not running, we don't care; ctx->lock will
2742		 * serialize against it becoming runnable.
2743		 */
2744		if (task_curr(ctx->task) && !reprogram) {
2745			ret = -ESRCH;
2746			goto unlock;
2747		}
2748
2749		WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2750	} else if (task_ctx) {
2751		raw_spin_lock(&task_ctx->lock);
2752	}
2753
2754#ifdef CONFIG_CGROUP_PERF
2755	if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
2756		/*
2757		 * If the current cgroup doesn't match the event's
2758		 * cgroup, we should not try to schedule it.
2759		 */
2760		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2761		reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2762					event->cgrp->css.cgroup);
2763	}
2764#endif
2765
2766	if (reprogram) {
2767		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2768		add_event_to_ctx(event, ctx);
2769		ctx_resched(cpuctx, task_ctx, get_event_type(event));
2770	} else {
2771		add_event_to_ctx(event, ctx);
2772	}
2773
2774unlock:
2775	perf_ctx_unlock(cpuctx, task_ctx);
2776
2777	return ret;
2778}
2779
2780static bool exclusive_event_installable(struct perf_event *event,
2781					struct perf_event_context *ctx);
2782
2783/*
2784 * Attach a performance event to a context.
2785 *
2786 * Very similar to event_function_call, see comment there.
2787 */
2788static void
2789perf_install_in_context(struct perf_event_context *ctx,
2790			struct perf_event *event,
2791			int cpu)
2792{
2793	struct task_struct *task = READ_ONCE(ctx->task);
2794
2795	lockdep_assert_held(&ctx->mutex);
2796
2797	WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
2798
2799	if (event->cpu != -1)
2800		event->cpu = cpu;
2801
2802	/*
2803	 * Ensures that if we can observe event->ctx, both the event and ctx
2804	 * will be 'complete'. See perf_iterate_sb_cpu().
2805	 */
2806	smp_store_release(&event->ctx, ctx);
2807
2808	/*
2809	 * perf_event_attr::disabled events will not run and can be initialized
2810	 * without IPI. Except when this is the first event for the context, in
2811	 * that case we need the magic of the IPI to set ctx->is_active.
2812	 * Similarly, cgroup events for the context also need the IPI to
2813	 * manipulate the cgrp_cpuctx_list.
2814	 *
2815	 * The IOC_ENABLE that is sure to follow the creation of a disabled
2816	 * event will issue the IPI and reprogram the hardware.
2817	 */
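	/*
	 * A userspace sketch of that pattern (illustrative only, error
	 * handling omitted):
	 *
	 *	attr.disabled = 1;
	 *	fd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0);
	 *	...
	 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	 */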
2818	if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
2819	    ctx->nr_events && !is_cgroup_event(event)) {
2820		raw_spin_lock_irq(&ctx->lock);
2821		if (ctx->task == TASK_TOMBSTONE) {
2822			raw_spin_unlock_irq(&ctx->lock);
2823			return;
2824		}
2825		add_event_to_ctx(event, ctx);
2826		raw_spin_unlock_irq(&ctx->lock);
2827		return;
2828	}
2829
2830	if (!task) {
2831		cpu_function_call(cpu, __perf_install_in_context, event);
2832		return;
2833	}
2834
2835	/*
2836	 * Should not happen, we validate the ctx is still alive before calling.
2837	 */
2838	if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2839		return;
2840
2841	/*
2842	 * Installing events is tricky because we cannot rely on ctx->is_active
2843	 * to be set in case this is the nr_events 0 -> 1 transition.
2844	 *
2845	 * Instead we use task_curr(), which tells us if the task is running.
2846	 * However, since we use task_curr() outside of rq::lock, we can race
2847	 * against the actual state. This means the result can be wrong.
2848	 *
2849	 * If we get a false positive, we retry, this is harmless.
2850	 *
2851	 * If we get a false negative, things are complicated. If we are after
2852	 * perf_event_context_sched_in() ctx::lock will serialize us, and the
2853	 * value must be correct. If we're before, it doesn't matter since
2854	 * perf_event_context_sched_in() will program the counter.
2855	 *
2856	 * However, this hinges on the remote context switch having observed
2857	 * our task->perf_event_ctxp[] store, such that it will in fact take
2858	 * ctx::lock in perf_event_context_sched_in().
2859	 *
2860	 * We do this by task_function_call(); if the IPI fails to hit the task,
2861	 * we know any future context switch of the task must see the
2862	 * perf_event_ctxp[] store.
2863	 */
2864
2865	/*
2866	 * This smp_mb() orders the task->perf_event_ctxp[] store with the
2867	 * task_cpu() load, such that if the IPI then does not find the task
2868	 * running, a future context switch of that task must observe the
2869	 * store.
2870	 */
2871	smp_mb();
2872again:
2873	if (!task_function_call(task, __perf_install_in_context, event))
2874		return;
2875
2876	raw_spin_lock_irq(&ctx->lock);
2877	task = ctx->task;
2878	if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2879		/*
2880		 * Cannot happen because we already checked above (which also
2881		 * cannot happen), and we hold ctx->mutex, which serializes us
2882		 * against perf_event_exit_task_context().
2883		 */
2884		raw_spin_unlock_irq(&ctx->lock);
2885		return;
2886	}
2887	/*
2888	 * If the task is not running, ctx->lock will avoid it becoming so,
2889	 * thus we can safely install the event.
2890	 */
2891	if (task_curr(task)) {
2892		raw_spin_unlock_irq(&ctx->lock);
2893		goto again;
2894	}
2895	add_event_to_ctx(event, ctx);
2896	raw_spin_unlock_irq(&ctx->lock);
2897}
2898
2899/*
2900 * Cross CPU call to enable a performance event
2901 */
2902static void __perf_event_enable(struct perf_event *event,
2903				struct perf_cpu_context *cpuctx,
2904				struct perf_event_context *ctx,
2905				void *info)
2906{
2907	struct perf_event *leader = event->group_leader;
2908	struct perf_event_context *task_ctx;
2909
2910	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2911	    event->state <= PERF_EVENT_STATE_ERROR)
2912		return;
2913
2914	if (ctx->is_active)
2915		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2916
2917	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2918	perf_cgroup_event_enable(event, ctx);
2919
2920	if (!ctx->is_active)
2921		return;
2922
2923	if (!event_filter_match(event)) {
2924		ctx_sched_in(ctx, cpuctx, EVENT_TIME);
2925		return;
2926	}
2927
2928	/*
2929	 * If the event is in a group and isn't the group leader,
2930	 * then don't put it on unless the group is on.
2931	 */
2932	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2933		ctx_sched_in(ctx, cpuctx, EVENT_TIME);
2934		return;
2935	}
2936
2937	task_ctx = cpuctx->task_ctx;
2938	if (ctx->task)
2939		WARN_ON_ONCE(task_ctx != ctx);
2940
2941	ctx_resched(cpuctx, task_ctx, get_event_type(event));
2942}
2943
2944/*
2945 * Enable an event.
2946 *
2947 * If event->ctx is a cloned context, callers must make sure that
2948 * every task struct that event->ctx->task could possibly point to
2949 * remains valid.  This condition is satisfied when called through
2950 * perf_event_for_each_child or perf_event_for_each as described
2951 * for perf_event_disable.
2952 */
2953static void _perf_event_enable(struct perf_event *event)
2954{
2955	struct perf_event_context *ctx = event->ctx;
2956
2957	raw_spin_lock_irq(&ctx->lock);
2958	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2959	    event->state <  PERF_EVENT_STATE_ERROR) {
2960out:
2961		raw_spin_unlock_irq(&ctx->lock);
2962		return;
2963	}
2964
2965	/*
2966	 * If the event is in error state, clear that first.
2967	 *
2968	 * That way, if we see the event in error state below, we know that it
2969	 * has gone back into error state, as distinct from the task having
2970	 * been scheduled away before the cross-call arrived.
2971	 */
2972	if (event->state == PERF_EVENT_STATE_ERROR) {
2973		/*
2974		 * Detached SIBLING events cannot leave ERROR state.
2975		 */
2976		if (event->event_caps & PERF_EV_CAP_SIBLING &&
2977		    event->group_leader == event)
2978			goto out;
2979
2980		event->state = PERF_EVENT_STATE_OFF;
2981	}
2982	raw_spin_unlock_irq(&ctx->lock);
2983
2984	event_function_call(event, __perf_event_enable, NULL);
2985}
2986
2987/*
2988 * See perf_event_disable();
2989 */
2990void perf_event_enable(struct perf_event *event)
2991{
2992	struct perf_event_context *ctx;
2993
2994	ctx = perf_event_ctx_lock(event);
2995	_perf_event_enable(event);
2996	perf_event_ctx_unlock(event, ctx);
2997}
2998EXPORT_SYMBOL_GPL(perf_event_enable);
2999
3000struct stop_event_data {
3001	struct perf_event	*event;
3002	unsigned int		restart;
3003};
3004
3005static int __perf_event_stop(void *info)
3006{
3007	struct stop_event_data *sd = info;
3008	struct perf_event *event = sd->event;
3009
3010	/* if it's already INACTIVE, do nothing */
3011	if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3012		return 0;
3013
3014	/* matches smp_wmb() in event_sched_in() */
3015	smp_rmb();
3016
3017	/*
3018	 * There is a window with interrupts enabled before we get here,
3019	 * so we need to check again lest we try to stop another CPU's event.
3020	 */
3021	if (READ_ONCE(event->oncpu) != smp_processor_id())
3022		return -EAGAIN;
3023
3024	event->pmu->stop(event, PERF_EF_UPDATE);
3025
3026	/*
3027	 * May race with the actual stop (through perf_pmu_output_stop()),
3028	 * but it is only used for events with AUX ring buffer, and such
3029	 * events will refuse to restart because of rb::aux_mmap_count==0,
3030	 * see comments in perf_aux_output_begin().
3031	 *
3032	 * Since this is happening on an event-local CPU, no trace is lost
3033	 * while restarting.
3034	 */
3035	if (sd->restart)
3036		event->pmu->start(event, 0);
3037
3038	return 0;
3039}
3040
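/*
 * Stop (and optionally restart) an ACTIVE event on whatever CPU it is
 * currently running on; retries on -EAGAIN in case the event migrates
 * between reading ->oncpu and the cross-CPU call.
 */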
3041static int perf_event_stop(struct perf_event *event, int restart)
3042{
3043	struct stop_event_data sd = {
3044		.event		= event,
3045		.restart	= restart,
3046	};
3047	int ret = 0;
3048
3049	do {
3050		if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3051			return 0;
3052
3053		/* matches smp_wmb() in event_sched_in() */
3054		smp_rmb();
3055
3056		/*
3057		 * We only want to restart ACTIVE events, so if the event goes
3058		 * inactive here (event->oncpu==-1), there's nothing more to do;
3059		 * fall through with ret==-ENXIO.
3060		 */
3061		ret = cpu_function_call(READ_ONCE(event->oncpu),
3062					__perf_event_stop, &sd);
3063	} while (ret == -EAGAIN);
3064
3065	return ret;
3066}
3067
3068/*
3069 * In order to contain the amount of racy and tricky code in the address
3070 * filter configuration management, it is a two-part process:
3071 *
3072 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
3073 *      we update the addresses of corresponding vmas in
3074 *	event::addr_filter_ranges array and bump the event::addr_filters_gen;
3075 * (p2) when an event is scheduled in (pmu::add), it calls
3076 *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
3077 *      if the generation has changed since the previous call.
3078 *
3079 * If (p1) happens while the event is active, we restart it to force (p2).
3080 *
3081 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
3082 *     pre-existing mappings, called once when new filters arrive via SET_FILTER
3083 *     ioctl;
3084 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
3085 *     registered mapping, called for every new mmap(), with mm::mmap_lock down
3086 *     for reading;
3087 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
3088 *     of exec.
3089 */
3090void perf_event_addr_filters_sync(struct perf_event *event)
3091{
3092	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
3093
3094	if (!has_addr_filter(event))
3095		return;
3096
3097	raw_spin_lock(&ifh->lock);
3098	if (event->addr_filters_gen != event->hw.addr_filters_gen) {
3099		event->pmu->addr_filters_sync(event);
3100		event->hw.addr_filters_gen = event->addr_filters_gen;
3101	}
3102	raw_spin_unlock(&ifh->lock);
3103}
3104EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
3105
3106static int _perf_event_refresh(struct perf_event *event, int refresh)
3107{
3108	/*
3109	 * not supported on inherited events
3110	 */
3111	if (event->attr.inherit || !is_sampling_event(event))
3112		return -EINVAL;
3113
3114	atomic_add(refresh, &event->event_limit);
3115	_perf_event_enable(event);
3116
3117	return 0;
3118}
3119
3120/*
3121 * See perf_event_disable()
3122 */
3123int perf_event_refresh(struct perf_event *event, int refresh)
3124{
3125	struct perf_event_context *ctx;
3126	int ret;
3127
3128	ctx = perf_event_ctx_lock(event);
3129	ret = _perf_event_refresh(event, refresh);
3130	perf_event_ctx_unlock(event, ctx);
3131
3132	return ret;
3133}
3134EXPORT_SYMBOL_GPL(perf_event_refresh);
3135
3136static int perf_event_modify_breakpoint(struct perf_event *bp,
3137					 struct perf_event_attr *attr)
3138{
3139	int err;
3140
3141	_perf_event_disable(bp);
3142
3143	err = modify_user_hw_breakpoint_check(bp, attr, true);
3144
3145	if (!bp->attr.disabled)
3146		_perf_event_enable(bp);
3147
3148	return err;
3149}
3150
3151/*
3152 * Copy event-type-independent attributes that may be modified.
3153 */
3154static void perf_event_modify_copy_attr(struct perf_event_attr *to,
3155					const struct perf_event_attr *from)
3156{
3157	to->sig_data = from->sig_data;
3158}
3159
3160static int perf_event_modify_attr(struct perf_event *event,
3161				  struct perf_event_attr *attr)
3162{
3163	int (*func)(struct perf_event *, struct perf_event_attr *);
3164	struct perf_event *child;
3165	int err;
3166
3167	if (event->attr.type != attr->type)
3168		return -EINVAL;
3169
3170	switch (event->attr.type) {
3171	case PERF_TYPE_BREAKPOINT:
3172		func = perf_event_modify_breakpoint;
3173		break;
3174	default:
3175		/* Place holder for future additions. */
3176		return -EOPNOTSUPP;
3177	}
3178
3179	WARN_ON_ONCE(event->ctx->parent_ctx);
3180
3181	mutex_lock(&event->child_mutex);
3182	/*
3183	 * Event-type-independent attributes must be copied before event-type
3184	 * modification, which will validate that final attributes match the
3185	 * source attributes after all relevant attributes have been copied.
3186	 */
3187	perf_event_modify_copy_attr(&event->attr, attr);
3188	err = func(event, attr);
3189	if (err)
3190		goto out;
3191	list_for_each_entry(child, &event->child_list, child_list) {
3192		perf_event_modify_copy_attr(&child->attr, attr);
3193		err = func(child, attr);
3194		if (err)
3195			goto out;
3196	}
3197out:
3198	mutex_unlock(&event->child_mutex);
3199	return err;
3200}
3201
3202static void ctx_sched_out(struct perf_event_context *ctx,
3203			  struct perf_cpu_context *cpuctx,
3204			  enum event_type_t event_type)
3205{
3206	struct perf_event *event, *tmp;
3207	int is_active = ctx->is_active;
3208
3209	lockdep_assert_held(&ctx->lock);
3210
3211	if (likely(!ctx->nr_events)) {
3212		/*
3213		 * See __perf_remove_from_context().
3214		 */
3215		WARN_ON_ONCE(ctx->is_active);
3216		if (ctx->task)
3217			WARN_ON_ONCE(cpuctx->task_ctx);
3218		return;
3219	}
3220
3221	/*
3222	 * Always update time if it was set; not only when it changes.
3223	 * Otherwise we can 'forget' to update time for any but the last
3224	 * context we sched out. For example:
3225	 *
3226	 *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
3227	 *   ctx_sched_out(.event_type = EVENT_PINNED)
3228	 *
3229	 * would only update time for the pinned events.
3230	 */
3231	if (is_active & EVENT_TIME) {
3232		/* update (and stop) ctx time */
3233		update_context_time(ctx);
3234		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
3235		/*
3236		 * CPU-release for the below ->is_active store,
3237		 * see __load_acquire() in perf_event_time_now()
3238		 */
3239		barrier();
3240	}
3241
3242	ctx->is_active &= ~event_type;
3243	if (!(ctx->is_active & EVENT_ALL))
3244		ctx->is_active = 0;
3245
3246	if (ctx->task) {
3247		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3248		if (!ctx->is_active)
3249			cpuctx->task_ctx = NULL;
3250	}
3251
3252	is_active ^= ctx->is_active; /* changed bits */
3253
3254	if (!ctx->nr_active || !(is_active & EVENT_ALL))
3255		return;
3256
3257	perf_pmu_disable(ctx->pmu);
3258	if (is_active & EVENT_PINNED) {
3259		list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
3260			group_sched_out(event, cpuctx, ctx);
3261	}
3262
3263	if (is_active & EVENT_FLEXIBLE) {
3264		list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
3265			group_sched_out(event, cpuctx, ctx);
3266
3267		/*
3268		 * Since we cleared EVENT_FLEXIBLE, also clear
3269		 * rotate_necessary; it will be reset by
3270		 * ctx_flexible_sched_in() when needed.
3271		 */
3272		ctx->rotate_necessary = 0;
3273	}
3274	perf_pmu_enable(ctx->pmu);
3275}
3276
3277/*
3278 * Test whether two contexts are equivalent, i.e. whether they have both been
3279 * cloned from the same version of the same context.
3280 *
3281 * Equivalence is measured using a generation number in the context that is
3282 * incremented on each modification to it; see unclone_ctx(), list_add_event()
3283 * and list_del_event().
3284 */
3285static int context_equiv(struct perf_event_context *ctx1,
3286			 struct perf_event_context *ctx2)
3287{
3288	lockdep_assert_held(&ctx1->lock);
3289	lockdep_assert_held(&ctx2->lock);
3290
3291	/* Pinning disables the swap optimization */
3292	if (ctx1->pin_count || ctx2->pin_count)
3293		return 0;
3294
3295	/* If ctx1 is the parent of ctx2 */
3296	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
3297		return 1;
3298
3299	/* If ctx2 is the parent of ctx1 */
3300	if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3301		return 1;
3302
3303	/*
3304	 * If ctx1 and ctx2 have the same parent; we flatten the parent
3305	 * hierarchy, see perf_event_init_context().
3306	 */
3307	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3308			ctx1->parent_gen == ctx2->parent_gen)
3309		return 1;
3310
3311	/* Unmatched */
3312	return 0;
3313}
3314
3315static void __perf_event_sync_stat(struct perf_event *event,
3316				     struct perf_event *next_event)
3317{
3318	u64 value;
3319
3320	if (!event->attr.inherit_stat)
3321		return;
3322
3323	/*
3324	 * Update the event value, we cannot use perf_event_read()
3325	 * because we're in the middle of a context switch and have IRQs
3326	 * disabled, which upsets smp_call_function_single(), however
3327	 * we know the event must be on the current CPU, therefore we
3328	 * don't need to use it.
3329	 */
3330	if (event->state == PERF_EVENT_STATE_ACTIVE)
3331		event->pmu->read(event);
3332
3333	perf_event_update_time(event);
3334
3335	/*
3336	 * In order to keep per-task stats reliable we need to flip the event
3337	 * values when we flip the contexts.
3338	 */
3339	value = local64_read(&next_event->count);
3340	value = local64_xchg(&event->count, value);
3341	local64_set(&next_event->count, value);
3342
3343	swap(event->total_time_enabled, next_event->total_time_enabled);
3344	swap(event->total_time_running, next_event->total_time_running);
3345
3346	/*
3347	 * Since we swizzled the values, update the user visible data too.
3348	 */
3349	perf_event_update_userpage(event);
3350	perf_event_update_userpage(next_event);
3351}
3352
3353static void perf_event_sync_stat(struct perf_event_context *ctx,
3354				   struct perf_event_context *next_ctx)
3355{
3356	struct perf_event *event, *next_event;
3357
3358	if (!ctx->nr_stat)
3359		return;
3360
3361	update_context_time(ctx);
3362
3363	event = list_first_entry(&ctx->event_list,
3364				   struct perf_event, event_entry);
3365
3366	next_event = list_first_entry(&next_ctx->event_list,
3367					struct perf_event, event_entry);
3368
3369	while (&event->event_entry != &ctx->event_list &&
3370	       &next_event->event_entry != &next_ctx->event_list) {
3371
3372		__perf_event_sync_stat(event, next_event);
3373
3374		event = list_next_entry(event, event_entry);
3375		next_event = list_next_entry(next_event, event_entry);
3376	}
3377}
3378
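/*
 * Schedule a task's events out at context-switch time. When the outgoing
 * and incoming tasks carry equivalent (cloned) contexts, swap the context
 * pointers instead of scheduling every event out and back in.
 */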
3379static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
3380					 struct task_struct *next)
3381{
3382	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
3383	struct perf_event_context *next_ctx;
3384	struct perf_event_context *parent, *next_parent;
3385	struct perf_cpu_context *cpuctx;
3386	int do_switch = 1;
3387	struct pmu *pmu;
3388
3389	if (likely(!ctx))
3390		return;
3391
3392	pmu = ctx->pmu;
3393	cpuctx = __get_cpu_context(ctx);
3394	if (!cpuctx->task_ctx)
3395		return;
3396
3397	rcu_read_lock();
3398	next_ctx = next->perf_event_ctxp[ctxn];
3399	if (!next_ctx)
3400		goto unlock;
3401
3402	parent = rcu_dereference(ctx->parent_ctx);
3403	next_parent = rcu_dereference(next_ctx->parent_ctx);
3404
3405	/* If neither context has a parent context, they cannot be clones. */
3406	if (!parent && !next_parent)
3407		goto unlock;
3408
3409	if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3410		/*
3411		 * Looks like the two contexts are clones, so we might be
3412		 * able to optimize the context switch.  We lock both
3413		 * contexts and check that they are clones under the
3414		 * lock (including re-checking that neither has been
3415		 * uncloned in the meantime).  It doesn't matter which
3416		 * order we take the locks because no other cpu could
3417		 * be trying to lock both of these tasks.
3418		 */
3419		raw_spin_lock(&ctx->lock);
3420		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3421		if (context_equiv(ctx, next_ctx)) {
3422
3423			WRITE_ONCE(ctx->task, next);
3424			WRITE_ONCE(next_ctx->task, task);
3425
3426			perf_pmu_disable(pmu);
3427
3428			if (cpuctx->sched_cb_usage && pmu->sched_task)
3429				pmu->sched_task(ctx, false);
3430
3431			/*
3432			 * PMU-specific parts of the task perf context can require
3433			 * additional synchronization. As an example of such
3434			 * synchronization, see the implementation details of Intel
3435			 * LBR call stack data profiling.
3436			 */
3437			if (pmu->swap_task_ctx)
3438				pmu->swap_task_ctx(ctx, next_ctx);
3439			else
3440				swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3441
3442			perf_pmu_enable(pmu);
3443
3444			/*
3445			 * RCU_INIT_POINTER here is safe because we've not
3446			 * modified the ctx and the above modification of
3447			 * ctx->task and ctx->task_ctx_data are immaterial
3448			 * since those values are always verified under
3449			 * ctx->lock which we're now holding.
3450			 */
3451			RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3452			RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3453
3454			do_switch = 0;
3455
3456			perf_event_sync_stat(ctx, next_ctx);
3457		}
3458		raw_spin_unlock(&next_ctx->lock);
3459		raw_spin_unlock(&ctx->lock);
3460	}
3461unlock:
3462	rcu_read_unlock();
3463
3464	if (do_switch) {
3465		raw_spin_lock(&ctx->lock);
3466		perf_pmu_disable(pmu);
3467
3468		if (cpuctx->sched_cb_usage && pmu->sched_task)
3469			pmu->sched_task(ctx, false);
3470		task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3471
3472		perf_pmu_enable(pmu);
3473		raw_spin_unlock(&ctx->lock);
3474	}
3475}
3476
3477static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3478
3479void perf_sched_cb_dec(struct pmu *pmu)
3480{
3481	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3482
3483	this_cpu_dec(perf_sched_cb_usages);
3484
3485	if (!--cpuctx->sched_cb_usage)
3486		list_del(&cpuctx->sched_cb_entry);
3487}
3488
3489
3490void perf_sched_cb_inc(struct pmu *pmu)
3491{
3492	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3493
3494	if (!cpuctx->sched_cb_usage++)
3495		list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3496
3497	this_cpu_inc(perf_sched_cb_usages);
3498}
3499
3500/*
3501 * This function provides the context switch callback to the lower code
3502 * layer. It is invoked ONLY when the context switch callback is enabled.
3503 *
3504 * This callback is relevant even to per-cpu events; for example, multi-event
3505 * PEBS requires this to provide PID/TID information. This requires that we
3506 * flush all queued PEBS records before we context switch to a new task.
3507 */
3508static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
3509{
3510	struct pmu *pmu;
3511
3512	pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3513
3514	if (WARN_ON_ONCE(!pmu->sched_task))
3515		return;
3516
3517	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3518	perf_pmu_disable(pmu);
3519
3520	pmu->sched_task(cpuctx->task_ctx, sched_in);
3521
3522	perf_pmu_enable(pmu);
3523	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3524}
3525
3526static void perf_pmu_sched_task(struct task_struct *prev,
3527				struct task_struct *next,
3528				bool sched_in)
3529{
3530	struct perf_cpu_context *cpuctx;
3531
3532	if (prev == next)
3533		return;
3534
3535	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3536		/* will be handled in perf_event_context_sched_in/out */
3537		if (cpuctx->task_ctx)
3538			continue;
3539
3540		__perf_pmu_sched_task(cpuctx, sched_in);
3541	}
3542}
3543
3544static void perf_event_switch(struct task_struct *task,
3545			      struct task_struct *next_prev, bool sched_in);
3546
3547#define for_each_task_context_nr(ctxn)					\
3548	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3549
3550/*
3551 * Called from scheduler to remove the events of the current task,
3552 * with interrupts disabled.
3553 *
3554 * We stop each event and update the event value in event->count.
3555 *
3556 * This does not protect us against NMI, but disable()
3557 * sets the disabled bit in the control field of event _before_
3558 * accessing the event control register. If an NMI hits, then it will
3559 * not restart the event.
3560 */
3561void __perf_event_task_sched_out(struct task_struct *task,
3562				 struct task_struct *next)
3563{
3564	int ctxn;
3565
3566	if (__this_cpu_read(perf_sched_cb_usages))
3567		perf_pmu_sched_task(task, next, false);
3568
3569	if (atomic_read(&nr_switch_events))
3570		perf_event_switch(task, next, false);
3571
3572	for_each_task_context_nr(ctxn)
3573		perf_event_context_sched_out(task, ctxn, next);
3574
3575	/*
3576	 * If cgroup events exist on this CPU, then we need
3577	 * to check if we have to switch out PMU state.
3578	 * Cgroup events are system-wide mode only.
3579	 */
3580	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3581		perf_cgroup_switch(next);
3582}
3583
3584/*
3585 * Called with IRQs disabled
3586 */
3587static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3588			      enum event_type_t event_type)
3589{
3590	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3591}
3592
3593static bool perf_less_group_idx(const void *l, const void *r)
3594{
3595	const struct perf_event *le = *(const struct perf_event **)l;
3596	const struct perf_event *re = *(const struct perf_event **)r;
3597
3598	return le->group_index < re->group_index;
3599}
3600
3601static void swap_ptr(void *l, void *r)
3602{
3603	void **lp = l, **rp = r;
3604
3605	swap(*lp, *rp);
3606}
3607
3608static const struct min_heap_callbacks perf_min_heap = {
3609	.elem_size = sizeof(struct perf_event *),
3610	.less = perf_less_group_idx,
3611	.swp = swap_ptr,
3612};
3613
3614static void __heap_add(struct min_heap *heap, struct perf_event *event)
3615{
3616	struct perf_event **itrs = heap->data;
3617
3618	if (event) {
3619		itrs[heap->nr] = event;
3620		heap->nr++;
3621	}
3622}
3623
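/*
 * Visit the events of @groups in increasing group_index order by merging
 * several iterators through a min-heap: the current CPU's groups, the
 * any-CPU (-1) groups for task contexts, and one iterator per cgroup
 * ancestor for CPU contexts (with CONFIG_CGROUP_PERF). @func is called for
 * each event; a non-zero return value stops the iteration.
 */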
3624static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
3625				struct perf_event_groups *groups, int cpu,
3626				int (*func)(struct perf_event *, void *),
3627				void *data)
3628{
3629#ifdef CONFIG_CGROUP_PERF
3630	struct cgroup_subsys_state *css = NULL;
3631#endif
3632	/* Space for per CPU and/or any CPU event iterators. */
3633	struct perf_event *itrs[2];
3634	struct min_heap event_heap;
3635	struct perf_event **evt;
3636	int ret;
3637
3638	if (cpuctx) {
3639		event_heap = (struct min_heap){
3640			.data = cpuctx->heap,
3641			.nr = 0,
3642			.size = cpuctx->heap_size,
3643		};
3644
3645		lockdep_assert_held(&cpuctx->ctx.lock);
3646
3647#ifdef CONFIG_CGROUP_PERF
3648		if (cpuctx->cgrp)
3649			css = &cpuctx->cgrp->css;
3650#endif
3651	} else {
3652		event_heap = (struct min_heap){
3653			.data = itrs,
3654			.nr = 0,
3655			.size = ARRAY_SIZE(itrs),
3656		};
3657		/* Events not within a CPU context may be on any CPU. */
3658		__heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
3659	}
3660	evt = event_heap.data;
3661
3662	__heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
3663
3664#ifdef CONFIG_CGROUP_PERF
3665	for (; css; css = css->parent)
3666		__heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
3667#endif
3668
3669	min_heapify_all(&event_heap, &perf_min_heap);
3670
3671	while (event_heap.nr) {
3672		ret = func(*evt, data);
3673		if (ret)
3674			return ret;
3675
3676		*evt = perf_event_groups_next(*evt);
3677		if (*evt)
3678			min_heapify(&event_heap, 0, &perf_min_heap);
3679		else
3680			min_heap_pop(&event_heap, &perf_min_heap);
3681	}
3682
3683	return 0;
3684}
3685
3686/*
3687 * Because the userpage is strictly per-event (there is no concept of context,
3688 * so there cannot be a context indirection), every userpage must be updated
3689 * when context time starts :-(
3690 *
3691 * IOW, we must not miss EVENT_TIME edges.
3692 */
3693static inline bool event_update_userpage(struct perf_event *event)
3694{
3695	if (likely(!atomic_read(&event->mmap_count)))
3696		return false;
3697
3698	perf_event_update_time(event);
3699	perf_event_update_userpage(event);
3700
3701	return true;
3702}
3703
3704static inline void group_update_userpage(struct perf_event *group_event)
3705{
3706	struct perf_event *event;
3707
3708	if (!event_update_userpage(group_event))
3709		return;
3710
3711	for_each_sibling_event(event, group_event)
3712		event_update_userpage(event);
3713}
3714
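/*
 * Callback for visit_groups_merge(): try to schedule in one group. If it
 * does not fit, pinned groups are put into ERROR state, while flexible
 * groups are flagged for rotation so they get another chance later.
 */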
3715static int merge_sched_in(struct perf_event *event, void *data)
3716{
3717	struct perf_event_context *ctx = event->ctx;
3718	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3719	int *can_add_hw = data;
3720
3721	if (event->state <= PERF_EVENT_STATE_OFF)
3722		return 0;
3723
3724	if (!event_filter_match(event))
3725		return 0;
3726
3727	if (group_can_go_on(event, cpuctx, *can_add_hw)) {
3728		if (!group_sched_in(event, cpuctx, ctx))
3729			list_add_tail(&event->active_list, get_event_list(event));
3730	}
3731
3732	if (event->state == PERF_EVENT_STATE_INACTIVE) {
3733		*can_add_hw = 0;
3734		if (event->attr.pinned) {
3735			perf_cgroup_event_disable(event, ctx);
3736			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3737		} else {
3738			ctx->rotate_necessary = 1;
3739			perf_mux_hrtimer_restart(cpuctx);
3740			group_update_userpage(event);
3741		}
3742	}
3743
3744	return 0;
3745}
3746
3747static void
3748ctx_pinned_sched_in(struct perf_event_context *ctx,
3749		    struct perf_cpu_context *cpuctx)
3750{
3751	int can_add_hw = 1;
3752
3753	if (ctx != &cpuctx->ctx)
3754		cpuctx = NULL;
3755
3756	visit_groups_merge(cpuctx, &ctx->pinned_groups,
3757			   smp_processor_id(),
3758			   merge_sched_in, &can_add_hw);
3759}
3760
3761static void
3762ctx_flexible_sched_in(struct perf_event_context *ctx,
3763		      struct perf_cpu_context *cpuctx)
3764{
3765	int can_add_hw = 1;
3766
3767	if (ctx != &cpuctx->ctx)
3768		cpuctx = NULL;
3769
3770	visit_groups_merge(cpuctx, &ctx->flexible_groups,
3771			   smp_processor_id(),
3772			   merge_sched_in, &can_add_hw);
3773}
3774
3775static void
3776ctx_sched_in(struct perf_event_context *ctx,
3777	     struct perf_cpu_context *cpuctx,
3778	     enum event_type_t event_type)
3779{
3780	int is_active = ctx->is_active;
3781
3782	lockdep_assert_held(&ctx->lock);
3783
3784	if (likely(!ctx->nr_events))
3785		return;
3786
3787	if (is_active ^ EVENT_TIME) {
3788		/* start ctx time */
3789		__update_context_time(ctx, false);
3790		perf_cgroup_set_timestamp(cpuctx);
3791		/*
3792		 * CPU-release for the below ->is_active store,
3793		 * see __load_acquire() in perf_event_time_now()
3794		 */
3795		barrier();
3796	}
3797
3798	ctx->is_active |= (event_type | EVENT_TIME);
3799	if (ctx->task) {
3800		if (!is_active)
3801			cpuctx->task_ctx = ctx;
3802		else
3803			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3804	}
3805
3806	is_active ^= ctx->is_active; /* changed bits */
3807
3808	/*
3809	 * First go through the list and put on any pinned groups
3810	 * in order to give them the best chance of going on.
3811	 */
3812	if (is_active & EVENT_PINNED)
3813		ctx_pinned_sched_in(ctx, cpuctx);
3814
3815	/* Then walk through the lower prio flexible groups */
3816	if (is_active & EVENT_FLEXIBLE)
3817		ctx_flexible_sched_in(ctx, cpuctx);
3818}
3819
3820static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3821			     enum event_type_t event_type)
3822{
3823	struct perf_event_context *ctx = &cpuctx->ctx;
3824
3825	ctx_sched_in(ctx, cpuctx, event_type);
3826}
3827
3828static void perf_event_context_sched_in(struct perf_event_context *ctx,
3829					struct task_struct *task)
3830{
3831	struct perf_cpu_context *cpuctx;
3832	struct pmu *pmu;
3833
3834	cpuctx = __get_cpu_context(ctx);
3835
3836	/*
3837	 * HACK: for HETEROGENEOUS the task context might have switched to a
3838	 * different PMU; force (re)set the context.
3839	 */
3840	pmu = ctx->pmu = cpuctx->ctx.pmu;
3841
3842	if (cpuctx->task_ctx == ctx) {
3843		if (cpuctx->sched_cb_usage)
3844			__perf_pmu_sched_task(cpuctx, true);
3845		return;
3846	}
3847
3848	perf_ctx_lock(cpuctx, ctx);
3849	/*
3850	 * We must check ctx->nr_events while holding ctx->lock, such
3851	 * that we serialize against perf_install_in_context().
3852	 */
3853	if (!ctx->nr_events)
3854		goto unlock;
3855
3856	perf_pmu_disable(pmu);
3857	/*
3858	 * We want to keep the following priority order:
3859	 * cpu pinned (that don't need to move), task pinned,
3860	 * cpu flexible, task flexible.
3861	 *
3862	 * However, if the task's ctx is not carrying any pinned
3863	 * events, no need to flip the cpuctx's events around.
3864	 */
3865	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3866		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3867	perf_event_sched_in(cpuctx, ctx);
3868
3869	if (cpuctx->sched_cb_usage && pmu->sched_task)
3870		pmu->sched_task(cpuctx->task_ctx, true);
3871
3872	perf_pmu_enable(pmu);
3873
3874unlock:
3875	perf_ctx_unlock(cpuctx, ctx);
3876}
3877
3878/*
3879 * Called from scheduler to add the events of the current task
3880 * with interrupts disabled.
3881 *
3882 * We restore the event value and then enable it.
3883 *
3884 * This does not protect us against NMI, but enable()
3885 * sets the enabled bit in the control field of event _before_
3886 * accessing the event control register. If an NMI hits, then it will
3887 * keep the event running.
3888 */
3889void __perf_event_task_sched_in(struct task_struct *prev,
3890				struct task_struct *task)
3891{
3892	struct perf_event_context *ctx;
3893	int ctxn;
3894
3895	for_each_task_context_nr(ctxn) {
3896		ctx = task->perf_event_ctxp[ctxn];
3897		if (likely(!ctx))
3898			continue;
3899
3900		perf_event_context_sched_in(ctx, task);
3901	}
3902
3903	if (atomic_read(&nr_switch_events))
3904		perf_event_switch(task, prev, true);
3905
3906	if (__this_cpu_read(perf_sched_cb_usages))
3907		perf_pmu_sched_task(prev, task, true);
3908}
3909
3910static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3911{
3912	u64 frequency = event->attr.sample_freq;
3913	u64 sec = NSEC_PER_SEC;
3914	u64 divisor, dividend;
3915
3916	int count_fls, nsec_fls, frequency_fls, sec_fls;
3917
3918	count_fls = fls64(count);
3919	nsec_fls = fls64(nsec);
3920	frequency_fls = fls64(frequency);
3921	sec_fls = 30;
3922
3923	/*
	 * We got @count in @nsec; with a target of sample_freq HZ,
3925	 * the target period becomes:
3926	 *
3927	 *             @count * 10^9
3928	 * period = -------------------
3929	 *          @nsec * sample_freq
3930	 *
3931	 */
3932
3933	/*
3934	 * Reduce accuracy by one bit such that @a and @b converge
3935	 * to a similar magnitude.
3936	 */
3937#define REDUCE_FLS(a, b)		\
3938do {					\
3939	if (a##_fls > b##_fls) {	\
3940		a >>= 1;		\
3941		a##_fls--;		\
3942	} else {			\
3943		b >>= 1;		\
3944		b##_fls--;		\
3945	}				\
3946} while (0)
3947
3948	/*
3949	 * Reduce accuracy until either term fits in a u64, then proceed with
3950	 * the other, so that finally we can do a u64/u64 division.
3951	 */
3952	while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3953		REDUCE_FLS(nsec, frequency);
3954		REDUCE_FLS(sec, count);
3955	}
3956
3957	if (count_fls + sec_fls > 64) {
3958		divisor = nsec * frequency;
3959
3960		while (count_fls + sec_fls > 64) {
3961			REDUCE_FLS(count, sec);
3962			divisor >>= 1;
3963		}
3964
3965		dividend = count * sec;
3966	} else {
3967		dividend = count * sec;
3968
3969		while (nsec_fls + frequency_fls > 64) {
3970			REDUCE_FLS(nsec, frequency);
3971			dividend >>= 1;
3972		}
3973
3974		divisor = nsec * frequency;
3975	}
3976
3977	if (!divisor)
3978		return dividend;
3979
3980	return div64_u64(dividend, divisor);
3981}
3982
3983static DEFINE_PER_CPU(int, perf_throttled_count);
3984static DEFINE_PER_CPU(u64, perf_throttled_seq);
3985
3986static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3987{
3988	struct hw_perf_event *hwc = &event->hw;
3989	s64 period, sample_period;
3990	s64 delta;
3991
3992	period = perf_calculate_period(event, nsec, count);
3993
3994	delta = (s64)(period - hwc->sample_period);
3995	delta = (delta + 7) / 8; /* low pass filter */
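	/*
	 * Illustrative example: with hwc->sample_period = 100,000 and a
	 * freshly computed period of 180,000, delta is 80,000 and only
	 * 10,000 of it is applied this tick, giving a new sample_period
	 * of 110,000; repeated ticks converge on the target gradually.
	 */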
3996
3997	sample_period = hwc->sample_period + delta;
3998
3999	if (!sample_period)
4000		sample_period = 1;
4001
4002	hwc->sample_period = sample_period;
4003
4004	if (local64_read(&hwc->period_left) > 8*sample_period) {
4005		if (disable)
4006			event->pmu->stop(event, PERF_EF_UPDATE);
4007
4008		local64_set(&hwc->period_left, 0);
4009
4010		if (disable)
4011			event->pmu->start(event, PERF_EF_RELOAD);
4012	}
4013}
4014
4015/*
 * Combine frequency adjustment with unthrottling to avoid two passes over
 * the events. At the same time, make sure that having freq events does not
 * change the rate of unthrottling, as that would introduce bias.
4019 */
4020static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
4021					   int needs_unthr)
4022{
4023	struct perf_event *event;
4024	struct hw_perf_event *hwc;
4025	u64 now, period = TICK_NSEC;
4026	s64 delta;
4027
4028	/*
	 * We only need to iterate over all events if:
	 * - the context has events in frequency mode (needs freq adjust), or
	 * - there are events to unthrottle on this CPU
4032	 */
4033	if (!(ctx->nr_freq || needs_unthr))
4034		return;
4035
4036	raw_spin_lock(&ctx->lock);
4037	perf_pmu_disable(ctx->pmu);
4038
4039	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4040		if (event->state != PERF_EVENT_STATE_ACTIVE)
4041			continue;
4042
4043		if (!event_filter_match(event))
4044			continue;
4045
4046		perf_pmu_disable(event->pmu);
4047
4048		hwc = &event->hw;
4049
4050		if (hwc->interrupts == MAX_INTERRUPTS) {
4051			hwc->interrupts = 0;
4052			perf_log_throttle(event, 1);
4053			event->pmu->start(event, 0);
4054		}
4055
4056		if (!event->attr.freq || !event->attr.sample_freq)
4057			goto next;
4058
4059		/*
4060		 * stop the event and update event->count
4061		 */
4062		event->pmu->stop(event, PERF_EF_UPDATE);
4063
4064		now = local64_read(&event->count);
4065		delta = now - hwc->freq_count_stamp;
4066		hwc->freq_count_stamp = now;
4067
4068		/*
		 * Restart the event; reload only if the value has changed.
		 * We have already stopped the event, so tell
		 * perf_adjust_period() not to stop it a second time.
4074		 */
4075		if (delta > 0)
4076			perf_adjust_period(event, period, delta, false);
4077
4078		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
4079	next:
4080		perf_pmu_enable(event->pmu);
4081	}
4082
4083	perf_pmu_enable(ctx->pmu);
4084	raw_spin_unlock(&ctx->lock);
4085}
4086
4087/*
 * Move @event to the tail of the @ctx's eligible events.
4089 */
4090static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
4091{
4092	/*
	 * Rotate the first entry of the non-pinned groups to the tail.
	 * Rotation might be disabled by the inheritance code.
4095	 */
4096	if (ctx->rotate_disable)
4097		return;
4098
4099	perf_event_groups_delete(&ctx->flexible_groups, event);
4100	perf_event_groups_insert(&ctx->flexible_groups, event);
4101}
4102
4103/* pick an event from the flexible_groups to rotate */
4104static inline struct perf_event *
4105ctx_event_to_rotate(struct perf_event_context *ctx)
4106{
4107	struct perf_event *event;
4108
4109	/* pick the first active flexible event */
4110	event = list_first_entry_or_null(&ctx->flexible_active,
4111					 struct perf_event, active_list);
4112
4113	/* if no active flexible event, pick the first event */
4114	if (!event) {
4115		event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
4116				      typeof(*event), group_node);
4117	}
4118
4119	/*
4120	 * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
4121	 * finds there are unschedulable events, it will set it again.
4122	 */
4123	ctx->rotate_necessary = 0;
4124
4125	return event;
4126}
4127
4128static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
4129{
4130	struct perf_event *cpu_event = NULL, *task_event = NULL;
4131	struct perf_event_context *task_ctx = NULL;
4132	int cpu_rotate, task_rotate;
4133
4134	/*
4135	 * Since we run this from IRQ context, nobody can install new
	 * events; thus the event count values are stable.
4137	 */
4138
4139	cpu_rotate = cpuctx->ctx.rotate_necessary;
4140	task_ctx = cpuctx->task_ctx;
4141	task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
4142
4143	if (!(cpu_rotate || task_rotate))
4144		return false;
4145
4146	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
4147	perf_pmu_disable(cpuctx->ctx.pmu);
4148
4149	if (task_rotate)
4150		task_event = ctx_event_to_rotate(task_ctx);
4151	if (cpu_rotate)
4152		cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
4153
4154	/*
	 * As per the order given at ctx_resched(), first 'pop' the task
	 * flexible events and then, if needed, the CPU flexible ones.
4157	 */
4158	if (task_event || (task_ctx && cpu_event))
4159		ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
4160	if (cpu_event)
4161		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
4162
4163	if (task_event)
4164		rotate_ctx(task_ctx, task_event);
4165	if (cpu_event)
4166		rotate_ctx(&cpuctx->ctx, cpu_event);
4167
4168	perf_event_sched_in(cpuctx, task_ctx);
4169
4170	perf_pmu_enable(cpuctx->ctx.pmu);
4171	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
4172
4173	return true;
4174}
4175
4176void perf_event_task_tick(void)
4177{
4178	struct list_head *head = this_cpu_ptr(&active_ctx_list);
4179	struct perf_event_context *ctx, *tmp;
4180	int throttled;
4181
4182	lockdep_assert_irqs_disabled();
4183
4184	__this_cpu_inc(perf_throttled_seq);
4185	throttled = __this_cpu_xchg(perf_throttled_count, 0);
4186	tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
4187
4188	list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
4189		perf_adjust_freq_unthr_context(ctx, throttled);
4190}
4191
4192static int event_enable_on_exec(struct perf_event *event,
4193				struct perf_event_context *ctx)
4194{
4195	if (!event->attr.enable_on_exec)
4196		return 0;
4197
4198	event->attr.enable_on_exec = 0;
4199	if (event->state >= PERF_EVENT_STATE_INACTIVE)
4200		return 0;
4201
4202	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
4203
4204	return 1;
4205}
4206
4207/*
4208 * Enable all of a task's events that have been marked enable-on-exec.
4209 * This expects task == current.
4210 */
4211static void perf_event_enable_on_exec(int ctxn)
4212{
4213	struct perf_event_context *ctx, *clone_ctx = NULL;
4214	enum event_type_t event_type = 0;
4215	struct perf_cpu_context *cpuctx;
4216	struct perf_event *event;
4217	unsigned long flags;
4218	int enabled = 0;
4219
4220	local_irq_save(flags);
4221	ctx = current->perf_event_ctxp[ctxn];
4222	if (!ctx || !ctx->nr_events)
4223		goto out;
4224
4225	cpuctx = __get_cpu_context(ctx);
4226	perf_ctx_lock(cpuctx, ctx);
4227	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
4228	list_for_each_entry(event, &ctx->event_list, event_entry) {
4229		enabled |= event_enable_on_exec(event, ctx);
4230		event_type |= get_event_type(event);
4231	}
4232
4233	/*
4234	 * Unclone and reschedule this context if we enabled any event.
4235	 */
4236	if (enabled) {
4237		clone_ctx = unclone_ctx(ctx);
4238		ctx_resched(cpuctx, ctx, event_type);
4239	} else {
4240		ctx_sched_in(ctx, cpuctx, EVENT_TIME);
4241	}
4242	perf_ctx_unlock(cpuctx, ctx);
4243
4244out:
4245	local_irq_restore(flags);
4246
4247	if (clone_ctx)
4248		put_ctx(clone_ctx);
4249}
4250
4251static void perf_remove_from_owner(struct perf_event *event);
4252static void perf_event_exit_event(struct perf_event *event,
4253				  struct perf_event_context *ctx);
4254
4255/*
4256 * Removes all events from the current task that have been marked
4257 * remove-on-exec, and feeds their values back to parent events.
4258 */
4259static void perf_event_remove_on_exec(int ctxn)
4260{
4261	struct perf_event_context *ctx, *clone_ctx = NULL;
4262	struct perf_event *event, *next;
4263	unsigned long flags;
4264	bool modified = false;
4265
4266	ctx = perf_pin_task_context(current, ctxn);
4267	if (!ctx)
4268		return;
4269
4270	mutex_lock(&ctx->mutex);
4271
4272	if (WARN_ON_ONCE(ctx->task != current))
4273		goto unlock;
4274
4275	list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
4276		if (!event->attr.remove_on_exec)
4277			continue;
4278
4279		if (!is_kernel_event(event))
4280			perf_remove_from_owner(event);
4281
4282		modified = true;
4283
4284		perf_event_exit_event(event, ctx);
4285	}
4286
4287	raw_spin_lock_irqsave(&ctx->lock, flags);
4288	if (modified)
4289		clone_ctx = unclone_ctx(ctx);
4290	--ctx->pin_count;
4291	raw_spin_unlock_irqrestore(&ctx->lock, flags);
4292
4293unlock:
4294	mutex_unlock(&ctx->mutex);
4295
4296	put_ctx(ctx);
4297	if (clone_ctx)
4298		put_ctx(clone_ctx);
4299}
4300
4301struct perf_read_data {
4302	struct perf_event *event;
4303	bool group;
4304	int ret;
4305};
4306
4307static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
4308{
4309	u16 local_pkg, event_pkg;
4310
4311	if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
4312		int local_cpu = smp_processor_id();
4313
4314		event_pkg = topology_physical_package_id(event_cpu);
4315		local_pkg = topology_physical_package_id(local_cpu);
4316
4317		if (event_pkg == local_pkg)
4318			return local_cpu;
4319	}
4320
4321	return event_cpu;
4322}
4323
4324/*
4325 * Cross CPU call to read the hardware event
4326 */
4327static void __perf_event_read(void *info)
4328{
4329	struct perf_read_data *data = info;
4330	struct perf_event *sub, *event = data->event;
4331	struct perf_event_context *ctx = event->ctx;
4332	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
4333	struct pmu *pmu = event->pmu;
4334
4335	/*
4336	 * If this is a task context, we need to check whether it is
4337	 * the current task context of this cpu.  If not it has been
4338	 * scheduled out before the smp call arrived.  In that case
4339	 * event->count would have been updated to a recent sample
4340	 * when the event was scheduled out.
4341	 */
4342	if (ctx->task && cpuctx->task_ctx != ctx)
4343		return;
4344
4345	raw_spin_lock(&ctx->lock);
4346	if (ctx->is_active & EVENT_TIME) {
4347		update_context_time(ctx);
4348		update_cgrp_time_from_event(event);
4349	}
4350
4351	perf_event_update_time(event);
4352	if (data->group)
4353		perf_event_update_sibling_time(event);
4354
4355	if (event->state != PERF_EVENT_STATE_ACTIVE)
4356		goto unlock;
4357
4358	if (!data->group) {
4359		pmu->read(event);
4360		data->ret = 0;
4361		goto unlock;
4362	}
4363
4364	pmu->start_txn(pmu, PERF_PMU_TXN_READ);
4365
4366	pmu->read(event);
4367
4368	for_each_sibling_event(sub, event) {
4369		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
4370			/*
4371			 * Use sibling's PMU rather than @event's since
4372			 * sibling could be on different (eg: software) PMU.
4373			 */
4374			sub->pmu->read(sub);
4375		}
4376	}
4377
4378	data->ret = pmu->commit_txn(pmu);
4379
4380unlock:
4381	raw_spin_unlock(&ctx->lock);
4382}
4383
4384static inline u64 perf_event_count(struct perf_event *event)
4385{
4386	return local64_read(&event->count) + atomic64_read(&event->child_count);
4387}
4388
4389static void calc_timer_values(struct perf_event *event,
4390				u64 *now,
4391				u64 *enabled,
4392				u64 *running)
4393{
4394	u64 ctx_time;
4395
4396	*now = perf_clock();
4397	ctx_time = perf_event_time_now(event, *now);
4398	__perf_update_times(event, ctx_time, enabled, running);
4399}
4400
4401/*
 * NMI-safe method to read a local event, that is an event that:
 *   - is either for the current task, or for this CPU
 *   - does not have inherit set, as inherited task events
4406 *     will not be local and we cannot read them atomically
4407 *   - must not have a pmu::count method
4408 */
4409int perf_event_read_local(struct perf_event *event, u64 *value,
4410			  u64 *enabled, u64 *running)
4411{
4412	unsigned long flags;
4413	int ret = 0;
4414
4415	/*
4416	 * Disabling interrupts avoids all counter scheduling (context
4417	 * switches, timer based rotation and IPIs).
4418	 */
4419	local_irq_save(flags);
4420
4421	/*
	 * It must not be an event with inherit set; we cannot read
4423	 * all child counters from atomic context.
4424	 */
4425	if (event->attr.inherit) {
4426		ret = -EOPNOTSUPP;
4427		goto out;
4428	}
4429
4430	/* If this is a per-task event, it must be for current */
4431	if ((event->attach_state & PERF_ATTACH_TASK) &&
4432	    event->hw.target != current) {
4433		ret = -EINVAL;
4434		goto out;
4435	}
4436
4437	/* If this is a per-CPU event, it must be for this CPU */
4438	if (!(event->attach_state & PERF_ATTACH_TASK) &&
4439	    event->cpu != smp_processor_id()) {
4440		ret = -EINVAL;
4441		goto out;
4442	}
4443
4444	/* If this is a pinned event it must be running on this CPU */
4445	if (event->attr.pinned && event->oncpu != smp_processor_id()) {
4446		ret = -EBUSY;
4447		goto out;
4448	}
4449
4450	/*
	 * If the event is currently on this CPU, it's either a per-task event,
	 * or local to this CPU. Furthermore, it means it's ACTIVE (otherwise
4453	 * oncpu == -1).
4454	 */
4455	if (event->oncpu == smp_processor_id())
4456		event->pmu->read(event);
4457
4458	*value = local64_read(&event->count);
4459	if (enabled || running) {
4460		u64 __enabled, __running, __now;
4461
4462		calc_timer_values(event, &__now, &__enabled, &__running);
4463		if (enabled)
4464			*enabled = __enabled;
4465		if (running)
4466			*running = __running;
4467	}
4468out:
4469	local_irq_restore(flags);
4470
4471	return ret;
4472}
4473
4474static int perf_event_read(struct perf_event *event, bool group)
4475{
4476	enum perf_event_state state = READ_ONCE(event->state);
4477	int event_cpu, ret = 0;
4478
4479	/*
4480	 * If event is enabled and currently active on a CPU, update the
4481	 * value in the event structure:
4482	 */
4483again:
4484	if (state == PERF_EVENT_STATE_ACTIVE) {
4485		struct perf_read_data data;
4486
4487		/*
4488		 * Orders the ->state and ->oncpu loads such that if we see
4489		 * ACTIVE we must also see the right ->oncpu.
4490		 *
4491		 * Matches the smp_wmb() from event_sched_in().
4492		 */
4493		smp_rmb();
4494
4495		event_cpu = READ_ONCE(event->oncpu);
4496		if ((unsigned)event_cpu >= nr_cpu_ids)
4497			return 0;
4498
4499		data = (struct perf_read_data){
4500			.event = event,
4501			.group = group,
4502			.ret = 0,
4503		};
4504
4505		preempt_disable();
4506		event_cpu = __perf_event_read_cpu(event, event_cpu);
4507
4508		/*
4509		 * Purposely ignore the smp_call_function_single() return
4510		 * value.
4511		 *
4512		 * If event_cpu isn't a valid CPU it means the event got
4513		 * scheduled out and that will have updated the event count.
4514		 *
4515		 * Therefore, either way, we'll have an up-to-date event count
4516		 * after this.
4517		 */
4518		(void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4519		preempt_enable();
4520		ret = data.ret;
4521
4522	} else if (state == PERF_EVENT_STATE_INACTIVE) {
4523		struct perf_event_context *ctx = event->ctx;
4524		unsigned long flags;
4525
4526		raw_spin_lock_irqsave(&ctx->lock, flags);
4527		state = event->state;
4528		if (state != PERF_EVENT_STATE_INACTIVE) {
4529			raw_spin_unlock_irqrestore(&ctx->lock, flags);
4530			goto again;
4531		}
4532
4533		/*
		 * We may read while the context is not active (e.g., the
		 * thread is blocked); in that case we cannot update the
		 * context time.
4536		 */
4537		if (ctx->is_active & EVENT_TIME) {
4538			update_context_time(ctx);
4539			update_cgrp_time_from_event(event);
4540		}
4541
4542		perf_event_update_time(event);
4543		if (group)
4544			perf_event_update_sibling_time(event);
4545		raw_spin_unlock_irqrestore(&ctx->lock, flags);
4546	}
4547
4548	return ret;
4549}
4550
4551/*
4552 * Initialize the perf_event context in a task_struct:
4553 */
4554static void __perf_event_init_context(struct perf_event_context *ctx)
4555{
4556	raw_spin_lock_init(&ctx->lock);
4557	mutex_init(&ctx->mutex);
4558	INIT_LIST_HEAD(&ctx->active_ctx_list);
4559	perf_event_groups_init(&ctx->pinned_groups);
4560	perf_event_groups_init(&ctx->flexible_groups);
4561	INIT_LIST_HEAD(&ctx->event_list);
4562	INIT_LIST_HEAD(&ctx->pinned_active);
4563	INIT_LIST_HEAD(&ctx->flexible_active);
4564	refcount_set(&ctx->refcount, 1);
4565}
4566
4567static struct perf_event_context *
4568alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4569{
4570	struct perf_event_context *ctx;
4571
4572	ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4573	if (!ctx)
4574		return NULL;
4575
4576	__perf_event_init_context(ctx);
4577	if (task)
4578		ctx->task = get_task_struct(task);
4579	ctx->pmu = pmu;
4580
4581	return ctx;
4582}
4583
4584static struct task_struct *
4585find_lively_task_by_vpid(pid_t vpid)
4586{
4587	struct task_struct *task;
4588
4589	rcu_read_lock();
4590	if (!vpid)
4591		task = current;
4592	else
4593		task = find_task_by_vpid(vpid);
4594	if (task)
4595		get_task_struct(task);
4596	rcu_read_unlock();
4597
4598	if (!task)
4599		return ERR_PTR(-ESRCH);
4600
4601	return task;
4602}
4603
4604/*
4605 * Returns a matching context with refcount and pincount.
4606 */
4607static struct perf_event_context *
4608find_get_context(struct pmu *pmu, struct task_struct *task,
4609		struct perf_event *event)
4610{
4611	struct perf_event_context *ctx, *clone_ctx = NULL;
4612	struct perf_cpu_context *cpuctx;
4613	void *task_ctx_data = NULL;
4614	unsigned long flags;
4615	int ctxn, err;
4616	int cpu = event->cpu;
4617
4618	if (!task) {
4619		/* Must be root to operate on a CPU event: */
4620		err = perf_allow_cpu(&event->attr);
4621		if (err)
4622			return ERR_PTR(err);
4623
4624		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
4625		ctx = &cpuctx->ctx;
4626		get_ctx(ctx);
4627		raw_spin_lock_irqsave(&ctx->lock, flags);
4628		++ctx->pin_count;
4629		raw_spin_unlock_irqrestore(&ctx->lock, flags);
4630
4631		return ctx;
4632	}
4633
4634	err = -EINVAL;
4635	ctxn = pmu->task_ctx_nr;
4636	if (ctxn < 0)
4637		goto errout;
4638
4639	if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4640		task_ctx_data = alloc_task_ctx_data(pmu);
4641		if (!task_ctx_data) {
4642			err = -ENOMEM;
4643			goto errout;
4644		}
4645	}
4646
4647retry:
4648	ctx = perf_lock_task_context(task, ctxn, &flags);
4649	if (ctx) {
4650		clone_ctx = unclone_ctx(ctx);
4651		++ctx->pin_count;
4652
4653		if (task_ctx_data && !ctx->task_ctx_data) {
4654			ctx->task_ctx_data = task_ctx_data;
4655			task_ctx_data = NULL;
4656		}
4657		raw_spin_unlock_irqrestore(&ctx->lock, flags);
4658
4659		if (clone_ctx)
4660			put_ctx(clone_ctx);
4661	} else {
4662		ctx = alloc_perf_context(pmu, task);
4663		err = -ENOMEM;
4664		if (!ctx)
4665			goto errout;
4666
4667		if (task_ctx_data) {
4668			ctx->task_ctx_data = task_ctx_data;
4669			task_ctx_data = NULL;
4670		}
4671
4672		err = 0;
4673		mutex_lock(&task->perf_event_mutex);
4674		/*
		 * If it has already passed perf_event_exit_task(),
		 * we must see PF_EXITING; it takes this mutex too.
4677		 */
4678		if (task->flags & PF_EXITING)
4679			err = -ESRCH;
4680		else if (task->perf_event_ctxp[ctxn])
4681			err = -EAGAIN;
4682		else {
4683			get_ctx(ctx);
4684			++ctx->pin_count;
4685			rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
4686		}
4687		mutex_unlock(&task->perf_event_mutex);
4688
4689		if (unlikely(err)) {
4690			put_ctx(ctx);
4691
4692			if (err == -EAGAIN)
4693				goto retry;
4694			goto errout;
4695		}
4696	}
4697
4698	free_task_ctx_data(pmu, task_ctx_data);
4699	return ctx;
4700
4701errout:
4702	free_task_ctx_data(pmu, task_ctx_data);
4703	return ERR_PTR(err);
4704}
4705
4706static void perf_event_free_filter(struct perf_event *event);
4707
4708static void free_event_rcu(struct rcu_head *head)
4709{
4710	struct perf_event *event;
4711
4712	event = container_of(head, struct perf_event, rcu_head);
4713	if (event->ns)
4714		put_pid_ns(event->ns);
4715	perf_event_free_filter(event);
4716	kmem_cache_free(perf_event_cache, event);
4717}
4718
4719static void ring_buffer_attach(struct perf_event *event,
4720			       struct perf_buffer *rb);
4721
4722static void detach_sb_event(struct perf_event *event)
4723{
4724	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4725
4726	raw_spin_lock(&pel->lock);
4727	list_del_rcu(&event->sb_list);
4728	raw_spin_unlock(&pel->lock);
4729}
4730
4731static bool is_sb_event(struct perf_event *event)
4732{
4733	struct perf_event_attr *attr = &event->attr;
4734
4735	if (event->parent)
4736		return false;
4737
4738	if (event->attach_state & PERF_ATTACH_TASK)
4739		return false;
4740
4741	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4742	    attr->comm || attr->comm_exec ||
4743	    attr->task || attr->ksymbol ||
4744	    attr->context_switch || attr->text_poke ||
4745	    attr->bpf_event)
4746		return true;
4747	return false;
4748}
4749
4750static void unaccount_pmu_sb_event(struct perf_event *event)
4751{
4752	if (is_sb_event(event))
4753		detach_sb_event(event);
4754}
4755
4756static void unaccount_event_cpu(struct perf_event *event, int cpu)
4757{
4758	if (event->parent)
4759		return;
4760
4761	if (is_cgroup_event(event))
4762		atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4763}
4764
4765#ifdef CONFIG_NO_HZ_FULL
4766static DEFINE_SPINLOCK(nr_freq_lock);
4767#endif
4768
4769static void unaccount_freq_event_nohz(void)
4770{
4771#ifdef CONFIG_NO_HZ_FULL
4772	spin_lock(&nr_freq_lock);
4773	if (atomic_dec_and_test(&nr_freq_events))
4774		tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4775	spin_unlock(&nr_freq_lock);
4776#endif
4777}
4778
4779static void unaccount_freq_event(void)
4780{
4781	if (tick_nohz_full_enabled())
4782		unaccount_freq_event_nohz();
4783	else
4784		atomic_dec(&nr_freq_events);
4785}
4786
4787static void unaccount_event(struct perf_event *event)
4788{
4789	bool dec = false;
4790
4791	if (event->parent)
4792		return;
4793
4794	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
4795		dec = true;
4796	if (event->attr.mmap || event->attr.mmap_data)
4797		atomic_dec(&nr_mmap_events);
4798	if (event->attr.build_id)
4799		atomic_dec(&nr_build_id_events);
4800	if (event->attr.comm)
4801		atomic_dec(&nr_comm_events);
4802	if (event->attr.namespaces)
4803		atomic_dec(&nr_namespaces_events);
4804	if (event->attr.cgroup)
4805		atomic_dec(&nr_cgroup_events);
4806	if (event->attr.task)
4807		atomic_dec(&nr_task_events);
4808	if (event->attr.freq)
4809		unaccount_freq_event();
4810	if (event->attr.context_switch) {
4811		dec = true;
4812		atomic_dec(&nr_switch_events);
4813	}
4814	if (is_cgroup_event(event))
4815		dec = true;
4816	if (has_branch_stack(event))
4817		dec = true;
4818	if (event->attr.ksymbol)
4819		atomic_dec(&nr_ksymbol_events);
4820	if (event->attr.bpf_event)
4821		atomic_dec(&nr_bpf_events);
4822	if (event->attr.text_poke)
4823		atomic_dec(&nr_text_poke_events);
4824
4825	if (dec) {
4826		if (!atomic_add_unless(&perf_sched_count, -1, 1))
4827			schedule_delayed_work(&perf_sched_work, HZ);
4828	}
4829
4830	unaccount_event_cpu(event, event->cpu);
4831
4832	unaccount_pmu_sb_event(event);
4833}
4834
4835static void perf_sched_delayed(struct work_struct *work)
4836{
4837	mutex_lock(&perf_sched_mutex);
4838	if (atomic_dec_and_test(&perf_sched_count))
4839		static_branch_disable(&perf_sched_events);
4840	mutex_unlock(&perf_sched_mutex);
4841}
4842
4843/*
4844 * The following implement mutual exclusion of events on "exclusive" pmus
4845 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
4846 * at a time, so we disallow creating events that might conflict, namely:
4847 *
4848 *  1) cpu-wide events in the presence of per-task events,
4849 *  2) per-task events in the presence of cpu-wide events,
4850 *  3) two matching events on the same context.
4851 *
4852 * The former two cases are handled in the allocation path (perf_event_alloc(),
4853 * _free_event()), the latter -- before the first perf_install_in_context().
4854 */
4855static int exclusive_event_init(struct perf_event *event)
4856{
4857	struct pmu *pmu = event->pmu;
4858
4859	if (!is_exclusive_pmu(pmu))
4860		return 0;
4861
4862	/*
4863	 * Prevent co-existence of per-task and cpu-wide events on the
4864	 * same exclusive pmu.
4865	 *
4866	 * Negative pmu::exclusive_cnt means there are cpu-wide
	 * events on this "exclusive" pmu; positive means there are
4868	 * per-task events.
4869	 *
4870	 * Since this is called in perf_event_alloc() path, event::ctx
4871	 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
4872	 * to mean "per-task event", because unlike other attach states it
4873	 * never gets cleared.
4874	 */
4875	if (event->attach_state & PERF_ATTACH_TASK) {
4876		if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4877			return -EBUSY;
4878	} else {
4879		if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4880			return -EBUSY;
4881	}
4882
4883	return 0;
4884}
4885
4886static void exclusive_event_destroy(struct perf_event *event)
4887{
4888	struct pmu *pmu = event->pmu;
4889
4890	if (!is_exclusive_pmu(pmu))
4891		return;
4892
4893	/* see comment in exclusive_event_init() */
4894	if (event->attach_state & PERF_ATTACH_TASK)
4895		atomic_dec(&pmu->exclusive_cnt);
4896	else
4897		atomic_inc(&pmu->exclusive_cnt);
4898}
4899
4900static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4901{
4902	if ((e1->pmu == e2->pmu) &&
4903	    (e1->cpu == e2->cpu ||
4904	     e1->cpu == -1 ||
4905	     e2->cpu == -1))
4906		return true;
4907	return false;
4908}
4909
4910static bool exclusive_event_installable(struct perf_event *event,
4911					struct perf_event_context *ctx)
4912{
4913	struct perf_event *iter_event;
4914	struct pmu *pmu = event->pmu;
4915
4916	lockdep_assert_held(&ctx->mutex);
4917
4918	if (!is_exclusive_pmu(pmu))
4919		return true;
4920
4921	list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4922		if (exclusive_event_match(iter_event, event))
4923			return false;
4924	}
4925
4926	return true;
4927}
4928
4929static void perf_addr_filters_splice(struct perf_event *event,
4930				       struct list_head *head);
4931
4932static void _free_event(struct perf_event *event)
4933{
4934	irq_work_sync(&event->pending);
4935
4936	unaccount_event(event);
4937
4938	security_perf_event_free(event);
4939
4940	if (event->rb) {
4941		/*
4942		 * Can happen when we close an event with re-directed output.
4943		 *
4944		 * Since we have a 0 refcount, perf_mmap_close() will skip
4945		 * over us; possibly making our ring_buffer_put() the last.
4946		 */
4947		mutex_lock(&event->mmap_mutex);
4948		ring_buffer_attach(event, NULL);
4949		mutex_unlock(&event->mmap_mutex);
4950	}
4951
4952	if (is_cgroup_event(event))
4953		perf_detach_cgroup(event);
4954
4955	if (!event->parent) {
4956		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4957			put_callchain_buffers();
4958	}
4959
4960	perf_event_free_bpf_prog(event);
4961	perf_addr_filters_splice(event, NULL);
4962	kfree(event->addr_filter_ranges);
4963
4964	if (event->destroy)
4965		event->destroy(event);
4966
4967	/*
4968	 * Must be after ->destroy(), due to uprobe_perf_close() using
4969	 * hw.target.
4970	 */
4971	if (event->hw.target)
4972		put_task_struct(event->hw.target);
4973
4974	/*
	 * perf_event_free_task() relies on put_ctx() being 'last'; in particular,
4976	 * all task references must be cleaned up.
4977	 */
4978	if (event->ctx)
4979		put_ctx(event->ctx);
4980
4981	exclusive_event_destroy(event);
4982	module_put(event->pmu->module);
4983
4984	call_rcu(&event->rcu_head, free_event_rcu);
4985}
4986
4987/*
 * Used to free events which have a known refcount of 1, such as events in
 * error paths (where the event isn't exposed yet) and inherited events.
4990 */
4991static void free_event(struct perf_event *event)
4992{
4993	if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4994				"unexpected event refcount: %ld; ptr=%p\n",
4995				atomic_long_read(&event->refcount), event)) {
4996		/* leak to avoid use-after-free */
4997		return;
4998	}
4999
5000	_free_event(event);
5001}
5002
5003/*
5004 * Remove user event from the owner task.
5005 */
5006static void perf_remove_from_owner(struct perf_event *event)
5007{
5008	struct task_struct *owner;
5009
5010	rcu_read_lock();
5011	/*
5012	 * Matches the smp_store_release() in perf_event_exit_task(). If we
	 * observe !owner, it means the list deletion is complete and we can
	 * indeed free this event; otherwise we need to serialize on
5015	 * owner->perf_event_mutex.
5016	 */
5017	owner = READ_ONCE(event->owner);
5018	if (owner) {
5019		/*
5020		 * Since delayed_put_task_struct() also drops the last
5021		 * task reference we can safely take a new reference
5022		 * while holding the rcu_read_lock().
5023		 */
5024		get_task_struct(owner);
5025	}
5026	rcu_read_unlock();
5027
5028	if (owner) {
5029		/*
5030		 * If we're here through perf_event_exit_task() we're already
5031		 * holding ctx->mutex which would be an inversion wrt. the
5032		 * normal lock order.
5033		 *
		 * However, we can safely take this lock because it's the child
5035		 * ctx->mutex.
5036		 */
5037		mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
5038
5039		/*
		 * We have to re-check the event->owner field: if it is cleared,
		 * we raced with perf_event_exit_task(). Acquiring the mutex
		 * ensures they're done, and we can proceed with freeing the
5043		 * event.
5044		 */
5045		if (event->owner) {
5046			list_del_init(&event->owner_entry);
5047			smp_store_release(&event->owner, NULL);
5048		}
5049		mutex_unlock(&owner->perf_event_mutex);
5050		put_task_struct(owner);
5051	}
5052}
5053
5054static void put_event(struct perf_event *event)
5055{
5056	if (!atomic_long_dec_and_test(&event->refcount))
5057		return;
5058
5059	_free_event(event);
5060}
5061
5062/*
 * Kill an event dead; while event::refcount will preserve the event
5064 * object, it will not preserve its functionality. Once the last 'user'
5065 * gives up the object, we'll destroy the thing.
5066 */
5067int perf_event_release_kernel(struct perf_event *event)
5068{
5069	struct perf_event_context *ctx = event->ctx;
5070	struct perf_event *child, *tmp;
5071	LIST_HEAD(free_list);
5072
5073	/*
5074	 * If we got here through err_file: fput(event_file); we will not have
5075	 * attached to a context yet.
5076	 */
5077	if (!ctx) {
5078		WARN_ON_ONCE(event->attach_state &
5079				(PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
5080		goto no_ctx;
5081	}
5082
5083	if (!is_kernel_event(event))
5084		perf_remove_from_owner(event);
5085
5086	ctx = perf_event_ctx_lock(event);
5087	WARN_ON_ONCE(ctx->parent_ctx);
5088	perf_remove_from_context(event, DETACH_GROUP);
5089
5090	raw_spin_lock_irq(&ctx->lock);
5091	/*
	 * Mark this event as STATE_DEAD; there is no external reference to it
5093	 * anymore.
5094	 *
5095	 * Anybody acquiring event->child_mutex after the below loop _must_
5096	 * also see this, most importantly inherit_event() which will avoid
5097	 * placing more children on the list.
5098	 *
5099	 * Thus this guarantees that we will in fact observe and kill _ALL_
5100	 * child events.
5101	 */
5102	event->state = PERF_EVENT_STATE_DEAD;
5103	raw_spin_unlock_irq(&ctx->lock);
5104
5105	perf_event_ctx_unlock(event, ctx);
5106
5107again:
5108	mutex_lock(&event->child_mutex);
5109	list_for_each_entry(child, &event->child_list, child_list) {
5110
5111		/*
		 * This cannot change: child events are not migrated; see the
5113		 * comment with perf_event_ctx_lock_nested().
5114		 */
5115		ctx = READ_ONCE(child->ctx);
5116		/*
5117		 * Since child_mutex nests inside ctx::mutex, we must jump
5118		 * through hoops. We start by grabbing a reference on the ctx.
5119		 *
5120		 * Since the event cannot get freed while we hold the
5121		 * child_mutex, the context must also exist and have a !0
5122		 * reference count.
5123		 */
5124		get_ctx(ctx);
5125
5126		/*
5127		 * Now that we have a ctx ref, we can drop child_mutex, and
5128		 * acquire ctx::mutex without fear of it going away. Then we
5129		 * can re-acquire child_mutex.
5130		 */
5131		mutex_unlock(&event->child_mutex);
5132		mutex_lock(&ctx->mutex);
5133		mutex_lock(&event->child_mutex);
5134
5135		/*
		 * Now that we hold ctx::mutex and child_mutex, revalidate our
		 * state: if child is still the first entry, it didn't get freed
		 * and we can continue.
5139		 */
5140		tmp = list_first_entry_or_null(&event->child_list,
5141					       struct perf_event, child_list);
5142		if (tmp == child) {
5143			perf_remove_from_context(child, DETACH_GROUP);
5144			list_move(&child->child_list, &free_list);
5145			/*
5146			 * This matches the refcount bump in inherit_event();
5147			 * this can't be the last reference.
5148			 */
5149			put_event(event);
5150		}
5151
5152		mutex_unlock(&event->child_mutex);
5153		mutex_unlock(&ctx->mutex);
5154		put_ctx(ctx);
5155		goto again;
5156	}
5157	mutex_unlock(&event->child_mutex);
5158
5159	list_for_each_entry_safe(child, tmp, &free_list, child_list) {
5160		void *var = &child->ctx->refcount;
5161
5162		list_del(&child->child_list);
5163		free_event(child);
5164
5165		/*
5166		 * Wake any perf_event_free_task() waiting for this event to be
5167		 * freed.
5168		 */
5169		smp_mb(); /* pairs with wait_var_event() */
5170		wake_up_var(var);
5171	}
5172
5173no_ctx:
5174	put_event(event); /* Must be the 'last' reference */
5175	return 0;
5176}
5177EXPORT_SYMBOL_GPL(perf_event_release_kernel);
5178
5179/*
5180 * Called when the last reference to the file is gone.
5181 */
5182static int perf_release(struct inode *inode, struct file *file)
5183{
5184	perf_event_release_kernel(file->private_data);
5185	return 0;
5186}
5187
5188static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5189{
5190	struct perf_event *child;
5191	u64 total = 0;
5192
5193	*enabled = 0;
5194	*running = 0;
5195
5196	mutex_lock(&event->child_mutex);
5197
5198	(void)perf_event_read(event, false);
5199	total += perf_event_count(event);
5200
5201	*enabled += event->total_time_enabled +
5202			atomic64_read(&event->child_total_time_enabled);
5203	*running += event->total_time_running +
5204			atomic64_read(&event->child_total_time_running);
5205
5206	list_for_each_entry(child, &event->child_list, child_list) {
5207		(void)perf_event_read(child, false);
5208		total += perf_event_count(child);
5209		*enabled += child->total_time_enabled;
5210		*running += child->total_time_running;
5211	}
5212	mutex_unlock(&event->child_mutex);
5213
5214	return total;
5215}
5216
5217u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5218{
5219	struct perf_event_context *ctx;
5220	u64 count;
5221
5222	ctx = perf_event_ctx_lock(event);
5223	count = __perf_event_read_value(event, enabled, running);
5224	perf_event_ctx_unlock(event, ctx);
5225
5226	return count;
5227}
5228EXPORT_SYMBOL_GPL(perf_event_read_value);
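
/*
 * A minimal in-kernel usage sketch (illustrative only, not taken from an
 * in-tree caller):
 *
 *	u64 enabled, running;
 *	u64 count = perf_event_read_value(event, &enabled, &running);
 *
 * @count then includes all child (inherited) counts, and @enabled/@running
 * the corresponding accumulated times.
 */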
5229
5230static int __perf_read_group_add(struct perf_event *leader,
5231					u64 read_format, u64 *values)
5232{
5233	struct perf_event_context *ctx = leader->ctx;
5234	struct perf_event *sub;
5235	unsigned long flags;
5236	int n = 1; /* skip @nr */
5237	int ret;
5238
5239	ret = perf_event_read(leader, true);
5240	if (ret)
5241		return ret;
5242
5243	raw_spin_lock_irqsave(&ctx->lock, flags);
5244
5245	/*
5246	 * Since we co-schedule groups, {enabled,running} times of siblings
5247	 * will be identical to those of the leader, so we only publish one
5248	 * set.
5249	 */
5250	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5251		values[n++] += leader->total_time_enabled +
5252			atomic64_read(&leader->child_total_time_enabled);
5253	}
5254
5255	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5256		values[n++] += leader->total_time_running +
5257			atomic64_read(&leader->child_total_time_running);
5258	}
5259
5260	/*
5261	 * Write {count,id} tuples for every sibling.
5262	 */
5263	values[n++] += perf_event_count(leader);
5264	if (read_format & PERF_FORMAT_ID)
5265		values[n++] = primary_event_id(leader);
5266	if (read_format & PERF_FORMAT_LOST)
5267		values[n++] = atomic64_read(&leader->lost_samples);
5268
5269	for_each_sibling_event(sub, leader) {
5270		values[n++] += perf_event_count(sub);
5271		if (read_format & PERF_FORMAT_ID)
5272			values[n++] = primary_event_id(sub);
5273		if (read_format & PERF_FORMAT_LOST)
5274			values[n++] = atomic64_read(&sub->lost_samples);
5275	}
5276
5277	raw_spin_unlock_irqrestore(&ctx->lock, flags);
5278	return 0;
5279}
5280
5281static int perf_read_group(struct perf_event *event,
5282				   u64 read_format, char __user *buf)
5283{
5284	struct perf_event *leader = event->group_leader, *child;
5285	struct perf_event_context *ctx = leader->ctx;
5286	int ret;
5287	u64 *values;
5288
5289	lockdep_assert_held(&ctx->mutex);
5290
5291	values = kzalloc(event->read_size, GFP_KERNEL);
5292	if (!values)
5293		return -ENOMEM;
5294
5295	values[0] = 1 + leader->nr_siblings;
5296
5297	/*
	 * By locking the child_mutex of the leader, we effectively
	 * lock the child list of all siblings. XXX explain how.
5300	 */
5301	mutex_lock(&leader->child_mutex);
5302
5303	ret = __perf_read_group_add(leader, read_format, values);
5304	if (ret)
5305		goto unlock;
5306
5307	list_for_each_entry(child, &leader->child_list, child_list) {
5308		ret = __perf_read_group_add(child, read_format, values);
5309		if (ret)
5310			goto unlock;
5311	}
5312
5313	mutex_unlock(&leader->child_mutex);
5314
5315	ret = event->read_size;
5316	if (copy_to_user(buf, values, event->read_size))
5317		ret = -EFAULT;
5318	goto out;
5319
5320unlock:
5321	mutex_unlock(&leader->child_mutex);
5322out:
5323	kfree(values);
5324	return ret;
5325}
5326
5327static int perf_read_one(struct perf_event *event,
5328				 u64 read_format, char __user *buf)
5329{
5330	u64 enabled, running;
5331	u64 values[5];
5332	int n = 0;
5333
5334	values[n++] = __perf_event_read_value(event, &enabled, &running);
5335	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5336		values[n++] = enabled;
5337	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5338		values[n++] = running;
5339	if (read_format & PERF_FORMAT_ID)
5340		values[n++] = primary_event_id(event);
5341	if (read_format & PERF_FORMAT_LOST)
5342		values[n++] = atomic64_read(&event->lost_samples);
5343
5344	if (copy_to_user(buf, values, n * sizeof(u64)))
5345		return -EFAULT;
5346
5347	return n * sizeof(u64);
5348}
5349
5350static bool is_event_hup(struct perf_event *event)
5351{
5352	bool no_children;
5353
5354	if (event->state > PERF_EVENT_STATE_EXIT)
5355		return false;
5356
5357	mutex_lock(&event->child_mutex);
5358	no_children = list_empty(&event->child_list);
5359	mutex_unlock(&event->child_mutex);
5360	return no_children;
5361}
5362
5363/*
5364 * Read the performance event - simple non blocking version for now
5365 */
5366static ssize_t
5367__perf_read(struct perf_event *event, char __user *buf, size_t count)
5368{
5369	u64 read_format = event->attr.read_format;
5370	int ret;
5371
5372	/*
5373	 * Return end-of-file for a read on an event that is in
5374	 * error state (i.e. because it was pinned but it couldn't be
5375	 * scheduled on to the CPU at some point).
5376	 */
5377	if (event->state == PERF_EVENT_STATE_ERROR)
5378		return 0;
5379
5380	if (count < event->read_size)
5381		return -ENOSPC;
5382
5383	WARN_ON_ONCE(event->ctx->parent_ctx);
5384	if (read_format & PERF_FORMAT_GROUP)
5385		ret = perf_read_group(event, read_format, buf);
5386	else
5387		ret = perf_read_one(event, read_format, buf);
5388
5389	return ret;
5390}
5391
5392static ssize_t
5393perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
5394{
5395	struct perf_event *event = file->private_data;
5396	struct perf_event_context *ctx;
5397	int ret;
5398
5399	ret = security_perf_event_read(event);
5400	if (ret)
5401		return ret;
5402
5403	ctx = perf_event_ctx_lock(event);
5404	ret = __perf_read(event, buf, count);
5405	perf_event_ctx_unlock(event, ctx);
5406
5407	return ret;
5408}
5409
5410static __poll_t perf_poll(struct file *file, poll_table *wait)
5411{
5412	struct perf_event *event = file->private_data;
5413	struct perf_buffer *rb;
5414	__poll_t events = EPOLLHUP;
5415
5416	poll_wait(file, &event->waitq, wait);
5417
5418	if (is_event_hup(event))
5419		return events;
5420
5421	/*
5422	 * Pin the event->rb by taking event->mmap_mutex; otherwise
5423	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
5424	 */
5425	mutex_lock(&event->mmap_mutex);
5426	rb = event->rb;
5427	if (rb)
5428		events = atomic_xchg(&rb->poll, 0);
5429	mutex_unlock(&event->mmap_mutex);
5430	return events;
5431}
5432
5433static void _perf_event_reset(struct perf_event *event)
5434{
5435	(void)perf_event_read(event, false);
5436	local64_set(&event->count, 0);
5437	perf_event_update_userpage(event);
5438}
5439
5440/* Assume it's not an event with inherit set. */
5441u64 perf_event_pause(struct perf_event *event, bool reset)
5442{
5443	struct perf_event_context *ctx;
5444	u64 count;
5445
5446	ctx = perf_event_ctx_lock(event);
5447	WARN_ON_ONCE(event->attr.inherit);
5448	_perf_event_disable(event);
5449	count = local64_read(&event->count);
5450	if (reset)
5451		local64_set(&event->count, 0);
5452	perf_event_ctx_unlock(event, ctx);
5453
5454	return count;
5455}
5456EXPORT_SYMBOL_GPL(perf_event_pause);
5457
5458/*
5459 * Holding the top-level event's child_mutex means that any
5460 * descendant process that has inherited this event will block
5461 * in perf_event_exit_event() if it goes to exit, thus satisfying the
5462 * task existence requirements of perf_event_enable/disable.
5463 */
5464static void perf_event_for_each_child(struct perf_event *event,
5465					void (*func)(struct perf_event *))
5466{
5467	struct perf_event *child;
5468
5469	WARN_ON_ONCE(event->ctx->parent_ctx);
5470
5471	mutex_lock(&event->child_mutex);
5472	func(event);
5473	list_for_each_entry(child, &event->child_list, child_list)
5474		func(child);
5475	mutex_unlock(&event->child_mutex);
5476}
5477
5478static void perf_event_for_each(struct perf_event *event,
5479				  void (*func)(struct perf_event *))
5480{
5481	struct perf_event_context *ctx = event->ctx;
5482	struct perf_event *sibling;
5483
5484	lockdep_assert_held(&ctx->mutex);
5485
5486	event = event->group_leader;
5487
5488	perf_event_for_each_child(event, func);
5489	for_each_sibling_event(sibling, event)
5490		perf_event_for_each_child(sibling, func);
5491}
5492
5493static void __perf_event_period(struct perf_event *event,
5494				struct perf_cpu_context *cpuctx,
5495				struct perf_event_context *ctx,
5496				void *info)
5497{
5498	u64 value = *((u64 *)info);
5499	bool active;
5500
5501	if (event->attr.freq) {
5502		event->attr.sample_freq = value;
5503	} else {
5504		event->attr.sample_period = value;
5505		event->hw.sample_period = value;
5506	}
5507
5508	active = (event->state == PERF_EVENT_STATE_ACTIVE);
5509	if (active) {
5510		perf_pmu_disable(ctx->pmu);
5511		/*
5512		 * We could be throttled; unthrottle now to avoid the tick
5513		 * trying to unthrottle while we already re-started the event.
5514		 */
5515		if (event->hw.interrupts == MAX_INTERRUPTS) {
5516			event->hw.interrupts = 0;
5517			perf_log_throttle(event, 1);
5518		}
5519		event->pmu->stop(event, PERF_EF_UPDATE);
5520	}
5521
5522	local64_set(&event->hw.period_left, 0);
5523
5524	if (active) {
5525		event->pmu->start(event, PERF_EF_RELOAD);
5526		perf_pmu_enable(ctx->pmu);
5527	}
5528}
5529
5530static int perf_event_check_period(struct perf_event *event, u64 value)
5531{
5532	return event->pmu->check_period(event, value);
5533}
5534
5535static int _perf_event_period(struct perf_event *event, u64 value)
5536{
5537	if (!is_sampling_event(event))
5538		return -EINVAL;
5539
5540	if (!value)
5541		return -EINVAL;
5542
5543	if (event->attr.freq && value > sysctl_perf_event_sample_rate)
5544		return -EINVAL;
5545
5546	if (perf_event_check_period(event, value))
5547		return -EINVAL;
5548
5549	if (!event->attr.freq && (value & (1ULL << 63)))
5550		return -EINVAL;
5551
5552	event_function_call(event, __perf_event_period, &value);
5553
5554	return 0;
5555}
5556
5557int perf_event_period(struct perf_event *event, u64 value)
5558{
5559	struct perf_event_context *ctx;
5560	int ret;
5561
5562	ctx = perf_event_ctx_lock(event);
5563	ret = _perf_event_period(event, value);
5564	perf_event_ctx_unlock(event, ctx);
5565
5566	return ret;
5567}
5568EXPORT_SYMBOL_GPL(perf_event_period);
5569
5570static const struct file_operations perf_fops;
5571
5572static inline int perf_fget_light(int fd, struct fd *p)
5573{
5574	struct fd f = fdget(fd);
5575	if (!f.file)
5576		return -EBADF;
5577
5578	if (f.file->f_op != &perf_fops) {
5579		fdput(f);
5580		return -EBADF;
5581	}
5582	*p = f;
5583	return 0;
5584}
5585
5586static int perf_event_set_output(struct perf_event *event,
5587				 struct perf_event *output_event);
5588static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5589static int perf_copy_attr(struct perf_event_attr __user *uattr,
5590			  struct perf_event_attr *attr);
5591
5592static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5593{
5594	void (*func)(struct perf_event *);
5595	u32 flags = arg;
5596
5597	switch (cmd) {
5598	case PERF_EVENT_IOC_ENABLE:
5599		func = _perf_event_enable;
5600		break;
5601	case PERF_EVENT_IOC_DISABLE:
5602		func = _perf_event_disable;
5603		break;
5604	case PERF_EVENT_IOC_RESET:
5605		func = _perf_event_reset;
5606		break;
5607
5608	case PERF_EVENT_IOC_REFRESH:
5609		return _perf_event_refresh(event, arg);
5610
5611	case PERF_EVENT_IOC_PERIOD:
5612	{
5613		u64 value;
5614
5615		if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
5616			return -EFAULT;
5617
5618		return _perf_event_period(event, value);
5619	}
5620	case PERF_EVENT_IOC_ID:
5621	{
5622		u64 id = primary_event_id(event);
5623
5624		if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5625			return -EFAULT;
5626		return 0;
5627	}
5628
5629	case PERF_EVENT_IOC_SET_OUTPUT:
5630	{
5631		int ret;
5632		if (arg != -1) {
5633			struct perf_event *output_event;
5634			struct fd output;
5635			ret = perf_fget_light(arg, &output);
5636			if (ret)
5637				return ret;
5638			output_event = output.file->private_data;
5639			ret = perf_event_set_output(event, output_event);
5640			fdput(output);
5641		} else {
5642			ret = perf_event_set_output(event, NULL);
5643		}
5644		return ret;
5645	}
5646
5647	case PERF_EVENT_IOC_SET_FILTER:
5648		return perf_event_set_filter(event, (void __user *)arg);
5649
5650	case PERF_EVENT_IOC_SET_BPF:
5651	{
5652		struct bpf_prog *prog;
5653		int err;
5654
5655		prog = bpf_prog_get(arg);
5656		if (IS_ERR(prog))
5657			return PTR_ERR(prog);
5658
5659		err = perf_event_set_bpf_prog(event, prog, 0);
5660		if (err) {
5661			bpf_prog_put(prog);
5662			return err;
5663		}
5664
5665		return 0;
5666	}
5667
5668	case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5669		struct perf_buffer *rb;
5670
5671		rcu_read_lock();
5672		rb = rcu_dereference(event->rb);
5673		if (!rb || !rb->nr_pages) {
5674			rcu_read_unlock();
5675			return -EINVAL;
5676		}
5677		rb_toggle_paused(rb, !!arg);
5678		rcu_read_unlock();
5679		return 0;
5680	}
5681
5682	case PERF_EVENT_IOC_QUERY_BPF:
5683		return perf_event_query_prog_array(event, (void __user *)arg);
5684
5685	case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5686		struct perf_event_attr new_attr;
5687		int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5688					 &new_attr);
5689
5690		if (err)
5691			return err;
5692
5693		return perf_event_modify_attr(event,  &new_attr);
5694	}
5695	default:
5696		return -ENOTTY;
5697	}
5698
5699	if (flags & PERF_IOC_FLAG_GROUP)
5700		perf_event_for_each(event, func);
5701	else
5702		perf_event_for_each_child(event, func);
5703
5704	return 0;
5705}
5706
5707static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5708{
5709	struct perf_event *event = file->private_data;
5710	struct perf_event_context *ctx;
5711	long ret;
5712
5713	/* Treat ioctl like writes as it is likely a mutating operation. */
5714	ret = security_perf_event_write(event);
5715	if (ret)
5716		return ret;
5717
5718	ctx = perf_event_ctx_lock(event);
5719	ret = _perf_ioctl(event, cmd, arg);
5720	perf_event_ctx_unlock(event, ctx);
5721
5722	return ret;
5723}
5724
5725#ifdef CONFIG_COMPAT
5726static long perf_compat_ioctl(struct file *file, unsigned int cmd,
5727				unsigned long arg)
5728{
5729	switch (_IOC_NR(cmd)) {
5730	case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
5731	case _IOC_NR(PERF_EVENT_IOC_ID):
5732	case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
5733	case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
		/* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
5735		if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
5736			cmd &= ~IOCSIZE_MASK;
5737			cmd |= sizeof(void *) << IOCSIZE_SHIFT;
5738		}
5739		break;
5740	}
5741	return perf_ioctl(file, cmd, arg);
5742}
5743#else
5744# define perf_compat_ioctl NULL
5745#endif
5746
5747int perf_event_task_enable(void)
5748{
5749	struct perf_event_context *ctx;
5750	struct perf_event *event;
5751
5752	mutex_lock(&current->perf_event_mutex);
5753	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5754		ctx = perf_event_ctx_lock(event);
5755		perf_event_for_each_child(event, _perf_event_enable);
5756		perf_event_ctx_unlock(event, ctx);
5757	}
5758	mutex_unlock(&current->perf_event_mutex);
5759
5760	return 0;
5761}
5762
5763int perf_event_task_disable(void)
5764{
5765	struct perf_event_context *ctx;
5766	struct perf_event *event;
5767
5768	mutex_lock(&current->perf_event_mutex);
5769	list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5770		ctx = perf_event_ctx_lock(event);
5771		perf_event_for_each_child(event, _perf_event_disable);
5772		perf_event_ctx_unlock(event, ctx);
5773	}
5774	mutex_unlock(&current->perf_event_mutex);
5775
5776	return 0;
5777}
5778
5779static int perf_event_index(struct perf_event *event)
5780{
5781	if (event->hw.state & PERF_HES_STOPPED)
5782		return 0;
5783
5784	if (event->state != PERF_EVENT_STATE_ACTIVE)
5785		return 0;
5786
5787	return event->pmu->event_idx(event);
5788}
5789
5790static void perf_event_init_userpage(struct perf_event *event)
5791{
5792	struct perf_event_mmap_page *userpg;
5793	struct perf_buffer *rb;
5794
5795	rcu_read_lock();
5796	rb = rcu_dereference(event->rb);
5797	if (!rb)
5798		goto unlock;
5799
5800	userpg = rb->user_page;
5801
5802	/* Allow new userspace to detect that bit 0 is deprecated */
5803	userpg->cap_bit0_is_deprecated = 1;
5804	userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
5805	userpg->data_offset = PAGE_SIZE;
5806	userpg->data_size = perf_data_size(rb);
5807
5808unlock:
5809	rcu_read_unlock();
5810}
5811
5812void __weak arch_perf_update_userpage(
5813	struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
5814{
5815}
5816
5817/*
5818 * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We cannot serialize this because the arch
5820 * code calls this from NMI context.
5821 */
5822void perf_event_update_userpage(struct perf_event *event)
5823{
5824	struct perf_event_mmap_page *userpg;
5825	struct perf_buffer *rb;
5826	u64 enabled, running, now;
5827
5828	rcu_read_lock();
5829	rb = rcu_dereference(event->rb);
5830	if (!rb)
5831		goto unlock;
5832
5833	/*
	 * Compute total_time_enabled and total_time_running
	 * based on the snapshot values taken when the event
	 * was last scheduled in.
	 *
	 * We cannot simply call update_context_time()
	 * because of locking issues, as we can be called in
	 * NMI context.
5841	 */
5842	calc_timer_values(event, &now, &enabled, &running);
5843
5844	userpg = rb->user_page;
5845	/*
5846	 * Disable preemption to guarantee consistent time stamps are stored to
5847	 * the user page.
5848	 */
5849	preempt_disable();
5850	++userpg->lock;
5851	barrier();
5852	userpg->index = perf_event_index(event);
5853	userpg->offset = perf_event_count(event);
5854	if (userpg->index)
5855		userpg->offset -= local64_read(&event->hw.prev_count);
5856
5857	userpg->time_enabled = enabled +
5858			atomic64_read(&event->child_total_time_enabled);
5859
5860	userpg->time_running = running +
5861			atomic64_read(&event->child_total_time_running);
5862
5863	arch_perf_update_userpage(event, userpg, now);
5864
5865	barrier();
5866	++userpg->lock;
5867	preempt_enable();
5868unlock:
5869	rcu_read_unlock();
5870}
5871EXPORT_SYMBOL_GPL(perf_event_update_userpage);
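
/*
 * The lock increments above pair with a retry loop on the user side; a
 * sketch of the expected userspace reader (pc points at the mmap()ed
 * struct perf_event_mmap_page, see the UAPI header documentation):
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		idx    = pc->index;
 *		offset = pc->offset;
 *		// optionally rdpmc(idx - 1) and add it to offset
 *		barrier();
 *	} while (pc->lock != seq);
 */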
5872
5873static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
5874{
5875	struct perf_event *event = vmf->vma->vm_file->private_data;
5876	struct perf_buffer *rb;
5877	vm_fault_t ret = VM_FAULT_SIGBUS;
5878
5879	if (vmf->flags & FAULT_FLAG_MKWRITE) {
5880		if (vmf->pgoff == 0)
5881			ret = 0;
5882		return ret;
5883	}
5884
5885	rcu_read_lock();
5886	rb = rcu_dereference(event->rb);
5887	if (!rb)
5888		goto unlock;
5889
5890	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5891		goto unlock;
5892
5893	vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5894	if (!vmf->page)
5895		goto unlock;
5896
5897	get_page(vmf->page);
5898	vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5899	vmf->page->index   = vmf->pgoff;
5900
5901	ret = 0;
5902unlock:
5903	rcu_read_unlock();
5904
5905	return ret;
5906}
5907
5908static void ring_buffer_attach(struct perf_event *event,
5909			       struct perf_buffer *rb)
5910{
5911	struct perf_buffer *old_rb = NULL;
5912	unsigned long flags;
5913
5914	WARN_ON_ONCE(event->parent);
5915
5916	if (event->rb) {
5917		/*
		 * This should be impossible: we set this when removing
5919		 * event->rb_entry and wait/clear when adding event->rb_entry.
5920		 */
5921		WARN_ON_ONCE(event->rcu_pending);
5922
5923		old_rb = event->rb;
5924		spin_lock_irqsave(&old_rb->event_lock, flags);
5925		list_del_rcu(&event->rb_entry);
5926		spin_unlock_irqrestore(&old_rb->event_lock, flags);
5927
5928		event->rcu_batches = get_state_synchronize_rcu();
5929		event->rcu_pending = 1;
5930	}
5931
5932	if (rb) {
5933		if (event->rcu_pending) {
5934			cond_synchronize_rcu(event->rcu_batches);
5935			event->rcu_pending = 0;
5936		}
5937
5938		spin_lock_irqsave(&rb->event_lock, flags);
5939		list_add_rcu(&event->rb_entry, &rb->event_list);
5940		spin_unlock_irqrestore(&rb->event_lock, flags);
5941	}
5942
5943	/*
5944	 * Avoid racing with perf_mmap_close(AUX): stop the event
5945	 * before swizzling the event::rb pointer; if it's getting
5946	 * unmapped, its aux_mmap_count will be 0 and it won't
5947	 * restart. See the comment in __perf_pmu_output_stop().
5948	 *
5949	 * Data will inevitably be lost when set_output is done in
5950	 * mid-air, but then again, whoever does it like this is
5951	 * not in for the data anyway.
5952	 */
5953	if (has_aux(event))
5954		perf_event_stop(event, 0);
5955
5956	rcu_assign_pointer(event->rb, rb);
5957
5958	if (old_rb) {
5959		ring_buffer_put(old_rb);
5960		/*
5961		 * Since we detached the old rb before attaching the new
5962		 * one, we could have missed a wakeup in between.
5963		 * Provide it now.
5964		 */
5965		wake_up_all(&event->waitq);
5966	}
5967}
5968
5969static void ring_buffer_wakeup(struct perf_event *event)
5970{
5971	struct perf_buffer *rb;
5972
5973	if (event->parent)
5974		event = event->parent;
5975
5976	rcu_read_lock();
5977	rb = rcu_dereference(event->rb);
5978	if (rb) {
5979		list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5980			wake_up_all(&event->waitq);
5981	}
5982	rcu_read_unlock();
5983}
5984
5985struct perf_buffer *ring_buffer_get(struct perf_event *event)
5986{
5987	struct perf_buffer *rb;
5988
5989	if (event->parent)
5990		event = event->parent;
5991
5992	rcu_read_lock();
5993	rb = rcu_dereference(event->rb);
5994	if (rb) {
5995		if (!refcount_inc_not_zero(&rb->refcount))
5996			rb = NULL;
5997	}
5998	rcu_read_unlock();
5999
6000	return rb;
6001}
6002
6003void ring_buffer_put(struct perf_buffer *rb)
6004{
6005	if (!refcount_dec_and_test(&rb->refcount))
6006		return;
6007
6008	WARN_ON_ONCE(!list_empty(&rb->event_list));
6009
6010	call_rcu(&rb->rcu_head, rb_free_rcu);
6011}
6012
6013static void perf_mmap_open(struct vm_area_struct *vma)
6014{
6015	struct perf_event *event = vma->vm_file->private_data;
6016
6017	atomic_inc(&event->mmap_count);
6018	atomic_inc(&event->rb->mmap_count);
6019
6020	if (vma->vm_pgoff)
6021		atomic_inc(&event->rb->aux_mmap_count);
6022
6023	if (event->pmu->event_mapped)
6024		event->pmu->event_mapped(event, vma->vm_mm);
6025}
6026
6027static void perf_pmu_output_stop(struct perf_event *event);
6028
6029/*
6030 * A buffer can be mmap()ed multiple times; either directly through the same
6031 * event, or through other events by use of perf_event_set_output().
6032 *
6033 * In order to undo the VM accounting done by perf_mmap() we need to destroy
6034 * the buffer here, where we still have a VM context. This means we need
6035 * to detach all events redirecting to us.
6036 */
6037static void perf_mmap_close(struct vm_area_struct *vma)
6038{
6039	struct perf_event *event = vma->vm_file->private_data;
6040	struct perf_buffer *rb = ring_buffer_get(event);
6041	struct user_struct *mmap_user = rb->mmap_user;
6042	int mmap_locked = rb->mmap_locked;
6043	unsigned long size = perf_data_size(rb);
6044	bool detach_rest = false;
6045
6046	if (event->pmu->event_unmapped)
6047		event->pmu->event_unmapped(event, vma->vm_mm);
6048
6049	/*
6050	 * rb->aux_mmap_count will always drop before rb->mmap_count and
6051	 * event->mmap_count, so it is ok to use event->mmap_mutex to
6052	 * serialize with perf_mmap here.
6053	 */
6054	if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
6055	    atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
6056		/*
6057		 * Stop all AUX events that are writing to this buffer,
6058		 * so that we can free its AUX pages and corresponding PMU
6059		 * data. Note that once rb::aux_mmap_count has dropped to zero,
6060		 * they won't start any more (see perf_aux_output_begin()).
6061		 */
6062		perf_pmu_output_stop(event);
6063
6064		/* now it's safe to free the pages */
6065		atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
6066		atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
6067
6068		/* this has to be the last one */
6069		rb_free_aux(rb);
6070		WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
6071
6072		mutex_unlock(&event->mmap_mutex);
6073	}
6074
6075	if (atomic_dec_and_test(&rb->mmap_count))
6076		detach_rest = true;
6077
6078	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
6079		goto out_put;
6080
6081	ring_buffer_attach(event, NULL);
6082	mutex_unlock(&event->mmap_mutex);
6083
6084	/* If there's still other mmap()s of this buffer, we're done. */
6085	if (!detach_rest)
6086		goto out_put;
6087
6088	/*
6089	 * No other mmap()s, detach from all other events that might redirect
6090	 * into the now unreachable buffer. Somewhat complicated by the
6091	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
6092	 */
6093again:
6094	rcu_read_lock();
6095	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
6096		if (!atomic_long_inc_not_zero(&event->refcount)) {
6097			/*
6098			 * This event is en-route to free_event() which will
6099			 * detach it and remove it from the list.
6100			 */
6101			continue;
6102		}
6103		rcu_read_unlock();
6104
6105		mutex_lock(&event->mmap_mutex);
6106		/*
6107		 * Check we didn't race with perf_event_set_output() which can
6108		 * swizzle the rb from under us while we were waiting to
6109		 * acquire mmap_mutex.
6110		 *
6111		 * If we find a different rb, ignore this event; the next
6112		 * iteration will no longer find it on the list. We still
6113		 * have to restart the iteration to make sure we're not now
6114		 * iterating the wrong list.
6115		 */
6116		if (event->rb == rb)
6117			ring_buffer_attach(event, NULL);
6118
6119		mutex_unlock(&event->mmap_mutex);
6120		put_event(event);
6121
6122		/*
6123		 * Restart the iteration; either we're on the wrong list or
6124		 * we destroyed its integrity by deleting an entry.
6125		 */
6126		goto again;
6127	}
6128	rcu_read_unlock();
6129
6130	/*
6131	 * There may still be a few 0-ref events on the list; they'll
6132	 * get cleaned up by free_event() -- they also still hold their
6133	 * ref on the rb and will free it whenever they are done with it.
6134	 *
6135	 * Aside from that, this buffer is 'fully' detached and unmapped,
6136	 * undo the VM accounting.
6137	 */
6138
6139	atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
6140			&mmap_user->locked_vm);
6141	atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
6142	free_uid(mmap_user);
6143
6144out_put:
6145	ring_buffer_put(rb); /* could be last */
6146}
6147
6148static const struct vm_operations_struct perf_mmap_vmops = {
6149	.open		= perf_mmap_open,
6150	.close		= perf_mmap_close, /* non mergeable */
6151	.fault		= perf_mmap_fault,
6152	.page_mkwrite	= perf_mmap_fault,
6153};
6154
6155static int perf_mmap(struct file *file, struct vm_area_struct *vma)
6156{
6157	struct perf_event *event = file->private_data;
6158	unsigned long user_locked, user_lock_limit;
6159	struct user_struct *user = current_user();
6160	struct perf_buffer *rb = NULL;
6161	unsigned long locked, lock_limit;
6162	unsigned long vma_size;
6163	unsigned long nr_pages;
6164	long user_extra = 0, extra = 0;
6165	int ret = 0, flags = 0;
6166
6167	/*
6168	 * Don't allow mmap() of inherited per-task counters. This would
6169	 * create a performance issue due to all children writing to the
6170	 * same rb.
6171	 */
6172	if (event->cpu == -1 && event->attr.inherit)
6173		return -EINVAL;
6174
6175	if (!(vma->vm_flags & VM_SHARED))
6176		return -EINVAL;
6177
6178	ret = security_perf_event_read(event);
6179	if (ret)
6180		return ret;
6181
6182	vma_size = vma->vm_end - vma->vm_start;
6183
6184	if (vma->vm_pgoff == 0) {
6185		nr_pages = (vma_size / PAGE_SIZE) - 1;
6186	} else {
6187		/*
6188		 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
6189		 * mapped; all subsequent mappings must have the same size
6190		 * and offset, and must be placed above the normal perf buffer.
6191		 */
6192		u64 aux_offset, aux_size;
6193
6194		if (!event->rb)
6195			return -EINVAL;
6196
6197		nr_pages = vma_size / PAGE_SIZE;
6198
6199		mutex_lock(&event->mmap_mutex);
6200		ret = -EINVAL;
6201
6202		rb = event->rb;
6203		if (!rb)
6204			goto aux_unlock;
6205
6206		aux_offset = READ_ONCE(rb->user_page->aux_offset);
6207		aux_size = READ_ONCE(rb->user_page->aux_size);
6208
6209		if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
6210			goto aux_unlock;
6211
6212		if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
6213			goto aux_unlock;
6214
6215		/* already mapped with a different offset */
6216		if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
6217			goto aux_unlock;
6218
6219		if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
6220			goto aux_unlock;
6221
6222		/* already mapped with a different size */
6223		if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
6224			goto aux_unlock;
6225
6226		if (!is_power_of_2(nr_pages))
6227			goto aux_unlock;
6228
6229		if (!atomic_inc_not_zero(&rb->mmap_count))
6230			goto aux_unlock;
6231
6232		if (rb_has_aux(rb)) {
6233			atomic_inc(&rb->aux_mmap_count);
6234			ret = 0;
6235			goto unlock;
6236		}
6237
6238		atomic_set(&rb->aux_mmap_count, 1);
6239		user_extra = nr_pages;
6240
6241		goto accounting;
6242	}
6243
6244	/*
6245	 * If we have rb pages, ensure they're a power-of-two number so we
6246	 * can use bitmasks instead of modulo.
6247	 */
6248	if (nr_pages != 0 && !is_power_of_2(nr_pages))
6249		return -EINVAL;
6250
6251	if (vma_size != PAGE_SIZE * (1 + nr_pages))
6252		return -EINVAL;
6253
6254	WARN_ON_ONCE(event->ctx->parent_ctx);
6255again:
6256	mutex_lock(&event->mmap_mutex);
6257	if (event->rb) {
6258		if (data_page_nr(event->rb) != nr_pages) {
6259			ret = -EINVAL;
6260			goto unlock;
6261		}
6262
6263		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
6264			/*
6265			 * Raced against perf_mmap_close(); remove the
6266			 * event and try again.
6267			 */
6268			ring_buffer_attach(event, NULL);
6269			mutex_unlock(&event->mmap_mutex);
6270			goto again;
6271		}
6272
6273		goto unlock;
6274	}
6275
6276	user_extra = nr_pages + 1;
6277
6278accounting:
6279	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
6280
6281	/*
6282	 * Increase the limit linearly with more CPUs:
6283	 */
6284	user_lock_limit *= num_online_cpus();
6285
6286	user_locked = atomic_long_read(&user->locked_vm);
6287
6288	/*
6289	 * sysctl_perf_event_mlock may have changed, so that
6290	 *     user->locked_vm > user_lock_limit
6291	 */
6292	if (user_locked > user_lock_limit)
6293		user_locked = user_lock_limit;
6294	user_locked += user_extra;
6295
6296	if (user_locked > user_lock_limit) {
6297		/*
6298		 * charge locked_vm until it hits user_lock_limit;
6299		 * charge the rest from pinned_vm
6300		 */
6301		extra = user_locked - user_lock_limit;
6302		user_extra -= extra;
6303	}
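
	/*
	 * Worked example (illustrative numbers only, added for clarity): with
	 * user_lock_limit = 512 pages, user->locked_vm already at 480 pages
	 * and user_extra = 64 pages, user_locked becomes 544.  That exceeds
	 * the limit by extra = 32 pages, which get charged to the mm's
	 * pinned_vm below, while the remaining user_extra = 32 pages are
	 * charged to the user's locked_vm.
	 */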
6304
6305	lock_limit = rlimit(RLIMIT_MEMLOCK);
6306	lock_limit >>= PAGE_SHIFT;
6307	locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
6308
6309	if ((locked > lock_limit) && perf_is_paranoid() &&
6310		!capable(CAP_IPC_LOCK)) {
6311		ret = -EPERM;
6312		goto unlock;
6313	}
6314
6315	WARN_ON(!rb && event->rb);
6316
6317	if (vma->vm_flags & VM_WRITE)
6318		flags |= RING_BUFFER_WRITABLE;
6319
6320	if (!rb) {
6321		rb = rb_alloc(nr_pages,
6322			      event->attr.watermark ? event->attr.wakeup_watermark : 0,
6323			      event->cpu, flags);
6324
6325		if (!rb) {
6326			ret = -ENOMEM;
6327			goto unlock;
6328		}
6329
6330		atomic_set(&rb->mmap_count, 1);
6331		rb->mmap_user = get_current_user();
6332		rb->mmap_locked = extra;
6333
6334		ring_buffer_attach(event, rb);
6335
6336		perf_event_update_time(event);
6337		perf_event_init_userpage(event);
6338		perf_event_update_userpage(event);
6339	} else {
6340		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
6341				   event->attr.aux_watermark, flags);
6342		if (!ret)
6343			rb->aux_mmap_locked = extra;
6344	}
6345
6346unlock:
6347	if (!ret) {
6348		atomic_long_add(user_extra, &user->locked_vm);
6349		atomic64_add(extra, &vma->vm_mm->pinned_vm);
6350
6351		atomic_inc(&event->mmap_count);
6352	} else if (rb) {
6353		atomic_dec(&rb->mmap_count);
6354	}
6355aux_unlock:
6356	mutex_unlock(&event->mmap_mutex);
6357
6358	/*
6359	 * Since pinned accounting is per vm, we cannot allow fork() to copy our
6360	 * vma.
6361	 */
6362	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
6363	vma->vm_ops = &perf_mmap_vmops;
6364
6365	if (event->pmu->event_mapped)
6366		event->pmu->event_mapped(event, vma->vm_mm);
6367
6368	return ret;
6369}
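
/*
 * Editorial illustration (user-space sketch, not kernel code): the size and
 * offset checks in perf_mmap() above translate into the mapping sequence
 * below.  setup_perf_mappings() is a hypothetical helper; 'fd' is assumed to
 * be a file descriptor returned by perf_event_open(), and both data_pages
 * and aux_pages must be powers of two.  Error handling is minimal.
 */
#include <sys/mman.h>
#include <unistd.h>
#include <linux/perf_event.h>

static int setup_perf_mappings(int fd, size_t data_pages, size_t aux_pages)
{
	size_t page = sysconf(_SC_PAGESIZE);
	struct perf_event_mmap_page *meta;
	void *data, *aux;

	/* pgoff 0: one metadata page followed by 2^n data pages. */
	data = mmap(NULL, (data_pages + 1) * page, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
	if (data == MAP_FAILED)
		return -1;
	meta = data;

	if (!aux_pages)
		return 0;

	/*
	 * AUX area: publish its offset and size in the user page first,
	 * then map it at exactly that file offset, above the data area.
	 */
	meta->aux_offset = (data_pages + 1) * page;
	meta->aux_size   = aux_pages * page;
	aux = mmap(NULL, meta->aux_size, PROT_READ | PROT_WRITE,
		   MAP_SHARED, fd, (off_t)meta->aux_offset);

	return aux == MAP_FAILED ? -1 : 0;
}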
6370
6371static int perf_fasync(int fd, struct file *filp, int on)
6372{
6373	struct inode *inode = file_inode(filp);
6374	struct perf_event *event = filp->private_data;
6375	int retval;
6376
6377	inode_lock(inode);
6378	retval = fasync_helper(fd, filp, on, &event->fasync);
6379	inode_unlock(inode);
6380
6381	if (retval < 0)
6382		return retval;
6383
6384	return 0;
6385}
6386
6387static const struct file_operations perf_fops = {
6388	.llseek			= no_llseek,
6389	.release		= perf_release,
6390	.read			= perf_read,
6391	.poll			= perf_poll,
6392	.unlocked_ioctl		= perf_ioctl,
6393	.compat_ioctl		= perf_compat_ioctl,
6394	.mmap			= perf_mmap,
6395	.fasync			= perf_fasync,
6396};
6397
6398/*
6399 * Perf event wakeup
6400 *
6401 * If there's data, ensure we set the poll() state and publish everything
6402 * to user-space before waking everybody up.
6403 */
6404
6405static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
6406{
6407	/* only the parent has fasync state */
6408	if (event->parent)
6409		event = event->parent;
6410	return &event->fasync;
6411}
6412
6413void perf_event_wakeup(struct perf_event *event)
6414{
6415	ring_buffer_wakeup(event);
6416
6417	if (event->pending_kill) {
6418		kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
6419		event->pending_kill = 0;
6420	}
6421}
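
/*
 * Editorial illustration (user-space sketch, not kernel code): the fasync
 * hook above is what lets a monitoring process ask for SIGIO instead of
 * polling.  'fd' is assumed to come from perf_event_open(); error handling
 * is omitted for brevity.
 */
#include <fcntl.h>
#include <unistd.h>

static void request_sigio_on_wakeup(int fd)
{
	/* Route SIGIO to this process whenever perf_event_wakeup() fires. */
	fcntl(fd, F_SETOWN, getpid());
	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
}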
6422
6423static void perf_sigtrap(struct perf_event *event)
6424{
6425	/*
6426	 * We'd expect this to only occur if the irq_work is delayed and either
6427	 * ctx->task or current has changed in the meantime. This can be the
6428	 * case on architectures that do not implement arch_irq_work_raise().
6429	 */
6430	if (WARN_ON_ONCE(event->ctx->task != current))
6431		return;
6432
6433	/*
6434	 * perf_pending_event() can race with the task exiting.
6435	 */
6436	if (current->flags & PF_EXITING)
6437		return;
6438
6439	send_sig_perf((void __user *)event->pending_addr,
6440		      event->attr.type, event->attr.sig_data);
6441}
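
/*
 * Editorial illustration (user-space sketch, not kernel code): consuming the
 * signal sent by perf_sigtrap() above.  It assumes the event was opened with
 * attr.sigtrap = 1 and a caller-chosen attr.sig_data cookie, and that the
 * libc headers expose the si_perf_data member and TRAP_PERF (both are
 * relatively recent additions).  The handler writes to stderr only to keep
 * the sketch short; real handlers should stick to async-signal-safe calls.
 */
#include <signal.h>
#include <stdio.h>
#include <string.h>

static void perf_sigtrap_handler(int sig, siginfo_t *info, void *ucontext)
{
	if (info->si_code != TRAP_PERF)
		return;

	/* si_perf_data carries attr.sig_data; si_addr the sampled address. */
	fprintf(stderr, "perf event: cookie=%llx addr=%p\n",
		(unsigned long long)info->si_perf_data, info->si_addr);
}

static void install_perf_sigtrap_handler(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = perf_sigtrap_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGTRAP, &sa, NULL);
}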
6442
6443static void perf_pending_event_disable(struct perf_event *event)
6444{
6445	int cpu = READ_ONCE(event->pending_disable);
6446
6447	if (cpu < 0)
6448		return;
6449
6450	if (cpu == smp_processor_id()) {
6451		WRITE_ONCE(event->pending_disable, -1);
6452
6453		if (event->attr.sigtrap) {
6454			perf_sigtrap(event);
6455			atomic_set_release(&event->event_limit, 1); /* rearm event */
6456			return;
6457		}
6458
6459		perf_event_disable_local(event);
6460		return;
6461	}
6462
6463	/*
6464	 *  CPU-A			CPU-B