/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/vmstat.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>

#include <asm/irq_regs.h>

/*
 * Each CPU has a list of per CPU events:
 */
static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

int perf_max_events __read_mostly = 1;
static int perf_reserved_percpu __read_mostly;
static int perf_overcommit __read_mostly = 1;

static atomic_t nr_events __read_mostly;
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 1;

int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */

/*
 * max perf event sample rate
 */
int sysctl_perf_event_sample_rate __read_mostly = 100000;

static atomic64_t perf_event_id;

/*
 * Lock for (sysadmin-configurable) event reservations:
 */
static DEFINE_SPINLOCK(perf_resource_lock);

/*
 * Architecture provided APIs - weak aliases:
 */
extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
{
	return NULL;
}

void __weak hw_perf_disable(void)		{ barrier(); }
void __weak hw_perf_enable(void)		{ barrier(); }

void __weak perf_event_print_debug(void)	{ }

static DEFINE_PER_CPU(int, perf_disable_count);

void perf_disable(void)
{
	if (!__get_cpu_var(perf_disable_count)++)
		hw_perf_disable();
}

void perf_enable(void)
{
	if (!--__get_cpu_var(perf_disable_count))
		hw_perf_enable();
}

static void get_ctx(struct perf_event_context *ctx)
{
	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}

static void free_ctx(struct rcu_head *head)
{
	struct perf_event_context *ctx;

	ctx = container_of(head, struct perf_event_context, rcu_head);
	kfree(ctx);
}

static void put_ctx(struct perf_event_context *ctx)
{
	if (atomic_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
		if (ctx->task)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
	}
}

static void unclone_ctx(struct perf_event_context *ctx)
{
	if (ctx->parent_ctx) {
		put_ctx(ctx->parent_ctx);
		ctx->parent_ctx = NULL;
	}
}

/*
 * If we inherit events we want to return the parent event id
 * to userspace.
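 * This lets userspace correlate the values from all inherited
 * copies of an event under a single id.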
 */
static u64 primary_event_id(struct perf_event *event)
{
	u64 id = event->id;

	if (event->parent)
		id = event->parent->id;

	return id;
}

/*
 * Get the perf_event_context for a task and lock it.
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, unsigned long *flags)
{
	struct perf_event_context *ctx;

	rcu_read_lock();
retry:
	ctx = rcu_dereference(task->perf_event_ctxp);
	if (ctx) {
		/*
		 * If this context is a clone of another, it might
		 * get swapped for another underneath us by
		 * perf_event_task_sched_out, though the
		 * rcu_read_lock() protects us from any context
		 * getting freed.  Lock the context and check if it
		 * got swapped before we could get the lock, and retry
		 * if so.  If we locked the right context, then it
		 * can't get swapped on us any more.
		 */
		raw_spin_lock_irqsave(&ctx->lock, *flags);
		if (ctx != rcu_dereference(task->perf_event_ctxp)) {
			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
			goto retry;
		}

		if (!atomic_inc_not_zero(&ctx->refcount)) {
			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
			ctx = NULL;
		}
	}
	rcu_read_unlock();
	return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
{
	struct perf_event_context *ctx;
	unsigned long flags;

	ctx = perf_lock_task_context(task, &flags);
	if (ctx) {
		++ctx->pin_count;
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
	}
	return ctx;
}

static void perf_unpin_context(struct perf_event_context *ctx)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&ctx->lock, flags);
	--ctx->pin_count;
	raw_spin_unlock_irqrestore(&ctx->lock, flags);
	put_ctx(ctx);
}

static inline u64 perf_clock(void)
{
	return local_clock();
}

/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_event_context *ctx)
{
	u64 now = perf_clock();

	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
}

/*
 * Update the total_time_enabled and total_time_running fields for an event.
 */
static void update_event_times(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	u64 run_end;

	if (event->state < PERF_EVENT_STATE_INACTIVE ||
	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
		return;

	if (ctx->is_active)
		run_end = ctx->time;
	else
		run_end = event->tstamp_stopped;

	event->total_time_enabled = run_end - event->tstamp_enabled;

	if (event->state == PERF_EVENT_STATE_INACTIVE)
		run_end = event->tstamp_stopped;
	else
		run_end = ctx->time;

	event->total_time_running = run_end - event->tstamp_running;
}

/*
 * Update total_time_enabled and total_time_running for all events in a group.
 */
static void update_group_times(struct perf_event *leader)
{
	struct perf_event *event;

	update_event_times(leader);
	list_for_each_entry(event, &leader->sibling_list, group_entry)
		update_event_times(event);
}

static struct list_head *
ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
{
	if (event->attr.pinned)
		return &ctx->pinned_groups;
	else
		return &ctx->flexible_groups;
}

/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
	event->attach_state |= PERF_ATTACH_CONTEXT;

	/*
	 * If we're a stand alone event or group leader, we go to the context
	 * list, group events are kept attached to the group so that
	 * perf_group_detach can, at all times, locate all siblings.
	 */
	if (event->group_leader == event) {
		struct list_head *list;

		if (is_software_event(event))
			event->group_flags |= PERF_GROUP_SOFTWARE;

		list = ctx_group_list(event, ctx);
		list_add_tail(&event->group_entry, list);
	}

	list_add_rcu(&event->event_entry, &ctx->event_list);
	ctx->nr_events++;
	if (event->attr.inherit_stat)
		ctx->nr_stat++;
}

static void perf_group_attach(struct perf_event *event)
{
	struct perf_event *group_leader = event->group_leader;

	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
	event->attach_state |= PERF_ATTACH_GROUP;

	if (group_leader == event)
		return;

	if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
			!is_software_event(event))
		group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;

	list_add_tail(&event->group_entry, &group_leader->sibling_list);
	group_leader->nr_siblings++;
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_CONTEXT))
		return;

	event->attach_state &= ~PERF_ATTACH_CONTEXT;

	ctx->nr_events--;
	if (event->attr.inherit_stat)
		ctx->nr_stat--;

	list_del_rcu(&event->event_entry);

	if (event->group_leader == event)
		list_del_init(&event->group_entry);

	update_group_times(event);

	/*
	 * If event was in error state, then keep it
	 * that way, otherwise bogus counts will be
	 * returned on read(). The only way to get out
	 * of error state is by explicit re-enabling
	 * of the event
	 */
	if (event->state > PERF_EVENT_STATE_OFF)
		event->state = PERF_EVENT_STATE_OFF;
}

static void perf_group_detach(struct perf_event *event)
{
	struct perf_event *sibling, *tmp;
	struct list_head *list = NULL;

	/*
	 * We can have double detach due to exit/hot-unplug + close.
	 */
	if (!(event->attach_state & PERF_ATTACH_GROUP))
		return;

	event->attach_state &= ~PERF_ATTACH_GROUP;

	/*
	 * If this is a sibling, remove it from its group.
	 */
	if (event->group_leader != event) {
		list_del_init(&event->group_entry);
		event->group_leader->nr_siblings--;
		return;
	}

	if (!list_empty(&event->group_entry))
		list = &event->group_entry;

	/*
	 * If this was a group event with sibling events then
	 * upgrade the siblings to singleton events by adding them
	 * to whatever list we are on.
	 */
	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
		if (list)
			list_move_tail(&sibling->group_entry, list);
		sibling->group_leader = sibling;

		/* Inherit group flags from the previous leader */
		sibling->group_flags = event->group_flags;
	}
}

static inline int
event_filter_match(struct perf_event *event)
{
	return event->cpu == -1 || event->cpu == smp_processor_id();
}

static void
event_sched_out(struct perf_event *event,
		  struct perf_cpu_context *cpuctx,
		  struct perf_event_context *ctx)
{
	u64 delta;
	/*
	 * An event which could not be activated because of
	 * filter mismatch still needs to have its timings
	 * maintained, otherwise bogus information is returned
	 * via read() for time_enabled, time_running:
	 */
	if (event->state == PERF_EVENT_STATE_INACTIVE
	    && !event_filter_match(event)) {
		delta = ctx->time - event->tstamp_stopped;
		event->tstamp_running += delta;
		event->tstamp_stopped = ctx->time;
	}

	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	event->state = PERF_EVENT_STATE_INACTIVE;
	if (event->pending_disable) {
		event->pending_disable = 0;
		event->state = PERF_EVENT_STATE_OFF;
	}
	event->tstamp_stopped = ctx->time;
	event->pmu->disable(event);
	event->oncpu = -1;

	if (!is_software_event(event))
		cpuctx->active_oncpu--;
	ctx->nr_active--;
	if (event->attr.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;
}

static void
group_sched_out(struct perf_event *group_event,
		struct perf_cpu_context *cpuctx,
		struct perf_event_context *ctx)
{
	struct perf_event *event;
	int state = group_event->state;

	event_sched_out(group_event, cpuctx, ctx);

	/*
	 * Schedule out siblings (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry)
		event_sched_out(event, cpuctx, ctx);

	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
		cpuctx->exclusive = 0;
}

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
static void __perf_event_remove_from_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	raw_spin_lock(&ctx->lock);
	/*
	 * Protect the list operation against NMI by disabling the
	 * events on a global level.
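	 * (This is a NOP for non NMI based events.)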
	 */
	perf_disable();

	event_sched_out(event, cpuctx, ctx);

	list_del_event(event, ctx);

	if (!ctx->task) {
		/*
		 * Allow more per task events with respect to the
		 * reservation:
		 */
		cpuctx->max_pertask =
			min(perf_max_events - ctx->nr_events,
			    perf_max_events - perf_reserved_percpu);
	}

	perf_enable();
	raw_spin_unlock(&ctx->lock);
}


/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * Must be called with ctx->mutex held.
 *
 * CPU events are removed with a smp call. For task events we only
 * call when the task is on a CPU.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
static void perf_event_remove_from_context(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu events are removed via an smp call and
		 * the removal is always successful.
		 */
		smp_call_function_single(event->cpu,
					 __perf_event_remove_from_context,
					 event, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_event_remove_from_context,
				 event);

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active we need to retry the smp call.
	 */
	if (ctx->nr_active && !list_empty(&event->group_entry)) {
		raw_spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents this context from being scheduled in, so we
	 * can remove the event safely, if the call above did not
	 * succeed.
	 */
	if (!list_empty(&event->group_entry))
		list_del_event(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to disable a performance event
 */
static void __perf_event_disable(void *info)
{
	struct perf_event *event = info;
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event_context *ctx = event->ctx;

	/*
	 * If this is a per-task event, need to check whether this
	 * event's task is the current task on this cpu.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	raw_spin_lock(&ctx->lock);

	/*
	 * If the event is on, turn it off.
	 * If it is in error state, leave it in error state.
	 */
	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
		update_context_time(ctx);
		update_group_times(event);
		if (event == event->group_leader)
			group_sched_out(event, cpuctx, ctx);
		else
			event_sched_out(event, cpuctx, ctx);
		event->state = PERF_EVENT_STATE_OFF;
	}

	raw_spin_unlock(&ctx->lock);
}

/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in sync_child_event.
 * When called from perf_pending_event it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
void perf_event_disable(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Disable the event on the cpu that it's on
		 */
		smp_call_function_single(event->cpu, __perf_event_disable,
					 event, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_event_disable, event);

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * If the event is still active, we need to retry the cross-call.
	 */
	if (event->state == PERF_EVENT_STATE_ACTIVE) {
		raw_spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
	if (event->state == PERF_EVENT_STATE_INACTIVE) {
		update_group_times(event);
		event->state = PERF_EVENT_STATE_OFF;
	}

	raw_spin_unlock_irq(&ctx->lock);
}

static int
event_sched_in(struct perf_event *event,
		 struct perf_cpu_context *cpuctx,
		 struct perf_event_context *ctx)
{
	if (event->state <= PERF_EVENT_STATE_OFF)
		return 0;

	event->state = PERF_EVENT_STATE_ACTIVE;
	event->oncpu = smp_processor_id();
	/*
	 * The new state must be visible before we turn it on in the hardware:
	 */
	smp_wmb();

	if (event->pmu->enable(event)) {
		event->state = PERF_EVENT_STATE_INACTIVE;
		event->oncpu = -1;
		return -EAGAIN;
	}

	event->tstamp_running += ctx->time - event->tstamp_stopped;

	if (!is_software_event(event))
		cpuctx->active_oncpu++;
	ctx->nr_active++;

	if (event->attr.exclusive)
		cpuctx->exclusive = 1;

	return 0;
}

static int
group_sched_in(struct perf_event *group_event,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx)
{
	struct perf_event *event, *partial_group = NULL;
	const struct pmu *pmu = group_event->pmu;
	bool txn = false;

	if (group_event->state == PERF_EVENT_STATE_OFF)
		return 0;

	/* Check if a group transaction is available */
	if (pmu->start_txn)
		txn = true;

	if (txn)
		pmu->start_txn(pmu);

	if (event_sched_in(group_event, cpuctx, ctx)) {
		if (txn)
			pmu->cancel_txn(pmu);
		return -EAGAIN;
	}

	/*
	 * Schedule in siblings as one group (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event_sched_in(event, cpuctx, ctx)) {
			partial_group = event;
			goto group_error;
		}
	}

	if (!txn || !pmu->commit_txn(pmu))
		return 0;

group_error:
	/*
	 * Groups can be scheduled in as one unit only, so undo any
	 * partial group before returning:
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event == partial_group)
			break;
		event_sched_out(event, cpuctx, ctx);
	}
	event_sched_out(group_event, cpuctx, ctx);

	if (txn)
		pmu->cancel_txn(pmu);

	return -EAGAIN;
}

/*
 * Work out whether we can put this event group on the CPU now.
 */
static int group_can_go_on(struct perf_event *event,
			   struct perf_cpu_context *cpuctx,
			   int can_add_hw)
{
	/*
	 * Groups consisting entirely of software events can always go on.
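	 * (Software events never compete for hardware counters, so
	 * scheduling them in cannot fail.)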
753 */ 754 if (event->group_flags & PERF_GROUP_SOFTWARE) 755 return 1; 756 /* 757 * If an exclusive group is already on, no other hardware 758 * events can go on. 759 */ 760 if (cpuctx->exclusive) 761 return 0; 762 /* 763 * If this group is exclusive and there are already 764 * events on the CPU, it can't go on. 765 */ 766 if (event->attr.exclusive && cpuctx->active_oncpu) 767 return 0; 768 /* 769 * Otherwise, try to add it if all previous groups were able 770 * to go on. 771 */ 772 return can_add_hw; 773} 774 775static void add_event_to_ctx(struct perf_event *event, 776 struct perf_event_context *ctx) 777{ 778 list_add_event(event, ctx); 779 perf_group_attach(event); 780 event->tstamp_enabled = ctx->time; 781 event->tstamp_running = ctx->time; 782 event->tstamp_stopped = ctx->time; 783} 784 785/* 786 * Cross CPU call to install and enable a performance event 787 * 788 * Must be called with ctx->mutex held 789 */ 790static void __perf_install_in_context(void *info) 791{ 792 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 793 struct perf_event *event = info; 794 struct perf_event_context *ctx = event->ctx; 795 struct perf_event *leader = event->group_leader; 796 int err; 797 798 /* 799 * If this is a task context, we need to check whether it is 800 * the current task context of this cpu. If not it has been 801 * scheduled out before the smp call arrived. 802 * Or possibly this is the right context but it isn't 803 * on this cpu because it had no events. 804 */ 805 if (ctx->task && cpuctx->task_ctx != ctx) { 806 if (cpuctx->task_ctx || ctx->task != current) 807 return; 808 cpuctx->task_ctx = ctx; 809 } 810 811 raw_spin_lock(&ctx->lock); 812 ctx->is_active = 1; 813 update_context_time(ctx); 814 815 /* 816 * Protect the list operation against NMI by disabling the 817 * events on a global level. NOP for non NMI based events. 818 */ 819 perf_disable(); 820 821 add_event_to_ctx(event, ctx); 822 823 if (event->cpu != -1 && event->cpu != smp_processor_id()) 824 goto unlock; 825 826 /* 827 * Don't put the event on if it is disabled or if 828 * it is in a group and the group isn't on. 829 */ 830 if (event->state != PERF_EVENT_STATE_INACTIVE || 831 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) 832 goto unlock; 833 834 /* 835 * An exclusive event can't go on if there are already active 836 * hardware events, and no hardware event can go on if there 837 * is already an exclusive event on. 838 */ 839 if (!group_can_go_on(event, cpuctx, 1)) 840 err = -EEXIST; 841 else 842 err = event_sched_in(event, cpuctx, ctx); 843 844 if (err) { 845 /* 846 * This event couldn't go on. If it is in a group 847 * then we have to pull the whole group off. 848 * If the event group is pinned then put it in error state. 849 */ 850 if (leader != event) 851 group_sched_out(leader, cpuctx, ctx); 852 if (leader->attr.pinned) { 853 update_group_times(leader); 854 leader->state = PERF_EVENT_STATE_ERROR; 855 } 856 } 857 858 if (!err && !ctx->task && cpuctx->max_pertask) 859 cpuctx->max_pertask--; 860 861 unlock: 862 perf_enable(); 863 864 raw_spin_unlock(&ctx->lock); 865} 866 867/* 868 * Attach a performance event to a context 869 * 870 * First we add the event to the list with the hardware enable bit 871 * in event->hw_config cleared. 872 * 873 * If the event is attached to a task which is on a CPU we use a smp 874 * call to enable it in the task context. The task might have been 875 * scheduled away, but we check this in the smp call again. 876 * 877 * Must be called with ctx->mutex held. 
 */
static void
perf_install_in_context(struct perf_event_context *ctx,
			struct perf_event *event,
			int cpu)
{
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu events are installed via an smp call and
		 * the install is always successful.
		 */
		smp_call_function_single(cpu, __perf_install_in_context,
					 event, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_install_in_context,
				 event);

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active and the event has not been added,
	 * we need to retry the smp call.
	 */
	if (ctx->is_active && list_empty(&event->group_entry)) {
		raw_spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents this context from being scheduled in, so we
	 * can add the event safely, if the call above did not
	 * succeed.
	 */
	if (list_empty(&event->group_entry))
		add_event_to_ctx(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Put an event into inactive state and update time fields.
 * Enabling the leader of a group effectively enables all
 * the group members that aren't explicitly disabled, so we
 * have to update their ->tstamp_enabled also.
 * Note: this works for group members as well as group leaders
 * since the non-leader members' sibling_lists will be empty.
 */
static void __perf_event_mark_enabled(struct perf_event *event,
					struct perf_event_context *ctx)
{
	struct perf_event *sub;

	event->state = PERF_EVENT_STATE_INACTIVE;
	event->tstamp_enabled = ctx->time - event->total_time_enabled;
	list_for_each_entry(sub, &event->sibling_list, group_entry)
		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
			sub->tstamp_enabled =
				ctx->time - sub->total_time_enabled;
}

/*
 * Cross CPU call to enable a performance event
 */
static void __perf_event_enable(void *info)
{
	struct perf_event *event = info;
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *leader = event->group_leader;
	int err;

	/*
	 * If this is a per-task event, need to check whether this
	 * event's task is the current task on this cpu.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx) {
		if (cpuctx->task_ctx || ctx->task != current)
			return;
		cpuctx->task_ctx = ctx;
	}

	raw_spin_lock(&ctx->lock);
	ctx->is_active = 1;
	update_context_time(ctx);

	if (event->state >= PERF_EVENT_STATE_INACTIVE)
		goto unlock;
	__perf_event_mark_enabled(event, ctx);

	if (event->cpu != -1 && event->cpu != smp_processor_id())
		goto unlock;

	/*
	 * If the event is in a group and isn't the group leader,
	 * then don't put it on unless the group is on.
	 */
	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
		goto unlock;

	if (!group_can_go_on(event, cpuctx, 1)) {
		err = -EEXIST;
	} else {
		perf_disable();
		if (event == leader)
			err = group_sched_in(event, cpuctx, ctx);
		else
			err = event_sched_in(event, cpuctx, ctx);
		perf_enable();
	}

	if (err) {
		/*
		 * If this event can't go on and it's part of a
		 * group, then the whole group has to come off.
994 */ 995 if (leader != event) 996 group_sched_out(leader, cpuctx, ctx); 997 if (leader->attr.pinned) { 998 update_group_times(leader); 999 leader->state = PERF_EVENT_STATE_ERROR; 1000 } 1001 } 1002 1003 unlock: 1004 raw_spin_unlock(&ctx->lock); 1005} 1006 1007/* 1008 * Enable a event. 1009 * 1010 * If event->ctx is a cloned context, callers must make sure that 1011 * every task struct that event->ctx->task could possibly point to 1012 * remains valid. This condition is satisfied when called through 1013 * perf_event_for_each_child or perf_event_for_each as described 1014 * for perf_event_disable. 1015 */ 1016void perf_event_enable(struct perf_event *event) 1017{ 1018 struct perf_event_context *ctx = event->ctx; 1019 struct task_struct *task = ctx->task; 1020 1021 if (!task) { 1022 /* 1023 * Enable the event on the cpu that it's on 1024 */ 1025 smp_call_function_single(event->cpu, __perf_event_enable, 1026 event, 1); 1027 return; 1028 } 1029 1030 raw_spin_lock_irq(&ctx->lock); 1031 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1032 goto out; 1033 1034 /* 1035 * If the event is in error state, clear that first. 1036 * That way, if we see the event in error state below, we 1037 * know that it has gone back into error state, as distinct 1038 * from the task having been scheduled away before the 1039 * cross-call arrived. 1040 */ 1041 if (event->state == PERF_EVENT_STATE_ERROR) 1042 event->state = PERF_EVENT_STATE_OFF; 1043 1044 retry: 1045 raw_spin_unlock_irq(&ctx->lock); 1046 task_oncpu_function_call(task, __perf_event_enable, event); 1047 1048 raw_spin_lock_irq(&ctx->lock); 1049 1050 /* 1051 * If the context is active and the event is still off, 1052 * we need to retry the cross-call. 1053 */ 1054 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) 1055 goto retry; 1056 1057 /* 1058 * Since we have the lock this context can't be scheduled 1059 * in, so we can change the state safely. 1060 */ 1061 if (event->state == PERF_EVENT_STATE_OFF) 1062 __perf_event_mark_enabled(event, ctx); 1063 1064 out: 1065 raw_spin_unlock_irq(&ctx->lock); 1066} 1067 1068static int perf_event_refresh(struct perf_event *event, int refresh) 1069{ 1070 /* 1071 * not supported on inherited events 1072 */ 1073 if (event->attr.inherit) 1074 return -EINVAL; 1075 1076 atomic_add(refresh, &event->event_limit); 1077 perf_event_enable(event); 1078 1079 return 0; 1080} 1081 1082enum event_type_t { 1083 EVENT_FLEXIBLE = 0x1, 1084 EVENT_PINNED = 0x2, 1085 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 1086}; 1087 1088static void ctx_sched_out(struct perf_event_context *ctx, 1089 struct perf_cpu_context *cpuctx, 1090 enum event_type_t event_type) 1091{ 1092 struct perf_event *event; 1093 1094 raw_spin_lock(&ctx->lock); 1095 ctx->is_active = 0; 1096 if (likely(!ctx->nr_events)) 1097 goto out; 1098 update_context_time(ctx); 1099 1100 perf_disable(); 1101 if (!ctx->nr_active) 1102 goto out_enable; 1103 1104 if (event_type & EVENT_PINNED) 1105 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 1106 group_sched_out(event, cpuctx, ctx); 1107 1108 if (event_type & EVENT_FLEXIBLE) 1109 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 1110 group_sched_out(event, cpuctx, ctx); 1111 1112 out_enable: 1113 perf_enable(); 1114 out: 1115 raw_spin_unlock(&ctx->lock); 1116} 1117 1118/* 1119 * Test whether two contexts are equivalent, i.e. whether they 1120 * have both been cloned from the same version of the same context 1121 * and they both have the same number of enabled events. 
1122 * If the number of enabled events is the same, then the set 1123 * of enabled events should be the same, because these are both 1124 * inherited contexts, therefore we can't access individual events 1125 * in them directly with an fd; we can only enable/disable all 1126 * events via prctl, or enable/disable all events in a family 1127 * via ioctl, which will have the same effect on both contexts. 1128 */ 1129static int context_equiv(struct perf_event_context *ctx1, 1130 struct perf_event_context *ctx2) 1131{ 1132 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx 1133 && ctx1->parent_gen == ctx2->parent_gen 1134 && !ctx1->pin_count && !ctx2->pin_count; 1135} 1136 1137static void __perf_event_sync_stat(struct perf_event *event, 1138 struct perf_event *next_event) 1139{ 1140 u64 value; 1141 1142 if (!event->attr.inherit_stat) 1143 return; 1144 1145 /* 1146 * Update the event value, we cannot use perf_event_read() 1147 * because we're in the middle of a context switch and have IRQs 1148 * disabled, which upsets smp_call_function_single(), however 1149 * we know the event must be on the current CPU, therefore we 1150 * don't need to use it. 1151 */ 1152 switch (event->state) { 1153 case PERF_EVENT_STATE_ACTIVE: 1154 event->pmu->read(event); 1155 /* fall-through */ 1156 1157 case PERF_EVENT_STATE_INACTIVE: 1158 update_event_times(event); 1159 break; 1160 1161 default: 1162 break; 1163 } 1164 1165 /* 1166 * In order to keep per-task stats reliable we need to flip the event 1167 * values when we flip the contexts. 1168 */ 1169 value = local64_read(&next_event->count); 1170 value = local64_xchg(&event->count, value); 1171 local64_set(&next_event->count, value); 1172 1173 swap(event->total_time_enabled, next_event->total_time_enabled); 1174 swap(event->total_time_running, next_event->total_time_running); 1175 1176 /* 1177 * Since we swizzled the values, update the user visible data too. 1178 */ 1179 perf_event_update_userpage(event); 1180 perf_event_update_userpage(next_event); 1181} 1182 1183#define list_next_entry(pos, member) \ 1184 list_entry(pos->member.next, typeof(*pos), member) 1185 1186static void perf_event_sync_stat(struct perf_event_context *ctx, 1187 struct perf_event_context *next_ctx) 1188{ 1189 struct perf_event *event, *next_event; 1190 1191 if (!ctx->nr_stat) 1192 return; 1193 1194 update_context_time(ctx); 1195 1196 event = list_first_entry(&ctx->event_list, 1197 struct perf_event, event_entry); 1198 1199 next_event = list_first_entry(&next_ctx->event_list, 1200 struct perf_event, event_entry); 1201 1202 while (&event->event_entry != &ctx->event_list && 1203 &next_event->event_entry != &next_ctx->event_list) { 1204 1205 __perf_event_sync_stat(event, next_event); 1206 1207 event = list_next_entry(event, event_entry); 1208 next_event = list_next_entry(next_event, event_entry); 1209 } 1210} 1211 1212/* 1213 * Called from scheduler to remove the events of the current task, 1214 * with interrupts disabled. 1215 * 1216 * We stop each event and update the event value in event->count. 1217 * 1218 * This does not protect us against NMI, but disable() 1219 * sets the disabled bit in the control field of event _before_ 1220 * accessing the event control register. If a NMI hits, then it will 1221 * not restart the event. 
1222 */ 1223void perf_event_task_sched_out(struct task_struct *task, 1224 struct task_struct *next) 1225{ 1226 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1227 struct perf_event_context *ctx = task->perf_event_ctxp; 1228 struct perf_event_context *next_ctx; 1229 struct perf_event_context *parent; 1230 int do_switch = 1; 1231 1232 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); 1233 1234 if (likely(!ctx || !cpuctx->task_ctx)) 1235 return; 1236 1237 rcu_read_lock(); 1238 parent = rcu_dereference(ctx->parent_ctx); 1239 next_ctx = next->perf_event_ctxp; 1240 if (parent && next_ctx && 1241 rcu_dereference(next_ctx->parent_ctx) == parent) { 1242 /* 1243 * Looks like the two contexts are clones, so we might be 1244 * able to optimize the context switch. We lock both 1245 * contexts and check that they are clones under the 1246 * lock (including re-checking that neither has been 1247 * uncloned in the meantime). It doesn't matter which 1248 * order we take the locks because no other cpu could 1249 * be trying to lock both of these tasks. 1250 */ 1251 raw_spin_lock(&ctx->lock); 1252 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 1253 if (context_equiv(ctx, next_ctx)) { 1254 task->perf_event_ctxp = next_ctx; 1255 next->perf_event_ctxp = ctx; 1256 ctx->task = next; 1257 next_ctx->task = task; 1258 do_switch = 0; 1259 1260 perf_event_sync_stat(ctx, next_ctx); 1261 } 1262 raw_spin_unlock(&next_ctx->lock); 1263 raw_spin_unlock(&ctx->lock); 1264 } 1265 rcu_read_unlock(); 1266 1267 if (do_switch) { 1268 ctx_sched_out(ctx, cpuctx, EVENT_ALL); 1269 cpuctx->task_ctx = NULL; 1270 } 1271} 1272 1273static void task_ctx_sched_out(struct perf_event_context *ctx, 1274 enum event_type_t event_type) 1275{ 1276 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1277 1278 if (!cpuctx->task_ctx) 1279 return; 1280 1281 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 1282 return; 1283 1284 ctx_sched_out(ctx, cpuctx, event_type); 1285 cpuctx->task_ctx = NULL; 1286} 1287 1288/* 1289 * Called with IRQs disabled 1290 */ 1291static void __perf_event_task_sched_out(struct perf_event_context *ctx) 1292{ 1293 task_ctx_sched_out(ctx, EVENT_ALL); 1294} 1295 1296/* 1297 * Called with IRQs disabled 1298 */ 1299static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 1300 enum event_type_t event_type) 1301{ 1302 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); 1303} 1304 1305static void 1306ctx_pinned_sched_in(struct perf_event_context *ctx, 1307 struct perf_cpu_context *cpuctx) 1308{ 1309 struct perf_event *event; 1310 1311 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 1312 if (event->state <= PERF_EVENT_STATE_OFF) 1313 continue; 1314 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1315 continue; 1316 1317 if (group_can_go_on(event, cpuctx, 1)) 1318 group_sched_in(event, cpuctx, ctx); 1319 1320 /* 1321 * If this pinned group hasn't been scheduled, 1322 * put it in error state. 
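		 * A read() on an event in error state then returns end-of-file.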
1323 */ 1324 if (event->state == PERF_EVENT_STATE_INACTIVE) { 1325 update_group_times(event); 1326 event->state = PERF_EVENT_STATE_ERROR; 1327 } 1328 } 1329} 1330 1331static void 1332ctx_flexible_sched_in(struct perf_event_context *ctx, 1333 struct perf_cpu_context *cpuctx) 1334{ 1335 struct perf_event *event; 1336 int can_add_hw = 1; 1337 1338 list_for_each_entry(event, &ctx->flexible_groups, group_entry) { 1339 /* Ignore events in OFF or ERROR state */ 1340 if (event->state <= PERF_EVENT_STATE_OFF) 1341 continue; 1342 /* 1343 * Listen to the 'cpu' scheduling filter constraint 1344 * of events: 1345 */ 1346 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1347 continue; 1348 1349 if (group_can_go_on(event, cpuctx, can_add_hw)) 1350 if (group_sched_in(event, cpuctx, ctx)) 1351 can_add_hw = 0; 1352 } 1353} 1354 1355static void 1356ctx_sched_in(struct perf_event_context *ctx, 1357 struct perf_cpu_context *cpuctx, 1358 enum event_type_t event_type) 1359{ 1360 raw_spin_lock(&ctx->lock); 1361 ctx->is_active = 1; 1362 if (likely(!ctx->nr_events)) 1363 goto out; 1364 1365 ctx->timestamp = perf_clock(); 1366 1367 perf_disable(); 1368 1369 /* 1370 * First go through the list and put on any pinned groups 1371 * in order to give them the best chance of going on. 1372 */ 1373 if (event_type & EVENT_PINNED) 1374 ctx_pinned_sched_in(ctx, cpuctx); 1375 1376 /* Then walk through the lower prio flexible groups */ 1377 if (event_type & EVENT_FLEXIBLE) 1378 ctx_flexible_sched_in(ctx, cpuctx); 1379 1380 perf_enable(); 1381 out: 1382 raw_spin_unlock(&ctx->lock); 1383} 1384 1385static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 1386 enum event_type_t event_type) 1387{ 1388 struct perf_event_context *ctx = &cpuctx->ctx; 1389 1390 ctx_sched_in(ctx, cpuctx, event_type); 1391} 1392 1393static void task_ctx_sched_in(struct task_struct *task, 1394 enum event_type_t event_type) 1395{ 1396 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1397 struct perf_event_context *ctx = task->perf_event_ctxp; 1398 1399 if (likely(!ctx)) 1400 return; 1401 if (cpuctx->task_ctx == ctx) 1402 return; 1403 ctx_sched_in(ctx, cpuctx, event_type); 1404 cpuctx->task_ctx = ctx; 1405} 1406/* 1407 * Called from scheduler to add the events of the current task 1408 * with interrupts disabled. 1409 * 1410 * We restore the event value and then enable it. 1411 * 1412 * This does not protect us against NMI, but enable() 1413 * sets the enabled bit in the control field of event _before_ 1414 * accessing the event control register. If a NMI hits, then it will 1415 * keep the event running. 1416 */ 1417void perf_event_task_sched_in(struct task_struct *task) 1418{ 1419 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1420 struct perf_event_context *ctx = task->perf_event_ctxp; 1421 1422 if (likely(!ctx)) 1423 return; 1424 1425 if (cpuctx->task_ctx == ctx) 1426 return; 1427 1428 perf_disable(); 1429 1430 /* 1431 * We want to keep the following priority order: 1432 * cpu pinned (that don't need to move), task pinned, 1433 * cpu flexible, task flexible. 
1434 */ 1435 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 1436 1437 ctx_sched_in(ctx, cpuctx, EVENT_PINNED); 1438 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 1439 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 1440 1441 cpuctx->task_ctx = ctx; 1442 1443 perf_enable(); 1444} 1445 1446#define MAX_INTERRUPTS (~0ULL) 1447 1448static void perf_log_throttle(struct perf_event *event, int enable); 1449 1450static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 1451{ 1452 u64 frequency = event->attr.sample_freq; 1453 u64 sec = NSEC_PER_SEC; 1454 u64 divisor, dividend; 1455 1456 int count_fls, nsec_fls, frequency_fls, sec_fls; 1457 1458 count_fls = fls64(count); 1459 nsec_fls = fls64(nsec); 1460 frequency_fls = fls64(frequency); 1461 sec_fls = 30; 1462 1463 /* 1464 * We got @count in @nsec, with a target of sample_freq HZ 1465 * the target period becomes: 1466 * 1467 * @count * 10^9 1468 * period = ------------------- 1469 * @nsec * sample_freq 1470 * 1471 */ 1472 1473 /* 1474 * Reduce accuracy by one bit such that @a and @b converge 1475 * to a similar magnitude. 1476 */ 1477#define REDUCE_FLS(a, b) \ 1478do { \ 1479 if (a##_fls > b##_fls) { \ 1480 a >>= 1; \ 1481 a##_fls--; \ 1482 } else { \ 1483 b >>= 1; \ 1484 b##_fls--; \ 1485 } \ 1486} while (0) 1487 1488 /* 1489 * Reduce accuracy until either term fits in a u64, then proceed with 1490 * the other, so that finally we can do a u64/u64 division. 1491 */ 1492 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { 1493 REDUCE_FLS(nsec, frequency); 1494 REDUCE_FLS(sec, count); 1495 } 1496 1497 if (count_fls + sec_fls > 64) { 1498 divisor = nsec * frequency; 1499 1500 while (count_fls + sec_fls > 64) { 1501 REDUCE_FLS(count, sec); 1502 divisor >>= 1; 1503 } 1504 1505 dividend = count * sec; 1506 } else { 1507 dividend = count * sec; 1508 1509 while (nsec_fls + frequency_fls > 64) { 1510 REDUCE_FLS(nsec, frequency); 1511 dividend >>= 1; 1512 } 1513 1514 divisor = nsec * frequency; 1515 } 1516 1517 if (!divisor) 1518 return dividend; 1519 1520 return div64_u64(dividend, divisor); 1521} 1522 1523static void perf_event_stop(struct perf_event *event) 1524{ 1525 if (!event->pmu->stop) 1526 return event->pmu->disable(event); 1527 1528 return event->pmu->stop(event); 1529} 1530 1531static int perf_event_start(struct perf_event *event) 1532{ 1533 if (!event->pmu->start) 1534 return event->pmu->enable(event); 1535 1536 return event->pmu->start(event); 1537} 1538 1539static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 1540{ 1541 struct hw_perf_event *hwc = &event->hw; 1542 s64 period, sample_period; 1543 s64 delta; 1544 1545 period = perf_calculate_period(event, nsec, count); 1546 1547 delta = (s64)(period - hwc->sample_period); 1548 delta = (delta + 7) / 8; /* low pass filter */ 1549 1550 sample_period = hwc->sample_period + delta; 1551 1552 if (!sample_period) 1553 sample_period = 1; 1554 1555 hwc->sample_period = sample_period; 1556 1557 if (local64_read(&hwc->period_left) > 8*sample_period) { 1558 perf_disable(); 1559 perf_event_stop(event); 1560 local64_set(&hwc->period_left, 0); 1561 perf_event_start(event); 1562 perf_enable(); 1563 } 1564} 1565 1566static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1567{ 1568 struct perf_event *event; 1569 struct hw_perf_event *hwc; 1570 u64 interrupts, now; 1571 s64 delta; 1572 1573 raw_spin_lock(&ctx->lock); 1574 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 1575 if (event->state != PERF_EVENT_STATE_ACTIVE) 1576 continue; 1577 1578 
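		/* Only adjust events that are allowed to run on this CPU: */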
if (event->cpu != -1 && event->cpu != smp_processor_id()) 1579 continue; 1580 1581 hwc = &event->hw; 1582 1583 interrupts = hwc->interrupts; 1584 hwc->interrupts = 0; 1585 1586 /* 1587 * unthrottle events on the tick 1588 */ 1589 if (interrupts == MAX_INTERRUPTS) { 1590 perf_log_throttle(event, 1); 1591 perf_disable(); 1592 event->pmu->unthrottle(event); 1593 perf_enable(); 1594 } 1595 1596 if (!event->attr.freq || !event->attr.sample_freq) 1597 continue; 1598 1599 perf_disable(); 1600 event->pmu->read(event); 1601 now = local64_read(&event->count); 1602 delta = now - hwc->freq_count_stamp; 1603 hwc->freq_count_stamp = now; 1604 1605 if (delta > 0) 1606 perf_adjust_period(event, TICK_NSEC, delta); 1607 perf_enable(); 1608 } 1609 raw_spin_unlock(&ctx->lock); 1610} 1611 1612/* 1613 * Round-robin a context's events: 1614 */ 1615static void rotate_ctx(struct perf_event_context *ctx) 1616{ 1617 raw_spin_lock(&ctx->lock); 1618 1619 /* 1620 * Rotate the first entry last of non-pinned groups. Rotation might be 1621 * disabled by the inheritance code. 1622 */ 1623 if (!ctx->rotate_disable) 1624 list_rotate_left(&ctx->flexible_groups); 1625 1626 raw_spin_unlock(&ctx->lock); 1627} 1628 1629void perf_event_task_tick(struct task_struct *curr) 1630{ 1631 struct perf_cpu_context *cpuctx; 1632 struct perf_event_context *ctx; 1633 int rotate = 0; 1634 1635 if (!atomic_read(&nr_events)) 1636 return; 1637 1638 cpuctx = &__get_cpu_var(perf_cpu_context); 1639 if (cpuctx->ctx.nr_events && 1640 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 1641 rotate = 1; 1642 1643 ctx = curr->perf_event_ctxp; 1644 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) 1645 rotate = 1; 1646 1647 perf_ctx_adjust_freq(&cpuctx->ctx); 1648 if (ctx) 1649 perf_ctx_adjust_freq(ctx); 1650 1651 if (!rotate) 1652 return; 1653 1654 perf_disable(); 1655 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 1656 if (ctx) 1657 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 1658 1659 rotate_ctx(&cpuctx->ctx); 1660 if (ctx) 1661 rotate_ctx(ctx); 1662 1663 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 1664 if (ctx) 1665 task_ctx_sched_in(curr, EVENT_FLEXIBLE); 1666 perf_enable(); 1667} 1668 1669static int event_enable_on_exec(struct perf_event *event, 1670 struct perf_event_context *ctx) 1671{ 1672 if (!event->attr.enable_on_exec) 1673 return 0; 1674 1675 event->attr.enable_on_exec = 0; 1676 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1677 return 0; 1678 1679 __perf_event_mark_enabled(event, ctx); 1680 1681 return 1; 1682} 1683 1684/* 1685 * Enable all of a task's events that have been marked enable-on-exec. 1686 * This expects task == current. 1687 */ 1688static void perf_event_enable_on_exec(struct task_struct *task) 1689{ 1690 struct perf_event_context *ctx; 1691 struct perf_event *event; 1692 unsigned long flags; 1693 int enabled = 0; 1694 int ret; 1695 1696 local_irq_save(flags); 1697 ctx = task->perf_event_ctxp; 1698 if (!ctx || !ctx->nr_events) 1699 goto out; 1700 1701 __perf_event_task_sched_out(ctx); 1702 1703 raw_spin_lock(&ctx->lock); 1704 1705 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 1706 ret = event_enable_on_exec(event, ctx); 1707 if (ret) 1708 enabled = 1; 1709 } 1710 1711 list_for_each_entry(event, &ctx->flexible_groups, group_entry) { 1712 ret = event_enable_on_exec(event, ctx); 1713 if (ret) 1714 enabled = 1; 1715 } 1716 1717 /* 1718 * Unclone this context if we enabled any event. 
	 */
	if (enabled)
		unclone_ctx(ctx);

	raw_spin_unlock(&ctx->lock);

	perf_event_task_sched_in(task);
out:
	local_irq_restore(flags);
}

/*
 * Cross CPU call to read the hardware event
 */
static void __perf_event_read(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu.  If not it has been
	 * scheduled out before the smp call arrived.  In that case
	 * event->count would have been updated to a recent sample
	 * when the event was scheduled out.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	raw_spin_lock(&ctx->lock);
	update_context_time(ctx);
	update_event_times(event);
	raw_spin_unlock(&ctx->lock);

	event->pmu->read(event);
}

static inline u64 perf_event_count(struct perf_event *event)
{
	return local64_read(&event->count) + atomic64_read(&event->child_count);
}

static u64 perf_event_read(struct perf_event *event)
{
	/*
	 * If event is enabled and currently active on a CPU, update the
	 * value in the event structure:
	 */
	if (event->state == PERF_EVENT_STATE_ACTIVE) {
		smp_call_function_single(event->oncpu,
					 __perf_event_read, event, 1);
	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
		struct perf_event_context *ctx = event->ctx;
		unsigned long flags;

		raw_spin_lock_irqsave(&ctx->lock, flags);
		/*
		 * may read while context is not active
		 * (e.g., thread is blocked), in that case
		 * we cannot update context time
		 */
		if (ctx->is_active)
			update_context_time(ctx);
		update_event_times(event);
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
	}

	return perf_event_count(event);
}

/*
 * Initialize the perf_event context in a task_struct:
 */
static void
__perf_event_init_context(struct perf_event_context *ctx,
			  struct task_struct *task)
{
	raw_spin_lock_init(&ctx->lock);
	mutex_init(&ctx->mutex);
	INIT_LIST_HEAD(&ctx->pinned_groups);
	INIT_LIST_HEAD(&ctx->flexible_groups);
	INIT_LIST_HEAD(&ctx->event_list);
	atomic_set(&ctx->refcount, 1);
	ctx->task = task;
}

static struct perf_event_context *find_get_context(pid_t pid, int cpu)
{
	struct perf_event_context *ctx;
	struct perf_cpu_context *cpuctx;
	struct task_struct *task;
	unsigned long flags;
	int err;

	if (pid == -1 && cpu != -1) {
		/* Must be root to operate on a CPU event: */
		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
			return ERR_PTR(-EACCES);

		if (cpu < 0 || cpu >= nr_cpumask_bits)
			return ERR_PTR(-EINVAL);

		/*
		 * We could be clever and allow attaching an event to an
		 * offline CPU and activate it when the CPU comes up, but
		 * that's for later.
1826 */ 1827 if (!cpu_online(cpu)) 1828 return ERR_PTR(-ENODEV); 1829 1830 cpuctx = &per_cpu(perf_cpu_context, cpu); 1831 ctx = &cpuctx->ctx; 1832 get_ctx(ctx); 1833 1834 return ctx; 1835 } 1836 1837 rcu_read_lock(); 1838 if (!pid) 1839 task = current; 1840 else 1841 task = find_task_by_vpid(pid); 1842 if (task) 1843 get_task_struct(task); 1844 rcu_read_unlock(); 1845 1846 if (!task) 1847 return ERR_PTR(-ESRCH); 1848 1849 /* 1850 * Can't attach events to a dying task. 1851 */ 1852 err = -ESRCH; 1853 if (task->flags & PF_EXITING) 1854 goto errout; 1855 1856 /* Reuse ptrace permission checks for now. */ 1857 err = -EACCES; 1858 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 1859 goto errout; 1860 1861 retry: 1862 ctx = perf_lock_task_context(task, &flags); 1863 if (ctx) { 1864 unclone_ctx(ctx); 1865 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1866 } 1867 1868 if (!ctx) { 1869 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); 1870 err = -ENOMEM; 1871 if (!ctx) 1872 goto errout; 1873 __perf_event_init_context(ctx, task); 1874 get_ctx(ctx); 1875 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { 1876 /* 1877 * We raced with some other task; use 1878 * the context they set. 1879 */ 1880 kfree(ctx); 1881 goto retry; 1882 } 1883 get_task_struct(task); 1884 } 1885 1886 put_task_struct(task); 1887 return ctx; 1888 1889 errout: 1890 put_task_struct(task); 1891 return ERR_PTR(err); 1892} 1893 1894static void perf_event_free_filter(struct perf_event *event); 1895 1896static void free_event_rcu(struct rcu_head *head) 1897{ 1898 struct perf_event *event; 1899 1900 event = container_of(head, struct perf_event, rcu_head); 1901 if (event->ns) 1902 put_pid_ns(event->ns); 1903 perf_event_free_filter(event); 1904 kfree(event); 1905} 1906 1907static void perf_pending_sync(struct perf_event *event); 1908static void perf_buffer_put(struct perf_buffer *buffer); 1909 1910static void free_event(struct perf_event *event) 1911{ 1912 perf_pending_sync(event); 1913 1914 if (!event->parent) { 1915 atomic_dec(&nr_events); 1916 if (event->attr.mmap || event->attr.mmap_data) 1917 atomic_dec(&nr_mmap_events); 1918 if (event->attr.comm) 1919 atomic_dec(&nr_comm_events); 1920 if (event->attr.task) 1921 atomic_dec(&nr_task_events); 1922 } 1923 1924 if (event->buffer) { 1925 perf_buffer_put(event->buffer); 1926 event->buffer = NULL; 1927 } 1928 1929 if (event->destroy) 1930 event->destroy(event); 1931 1932 put_ctx(event->ctx); 1933 call_rcu(&event->rcu_head, free_event_rcu); 1934} 1935 1936int perf_event_release_kernel(struct perf_event *event) 1937{ 1938 struct perf_event_context *ctx = event->ctx; 1939 1940 /* 1941 * Remove from the PMU, can't get re-enabled since we got 1942 * here because the last ref went. 1943 */ 1944 perf_event_disable(event); 1945 1946 WARN_ON_ONCE(ctx->parent_ctx); 1947 /* 1948 * There are two ways this annotation is useful: 1949 * 1950 * 1) there is a lock recursion from perf_event_exit_task 1951 * see the comment there. 1952 * 1953 * 2) there is a lock-inversion with mmap_sem through 1954 * perf_event_read_group(), which takes faults while 1955 * holding ctx->mutex, however this is called after 1956 * the last filedesc died, so there is no possibility 1957 * to trigger the AB-BA case. 
1958 */ 1959 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); 1960 raw_spin_lock_irq(&ctx->lock); 1961 perf_group_detach(event); 1962 list_del_event(event, ctx); 1963 raw_spin_unlock_irq(&ctx->lock); 1964 mutex_unlock(&ctx->mutex); 1965 1966 mutex_lock(&event->owner->perf_event_mutex); 1967 list_del_init(&event->owner_entry); 1968 mutex_unlock(&event->owner->perf_event_mutex); 1969 put_task_struct(event->owner); 1970 1971 free_event(event); 1972 1973 return 0; 1974} 1975EXPORT_SYMBOL_GPL(perf_event_release_kernel); 1976 1977/* 1978 * Called when the last reference to the file is gone. 1979 */ 1980static int perf_release(struct inode *inode, struct file *file) 1981{ 1982 struct perf_event *event = file->private_data; 1983 1984 file->private_data = NULL; 1985 1986 return perf_event_release_kernel(event); 1987} 1988 1989static int perf_event_read_size(struct perf_event *event) 1990{ 1991 int entry = sizeof(u64); /* value */ 1992 int size = 0; 1993 int nr = 1; 1994 1995 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 1996 size += sizeof(u64); 1997 1998 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 1999 size += sizeof(u64); 2000 2001 if (event->attr.read_format & PERF_FORMAT_ID) 2002 entry += sizeof(u64); 2003 2004 if (event->attr.read_format & PERF_FORMAT_GROUP) { 2005 nr += event->group_leader->nr_siblings; 2006 size += sizeof(u64); 2007 } 2008 2009 size += entry * nr; 2010 2011 return size; 2012} 2013 2014u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 2015{ 2016 struct perf_event *child; 2017 u64 total = 0; 2018 2019 *enabled = 0; 2020 *running = 0; 2021 2022 mutex_lock(&event->child_mutex); 2023 total += perf_event_read(event); 2024 *enabled += event->total_time_enabled + 2025 atomic64_read(&event->child_total_time_enabled); 2026 *running += event->total_time_running + 2027 atomic64_read(&event->child_total_time_running); 2028 2029 list_for_each_entry(child, &event->child_list, child_list) { 2030 total += perf_event_read(child); 2031 *enabled += child->total_time_enabled; 2032 *running += child->total_time_running; 2033 } 2034 mutex_unlock(&event->child_mutex); 2035 2036 return total; 2037} 2038EXPORT_SYMBOL_GPL(perf_event_read_value); 2039 2040static int perf_event_read_group(struct perf_event *event, 2041 u64 read_format, char __user *buf) 2042{ 2043 struct perf_event *leader = event->group_leader, *sub; 2044 int n = 0, size = 0, ret = -EFAULT; 2045 struct perf_event_context *ctx = leader->ctx; 2046 u64 values[5]; 2047 u64 count, enabled, running; 2048 2049 mutex_lock(&ctx->mutex); 2050 count = perf_event_read_value(leader, &enabled, &running); 2051 2052 values[n++] = 1 + leader->nr_siblings; 2053 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 2054 values[n++] = enabled; 2055 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 2056 values[n++] = running; 2057 values[n++] = count; 2058 if (read_format & PERF_FORMAT_ID) 2059 values[n++] = primary_event_id(leader); 2060 2061 size = n * sizeof(u64); 2062 2063 if (copy_to_user(buf, values, size)) 2064 goto unlock; 2065 2066 ret = size; 2067 2068 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 2069 n = 0; 2070 2071 values[n++] = perf_event_read_value(sub, &enabled, &running); 2072 if (read_format & PERF_FORMAT_ID) 2073 values[n++] = primary_event_id(sub); 2074 2075 size = n * sizeof(u64); 2076 2077 if (copy_to_user(buf + ret, values, size)) { 2078 ret = -EFAULT; 2079 goto unlock; 2080 } 2081 2082 ret += size; 2083 } 2084unlock: 2085 mutex_unlock(&ctx->mutex); 
2086 2087 return ret; 2088} 2089 2090static int perf_event_read_one(struct perf_event *event, 2091 u64 read_format, char __user *buf) 2092{ 2093 u64 enabled, running; 2094 u64 values[4]; 2095 int n = 0; 2096 2097 values[n++] = perf_event_read_value(event, &enabled, &running); 2098 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 2099 values[n++] = enabled; 2100 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 2101 values[n++] = running; 2102 if (read_format & PERF_FORMAT_ID) 2103 values[n++] = primary_event_id(event); 2104 2105 if (copy_to_user(buf, values, n * sizeof(u64))) 2106 return -EFAULT; 2107 2108 return n * sizeof(u64); 2109} 2110 2111/* 2112 * Read the performance event - simple non blocking version for now 2113 */ 2114static ssize_t 2115perf_read_hw(struct perf_event *event, char __user *buf, size_t count) 2116{ 2117 u64 read_format = event->attr.read_format; 2118 int ret; 2119 2120 /* 2121 * Return end-of-file for a read on a event that is in 2122 * error state (i.e. because it was pinned but it couldn't be 2123 * scheduled on to the CPU at some point). 2124 */ 2125 if (event->state == PERF_EVENT_STATE_ERROR) 2126 return 0; 2127 2128 if (count < perf_event_read_size(event)) 2129 return -ENOSPC; 2130 2131 WARN_ON_ONCE(event->ctx->parent_ctx); 2132 if (read_format & PERF_FORMAT_GROUP) 2133 ret = perf_event_read_group(event, read_format, buf); 2134 else 2135 ret = perf_event_read_one(event, read_format, buf); 2136 2137 return ret; 2138} 2139 2140static ssize_t 2141perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) 2142{ 2143 struct perf_event *event = file->private_data; 2144 2145 return perf_read_hw(event, buf, count); 2146} 2147 2148static unsigned int perf_poll(struct file *file, poll_table *wait) 2149{ 2150 struct perf_event *event = file->private_data; 2151 struct perf_buffer *buffer; 2152 unsigned int events = POLL_HUP; 2153 2154 rcu_read_lock(); 2155 buffer = rcu_dereference(event->buffer); 2156 if (buffer) 2157 events = atomic_xchg(&buffer->poll, 0); 2158 rcu_read_unlock(); 2159 2160 poll_wait(file, &event->waitq, wait); 2161 2162 return events; 2163} 2164 2165static void perf_event_reset(struct perf_event *event) 2166{ 2167 (void)perf_event_read(event); 2168 local64_set(&event->count, 0); 2169 perf_event_update_userpage(event); 2170} 2171 2172/* 2173 * Holding the top-level event's child_mutex means that any 2174 * descendant process that has inherited this event will block 2175 * in sync_child_event if it goes to exit, thus satisfying the 2176 * task existence requirements of perf_event_enable/disable. 
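 *
 * This is what the PERF_EVENT_IOC_{ENABLE,DISABLE,RESET} ioctls below
 * rely on: they hand perf_event_enable, perf_event_disable or
 * perf_event_reset to perf_event_for_each_child() (or, with
 * PERF_IOC_FLAG_GROUP, to perf_event_for_each() for the whole group).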
 */
static void perf_event_for_each_child(struct perf_event *event,
                                      void (*func)(struct perf_event *))
{
        struct perf_event *child;

        WARN_ON_ONCE(event->ctx->parent_ctx);
        mutex_lock(&event->child_mutex);
        func(event);
        list_for_each_entry(child, &event->child_list, child_list)
                func(child);
        mutex_unlock(&event->child_mutex);
}

static void perf_event_for_each(struct perf_event *event,
                                void (*func)(struct perf_event *))
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *sibling;

        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
        event = event->group_leader;

        perf_event_for_each_child(event, func);
        func(event);
        list_for_each_entry(sibling, &event->sibling_list, group_entry)
                perf_event_for_each_child(sibling, func);
        mutex_unlock(&ctx->mutex);
}

static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
        struct perf_event_context *ctx = event->ctx;
        int ret = 0;
        u64 value;

        if (!event->attr.sample_period)
                return -EINVAL;

        if (copy_from_user(&value, arg, sizeof(value)))
                return -EFAULT;

        if (!value)
                return -EINVAL;

        raw_spin_lock_irq(&ctx->lock);
        if (event->attr.freq) {
                if (value > sysctl_perf_event_sample_rate) {
                        ret = -EINVAL;
                        goto unlock;
                }

                event->attr.sample_freq = value;
        } else {
                event->attr.sample_period = value;
                event->hw.sample_period = value;
        }
unlock:
        raw_spin_unlock_irq(&ctx->lock);

        return ret;
}

static const struct file_operations perf_fops;

static struct perf_event *perf_fget_light(int fd, int *fput_needed)
{
        struct file *file;

        file = fget_light(fd, fput_needed);
        if (!file)
                return ERR_PTR(-EBADF);

        if (file->f_op != &perf_fops) {
                fput_light(file, *fput_needed);
                *fput_needed = 0;
                return ERR_PTR(-EBADF);
        }

        return file->private_data;
}

static int perf_event_set_output(struct perf_event *event,
                                 struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);

static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct perf_event *event = file->private_data;
        void (*func)(struct perf_event *);
        u32 flags = arg;

        switch (cmd) {
        case PERF_EVENT_IOC_ENABLE:
                func = perf_event_enable;
                break;
        case PERF_EVENT_IOC_DISABLE:
                func = perf_event_disable;
                break;
        case PERF_EVENT_IOC_RESET:
                func = perf_event_reset;
                break;

        case PERF_EVENT_IOC_REFRESH:
                return perf_event_refresh(event, arg);

        case PERF_EVENT_IOC_PERIOD:
                return perf_event_period(event, (u64 __user *)arg);

        case PERF_EVENT_IOC_SET_OUTPUT:
        {
                struct perf_event *output_event = NULL;
                int fput_needed = 0;
                int ret;

                if (arg != -1) {
                        output_event = perf_fget_light(arg, &fput_needed);
                        if (IS_ERR(output_event))
                                return PTR_ERR(output_event);
                }

                ret = perf_event_set_output(event, output_event);
                if (output_event)
                        fput_light(output_event->filp, fput_needed);

                return ret;
        }

        case PERF_EVENT_IOC_SET_FILTER:
                return perf_event_set_filter(event, (void __user *)arg);

        default:
                return -ENOTTY;
        }

        if (flags & PERF_IOC_FLAG_GROUP)
                perf_event_for_each(event, func);
        else
                perf_event_for_each_child(event, func);

        return 0;
}

int perf_event_task_enable(void)
{
        struct perf_event *event;

        mutex_lock(&current->perf_event_mutex);
        list_for_each_entry(event, &current->perf_event_list, owner_entry)
                perf_event_for_each_child(event, perf_event_enable);
        mutex_unlock(&current->perf_event_mutex);

        return 0;
}

int perf_event_task_disable(void)
{
        struct perf_event *event;

        mutex_lock(&current->perf_event_mutex);
        list_for_each_entry(event, &current->perf_event_list, owner_entry)
                perf_event_for_each_child(event, perf_event_disable);
        mutex_unlock(&current->perf_event_mutex);

        return 0;
}

#ifndef PERF_EVENT_INDEX_OFFSET
# define PERF_EVENT_INDEX_OFFSET 0
#endif

static int perf_event_index(struct perf_event *event)
{
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return 0;

        return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
}

/*
 * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We can not serialize this because the arch
 * code calls this from NMI context.
 */
void perf_event_update_userpage(struct perf_event *event)
{
        struct perf_event_mmap_page *userpg;
        struct perf_buffer *buffer;

        rcu_read_lock();
        buffer = rcu_dereference(event->buffer);
        if (!buffer)
                goto unlock;

        userpg = buffer->user_page;

        /*
         * Disable preemption so as to not let the corresponding user-space
         * spin too long if we get preempted.
         */
        preempt_disable();
        ++userpg->lock;
        barrier();
        userpg->index = perf_event_index(event);
        userpg->offset = perf_event_count(event);
        if (event->state == PERF_EVENT_STATE_ACTIVE)
                userpg->offset -= local64_read(&event->hw.prev_count);

        userpg->time_enabled = event->total_time_enabled +
                        atomic64_read(&event->child_total_time_enabled);

        userpg->time_running = event->total_time_running +
                        atomic64_read(&event->child_total_time_running);

        barrier();
        ++userpg->lock;
        preempt_enable();
unlock:
        rcu_read_unlock();
}

static unsigned long perf_data_size(struct perf_buffer *buffer);

static void
perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
{
        long max_size = perf_data_size(buffer);

        if (watermark)
                buffer->watermark = min(max_size, watermark);

        if (!buffer->watermark)
                buffer->watermark = max_size / 2;

        if (flags & PERF_BUFFER_WRITABLE)
                buffer->writable = 1;

        atomic_set(&buffer->refcount, 1);
}

#ifndef CONFIG_PERF_USE_VMALLOC

/*
 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
 */

static struct page *
perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
{
        if (pgoff > buffer->nr_pages)
                return NULL;

        if (pgoff == 0)
                return virt_to_page(buffer->user_page);

        return virt_to_page(buffer->data_pages[pgoff - 1]);
}

static void *perf_mmap_alloc_page(int cpu)
{
        struct page *page;
        int node;

        node = (cpu == -1) ?
cpu : cpu_to_node(cpu); 2442 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 2443 if (!page) 2444 return NULL; 2445 2446 return page_address(page); 2447} 2448 2449static struct perf_buffer * 2450perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) 2451{ 2452 struct perf_buffer *buffer; 2453 unsigned long size; 2454 int i; 2455 2456 size = sizeof(struct perf_buffer); 2457 size += nr_pages * sizeof(void *); 2458 2459 buffer = kzalloc(size, GFP_KERNEL); 2460 if (!buffer) 2461 goto fail; 2462 2463 buffer->user_page = perf_mmap_alloc_page(cpu); 2464 if (!buffer->user_page) 2465 goto fail_user_page; 2466 2467 for (i = 0; i < nr_pages; i++) { 2468 buffer->data_pages[i] = perf_mmap_alloc_page(cpu); 2469 if (!buffer->data_pages[i]) 2470 goto fail_data_pages; 2471 } 2472 2473 buffer->nr_pages = nr_pages; 2474 2475 perf_buffer_init(buffer, watermark, flags); 2476 2477 return buffer; 2478 2479fail_data_pages: 2480 for (i--; i >= 0; i--) 2481 free_page((unsigned long)buffer->data_pages[i]); 2482 2483 free_page((unsigned long)buffer->user_page); 2484 2485fail_user_page: 2486 kfree(buffer); 2487 2488fail: 2489 return NULL; 2490} 2491 2492static void perf_mmap_free_page(unsigned long addr) 2493{ 2494 struct page *page = virt_to_page((void *)addr); 2495 2496 page->mapping = NULL; 2497 __free_page(page); 2498} 2499 2500static void perf_buffer_free(struct perf_buffer *buffer) 2501{ 2502 int i; 2503 2504 perf_mmap_free_page((unsigned long)buffer->user_page); 2505 for (i = 0; i < buffer->nr_pages; i++) 2506 perf_mmap_free_page((unsigned long)buffer->data_pages[i]); 2507 kfree(buffer); 2508} 2509 2510static inline int page_order(struct perf_buffer *buffer) 2511{ 2512 return 0; 2513} 2514 2515#else 2516 2517/* 2518 * Back perf_mmap() with vmalloc memory. 2519 * 2520 * Required for architectures that have d-cache aliasing issues. 
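 *
 * Instead of an array of individually allocated pages, one contiguous
 * vmalloc_user() area backs the user page and all data pages.  The
 * buffer is then treated as a single high-order page: nr_pages is set
 * to 1 and page_order() returns ilog2() of the requested page count,
 * so perf_data_size() and the offset masks below work unchanged.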
2521 */ 2522 2523static inline int page_order(struct perf_buffer *buffer) 2524{ 2525 return buffer->page_order; 2526} 2527 2528static struct page * 2529perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) 2530{ 2531 if (pgoff > (1UL << page_order(buffer))) 2532 return NULL; 2533 2534 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); 2535} 2536 2537static void perf_mmap_unmark_page(void *addr) 2538{ 2539 struct page *page = vmalloc_to_page(addr); 2540 2541 page->mapping = NULL; 2542} 2543 2544static void perf_buffer_free_work(struct work_struct *work) 2545{ 2546 struct perf_buffer *buffer; 2547 void *base; 2548 int i, nr; 2549 2550 buffer = container_of(work, struct perf_buffer, work); 2551 nr = 1 << page_order(buffer); 2552 2553 base = buffer->user_page; 2554 for (i = 0; i < nr + 1; i++) 2555 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 2556 2557 vfree(base); 2558 kfree(buffer); 2559} 2560 2561static void perf_buffer_free(struct perf_buffer *buffer) 2562{ 2563 schedule_work(&buffer->work); 2564} 2565 2566static struct perf_buffer * 2567perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) 2568{ 2569 struct perf_buffer *buffer; 2570 unsigned long size; 2571 void *all_buf; 2572 2573 size = sizeof(struct perf_buffer); 2574 size += sizeof(void *); 2575 2576 buffer = kzalloc(size, GFP_KERNEL); 2577 if (!buffer) 2578 goto fail; 2579 2580 INIT_WORK(&buffer->work, perf_buffer_free_work); 2581 2582 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); 2583 if (!all_buf) 2584 goto fail_all_buf; 2585 2586 buffer->user_page = all_buf; 2587 buffer->data_pages[0] = all_buf + PAGE_SIZE; 2588 buffer->page_order = ilog2(nr_pages); 2589 buffer->nr_pages = 1; 2590 2591 perf_buffer_init(buffer, watermark, flags); 2592 2593 return buffer; 2594 2595fail_all_buf: 2596 kfree(buffer); 2597 2598fail: 2599 return NULL; 2600} 2601 2602#endif 2603 2604static unsigned long perf_data_size(struct perf_buffer *buffer) 2605{ 2606 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); 2607} 2608 2609static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2610{ 2611 struct perf_event *event = vma->vm_file->private_data; 2612 struct perf_buffer *buffer; 2613 int ret = VM_FAULT_SIGBUS; 2614 2615 if (vmf->flags & FAULT_FLAG_MKWRITE) { 2616 if (vmf->pgoff == 0) 2617 ret = 0; 2618 return ret; 2619 } 2620 2621 rcu_read_lock(); 2622 buffer = rcu_dereference(event->buffer); 2623 if (!buffer) 2624 goto unlock; 2625 2626 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 2627 goto unlock; 2628 2629 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); 2630 if (!vmf->page) 2631 goto unlock; 2632 2633 get_page(vmf->page); 2634 vmf->page->mapping = vma->vm_file->f_mapping; 2635 vmf->page->index = vmf->pgoff; 2636 2637 ret = 0; 2638unlock: 2639 rcu_read_unlock(); 2640 2641 return ret; 2642} 2643 2644static void perf_buffer_free_rcu(struct rcu_head *rcu_head) 2645{ 2646 struct perf_buffer *buffer; 2647 2648 buffer = container_of(rcu_head, struct perf_buffer, rcu_head); 2649 perf_buffer_free(buffer); 2650} 2651 2652static struct perf_buffer *perf_buffer_get(struct perf_event *event) 2653{ 2654 struct perf_buffer *buffer; 2655 2656 rcu_read_lock(); 2657 buffer = rcu_dereference(event->buffer); 2658 if (buffer) { 2659 if (!atomic_inc_not_zero(&buffer->refcount)) 2660 buffer = NULL; 2661 } 2662 rcu_read_unlock(); 2663 2664 return buffer; 2665} 2666 2667static void perf_buffer_put(struct perf_buffer *buffer) 2668{ 2669 if (!atomic_dec_and_test(&buffer->refcount)) 2670 
return; 2671 2672 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); 2673} 2674 2675static void perf_mmap_open(struct vm_area_struct *vma) 2676{ 2677 struct perf_event *event = vma->vm_file->private_data; 2678 2679 atomic_inc(&event->mmap_count); 2680} 2681 2682static void perf_mmap_close(struct vm_area_struct *vma) 2683{ 2684 struct perf_event *event = vma->vm_file->private_data; 2685 2686 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 2687 unsigned long size = perf_data_size(event->buffer); 2688 struct user_struct *user = event->mmap_user; 2689 struct perf_buffer *buffer = event->buffer; 2690 2691 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 2692 vma->vm_mm->locked_vm -= event->mmap_locked; 2693 rcu_assign_pointer(event->buffer, NULL); 2694 mutex_unlock(&event->mmap_mutex); 2695 2696 perf_buffer_put(buffer); 2697 free_uid(user); 2698 } 2699} 2700 2701static const struct vm_operations_struct perf_mmap_vmops = { 2702 .open = perf_mmap_open, 2703 .close = perf_mmap_close, 2704 .fault = perf_mmap_fault, 2705 .page_mkwrite = perf_mmap_fault, 2706}; 2707 2708static int perf_mmap(struct file *file, struct vm_area_struct *vma) 2709{ 2710 struct perf_event *event = file->private_data; 2711 unsigned long user_locked, user_lock_limit; 2712 struct user_struct *user = current_user(); 2713 unsigned long locked, lock_limit; 2714 struct perf_buffer *buffer; 2715 unsigned long vma_size; 2716 unsigned long nr_pages; 2717 long user_extra, extra; 2718 int ret = 0, flags = 0; 2719 2720 /* 2721 * Don't allow mmap() of inherited per-task counters. This would 2722 * create a performance issue due to all children writing to the 2723 * same buffer. 2724 */ 2725 if (event->cpu == -1 && event->attr.inherit) 2726 return -EINVAL; 2727 2728 if (!(vma->vm_flags & VM_SHARED)) 2729 return -EINVAL; 2730 2731 vma_size = vma->vm_end - vma->vm_start; 2732 nr_pages = (vma_size / PAGE_SIZE) - 1; 2733 2734 /* 2735 * If we have buffer pages ensure they're a power-of-two number, so we 2736 * can do bitmasks instead of modulo. 
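 *
 * For example, an mmap() of (1 + 8) * PAGE_SIZE yields one control
 * page plus nr_pages == 8 data pages, so ring-buffer offsets can be
 * wrapped with "& (perf_data_size(buffer) - 1)" instead of a modulo.
 * A rough user-space sketch (fd obtained from sys_perf_event_open()):
 *
 *	base = mmap(NULL, (1 + 8) * page_size, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, fd, 0);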
2737 */ 2738 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 2739 return -EINVAL; 2740 2741 if (vma_size != PAGE_SIZE * (1 + nr_pages)) 2742 return -EINVAL; 2743 2744 if (vma->vm_pgoff != 0) 2745 return -EINVAL; 2746 2747 WARN_ON_ONCE(event->ctx->parent_ctx); 2748 mutex_lock(&event->mmap_mutex); 2749 if (event->buffer) { 2750 if (event->buffer->nr_pages == nr_pages) 2751 atomic_inc(&event->buffer->refcount); 2752 else 2753 ret = -EINVAL; 2754 goto unlock; 2755 } 2756 2757 user_extra = nr_pages + 1; 2758 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); 2759 2760 /* 2761 * Increase the limit linearly with more CPUs: 2762 */ 2763 user_lock_limit *= num_online_cpus(); 2764 2765 user_locked = atomic_long_read(&user->locked_vm) + user_extra; 2766 2767 extra = 0; 2768 if (user_locked > user_lock_limit) 2769 extra = user_locked - user_lock_limit; 2770 2771 lock_limit = rlimit(RLIMIT_MEMLOCK); 2772 lock_limit >>= PAGE_SHIFT; 2773 locked = vma->vm_mm->locked_vm + extra; 2774 2775 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && 2776 !capable(CAP_IPC_LOCK)) { 2777 ret = -EPERM; 2778 goto unlock; 2779 } 2780 2781 WARN_ON(event->buffer); 2782 2783 if (vma->vm_flags & VM_WRITE) 2784 flags |= PERF_BUFFER_WRITABLE; 2785 2786 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, 2787 event->cpu, flags); 2788 if (!buffer) { 2789 ret = -ENOMEM; 2790 goto unlock; 2791 } 2792 rcu_assign_pointer(event->buffer, buffer); 2793 2794 atomic_long_add(user_extra, &user->locked_vm); 2795 event->mmap_locked = extra; 2796 event->mmap_user = get_current_user(); 2797 vma->vm_mm->locked_vm += event->mmap_locked; 2798 2799unlock: 2800 if (!ret) 2801 atomic_inc(&event->mmap_count); 2802 mutex_unlock(&event->mmap_mutex); 2803 2804 vma->vm_flags |= VM_RESERVED; 2805 vma->vm_ops = &perf_mmap_vmops; 2806 2807 return ret; 2808} 2809 2810static int perf_fasync(int fd, struct file *filp, int on) 2811{ 2812 struct inode *inode = filp->f_path.dentry->d_inode; 2813 struct perf_event *event = filp->private_data; 2814 int retval; 2815 2816 mutex_lock(&inode->i_mutex); 2817 retval = fasync_helper(fd, filp, on, &event->fasync); 2818 mutex_unlock(&inode->i_mutex); 2819 2820 if (retval < 0) 2821 return retval; 2822 2823 return 0; 2824} 2825 2826static const struct file_operations perf_fops = { 2827 .llseek = no_llseek, 2828 .release = perf_release, 2829 .read = perf_read, 2830 .poll = perf_poll, 2831 .unlocked_ioctl = perf_ioctl, 2832 .compat_ioctl = perf_ioctl, 2833 .mmap = perf_mmap, 2834 .fasync = perf_fasync, 2835}; 2836 2837/* 2838 * Perf event wakeup 2839 * 2840 * If there's data, ensure we set the poll() state and publish everything 2841 * to user-space before waking everybody up. 2842 */ 2843 2844void perf_event_wakeup(struct perf_event *event) 2845{ 2846 wake_up_all(&event->waitq); 2847 2848 if (event->pending_kill) { 2849 kill_fasync(&event->fasync, SIGIO, event->pending_kill); 2850 event->pending_kill = 0; 2851 } 2852} 2853 2854/* 2855 * Pending wakeups 2856 * 2857 * Handle the case where we need to wakeup up from NMI (or rq->lock) context. 2858 * 2859 * The NMI bit means we cannot possibly take locks. Therefore, maintain a 2860 * single linked list and use cmpxchg() to add entries lockless. 
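 *
 * Roughly, perf_pending_queue() below pushes an entry with
 *
 *	do {
 *		entry->next = *head;
 *	} while (cmpxchg(head, entry->next, entry) != entry->next);
 *
 * and __perf_pending_run() drains the whole list with one xchg(),
 * using PENDING_TAIL as the end-of-list marker so that a NULL ->next
 * can mean "not queued".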
2861 */ 2862 2863static void perf_pending_event(struct perf_pending_entry *entry) 2864{ 2865 struct perf_event *event = container_of(entry, 2866 struct perf_event, pending); 2867 2868 if (event->pending_disable) { 2869 event->pending_disable = 0; 2870 __perf_event_disable(event); 2871 } 2872 2873 if (event->pending_wakeup) { 2874 event->pending_wakeup = 0; 2875 perf_event_wakeup(event); 2876 } 2877} 2878 2879#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) 2880 2881static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { 2882 PENDING_TAIL, 2883}; 2884 2885static void perf_pending_queue(struct perf_pending_entry *entry, 2886 void (*func)(struct perf_pending_entry *)) 2887{ 2888 struct perf_pending_entry **head; 2889 2890 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) 2891 return; 2892 2893 entry->func = func; 2894 2895 head = &get_cpu_var(perf_pending_head); 2896 2897 do { 2898 entry->next = *head; 2899 } while (cmpxchg(head, entry->next, entry) != entry->next); 2900 2901 set_perf_event_pending(); 2902 2903 put_cpu_var(perf_pending_head); 2904} 2905 2906static int __perf_pending_run(void) 2907{ 2908 struct perf_pending_entry *list; 2909 int nr = 0; 2910 2911 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); 2912 while (list != PENDING_TAIL) { 2913 void (*func)(struct perf_pending_entry *); 2914 struct perf_pending_entry *entry = list; 2915 2916 list = list->next; 2917 2918 func = entry->func; 2919 entry->next = NULL; 2920 /* 2921 * Ensure we observe the unqueue before we issue the wakeup, 2922 * so that we won't be waiting forever. 2923 * -- see perf_not_pending(). 2924 */ 2925 smp_wmb(); 2926 2927 func(entry); 2928 nr++; 2929 } 2930 2931 return nr; 2932} 2933 2934static inline int perf_not_pending(struct perf_event *event) 2935{ 2936 /* 2937 * If we flush on whatever cpu we run, there is a chance we don't 2938 * need to wait. 2939 */ 2940 get_cpu(); 2941 __perf_pending_run(); 2942 put_cpu(); 2943 2944 /* 2945 * Ensure we see the proper queue state before going to sleep 2946 * so that we do not miss the wakeup. -- see perf_pending_handle() 2947 */ 2948 smp_rmb(); 2949 return event->pending.next == NULL; 2950} 2951 2952static void perf_pending_sync(struct perf_event *event) 2953{ 2954 wait_event(event->waitq, perf_not_pending(event)); 2955} 2956 2957void perf_event_do_pending(void) 2958{ 2959 __perf_pending_run(); 2960} 2961 2962/* 2963 * Callchain support -- arch specific 2964 */ 2965 2966__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) 2967{ 2968 return NULL; 2969} 2970 2971 2972/* 2973 * We assume there is only KVM supporting the callbacks. 2974 * Later on, we might change it to a list if there is 2975 * another virtualization implementation supporting the callbacks. 
2976 */ 2977struct perf_guest_info_callbacks *perf_guest_cbs; 2978 2979int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) 2980{ 2981 perf_guest_cbs = cbs; 2982 return 0; 2983} 2984EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); 2985 2986int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) 2987{ 2988 perf_guest_cbs = NULL; 2989 return 0; 2990} 2991EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); 2992 2993/* 2994 * Output 2995 */ 2996static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, 2997 unsigned long offset, unsigned long head) 2998{ 2999 unsigned long mask; 3000 3001 if (!buffer->writable) 3002 return true; 3003 3004 mask = perf_data_size(buffer) - 1; 3005 3006 offset = (offset - tail) & mask; 3007 head = (head - tail) & mask; 3008 3009 if ((int)(head - offset) < 0) 3010 return false; 3011 3012 return true; 3013} 3014 3015static void perf_output_wakeup(struct perf_output_handle *handle) 3016{ 3017 atomic_set(&handle->buffer->poll, POLL_IN); 3018 3019 if (handle->nmi) { 3020 handle->event->pending_wakeup = 1; 3021 perf_pending_queue(&handle->event->pending, 3022 perf_pending_event); 3023 } else 3024 perf_event_wakeup(handle->event); 3025} 3026 3027/* 3028 * We need to ensure a later event_id doesn't publish a head when a former 3029 * event isn't done writing. However since we need to deal with NMIs we 3030 * cannot fully serialize things. 3031 * 3032 * We only publish the head (and generate a wakeup) when the outer-most 3033 * event completes. 3034 */ 3035static void perf_output_get_handle(struct perf_output_handle *handle) 3036{ 3037 struct perf_buffer *buffer = handle->buffer; 3038 3039 preempt_disable(); 3040 local_inc(&buffer->nest); 3041 handle->wakeup = local_read(&buffer->wakeup); 3042} 3043 3044static void perf_output_put_handle(struct perf_output_handle *handle) 3045{ 3046 struct perf_buffer *buffer = handle->buffer; 3047 unsigned long head; 3048 3049again: 3050 head = local_read(&buffer->head); 3051 3052 /* 3053 * IRQ/NMI can happen here, which means we can miss a head update. 3054 */ 3055 3056 if (!local_dec_and_test(&buffer->nest)) 3057 goto out; 3058 3059 /* 3060 * Publish the known good head. Rely on the full barrier implied 3061 * by atomic_dec_and_test() order the buffer->head read and this 3062 * write. 3063 */ 3064 buffer->user_page->data_head = head; 3065 3066 /* 3067 * Now check if we missed an update, rely on the (compiler) 3068 * barrier in atomic_dec_and_test() to re-read buffer->head. 
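 *
 * That is: if an IRQ/NMI writer pushed more data between the head
 * load above and the nest decrement, buffer->head will have moved,
 * so we take another pass and publish the newer value as well.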
3069 */ 3070 if (unlikely(head != local_read(&buffer->head))) { 3071 local_inc(&buffer->nest); 3072 goto again; 3073 } 3074 3075 if (handle->wakeup != local_read(&buffer->wakeup)) 3076 perf_output_wakeup(handle); 3077 3078 out: 3079 preempt_enable(); 3080} 3081 3082__always_inline void perf_output_copy(struct perf_output_handle *handle, 3083 const void *buf, unsigned int len) 3084{ 3085 do { 3086 unsigned long size = min_t(unsigned long, handle->size, len); 3087 3088 memcpy(handle->addr, buf, size); 3089 3090 len -= size; 3091 handle->addr += size; 3092 buf += size; 3093 handle->size -= size; 3094 if (!handle->size) { 3095 struct perf_buffer *buffer = handle->buffer; 3096 3097 handle->page++; 3098 handle->page &= buffer->nr_pages - 1; 3099 handle->addr = buffer->data_pages[handle->page]; 3100 handle->size = PAGE_SIZE << page_order(buffer); 3101 } 3102 } while (len); 3103} 3104 3105int perf_output_begin(struct perf_output_handle *handle, 3106 struct perf_event *event, unsigned int size, 3107 int nmi, int sample) 3108{ 3109 struct perf_buffer *buffer; 3110 unsigned long tail, offset, head; 3111 int have_lost; 3112 struct { 3113 struct perf_event_header header; 3114 u64 id; 3115 u64 lost; 3116 } lost_event; 3117 3118 rcu_read_lock(); 3119 /* 3120 * For inherited events we send all the output towards the parent. 3121 */ 3122 if (event->parent) 3123 event = event->parent; 3124 3125 buffer = rcu_dereference(event->buffer); 3126 if (!buffer) 3127 goto out; 3128 3129 handle->buffer = buffer; 3130 handle->event = event; 3131 handle->nmi = nmi; 3132 handle->sample = sample; 3133 3134 if (!buffer->nr_pages) 3135 goto out; 3136 3137 have_lost = local_read(&buffer->lost); 3138 if (have_lost) 3139 size += sizeof(lost_event); 3140 3141 perf_output_get_handle(handle); 3142 3143 do { 3144 /* 3145 * Userspace could choose to issue a mb() before updating the 3146 * tail pointer. So that all reads will be completed before the 3147 * write is issued. 
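 *
 * The matching user-space reader side looks roughly like this, with
 * pg being the perf_event_mmap_page mapped at offset 0:
 *
 *	head = pg->data_head;
 *	rmb();
 *	... consume records in [pg->data_tail, head) ...
 *	mb();
 *	pg->data_tail = head;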
3148 */ 3149 tail = ACCESS_ONCE(buffer->user_page->data_tail); 3150 smp_rmb(); 3151 offset = head = local_read(&buffer->head); 3152 head += size; 3153 if (unlikely(!perf_output_space(buffer, tail, offset, head))) 3154 goto fail; 3155 } while (local_cmpxchg(&buffer->head, offset, head) != offset); 3156 3157 if (head - local_read(&buffer->wakeup) > buffer->watermark) 3158 local_add(buffer->watermark, &buffer->wakeup); 3159 3160 handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); 3161 handle->page &= buffer->nr_pages - 1; 3162 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); 3163 handle->addr = buffer->data_pages[handle->page]; 3164 handle->addr += handle->size; 3165 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; 3166 3167 if (have_lost) { 3168 lost_event.header.type = PERF_RECORD_LOST; 3169 lost_event.header.misc = 0; 3170 lost_event.header.size = sizeof(lost_event); 3171 lost_event.id = event->id; 3172 lost_event.lost = local_xchg(&buffer->lost, 0); 3173 3174 perf_output_put(handle, lost_event); 3175 } 3176 3177 return 0; 3178 3179fail: 3180 local_inc(&buffer->lost); 3181 perf_output_put_handle(handle); 3182out: 3183 rcu_read_unlock(); 3184 3185 return -ENOSPC; 3186} 3187 3188void perf_output_end(struct perf_output_handle *handle) 3189{ 3190 struct perf_event *event = handle->event; 3191 struct perf_buffer *buffer = handle->buffer; 3192 3193 int wakeup_events = event->attr.wakeup_events; 3194 3195 if (handle->sample && wakeup_events) { 3196 int events = local_inc_return(&buffer->events); 3197 if (events >= wakeup_events) { 3198 local_sub(wakeup_events, &buffer->events); 3199 local_inc(&buffer->wakeup); 3200 } 3201 } 3202 3203 perf_output_put_handle(handle); 3204 rcu_read_unlock(); 3205} 3206 3207static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) 3208{ 3209 /* 3210 * only top level events have the pid namespace they were created in 3211 */ 3212 if (event->parent) 3213 event = event->parent; 3214 3215 return task_tgid_nr_ns(p, event->ns); 3216} 3217 3218static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) 3219{ 3220 /* 3221 * only top level events have the pid namespace they were created in 3222 */ 3223 if (event->parent) 3224 event = event->parent; 3225 3226 return task_pid_nr_ns(p, event->ns); 3227} 3228 3229static void perf_output_read_one(struct perf_output_handle *handle, 3230 struct perf_event *event) 3231{ 3232 u64 read_format = event->attr.read_format; 3233 u64 values[4]; 3234 int n = 0; 3235 3236 values[n++] = perf_event_count(event); 3237 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 3238 values[n++] = event->total_time_enabled + 3239 atomic64_read(&event->child_total_time_enabled); 3240 } 3241 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 3242 values[n++] = event->total_time_running + 3243 atomic64_read(&event->child_total_time_running); 3244 } 3245 if (read_format & PERF_FORMAT_ID) 3246 values[n++] = primary_event_id(event); 3247 3248 perf_output_copy(handle, values, n * sizeof(u64)); 3249} 3250 3251static void perf_output_read_group(struct perf_output_handle *handle, 3252 struct perf_event *event) 3253{ 3254 struct perf_event *leader = event->group_leader, *sub; 3255 u64 read_format = event->attr.read_format; 3256 u64 values[5]; 3257 int n = 0; 3258 3259 values[n++] = 1 + leader->nr_siblings; 3260 3261 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 3262 values[n++] = leader->total_time_enabled; 3263 3264 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 3265 values[n++] = 
leader->total_time_running; 3266 3267 if (leader != event) 3268 leader->pmu->read(leader); 3269 3270 values[n++] = perf_event_count(leader); 3271 if (read_format & PERF_FORMAT_ID) 3272 values[n++] = primary_event_id(leader); 3273 3274 perf_output_copy(handle, values, n * sizeof(u64)); 3275 3276 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 3277 n = 0; 3278 3279 if (sub != event) 3280 sub->pmu->read(sub); 3281 3282 values[n++] = perf_event_count(sub); 3283 if (read_format & PERF_FORMAT_ID) 3284 values[n++] = primary_event_id(sub); 3285 3286 perf_output_copy(handle, values, n * sizeof(u64)); 3287 } 3288} 3289 3290static void perf_output_read(struct perf_output_handle *handle, 3291 struct perf_event *event) 3292{ 3293 if (event->attr.read_format & PERF_FORMAT_GROUP) 3294 perf_output_read_group(handle, event); 3295 else 3296 perf_output_read_one(handle, event); 3297} 3298 3299void perf_output_sample(struct perf_output_handle *handle, 3300 struct perf_event_header *header, 3301 struct perf_sample_data *data, 3302 struct perf_event *event) 3303{ 3304 u64 sample_type = data->type; 3305 3306 perf_output_put(handle, *header); 3307 3308 if (sample_type & PERF_SAMPLE_IP) 3309 perf_output_put(handle, data->ip); 3310 3311 if (sample_type & PERF_SAMPLE_TID) 3312 perf_output_put(handle, data->tid_entry); 3313 3314 if (sample_type & PERF_SAMPLE_TIME) 3315 perf_output_put(handle, data->time); 3316 3317 if (sample_type & PERF_SAMPLE_ADDR) 3318 perf_output_put(handle, data->addr); 3319 3320 if (sample_type & PERF_SAMPLE_ID) 3321 perf_output_put(handle, data->id); 3322 3323 if (sample_type & PERF_SAMPLE_STREAM_ID) 3324 perf_output_put(handle, data->stream_id); 3325 3326 if (sample_type & PERF_SAMPLE_CPU) 3327 perf_output_put(handle, data->cpu_entry); 3328 3329 if (sample_type & PERF_SAMPLE_PERIOD) 3330 perf_output_put(handle, data->period); 3331 3332 if (sample_type & PERF_SAMPLE_READ) 3333 perf_output_read(handle, event); 3334 3335 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 3336 if (data->callchain) { 3337 int size = 1; 3338 3339 if (data->callchain) 3340 size += data->callchain->nr; 3341 3342 size *= sizeof(u64); 3343 3344 perf_output_copy(handle, data->callchain, size); 3345 } else { 3346 u64 nr = 0; 3347 perf_output_put(handle, nr); 3348 } 3349 } 3350 3351 if (sample_type & PERF_SAMPLE_RAW) { 3352 if (data->raw) { 3353 perf_output_put(handle, data->raw->size); 3354 perf_output_copy(handle, data->raw->data, 3355 data->raw->size); 3356 } else { 3357 struct { 3358 u32 size; 3359 u32 data; 3360 } raw = { 3361 .size = sizeof(u32), 3362 .data = 0, 3363 }; 3364 perf_output_put(handle, raw); 3365 } 3366 } 3367} 3368 3369void perf_prepare_sample(struct perf_event_header *header, 3370 struct perf_sample_data *data, 3371 struct perf_event *event, 3372 struct pt_regs *regs) 3373{ 3374 u64 sample_type = event->attr.sample_type; 3375 3376 data->type = sample_type; 3377 3378 header->type = PERF_RECORD_SAMPLE; 3379 header->size = sizeof(*header); 3380 3381 header->misc = 0; 3382 header->misc |= perf_misc_flags(regs); 3383 3384 if (sample_type & PERF_SAMPLE_IP) { 3385 data->ip = perf_instruction_pointer(regs); 3386 3387 header->size += sizeof(data->ip); 3388 } 3389 3390 if (sample_type & PERF_SAMPLE_TID) { 3391 /* namespace issues */ 3392 data->tid_entry.pid = perf_event_pid(event, current); 3393 data->tid_entry.tid = perf_event_tid(event, current); 3394 3395 header->size += sizeof(data->tid_entry); 3396 } 3397 3398 if (sample_type & PERF_SAMPLE_TIME) { 3399 data->time = perf_clock(); 3400 3401 
header->size += sizeof(data->time); 3402 } 3403 3404 if (sample_type & PERF_SAMPLE_ADDR) 3405 header->size += sizeof(data->addr); 3406 3407 if (sample_type & PERF_SAMPLE_ID) { 3408 data->id = primary_event_id(event); 3409 3410 header->size += sizeof(data->id); 3411 } 3412 3413 if (sample_type & PERF_SAMPLE_STREAM_ID) { 3414 data->stream_id = event->id; 3415 3416 header->size += sizeof(data->stream_id); 3417 } 3418 3419 if (sample_type & PERF_SAMPLE_CPU) { 3420 data->cpu_entry.cpu = raw_smp_processor_id(); 3421 data->cpu_entry.reserved = 0; 3422 3423 header->size += sizeof(data->cpu_entry); 3424 } 3425 3426 if (sample_type & PERF_SAMPLE_PERIOD) 3427 header->size += sizeof(data->period); 3428 3429 if (sample_type & PERF_SAMPLE_READ) 3430 header->size += perf_event_read_size(event); 3431 3432 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 3433 int size = 1; 3434 3435 data->callchain = perf_callchain(regs); 3436 3437 if (data->callchain) 3438 size += data->callchain->nr; 3439 3440 header->size += size * sizeof(u64); 3441 } 3442 3443 if (sample_type & PERF_SAMPLE_RAW) { 3444 int size = sizeof(u32); 3445 3446 if (data->raw) 3447 size += data->raw->size; 3448 else 3449 size += sizeof(u32); 3450 3451 WARN_ON_ONCE(size & (sizeof(u64)-1)); 3452 header->size += size; 3453 } 3454} 3455 3456static void perf_event_output(struct perf_event *event, int nmi, 3457 struct perf_sample_data *data, 3458 struct pt_regs *regs) 3459{ 3460 struct perf_output_handle handle; 3461 struct perf_event_header header; 3462 3463 perf_prepare_sample(&header, data, event, regs); 3464 3465 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 3466 return; 3467 3468 perf_output_sample(&handle, &header, data, event); 3469 3470 perf_output_end(&handle); 3471} 3472 3473/* 3474 * read event_id 3475 */ 3476 3477struct perf_read_event { 3478 struct perf_event_header header; 3479 3480 u32 pid; 3481 u32 tid; 3482}; 3483 3484static void 3485perf_event_read_event(struct perf_event *event, 3486 struct task_struct *task) 3487{ 3488 struct perf_output_handle handle; 3489 struct perf_read_event read_event = { 3490 .header = { 3491 .type = PERF_RECORD_READ, 3492 .misc = 0, 3493 .size = sizeof(read_event) + perf_event_read_size(event), 3494 }, 3495 .pid = perf_event_pid(event, task), 3496 .tid = perf_event_tid(event, task), 3497 }; 3498 int ret; 3499 3500 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); 3501 if (ret) 3502 return; 3503 3504 perf_output_put(&handle, read_event); 3505 perf_output_read(&handle, event); 3506 3507 perf_output_end(&handle); 3508} 3509 3510/* 3511 * task tracking -- fork/exit 3512 * 3513 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task 3514 */ 3515 3516struct perf_task_event { 3517 struct task_struct *task; 3518 struct perf_event_context *task_ctx; 3519 3520 struct { 3521 struct perf_event_header header; 3522 3523 u32 pid; 3524 u32 ppid; 3525 u32 tid; 3526 u32 ptid; 3527 u64 time; 3528 } event_id; 3529}; 3530 3531static void perf_event_task_output(struct perf_event *event, 3532 struct perf_task_event *task_event) 3533{ 3534 struct perf_output_handle handle; 3535 struct task_struct *task = task_event->task; 3536 int size, ret; 3537 3538 size = task_event->event_id.header.size; 3539 ret = perf_output_begin(&handle, event, size, 0, 0); 3540 3541 if (ret) 3542 return; 3543 3544 task_event->event_id.pid = perf_event_pid(event, task); 3545 task_event->event_id.ppid = perf_event_pid(event, current); 3546 3547 task_event->event_id.tid = perf_event_tid(event, task); 3548 
task_event->event_id.ptid = perf_event_tid(event, current); 3549 3550 perf_output_put(&handle, task_event->event_id); 3551 3552 perf_output_end(&handle); 3553} 3554 3555static int perf_event_task_match(struct perf_event *event) 3556{ 3557 if (event->state < PERF_EVENT_STATE_INACTIVE) 3558 return 0; 3559 3560 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3561 return 0; 3562 3563 if (event->attr.comm || event->attr.mmap || 3564 event->attr.mmap_data || event->attr.task) 3565 return 1; 3566 3567 return 0; 3568} 3569 3570static void perf_event_task_ctx(struct perf_event_context *ctx, 3571 struct perf_task_event *task_event) 3572{ 3573 struct perf_event *event; 3574 3575 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3576 if (perf_event_task_match(event)) 3577 perf_event_task_output(event, task_event); 3578 } 3579} 3580 3581static void perf_event_task_event(struct perf_task_event *task_event) 3582{ 3583 struct perf_cpu_context *cpuctx; 3584 struct perf_event_context *ctx = task_event->task_ctx; 3585 3586 rcu_read_lock(); 3587 cpuctx = &get_cpu_var(perf_cpu_context); 3588 perf_event_task_ctx(&cpuctx->ctx, task_event); 3589 if (!ctx) 3590 ctx = rcu_dereference(current->perf_event_ctxp); 3591 if (ctx) 3592 perf_event_task_ctx(ctx, task_event); 3593 put_cpu_var(perf_cpu_context); 3594 rcu_read_unlock(); 3595} 3596 3597static void perf_event_task(struct task_struct *task, 3598 struct perf_event_context *task_ctx, 3599 int new) 3600{ 3601 struct perf_task_event task_event; 3602 3603 if (!atomic_read(&nr_comm_events) && 3604 !atomic_read(&nr_mmap_events) && 3605 !atomic_read(&nr_task_events)) 3606 return; 3607 3608 task_event = (struct perf_task_event){ 3609 .task = task, 3610 .task_ctx = task_ctx, 3611 .event_id = { 3612 .header = { 3613 .type = new ? 
PERF_RECORD_FORK : PERF_RECORD_EXIT, 3614 .misc = 0, 3615 .size = sizeof(task_event.event_id), 3616 }, 3617 /* .pid */ 3618 /* .ppid */ 3619 /* .tid */ 3620 /* .ptid */ 3621 .time = perf_clock(), 3622 }, 3623 }; 3624 3625 perf_event_task_event(&task_event); 3626} 3627 3628void perf_event_fork(struct task_struct *task) 3629{ 3630 perf_event_task(task, NULL, 1); 3631} 3632 3633/* 3634 * comm tracking 3635 */ 3636 3637struct perf_comm_event { 3638 struct task_struct *task; 3639 char *comm; 3640 int comm_size; 3641 3642 struct { 3643 struct perf_event_header header; 3644 3645 u32 pid; 3646 u32 tid; 3647 } event_id; 3648}; 3649 3650static void perf_event_comm_output(struct perf_event *event, 3651 struct perf_comm_event *comm_event) 3652{ 3653 struct perf_output_handle handle; 3654 int size = comm_event->event_id.header.size; 3655 int ret = perf_output_begin(&handle, event, size, 0, 0); 3656 3657 if (ret) 3658 return; 3659 3660 comm_event->event_id.pid = perf_event_pid(event, comm_event->task); 3661 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 3662 3663 perf_output_put(&handle, comm_event->event_id); 3664 perf_output_copy(&handle, comm_event->comm, 3665 comm_event->comm_size); 3666 perf_output_end(&handle); 3667} 3668 3669static int perf_event_comm_match(struct perf_event *event) 3670{ 3671 if (event->state < PERF_EVENT_STATE_INACTIVE) 3672 return 0; 3673 3674 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3675 return 0; 3676 3677 if (event->attr.comm) 3678 return 1; 3679 3680 return 0; 3681} 3682 3683static void perf_event_comm_ctx(struct perf_event_context *ctx, 3684 struct perf_comm_event *comm_event) 3685{ 3686 struct perf_event *event; 3687 3688 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3689 if (perf_event_comm_match(event)) 3690 perf_event_comm_output(event, comm_event); 3691 } 3692} 3693 3694static void perf_event_comm_event(struct perf_comm_event *comm_event) 3695{ 3696 struct perf_cpu_context *cpuctx; 3697 struct perf_event_context *ctx; 3698 unsigned int size; 3699 char comm[TASK_COMM_LEN]; 3700 3701 memset(comm, 0, sizeof(comm)); 3702 strlcpy(comm, comm_event->task->comm, sizeof(comm)); 3703 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3704 3705 comm_event->comm = comm; 3706 comm_event->comm_size = size; 3707 3708 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3709 3710 rcu_read_lock(); 3711 cpuctx = &get_cpu_var(perf_cpu_context); 3712 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3713 ctx = rcu_dereference(current->perf_event_ctxp); 3714 if (ctx) 3715 perf_event_comm_ctx(ctx, comm_event); 3716 put_cpu_var(perf_cpu_context); 3717 rcu_read_unlock(); 3718} 3719 3720void perf_event_comm(struct task_struct *task) 3721{ 3722 struct perf_comm_event comm_event; 3723 3724 if (task->perf_event_ctxp) 3725 perf_event_enable_on_exec(task); 3726 3727 if (!atomic_read(&nr_comm_events)) 3728 return; 3729 3730 comm_event = (struct perf_comm_event){ 3731 .task = task, 3732 /* .comm */ 3733 /* .comm_size */ 3734 .event_id = { 3735 .header = { 3736 .type = PERF_RECORD_COMM, 3737 .misc = 0, 3738 /* .size */ 3739 }, 3740 /* .pid */ 3741 /* .tid */ 3742 }, 3743 }; 3744 3745 perf_event_comm_event(&comm_event); 3746} 3747 3748/* 3749 * mmap tracking 3750 */ 3751 3752struct perf_mmap_event { 3753 struct vm_area_struct *vma; 3754 3755 const char *file_name; 3756 int file_size; 3757 3758 struct { 3759 struct perf_event_header header; 3760 3761 u32 pid; 3762 u32 tid; 3763 u64 start; 3764 u64 len; 3765 u64 pgoff; 3766 } event_id; 
3767}; 3768 3769static void perf_event_mmap_output(struct perf_event *event, 3770 struct perf_mmap_event *mmap_event) 3771{ 3772 struct perf_output_handle handle; 3773 int size = mmap_event->event_id.header.size; 3774 int ret = perf_output_begin(&handle, event, size, 0, 0); 3775 3776 if (ret) 3777 return; 3778 3779 mmap_event->event_id.pid = perf_event_pid(event, current); 3780 mmap_event->event_id.tid = perf_event_tid(event, current); 3781 3782 perf_output_put(&handle, mmap_event->event_id); 3783 perf_output_copy(&handle, mmap_event->file_name, 3784 mmap_event->file_size); 3785 perf_output_end(&handle); 3786} 3787 3788static int perf_event_mmap_match(struct perf_event *event, 3789 struct perf_mmap_event *mmap_event, 3790 int executable) 3791{ 3792 if (event->state < PERF_EVENT_STATE_INACTIVE) 3793 return 0; 3794 3795 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3796 return 0; 3797 3798 if ((!executable && event->attr.mmap_data) || 3799 (executable && event->attr.mmap)) 3800 return 1; 3801 3802 return 0; 3803} 3804 3805static void perf_event_mmap_ctx(struct perf_event_context *ctx, 3806 struct perf_mmap_event *mmap_event, 3807 int executable) 3808{ 3809 struct perf_event *event; 3810 3811 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3812 if (perf_event_mmap_match(event, mmap_event, executable)) 3813 perf_event_mmap_output(event, mmap_event); 3814 } 3815} 3816 3817static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 3818{ 3819 struct perf_cpu_context *cpuctx; 3820 struct perf_event_context *ctx; 3821 struct vm_area_struct *vma = mmap_event->vma; 3822 struct file *file = vma->vm_file; 3823 unsigned int size; 3824 char tmp[16]; 3825 char *buf = NULL; 3826 const char *name; 3827 3828 memset(tmp, 0, sizeof(tmp)); 3829 3830 if (file) { 3831 /* 3832 * d_path works from the end of the buffer backwards, so we 3833 * need to add enough zero bytes after the string to handle 3834 * the 64bit alignment we do later. 
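 *
 * The name is padded below with ALIGN(strlen(name) + 1, sizeof(u64));
 * e.g. "/lib/libc.so" (12 chars plus NUL) becomes a 16-byte field.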
3835 */ 3836 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); 3837 if (!buf) { 3838 name = strncpy(tmp, "//enomem", sizeof(tmp)); 3839 goto got_name; 3840 } 3841 name = d_path(&file->f_path, buf, PATH_MAX); 3842 if (IS_ERR(name)) { 3843 name = strncpy(tmp, "//toolong", sizeof(tmp)); 3844 goto got_name; 3845 } 3846 } else { 3847 if (arch_vma_name(mmap_event->vma)) { 3848 name = strncpy(tmp, arch_vma_name(mmap_event->vma), 3849 sizeof(tmp)); 3850 goto got_name; 3851 } 3852 3853 if (!vma->vm_mm) { 3854 name = strncpy(tmp, "[vdso]", sizeof(tmp)); 3855 goto got_name; 3856 } else if (vma->vm_start <= vma->vm_mm->start_brk && 3857 vma->vm_end >= vma->vm_mm->brk) { 3858 name = strncpy(tmp, "[heap]", sizeof(tmp)); 3859 goto got_name; 3860 } else if (vma->vm_start <= vma->vm_mm->start_stack && 3861 vma->vm_end >= vma->vm_mm->start_stack) { 3862 name = strncpy(tmp, "[stack]", sizeof(tmp)); 3863 goto got_name; 3864 } 3865 3866 name = strncpy(tmp, "//anon", sizeof(tmp)); 3867 goto got_name; 3868 } 3869 3870got_name: 3871 size = ALIGN(strlen(name)+1, sizeof(u64)); 3872 3873 mmap_event->file_name = name; 3874 mmap_event->file_size = size; 3875 3876 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 3877 3878 rcu_read_lock(); 3879 cpuctx = &get_cpu_var(perf_cpu_context); 3880 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); 3881 ctx = rcu_dereference(current->perf_event_ctxp); 3882 if (ctx) 3883 perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); 3884 put_cpu_var(perf_cpu_context); 3885 rcu_read_unlock(); 3886 3887 kfree(buf); 3888} 3889 3890void perf_event_mmap(struct vm_area_struct *vma) 3891{ 3892 struct perf_mmap_event mmap_event; 3893 3894 if (!atomic_read(&nr_mmap_events)) 3895 return; 3896 3897 mmap_event = (struct perf_mmap_event){ 3898 .vma = vma, 3899 /* .file_name */ 3900 /* .file_size */ 3901 .event_id = { 3902 .header = { 3903 .type = PERF_RECORD_MMAP, 3904 .misc = PERF_RECORD_MISC_USER, 3905 /* .size */ 3906 }, 3907 /* .pid */ 3908 /* .tid */ 3909 .start = vma->vm_start, 3910 .len = vma->vm_end - vma->vm_start, 3911 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, 3912 }, 3913 }; 3914 3915 perf_event_mmap_event(&mmap_event); 3916} 3917 3918/* 3919 * IRQ throttle logging 3920 */ 3921 3922static void perf_log_throttle(struct perf_event *event, int enable) 3923{ 3924 struct perf_output_handle handle; 3925 int ret; 3926 3927 struct { 3928 struct perf_event_header header; 3929 u64 time; 3930 u64 id; 3931 u64 stream_id; 3932 } throttle_event = { 3933 .header = { 3934 .type = PERF_RECORD_THROTTLE, 3935 .misc = 0, 3936 .size = sizeof(throttle_event), 3937 }, 3938 .time = perf_clock(), 3939 .id = primary_event_id(event), 3940 .stream_id = event->id, 3941 }; 3942 3943 if (enable) 3944 throttle_event.header.type = PERF_RECORD_UNTHROTTLE; 3945 3946 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); 3947 if (ret) 3948 return; 3949 3950 perf_output_put(&handle, throttle_event); 3951 perf_output_end(&handle); 3952} 3953 3954/* 3955 * Generic event overflow handling, sampling. 
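 *
 * For PMUs that provide ->unthrottle, __perf_event_overflow() below
 * also enforces the sample-rate limit: once HZ * hwc->interrupts
 * exceeds sysctl_perf_event_sample_rate the event is marked
 * MAX_INTERRUPTS, a PERF_RECORD_THROTTLE record is emitted and further
 * overflow handling is suppressed until the event is unthrottled.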
3956 */ 3957 3958static int __perf_event_overflow(struct perf_event *event, int nmi, 3959 int throttle, struct perf_sample_data *data, 3960 struct pt_regs *regs) 3961{ 3962 int events = atomic_read(&event->event_limit); 3963 struct hw_perf_event *hwc = &event->hw; 3964 int ret = 0; 3965 3966 throttle = (throttle && event->pmu->unthrottle != NULL); 3967 3968 if (!throttle) { 3969 hwc->interrupts++; 3970 } else { 3971 if (hwc->interrupts != MAX_INTERRUPTS) { 3972 hwc->interrupts++; 3973 if (HZ * hwc->interrupts > 3974 (u64)sysctl_perf_event_sample_rate) { 3975 hwc->interrupts = MAX_INTERRUPTS; 3976 perf_log_throttle(event, 0); 3977 ret = 1; 3978 } 3979 } else { 3980 /* 3981 * Keep re-disabling events even though on the previous 3982 * pass we disabled it - just in case we raced with a 3983 * sched-in and the event got enabled again: 3984 */ 3985 ret = 1; 3986 } 3987 } 3988 3989 if (event->attr.freq) { 3990 u64 now = perf_clock(); 3991 s64 delta = now - hwc->freq_time_stamp; 3992 3993 hwc->freq_time_stamp = now; 3994 3995 if (delta > 0 && delta < 2*TICK_NSEC) 3996 perf_adjust_period(event, delta, hwc->last_period); 3997 } 3998 3999 4000 event->pending_kill = POLL_IN; 4001 if (events && atomic_dec_and_test(&event->event_limit)) { 4002 ret = 1; 4003 event->pending_kill = POLL_HUP; 4004 if (nmi) { 4005 event->pending_disable = 1; 4006 perf_pending_queue(&event->pending, 4007 perf_pending_event); 4008 } else 4009 perf_event_disable(event); 4010 } 4011 4012 if (event->overflow_handler) 4013 event->overflow_handler(event, nmi, data, regs); 4014 else 4015 perf_event_output(event, nmi, data, regs); 4016 4017 return ret; 4018} 4019 4020int perf_event_overflow(struct perf_event *event, int nmi, 4021 struct perf_sample_data *data, 4022 struct pt_regs *regs) 4023{ 4024 return __perf_event_overflow(event, nmi, 1, data, regs); 4025} 4026 4027/* 4028 * Generic software event infrastructure 4029 */ 4030 4031/* 4032 * We directly increment event->count and keep a second value in 4033 * event->hw.period_left to count intervals. This period event 4034 * is kept in the range [-sample_period, 0] so that we can use the 4035 * sign as trigger. 4036 */ 4037 4038static u64 perf_swevent_set_period(struct perf_event *event) 4039{ 4040 struct hw_perf_event *hwc = &event->hw; 4041 u64 period = hwc->last_period; 4042 u64 nr, offset; 4043 s64 old, val; 4044 4045 hwc->last_period = hwc->sample_period; 4046 4047again: 4048 old = val = local64_read(&hwc->period_left); 4049 if (val < 0) 4050 return 0; 4051 4052 nr = div64_u64(period + val, period); 4053 offset = nr * period; 4054 val -= offset; 4055 if (local64_cmpxchg(&hwc->period_left, old, val) != old) 4056 goto again; 4057 4058 return nr; 4059} 4060 4061static void perf_swevent_overflow(struct perf_event *event, u64 overflow, 4062 int nmi, struct perf_sample_data *data, 4063 struct pt_regs *regs) 4064{ 4065 struct hw_perf_event *hwc = &event->hw; 4066 int throttle = 0; 4067 4068 data->period = event->hw.last_period; 4069 if (!overflow) 4070 overflow = perf_swevent_set_period(event); 4071 4072 if (hwc->interrupts == MAX_INTERRUPTS) 4073 return; 4074 4075 for (; overflow; overflow--) { 4076 if (__perf_event_overflow(event, nmi, throttle, 4077 data, regs)) { 4078 /* 4079 * We inhibit the overflow from happening when 4080 * hwc->interrupts == MAX_INTERRUPTS. 
4081 */ 4082 break; 4083 } 4084 throttle = 1; 4085 } 4086} 4087 4088static void perf_swevent_add(struct perf_event *event, u64 nr, 4089 int nmi, struct perf_sample_data *data, 4090 struct pt_regs *regs) 4091{ 4092 struct hw_perf_event *hwc = &event->hw; 4093 4094 local64_add(nr, &event->count); 4095 4096 if (!regs) 4097 return; 4098 4099 if (!hwc->sample_period) 4100 return; 4101 4102 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4103 return perf_swevent_overflow(event, 1, nmi, data, regs); 4104 4105 if (local64_add_negative(nr, &hwc->period_left)) 4106 return; 4107 4108 perf_swevent_overflow(event, 0, nmi, data, regs); 4109} 4110 4111static int perf_exclude_event(struct perf_event *event, 4112 struct pt_regs *regs) 4113{ 4114 if (regs) { 4115 if (event->attr.exclude_user && user_mode(regs)) 4116 return 1; 4117 4118 if (event->attr.exclude_kernel && !user_mode(regs)) 4119 return 1; 4120 } 4121 4122 return 0; 4123} 4124 4125static int perf_swevent_match(struct perf_event *event, 4126 enum perf_type_id type, 4127 u32 event_id, 4128 struct perf_sample_data *data, 4129 struct pt_regs *regs) 4130{ 4131 if (event->attr.type != type) 4132 return 0; 4133 4134 if (event->attr.config != event_id) 4135 return 0; 4136 4137 if (perf_exclude_event(event, regs)) 4138 return 0; 4139 4140 return 1; 4141} 4142 4143static inline u64 swevent_hash(u64 type, u32 event_id) 4144{ 4145 u64 val = event_id | (type << 32); 4146 4147 return hash_64(val, SWEVENT_HLIST_BITS); 4148} 4149 4150static inline struct hlist_head * 4151__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) 4152{ 4153 u64 hash = swevent_hash(type, event_id); 4154 4155 return &hlist->heads[hash]; 4156} 4157 4158/* For the read side: events when they trigger */ 4159static inline struct hlist_head * 4160find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) 4161{ 4162 struct swevent_hlist *hlist; 4163 4164 hlist = rcu_dereference(ctx->swevent_hlist); 4165 if (!hlist) 4166 return NULL; 4167 4168 return __find_swevent_head(hlist, type, event_id); 4169} 4170 4171/* For the event head insertion and removal in the hlist */ 4172static inline struct hlist_head * 4173find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) 4174{ 4175 struct swevent_hlist *hlist; 4176 u32 event_id = event->attr.config; 4177 u64 type = event->attr.type; 4178 4179 /* 4180 * Event scheduling is always serialized against hlist allocation 4181 * and release. Which makes the protected version suitable here. 4182 * The context lock guarantees that. 
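 *
 * Contrast with find_swevent_head_rcu() above, which runs on the
 * event delivery path and therefore uses plain rcu_dereference()
 * under rcu_read_lock() instead of relying on ctx->lock.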
4183 */ 4184 hlist = rcu_dereference_protected(ctx->swevent_hlist, 4185 lockdep_is_held(&event->ctx->lock)); 4186 if (!hlist) 4187 return NULL; 4188 4189 return __find_swevent_head(hlist, type, event_id); 4190} 4191 4192static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 4193 u64 nr, int nmi, 4194 struct perf_sample_data *data, 4195 struct pt_regs *regs) 4196{ 4197 struct perf_cpu_context *cpuctx; 4198 struct perf_event *event; 4199 struct hlist_node *node; 4200 struct hlist_head *head; 4201 4202 cpuctx = &__get_cpu_var(perf_cpu_context); 4203 4204 rcu_read_lock(); 4205 4206 head = find_swevent_head_rcu(cpuctx, type, event_id); 4207 4208 if (!head) 4209 goto end; 4210 4211 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4212 if (perf_swevent_match(event, type, event_id, data, regs)) 4213 perf_swevent_add(event, nr, nmi, data, regs); 4214 } 4215end: 4216 rcu_read_unlock(); 4217} 4218 4219int perf_swevent_get_recursion_context(void) 4220{ 4221 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4222 int rctx; 4223 4224 if (in_nmi()) 4225 rctx = 3; 4226 else if (in_irq()) 4227 rctx = 2; 4228 else if (in_softirq()) 4229 rctx = 1; 4230 else 4231 rctx = 0; 4232 4233 if (cpuctx->recursion[rctx]) 4234 return -1; 4235 4236 cpuctx->recursion[rctx]++; 4237 barrier(); 4238 4239 return rctx; 4240} 4241EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4242 4243void inline perf_swevent_put_recursion_context(int rctx) 4244{ 4245 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4246 barrier(); 4247 cpuctx->recursion[rctx]--; 4248} 4249 4250void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4251 struct pt_regs *regs, u64 addr) 4252{ 4253 struct perf_sample_data data; 4254 int rctx; 4255 4256 preempt_disable_notrace(); 4257 rctx = perf_swevent_get_recursion_context(); 4258 if (rctx < 0) 4259 return; 4260 4261 perf_sample_data_init(&data, addr); 4262 4263 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4264 4265 perf_swevent_put_recursion_context(rctx); 4266 preempt_enable_notrace(); 4267} 4268 4269static void perf_swevent_read(struct perf_event *event) 4270{ 4271} 4272 4273static int perf_swevent_enable(struct perf_event *event) 4274{ 4275 struct hw_perf_event *hwc = &event->hw; 4276 struct perf_cpu_context *cpuctx; 4277 struct hlist_head *head; 4278 4279 cpuctx = &__get_cpu_var(perf_cpu_context); 4280 4281 if (hwc->sample_period) { 4282 hwc->last_period = hwc->sample_period; 4283 perf_swevent_set_period(event); 4284 } 4285 4286 head = find_swevent_head(cpuctx, event); 4287 if (WARN_ON_ONCE(!head)) 4288 return -EINVAL; 4289 4290 hlist_add_head_rcu(&event->hlist_entry, head); 4291 4292 return 0; 4293} 4294 4295static void perf_swevent_disable(struct perf_event *event) 4296{ 4297 hlist_del_rcu(&event->hlist_entry); 4298} 4299 4300static void perf_swevent_void(struct perf_event *event) 4301{ 4302} 4303 4304static int perf_swevent_int(struct perf_event *event) 4305{ 4306 return 0; 4307} 4308 4309static const struct pmu perf_ops_generic = { 4310 .enable = perf_swevent_enable, 4311 .disable = perf_swevent_disable, 4312 .start = perf_swevent_int, 4313 .stop = perf_swevent_void, 4314 .read = perf_swevent_read, 4315 .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ 4316}; 4317 4318/* 4319 * hrtimer based swevent callback 4320 */ 4321 4322static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) 4323{ 4324 enum hrtimer_restart ret = HRTIMER_RESTART; 4325 struct perf_sample_data data; 4326 struct 
pt_regs *regs; 4327 struct perf_event *event; 4328 u64 period; 4329 4330 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 4331 event->pmu->read(event); 4332 4333 perf_sample_data_init(&data, 0); 4334 data.period = event->hw.last_period; 4335 regs = get_irq_regs(); 4336 4337 if (regs && !perf_exclude_event(event, regs)) { 4338 if (!(event->attr.exclude_idle && current->pid == 0)) 4339 if (perf_event_overflow(event, 0, &data, regs)) 4340 ret = HRTIMER_NORESTART; 4341 } 4342 4343 period = max_t(u64, 10000, event->hw.sample_period); 4344 hrtimer_forward_now(hrtimer, ns_to_ktime(period)); 4345 4346 return ret; 4347} 4348 4349static void perf_swevent_start_hrtimer(struct perf_event *event) 4350{ 4351 struct hw_perf_event *hwc = &event->hw; 4352 4353 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 4354 hwc->hrtimer.function = perf_swevent_hrtimer; 4355 if (hwc->sample_period) { 4356 u64 period; 4357 4358 if (hwc->remaining) { 4359 if (hwc->remaining < 0) 4360 period = 10000; 4361 else 4362 period = hwc->remaining; 4363 hwc->remaining = 0; 4364 } else { 4365 period = max_t(u64, 10000, hwc->sample_period); 4366 } 4367 __hrtimer_start_range_ns(&hwc->hrtimer, 4368 ns_to_ktime(period), 0, 4369 HRTIMER_MODE_REL, 0); 4370 } 4371} 4372 4373static void perf_swevent_cancel_hrtimer(struct perf_event *event) 4374{ 4375 struct hw_perf_event *hwc = &event->hw; 4376 4377 if (hwc->sample_period) { 4378 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); 4379 hwc->remaining = ktime_to_ns(remaining); 4380 4381 hrtimer_cancel(&hwc->hrtimer); 4382 } 4383} 4384 4385/* 4386 * Software event: cpu wall time clock 4387 */ 4388 4389static void cpu_clock_perf_event_update(struct perf_event *event) 4390{ 4391 int cpu = raw_smp_processor_id(); 4392 s64 prev; 4393 u64 now; 4394 4395 now = cpu_clock(cpu); 4396 prev = local64_xchg(&event->hw.prev_count, now); 4397 local64_add(now - prev, &event->count); 4398} 4399 4400static int cpu_clock_perf_event_enable(struct perf_event *event) 4401{ 4402 struct hw_perf_event *hwc = &event->hw; 4403 int cpu = raw_smp_processor_id(); 4404 4405 local64_set(&hwc->prev_count, cpu_clock(cpu)); 4406 perf_swevent_start_hrtimer(event); 4407 4408 return 0; 4409} 4410 4411static void cpu_clock_perf_event_disable(struct perf_event *event) 4412{ 4413 perf_swevent_cancel_hrtimer(event); 4414 cpu_clock_perf_event_update(event); 4415} 4416 4417static void cpu_clock_perf_event_read(struct perf_event *event) 4418{ 4419 cpu_clock_perf_event_update(event); 4420} 4421 4422static const struct pmu perf_ops_cpu_clock = { 4423 .enable = cpu_clock_perf_event_enable, 4424 .disable = cpu_clock_perf_event_disable, 4425 .read = cpu_clock_perf_event_read, 4426}; 4427 4428/* 4429 * Software event: task time clock 4430 */ 4431 4432static void task_clock_perf_event_update(struct perf_event *event, u64 now) 4433{ 4434 u64 prev; 4435 s64 delta; 4436 4437 prev = local64_xchg(&event->hw.prev_count, now); 4438 delta = now - prev; 4439 local64_add(delta, &event->count); 4440} 4441 4442static int task_clock_perf_event_enable(struct perf_event *event) 4443{ 4444 struct hw_perf_event *hwc = &event->hw; 4445 u64 now; 4446 4447 now = event->ctx->time; 4448 4449 local64_set(&hwc->prev_count, now); 4450 4451 perf_swevent_start_hrtimer(event); 4452 4453 return 0; 4454} 4455 4456static void task_clock_perf_event_disable(struct perf_event *event) 4457{ 4458 perf_swevent_cancel_hrtimer(event); 4459 task_clock_perf_event_update(event, event->ctx->time); 4460 4461} 4462 4463static void 
task_clock_perf_event_read(struct perf_event *event) 4464{ 4465 u64 time; 4466 4467 if (!in_nmi()) { 4468 update_context_time(event->ctx); 4469 time = event->ctx->time; 4470 } else { 4471 u64 now = perf_clock(); 4472 u64 delta = now - event->ctx->timestamp; 4473 time = event->ctx->time + delta; 4474 } 4475 4476 task_clock_perf_event_update(event, time); 4477} 4478 4479static const struct pmu perf_ops_task_clock = { 4480 .enable = task_clock_perf_event_enable, 4481 .disable = task_clock_perf_event_disable, 4482 .read = task_clock_perf_event_read, 4483}; 4484 4485/* Deref the hlist from the update side */ 4486static inline struct swevent_hlist * 4487swevent_hlist_deref(struct perf_cpu_context *cpuctx) 4488{ 4489 return rcu_dereference_protected(cpuctx->swevent_hlist, 4490 lockdep_is_held(&cpuctx->hlist_mutex)); 4491} 4492 4493static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) 4494{ 4495 struct swevent_hlist *hlist; 4496 4497 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); 4498 kfree(hlist); 4499} 4500 4501static void swevent_hlist_release(struct perf_cpu_context *cpuctx) 4502{ 4503 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); 4504 4505 if (!hlist) 4506 return; 4507 4508 rcu_assign_pointer(cpuctx->swevent_hlist, NULL); 4509 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 4510} 4511 4512static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 4513{ 4514 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4515 4516 mutex_lock(&cpuctx->hlist_mutex); 4517 4518 if (!--cpuctx->hlist_refcount) 4519 swevent_hlist_release(cpuctx); 4520 4521 mutex_unlock(&cpuctx->hlist_mutex); 4522} 4523 4524static void swevent_hlist_put(struct perf_event *event) 4525{ 4526 int cpu; 4527 4528 if (event->cpu != -1) { 4529 swevent_hlist_put_cpu(event, event->cpu); 4530 return; 4531 } 4532 4533 for_each_possible_cpu(cpu) 4534 swevent_hlist_put_cpu(event, cpu); 4535} 4536 4537static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) 4538{ 4539 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4540 int err = 0; 4541 4542 mutex_lock(&cpuctx->hlist_mutex); 4543 4544 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { 4545 struct swevent_hlist *hlist; 4546 4547 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 4548 if (!hlist) { 4549 err = -ENOMEM; 4550 goto exit; 4551 } 4552 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 4553 } 4554 cpuctx->hlist_refcount++; 4555 exit: 4556 mutex_unlock(&cpuctx->hlist_mutex); 4557 4558 return err; 4559} 4560 4561static int swevent_hlist_get(struct perf_event *event) 4562{ 4563 int err; 4564 int cpu, failed_cpu; 4565 4566 if (event->cpu != -1) 4567 return swevent_hlist_get_cpu(event, event->cpu); 4568 4569 get_online_cpus(); 4570 for_each_possible_cpu(cpu) { 4571 err = swevent_hlist_get_cpu(event, cpu); 4572 if (err) { 4573 failed_cpu = cpu; 4574 goto fail; 4575 } 4576 } 4577 put_online_cpus(); 4578 4579 return 0; 4580 fail: 4581 for_each_possible_cpu(cpu) { 4582 if (cpu == failed_cpu) 4583 break; 4584 swevent_hlist_put_cpu(event, cpu); 4585 } 4586 4587 put_online_cpus(); 4588 return err; 4589} 4590 4591#ifdef CONFIG_EVENT_TRACING 4592 4593static const struct pmu perf_ops_tracepoint = { 4594 .enable = perf_trace_enable, 4595 .disable = perf_trace_disable, 4596 .start = perf_swevent_int, 4597 .stop = perf_swevent_void, 4598 .read = perf_swevent_read, 4599 .unthrottle = perf_swevent_void, 4600}; 4601 4602static int perf_tp_filter_match(struct perf_event *event, 4603 struct perf_sample_data 
*data) 4604{ 4605 void *record = data->raw->data; 4606 4607 if (likely(!event->filter) || filter_match_preds(event->filter, record)) 4608 return 1; 4609 return 0; 4610} 4611 4612static int perf_tp_event_match(struct perf_event *event, 4613 struct perf_sample_data *data, 4614 struct pt_regs *regs) 4615{ 4616 /* 4617 * All tracepoints are from kernel-space. 4618 */ 4619 if (event->attr.exclude_kernel) 4620 return 0; 4621 4622 if (!perf_tp_filter_match(event, data)) 4623 return 0; 4624 4625 return 1; 4626} 4627 4628void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, 4629 struct pt_regs *regs, struct hlist_head *head, int rctx) 4630{ 4631 struct perf_sample_data data; 4632 struct perf_event *event; 4633 struct hlist_node *node; 4634 4635 struct perf_raw_record raw = { 4636 .size = entry_size, 4637 .data = record, 4638 }; 4639 4640 perf_sample_data_init(&data, addr); 4641 data.raw = &raw; 4642 4643 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4644 if (perf_tp_event_match(event, &data, regs)) 4645 perf_swevent_add(event, count, 1, &data, regs); 4646 } 4647 4648 perf_swevent_put_recursion_context(rctx); 4649} 4650EXPORT_SYMBOL_GPL(perf_tp_event); 4651 4652static void tp_perf_event_destroy(struct perf_event *event) 4653{ 4654 perf_trace_destroy(event); 4655} 4656 4657static const struct pmu *tp_perf_event_init(struct perf_event *event) 4658{ 4659 int err; 4660 4661 /* 4662 * Raw tracepoint data is a severe data leak, only allow root to 4663 * have these. 4664 */ 4665 if ((event->attr.sample_type & PERF_SAMPLE_RAW) && 4666 perf_paranoid_tracepoint_raw() && 4667 !capable(CAP_SYS_ADMIN)) 4668 return ERR_PTR(-EPERM); 4669 4670 err = perf_trace_init(event); 4671 if (err) 4672 return NULL; 4673 4674 event->destroy = tp_perf_event_destroy; 4675 4676 return &perf_ops_tracepoint; 4677} 4678 4679static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4680{ 4681 char *filter_str; 4682 int ret; 4683 4684 if (event->attr.type != PERF_TYPE_TRACEPOINT) 4685 return -EINVAL; 4686 4687 filter_str = strndup_user(arg, PAGE_SIZE); 4688 if (IS_ERR(filter_str)) 4689 return PTR_ERR(filter_str); 4690 4691 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); 4692 4693 kfree(filter_str); 4694 return ret; 4695} 4696 4697static void perf_event_free_filter(struct perf_event *event) 4698{ 4699 ftrace_profile_free_filter(event); 4700} 4701 4702#else 4703 4704static const struct pmu *tp_perf_event_init(struct perf_event *event) 4705{ 4706 return NULL; 4707} 4708 4709static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4710{ 4711 return -ENOENT; 4712} 4713 4714static void perf_event_free_filter(struct perf_event *event) 4715{ 4716} 4717 4718#endif /* CONFIG_EVENT_TRACING */ 4719 4720#ifdef CONFIG_HAVE_HW_BREAKPOINT 4721static void bp_perf_event_destroy(struct perf_event *event) 4722{ 4723 release_bp_slot(event); 4724} 4725 4726static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4727{ 4728 int err; 4729 4730 err = register_perf_hw_breakpoint(bp); 4731 if (err) 4732 return ERR_PTR(err); 4733 4734 bp->destroy = bp_perf_event_destroy; 4735 4736 return &perf_ops_bp; 4737} 4738 4739void perf_bp_event(struct perf_event *bp, void *data) 4740{ 4741 struct perf_sample_data sample; 4742 struct pt_regs *regs = data; 4743 4744 perf_sample_data_init(&sample, bp->attr.bp_addr); 4745 4746 if (!perf_exclude_event(bp, regs)) 4747 perf_swevent_add(bp, 1, 1, &sample, regs); 4748} 4749#else 4750static const struct pmu 
*bp_perf_event_init(struct perf_event *bp) 4751{ 4752 return NULL; 4753} 4754 4755void perf_bp_event(struct perf_event *bp, void *regs) 4756{ 4757} 4758#endif 4759 4760atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4761 4762static void sw_perf_event_destroy(struct perf_event *event) 4763{ 4764 u64 event_id = event->attr.config; 4765 4766 WARN_ON(event->parent); 4767 4768 atomic_dec(&perf_swevent_enabled[event_id]); 4769 swevent_hlist_put(event); 4770} 4771 4772static const struct pmu *sw_perf_event_init(struct perf_event *event) 4773{ 4774 const struct pmu *pmu = NULL; 4775 u64 event_id = event->attr.config; 4776 4777 /* 4778 * Software events (currently) can't in general distinguish 4779 * between user, kernel and hypervisor events. 4780 * However, context switches and cpu migrations are considered 4781 * to be kernel events, and page faults are never hypervisor 4782 * events. 4783 */ 4784 switch (event_id) { 4785 case PERF_COUNT_SW_CPU_CLOCK: 4786 pmu = &perf_ops_cpu_clock; 4787 4788 break; 4789 case PERF_COUNT_SW_TASK_CLOCK: 4790 /* 4791 * If the user instantiates this as a per-cpu event, 4792 * use the cpu_clock event instead. 4793 */ 4794 if (event->ctx->task) 4795 pmu = &perf_ops_task_clock; 4796 else 4797 pmu = &perf_ops_cpu_clock; 4798 4799 break; 4800 case PERF_COUNT_SW_PAGE_FAULTS: 4801 case PERF_COUNT_SW_PAGE_FAULTS_MIN: 4802 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 4803 case PERF_COUNT_SW_CONTEXT_SWITCHES: 4804 case PERF_COUNT_SW_CPU_MIGRATIONS: 4805 case PERF_COUNT_SW_ALIGNMENT_FAULTS: 4806 case PERF_COUNT_SW_EMULATION_FAULTS: 4807 if (!event->parent) { 4808 int err; 4809 4810 err = swevent_hlist_get(event); 4811 if (err) 4812 return ERR_PTR(err); 4813 4814 atomic_inc(&perf_swevent_enabled[event_id]); 4815 event->destroy = sw_perf_event_destroy; 4816 } 4817 pmu = &perf_ops_generic; 4818 break; 4819 } 4820 4821 return pmu; 4822} 4823 4824/* 4825 * Allocate and initialize a event structure 4826 */ 4827static struct perf_event * 4828perf_event_alloc(struct perf_event_attr *attr, 4829 int cpu, 4830 struct perf_event_context *ctx, 4831 struct perf_event *group_leader, 4832 struct perf_event *parent_event, 4833 perf_overflow_handler_t overflow_handler, 4834 gfp_t gfpflags) 4835{ 4836 const struct pmu *pmu; 4837 struct perf_event *event; 4838 struct hw_perf_event *hwc; 4839 long err; 4840 4841 event = kzalloc(sizeof(*event), gfpflags); 4842 if (!event) 4843 return ERR_PTR(-ENOMEM); 4844 4845 /* 4846 * Single events are their own group leaders, with an 4847 * empty sibling list: 4848 */ 4849 if (!group_leader) 4850 group_leader = event; 4851 4852 mutex_init(&event->child_mutex); 4853 INIT_LIST_HEAD(&event->child_list); 4854 4855 INIT_LIST_HEAD(&event->group_entry); 4856 INIT_LIST_HEAD(&event->event_entry); 4857 INIT_LIST_HEAD(&event->sibling_list); 4858 init_waitqueue_head(&event->waitq); 4859 4860 mutex_init(&event->mmap_mutex); 4861 4862 event->cpu = cpu; 4863 event->attr = *attr; 4864 event->group_leader = group_leader; 4865 event->pmu = NULL; 4866 event->ctx = ctx; 4867 event->oncpu = -1; 4868 4869 event->parent = parent_event; 4870 4871 event->ns = get_pid_ns(current->nsproxy->pid_ns); 4872 event->id = atomic64_inc_return(&perf_event_id); 4873 4874 event->state = PERF_EVENT_STATE_INACTIVE; 4875 4876 if (!overflow_handler && parent_event) 4877 overflow_handler = parent_event->overflow_handler; 4878 4879 event->overflow_handler = overflow_handler; 4880 4881 if (attr->disabled) 4882 event->state = PERF_EVENT_STATE_OFF; 4883 4884 pmu = NULL; 4885 4886 hwc = &event->hw; 4887 
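	/*
	 * Seed the initial sampling period below: in frequency mode
	 * (attr->freq && attr->sample_freq) the period starts at 1 so the
	 * frequency-adjustment logic elsewhere in this file can converge
	 * toward the requested rate; otherwise the fixed attr->sample_period
	 * is used as given, and period_left is primed with the same value.
	 */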
hwc->sample_period = attr->sample_period; 4888 if (attr->freq && attr->sample_freq) 4889 hwc->sample_period = 1; 4890 hwc->last_period = hwc->sample_period; 4891 4892 local64_set(&hwc->period_left, hwc->sample_period); 4893 4894 /* 4895 * we currently do not support PERF_FORMAT_GROUP on inherited events 4896 */ 4897 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 4898 goto done; 4899 4900 switch (attr->type) { 4901 case PERF_TYPE_RAW: 4902 case PERF_TYPE_HARDWARE: 4903 case PERF_TYPE_HW_CACHE: 4904 pmu = hw_perf_event_init(event); 4905 break; 4906 4907 case PERF_TYPE_SOFTWARE: 4908 pmu = sw_perf_event_init(event); 4909 break; 4910 4911 case PERF_TYPE_TRACEPOINT: 4912 pmu = tp_perf_event_init(event); 4913 break; 4914 4915 case PERF_TYPE_BREAKPOINT: 4916 pmu = bp_perf_event_init(event); 4917 break; 4918 4919 4920 default: 4921 break; 4922 } 4923done: 4924 err = 0; 4925 if (!pmu) 4926 err = -EINVAL; 4927 else if (IS_ERR(pmu)) 4928 err = PTR_ERR(pmu); 4929 4930 if (err) { 4931 if (event->ns) 4932 put_pid_ns(event->ns); 4933 kfree(event); 4934 return ERR_PTR(err); 4935 } 4936 4937 event->pmu = pmu; 4938 4939 if (!event->parent) { 4940 atomic_inc(&nr_events); 4941 if (event->attr.mmap || event->attr.mmap_data) 4942 atomic_inc(&nr_mmap_events); 4943 if (event->attr.comm) 4944 atomic_inc(&nr_comm_events); 4945 if (event->attr.task) 4946 atomic_inc(&nr_task_events); 4947 } 4948 4949 return event; 4950} 4951 4952static int perf_copy_attr(struct perf_event_attr __user *uattr, 4953 struct perf_event_attr *attr) 4954{ 4955 u32 size; 4956 int ret; 4957 4958 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) 4959 return -EFAULT; 4960 4961 /* 4962 * zero the full structure, so that a short copy will be nice. 4963 */ 4964 memset(attr, 0, sizeof(*attr)); 4965 4966 ret = get_user(size, &uattr->size); 4967 if (ret) 4968 return ret; 4969 4970 if (size > PAGE_SIZE) /* silly large */ 4971 goto err_size; 4972 4973 if (!size) /* abi compat */ 4974 size = PERF_ATTR_SIZE_VER0; 4975 4976 if (size < PERF_ATTR_SIZE_VER0) 4977 goto err_size; 4978 4979 /* 4980 * If we're handed a bigger struct than we know of, 4981 * ensure all the unknown bits are 0 - i.e. new 4982 * user-space does not rely on any kernel feature 4983 * extensions we dont know about yet. 4984 */ 4985 if (size > sizeof(*attr)) { 4986 unsigned char __user *addr; 4987 unsigned char __user *end; 4988 unsigned char val; 4989 4990 addr = (void __user *)uattr + sizeof(*attr); 4991 end = (void __user *)uattr + size; 4992 4993 for (; addr < end; addr++) { 4994 ret = get_user(val, addr); 4995 if (ret) 4996 return ret; 4997 if (val) 4998 goto err_size; 4999 } 5000 size = sizeof(*attr); 5001 } 5002 5003 ret = copy_from_user(attr, uattr, size); 5004 if (ret) 5005 return -EFAULT; 5006 5007 /* 5008 * If the type exists, the corresponding creation will verify 5009 * the attr->config. 
5010 */ 5011 if (attr->type >= PERF_TYPE_MAX) 5012 return -EINVAL; 5013 5014 if (attr->__reserved_1) 5015 return -EINVAL; 5016 5017 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 5018 return -EINVAL; 5019 5020 if (attr->read_format & ~(PERF_FORMAT_MAX-1)) 5021 return -EINVAL; 5022 5023out: 5024 return ret; 5025 5026err_size: 5027 put_user(sizeof(*attr), &uattr->size); 5028 ret = -E2BIG; 5029 goto out; 5030} 5031 5032static int 5033perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 5034{ 5035 struct perf_buffer *buffer = NULL, *old_buffer = NULL; 5036 int ret = -EINVAL; 5037 5038 if (!output_event) 5039 goto set; 5040 5041 /* don't allow circular references */ 5042 if (event == output_event) 5043 goto out; 5044 5045 /* 5046 * Don't allow cross-cpu buffers 5047 */ 5048 if (output_event->cpu != event->cpu) 5049 goto out; 5050 5051 /* 5052 * If its not a per-cpu buffer, it must be the same task. 5053 */ 5054 if (output_event->cpu == -1 && output_event->ctx != event->ctx) 5055 goto out; 5056 5057set: 5058 mutex_lock(&event->mmap_mutex); 5059 /* Can't redirect output if we've got an active mmap() */ 5060 if (atomic_read(&event->mmap_count)) 5061 goto unlock; 5062 5063 if (output_event) { 5064 /* get the buffer we want to redirect to */ 5065 buffer = perf_buffer_get(output_event); 5066 if (!buffer) 5067 goto unlock; 5068 } 5069 5070 old_buffer = event->buffer; 5071 rcu_assign_pointer(event->buffer, buffer); 5072 ret = 0; 5073unlock: 5074 mutex_unlock(&event->mmap_mutex); 5075 5076 if (old_buffer) 5077 perf_buffer_put(old_buffer); 5078out: 5079 return ret; 5080} 5081 5082/** 5083 * sys_perf_event_open - open a performance event, associate it to a task/cpu 5084 * 5085 * @attr_uptr: event_id type attributes for monitoring/sampling 5086 * @pid: target pid 5087 * @cpu: target cpu 5088 * @group_fd: group leader event fd 5089 */ 5090SYSCALL_DEFINE5(perf_event_open, 5091 struct perf_event_attr __user *, attr_uptr, 5092 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 5093{ 5094 struct perf_event *event, *group_leader = NULL, *output_event = NULL; 5095 struct perf_event_attr attr; 5096 struct perf_event_context *ctx; 5097 struct file *event_file = NULL; 5098 struct file *group_file = NULL; 5099 int event_fd; 5100 int fput_needed = 0; 5101 int err; 5102 5103 /* for future expandability... 
*/ 5104 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) 5105 return -EINVAL; 5106 5107 err = perf_copy_attr(attr_uptr, &attr); 5108 if (err) 5109 return err; 5110 5111 if (!attr.exclude_kernel) { 5112 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 5113 return -EACCES; 5114 } 5115 5116 if (attr.freq) { 5117 if (attr.sample_freq > sysctl_perf_event_sample_rate) 5118 return -EINVAL; 5119 } 5120 5121 event_fd = get_unused_fd_flags(O_RDWR); 5122 if (event_fd < 0) 5123 return event_fd; 5124 5125 /* 5126 * Get the target context (task or percpu): 5127 */ 5128 ctx = find_get_context(pid, cpu); 5129 if (IS_ERR(ctx)) { 5130 err = PTR_ERR(ctx); 5131 goto err_fd; 5132 } 5133 5134 if (group_fd != -1) { 5135 group_leader = perf_fget_light(group_fd, &fput_needed); 5136 if (IS_ERR(group_leader)) { 5137 err = PTR_ERR(group_leader); 5138 goto err_put_context; 5139 } 5140 group_file = group_leader->filp; 5141 if (flags & PERF_FLAG_FD_OUTPUT) 5142 output_event = group_leader; 5143 if (flags & PERF_FLAG_FD_NO_GROUP) 5144 group_leader = NULL; 5145 } 5146 5147 /* 5148 * Look up the group leader (we will attach this event to it): 5149 */ 5150 if (group_leader) { 5151 err = -EINVAL; 5152 5153 /* 5154 * Do not allow a recursive hierarchy (this new sibling 5155 * becoming part of another group-sibling): 5156 */ 5157 if (group_leader->group_leader != group_leader) 5158 goto err_put_context; 5159 /* 5160 * Do not allow attaching to a group in a different 5161 * task or CPU context: 5162 */ 5163 if (group_leader->ctx != ctx) 5164 goto err_put_context; 5165 /* 5166 * Only a group leader can be exclusive or pinned 5167 */ 5168 if (attr.exclusive || attr.pinned) 5169 goto err_put_context; 5170 } 5171 5172 event = perf_event_alloc(&attr, cpu, ctx, group_leader, 5173 NULL, NULL, GFP_KERNEL); 5174 if (IS_ERR(event)) { 5175 err = PTR_ERR(event); 5176 goto err_put_context; 5177 } 5178 5179 if (output_event) { 5180 err = perf_event_set_output(event, output_event); 5181 if (err) 5182 goto err_free_put_context; 5183 } 5184 5185 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); 5186 if (IS_ERR(event_file)) { 5187 err = PTR_ERR(event_file); 5188 goto err_free_put_context; 5189 } 5190 5191 event->filp = event_file; 5192 WARN_ON_ONCE(ctx->parent_ctx); 5193 mutex_lock(&ctx->mutex); 5194 perf_install_in_context(ctx, event, cpu); 5195 ++ctx->generation; 5196 mutex_unlock(&ctx->mutex); 5197 5198 event->owner = current; 5199 get_task_struct(current); 5200 mutex_lock(&current->perf_event_mutex); 5201 list_add_tail(&event->owner_entry, &current->perf_event_list); 5202 mutex_unlock(&current->perf_event_mutex); 5203 5204 /* 5205 * Drop the reference on the group_event after placing the 5206 * new event on the sibling_list. This ensures destruction 5207 * of the group leader will find the pointer to itself in 5208 * perf_group_detach(). 
5209 */ 5210 fput_light(group_file, fput_needed); 5211 fd_install(event_fd, event_file); 5212 return event_fd; 5213 5214err_free_put_context: 5215 free_event(event); 5216err_put_context: 5217 fput_light(group_file, fput_needed); 5218 put_ctx(ctx); 5219err_fd: 5220 put_unused_fd(event_fd); 5221 return err; 5222} 5223 5224/** 5225 * perf_event_create_kernel_counter 5226 * 5227 * @attr: attributes of the counter to create 5228 * @cpu: cpu to which the counter is bound 5229 * @pid: task to profile 5230 */ 5231struct perf_event * 5232perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 5233 pid_t pid, 5234 perf_overflow_handler_t overflow_handler) 5235{ 5236 struct perf_event *event; 5237 struct perf_event_context *ctx; 5238 int err; 5239 5240 /* 5241 * Get the target context (task or percpu): 5242 */ 5243 5244 ctx = find_get_context(pid, cpu); 5245 if (IS_ERR(ctx)) { 5246 err = PTR_ERR(ctx); 5247 goto err_exit; 5248 } 5249 5250 event = perf_event_alloc(attr, cpu, ctx, NULL, 5251 NULL, overflow_handler, GFP_KERNEL); 5252 if (IS_ERR(event)) { 5253 err = PTR_ERR(event); 5254 goto err_put_context; 5255 } 5256 5257 event->filp = NULL; 5258 WARN_ON_ONCE(ctx->parent_ctx); 5259 mutex_lock(&ctx->mutex); 5260 perf_install_in_context(ctx, event, cpu); 5261 ++ctx->generation; 5262 mutex_unlock(&ctx->mutex); 5263 5264 event->owner = current; 5265 get_task_struct(current); 5266 mutex_lock(&current->perf_event_mutex); 5267 list_add_tail(&event->owner_entry, &current->perf_event_list); 5268 mutex_unlock(&current->perf_event_mutex); 5269 5270 return event; 5271 5272 err_put_context: 5273 put_ctx(ctx); 5274 err_exit: 5275 return ERR_PTR(err); 5276} 5277EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 5278 5279/* 5280 * inherit an event from parent task to child task: 5281 */ 5282static struct perf_event * 5283inherit_event(struct perf_event *parent_event, 5284 struct task_struct *parent, 5285 struct perf_event_context *parent_ctx, 5286 struct task_struct *child, 5287 struct perf_event *group_leader, 5288 struct perf_event_context *child_ctx) 5289{ 5290 struct perf_event *child_event; 5291 5292 /* 5293 * Instead of creating recursive hierarchies of events, 5294 * we link inherited events back to the original parent, 5295 * which always has a filp that we use as the reference 5296 * count: 5297 */ 5298 if (parent_event->parent) 5299 parent_event = parent_event->parent; 5300 5301 child_event = perf_event_alloc(&parent_event->attr, 5302 parent_event->cpu, child_ctx, 5303 group_leader, parent_event, 5304 NULL, GFP_KERNEL); 5305 if (IS_ERR(child_event)) 5306 return child_event; 5307 get_ctx(child_ctx); 5308 5309 /* 5310 * Make the child state follow the state of the parent event, 5311 * not its attr.disabled bit. We hold the parent's mutex, 5312 * so we won't race with perf_event_{en, dis}able_family. 
5313 */ 5314 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) 5315 child_event->state = PERF_EVENT_STATE_INACTIVE; 5316 else 5317 child_event->state = PERF_EVENT_STATE_OFF; 5318 5319 if (parent_event->attr.freq) { 5320 u64 sample_period = parent_event->hw.sample_period; 5321 struct hw_perf_event *hwc = &child_event->hw; 5322 5323 hwc->sample_period = sample_period; 5324 hwc->last_period = sample_period; 5325 5326 local64_set(&hwc->period_left, sample_period); 5327 } 5328 5329 child_event->overflow_handler = parent_event->overflow_handler; 5330 5331 /* 5332 * Link it up in the child's context: 5333 */ 5334 add_event_to_ctx(child_event, child_ctx); 5335 5336 /* 5337 * Get a reference to the parent filp - we will fput it 5338 * when the child event exits. This is safe to do because 5339 * we are in the parent and we know that the filp still 5340 * exists and has a nonzero count: 5341 */ 5342 atomic_long_inc(&parent_event->filp->f_count); 5343 5344 /* 5345 * Link this into the parent event's child list 5346 */ 5347 WARN_ON_ONCE(parent_event->ctx->parent_ctx); 5348 mutex_lock(&parent_event->child_mutex); 5349 list_add_tail(&child_event->child_list, &parent_event->child_list); 5350 mutex_unlock(&parent_event->child_mutex); 5351 5352 return child_event; 5353} 5354 5355static int inherit_group(struct perf_event *parent_event, 5356 struct task_struct *parent, 5357 struct perf_event_context *parent_ctx, 5358 struct task_struct *child, 5359 struct perf_event_context *child_ctx) 5360{ 5361 struct perf_event *leader; 5362 struct perf_event *sub; 5363 struct perf_event *child_ctr; 5364 5365 leader = inherit_event(parent_event, parent, parent_ctx, 5366 child, NULL, child_ctx); 5367 if (IS_ERR(leader)) 5368 return PTR_ERR(leader); 5369 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { 5370 child_ctr = inherit_event(sub, parent, parent_ctx, 5371 child, leader, child_ctx); 5372 if (IS_ERR(child_ctr)) 5373 return PTR_ERR(child_ctr); 5374 } 5375 return 0; 5376} 5377 5378static void sync_child_event(struct perf_event *child_event, 5379 struct task_struct *child) 5380{ 5381 struct perf_event *parent_event = child_event->parent; 5382 u64 child_val; 5383 5384 if (child_event->attr.inherit_stat) 5385 perf_event_read_event(child_event, child); 5386 5387 child_val = perf_event_count(child_event); 5388 5389 /* 5390 * Add back the child's count to the parent's count: 5391 */ 5392 atomic64_add(child_val, &parent_event->child_count); 5393 atomic64_add(child_event->total_time_enabled, 5394 &parent_event->child_total_time_enabled); 5395 atomic64_add(child_event->total_time_running, 5396 &parent_event->child_total_time_running); 5397 5398 /* 5399 * Remove this event from the parent's list 5400 */ 5401 WARN_ON_ONCE(parent_event->ctx->parent_ctx); 5402 mutex_lock(&parent_event->child_mutex); 5403 list_del_init(&child_event->child_list); 5404 mutex_unlock(&parent_event->child_mutex); 5405 5406 /* 5407 * Release the parent event, if this was the last 5408 * reference to it. 5409 */ 5410 fput(parent_event->filp); 5411} 5412 5413static void 5414__perf_event_exit_task(struct perf_event *child_event, 5415 struct perf_event_context *child_ctx, 5416 struct task_struct *child) 5417{ 5418 struct perf_event *parent_event; 5419 5420 perf_event_remove_from_context(child_event); 5421 5422 parent_event = child_event->parent; 5423 /* 5424 * It can happen that parent exits first, and has events 5425 * that are still around due to the child reference. These 5426 * events need to be zapped - but otherwise linger. 
5427 */ 5428 if (parent_event) { 5429 sync_child_event(child_event, child); 5430 free_event(child_event); 5431 } 5432} 5433 5434/* 5435 * When a child task exits, feed back event values to parent events. 5436 */ 5437void perf_event_exit_task(struct task_struct *child) 5438{ 5439 struct perf_event *child_event, *tmp; 5440 struct perf_event_context *child_ctx; 5441 unsigned long flags; 5442 5443 if (likely(!child->perf_event_ctxp)) { 5444 perf_event_task(child, NULL, 0); 5445 return; 5446 } 5447 5448 local_irq_save(flags); 5449 /* 5450 * We can't reschedule here because interrupts are disabled, 5451 * and either child is current or it is a task that can't be 5452 * scheduled, so we are now safe from rescheduling changing 5453 * our context. 5454 */ 5455 child_ctx = child->perf_event_ctxp; 5456 __perf_event_task_sched_out(child_ctx); 5457 5458 /* 5459 * Take the context lock here so that if find_get_context is 5460 * reading child->perf_event_ctxp, we wait until it has 5461 * incremented the context's refcount before we do put_ctx below. 5462 */ 5463 raw_spin_lock(&child_ctx->lock); 5464 child->perf_event_ctxp = NULL; 5465 /* 5466 * If this context is a clone; unclone it so it can't get 5467 * swapped to another process while we're removing all 5468 * the events from it. 5469 */ 5470 unclone_ctx(child_ctx); 5471 update_context_time(child_ctx); 5472 raw_spin_unlock_irqrestore(&child_ctx->lock, flags); 5473 5474 /* 5475 * Report the task dead after unscheduling the events so that we 5476 * won't get any samples after PERF_RECORD_EXIT. We can however still 5477 * get a few PERF_RECORD_READ events. 5478 */ 5479 perf_event_task(child, child_ctx, 0); 5480 5481 /* 5482 * We can recurse on the same lock type through: 5483 * 5484 * __perf_event_exit_task() 5485 * sync_child_event() 5486 * fput(parent_event->filp) 5487 * perf_release() 5488 * mutex_lock(&ctx->mutex) 5489 * 5490 * But since its the parent context it won't be the same instance. 5491 */ 5492 mutex_lock(&child_ctx->mutex); 5493 5494again: 5495 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, 5496 group_entry) 5497 __perf_event_exit_task(child_event, child_ctx, child); 5498 5499 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, 5500 group_entry) 5501 __perf_event_exit_task(child_event, child_ctx, child); 5502 5503 /* 5504 * If the last event was a group event, it will have appended all 5505 * its siblings to the list, but we obtained 'tmp' before that which 5506 * will still point to the list head terminating the iteration. 5507 */ 5508 if (!list_empty(&child_ctx->pinned_groups) || 5509 !list_empty(&child_ctx->flexible_groups)) 5510 goto again; 5511 5512 mutex_unlock(&child_ctx->mutex); 5513 5514 put_ctx(child_ctx); 5515} 5516 5517static void perf_free_event(struct perf_event *event, 5518 struct perf_event_context *ctx) 5519{ 5520 struct perf_event *parent = event->parent; 5521 5522 if (WARN_ON_ONCE(!parent)) 5523 return; 5524 5525 mutex_lock(&parent->child_mutex); 5526 list_del_init(&event->child_list); 5527 mutex_unlock(&parent->child_mutex); 5528 5529 fput(parent->filp); 5530 5531 perf_group_detach(event); 5532 list_del_event(event, ctx); 5533 free_event(event); 5534} 5535 5536/* 5537 * free an unexposed, unused context as created by inheritance by 5538 * init_task below, used by fork() in case of fail. 
5539 */ 5540void perf_event_free_task(struct task_struct *task) 5541{ 5542 struct perf_event_context *ctx = task->perf_event_ctxp; 5543 struct perf_event *event, *tmp; 5544 5545 if (!ctx) 5546 return; 5547 5548 mutex_lock(&ctx->mutex); 5549again: 5550 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 5551 perf_free_event(event, ctx); 5552 5553 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, 5554 group_entry) 5555 perf_free_event(event, ctx); 5556 5557 if (!list_empty(&ctx->pinned_groups) || 5558 !list_empty(&ctx->flexible_groups)) 5559 goto again; 5560 5561 mutex_unlock(&ctx->mutex); 5562 5563 put_ctx(ctx); 5564} 5565 5566static int 5567inherit_task_group(struct perf_event *event, struct task_struct *parent, 5568 struct perf_event_context *parent_ctx, 5569 struct task_struct *child, 5570 int *inherited_all) 5571{ 5572 int ret; 5573 struct perf_event_context *child_ctx = child->perf_event_ctxp; 5574 5575 if (!event->attr.inherit) { 5576 *inherited_all = 0; 5577 return 0; 5578 } 5579 5580 if (!child_ctx) { 5581 /* 5582 * This is executed from the parent task context, so 5583 * inherit events that have been marked for cloning. 5584 * First allocate and initialize a context for the 5585 * child. 5586 */ 5587 5588 child_ctx = kzalloc(sizeof(struct perf_event_context), 5589 GFP_KERNEL); 5590 if (!child_ctx) 5591 return -ENOMEM; 5592 5593 __perf_event_init_context(child_ctx, child); 5594 child->perf_event_ctxp = child_ctx; 5595 get_task_struct(child); 5596 } 5597 5598 ret = inherit_group(event, parent, parent_ctx, 5599 child, child_ctx); 5600 5601 if (ret) 5602 *inherited_all = 0; 5603 5604 return ret; 5605} 5606 5607 5608/* 5609 * Initialize the perf_event context in task_struct 5610 */ 5611int perf_event_init_task(struct task_struct *child) 5612{ 5613 struct perf_event_context *child_ctx, *parent_ctx; 5614 struct perf_event_context *cloned_ctx; 5615 struct perf_event *event; 5616 struct task_struct *parent = current; 5617 int inherited_all = 1; 5618 unsigned long flags; 5619 int ret = 0; 5620 5621 child->perf_event_ctxp = NULL; 5622 5623 mutex_init(&child->perf_event_mutex); 5624 INIT_LIST_HEAD(&child->perf_event_list); 5625 5626 if (likely(!parent->perf_event_ctxp)) 5627 return 0; 5628 5629 /* 5630 * If the parent's context is a clone, pin it so it won't get 5631 * swapped under us. 5632 */ 5633 parent_ctx = perf_pin_task_context(parent); 5634 5635 /* 5636 * No need to check if parent_ctx != NULL here; since we saw 5637 * it non-NULL earlier, the only reason for it to become NULL 5638 * is if we exit, and since we're currently in the middle of 5639 * a fork we can't be exiting at the same time. 5640 */ 5641 5642 /* 5643 * Lock the parent list. No need to lock the child - not PID 5644 * hashed yet and not running, so nobody can access it. 5645 */ 5646 mutex_lock(&parent_ctx->mutex); 5647 5648 /* 5649 * We dont have to disable NMIs - we are only looking at 5650 * the list, not manipulating it: 5651 */ 5652 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { 5653 ret = inherit_task_group(event, parent, parent_ctx, child, 5654 &inherited_all); 5655 if (ret) 5656 break; 5657 } 5658 5659 /* 5660 * We can't hold ctx->lock when iterating the ->flexible_group list due 5661 * to allocations, but we need to prevent rotation because 5662 * rotate_ctx() will change the list from interrupt context. 
5663 */ 5664 raw_spin_lock_irqsave(&parent_ctx->lock, flags); 5665 parent_ctx->rotate_disable = 1; 5666 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); 5667 5668 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 5669 ret = inherit_task_group(event, parent, parent_ctx, child, 5670 &inherited_all); 5671 if (ret) 5672 break; 5673 } 5674 5675 raw_spin_lock_irqsave(&parent_ctx->lock, flags); 5676 parent_ctx->rotate_disable = 0; 5677 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); 5678 5679 child_ctx = child->perf_event_ctxp; 5680 5681 if (child_ctx && inherited_all) { 5682 /* 5683 * Mark the child context as a clone of the parent 5684 * context, or of whatever the parent is a clone of. 5685 * Note that if the parent is a clone, it could get 5686 * uncloned at any point, but that doesn't matter 5687 * because the list of events and the generation 5688 * count can't have changed since we took the mutex. 5689 */ 5690 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); 5691 if (cloned_ctx) { 5692 child_ctx->parent_ctx = cloned_ctx; 5693 child_ctx->parent_gen = parent_ctx->parent_gen; 5694 } else { 5695 child_ctx->parent_ctx = parent_ctx; 5696 child_ctx->parent_gen = parent_ctx->generation; 5697 } 5698 get_ctx(child_ctx->parent_ctx); 5699 } 5700 5701 mutex_unlock(&parent_ctx->mutex); 5702 5703 perf_unpin_context(parent_ctx); 5704 5705 return ret; 5706} 5707 5708static void __init perf_event_init_all_cpus(void) 5709{ 5710 int cpu; 5711 struct perf_cpu_context *cpuctx; 5712 5713 for_each_possible_cpu(cpu) { 5714 cpuctx = &per_cpu(perf_cpu_context, cpu); 5715 mutex_init(&cpuctx->hlist_mutex); 5716 __perf_event_init_context(&cpuctx->ctx, NULL); 5717 } 5718} 5719 5720static void __cpuinit perf_event_init_cpu(int cpu) 5721{ 5722 struct perf_cpu_context *cpuctx; 5723 5724 cpuctx = &per_cpu(perf_cpu_context, cpu); 5725 5726 spin_lock(&perf_resource_lock); 5727 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 5728 spin_unlock(&perf_resource_lock); 5729 5730 mutex_lock(&cpuctx->hlist_mutex); 5731 if (cpuctx->hlist_refcount > 0) { 5732 struct swevent_hlist *hlist; 5733 5734 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 5735 WARN_ON_ONCE(!hlist); 5736 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 5737 } 5738 mutex_unlock(&cpuctx->hlist_mutex); 5739} 5740 5741#ifdef CONFIG_HOTPLUG_CPU 5742static void __perf_event_exit_cpu(void *info) 5743{ 5744 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 5745 struct perf_event_context *ctx = &cpuctx->ctx; 5746 struct perf_event *event, *tmp; 5747 5748 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 5749 __perf_event_remove_from_context(event); 5750 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 5751 __perf_event_remove_from_context(event); 5752} 5753static void perf_event_exit_cpu(int cpu) 5754{ 5755 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 5756 struct perf_event_context *ctx = &cpuctx->ctx; 5757 5758 mutex_lock(&cpuctx->hlist_mutex); 5759 swevent_hlist_release(cpuctx); 5760 mutex_unlock(&cpuctx->hlist_mutex); 5761 5762 mutex_lock(&ctx->mutex); 5763 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); 5764 mutex_unlock(&ctx->mutex); 5765} 5766#else 5767static inline void perf_event_exit_cpu(int cpu) { } 5768#endif 5769 5770static int __cpuinit 5771perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) 5772{ 5773 unsigned int cpu = (long)hcpu; 5774 5775 switch (action & ~CPU_TASKS_FROZEN) { 5776 
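	/*
	 * Set up per-cpu perf state before a CPU comes online (or when an
	 * offline attempt fails), and tear it down when bring-up is
	 * cancelled or the CPU is about to go down.
	 */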
5777 case CPU_UP_PREPARE: 5778 case CPU_DOWN_FAILED: 5779 perf_event_init_cpu(cpu); 5780 break; 5781 5782 case CPU_UP_CANCELED: 5783 case CPU_DOWN_PREPARE: 5784 perf_event_exit_cpu(cpu); 5785 break; 5786 5787 default: 5788 break; 5789 } 5790 5791 return NOTIFY_OK; 5792} 5793 5794/* 5795 * This has to have a higher priority than migration_notifier in sched.c. 5796 */ 5797static struct notifier_block __cpuinitdata perf_cpu_nb = { 5798 .notifier_call = perf_cpu_notify, 5799 .priority = 20, 5800}; 5801 5802void __init perf_event_init(void) 5803{ 5804 perf_event_init_all_cpus(); 5805 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 5806 (void *)(long)smp_processor_id()); 5807 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 5808 (void *)(long)smp_processor_id()); 5809 register_cpu_notifier(&perf_cpu_nb); 5810} 5811 5812static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, 5813 struct sysdev_class_attribute *attr, 5814 char *buf) 5815{ 5816 return sprintf(buf, "%d\n", perf_reserved_percpu); 5817} 5818 5819static ssize_t 5820perf_set_reserve_percpu(struct sysdev_class *class, 5821 struct sysdev_class_attribute *attr, 5822 const char *buf, 5823 size_t count) 5824{ 5825 struct perf_cpu_context *cpuctx; 5826 unsigned long val; 5827 int err, cpu, mpt; 5828 5829 err = strict_strtoul(buf, 10, &val); 5830 if (err) 5831 return err; 5832 if (val > perf_max_events) 5833 return -EINVAL; 5834 5835 spin_lock(&perf_resource_lock); 5836 perf_reserved_percpu = val; 5837 for_each_online_cpu(cpu) { 5838 cpuctx = &per_cpu(perf_cpu_context, cpu); 5839 raw_spin_lock_irq(&cpuctx->ctx.lock); 5840 mpt = min(perf_max_events - cpuctx->ctx.nr_events, 5841 perf_max_events - perf_reserved_percpu); 5842 cpuctx->max_pertask = mpt; 5843 raw_spin_unlock_irq(&cpuctx->ctx.lock); 5844 } 5845 spin_unlock(&perf_resource_lock); 5846 5847 return count; 5848} 5849 5850static ssize_t perf_show_overcommit(struct sysdev_class *class, 5851 struct sysdev_class_attribute *attr, 5852 char *buf) 5853{ 5854 return sprintf(buf, "%d\n", perf_overcommit); 5855} 5856 5857static ssize_t 5858perf_set_overcommit(struct sysdev_class *class, 5859 struct sysdev_class_attribute *attr, 5860 const char *buf, size_t count) 5861{ 5862 unsigned long val; 5863 int err; 5864 5865 err = strict_strtoul(buf, 10, &val); 5866 if (err) 5867 return err; 5868 if (val > 1) 5869 return -EINVAL; 5870 5871 spin_lock(&perf_resource_lock); 5872 perf_overcommit = val; 5873 spin_unlock(&perf_resource_lock); 5874 5875 return count; 5876} 5877 5878static SYSDEV_CLASS_ATTR( 5879 reserve_percpu, 5880 0644, 5881 perf_show_reserve_percpu, 5882 perf_set_reserve_percpu 5883 ); 5884 5885static SYSDEV_CLASS_ATTR( 5886 overcommit, 5887 0644, 5888 perf_show_overcommit, 5889 perf_set_overcommit 5890 ); 5891 5892static struct attribute *perfclass_attrs[] = { 5893 &attr_reserve_percpu.attr, 5894 &attr_overcommit.attr, 5895 NULL 5896}; 5897 5898static struct attribute_group perfclass_attr_group = { 5899 .attrs = perfclass_attrs, 5900 .name = "perf_events", 5901}; 5902 5903static int __init perf_event_sysfs_init(void) 5904{ 5905 return sysfs_create_group(&cpu_sysdev_class.kset.kobj, 5906 &perfclass_attr_group); 5907} 5908device_initcall(perf_event_sysfs_init); 5909
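/*
 * Usage sketch (illustrative only; the variable names below are
 * hypothetical): an in-kernel user can create a counter through
 * perf_event_create_kernel_counter(), e.g. a per-cpu cycle counter
 * bound to 'cpu':
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.size		= sizeof(attr),
 *		.pinned		= 1,
 *	};
 *	struct perf_event *event;
 *
 *	event = perf_event_create_kernel_counter(&attr, cpu, -1, NULL);
 *	if (IS_ERR(event))
 *		return PTR_ERR(event);
 *
 * Passing pid == -1 with a valid cpu selects the per-cpu context, while
 * a valid pid with cpu == -1 profiles that task; a non-NULL
 * overflow_handler is invoked instead of the default output path on
 * each overflow.
 */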