/**
 * @file buffer_sync.c
 *
 * @remark Copyright 2002 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 *
 * This is the core of the buffer management. Each
 * CPU buffer is processed and entered into the
 * global event buffer. Such processing is necessary
 * in several circumstances, mentioned below.
 *
 * The processing does the job of converting the
 * transitory EIP value into a persistent dentry/offset
 * value that the profiler can record at its leisure.
 *
 * See fs/dcookies.c for a description of the dentry/offset
 * objects.
 */

#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/dcookies.h>
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/sched.h>

#include "oprofile_stats.h"
#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"

static LIST_HEAD(dying_tasks);
static LIST_HEAD(dead_tasks);
static cpumask_t marked_cpus = CPU_MASK_NONE;
static DEFINE_SPINLOCK(task_mortuary);
static void process_task_mortuary(void);


/* Take ownership of the task struct and place it on the
 * list for processing. Only after two full buffer syncs
 * does the task eventually get freed, because by then
 * we are sure we will not reference it again.
 * Can be invoked from softirq via RCU callback due to
 * call_rcu() of the task struct, hence the _irqsave.
 */
static int task_free_notify(struct notifier_block * self, unsigned long val, void * data)
{
	unsigned long flags;
	struct task_struct * task = data;
	spin_lock_irqsave(&task_mortuary, flags);
	list_add(&task->tasks, &dying_tasks);
	spin_unlock_irqrestore(&task_mortuary, flags);
	return NOTIFY_OK;
}


/* The task is on its way out. A sync of the buffer means we can catch
 * any remaining samples for this task.
 */
static int task_exit_notify(struct notifier_block * self, unsigned long val, void * data)
{
	/* To avoid latency problems, we only process the current CPU,
	 * hoping that most samples for the task are on this CPU
	 */
	sync_buffer(raw_smp_processor_id());
	return 0;
}


/* The task is about to try a do_munmap(). We peek at what it's going to
 * do, and if it's an executable region, process the samples first, so
 * we don't lose any. This does not have to be exact, it's a QoI issue
 * only.
 */
static int munmap_notify(struct notifier_block * self, unsigned long val, void * data)
{
	unsigned long addr = (unsigned long)data;
	struct mm_struct * mm = current->mm;
	struct vm_area_struct * mpnt;

	down_read(&mm->mmap_sem);

	mpnt = find_vma(mm, addr);
	if (mpnt && mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) {
		up_read(&mm->mmap_sem);
		/* To avoid latency problems, we only process the current CPU,
		 * hoping that most samples for the task are on this CPU
		 */
		sync_buffer(raw_smp_processor_id());
		return 0;
	}

	up_read(&mm->mmap_sem);
	return 0;
}


/* We need to be told about new modules so we don't attribute to a previously
 * loaded module, or drop the samples on the floor.
 */
static int module_load_notify(struct notifier_block * self, unsigned long val, void * data)
{
#ifdef CONFIG_MODULES
	if (val != MODULE_STATE_COMING)
		return 0;

	mutex_lock(&buffer_mutex);
	add_event_entry(ESCAPE_CODE);
	add_event_entry(MODULE_LOADED_CODE);
	mutex_unlock(&buffer_mutex);
#endif
	return 0;
}


static struct notifier_block task_free_nb = {
	.notifier_call = task_free_notify,
};

static struct notifier_block task_exit_nb = {
	.notifier_call = task_exit_notify,
};

static struct notifier_block munmap_nb = {
	.notifier_call = munmap_notify,
};

static struct notifier_block module_load_nb = {
	.notifier_call = module_load_notify,
};


static void end_sync(void)
{
	end_cpu_work();
	/* make sure we don't leak task structs */
	process_task_mortuary();
	process_task_mortuary();
}


int sync_start(void)
{
	int err;

	start_cpu_work();

	err = task_handoff_register(&task_free_nb);
	if (err)
		goto out1;
	err = profile_event_register(PROFILE_TASK_EXIT, &task_exit_nb);
	if (err)
		goto out2;
	err = profile_event_register(PROFILE_MUNMAP, &munmap_nb);
	if (err)
		goto out3;
	err = register_module_notifier(&module_load_nb);
	if (err)
		goto out4;

out:
	return err;
out4:
	profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
out3:
	profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
out2:
	task_handoff_unregister(&task_free_nb);
out1:
	end_sync();
	goto out;
}


void sync_stop(void)
{
	unregister_module_notifier(&module_load_nb);
	profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
	profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
	task_handoff_unregister(&task_free_nb);
	end_sync();
}


/* Optimisation. We can manage without taking the dcookie sem
 * because we cannot reach this code without at least one
 * dcookie user still being registered (namely, the reader
 * of the event buffer). */
static inline unsigned long fast_get_dcookie(struct dentry * dentry,
	struct vfsmount * vfsmnt)
{
	unsigned long cookie;

	if (dentry->d_cookie)
		return (unsigned long)dentry;
	get_dcookie(dentry, vfsmnt, &cookie);
	return cookie;
}


/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
 * which corresponds loosely to "application name". This is
 * not strictly necessary but allows oprofile to associate
 * shared-library samples with particular applications
 */
static unsigned long get_exec_dcookie(struct mm_struct * mm)
{
	unsigned long cookie = NO_COOKIE;
	struct vm_area_struct * vma;

	if (!mm)
		goto out;

	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (!vma->vm_file)
			continue;
		if (!(vma->vm_flags & VM_EXECUTABLE))
			continue;
		cookie = fast_get_dcookie(vma->vm_file->f_path.dentry,
			vma->vm_file->f_path.mnt);
		break;
	}

out:
	return cookie;
}


/* Convert the EIP value of a sample into a persistent dentry/offset
 * pair that can then be added to the global event buffer. We make
 * sure to do this lookup before a mm->mmap modification happens so
 * we don't lose track.
 */
static unsigned long lookup_dcookie(struct mm_struct * mm, unsigned long addr, off_t * offset)
{
	unsigned long cookie = NO_COOKIE;
	struct vm_area_struct * vma;

	for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {

		if (addr < vma->vm_start || addr >= vma->vm_end)
			continue;

		if (vma->vm_file) {
			cookie = fast_get_dcookie(vma->vm_file->f_path.dentry,
				vma->vm_file->f_path.mnt);
			*offset = (vma->vm_pgoff << PAGE_SHIFT) + addr -
				vma->vm_start;
		} else {
			/* must be an anonymous map */
			*offset = addr;
		}

		break;
	}

	if (!vma)
		cookie = INVALID_COOKIE;

	return cookie;
}
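
/* Illustrative sketch of the conversion lookup_dcookie() does above,
 * with hypothetical numbers: for a file-backed vma with
 * vm_start = 0xb7e00000 and vm_pgoff = 0x10 (PAGE_SHIFT == 12), an EIP
 * of 0xb7e01234 yields offset (0x10 << 12) + 0xb7e01234 - 0xb7e00000
 * = 0x11234, a stable file-relative position the daemon can resolve
 * against the dcookie at its leisure.
 */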


static unsigned long last_cookie = INVALID_COOKIE;

static void add_cpu_switch(int i)
{
	add_event_entry(ESCAPE_CODE);
	add_event_entry(CPU_SWITCH_CODE);
	add_event_entry(i);
	last_cookie = INVALID_COOKIE;
}

static void add_kernel_ctx_switch(unsigned int in_kernel)
{
	add_event_entry(ESCAPE_CODE);
	if (in_kernel)
		add_event_entry(KERNEL_ENTER_SWITCH_CODE);
	else
		add_event_entry(KERNEL_EXIT_SWITCH_CODE);
}

static void
add_user_ctx_switch(struct task_struct const * task, unsigned long cookie)
{
	add_event_entry(ESCAPE_CODE);
	add_event_entry(CTX_SWITCH_CODE);
	add_event_entry(task->pid);
	add_event_entry(cookie);
	/* Another code for daemon back-compat */
	add_event_entry(ESCAPE_CODE);
	add_event_entry(CTX_TGID_CODE);
	add_event_entry(task->tgid);
}


static void add_cookie_switch(unsigned long cookie)
{
	add_event_entry(ESCAPE_CODE);
	add_event_entry(COOKIE_SWITCH_CODE);
	add_event_entry(cookie);
}


static void add_trace_begin(void)
{
	add_event_entry(ESCAPE_CODE);
	add_event_entry(TRACE_BEGIN_CODE);
}


static void add_sample_entry(unsigned long offset, unsigned long event)
{
	add_event_entry(offset);
	add_event_entry(event);
}
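
/* Rough sketch of what one synced batch can look like in the event
 * buffer, assuming two userspace samples from the same binary image:
 *
 *   ESCAPE_CODE  CPU_SWITCH_CODE     <cpu>
 *   ESCAPE_CODE  CTX_SWITCH_CODE     <pid> <cookie>
 *   ESCAPE_CODE  CTX_TGID_CODE       <tgid>
 *   ESCAPE_CODE  COOKIE_SWITCH_CODE  <cookie>
 *   <offset> <event>
 *   <offset> <event>
 *
 * Escaped records carry the switch notes; everything else is plain
 * offset/event pairs emitted by add_sample_entry() above.
 */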


static int add_us_sample(struct mm_struct * mm, struct op_sample * s)
{
	unsigned long cookie;
	off_t offset;

	cookie = lookup_dcookie(mm, s->eip, &offset);

	if (cookie == INVALID_COOKIE) {
		atomic_inc(&oprofile_stats.sample_lost_no_mapping);
		return 0;
	}

	if (cookie != last_cookie) {
		add_cookie_switch(cookie);
		last_cookie = cookie;
	}

	add_sample_entry(offset, s->event);

	return 1;
}


/* Add a sample to the global event buffer. If possible the
 * sample is converted into a persistent dentry/offset pair
 * for later lookup from userspace.
 */
static int
add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel)
{
	if (in_kernel) {
		add_sample_entry(s->eip, s->event);
		return 1;
	} else if (mm) {
		return add_us_sample(mm, s);
	} else {
		atomic_inc(&oprofile_stats.sample_lost_no_mm);
	}
	return 0;
}


static void release_mm(struct mm_struct * mm)
{
	if (!mm)
		return;
	up_read(&mm->mmap_sem);
	mmput(mm);
}


static struct mm_struct * take_tasks_mm(struct task_struct * task)
{
	struct mm_struct * mm = get_task_mm(task);
	if (mm)
		down_read(&mm->mmap_sem);
	return mm;
}


static inline int is_code(unsigned long val)
{
	return val == ESCAPE_CODE;
}


/* "acquire" as many cpu buffer slots as we can */
static unsigned long get_slots(struct oprofile_cpu_buffer * b)
{
	unsigned long head = b->head_pos;
	unsigned long tail = b->tail_pos;

	/*
	 * Subtle. This resets the persistent last_task
	 * and in_kernel values used for switching notes.
	 * BUT, there is a small window between reading
	 * head_pos, and this call, that means samples
	 * can appear at the new head position, but not
	 * be prefixed with the notes for switching
	 * kernel mode or a task switch. This small hole
	 * can lead to mis-attribution or samples where
	 * we don't know if it's in the kernel or not,
	 * at the start of an event buffer.
	 */
	cpu_buffer_reset(b);

	if (head >= tail)
		return head - tail;

	return head + (b->buffer_size - tail);
}


static void increment_tail(struct oprofile_cpu_buffer * b)
{
	unsigned long new_tail = b->tail_pos + 1;

	rmb();

	if (new_tail < b->buffer_size)
		b->tail_pos = new_tail;
	else
		b->tail_pos = 0;
}
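
/* Quick sanity check on the ring arithmetic above (hypothetical values):
 * with buffer_size = 16, head_pos = 3 and tail_pos = 10, the head has
 * wrapped, so get_slots() returns 3 + (16 - 10) = 9 pending samples;
 * increment_tail() then walks 10, 11, ... 15, 0, 1, 2 to consume them.
 */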


/* Move tasks along towards death. Any tasks on dead_tasks
 * will definitely have no remaining references in any
 * CPU buffers at this point, because we use two lists,
 * and to have reached the list, it must have gone through
 * one full sync already.
 */
static void process_task_mortuary(void)
{
	unsigned long flags;
	LIST_HEAD(local_dead_tasks);
	struct task_struct * task;
	struct task_struct * ttask;

	spin_lock_irqsave(&task_mortuary, flags);

	list_splice_init(&dead_tasks, &local_dead_tasks);
	list_splice_init(&dying_tasks, &dead_tasks);

	spin_unlock_irqrestore(&task_mortuary, flags);

	list_for_each_entry_safe(task, ttask, &local_dead_tasks, tasks) {
		list_del(&task->tasks);
		free_task(task);
	}
}


static void mark_done(int cpu)
{
	int i;

	cpu_set(cpu, marked_cpus);

	for_each_online_cpu(i) {
		if (!cpu_isset(i, marked_cpus))
			return;
	}

	/* All CPUs have been processed at least once,
	 * we can process the mortuary once
	 */
	process_task_mortuary();

	cpus_clear(marked_cpus);
}


typedef enum {
	sb_bt_ignore = -2,
	sb_buffer_start,
	sb_bt_start,
	sb_sample_start,
} sync_buffer_state;

/* Sync one of the CPU's buffers into the global event buffer.
 * Here we need to go through each batch of samples punctuated
 * by context switch notes, taking the task's mmap_sem and doing
 * lookup in task->mm->mmap to convert EIP into dcookie/offset
 * value.
 */
void sync_buffer(int cpu)
{
	struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[cpu];
	struct mm_struct *mm = NULL;
	struct task_struct * new;
	unsigned long cookie = 0;
	int in_kernel = 1;
	unsigned int i;
	sync_buffer_state state = sb_buffer_start;
	unsigned long available;

	mutex_lock(&buffer_mutex);

	add_cpu_switch(cpu);

	/* Remember, only we can modify tail_pos */

	available = get_slots(cpu_buf);

	for (i = 0; i < available; ++i) {
		struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos];

		if (is_code(s->eip)) {
			if (s->event <= CPU_IS_KERNEL) {
				/* kernel/userspace switch */
				in_kernel = s->event;
				if (state == sb_buffer_start)
					state = sb_sample_start;
				add_kernel_ctx_switch(s->event);
			} else if (s->event == CPU_TRACE_BEGIN) {
				state = sb_bt_start;
				add_trace_begin();
			} else {
				struct mm_struct * oldmm = mm;

				/* userspace context switch */
				new = (struct task_struct *)s->event;

				release_mm(oldmm);
				mm = take_tasks_mm(new);
				if (mm != oldmm)
					cookie = get_exec_dcookie(mm);
				add_user_ctx_switch(new, cookie);
			}
		} else {
			if (state >= sb_bt_start &&
			    !add_sample(mm, s, in_kernel)) {
				if (state == sb_bt_start) {
					state = sb_bt_ignore;
					atomic_inc(&oprofile_stats.bt_lost_no_mapping);
				}
			}
		}

		increment_tail(cpu_buf);
	}
	release_mm(mm);

	mark_done(cpu);

	mutex_unlock(&buffer_mutex);
}