/**
 * @file cpu_buffer.c
 *
 * @remark Copyright 2002-2009 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 * @author Barry Kasindorf <barry.kasindorf@amd.com>
 * @author Robert Richter <robert.richter@amd.com>
 *
 * Each CPU has a local buffer that stores PC value/event
 * pairs. We also log context switches when we notice them.
 * Eventually each CPU's buffer is processed into the global
 * event buffer by sync_buffer().
 *
 * We use a local buffer for two reasons: an NMI or similar
 * interrupt cannot synchronise, and high sampling rates
 * would lead to catastrophic global synchronisation if
 * a global buffer were used.
 */

#include <linux/sched.h>
#include <linux/oprofile.h>
#include <linux/errno.h>

#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"
#include "oprof.h"

#define OP_BUFFER_FLAGS	0

static struct ring_buffer *op_ring_buffer;
DEFINE_PER_CPU(struct oprofile_cpu_buffer, op_cpu_buffer);

static void wq_sync_buffer(struct work_struct *work);

#define DEFAULT_TIMER_EXPIRE (HZ / 10)
static int work_enabled;

unsigned long oprofile_get_cpu_buffer_size(void)
{
	return oprofile_cpu_buffer_size;
}

void oprofile_cpu_buffer_inc_smpl_lost(void)
{
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);

	cpu_buf->sample_lost_overflow++;
}

void free_cpu_buffers(void)
{
	if (op_ring_buffer)
		ring_buffer_free(op_ring_buffer);
	op_ring_buffer = NULL;
}

#define RB_EVENT_HDR_SIZE 4

int alloc_cpu_buffers(void)
{
	int i;

	unsigned long buffer_size = oprofile_cpu_buffer_size;
	unsigned long byte_size = buffer_size * (sizeof(struct op_sample) +
						 RB_EVENT_HDR_SIZE);

	op_ring_buffer = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
	if (!op_ring_buffer)
		goto fail;

	for_each_possible_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

		b->last_task = NULL;
		b->last_is_kernel = -1;
		b->tracing = 0;
		b->buffer_size = buffer_size;
		b->sample_received = 0;
		b->sample_lost_overflow = 0;
		b->backtrace_aborted = 0;
		b->sample_invalid_eip = 0;
		b->cpu = i;
		INIT_DELAYED_WORK(&b->work, wq_sync_buffer);
	}
	return 0;

fail:
	free_cpu_buffers();
	return -ENOMEM;
}

void start_cpu_work(void)
{
	int i;

	work_enabled = 1;

	for_each_online_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

		/*
		 * Spread the work by 1 jiffy per cpu so they don't all
		 * fire at once.
		 */
		schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
	}
}

void end_cpu_work(void)
{
	int i;

	work_enabled = 0;

	for_each_online_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

		cancel_delayed_work(&b->work);
	}
}
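/*
 * Illustrative sketch only, not part of the original file: the expected
 * lifecycle of the per-cpu buffers as driven by the oprofile core. The
 * function name and the exact call ordering are assumptions made for
 * the example.
 */
static int __maybe_unused example_buffer_lifecycle(void)
{
	if (alloc_cpu_buffers())	/* returns -ENOMEM on failure */
		return -ENOMEM;

	start_cpu_work();	/* begin periodic syncing into the event buffer */
	/* ... profiling runs; samples arrive from interrupt context ... */
	end_cpu_work();		/* stop the work items before tearing down */

	free_cpu_buffers();
	return 0;
}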
/*
 * This function prepares the cpu buffer to write a sample.
 *
 * Struct op_entry is used during operations on the ring buffer while
 * struct op_sample contains the data that is stored in the ring
 * buffer. The entry may be passed in uninitialized. The function
 * reserves a data array of the length given by size. Use
 * op_cpu_buffer_write_commit() after preparing the sample. On error
 * a null pointer is returned, otherwise the pointer to the sample.
 */
struct op_sample
*op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size)
{
	entry->event = ring_buffer_lock_reserve
		(op_ring_buffer, sizeof(struct op_sample) +
		 size * sizeof(entry->sample->data[0]));
	if (!entry->event)
		return NULL;
	entry->sample = ring_buffer_event_data(entry->event);
	entry->size = size;
	entry->data = entry->sample->data;

	return entry->sample;
}

int op_cpu_buffer_write_commit(struct op_entry *entry)
{
	return ring_buffer_unlock_commit(op_ring_buffer, entry->event);
}

struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
{
	struct ring_buffer_event *e;
	e = ring_buffer_consume(op_ring_buffer, cpu, NULL, NULL);
	if (!e)
		return NULL;

	entry->event = e;
	entry->sample = ring_buffer_event_data(e);
	entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample))
		/ sizeof(entry->sample->data[0]);
	entry->data = entry->sample->data;
	return entry->sample;
}

unsigned long op_cpu_buffer_entries(int cpu)
{
	return ring_buffer_entries_cpu(op_ring_buffer, cpu);
}

static int
op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
	    int is_kernel, struct task_struct *task)
{
	struct op_entry entry;
	struct op_sample *sample;
	unsigned long flags;
	int size;

	flags = 0;

	if (backtrace)
		flags |= TRACE_BEGIN;

	/* notice a switch from user->kernel or vice versa */
	is_kernel = !!is_kernel;
	if (cpu_buf->last_is_kernel != is_kernel) {
		cpu_buf->last_is_kernel = is_kernel;
		flags |= KERNEL_CTX_SWITCH;
		if (is_kernel)
			flags |= IS_KERNEL;
	}

	/* notice a task switch */
	if (cpu_buf->last_task != task) {
		cpu_buf->last_task = task;
		flags |= USER_CTX_SWITCH;
	}

	if (!flags)
		/* nothing to do */
		return 0;

	if (flags & USER_CTX_SWITCH)
		size = 1;
	else
		size = 0;

	sample = op_cpu_buffer_write_reserve(&entry, size);
	if (!sample)
		return -ENOMEM;

	sample->eip = ESCAPE_CODE;
	sample->event = flags;

	if (size)
		op_cpu_buffer_add_data(&entry, (unsigned long)task);

	op_cpu_buffer_write_commit(&entry);

	return 0;
}

static inline int
op_add_sample(struct oprofile_cpu_buffer *cpu_buf,
	      unsigned long pc, unsigned long event)
{
	struct op_entry entry;
	struct op_sample *sample;

	sample = op_cpu_buffer_write_reserve(&entry, 0);
	if (!sample)
		return -ENOMEM;

	sample->eip = pc;
	sample->event = event;

	return op_cpu_buffer_write_commit(&entry);
}
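/*
 * Illustrative sketch only, not part of the original file: a writer is
 * expected to pair op_cpu_buffer_write_reserve() with
 * op_cpu_buffer_write_commit() as shown here. The two data words are an
 * assumption made for the example.
 */
static int __maybe_unused example_write_sample(unsigned long d0,
					       unsigned long d1)
{
	struct op_entry entry;
	struct op_sample *sample;

	/* reserve an op_sample plus room for two extra data words */
	sample = op_cpu_buffer_write_reserve(&entry, 2);
	if (!sample)
		return -ENOMEM;

	sample->eip = ESCAPE_CODE;	/* mark as an extended record */
	sample->event = 0;

	op_cpu_buffer_add_data(&entry, d0);
	op_cpu_buffer_add_data(&entry, d1);

	return op_cpu_buffer_write_commit(&entry);
}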
/*
 * This must be safe from any context.
 *
 * is_kernel is needed because on some architectures you cannot
 * tell if you are in kernel or user space simply by looking at
 * pc. We tag this in the buffer by generating kernel enter/exit
 * events whenever is_kernel changes.
 */
static int
log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
	   unsigned long backtrace, int is_kernel, unsigned long event)
{
	cpu_buf->sample_received++;

	if (pc == ESCAPE_CODE) {
		cpu_buf->sample_invalid_eip++;
		return 0;
	}

	if (op_add_code(cpu_buf, backtrace, is_kernel, current))
		goto fail;

	if (op_add_sample(cpu_buf, pc, event))
		goto fail;

	return 1;

fail:
	cpu_buf->sample_lost_overflow++;
	return 0;
}

static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf)
{
	cpu_buf->tracing = 1;
}

static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf)
{
	cpu_buf->tracing = 0;
}

static inline void
__oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
			  unsigned long event, int is_kernel)
{
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);
	unsigned long backtrace = oprofile_backtrace_depth;

	/*
	 * if log_sample() fails we can't backtrace since we lost the
	 * source of this event
	 */
	if (!log_sample(cpu_buf, pc, backtrace, is_kernel, event))
		/* failed */
		return;

	if (!backtrace)
		return;

	oprofile_begin_trace(cpu_buf);
	oprofile_ops.backtrace(regs, backtrace);
	oprofile_end_trace(cpu_buf);
}

void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
			     unsigned long event, int is_kernel)
{
	__oprofile_add_ext_sample(pc, regs, event, is_kernel);
}

void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
{
	int is_kernel;
	unsigned long pc;

	if (likely(regs)) {
		is_kernel = !user_mode(regs);
		pc = profile_pc(regs);
	} else {
		is_kernel = 0;    /* This value will not be used */
		pc = ESCAPE_CODE; /* as this causes an early return. */
	}

	__oprofile_add_ext_sample(pc, regs, event, is_kernel);
}
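/*
 * Illustrative sketch only, not part of the original file: a typical
 * architecture PMU overflow handler passes the interrupted register
 * state straight to oprofile_add_sample(). The counter index is an
 * assumption made for the example.
 */
static void __maybe_unused example_pmu_overflow_handler(struct pt_regs *regs)
{
	unsigned long counter = 0;	/* hypothetical overflowed counter */

	/* safe from NMI context; only touches this cpu's local buffer */
	oprofile_add_sample(regs, counter);
}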
/*
 * Add samples with data to the ring buffer.
 *
 * Use oprofile_add_data(&entry, val) to add data and
 * oprofile_write_commit(&entry) to commit the sample.
 */
void
oprofile_write_reserve(struct op_entry *entry, struct pt_regs * const regs,
		       unsigned long pc, int code, int size)
{
	struct op_sample *sample;
	int is_kernel = !user_mode(regs);
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);

	cpu_buf->sample_received++;

	/* no backtraces for samples with data */
	if (op_add_code(cpu_buf, 0, is_kernel, current))
		goto fail;

	sample = op_cpu_buffer_write_reserve(entry, size + 2);
	if (!sample)
		goto fail;
	sample->eip = ESCAPE_CODE;
	sample->event = 0;		/* no flags */

	op_cpu_buffer_add_data(entry, code);
	op_cpu_buffer_add_data(entry, pc);

	return;

fail:
	entry->event = NULL;
	cpu_buf->sample_lost_overflow++;
}

int oprofile_add_data(struct op_entry *entry, unsigned long val)
{
	if (!entry->event)
		return 0;
	return op_cpu_buffer_add_data(entry, val);
}

int oprofile_add_data64(struct op_entry *entry, u64 val)
{
	if (!entry->event)
		return 0;
	if (op_cpu_buffer_get_size(entry) < 2)
		/*
		 * the function returns 0 to indicate a buffer that is
		 * too small, even if there is some space left
		 */
		return 0;
	if (!op_cpu_buffer_add_data(entry, (u32)val))
		return 0;
	return op_cpu_buffer_add_data(entry, (u32)(val >> 32));
}

int oprofile_write_commit(struct op_entry *entry)
{
	if (!entry->event)
		return -EINVAL;
	return op_cpu_buffer_write_commit(entry);
}

void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
{
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);
	log_sample(cpu_buf, pc, 0, is_kernel, event);
}

void oprofile_add_trace(unsigned long pc)
{
	struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);

	if (!cpu_buf->tracing)
		return;

	/*
	 * a broken frame can give an eip with the same value as an
	 * escape code, abort the trace if we get it
	 */
	if (pc == ESCAPE_CODE)
		goto fail;

	if (op_add_sample(cpu_buf, pc, 0))
		goto fail;

	return;
fail:
	cpu_buf->tracing = 0;
	cpu_buf->backtrace_aborted++;
	return;
}

/*
 * This serves to avoid cpu buffer overflow, and makes sure
 * the task mortuary progresses.
 *
 * By using schedule_delayed_work_on and then schedule_delayed_work
 * we guarantee this will stay on the correct cpu.
 */
static void wq_sync_buffer(struct work_struct *work)
{
	struct oprofile_cpu_buffer *b =
		container_of(work, struct oprofile_cpu_buffer, work.work);
	if (b->cpu != smp_processor_id()) {
		printk(KERN_DEBUG "WQ on CPU%d, prefer CPU%d\n",
		       smp_processor_id(), b->cpu);

		if (!cpu_online(b->cpu)) {
			cancel_delayed_work(&b->work);
			return;
		}
	}
	sync_buffer(b->cpu);

	/* don't re-add the work if we're shutting down */
	if (work_enabled)
		schedule_delayed_work(&b->work, DEFAULT_TIMER_EXPIRE);
}
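/*
 * Illustrative sketch only, not part of the original file: how a model
 * driver might log one sample carrying three extra data words with the
 * oprofile_write_reserve()/oprofile_add_data()/oprofile_write_commit()
 * API defined above. The code value 13 and the data values are
 * assumptions made for the example.
 */
static void __maybe_unused example_log_sample_with_data(struct pt_regs *regs,
							unsigned long pc)
{
	struct op_entry entry;

	/* reserves size + 2 words: code, pc, then 3 data words */
	oprofile_write_reserve(&entry, regs, pc, 13, 3);

	oprofile_add_data(&entry, 0xabc);		/* one word */
	oprofile_add_data64(&entry, 0x123456789ULL);	/* two words */

	/* commit is a no-op returning -EINVAL if the reserve failed */
	oprofile_write_commit(&entry);
}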