/*
 * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/i386/thread_status.h */
#include <kern/thread.h>
#include <mach/thread_status.h>

typedef x86_saved_state_t savearea_t;

#include <stdarg.h>
#include <string.h>
#include <sys/malloc.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/dtrace.h>
#include <sys/dtrace_impl.h>
#include <libkern/OSAtomic.h>
#include <kern/thread_call.h>
#include <kern/task.h>
#include <kern/sched_prim.h>
#include <miscfs/devfs/devfs.h>
#include <mach/vm_param.h>

extern dtrace_id_t      dtrace_probeid_error;   /* special ERROR probe */

void
dtrace_probe_error(dtrace_state_t *state, dtrace_epid_t epid, int which,
    int fault, int fltoffs, uint64_t illval)
{
    /*
     * When the error probe fires, stash away "illval" here; it is
     * retrieved via a special case in DIF_VARIABLE_ARG.
     */
    state->dts_arg_error_illval = illval;
    dtrace_probe( dtrace_probeid_error, (uint64_t)(uintptr_t)state, epid, which, fault, fltoffs );
}

/*
 * Atomicity and synchronization
 */
void
dtrace_membar_producer(void)
{
	__asm__ volatile("sfence");
}

void
dtrace_membar_consumer(void)
{
	__asm__ volatile("lfence");
}
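
/*
 * Note: dtrace_membar_producer() orders prior stores (sfence) and
 * dtrace_membar_consumer() orders prior loads (lfence). The common DTrace
 * code is expected to pair these around publication of shared buffer state.
 */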

/*
 * Interrupt manipulation
 * XXX dtrace_getipl() can be called from probe context.
 */
int
dtrace_getipl(void)
{
	/*
	 * XXX Drat, get_interrupt_level is MACH_KERNEL_PRIVATE
	 * in osfmk/kern/cpu_data.h
	 */
	/* return get_interrupt_level(); */
	return (ml_at_interrupt_context() ? 1: 0);
}

/*
 * MP coordination
 */

extern void mp_broadcast(
       void (*action_func)(void *),
       void *arg);

typedef struct xcArg {
	processorid_t cpu;
	dtrace_xcall_t f;
	void *arg;
} xcArg_t;

static void
xcRemote( void *foo )
{
	xcArg_t *pArg = (xcArg_t *)foo;

	if ( pArg->cpu == CPU->cpu_id || pArg->cpu == DTRACE_CPUALL ) {
		(pArg->f)(pArg->arg);
	}
}

/*
 * dtrace_xcall() is not called from probe context.
 */
void
dtrace_xcall(processorid_t cpu, dtrace_xcall_t f, void *arg)
{
	xcArg_t xcArg;

	xcArg.cpu = cpu;
	xcArg.f = f;
	xcArg.arg = arg;

	mp_broadcast( xcRemote, (void *)&xcArg);
}
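
/*
 * Illustrative usage (hypothetical callback name shown for clarity): the
 * machine-independent dtrace_sync() typically issues
 *
 *	dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)some_sync_func, NULL);
 *
 * which runs some_sync_func(NULL) on every CPU, while passing a specific
 * cpu id runs it only there. mp_broadcast() dispatches xcRemote() to all
 * CPUs and xcRemote() performs the cpu-id filtering.
 */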

/*
 * Runtime and ABI
 */
extern greg_t
dtrace_getfp(void)
{
	return (greg_t)__builtin_frame_address(0);
}

uint64_t
dtrace_getreg(struct regs *savearea, uint_t reg)
{
	boolean_t is64Bit = proc_is64bit(current_proc());
	x86_saved_state_t *regs = (x86_saved_state_t *)savearea;

	if (is64Bit) {
		/* beyond register SS */
		if (reg > x86_SAVED_STATE64_COUNT - 1) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
			return (0);
		}
		return ((uint64_t *)(&(regs->ss_64.gs)))[reg];
	} else {
		/* beyond register SS */
		if (reg > x86_SAVED_STATE32_COUNT - 1) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
			return (0);
		}
		return (uint64_t)((unsigned int *)(&(regs->ss_32.gs)))[reg];
	}

}
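
/*
 * Note on the indexing above: the general-purpose registers in
 * x86_saved_state64 / x86_saved_state32 are assumed to be laid out
 * contiguously beginning at the gs field, so the DIF register number "reg"
 * can be used as a flat array index into the saved state.
 */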

#define RETURN_OFFSET 4
#define RETURN_OFFSET64 8
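
/*
 * The user stack walkers below assume the conventional frame layout built by
 * the function prologue (push %ebp / mov %esp, %ebp, or the 64-bit
 * equivalent): for a frame pointer sp,
 *
 *	sp + 0                    saved frame pointer (next sp)
 *	sp + RETURN_OFFSET[64]    return address (next pc)
 *
 * so each step of the walk is, in sketch form:
 *
 *	pc = dtrace_fuword32/64(sp + RETURN_OFFSET[64]);
 *	sp = dtrace_fuword32/64(sp);
 */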

static int
dtrace_getustack_common(uint64_t *pcstack, int pcstack_limit, user_addr_t pc,
    user_addr_t sp)
{
#if 0
	volatile uint16_t *flags =
	    (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;

	uintptr_t oldcontext = lwp->lwp_oldcontext; /* XXX signal stack crawl */
	size_t s1, s2;
#endif
	int ret = 0;
	boolean_t is64Bit = proc_is64bit(current_proc());

	ASSERT(pcstack == NULL || pcstack_limit > 0);

#if 0 /* XXX signal stack crawl */
	if (p->p_model == DATAMODEL_NATIVE) {
		s1 = sizeof (struct frame) + 2 * sizeof (long);
		s2 = s1 + sizeof (siginfo_t);
	} else {
		s1 = sizeof (struct frame32) + 3 * sizeof (int);
		s2 = s1 + sizeof (siginfo32_t);
	}
#endif

	while (pc != 0) {
		ret++;
		if (pcstack != NULL) {
			*pcstack++ = (uint64_t)pc;
			pcstack_limit--;
			if (pcstack_limit <= 0)
				break;
		}

		if (sp == 0)
			break;

#if 0 /* XXX signal stack crawl */
		if (oldcontext == sp + s1 || oldcontext == sp + s2) {
			if (p->p_model == DATAMODEL_NATIVE) {
				ucontext_t *ucp = (ucontext_t *)oldcontext;
				greg_t *gregs = ucp->uc_mcontext.gregs;

				sp = dtrace_fulword(&gregs[REG_FP]);
				pc = dtrace_fulword(&gregs[REG_PC]);

				oldcontext = dtrace_fulword(&ucp->uc_link);
			} else {
				ucontext32_t *ucp = (ucontext32_t *)oldcontext;
				greg32_t *gregs = ucp->uc_mcontext.gregs;

				sp = dtrace_fuword32(&gregs[EBP]);
				pc = dtrace_fuword32(&gregs[EIP]);

				oldcontext = dtrace_fuword32(&ucp->uc_link);
			}
		}
		else
#endif
		{
			if (is64Bit) {
				pc = dtrace_fuword64((sp + RETURN_OFFSET64));
				sp = dtrace_fuword64(sp);
			} else {
				pc = dtrace_fuword32((sp + RETURN_OFFSET));
				sp = dtrace_fuword32(sp);
			}
		}

#if 0 /* XXX */
		/*
		 * This is totally bogus:  if we faulted, we're going to clear
		 * the fault and break.  This is to deal with the apparently
		 * broken Java stacks on x86.
		 */
		if (*flags & CPU_DTRACE_FAULT) {
			*flags &= ~CPU_DTRACE_FAULT;
			break;
		}
#endif
	}

	return (ret);
}

void
dtrace_getupcstack(uint64_t *pcstack, int pcstack_limit)
{
	thread_t thread = current_thread();
	x86_saved_state_t *regs;
	user_addr_t pc, sp, fp;
	volatile uint16_t *flags =
	    (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
	int n;
	boolean_t is64Bit = proc_is64bit(current_proc());

	if (*flags & CPU_DTRACE_FAULT)
		return;

	if (pcstack_limit <= 0)
		return;

	/*
	 * If there's no user context we still need to zero the stack.
	 */
	if (thread == NULL)
		goto zero;

	regs = (x86_saved_state_t *)find_user_regs(thread);
	if (regs == NULL)
		goto zero;

	*pcstack++ = (uint64_t)proc_selfpid();
	pcstack_limit--;

	if (pcstack_limit <= 0)
		return;

	if (is64Bit) {
		pc = regs->ss_64.isf.rip;
		sp = regs->ss_64.isf.rsp;
		fp = regs->ss_64.rbp;
	} else {
		pc = regs->ss_32.eip;
		sp = regs->ss_32.uesp;
		fp = regs->ss_32.ebp;
	}

	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_ENTRY)) {
		*pcstack++ = (uint64_t)pc;
		pcstack_limit--;
		if (pcstack_limit <= 0)
			return;

		if (is64Bit)
			pc = dtrace_fuword64(sp);
		else
			pc = dtrace_fuword32(sp);
	}

	/*
	 * Note that unlike ppc, the x86 code does not use
	 * CPU_DTRACE_USTACK_FP. This is because x86 always
	 * traces from the fp, even in syscall/profile/fbt
	 * providers.
	 */
	n = dtrace_getustack_common(pcstack, pcstack_limit, pc, fp);
	ASSERT(n >= 0);
	ASSERT(n <= pcstack_limit);

	pcstack += n;
	pcstack_limit -= n;

zero:
	while (pcstack_limit-- > 0)
		*pcstack++ = 0;
}

int
dtrace_getustackdepth(void)
{
	thread_t thread = current_thread();
	x86_saved_state_t *regs;
	user_addr_t pc, sp, fp;
	int n = 0;
	boolean_t is64Bit = proc_is64bit(current_proc());

	if (thread == NULL)
		return 0;

	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
		return (-1);

	regs = (x86_saved_state_t *)find_user_regs(thread);
	if (regs == NULL)
		return 0;

	if (is64Bit) {
		pc = regs->ss_64.isf.rip;
		sp = regs->ss_64.isf.rsp;
		fp = regs->ss_64.rbp;
	} else {
		pc = regs->ss_32.eip;
		sp = regs->ss_32.uesp;
		fp = regs->ss_32.ebp;
	}

	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_ENTRY)) {
		n++;

		if (is64Bit)
			pc = dtrace_fuword64(sp);
		else
			pc = dtrace_fuword32(sp);
	}

	/*
	 * Note that unlike ppc, the x86 code does not use
	 * CPU_DTRACE_USTACK_FP. This is because x86 always
	 * traces from the fp, even in syscall/profile/fbt
	 * providers.
	 */

	n += dtrace_getustack_common(NULL, 0, pc, fp);

	return (n);
}

void
dtrace_getufpstack(uint64_t *pcstack, uint64_t *fpstack, int pcstack_limit)
{
	thread_t thread = current_thread();
	savearea_t *regs;
	user_addr_t pc, sp;
	volatile uint16_t *flags =
	    (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
#if 0
	uintptr_t oldcontext;
	size_t s1, s2;
#endif
	boolean_t is64Bit = proc_is64bit(current_proc());

	if (*flags & CPU_DTRACE_FAULT)
		return;

	if (pcstack_limit <= 0)
		return;

	/*
	 * If there's no user context we still need to zero the stack.
	 */
	if (thread == NULL)
		goto zero;

	regs = (savearea_t *)find_user_regs(thread);
	if (regs == NULL)
		goto zero;

	*pcstack++ = (uint64_t)proc_selfpid();
	pcstack_limit--;

	if (pcstack_limit <= 0)
		return;

	if (is64Bit) {
		pc = regs->ss_64.isf.rip;
		sp = regs->ss_64.rbp;
	} else {
		pc = regs->ss_32.eip;
		sp = regs->ss_32.ebp;
	}

#if 0 /* XXX signal stack crawl */
	oldcontext = lwp->lwp_oldcontext;

	if (p->p_model == DATAMODEL_NATIVE) {
		s1 = sizeof (struct frame) + 2 * sizeof (long);
		s2 = s1 + sizeof (siginfo_t);
	} else {
		s1 = sizeof (struct frame32) + 3 * sizeof (int);
		s2 = s1 + sizeof (siginfo32_t);
	}
#endif

	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_ENTRY)) {
		*pcstack++ = (uint64_t)pc;
		*fpstack++ = 0;
		pcstack_limit--;
		if (pcstack_limit <= 0)
			return;

		if (is64Bit)
			pc = dtrace_fuword64(sp);
		else
			pc = dtrace_fuword32(sp);
	}

	while (pc != 0) {
		*pcstack++ = (uint64_t)pc;
		*fpstack++ = sp;
		pcstack_limit--;
		if (pcstack_limit <= 0)
			break;

		if (sp == 0)
			break;

#if 0 /* XXX signal stack crawl */
		if (oldcontext == sp + s1 || oldcontext == sp + s2) {
			if (p->p_model == DATAMODEL_NATIVE) {
				ucontext_t *ucp = (ucontext_t *)oldcontext;
				greg_t *gregs = ucp->uc_mcontext.gregs;

				sp = dtrace_fulword(&gregs[REG_FP]);
				pc = dtrace_fulword(&gregs[REG_PC]);

				oldcontext = dtrace_fulword(&ucp->uc_link);
			} else {
				ucontext32_t *ucp = (ucontext32_t *)oldcontext;
				greg32_t *gregs = ucp->uc_mcontext.gregs;

				sp = dtrace_fuword32(&gregs[EBP]);
				pc = dtrace_fuword32(&gregs[EIP]);

				oldcontext = dtrace_fuword32(&ucp->uc_link);
			}
		}
		else
#endif
		{
			if (is64Bit) {
				pc = dtrace_fuword64((sp + RETURN_OFFSET64));
				sp = dtrace_fuword64(sp);
			} else {
				pc = dtrace_fuword32((sp + RETURN_OFFSET));
				sp = dtrace_fuword32(sp);
			}
		}

#if 0 /* XXX */
		/*
		 * This is totally bogus:  if we faulted, we're going to clear
		 * the fault and break.  This is to deal with the apparently
		 * broken Java stacks on x86.
		 */
		if (*flags & CPU_DTRACE_FAULT) {
			*flags &= ~CPU_DTRACE_FAULT;
			break;
		}
#endif
	}

zero:
	while (pcstack_limit-- > 0)
		*pcstack++ = 0;
}

void
dtrace_getpcstack(pc_t *pcstack, int pcstack_limit, int aframes,
		  uint32_t *intrpc)
{
	struct frame *fp = (struct frame *)dtrace_getfp();
	struct frame *nextfp, *minfp, *stacktop;
	int depth = 0;
	int last = 0;
	uintptr_t pc;
	uintptr_t caller = CPU->cpu_dtrace_caller;
	int on_intr;

	if ((on_intr = CPU_ON_INTR(CPU)) != 0)
		stacktop = (struct frame *)dtrace_get_cpu_int_stack_top();
	else
		stacktop = (struct frame *)(dtrace_get_kernel_stack(current_thread()) + KERNEL_STACK_SIZE);

	minfp = fp;

	aframes++;

	if (intrpc != NULL && depth < pcstack_limit)
		pcstack[depth++] = (pc_t)intrpc;

	while (depth < pcstack_limit) {
		nextfp = *(struct frame **)fp;
		pc = *(uintptr_t *)(((uintptr_t)fp) + RETURN_OFFSET);

		if (nextfp <= minfp || nextfp >= stacktop) {
			if (on_intr) {
				/*
				 * Hop from interrupt stack to thread stack.
				 */
				vm_offset_t kstack_base = dtrace_get_kernel_stack(current_thread());

				minfp = (struct frame *)kstack_base;
				stacktop = (struct frame *)(kstack_base + KERNEL_STACK_SIZE);

				on_intr = 0;
				continue;
			}
			/*
			 * This is the last frame we can process; indicate
			 * that we should return after processing this frame.
			 */
			last = 1;
		}

		if (aframes > 0) {
			if (--aframes == 0 && caller != 0) {
				/*
				 * We've just run out of artificial frames,
				 * and we have a valid caller -- fill it in
				 * now.
				 */
				ASSERT(depth < pcstack_limit);
				pcstack[depth++] = (pc_t)caller;
				caller = 0;
			}
		} else {
			if (depth < pcstack_limit)
				pcstack[depth++] = (pc_t)pc;
		}

		if (last) {
			while (depth < pcstack_limit)
				pcstack[depth++] = 0;
			return;
		}

		fp = nextfp;
		minfp = fp;
	}
}

struct frame {
	struct frame *backchain;
	uintptr_t retaddr;
};
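
/*
 * dtrace_getarg() below walks this two-word frame chain. Assuming the i386
 * cdecl convention, a callee's arguments sit immediately above its return
 * address, so &fp[1] (i.e. fp + sizeof (struct frame)) is the address of the
 * first marshalled argument; both the dtrace_invop() path and the direct
 * dtrace_probe() path index argument words from there.
 */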

uint64_t
dtrace_getarg(int arg, int aframes)
{
	uint64_t val;
	struct frame *fp = (struct frame *)dtrace_getfp();
	uintptr_t *stack;
	uintptr_t pc;
	int i;

	for (i = 1; i <= aframes; i++) {
		fp = fp->backchain;
		pc = fp->retaddr;

		if (pc == (uintptr_t)dtrace_invop_callsite) {
			/*
			 * If we pass through the invalid op handler, we will
			 * use the pointer that it passed to the stack as the
			 * second argument to dtrace_invop() as the pointer to
			 * the frame we're hunting for.
			 */

			stack = (uintptr_t *)&fp[1]; /* Find marshalled arguments */
			fp = (struct frame *)stack[1]; /* Grab *second* argument */
			stack = (uintptr_t *)&fp[1]; /* Find marshalled arguments */
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
			val = (uint64_t)(stack[arg]);
			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
			return val;
		}
	}

	/*
	 * Arrive here when the provider has called dtrace_probe() directly.
	 */
	stack = (uintptr_t *)&fp[1]; /* Find marshalled arguments */
	stack++; /* Advance past probeID */

	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
	val = *(((uint64_t *)stack) + arg); /* dtrace_probe arguments arg0 .. arg4 are 64 bits wide */
	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);

	return (val);
}

/*
 * Load/Store Safety
 */
void
dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit))
{
	/*
	 * "base" is the smallest toxic address in the range, "limit" is the first
	 * VALID address greater than "base".
	 */
	func(0x0, VM_MIN_KERNEL_ADDRESS);
	func(VM_MAX_KERNEL_ADDRESS + 1, ~(uintptr_t)0);
}

extern boolean_t pmap_valid_page(ppnum_t pn);

boolean_t
dtxnu_is_RAM_page(ppnum_t pn)
{
	return pmap_valid_page(pn);
}