/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* #pragma ident	"@(#)fbt.c	1.15	05/09/19 SMI" */

#ifdef KERNEL
#ifndef _KERNEL
#define _KERNEL /* Solaris vs. Darwin */
#endif
#endif

#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */
#include <kern/thread.h>
#include <mach/thread_status.h>
#include <mach/vm_param.h>
#include <mach-o/loader.h>
#include <mach-o/nlist.h>
#include <libkern/kernel_mach_header.h>
#include <libkern/OSAtomic.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <miscfs/devfs/devfs.h>

#include <sys/dtrace.h>
#include <sys/dtrace_impl.h>
#include <sys/fbt.h>

#include <sys/dtrace_glue.h>

#define DTRACE_INVOP_NOP_SKIP 1
#define DTRACE_INVOP_MOVL_ESP_EBP 10
#define DTRACE_INVOP_MOVL_ESP_EBP_SKIP 2
#define DTRACE_INVOP_MOV_RSP_RBP 11
#define DTRACE_INVOP_MOV_RSP_RBP_SKIP 3
#define DTRACE_INVOP_POP_RBP 12
#define DTRACE_INVOP_POP_RBP_SKIP 1
#define DTRACE_INVOP_LEAVE_SKIP 1

#define	FBT_PUSHL_EBP			0x55
#define	FBT_MOVL_ESP_EBP0_V0	0x8b
#define	FBT_MOVL_ESP_EBP1_V0	0xec
#define	FBT_MOVL_ESP_EBP0_V1	0x89
#define	FBT_MOVL_ESP_EBP1_V1	0xe5

#define	FBT_PUSH_RBP			0x55
#define	FBT_REX_RSP_RBP			0x48
#define	FBT_MOV_RSP_RBP0		0x89
#define	FBT_MOV_RSP_RBP1		0xe5
#define	FBT_POP_RBP				0x5d

#define	FBT_POPL_EBP			0x5d
#define	FBT_RET					0xc3
#define	FBT_RET_IMM16			0xc2
#define	FBT_LEAVE				0xc9
#define	FBT_JMP_SHORT_REL		0xeb /* Jump short, relative, displacement relative to next instr. */
#define	FBT_JMP_NEAR_REL		0xe9 /* Jump near, relative, displacement relative to next instr. */
#define	FBT_JMP_FAR_ABS			0xea /* Jump far, absolute, address given in operand */
#define FBT_RET_LEN				1
#define FBT_RET_IMM16_LEN		3
#define	FBT_JMP_SHORT_REL_LEN	2
#define	FBT_JMP_NEAR_REL_LEN	5
#define	FBT_JMP_FAR_ABS_LEN		5
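
/*
 * For reference, the byte patterns the opcode defines above match in a
 * typical x86_64 prologue/epilogue (a sketch; compilers may schedule other
 * instructions in between):
 *
 *	55			pushq	%rbp		FBT_PUSH_RBP
 *	48 89 e5	movq	%rsp,%rbp	FBT_REX_RSP_RBP, FBT_MOV_RSP_RBP0/1
 *	...
 *	5d			popq	%rbp		FBT_POP_RBP (or c9, leave)
 *	c3			retq				FBT_RET
 */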

#define	FBT_PATCHVAL			0xf0
#define FBT_AFRAMES_ENTRY		7
#define FBT_AFRAMES_RETURN		6

#define	FBT_ENTRY	"entry"
#define	FBT_RETURN	"return"
#define	FBT_ADDR2NDX(addr)	((((uintptr_t)(addr)) >> 4) & fbt_probetab_mask)
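
/*
 * Worked example of the hash above, assuming a hypothetical
 * fbt_probetab_mask of 0x7fff: a patchpoint at 0xffffff8000234560 lands in
 * bucket (0xffffff8000234560 >> 4) & 0x7fff == 0x3456.  Shifting off the
 * low four bits spreads nearby patchpoints across buckets while keeping
 * the index cheap to compute in the trap path.
 */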

extern dtrace_provider_id_t	fbt_id;
extern fbt_probe_t		**fbt_probetab;
extern int			fbt_probetab_mask;

extern int			gIgnoreFBTBlacklist; /* From fbt_init */

kern_return_t fbt_perfCallback(int, x86_saved_state_t *, uintptr_t *, __unused int);

/*
 * Critical routines that must not be probed. PR_5221096, PR_5379018.
 * The blacklist must be kept in alphabetic order for purposes of bsearch().
 */

static const char * critical_blacklist[] =
{
	"bcopy_phys",
	"console_cpu_alloc",
	"console_cpu_free",
	"cpu_IA32e_disable",
	"cpu_IA32e_enable",
	"cpu_NMI_interrupt",
	"cpu_control",
	"cpu_data_alloc",
	"cpu_desc_init",
	"cpu_desc_init64",
	"cpu_desc_load",
	"cpu_desc_load64",
	"cpu_exit_wait",
	"cpu_info",
	"cpu_info_count",
	"cpu_init",
	"cpu_interrupt",
	"cpu_machine_init",
	"cpu_mode_init",
	"cpu_processor_alloc",
	"cpu_processor_free",
	"cpu_signal_handler",
	"cpu_sleep",
	"cpu_start",
	"cpu_subtype",
	"cpu_thread_alloc",
	"cpu_thread_halt",
	"cpu_thread_init",
	"cpu_threadtype",
	"cpu_to_processor",
	"cpu_topology_sort",
	"cpu_topology_start_cpu",
	"cpu_type",
	"cpuid_cpu_display",
	"cpuid_extfeatures",
	"handle_pending_TLB_flushes",
	"hw_compare_and_store",
	"machine_idle_cstate",
	"mca_cpu_alloc",
	"mca_cpu_init",
	"ml_nofault_copy",
	"pmap_cpu_alloc",
	"pmap_cpu_free",
	"pmap_cpu_high_map_vaddr",
	"pmap_cpu_high_shared_remap",
	"pmap_cpu_init",
	"register_cpu_setup_func",
	"unregister_cpu_setup_func",
	"vstart"
};
#define CRITICAL_BLACKLIST_COUNT (sizeof(critical_blacklist)/sizeof(critical_blacklist[0]))

/*
 * The transitive closure of entry points that can be reached from probe context.
 * (Apart from routines whose names begin with dtrace_).
 */
static const char * probe_ctx_closure[] =
{
	"Debugger",
	"IS_64BIT_PROCESS",
	"OSCompareAndSwap",
	"absolutetime_to_microtime",
	"act_set_astbsd",
	"ast_pending",
	"clock_get_calendar_nanotime_nowait",
	"copyin",
	"copyin_user",
	"copyinstr",
	"copyout",
	"copyoutstr",
	"cpu_number",
	"current_proc",
	"current_processor",
	"current_task",
	"current_thread",
	"debug_enter",
	"find_user_regs",
	"flush_tlb64",
	"get_bsdtask_info",
	"get_bsdthread_info",
	"hw_atomic_and",
	"kauth_cred_get",
	"kauth_getgid",
	"kauth_getuid",
	"kernel_preempt_check",
	"mach_absolute_time",
	"max_valid_stack_address",
	"ml_at_interrupt_context",
	"ml_phys_write_byte_64",
	"ml_phys_write_half_64",
	"ml_phys_write_word_64",
	"ml_set_interrupts_enabled",
	"panic",
	"pmap64_pde",
	"pmap64_pdpt",
	"pmap_find_phys",
	"pmap_get_mapwindow",
	"pmap_pde",
	"pmap_pte",
	"pmap_put_mapwindow",
	"pmap_valid_page",
	"prf",
	"proc_is64bit",
	"proc_selfname",
	"psignal_lock",
	"rtc_nanotime_load",
	"rtc_nanotime_read",
	"sdt_getargdesc",
	"strlcpy",
	"sync_iss_to_iks_unconditionally",
	"systrace_stub",
	"timer_grab"
};
#define PROBE_CTX_CLOSURE_COUNT (sizeof(probe_ctx_closure)/sizeof(probe_ctx_closure[0]))


static int _cmp(const void *a, const void *b)
{
	return strncmp((const char *)a, *(const char **)b, strlen((const char *)a) + 1);
}
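
/*
 * Note: passing strlen(key) + 1 to strncmp() above makes the terminating
 * NUL participate in the comparison, so _cmp() demands an exact match.
 * E.g. the key "cpu_info" compares less than the table entry
 * "cpu_info_count" because '\0' sorts below '_'.
 */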

static const void * bsearch(
	register const void *key,
	const void *base0,
	size_t nmemb,
	register size_t size,
	register int (*compar)(const void *, const void *)) {

	register const char *base = base0;
	register size_t lim;
	register int cmp;
	register const void *p;

	for (lim = nmemb; lim != 0; lim >>= 1) {
		p = base + (lim >> 1) * size;
		cmp = (*compar)(key, p);
		if (cmp == 0)
			return p;
		if (cmp > 0) {	/* key > p: move right */
			base = (const char *)p + size;
			lim--;
		}		/* else move left */
	}
	return (NULL);
}
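
/*
 * Usage sketch (illustrative only, not compiled): how the blacklist checks
 * below pair this bsearch() with _cmp().  The element size passed is the
 * size of one table slot, i.e. of a string pointer, since both tables are
 * arrays of const char *.
 */
#if 0
static int
on_critical_blacklist(const char *name) /* hypothetical helper */
{
	return (bsearch(name, critical_blacklist, CRITICAL_BLACKLIST_COUNT,
	    sizeof(critical_blacklist[0]), _cmp) != NULL);
}
#endif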

/*
 * Module validation
 */
static int
is_module_valid(struct modctl* ctl)
{
	ASSERT(!MOD_FBT_PROBES_PROVIDED(ctl));
	ASSERT(!MOD_FBT_INVALID(ctl));

	if (0 == ctl->mod_address || 0 == ctl->mod_size) {
		return FALSE;
	}

	if (0 == ctl->mod_loaded) {
		return FALSE;
	}

	if (strstr(ctl->mod_modname, "CHUD") != NULL)
		return FALSE;

	/*
	 * If the user sets this, trust they know what they are doing.
	 */
	if (gIgnoreFBTBlacklist)	/* per boot-arg set in fbt_init() */
		return TRUE;

	/*
	 * These drivers control low-level functions that, when traced,
	 * often cause problems in the sleep/wake paths as well as in
	 * critical debug and panic paths.
	 * If somebody really wants to drill in on one of these kexts,
	 * they can override the blacklist with the boot-arg above.
	 */

	if (strstr(ctl->mod_modname, "AppleACPIEC") != NULL)
		return FALSE;

	if (strstr(ctl->mod_modname, "AppleACPIPlatform") != NULL)
		return FALSE;

	if (strstr(ctl->mod_modname, "AppleRTC") != NULL)
		return FALSE;

	if (strstr(ctl->mod_modname, "IOACPIFamily") != NULL)
		return FALSE;

	if (strstr(ctl->mod_modname, "AppleIntelCPUPowerManagement") != NULL)
		return FALSE;

	if (strstr(ctl->mod_modname, "AppleProfile") != NULL)
		return FALSE;

	if (strstr(ctl->mod_modname, "AppleIntelProfile") != NULL)
		return FALSE;

	if (strstr(ctl->mod_modname, "AppleEFI") != NULL)
		return FALSE;

	return TRUE;
}

/*
 * FBT probe name validation
 */
static int
is_symbol_valid(const char* name)
{
	/*
	 * If the user set this, trust they know what they are doing.
	 */
	if (gIgnoreFBTBlacklist)
		return TRUE;

	if (LIT_STRNSTART(name, "dtrace_") && !LIT_STRNSTART(name, "dtrace_safe_")) {
		/*
		 * Anything beginning with "dtrace_" may be called
		 * from probe context unless it explicitly indicates
		 * that it won't be called from probe context by
		 * using the prefix "dtrace_safe_".
		 */
		return FALSE;
	}

	if (LIT_STRNSTART(name, "fasttrap_") ||
	    LIT_STRNSTART(name, "fuword") ||
	    LIT_STRNSTART(name, "suword") ||
	    LIT_STRNEQL(name, "sprlock") ||
	    LIT_STRNEQL(name, "sprunlock") ||
	    LIT_STRNEQL(name, "uread") ||
	    LIT_STRNEQL(name, "uwrite")) {
		return FALSE; /* Fasttrap inner-workings. */
	}

	if (LIT_STRNSTART(name, "dsmos_"))
		return FALSE; /* Don't Steal Mac OS X! */

	if (LIT_STRNSTART(name, "_dtrace"))
		return FALSE; /* Shims in dtrace.c */

	if (LIT_STRNSTART(name, "chud"))
		return FALSE; /* Professional courtesy. */

	if (LIT_STRNSTART(name, "hibernate_"))
		return FALSE; /* Let sleeping dogs lie. */

	if (LIT_STRNEQL(name, "_ZNK6OSData14getBytesNoCopyEv"))
		return FALSE; /* OSData::getBytesNoCopy, IOHibernateSystemWake path */

	if (LIT_STRNEQL(name, "_ZN9IOService14newTemperatureElPS_") || /* IOService::newTemperature */
	    LIT_STRNEQL(name, "_ZN9IOService26temperatureCriticalForZoneEPS_")) { /* IOService::temperatureCriticalForZone */
		return FALSE; /* Per the fire code */
	}

	/*
	 * Place no probes (illegal instructions) in the exception handling path!
	 */
	if (LIT_STRNEQL(name, "t_invop") ||
	    LIT_STRNEQL(name, "enter_lohandler") ||
	    LIT_STRNEQL(name, "lo_alltraps") ||
	    LIT_STRNEQL(name, "kernel_trap") ||
	    LIT_STRNEQL(name, "interrupt") ||
	    LIT_STRNEQL(name, "i386_astintr")) {
		return FALSE;
	}

	if (LIT_STRNEQL(name, "current_thread") ||
	    LIT_STRNEQL(name, "ast_pending") ||
	    LIT_STRNEQL(name, "fbt_perfCallback") ||
	    LIT_STRNEQL(name, "machine_thread_get_kern_state") ||
	    LIT_STRNEQL(name, "get_threadtask") ||
	    LIT_STRNEQL(name, "ml_set_interrupts_enabled") ||
	    LIT_STRNEQL(name, "dtrace_invop") ||
	    LIT_STRNEQL(name, "fbt_invop") ||
	    LIT_STRNEQL(name, "sdt_invop") ||
	    LIT_STRNEQL(name, "max_valid_stack_address")) {
		return FALSE;
	}

	/*
	 * Voodoo.
	 */
	if (LIT_STRNSTART(name, "machine_stack_") ||
	    LIT_STRNSTART(name, "mapping_") ||
	    LIT_STRNEQL(name, "tmrCvt") ||

	    LIT_STRNSTART(name, "tsc_") ||

	    LIT_STRNSTART(name, "pmCPU") ||
	    LIT_STRNEQL(name, "pmKextRegister") ||
	    LIT_STRNEQL(name, "pmMarkAllCPUsOff") ||
	    LIT_STRNEQL(name, "pmSafeMode") ||
	    LIT_STRNEQL(name, "pmTimerSave") ||
	    LIT_STRNEQL(name, "pmTimerRestore") ||
	    LIT_STRNEQL(name, "pmUnRegister") ||
	    LIT_STRNSTART(name, "pms") ||
	    LIT_STRNEQL(name, "power_management_init") ||
	    LIT_STRNSTART(name, "usimple_") ||
	    LIT_STRNSTART(name, "lck_spin_lock") ||
	    LIT_STRNSTART(name, "lck_spin_unlock") ||

	    LIT_STRNSTART(name, "rtc_") ||
	    LIT_STRNSTART(name, "_rtc_") ||
	    LIT_STRNSTART(name, "rtclock_") ||
	    LIT_STRNSTART(name, "clock_") ||
	    LIT_STRNSTART(name, "absolutetime_to_") ||
	    LIT_STRNEQL(name, "setPop") ||
	    LIT_STRNEQL(name, "nanoseconds_to_absolutetime") ||
	    LIT_STRNEQL(name, "nanotime_to_absolutetime") ||

	    LIT_STRNSTART(name, "etimer_") ||

	    LIT_STRNSTART(name, "commpage_") ||
	    LIT_STRNSTART(name, "pmap_") ||
	    LIT_STRNSTART(name, "ml_") ||
	    LIT_STRNSTART(name, "PE_") ||
	    LIT_STRNEQL(name, "kprintf") ||
	    LIT_STRNSTART(name, "lapic_") ||
	    LIT_STRNSTART(name, "act_machine") ||
	    LIT_STRNSTART(name, "acpi_") ||
	    LIT_STRNSTART(name, "pal_")) {
		return FALSE;
	}

	/*
	 * Avoid machine_ routines. PR_5346750.
	 */
	if (LIT_STRNSTART(name, "machine_"))
		return FALSE;

	if (LIT_STRNEQL(name, "handle_pending_TLB_flushes"))
		return FALSE;

	/*
	 * Place no probes on critical routines. PR_5221096
	 */
	if (bsearch(name, critical_blacklist, CRITICAL_BLACKLIST_COUNT, sizeof(name), _cmp) != NULL)
		return FALSE;

	/*
	 * Place no probes that could be hit in probe context.
	 */
	if (bsearch(name, probe_ctx_closure, PROBE_CTX_CLOSURE_COUNT, sizeof(name), _cmp) != NULL) {
		return FALSE;
	}

	/*
	 * Place no probes that could be hit on the way to the debugger.
	 */
	if (LIT_STRNSTART(name, "kdp_") ||
	    LIT_STRNSTART(name, "kdb_") ||
	    LIT_STRNSTART(name, "kdbg_") ||
	    LIT_STRNSTART(name, "kdebug_") ||
	    LIT_STRNSTART(name, "kernel_debug") ||
	    LIT_STRNSTART(name, "debug_") ||
	    LIT_STRNEQL(name, "Debugger") ||
	    LIT_STRNEQL(name, "Call_DebuggerC") ||
	    LIT_STRNEQL(name, "lock_debugger") ||
	    LIT_STRNEQL(name, "unlock_debugger") ||
	    LIT_STRNEQL(name, "packA") ||
	    LIT_STRNEQL(name, "unpackA") ||
	    LIT_STRNEQL(name, "SysChoked")) {
		return FALSE;
	}

	/*
	 * Place no probes that could be hit on the way to a panic.
	 */
	if (NULL != strstr(name, "panic_") ||
	    LIT_STRNEQL(name, "panic") ||
	    LIT_STRNEQL(name, "preemption_underflow_panic")) {
		return FALSE;
	}

	return TRUE;
}

int
fbt_invop(uintptr_t addr, uintptr_t *state, uintptr_t rval)
{
	fbt_probe_t *fbt = fbt_probetab[FBT_ADDR2NDX(addr)];

	for (; fbt != NULL; fbt = fbt->fbtp_hashnext) {
		if ((uintptr_t)fbt->fbtp_patchpoint == addr) {

			if (fbt->fbtp_roffset == 0) {
				x86_saved_state64_t *regs = (x86_saved_state64_t *)state;

				CPU->cpu_dtrace_caller = *(uintptr_t *)(((uintptr_t)(regs->isf.rsp))+sizeof(uint64_t)); // 8(%rsp)
				/* 64-bit ABI, arguments passed in registers. */
				dtrace_probe(fbt->fbtp_id, regs->rdi, regs->rsi, regs->rdx, regs->rcx, regs->r8);
				CPU->cpu_dtrace_caller = 0;
			} else {

				dtrace_probe(fbt->fbtp_id, fbt->fbtp_roffset, rval, 0, 0, 0);
				CPU->cpu_dtrace_caller = 0;
			}

			return (fbt->fbtp_rval);
		}
	}

	return (0);
}
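
/*
 * Stack picture at the entry patchpoint, for the cpu_dtrace_caller fetch
 * above (a sketch): the trap is taken on the patched "mov %rsp,%rbp", i.e.
 * after the "push %rbp" has already executed, so at probe time:
 *
 *	8(%rsp)	return address in the caller of the probed function
 *	0(%rsp)	the caller's saved %rbp
 */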

#define IS_USER_TRAP(regs) (regs && (((regs)->isf.cs & 3) != 0))
#define T_INVALID_OPCODE 6
#define FBT_EXCEPTION_CODE T_INVALID_OPCODE
#define T_PREEMPT       255

kern_return_t
fbt_perfCallback(
	int			trapno,
	x86_saved_state_t	*tagged_regs,
	uintptr_t		*lo_spp,
	__unused int		unused2)
{
	kern_return_t retval = KERN_FAILURE;
	x86_saved_state64_t *saved_state = saved_state64(tagged_regs);

	if (FBT_EXCEPTION_CODE == trapno && !IS_USER_TRAP(saved_state)) {
		boolean_t oldlevel;
		uint64_t rsp_probe, fp, delta = 0;
		uintptr_t old_sp;
		uint32_t *pDst;
		int emul;

		oldlevel = ml_set_interrupts_enabled(FALSE);

		/* Calculate where the stack pointer was when the probe instruction "fired." */
		rsp_probe = saved_state->isf.rsp; /* Easy, x86_64 establishes this value in idt64.s */

		__asm__ volatile(
			"Ldtrace_invop_callsite_pre_label:\n"
			".data\n"
			".private_extern _dtrace_invop_callsite_pre\n"
			"_dtrace_invop_callsite_pre:\n"
			"  .quad Ldtrace_invop_callsite_pre_label\n"
			".text\n"
				 );

		emul = dtrace_invop(saved_state->isf.rip, (uintptr_t *)saved_state, saved_state->rax);

		__asm__ volatile(
			"Ldtrace_invop_callsite_post_label:\n"
			".data\n"
			".private_extern _dtrace_invop_callsite_post\n"
			"_dtrace_invop_callsite_post:\n"
			"  .quad Ldtrace_invop_callsite_post_label\n"
			".text\n"
				 );

		switch (emul) {
		case DTRACE_INVOP_NOP:
			saved_state->isf.rip += DTRACE_INVOP_NOP_SKIP;	/* Skip over the patched NOP (planted by sdt). */
			retval = KERN_SUCCESS;
			break;

		case DTRACE_INVOP_MOV_RSP_RBP:
			saved_state->rbp = rsp_probe;							/* Emulate patched mov %rsp,%rbp */
			saved_state->isf.rip += DTRACE_INVOP_MOV_RSP_RBP_SKIP;	/* Skip over the bytes of the patched mov %rsp,%rbp */
			retval = KERN_SUCCESS;
			break;

		case DTRACE_INVOP_POP_RBP:
		case DTRACE_INVOP_LEAVE:
			/*
			 * Emulate the first micro-op of the patched leave: mov %rbp,%rsp.
			 * fp points just below the return address slot for the target's ret,
			 * and at the slot holding the frame pointer saved by the target's prologue.
			 */
			fp = saved_state->rbp;
			/*
			 * Emulate the second micro-op of the patched leave: the pop %rbp.
			 * The savearea rbp is set to the frame of the caller of the target.
			 * The *live* %rsp will be adjusted below for the pop increment(s).
			 */
			saved_state->rbp = *(uint64_t *)fp;
			/* Skip over the patched leave. */
			saved_state->isf.rip += DTRACE_INVOP_LEAVE_SKIP;
			/*
			 * Lift the stack to account for the emulated leave.
			 * First account for the words local to this frame
			 * (in the DTRACE_INVOP_POP_RBP case this is zero).
			 */
			delta = ((uint32_t *)fp) - ((uint32_t *)rsp_probe); /* delta is a (32-bit) *word* increment */
			/* Account for popping off the rbp (just accomplished by the emulation above). */
			delta += 2;
			saved_state->isf.rsp += (delta << 2);
			/* Obtain the stack pointer recorded by the trampolines. */
			old_sp = *lo_spp;
			/* Shift the contents of the stack. */
			for (pDst = (uint32_t *)fp;
			     pDst > (((uint32_t *)old_sp));
			     pDst--)
				*pDst = pDst[-delta];

			/* Track the stack lift in "saved_state". */
			saved_state = (x86_saved_state64_t *) (((uintptr_t)saved_state) + (delta << 2));
			/* Adjust the stack pointer utilized by the trampolines. */
			*lo_spp = old_sp + (delta << 2);

			retval = KERN_SUCCESS;
			break;

		default:
			retval = KERN_FAILURE;
			break;
		}
		saved_state->isf.trapno = T_PREEMPT; /* Avoid call to i386_astintr()! */

		ml_set_interrupts_enabled(oldlevel);
	}

	return retval;
}
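
/*
 * Worked example of the stack lift in the leave emulation above, with
 * hypothetical numbers: suppose the probed function keeps two 8-byte locals
 * below the saved frame pointer, so at the patched "leave" fp is
 * rsp_probe + 16.  Then delta starts at 16/4 == 4 (32-bit) words, and
 * delta += 2 adds the two words of the %rbp just popped by the emulation,
 * giving 6.  The interrupt frame, savearea, and trampoline stack pointer
 * are all shifted up by 6 << 2 == 24 bytes, exactly the space the emulated
 * "mov %rbp,%rsp; pop %rbp" sequence reclaimed.
 */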

/*ARGSUSED*/
static void
__provide_probe_64(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, char *modname, char* symbolName, machine_inst_t* symbolStart)
{
	unsigned int			j;
	unsigned int			doenable = 0;
	dtrace_id_t			thisid;

	fbt_probe_t *newfbt, *retfbt, *entryfbt;
	machine_inst_t *instr, *limit, theInstr, i1, i2, i3;
	int size;

	for (j = 0, instr = symbolStart, theInstr = 0;
	     (j < 4) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)(instr + 2));
	     j++) {
		theInstr = instr[0];
		if (theInstr == FBT_PUSH_RBP || theInstr == FBT_RET || theInstr == FBT_RET_IMM16)
			break;

		if ((size = dtrace_instr_size(instr)) <= 0)
			break;

		instr += size;
	}

	if (theInstr != FBT_PUSH_RBP)
		return;

	i1 = instr[1];
	i2 = instr[2];
	i3 = instr[3];

	limit = (machine_inst_t *)instrHigh;

	if (i1 == FBT_REX_RSP_RBP && i2 == FBT_MOV_RSP_RBP0 && i3 == FBT_MOV_RSP_RBP1) {
		instr += 1; /* Advance to the mov %rsp,%rbp */
		theInstr = i1;
	} else {
		return;
	}
#if 0
	else {
		/*
		 * Sometimes, the compiler will schedule an intervening instruction
		 * in the function prologue. Example:
		 *
		 * _mach_vm_read:
		 * 000006d8        pushl   %ebp
		 * 000006d9        movl    $0x00000004,%edx
		 * 000006de        movl    %esp,%ebp
		 *
		 * Try the next instruction, to see if it is a movl %esp,%ebp
		 */

		instr += 1; /* Advance past the pushl %ebp */
		if ((size = dtrace_instr_size(instr)) <= 0)
			return;

		instr += size;

		if ((instr + 1) >= limit)
			return;

		i1 = instr[0];
		i2 = instr[1];

		if (!(i1 == FBT_MOVL_ESP_EBP0_V0 && i2 == FBT_MOVL_ESP_EBP1_V0) &&
		    !(i1 == FBT_MOVL_ESP_EBP0_V1 && i2 == FBT_MOVL_ESP_EBP1_V1))
			return;

		/* instr already points at the movl %esp,%ebp */
		theInstr = i1;
	}
#endif
	thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_ENTRY);
	newfbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP);
	strlcpy((char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS);

	if (thisid != 0) {
		/*
		 * The dtrace_probe previously existed, so we have to hook
		 * the newfbt entry onto the end of the existing fbt's chain.
		 * If we find an fbt entry that was previously patched to
		 * fire (as indicated by the current patched value), then
		 * we want to enable this newfbt on the spot.
		 */
		entryfbt = dtrace_probe_arg(fbt_id, thisid);
		ASSERT(entryfbt != NULL);
		for (; entryfbt != NULL; entryfbt = entryfbt->fbtp_next) {
			if (entryfbt->fbtp_currentval == entryfbt->fbtp_patchval)
				doenable++;

			if (entryfbt->fbtp_next == NULL) {
				entryfbt->fbtp_next = newfbt;
				newfbt->fbtp_id = entryfbt->fbtp_id;
				break;
			}
		}
	}
	else {
		/*
		 * The dtrace_probe did not previously exist, so we
		 * create it and hook in the newfbt.  Since the probe is
		 * new, we obviously do not need to enable it on the spot.
		 */
		newfbt->fbtp_id = dtrace_probe_create(fbt_id, modname, symbolName, FBT_ENTRY, FBT_AFRAMES_ENTRY, newfbt);
		doenable = 0;
	}

	newfbt->fbtp_patchpoint = instr;
	newfbt->fbtp_ctl = ctl;
	newfbt->fbtp_loadcnt = ctl->mod_loadcnt;
	newfbt->fbtp_rval = DTRACE_INVOP_MOV_RSP_RBP;
	newfbt->fbtp_savedval = theInstr;
	newfbt->fbtp_patchval = FBT_PATCHVAL;
	newfbt->fbtp_currentval = 0;
	newfbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
	fbt_probetab[FBT_ADDR2NDX(instr)] = newfbt;

	if (doenable)
		fbt_enable(NULL, newfbt->fbtp_id, newfbt);

	/*
	 * The fbt entry chain is in place, one entry point per symbol.
	 * The fbt return chain can have multiple return points per symbol.
	 * Here we find the end of the fbt return chain.
	 */

	doenable = 0;

	thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_RETURN);
	if (thisid != 0) {
		/* The dtrace_probe previously existed, so we have to
		 * find the end of the existing fbt chain.  If we find
		 * an fbt return that was previously patched to fire
		 * (as indicated by the current patched value), then
		 * we want to enable any new fbts on the spot.
		 */
		retfbt = dtrace_probe_arg(fbt_id, thisid);
		ASSERT(retfbt != NULL);
		for (; retfbt != NULL; retfbt = retfbt->fbtp_next) {
			if (retfbt->fbtp_currentval == retfbt->fbtp_patchval)
				doenable++;
			if (retfbt->fbtp_next == NULL)
				break;
		}
	}
	else {
		doenable = 0;
		retfbt = NULL;
	}

again:
	if (instr >= limit)
		return;

	/*
	 * If this disassembly fails, then we've likely walked off into
	 * a jump table or some other unsuitable area.  Bail out of the
	 * disassembly now.
	 */
	if ((size = dtrace_instr_size(instr)) <= 0)
		return;

	/*
	 * We (desperately) want to avoid erroneously instrumenting a
	 * jump table, especially given that our markers are pretty
	 * short:  two bytes on x86, and just one byte on amd64.  To
	 * determine if we're looking at a true instruction sequence
	 * or an inline jump table that happens to contain the same
	 * byte sequences, we resort to some heuristic sleaze:  we
	 * treat this instruction as being contained within a pointer,
	 * and see if that pointer points to within the body of the
	 * function.  If it does, we refuse to instrument it.
	 */
	for (j = 0; j < sizeof (uintptr_t); j++) {
		uintptr_t check = (uintptr_t)instr - j;
		uint8_t *ptr;

		if (check < (uintptr_t)symbolStart)
			break;

		if (check + sizeof (uintptr_t) > (uintptr_t)limit)
			continue;

		ptr = *(uint8_t **)check;

		if (ptr >= (uint8_t *)symbolStart && ptr < limit) {
			instr += size;
			goto again;
		}
	}
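
	/*
	 * Concrete illustration of the loop above (hypothetical addresses):
	 * if instr sits at 0xffffff80002349d8 and the eight bytes starting
	 * at, say, instr - 3 read, as a little-endian pointer, as
	 * 0xffffff80002348f0, which lies inside [symbolStart, limit), then
	 * this byte run is more plausibly a jump-table entry than code, and
	 * we skip past it rather than planting a probe.
	 */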

	/*
	 * OK, it's an instruction.
	 */
	theInstr = instr[0];

	/* Walked onto the start of the next routine? If so, bail out of this function. */
	if (theInstr == FBT_PUSH_RBP)
		return;

	if (!(size == 1 && (theInstr == FBT_POP_RBP || theInstr == FBT_LEAVE))) {
		instr += size;
		goto again;
	}

	/*
	 * Found the pop %rbp; or leave.
	 */
	machine_inst_t *patch_instr = instr;

	/*
	 * Scan forward for a "ret", or "jmp".
	 */
	instr += size;
	if (instr >= limit)
		return;

	size = dtrace_instr_size(instr);
	if (size <= 0) /* Failed instruction decode? */
		return;

	theInstr = instr[0];

	if (!(size == FBT_RET_LEN && (theInstr == FBT_RET)) &&
	    !(size == FBT_RET_IMM16_LEN && (theInstr == FBT_RET_IMM16)) &&
	    !(size == FBT_JMP_SHORT_REL_LEN && (theInstr == FBT_JMP_SHORT_REL)) &&
	    !(size == FBT_JMP_NEAR_REL_LEN && (theInstr == FBT_JMP_NEAR_REL)) &&
	    !(size == FBT_JMP_FAR_ABS_LEN && (theInstr == FBT_JMP_FAR_ABS)))
		return;

	/*
	 * pop %rbp; ret; or leave; ret; or leave; jmp tailCalledFun; -- We have a winner!
	 */
	newfbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP);
	strlcpy((char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS);

	if (retfbt == NULL) {
		newfbt->fbtp_id = dtrace_probe_create(fbt_id, modname,
						      symbolName, FBT_RETURN, FBT_AFRAMES_RETURN, newfbt);
	} else {
		retfbt->fbtp_next = newfbt;
		newfbt->fbtp_id = retfbt->fbtp_id;
	}

	retfbt = newfbt;
	newfbt->fbtp_patchpoint = patch_instr;
	newfbt->fbtp_ctl = ctl;
	newfbt->fbtp_loadcnt = ctl->mod_loadcnt;

	if (*patch_instr == FBT_POP_RBP) {
		newfbt->fbtp_rval = DTRACE_INVOP_POP_RBP;
	} else {
		ASSERT(*patch_instr == FBT_LEAVE);
		newfbt->fbtp_rval = DTRACE_INVOP_LEAVE;
	}
	newfbt->fbtp_roffset =
	    (uintptr_t)(patch_instr - (uint8_t *)symbolStart);

	newfbt->fbtp_savedval = *patch_instr;
	newfbt->fbtp_patchval = FBT_PATCHVAL;
	newfbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(patch_instr)];
	fbt_probetab[FBT_ADDR2NDX(patch_instr)] = newfbt;

	if (doenable)
		fbt_enable(NULL, newfbt->fbtp_id, newfbt);

	instr += size;
	goto again;
}

static void
__kernel_syms_provide_module(void *arg, struct modctl *ctl)
{
#pragma unused(arg)
	kernel_mach_header_t		*mh;
	struct load_command		*cmd;
	kernel_segment_command_t	*orig_ts = NULL, *orig_le = NULL;
	struct symtab_command		*orig_st = NULL;
	kernel_nlist_t			*sym = NULL;
	char				*strings;
	uintptr_t			instrLow, instrHigh;
	char				*modname;
	unsigned int			i;

	mh = (kernel_mach_header_t *)(ctl->mod_address);
	modname = ctl->mod_modname;

	if (mh->magic != MH_MAGIC_KERNEL)
		return;

	cmd = (struct load_command *) &mh[1];
	for (i = 0; i < mh->ncmds; i++) {
		if (cmd->cmd == LC_SEGMENT_KERNEL) {
			kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd;

			if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT))
				orig_ts = orig_sg;
			else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT))
				orig_le = orig_sg;
			else if (LIT_STRNEQL(orig_sg->segname, ""))
				orig_ts = orig_sg; /* kexts have a single unnamed segment */
		}
		else if (cmd->cmd == LC_SYMTAB)
			orig_st = (struct symtab_command *) cmd;

		cmd = (struct load_command *) ((caddr_t) cmd + cmd->cmdsize);
	}

	if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL))
		return;

	sym = (kernel_nlist_t *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff);
	strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff);
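
	/*
	 * The arithmetic above converts file offsets into mapped addresses
	 * (hypothetical numbers): if __LINKEDIT occupies file offset
	 * 0x200000 and is mapped at vmaddr 0xffffff8000800000, a symoff of
	 * 0x210000 puts the nlist array at
	 * 0xffffff8000800000 + 0x210000 - 0x200000 == 0xffffff8000810000.
	 */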

	/* Find the extent of the TEXT section. */
	instrLow = (uintptr_t)orig_ts->vmaddr;
	instrHigh = (uintptr_t)(orig_ts->vmaddr + orig_ts->vmsize);

	for (i = 0; i < orig_st->nsyms; i++) {
		uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT);
		char *name = strings + sym[i].n_un.n_strx;

		/* Check that the symbol is a global and that it has a name. */
		if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type))
			continue;

		if (0 == sym[i].n_un.n_strx) /* Skip symbols with a null, "", name. */
			continue;

		/* Lop off the omnipresent leading underscore. */
		if (*name == '_')
			name += 1;

		/*
		 * We're only blacklisting functions in the kernel for now.
		 */
		if (MOD_IS_MACH_KERNEL(ctl) && !is_symbol_valid(name))
			continue;

		__provide_probe_64(ctl, instrLow, instrHigh, modname, name, (machine_inst_t*)sym[i].n_value);
	}
}

static void
__user_syms_provide_module(void *arg, struct modctl *ctl)
{
#pragma unused(arg)
	char				*modname;
	unsigned int			i;

	modname = ctl->mod_modname;

	dtrace_module_symbols_t* module_symbols = ctl->mod_user_symbols;
	if (module_symbols) {
		for (i = 0; i < module_symbols->dtmodsyms_count; i++) {

			/*
			 * The symbol address (symbol->dtsym_addr) passed in
			 * from user space is already slid for both kexts and
			 * the kernel.
			 */
			dtrace_symbol_t* symbol = &module_symbols->dtmodsyms_symbols[i];

			char* name = symbol->dtsym_name;

			/* Lop off the omnipresent leading underscore. */
			if (*name == '_')
				name += 1;

			/*
			 * We're only blacklisting functions in the kernel for now.
			 */
			if (MOD_IS_MACH_KERNEL(ctl) && !is_symbol_valid(name))
				continue;

			__provide_probe_64(ctl, (uintptr_t)symbol->dtsym_addr, (uintptr_t)(symbol->dtsym_addr + symbol->dtsym_size), modname, name, (machine_inst_t*)(uintptr_t)symbol->dtsym_addr);
		}
	}
}

extern int dtrace_kernel_symbol_mode;

/*ARGSUSED*/
void
fbt_provide_module(void *arg, struct modctl *ctl)
{
	ASSERT(ctl != NULL);
	ASSERT(dtrace_kernel_symbol_mode != DTRACE_KERNEL_SYMBOLS_NEVER);
	lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED);

	if (MOD_FBT_DONE(ctl))
		return;

	if (!is_module_valid(ctl)) {
		ctl->mod_flags |= MODCTL_FBT_INVALID;
		return;
	}

	if (MOD_HAS_KERNEL_SYMBOLS(ctl)) {
		__kernel_syms_provide_module(arg, ctl);
		ctl->mod_flags |= MODCTL_FBT_PROBES_PROVIDED;
		return;
	}

	if (MOD_HAS_USERSPACE_SYMBOLS(ctl)) {
		__user_syms_provide_module(arg, ctl);
		ctl->mod_flags |= MODCTL_FBT_PROBES_PROVIDED;
		if (MOD_FBT_PROVIDE_PRIVATE_PROBES(ctl))
			ctl->mod_flags |= MODCTL_FBT_PRIVATE_PROBES_PROVIDED;
		return;
	}
}