1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 *
21 * Portions Copyright 2010 The FreeBSD Foundation
22 */
23
24/*
25 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
26 * Use is subject to license terms.
27 */
28
29#include <sys/fasttrap_isa.h>
30#include <sys/fasttrap_impl.h>
31#include <sys/dtrace.h>
32#include <sys/dtrace_impl.h>
33#include <sys/cmn_err.h>
34#include <sys/types.h>
35#include <sys/dtrace_bsd.h>
36#include <sys/proc.h>
37#include <sys/reg.h>
38#include <sys/rmlock.h>
39#include <cddl/dev/dtrace/dtrace_cddl.h>
40#include <cddl/dev/dtrace/x86/regset.h>
41#include <machine/segments.h>
42#include <machine/pcb.h>
43#include <machine/trap.h>
44#include <sys/sysmacros.h>
45#include <sys/ptrace.h>
46
/*
 * The code in this file refers to register-frame fields by their 64-bit
 * names (r_rax, r_rip, r_rsp, ...). When building for i386, alias those
 * spellings onto the corresponding 32-bit field names so the same source
 * serves both architectures.
 */
#ifdef __i386__
#define	r_rax	r_eax
#define	r_rbx	r_ebx
#define	r_rip	r_eip
#define	r_rflags r_eflags
#define	r_rsp	r_esp
#define	r_rbp	r_ebp
#endif
55
56/*
57 * Lossless User-Land Tracing on x86
58 * ---------------------------------
59 *
60 * The execution of most instructions is not dependent on the address; for
61 * these instructions it is sufficient to copy them into the user process's
62 * address space and execute them. To effectively single-step an instruction
63 * in user-land, we copy out the following sequence of instructions to scratch
64 * space in the user thread's ulwp_t structure.
65 *
66 * We then set the program counter (%eip or %rip) to point to this scratch
67 * space. Once execution resumes, the original instruction is executed and
68 * then control flow is redirected to what was originally the subsequent
 * instruction. If the kernel attempts to deliver a signal while single-
70 * stepping, the signal is deferred and the program counter is moved into the
71 * second sequence of instructions. The second sequence ends in a trap into
72 * the kernel where the deferred signal is then properly handled and delivered.
73 *
 * For instructions whose execution is position dependent, we perform simple
75 * emulation. These instructions are limited to control transfer
76 * instructions in 32-bit mode, but in 64-bit mode there's the added wrinkle
77 * of %rip-relative addressing that means that almost any instruction can be
78 * position dependent. For all the details on how we emulate generic
 * instructions including %rip-relative instructions, see the code in
80 * fasttrap_pid_probe() below where we handle instructions of type
81 * FASTTRAP_T_COMMON (under the header: Generic Instruction Tracing).
82 */
83
/*
 * ModR/M byte: mod lives in bits 7:6, reg in bits 5:3 and r/m in bits 2:0.
 * FASTTRAP_MODRM() assembles a byte from the three fields.
 */
#define	FASTTRAP_MODRM_MOD(modrm)	(((modrm) & 0xc0) >> 6)
#define	FASTTRAP_MODRM_REG(modrm)	(((modrm) & 0x38) >> 3)
#define	FASTTRAP_MODRM_RM(modrm)	((modrm) & 0x07)
#define	FASTTRAP_MODRM(mod, reg, rm)	((rm) | ((reg) << 3) | ((mod) << 6))

/*
 * SIB byte: scale in bits 7:6, index in bits 5:3 and base in bits 2:0.
 */
#define	FASTTRAP_SIB_SCALE(sib)		(((sib) & 0xc0) >> 6)
#define	FASTTRAP_SIB_INDEX(sib)		(((sib) & 0x38) >> 3)
#define	FASTTRAP_SIB_BASE(sib)		((sib) & 0x07)

/*
 * REX prefix: 0x40 plus the W (bit 3), R (bit 2), X (bit 1) and B (bit 0)
 * flags. FASTTRAP_REX() assembles a prefix from the four flags.
 */
#define	FASTTRAP_REX_W(rex)		(((rex) & 0x08) >> 3)
#define	FASTTRAP_REX_R(rex)		(((rex) & 0x04) >> 2)
#define	FASTTRAP_REX_X(rex)		(((rex) & 0x02) >> 1)
#define	FASTTRAP_REX_B(rex)		((rex) & 0x01)
#define	FASTTRAP_REX(w, r, x, b)	\
	((b) | ((x) << 1) | ((r) << 2) | ((w) << 3) | 0x40)
99
/*
 * Single-byte op-codes.
 *
 * These are the first opcode byte (after any prefixes) of the instructions
 * that fasttrap_tracepoint_init() recognizes and classifies specially.
 */
#define	FASTTRAP_PUSHL_EBP	0x55

/* Conditional jumps with an 8-bit relative displacement (0x70 - 0x7f). */
#define	FASTTRAP_JO		0x70
#define	FASTTRAP_JNO		0x71
#define	FASTTRAP_JB		0x72
#define	FASTTRAP_JAE		0x73
#define	FASTTRAP_JE		0x74
#define	FASTTRAP_JNE		0x75
#define	FASTTRAP_JBE		0x76
#define	FASTTRAP_JA		0x77
#define	FASTTRAP_JS		0x78
#define	FASTTRAP_JNS		0x79
#define	FASTTRAP_JP		0x7a
#define	FASTTRAP_JNP		0x7b
#define	FASTTRAP_JL		0x7c
#define	FASTTRAP_JGE		0x7d
#define	FASTTRAP_JLE		0x7e
#define	FASTTRAP_JG		0x7f

#define	FASTTRAP_NOP		0x90

/* Move-immediate into %eax / %ecx. */
#define	FASTTRAP_MOV_EAX	0xb8
#define	FASTTRAP_MOV_ECX	0xb9

/* Near return; the RET16 form is followed by a 16-bit immediate. */
#define	FASTTRAP_RET16		0xc2
#define	FASTTRAP_RET		0xc3

/* Loop and jump-if-count-zero instructions (8-bit relative displacement). */
#define	FASTTRAP_LOOPNZ		0xe0
#define	FASTTRAP_LOOPZ		0xe1
#define	FASTTRAP_LOOP		0xe2
#define	FASTTRAP_JCXZ		0xe3

/* Direct call/jump: 32-bit displacement, except JMP8 which uses 8 bits. */
#define	FASTTRAP_CALL		0xe8
#define	FASTTRAP_JMP32		0xe9
#define	FASTTRAP_JMP8		0xeb

/* Software interrupts; tracepoints are never placed on these. */
#define	FASTTRAP_INT3		0xcc
#define	FASTTRAP_INT		0xcd

/* Escape byte for two-byte opcodes, and the group 5 (0xff /r) opcode. */
#define	FASTTRAP_2_BYTE_OP	0x0f
#define	FASTTRAP_GROUP5_OP	0xff
144
/*
 * Two-byte op-codes (second byte only).
 *
 * These follow the FASTTRAP_2_BYTE_OP (0x0f) escape and are the conditional
 * jumps that carry a 32-bit relative displacement. The low nibble is the
 * same condition code used by the one-byte 0x7x forms, which lets
 * fasttrap_tracepoint_init() fold both encodings onto the 0x7x values.
 */
#define	FASTTRAP_0F_JO		0x80
#define	FASTTRAP_0F_JNO		0x81
#define	FASTTRAP_0F_JB		0x82
#define	FASTTRAP_0F_JAE		0x83
#define	FASTTRAP_0F_JE		0x84
#define	FASTTRAP_0F_JNE		0x85
#define	FASTTRAP_0F_JBE		0x86
#define	FASTTRAP_0F_JA		0x87
#define	FASTTRAP_0F_JS		0x88
#define	FASTTRAP_0F_JNS		0x89
#define	FASTTRAP_0F_JP		0x8a
#define	FASTTRAP_0F_JNP		0x8b
#define	FASTTRAP_0F_JL		0x8c
#define	FASTTRAP_0F_JGE		0x8d
#define	FASTTRAP_0F_JLE		0x8e
#define	FASTTRAP_0F_JG		0x8f
164
/*
 * Bit masks for individual status bits in the flags register: overflow
 * (bit 11), direction (bit 10), sign (bit 7), zero (bit 6), auxiliary carry
 * (bit 4), parity (bit 2) and carry (bit 0).
 */
#define	FASTTRAP_EFLAGS_OF	0x800
#define	FASTTRAP_EFLAGS_DF	0x400
#define	FASTTRAP_EFLAGS_SF	0x080
#define	FASTTRAP_EFLAGS_ZF	0x040
#define	FASTTRAP_EFLAGS_AF	0x010
#define	FASTTRAP_EFLAGS_PF	0x004
#define	FASTTRAP_EFLAGS_CF	0x001
172
/*
 * Instruction prefixes.
 *
 * fasttrap_tracepoint_init() skips over any run of these bytes to find the
 * opcode, and records which segment override (if any) was present.
 */
#define	FASTTRAP_PREFIX_OPERAND	0x66
#define	FASTTRAP_PREFIX_ADDRESS	0x67
#define	FASTTRAP_PREFIX_CS	0x2E
#define	FASTTRAP_PREFIX_DS	0x3E
#define	FASTTRAP_PREFIX_ES	0x26
#define	FASTTRAP_PREFIX_FS	0x64
#define	FASTTRAP_PREFIX_GS	0x65
#define	FASTTRAP_PREFIX_SS	0x36
#define	FASTTRAP_PREFIX_LOCK	0xF0
#define	FASTTRAP_PREFIX_REP	0xF3
#define	FASTTRAP_PREFIX_REPNE	0xF2

/*
 * Sentinel stored in a tracepoint's base or index register slot when the
 * addressing form does not use that register.
 */
#define	FASTTRAP_NOREG	0xff
189
/*
 * Map between instruction register encodings and the kernel constants which
 * correspond to indices into struct regs.
 *
 * The table is indexed by the 3-bit register number taken from a ModR/M or
 * SIB byte; on amd64 the relevant REX bit supplies a fourth index bit, which
 * is why that table has sixteen entries rather than eight.
 */
#ifdef __amd64
static const uint8_t regmap[16] = {
	REG_RAX, REG_RCX, REG_RDX, REG_RBX, REG_RSP, REG_RBP, REG_RSI, REG_RDI,
	REG_R8, REG_R9, REG_R10, REG_R11, REG_R12, REG_R13, REG_R14, REG_R15,
};
#else
static const uint8_t regmap[8] = {
	EAX, ECX, EDX, EBX, UESP, EBP, ESI, EDI
};
#endif
204
/*
 * Forward declaration so that code may call fasttrap_getreg() ahead of its
 * definition.
 */
static ulong_t fasttrap_getreg(struct reg *, uint_t);
206
207static uint64_t
208fasttrap_anarg(struct reg *rp, int function_entry, int argno)
209{
210	uint64_t value = 0;
211	int shift = function_entry ? 1 : 0;
212
213#ifdef __amd64
214	if (curproc->p_model == DATAMODEL_LP64) {
215		uintptr_t *stack;
216
217		/*
218		 * In 64-bit mode, the first six arguments are stored in
219		 * registers.
220		 */
221		if (argno < 6)
222			switch (argno) {
223			case 0:
224				return (rp->r_rdi);
225			case 1:
226				return (rp->r_rsi);
227			case 2:
228				return (rp->r_rdx);
229			case 3:
230				return (rp->r_rcx);
231			case 4:
232				return (rp->r_r8);
233			case 5:
234				return (rp->r_r9);
235			}
236
237		stack = (uintptr_t *)rp->r_rsp;
238		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
239		value = dtrace_fulword(&stack[argno - 6 + shift]);
240		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
241	} else {
242#endif
243		uint32_t *stack = (uint32_t *)rp->r_rsp;
244		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
245		value = dtrace_fuword32(&stack[argno + shift]);
246		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
247#ifdef __amd64
248	}
249#endif
250
251	return (value);
252}
253
/*
 * Decode the instruction at 'pc' in process 'p' and fill in the tracepoint
 * 'tp' with everything needed to later emulate or single-step it.
 *
 * Up to FASTTRAP_MAX_INSTR_SIZE bytes are read from the process. The
 * instruction is sized with dtrace_instr_size_isa(), its prefixes (and, for
 * 64-bit processes, its REX byte) are skipped, and the opcode is classified
 * into one of the FASTTRAP_T_* types. For the control-transfer types the
 * destination (or displacement), condition code and addressing registers
 * are recorded in ftt_dest, ftt_code, ftt_base, ftt_index and ftt_scale.
 * Instructions that are not specially recognized stay FASTTRAP_T_COMMON;
 * in a 64-bit process such an instruction that uses %rip-relative
 * addressing additionally has its ModR/M byte rewritten in tp->ftt_instr.
 *
 * The 'type' argument is not used here.
 *
 * Returns 0 on success, or -1 if the instruction cannot be read, cannot be
 * decoded, carries two segment prefixes, or is an int3 / int instruction.
 */
/*ARGSUSED*/
int
fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, uintptr_t pc,
    fasttrap_probe_type_t type)
{
	uint8_t instr[FASTTRAP_MAX_INSTR_SIZE + 10];
	size_t len = FASTTRAP_MAX_INSTR_SIZE;
	/* Number of bytes available before the end of pc's page. */
	size_t first = MIN(len, PAGESIZE - (pc & PAGEOFFSET));
	/* Index of the opcode byte once prefixes have been skipped. */
	uint_t start = 0;
	int rmindex, size;
	uint8_t seg, rex = 0;

	/*
	 * Read the instruction at the given address out of the process's
	 * address space. We don't have to worry about a debugger
	 * changing this instruction before we overwrite it with our trap
	 * instruction since P_PR_LOCK is set. Since instructions can span
	 * pages, we potentially read the instruction in two parts. If the
	 * second part fails, we just zero out that part of the instruction.
	 */
	if (uread(p, &instr[0], first, pc) != 0)
		return (-1);
	if (len > first &&
	    uread(p, &instr[first], len - first, pc + first) != 0) {
		bzero(&instr[first], len - first);
		len = first;
	}

	/*
	 * If the disassembly fails, then we have a malformed instruction.
	 */
	if ((size = dtrace_instr_size_isa(instr, p->p_model, &rmindex)) <= 0)
		return (-1);

	/*
	 * Make sure the disassembler isn't completely broken.
	 */
	ASSERT(-1 <= rmindex && rmindex < size);

	/*
	 * If the computed size is greater than the number of bytes read,
	 * then it was a malformed instruction possibly because it fell on a
	 * page boundary and the subsequent page was missing or because of
	 * some malicious user.
	 */
	if (size > len)
		return (-1);

	tp->ftt_size = (uint8_t)size;
	tp->ftt_segment = FASTTRAP_SEG_NONE;

	/*
	 * Find the start of the instruction's opcode by processing any
	 * legacy prefixes.
	 *
	 * The segment-prefix cases deliberately fall through one another so
	 * that 'seg' ends up as 1 for CS, 2 for DS, 3 for ES, 4 for FS,
	 * 5 for GS and 6 for SS (presumably matching the FASTTRAP_SEG_*
	 * values); the non-segment prefixes leave it at 0.
	 */
	for (;;) {
		seg = 0;
		switch (instr[start]) {
		case FASTTRAP_PREFIX_SS:
			seg++;
			/*FALLTHRU*/
		case FASTTRAP_PREFIX_GS:
			seg++;
			/*FALLTHRU*/
		case FASTTRAP_PREFIX_FS:
			seg++;
			/*FALLTHRU*/
		case FASTTRAP_PREFIX_ES:
			seg++;
			/*FALLTHRU*/
		case FASTTRAP_PREFIX_DS:
			seg++;
			/*FALLTHRU*/
		case FASTTRAP_PREFIX_CS:
			seg++;
			/*FALLTHRU*/
		case FASTTRAP_PREFIX_OPERAND:
		case FASTTRAP_PREFIX_ADDRESS:
		case FASTTRAP_PREFIX_LOCK:
		case FASTTRAP_PREFIX_REP:
		case FASTTRAP_PREFIX_REPNE:
			if (seg != 0) {
				/*
				 * It's illegal for an instruction to specify
				 * two segment prefixes -- give up on this
				 * illegal instruction.
				 */
				if (tp->ftt_segment != FASTTRAP_SEG_NONE)
					return (-1);

				tp->ftt_segment = seg;
			}
			start++;
			continue;
		}
		/* Not a prefix byte: instr[start] is the opcode (or REX). */
		break;
	}

#ifdef __amd64
	/*
	 * Identify the REX prefix on 64-bit processes.
	 */
	if (p->p_model == DATAMODEL_LP64 && (instr[start] & 0xf0) == 0x40)
		rex = instr[start++];
#endif

	/*
	 * Now that we're pretty sure that the instruction is okay, copy the
	 * valid part to the tracepoint.
	 */
	bcopy(instr, tp->ftt_instr, FASTTRAP_MAX_INSTR_SIZE);

	/* Default classification; refined below for recognized opcodes. */
	tp->ftt_type = FASTTRAP_T_COMMON;
	if (instr[start] == FASTTRAP_2_BYTE_OP) {
		switch (instr[start + 1]) {
		case FASTTRAP_0F_JO:
		case FASTTRAP_0F_JNO:
		case FASTTRAP_0F_JB:
		case FASTTRAP_0F_JAE:
		case FASTTRAP_0F_JE:
		case FASTTRAP_0F_JNE:
		case FASTTRAP_0F_JBE:
		case FASTTRAP_0F_JA:
		case FASTTRAP_0F_JS:
		case FASTTRAP_0F_JNS:
		case FASTTRAP_0F_JP:
		case FASTTRAP_0F_JNP:
		case FASTTRAP_0F_JL:
		case FASTTRAP_0F_JGE:
		case FASTTRAP_0F_JLE:
		case FASTTRAP_0F_JG:
			/*
			 * Store the condition as the equivalent one-byte
			 * (0x7x) opcode so both encodings share one set of
			 * ftt_code values. The target is the address of the
			 * next instruction plus the signed 32-bit
			 * displacement.
			 */
			tp->ftt_type = FASTTRAP_T_JCC;
			tp->ftt_code = (instr[start + 1] & 0x0f) | FASTTRAP_JO;
			tp->ftt_dest = pc + tp->ftt_size +
			    /* LINTED - alignment */
			    *(int32_t *)&instr[start + 2];
			break;
		}
	} else if (instr[start] == FASTTRAP_GROUP5_OP) {
		uint_t mod = FASTTRAP_MODRM_MOD(instr[start + 1]);
		uint_t reg = FASTTRAP_MODRM_REG(instr[start + 1]);
		uint_t rm = FASTTRAP_MODRM_RM(instr[start + 1]);

		/*
		 * Within group 5 only /2 (treated as a call) and /4 (treated
		 * as a jump) are handled specially; every other reg value
		 * leaves the instruction as FASTTRAP_T_COMMON.
		 */
		if (reg == 2 || reg == 4) {
			/*
			 * i is the offset from 'start' of the displacement
			 * and sz is its size in bytes (0 if there is none).
			 */
			uint_t i, sz;

			if (reg == 2)
				tp->ftt_type = FASTTRAP_T_CALL;
			else
				tp->ftt_type = FASTTRAP_T_JMP;

			/*
			 * ftt_code distinguishes a register operand
			 * (mod == 3, code 2) from a memory operand (code 1).
			 */
			if (mod == 3)
				tp->ftt_code = 2;
			else
				tp->ftt_code = 1;

			ASSERT(p->p_model == DATAMODEL_LP64 || rex == 0);

			/*
			 * See AMD x86-64 Architecture Programmer's Manual
			 * Volume 3, Section 1.2.7, Table 1-12, and
			 * Appendix A.3.1, Table A-15.
			 */
			if (mod != 3 && rm == 4) {
				/* r/m == 4 means a SIB byte follows. */
				uint8_t sib = instr[start + 2];
				uint_t index = FASTTRAP_SIB_INDEX(sib);
				uint_t base = FASTTRAP_SIB_BASE(sib);

				tp->ftt_scale = FASTTRAP_SIB_SCALE(sib);

				/*
				 * An index field of 4 means "no index"; a
				 * base field of 5 with mod == 0 means "no
				 * base". Otherwise the REX.X / REX.B bit
				 * extends the field to a 4-bit regmap index.
				 */
				tp->ftt_index = (index == 4) ?
				    FASTTRAP_NOREG :
				    regmap[index | (FASTTRAP_REX_X(rex) << 3)];
				tp->ftt_base = (mod == 0 && base == 5) ?
				    FASTTRAP_NOREG :
				    regmap[base | (FASTTRAP_REX_B(rex) << 3)];

				i = 3;
				sz = mod == 1 ? 1 : 4;
			} else {
				/*
				 * In 64-bit mode, mod == 0 and r/m == 5
				 * denotes %rip-relative addressing; in 32-bit
				 * mode, the base register isn't used. In both
				 * modes, there is a 32-bit operand.
				 */
				if (mod == 0 && rm == 5) {
#ifdef __amd64
					if (p->p_model == DATAMODEL_LP64)
						tp->ftt_base = REG_RIP;
					else
#endif
						tp->ftt_base = FASTTRAP_NOREG;
					sz = 4;
				} else  {
					uint8_t base = rm |
					    (FASTTRAP_REX_B(rex) << 3);

					tp->ftt_base = regmap[base];
					sz = mod == 1 ? 1 : mod == 2 ? 4 : 0;
				}
				tp->ftt_index = FASTTRAP_NOREG;
				i = 2;
			}

			/*
			 * For these indirect forms ftt_dest holds the
			 * sign-extended displacement, not an absolute target.
			 */
			if (sz == 1) {
				tp->ftt_dest = *(int8_t *)&instr[start + i];
			} else if (sz == 4) {
				/* LINTED - alignment */
				tp->ftt_dest = *(int32_t *)&instr[start + i];
			} else {
				tp->ftt_dest = 0;
			}
		}
	} else {
		switch (instr[start]) {
		case FASTTRAP_RET:
			tp->ftt_type = FASTTRAP_T_RET;
			break;

		case FASTTRAP_RET16:
			/* ftt_dest carries the 16-bit immediate operand. */
			tp->ftt_type = FASTTRAP_T_RET16;
			/* LINTED - alignment */
			tp->ftt_dest = *(uint16_t *)&instr[start + 1];
			break;

		case FASTTRAP_JO:
		case FASTTRAP_JNO:
		case FASTTRAP_JB:
		case FASTTRAP_JAE:
		case FASTTRAP_JE:
		case FASTTRAP_JNE:
		case FASTTRAP_JBE:
		case FASTTRAP_JA:
		case FASTTRAP_JS:
		case FASTTRAP_JNS:
		case FASTTRAP_JP:
		case FASTTRAP_JNP:
		case FASTTRAP_JL:
		case FASTTRAP_JGE:
		case FASTTRAP_JLE:
		case FASTTRAP_JG:
			/* Target = next instruction + signed 8-bit offset. */
			tp->ftt_type = FASTTRAP_T_JCC;
			tp->ftt_code = instr[start];
			tp->ftt_dest = pc + tp->ftt_size +
			    (int8_t)instr[start + 1];
			break;

		case FASTTRAP_LOOPNZ:
		case FASTTRAP_LOOPZ:
		case FASTTRAP_LOOP:
			tp->ftt_type = FASTTRAP_T_LOOP;
			tp->ftt_code = instr[start];
			tp->ftt_dest = pc + tp->ftt_size +
			    (int8_t)instr[start + 1];
			break;

		case FASTTRAP_JCXZ:
			tp->ftt_type = FASTTRAP_T_JCXZ;
			tp->ftt_dest = pc + tp->ftt_size +
			    (int8_t)instr[start + 1];
			break;

		case FASTTRAP_CALL:
			/*
			 * Direct call: ftt_code 0 sets it apart from the
			 * indirect group 5 forms, which use codes 1 and 2.
			 */
			tp->ftt_type = FASTTRAP_T_CALL;
			tp->ftt_dest = pc + tp->ftt_size +
			    /* LINTED - alignment */
			    *(int32_t *)&instr[start + 1];
			tp->ftt_code = 0;
			break;

		case FASTTRAP_JMP32:
			tp->ftt_type = FASTTRAP_T_JMP;
			tp->ftt_dest = pc + tp->ftt_size +
			    /* LINTED - alignment */
			    *(int32_t *)&instr[start + 1];
			break;
		case FASTTRAP_JMP8:
			tp->ftt_type = FASTTRAP_T_JMP;
			tp->ftt_dest = pc + tp->ftt_size +
			    (int8_t)instr[start + 1];
			break;

		case FASTTRAP_PUSHL_EBP:
			/*
			 * Only the bare form qualifies; any prefix or REX
			 * byte (start != 0) leaves it FASTTRAP_T_COMMON.
			 */
			if (start == 0)
				tp->ftt_type = FASTTRAP_T_PUSHL_EBP;
			break;

		case FASTTRAP_NOP:
#ifdef __amd64
			ASSERT(p->p_model == DATAMODEL_LP64 || rex == 0);

			/*
			 * On amd64 we have to be careful not to confuse a nop
			 * (actually xchgl %eax, %eax) with an instruction using
			 * the same opcode, but that does something different
			 * (e.g. xchgl %r8d, %eax or xchgq %r8, %rax).
			 */
			if (FASTTRAP_REX_B(rex) == 0)
#endif
				tp->ftt_type = FASTTRAP_T_NOP;
			break;

		case FASTTRAP_INT3:
			/*
			 * The pid provider shares the int3 trap with debugger
			 * breakpoints so we can't instrument them.
			 */
			ASSERT(instr[start] == FASTTRAP_INSTR);
			return (-1);

		case FASTTRAP_INT:
			/*
			 * Interrupts seem like they could be traced with
			 * no negative implications, but it's possible that
			 * a thread could be redirected by the trap handling
			 * code which would eventually return to the
			 * instruction after the interrupt. If the interrupt
			 * were in our scratch space, the subsequent
			 * instruction might be overwritten before we return.
			 * Accordingly we refuse to instrument any interrupt.
			 */
			return (-1);
		}
	}

#ifdef __amd64
	if (p->p_model == DATAMODEL_LP64 && tp->ftt_type == FASTTRAP_T_COMMON) {
		/*
		 * If the process is 64-bit and the instruction type is still
		 * FASTTRAP_T_COMMON -- meaning we're going to copy it out and
		 * execute it -- we need to watch for %rip-relative
		 * addressing mode. See the portion of fasttrap_pid_probe()
		 * below where we handle tracepoints with type
		 * FASTTRAP_T_COMMON for how we emulate instructions that
		 * employ %rip-relative addressing.
		 */
		if (rmindex != -1) {
			uint_t mod = FASTTRAP_MODRM_MOD(instr[rmindex]);
			uint_t reg = FASTTRAP_MODRM_REG(instr[rmindex]);
			uint_t rm = FASTTRAP_MODRM_RM(instr[rmindex]);

			ASSERT(rmindex > start);

			if (mod == 0 && rm == 5) {
				/*
				 * We need to be sure to avoid other
				 * registers used by this instruction. While
				 * the reg field may determine the op code
				 * rather than denoting a register, assuming
				 * that it denotes a register is always safe.
				 * We leave the REX field intact and use
				 * whatever value's there for simplicity.
				 */
				if (reg != 0) {
					tp->ftt_ripmode = FASTTRAP_RIP_1 |
					    (FASTTRAP_RIP_X *
					    FASTTRAP_REX_B(rex));
					rm = 0;
				} else {
					tp->ftt_ripmode = FASTTRAP_RIP_2 |
					    (FASTTRAP_RIP_X *
					    FASTTRAP_REX_B(rex));
					rm = 1;
				}

				/*
				 * Remember the original ModR/M byte, then
				 * rewrite the saved copy of the instruction
				 * to use mod == 2 (register + 32-bit
				 * displacement) with the r/m chosen above,
				 * which differs from the reg field.
				 */
				tp->ftt_modrm = tp->ftt_instr[rmindex];
				tp->ftt_instr[rmindex] =
				    FASTTRAP_MODRM(2, reg, rm);
			}
		}
	}
#endif

	return (0);
}
630
631int
632fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
633{
634	fasttrap_instr_t instr = FASTTRAP_INSTR;
635
636	if (uwrite(p, &instr, 1, tp->ftt_pc) != 0)
637		return (-1);
638
639	return (0);
640}
641
642int
643fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp)
644{
645	uint8_t instr;
646
647	/*
648	 * Distinguish between read or write failures and a changed
649	 * instruction.
650	 */
651	if (uread(p, &instr, 1, tp->ftt_pc) != 0)
652		return (0);
653	if (instr != FASTTRAP_INSTR)
654		return (0);
655	if (uwrite(p, &tp->ftt_instr[0], 1, tp->ftt_pc) != 0)
656		return (-1);
657
658	return (0);
659}
660
#ifdef __amd64
/*
 * Read a pointer-sized word from the user address 'uaddr' with
 * fasttrap_fulword(), substituting 0 whenever that call yields -1
 * (presumably its failure indication). A word whose genuine value is all
 * ones is therefore reported as 0 as well.
 */
static uintptr_t
fasttrap_fulword_noerr(const void *uaddr)
{
	uintptr_t word = fasttrap_fulword(uaddr);

	return (word == (uintptr_t)-1 ? 0 : word);
}
#endif
673
/*
 * Read a 32-bit word from the user address 'uaddr' with fasttrap_fuword32(),
 * substituting 0 whenever that call yields -1 (presumably its failure
 * indication). A word whose genuine value is 0xffffffff is therefore
 * reported as 0 as well.
 */
static uint32_t
fasttrap_fuword32_noerr(const void *uaddr)
{
	uint32_t word = fasttrap_fuword32(uaddr);

	return (word == (uint32_t)-1 ? 0 : word);
}
684
685static void
686fasttrap_return_common(struct reg *rp, uintptr_t pc, pid_t pid,
687    uintptr_t new_pc)
688{
689	fasttrap_tracepoint_t *tp;
690	fasttrap_bucket_t *bucket;
691	fasttrap_id_t *id;
692	struct rm_priotracker tracker;
693
694	rm_rlock(&fasttrap_tp_lock, &tracker);
695	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
696
697	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
698		if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
699		    tp->ftt_proc->ftpc_acount != 0)
700			break;
701	}
702
703	/*
704	 * Don't sweat it if we can't find the tracepoint again; unlike
705	 * when we're in fasttrap_pid_probe(), finding the tracepoint here
706	 * is not essential to the correct execution of the process.
707	 */
708	if (tp == NULL) {
709		rm_runlock(&fasttrap_tp_lock, &tracker);
710		return;
711	}
712
713	for (id = tp->ftt_retids; id != NULL; id = id->fti_next) {
714		/*
715		 * If there's a branch that could act as a return site, we
716		 * need to trace it, and check here if the program counter is
717		 * external to the function.
718		 */
719		if (tp->ftt_type != FASTTRAP_T_RET &&
720		    tp->ftt_type != FASTTRAP_T_RET16 &&
721		    new_pc - id->fti_probe->ftp_faddr <
722		    id->fti_probe->ftp_fsize)
723			continue;
724
725		dtrace_probe(id->fti_probe->ftp_id,
726		    pc - id->fti_probe->ftp_faddr,
727		    rp->r_rax, rp->r_rbx, 0, 0);
728	}
729
730	rm_runlock(&fasttrap_tp_lock, &tracker);
731}
732
733static void
734fasttrap_sigsegv(proc_t *p, kthread_t *t, uintptr_t addr)
735{
736	ksiginfo_t ksi;
737
738	ksiginfo_init(&ksi);
739	ksi.ksi_signo = SIGSEGV;
740	ksi.ksi_code = SEGV_MAPERR;
741	ksi.ksi_addr = (caddr_t)addr;
742	PROC_LOCK(p);
743	(void)tdksignal(t, SIGSEGV, &ksi);
744	PROC_UNLOCK(p);
745}
746
747#ifdef __amd64
748static void
749fasttrap_usdt_args64(fasttrap_probe_t *probe, struct reg *rp, int argc,
750    uintptr_t *argv)
751{
752	int i, x, cap = MIN(argc, probe->ftp_nargs);
753	uintptr_t *stack = (uintptr_t *)rp->r_rsp;
754
755	for (i = 0; i < cap; i++) {
756		x = probe->ftp_argmap[i];
757
758		if (x < 6)
759			argv[i] = (&rp->r_rdi)[x];
760		else
761			argv[i] = fasttrap_fulword_noerr(&stack[x]);
762	}
763
764	for (; i < argc; i++) {
765		argv[i] = 0;
766	}
767}
768#endif
769
770static void
771fasttrap_usdt_args32(fasttrap_probe_t *probe, struct reg *rp, int argc,
772    uint32_t *argv)
773{
774	int i, x, cap = MIN(argc, probe->ftp_nargs);
775	uint32_t *stack = (uint32_t *)rp->r_rsp;
776
777	for (i = 0; i < cap; i++) {
778		x = probe->ftp_argmap[i];
779
780		argv[i] = fasttrap_fuword32_noerr(&stack[x]);
781	}
782
783	for (; i < argc; i++) {
784		argv[i] = 0;
785	}
786}
787
788static int
789fasttrap_do_seg(fasttrap_tracepoint_t *tp, struct reg *rp, uintptr_t *addr)
790{
791	proc_t *p = curproc;
792#ifdef __i386__
793	struct segment_descriptor *desc;
794#else
795	struct user_segment_descriptor *desc;
796#endif
797	uint16_t sel = 0, ndx, type;
798	uintptr_t limit;
799
800	switch (tp->ftt_segment) {
801	case FASTTRAP_SEG_CS:
802		sel = rp->r_cs;
803		break;
804	case FASTTRAP_SEG_DS:
805		sel = rp->r_ds;
806		break;
807	case FASTTRAP_SEG_ES:
808		sel = rp->r_es;
809		break;
810	case FASTTRAP_SEG_FS:
811		sel = rp->r_fs;
812		break;
813	case FASTTRAP_SEG_GS:
814		sel = rp->r_gs;
815		break;
816	case FASTTRAP_SEG_SS:
817		sel = rp->r_ss;
818		break;
819	}
820
821	/*
822	 * Make sure the given segment register specifies a user priority
823	 * selector rather than a kernel selector.
824	 */
825	if (ISPL(sel) != SEL_UPL)
826		return (-1);
827
828	ndx = IDXSEL(sel);
829
830	/*
831	 * Check the bounds and grab the descriptor out of the specified
832	 * descriptor table.
833	 */
834	if (ISLDT(sel)) {
835#ifdef __i386__
836		if (ndx > p->p_md.md_ldt->ldt_len)
837			return (-1);
838
839		desc = (struct segment_descriptor *)
840		    p->p_md.md_ldt[ndx].ldt_base;
841#else
842		if (ndx > max_ldt_segment)
843			return (-1);
844
845		desc = (struct user_segment_descriptor *)
846		    p->p_md.md_ldt[ndx].ldt_base;
847#endif
848
849	} else {
850		if (ndx >= NGDT)
851			return (-1);
852
853#ifdef __i386__
854		desc = &gdt[ndx].sd;
855#else
856		desc = PCPU_PTR(gdt)[ndx];
857#endif
858	}
859
860	/*
861	 * The descriptor must have user privilege level and it must be
862	 * present in memory.
863	 */
864	if (desc->sd_dpl != SEL_UPL || desc->sd_p != 1)
865		return (-1);
866
867	type = desc->sd_type;
868
869	/*
870	 * If the S bit in the type field is not set, this descriptor can
871	 * only be used in system context.
872	 */
873	if ((type & 0x10) != 0x10)
874		return (-1);
875
876	limit = USD_GETLIMIT(desc) * (desc->sd_gran ? PAGESIZE : 1);
877
878	if (tp->ftt_segment == FASTTRAP_SEG_CS) {
879		/*
880		 * The code/data bit and readable bit must both be set.
881		 */
882		if ((type & 0xa) != 0xa)
883			return (-1);
884
885		if (*addr > limit)
886			return (-1);
887	} else {
888		/*
889		 * The code/data bit must be clear.
890		 */
891		if ((type & 0x8) != 0)
892			return (-1);
893
894		/*
895		 * If the expand-down bit is clear, we just check the limit as
896		 * it would naturally be applied. Otherwise, we need to check
897		 * that the address is the range [limit + 1 .. 0xffff] or
898		 * [limit + 1 ... 0xffffffff] depending on if the default
899		 * operand size bit is set.
900		 */
901		if ((type & 0x4) == 0) {
902			if (*addr > limit)
903				return (-1);
904		} else if (desc->sd_def32) {
905			if (*addr < limit + 1 || 0xffff < *addr)
906				return (-1);
907		} else {
908			if (*addr < limit + 1 || 0xffffffff < *addr)
909				return (-1);
910		}
911	}
912
913	*addr += USD_GETBASE(desc);
914
915	return (0);
916}
917
918int
919fasttrap_pid_probe(struct trapframe *tf)
920{
921	struct reg reg, *rp;
922	proc_t *p = curproc, *pp;
923	struct rm_priotracker tracker;
924	uint64_t gen;
925	uintptr_t pc;
926	uintptr_t new_pc = 0;
927	fasttrap_bucket_t *bucket;
928	fasttrap_tracepoint_t *tp, tp_local;
929	pid_t pid;
930	dtrace_icookie_t cookie;
931	uint_t is_enabled = 0;
932
933	fill_frame_regs(tf, &reg);
934	rp = &reg;
935
936	pc = rp->r_rip - 1;
937
938	/*
939	 * It's possible that a user (in a veritable orgy of bad planning)
940	 * could redirect this thread's flow of control before it reached the
941	 * return probe fasttrap. In this case we need to kill the process
942	 * since it's in a unrecoverable state.
943	 */
944	if (curthread->t_dtrace_step) {
945		ASSERT(curthread->t_dtrace_on);
946		fasttrap_sigtrap(p, curthread, pc);
947		return (0);
948	}
949
950	/*
951	 * Clear all user tracing flags.
952	 */
953	curthread->t_dtrace_ft = 0;
954	curthread->t_dtrace_pc = 0;
955	curthread->t_dtrace_npc = 0;
956	curthread->t_dtrace_scrpc = 0;
957	curthread->t_dtrace_astpc = 0;
958#ifdef __amd64
959	curthread->t_dtrace_regv = 0;
960#endif
961
962	/*
963	 * Treat a child created by a call to vfork(2) as if it were its
964	 * parent. We know that there's only one thread of control in such a
965	 * process: this one.
966	 */
967	pp = p;
968	sx_slock(&proctree_lock);
969	while (pp->p_vmspace == pp->p_pptr->p_vmspace)
970		pp = pp->p_pptr;
971	pid = pp->p_pid;
972	if (pp != p) {
973		PROC_LOCK(pp);
974		if ((pp->p_flag & P_WEXIT) != 0) {
975			/*
976			 * This can happen if the child was created with
977			 * rfork(2).  Userspace tracing cannot work reliably in
978			 * such a scenario, but we can at least try.
979			 */
980			PROC_UNLOCK(pp);
981			sx_sunlock(&proctree_lock);
982			return (-1);
983		}
984		_PHOLD_LITE(pp);
985		PROC_UNLOCK(pp);
986	}
987	sx_sunlock(&proctree_lock);
988
989	rm_rlock(&fasttrap_tp_lock, &tracker);
990
991	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
992
993	/*
994	 * Lookup the tracepoint that the process just hit.
995	 */
996	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
997		if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
998		    tp->ftt_proc->ftpc_acount != 0)
999			break;
1000	}
1001
1002	/*
1003	 * If we couldn't find a matching tracepoint, either a tracepoint has
1004	 * been inserted without using the pid<pid> ioctl interface (see
1005	 * fasttrap_ioctl), or somehow we have mislaid this tracepoint.
1006	 */
1007	if (tp == NULL) {
1008		rm_runlock(&fasttrap_tp_lock, &tracker);
1009		gen = atomic_load_acq_64(&pp->p_fasttrap_tp_gen);
1010		if (pp != p)
1011			PRELE(pp);
1012		if (curthread->t_fasttrap_tp_gen != gen) {
1013			/*
1014			 * At least one tracepoint associated with this PID has
1015			 * been removed from the table since #BP was raised.
1016			 * Speculate that we hit a tracepoint that has since
1017			 * been removed, and retry the instruction.
1018			 */
1019			curthread->t_fasttrap_tp_gen = gen;
1020#ifdef __amd64
1021			tf->tf_rip = pc;
1022#else
1023			tf->tf_eip = pc;
1024#endif
1025			return (0);
1026		}
1027		return (-1);
1028	}
1029	if (pp != p)
1030		PRELE(pp);
1031
1032	/*
1033	 * Set the program counter to the address of the traced instruction
1034	 * so that it looks right in ustack() output.
1035	 */
1036	rp->r_rip = pc;
1037
1038	if (tp->ftt_ids != NULL) {
1039		fasttrap_id_t *id;
1040
1041#ifdef __amd64
1042		if (p->p_model == DATAMODEL_LP64) {
1043			for (id = tp->ftt_ids; id != NULL; id = id->fti_next) {
1044				fasttrap_probe_t *probe = id->fti_probe;
1045
1046				if (id->fti_ptype == DTFTP_ENTRY) {
1047					/*
1048					 * We note that this was an entry
1049					 * probe to help ustack() find the
1050					 * first caller.
1051					 */
1052					cookie = dtrace_interrupt_disable();
1053					DTRACE_CPUFLAG_SET(CPU_DTRACE_ENTRY);
1054					dtrace_probe(probe->ftp_id, rp->r_rdi,
1055					    rp->r_rsi, rp->r_rdx, rp->r_rcx,
1056					    rp->r_r8);
1057					DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_ENTRY);
1058					dtrace_interrupt_enable(cookie);
1059				} else if (id->fti_ptype == DTFTP_IS_ENABLED) {
1060					/*
1061					 * Note that in this case, we don't
1062					 * call dtrace_probe() since it's only
1063					 * an artificial probe meant to change
1064					 * the flow of control so that it
1065					 * encounters the true probe.
1066					 */
1067					is_enabled = 1;
1068				} else if (probe->ftp_argmap == NULL) {
1069					dtrace_probe(probe->ftp_id, rp->r_rdi,
1070					    rp->r_rsi, rp->r_rdx, rp->r_rcx,
1071					    rp->r_r8);
1072				} else {
1073					uintptr_t t[5];
1074
1075					fasttrap_usdt_args64(probe, rp,
1076					    sizeof (t) / sizeof (t[0]), t);
1077
1078					dtrace_probe(probe->ftp_id, t[0], t[1],
1079					    t[2], t[3], t[4]);
1080				}
1081			}
1082		} else {
1083#endif
1084			uintptr_t s0, s1, s2, s3, s4, s5;
1085			uint32_t *stack = (uint32_t *)rp->r_rsp;
1086
1087			/*
1088			 * In 32-bit mode, all arguments are passed on the
1089			 * stack. If this is a function entry probe, we need
1090			 * to skip the first entry on the stack as it
1091			 * represents the return address rather than a
1092			 * parameter to the function.
1093			 */
1094			s0 = fasttrap_fuword32_noerr(&stack[0]);
1095			s1 = fasttrap_fuword32_noerr(&stack[1]);
1096			s2 = fasttrap_fuword32_noerr(&stack[2]);
1097			s3 = fasttrap_fuword32_noerr(&stack[3]);
1098			s4 = fasttrap_fuword32_noerr(&stack[4]);
1099			s5 = fasttrap_fuword32_noerr(&stack[5]);
1100
1101			for (id = tp->ftt_ids; id != NULL; id = id->fti_next) {
1102				fasttrap_probe_t *probe = id->fti_probe;
1103
1104				if (id->fti_ptype == DTFTP_ENTRY) {
1105					/*
1106					 * We note that this was an entry
1107					 * probe to help ustack() find the
1108					 * first caller.
1109					 */
1110					cookie = dtrace_interrupt_disable();
1111					DTRACE_CPUFLAG_SET(CPU_DTRACE_ENTRY);
1112					dtrace_probe(probe->ftp_id, s1, s2,
1113					    s3, s4, s5);
1114					DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_ENTRY);
1115					dtrace_interrupt_enable(cookie);
1116				} else if (id->fti_ptype == DTFTP_IS_ENABLED) {
1117					/*
1118					 * Note that in this case, we don't
1119					 * call dtrace_probe() since it's only
1120					 * an artificial probe meant to change
1121					 * the flow of control so that it
1122					 * encounters the true probe.
1123					 */
1124					is_enabled = 1;
1125				} else if (probe->ftp_argmap == NULL) {
1126					dtrace_probe(probe->ftp_id, s0, s1,
1127					    s2, s3, s4);
1128				} else {
1129					uint32_t t[5];
1130
1131					fasttrap_usdt_args32(probe, rp,
1132					    sizeof (t) / sizeof (t[0]), t);
1133
1134					dtrace_probe(probe->ftp_id, t[0], t[1],
1135					    t[2], t[3], t[4]);
1136				}
1137			}
1138#ifdef __amd64
1139		}
1140#endif
1141	}
1142
1143	/*
1144	 * We're about to do a bunch of work so we cache a local copy of
1145	 * the tracepoint to emulate the instruction, and then find the
1146	 * tracepoint again later if we need to light up any return probes.
1147	 */
1148	tp_local = *tp;
1149	rm_runlock(&fasttrap_tp_lock, &tracker);
1150	tp = &tp_local;
1151
1152	/*
1153	 * Set the program counter to appear as though the traced instruction
1154	 * had completely executed. This ensures that fasttrap_getreg() will
1155	 * report the expected value for REG_RIP.
1156	 */
1157	rp->r_rip = pc + tp->ftt_size;
1158
1159	/*
1160	 * If there's an is-enabled probe connected to this tracepoint it
1161	 * means that there was a 'xorl %eax, %eax' or 'xorq %rax, %rax'
1162	 * instruction that was placed there by DTrace when the binary was
1163	 * linked. As this probe is, in fact, enabled, we need to stuff 1
1164	 * into %eax or %rax. Accordingly, we can bypass all the instruction
1165	 * emulation logic since we know the inevitable result. It's possible
1166	 * that a user could construct a scenario where the 'is-enabled'
1167	 * probe was on some other instruction, but that would be a rather
1168	 * exotic way to shoot oneself in the foot.
1169	 */
1170	if (is_enabled) {
1171		rp->r_rax = 1;
1172		new_pc = rp->r_rip;
1173		goto done;
1174	}
1175
1176	/*
1177	 * We emulate certain types of instructions to ensure correctness
1178	 * (in the case of position dependent instructions) or optimize
1179	 * common cases. The rest we have the thread execute back in user-
1180	 * land.
1181	 */
1182	switch (tp->ftt_type) {
1183	case FASTTRAP_T_RET:
1184	case FASTTRAP_T_RET16:
1185	{
1186		uintptr_t dst = 0;
1187		uintptr_t addr = 0;
1188		int ret = 0;
1189
1190		/*
1191		 * We have to emulate _every_ facet of the behavior of a ret
1192		 * instruction including what happens if the load from %esp
1193		 * fails; in that case, we send a SIGSEGV.
1194		 */
1195#ifdef __amd64
1196		if (p->p_model == DATAMODEL_NATIVE) {
1197			ret = dst = fasttrap_fulword((void *)rp->r_rsp);
1198			addr = rp->r_rsp + sizeof (uintptr_t);
1199		} else {
1200#endif
1201			uint32_t dst32;
1202			ret = dst32 = fasttrap_fuword32((void *)rp->r_rsp);
1203			dst = dst32;
1204			addr = rp->r_rsp + sizeof (uint32_t);
1205#ifdef __amd64
1206		}
1207#endif
1208
1209		if (ret == -1) {
1210			fasttrap_sigsegv(p, curthread, rp->r_rsp);
1211			new_pc = pc;
1212			break;
1213		}
1214
1215		if (tp->ftt_type == FASTTRAP_T_RET16)
1216			addr += tp->ftt_dest;
1217
1218		rp->r_rsp = addr;
1219		new_pc = dst;
1220		break;
1221	}
1222
1223	case FASTTRAP_T_JCC:
1224	{
1225		uint_t taken = 0;
1226
1227		switch (tp->ftt_code) {
1228		case FASTTRAP_JO:
1229			taken = (rp->r_rflags & FASTTRAP_EFLAGS_OF) != 0;
1230			break;
1231		case FASTTRAP_JNO:
1232			taken = (rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0;
1233			break;
1234		case FASTTRAP_JB:
1235			taken = (rp->r_rflags & FASTTRAP_EFLAGS_CF) != 0;
1236			break;
1237		case FASTTRAP_JAE:
1238			taken = (rp->r_rflags & FASTTRAP_EFLAGS_CF) == 0;
1239			break;
1240		case FASTTRAP_JE:
1241			taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) != 0;
1242			break;
1243		case FASTTRAP_JNE:
1244			taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) == 0;
1245			break;
1246		case FASTTRAP_JBE:
1247			taken = (rp->r_rflags & FASTTRAP_EFLAGS_CF) != 0 ||
1248			    (rp->r_rflags & FASTTRAP_EFLAGS_ZF) != 0;
1249			break;
1250		case FASTTRAP_JA:
1251			taken = (rp->r_rflags & FASTTRAP_EFLAGS_CF) == 0 &&
1252			    (rp->r_rflags & FASTTRAP_EFLAGS_ZF) == 0;
1253			break;
1254		case FASTTRAP_JS:
1255			taken = (rp->r_rflags & FASTTRAP_EFLAGS_SF) != 0;
1256			break;
1257		case FASTTRAP_JNS:
1258			taken = (rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0;
1259			break;
1260		case FASTTRAP_JP:
1261			taken = (rp->r_rflags & FASTTRAP_EFLAGS_PF) != 0;
1262			break;
1263		case FASTTRAP_JNP:
1264			taken = (rp->r_rflags & FASTTRAP_EFLAGS_PF) == 0;
1265			break;
1266		case FASTTRAP_JL:
1267			taken = ((rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0) !=
1268			    ((rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0);
1269			break;
1270		case FASTTRAP_JGE:
1271			taken = ((rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0) ==
1272			    ((rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0);
1273			break;
1274		case FASTTRAP_JLE:
1275			taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) != 0 ||
1276			    ((rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0) !=
1277			    ((rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0);
1278			break;
1279		case FASTTRAP_JG:
1280			taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) == 0 &&
1281			    ((rp->r_rflags & FASTTRAP_EFLAGS_SF) == 0) ==
1282			    ((rp->r_rflags & FASTTRAP_EFLAGS_OF) == 0);
1283			break;
1284
1285		}
1286
1287		if (taken)
1288			new_pc = tp->ftt_dest;
1289		else
1290			new_pc = pc + tp->ftt_size;
1291		break;
1292	}
1293
1294	case FASTTRAP_T_LOOP:
1295	{
1296		uint_t taken = 0;
1297#ifdef __amd64
1298		greg_t cx = rp->r_rcx--;
1299#else
1300		greg_t cx = rp->r_ecx--;
1301#endif
1302
1303		switch (tp->ftt_code) {
1304		case FASTTRAP_LOOPNZ:
1305			taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) == 0 &&
1306			    cx != 0;
1307			break;
1308		case FASTTRAP_LOOPZ:
1309			taken = (rp->r_rflags & FASTTRAP_EFLAGS_ZF) != 0 &&
1310			    cx != 0;
1311			break;
1312		case FASTTRAP_LOOP:
1313			taken = (cx != 0);
1314			break;
1315		}
1316
1317		if (taken)
1318			new_pc = tp->ftt_dest;
1319		else
1320			new_pc = pc + tp->ftt_size;
1321		break;
1322	}
1323
1324	case FASTTRAP_T_JCXZ:
1325	{
1326#ifdef __amd64
1327		greg_t cx = rp->r_rcx;
1328#else
1329		greg_t cx = rp->r_ecx;
1330#endif
1331
1332		if (cx == 0)
1333			new_pc = tp->ftt_dest;
1334		else
1335			new_pc = pc + tp->ftt_size;
1336		break;
1337	}
1338
1339	case FASTTRAP_T_PUSHL_EBP:
1340	{
1341		int ret = 0;
1342
1343#ifdef __amd64
1344		if (p->p_model == DATAMODEL_NATIVE) {
1345			rp->r_rsp -= sizeof (uintptr_t);
1346			ret = fasttrap_sulword((void *)rp->r_rsp, rp->r_rbp);
1347		} else {
1348#endif
1349			rp->r_rsp -= sizeof (uint32_t);
1350			ret = fasttrap_suword32((void *)rp->r_rsp, rp->r_rbp);
1351#ifdef __amd64
1352		}
1353#endif
1354
1355		if (ret == -1) {
1356			fasttrap_sigsegv(p, curthread, rp->r_rsp);
1357			new_pc = pc;
1358			break;
1359		}
1360
1361		new_pc = pc + tp->ftt_size;
1362		break;
1363	}
1364
1365	case FASTTRAP_T_NOP:
1366		new_pc = pc + tp->ftt_size;
1367		break;
1368
1369	case FASTTRAP_T_JMP:
1370	case FASTTRAP_T_CALL:
1371		if (tp->ftt_code == 0) {
1372			new_pc = tp->ftt_dest;
1373		} else {
1374			uintptr_t value, addr = tp->ftt_dest;
1375
1376			if (tp->ftt_base != FASTTRAP_NOREG)
1377				addr += fasttrap_getreg(rp, tp->ftt_base);
1378			if (tp->ftt_index != FASTTRAP_NOREG)
1379				addr += fasttrap_getreg(rp, tp->ftt_index) <<
1380				    tp->ftt_scale;
1381
1382			if (tp->ftt_code == 1) {
1383				/*
1384				 * If there's a segment prefix for this
1385				 * instruction, we'll need to check permissions
1386				 * and bounds on the given selector, and adjust
1387				 * the address accordingly.
1388				 */
1389				if (tp->ftt_segment != FASTTRAP_SEG_NONE &&
1390				    fasttrap_do_seg(tp, rp, &addr) != 0) {
1391					fasttrap_sigsegv(p, curthread, addr);
1392					new_pc = pc;
1393					break;
1394				}
1395
1396#ifdef __amd64
1397				if (p->p_model == DATAMODEL_NATIVE) {
1398#endif
1399					if ((value = fasttrap_fulword((void *)addr))
1400					     == -1) {
1401						fasttrap_sigsegv(p, curthread,
1402						    addr);
1403						new_pc = pc;
1404						break;
1405					}
1406					new_pc = value;
1407#ifdef __amd64
1408				} else {
1409					uint32_t value32;
1410					addr = (uintptr_t)(uint32_t)addr;
1411					if ((value32 = fasttrap_fuword32((void *)addr))
1412					    == -1) {
1413						fasttrap_sigsegv(p, curthread,
1414						    addr);
1415						new_pc = pc;
1416						break;
1417					}
1418					new_pc = value32;
1419				}
1420#endif
1421			} else {
1422				new_pc = addr;
1423			}
1424		}
1425
1426		/*
1427		 * If this is a call instruction, we need to push the return
1428		 * address onto the stack. If this fails, we send the process
1429		 * a SIGSEGV and reset the pc to emulate what would happen if
1430		 * this instruction weren't traced.
1431		 */
1432		if (tp->ftt_type == FASTTRAP_T_CALL) {
1433			int ret = 0;
1434			uintptr_t addr = 0, pcps;
1435#ifdef __amd64
1436			if (p->p_model == DATAMODEL_NATIVE) {
1437				addr = rp->r_rsp - sizeof (uintptr_t);
1438				pcps = pc + tp->ftt_size;
1439				ret = fasttrap_sulword((void *)addr, pcps);
1440			} else {
1441#endif
1442				addr = rp->r_rsp - sizeof (uint32_t);
1443				pcps = (uint32_t)(pc + tp->ftt_size);
1444				ret = fasttrap_suword32((void *)addr, pcps);
1445#ifdef __amd64
1446			}
1447#endif
1448
1449			if (ret == -1) {
1450				fasttrap_sigsegv(p, curthread, addr);
1451				new_pc = pc;
1452				break;
1453			}
1454
1455			rp->r_rsp = addr;
1456		}
1457
1458		break;
1459
1460	case FASTTRAP_T_COMMON:
1461	{
1462		uintptr_t addr;
1463#if defined(__amd64)
1464		uint8_t scratch[2 * FASTTRAP_MAX_INSTR_SIZE + 22];
1465#else
1466		uint8_t scratch[2 * FASTTRAP_MAX_INSTR_SIZE + 7];
1467#endif
1468		uint_t i = 0;
1469		fasttrap_scrspace_t *scrspace;
1470		scrspace = fasttrap_scraddr(curthread, tp->ftt_proc);
1471		if (scrspace == NULL) {
1472			/*
1473			 * We failed to allocate scratch space for this thread.
1474			 * Try to write the original instruction back out and
1475			 * reset the pc.
1476			 */
1477			if (fasttrap_copyout(tp->ftt_instr, (void *)pc,
1478			    tp->ftt_size))
1479				fasttrap_sigtrap(p, curthread, pc);
1480			new_pc = pc;
1481			break;
1482		}
1483		addr = scrspace->ftss_addr;
1484
1485		/*
1486		 * Generic Instruction Tracing
1487		 * ---------------------------
1488		 *
1489		 * This is the layout of the scratch space in the user-land
1490		 * thread structure for our generated instructions.
1491		 *
1492		 *	32-bit mode			bytes
1493		 *	------------------------	-----
1494		 * a:	<original instruction>		<= 15
1495		 *	jmp	<pc + tp->ftt_size>	    5
1496		 * b:	<original instruction>		<= 15
1497		 *	int	T_DTRACE_RET		    2
1498		 *					-----
1499		 *					<= 37
1500		 *
1501		 *	64-bit mode			bytes
1502		 *	------------------------	-----
1503		 * a:	<original instruction>		<= 15
1504		 *	jmp	0(%rip)			    6
1505		 *	<pc + tp->ftt_size>		    8
1506		 * b:	<original instruction>		<= 15
1507		 * 	int	T_DTRACE_RET		    2
1508		 * 					-----
1509		 * 					<= 46
1510		 *
1511		 * The %pc is set to a, and curthread->t_dtrace_astpc is set
1512		 * to b. If we encounter a signal on the way out of the
1513		 * kernel, trap() will set %pc to curthread->t_dtrace_astpc
1514		 * so that we execute the original instruction and re-enter
1515		 * the kernel rather than redirecting to the next instruction.
1516		 *
1517		 * If there are return probes (so we know that we're going to
1518		 * need to reenter the kernel after executing the original
1519		 * instruction), the scratch space will just contain the
1520		 * original instruction followed by an interrupt -- the same
1521		 * data as at b.
1522		 *
1523		 * %rip-relative Addressing
1524		 * ------------------------
1525		 *
1526		 * There's a further complication in 64-bit mode due to %rip-
1527		 * relative addressing. While this is clearly a beneficial
1528		 * architectural decision for position independent code, it's
1529		 * hard not to see it as a personal attack against the pid
1530		 * provider since before there was a relatively small set of
1531		 * instructions to emulate; with %rip-relative addressing,
1532		 * almost every instruction can potentially depend on the
1533		 * address at which it's executed. Rather than emulating
1534		 * the broad spectrum of instructions that can now be
1535		 * position dependent, we emulate jumps and others as in
1536		 * 32-bit mode, and take a different tack for instructions
1537		 * using %rip-relative addressing.
1538		 *
1539		 * For every instruction that uses the ModRM byte, the
1540		 * in-kernel disassembler reports its location. We use the
1541		 * ModRM byte to identify that an instruction uses
1542		 * %rip-relative addressing and to see what other registers
1543		 * the instruction uses. To emulate those instructions,
1544		 * we modify the instruction to be %rax-relative rather than
1545		 * %rip-relative (or %rcx-relative if the instruction uses
1546		 * %rax; or %r8- or %r9-relative if the REX.B is present so
1547		 * we don't have to rewrite the REX prefix). We then load
1548		 * the value that %rip would have been into the scratch
1549		 * register and generate an instruction to reset the scratch
1550		 * register back to its original value. The instruction
1551		 * sequence looks like this:
1552		 *
1553		 *	64-mode %rip-relative		bytes
1554		 *	------------------------	-----
1555		 * a:	<modified instruction>		<= 15
1556		 *	movq	$<value>, %<scratch>	    6
1557		 *	jmp	0(%rip)			    6
1558		 *	<pc + tp->ftt_size>		    8
1559		 * b:	<modified instruction>  	<= 15
1560		 * 	int	T_DTRACE_RET		    2
1561		 * 					-----
1562		 *					   52
1563		 *
1564		 * We set curthread->t_dtrace_regv so that upon receiving
1565		 * a signal we can reset the value of the scratch register.
1566		 */
1567
1568		ASSERT(tp->ftt_size <= FASTTRAP_MAX_INSTR_SIZE);
1569
1570		curthread->t_dtrace_scrpc = addr;
1571		bcopy(tp->ftt_instr, &scratch[i], tp->ftt_size);
1572		i += tp->ftt_size;
1573
1574#ifdef __amd64
1575		if (tp->ftt_ripmode != 0) {
1576			greg_t *reg = NULL;
1577
1578			ASSERT(p->p_model == DATAMODEL_LP64);
1579			ASSERT(tp->ftt_ripmode &
1580			    (FASTTRAP_RIP_1 | FASTTRAP_RIP_2));
1581
1582			/*
1583			 * If this was a %rip-relative instruction, we change
1584			 * it to be either a %rax- or %rcx-relative
1585			 * instruction (depending on whether those registers
1586			 * are used as another operand; or %r8- or %r9-
1587			 * relative depending on the value of REX.B). We then
1588			 * set that register and generate a movq instruction
1589			 * to reset the value.
1590			 */
1591			if (tp->ftt_ripmode & FASTTRAP_RIP_X)
1592				scratch[i++] = FASTTRAP_REX(1, 0, 0, 1);
1593			else
1594				scratch[i++] = FASTTRAP_REX(1, 0, 0, 0);
1595
1596			if (tp->ftt_ripmode & FASTTRAP_RIP_1)
1597				scratch[i++] = FASTTRAP_MOV_EAX;
1598			else
1599				scratch[i++] = FASTTRAP_MOV_ECX;
1600
1601			switch (tp->ftt_ripmode) {
1602			case FASTTRAP_RIP_1:
1603				reg = &rp->r_rax;
1604				curthread->t_dtrace_reg = REG_RAX;
1605				break;
1606			case FASTTRAP_RIP_2:
1607				reg = &rp->r_rcx;
1608				curthread->t_dtrace_reg = REG_RCX;
1609				break;
1610			case FASTTRAP_RIP_1 | FASTTRAP_RIP_X:
1611				reg = &rp->r_r8;
1612				curthread->t_dtrace_reg = REG_R8;
1613				break;
1614			case FASTTRAP_RIP_2 | FASTTRAP_RIP_X:
1615				reg = &rp->r_r9;
1616				curthread->t_dtrace_reg = REG_R9;
1617				break;
1618			}
1619
1620			/* LINTED - alignment */
1621			*(uint64_t *)&scratch[i] = *reg;
1622			curthread->t_dtrace_regv = *reg;
1623			*reg = pc + tp->ftt_size;
1624			i += sizeof (uint64_t);
1625		}
1626#endif
1627
1628		/*
1629		 * Generate the branch instruction to what would have
1630		 * normally been the subsequent instruction. In 32-bit mode,
1631		 * this is just a relative branch; in 64-bit mode this is a
1632		 * %rip-relative branch that loads the 64-bit pc value
1633		 * immediately after the jmp instruction.
1634		 */
1635#ifdef __amd64
1636		if (p->p_model == DATAMODEL_LP64) {
1637			scratch[i++] = FASTTRAP_GROUP5_OP;
1638			scratch[i++] = FASTTRAP_MODRM(0, 4, 5);
1639			/* LINTED - alignment */
1640			*(uint32_t *)&scratch[i] = 0;
1641			i += sizeof (uint32_t);
1642			/* LINTED - alignment */
1643			*(uint64_t *)&scratch[i] = pc + tp->ftt_size;
1644			i += sizeof (uint64_t);
1645		} else {
1646#endif
1647			/*
1648			 * Set up the jmp to the next instruction; note that
1649			 * the size of the traced instruction cancels out.
1650			 */
1651			scratch[i++] = FASTTRAP_JMP32;
1652			/* LINTED - alignment */
1653			*(uint32_t *)&scratch[i] = pc - addr - 5;
1654			i += sizeof (uint32_t);
1655#ifdef __amd64
1656		}
1657#endif
1658
1659		curthread->t_dtrace_astpc = addr + i;
1660		bcopy(tp->ftt_instr, &scratch[i], tp->ftt_size);
1661		i += tp->ftt_size;
1662		scratch[i++] = FASTTRAP_INT;
1663		scratch[i++] = T_DTRACE_RET;
1664
1665		ASSERT(i <= sizeof (scratch));
1666
1667		if (uwrite(curproc, scratch, i, addr) != 0) {
1668			fasttrap_sigtrap(p, curthread, pc);
1669			new_pc = pc;
1670			break;
1671		}
1672		if (tp->ftt_retids != NULL) {
1673			curthread->t_dtrace_step = 1;
1674			curthread->t_dtrace_ret = 1;
1675			new_pc = curthread->t_dtrace_astpc;
1676		} else {
1677			new_pc = curthread->t_dtrace_scrpc;
1678		}
1679
1680		curthread->t_dtrace_pc = pc;
1681		curthread->t_dtrace_npc = pc + tp->ftt_size;
1682		curthread->t_dtrace_on = 1;
1683		break;
1684	}
1685
1686	default:
1687		panic("fasttrap: mishandled an instruction");
1688	}
1689
1690done:
1691	/*
1692	 * If there were no return probes when we first found the tracepoint,
1693	 * we should feel no obligation to honor any return probes that were
1694	 * subsequently enabled -- they'll just have to wait until the next
1695	 * time around.
1696	 */
1697	if (tp->ftt_retids != NULL) {
1698		/*
1699		 * We need to wait until the results of the instruction are
1700		 * apparent before invoking any return probes. If this
1701		 * instruction was emulated we can just call
1702		 * fasttrap_return_common(); if it needs to be executed, we
1703		 * need to wait until the user thread returns to the kernel.
1704		 */
1705		if (tp->ftt_type != FASTTRAP_T_COMMON) {
1706			/*
1707			 * Set the program counter to the address of the traced
1708			 * instruction so that it looks right in ustack()
1709			 * output. We had previously set it to the end of the
1710			 * instruction to simplify %rip-relative addressing.
1711			 */
1712			rp->r_rip = pc;
1713
1714			fasttrap_return_common(rp, pc, pid, new_pc);
1715		} else {
1716			ASSERT(curthread->t_dtrace_ret != 0);
1717			ASSERT(curthread->t_dtrace_pc == pc);
1718			ASSERT(curthread->t_dtrace_scrpc != 0);
1719			ASSERT(new_pc == curthread->t_dtrace_astpc);
1720		}
1721	}
1722
1723	rp->r_rip = new_pc;
1724
1725	PROC_LOCK(p);
1726	proc_write_regs(curthread, rp);
1727	PROC_UNLOCK(p);
1728
1729	return (0);
1730}
1731
1732int
1733fasttrap_return_probe(struct trapframe *tf)
1734{
1735	struct reg reg, *rp;
1736	proc_t *p = curproc;
1737	uintptr_t pc = curthread->t_dtrace_pc;
1738	uintptr_t npc = curthread->t_dtrace_npc;
1739
1740	fill_frame_regs(tf, &reg);
1741	rp = &reg;
1742
1743	curthread->t_dtrace_pc = 0;
1744	curthread->t_dtrace_npc = 0;
1745	curthread->t_dtrace_scrpc = 0;
1746	curthread->t_dtrace_astpc = 0;
1747
1748#ifdef illumos
1749	/*
1750	 * Treat a child created by a call to vfork(2) as if it were its
1751	 * parent. We know that there's only one thread of control in such a
1752	 * process: this one.
1753	 */
1754	while (p->p_flag & SVFORK) {
1755		p = p->p_parent;
1756	}
1757#endif
1758
1759	/*
1760	 * We set rp->r_rip to the address of the traced instruction so
1761	 * that it appears to dtrace_probe() that we're on the original
1762	 * instruction.
1763	 */
1764	rp->r_rip = pc;
1765
1766	fasttrap_return_common(rp, pc, p->p_pid, npc);
1767
1768	return (0);
1769}
1770
1771/*ARGSUSED*/
1772uint64_t
1773fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
1774    int aframes)
1775{
1776	struct reg r;
1777
1778	fill_regs(curthread, &r);
1779
1780	return (fasttrap_anarg(&r, 1, argno));
1781}
1782
1783/*ARGSUSED*/
1784uint64_t
1785fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
1786    int aframes)
1787{
1788	struct reg r;
1789
1790	fill_regs(curthread, &r);
1791
1792	return (fasttrap_anarg(&r, 0, argno));
1793}
1794
1795static ulong_t
1796fasttrap_getreg(struct reg *rp, uint_t reg)
1797{
1798#ifdef __amd64
1799	switch (reg) {
1800	case REG_R15:		return (rp->r_r15);
1801	case REG_R14:		return (rp->r_r14);
1802	case REG_R13:		return (rp->r_r13);
1803	case REG_R12:		return (rp->r_r12);
1804	case REG_R11:		return (rp->r_r11);
1805	case REG_R10:		return (rp->r_r10);
1806	case REG_R9:		return (rp->r_r9);
1807	case REG_R8:		return (rp->r_r8);
1808	case REG_RDI:		return (rp->r_rdi);
1809	case REG_RSI:		return (rp->r_rsi);
1810	case REG_RBP:		return (rp->r_rbp);
1811	case REG_RBX:		return (rp->r_rbx);
1812	case REG_RDX:		return (rp->r_rdx);
1813	case REG_RCX:		return (rp->r_rcx);
1814	case REG_RAX:		return (rp->r_rax);
1815	case REG_TRAPNO:	return (rp->r_trapno);
1816	case REG_ERR:		return (rp->r_err);
1817	case REG_RIP:		return (rp->r_rip);
1818	case REG_CS:		return (rp->r_cs);
1819	case REG_RFL:		return (rp->r_rflags);
1820	case REG_RSP:		return (rp->r_rsp);
1821	case REG_SS:		return (rp->r_ss);
1822	case REG_FS:		return (rp->r_fs);
1823	case REG_GS:		return (rp->r_gs);
1824	case REG_DS:		return (rp->r_ds);
1825	case REG_ES:		return (rp->r_es);
1826	case REG_FSBASE:	return (rdmsr(MSR_FSBASE));
1827	case REG_GSBASE:	return (rdmsr(MSR_GSBASE));
1828	}
1829
1830	panic("dtrace: illegal register constant");
1831	/*NOTREACHED*/
1832#else
1833#define _NGREG 19
1834	if (reg >= _NGREG)
1835		panic("dtrace: illegal register constant");
1836
1837	return (((greg_t *)&rp->r_gs)[reg]);
1838#endif
1839}
1840