/*
 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/assert.h>
#include <kern/clock.h>
#include <kern/locks.h>
#include <kern/sched_prim.h>
#include <kern/debug.h>
#include <mach/machine/thread_status.h>
#include <mach/thread_act.h>
#include <mach/branch_predicates.h>

#include <sys/kernel.h>
#include <sys/vm.h>
#include <sys/proc_internal.h>
#include <sys/syscall.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/errno.h>
#include <sys/kdebug.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/kauth.h>

#include <security/audit/audit.h>

#include <i386/seg.h>
#include <i386/machine_routines.h>
#include <mach/i386/syscall_sw.h>

#include <machine/pal_routines.h>

#if CONFIG_DTRACE
extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *);
extern void dtrace_systrace_syscall_return(unsigned short, int, int *);
#endif

extern void unix_syscall(x86_saved_state_t *);
extern void unix_syscall64(x86_saved_state_t *);
extern void *find_user_regs(thread_t);

/* dynamically generated at build time based on syscalls.master */
extern const char *syscallnames[];

/*
 * Function:	unix_syscall
 *
 * Inputs:	regs	- pointer to i386 save area
 *
 * Outputs:	none
 */
void
unix_syscall(x86_saved_state_t *state)
{
	thread_t		thread;
	void			*vt;
	unsigned int		code;
	struct sysent		*callp;

	int			error;
	vm_offset_t		params;
	struct proc		*p;
	struct uthread		*uthread;
	x86_saved_state32_t	*regs;
	boolean_t		is_vfork;

	assert(is_saved_state32(state));
	regs = saved_state32(state);
#if DEBUG
	if (regs->eax == 0x800)
		thread_exception_return();
#endif
	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	/* Get the appropriate proc; may be different from the task's for vfork() */
	is_vfork = uthread->uu_flag & UT_VFORK;
	if (__improbable(is_vfork != 0))
		p = current_proc();
	else
		p = (struct proc *)get_bsdtask_info(current_task());

	/* Verify that we are not being called from a task without a proc */
	if (__improbable(p == NULL)) {
		regs->eax = EPERM;
		regs->efl |= EFL_CF;
		task_terminate_internal(current_task());
		thread_exception_return();
		/* NOTREACHED */
	}

	code = regs->eax & I386_SYSCALL_NUMBER_MASK;
	DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n",
							  code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip);
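	/*
	 * For a 32-bit process the syscall arguments live on the user
	 * stack, immediately above the return address pushed by the
	 * user-mode syscall stub.
	 */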
	params = (vm_offset_t) (regs->uesp + sizeof (int));

	regs->efl &= ~(EFL_CF);

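	/*
	 * Out-of-range syscall numbers are redirected to sysent[63],
	 * which is expected to be a nosys-style entry, so a bogus
	 * number fails cleanly instead of indexing past the table.
	 */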
	callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

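	/*
	 * Syscall #0 is the indirect syscall(2): the real syscall number
	 * is the first word on the user stack, and the remaining
	 * arguments start one slot further up.
	 */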
	if (__improbable(callp == sysent)) {
		code = fuword(params);
		params += sizeof(int);
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
	}

	vt = (void *)uthread->uu_arg;

	if (callp->sy_arg_bytes != 0) {
#if CONFIG_REQUIRES_U32_MUNGING
		sy_munge_t	*mungerp;
#else
#error U32 syscalls on x86_64 kernel requires munging
#endif
		uint32_t	 nargs;

		assert((unsigned) callp->sy_arg_bytes <= sizeof (uthread->uu_arg));
		nargs = callp->sy_arg_bytes;
		error = copyin((user_addr_t) params, (char *) vt, nargs);
		if (error) {
			regs->eax = error;
			regs->efl |= EFL_CF;
			thread_exception_return();
			/* NOTREACHED */
		}

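		/*
		 * Syscall 180 (kdebug_trace) is skipped here to avoid
		 * polluting the kdebug buffer with the tracing syscall itself.
		 */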
		if (__probable(code != 180)) {
			int *ip = (int *)vt;

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
				*ip, *(ip+1), *(ip+2), *(ip+3), 0);
		}

#if CONFIG_REQUIRES_U32_MUNGING
		mungerp = callp->sy_arg_munge32;

		if (mungerp != NULL)
			(*mungerp)(vt);
#endif
	} else
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
			0, 0, 0, 0, 0);

	/*
	 * Delayed binding of thread credential to process credential, if we
	 * are not running with an explicitly set thread credential.
	 */
	kauth_cred_uthread_update(uthread, p);

	uthread->uu_rval[0] = 0;
	uthread->uu_rval[1] = 0;
	uthread->uu_flag |= UT_NOTCANCELPT;
	uthread->syscall_code = code;

#ifdef JOE_DEBUG
	uthread->uu_iocount = 0;
	uthread->uu_vpindex = 0;
#endif

	AUDIT_SYSCALL_ENTER(code, p, uthread);
	error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0]));
	AUDIT_SYSCALL_EXIT(code, p, uthread, error);

#ifdef JOE_DEBUG
	if (uthread->uu_iocount)
		printf("system call returned with uu_iocount != 0\n");
#endif
#if CONFIG_DTRACE
	uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */

	if (__improbable(error == ERESTART)) {
		/*
		 * Move the user's pc back to repeat the syscall:
		 * 5 bytes for a sysenter, or 2 for an int 8x.
		 * The SYSENTER_TF_CS covers single-stepping over a sysenter
		 * - see debug trap handler in idt.s/idt64.s
		 */

		pal_syscall_restart(thread, state);
	}
	else if (error != EJUSTRETURN) {
		if (__improbable(error)) {
			regs->eax = error;
			regs->efl |= EFL_CF;	/* carry bit */
		} else { /* (not error) */
			/*
			 * We split retval across two registers, in case the
			 * syscall had a 64-bit return value, in which case
			 * eax/edx matches the function call ABI.
			 */
			regs->eax = uthread->uu_rval[0];
			regs->edx = uthread->uu_rval[1];
		}
	}

	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall: error=%d retval=(%u,%u)\n",
		error, regs->eax, regs->edx);

	uthread->uu_flag &= ~UT_NOTCANCELPT;

	if (__improbable(uthread->uu_lowpri_window)) {
		/*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(1);
	}
	if (__probable(code != 180))
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) {
		pal_execve_return(thread);
	}

	thread_exception_return();
	/* NOTREACHED */
}


void
unix_syscall64(x86_saved_state_t *state)
{
	thread_t		thread;
	void			*vt;
	unsigned int		code;
	struct sysent		*callp;
	int			args_in_regs;
	boolean_t		args_start_at_rdi;
	int			error;
	struct proc		*p;
	struct uthread		*uthread;
	x86_saved_state64_t	*regs;

	assert(is_saved_state64(state));
	regs = saved_state64(state);
#if	DEBUG
	if (regs->rax == 0x2000800)
		thread_exception_return();
#endif
	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	/* Get the appropriate proc; may be different from the task's for vfork() */
	if (__probable(!(uthread->uu_flag & UT_VFORK)))
		p = (struct proc *)get_bsdtask_info(current_task());
	else
		p = current_proc();

	/* Verify that we are not being called from a task without a proc */
	if (__improbable(p == NULL)) {
		regs->rax = EPERM;
		regs->isf.rflags |= EFL_CF;
		task_terminate_internal(current_task());
		thread_exception_return();
		/* NOTREACHED */
	}

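	/*
	 * For 64-bit user space the syscall number in %rax carries a
	 * class prefix (the UNIX syscall class); SYSCALL_NUMBER_MASK
	 * strips it, leaving the index into sysent[].
	 */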
	code = regs->rax & SYSCALL_NUMBER_MASK;
	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall64: code=%d(%s) rip=%llx\n",
		code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip);
	callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

	vt = (void *)uthread->uu_arg;

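	/*
	 * Per the x86_64 ABI the first six arguments arrive in
	 * %rdi..%r9.  For an indirect syscall, %rdi holds the syscall
	 * number itself, so only five argument registers remain.
	 */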
	if (__improbable(callp == sysent)) {
		/*
		 * indirect system call... system call number
		 * passed as 'arg0'
		 */
		code = regs->rdi;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
		args_start_at_rdi = FALSE;
		args_in_regs = 5;
	} else {
		args_start_at_rdi = TRUE;
		args_in_regs = 6;
	}

	if (callp->sy_narg != 0) {
		assert(callp->sy_narg <= 8); /* size of uu_arg */

		args_in_regs = MIN(args_in_regs, callp->sy_narg);
		memcpy(vt, args_start_at_rdi ? &regs->rdi : &regs->rsi, args_in_regs * sizeof(syscall_arg_t));

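		/* Skip the tracepoint for kdebug_trace (syscall 180) itself. */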
		if (code != 180) {
			uint64_t *ip = (uint64_t *)vt;

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
				(int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0);
		}

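		/*
		 * Arguments beyond those passed in registers spill onto the
		 * user stack; skip the return-address slot and copy them in.
		 */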
		if (__improbable(callp->sy_narg > args_in_regs)) {
			int copyin_count;

			copyin_count = (callp->sy_narg - args_in_regs) * sizeof(syscall_arg_t);

			error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&uthread->uu_arg[args_in_regs], copyin_count);
			if (error) {
				regs->rax = error;
				regs->isf.rflags |= EFL_CF;
				thread_exception_return();
				/* NOTREACHED */
			}
		}
	} else
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
			0, 0, 0, 0, 0);

	/*
	 * Delayed binding of thread credential to process credential, if we
	 * are not running with an explicitly set thread credential.
	 */
	kauth_cred_uthread_update(uthread, p);

	uthread->uu_rval[0] = 0;
	uthread->uu_rval[1] = 0;
	uthread->uu_flag |= UT_NOTCANCELPT;
	uthread->syscall_code = code;

#ifdef JOE_DEBUG
	uthread->uu_iocount = 0;
	uthread->uu_vpindex = 0;
#endif

	AUDIT_SYSCALL_ENTER(code, p, uthread);
	error = (*(callp->sy_call))((void *) p, vt, &(uthread->uu_rval[0]));
	AUDIT_SYSCALL_EXIT(code, p, uthread, error);

#ifdef JOE_DEBUG
	if (uthread->uu_iocount)
		printf("system call returned with uu_iocount != 0\n");
#endif

#if CONFIG_DTRACE
	uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */

	if (__improbable(error == ERESTART)) {
		/*
		 * All system calls come through via the syscall instruction
		 * in 64-bit mode... it's 2 bytes in length.
		 * Move the user's pc back to repeat the syscall:
		 */
		pal_syscall_restart( thread, state );
	}
	else if (error != EJUSTRETURN) {
		if (__improbable(error)) {
			regs->rax = error;
			regs->isf.rflags |= EFL_CF;	/* carry bit */
		} else { /* (not error) */

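			/*
			 * Materialize the return value in %rax (and %rdx for
			 * the paired 32-bit case) according to the syscall's
			 * declared return type, then clear the carry flag to
			 * signal success to the user-mode stub.
			 */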
			switch (callp->sy_return_type) {
			case _SYSCALL_RET_INT_T:
				regs->rax = uthread->uu_rval[0];
				regs->rdx = uthread->uu_rval[1];
				break;
			case _SYSCALL_RET_UINT_T:
				regs->rax = ((u_int)uthread->uu_rval[0]);
				regs->rdx = ((u_int)uthread->uu_rval[1]);
				break;
			case _SYSCALL_RET_OFF_T:
			case _SYSCALL_RET_ADDR_T:
			case _SYSCALL_RET_SIZE_T:
			case _SYSCALL_RET_SSIZE_T:
			case _SYSCALL_RET_UINT64_T:
				regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
				regs->rdx = 0;
				break;
			case _SYSCALL_RET_NONE:
				break;
			default:
				panic("unix_syscall: unknown return type");
				break;
			}
			regs->isf.rflags &= ~EFL_CF;
		}
	}

	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall64: error=%d retval=(%llu,%llu)\n",
		error, regs->rax, regs->rdx);

	uthread->uu_flag &= ~UT_NOTCANCELPT;

	if (__improbable(uthread->uu_lowpri_window)) {
		/*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(1);
	}
	if (__probable(code != 180))
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	thread_exception_return();
	/* NOTREACHED */
}


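/*
 * unix_syscall_return
 *
 * Completion path for system calls that finish on a continuation rather
 * than unwinding back through unix_syscall/unix_syscall64: it locates the
 * saved user registers, publishes the error/return values, and exits to
 * user space.
 */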
void
unix_syscall_return(int error)
{
	thread_t		thread;
	struct uthread		*uthread;
	struct proc		*p;
	unsigned int		code;
	struct sysent		*callp;

	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	pal_register_cache_state(thread, DIRTY);

	p = current_proc();

	if (proc_is64bit(p)) {
		x86_saved_state64_t *regs;

		regs = saved_state64(find_user_regs(thread));

		code = uthread->syscall_code;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall)
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (error == ERESTART) {
			/*
			 * repeat the syscall
			 */
			pal_syscall_restart( thread, find_user_regs(thread) );
		}
		else if (error != EJUSTRETURN) {
			if (error) {
				regs->rax = error;
				regs->isf.rflags |= EFL_CF;	/* carry bit */
			} else { /* (not error) */

				switch (callp->sy_return_type) {
				case _SYSCALL_RET_INT_T:
					regs->rax = uthread->uu_rval[0];
					regs->rdx = uthread->uu_rval[1];
					break;
				case _SYSCALL_RET_UINT_T:
					regs->rax = ((u_int)uthread->uu_rval[0]);
					regs->rdx = ((u_int)uthread->uu_rval[1]);
					break;
				case _SYSCALL_RET_OFF_T:
				case _SYSCALL_RET_ADDR_T:
				case _SYSCALL_RET_SIZE_T:
				case _SYSCALL_RET_SSIZE_T:
				case _SYSCALL_RET_UINT64_T:
					regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
					regs->rdx = 0;
					break;
				case _SYSCALL_RET_NONE:
					break;
				default:
					panic("unix_syscall: unknown return type");
					break;
				}
				regs->isf.rflags &= ~EFL_CF;
			}
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%llu,%llu)\n",
			error, regs->rax, regs->rdx);
	} else {
		x86_saved_state32_t	*regs;

		regs = saved_state32(find_user_regs(thread));

		regs->efl &= ~(EFL_CF);

		code = uthread->syscall_code;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall)
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (error == ERESTART) {
			pal_syscall_restart( thread, find_user_regs(thread) );
		}
		else if (error != EJUSTRETURN) {
			if (error) {
				regs->eax = error;
				regs->efl |= EFL_CF;	/* carry bit */
			} else { /* (not error) */
				regs->eax = uthread->uu_rval[0];
				regs->edx = uthread->uu_rval[1];
			}
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%u,%u)\n",
			error, regs->eax, regs->edx);
	}

	uthread->uu_flag &= ~UT_NOTCANCELPT;

	if (uthread->uu_lowpri_window) {
		/*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(1);
	}
	if (code != 180)
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	thread_exception_return();
	/* NOTREACHED */
}