1/*
2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28#include <kern/task.h>
29#include <kern/thread.h>
30#include <kern/assert.h>
31#include <kern/clock.h>
32#include <kern/locks.h>
33#include <kern/sched_prim.h>
34#include <kern/debug.h>
35#include <mach/machine/thread_status.h>
36#include <mach/thread_act.h>
37#include <mach/branch_predicates.h>
38
39#include <sys/kernel.h>
40#include <sys/vm.h>
41#include <sys/proc_internal.h>
42#include <sys/syscall.h>
43#include <sys/systm.h>
44#include <sys/user.h>
45#include <sys/errno.h>
46#include <sys/kdebug.h>
47#include <sys/sysent.h>
48#include <sys/sysproto.h>
49#include <sys/kauth.h>
50#include <sys/systm.h>
51
52#include <security/audit/audit.h>
53
54#include <i386/seg.h>
55#include <i386/machine_routines.h>
56#include <mach/i386/syscall_sw.h>
57
58#include <machine/pal_routines.h>
59
#if CONFIG_DTRACE
/* DTrace systrace provider hooks -- presumably defined in bsd/dev/dtrace/systrace.c; verify */
extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *);
extern void dtrace_systrace_syscall_return(unsigned short, int, int *);
#endif

/* Entry points called from the low-level trap/syscall handlers */
extern void unix_syscall(x86_saved_state_t *);
extern void unix_syscall64(x86_saved_state_t *);
extern void *find_user_regs(thread_t);

/* dynamically generated at build time based on syscalls.master */
extern const char *syscallnames[];

/*
 * This needs to be a single switch so that it's "all on" or "all off",
 * rather than being turned on for some code paths and not others, as this
 * has a tendency to introduce "blame the next guy" bugs.
 */
#if DEBUG
#define	FUNNEL_DEBUG	1	/* Check for funnel held on exit */
#endif
80
/*
 * Function:	unix_syscall
 *
 * BSD system call dispatcher for 32-bit user processes.  Decodes the
 * syscall number from the saved eax, copies the arguments in from the
 * user stack, invokes the handler out of the sysent[] table, and posts
 * the result (or errno plus the carry flag) back into the saved
 * register state.  Never returns to its caller; it exits to user mode
 * via thread_exception_return().
 *
 * Inputs:	regs	- pointer to i386 save area
 *
 * Outputs:	none
 */
void
unix_syscall(x86_saved_state_t *state)
{
	thread_t		thread;
	void			*vt;
	unsigned int		code;
	struct sysent		*callp;

	int			error;
	vm_offset_t		params;
	struct proc		*p;
	struct uthread		*uthread;
	x86_saved_state32_t	*regs;
	boolean_t		is_vfork;

	assert(is_saved_state32(state));
	regs = saved_state32(state);
#if DEBUG
	/* DEBUG-only escape hatch: eax == 0x800 bails straight back to user mode */
	if (regs->eax == 0x800)
		thread_exception_return();
#endif
	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	/* Get the appropriate proc; may be different from task's for vfork() */
	is_vfork = uthread->uu_flag & UT_VFORK;
	if (__improbable(is_vfork != 0))
		p = current_proc();
	else
		p = (struct proc *)get_bsdtask_info(current_task());

	/* Verify that we are not being called from a task without a proc */
	if (__improbable(p == NULL)) {
		regs->eax = EPERM;
		regs->efl |= EFL_CF;
		task_terminate_internal(current_task());
		thread_exception_return();
		/* NOTREACHED */
	}

	code = regs->eax & I386_SYSCALL_NUMBER_MASK;
	DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n",
							  code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip);
	/* user arguments begin just above the return address on the user stack */
	params = (vm_offset_t) (regs->uesp + sizeof (int));

	/* pre-clear carry; it is set again below only on error */
	regs->efl &= ~(EFL_CF);

	/* out-of-range codes are clamped to slot 63 -- presumably the invalid-syscall entry; confirm against syscalls.master */
	callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

	if (__improbable(callp == sysent)) {
		/* indirect system call (slot 0): the real number is the first word on the user stack */
		code = fuword(params);
		params += sizeof(int);
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
	}

	vt = (void *)uthread->uu_arg;
	uthread->uu_ap = vt;

	if (callp->sy_arg_bytes != 0) {
		sy_munge_t	*mungerp;
		uint32_t	 nargs;

		assert((unsigned) callp->sy_arg_bytes <= sizeof (uthread->uu_arg));
		nargs = callp->sy_arg_bytes;
		error = copyin((user_addr_t) params, (char *) vt, nargs);
		if (error) {
			regs->eax = error;
			regs->efl |= EFL_CF;
			thread_exception_return();
			/* NOTREACHED */
		}

		/* NOTE(review): code 180 looks like the kdebug trace syscall, excluded to avoid recursive tracing -- confirm against syscalls.master */
		if (__probable(code != 180)) {
	        	int *ip = (int *)vt;

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
				*ip, *(ip+1), *(ip+2), *(ip+3), 0);
		}
		mungerp = callp->sy_arg_munge32;

		/*
		 * If non-NULL, then call the syscall argument munger to
		 * copy in arguments (see xnu/bsd/dev/{i386|x86_64}/munge.s); the
		 * first argument is NULL because we are munging in place
		 * after a copyin because the ABI currently doesn't use
		 * registers to pass system call arguments.
		 */
		if (mungerp != NULL)
			(*mungerp)(NULL, vt);
	} else
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
			0, 0, 0, 0, 0);

	/*
	 * Delayed binding of thread credential to process credential, if we
	 * are not running with an explicitly set thread credential.
	 */
	kauth_cred_uthread_update(uthread, p);

	uthread->uu_rval[0] = 0;
	uthread->uu_rval[1] = regs->edx;
	/* mark the thread non-cancelable for the duration of the handler */
	uthread->uu_flag |= UT_NOTCANCELPT;


#ifdef JOE_DEBUG
        uthread->uu_iocount = 0;
        uthread->uu_vpindex = 0;
#endif

	AUDIT_SYSCALL_ENTER(code, p, uthread);
	error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0]));
        AUDIT_SYSCALL_EXIT(code, p, uthread, error);

#ifdef JOE_DEBUG
        if (uthread->uu_iocount)
                printf("system call returned with uu_iocount != 0\n");
#endif
#if CONFIG_DTRACE
	uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */

	if (__improbable(error == ERESTART)) {
		/*
		 * Move the user's pc back to repeat the syscall:
		 * 5 bytes for a sysenter, or 2 for an int 8x.
		 * The SYSENTER_TF_CS covers single-stepping over a sysenter
		 * - see debug trap handler in idt.s/idt64.s
		 */

		pal_syscall_restart(thread, state);
	}
	else if (error != EJUSTRETURN) {
		if (__improbable(error)) {
		    regs->eax = error;
		    regs->efl |= EFL_CF;	/* carry bit */
		} else { /* (not error) */
		    regs->eax = uthread->uu_rval[0];
		    regs->edx = uthread->uu_rval[1];
		}
	}

	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall: error=%d retval=(%u,%u)\n",
		error, regs->eax, regs->edx);

	uthread->uu_flag &= ~UT_NOTCANCELPT;
#if FUNNEL_DEBUG
	/*
	 * if we're holding the funnel panic
	 */
	syscall_exit_funnelcheck();
#endif /* FUNNEL_DEBUG */

	if (__improbable(uthread->uu_lowpri_window)) {
	        /*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(1);
	}
	if (__probable(code != 180))
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	/* a successful execve (not via vfork) gets a PAL fixup before returning */
	if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) {
		pal_execve_return(thread);
	}

	thread_exception_return();
	/* NOTREACHED */
}
265
266
/*
 * Function:	unix_syscall64
 *
 * BSD system call dispatcher for 64-bit user processes.  The first six
 * arguments arrive in registers starting at rdi; any remainder is
 * copied in from the user stack into the save area.  Posts the result
 * into rax/rdx according to the syscall's declared return type, or
 * errno plus the carry flag on failure.  Never returns to its caller;
 * it exits to user mode via thread_exception_return().
 *
 * Inputs:	state	- pointer to x86_64 save area (must be 64-bit)
 *
 * Outputs:	none
 */
void
unix_syscall64(x86_saved_state_t *state)
{
	thread_t	thread;
	unsigned int	code;
	struct sysent	*callp;
	void		*uargp;
	int		args_in_regs;
	int		error;
	struct proc	*p;
	struct uthread	*uthread;
	x86_saved_state64_t *regs;

	assert(is_saved_state64(state));
	regs = saved_state64(state);
#if	DEBUG
	/* DEBUG-only escape hatch: rax == 0x2000800 bails straight back to user mode */
	if (regs->rax == 0x2000800)
		thread_exception_return();
#endif
	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	/* Get the appropriate proc; may be different from task's for vfork() */
	if (__probable(!(uthread->uu_flag & UT_VFORK)))
		p = (struct proc *)get_bsdtask_info(current_task());
	else
		p = current_proc();

	/* Verify that we are not being called from a task without a proc */
	if (__improbable(p == NULL)) {
		regs->rax = EPERM;
		regs->isf.rflags |= EFL_CF;
		task_terminate_internal(current_task());
		thread_exception_return();
		/* NOTREACHED */
	}
	args_in_regs = 6;

	code = regs->rax & SYSCALL_NUMBER_MASK;
	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall64: code=%d(%s) rip=%llx\n",
		code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip);
	/* out-of-range codes are clamped to slot 63 -- presumably the invalid-syscall entry; confirm against syscalls.master */
	callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
	uargp = (void *)(&regs->rdi);

	if (__improbable(callp == sysent)) {
	        /*
		 * indirect system call... system call number
		 * passed as 'arg0'
		 */
	        code = regs->rdi;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
		/* real args shift down one slot, so only 5 arrive in registers */
		uargp = (void *)(&regs->rsi);
		args_in_regs = 5;
	}
	uthread->uu_ap = uargp;

	if (callp->sy_narg != 0) {
		/* NOTE(review): code 180 looks like the kdebug trace syscall, excluded to avoid recursive tracing -- confirm against syscalls.master */
		if (code != 180) {
			uint64_t *ip = (uint64_t *)uargp;

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
				(int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0);
		}
		assert(callp->sy_narg <= 8);

		if (__improbable(callp->sy_narg > args_in_regs)) {
			int copyin_count;

			copyin_count = (callp->sy_narg - args_in_regs) * sizeof(uint64_t);

			/* overflow arguments sit just above the return address on the user stack */
			error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&regs->v_arg6, copyin_count);
			if (error) {
				regs->rax = error;
				regs->isf.rflags |= EFL_CF;
				thread_exception_return();
				/* NOTREACHED */
			}
		}
	} else
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
			0, 0, 0, 0, 0);

	/*
	 * Delayed binding of thread credential to process credential, if we
	 * are not running with an explicitly set thread credential.
	 */
	kauth_cred_uthread_update(uthread, p);

	uthread->uu_rval[0] = 0;
	uthread->uu_rval[1] = 0;


	/* mark the thread non-cancelable for the duration of the handler */
	uthread->uu_flag |= UT_NOTCANCELPT;

#ifdef JOE_DEBUG
        uthread->uu_iocount = 0;
        uthread->uu_vpindex = 0;
#endif

	AUDIT_SYSCALL_ENTER(code, p, uthread);
	error = (*(callp->sy_call))((void *) p, uargp, &(uthread->uu_rval[0]));
        AUDIT_SYSCALL_EXIT(code, p, uthread, error);

#ifdef JOE_DEBUG
        if (uthread->uu_iocount)
               printf("system call returned with uu_iocount != 0\n");
#endif

#if CONFIG_DTRACE
	uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */

	if (__improbable(error == ERESTART)) {
		/*
		 * all system calls come through via the syscall instruction
		 * in 64 bit mode... its 2 bytes in length
		 * move the user's pc back to repeat the syscall:
		 */
		pal_syscall_restart( thread, state );
	}
	else if (error != EJUSTRETURN) {
		if (__improbable(error)) {
			regs->rax = error;
			regs->isf.rflags |= EFL_CF;	/* carry bit */
		} else { /* (not error) */

			/* marshal the result into rax/rdx per the declared return type */
			switch (callp->sy_return_type) {
			case _SYSCALL_RET_INT_T:
				regs->rax = uthread->uu_rval[0];
				regs->rdx = uthread->uu_rval[1];
				break;
			case _SYSCALL_RET_UINT_T:
				regs->rax = ((u_int)uthread->uu_rval[0]);
				regs->rdx = ((u_int)uthread->uu_rval[1]);
				break;
			case _SYSCALL_RET_OFF_T:
			case _SYSCALL_RET_ADDR_T:
			case _SYSCALL_RET_SIZE_T:
			case _SYSCALL_RET_SSIZE_T:
			case _SYSCALL_RET_UINT64_T:
				/* 64-bit result: the two 32-bit rval words reinterpreted as one uint64_t */
			        regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
				regs->rdx = 0;
				break;
			case _SYSCALL_RET_NONE:
				break;
			default:
				panic("unix_syscall: unknown return type");
				break;
			}
			regs->isf.rflags &= ~EFL_CF;
		}
	}

	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall64: error=%d retval=(%llu,%llu)\n",
		error, regs->rax, regs->rdx);

	uthread->uu_flag &= ~UT_NOTCANCELPT;

#if FUNNEL_DEBUG
	/*
	 * if we're holding the funnel panic
	 */
	syscall_exit_funnelcheck();
#endif /* FUNNEL_DEBUG */

	if (__improbable(uthread->uu_lowpri_window)) {
	        /*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(1);
	}
	if (__probable(code != 180))
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	thread_exception_return();
	/* NOTREACHED */
}
454
455
/*
 * Function:	unix_syscall_return
 *
 * Completion path for system calls that finish asynchronously (e.g.
 * resumed from a continuation rather than falling out of
 * unix_syscall/unix_syscall64).  Reconstructs the syscall number from
 * the thread's saved user register state, fires the DTrace and audit
 * exit hooks, posts error/result into the saved registers for whichever
 * ABI (32- or 64-bit) the process uses, and returns to user mode.
 *
 * Inputs:	error	- errno value from the completed syscall;
 *			  ERESTART and EJUSTRETURN are handled specially
 *
 * Outputs:	none (never returns to its caller)
 */
void
unix_syscall_return(int error)
{
	thread_t		thread;
	struct uthread		*uthread;
	struct proc *p;
	unsigned int code;
	vm_offset_t params;
	struct sysent *callp;

	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	pal_register_cache_state(thread, DIRTY);

	p = current_proc();

	if (proc_is64bit(p)) {
		x86_saved_state64_t *regs;

		regs = saved_state64(find_user_regs(thread));

		/* reconstruct code for tracing before blasting rax */
		code = regs->rax & SYSCALL_NUMBER_MASK;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

		if (callp == sysent)
			/*
			 * indirect system call... system call number
			 * passed as 'arg0'
			 */
			code = regs->rdi;

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall)
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (error == ERESTART) {
			/*
			 * repeat the syscall
			 */
			pal_syscall_restart( thread, find_user_regs(thread) );
		}
		else if (error != EJUSTRETURN) {
			if (error) {
				regs->rax = error;
				regs->isf.rflags |= EFL_CF;	/* carry bit */
			} else { /* (not error) */

				/* marshal the result into rax/rdx per the declared return type */
				switch (callp->sy_return_type) {
				case _SYSCALL_RET_INT_T:
					regs->rax = uthread->uu_rval[0];
					regs->rdx = uthread->uu_rval[1];
					break;
				case _SYSCALL_RET_UINT_T:
					regs->rax = ((u_int)uthread->uu_rval[0]);
					regs->rdx = ((u_int)uthread->uu_rval[1]);
					break;
				case _SYSCALL_RET_OFF_T:
				case _SYSCALL_RET_ADDR_T:
				case _SYSCALL_RET_SIZE_T:
				case _SYSCALL_RET_SSIZE_T:
				case _SYSCALL_RET_UINT64_T:
					/* 64-bit result: the two 32-bit rval words reinterpreted as one uint64_t */
					regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
					regs->rdx = 0;
					break;
				case _SYSCALL_RET_NONE:
					break;
				default:
					panic("unix_syscall: unknown return type");
					break;
				}
				regs->isf.rflags &= ~EFL_CF;
			}
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%llu,%llu)\n",
			error, regs->rax, regs->rdx);
	} else {
		x86_saved_state32_t	*regs;

		regs = saved_state32(find_user_regs(thread));

		regs->efl &= ~(EFL_CF);
		/* reconstruct code for tracing before blasting eax */
		code = regs->eax & I386_SYSCALL_NUMBER_MASK;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall)
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (callp == sysent) {
			/* indirect system call: the real number is the first word on the user stack */
			params = (vm_offset_t) (regs->uesp + sizeof (int));
			code = fuword(params);
		}
		if (error == ERESTART) {
			pal_syscall_restart( thread, find_user_regs(thread) );
		}
		else if (error != EJUSTRETURN) {
			if (error) {
				regs->eax = error;
				regs->efl |= EFL_CF;	/* carry bit */
			} else { /* (not error) */
				regs->eax = uthread->uu_rval[0];
				regs->edx = uthread->uu_rval[1];
			}
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%u,%u)\n",
			error, regs->eax, regs->edx);
	}


	uthread->uu_flag &= ~UT_NOTCANCELPT;

#if FUNNEL_DEBUG
	/*
	 * if we're holding the funnel panic
	 */
	syscall_exit_funnelcheck();
#endif /* FUNNEL_DEBUG */

	if (uthread->uu_lowpri_window) {
	        /*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(1);
	}
	/* NOTE(review): code 180 looks like the kdebug trace syscall, excluded to avoid recursive tracing -- confirm */
	if (code != 180)
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	thread_exception_return();
	/* NOTREACHED */
}
601
602