1/*
2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28#include <kern/task.h>
29#include <kern/thread.h>
30#include <kern/assert.h>
31#include <kern/clock.h>
32#include <kern/locks.h>
33#include <kern/sched_prim.h>
34#include <kern/debug.h>
35#include <mach/machine/thread_status.h>
36#include <mach/thread_act.h>
37#include <mach/branch_predicates.h>
38
39#include <sys/kernel.h>
40#include <sys/vm.h>
41#include <sys/proc_internal.h>
42#include <sys/syscall.h>
43#include <sys/systm.h>
44#include <sys/user.h>
45#include <sys/errno.h>
46#include <sys/kdebug.h>
47#include <sys/sysent.h>
48#include <sys/sysproto.h>
49#include <sys/kauth.h>
50#include <sys/systm.h>
51
52#include <security/audit/audit.h>
53
54#include <i386/seg.h>
55#include <i386/machine_routines.h>
56#include <mach/i386/syscall_sw.h>
57
58#include <machine/pal_routines.h>
59
60#if CONFIG_DTRACE
61extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *);
62extern void dtrace_systrace_syscall_return(unsigned short, int, int *);
63#endif
64
65extern void unix_syscall(x86_saved_state_t *);
66extern void unix_syscall64(x86_saved_state_t *);
67extern void *find_user_regs(thread_t);
68
69extern void x86_toggle_sysenter_arg_store(thread_t thread, boolean_t valid);
70extern boolean_t x86_sysenter_arg_store_isvalid(thread_t thread);
71
72/* dynamically generated at build time based on syscalls.master */
73extern const char *syscallnames[];
74
75/*
76 * This needs to be a single switch so that it's "all on" or "all off",
77 * rather than being turned on for some code paths and not others, as this
78 * has a tendency to introduce "blame the next guy" bugs.
79 */
80#if DEBUG
81#define	FUNNEL_DEBUG	1	/* Check for funnel held on exit */
82#endif
83
84/*
85 * Function:	unix_syscall
86 *
87 * Inputs:	regs	- pointer to i386 save area
88 *
89 * Outputs:	none
90 */
91void
92unix_syscall(x86_saved_state_t *state)
93{
94	thread_t		thread;
95	void			*vt;
96	unsigned int		code;
97	struct sysent		*callp;
98
99	int			error =;
100	vm_offset_t		params;
101	struct proc		*p;
102	struct uthread		*uthread;
103	x86_saved_state32_t	*regs;
104	boolean_t		args_in_uthread;
105	boolean_t		is_vfork;
106
107	assert(is_saved_state32(state));
108	regs = saved_state32(state);
109#if DEBUG
110	if (regs->eax == 0x800)
111		thread_exception_return();
112#endif
113	thread = current_thread();
114	uthread = get_bsdthread_info(thread);
115
116	/* Get the approriate proc; may be different from task's for vfork() */
117	is_vfork = uthread->uu_flag & UT_VFORK;
118	if (__improbable(is_vfork != 0))
119		p = current_proc();
120	else
121		p = (struct proc *)get_bsdtask_info(current_task());
122
123	/* Verify that we are not being called from a task without a proc */
124	if (__improbable(p == NULL)) {
125		regs->eax = EPERM;
126		regs->efl |= EFL_CF;
127		task_terminate_internal(current_task());
128		thread_exception_return();
129		/* NOTREACHED */
130	}
131
132	code = regs->eax & I386_SYSCALL_NUMBER_MASK;
133	DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n",
134							  code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip);
135	args_in_uthread = ((regs->eax & I386_SYSCALL_ARG_BYTES_MASK) != 0) && x86_sysenter_arg_store_isvalid(thread);
136	params = (vm_offset_t) (regs->uesp + sizeof (int));
137
138	regs->efl &= ~(EFL_CF);
139
140	callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
141
142	if (__improbable(callp == sysent)) {
143		code = fuword(params);
144		params += sizeof(int);
145		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
146	}
147
148	vt = (void *)uthread->uu_arg;
149
150	if (callp->sy_arg_bytes != 0) {
151		sy_munge_t	*mungerp;
152
153		assert((unsigned) callp->sy_arg_bytes <= sizeof (uthread->uu_arg));
154		if (!args_in_uthread)
155		{
156			uint32_t nargs;
157			nargs = callp->sy_arg_bytes;
158			error = copyin((user_addr_t) params, (char *) vt, nargs);
159			if (error) {
160				regs->eax = error;
161				regs->efl |= EFL_CF;
162				thread_exception_return();
163				/* NOTREACHED */
164			}
165		}
166
167		if (__probable(code != 180)) {
168	        	int *ip = (int *)vt;
169
170			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
171				BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
172				*ip, *(ip+1), *(ip+2), *(ip+3), 0);
173		}
174		mungerp = callp->sy_arg_munge32;
175
176		/*
177		 * If non-NULL, then call the syscall argument munger to
178		 * copy in arguments (see xnu/bsd/dev/{i386|x86_64}/munge.s); the
179		 * first argument is NULL because we are munging in place
180		 * after a copyin because the ABI currently doesn't use
181		 * registers to pass system call arguments.
182		 */
183		if (mungerp != NULL)
184			(*mungerp)(NULL, vt);
185	} else
186		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
187			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
188			0, 0, 0, 0, 0);
189
190	/*
191	 * Delayed binding of thread credential to process credential, if we
192	 * are not running with an explicitly set thread credential.
193	 */
194	kauth_cred_uthread_update(uthread, p);
195
196	uthread->uu_rval[0] = 0;
197	uthread->uu_rval[1] = regs->edx;
198	uthread->uu_flag |= UT_NOTCANCELPT;
199
200
201#ifdef JOE_DEBUG
202        uthread->uu_iocount = 0;
203        uthread->uu_vpindex = 0;
204#endif
205
206	AUDIT_SYSCALL_ENTER(code, p, uthread);
207	error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0]));
208        AUDIT_SYSCALL_EXIT(code, p, uthread, error);
209
210#ifdef JOE_DEBUG
211        if (uthread->uu_iocount)
212                printf("system call returned with uu_iocount != 0\n");
213#endif
214#if CONFIG_DTRACE
215	uthread->t_dtrace_errno = error;
216#endif /* CONFIG_DTRACE */
217
218	if (__improbable(error == ERESTART)) {
219		/*
220		 * Move the user's pc back to repeat the syscall:
221		 * 5 bytes for a sysenter, or 2 for an int 8x.
222		 * The SYSENTER_TF_CS covers single-stepping over a sysenter
223		 * - see debug trap handler in idt.s/idt64.s
224		 */
225
226		pal_syscall_restart(thread, state);
227	}
228	else if (error != EJUSTRETURN) {
229		if (__improbable(error)) {
230		    regs->eax = error;
231		    regs->efl |= EFL_CF;	/* carry bit */
232		} else { /* (not error) */
233		    regs->eax = uthread->uu_rval[0];
234		    regs->edx = uthread->uu_rval[1];
235		}
236	}
237
238	DEBUG_KPRINT_SYSCALL_UNIX(
239		"unix_syscall: error=%d retval=(%u,%u)\n",
240		error, regs->eax, regs->edx);
241
242	uthread->uu_flag &= ~UT_NOTCANCELPT;
243#if FUNNEL_DEBUG
244	/*
245	 * if we're holding the funnel panic
246	 */
247	syscall_exit_funnelcheck();
248#endif /* FUNNEL_DEBUG */
249
250	if (__improbable(uthread->uu_lowpri_window)) {
251	        /*
252		 * task is marked as a low priority I/O type
253		 * and the I/O we issued while in this system call
254		 * collided with normal I/O operations... we'll
255		 * delay in order to mitigate the impact of this
256		 * task on the normal operation of the system
257		 */
258		throttle_lowpri_io(TRUE);
259	}
260	if (__probable(code != 180))
261		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
262			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
263			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);
264
265	if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) {
266		pal_execve_return(thread);
267	}
268
269	thread_exception_return();
270	/* NOTREACHED */
271}
272
273
/*
 * Function:	unix_syscall64
 *
 * Inputs:	state	- pointer to x86_64 saved state (64-bit syscall entry)
 *
 * Outputs:	none; exits to user space via thread_exception_return()
 */
void
unix_syscall64(x86_saved_state_t *state)
{
	thread_t	thread;
	unsigned int	code;
	struct sysent	*callp;
	void		*uargp;
	int		args_in_regs;
	int		error;
	struct proc	*p;
	struct uthread	*uthread;
	x86_saved_state64_t *regs;

	assert(is_saved_state64(state));
	regs = saved_state64(state);
#if	DEBUG
	/* debug-only fast exit for a magic syscall number */
	if (regs->rax == 0x2000800)
		thread_exception_return();
#endif
	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	/* Get the appropriate proc; may be different from task's for vfork() */
	if (__probable(!(uthread->uu_flag & UT_VFORK)))
		p = (struct proc *)get_bsdtask_info(current_task());
	else
		p = current_proc();

	/* Verify that we are not being called from a task without a proc */
	if (__improbable(p == NULL)) {
		regs->rax = EPERM;
		regs->isf.rflags |= EFL_CF;	/* carry flag signals error to user space */
		task_terminate_internal(current_task());
		thread_exception_return();
		/* NOTREACHED */
	}
	/* up to 6 syscall arguments arrive in registers, starting at rdi */
	args_in_regs = 6;

	code = regs->rax & SYSCALL_NUMBER_MASK;
	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall64: code=%d(%s) rip=%llx\n",
		code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip);
	/* out-of-range numbers are vectored to sysent[63] */
	callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
	uargp = (void *)(&regs->rdi);

	if (__improbable(callp == sysent)) {
	        /*
		 * indirect system call... system call number
		 * passed as 'arg0'
		 */
	        code = regs->rdi;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
		uargp = (void *)(&regs->rsi);	/* arguments shift down one register */
		args_in_regs = 5;
	}

	if (callp->sy_narg != 0) {
		/* syscall 180 is excluded from kdebug tracing (presumably kdebug_trace itself) */
		if (code != 180) {
			uint64_t *ip = (uint64_t *)uargp;

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
				(int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0);
		}
		assert(callp->sy_narg <= 8);

		/* arguments beyond the register set spill to the user stack */
		if (__improbable(callp->sy_narg > args_in_regs)) {
			int copyin_count;

			copyin_count = (callp->sy_narg - args_in_regs) * sizeof(uint64_t);

			/* +sizeof(user_addr_t) skips the return address on the user stack */
			error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&regs->v_arg6, copyin_count);
			if (error) {
				regs->rax = error;
				regs->isf.rflags |= EFL_CF;
				thread_exception_return();
				/* NOTREACHED */
			}
		}
		/*
		 * XXX Turn 64 bit unsafe calls into nosys()
		 */
		if (__improbable(callp->sy_flags & UNSAFE_64BIT)) {
			callp = &sysent[63];
			goto unsafe;
		}
	} else
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
			0, 0, 0, 0, 0);
unsafe:

	/*
	 * Delayed binding of thread credential to process credential, if we
	 * are not running with an explicitly set thread credential.
	 */
	kauth_cred_uthread_update(uthread, p);

	uthread->uu_rval[0] = 0;
	uthread->uu_rval[1] = 0;


	/* mark the thread non-cancelable for the duration of the call */
	uthread->uu_flag |= UT_NOTCANCELPT;

#ifdef JOE_DEBUG
        uthread->uu_iocount = 0;
        uthread->uu_vpindex = 0;
#endif

	AUDIT_SYSCALL_ENTER(code, p, uthread);
	/* dispatch to the implementation through the sysent table */
	error = (*(callp->sy_call))((void *) p, uargp, &(uthread->uu_rval[0]));
        AUDIT_SYSCALL_EXIT(code, p, uthread, error);

#ifdef JOE_DEBUG
        if (uthread->uu_iocount)
               printf("system call returned with uu_iocount != 0\n");
#endif

#if CONFIG_DTRACE
	uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */

	if (__improbable(error == ERESTART)) {
		/*
		 * all system calls come through via the syscall instruction
		 * in 64 bit mode... its 2 bytes in length
		 * move the user's pc back to repeat the syscall:
		 */
		pal_syscall_restart( thread, state );
	}
	else if (error != EJUSTRETURN) {
		if (__improbable(error)) {
			regs->rax = error;
			regs->isf.rflags |= EFL_CF;	/* carry bit */
		} else { /* (not error) */

			/* set rax/rdx from uu_rval per the declared return type */
			switch (callp->sy_return_type) {
			case _SYSCALL_RET_INT_T:
				regs->rax = uthread->uu_rval[0];
				regs->rdx = uthread->uu_rval[1];
				break;
			case _SYSCALL_RET_UINT_T:
				regs->rax = ((u_int)uthread->uu_rval[0]);
				regs->rdx = ((u_int)uthread->uu_rval[1]);
				break;
			case _SYSCALL_RET_OFF_T:
			case _SYSCALL_RET_ADDR_T:
			case _SYSCALL_RET_SIZE_T:
			case _SYSCALL_RET_SSIZE_T:
			case _SYSCALL_RET_UINT64_T:
				/* both 32-bit rval words form one 64-bit result */
			        regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
				regs->rdx = 0;
				break;
			case _SYSCALL_RET_NONE:
				break;
			default:
				panic("unix_syscall: unknown return type");
				break;
			}
			regs->isf.rflags &= ~EFL_CF;
		}
	}

	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall64: error=%d retval=(%llu,%llu)\n",
		error, regs->rax, regs->rdx);

	uthread->uu_flag &= ~UT_NOTCANCELPT;

#if FUNNEL_DEBUG
	/*
	 * if we're holding the funnel panic
	 */
	syscall_exit_funnelcheck();
#endif /* FUNNEL_DEBUG */

	if (__improbable(uthread->uu_lowpri_window)) {
	        /*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(TRUE);
	}
	if (__probable(code != 180))
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	thread_exception_return();
	/* NOTREACHED */
}
468
469
/*
 * Function:	unix_syscall_return
 *
 * Inputs:	error	- error/status code from a previously-dispatched
 *			  system call (continuation path)
 *
 * Outputs:	none; exits to user space via thread_exception_return()
 *
 * Writes the result into the thread's saved user register state,
 * handling both 64-bit and 32-bit processes.
 */
void
unix_syscall_return(int error)
{
	thread_t		thread;
	struct uthread		*uthread;
	struct proc *p;
	unsigned int code;
	vm_offset_t params;
	struct sysent *callp;

	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	pal_register_cache_state(thread, DIRTY);

	p = current_proc();

	if (proc_is64bit(p)) {
		x86_saved_state64_t *regs;

		regs = saved_state64(find_user_regs(thread));

		/* reconstruct code for tracing before blasting rax */
		code = regs->rax & SYSCALL_NUMBER_MASK;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

		if (callp == sysent)
			/*
			 * indirect system call... system call number
			 * passed as 'arg0'
			 */
			code = regs->rdi;

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall)
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (error == ERESTART) {
			/*
			 * repeat the syscall
			 */
			pal_syscall_restart( thread, find_user_regs(thread) );
		}
		else if (error != EJUSTRETURN) {
			if (error) {
				regs->rax = error;
				regs->isf.rflags |= EFL_CF;	/* carry bit */
			} else { /* (not error) */

				/* set rax/rdx from uu_rval per the declared return type */
				switch (callp->sy_return_type) {
				case _SYSCALL_RET_INT_T:
					regs->rax = uthread->uu_rval[0];
					regs->rdx = uthread->uu_rval[1];
					break;
				case _SYSCALL_RET_UINT_T:
					regs->rax = ((u_int)uthread->uu_rval[0]);
					regs->rdx = ((u_int)uthread->uu_rval[1]);
					break;
				case _SYSCALL_RET_OFF_T:
				case _SYSCALL_RET_ADDR_T:
				case _SYSCALL_RET_SIZE_T:
				case _SYSCALL_RET_SSIZE_T:
				case _SYSCALL_RET_UINT64_T:
					/* both 32-bit rval words form one 64-bit result */
					regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
					regs->rdx = 0;
					break;
				case _SYSCALL_RET_NONE:
					break;
				default:
					panic("unix_syscall: unknown return type");
					break;
				}
				regs->isf.rflags &= ~EFL_CF;
			}
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%llu,%llu)\n",
			error, regs->rax, regs->rdx);
	} else {
		x86_saved_state32_t	*regs;

		regs = saved_state32(find_user_regs(thread));

		regs->efl &= ~(EFL_CF);
		/* reconstruct code for tracing before blasting eax */
		code = regs->eax & I386_SYSCALL_NUMBER_MASK;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall)
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (callp == sysent) {
			/* indirect syscall: real code is first user-stack argument */
			params = (vm_offset_t) (regs->uesp + sizeof (int));
			code = fuword(params);
		}
		if (error == ERESTART) {
			pal_syscall_restart( thread, find_user_regs(thread) );
		}
		else if (error != EJUSTRETURN) {
			if (error) {
				regs->eax = error;
				regs->efl |= EFL_CF;	/* carry bit */
			} else { /* (not error) */
				regs->eax = uthread->uu_rval[0];
				regs->edx = uthread->uu_rval[1];
			}
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%u,%u)\n",
			error, regs->eax, regs->edx);
	}


	uthread->uu_flag &= ~UT_NOTCANCELPT;

#if FUNNEL_DEBUG
	/*
	 * if we're holding the funnel panic
	 */
	syscall_exit_funnelcheck();
#endif /* FUNNEL_DEBUG */

	if (uthread->uu_lowpri_window) {
	        /*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(TRUE);
	}
	/* syscall 180 is excluded from kdebug tracing (presumably kdebug_trace itself) */
	if (code != 180)
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	thread_exception_return();
	/* NOTREACHED */
}
615
/*
 * Widen a packed 32-bit argument list of shape w w w l w w (six args,
 * the fourth 64 bits wide) into an array of 64-bit argument slots.
 * Conversion happens in place in out64; in32 is unused.
 */
void
munge_wwwlww(
	__unused const void	*in32,
	void			*out64)
{
	uint32_t	*words = (uint32_t *) out64;
	uint64_t	*wides = (uint64_t *) out64;
	uint32_t	saved[7];
	int		i;

	/*
	 * Snapshot the seven packed input words first, so the in-place
	 * expansion cannot clobber input that has not been read yet.
	 */
	for (i = 0; i < 7; i++)
		saved[i] = words[i];

	wides[0] = saved[0];	/* Wwwlww */
	wides[1] = saved[1];	/* wWwlww */
	wides[2] = saved[2];	/* wwWlww */
	words[6] = saved[3];	/* wwwLww (lo) */
	words[7] = saved[4];	/* wwwLww (hi) */
	wides[4] = saved[5];	/* wwwlWw */
	wides[5] = saved[6];	/* wwwlwW */
}
636
637
/*
 * Widen a packed 32-bit argument list of shape w w l w w w (six args,
 * the third 64 bits wide) into an array of 64-bit argument slots.
 * Conversion happens in place in out64; in32 is unused.
 */
void
munge_wwlwww(
	__unused const void	*in32,
	void			*out64)
{
	uint32_t	*words = (uint32_t *) out64;
	uint64_t	*wides = (uint64_t *) out64;
	uint32_t	saved[7];
	int		i;

	/* snapshot packed input words, then widen in place */
	for (i = 0; i < 7; i++)
		saved[i] = words[i];

	wides[0] = saved[0];	/* Wwlwww */
	wides[1] = saved[1];	/* wWlwww */
	words[4] = saved[2];	/* wwLwww (lo) */
	words[5] = saved[3];	/* wwLwww (hi) */
	wides[3] = saved[4];	/* wwlWww */
	wides[4] = saved[5];	/* wwlwWw */
	wides[5] = saved[6];	/* wwlwwW */
}
658
659