/*
 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/assert.h>
#include <kern/clock.h>
#include <kern/locks.h>
#include <kern/sched_prim.h>
#include <kern/debug.h>
#include <mach/machine/thread_status.h>
#include <mach/thread_act.h>
#include <mach/branch_predicates.h>

#include <sys/kernel.h>
#include <sys/vm.h>
#include <sys/proc_internal.h>
#include <sys/syscall.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/errno.h>
#include <sys/kdebug.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/kauth.h>
#include <sys/systm.h>

#include <security/audit/audit.h>

#include <i386/seg.h>
#include <i386/machine_routines.h>
#include <mach/i386/syscall_sw.h>

#include <machine/pal_routines.h>

#if CONFIG_DTRACE
extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *);
extern void dtrace_systrace_syscall_return(unsigned short, int, int *);
#endif

extern void unix_syscall(x86_saved_state_t *);
extern void unix_syscall64(x86_saved_state_t *);
extern void *find_user_regs(thread_t);

/* dynamically generated at build time based on syscalls.master */
extern const char *syscallnames[];

/*
 * This needs to be a single switch so that it's "all on" or "all off",
 * rather than being turned on for some code paths and not others, as this
 * has a tendency to introduce "blame the next guy" bugs.
 */
#if DEBUG
#define	FUNNEL_DEBUG	1	/* Check for funnel held on exit */
#endif

/*
 * Function:	unix_syscall
 *
 * Inputs:	regs - pointer to i386 save area
 *
 * Outputs:	none
 *
 * Dispatches a 32-bit BSD system call: looks up the sysent entry from the
 * trapped %eax, copies arguments in from the user stack, runs the handler,
 * and reports the result back through %eax/%edx with EFL_CF as the
 * error/success flag.  Does not return to the caller — every path ends in
 * thread_exception_return().
 */
void
unix_syscall(x86_saved_state_t *state)
{
	thread_t	thread;
	void		*vt;
	unsigned int	code;
	struct sysent	*callp;

	int		error;
	vm_offset_t	params;
	struct proc	*p;
	struct uthread	*uthread;
	x86_saved_state32_t	*regs;
	boolean_t	is_vfork;

	assert(is_saved_state32(state));
	regs = saved_state32(state);
#if DEBUG
	/* Debug-only escape hatch: trap value 0x800 bails out immediately. */
	if (regs->eax == 0x800)
		thread_exception_return();
#endif
	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	/* Get the approriate proc; may be different from task's for vfork() */
	is_vfork = uthread->uu_flag & UT_VFORK;
	if (__improbable(is_vfork != 0))
		p = current_proc();
	else
		p = (struct proc *)get_bsdtask_info(current_task());

	/* Verify that we are not being called from a task without a proc */
	if (__improbable(p == NULL)) {
		/* No proc: fail the call with EPERM and kill the task. */
		regs->eax = EPERM;
		regs->efl |= EFL_CF;	/* CF set = syscall error to userspace */
		task_terminate_internal(current_task());
		thread_exception_return();
		/* NOTREACHED */
	}

	code = regs->eax & I386_SYSCALL_NUMBER_MASK;
	/* Out-of-range codes are reported under slot 63 for the name lookup,
	 * matching the sysent fallback below (presumably the invalid-syscall
	 * entry — TODO confirm against syscalls.master). */
	DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n",
		code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip);
	/* Arguments start one int above the saved user stack pointer
	 * (skipping the return address pushed by the syscall stub). */
	params = (vm_offset_t) (regs->uesp + sizeof (int));

	/* Pre-clear carry; it is re-set below only on error. */
	regs->efl &= ~(EFL_CF);

	callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

	if (__improbable(callp == sysent)) {
		/* Indirect system call (code 0): the real syscall number is
		 * the first user-stack argument. */
		code = fuword(params);
		params += sizeof(int);
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
	}

	vt = (void *)uthread->uu_arg;
	uthread->uu_ap = vt;

	if (callp->sy_arg_bytes != 0) {
		sy_munge_t	*mungerp;
		uint32_t	nargs;

		assert((unsigned) callp->sy_arg_bytes <= sizeof (uthread->uu_arg));
		nargs = callp->sy_arg_bytes;
		error = copyin((user_addr_t) params, (char *) vt, nargs);
		if (error) {
			/* Bad user argument pointer: fail without dispatching. */
			regs->eax = error;
			regs->efl |= EFL_CF;
			thread_exception_return();
			/* NOTREACHED */
		}

		/* Code 180 is excluded from kdebug tracing here and below —
		 * presumably the kdebug trace syscall itself, to avoid
		 * self-tracing; TODO confirm against syscalls.master. */
		if (__probable(code != 180)) {
			int *ip = (int *)vt;

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
				*ip, *(ip+1), *(ip+2), *(ip+3), 0);
		}
		mungerp = callp->sy_arg_munge32;

		/*
		 * If non-NULL, then call the syscall argument munger to
		 * copy in arguments (see xnu/bsd/dev/{i386|x86_64}/munge.s); the
		 * first argument is NULL because we are munging in place
		 * after a copyin because the ABI currently doesn't use
		 * registers to pass system call arguments.
		 */
		if (mungerp != NULL)
			(*mungerp)(NULL, vt);
	} else
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
			0, 0, 0, 0, 0);

	/*
	 * Delayed binding of thread credential to process credential, if we
	 * are not running with an explicitly set thread credential.
	 */
	kauth_cred_uthread_update(uthread, p);

	uthread->uu_rval[0] = 0;
	/* Seed rval[1] with the incoming %edx so calls that only produce one
	 * return value leave userspace %edx unchanged. */
	uthread->uu_rval[1] = regs->edx;
	/* Block cancellation points for the duration of the syscall. */
	uthread->uu_flag |= UT_NOTCANCELPT;


#ifdef JOE_DEBUG
	uthread->uu_iocount = 0;
	uthread->uu_vpindex = 0;
#endif

	AUDIT_SYSCALL_ENTER(code, p, uthread);
	error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0]));
	AUDIT_SYSCALL_EXIT(code, p, uthread, error);

#ifdef JOE_DEBUG
	if (uthread->uu_iocount)
		printf("system call returned with uu_iocount != 0\n");
#endif
#if CONFIG_DTRACE
	uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */

	if (__improbable(error == ERESTART)) {
		/*
		 * Move the user's pc back to repeat the syscall:
		 * 5 bytes for a sysenter, or 2 for an int 8x.
		 * The SYSENTER_TF_CS covers single-stepping over a sysenter
		 * - see debug trap handler in idt.s/idt64.s
		 */

		pal_syscall_restart(thread, state);
	}
	else if (error != EJUSTRETURN) {
		/* EJUSTRETURN means the handler already set up the registers. */
		if (__improbable(error)) {
			regs->eax = error;
			regs->efl |= EFL_CF;	/* carry bit */
		} else { /* (not error) */
			regs->eax = uthread->uu_rval[0];
			regs->edx = uthread->uu_rval[1];
		}
	}

	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall: error=%d retval=(%u,%u)\n",
		error, regs->eax, regs->edx);

	uthread->uu_flag &= ~UT_NOTCANCELPT;
#if FUNNEL_DEBUG
	/*
	 * if we're holding the funnel panic
	 */
	syscall_exit_funnelcheck();
#endif /* FUNNEL_DEBUG */

	if (__improbable(uthread->uu_lowpri_window)) {
		/*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(1);
	}
	if (__probable(code != 180))
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	/* A successful non-vfork execve needs PAL-specific return fixup. */
	if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) {
		pal_execve_return(thread);
	}

	thread_exception_return();
	/* NOTREACHED */
}

/*
 * Function:	unix_syscall64
 *
 * Inputs:	state - pointer to x86_64 save area
 *
 * Outputs:	none
 *
 * 64-bit counterpart of unix_syscall.  The first six arguments arrive in
 * registers (%rdi..%r9, five for an indirect call); any remainder is copied
 * in from the user stack into v_arg6 and beyond so the handler sees a
 * contiguous argument block.  Results go back in %rax/%rdx, with the return
 * width chosen by sy_return_type.  Never returns to the caller.
 */
void
unix_syscall64(x86_saved_state_t *state)
{
	thread_t	thread;
	unsigned int	code;
	struct sysent	*callp;
	void		*uargp;
	int		args_in_regs;
	int		error;
	struct proc	*p;
	struct uthread	*uthread;
	x86_saved_state64_t	*regs;

	assert(is_saved_state64(state));
	regs = saved_state64(state);
#if DEBUG
	/* Debug-only escape hatch, 64-bit encoding of the same 0x800 trap. */
	if (regs->rax == 0x2000800)
		thread_exception_return();
#endif
	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	/* Get the approriate proc; may be different from task's for vfork() */
	if (__probable(!(uthread->uu_flag & UT_VFORK)))
		p = (struct proc *)get_bsdtask_info(current_task());
	else
		p = current_proc();

	/* Verify that we are not being called from a task without a proc */
	if (__improbable(p == NULL)) {
		regs->rax = EPERM;
		regs->isf.rflags |= EFL_CF;	/* CF set = syscall error */
		task_terminate_internal(current_task());
		thread_exception_return();
		/* NOTREACHED */
	}
	args_in_regs = 6;

	code = regs->rax & SYSCALL_NUMBER_MASK;
	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall64: code=%d(%s) rip=%llx\n",
		code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip);
	callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
	/* Arguments live in the saved register block starting at %rdi. */
	uargp = (void *)(&regs->rdi);

	if (__improbable(callp == sysent)) {
		/*
		 * indirect system call... system call number
		 * passed as 'arg0'
		 */
		code = regs->rdi;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
		/* Shift the argument window past the consumed number. */
		uargp = (void *)(&regs->rsi);
		args_in_regs = 5;
	}
	uthread->uu_ap = uargp;

	if (callp->sy_narg != 0) {
		/* 180: see the tracing-exclusion note in unix_syscall. */
		if (code != 180) {
			uint64_t *ip = (uint64_t *)uargp;

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
				(int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0);
		}
		assert(callp->sy_narg <= 8);

		if (__improbable(callp->sy_narg > args_in_regs)) {
			int copyin_count;

			/* Overflow arguments sit on the user stack just above
			 * the return address. */
			copyin_count = (callp->sy_narg - args_in_regs) * sizeof(uint64_t);

			error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&regs->v_arg6, copyin_count);
			if (error) {
				regs->rax = error;
				regs->isf.rflags |= EFL_CF;
				thread_exception_return();
				/* NOTREACHED */
			}
		}
	} else
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
			0, 0, 0, 0, 0);

	/*
	 * Delayed binding of thread credential to process credential, if we
	 * are not running with an explicitly set thread credential.
	 */
	kauth_cred_uthread_update(uthread, p);

	uthread->uu_rval[0] = 0;
	uthread->uu_rval[1] = 0;


	/* Block cancellation points for the duration of the syscall. */
	uthread->uu_flag |= UT_NOTCANCELPT;

#ifdef JOE_DEBUG
	uthread->uu_iocount = 0;
	uthread->uu_vpindex = 0;
#endif

	AUDIT_SYSCALL_ENTER(code, p, uthread);
	error = (*(callp->sy_call))((void *) p, uargp, &(uthread->uu_rval[0]));
	AUDIT_SYSCALL_EXIT(code, p, uthread, error);

#ifdef JOE_DEBUG
	if (uthread->uu_iocount)
		printf("system call returned with uu_iocount != 0\n");
#endif

#if CONFIG_DTRACE
	uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */

	if (__improbable(error == ERESTART)) {
		/*
		 * all system calls come through via the syscall instruction
		 * in 64 bit mode... its 2 bytes in length
		 * move the user's pc back to repeat the syscall:
		 */
		pal_syscall_restart( thread, state );
	}
	else if (error != EJUSTRETURN) {
		if (__improbable(error)) {
			regs->rax = error;
			regs->isf.rflags |= EFL_CF;	/* carry bit */
		} else { /* (not error) */

			/* Widen/truncate the 2x32-bit uu_rval pair according to
			 * the declared return type of this syscall. */
			switch (callp->sy_return_type) {
			case _SYSCALL_RET_INT_T:
				regs->rax = uthread->uu_rval[0];
				regs->rdx = uthread->uu_rval[1];
				break;
			case _SYSCALL_RET_UINT_T:
				regs->rax = ((u_int)uthread->uu_rval[0]);
				regs->rdx = ((u_int)uthread->uu_rval[1]);
				break;
			case _SYSCALL_RET_OFF_T:
			case _SYSCALL_RET_ADDR_T:
			case _SYSCALL_RET_SIZE_T:
			case _SYSCALL_RET_SSIZE_T:
			case _SYSCALL_RET_UINT64_T:
				/* 64-bit result stored across both rval slots. */
				regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
				regs->rdx = 0;
				break;
			case _SYSCALL_RET_NONE:
				break;
			default:
				panic("unix_syscall: unknown return type");
				break;
			}
			regs->isf.rflags &= ~EFL_CF;	/* success: clear carry */
		}
	}

	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall64: error=%d retval=(%llu,%llu)\n",
		error, regs->rax, regs->rdx);

	uthread->uu_flag &= ~UT_NOTCANCELPT;

#if FUNNEL_DEBUG
	/*
	 * if we're holding the funnel panic
	 */
	syscall_exit_funnelcheck();
#endif /* FUNNEL_DEBUG */

	if (__improbable(uthread->uu_lowpri_window)) {
		/*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(1);
	}
	if (__probable(code != 180))
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	thread_exception_return();
	/* NOTREACHED */
}

/*
 * Function:	unix_syscall_return
 *
 * Inputs:	error - errno-style result of the syscall being completed
 *
 * Outputs:	none
 *
 * Completion path for syscalls that returned to the kernel asynchronously
 * (e.g. after a continuation) rather than falling out of unix_syscall[64].
 * Re-derives the syscall number from the saved user registers for
 * audit/tracing, then performs the same result/errno register fixup as the
 * direct paths.  Never returns to the caller.
 */
void
unix_syscall_return(int error)
{
	thread_t	thread;
	struct uthread	*uthread;
	struct proc	*p;
	unsigned int	code;
	vm_offset_t	params;
	struct sysent	*callp;

	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	pal_register_cache_state(thread, DIRTY);

	p = current_proc();

	if (proc_is64bit(p)) {
		x86_saved_state64_t *regs;

		regs = saved_state64(find_user_regs(thread));

		/* reconstruct code for tracing before blasting rax */
		code = regs->rax & SYSCALL_NUMBER_MASK;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

		if (callp == sysent)
			/*
			 * indirect system call... system call number
			 * passed as 'arg0'
			 */
			code = regs->rdi;

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall)
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (error == ERESTART) {
			/*
			 * repeat the syscall
			 */
			pal_syscall_restart( thread, find_user_regs(thread) );
		}
		else if (error != EJUSTRETURN) {
			if (error) {
				regs->rax = error;
				regs->isf.rflags |= EFL_CF;	/* carry bit */
			} else { /* (not error) */

				/* Same return-type widening as unix_syscall64. */
				switch (callp->sy_return_type) {
				case _SYSCALL_RET_INT_T:
					regs->rax = uthread->uu_rval[0];
					regs->rdx = uthread->uu_rval[1];
					break;
				case _SYSCALL_RET_UINT_T:
					regs->rax = ((u_int)uthread->uu_rval[0]);
					regs->rdx = ((u_int)uthread->uu_rval[1]);
					break;
				case _SYSCALL_RET_OFF_T:
				case _SYSCALL_RET_ADDR_T:
				case _SYSCALL_RET_SIZE_T:
				case _SYSCALL_RET_SSIZE_T:
				case _SYSCALL_RET_UINT64_T:
					regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
					regs->rdx = 0;
					break;
				case _SYSCALL_RET_NONE:
					break;
				default:
					panic("unix_syscall: unknown return type");
					break;
				}
				regs->isf.rflags &= ~EFL_CF;	/* success: clear carry */
			}
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%llu,%llu)\n",
			error, regs->rax, regs->rdx);
	} else {
		x86_saved_state32_t	*regs;

		regs = saved_state32(find_user_regs(thread));

		regs->efl &= ~(EFL_CF);
		/* reconstruct code for tracing before blasting eax */
		code = regs->eax & I386_SYSCALL_NUMBER_MASK;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall)
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (callp == sysent) {
			/* Indirect call: real number is the first user-stack arg. */
			params = (vm_offset_t) (regs->uesp + sizeof (int));
			code = fuword(params);
		}
		if (error == ERESTART) {
			pal_syscall_restart( thread, find_user_regs(thread) );
		}
		else if (error != EJUSTRETURN) {
			if (error) {
				regs->eax = error;
				regs->efl |= EFL_CF;	/* carry bit */
			} else { /* (not error) */
				regs->eax = uthread->uu_rval[0];
				regs->edx = uthread->uu_rval[1];
			}
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%u,%u)\n",
			error, regs->eax, regs->edx);
	}


	uthread->uu_flag &= ~UT_NOTCANCELPT;

#if FUNNEL_DEBUG
	/*
	 * if we're holding the funnel panic
	 */
	syscall_exit_funnelcheck();
#endif /* FUNNEL_DEBUG */

	if (uthread->uu_lowpri_window) {
		/*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(1);
	}
	if (code != 180)
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	thread_exception_return();
	/* NOTREACHED */
}