1/* 2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28#include <kern/task.h> 29#include <kern/thread.h> 30#include <kern/assert.h> 31#include <kern/clock.h> 32#include <kern/locks.h> 33#include <kern/sched_prim.h> 34#include <kern/debug.h> 35#include <mach/machine/thread_status.h> 36#include <mach/thread_act.h> 37#include <mach/branch_predicates.h> 38 39#include <sys/kernel.h> 40#include <sys/vm.h> 41#include <sys/proc_internal.h> 42#include <sys/syscall.h> 43#include <sys/systm.h> 44#include <sys/user.h> 45#include <sys/errno.h> 46#include <sys/kdebug.h> 47#include <sys/sysent.h> 48#include <sys/sysproto.h> 49#include <sys/kauth.h> 50#include <sys/systm.h> 51 52#include <security/audit/audit.h> 53 54#include <i386/seg.h> 55#include <i386/machine_routines.h> 56#include <mach/i386/syscall_sw.h> 57 58#include <machine/pal_routines.h> 59 60#if CONFIG_DTRACE 61extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *); 62extern void dtrace_systrace_syscall_return(unsigned short, int, int *); 63#endif 64 65extern void unix_syscall(x86_saved_state_t *); 66extern void unix_syscall64(x86_saved_state_t *); 67extern void *find_user_regs(thread_t); 68 69extern void x86_toggle_sysenter_arg_store(thread_t thread, boolean_t valid); 70extern boolean_t x86_sysenter_arg_store_isvalid(thread_t thread); 71 72/* dynamically generated at build time based on syscalls.master */ 73extern const char *syscallnames[]; 74 75/* 76 * This needs to be a single switch so that it's "all on" or "all off", 77 * rather than being turned on for some code paths and not others, as this 78 * has a tendency to introduce "blame the next guy" bugs. 79 */ 80#if DEBUG 81#define FUNNEL_DEBUG 1 /* Check for funnel held on exit */ 82#endif 83 84/* 85 * Function: unix_syscall 86 * 87 * Inputs: regs - pointer to i386 save area 88 * 89 * Outputs: none 90 */ 91void 92unix_syscall(x86_saved_state_t *state) 93{ 94 thread_t thread; 95 void *vt; 96 unsigned int code; 97 struct sysent *callp; 98 99 int error =; 100 vm_offset_t params; 101 struct proc *p; 102 struct uthread *uthread; 103 x86_saved_state32_t *regs; 104 boolean_t args_in_uthread; 105 boolean_t is_vfork; 106 107 assert(is_saved_state32(state)); 108 regs = saved_state32(state); 109#if DEBUG 110 if (regs->eax == 0x800) 111 thread_exception_return(); 112#endif 113 thread = current_thread(); 114 uthread = get_bsdthread_info(thread); 115 116 /* Get the approriate proc; may be different from task's for vfork() */ 117 is_vfork = uthread->uu_flag & UT_VFORK; 118 if (__improbable(is_vfork != 0)) 119 p = current_proc(); 120 else 121 p = (struct proc *)get_bsdtask_info(current_task()); 122 123 /* Verify that we are not being called from a task without a proc */ 124 if (__improbable(p == NULL)) { 125 regs->eax = EPERM; 126 regs->efl |= EFL_CF; 127 task_terminate_internal(current_task()); 128 thread_exception_return(); 129 /* NOTREACHED */ 130 } 131 132 code = regs->eax & I386_SYSCALL_NUMBER_MASK; 133 DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n", 134 code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip); 135 args_in_uthread = ((regs->eax & I386_SYSCALL_ARG_BYTES_MASK) != 0) && x86_sysenter_arg_store_isvalid(thread); 136 params = (vm_offset_t) (regs->uesp + sizeof (int)); 137 138 regs->efl &= ~(EFL_CF); 139 140 callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; 141 142 if (__improbable(callp == sysent)) { 143 code = fuword(params); 144 params += sizeof(int); 145 callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; 146 } 147 148 vt = (void *)uthread->uu_arg; 149 150 if (callp->sy_arg_bytes != 0) { 151 sy_munge_t *mungerp; 152 153 assert((unsigned) callp->sy_arg_bytes <= sizeof (uthread->uu_arg)); 154 if (!args_in_uthread) 155 { 156 uint32_t nargs; 157 nargs = callp->sy_arg_bytes; 158 error = copyin((user_addr_t) params, (char *) vt, nargs); 159 if (error) { 160 regs->eax = error; 161 regs->efl |= EFL_CF; 162 thread_exception_return(); 163 /* NOTREACHED */ 164 } 165 } 166 167 if (__probable(code != 180)) { 168 int *ip = (int *)vt; 169 170 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 171 BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, 172 *ip, *(ip+1), *(ip+2), *(ip+3), 0); 173 } 174 mungerp = callp->sy_arg_munge32; 175 176 /* 177 * If non-NULL, then call the syscall argument munger to 178 * copy in arguments (see xnu/bsd/dev/{i386|x86_64}/munge.s); the 179 * first argument is NULL because we are munging in place 180 * after a copyin because the ABI currently doesn't use 181 * registers to pass system call arguments. 182 */ 183 if (mungerp != NULL) 184 (*mungerp)(NULL, vt); 185 } else 186 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 187 BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, 188 0, 0, 0, 0, 0); 189 190 /* 191 * Delayed binding of thread credential to process credential, if we 192 * are not running with an explicitly set thread credential. 193 */ 194 kauth_cred_uthread_update(uthread, p); 195 196 uthread->uu_rval[0] = 0; 197 uthread->uu_rval[1] = regs->edx; 198 uthread->uu_flag |= UT_NOTCANCELPT; 199 200 201#ifdef JOE_DEBUG 202 uthread->uu_iocount = 0; 203 uthread->uu_vpindex = 0; 204#endif 205 206 AUDIT_SYSCALL_ENTER(code, p, uthread); 207 error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0])); 208 AUDIT_SYSCALL_EXIT(code, p, uthread, error); 209 210#ifdef JOE_DEBUG 211 if (uthread->uu_iocount) 212 printf("system call returned with uu_iocount != 0\n"); 213#endif 214#if CONFIG_DTRACE 215 uthread->t_dtrace_errno = error; 216#endif /* CONFIG_DTRACE */ 217 218 if (__improbable(error == ERESTART)) { 219 /* 220 * Move the user's pc back to repeat the syscall: 221 * 5 bytes for a sysenter, or 2 for an int 8x. 222 * The SYSENTER_TF_CS covers single-stepping over a sysenter 223 * - see debug trap handler in idt.s/idt64.s 224 */ 225 226 pal_syscall_restart(thread, state); 227 } 228 else if (error != EJUSTRETURN) { 229 if (__improbable(error)) { 230 regs->eax = error; 231 regs->efl |= EFL_CF; /* carry bit */ 232 } else { /* (not error) */ 233 regs->eax = uthread->uu_rval[0]; 234 regs->edx = uthread->uu_rval[1]; 235 } 236 } 237 238 DEBUG_KPRINT_SYSCALL_UNIX( 239 "unix_syscall: error=%d retval=(%u,%u)\n", 240 error, regs->eax, regs->edx); 241 242 uthread->uu_flag &= ~UT_NOTCANCELPT; 243#if FUNNEL_DEBUG 244 /* 245 * if we're holding the funnel panic 246 */ 247 syscall_exit_funnelcheck(); 248#endif /* FUNNEL_DEBUG */ 249 250 if (__improbable(uthread->uu_lowpri_window)) { 251 /* 252 * task is marked as a low priority I/O type 253 * and the I/O we issued while in this system call 254 * collided with normal I/O operations... we'll 255 * delay in order to mitigate the impact of this 256 * task on the normal operation of the system 257 */ 258 throttle_lowpri_io(TRUE); 259 } 260 if (__probable(code != 180)) 261 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 262 BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, 263 error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); 264 265 if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) { 266 pal_execve_return(thread); 267 } 268 269 thread_exception_return(); 270 /* NOTREACHED */ 271} 272 273 274void 275unix_syscall64(x86_saved_state_t *state) 276{ 277 thread_t thread; 278 unsigned int code; 279 struct sysent *callp; 280 void *uargp; 281 int args_in_regs; 282 int error; 283 struct proc *p; 284 struct uthread *uthread; 285 x86_saved_state64_t *regs; 286 287 assert(is_saved_state64(state)); 288 regs = saved_state64(state); 289#if DEBUG 290 if (regs->rax == 0x2000800) 291 thread_exception_return(); 292#endif 293 thread = current_thread(); 294 uthread = get_bsdthread_info(thread); 295 296 /* Get the approriate proc; may be different from task's for vfork() */ 297 if (__probable(!(uthread->uu_flag & UT_VFORK))) 298 p = (struct proc *)get_bsdtask_info(current_task()); 299 else 300 p = current_proc(); 301 302 /* Verify that we are not being called from a task without a proc */ 303 if (__improbable(p == NULL)) { 304 regs->rax = EPERM; 305 regs->isf.rflags |= EFL_CF; 306 task_terminate_internal(current_task()); 307 thread_exception_return(); 308 /* NOTREACHED */ 309 } 310 args_in_regs = 6; 311 312 code = regs->rax & SYSCALL_NUMBER_MASK; 313 DEBUG_KPRINT_SYSCALL_UNIX( 314 "unix_syscall64: code=%d(%s) rip=%llx\n", 315 code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip); 316 callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; 317 uargp = (void *)(®s->rdi); 318 319 if (__improbable(callp == sysent)) { 320 /* 321 * indirect system call... system call number 322 * passed as 'arg0' 323 */ 324 code = regs->rdi; 325 callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; 326 uargp = (void *)(®s->rsi); 327 args_in_regs = 5; 328 } 329 330 if (callp->sy_narg != 0) { 331 if (code != 180) { 332 uint64_t *ip = (uint64_t *)uargp; 333 334 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 335 BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, 336 (int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0); 337 } 338 assert(callp->sy_narg <= 8); 339 340 if (__improbable(callp->sy_narg > args_in_regs)) { 341 int copyin_count; 342 343 copyin_count = (callp->sy_narg - args_in_regs) * sizeof(uint64_t); 344 345 error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)®s->v_arg6, copyin_count); 346 if (error) { 347 regs->rax = error; 348 regs->isf.rflags |= EFL_CF; 349 thread_exception_return(); 350 /* NOTREACHED */ 351 } 352 } 353 /* 354 * XXX Turn 64 bit unsafe calls into nosys() 355 */ 356 if (__improbable(callp->sy_flags & UNSAFE_64BIT)) { 357 callp = &sysent[63]; 358 goto unsafe; 359 } 360 } else 361 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 362 BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, 363 0, 0, 0, 0, 0); 364unsafe: 365 366 /* 367 * Delayed binding of thread credential to process credential, if we 368 * are not running with an explicitly set thread credential. 369 */ 370 kauth_cred_uthread_update(uthread, p); 371 372 uthread->uu_rval[0] = 0; 373 uthread->uu_rval[1] = 0; 374 375 376 uthread->uu_flag |= UT_NOTCANCELPT; 377 378#ifdef JOE_DEBUG 379 uthread->uu_iocount = 0; 380 uthread->uu_vpindex = 0; 381#endif 382 383 AUDIT_SYSCALL_ENTER(code, p, uthread); 384 error = (*(callp->sy_call))((void *) p, uargp, &(uthread->uu_rval[0])); 385 AUDIT_SYSCALL_EXIT(code, p, uthread, error); 386 387#ifdef JOE_DEBUG 388 if (uthread->uu_iocount) 389 printf("system call returned with uu_iocount != 0\n"); 390#endif 391 392#if CONFIG_DTRACE 393 uthread->t_dtrace_errno = error; 394#endif /* CONFIG_DTRACE */ 395 396 if (__improbable(error == ERESTART)) { 397 /* 398 * all system calls come through via the syscall instruction 399 * in 64 bit mode... its 2 bytes in length 400 * move the user's pc back to repeat the syscall: 401 */ 402 pal_syscall_restart( thread, state ); 403 } 404 else if (error != EJUSTRETURN) { 405 if (__improbable(error)) { 406 regs->rax = error; 407 regs->isf.rflags |= EFL_CF; /* carry bit */ 408 } else { /* (not error) */ 409 410 switch (callp->sy_return_type) { 411 case _SYSCALL_RET_INT_T: 412 regs->rax = uthread->uu_rval[0]; 413 regs->rdx = uthread->uu_rval[1]; 414 break; 415 case _SYSCALL_RET_UINT_T: 416 regs->rax = ((u_int)uthread->uu_rval[0]); 417 regs->rdx = ((u_int)uthread->uu_rval[1]); 418 break; 419 case _SYSCALL_RET_OFF_T: 420 case _SYSCALL_RET_ADDR_T: 421 case _SYSCALL_RET_SIZE_T: 422 case _SYSCALL_RET_SSIZE_T: 423 case _SYSCALL_RET_UINT64_T: 424 regs->rax = *((uint64_t *)(&uthread->uu_rval[0])); 425 regs->rdx = 0; 426 break; 427 case _SYSCALL_RET_NONE: 428 break; 429 default: 430 panic("unix_syscall: unknown return type"); 431 break; 432 } 433 regs->isf.rflags &= ~EFL_CF; 434 } 435 } 436 437 DEBUG_KPRINT_SYSCALL_UNIX( 438 "unix_syscall64: error=%d retval=(%llu,%llu)\n", 439 error, regs->rax, regs->rdx); 440 441 uthread->uu_flag &= ~UT_NOTCANCELPT; 442 443#if FUNNEL_DEBUG 444 /* 445 * if we're holding the funnel panic 446 */ 447 syscall_exit_funnelcheck(); 448#endif /* FUNNEL_DEBUG */ 449 450 if (__improbable(uthread->uu_lowpri_window)) { 451 /* 452 * task is marked as a low priority I/O type 453 * and the I/O we issued while in this system call 454 * collided with normal I/O operations... we'll 455 * delay in order to mitigate the impact of this 456 * task on the normal operation of the system 457 */ 458 throttle_lowpri_io(TRUE); 459 } 460 if (__probable(code != 180)) 461 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 462 BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, 463 error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); 464 465 thread_exception_return(); 466 /* NOTREACHED */ 467} 468 469 470void 471unix_syscall_return(int error) 472{ 473 thread_t thread; 474 struct uthread *uthread; 475 struct proc *p; 476 unsigned int code; 477 vm_offset_t params; 478 struct sysent *callp; 479 480 thread = current_thread(); 481 uthread = get_bsdthread_info(thread); 482 483 pal_register_cache_state(thread, DIRTY); 484 485 p = current_proc(); 486 487 if (proc_is64bit(p)) { 488 x86_saved_state64_t *regs; 489 490 regs = saved_state64(find_user_regs(thread)); 491 492 /* reconstruct code for tracing before blasting rax */ 493 code = regs->rax & SYSCALL_NUMBER_MASK; 494 callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; 495 496 if (callp == sysent) 497 /* 498 * indirect system call... system call number 499 * passed as 'arg0' 500 */ 501 code = regs->rdi; 502 503#if CONFIG_DTRACE 504 if (callp->sy_call == dtrace_systrace_syscall) 505 dtrace_systrace_syscall_return( code, error, uthread->uu_rval ); 506#endif /* CONFIG_DTRACE */ 507 AUDIT_SYSCALL_EXIT(code, p, uthread, error); 508 509 if (error == ERESTART) { 510 /* 511 * repeat the syscall 512 */ 513 pal_syscall_restart( thread, find_user_regs(thread) ); 514 } 515 else if (error != EJUSTRETURN) { 516 if (error) { 517 regs->rax = error; 518 regs->isf.rflags |= EFL_CF; /* carry bit */ 519 } else { /* (not error) */ 520 521 switch (callp->sy_return_type) { 522 case _SYSCALL_RET_INT_T: 523 regs->rax = uthread->uu_rval[0]; 524 regs->rdx = uthread->uu_rval[1]; 525 break; 526 case _SYSCALL_RET_UINT_T: 527 regs->rax = ((u_int)uthread->uu_rval[0]); 528 regs->rdx = ((u_int)uthread->uu_rval[1]); 529 break; 530 case _SYSCALL_RET_OFF_T: 531 case _SYSCALL_RET_ADDR_T: 532 case _SYSCALL_RET_SIZE_T: 533 case _SYSCALL_RET_SSIZE_T: 534 case _SYSCALL_RET_UINT64_T: 535 regs->rax = *((uint64_t *)(&uthread->uu_rval[0])); 536 regs->rdx = 0; 537 break; 538 case _SYSCALL_RET_NONE: 539 break; 540 default: 541 panic("unix_syscall: unknown return type"); 542 break; 543 } 544 regs->isf.rflags &= ~EFL_CF; 545 } 546 } 547 DEBUG_KPRINT_SYSCALL_UNIX( 548 "unix_syscall_return: error=%d retval=(%llu,%llu)\n", 549 error, regs->rax, regs->rdx); 550 } else { 551 x86_saved_state32_t *regs; 552 553 regs = saved_state32(find_user_regs(thread)); 554 555 regs->efl &= ~(EFL_CF); 556 /* reconstruct code for tracing before blasting eax */ 557 code = regs->eax & I386_SYSCALL_NUMBER_MASK; 558 callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; 559 560#if CONFIG_DTRACE 561 if (callp->sy_call == dtrace_systrace_syscall) 562 dtrace_systrace_syscall_return( code, error, uthread->uu_rval ); 563#endif /* CONFIG_DTRACE */ 564 AUDIT_SYSCALL_EXIT(code, p, uthread, error); 565 566 if (callp == sysent) { 567 params = (vm_offset_t) (regs->uesp + sizeof (int)); 568 code = fuword(params); 569 } 570 if (error == ERESTART) { 571 pal_syscall_restart( thread, find_user_regs(thread) ); 572 } 573 else if (error != EJUSTRETURN) { 574 if (error) { 575 regs->eax = error; 576 regs->efl |= EFL_CF; /* carry bit */ 577 } else { /* (not error) */ 578 regs->eax = uthread->uu_rval[0]; 579 regs->edx = uthread->uu_rval[1]; 580 } 581 } 582 DEBUG_KPRINT_SYSCALL_UNIX( 583 "unix_syscall_return: error=%d retval=(%u,%u)\n", 584 error, regs->eax, regs->edx); 585 } 586 587 588 uthread->uu_flag &= ~UT_NOTCANCELPT; 589 590#if FUNNEL_DEBUG 591 /* 592 * if we're holding the funnel panic 593 */ 594 syscall_exit_funnelcheck(); 595#endif /* FUNNEL_DEBUG */ 596 597 if (uthread->uu_lowpri_window) { 598 /* 599 * task is marked as a low priority I/O type 600 * and the I/O we issued while in this system call 601 * collided with normal I/O operations... we'll 602 * delay in order to mitigate the impact of this 603 * task on the normal operation of the system 604 */ 605 throttle_lowpri_io(TRUE); 606 } 607 if (code != 180) 608 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 609 BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, 610 error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); 611 612 thread_exception_return(); 613 /* NOTREACHED */ 614} 615 616void 617munge_wwwlww( 618 __unused const void *in32, 619 void *out64) 620{ 621 uint32_t *arg32; 622 uint64_t *arg64; 623 624 /* we convert in place in out64 */ 625 arg32 = (uint32_t *) out64; 626 arg64 = (uint64_t *) out64; 627 628 arg64[5] = arg32[6]; /* wwwlwW */ 629 arg64[4] = arg32[5]; /* wwwlWw */ 630 arg32[7] = arg32[4]; /* wwwLww (hi) */ 631 arg32[6] = arg32[3]; /* wwwLww (lo) */ 632 arg64[2] = arg32[2]; /* wwWlww */ 633 arg64[1] = arg32[1]; /* wWwlww */ 634 arg64[0] = arg32[0]; /* Wwwlww */ 635} 636 637 638void 639munge_wwlwww( 640 __unused const void *in32, 641 void *out64) 642{ 643 uint32_t *arg32; 644 uint64_t *arg64; 645 646 /* we convert in place in out64 */ 647 arg32 = (uint32_t *) out64; 648 arg64 = (uint64_t *) out64; 649 650 arg64[5] = arg32[6]; /* wwlwwW */ 651 arg64[4] = arg32[5]; /* wwlwWw */ 652 arg64[3] = arg32[4]; /* wwlWww */ 653 arg32[5] = arg32[3]; /* wwLwww (hi) */ 654 arg32[4] = arg32[2]; /* wwLwww (lo) */ 655 arg64[1] = arg32[1]; /* wWlwww */ 656 arg64[0] = arg32[0]; /* Wwlwww */ 657} 658 659