/* linux32_machdep.c revision 218059 */
1/*- 2 * Copyright (c) 2004 Tim J. Robbins 3 * Copyright (c) 2002 Doug Rabson 4 * Copyright (c) 2000 Marcel Moolenaar 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer 12 * in this position and unchanged. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. The name of the author may not be used to endorse or promote products 17 * derived from this software without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 */ 30 31#include <sys/cdefs.h> 32__FBSDID("$FreeBSD: head/sys/amd64/linux32/linux32_machdep.c 218059 2011-01-29 07:22:33Z dchagin $"); 33 34#include <sys/param.h> 35#include <sys/kernel.h> 36#include <sys/systm.h> 37#include <sys/file.h> 38#include <sys/fcntl.h> 39#include <sys/clock.h> 40#include <sys/imgact.h> 41#include <sys/limits.h> 42#include <sys/lock.h> 43#include <sys/malloc.h> 44#include <sys/mman.h> 45#include <sys/mutex.h> 46#include <sys/priv.h> 47#include <sys/proc.h> 48#include <sys/resource.h> 49#include <sys/resourcevar.h> 50#include <sys/sched.h> 51#include <sys/syscallsubr.h> 52#include <sys/sysproto.h> 53#include <sys/unistd.h> 54#include <sys/wait.h> 55 56#include <machine/frame.h> 57#include <machine/pcb.h> 58#include <machine/psl.h> 59#include <machine/segments.h> 60#include <machine/specialreg.h> 61 62#include <vm/vm.h> 63#include <vm/pmap.h> 64#include <vm/vm_map.h> 65 66#include <compat/freebsd32/freebsd32_util.h> 67#include <amd64/linux32/linux.h> 68#include <amd64/linux32/linux32_proto.h> 69#include <compat/linux/linux_ipc.h> 70#include <compat/linux/linux_misc.h> 71#include <compat/linux/linux_signal.h> 72#include <compat/linux/linux_util.h> 73#include <compat/linux/linux_emul.h> 74 75struct l_old_select_argv { 76 l_int nfds; 77 l_uintptr_t readfds; 78 l_uintptr_t writefds; 79 l_uintptr_t exceptfds; 80 l_uintptr_t timeout; 81} __packed; 82 83int 84linux_to_bsd_sigaltstack(int lsa) 85{ 86 int bsa = 0; 87 88 if (lsa & LINUX_SS_DISABLE) 89 bsa |= SS_DISABLE; 90 if (lsa & LINUX_SS_ONSTACK) 91 bsa |= SS_ONSTACK; 92 return (bsa); 93} 94 95static int linux_mmap_common(struct thread *td, l_uintptr_t addr, 96 l_size_t len, l_int prot, l_int flags, l_int fd, 97 l_loff_t pos); 98 99int 100bsd_to_linux_sigaltstack(int bsa) 101{ 102 int lsa = 0; 103 104 if (bsa & SS_DISABLE) 105 lsa |= LINUX_SS_DISABLE; 106 if (bsa & SS_ONSTACK) 107 lsa |= LINUX_SS_ONSTACK; 108 return (lsa); 109} 110 111static void 112bsd_to_linux_rusage(struct rusage *ru, struct 
l_rusage *lru) 113{ 114 115 lru->ru_utime.tv_sec = ru->ru_utime.tv_sec; 116 lru->ru_utime.tv_usec = ru->ru_utime.tv_usec; 117 lru->ru_stime.tv_sec = ru->ru_stime.tv_sec; 118 lru->ru_stime.tv_usec = ru->ru_stime.tv_usec; 119 lru->ru_maxrss = ru->ru_maxrss; 120 lru->ru_ixrss = ru->ru_ixrss; 121 lru->ru_idrss = ru->ru_idrss; 122 lru->ru_isrss = ru->ru_isrss; 123 lru->ru_minflt = ru->ru_minflt; 124 lru->ru_majflt = ru->ru_majflt; 125 lru->ru_nswap = ru->ru_nswap; 126 lru->ru_inblock = ru->ru_inblock; 127 lru->ru_oublock = ru->ru_oublock; 128 lru->ru_msgsnd = ru->ru_msgsnd; 129 lru->ru_msgrcv = ru->ru_msgrcv; 130 lru->ru_nsignals = ru->ru_nsignals; 131 lru->ru_nvcsw = ru->ru_nvcsw; 132 lru->ru_nivcsw = ru->ru_nivcsw; 133} 134 135int 136linux_execve(struct thread *td, struct linux_execve_args *args) 137{ 138 struct image_args eargs; 139 char *path; 140 int error; 141 142 LCONVPATHEXIST(td, args->path, &path); 143 144#ifdef DEBUG 145 if (ldebug(execve)) 146 printf(ARGS(execve, "%s"), path); 147#endif 148 149 error = freebsd32_exec_copyin_args(&eargs, path, UIO_SYSSPACE, 150 args->argp, args->envp); 151 free(path, M_TEMP); 152 if (error == 0) 153 error = kern_execve(td, &eargs, NULL); 154 if (error == 0) 155 /* Linux process can execute FreeBSD one, do not attempt 156 * to create emuldata for such process using 157 * linux_proc_init, this leads to a panic on KASSERT 158 * because such process has p->p_emuldata == NULL. 
159 */ 160 if (SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX) 161 error = linux_proc_init(td, 0, 0); 162 return (error); 163} 164 165CTASSERT(sizeof(struct l_iovec32) == 8); 166 167static int 168linux32_copyinuio(struct l_iovec32 *iovp, l_ulong iovcnt, struct uio **uiop) 169{ 170 struct l_iovec32 iov32; 171 struct iovec *iov; 172 struct uio *uio; 173 uint32_t iovlen; 174 int error, i; 175 176 *uiop = NULL; 177 if (iovcnt > UIO_MAXIOV) 178 return (EINVAL); 179 iovlen = iovcnt * sizeof(struct iovec); 180 uio = malloc(iovlen + sizeof(*uio), M_IOV, M_WAITOK); 181 iov = (struct iovec *)(uio + 1); 182 for (i = 0; i < iovcnt; i++) { 183 error = copyin(&iovp[i], &iov32, sizeof(struct l_iovec32)); 184 if (error) { 185 free(uio, M_IOV); 186 return (error); 187 } 188 iov[i].iov_base = PTRIN(iov32.iov_base); 189 iov[i].iov_len = iov32.iov_len; 190 } 191 uio->uio_iov = iov; 192 uio->uio_iovcnt = iovcnt; 193 uio->uio_segflg = UIO_USERSPACE; 194 uio->uio_offset = -1; 195 uio->uio_resid = 0; 196 for (i = 0; i < iovcnt; i++) { 197 if (iov->iov_len > INT_MAX - uio->uio_resid) { 198 free(uio, M_IOV); 199 return (EINVAL); 200 } 201 uio->uio_resid += iov->iov_len; 202 iov++; 203 } 204 *uiop = uio; 205 return (0); 206} 207 208int 209linux32_copyiniov(struct l_iovec32 *iovp32, l_ulong iovcnt, struct iovec **iovp, 210 int error) 211{ 212 struct l_iovec32 iov32; 213 struct iovec *iov; 214 uint32_t iovlen; 215 int i; 216 217 *iovp = NULL; 218 if (iovcnt > UIO_MAXIOV) 219 return (error); 220 iovlen = iovcnt * sizeof(struct iovec); 221 iov = malloc(iovlen, M_IOV, M_WAITOK); 222 for (i = 0; i < iovcnt; i++) { 223 error = copyin(&iovp32[i], &iov32, sizeof(struct l_iovec32)); 224 if (error) { 225 free(iov, M_IOV); 226 return (error); 227 } 228 iov[i].iov_base = PTRIN(iov32.iov_base); 229 iov[i].iov_len = iov32.iov_len; 230 } 231 *iovp = iov; 232 return(0); 233 234} 235 236int 237linux_readv(struct thread *td, struct linux_readv_args *uap) 238{ 239 struct uio *auio; 240 int error; 241 242 error = 
linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); 243 if (error) 244 return (error); 245 error = kern_readv(td, uap->fd, auio); 246 free(auio, M_IOV); 247 return (error); 248} 249 250int 251linux_writev(struct thread *td, struct linux_writev_args *uap) 252{ 253 struct uio *auio; 254 int error; 255 256 error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); 257 if (error) 258 return (error); 259 error = kern_writev(td, uap->fd, auio); 260 free(auio, M_IOV); 261 return (error); 262} 263 264struct l_ipc_kludge { 265 l_uintptr_t msgp; 266 l_long msgtyp; 267} __packed; 268 269int 270linux_ipc(struct thread *td, struct linux_ipc_args *args) 271{ 272 273 switch (args->what & 0xFFFF) { 274 case LINUX_SEMOP: { 275 struct linux_semop_args a; 276 277 a.semid = args->arg1; 278 a.tsops = args->ptr; 279 a.nsops = args->arg2; 280 return (linux_semop(td, &a)); 281 } 282 case LINUX_SEMGET: { 283 struct linux_semget_args a; 284 285 a.key = args->arg1; 286 a.nsems = args->arg2; 287 a.semflg = args->arg3; 288 return (linux_semget(td, &a)); 289 } 290 case LINUX_SEMCTL: { 291 struct linux_semctl_args a; 292 int error; 293 294 a.semid = args->arg1; 295 a.semnum = args->arg2; 296 a.cmd = args->arg3; 297 error = copyin(args->ptr, &a.arg, sizeof(a.arg)); 298 if (error) 299 return (error); 300 return (linux_semctl(td, &a)); 301 } 302 case LINUX_MSGSND: { 303 struct linux_msgsnd_args a; 304 305 a.msqid = args->arg1; 306 a.msgp = args->ptr; 307 a.msgsz = args->arg2; 308 a.msgflg = args->arg3; 309 return (linux_msgsnd(td, &a)); 310 } 311 case LINUX_MSGRCV: { 312 struct linux_msgrcv_args a; 313 314 a.msqid = args->arg1; 315 a.msgsz = args->arg2; 316 a.msgflg = args->arg3; 317 if ((args->what >> 16) == 0) { 318 struct l_ipc_kludge tmp; 319 int error; 320 321 if (args->ptr == 0) 322 return (EINVAL); 323 error = copyin(args->ptr, &tmp, sizeof(tmp)); 324 if (error) 325 return (error); 326 a.msgp = PTRIN(tmp.msgp); 327 a.msgtyp = tmp.msgtyp; 328 } else { 329 a.msgp = args->ptr; 330 a.msgtyp = 
args->arg5; 331 } 332 return (linux_msgrcv(td, &a)); 333 } 334 case LINUX_MSGGET: { 335 struct linux_msgget_args a; 336 337 a.key = args->arg1; 338 a.msgflg = args->arg2; 339 return (linux_msgget(td, &a)); 340 } 341 case LINUX_MSGCTL: { 342 struct linux_msgctl_args a; 343 344 a.msqid = args->arg1; 345 a.cmd = args->arg2; 346 a.buf = args->ptr; 347 return (linux_msgctl(td, &a)); 348 } 349 case LINUX_SHMAT: { 350 struct linux_shmat_args a; 351 352 a.shmid = args->arg1; 353 a.shmaddr = args->ptr; 354 a.shmflg = args->arg2; 355 a.raddr = PTRIN((l_uint)args->arg3); 356 return (linux_shmat(td, &a)); 357 } 358 case LINUX_SHMDT: { 359 struct linux_shmdt_args a; 360 361 a.shmaddr = args->ptr; 362 return (linux_shmdt(td, &a)); 363 } 364 case LINUX_SHMGET: { 365 struct linux_shmget_args a; 366 367 a.key = args->arg1; 368 a.size = args->arg2; 369 a.shmflg = args->arg3; 370 return (linux_shmget(td, &a)); 371 } 372 case LINUX_SHMCTL: { 373 struct linux_shmctl_args a; 374 375 a.shmid = args->arg1; 376 a.cmd = args->arg2; 377 a.buf = args->ptr; 378 return (linux_shmctl(td, &a)); 379 } 380 default: 381 break; 382 } 383 384 return (EINVAL); 385} 386 387int 388linux_old_select(struct thread *td, struct linux_old_select_args *args) 389{ 390 struct l_old_select_argv linux_args; 391 struct linux_select_args newsel; 392 int error; 393 394#ifdef DEBUG 395 if (ldebug(old_select)) 396 printf(ARGS(old_select, "%p"), args->ptr); 397#endif 398 399 error = copyin(args->ptr, &linux_args, sizeof(linux_args)); 400 if (error) 401 return (error); 402 403 newsel.nfds = linux_args.nfds; 404 newsel.readfds = PTRIN(linux_args.readfds); 405 newsel.writefds = PTRIN(linux_args.writefds); 406 newsel.exceptfds = PTRIN(linux_args.exceptfds); 407 newsel.timeout = PTRIN(linux_args.timeout); 408 return (linux_select(td, &newsel)); 409} 410 411int 412linux_fork(struct thread *td, struct linux_fork_args *args) 413{ 414 int error; 415 struct proc *p2; 416 struct thread *td2; 417 418#ifdef DEBUG 419 if 
(ldebug(fork)) 420 printf(ARGS(fork, "")); 421#endif 422 423 if ((error = fork1(td, RFFDG | RFPROC | RFSTOPPED, 0, &p2)) != 0) 424 return (error); 425 426 if (error == 0) { 427 td->td_retval[0] = p2->p_pid; 428 td->td_retval[1] = 0; 429 } 430 431 if (td->td_retval[1] == 1) 432 td->td_retval[0] = 0; 433 error = linux_proc_init(td, td->td_retval[0], 0); 434 if (error) 435 return (error); 436 437 td2 = FIRST_THREAD_IN_PROC(p2); 438 439 /* 440 * Make this runnable after we are finished with it. 441 */ 442 thread_lock(td2); 443 TD_SET_CAN_RUN(td2); 444 sched_add(td2, SRQ_BORING); 445 thread_unlock(td2); 446 447 return (0); 448} 449 450int 451linux_vfork(struct thread *td, struct linux_vfork_args *args) 452{ 453 int error; 454 struct proc *p2; 455 struct thread *td2; 456 457#ifdef DEBUG 458 if (ldebug(vfork)) 459 printf(ARGS(vfork, "")); 460#endif 461 462 /* Exclude RFPPWAIT */ 463 if ((error = fork1(td, RFFDG | RFPROC | RFMEM | RFSTOPPED, 0, &p2)) != 0) 464 return (error); 465 if (error == 0) { 466 td->td_retval[0] = p2->p_pid; 467 td->td_retval[1] = 0; 468 } 469 /* Are we the child? */ 470 if (td->td_retval[1] == 1) 471 td->td_retval[0] = 0; 472 error = linux_proc_init(td, td->td_retval[0], 0); 473 if (error) 474 return (error); 475 476 PROC_LOCK(p2); 477 p2->p_flag |= P_PPWAIT; 478 PROC_UNLOCK(p2); 479 480 td2 = FIRST_THREAD_IN_PROC(p2); 481 482 /* 483 * Make this runnable after we are finished with it. 484 */ 485 thread_lock(td2); 486 TD_SET_CAN_RUN(td2); 487 sched_add(td2, SRQ_BORING); 488 thread_unlock(td2); 489 490 /* wait for the children to exit, ie. 
emulate vfork */ 491 PROC_LOCK(p2); 492 while (p2->p_flag & P_PPWAIT) 493 cv_wait(&p2->p_pwait, &p2->p_mtx); 494 PROC_UNLOCK(p2); 495 496 return (0); 497} 498 499int 500linux_clone(struct thread *td, struct linux_clone_args *args) 501{ 502 int error, ff = RFPROC | RFSTOPPED; 503 struct proc *p2; 504 struct thread *td2; 505 int exit_signal; 506 struct linux_emuldata *em; 507 508#ifdef DEBUG 509 if (ldebug(clone)) { 510 printf(ARGS(clone, "flags %x, stack %p, parent tid: %p, " 511 "child tid: %p"), (unsigned)args->flags, 512 args->stack, args->parent_tidptr, args->child_tidptr); 513 } 514#endif 515 516 exit_signal = args->flags & 0x000000ff; 517 if (LINUX_SIG_VALID(exit_signal)) { 518 if (exit_signal <= LINUX_SIGTBLSZ) 519 exit_signal = 520 linux_to_bsd_signal[_SIG_IDX(exit_signal)]; 521 } else if (exit_signal != 0) 522 return (EINVAL); 523 524 if (args->flags & LINUX_CLONE_VM) 525 ff |= RFMEM; 526 if (args->flags & LINUX_CLONE_SIGHAND) 527 ff |= RFSIGSHARE; 528 /* 529 * XXX: In Linux, sharing of fs info (chroot/cwd/umask) 530 * and open files is independant. In FreeBSD, its in one 531 * structure but in reality it does not cause any problems 532 * because both of these flags are usually set together. 533 */ 534 if (!(args->flags & (LINUX_CLONE_FILES | LINUX_CLONE_FS))) 535 ff |= RFFDG; 536 537 /* 538 * Attempt to detect when linux_clone(2) is used for creating 539 * kernel threads. Unfortunately despite the existence of the 540 * CLONE_THREAD flag, version of linuxthreads package used in 541 * most popular distros as of beginning of 2005 doesn't make 542 * any use of it. Therefore, this detection relies on 543 * empirical observation that linuxthreads sets certain 544 * combination of flags, so that we can make more or less 545 * precise detection and notify the FreeBSD kernel that several 546 * processes are in fact part of the same threading group, so 547 * that special treatment is necessary for signal delivery 548 * between those processes and fd locking. 
549 */ 550 if ((args->flags & 0xffffff00) == LINUX_THREADING_FLAGS) 551 ff |= RFTHREAD; 552 553 if (args->flags & LINUX_CLONE_PARENT_SETTID) 554 if (args->parent_tidptr == NULL) 555 return (EINVAL); 556 557 error = fork1(td, ff, 0, &p2); 558 if (error) 559 return (error); 560 561 if (args->flags & (LINUX_CLONE_PARENT | LINUX_CLONE_THREAD)) { 562 sx_xlock(&proctree_lock); 563 PROC_LOCK(p2); 564 proc_reparent(p2, td->td_proc->p_pptr); 565 PROC_UNLOCK(p2); 566 sx_xunlock(&proctree_lock); 567 } 568 569 /* create the emuldata */ 570 error = linux_proc_init(td, p2->p_pid, args->flags); 571 /* reference it - no need to check this */ 572 em = em_find(p2, EMUL_DOLOCK); 573 KASSERT(em != NULL, ("clone: emuldata not found.\n")); 574 /* and adjust it */ 575 576 if (args->flags & LINUX_CLONE_THREAD) { 577#ifdef notyet 578 PROC_LOCK(p2); 579 p2->p_pgrp = td->td_proc->p_pgrp; 580 PROC_UNLOCK(p2); 581#endif 582 exit_signal = 0; 583 } 584 585 if (args->flags & LINUX_CLONE_CHILD_SETTID) 586 em->child_set_tid = args->child_tidptr; 587 else 588 em->child_set_tid = NULL; 589 590 if (args->flags & LINUX_CLONE_CHILD_CLEARTID) 591 em->child_clear_tid = args->child_tidptr; 592 else 593 em->child_clear_tid = NULL; 594 595 EMUL_UNLOCK(&emul_lock); 596 597 if (args->flags & LINUX_CLONE_PARENT_SETTID) { 598 error = copyout(&p2->p_pid, args->parent_tidptr, 599 sizeof(p2->p_pid)); 600 if (error) 601 printf(LMSG("copyout failed!")); 602 } 603 604 PROC_LOCK(p2); 605 p2->p_sigparent = exit_signal; 606 PROC_UNLOCK(p2); 607 td2 = FIRST_THREAD_IN_PROC(p2); 608 /* 609 * In a case of stack = NULL, we are supposed to COW calling process 610 * stack. This is what normal fork() does, so we just keep tf_rsp arg 611 * intact. 
612 */ 613 if (args->stack) 614 td2->td_frame->tf_rsp = PTROUT(args->stack); 615 616 if (args->flags & LINUX_CLONE_SETTLS) { 617 struct user_segment_descriptor sd; 618 struct l_user_desc info; 619 struct pcb *pcb; 620 int a[2]; 621 622 error = copyin((void *)td->td_frame->tf_rsi, &info, 623 sizeof(struct l_user_desc)); 624 if (error) { 625 printf(LMSG("copyin failed!")); 626 } else { 627 /* We might copy out the entry_number as GUGS32_SEL. */ 628 info.entry_number = GUGS32_SEL; 629 error = copyout(&info, (void *)td->td_frame->tf_rsi, 630 sizeof(struct l_user_desc)); 631 if (error) 632 printf(LMSG("copyout failed!")); 633 634 a[0] = LINUX_LDT_entry_a(&info); 635 a[1] = LINUX_LDT_entry_b(&info); 636 637 memcpy(&sd, &a, sizeof(a)); 638#ifdef DEBUG 639 if (ldebug(clone)) 640 printf("Segment created in clone with " 641 "CLONE_SETTLS: lobase: %x, hibase: %x, " 642 "lolimit: %x, hilimit: %x, type: %i, " 643 "dpl: %i, p: %i, xx: %i, long: %i, " 644 "def32: %i, gran: %i\n", sd.sd_lobase, 645 sd.sd_hibase, sd.sd_lolimit, sd.sd_hilimit, 646 sd.sd_type, sd.sd_dpl, sd.sd_p, sd.sd_xx, 647 sd.sd_long, sd.sd_def32, sd.sd_gran); 648#endif 649 pcb = td2->td_pcb; 650 pcb->pcb_gsbase = (register_t)info.base_addr; 651/* XXXKIB pcb->pcb_gs32sd = sd; */ 652 td2->td_frame->tf_gs = GSEL(GUGS32_SEL, SEL_UPL); 653 set_pcb_flags(pcb, PCB_GS32BIT | PCB_32BIT); 654 } 655 } 656 657#ifdef DEBUG 658 if (ldebug(clone)) 659 printf(LMSG("clone: successful rfork to %d, " 660 "stack %p sig = %d"), (int)p2->p_pid, args->stack, 661 exit_signal); 662#endif 663 if (args->flags & LINUX_CLONE_VFORK) { 664 PROC_LOCK(p2); 665 p2->p_flag |= P_PPWAIT; 666 PROC_UNLOCK(p2); 667 } 668 669 /* 670 * Make this runnable after we are finished with it. 671 */ 672 thread_lock(td2); 673 TD_SET_CAN_RUN(td2); 674 sched_add(td2, SRQ_BORING); 675 thread_unlock(td2); 676 677 td->td_retval[0] = p2->p_pid; 678 td->td_retval[1] = 0; 679 680 if (args->flags & LINUX_CLONE_VFORK) { 681 /* wait for the children to exit, ie. 
emulate vfork */ 682 PROC_LOCK(p2); 683 while (p2->p_flag & P_PPWAIT) 684 cv_wait(&p2->p_pwait, &p2->p_mtx); 685 PROC_UNLOCK(p2); 686 } 687 688 return (0); 689} 690 691#define STACK_SIZE (2 * 1024 * 1024) 692#define GUARD_SIZE (4 * PAGE_SIZE) 693 694int 695linux_mmap2(struct thread *td, struct linux_mmap2_args *args) 696{ 697 698#ifdef DEBUG 699 if (ldebug(mmap2)) 700 printf(ARGS(mmap2, "0x%08x, %d, %d, 0x%08x, %d, %d"), 701 args->addr, args->len, args->prot, 702 args->flags, args->fd, args->pgoff); 703#endif 704 705 return (linux_mmap_common(td, PTROUT(args->addr), args->len, args->prot, 706 args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff * 707 PAGE_SIZE)); 708} 709 710int 711linux_mmap(struct thread *td, struct linux_mmap_args *args) 712{ 713 int error; 714 struct l_mmap_argv linux_args; 715 716 error = copyin(args->ptr, &linux_args, sizeof(linux_args)); 717 if (error) 718 return (error); 719 720#ifdef DEBUG 721 if (ldebug(mmap)) 722 printf(ARGS(mmap, "0x%08x, %d, %d, 0x%08x, %d, %d"), 723 linux_args.addr, linux_args.len, linux_args.prot, 724 linux_args.flags, linux_args.fd, linux_args.pgoff); 725#endif 726 727 return (linux_mmap_common(td, linux_args.addr, linux_args.len, 728 linux_args.prot, linux_args.flags, linux_args.fd, 729 (uint32_t)linux_args.pgoff)); 730} 731 732static int 733linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot, 734 l_int flags, l_int fd, l_loff_t pos) 735{ 736 struct proc *p = td->td_proc; 737 struct mmap_args /* { 738 caddr_t addr; 739 size_t len; 740 int prot; 741 int flags; 742 int fd; 743 long pad; 744 off_t pos; 745 } */ bsd_args; 746 int error; 747 struct file *fp; 748 749 error = 0; 750 bsd_args.flags = 0; 751 fp = NULL; 752 753 /* 754 * Linux mmap(2): 755 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE 756 */ 757 if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE))) 758 return (EINVAL); 759 760 if (flags & LINUX_MAP_SHARED) 761 bsd_args.flags |= MAP_SHARED; 762 if (flags 
& LINUX_MAP_PRIVATE) 763 bsd_args.flags |= MAP_PRIVATE; 764 if (flags & LINUX_MAP_FIXED) 765 bsd_args.flags |= MAP_FIXED; 766 if (flags & LINUX_MAP_ANON) { 767 /* Enforce pos to be on page boundary, then ignore. */ 768 if ((pos & PAGE_MASK) != 0) 769 return (EINVAL); 770 pos = 0; 771 bsd_args.flags |= MAP_ANON; 772 } else 773 bsd_args.flags |= MAP_NOSYNC; 774 if (flags & LINUX_MAP_GROWSDOWN) 775 bsd_args.flags |= MAP_STACK; 776 777 /* 778 * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC 779 * on Linux/i386. We do this to ensure maximum compatibility. 780 * Linux/ia64 does the same in i386 emulation mode. 781 */ 782 bsd_args.prot = prot; 783 if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) 784 bsd_args.prot |= PROT_READ | PROT_EXEC; 785 786 /* Linux does not check file descriptor when MAP_ANONYMOUS is set. */ 787 bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd; 788 if (bsd_args.fd != -1) { 789 /* 790 * Linux follows Solaris mmap(2) description: 791 * The file descriptor fildes is opened with 792 * read permission, regardless of the 793 * protection options specified. 794 */ 795 796 if ((error = fget(td, bsd_args.fd, &fp)) != 0) 797 return (error); 798 if (fp->f_type != DTYPE_VNODE) { 799 fdrop(fp, td); 800 return (EINVAL); 801 } 802 803 /* Linux mmap() just fails for O_WRONLY files */ 804 if (!(fp->f_flag & FREAD)) { 805 fdrop(fp, td); 806 return (EACCES); 807 } 808 809 fdrop(fp, td); 810 } 811 812 if (flags & LINUX_MAP_GROWSDOWN) { 813 /* 814 * The Linux MAP_GROWSDOWN option does not limit auto 815 * growth of the region. Linux mmap with this option 816 * takes as addr the inital BOS, and as len, the initial 817 * region size. It can then grow down from addr without 818 * limit. However, Linux threads has an implicit internal 819 * limit to stack size of STACK_SIZE. Its just not 820 * enforced explicitly in Linux. 
But, here we impose 821 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack 822 * region, since we can do this with our mmap. 823 * 824 * Our mmap with MAP_STACK takes addr as the maximum 825 * downsize limit on BOS, and as len the max size of 826 * the region. It then maps the top SGROWSIZ bytes, 827 * and auto grows the region down, up to the limit 828 * in addr. 829 * 830 * If we don't use the MAP_STACK option, the effect 831 * of this code is to allocate a stack region of a 832 * fixed size of (STACK_SIZE - GUARD_SIZE). 833 */ 834 835 if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) { 836 /* 837 * Some Linux apps will attempt to mmap 838 * thread stacks near the top of their 839 * address space. If their TOS is greater 840 * than vm_maxsaddr, vm_map_growstack() 841 * will confuse the thread stack with the 842 * process stack and deliver a SEGV if they 843 * attempt to grow the thread stack past their 844 * current stacksize rlimit. To avoid this, 845 * adjust vm_maxsaddr upwards to reflect 846 * the current stacksize rlimit rather 847 * than the maximum possible stacksize. 848 * It would be better to adjust the 849 * mmap'ed region, but some apps do not check 850 * mmap's return value. 851 */ 852 PROC_LOCK(p); 853 p->p_vmspace->vm_maxsaddr = (char *)LINUX32_USRSTACK - 854 lim_cur(p, RLIMIT_STACK); 855 PROC_UNLOCK(p); 856 } 857 858 /* 859 * This gives us our maximum stack size and a new BOS. 860 * If we're using VM_STACK, then mmap will just map 861 * the top SGROWSIZ bytes, and let the stack grow down 862 * to the limit at BOS. If we're not using VM_STACK 863 * we map the full stack, since we don't have a way 864 * to autogrow it. 
865 */ 866 if (len > STACK_SIZE - GUARD_SIZE) { 867 bsd_args.addr = (caddr_t)PTRIN(addr); 868 bsd_args.len = len; 869 } else { 870 bsd_args.addr = (caddr_t)PTRIN(addr) - 871 (STACK_SIZE - GUARD_SIZE - len); 872 bsd_args.len = STACK_SIZE - GUARD_SIZE; 873 } 874 } else { 875 bsd_args.addr = (caddr_t)PTRIN(addr); 876 bsd_args.len = len; 877 } 878 bsd_args.pos = pos; 879 880#ifdef DEBUG 881 if (ldebug(mmap)) 882 printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n", 883 __func__, 884 (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot, 885 bsd_args.flags, bsd_args.fd, (int)bsd_args.pos); 886#endif 887 error = mmap(td, &bsd_args); 888#ifdef DEBUG 889 if (ldebug(mmap)) 890 printf("-> %s() return: 0x%x (0x%08x)\n", 891 __func__, error, (u_int)td->td_retval[0]); 892#endif 893 return (error); 894} 895 896int 897linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) 898{ 899 struct mprotect_args bsd_args; 900 901 bsd_args.addr = uap->addr; 902 bsd_args.len = uap->len; 903 bsd_args.prot = uap->prot; 904 if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) 905 bsd_args.prot |= PROT_READ | PROT_EXEC; 906 return (mprotect(td, &bsd_args)); 907} 908 909int 910linux_iopl(struct thread *td, struct linux_iopl_args *args) 911{ 912 int error; 913 914 if (args->level < 0 || args->level > 3) 915 return (EINVAL); 916 if ((error = priv_check(td, PRIV_IO)) != 0) 917 return (error); 918 if ((error = securelevel_gt(td->td_ucred, 0)) != 0) 919 return (error); 920 td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) | 921 (args->level * (PSL_IOPL / 3)); 922 923 return (0); 924} 925 926int 927linux_pipe(struct thread *td, struct linux_pipe_args *args) 928{ 929 int error; 930 int fildes[2]; 931 932#ifdef DEBUG 933 if (ldebug(pipe)) 934 printf(ARGS(pipe, "*")); 935#endif 936 937 error = kern_pipe(td, fildes); 938 if (error) 939 return (error); 940 941 /* XXX: Close descriptors on error. 
*/ 942 return (copyout(fildes, args->pipefds, sizeof fildes)); 943} 944 945int 946linux_sigaction(struct thread *td, struct linux_sigaction_args *args) 947{ 948 l_osigaction_t osa; 949 l_sigaction_t act, oact; 950 int error; 951 952#ifdef DEBUG 953 if (ldebug(sigaction)) 954 printf(ARGS(sigaction, "%d, %p, %p"), 955 args->sig, (void *)args->nsa, (void *)args->osa); 956#endif 957 958 if (args->nsa != NULL) { 959 error = copyin(args->nsa, &osa, sizeof(l_osigaction_t)); 960 if (error) 961 return (error); 962 act.lsa_handler = osa.lsa_handler; 963 act.lsa_flags = osa.lsa_flags; 964 act.lsa_restorer = osa.lsa_restorer; 965 LINUX_SIGEMPTYSET(act.lsa_mask); 966 act.lsa_mask.__bits[0] = osa.lsa_mask; 967 } 968 969 error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL, 970 args->osa ? &oact : NULL); 971 972 if (args->osa != NULL && !error) { 973 osa.lsa_handler = oact.lsa_handler; 974 osa.lsa_flags = oact.lsa_flags; 975 osa.lsa_restorer = oact.lsa_restorer; 976 osa.lsa_mask = oact.lsa_mask.__bits[0]; 977 error = copyout(&osa, args->osa, sizeof(l_osigaction_t)); 978 } 979 980 return (error); 981} 982 983/* 984 * Linux has two extra args, restart and oldmask. We don't use these, 985 * but it seems that "restart" is actually a context pointer that 986 * enables the signal to happen with a different register set. 
987 */ 988int 989linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args) 990{ 991 sigset_t sigmask; 992 l_sigset_t mask; 993 994#ifdef DEBUG 995 if (ldebug(sigsuspend)) 996 printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask); 997#endif 998 999 LINUX_SIGEMPTYSET(mask); 1000 mask.__bits[0] = args->mask; 1001 linux_to_bsd_sigset(&mask, &sigmask); 1002 return (kern_sigsuspend(td, sigmask)); 1003} 1004 1005int 1006linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap) 1007{ 1008 l_sigset_t lmask; 1009 sigset_t sigmask; 1010 int error; 1011 1012#ifdef DEBUG 1013 if (ldebug(rt_sigsuspend)) 1014 printf(ARGS(rt_sigsuspend, "%p, %d"), 1015 (void *)uap->newset, uap->sigsetsize); 1016#endif 1017 1018 if (uap->sigsetsize != sizeof(l_sigset_t)) 1019 return (EINVAL); 1020 1021 error = copyin(uap->newset, &lmask, sizeof(l_sigset_t)); 1022 if (error) 1023 return (error); 1024 1025 linux_to_bsd_sigset(&lmask, &sigmask); 1026 return (kern_sigsuspend(td, sigmask)); 1027} 1028 1029int 1030linux_pause(struct thread *td, struct linux_pause_args *args) 1031{ 1032 struct proc *p = td->td_proc; 1033 sigset_t sigmask; 1034 1035#ifdef DEBUG 1036 if (ldebug(pause)) 1037 printf(ARGS(pause, "")); 1038#endif 1039 1040 PROC_LOCK(p); 1041 sigmask = td->td_sigmask; 1042 PROC_UNLOCK(p); 1043 return (kern_sigsuspend(td, sigmask)); 1044} 1045 1046int 1047linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap) 1048{ 1049 stack_t ss, oss; 1050 l_stack_t lss; 1051 int error; 1052 1053#ifdef DEBUG 1054 if (ldebug(sigaltstack)) 1055 printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss); 1056#endif 1057 1058 if (uap->uss != NULL) { 1059 error = copyin(uap->uss, &lss, sizeof(l_stack_t)); 1060 if (error) 1061 return (error); 1062 1063 ss.ss_sp = PTRIN(lss.ss_sp); 1064 ss.ss_size = lss.ss_size; 1065 ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags); 1066 } 1067 error = kern_sigaltstack(td, (uap->uss != NULL) ? 
&ss : NULL, 1068 (uap->uoss != NULL) ? &oss : NULL); 1069 if (!error && uap->uoss != NULL) { 1070 lss.ss_sp = PTROUT(oss.ss_sp); 1071 lss.ss_size = oss.ss_size; 1072 lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags); 1073 error = copyout(&lss, uap->uoss, sizeof(l_stack_t)); 1074 } 1075 1076 return (error); 1077} 1078 1079int 1080linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args) 1081{ 1082 struct ftruncate_args sa; 1083 1084#ifdef DEBUG 1085 if (ldebug(ftruncate64)) 1086 printf(ARGS(ftruncate64, "%u, %jd"), args->fd, 1087 (intmax_t)args->length); 1088#endif 1089 1090 sa.fd = args->fd; 1091 sa.length = args->length; 1092 return ftruncate(td, &sa); 1093} 1094 1095int 1096linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap) 1097{ 1098 struct timeval atv; 1099 l_timeval atv32; 1100 struct timezone rtz; 1101 int error = 0; 1102 1103 if (uap->tp) { 1104 microtime(&atv); 1105 atv32.tv_sec = atv.tv_sec; 1106 atv32.tv_usec = atv.tv_usec; 1107 error = copyout(&atv32, uap->tp, sizeof(atv32)); 1108 } 1109 if (error == 0 && uap->tzp != NULL) { 1110 rtz.tz_minuteswest = tz_minuteswest; 1111 rtz.tz_dsttime = tz_dsttime; 1112 error = copyout(&rtz, uap->tzp, sizeof(rtz)); 1113 } 1114 return (error); 1115} 1116 1117int 1118linux_settimeofday(struct thread *td, struct linux_settimeofday_args *uap) 1119{ 1120 l_timeval atv32; 1121 struct timeval atv, *tvp; 1122 struct timezone atz, *tzp; 1123 int error; 1124 1125 if (uap->tp) { 1126 error = copyin(uap->tp, &atv32, sizeof(atv32)); 1127 if (error) 1128 return (error); 1129 atv.tv_sec = atv32.tv_sec; 1130 atv.tv_usec = atv32.tv_usec; 1131 tvp = &atv; 1132 } else 1133 tvp = NULL; 1134 if (uap->tzp) { 1135 error = copyin(uap->tzp, &atz, sizeof(atz)); 1136 if (error) 1137 return (error); 1138 tzp = &atz; 1139 } else 1140 tzp = NULL; 1141 return (kern_settimeofday(td, tvp, tzp)); 1142} 1143 1144int 1145linux_getrusage(struct thread *td, struct linux_getrusage_args *uap) 1146{ 1147 struct 
l_rusage s32; 1148 struct rusage s; 1149 int error; 1150 1151 error = kern_getrusage(td, uap->who, &s); 1152 if (error != 0) 1153 return (error); 1154 if (uap->rusage != NULL) { 1155 bsd_to_linux_rusage(&s, &s32); 1156 error = copyout(&s32, uap->rusage, sizeof(s32)); 1157 } 1158 return (error); 1159} 1160 1161int 1162linux_sched_rr_get_interval(struct thread *td, 1163 struct linux_sched_rr_get_interval_args *uap) 1164{ 1165 struct timespec ts; 1166 struct l_timespec ts32; 1167 int error; 1168 1169 error = kern_sched_rr_get_interval(td, uap->pid, &ts); 1170 if (error != 0) 1171 return (error); 1172 ts32.tv_sec = ts.tv_sec; 1173 ts32.tv_nsec = ts.tv_nsec; 1174 return (copyout(&ts32, uap->interval, sizeof(ts32))); 1175} 1176 1177int 1178linux_set_thread_area(struct thread *td, 1179 struct linux_set_thread_area_args *args) 1180{ 1181 struct l_user_desc info; 1182 struct user_segment_descriptor sd; 1183 struct pcb *pcb; 1184 int a[2]; 1185 int error; 1186 1187 error = copyin(args->desc, &info, sizeof(struct l_user_desc)); 1188 if (error) 1189 return (error); 1190 1191#ifdef DEBUG 1192 if (ldebug(set_thread_area)) 1193 printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, " 1194 "%i, %i, %i"), info.entry_number, info.base_addr, 1195 info.limit, info.seg_32bit, info.contents, 1196 info.read_exec_only, info.limit_in_pages, 1197 info.seg_not_present, info.useable); 1198#endif 1199 1200 /* 1201 * Semantics of Linux version: every thread in the system has array 1202 * of three TLS descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. 1203 * This syscall loads one of the selected TLS decriptors with a value 1204 * and also loads GDT descriptors 6, 7 and 8 with the content of 1205 * the per-thread descriptors. 1206 * 1207 * Semantics of FreeBSD version: I think we can ignore that Linux has 1208 * three per-thread descriptors and use just the first one. 1209 * The tls_array[] is used only in [gs]et_thread_area() syscalls and 1210 * for loading the GDT descriptors. 
We use just one GDT descriptor 1211 * for TLS, so we will load just one. 1212 * 1213 * XXX: This doesn't work when a user space process tries to use more 1214 * than one TLS segment. Comment in the Linux source says wine might 1215 * do this. 1216 */ 1217 1218 /* 1219 * GLIBC reads current %gs and call set_thread_area() with it. 1220 * We should let GUDATA_SEL and GUGS32_SEL proceed as well because 1221 * we use these segments. 1222 */ 1223 switch (info.entry_number) { 1224 case GUGS32_SEL: 1225 case GUDATA_SEL: 1226 case 6: 1227 case -1: 1228 info.entry_number = GUGS32_SEL; 1229 break; 1230 default: 1231 return (EINVAL); 1232 } 1233 1234 /* 1235 * We have to copy out the GDT entry we use. 1236 * 1237 * XXX: What if a user space program does not check the return value 1238 * and tries to use 6, 7 or 8? 1239 */ 1240 error = copyout(&info, args->desc, sizeof(struct l_user_desc)); 1241 if (error) 1242 return (error); 1243 1244 if (LINUX_LDT_empty(&info)) { 1245 a[0] = 0; 1246 a[1] = 0; 1247 } else { 1248 a[0] = LINUX_LDT_entry_a(&info); 1249 a[1] = LINUX_LDT_entry_b(&info); 1250 } 1251 1252 memcpy(&sd, &a, sizeof(a)); 1253#ifdef DEBUG 1254 if (ldebug(set_thread_area)) 1255 printf("Segment created in set_thread_area: " 1256 "lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, " 1257 "type: %i, dpl: %i, p: %i, xx: %i, long: %i, " 1258 "def32: %i, gran: %i\n", 1259 sd.sd_lobase, 1260 sd.sd_hibase, 1261 sd.sd_lolimit, 1262 sd.sd_hilimit, 1263 sd.sd_type, 1264 sd.sd_dpl, 1265 sd.sd_p, 1266 sd.sd_xx, 1267 sd.sd_long, 1268 sd.sd_def32, 1269 sd.sd_gran); 1270#endif 1271 1272 pcb = td->td_pcb; 1273 pcb->pcb_gsbase = (register_t)info.base_addr; 1274 set_pcb_flags(pcb, PCB_32BIT | PCB_GS32BIT); 1275 update_gdt_gsbase(td, info.base_addr); 1276 1277 return (0); 1278} 1279 1280int 1281linux_wait4(struct thread *td, struct linux_wait4_args *args) 1282{ 1283 int error, options; 1284 struct rusage ru, *rup; 1285 struct l_rusage lru; 1286 struct proc *p; 1287 1288#ifdef DEBUG 1289 if 
(ldebug(wait4)) 1290 printf(ARGS(wait4, "%d, %p, %d, %p"), 1291 args->pid, (void *)args->status, args->options, 1292 (void *)args->rusage); 1293#endif 1294 1295 options = (args->options & (WNOHANG | WUNTRACED)); 1296 /* WLINUXCLONE should be equal to __WCLONE, but we make sure */ 1297 if (args->options & __WCLONE) 1298 options |= WLINUXCLONE; 1299 1300 if (args->rusage != NULL) 1301 rup = &ru; 1302 else 1303 rup = NULL; 1304 error = linux_common_wait(td, args->pid, args->status, options, rup); 1305 if (error) 1306 return (error); 1307 1308 p = td->td_proc; 1309 PROC_LOCK(p); 1310 sigqueue_delete(&p->p_sigqueue, SIGCHLD); 1311 PROC_UNLOCK(p); 1312 1313 if (args->rusage != NULL) { 1314 bsd_to_linux_rusage(rup, &lru); 1315 error = copyout(&lru, args->rusage, sizeof(lru)); 1316 } 1317 1318 return (error); 1319} 1320