linux32_machdep.c revision 163372
1/*- 2 * Copyright (c) 2004 Tim J. Robbins 3 * Copyright (c) 2002 Doug Rabson 4 * Copyright (c) 2000 Marcel Moolenaar 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer 12 * in this position and unchanged. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. The name of the author may not be used to endorse or promote products 17 * derived from this software without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31#include <sys/cdefs.h> 32__FBSDID("$FreeBSD: head/sys/amd64/linux32/linux32_machdep.c 163372 2006-10-15 13:25:23Z netchild $"); 33 34#include <sys/param.h> 35#include <sys/kernel.h> 36#include <sys/systm.h> 37#include <sys/clock.h> 38#include <sys/imgact.h> 39#include <sys/limits.h> 40#include <sys/lock.h> 41#include <sys/malloc.h> 42#include <sys/mman.h> 43#include <sys/mutex.h> 44#include <sys/proc.h> 45#include <sys/resource.h> 46#include <sys/resourcevar.h> 47#include <sys/syscallsubr.h> 48#include <sys/sysproto.h> 49#include <sys/unistd.h> 50 51#include <machine/frame.h> 52 53#include <vm/vm.h> 54#include <vm/pmap.h> 55#include <vm/vm_extern.h> 56#include <vm/vm_kern.h> 57#include <vm/vm_map.h> 58 59#include <amd64/linux32/linux.h> 60#include <amd64/linux32/linux32_proto.h> 61#include <compat/linux/linux_ipc.h> 62#include <compat/linux/linux_signal.h> 63#include <compat/linux/linux_util.h> 64#include <compat/linux/linux_emul.h> 65 66struct l_old_select_argv { 67 l_int nfds; 68 l_uintptr_t readfds; 69 l_uintptr_t writefds; 70 l_uintptr_t exceptfds; 71 l_uintptr_t timeout; 72} __packed; 73 74int 75linux_to_bsd_sigaltstack(int lsa) 76{ 77 int bsa = 0; 78 79 if (lsa & LINUX_SS_DISABLE) 80 bsa |= SS_DISABLE; 81 if (lsa & LINUX_SS_ONSTACK) 82 bsa |= SS_ONSTACK; 83 return (bsa); 84} 85 86int 87bsd_to_linux_sigaltstack(int bsa) 88{ 89 int lsa = 0; 90 91 if (bsa & SS_DISABLE) 92 lsa |= LINUX_SS_DISABLE; 93 if (bsa & SS_ONSTACK) 94 lsa |= LINUX_SS_ONSTACK; 95 return (lsa); 96} 97 98/* 99 * Custom version of exec_copyin_args() so that we can translate 100 * the pointers. 101 */ 102static int 103linux_exec_copyin_args(struct image_args *args, char *fname, 104 enum uio_seg segflg, char **argv, char **envv) 105{ 106 char *argp, *envp; 107 u_int32_t *p32, arg; 108 size_t length; 109 int error; 110 111 bzero(args, sizeof(*args)); 112 if (argv == NULL) 113 return (EFAULT); 114 115 /* 116 * Allocate temporary demand zeroed space for argument and 117 * environment strings 118 */ 119 args->buf = (char *) kmem_alloc_wait(exec_map, 120 PATH_MAX + ARG_MAX + MAXSHELLCMDLEN); 121 if (args->buf == NULL) 122 return (ENOMEM); 123 args->begin_argv = args->buf; 124 args->endp = args->begin_argv; 125 args->stringspace = ARG_MAX; 126 127 args->fname = args->buf + ARG_MAX; 128 129 /* 130 * Copy the file name. 131 */ 132 error = (segflg == UIO_SYSSPACE) ? 133 copystr(fname, args->fname, PATH_MAX, &length) : 134 copyinstr(fname, args->fname, PATH_MAX, &length); 135 if (error != 0) 136 goto err_exit; 137 138 /* 139 * extract arguments first 140 */ 141 p32 = (u_int32_t *)argv; 142 for (;;) { 143 error = copyin(p32++, &arg, sizeof(arg)); 144 if (error) 145 goto err_exit; 146 if (arg == 0) 147 break; 148 argp = PTRIN(arg); 149 error = copyinstr(argp, args->endp, args->stringspace, &length); 150 if (error) { 151 if (error == ENAMETOOLONG) 152 error = E2BIG; 153 154 goto err_exit; 155 } 156 args->stringspace -= length; 157 args->endp += length; 158 args->argc++; 159 } 160 161 args->begin_envv = args->endp; 162 163 /* 164 * extract environment strings 165 */ 166 if (envv) { 167 p32 = (u_int32_t *)envv; 168 for (;;) { 169 error = copyin(p32++, &arg, sizeof(arg)); 170 if (error) 171 goto err_exit; 172 if (arg == 0) 173 break; 174 envp = PTRIN(arg); 175 error = copyinstr(envp, args->endp, args->stringspace, 176 &length); 177 if (error) { 178 if (error == ENAMETOOLONG) 179 error = E2BIG; 180 goto err_exit; 181 } 182 args->stringspace -= length; 183 args->endp += length; 184 args->envc++; 185 } 186 } 187 188 return (0); 189 190err_exit: 191 kmem_free_wakeup(exec_map, (vm_offset_t)args->buf, 192 PATH_MAX + ARG_MAX + MAXSHELLCMDLEN); 193 args->buf = NULL; 194 return (error); 195} 196 197int 198linux_execve(struct thread *td, struct linux_execve_args *args) 199{ 200 struct image_args eargs; 201 char *path; 202 int error; 203 204 LCONVPATHEXIST(td, args->path, &path); 205 206#ifdef DEBUG 207 if (ldebug(execve)) 208 printf(ARGS(execve, "%s"), path); 209#endif 210 211 error = linux_exec_copyin_args(&eargs, path, UIO_SYSSPACE, args->argp, 212 args->envp); 213 free(path, M_TEMP); 214 if (error == 0) 215 error = kern_execve(td, &eargs, NULL); 216 if (error == 0) 217 /* linux process can exec fbsd one, dont attempt 218 * to create emuldata for such process using 219 * linux_proc_init, this leads to a panic on KASSERT 220 * because such process has p->p_emuldata == NULL 221 */ 222 if (td->td_proc->p_sysent == &elf_linux_sysvec) 223 error = linux_proc_init(td, 0, 0); 224 return (error); 225} 226 227struct iovec32 { 228 u_int32_t iov_base; 229 int iov_len; 230}; 231 232CTASSERT(sizeof(struct iovec32) == 8); 233 234static int 235linux32_copyinuio(struct iovec32 *iovp, u_int iovcnt, struct uio **uiop) 236{ 237 struct iovec32 iov32; 238 struct iovec *iov; 239 struct uio *uio; 240 u_int iovlen; 241 int error, i; 242 243 *uiop = NULL; 244 if (iovcnt > UIO_MAXIOV) 245 return (EINVAL); 246 iovlen = iovcnt * sizeof(struct iovec); 247 uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK); 248 iov = (struct iovec *)(uio + 1); 249 for (i = 0; i < iovcnt; i++) { 250 error = copyin(&iovp[i], &iov32, sizeof(struct iovec32)); 251 if (error) { 252 free(uio, M_IOV); 253 return (error); 254 } 255 iov[i].iov_base = PTRIN(iov32.iov_base); 256 iov[i].iov_len = iov32.iov_len; 257 } 258 uio->uio_iov = iov; 259 uio->uio_iovcnt = iovcnt; 260 uio->uio_segflg = UIO_USERSPACE; 261 uio->uio_offset = -1; 262 uio->uio_resid = 0; 263 for (i = 0; i < iovcnt; i++) { 264 if (iov->iov_len > INT_MAX - uio->uio_resid) { 265 free(uio, M_IOV); 266 return (EINVAL); 267 } 268 uio->uio_resid += iov->iov_len; 269 iov++; 270 } 271 *uiop = uio; 272 return (0); 273} 274 275int 276linux_readv(struct thread *td, struct linux_readv_args *uap) 277{ 278 struct uio *auio; 279 int error; 280 281 error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); 282 if (error) 283 return (error); 284 error = kern_readv(td, uap->fd, auio); 285 free(auio, M_IOV); 286 return (error); 287} 288 289int 290linux_writev(struct thread *td, struct linux_writev_args *uap) 291{ 292 struct uio *auio; 293 int error; 294 295 error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); 296 if (error) 297 return (error); 298 error = kern_writev(td, uap->fd, auio); 299 free(auio, M_IOV); 300 return (error); 301} 302 303struct l_ipc_kludge { 304 l_uintptr_t msgp; 305 l_long msgtyp; 306} __packed; 307 308int 309linux_ipc(struct thread *td, struct linux_ipc_args *args) 310{ 311 312 switch (args->what & 0xFFFF) { 313 case LINUX_SEMOP: { 314 struct linux_semop_args a; 315 316 a.semid = args->arg1; 317 a.tsops = args->ptr; 318 a.nsops = args->arg2; 319 return (linux_semop(td, &a)); 320 } 321 case LINUX_SEMGET: { 322 struct linux_semget_args a; 323 324 a.key = args->arg1; 325 a.nsems = args->arg2; 326 a.semflg = args->arg3; 327 return (linux_semget(td, &a)); 328 } 329 case LINUX_SEMCTL: { 330 struct linux_semctl_args a; 331 int error; 332 333 a.semid = args->arg1; 334 a.semnum = args->arg2; 335 a.cmd = args->arg3; 336 error = copyin(args->ptr, &a.arg, sizeof(a.arg)); 337 if (error) 338 return (error); 339 return (linux_semctl(td, &a)); 340 } 341 case LINUX_MSGSND: { 342 struct linux_msgsnd_args a; 343 344 a.msqid = args->arg1; 345 a.msgp = args->ptr; 346 a.msgsz = args->arg2; 347 a.msgflg = args->arg3; 348 return (linux_msgsnd(td, &a)); 349 } 350 case LINUX_MSGRCV: { 351 struct linux_msgrcv_args a; 352 353 a.msqid = args->arg1; 354 a.msgsz = args->arg2; 355 a.msgflg = args->arg3; 356 if ((args->what >> 16) == 0) { 357 struct l_ipc_kludge tmp; 358 int error; 359 360 if (args->ptr == 0) 361 return (EINVAL); 362 error = copyin(args->ptr, &tmp, sizeof(tmp)); 363 if (error) 364 return (error); 365 a.msgp = PTRIN(tmp.msgp); 366 a.msgtyp = tmp.msgtyp; 367 } else { 368 a.msgp = args->ptr; 369 a.msgtyp = args->arg5; 370 } 371 return (linux_msgrcv(td, &a)); 372 } 373 case LINUX_MSGGET: { 374 struct linux_msgget_args a; 375 376 a.key = args->arg1; 377 a.msgflg = args->arg2; 378 return (linux_msgget(td, &a)); 379 } 380 case LINUX_MSGCTL: { 381 struct linux_msgctl_args a; 382 383 a.msqid = args->arg1; 384 a.cmd = args->arg2; 385 a.buf = args->ptr; 386 return (linux_msgctl(td, &a)); 387 } 388 case LINUX_SHMAT: { 389 struct linux_shmat_args a; 390 391 a.shmid = args->arg1; 392 a.shmaddr = args->ptr; 393 a.shmflg = args->arg2; 394 a.raddr = PTRIN((l_uint)args->arg3); 395 return (linux_shmat(td, &a)); 396 } 397 case LINUX_SHMDT: { 398 struct linux_shmdt_args a; 399 400 a.shmaddr = args->ptr; 401 return (linux_shmdt(td, &a)); 402 } 403 case LINUX_SHMGET: { 404 struct linux_shmget_args a; 405 406 a.key = args->arg1; 407 a.size = args->arg2; 408 a.shmflg = args->arg3; 409 return (linux_shmget(td, &a)); 410 } 411 case LINUX_SHMCTL: { 412 struct linux_shmctl_args a; 413 414 a.shmid = args->arg1; 415 a.cmd = args->arg2; 416 a.buf = args->ptr; 417 return (linux_shmctl(td, &a)); 418 } 419 default: 420 break; 421 } 422 423 return (EINVAL); 424} 425 426int 427linux_old_select(struct thread *td, struct linux_old_select_args *args) 428{ 429 struct l_old_select_argv linux_args; 430 struct linux_select_args newsel; 431 int error; 432 433#ifdef DEBUG 434 if (ldebug(old_select)) 435 printf(ARGS(old_select, "%p"), args->ptr); 436#endif 437 438 error = copyin(args->ptr, &linux_args, sizeof(linux_args)); 439 if (error) 440 return (error); 441 442 newsel.nfds = linux_args.nfds; 443 newsel.readfds = PTRIN(linux_args.readfds); 444 newsel.writefds = PTRIN(linux_args.writefds); 445 newsel.exceptfds = PTRIN(linux_args.exceptfds); 446 newsel.timeout = PTRIN(linux_args.timeout); 447 return (linux_select(td, &newsel)); 448} 449 450int 451linux_fork(struct thread *td, struct linux_fork_args *args) 452{ 453 int error; 454 455#ifdef DEBUG 456 if (ldebug(fork)) 457 printf(ARGS(fork, "")); 458#endif 459 460 if ((error = fork(td, (struct fork_args *)args)) != 0) 461 return (error); 462 463 if (td->td_retval[1] == 1) 464 td->td_retval[0] = 0; 465 error = linux_proc_init(td, td->td_retval[0], 0); 466 if (error) 467 return (error); 468 469 return (0); 470} 471 472int 473linux_vfork(struct thread *td, struct linux_vfork_args *args) 474{ 475 int error; 476 struct proc *p2; 477 478#ifdef DEBUG 479 if (ldebug(vfork)) 480 printf(ARGS(vfork, "")); 481#endif 482 483 /* exclude RFPPWAIT */ 484 if ((error = fork1(td, RFFDG | RFPROC | RFMEM, 0, &p2)) != 0) 485 return (error); 486 if (error == 0) { 487 td->td_retval[0] = p2->p_pid; 488 td->td_retval[1] = 0; 489 } 490 /* Are we the child? */ 491 if (td->td_retval[1] == 1) 492 td->td_retval[0] = 0; 493 error = linux_proc_init(td, td->td_retval[0], 0); 494 if (error) 495 return (error); 496 /* wait for the children to exit, ie. emulate vfork */ 497 PROC_LOCK(p2); 498 p2->p_flag |= P_PPWAIT; 499 while (p2->p_flag & P_PPWAIT) 500 msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0); 501 PROC_UNLOCK(p2); 502 return (0); 503} 504 505int 506linux_clone(struct thread *td, struct linux_clone_args *args) 507{ 508 int error, ff = RFPROC | RFSTOPPED; 509 struct proc *p2; 510 struct thread *td2; 511 int exit_signal; 512 struct linux_emuldata *em; 513 514#ifdef DEBUG 515 if (ldebug(clone)) { 516 printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"), 517 (unsigned int)args->flags, (unsigned int)(uintptr_t)args->stack, 518 (unsigned int)(uintptr_t)args->parent_tidptr, 519 (unsigned int)(uintptr_t)args->child_tidptr); 520 } 521#endif 522 523 exit_signal = args->flags & 0x000000ff; 524 if (exit_signal >= LINUX_NSIG) 525 return (EINVAL); 526 527 if (exit_signal <= LINUX_SIGTBLSZ) 528 exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)]; 529 530 if (args->flags & CLONE_VM) 531 ff |= RFMEM; 532 if (args->flags & CLONE_SIGHAND) 533 ff |= RFSIGSHARE; 534 /* 535 * XXX: in linux sharing of fs info (chroot/cwd/umask) 536 * and open files is independant. in fbsd its in one 537 * structure but in reality it doesnt make any problems 538 * because both this flags are set at once usually. 539 */ 540 if (!(args->flags & (CLONE_FILES | CLONE_FS))) 541 ff |= RFFDG; 542 543 /* 544 * Attempt to detect when linux_clone(2) is used for creating 545 * kernel threads. Unfortunately despite the existence of the 546 * CLONE_THREAD flag, version of linuxthreads package used in 547 * most popular distros as of beginning of 2005 doesn't make 548 * any use of it. Therefore, this detection relay fully on 549 * empirical observation that linuxthreads sets certain 550 * combination of flags, so that we can make more or less 551 * precise detection and notify the FreeBSD kernel that several 552 * processes are in fact part of the same threading group, so 553 * that special treatment is necessary for signal delivery 554 * between those processes and fd locking. 555 */ 556 if ((args->flags & 0xffffff00) == THREADING_FLAGS) 557 ff |= RFTHREAD; 558 559 error = fork1(td, ff, 0, &p2); 560 if (error) 561 return (error); 562 563 /* create the emuldata */ 564 error = linux_proc_init(td, p2->p_pid, args->flags); 565 /* reference it - no need to check this */ 566 em = em_find(p2, EMUL_UNLOCKED); 567 KASSERT(em != NULL, ("clone: emuldata not found.\n")); 568 /* and adjust it */ 569 if (args->flags & CLONE_PARENT_SETTID) { 570 if (args->parent_tidptr == NULL) { 571 EMUL_UNLOCK(&emul_lock); 572 return (EINVAL); 573 } 574 error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid)); 575 if (error) { 576 EMUL_UNLOCK(&emul_lock); 577 return (error); 578 } 579 } 580 581 if (args->flags & (CLONE_PARENT|CLONE_THREAD)) { 582 sx_xlock(&proctree_lock); 583 PROC_LOCK(p2); 584 proc_reparent(p2, td->td_proc->p_pptr); 585 PROC_UNLOCK(p2); 586 sx_xunlock(&proctree_lock); 587 } 588 589 if (args->flags & CLONE_THREAD) { 590 /* XXX: linux mangles pgrp and pptr somehow 591 * I think it might be this but I am not sure. 592 */ 593#ifdef notyet 594 PROC_LOCK(p2); 595 p2->p_pgrp = td->td_proc->p_pgrp; 596 PROC_UNLOCK(p2); 597#endif 598 exit_signal = 0; 599 } 600 601 if (args->flags & CLONE_CHILD_SETTID) 602 em->child_set_tid = args->child_tidptr; 603 else 604 em->child_set_tid = NULL; 605 606 if (args->flags & CLONE_CHILD_CLEARTID) 607 em->child_clear_tid = args->child_tidptr; 608 else 609 em->child_clear_tid = NULL; 610 611 EMUL_UNLOCK(&emul_lock); 612 613 PROC_LOCK(p2); 614 p2->p_sigparent = exit_signal; 615 PROC_UNLOCK(p2); 616 td2 = FIRST_THREAD_IN_PROC(p2); 617 /* 618 * in a case of stack = NULL we are supposed to COW calling process stack 619 * this is what normal fork() does so we just keep the tf_rsp arg intact 620 */ 621 if (args->stack) 622 td2->td_frame->tf_rsp = PTROUT(args->stack); 623 624 if (args->flags & CLONE_SETTLS) { 625 /* XXX: todo */ 626 } 627 628#ifdef DEBUG 629 if (ldebug(clone)) 630 printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"), 631 (long)p2->p_pid, args->stack, exit_signal); 632#endif 633 634 /* 635 * Make this runnable after we are finished with it. 636 */ 637 mtx_lock_spin(&sched_lock); 638 TD_SET_CAN_RUN(td2); 639 setrunqueue(td2, SRQ_BORING); 640 mtx_unlock_spin(&sched_lock); 641 642 td->td_retval[0] = p2->p_pid; 643 td->td_retval[1] = 0; 644 return (0); 645} 646 647/* XXX move */ 648struct l_mmap_argv { 649 l_ulong addr; 650 l_ulong len; 651 l_ulong prot; 652 l_ulong flags; 653 l_ulong fd; 654 l_ulong pgoff; 655}; 656 657#define STACK_SIZE (2 * 1024 * 1024) 658#define GUARD_SIZE (4 * PAGE_SIZE) 659 660static int linux_mmap_common(struct thread *, struct l_mmap_argv *); 661 662int 663linux_mmap2(struct thread *td, struct linux_mmap2_args *args) 664{ 665 struct l_mmap_argv linux_args; 666 667#ifdef DEBUG 668 if (ldebug(mmap2)) 669 printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"), 670 (void *)(intptr_t)args->addr, args->len, args->prot, 671 args->flags, args->fd, args->pgoff); 672#endif 673 674 linux_args.addr = PTROUT(args->addr); 675 linux_args.len = args->len; 676 linux_args.prot = args->prot; 677 linux_args.flags = args->flags; 678 linux_args.fd = args->fd; 679 linux_args.pgoff = args->pgoff; 680 681 return (linux_mmap_common(td, &linux_args)); 682} 683 684int 685linux_mmap(struct thread *td, struct linux_mmap_args *args) 686{ 687 int error; 688 struct l_mmap_argv linux_args; 689 690 error = copyin(args->ptr, &linux_args, sizeof(linux_args)); 691 if (error) 692 return (error); 693 694#ifdef DEBUG 695 if (ldebug(mmap)) 696 printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"), 697 (void *)(intptr_t)linux_args.addr, linux_args.len, 698 linux_args.prot, linux_args.flags, linux_args.fd, 699 linux_args.pgoff); 700#endif 701 if ((linux_args.pgoff % PAGE_SIZE) != 0) 702 return (EINVAL); 703 linux_args.pgoff /= PAGE_SIZE; 704 705 return (linux_mmap_common(td, &linux_args)); 706} 707 708static int 709linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args) 710{ 711 struct proc *p = td->td_proc; 712 struct mmap_args /* { 713 caddr_t addr; 714 size_t len; 715 int prot; 716 int flags; 717 int fd; 718 long pad; 719 off_t pos; 720 } */ bsd_args; 721 int error; 722 723 error = 0; 724 bsd_args.flags = 0; 725 if (linux_args->flags & LINUX_MAP_SHARED) 726 bsd_args.flags |= MAP_SHARED; 727 if (linux_args->flags & LINUX_MAP_PRIVATE) 728 bsd_args.flags |= MAP_PRIVATE; 729 if (linux_args->flags & LINUX_MAP_FIXED) 730 bsd_args.flags |= MAP_FIXED; 731 if (linux_args->flags & LINUX_MAP_ANON) 732 bsd_args.flags |= MAP_ANON; 733 else 734 bsd_args.flags |= MAP_NOSYNC; 735 if (linux_args->flags & LINUX_MAP_GROWSDOWN) { 736 bsd_args.flags |= MAP_STACK; 737 738 /* 739 * The linux MAP_GROWSDOWN option does not limit auto 740 * growth of the region. Linux mmap with this option 741 * takes as addr the inital BOS, and as len, the initial 742 * region size. It can then grow down from addr without 743 * limit. However, linux threads has an implicit internal 744 * limit to stack size of STACK_SIZE. Its just not 745 * enforced explicitly in linux. But, here we impose 746 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack 747 * region, since we can do this with our mmap. 748 * 749 * Our mmap with MAP_STACK takes addr as the maximum 750 * downsize limit on BOS, and as len the max size of 751 * the region. It them maps the top SGROWSIZ bytes, 752 * and autgrows the region down, up to the limit 753 * in addr. 754 * 755 * If we don't use the MAP_STACK option, the effect 756 * of this code is to allocate a stack region of a 757 * fixed size of (STACK_SIZE - GUARD_SIZE). 758 */ 759 760 /* This gives us TOS */ 761 bsd_args.addr = (caddr_t)PTRIN(linux_args->addr) + 762 linux_args->len; 763 764 if ((caddr_t)PTRIN(bsd_args.addr) > 765 p->p_vmspace->vm_maxsaddr) { 766 /* 767 * Some linux apps will attempt to mmap 768 * thread stacks near the top of their 769 * address space. If their TOS is greater 770 * than vm_maxsaddr, vm_map_growstack() 771 * will confuse the thread stack with the 772 * process stack and deliver a SEGV if they 773 * attempt to grow the thread stack past their 774 * current stacksize rlimit. To avoid this, 775 * adjust vm_maxsaddr upwards to reflect 776 * the current stacksize rlimit rather 777 * than the maximum possible stacksize. 778 * It would be better to adjust the 779 * mmap'ed region, but some apps do not check 780 * mmap's return value. 781 */ 782 PROC_LOCK(p); 783 p->p_vmspace->vm_maxsaddr = 784 (char *)LINUX32_USRSTACK - 785 lim_cur(p, RLIMIT_STACK); 786 PROC_UNLOCK(p); 787 } 788 789 /* This gives us our maximum stack size */ 790 if (linux_args->len > STACK_SIZE - GUARD_SIZE) 791 bsd_args.len = linux_args->len; 792 else 793 bsd_args.len = STACK_SIZE - GUARD_SIZE; 794 795 /* 796 * This gives us a new BOS. If we're using VM_STACK, then 797 * mmap will just map the top SGROWSIZ bytes, and let 798 * the stack grow down to the limit at BOS. If we're 799 * not using VM_STACK we map the full stack, since we 800 * don't have a way to autogrow it. 801 */ 802 bsd_args.addr -= bsd_args.len; 803 } else { 804 bsd_args.addr = (caddr_t)PTRIN(linux_args->addr); 805 bsd_args.len = linux_args->len; 806 } 807 /* 808 * XXX i386 Linux always emulator forces PROT_READ on (why?) 809 * so we do the same. We add PROT_EXEC to work around buggy 810 * applications (e.g. Java) that take advantage of the fact 811 * that execute permissions are not enforced by x86 CPUs. 812 */ 813 bsd_args.prot = linux_args->prot | PROT_EXEC | PROT_READ; 814 if (linux_args->flags & LINUX_MAP_ANON) 815 bsd_args.fd = -1; 816 else 817 bsd_args.fd = linux_args->fd; 818 bsd_args.pos = (off_t)linux_args->pgoff * PAGE_SIZE; 819 bsd_args.pad = 0; 820 821#ifdef DEBUG 822 if (ldebug(mmap)) 823 printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n", 824 __func__, 825 (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot, 826 bsd_args.flags, bsd_args.fd, (int)bsd_args.pos); 827#endif 828 error = mmap(td, &bsd_args); 829#ifdef DEBUG 830 if (ldebug(mmap)) 831 printf("-> %s() return: 0x%x (0x%08x)\n", 832 __func__, error, (u_int)td->td_retval[0]); 833#endif 834 return (error); 835} 836 837int 838linux_pipe(struct thread *td, struct linux_pipe_args *args) 839{ 840 int pip[2]; 841 int error; 842 register_t reg_rdx; 843 844#ifdef DEBUG 845 if (ldebug(pipe)) 846 printf(ARGS(pipe, "*")); 847#endif 848 849 reg_rdx = td->td_retval[1]; 850 error = pipe(td, 0); 851 if (error) { 852 td->td_retval[1] = reg_rdx; 853 return (error); 854 } 855 856 pip[0] = td->td_retval[0]; 857 pip[1] = td->td_retval[1]; 858 error = copyout(pip, args->pipefds, 2 * sizeof(int)); 859 if (error) { 860 td->td_retval[1] = reg_rdx; 861 return (error); 862 } 863 864 td->td_retval[1] = reg_rdx; 865 td->td_retval[0] = 0; 866 return (0); 867} 868 869int 870linux_sigaction(struct thread *td, struct linux_sigaction_args *args) 871{ 872 l_osigaction_t osa; 873 l_sigaction_t act, oact; 874 int error; 875 876#ifdef DEBUG 877 if (ldebug(sigaction)) 878 printf(ARGS(sigaction, "%d, %p, %p"), 879 args->sig, (void *)args->nsa, (void *)args->osa); 880#endif 881 882 if (args->nsa != NULL) { 883 error = copyin(args->nsa, &osa, sizeof(l_osigaction_t)); 884 if (error) 885 return (error); 886 act.lsa_handler = osa.lsa_handler; 887 act.lsa_flags = osa.lsa_flags; 888 act.lsa_restorer = osa.lsa_restorer; 889 LINUX_SIGEMPTYSET(act.lsa_mask); 890 act.lsa_mask.__bits[0] = osa.lsa_mask; 891 } 892 893 error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL, 894 args->osa ? &oact : NULL); 895 896 if (args->osa != NULL && !error) { 897 osa.lsa_handler = oact.lsa_handler; 898 osa.lsa_flags = oact.lsa_flags; 899 osa.lsa_restorer = oact.lsa_restorer; 900 osa.lsa_mask = oact.lsa_mask.__bits[0]; 901 error = copyout(&osa, args->osa, sizeof(l_osigaction_t)); 902 } 903 904 return (error); 905} 906 907/* 908 * Linux has two extra args, restart and oldmask. We dont use these, 909 * but it seems that "restart" is actually a context pointer that 910 * enables the signal to happen with a different register set. 911 */ 912int 913linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args) 914{ 915 sigset_t sigmask; 916 l_sigset_t mask; 917 918#ifdef DEBUG 919 if (ldebug(sigsuspend)) 920 printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask); 921#endif 922 923 LINUX_SIGEMPTYSET(mask); 924 mask.__bits[0] = args->mask; 925 linux_to_bsd_sigset(&mask, &sigmask); 926 return (kern_sigsuspend(td, sigmask)); 927} 928 929int 930linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap) 931{ 932 l_sigset_t lmask; 933 sigset_t sigmask; 934 int error; 935 936#ifdef DEBUG 937 if (ldebug(rt_sigsuspend)) 938 printf(ARGS(rt_sigsuspend, "%p, %d"), 939 (void *)uap->newset, uap->sigsetsize); 940#endif 941 942 if (uap->sigsetsize != sizeof(l_sigset_t)) 943 return (EINVAL); 944 945 error = copyin(uap->newset, &lmask, sizeof(l_sigset_t)); 946 if (error) 947 return (error); 948 949 linux_to_bsd_sigset(&lmask, &sigmask); 950 return (kern_sigsuspend(td, sigmask)); 951} 952 953int 954linux_pause(struct thread *td, struct linux_pause_args *args) 955{ 956 struct proc *p = td->td_proc; 957 sigset_t sigmask; 958 959#ifdef DEBUG 960 if (ldebug(pause)) 961 printf(ARGS(pause, "")); 962#endif 963 964 PROC_LOCK(p); 965 sigmask = td->td_sigmask; 966 PROC_UNLOCK(p); 967 return (kern_sigsuspend(td, sigmask)); 968} 969 970int 971linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap) 972{ 973 stack_t ss, oss; 974 l_stack_t lss; 975 int error; 976 977#ifdef DEBUG 978 if (ldebug(sigaltstack)) 979 printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss); 980#endif 981 982 if (uap->uss != NULL) { 983 error = copyin(uap->uss, &lss, sizeof(l_stack_t)); 984 if (error) 985 return (error); 986 987 ss.ss_sp = PTRIN(lss.ss_sp); 988 ss.ss_size = lss.ss_size; 989 ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags); 990 } 991 error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL, 992 (uap->uoss != NULL) ? &oss : NULL); 993 if (!error && uap->uoss != NULL) { 994 lss.ss_sp = PTROUT(oss.ss_sp); 995 lss.ss_size = oss.ss_size; 996 lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags); 997 error = copyout(&lss, uap->uoss, sizeof(l_stack_t)); 998 } 999 1000 return (error); 1001} 1002 1003int 1004linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args) 1005{ 1006 struct ftruncate_args sa; 1007 1008#ifdef DEBUG 1009 if (ldebug(ftruncate64)) 1010 printf(ARGS(ftruncate64, "%u, %jd"), args->fd, 1011 (intmax_t)args->length); 1012#endif 1013 1014 sa.fd = args->fd; 1015 sa.pad = 0; 1016 sa.length = args->length; 1017 return ftruncate(td, &sa); 1018} 1019 1020int 1021linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap) 1022{ 1023 struct timeval atv; 1024 l_timeval atv32; 1025 struct timezone rtz; 1026 int error = 0; 1027 1028 if (uap->tp) { 1029 microtime(&atv); 1030 atv32.tv_sec = atv.tv_sec; 1031 atv32.tv_usec = atv.tv_usec; 1032 error = copyout(&atv32, uap->tp, sizeof (atv32)); 1033 } 1034 if (error == 0 && uap->tzp != NULL) { 1035 rtz.tz_minuteswest = tz_minuteswest; 1036 rtz.tz_dsttime = tz_dsttime; 1037 error = copyout(&rtz, uap->tzp, sizeof (rtz)); 1038 } 1039 return (error); 1040} 1041 1042int 1043linux_nanosleep(struct thread *td, struct linux_nanosleep_args *uap) 1044{ 1045 struct timespec rqt, rmt; 1046 struct l_timespec ats32; 1047 int error; 1048 1049 error = copyin(uap->rqtp, &ats32, sizeof(ats32)); 1050 if (error != 0) 1051 return (error); 1052 rqt.tv_sec = ats32.tv_sec; 1053 rqt.tv_nsec = ats32.tv_nsec; 1054 error = kern_nanosleep(td, &rqt, &rmt); 1055 if (uap->rmtp != NULL) { 1056 ats32.tv_sec = rmt.tv_sec; 1057 ats32.tv_nsec = rmt.tv_nsec; 1058 error = copyout(&ats32, uap->rmtp, sizeof(ats32)); 1059 } 1060 return (error); 1061} 1062 1063int 1064linux_getrusage(struct thread *td, struct linux_getrusage_args *uap) 1065{ 1066 struct l_rusage s32; 1067 struct rusage s; 1068 int error; 1069 1070 error = kern_getrusage(td, uap->who, &s); 1071 if (error != 0) 1072 return (error); 1073 if (uap->rusage != NULL) { 1074 s32.ru_utime.tv_sec = s.ru_utime.tv_sec; 1075 s32.ru_utime.tv_usec = s.ru_utime.tv_usec; 1076 s32.ru_stime.tv_sec = s.ru_stime.tv_sec; 1077 s32.ru_stime.tv_usec = s.ru_stime.tv_usec; 1078 s32.ru_maxrss = s.ru_maxrss; 1079 s32.ru_ixrss = s.ru_ixrss; 1080 s32.ru_idrss = s.ru_idrss; 1081 s32.ru_isrss = s.ru_isrss; 1082 s32.ru_minflt = s.ru_minflt; 1083 s32.ru_majflt = s.ru_majflt; 1084 s32.ru_nswap = s.ru_nswap; 1085 s32.ru_inblock = s.ru_inblock; 1086 s32.ru_oublock = s.ru_oublock; 1087 s32.ru_msgsnd = s.ru_msgsnd; 1088 s32.ru_msgrcv = s.ru_msgrcv; 1089 s32.ru_nsignals = s.ru_nsignals; 1090 s32.ru_nvcsw = s.ru_nvcsw; 1091 s32.ru_nivcsw = s.ru_nivcsw; 1092 error = copyout(&s32, uap->rusage, sizeof(s32)); 1093 } 1094 return (error); 1095} 1096 1097int 1098linux_sched_rr_get_interval(struct thread *td, 1099 struct linux_sched_rr_get_interval_args *uap) 1100{ 1101 struct timespec ts; 1102 struct l_timespec ts32; 1103 int error; 1104 1105 error = kern_sched_rr_get_interval(td, uap->pid, &ts); 1106 if (error != 0) 1107 return (error); 1108 ts32.tv_sec = ts.tv_sec; 1109 ts32.tv_nsec = ts.tv_nsec; 1110 return (copyout(&ts32, uap->interval, sizeof(ts32))); 1111} 1112 1113int 1114linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) 1115{ 1116 struct mprotect_args bsd_args; 1117 1118 bsd_args.addr = uap->addr; 1119 bsd_args.len = uap->len; 1120 bsd_args.prot = uap->prot; 1121 /* XXX PROT_READ implies PROT_EXEC; see linux_mmap_common(). */ 1122 if ((bsd_args.prot & PROT_READ) != 0) 1123 bsd_args.prot |= PROT_EXEC; 1124 return (mprotect(td, &bsd_args)); 1125} 1126