/* linux32_machdep.c revision 166150 */
1/*- 2 * Copyright (c) 2004 Tim J. Robbins 3 * Copyright (c) 2002 Doug Rabson 4 * Copyright (c) 2000 Marcel Moolenaar 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer 12 * in this position and unchanged. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. The name of the author may not be used to endorse or promote products 17 * derived from this software without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 */ 30 31#include <sys/cdefs.h> 32__FBSDID("$FreeBSD: head/sys/amd64/linux32/linux32_machdep.c 166150 2007-01-20 14:58:59Z netchild $"); 33 34#include <sys/param.h> 35#include <sys/kernel.h> 36#include <sys/systm.h> 37#include <sys/file.h> 38#include <sys/fcntl.h> 39#include <sys/clock.h> 40#include <sys/imgact.h> 41#include <sys/limits.h> 42#include <sys/lock.h> 43#include <sys/malloc.h> 44#include <sys/mman.h> 45#include <sys/mutex.h> 46#include <sys/proc.h> 47#include <sys/resource.h> 48#include <sys/resourcevar.h> 49#include <sys/syscallsubr.h> 50#include <sys/sysproto.h> 51#include <sys/unistd.h> 52 53#include <machine/frame.h> 54 55#include <vm/vm.h> 56#include <vm/pmap.h> 57#include <vm/vm_extern.h> 58#include <vm/vm_kern.h> 59#include <vm/vm_map.h> 60 61#include <amd64/linux32/linux.h> 62#include <amd64/linux32/linux32_proto.h> 63#include <compat/linux/linux_ipc.h> 64#include <compat/linux/linux_signal.h> 65#include <compat/linux/linux_util.h> 66#include <compat/linux/linux_emul.h> 67 68struct l_old_select_argv { 69 l_int nfds; 70 l_uintptr_t readfds; 71 l_uintptr_t writefds; 72 l_uintptr_t exceptfds; 73 l_uintptr_t timeout; 74} __packed; 75 76int 77linux_to_bsd_sigaltstack(int lsa) 78{ 79 int bsa = 0; 80 81 if (lsa & LINUX_SS_DISABLE) 82 bsa |= SS_DISABLE; 83 if (lsa & LINUX_SS_ONSTACK) 84 bsa |= SS_ONSTACK; 85 return (bsa); 86} 87 88int 89bsd_to_linux_sigaltstack(int bsa) 90{ 91 int lsa = 0; 92 93 if (bsa & SS_DISABLE) 94 lsa |= LINUX_SS_DISABLE; 95 if (bsa & SS_ONSTACK) 96 lsa |= LINUX_SS_ONSTACK; 97 return (lsa); 98} 99 100/* 101 * Custom version of exec_copyin_args() so that we can translate 102 * the pointers. 
103 */ 104static int 105linux_exec_copyin_args(struct image_args *args, char *fname, 106 enum uio_seg segflg, char **argv, char **envv) 107{ 108 char *argp, *envp; 109 u_int32_t *p32, arg; 110 size_t length; 111 int error; 112 113 bzero(args, sizeof(*args)); 114 if (argv == NULL) 115 return (EFAULT); 116 117 /* 118 * Allocate temporary demand zeroed space for argument and 119 * environment strings 120 */ 121 args->buf = (char *) kmem_alloc_wait(exec_map, 122 PATH_MAX + ARG_MAX + MAXSHELLCMDLEN); 123 if (args->buf == NULL) 124 return (ENOMEM); 125 args->begin_argv = args->buf; 126 args->endp = args->begin_argv; 127 args->stringspace = ARG_MAX; 128 129 args->fname = args->buf + ARG_MAX; 130 131 /* 132 * Copy the file name. 133 */ 134 error = (segflg == UIO_SYSSPACE) ? 135 copystr(fname, args->fname, PATH_MAX, &length) : 136 copyinstr(fname, args->fname, PATH_MAX, &length); 137 if (error != 0) 138 goto err_exit; 139 140 /* 141 * extract arguments first 142 */ 143 p32 = (u_int32_t *)argv; 144 for (;;) { 145 error = copyin(p32++, &arg, sizeof(arg)); 146 if (error) 147 goto err_exit; 148 if (arg == 0) 149 break; 150 argp = PTRIN(arg); 151 error = copyinstr(argp, args->endp, args->stringspace, &length); 152 if (error) { 153 if (error == ENAMETOOLONG) 154 error = E2BIG; 155 156 goto err_exit; 157 } 158 args->stringspace -= length; 159 args->endp += length; 160 args->argc++; 161 } 162 163 args->begin_envv = args->endp; 164 165 /* 166 * extract environment strings 167 */ 168 if (envv) { 169 p32 = (u_int32_t *)envv; 170 for (;;) { 171 error = copyin(p32++, &arg, sizeof(arg)); 172 if (error) 173 goto err_exit; 174 if (arg == 0) 175 break; 176 envp = PTRIN(arg); 177 error = copyinstr(envp, args->endp, args->stringspace, 178 &length); 179 if (error) { 180 if (error == ENAMETOOLONG) 181 error = E2BIG; 182 goto err_exit; 183 } 184 args->stringspace -= length; 185 args->endp += length; 186 args->envc++; 187 } 188 } 189 190 return (0); 191 192err_exit: 193 kmem_free_wakeup(exec_map, 
(vm_offset_t)args->buf, 194 PATH_MAX + ARG_MAX + MAXSHELLCMDLEN); 195 args->buf = NULL; 196 return (error); 197} 198 199int 200linux_execve(struct thread *td, struct linux_execve_args *args) 201{ 202 struct image_args eargs; 203 char *path; 204 int error; 205 206 LCONVPATHEXIST(td, args->path, &path); 207 208#ifdef DEBUG 209 if (ldebug(execve)) 210 printf(ARGS(execve, "%s"), path); 211#endif 212 213 error = linux_exec_copyin_args(&eargs, path, UIO_SYSSPACE, args->argp, 214 args->envp); 215 free(path, M_TEMP); 216 if (error == 0) 217 error = kern_execve(td, &eargs, NULL); 218 if (error == 0) 219 /* linux process can exec fbsd one, dont attempt 220 * to create emuldata for such process using 221 * linux_proc_init, this leads to a panic on KASSERT 222 * because such process has p->p_emuldata == NULL 223 */ 224 if (td->td_proc->p_sysent == &elf_linux_sysvec) 225 error = linux_proc_init(td, 0, 0); 226 return (error); 227} 228 229struct iovec32 { 230 u_int32_t iov_base; 231 int iov_len; 232}; 233 234CTASSERT(sizeof(struct iovec32) == 8); 235 236static int 237linux32_copyinuio(struct iovec32 *iovp, u_int iovcnt, struct uio **uiop) 238{ 239 struct iovec32 iov32; 240 struct iovec *iov; 241 struct uio *uio; 242 u_int iovlen; 243 int error, i; 244 245 *uiop = NULL; 246 if (iovcnt > UIO_MAXIOV) 247 return (EINVAL); 248 iovlen = iovcnt * sizeof(struct iovec); 249 uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK); 250 iov = (struct iovec *)(uio + 1); 251 for (i = 0; i < iovcnt; i++) { 252 error = copyin(&iovp[i], &iov32, sizeof(struct iovec32)); 253 if (error) { 254 free(uio, M_IOV); 255 return (error); 256 } 257 iov[i].iov_base = PTRIN(iov32.iov_base); 258 iov[i].iov_len = iov32.iov_len; 259 } 260 uio->uio_iov = iov; 261 uio->uio_iovcnt = iovcnt; 262 uio->uio_segflg = UIO_USERSPACE; 263 uio->uio_offset = -1; 264 uio->uio_resid = 0; 265 for (i = 0; i < iovcnt; i++) { 266 if (iov->iov_len > INT_MAX - uio->uio_resid) { 267 free(uio, M_IOV); 268 return (EINVAL); 269 } 270 
uio->uio_resid += iov->iov_len; 271 iov++; 272 } 273 *uiop = uio; 274 return (0); 275} 276 277int 278linux_readv(struct thread *td, struct linux_readv_args *uap) 279{ 280 struct uio *auio; 281 int error; 282 283 error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); 284 if (error) 285 return (error); 286 error = kern_readv(td, uap->fd, auio); 287 free(auio, M_IOV); 288 return (error); 289} 290 291int 292linux_writev(struct thread *td, struct linux_writev_args *uap) 293{ 294 struct uio *auio; 295 int error; 296 297 error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); 298 if (error) 299 return (error); 300 error = kern_writev(td, uap->fd, auio); 301 free(auio, M_IOV); 302 return (error); 303} 304 305struct l_ipc_kludge { 306 l_uintptr_t msgp; 307 l_long msgtyp; 308} __packed; 309 310int 311linux_ipc(struct thread *td, struct linux_ipc_args *args) 312{ 313 314 switch (args->what & 0xFFFF) { 315 case LINUX_SEMOP: { 316 struct linux_semop_args a; 317 318 a.semid = args->arg1; 319 a.tsops = args->ptr; 320 a.nsops = args->arg2; 321 return (linux_semop(td, &a)); 322 } 323 case LINUX_SEMGET: { 324 struct linux_semget_args a; 325 326 a.key = args->arg1; 327 a.nsems = args->arg2; 328 a.semflg = args->arg3; 329 return (linux_semget(td, &a)); 330 } 331 case LINUX_SEMCTL: { 332 struct linux_semctl_args a; 333 int error; 334 335 a.semid = args->arg1; 336 a.semnum = args->arg2; 337 a.cmd = args->arg3; 338 error = copyin(args->ptr, &a.arg, sizeof(a.arg)); 339 if (error) 340 return (error); 341 return (linux_semctl(td, &a)); 342 } 343 case LINUX_MSGSND: { 344 struct linux_msgsnd_args a; 345 346 a.msqid = args->arg1; 347 a.msgp = args->ptr; 348 a.msgsz = args->arg2; 349 a.msgflg = args->arg3; 350 return (linux_msgsnd(td, &a)); 351 } 352 case LINUX_MSGRCV: { 353 struct linux_msgrcv_args a; 354 355 a.msqid = args->arg1; 356 a.msgsz = args->arg2; 357 a.msgflg = args->arg3; 358 if ((args->what >> 16) == 0) { 359 struct l_ipc_kludge tmp; 360 int error; 361 362 if (args->ptr == 0) 
363 return (EINVAL); 364 error = copyin(args->ptr, &tmp, sizeof(tmp)); 365 if (error) 366 return (error); 367 a.msgp = PTRIN(tmp.msgp); 368 a.msgtyp = tmp.msgtyp; 369 } else { 370 a.msgp = args->ptr; 371 a.msgtyp = args->arg5; 372 } 373 return (linux_msgrcv(td, &a)); 374 } 375 case LINUX_MSGGET: { 376 struct linux_msgget_args a; 377 378 a.key = args->arg1; 379 a.msgflg = args->arg2; 380 return (linux_msgget(td, &a)); 381 } 382 case LINUX_MSGCTL: { 383 struct linux_msgctl_args a; 384 385 a.msqid = args->arg1; 386 a.cmd = args->arg2; 387 a.buf = args->ptr; 388 return (linux_msgctl(td, &a)); 389 } 390 case LINUX_SHMAT: { 391 struct linux_shmat_args a; 392 393 a.shmid = args->arg1; 394 a.shmaddr = args->ptr; 395 a.shmflg = args->arg2; 396 a.raddr = PTRIN((l_uint)args->arg3); 397 return (linux_shmat(td, &a)); 398 } 399 case LINUX_SHMDT: { 400 struct linux_shmdt_args a; 401 402 a.shmaddr = args->ptr; 403 return (linux_shmdt(td, &a)); 404 } 405 case LINUX_SHMGET: { 406 struct linux_shmget_args a; 407 408 a.key = args->arg1; 409 a.size = args->arg2; 410 a.shmflg = args->arg3; 411 return (linux_shmget(td, &a)); 412 } 413 case LINUX_SHMCTL: { 414 struct linux_shmctl_args a; 415 416 a.shmid = args->arg1; 417 a.cmd = args->arg2; 418 a.buf = args->ptr; 419 return (linux_shmctl(td, &a)); 420 } 421 default: 422 break; 423 } 424 425 return (EINVAL); 426} 427 428int 429linux_old_select(struct thread *td, struct linux_old_select_args *args) 430{ 431 struct l_old_select_argv linux_args; 432 struct linux_select_args newsel; 433 int error; 434 435#ifdef DEBUG 436 if (ldebug(old_select)) 437 printf(ARGS(old_select, "%p"), args->ptr); 438#endif 439 440 error = copyin(args->ptr, &linux_args, sizeof(linux_args)); 441 if (error) 442 return (error); 443 444 newsel.nfds = linux_args.nfds; 445 newsel.readfds = PTRIN(linux_args.readfds); 446 newsel.writefds = PTRIN(linux_args.writefds); 447 newsel.exceptfds = PTRIN(linux_args.exceptfds); 448 newsel.timeout = PTRIN(linux_args.timeout); 449 
return (linux_select(td, &newsel)); 450} 451 452int 453linux_fork(struct thread *td, struct linux_fork_args *args) 454{ 455 int error; 456 struct proc *p2; 457 struct thread *td2; 458 459#ifdef DEBUG 460 if (ldebug(fork)) 461 printf(ARGS(fork, "")); 462#endif 463 464 if ((error = fork1(td, RFFDG | RFPROC | RFSTOPPED, 0, &p2)) != 0) 465 return (error); 466 467 if (error == 0) { 468 td->td_retval[0] = p2->p_pid; 469 td->td_retval[1] = 0; 470 } 471 472 if (td->td_retval[1] == 1) 473 td->td_retval[0] = 0; 474 error = linux_proc_init(td, td->td_retval[0], 0); 475 if (error) 476 return (error); 477 478 td2 = FIRST_THREAD_IN_PROC(p2); 479 480 /* make it run */ 481 mtx_lock_spin(&sched_lock); 482 TD_SET_CAN_RUN(td2); 483 setrunqueue(td2, SRQ_BORING); 484 mtx_unlock_spin(&sched_lock); 485 486 return (0); 487} 488 489int 490linux_vfork(struct thread *td, struct linux_vfork_args *args) 491{ 492 int error; 493 struct proc *p2; 494 struct thread *td2; 495 496#ifdef DEBUG 497 if (ldebug(vfork)) 498 printf(ARGS(vfork, "")); 499#endif 500 501 /* exclude RFPPWAIT */ 502 if ((error = fork1(td, RFFDG | RFPROC | RFMEM | RFSTOPPED, 0, &p2)) != 0) 503 return (error); 504 if (error == 0) { 505 td->td_retval[0] = p2->p_pid; 506 td->td_retval[1] = 0; 507 } 508 /* Are we the child? */ 509 if (td->td_retval[1] == 1) 510 td->td_retval[0] = 0; 511 error = linux_proc_init(td, td->td_retval[0], 0); 512 if (error) 513 return (error); 514 515 PROC_LOCK(p2); 516 p2->p_flag |= P_PPWAIT; 517 PROC_UNLOCK(p2); 518 519 td2 = FIRST_THREAD_IN_PROC(p2); 520 521 /* make it run */ 522 mtx_lock_spin(&sched_lock); 523 TD_SET_CAN_RUN(td2); 524 setrunqueue(td2, SRQ_BORING); 525 mtx_unlock_spin(&sched_lock); 526 527 /* wait for the children to exit, ie. 
emulate vfork */ 528 PROC_LOCK(p2); 529 while (p2->p_flag & P_PPWAIT) 530 msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0); 531 PROC_UNLOCK(p2); 532 533 return (0); 534} 535 536int 537linux_clone(struct thread *td, struct linux_clone_args *args) 538{ 539 int error, ff = RFPROC | RFSTOPPED; 540 struct proc *p2; 541 struct thread *td2; 542 int exit_signal; 543 struct linux_emuldata *em; 544 545#ifdef DEBUG 546 if (ldebug(clone)) { 547 printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"), 548 (unsigned int)args->flags, (unsigned int)(uintptr_t)args->stack, 549 (unsigned int)(uintptr_t)args->parent_tidptr, 550 (unsigned int)(uintptr_t)args->child_tidptr); 551 } 552#endif 553 554 exit_signal = args->flags & 0x000000ff; 555 if (exit_signal >= LINUX_NSIG) 556 return (EINVAL); 557 558 if (exit_signal <= LINUX_SIGTBLSZ) 559 exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)]; 560 561 if (args->flags & CLONE_VM) 562 ff |= RFMEM; 563 if (args->flags & CLONE_SIGHAND) 564 ff |= RFSIGSHARE; 565 /* 566 * XXX: in linux sharing of fs info (chroot/cwd/umask) 567 * and open files is independant. in fbsd its in one 568 * structure but in reality it doesnt make any problems 569 * because both this flags are set at once usually. 570 */ 571 if (!(args->flags & (CLONE_FILES | CLONE_FS))) 572 ff |= RFFDG; 573 574 /* 575 * Attempt to detect when linux_clone(2) is used for creating 576 * kernel threads. Unfortunately despite the existence of the 577 * CLONE_THREAD flag, version of linuxthreads package used in 578 * most popular distros as of beginning of 2005 doesn't make 579 * any use of it. 
Therefore, this detection relay fully on 580 * empirical observation that linuxthreads sets certain 581 * combination of flags, so that we can make more or less 582 * precise detection and notify the FreeBSD kernel that several 583 * processes are in fact part of the same threading group, so 584 * that special treatment is necessary for signal delivery 585 * between those processes and fd locking. 586 */ 587 if ((args->flags & 0xffffff00) == THREADING_FLAGS) 588 ff |= RFTHREAD; 589 590 error = fork1(td, ff, 0, &p2); 591 if (error) 592 return (error); 593 594 /* create the emuldata */ 595 error = linux_proc_init(td, p2->p_pid, args->flags); 596 /* reference it - no need to check this */ 597 em = em_find(p2, EMUL_DOLOCK); 598 KASSERT(em != NULL, ("clone: emuldata not found.\n")); 599 /* and adjust it */ 600 if (args->flags & CLONE_PARENT_SETTID) { 601 if (args->parent_tidptr == NULL) { 602 EMUL_UNLOCK(&emul_lock); 603 return (EINVAL); 604 } 605 error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid)); 606 if (error) { 607 EMUL_UNLOCK(&emul_lock); 608 return (error); 609 } 610 } 611 612 if (args->flags & (CLONE_PARENT|CLONE_THREAD)) { 613 sx_xlock(&proctree_lock); 614 PROC_LOCK(p2); 615 proc_reparent(p2, td->td_proc->p_pptr); 616 PROC_UNLOCK(p2); 617 sx_xunlock(&proctree_lock); 618 } 619 620 if (args->flags & CLONE_THREAD) { 621 /* XXX: linux mangles pgrp and pptr somehow 622 * I think it might be this but I am not sure. 
623 */ 624#ifdef notyet 625 PROC_LOCK(p2); 626 p2->p_pgrp = td->td_proc->p_pgrp; 627 PROC_UNLOCK(p2); 628#endif 629 exit_signal = 0; 630 } 631 632 if (args->flags & CLONE_CHILD_SETTID) 633 em->child_set_tid = args->child_tidptr; 634 else 635 em->child_set_tid = NULL; 636 637 if (args->flags & CLONE_CHILD_CLEARTID) 638 em->child_clear_tid = args->child_tidptr; 639 else 640 em->child_clear_tid = NULL; 641 642 EMUL_UNLOCK(&emul_lock); 643 644 PROC_LOCK(p2); 645 p2->p_sigparent = exit_signal; 646 PROC_UNLOCK(p2); 647 td2 = FIRST_THREAD_IN_PROC(p2); 648 /* 649 * in a case of stack = NULL we are supposed to COW calling process stack 650 * this is what normal fork() does so we just keep the tf_rsp arg intact 651 */ 652 if (args->stack) 653 td2->td_frame->tf_rsp = PTROUT(args->stack); 654 655 if (args->flags & CLONE_SETTLS) { 656 /* XXX: todo */ 657 } 658 659#ifdef DEBUG 660 if (ldebug(clone)) 661 printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"), 662 (long)p2->p_pid, args->stack, exit_signal); 663#endif 664 if (args->flags & CLONE_VFORK) { 665 PROC_LOCK(p2); 666 p2->p_flag |= P_PPWAIT; 667 PROC_UNLOCK(p2); 668 } 669 670 /* 671 * Make this runnable after we are finished with it. 672 */ 673 mtx_lock_spin(&sched_lock); 674 TD_SET_CAN_RUN(td2); 675 setrunqueue(td2, SRQ_BORING); 676 mtx_unlock_spin(&sched_lock); 677 678 td->td_retval[0] = p2->p_pid; 679 td->td_retval[1] = 0; 680 681 if (args->flags & CLONE_VFORK) { 682 /* wait for the children to exit, ie. 
emulate vfork */ 683 PROC_LOCK(p2); 684 while (p2->p_flag & P_PPWAIT) 685 msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0); 686 PROC_UNLOCK(p2); 687 } 688 689 return (0); 690} 691 692/* XXX move */ 693struct l_mmap_argv { 694 l_ulong addr; 695 l_ulong len; 696 l_ulong prot; 697 l_ulong flags; 698 l_ulong fd; 699 l_ulong pgoff; 700}; 701 702#define STACK_SIZE (2 * 1024 * 1024) 703#define GUARD_SIZE (4 * PAGE_SIZE) 704 705static int linux_mmap_common(struct thread *, struct l_mmap_argv *); 706 707int 708linux_mmap2(struct thread *td, struct linux_mmap2_args *args) 709{ 710 struct l_mmap_argv linux_args; 711 712#ifdef DEBUG 713 if (ldebug(mmap2)) 714 printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"), 715 (void *)(intptr_t)args->addr, args->len, args->prot, 716 args->flags, args->fd, args->pgoff); 717#endif 718 719 linux_args.addr = PTROUT(args->addr); 720 linux_args.len = args->len; 721 linux_args.prot = args->prot; 722 linux_args.flags = args->flags; 723 linux_args.fd = args->fd; 724 linux_args.pgoff = args->pgoff; 725 726 return (linux_mmap_common(td, &linux_args)); 727} 728 729int 730linux_mmap(struct thread *td, struct linux_mmap_args *args) 731{ 732 int error; 733 struct l_mmap_argv linux_args; 734 735 error = copyin(args->ptr, &linux_args, sizeof(linux_args)); 736 if (error) 737 return (error); 738 739#ifdef DEBUG 740 if (ldebug(mmap)) 741 printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"), 742 (void *)(intptr_t)linux_args.addr, linux_args.len, 743 linux_args.prot, linux_args.flags, linux_args.fd, 744 linux_args.pgoff); 745#endif 746 if ((linux_args.pgoff % PAGE_SIZE) != 0) 747 return (EINVAL); 748 linux_args.pgoff /= PAGE_SIZE; 749 750 return (linux_mmap_common(td, &linux_args)); 751} 752 753static int 754linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args) 755{ 756 struct proc *p = td->td_proc; 757 struct mmap_args /* { 758 caddr_t addr; 759 size_t len; 760 int prot; 761 int flags; 762 int fd; 763 long pad; 764 off_t pos; 765 } */ bsd_args; 766 
int error; 767 struct file *fp; 768 769 error = 0; 770 bsd_args.flags = 0; 771 fp = NULL; 772 773 /* 774 * Linux mmap(2): 775 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE 776 */ 777 if (! ((linux_args->flags & LINUX_MAP_SHARED) ^ 778 (linux_args->flags & LINUX_MAP_PRIVATE))) 779 return (EINVAL); 780 781 if (linux_args->flags & LINUX_MAP_SHARED) 782 bsd_args.flags |= MAP_SHARED; 783 if (linux_args->flags & LINUX_MAP_PRIVATE) 784 bsd_args.flags |= MAP_PRIVATE; 785 if (linux_args->flags & LINUX_MAP_FIXED) 786 bsd_args.flags |= MAP_FIXED; 787 if (linux_args->flags & LINUX_MAP_ANON) 788 bsd_args.flags |= MAP_ANON; 789 else 790 bsd_args.flags |= MAP_NOSYNC; 791 if (linux_args->flags & LINUX_MAP_GROWSDOWN) { 792 bsd_args.flags |= MAP_STACK; 793 794 /* 795 * The linux MAP_GROWSDOWN option does not limit auto 796 * growth of the region. Linux mmap with this option 797 * takes as addr the inital BOS, and as len, the initial 798 * region size. It can then grow down from addr without 799 * limit. However, linux threads has an implicit internal 800 * limit to stack size of STACK_SIZE. Its just not 801 * enforced explicitly in linux. But, here we impose 802 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack 803 * region, since we can do this with our mmap. 804 * 805 * Our mmap with MAP_STACK takes addr as the maximum 806 * downsize limit on BOS, and as len the max size of 807 * the region. It them maps the top SGROWSIZ bytes, 808 * and autgrows the region down, up to the limit 809 * in addr. 810 * 811 * If we don't use the MAP_STACK option, the effect 812 * of this code is to allocate a stack region of a 813 * fixed size of (STACK_SIZE - GUARD_SIZE). 814 */ 815 816 /* This gives us TOS */ 817 bsd_args.addr = (caddr_t)PTRIN(linux_args->addr) + 818 linux_args->len; 819 820 if ((caddr_t)PTRIN(bsd_args.addr) > 821 p->p_vmspace->vm_maxsaddr) { 822 /* 823 * Some linux apps will attempt to mmap 824 * thread stacks near the top of their 825 * address space. 
If their TOS is greater 826 * than vm_maxsaddr, vm_map_growstack() 827 * will confuse the thread stack with the 828 * process stack and deliver a SEGV if they 829 * attempt to grow the thread stack past their 830 * current stacksize rlimit. To avoid this, 831 * adjust vm_maxsaddr upwards to reflect 832 * the current stacksize rlimit rather 833 * than the maximum possible stacksize. 834 * It would be better to adjust the 835 * mmap'ed region, but some apps do not check 836 * mmap's return value. 837 */ 838 PROC_LOCK(p); 839 p->p_vmspace->vm_maxsaddr = 840 (char *)LINUX32_USRSTACK - 841 lim_cur(p, RLIMIT_STACK); 842 PROC_UNLOCK(p); 843 } 844 845 /* This gives us our maximum stack size */ 846 if (linux_args->len > STACK_SIZE - GUARD_SIZE) 847 bsd_args.len = linux_args->len; 848 else 849 bsd_args.len = STACK_SIZE - GUARD_SIZE; 850 851 /* 852 * This gives us a new BOS. If we're using VM_STACK, then 853 * mmap will just map the top SGROWSIZ bytes, and let 854 * the stack grow down to the limit at BOS. If we're 855 * not using VM_STACK we map the full stack, since we 856 * don't have a way to autogrow it. 857 */ 858 bsd_args.addr -= bsd_args.len; 859 } else { 860 bsd_args.addr = (caddr_t)PTRIN(linux_args->addr); 861 bsd_args.len = linux_args->len; 862 } 863 864 /* 865 * We add PROT_EXEC to work around buggy applications (e.g. Java) 866 * that take advantage of the fact that execute permissions are not 867 * enforced by x86 CPUs. 868 */ 869 bsd_args.prot = linux_args->prot | PROT_EXEC; 870 if (linux_args->flags & LINUX_MAP_ANON) 871 bsd_args.fd = -1; 872 else { 873 /* 874 * Linux follows Solaris mmap(2) description: 875 * The file descriptor fildes is opened with 876 * read permission, regardless of the 877 * protection options specified. 878 * If PROT_WRITE is specified, the application 879 * must have opened the file descriptor 880 * fildes with write permission unless 881 * MAP_PRIVATE is specified in the flag 882 * argument as described below. 
883 */ 884 885 if ((error = fget(td, linux_args->fd, &fp)) != 0) 886 return (error); 887 if (fp->f_type != DTYPE_VNODE) { 888 fdrop(fp, td); 889 return (EINVAL); 890 } 891 892 /* Linux mmap() just fails for O_WRONLY files */ 893 if (! (fp->f_flag & FREAD)) { 894 fdrop(fp, td); 895 return (EACCES); 896 } 897 898 bsd_args.fd = linux_args->fd; 899 fdrop(fp, td); 900 } 901 bsd_args.pos = (off_t)linux_args->pgoff * PAGE_SIZE; 902 bsd_args.pad = 0; 903 904#ifdef DEBUG 905 if (ldebug(mmap)) 906 printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n", 907 __func__, 908 (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot, 909 bsd_args.flags, bsd_args.fd, (int)bsd_args.pos); 910#endif 911 error = mmap(td, &bsd_args); 912#ifdef DEBUG 913 if (ldebug(mmap)) 914 printf("-> %s() return: 0x%x (0x%08x)\n", 915 __func__, error, (u_int)td->td_retval[0]); 916#endif 917 return (error); 918} 919 920int 921linux_pipe(struct thread *td, struct linux_pipe_args *args) 922{ 923 int pip[2]; 924 int error; 925 register_t reg_rdx; 926 927#ifdef DEBUG 928 if (ldebug(pipe)) 929 printf(ARGS(pipe, "*")); 930#endif 931 932 reg_rdx = td->td_retval[1]; 933 error = pipe(td, 0); 934 if (error) { 935 td->td_retval[1] = reg_rdx; 936 return (error); 937 } 938 939 pip[0] = td->td_retval[0]; 940 pip[1] = td->td_retval[1]; 941 error = copyout(pip, args->pipefds, 2 * sizeof(int)); 942 if (error) { 943 td->td_retval[1] = reg_rdx; 944 return (error); 945 } 946 947 td->td_retval[1] = reg_rdx; 948 td->td_retval[0] = 0; 949 return (0); 950} 951 952int 953linux_sigaction(struct thread *td, struct linux_sigaction_args *args) 954{ 955 l_osigaction_t osa; 956 l_sigaction_t act, oact; 957 int error; 958 959#ifdef DEBUG 960 if (ldebug(sigaction)) 961 printf(ARGS(sigaction, "%d, %p, %p"), 962 args->sig, (void *)args->nsa, (void *)args->osa); 963#endif 964 965 if (args->nsa != NULL) { 966 error = copyin(args->nsa, &osa, sizeof(l_osigaction_t)); 967 if (error) 968 return (error); 969 act.lsa_handler = osa.lsa_handler; 970 
act.lsa_flags = osa.lsa_flags; 971 act.lsa_restorer = osa.lsa_restorer; 972 LINUX_SIGEMPTYSET(act.lsa_mask); 973 act.lsa_mask.__bits[0] = osa.lsa_mask; 974 } 975 976 error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL, 977 args->osa ? &oact : NULL); 978 979 if (args->osa != NULL && !error) { 980 osa.lsa_handler = oact.lsa_handler; 981 osa.lsa_flags = oact.lsa_flags; 982 osa.lsa_restorer = oact.lsa_restorer; 983 osa.lsa_mask = oact.lsa_mask.__bits[0]; 984 error = copyout(&osa, args->osa, sizeof(l_osigaction_t)); 985 } 986 987 return (error); 988} 989 990/* 991 * Linux has two extra args, restart and oldmask. We dont use these, 992 * but it seems that "restart" is actually a context pointer that 993 * enables the signal to happen with a different register set. 994 */ 995int 996linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args) 997{ 998 sigset_t sigmask; 999 l_sigset_t mask; 1000 1001#ifdef DEBUG 1002 if (ldebug(sigsuspend)) 1003 printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask); 1004#endif 1005 1006 LINUX_SIGEMPTYSET(mask); 1007 mask.__bits[0] = args->mask; 1008 linux_to_bsd_sigset(&mask, &sigmask); 1009 return (kern_sigsuspend(td, sigmask)); 1010} 1011 1012int 1013linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap) 1014{ 1015 l_sigset_t lmask; 1016 sigset_t sigmask; 1017 int error; 1018 1019#ifdef DEBUG 1020 if (ldebug(rt_sigsuspend)) 1021 printf(ARGS(rt_sigsuspend, "%p, %d"), 1022 (void *)uap->newset, uap->sigsetsize); 1023#endif 1024 1025 if (uap->sigsetsize != sizeof(l_sigset_t)) 1026 return (EINVAL); 1027 1028 error = copyin(uap->newset, &lmask, sizeof(l_sigset_t)); 1029 if (error) 1030 return (error); 1031 1032 linux_to_bsd_sigset(&lmask, &sigmask); 1033 return (kern_sigsuspend(td, sigmask)); 1034} 1035 1036int 1037linux_pause(struct thread *td, struct linux_pause_args *args) 1038{ 1039 struct proc *p = td->td_proc; 1040 sigset_t sigmask; 1041 1042#ifdef DEBUG 1043 if (ldebug(pause)) 1044 
printf(ARGS(pause, "")); 1045#endif 1046 1047 PROC_LOCK(p); 1048 sigmask = td->td_sigmask; 1049 PROC_UNLOCK(p); 1050 return (kern_sigsuspend(td, sigmask)); 1051} 1052 1053int 1054linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap) 1055{ 1056 stack_t ss, oss; 1057 l_stack_t lss; 1058 int error; 1059 1060#ifdef DEBUG 1061 if (ldebug(sigaltstack)) 1062 printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss); 1063#endif 1064 1065 if (uap->uss != NULL) { 1066 error = copyin(uap->uss, &lss, sizeof(l_stack_t)); 1067 if (error) 1068 return (error); 1069 1070 ss.ss_sp = PTRIN(lss.ss_sp); 1071 ss.ss_size = lss.ss_size; 1072 ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags); 1073 } 1074 error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL, 1075 (uap->uoss != NULL) ? &oss : NULL); 1076 if (!error && uap->uoss != NULL) { 1077 lss.ss_sp = PTROUT(oss.ss_sp); 1078 lss.ss_size = oss.ss_size; 1079 lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags); 1080 error = copyout(&lss, uap->uoss, sizeof(l_stack_t)); 1081 } 1082 1083 return (error); 1084} 1085 1086int 1087linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args) 1088{ 1089 struct ftruncate_args sa; 1090 1091#ifdef DEBUG 1092 if (ldebug(ftruncate64)) 1093 printf(ARGS(ftruncate64, "%u, %jd"), args->fd, 1094 (intmax_t)args->length); 1095#endif 1096 1097 sa.fd = args->fd; 1098 sa.pad = 0; 1099 sa.length = args->length; 1100 return ftruncate(td, &sa); 1101} 1102 1103int 1104linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap) 1105{ 1106 struct timeval atv; 1107 l_timeval atv32; 1108 struct timezone rtz; 1109 int error = 0; 1110 1111 if (uap->tp) { 1112 microtime(&atv); 1113 atv32.tv_sec = atv.tv_sec; 1114 atv32.tv_usec = atv.tv_usec; 1115 error = copyout(&atv32, uap->tp, sizeof (atv32)); 1116 } 1117 if (error == 0 && uap->tzp != NULL) { 1118 rtz.tz_minuteswest = tz_minuteswest; 1119 rtz.tz_dsttime = tz_dsttime; 1120 error = copyout(&rtz, uap->tzp, sizeof 
(rtz)); 1121 } 1122 return (error); 1123} 1124 1125int 1126linux_getrusage(struct thread *td, struct linux_getrusage_args *uap) 1127{ 1128 struct l_rusage s32; 1129 struct rusage s; 1130 int error; 1131 1132 error = kern_getrusage(td, uap->who, &s); 1133 if (error != 0) 1134 return (error); 1135 if (uap->rusage != NULL) { 1136 s32.ru_utime.tv_sec = s.ru_utime.tv_sec; 1137 s32.ru_utime.tv_usec = s.ru_utime.tv_usec; 1138 s32.ru_stime.tv_sec = s.ru_stime.tv_sec; 1139 s32.ru_stime.tv_usec = s.ru_stime.tv_usec; 1140 s32.ru_maxrss = s.ru_maxrss; 1141 s32.ru_ixrss = s.ru_ixrss; 1142 s32.ru_idrss = s.ru_idrss; 1143 s32.ru_isrss = s.ru_isrss; 1144 s32.ru_minflt = s.ru_minflt; 1145 s32.ru_majflt = s.ru_majflt; 1146 s32.ru_nswap = s.ru_nswap; 1147 s32.ru_inblock = s.ru_inblock; 1148 s32.ru_oublock = s.ru_oublock; 1149 s32.ru_msgsnd = s.ru_msgsnd; 1150 s32.ru_msgrcv = s.ru_msgrcv; 1151 s32.ru_nsignals = s.ru_nsignals; 1152 s32.ru_nvcsw = s.ru_nvcsw; 1153 s32.ru_nivcsw = s.ru_nivcsw; 1154 error = copyout(&s32, uap->rusage, sizeof(s32)); 1155 } 1156 return (error); 1157} 1158 1159int 1160linux_sched_rr_get_interval(struct thread *td, 1161 struct linux_sched_rr_get_interval_args *uap) 1162{ 1163 struct timespec ts; 1164 struct l_timespec ts32; 1165 int error; 1166 1167 error = kern_sched_rr_get_interval(td, uap->pid, &ts); 1168 if (error != 0) 1169 return (error); 1170 ts32.tv_sec = ts.tv_sec; 1171 ts32.tv_nsec = ts.tv_nsec; 1172 return (copyout(&ts32, uap->interval, sizeof(ts32))); 1173} 1174 1175int 1176linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) 1177{ 1178 struct mprotect_args bsd_args; 1179 1180 bsd_args.addr = uap->addr; 1181 bsd_args.len = uap->len; 1182 bsd_args.prot = uap->prot; 1183 /* XXX PROT_READ implies PROT_EXEC; see linux_mmap_common(). */ 1184 if ((bsd_args.prot & PROT_READ) != 0) 1185 bsd_args.prot |= PROT_EXEC; 1186 return (mprotect(td, &bsd_args)); 1187} 1188