/* linux32_machdep.c revision 218030 */
1/*- 2 * Copyright (c) 2004 Tim J. Robbins 3 * Copyright (c) 2002 Doug Rabson 4 * Copyright (c) 2000 Marcel Moolenaar 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer 12 * in this position and unchanged. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. The name of the author may not be used to endorse or promote products 17 * derived from this software without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 */ 30 31#include <sys/cdefs.h> 32__FBSDID("$FreeBSD: head/sys/amd64/linux32/linux32_machdep.c 218030 2011-01-28 18:47:07Z dchagin $"); 33 34#include <sys/param.h> 35#include <sys/kernel.h> 36#include <sys/systm.h> 37#include <sys/file.h> 38#include <sys/fcntl.h> 39#include <sys/clock.h> 40#include <sys/imgact.h> 41#include <sys/limits.h> 42#include <sys/lock.h> 43#include <sys/malloc.h> 44#include <sys/mman.h> 45#include <sys/mutex.h> 46#include <sys/priv.h> 47#include <sys/proc.h> 48#include <sys/resource.h> 49#include <sys/resourcevar.h> 50#include <sys/sched.h> 51#include <sys/syscallsubr.h> 52#include <sys/sysproto.h> 53#include <sys/unistd.h> 54#include <sys/wait.h> 55 56#include <machine/frame.h> 57#include <machine/pcb.h> 58#include <machine/psl.h> 59#include <machine/segments.h> 60#include <machine/specialreg.h> 61 62#include <vm/vm.h> 63#include <vm/pmap.h> 64#include <vm/vm_map.h> 65 66#include <compat/freebsd32/freebsd32_util.h> 67#include <amd64/linux32/linux.h> 68#include <amd64/linux32/linux32_proto.h> 69#include <compat/linux/linux_ipc.h> 70#include <compat/linux/linux_misc.h> 71#include <compat/linux/linux_signal.h> 72#include <compat/linux/linux_util.h> 73#include <compat/linux/linux_emul.h> 74 75struct l_old_select_argv { 76 l_int nfds; 77 l_uintptr_t readfds; 78 l_uintptr_t writefds; 79 l_uintptr_t exceptfds; 80 l_uintptr_t timeout; 81} __packed; 82 83int 84linux_to_bsd_sigaltstack(int lsa) 85{ 86 int bsa = 0; 87 88 if (lsa & LINUX_SS_DISABLE) 89 bsa |= SS_DISABLE; 90 if (lsa & LINUX_SS_ONSTACK) 91 bsa |= SS_ONSTACK; 92 return (bsa); 93} 94 95static int linux_mmap_common(struct thread *td, l_uintptr_t addr, 96 l_size_t len, l_int prot, l_int flags, l_int fd, 97 l_loff_t pos); 98 99int 100bsd_to_linux_sigaltstack(int bsa) 101{ 102 int lsa = 0; 103 104 if (bsa & SS_DISABLE) 105 lsa |= LINUX_SS_DISABLE; 106 if (bsa & SS_ONSTACK) 107 lsa |= LINUX_SS_ONSTACK; 108 return (lsa); 109} 110 111static void bsd_to_linux_rusage(struct rusage *ru, struct 
l_rusage *lru) 112{ 113 lru->ru_utime.tv_sec = ru->ru_utime.tv_sec; 114 lru->ru_utime.tv_usec = ru->ru_utime.tv_usec; 115 lru->ru_stime.tv_sec = ru->ru_stime.tv_sec; 116 lru->ru_stime.tv_usec = ru->ru_stime.tv_usec; 117 lru->ru_maxrss = ru->ru_maxrss; 118 lru->ru_ixrss = ru->ru_ixrss; 119 lru->ru_idrss = ru->ru_idrss; 120 lru->ru_isrss = ru->ru_isrss; 121 lru->ru_minflt = ru->ru_minflt; 122 lru->ru_majflt = ru->ru_majflt; 123 lru->ru_nswap = ru->ru_nswap; 124 lru->ru_inblock = ru->ru_inblock; 125 lru->ru_oublock = ru->ru_oublock; 126 lru->ru_msgsnd = ru->ru_msgsnd; 127 lru->ru_msgrcv = ru->ru_msgrcv; 128 lru->ru_nsignals = ru->ru_nsignals; 129 lru->ru_nvcsw = ru->ru_nvcsw; 130 lru->ru_nivcsw = ru->ru_nivcsw; 131} 132 133int 134linux_execve(struct thread *td, struct linux_execve_args *args) 135{ 136 struct image_args eargs; 137 char *path; 138 int error; 139 140 LCONVPATHEXIST(td, args->path, &path); 141 142#ifdef DEBUG 143 if (ldebug(execve)) 144 printf(ARGS(execve, "%s"), path); 145#endif 146 147 error = freebsd32_exec_copyin_args(&eargs, path, UIO_SYSSPACE, 148 args->argp, args->envp); 149 free(path, M_TEMP); 150 if (error == 0) 151 error = kern_execve(td, &eargs, NULL); 152 if (error == 0) 153 /* Linux process can execute FreeBSD one, do not attempt 154 * to create emuldata for such process using 155 * linux_proc_init, this leads to a panic on KASSERT 156 * because such process has p->p_emuldata == NULL. 
157 */ 158 if (SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX) 159 error = linux_proc_init(td, 0, 0); 160 return (error); 161} 162 163CTASSERT(sizeof(struct l_iovec32) == 8); 164 165static int 166linux32_copyinuio(struct l_iovec32 *iovp, l_ulong iovcnt, struct uio **uiop) 167{ 168 struct l_iovec32 iov32; 169 struct iovec *iov; 170 struct uio *uio; 171 uint32_t iovlen; 172 int error, i; 173 174 *uiop = NULL; 175 if (iovcnt > UIO_MAXIOV) 176 return (EINVAL); 177 iovlen = iovcnt * sizeof(struct iovec); 178 uio = malloc(iovlen + sizeof(*uio), M_IOV, M_WAITOK); 179 iov = (struct iovec *)(uio + 1); 180 for (i = 0; i < iovcnt; i++) { 181 error = copyin(&iovp[i], &iov32, sizeof(struct l_iovec32)); 182 if (error) { 183 free(uio, M_IOV); 184 return (error); 185 } 186 iov[i].iov_base = PTRIN(iov32.iov_base); 187 iov[i].iov_len = iov32.iov_len; 188 } 189 uio->uio_iov = iov; 190 uio->uio_iovcnt = iovcnt; 191 uio->uio_segflg = UIO_USERSPACE; 192 uio->uio_offset = -1; 193 uio->uio_resid = 0; 194 for (i = 0; i < iovcnt; i++) { 195 if (iov->iov_len > INT_MAX - uio->uio_resid) { 196 free(uio, M_IOV); 197 return (EINVAL); 198 } 199 uio->uio_resid += iov->iov_len; 200 iov++; 201 } 202 *uiop = uio; 203 return (0); 204} 205 206int 207linux32_copyiniov(struct l_iovec32 *iovp32, l_ulong iovcnt, struct iovec **iovp, 208 int error) 209{ 210 struct l_iovec32 iov32; 211 struct iovec *iov; 212 uint32_t iovlen; 213 int i; 214 215 *iovp = NULL; 216 if (iovcnt > UIO_MAXIOV) 217 return (error); 218 iovlen = iovcnt * sizeof(struct iovec); 219 iov = malloc(iovlen, M_IOV, M_WAITOK); 220 for (i = 0; i < iovcnt; i++) { 221 error = copyin(&iovp32[i], &iov32, sizeof(struct l_iovec32)); 222 if (error) { 223 free(iov, M_IOV); 224 return (error); 225 } 226 iov[i].iov_base = PTRIN(iov32.iov_base); 227 iov[i].iov_len = iov32.iov_len; 228 } 229 *iovp = iov; 230 return(0); 231 232} 233 234int 235linux_readv(struct thread *td, struct linux_readv_args *uap) 236{ 237 struct uio *auio; 238 int error; 239 240 error = 
linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); 241 if (error) 242 return (error); 243 error = kern_readv(td, uap->fd, auio); 244 free(auio, M_IOV); 245 return (error); 246} 247 248int 249linux_writev(struct thread *td, struct linux_writev_args *uap) 250{ 251 struct uio *auio; 252 int error; 253 254 error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); 255 if (error) 256 return (error); 257 error = kern_writev(td, uap->fd, auio); 258 free(auio, M_IOV); 259 return (error); 260} 261 262struct l_ipc_kludge { 263 l_uintptr_t msgp; 264 l_long msgtyp; 265} __packed; 266 267int 268linux_ipc(struct thread *td, struct linux_ipc_args *args) 269{ 270 271 switch (args->what & 0xFFFF) { 272 case LINUX_SEMOP: { 273 struct linux_semop_args a; 274 275 a.semid = args->arg1; 276 a.tsops = args->ptr; 277 a.nsops = args->arg2; 278 return (linux_semop(td, &a)); 279 } 280 case LINUX_SEMGET: { 281 struct linux_semget_args a; 282 283 a.key = args->arg1; 284 a.nsems = args->arg2; 285 a.semflg = args->arg3; 286 return (linux_semget(td, &a)); 287 } 288 case LINUX_SEMCTL: { 289 struct linux_semctl_args a; 290 int error; 291 292 a.semid = args->arg1; 293 a.semnum = args->arg2; 294 a.cmd = args->arg3; 295 error = copyin(args->ptr, &a.arg, sizeof(a.arg)); 296 if (error) 297 return (error); 298 return (linux_semctl(td, &a)); 299 } 300 case LINUX_MSGSND: { 301 struct linux_msgsnd_args a; 302 303 a.msqid = args->arg1; 304 a.msgp = args->ptr; 305 a.msgsz = args->arg2; 306 a.msgflg = args->arg3; 307 return (linux_msgsnd(td, &a)); 308 } 309 case LINUX_MSGRCV: { 310 struct linux_msgrcv_args a; 311 312 a.msqid = args->arg1; 313 a.msgsz = args->arg2; 314 a.msgflg = args->arg3; 315 if ((args->what >> 16) == 0) { 316 struct l_ipc_kludge tmp; 317 int error; 318 319 if (args->ptr == 0) 320 return (EINVAL); 321 error = copyin(args->ptr, &tmp, sizeof(tmp)); 322 if (error) 323 return (error); 324 a.msgp = PTRIN(tmp.msgp); 325 a.msgtyp = tmp.msgtyp; 326 } else { 327 a.msgp = args->ptr; 328 a.msgtyp = 
args->arg5; 329 } 330 return (linux_msgrcv(td, &a)); 331 } 332 case LINUX_MSGGET: { 333 struct linux_msgget_args a; 334 335 a.key = args->arg1; 336 a.msgflg = args->arg2; 337 return (linux_msgget(td, &a)); 338 } 339 case LINUX_MSGCTL: { 340 struct linux_msgctl_args a; 341 342 a.msqid = args->arg1; 343 a.cmd = args->arg2; 344 a.buf = args->ptr; 345 return (linux_msgctl(td, &a)); 346 } 347 case LINUX_SHMAT: { 348 struct linux_shmat_args a; 349 350 a.shmid = args->arg1; 351 a.shmaddr = args->ptr; 352 a.shmflg = args->arg2; 353 a.raddr = PTRIN((l_uint)args->arg3); 354 return (linux_shmat(td, &a)); 355 } 356 case LINUX_SHMDT: { 357 struct linux_shmdt_args a; 358 359 a.shmaddr = args->ptr; 360 return (linux_shmdt(td, &a)); 361 } 362 case LINUX_SHMGET: { 363 struct linux_shmget_args a; 364 365 a.key = args->arg1; 366 a.size = args->arg2; 367 a.shmflg = args->arg3; 368 return (linux_shmget(td, &a)); 369 } 370 case LINUX_SHMCTL: { 371 struct linux_shmctl_args a; 372 373 a.shmid = args->arg1; 374 a.cmd = args->arg2; 375 a.buf = args->ptr; 376 return (linux_shmctl(td, &a)); 377 } 378 default: 379 break; 380 } 381 382 return (EINVAL); 383} 384 385int 386linux_old_select(struct thread *td, struct linux_old_select_args *args) 387{ 388 struct l_old_select_argv linux_args; 389 struct linux_select_args newsel; 390 int error; 391 392#ifdef DEBUG 393 if (ldebug(old_select)) 394 printf(ARGS(old_select, "%p"), args->ptr); 395#endif 396 397 error = copyin(args->ptr, &linux_args, sizeof(linux_args)); 398 if (error) 399 return (error); 400 401 newsel.nfds = linux_args.nfds; 402 newsel.readfds = PTRIN(linux_args.readfds); 403 newsel.writefds = PTRIN(linux_args.writefds); 404 newsel.exceptfds = PTRIN(linux_args.exceptfds); 405 newsel.timeout = PTRIN(linux_args.timeout); 406 return (linux_select(td, &newsel)); 407} 408 409int 410linux_fork(struct thread *td, struct linux_fork_args *args) 411{ 412 int error; 413 struct proc *p2; 414 struct thread *td2; 415 416#ifdef DEBUG 417 if 
(ldebug(fork)) 418 printf(ARGS(fork, "")); 419#endif 420 421 if ((error = fork1(td, RFFDG | RFPROC | RFSTOPPED, 0, &p2)) != 0) 422 return (error); 423 424 if (error == 0) { 425 td->td_retval[0] = p2->p_pid; 426 td->td_retval[1] = 0; 427 } 428 429 if (td->td_retval[1] == 1) 430 td->td_retval[0] = 0; 431 error = linux_proc_init(td, td->td_retval[0], 0); 432 if (error) 433 return (error); 434 435 td2 = FIRST_THREAD_IN_PROC(p2); 436 437 /* 438 * Make this runnable after we are finished with it. 439 */ 440 thread_lock(td2); 441 TD_SET_CAN_RUN(td2); 442 sched_add(td2, SRQ_BORING); 443 thread_unlock(td2); 444 445 return (0); 446} 447 448int 449linux_vfork(struct thread *td, struct linux_vfork_args *args) 450{ 451 int error; 452 struct proc *p2; 453 struct thread *td2; 454 455#ifdef DEBUG 456 if (ldebug(vfork)) 457 printf(ARGS(vfork, "")); 458#endif 459 460 /* Exclude RFPPWAIT */ 461 if ((error = fork1(td, RFFDG | RFPROC | RFMEM | RFSTOPPED, 0, &p2)) != 0) 462 return (error); 463 if (error == 0) { 464 td->td_retval[0] = p2->p_pid; 465 td->td_retval[1] = 0; 466 } 467 /* Are we the child? */ 468 if (td->td_retval[1] == 1) 469 td->td_retval[0] = 0; 470 error = linux_proc_init(td, td->td_retval[0], 0); 471 if (error) 472 return (error); 473 474 PROC_LOCK(p2); 475 p2->p_flag |= P_PPWAIT; 476 PROC_UNLOCK(p2); 477 478 td2 = FIRST_THREAD_IN_PROC(p2); 479 480 /* 481 * Make this runnable after we are finished with it. 482 */ 483 thread_lock(td2); 484 TD_SET_CAN_RUN(td2); 485 sched_add(td2, SRQ_BORING); 486 thread_unlock(td2); 487 488 /* wait for the children to exit, ie. 
emulate vfork */ 489 PROC_LOCK(p2); 490 while (p2->p_flag & P_PPWAIT) 491 cv_wait(&p2->p_pwait, &p2->p_mtx); 492 PROC_UNLOCK(p2); 493 494 return (0); 495} 496 497int 498linux_clone(struct thread *td, struct linux_clone_args *args) 499{ 500 int error, ff = RFPROC | RFSTOPPED; 501 struct proc *p2; 502 struct thread *td2; 503 int exit_signal; 504 struct linux_emuldata *em; 505 506#ifdef DEBUG 507 if (ldebug(clone)) { 508 printf(ARGS(clone, "flags %x, stack %p, parent tid: %p, " 509 "child tid: %p"), (unsigned)args->flags, 510 args->stack, args->parent_tidptr, args->child_tidptr); 511 } 512#endif 513 514 exit_signal = args->flags & 0x000000ff; 515 if (LINUX_SIG_VALID(exit_signal)) { 516 if (exit_signal <= LINUX_SIGTBLSZ) 517 exit_signal = 518 linux_to_bsd_signal[_SIG_IDX(exit_signal)]; 519 } else if (exit_signal != 0) 520 return (EINVAL); 521 522 if (args->flags & LINUX_CLONE_VM) 523 ff |= RFMEM; 524 if (args->flags & LINUX_CLONE_SIGHAND) 525 ff |= RFSIGSHARE; 526 /* 527 * XXX: In Linux, sharing of fs info (chroot/cwd/umask) 528 * and open files is independant. In FreeBSD, its in one 529 * structure but in reality it does not cause any problems 530 * because both of these flags are usually set together. 531 */ 532 if (!(args->flags & (LINUX_CLONE_FILES | LINUX_CLONE_FS))) 533 ff |= RFFDG; 534 535 /* 536 * Attempt to detect when linux_clone(2) is used for creating 537 * kernel threads. Unfortunately despite the existence of the 538 * CLONE_THREAD flag, version of linuxthreads package used in 539 * most popular distros as of beginning of 2005 doesn't make 540 * any use of it. Therefore, this detection relies on 541 * empirical observation that linuxthreads sets certain 542 * combination of flags, so that we can make more or less 543 * precise detection and notify the FreeBSD kernel that several 544 * processes are in fact part of the same threading group, so 545 * that special treatment is necessary for signal delivery 546 * between those processes and fd locking. 
547 */ 548 if ((args->flags & 0xffffff00) == LINUX_THREADING_FLAGS) 549 ff |= RFTHREAD; 550 551 if (args->flags & LINUX_CLONE_PARENT_SETTID) 552 if (args->parent_tidptr == NULL) 553 return (EINVAL); 554 555 error = fork1(td, ff, 0, &p2); 556 if (error) 557 return (error); 558 559 if (args->flags & (LINUX_CLONE_PARENT | LINUX_CLONE_THREAD)) { 560 sx_xlock(&proctree_lock); 561 PROC_LOCK(p2); 562 proc_reparent(p2, td->td_proc->p_pptr); 563 PROC_UNLOCK(p2); 564 sx_xunlock(&proctree_lock); 565 } 566 567 /* create the emuldata */ 568 error = linux_proc_init(td, p2->p_pid, args->flags); 569 /* reference it - no need to check this */ 570 em = em_find(p2, EMUL_DOLOCK); 571 KASSERT(em != NULL, ("clone: emuldata not found.\n")); 572 /* and adjust it */ 573 574 if (args->flags & LINUX_CLONE_THREAD) { 575#ifdef notyet 576 PROC_LOCK(p2); 577 p2->p_pgrp = td->td_proc->p_pgrp; 578 PROC_UNLOCK(p2); 579#endif 580 exit_signal = 0; 581 } 582 583 if (args->flags & LINUX_CLONE_CHILD_SETTID) 584 em->child_set_tid = args->child_tidptr; 585 else 586 em->child_set_tid = NULL; 587 588 if (args->flags & LINUX_CLONE_CHILD_CLEARTID) 589 em->child_clear_tid = args->child_tidptr; 590 else 591 em->child_clear_tid = NULL; 592 593 EMUL_UNLOCK(&emul_lock); 594 595 if (args->flags & LINUX_CLONE_PARENT_SETTID) { 596 error = copyout(&p2->p_pid, args->parent_tidptr, 597 sizeof(p2->p_pid)); 598 if (error) 599 printf(LMSG("copyout failed!")); 600 } 601 602 PROC_LOCK(p2); 603 p2->p_sigparent = exit_signal; 604 PROC_UNLOCK(p2); 605 td2 = FIRST_THREAD_IN_PROC(p2); 606 /* 607 * In a case of stack = NULL, we are supposed to COW calling process 608 * stack. This is what normal fork() does, so we just keep tf_rsp arg 609 * intact. 
610 */ 611 if (args->stack) 612 td2->td_frame->tf_rsp = PTROUT(args->stack); 613 614 if (args->flags & LINUX_CLONE_SETTLS) { 615 struct user_segment_descriptor sd; 616 struct l_user_desc info; 617 struct pcb *pcb; 618 int a[2]; 619 620 error = copyin((void *)td->td_frame->tf_rsi, &info, 621 sizeof(struct l_user_desc)); 622 if (error) { 623 printf(LMSG("copyin failed!")); 624 } else { 625 /* We might copy out the entry_number as GUGS32_SEL. */ 626 info.entry_number = GUGS32_SEL; 627 error = copyout(&info, (void *)td->td_frame->tf_rsi, 628 sizeof(struct l_user_desc)); 629 if (error) 630 printf(LMSG("copyout failed!")); 631 632 a[0] = LINUX_LDT_entry_a(&info); 633 a[1] = LINUX_LDT_entry_b(&info); 634 635 memcpy(&sd, &a, sizeof(a)); 636#ifdef DEBUG 637 if (ldebug(clone)) 638 printf("Segment created in clone with " 639 "CLONE_SETTLS: lobase: %x, hibase: %x, " 640 "lolimit: %x, hilimit: %x, type: %i, " 641 "dpl: %i, p: %i, xx: %i, long: %i, " 642 "def32: %i, gran: %i\n", sd.sd_lobase, 643 sd.sd_hibase, sd.sd_lolimit, sd.sd_hilimit, 644 sd.sd_type, sd.sd_dpl, sd.sd_p, sd.sd_xx, 645 sd.sd_long, sd.sd_def32, sd.sd_gran); 646#endif 647 pcb = td2->td_pcb; 648 pcb->pcb_gsbase = (register_t)info.base_addr; 649/* XXXKIB pcb->pcb_gs32sd = sd; */ 650 td2->td_frame->tf_gs = GSEL(GUGS32_SEL, SEL_UPL); 651 set_pcb_flags(pcb, PCB_GS32BIT | PCB_32BIT); 652 } 653 } 654 655#ifdef DEBUG 656 if (ldebug(clone)) 657 printf(LMSG("clone: successful rfork to %d, " 658 "stack %p sig = %d"), (int)p2->p_pid, args->stack, 659 exit_signal); 660#endif 661 if (args->flags & LINUX_CLONE_VFORK) { 662 PROC_LOCK(p2); 663 p2->p_flag |= P_PPWAIT; 664 PROC_UNLOCK(p2); 665 } 666 667 /* 668 * Make this runnable after we are finished with it. 669 */ 670 thread_lock(td2); 671 TD_SET_CAN_RUN(td2); 672 sched_add(td2, SRQ_BORING); 673 thread_unlock(td2); 674 675 td->td_retval[0] = p2->p_pid; 676 td->td_retval[1] = 0; 677 678 if (args->flags & LINUX_CLONE_VFORK) { 679 /* wait for the children to exit, ie. 
emulate vfork */ 680 PROC_LOCK(p2); 681 while (p2->p_flag & P_PPWAIT) 682 cv_wait(&p2->p_pwait, &p2->p_mtx); 683 PROC_UNLOCK(p2); 684 } 685 686 return (0); 687} 688 689#define STACK_SIZE (2 * 1024 * 1024) 690#define GUARD_SIZE (4 * PAGE_SIZE) 691 692int 693linux_mmap2(struct thread *td, struct linux_mmap2_args *args) 694{ 695 696#ifdef DEBUG 697 if (ldebug(mmap2)) 698 printf(ARGS(mmap2, "0x%08x, %d, %d, 0x%08x, %d, %d"), 699 args->addr, args->len, args->prot, 700 args->flags, args->fd, args->pgoff); 701#endif 702 703 return (linux_mmap_common(td, PTROUT(args->addr), args->len, args->prot, 704 args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff * 705 PAGE_SIZE)); 706} 707 708int 709linux_mmap(struct thread *td, struct linux_mmap_args *args) 710{ 711 int error; 712 struct l_mmap_argv linux_args; 713 714 error = copyin(args->ptr, &linux_args, sizeof(linux_args)); 715 if (error) 716 return (error); 717 718#ifdef DEBUG 719 if (ldebug(mmap)) 720 printf(ARGS(mmap, "0x%08x, %d, %d, 0x%08x, %d, %d"), 721 linux_args.addr, linux_args.len, linux_args.prot, 722 linux_args.flags, linux_args.fd, linux_args.pgoff); 723#endif 724 725 return (linux_mmap_common(td, linux_args.addr, linux_args.len, 726 linux_args.prot, linux_args.flags, linux_args.fd, 727 (uint32_t)linux_args.pgoff)); 728} 729 730static int 731linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot, 732 l_int flags, l_int fd, l_loff_t pos) 733{ 734 struct proc *p = td->td_proc; 735 struct mmap_args /* { 736 caddr_t addr; 737 size_t len; 738 int prot; 739 int flags; 740 int fd; 741 long pad; 742 off_t pos; 743 } */ bsd_args; 744 int error; 745 struct file *fp; 746 747 error = 0; 748 bsd_args.flags = 0; 749 fp = NULL; 750 751 /* 752 * Linux mmap(2): 753 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE 754 */ 755 if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE))) 756 return (EINVAL); 757 758 if (flags & LINUX_MAP_SHARED) 759 bsd_args.flags |= MAP_SHARED; 760 if (flags 
& LINUX_MAP_PRIVATE) 761 bsd_args.flags |= MAP_PRIVATE; 762 if (flags & LINUX_MAP_FIXED) 763 bsd_args.flags |= MAP_FIXED; 764 if (flags & LINUX_MAP_ANON) { 765 /* Enforce pos to be on page boundary, then ignore. */ 766 if ((pos & PAGE_MASK) != 0) 767 return (EINVAL); 768 pos = 0; 769 bsd_args.flags |= MAP_ANON; 770 } else 771 bsd_args.flags |= MAP_NOSYNC; 772 if (flags & LINUX_MAP_GROWSDOWN) 773 bsd_args.flags |= MAP_STACK; 774 775 /* 776 * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC 777 * on Linux/i386. We do this to ensure maximum compatibility. 778 * Linux/ia64 does the same in i386 emulation mode. 779 */ 780 bsd_args.prot = prot; 781 if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) 782 bsd_args.prot |= PROT_READ | PROT_EXEC; 783 784 /* Linux does not check file descriptor when MAP_ANONYMOUS is set. */ 785 bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd; 786 if (bsd_args.fd != -1) { 787 /* 788 * Linux follows Solaris mmap(2) description: 789 * The file descriptor fildes is opened with 790 * read permission, regardless of the 791 * protection options specified. 792 */ 793 794 if ((error = fget(td, bsd_args.fd, &fp)) != 0) 795 return (error); 796 if (fp->f_type != DTYPE_VNODE) { 797 fdrop(fp, td); 798 return (EINVAL); 799 } 800 801 /* Linux mmap() just fails for O_WRONLY files */ 802 if (!(fp->f_flag & FREAD)) { 803 fdrop(fp, td); 804 return (EACCES); 805 } 806 807 fdrop(fp, td); 808 } 809 810 if (flags & LINUX_MAP_GROWSDOWN) { 811 /* 812 * The Linux MAP_GROWSDOWN option does not limit auto 813 * growth of the region. Linux mmap with this option 814 * takes as addr the inital BOS, and as len, the initial 815 * region size. It can then grow down from addr without 816 * limit. However, Linux threads has an implicit internal 817 * limit to stack size of STACK_SIZE. Its just not 818 * enforced explicitly in Linux. 
But, here we impose 819 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack 820 * region, since we can do this with our mmap. 821 * 822 * Our mmap with MAP_STACK takes addr as the maximum 823 * downsize limit on BOS, and as len the max size of 824 * the region. It then maps the top SGROWSIZ bytes, 825 * and auto grows the region down, up to the limit 826 * in addr. 827 * 828 * If we don't use the MAP_STACK option, the effect 829 * of this code is to allocate a stack region of a 830 * fixed size of (STACK_SIZE - GUARD_SIZE). 831 */ 832 833 if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) { 834 /* 835 * Some Linux apps will attempt to mmap 836 * thread stacks near the top of their 837 * address space. If their TOS is greater 838 * than vm_maxsaddr, vm_map_growstack() 839 * will confuse the thread stack with the 840 * process stack and deliver a SEGV if they 841 * attempt to grow the thread stack past their 842 * current stacksize rlimit. To avoid this, 843 * adjust vm_maxsaddr upwards to reflect 844 * the current stacksize rlimit rather 845 * than the maximum possible stacksize. 846 * It would be better to adjust the 847 * mmap'ed region, but some apps do not check 848 * mmap's return value. 849 */ 850 PROC_LOCK(p); 851 p->p_vmspace->vm_maxsaddr = (char *)LINUX32_USRSTACK - 852 lim_cur(p, RLIMIT_STACK); 853 PROC_UNLOCK(p); 854 } 855 856 /* 857 * This gives us our maximum stack size and a new BOS. 858 * If we're using VM_STACK, then mmap will just map 859 * the top SGROWSIZ bytes, and let the stack grow down 860 * to the limit at BOS. If we're not using VM_STACK 861 * we map the full stack, since we don't have a way 862 * to autogrow it. 
863 */ 864 if (len > STACK_SIZE - GUARD_SIZE) { 865 bsd_args.addr = (caddr_t)PTRIN(addr); 866 bsd_args.len = len; 867 } else { 868 bsd_args.addr = (caddr_t)PTRIN(addr) - 869 (STACK_SIZE - GUARD_SIZE - len); 870 bsd_args.len = STACK_SIZE - GUARD_SIZE; 871 } 872 } else { 873 bsd_args.addr = (caddr_t)PTRIN(addr); 874 bsd_args.len = len; 875 } 876 bsd_args.pos = pos; 877 878#ifdef DEBUG 879 if (ldebug(mmap)) 880 printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n", 881 __func__, 882 (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot, 883 bsd_args.flags, bsd_args.fd, (int)bsd_args.pos); 884#endif 885 error = mmap(td, &bsd_args); 886#ifdef DEBUG 887 if (ldebug(mmap)) 888 printf("-> %s() return: 0x%x (0x%08x)\n", 889 __func__, error, (u_int)td->td_retval[0]); 890#endif 891 return (error); 892} 893 894int 895linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) 896{ 897 struct mprotect_args bsd_args; 898 899 bsd_args.addr = uap->addr; 900 bsd_args.len = uap->len; 901 bsd_args.prot = uap->prot; 902 if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) 903 bsd_args.prot |= PROT_READ | PROT_EXEC; 904 return (mprotect(td, &bsd_args)); 905} 906 907int 908linux_iopl(struct thread *td, struct linux_iopl_args *args) 909{ 910 int error; 911 912 if (args->level < 0 || args->level > 3) 913 return (EINVAL); 914 if ((error = priv_check(td, PRIV_IO)) != 0) 915 return (error); 916 if ((error = securelevel_gt(td->td_ucred, 0)) != 0) 917 return (error); 918 td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) | 919 (args->level * (PSL_IOPL / 3)); 920 921 return (0); 922} 923 924int 925linux_pipe(struct thread *td, struct linux_pipe_args *args) 926{ 927 int error; 928 int fildes[2]; 929 930#ifdef DEBUG 931 if (ldebug(pipe)) 932 printf(ARGS(pipe, "*")); 933#endif 934 935 error = kern_pipe(td, fildes); 936 if (error) 937 return (error); 938 939 /* XXX: Close descriptors on error. 
*/ 940 return (copyout(fildes, args->pipefds, sizeof fildes)); 941} 942 943int 944linux_sigaction(struct thread *td, struct linux_sigaction_args *args) 945{ 946 l_osigaction_t osa; 947 l_sigaction_t act, oact; 948 int error; 949 950#ifdef DEBUG 951 if (ldebug(sigaction)) 952 printf(ARGS(sigaction, "%d, %p, %p"), 953 args->sig, (void *)args->nsa, (void *)args->osa); 954#endif 955 956 if (args->nsa != NULL) { 957 error = copyin(args->nsa, &osa, sizeof(l_osigaction_t)); 958 if (error) 959 return (error); 960 act.lsa_handler = osa.lsa_handler; 961 act.lsa_flags = osa.lsa_flags; 962 act.lsa_restorer = osa.lsa_restorer; 963 LINUX_SIGEMPTYSET(act.lsa_mask); 964 act.lsa_mask.__bits[0] = osa.lsa_mask; 965 } 966 967 error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL, 968 args->osa ? &oact : NULL); 969 970 if (args->osa != NULL && !error) { 971 osa.lsa_handler = oact.lsa_handler; 972 osa.lsa_flags = oact.lsa_flags; 973 osa.lsa_restorer = oact.lsa_restorer; 974 osa.lsa_mask = oact.lsa_mask.__bits[0]; 975 error = copyout(&osa, args->osa, sizeof(l_osigaction_t)); 976 } 977 978 return (error); 979} 980 981/* 982 * Linux has two extra args, restart and oldmask. We don't use these, 983 * but it seems that "restart" is actually a context pointer that 984 * enables the signal to happen with a different register set. 
985 */ 986int 987linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args) 988{ 989 sigset_t sigmask; 990 l_sigset_t mask; 991 992#ifdef DEBUG 993 if (ldebug(sigsuspend)) 994 printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask); 995#endif 996 997 LINUX_SIGEMPTYSET(mask); 998 mask.__bits[0] = args->mask; 999 linux_to_bsd_sigset(&mask, &sigmask); 1000 return (kern_sigsuspend(td, sigmask)); 1001} 1002 1003int 1004linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap) 1005{ 1006 l_sigset_t lmask; 1007 sigset_t sigmask; 1008 int error; 1009 1010#ifdef DEBUG 1011 if (ldebug(rt_sigsuspend)) 1012 printf(ARGS(rt_sigsuspend, "%p, %d"), 1013 (void *)uap->newset, uap->sigsetsize); 1014#endif 1015 1016 if (uap->sigsetsize != sizeof(l_sigset_t)) 1017 return (EINVAL); 1018 1019 error = copyin(uap->newset, &lmask, sizeof(l_sigset_t)); 1020 if (error) 1021 return (error); 1022 1023 linux_to_bsd_sigset(&lmask, &sigmask); 1024 return (kern_sigsuspend(td, sigmask)); 1025} 1026 1027int 1028linux_pause(struct thread *td, struct linux_pause_args *args) 1029{ 1030 struct proc *p = td->td_proc; 1031 sigset_t sigmask; 1032 1033#ifdef DEBUG 1034 if (ldebug(pause)) 1035 printf(ARGS(pause, "")); 1036#endif 1037 1038 PROC_LOCK(p); 1039 sigmask = td->td_sigmask; 1040 PROC_UNLOCK(p); 1041 return (kern_sigsuspend(td, sigmask)); 1042} 1043 1044int 1045linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap) 1046{ 1047 stack_t ss, oss; 1048 l_stack_t lss; 1049 int error; 1050 1051#ifdef DEBUG 1052 if (ldebug(sigaltstack)) 1053 printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss); 1054#endif 1055 1056 if (uap->uss != NULL) { 1057 error = copyin(uap->uss, &lss, sizeof(l_stack_t)); 1058 if (error) 1059 return (error); 1060 1061 ss.ss_sp = PTRIN(lss.ss_sp); 1062 ss.ss_size = lss.ss_size; 1063 ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags); 1064 } 1065 error = kern_sigaltstack(td, (uap->uss != NULL) ? 
&ss : NULL, 1066 (uap->uoss != NULL) ? &oss : NULL); 1067 if (!error && uap->uoss != NULL) { 1068 lss.ss_sp = PTROUT(oss.ss_sp); 1069 lss.ss_size = oss.ss_size; 1070 lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags); 1071 error = copyout(&lss, uap->uoss, sizeof(l_stack_t)); 1072 } 1073 1074 return (error); 1075} 1076 1077int 1078linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args) 1079{ 1080 struct ftruncate_args sa; 1081 1082#ifdef DEBUG 1083 if (ldebug(ftruncate64)) 1084 printf(ARGS(ftruncate64, "%u, %jd"), args->fd, 1085 (intmax_t)args->length); 1086#endif 1087 1088 sa.fd = args->fd; 1089 sa.length = args->length; 1090 return ftruncate(td, &sa); 1091} 1092 1093int 1094linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap) 1095{ 1096 struct timeval atv; 1097 l_timeval atv32; 1098 struct timezone rtz; 1099 int error = 0; 1100 1101 if (uap->tp) { 1102 microtime(&atv); 1103 atv32.tv_sec = atv.tv_sec; 1104 atv32.tv_usec = atv.tv_usec; 1105 error = copyout(&atv32, uap->tp, sizeof(atv32)); 1106 } 1107 if (error == 0 && uap->tzp != NULL) { 1108 rtz.tz_minuteswest = tz_minuteswest; 1109 rtz.tz_dsttime = tz_dsttime; 1110 error = copyout(&rtz, uap->tzp, sizeof(rtz)); 1111 } 1112 return (error); 1113} 1114 1115int 1116linux_settimeofday(struct thread *td, struct linux_settimeofday_args *uap) 1117{ 1118 l_timeval atv32; 1119 struct timeval atv, *tvp; 1120 struct timezone atz, *tzp; 1121 int error; 1122 1123 if (uap->tp) { 1124 error = copyin(uap->tp, &atv32, sizeof(atv32)); 1125 if (error) 1126 return (error); 1127 atv.tv_sec = atv32.tv_sec; 1128 atv.tv_usec = atv32.tv_usec; 1129 tvp = &atv; 1130 } else 1131 tvp = NULL; 1132 if (uap->tzp) { 1133 error = copyin(uap->tzp, &atz, sizeof(atz)); 1134 if (error) 1135 return (error); 1136 tzp = &atz; 1137 } else 1138 tzp = NULL; 1139 return (kern_settimeofday(td, tvp, tzp)); 1140} 1141 1142int 1143linux_getrusage(struct thread *td, struct linux_getrusage_args *uap) 1144{ 1145 struct 
l_rusage s32; 1146 struct rusage s; 1147 int error; 1148 1149 error = kern_getrusage(td, uap->who, &s); 1150 if (error != 0) 1151 return (error); 1152 if (uap->rusage != NULL) { 1153 bsd_to_linux_rusage(&s, &s32); 1154 error = copyout(&s32, uap->rusage, sizeof(s32)); 1155 } 1156 return (error); 1157} 1158 1159int 1160linux_sched_rr_get_interval(struct thread *td, 1161 struct linux_sched_rr_get_interval_args *uap) 1162{ 1163 struct timespec ts; 1164 struct l_timespec ts32; 1165 int error; 1166 1167 error = kern_sched_rr_get_interval(td, uap->pid, &ts); 1168 if (error != 0) 1169 return (error); 1170 ts32.tv_sec = ts.tv_sec; 1171 ts32.tv_nsec = ts.tv_nsec; 1172 return (copyout(&ts32, uap->interval, sizeof(ts32))); 1173} 1174 1175int 1176linux_set_thread_area(struct thread *td, 1177 struct linux_set_thread_area_args *args) 1178{ 1179 struct l_user_desc info; 1180 struct user_segment_descriptor sd; 1181 struct pcb *pcb; 1182 int a[2]; 1183 int error; 1184 1185 error = copyin(args->desc, &info, sizeof(struct l_user_desc)); 1186 if (error) 1187 return (error); 1188 1189#ifdef DEBUG 1190 if (ldebug(set_thread_area)) 1191 printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, " 1192 "%i, %i, %i"), info.entry_number, info.base_addr, 1193 info.limit, info.seg_32bit, info.contents, 1194 info.read_exec_only, info.limit_in_pages, 1195 info.seg_not_present, info.useable); 1196#endif 1197 1198 /* 1199 * Semantics of Linux version: every thread in the system has array 1200 * of three TLS descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. 1201 * This syscall loads one of the selected TLS decriptors with a value 1202 * and also loads GDT descriptors 6, 7 and 8 with the content of 1203 * the per-thread descriptors. 1204 * 1205 * Semantics of FreeBSD version: I think we can ignore that Linux has 1206 * three per-thread descriptors and use just the first one. 1207 * The tls_array[] is used only in [gs]et_thread_area() syscalls and 1208 * for loading the GDT descriptors. 
We use just one GDT descriptor 1209 * for TLS, so we will load just one. 1210 * 1211 * XXX: This doesn't work when a user space process tries to use more 1212 * than one TLS segment. Comment in the Linux source says wine might 1213 * do this. 1214 */ 1215 1216 /* 1217 * GLIBC reads current %gs and call set_thread_area() with it. 1218 * We should let GUDATA_SEL and GUGS32_SEL proceed as well because 1219 * we use these segments. 1220 */ 1221 switch (info.entry_number) { 1222 case GUGS32_SEL: 1223 case GUDATA_SEL: 1224 case 6: 1225 case -1: 1226 info.entry_number = GUGS32_SEL; 1227 break; 1228 default: 1229 return (EINVAL); 1230 } 1231 1232 /* 1233 * We have to copy out the GDT entry we use. 1234 * 1235 * XXX: What if a user space program does not check the return value 1236 * and tries to use 6, 7 or 8? 1237 */ 1238 error = copyout(&info, args->desc, sizeof(struct l_user_desc)); 1239 if (error) 1240 return (error); 1241 1242 if (LINUX_LDT_empty(&info)) { 1243 a[0] = 0; 1244 a[1] = 0; 1245 } else { 1246 a[0] = LINUX_LDT_entry_a(&info); 1247 a[1] = LINUX_LDT_entry_b(&info); 1248 } 1249 1250 memcpy(&sd, &a, sizeof(a)); 1251#ifdef DEBUG 1252 if (ldebug(set_thread_area)) 1253 printf("Segment created in set_thread_area: " 1254 "lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, " 1255 "type: %i, dpl: %i, p: %i, xx: %i, long: %i, " 1256 "def32: %i, gran: %i\n", 1257 sd.sd_lobase, 1258 sd.sd_hibase, 1259 sd.sd_lolimit, 1260 sd.sd_hilimit, 1261 sd.sd_type, 1262 sd.sd_dpl, 1263 sd.sd_p, 1264 sd.sd_xx, 1265 sd.sd_long, 1266 sd.sd_def32, 1267 sd.sd_gran); 1268#endif 1269 1270 pcb = td->td_pcb; 1271 pcb->pcb_gsbase = (register_t)info.base_addr; 1272 set_pcb_flags(pcb, PCB_32BIT | PCB_GS32BIT); 1273 update_gdt_gsbase(td, info.base_addr); 1274 1275 return (0); 1276} 1277 1278int 1279linux_wait4(struct thread *td, struct linux_wait4_args *args) 1280{ 1281 int error, options; 1282 struct rusage ru, *rup; 1283 struct l_rusage lru; 1284 struct proc *p; 1285 1286#ifdef DEBUG 1287 if 
(ldebug(wait4)) 1288 printf(ARGS(wait4, "%d, %p, %d, %p"), 1289 args->pid, (void *)args->status, args->options, 1290 (void *)args->rusage); 1291#endif 1292 1293 options = (args->options & (WNOHANG | WUNTRACED)); 1294 /* WLINUXCLONE should be equal to __WCLONE, but we make sure */ 1295 if (args->options & __WCLONE) 1296 options |= WLINUXCLONE; 1297 1298 if (args->rusage != NULL) 1299 rup = &ru; 1300 else 1301 rup = NULL; 1302 error = linux_common_wait(td, args->pid, args->status, options, rup); 1303 if (error) 1304 return (error); 1305 1306 p = td->td_proc; 1307 PROC_LOCK(p); 1308 sigqueue_delete(&p->p_sigqueue, SIGCHLD); 1309 PROC_UNLOCK(p); 1310 1311 if (args->rusage != NULL) { 1312 bsd_to_linux_rusage(rup, &lru); 1313 error = copyout(&lru, args->rusage, sizeof(lru)); 1314 } 1315 1316 return (error); 1317} 1318