linux32_machdep.c revision 217896
1/*- 2 * Copyright (c) 2004 Tim J. Robbins 3 * Copyright (c) 2002 Doug Rabson 4 * Copyright (c) 2000 Marcel Moolenaar 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer 12 * in this position and unchanged. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. The name of the author may not be used to endorse or promote products 17 * derived from this software without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31#include <sys/cdefs.h> 32__FBSDID("$FreeBSD: head/sys/amd64/linux32/linux32_machdep.c 217896 2011-01-26 20:03:58Z dchagin $"); 33 34#include <sys/param.h> 35#include <sys/kernel.h> 36#include <sys/systm.h> 37#include <sys/file.h> 38#include <sys/fcntl.h> 39#include <sys/clock.h> 40#include <sys/imgact.h> 41#include <sys/limits.h> 42#include <sys/lock.h> 43#include <sys/malloc.h> 44#include <sys/mman.h> 45#include <sys/mutex.h> 46#include <sys/priv.h> 47#include <sys/proc.h> 48#include <sys/resource.h> 49#include <sys/resourcevar.h> 50#include <sys/sched.h> 51#include <sys/syscallsubr.h> 52#include <sys/sysproto.h> 53#include <sys/unistd.h> 54 55#include <machine/frame.h> 56#include <machine/pcb.h> 57#include <machine/psl.h> 58#include <machine/segments.h> 59#include <machine/specialreg.h> 60 61#include <vm/vm.h> 62#include <vm/pmap.h> 63#include <vm/vm_map.h> 64 65#include <compat/freebsd32/freebsd32_util.h> 66#include <amd64/linux32/linux.h> 67#include <amd64/linux32/linux32_proto.h> 68#include <compat/linux/linux_ipc.h> 69#include <compat/linux/linux_signal.h> 70#include <compat/linux/linux_util.h> 71#include <compat/linux/linux_emul.h> 72 73struct l_old_select_argv { 74 l_int nfds; 75 l_uintptr_t readfds; 76 l_uintptr_t writefds; 77 l_uintptr_t exceptfds; 78 l_uintptr_t timeout; 79} __packed; 80 81int 82linux_to_bsd_sigaltstack(int lsa) 83{ 84 int bsa = 0; 85 86 if (lsa & LINUX_SS_DISABLE) 87 bsa |= SS_DISABLE; 88 if (lsa & LINUX_SS_ONSTACK) 89 bsa |= SS_ONSTACK; 90 return (bsa); 91} 92 93static int linux_mmap_common(struct thread *td, l_uintptr_t addr, 94 l_size_t len, l_int prot, l_int flags, l_int fd, 95 l_loff_t pos); 96 97int 98bsd_to_linux_sigaltstack(int bsa) 99{ 100 int lsa = 0; 101 102 if (bsa & SS_DISABLE) 103 lsa |= LINUX_SS_DISABLE; 104 if (bsa & SS_ONSTACK) 105 lsa |= LINUX_SS_ONSTACK; 106 return (lsa); 107} 108 109int 110linux_execve(struct thread *td, struct linux_execve_args *args) 111{ 112 struct image_args eargs; 113 char *path; 114 int error; 115 116 LCONVPATHEXIST(td, args->path, &path); 117 118#ifdef DEBUG 119 if (ldebug(execve)) 120 printf(ARGS(execve, "%s"), path); 121#endif 122 123 error = freebsd32_exec_copyin_args(&eargs, path, UIO_SYSSPACE, 124 args->argp, args->envp); 125 free(path, M_TEMP); 126 if (error == 0) 127 error = kern_execve(td, &eargs, NULL); 128 if (error == 0) 129 /* Linux process can execute FreeBSD one, do not attempt 130 * to create emuldata for such process using 131 * linux_proc_init, this leads to a panic on KASSERT 132 * because such process has p->p_emuldata == NULL. 133 */ 134 if (SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX) 135 error = linux_proc_init(td, 0, 0); 136 return (error); 137} 138 139CTASSERT(sizeof(struct l_iovec32) == 8); 140 141static int 142linux32_copyinuio(struct l_iovec32 *iovp, l_ulong iovcnt, struct uio **uiop) 143{ 144 struct l_iovec32 iov32; 145 struct iovec *iov; 146 struct uio *uio; 147 uint32_t iovlen; 148 int error, i; 149 150 *uiop = NULL; 151 if (iovcnt > UIO_MAXIOV) 152 return (EINVAL); 153 iovlen = iovcnt * sizeof(struct iovec); 154 uio = malloc(iovlen + sizeof(*uio), M_IOV, M_WAITOK); 155 iov = (struct iovec *)(uio + 1); 156 for (i = 0; i < iovcnt; i++) { 157 error = copyin(&iovp[i], &iov32, sizeof(struct l_iovec32)); 158 if (error) { 159 free(uio, M_IOV); 160 return (error); 161 } 162 iov[i].iov_base = PTRIN(iov32.iov_base); 163 iov[i].iov_len = iov32.iov_len; 164 } 165 uio->uio_iov = iov; 166 uio->uio_iovcnt = iovcnt; 167 uio->uio_segflg = UIO_USERSPACE; 168 uio->uio_offset = -1; 169 uio->uio_resid = 0; 170 for (i = 0; i < iovcnt; i++) { 171 if (iov->iov_len > INT_MAX - uio->uio_resid) { 172 free(uio, M_IOV); 173 return (EINVAL); 174 } 175 uio->uio_resid += iov->iov_len; 176 iov++; 177 } 178 *uiop = uio; 179 return (0); 180} 181 182int 183linux32_copyiniov(struct l_iovec32 *iovp32, l_ulong iovcnt, struct iovec **iovp, 184 int error) 185{ 186 struct l_iovec32 iov32; 187 struct iovec *iov; 188 uint32_t iovlen; 189 int i; 190 191 *iovp = NULL; 192 if (iovcnt > UIO_MAXIOV) 193 return (error); 194 iovlen = iovcnt * sizeof(struct iovec); 195 iov = malloc(iovlen, M_IOV, M_WAITOK); 196 for (i = 0; i < iovcnt; i++) { 197 error = copyin(&iovp32[i], &iov32, sizeof(struct l_iovec32)); 198 if (error) { 199 free(iov, M_IOV); 200 return (error); 201 } 202 iov[i].iov_base = PTRIN(iov32.iov_base); 203 iov[i].iov_len = iov32.iov_len; 204 } 205 *iovp = iov; 206 return(0); 207 208} 209 210int 211linux_readv(struct thread *td, struct linux_readv_args *uap) 212{ 213 struct uio *auio; 214 int error; 215 216 error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); 217 if (error) 218 return (error); 219 error = kern_readv(td, uap->fd, auio); 220 free(auio, M_IOV); 221 return (error); 222} 223 224int 225linux_writev(struct thread *td, struct linux_writev_args *uap) 226{ 227 struct uio *auio; 228 int error; 229 230 error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); 231 if (error) 232 return (error); 233 error = kern_writev(td, uap->fd, auio); 234 free(auio, M_IOV); 235 return (error); 236} 237 238struct l_ipc_kludge { 239 l_uintptr_t msgp; 240 l_long msgtyp; 241} __packed; 242 243int 244linux_ipc(struct thread *td, struct linux_ipc_args *args) 245{ 246 247 switch (args->what & 0xFFFF) { 248 case LINUX_SEMOP: { 249 struct linux_semop_args a; 250 251 a.semid = args->arg1; 252 a.tsops = args->ptr; 253 a.nsops = args->arg2; 254 return (linux_semop(td, &a)); 255 } 256 case LINUX_SEMGET: { 257 struct linux_semget_args a; 258 259 a.key = args->arg1; 260 a.nsems = args->arg2; 261 a.semflg = args->arg3; 262 return (linux_semget(td, &a)); 263 } 264 case LINUX_SEMCTL: { 265 struct linux_semctl_args a; 266 int error; 267 268 a.semid = args->arg1; 269 a.semnum = args->arg2; 270 a.cmd = args->arg3; 271 error = copyin(args->ptr, &a.arg, sizeof(a.arg)); 272 if (error) 273 return (error); 274 return (linux_semctl(td, &a)); 275 } 276 case LINUX_MSGSND: { 277 struct linux_msgsnd_args a; 278 279 a.msqid = args->arg1; 280 a.msgp = args->ptr; 281 a.msgsz = args->arg2; 282 a.msgflg = args->arg3; 283 return (linux_msgsnd(td, &a)); 284 } 285 case LINUX_MSGRCV: { 286 struct linux_msgrcv_args a; 287 288 a.msqid = args->arg1; 289 a.msgsz = args->arg2; 290 a.msgflg = args->arg3; 291 if ((args->what >> 16) == 0) { 292 struct l_ipc_kludge tmp; 293 int error; 294 295 if (args->ptr == 0) 296 return (EINVAL); 297 error = copyin(args->ptr, &tmp, sizeof(tmp)); 298 if (error) 299 return (error); 300 a.msgp = PTRIN(tmp.msgp); 301 a.msgtyp = tmp.msgtyp; 302 } else { 303 a.msgp = args->ptr; 304 a.msgtyp = args->arg5; 305 } 306 return (linux_msgrcv(td, &a)); 307 } 308 case LINUX_MSGGET: { 309 struct linux_msgget_args a; 310 311 a.key = args->arg1; 312 a.msgflg = args->arg2; 313 return (linux_msgget(td, &a)); 314 } 315 case LINUX_MSGCTL: { 316 struct linux_msgctl_args a; 317 318 a.msqid = args->arg1; 319 a.cmd = args->arg2; 320 a.buf = args->ptr; 321 return (linux_msgctl(td, &a)); 322 } 323 case LINUX_SHMAT: { 324 struct linux_shmat_args a; 325 326 a.shmid = args->arg1; 327 a.shmaddr = args->ptr; 328 a.shmflg = args->arg2; 329 a.raddr = PTRIN((l_uint)args->arg3); 330 return (linux_shmat(td, &a)); 331 } 332 case LINUX_SHMDT: { 333 struct linux_shmdt_args a; 334 335 a.shmaddr = args->ptr; 336 return (linux_shmdt(td, &a)); 337 } 338 case LINUX_SHMGET: { 339 struct linux_shmget_args a; 340 341 a.key = args->arg1; 342 a.size = args->arg2; 343 a.shmflg = args->arg3; 344 return (linux_shmget(td, &a)); 345 } 346 case LINUX_SHMCTL: { 347 struct linux_shmctl_args a; 348 349 a.shmid = args->arg1; 350 a.cmd = args->arg2; 351 a.buf = args->ptr; 352 return (linux_shmctl(td, &a)); 353 } 354 default: 355 break; 356 } 357 358 return (EINVAL); 359} 360 361int 362linux_old_select(struct thread *td, struct linux_old_select_args *args) 363{ 364 struct l_old_select_argv linux_args; 365 struct linux_select_args newsel; 366 int error; 367 368#ifdef DEBUG 369 if (ldebug(old_select)) 370 printf(ARGS(old_select, "%p"), args->ptr); 371#endif 372 373 error = copyin(args->ptr, &linux_args, sizeof(linux_args)); 374 if (error) 375 return (error); 376 377 newsel.nfds = linux_args.nfds; 378 newsel.readfds = PTRIN(linux_args.readfds); 379 newsel.writefds = PTRIN(linux_args.writefds); 380 newsel.exceptfds = PTRIN(linux_args.exceptfds); 381 newsel.timeout = PTRIN(linux_args.timeout); 382 return (linux_select(td, &newsel)); 383} 384 385int 386linux_fork(struct thread *td, struct linux_fork_args *args) 387{ 388 int error; 389 struct proc *p2; 390 struct thread *td2; 391 392#ifdef DEBUG 393 if (ldebug(fork)) 394 printf(ARGS(fork, "")); 395#endif 396 397 if ((error = fork1(td, RFFDG | RFPROC | RFSTOPPED, 0, &p2)) != 0) 398 return (error); 399 400 if (error == 0) { 401 td->td_retval[0] = p2->p_pid; 402 td->td_retval[1] = 0; 403 } 404 405 if (td->td_retval[1] == 1) 406 td->td_retval[0] = 0; 407 error = linux_proc_init(td, td->td_retval[0], 0); 408 if (error) 409 return (error); 410 411 td2 = FIRST_THREAD_IN_PROC(p2); 412 413 /* 414 * Make this runnable after we are finished with it. 415 */ 416 thread_lock(td2); 417 TD_SET_CAN_RUN(td2); 418 sched_add(td2, SRQ_BORING); 419 thread_unlock(td2); 420 421 return (0); 422} 423 424int 425linux_vfork(struct thread *td, struct linux_vfork_args *args) 426{ 427 int error; 428 struct proc *p2; 429 struct thread *td2; 430 431#ifdef DEBUG 432 if (ldebug(vfork)) 433 printf(ARGS(vfork, "")); 434#endif 435 436 /* Exclude RFPPWAIT */ 437 if ((error = fork1(td, RFFDG | RFPROC | RFMEM | RFSTOPPED, 0, &p2)) != 0) 438 return (error); 439 if (error == 0) { 440 td->td_retval[0] = p2->p_pid; 441 td->td_retval[1] = 0; 442 } 443 /* Are we the child? */ 444 if (td->td_retval[1] == 1) 445 td->td_retval[0] = 0; 446 error = linux_proc_init(td, td->td_retval[0], 0); 447 if (error) 448 return (error); 449 450 PROC_LOCK(p2); 451 p2->p_flag |= P_PPWAIT; 452 PROC_UNLOCK(p2); 453 454 td2 = FIRST_THREAD_IN_PROC(p2); 455 456 /* 457 * Make this runnable after we are finished with it. 458 */ 459 thread_lock(td2); 460 TD_SET_CAN_RUN(td2); 461 sched_add(td2, SRQ_BORING); 462 thread_unlock(td2); 463 464 /* wait for the children to exit, ie. emulate vfork */ 465 PROC_LOCK(p2); 466 while (p2->p_flag & P_PPWAIT) 467 cv_wait(&p2->p_pwait, &p2->p_mtx); 468 PROC_UNLOCK(p2); 469 470 return (0); 471} 472 473int 474linux_clone(struct thread *td, struct linux_clone_args *args) 475{ 476 int error, ff = RFPROC | RFSTOPPED; 477 struct proc *p2; 478 struct thread *td2; 479 int exit_signal; 480 struct linux_emuldata *em; 481 482#ifdef DEBUG 483 if (ldebug(clone)) { 484 printf(ARGS(clone, "flags %x, stack %p, parent tid: %p, " 485 "child tid: %p"), (unsigned)args->flags, 486 args->stack, args->parent_tidptr, args->child_tidptr); 487 } 488#endif 489 490 exit_signal = args->flags & 0x000000ff; 491 if (LINUX_SIG_VALID(exit_signal)) { 492 if (exit_signal <= LINUX_SIGTBLSZ) 493 exit_signal = 494 linux_to_bsd_signal[_SIG_IDX(exit_signal)]; 495 } else if (exit_signal != 0) 496 return (EINVAL); 497 498 if (args->flags & LINUX_CLONE_VM) 499 ff |= RFMEM; 500 if (args->flags & LINUX_CLONE_SIGHAND) 501 ff |= RFSIGSHARE; 502 /* 503 * XXX: In Linux, sharing of fs info (chroot/cwd/umask) 504 * and open files is independant. In FreeBSD, its in one 505 * structure but in reality it does not cause any problems 506 * because both of these flags are usually set together. 507 */ 508 if (!(args->flags & (LINUX_CLONE_FILES | LINUX_CLONE_FS))) 509 ff |= RFFDG; 510 511 /* 512 * Attempt to detect when linux_clone(2) is used for creating 513 * kernel threads. Unfortunately despite the existence of the 514 * CLONE_THREAD flag, version of linuxthreads package used in 515 * most popular distros as of beginning of 2005 doesn't make 516 * any use of it. Therefore, this detection relies on 517 * empirical observation that linuxthreads sets certain 518 * combination of flags, so that we can make more or less 519 * precise detection and notify the FreeBSD kernel that several 520 * processes are in fact part of the same threading group, so 521 * that special treatment is necessary for signal delivery 522 * between those processes and fd locking. 523 */ 524 if ((args->flags & 0xffffff00) == LINUX_THREADING_FLAGS) 525 ff |= RFTHREAD; 526 527 if (args->flags & LINUX_CLONE_PARENT_SETTID) 528 if (args->parent_tidptr == NULL) 529 return (EINVAL); 530 531 error = fork1(td, ff, 0, &p2); 532 if (error) 533 return (error); 534 535 if (args->flags & (LINUX_CLONE_PARENT | LINUX_CLONE_THREAD)) { 536 sx_xlock(&proctree_lock); 537 PROC_LOCK(p2); 538 proc_reparent(p2, td->td_proc->p_pptr); 539 PROC_UNLOCK(p2); 540 sx_xunlock(&proctree_lock); 541 } 542 543 /* create the emuldata */ 544 error = linux_proc_init(td, p2->p_pid, args->flags); 545 /* reference it - no need to check this */ 546 em = em_find(p2, EMUL_DOLOCK); 547 KASSERT(em != NULL, ("clone: emuldata not found.\n")); 548 /* and adjust it */ 549 550 if (args->flags & LINUX_CLONE_THREAD) { 551#ifdef notyet 552 PROC_LOCK(p2); 553 p2->p_pgrp = td->td_proc->p_pgrp; 554 PROC_UNLOCK(p2); 555#endif 556 exit_signal = 0; 557 } 558 559 if (args->flags & LINUX_CLONE_CHILD_SETTID) 560 em->child_set_tid = args->child_tidptr; 561 else 562 em->child_set_tid = NULL; 563 564 if (args->flags & LINUX_CLONE_CHILD_CLEARTID) 565 em->child_clear_tid = args->child_tidptr; 566 else 567 em->child_clear_tid = NULL; 568 569 EMUL_UNLOCK(&emul_lock); 570 571 if (args->flags & LINUX_CLONE_PARENT_SETTID) { 572 error = copyout(&p2->p_pid, args->parent_tidptr, 573 sizeof(p2->p_pid)); 574 if (error) 575 printf(LMSG("copyout failed!")); 576 } 577 578 PROC_LOCK(p2); 579 p2->p_sigparent = exit_signal; 580 PROC_UNLOCK(p2); 581 td2 = FIRST_THREAD_IN_PROC(p2); 582 /* 583 * In a case of stack = NULL, we are supposed to COW calling process 584 * stack. This is what normal fork() does, so we just keep tf_rsp arg 585 * intact. 586 */ 587 if (args->stack) 588 td2->td_frame->tf_rsp = PTROUT(args->stack); 589 590 if (args->flags & LINUX_CLONE_SETTLS) { 591 struct user_segment_descriptor sd; 592 struct l_user_desc info; 593 struct pcb *pcb; 594 int a[2]; 595 596 error = copyin((void *)td->td_frame->tf_rsi, &info, 597 sizeof(struct l_user_desc)); 598 if (error) { 599 printf(LMSG("copyin failed!")); 600 } else { 601 /* We might copy out the entry_number as GUGS32_SEL. */ 602 info.entry_number = GUGS32_SEL; 603 error = copyout(&info, (void *)td->td_frame->tf_rsi, 604 sizeof(struct l_user_desc)); 605 if (error) 606 printf(LMSG("copyout failed!")); 607 608 a[0] = LINUX_LDT_entry_a(&info); 609 a[1] = LINUX_LDT_entry_b(&info); 610 611 memcpy(&sd, &a, sizeof(a)); 612#ifdef DEBUG 613 if (ldebug(clone)) 614 printf("Segment created in clone with " 615 "CLONE_SETTLS: lobase: %x, hibase: %x, " 616 "lolimit: %x, hilimit: %x, type: %i, " 617 "dpl: %i, p: %i, xx: %i, long: %i, " 618 "def32: %i, gran: %i\n", sd.sd_lobase, 619 sd.sd_hibase, sd.sd_lolimit, sd.sd_hilimit, 620 sd.sd_type, sd.sd_dpl, sd.sd_p, sd.sd_xx, 621 sd.sd_long, sd.sd_def32, sd.sd_gran); 622#endif 623 pcb = td2->td_pcb; 624 pcb->pcb_gsbase = (register_t)info.base_addr; 625/* XXXKIB pcb->pcb_gs32sd = sd; */ 626 td2->td_frame->tf_gs = GSEL(GUGS32_SEL, SEL_UPL); 627 set_pcb_flags(pcb, PCB_GS32BIT | PCB_32BIT); 628 } 629 } 630 631#ifdef DEBUG 632 if (ldebug(clone)) 633 printf(LMSG("clone: successful rfork to %d, " 634 "stack %p sig = %d"), (int)p2->p_pid, args->stack, 635 exit_signal); 636#endif 637 if (args->flags & LINUX_CLONE_VFORK) { 638 PROC_LOCK(p2); 639 p2->p_flag |= P_PPWAIT; 640 PROC_UNLOCK(p2); 641 } 642 643 /* 644 * Make this runnable after we are finished with it. 645 */ 646 thread_lock(td2); 647 TD_SET_CAN_RUN(td2); 648 sched_add(td2, SRQ_BORING); 649 thread_unlock(td2); 650 651 td->td_retval[0] = p2->p_pid; 652 td->td_retval[1] = 0; 653 654 if (args->flags & LINUX_CLONE_VFORK) { 655 /* wait for the children to exit, ie. emulate vfork */ 656 PROC_LOCK(p2); 657 while (p2->p_flag & P_PPWAIT) 658 cv_wait(&p2->p_pwait, &p2->p_mtx); 659 PROC_UNLOCK(p2); 660 } 661 662 return (0); 663} 664 665#define STACK_SIZE (2 * 1024 * 1024) 666#define GUARD_SIZE (4 * PAGE_SIZE) 667 668int 669linux_mmap2(struct thread *td, struct linux_mmap2_args *args) 670{ 671 672#ifdef DEBUG 673 if (ldebug(mmap2)) 674 printf(ARGS(mmap2, "0x%08x, %d, %d, 0x%08x, %d, %d"), 675 args->addr, args->len, args->prot, 676 args->flags, args->fd, args->pgoff); 677#endif 678 679 return (linux_mmap_common(td, PTROUT(args->addr), args->len, args->prot, 680 args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff * 681 PAGE_SIZE)); 682} 683 684int 685linux_mmap(struct thread *td, struct linux_mmap_args *args) 686{ 687 int error; 688 struct l_mmap_argv linux_args; 689 690 error = copyin(args->ptr, &linux_args, sizeof(linux_args)); 691 if (error) 692 return (error); 693 694#ifdef DEBUG 695 if (ldebug(mmap)) 696 printf(ARGS(mmap, "0x%08x, %d, %d, 0x%08x, %d, %d"), 697 linux_args.addr, linux_args.len, linux_args.prot, 698 linux_args.flags, linux_args.fd, linux_args.pgoff); 699#endif 700 701 return (linux_mmap_common(td, linux_args.addr, linux_args.len, 702 linux_args.prot, linux_args.flags, linux_args.fd, 703 (uint32_t)linux_args.pgoff)); 704} 705 706static int 707linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot, 708 l_int flags, l_int fd, l_loff_t pos) 709{ 710 struct proc *p = td->td_proc; 711 struct mmap_args /* { 712 caddr_t addr; 713 size_t len; 714 int prot; 715 int flags; 716 int fd; 717 long pad; 718 off_t pos; 719 } */ bsd_args; 720 int error; 721 struct file *fp; 722 723 error = 0; 724 bsd_args.flags = 0; 725 fp = NULL; 726 727 /* 728 * Linux mmap(2): 729 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE 730 */ 731 if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE))) 732 return (EINVAL); 733 734 if (flags & LINUX_MAP_SHARED) 735 bsd_args.flags |= MAP_SHARED; 736 if (flags & LINUX_MAP_PRIVATE) 737 bsd_args.flags |= MAP_PRIVATE; 738 if (flags & LINUX_MAP_FIXED) 739 bsd_args.flags |= MAP_FIXED; 740 if (flags & LINUX_MAP_ANON) { 741 /* Enforce pos to be on page boundary, then ignore. */ 742 if ((pos & PAGE_MASK) != 0) 743 return (EINVAL); 744 pos = 0; 745 bsd_args.flags |= MAP_ANON; 746 } else 747 bsd_args.flags |= MAP_NOSYNC; 748 if (flags & LINUX_MAP_GROWSDOWN) 749 bsd_args.flags |= MAP_STACK; 750 751 /* 752 * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC 753 * on Linux/i386. We do this to ensure maximum compatibility. 754 * Linux/ia64 does the same in i386 emulation mode. 755 */ 756 bsd_args.prot = prot; 757 if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) 758 bsd_args.prot |= PROT_READ | PROT_EXEC; 759 760 /* Linux does not check file descriptor when MAP_ANONYMOUS is set. */ 761 bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd; 762 if (bsd_args.fd != -1) { 763 /* 764 * Linux follows Solaris mmap(2) description: 765 * The file descriptor fildes is opened with 766 * read permission, regardless of the 767 * protection options specified. 768 */ 769 770 if ((error = fget(td, bsd_args.fd, &fp)) != 0) 771 return (error); 772 if (fp->f_type != DTYPE_VNODE) { 773 fdrop(fp, td); 774 return (EINVAL); 775 } 776 777 /* Linux mmap() just fails for O_WRONLY files */ 778 if (!(fp->f_flag & FREAD)) { 779 fdrop(fp, td); 780 return (EACCES); 781 } 782 783 fdrop(fp, td); 784 } 785 786 if (flags & LINUX_MAP_GROWSDOWN) { 787 /* 788 * The Linux MAP_GROWSDOWN option does not limit auto 789 * growth of the region. Linux mmap with this option 790 * takes as addr the inital BOS, and as len, the initial 791 * region size. It can then grow down from addr without 792 * limit. However, Linux threads has an implicit internal 793 * limit to stack size of STACK_SIZE. Its just not 794 * enforced explicitly in Linux. But, here we impose 795 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack 796 * region, since we can do this with our mmap. 797 * 798 * Our mmap with MAP_STACK takes addr as the maximum 799 * downsize limit on BOS, and as len the max size of 800 * the region. It then maps the top SGROWSIZ bytes, 801 * and auto grows the region down, up to the limit 802 * in addr. 803 * 804 * If we don't use the MAP_STACK option, the effect 805 * of this code is to allocate a stack region of a 806 * fixed size of (STACK_SIZE - GUARD_SIZE). 807 */ 808 809 if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) { 810 /* 811 * Some Linux apps will attempt to mmap 812 * thread stacks near the top of their 813 * address space. If their TOS is greater 814 * than vm_maxsaddr, vm_map_growstack() 815 * will confuse the thread stack with the 816 * process stack and deliver a SEGV if they 817 * attempt to grow the thread stack past their 818 * current stacksize rlimit. To avoid this, 819 * adjust vm_maxsaddr upwards to reflect 820 * the current stacksize rlimit rather 821 * than the maximum possible stacksize. 822 * It would be better to adjust the 823 * mmap'ed region, but some apps do not check 824 * mmap's return value. 825 */ 826 PROC_LOCK(p); 827 p->p_vmspace->vm_maxsaddr = (char *)LINUX32_USRSTACK - 828 lim_cur(p, RLIMIT_STACK); 829 PROC_UNLOCK(p); 830 } 831 832 /* 833 * This gives us our maximum stack size and a new BOS. 834 * If we're using VM_STACK, then mmap will just map 835 * the top SGROWSIZ bytes, and let the stack grow down 836 * to the limit at BOS. If we're not using VM_STACK 837 * we map the full stack, since we don't have a way 838 * to autogrow it. 839 */ 840 if (len > STACK_SIZE - GUARD_SIZE) { 841 bsd_args.addr = (caddr_t)PTRIN(addr); 842 bsd_args.len = len; 843 } else { 844 bsd_args.addr = (caddr_t)PTRIN(addr) - 845 (STACK_SIZE - GUARD_SIZE - len); 846 bsd_args.len = STACK_SIZE - GUARD_SIZE; 847 } 848 } else { 849 bsd_args.addr = (caddr_t)PTRIN(addr); 850 bsd_args.len = len; 851 } 852 bsd_args.pos = pos; 853 854#ifdef DEBUG 855 if (ldebug(mmap)) 856 printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n", 857 __func__, 858 (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot, 859 bsd_args.flags, bsd_args.fd, (int)bsd_args.pos); 860#endif 861 error = mmap(td, &bsd_args); 862#ifdef DEBUG 863 if (ldebug(mmap)) 864 printf("-> %s() return: 0x%x (0x%08x)\n", 865 __func__, error, (u_int)td->td_retval[0]); 866#endif 867 return (error); 868} 869 870int 871linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) 872{ 873 struct mprotect_args bsd_args; 874 875 bsd_args.addr = uap->addr; 876 bsd_args.len = uap->len; 877 bsd_args.prot = uap->prot; 878 if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) 879 bsd_args.prot |= PROT_READ | PROT_EXEC; 880 return (mprotect(td, &bsd_args)); 881} 882 883int 884linux_iopl(struct thread *td, struct linux_iopl_args *args) 885{ 886 int error; 887 888 if (args->level < 0 || args->level > 3) 889 return (EINVAL); 890 if ((error = priv_check(td, PRIV_IO)) != 0) 891 return (error); 892 if ((error = securelevel_gt(td->td_ucred, 0)) != 0) 893 return (error); 894 td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) | 895 (args->level * (PSL_IOPL / 3)); 896 897 return (0); 898} 899 900int 901linux_pipe(struct thread *td, struct linux_pipe_args *args) 902{ 903 int error; 904 int fildes[2]; 905 906#ifdef DEBUG 907 if (ldebug(pipe)) 908 printf(ARGS(pipe, "*")); 909#endif 910 911 error = kern_pipe(td, fildes); 912 if (error) 913 return (error); 914 915 /* XXX: Close descriptors on error. */ 916 return (copyout(fildes, args->pipefds, sizeof fildes)); 917} 918 919int 920linux_sigaction(struct thread *td, struct linux_sigaction_args *args) 921{ 922 l_osigaction_t osa; 923 l_sigaction_t act, oact; 924 int error; 925 926#ifdef DEBUG 927 if (ldebug(sigaction)) 928 printf(ARGS(sigaction, "%d, %p, %p"), 929 args->sig, (void *)args->nsa, (void *)args->osa); 930#endif 931 932 if (args->nsa != NULL) { 933 error = copyin(args->nsa, &osa, sizeof(l_osigaction_t)); 934 if (error) 935 return (error); 936 act.lsa_handler = osa.lsa_handler; 937 act.lsa_flags = osa.lsa_flags; 938 act.lsa_restorer = osa.lsa_restorer; 939 LINUX_SIGEMPTYSET(act.lsa_mask); 940 act.lsa_mask.__bits[0] = osa.lsa_mask; 941 } 942 943 error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL, 944 args->osa ? &oact : NULL); 945 946 if (args->osa != NULL && !error) { 947 osa.lsa_handler = oact.lsa_handler; 948 osa.lsa_flags = oact.lsa_flags; 949 osa.lsa_restorer = oact.lsa_restorer; 950 osa.lsa_mask = oact.lsa_mask.__bits[0]; 951 error = copyout(&osa, args->osa, sizeof(l_osigaction_t)); 952 } 953 954 return (error); 955} 956 957/* 958 * Linux has two extra args, restart and oldmask. We don't use these, 959 * but it seems that "restart" is actually a context pointer that 960 * enables the signal to happen with a different register set. 961 */ 962int 963linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args) 964{ 965 sigset_t sigmask; 966 l_sigset_t mask; 967 968#ifdef DEBUG 969 if (ldebug(sigsuspend)) 970 printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask); 971#endif 972 973 LINUX_SIGEMPTYSET(mask); 974 mask.__bits[0] = args->mask; 975 linux_to_bsd_sigset(&mask, &sigmask); 976 return (kern_sigsuspend(td, sigmask)); 977} 978 979int 980linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap) 981{ 982 l_sigset_t lmask; 983 sigset_t sigmask; 984 int error; 985 986#ifdef DEBUG 987 if (ldebug(rt_sigsuspend)) 988 printf(ARGS(rt_sigsuspend, "%p, %d"), 989 (void *)uap->newset, uap->sigsetsize); 990#endif 991 992 if (uap->sigsetsize != sizeof(l_sigset_t)) 993 return (EINVAL); 994 995 error = copyin(uap->newset, &lmask, sizeof(l_sigset_t)); 996 if (error) 997 return (error); 998 999 linux_to_bsd_sigset(&lmask, &sigmask); 1000 return (kern_sigsuspend(td, sigmask)); 1001} 1002 1003int 1004linux_pause(struct thread *td, struct linux_pause_args *args) 1005{ 1006 struct proc *p = td->td_proc; 1007 sigset_t sigmask; 1008 1009#ifdef DEBUG 1010 if (ldebug(pause)) 1011 printf(ARGS(pause, "")); 1012#endif 1013 1014 PROC_LOCK(p); 1015 sigmask = td->td_sigmask; 1016 PROC_UNLOCK(p); 1017 return (kern_sigsuspend(td, sigmask)); 1018} 1019 1020int 1021linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap) 1022{ 1023 stack_t ss, oss; 1024 l_stack_t lss; 1025 int error; 1026 1027#ifdef DEBUG 1028 if (ldebug(sigaltstack)) 1029 printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss); 1030#endif 1031 1032 if (uap->uss != NULL) { 1033 error = copyin(uap->uss, &lss, sizeof(l_stack_t)); 1034 if (error) 1035 return (error); 1036 1037 ss.ss_sp = PTRIN(lss.ss_sp); 1038 ss.ss_size = lss.ss_size; 1039 ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags); 1040 } 1041 error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL, 1042 (uap->uoss != NULL) ? &oss : NULL); 1043 if (!error && uap->uoss != NULL) { 1044 lss.ss_sp = PTROUT(oss.ss_sp); 1045 lss.ss_size = oss.ss_size; 1046 lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags); 1047 error = copyout(&lss, uap->uoss, sizeof(l_stack_t)); 1048 } 1049 1050 return (error); 1051} 1052 1053int 1054linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args) 1055{ 1056 struct ftruncate_args sa; 1057 1058#ifdef DEBUG 1059 if (ldebug(ftruncate64)) 1060 printf(ARGS(ftruncate64, "%u, %jd"), args->fd, 1061 (intmax_t)args->length); 1062#endif 1063 1064 sa.fd = args->fd; 1065 sa.length = args->length; 1066 return ftruncate(td, &sa); 1067} 1068 1069int 1070linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap) 1071{ 1072 struct timeval atv; 1073 l_timeval atv32; 1074 struct timezone rtz; 1075 int error = 0; 1076 1077 if (uap->tp) { 1078 microtime(&atv); 1079 atv32.tv_sec = atv.tv_sec; 1080 atv32.tv_usec = atv.tv_usec; 1081 error = copyout(&atv32, uap->tp, sizeof(atv32)); 1082 } 1083 if (error == 0 && uap->tzp != NULL) { 1084 rtz.tz_minuteswest = tz_minuteswest; 1085 rtz.tz_dsttime = tz_dsttime; 1086 error = copyout(&rtz, uap->tzp, sizeof(rtz)); 1087 } 1088 return (error); 1089} 1090 1091int 1092linux_settimeofday(struct thread *td, struct linux_settimeofday_args *uap) 1093{ 1094 l_timeval atv32; 1095 struct timeval atv, *tvp; 1096 struct timezone atz, *tzp; 1097 int error; 1098 1099 if (uap->tp) { 1100 error = copyin(uap->tp, &atv32, sizeof(atv32)); 1101 if (error) 1102 return (error); 1103 atv.tv_sec = atv32.tv_sec; 1104 atv.tv_usec = atv32.tv_usec; 1105 tvp = &atv; 1106 } else 1107 tvp = NULL; 1108 if (uap->tzp) { 1109 error = copyin(uap->tzp, &atz, sizeof(atz)); 1110 if (error) 1111 return (error); 1112 tzp = &atz; 1113 } else 1114 tzp = NULL; 1115 return (kern_settimeofday(td, tvp, tzp)); 1116} 1117 1118int 1119linux_getrusage(struct thread *td, struct linux_getrusage_args *uap) 1120{ 1121 struct l_rusage s32; 1122 struct rusage s; 1123 int error; 1124 1125 error = kern_getrusage(td, uap->who, &s); 1126 if (error != 0) 1127 return (error); 1128 if (uap->rusage != NULL) { 1129 s32.ru_utime.tv_sec = s.ru_utime.tv_sec; 1130 s32.ru_utime.tv_usec = s.ru_utime.tv_usec; 1131 s32.ru_stime.tv_sec = s.ru_stime.tv_sec; 1132 s32.ru_stime.tv_usec = s.ru_stime.tv_usec; 1133 s32.ru_maxrss = s.ru_maxrss; 1134 s32.ru_ixrss = s.ru_ixrss; 1135 s32.ru_idrss = s.ru_idrss; 1136 s32.ru_isrss = s.ru_isrss; 1137 s32.ru_minflt = s.ru_minflt; 1138 s32.ru_majflt = s.ru_majflt; 1139 s32.ru_nswap = s.ru_nswap; 1140 s32.ru_inblock = s.ru_inblock; 1141 s32.ru_oublock = s.ru_oublock; 1142 s32.ru_msgsnd = s.ru_msgsnd; 1143 s32.ru_msgrcv = s.ru_msgrcv; 1144 s32.ru_nsignals = s.ru_nsignals; 1145 s32.ru_nvcsw = s.ru_nvcsw; 1146 s32.ru_nivcsw = s.ru_nivcsw; 1147 error = copyout(&s32, uap->rusage, sizeof(s32)); 1148 } 1149 return (error); 1150} 1151 1152int 1153linux_sched_rr_get_interval(struct thread *td, 1154 struct linux_sched_rr_get_interval_args *uap) 1155{ 1156 struct timespec ts; 1157 struct l_timespec ts32; 1158 int error; 1159 1160 error = kern_sched_rr_get_interval(td, uap->pid, &ts); 1161 if (error != 0) 1162 return (error); 1163 ts32.tv_sec = ts.tv_sec; 1164 ts32.tv_nsec = ts.tv_nsec; 1165 return (copyout(&ts32, uap->interval, sizeof(ts32))); 1166} 1167 1168int 1169linux_set_thread_area(struct thread *td, 1170 struct linux_set_thread_area_args *args) 1171{ 1172 struct l_user_desc info; 1173 struct user_segment_descriptor sd; 1174 struct pcb *pcb; 1175 int a[2]; 1176 int error; 1177 1178 error = copyin(args->desc, &info, sizeof(struct l_user_desc)); 1179 if (error) 1180 return (error); 1181 1182#ifdef DEBUG 1183 if (ldebug(set_thread_area)) 1184 printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, " 1185 "%i, %i, %i"), info.entry_number, info.base_addr, 1186 info.limit, info.seg_32bit, info.contents, 1187 info.read_exec_only, info.limit_in_pages, 1188 info.seg_not_present, info.useable); 1189#endif 1190 1191 /* 1192 * Semantics of Linux version: every thread in the system has array 1193 * of three TLS descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. 1194 * This syscall loads one of the selected TLS decriptors with a value 1195 * and also loads GDT descriptors 6, 7 and 8 with the content of 1196 * the per-thread descriptors. 1197 * 1198 * Semantics of FreeBSD version: I think we can ignore that Linux has 1199 * three per-thread descriptors and use just the first one. 1200 * The tls_array[] is used only in [gs]et_thread_area() syscalls and 1201 * for loading the GDT descriptors. We use just one GDT descriptor 1202 * for TLS, so we will load just one. 1203 * 1204 * XXX: This doesn't work when a user space process tries to use more 1205 * than one TLS segment. Comment in the Linux source says wine might 1206 * do this. 1207 */ 1208 1209 /* 1210 * GLIBC reads current %gs and call set_thread_area() with it. 1211 * We should let GUDATA_SEL and GUGS32_SEL proceed as well because 1212 * we use these segments. 1213 */ 1214 switch (info.entry_number) { 1215 case GUGS32_SEL: 1216 case GUDATA_SEL: 1217 case 6: 1218 case -1: 1219 info.entry_number = GUGS32_SEL; 1220 break; 1221 default: 1222 return (EINVAL); 1223 } 1224 1225 /* 1226 * We have to copy out the GDT entry we use. 1227 * 1228 * XXX: What if a user space program does not check the return value 1229 * and tries to use 6, 7 or 8? 1230 */ 1231 error = copyout(&info, args->desc, sizeof(struct l_user_desc)); 1232 if (error) 1233 return (error); 1234 1235 if (LINUX_LDT_empty(&info)) { 1236 a[0] = 0; 1237 a[1] = 0; 1238 } else { 1239 a[0] = LINUX_LDT_entry_a(&info); 1240 a[1] = LINUX_LDT_entry_b(&info); 1241 } 1242 1243 memcpy(&sd, &a, sizeof(a)); 1244#ifdef DEBUG 1245 if (ldebug(set_thread_area)) 1246 printf("Segment created in set_thread_area: " 1247 "lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, " 1248 "type: %i, dpl: %i, p: %i, xx: %i, long: %i, " 1249 "def32: %i, gran: %i\n", 1250 sd.sd_lobase, 1251 sd.sd_hibase, 1252 sd.sd_lolimit, 1253 sd.sd_hilimit, 1254 sd.sd_type, 1255 sd.sd_dpl, 1256 sd.sd_p, 1257 sd.sd_xx, 1258 sd.sd_long, 1259 sd.sd_def32, 1260 sd.sd_gran); 1261#endif 1262 1263 pcb = td->td_pcb; 1264 pcb->pcb_gsbase = (register_t)info.base_addr; 1265 set_pcb_flags(pcb, PCB_32BIT | PCB_GS32BIT); 1266 update_gdt_gsbase(td, info.base_addr); 1267 1268 return (0); 1269} 1270