linux32_machdep.c revision 163371
134355Sjb/*- 234355Sjb * Copyright (c) 2004 Tim J. Robbins 389985Sbde * Copyright (c) 2002 Doug Rabson 4151317Sdavidxu * Copyright (c) 2000 Marcel Moolenaar 534355Sjb * All rights reserved. 634355Sjb * 764002Speter * Redistribution and use in source and binary forms, with or without 834355Sjb * modification, are permitted provided that the following conditions 934355Sjb * are met: 1034355Sjb * 1. Redistributions of source code must retain the above copyright 1134355Sjb * notice, this list of conditions and the following disclaimer 1234355Sjb * in this position and unchanged. 1334355Sjb * 2. Redistributions in binary form must reproduce the above copyright 1434355Sjb * notice, this list of conditions and the following disclaimer in the 1534355Sjb * documentation and/or other materials provided with the distribution. 1634355Sjb * 3. The name of the author may not be used to endorse or promote products 1734355Sjb * derived from this software without specific prior written permission. 1834355Sjb * 1934355Sjb * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 2034355Sjb * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 2134355Sjb * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 2234355Sjb * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 2334355Sjb * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 2434355Sjb * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 2534355Sjb * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 2634355Sjb * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 2734355Sjb * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 2834355Sjb * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 2934355Sjb */ 3034355Sjb 3134355Sjb#include <sys/cdefs.h> 3234355Sjb__FBSDID("$FreeBSD: head/sys/amd64/linux32/linux32_machdep.c 163371 2006-10-15 13:22:14Z netchild $"); 3334355Sjb 3434355Sjb#include <sys/param.h> 3534355Sjb#include <sys/kernel.h> 3634355Sjb#include <sys/systm.h> 3734355Sjb#include <sys/clock.h> 3834355Sjb#include <sys/imgact.h> 3934355Sjb#include <sys/limits.h> 4034355Sjb#include <sys/lock.h> 4134355Sjb#include <sys/malloc.h> 4234355Sjb#include <sys/mman.h> 4334355Sjb#include <sys/mutex.h> 4434355Sjb#include <sys/proc.h> 4534355Sjb#include <sys/resource.h> 4634355Sjb#include <sys/resourcevar.h> 4734355Sjb#include <sys/syscallsubr.h> 4834355Sjb#include <sys/sysproto.h> 4934355Sjb#include <sys/unistd.h> 5034355Sjb 5134355Sjb#include <machine/frame.h> 5234355Sjb 5334355Sjb#include <vm/vm.h> 5434355Sjb#include <vm/pmap.h> 5534355Sjb#include <vm/vm_extern.h> 5634355Sjb#include <vm/vm_kern.h> 5734355Sjb#include <vm/vm_map.h> 5834355Sjb 5934355Sjb#include <amd64/linux32/linux.h> 6034355Sjb#include <amd64/linux32/linux32_proto.h> 6134355Sjb#include <compat/linux/linux_ipc.h> 6234355Sjb#include <compat/linux/linux_signal.h> 6334355Sjb#include <compat/linux/linux_util.h> 6434355Sjb#include <compat/linux/linux_emul.h> 6534355Sjb 6634355Sjbstruct l_old_select_argv { 6734355Sjb l_int nfds; 6834355Sjb l_uintptr_t readfds; 6934355Sjb l_uintptr_t writefds; 7034355Sjb l_uintptr_t exceptfds; 7134355Sjb l_uintptr_t timeout; 7234355Sjb} __packed; 7334355Sjb 7434355Sjbint 7534355Sjblinux_to_bsd_sigaltstack(int lsa) 7634355Sjb{ 7734355Sjb int bsa = 0; 7834355Sjb 7934355Sjb if (lsa & LINUX_SS_DISABLE) 8034355Sjb bsa |= SS_DISABLE; 8134355Sjb if (lsa & LINUX_SS_ONSTACK) 8234355Sjb bsa |= SS_ONSTACK; 8334355Sjb return (bsa); 8434355Sjb} 8534355Sjb 8634355Sjbint 8734355Sjbbsd_to_linux_sigaltstack(int bsa) 8834355Sjb{ 8934355Sjb int lsa = 0; 9034355Sjb 9134355Sjb if (bsa & SS_DISABLE) 9234355Sjb lsa |= LINUX_SS_DISABLE; 9334355Sjb if (bsa & SS_ONSTACK) 9434355Sjb lsa |= LINUX_SS_ONSTACK; 9534355Sjb return (lsa); 9634355Sjb} 9734355Sjb 9834355Sjb/* 9934355Sjb * Custom version of exec_copyin_args() so that we can translate 10034355Sjb * the pointers. 10134355Sjb */ 10234355Sjbstatic int 10334355Sjblinux_exec_copyin_args(struct image_args *args, char *fname, 10434355Sjb enum uio_seg segflg, char **argv, char **envv) 10534355Sjb{ 10634355Sjb char *argp, *envp; 10734355Sjb u_int32_t *p32, arg; 10834355Sjb size_t length; 10934355Sjb int error; 110127891Sdfr 11134355Sjb bzero(args, sizeof(*args)); 11234355Sjb if (argv == NULL) 11334355Sjb return (EFAULT); 11434355Sjb 11534355Sjb /* 11634355Sjb * Allocate temporary demand zeroed space for argument and 11734355Sjb * environment strings 11834355Sjb */ 11934355Sjb args->buf = (char *) kmem_alloc_wait(exec_map, 12045065Salc PATH_MAX + ARG_MAX + MAXSHELLCMDLEN); 12145065Salc if (args->buf == NULL) 12234355Sjb return (ENOMEM); 12334355Sjb args->begin_argv = args->buf; 12434355Sjb args->endp = args->begin_argv; 12534355Sjb args->stringspace = ARG_MAX; 12634355Sjb 12734355Sjb args->fname = args->buf + ARG_MAX; 12834355Sjb 12934355Sjb /* 13034355Sjb * Copy the file name. 13134355Sjb */ 13234355Sjb error = (segflg == UIO_SYSSPACE) ? 13334355Sjb copystr(fname, args->fname, PATH_MAX, &length) : 13434355Sjb copyinstr(fname, args->fname, PATH_MAX, &length); 13534355Sjb if (error != 0) 13634355Sjb goto err_exit; 13734355Sjb 13834355Sjb /* 13934355Sjb * extract arguments first 14034355Sjb */ 14134355Sjb p32 = (u_int32_t *)argv; 14234355Sjb for (;;) { 14335938Sdyson error = copyin(p32++, &arg, sizeof(arg)); 14434355Sjb if (error) 14534355Sjb goto err_exit; 14634355Sjb if (arg == 0) 14734355Sjb break; 14834355Sjb argp = PTRIN(arg); 14934355Sjb error = copyinstr(argp, args->endp, args->stringspace, &length); 15034355Sjb if (error) { 15134355Sjb if (error == ENAMETOOLONG) 15234355Sjb error = E2BIG; 15334355Sjb 15434355Sjb goto err_exit; 15534355Sjb } 15634355Sjb args->stringspace -= length; 15734355Sjb args->endp += length; 15834355Sjb args->argc++; 15934355Sjb } 16034355Sjb 161137875Smarks args->begin_envv = args->endp; 16234355Sjb 16334355Sjb /* 16434355Sjb * extract environment strings 16534355Sjb */ 16634355Sjb if (envv) { 16735938Sdyson p32 = (u_int32_t *)envv; 16835938Sdyson for (;;) { 16935938Sdyson error = copyin(p32++, &arg, sizeof(arg)); 17035938Sdyson if (error) 17135938Sdyson goto err_exit; 17235938Sdyson if (arg == 0) 17335938Sdyson break; 17435938Sdyson envp = PTRIN(arg); 175147814Sjhb error = copyinstr(envp, args->endp, args->stringspace, 176147814Sjhb &length); 17751138Salfred if (error) { 17851138Salfred if (error == ENAMETOOLONG) 17934355Sjb error = E2BIG; 18034355Sjb goto err_exit; 18134355Sjb } 18234355Sjb args->stringspace -= length; 18334355Sjb args->endp += length; 18434355Sjb args->envc++; 18534355Sjb } 18634355Sjb } 18734355Sjb 18834355Sjb return (0); 18934355Sjb 19056115Spetererr_exit: 19156115Speter kmem_free_wakeup(exec_map, (vm_offset_t)args->buf, 19234355Sjb PATH_MAX + ARG_MAX + MAXSHELLCMDLEN); 19334355Sjb args->buf = NULL; 19434355Sjb return (error); 19534355Sjb} 19634355Sjb 19734355Sjbint 19834355Sjblinux_execve(struct thread *td, struct linux_execve_args *args) 19934355Sjb{ 20034355Sjb struct image_args eargs; 20134355Sjb char *path; 20234925Sdufault int error; 20334925Sdufault 20434925Sdufault LCONVPATHEXIST(td, args->path, &path); 20534925Sdufault 20634925Sdufault#ifdef DEBUG 20734925Sdufault if (ldebug(execve)) 20834925Sdufault printf(ARGS(execve, "%s"), path); 20934925Sdufault#endif 21035938Sdyson 21140931Sdg error = linux_exec_copyin_args(&eargs, path, UIO_SYSSPACE, args->argp, 21246155Sphk args->envp); 21351791Smarcel free(path, M_TEMP); 21451791Smarcel if (error == 0) 21551791Smarcel error = kern_execve(td, &eargs, NULL); 21651791Smarcel if (error == 0) 217112895Sjeff /* linux process can exec fbsd one, dont attempt 218112895Sjeff * to create emuldata for such process using 21956272Srwatson * linux_proc_init, this leads to a panic on KASSERT 22056272Srwatson * because such process has p->p_emuldata == NULL 22156272Srwatson */ 22256272Srwatson if (td->td_proc->p_sysent == &elf_linux_sysvec) 22356272Srwatson error = linux_proc_init(td, 0, 0); 22456272Srwatson return (error); 22556272Srwatson} 22656272Srwatson 22754803Srwatsonstruct iovec32 { 22854803Srwatson u_int32_t iov_base; 22954803Srwatson int iov_len; 23055943Sjasone}; 23156115Speter 23256115SpeterCTASSERT(sizeof(struct iovec32) == 8); 23359288Sjlemon 23459288Sjlemonstatic int 23561719Srwatsonlinux32_copyinuio(struct iovec32 *iovp, u_int iovcnt, struct uio **uiop) 23675039Srwatson{ 23775039Srwatson struct iovec32 iov32; 23875427Srwatson struct iovec *iov; 23983652Speter struct uio *uio; 24083796Srwatson u_int iovlen; 24185891Sphk int error, i; 24290889Sjulian 24390889Sjulian *uiop = NULL; 24490889Sjulian if (iovcnt > UIO_MAXIOV) 245103972Sarchie return (EINVAL); 246103972Sarchie iovlen = iovcnt * sizeof(struct iovec); 247103972Sarchie uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK); 248100897Srwatson iov = (struct iovec *)(uio + 1); 249100897Srwatson for (i = 0; i < iovcnt; i++) { 250100897Srwatson error = copyin(&iovp[i], &iov32, sizeof(struct iovec32)); 251100897Srwatson if (error) { 252100897Srwatson free(uio, M_IOV); 253100897Srwatson return (error); 25496084Smux } 25597372Smarcel iov[i].iov_base = PTRIN(iov32.iov_base); 25699856Salfred iov[i].iov_len = iov32.iov_len; 257100956Srwatson } 258103575Salfred uio->uio_iov = iov; 259122540Smckusick uio->uio_iovcnt = iovcnt; 260122540Smckusick uio->uio_segflg = UIO_USERSPACE; 261122540Smckusick uio->uio_offset = -1; 262122540Smckusick uio->uio_resid = 0; 263103575Salfred for (i = 0; i < iovcnt; i++) { 264103575Salfred if (iov->iov_len > INT_MAX - uio->uio_resid) { 265103575Salfred free(uio, M_IOV); 266103575Salfred return (EINVAL); 267103575Salfred } 268103575Salfred uio->uio_resid += iov->iov_len; 269103575Salfred iov++; 270103575Salfred } 271104731Srwatson *uiop = uio; 272105692Srwatson return (0); 273105692Srwatson} 274105692Srwatson 275104731Srwatsonint 276104731Srwatsonlinux_readv(struct thread *td, struct linux_readv_args *uap) 277105950Speter{ 278106467Srwatson struct uio *auio; 279105950Speter int error; 280106978Sdeischen 281106978Sdeischen error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); 282106978Sdeischen if (error) 283107914Sdillon return (error); 284108406Srwatson error = kern_readv(td, uap->fd, auio); 285108406Srwatson free(auio, M_IOV); 286108406Srwatson return (error); 287108406Srwatson} 288112895Sjeff 289112902Sjeffint 290112902Sjefflinux_writev(struct thread *td, struct linux_writev_args *uap) 291112902Sjeff{ 292112902Sjeff struct uio *auio; 293112909Sjeff int error; 294112909Sjeff 295113276Smike error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); 296115800Srwatson if (error) 297115800Srwatson return (error); 298115800Srwatson error = kern_writev(td, uap->fd, auio); 299123253Smarcel free(auio, M_IOV); 300125369Sdeischen return (error); 301127484Smtm} 302127484Smtm 303132117Sphkstruct l_ipc_kludge { 304136831Srwatson l_uintptr_t msgp; 305136831Srwatson l_long msgtyp; 306136831Srwatson} __packed; 307136831Srwatson 308136831Srwatsonint 309136831Srwatsonlinux_ipc(struct thread *td, struct linux_ipc_args *args) 310136831Srwatson{ 311136831Srwatson 312136831Srwatson switch (args->what & 0xFFFF) { 313139013Sdavidxu case LINUX_SEMOP: { 314145435Sdavidxu struct linux_semop_args a; 315151317Sdavidxu 316151317Sdavidxu a.semid = args->arg1; 317 a.tsops = args->ptr; 318 a.nsops = args->arg2; 319 return (linux_semop(td, &a)); 320 } 321 case LINUX_SEMGET: { 322 struct linux_semget_args a; 323 324 a.key = args->arg1; 325 a.nsems = args->arg2; 326 a.semflg = args->arg3; 327 return (linux_semget(td, &a)); 328 } 329 case LINUX_SEMCTL: { 330 struct linux_semctl_args a; 331 int error; 332 333 a.semid = args->arg1; 334 a.semnum = args->arg2; 335 a.cmd = args->arg3; 336 error = copyin(args->ptr, &a.arg, sizeof(a.arg)); 337 if (error) 338 return (error); 339 return (linux_semctl(td, &a)); 340 } 341 case LINUX_MSGSND: { 342 struct linux_msgsnd_args a; 343 344 a.msqid = args->arg1; 345 a.msgp = args->ptr; 346 a.msgsz = args->arg2; 347 a.msgflg = args->arg3; 348 return (linux_msgsnd(td, &a)); 349 } 350 case LINUX_MSGRCV: { 351 struct linux_msgrcv_args a; 352 353 a.msqid = args->arg1; 354 a.msgsz = args->arg2; 355 a.msgflg = args->arg3; 356 if ((args->what >> 16) == 0) { 357 struct l_ipc_kludge tmp; 358 int error; 359 360 if (args->ptr == 0) 361 return (EINVAL); 362 error = copyin(args->ptr, &tmp, sizeof(tmp)); 363 if (error) 364 return (error); 365 a.msgp = PTRIN(tmp.msgp); 366 a.msgtyp = tmp.msgtyp; 367 } else { 368 a.msgp = args->ptr; 369 a.msgtyp = args->arg5; 370 } 371 return (linux_msgrcv(td, &a)); 372 } 373 case LINUX_MSGGET: { 374 struct linux_msgget_args a; 375 376 a.key = args->arg1; 377 a.msgflg = args->arg2; 378 return (linux_msgget(td, &a)); 379 } 380 case LINUX_MSGCTL: { 381 struct linux_msgctl_args a; 382 383 a.msqid = args->arg1; 384 a.cmd = args->arg2; 385 a.buf = args->ptr; 386 return (linux_msgctl(td, &a)); 387 } 388 case LINUX_SHMAT: { 389 struct linux_shmat_args a; 390 391 a.shmid = args->arg1; 392 a.shmaddr = args->ptr; 393 a.shmflg = args->arg2; 394 a.raddr = PTRIN((l_uint)args->arg3); 395 return (linux_shmat(td, &a)); 396 } 397 case LINUX_SHMDT: { 398 struct linux_shmdt_args a; 399 400 a.shmaddr = args->ptr; 401 return (linux_shmdt(td, &a)); 402 } 403 case LINUX_SHMGET: { 404 struct linux_shmget_args a; 405 406 a.key = args->arg1; 407 a.size = args->arg2; 408 a.shmflg = args->arg3; 409 return (linux_shmget(td, &a)); 410 } 411 case LINUX_SHMCTL: { 412 struct linux_shmctl_args a; 413 414 a.shmid = args->arg1; 415 a.cmd = args->arg2; 416 a.buf = args->ptr; 417 return (linux_shmctl(td, &a)); 418 } 419 default: 420 break; 421 } 422 423 return (EINVAL); 424} 425 426int 427linux_old_select(struct thread *td, struct linux_old_select_args *args) 428{ 429 struct l_old_select_argv linux_args; 430 struct linux_select_args newsel; 431 int error; 432 433#ifdef DEBUG 434 if (ldebug(old_select)) 435 printf(ARGS(old_select, "%p"), args->ptr); 436#endif 437 438 error = copyin(args->ptr, &linux_args, sizeof(linux_args)); 439 if (error) 440 return (error); 441 442 newsel.nfds = linux_args.nfds; 443 newsel.readfds = PTRIN(linux_args.readfds); 444 newsel.writefds = PTRIN(linux_args.writefds); 445 newsel.exceptfds = PTRIN(linux_args.exceptfds); 446 newsel.timeout = PTRIN(linux_args.timeout); 447 return (linux_select(td, &newsel)); 448} 449 450int 451linux_fork(struct thread *td, struct linux_fork_args *args) 452{ 453 int error; 454 455#ifdef DEBUG 456 if (ldebug(fork)) 457 printf(ARGS(fork, "")); 458#endif 459 460 if ((error = fork(td, (struct fork_args *)args)) != 0) 461 return (error); 462 463 if (td->td_retval[1] == 1) 464 td->td_retval[0] = 0; 465 error = linux_proc_init(td, td->td_retval[0], 0); 466 if (error) 467 return (error); 468 469 return (0); 470} 471 472int 473linux_vfork(struct thread *td, struct linux_vfork_args *args) 474{ 475 int error; 476 struct proc *p2; 477 478#ifdef DEBUG 479 if (ldebug(vfork)) 480 printf(ARGS(vfork, "")); 481#endif 482 483 /* exclude RFPPWAIT */ 484 if ((error = fork1(td, RFFDG | RFPROC | RFMEM, 0, &p2)) != 0) 485 return (error); 486 if (error == 0) { 487 td->td_retval[0] = p2->p_pid; 488 td->td_retval[1] = 0; 489 } 490 /* Are we the child? */ 491 if (td->td_retval[1] == 1) 492 td->td_retval[0] = 0; 493 error = linux_proc_init(td, td->td_retval[0], 0); 494 if (error) 495 return (error); 496 /* wait for the children to exit, ie. emulate vfork */ 497 PROC_LOCK(p2); 498 while (p2->p_flag & P_PPWAIT) 499 msleep(td->td_proc, &p2->p_mtx, PWAIT, "ppwait", 0); 500 PROC_UNLOCK(p2); 501 return (0); 502} 503 504int 505linux_clone(struct thread *td, struct linux_clone_args *args) 506{ 507 int error, ff = RFPROC | RFSTOPPED; 508 struct proc *p2; 509 struct thread *td2; 510 int exit_signal; 511 struct linux_emuldata *em; 512 513#ifdef DEBUG 514 if (ldebug(clone)) { 515 printf(ARGS(clone, "flags %x, stack %x, parent tid: %x, child tid: %x"), 516 (unsigned int)args->flags, (unsigned int)(uintptr_t)args->stack, 517 (unsigned int)(uintptr_t)args->parent_tidptr, 518 (unsigned int)(uintptr_t)args->child_tidptr); 519 } 520#endif 521 522 exit_signal = args->flags & 0x000000ff; 523 if (exit_signal >= LINUX_NSIG) 524 return (EINVAL); 525 526 if (exit_signal <= LINUX_SIGTBLSZ) 527 exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)]; 528 529 if (args->flags & CLONE_VM) 530 ff |= RFMEM; 531 if (args->flags & CLONE_SIGHAND) 532 ff |= RFSIGSHARE; 533 /* 534 * XXX: in linux sharing of fs info (chroot/cwd/umask) 535 * and open files is independant. in fbsd its in one 536 * structure but in reality it doesnt make any problems 537 * because both this flags are set at once usually. 538 */ 539 if (!(args->flags & (CLONE_FILES | CLONE_FS))) 540 ff |= RFFDG; 541 542 /* 543 * Attempt to detect when linux_clone(2) is used for creating 544 * kernel threads. Unfortunately despite the existence of the 545 * CLONE_THREAD flag, version of linuxthreads package used in 546 * most popular distros as of beginning of 2005 doesn't make 547 * any use of it. Therefore, this detection relay fully on 548 * empirical observation that linuxthreads sets certain 549 * combination of flags, so that we can make more or less 550 * precise detection and notify the FreeBSD kernel that several 551 * processes are in fact part of the same threading group, so 552 * that special treatment is necessary for signal delivery 553 * between those processes and fd locking. 554 */ 555 if ((args->flags & 0xffffff00) == THREADING_FLAGS) 556 ff |= RFTHREAD; 557 558 error = fork1(td, ff, 0, &p2); 559 if (error) 560 return (error); 561 562 /* create the emuldata */ 563 error = linux_proc_init(td, p2->p_pid, args->flags); 564 /* reference it - no need to check this */ 565 em = em_find(p2, EMUL_UNLOCKED); 566 KASSERT(em != NULL, ("clone: emuldata not found.\n")); 567 /* and adjust it */ 568 if (args->flags & CLONE_PARENT_SETTID) { 569 if (args->parent_tidptr == NULL) { 570 EMUL_UNLOCK(&emul_lock); 571 return (EINVAL); 572 } 573 error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid)); 574 if (error) { 575 EMUL_UNLOCK(&emul_lock); 576 return (error); 577 } 578 } 579 580 if (args->flags & (CLONE_PARENT|CLONE_THREAD)) { 581 sx_xlock(&proctree_lock); 582 PROC_LOCK(p2); 583 proc_reparent(p2, td->td_proc->p_pptr); 584 PROC_UNLOCK(p2); 585 sx_xunlock(&proctree_lock); 586 } 587 588 if (args->flags & CLONE_THREAD) { 589 /* XXX: linux mangles pgrp and pptr somehow 590 * I think it might be this but I am not sure. 591 */ 592#ifdef notyet 593 PROC_LOCK(p2); 594 p2->p_pgrp = td->td_proc->p_pgrp; 595 PROC_UNLOCK(p2); 596#endif 597 exit_signal = 0; 598 } 599 600 if (args->flags & CLONE_CHILD_SETTID) 601 em->child_set_tid = args->child_tidptr; 602 else 603 em->child_set_tid = NULL; 604 605 if (args->flags & CLONE_CHILD_CLEARTID) 606 em->child_clear_tid = args->child_tidptr; 607 else 608 em->child_clear_tid = NULL; 609 610 EMUL_UNLOCK(&emul_lock); 611 612 PROC_LOCK(p2); 613 p2->p_sigparent = exit_signal; 614 PROC_UNLOCK(p2); 615 td2 = FIRST_THREAD_IN_PROC(p2); 616 /* 617 * in a case of stack = NULL we are supposed to COW calling process stack 618 * this is what normal fork() does so we just keep the tf_rsp arg intact 619 */ 620 if (args->stack) 621 td2->td_frame->tf_rsp = PTROUT(args->stack); 622 623 if (args->flags & CLONE_SETTLS) { 624 /* XXX: todo */ 625 } 626 627#ifdef DEBUG 628 if (ldebug(clone)) 629 printf(LMSG("clone: successful rfork to %ld, stack %p sig = %d"), 630 (long)p2->p_pid, args->stack, exit_signal); 631#endif 632 633 /* 634 * Make this runnable after we are finished with it. 635 */ 636 mtx_lock_spin(&sched_lock); 637 TD_SET_CAN_RUN(td2); 638 setrunqueue(td2, SRQ_BORING); 639 mtx_unlock_spin(&sched_lock); 640 641 td->td_retval[0] = p2->p_pid; 642 td->td_retval[1] = 0; 643 return (0); 644} 645 646/* XXX move */ 647struct l_mmap_argv { 648 l_ulong addr; 649 l_ulong len; 650 l_ulong prot; 651 l_ulong flags; 652 l_ulong fd; 653 l_ulong pgoff; 654}; 655 656#define STACK_SIZE (2 * 1024 * 1024) 657#define GUARD_SIZE (4 * PAGE_SIZE) 658 659static int linux_mmap_common(struct thread *, struct l_mmap_argv *); 660 661int 662linux_mmap2(struct thread *td, struct linux_mmap2_args *args) 663{ 664 struct l_mmap_argv linux_args; 665 666#ifdef DEBUG 667 if (ldebug(mmap2)) 668 printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"), 669 (void *)(intptr_t)args->addr, args->len, args->prot, 670 args->flags, args->fd, args->pgoff); 671#endif 672 673 linux_args.addr = PTROUT(args->addr); 674 linux_args.len = args->len; 675 linux_args.prot = args->prot; 676 linux_args.flags = args->flags; 677 linux_args.fd = args->fd; 678 linux_args.pgoff = args->pgoff; 679 680 return (linux_mmap_common(td, &linux_args)); 681} 682 683int 684linux_mmap(struct thread *td, struct linux_mmap_args *args) 685{ 686 int error; 687 struct l_mmap_argv linux_args; 688 689 error = copyin(args->ptr, &linux_args, sizeof(linux_args)); 690 if (error) 691 return (error); 692 693#ifdef DEBUG 694 if (ldebug(mmap)) 695 printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"), 696 (void *)(intptr_t)linux_args.addr, linux_args.len, 697 linux_args.prot, linux_args.flags, linux_args.fd, 698 linux_args.pgoff); 699#endif 700 if ((linux_args.pgoff % PAGE_SIZE) != 0) 701 return (EINVAL); 702 linux_args.pgoff /= PAGE_SIZE; 703 704 return (linux_mmap_common(td, &linux_args)); 705} 706 707static int 708linux_mmap_common(struct thread *td, struct l_mmap_argv *linux_args) 709{ 710 struct proc *p = td->td_proc; 711 struct mmap_args /* { 712 caddr_t addr; 713 size_t len; 714 int prot; 715 int flags; 716 int fd; 717 long pad; 718 off_t pos; 719 } */ bsd_args; 720 int error; 721 722 error = 0; 723 bsd_args.flags = 0; 724 if (linux_args->flags & LINUX_MAP_SHARED) 725 bsd_args.flags |= MAP_SHARED; 726 if (linux_args->flags & LINUX_MAP_PRIVATE) 727 bsd_args.flags |= MAP_PRIVATE; 728 if (linux_args->flags & LINUX_MAP_FIXED) 729 bsd_args.flags |= MAP_FIXED; 730 if (linux_args->flags & LINUX_MAP_ANON) 731 bsd_args.flags |= MAP_ANON; 732 else 733 bsd_args.flags |= MAP_NOSYNC; 734 if (linux_args->flags & LINUX_MAP_GROWSDOWN) { 735 bsd_args.flags |= MAP_STACK; 736 737 /* 738 * The linux MAP_GROWSDOWN option does not limit auto 739 * growth of the region. Linux mmap with this option 740 * takes as addr the inital BOS, and as len, the initial 741 * region size. It can then grow down from addr without 742 * limit. However, linux threads has an implicit internal 743 * limit to stack size of STACK_SIZE. Its just not 744 * enforced explicitly in linux. But, here we impose 745 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack 746 * region, since we can do this with our mmap. 747 * 748 * Our mmap with MAP_STACK takes addr as the maximum 749 * downsize limit on BOS, and as len the max size of 750 * the region. It them maps the top SGROWSIZ bytes, 751 * and autgrows the region down, up to the limit 752 * in addr. 753 * 754 * If we don't use the MAP_STACK option, the effect 755 * of this code is to allocate a stack region of a 756 * fixed size of (STACK_SIZE - GUARD_SIZE). 757 */ 758 759 /* This gives us TOS */ 760 bsd_args.addr = (caddr_t)PTRIN(linux_args->addr) + 761 linux_args->len; 762 763 if ((caddr_t)PTRIN(bsd_args.addr) > 764 p->p_vmspace->vm_maxsaddr) { 765 /* 766 * Some linux apps will attempt to mmap 767 * thread stacks near the top of their 768 * address space. If their TOS is greater 769 * than vm_maxsaddr, vm_map_growstack() 770 * will confuse the thread stack with the 771 * process stack and deliver a SEGV if they 772 * attempt to grow the thread stack past their 773 * current stacksize rlimit. To avoid this, 774 * adjust vm_maxsaddr upwards to reflect 775 * the current stacksize rlimit rather 776 * than the maximum possible stacksize. 777 * It would be better to adjust the 778 * mmap'ed region, but some apps do not check 779 * mmap's return value. 780 */ 781 PROC_LOCK(p); 782 p->p_vmspace->vm_maxsaddr = 783 (char *)LINUX32_USRSTACK - 784 lim_cur(p, RLIMIT_STACK); 785 PROC_UNLOCK(p); 786 } 787 788 /* This gives us our maximum stack size */ 789 if (linux_args->len > STACK_SIZE - GUARD_SIZE) 790 bsd_args.len = linux_args->len; 791 else 792 bsd_args.len = STACK_SIZE - GUARD_SIZE; 793 794 /* 795 * This gives us a new BOS. If we're using VM_STACK, then 796 * mmap will just map the top SGROWSIZ bytes, and let 797 * the stack grow down to the limit at BOS. If we're 798 * not using VM_STACK we map the full stack, since we 799 * don't have a way to autogrow it. 800 */ 801 bsd_args.addr -= bsd_args.len; 802 } else { 803 bsd_args.addr = (caddr_t)PTRIN(linux_args->addr); 804 bsd_args.len = linux_args->len; 805 } 806 /* 807 * XXX i386 Linux always emulator forces PROT_READ on (why?) 808 * so we do the same. We add PROT_EXEC to work around buggy 809 * applications (e.g. Java) that take advantage of the fact 810 * that execute permissions are not enforced by x86 CPUs. 811 */ 812 bsd_args.prot = linux_args->prot | PROT_EXEC | PROT_READ; 813 if (linux_args->flags & LINUX_MAP_ANON) 814 bsd_args.fd = -1; 815 else 816 bsd_args.fd = linux_args->fd; 817 bsd_args.pos = (off_t)linux_args->pgoff * PAGE_SIZE; 818 bsd_args.pad = 0; 819 820#ifdef DEBUG 821 if (ldebug(mmap)) 822 printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n", 823 __func__, 824 (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot, 825 bsd_args.flags, bsd_args.fd, (int)bsd_args.pos); 826#endif 827 error = mmap(td, &bsd_args); 828#ifdef DEBUG 829 if (ldebug(mmap)) 830 printf("-> %s() return: 0x%x (0x%08x)\n", 831 __func__, error, (u_int)td->td_retval[0]); 832#endif 833 return (error); 834} 835 836int 837linux_pipe(struct thread *td, struct linux_pipe_args *args) 838{ 839 int pip[2]; 840 int error; 841 register_t reg_rdx; 842 843#ifdef DEBUG 844 if (ldebug(pipe)) 845 printf(ARGS(pipe, "*")); 846#endif 847 848 reg_rdx = td->td_retval[1]; 849 error = pipe(td, 0); 850 if (error) { 851 td->td_retval[1] = reg_rdx; 852 return (error); 853 } 854 855 pip[0] = td->td_retval[0]; 856 pip[1] = td->td_retval[1]; 857 error = copyout(pip, args->pipefds, 2 * sizeof(int)); 858 if (error) { 859 td->td_retval[1] = reg_rdx; 860 return (error); 861 } 862 863 td->td_retval[1] = reg_rdx; 864 td->td_retval[0] = 0; 865 return (0); 866} 867 868int 869linux_sigaction(struct thread *td, struct linux_sigaction_args *args) 870{ 871 l_osigaction_t osa; 872 l_sigaction_t act, oact; 873 int error; 874 875#ifdef DEBUG 876 if (ldebug(sigaction)) 877 printf(ARGS(sigaction, "%d, %p, %p"), 878 args->sig, (void *)args->nsa, (void *)args->osa); 879#endif 880 881 if (args->nsa != NULL) { 882 error = copyin(args->nsa, &osa, sizeof(l_osigaction_t)); 883 if (error) 884 return (error); 885 act.lsa_handler = osa.lsa_handler; 886 act.lsa_flags = osa.lsa_flags; 887 act.lsa_restorer = osa.lsa_restorer; 888 LINUX_SIGEMPTYSET(act.lsa_mask); 889 act.lsa_mask.__bits[0] = osa.lsa_mask; 890 } 891 892 error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL, 893 args->osa ? &oact : NULL); 894 895 if (args->osa != NULL && !error) { 896 osa.lsa_handler = oact.lsa_handler; 897 osa.lsa_flags = oact.lsa_flags; 898 osa.lsa_restorer = oact.lsa_restorer; 899 osa.lsa_mask = oact.lsa_mask.__bits[0]; 900 error = copyout(&osa, args->osa, sizeof(l_osigaction_t)); 901 } 902 903 return (error); 904} 905 906/* 907 * Linux has two extra args, restart and oldmask. We dont use these, 908 * but it seems that "restart" is actually a context pointer that 909 * enables the signal to happen with a different register set. 910 */ 911int 912linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args) 913{ 914 sigset_t sigmask; 915 l_sigset_t mask; 916 917#ifdef DEBUG 918 if (ldebug(sigsuspend)) 919 printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask); 920#endif 921 922 LINUX_SIGEMPTYSET(mask); 923 mask.__bits[0] = args->mask; 924 linux_to_bsd_sigset(&mask, &sigmask); 925 return (kern_sigsuspend(td, sigmask)); 926} 927 928int 929linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap) 930{ 931 l_sigset_t lmask; 932 sigset_t sigmask; 933 int error; 934 935#ifdef DEBUG 936 if (ldebug(rt_sigsuspend)) 937 printf(ARGS(rt_sigsuspend, "%p, %d"), 938 (void *)uap->newset, uap->sigsetsize); 939#endif 940 941 if (uap->sigsetsize != sizeof(l_sigset_t)) 942 return (EINVAL); 943 944 error = copyin(uap->newset, &lmask, sizeof(l_sigset_t)); 945 if (error) 946 return (error); 947 948 linux_to_bsd_sigset(&lmask, &sigmask); 949 return (kern_sigsuspend(td, sigmask)); 950} 951 952int 953linux_pause(struct thread *td, struct linux_pause_args *args) 954{ 955 struct proc *p = td->td_proc; 956 sigset_t sigmask; 957 958#ifdef DEBUG 959 if (ldebug(pause)) 960 printf(ARGS(pause, "")); 961#endif 962 963 PROC_LOCK(p); 964 sigmask = td->td_sigmask; 965 PROC_UNLOCK(p); 966 return (kern_sigsuspend(td, sigmask)); 967} 968 969int 970linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap) 971{ 972 stack_t ss, oss; 973 l_stack_t lss; 974 int error; 975 976#ifdef DEBUG 977 if (ldebug(sigaltstack)) 978 printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss); 979#endif 980 981 if (uap->uss != NULL) { 982 error = copyin(uap->uss, &lss, sizeof(l_stack_t)); 983 if (error) 984 return (error); 985 986 ss.ss_sp = PTRIN(lss.ss_sp); 987 ss.ss_size = lss.ss_size; 988 ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags); 989 } 990 error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL, 991 (uap->uoss != NULL) ? &oss : NULL); 992 if (!error && uap->uoss != NULL) { 993 lss.ss_sp = PTROUT(oss.ss_sp); 994 lss.ss_size = oss.ss_size; 995 lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags); 996 error = copyout(&lss, uap->uoss, sizeof(l_stack_t)); 997 } 998 999 return (error); 1000} 1001 1002int 1003linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args) 1004{ 1005 struct ftruncate_args sa; 1006 1007#ifdef DEBUG 1008 if (ldebug(ftruncate64)) 1009 printf(ARGS(ftruncate64, "%u, %jd"), args->fd, 1010 (intmax_t)args->length); 1011#endif 1012 1013 sa.fd = args->fd; 1014 sa.pad = 0; 1015 sa.length = args->length; 1016 return ftruncate(td, &sa); 1017} 1018 1019int 1020linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap) 1021{ 1022 struct timeval atv; 1023 l_timeval atv32; 1024 struct timezone rtz; 1025 int error = 0; 1026 1027 if (uap->tp) { 1028 microtime(&atv); 1029 atv32.tv_sec = atv.tv_sec; 1030 atv32.tv_usec = atv.tv_usec; 1031 error = copyout(&atv32, uap->tp, sizeof (atv32)); 1032 } 1033 if (error == 0 && uap->tzp != NULL) { 1034 rtz.tz_minuteswest = tz_minuteswest; 1035 rtz.tz_dsttime = tz_dsttime; 1036 error = copyout(&rtz, uap->tzp, sizeof (rtz)); 1037 } 1038 return (error); 1039} 1040 1041int 1042linux_nanosleep(struct thread *td, struct linux_nanosleep_args *uap) 1043{ 1044 struct timespec rqt, rmt; 1045 struct l_timespec ats32; 1046 int error; 1047 1048 error = copyin(uap->rqtp, &ats32, sizeof(ats32)); 1049 if (error != 0) 1050 return (error); 1051 rqt.tv_sec = ats32.tv_sec; 1052 rqt.tv_nsec = ats32.tv_nsec; 1053 error = kern_nanosleep(td, &rqt, &rmt); 1054 if (uap->rmtp != NULL) { 1055 ats32.tv_sec = rmt.tv_sec; 1056 ats32.tv_nsec = rmt.tv_nsec; 1057 error = copyout(&ats32, uap->rmtp, sizeof(ats32)); 1058 } 1059 return (error); 1060} 1061 1062int 1063linux_getrusage(struct thread *td, struct linux_getrusage_args *uap) 1064{ 1065 struct l_rusage s32; 1066 struct rusage s; 1067 int error; 1068 1069 error = kern_getrusage(td, uap->who, &s); 1070 if (error != 0) 1071 return (error); 1072 if (uap->rusage != NULL) { 1073 s32.ru_utime.tv_sec = s.ru_utime.tv_sec; 1074 s32.ru_utime.tv_usec = s.ru_utime.tv_usec; 1075 s32.ru_stime.tv_sec = s.ru_stime.tv_sec; 1076 s32.ru_stime.tv_usec = s.ru_stime.tv_usec; 1077 s32.ru_maxrss = s.ru_maxrss; 1078 s32.ru_ixrss = s.ru_ixrss; 1079 s32.ru_idrss = s.ru_idrss; 1080 s32.ru_isrss = s.ru_isrss; 1081 s32.ru_minflt = s.ru_minflt; 1082 s32.ru_majflt = s.ru_majflt; 1083 s32.ru_nswap = s.ru_nswap; 1084 s32.ru_inblock = s.ru_inblock; 1085 s32.ru_oublock = s.ru_oublock; 1086 s32.ru_msgsnd = s.ru_msgsnd; 1087 s32.ru_msgrcv = s.ru_msgrcv; 1088 s32.ru_nsignals = s.ru_nsignals; 1089 s32.ru_nvcsw = s.ru_nvcsw; 1090 s32.ru_nivcsw = s.ru_nivcsw; 1091 error = copyout(&s32, uap->rusage, sizeof(s32)); 1092 } 1093 return (error); 1094} 1095 1096int 1097linux_sched_rr_get_interval(struct thread *td, 1098 struct linux_sched_rr_get_interval_args *uap) 1099{ 1100 struct timespec ts; 1101 struct l_timespec ts32; 1102 int error; 1103 1104 error = kern_sched_rr_get_interval(td, uap->pid, &ts); 1105 if (error != 0) 1106 return (error); 1107 ts32.tv_sec = ts.tv_sec; 1108 ts32.tv_nsec = ts.tv_nsec; 1109 return (copyout(&ts32, uap->interval, sizeof(ts32))); 1110} 1111 1112int 1113linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) 1114{ 1115 struct mprotect_args bsd_args; 1116 1117 bsd_args.addr = uap->addr; 1118 bsd_args.len = uap->len; 1119 bsd_args.prot = uap->prot; 1120 /* XXX PROT_READ implies PROT_EXEC; see linux_mmap_common(). */ 1121 if ((bsd_args.prot & PROT_READ) != 0) 1122 bsd_args.prot |= PROT_EXEC; 1123 return (mprotect(td, &bsd_args)); 1124} 1125