linux_misc.c revision 50818
1/*-
3 * Copyright (c) 1994-1995 Søren Schmidt
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software withough specific prior written permission
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 * $FreeBSD: head/sys/compat/linux/linux_misc.c 50818 1999-09-02 21:50:42Z marcel $
29 */
30
31#include "opt_compat.h"
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/sysproto.h>
36#include <sys/kernel.h>
37#include <sys/mman.h>
38#include <sys/proc.h>
39#include <sys/fcntl.h>
40#include <sys/imgact_aout.h>
41#include <sys/mount.h>
42#include <sys/namei.h>
43#include <sys/resourcevar.h>
44#include <sys/stat.h>
45#include <sys/sysctl.h>
46#include <sys/unistd.h>
47#include <sys/vnode.h>
48#include <sys/wait.h>
49#include <sys/time.h>
50
51#include <vm/vm.h>
52#include <vm/pmap.h>
53#include <vm/vm_kern.h>
54#include <vm/vm_prot.h>
55#include <vm/vm_map.h>
56#include <vm/vm_extern.h>
57
58#include <machine/frame.h>
59#include <machine/psl.h>
60#include <machine/sysarch.h>
61#include <machine/segments.h>
62
63#include <i386/linux/linux.h>
64#include <i386/linux/linux_proto.h>
65#include <i386/linux/linux_util.h>
66#include <i386/linux/linux_mib.h>
67
68#include <posix4/sched.h>
69
70static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] =
71{ RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK,
72  RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE,
73  RLIMIT_MEMLOCK, -1
74};
75
76int
77linux_alarm(struct proc *p, struct linux_alarm_args *args)
78{
79    struct itimerval it, old_it;
80    struct timeval tv;
81    int s;
82
83#ifdef DEBUG
84    printf("Linux-emul(%ld): alarm(%u)\n", (long)p->p_pid, args->secs);
85#endif
86    if (args->secs > 100000000)
87	return EINVAL;
88    it.it_value.tv_sec = (long)args->secs;
89    it.it_value.tv_usec = 0;
90    it.it_interval.tv_sec = 0;
91    it.it_interval.tv_usec = 0;
92    s = splsoftclock();
93    old_it = p->p_realtimer;
94    getmicrouptime(&tv);
95    if (timevalisset(&old_it.it_value))
96	untimeout(realitexpire, (caddr_t)p, p->p_ithandle);
97    if (it.it_value.tv_sec != 0) {
98	p->p_ithandle = timeout(realitexpire, (caddr_t)p, tvtohz(&it.it_value));
99	timevaladd(&it.it_value, &tv);
100    }
101    p->p_realtimer = it;
102    splx(s);
103    if (timevalcmp(&old_it.it_value, &tv, >)) {
104	timevalsub(&old_it.it_value, &tv);
105	if (old_it.it_value.tv_usec != 0)
106	    old_it.it_value.tv_sec++;
107	p->p_retval[0] = old_it.it_value.tv_sec;
108    }
109    return 0;
110}
111
112int
113linux_brk(struct proc *p, struct linux_brk_args *args)
114{
115#if 0
116    struct vmspace *vm = p->p_vmspace;
117    vm_offset_t new, old;
118    int error;
119
120    if ((vm_offset_t)args->dsend < (vm_offset_t)vm->vm_daddr)
121	return EINVAL;
122    if (((caddr_t)args->dsend - (caddr_t)vm->vm_daddr)
123	> p->p_rlimit[RLIMIT_DATA].rlim_cur)
124	return ENOMEM;
125
126    old = round_page((vm_offset_t)vm->vm_daddr) + ctob(vm->vm_dsize);
127    new = round_page((vm_offset_t)args->dsend);
128    p->p_retval[0] = old;
129    if ((new-old) > 0) {
130	if (swap_pager_full)
131	    return ENOMEM;
132	error = vm_map_find(&vm->vm_map, NULL, 0, &old, (new-old), FALSE,
133			VM_PROT_ALL, VM_PROT_ALL, 0);
134	if (error)
135	    return error;
136	vm->vm_dsize += btoc((new-old));
137	p->p_retval[0] = (int)(vm->vm_daddr + ctob(vm->vm_dsize));
138    }
139    return 0;
140#else
141    struct vmspace *vm = p->p_vmspace;
142    vm_offset_t new, old;
143    struct obreak_args /* {
144	char * nsize;
145    } */ tmp;
146
147#ifdef DEBUG
148    printf("Linux-emul(%ld): brk(%p)\n", (long)p->p_pid, (void *)args->dsend);
149#endif
150    old = (vm_offset_t)vm->vm_daddr + ctob(vm->vm_dsize);
151    new = (vm_offset_t)args->dsend;
152    tmp.nsize = (char *) new;
153    if (((caddr_t)new > vm->vm_daddr) && !obreak(p, &tmp))
154	p->p_retval[0] = (int)new;
155    else
156	p->p_retval[0] = (int)old;
157
158    return 0;
159#endif
160}
161
162int
163linux_uselib(struct proc *p, struct linux_uselib_args *args)
164{
165    struct nameidata ni;
166    struct vnode *vp;
167    struct exec *a_out;
168    struct vattr attr;
169    vm_offset_t vmaddr;
170    unsigned long file_offset;
171    vm_offset_t buffer;
172    unsigned long bss_size;
173    int error;
174    caddr_t sg;
175    int locked;
176
177    sg = stackgap_init();
178    CHECKALTEXIST(p, &sg, args->library);
179
180#ifdef DEBUG
181    printf("Linux-emul(%ld): uselib(%s)\n", (long)p->p_pid, args->library);
182#endif
183
184    a_out = NULL;
185    locked = 0;
186    vp = NULL;
187
188    NDINIT(&ni, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, args->library, p);
189    error = namei(&ni);
190    if (error)
191	goto cleanup;
192
193    vp = ni.ni_vp;
194    if (vp == NULL) {
195	error = ENOEXEC;	/* ?? */
196	goto cleanup;
197    }
198
199    /*
200     * From here on down, we have a locked vnode that must be unlocked.
201     */
202    locked++;
203
204    /*
205     * Writable?
206     */
207    if (vp->v_writecount) {
208	error = ETXTBSY;
209	goto cleanup;
210    }
211
212    /*
213     * Executable?
214     */
215    error = VOP_GETATTR(vp, &attr, p->p_ucred, p);
216    if (error)
217	goto cleanup;
218
219    if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
220	((attr.va_mode & 0111) == 0) ||
221	(attr.va_type != VREG)) {
222	    error = ENOEXEC;
223	    goto cleanup;
224    }
225
226    /*
227     * Sensible size?
228     */
229    if (attr.va_size == 0) {
230	error = ENOEXEC;
231	goto cleanup;
232    }
233
234    /*
235     * Can we access it?
236     */
237    error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p);
238    if (error)
239	goto cleanup;
240
241    error = VOP_OPEN(vp, FREAD, p->p_ucred, p);
242    if (error)
243	goto cleanup;
244
245    /*
246     * Lock no longer needed
247     */
248    VOP_UNLOCK(vp, 0, p);
249    locked = 0;
250
251    /*
252     * Pull in executable header into kernel_map
253     */
254    error = vm_mmap(kernel_map, (vm_offset_t *)&a_out, PAGE_SIZE,
255	    	    VM_PROT_READ, VM_PROT_READ, 0, (caddr_t)vp, 0);
256    if (error)
257	goto cleanup;
258
259    /*
260     * Is it a Linux binary ?
261     */
262    if (((a_out->a_magic >> 16) & 0xff) != 0x64) {
263	error = ENOEXEC;
264	goto cleanup;
265    }
266
267    /* While we are here, we should REALLY do some more checks */
268
269    /*
270     * Set file/virtual offset based on a.out variant.
271     */
272    switch ((int)(a_out->a_magic & 0xffff)) {
273    case 0413:	/* ZMAGIC */
274	file_offset = 1024;
275	break;
276    case 0314:	/* QMAGIC */
277	file_offset = 0;
278	break;
279    default:
280	error = ENOEXEC;
281	goto cleanup;
282    }
283
284    bss_size = round_page(a_out->a_bss);
285
286    /*
287     * Check various fields in header for validity/bounds.
288     */
289    if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) {
290	error = ENOEXEC;
291	goto cleanup;
292    }
293
294    /* text + data can't exceed file size */
295    if (a_out->a_data + a_out->a_text > attr.va_size) {
296	error = EFAULT;
297	goto cleanup;
298    }
299
300    /*
301     * text/data/bss must not exceed limits
302     * XXX: this is not complete. it should check current usage PLUS
303     * the resources needed by this library.
304     */
305    if (a_out->a_text > MAXTSIZ ||
306	a_out->a_data + bss_size > p->p_rlimit[RLIMIT_DATA].rlim_cur) {
307	error = ENOMEM;
308	goto cleanup;
309    }
310
311    /*
312     * prevent more writers
313     */
314    vp->v_flag |= VTEXT;
315
316    /*
317     * Check if file_offset page aligned,.
318     * Currently we cannot handle misalinged file offsets,
319     * and so we read in the entire image (what a waste).
320     */
321    if (file_offset & PAGE_MASK) {
322#ifdef DEBUG
323printf("uselib: Non page aligned binary %lu\n", file_offset);
324#endif
325	/*
326	 * Map text+data read/write/execute
327	 */
328
329	/* a_entry is the load address and is page aligned */
330	vmaddr = trunc_page(a_out->a_entry);
331
332	/* get anon user mapping, read+write+execute */
333	error = vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &vmaddr,
334		    	    a_out->a_text + a_out->a_data, FALSE,
335			    VM_PROT_ALL, VM_PROT_ALL, 0);
336	if (error)
337	    goto cleanup;
338
339	/* map file into kernel_map */
340	error = vm_mmap(kernel_map, &buffer,
341			round_page(a_out->a_text + a_out->a_data + file_offset),
342		   	VM_PROT_READ, VM_PROT_READ, 0,
343			(caddr_t)vp, trunc_page(file_offset));
344	if (error)
345	    goto cleanup;
346
347	/* copy from kernel VM space to user space */
348	error = copyout((caddr_t)(void *)(uintptr_t)(buffer + file_offset),
349			(caddr_t)vmaddr, a_out->a_text + a_out->a_data);
350
351	/* release temporary kernel space */
352	vm_map_remove(kernel_map, buffer,
353		      buffer + round_page(a_out->a_text + a_out->a_data + file_offset));
354
355	if (error)
356	    goto cleanup;
357    }
358    else {
359#ifdef DEBUG
360printf("uselib: Page aligned binary %lu\n", file_offset);
361#endif
362	/*
363	 * for QMAGIC, a_entry is 20 bytes beyond the load address
364	 * to skip the executable header
365	 */
366	vmaddr = trunc_page(a_out->a_entry);
367
368	/*
369	 * Map it all into the process's space as a single copy-on-write
370	 * "data" segment.
371	 */
372	error = vm_mmap(&p->p_vmspace->vm_map, &vmaddr,
373		   	a_out->a_text + a_out->a_data,
374			VM_PROT_ALL, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED,
375			(caddr_t)vp, file_offset);
376	if (error)
377	    goto cleanup;
378    }
379#ifdef DEBUG
380printf("mem=%08x = %08x %08x\n", vmaddr, ((int*)vmaddr)[0], ((int*)vmaddr)[1]);
381#endif
382    if (bss_size != 0) {
383        /*
384	 * Calculate BSS start address
385	 */
386	vmaddr = trunc_page(a_out->a_entry) + a_out->a_text + a_out->a_data;
387
388	/*
389	 * allocate some 'anon' space
390	 */
391	error = vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &vmaddr,
392			    bss_size, FALSE,
393			    VM_PROT_ALL, VM_PROT_ALL, 0);
394	if (error)
395	    goto cleanup;
396    }
397
398cleanup:
399    /*
400     * Unlock vnode if needed
401     */
402    if (locked)
403	VOP_UNLOCK(vp, 0, p);
404
405    /*
406     * Release the kernel mapping.
407     */
408    if (a_out)
409	vm_map_remove(kernel_map, (vm_offset_t)a_out, (vm_offset_t)a_out + PAGE_SIZE);
410
411    return error;
412}
413
/*
 * XXX move
 *
 * Argument block read from user space by linux_select(); its members
 * are copied one-for-one into the linux_newselect() arguments.
 */
struct linux_select_argv {
	int nfds;			/* forwarded as newselect nfds */
	fd_set *readfds;		/* forwarded as newselect readfds */
	fd_set *writefds;		/* forwarded as newselect writefds */
	fd_set *exceptfds;		/* forwarded as newselect exceptfds */
	struct timeval *timeout;	/* forwarded as newselect timeout */
};
422
423int
424linux_select(struct proc *p, struct linux_select_args *args)
425{
426    struct linux_select_argv linux_args;
427    struct linux_newselect_args newsel;
428    int error;
429
430#ifdef SELECT_DEBUG
431    printf("Linux-emul(%ld): select(%x)\n", (long)p->p_pid, args->ptr);
432#endif
433    if ((error = copyin((caddr_t)args->ptr, (caddr_t)&linux_args,
434			sizeof(linux_args))))
435	return error;
436
437    newsel.nfds = linux_args.nfds;
438    newsel.readfds = linux_args.readfds;
439    newsel.writefds = linux_args.writefds;
440    newsel.exceptfds = linux_args.exceptfds;
441    newsel.timeout = linux_args.timeout;
442
443    return linux_newselect(p, &newsel);
444}
445
446int
447linux_newselect(struct proc *p, struct linux_newselect_args *args)
448{
449    struct select_args bsa;
450    struct timeval tv0, tv1, utv, *tvp;
451    caddr_t sg;
452    int error;
453
454#ifdef DEBUG
455    printf("Linux-emul(%ld): newselect(%d, %p, %p, %p, %p)\n",
456  	(long)p->p_pid, args->nfds, (void *)args->readfds,
457	(void *)args->writefds, (void *)args->exceptfds,
458	(void *)args->timeout);
459#endif
460    error = 0;
461    bsa.nd = args->nfds;
462    bsa.in = args->readfds;
463    bsa.ou = args->writefds;
464    bsa.ex = args->exceptfds;
465    bsa.tv = args->timeout;
466
467    /*
468     * Store current time for computation of the amount of
469     * time left.
470     */
471    if (args->timeout) {
472	if ((error = copyin(args->timeout, &utv, sizeof(utv))))
473	    goto select_out;
474#ifdef DEBUG
475	printf("Linux-emul(%ld): incoming timeout (%ld/%ld)\n",
476	    (long)p->p_pid, utv.tv_sec, utv.tv_usec);
477#endif
478	if (itimerfix(&utv)) {
479	    /*
480	     * The timeval was invalid.  Convert it to something
481	     * valid that will act as it does under Linux.
482	     */
483	    sg = stackgap_init();
484	    tvp = stackgap_alloc(&sg, sizeof(utv));
485	    utv.tv_sec += utv.tv_usec / 1000000;
486	    utv.tv_usec %= 1000000;
487	    if (utv.tv_usec < 0) {
488		utv.tv_sec -= 1;
489		utv.tv_usec += 1000000;
490	    }
491	    if (utv.tv_sec < 0)
492		timevalclear(&utv);
493	    if ((error = copyout(&utv, tvp, sizeof(utv))))
494		goto select_out;
495	    bsa.tv = tvp;
496	}
497	microtime(&tv0);
498    }
499
500    error = select(p, &bsa);
501#ifdef DEBUG
502    printf("Linux-emul(%ld): real select returns %d\n", (long)p->p_pid, error);
503#endif
504
505    if (error) {
506	/*
507	 * See fs/select.c in the Linux kernel.  Without this,
508	 * Maelstrom doesn't work.
509	 */
510	if (error == ERESTART)
511	    error = EINTR;
512	goto select_out;
513    }
514
515    if (args->timeout) {
516	if (p->p_retval[0]) {
517	    /*
518	     * Compute how much time was left of the timeout,
519	     * by subtracting the current time and the time
520	     * before we started the call, and subtracting
521	     * that result from the user-supplied value.
522	     */
523	    microtime(&tv1);
524	    timevalsub(&tv1, &tv0);
525	    timevalsub(&utv, &tv1);
526	    if (utv.tv_sec < 0)
527		timevalclear(&utv);
528	} else
529	    timevalclear(&utv);
530#ifdef DEBUG
531	printf("Linux-emul(%ld): outgoing timeout (%ld/%ld)\n",
532	    (long)p->p_pid, utv.tv_sec, utv.tv_usec);
533#endif
534	if ((error = copyout(&utv, args->timeout, sizeof(utv))))
535	    goto select_out;
536    }
537
538select_out:
539#ifdef DEBUG
540    printf("Linux-emul(%ld): newselect_out -> %d\n", (long)p->p_pid, error);
541#endif
542    return error;
543}
544
545int
546linux_getpgid(struct proc *p, struct linux_getpgid_args *args)
547{
548    struct proc *curp;
549
550#ifdef DEBUG
551    printf("Linux-emul(%ld): getpgid(%d)\n", (long)p->p_pid, args->pid);
552#endif
553    if (args->pid != p->p_pid) {
554	if (!(curp = pfind(args->pid)))
555	    return ESRCH;
556    }
557    else
558	curp = p;
559    p->p_retval[0] = curp->p_pgid;
560    return 0;
561}
562
563int
564linux_fork(struct proc *p, struct linux_fork_args *args)
565{
566    int error;
567
568#ifdef DEBUG
569    printf("Linux-emul(%ld): fork()\n", (long)p->p_pid);
570#endif
571    if ((error = fork(p, (struct fork_args *)args)) != 0)
572	return error;
573    if (p->p_retval[1] == 1)
574	p->p_retval[0] = 0;
575    return 0;
576}
577
578int
579linux_vfork(struct proc *p, struct linux_vfork_args *args)
580{
581	int error;
582
583#ifdef DEBUG
584	printf("Linux-emul(%ld): vfork()\n", (long)p->p_pid);
585#endif
586
587	if ((error = vfork(p, (struct vfork_args *)args)) != 0)
588		return error;
589	/* Are we the child? */
590	if (p->p_retval[1] == 1)
591		p->p_retval[0] = 0;
592	return 0;
593}
594
595#define CLONE_VM	0x100
596#define CLONE_FS	0x200
597#define CLONE_FILES	0x400
598#define CLONE_SIGHAND	0x800
599#define CLONE_PID	0x1000
600
601int
602linux_clone(struct proc *p, struct linux_clone_args *args)
603{
604    int error, ff = RFPROC;
605    struct proc *p2;
606    int            exit_signal;
607    vm_offset_t    start;
608    struct rfork_args rf_args;
609
610#ifdef DEBUG
611    if (args->flags & CLONE_PID)
612	printf("linux_clone(%ld): CLONE_PID not yet supported\n",
613	       (long)p->p_pid);
614    printf("linux_clone(%ld): invoked with flags %x and stack %x\n",
615	   (long)p->p_pid, (unsigned int)args->flags,
616	   (unsigned int)args->stack);
617#endif
618
619    if (!args->stack)
620        return (EINVAL);
621
622    exit_signal = args->flags & 0x000000ff;
623    if (exit_signal >= LINUX_NSIG)
624	return EINVAL;
625    exit_signal = linux_to_bsd_signal[exit_signal];
626
627    /* RFTHREAD probably not necessary here, but it shouldn't hurt either */
628    ff |= RFTHREAD;
629
630    if (args->flags & CLONE_VM)
631	ff |= RFMEM;
632    if (args->flags & CLONE_SIGHAND)
633	ff |= RFSIGSHARE;
634    if (!(args->flags & CLONE_FILES))
635	ff |= RFFDG;
636
637    error = 0;
638    start = 0;
639
640    rf_args.flags = ff;
641    if ((error = rfork(p, &rf_args)) != 0)
642	return error;
643
644    p2 = pfind(p->p_retval[0]);
645    if (p2 == 0)
646 	return ESRCH;
647
648    p2->p_sigparent = exit_signal;
649    p2->p_md.md_regs->tf_esp = (unsigned int)args->stack;
650
651#ifdef DEBUG
652    printf ("linux_clone(%ld): successful rfork to %ld\n",
653	    (long)p->p_pid, (long)p2->p_pid);
654#endif
655    return 0;
656}
657
658/* XXX move */
659struct linux_mmap_argv {
660	linux_caddr_t addr;
661	int len;
662	int prot;
663	int flags;
664	int fd;
665	int pos;
666};
667
668#define STACK_SIZE  (2 * 1024 * 1024)
669#define GUARD_SIZE  (4 * PAGE_SIZE)
670int
671linux_mmap(struct proc *p, struct linux_mmap_args *args)
672{
673    struct mmap_args /* {
674	caddr_t addr;
675	size_t len;
676	int prot;
677	int flags;
678	int fd;
679	long pad;
680	off_t pos;
681    } */ bsd_args;
682    int error;
683    struct linux_mmap_argv linux_args;
684
685    if ((error = copyin((caddr_t)args->ptr, (caddr_t)&linux_args,
686			sizeof(linux_args))))
687	return error;
688#ifdef DEBUG
689    printf("Linux-emul(%ld): mmap(%p, %d, %d, %08x, %d, %d)\n",
690	(long)p->p_pid, (void *)linux_args.addr, linux_args.len,
691	linux_args.prot, linux_args.flags, linux_args.fd, linux_args.pos);
692#endif
693    bsd_args.flags = 0;
694    if (linux_args.flags & LINUX_MAP_SHARED)
695	bsd_args.flags |= MAP_SHARED;
696    if (linux_args.flags & LINUX_MAP_PRIVATE)
697	bsd_args.flags |= MAP_PRIVATE;
698    if (linux_args.flags & LINUX_MAP_FIXED)
699	bsd_args.flags |= MAP_FIXED;
700    if (linux_args.flags & LINUX_MAP_ANON)
701	bsd_args.flags |= MAP_ANON;
702    if (linux_args.flags & LINUX_MAP_GROWSDOWN) {
703	bsd_args.flags |= MAP_STACK;
704
705	/* The linux MAP_GROWSDOWN option does not limit auto
706	 * growth of the region.  Linux mmap with this option
707	 * takes as addr the inital BOS, and as len, the initial
708	 * region size.  It can then grow down from addr without
709	 * limit.  However, linux threads has an implicit internal
710	 * limit to stack size of STACK_SIZE.  Its just not
711	 * enforced explicitly in linux.  But, here we impose
712	 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
713	 * region, since we can do this with our mmap.
714	 *
715	 * Our mmap with MAP_STACK takes addr as the maximum
716	 * downsize limit on BOS, and as len the max size of
717	 * the region.  It them maps the top SGROWSIZ bytes,
718	 * and autgrows the region down, up to the limit
719	 * in addr.
720	 *
721	 * If we don't use the MAP_STACK option, the effect
722	 * of this code is to allocate a stack region of a
723	 * fixed size of (STACK_SIZE - GUARD_SIZE).
724	 */
725
726	/* This gives us TOS */
727	bsd_args.addr = linux_args.addr + linux_args.len;
728
729	/* This gives us our maximum stack size */
730	if (linux_args.len > STACK_SIZE - GUARD_SIZE)
731	    bsd_args.len = linux_args.len;
732	else
733	    bsd_args.len  = STACK_SIZE - GUARD_SIZE;
734
735	/* This gives us a new BOS.  If we're using VM_STACK, then
736	 * mmap will just map the top SGROWSIZ bytes, and let
737	 * the stack grow down to the limit at BOS.  If we're
738	 * not using VM_STACK we map the full stack, since we
739	 * don't have a way to autogrow it.
740	 */
741	bsd_args.addr -= bsd_args.len;
742
743    } else {
744	bsd_args.addr = linux_args.addr;
745	bsd_args.len  = linux_args.len;
746    }
747
748    bsd_args.prot = linux_args.prot | PROT_READ;	/* always required */
749    bsd_args.fd = linux_args.fd;
750    bsd_args.pos = linux_args.pos;
751    bsd_args.pad = 0;
752    return mmap(p, &bsd_args);
753}
754
755int
756linux_mremap(struct proc *p, struct linux_mremap_args *args)
757{
758	struct munmap_args /* {
759		void *addr;
760		size_t len;
761	} */ bsd_args;
762	int error = 0;
763
764#ifdef DEBUG
765	printf("Linux-emul(%ld): mremap(%p, %08x, %08x, %08x)\n",
766	    (long)p->p_pid, (void *)args->addr, args->old_len, args->new_len,
767	    args->flags);
768#endif
769	args->new_len = round_page(args->new_len);
770	args->old_len = round_page(args->old_len);
771
772	if (args->new_len > args->old_len) {
773		p->p_retval[0] = 0;
774		return ENOMEM;
775	}
776
777	if (args->new_len < args->old_len) {
778		bsd_args.addr = args->addr + args->new_len;
779		bsd_args.len = args->old_len - args->new_len;
780		error = munmap(p, &bsd_args);
781	}
782
783	p->p_retval[0] = error ? 0 : (int)args->addr;
784	return error;
785}
786
787int
788linux_msync(struct proc *p, struct linux_msync_args *args)
789{
790	struct msync_args bsd_args;
791
792	bsd_args.addr = args->addr;
793	bsd_args.len = args->len;
794	bsd_args.flags = 0;	/* XXX ignore */
795
796	return msync(p, &bsd_args);
797}
798
799int
800linux_pipe(struct proc *p, struct linux_pipe_args *args)
801{
802    int error;
803    int reg_edx;
804
805#ifdef DEBUG
806    printf("Linux-emul(%ld): pipe(*)\n", (long)p->p_pid);
807#endif
808    reg_edx = p->p_retval[1];
809    error = pipe(p, 0);
810    if (error) {
811	p->p_retval[1] = reg_edx;
812	return error;
813    }
814
815    error = copyout(p->p_retval, args->pipefds, 2*sizeof(int));
816    if (error) {
817	p->p_retval[1] = reg_edx;
818	return error;
819    }
820
821    p->p_retval[1] = reg_edx;
822    p->p_retval[0] = 0;
823    return 0;
824}
825
826int
827linux_time(struct proc *p, struct linux_time_args *args)
828{
829    struct timeval tv;
830    linux_time_t tm;
831    int error;
832
833#ifdef DEBUG
834    printf("Linux-emul(%ld): time(*)\n", (long)p->p_pid);
835#endif
836    microtime(&tv);
837    tm = tv.tv_sec;
838    if (args->tm && (error = copyout(&tm, args->tm, sizeof(linux_time_t))))
839	return error;
840    p->p_retval[0] = tm;
841    return 0;
842}
843
/*
 * Result buffer that linux_times() copies out to the caller; all four
 * values are in clock ticks as produced by CONVTCK().
 */
struct linux_times_argv {
    long    tms_utime;		/* user time of the process */
    long    tms_stime;		/* system time of the process */
    long    tms_cutime;		/* user time of its children */
    long    tms_cstime;		/* system time of its children */
};

#define CLK_TCK 100	/* Linux uses 100 */

/*
 * Convert a struct timeval (passed by value, as an lvalue expression)
 * to CLK_TCK ticks.  The argument is parenthesised so that expressions
 * such as *ptr can be passed safely; it is evaluated twice.
 */
#define CONVTCK(r)	((r).tv_sec * CLK_TCK + (r).tv_usec / (1000000 / CLK_TCK))
853
854int
855linux_times(struct proc *p, struct linux_times_args *args)
856{
857    struct timeval tv;
858    struct linux_times_argv tms;
859    struct rusage ru;
860    int error;
861
862#ifdef DEBUG
863    printf("Linux-emul(%ld): times(*)\n", (long)p->p_pid);
864#endif
865    calcru(p, &ru.ru_utime, &ru.ru_stime, NULL);
866
867    tms.tms_utime = CONVTCK(ru.ru_utime);
868    tms.tms_stime = CONVTCK(ru.ru_stime);
869
870    tms.tms_cutime = CONVTCK(p->p_stats->p_cru.ru_utime);
871    tms.tms_cstime = CONVTCK(p->p_stats->p_cru.ru_stime);
872
873    if ((error = copyout((caddr_t)&tms, (caddr_t)args->buf,
874	    	    sizeof(struct linux_times_argv))))
875	return error;
876
877    microuptime(&tv);
878    p->p_retval[0] = (int)CONVTCK(tv);
879    return 0;
880}
881
882int
883linux_newuname(struct proc *p, struct linux_newuname_args *args)
884{
885	struct linux_new_utsname utsname;
886	char *osrelease, *osname;
887
888#ifdef DEBUG
889	printf("Linux-emul(%ld): newuname(*)\n", (long)p->p_pid);
890#endif
891
892	osname = linux_get_osname(p);
893	osrelease = linux_get_osrelease(p);
894
895	bzero(&utsname, sizeof(struct linux_new_utsname));
896	strncpy(utsname.sysname, osname, LINUX_MAX_UTSNAME-1);
897	strncpy(utsname.nodename, hostname, LINUX_MAX_UTSNAME-1);
898	strncpy(utsname.release, osrelease, LINUX_MAX_UTSNAME-1);
899	strncpy(utsname.version, version, LINUX_MAX_UTSNAME-1);
900	strncpy(utsname.machine, machine, LINUX_MAX_UTSNAME-1);
901	strncpy(utsname.domainname, domainname, LINUX_MAX_UTSNAME-1);
902
903	return (copyout((caddr_t)&utsname, (caddr_t)args->buf,
904			sizeof(struct linux_new_utsname)));
905}
906
907struct linux_utimbuf {
908	linux_time_t l_actime;
909	linux_time_t l_modtime;
910};
911
912int
913linux_utime(struct proc *p, struct linux_utime_args *args)
914{
915    struct utimes_args /* {
916	char	*path;
917	struct	timeval *tptr;
918    } */ bsdutimes;
919    struct timeval tv[2], *tvp;
920    struct linux_utimbuf lut;
921    int error;
922    caddr_t sg;
923
924    sg = stackgap_init();
925    CHECKALTEXIST(p, &sg, args->fname);
926
927#ifdef DEBUG
928    printf("Linux-emul(%ld): utime(%s, *)\n", (long)p->p_pid, args->fname);
929#endif
930    if (args->times) {
931	if ((error = copyin(args->times, &lut, sizeof lut)))
932	    return error;
933	tv[0].tv_sec = lut.l_actime;
934	tv[0].tv_usec = 0;
935	tv[1].tv_sec = lut.l_modtime;
936	tv[1].tv_usec = 0;
937	/* so that utimes can copyin */
938	tvp = (struct timeval *)stackgap_alloc(&sg, sizeof(tv));
939	if ((error = copyout(tv, tvp, sizeof(tv))))
940	    return error;
941	bsdutimes.tptr = tvp;
942    } else
943	bsdutimes.tptr = NULL;
944
945    bsdutimes.path = args->fname;
946    return utimes(p, &bsdutimes);
947}
948
949#define __WCLONE 0x80000000
950
951int
952linux_waitpid(struct proc *p, struct linux_waitpid_args *args)
953{
954    struct wait_args /* {
955	int pid;
956	int *status;
957	int options;
958	struct	rusage *rusage;
959    } */ tmp;
960    int error, tmpstat;
961
962#ifdef DEBUG
963    printf("Linux-emul(%ld): waitpid(%d, %p, %d)\n",
964	(long)p->p_pid, args->pid, (void *)args->status, args->options);
965#endif
966    tmp.pid = args->pid;
967    tmp.status = args->status;
968    tmp.options = (args->options & (WNOHANG | WUNTRACED));
969    /* WLINUXCLONE should be equal to __WCLONE, but we make sure */
970    if (args->options & __WCLONE)
971	tmp.options |= WLINUXCLONE;
972    tmp.rusage = NULL;
973
974    if ((error = wait4(p, &tmp)) != 0)
975	return error;
976
977    if (args->status) {
978	if ((error = copyin(args->status, &tmpstat, sizeof(int))) != 0)
979	    return error;
980	if (WIFSIGNALED(tmpstat))
981	    tmpstat = (tmpstat & 0xffffff80) |
982		      bsd_to_linux_signal[WTERMSIG(tmpstat)];
983	else if (WIFSTOPPED(tmpstat))
984	    tmpstat = (tmpstat & 0xffff00ff) |
985		      (bsd_to_linux_signal[WSTOPSIG(tmpstat)]<<8);
986	return copyout(&tmpstat, args->status, sizeof(int));
987    } else
988	return 0;
989}
990
991int
992linux_wait4(struct proc *p, struct linux_wait4_args *args)
993{
994    struct wait_args /* {
995	int pid;
996	int *status;
997	int options;
998	struct	rusage *rusage;
999    } */ tmp;
1000    int error, tmpstat;
1001
1002#ifdef DEBUG
1003    printf("Linux-emul(%ld): wait4(%d, %p, %d, %p)\n",
1004	(long)p->p_pid, args->pid, (void *)args->status, args->options,
1005	(void *)args->rusage);
1006#endif
1007    tmp.pid = args->pid;
1008    tmp.status = args->status;
1009    tmp.options = (args->options & (WNOHANG | WUNTRACED));
1010    /* WLINUXCLONE should be equal to __WCLONE, but we make sure */
1011    if (args->options & __WCLONE)
1012	tmp.options |= WLINUXCLONE;
1013    tmp.rusage = args->rusage;
1014
1015    if ((error = wait4(p, &tmp)) != 0)
1016	return error;
1017
1018    p->p_siglist &= ~sigmask(SIGCHLD);
1019
1020    if (args->status) {
1021	if ((error = copyin(args->status, &tmpstat, sizeof(int))) != 0)
1022	    return error;
1023	if (WIFSIGNALED(tmpstat))
1024	    tmpstat = (tmpstat & 0xffffff80) |
1025		  bsd_to_linux_signal[WTERMSIG(tmpstat)];
1026	else if (WIFSTOPPED(tmpstat))
1027	    tmpstat = (tmpstat & 0xffff00ff) |
1028		  (bsd_to_linux_signal[WSTOPSIG(tmpstat)]<<8);
1029	return copyout(&tmpstat, args->status, sizeof(int));
1030    } else
1031	return 0;
1032}
1033
1034int
1035linux_mknod(struct proc *p, struct linux_mknod_args *args)
1036{
1037	caddr_t sg;
1038	struct mknod_args bsd_mknod;
1039	struct mkfifo_args bsd_mkfifo;
1040
1041	sg = stackgap_init();
1042
1043	CHECKALTCREAT(p, &sg, args->path);
1044
1045#ifdef DEBUG
1046	printf("Linux-emul(%ld): mknod(%s, %d, %d)\n",
1047	   (long)p->p_pid, args->path, args->mode, args->dev);
1048#endif
1049
1050	if (args->mode & S_IFIFO) {
1051		bsd_mkfifo.path = args->path;
1052		bsd_mkfifo.mode = args->mode;
1053		return mkfifo(p, &bsd_mkfifo);
1054	} else {
1055		bsd_mknod.path = args->path;
1056		bsd_mknod.mode = args->mode;
1057		bsd_mknod.dev = args->dev;
1058		return mknod(p, &bsd_mknod);
1059	}
1060}
1061
1062/*
1063 * UGH! This is just about the dumbest idea I've ever heard!!
1064 */
1065int
1066linux_personality(struct proc *p, struct linux_personality_args *args)
1067{
1068#ifdef DEBUG
1069	printf("Linux-emul(%ld): personality(%d)\n",
1070	   (long)p->p_pid, args->per);
1071#endif
1072	if (args->per != 0)
1073		return EINVAL;
1074
1075	/* Yes Jim, it's still a Linux... */
1076	p->p_retval[0] = 0;
1077	return 0;
1078}
1079
1080/*
1081 * Wrappers for get/setitimer for debugging..
1082 */
1083int
1084linux_setitimer(struct proc *p, struct linux_setitimer_args *args)
1085{
1086	struct setitimer_args bsa;
1087	struct itimerval foo;
1088	int error;
1089
1090#ifdef DEBUG
1091	printf("Linux-emul(%ld): setitimer(%p, %p)\n",
1092	    (long)p->p_pid, (void *)args->itv, (void *)args->oitv);
1093#endif
1094	bsa.which = args->which;
1095	bsa.itv = args->itv;
1096	bsa.oitv = args->oitv;
1097	if (args->itv) {
1098	    if ((error = copyin((caddr_t)args->itv, (caddr_t)&foo,
1099			sizeof(foo))))
1100		return error;
1101#ifdef DEBUG
1102	    printf("setitimer: value: sec: %ld, usec: %ld\n",
1103		foo.it_value.tv_sec, foo.it_value.tv_usec);
1104	    printf("setitimer: interval: sec: %ld, usec: %ld\n",
1105		foo.it_interval.tv_sec, foo.it_interval.tv_usec);
1106#endif
1107	}
1108	return setitimer(p, &bsa);
1109}
1110
1111int
1112linux_getitimer(struct proc *p, struct linux_getitimer_args *args)
1113{
1114	struct getitimer_args bsa;
1115#ifdef DEBUG
1116	printf("Linux-emul(%ld): getitimer(%p)\n",
1117	    (long)p->p_pid, (void *)args->itv);
1118#endif
1119	bsa.which = args->which;
1120	bsa.itv = args->itv;
1121	return getitimer(p, &bsa);
1122}
1123
1124int
1125linux_iopl(struct proc *p, struct linux_iopl_args *args)
1126{
1127	int error;
1128
1129	error = suser(p);
1130	if (error != 0)
1131		return error;
1132	if (securelevel > 0)
1133		return EPERM;
1134	p->p_md.md_regs->tf_eflags |= PSL_IOPL;
1135	return 0;
1136}
1137
1138int
1139linux_nice(struct proc *p, struct linux_nice_args *args)
1140{
1141	struct setpriority_args	bsd_args;
1142
1143	bsd_args.which = PRIO_PROCESS;
1144	bsd_args.who = 0;	/* current process */
1145	bsd_args.prio = args->inc;
1146	return setpriority(p, &bsd_args);
1147}
1148
1149int
1150linux_setgroups(p, uap)
1151	struct proc *p;
1152	struct linux_setgroups_args *uap;
1153{
1154	struct pcred *pc;
1155	linux_gid_t linux_gidset[NGROUPS];
1156	gid_t *bsd_gidset;
1157	int ngrp, error;
1158
1159	pc = p->p_cred;
1160	ngrp = uap->gidsetsize;
1161
1162	/*
1163	 * cr_groups[0] holds egid. Setting the whole set from
1164	 * the supplied set will cause egid to be changed too.
1165	 * Keep cr_groups[0] unchanged to prevent that.
1166	 */
1167
1168	if ((error = suser(p)) != 0)
1169		return (error);
1170
1171	if (ngrp >= NGROUPS)
1172		return (EINVAL);
1173
1174	pc->pc_ucred = crcopy(pc->pc_ucred);
1175	if (ngrp > 0) {
1176		error = copyin((caddr_t)uap->gidset, (caddr_t)linux_gidset,
1177			       ngrp * sizeof(linux_gid_t));
1178		if (error)
1179			return (error);
1180
1181		pc->pc_ucred->cr_ngroups = ngrp + 1;
1182
1183		bsd_gidset = pc->pc_ucred->cr_groups;
1184		ngrp--;
1185		while (ngrp >= 0) {
1186			bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
1187			ngrp--;
1188		}
1189	}
1190	else
1191		pc->pc_ucred->cr_ngroups = 1;
1192
1193	setsugid(p);
1194	return (0);
1195}
1196
1197int
1198linux_getgroups(p, uap)
1199	struct proc *p;
1200	struct linux_getgroups_args *uap;
1201{
1202	struct pcred *pc;
1203	linux_gid_t linux_gidset[NGROUPS];
1204	gid_t *bsd_gidset;
1205	int bsd_gidsetsz, ngrp, error;
1206
1207	pc = p->p_cred;
1208	bsd_gidset = pc->pc_ucred->cr_groups;
1209	bsd_gidsetsz = pc->pc_ucred->cr_ngroups - 1;
1210
1211	/*
1212	 * cr_groups[0] holds egid. Returning the whole set
1213	 * here will cause a duplicate. Exclude cr_groups[0]
1214	 * to prevent that.
1215	 */
1216
1217	if ((ngrp = uap->gidsetsize) == 0) {
1218		p->p_retval[0] = bsd_gidsetsz;
1219		return (0);
1220	}
1221
1222	if (ngrp < bsd_gidsetsz)
1223		return (EINVAL);
1224
1225	ngrp = 0;
1226	while (ngrp < bsd_gidsetsz) {
1227		linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
1228		ngrp++;
1229	}
1230
1231	if ((error = copyout((caddr_t)linux_gidset, (caddr_t)uap->gidset,
1232	    ngrp * sizeof(linux_gid_t))))
1233		return (error);
1234
1235	p->p_retval[0] = ngrp;
1236	return (0);
1237}
1238
1239int
1240linux_setrlimit(p, uap)
1241     struct proc *p;
1242     struct linux_setrlimit_args *uap;
1243{
1244    struct osetrlimit_args bsd;
1245
1246#ifdef DEBUG
1247    printf("Linux-emul(%ld): setrlimit(%d, %p)\n",
1248	   (long)p->p_pid, uap->resource, (void *)uap->rlim);
1249#endif
1250
1251    if (uap->resource >= LINUX_RLIM_NLIMITS)
1252	return EINVAL;
1253
1254    bsd.which = linux_to_bsd_resource[uap->resource];
1255
1256    if (bsd.which == -1)
1257	return EINVAL;
1258
1259    bsd.rlp = uap->rlim;
1260    return osetrlimit(p, &bsd);
1261}
1262
1263int
1264linux_getrlimit(p, uap)
1265     struct proc *p;
1266     struct linux_getrlimit_args *uap;
1267{
1268    struct ogetrlimit_args bsd;
1269
1270#ifdef DEBUG
1271    printf("Linux-emul(%ld): getrlimit(%d, %p)\n",
1272	   (long)p->p_pid, uap->resource, (void *)uap->rlim);
1273#endif
1274
1275    if (uap->resource >= LINUX_RLIM_NLIMITS)
1276	return EINVAL;
1277
1278    bsd.which = linux_to_bsd_resource[uap->resource];
1279
1280    if (bsd.which == -1)
1281	return EINVAL;
1282
1283    bsd.rlp = uap->rlim;
1284    return ogetrlimit(p, &bsd);
1285}
1286
1287int
1288linux_sched_setscheduler(p, uap)
1289	struct proc *p;
1290	struct linux_sched_setscheduler_args *uap;
1291{
1292	struct sched_setscheduler_args bsd;
1293
1294#ifdef DEBUG
1295	printf("Linux-emul(%ld): sched_setscheduler(%d, %d, %p)\n",
1296	       (long)p->p_pid, uap->pid, uap->policy, (void *)uap->param);
1297#endif
1298
1299	switch (uap->policy) {
1300	case LINUX_SCHED_OTHER:
1301		bsd.policy = SCHED_OTHER;
1302		break;
1303	case LINUX_SCHED_FIFO:
1304		bsd.policy = SCHED_FIFO;
1305		break;
1306	case LINUX_SCHED_RR:
1307		bsd.policy = SCHED_RR;
1308		break;
1309	default:
1310		return EINVAL;
1311	}
1312
1313	bsd.pid = uap->pid;
1314	bsd.param = uap->param;
1315	return sched_setscheduler(p, &bsd);
1316}
1317
1318int
1319linux_sched_getscheduler(p, uap)
1320	struct proc *p;
1321	struct linux_sched_getscheduler_args *uap;
1322{
1323	struct sched_getscheduler_args bsd;
1324	int error;
1325
1326#ifdef DEBUG
1327	printf("Linux-emul(%ld): sched_getscheduler(%d)\n",
1328	       (long)p->p_pid, uap->pid);
1329#endif
1330
1331	bsd.pid = uap->pid;
1332	error = sched_getscheduler(p, &bsd);
1333
1334	switch (p->p_retval[0]) {
1335	case SCHED_OTHER:
1336		p->p_retval[0] = LINUX_SCHED_OTHER;
1337		break;
1338	case SCHED_FIFO:
1339		p->p_retval[0] = LINUX_SCHED_FIFO;
1340		break;
1341	case SCHED_RR:
1342		p->p_retval[0] = LINUX_SCHED_RR;
1343		break;
1344	}
1345
1346	return error;
1347}
1348
/*
 * Segment description copied in from the Linux process by the write_ldt
 * functions of linux_modify_ldt(), which converts it to a native
 * descriptor.  The field order and widths are the user-visible layout
 * and must not be changed.
 */
struct linux_descriptor {
	unsigned int	entry_number;		/* LDT slot to set */
	unsigned long	base_addr;		/* segment base address */
	unsigned int	limit;			/* segment limit; low 20 bits used */
	unsigned int	seg_32bit:1;		/* copied to sd_def32 */
	unsigned int	contents:2;		/* placed in sd_type bits 2-3 */
	unsigned int	read_exec_only:1;	/* inverted into sd_type bit 1 */
	unsigned int	limit_in_pages:1;	/* copied to sd_gran */
	unsigned int	seg_not_present:1;	/* inverted into sd_p */
	unsigned int	useable:1;		/* not consulted by linux_modify_ldt() */
};
1360
1361int
1362linux_modify_ldt(p, uap)
1363	struct proc *p;
1364	struct linux_modify_ldt_args *uap;
1365{
1366	int error;
1367	caddr_t sg;
1368	struct sysarch_args args;
1369	struct i386_ldt_args *ldt;
1370	struct linux_descriptor ld;
1371	union descriptor *desc;
1372
1373	sg = stackgap_init();
1374
1375	if (uap->ptr == NULL)
1376		return (EINVAL);
1377
1378	switch (uap->func) {
1379	case 0x00: /* read_ldt */
1380		ldt = stackgap_alloc(&sg, sizeof(*ldt));
1381		ldt->start = 0;
1382		ldt->desc = uap->ptr;
1383		ldt->num = uap->bytecount / sizeof(union descriptor);
1384		args.op = I386_GET_LDT;
1385		args.parms = (char*)ldt;
1386		error = sysarch(p, &args);
1387		p->p_retval[0] *= sizeof(union descriptor);
1388		break;
1389	case 0x01: /* write_ldt */
1390	case 0x11: /* write_ldt */
1391		if (uap->bytecount != sizeof(ld))
1392			return (EINVAL);
1393
1394		error = copyin(uap->ptr, &ld, sizeof(ld));
1395		if (error)
1396			return (error);
1397
1398		ldt = stackgap_alloc(&sg, sizeof(*ldt));
1399		desc = stackgap_alloc(&sg, sizeof(*desc));
1400		ldt->start = ld.entry_number;
1401		ldt->desc = desc;
1402		ldt->num = 1;
1403		desc->sd.sd_lolimit = (ld.limit & 0x0000ffff);
1404		desc->sd.sd_hilimit = (ld.limit & 0x000f0000) >> 16;
1405		desc->sd.sd_lobase = (ld.base_addr & 0x00ffffff);
1406		desc->sd.sd_hibase = (ld.base_addr & 0xff000000) >> 24;
1407		desc->sd.sd_type = SDT_MEMRO | ((ld.read_exec_only ^ 1) << 1) |
1408			(ld.contents << 2);
1409		desc->sd.sd_dpl = 3;
1410		desc->sd.sd_p = (ld.seg_not_present ^ 1);
1411		desc->sd.sd_xx = 0;
1412		desc->sd.sd_def32 = ld.seg_32bit;
1413		desc->sd.sd_gran = ld.limit_in_pages;
1414		args.op = I386_SET_LDT;
1415		args.parms = (char*)ldt;
1416		error = sysarch(p, &args);
1417		break;
1418	default:
1419		error = EINVAL;
1420		break;
1421	}
1422
1423	if (error == EOPNOTSUPP) {
1424		printf("linux: modify_ldt needs kernel option USER_LDT\n");
1425		error = ENOSYS;
1426	}
1427
1428	return (error);
1429}
1430