linux_machdep.c revision 67238
1/*-
2 * Copyright (c) 2000 Marcel Moolenaar
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 * $FreeBSD: head/sys/i386/linux/linux_machdep.c 67238 2000-10-17 00:25:43Z gallatin $
29 */
30
31#include <sys/param.h>
32#include <sys/mman.h>
33#include <sys/proc.h>
34#include <sys/sysproto.h>
35#include <sys/systm.h>
36#include <sys/unistd.h>
37#include <sys/resource.h>
38#include <sys/resourcevar.h>
39
40#include <machine/frame.h>
41#include <machine/psl.h>
42#include <machine/segments.h>
43#include <machine/sysarch.h>
44
45#include <vm/vm.h>
46#include <sys/lock.h>
47#include <vm/pmap.h>
48#include <vm/vm_map.h>
49
50
51#include <i386/linux/linux.h>
52#include <i386/linux/linux_proto.h>
53#include <compat/linux/linux_ipc.h>
54#include <compat/linux/linux_signal.h>
55#include <compat/linux/linux_util.h>
56
/*
 * Descriptor record copied in from the Linux process by
 * linux_modify_ldt() for the write_ldt functions.
 */
struct linux_descriptor {
	unsigned int  entry_number;	/* becomes the LDT start index */
	unsigned long base_addr;	/* split into sd_lobase/sd_hibase */
	unsigned int  limit;		/* split into sd_lolimit/sd_hilimit */
	unsigned int  seg_32bit:1;	/* copied to sd_def32 */
	unsigned int  contents:2;	/* shifted into bits 2-3 of sd_type */
	unsigned int  read_exec_only:1;	/* inverted into bit 1 of sd_type */
	unsigned int  limit_in_pages:1;	/* copied to sd_gran */
	unsigned int  seg_not_present:1; /* inverted into sd_p */
	unsigned int  useable:1;	/* not consumed in this file */
};
68
/*
 * Argument record for the old-style Linux select(): linux_select()
 * copies one of these in from user space and forwards the members
 * to linux_newselect().
 */
struct linux_select_argv {
	int nfds;			/* forwarded as newselect nfds */
	fd_set *readfds;		/* forwarded as newselect readfds */
	fd_set *writefds;		/* forwarded as newselect writefds */
	fd_set *exceptfds;		/* forwarded as newselect exceptfds */
	struct timeval *timeout;	/* forwarded as newselect timeout */
};
76
77int
78linux_to_bsd_sigaltstack(int lsa)
79{
80	int bsa = 0;
81
82	if (lsa & LINUX_SS_DISABLE)
83		bsa |= SS_DISABLE;
84	if (lsa & LINUX_SS_ONSTACK)
85		bsa |= SS_ONSTACK;
86	if (lsa == LINUX_SS_ONSTACK_BC)
87		bsa = SS_ONSTACK;
88	return (bsa);
89}
90
91int
92bsd_to_linux_sigaltstack(int bsa)
93{
94	int lsa = 0;
95
96	if (bsa & SS_DISABLE)
97		lsa |= LINUX_SS_DISABLE;
98	if (bsa & SS_ONSTACK)
99		lsa |= LINUX_SS_ONSTACK;
100	return (lsa);
101}
102
103int
104linux_execve(struct proc *p, struct linux_execve_args *args)
105{
106	struct execve_args bsd;
107	caddr_t sg;
108
109	sg = stackgap_init();
110	CHECKALTEXIST(p, &sg, args->path);
111
112#ifdef DEBUG
113        printf("Linux-emul(%d): execve(%s)\n",
114	    p->p_pid, args->path);
115#endif
116
117	bsd.fname = args->path;
118	bsd.argv = args->argp;
119	bsd.envv = args->envp;
120	return (execve(p, &bsd));
121}
122
123int
124linux_ipc(struct proc *p, struct linux_ipc_args *args)
125{
126	switch (args->what) {
127	case LINUX_SEMOP:
128		return (linux_semop(p, args));
129	case LINUX_SEMGET:
130		return (linux_semget(p, args));
131	case LINUX_SEMCTL:
132		return (linux_semctl(p, args));
133	case LINUX_MSGSND:
134		return (linux_msgsnd(p, args));
135	case LINUX_MSGRCV:
136		return (linux_msgrcv(p, args));
137	case LINUX_MSGGET:
138		return (linux_msgget(p, args));
139	case LINUX_MSGCTL:
140		return (linux_msgctl(p, args));
141	case LINUX_SHMAT:
142		return (linux_shmat(p, args));
143	case LINUX_SHMDT:
144		return (linux_shmdt(p, args));
145	case LINUX_SHMGET:
146		return (linux_shmget(p, args));
147	case LINUX_SHMCTL:
148		return (linux_shmctl(p, args));
149	}
150
151	uprintf("LINUX: 'ipc' typ=%d not implemented\n", args->what);
152	return (ENOSYS);
153}
154
155int
156linux_select(struct proc *p, struct linux_select_args *args)
157{
158	struct linux_select_argv linux_args;
159	struct linux_newselect_args newsel;
160	int error;
161
162#ifdef SELECT_DEBUG
163	printf("Linux-emul(%ld): select(%x)\n", (long)p->p_pid, args->ptr);
164#endif
165
166	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
167	if (error)
168		return (error);
169
170	newsel.nfds = linux_args.nfds;
171	newsel.readfds = linux_args.readfds;
172	newsel.writefds = linux_args.writefds;
173	newsel.exceptfds = linux_args.exceptfds;
174	newsel.timeout = linux_args.timeout;
175	return (linux_newselect(p, &newsel));
176}
177
178int
179linux_fork(struct proc *p, struct linux_fork_args *args)
180{
181	int error;
182
183#ifdef DEBUG
184	printf("Linux-emul(%ld): fork()\n", (long)p->p_pid);
185#endif
186
187	if ((error = fork(p, (struct fork_args *)args)) != 0)
188		return (error);
189
190	if (p->p_retval[1] == 1)
191		p->p_retval[0] = 0;
192	return (0);
193}
194
195int
196linux_vfork(struct proc *p, struct linux_vfork_args *args)
197{
198	int error;
199
200#ifdef DEBUG
201	printf("Linux-emul(%ld): vfork()\n", (long)p->p_pid);
202#endif
203
204	if ((error = vfork(p, (struct vfork_args *)args)) != 0)
205		return (error);
206	/* Are we the child? */
207	if (p->p_retval[1] == 1)
208		p->p_retval[0] = 0;
209	return (0);
210}
211
212#define CLONE_VM	0x100
213#define CLONE_FS	0x200
214#define CLONE_FILES	0x400
215#define CLONE_SIGHAND	0x800
216#define CLONE_PID	0x1000
217
218int
219linux_clone(struct proc *p, struct linux_clone_args *args)
220{
221	int error, ff = RFPROC;
222	struct proc *p2;
223	int exit_signal;
224	vm_offset_t start;
225	struct rfork_args rf_args;
226
227#ifdef DEBUG
228	if (args->flags & CLONE_PID)
229		printf("linux_clone(%ld): CLONE_PID not yet supported\n",
230		    (long)p->p_pid);
231	printf("linux_clone(%ld): invoked with flags %x and stack %x\n",
232	    (long)p->p_pid, (unsigned int)args->flags,
233	    (unsigned int)args->stack);
234#endif
235
236	if (!args->stack)
237		return (EINVAL);
238
239	exit_signal = args->flags & 0x000000ff;
240	if (exit_signal >= LINUX_NSIG)
241		return (EINVAL);
242
243	if (exit_signal <= LINUX_SIGTBLSZ)
244		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
245
246	/* RFTHREAD probably not necessary here, but it shouldn't hurt */
247	ff |= RFTHREAD;
248
249	if (args->flags & CLONE_VM)
250		ff |= RFMEM;
251	if (args->flags & CLONE_SIGHAND)
252		ff |= RFSIGSHARE;
253	if (!(args->flags & CLONE_FILES))
254		ff |= RFFDG;
255
256	error = 0;
257	start = 0;
258
259	rf_args.flags = ff;
260	if ((error = rfork(p, &rf_args)) != 0)
261		return (error);
262
263	p2 = pfind(p->p_retval[0]);
264	if (p2 == 0)
265		return (ESRCH);
266
267	p2->p_sigparent = exit_signal;
268	p2->p_md.md_regs->tf_esp = (unsigned int)args->stack;
269
270#ifdef DEBUG
271	printf ("linux_clone(%ld): successful rfork to %ld\n", (long)p->p_pid,
272	    (long)p2->p_pid);
273#endif
274
275	return (0);
276}
277
278/* XXX move */
279struct linux_mmap_argv {
280	linux_caddr_t addr;
281	int len;
282	int prot;
283	int flags;
284	int fd;
285	int pos;
286};
287
288#define STACK_SIZE  (2 * 1024 * 1024)
289#define GUARD_SIZE  (4 * PAGE_SIZE)
290
291int
292linux_mmap(struct proc *p, struct linux_mmap_args *args)
293{
294	struct mmap_args /* {
295		caddr_t addr;
296		size_t len;
297		int prot;
298		int flags;
299		int fd;
300		long pad;
301		off_t pos;
302	} */ bsd_args;
303	int error;
304	struct linux_mmap_argv linux_args;
305
306	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
307	if (error)
308		return (error);
309
310#ifdef DEBUG
311	printf("Linux-emul(%ld): mmap(%p, %d, %d, 0x%08x, %d, %d)",
312	    (long)p->p_pid, (void *)linux_args.addr, linux_args.len,
313	    linux_args.prot, linux_args.flags, linux_args.fd, linux_args.pos);
314#endif
315
316	bsd_args.flags = 0;
317	if (linux_args.flags & LINUX_MAP_SHARED)
318		bsd_args.flags |= MAP_SHARED;
319	if (linux_args.flags & LINUX_MAP_PRIVATE)
320		bsd_args.flags |= MAP_PRIVATE;
321	if (linux_args.flags & LINUX_MAP_FIXED)
322		bsd_args.flags |= MAP_FIXED;
323	if (linux_args.flags & LINUX_MAP_ANON)
324		bsd_args.flags |= MAP_ANON;
325	if (linux_args.flags & LINUX_MAP_GROWSDOWN) {
326		bsd_args.flags |= MAP_STACK;
327
328		/* The linux MAP_GROWSDOWN option does not limit auto
329		 * growth of the region.  Linux mmap with this option
330		 * takes as addr the inital BOS, and as len, the initial
331		 * region size.  It can then grow down from addr without
332		 * limit.  However, linux threads has an implicit internal
333		 * limit to stack size of STACK_SIZE.  Its just not
334		 * enforced explicitly in linux.  But, here we impose
335		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
336		 * region, since we can do this with our mmap.
337		 *
338		 * Our mmap with MAP_STACK takes addr as the maximum
339		 * downsize limit on BOS, and as len the max size of
340		 * the region.  It them maps the top SGROWSIZ bytes,
341		 * and autgrows the region down, up to the limit
342		 * in addr.
343		 *
344		 * If we don't use the MAP_STACK option, the effect
345		 * of this code is to allocate a stack region of a
346		 * fixed size of (STACK_SIZE - GUARD_SIZE).
347		 */
348
349		/* This gives us TOS */
350		bsd_args.addr = linux_args.addr + linux_args.len;
351
352		if (bsd_args.addr > p->p_vmspace->vm_maxsaddr) {
353			/* Some linux apps will attempt to mmap
354			 * thread stacks near the top of their
355			 * address space.  If their TOS is greater
356			 * than vm_maxsaddr, vm_map_growstack()
357			 * will confuse the thread stack with the
358			 * process stack and deliver a SEGV if they
359			 * attempt to grow the thread stack past their
360			 * current stacksize rlimit.  To avoid this,
361			 * adjust vm_maxsaddr upwards to reflect
362			 * the current stacksize rlimit rather
363			 * than the maximum possible stacksize.
364			 * It would be better to adjust the
365			 * mmap'ed region, but some apps do not check
366			 * mmap's return value.
367			 */
368			p->p_vmspace->vm_maxsaddr = (char *)USRSTACK -
369			    p->p_rlimit[RLIMIT_STACK].rlim_cur;
370		}
371
372		/* This gives us our maximum stack size */
373		if (linux_args.len > STACK_SIZE - GUARD_SIZE)
374			bsd_args.len = linux_args.len;
375		else
376			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
377
378		/* This gives us a new BOS.  If we're using VM_STACK, then
379		 * mmap will just map the top SGROWSIZ bytes, and let
380		 * the stack grow down to the limit at BOS.  If we're
381		 * not using VM_STACK we map the full stack, since we
382		 * don't have a way to autogrow it.
383		 */
384		bsd_args.addr -= bsd_args.len;
385	} else {
386		bsd_args.addr = linux_args.addr;
387		bsd_args.len  = linux_args.len;
388	}
389
390	bsd_args.prot = linux_args.prot | PROT_READ;	/* always required */
391	if (linux_args.flags & LINUX_MAP_ANON)
392		bsd_args.fd = -1;
393	else
394		bsd_args.fd = linux_args.fd;
395	bsd_args.pos = linux_args.pos;
396	bsd_args.pad = 0;
397
398#ifdef DEBUG
399	printf("-> (%p, %d, %d, 0x%08x, %d, %d)\n", (void *)bsd_args.addr,
400	    bsd_args.len, bsd_args.prot, bsd_args.flags, bsd_args.fd,
401	    (int)bsd_args.pos);
402#endif
403
404	return (mmap(p, &bsd_args));
405}
406
407int
408linux_pipe(struct proc *p, struct linux_pipe_args *args)
409{
410	int error;
411	int reg_edx;
412
413#ifdef DEBUG
414	printf("Linux-emul(%ld): pipe(*)\n", (long)p->p_pid);
415#endif
416
417	reg_edx = p->p_retval[1];
418	error = pipe(p, 0);
419	if (error) {
420		p->p_retval[1] = reg_edx;
421		return (error);
422	}
423
424	error = copyout(p->p_retval, args->pipefds, 2*sizeof(int));
425	if (error) {
426		p->p_retval[1] = reg_edx;
427		return (error);
428	}
429
430	p->p_retval[1] = reg_edx;
431	p->p_retval[0] = 0;
432	return (0);
433}
434
435int
436linux_ioperm(struct proc *p, struct linux_ioperm_args *args)
437{
438	struct sysarch_args sa;
439	struct i386_ioperm_args *iia;
440	caddr_t sg;
441
442	sg = stackgap_init();
443	iia = stackgap_alloc(&sg, sizeof(struct i386_ioperm_args));
444	iia->start = args->start;
445	iia->length = args->length;
446	iia->enable = args->enable;
447	sa.op = I386_SET_IOPERM;
448	sa.parms = (char *)iia;
449	return (sysarch(p, &sa));
450}
451
452int
453linux_iopl(struct proc *p, struct linux_iopl_args *args)
454{
455	int error;
456
457	if (args->level < 0 || args->level > 3)
458		return (EINVAL);
459	if ((error = suser(p)) != 0)
460		return (error);
461	if (securelevel > 0)
462		return (EPERM);
463	p->p_md.md_regs->tf_eflags = (p->p_md.md_regs->tf_eflags & ~PSL_IOPL) |
464	    (args->level * (PSL_IOPL / 3));
465	return (0);
466}
467
468int
469linux_modify_ldt(p, uap)
470	struct proc *p;
471	struct linux_modify_ldt_args *uap;
472{
473	int error;
474	caddr_t sg;
475	struct sysarch_args args;
476	struct i386_ldt_args *ldt;
477	struct linux_descriptor ld;
478	union descriptor *desc;
479
480	sg = stackgap_init();
481
482	if (uap->ptr == NULL)
483		return (EINVAL);
484
485	switch (uap->func) {
486	case 0x00: /* read_ldt */
487		ldt = stackgap_alloc(&sg, sizeof(*ldt));
488		ldt->start = 0;
489		ldt->descs = uap->ptr;
490		ldt->num = uap->bytecount / sizeof(union descriptor);
491		args.op = I386_GET_LDT;
492		args.parms = (char*)ldt;
493		error = sysarch(p, &args);
494		p->p_retval[0] *= sizeof(union descriptor);
495		break;
496	case 0x01: /* write_ldt */
497	case 0x11: /* write_ldt */
498		if (uap->bytecount != sizeof(ld))
499			return (EINVAL);
500
501		error = copyin(uap->ptr, &ld, sizeof(ld));
502		if (error)
503			return (error);
504
505		ldt = stackgap_alloc(&sg, sizeof(*ldt));
506		desc = stackgap_alloc(&sg, sizeof(*desc));
507		ldt->start = ld.entry_number;
508		ldt->descs = desc;
509		ldt->num = 1;
510		desc->sd.sd_lolimit = (ld.limit & 0x0000ffff);
511		desc->sd.sd_hilimit = (ld.limit & 0x000f0000) >> 16;
512		desc->sd.sd_lobase = (ld.base_addr & 0x00ffffff);
513		desc->sd.sd_hibase = (ld.base_addr & 0xff000000) >> 24;
514		desc->sd.sd_type = SDT_MEMRO | ((ld.read_exec_only ^ 1) << 1) |
515			(ld.contents << 2);
516		desc->sd.sd_dpl = 3;
517		desc->sd.sd_p = (ld.seg_not_present ^ 1);
518		desc->sd.sd_xx = 0;
519		desc->sd.sd_def32 = ld.seg_32bit;
520		desc->sd.sd_gran = ld.limit_in_pages;
521		args.op = I386_SET_LDT;
522		args.parms = (char*)ldt;
523		error = sysarch(p, &args);
524		break;
525	default:
526		error = EINVAL;
527		break;
528	}
529
530	if (error == EOPNOTSUPP) {
531		printf("linux: modify_ldt needs kernel option USER_LDT\n");
532		error = ENOSYS;
533	}
534
535	return (error);
536}
537
538int
539linux_sigaction(struct proc *p, struct linux_sigaction_args *args)
540{
541	linux_osigaction_t osa;
542	linux_sigaction_t act, oact;
543	int error;
544
545#ifdef DEBUG
546	printf("Linux-emul(%ld): sigaction(%d, %p, %p)\n", (long)p->p_pid,
547	       args->sig, (void *)args->nsa, (void *)args->osa);
548#endif
549
550	if (args->nsa != NULL) {
551		error = copyin(args->nsa, &osa, sizeof(linux_osigaction_t));
552		if (error)
553			return (error);
554		act.lsa_handler = osa.lsa_handler;
555		act.lsa_flags = osa.lsa_flags;
556		act.lsa_restorer = osa.lsa_restorer;
557		LINUX_SIGEMPTYSET(act.lsa_mask);
558		act.lsa_mask.__bits[0] = osa.lsa_mask;
559	}
560
561	error = linux_do_sigaction(p, args->sig, args->nsa ? &act : NULL,
562	    args->osa ? &oact : NULL);
563
564	if (args->osa != NULL && !error) {
565		osa.lsa_handler = oact.lsa_handler;
566		osa.lsa_flags = oact.lsa_flags;
567		osa.lsa_restorer = oact.lsa_restorer;
568		osa.lsa_mask = oact.lsa_mask.__bits[0];
569		error = copyout(&osa, args->osa, sizeof(linux_osigaction_t));
570	}
571
572	return (error);
573}
574
575/*
576 * Linux has two extra args, restart and oldmask.  We dont use these,
577 * but it seems that "restart" is actually a context pointer that
578 * enables the signal to happen with a different register set.
579 */
580int
581linux_sigsuspend(struct proc *p, struct linux_sigsuspend_args *args)
582{
583	struct sigsuspend_args bsd;
584	sigset_t *sigmask;
585	linux_sigset_t mask;
586	caddr_t sg = stackgap_init();
587
588#ifdef DEBUG
589	printf("Linux-emul(%ld): sigsuspend(%08lx)\n",
590	       (long)p->p_pid, (unsigned long)args->mask);
591#endif
592
593	sigmask = stackgap_alloc(&sg, sizeof(sigset_t));
594	LINUX_SIGEMPTYSET(mask);
595	mask.__bits[0] = args->mask;
596	linux_to_bsd_sigset(&mask, sigmask);
597	bsd.sigmask = sigmask;
598	return (sigsuspend(p, &bsd));
599}
600
601int
602linux_rt_sigsuspend(p, uap)
603	struct proc *p;
604	struct linux_rt_sigsuspend_args *uap;
605{
606	linux_sigset_t lmask;
607	sigset_t *bmask;
608	struct sigsuspend_args bsd;
609	caddr_t sg = stackgap_init();
610	int error;
611
612#ifdef DEBUG
613	printf("Linux-emul(%ld): rt_sigsuspend(%p, %d)\n", (long)p->p_pid,
614	       (void *)uap->newset, uap->sigsetsize);
615#endif
616
617	if (uap->sigsetsize != sizeof(linux_sigset_t))
618		return (EINVAL);
619
620	error = copyin(uap->newset, &lmask, sizeof(linux_sigset_t));
621	if (error)
622		return (error);
623
624	bmask = stackgap_alloc(&sg, sizeof(sigset_t));
625	linux_to_bsd_sigset(&lmask, bmask);
626	bsd.sigmask = bmask;
627	return (sigsuspend(p, &bsd));
628}
629
630int
631linux_pause(struct proc *p, struct linux_pause_args *args)
632{
633	struct sigsuspend_args bsd;
634	sigset_t *sigmask;
635	caddr_t sg = stackgap_init();
636
637#ifdef DEBUG
638	printf("Linux-emul(%d): pause()\n", p->p_pid);
639#endif
640
641	sigmask = stackgap_alloc(&sg, sizeof(sigset_t));
642	*sigmask = p->p_sigmask;
643	bsd.sigmask = sigmask;
644	return (sigsuspend(p, &bsd));
645}
646
647int
648linux_sigaltstack(p, uap)
649	struct proc *p;
650	struct linux_sigaltstack_args *uap;
651{
652	struct sigaltstack_args bsd;
653	stack_t *ss, *oss;
654	linux_stack_t lss;
655	int error;
656	caddr_t sg = stackgap_init();
657
658#ifdef DEBUG
659	printf("Linux-emul(%ld): sigaltstack(%p, %p)\n",
660	    (long)p->p_pid, uap->uss, uap->uoss);
661#endif
662
663	if (uap->uss == NULL) {
664		ss = NULL;
665	} else {
666		error = copyin(uap->uss, &lss, sizeof(linux_stack_t));
667		if (error)
668			return (error);
669
670		ss = stackgap_alloc(&sg, sizeof(stack_t));
671		ss->ss_sp = lss.ss_sp;
672		ss->ss_size = (lss.ss_size >= LINUX_MINSIGSTKSZ &&
673		    lss.ss_size < MINSIGSTKSZ) ? MINSIGSTKSZ : lss.ss_size;
674		ss->ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
675	}
676	oss = (uap->uoss != NULL)
677	    ? stackgap_alloc(&sg, sizeof(stack_t))
678	    : NULL;
679
680	bsd.ss = ss;
681	bsd.oss = oss;
682	error = sigaltstack(p, &bsd);
683
684	if (!error && oss != NULL) {
685		lss.ss_sp = oss->ss_sp;
686		lss.ss_size = oss->ss_size;
687		lss.ss_flags = bsd_to_linux_sigaltstack(oss->ss_flags);
688		error = copyout(&lss, uap->uoss, sizeof(linux_stack_t));
689	}
690
691	return (error);
692}
693