linux_machdep.c revision 68520
1/*-
2 * Copyright (c) 2000 Marcel Moolenaar
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 * $FreeBSD: head/sys/i386/linux/linux_machdep.c 68520 2000-11-09 08:25:48Z marcel $
29 */
30
31#include <sys/param.h>
32#include <sys/mman.h>
33#include <sys/proc.h>
34#include <sys/sysproto.h>
35#include <sys/systm.h>
36#include <sys/unistd.h>
37#include <sys/resource.h>
38#include <sys/resourcevar.h>
39
40#include <machine/frame.h>
41#include <machine/psl.h>
42#include <machine/segments.h>
43#include <machine/sysarch.h>
44
45#include <vm/vm.h>
46#include <sys/lock.h>
47#include <vm/pmap.h>
48#include <vm/vm_map.h>
49
50#include <i386/linux/linux.h>
51#include <linux_proto.h>
52#include <compat/linux/linux_ipc.h>
53#include <compat/linux/linux_signal.h>
54#include <compat/linux/linux_util.h>
55
/*
 * LDT entry description as supplied by a Linux process.  linux_modify_ldt()
 * copies one of these in and converts it to a native segment descriptor.
 */
struct linux_descriptor {
	unsigned int	entry_number;		/* LDT slot to write */
	unsigned long	base_addr;		/* segment base */
	unsigned int	limit;			/* segment limit */
	unsigned int	seg_32bit:1;		/* becomes sd_def32 */
	unsigned int	contents:2;		/* merged into sd_type */
	unsigned int	read_exec_only:1;	/* inverted, merged into sd_type */
	unsigned int	limit_in_pages:1;	/* becomes sd_gran */
	unsigned int	seg_not_present:1;	/* inverted, becomes sd_p */
	unsigned int	useable:1;		/* not consulted by linux_modify_ldt() */
};
67
/*
 * Argument block for the old-style Linux select(): the caller passes a
 * single pointer to this structure, which linux_select() copies in and
 * unpacks into linux_newselect() arguments.
 */
struct linux_select_argv {
	int		 nfds;
	fd_set		*readfds;
	fd_set		*writefds;
	fd_set		*exceptfds;
	struct timeval	*timeout;
};
75
76int
77linux_to_bsd_sigaltstack(int lsa)
78{
79	int bsa = 0;
80
81	if (lsa & LINUX_SS_DISABLE)
82		bsa |= SS_DISABLE;
83	if (lsa & LINUX_SS_ONSTACK)
84		bsa |= SS_ONSTACK;
85	if (lsa == LINUX_SS_ONSTACK_BC)
86		bsa = SS_ONSTACK;
87	return (bsa);
88}
89
90int
91bsd_to_linux_sigaltstack(int bsa)
92{
93	int lsa = 0;
94
95	if (bsa & SS_DISABLE)
96		lsa |= LINUX_SS_DISABLE;
97	if (bsa & SS_ONSTACK)
98		lsa |= LINUX_SS_ONSTACK;
99	return (lsa);
100}
101
102int
103linux_execve(struct proc *p, struct linux_execve_args *args)
104{
105	struct execve_args bsd;
106	caddr_t sg;
107
108	sg = stackgap_init();
109	CHECKALTEXIST(p, &sg, args->path);
110
111#ifdef DEBUG
112        printf("Linux-emul(%d): execve(%s)\n",
113	    p->p_pid, args->path);
114#endif
115
116	bsd.fname = args->path;
117	bsd.argv = args->argp;
118	bsd.envv = args->envp;
119	return (execve(p, &bsd));
120}
121
122int
123linux_ipc(struct proc *p, struct linux_ipc_args *args)
124{
125	switch (args->what) {
126	case LINUX_SEMOP:
127		return (linux_semop(p, args));
128	case LINUX_SEMGET:
129		return (linux_semget(p, args));
130	case LINUX_SEMCTL:
131		return (linux_semctl(p, args));
132	case LINUX_MSGSND:
133		return (linux_msgsnd(p, args));
134	case LINUX_MSGRCV:
135		return (linux_msgrcv(p, args));
136	case LINUX_MSGGET:
137		return (linux_msgget(p, args));
138	case LINUX_MSGCTL:
139		return (linux_msgctl(p, args));
140	case LINUX_SHMAT:
141		return (linux_shmat(p, args));
142	case LINUX_SHMDT:
143		return (linux_shmdt(p, args));
144	case LINUX_SHMGET:
145		return (linux_shmget(p, args));
146	case LINUX_SHMCTL:
147		return (linux_shmctl(p, args));
148	}
149
150	uprintf("LINUX: 'ipc' typ=%d not implemented\n", args->what);
151	return (ENOSYS);
152}
153
154int
155linux_select(struct proc *p, struct linux_select_args *args)
156{
157	struct linux_select_argv linux_args;
158	struct linux_newselect_args newsel;
159	int error;
160
161#ifdef SELECT_DEBUG
162	printf("Linux-emul(%ld): select(%x)\n", (long)p->p_pid, args->ptr);
163#endif
164
165	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
166	if (error)
167		return (error);
168
169	newsel.nfds = linux_args.nfds;
170	newsel.readfds = linux_args.readfds;
171	newsel.writefds = linux_args.writefds;
172	newsel.exceptfds = linux_args.exceptfds;
173	newsel.timeout = linux_args.timeout;
174	return (linux_newselect(p, &newsel));
175}
176
177int
178linux_fork(struct proc *p, struct linux_fork_args *args)
179{
180	int error;
181
182#ifdef DEBUG
183	printf("Linux-emul(%ld): fork()\n", (long)p->p_pid);
184#endif
185
186	if ((error = fork(p, (struct fork_args *)args)) != 0)
187		return (error);
188
189	if (p->p_retval[1] == 1)
190		p->p_retval[0] = 0;
191	return (0);
192}
193
194int
195linux_vfork(struct proc *p, struct linux_vfork_args *args)
196{
197	int error;
198
199#ifdef DEBUG
200	printf("Linux-emul(%ld): vfork()\n", (long)p->p_pid);
201#endif
202
203	if ((error = vfork(p, (struct vfork_args *)args)) != 0)
204		return (error);
205	/* Are we the child? */
206	if (p->p_retval[1] == 1)
207		p->p_retval[0] = 0;
208	return (0);
209}
210
/* Linux clone() flag bits examined (or at least named) by linux_clone(). */
#define CLONE_VM	0x00000100	/* mapped to RFMEM */
#define CLONE_FS	0x00000200	/* not acted on by linux_clone() */
#define CLONE_FILES	0x00000400	/* when clear, RFFDG is requested */
#define CLONE_SIGHAND	0x00000800	/* mapped to RFSIGSHARE */
#define CLONE_PID	0x00001000	/* only reported under DEBUG */
216
217int
218linux_clone(struct proc *p, struct linux_clone_args *args)
219{
220	int error, ff = RFPROC;
221	struct proc *p2;
222	int exit_signal;
223	vm_offset_t start;
224	struct rfork_args rf_args;
225
226#ifdef DEBUG
227	if (args->flags & CLONE_PID)
228		printf("linux_clone(%ld): CLONE_PID not yet supported\n",
229		    (long)p->p_pid);
230	printf("linux_clone(%ld): invoked with flags %x and stack %x\n",
231	    (long)p->p_pid, (unsigned int)args->flags,
232	    (unsigned int)args->stack);
233#endif
234
235	if (!args->stack)
236		return (EINVAL);
237
238	exit_signal = args->flags & 0x000000ff;
239	if (exit_signal >= LINUX_NSIG)
240		return (EINVAL);
241
242	if (exit_signal <= LINUX_SIGTBLSZ)
243		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
244
245	/* RFTHREAD probably not necessary here, but it shouldn't hurt */
246	ff |= RFTHREAD;
247
248	if (args->flags & CLONE_VM)
249		ff |= RFMEM;
250	if (args->flags & CLONE_SIGHAND)
251		ff |= RFSIGSHARE;
252	if (!(args->flags & CLONE_FILES))
253		ff |= RFFDG;
254
255	error = 0;
256	start = 0;
257
258	rf_args.flags = ff;
259	if ((error = rfork(p, &rf_args)) != 0)
260		return (error);
261
262	p2 = pfind(p->p_retval[0]);
263	if (p2 == 0)
264		return (ESRCH);
265
266	p2->p_sigparent = exit_signal;
267	p2->p_md.md_regs->tf_esp = (unsigned int)args->stack;
268
269#ifdef DEBUG
270	printf ("linux_clone(%ld): successful rfork to %ld\n", (long)p->p_pid,
271	    (long)p2->p_pid);
272#endif
273
274	return (0);
275}
276
277/* XXX move */
278struct linux_mmap_argv {
279	linux_caddr_t addr;
280	int len;
281	int prot;
282	int flags;
283	int fd;
284	int pos;
285};
286
/*
 * Sizing for LINUX_MAP_GROWSDOWN mappings in linux_mmap(): such a region is
 * given a maximum size of at least (STACK_SIZE - GUARD_SIZE) bytes.
 */
#define STACK_SIZE	(2 * 1024 * 1024)
#define GUARD_SIZE	(4 * PAGE_SIZE)
289
290int
291linux_mmap(struct proc *p, struct linux_mmap_args *args)
292{
293	struct mmap_args /* {
294		caddr_t addr;
295		size_t len;
296		int prot;
297		int flags;
298		int fd;
299		long pad;
300		off_t pos;
301	} */ bsd_args;
302	int error;
303	struct linux_mmap_argv linux_args;
304
305	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
306	if (error)
307		return (error);
308
309#ifdef DEBUG
310	printf("Linux-emul(%ld): mmap(%p, %d, %d, 0x%08x, %d, %d)",
311	    (long)p->p_pid, (void *)linux_args.addr, linux_args.len,
312	    linux_args.prot, linux_args.flags, linux_args.fd, linux_args.pos);
313#endif
314
315	bsd_args.flags = 0;
316	if (linux_args.flags & LINUX_MAP_SHARED)
317		bsd_args.flags |= MAP_SHARED;
318	if (linux_args.flags & LINUX_MAP_PRIVATE)
319		bsd_args.flags |= MAP_PRIVATE;
320	if (linux_args.flags & LINUX_MAP_FIXED)
321		bsd_args.flags |= MAP_FIXED;
322	if (linux_args.flags & LINUX_MAP_ANON)
323		bsd_args.flags |= MAP_ANON;
324	if (linux_args.flags & LINUX_MAP_GROWSDOWN) {
325		bsd_args.flags |= MAP_STACK;
326
327		/* The linux MAP_GROWSDOWN option does not limit auto
328		 * growth of the region.  Linux mmap with this option
329		 * takes as addr the inital BOS, and as len, the initial
330		 * region size.  It can then grow down from addr without
331		 * limit.  However, linux threads has an implicit internal
332		 * limit to stack size of STACK_SIZE.  Its just not
333		 * enforced explicitly in linux.  But, here we impose
334		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
335		 * region, since we can do this with our mmap.
336		 *
337		 * Our mmap with MAP_STACK takes addr as the maximum
338		 * downsize limit on BOS, and as len the max size of
339		 * the region.  It them maps the top SGROWSIZ bytes,
340		 * and autgrows the region down, up to the limit
341		 * in addr.
342		 *
343		 * If we don't use the MAP_STACK option, the effect
344		 * of this code is to allocate a stack region of a
345		 * fixed size of (STACK_SIZE - GUARD_SIZE).
346		 */
347
348		/* This gives us TOS */
349		bsd_args.addr = linux_args.addr + linux_args.len;
350
351		if (bsd_args.addr > p->p_vmspace->vm_maxsaddr) {
352			/* Some linux apps will attempt to mmap
353			 * thread stacks near the top of their
354			 * address space.  If their TOS is greater
355			 * than vm_maxsaddr, vm_map_growstack()
356			 * will confuse the thread stack with the
357			 * process stack and deliver a SEGV if they
358			 * attempt to grow the thread stack past their
359			 * current stacksize rlimit.  To avoid this,
360			 * adjust vm_maxsaddr upwards to reflect
361			 * the current stacksize rlimit rather
362			 * than the maximum possible stacksize.
363			 * It would be better to adjust the
364			 * mmap'ed region, but some apps do not check
365			 * mmap's return value.
366			 */
367			p->p_vmspace->vm_maxsaddr = (char *)USRSTACK -
368			    p->p_rlimit[RLIMIT_STACK].rlim_cur;
369		}
370
371		/* This gives us our maximum stack size */
372		if (linux_args.len > STACK_SIZE - GUARD_SIZE)
373			bsd_args.len = linux_args.len;
374		else
375			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
376
377		/* This gives us a new BOS.  If we're using VM_STACK, then
378		 * mmap will just map the top SGROWSIZ bytes, and let
379		 * the stack grow down to the limit at BOS.  If we're
380		 * not using VM_STACK we map the full stack, since we
381		 * don't have a way to autogrow it.
382		 */
383		bsd_args.addr -= bsd_args.len;
384	} else {
385		bsd_args.addr = linux_args.addr;
386		bsd_args.len  = linux_args.len;
387	}
388
389	bsd_args.prot = linux_args.prot | PROT_READ;	/* always required */
390	if (linux_args.flags & LINUX_MAP_ANON)
391		bsd_args.fd = -1;
392	else
393		bsd_args.fd = linux_args.fd;
394	bsd_args.pos = linux_args.pos;
395	bsd_args.pad = 0;
396
397#ifdef DEBUG
398	printf("-> (%p, %d, %d, 0x%08x, %d, %d)\n", (void *)bsd_args.addr,
399	    bsd_args.len, bsd_args.prot, bsd_args.flags, bsd_args.fd,
400	    (int)bsd_args.pos);
401#endif
402
403	return (mmap(p, &bsd_args));
404}
405
406int
407linux_pipe(struct proc *p, struct linux_pipe_args *args)
408{
409	int error;
410	int reg_edx;
411
412#ifdef DEBUG
413	printf("Linux-emul(%ld): pipe(*)\n", (long)p->p_pid);
414#endif
415
416	reg_edx = p->p_retval[1];
417	error = pipe(p, 0);
418	if (error) {
419		p->p_retval[1] = reg_edx;
420		return (error);
421	}
422
423	error = copyout(p->p_retval, args->pipefds, 2*sizeof(int));
424	if (error) {
425		p->p_retval[1] = reg_edx;
426		return (error);
427	}
428
429	p->p_retval[1] = reg_edx;
430	p->p_retval[0] = 0;
431	return (0);
432}
433
434int
435linux_ioperm(struct proc *p, struct linux_ioperm_args *args)
436{
437	struct sysarch_args sa;
438	struct i386_ioperm_args *iia;
439	caddr_t sg;
440
441	sg = stackgap_init();
442	iia = stackgap_alloc(&sg, sizeof(struct i386_ioperm_args));
443	iia->start = args->start;
444	iia->length = args->length;
445	iia->enable = args->enable;
446	sa.op = I386_SET_IOPERM;
447	sa.parms = (char *)iia;
448	return (sysarch(p, &sa));
449}
450
451int
452linux_iopl(struct proc *p, struct linux_iopl_args *args)
453{
454	int error;
455
456	if (args->level < 0 || args->level > 3)
457		return (EINVAL);
458	if ((error = suser(p)) != 0)
459		return (error);
460	if (securelevel > 0)
461		return (EPERM);
462	p->p_md.md_regs->tf_eflags = (p->p_md.md_regs->tf_eflags & ~PSL_IOPL) |
463	    (args->level * (PSL_IOPL / 3));
464	return (0);
465}
466
467int
468linux_modify_ldt(p, uap)
469	struct proc *p;
470	struct linux_modify_ldt_args *uap;
471{
472	int error;
473	caddr_t sg;
474	struct sysarch_args args;
475	struct i386_ldt_args *ldt;
476	struct linux_descriptor ld;
477	union descriptor *desc;
478
479	sg = stackgap_init();
480
481	if (uap->ptr == NULL)
482		return (EINVAL);
483
484	switch (uap->func) {
485	case 0x00: /* read_ldt */
486		ldt = stackgap_alloc(&sg, sizeof(*ldt));
487		ldt->start = 0;
488		ldt->descs = uap->ptr;
489		ldt->num = uap->bytecount / sizeof(union descriptor);
490		args.op = I386_GET_LDT;
491		args.parms = (char*)ldt;
492		error = sysarch(p, &args);
493		p->p_retval[0] *= sizeof(union descriptor);
494		break;
495	case 0x01: /* write_ldt */
496	case 0x11: /* write_ldt */
497		if (uap->bytecount != sizeof(ld))
498			return (EINVAL);
499
500		error = copyin(uap->ptr, &ld, sizeof(ld));
501		if (error)
502			return (error);
503
504		ldt = stackgap_alloc(&sg, sizeof(*ldt));
505		desc = stackgap_alloc(&sg, sizeof(*desc));
506		ldt->start = ld.entry_number;
507		ldt->descs = desc;
508		ldt->num = 1;
509		desc->sd.sd_lolimit = (ld.limit & 0x0000ffff);
510		desc->sd.sd_hilimit = (ld.limit & 0x000f0000) >> 16;
511		desc->sd.sd_lobase = (ld.base_addr & 0x00ffffff);
512		desc->sd.sd_hibase = (ld.base_addr & 0xff000000) >> 24;
513		desc->sd.sd_type = SDT_MEMRO | ((ld.read_exec_only ^ 1) << 1) |
514			(ld.contents << 2);
515		desc->sd.sd_dpl = 3;
516		desc->sd.sd_p = (ld.seg_not_present ^ 1);
517		desc->sd.sd_xx = 0;
518		desc->sd.sd_def32 = ld.seg_32bit;
519		desc->sd.sd_gran = ld.limit_in_pages;
520		args.op = I386_SET_LDT;
521		args.parms = (char*)ldt;
522		error = sysarch(p, &args);
523		break;
524	default:
525		error = EINVAL;
526		break;
527	}
528
529	if (error == EOPNOTSUPP) {
530		printf("linux: modify_ldt needs kernel option USER_LDT\n");
531		error = ENOSYS;
532	}
533
534	return (error);
535}
536
537int
538linux_sigaction(struct proc *p, struct linux_sigaction_args *args)
539{
540	linux_osigaction_t osa;
541	linux_sigaction_t act, oact;
542	int error;
543
544#ifdef DEBUG
545	printf("Linux-emul(%ld): sigaction(%d, %p, %p)\n", (long)p->p_pid,
546	       args->sig, (void *)args->nsa, (void *)args->osa);
547#endif
548
549	if (args->nsa != NULL) {
550		error = copyin(args->nsa, &osa, sizeof(linux_osigaction_t));
551		if (error)
552			return (error);
553		act.lsa_handler = osa.lsa_handler;
554		act.lsa_flags = osa.lsa_flags;
555		act.lsa_restorer = osa.lsa_restorer;
556		LINUX_SIGEMPTYSET(act.lsa_mask);
557		act.lsa_mask.__bits[0] = osa.lsa_mask;
558	}
559
560	error = linux_do_sigaction(p, args->sig, args->nsa ? &act : NULL,
561	    args->osa ? &oact : NULL);
562
563	if (args->osa != NULL && !error) {
564		osa.lsa_handler = oact.lsa_handler;
565		osa.lsa_flags = oact.lsa_flags;
566		osa.lsa_restorer = oact.lsa_restorer;
567		osa.lsa_mask = oact.lsa_mask.__bits[0];
568		error = copyout(&osa, args->osa, sizeof(linux_osigaction_t));
569	}
570
571	return (error);
572}
573
574/*
575 * Linux has two extra args, restart and oldmask.  We dont use these,
576 * but it seems that "restart" is actually a context pointer that
577 * enables the signal to happen with a different register set.
578 */
579int
580linux_sigsuspend(struct proc *p, struct linux_sigsuspend_args *args)
581{
582	struct sigsuspend_args bsd;
583	sigset_t *sigmask;
584	linux_sigset_t mask;
585	caddr_t sg = stackgap_init();
586
587#ifdef DEBUG
588	printf("Linux-emul(%ld): sigsuspend(%08lx)\n",
589	       (long)p->p_pid, (unsigned long)args->mask);
590#endif
591
592	sigmask = stackgap_alloc(&sg, sizeof(sigset_t));
593	LINUX_SIGEMPTYSET(mask);
594	mask.__bits[0] = args->mask;
595	linux_to_bsd_sigset(&mask, sigmask);
596	bsd.sigmask = sigmask;
597	return (sigsuspend(p, &bsd));
598}
599
600int
601linux_rt_sigsuspend(p, uap)
602	struct proc *p;
603	struct linux_rt_sigsuspend_args *uap;
604{
605	linux_sigset_t lmask;
606	sigset_t *bmask;
607	struct sigsuspend_args bsd;
608	caddr_t sg = stackgap_init();
609	int error;
610
611#ifdef DEBUG
612	printf("Linux-emul(%ld): rt_sigsuspend(%p, %d)\n", (long)p->p_pid,
613	       (void *)uap->newset, uap->sigsetsize);
614#endif
615
616	if (uap->sigsetsize != sizeof(linux_sigset_t))
617		return (EINVAL);
618
619	error = copyin(uap->newset, &lmask, sizeof(linux_sigset_t));
620	if (error)
621		return (error);
622
623	bmask = stackgap_alloc(&sg, sizeof(sigset_t));
624	linux_to_bsd_sigset(&lmask, bmask);
625	bsd.sigmask = bmask;
626	return (sigsuspend(p, &bsd));
627}
628
629int
630linux_pause(struct proc *p, struct linux_pause_args *args)
631{
632	struct sigsuspend_args bsd;
633	sigset_t *sigmask;
634	caddr_t sg = stackgap_init();
635
636#ifdef DEBUG
637	printf("Linux-emul(%d): pause()\n", p->p_pid);
638#endif
639
640	sigmask = stackgap_alloc(&sg, sizeof(sigset_t));
641	*sigmask = p->p_sigmask;
642	bsd.sigmask = sigmask;
643	return (sigsuspend(p, &bsd));
644}
645
646int
647linux_sigaltstack(p, uap)
648	struct proc *p;
649	struct linux_sigaltstack_args *uap;
650{
651	struct sigaltstack_args bsd;
652	stack_t *ss, *oss;
653	linux_stack_t lss;
654	int error;
655	caddr_t sg = stackgap_init();
656
657#ifdef DEBUG
658	printf("Linux-emul(%ld): sigaltstack(%p, %p)\n",
659	    (long)p->p_pid, uap->uss, uap->uoss);
660#endif
661
662	if (uap->uss == NULL) {
663		ss = NULL;
664	} else {
665		error = copyin(uap->uss, &lss, sizeof(linux_stack_t));
666		if (error)
667			return (error);
668
669		ss = stackgap_alloc(&sg, sizeof(stack_t));
670		ss->ss_sp = lss.ss_sp;
671		ss->ss_size = lss.ss_size;
672		ss->ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
673	}
674	oss = (uap->uoss != NULL)
675	    ? stackgap_alloc(&sg, sizeof(stack_t))
676	    : NULL;
677
678	bsd.ss = ss;
679	bsd.oss = oss;
680	error = sigaltstack(p, &bsd);
681
682	if (!error && oss != NULL) {
683		lss.ss_sp = oss->ss_sp;
684		lss.ss_size = oss->ss_size;
685		lss.ss_flags = bsd_to_linux_sigaltstack(oss->ss_flags);
686		error = copyout(&lss, uap->uoss, sizeof(linux_stack_t));
687	}
688
689	return (error);
690}
691