linux_machdep.c revision 72543
1/*-
2 * Copyright (c) 2000 Marcel Moolenaar
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 * $FreeBSD: head/sys/i386/linux/linux_machdep.c 72543 2001-02-16 16:40:43Z jlemon $
29 */
30
31#include <sys/param.h>
32#include <sys/mman.h>
33#include <sys/proc.h>
34#include <sys/sysproto.h>
35#include <sys/systm.h>
36#include <sys/unistd.h>
37#include <sys/resource.h>
38#include <sys/resourcevar.h>
39
40#include <machine/frame.h>
41#include <machine/psl.h>
42#include <machine/segments.h>
43#include <machine/sysarch.h>
44
45#include <vm/vm.h>
46#include <sys/lock.h>
47#include <vm/pmap.h>
48#include <vm/vm_map.h>
49
50#include <i386/linux/linux.h>
51#include <i386/linux/linux_proto.h>
52#include <compat/linux/linux_ipc.h>
53#include <compat/linux/linux_signal.h>
54#include <compat/linux/linux_util.h>
55
/*
 * LDT entry description as linux_modify_ldt() copies it in from the
 * caller for the write_ldt functions.  The request is rejected unless
 * its byte count equals sizeof(struct linux_descriptor), so fields must
 * not be added, reordered or resized.
 */
struct linux_descriptor {
	unsigned int  entry_number;	/* LDT slot; becomes ldt->start */
	unsigned long base_addr;	/* split into sd_lobase/sd_hibase */
	unsigned int  limit;		/* low 20 bits -> sd_lolimit/sd_hilimit */
	unsigned int  seg_32bit:1;	/* copied to sd_def32 */
	unsigned int  contents:2;	/* shifted into bits 2-3 of sd_type */
	unsigned int  read_exec_only:1;	/* inverted into bit 1 of sd_type */
	unsigned int  limit_in_pages:1;	/* copied to sd_gran */
	unsigned int  seg_not_present:1; /* inverted into sd_p */
	unsigned int  useable:1;	/* not read by linux_modify_ldt() */
};
67
/*
 * Argument block for the old-style select call.  linux_select() copies
 * one of these in from args->ptr and hands the fields, unchanged, to
 * linux_newselect().
 */
struct linux_select_argv {
	int nfds;
	fd_set *readfds;
	fd_set *writefds;
	fd_set *exceptfds;
	struct timeval *timeout;
};
75
76int
77linux_to_bsd_sigaltstack(int lsa)
78{
79	int bsa = 0;
80
81	if (lsa & LINUX_SS_DISABLE)
82		bsa |= SS_DISABLE;
83	if (lsa & LINUX_SS_ONSTACK)
84		bsa |= SS_ONSTACK;
85	return (bsa);
86}
87
88int
89bsd_to_linux_sigaltstack(int bsa)
90{
91	int lsa = 0;
92
93	if (bsa & SS_DISABLE)
94		lsa |= LINUX_SS_DISABLE;
95	if (bsa & SS_ONSTACK)
96		lsa |= LINUX_SS_ONSTACK;
97	return (lsa);
98}
99
100int
101linux_execve(struct proc *p, struct linux_execve_args *args)
102{
103	struct execve_args bsd;
104	caddr_t sg;
105
106	sg = stackgap_init();
107	CHECKALTEXIST(p, &sg, args->path);
108
109#ifdef DEBUG
110	if (ldebug(execve))
111		printf(ARGS(execve, "%s"), args->path);
112#endif
113
114	bsd.fname = args->path;
115	bsd.argv = args->argp;
116	bsd.envv = args->envp;
117	return (execve(p, &bsd));
118}
119
120int
121linux_ipc(struct proc *p, struct linux_ipc_args *args)
122{
123	switch (args->what) {
124	case LINUX_SEMOP:
125		return (linux_semop(p, args));
126	case LINUX_SEMGET:
127		return (linux_semget(p, args));
128	case LINUX_SEMCTL:
129		return (linux_semctl(p, args));
130	case LINUX_MSGSND:
131		return (linux_msgsnd(p, args));
132	case LINUX_MSGRCV:
133		return (linux_msgrcv(p, args));
134	case LINUX_MSGGET:
135		return (linux_msgget(p, args));
136	case LINUX_MSGCTL:
137		return (linux_msgctl(p, args));
138	case LINUX_SHMAT:
139		return (linux_shmat(p, args));
140	case LINUX_SHMDT:
141		return (linux_shmdt(p, args));
142	case LINUX_SHMGET:
143		return (linux_shmget(p, args));
144	case LINUX_SHMCTL:
145		return (linux_shmctl(p, args));
146	}
147
148	uprintf("LINUX: 'ipc' typ=%d not implemented\n", args->what);
149	return (ENOSYS);
150}
151
152int
153linux_select(struct proc *p, struct linux_select_args *args)
154{
155	struct linux_select_argv linux_args;
156	struct linux_newselect_args newsel;
157	int error;
158
159#ifdef SELECT_DEBUG
160	if (ldebug(select))
161		printf(ARGS(select, "%x"), args->ptr);
162#endif
163
164	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
165	if (error)
166		return (error);
167
168	newsel.nfds = linux_args.nfds;
169	newsel.readfds = linux_args.readfds;
170	newsel.writefds = linux_args.writefds;
171	newsel.exceptfds = linux_args.exceptfds;
172	newsel.timeout = linux_args.timeout;
173	return (linux_newselect(p, &newsel));
174}
175
176int
177linux_fork(struct proc *p, struct linux_fork_args *args)
178{
179	int error;
180
181#ifdef DEBUG
182	if (ldebug(fork))
183		printf(ARGS(fork, ""));
184#endif
185
186	if ((error = fork(p, (struct fork_args *)args)) != 0)
187		return (error);
188
189	if (p->p_retval[1] == 1)
190		p->p_retval[0] = 0;
191	return (0);
192}
193
194int
195linux_vfork(struct proc *p, struct linux_vfork_args *args)
196{
197	int error;
198
199#ifdef DEBUG
200	if (ldebug(vfork))
201		printf(ARGS(vfork, ""));
202#endif
203
204	if ((error = vfork(p, (struct vfork_args *)args)) != 0)
205		return (error);
206	/* Are we the child? */
207	if (p->p_retval[1] == 1)
208		p->p_retval[0] = 0;
209	return (0);
210}
211
/*
 * Flag bits examined in the "flags" argument of linux_clone().  The low
 * byte of that argument (mask 0x000000ff) is not a flag: linux_clone()
 * reads it as the exit signal of the new process.
 */
#define CLONE_VM	0x100	/* linux_clone() adds RFMEM */
#define CLONE_FS	0x200	/* not tested by linux_clone() */
#define CLONE_FILES	0x400	/* when clear, linux_clone() adds RFFDG */
#define CLONE_SIGHAND	0x800	/* linux_clone() adds RFSIGSHARE */
#define CLONE_PID	0x1000	/* only reported by the DEBUG printf */
217
218int
219linux_clone(struct proc *p, struct linux_clone_args *args)
220{
221	int error, ff = RFPROC;
222	struct proc *p2;
223	int exit_signal;
224	vm_offset_t start;
225	struct rfork_args rf_args;
226
227#ifdef DEBUG
228	if (ldebug(clone)) {
229		printf(ARGS(clone, "flags %x, stack %x"),
230		    (unsigned int)args->flags, (unsigned int)args->stack);
231		if (args->flags & CLONE_PID)
232			printf(LMSG("CLONE_PID not yet supported"));
233	}
234#endif
235
236	if (!args->stack)
237		return (EINVAL);
238
239	exit_signal = args->flags & 0x000000ff;
240	if (exit_signal >= LINUX_NSIG)
241		return (EINVAL);
242
243	if (exit_signal <= LINUX_SIGTBLSZ)
244		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
245
246	/* RFTHREAD probably not necessary here, but it shouldn't hurt */
247	ff |= RFTHREAD;
248
249	if (args->flags & CLONE_VM)
250		ff |= RFMEM;
251	if (args->flags & CLONE_SIGHAND)
252		ff |= RFSIGSHARE;
253	if (!(args->flags & CLONE_FILES))
254		ff |= RFFDG;
255
256	error = 0;
257	start = 0;
258
259	rf_args.flags = ff;
260	if ((error = rfork(p, &rf_args)) != 0)
261		return (error);
262
263	p2 = pfind(p->p_retval[0]);
264	if (p2 == NULL)
265		return (ESRCH);
266
267	PROC_LOCK(p2);
268	p2->p_sigparent = exit_signal;
269	PROC_UNLOCK(p2);
270	p2->p_md.md_regs->tf_esp = (unsigned int)args->stack;
271
272#ifdef DEBUG
273	if (ldebug(clone))
274		printf(LMSG("clone: successful rfork to %ld"),
275		    (long)p2->p_pid);
276#endif
277
278	return (0);
279}
280
/* XXX move */
/*
 * Argument block for the Linux mmap call.  linux_mmap() copies one of
 * these in from args->ptr and translates it into a native
 * struct mmap_args.
 */
struct linux_mmap_argv {
	linux_caddr_t addr;	/* requested address */
	int len;		/* length of the mapping */
	int prot;		/* protection bits; PROT_READ is always added */
	int flags;		/* LINUX_MAP_* flags */
	int fd;			/* ignored when LINUX_MAP_ANON is set */
	int pos;		/* copied to the native "pos" field */
};
290
/*
 * Sizing used by linux_mmap() for LINUX_MAP_GROWSDOWN mappings: such a
 * region is given a maximum size of at least STACK_SIZE - GUARD_SIZE
 * bytes.
 */
#define STACK_SIZE  (2 * 1024 * 1024)
#define GUARD_SIZE  (4 * PAGE_SIZE)
293
294int
295linux_mmap(struct proc *p, struct linux_mmap_args *args)
296{
297	struct mmap_args /* {
298		caddr_t addr;
299		size_t len;
300		int prot;
301		int flags;
302		int fd;
303		long pad;
304		off_t pos;
305	} */ bsd_args;
306	int error;
307	struct linux_mmap_argv linux_args;
308
309	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
310	if (error)
311		return (error);
312
313#ifdef DEBUG
314	if (ldebug(mmap))
315		printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
316		    (void *)linux_args.addr, linux_args.len, linux_args.prot,
317		    linux_args.flags, linux_args.fd, linux_args.pos);
318#endif
319
320	bsd_args.flags = 0;
321	if (linux_args.flags & LINUX_MAP_SHARED)
322		bsd_args.flags |= MAP_SHARED;
323	if (linux_args.flags & LINUX_MAP_PRIVATE)
324		bsd_args.flags |= MAP_PRIVATE;
325	if (linux_args.flags & LINUX_MAP_FIXED)
326		bsd_args.flags |= MAP_FIXED;
327	if (linux_args.flags & LINUX_MAP_ANON)
328		bsd_args.flags |= MAP_ANON;
329	if (linux_args.flags & LINUX_MAP_GROWSDOWN) {
330		bsd_args.flags |= MAP_STACK;
331
332		/* The linux MAP_GROWSDOWN option does not limit auto
333		 * growth of the region.  Linux mmap with this option
334		 * takes as addr the inital BOS, and as len, the initial
335		 * region size.  It can then grow down from addr without
336		 * limit.  However, linux threads has an implicit internal
337		 * limit to stack size of STACK_SIZE.  Its just not
338		 * enforced explicitly in linux.  But, here we impose
339		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
340		 * region, since we can do this with our mmap.
341		 *
342		 * Our mmap with MAP_STACK takes addr as the maximum
343		 * downsize limit on BOS, and as len the max size of
344		 * the region.  It them maps the top SGROWSIZ bytes,
345		 * and autgrows the region down, up to the limit
346		 * in addr.
347		 *
348		 * If we don't use the MAP_STACK option, the effect
349		 * of this code is to allocate a stack region of a
350		 * fixed size of (STACK_SIZE - GUARD_SIZE).
351		 */
352
353		/* This gives us TOS */
354		bsd_args.addr = linux_args.addr + linux_args.len;
355
356		if (bsd_args.addr > p->p_vmspace->vm_maxsaddr) {
357			/* Some linux apps will attempt to mmap
358			 * thread stacks near the top of their
359			 * address space.  If their TOS is greater
360			 * than vm_maxsaddr, vm_map_growstack()
361			 * will confuse the thread stack with the
362			 * process stack and deliver a SEGV if they
363			 * attempt to grow the thread stack past their
364			 * current stacksize rlimit.  To avoid this,
365			 * adjust vm_maxsaddr upwards to reflect
366			 * the current stacksize rlimit rather
367			 * than the maximum possible stacksize.
368			 * It would be better to adjust the
369			 * mmap'ed region, but some apps do not check
370			 * mmap's return value.
371			 */
372			mtx_assert(&Giant, MA_OWNED);
373			p->p_vmspace->vm_maxsaddr = (char *)USRSTACK -
374			    p->p_rlimit[RLIMIT_STACK].rlim_cur;
375		}
376
377		/* This gives us our maximum stack size */
378		if (linux_args.len > STACK_SIZE - GUARD_SIZE)
379			bsd_args.len = linux_args.len;
380		else
381			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
382
383		/* This gives us a new BOS.  If we're using VM_STACK, then
384		 * mmap will just map the top SGROWSIZ bytes, and let
385		 * the stack grow down to the limit at BOS.  If we're
386		 * not using VM_STACK we map the full stack, since we
387		 * don't have a way to autogrow it.
388		 */
389		bsd_args.addr -= bsd_args.len;
390	} else {
391		bsd_args.addr = linux_args.addr;
392		bsd_args.len  = linux_args.len;
393	}
394
395	bsd_args.prot = linux_args.prot | PROT_READ;	/* always required */
396	if (linux_args.flags & LINUX_MAP_ANON)
397		bsd_args.fd = -1;
398	else
399		bsd_args.fd = linux_args.fd;
400	bsd_args.pos = linux_args.pos;
401	bsd_args.pad = 0;
402
403#ifdef DEBUG
404	if (ldebug(mmap))
405		printf("-> (%p, %d, %d, 0x%08x, %d, %d)\n",
406		    (void *)bsd_args.addr, bsd_args.len, bsd_args.prot,
407		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
408#endif
409
410	return (mmap(p, &bsd_args));
411}
412
413int
414linux_pipe(struct proc *p, struct linux_pipe_args *args)
415{
416	int error;
417	int reg_edx;
418
419#ifdef DEBUG
420	if (ldebug(pipe))
421		printf(ARGS(pipe, "*"));
422#endif
423
424	reg_edx = p->p_retval[1];
425	error = pipe(p, 0);
426	if (error) {
427		p->p_retval[1] = reg_edx;
428		return (error);
429	}
430
431	error = copyout(p->p_retval, args->pipefds, 2*sizeof(int));
432	if (error) {
433		p->p_retval[1] = reg_edx;
434		return (error);
435	}
436
437	p->p_retval[1] = reg_edx;
438	p->p_retval[0] = 0;
439	return (0);
440}
441
442int
443linux_ioperm(struct proc *p, struct linux_ioperm_args *args)
444{
445	struct sysarch_args sa;
446	struct i386_ioperm_args *iia;
447	caddr_t sg;
448
449	sg = stackgap_init();
450	iia = stackgap_alloc(&sg, sizeof(struct i386_ioperm_args));
451	iia->start = args->start;
452	iia->length = args->length;
453	iia->enable = args->enable;
454	sa.op = I386_SET_IOPERM;
455	sa.parms = (char *)iia;
456	return (sysarch(p, &sa));
457}
458
459int
460linux_iopl(struct proc *p, struct linux_iopl_args *args)
461{
462	int error;
463
464	if (args->level < 0 || args->level > 3)
465		return (EINVAL);
466	if ((error = suser(p)) != 0)
467		return (error);
468	if (securelevel > 0)
469		return (EPERM);
470	p->p_md.md_regs->tf_eflags = (p->p_md.md_regs->tf_eflags & ~PSL_IOPL) |
471	    (args->level * (PSL_IOPL / 3));
472	return (0);
473}
474
475int
476linux_modify_ldt(p, uap)
477	struct proc *p;
478	struct linux_modify_ldt_args *uap;
479{
480	int error;
481	caddr_t sg;
482	struct sysarch_args args;
483	struct i386_ldt_args *ldt;
484	struct linux_descriptor ld;
485	union descriptor *desc;
486
487	sg = stackgap_init();
488
489	if (uap->ptr == NULL)
490		return (EINVAL);
491
492	switch (uap->func) {
493	case 0x00: /* read_ldt */
494		ldt = stackgap_alloc(&sg, sizeof(*ldt));
495		ldt->start = 0;
496		ldt->descs = uap->ptr;
497		ldt->num = uap->bytecount / sizeof(union descriptor);
498		args.op = I386_GET_LDT;
499		args.parms = (char*)ldt;
500		error = sysarch(p, &args);
501		p->p_retval[0] *= sizeof(union descriptor);
502		break;
503	case 0x01: /* write_ldt */
504	case 0x11: /* write_ldt */
505		if (uap->bytecount != sizeof(ld))
506			return (EINVAL);
507
508		error = copyin(uap->ptr, &ld, sizeof(ld));
509		if (error)
510			return (error);
511
512		ldt = stackgap_alloc(&sg, sizeof(*ldt));
513		desc = stackgap_alloc(&sg, sizeof(*desc));
514		ldt->start = ld.entry_number;
515		ldt->descs = desc;
516		ldt->num = 1;
517		desc->sd.sd_lolimit = (ld.limit & 0x0000ffff);
518		desc->sd.sd_hilimit = (ld.limit & 0x000f0000) >> 16;
519		desc->sd.sd_lobase = (ld.base_addr & 0x00ffffff);
520		desc->sd.sd_hibase = (ld.base_addr & 0xff000000) >> 24;
521		desc->sd.sd_type = SDT_MEMRO | ((ld.read_exec_only ^ 1) << 1) |
522			(ld.contents << 2);
523		desc->sd.sd_dpl = 3;
524		desc->sd.sd_p = (ld.seg_not_present ^ 1);
525		desc->sd.sd_xx = 0;
526		desc->sd.sd_def32 = ld.seg_32bit;
527		desc->sd.sd_gran = ld.limit_in_pages;
528		args.op = I386_SET_LDT;
529		args.parms = (char*)ldt;
530		error = sysarch(p, &args);
531		break;
532	default:
533		error = EINVAL;
534		break;
535	}
536
537	if (error == EOPNOTSUPP) {
538		printf("linux: modify_ldt needs kernel option USER_LDT\n");
539		error = ENOSYS;
540	}
541
542	return (error);
543}
544
545int
546linux_sigaction(struct proc *p, struct linux_sigaction_args *args)
547{
548	linux_osigaction_t osa;
549	linux_sigaction_t act, oact;
550	int error;
551
552#ifdef DEBUG
553	if (ldebug(sigaction))
554		printf(ARGS(sigaction, "%d, %p, %p"),
555		    args->sig, (void *)args->nsa, (void *)args->osa);
556#endif
557
558	if (args->nsa != NULL) {
559		error = copyin(args->nsa, &osa, sizeof(linux_osigaction_t));
560		if (error)
561			return (error);
562		act.lsa_handler = osa.lsa_handler;
563		act.lsa_flags = osa.lsa_flags;
564		act.lsa_restorer = osa.lsa_restorer;
565		LINUX_SIGEMPTYSET(act.lsa_mask);
566		act.lsa_mask.__bits[0] = osa.lsa_mask;
567	}
568
569	error = linux_do_sigaction(p, args->sig, args->nsa ? &act : NULL,
570	    args->osa ? &oact : NULL);
571
572	if (args->osa != NULL && !error) {
573		osa.lsa_handler = oact.lsa_handler;
574		osa.lsa_flags = oact.lsa_flags;
575		osa.lsa_restorer = oact.lsa_restorer;
576		osa.lsa_mask = oact.lsa_mask.__bits[0];
577		error = copyout(&osa, args->osa, sizeof(linux_osigaction_t));
578	}
579
580	return (error);
581}
582
583/*
584 * Linux has two extra args, restart and oldmask.  We dont use these,
585 * but it seems that "restart" is actually a context pointer that
586 * enables the signal to happen with a different register set.
587 */
588int
589linux_sigsuspend(struct proc *p, struct linux_sigsuspend_args *args)
590{
591	struct sigsuspend_args bsd;
592	sigset_t *sigmask;
593	linux_sigset_t mask;
594	caddr_t sg = stackgap_init();
595
596#ifdef DEBUG
597	if (ldebug(sigsuspend))
598		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
599#endif
600
601	sigmask = stackgap_alloc(&sg, sizeof(sigset_t));
602	LINUX_SIGEMPTYSET(mask);
603	mask.__bits[0] = args->mask;
604	linux_to_bsd_sigset(&mask, sigmask);
605	bsd.sigmask = sigmask;
606	return (sigsuspend(p, &bsd));
607}
608
609int
610linux_rt_sigsuspend(p, uap)
611	struct proc *p;
612	struct linux_rt_sigsuspend_args *uap;
613{
614	linux_sigset_t lmask;
615	sigset_t *bmask;
616	struct sigsuspend_args bsd;
617	caddr_t sg = stackgap_init();
618	int error;
619
620#ifdef DEBUG
621	if (ldebug(rt_sigsuspend))
622		printf(ARGS(rt_sigsuspend, "%p, %d"),
623		    (void *)uap->newset, uap->sigsetsize);
624#endif
625
626	if (uap->sigsetsize != sizeof(linux_sigset_t))
627		return (EINVAL);
628
629	error = copyin(uap->newset, &lmask, sizeof(linux_sigset_t));
630	if (error)
631		return (error);
632
633	bmask = stackgap_alloc(&sg, sizeof(sigset_t));
634	linux_to_bsd_sigset(&lmask, bmask);
635	bsd.sigmask = bmask;
636	return (sigsuspend(p, &bsd));
637}
638
639int
640linux_pause(struct proc *p, struct linux_pause_args *args)
641{
642	struct sigsuspend_args bsd;
643	sigset_t *sigmask;
644	caddr_t sg = stackgap_init();
645
646#ifdef DEBUG
647	if (ldebug(pause))
648		printf(ARGS(pause, ""));
649#endif
650
651	sigmask = stackgap_alloc(&sg, sizeof(sigset_t));
652	PROC_LOCK(p);
653	*sigmask = p->p_sigmask;
654	PROC_UNLOCK(p);
655	bsd.sigmask = sigmask;
656	return (sigsuspend(p, &bsd));
657}
658
659int
660linux_sigaltstack(p, uap)
661	struct proc *p;
662	struct linux_sigaltstack_args *uap;
663{
664	struct sigaltstack_args bsd;
665	stack_t *ss, *oss;
666	linux_stack_t lss;
667	int error;
668	caddr_t sg = stackgap_init();
669
670#ifdef DEBUG
671	if (ldebug(sigaltstack))
672		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
673#endif
674
675	if (uap->uss == NULL) {
676		ss = NULL;
677	} else {
678		error = copyin(uap->uss, &lss, sizeof(linux_stack_t));
679		if (error)
680			return (error);
681
682		ss = stackgap_alloc(&sg, sizeof(stack_t));
683		ss->ss_sp = lss.ss_sp;
684		ss->ss_size = lss.ss_size;
685		ss->ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
686	}
687	oss = (uap->uoss != NULL)
688	    ? stackgap_alloc(&sg, sizeof(stack_t))
689	    : NULL;
690
691	bsd.ss = ss;
692	bsd.oss = oss;
693	error = sigaltstack(p, &bsd);
694
695	if (!error && oss != NULL) {
696		lss.ss_sp = oss->ss_sp;
697		lss.ss_size = oss->ss_size;
698		lss.ss_flags = bsd_to_linux_sigaltstack(oss->ss_flags);
699		error = copyout(&lss, uap->uoss, sizeof(linux_stack_t));
700	}
701
702	return (error);
703}
704