linux_machdep.c revision 67051
1/*-
2 * Copyright (c) 2000 Marcel Moolenaar
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 * $FreeBSD: head/sys/i386/linux/linux_machdep.c 67051 2000-10-13 01:57:43Z gallatin $
29 */
30
31#include <sys/param.h>
32#include <sys/mman.h>
33#include <sys/proc.h>
34#include <sys/sysproto.h>
35#include <sys/systm.h>
36#include <sys/unistd.h>
37
38#include <machine/frame.h>
39#include <machine/psl.h>
40#include <machine/segments.h>
41#include <machine/sysarch.h>
42
43#include <i386/linux/linux.h>
44#include <i386/linux/linux_proto.h>
45#include <compat/linux/linux_ipc.h>
46#include <compat/linux/linux_signal.h>
47#include <compat/linux/linux_util.h>
48
/*
 * LDT entry description as copied in from the Linux caller by
 * linux_modify_ldt() for its write_ldt requests.  The members are
 * repacked into a native segment descriptor before the request is
 * handed to sysarch() with I386_SET_LDT.
 */
struct linux_descriptor {
	unsigned int  entry_number;	/* LDT slot; becomes the start index */
	unsigned long base_addr;	/* segment base; split into lo/hi base */
	unsigned int  limit;		/* segment limit; low 20 bits are used */
	unsigned int  seg_32bit:1;	/* copied to sd_def32 */
	unsigned int  contents:2;	/* shifted left by 2 into sd_type */
	unsigned int  read_exec_only:1;	/* inverted, shifted left by 1 into sd_type */
	unsigned int  limit_in_pages:1;	/* copied to sd_gran */
	unsigned int  seg_not_present:1; /* inverted into sd_p */
	unsigned int  useable:1;	/* not consumed by linux_modify_ldt() */
};
60
/*
 * Argument block read from user space by linux_select().  The caller
 * passes a single pointer to this structure; its members are forwarded
 * one-to-one to linux_newselect().
 */
struct linux_select_argv {
	int nfds;
	fd_set *readfds;
	fd_set *writefds;
	fd_set *exceptfds;
	struct timeval *timeout;
};
68
69int
70linux_to_bsd_sigaltstack(int lsa)
71{
72	int bsa = 0;
73
74	if (lsa & LINUX_SS_DISABLE)
75		bsa |= SS_DISABLE;
76	if (lsa & LINUX_SS_ONSTACK)
77		bsa |= SS_ONSTACK;
78	if (lsa == LINUX_SS_ONSTACK_BC)
79		bsa = SS_ONSTACK;
80	return (bsa);
81}
82
83int
84bsd_to_linux_sigaltstack(int bsa)
85{
86	int lsa = 0;
87
88	if (bsa & SS_DISABLE)
89		lsa |= LINUX_SS_DISABLE;
90	if (bsa & SS_ONSTACK)
91		lsa |= LINUX_SS_ONSTACK;
92	return (lsa);
93}
94
95int
96linux_execve(struct proc *p, struct linux_execve_args *args)
97{
98	struct execve_args bsd;
99	caddr_t sg;
100
101	sg = stackgap_init();
102	CHECKALTEXIST(p, &sg, args->path);
103
104#ifdef DEBUG
105        printf("Linux-emul(%d): execve(%s)\n",
106	    p->p_pid, args->path);
107#endif
108
109	bsd.fname = args->path;
110	bsd.argv = args->argp;
111	bsd.envv = args->envp;
112	return (execve(p, &bsd));
113}
114
115int
116linux_ipc(struct proc *p, struct linux_ipc_args *args)
117{
118	switch (args->what) {
119	case LINUX_SEMOP:
120		return (linux_semop(p, args));
121	case LINUX_SEMGET:
122		return (linux_semget(p, args));
123	case LINUX_SEMCTL:
124		return (linux_semctl(p, args));
125	case LINUX_MSGSND:
126		return (linux_msgsnd(p, args));
127	case LINUX_MSGRCV:
128		return (linux_msgrcv(p, args));
129	case LINUX_MSGGET:
130		return (linux_msgget(p, args));
131	case LINUX_MSGCTL:
132		return (linux_msgctl(p, args));
133	case LINUX_SHMAT:
134		return (linux_shmat(p, args));
135	case LINUX_SHMDT:
136		return (linux_shmdt(p, args));
137	case LINUX_SHMGET:
138		return (linux_shmget(p, args));
139	case LINUX_SHMCTL:
140		return (linux_shmctl(p, args));
141	}
142
143	uprintf("LINUX: 'ipc' typ=%d not implemented\n", args->what);
144	return (ENOSYS);
145}
146
147int
148linux_select(struct proc *p, struct linux_select_args *args)
149{
150	struct linux_select_argv linux_args;
151	struct linux_newselect_args newsel;
152	int error;
153
154#ifdef SELECT_DEBUG
155	printf("Linux-emul(%ld): select(%x)\n", (long)p->p_pid, args->ptr);
156#endif
157
158	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
159	if (error)
160		return (error);
161
162	newsel.nfds = linux_args.nfds;
163	newsel.readfds = linux_args.readfds;
164	newsel.writefds = linux_args.writefds;
165	newsel.exceptfds = linux_args.exceptfds;
166	newsel.timeout = linux_args.timeout;
167	return (linux_newselect(p, &newsel));
168}
169
170int
171linux_fork(struct proc *p, struct linux_fork_args *args)
172{
173	int error;
174
175#ifdef DEBUG
176	printf("Linux-emul(%ld): fork()\n", (long)p->p_pid);
177#endif
178
179	if ((error = fork(p, (struct fork_args *)args)) != 0)
180		return (error);
181
182	if (p->p_retval[1] == 1)
183		p->p_retval[0] = 0;
184	return (0);
185}
186
187int
188linux_vfork(struct proc *p, struct linux_vfork_args *args)
189{
190	int error;
191
192#ifdef DEBUG
193	printf("Linux-emul(%ld): vfork()\n", (long)p->p_pid);
194#endif
195
196	if ((error = vfork(p, (struct vfork_args *)args)) != 0)
197		return (error);
198	/* Are we the child? */
199	if (p->p_retval[1] == 1)
200		p->p_retval[0] = 0;
201	return (0);
202}
203
/*
 * Flag bits examined in the "flags" argument of linux_clone().  The low
 * byte of that argument is not a flag: linux_clone() reads it as the
 * child's exit signal.
 */
#define CLONE_VM	0x100	/* requests RFMEM */
#define CLONE_FS	0x200	/* not examined by linux_clone() */
#define CLONE_FILES	0x400	/* when clear, RFFDG is requested */
#define CLONE_SIGHAND	0x800	/* requests RFSIGSHARE */
#define CLONE_PID	0x1000	/* only reported as unsupported under DEBUG */
209
210int
211linux_clone(struct proc *p, struct linux_clone_args *args)
212{
213	int error, ff = RFPROC;
214	struct proc *p2;
215	int exit_signal;
216	vm_offset_t start;
217	struct rfork_args rf_args;
218
219#ifdef DEBUG
220	if (args->flags & CLONE_PID)
221		printf("linux_clone(%ld): CLONE_PID not yet supported\n",
222		    (long)p->p_pid);
223	printf("linux_clone(%ld): invoked with flags %x and stack %x\n",
224	    (long)p->p_pid, (unsigned int)args->flags,
225	    (unsigned int)args->stack);
226#endif
227
228	if (!args->stack)
229		return (EINVAL);
230
231	exit_signal = args->flags & 0x000000ff;
232	if (exit_signal >= LINUX_NSIG)
233		return (EINVAL);
234
235	if (exit_signal <= LINUX_SIGTBLSZ)
236		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
237
238	/* RFTHREAD probably not necessary here, but it shouldn't hurt */
239	ff |= RFTHREAD;
240
241	if (args->flags & CLONE_VM)
242		ff |= RFMEM;
243	if (args->flags & CLONE_SIGHAND)
244		ff |= RFSIGSHARE;
245	if (!(args->flags & CLONE_FILES))
246		ff |= RFFDG;
247
248	error = 0;
249	start = 0;
250
251	rf_args.flags = ff;
252	if ((error = rfork(p, &rf_args)) != 0)
253		return (error);
254
255	p2 = pfind(p->p_retval[0]);
256	if (p2 == 0)
257		return (ESRCH);
258
259	p2->p_sigparent = exit_signal;
260	p2->p_md.md_regs->tf_esp = (unsigned int)args->stack;
261
262#ifdef DEBUG
263	printf ("linux_clone(%ld): successful rfork to %ld\n", (long)p->p_pid,
264	    (long)p2->p_pid);
265#endif
266
267	return (0);
268}
269
270/* XXX move */
/*
 * Argument block read from user space by linux_mmap(); the system call
 * itself only carries a pointer to this structure.
 */
struct linux_mmap_argv {
	linux_caddr_t addr;	/* requested mapping address */
	int len;		/* requested length */
	int prot;		/* protection bits; PROT_READ is always added */
	int flags;		/* LINUX_MAP_* flags */
	int fd;			/* descriptor; ignored for LINUX_MAP_ANON */
	int pos;		/* passed on as the native "pos" argument */
};
279
/*
 * Sizing constants used by linux_mmap() for LINUX_MAP_GROWSDOWN
 * mappings: such a region is given a length of at least
 * (STACK_SIZE - GUARD_SIZE) bytes.
 */
#define STACK_SIZE  (2 * 1024 * 1024)
#define GUARD_SIZE  (4 * PAGE_SIZE)
282
283int
284linux_mmap(struct proc *p, struct linux_mmap_args *args)
285{
286	struct mmap_args /* {
287		caddr_t addr;
288		size_t len;
289		int prot;
290		int flags;
291		int fd;
292		long pad;
293		off_t pos;
294	} */ bsd_args;
295	int error;
296	struct linux_mmap_argv linux_args;
297
298	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
299	if (error)
300		return (error);
301
302#ifdef DEBUG
303	printf("Linux-emul(%ld): mmap(%p, %d, %d, 0x%08x, %d, %d)",
304	    (long)p->p_pid, (void *)linux_args.addr, linux_args.len,
305	    linux_args.prot, linux_args.flags, linux_args.fd, linux_args.pos);
306#endif
307
308	bsd_args.flags = 0;
309	if (linux_args.flags & LINUX_MAP_SHARED)
310		bsd_args.flags |= MAP_SHARED;
311	if (linux_args.flags & LINUX_MAP_PRIVATE)
312		bsd_args.flags |= MAP_PRIVATE;
313	if (linux_args.flags & LINUX_MAP_FIXED)
314		bsd_args.flags |= MAP_FIXED;
315	if (linux_args.flags & LINUX_MAP_ANON)
316		bsd_args.flags |= MAP_ANON;
317	if (linux_args.flags & LINUX_MAP_GROWSDOWN) {
318		bsd_args.flags |= MAP_STACK;
319
320		/* The linux MAP_GROWSDOWN option does not limit auto
321		 * growth of the region.  Linux mmap with this option
322		 * takes as addr the inital BOS, and as len, the initial
323		 * region size.  It can then grow down from addr without
324		 * limit.  However, linux threads has an implicit internal
325		 * limit to stack size of STACK_SIZE.  Its just not
326		 * enforced explicitly in linux.  But, here we impose
327		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
328		 * region, since we can do this with our mmap.
329		 *
330		 * Our mmap with MAP_STACK takes addr as the maximum
331		 * downsize limit on BOS, and as len the max size of
332		 * the region.  It them maps the top SGROWSIZ bytes,
333		 * and autgrows the region down, up to the limit
334		 * in addr.
335		 *
336		 * If we don't use the MAP_STACK option, the effect
337		 * of this code is to allocate a stack region of a
338		 * fixed size of (STACK_SIZE - GUARD_SIZE).
339		 */
340
341		/* This gives us TOS */
342		bsd_args.addr = linux_args.addr + linux_args.len;
343
344		/* This gives us our maximum stack size */
345		if (linux_args.len > STACK_SIZE - GUARD_SIZE)
346			bsd_args.len = linux_args.len;
347		else
348			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
349
350		/* This gives us a new BOS.  If we're using VM_STACK, then
351		 * mmap will just map the top SGROWSIZ bytes, and let
352		 * the stack grow down to the limit at BOS.  If we're
353		 * not using VM_STACK we map the full stack, since we
354		 * don't have a way to autogrow it.
355		 */
356		bsd_args.addr -= bsd_args.len;
357	} else {
358		bsd_args.addr = linux_args.addr;
359		bsd_args.len  = linux_args.len;
360	}
361
362	bsd_args.prot = linux_args.prot | PROT_READ;	/* always required */
363	if (linux_args.flags & LINUX_MAP_ANON)
364		bsd_args.fd = -1;
365	else
366		bsd_args.fd = linux_args.fd;
367	bsd_args.pos = linux_args.pos;
368	bsd_args.pad = 0;
369
370#ifdef DEBUG
371	printf("-> (%p, %d, %d, 0x%08x, %d, %d)\n", (void *)bsd_args.addr,
372	    bsd_args.len, bsd_args.prot, bsd_args.flags, bsd_args.fd,
373	    (int)bsd_args.pos);
374#endif
375
376	return (mmap(p, &bsd_args));
377}
378
379int
380linux_pipe(struct proc *p, struct linux_pipe_args *args)
381{
382	int error;
383	int reg_edx;
384
385#ifdef DEBUG
386	printf("Linux-emul(%ld): pipe(*)\n", (long)p->p_pid);
387#endif
388
389	reg_edx = p->p_retval[1];
390	error = pipe(p, 0);
391	if (error) {
392		p->p_retval[1] = reg_edx;
393		return (error);
394	}
395
396	error = copyout(p->p_retval, args->pipefds, 2*sizeof(int));
397	if (error) {
398		p->p_retval[1] = reg_edx;
399		return (error);
400	}
401
402	p->p_retval[1] = reg_edx;
403	p->p_retval[0] = 0;
404	return (0);
405}
406
407int
408linux_ioperm(struct proc *p, struct linux_ioperm_args *args)
409{
410	struct sysarch_args sa;
411	struct i386_ioperm_args *iia;
412	caddr_t sg;
413
414	sg = stackgap_init();
415	iia = stackgap_alloc(&sg, sizeof(struct i386_ioperm_args));
416	iia->start = args->start;
417	iia->length = args->length;
418	iia->enable = args->enable;
419	sa.op = I386_SET_IOPERM;
420	sa.parms = (char *)iia;
421	return (sysarch(p, &sa));
422}
423
424int
425linux_iopl(struct proc *p, struct linux_iopl_args *args)
426{
427	int error;
428
429	if (args->level < 0 || args->level > 3)
430		return (EINVAL);
431	if ((error = suser(p)) != 0)
432		return (error);
433	if (securelevel > 0)
434		return (EPERM);
435	p->p_md.md_regs->tf_eflags = (p->p_md.md_regs->tf_eflags & ~PSL_IOPL) |
436	    (args->level * (PSL_IOPL / 3));
437	return (0);
438}
439
440int
441linux_modify_ldt(p, uap)
442	struct proc *p;
443	struct linux_modify_ldt_args *uap;
444{
445	int error;
446	caddr_t sg;
447	struct sysarch_args args;
448	struct i386_ldt_args *ldt;
449	struct linux_descriptor ld;
450	union descriptor *desc;
451
452	sg = stackgap_init();
453
454	if (uap->ptr == NULL)
455		return (EINVAL);
456
457	switch (uap->func) {
458	case 0x00: /* read_ldt */
459		ldt = stackgap_alloc(&sg, sizeof(*ldt));
460		ldt->start = 0;
461		ldt->descs = uap->ptr;
462		ldt->num = uap->bytecount / sizeof(union descriptor);
463		args.op = I386_GET_LDT;
464		args.parms = (char*)ldt;
465		error = sysarch(p, &args);
466		p->p_retval[0] *= sizeof(union descriptor);
467		break;
468	case 0x01: /* write_ldt */
469	case 0x11: /* write_ldt */
470		if (uap->bytecount != sizeof(ld))
471			return (EINVAL);
472
473		error = copyin(uap->ptr, &ld, sizeof(ld));
474		if (error)
475			return (error);
476
477		ldt = stackgap_alloc(&sg, sizeof(*ldt));
478		desc = stackgap_alloc(&sg, sizeof(*desc));
479		ldt->start = ld.entry_number;
480		ldt->descs = desc;
481		ldt->num = 1;
482		desc->sd.sd_lolimit = (ld.limit & 0x0000ffff);
483		desc->sd.sd_hilimit = (ld.limit & 0x000f0000) >> 16;
484		desc->sd.sd_lobase = (ld.base_addr & 0x00ffffff);
485		desc->sd.sd_hibase = (ld.base_addr & 0xff000000) >> 24;
486		desc->sd.sd_type = SDT_MEMRO | ((ld.read_exec_only ^ 1) << 1) |
487			(ld.contents << 2);
488		desc->sd.sd_dpl = 3;
489		desc->sd.sd_p = (ld.seg_not_present ^ 1);
490		desc->sd.sd_xx = 0;
491		desc->sd.sd_def32 = ld.seg_32bit;
492		desc->sd.sd_gran = ld.limit_in_pages;
493		args.op = I386_SET_LDT;
494		args.parms = (char*)ldt;
495		error = sysarch(p, &args);
496		break;
497	default:
498		error = EINVAL;
499		break;
500	}
501
502	if (error == EOPNOTSUPP) {
503		printf("linux: modify_ldt needs kernel option USER_LDT\n");
504		error = ENOSYS;
505	}
506
507	return (error);
508}
509
510int
511linux_sigaction(struct proc *p, struct linux_sigaction_args *args)
512{
513	linux_osigaction_t osa;
514	linux_sigaction_t act, oact;
515	int error;
516
517#ifdef DEBUG
518	printf("Linux-emul(%ld): sigaction(%d, %p, %p)\n", (long)p->p_pid,
519	       args->sig, (void *)args->nsa, (void *)args->osa);
520#endif
521
522	if (args->nsa != NULL) {
523		error = copyin(args->nsa, &osa, sizeof(linux_osigaction_t));
524		if (error)
525			return (error);
526		act.lsa_handler = osa.lsa_handler;
527		act.lsa_flags = osa.lsa_flags;
528		act.lsa_restorer = osa.lsa_restorer;
529		LINUX_SIGEMPTYSET(act.lsa_mask);
530		act.lsa_mask.__bits[0] = osa.lsa_mask;
531	}
532
533	error = linux_do_sigaction(p, args->sig, args->nsa ? &act : NULL,
534	    args->osa ? &oact : NULL);
535
536	if (args->osa != NULL && !error) {
537		osa.lsa_handler = oact.lsa_handler;
538		osa.lsa_flags = oact.lsa_flags;
539		osa.lsa_restorer = oact.lsa_restorer;
540		osa.lsa_mask = oact.lsa_mask.__bits[0];
541		error = copyout(&osa, args->osa, sizeof(linux_osigaction_t));
542	}
543
544	return (error);
545}
546
547/*
548 * Linux has two extra args, restart and oldmask.  We dont use these,
549 * but it seems that "restart" is actually a context pointer that
550 * enables the signal to happen with a different register set.
551 */
552int
553linux_sigsuspend(struct proc *p, struct linux_sigsuspend_args *args)
554{
555	struct sigsuspend_args bsd;
556	sigset_t *sigmask;
557	linux_sigset_t mask;
558	caddr_t sg = stackgap_init();
559
560#ifdef DEBUG
561	printf("Linux-emul(%ld): sigsuspend(%08lx)\n",
562	       (long)p->p_pid, (unsigned long)args->mask);
563#endif
564
565	sigmask = stackgap_alloc(&sg, sizeof(sigset_t));
566	LINUX_SIGEMPTYSET(mask);
567	mask.__bits[0] = args->mask;
568	linux_to_bsd_sigset(&mask, sigmask);
569	bsd.sigmask = sigmask;
570	return (sigsuspend(p, &bsd));
571}
572
573int
574linux_rt_sigsuspend(p, uap)
575	struct proc *p;
576	struct linux_rt_sigsuspend_args *uap;
577{
578	linux_sigset_t lmask;
579	sigset_t *bmask;
580	struct sigsuspend_args bsd;
581	caddr_t sg = stackgap_init();
582	int error;
583
584#ifdef DEBUG
585	printf("Linux-emul(%ld): rt_sigsuspend(%p, %d)\n", (long)p->p_pid,
586	       (void *)uap->newset, uap->sigsetsize);
587#endif
588
589	if (uap->sigsetsize != sizeof(linux_sigset_t))
590		return (EINVAL);
591
592	error = copyin(uap->newset, &lmask, sizeof(linux_sigset_t));
593	if (error)
594		return (error);
595
596	bmask = stackgap_alloc(&sg, sizeof(sigset_t));
597	linux_to_bsd_sigset(&lmask, bmask);
598	bsd.sigmask = bmask;
599	return (sigsuspend(p, &bsd));
600}
601
602int
603linux_pause(struct proc *p, struct linux_pause_args *args)
604{
605	struct sigsuspend_args bsd;
606	sigset_t *sigmask;
607	caddr_t sg = stackgap_init();
608
609#ifdef DEBUG
610	printf("Linux-emul(%d): pause()\n", p->p_pid);
611#endif
612
613	sigmask = stackgap_alloc(&sg, sizeof(sigset_t));
614	*sigmask = p->p_sigmask;
615	bsd.sigmask = sigmask;
616	return (sigsuspend(p, &bsd));
617}
618
619int
620linux_sigaltstack(p, uap)
621	struct proc *p;
622	struct linux_sigaltstack_args *uap;
623{
624	struct sigaltstack_args bsd;
625	stack_t *ss, *oss;
626	linux_stack_t lss;
627	int error;
628	caddr_t sg = stackgap_init();
629
630#ifdef DEBUG
631	printf("Linux-emul(%ld): sigaltstack(%p, %p)\n",
632	    (long)p->p_pid, uap->uss, uap->uoss);
633#endif
634
635	if (uap->uss == NULL) {
636		ss = NULL;
637	} else {
638		error = copyin(uap->uss, &lss, sizeof(linux_stack_t));
639		if (error)
640			return (error);
641
642		ss = stackgap_alloc(&sg, sizeof(stack_t));
643		ss->ss_sp = lss.ss_sp;
644		ss->ss_size = (lss.ss_size >= LINUX_MINSIGSTKSZ &&
645		    lss.ss_size < MINSIGSTKSZ) ? MINSIGSTKSZ : lss.ss_size;
646		ss->ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
647	}
648	oss = (uap->uoss != NULL)
649	    ? stackgap_alloc(&sg, sizeof(stack_t))
650	    : NULL;
651
652	bsd.ss = ss;
653	bsd.oss = oss;
654	error = sigaltstack(p, &bsd);
655
656	if (!error && oss != NULL) {
657		lss.ss_sp = oss->ss_sp;
658		lss.ss_size = oss->ss_size;
659		lss.ss_flags = bsd_to_linux_sigaltstack(oss->ss_flags);
660		error = copyout(&lss, uap->uoss, sizeof(linux_stack_t));
661	}
662
663	return (error);
664}
665