linux_machdep.c revision 65067
1/*-
2 * Copyright (c) 2000 Marcel Moolenaar
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 *    derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 *
28 * $FreeBSD: head/sys/i386/linux/linux_machdep.c 65067 2000-08-25 07:32:24Z marcel $
29 */
30
31#include <sys/param.h>
32#include <sys/mman.h>
33#include <sys/proc.h>
34#include <sys/sysproto.h>
35#include <sys/systm.h>
36#include <sys/unistd.h>
37
38#include <machine/frame.h>
39#include <machine/psl.h>
40#include <machine/segments.h>
41#include <machine/sysarch.h>
42
43#include <i386/linux/linux.h>
44#include <i386/linux/linux_proto.h>
45#include <compat/linux/linux_ipc.h>
46#include <compat/linux/linux_signal.h>
47#include <compat/linux/linux_util.h>
48
/*
 * Segment description as supplied by a Linux modify_ldt() caller.
 * linux_modify_ldt() copies one of these in from user space and
 * translates it into a native segment descriptor.
 */
struct linux_descriptor {
	unsigned int	entry_number;		/* LDT slot to write */
	unsigned long	base_addr;		/* segment base address */
	unsigned int	limit;			/* segment limit; low 20 bits used */
	unsigned int	seg_32bit:1;		/* copied to sd_def32 */
	unsigned int	contents:2;		/* becomes bits 2-3 of sd_type */
	unsigned int	read_exec_only:1;	/* inverted into bit 1 of sd_type */
	unsigned int	limit_in_pages:1;	/* copied to sd_gran */
	unsigned int	seg_not_present:1;	/* inverted into sd_p */
	unsigned int	useable:1;		/* not consulted in this file */
};
60
/*
 * Argument block for the old-style Linux select().  linux_select()
 * copies it in from user space and forwards every member unchanged to
 * linux_newselect().
 */
struct linux_select_argv {
	int		 nfds;
	fd_set		*readfds;
	fd_set		*writefds;
	fd_set		*exceptfds;
	struct timeval	*timeout;
};
68
69int
70linux_execve(struct proc *p, struct linux_execve_args *args)
71{
72	struct execve_args bsd;
73	caddr_t sg;
74
75	sg = stackgap_init();
76	CHECKALTEXIST(p, &sg, args->path);
77
78#ifdef DEBUG
79        printf("Linux-emul(%d): execve(%s)\n",
80	    p->p_pid, args->path);
81#endif
82
83	bsd.fname = args->path;
84	bsd.argv = args->argp;
85	bsd.envv = args->envp;
86	return (execve(p, &bsd));
87}
88
89int
90linux_ipc(struct proc *p, struct linux_ipc_args *args)
91{
92	switch (args->what) {
93	case LINUX_SEMOP:
94		return (linux_semop(p, args));
95	case LINUX_SEMGET:
96		return (linux_semget(p, args));
97	case LINUX_SEMCTL:
98		return (linux_semctl(p, args));
99	case LINUX_MSGSND:
100		return (linux_msgsnd(p, args));
101	case LINUX_MSGRCV:
102		return (linux_msgrcv(p, args));
103	case LINUX_MSGGET:
104		return (linux_msgget(p, args));
105	case LINUX_MSGCTL:
106		return (linux_msgctl(p, args));
107	case LINUX_SHMAT:
108		return (linux_shmat(p, args));
109	case LINUX_SHMDT:
110		return (linux_shmdt(p, args));
111	case LINUX_SHMGET:
112		return (linux_shmget(p, args));
113	case LINUX_SHMCTL:
114		return (linux_shmctl(p, args));
115	}
116
117	uprintf("LINUX: 'ipc' typ=%d not implemented\n", args->what);
118	return (ENOSYS);
119}
120
121int
122linux_select(struct proc *p, struct linux_select_args *args)
123{
124	struct linux_select_argv linux_args;
125	struct linux_newselect_args newsel;
126	int error;
127
128#ifdef SELECT_DEBUG
129	printf("Linux-emul(%ld): select(%x)\n", (long)p->p_pid, args->ptr);
130#endif
131
132	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
133	if (error)
134		return (error);
135
136	newsel.nfds = linux_args.nfds;
137	newsel.readfds = linux_args.readfds;
138	newsel.writefds = linux_args.writefds;
139	newsel.exceptfds = linux_args.exceptfds;
140	newsel.timeout = linux_args.timeout;
141	return (linux_newselect(p, &newsel));
142}
143
144int
145linux_fork(struct proc *p, struct linux_fork_args *args)
146{
147	int error;
148
149#ifdef DEBUG
150	printf("Linux-emul(%ld): fork()\n", (long)p->p_pid);
151#endif
152
153	if ((error = fork(p, (struct fork_args *)args)) != 0)
154		return (error);
155
156	if (p->p_retval[1] == 1)
157		p->p_retval[0] = 0;
158	return (0);
159}
160
161int
162linux_vfork(struct proc *p, struct linux_vfork_args *args)
163{
164	int error;
165
166#ifdef DEBUG
167	printf("Linux-emul(%ld): vfork()\n", (long)p->p_pid);
168#endif
169
170	if ((error = vfork(p, (struct vfork_args *)args)) != 0)
171		return (error);
172	/* Are we the child? */
173	if (p->p_retval[1] == 1)
174		p->p_retval[0] = 0;
175	return (0);
176}
177
/*
 * Linux clone() flag bits, as interpreted by linux_clone().
 */
#define CLONE_VM	0x00000100	/* set: rfork with RFMEM */
#define CLONE_FS	0x00000200	/* not consulted by linux_clone() */
#define CLONE_FILES	0x00000400	/* clear: rfork with RFFDG */
#define CLONE_SIGHAND	0x00000800	/* set: rfork with RFSIGSHARE */
#define CLONE_PID	0x00001000	/* not yet supported */
183
184int
185linux_clone(struct proc *p, struct linux_clone_args *args)
186{
187	int error, ff = RFPROC;
188	struct proc *p2;
189	int exit_signal;
190	vm_offset_t start;
191	struct rfork_args rf_args;
192
193#ifdef DEBUG
194	if (args->flags & CLONE_PID)
195		printf("linux_clone(%ld): CLONE_PID not yet supported\n",
196		    (long)p->p_pid);
197	printf("linux_clone(%ld): invoked with flags %x and stack %x\n",
198	    (long)p->p_pid, (unsigned int)args->flags,
199	    (unsigned int)args->stack);
200#endif
201
202	if (!args->stack)
203		return (EINVAL);
204
205	exit_signal = args->flags & 0x000000ff;
206	if (exit_signal >= LINUX_NSIG)
207		return (EINVAL);
208
209	if (exit_signal <= LINUX_SIGTBLSZ)
210		exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)];
211
212	/* RFTHREAD probably not necessary here, but it shouldn't hurt */
213	ff |= RFTHREAD;
214
215	if (args->flags & CLONE_VM)
216		ff |= RFMEM;
217	if (args->flags & CLONE_SIGHAND)
218		ff |= RFSIGSHARE;
219	if (!(args->flags & CLONE_FILES))
220		ff |= RFFDG;
221
222	error = 0;
223	start = 0;
224
225	rf_args.flags = ff;
226	if ((error = rfork(p, &rf_args)) != 0)
227		return (error);
228
229	p2 = pfind(p->p_retval[0]);
230	if (p2 == 0)
231		return (ESRCH);
232
233	p2->p_sigparent = exit_signal;
234	p2->p_md.md_regs->tf_esp = (unsigned int)args->stack;
235
236#ifdef DEBUG
237	printf ("linux_clone(%ld): successful rfork to %ld\n", (long)p->p_pid,
238	    (long)p2->p_pid);
239#endif
240
241	return (0);
242}
243
244/* XXX move */
245struct linux_mmap_argv {
246	linux_caddr_t addr;
247	int len;
248	int prot;
249	int flags;
250	int fd;
251	int pos;
252};
253
/*
 * linux_mmap() gives a LINUX_MAP_GROWSDOWN region a maximum size of at
 * least (STACK_SIZE - GUARD_SIZE).
 */
#define STACK_SIZE	(2 * 1024 * 1024)	/* 2MB */
#define GUARD_SIZE	(4 * PAGE_SIZE)		/* four pages */
256
257int
258linux_mmap(struct proc *p, struct linux_mmap_args *args)
259{
260	struct mmap_args /* {
261		caddr_t addr;
262		size_t len;
263		int prot;
264		int flags;
265		int fd;
266		long pad;
267		off_t pos;
268	} */ bsd_args;
269	int error;
270	struct linux_mmap_argv linux_args;
271
272	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
273	if (error)
274		return (error);
275
276#ifdef DEBUG
277	printf("Linux-emul(%ld): mmap(%p, %d, %d, 0x%08x, %d, %d)",
278	    (long)p->p_pid, (void *)linux_args.addr, linux_args.len,
279	    linux_args.prot, linux_args.flags, linux_args.fd, linux_args.pos);
280#endif
281
282	bsd_args.flags = 0;
283	if (linux_args.flags & LINUX_MAP_SHARED)
284		bsd_args.flags |= MAP_SHARED;
285	if (linux_args.flags & LINUX_MAP_PRIVATE)
286		bsd_args.flags |= MAP_PRIVATE;
287	if (linux_args.flags & LINUX_MAP_FIXED)
288		bsd_args.flags |= MAP_FIXED;
289	if (linux_args.flags & LINUX_MAP_ANON)
290		bsd_args.flags |= MAP_ANON;
291	if (linux_args.flags & LINUX_MAP_GROWSDOWN) {
292		bsd_args.flags |= MAP_STACK;
293
294		/* The linux MAP_GROWSDOWN option does not limit auto
295		 * growth of the region.  Linux mmap with this option
296		 * takes as addr the inital BOS, and as len, the initial
297		 * region size.  It can then grow down from addr without
298		 * limit.  However, linux threads has an implicit internal
299		 * limit to stack size of STACK_SIZE.  Its just not
300		 * enforced explicitly in linux.  But, here we impose
301		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
302		 * region, since we can do this with our mmap.
303		 *
304		 * Our mmap with MAP_STACK takes addr as the maximum
305		 * downsize limit on BOS, and as len the max size of
306		 * the region.  It them maps the top SGROWSIZ bytes,
307		 * and autgrows the region down, up to the limit
308		 * in addr.
309		 *
310		 * If we don't use the MAP_STACK option, the effect
311		 * of this code is to allocate a stack region of a
312		 * fixed size of (STACK_SIZE - GUARD_SIZE).
313		 */
314
315		/* This gives us TOS */
316		bsd_args.addr = linux_args.addr + linux_args.len;
317
318		/* This gives us our maximum stack size */
319		if (linux_args.len > STACK_SIZE - GUARD_SIZE)
320			bsd_args.len = linux_args.len;
321		else
322			bsd_args.len  = STACK_SIZE - GUARD_SIZE;
323
324		/* This gives us a new BOS.  If we're using VM_STACK, then
325		 * mmap will just map the top SGROWSIZ bytes, and let
326		 * the stack grow down to the limit at BOS.  If we're
327		 * not using VM_STACK we map the full stack, since we
328		 * don't have a way to autogrow it.
329		 */
330		bsd_args.addr -= bsd_args.len;
331	} else {
332		bsd_args.addr = linux_args.addr;
333		bsd_args.len  = linux_args.len;
334	}
335
336	bsd_args.prot = linux_args.prot | PROT_READ;	/* always required */
337	if (linux_args.flags & LINUX_MAP_ANON)
338		bsd_args.fd = -1;
339	else
340		bsd_args.fd = linux_args.fd;
341	bsd_args.pos = linux_args.pos;
342	bsd_args.pad = 0;
343
344#ifdef DEBUG
345	printf("-> (%p, %d, %d, 0x%08x, %d, %d)\n", (void *)bsd_args.addr,
346	    bsd_args.len, bsd_args.prot, bsd_args.flags, bsd_args.fd,
347	    (int)bsd_args.pos);
348#endif
349
350	return (mmap(p, &bsd_args));
351}
352
353int
354linux_pipe(struct proc *p, struct linux_pipe_args *args)
355{
356	int error;
357	int reg_edx;
358
359#ifdef DEBUG
360	printf("Linux-emul(%ld): pipe(*)\n", (long)p->p_pid);
361#endif
362
363	reg_edx = p->p_retval[1];
364	error = pipe(p, 0);
365	if (error) {
366		p->p_retval[1] = reg_edx;
367		return (error);
368	}
369
370	error = copyout(p->p_retval, args->pipefds, 2*sizeof(int));
371	if (error) {
372		p->p_retval[1] = reg_edx;
373		return (error);
374	}
375
376	p->p_retval[1] = reg_edx;
377	p->p_retval[0] = 0;
378	return (0);
379}
380
381int
382linux_ioperm(struct proc *p, struct linux_ioperm_args *args)
383{
384	struct sysarch_args sa;
385	struct i386_ioperm_args *iia;
386	caddr_t sg;
387
388	sg = stackgap_init();
389	iia = stackgap_alloc(&sg, sizeof(struct i386_ioperm_args));
390	iia->start = args->start;
391	iia->length = args->length;
392	iia->enable = args->enable;
393	sa.op = I386_SET_IOPERM;
394	sa.parms = (char *)iia;
395	return (sysarch(p, &sa));
396}
397
398int
399linux_iopl(struct proc *p, struct linux_iopl_args *args)
400{
401	int error;
402
403	if (args->level < 0 || args->level > 3)
404		return (EINVAL);
405	if ((error = suser(p)) != 0)
406		return (error);
407	if (securelevel > 0)
408		return (EPERM);
409	p->p_md.md_regs->tf_eflags = (p->p_md.md_regs->tf_eflags & ~PSL_IOPL) |
410	    (args->level * (PSL_IOPL / 3));
411	return (0);
412}
413
414int
415linux_modify_ldt(p, uap)
416	struct proc *p;
417	struct linux_modify_ldt_args *uap;
418{
419	int error;
420	caddr_t sg;
421	struct sysarch_args args;
422	struct i386_ldt_args *ldt;
423	struct linux_descriptor ld;
424	union descriptor *desc;
425
426	sg = stackgap_init();
427
428	if (uap->ptr == NULL)
429		return (EINVAL);
430
431	switch (uap->func) {
432	case 0x00: /* read_ldt */
433		ldt = stackgap_alloc(&sg, sizeof(*ldt));
434		ldt->start = 0;
435		ldt->descs = uap->ptr;
436		ldt->num = uap->bytecount / sizeof(union descriptor);
437		args.op = I386_GET_LDT;
438		args.parms = (char*)ldt;
439		error = sysarch(p, &args);
440		p->p_retval[0] *= sizeof(union descriptor);
441		break;
442	case 0x01: /* write_ldt */
443	case 0x11: /* write_ldt */
444		if (uap->bytecount != sizeof(ld))
445			return (EINVAL);
446
447		error = copyin(uap->ptr, &ld, sizeof(ld));
448		if (error)
449			return (error);
450
451		ldt = stackgap_alloc(&sg, sizeof(*ldt));
452		desc = stackgap_alloc(&sg, sizeof(*desc));
453		ldt->start = ld.entry_number;
454		ldt->descs = desc;
455		ldt->num = 1;
456		desc->sd.sd_lolimit = (ld.limit & 0x0000ffff);
457		desc->sd.sd_hilimit = (ld.limit & 0x000f0000) >> 16;
458		desc->sd.sd_lobase = (ld.base_addr & 0x00ffffff);
459		desc->sd.sd_hibase = (ld.base_addr & 0xff000000) >> 24;
460		desc->sd.sd_type = SDT_MEMRO | ((ld.read_exec_only ^ 1) << 1) |
461			(ld.contents << 2);
462		desc->sd.sd_dpl = 3;
463		desc->sd.sd_p = (ld.seg_not_present ^ 1);
464		desc->sd.sd_xx = 0;
465		desc->sd.sd_def32 = ld.seg_32bit;
466		desc->sd.sd_gran = ld.limit_in_pages;
467		args.op = I386_SET_LDT;
468		args.parms = (char*)ldt;
469		error = sysarch(p, &args);
470		break;
471	default:
472		error = EINVAL;
473		break;
474	}
475
476	if (error == EOPNOTSUPP) {
477		printf("linux: modify_ldt needs kernel option USER_LDT\n");
478		error = ENOSYS;
479	}
480
481	return (error);
482}
483
484int
485linux_sigaction(struct proc *p, struct linux_sigaction_args *args)
486{
487	linux_osigaction_t osa;
488	linux_sigaction_t act, oact;
489	int error;
490
491#ifdef DEBUG
492	printf("Linux-emul(%ld): sigaction(%d, %p, %p)\n", (long)p->p_pid,
493	       args->sig, (void *)args->nsa, (void *)args->osa);
494#endif
495
496	if (args->nsa != NULL) {
497		error = copyin(args->nsa, &osa, sizeof(linux_osigaction_t));
498		if (error)
499			return (error);
500		act.lsa_handler = osa.lsa_handler;
501		act.lsa_flags = osa.lsa_flags;
502		act.lsa_restorer = osa.lsa_restorer;
503		LINUX_SIGEMPTYSET(act.lsa_mask);
504		act.lsa_mask.__bits[0] = osa.lsa_mask;
505	}
506
507	error = linux_do_sigaction(p, args->sig, args->nsa ? &act : NULL,
508	    args->osa ? &oact : NULL);
509
510	if (args->osa != NULL && !error) {
511		osa.lsa_handler = oact.lsa_handler;
512		osa.lsa_flags = oact.lsa_flags;
513		osa.lsa_restorer = oact.lsa_restorer;
514		osa.lsa_mask = oact.lsa_mask.__bits[0];
515		error = copyout(&osa, args->osa, sizeof(linux_osigaction_t));
516	}
517
518	return (error);
519}
520
521/*
522 * Linux has two extra args, restart and oldmask.  We dont use these,
523 * but it seems that "restart" is actually a context pointer that
524 * enables the signal to happen with a different register set.
525 */
526int
527linux_sigsuspend(struct proc *p, struct linux_sigsuspend_args *args)
528{
529	struct sigsuspend_args bsd;
530	sigset_t *sigmask;
531	linux_sigset_t mask;
532	caddr_t sg = stackgap_init();
533
534#ifdef DEBUG
535	printf("Linux-emul(%ld): sigsuspend(%08lx)\n",
536	       (long)p->p_pid, (unsigned long)args->mask);
537#endif
538
539	sigmask = stackgap_alloc(&sg, sizeof(sigset_t));
540	LINUX_SIGEMPTYSET(mask);
541	mask.__bits[0] = args->mask;
542	linux_to_bsd_sigset(&mask, sigmask);
543	bsd.sigmask = sigmask;
544	return (sigsuspend(p, &bsd));
545}
546
547int
548linux_rt_sigsuspend(p, uap)
549	struct proc *p;
550	struct linux_rt_sigsuspend_args *uap;
551{
552	linux_sigset_t lmask;
553	sigset_t *bmask;
554	struct sigsuspend_args bsd;
555	caddr_t sg = stackgap_init();
556	int error;
557
558#ifdef DEBUG
559	printf("Linux-emul(%ld): rt_sigsuspend(%p, %d)\n", (long)p->p_pid,
560	       (void *)uap->newset, uap->sigsetsize);
561#endif
562
563	if (uap->sigsetsize != sizeof(linux_sigset_t))
564		return (EINVAL);
565
566	error = copyin(uap->newset, &lmask, sizeof(linux_sigset_t));
567	if (error)
568		return (error);
569
570	bmask = stackgap_alloc(&sg, sizeof(sigset_t));
571	linux_to_bsd_sigset(&lmask, bmask);
572	bsd.sigmask = bmask;
573	return (sigsuspend(p, &bsd));
574}
575
576int
577linux_pause(struct proc *p, struct linux_pause_args *args)
578{
579	struct sigsuspend_args bsd;
580	sigset_t *sigmask;
581	caddr_t sg = stackgap_init();
582
583#ifdef DEBUG
584	printf("Linux-emul(%d): pause()\n", p->p_pid);
585#endif
586
587	sigmask = stackgap_alloc(&sg, sizeof(sigset_t));
588	*sigmask = p->p_sigmask;
589	bsd.sigmask = sigmask;
590	return (sigsuspend(p, &bsd));
591}
592
593int
594linux_sigaltstack(p, uap)
595	struct proc *p;
596	struct linux_sigaltstack_args *uap;
597{
598	struct sigaltstack_args bsd;
599	stack_t *ss, *oss;
600	linux_stack_t lss;
601	int error;
602	caddr_t sg = stackgap_init();
603
604#ifdef DEBUG
605	printf("Linux-emul(%ld): sigaltstack(%p, %p)\n",
606	    (long)p->p_pid, uap->uss, uap->uoss);
607#endif
608
609	error = copyin(uap->uss, &lss, sizeof(linux_stack_t));
610	if (error)
611		return (error);
612
613	ss = stackgap_alloc(&sg, sizeof(stack_t));
614	ss->ss_sp = lss.ss_sp;
615	ss->ss_size = lss.ss_size;
616	ss->ss_flags = lss.ss_flags;
617
618	oss = (uap->uoss != NULL)
619	    ? stackgap_alloc(&sg, sizeof(stack_t))
620	    : NULL;
621
622	bsd.ss = ss;
623	bsd.oss = oss;
624	error = sigaltstack(p, &bsd);
625
626	if (!error && oss != NULL) {
627		lss.ss_sp = oss->ss_sp;
628		lss.ss_size = oss->ss_size;
629		lss.ss_flags = oss->ss_flags;
630		error = copyout(&lss, uap->uoss, sizeof(linux_stack_t));
631	}
632
633	return (error);
634}
635