s10_brand.c revision 12760:be364dc3be74
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26#include <sys/errno.h>
27#include <sys/exec.h>
28#include <sys/file.h>
29#include <sys/kmem.h>
30#include <sys/modctl.h>
31#include <sys/model.h>
32#include <sys/proc.h>
33#include <sys/syscall.h>
34#include <sys/systm.h>
35#include <sys/thread.h>
36#include <sys/cmn_err.h>
37#include <sys/archsystm.h>
38#include <sys/pathname.h>
39#include <sys/sunddi.h>
40
41#include <sys/machbrand.h>
42#include <sys/brand.h>
43#include "s10_brand.h"
44
45char *s10_emulation_table = NULL;
46
47void	s10_init_brand_data(zone_t *);
48void	s10_free_brand_data(zone_t *);
49void	s10_setbrand(proc_t *);
50int	s10_getattr(zone_t *, int, void *, size_t *);
51int	s10_setattr(zone_t *, int, void *, size_t);
52int	s10_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
53		uintptr_t, uintptr_t, uintptr_t);
54void	s10_copy_procdata(proc_t *, proc_t *);
55void	s10_proc_exit(struct proc *, klwp_t *);
56void	s10_exec();
57int	s10_initlwp(klwp_t *);
58void	s10_forklwp(klwp_t *, klwp_t *);
59void	s10_freelwp(klwp_t *);
60void	s10_lwpexit(klwp_t *);
61int	s10_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
62	long *, int, caddr_t, cred_t *, int);
63void	s10_sigset_native_to_s10(sigset_t *);
64void	s10_sigset_s10_to_native(sigset_t *);
65
66/* s10 brand */
67struct brand_ops s10_brops = {
68	s10_init_brand_data,
69	s10_free_brand_data,
70	s10_brandsys,
71	s10_setbrand,
72	s10_getattr,
73	s10_setattr,
74	s10_copy_procdata,
75	s10_proc_exit,
76	s10_exec,
77	lwp_setrval,
78	s10_initlwp,
79	s10_forklwp,
80	s10_freelwp,
81	s10_lwpexit,
82	s10_elfexec,
83	s10_sigset_native_to_s10,
84	s10_sigset_s10_to_native,
85	S10_NSIG,
86};
87
88#ifdef	sparc
89
90struct brand_mach_ops s10_mops = {
91	s10_brand_syscall_callback,
92	s10_brand_syscall32_callback
93};
94
95#else	/* sparc */
96
97#ifdef	__amd64
98
99struct brand_mach_ops s10_mops = {
100	s10_brand_sysenter_callback,
101	s10_brand_int91_callback,
102	s10_brand_syscall_callback,
103	s10_brand_syscall32_callback
104};
105
106#else	/* ! __amd64 */
107
108struct brand_mach_ops s10_mops = {
109	s10_brand_sysenter_callback,
110	NULL,
111	s10_brand_syscall_callback,
112	NULL
113};
114#endif	/* __amd64 */
115
116#endif	/* _sparc */
117
118struct brand	s10_brand = {
119	BRAND_VER_1,
120	"solaris10",
121	&s10_brops,
122	&s10_mops
123};
124
125static struct modlbrand modlbrand = {
126	&mod_brandops,		/* type of module */
127	"Solaris 10 Brand",	/* description of module */
128	&s10_brand		/* driver ops */
129};
130
131static struct modlinkage modlinkage = {
132	MODREV_1, (void *)&modlbrand, NULL
133};
134
135void
136s10_setbrand(proc_t *p)
137{
138	brand_solaris_setbrand(p, &s10_brand);
139}
140
141/*ARGSUSED*/
142int
143s10_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize)
144{
145	ASSERT(zone->zone_brand == &s10_brand);
146	if (attr == S10_EMUL_BITMAP) {
147		if (buf == NULL || *bufsize != sizeof (s10_emul_bitmap_t))
148			return (EINVAL);
149		if (copyout(((s10_zone_data_t *)zone->zone_brand_data)->
150		    emul_bitmap, buf, sizeof (s10_emul_bitmap_t)) != 0)
151			return (EFAULT);
152		return (0);
153	}
154
155	return (EINVAL);
156}
157
158int
159s10_setattr(zone_t *zone, int attr, void *buf, size_t bufsize)
160{
161	ASSERT(zone->zone_brand == &s10_brand);
162	if (attr == S10_EMUL_BITMAP) {
163		if (buf == NULL || bufsize != sizeof (s10_emul_bitmap_t))
164			return (EINVAL);
165		if (copyin(buf, ((s10_zone_data_t *)zone->zone_brand_data)->
166		    emul_bitmap, sizeof (s10_emul_bitmap_t)) != 0)
167			return (EFAULT);
168		return (0);
169	}
170
171	return (EINVAL);
172}
173
174#ifdef	__amd64
/*
 * The Nevada kernel clears %fs for threads in 64-bit x86 processes but S10's
 * libc expects %fs to be nonzero.  This causes some committed
 * libc/libthread interfaces (e.g., thr_main()) to fail, which impacts several
 * libraries, including libdoor.  This function sets the specified LWP's %fs
 * register to the legacy S10 selector value (LWPFS_SEL).
 *
 * The best solution to the aforementioned problem is backporting CRs
 * 6467491 and 6501650 to Solaris 10 so that 64-bit x86 Solaris 10 processes
 * would accept zero for %fs.  Backporting the CRs is a requirement for running
 * S10 Containers in PV domUs because 64-bit Xen clears %fsbase when %fs is
 * nonzero.  Such behavior breaks 64-bit processes because Xen has to fetch the
 * FS segments' base addresses from the LWPs' GDTs, which are only capable of
 * 32-bit addressing.
 */
190/*ARGSUSED*/
191static void
192s10_amd64_correct_fsreg(klwp_t *l)
193{
194	if (lwp_getdatamodel(l) == DATAMODEL_NATIVE) {
195		kpreempt_disable();
196		l->lwp_pcb.pcb_fs = LWPFS_SEL;
197		l->lwp_pcb.pcb_rupdate = 1;
198		lwptot(l)->t_post_sys = 1;	/* Guarantee update_sregs() */
199		kpreempt_enable();
200	}
201}
202#endif	/* __amd64 */
203
/*
 * The native ld.so.1 is invoked with a set of -e options which we also want to
 * strip off.  This function assumes the set of -e options immediately follows
 * the native ld.so.1 command and is contiguous.  This is OK, since we control
 * the code in s10_isaexec_wrapper.  We do it this way so we don't accidentally
 * strip a -e option from the native command itself.  The format of an ld.so.1
 * -e option looks like:
 *	-e LD_NOENVIRON=1
 *
 * args must be a NUL-terminated string.  The return value points into args,
 * at the first character following the leading run of -e options.  If the
 * final -e option is not followed by a space (e.g., because the argument
 * string was truncated), nothing follows the options, so a pointer to the
 * terminating NUL is returned rather than a pointer to the option's value.
 */
char *
rm_e_options(char *args)
{
	char *sep;

	while (strncmp(args, "-e ", 3) == 0) {
		args += 3;		/* step over "-e " to the value */

		sep = strchr(args, ' ');
		if (sep == NULL) {
			/*
			 * The value runs to the end of the string, so there
			 * is no command left; don't mistake the value for it.
			 */
			return (args + strlen(args));
		}

		args = sep + 1;		/* step over the value and its space */
	}

	return (args);
}
226
227int
228s10_native()
229{
230	struct user	*up = PTOU(curproc);
231	char		*args_new, *comm_new, *p;
232	int		len;
233
234	/*
235	 * len has an extra value for the trailing '\0' so this covers the
236	 * appended " " in the following strcmps.
237	 */
238	len = sizeof (BRAND_NATIVE_LINKER32);
239
240	/*
241	 * Make sure that the process' interpreter is the native dynamic linker.
242	 * Convention dictates that native processes executing within solaris10-
243	 * branded zones are interpreted by the native dynamic linker (the
244	 * process and its arguments are specified as arguments to the dynamic
245	 * linker).  If this convention is violated (i.e.,
246	 * brandsys(B_S10_NATIVE, ...) is invoked by a process that shouldn't be
247	 * native), then do nothing and silently indicate success.
248	 */
249	if (strcmp(up->u_comm, S10_LINKER_NAME) != 0)
250		return (0);
251	if (strncmp(up->u_psargs, BRAND_NATIVE_LINKER64 " ", len + 3) == 0)
252		len += 3;		/* to account for "/64" in the path */
253	else if (strncmp(up->u_psargs, BRAND_NATIVE_LINKER32 " ", len) != 0)
254		return (0);
255
256	args_new = strdup(rm_e_options(&up->u_psargs[len]));
257	if ((p = strchr(args_new, ' ')) != NULL)
258		*p = '\0';
259	if ((comm_new = strrchr(args_new, '/')) != NULL)
260		comm_new = strdup(comm_new + 1);
261	else
262		comm_new = strdup(args_new);
263	if (p != NULL)
264		*p = ' ';
265
266	if ((strlen(args_new) != 0) && (strlen(comm_new) != 0)) {
267		mutex_enter(&curproc->p_lock);
268		(void) strlcpy(up->u_comm, comm_new, MAXCOMLEN+1);
269		(void) strlcpy(up->u_psargs, args_new, PSARGSZ);
270		mutex_exit(&curproc->p_lock);
271	}
272
273	strfree(args_new);
274	strfree(comm_new);
275	return (0);
276}
277
278/*ARGSUSED*/
279int
280s10_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
281    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6)
282{
283	proc_t	*p = curproc;
284	int	res;
285
286	*rval = 0;
287
288	if (cmd == B_S10_NATIVE)
289		return (s10_native());
290
291	res = brand_solaris_cmd(cmd, arg1, arg2, arg3, &s10_brand, S10_VERSION);
292	if (res >= 0)
293		return (res);
294
295	switch ((cmd)) {
296	case B_S10_PIDINFO:
297		/*
298		 * The s10 brand needs to be able to get the pid of the
299		 * current process and the pid of the zone's init, and it
300		 * needs to do this on every process startup.  Early in
301		 * brand startup, we can't call getpid() because calls to
302		 * getpid() represent a magical signal to some old-skool
303		 * debuggers.  By merging all of this into one call, we
304		 * make this quite a bit cheaper and easier to handle in
305		 * the brand module.
306		 */
307		if (copyout(&p->p_pid, (void *)arg1, sizeof (pid_t)) != 0)
308			return (EFAULT);
309		if (copyout(&p->p_zone->zone_proc_initpid, (void *)arg2,
310		    sizeof (pid_t)) != 0)
311			return (EFAULT);
312		return (0);
313
314	case B_S10_ISFDXATTRDIR: {
315		/*
316		 * This subcommand enables the userland brand emulation library
317		 * to determine whether a file descriptor refers to an extended
318		 * file attributes directory.  There is no standard syscall or
319		 * libc function that can make such a determination.
320		 */
321		file_t *dir_filep;
322
323		dir_filep = getf((int)arg1);
324		if (dir_filep == NULL)
325			return (EBADF);
326		ASSERT(dir_filep->f_vnode != NULL);
327		*rval = IS_XATTRDIR(dir_filep->f_vnode);
328		releasef((int)arg1);
329		return (0);
330	}
331
332#ifdef	__amd64
333	case B_S10_FSREGCORRECTION:
334		/*
335		 * This subcommand exists so that the SYS_lwp_private and
336		 * SYS_lwp_create syscalls can manually set the current thread's
337		 * %fs register to the legacy S10 selector value for 64-bit x86
338		 * processes.
339		 */
340		s10_amd64_correct_fsreg(ttolwp(curthread));
341		return (0);
342#endif	/* __amd64 */
343	}
344
345	return (EINVAL);
346}
347
348void
349s10_copy_procdata(proc_t *child, proc_t *parent)
350{
351	brand_solaris_copy_procdata(child, parent, &s10_brand);
352}
353
354void
355s10_proc_exit(struct proc *p, klwp_t *l)
356{
357	brand_solaris_proc_exit(p, l, &s10_brand);
358}
359
360void
361s10_exec()
362{
363	brand_solaris_exec(&s10_brand);
364}
365
366int
367s10_initlwp(klwp_t *l)
368{
369	return (brand_solaris_initlwp(l, &s10_brand));
370}
371
372void
373s10_forklwp(klwp_t *p, klwp_t *c)
374{
375	brand_solaris_forklwp(p, c, &s10_brand);
376
377#ifdef	__amd64
378	/*
379	 * Only correct the child's %fs register if the parent's %fs register
380	 * is LWPFS_SEL.  If the parent's %fs register is zero, then the Solaris
381	 * 10 environment that we're emulating uses a version of libc that
382	 * works when %fs is zero (i.e., it contains backports of CRs 6467491
383	 * and 6501650).
384	 */
385	if (p->lwp_pcb.pcb_fs == LWPFS_SEL)
386		s10_amd64_correct_fsreg(c);
387#endif	/* __amd64 */
388}
389
390void
391s10_freelwp(klwp_t *l)
392{
393	brand_solaris_freelwp(l, &s10_brand);
394}
395
396void
397s10_lwpexit(klwp_t *l)
398{
399	brand_solaris_lwpexit(l, &s10_brand);
400}
401
402void
403s10_free_brand_data(zone_t *zone)
404{
405	kmem_free(zone->zone_brand_data, sizeof (s10_zone_data_t));
406}
407
408void
409s10_init_brand_data(zone_t *zone)
410{
411	ASSERT(zone->zone_brand == &s10_brand);
412	ASSERT(zone->zone_brand_data == NULL);
413	zone->zone_brand_data = kmem_zalloc(sizeof (s10_zone_data_t), KM_SLEEP);
414}
415
416int
417s10_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
418	int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
419	int brand_action)
420{
421	return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz,
422	    setid, exec_file, cred, brand_action, &s10_brand, S10_BRANDNAME,
423	    S10_LIB, S10_LIB32, S10_LINKER, S10_LINKER32));
424}
425
426void
427s10_sigset_native_to_s10(sigset_t *set)
428{
429	int nativesig;
430	int s10sig;
431	sigset_t s10set;
432
433	/*
434	 * Shortcut: we know the first 32 signals are the same in both
435	 * s10 and native Solaris.  Just assign the first word.
436	 */
437	s10set.__sigbits[0] = set->__sigbits[0];
438	s10set.__sigbits[1] = 0;
439	s10set.__sigbits[2] = 0;
440	s10set.__sigbits[3] = 0;
441
442	/*
443	 * Copy the remainder of the initial set of common signals.
444	 */
445	for (nativesig = 33; nativesig < S10_SIGRTMIN; nativesig++)
446		if (sigismember(set, nativesig))
447			sigaddset(&s10set, nativesig);
448
449	/*
450	 * Convert any native RT signals to their S10 values.
451	 */
452	for (nativesig = _SIGRTMIN, s10sig = S10_SIGRTMIN;
453	    nativesig <= _SIGRTMAX && s10sig <= S10_SIGRTMAX;
454	    nativesig++, s10sig++) {
455		if (sigismember(set, nativesig))
456			sigaddset(&s10set, s10sig);
457	}
458
459	*set = s10set;
460}
461
462void
463s10_sigset_s10_to_native(sigset_t *set)
464{
465	int s10sig;
466	int nativesig;
467	sigset_t nativeset;
468
469	/*
470	 * Shortcut: we know the first 32 signals are the same in both
471	 * s10 and native Solaris.  Just assign the first word.
472	 */
473	nativeset.__sigbits[0] = set->__sigbits[0];
474	nativeset.__sigbits[1] = 0;
475	nativeset.__sigbits[2] = 0;
476	nativeset.__sigbits[3] = 0;
477
478	/*
479	 * Copy the remainder of the initial set of common signals.
480	 */
481	for (s10sig = 33; s10sig < S10_SIGRTMIN; s10sig++)
482		if (sigismember(set, s10sig))
483			sigaddset(&nativeset, s10sig);
484
485	/*
486	 * Convert any S10 RT signals to their native values.
487	 */
488	for (s10sig = S10_SIGRTMIN, nativesig = _SIGRTMIN;
489	    s10sig <= S10_SIGRTMAX && nativesig <= _SIGRTMAX;
490	    s10sig++, nativesig++) {
491		if (sigismember(set, s10sig))
492			sigaddset(&nativeset, nativesig);
493	}
494
495	*set = nativeset;
496}
497
498int
499_init(void)
500{
501	int err;
502
503	/*
504	 * Set up the table indicating which system calls we want to
505	 * interpose on.  We should probably build this automatically from
506	 * a list of system calls that is shared with the user-space
507	 * library.
508	 */
509	s10_emulation_table = kmem_zalloc(NSYSCALL, KM_SLEEP);
510	s10_emulation_table[S10_SYS_forkall] = 1;		/*   2 */
511	s10_emulation_table[S10_SYS_open] = 1;			/*   5 */
512	s10_emulation_table[S10_SYS_wait] = 1;			/*   7 */
513	s10_emulation_table[S10_SYS_creat] = 1;			/*   8 */
514	s10_emulation_table[S10_SYS_unlink] = 1;		/*  10 */
515	s10_emulation_table[S10_SYS_exec] = 1;			/*  11 */
516	s10_emulation_table[S10_SYS_chown] = 1;			/*  16 */
517	s10_emulation_table[S10_SYS_stat] = 1;			/*  18 */
518	s10_emulation_table[S10_SYS_umount] = 1;		/*  22 */
519	s10_emulation_table[S10_SYS_fstat] = 1;			/*  28 */
520	s10_emulation_table[S10_SYS_utime] = 1;			/*  30 */
521	s10_emulation_table[S10_SYS_access] = 1;		/*  33 */
522	s10_emulation_table[SYS_kill] = 1;			/*  37 */
523	s10_emulation_table[S10_SYS_dup] = 1;			/*  41 */
524	s10_emulation_table[SYS_ioctl] = 1;			/*  54 */
525	s10_emulation_table[SYS_execve] = 1;			/*  59 */
526	s10_emulation_table[SYS_acctctl] = 1;			/*  71 */
527	s10_emulation_table[S10_SYS_issetugid] = 1;		/*  75 */
528	s10_emulation_table[S10_SYS_fsat] = 1;			/*  76 */
529	s10_emulation_table[S10_SYS_rmdir] = 1;			/*  79 */
530	s10_emulation_table[SYS_getdents] = 1;			/*  81 */
531	s10_emulation_table[S10_SYS_poll] = 1;			/*  87 */
532	s10_emulation_table[S10_SYS_lstat] = 1;			/*  88 */
533	s10_emulation_table[S10_SYS_fchown] = 1;		/*  94 */
534	s10_emulation_table[SYS_sigprocmask] = 1;		/*  95 */
535	s10_emulation_table[SYS_sigsuspend] = 1;		/*  96 */
536	s10_emulation_table[SYS_sigaction] = 1;			/*  98 */
537	s10_emulation_table[SYS_sigpending] = 1;		/*  99 */
538	s10_emulation_table[SYS_waitid] = 1;			/* 107 */
539	s10_emulation_table[SYS_sigsendsys] = 1;		/* 108 */
540#if defined(__x86)
541	s10_emulation_table[S10_SYS_xstat] = 1;			/* 123 */
542	s10_emulation_table[S10_SYS_lxstat] = 1;		/* 124 */
543	s10_emulation_table[S10_SYS_fxstat] = 1;		/* 125 */
544	s10_emulation_table[S10_SYS_xmknod] = 1;		/* 126 */
545#endif
546	s10_emulation_table[S10_SYS_lchown] = 1;		/* 130 */
547	s10_emulation_table[S10_SYS_rename] = 1;		/* 134 */
548	s10_emulation_table[SYS_uname] = 1;			/* 135 */
549	s10_emulation_table[SYS_sysconfig] = 1;			/* 137 */
550	s10_emulation_table[SYS_systeminfo] = 1;		/* 139 */
551	s10_emulation_table[S10_SYS_fork1] = 1;			/* 143 */
552	s10_emulation_table[SYS_sigtimedwait] = 1;		/* 144 */
553	s10_emulation_table[S10_SYS_lwp_sema_wait] = 1;		/* 147 */
554	s10_emulation_table[S10_SYS_utimes] = 1;		/* 154 */
555	s10_emulation_table[SYS_lwp_create] = 1;		/* 159 */
556	s10_emulation_table[SYS_lwp_kill] = 1;			/* 163 */
557	s10_emulation_table[SYS_lwp_sigmask] = 1;		/* 165 */
558#if defined(__amd64)
559	s10_emulation_table[SYS_lwp_private] = 1;		/* 166 */
560#endif	/* __amd64 */
561	s10_emulation_table[S10_SYS_lwp_mutex_lock] = 1;	/* 169 */
562	s10_emulation_table[SYS_pwrite] = 1;			/* 174 */
563	s10_emulation_table[SYS_acl] = 1;			/* 185 */
564	s10_emulation_table[SYS_auditsys] = 1;			/* 186 */
565	s10_emulation_table[SYS_sigqueue] = 1;			/* 190 */
566	s10_emulation_table[SYS_facl] = 1;			/* 200 */
567	s10_emulation_table[SYS_signotify] = 1;			/* 205 */
568	s10_emulation_table[SYS_lwp_mutex_timedlock] = 1;	/* 210 */
569	s10_emulation_table[SYS_getdents64] = 1;		/* 213 */
570	s10_emulation_table[S10_SYS_stat64] = 1;		/* 215 */
571	s10_emulation_table[S10_SYS_lstat64] = 1;		/* 216 */
572	s10_emulation_table[S10_SYS_fstat64] = 1;		/* 217 */
573	s10_emulation_table[SYS_pwrite64] = 1;			/* 223 */
574	s10_emulation_table[S10_SYS_creat64] = 1;		/* 224 */
575	s10_emulation_table[S10_SYS_open64] = 1;		/* 225 */
576	s10_emulation_table[SYS_zone] = 1;			/* 227 */
577	s10_emulation_table[SYS_lwp_mutex_trylock] = 1;		/* 251 */
578
579	err = mod_install(&modlinkage);
580	if (err) {
581		cmn_err(CE_WARN, "Couldn't install brand module");
582		kmem_free(s10_emulation_table, NSYSCALL);
583	}
584
585	return (err);
586}
587
588int
589_info(struct modinfo *modinfop)
590{
591	return (mod_info(&modlinkage, modinfop));
592}
593
594int
595_fini(void)
596{
597	return (brand_solaris_fini(&s10_emulation_table, &modlinkage,
598	    &s10_brand));
599}
600