s10_brand.c revision 11315:92ff2a8d2f86
1226586Sdim/*
2226586Sdim * CDDL HEADER START
3226586Sdim *
4226586Sdim * The contents of this file are subject to the terms of the
5226586Sdim * Common Development and Distribution License (the "License").
6226586Sdim * You may not use this file except in compliance with the License.
7226586Sdim *
8226586Sdim * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9226586Sdim * or http://www.opensolaris.org/os/licensing.
10226586Sdim * See the License for the specific language governing permissions
11226586Sdim * and limitations under the License.
12226586Sdim *
13226586Sdim * When distributing Covered Code, include this CDDL HEADER in each
14239462Sdim * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15243830Sdim * If applicable, add the following below this CDDL HEADER, with the
16226586Sdim * fields enclosed by brackets "[]" replaced with your own identifying
17243830Sdim * information: Portions Copyright [yyyy] [name of copyright owner]
18239462Sdim *
19239462Sdim * CDDL HEADER END
20226586Sdim */
21226586Sdim/*
22226586Sdim * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23226586Sdim * Use is subject to license terms.
24226586Sdim */
25239462Sdim
26226586Sdim#include <sys/errno.h>
27226586Sdim#include <sys/exec.h>
28226586Sdim#include <sys/file.h>
29226586Sdim#include <sys/kmem.h>
30226586Sdim#include <sys/modctl.h>
31226586Sdim#include <sys/model.h>
32226586Sdim#include <sys/proc.h>
33226586Sdim#include <sys/syscall.h>
34226586Sdim#include <sys/systm.h>
35226586Sdim#include <sys/thread.h>
36243830Sdim#include <sys/cmn_err.h>
37243830Sdim#include <sys/archsystm.h>
38243830Sdim#include <sys/pathname.h>
39226586Sdim#include <sys/sunddi.h>
40226586Sdim
41226586Sdim#include <sys/machbrand.h>
42226586Sdim#include <sys/brand.h>
43226586Sdim#include "s10_brand.h"
44226586Sdim
/*
 * Table indexed by system call number, allocated (NSYSCALL bytes) in _init()
 * and released in _fini().  A nonzero entry marks a system call that the
 * brand wants to interpose on.
 */
char *s10_emulation_table = NULL;
46239462Sdim
/*
 * Forward declarations of the brand operations implemented in this file.
 * They are installed, by position, in the s10_brops operations vector.
 */
void	s10_init_brand_data(zone_t *);
void	s10_free_brand_data(zone_t *);
void	s10_setbrand(proc_t *);
int	s10_getattr(zone_t *, int, void *, size_t *);
int	s10_setattr(zone_t *, int, void *, size_t);
int	s10_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
		uintptr_t, uintptr_t, uintptr_t);
void	s10_copy_procdata(proc_t *, proc_t *);
void	s10_proc_exit(struct proc *, klwp_t *);
void	s10_exec();
int	s10_initlwp(klwp_t *);
void	s10_forklwp(klwp_t *, klwp_t *);
void	s10_freelwp(klwp_t *);
void	s10_lwpexit(klwp_t *);
int	s10_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
	long *, int, caddr_t, cred_t *, int);
63239462Sdim
/*
 * s10 brand operations vector.
 *
 * The initializer is positional, so the entries must stay in the member
 * order of struct brand_ops.  Every entry except lwp_setrval is implemented
 * in this file.
 */
struct brand_ops s10_brops = {
	s10_init_brand_data,
	s10_free_brand_data,
	s10_brandsys,
	s10_setbrand,
	s10_getattr,
	s10_setattr,
	s10_copy_procdata,
	s10_proc_exit,
	s10_exec,
	lwp_setrval,
	s10_initlwp,
	s10_forklwp,
	s10_freelwp,
	s10_lwpexit,
	s10_elfexec
};
82239462Sdim
/*
 * Machine-dependent brand operations: the low-level system call entry
 * callbacks for each supported platform.  The initializers are positional
 * and their layout differs between sparc, amd64 and 32-bit x86.
 * NOTE(review): NULL slots presumably mean "no callback for this entry
 * path" -- confirm against the struct brand_mach_ops definition.
 */
#ifdef	sparc

struct brand_mach_ops s10_mops = {
	s10_brand_syscall_callback,
	s10_brand_syscall32_callback
};

#else	/* sparc */

#ifdef	__amd64

struct brand_mach_ops s10_mops = {
	s10_brand_sysenter_callback,
	NULL,
	s10_brand_int91_callback,
	s10_brand_syscall_callback,
	s10_brand_syscall32_callback,
	NULL
};

#else	/* ! __amd64 */

struct brand_mach_ops s10_mops = {
	s10_brand_sysenter_callback,
	NULL,
	NULL,
	s10_brand_syscall_callback,
	NULL,
	NULL
};
#endif	/* __amd64 */

#endif	/* sparc */
116226586Sdim
/*
 * The brand definition itself: interface version, brand name, and pointers
 * to the generic and machine-dependent operations vectors defined above.
 */
struct brand	s10_brand = {
	BRAND_VER_1,
	"solaris10",
	&s10_brops,
	&s10_mops
};
123226586Sdim
/* Module linkage element describing this module as a brand module. */
static struct modlbrand modlbrand = {
	&mod_brandops,		/* type of module */
	"Solaris 10 Brand",	/* description of module */
	&s10_brand		/* brand definition */
};
129243830Sdim
/*
 * Module linkage handed to mod_install(), mod_info() and mod_remove() by
 * _init(), _info() and _fini() respectively.
 */
static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlbrand, NULL
};
133243830Sdim
134243830Sdimvoid
135243830Sdims10_setbrand(proc_t *p)
136243830Sdim{
137239462Sdim	ASSERT(p->p_brand == &s10_brand);
138239462Sdim	ASSERT(p->p_brand_data == NULL);
139226586Sdim
140239462Sdim	/*
141239462Sdim	 * We should only be called from exec(), when we know the process
142239462Sdim	 * is single-threaded.
143239462Sdim	 */
144239462Sdim	ASSERT(p->p_tlist == p->p_tlist->t_forw);
145239462Sdim
146239462Sdim	p->p_brand_data = kmem_zalloc(sizeof (s10_proc_data_t), KM_SLEEP);
147239462Sdim	(void) s10_initlwp(p->p_tlist->t_lwp);
148239462Sdim}
149239462Sdim
150239462Sdim/*ARGSUSED*/
151239462Sdimint
152239462Sdims10_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize)
153239462Sdim{
154239462Sdim	ASSERT(zone->zone_brand == &s10_brand);
155239462Sdim	if (attr == S10_EMUL_BITMAP) {
156239462Sdim		if (buf == NULL || *bufsize != sizeof (s10_emul_bitmap_t))
157239462Sdim			return (EINVAL);
158239462Sdim		if (copyout(((s10_zone_data_t *)zone->zone_brand_data)->
159239462Sdim		    emul_bitmap, buf, sizeof (s10_emul_bitmap_t)) != 0)
160239462Sdim			return (EFAULT);
161239462Sdim		return (0);
162239462Sdim	}
163239462Sdim
164239462Sdim	return (EINVAL);
165239462Sdim}
166239462Sdim
167239462Sdimint
168239462Sdims10_setattr(zone_t *zone, int attr, void *buf, size_t bufsize)
169239462Sdim{
170239462Sdim	ASSERT(zone->zone_brand == &s10_brand);
171239462Sdim	if (attr == S10_EMUL_BITMAP) {
172239462Sdim		if (buf == NULL || bufsize != sizeof (s10_emul_bitmap_t))
173239462Sdim			return (EINVAL);
174239462Sdim		if (copyin(buf, ((s10_zone_data_t *)zone->zone_brand_data)->
175243830Sdim		    emul_bitmap, sizeof (s10_emul_bitmap_t)) != 0)
176243830Sdim			return (EFAULT);
177239462Sdim		return (0);
178239462Sdim	}
179239462Sdim
180239462Sdim	return (EINVAL);
181239462Sdim}
182239462Sdim
183239462Sdim#ifdef	__amd64
184239462Sdim/*
185239462Sdim * The Nevada kernel clears %fs for threads in 64-bit x86 processes but S10's
186239462Sdim * libc expects %fs to be nonzero.  This causes some committed
187239462Sdim * libc/libthread interfaces (e.g., thr_main()) to fail, which impacts several
188239462Sdim * libraries, including libdoor.  This function sets the specified LWP's %fs
189239462Sdim * register to the legacy S10 selector value (LWPFS_SEL).
190239462Sdim *
191239462Sdim * The best solution to the aforementioned problem is backporting CRs
192239462Sdim * 6467491 to Solaris 10 so that 64-bit x86 Solaris 10 processes
193239462Sdim * would accept zero for %fs.  Backporting the CRs is a requirement for running
194239462Sdim * S10 Containers in PV domUs because 64-bit Xen clears %fsbase when %fs is
195243830Sdim * nonzero.  Such behavior breaks 64-bit processes because Xen has to fetch the
196243830Sdim * FS segments' base addresses from the LWPs' GDTs, which are only capable of
197243830Sdim * 32-bit addressing.
198243830Sdim */
199243830Sdim/*ARGSUSED*/
200243830Sdimstatic void
201239462Sdims10_amd64_correct_fsreg(klwp_t *l)
202239462Sdim{
203226586Sdim	if (lwp_getdatamodel(l) == DATAMODEL_NATIVE) {
204226586Sdim		kpreempt_disable();
205226586Sdim		l->lwp_pcb.pcb_fs = LWPFS_SEL;
206234353Sdim		l->lwp_pcb.pcb_rupdate = 1;
207239462Sdim		lwptot(l)->t_post_sys = 1;	/* Guarantee update_sregs() */
208243830Sdim		kpreempt_enable();
209243830Sdim	}
210239462Sdim}
211243830Sdim#endif	/* __amd64 */
212239462Sdim
213243830Sdimint
214226586Sdims10_native()
215243830Sdim{
216243830Sdim	struct user	*up = PTOU(curproc);
217243830Sdim	char		*args_new, *comm_new, *p;
218243830Sdim	int		len;
219226586Sdim
220243830Sdim	len = sizeof (S10_NATIVE_LINKER32 " ") - 1;
221243830Sdim
222243830Sdim	/*
223243830Sdim	 * Make sure that the process' interpreter is the native dynamic linker.
224243830Sdim	 * Convention dictates that native processes executing within solaris10-
225243830Sdim	 * branded zones are interpreted by the native dynamic linker (the
226243830Sdim	 * process and its arguments are specified as arguments to the dynamic
227243830Sdim	 * linker).  If this convention is violated (i.e.,
228243830Sdim	 * brandsys(B_S10_NATIVE, ...) is invoked by a process that shouldn't be
229243830Sdim	 * native), then do nothing and silently indicate success.
230243830Sdim	 */
231243830Sdim	if (strcmp(up->u_comm, S10_LINKER_NAME) != 0)
232243830Sdim		return (0);
233243830Sdim	if (strncmp(up->u_psargs, S10_NATIVE_LINKER64 " /", len + 4) == 0)
234243830Sdim		len += 3;		/* to account for "/64" in the path */
235243830Sdim	else if (strncmp(up->u_psargs, S10_NATIVE_LINKER32 " /", len + 1) != 0)
236243830Sdim		return (0);
237243830Sdim
238243830Sdim	args_new = strdup(&up->u_psargs[len]);
239226586Sdim	if ((p = strchr(args_new, ' ')) != NULL)
240239462Sdim		*p = '\0';
241226586Sdim	if ((comm_new = strrchr(args_new, '/')) != NULL)
242226586Sdim		comm_new = strdup(comm_new + 1);
243226586Sdim	else
244226586Sdim		comm_new = strdup(args_new);
245226586Sdim	if (p != NULL)
246226586Sdim		*p = ' ';
247226586Sdim
248226586Sdim	if ((strlen(args_new) != 0) && (strlen(comm_new) != 0)) {
249226586Sdim		mutex_enter(&curproc->p_lock);
250226586Sdim		(void) strlcpy(up->u_comm, comm_new, MAXCOMLEN+1);
251243830Sdim		(void) strlcpy(up->u_psargs, args_new, PSARGSZ);
252243830Sdim		mutex_exit(&curproc->p_lock);
253243830Sdim	}
254239462Sdim
255243830Sdim	strfree(args_new);
256243830Sdim	strfree(comm_new);
257243830Sdim	return (0);
258243830Sdim}
259243830Sdim
260243830Sdim/*
261243830Sdim * Get the address of the user-space system call handler from the user
262243830Sdim * process and attach it to the proc structure.
263243830Sdim */
264243830Sdim/*ARGSUSED*/
265239462Sdimint
266239462Sdims10_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
267226586Sdim    uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6)
268226586Sdim{
269226586Sdim	s10_proc_data_t	*spd;
270226586Sdim	s10_brand_reg_t	reg;
271226586Sdim	proc_t		*p = curproc;
272226586Sdim	int		err;
273226586Sdim
274	*rval = 0;
275
276	/*
277	 * B_EXEC_BRAND is redundant
278	 * since the kernel assumes a native process doing an exec
279	 * in a branded zone is going to run a branded processes.
280	 * hence we don't support this operation.
281	 */
282	if (cmd == B_EXEC_BRAND)
283		return (ENOSYS);
284
285	if (cmd == B_S10_NATIVE)
286		return (s10_native());
287
288	/* For all other operations this must be a branded process. */
289	if (p->p_brand == &native_brand)
290		return (ENOSYS);
291
292	ASSERT(p->p_brand == &s10_brand);
293	ASSERT(p->p_brand_data != NULL);
294
295	spd = (s10_proc_data_t *)p->p_brand_data;
296
297	switch (cmd) {
298	case B_EXEC_NATIVE:
299		err = exec_common(
300		    (char *)arg1, (const char **)arg2, (const char **)arg3,
301		    EBA_NATIVE);
302		return (err);
303
304	case B_REGISTER:
305		if (p->p_model == DATAMODEL_NATIVE) {
306			if (copyin((void *)arg1, &reg, sizeof (reg)) != 0)
307				return (EFAULT);
308#if defined(_LP64)
309		} else {
310			s10_brand_reg32_t reg32;
311
312			if (copyin((void *)arg1, &reg32, sizeof (reg32)) != 0)
313				return (EFAULT);
314			reg.sbr_version = reg32.sbr_version;
315			reg.sbr_handler = (caddr_t)(uintptr_t)reg32.sbr_handler;
316#endif /* _LP64 */
317		}
318
319		if (reg.sbr_version != S10_VERSION)
320			return (ENOTSUP);
321		spd->spd_handler = reg.sbr_handler;
322		return (0);
323
324	case B_ELFDATA:
325		if (p->p_model == DATAMODEL_NATIVE) {
326			if (copyout(&spd->spd_elf_data, (void *)arg1,
327			    sizeof (s10_elf_data_t)) != 0)
328				return (EFAULT);
329#if defined(_LP64)
330		} else {
331			s10_elf_data32_t sed32;
332
333			sed32.sed_phdr = spd->spd_elf_data.sed_phdr;
334			sed32.sed_phent = spd->spd_elf_data.sed_phent;
335			sed32.sed_phnum = spd->spd_elf_data.sed_phnum;
336			sed32.sed_entry = spd->spd_elf_data.sed_entry;
337			sed32.sed_base = spd->spd_elf_data.sed_base;
338			sed32.sed_ldentry = spd->spd_elf_data.sed_ldentry;
339			sed32.sed_lddata = spd->spd_elf_data.sed_lddata;
340			if (copyout(&sed32, (void *)arg1, sizeof (sed32)) != 0)
341				return (EFAULT);
342#endif /* _LP64 */
343		}
344		return (0);
345
346	case B_S10_PIDINFO:
347		/*
348		 * The s10 brand needs to be able to get the pid of the
349		 * current process and the pid of the zone's init, and it
350		 * needs to do this on every process startup.  Early in
351		 * brand startup, we can't call getpid() because calls to
352		 * getpid() represent a magical signal to some old-skool
353		 * debuggers.  By merging all of this into one call, we
354		 * make this quite a bit cheaper and easier to handle in
355		 * the brand module.
356		 */
357		if (copyout(&p->p_pid, (void *)arg1, sizeof (pid_t)) != 0)
358			return (EFAULT);
359		if (copyout(&p->p_zone->zone_proc_initpid, (void *)arg2,
360		    sizeof (pid_t)) != 0)
361			return (EFAULT);
362		return (0);
363
364	case B_S10_TRUSS_POINT:
365		/*
366		 * This subcommand exists so that we can see truss output
367		 * from interposed system calls that return without first
368		 * calling any other system call, meaning they would be
369		 * invisible to truss(1).
370		 *
371		 * If the second argument is set non-zero, set errno to that
372		 * value as well.
373		 *
374		 * Arguments are:
375		 *
376		 *    arg1: syscall number
377		 *    arg2: errno
378		 */
379		return ((arg2 == 0) ? 0 : set_errno((uint_t)arg2));
380
381	case B_S10_ISFDXATTRDIR: {
382		/*
383		 * This subcommand enables the userland brand emulation library
384		 * to determine whether a file descriptor refers to an extended
385		 * file attributes directory.  There is no standard syscall or
386		 * libc function that can make such a determination.
387		 */
388		file_t *dir_filep;
389
390		dir_filep = getf((int)arg1);
391		if (dir_filep == NULL)
392			return (EBADF);
393		ASSERT(dir_filep->f_vnode != NULL);
394		*rval = IS_XATTRDIR(dir_filep->f_vnode);
395		releasef((int)arg1);
396		return (0);
397	}
398
399#ifdef	__amd64
400	case B_S10_FSREGCORRECTION:
401		/*
402		 * This subcommand exists so that the SYS_lwp_private and
403		 * SYS_lwp_create syscalls can manually set the current thread's
404		 * %fs register to the legacy S10 selector value for 64-bit x86
405		 * processes.
406		 */
407		s10_amd64_correct_fsreg(ttolwp(curthread));
408		return (0);
409#endif	/* __amd64 */
410	}
411
412	return (EINVAL);
413}
414
415/*
416 * Copy the per-process brand data from a parent proc to a child.
417 */
418void
419s10_copy_procdata(proc_t *child, proc_t *parent)
420{
421	s10_proc_data_t	*spd;
422
423	ASSERT(parent->p_brand == &s10_brand);
424	ASSERT(child->p_brand == &s10_brand);
425	ASSERT(parent->p_brand_data != NULL);
426	ASSERT(child->p_brand_data == NULL);
427
428	/* Just duplicate all the proc data of the parent for the child */
429	spd = kmem_alloc(sizeof (s10_proc_data_t), KM_SLEEP);
430	bcopy(parent->p_brand_data, spd, sizeof (s10_proc_data_t));
431	child->p_brand_data = spd;
432}
433
/*
 * Release all s10 brand state when a branded process exits: first the brand
 * state of the current LWP, then the per-process brand data.
 *
 * The 'l' argument is not used; the LWP of the current thread is released
 * instead.
 */
/*ARGSUSED*/
void
s10_proc_exit(struct proc *p, klwp_t *l)
{
	ASSERT(p->p_brand == &s10_brand);
	ASSERT(p->p_brand_data != NULL);

	/*
	 * We should only be called from proc_exit(), when we know that
	 * process is single-threaded.
	 */
	ASSERT(p->p_tlist == p->p_tlist->t_forw);

	/* upon exit, free our lwp brand data */
	(void) s10_freelwp(ttolwp(curthread));

	/* upon exit, free our proc brand data */
	kmem_free(p->p_brand_data, sizeof (s10_proc_data_t));
	/* Clear the pointer so nothing can reference the freed data. */
	p->p_brand_data = NULL;
}
454
455void
456s10_exec()
457{
458	s10_proc_data_t	*spd = curproc->p_brand_data;
459
460	ASSERT(curproc->p_brand == &s10_brand);
461	ASSERT(curproc->p_brand_data != NULL);
462	ASSERT(ttolwp(curthread)->lwp_brand != NULL);
463
464	/*
465	 * We should only be called from exec(), when we know the process
466	 * is single-threaded.
467	 */
468	ASSERT(curproc->p_tlist == curproc->p_tlist->t_forw);
469
470	/* Upon exec, reset our lwp brand data. */
471	(void) s10_freelwp(ttolwp(curthread));
472	(void) s10_initlwp(ttolwp(curthread));
473
474	/*
475	 * Upon exec, reset all the proc brand data, except for the elf
476	 * data associated with the executable we are exec'ing.
477	 */
478	spd->spd_handler = NULL;
479}
480
/*
 * Initialize the brand state of an LWP.
 *
 * No per-LWP structure is allocated here; lwp_brand is simply set to a
 * non-NULL marker value (-1) so that the other LWP routines can assert that
 * initialization has happened.  The LWP must not already be initialized.
 *
 * Always returns 0.
 */
/*ARGSUSED*/
int
s10_initlwp(klwp_t *l)
{
	ASSERT(l->lwp_procp->p_brand == &s10_brand);
	ASSERT(l->lwp_procp->p_brand_data != NULL);
	ASSERT(l->lwp_brand == NULL);
	l->lwp_brand = (void *)-1;
	return (0);
}
491
/*
 * Brand hook for forking an LWP: 'p' is the parent LWP and 'c' the child.
 *
 * Nothing is done on most platforms.  On amd64 the child's %fs register is
 * corrected when the parent is using the legacy S10 selector.
 */
/*ARGSUSED*/
void
s10_forklwp(klwp_t *p, klwp_t *c)
{
	ASSERT(p->lwp_procp->p_brand == &s10_brand);
	ASSERT(c->lwp_procp->p_brand == &s10_brand);

	ASSERT(p->lwp_procp->p_brand_data != NULL);
	ASSERT(c->lwp_procp->p_brand_data != NULL);

	/* Both LWPs have already been initialized via s10_initlwp() */
	ASSERT(p->lwp_brand != NULL);
	ASSERT(c->lwp_brand != NULL);

#ifdef	__amd64
	/*
	 * Only correct the child's %fs register if the parent's %fs register
	 * is LWPFS_SEL.  If the parent's %fs register is zero, then the Solaris
	 * 10 environment that we're emulating uses a version of libc that
	 * works when %fs is zero (i.e., it contains backports of CRs 6467491
	 * and 6501650).
	 */
	if (p->lwp_pcb.pcb_fs == LWPFS_SEL)
		s10_amd64_correct_fsreg(c);
#endif	/* __amd64 */
}
518
/*
 * Tear down the brand state of an LWP by clearing its lwp_brand marker.
 * The LWP must previously have been initialized (lwp_brand != NULL).
 */
/*ARGSUSED*/
void
s10_freelwp(klwp_t *l)
{
	ASSERT(l->lwp_procp->p_brand == &s10_brand);
	ASSERT(l->lwp_procp->p_brand_data != NULL);
	ASSERT(l->lwp_brand != NULL);
	l->lwp_brand = NULL;
}
528
/*
 * Brand hook for an LWP exiting from a multi-threaded process: clears the
 * LWP's lwp_brand marker.
 */
/*ARGSUSED*/
void
s10_lwpexit(klwp_t *l)
{
	ASSERT(l->lwp_procp->p_brand == &s10_brand);
	ASSERT(l->lwp_procp->p_brand_data != NULL);
	ASSERT(l->lwp_brand != NULL);

	/*
	 * We should never be called for the last thread in a process.
	 * (That case is handled by s10_proc_exit().)  Therefore this lwp
	 * must be exiting from a multi-threaded process.
	 */
	ASSERT(l->lwp_procp->p_tlist != l->lwp_procp->p_tlist->t_forw);

	l->lwp_brand = NULL;
}
546
547void
548s10_free_brand_data(zone_t *zone)
549{
550	kmem_free(zone->zone_brand_data, sizeof (s10_zone_data_t));
551}
552
/*
 * Allocate the zeroed per-zone brand data (s10_zone_data_t) for a zone of
 * this brand.  The zone must not already have brand data attached.
 */
void
s10_init_brand_data(zone_t *zone)
{
	ASSERT(zone->zone_brand == &s10_brand);
	ASSERT(zone->zone_brand_data == NULL);
	zone->zone_brand_data = kmem_zalloc(sizeof (s10_zone_data_t), KM_SLEEP);
}
560
#if defined(_LP64)
/*
 * Convert a 32-bit ELF header into the kernel's native Ehdr form.
 *
 * The identification bytes are copied verbatim and every other field is
 * assigned individually, so narrower 32-bit fields are widened by ordinary
 * integer conversion.  Only built for LP64 kernels.
 */
static void
Ehdr32to64(Elf32_Ehdr *src, Ehdr *dst)
{
	bcopy(src->e_ident, dst->e_ident, sizeof (src->e_ident));
	dst->e_type =		src->e_type;
	dst->e_machine =	src->e_machine;
	dst->e_version =	src->e_version;
	dst->e_entry =		src->e_entry;
	dst->e_phoff =		src->e_phoff;
	dst->e_shoff =		src->e_shoff;
	dst->e_flags =		src->e_flags;
	dst->e_ehsize =		src->e_ehsize;
	dst->e_phentsize =	src->e_phentsize;
	dst->e_phnum =		src->e_phnum;
	dst->e_shentsize =	src->e_shentsize;
	dst->e_shnum =		src->e_shnum;
	dst->e_shstrndx =	src->e_shstrndx;
}
#endif /* _LP64 */
581
582int
583s10_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
584	int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
585	int brand_action)
586{
587	vnode_t		*nvp;
588	Ehdr		ehdr;
589	Addr		uphdr_vaddr;
590	intptr_t	voffset;
591	int		interp;
592	int		i, err;
593	struct execenv	env;
594	struct user	*up = PTOU(curproc);
595	s10_proc_data_t	*spd;
596	s10_elf_data_t	sed, *sedp;
597	char		*linker;
598	uintptr_t	lddata; /* lddata of executable's linker */
599
600	ASSERT(curproc->p_brand == &s10_brand);
601	ASSERT(curproc->p_brand_data != NULL);
602
603	spd = (s10_proc_data_t *)curproc->p_brand_data;
604	sedp = &spd->spd_elf_data;
605
606	args->brandname = S10_BRANDNAME;
607
608	/*
609	 * We will exec the brand library and then map in the target
610	 * application and (optionally) the brand's default linker.
611	 */
612	if (args->to_model == DATAMODEL_NATIVE) {
613		args->emulator = S10_LIB;
614		linker = S10_LINKER;
615#if defined(_LP64)
616	} else {
617		args->emulator = S10_LIB32;
618		linker = S10_LINKER32;
619#endif /* _LP64 */
620	}
621
622	if ((err = lookupname(args->emulator, UIO_SYSSPACE, FOLLOW, NULLVPP,
623	    &nvp)) != 0) {
624		uprintf("%s: not found.", args->emulator);
625		return (err);
626	}
627
628	if (args->to_model == DATAMODEL_NATIVE) {
629		err = elfexec(nvp, uap, args, idatap, level + 1, execsz,
630		    setid, exec_file, cred, brand_action);
631#if defined(_LP64)
632	} else {
633		err = elf32exec(nvp, uap, args, idatap, level + 1, execsz,
634		    setid, exec_file, cred, brand_action);
635#endif /* _LP64 */
636	}
637	VN_RELE(nvp);
638	if (err != 0)
639		return (err);
640
641	/*
642	 * The u_auxv vectors are set up by elfexec to point to the brand
643	 * emulation library and linker.  Save these so they can be copied to
644	 * the specific brand aux vectors.
645	 */
646	bzero(&sed, sizeof (sed));
647	for (i = 0; i < __KERN_NAUXV_IMPL; i++) {
648		switch (up->u_auxv[i].a_type) {
649		case AT_SUN_LDDATA:
650			sed.sed_lddata = up->u_auxv[i].a_un.a_val;
651			break;
652		case AT_BASE:
653			sed.sed_base = up->u_auxv[i].a_un.a_val;
654			break;
655		case AT_ENTRY:
656			sed.sed_entry = up->u_auxv[i].a_un.a_val;
657			break;
658		case AT_PHDR:
659			sed.sed_phdr = up->u_auxv[i].a_un.a_val;
660			break;
661		case AT_PHENT:
662			sed.sed_phent = up->u_auxv[i].a_un.a_val;
663			break;
664		case AT_PHNUM:
665			sed.sed_phnum = up->u_auxv[i].a_un.a_val;
666			break;
667		default:
668			break;
669		}
670	}
671	/* Make sure the emulator has an entry point */
672	ASSERT(sed.sed_entry != NULL);
673	ASSERT(sed.sed_phdr != NULL);
674
675	bzero(&env, sizeof (env));
676	if (args->to_model == DATAMODEL_NATIVE) {
677		err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, &voffset,
678		    exec_file, &interp, &env.ex_bssbase, &env.ex_brkbase,
679		    &env.ex_brksize, NULL);
680#if defined(_LP64)
681	} else {
682		Elf32_Ehdr ehdr32;
683		Elf32_Addr uphdr_vaddr32;
684		err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32,
685		    &voffset, exec_file, &interp, &env.ex_bssbase,
686		    &env.ex_brkbase, &env.ex_brksize, NULL);
687		Ehdr32to64(&ehdr32, &ehdr);
688		if (uphdr_vaddr32 == (Elf32_Addr)-1)
689			uphdr_vaddr = (Addr)-1;
690		else
691			uphdr_vaddr = uphdr_vaddr32;
692#endif /* _LP64 */
693	}
694	if (err != 0)
695		return (err);
696
697	/*
698	 * Save off the important properties of the executable. The brand
699	 * library will ask us for this data later, when it is initializing
700	 * and getting ready to transfer control to the brand application.
701	 */
702	if (uphdr_vaddr == (Addr)-1)
703		sedp->sed_phdr = voffset + ehdr.e_phoff;
704	else
705		sedp->sed_phdr = voffset + uphdr_vaddr;
706	sedp->sed_entry = voffset + ehdr.e_entry;
707	sedp->sed_phent = ehdr.e_phentsize;
708	sedp->sed_phnum = ehdr.e_phnum;
709
710	if (interp) {
711		if (ehdr.e_type == ET_DYN) {
712			/*
713			 * This is a shared object executable, so we need to
714			 * pick a reasonable place to put the heap. Just don't
715			 * use the first page.
716			 */
717			env.ex_brkbase = (caddr_t)PAGESIZE;
718			env.ex_bssbase = (caddr_t)PAGESIZE;
719		}
720
721		/*
722		 * If the program needs an interpreter (most do), map it in and
723		 * store relevant information about it in the aux vector, where
724		 * the brand library can find it.
725		 */
726		if ((err = lookupname(linker, UIO_SYSSPACE,
727		    FOLLOW, NULLVPP, &nvp)) != 0) {
728			uprintf("%s: not found.", S10_LINKER);
729			return (err);
730		}
731		if (args->to_model == DATAMODEL_NATIVE) {
732			err = mapexec_brand(nvp, args, &ehdr,
733			    &uphdr_vaddr, &voffset, exec_file, &interp,
734			    NULL, NULL, NULL, &lddata);
735#if defined(_LP64)
736		} else {
737			Elf32_Ehdr ehdr32;
738			Elf32_Addr uphdr_vaddr32;
739			err = mapexec32_brand(nvp, args, &ehdr32,
740			    &uphdr_vaddr32, &voffset, exec_file, &interp,
741			    NULL, NULL, NULL, &lddata);
742			Ehdr32to64(&ehdr32, &ehdr);
743			if (uphdr_vaddr32 == (Elf32_Addr)-1)
744				uphdr_vaddr = (Addr)-1;
745			else
746				uphdr_vaddr = uphdr_vaddr32;
747#endif /* _LP64 */
748		}
749		VN_RELE(nvp);
750		if (err != 0)
751			return (err);
752
753		/*
754		 * Now that we know the base address of the brand's linker,
755		 * place it in the aux vector.
756		 */
757		sedp->sed_base = voffset;
758		sedp->sed_ldentry = voffset + ehdr.e_entry;
759		sedp->sed_lddata = voffset + lddata;
760	} else {
761		/*
762		 * This program has no interpreter. The brand library will
763		 * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector,
764		 * so in this case, put the entry point of the main executable
765		 * there.
766		 */
767		if (ehdr.e_type == ET_EXEC) {
768			/*
769			 * An executable with no interpreter, this must be a
770			 * statically linked executable, which means we loaded
771			 * it at the address specified in the elf header, in
772			 * which case the e_entry field of the elf header is an
773			 * absolute address.
774			 */
775			sedp->sed_ldentry = ehdr.e_entry;
776			sedp->sed_entry = ehdr.e_entry;
777			sedp->sed_lddata = NULL;
778			sedp->sed_base = NULL;
779		} else {
780			/*
781			 * A shared object with no interpreter, we use the
782			 * calculated address from above.
783			 */
784			sedp->sed_ldentry = sedp->sed_entry;
785			sedp->sed_entry = NULL;
786			sedp->sed_phdr = NULL;
787			sedp->sed_phent = NULL;
788			sedp->sed_phnum = NULL;
789			sedp->sed_lddata = NULL;
790			sedp->sed_base = voffset;
791
792			if (ehdr.e_type == ET_DYN) {
793				/*
794				 * Delay setting the brkbase until the first
795				 * call to brk(); see elfexec() for details.
796				 */
797				env.ex_bssbase = (caddr_t)0;
798				env.ex_brkbase = (caddr_t)0;
799				env.ex_brksize = 0;
800			}
801		}
802	}
803
804	env.ex_magic = elfmagic;
805	env.ex_vp = vp;
806	setexecenv(&env);
807
808	/*
809	 * It's time to manipulate the process aux vectors.  First
810	 * we need to update the AT_SUN_AUXFLAGS aux vector to set
811	 * the AF_SUN_NOPLM flag.
812	 */
813	if (args->to_model == DATAMODEL_NATIVE) {
814		auxv_t		auxflags_auxv;
815
816		if (copyin(args->auxp_auxflags, &auxflags_auxv,
817		    sizeof (auxflags_auxv)) != 0)
818			return (EFAULT);
819
820		ASSERT(auxflags_auxv.a_type == AT_SUN_AUXFLAGS);
821		auxflags_auxv.a_un.a_val |= AF_SUN_NOPLM;
822		if (copyout(&auxflags_auxv, args->auxp_auxflags,
823		    sizeof (auxflags_auxv)) != 0)
824			return (EFAULT);
825#if defined(_LP64)
826	} else {
827		auxv32_t	auxflags_auxv32;
828
829		if (copyin(args->auxp_auxflags, &auxflags_auxv32,
830		    sizeof (auxflags_auxv32)) != 0)
831			return (EFAULT);
832
833		ASSERT(auxflags_auxv32.a_type == AT_SUN_AUXFLAGS);
834		auxflags_auxv32.a_un.a_val |= AF_SUN_NOPLM;
835		if (copyout(&auxflags_auxv32, args->auxp_auxflags,
836		    sizeof (auxflags_auxv32)) != 0)
837			return (EFAULT);
838#endif /* _LP64 */
839	}
840
841	/* Second, copy out the brand specific aux vectors. */
842	if (args->to_model == DATAMODEL_NATIVE) {
843		auxv_t s10_auxv[] = {
844		    { AT_SUN_BRAND_AUX1, 0 },
845		    { AT_SUN_BRAND_AUX2, 0 },
846		    { AT_SUN_BRAND_AUX3, 0 }
847		};
848
849		ASSERT(s10_auxv[0].a_type == AT_SUN_BRAND_S10_LDDATA);
850		s10_auxv[0].a_un.a_val = sed.sed_lddata;
851
852		if (copyout(&s10_auxv, args->auxp_brand,
853		    sizeof (s10_auxv)) != 0)
854			return (EFAULT);
855#if defined(_LP64)
856	} else {
857		auxv32_t s10_auxv32[] = {
858		    { AT_SUN_BRAND_AUX1, 0 },
859		    { AT_SUN_BRAND_AUX2, 0 },
860		    { AT_SUN_BRAND_AUX3, 0 }
861		};
862
863		ASSERT(s10_auxv32[0].a_type == AT_SUN_BRAND_S10_LDDATA);
864		s10_auxv32[0].a_un.a_val = (uint32_t)sed.sed_lddata;
865		if (copyout(&s10_auxv32, args->auxp_brand,
866		    sizeof (s10_auxv32)) != 0)
867			return (EFAULT);
868#endif /* _LP64 */
869	}
870
871	/*
872	 * Third, the the /proc aux vectors set up by elfexec() point to brand
873	 * emulation library and it's linker.  Copy these to the /proc brand
874	 * specific aux vector, and update the regular /proc aux vectors to
875	 * point to the executable (and it's linker).  This will enable
876	 * debuggers to access the executable via the usual /proc or elf notes
877	 * aux vectors.
878	 *
879	 * The brand emulation library's linker will get it's aux vectors off
880	 * the stack, and then update the stack with the executable's aux
881	 * vectors before jumping to the executable's linker.
882	 *
883	 * Debugging the brand emulation library must be done from
884	 * the global zone, where the librtld_db module knows how to fetch the
885	 * brand specific aux vectors to access the brand emulation libraries
886	 * linker.
887	 */
888	for (i = 0; i < __KERN_NAUXV_IMPL; i++) {
889		ulong_t val;
890
891		switch (up->u_auxv[i].a_type) {
892		case AT_SUN_BRAND_S10_LDDATA:
893			up->u_auxv[i].a_un.a_val = sed.sed_lddata;
894			continue;
895		case AT_BASE:
896			val = sedp->sed_base;
897			break;
898		case AT_ENTRY:
899			val = sedp->sed_entry;
900			break;
901		case AT_PHDR:
902			val = sedp->sed_phdr;
903			break;
904		case AT_PHENT:
905			val = sedp->sed_phent;
906			break;
907		case AT_PHNUM:
908			val = sedp->sed_phnum;
909			break;
910		case AT_SUN_LDDATA:
911			val = sedp->sed_lddata;
912			break;
913		default:
914			continue;
915		}
916
917		up->u_auxv[i].a_un.a_val = val;
918		if (val == NULL) {
919			/* Hide the entry for static binaries */
920			up->u_auxv[i].a_type = AT_IGNORE;
921		}
922	}
923
924	/*
925	 * The last thing we do here is clear spd->spd_handler.  This is
926	 * important because if we're already a branded process and if this
927	 * exec succeeds, there is a window between when the exec() first
928	 * returns to the userland of the new process and when our brand
929	 * library get's initialized, during which we don't want system
930	 * calls to be re-directed to our brand library since it hasn't
931	 * been initialized yet.
932	 */
933	spd->spd_handler = NULL;
934
935	return (0);
936}
937
938
939int
940_init(void)
941{
942	int err;
943
944	/*
945	 * Set up the table indicating which system calls we want to
946	 * interpose on.  We should probably build this automatically from
947	 * a list of system calls that is shared with the user-space
948	 * library.
949	 */
950	s10_emulation_table = kmem_zalloc(NSYSCALL, KM_SLEEP);
951	s10_emulation_table[SYS_exec] = 1;			/*  11 */
952	s10_emulation_table[SYS_ioctl] = 1;			/*  54 */
953	s10_emulation_table[SYS_execve] = 1;			/*  59 */
954	s10_emulation_table[SYS_acctctl] = 1;			/*  71 */
955	s10_emulation_table[S10_SYS_issetugid] = 1;		/*  75 */
956	s10_emulation_table[SYS_getdents] = 1;			/*  81 */
957	s10_emulation_table[SYS_uname] = 1;			/* 135 */
958	s10_emulation_table[SYS_systeminfo] = 1;		/* 139 */
959#ifdef	__amd64
960	s10_emulation_table[SYS_lwp_create] = 1;		/* 159 */
961	s10_emulation_table[SYS_lwp_private] = 1;		/* 166 */
962#endif	/* __amd64 */
963	s10_emulation_table[SYS_pwrite] = 1;			/* 174 */
964	s10_emulation_table[SYS_auditsys] = 1;			/* 186 */
965	s10_emulation_table[SYS_sigqueue] = 1;			/* 190 */
966	s10_emulation_table[SYS_lwp_mutex_timedlock] = 1;	/* 210 */
967	s10_emulation_table[SYS_getdents64] = 1;		/* 213 */
968	s10_emulation_table[SYS_pwrite64] = 1;			/* 223 */
969	s10_emulation_table[SYS_zone] = 1;			/* 227 */
970	s10_emulation_table[SYS_lwp_mutex_trylock] = 1;		/* 251 */
971
972	err = mod_install(&modlinkage);
973	if (err) {
974		cmn_err(CE_WARN, "Couldn't install brand module");
975		kmem_free(s10_emulation_table, NSYSCALL);
976	}
977
978	return (err);
979}
980
/*
 * Module information entry point: passes 'modinfop' to mod_info() together
 * with this module's linkage and returns its result.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
986
987int
988_fini(void)
989{
990	int err;
991
992	/*
993	 * If there are any zones using this brand, we can't allow it to be
994	 * unloaded.
995	 */
996	if (brand_zone_count(&s10_brand))
997		return (EBUSY);
998
999	kmem_free(s10_emulation_table, NSYSCALL);
1000	s10_emulation_table = NULL;
1001
1002	err = mod_remove(&modlinkage);
1003	if (err)
1004		cmn_err(CE_WARN, "Couldn't unload s10 brand module");
1005
1006	return (err);
1007}
1008