1/*	$NetBSD: kern_proc.c,v 1.275 2024/06/02 12:11:35 andvar Exp $	*/
2
3/*-
4 * Copyright (c) 1999, 2006, 2007, 2008, 2020, 2023
5 *     The NetBSD Foundation, Inc.
6 * All rights reserved.
7 *
8 * This code is derived from software contributed to The NetBSD Foundation
9 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
10 * NASA Ames Research Center, and by Andrew Doran.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34/*
35 * Copyright (c) 1982, 1986, 1989, 1991, 1993
36 *	The Regents of the University of California.  All rights reserved.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 *    notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 *    notice, this list of conditions and the following disclaimer in the
45 *    documentation and/or other materials provided with the distribution.
46 * 3. Neither the name of the University nor the names of its contributors
47 *    may be used to endorse or promote products derived from this software
48 *    without specific prior written permission.
49 *
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * SUCH DAMAGE.
61 *
62 *	@(#)kern_proc.c	8.7 (Berkeley) 2/14/95
63 */
64
65#include <sys/cdefs.h>
66__KERNEL_RCSID(0, "$NetBSD: kern_proc.c,v 1.275 2024/06/02 12:11:35 andvar Exp $");
67
68#ifdef _KERNEL_OPT
69#include "opt_kstack.h"
70#include "opt_maxuprc.h"
71#include "opt_dtrace.h"
72#include "opt_compat_netbsd32.h"
73#include "opt_kaslr.h"
74#endif
75
76#if defined(__HAVE_COMPAT_NETBSD32) && !defined(COMPAT_NETBSD32) \
77    && !defined(_RUMPKERNEL)
78#define COMPAT_NETBSD32
79#endif
80
81#include <sys/param.h>
82#include <sys/systm.h>
83#include <sys/kernel.h>
84#include <sys/proc.h>
85#include <sys/resourcevar.h>
86#include <sys/buf.h>
87#include <sys/acct.h>
88#include <sys/wait.h>
89#include <sys/file.h>
90#include <ufs/ufs/quota.h>
91#include <sys/uio.h>
92#include <sys/pool.h>
93#include <sys/pset.h>
94#include <sys/ioctl.h>
95#include <sys/tty.h>
96#include <sys/signalvar.h>
97#include <sys/ras.h>
98#include <sys/filedesc.h>
99#include <sys/syscall_stats.h>
100#include <sys/kauth.h>
101#include <sys/sleepq.h>
102#include <sys/atomic.h>
103#include <sys/kmem.h>
104#include <sys/namei.h>
105#include <sys/dtrace_bsd.h>
106#include <sys/sysctl.h>
107#include <sys/exec.h>
108#include <sys/cpu.h>
109#include <sys/compat_stub.h>
110#include <sys/futex.h>
111#include <sys/pserialize.h>
112
113#include <uvm/uvm_extern.h>
114
115/*
116 * Process lists.
117 */
118
119struct proclist		allproc		__cacheline_aligned;
120struct proclist		zombproc	__cacheline_aligned;
121
122kmutex_t		proc_lock	__cacheline_aligned;
123static pserialize_t	proc_psz;
124
125/*
126 * pid to lwp/proc lookup is done by indexing the pid_table array.
127 * Since pid numbers are only allocated when an empty slot
128 * has been found, there is no need to search any lists ever.
129 * (an orphaned pgrp will lock the slot, a session will lock
130 * the pgrp with the same number.)
131 * If the table is too small it is reallocated with twice the
132 * previous size and the entries 'unzipped' into the two halves.
133 * A linked list of free entries is passed through the pt_slot
134 * field of 'free' items - set odd to be an invalid ptr.  Two
135 * additional bits are also used to indicate if the slot is
136 * currently occupied by a proc or lwp, and if the PID is
137 * hidden from certain kinds of lookups.  We thus require a
138 * minimum alignment for proc and lwp structures (LWPs are
139 * at least 32-byte aligned).
140 */
141
142struct pid_table {
143	uintptr_t	pt_slot;
144	struct pgrp	*pt_pgrp;
145	pid_t		pt_pid;
146};
147
148#define	PT_F_FREE		((uintptr_t)__BIT(0))
149#define	PT_F_LWP		0	/* pseudo-flag */
150#define	PT_F_PROC		((uintptr_t)__BIT(1))
151
152#define	PT_F_TYPEBITS		(PT_F_FREE|PT_F_PROC)
153#define	PT_F_ALLBITS		(PT_F_FREE|PT_F_PROC)
154
155#define	PT_VALID(s)		(((s) & PT_F_FREE) == 0)
156#define	PT_RESERVED(s)		((s) == 0)
157#define	PT_NEXT(s)		((u_int)(s) >> 1)
158#define	PT_SET_FREE(pid)	(((pid) << 1) | PT_F_FREE)
159#define	PT_SET_LWP(l)		((uintptr_t)(l))
160#define	PT_SET_PROC(p)		(((uintptr_t)(p)) | PT_F_PROC)
161#define	PT_SET_RESERVED		0
162#define	PT_GET_LWP(s)		((struct lwp *)((s) & ~PT_F_ALLBITS))
163#define	PT_GET_PROC(s)		((struct proc *)((s) & ~PT_F_ALLBITS))
164#define	PT_GET_TYPE(s)		((s) & PT_F_TYPEBITS)
165#define	PT_IS_LWP(s)		(PT_GET_TYPE(s) == PT_F_LWP && (s) != 0)
166#define	PT_IS_PROC(s)		(PT_GET_TYPE(s) == PT_F_PROC)
167
168#define	MIN_PROC_ALIGNMENT	(PT_F_ALLBITS + 1)
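
/*
 * Illustrative sketch (not compiled): how pt_slot values are encoded
 * and decoded with the macros above.  A free slot stores the next
 * free index shifted left one bit with PT_F_FREE set; an occupied
 * slot stores a suitably aligned lwp or proc pointer, with PT_F_PROC
 * distinguishing the two cases.
 */
#if 0
	uintptr_t slot;
	struct lwp *l;			/* some suitably aligned LWP */
	struct proc *p;			/* some suitably aligned proc */

	slot = PT_SET_FREE(42);		/* free entry, next free index 42 */
	KASSERT(!PT_VALID(slot));
	KASSERT(PT_NEXT(slot) == 42);

	slot = PT_SET_LWP(l);		/* slot occupied by an LWP */
	KASSERT(PT_IS_LWP(slot) && PT_GET_LWP(slot) == l);

	slot = PT_SET_PROC(p);		/* slot occupied by a proc, no LWP yet */
	KASSERT(PT_IS_PROC(slot) && PT_GET_PROC(slot) == p);
#endif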
169
170/*
171 * Table of process IDs (PIDs).
172 */
173static struct pid_table *pid_table	__read_mostly;
174
175#define	INITIAL_PID_TABLE_SIZE		(1 << 5)
176
177/* Table mask, threshold for growing and number of allocated PIDs. */
178static u_int		pid_tbl_mask	__read_mostly;
179static u_int		pid_alloc_lim	__read_mostly;
180static u_int		pid_alloc_cnt	__cacheline_aligned;
181
182/* Next free, last free and maximum PIDs. */
183static u_int		next_free_pt	__cacheline_aligned;
184static u_int		last_free_pt	__cacheline_aligned;
185static pid_t		pid_max		__read_mostly;
186
187/* Components of the first process -- never freed. */
188
189struct session session0 = {
190	.s_count = 1,
191	.s_sid = 0,
192};
193struct pgrp pgrp0 = {
194	.pg_members = LIST_HEAD_INITIALIZER(&pgrp0.pg_members),
195	.pg_session = &session0,
196};
197filedesc_t filedesc0;
198struct cwdinfo cwdi0 = {
199	.cwdi_cmask = CMASK,
200	.cwdi_refcnt = 1,
201};
202struct plimit limit0;
203struct pstats pstat0;
204struct vmspace vmspace0;
205struct sigacts sigacts0;
206struct proc proc0 = {
207	.p_lwps = LIST_HEAD_INITIALIZER(&proc0.p_lwps),
208	.p_sigwaiters = LIST_HEAD_INITIALIZER(&proc0.p_sigwaiters),
209	.p_nlwps = 1,
210	.p_nrlwps = 1,
211	.p_pgrp = &pgrp0,
212	.p_comm = "system",
213	/*
214	 * Set P_NOCLDWAIT so that kernel threads are reparented to init(8)
215	 * when they exit.  init(8) can easily wait them out for us.
216	 */
217	.p_flag = PK_SYSTEM | PK_NOCLDWAIT,
218	.p_stat = SACTIVE,
219	.p_nice = NZERO,
220	.p_emul = &emul_netbsd,
221	.p_cwdi = &cwdi0,
222	.p_limit = &limit0,
223	.p_fd = &filedesc0,
224	.p_vmspace = &vmspace0,
225	.p_stats = &pstat0,
226	.p_sigacts = &sigacts0,
227#ifdef PROC0_MD_INITIALIZERS
228	PROC0_MD_INITIALIZERS
229#endif
230};
231kauth_cred_t cred0;
232
233static const int	nofile	= NOFILE;
234static const int	maxuprc	= MAXUPRC;
235
236static int sysctl_doeproc(SYSCTLFN_PROTO);
237static int sysctl_kern_proc_args(SYSCTLFN_PROTO);
238static int sysctl_security_expose_address(SYSCTLFN_PROTO);
239
240#ifdef KASLR
241static int kern_expose_address = 0;
242#else
243static int kern_expose_address = 1;
244#endif
245/*
246 * The process list descriptors, used during pid allocation and
247 * by sysctl.  No locking on this data structure is needed since
248 * it is completely static.
249 */
250const struct proclist_desc proclists[] = {
251	{ &allproc	},
252	{ &zombproc	},
253	{ NULL		},
254};
255
256static struct pgrp *	pg_remove(pid_t);
257static void		pg_delete(pid_t);
258static void		orphanpg(struct pgrp *);
259
260static specificdata_domain_t proc_specificdata_domain;
261
262static pool_cache_t proc_cache;
263
264static kauth_listener_t proc_listener;
265
266static void fill_proc(const struct proc *, struct proc *, bool);
267static int fill_pathname(struct lwp *, pid_t, void *, size_t *);
268static int fill_cwd(struct lwp *, pid_t, void *, size_t *);
269
270static int
271proc_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
272    void *arg0, void *arg1, void *arg2, void *arg3)
273{
274	struct proc *p;
275	int result;
276
277	result = KAUTH_RESULT_DEFER;
278	p = arg0;
279
280	switch (action) {
281	case KAUTH_PROCESS_CANSEE: {
282		enum kauth_process_req req;
283
284		req = (enum kauth_process_req)(uintptr_t)arg1;
285
286		switch (req) {
287		case KAUTH_REQ_PROCESS_CANSEE_ARGS:
288		case KAUTH_REQ_PROCESS_CANSEE_ENTRY:
289		case KAUTH_REQ_PROCESS_CANSEE_OPENFILES:
290		case KAUTH_REQ_PROCESS_CANSEE_EPROC:
291			result = KAUTH_RESULT_ALLOW;
292			break;
293
294		case KAUTH_REQ_PROCESS_CANSEE_ENV:
295			if (kauth_cred_getuid(cred) !=
296			    kauth_cred_getuid(p->p_cred) ||
297			    kauth_cred_getuid(cred) !=
298			    kauth_cred_getsvuid(p->p_cred))
299				break;
300
301			result = KAUTH_RESULT_ALLOW;
302
303			break;
304
305		case KAUTH_REQ_PROCESS_CANSEE_KPTR:
306			if (!kern_expose_address)
307				break;
308
309			if (kern_expose_address == 1 && !(p->p_flag & PK_KMEM))
310				break;
311
312			result = KAUTH_RESULT_ALLOW;
313
314			break;
315
316		default:
317			break;
318		}
319
320		break;
321		}
322
323	case KAUTH_PROCESS_FORK: {
324		int lnprocs = (int)(unsigned long)arg2;
325
326		/*
327		 * Don't allow a nonprivileged user to use the last few
328		 * processes. The variable lnprocs is the current number of
329		 * processes, maxproc is the limit.
330		 */
331		if (__predict_false((lnprocs >= maxproc - 5)))
332			break;
333
334		result = KAUTH_RESULT_ALLOW;
335
336		break;
337		}
338
339	case KAUTH_PROCESS_CORENAME:
340	case KAUTH_PROCESS_STOPFLAG:
341		if (proc_uidmatch(cred, p->p_cred) == 0)
342			result = KAUTH_RESULT_ALLOW;
343
344		break;
345
346	default:
347		break;
348	}
349
350	return result;
351}
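
/*
 * Illustrative sketch (not compiled): callers consult the process
 * scope through kauth_authorize_process(), and the listener above
 * answers the request.  Example: deciding whether the caller may
 * read another process's environment.
 */
#if 0
	error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE,
	    p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENV), NULL, NULL);
	if (error != 0)
		return error;	/* not permitted to see the environment */
#endif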
352
353static int
354proc_ctor(void *arg __unused, void *obj, int flags __unused)
355{
356	struct proc *p = obj;
357
358	memset(p, 0, sizeof(*p));
359	klist_init(&p->p_klist);
360
361	/*
362	 * There is no need for a proc_dtor() to do a klist_fini(),
363	 * since knote_proc_exit() ensures that p->p_klist is empty
364	 * when a process exits.
365	 */
366
367	return 0;
368}
369
370static pid_t proc_alloc_pid_slot(struct proc *, uintptr_t);
371
372/*
373 * Initialize global process hashing structures.
374 */
375void
376procinit(void)
377{
378	const struct proclist_desc *pd;
379	u_int i;
380#define	LINK_EMPTY ((PID_MAX + INITIAL_PID_TABLE_SIZE) & ~(INITIAL_PID_TABLE_SIZE - 1))
381
382	for (pd = proclists; pd->pd_list != NULL; pd++)
383		LIST_INIT(pd->pd_list);
384
385	mutex_init(&proc_lock, MUTEX_DEFAULT, IPL_NONE);
386
387	proc_psz = pserialize_create();
388
389	pid_table = kmem_alloc(INITIAL_PID_TABLE_SIZE
390	    * sizeof(struct pid_table), KM_SLEEP);
391	pid_tbl_mask = INITIAL_PID_TABLE_SIZE - 1;
392	pid_max = PID_MAX;
393
394	/* Set free list running through table...
395	   Preset 'use count' above PID_MAX so we allocate pid 1 next. */
396	for (i = 0; i <= pid_tbl_mask; i++) {
397		pid_table[i].pt_slot = PT_SET_FREE(LINK_EMPTY + i + 1);
398		pid_table[i].pt_pgrp = 0;
399		pid_table[i].pt_pid = 0;
400	}
401	/* slot 0 is just grabbed */
402	next_free_pt = 1;
403	/* Need to fix last entry. */
404	last_free_pt = pid_tbl_mask;
405	pid_table[last_free_pt].pt_slot = PT_SET_FREE(LINK_EMPTY);
406	/* point at which we grow table - to avoid reusing pids too often */
407	pid_alloc_lim = pid_tbl_mask - 1;
408#undef LINK_EMPTY
409
410	/* Reserve PID 1 for init(8). */	/* XXX slightly gross */
411	mutex_enter(&proc_lock);
412	if (proc_alloc_pid_slot(&proc0, PT_SET_RESERVED) != 1)
413		panic("failed to reserve PID 1 for init(8)");
414	mutex_exit(&proc_lock);
415
416	proc_specificdata_domain = specificdata_domain_create();
417	KASSERT(proc_specificdata_domain != NULL);
418
419	size_t proc_alignment = coherency_unit;
420	if (proc_alignment < MIN_PROC_ALIGNMENT)
421		proc_alignment = MIN_PROC_ALIGNMENT;
422
423	proc_cache = pool_cache_init(sizeof(struct proc), proc_alignment, 0, 0,
424	    "procpl", NULL, IPL_NONE, proc_ctor, NULL, NULL);
425
426	proc_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
427	    proc_listener_cb, NULL);
428}
429
430void
431procinit_sysctl(void)
432{
433	static struct sysctllog *clog;
434
435	sysctl_createv(&clog, 0, NULL, NULL,
436		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
437		       CTLTYPE_INT, "expose_address",
438		       SYSCTL_DESCR("Enable exposing kernel addresses"),
439		       sysctl_security_expose_address, 0,
440		       &kern_expose_address, 0, CTL_KERN, CTL_CREATE, CTL_EOL);
441	sysctl_createv(&clog, 0, NULL, NULL,
442		       CTLFLAG_PERMANENT,
443		       CTLTYPE_NODE, "proc",
444		       SYSCTL_DESCR("System-wide process information"),
445		       sysctl_doeproc, 0, NULL, 0,
446		       CTL_KERN, KERN_PROC, CTL_EOL);
447	sysctl_createv(&clog, 0, NULL, NULL,
448		       CTLFLAG_PERMANENT,
449		       CTLTYPE_NODE, "proc2",
450		       SYSCTL_DESCR("Machine-independent process information"),
451		       sysctl_doeproc, 0, NULL, 0,
452		       CTL_KERN, KERN_PROC2, CTL_EOL);
453	sysctl_createv(&clog, 0, NULL, NULL,
454		       CTLFLAG_PERMANENT,
455		       CTLTYPE_NODE, "proc_args",
456		       SYSCTL_DESCR("Process argument information"),
457		       sysctl_kern_proc_args, 0, NULL, 0,
458		       CTL_KERN, KERN_PROC_ARGS, CTL_EOL);
459
460	/*
461	  "nodes" under these:
462
463	  KERN_PROC_ALL
464	  KERN_PROC_PID pid
465	  KERN_PROC_PGRP pgrp
466	  KERN_PROC_SESSION sess
467	  KERN_PROC_TTY tty
468	  KERN_PROC_UID uid
469	  KERN_PROC_RUID uid
470	  KERN_PROC_GID gid
471	  KERN_PROC_RGID gid
472
473	  all in all, probably not worth the effort...
474	*/
475}
476
477/*
478 * Initialize process 0.
479 */
480void
481proc0_init(void)
482{
483	struct proc *p;
484	struct pgrp *pg;
485	struct rlimit *rlim;
486	rlim_t lim;
487	int i;
488
489	p = &proc0;
490	pg = &pgrp0;
491
492	mutex_init(&p->p_stmutex, MUTEX_DEFAULT, IPL_HIGH);
493	mutex_init(&p->p_auxlock, MUTEX_DEFAULT, IPL_NONE);
494	p->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
495
496	rw_init(&p->p_reflock);
497	cv_init(&p->p_waitcv, "wait");
498	cv_init(&p->p_lwpcv, "lwpwait");
499
500	LIST_INSERT_HEAD(&p->p_lwps, &lwp0, l_sibling);
501
502	KASSERT(lwp0.l_lid == 0);
503	pid_table[lwp0.l_lid].pt_slot = PT_SET_LWP(&lwp0);
504	LIST_INSERT_HEAD(&allproc, p, p_list);
505
506	pid_table[lwp0.l_lid].pt_pgrp = pg;
507	LIST_INSERT_HEAD(&pg->pg_members, p, p_pglist);
508
509#ifdef __HAVE_SYSCALL_INTERN
510	(*p->p_emul->e_syscall_intern)(p);
511#endif
512
513	/* Create credentials. */
514	cred0 = kauth_cred_alloc();
515	p->p_cred = cred0;
516
517	/* Create the CWD info. */
518	rw_init(&cwdi0.cwdi_lock);
519
520	/* Create the limits structures. */
521	mutex_init(&limit0.pl_lock, MUTEX_DEFAULT, IPL_NONE);
522
523	rlim = limit0.pl_rlimit;
524	for (i = 0; i < __arraycount(limit0.pl_rlimit); i++) {
525		rlim[i].rlim_cur = RLIM_INFINITY;
526		rlim[i].rlim_max = RLIM_INFINITY;
527	}
528
529	rlim[RLIMIT_NOFILE].rlim_max = maxfiles;
530	rlim[RLIMIT_NOFILE].rlim_cur = maxfiles < nofile ? maxfiles : nofile;
531
532	rlim[RLIMIT_NPROC].rlim_max = maxproc;
533	rlim[RLIMIT_NPROC].rlim_cur = maxproc < maxuprc ? maxproc : maxuprc;
534
535	lim = MIN(VM_MAXUSER_ADDRESS, ctob((rlim_t)uvm_availmem(false)));
536	rlim[RLIMIT_RSS].rlim_max = lim;
537	rlim[RLIMIT_MEMLOCK].rlim_max = lim;
538	rlim[RLIMIT_MEMLOCK].rlim_cur = lim / 3;
539
540	rlim[RLIMIT_NTHR].rlim_max = maxlwp;
541	rlim[RLIMIT_NTHR].rlim_cur = maxlwp / 2;
542
543	/* Note that default core name has zero length. */
544	limit0.pl_corename = defcorename;
545	limit0.pl_cnlen = 0;
546	limit0.pl_refcnt = 1;
547	limit0.pl_writeable = false;
548	limit0.pl_sv_limit = NULL;
549
550	/* Configure virtual memory system, set vm rlimits. */
551	uvm_init_limits(p);
552
553	/* Initialize file descriptor table for proc0. */
554	fd_init(&filedesc0);
555
556	/*
557	 * Initialize proc0's vmspace, which uses the kernel pmap.
558	 * All kernel processes (which never have user space mappings)
559	 * share proc0's vmspace, and thus, the kernel pmap.
560	 */
561	uvmspace_init(&vmspace0, pmap_kernel(), round_page(VM_MIN_ADDRESS),
562	    trunc_page(VM_MAXUSER_ADDRESS),
563#ifdef __USE_TOPDOWN_VM
564	    true
565#else
566	    false
567#endif
568	    );
569
570	/* Initialize signal state for proc0. XXX IPL_SCHED */
571	mutex_init(&p->p_sigacts->sa_mutex, MUTEX_DEFAULT, IPL_SCHED);
572	siginit(p);
573
574	proc_initspecific(p);
575	kdtrace_proc_ctor(NULL, p);
576}
577
578/*
579 * Session reference counting.
580 */
581
582void
583proc_sesshold(struct session *ss)
584{
585
586	KASSERT(mutex_owned(&proc_lock));
587	ss->s_count++;
588}
589
590void
591proc_sessrele(struct session *ss)
592{
593	struct pgrp *pg;
594
595	KASSERT(mutex_owned(&proc_lock));
596	KASSERT(ss->s_count > 0);
597
598	/*
599	 * We keep the pgrp with the same id as the session in order to
600	 * stop a process being given the same pid.  Since the pgrp holds
601	 * a reference to the session, it must be a 'zombie' pgrp by now.
602	 */
603	if (--ss->s_count == 0) {
604		pg = pg_remove(ss->s_sid);
605	} else {
606		pg = NULL;
607		ss = NULL;
608	}
609
610	mutex_exit(&proc_lock);
611
612	if (pg)
613		kmem_free(pg, sizeof(struct pgrp));
614	if (ss)
615		kmem_free(ss, sizeof(struct session));
616}
617
618/*
619 * Check that the specified process group is in the session of the
620 * specified process.
621 * Treats negative ids as process ids.
622 * Used to validate TIOCSPGRP requests.
623 */
624int
625pgid_in_session(struct proc *p, pid_t pg_id)
626{
627	struct pgrp *pgrp;
628	struct session *session;
629	int error;
630
631	if (pg_id == INT_MIN)
632		return EINVAL;
633
634	mutex_enter(&proc_lock);
635	if (pg_id < 0) {
636		struct proc *p1 = proc_find(-pg_id);
637		if (p1 == NULL) {
638			error = EINVAL;
639			goto fail;
640		}
641		pgrp = p1->p_pgrp;
642	} else {
643		pgrp = pgrp_find(pg_id);
644		if (pgrp == NULL) {
645			error = EINVAL;
646			goto fail;
647		}
648	}
649	session = pgrp->pg_session;
650	error = (session != p->p_pgrp->pg_session) ? EPERM : 0;
651fail:
652	mutex_exit(&proc_lock);
653	return error;
654}
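
/*
 * Illustrative sketch (not compiled): a TIOCSPGRP-style handler would
 * validate the requested foreground process group roughly like this
 * before installing it on the tty.
 */
#if 0
	pid_t pgid = *(pid_t *)data;	/* hypothetical ioctl argument */

	if ((error = pgid_in_session(curproc, pgid)) != 0)
		return error;		/* not in our session, or invalid */
#endif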
655
656/*
657 * p_inferior: is p an inferior of q?
658 */
659static inline bool
660p_inferior(struct proc *p, struct proc *q)
661{
662
663	KASSERT(mutex_owned(&proc_lock));
664
665	for (; p != q; p = p->p_pptr)
666		if (p->p_pid == 0)
667			return false;
668	return true;
669}
670
671/*
672 * proc_find_lwp: locate an lwp in said proc by the ID.
673 *
674 * => Must be called with p::p_lock held.
675 * => LSIDL lwps are not returned because they are only partially
676 *    constructed while occupying the slot.
677 * => Callers need to be careful about lwp::l_stat of the returned
678 *    lwp.
679 */
680struct lwp *
681proc_find_lwp(proc_t *p, pid_t pid)
682{
683	struct pid_table *pt;
684	unsigned pt_mask;
685	struct lwp *l = NULL;
686	uintptr_t slot;
687	int s;
688
689	KASSERT(mutex_owned(p->p_lock));
690
691	/*
692	 * Look in the pid_table.  This is done unlocked inside a
693	 * pserialize read section covering pid_table's memory
694	 * allocation only, so take care to read things in the correct
695	 * order:
696	 *
697	 * 1. First read the table mask -- this only ever increases, in
698	 *    expand_pid_table, so a stale value is safely
699	 *    conservative.
700	 *
701	 * 2. Next read the pid table -- this is always set _before_
702	 *    the mask increases, so if we see a new table and stale
703	 *    mask, the mask is still valid for the table.
704	 */
705	s = pserialize_read_enter();
706	pt_mask = atomic_load_acquire(&pid_tbl_mask);
707	pt = &atomic_load_consume(&pid_table)[pid & pt_mask];
708	slot = atomic_load_consume(&pt->pt_slot);
709	if (__predict_false(!PT_IS_LWP(slot))) {
710		pserialize_read_exit(s);
711		return NULL;
712	}
713
714	/*
715	 * Check to see if the LWP is from the correct process.  We won't
716	 * see entries in pid_table from a prior process that also used "p",
717	 * by virtue of the fact that allocating "p" means all prior updates
718	 * to dependent data structures are visible to this thread.
719	 */
720	l = PT_GET_LWP(slot);
721	if (__predict_false(atomic_load_relaxed(&l->l_proc) != p)) {
722		pserialize_read_exit(s);
723		return NULL;
724	}
725
726	/*
727	 * We now know that p->p_lock holds this LWP stable.
728	 *
729	 * If the status is not LSIDL, it means the LWP is intended to be
730	 * findable by LID and l_lid cannot change behind us.
731	 *
732	 * No need to acquire the LWP's lock to check for LSIDL, as
733	 * p->p_lock must be held to transition in and out of LSIDL.
734	 * Any other observed state is of no particular interest.
735	 */
736	pserialize_read_exit(s);
737	return l->l_stat != LSIDL && l->l_lid == pid ? l : NULL;
738}
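
/*
 * Illustrative sketch (not compiled): proc_find_lwp() requires the
 * caller to hold p->p_lock, which also keeps the returned LWP from
 * being freed while it is used.
 */
#if 0
	mutex_enter(p->p_lock);
	l = proc_find_lwp(p, lid);
	if (l != NULL) {
		/* use l; remember to check l->l_stat where it matters */
	}
	mutex_exit(p->p_lock);
#endif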
739
740/*
741 * proc_find_lwp_unlocked: locate an lwp in said proc by the ID.
742 *
743 * => Called in a pserialize read section with no locks held.
744 * => LSIDL lwps are not returned because they are only partially
745 *    constructed while occupying the slot.
746 * => Callers need to be careful about lwp::l_stat of the returned
747 *    lwp.
748 * => If an LWP is found, it's returned locked.
749 */
750struct lwp *
751proc_find_lwp_unlocked(proc_t *p, pid_t pid)
752{
753	struct pid_table *pt;
754	unsigned pt_mask;
755	struct lwp *l = NULL;
756	uintptr_t slot;
757
758	KASSERT(pserialize_in_read_section());
759
760	/*
761	 * Look in the pid_table.  This is done unlocked inside a
762	 * pserialize read section covering pid_table's memory
763	 * allocation only, so take care to read things in the correct
764	 * order:
765	 *
766	 * 1. First read the table mask -- this only ever increases, in
767	 *    expand_pid_table, so a stale value is safely
768	 *    conservative.
769	 *
770	 * 2. Next read the pid table -- this is always set _before_
771	 *    the mask increases, so if we see a new table and stale
772	 *    mask, the mask is still valid for the table.
773	 */
774	pt_mask = atomic_load_acquire(&pid_tbl_mask);
775	pt = &atomic_load_consume(&pid_table)[pid & pt_mask];
776	slot = atomic_load_consume(&pt->pt_slot);
777	if (__predict_false(!PT_IS_LWP(slot))) {
778		return NULL;
779	}
780
781	/*
782	 * Lock the LWP we found to get it stable.  If it's embryonic or
783	 * reaped (LSIDL) then none of the other fields can safely be
784	 * checked.
785	 */
786	l = PT_GET_LWP(slot);
787	lwp_lock(l);
788	if (__predict_false(l->l_stat == LSIDL)) {
789		lwp_unlock(l);
790		return NULL;
791	}
792
793	/*
794	 * l_proc and l_lid are now known stable because the LWP is not
795	 * LSIDL, so check those fields too to make sure we found the
796	 * right thing.
797	 */
798	if (__predict_false(l->l_proc != p || l->l_lid != pid)) {
799		lwp_unlock(l);
800		return NULL;
801	}
802
803	/* Everything checks out, return it locked. */
804	return l;
805}
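
/*
 * Illustrative sketch (not compiled): callers of
 * proc_find_lwp_unlocked() provide the pserialize read section
 * themselves and must drop the LWP lock that is returned held.
 */
#if 0
	s = pserialize_read_enter();
	l = proc_find_lwp_unlocked(p, lid);
	if (l != NULL) {
		/* l is locked and known not to be LSIDL here */
		lwp_unlock(l);
	}
	pserialize_read_exit(s);
#endif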
806
807/*
808 * proc_find_lwp_acquire_proc: locate an lwp and acquire a lock
809 * on its containing proc.
810 *
811 * => Similar to proc_find_lwp(), but does not require you to have
812 *    the proc a priori.
813 * => Also returns proc * to caller, with p::p_lock held.
814 * => Same caveats apply.
815 */
816struct lwp *
817proc_find_lwp_acquire_proc(pid_t pid, struct proc **pp)
818{
819	struct pid_table *pt;
820	struct proc *p = NULL;
821	struct lwp *l = NULL;
822	uintptr_t slot;
823
824	KASSERT(pp != NULL);
825	mutex_enter(&proc_lock);
826	pt = &pid_table[pid & pid_tbl_mask];
827
828	slot = pt->pt_slot;
829	if (__predict_true(PT_IS_LWP(slot) && pt->pt_pid == pid)) {
830		l = PT_GET_LWP(slot);
831		p = l->l_proc;
832		mutex_enter(p->p_lock);
833		if (__predict_false(l->l_stat == LSIDL)) {
834			mutex_exit(p->p_lock);
835			l = NULL;
836			p = NULL;
837		}
838	}
839	mutex_exit(&proc_lock);
840
841	KASSERT(p == NULL || mutex_owned(p->p_lock));
842	*pp = p;
843	return l;
844}
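
/*
 * Illustrative sketch (not compiled): the containing proc comes back
 * with p->p_lock held, so the caller releases it when done.
 */
#if 0
	l = proc_find_lwp_acquire_proc(lid, &p);
	if (l != NULL) {
		/* use l and p; both are held stable by p->p_lock */
		mutex_exit(p->p_lock);
	}
#endif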
845
846/*
847 * proc_find_raw_pid_table_locked: locate a process by the ID.
848 *
849 * => Must be called with proc_lock held (not asserted: also used by DDB).
850 */
851static proc_t *
852proc_find_raw_pid_table_locked(pid_t pid, bool any_lwpid)
853{
854	struct pid_table *pt;
855	proc_t *p = NULL;
856	uintptr_t slot;
857
858	/* No - used by DDB.  KASSERT(mutex_owned(&proc_lock)); */
859	pt = &pid_table[pid & pid_tbl_mask];
860
861	slot = pt->pt_slot;
862	if (__predict_true(PT_IS_LWP(slot) && pt->pt_pid == pid)) {
863		/*
864		 * When looking up processes, require a direct match
865		 * on the PID assigned to the proc, not just one of
866		 * its LWPs.
867		 *
868		 * N.B. We require lwp::l_proc of LSIDL LWPs to be
869		 * valid here.
870		 */
871		p = PT_GET_LWP(slot)->l_proc;
872		if (__predict_false(p->p_pid != pid && !any_lwpid))
873			p = NULL;
874	} else if (PT_IS_PROC(slot) && pt->pt_pid == pid) {
875		p = PT_GET_PROC(slot);
876	}
877	return p;
878}
879
880proc_t *
881proc_find_raw(pid_t pid)
882{
883
884	return proc_find_raw_pid_table_locked(pid, false);
885}
886
887static proc_t *
888proc_find_internal(pid_t pid, bool any_lwpid)
889{
890	proc_t *p;
891
892	KASSERT(mutex_owned(&proc_lock));
893
894	p = proc_find_raw_pid_table_locked(pid, any_lwpid);
895	if (__predict_false(p == NULL)) {
896		return NULL;
897	}
898
899	/*
900	 * Only allow live processes to be found by PID.
901	 * XXX: p_stat might change, since proc unlocked.
902	 */
903	if (__predict_true(p->p_stat == SACTIVE || p->p_stat == SSTOP)) {
904		return p;
905	}
906	return NULL;
907}
908
909proc_t *
910proc_find(pid_t pid)
911{
912	return proc_find_internal(pid, false);
913}
914
915proc_t *
916proc_find_lwpid(pid_t pid)
917{
918	return proc_find_internal(pid, true);
919}
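
/*
 * Illustrative sketch (not compiled): proc_find() results are only
 * stable while proc_lock is held; a caller that needs the process for
 * longer typically takes p->p_lock before dropping proc_lock.
 */
#if 0
	mutex_enter(&proc_lock);
	p = proc_find(pid);
	if (p != NULL)
		mutex_enter(p->p_lock);
	mutex_exit(&proc_lock);
	if (p != NULL) {
		/* ... use p ... */
		mutex_exit(p->p_lock);
	}
#endif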
920
921/*
922 * pgrp_find: locate a process group by the ID.
923 *
924 * => Must be called with proc_lock held.
925 */
926struct pgrp *
927pgrp_find(pid_t pgid)
928{
929	struct pgrp *pg;
930
931	KASSERT(mutex_owned(&proc_lock));
932
933	pg = pid_table[pgid & pid_tbl_mask].pt_pgrp;
934
935	/*
936	 * Cannot look up a process group that only exists because the
937	 * session has not died yet (traditional).
938	 */
939	if (pg == NULL || pg->pg_id != pgid || LIST_EMPTY(&pg->pg_members)) {
940		return NULL;
941	}
942	return pg;
943}
944
945static void
946expand_pid_table(void)
947{
948	size_t pt_size, tsz;
949	struct pid_table *n_pt, *new_pt;
950	uintptr_t slot;
951	struct pgrp *pgrp;
952	pid_t pid, rpid;
953	u_int i;
954	uint new_pt_mask;
955
956	KASSERT(mutex_owned(&proc_lock));
957
958	/* Unlock the pid_table briefly to allocate memory. */
959	pt_size = pid_tbl_mask + 1;
960	mutex_exit(&proc_lock);
961
962	tsz = pt_size * 2 * sizeof(struct pid_table);
963	new_pt = kmem_alloc(tsz, KM_SLEEP);
964	new_pt_mask = pt_size * 2 - 1;
965
966	/* XXX For now.  The practical limit is much lower anyway. */
967	KASSERT(new_pt_mask <= FUTEX_TID_MASK);
968
969	mutex_enter(&proc_lock);
970	if (pt_size != pid_tbl_mask + 1) {
971		/* Another process beat us to it... */
972		mutex_exit(&proc_lock);
973		kmem_free(new_pt, tsz);
974		goto out;
975	}
976
977	/*
978	 * Copy entries from old table into new one.
979	 * If 'pid' is 'odd' we need to place in the upper half,
980	 * even pid's to the lower half.
981	 * Free items stay in the low half so we don't have to
982	 * fixup the reference to them.
983	 * We stuff free items on the front of the freelist
984	 * because we can't write to unmodified entries.
985	 * Processing the table backwards maintains a semblance
986	 * of issuing pid numbers that increase with time.
987	 */
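	/*
	 * Worked example (illustrative): growing from 32 to 64 slots, a
	 * live pid such as 45 (45 & 31 == 13) moves from old slot 13 to
	 * new slot 45 (45 & 63), and new slot 13 becomes a free entry
	 * pushed onto the front of the free list.  A live pid such as 13
	 * stays in slot 13 and its twin, slot 45, becomes free instead.
	 */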
988	i = pt_size - 1;
989	n_pt = new_pt + i;
990	for (; ; i--, n_pt--) {
991		slot = pid_table[i].pt_slot;
992		pgrp = pid_table[i].pt_pgrp;
993		if (!PT_VALID(slot)) {
994			/* Up 'use count' so that link is valid */
995			pid = (PT_NEXT(slot) + pt_size) & ~pt_size;
996			rpid = 0;
997			slot = PT_SET_FREE(pid);
998			if (pgrp)
999				pid = pgrp->pg_id;
1000		} else {
1001			pid = pid_table[i].pt_pid;
1002			rpid = pid;
1003		}
1004
1005		/* Save entry in appropriate half of table */
1006		n_pt[pid & pt_size].pt_slot = slot;
1007		n_pt[pid & pt_size].pt_pgrp = pgrp;
1008		n_pt[pid & pt_size].pt_pid = rpid;
1009
1010		/* Put other piece on start of free list */
1011		pid = (pid ^ pt_size) & ~pid_tbl_mask;
1012		n_pt[pid & pt_size].pt_slot =
1013			PT_SET_FREE((pid & ~pt_size) | next_free_pt);
1014		n_pt[pid & pt_size].pt_pgrp = 0;
1015		n_pt[pid & pt_size].pt_pid = 0;
1016
1017		next_free_pt = i | (pid & pt_size);
1018		if (i == 0)
1019			break;
1020	}
1021
1022	/* Save old table size and switch tables */
1023	tsz = pt_size * sizeof(struct pid_table);
1024	n_pt = pid_table;
1025	atomic_store_release(&pid_table, new_pt);
1026	KASSERT(new_pt_mask >= pid_tbl_mask);
1027	atomic_store_release(&pid_tbl_mask, new_pt_mask);
1028
1029	/*
1030	 * pid_max starts as PID_MAX (= 30000), once we have 16384
1031	 * allocated pids we need it to be larger!
1032	 */
1033	if (pid_tbl_mask > PID_MAX) {
1034		pid_max = pid_tbl_mask * 2 + 1;
1035		pid_alloc_lim |= pid_alloc_lim << 1;
1036	} else
1037		pid_alloc_lim <<= 1;	/* doubles number of free slots... */
1038
1039	mutex_exit(&proc_lock);
1040
1041	/*
1042	 * Make sure that unlocked access to the old pid_table is complete
1043	 * and then free it.
1044	 */
1045	pserialize_perform(proc_psz);
1046	kmem_free(n_pt, tsz);
1047
1048 out:	/* Return with proc_lock held again. */
1049	mutex_enter(&proc_lock);
1050}
1051
1052struct proc *
1053proc_alloc(void)
1054{
1055	struct proc *p;
1056
1057	p = pool_cache_get(proc_cache, PR_WAITOK);
1058	p->p_stat = SIDL;			/* protect against others */
1059	proc_initspecific(p);
1060	kdtrace_proc_ctor(NULL, p);
1061
1062	/*
1063	 * Allocate a placeholder in the pid_table.  When we create the
1064	 * first LWP for this process, it will take ownership of the
1065	 * slot.
1066	 */
1067	if (__predict_false(proc_alloc_pid(p) == -1)) {
1068		/* Allocating the PID failed; unwind. */
1069		proc_finispecific(p);
1070		proc_free_mem(p);
1071		p = NULL;
1072	}
1073	return p;
1074}
1075
1076/*
1077 * proc_alloc_pid_slot: allocate PID and record the occupant so that
1078 * proc_find_raw() can find it by the PID.
1079 */
1080static pid_t __noinline
1081proc_alloc_pid_slot(struct proc *p, uintptr_t slot)
1082{
1083	struct pid_table *pt;
1084	pid_t pid;
1085	int nxt;
1086
1087	KASSERT(mutex_owned(&proc_lock));
1088
1089	for (;;expand_pid_table()) {
1090		if (__predict_false(pid_alloc_cnt >= pid_alloc_lim)) {
1091			/* ensure pids cycle through 2000+ values */
1092			continue;
1093		}
1094		/*
1095		 * The first user process *must* be given PID 1.
1096		 * It has already been reserved for us.  This
1097		 * will be coming in from the proc_alloc() call
1098		 * above, and the entry will be usurped later when
1099		 * the first user LWP is created.
1100		 * XXX this is slightly gross.
1101		 */
1102		if (__predict_false(PT_RESERVED(pid_table[1].pt_slot) &&
1103				    p != &proc0)) {
1104			KASSERT(PT_IS_PROC(slot));
1105			pt = &pid_table[1];
1106			pt->pt_slot = slot;
1107			return 1;
1108		}
1109		pt = &pid_table[next_free_pt];
1110#ifdef DIAGNOSTIC
1111		if (__predict_false(PT_VALID(pt->pt_slot) || pt->pt_pgrp))
1112			panic("proc_alloc: slot busy");
1113#endif
1114		nxt = PT_NEXT(pt->pt_slot);
1115		if (nxt & pid_tbl_mask)
1116			break;
1117		/* Table full - expand (NB last entry not used....) */
1118	}
1119
1120	/* pid is 'saved use count' + 'size' + entry */
1121	pid = (nxt & ~pid_tbl_mask) + pid_tbl_mask + 1 + next_free_pt;
1122	if ((uint)pid > (uint)pid_max)
1123		pid &= pid_tbl_mask;
1124	next_free_pt = nxt & pid_tbl_mask;
1125
1126	/* XXX For now.  The practical limit is much lower anyway. */
1127	KASSERT(pid <= FUTEX_TID_MASK);
1128
1129	/* Grab table slot */
1130	pt->pt_slot = slot;
1131
1132	KASSERT(pt->pt_pid == 0);
1133	pt->pt_pid = pid;
1134	pid_alloc_cnt++;
1135
1136	return pid;
1137}
1138
1139pid_t
1140proc_alloc_pid(struct proc *p)
1141{
1142	pid_t pid;
1143
1144	KASSERT((((uintptr_t)p) & PT_F_ALLBITS) == 0);
1145	KASSERT(p->p_stat == SIDL);
1146
1147	mutex_enter(&proc_lock);
1148	pid = proc_alloc_pid_slot(p, PT_SET_PROC(p));
1149	if (pid != -1)
1150		p->p_pid = pid;
1151	mutex_exit(&proc_lock);
1152
1153	return pid;
1154}
1155
1156pid_t
1157proc_alloc_lwpid(struct proc *p, struct lwp *l)
1158{
1159	struct pid_table *pt;
1160	pid_t pid;
1161
1162	KASSERT((((uintptr_t)l) & PT_F_ALLBITS) == 0);
1163	KASSERT(l->l_proc == p);
1164	KASSERT(l->l_stat == LSIDL);
1165
1166	/*
1167	 * For unlocked lookup in proc_find_lwp(), make sure l->l_proc
1168	 * is globally visible before the LWP becomes visible via the
1169	 * pid_table.
1170	 */
1171#ifndef __HAVE_ATOMIC_AS_MEMBAR
1172	membar_producer();
1173#endif
1174
1175	/*
1176	 * If the slot for p->p_pid currently points to the proc,
1177	 * then we should usurp this ID for the LWP.  This happens
1178	 * at least once per process (for the first LWP), and can
1179	 * happen again if the first LWP for a process exits and
1180	 * before the process creates another.
1181	 */
1182	mutex_enter(&proc_lock);
1183	pid = p->p_pid;
1184	pt = &pid_table[pid & pid_tbl_mask];
1185	KASSERT(pt->pt_pid == pid);
1186	if (PT_IS_PROC(pt->pt_slot)) {
1187		KASSERT(PT_GET_PROC(pt->pt_slot) == p);
1188		l->l_lid = pid;
1189		pt->pt_slot = PT_SET_LWP(l);
1190	} else {
1191		/* Need to allocate a new slot. */
1192		pid = proc_alloc_pid_slot(p, PT_SET_LWP(l));
1193		if (pid != -1)
1194			l->l_lid = pid;
1195	}
1196	mutex_exit(&proc_lock);
1197
1198	return pid;
1199}
1200
1201static void __noinline
1202proc_free_pid_internal(pid_t pid, uintptr_t type __diagused)
1203{
1204	struct pid_table *pt;
1205
1206	KASSERT(mutex_owned(&proc_lock));
1207
1208	pt = &pid_table[pid & pid_tbl_mask];
1209
1210	KASSERT(PT_GET_TYPE(pt->pt_slot) == type);
1211	KASSERT(pt->pt_pid == pid);
1212
1213	/* save pid use count in slot */
1214	pt->pt_slot = PT_SET_FREE(pid & ~pid_tbl_mask);
1215	pt->pt_pid = 0;
1216
1217	if (pt->pt_pgrp == NULL) {
1218		/* link last freed entry onto ours */
1219		pid &= pid_tbl_mask;
1220		pt = &pid_table[last_free_pt];
1221		pt->pt_slot = PT_SET_FREE(PT_NEXT(pt->pt_slot) | pid);
1222		pt->pt_pid = 0;
1223		last_free_pt = pid;
1224		pid_alloc_cnt--;
1225	}
1226}
1227
1228/*
1229 * Free a process id - called from proc_free (in kern_exit.c)
1230 *
1231 * Called with the proc_lock held.
1232 */
1233void
1234proc_free_pid(pid_t pid)
1235{
1236
1237	KASSERT(mutex_owned(&proc_lock));
1238	proc_free_pid_internal(pid, PT_F_PROC);
1239}
1240
1241/*
1242 * Free a process id used by an LWP.  If this was the process's
1243 * first LWP, we convert the slot to point to the process; the
1244 * entry will get cleaned up later when the process finishes exiting.
1245 *
1246 * If not, then it's the same as proc_free_pid().
1247 */
1248void
1249proc_free_lwpid(struct proc *p, pid_t pid)
1250{
1251
1252	KASSERT(mutex_owned(&proc_lock));
1253
1254	if (__predict_true(p->p_pid == pid)) {
1255		struct pid_table *pt;
1256
1257		pt = &pid_table[pid & pid_tbl_mask];
1258
1259		KASSERT(pt->pt_pid == pid);
1260		KASSERT(PT_IS_LWP(pt->pt_slot));
1261		KASSERT(PT_GET_LWP(pt->pt_slot)->l_proc == p);
1262
1263		pt->pt_slot = PT_SET_PROC(p);
1264		return;
1265	}
1266	proc_free_pid_internal(pid, PT_F_LWP);
1267}
1268
1269void
1270proc_free_mem(struct proc *p)
1271{
1272
1273	kdtrace_proc_dtor(NULL, p);
1274	pool_cache_put(proc_cache, p);
1275}
1276
1277/*
1278 * proc_enterpgrp: move p to a new or existing process group (and session).
1279 *
1280 * If we are creating a new pgrp, the pgid should equal
1281 * the calling process' pid.
1282 * It is only valid to enter a process group that is in the session
1283 * of the process.
1284 * Also, mksess should only be set if we are creating a process group.
1285 *
1286 * Only called from sys_setsid, sys_setpgid and posix_spawn/spawn_return.
1287 */
1288int
1289proc_enterpgrp(struct proc *curp, pid_t pid, pid_t pgid, bool mksess)
1290{
1291	struct pgrp *new_pgrp, *pgrp;
1292	struct session *sess;
1293	struct proc *p;
1294	int rval;
1295	pid_t pg_id = NO_PGID;
1296
1297	/* Allocate data areas we might need before doing any validity checks */
1298	sess = mksess ? kmem_alloc(sizeof(*sess), KM_SLEEP) : NULL;
1299	new_pgrp = kmem_alloc(sizeof(*new_pgrp), KM_SLEEP);
1300
1301	mutex_enter(&proc_lock);
1302	rval = EPERM;	/* most common error (to save typing) */
1303
1304	/* Check pgrp exists or can be created */
1305	pgrp = pid_table[pgid & pid_tbl_mask].pt_pgrp;
1306	if (pgrp != NULL && pgrp->pg_id != pgid)
1307		goto done;
1308
1309	/* Can only set another process under restricted circumstances. */
1310	if (pid != curp->p_pid) {
1311		/* Must exist and be one of our children... */
1312		p = proc_find_internal(pid, false);
1313		if (p == NULL || !p_inferior(p, curp)) {
1314			rval = ESRCH;
1315			goto done;
1316		}
1317		/* ... in the same session... */
1318		if (sess != NULL || p->p_session != curp->p_session)
1319			goto done;
1320		/* ... existing pgid must be in same session ... */
1321		if (pgrp != NULL && pgrp->pg_session != p->p_session)
1322			goto done;
1323		/* ... and not done an exec. */
1324		if (p->p_flag & PK_EXEC) {
1325			rval = EACCES;
1326			goto done;
1327		}
1328	} else {
1329		/* ... setsid() cannot re-enter a pgrp */
1330		if (mksess && (curp->p_pgid == curp->p_pid ||
1331		    pgrp_find(curp->p_pid)))
1332			goto done;
1333		p = curp;
1334	}
1335
1336	/* Changing the process group/session of a session
1337	   leader is definitely off limits. */
1338	if (SESS_LEADER(p)) {
1339		if (sess == NULL && p->p_pgrp == pgrp)
1340			/* unless it's a definite noop */
1341			rval = 0;
1342		goto done;
1343	}
1344
1345	/* Can only create a process group with id of process */
1346	if (pgrp == NULL && pgid != pid)
1347		goto done;
1348
1349	/* Can only create a session if creating pgrp */
1350	if (sess != NULL && pgrp != NULL)
1351		goto done;
1352
1353	/* Check we allocated memory for a pgrp... */
1354	if (pgrp == NULL && new_pgrp == NULL)
1355		goto done;
1356
1357	/* Don't attach to 'zombie' pgrp */
1358	if (pgrp != NULL && LIST_EMPTY(&pgrp->pg_members))
1359		goto done;
1360
1361	/* Expect to succeed now */
1362	rval = 0;
1363
1364	if (pgrp == p->p_pgrp)
1365		/* nothing to do */
1366		goto done;
1367
1368	/* Ok all setup, link up required structures */
1369
1370	if (pgrp == NULL) {
1371		pgrp = new_pgrp;
1372		new_pgrp = NULL;
1373		if (sess != NULL) {
1374			sess->s_sid = p->p_pid;
1375			sess->s_leader = p;
1376			sess->s_count = 1;
1377			sess->s_ttyvp = NULL;
1378			sess->s_ttyp = NULL;
1379			sess->s_flags = p->p_session->s_flags & ~S_LOGIN_SET;
1380			memcpy(sess->s_login, p->p_session->s_login,
1381			    sizeof(sess->s_login));
1382			p->p_lflag &= ~PL_CONTROLT;
1383		} else {
1384			sess = p->p_pgrp->pg_session;
1385			proc_sesshold(sess);
1386		}
1387		pgrp->pg_session = sess;
1388		sess = NULL;
1389
1390		pgrp->pg_id = pgid;
1391		LIST_INIT(&pgrp->pg_members);
1392#ifdef DIAGNOSTIC
1393		if (__predict_false(pid_table[pgid & pid_tbl_mask].pt_pgrp))
1394			panic("enterpgrp: pgrp table slot in use");
1395		if (__predict_false(mksess && p != curp))
1396			panic("enterpgrp: mksession and p != curproc");
1397#endif
1398		pid_table[pgid & pid_tbl_mask].pt_pgrp = pgrp;
1399		pgrp->pg_jobc = 0;
1400	}
1401
1402	/*
1403	 * Adjust eligibility of affected pgrps to participate in job control.
1404	 * Increment eligibility counts before decrementing, otherwise we
1405	 * could reach 0 spuriously during the first call.
1406	 */
1407	fixjobc(p, pgrp, 1);
1408	fixjobc(p, p->p_pgrp, 0);
1409
1410	/* Interlock with ttread(). */
1411	mutex_spin_enter(&tty_lock);
1412
1413	/* Move process to requested group. */
1414	LIST_REMOVE(p, p_pglist);
1415	if (LIST_EMPTY(&p->p_pgrp->pg_members))
1416		/* defer delete until we've dumped the lock */
1417		pg_id = p->p_pgrp->pg_id;
1418	p->p_pgrp = pgrp;
1419	LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
1420
1421	/* Done with the swap; we can release the tty mutex. */
1422	mutex_spin_exit(&tty_lock);
1423
1424    done:
1425	if (pg_id != NO_PGID) {
1426		/* Releases proc_lock. */
1427		pg_delete(pg_id);
1428	} else {
1429		mutex_exit(&proc_lock);
1430	}
1431	if (sess != NULL)
1432		kmem_free(sess, sizeof(*sess));
1433	if (new_pgrp != NULL)
1434		kmem_free(new_pgrp, sizeof(*new_pgrp));
1435#ifdef DEBUG_PGRP
1436	if (__predict_false(rval))
1437		printf("enterpgrp(%d,%d,%d), curproc %d, rval %d\n",
1438			pid, pgid, mksess, curp->p_pid, rval);
1439#endif
1440	return rval;
1441}
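
/*
 * Illustrative sketch (not compiled): roughly how the callers use
 * proc_enterpgrp().  setpgid(2) joins or creates a process group in
 * the caller's session; setsid(2) creates a new session together with
 * a new group named after the caller's pid.
 */
#if 0
	/* setpgid(pid, pgid)-style request, no new session */
	error = proc_enterpgrp(curp, pid, pgid, false);

	/* setsid()-style request */
	error = proc_enterpgrp(curp, curp->p_pid, curp->p_pid, true);
#endif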
1442
1443/*
1444 * proc_leavepgrp: remove a process from its process group.
1445 *  => must be called with the proc_lock held, which will be released;
1446 */
1447void
1448proc_leavepgrp(struct proc *p)
1449{
1450	struct pgrp *pgrp;
1451
1452	KASSERT(mutex_owned(&proc_lock));
1453
1454	/* Interlock with ttread() */
1455	mutex_spin_enter(&tty_lock);
1456	pgrp = p->p_pgrp;
1457	LIST_REMOVE(p, p_pglist);
1458	p->p_pgrp = NULL;
1459	mutex_spin_exit(&tty_lock);
1460
1461	if (LIST_EMPTY(&pgrp->pg_members)) {
1462		/* Releases proc_lock. */
1463		pg_delete(pgrp->pg_id);
1464	} else {
1465		mutex_exit(&proc_lock);
1466	}
1467}
1468
1469/*
1470 * pg_remove: remove a process group from the table.
1471 *  => must be called with the proc_lock held;
1472 *  => returns process group to free;
1473 */
1474static struct pgrp *
1475pg_remove(pid_t pg_id)
1476{
1477	struct pgrp *pgrp;
1478	struct pid_table *pt;
1479
1480	KASSERT(mutex_owned(&proc_lock));
1481
1482	pt = &pid_table[pg_id & pid_tbl_mask];
1483	pgrp = pt->pt_pgrp;
1484
1485	KASSERT(pgrp != NULL);
1486	KASSERT(pgrp->pg_id == pg_id);
1487	KASSERT(LIST_EMPTY(&pgrp->pg_members));
1488
1489	pt->pt_pgrp = NULL;
1490
1491	if (!PT_VALID(pt->pt_slot)) {
1492		/* Orphaned pgrp, put slot onto free list. */
1493		KASSERT((PT_NEXT(pt->pt_slot) & pid_tbl_mask) == 0);
1494		pg_id &= pid_tbl_mask;
1495		pt = &pid_table[last_free_pt];
1496		pt->pt_slot = PT_SET_FREE(PT_NEXT(pt->pt_slot) | pg_id);
1497		KASSERT(pt->pt_pid == 0);
1498		last_free_pt = pg_id;
1499		pid_alloc_cnt--;
1500	}
1501	return pgrp;
1502}
1503
1504/*
1505 * pg_delete: delete and free a process group.
1506 *  => must be called with the proc_lock held, which will be released.
1507 */
1508static void
1509pg_delete(pid_t pg_id)
1510{
1511	struct pgrp *pg;
1512	struct tty *ttyp;
1513	struct session *ss;
1514
1515	KASSERT(mutex_owned(&proc_lock));
1516
1517	pg = pid_table[pg_id & pid_tbl_mask].pt_pgrp;
1518	if (pg == NULL || pg->pg_id != pg_id || !LIST_EMPTY(&pg->pg_members)) {
1519		mutex_exit(&proc_lock);
1520		return;
1521	}
1522
1523	ss = pg->pg_session;
1524
1525	/* Remove reference (if any) from tty to this process group */
1526	mutex_spin_enter(&tty_lock);
1527	ttyp = ss->s_ttyp;
1528	if (ttyp != NULL && ttyp->t_pgrp == pg) {
1529		ttyp->t_pgrp = NULL;
1530		KASSERT(ttyp->t_session == ss);
1531	}
1532	mutex_spin_exit(&tty_lock);
1533
1534	/*
1535	 * The leading process group in a session is freed by proc_sessrele(),
1536	 * if last reference.  It will also release the locks.
1537	 */
1538	pg = (ss->s_sid != pg->pg_id) ? pg_remove(pg_id) : NULL;
1539	proc_sessrele(ss);
1540
1541	if (pg != NULL) {
1542		/* Free it, if was not done above. */
1543		kmem_free(pg, sizeof(struct pgrp));
1544	}
1545}
1546
1547/*
1548 * Adjust pgrp jobc counters when specified process changes process group.
1549 * We count the number of processes in each process group that "qualify"
1550 * the group for terminal job control (those with a parent in a different
1551 * process group of the same session).  If that count reaches zero, the
1552 * process group becomes orphaned.  Check both the specified process'
1553 * process group and that of its children.
1554 * entering == 0 => p is leaving specified group.
1555 * entering == 1 => p is entering specified group.
1556 *
1557 * Call with proc_lock held.
1558 */
1559void
1560fixjobc(struct proc *p, struct pgrp *pgrp, int entering)
1561{
1562	struct pgrp *hispgrp;
1563	struct session *mysession = pgrp->pg_session;
1564	struct proc *child;
1565
1566	KASSERT(mutex_owned(&proc_lock));
1567
1568	/*
1569	 * Check p's parent to see whether p qualifies its own process
1570	 * group; if so, adjust count for p's process group.
1571	 */
1572	hispgrp = p->p_pptr->p_pgrp;
1573	if (hispgrp != pgrp && hispgrp->pg_session == mysession) {
1574		if (entering) {
1575			pgrp->pg_jobc++;
1576			p->p_lflag &= ~PL_ORPHANPG;
1577		} else {
1578			/* KASSERT(pgrp->pg_jobc > 0); */
1579			if (--pgrp->pg_jobc == 0)
1580				orphanpg(pgrp);
1581		}
1582	}
1583
1584	/*
1585	 * Check this process' children to see whether they qualify
1586	 * their process groups; if so, adjust counts for children's
1587	 * process groups.
1588	 */
1589	LIST_FOREACH(child, &p->p_children, p_sibling) {
1590		hispgrp = child->p_pgrp;
1591		if (hispgrp != pgrp && hispgrp->pg_session == mysession &&
1592		    !P_ZOMBIE(child)) {
1593			if (entering) {
1594				child->p_lflag &= ~PL_ORPHANPG;
1595				hispgrp->pg_jobc++;
1596			} else {
1597				KASSERT(hispgrp->pg_jobc > 0);
1598				if (--hispgrp->pg_jobc == 0)
1599					orphanpg(hispgrp);
1600			}
1601		}
1602	}
1603}
1604
1605/*
1606 * A process group has become orphaned;
1607 * if there are any stopped processes in the group,
1608 * hang up all processes in that group.
1609 *
1610 * Call with proc_lock held.
1611 */
1612static void
1613orphanpg(struct pgrp *pg)
1614{
1615	struct proc *p;
1616
1617	KASSERT(mutex_owned(&proc_lock));
1618
1619	LIST_FOREACH(p, &pg->pg_members, p_pglist) {
1620		if (p->p_stat == SSTOP) {
1621			p->p_lflag |= PL_ORPHANPG;
1622			psignal(p, SIGHUP);
1623			psignal(p, SIGCONT);
1624		}
1625	}
1626}
1627
1628#ifdef DDB
1629#include <ddb/db_output.h>
1630void pidtbl_dump(void);
1631void
1632pidtbl_dump(void)
1633{
1634	struct pid_table *pt;
1635	struct proc *p;
1636	struct pgrp *pgrp;
1637	uintptr_t slot;
1638	int id;
1639
1640	db_printf("pid table %p size %x, next %x, last %x\n",
1641		pid_table, pid_tbl_mask+1,
1642		next_free_pt, last_free_pt);
1643	for (pt = pid_table, id = 0; id <= pid_tbl_mask; id++, pt++) {
1644		slot = pt->pt_slot;
1645		if (!PT_VALID(slot) && !pt->pt_pgrp)
1646			continue;
1647		if (PT_IS_LWP(slot)) {
1648			p = PT_GET_LWP(slot)->l_proc;
1649		} else if (PT_IS_PROC(slot)) {
1650			p = PT_GET_PROC(slot);
1651		} else {
1652			p = NULL;
1653		}
1654		db_printf("  id %x: ", id);
1655		if (p != NULL)
1656			db_printf("slotpid %d proc %p id %d (0x%x) %s\n",
1657				pt->pt_pid, p, p->p_pid, p->p_pid, p->p_comm);
1658		else
1659			db_printf("next %x use %x\n",
1660				PT_NEXT(slot) & pid_tbl_mask,
1661				PT_NEXT(slot) & ~pid_tbl_mask);
1662		if ((pgrp = pt->pt_pgrp)) {
1663			db_printf("\tsession %p, sid %d, count %d, login %s\n",
1664			    pgrp->pg_session, pgrp->pg_session->s_sid,
1665			    pgrp->pg_session->s_count,
1666			    pgrp->pg_session->s_login);
1667			db_printf("\tpgrp %p, pg_id %d, pg_jobc %d, members %p\n",
1668			    pgrp, pgrp->pg_id, pgrp->pg_jobc,
1669			    LIST_FIRST(&pgrp->pg_members));
1670			LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
1671				db_printf("\t\tpid %d addr %p pgrp %p %s\n",
1672				    p->p_pid, p, p->p_pgrp, p->p_comm);
1673			}
1674		}
1675	}
1676}
1677#endif /* DDB */
1678
1679#ifdef KSTACK_CHECK_MAGIC
1680
1681#define	KSTACK_MAGIC	0xdeadbeaf
1682
1683/* XXX should be per process basis? */
1684static int	kstackleftmin = KSTACK_SIZE;
1685static int	kstackleftthres = KSTACK_SIZE / 8;
1686
1687void
1688kstack_setup_magic(const struct lwp *l)
1689{
1690	uint32_t *ip;
1691	uint32_t const *end;
1692
1693	KASSERT(l != NULL);
1694	KASSERT(l != &lwp0);
1695
1696	/*
1697	 * Fill the whole stack with the magic number so that later
1698	 * modification of it can be detected.
1699	 */
1700	ip = (uint32_t *)KSTACK_LOWEST_ADDR(l);
1701	end = (uint32_t *)((char *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE);
1702	for (; ip < end; ip++) {
1703		*ip = KSTACK_MAGIC;
1704	}
1705}
1706
1707void
1708kstack_check_magic(const struct lwp *l)
1709{
1710	uint32_t const *ip, *end;
1711	int stackleft;
1712
1713	KASSERT(l != NULL);
1714
1715	/* don't check proc0 */ /*XXX*/
1716	if (l == &lwp0)
1717		return;
1718
1719#ifdef __MACHINE_STACK_GROWS_UP
1720	/* stack grows upwards (eg. hppa) */
1721	ip = (uint32_t *)((void *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE);
1722	end = (uint32_t *)KSTACK_LOWEST_ADDR(l);
1723	for (ip--; ip >= end; ip--)
1724		if (*ip != KSTACK_MAGIC)
1725			break;
1726
1727	stackleft = (void *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE - (void *)ip;
1728#else /* __MACHINE_STACK_GROWS_UP */
1729	/* stack grows downwards (eg. i386) */
1730	ip = (uint32_t *)KSTACK_LOWEST_ADDR(l);
1731	end = (uint32_t *)((char *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE);
1732	for (; ip < end; ip++)
1733		if (*ip != KSTACK_MAGIC)
1734			break;
1735
1736	stackleft = ((const char *)ip) - (const char *)KSTACK_LOWEST_ADDR(l);
1737#endif /* __MACHINE_STACK_GROWS_UP */
1738
1739	if (kstackleftmin > stackleft) {
1740		kstackleftmin = stackleft;
1741		if (stackleft < kstackleftthres)
1742			printf("warning: kernel stack left %d bytes"
1743			    " (pid %u:lid %u)\n", stackleft,
1744			    (u_int)l->l_proc->p_pid, (u_int)l->l_lid);
1745	}
1746
1747	if (stackleft <= 0) {
1748		panic("magic on the top of kernel stack changed for "
1749		    "pid %u, lid %u: maybe kernel stack overflow",
1750		    (u_int)l->l_proc->p_pid, (u_int)l->l_lid);
1751	}
1752}
1753#endif /* KSTACK_CHECK_MAGIC */
1754
1755int
1756proclist_foreach_call(struct proclist *list,
1757    int (*callback)(struct proc *, void *arg), void *arg)
1758{
1759	struct proc marker;
1760	struct proc *p;
1761	int ret = 0;
1762
1763	marker.p_flag = PK_MARKER;
1764	mutex_enter(&proc_lock);
1765	for (p = LIST_FIRST(list); ret == 0 && p != NULL;) {
1766		if (p->p_flag & PK_MARKER) {
1767			p = LIST_NEXT(p, p_list);
1768			continue;
1769		}
1770		LIST_INSERT_AFTER(p, &marker, p_list);
1771		ret = (*callback)(p, arg);
1772		KASSERT(mutex_owned(&proc_lock));
1773		p = LIST_NEXT(&marker, p_list);
1774		LIST_REMOVE(&marker, p_list);
1775	}
1776	mutex_exit(&proc_lock);
1777
1778	return ret;
1779}
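
/*
 * Illustrative sketch (not compiled): a callback-based walk over
 * allproc.  The callback runs with proc_lock held and returns 0 to
 * continue; a non-zero return value stops the walk and is passed
 * back to the caller.
 */
#if 0
static int
count_procs_cb(struct proc *p, void *arg)
{

	(*(int *)arg)++;
	return 0;
}

	/* ... then, from thread context: */
	int count = 0;
	(void)proclist_foreach_call(&allproc, count_procs_cb, &count);
#endif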
1780
1781int
1782proc_vmspace_getref(struct proc *p, struct vmspace **vm)
1783{
1784
1785	/* XXXCDC: how should locking work here? */
1786
1787	/* curproc exception is for coredump. */
1788
1789	if ((p != curproc && (p->p_sflag & PS_WEXIT) != 0) ||
1790	    (p->p_vmspace->vm_refcnt < 1)) {
1791		return EFAULT;
1792	}
1793
1794	uvmspace_addref(p->p_vmspace);
1795	*vm = p->p_vmspace;
1796
1797	return 0;
1798}
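
/*
 * Illustrative sketch (not compiled): the reference obtained here is
 * dropped with uvmspace_free() once the caller is done with the
 * address space.
 */
#if 0
	struct vmspace *vm;

	if ((error = proc_vmspace_getref(p, &vm)) != 0)
		return error;
	/* ... access the target address space via vm ... */
	uvmspace_free(vm);
#endif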
1799
1800/*
1801 * Acquire a write lock on the process credential.
1802 */
1803void
1804proc_crmod_enter(void)
1805{
1806	struct lwp *l = curlwp;
1807	struct proc *p = l->l_proc;
1808	kauth_cred_t oc;
1809
1810	/* Reset what needs to be reset in plimit. */
1811	if (p->p_limit->pl_corename != defcorename) {
1812		lim_setcorename(p, defcorename, 0);
1813	}
1814
1815	mutex_enter(p->p_lock);
1816
1817	/* Ensure the LWP cached credentials are up to date. */
1818	if ((oc = l->l_cred) != p->p_cred) {
1819		l->l_cred = kauth_cred_hold(p->p_cred);
1820		kauth_cred_free(oc);
1821	}
1822}
1823
1824/*
1825 * Set in a new process credential, and drop the write lock.  The credential
1826 * must have a reference already.  Optionally, free a no-longer required
1827 * credential.
1828 */
1829void
1830proc_crmod_leave(kauth_cred_t scred, kauth_cred_t fcred, bool sugid)
1831{
1832	struct lwp *l = curlwp, *l2;
1833	struct proc *p = l->l_proc;
1834	kauth_cred_t oc;
1835
1836	KASSERT(mutex_owned(p->p_lock));
1837
1838	/* Is there a new credential to set in? */
1839	if (scred != NULL) {
1840		p->p_cred = scred;
1841		LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
1842			if (l2 != l) {
1843				lwp_lock(l2);
1844				l2->l_flag |= LW_CACHECRED;
1845				lwp_need_userret(l2);
1846				lwp_unlock(l2);
1847			}
1848		}
1849
1850		/* Ensure the LWP cached credentials are up to date. */
1851		if ((oc = l->l_cred) != scred) {
1852			l->l_cred = kauth_cred_hold(scred);
1853		}
1854	} else
1855		oc = NULL;	/* XXXgcc */
1856
1857	if (sugid) {
1858		/*
1859		 * Mark process as having changed credentials, stops
1860		 * tracing etc.
1861		 */
1862		p->p_flag |= PK_SUGID;
1863	}
1864
1865	mutex_exit(p->p_lock);
1866
1867	/* If there is a credential to be released, free it now. */
1868	if (fcred != NULL) {
1869		KASSERT(scred != NULL);
1870		kauth_cred_free(fcred);
1871		if (oc != scred)
1872			kauth_cred_free(oc);
1873	}
1874}
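
/*
 * Illustrative sketch (not compiled), roughly the pattern used by the
 * set*uid() family: proc_crmod_enter() takes p->p_lock, a duplicate
 * credential is modified, and proc_crmod_leave() installs it, drops
 * the lock and frees the old credential.
 */
#if 0
	kauth_cred_t ncred, ocred;

	proc_crmod_enter();
	ocred = p->p_cred;
	ncred = kauth_cred_dup(ocred);
	/* ... modify ncred, e.g. kauth_cred_setuid(ncred, uid) ... */
	proc_crmod_leave(ncred, ocred, true);
#endif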
1875
1876/*
1877 * proc_specific_key_create --
1878 *	Create a key for subsystem proc-specific data.
1879 */
1880int
1881proc_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
1882{
1883
1884	return (specificdata_key_create(proc_specificdata_domain, keyp, dtor));
1885}
1886
1887/*
1888 * proc_specific_key_delete --
1889 *	Delete a key for subsystem proc-specific data.
1890 */
1891void
1892proc_specific_key_delete(specificdata_key_t key)
1893{
1894
1895	specificdata_key_delete(proc_specificdata_domain, key);
1896}
1897
1898/*
1899 * proc_initspecific --
1900 *	Initialize a proc's specificdata container.
1901 */
1902void
1903proc_initspecific(struct proc *p)
1904{
1905	int error __diagused;
1906
1907	error = specificdata_init(proc_specificdata_domain, &p->p_specdataref);
1908	KASSERT(error == 0);
1909}
1910
1911/*
1912 * proc_finispecific --
1913 *	Finalize a proc's specificdata container.
1914 */
1915void
1916proc_finispecific(struct proc *p)
1917{
1918
1919	specificdata_fini(proc_specificdata_domain, &p->p_specdataref);
1920}
1921
1922/*
1923 * proc_getspecific --
1924 *	Return proc-specific data corresponding to the specified key.
1925 */
1926void *
1927proc_getspecific(struct proc *p, specificdata_key_t key)
1928{
1929
1930	return (specificdata_getspecific(proc_specificdata_domain,
1931					 &p->p_specdataref, key));
1932}
1933
1934/*
1935 * proc_setspecific --
1936 *	Set proc-specific data corresponding to the specified key.
1937 */
1938void
1939proc_setspecific(struct proc *p, specificdata_key_t key, void *data)
1940{
1941
1942	specificdata_setspecific(proc_specificdata_domain,
1943				 &p->p_specdataref, key, data);
1944}
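
/*
 * Illustrative sketch (not compiled): a subsystem attaches its own
 * per-process data by creating a key once at initialization time and
 * then using the accessors above; "example_key" and "example_dtor"
 * are hypothetical names.
 */
#if 0
static specificdata_key_t example_key;

	/* at subsystem initialization */
	error = proc_specific_key_create(&example_key, example_dtor);

	/* per process */
	proc_setspecific(p, example_key, data);
	data = proc_getspecific(p, example_key);
#endif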
1945
1946int
1947proc_uidmatch(kauth_cred_t cred, kauth_cred_t target)
1948{
1949	int r = 0;
1950
1951	if (kauth_cred_getuid(cred) != kauth_cred_getuid(target) ||
1952	    kauth_cred_getuid(cred) != kauth_cred_getsvuid(target)) {
1953		/*
1954		 * suid proc of ours or proc not ours
1955		 */
1956		r = EPERM;
1957	} else if (kauth_cred_getgid(target) != kauth_cred_getsvgid(target)) {
1958		/*
1959		 * sgid proc has sgid back to us temporarily
1960		 */
1961		r = EPERM;
1962	} else {
1963		/*
1964		 * our rgid must be in target's group list (ie,
1965		 * sub-processes started by a sgid process)
1966		 */
1967		int ismember = 0;
1968
1969		if (kauth_cred_ismember_gid(cred,
1970		    kauth_cred_getgid(target), &ismember) != 0 ||
1971		    !ismember)
1972			r = EPERM;
1973	}
1974
1975	return (r);
1976}
1977
1978/*
1979 * sysctl stuff
1980 */
1981
1982#define KERN_PROCSLOP	(5 * sizeof(struct kinfo_proc))
1983
1984static const u_int sysctl_flagmap[] = {
1985	PK_ADVLOCK, P_ADVLOCK,
1986	PK_EXEC, P_EXEC,
1987	PK_NOCLDWAIT, P_NOCLDWAIT,
1988	PK_32, P_32,
1989	PK_CLDSIGIGN, P_CLDSIGIGN,
1990	PK_SUGID, P_SUGID,
1991	0
1992};
1993
1994static const u_int sysctl_sflagmap[] = {
1995	PS_NOCLDSTOP, P_NOCLDSTOP,
1996	PS_WEXIT, P_WEXIT,
1997	PS_STOPFORK, P_STOPFORK,
1998	PS_STOPEXEC, P_STOPEXEC,
1999	PS_STOPEXIT, P_STOPEXIT,
2000	0
2001};
2002
2003static const u_int sysctl_slflagmap[] = {
2004	PSL_TRACED, P_TRACED,
2005	PSL_CHTRACED, P_CHTRACED,
2006	PSL_SYSCALL, P_SYSCALL,
2007	0
2008};
2009
2010static const u_int sysctl_lflagmap[] = {
2011	PL_CONTROLT, P_CONTROLT,
2012	PL_PPWAIT, P_PPWAIT,
2013	0
2014};
2015
2016static const u_int sysctl_stflagmap[] = {
2017	PST_PROFIL, P_PROFIL,
2018	0
2020};
2021
2022/* used by kern_lwp also */
2023const u_int sysctl_lwpflagmap[] = {
2024	LW_SINTR, L_SINTR,
2025	LW_SYSTEM, L_SYSTEM,
2026	0
2027};
2028
2029/*
2030 * Find the most ``active'' lwp of a process and return it for ps display
2031 * purposes
2032 */
2033static struct lwp *
2034proc_active_lwp(struct proc *p)
2035{
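	/*
	 * Ranking of LWP states for display purposes: a higher value is
	 * considered more "active".
	 */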
2036	static const int ostat[] = {
2037		0,
2038		2,	/* LSIDL */
2039		6,	/* LSRUN */
2040		5,	/* LSSLEEP */
2041		4,	/* LSSTOP */
2042		0,	/* LSZOMB */
2043		1,	/* LSDEAD */
2044		7,	/* LSONPROC */
2045		3	/* LSSUSPENDED */
2046	};
2047
2048	struct lwp *l, *lp = NULL;
2049	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
2050		KASSERT(l->l_stat >= 0);
2051		KASSERT(l->l_stat < __arraycount(ostat));
2052		if (lp == NULL ||
2053		    ostat[l->l_stat] > ostat[lp->l_stat] ||
2054		    (ostat[l->l_stat] == ostat[lp->l_stat] &&
2055		    l->l_cpticks > lp->l_cpticks)) {
2056			lp = l;
2057			continue;
2058		}
2059	}
2060	return lp;
2061}
2062
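/*
 * sysctl helper routine for KERN_PROC and KERN_PROC2: walk the zombie
 * and allproc lists and copy out a kinfo_proc or kinfo_proc2 record for
 * every process that matches the requested filter.
 */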
2063static int
2064sysctl_doeproc(SYSCTLFN_ARGS)
2065{
2066	union {
2067		struct kinfo_proc kproc;
2068		struct kinfo_proc2 kproc2;
2069	} *kbuf;
2070	struct proc *p, *next, *marker;
2071	char *where, *dp;
2072	int type, op, arg, error;
2073	u_int elem_size, kelem_size, elem_count;
2074	size_t buflen, needed;
2075	bool match, zombie, mmmbrains;
2076	const bool allowaddr = get_expose_address(curproc);
2077
2078	if (namelen == 1 && name[0] == CTL_QUERY)
2079		return (sysctl_query(SYSCTLFN_CALL(rnode)));
2080
2081	dp = where = oldp;
2082	buflen = where != NULL ? *oldlenp : 0;
2083	error = 0;
2084	needed = 0;
2085	type = rnode->sysctl_num;
2086
2087	if (type == KERN_PROC) {
2088		if (namelen == 0)
2089			return EINVAL;
2090		switch (op = name[0]) {
2091		case KERN_PROC_ALL:
2092			if (namelen != 1)
2093				return EINVAL;
2094			arg = 0;
2095			break;
2096		default:
2097			if (namelen != 2)
2098				return EINVAL;
2099			arg = name[1];
2100			break;
2101		}
2102		elem_count = 0;	/* Hush little compiler, don't you cry */
2103		kelem_size = elem_size = sizeof(kbuf->kproc);
2104	} else {
2105		if (namelen != 4)
2106			return EINVAL;
2107		op = name[0];
2108		arg = name[1];
2109		elem_size = name[2];
2110		elem_count = name[3];
2111		kelem_size = sizeof(kbuf->kproc2);
2112	}
2113
2114	sysctl_unlock();
2115
2116	kbuf = kmem_zalloc(sizeof(*kbuf), KM_SLEEP);
2117	marker = kmem_alloc(sizeof(*marker), KM_SLEEP);
2118	marker->p_flag = PK_MARKER;
2119
2120	mutex_enter(&proc_lock);
2121	/*
2122	 * Start with zombies to prevent reporting processes twice, in case they
2123	 * are dying and being moved from the list of alive processes to zombies.
2124	 */
2125	mmmbrains = true;
2126	for (p = LIST_FIRST(&zombproc);; p = next) {
2127		if (p == NULL) {
2128			if (mmmbrains) {
2129				p = LIST_FIRST(&allproc);
2130				mmmbrains = false;
2131			}
2132			if (p == NULL)
2133				break;
2134		}
2135		next = LIST_NEXT(p, p_list);
2136		if ((p->p_flag & PK_MARKER) != 0)
2137			continue;
2138
2139		/*
2140		 * Skip embryonic processes.
2141		 */
2142		if (p->p_stat == SIDL)
2143			continue;
2144
2145		mutex_enter(p->p_lock);
2146		error = kauth_authorize_process(l->l_cred,
2147		    KAUTH_PROCESS_CANSEE, p,
2148		    KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_EPROC), NULL, NULL);
2149		if (error != 0) {
2150			mutex_exit(p->p_lock);
2151			continue;
2152		}
2153
2154		/*
2155		 * Handling all the operations in one switch, at the cost of
2156		 * algorithmic complexity, is deliberate.  Splitting this
2157		 * function into several near-identical copies would add
2158		 * maintenance burden and code size, while the speedup would
2159		 * be negligible on practical systems.
2160		 */
2161		switch (op) {
2162		case KERN_PROC_PID:
2163			match = (p->p_pid == (pid_t)arg);
2164			break;
2165
2166		case KERN_PROC_PGRP:
2167			match = (p->p_pgrp->pg_id == (pid_t)arg);
2168			break;
2169
2170		case KERN_PROC_SESSION:
2171			match = (p->p_session->s_sid == (pid_t)arg);
2172			break;
2173
2174		case KERN_PROC_TTY:
2175			match = true;
2176			if (arg == (int) KERN_PROC_TTY_REVOKE) {
2177				if ((p->p_lflag & PL_CONTROLT) == 0 ||
2178				    p->p_session->s_ttyp == NULL ||
2179				    p->p_session->s_ttyvp != NULL) {
2180				    	match = false;
2181				}
2182			} else if ((p->p_lflag & PL_CONTROLT) == 0 ||
2183			    p->p_session->s_ttyp == NULL) {
2184				if ((dev_t)arg != KERN_PROC_TTY_NODEV) {
2185					match = false;
2186				}
2187			} else if (p->p_session->s_ttyp->t_dev != (dev_t)arg) {
2188				match = false;
2189			}
2190			break;
2191
2192		case KERN_PROC_UID:
2193			match = (kauth_cred_geteuid(p->p_cred) == (uid_t)arg);
2194			break;
2195
2196		case KERN_PROC_RUID:
2197			match = (kauth_cred_getuid(p->p_cred) == (uid_t)arg);
2198			break;
2199
2200		case KERN_PROC_GID:
2201			match = (kauth_cred_getegid(p->p_cred) == (gid_t)arg);
2202			break;
2203
2204		case KERN_PROC_RGID:
2205			match = (kauth_cred_getgid(p->p_cred) == (gid_t)arg);
2206			break;
2207
2208		case KERN_PROC_ALL:
2209			match = true;
2210			/* allow everything */
2211			break;
2212
2213		default:
2214			error = EINVAL;
2215			mutex_exit(p->p_lock);
2216			goto cleanup;
2217		}
2218		if (!match) {
2219			mutex_exit(p->p_lock);
2220			continue;
2221		}
2222
2223		/*
2224		 * Grab a hold on the process.
2225		 */
2226		if (mmmbrains) {
2227			zombie = true;
2228		} else {
2229			zombie = !rw_tryenter(&p->p_reflock, RW_READER);
2230		}
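		/*
		 * If we could not take a hold on the process (it is a
		 * zombie, or its reference lock is held for writing because
		 * it is being destroyed), insert a marker after it so that
		 * we can find our place in the list again after proc_lock
		 * is dropped while copying out.
		 */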
2231		if (zombie) {
2232			LIST_INSERT_AFTER(p, marker, p_list);
2233		}
2234
2235		if (buflen >= elem_size &&
2236		    (type == KERN_PROC || elem_count > 0)) {
2237			ruspace(p);	/* Update process vm resource use */
2238
2239			if (type == KERN_PROC) {
2240				fill_proc(p, &kbuf->kproc.kp_proc, allowaddr);
2241				fill_eproc(p, &kbuf->kproc.kp_eproc, zombie,
2242				    allowaddr);
2243			} else {
2244				fill_kproc2(p, &kbuf->kproc2, zombie,
2245				    allowaddr);
2246				elem_count--;
2247			}
2248			mutex_exit(p->p_lock);
2249			mutex_exit(&proc_lock);
2250			/*
2251			 * Copy out elem_size bytes, but no more than kelem_size.
2252			 */
2253			error = sysctl_copyout(l, kbuf, dp,
2254			    uimin(kelem_size, elem_size));
2255			mutex_enter(&proc_lock);
2256			if (error) {
2257				goto bah;
2258			}
2259			dp += elem_size;
2260			buflen -= elem_size;
2261		} else {
2262			mutex_exit(p->p_lock);
2263		}
2264		needed += elem_size;
2265
2266		/*
2267		 * Release reference to process.
2268		 */
2269	 	if (zombie) {
2270			next = LIST_NEXT(marker, p_list);
2271 			LIST_REMOVE(marker, p_list);
2272		} else {
2273			rw_exit(&p->p_reflock);
2274			next = LIST_NEXT(p, p_list);
2275		}
2276
2277		/*
2278		 * A PID matches at most one process, so stop scanning early.
2279		 */
2280		if (op == KERN_PROC_PID)
2281			break;
2282	}
2283	mutex_exit(&proc_lock);
2284
2285	if (where != NULL) {
2286		*oldlenp = dp - where;
2287		if (needed > *oldlenp) {
2288			error = ENOMEM;
2289			goto out;
2290		}
2291	} else {
2292		needed += KERN_PROCSLOP;
2293		*oldlenp = needed;
2294	}
2295	kmem_free(kbuf, sizeof(*kbuf));
2296	kmem_free(marker, sizeof(*marker));
2297	sysctl_relock();
2298	return 0;
2299 bah:
2300 	if (zombie)
2301 		LIST_REMOVE(marker, p_list);
2302	else
2303		rw_exit(&p->p_reflock);
2304 cleanup:
2305	mutex_exit(&proc_lock);
2306 out:
2307	kmem_free(kbuf, sizeof(*kbuf));
2308	kmem_free(marker, sizeof(*marker));
2309	sysctl_relock();
2310	return error;
2311}
2312
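/*
 * copyin_psstrings --
 *	Read the ps_strings structure from the target process's address
 *	space, dispatching to the 32-bit compat hook for PK_32 processes.
 */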
2313int
2314copyin_psstrings(struct proc *p, struct ps_strings *arginfo)
2315{
2316#if !defined(_RUMPKERNEL)
2317	int retval;
2318
2319	if (p->p_flag & PK_32) {
2320		MODULE_HOOK_CALL(kern_proc32_copyin_hook, (p, arginfo),
2321		    enosys(), retval);
2322		return retval;
2323	}
2324#endif /* !defined(_RUMPKERNEL) */
2325
2326	return copyin_proc(p, (void *)p->p_psstrp, arginfo, sizeof(*arginfo));
2327}
2328
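/*
 * Callback for copy_procargs(): copy one chunk of argument data out to
 * the requesting process with sysctl_copyout().
 */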
2329static int
2330copy_procargs_sysctl_cb(void *cookie_, const void *src, size_t off, size_t len)
2331{
2332	void **cookie = cookie_;
2333	struct lwp *l = cookie[0];
2334	char *dst = cookie[1];
2335
2336	return sysctl_copyout(l, src, dst + off, len);
2337}
2338
2339/*
2340 * sysctl helper routine for kern.proc_args pseudo-subtree.
2341 */
2342static int
2343sysctl_kern_proc_args(SYSCTLFN_ARGS)
2344{
2345	struct ps_strings pss;
2346	struct proc *p;
2347	pid_t pid;
2348	int type, error;
2349	void *cookie[2];
2350
2351	if (namelen == 1 && name[0] == CTL_QUERY)
2352		return (sysctl_query(SYSCTLFN_CALL(rnode)));
2353
2354	if (newp != NULL || namelen != 2)
2355		return (EINVAL);
2356	pid = name[0];
2357	type = name[1];
2358
2359	switch (type) {
2360	case KERN_PROC_PATHNAME:
2361		sysctl_unlock();
2362		error = fill_pathname(l, pid, oldp, oldlenp);
2363		sysctl_relock();
2364		return error;
2365
2366	case KERN_PROC_CWD:
2367		sysctl_unlock();
2368		error = fill_cwd(l, pid, oldp, oldlenp);
2369		sysctl_relock();
2370		return error;
2371
2372	case KERN_PROC_ARGV:
2373	case KERN_PROC_NARGV:
2374	case KERN_PROC_ENV:
2375	case KERN_PROC_NENV:
2376		/* ok */
2377		break;
2378	default:
2379		return (EINVAL);
2380	}
2381
2382	sysctl_unlock();
2383
2384	/* check pid */
2385	mutex_enter(&proc_lock);
2386	if ((p = proc_find(pid)) == NULL) {
2387		error = EINVAL;
2388		goto out_locked;
2389	}
2390	mutex_enter(p->p_lock);
2391
2392	/* Check permission. */
2393	if (type == KERN_PROC_ARGV || type == KERN_PROC_NARGV)
2394		error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE,
2395		    p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ARGS), NULL, NULL);
2396	else if (type == KERN_PROC_ENV || type == KERN_PROC_NENV)
2397		error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE,
2398		    p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENV), NULL, NULL);
2399	else
2400		error = EINVAL; /* XXXGCC */
2401	if (error) {
2402		mutex_exit(p->p_lock);
2403		goto out_locked;
2404	}
2405
2406	if (oldp == NULL) {
2407		if (type == KERN_PROC_NARGV || type == KERN_PROC_NENV)
2408			*oldlenp = sizeof (int);
2409		else
2410			*oldlenp = ARG_MAX;	/* XXX XXX XXX */
2411		error = 0;
2412		mutex_exit(p->p_lock);
2413		goto out_locked;
2414	}
2415
2416	/*
2417	 * Zombies don't have a stack, so we can't read their psstrings.
2418	 * System processes also don't have a user stack.
2419	 */
2420	if (P_ZOMBIE(p) || (p->p_flag & PK_SYSTEM) != 0) {
2421		error = EINVAL;
2422		mutex_exit(p->p_lock);
2423		goto out_locked;
2424	}
2425
2426	error = rw_tryenter(&p->p_reflock, RW_READER) ? 0 : EBUSY;
2427	mutex_exit(p->p_lock);
2428	if (error) {
2429		goto out_locked;
2430	}
2431	mutex_exit(&proc_lock);
2432
2433	if (type == KERN_PROC_NARGV || type == KERN_PROC_NENV) {
2434		int value;
2435		if ((error = copyin_psstrings(p, &pss)) == 0) {
2436			if (type == KERN_PROC_NARGV)
2437				value = pss.ps_nargvstr;
2438			else
2439				value = pss.ps_nenvstr;
2440			error = sysctl_copyout(l, &value, oldp, sizeof(value));
2441			*oldlenp = sizeof(value);
2442		}
2443	} else {
2444		cookie[0] = l;
2445		cookie[1] = oldp;
2446		error = copy_procargs(p, type, oldlenp,
2447		    copy_procargs_sysctl_cb, cookie);
2448	}
2449	rw_exit(&p->p_reflock);
2450	sysctl_relock();
2451	return error;
2452
2453out_locked:
2454	mutex_exit(&proc_lock);
2455	sysctl_relock();
2456	return error;
2457}
2458
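/*
 * copy_procargs --
 *	Copy the argument or environment strings (selected by "oid") of
 *	process "p", calling "cb" once per chunk produced.  At most *limit
 *	bytes are emitted; on success *limit is updated to the number of
 *	bytes actually produced.
 */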
2459int
2460copy_procargs(struct proc *p, int oid, size_t *limit,
2461    int (*cb)(void *, const void *, size_t, size_t), void *cookie)
2462{
2463	struct ps_strings pss;
2464	size_t len, i, loaded, entry_len;
2465	struct uio auio;
2466	struct iovec aiov;
2467	int error, argvlen;
2468	char *arg;
2469	char **argv;
2470	vaddr_t user_argv;
2471	struct vmspace *vmspace;
2472
2473	/*
2474	 * Allocate a temporary buffer to hold the argument vector and
2475	 * the arguments themselves.
2476	 */
2477	arg = kmem_alloc(PAGE_SIZE, KM_SLEEP);
2478	argv = kmem_alloc(PAGE_SIZE, KM_SLEEP);
2479
2480	/*
2481	 * Take a reference on the process's address space so it cannot go away.
2482	 */
2483	vmspace = p->p_vmspace;
2484	uvmspace_addref(vmspace);
2485
2486	/*
2487	 * Read in the ps_strings structure.
2488	 */
2489	if ((error = copyin_psstrings(p, &pss)) != 0)
2490		goto done;
2491
2492	/*
2493	 * Now read the address of the argument vector.
2494	 */
2495	switch (oid) {
2496	case KERN_PROC_ARGV:
2497		user_argv = (uintptr_t)pss.ps_argvstr;
2498		argvlen = pss.ps_nargvstr;
2499		break;
2500	case KERN_PROC_ENV:
2501		user_argv = (uintptr_t)pss.ps_envstr;
2502		argvlen = pss.ps_nenvstr;
2503		break;
2504	default:
2505		error = EINVAL;
2506		goto done;
2507	}
2508
2509	if (argvlen < 0) {
2510		error = EIO;
2511		goto done;
2512	}
2513
2514
2515	/*
2516	 * Now copy each string.
2517	 */
2518	len = 0; /* bytes written to user buffer */
2519	loaded = 0; /* bytes from argv already processed */
2520	i = 0; /* To make compiler happy */
2521	entry_len = PROC_PTRSZ(p);
2522
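	/*
	 * Walk the argument vector.  The pointer array is copied in from
	 * the target process one page at a time ("loaded" counts how many
	 * bytes of the current chunk are still unconsumed), and each string
	 * it points to is then read page by page until its terminating NUL
	 * or until the output limit is reached.
	 */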
2523	for (; argvlen; --argvlen) {
2524		int finished = 0;
2525		vaddr_t base;
2526		size_t xlen;
2527		int j;
2528
2529		if (loaded == 0) {
2530			size_t rem = entry_len * argvlen;
2531			loaded = MIN(rem, PAGE_SIZE);
2532			error = copyin_vmspace(vmspace,
2533			    (const void *)user_argv, argv, loaded);
2534			if (error)
2535				break;
2536			user_argv += loaded;
2537			i = 0;
2538		}
2539
2540#if !defined(_RUMPKERNEL)
2541		if (p->p_flag & PK_32)
2542			MODULE_HOOK_CALL(kern_proc32_base_hook,
2543			    (argv, i++), 0, base);
2544		else
2545#endif /* !defined(_RUMPKERNEL) */
2546			base = (vaddr_t)argv[i++];
2547		loaded -= entry_len;
2548
2549		/*
2550		 * The program has messed around with its arguments,
2551		 * possibly deleting some and replacing them with
2552		 * NULLs.  Treat this as the last argument and not
2553		 * a failure.
2554		 */
2555		if (base == 0)
2556			break;
2557
2558		while (!finished) {
2559			xlen = PAGE_SIZE - (base & PAGE_MASK);
2560
2561			aiov.iov_base = arg;
2562			aiov.iov_len = PAGE_SIZE;
2563			auio.uio_iov = &aiov;
2564			auio.uio_iovcnt = 1;
2565			auio.uio_offset = base;
2566			auio.uio_resid = xlen;
2567			auio.uio_rw = UIO_READ;
2568			UIO_SETUP_SYSSPACE(&auio);
2569			error = uvm_io(&vmspace->vm_map, &auio, 0);
2570			if (error)
2571				goto done;
2572
2573			/* Look for the end of the string */
2574			for (j = 0; j < xlen; j++) {
2575				if (arg[j] == '\0') {
2576					xlen = j + 1;
2577					finished = 1;
2578					break;
2579				}
2580			}
2581
2582			/* Check for user buffer overflow */
2583			if (len + xlen > *limit) {
2584				finished = 1;
2585				if (len > *limit)
2586					xlen = 0;
2587				else
2588					xlen = *limit - len;
2589			}
2590
2591			/* Copyout the page */
2592			error = (*cb)(cookie, arg, len, xlen);
2593			if (error)
2594				goto done;
2595
2596			len += xlen;
2597			base += xlen;
2598		}
2599	}
2600	*limit = len;
2601
2602done:
2603	kmem_free(argv, PAGE_SIZE);
2604	kmem_free(arg, PAGE_SIZE);
2605	uvmspace_free(vmspace);
2606	return error;
2607}
2608
2609/*
2610 * Fill in a proc structure for the specified process.
2611 */
2612static void
2613fill_proc(const struct proc *psrc, struct proc *p, bool allowaddr)
2614{
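	/*
	 * Kernel pointers are only exposed when "allowaddr" is true (see
	 * the COND_SET_* macros); locks and other purely kernel-internal
	 * members are zeroed instead.
	 */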
2615	COND_SET_STRUCT(p->p_list, psrc->p_list, allowaddr);
2616	memset(&p->p_auxlock, 0, sizeof(p->p_auxlock));
2617	COND_SET_STRUCT(p->p_lock, psrc->p_lock, allowaddr);
2618	memset(&p->p_stmutex, 0, sizeof(p->p_stmutex));
2619	memset(&p->p_reflock, 0, sizeof(p->p_reflock));
2620	COND_SET_STRUCT(p->p_waitcv, psrc->p_waitcv, allowaddr);
2621	COND_SET_STRUCT(p->p_lwpcv, psrc->p_lwpcv, allowaddr);
2622	COND_SET_PTR(p->p_cred, psrc->p_cred, allowaddr);
2623	COND_SET_PTR(p->p_fd, psrc->p_fd, allowaddr);
2624	COND_SET_PTR(p->p_cwdi, psrc->p_cwdi, allowaddr);
2625	COND_SET_PTR(p->p_stats, psrc->p_stats, allowaddr);
2626	COND_SET_PTR(p->p_limit, psrc->p_limit, allowaddr);
2627	COND_SET_PTR(p->p_vmspace, psrc->p_vmspace, allowaddr);
2628	COND_SET_PTR(p->p_sigacts, psrc->p_sigacts, allowaddr);
2629	COND_SET_PTR(p->p_aio, psrc->p_aio, allowaddr);
2630	p->p_mqueue_cnt = psrc->p_mqueue_cnt;
2631	memset(&p->p_specdataref, 0, sizeof(p->p_specdataref));
2632	p->p_exitsig = psrc->p_exitsig;
2633	p->p_flag = psrc->p_flag;
2634	p->p_sflag = psrc->p_sflag;
2635	p->p_slflag = psrc->p_slflag;
2636	p->p_lflag = psrc->p_lflag;
2637	p->p_stflag = psrc->p_stflag;
2638	p->p_stat = psrc->p_stat;
2639	p->p_trace_enabled = psrc->p_trace_enabled;
2640	p->p_pid = psrc->p_pid;
2641	COND_SET_STRUCT(p->p_pglist, psrc->p_pglist, allowaddr);
2642	COND_SET_PTR(p->p_pptr, psrc->p_pptr, allowaddr);
2643	COND_SET_STRUCT(p->p_sibling, psrc->p_sibling, allowaddr);
2644	COND_SET_STRUCT(p->p_children, psrc->p_children, allowaddr);
2645	COND_SET_STRUCT(p->p_lwps, psrc->p_lwps, allowaddr);
2646	COND_SET_PTR(p->p_raslist, psrc->p_raslist, allowaddr);
2647	p->p_nlwps = psrc->p_nlwps;
2648	p->p_nzlwps = psrc->p_nzlwps;
2649	p->p_nrlwps = psrc->p_nrlwps;
2650	p->p_nlwpwait = psrc->p_nlwpwait;
2651	p->p_ndlwps = psrc->p_ndlwps;
2652	p->p_nstopchild = psrc->p_nstopchild;
2653	p->p_waited = psrc->p_waited;
2654	COND_SET_PTR(p->p_zomblwp, psrc->p_zomblwp, allowaddr);
2655	COND_SET_PTR(p->p_vforklwp, psrc->p_vforklwp, allowaddr);
2656	COND_SET_PTR(p->p_sched_info, psrc->p_sched_info, allowaddr);
2657	p->p_estcpu = psrc->p_estcpu;
2658	p->p_estcpu_inherited = psrc->p_estcpu_inherited;
2659	p->p_forktime = psrc->p_forktime;
2660	p->p_pctcpu = psrc->p_pctcpu;
2661	COND_SET_PTR(p->p_opptr, psrc->p_opptr, allowaddr);
2662	COND_SET_PTR(p->p_timers, psrc->p_timers, allowaddr);
2663	p->p_rtime = psrc->p_rtime;
2664	p->p_uticks = psrc->p_uticks;
2665	p->p_sticks = psrc->p_sticks;
2666	p->p_iticks = psrc->p_iticks;
2667	p->p_xutime = psrc->p_xutime;
2668	p->p_xstime = psrc->p_xstime;
2669	p->p_traceflag = psrc->p_traceflag;
2670	COND_SET_PTR(p->p_tracep, psrc->p_tracep, allowaddr);
2671	COND_SET_PTR(p->p_textvp, psrc->p_textvp, allowaddr);
2672	COND_SET_PTR(p->p_emul, psrc->p_emul, allowaddr);
2673	COND_SET_PTR(p->p_emuldata, psrc->p_emuldata, allowaddr);
2674	COND_SET_CPTR(p->p_execsw, psrc->p_execsw, allowaddr);
2675	COND_SET_STRUCT(p->p_klist, psrc->p_klist, allowaddr);
2676	COND_SET_STRUCT(p->p_sigwaiters, psrc->p_sigwaiters, allowaddr);
2677	COND_SET_STRUCT(p->p_sigpend.sp_info, psrc->p_sigpend.sp_info,
2678	    allowaddr);
2679	p->p_sigpend.sp_set = psrc->p_sigpend.sp_set;
2680	COND_SET_PTR(p->p_lwpctl, psrc->p_lwpctl, allowaddr);
2681	p->p_ppid = psrc->p_ppid;
2682	p->p_oppid = psrc->p_oppid;
2683	COND_SET_PTR(p->p_path, psrc->p_path, allowaddr);
2684	p->p_sigctx = psrc->p_sigctx;
2685	p->p_nice = psrc->p_nice;
2686	memcpy(p->p_comm, psrc->p_comm, sizeof(p->p_comm));
2687	COND_SET_PTR(p->p_pgrp, psrc->p_pgrp, allowaddr);
2688	COND_SET_VALUE(p->p_psstrp, psrc->p_psstrp, allowaddr);
2689	p->p_pax = psrc->p_pax;
2690	p->p_xexit = psrc->p_xexit;
2691	p->p_xsig = psrc->p_xsig;
2692	p->p_acflag = psrc->p_acflag;
2693	COND_SET_STRUCT(p->p_md, psrc->p_md, allowaddr);
2694	p->p_stackbase = psrc->p_stackbase;
2695	COND_SET_PTR(p->p_dtrace, psrc->p_dtrace, allowaddr);
2696}
2697
2698/*
2699 * Fill in an eproc structure for the specified process.
2700 */
2701void
2702fill_eproc(struct proc *p, struct eproc *ep, bool zombie, bool allowaddr)
2703{
2704	struct tty *tp;
2705	struct lwp *l;
2706
2707	KASSERT(mutex_owned(&proc_lock));
2708	KASSERT(mutex_owned(p->p_lock));
2709
2710	COND_SET_PTR(ep->e_paddr, p, allowaddr);
2711	COND_SET_PTR(ep->e_sess, p->p_session, allowaddr);
2712	if (p->p_cred) {
2713		kauth_cred_topcred(p->p_cred, &ep->e_pcred);
2714		kauth_cred_toucred(p->p_cred, &ep->e_ucred);
2715	}
2716	if (p->p_stat != SIDL && !P_ZOMBIE(p) && !zombie) {
2717		struct vmspace *vm = p->p_vmspace;
2718
2719		ep->e_vm.vm_rssize = vm_resident_count(vm);
2720		ep->e_vm.vm_tsize = vm->vm_tsize;
2721		ep->e_vm.vm_dsize = vm->vm_dsize;
2722		ep->e_vm.vm_ssize = vm->vm_ssize;
2723		ep->e_vm.vm_map.size = vm->vm_map.size;
2724
2725		/* Pick the most active LWP for display purposes. */
2726		l = proc_active_lwp(p);
2727		KASSERT(l != NULL);
2728		lwp_lock(l);
2729		if (l->l_wchan)
2730			strncpy(ep->e_wmesg, l->l_wmesg, WMESGLEN);
2731		lwp_unlock(l);
2732	}
2733	ep->e_ppid = p->p_ppid;
2734	if (p->p_pgrp && p->p_session) {
2735		ep->e_pgid = p->p_pgrp->pg_id;
2736		ep->e_jobc = p->p_pgrp->pg_jobc;
2737		ep->e_sid = p->p_session->s_sid;
2738		if ((p->p_lflag & PL_CONTROLT) &&
2739		    (tp = p->p_session->s_ttyp)) {
2740			ep->e_tdev = tp->t_dev;
2741			ep->e_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PGID;
2742			COND_SET_PTR(ep->e_tsess, tp->t_session, allowaddr);
2743		} else
2744			ep->e_tdev = (uint32_t)NODEV;
2745		ep->e_flag = p->p_session->s_ttyvp ? EPROC_CTTY : 0;
2746		if (SESS_LEADER(p))
2747			ep->e_flag |= EPROC_SLEADER;
2748		strncpy(ep->e_login, p->p_session->s_login, MAXLOGNAME);
2749	}
2750	ep->e_xsize = ep->e_xrssize = 0;
2751	ep->e_xccount = ep->e_xswrss = 0;
2752}
2753
2754/*
2755 * Fill in a kinfo_proc2 structure for the specified process.
2756 */
2757void
2758fill_kproc2(struct proc *p, struct kinfo_proc2 *ki, bool zombie, bool allowaddr)
2759{
2760	struct tty *tp;
2761	struct lwp *l;
2762	struct timeval ut, st, rt;
2763	sigset_t ss1, ss2;
2764	struct rusage ru;
2765	struct vmspace *vm;
2766
2767	KASSERT(mutex_owned(&proc_lock));
2768	KASSERT(mutex_owned(p->p_lock));
2769
2770	sigemptyset(&ss1);
2771	sigemptyset(&ss2);
2772
2773	COND_SET_VALUE(ki->p_paddr, PTRTOUINT64(p), allowaddr);
2774	COND_SET_VALUE(ki->p_fd, PTRTOUINT64(p->p_fd), allowaddr);
2775	COND_SET_VALUE(ki->p_cwdi, PTRTOUINT64(p->p_cwdi), allowaddr);
2776	COND_SET_VALUE(ki->p_stats, PTRTOUINT64(p->p_stats), allowaddr);
2777	COND_SET_VALUE(ki->p_limit, PTRTOUINT64(p->p_limit), allowaddr);
2778	COND_SET_VALUE(ki->p_vmspace, PTRTOUINT64(p->p_vmspace), allowaddr);
2779	COND_SET_VALUE(ki->p_sigacts, PTRTOUINT64(p->p_sigacts), allowaddr);
2780	COND_SET_VALUE(ki->p_sess, PTRTOUINT64(p->p_session), allowaddr);
2781	ki->p_tsess = 0;	/* may be changed if controlling tty below */
2782	COND_SET_VALUE(ki->p_ru, PTRTOUINT64(&p->p_stats->p_ru), allowaddr);
2783	ki->p_eflag = 0;
2784	ki->p_exitsig = p->p_exitsig;
2785	ki->p_flag = L_INMEM;   /* Process never swapped out */
2786	ki->p_flag |= sysctl_map_flags(sysctl_flagmap, p->p_flag);
2787	ki->p_flag |= sysctl_map_flags(sysctl_sflagmap, p->p_sflag);
2788	ki->p_flag |= sysctl_map_flags(sysctl_slflagmap, p->p_slflag);
2789	ki->p_flag |= sysctl_map_flags(sysctl_lflagmap, p->p_lflag);
2790	ki->p_flag |= sysctl_map_flags(sysctl_stflagmap, p->p_stflag);
2791	ki->p_pid = p->p_pid;
2792	ki->p_ppid = p->p_ppid;
2793	ki->p_uid = kauth_cred_geteuid(p->p_cred);
2794	ki->p_ruid = kauth_cred_getuid(p->p_cred);
2795	ki->p_gid = kauth_cred_getegid(p->p_cred);
2796	ki->p_rgid = kauth_cred_getgid(p->p_cred);
2797	ki->p_svuid = kauth_cred_getsvuid(p->p_cred);
2798	ki->p_svgid = kauth_cred_getsvgid(p->p_cred);
2799	ki->p_ngroups = kauth_cred_ngroups(p->p_cred);
2800	kauth_cred_getgroups(p->p_cred, ki->p_groups,
2801	    uimin(ki->p_ngroups, sizeof(ki->p_groups) / sizeof(ki->p_groups[0])),
2802	    UIO_SYSSPACE);
2803
2804	ki->p_uticks = p->p_uticks;
2805	ki->p_sticks = p->p_sticks;
2806	ki->p_iticks = p->p_iticks;
2807	ki->p_tpgid = NO_PGID;	/* may be changed if controlling tty below */
2808	COND_SET_VALUE(ki->p_tracep, PTRTOUINT64(p->p_tracep), allowaddr);
2809	ki->p_traceflag = p->p_traceflag;
2810
2811	memcpy(&ki->p_sigignore, &p->p_sigctx.ps_sigignore,sizeof(ki_sigset_t));
2812	memcpy(&ki->p_sigcatch, &p->p_sigctx.ps_sigcatch, sizeof(ki_sigset_t));
2813
2814	ki->p_cpticks = 0;
2815	ki->p_pctcpu = p->p_pctcpu;
2816	ki->p_estcpu = 0;
2817	ki->p_stat = p->p_stat; /* Will likely be overridden by LWP status */
2818	ki->p_realstat = p->p_stat;
2819	ki->p_nice = p->p_nice;
2820	ki->p_xstat = P_WAITSTATUS(p);
2821	ki->p_acflag = p->p_acflag;
2822
2823	strncpy(ki->p_comm, p->p_comm,
2824	    uimin(sizeof(ki->p_comm), sizeof(p->p_comm)));
2825	strncpy(ki->p_ename, p->p_emul->e_name, sizeof(ki->p_ename));
2826
2827	ki->p_nlwps = p->p_nlwps;
2828	ki->p_realflag = ki->p_flag;
2829
2830	if (p->p_stat != SIDL && !P_ZOMBIE(p) && !zombie) {
2831		vm = p->p_vmspace;
2832		ki->p_vm_rssize = vm_resident_count(vm);
2833		ki->p_vm_tsize = vm->vm_tsize;
2834		ki->p_vm_dsize = vm->vm_dsize;
2835		ki->p_vm_ssize = vm->vm_ssize;
2836		ki->p_vm_vsize = atop(vm->vm_map.size);
2837		/*
2838		 * Since the stack is initially mapped mostly with
2839		 * PROT_NONE and grown as needed, adjust the "mapped size"
2840		 * to skip the unused stack portion.
2841		 */
2842		ki->p_vm_msize =
2843		    atop(vm->vm_map.size) - vm->vm_issize + vm->vm_ssize;
2844
2845		/* Pick the most active LWP for display purposes. */
2846		l = proc_active_lwp(p);
2847		KASSERT(l != NULL);
2848		lwp_lock(l);
2849		ki->p_nrlwps = p->p_nrlwps;
2850		ki->p_forw = 0;
2851		ki->p_back = 0;
2852		COND_SET_VALUE(ki->p_addr, PTRTOUINT64(l->l_addr), allowaddr);
2853		ki->p_stat = l->l_stat;
2854		ki->p_flag |= sysctl_map_flags(sysctl_lwpflagmap, l->l_flag);
2855		ki->p_swtime = l->l_swtime;
2856		ki->p_slptime = l->l_slptime;
2857		if (l->l_stat == LSONPROC)
2858			ki->p_schedflags = l->l_cpu->ci_schedstate.spc_flags;
2859		else
2860			ki->p_schedflags = 0;
2861		ki->p_priority = lwp_eprio(l);
2862		ki->p_usrpri = l->l_priority;
2863		if (l->l_wchan)
2864			strncpy(ki->p_wmesg, l->l_wmesg, sizeof(ki->p_wmesg));
2865		COND_SET_VALUE(ki->p_wchan, PTRTOUINT64(l->l_wchan), allowaddr);
2866		ki->p_cpuid = cpu_index(l->l_cpu);
2867		lwp_unlock(l);
2868		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
2869			/* This is hardly correct, but... */
2870			sigplusset(&l->l_sigpend.sp_set, &ss1);
2871			sigplusset(&l->l_sigmask, &ss2);
2872			ki->p_cpticks += l->l_cpticks;
2873			ki->p_pctcpu += l->l_pctcpu;
2874			ki->p_estcpu += l->l_estcpu;
2875		}
2876	}
2877	sigplusset(&p->p_sigpend.sp_set, &ss1);
2878	memcpy(&ki->p_siglist, &ss1, sizeof(ki_sigset_t));
2879	memcpy(&ki->p_sigmask, &ss2, sizeof(ki_sigset_t));
2880
2881	if (p->p_session != NULL) {
2882		ki->p_sid = p->p_session->s_sid;
2883		ki->p__pgid = p->p_pgrp->pg_id;
2884		if (p->p_session->s_ttyvp)
2885			ki->p_eflag |= EPROC_CTTY;
2886		if (SESS_LEADER(p))
2887			ki->p_eflag |= EPROC_SLEADER;
2888		strncpy(ki->p_login, p->p_session->s_login,
2889		    uimin(sizeof ki->p_login - 1, sizeof p->p_session->s_login));
2890		ki->p_jobc = p->p_pgrp->pg_jobc;
2891		if ((p->p_lflag & PL_CONTROLT) && (tp = p->p_session->s_ttyp)) {
2892			ki->p_tdev = tp->t_dev;
2893			ki->p_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PGID;
2894			COND_SET_VALUE(ki->p_tsess, PTRTOUINT64(tp->t_session),
2895			    allowaddr);
2896		} else {
2897			ki->p_tdev = (int32_t)NODEV;
2898		}
2899	}
2900
2901	if (!P_ZOMBIE(p) && !zombie) {
2902		ki->p_uvalid = 1;
2903		ki->p_ustart_sec = p->p_stats->p_start.tv_sec;
2904		ki->p_ustart_usec = p->p_stats->p_start.tv_usec;
2905
2906		calcru(p, &ut, &st, NULL, &rt);
2907		ki->p_rtime_sec = rt.tv_sec;
2908		ki->p_rtime_usec = rt.tv_usec;
2909		ki->p_uutime_sec = ut.tv_sec;
2910		ki->p_uutime_usec = ut.tv_usec;
2911		ki->p_ustime_sec = st.tv_sec;
2912		ki->p_ustime_usec = st.tv_usec;
2913
2914		memcpy(&ru, &p->p_stats->p_ru, sizeof(ru));
2915		rulwps(p, &ru);
2916		ki->p_uru_nvcsw = ru.ru_nvcsw;
2917		ki->p_uru_nivcsw = ru.ru_nivcsw;
2918		ki->p_uru_maxrss = ru.ru_maxrss;
2919		ki->p_uru_ixrss = ru.ru_ixrss;
2920		ki->p_uru_idrss = ru.ru_idrss;
2921		ki->p_uru_isrss = ru.ru_isrss;
2922		ki->p_uru_minflt = ru.ru_minflt;
2923		ki->p_uru_majflt = ru.ru_majflt;
2924		ki->p_uru_nswap = ru.ru_nswap;
2925		ki->p_uru_inblock = ru.ru_inblock;
2926		ki->p_uru_oublock = ru.ru_oublock;
2927		ki->p_uru_msgsnd = ru.ru_msgsnd;
2928		ki->p_uru_msgrcv = ru.ru_msgrcv;
2929		ki->p_uru_nsignals = ru.ru_nsignals;
2930
2931		timeradd(&p->p_stats->p_cru.ru_utime,
2932			 &p->p_stats->p_cru.ru_stime, &ut);
2933		ki->p_uctime_sec = ut.tv_sec;
2934		ki->p_uctime_usec = ut.tv_usec;
2935	}
2936}
2937
2938
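/*
 * proc_find_locked --
 *	Look up a process by PID (or use the caller's own process when
 *	pid == -1) and check KAUTH_REQ_PROCESS_CANSEE_ENTRY.  On success
 *	*p is returned with its p_lock held, except in the pid == -1 case
 *	where no lock is taken.
 */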
2939int
2940proc_find_locked(struct lwp *l, struct proc **p, pid_t pid)
2941{
2942	int error;
2943
2944	mutex_enter(&proc_lock);
2945	if (pid == -1)
2946		*p = l->l_proc;
2947	else
2948		*p = proc_find(pid);
2949
2950	if (*p == NULL) {
2951		if (pid != -1)
2952			mutex_exit(&proc_lock);
2953		return ESRCH;
2954	}
2955	if (pid != -1)
2956		mutex_enter((*p)->p_lock);
2957	mutex_exit(&proc_lock);
2958
2959	error = kauth_authorize_process(l->l_cred,
2960	    KAUTH_PROCESS_CANSEE, *p,
2961	    KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
2962	if (error) {
2963		if (pid != -1)
2964			mutex_exit((*p)->p_lock);
2965	}
2966	return error;
2967}
2968
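/*
 * Helper for KERN_PROC_PATHNAME: copy out the cached executable path
 * (p_path) of the given process, including the terminating NUL.
 */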
2969static int
2970fill_pathname(struct lwp *l, pid_t pid, void *oldp, size_t *oldlenp)
2971{
2972	int error;
2973	struct proc *p;
2974
2975	if ((error = proc_find_locked(l, &p, pid)) != 0)
2976		return error;
2977
2978	if (p->p_path == NULL) {
2979		if (pid != -1)
2980			mutex_exit(p->p_lock);
2981		return ENOENT;
2982	}
2983
2984	size_t len = strlen(p->p_path) + 1;
2985	if (oldp != NULL) {
2986		size_t copylen = uimin(len, *oldlenp);
2987		error = sysctl_copyout(l, p->p_path, oldp, copylen);
2988		if (error == 0 && *oldlenp < len)
2989			error = ENOSPC;
2990	}
2991	*oldlenp = len;
2992	if (pid != -1)
2993		mutex_exit(p->p_lock);
2994	return error;
2995}
2996
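/*
 * Helper for KERN_PROC_CWD: resolve the current working directory of
 * the given process into a path string and copy it out.
 */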
2997static int
2998fill_cwd(struct lwp *l, pid_t pid, void *oldp, size_t *oldlenp)
2999{
3000	int error;
3001	struct proc *p;
3002	char *path;
3003	char *bp, *bend;
3004	struct cwdinfo *cwdi;
3005	struct vnode *vp;
3006	size_t len, lenused;
3007
3008	if ((error = proc_find_locked(l, &p, pid)) != 0)
3009		return error;
3010
3011	len = MAXPATHLEN * 4;
3012
3013	path = kmem_alloc(len, KM_SLEEP);
3014
3015	bp = &path[len];
3016	bend = bp;
3017	*(--bp) = '\0';
3018
3019	cwdi = p->p_cwdi;
3020	rw_enter(&cwdi->cwdi_lock, RW_READER);
3021	vp = cwdi->cwdi_cdir;
3022	error = getcwd_common(vp, NULL, &bp, path, len/2, 0, l);
3023	rw_exit(&cwdi->cwdi_lock);
3024
3025	if (error)
3026		goto out;
3027
3028	lenused = bend - bp;
3029
3030	if (oldp != NULL) {
3031		size_t copylen = uimin(lenused, *oldlenp);
3032		error = sysctl_copyout(l, bp, oldp, copylen);
3033		if (error == 0 && *oldlenp < lenused)
3034			error = ENOSPC;
3035	}
3036	*oldlenp = lenused;
3037out:
3038	if (pid != -1)
3039		mutex_exit(p->p_lock);
3040	kmem_free(path, len);
3041	return error;
3042}
3043
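/*
 * proc_getauxv --
 *	Copy the ELF auxiliary vector of process "p" into a freshly
 *	allocated kernel buffer.  On success the caller must free *buf
 *	(of size *len) with kmem_free().
 */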
3044int
3045proc_getauxv(struct proc *p, void **buf, size_t *len)
3046{
3047	struct ps_strings pss;
3048	int error;
3049	void *uauxv, *kauxv;
3050	size_t size;
3051
3052	if ((error = copyin_psstrings(p, &pss)) != 0)
3053		return error;
3054	if (pss.ps_envstr == NULL)
3055		return EIO;
3056
3057	size = p->p_execsw->es_arglen;
3058	if (size == 0)
3059		return EIO;
3060
3061	size_t ptrsz = PROC_PTRSZ(p);
3062	uauxv = (void *)((char *)pss.ps_envstr + (pss.ps_nenvstr + 1) * ptrsz);
3063
3064	kauxv = kmem_alloc(size, KM_SLEEP);
3065
3066	error = copyin_proc(p, uauxv, kauxv, size);
3067	if (error) {
3068		kmem_free(kauxv, size);
3069		return error;
3070	}
3071
3072	*buf = kauxv;
3073	*len = size;
3074
3075	return 0;
3076}
3077
3078
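/*
 * sysctl helper routine for the expose_address setting: only the values
 * 0, 1 and 2 are accepted, and changing it requires the
 * KAUTH_SYSTEM_KERNADDR privilege.
 */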
3079static int
3080sysctl_security_expose_address(SYSCTLFN_ARGS)
3081{
3082	int expose_address, error;
3083	struct sysctlnode node;
3084
3085	node = *rnode;
3086	node.sysctl_data = &expose_address;
3087	expose_address = *(int *)rnode->sysctl_data;
3088	error = sysctl_lookup(SYSCTLFN_CALL(&node));
3089	if (error || newp == NULL)
3090		return error;
3091
3092	if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_KERNADDR,
3093	    0, NULL, NULL, NULL))
3094		return EPERM;
3095
3096	switch (expose_address) {
3097	case 0:
3098	case 1:
3099	case 2:
3100		break;
3101	default:
3102		return EINVAL;
3103	}
3104
3105	*(int *)rnode->sysctl_data = expose_address;
3106
3107	return 0;
3108}
3109
3110bool
3111get_expose_address(struct proc *p)
3112{
3113	/* allow only if sysctl variable is set or privileged */
3114	return kauth_authorize_process(kauth_cred_get(), KAUTH_PROCESS_CANSEE,
3115	    p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_KPTR), NULL, NULL) == 0;
3116}
3117