kern_resource.c revision 204670
1/*-
2 * Copyright (c) 1982, 1986, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)kern_resource.c	8.5 (Berkeley) 1/21/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/kern/kern_resource.c 204670 2010-03-03 21:46:51Z rrs $");
39
40#include "opt_compat.h"
41
42#include <sys/param.h>
43#include <sys/systm.h>
44#include <sys/sysproto.h>
45#include <sys/file.h>
46#include <sys/kernel.h>
47#include <sys/lock.h>
48#include <sys/malloc.h>
49#include <sys/mutex.h>
50#include <sys/priv.h>
51#include <sys/proc.h>
52#include <sys/refcount.h>
53#include <sys/resourcevar.h>
54#include <sys/rwlock.h>
55#include <sys/sched.h>
56#include <sys/sx.h>
57#include <sys/syscallsubr.h>
58#include <sys/sysent.h>
59#include <sys/time.h>
60#include <sys/umtx.h>
61
62#include <vm/vm.h>
63#include <vm/vm_param.h>
64#include <vm/pmap.h>
65#include <vm/vm_map.h>
66
67
/* Malloc types used for plimit and uidinfo allocations in this file. */
static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures");
static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures");
/* Map a uid onto its hash chain head; uihash is applied as a bit mask. */
#define	UIHASH(uid)	(&uihashtbl[(uid) & uihash])
/* Lock for the uidinfo hash table; uilookup() asserts that it is held. */
static struct rwlock uihashtbl_lock;
/* The uidinfo hash table itself, allocated by uihashinit(). */
static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
static u_long uihash;		/* size of hash table - 1 */

/* Forward declarations of file-local helpers. */
static void	calcru1(struct proc *p, struct rusage_ext *ruxp,
		    struct timeval *up, struct timeval *sp);
static int	donice(struct thread *td, struct proc *chgp, int n);
static struct uidinfo *uilookup(uid_t uid);
79
80/*
81 * Resource controls and accounting.
82 */
83#ifndef _SYS_SYSPROTO_H_
84struct getpriority_args {
85	int	which;
86	int	who;
87};
88#endif
89int
90getpriority(td, uap)
91	struct thread *td;
92	register struct getpriority_args *uap;
93{
94	struct proc *p;
95	struct pgrp *pg;
96	int error, low;
97
98	error = 0;
99	low = PRIO_MAX + 1;
100	switch (uap->which) {
101
102	case PRIO_PROCESS:
103		if (uap->who == 0)
104			low = td->td_proc->p_nice;
105		else {
106			p = pfind(uap->who);
107			if (p == NULL)
108				break;
109			if (p_cansee(td, p) == 0)
110				low = p->p_nice;
111			PROC_UNLOCK(p);
112		}
113		break;
114
115	case PRIO_PGRP:
116		sx_slock(&proctree_lock);
117		if (uap->who == 0) {
118			pg = td->td_proc->p_pgrp;
119			PGRP_LOCK(pg);
120		} else {
121			pg = pgfind(uap->who);
122			if (pg == NULL) {
123				sx_sunlock(&proctree_lock);
124				break;
125			}
126		}
127		sx_sunlock(&proctree_lock);
128		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
129			PROC_LOCK(p);
130			if (p_cansee(td, p) == 0) {
131				if (p->p_nice < low)
132					low = p->p_nice;
133			}
134			PROC_UNLOCK(p);
135		}
136		PGRP_UNLOCK(pg);
137		break;
138
139	case PRIO_USER:
140		if (uap->who == 0)
141			uap->who = td->td_ucred->cr_uid;
142		sx_slock(&allproc_lock);
143		FOREACH_PROC_IN_SYSTEM(p) {
144			/* Do not bother to check PRS_NEW processes */
145			if (p->p_state == PRS_NEW)
146				continue;
147			PROC_LOCK(p);
148			if (p_cansee(td, p) == 0 &&
149			    p->p_ucred->cr_uid == uap->who) {
150				if (p->p_nice < low)
151					low = p->p_nice;
152			}
153			PROC_UNLOCK(p);
154		}
155		sx_sunlock(&allproc_lock);
156		break;
157
158	default:
159		error = EINVAL;
160		break;
161	}
162	if (low == PRIO_MAX + 1 && error == 0)
163		error = ESRCH;
164	td->td_retval[0] = low;
165	return (error);
166}
167
168#ifndef _SYS_SYSPROTO_H_
169struct setpriority_args {
170	int	which;
171	int	who;
172	int	prio;
173};
174#endif
175int
176setpriority(td, uap)
177	struct thread *td;
178	struct setpriority_args *uap;
179{
180	struct proc *curp, *p;
181	struct pgrp *pg;
182	int found = 0, error = 0;
183
184	curp = td->td_proc;
185	switch (uap->which) {
186	case PRIO_PROCESS:
187		if (uap->who == 0) {
188			PROC_LOCK(curp);
189			error = donice(td, curp, uap->prio);
190			PROC_UNLOCK(curp);
191		} else {
192			p = pfind(uap->who);
193			if (p == NULL)
194				break;
195			error = p_cansee(td, p);
196			if (error == 0)
197				error = donice(td, p, uap->prio);
198			PROC_UNLOCK(p);
199		}
200		found++;
201		break;
202
203	case PRIO_PGRP:
204		sx_slock(&proctree_lock);
205		if (uap->who == 0) {
206			pg = curp->p_pgrp;
207			PGRP_LOCK(pg);
208		} else {
209			pg = pgfind(uap->who);
210			if (pg == NULL) {
211				sx_sunlock(&proctree_lock);
212				break;
213			}
214		}
215		sx_sunlock(&proctree_lock);
216		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
217			PROC_LOCK(p);
218			if (p_cansee(td, p) == 0) {
219				error = donice(td, p, uap->prio);
220				found++;
221			}
222			PROC_UNLOCK(p);
223		}
224		PGRP_UNLOCK(pg);
225		break;
226
227	case PRIO_USER:
228		if (uap->who == 0)
229			uap->who = td->td_ucred->cr_uid;
230		sx_slock(&allproc_lock);
231		FOREACH_PROC_IN_SYSTEM(p) {
232			PROC_LOCK(p);
233			if (p->p_ucred->cr_uid == uap->who &&
234			    p_cansee(td, p) == 0) {
235				error = donice(td, p, uap->prio);
236				found++;
237			}
238			PROC_UNLOCK(p);
239		}
240		sx_sunlock(&allproc_lock);
241		break;
242
243	default:
244		error = EINVAL;
245		break;
246	}
247	if (found == 0 && error == 0)
248		error = ESRCH;
249	return (error);
250}
251
252/*
253 * Set "nice" for a (whole) process.
254 */
255static int
256donice(struct thread *td, struct proc *p, int n)
257{
258	int error;
259
260	PROC_LOCK_ASSERT(p, MA_OWNED);
261	if ((error = p_cansched(td, p)))
262		return (error);
263	if (n > PRIO_MAX)
264		n = PRIO_MAX;
265	if (n < PRIO_MIN)
266		n = PRIO_MIN;
267	if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0)
268		return (EACCES);
269	sched_nice(p, n);
270	return (0);
271}
272
273/*
274 * Set realtime priority for LWP.
275 */
276#ifndef _SYS_SYSPROTO_H_
277struct rtprio_thread_args {
278	int		function;
279	lwpid_t		lwpid;
280	struct rtprio	*rtp;
281};
282#endif
283int
284rtprio_thread(struct thread *td, struct rtprio_thread_args *uap)
285{
286	struct proc *p;
287	struct rtprio rtp;
288	struct thread *td1;
289	int cierror, error;
290
291	/* Perform copyin before acquiring locks if needed. */
292	if (uap->function == RTP_SET)
293		cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
294	else
295		cierror = 0;
296
297	/*
298	 * Though lwpid is unique, only current process is supported
299	 * since there is no efficient way to look up a LWP yet.
300	 */
301	p = td->td_proc;
302	PROC_LOCK(p);
303
304	switch (uap->function) {
305	case RTP_LOOKUP:
306		if ((error = p_cansee(td, p)))
307			break;
308		if (uap->lwpid == 0 || uap->lwpid == td->td_tid)
309			td1 = td;
310		else
311			td1 = thread_find(p, uap->lwpid);
312		if (td1 != NULL)
313			pri_to_rtp(td1, &rtp);
314		else
315			error = ESRCH;
316		PROC_UNLOCK(p);
317		return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
318	case RTP_SET:
319		if ((error = p_cansched(td, p)) || (error = cierror))
320			break;
321
322		/* Disallow setting rtprio in most cases if not superuser. */
323/*
324 * Realtime priority has to be restricted for reasons which should be
325 * obvious.  However, for idle priority, there is a potential for
326 * system deadlock if an idleprio process gains a lock on a resource
327 * that other processes need (and the idleprio process can't run
328 * due to a CPU-bound normal process).  Fix me!  XXX
329 */
330#if 0
331		if (RTP_PRIO_IS_REALTIME(rtp.type)) {
332#else
333		if (rtp.type != RTP_PRIO_NORMAL) {
334#endif
335			error = priv_check(td, PRIV_SCHED_RTPRIO);
336			if (error)
337				break;
338		}
339
340		if (uap->lwpid == 0 || uap->lwpid == td->td_tid)
341			td1 = td;
342		else
343			td1 = thread_find(p, uap->lwpid);
344		if (td1 != NULL)
345			error = rtp_to_pri(&rtp, td1);
346		else
347			error = ESRCH;
348		break;
349	default:
350		error = EINVAL;
351		break;
352	}
353	PROC_UNLOCK(p);
354	return (error);
355}
356
357/*
358 * Set realtime priority.
359 */
360#ifndef _SYS_SYSPROTO_H_
361struct rtprio_args {
362	int		function;
363	pid_t		pid;
364	struct rtprio	*rtp;
365};
366#endif
367int
368rtprio(td, uap)
369	struct thread *td;		/* curthread */
370	register struct rtprio_args *uap;
371{
372	struct proc *p;
373	struct thread *tdp;
374	struct rtprio rtp;
375	int cierror, error;
376
377	/* Perform copyin before acquiring locks if needed. */
378	if (uap->function == RTP_SET)
379		cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
380	else
381		cierror = 0;
382
383	if (uap->pid == 0) {
384		p = td->td_proc;
385		PROC_LOCK(p);
386	} else {
387		p = pfind(uap->pid);
388		if (p == NULL)
389			return (ESRCH);
390	}
391
392	switch (uap->function) {
393	case RTP_LOOKUP:
394		if ((error = p_cansee(td, p)))
395			break;
396		/*
397		 * Return OUR priority if no pid specified,
398		 * or if one is, report the highest priority
399		 * in the process.  There isn't much more you can do as
400		 * there is only room to return a single priority.
401		 * Note: specifying our own pid is not the same
402		 * as leaving it zero.
403		 */
404		if (uap->pid == 0) {
405			pri_to_rtp(td, &rtp);
406		} else {
407			struct rtprio rtp2;
408
409			rtp.type = RTP_PRIO_IDLE;
410			rtp.prio = RTP_PRIO_MAX;
411			FOREACH_THREAD_IN_PROC(p, tdp) {
412				pri_to_rtp(tdp, &rtp2);
413				if (rtp2.type <  rtp.type ||
414				    (rtp2.type == rtp.type &&
415				    rtp2.prio < rtp.prio)) {
416					rtp.type = rtp2.type;
417					rtp.prio = rtp2.prio;
418				}
419			}
420		}
421		PROC_UNLOCK(p);
422		return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
423	case RTP_SET:
424		if ((error = p_cansched(td, p)) || (error = cierror))
425			break;
426
427		/* Disallow setting rtprio in most cases if not superuser. */
428/*
429 * Realtime priority has to be restricted for reasons which should be
430 * obvious.  However, for idle priority, there is a potential for
431 * system deadlock if an idleprio process gains a lock on a resource
432 * that other processes need (and the idleprio process can't run
433 * due to a CPU-bound normal process).  Fix me!  XXX
434 */
435#if 0
436		if (RTP_PRIO_IS_REALTIME(rtp.type)) {
437#else
438		if (rtp.type != RTP_PRIO_NORMAL) {
439#endif
440			error = priv_check(td, PRIV_SCHED_RTPRIO);
441			if (error)
442				break;
443		}
444
445		/*
446		 * If we are setting our own priority, set just our
447		 * thread but if we are doing another process,
448		 * do all the threads on that process. If we
449		 * specify our own pid we do the latter.
450		 */
451		if (uap->pid == 0) {
452			error = rtp_to_pri(&rtp, td);
453		} else {
454			FOREACH_THREAD_IN_PROC(p, td) {
455				if ((error = rtp_to_pri(&rtp, td)) != 0)
456					break;
457			}
458		}
459		break;
460	default:
461		error = EINVAL;
462		break;
463	}
464	PROC_UNLOCK(p);
465	return (error);
466}
467
468int
469rtp_to_pri(struct rtprio *rtp, struct thread *td)
470{
471	u_char	newpri;
472	u_char	oldpri;
473
474	thread_lock(td);
475	switch (RTP_PRIO_BASE(rtp->type)) {
476	case RTP_PRIO_REALTIME:
477		if (rtp->prio > RTP_PRIO_MAX) {
478			thread_unlock(td);
479			return (EINVAL);
480		}
481		newpri = PRI_MIN_REALTIME + rtp->prio;
482		break;
483	case RTP_PRIO_NORMAL:
484		if (rtp->prio >  (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE)) {
485			thread_unlock(td);
486			return (EINVAL);
487		}
488		newpri = PRI_MIN_TIMESHARE + rtp->prio;
489		break;
490	case RTP_PRIO_IDLE:
491		newpri = PRI_MIN_IDLE + rtp->prio;
492		break;
493	default:
494		thread_unlock(td);
495		return (EINVAL);
496	}
497	sched_class(td, rtp->type);	/* XXX fix */
498	oldpri = td->td_user_pri;
499	sched_user_prio(td, newpri);
500	if (curthread == td)
501		sched_prio(curthread, td->td_user_pri); /* XXX dubious */
502	if (TD_ON_UPILOCK(td) && oldpri != newpri) {
503		thread_unlock(td);
504		umtx_pi_adjust(td, oldpri);
505	} else
506		thread_unlock(td);
507	return (0);
508}
509
510void
511pri_to_rtp(struct thread *td, struct rtprio *rtp)
512{
513
514	thread_lock(td);
515	switch (PRI_BASE(td->td_pri_class)) {
516	case PRI_REALTIME:
517		rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME;
518		break;
519	case PRI_TIMESHARE:
520		rtp->prio = td->td_base_user_pri - PRI_MIN_TIMESHARE;
521		break;
522	case PRI_IDLE:
523		rtp->prio = td->td_base_user_pri - PRI_MIN_IDLE;
524		break;
525	default:
526		break;
527	}
528	rtp->type = td->td_pri_class;
529	thread_unlock(td);
530}
531
532#if defined(COMPAT_43)
533#ifndef _SYS_SYSPROTO_H_
534struct osetrlimit_args {
535	u_int	which;
536	struct	orlimit *rlp;
537};
538#endif
539int
540osetrlimit(td, uap)
541	struct thread *td;
542	register struct osetrlimit_args *uap;
543{
544	struct orlimit olim;
545	struct rlimit lim;
546	int error;
547
548	if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit))))
549		return (error);
550	lim.rlim_cur = olim.rlim_cur;
551	lim.rlim_max = olim.rlim_max;
552	error = kern_setrlimit(td, uap->which, &lim);
553	return (error);
554}
555
556#ifndef _SYS_SYSPROTO_H_
557struct ogetrlimit_args {
558	u_int	which;
559	struct	orlimit *rlp;
560};
561#endif
562int
563ogetrlimit(td, uap)
564	struct thread *td;
565	register struct ogetrlimit_args *uap;
566{
567	struct orlimit olim;
568	struct rlimit rl;
569	struct proc *p;
570	int error;
571
572	if (uap->which >= RLIM_NLIMITS)
573		return (EINVAL);
574	p = td->td_proc;
575	PROC_LOCK(p);
576	lim_rlimit(p, uap->which, &rl);
577	PROC_UNLOCK(p);
578
579	/*
580	 * XXX would be more correct to convert only RLIM_INFINITY to the
581	 * old RLIM_INFINITY and fail with EOVERFLOW for other larger
582	 * values.  Most 64->32 and 32->16 conversions, including not
583	 * unimportant ones of uids are even more broken than what we
584	 * do here (they blindly truncate).  We don't do this correctly
585	 * here since we have little experience with EOVERFLOW yet.
586	 * Elsewhere, getuid() can't fail...
587	 */
588	olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur;
589	olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max;
590	error = copyout(&olim, uap->rlp, sizeof(olim));
591	return (error);
592}
593#endif /* COMPAT_43 */
594
595#ifndef _SYS_SYSPROTO_H_
596struct __setrlimit_args {
597	u_int	which;
598	struct	rlimit *rlp;
599};
600#endif
601int
602setrlimit(td, uap)
603	struct thread *td;
604	register struct __setrlimit_args *uap;
605{
606	struct rlimit alim;
607	int error;
608
609	if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit))))
610		return (error);
611	error = kern_setrlimit(td, uap->which, &alim);
612	return (error);
613}
614
615static void
616lim_cb(void *arg)
617{
618	struct rlimit rlim;
619	struct thread *td;
620	struct proc *p;
621
622	p = arg;
623	PROC_LOCK_ASSERT(p, MA_OWNED);
624	/*
625	 * Check if the process exceeds its cpu resource allocation.  If
626	 * it reaches the max, arrange to kill the process in ast().
627	 */
628	if (p->p_cpulimit == RLIM_INFINITY)
629		return;
630	PROC_SLOCK(p);
631	FOREACH_THREAD_IN_PROC(p, td) {
632		thread_lock(td);
633		ruxagg(&p->p_rux, td);
634		thread_unlock(td);
635	}
636	PROC_SUNLOCK(p);
637	if (p->p_rux.rux_runtime > p->p_cpulimit * cpu_tickrate()) {
638		lim_rlimit(p, RLIMIT_CPU, &rlim);
639		if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) {
640			killproc(p, "exceeded maximum CPU limit");
641		} else {
642			if (p->p_cpulimit < rlim.rlim_max)
643				p->p_cpulimit += 5;
644			psignal(p, SIGXCPU);
645		}
646	}
647	if ((p->p_flag & P_WEXIT) == 0)
648		callout_reset(&p->p_limco, hz, lim_cb, p);
649}
650
651int
652kern_setrlimit(td, which, limp)
653	struct thread *td;
654	u_int which;
655	struct rlimit *limp;
656{
657	struct plimit *newlim, *oldlim;
658	struct proc *p;
659	register struct rlimit *alimp;
660	struct rlimit oldssiz;
661	int error;
662
663	if (which >= RLIM_NLIMITS)
664		return (EINVAL);
665
666	/*
667	 * Preserve historical bugs by treating negative limits as unsigned.
668	 */
669	if (limp->rlim_cur < 0)
670		limp->rlim_cur = RLIM_INFINITY;
671	if (limp->rlim_max < 0)
672		limp->rlim_max = RLIM_INFINITY;
673
674	oldssiz.rlim_cur = 0;
675	p = td->td_proc;
676	newlim = lim_alloc();
677	PROC_LOCK(p);
678	oldlim = p->p_limit;
679	alimp = &oldlim->pl_rlimit[which];
680	if (limp->rlim_cur > alimp->rlim_max ||
681	    limp->rlim_max > alimp->rlim_max)
682		if ((error = priv_check(td, PRIV_PROC_SETRLIMIT))) {
683			PROC_UNLOCK(p);
684			lim_free(newlim);
685			return (error);
686		}
687	if (limp->rlim_cur > limp->rlim_max)
688		limp->rlim_cur = limp->rlim_max;
689	lim_copy(newlim, oldlim);
690	alimp = &newlim->pl_rlimit[which];
691
692	switch (which) {
693
694	case RLIMIT_CPU:
695		if (limp->rlim_cur != RLIM_INFINITY &&
696		    p->p_cpulimit == RLIM_INFINITY)
697			callout_reset(&p->p_limco, hz, lim_cb, p);
698		p->p_cpulimit = limp->rlim_cur;
699		break;
700	case RLIMIT_DATA:
701		if (limp->rlim_cur > maxdsiz)
702			limp->rlim_cur = maxdsiz;
703		if (limp->rlim_max > maxdsiz)
704			limp->rlim_max = maxdsiz;
705		break;
706
707	case RLIMIT_STACK:
708		if (limp->rlim_cur > maxssiz)
709			limp->rlim_cur = maxssiz;
710		if (limp->rlim_max > maxssiz)
711			limp->rlim_max = maxssiz;
712		oldssiz = *alimp;
713		if (td->td_proc->p_sysent->sv_fixlimit != NULL)
714			td->td_proc->p_sysent->sv_fixlimit(&oldssiz,
715			    RLIMIT_STACK);
716		break;
717
718	case RLIMIT_NOFILE:
719		if (limp->rlim_cur > maxfilesperproc)
720			limp->rlim_cur = maxfilesperproc;
721		if (limp->rlim_max > maxfilesperproc)
722			limp->rlim_max = maxfilesperproc;
723		break;
724
725	case RLIMIT_NPROC:
726		if (limp->rlim_cur > maxprocperuid)
727			limp->rlim_cur = maxprocperuid;
728		if (limp->rlim_max > maxprocperuid)
729			limp->rlim_max = maxprocperuid;
730		if (limp->rlim_cur < 1)
731			limp->rlim_cur = 1;
732		if (limp->rlim_max < 1)
733			limp->rlim_max = 1;
734		break;
735	}
736	if (td->td_proc->p_sysent->sv_fixlimit != NULL)
737		td->td_proc->p_sysent->sv_fixlimit(limp, which);
738	*alimp = *limp;
739	p->p_limit = newlim;
740	PROC_UNLOCK(p);
741	lim_free(oldlim);
742
743	if (which == RLIMIT_STACK) {
744		/*
745		 * Stack is allocated to the max at exec time with only
746		 * "rlim_cur" bytes accessible.  If stack limit is going
747		 * up make more accessible, if going down make inaccessible.
748		 */
749		if (limp->rlim_cur != oldssiz.rlim_cur) {
750			vm_offset_t addr;
751			vm_size_t size;
752			vm_prot_t prot;
753
754			if (limp->rlim_cur > oldssiz.rlim_cur) {
755				prot = p->p_sysent->sv_stackprot;
756				size = limp->rlim_cur - oldssiz.rlim_cur;
757				addr = p->p_sysent->sv_usrstack -
758				    limp->rlim_cur;
759			} else {
760				prot = VM_PROT_NONE;
761				size = oldssiz.rlim_cur - limp->rlim_cur;
762				addr = p->p_sysent->sv_usrstack -
763				    oldssiz.rlim_cur;
764			}
765			addr = trunc_page(addr);
766			size = round_page(size);
767			(void)vm_map_protect(&p->p_vmspace->vm_map,
768			    addr, addr + size, prot, FALSE);
769		}
770	}
771
772	return (0);
773}
774
775#ifndef _SYS_SYSPROTO_H_
776struct __getrlimit_args {
777	u_int	which;
778	struct	rlimit *rlp;
779};
780#endif
781/* ARGSUSED */
782int
783getrlimit(td, uap)
784	struct thread *td;
785	register struct __getrlimit_args *uap;
786{
787	struct rlimit rlim;
788	struct proc *p;
789	int error;
790
791	if (uap->which >= RLIM_NLIMITS)
792		return (EINVAL);
793	p = td->td_proc;
794	PROC_LOCK(p);
795	lim_rlimit(p, uap->which, &rlim);
796	PROC_UNLOCK(p);
797	error = copyout(&rlim, uap->rlp, sizeof(struct rlimit));
798	return (error);
799}
800
801/*
802 * Transform the running time and tick information for children of proc p
803 * into user and system time usage.
804 */
805void
806calccru(p, up, sp)
807	struct proc *p;
808	struct timeval *up;
809	struct timeval *sp;
810{
811
812	PROC_LOCK_ASSERT(p, MA_OWNED);
813	calcru1(p, &p->p_crux, up, sp);
814}
815
816/*
817 * Transform the running time and tick information in proc p into user
818 * and system time usage.  If appropriate, include the current time slice
819 * on this CPU.
820 */
821void
822calcru(struct proc *p, struct timeval *up, struct timeval *sp)
823{
824	struct thread *td;
825	uint64_t u;
826
827	PROC_LOCK_ASSERT(p, MA_OWNED);
828	PROC_SLOCK_ASSERT(p, MA_OWNED);
829	/*
830	 * If we are getting stats for the current process, then add in the
831	 * stats that this thread has accumulated in its current time slice.
832	 * We reset the thread and CPU state as if we had performed a context
833	 * switch right here.
834	 */
835	td = curthread;
836	if (td->td_proc == p) {
837		u = cpu_ticks();
838		p->p_rux.rux_runtime += u - PCPU_GET(switchtime);
839		PCPU_SET(switchtime, u);
840	}
841	/* Make sure the per-thread stats are current. */
842	FOREACH_THREAD_IN_PROC(p, td) {
843		if (td->td_incruntime == 0)
844			continue;
845		thread_lock(td);
846		ruxagg(&p->p_rux, td);
847		thread_unlock(td);
848	}
849	calcru1(p, &p->p_rux, up, sp);
850}
851
852static void
853calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
854    struct timeval *sp)
855{
856	/* {user, system, interrupt, total} {ticks, usec}: */
857	u_int64_t ut, uu, st, su, it, tt, tu;
858
859	ut = ruxp->rux_uticks;
860	st = ruxp->rux_sticks;
861	it = ruxp->rux_iticks;
862	tt = ut + st + it;
863	if (tt == 0) {
864		/* Avoid divide by zero */
865		st = 1;
866		tt = 1;
867	}
868	tu = cputick2usec(ruxp->rux_runtime);
869	if ((int64_t)tu < 0) {
870		/* XXX: this should be an assert /phk */
871		printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
872		    (intmax_t)tu, p->p_pid, p->p_comm);
873		tu = ruxp->rux_tu;
874	}
875
876	if (tu >= ruxp->rux_tu) {
877		/*
878		 * The normal case, time increased.
879		 * Enforce monotonicity of bucketed numbers.
880		 */
881		uu = (tu * ut) / tt;
882		if (uu < ruxp->rux_uu)
883			uu = ruxp->rux_uu;
884		su = (tu * st) / tt;
885		if (su < ruxp->rux_su)
886			su = ruxp->rux_su;
887	} else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) {
888		/*
889		 * When we calibrate the cputicker, it is not uncommon to
890		 * see the presumably fixed frequency increase slightly over
891		 * time as a result of thermal stabilization and NTP
892		 * discipline (of the reference clock).  We therefore ignore
893		 * a bit of backwards slop because we  expect to catch up
894		 * shortly.  We use a 3 microsecond limit to catch low
895		 * counts and a 1% limit for high counts.
896		 */
897		uu = ruxp->rux_uu;
898		su = ruxp->rux_su;
899		tu = ruxp->rux_tu;
900	} else { /* tu < ruxp->rux_tu */
901		/*
902		 * What happened here was likely that a laptop, which ran at
903		 * a reduced clock frequency at boot, kicked into high gear.
904		 * The wisdom of spamming this message in that case is
905		 * dubious, but it might also be indicative of something
906		 * serious, so lets keep it and hope laptops can be made
907		 * more truthful about their CPU speed via ACPI.
908		 */
909		printf("calcru: runtime went backwards from %ju usec "
910		    "to %ju usec for pid %d (%s)\n",
911		    (uintmax_t)ruxp->rux_tu, (uintmax_t)tu,
912		    p->p_pid, p->p_comm);
913		uu = (tu * ut) / tt;
914		su = (tu * st) / tt;
915	}
916
917	ruxp->rux_uu = uu;
918	ruxp->rux_su = su;
919	ruxp->rux_tu = tu;
920
921	up->tv_sec = uu / 1000000;
922	up->tv_usec = uu % 1000000;
923	sp->tv_sec = su / 1000000;
924	sp->tv_usec = su % 1000000;
925}
926
927#ifndef _SYS_SYSPROTO_H_
928struct getrusage_args {
929	int	who;
930	struct	rusage *rusage;
931};
932#endif
933int
934getrusage(td, uap)
935	register struct thread *td;
936	register struct getrusage_args *uap;
937{
938	struct rusage ru;
939	int error;
940
941	error = kern_getrusage(td, uap->who, &ru);
942	if (error == 0)
943		error = copyout(&ru, uap->rusage, sizeof(struct rusage));
944	return (error);
945}
946
947int
948kern_getrusage(td, who, rup)
949	struct thread *td;
950	int who;
951	struct rusage *rup;
952{
953	struct proc *p;
954	int error;
955
956	error = 0;
957	p = td->td_proc;
958	PROC_LOCK(p);
959	switch (who) {
960	case RUSAGE_SELF:
961		rufetchcalc(p, rup, &rup->ru_utime,
962		    &rup->ru_stime);
963		break;
964
965	case RUSAGE_CHILDREN:
966		*rup = p->p_stats->p_cru;
967		calccru(p, &rup->ru_utime, &rup->ru_stime);
968		break;
969
970	default:
971		error = EINVAL;
972	}
973	PROC_UNLOCK(p);
974	return (error);
975}
976
977void
978rucollect(struct rusage *ru, struct rusage *ru2)
979{
980	long *ip, *ip2;
981	int i;
982
983	if (ru->ru_maxrss < ru2->ru_maxrss)
984		ru->ru_maxrss = ru2->ru_maxrss;
985	ip = &ru->ru_first;
986	ip2 = &ru2->ru_first;
987	for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
988		*ip++ += *ip2++;
989}
990
991void
992ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2,
993    struct rusage_ext *rux2)
994{
995
996	rux->rux_runtime += rux2->rux_runtime;
997	rux->rux_uticks += rux2->rux_uticks;
998	rux->rux_sticks += rux2->rux_sticks;
999	rux->rux_iticks += rux2->rux_iticks;
1000	rux->rux_uu += rux2->rux_uu;
1001	rux->rux_su += rux2->rux_su;
1002	rux->rux_tu += rux2->rux_tu;
1003	rucollect(ru, ru2);
1004}
1005
1006/*
1007 * Aggregate tick counts into the proc's rusage_ext.
1008 */
1009void
1010ruxagg(struct rusage_ext *rux, struct thread *td)
1011{
1012
1013	THREAD_LOCK_ASSERT(td, MA_OWNED);
1014	PROC_SLOCK_ASSERT(td->td_proc, MA_OWNED);
1015	rux->rux_runtime += td->td_incruntime;
1016	rux->rux_uticks += td->td_uticks;
1017	rux->rux_sticks += td->td_sticks;
1018	rux->rux_iticks += td->td_iticks;
1019	td->td_incruntime = 0;
1020	td->td_uticks = 0;
1021	td->td_iticks = 0;
1022	td->td_sticks = 0;
1023}
1024
1025/*
1026 * Update the rusage_ext structure and fetch a valid aggregate rusage
1027 * for proc p if storage for one is supplied.
1028 */
1029void
1030rufetch(struct proc *p, struct rusage *ru)
1031{
1032	struct thread *td;
1033
1034	PROC_SLOCK_ASSERT(p, MA_OWNED);
1035
1036	*ru = p->p_ru;
1037	if (p->p_numthreads > 0)  {
1038		FOREACH_THREAD_IN_PROC(p, td) {
1039			thread_lock(td);
1040			ruxagg(&p->p_rux, td);
1041			thread_unlock(td);
1042			rucollect(ru, &td->td_ru);
1043		}
1044	}
1045}
1046
/*
 * Run rufetch() followed by calcru() under a single hold of the
 * process spin lock, so that calcru() only ever runs after rufetch()
 * has completed.
 */
void
rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up,
    struct timeval *sp)
{

	PROC_SLOCK(p);
	rufetch(p, ru);		/* aggregate usage into *ru */
	calcru(p, up, sp);	/* then derive user/system times */
	PROC_SUNLOCK(p);
}
1062
1063/*
1064 * Allocate a new resource limits structure and initialize its
1065 * reference count and mutex pointer.
1066 */
1067struct plimit *
1068lim_alloc()
1069{
1070	struct plimit *limp;
1071
1072	limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK);
1073	refcount_init(&limp->pl_refcnt, 1);
1074	return (limp);
1075}
1076
1077struct plimit *
1078lim_hold(limp)
1079	struct plimit *limp;
1080{
1081
1082	refcount_acquire(&limp->pl_refcnt);
1083	return (limp);
1084}
1085
1086void
1087lim_fork(struct proc *p1, struct proc *p2)
1088{
1089	p2->p_limit = lim_hold(p1->p_limit);
1090	callout_init_mtx(&p2->p_limco, &p2->p_mtx, 0);
1091	if (p1->p_cpulimit != RLIM_INFINITY)
1092		callout_reset(&p2->p_limco, hz, lim_cb, p2);
1093}
1094
1095void
1096lim_free(limp)
1097	struct plimit *limp;
1098{
1099
1100	KASSERT(limp->pl_refcnt > 0, ("plimit refcnt underflow"));
1101	if (refcount_release(&limp->pl_refcnt))
1102		free((void *)limp, M_PLIMIT);
1103}
1104
1105/*
1106 * Make a copy of the plimit structure.
1107 * We share these structures copy-on-write after fork.
1108 */
1109void
1110lim_copy(dst, src)
1111	struct plimit *dst, *src;
1112{
1113
1114	KASSERT(dst->pl_refcnt == 1, ("lim_copy to shared limit"));
1115	bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(src->pl_rlimit));
1116}
1117
1118/*
1119 * Return the hard limit for a particular system resource.  The
1120 * which parameter specifies the index into the rlimit array.
1121 */
1122rlim_t
1123lim_max(struct proc *p, int which)
1124{
1125	struct rlimit rl;
1126
1127	lim_rlimit(p, which, &rl);
1128	return (rl.rlim_max);
1129}
1130
1131/*
1132 * Return the current (soft) limit for a particular system resource.
1133 * The which parameter which specifies the index into the rlimit array
1134 */
1135rlim_t
1136lim_cur(struct proc *p, int which)
1137{
1138	struct rlimit rl;
1139
1140	lim_rlimit(p, which, &rl);
1141	return (rl.rlim_cur);
1142}
1143
1144/*
1145 * Return a copy of the entire rlimit structure for the system limit
1146 * specified by 'which' in the rlimit structure pointed to by 'rlp'.
1147 */
1148void
1149lim_rlimit(struct proc *p, int which, struct rlimit *rlp)
1150{
1151
1152	PROC_LOCK_ASSERT(p, MA_OWNED);
1153	KASSERT(which >= 0 && which < RLIM_NLIMITS,
1154	    ("request for invalid resource limit"));
1155	*rlp = p->p_limit->pl_rlimit[which];
1156	if (p->p_sysent->sv_fixlimit != NULL)
1157		p->p_sysent->sv_fixlimit(rlp, which);
1158}
1159
1160/*
1161 * Find the uidinfo structure for a uid.  This structure is used to
1162 * track the total resource consumption (process count, socket buffer
1163 * size, etc.) for the uid and impose limits.
1164 */
1165void
1166uihashinit()
1167{
1168
1169	uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
1170	rw_init(&uihashtbl_lock, "uidinfo hash");
1171}
1172
1173/*
1174 * Look up a uidinfo struct for the parameter uid.
1175 * uihashtbl_lock must be locked.
1176 */
1177static struct uidinfo *
1178uilookup(uid)
1179	uid_t uid;
1180{
1181	struct uihashhead *uipp;
1182	struct uidinfo *uip;
1183
1184	rw_assert(&uihashtbl_lock, RA_LOCKED);
1185	uipp = UIHASH(uid);
1186	LIST_FOREACH(uip, uipp, ui_hash)
1187		if (uip->ui_uid == uid)
1188			break;
1189
1190	return (uip);
1191}
1192
1193/*
1194 * Find or allocate a struct uidinfo for a particular uid.
1195 * Increase refcount on uidinfo struct returned.
1196 * uifree() should be called on a struct uidinfo when released.
1197 */
1198struct uidinfo *
1199uifind(uid)
1200	uid_t uid;
1201{
1202	struct uidinfo *old_uip, *uip;
1203
1204	rw_rlock(&uihashtbl_lock);
1205	uip = uilookup(uid);
1206	if (uip == NULL) {
1207		rw_runlock(&uihashtbl_lock);
1208		uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO);
1209		rw_wlock(&uihashtbl_lock);
1210		/*
1211		 * There's a chance someone created our uidinfo while we
1212		 * were in malloc and not holding the lock, so we have to
1213		 * make sure we don't insert a duplicate uidinfo.
1214		 */
1215		if ((old_uip = uilookup(uid)) != NULL) {
1216			/* Someone else beat us to it. */
1217			free(uip, M_UIDINFO);
1218			uip = old_uip;
1219		} else {
1220			refcount_init(&uip->ui_ref, 0);
1221			uip->ui_uid = uid;
1222			mtx_init(&uip->ui_vmsize_mtx, "ui_vmsize", NULL,
1223			    MTX_DEF);
1224			LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash);
1225		}
1226	}
1227	uihold(uip);
1228	rw_unlock(&uihashtbl_lock);
1229	return (uip);
1230}
1231
1232/*
1233 * Place another refcount on a uidinfo struct.
1234 */
1235void
1236uihold(uip)
1237	struct uidinfo *uip;
1238{
1239
1240	refcount_acquire(&uip->ui_ref);
1241}
1242
1243/*-
1244 * Since uidinfo structs have a long lifetime, we use an
1245 * opportunistic refcounting scheme to avoid locking the lookup hash
1246 * for each release.
1247 *
1248 * If the refcount hits 0, we need to free the structure,
1249 * which means we need to lock the hash.
1250 * Optimal case:
1251 *   After locking the struct and lowering the refcount, if we find
1252 *   that we don't need to free, simply unlock and return.
1253 * Suboptimal case:
1254 *   If refcount lowering results in need to free, bump the count
1255 *   back up, lose the lock and acquire the locks in the proper
1256 *   order to try again.
1257 */
1258void
1259uifree(uip)
1260	struct uidinfo *uip;
1261{
1262	int old;
1263
1264	/* Prepare for optimal case. */
1265	old = uip->ui_ref;
1266	if (old > 1 && atomic_cmpset_int(&uip->ui_ref, old, old - 1))
1267		return;
1268
1269	/* Prepare for suboptimal case. */
1270	rw_wlock(&uihashtbl_lock);
1271	if (refcount_release(&uip->ui_ref)) {
1272		LIST_REMOVE(uip, ui_hash);
1273		rw_wunlock(&uihashtbl_lock);
1274		if (uip->ui_sbsize != 0)
1275			printf("freeing uidinfo: uid = %d, sbsize = %ld\n",
1276			    uip->ui_uid, uip->ui_sbsize);
1277		if (uip->ui_proccnt != 0)
1278			printf("freeing uidinfo: uid = %d, proccnt = %ld\n",
1279			    uip->ui_uid, uip->ui_proccnt);
1280		if (uip->ui_vmsize != 0)
1281			printf("freeing uidinfo: uid = %d, swapuse = %lld\n",
1282			    uip->ui_uid, (unsigned long long)uip->ui_vmsize);
1283		mtx_destroy(&uip->ui_vmsize_mtx);
1284		free(uip, M_UIDINFO);
1285		return;
1286	}
1287	/*
1288	 * Someone added a reference between atomic_cmpset_int() and
1289	 * rw_wlock(&uihashtbl_lock).
1290	 */
1291	rw_wunlock(&uihashtbl_lock);
1292}
1293
1294/*
1295 * Change the count associated with number of processes
1296 * a given user is using.  When 'max' is 0, don't enforce a limit
1297 */
1298int
1299chgproccnt(uip, diff, max)
1300	struct	uidinfo	*uip;
1301	int	diff;
1302	rlim_t	max;
1303{
1304
1305	/* Don't allow them to exceed max, but allow subtraction. */
1306	if (diff > 0 && max != 0) {
1307		if (atomic_fetchadd_long(&uip->ui_proccnt, (long)diff) + diff > max) {
1308			atomic_subtract_long(&uip->ui_proccnt, (long)diff);
1309			return (0);
1310		}
1311	} else {
1312		atomic_add_long(&uip->ui_proccnt, (long)diff);
1313		if (uip->ui_proccnt < 0)
1314			printf("negative proccnt for uid = %d\n", uip->ui_uid);
1315	}
1316	return (1);
1317}
1318
1319/*
1320 * Change the total socket buffer size a user has used.
1321 */
1322int
1323chgsbsize(uip, hiwat, to, max)
1324	struct	uidinfo	*uip;
1325	u_int  *hiwat;
1326	u_int	to;
1327	rlim_t	max;
1328{
1329	int diff;
1330
1331	diff = to - *hiwat;
1332	if (diff > 0) {
1333		if (atomic_fetchadd_long(&uip->ui_sbsize, (long)diff) + diff > max) {
1334			atomic_subtract_long(&uip->ui_sbsize, (long)diff);
1335			return (0);
1336		}
1337	} else {
1338		atomic_add_long(&uip->ui_sbsize, (long)diff);
1339		if (uip->ui_sbsize < 0)
1340			printf("negative sbsize for uid = %d\n", uip->ui_uid);
1341	}
1342	*hiwat = to;
1343	return (1);
1344}
1345
1346/*
1347 * Change the count associated with number of pseudo-terminals
1348 * a given user is using.  When 'max' is 0, don't enforce a limit
1349 */
1350int
1351chgptscnt(uip, diff, max)
1352	struct	uidinfo	*uip;
1353	int	diff;
1354	rlim_t	max;
1355{
1356
1357	/* Don't allow them to exceed max, but allow subtraction. */
1358	if (diff > 0 && max != 0) {
1359		if (atomic_fetchadd_long(&uip->ui_ptscnt, (long)diff) + diff > max) {
1360			atomic_subtract_long(&uip->ui_ptscnt, (long)diff);
1361			return (0);
1362		}
1363	} else {
1364		atomic_add_long(&uip->ui_ptscnt, (long)diff);
1365		if (uip->ui_ptscnt < 0)
1366			printf("negative ptscnt for uid = %d\n", uip->ui_uid);
1367	}
1368	return (1);
1369}
1370