kern_rctl.c revision 298330
1/*-
2 * Copyright (c) 2010 The FreeBSD Foundation
3 * All rights reserved.
4 *
5 * This software was developed by Edward Tomasz Napierala under sponsorship
6 * from the FreeBSD Foundation.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * $FreeBSD: head/sys/kern/kern_rctl.c 298330 2016-04-20 02:09:38Z cem $
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/kern/kern_rctl.c 298330 2016-04-20 02:09:38Z cem $");
34
35#include <sys/param.h>
36#include <sys/bus.h>
37#include <sys/malloc.h>
38#include <sys/queue.h>
39#include <sys/refcount.h>
40#include <sys/jail.h>
41#include <sys/kernel.h>
42#include <sys/limits.h>
43#include <sys/loginclass.h>
44#include <sys/priv.h>
45#include <sys/proc.h>
46#include <sys/racct.h>
47#include <sys/rctl.h>
48#include <sys/resourcevar.h>
49#include <sys/sx.h>
50#include <sys/sysent.h>
51#include <sys/sysproto.h>
52#include <sys/systm.h>
53#include <sys/types.h>
54#include <sys/eventhandler.h>
55#include <sys/lock.h>
56#include <sys/mutex.h>
57#include <sys/rwlock.h>
58#include <sys/sbuf.h>
59#include <sys/taskqueue.h>
60#include <sys/tree.h>
61#include <vm/uma.h>
62
63#ifdef RCTL
64#ifndef RACCT
65#error "The RCTL option requires the RACCT option"
66#endif
67
68FEATURE(rctl, "Resource Limits");
69
#define	HRF_DEFAULT		0
#define	HRF_DONT_INHERIT	1
#define	HRF_DONT_ACCUMULATE	2

/*
 * Buffer size limits.  The expansions are parenthesized so that each macro
 * behaves as a single value when used inside a larger expression (for
 * example as a divisor).
 */
#define	RCTL_MAX_INBUFSIZE	(4 * 1024)
#define	RCTL_MAX_OUTBUFSIZE	(16 * 1024 * 1024)
#define	RCTL_LOG_BUFSIZE	128

/*
 * Largest amount subtracted from the available %cpu by
 * rctl_pcpu_available().
 */
#define	RCTL_PCPU_SHIFT		(10 * 1000000)
79
80static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
81static int rctl_log_rate_limit = 10;
82static int rctl_devctl_rate_limit = 10;
83
84/*
85 * Values below are initialized in rctl_init().
86 */
87static int rctl_throttle_min = -1;
88static int rctl_throttle_max = -1;
89static int rctl_throttle_pct = -1;
90static int rctl_throttle_pct2 = -1;
91
92static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
93static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
94static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
95static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);
96
97SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
98SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
99    &rctl_maxbufsize, 0, "Maximum output buffer size");
100SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
101    &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
102SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
103    &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
104SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
105    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_min_sysctl, "IU",
106    "Shortest throttling duration, in hz");
107TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
108SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
109    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_max_sysctl, "IU",
110    "Longest throttling duration, in hz");
111TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
112SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
113    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct_sysctl, "IU",
114    "Throttling penalty for process consumption, in percent");
115TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
116SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
117    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct2_sysctl, "IU",
118    "Throttling penalty for container consumption, in percent");
119TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
120
121/*
122 * 'rctl_rule_link' connects a rule with every racct it's related to.
123 * For example, rule 'user:X:openfiles:deny=N/process' is linked
124 * with uidinfo for user X, and to each process of that user.
125 */
126struct rctl_rule_link {
127	LIST_ENTRY(rctl_rule_link)	rrl_next;
128	struct rctl_rule		*rrl_rule;
129	int				rrl_exceeded;
130};
131
/*
 * One name/value pair of a lookup table.  The tables in this file are
 * terminated by an entry whose d_name is NULL.
 */
struct dict {
	const char	*d_name;	/* textual name */
	int		d_value;	/* numeric constant for that name */
};
136
137static struct dict subjectnames[] = {
138	{ "process", RCTL_SUBJECT_TYPE_PROCESS },
139	{ "user", RCTL_SUBJECT_TYPE_USER },
140	{ "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
141	{ "jail", RCTL_SUBJECT_TYPE_JAIL },
142	{ NULL, -1 }};
143
144static struct dict resourcenames[] = {
145	{ "cputime", RACCT_CPU },
146	{ "datasize", RACCT_DATA },
147	{ "stacksize", RACCT_STACK },
148	{ "coredumpsize", RACCT_CORE },
149	{ "memoryuse", RACCT_RSS },
150	{ "memorylocked", RACCT_MEMLOCK },
151	{ "maxproc", RACCT_NPROC },
152	{ "openfiles", RACCT_NOFILE },
153	{ "vmemoryuse", RACCT_VMEM },
154	{ "pseudoterminals", RACCT_NPTS },
155	{ "swapuse", RACCT_SWAP },
156	{ "nthr", RACCT_NTHR },
157	{ "msgqqueued", RACCT_MSGQQUEUED },
158	{ "msgqsize", RACCT_MSGQSIZE },
159	{ "nmsgq", RACCT_NMSGQ },
160	{ "nsem", RACCT_NSEM },
161	{ "nsemop", RACCT_NSEMOP },
162	{ "nshm", RACCT_NSHM },
163	{ "shmsize", RACCT_SHMSIZE },
164	{ "wallclock", RACCT_WALLCLOCK },
165	{ "pcpu", RACCT_PCTCPU },
166	{ "readbps", RACCT_READBPS },
167	{ "writebps", RACCT_WRITEBPS },
168	{ "readiops", RACCT_READIOPS },
169	{ "writeiops", RACCT_WRITEIOPS },
170	{ NULL, -1 }};
171
172static struct dict actionnames[] = {
173	{ "sighup", RCTL_ACTION_SIGHUP },
174	{ "sigint", RCTL_ACTION_SIGINT },
175	{ "sigquit", RCTL_ACTION_SIGQUIT },
176	{ "sigill", RCTL_ACTION_SIGILL },
177	{ "sigtrap", RCTL_ACTION_SIGTRAP },
178	{ "sigabrt", RCTL_ACTION_SIGABRT },
179	{ "sigemt", RCTL_ACTION_SIGEMT },
180	{ "sigfpe", RCTL_ACTION_SIGFPE },
181	{ "sigkill", RCTL_ACTION_SIGKILL },
182	{ "sigbus", RCTL_ACTION_SIGBUS },
183	{ "sigsegv", RCTL_ACTION_SIGSEGV },
184	{ "sigsys", RCTL_ACTION_SIGSYS },
185	{ "sigpipe", RCTL_ACTION_SIGPIPE },
186	{ "sigalrm", RCTL_ACTION_SIGALRM },
187	{ "sigterm", RCTL_ACTION_SIGTERM },
188	{ "sigurg", RCTL_ACTION_SIGURG },
189	{ "sigstop", RCTL_ACTION_SIGSTOP },
190	{ "sigtstp", RCTL_ACTION_SIGTSTP },
191	{ "sigchld", RCTL_ACTION_SIGCHLD },
192	{ "sigttin", RCTL_ACTION_SIGTTIN },
193	{ "sigttou", RCTL_ACTION_SIGTTOU },
194	{ "sigio", RCTL_ACTION_SIGIO },
195	{ "sigxcpu", RCTL_ACTION_SIGXCPU },
196	{ "sigxfsz", RCTL_ACTION_SIGXFSZ },
197	{ "sigvtalrm", RCTL_ACTION_SIGVTALRM },
198	{ "sigprof", RCTL_ACTION_SIGPROF },
199	{ "sigwinch", RCTL_ACTION_SIGWINCH },
200	{ "siginfo", RCTL_ACTION_SIGINFO },
201	{ "sigusr1", RCTL_ACTION_SIGUSR1 },
202	{ "sigusr2", RCTL_ACTION_SIGUSR2 },
203	{ "sigthr", RCTL_ACTION_SIGTHR },
204	{ "deny", RCTL_ACTION_DENY },
205	{ "log", RCTL_ACTION_LOG },
206	{ "devctl", RCTL_ACTION_DEVCTL },
207	{ "throttle", RCTL_ACTION_THROTTLE },
208	{ NULL, -1 }};
209
210static void rctl_init(void);
211SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);
212
213static uma_zone_t rctl_rule_zone;
214static uma_zone_t rctl_rule_link_zone;
215static struct rwlock rctl_lock;
216RW_SYSINIT(rctl_lock, &rctl_lock, "RCTL lock");
217
218#define RCTL_RLOCK()		rw_rlock(&rctl_lock)
219#define RCTL_RUNLOCK()		rw_runlock(&rctl_lock)
220#define RCTL_WLOCK()		rw_wlock(&rctl_lock)
221#define RCTL_WUNLOCK()		rw_wunlock(&rctl_lock)
222#define RCTL_LOCK_ASSERT()	rw_assert(&rctl_lock, RA_LOCKED)
223#define RCTL_WLOCK_ASSERT()	rw_assert(&rctl_lock, RA_WLOCKED)
224
225static int rctl_rule_fully_specified(const struct rctl_rule *rule);
226static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);
227
228static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
229
230static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
231{
232	int error, val = rctl_throttle_min;
233
234	error = sysctl_handle_int(oidp, &val, 0, req);
235	if (error || !req->newptr)
236		return (error);
237	if (val < 1 || val > rctl_throttle_max)
238		return (EINVAL);
239
240	RCTL_WLOCK();
241	rctl_throttle_min = val;
242	RCTL_WUNLOCK();
243
244	return (0);
245}
246
247static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
248{
249	int error, val = rctl_throttle_max;
250
251	error = sysctl_handle_int(oidp, &val, 0, req);
252	if (error || !req->newptr)
253		return (error);
254	if (val < rctl_throttle_min)
255		return (EINVAL);
256
257	RCTL_WLOCK();
258	rctl_throttle_max = val;
259	RCTL_WUNLOCK();
260
261	return (0);
262}
263
264static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
265{
266	int error, val = rctl_throttle_pct;
267
268	error = sysctl_handle_int(oidp, &val, 0, req);
269	if (error || !req->newptr)
270		return (error);
271	if (val < 0)
272		return (EINVAL);
273
274	RCTL_WLOCK();
275	rctl_throttle_pct = val;
276	RCTL_WUNLOCK();
277
278	return (0);
279}
280
281static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
282{
283	int error, val = rctl_throttle_pct2;
284
285	error = sysctl_handle_int(oidp, &val, 0, req);
286	if (error || !req->newptr)
287		return (error);
288	if (val < 0)
289		return (EINVAL);
290
291	RCTL_WLOCK();
292	rctl_throttle_pct2 = val;
293	RCTL_WUNLOCK();
294
295	return (0);
296}
297
298static const char *
299rctl_subject_type_name(int subject)
300{
301	int i;
302
303	for (i = 0; subjectnames[i].d_name != NULL; i++) {
304		if (subjectnames[i].d_value == subject)
305			return (subjectnames[i].d_name);
306	}
307
308	panic("rctl_subject_type_name: unknown subject type %d", subject);
309}
310
311static const char *
312rctl_action_name(int action)
313{
314	int i;
315
316	for (i = 0; actionnames[i].d_name != NULL; i++) {
317		if (actionnames[i].d_value == action)
318			return (actionnames[i].d_name);
319	}
320
321	panic("rctl_action_name: unknown action %d", action);
322}
323
324const char *
325rctl_resource_name(int resource)
326{
327	int i;
328
329	for (i = 0; resourcenames[i].d_name != NULL; i++) {
330		if (resourcenames[i].d_value == resource)
331			return (resourcenames[i].d_name);
332	}
333
334	panic("rctl_resource_name: unknown resource %d", resource);
335}
336
337static struct racct *
338rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
339{
340	struct ucred *cred = p->p_ucred;
341
342	ASSERT_RACCT_ENABLED();
343	RCTL_LOCK_ASSERT();
344
345	switch (rule->rr_per) {
346	case RCTL_SUBJECT_TYPE_PROCESS:
347		return (p->p_racct);
348	case RCTL_SUBJECT_TYPE_USER:
349		return (cred->cr_ruidinfo->ui_racct);
350	case RCTL_SUBJECT_TYPE_LOGINCLASS:
351		return (cred->cr_loginclass->lc_racct);
352	case RCTL_SUBJECT_TYPE_JAIL:
353		return (cred->cr_prison->pr_prison_racct->prr_racct);
354	default:
355		panic("%s: unknown per %d", __func__, rule->rr_per);
356	}
357}
358
359/*
360 * Return the amount of resource that can be allocated by 'p' before
361 * hitting 'rule'.
362 */
363static int64_t
364rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
365{
366	const struct racct *racct;
367	int64_t available;
368
369	ASSERT_RACCT_ENABLED();
370	RCTL_LOCK_ASSERT();
371
372	racct = rctl_proc_rule_to_racct(p, rule);
373	available = rule->rr_amount - racct->r_resources[rule->rr_resource];
374
375	return (available);
376}
377
378/*
379 * Called every second for proc, uidinfo, loginclass, and jail containers.
380 * If the limit isn't exceeded, it decreases the usage amount to zero.
381 * Otherwise, it decreases it by the value of the limit.  This way
382 * resource consumption exceeding the limit "carries over" to the next
383 * period.
384 */
385void
386rctl_throttle_decay(struct racct *racct, int resource)
387{
388	struct rctl_rule *rule;
389	struct rctl_rule_link *link;
390	int64_t minavailable;
391
392	ASSERT_RACCT_ENABLED();
393
394	minavailable = INT64_MAX;
395
396	RCTL_RLOCK();
397
398	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
399		rule = link->rrl_rule;
400
401		if (rule->rr_resource != resource)
402			continue;
403		if (rule->rr_action != RCTL_ACTION_THROTTLE)
404			continue;
405
406		if (rule->rr_amount < minavailable)
407			minavailable = rule->rr_amount;
408	}
409
410	RCTL_RUNLOCK();
411
412	if (racct->r_resources[resource] < minavailable) {
413		racct->r_resources[resource] = 0;
414	} else {
415		/*
416		 * Cap utilization counter at ten times the limit.  Otherwise,
417		 * if we changed the rule lowering the allowed amount, it could
418		 * take unreasonably long time for the accumulated resource
419		 * usage to drop.
420		 */
421		if (racct->r_resources[resource] > minavailable * 10)
422			racct->r_resources[resource] = minavailable * 10;
423
424		racct->r_resources[resource] -= minavailable;
425	}
426}
427
428/*
429 * Special version of rctl_get_available() for the %CPU resource.
430 * We slightly cheat here and return less than we normally would.
431 */
432int64_t
433rctl_pcpu_available(const struct proc *p) {
434	struct rctl_rule *rule;
435	struct rctl_rule_link *link;
436	int64_t available, minavailable, limit;
437
438	ASSERT_RACCT_ENABLED();
439
440	minavailable = INT64_MAX;
441	limit = 0;
442
443	RCTL_RLOCK();
444
445	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
446		rule = link->rrl_rule;
447		if (rule->rr_resource != RACCT_PCTCPU)
448			continue;
449		if (rule->rr_action != RCTL_ACTION_DENY)
450			continue;
451		available = rctl_available_resource(p, rule);
452		if (available < minavailable) {
453			minavailable = available;
454			limit = rule->rr_amount;
455		}
456	}
457
458	RCTL_RUNLOCK();
459
460	/*
461	 * Return slightly less than actual value of the available
462	 * %cpu resource.  This makes %cpu throttling more agressive
463	 * and lets us act sooner than the limits are already exceeded.
464	 */
465	if (limit != 0) {
466		if (limit > 2 * RCTL_PCPU_SHIFT)
467			minavailable -= RCTL_PCPU_SHIFT;
468		else
469			minavailable -= (limit / 2);
470	}
471
472	return (minavailable);
473}
474
/*
 * Saturating addition: returns a + b, or UINT64_MAX if the sum does not
 * fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{

	/* The sum wraps exactly when b is larger than the room left above a. */
	if (b > UINT64_MAX - a)
		return (UINT64_MAX);

	return (a + b);
}
490
/*
 * Saturating multiplication: returns a * b, or UINT64_MAX if the product
 * does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

	if (b == 0)
		return (0);
	if (a > UINT64_MAX / b)
		return (UINT64_MAX);

	return (a * b);
}
500
501/*
502 * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
503 * to what it keeps allocated now.  Returns non-zero if the allocation should
504 * be denied, 0 otherwise.
505 */
506int
507rctl_enforce(struct proc *p, int resource, uint64_t amount)
508{
509	static struct timeval log_lasttime, devctl_lasttime;
510	static int log_curtime = 0, devctl_curtime = 0;
511	struct rctl_rule *rule;
512	struct rctl_rule_link *link;
513	struct sbuf sb;
514	char *buf;
515	int64_t available;
516	uint64_t sleep_ms, sleep_ratio;
517	int should_deny = 0;
518
519
520	ASSERT_RACCT_ENABLED();
521
522	RCTL_RLOCK();
523
524	/*
525	 * There may be more than one matching rule; go through all of them.
526	 * Denial should be done last, after logging and sending signals.
527	 */
528	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
529		rule = link->rrl_rule;
530		if (rule->rr_resource != resource)
531			continue;
532
533		available = rctl_available_resource(p, rule);
534		if (available >= (int64_t)amount) {
535			link->rrl_exceeded = 0;
536			continue;
537		}
538
539		switch (rule->rr_action) {
540		case RCTL_ACTION_DENY:
541			should_deny = 1;
542			continue;
543		case RCTL_ACTION_LOG:
544			/*
545			 * If rrl_exceeded != 0, it means we've already
546			 * logged a warning for this process.
547			 */
548			if (link->rrl_exceeded != 0)
549				continue;
550
551			/*
552			 * If the process state is not fully initialized yet,
553			 * we can't access most of the required fields, e.g.
554			 * p->p_comm.  This happens when called from fork1().
555			 * Ignore this rule for now; it will be processed just
556			 * after fork, when called from racct_proc_fork_done().
557			 */
558			if (p->p_state != PRS_NORMAL)
559				continue;
560
561			if (!ppsratecheck(&log_lasttime, &log_curtime,
562			    rctl_log_rate_limit))
563				continue;
564
565			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
566			if (buf == NULL) {
567				printf("rctl_enforce: out of memory\n");
568				continue;
569			}
570			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
571			rctl_rule_to_sbuf(&sb, rule);
572			sbuf_finish(&sb);
573			printf("rctl: rule \"%s\" matched by pid %d "
574			    "(%s), uid %d, jail %s\n", sbuf_data(&sb),
575			    p->p_pid, p->p_comm, p->p_ucred->cr_uid,
576			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
577			sbuf_delete(&sb);
578			free(buf, M_RCTL);
579			link->rrl_exceeded = 1;
580			continue;
581		case RCTL_ACTION_DEVCTL:
582			if (link->rrl_exceeded != 0)
583				continue;
584
585			if (p->p_state != PRS_NORMAL)
586				continue;
587
588			if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
589			    rctl_devctl_rate_limit))
590				continue;
591
592			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
593			if (buf == NULL) {
594				printf("rctl_enforce: out of memory\n");
595				continue;
596			}
597			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
598			sbuf_printf(&sb, "rule=");
599			rctl_rule_to_sbuf(&sb, rule);
600			sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
601			    p->p_pid, p->p_ucred->cr_ruid,
602			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
603			sbuf_finish(&sb);
604			devctl_notify_f("RCTL", "rule", "matched",
605			    sbuf_data(&sb), M_NOWAIT);
606			sbuf_delete(&sb);
607			free(buf, M_RCTL);
608			link->rrl_exceeded = 1;
609			continue;
610		case RCTL_ACTION_THROTTLE:
611			if (p->p_state != PRS_NORMAL)
612				continue;
613
614			/*
615			 * Make the process sleep for a fraction of second
616			 * proportional to the ratio of process' resource
617			 * utilization compared to the limit.  The point is
618			 * to penalize resource hogs: processes that consume
619			 * more of the available resources sleep for longer.
620			 *
621			 * We're trying to defer division until the very end,
622			 * to minimize the rounding effects.  The following
623			 * calculation could have been written in a clearer
624			 * way like this:
625			 *
626			 * sleep_ms = hz * p->p_racct->r_resources[resource] /
627			 *     rule->rr_amount;
628			 * sleep_ms *= rctl_throttle_pct / 100;
629			 * if (sleep_ms < rctl_throttle_min)
630			 *         sleep_ms = rctl_throttle_min;
631			 *
632			 */
633			sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
634			sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
635			if (sleep_ms < rctl_throttle_min * rule->rr_amount)
636				sleep_ms = rctl_throttle_min * rule->rr_amount;
637
638			/*
639			 * Multiply that by the ratio of the resource
640			 * consumption for the container compared to the limit,
641			 * squared.  In other words, a process in a container
642			 * that is two times over the limit will be throttled
643			 * four times as much for hitting the same rule.  The
644			 * point is to penalize processes more if the container
645			 * itself (eg certain UID or jail) is above the limit.
646			 */
647			if (available < 0)
648				sleep_ratio = -available / rule->rr_amount;
649			else
650				sleep_ratio = 0;
651			sleep_ratio = xmul(sleep_ratio, sleep_ratio);
652			sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
653			sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
654
655			/*
656			 * Finally the division.
657			 */
658			sleep_ms /= rule->rr_amount;
659
660			if (sleep_ms > rctl_throttle_max)
661				sleep_ms = rctl_throttle_max;
662#if 0
663			printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n",
664			   __func__, p->p_pid, p->p_comm,
665			   p->p_racct->r_resources[resource],
666			   rule->rr_amount, (uintmax_t)sleep_ms,
667			   (uintmax_t)sleep_ratio, (intmax_t)available);
668#endif
669
670			KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
671			    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
672			racct_proc_throttle(p, sleep_ms);
673			continue;
674		default:
675			if (link->rrl_exceeded != 0)
676				continue;
677
678			if (p->p_state != PRS_NORMAL)
679				continue;
680
681			KASSERT(rule->rr_action > 0 &&
682			    rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
683			    ("rctl_enforce: unknown action %d",
684			     rule->rr_action));
685
686			/*
687			 * We're using the fact that RCTL_ACTION_SIG* values
688			 * are equal to their counterparts from sys/signal.h.
689			 */
690			kern_psignal(p, rule->rr_action);
691			link->rrl_exceeded = 1;
692			continue;
693		}
694	}
695
696	RCTL_RUNLOCK();
697
698	if (should_deny) {
699		/*
700		 * Return fake error code; the caller should change it
701		 * into one proper for the situation - EFSIZ, ENOMEM etc.
702		 */
703		return (EDOOFUS);
704	}
705
706	return (0);
707}
708
709uint64_t
710rctl_get_limit(struct proc *p, int resource)
711{
712	struct rctl_rule *rule;
713	struct rctl_rule_link *link;
714	uint64_t amount = UINT64_MAX;
715
716	ASSERT_RACCT_ENABLED();
717
718	RCTL_RLOCK();
719
720	/*
721	 * There may be more than one matching rule; go through all of them.
722	 * Denial should be done last, after logging and sending signals.
723	 */
724	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
725		rule = link->rrl_rule;
726		if (rule->rr_resource != resource)
727			continue;
728		if (rule->rr_action != RCTL_ACTION_DENY)
729			continue;
730		if (rule->rr_amount < amount)
731			amount = rule->rr_amount;
732	}
733
734	RCTL_RUNLOCK();
735
736	return (amount);
737}
738
739uint64_t
740rctl_get_available(struct proc *p, int resource)
741{
742	struct rctl_rule *rule;
743	struct rctl_rule_link *link;
744	int64_t available, minavailable, allocated;
745
746	minavailable = INT64_MAX;
747
748	ASSERT_RACCT_ENABLED();
749
750	RCTL_RLOCK();
751
752	/*
753	 * There may be more than one matching rule; go through all of them.
754	 * Denial should be done last, after logging and sending signals.
755	 */
756	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
757		rule = link->rrl_rule;
758		if (rule->rr_resource != resource)
759			continue;
760		if (rule->rr_action != RCTL_ACTION_DENY)
761			continue;
762		available = rctl_available_resource(p, rule);
763		if (available < minavailable)
764			minavailable = available;
765	}
766
767	RCTL_RUNLOCK();
768
769	/*
770	 * XXX: Think about this _hard_.
771	 */
772	allocated = p->p_racct->r_resources[resource];
773	if (minavailable < INT64_MAX - allocated)
774		minavailable += allocated;
775	if (minavailable < 0)
776		minavailable = 0;
777	return (minavailable);
778}
779
780static int
781rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
782{
783
784	ASSERT_RACCT_ENABLED();
785
786	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
787		if (rule->rr_subject_type != filter->rr_subject_type)
788			return (0);
789
790		switch (filter->rr_subject_type) {
791		case RCTL_SUBJECT_TYPE_PROCESS:
792			if (filter->rr_subject.rs_proc != NULL &&
793			    rule->rr_subject.rs_proc !=
794			    filter->rr_subject.rs_proc)
795				return (0);
796			break;
797		case RCTL_SUBJECT_TYPE_USER:
798			if (filter->rr_subject.rs_uip != NULL &&
799			    rule->rr_subject.rs_uip !=
800			    filter->rr_subject.rs_uip)
801				return (0);
802			break;
803		case RCTL_SUBJECT_TYPE_LOGINCLASS:
804			if (filter->rr_subject.rs_loginclass != NULL &&
805			    rule->rr_subject.rs_loginclass !=
806			    filter->rr_subject.rs_loginclass)
807				return (0);
808			break;
809		case RCTL_SUBJECT_TYPE_JAIL:
810			if (filter->rr_subject.rs_prison_racct != NULL &&
811			    rule->rr_subject.rs_prison_racct !=
812			    filter->rr_subject.rs_prison_racct)
813				return (0);
814			break;
815		default:
816			panic("rctl_rule_matches: unknown subject type %d",
817			    filter->rr_subject_type);
818		}
819	}
820
821	if (filter->rr_resource != RACCT_UNDEFINED) {
822		if (rule->rr_resource != filter->rr_resource)
823			return (0);
824	}
825
826	if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
827		if (rule->rr_action != filter->rr_action)
828			return (0);
829	}
830
831	if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
832		if (rule->rr_amount != filter->rr_amount)
833			return (0);
834	}
835
836	if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
837		if (rule->rr_per != filter->rr_per)
838			return (0);
839	}
840
841	return (1);
842}
843
844static int
845str2value(const char *str, int *value, struct dict *table)
846{
847	int i;
848
849	if (value == NULL)
850		return (EINVAL);
851
852	for (i = 0; table[i].d_name != NULL; i++) {
853		if (strcasecmp(table[i].d_name, str) == 0) {
854			*value =  table[i].d_value;
855			return (0);
856		}
857	}
858
859	return (EINVAL);
860}
861
/*
 * Parse 'str' as a decimal number with strtoul() and store it in '*value'.
 * Returns EINVAL if 'str' is NULL or is not consumed entirely by the
 * conversion, 0 otherwise.  Note that '*value' is written even when
 * EINVAL is returned for trailing garbage.
 */
static int
str2id(const char *str, id_t *value)
{
	unsigned long parsed;
	char *tail;

	if (str == NULL)
		return (EINVAL);

	parsed = strtoul(str, &tail, 10);
	*value = parsed;

	/* Anything left after the digits makes the string invalid. */
	if (*tail != '\0')
		return (EINVAL);

	return (0);
}
876
/*
 * Parse 'str' as a decimal number with strtoul() and store it in '*value'.
 * Returns EINVAL if 'str' is NULL or is not consumed entirely by the
 * conversion, ERANGE if the stored value is negative, and 0 otherwise.
 *
 * NOTE(review): the conversion goes through unsigned long, so where long
 * is 32 bits wide the accepted range is presumably narrower than int64_t;
 * confirm whether that matters for the supported platforms.
 */
static int
str2int64(const char *str, int64_t *value)
{
	unsigned long parsed;
	char *tail;

	if (str == NULL)
		return (EINVAL);

	parsed = strtoul(str, &tail, 10);
	*value = parsed;

	/* Anything left after the digits makes the string invalid. */
	if (*tail != '\0')
		return (EINVAL);

	if (*value < 0)
		return (ERANGE);

	return (0);
}
894
895/*
896 * Connect the rule to the racct, increasing refcount for the rule.
897 */
898static void
899rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
900{
901	struct rctl_rule_link *link;
902
903	ASSERT_RACCT_ENABLED();
904	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
905
906	rctl_rule_acquire(rule);
907	link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
908	link->rrl_rule = rule;
909	link->rrl_exceeded = 0;
910
911	RCTL_WLOCK();
912	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
913	RCTL_WUNLOCK();
914}
915
916static int
917rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
918{
919	struct rctl_rule_link *link;
920
921	ASSERT_RACCT_ENABLED();
922	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
923	RCTL_WLOCK_ASSERT();
924
925	link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
926	if (link == NULL)
927		return (ENOMEM);
928	rctl_rule_acquire(rule);
929	link->rrl_rule = rule;
930	link->rrl_exceeded = 0;
931
932	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
933	return (0);
934}
935
936/*
937 * Remove limits for a rules matching the filter and release
938 * the refcounts for the rules, possibly freeing them.  Returns
939 * the number of limit structures removed.
940 */
941static int
942rctl_racct_remove_rules(struct racct *racct,
943    const struct rctl_rule *filter)
944{
945	struct rctl_rule_link *link, *linktmp;
946	int removed = 0;
947
948	ASSERT_RACCT_ENABLED();
949	RCTL_WLOCK_ASSERT();
950
951	LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
952		if (!rctl_rule_matches(link->rrl_rule, filter))
953			continue;
954
955		LIST_REMOVE(link, rrl_next);
956		rctl_rule_release(link->rrl_rule);
957		uma_zfree(rctl_rule_link_zone, link);
958		removed++;
959	}
960	return (removed);
961}
962
963static void
964rctl_rule_acquire_subject(struct rctl_rule *rule)
965{
966
967	ASSERT_RACCT_ENABLED();
968
969	switch (rule->rr_subject_type) {
970	case RCTL_SUBJECT_TYPE_UNDEFINED:
971	case RCTL_SUBJECT_TYPE_PROCESS:
972		break;
973	case RCTL_SUBJECT_TYPE_JAIL:
974		if (rule->rr_subject.rs_prison_racct != NULL)
975			prison_racct_hold(rule->rr_subject.rs_prison_racct);
976		break;
977	case RCTL_SUBJECT_TYPE_USER:
978		if (rule->rr_subject.rs_uip != NULL)
979			uihold(rule->rr_subject.rs_uip);
980		break;
981	case RCTL_SUBJECT_TYPE_LOGINCLASS:
982		if (rule->rr_subject.rs_loginclass != NULL)
983			loginclass_hold(rule->rr_subject.rs_loginclass);
984		break;
985	default:
986		panic("rctl_rule_acquire_subject: unknown subject type %d",
987		    rule->rr_subject_type);
988	}
989}
990
991static void
992rctl_rule_release_subject(struct rctl_rule *rule)
993{
994
995	ASSERT_RACCT_ENABLED();
996
997	switch (rule->rr_subject_type) {
998	case RCTL_SUBJECT_TYPE_UNDEFINED:
999	case RCTL_SUBJECT_TYPE_PROCESS:
1000		break;
1001	case RCTL_SUBJECT_TYPE_JAIL:
1002		if (rule->rr_subject.rs_prison_racct != NULL)
1003			prison_racct_free(rule->rr_subject.rs_prison_racct);
1004		break;
1005	case RCTL_SUBJECT_TYPE_USER:
1006		if (rule->rr_subject.rs_uip != NULL)
1007			uifree(rule->rr_subject.rs_uip);
1008		break;
1009	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1010		if (rule->rr_subject.rs_loginclass != NULL)
1011			loginclass_free(rule->rr_subject.rs_loginclass);
1012		break;
1013	default:
1014		panic("rctl_rule_release_subject: unknown subject type %d",
1015		    rule->rr_subject_type);
1016	}
1017}
1018
1019struct rctl_rule *
1020rctl_rule_alloc(int flags)
1021{
1022	struct rctl_rule *rule;
1023
1024	ASSERT_RACCT_ENABLED();
1025
1026	rule = uma_zalloc(rctl_rule_zone, flags);
1027	if (rule == NULL)
1028		return (NULL);
1029	rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1030	rule->rr_subject.rs_proc = NULL;
1031	rule->rr_subject.rs_uip = NULL;
1032	rule->rr_subject.rs_loginclass = NULL;
1033	rule->rr_subject.rs_prison_racct = NULL;
1034	rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1035	rule->rr_resource = RACCT_UNDEFINED;
1036	rule->rr_action = RCTL_ACTION_UNDEFINED;
1037	rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1038	refcount_init(&rule->rr_refcount, 1);
1039
1040	return (rule);
1041}
1042
1043struct rctl_rule *
1044rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1045{
1046	struct rctl_rule *copy;
1047
1048	ASSERT_RACCT_ENABLED();
1049
1050	copy = uma_zalloc(rctl_rule_zone, flags);
1051	if (copy == NULL)
1052		return (NULL);
1053	copy->rr_subject_type = rule->rr_subject_type;
1054	copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1055	copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1056	copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1057	copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1058	copy->rr_per = rule->rr_per;
1059	copy->rr_resource = rule->rr_resource;
1060	copy->rr_action = rule->rr_action;
1061	copy->rr_amount = rule->rr_amount;
1062	refcount_init(&copy->rr_refcount, 1);
1063	rctl_rule_acquire_subject(copy);
1064
1065	return (copy);
1066}
1067
1068void
1069rctl_rule_acquire(struct rctl_rule *rule)
1070{
1071
1072	ASSERT_RACCT_ENABLED();
1073	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1074
1075	refcount_acquire(&rule->rr_refcount);
1076}
1077
1078static void
1079rctl_rule_free(void *context, int pending)
1080{
1081	struct rctl_rule *rule;
1082
1083	rule = (struct rctl_rule *)context;
1084
1085	ASSERT_RACCT_ENABLED();
1086	KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1087
1088	/*
1089	 * We don't need locking here; rule is guaranteed to be inaccessible.
1090	 */
1091
1092	rctl_rule_release_subject(rule);
1093	uma_zfree(rctl_rule_zone, rule);
1094}
1095
1096void
1097rctl_rule_release(struct rctl_rule *rule)
1098{
1099
1100	ASSERT_RACCT_ENABLED();
1101	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1102
1103	if (refcount_release(&rule->rr_refcount)) {
1104		/*
1105		 * rctl_rule_release() is often called when iterating
1106		 * over all the uidinfo structures in the system,
1107		 * holding uihashtbl_lock.  Since rctl_rule_free()
1108		 * might end up calling uifree(), this would lead
1109		 * to lock recursion.  Use taskqueue to avoid this.
1110		 */
1111		TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1112		taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1113	}
1114}
1115
1116static int
1117rctl_rule_fully_specified(const struct rctl_rule *rule)
1118{
1119
1120	ASSERT_RACCT_ENABLED();
1121
1122	switch (rule->rr_subject_type) {
1123	case RCTL_SUBJECT_TYPE_UNDEFINED:
1124		return (0);
1125	case RCTL_SUBJECT_TYPE_PROCESS:
1126		if (rule->rr_subject.rs_proc == NULL)
1127			return (0);
1128		break;
1129	case RCTL_SUBJECT_TYPE_USER:
1130		if (rule->rr_subject.rs_uip == NULL)
1131			return (0);
1132		break;
1133	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1134		if (rule->rr_subject.rs_loginclass == NULL)
1135			return (0);
1136		break;
1137	case RCTL_SUBJECT_TYPE_JAIL:
1138		if (rule->rr_subject.rs_prison_racct == NULL)
1139			return (0);
1140		break;
1141	default:
1142		panic("rctl_rule_fully_specified: unknown subject type %d",
1143		    rule->rr_subject_type);
1144	}
1145	if (rule->rr_resource == RACCT_UNDEFINED)
1146		return (0);
1147	if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1148		return (0);
1149	if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1150		return (0);
1151	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1152		return (0);
1153
1154	return (1);
1155}
1156
1157static int
1158rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
1159{
1160	struct rctl_rule *rule;
1161	char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
1162	     *amountstr, *perstr;
1163	id_t id;
1164	int error = 0;
1165
1166	ASSERT_RACCT_ENABLED();
1167
1168	rule = rctl_rule_alloc(M_WAITOK);
1169
1170	subjectstr = strsep(&rulestr, ":");
1171	subject_idstr = strsep(&rulestr, ":");
1172	resourcestr = strsep(&rulestr, ":");
1173	actionstr = strsep(&rulestr, "=/");
1174	amountstr = strsep(&rulestr, "/");
1175	perstr = rulestr;
1176
1177	if (subjectstr == NULL || subjectstr[0] == '\0')
1178		rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1179	else {
1180		error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
1181		if (error != 0)
1182			goto out;
1183	}
1184
1185	if (subject_idstr == NULL || subject_idstr[0] == '\0') {
1186		rule->rr_subject.rs_proc = NULL;
1187		rule->rr_subject.rs_uip = NULL;
1188		rule->rr_subject.rs_loginclass = NULL;
1189		rule->rr_subject.rs_prison_racct = NULL;
1190	} else {
1191		switch (rule->rr_subject_type) {
1192		case RCTL_SUBJECT_TYPE_UNDEFINED:
1193			error = EINVAL;
1194			goto out;
1195		case RCTL_SUBJECT_TYPE_PROCESS:
1196			error = str2id(subject_idstr, &id);
1197			if (error != 0)
1198				goto out;
1199			sx_assert(&allproc_lock, SA_LOCKED);
1200			rule->rr_subject.rs_proc = pfind(id);
1201			if (rule->rr_subject.rs_proc == NULL) {
1202				error = ESRCH;
1203				goto out;
1204			}
1205			PROC_UNLOCK(rule->rr_subject.rs_proc);
1206			break;
1207		case RCTL_SUBJECT_TYPE_USER:
1208			error = str2id(subject_idstr, &id);
1209			if (error != 0)
1210				goto out;
1211			rule->rr_subject.rs_uip = uifind(id);
1212			break;
1213		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1214			rule->rr_subject.rs_loginclass =
1215			    loginclass_find(subject_idstr);
1216			if (rule->rr_subject.rs_loginclass == NULL) {
1217				error = ENAMETOOLONG;
1218				goto out;
1219			}
1220			break;
1221		case RCTL_SUBJECT_TYPE_JAIL:
1222			rule->rr_subject.rs_prison_racct =
1223			    prison_racct_find(subject_idstr);
1224			if (rule->rr_subject.rs_prison_racct == NULL) {
1225				error = ENAMETOOLONG;
1226				goto out;
1227			}
1228			break;
1229               default:
1230                       panic("rctl_string_to_rule: unknown subject type %d",
1231                           rule->rr_subject_type);
1232               }
1233	}
1234
1235	if (resourcestr == NULL || resourcestr[0] == '\0')
1236		rule->rr_resource = RACCT_UNDEFINED;
1237	else {
1238		error = str2value(resourcestr, &rule->rr_resource,
1239		    resourcenames);
1240		if (error != 0)
1241			goto out;
1242	}
1243
1244	if (actionstr == NULL || actionstr[0] == '\0')
1245		rule->rr_action = RCTL_ACTION_UNDEFINED;
1246	else {
1247		error = str2value(actionstr, &rule->rr_action, actionnames);
1248		if (error != 0)
1249			goto out;
1250	}
1251
1252	if (amountstr == NULL || amountstr[0] == '\0')
1253		rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1254	else {
1255		error = str2int64(amountstr, &rule->rr_amount);
1256		if (error != 0)
1257			goto out;
1258		if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
1259			if (rule->rr_amount > INT64_MAX / 1000000) {
1260				error = ERANGE;
1261				goto out;
1262			}
1263			rule->rr_amount *= 1000000;
1264		}
1265	}
1266
1267	if (perstr == NULL || perstr[0] == '\0')
1268		rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1269	else {
1270		error = str2value(perstr, &rule->rr_per, subjectnames);
1271		if (error != 0)
1272			goto out;
1273	}
1274
1275out:
1276	if (error == 0)
1277		*rulep = rule;
1278	else
1279		rctl_rule_release(rule);
1280
1281	return (error);
1282}
1283
1284/*
1285 * Link a rule with all the subjects it applies to.
1286 */
1287int
1288rctl_rule_add(struct rctl_rule *rule)
1289{
1290	struct proc *p;
1291	struct ucred *cred;
1292	struct uidinfo *uip;
1293	struct prison *pr;
1294	struct prison_racct *prr;
1295	struct loginclass *lc;
1296	struct rctl_rule *rule2;
1297	int match;
1298
1299	ASSERT_RACCT_ENABLED();
1300	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1301
1302	/*
1303	 * Some rules just don't make sense, like "deny" rule for an undeniable
1304	 * resource.  The exception are the RSS and %CPU resources - they are
1305	 * not deniable in the racct sense, but the limit is enforced in
1306	 * a different way.
1307	 */
1308	if (rule->rr_action == RCTL_ACTION_DENY &&
1309	    !RACCT_IS_DENIABLE(rule->rr_resource) &&
1310	    rule->rr_resource != RACCT_RSS &&
1311	    rule->rr_resource != RACCT_PCTCPU) {
1312		return (EOPNOTSUPP);
1313	}
1314
1315	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1316	    !RACCT_IS_DECAYING(rule->rr_resource)) {
1317		return (EOPNOTSUPP);
1318	}
1319
1320	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1321	    rule->rr_resource == RACCT_PCTCPU) {
1322		return (EOPNOTSUPP);
1323	}
1324
1325	if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1326	    RACCT_IS_SLOPPY(rule->rr_resource)) {
1327		return (EOPNOTSUPP);
1328	}
1329
1330	/*
1331	 * Make sure there are no duplicated rules.  Also, for the "deny"
1332	 * rules, remove ones differing only by "amount".
1333	 */
1334	if (rule->rr_action == RCTL_ACTION_DENY) {
1335		rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1336		rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1337		rctl_rule_remove(rule2);
1338		rctl_rule_release(rule2);
1339	} else
1340		rctl_rule_remove(rule);
1341
1342	switch (rule->rr_subject_type) {
1343	case RCTL_SUBJECT_TYPE_PROCESS:
1344		p = rule->rr_subject.rs_proc;
1345		KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1346
1347		rctl_racct_add_rule(p->p_racct, rule);
1348		/*
1349		 * In case of per-process rule, we don't have anything more
1350		 * to do.
1351		 */
1352		return (0);
1353
1354	case RCTL_SUBJECT_TYPE_USER:
1355		uip = rule->rr_subject.rs_uip;
1356		KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1357		rctl_racct_add_rule(uip->ui_racct, rule);
1358		break;
1359
1360	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1361		lc = rule->rr_subject.rs_loginclass;
1362		KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1363		rctl_racct_add_rule(lc->lc_racct, rule);
1364		break;
1365
1366	case RCTL_SUBJECT_TYPE_JAIL:
1367		prr = rule->rr_subject.rs_prison_racct;
1368		KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1369		rctl_racct_add_rule(prr->prr_racct, rule);
1370		break;
1371
1372	default:
1373		panic("rctl_rule_add: unknown subject type %d",
1374		    rule->rr_subject_type);
1375	}
1376
1377	/*
1378	 * Now go through all the processes and add the new rule to the ones
1379	 * it applies to.
1380	 */
1381	sx_assert(&allproc_lock, SA_LOCKED);
1382	FOREACH_PROC_IN_SYSTEM(p) {
1383		cred = p->p_ucred;
1384		switch (rule->rr_subject_type) {
1385		case RCTL_SUBJECT_TYPE_USER:
1386			if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1387			    cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1388				break;
1389			continue;
1390		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1391			if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1392				break;
1393			continue;
1394		case RCTL_SUBJECT_TYPE_JAIL:
1395			match = 0;
1396			for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1397				if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1398					match = 1;
1399					break;
1400				}
1401			}
1402			if (match)
1403				break;
1404			continue;
1405		default:
1406			panic("rctl_rule_add: unknown subject type %d",
1407			    rule->rr_subject_type);
1408		}
1409
1410		rctl_racct_add_rule(p->p_racct, rule);
1411	}
1412
1413	return (0);
1414}
1415
/*
 * "Pre" hook passed to the *_racct_foreach() iterators: takes the RCTL
 * write lock.
 */
static void
rctl_rule_pre_callback(void)
{

	RCTL_WLOCK();
}
1422
/*
 * "Post" hook passed to the *_racct_foreach() iterators: drops the RCTL
 * write lock.
 */
static void
rctl_rule_post_callback(void)
{

	RCTL_WUNLOCK();
}
1429
/*
 * Per-racct callback for rctl_rule_remove(): removes the rules matching
 * the filter ('arg2') from 'racct' and adds the number removed to the
 * int counter that 'arg3' points to.  Asserts that the RCTL write lock
 * is held.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	struct rctl_rule *filter = arg2;
	int *foundp = arg3;

	ASSERT_RACCT_ENABLED();
	RCTL_WLOCK_ASSERT();

	*foundp += rctl_racct_remove_rules(racct, filter);
}
1443
1444/*
1445 * Remove all rules that match the filter.
1446 */
1447int
1448rctl_rule_remove(struct rctl_rule *filter)
1449{
1450	struct proc *p;
1451	int found = 0;
1452
1453	ASSERT_RACCT_ENABLED();
1454
1455	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1456	    filter->rr_subject.rs_proc != NULL) {
1457		p = filter->rr_subject.rs_proc;
1458		RCTL_WLOCK();
1459		found = rctl_racct_remove_rules(p->p_racct, filter);
1460		RCTL_WUNLOCK();
1461		if (found)
1462			return (0);
1463		return (ESRCH);
1464	}
1465
1466	loginclass_racct_foreach(rctl_rule_remove_callback,
1467	    rctl_rule_pre_callback, rctl_rule_post_callback,
1468	    filter, (void *)&found);
1469	ui_racct_foreach(rctl_rule_remove_callback,
1470	    rctl_rule_pre_callback, rctl_rule_post_callback,
1471	    filter, (void *)&found);
1472	prison_racct_foreach(rctl_rule_remove_callback,
1473	    rctl_rule_pre_callback, rctl_rule_post_callback,
1474	    filter, (void *)&found);
1475
1476	sx_assert(&allproc_lock, SA_LOCKED);
1477	RCTL_WLOCK();
1478	FOREACH_PROC_IN_SYSTEM(p) {
1479		found += rctl_racct_remove_rules(p->p_racct, filter);
1480	}
1481	RCTL_WUNLOCK();
1482
1483	if (found)
1484		return (0);
1485	return (ESRCH);
1486}
1487
1488/*
1489 * Appends a rule to the sbuf.
1490 */
1491static void
1492rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1493{
1494	int64_t amount;
1495
1496	ASSERT_RACCT_ENABLED();
1497
1498	sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1499
1500	switch (rule->rr_subject_type) {
1501	case RCTL_SUBJECT_TYPE_PROCESS:
1502		if (rule->rr_subject.rs_proc == NULL)
1503			sbuf_printf(sb, ":");
1504		else
1505			sbuf_printf(sb, "%d:",
1506			    rule->rr_subject.rs_proc->p_pid);
1507		break;
1508	case RCTL_SUBJECT_TYPE_USER:
1509		if (rule->rr_subject.rs_uip == NULL)
1510			sbuf_printf(sb, ":");
1511		else
1512			sbuf_printf(sb, "%d:",
1513			    rule->rr_subject.rs_uip->ui_uid);
1514		break;
1515	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1516		if (rule->rr_subject.rs_loginclass == NULL)
1517			sbuf_printf(sb, ":");
1518		else
1519			sbuf_printf(sb, "%s:",
1520			    rule->rr_subject.rs_loginclass->lc_name);
1521		break;
1522	case RCTL_SUBJECT_TYPE_JAIL:
1523		if (rule->rr_subject.rs_prison_racct == NULL)
1524			sbuf_printf(sb, ":");
1525		else
1526			sbuf_printf(sb, "%s:",
1527			    rule->rr_subject.rs_prison_racct->prr_name);
1528		break;
1529	default:
1530		panic("rctl_rule_to_sbuf: unknown subject type %d",
1531		    rule->rr_subject_type);
1532	}
1533
1534	amount = rule->rr_amount;
1535	if (amount != RCTL_AMOUNT_UNDEFINED &&
1536	    RACCT_IS_IN_MILLIONS(rule->rr_resource))
1537		amount /= 1000000;
1538
1539	sbuf_printf(sb, "%s:%s=%jd",
1540	    rctl_resource_name(rule->rr_resource),
1541	    rctl_action_name(rule->rr_action),
1542	    amount);
1543
1544	if (rule->rr_per != rule->rr_subject_type)
1545		sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1546}
1547
1548/*
1549 * Routine used by RCTL syscalls to read in input string.
1550 */
1551static int
1552rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1553{
1554	char *str;
1555	int error;
1556
1557	ASSERT_RACCT_ENABLED();
1558
1559	if (inbuflen <= 0)
1560		return (EINVAL);
1561	if (inbuflen > RCTL_MAX_INBUFSIZE)
1562		return (E2BIG);
1563
1564	str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1565	error = copyinstr(inbufp, str, inbuflen, NULL);
1566	if (error != 0) {
1567		free(str, M_RCTL);
1568		return (error);
1569	}
1570
1571	*inputstr = str;
1572
1573	return (0);
1574}
1575
1576/*
1577 * Routine used by RCTL syscalls to write out output string.
1578 */
1579static int
1580rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
1581{
1582	int error;
1583
1584	ASSERT_RACCT_ENABLED();
1585
1586	if (outputsbuf == NULL)
1587		return (0);
1588
1589	sbuf_finish(outputsbuf);
1590	if (outbuflen < sbuf_len(outputsbuf) + 1) {
1591		sbuf_delete(outputsbuf);
1592		return (ERANGE);
1593	}
1594	error = copyout(sbuf_data(outputsbuf), outbufp,
1595	    sbuf_len(outputsbuf) + 1);
1596	sbuf_delete(outputsbuf);
1597	return (error);
1598}
1599
1600static struct sbuf *
1601rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1602{
1603	struct sbuf *sb;
1604	int64_t amount;
1605	int i;
1606
1607	ASSERT_RACCT_ENABLED();
1608
1609	sb = sbuf_new_auto();
1610	for (i = 0; i <= RACCT_MAX; i++) {
1611		if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1612			continue;
1613		amount = racct->r_resources[i];
1614		if (RACCT_IS_IN_MILLIONS(i))
1615			amount /= 1000000;
1616		sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1617	}
1618	sbuf_setpos(sb, sbuf_len(sb) - 1);
1619	return (sb);
1620}
1621
1622int
1623sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1624{
1625	struct rctl_rule *filter;
1626	struct sbuf *outputsbuf = NULL;
1627	struct proc *p;
1628	struct uidinfo *uip;
1629	struct loginclass *lc;
1630	struct prison_racct *prr;
1631	char *inputstr;
1632	int error;
1633
1634	if (!racct_enable)
1635		return (ENOSYS);
1636
1637	error = priv_check(td, PRIV_RCTL_GET_RACCT);
1638	if (error != 0)
1639		return (error);
1640
1641	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1642	if (error != 0)
1643		return (error);
1644
1645	sx_slock(&allproc_lock);
1646	error = rctl_string_to_rule(inputstr, &filter);
1647	free(inputstr, M_RCTL);
1648	if (error != 0) {
1649		sx_sunlock(&allproc_lock);
1650		return (error);
1651	}
1652
1653	switch (filter->rr_subject_type) {
1654	case RCTL_SUBJECT_TYPE_PROCESS:
1655		p = filter->rr_subject.rs_proc;
1656		if (p == NULL) {
1657			error = EINVAL;
1658			goto out;
1659		}
1660		outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1661		break;
1662	case RCTL_SUBJECT_TYPE_USER:
1663		uip = filter->rr_subject.rs_uip;
1664		if (uip == NULL) {
1665			error = EINVAL;
1666			goto out;
1667		}
1668		outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1669		break;
1670	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1671		lc = filter->rr_subject.rs_loginclass;
1672		if (lc == NULL) {
1673			error = EINVAL;
1674			goto out;
1675		}
1676		outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1677		break;
1678	case RCTL_SUBJECT_TYPE_JAIL:
1679		prr = filter->rr_subject.rs_prison_racct;
1680		if (prr == NULL) {
1681			error = EINVAL;
1682			goto out;
1683		}
1684		outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1685		break;
1686	default:
1687		error = EINVAL;
1688	}
1689out:
1690	rctl_rule_release(filter);
1691	sx_sunlock(&allproc_lock);
1692	if (error != 0)
1693		return (error);
1694
1695	error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1696
1697	return (error);
1698}
1699
1700static void
1701rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1702{
1703	struct rctl_rule *filter = (struct rctl_rule *)arg2;
1704	struct rctl_rule_link *link;
1705	struct sbuf *sb = (struct sbuf *)arg3;
1706
1707	ASSERT_RACCT_ENABLED();
1708	RCTL_LOCK_ASSERT();
1709
1710	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1711		if (!rctl_rule_matches(link->rrl_rule, filter))
1712			continue;
1713		rctl_rule_to_sbuf(sb, link->rrl_rule);
1714		sbuf_printf(sb, ",");
1715	}
1716}
1717
1718int
1719sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1720{
1721	struct sbuf *sb;
1722	struct rctl_rule *filter;
1723	struct rctl_rule_link *link;
1724	struct proc *p;
1725	char *inputstr, *buf;
1726	size_t bufsize;
1727	int error;
1728
1729	if (!racct_enable)
1730		return (ENOSYS);
1731
1732	error = priv_check(td, PRIV_RCTL_GET_RULES);
1733	if (error != 0)
1734		return (error);
1735
1736	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1737	if (error != 0)
1738		return (error);
1739
1740	sx_slock(&allproc_lock);
1741	error = rctl_string_to_rule(inputstr, &filter);
1742	free(inputstr, M_RCTL);
1743	if (error != 0) {
1744		sx_sunlock(&allproc_lock);
1745		return (error);
1746	}
1747
1748	bufsize = uap->outbuflen;
1749	if (bufsize > rctl_maxbufsize) {
1750		sx_sunlock(&allproc_lock);
1751		return (E2BIG);
1752	}
1753
1754	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1755	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1756	KASSERT(sb != NULL, ("sbuf_new failed"));
1757
1758	FOREACH_PROC_IN_SYSTEM(p) {
1759		RCTL_RLOCK();
1760		LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1761			/*
1762			 * Non-process rules will be added to the buffer later.
1763			 * Adding them here would result in duplicated output.
1764			 */
1765			if (link->rrl_rule->rr_subject_type !=
1766			    RCTL_SUBJECT_TYPE_PROCESS)
1767				continue;
1768			if (!rctl_rule_matches(link->rrl_rule, filter))
1769				continue;
1770			rctl_rule_to_sbuf(sb, link->rrl_rule);
1771			sbuf_printf(sb, ",");
1772		}
1773		RCTL_RUNLOCK();
1774	}
1775
1776	loginclass_racct_foreach(rctl_get_rules_callback,
1777	    rctl_rule_pre_callback, rctl_rule_post_callback,
1778	    filter, sb);
1779	ui_racct_foreach(rctl_get_rules_callback,
1780	    rctl_rule_pre_callback, rctl_rule_post_callback,
1781	    filter, sb);
1782	prison_racct_foreach(rctl_get_rules_callback,
1783	    rctl_rule_pre_callback, rctl_rule_post_callback,
1784	    filter, sb);
1785	if (sbuf_error(sb) == ENOMEM) {
1786		error = ERANGE;
1787		goto out;
1788	}
1789
1790	/*
1791	 * Remove trailing ",".
1792	 */
1793	if (sbuf_len(sb) > 0)
1794		sbuf_setpos(sb, sbuf_len(sb) - 1);
1795
1796	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1797out:
1798	rctl_rule_release(filter);
1799	sx_sunlock(&allproc_lock);
1800	free(buf, M_RCTL);
1801	return (error);
1802}
1803
1804int
1805sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1806{
1807	struct sbuf *sb;
1808	struct rctl_rule *filter;
1809	struct rctl_rule_link *link;
1810	char *inputstr, *buf;
1811	size_t bufsize;
1812	int error;
1813
1814	if (!racct_enable)
1815		return (ENOSYS);
1816
1817	error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1818	if (error != 0)
1819		return (error);
1820
1821	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1822	if (error != 0)
1823		return (error);
1824
1825	sx_slock(&allproc_lock);
1826	error = rctl_string_to_rule(inputstr, &filter);
1827	free(inputstr, M_RCTL);
1828	if (error != 0) {
1829		sx_sunlock(&allproc_lock);
1830		return (error);
1831	}
1832
1833	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1834		rctl_rule_release(filter);
1835		sx_sunlock(&allproc_lock);
1836		return (EINVAL);
1837	}
1838	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1839		rctl_rule_release(filter);
1840		sx_sunlock(&allproc_lock);
1841		return (EOPNOTSUPP);
1842	}
1843	if (filter->rr_subject.rs_proc == NULL) {
1844		rctl_rule_release(filter);
1845		sx_sunlock(&allproc_lock);
1846		return (EINVAL);
1847	}
1848
1849	bufsize = uap->outbuflen;
1850	if (bufsize > rctl_maxbufsize) {
1851		rctl_rule_release(filter);
1852		sx_sunlock(&allproc_lock);
1853		return (E2BIG);
1854	}
1855
1856	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1857	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1858	KASSERT(sb != NULL, ("sbuf_new failed"));
1859
1860	RCTL_RLOCK();
1861	LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1862	    rrl_next) {
1863		rctl_rule_to_sbuf(sb, link->rrl_rule);
1864		sbuf_printf(sb, ",");
1865	}
1866	RCTL_RUNLOCK();
1867	if (sbuf_error(sb) == ENOMEM) {
1868		error = ERANGE;
1869		sbuf_delete(sb);
1870		goto out;
1871	}
1872
1873	/*
1874	 * Remove trailing ",".
1875	 */
1876	if (sbuf_len(sb) > 0)
1877		sbuf_setpos(sb, sbuf_len(sb) - 1);
1878
1879	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1880out:
1881	rctl_rule_release(filter);
1882	sx_sunlock(&allproc_lock);
1883	free(buf, M_RCTL);
1884	return (error);
1885}
1886
1887int
1888sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1889{
1890	struct rctl_rule *rule;
1891	char *inputstr;
1892	int error;
1893
1894	if (!racct_enable)
1895		return (ENOSYS);
1896
1897	error = priv_check(td, PRIV_RCTL_ADD_RULE);
1898	if (error != 0)
1899		return (error);
1900
1901	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1902	if (error != 0)
1903		return (error);
1904
1905	sx_slock(&allproc_lock);
1906	error = rctl_string_to_rule(inputstr, &rule);
1907	free(inputstr, M_RCTL);
1908	if (error != 0) {
1909		sx_sunlock(&allproc_lock);
1910		return (error);
1911	}
1912	/*
1913	 * The 'per' part of a rule is optional.
1914	 */
1915	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1916	    rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1917		rule->rr_per = rule->rr_subject_type;
1918
1919	if (!rctl_rule_fully_specified(rule)) {
1920		error = EINVAL;
1921		goto out;
1922	}
1923
1924	error = rctl_rule_add(rule);
1925
1926out:
1927	rctl_rule_release(rule);
1928	sx_sunlock(&allproc_lock);
1929	return (error);
1930}
1931
1932int
1933sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1934{
1935	struct rctl_rule *filter;
1936	char *inputstr;
1937	int error;
1938
1939	if (!racct_enable)
1940		return (ENOSYS);
1941
1942	error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1943	if (error != 0)
1944		return (error);
1945
1946	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1947	if (error != 0)
1948		return (error);
1949
1950	sx_slock(&allproc_lock);
1951	error = rctl_string_to_rule(inputstr, &filter);
1952	free(inputstr, M_RCTL);
1953	if (error != 0) {
1954		sx_sunlock(&allproc_lock);
1955		return (error);
1956	}
1957
1958	error = rctl_rule_remove(filter);
1959	rctl_rule_release(filter);
1960	sx_sunlock(&allproc_lock);
1961
1962	return (error);
1963}
1964
1965/*
1966 * Update RCTL rule list after credential change.
1967 */
1968void
1969rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
1970{
1971	LIST_HEAD(, rctl_rule_link) newrules;
1972	struct rctl_rule_link *link, *newlink;
1973	struct uidinfo *newuip;
1974	struct loginclass *newlc;
1975	struct prison_racct *newprr;
1976	int rulecnt, i;
1977
1978	ASSERT_RACCT_ENABLED();
1979
1980	newuip = newcred->cr_ruidinfo;
1981	newlc = newcred->cr_loginclass;
1982	newprr = newcred->cr_prison->pr_prison_racct;
1983
1984	LIST_INIT(&newrules);
1985
1986again:
1987	/*
1988	 * First, count the rules that apply to the process with new
1989	 * credentials.
1990	 */
1991	rulecnt = 0;
1992	RCTL_RLOCK();
1993	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1994		if (link->rrl_rule->rr_subject_type ==
1995		    RCTL_SUBJECT_TYPE_PROCESS)
1996			rulecnt++;
1997	}
1998	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
1999		rulecnt++;
2000	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
2001		rulecnt++;
2002	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
2003		rulecnt++;
2004	RCTL_RUNLOCK();
2005
2006	/*
2007	 * Create temporary list.  We've dropped the rctl_lock in order
2008	 * to use M_WAITOK.
2009	 */
2010	for (i = 0; i < rulecnt; i++) {
2011		newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
2012		newlink->rrl_rule = NULL;
2013		newlink->rrl_exceeded = 0;
2014		LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
2015	}
2016
2017	newlink = LIST_FIRST(&newrules);
2018
2019	/*
2020	 * Assign rules to the newly allocated list entries.
2021	 */
2022	RCTL_WLOCK();
2023	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
2024		if (link->rrl_rule->rr_subject_type ==
2025		    RCTL_SUBJECT_TYPE_PROCESS) {
2026			if (newlink == NULL)
2027				goto goaround;
2028			rctl_rule_acquire(link->rrl_rule);
2029			newlink->rrl_rule = link->rrl_rule;
2030			newlink->rrl_exceeded = link->rrl_exceeded;
2031			newlink = LIST_NEXT(newlink, rrl_next);
2032			rulecnt--;
2033		}
2034	}
2035
2036	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
2037		if (newlink == NULL)
2038			goto goaround;
2039		rctl_rule_acquire(link->rrl_rule);
2040		newlink->rrl_rule = link->rrl_rule;
2041		newlink->rrl_exceeded = link->rrl_exceeded;
2042		newlink = LIST_NEXT(newlink, rrl_next);
2043		rulecnt--;
2044	}
2045
2046	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
2047		if (newlink == NULL)
2048			goto goaround;
2049		rctl_rule_acquire(link->rrl_rule);
2050		newlink->rrl_rule = link->rrl_rule;
2051		newlink->rrl_exceeded = link->rrl_exceeded;
2052		newlink = LIST_NEXT(newlink, rrl_next);
2053		rulecnt--;
2054	}
2055
2056	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
2057		if (newlink == NULL)
2058			goto goaround;
2059		rctl_rule_acquire(link->rrl_rule);
2060		newlink->rrl_rule = link->rrl_rule;
2061		newlink->rrl_exceeded = link->rrl_exceeded;
2062		newlink = LIST_NEXT(newlink, rrl_next);
2063		rulecnt--;
2064	}
2065
2066	if (rulecnt == 0) {
2067		/*
2068		 * Free the old rule list.
2069		 */
2070		while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
2071			link = LIST_FIRST(&p->p_racct->r_rule_links);
2072			LIST_REMOVE(link, rrl_next);
2073			rctl_rule_release(link->rrl_rule);
2074			uma_zfree(rctl_rule_link_zone, link);
2075		}
2076
2077		/*
2078		 * Replace lists and we're done.
2079		 *
2080		 * XXX: Is there any way to switch list heads instead
2081		 *      of iterating here?
2082		 */
2083		while (!LIST_EMPTY(&newrules)) {
2084			newlink = LIST_FIRST(&newrules);
2085			LIST_REMOVE(newlink, rrl_next);
2086			LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
2087			    newlink, rrl_next);
2088		}
2089
2090		RCTL_WUNLOCK();
2091
2092		return;
2093	}
2094
2095goaround:
2096	RCTL_WUNLOCK();
2097
2098	/*
2099	 * Rule list changed while we were not holding the rctl_lock.
2100	 * Free the new list and try again.
2101	 */
2102	while (!LIST_EMPTY(&newrules)) {
2103		newlink = LIST_FIRST(&newrules);
2104		LIST_REMOVE(newlink, rrl_next);
2105		if (newlink->rrl_rule != NULL)
2106			rctl_rule_release(newlink->rrl_rule);
2107		uma_zfree(rctl_rule_link_zone, newlink);
2108	}
2109
2110	goto again;
2111}
2112
2113/*
2114 * Assign RCTL rules to the newly created process.
2115 */
2116int
2117rctl_proc_fork(struct proc *parent, struct proc *child)
2118{
2119	struct rctl_rule *rule;
2120	struct rctl_rule_link *link;
2121	int error;
2122
2123	LIST_INIT(&child->p_racct->r_rule_links);
2124
2125	ASSERT_RACCT_ENABLED();
2126	KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2127
2128	RCTL_WLOCK();
2129
2130	/*
2131	 * Go through limits applicable to the parent and assign them
2132	 * to the child.  Rules with 'process' subject have to be duplicated
2133	 * in order to make their rr_subject point to the new process.
2134	 */
2135	LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2136		if (link->rrl_rule->rr_subject_type ==
2137		    RCTL_SUBJECT_TYPE_PROCESS) {
2138			rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2139			if (rule == NULL)
2140				goto fail;
2141			KASSERT(rule->rr_subject.rs_proc == parent,
2142			    ("rule->rr_subject.rs_proc != parent"));
2143			rule->rr_subject.rs_proc = child;
2144			error = rctl_racct_add_rule_locked(child->p_racct,
2145			    rule);
2146			rctl_rule_release(rule);
2147			if (error != 0)
2148				goto fail;
2149		} else {
2150			error = rctl_racct_add_rule_locked(child->p_racct,
2151			    link->rrl_rule);
2152			if (error != 0)
2153				goto fail;
2154		}
2155	}
2156
2157	RCTL_WUNLOCK();
2158	return (0);
2159
2160fail:
2161	while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2162		link = LIST_FIRST(&child->p_racct->r_rule_links);
2163		LIST_REMOVE(link, rrl_next);
2164		rctl_rule_release(link->rrl_rule);
2165		uma_zfree(rctl_rule_link_zone, link);
2166	}
2167	RCTL_WUNLOCK();
2168	return (EAGAIN);
2169}
2170
2171/*
2172 * Release rules attached to the racct.
2173 */
2174void
2175rctl_racct_release(struct racct *racct)
2176{
2177	struct rctl_rule_link *link;
2178
2179	ASSERT_RACCT_ENABLED();
2180
2181	RCTL_WLOCK();
2182	while (!LIST_EMPTY(&racct->r_rule_links)) {
2183		link = LIST_FIRST(&racct->r_rule_links);
2184		LIST_REMOVE(link, rrl_next);
2185		rctl_rule_release(link->rrl_rule);
2186		uma_zfree(rctl_rule_link_zone, link);
2187	}
2188	RCTL_WUNLOCK();
2189}
2190
2191static void
2192rctl_init(void)
2193{
2194
2195	if (!racct_enable)
2196		return;
2197
2198	rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2199	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2200	rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2201	    sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2202	    UMA_ALIGN_PTR, 0);
2203
2204	/*
2205	 * Set default values, making sure not to overwrite the ones
2206	 * fetched from tunables.  Most of those could be set at the
2207	 * declaration, except for the rctl_throttle_max - we cannot
2208	 * set it there due to hz not being compile time constant.
2209	 */
2210	if (rctl_throttle_min < 1)
2211		rctl_throttle_min = 1;
2212	if (rctl_throttle_max < rctl_throttle_min)
2213		rctl_throttle_max = 2 * hz;
2214	if (rctl_throttle_pct < 0)
2215		rctl_throttle_pct = 100;
2216	if (rctl_throttle_pct2 < 0)
2217		rctl_throttle_pct2 = 100;
2218}
2219
2220#else /* !RCTL */
2221
/*
 * Stub used when the kernel is built without RCTL: always fails with
 * ENOSYS.
 */
int
sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{

	return (ENOSYS);
}
2228
/*
 * Stub used when the kernel is built without RCTL: always fails with
 * ENOSYS.
 */
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{

	return (ENOSYS);
}
2235
/*
 * Stub used when the kernel is built without RCTL: always fails with
 * ENOSYS.
 */
int
sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{

	return (ENOSYS);
}
2242
/*
 * Stub used when the kernel is built without RCTL: always fails with
 * ENOSYS.
 */
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{

	return (ENOSYS);
}
2249
/*
 * Stub used when the kernel is built without RCTL: always fails with
 * ENOSYS.
 */
int
sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{

	return (ENOSYS);
}
2256
2257#endif /* !RCTL */
2258