kern_rctl.c revision 298050
1/*-
2 * Copyright (c) 2010 The FreeBSD Foundation
3 * All rights reserved.
4 *
5 * This software was developed by Edward Tomasz Napierala under sponsorship
6 * from the FreeBSD Foundation.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * $FreeBSD: head/sys/kern/kern_rctl.c 298050 2016-04-15 13:34:59Z trasz $
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/kern/kern_rctl.c 298050 2016-04-15 13:34:59Z trasz $");
34
35#include <sys/param.h>
36#include <sys/bus.h>
37#include <sys/malloc.h>
38#include <sys/queue.h>
39#include <sys/refcount.h>
40#include <sys/jail.h>
41#include <sys/kernel.h>
42#include <sys/limits.h>
43#include <sys/loginclass.h>
44#include <sys/priv.h>
45#include <sys/proc.h>
46#include <sys/racct.h>
47#include <sys/rctl.h>
48#include <sys/resourcevar.h>
49#include <sys/sx.h>
50#include <sys/sysent.h>
51#include <sys/sysproto.h>
52#include <sys/systm.h>
53#include <sys/types.h>
54#include <sys/eventhandler.h>
55#include <sys/lock.h>
56#include <sys/mutex.h>
57#include <sys/rwlock.h>
58#include <sys/sbuf.h>
59#include <sys/taskqueue.h>
60#include <sys/tree.h>
61#include <vm/uma.h>
62
63#ifdef RCTL
64#ifndef RACCT
65#error "The RCTL option requires the RACCT option"
66#endif
67
68FEATURE(rctl, "Resource Limits");
69
#define	HRF_DEFAULT		0
#define	HRF_DONT_INHERIT	1
#define	HRF_DONT_ACCUMULATE	2

/*
 * Buffer sizes.  The products are parenthesized so that the macros expand
 * correctly inside larger expressions, e.g. as a divisor.
 */
#define	RCTL_MAX_INBUFSIZE	(4 * 1024)
#define	RCTL_MAX_OUTBUFSIZE	(16 * 1024 * 1024)
#define	RCTL_LOG_BUFSIZE	128

/*
 * Margin that rctl_pcpu_available() subtracts from the available %CPU
 * amount when the limit is more than twice this value.
 */
#define	RCTL_PCPU_SHIFT		(10 * 1000000)
79
/* Exported as kern.racct.rctl.maxbufsize. */
static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
/*
 * Rate limits passed to ppsratecheck() by rctl_enforce() for the "log"
 * and "devctl" actions, respectively.
 */
static int rctl_log_rate_limit = 10;
static int rctl_devctl_rate_limit = 10;

/*
 * Values below are initialized in rctl_init().
 */
static int rctl_throttle_min = -1;
static int rctl_throttle_max = -1;
static int rctl_throttle_pct = -1;
static int rctl_throttle_pct2 = -1;

/*
 * Sysctl handlers for the four throttle_* values above; each one validates
 * the new value before storing it under the RCTL write lock.
 */
static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);
96
/*
 * The kern.racct.rctl sysctl subtree.  maxbufsize and the two rate limits
 * are plain variables; the throttle_* knobs go through handler functions
 * so that new values can be range-checked.  Each throttle_* knob is also
 * registered with TUNABLE_INT under the same name.
 */
SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
    &rctl_maxbufsize, 0, "Maximum output buffer size");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
    &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
    &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_min_sysctl, "IU",
    "Shortest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_max_sysctl, "IU",
    "Longest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct_sysctl, "IU",
    "Throttling penalty for process consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct2_sysctl, "IU",
    "Throttling penalty for container consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
120
121/*
122 * 'rctl_rule_link' connects a rule with every racct it's related to.
123 * For example, rule 'user:X:openfiles:deny=N/process' is linked
124 * with uidinfo for user X, and to each process of that user.
125 */
struct rctl_rule_link {
	/* Linkage on the owning racct's r_rule_links list. */
	LIST_ENTRY(rctl_rule_link)	rrl_next;
	/* The linked rule; a reference is held for as long as the link exists. */
	struct rctl_rule		*rrl_rule;
	/*
	 * Set to 1 by rctl_enforce() once a log, devctl or signal action
	 * has fired for this link, and reset to 0 when a request fits
	 * within the rule again, so that such actions are not repeated
	 * while the limit stays exceeded.
	 */
	int				rrl_exceeded;
};
131
/*
 * One entry of a name <-> value translation table.  Tables are arrays
 * terminated by an entry whose d_name is NULL.
 */
struct dict {
	const char	*d_name;	/* textual name */
	int		d_value;	/* corresponding numeric constant */
};
136
/* Names of the subject types, mapped to RCTL_SUBJECT_TYPE_* values. */
static struct dict subjectnames[] = {
	{ "process", RCTL_SUBJECT_TYPE_PROCESS },
	{ "user", RCTL_SUBJECT_TYPE_USER },
	{ "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
	{ "jail", RCTL_SUBJECT_TYPE_JAIL },
	{ NULL, -1 }};
143
/* Names of the resources, mapped to RACCT_* resource identifiers. */
static struct dict resourcenames[] = {
	{ "cputime", RACCT_CPU },
	{ "datasize", RACCT_DATA },
	{ "stacksize", RACCT_STACK },
	{ "coredumpsize", RACCT_CORE },
	{ "memoryuse", RACCT_RSS },
	{ "memorylocked", RACCT_MEMLOCK },
	{ "maxproc", RACCT_NPROC },
	{ "openfiles", RACCT_NOFILE },
	{ "vmemoryuse", RACCT_VMEM },
	{ "pseudoterminals", RACCT_NPTS },
	{ "swapuse", RACCT_SWAP },
	{ "nthr", RACCT_NTHR },
	{ "msgqqueued", RACCT_MSGQQUEUED },
	{ "msgqsize", RACCT_MSGQSIZE },
	{ "nmsgq", RACCT_NMSGQ },
	{ "nsem", RACCT_NSEM },
	{ "nsemop", RACCT_NSEMOP },
	{ "nshm", RACCT_NSHM },
	{ "shmsize", RACCT_SHMSIZE },
	{ "wallclock", RACCT_WALLCLOCK },
	{ "pcpu", RACCT_PCTCPU },
	{ "readbps", RACCT_READBPS },
	{ "writebps", RACCT_WRITEBPS },
	{ "readiops", RACCT_READIOPS },
	{ "writeiops", RACCT_WRITEIOPS },
	{ NULL, -1 }};
171
/*
 * Names of the rule actions, mapped to RCTL_ACTION_* values.  The "sig*"
 * entries are the signal-sending actions; the last four are the non-signal
 * actions handled explicitly by rctl_enforce().
 */
static struct dict actionnames[] = {
	{ "sighup", RCTL_ACTION_SIGHUP },
	{ "sigint", RCTL_ACTION_SIGINT },
	{ "sigquit", RCTL_ACTION_SIGQUIT },
	{ "sigill", RCTL_ACTION_SIGILL },
	{ "sigtrap", RCTL_ACTION_SIGTRAP },
	{ "sigabrt", RCTL_ACTION_SIGABRT },
	{ "sigemt", RCTL_ACTION_SIGEMT },
	{ "sigfpe", RCTL_ACTION_SIGFPE },
	{ "sigkill", RCTL_ACTION_SIGKILL },
	{ "sigbus", RCTL_ACTION_SIGBUS },
	{ "sigsegv", RCTL_ACTION_SIGSEGV },
	{ "sigsys", RCTL_ACTION_SIGSYS },
	{ "sigpipe", RCTL_ACTION_SIGPIPE },
	{ "sigalrm", RCTL_ACTION_SIGALRM },
	{ "sigterm", RCTL_ACTION_SIGTERM },
	{ "sigurg", RCTL_ACTION_SIGURG },
	{ "sigstop", RCTL_ACTION_SIGSTOP },
	{ "sigtstp", RCTL_ACTION_SIGTSTP },
	{ "sigchld", RCTL_ACTION_SIGCHLD },
	{ "sigttin", RCTL_ACTION_SIGTTIN },
	{ "sigttou", RCTL_ACTION_SIGTTOU },
	{ "sigio", RCTL_ACTION_SIGIO },
	{ "sigxcpu", RCTL_ACTION_SIGXCPU },
	{ "sigxfsz", RCTL_ACTION_SIGXFSZ },
	{ "sigvtalrm", RCTL_ACTION_SIGVTALRM },
	{ "sigprof", RCTL_ACTION_SIGPROF },
	{ "sigwinch", RCTL_ACTION_SIGWINCH },
	{ "siginfo", RCTL_ACTION_SIGINFO },
	{ "sigusr1", RCTL_ACTION_SIGUSR1 },
	{ "sigusr2", RCTL_ACTION_SIGUSR2 },
	{ "sigthr", RCTL_ACTION_SIGTHR },
	{ "deny", RCTL_ACTION_DENY },
	{ "log", RCTL_ACTION_LOG },
	{ "devctl", RCTL_ACTION_DEVCTL },
	{ "throttle", RCTL_ACTION_THROTTLE },
	{ NULL, -1 }};
209
/* rctl_init() is registered to run via SYSINIT at SI_SUB_RACCT. */
static void rctl_init(void);
SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);

/* UMA zones backing struct rctl_rule and struct rctl_rule_link. */
static uma_zone_t rctl_rule_zone;
static uma_zone_t rctl_rule_link_zone;
/*
 * Reader/writer lock taken around accesses to the per-racct rule link
 * lists and around updates of the throttle_* settings.
 */
static struct rwlock rctl_lock;
RW_SYSINIT(rctl_lock, &rctl_lock, "RCTL lock");
217
/*
 * Shorthands for operating on rctl_lock: shared and exclusive acquire and
 * release, plus assertions that the lock is held in any mode
 * (RCTL_LOCK_ASSERT) or exclusively (RCTL_WLOCK_ASSERT).
 */
#define RCTL_RLOCK()		rw_rlock(&rctl_lock)
#define RCTL_RUNLOCK()		rw_runlock(&rctl_lock)
#define RCTL_WLOCK()		rw_wlock(&rctl_lock)
#define RCTL_WUNLOCK()		rw_wunlock(&rctl_lock)
#define RCTL_LOCK_ASSERT()	rw_assert(&rctl_lock, RA_LOCKED)
#define RCTL_WLOCK_ASSERT()	rw_assert(&rctl_lock, RA_WLOCKED)
224
/* Forward declarations for helpers used before their definitions. */
static int rctl_rule_fully_specified(const struct rctl_rule *rule);
static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);

/* Malloc type used for temporary buffers, e.g. in rctl_enforce(). */
static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
229
230static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
231{
232	int error, val = rctl_throttle_min;
233
234	error = sysctl_handle_int(oidp, &val, 0, req);
235	if (error || !req->newptr)
236		return (error);
237	if (val < 1 || val > rctl_throttle_max)
238		return (EINVAL);
239
240	RCTL_WLOCK();
241	rctl_throttle_min = val;
242	RCTL_WUNLOCK();
243
244	return (0);
245}
246
247static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
248{
249	int error, val = rctl_throttle_max;
250
251	error = sysctl_handle_int(oidp, &val, 0, req);
252	if (error || !req->newptr)
253		return (error);
254	if (val < rctl_throttle_min)
255		return (EINVAL);
256
257	RCTL_WLOCK();
258	rctl_throttle_max = val;
259	RCTL_WUNLOCK();
260
261	return (0);
262}
263
264static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
265{
266	int error, val = rctl_throttle_pct;
267
268	error = sysctl_handle_int(oidp, &val, 0, req);
269	if (error || !req->newptr)
270		return (error);
271	if (val < 0)
272		return (EINVAL);
273
274	RCTL_WLOCK();
275	rctl_throttle_pct = val;
276	RCTL_WUNLOCK();
277
278	return (0);
279}
280
281static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
282{
283	int error, val = rctl_throttle_pct2;
284
285	error = sysctl_handle_int(oidp, &val, 0, req);
286	if (error || !req->newptr)
287		return (error);
288	if (val < 0)
289		return (EINVAL);
290
291	RCTL_WLOCK();
292	rctl_throttle_pct2 = val;
293	RCTL_WUNLOCK();
294
295	return (0);
296}
297
298static const char *
299rctl_subject_type_name(int subject)
300{
301	int i;
302
303	for (i = 0; subjectnames[i].d_name != NULL; i++) {
304		if (subjectnames[i].d_value == subject)
305			return (subjectnames[i].d_name);
306	}
307
308	panic("rctl_subject_type_name: unknown subject type %d", subject);
309}
310
311static const char *
312rctl_action_name(int action)
313{
314	int i;
315
316	for (i = 0; actionnames[i].d_name != NULL; i++) {
317		if (actionnames[i].d_value == action)
318			return (actionnames[i].d_name);
319	}
320
321	panic("rctl_action_name: unknown action %d", action);
322}
323
324const char *
325rctl_resource_name(int resource)
326{
327	int i;
328
329	for (i = 0; resourcenames[i].d_name != NULL; i++) {
330		if (resourcenames[i].d_value == resource)
331			return (resourcenames[i].d_name);
332	}
333
334	panic("rctl_resource_name: unknown resource %d", resource);
335}
336
337static struct racct *
338rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
339{
340	struct ucred *cred = p->p_ucred;
341
342	ASSERT_RACCT_ENABLED();
343	RCTL_LOCK_ASSERT();
344
345	switch (rule->rr_per) {
346	case RCTL_SUBJECT_TYPE_PROCESS:
347		return (p->p_racct);
348	case RCTL_SUBJECT_TYPE_USER:
349		return (cred->cr_ruidinfo->ui_racct);
350	case RCTL_SUBJECT_TYPE_LOGINCLASS:
351		return (cred->cr_loginclass->lc_racct);
352	case RCTL_SUBJECT_TYPE_JAIL:
353		return (cred->cr_prison->pr_prison_racct->prr_racct);
354	default:
355		panic("%s: unknown per %d", __func__, rule->rr_per);
356	}
357}
358
359/*
360 * Return the amount of resource that can be allocated by 'p' before
361 * hitting 'rule'.
362 */
363static int64_t
364rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
365{
366	const struct racct *racct;
367	int64_t available;
368
369	ASSERT_RACCT_ENABLED();
370	RCTL_LOCK_ASSERT();
371
372	racct = rctl_proc_rule_to_racct(p, rule);
373	available = rule->rr_amount - racct->r_resources[rule->rr_resource];
374
375	return (available);
376}
377
378/*
379 * Called every second for proc, uidinfo, loginclass, and jail containers.
380 * If the limit isn't exceeded, it decreases the usage amount to zero.
381 * Otherwise, it decreases it by the value of the limit.  This way
382 * resource consumption exceeding the limit "carries over" to the next
383 * period.
384 */
385void
386rctl_throttle_decay(struct racct *racct, int resource)
387{
388	struct rctl_rule *rule;
389	struct rctl_rule_link *link;
390	int64_t minavailable;
391
392	ASSERT_RACCT_ENABLED();
393
394	minavailable = INT64_MAX;
395
396	RCTL_RLOCK();
397
398	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
399		rule = link->rrl_rule;
400
401		if (rule->rr_resource != resource)
402			continue;
403		if (rule->rr_action != RCTL_ACTION_THROTTLE)
404			continue;
405
406		if (rule->rr_amount < minavailable)
407			minavailable = rule->rr_amount;
408	}
409
410	RCTL_RUNLOCK();
411
412	if (racct->r_resources[resource] < minavailable) {
413		racct->r_resources[resource] = 0;
414	} else {
415		/*
416		 * Cap utilization counter at ten times the limit.  Otherwise,
417		 * if we changed the rule lowering the allowed amount, it could
418		 * take unreasonably long time for the accumulated resource
419		 * usage to drop.
420		 */
421		if (racct->r_resources[resource] > minavailable * 10)
422			racct->r_resources[resource] = minavailable * 10;
423
424		racct->r_resources[resource] -= minavailable;
425	}
426}
427
428/*
429 * Special version of rctl_get_available() for the %CPU resource.
430 * We slightly cheat here and return less than we normally would.
431 */
432int64_t
433rctl_pcpu_available(const struct proc *p) {
434	struct rctl_rule *rule;
435	struct rctl_rule_link *link;
436	int64_t available, minavailable, limit;
437
438	ASSERT_RACCT_ENABLED();
439
440	minavailable = INT64_MAX;
441	limit = 0;
442
443	RCTL_RLOCK();
444
445	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
446		rule = link->rrl_rule;
447		if (rule->rr_resource != RACCT_PCTCPU)
448			continue;
449		if (rule->rr_action != RCTL_ACTION_DENY)
450			continue;
451		available = rctl_available_resource(p, rule);
452		if (available < minavailable) {
453			minavailable = available;
454			limit = rule->rr_amount;
455		}
456	}
457
458	RCTL_RUNLOCK();
459
460	/*
461	 * Return slightly less than actual value of the available
462	 * %cpu resource.  This makes %cpu throttling more agressive
463	 * and lets us act sooner than the limits are already exceeded.
464	 */
465	if (limit != 0) {
466		if (limit > 2 * RCTL_PCPU_SHIFT)
467			minavailable -= RCTL_PCPU_SHIFT;
468		else
469			minavailable -= (limit / 2);
470	}
471
472	return (minavailable);
473}
474
/*
 * Saturating addition: return a + b, or UINT64_MAX if the sum does not fit
 * in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{

	/* The sum wraps exactly when b exceeds the headroom left above a. */
	if (b > UINT64_MAX - a)
		return (UINT64_MAX);

	return (a + b);
}
490
/*
 * Saturating multiplication: return a * b, or UINT64_MAX if the product
 * does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

	/* A zero factor can never overflow; this also avoids dividing by 0. */
	if (a == 0 || b == 0)
		return (0);
	if (a > UINT64_MAX / b)
		return (UINT64_MAX);

	return (a * b);
}
500
/*
 * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
 * to what it keeps allocated now.  Returns non-zero if the allocation should
 * be denied, 0 otherwise.
 *
 * Walks every rule linked to the process' racct that concerns 'resource'
 * and, for each rule the request would exceed, carries out the rule's
 * action: remember to deny, log, send a devctl notification, throttle the
 * process, or deliver a signal.  The rule list is traversed under the RCTL
 * read lock.  The non-zero return value is the placeholder EDOOFUS; see
 * the comment at the end of the function.
 */
int
rctl_enforce(struct proc *p, int resource, uint64_t amount)
{
	/*
	 * Rate limiting state for ppsratecheck().  Being static, it is
	 * shared by all callers rather than kept per process or per rule.
	 */
	static struct timeval log_lasttime, devctl_lasttime;
	static int log_curtime = 0, devctl_curtime = 0;
	struct rctl_rule *rule;
	struct rctl_rule_link *link;
	struct sbuf sb;
	char *buf;
	int64_t available;
	uint64_t sleep_ms, sleep_ratio;
	int should_deny = 0;


	ASSERT_RACCT_ENABLED();

	RCTL_RLOCK();

	/*
	 * There may be more than one matching rule; go through all of them.
	 * Denial should be done last, after logging and sending signals.
	 */
	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
		rule = link->rrl_rule;
		if (rule->rr_resource != resource)
			continue;

		/*
		 * The request fits under this rule; re-arm the rule's
		 * one-shot actions (log, devctl, signals).
		 * NOTE(review): rrl_exceeded is written here and below while
		 * only the read lock is held - confirm this is intended.
		 */
		available = rctl_available_resource(p, rule);
		if (available >= (int64_t)amount) {
			link->rrl_exceeded = 0;
			continue;
		}

		switch (rule->rr_action) {
		case RCTL_ACTION_DENY:
			should_deny = 1;
			continue;
		case RCTL_ACTION_LOG:
			/*
			 * If rrl_exceeded != 0, it means we've already
			 * logged a warning for this process.
			 */
			if (link->rrl_exceeded != 0)
				continue;

			/*
			 * If the process state is not fully initialized yet,
			 * we can't access most of the required fields, e.g.
			 * p->p_comm.  This happens when called from fork1().
			 * Ignore this rule for now; it will be processed just
			 * after fork, when called from racct_proc_fork_done().
			 */
			if (p->p_state != PRS_NORMAL)
				continue;

			if (!ppsratecheck(&log_lasttime, &log_curtime,
			    rctl_log_rate_limit))
				continue;

			/*
			 * Non-sleeping allocation; on failure the message is
			 * dropped and rrl_exceeded is left unset, so a later
			 * call may log it.
			 */
			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
			if (buf == NULL) {
				printf("rctl_enforce: out of memory\n");
				continue;
			}
			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
			rctl_rule_to_sbuf(&sb, rule);
			sbuf_finish(&sb);
			printf("rctl: rule \"%s\" matched by pid %d "
			    "(%s), uid %d, jail %s\n", sbuf_data(&sb),
			    p->p_pid, p->p_comm, p->p_ucred->cr_uid,
			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
			sbuf_delete(&sb);
			free(buf, M_RCTL);
			link->rrl_exceeded = 1;
			continue;
		case RCTL_ACTION_DEVCTL:
			/*
			 * Same one-shot, process-state and rate-limit checks
			 * as for RCTL_ACTION_LOG above, with a separate rate
			 * limiter.
			 */
			if (link->rrl_exceeded != 0)
				continue;

			if (p->p_state != PRS_NORMAL)
				continue;

			if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
			    rctl_devctl_rate_limit))
				continue;

			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
			if (buf == NULL) {
				printf("rctl_enforce: out of memory\n");
				continue;
			}
			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
			sbuf_printf(&sb, "rule=");
			rctl_rule_to_sbuf(&sb, rule);
			sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
			    p->p_pid, p->p_ucred->cr_ruid,
			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
			sbuf_finish(&sb);
			devctl_notify_f("RCTL", "rule", "matched",
			    sbuf_data(&sb), M_NOWAIT);
			sbuf_delete(&sb);
			free(buf, M_RCTL);
			link->rrl_exceeded = 1;
			continue;
		case RCTL_ACTION_THROTTLE:
			if (p->p_state != PRS_NORMAL)
				continue;

			/*
			 * Make the process sleep for a fraction of second
			 * proportional to the ratio of process' resource
			 * utilization compared to the limit.  The point is
			 * to penalize resource hogs: processes that consume
			 * more of the available resources sleep for longer.
			 *
			 * We're trying to defer division until the very end,
			 * to minimize the rounding effects.  The following
			 * calculation could have been written in a clearer
			 * way like this:
			 *
			 * sleep_ms = hz * p->p_racct->r_resources[resource] /
			 *     rule->rr_amount;
			 * sleep_ms *= rctl_throttle_pct / 100;
			 * if (sleep_ms < rctl_throttle_min)
			 *         sleep_ms = rctl_throttle_min;
			 *
			 * Until the final division below, sleep_ms holds the
			 * duration multiplied by rule->rr_amount, which is
			 * why the minimum is scaled by rr_amount as well.
			 *
			 * NOTE(review): despite the "_ms" suffix the value is
			 * derived from hz and bounded by rctl_throttle_min and
			 * rctl_throttle_max, which the sysctl descriptions
			 * give "in hz" - presumably ticks; confirm against
			 * racct_proc_throttle().
			 *
			 * NOTE(review): the divisions by rule->rr_amount
			 * below assume a throttle rule never has a zero
			 * amount - confirm this is enforced when rules are
			 * added.
			 */
			sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
			sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
			if (sleep_ms < rctl_throttle_min * rule->rr_amount)
				sleep_ms = rctl_throttle_min * rule->rr_amount;

			/*
			 * Multiply that by the ratio of the resource
			 * consumption for the container compared to the limit,
			 * squared.  In other words, a process in a container
			 * that is two times over the limit will be throttled
			 * four times as much for hitting the same rule.  The
			 * point is to penalize processes more if the container
			 * itself (eg certain UID or jail) is above the limit.
			 */
			if (available < 0)
				sleep_ratio = -available / rule->rr_amount;
			else
				sleep_ratio = 0;
			sleep_ratio = xmul(sleep_ratio, sleep_ratio);
			sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
			sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));

			/*
			 * Finally the division.
			 */
			sleep_ms /= rule->rr_amount;

			if (sleep_ms > rctl_throttle_max)
				sleep_ms = rctl_throttle_max;
#if 0
			printf("%s: pid %d (%s), %jd of %jd, will sleep for %ld ms (ratio %ld, available %ld)\n",
			   __func__, p->p_pid, p->p_comm,
			   p->p_racct->r_resources[resource],
			   rule->rr_amount, sleep_ms, sleep_ratio, available);
#endif

			KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
			    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
			racct_proc_throttle(p, sleep_ms);
			continue;
		default:
			/*
			 * Every remaining action is expected to be a signal
			 * action; it is delivered once per limit excursion.
			 */
			if (link->rrl_exceeded != 0)
				continue;

			if (p->p_state != PRS_NORMAL)
				continue;

			KASSERT(rule->rr_action > 0 &&
			    rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
			    ("rctl_enforce: unknown action %d",
			     rule->rr_action));

			/*
			 * We're using the fact that RCTL_ACTION_SIG* values
			 * are equal to their counterparts from sys/signal.h.
			 */
			kern_psignal(p, rule->rr_action);
			link->rrl_exceeded = 1;
			continue;
		}
	}

	RCTL_RUNLOCK();

	if (should_deny) {
		/*
		 * Return fake error code; the caller should change it
		 * into one proper for the situation - EFSIZ, ENOMEM etc.
		 */
		return (EDOOFUS);
	}

	return (0);
}
707
708uint64_t
709rctl_get_limit(struct proc *p, int resource)
710{
711	struct rctl_rule *rule;
712	struct rctl_rule_link *link;
713	uint64_t amount = UINT64_MAX;
714
715	ASSERT_RACCT_ENABLED();
716
717	RCTL_RLOCK();
718
719	/*
720	 * There may be more than one matching rule; go through all of them.
721	 * Denial should be done last, after logging and sending signals.
722	 */
723	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
724		rule = link->rrl_rule;
725		if (rule->rr_resource != resource)
726			continue;
727		if (rule->rr_action != RCTL_ACTION_DENY)
728			continue;
729		if (rule->rr_amount < amount)
730			amount = rule->rr_amount;
731	}
732
733	RCTL_RUNLOCK();
734
735	return (amount);
736}
737
738uint64_t
739rctl_get_available(struct proc *p, int resource)
740{
741	struct rctl_rule *rule;
742	struct rctl_rule_link *link;
743	int64_t available, minavailable, allocated;
744
745	minavailable = INT64_MAX;
746
747	ASSERT_RACCT_ENABLED();
748
749	RCTL_RLOCK();
750
751	/*
752	 * There may be more than one matching rule; go through all of them.
753	 * Denial should be done last, after logging and sending signals.
754	 */
755	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
756		rule = link->rrl_rule;
757		if (rule->rr_resource != resource)
758			continue;
759		if (rule->rr_action != RCTL_ACTION_DENY)
760			continue;
761		available = rctl_available_resource(p, rule);
762		if (available < minavailable)
763			minavailable = available;
764	}
765
766	RCTL_RUNLOCK();
767
768	/*
769	 * XXX: Think about this _hard_.
770	 */
771	allocated = p->p_racct->r_resources[resource];
772	if (minavailable < INT64_MAX - allocated)
773		minavailable += allocated;
774	if (minavailable < 0)
775		minavailable = 0;
776	return (minavailable);
777}
778
779static int
780rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
781{
782
783	ASSERT_RACCT_ENABLED();
784
785	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
786		if (rule->rr_subject_type != filter->rr_subject_type)
787			return (0);
788
789		switch (filter->rr_subject_type) {
790		case RCTL_SUBJECT_TYPE_PROCESS:
791			if (filter->rr_subject.rs_proc != NULL &&
792			    rule->rr_subject.rs_proc !=
793			    filter->rr_subject.rs_proc)
794				return (0);
795			break;
796		case RCTL_SUBJECT_TYPE_USER:
797			if (filter->rr_subject.rs_uip != NULL &&
798			    rule->rr_subject.rs_uip !=
799			    filter->rr_subject.rs_uip)
800				return (0);
801			break;
802		case RCTL_SUBJECT_TYPE_LOGINCLASS:
803			if (filter->rr_subject.rs_loginclass != NULL &&
804			    rule->rr_subject.rs_loginclass !=
805			    filter->rr_subject.rs_loginclass)
806				return (0);
807			break;
808		case RCTL_SUBJECT_TYPE_JAIL:
809			if (filter->rr_subject.rs_prison_racct != NULL &&
810			    rule->rr_subject.rs_prison_racct !=
811			    filter->rr_subject.rs_prison_racct)
812				return (0);
813			break;
814		default:
815			panic("rctl_rule_matches: unknown subject type %d",
816			    filter->rr_subject_type);
817		}
818	}
819
820	if (filter->rr_resource != RACCT_UNDEFINED) {
821		if (rule->rr_resource != filter->rr_resource)
822			return (0);
823	}
824
825	if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
826		if (rule->rr_action != filter->rr_action)
827			return (0);
828	}
829
830	if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
831		if (rule->rr_amount != filter->rr_amount)
832			return (0);
833	}
834
835	if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
836		if (rule->rr_per != filter->rr_per)
837			return (0);
838	}
839
840	return (1);
841}
842
843static int
844str2value(const char *str, int *value, struct dict *table)
845{
846	int i;
847
848	if (value == NULL)
849		return (EINVAL);
850
851	for (i = 0; table[i].d_name != NULL; i++) {
852		if (strcasecmp(table[i].d_name, str) == 0) {
853			*value =  table[i].d_value;
854			return (0);
855		}
856	}
857
858	return (EINVAL);
859}
860
861static int
862str2id(const char *str, id_t *value)
863{
864	char *end;
865
866	if (str == NULL)
867		return (EINVAL);
868
869	*value = strtoul(str, &end, 10);
870	if ((size_t)(end - str) != strlen(str))
871		return (EINVAL);
872
873	return (0);
874}
875
/*
 * Parse the non-negative decimal number in 'str' into '*value'.
 *
 * Returns 0 on success, EINVAL if 'str' is NULL or is not consumed
 * entirely by the conversion, and ERANGE if the parsed number does not fit
 * in a non-negative int64_t.  Note that '*value' is written even when
 * EINVAL or ERANGE is returned.
 *
 * The conversion uses strtouq() rather than strtoul(): 'unsigned long' is
 * only 32 bits wide on ILP32 platforms, where strtoul() would silently
 * clamp amounts above 4G instead of parsing them.
 */
static int
str2int64(const char *str, int64_t *value)
{
	char *end;

	if (str == NULL)
		return (EINVAL);

	*value = strtouq(str, &end, 10);
	if ((size_t)(end - str) != strlen(str))
		return (EINVAL);

	/* Numbers above INT64_MAX turn negative once stored in '*value'. */
	if (*value < 0)
		return (ERANGE);

	return (0);
}
893
894/*
895 * Connect the rule to the racct, increasing refcount for the rule.
896 */
897static void
898rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
899{
900	struct rctl_rule_link *link;
901
902	ASSERT_RACCT_ENABLED();
903	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
904
905	rctl_rule_acquire(rule);
906	link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
907	link->rrl_rule = rule;
908	link->rrl_exceeded = 0;
909
910	RCTL_WLOCK();
911	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
912	RCTL_WUNLOCK();
913}
914
915static int
916rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
917{
918	struct rctl_rule_link *link;
919
920	ASSERT_RACCT_ENABLED();
921	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
922	RCTL_WLOCK_ASSERT();
923
924	link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
925	if (link == NULL)
926		return (ENOMEM);
927	rctl_rule_acquire(rule);
928	link->rrl_rule = rule;
929	link->rrl_exceeded = 0;
930
931	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
932	return (0);
933}
934
935/*
936 * Remove limits for a rules matching the filter and release
937 * the refcounts for the rules, possibly freeing them.  Returns
938 * the number of limit structures removed.
939 */
940static int
941rctl_racct_remove_rules(struct racct *racct,
942    const struct rctl_rule *filter)
943{
944	struct rctl_rule_link *link, *linktmp;
945	int removed = 0;
946
947	ASSERT_RACCT_ENABLED();
948	RCTL_WLOCK_ASSERT();
949
950	LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
951		if (!rctl_rule_matches(link->rrl_rule, filter))
952			continue;
953
954		LIST_REMOVE(link, rrl_next);
955		rctl_rule_release(link->rrl_rule);
956		uma_zfree(rctl_rule_link_zone, link);
957		removed++;
958	}
959	return (removed);
960}
961
962static void
963rctl_rule_acquire_subject(struct rctl_rule *rule)
964{
965
966	ASSERT_RACCT_ENABLED();
967
968	switch (rule->rr_subject_type) {
969	case RCTL_SUBJECT_TYPE_UNDEFINED:
970	case RCTL_SUBJECT_TYPE_PROCESS:
971		break;
972	case RCTL_SUBJECT_TYPE_JAIL:
973		if (rule->rr_subject.rs_prison_racct != NULL)
974			prison_racct_hold(rule->rr_subject.rs_prison_racct);
975		break;
976	case RCTL_SUBJECT_TYPE_USER:
977		if (rule->rr_subject.rs_uip != NULL)
978			uihold(rule->rr_subject.rs_uip);
979		break;
980	case RCTL_SUBJECT_TYPE_LOGINCLASS:
981		if (rule->rr_subject.rs_loginclass != NULL)
982			loginclass_hold(rule->rr_subject.rs_loginclass);
983		break;
984	default:
985		panic("rctl_rule_acquire_subject: unknown subject type %d",
986		    rule->rr_subject_type);
987	}
988}
989
990static void
991rctl_rule_release_subject(struct rctl_rule *rule)
992{
993
994	ASSERT_RACCT_ENABLED();
995
996	switch (rule->rr_subject_type) {
997	case RCTL_SUBJECT_TYPE_UNDEFINED:
998	case RCTL_SUBJECT_TYPE_PROCESS:
999		break;
1000	case RCTL_SUBJECT_TYPE_JAIL:
1001		if (rule->rr_subject.rs_prison_racct != NULL)
1002			prison_racct_free(rule->rr_subject.rs_prison_racct);
1003		break;
1004	case RCTL_SUBJECT_TYPE_USER:
1005		if (rule->rr_subject.rs_uip != NULL)
1006			uifree(rule->rr_subject.rs_uip);
1007		break;
1008	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1009		if (rule->rr_subject.rs_loginclass != NULL)
1010			loginclass_free(rule->rr_subject.rs_loginclass);
1011		break;
1012	default:
1013		panic("rctl_rule_release_subject: unknown subject type %d",
1014		    rule->rr_subject_type);
1015	}
1016}
1017
1018struct rctl_rule *
1019rctl_rule_alloc(int flags)
1020{
1021	struct rctl_rule *rule;
1022
1023	ASSERT_RACCT_ENABLED();
1024
1025	rule = uma_zalloc(rctl_rule_zone, flags);
1026	if (rule == NULL)
1027		return (NULL);
1028	rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1029	rule->rr_subject.rs_proc = NULL;
1030	rule->rr_subject.rs_uip = NULL;
1031	rule->rr_subject.rs_loginclass = NULL;
1032	rule->rr_subject.rs_prison_racct = NULL;
1033	rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1034	rule->rr_resource = RACCT_UNDEFINED;
1035	rule->rr_action = RCTL_ACTION_UNDEFINED;
1036	rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1037	refcount_init(&rule->rr_refcount, 1);
1038
1039	return (rule);
1040}
1041
1042struct rctl_rule *
1043rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1044{
1045	struct rctl_rule *copy;
1046
1047	ASSERT_RACCT_ENABLED();
1048
1049	copy = uma_zalloc(rctl_rule_zone, flags);
1050	if (copy == NULL)
1051		return (NULL);
1052	copy->rr_subject_type = rule->rr_subject_type;
1053	copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1054	copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1055	copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1056	copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1057	copy->rr_per = rule->rr_per;
1058	copy->rr_resource = rule->rr_resource;
1059	copy->rr_action = rule->rr_action;
1060	copy->rr_amount = rule->rr_amount;
1061	refcount_init(&copy->rr_refcount, 1);
1062	rctl_rule_acquire_subject(copy);
1063
1064	return (copy);
1065}
1066
1067void
1068rctl_rule_acquire(struct rctl_rule *rule)
1069{
1070
1071	ASSERT_RACCT_ENABLED();
1072	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1073
1074	refcount_acquire(&rule->rr_refcount);
1075}
1076
1077static void
1078rctl_rule_free(void *context, int pending)
1079{
1080	struct rctl_rule *rule;
1081
1082	rule = (struct rctl_rule *)context;
1083
1084	ASSERT_RACCT_ENABLED();
1085	KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1086
1087	/*
1088	 * We don't need locking here; rule is guaranteed to be inaccessible.
1089	 */
1090
1091	rctl_rule_release_subject(rule);
1092	uma_zfree(rctl_rule_zone, rule);
1093}
1094
1095void
1096rctl_rule_release(struct rctl_rule *rule)
1097{
1098
1099	ASSERT_RACCT_ENABLED();
1100	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1101
1102	if (refcount_release(&rule->rr_refcount)) {
1103		/*
1104		 * rctl_rule_release() is often called when iterating
1105		 * over all the uidinfo structures in the system,
1106		 * holding uihashtbl_lock.  Since rctl_rule_free()
1107		 * might end up calling uifree(), this would lead
1108		 * to lock recursion.  Use taskqueue to avoid this.
1109		 */
1110		TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1111		taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1112	}
1113}
1114
1115static int
1116rctl_rule_fully_specified(const struct rctl_rule *rule)
1117{
1118
1119	ASSERT_RACCT_ENABLED();
1120
1121	switch (rule->rr_subject_type) {
1122	case RCTL_SUBJECT_TYPE_UNDEFINED:
1123		return (0);
1124	case RCTL_SUBJECT_TYPE_PROCESS:
1125		if (rule->rr_subject.rs_proc == NULL)
1126			return (0);
1127		break;
1128	case RCTL_SUBJECT_TYPE_USER:
1129		if (rule->rr_subject.rs_uip == NULL)
1130			return (0);
1131		break;
1132	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1133		if (rule->rr_subject.rs_loginclass == NULL)
1134			return (0);
1135		break;
1136	case RCTL_SUBJECT_TYPE_JAIL:
1137		if (rule->rr_subject.rs_prison_racct == NULL)
1138			return (0);
1139		break;
1140	default:
1141		panic("rctl_rule_fully_specified: unknown subject type %d",
1142		    rule->rr_subject_type);
1143	}
1144	if (rule->rr_resource == RACCT_UNDEFINED)
1145		return (0);
1146	if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1147		return (0);
1148	if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1149		return (0);
1150	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1151		return (0);
1152
1153	return (1);
1154}
1155
/*
 * Parse a textual rule or filter of the form
 *
 *	subject:subject-id:resource:action=amount/per
 *
 * into a freshly allocated rule.  Any field may be missing or empty, in
 * which case the corresponding rule member is set to its "undefined"
 * value (or NULL for the subject), so the result can serve as a filter.
 *
 * 'rulestr' is split in place by strsep() and is therefore modified.
 * On success 0 is returned and *rulep receives the rule, whose single
 * reference now belongs to the caller.  On failure the rule is released,
 * *rulep is left untouched and an errno value is returned.
 *
 * Looking up a process subject asserts that allproc_lock is held.
 */
static int
rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
{
	struct rctl_rule *rule;
	char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
	     *amountstr, *perstr;
	id_t id;
	int error = 0;

	ASSERT_RACCT_ENABLED();

	rule = rctl_rule_alloc(M_WAITOK);

	/*
	 * Carve the string into its fields.  The first three are separated
	 * by ':'; the action ends at '=' (or at '/' when no amount is
	 * given), the amount ends at '/', and whatever is left is the
	 * "per" part.  Fields past the end of the input come back NULL.
	 */
	subjectstr = strsep(&rulestr, ":");
	subject_idstr = strsep(&rulestr, ":");
	resourcestr = strsep(&rulestr, ":");
	actionstr = strsep(&rulestr, "=/");
	amountstr = strsep(&rulestr, "/");
	perstr = rulestr;

	if (subjectstr == NULL || subjectstr[0] == '\0')
		rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
	else {
		error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
		if (error != 0)
			goto out;
	}

	if (subject_idstr == NULL || subject_idstr[0] == '\0') {
		rule->rr_subject.rs_proc = NULL;
		rule->rr_subject.rs_uip = NULL;
		rule->rr_subject.rs_loginclass = NULL;
		rule->rr_subject.rs_prison_racct = NULL;
	} else {
		/* How the subject id is resolved depends on the subject type. */
		switch (rule->rr_subject_type) {
		case RCTL_SUBJECT_TYPE_UNDEFINED:
			/* A subject id without a subject type is meaningless. */
			error = EINVAL;
			goto out;
		case RCTL_SUBJECT_TYPE_PROCESS:
			error = str2id(subject_idstr, &id);
			if (error != 0)
				goto out;
			sx_assert(&allproc_lock, SA_LOCKED);
			rule->rr_subject.rs_proc = pfind(id);
			if (rule->rr_subject.rs_proc == NULL) {
				error = ESRCH;
				goto out;
			}
			/* Only the pointer is kept; drop the process lock. */
			PROC_UNLOCK(rule->rr_subject.rs_proc);
			break;
		case RCTL_SUBJECT_TYPE_USER:
			error = str2id(subject_idstr, &id);
			if (error != 0)
				goto out;
			rule->rr_subject.rs_uip = uifind(id);
			break;
		case RCTL_SUBJECT_TYPE_LOGINCLASS:
			rule->rr_subject.rs_loginclass =
			    loginclass_find(subject_idstr);
			if (rule->rr_subject.rs_loginclass == NULL) {
				error = ENAMETOOLONG;
				goto out;
			}
			break;
		case RCTL_SUBJECT_TYPE_JAIL:
			rule->rr_subject.rs_prison_racct =
			    prison_racct_find(subject_idstr);
			if (rule->rr_subject.rs_prison_racct == NULL) {
				error = ENAMETOOLONG;
				goto out;
			}
			break;
               default:
                       panic("rctl_string_to_rule: unknown subject type %d",
                           rule->rr_subject_type);
               }
	}

	if (resourcestr == NULL || resourcestr[0] == '\0')
		rule->rr_resource = RACCT_UNDEFINED;
	else {
		error = str2value(resourcestr, &rule->rr_resource,
		    resourcenames);
		if (error != 0)
			goto out;
	}

	if (actionstr == NULL || actionstr[0] == '\0')
		rule->rr_action = RCTL_ACTION_UNDEFINED;
	else {
		error = str2value(actionstr, &rule->rr_action, actionnames);
		if (error != 0)
			goto out;
	}

	if (amountstr == NULL || amountstr[0] == '\0')
		rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
	else {
		error = str2int64(amountstr, &rule->rr_amount);
		if (error != 0)
			goto out;
		/*
		 * Resources flagged as "in millions" are stored scaled by
		 * 1000000; refuse amounts that would overflow when scaled.
		 */
		if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
			if (rule->rr_amount > INT64_MAX / 1000000) {
				error = ERANGE;
				goto out;
			}
			rule->rr_amount *= 1000000;
		}
	}

	if (perstr == NULL || perstr[0] == '\0')
		rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
	else {
		error = str2value(perstr, &rule->rr_per, subjectnames);
		if (error != 0)
			goto out;
	}

out:
	/* Hand the rule to the caller on success, drop it on failure. */
	if (error == 0)
		*rulep = rule;
	else
		rctl_rule_release(rule);

	return (error);
}
1282
/*
 * Link a rule with all the subjects it applies to.
 *
 * The rule must be fully specified (asserted below).  Combinations of
 * action, resource and "per" type that make no sense are rejected with
 * EOPNOTSUPP before anything is modified; otherwise 0 is returned.
 *
 * For subject types other than "process" the whole process list is
 * walked, which asserts that allproc_lock is held.
 */
int
rctl_rule_add(struct rctl_rule *rule)
{
	struct proc *p;
	struct ucred *cred;
	struct uidinfo *uip;
	struct prison *pr;
	struct prison_racct *prr;
	struct loginclass *lc;
	struct rctl_rule *rule2;
	int match;

	ASSERT_RACCT_ENABLED();
	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));

	/*
	 * Some rules just don't make sense, like "deny" rule for an undeniable
	 * resource.  The exception are the RSS and %CPU resources - they are
	 * not deniable in the racct sense, but the limit is enforced in
	 * a different way.
	 */
	if (rule->rr_action == RCTL_ACTION_DENY &&
	    !RACCT_IS_DENIABLE(rule->rr_resource) &&
	    rule->rr_resource != RACCT_RSS &&
	    rule->rr_resource != RACCT_PCTCPU) {
		return (EOPNOTSUPP);
	}

	/* Throttling is only accepted for decaying resources... */
	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
	    !RACCT_IS_DECAYING(rule->rr_resource)) {
		return (EOPNOTSUPP);
	}

	/* ...and never for %CPU. */
	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
	    rule->rr_resource == RACCT_PCTCPU) {
		return (EOPNOTSUPP);
	}

	/* Per-process limits on sloppy resources are not supported. */
	if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
	    RACCT_IS_SLOPPY(rule->rr_resource)) {
		return (EOPNOTSUPP);
	}

	/*
	 * Make sure there are no duplicated rules.  Also, for the "deny"
	 * rules, remove ones differing only by "amount".
	 */
	if (rule->rr_action == RCTL_ACTION_DENY) {
		/*
		 * Use a temporary copy with the amount wildcarded as the
		 * removal filter, so the caller's rule stays intact.
		 */
		rule2 = rctl_rule_duplicate(rule, M_WAITOK);
		rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
		rctl_rule_remove(rule2);
		rctl_rule_release(rule2);
	} else
		rctl_rule_remove(rule);

	/* Attach the rule to the racct of the subject itself. */
	switch (rule->rr_subject_type) {
	case RCTL_SUBJECT_TYPE_PROCESS:
		p = rule->rr_subject.rs_proc;
		KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));

		rctl_racct_add_rule(p->p_racct, rule);
		/*
		 * In case of per-process rule, we don't have anything more
		 * to do.
		 */
		return (0);

	case RCTL_SUBJECT_TYPE_USER:
		uip = rule->rr_subject.rs_uip;
		KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
		rctl_racct_add_rule(uip->ui_racct, rule);
		break;

	case RCTL_SUBJECT_TYPE_LOGINCLASS:
		lc = rule->rr_subject.rs_loginclass;
		KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
		rctl_racct_add_rule(lc->lc_racct, rule);
		break;

	case RCTL_SUBJECT_TYPE_JAIL:
		prr = rule->rr_subject.rs_prison_racct;
		KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
		rctl_racct_add_rule(prr->prr_racct, rule);
		break;

	default:
		panic("rctl_rule_add: unknown subject type %d",
		    rule->rr_subject_type);
	}

	/*
	 * Now go through all the processes and add the new rule to the ones
	 * it applies to.
	 */
	sx_assert(&allproc_lock, SA_LOCKED);
	FOREACH_PROC_IN_SYSTEM(p) {
		cred = p->p_ucred;
		/*
		 * Within this switch, "break" means the process matches and
		 * falls through to the add below; "continue" skips to the
		 * next process.
		 */
		switch (rule->rr_subject_type) {
		case RCTL_SUBJECT_TYPE_USER:
			/* Match on either of the two uidinfo pointers. */
			if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
			    cred->cr_ruidinfo == rule->rr_subject.rs_uip)
				break;
			continue;
		case RCTL_SUBJECT_TYPE_LOGINCLASS:
			if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
				break;
			continue;
		case RCTL_SUBJECT_TYPE_JAIL:
			/*
			 * Walk from the process' jail up through its parents,
			 * so a rule on a jail also covers processes in the
			 * jails nested below it.
			 */
			match = 0;
			for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
				if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
					match = 1;
					break;
				}
			}
			if (match)
				break;
			continue;
		default:
			panic("rctl_rule_add: unknown subject type %d",
			    rule->rr_subject_type);
		}

		rctl_racct_add_rule(p->p_racct, rule);
	}

	return (0);
}
1414
/*
 * "Pre" hook handed to the *_racct_foreach() iterators in this file:
 * takes the rctl lock for writing.
 */
static void
rctl_rule_pre_callback(void)
{
	RCTL_WLOCK();
}
1421
/*
 * "Post" hook handed to the *_racct_foreach() iterators in this file:
 * drops the write lock taken by rctl_rule_pre_callback().
 */
static void
rctl_rule_post_callback(void)
{
	RCTL_WUNLOCK();
}
1428
/*
 * Per-racct callback used by rctl_rule_remove().  'arg2' is the filter
 * rule; 'arg3' points to an int that accumulates the value returned by
 * rctl_racct_remove_rules() for each racct.  Must run with the rctl lock
 * held for writing.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	struct rctl_rule *filter;
	int *totalp;
	int removed;

	filter = arg2;
	totalp = arg3;

	ASSERT_RACCT_ENABLED();
	RCTL_WLOCK_ASSERT();

	removed = rctl_racct_remove_rules(racct, filter);
	*totalp += removed;
}
1442
1443/*
1444 * Remove all rules that match the filter.
1445 */
1446int
1447rctl_rule_remove(struct rctl_rule *filter)
1448{
1449	struct proc *p;
1450	int found = 0;
1451
1452	ASSERT_RACCT_ENABLED();
1453
1454	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1455	    filter->rr_subject.rs_proc != NULL) {
1456		p = filter->rr_subject.rs_proc;
1457		RCTL_WLOCK();
1458		found = rctl_racct_remove_rules(p->p_racct, filter);
1459		RCTL_WUNLOCK();
1460		if (found)
1461			return (0);
1462		return (ESRCH);
1463	}
1464
1465	loginclass_racct_foreach(rctl_rule_remove_callback,
1466	    rctl_rule_pre_callback, rctl_rule_post_callback,
1467	    filter, (void *)&found);
1468	ui_racct_foreach(rctl_rule_remove_callback,
1469	    rctl_rule_pre_callback, rctl_rule_post_callback,
1470	    filter, (void *)&found);
1471	prison_racct_foreach(rctl_rule_remove_callback,
1472	    rctl_rule_pre_callback, rctl_rule_post_callback,
1473	    filter, (void *)&found);
1474
1475	sx_assert(&allproc_lock, SA_LOCKED);
1476	RCTL_WLOCK();
1477	FOREACH_PROC_IN_SYSTEM(p) {
1478		found += rctl_racct_remove_rules(p->p_racct, filter);
1479	}
1480	RCTL_WUNLOCK();
1481
1482	if (found)
1483		return (0);
1484	return (ESRCH);
1485}
1486
1487/*
1488 * Appends a rule to the sbuf.
1489 */
1490static void
1491rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1492{
1493	int64_t amount;
1494
1495	ASSERT_RACCT_ENABLED();
1496
1497	sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1498
1499	switch (rule->rr_subject_type) {
1500	case RCTL_SUBJECT_TYPE_PROCESS:
1501		if (rule->rr_subject.rs_proc == NULL)
1502			sbuf_printf(sb, ":");
1503		else
1504			sbuf_printf(sb, "%d:",
1505			    rule->rr_subject.rs_proc->p_pid);
1506		break;
1507	case RCTL_SUBJECT_TYPE_USER:
1508		if (rule->rr_subject.rs_uip == NULL)
1509			sbuf_printf(sb, ":");
1510		else
1511			sbuf_printf(sb, "%d:",
1512			    rule->rr_subject.rs_uip->ui_uid);
1513		break;
1514	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1515		if (rule->rr_subject.rs_loginclass == NULL)
1516			sbuf_printf(sb, ":");
1517		else
1518			sbuf_printf(sb, "%s:",
1519			    rule->rr_subject.rs_loginclass->lc_name);
1520		break;
1521	case RCTL_SUBJECT_TYPE_JAIL:
1522		if (rule->rr_subject.rs_prison_racct == NULL)
1523			sbuf_printf(sb, ":");
1524		else
1525			sbuf_printf(sb, "%s:",
1526			    rule->rr_subject.rs_prison_racct->prr_name);
1527		break;
1528	default:
1529		panic("rctl_rule_to_sbuf: unknown subject type %d",
1530		    rule->rr_subject_type);
1531	}
1532
1533	amount = rule->rr_amount;
1534	if (amount != RCTL_AMOUNT_UNDEFINED &&
1535	    RACCT_IS_IN_MILLIONS(rule->rr_resource))
1536		amount /= 1000000;
1537
1538	sbuf_printf(sb, "%s:%s=%jd",
1539	    rctl_resource_name(rule->rr_resource),
1540	    rctl_action_name(rule->rr_action),
1541	    amount);
1542
1543	if (rule->rr_per != rule->rr_subject_type)
1544		sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1545}
1546
1547/*
1548 * Routine used by RCTL syscalls to read in input string.
1549 */
1550static int
1551rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1552{
1553	char *str;
1554	int error;
1555
1556	ASSERT_RACCT_ENABLED();
1557
1558	if (inbuflen <= 0)
1559		return (EINVAL);
1560	if (inbuflen > RCTL_MAX_INBUFSIZE)
1561		return (E2BIG);
1562
1563	str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1564	error = copyinstr(inbufp, str, inbuflen, NULL);
1565	if (error != 0) {
1566		free(str, M_RCTL);
1567		return (error);
1568	}
1569
1570	*inputstr = str;
1571
1572	return (0);
1573}
1574
1575/*
1576 * Routine used by RCTL syscalls to write out output string.
1577 */
1578static int
1579rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
1580{
1581	int error;
1582
1583	ASSERT_RACCT_ENABLED();
1584
1585	if (outputsbuf == NULL)
1586		return (0);
1587
1588	sbuf_finish(outputsbuf);
1589	if (outbuflen < sbuf_len(outputsbuf) + 1) {
1590		sbuf_delete(outputsbuf);
1591		return (ERANGE);
1592	}
1593	error = copyout(sbuf_data(outputsbuf), outbufp,
1594	    sbuf_len(outputsbuf) + 1);
1595	sbuf_delete(outputsbuf);
1596	return (error);
1597}
1598
1599static struct sbuf *
1600rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1601{
1602	struct sbuf *sb;
1603	int64_t amount;
1604	int i;
1605
1606	ASSERT_RACCT_ENABLED();
1607
1608	sb = sbuf_new_auto();
1609	for (i = 0; i <= RACCT_MAX; i++) {
1610		if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1611			continue;
1612		amount = racct->r_resources[i];
1613		if (RACCT_IS_IN_MILLIONS(i))
1614			amount /= 1000000;
1615		sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1616	}
1617	sbuf_setpos(sb, sbuf_len(sb) - 1);
1618	return (sb);
1619}
1620
1621int
1622sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1623{
1624	struct rctl_rule *filter;
1625	struct sbuf *outputsbuf = NULL;
1626	struct proc *p;
1627	struct uidinfo *uip;
1628	struct loginclass *lc;
1629	struct prison_racct *prr;
1630	char *inputstr;
1631	int error;
1632
1633	if (!racct_enable)
1634		return (ENOSYS);
1635
1636	error = priv_check(td, PRIV_RCTL_GET_RACCT);
1637	if (error != 0)
1638		return (error);
1639
1640	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1641	if (error != 0)
1642		return (error);
1643
1644	sx_slock(&allproc_lock);
1645	error = rctl_string_to_rule(inputstr, &filter);
1646	free(inputstr, M_RCTL);
1647	if (error != 0) {
1648		sx_sunlock(&allproc_lock);
1649		return (error);
1650	}
1651
1652	switch (filter->rr_subject_type) {
1653	case RCTL_SUBJECT_TYPE_PROCESS:
1654		p = filter->rr_subject.rs_proc;
1655		if (p == NULL) {
1656			error = EINVAL;
1657			goto out;
1658		}
1659		outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1660		break;
1661	case RCTL_SUBJECT_TYPE_USER:
1662		uip = filter->rr_subject.rs_uip;
1663		if (uip == NULL) {
1664			error = EINVAL;
1665			goto out;
1666		}
1667		outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1668		break;
1669	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1670		lc = filter->rr_subject.rs_loginclass;
1671		if (lc == NULL) {
1672			error = EINVAL;
1673			goto out;
1674		}
1675		outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1676		break;
1677	case RCTL_SUBJECT_TYPE_JAIL:
1678		prr = filter->rr_subject.rs_prison_racct;
1679		if (prr == NULL) {
1680			error = EINVAL;
1681			goto out;
1682		}
1683		outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1684		break;
1685	default:
1686		error = EINVAL;
1687	}
1688out:
1689	rctl_rule_release(filter);
1690	sx_sunlock(&allproc_lock);
1691	if (error != 0)
1692		return (error);
1693
1694	error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1695
1696	return (error);
1697}
1698
1699static void
1700rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1701{
1702	struct rctl_rule *filter = (struct rctl_rule *)arg2;
1703	struct rctl_rule_link *link;
1704	struct sbuf *sb = (struct sbuf *)arg3;
1705
1706	ASSERT_RACCT_ENABLED();
1707	RCTL_LOCK_ASSERT();
1708
1709	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1710		if (!rctl_rule_matches(link->rrl_rule, filter))
1711			continue;
1712		rctl_rule_to_sbuf(sb, link->rrl_rule);
1713		sbuf_printf(sb, ",");
1714	}
1715}
1716
1717int
1718sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1719{
1720	struct sbuf *sb;
1721	struct rctl_rule *filter;
1722	struct rctl_rule_link *link;
1723	struct proc *p;
1724	char *inputstr, *buf;
1725	size_t bufsize;
1726	int error;
1727
1728	if (!racct_enable)
1729		return (ENOSYS);
1730
1731	error = priv_check(td, PRIV_RCTL_GET_RULES);
1732	if (error != 0)
1733		return (error);
1734
1735	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1736	if (error != 0)
1737		return (error);
1738
1739	sx_slock(&allproc_lock);
1740	error = rctl_string_to_rule(inputstr, &filter);
1741	free(inputstr, M_RCTL);
1742	if (error != 0) {
1743		sx_sunlock(&allproc_lock);
1744		return (error);
1745	}
1746
1747	bufsize = uap->outbuflen;
1748	if (bufsize > rctl_maxbufsize) {
1749		sx_sunlock(&allproc_lock);
1750		return (E2BIG);
1751	}
1752
1753	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1754	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1755	KASSERT(sb != NULL, ("sbuf_new failed"));
1756
1757	FOREACH_PROC_IN_SYSTEM(p) {
1758		RCTL_RLOCK();
1759		LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1760			/*
1761			 * Non-process rules will be added to the buffer later.
1762			 * Adding them here would result in duplicated output.
1763			 */
1764			if (link->rrl_rule->rr_subject_type !=
1765			    RCTL_SUBJECT_TYPE_PROCESS)
1766				continue;
1767			if (!rctl_rule_matches(link->rrl_rule, filter))
1768				continue;
1769			rctl_rule_to_sbuf(sb, link->rrl_rule);
1770			sbuf_printf(sb, ",");
1771		}
1772		RCTL_RUNLOCK();
1773	}
1774
1775	loginclass_racct_foreach(rctl_get_rules_callback,
1776	    rctl_rule_pre_callback, rctl_rule_post_callback,
1777	    filter, sb);
1778	ui_racct_foreach(rctl_get_rules_callback,
1779	    rctl_rule_pre_callback, rctl_rule_post_callback,
1780	    filter, sb);
1781	prison_racct_foreach(rctl_get_rules_callback,
1782	    rctl_rule_pre_callback, rctl_rule_post_callback,
1783	    filter, sb);
1784	if (sbuf_error(sb) == ENOMEM) {
1785		error = ERANGE;
1786		goto out;
1787	}
1788
1789	/*
1790	 * Remove trailing ",".
1791	 */
1792	if (sbuf_len(sb) > 0)
1793		sbuf_setpos(sb, sbuf_len(sb) - 1);
1794
1795	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1796out:
1797	rctl_rule_release(filter);
1798	sx_sunlock(&allproc_lock);
1799	free(buf, M_RCTL);
1800	return (error);
1801}
1802
1803int
1804sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1805{
1806	struct sbuf *sb;
1807	struct rctl_rule *filter;
1808	struct rctl_rule_link *link;
1809	char *inputstr, *buf;
1810	size_t bufsize;
1811	int error;
1812
1813	if (!racct_enable)
1814		return (ENOSYS);
1815
1816	error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1817	if (error != 0)
1818		return (error);
1819
1820	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1821	if (error != 0)
1822		return (error);
1823
1824	sx_slock(&allproc_lock);
1825	error = rctl_string_to_rule(inputstr, &filter);
1826	free(inputstr, M_RCTL);
1827	if (error != 0) {
1828		sx_sunlock(&allproc_lock);
1829		return (error);
1830	}
1831
1832	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1833		rctl_rule_release(filter);
1834		sx_sunlock(&allproc_lock);
1835		return (EINVAL);
1836	}
1837	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1838		rctl_rule_release(filter);
1839		sx_sunlock(&allproc_lock);
1840		return (EOPNOTSUPP);
1841	}
1842	if (filter->rr_subject.rs_proc == NULL) {
1843		rctl_rule_release(filter);
1844		sx_sunlock(&allproc_lock);
1845		return (EINVAL);
1846	}
1847
1848	bufsize = uap->outbuflen;
1849	if (bufsize > rctl_maxbufsize) {
1850		rctl_rule_release(filter);
1851		sx_sunlock(&allproc_lock);
1852		return (E2BIG);
1853	}
1854
1855	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1856	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1857	KASSERT(sb != NULL, ("sbuf_new failed"));
1858
1859	RCTL_RLOCK();
1860	LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1861	    rrl_next) {
1862		rctl_rule_to_sbuf(sb, link->rrl_rule);
1863		sbuf_printf(sb, ",");
1864	}
1865	RCTL_RUNLOCK();
1866	if (sbuf_error(sb) == ENOMEM) {
1867		error = ERANGE;
1868		goto out;
1869	}
1870
1871	/*
1872	 * Remove trailing ",".
1873	 */
1874	if (sbuf_len(sb) > 0)
1875		sbuf_setpos(sb, sbuf_len(sb) - 1);
1876
1877	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1878out:
1879	rctl_rule_release(filter);
1880	sx_sunlock(&allproc_lock);
1881	free(buf, M_RCTL);
1882	return (error);
1883}
1884
1885int
1886sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1887{
1888	struct rctl_rule *rule;
1889	char *inputstr;
1890	int error;
1891
1892	if (!racct_enable)
1893		return (ENOSYS);
1894
1895	error = priv_check(td, PRIV_RCTL_ADD_RULE);
1896	if (error != 0)
1897		return (error);
1898
1899	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1900	if (error != 0)
1901		return (error);
1902
1903	sx_slock(&allproc_lock);
1904	error = rctl_string_to_rule(inputstr, &rule);
1905	free(inputstr, M_RCTL);
1906	if (error != 0) {
1907		sx_sunlock(&allproc_lock);
1908		return (error);
1909	}
1910	/*
1911	 * The 'per' part of a rule is optional.
1912	 */
1913	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1914	    rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1915		rule->rr_per = rule->rr_subject_type;
1916
1917	if (!rctl_rule_fully_specified(rule)) {
1918		error = EINVAL;
1919		goto out;
1920	}
1921
1922	error = rctl_rule_add(rule);
1923
1924out:
1925	rctl_rule_release(rule);
1926	sx_sunlock(&allproc_lock);
1927	return (error);
1928}
1929
1930int
1931sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1932{
1933	struct rctl_rule *filter;
1934	char *inputstr;
1935	int error;
1936
1937	if (!racct_enable)
1938		return (ENOSYS);
1939
1940	error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1941	if (error != 0)
1942		return (error);
1943
1944	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1945	if (error != 0)
1946		return (error);
1947
1948	sx_slock(&allproc_lock);
1949	error = rctl_string_to_rule(inputstr, &filter);
1950	free(inputstr, M_RCTL);
1951	if (error != 0) {
1952		sx_sunlock(&allproc_lock);
1953		return (error);
1954	}
1955
1956	error = rctl_rule_remove(filter);
1957	rctl_rule_release(filter);
1958	sx_sunlock(&allproc_lock);
1959
1960	return (error);
1961}
1962
/*
 * Update RCTL rule list after credential change.
 *
 * The process' rule list is rebuilt from its own per-process rules plus
 * the rules linked to the uidinfo (cr_ruidinfo), login class and jail
 * of 'newcred'.  Link allocation uses M_WAITOK and therefore happens
 * with the rctl lock dropped; the sequence is:
 *
 *  1. count the applicable rules under the read lock;
 *  2. allocate that many empty links without the lock;
 *  3. under the write lock, fill the links in and, if the count still
 *     matches exactly, swap them in for the old list.
 *
 * If the rule lists changed between steps 1 and 3, everything
 * allocated is thrown away and the procedure starts over.
 */
void
rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
{
	LIST_HEAD(, rctl_rule_link) newrules;
	struct rctl_rule_link *link, *newlink;
	struct uidinfo *newuip;
	struct loginclass *newlc;
	struct prison_racct *newprr;
	int rulecnt, i;

	ASSERT_RACCT_ENABLED();

	newuip = newcred->cr_ruidinfo;
	newlc = newcred->cr_loginclass;
	newprr = newcred->cr_prison->pr_prison_racct;

	LIST_INIT(&newrules);

again:
	/*
	 * First, count the rules that apply to the process with new
	 * credentials.
	 */
	rulecnt = 0;
	RCTL_RLOCK();
	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
		/* Of the process' current rules only per-process ones carry over. */
		if (link->rrl_rule->rr_subject_type ==
		    RCTL_SUBJECT_TYPE_PROCESS)
			rulecnt++;
	}
	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
		rulecnt++;
	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
		rulecnt++;
	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
		rulecnt++;
	RCTL_RUNLOCK();

	/*
	 * Create temporary list.  We've dropped the rctl_lock in order
	 * to use M_WAITOK.
	 */
	for (i = 0; i < rulecnt; i++) {
		newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
		newlink->rrl_rule = NULL;
		newlink->rrl_exceeded = 0;
		LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
	}

	/* Cursor over the preallocated links; NULL once they run out. */
	newlink = LIST_FIRST(&newrules);

	/*
	 * Assign rules to the newly allocated list entries.
	 *
	 * 'rulecnt' is decremented for every link filled in, so it ends up
	 * at zero only if exactly as many rules exist now as were counted.
	 * Running out of links first means rules were added meanwhile.
	 */
	RCTL_WLOCK();
	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
		if (link->rrl_rule->rr_subject_type ==
		    RCTL_SUBJECT_TYPE_PROCESS) {
			if (newlink == NULL)
				goto goaround;
			rctl_rule_acquire(link->rrl_rule);
			newlink->rrl_rule = link->rrl_rule;
			newlink->rrl_exceeded = link->rrl_exceeded;
			newlink = LIST_NEXT(newlink, rrl_next);
			rulecnt--;
		}
	}

	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink->rrl_exceeded = link->rrl_exceeded;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink->rrl_exceeded = link->rrl_exceeded;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink->rrl_exceeded = link->rrl_exceeded;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	if (rulecnt == 0) {
		/*
		 * Free the old rule list.
		 */
		while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
			link = LIST_FIRST(&p->p_racct->r_rule_links);
			LIST_REMOVE(link, rrl_next);
			rctl_rule_release(link->rrl_rule);
			uma_zfree(rctl_rule_link_zone, link);
		}

		/*
		 * Replace lists and we're done.
		 *
		 * XXX: Is there any way to switch list heads instead
		 *      of iterating here?
		 */
		while (!LIST_EMPTY(&newrules)) {
			newlink = LIST_FIRST(&newrules);
			LIST_REMOVE(newlink, rrl_next);
			LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
			    newlink, rrl_next);
		}

		RCTL_WUNLOCK();

		return;
	}

	/*
	 * Reached either by falling through (fewer rules than counted, so
	 * rulecnt is still positive) or by jumping here (more rules than
	 * preallocated links).
	 */
goaround:
	RCTL_WUNLOCK();

	/*
	 * Rule list changed while we were not holding the rctl_lock.
	 * Free the new list and try again.
	 */
	while (!LIST_EMPTY(&newrules)) {
		newlink = LIST_FIRST(&newrules);
		LIST_REMOVE(newlink, rrl_next);
		/* Links filled in before the mismatch hold a rule reference. */
		if (newlink->rrl_rule != NULL)
			rctl_rule_release(newlink->rrl_rule);
		uma_zfree(rctl_rule_link_zone, newlink);
	}

	goto again;
}
2110
2111/*
2112 * Assign RCTL rules to the newly created process.
2113 */
2114int
2115rctl_proc_fork(struct proc *parent, struct proc *child)
2116{
2117	struct rctl_rule *rule;
2118	struct rctl_rule_link *link;
2119	int error;
2120
2121	LIST_INIT(&child->p_racct->r_rule_links);
2122
2123	ASSERT_RACCT_ENABLED();
2124	KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2125
2126	RCTL_WLOCK();
2127
2128	/*
2129	 * Go through limits applicable to the parent and assign them
2130	 * to the child.  Rules with 'process' subject have to be duplicated
2131	 * in order to make their rr_subject point to the new process.
2132	 */
2133	LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2134		if (link->rrl_rule->rr_subject_type ==
2135		    RCTL_SUBJECT_TYPE_PROCESS) {
2136			rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2137			if (rule == NULL)
2138				goto fail;
2139			KASSERT(rule->rr_subject.rs_proc == parent,
2140			    ("rule->rr_subject.rs_proc != parent"));
2141			rule->rr_subject.rs_proc = child;
2142			error = rctl_racct_add_rule_locked(child->p_racct,
2143			    rule);
2144			rctl_rule_release(rule);
2145			if (error != 0)
2146				goto fail;
2147		} else {
2148			error = rctl_racct_add_rule_locked(child->p_racct,
2149			    link->rrl_rule);
2150			if (error != 0)
2151				goto fail;
2152		}
2153	}
2154
2155	RCTL_WUNLOCK();
2156	return (0);
2157
2158fail:
2159	while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2160		link = LIST_FIRST(&child->p_racct->r_rule_links);
2161		LIST_REMOVE(link, rrl_next);
2162		rctl_rule_release(link->rrl_rule);
2163		uma_zfree(rctl_rule_link_zone, link);
2164	}
2165	RCTL_WUNLOCK();
2166	return (EAGAIN);
2167}
2168
2169/*
2170 * Release rules attached to the racct.
2171 */
2172void
2173rctl_racct_release(struct racct *racct)
2174{
2175	struct rctl_rule_link *link;
2176
2177	ASSERT_RACCT_ENABLED();
2178
2179	RCTL_WLOCK();
2180	while (!LIST_EMPTY(&racct->r_rule_links)) {
2181		link = LIST_FIRST(&racct->r_rule_links);
2182		LIST_REMOVE(link, rrl_next);
2183		rctl_rule_release(link->rrl_rule);
2184		uma_zfree(rctl_rule_link_zone, link);
2185	}
2186	RCTL_WUNLOCK();
2187}
2188
2189static void
2190rctl_init(void)
2191{
2192
2193	if (!racct_enable)
2194		return;
2195
2196	rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2197	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2198	rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2199	    sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2200	    UMA_ALIGN_PTR, 0);
2201
2202	/*
2203	 * Set default values, making sure not to overwrite the ones
2204	 * fetched from tunables.  Most of those could be set at the
2205	 * declaration, except for the rctl_throttle_max - we cannot
2206	 * set it there due to hz not being compile time constant.
2207	 */
2208	if (rctl_throttle_min < 1)
2209		rctl_throttle_min = 1;
2210	if (rctl_throttle_max < rctl_throttle_min)
2211		rctl_throttle_max = 2 * hz;
2212	if (rctl_throttle_pct < 0)
2213		rctl_throttle_pct = 100;
2214	if (rctl_throttle_pct2 < 0)
2215		rctl_throttle_pct2 = 100;
2216}
2217
2218#else /* !RCTL */
2219
/*
 * Stub for kernels built without RCTL: the system call is not available
 * and always fails with ENOSYS.
 */
int
sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{
	const int error = ENOSYS;

	(void)td;
	(void)uap;
	return (error);
}
2226
/*
 * Stub for kernels built without RCTL: the system call is not available
 * and always fails with ENOSYS.
 */
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{
	const int error = ENOSYS;

	(void)td;
	(void)uap;
	return (error);
}
2233
/*
 * Stub for kernels built without RCTL: the system call is not available
 * and always fails with ENOSYS.
 */
int
sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{
	const int error = ENOSYS;

	(void)td;
	(void)uap;
	return (error);
}
2240
/*
 * Stub for kernels built without RCTL: the system call is not available
 * and always fails with ENOSYS.
 */
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{
	const int error = ENOSYS;

	(void)td;
	(void)uap;
	return (error);
}
2247
/*
 * Stub for kernels built without RCTL: the system call is not available
 * and always fails with ENOSYS.
 */
int
sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{
	const int error = ENOSYS;

	(void)td;
	(void)uap;
	return (error);
}
2254
2255#endif /* !RCTL */
2256