1/*-
2 * Copyright (c) 2010 The FreeBSD Foundation
3 * All rights reserved.
4 *
5 * This software was developed by Edward Tomasz Napierala under sponsorship
6 * from the FreeBSD Foundation.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * $FreeBSD: releng/11.0/sys/kern/kern_rctl.c 298819 2016-04-29 22:15:33Z pfg $
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: releng/11.0/sys/kern/kern_rctl.c 298819 2016-04-29 22:15:33Z pfg $");
34
35#include <sys/param.h>
36#include <sys/bus.h>
37#include <sys/malloc.h>
38#include <sys/queue.h>
39#include <sys/refcount.h>
40#include <sys/jail.h>
41#include <sys/kernel.h>
42#include <sys/limits.h>
43#include <sys/loginclass.h>
44#include <sys/priv.h>
45#include <sys/proc.h>
46#include <sys/racct.h>
47#include <sys/rctl.h>
48#include <sys/resourcevar.h>
49#include <sys/sx.h>
50#include <sys/sysent.h>
51#include <sys/sysproto.h>
52#include <sys/systm.h>
53#include <sys/types.h>
54#include <sys/eventhandler.h>
55#include <sys/lock.h>
56#include <sys/mutex.h>
57#include <sys/rwlock.h>
58#include <sys/sbuf.h>
59#include <sys/taskqueue.h>
60#include <sys/tree.h>
61#include <vm/uma.h>
62
63#ifdef RCTL
64#ifndef RACCT
65#error "The RCTL option requires the RACCT option"
66#endif
67
68FEATURE(rctl, "Resource Limits");
69
#define	HRF_DEFAULT		0
#define	HRF_DONT_INHERIT	1
#define	HRF_DONT_ACCUMULATE	2

/*
 * Buffer size limits.  The arithmetic is parenthesized so that the macros
 * expand to a single value even when used inside a larger expression
 * (e.g. as the right-hand operand of a division).
 */
#define	RCTL_MAX_INBUFSIZE	(4 * 1024)
#define	RCTL_MAX_OUTBUFSIZE	(16 * 1024 * 1024)
/* Size of the temporary buffer used to format a rule for log/devctl output. */
#define	RCTL_LOG_BUFSIZE	128

/* Margin subtracted from the available %cpu by rctl_pcpu_available(). */
#define	RCTL_PCPU_SHIFT		(10 * 1000000)
79
80static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
81static int rctl_log_rate_limit = 10;
82static int rctl_devctl_rate_limit = 10;
83
84/*
85 * Values below are initialized in rctl_init().
86 */
87static int rctl_throttle_min = -1;
88static int rctl_throttle_max = -1;
89static int rctl_throttle_pct = -1;
90static int rctl_throttle_pct2 = -1;
91
92static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
93static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
94static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
95static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);
96
97SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
98SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
99    &rctl_maxbufsize, 0, "Maximum output buffer size");
100SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
101    &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
102SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
103    &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
104SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
105    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_min_sysctl, "IU",
106    "Shortest throttling duration, in hz");
107TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
108SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
109    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_max_sysctl, "IU",
110    "Longest throttling duration, in hz");
111TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
112SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
113    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct_sysctl, "IU",
114    "Throttling penalty for process consumption, in percent");
115TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
116SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
117    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct2_sysctl, "IU",
118    "Throttling penalty for container consumption, in percent");
119TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
120
121/*
122 * 'rctl_rule_link' connects a rule with every racct it's related to.
123 * For example, rule 'user:X:openfiles:deny=N/process' is linked
124 * with uidinfo for user X, and to each process of that user.
125 */
126struct rctl_rule_link {
127	LIST_ENTRY(rctl_rule_link)	rrl_next;
128	struct rctl_rule		*rrl_rule;
129	int				rrl_exceeded;
130};
131
/*
 * One entry of a name <-> value translation table.  Tables are arrays of
 * these, terminated by an entry whose d_name is NULL.
 */
struct dict {
	const char	*d_name;	/* textual name */
	int		d_value;	/* matching numeric constant */
};
136
137static struct dict subjectnames[] = {
138	{ "process", RCTL_SUBJECT_TYPE_PROCESS },
139	{ "user", RCTL_SUBJECT_TYPE_USER },
140	{ "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
141	{ "jail", RCTL_SUBJECT_TYPE_JAIL },
142	{ NULL, -1 }};
143
144static struct dict resourcenames[] = {
145	{ "cputime", RACCT_CPU },
146	{ "datasize", RACCT_DATA },
147	{ "stacksize", RACCT_STACK },
148	{ "coredumpsize", RACCT_CORE },
149	{ "memoryuse", RACCT_RSS },
150	{ "memorylocked", RACCT_MEMLOCK },
151	{ "maxproc", RACCT_NPROC },
152	{ "openfiles", RACCT_NOFILE },
153	{ "vmemoryuse", RACCT_VMEM },
154	{ "pseudoterminals", RACCT_NPTS },
155	{ "swapuse", RACCT_SWAP },
156	{ "nthr", RACCT_NTHR },
157	{ "msgqqueued", RACCT_MSGQQUEUED },
158	{ "msgqsize", RACCT_MSGQSIZE },
159	{ "nmsgq", RACCT_NMSGQ },
160	{ "nsem", RACCT_NSEM },
161	{ "nsemop", RACCT_NSEMOP },
162	{ "nshm", RACCT_NSHM },
163	{ "shmsize", RACCT_SHMSIZE },
164	{ "wallclock", RACCT_WALLCLOCK },
165	{ "pcpu", RACCT_PCTCPU },
166	{ "readbps", RACCT_READBPS },
167	{ "writebps", RACCT_WRITEBPS },
168	{ "readiops", RACCT_READIOPS },
169	{ "writeiops", RACCT_WRITEIOPS },
170	{ NULL, -1 }};
171
172static struct dict actionnames[] = {
173	{ "sighup", RCTL_ACTION_SIGHUP },
174	{ "sigint", RCTL_ACTION_SIGINT },
175	{ "sigquit", RCTL_ACTION_SIGQUIT },
176	{ "sigill", RCTL_ACTION_SIGILL },
177	{ "sigtrap", RCTL_ACTION_SIGTRAP },
178	{ "sigabrt", RCTL_ACTION_SIGABRT },
179	{ "sigemt", RCTL_ACTION_SIGEMT },
180	{ "sigfpe", RCTL_ACTION_SIGFPE },
181	{ "sigkill", RCTL_ACTION_SIGKILL },
182	{ "sigbus", RCTL_ACTION_SIGBUS },
183	{ "sigsegv", RCTL_ACTION_SIGSEGV },
184	{ "sigsys", RCTL_ACTION_SIGSYS },
185	{ "sigpipe", RCTL_ACTION_SIGPIPE },
186	{ "sigalrm", RCTL_ACTION_SIGALRM },
187	{ "sigterm", RCTL_ACTION_SIGTERM },
188	{ "sigurg", RCTL_ACTION_SIGURG },
189	{ "sigstop", RCTL_ACTION_SIGSTOP },
190	{ "sigtstp", RCTL_ACTION_SIGTSTP },
191	{ "sigchld", RCTL_ACTION_SIGCHLD },
192	{ "sigttin", RCTL_ACTION_SIGTTIN },
193	{ "sigttou", RCTL_ACTION_SIGTTOU },
194	{ "sigio", RCTL_ACTION_SIGIO },
195	{ "sigxcpu", RCTL_ACTION_SIGXCPU },
196	{ "sigxfsz", RCTL_ACTION_SIGXFSZ },
197	{ "sigvtalrm", RCTL_ACTION_SIGVTALRM },
198	{ "sigprof", RCTL_ACTION_SIGPROF },
199	{ "sigwinch", RCTL_ACTION_SIGWINCH },
200	{ "siginfo", RCTL_ACTION_SIGINFO },
201	{ "sigusr1", RCTL_ACTION_SIGUSR1 },
202	{ "sigusr2", RCTL_ACTION_SIGUSR2 },
203	{ "sigthr", RCTL_ACTION_SIGTHR },
204	{ "deny", RCTL_ACTION_DENY },
205	{ "log", RCTL_ACTION_LOG },
206	{ "devctl", RCTL_ACTION_DEVCTL },
207	{ "throttle", RCTL_ACTION_THROTTLE },
208	{ NULL, -1 }};
209
210static void rctl_init(void);
211SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);
212
213static uma_zone_t rctl_rule_zone;
214static uma_zone_t rctl_rule_link_zone;
215
216static int rctl_rule_fully_specified(const struct rctl_rule *rule);
217static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);
218
219static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
220
221static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
222{
223	int error, val = rctl_throttle_min;
224
225	error = sysctl_handle_int(oidp, &val, 0, req);
226	if (error || !req->newptr)
227		return (error);
228	if (val < 1 || val > rctl_throttle_max)
229		return (EINVAL);
230
231	RACCT_LOCK();
232	rctl_throttle_min = val;
233	RACCT_UNLOCK();
234
235	return (0);
236}
237
238static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
239{
240	int error, val = rctl_throttle_max;
241
242	error = sysctl_handle_int(oidp, &val, 0, req);
243	if (error || !req->newptr)
244		return (error);
245	if (val < rctl_throttle_min)
246		return (EINVAL);
247
248	RACCT_LOCK();
249	rctl_throttle_max = val;
250	RACCT_UNLOCK();
251
252	return (0);
253}
254
255static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
256{
257	int error, val = rctl_throttle_pct;
258
259	error = sysctl_handle_int(oidp, &val, 0, req);
260	if (error || !req->newptr)
261		return (error);
262	if (val < 0)
263		return (EINVAL);
264
265	RACCT_LOCK();
266	rctl_throttle_pct = val;
267	RACCT_UNLOCK();
268
269	return (0);
270}
271
272static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
273{
274	int error, val = rctl_throttle_pct2;
275
276	error = sysctl_handle_int(oidp, &val, 0, req);
277	if (error || !req->newptr)
278		return (error);
279	if (val < 0)
280		return (EINVAL);
281
282	RACCT_LOCK();
283	rctl_throttle_pct2 = val;
284	RACCT_UNLOCK();
285
286	return (0);
287}
288
289static const char *
290rctl_subject_type_name(int subject)
291{
292	int i;
293
294	for (i = 0; subjectnames[i].d_name != NULL; i++) {
295		if (subjectnames[i].d_value == subject)
296			return (subjectnames[i].d_name);
297	}
298
299	panic("rctl_subject_type_name: unknown subject type %d", subject);
300}
301
302static const char *
303rctl_action_name(int action)
304{
305	int i;
306
307	for (i = 0; actionnames[i].d_name != NULL; i++) {
308		if (actionnames[i].d_value == action)
309			return (actionnames[i].d_name);
310	}
311
312	panic("rctl_action_name: unknown action %d", action);
313}
314
315const char *
316rctl_resource_name(int resource)
317{
318	int i;
319
320	for (i = 0; resourcenames[i].d_name != NULL; i++) {
321		if (resourcenames[i].d_value == resource)
322			return (resourcenames[i].d_name);
323	}
324
325	panic("rctl_resource_name: unknown resource %d", resource);
326}
327
328static struct racct *
329rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
330{
331	struct ucred *cred = p->p_ucred;
332
333	ASSERT_RACCT_ENABLED();
334	RACCT_LOCK_ASSERT();
335
336	switch (rule->rr_per) {
337	case RCTL_SUBJECT_TYPE_PROCESS:
338		return (p->p_racct);
339	case RCTL_SUBJECT_TYPE_USER:
340		return (cred->cr_ruidinfo->ui_racct);
341	case RCTL_SUBJECT_TYPE_LOGINCLASS:
342		return (cred->cr_loginclass->lc_racct);
343	case RCTL_SUBJECT_TYPE_JAIL:
344		return (cred->cr_prison->pr_prison_racct->prr_racct);
345	default:
346		panic("%s: unknown per %d", __func__, rule->rr_per);
347	}
348}
349
350/*
351 * Return the amount of resource that can be allocated by 'p' before
352 * hitting 'rule'.
353 */
354static int64_t
355rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
356{
357	const struct racct *racct;
358	int64_t available;
359
360	ASSERT_RACCT_ENABLED();
361	RACCT_LOCK_ASSERT();
362
363	racct = rctl_proc_rule_to_racct(p, rule);
364	available = rule->rr_amount - racct->r_resources[rule->rr_resource];
365
366	return (available);
367}
368
369/*
370 * Called every second for proc, uidinfo, loginclass, and jail containers.
371 * If the limit isn't exceeded, it decreases the usage amount to zero.
372 * Otherwise, it decreases it by the value of the limit.  This way
373 * resource consumption exceeding the limit "carries over" to the next
374 * period.
375 */
376void
377rctl_throttle_decay(struct racct *racct, int resource)
378{
379	struct rctl_rule *rule;
380	struct rctl_rule_link *link;
381	int64_t minavailable;
382
383	ASSERT_RACCT_ENABLED();
384	RACCT_LOCK_ASSERT();
385
386	minavailable = INT64_MAX;
387
388	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
389		rule = link->rrl_rule;
390
391		if (rule->rr_resource != resource)
392			continue;
393		if (rule->rr_action != RCTL_ACTION_THROTTLE)
394			continue;
395
396		if (rule->rr_amount < minavailable)
397			minavailable = rule->rr_amount;
398	}
399
400	if (racct->r_resources[resource] < minavailable) {
401		racct->r_resources[resource] = 0;
402	} else {
403		/*
404		 * Cap utilization counter at ten times the limit.  Otherwise,
405		 * if we changed the rule lowering the allowed amount, it could
406		 * take unreasonably long time for the accumulated resource
407		 * usage to drop.
408		 */
409		if (racct->r_resources[resource] > minavailable * 10)
410			racct->r_resources[resource] = minavailable * 10;
411
412		racct->r_resources[resource] -= minavailable;
413	}
414}
415
416/*
417 * Special version of rctl_get_available() for the %CPU resource.
418 * We slightly cheat here and return less than we normally would.
419 */
420int64_t
421rctl_pcpu_available(const struct proc *p) {
422	struct rctl_rule *rule;
423	struct rctl_rule_link *link;
424	int64_t available, minavailable, limit;
425
426	ASSERT_RACCT_ENABLED();
427	RACCT_LOCK_ASSERT();
428
429	minavailable = INT64_MAX;
430	limit = 0;
431
432	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
433		rule = link->rrl_rule;
434		if (rule->rr_resource != RACCT_PCTCPU)
435			continue;
436		if (rule->rr_action != RCTL_ACTION_DENY)
437			continue;
438		available = rctl_available_resource(p, rule);
439		if (available < minavailable) {
440			minavailable = available;
441			limit = rule->rr_amount;
442		}
443	}
444
445	/*
446	 * Return slightly less than actual value of the available
447	 * %cpu resource.  This makes %cpu throttling more aggressive
448	 * and lets us act sooner than the limits are already exceeded.
449	 */
450	if (limit != 0) {
451		if (limit > 2 * RCTL_PCPU_SHIFT)
452			minavailable -= RCTL_PCPU_SHIFT;
453		else
454			minavailable -= (limit / 2);
455	}
456
457	return (minavailable);
458}
459
/*
 * Saturating unsigned addition: returns a + b, or UINT64_MAX if the sum
 * does not fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{

	/* The sum overflows exactly when b exceeds the room left above a. */
	if (b > UINT64_MAX - a)
		return (UINT64_MAX);

	return (a + b);
}
475
/*
 * Saturating unsigned multiplication: returns a * b, or UINT64_MAX if the
 * product does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

	/* A zero factor can never overflow and must not be divided by. */
	if (b == 0)
		return (0);

	if (a > UINT64_MAX / b)
		return (UINT64_MAX);

	return (a * b);
}
485
486/*
487 * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
488 * to what it keeps allocated now.  Returns non-zero if the allocation should
489 * be denied, 0 otherwise.
490 */
491int
492rctl_enforce(struct proc *p, int resource, uint64_t amount)
493{
494	static struct timeval log_lasttime, devctl_lasttime;
495	static int log_curtime = 0, devctl_curtime = 0;
496	struct rctl_rule *rule;
497	struct rctl_rule_link *link;
498	struct sbuf sb;
499	char *buf;
500	int64_t available;
501	uint64_t sleep_ms, sleep_ratio;
502	int should_deny = 0;
503
504	ASSERT_RACCT_ENABLED();
505	RACCT_LOCK_ASSERT();
506
507	/*
508	 * There may be more than one matching rule; go through all of them.
509	 * Denial should be done last, after logging and sending signals.
510	 */
511	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
512		rule = link->rrl_rule;
513		if (rule->rr_resource != resource)
514			continue;
515
516		available = rctl_available_resource(p, rule);
517		if (available >= (int64_t)amount) {
518			link->rrl_exceeded = 0;
519			continue;
520		}
521
522		switch (rule->rr_action) {
523		case RCTL_ACTION_DENY:
524			should_deny = 1;
525			continue;
526		case RCTL_ACTION_LOG:
527			/*
528			 * If rrl_exceeded != 0, it means we've already
529			 * logged a warning for this process.
530			 */
531			if (link->rrl_exceeded != 0)
532				continue;
533
534			/*
535			 * If the process state is not fully initialized yet,
536			 * we can't access most of the required fields, e.g.
537			 * p->p_comm.  This happens when called from fork1().
538			 * Ignore this rule for now; it will be processed just
539			 * after fork, when called from racct_proc_fork_done().
540			 */
541			if (p->p_state != PRS_NORMAL)
542				continue;
543
544			if (!ppsratecheck(&log_lasttime, &log_curtime,
545			    rctl_log_rate_limit))
546				continue;
547
548			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
549			if (buf == NULL) {
550				printf("rctl_enforce: out of memory\n");
551				continue;
552			}
553			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
554			rctl_rule_to_sbuf(&sb, rule);
555			sbuf_finish(&sb);
556			printf("rctl: rule \"%s\" matched by pid %d "
557			    "(%s), uid %d, jail %s\n", sbuf_data(&sb),
558			    p->p_pid, p->p_comm, p->p_ucred->cr_uid,
559			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
560			sbuf_delete(&sb);
561			free(buf, M_RCTL);
562			link->rrl_exceeded = 1;
563			continue;
564		case RCTL_ACTION_DEVCTL:
565			if (link->rrl_exceeded != 0)
566				continue;
567
568			if (p->p_state != PRS_NORMAL)
569				continue;
570
571			if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
572			    rctl_devctl_rate_limit))
573				continue;
574
575			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
576			if (buf == NULL) {
577				printf("rctl_enforce: out of memory\n");
578				continue;
579			}
580			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
581			sbuf_printf(&sb, "rule=");
582			rctl_rule_to_sbuf(&sb, rule);
583			sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
584			    p->p_pid, p->p_ucred->cr_ruid,
585			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
586			sbuf_finish(&sb);
587			devctl_notify_f("RCTL", "rule", "matched",
588			    sbuf_data(&sb), M_NOWAIT);
589			sbuf_delete(&sb);
590			free(buf, M_RCTL);
591			link->rrl_exceeded = 1;
592			continue;
593		case RCTL_ACTION_THROTTLE:
594			if (p->p_state != PRS_NORMAL)
595				continue;
596
597			/*
598			 * Make the process sleep for a fraction of second
599			 * proportional to the ratio of process' resource
600			 * utilization compared to the limit.  The point is
601			 * to penalize resource hogs: processes that consume
602			 * more of the available resources sleep for longer.
603			 *
604			 * We're trying to defer division until the very end,
605			 * to minimize the rounding effects.  The following
606			 * calculation could have been written in a clearer
607			 * way like this:
608			 *
609			 * sleep_ms = hz * p->p_racct->r_resources[resource] /
610			 *     rule->rr_amount;
611			 * sleep_ms *= rctl_throttle_pct / 100;
612			 * if (sleep_ms < rctl_throttle_min)
613			 *         sleep_ms = rctl_throttle_min;
614			 *
615			 */
616			sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
617			sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
618			if (sleep_ms < rctl_throttle_min * rule->rr_amount)
619				sleep_ms = rctl_throttle_min * rule->rr_amount;
620
621			/*
622			 * Multiply that by the ratio of the resource
623			 * consumption for the container compared to the limit,
624			 * squared.  In other words, a process in a container
625			 * that is two times over the limit will be throttled
626			 * four times as much for hitting the same rule.  The
627			 * point is to penalize processes more if the container
628			 * itself (eg certain UID or jail) is above the limit.
629			 */
630			if (available < 0)
631				sleep_ratio = -available / rule->rr_amount;
632			else
633				sleep_ratio = 0;
634			sleep_ratio = xmul(sleep_ratio, sleep_ratio);
635			sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
636			sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
637
638			/*
639			 * Finally the division.
640			 */
641			sleep_ms /= rule->rr_amount;
642
643			if (sleep_ms > rctl_throttle_max)
644				sleep_ms = rctl_throttle_max;
645#if 0
646			printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n",
647			   __func__, p->p_pid, p->p_comm,
648			   p->p_racct->r_resources[resource],
649			   rule->rr_amount, (uintmax_t)sleep_ms,
650			   (uintmax_t)sleep_ratio, (intmax_t)available);
651#endif
652
653			KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
654			    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
655			racct_proc_throttle(p, sleep_ms);
656			continue;
657		default:
658			if (link->rrl_exceeded != 0)
659				continue;
660
661			if (p->p_state != PRS_NORMAL)
662				continue;
663
664			KASSERT(rule->rr_action > 0 &&
665			    rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
666			    ("rctl_enforce: unknown action %d",
667			     rule->rr_action));
668
669			/*
670			 * We're using the fact that RCTL_ACTION_SIG* values
671			 * are equal to their counterparts from sys/signal.h.
672			 */
673			kern_psignal(p, rule->rr_action);
674			link->rrl_exceeded = 1;
675			continue;
676		}
677	}
678
679	if (should_deny) {
680		/*
681		 * Return fake error code; the caller should change it
682		 * into one proper for the situation - EFSIZ, ENOMEM etc.
683		 */
684		return (EDOOFUS);
685	}
686
687	return (0);
688}
689
690uint64_t
691rctl_get_limit(struct proc *p, int resource)
692{
693	struct rctl_rule *rule;
694	struct rctl_rule_link *link;
695	uint64_t amount = UINT64_MAX;
696
697	ASSERT_RACCT_ENABLED();
698	RACCT_LOCK_ASSERT();
699
700	/*
701	 * There may be more than one matching rule; go through all of them.
702	 * Denial should be done last, after logging and sending signals.
703	 */
704	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
705		rule = link->rrl_rule;
706		if (rule->rr_resource != resource)
707			continue;
708		if (rule->rr_action != RCTL_ACTION_DENY)
709			continue;
710		if (rule->rr_amount < amount)
711			amount = rule->rr_amount;
712	}
713
714	return (amount);
715}
716
717uint64_t
718rctl_get_available(struct proc *p, int resource)
719{
720	struct rctl_rule *rule;
721	struct rctl_rule_link *link;
722	int64_t available, minavailable, allocated;
723
724	minavailable = INT64_MAX;
725
726	ASSERT_RACCT_ENABLED();
727	RACCT_LOCK_ASSERT();
728
729	/*
730	 * There may be more than one matching rule; go through all of them.
731	 * Denial should be done last, after logging and sending signals.
732	 */
733	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
734		rule = link->rrl_rule;
735		if (rule->rr_resource != resource)
736			continue;
737		if (rule->rr_action != RCTL_ACTION_DENY)
738			continue;
739		available = rctl_available_resource(p, rule);
740		if (available < minavailable)
741			minavailable = available;
742	}
743
744	/*
745	 * XXX: Think about this _hard_.
746	 */
747	allocated = p->p_racct->r_resources[resource];
748	if (minavailable < INT64_MAX - allocated)
749		minavailable += allocated;
750	if (minavailable < 0)
751		minavailable = 0;
752
753	return (minavailable);
754}
755
756static int
757rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
758{
759
760	ASSERT_RACCT_ENABLED();
761
762	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
763		if (rule->rr_subject_type != filter->rr_subject_type)
764			return (0);
765
766		switch (filter->rr_subject_type) {
767		case RCTL_SUBJECT_TYPE_PROCESS:
768			if (filter->rr_subject.rs_proc != NULL &&
769			    rule->rr_subject.rs_proc !=
770			    filter->rr_subject.rs_proc)
771				return (0);
772			break;
773		case RCTL_SUBJECT_TYPE_USER:
774			if (filter->rr_subject.rs_uip != NULL &&
775			    rule->rr_subject.rs_uip !=
776			    filter->rr_subject.rs_uip)
777				return (0);
778			break;
779		case RCTL_SUBJECT_TYPE_LOGINCLASS:
780			if (filter->rr_subject.rs_loginclass != NULL &&
781			    rule->rr_subject.rs_loginclass !=
782			    filter->rr_subject.rs_loginclass)
783				return (0);
784			break;
785		case RCTL_SUBJECT_TYPE_JAIL:
786			if (filter->rr_subject.rs_prison_racct != NULL &&
787			    rule->rr_subject.rs_prison_racct !=
788			    filter->rr_subject.rs_prison_racct)
789				return (0);
790			break;
791		default:
792			panic("rctl_rule_matches: unknown subject type %d",
793			    filter->rr_subject_type);
794		}
795	}
796
797	if (filter->rr_resource != RACCT_UNDEFINED) {
798		if (rule->rr_resource != filter->rr_resource)
799			return (0);
800	}
801
802	if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
803		if (rule->rr_action != filter->rr_action)
804			return (0);
805	}
806
807	if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
808		if (rule->rr_amount != filter->rr_amount)
809			return (0);
810	}
811
812	if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
813		if (rule->rr_per != filter->rr_per)
814			return (0);
815	}
816
817	return (1);
818}
819
820static int
821str2value(const char *str, int *value, struct dict *table)
822{
823	int i;
824
825	if (value == NULL)
826		return (EINVAL);
827
828	for (i = 0; table[i].d_name != NULL; i++) {
829		if (strcasecmp(table[i].d_name, str) == 0) {
830			*value =  table[i].d_value;
831			return (0);
832		}
833	}
834
835	return (EINVAL);
836}
837
838static int
839str2id(const char *str, id_t *value)
840{
841	char *end;
842
843	if (str == NULL)
844		return (EINVAL);
845
846	*value = strtoul(str, &end, 10);
847	if ((size_t)(end - str) != strlen(str))
848		return (EINVAL);
849
850	return (0);
851}
852
/*
 * Parse 'str' as a decimal number into '*value'.  Returns 0 on success,
 * EINVAL if 'str' is NULL or has trailing characters that are not part of
 * the number, and ERANGE if the converted value is negative when stored
 * as int64_t.  '*value' is written whenever 'str' is not NULL.
 */
static int
str2int64(const char *str, int64_t *value)
{
	char *tail;

	if (str == NULL)
		return (EINVAL);

	*value = strtoul(str, &tail, 10);

	/* The conversion has to consume the string up to its terminator. */
	if (*tail != '\0')
		return (EINVAL);

	return (*value < 0 ? ERANGE : 0);
}
870
871/*
872 * Connect the rule to the racct, increasing refcount for the rule.
873 */
874static void
875rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
876{
877	struct rctl_rule_link *link;
878
879	ASSERT_RACCT_ENABLED();
880	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
881
882	rctl_rule_acquire(rule);
883	link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
884	link->rrl_rule = rule;
885	link->rrl_exceeded = 0;
886
887	RACCT_LOCK();
888	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
889	RACCT_UNLOCK();
890}
891
892static int
893rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
894{
895	struct rctl_rule_link *link;
896
897	ASSERT_RACCT_ENABLED();
898	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
899	RACCT_LOCK_ASSERT();
900
901	link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
902	if (link == NULL)
903		return (ENOMEM);
904	rctl_rule_acquire(rule);
905	link->rrl_rule = rule;
906	link->rrl_exceeded = 0;
907
908	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
909
910	return (0);
911}
912
913/*
914 * Remove limits for a rules matching the filter and release
915 * the refcounts for the rules, possibly freeing them.  Returns
916 * the number of limit structures removed.
917 */
918static int
919rctl_racct_remove_rules(struct racct *racct,
920    const struct rctl_rule *filter)
921{
922	struct rctl_rule_link *link, *linktmp;
923	int removed = 0;
924
925	ASSERT_RACCT_ENABLED();
926	RACCT_LOCK_ASSERT();
927
928	LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
929		if (!rctl_rule_matches(link->rrl_rule, filter))
930			continue;
931
932		LIST_REMOVE(link, rrl_next);
933		rctl_rule_release(link->rrl_rule);
934		uma_zfree(rctl_rule_link_zone, link);
935		removed++;
936	}
937	return (removed);
938}
939
940static void
941rctl_rule_acquire_subject(struct rctl_rule *rule)
942{
943
944	ASSERT_RACCT_ENABLED();
945
946	switch (rule->rr_subject_type) {
947	case RCTL_SUBJECT_TYPE_UNDEFINED:
948	case RCTL_SUBJECT_TYPE_PROCESS:
949		break;
950	case RCTL_SUBJECT_TYPE_JAIL:
951		if (rule->rr_subject.rs_prison_racct != NULL)
952			prison_racct_hold(rule->rr_subject.rs_prison_racct);
953		break;
954	case RCTL_SUBJECT_TYPE_USER:
955		if (rule->rr_subject.rs_uip != NULL)
956			uihold(rule->rr_subject.rs_uip);
957		break;
958	case RCTL_SUBJECT_TYPE_LOGINCLASS:
959		if (rule->rr_subject.rs_loginclass != NULL)
960			loginclass_hold(rule->rr_subject.rs_loginclass);
961		break;
962	default:
963		panic("rctl_rule_acquire_subject: unknown subject type %d",
964		    rule->rr_subject_type);
965	}
966}
967
968static void
969rctl_rule_release_subject(struct rctl_rule *rule)
970{
971
972	ASSERT_RACCT_ENABLED();
973
974	switch (rule->rr_subject_type) {
975	case RCTL_SUBJECT_TYPE_UNDEFINED:
976	case RCTL_SUBJECT_TYPE_PROCESS:
977		break;
978	case RCTL_SUBJECT_TYPE_JAIL:
979		if (rule->rr_subject.rs_prison_racct != NULL)
980			prison_racct_free(rule->rr_subject.rs_prison_racct);
981		break;
982	case RCTL_SUBJECT_TYPE_USER:
983		if (rule->rr_subject.rs_uip != NULL)
984			uifree(rule->rr_subject.rs_uip);
985		break;
986	case RCTL_SUBJECT_TYPE_LOGINCLASS:
987		if (rule->rr_subject.rs_loginclass != NULL)
988			loginclass_free(rule->rr_subject.rs_loginclass);
989		break;
990	default:
991		panic("rctl_rule_release_subject: unknown subject type %d",
992		    rule->rr_subject_type);
993	}
994}
995
996struct rctl_rule *
997rctl_rule_alloc(int flags)
998{
999	struct rctl_rule *rule;
1000
1001	ASSERT_RACCT_ENABLED();
1002
1003	rule = uma_zalloc(rctl_rule_zone, flags);
1004	if (rule == NULL)
1005		return (NULL);
1006	rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1007	rule->rr_subject.rs_proc = NULL;
1008	rule->rr_subject.rs_uip = NULL;
1009	rule->rr_subject.rs_loginclass = NULL;
1010	rule->rr_subject.rs_prison_racct = NULL;
1011	rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1012	rule->rr_resource = RACCT_UNDEFINED;
1013	rule->rr_action = RCTL_ACTION_UNDEFINED;
1014	rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1015	refcount_init(&rule->rr_refcount, 1);
1016
1017	return (rule);
1018}
1019
1020struct rctl_rule *
1021rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1022{
1023	struct rctl_rule *copy;
1024
1025	ASSERT_RACCT_ENABLED();
1026
1027	copy = uma_zalloc(rctl_rule_zone, flags);
1028	if (copy == NULL)
1029		return (NULL);
1030	copy->rr_subject_type = rule->rr_subject_type;
1031	copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1032	copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1033	copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1034	copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1035	copy->rr_per = rule->rr_per;
1036	copy->rr_resource = rule->rr_resource;
1037	copy->rr_action = rule->rr_action;
1038	copy->rr_amount = rule->rr_amount;
1039	refcount_init(&copy->rr_refcount, 1);
1040	rctl_rule_acquire_subject(copy);
1041
1042	return (copy);
1043}
1044
1045void
1046rctl_rule_acquire(struct rctl_rule *rule)
1047{
1048
1049	ASSERT_RACCT_ENABLED();
1050	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1051
1052	refcount_acquire(&rule->rr_refcount);
1053}
1054
1055static void
1056rctl_rule_free(void *context, int pending)
1057{
1058	struct rctl_rule *rule;
1059
1060	rule = (struct rctl_rule *)context;
1061
1062	ASSERT_RACCT_ENABLED();
1063	KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1064
1065	/*
1066	 * We don't need locking here; rule is guaranteed to be inaccessible.
1067	 */
1068
1069	rctl_rule_release_subject(rule);
1070	uma_zfree(rctl_rule_zone, rule);
1071}
1072
1073void
1074rctl_rule_release(struct rctl_rule *rule)
1075{
1076
1077	ASSERT_RACCT_ENABLED();
1078	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1079
1080	if (refcount_release(&rule->rr_refcount)) {
1081		/*
1082		 * rctl_rule_release() is often called when iterating
1083		 * over all the uidinfo structures in the system,
1084		 * holding uihashtbl_lock.  Since rctl_rule_free()
1085		 * might end up calling uifree(), this would lead
1086		 * to lock recursion.  Use taskqueue to avoid this.
1087		 */
1088		TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1089		taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1090	}
1091}
1092
1093static int
1094rctl_rule_fully_specified(const struct rctl_rule *rule)
1095{
1096
1097	ASSERT_RACCT_ENABLED();
1098
1099	switch (rule->rr_subject_type) {
1100	case RCTL_SUBJECT_TYPE_UNDEFINED:
1101		return (0);
1102	case RCTL_SUBJECT_TYPE_PROCESS:
1103		if (rule->rr_subject.rs_proc == NULL)
1104			return (0);
1105		break;
1106	case RCTL_SUBJECT_TYPE_USER:
1107		if (rule->rr_subject.rs_uip == NULL)
1108			return (0);
1109		break;
1110	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1111		if (rule->rr_subject.rs_loginclass == NULL)
1112			return (0);
1113		break;
1114	case RCTL_SUBJECT_TYPE_JAIL:
1115		if (rule->rr_subject.rs_prison_racct == NULL)
1116			return (0);
1117		break;
1118	default:
1119		panic("rctl_rule_fully_specified: unknown subject type %d",
1120		    rule->rr_subject_type);
1121	}
1122	if (rule->rr_resource == RACCT_UNDEFINED)
1123		return (0);
1124	if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1125		return (0);
1126	if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1127		return (0);
1128	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1129		return (0);
1130
1131	return (1);
1132}
1133
1134static int
1135rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
1136{
1137	struct rctl_rule *rule;
1138	char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
1139	     *amountstr, *perstr;
1140	id_t id;
1141	int error = 0;
1142
1143	ASSERT_RACCT_ENABLED();
1144
1145	rule = rctl_rule_alloc(M_WAITOK);
1146
1147	subjectstr = strsep(&rulestr, ":");
1148	subject_idstr = strsep(&rulestr, ":");
1149	resourcestr = strsep(&rulestr, ":");
1150	actionstr = strsep(&rulestr, "=/");
1151	amountstr = strsep(&rulestr, "/");
1152	perstr = rulestr;
1153
1154	if (subjectstr == NULL || subjectstr[0] == '\0')
1155		rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1156	else {
1157		error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
1158		if (error != 0)
1159			goto out;
1160	}
1161
1162	if (subject_idstr == NULL || subject_idstr[0] == '\0') {
1163		rule->rr_subject.rs_proc = NULL;
1164		rule->rr_subject.rs_uip = NULL;
1165		rule->rr_subject.rs_loginclass = NULL;
1166		rule->rr_subject.rs_prison_racct = NULL;
1167	} else {
1168		switch (rule->rr_subject_type) {
1169		case RCTL_SUBJECT_TYPE_UNDEFINED:
1170			error = EINVAL;
1171			goto out;
1172		case RCTL_SUBJECT_TYPE_PROCESS:
1173			error = str2id(subject_idstr, &id);
1174			if (error != 0)
1175				goto out;
1176			sx_assert(&allproc_lock, SA_LOCKED);
1177			rule->rr_subject.rs_proc = pfind(id);
1178			if (rule->rr_subject.rs_proc == NULL) {
1179				error = ESRCH;
1180				goto out;
1181			}
1182			PROC_UNLOCK(rule->rr_subject.rs_proc);
1183			break;
1184		case RCTL_SUBJECT_TYPE_USER:
1185			error = str2id(subject_idstr, &id);
1186			if (error != 0)
1187				goto out;
1188			rule->rr_subject.rs_uip = uifind(id);
1189			break;
1190		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1191			rule->rr_subject.rs_loginclass =
1192			    loginclass_find(subject_idstr);
1193			if (rule->rr_subject.rs_loginclass == NULL) {
1194				error = ENAMETOOLONG;
1195				goto out;
1196			}
1197			break;
1198		case RCTL_SUBJECT_TYPE_JAIL:
1199			rule->rr_subject.rs_prison_racct =
1200			    prison_racct_find(subject_idstr);
1201			if (rule->rr_subject.rs_prison_racct == NULL) {
1202				error = ENAMETOOLONG;
1203				goto out;
1204			}
1205			break;
1206               default:
1207                       panic("rctl_string_to_rule: unknown subject type %d",
1208                           rule->rr_subject_type);
1209               }
1210	}
1211
1212	if (resourcestr == NULL || resourcestr[0] == '\0')
1213		rule->rr_resource = RACCT_UNDEFINED;
1214	else {
1215		error = str2value(resourcestr, &rule->rr_resource,
1216		    resourcenames);
1217		if (error != 0)
1218			goto out;
1219	}
1220
1221	if (actionstr == NULL || actionstr[0] == '\0')
1222		rule->rr_action = RCTL_ACTION_UNDEFINED;
1223	else {
1224		error = str2value(actionstr, &rule->rr_action, actionnames);
1225		if (error != 0)
1226			goto out;
1227	}
1228
1229	if (amountstr == NULL || amountstr[0] == '\0')
1230		rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1231	else {
1232		error = str2int64(amountstr, &rule->rr_amount);
1233		if (error != 0)
1234			goto out;
1235		if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
1236			if (rule->rr_amount > INT64_MAX / 1000000) {
1237				error = ERANGE;
1238				goto out;
1239			}
1240			rule->rr_amount *= 1000000;
1241		}
1242	}
1243
1244	if (perstr == NULL || perstr[0] == '\0')
1245		rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1246	else {
1247		error = str2value(perstr, &rule->rr_per, subjectnames);
1248		if (error != 0)
1249			goto out;
1250	}
1251
1252out:
1253	if (error == 0)
1254		*rulep = rule;
1255	else
1256		rctl_rule_release(rule);
1257
1258	return (error);
1259}
1260
1261/*
1262 * Link a rule with all the subjects it applies to.
1263 */
1264int
1265rctl_rule_add(struct rctl_rule *rule)
1266{
1267	struct proc *p;
1268	struct ucred *cred;
1269	struct uidinfo *uip;
1270	struct prison *pr;
1271	struct prison_racct *prr;
1272	struct loginclass *lc;
1273	struct rctl_rule *rule2;
1274	int match;
1275
1276	ASSERT_RACCT_ENABLED();
1277	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1278
1279	/*
1280	 * Some rules just don't make sense, like "deny" rule for an undeniable
1281	 * resource.  The exception are the RSS and %CPU resources - they are
1282	 * not deniable in the racct sense, but the limit is enforced in
1283	 * a different way.
1284	 */
1285	if (rule->rr_action == RCTL_ACTION_DENY &&
1286	    !RACCT_IS_DENIABLE(rule->rr_resource) &&
1287	    rule->rr_resource != RACCT_RSS &&
1288	    rule->rr_resource != RACCT_PCTCPU) {
1289		return (EOPNOTSUPP);
1290	}
1291
1292	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1293	    !RACCT_IS_DECAYING(rule->rr_resource)) {
1294		return (EOPNOTSUPP);
1295	}
1296
1297	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1298	    rule->rr_resource == RACCT_PCTCPU) {
1299		return (EOPNOTSUPP);
1300	}
1301
1302	if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1303	    RACCT_IS_SLOPPY(rule->rr_resource)) {
1304		return (EOPNOTSUPP);
1305	}
1306
1307	/*
1308	 * Make sure there are no duplicated rules.  Also, for the "deny"
1309	 * rules, remove ones differing only by "amount".
1310	 */
1311	if (rule->rr_action == RCTL_ACTION_DENY) {
1312		rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1313		rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1314		rctl_rule_remove(rule2);
1315		rctl_rule_release(rule2);
1316	} else
1317		rctl_rule_remove(rule);
1318
1319	switch (rule->rr_subject_type) {
1320	case RCTL_SUBJECT_TYPE_PROCESS:
1321		p = rule->rr_subject.rs_proc;
1322		KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1323
1324		rctl_racct_add_rule(p->p_racct, rule);
1325		/*
1326		 * In case of per-process rule, we don't have anything more
1327		 * to do.
1328		 */
1329		return (0);
1330
1331	case RCTL_SUBJECT_TYPE_USER:
1332		uip = rule->rr_subject.rs_uip;
1333		KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1334		rctl_racct_add_rule(uip->ui_racct, rule);
1335		break;
1336
1337	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1338		lc = rule->rr_subject.rs_loginclass;
1339		KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1340		rctl_racct_add_rule(lc->lc_racct, rule);
1341		break;
1342
1343	case RCTL_SUBJECT_TYPE_JAIL:
1344		prr = rule->rr_subject.rs_prison_racct;
1345		KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1346		rctl_racct_add_rule(prr->prr_racct, rule);
1347		break;
1348
1349	default:
1350		panic("rctl_rule_add: unknown subject type %d",
1351		    rule->rr_subject_type);
1352	}
1353
1354	/*
1355	 * Now go through all the processes and add the new rule to the ones
1356	 * it applies to.
1357	 */
1358	sx_assert(&allproc_lock, SA_LOCKED);
1359	FOREACH_PROC_IN_SYSTEM(p) {
1360		cred = p->p_ucred;
1361		switch (rule->rr_subject_type) {
1362		case RCTL_SUBJECT_TYPE_USER:
1363			if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1364			    cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1365				break;
1366			continue;
1367		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1368			if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1369				break;
1370			continue;
1371		case RCTL_SUBJECT_TYPE_JAIL:
1372			match = 0;
1373			for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1374				if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1375					match = 1;
1376					break;
1377				}
1378			}
1379			if (match)
1380				break;
1381			continue;
1382		default:
1383			panic("rctl_rule_add: unknown subject type %d",
1384			    rule->rr_subject_type);
1385		}
1386
1387		rctl_racct_add_rule(p->p_racct, rule);
1388	}
1389
1390	return (0);
1391}
1392
/*
 * "Pre" hook handed to the *_racct_foreach() iterators in this file:
 * takes the racct lock.
 */
static void
rctl_rule_pre_callback(void)
{
	RACCT_LOCK();
}
1399
/*
 * "Post" hook handed to the *_racct_foreach() iterators in this file:
 * drops the racct lock.
 */
static void
rctl_rule_post_callback(void)
{
	RACCT_UNLOCK();
}
1406
/*
 * Per-racct callback for rctl_rule_remove(): removes the rules on racct
 * that match the filter and adds the number removed to a running total.
 *
 * arg2 - the filter (struct rctl_rule *).
 * arg3 - pointer to the int total.
 *
 * The racct lock must be held.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	struct rctl_rule *filter = arg2;
	int *totalp = arg3;

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

	*totalp += rctl_racct_remove_rules(racct, filter);
}
1420
1421/*
1422 * Remove all rules that match the filter.
1423 */
1424int
1425rctl_rule_remove(struct rctl_rule *filter)
1426{
1427	struct proc *p;
1428	int found = 0;
1429
1430	ASSERT_RACCT_ENABLED();
1431
1432	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1433	    filter->rr_subject.rs_proc != NULL) {
1434		p = filter->rr_subject.rs_proc;
1435		RACCT_LOCK();
1436		found = rctl_racct_remove_rules(p->p_racct, filter);
1437		RACCT_UNLOCK();
1438		if (found)
1439			return (0);
1440		return (ESRCH);
1441	}
1442
1443	loginclass_racct_foreach(rctl_rule_remove_callback,
1444	    rctl_rule_pre_callback, rctl_rule_post_callback,
1445	    filter, (void *)&found);
1446	ui_racct_foreach(rctl_rule_remove_callback,
1447	    rctl_rule_pre_callback, rctl_rule_post_callback,
1448	    filter, (void *)&found);
1449	prison_racct_foreach(rctl_rule_remove_callback,
1450	    rctl_rule_pre_callback, rctl_rule_post_callback,
1451	    filter, (void *)&found);
1452
1453	sx_assert(&allproc_lock, SA_LOCKED);
1454	RACCT_LOCK();
1455	FOREACH_PROC_IN_SYSTEM(p) {
1456		found += rctl_racct_remove_rules(p->p_racct, filter);
1457	}
1458	RACCT_UNLOCK();
1459
1460	if (found)
1461		return (0);
1462	return (ESRCH);
1463}
1464
1465/*
1466 * Appends a rule to the sbuf.
1467 */
1468static void
1469rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1470{
1471	int64_t amount;
1472
1473	ASSERT_RACCT_ENABLED();
1474
1475	sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1476
1477	switch (rule->rr_subject_type) {
1478	case RCTL_SUBJECT_TYPE_PROCESS:
1479		if (rule->rr_subject.rs_proc == NULL)
1480			sbuf_printf(sb, ":");
1481		else
1482			sbuf_printf(sb, "%d:",
1483			    rule->rr_subject.rs_proc->p_pid);
1484		break;
1485	case RCTL_SUBJECT_TYPE_USER:
1486		if (rule->rr_subject.rs_uip == NULL)
1487			sbuf_printf(sb, ":");
1488		else
1489			sbuf_printf(sb, "%d:",
1490			    rule->rr_subject.rs_uip->ui_uid);
1491		break;
1492	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1493		if (rule->rr_subject.rs_loginclass == NULL)
1494			sbuf_printf(sb, ":");
1495		else
1496			sbuf_printf(sb, "%s:",
1497			    rule->rr_subject.rs_loginclass->lc_name);
1498		break;
1499	case RCTL_SUBJECT_TYPE_JAIL:
1500		if (rule->rr_subject.rs_prison_racct == NULL)
1501			sbuf_printf(sb, ":");
1502		else
1503			sbuf_printf(sb, "%s:",
1504			    rule->rr_subject.rs_prison_racct->prr_name);
1505		break;
1506	default:
1507		panic("rctl_rule_to_sbuf: unknown subject type %d",
1508		    rule->rr_subject_type);
1509	}
1510
1511	amount = rule->rr_amount;
1512	if (amount != RCTL_AMOUNT_UNDEFINED &&
1513	    RACCT_IS_IN_MILLIONS(rule->rr_resource))
1514		amount /= 1000000;
1515
1516	sbuf_printf(sb, "%s:%s=%jd",
1517	    rctl_resource_name(rule->rr_resource),
1518	    rctl_action_name(rule->rr_action),
1519	    amount);
1520
1521	if (rule->rr_per != rule->rr_subject_type)
1522		sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1523}
1524
1525/*
1526 * Routine used by RCTL syscalls to read in input string.
1527 */
1528static int
1529rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1530{
1531	char *str;
1532	int error;
1533
1534	ASSERT_RACCT_ENABLED();
1535
1536	if (inbuflen <= 0)
1537		return (EINVAL);
1538	if (inbuflen > RCTL_MAX_INBUFSIZE)
1539		return (E2BIG);
1540
1541	str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1542	error = copyinstr(inbufp, str, inbuflen, NULL);
1543	if (error != 0) {
1544		free(str, M_RCTL);
1545		return (error);
1546	}
1547
1548	*inputstr = str;
1549
1550	return (0);
1551}
1552
1553/*
1554 * Routine used by RCTL syscalls to write out output string.
1555 */
1556static int
1557rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
1558{
1559	int error;
1560
1561	ASSERT_RACCT_ENABLED();
1562
1563	if (outputsbuf == NULL)
1564		return (0);
1565
1566	sbuf_finish(outputsbuf);
1567	if (outbuflen < sbuf_len(outputsbuf) + 1) {
1568		sbuf_delete(outputsbuf);
1569		return (ERANGE);
1570	}
1571	error = copyout(sbuf_data(outputsbuf), outbufp,
1572	    sbuf_len(outputsbuf) + 1);
1573	sbuf_delete(outputsbuf);
1574	return (error);
1575}
1576
1577static struct sbuf *
1578rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1579{
1580	struct sbuf *sb;
1581	int64_t amount;
1582	int i;
1583
1584	ASSERT_RACCT_ENABLED();
1585
1586	sb = sbuf_new_auto();
1587	for (i = 0; i <= RACCT_MAX; i++) {
1588		if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1589			continue;
1590		RACCT_LOCK();
1591		amount = racct->r_resources[i];
1592		RACCT_UNLOCK();
1593		if (RACCT_IS_IN_MILLIONS(i))
1594			amount /= 1000000;
1595		sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1596	}
1597	sbuf_setpos(sb, sbuf_len(sb) - 1);
1598	return (sb);
1599}
1600
1601int
1602sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1603{
1604	struct rctl_rule *filter;
1605	struct sbuf *outputsbuf = NULL;
1606	struct proc *p;
1607	struct uidinfo *uip;
1608	struct loginclass *lc;
1609	struct prison_racct *prr;
1610	char *inputstr;
1611	int error;
1612
1613	if (!racct_enable)
1614		return (ENOSYS);
1615
1616	error = priv_check(td, PRIV_RCTL_GET_RACCT);
1617	if (error != 0)
1618		return (error);
1619
1620	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1621	if (error != 0)
1622		return (error);
1623
1624	sx_slock(&allproc_lock);
1625	error = rctl_string_to_rule(inputstr, &filter);
1626	free(inputstr, M_RCTL);
1627	if (error != 0) {
1628		sx_sunlock(&allproc_lock);
1629		return (error);
1630	}
1631
1632	switch (filter->rr_subject_type) {
1633	case RCTL_SUBJECT_TYPE_PROCESS:
1634		p = filter->rr_subject.rs_proc;
1635		if (p == NULL) {
1636			error = EINVAL;
1637			goto out;
1638		}
1639		outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1640		break;
1641	case RCTL_SUBJECT_TYPE_USER:
1642		uip = filter->rr_subject.rs_uip;
1643		if (uip == NULL) {
1644			error = EINVAL;
1645			goto out;
1646		}
1647		outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1648		break;
1649	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1650		lc = filter->rr_subject.rs_loginclass;
1651		if (lc == NULL) {
1652			error = EINVAL;
1653			goto out;
1654		}
1655		outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1656		break;
1657	case RCTL_SUBJECT_TYPE_JAIL:
1658		prr = filter->rr_subject.rs_prison_racct;
1659		if (prr == NULL) {
1660			error = EINVAL;
1661			goto out;
1662		}
1663		outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1664		break;
1665	default:
1666		error = EINVAL;
1667	}
1668out:
1669	rctl_rule_release(filter);
1670	sx_sunlock(&allproc_lock);
1671	if (error != 0)
1672		return (error);
1673
1674	error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1675
1676	return (error);
1677}
1678
1679static void
1680rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1681{
1682	struct rctl_rule *filter = (struct rctl_rule *)arg2;
1683	struct rctl_rule_link *link;
1684	struct sbuf *sb = (struct sbuf *)arg3;
1685
1686	ASSERT_RACCT_ENABLED();
1687	RACCT_LOCK_ASSERT();
1688
1689	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1690		if (!rctl_rule_matches(link->rrl_rule, filter))
1691			continue;
1692		rctl_rule_to_sbuf(sb, link->rrl_rule);
1693		sbuf_printf(sb, ",");
1694	}
1695}
1696
1697int
1698sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1699{
1700	struct sbuf *sb;
1701	struct rctl_rule *filter;
1702	struct rctl_rule_link *link;
1703	struct proc *p;
1704	char *inputstr, *buf;
1705	size_t bufsize;
1706	int error;
1707
1708	if (!racct_enable)
1709		return (ENOSYS);
1710
1711	error = priv_check(td, PRIV_RCTL_GET_RULES);
1712	if (error != 0)
1713		return (error);
1714
1715	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1716	if (error != 0)
1717		return (error);
1718
1719	sx_slock(&allproc_lock);
1720	error = rctl_string_to_rule(inputstr, &filter);
1721	free(inputstr, M_RCTL);
1722	if (error != 0) {
1723		sx_sunlock(&allproc_lock);
1724		return (error);
1725	}
1726
1727	bufsize = uap->outbuflen;
1728	if (bufsize > rctl_maxbufsize) {
1729		sx_sunlock(&allproc_lock);
1730		return (E2BIG);
1731	}
1732
1733	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1734	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1735	KASSERT(sb != NULL, ("sbuf_new failed"));
1736
1737	FOREACH_PROC_IN_SYSTEM(p) {
1738		RACCT_LOCK();
1739		LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1740			/*
1741			 * Non-process rules will be added to the buffer later.
1742			 * Adding them here would result in duplicated output.
1743			 */
1744			if (link->rrl_rule->rr_subject_type !=
1745			    RCTL_SUBJECT_TYPE_PROCESS)
1746				continue;
1747			if (!rctl_rule_matches(link->rrl_rule, filter))
1748				continue;
1749			rctl_rule_to_sbuf(sb, link->rrl_rule);
1750			sbuf_printf(sb, ",");
1751		}
1752		RACCT_UNLOCK();
1753	}
1754
1755	loginclass_racct_foreach(rctl_get_rules_callback,
1756	    rctl_rule_pre_callback, rctl_rule_post_callback,
1757	    filter, sb);
1758	ui_racct_foreach(rctl_get_rules_callback,
1759	    rctl_rule_pre_callback, rctl_rule_post_callback,
1760	    filter, sb);
1761	prison_racct_foreach(rctl_get_rules_callback,
1762	    rctl_rule_pre_callback, rctl_rule_post_callback,
1763	    filter, sb);
1764	if (sbuf_error(sb) == ENOMEM) {
1765		error = ERANGE;
1766		goto out;
1767	}
1768
1769	/*
1770	 * Remove trailing ",".
1771	 */
1772	if (sbuf_len(sb) > 0)
1773		sbuf_setpos(sb, sbuf_len(sb) - 1);
1774
1775	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1776out:
1777	rctl_rule_release(filter);
1778	sx_sunlock(&allproc_lock);
1779	free(buf, M_RCTL);
1780	return (error);
1781}
1782
1783int
1784sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1785{
1786	struct sbuf *sb;
1787	struct rctl_rule *filter;
1788	struct rctl_rule_link *link;
1789	char *inputstr, *buf;
1790	size_t bufsize;
1791	int error;
1792
1793	if (!racct_enable)
1794		return (ENOSYS);
1795
1796	error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1797	if (error != 0)
1798		return (error);
1799
1800	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1801	if (error != 0)
1802		return (error);
1803
1804	sx_slock(&allproc_lock);
1805	error = rctl_string_to_rule(inputstr, &filter);
1806	free(inputstr, M_RCTL);
1807	if (error != 0) {
1808		sx_sunlock(&allproc_lock);
1809		return (error);
1810	}
1811
1812	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1813		rctl_rule_release(filter);
1814		sx_sunlock(&allproc_lock);
1815		return (EINVAL);
1816	}
1817	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1818		rctl_rule_release(filter);
1819		sx_sunlock(&allproc_lock);
1820		return (EOPNOTSUPP);
1821	}
1822	if (filter->rr_subject.rs_proc == NULL) {
1823		rctl_rule_release(filter);
1824		sx_sunlock(&allproc_lock);
1825		return (EINVAL);
1826	}
1827
1828	bufsize = uap->outbuflen;
1829	if (bufsize > rctl_maxbufsize) {
1830		rctl_rule_release(filter);
1831		sx_sunlock(&allproc_lock);
1832		return (E2BIG);
1833	}
1834
1835	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1836	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1837	KASSERT(sb != NULL, ("sbuf_new failed"));
1838
1839	RACCT_LOCK();
1840	LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1841	    rrl_next) {
1842		rctl_rule_to_sbuf(sb, link->rrl_rule);
1843		sbuf_printf(sb, ",");
1844	}
1845	RACCT_UNLOCK();
1846	if (sbuf_error(sb) == ENOMEM) {
1847		error = ERANGE;
1848		sbuf_delete(sb);
1849		goto out;
1850	}
1851
1852	/*
1853	 * Remove trailing ",".
1854	 */
1855	if (sbuf_len(sb) > 0)
1856		sbuf_setpos(sb, sbuf_len(sb) - 1);
1857
1858	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1859out:
1860	rctl_rule_release(filter);
1861	sx_sunlock(&allproc_lock);
1862	free(buf, M_RCTL);
1863	return (error);
1864}
1865
1866int
1867sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1868{
1869	struct rctl_rule *rule;
1870	char *inputstr;
1871	int error;
1872
1873	if (!racct_enable)
1874		return (ENOSYS);
1875
1876	error = priv_check(td, PRIV_RCTL_ADD_RULE);
1877	if (error != 0)
1878		return (error);
1879
1880	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1881	if (error != 0)
1882		return (error);
1883
1884	sx_slock(&allproc_lock);
1885	error = rctl_string_to_rule(inputstr, &rule);
1886	free(inputstr, M_RCTL);
1887	if (error != 0) {
1888		sx_sunlock(&allproc_lock);
1889		return (error);
1890	}
1891	/*
1892	 * The 'per' part of a rule is optional.
1893	 */
1894	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1895	    rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1896		rule->rr_per = rule->rr_subject_type;
1897
1898	if (!rctl_rule_fully_specified(rule)) {
1899		error = EINVAL;
1900		goto out;
1901	}
1902
1903	error = rctl_rule_add(rule);
1904
1905out:
1906	rctl_rule_release(rule);
1907	sx_sunlock(&allproc_lock);
1908	return (error);
1909}
1910
1911int
1912sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1913{
1914	struct rctl_rule *filter;
1915	char *inputstr;
1916	int error;
1917
1918	if (!racct_enable)
1919		return (ENOSYS);
1920
1921	error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1922	if (error != 0)
1923		return (error);
1924
1925	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1926	if (error != 0)
1927		return (error);
1928
1929	sx_slock(&allproc_lock);
1930	error = rctl_string_to_rule(inputstr, &filter);
1931	free(inputstr, M_RCTL);
1932	if (error != 0) {
1933		sx_sunlock(&allproc_lock);
1934		return (error);
1935	}
1936
1937	error = rctl_rule_remove(filter);
1938	rctl_rule_release(filter);
1939	sx_sunlock(&allproc_lock);
1940
1941	return (error);
1942}
1943
1944/*
1945 * Update RCTL rule list after credential change.
1946 */
1947void
1948rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
1949{
1950	LIST_HEAD(, rctl_rule_link) newrules;
1951	struct rctl_rule_link *link, *newlink;
1952	struct uidinfo *newuip;
1953	struct loginclass *newlc;
1954	struct prison_racct *newprr;
1955	int rulecnt, i;
1956
1957	ASSERT_RACCT_ENABLED();
1958
1959	newuip = newcred->cr_ruidinfo;
1960	newlc = newcred->cr_loginclass;
1961	newprr = newcred->cr_prison->pr_prison_racct;
1962
1963	LIST_INIT(&newrules);
1964
1965again:
1966	/*
1967	 * First, count the rules that apply to the process with new
1968	 * credentials.
1969	 */
1970	rulecnt = 0;
1971	RACCT_LOCK();
1972	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1973		if (link->rrl_rule->rr_subject_type ==
1974		    RCTL_SUBJECT_TYPE_PROCESS)
1975			rulecnt++;
1976	}
1977	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
1978		rulecnt++;
1979	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
1980		rulecnt++;
1981	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
1982		rulecnt++;
1983	RACCT_UNLOCK();
1984
1985	/*
1986	 * Create temporary list.  We've dropped the rctl_lock in order
1987	 * to use M_WAITOK.
1988	 */
1989	for (i = 0; i < rulecnt; i++) {
1990		newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
1991		newlink->rrl_rule = NULL;
1992		newlink->rrl_exceeded = 0;
1993		LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
1994	}
1995
1996	newlink = LIST_FIRST(&newrules);
1997
1998	/*
1999	 * Assign rules to the newly allocated list entries.
2000	 */
2001	RACCT_LOCK();
2002	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
2003		if (link->rrl_rule->rr_subject_type ==
2004		    RCTL_SUBJECT_TYPE_PROCESS) {
2005			if (newlink == NULL)
2006				goto goaround;
2007			rctl_rule_acquire(link->rrl_rule);
2008			newlink->rrl_rule = link->rrl_rule;
2009			newlink->rrl_exceeded = link->rrl_exceeded;
2010			newlink = LIST_NEXT(newlink, rrl_next);
2011			rulecnt--;
2012		}
2013	}
2014
2015	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
2016		if (newlink == NULL)
2017			goto goaround;
2018		rctl_rule_acquire(link->rrl_rule);
2019		newlink->rrl_rule = link->rrl_rule;
2020		newlink->rrl_exceeded = link->rrl_exceeded;
2021		newlink = LIST_NEXT(newlink, rrl_next);
2022		rulecnt--;
2023	}
2024
2025	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
2026		if (newlink == NULL)
2027			goto goaround;
2028		rctl_rule_acquire(link->rrl_rule);
2029		newlink->rrl_rule = link->rrl_rule;
2030		newlink->rrl_exceeded = link->rrl_exceeded;
2031		newlink = LIST_NEXT(newlink, rrl_next);
2032		rulecnt--;
2033	}
2034
2035	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
2036		if (newlink == NULL)
2037			goto goaround;
2038		rctl_rule_acquire(link->rrl_rule);
2039		newlink->rrl_rule = link->rrl_rule;
2040		newlink->rrl_exceeded = link->rrl_exceeded;
2041		newlink = LIST_NEXT(newlink, rrl_next);
2042		rulecnt--;
2043	}
2044
2045	if (rulecnt == 0) {
2046		/*
2047		 * Free the old rule list.
2048		 */
2049		while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
2050			link = LIST_FIRST(&p->p_racct->r_rule_links);
2051			LIST_REMOVE(link, rrl_next);
2052			rctl_rule_release(link->rrl_rule);
2053			uma_zfree(rctl_rule_link_zone, link);
2054		}
2055
2056		/*
2057		 * Replace lists and we're done.
2058		 *
2059		 * XXX: Is there any way to switch list heads instead
2060		 *      of iterating here?
2061		 */
2062		while (!LIST_EMPTY(&newrules)) {
2063			newlink = LIST_FIRST(&newrules);
2064			LIST_REMOVE(newlink, rrl_next);
2065			LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
2066			    newlink, rrl_next);
2067		}
2068
2069		RACCT_UNLOCK();
2070
2071		return;
2072	}
2073
2074goaround:
2075	RACCT_UNLOCK();
2076
2077	/*
2078	 * Rule list changed while we were not holding the rctl_lock.
2079	 * Free the new list and try again.
2080	 */
2081	while (!LIST_EMPTY(&newrules)) {
2082		newlink = LIST_FIRST(&newrules);
2083		LIST_REMOVE(newlink, rrl_next);
2084		if (newlink->rrl_rule != NULL)
2085			rctl_rule_release(newlink->rrl_rule);
2086		uma_zfree(rctl_rule_link_zone, newlink);
2087	}
2088
2089	goto again;
2090}
2091
2092/*
2093 * Assign RCTL rules to the newly created process.
2094 */
2095int
2096rctl_proc_fork(struct proc *parent, struct proc *child)
2097{
2098	struct rctl_rule *rule;
2099	struct rctl_rule_link *link;
2100	int error;
2101
2102	ASSERT_RACCT_ENABLED();
2103	RACCT_LOCK_ASSERT();
2104	KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2105
2106	LIST_INIT(&child->p_racct->r_rule_links);
2107
2108	/*
2109	 * Go through limits applicable to the parent and assign them
2110	 * to the child.  Rules with 'process' subject have to be duplicated
2111	 * in order to make their rr_subject point to the new process.
2112	 */
2113	LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2114		if (link->rrl_rule->rr_subject_type ==
2115		    RCTL_SUBJECT_TYPE_PROCESS) {
2116			rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2117			if (rule == NULL)
2118				goto fail;
2119			KASSERT(rule->rr_subject.rs_proc == parent,
2120			    ("rule->rr_subject.rs_proc != parent"));
2121			rule->rr_subject.rs_proc = child;
2122			error = rctl_racct_add_rule_locked(child->p_racct,
2123			    rule);
2124			rctl_rule_release(rule);
2125			if (error != 0)
2126				goto fail;
2127		} else {
2128			error = rctl_racct_add_rule_locked(child->p_racct,
2129			    link->rrl_rule);
2130			if (error != 0)
2131				goto fail;
2132		}
2133	}
2134
2135	return (0);
2136
2137fail:
2138	while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2139		link = LIST_FIRST(&child->p_racct->r_rule_links);
2140		LIST_REMOVE(link, rrl_next);
2141		rctl_rule_release(link->rrl_rule);
2142		uma_zfree(rctl_rule_link_zone, link);
2143	}
2144
2145	return (EAGAIN);
2146}
2147
2148/*
2149 * Release rules attached to the racct.
2150 */
2151void
2152rctl_racct_release(struct racct *racct)
2153{
2154	struct rctl_rule_link *link;
2155
2156	ASSERT_RACCT_ENABLED();
2157	RACCT_LOCK_ASSERT();
2158
2159	while (!LIST_EMPTY(&racct->r_rule_links)) {
2160		link = LIST_FIRST(&racct->r_rule_links);
2161		LIST_REMOVE(link, rrl_next);
2162		rctl_rule_release(link->rrl_rule);
2163		uma_zfree(rctl_rule_link_zone, link);
2164	}
2165}
2166
2167static void
2168rctl_init(void)
2169{
2170
2171	if (!racct_enable)
2172		return;
2173
2174	rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2175	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2176	rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2177	    sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2178	    UMA_ALIGN_PTR, 0);
2179
2180	/*
2181	 * Set default values, making sure not to overwrite the ones
2182	 * fetched from tunables.  Most of those could be set at the
2183	 * declaration, except for the rctl_throttle_max - we cannot
2184	 * set it there due to hz not being compile time constant.
2185	 */
2186	if (rctl_throttle_min < 1)
2187		rctl_throttle_min = 1;
2188	if (rctl_throttle_max < rctl_throttle_min)
2189		rctl_throttle_max = 2 * hz;
2190	if (rctl_throttle_pct < 0)
2191		rctl_throttle_pct = 100;
2192	if (rctl_throttle_pct2 < 0)
2193		rctl_throttle_pct2 = 100;
2194}
2195
2196#else /* !RCTL */
2197
/*
 * Stub compiled in the !RCTL branch: the call always fails with ENOSYS.
 */
int
sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{
	(void)td;
	(void)uap;
	return (ENOSYS);
}
2204
/*
 * Stub compiled in the !RCTL branch: the call always fails with ENOSYS.
 */
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{
	(void)td;
	(void)uap;
	return (ENOSYS);
}
2211
/*
 * Stub compiled in the !RCTL branch: the call always fails with ENOSYS.
 */
int
sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{
	(void)td;
	(void)uap;
	return (ENOSYS);
}
2218
/*
 * Stub compiled in the !RCTL branch: the call always fails with ENOSYS.
 */
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{
	(void)td;
	(void)uap;
	return (ENOSYS);
}
2225
/*
 * Stub compiled in the !RCTL branch: the call always fails with ENOSYS.
 */
int
sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{
	(void)td;
	(void)uap;
	return (ENOSYS);
}
2232
2233#endif /* !RCTL */
2234