kern_rctl.c revision 297633
1/*-
2 * Copyright (c) 2010 The FreeBSD Foundation
3 * All rights reserved.
4 *
5 * This software was developed by Edward Tomasz Napierala under sponsorship
6 * from the FreeBSD Foundation.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * $FreeBSD: head/sys/kern/kern_rctl.c 297633 2016-04-07 04:23:25Z trasz $
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/kern/kern_rctl.c 297633 2016-04-07 04:23:25Z trasz $");
34
35#include <sys/param.h>
36#include <sys/bus.h>
37#include <sys/malloc.h>
38#include <sys/queue.h>
39#include <sys/refcount.h>
40#include <sys/jail.h>
41#include <sys/kernel.h>
42#include <sys/limits.h>
43#include <sys/loginclass.h>
44#include <sys/priv.h>
45#include <sys/proc.h>
46#include <sys/racct.h>
47#include <sys/rctl.h>
48#include <sys/resourcevar.h>
49#include <sys/sx.h>
50#include <sys/sysent.h>
51#include <sys/sysproto.h>
52#include <sys/systm.h>
53#include <sys/types.h>
54#include <sys/eventhandler.h>
55#include <sys/lock.h>
56#include <sys/mutex.h>
57#include <sys/rwlock.h>
58#include <sys/sbuf.h>
59#include <sys/taskqueue.h>
60#include <sys/tree.h>
61#include <vm/uma.h>
62
63#ifdef RCTL
64#ifndef RACCT
65#error "The RCTL option requires the RACCT option"
66#endif
67
68FEATURE(rctl, "Resource Limits");
69
#define	HRF_DEFAULT		0
#define	HRF_DONT_INHERIT	1
#define	HRF_DONT_ACCUMULATE	2

/*
 * Buffer size limits.  The products are parenthesized so that the macros
 * expand correctly inside larger expressions, e.g. "x / RCTL_MAX_INBUFSIZE"
 * or "RCTL_MAX_OUTBUFSIZE % y".
 */
#define	RCTL_MAX_INBUFSIZE	(4 * 1024)
#define	RCTL_MAX_OUTBUFSIZE	(16 * 1024 * 1024)
#define	RCTL_LOG_BUFSIZE	128

/* Safety margin subtracted by rctl_pcpu_available() for large %CPU limits. */
#define	RCTL_PCPU_SHIFT		(10 * 1000000)
79
80static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
81static int rctl_log_rate_limit = 10;
82static int rctl_devctl_rate_limit = 10;
83static unsigned int rctl_throttle_min = 0;
84static unsigned int rctl_throttle_max = 0;
85static unsigned int rctl_throttle_pct = 0;
86static unsigned int rctl_throttle_pct2 = 0;
87
88SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
89SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
90    &rctl_maxbufsize, 0, "Maximum output buffer size");
91SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
92    &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
93SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RW,
94    &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
95SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_min, CTLFLAG_RDTUN,
96    &rctl_throttle_min, 0, "Shortest throttling duration, in hz");
97SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_max, CTLFLAG_RDTUN,
98    &rctl_throttle_max, 0, "Longest throttling duration, in hz");
99SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_pct, CTLFLAG_RDTUN,
100    &rctl_throttle_pct, 0,
101    "Throttling penalty for process consumption, in percent");
102SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_pct2, CTLFLAG_RDTUN,
103    &rctl_throttle_pct2, 0,
104    "Throttling penalty for container consumption, in percent");
105
/*
 * A 'rctl_rule_link' ties one rule to one racct the rule applies to; the
 * links of a racct are chained on its r_rule_links list.  For example,
 * the rule 'user:X:openfiles:deny=N/process' is linked with the uidinfo
 * for user X, and with each process of that user.
 */
struct rctl_rule_link {
	/* Linkage on the owning racct's r_rule_links list. */
	LIST_ENTRY(rctl_rule_link)	rrl_next;
	/* The rule this link stands for. */
	struct rctl_rule		*rrl_rule;
	/*
	 * Set by rctl_enforce() once a log, devctl or signal action was
	 * carried out; cleared again when the request fits the limit.
	 */
	int				rrl_exceeded;
};
116
/*
 * One entry of a name <-> value translation table.  Tables are arrays of
 * these, terminated by an entry whose d_name is NULL.
 */
struct dict {
	const char	*d_name;	/* textual name */
	int		d_value;	/* matching numeric constant */
};
121
122static struct dict subjectnames[] = {
123	{ "process", RCTL_SUBJECT_TYPE_PROCESS },
124	{ "user", RCTL_SUBJECT_TYPE_USER },
125	{ "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
126	{ "jail", RCTL_SUBJECT_TYPE_JAIL },
127	{ NULL, -1 }};
128
129static struct dict resourcenames[] = {
130	{ "cputime", RACCT_CPU },
131	{ "datasize", RACCT_DATA },
132	{ "stacksize", RACCT_STACK },
133	{ "coredumpsize", RACCT_CORE },
134	{ "memoryuse", RACCT_RSS },
135	{ "memorylocked", RACCT_MEMLOCK },
136	{ "maxproc", RACCT_NPROC },
137	{ "openfiles", RACCT_NOFILE },
138	{ "vmemoryuse", RACCT_VMEM },
139	{ "pseudoterminals", RACCT_NPTS },
140	{ "swapuse", RACCT_SWAP },
141	{ "nthr", RACCT_NTHR },
142	{ "msgqqueued", RACCT_MSGQQUEUED },
143	{ "msgqsize", RACCT_MSGQSIZE },
144	{ "nmsgq", RACCT_NMSGQ },
145	{ "nsem", RACCT_NSEM },
146	{ "nsemop", RACCT_NSEMOP },
147	{ "nshm", RACCT_NSHM },
148	{ "shmsize", RACCT_SHMSIZE },
149	{ "wallclock", RACCT_WALLCLOCK },
150	{ "pcpu", RACCT_PCTCPU },
151	{ "readbps", RACCT_READBPS },
152	{ "writebps", RACCT_WRITEBPS },
153	{ "readiops", RACCT_READIOPS },
154	{ "writeiops", RACCT_WRITEIOPS },
155	{ NULL, -1 }};
156
157static struct dict actionnames[] = {
158	{ "sighup", RCTL_ACTION_SIGHUP },
159	{ "sigint", RCTL_ACTION_SIGINT },
160	{ "sigquit", RCTL_ACTION_SIGQUIT },
161	{ "sigill", RCTL_ACTION_SIGILL },
162	{ "sigtrap", RCTL_ACTION_SIGTRAP },
163	{ "sigabrt", RCTL_ACTION_SIGABRT },
164	{ "sigemt", RCTL_ACTION_SIGEMT },
165	{ "sigfpe", RCTL_ACTION_SIGFPE },
166	{ "sigkill", RCTL_ACTION_SIGKILL },
167	{ "sigbus", RCTL_ACTION_SIGBUS },
168	{ "sigsegv", RCTL_ACTION_SIGSEGV },
169	{ "sigsys", RCTL_ACTION_SIGSYS },
170	{ "sigpipe", RCTL_ACTION_SIGPIPE },
171	{ "sigalrm", RCTL_ACTION_SIGALRM },
172	{ "sigterm", RCTL_ACTION_SIGTERM },
173	{ "sigurg", RCTL_ACTION_SIGURG },
174	{ "sigstop", RCTL_ACTION_SIGSTOP },
175	{ "sigtstp", RCTL_ACTION_SIGTSTP },
176	{ "sigchld", RCTL_ACTION_SIGCHLD },
177	{ "sigttin", RCTL_ACTION_SIGTTIN },
178	{ "sigttou", RCTL_ACTION_SIGTTOU },
179	{ "sigio", RCTL_ACTION_SIGIO },
180	{ "sigxcpu", RCTL_ACTION_SIGXCPU },
181	{ "sigxfsz", RCTL_ACTION_SIGXFSZ },
182	{ "sigvtalrm", RCTL_ACTION_SIGVTALRM },
183	{ "sigprof", RCTL_ACTION_SIGPROF },
184	{ "sigwinch", RCTL_ACTION_SIGWINCH },
185	{ "siginfo", RCTL_ACTION_SIGINFO },
186	{ "sigusr1", RCTL_ACTION_SIGUSR1 },
187	{ "sigusr2", RCTL_ACTION_SIGUSR2 },
188	{ "sigthr", RCTL_ACTION_SIGTHR },
189	{ "deny", RCTL_ACTION_DENY },
190	{ "log", RCTL_ACTION_LOG },
191	{ "devctl", RCTL_ACTION_DEVCTL },
192	{ "throttle", RCTL_ACTION_THROTTLE },
193	{ NULL, -1 }};
194
195static void rctl_init(void);
196SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);
197
198static uma_zone_t rctl_rule_link_zone;
199static uma_zone_t rctl_rule_zone;
200static struct rwlock rctl_lock;
201RW_SYSINIT(rctl_lock, &rctl_lock, "RCTL lock");
202
203#define RCTL_RLOCK()		rw_rlock(&rctl_lock)
204#define RCTL_RUNLOCK()		rw_runlock(&rctl_lock)
205#define RCTL_WLOCK()		rw_wlock(&rctl_lock)
206#define RCTL_WUNLOCK()		rw_wunlock(&rctl_lock)
207#define RCTL_LOCK_ASSERT()	rw_assert(&rctl_lock, RA_LOCKED)
208#define RCTL_WLOCK_ASSERT()	rw_assert(&rctl_lock, RA_WLOCKED)
209
210static int rctl_rule_fully_specified(const struct rctl_rule *rule);
211static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);
212
213static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
214
215static const char *
216rctl_subject_type_name(int subject)
217{
218	int i;
219
220	for (i = 0; subjectnames[i].d_name != NULL; i++) {
221		if (subjectnames[i].d_value == subject)
222			return (subjectnames[i].d_name);
223	}
224
225	panic("rctl_subject_type_name: unknown subject type %d", subject);
226}
227
228static const char *
229rctl_action_name(int action)
230{
231	int i;
232
233	for (i = 0; actionnames[i].d_name != NULL; i++) {
234		if (actionnames[i].d_value == action)
235			return (actionnames[i].d_name);
236	}
237
238	panic("rctl_action_name: unknown action %d", action);
239}
240
241const char *
242rctl_resource_name(int resource)
243{
244	int i;
245
246	for (i = 0; resourcenames[i].d_name != NULL; i++) {
247		if (resourcenames[i].d_value == resource)
248			return (resourcenames[i].d_name);
249	}
250
251	panic("rctl_resource_name: unknown resource %d", resource);
252}
253
254static struct racct *
255rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
256{
257	struct ucred *cred = p->p_ucred;
258
259	ASSERT_RACCT_ENABLED();
260	RCTL_LOCK_ASSERT();
261
262	switch (rule->rr_per) {
263	case RCTL_SUBJECT_TYPE_PROCESS:
264		return (p->p_racct);
265	case RCTL_SUBJECT_TYPE_USER:
266		return (cred->cr_ruidinfo->ui_racct);
267	case RCTL_SUBJECT_TYPE_LOGINCLASS:
268		return (cred->cr_loginclass->lc_racct);
269	case RCTL_SUBJECT_TYPE_JAIL:
270		return (cred->cr_prison->pr_prison_racct->prr_racct);
271	default:
272		panic("%s: unknown per %d", __func__, rule->rr_per);
273	}
274}
275
276/*
277 * Return the amount of resource that can be allocated by 'p' before
278 * hitting 'rule'.
279 */
280static int64_t
281rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
282{
283	int64_t available;
284	const struct racct *racct;
285
286	ASSERT_RACCT_ENABLED();
287	RCTL_LOCK_ASSERT();
288
289	racct = rctl_proc_rule_to_racct(p, rule);
290	available = rule->rr_amount - racct->r_resources[rule->rr_resource];
291
292	return (available);
293}
294
295/*
296 * Called every second for proc, uidinfo, loginclass, and jail containers.
297 * If the limit isn't exceeded, it decreases the usage amount to zero.
298 * Otherwise, it decreases it by the value of the limit.  This way
299 * resource consumption exceeding the limit "carries over" to the next
300 * period.
301 */
302void
303rctl_throttle_decay(struct racct *racct, int resource)
304{
305	struct rctl_rule *rule;
306	struct rctl_rule_link *link;
307	int64_t minavailable;
308
309	ASSERT_RACCT_ENABLED();
310
311	minavailable = INT64_MAX;
312
313	RCTL_RLOCK();
314
315	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
316		rule = link->rrl_rule;
317
318		if (rule->rr_resource != resource)
319			continue;
320		if (rule->rr_action != RCTL_ACTION_THROTTLE)
321			continue;
322
323		if (rule->rr_amount < minavailable)
324			minavailable = rule->rr_amount;
325	}
326
327	RCTL_RUNLOCK();
328
329	if (racct->r_resources[resource] < minavailable) {
330		racct->r_resources[resource] = 0;
331	} else {
332		/*
333		 * Cap utilization counter at ten times the limit.  Otherwise,
334		 * if we changed the rule lowering the allowed amount, it could
335		 * take unreasonably long time for the accumulated resource
336		 * usage to drop.
337		 */
338		if (racct->r_resources[resource] > minavailable * 10)
339			racct->r_resources[resource] = minavailable * 10;
340
341		racct->r_resources[resource] -= minavailable;
342	}
343}
344
345/*
346 * Special version of rctl_get_available() for the %CPU resource.
347 * We slightly cheat here and return less than we normally would.
348 */
349int64_t
350rctl_pcpu_available(const struct proc *p) {
351	struct rctl_rule *rule;
352	struct rctl_rule_link *link;
353	int64_t available, minavailable, limit;
354
355	ASSERT_RACCT_ENABLED();
356
357	minavailable = INT64_MAX;
358	limit = 0;
359
360	RCTL_RLOCK();
361
362	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
363		rule = link->rrl_rule;
364		if (rule->rr_resource != RACCT_PCTCPU)
365			continue;
366		if (rule->rr_action != RCTL_ACTION_DENY)
367			continue;
368		available = rctl_available_resource(p, rule);
369		if (available < minavailable) {
370			minavailable = available;
371			limit = rule->rr_amount;
372		}
373	}
374
375	RCTL_RUNLOCK();
376
377	/*
378	 * Return slightly less than actual value of the available
379	 * %cpu resource.  This makes %cpu throttling more agressive
380	 * and lets us act sooner than the limits are already exceeded.
381	 */
382	if (limit != 0) {
383		if (limit > 2 * RCTL_PCPU_SHIFT)
384			minavailable -= RCTL_PCPU_SHIFT;
385		else
386			minavailable -= (limit / 2);
387	}
388
389	return (minavailable);
390}
391
/*
 * Saturating unsigned addition: returns a + b, or UINT64_MAX when the sum
 * does not fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{
	uint64_t sum = a + b;

	/* A wrapped sum is smaller than the operands. */
	return ((sum < a || sum < b) ? UINT64_MAX : sum);
}
407
/*
 * Saturating unsigned multiplication: returns a * b, or UINT64_MAX when
 * the product does not fit in 64 bits.
 *
 * The overflow test is done by division before multiplying.  Comparing
 * the wrapped product against the operands is not sufficient: for example
 * (2^32 + 1) * (2^32 + 1) wraps to 2^33 + 1, which is larger than both
 * operands.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

	if (a == 0 || b == 0)
		return (0);

	if (a > UINT64_MAX / b)
		return (UINT64_MAX);

	return (a * b);
}
423
424/*
425 * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
426 * to what it keeps allocated now.  Returns non-zero if the allocation should
427 * be denied, 0 otherwise.
428 */
429int
430rctl_enforce(struct proc *p, int resource, uint64_t amount)
431{
432	static struct timeval log_lasttime, devctl_lasttime;
433	static int log_curtime = 0, devctl_curtime = 0;
434	struct rctl_rule *rule;
435	struct rctl_rule_link *link;
436	struct sbuf sb;
437	int64_t available;
438	uint64_t sleep_ms, sleep_ratio;
439	int should_deny = 0;
440	char *buf;
441
442
443	ASSERT_RACCT_ENABLED();
444
445	RCTL_RLOCK();
446
447	/*
448	 * There may be more than one matching rule; go through all of them.
449	 * Denial should be done last, after logging and sending signals.
450	 */
451	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
452		rule = link->rrl_rule;
453		if (rule->rr_resource != resource)
454			continue;
455
456		available = rctl_available_resource(p, rule);
457		if (available >= (int64_t)amount) {
458			link->rrl_exceeded = 0;
459			continue;
460		}
461
462		switch (rule->rr_action) {
463		case RCTL_ACTION_DENY:
464			should_deny = 1;
465			continue;
466		case RCTL_ACTION_LOG:
467			/*
468			 * If rrl_exceeded != 0, it means we've already
469			 * logged a warning for this process.
470			 */
471			if (link->rrl_exceeded != 0)
472				continue;
473
474			/*
475			 * If the process state is not fully initialized yet,
476			 * we can't access most of the required fields, e.g.
477			 * p->p_comm.  This happens when called from fork1().
478			 * Ignore this rule for now; it will be processed just
479			 * after fork, when called from racct_proc_fork_done().
480			 */
481			if (p->p_state != PRS_NORMAL)
482				continue;
483
484			if (!ppsratecheck(&log_lasttime, &log_curtime,
485			    rctl_log_rate_limit))
486				continue;
487
488			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
489			if (buf == NULL) {
490				printf("rctl_enforce: out of memory\n");
491				continue;
492			}
493			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
494			rctl_rule_to_sbuf(&sb, rule);
495			sbuf_finish(&sb);
496			printf("rctl: rule \"%s\" matched by pid %d "
497			    "(%s), uid %d, jail %s\n", sbuf_data(&sb),
498			    p->p_pid, p->p_comm, p->p_ucred->cr_uid,
499			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
500			sbuf_delete(&sb);
501			free(buf, M_RCTL);
502			link->rrl_exceeded = 1;
503			continue;
504		case RCTL_ACTION_DEVCTL:
505			if (link->rrl_exceeded != 0)
506				continue;
507
508			if (p->p_state != PRS_NORMAL)
509				continue;
510
511			if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
512			    rctl_devctl_rate_limit))
513				continue;
514
515			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
516			if (buf == NULL) {
517				printf("rctl_enforce: out of memory\n");
518				continue;
519			}
520			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
521			sbuf_printf(&sb, "rule=");
522			rctl_rule_to_sbuf(&sb, rule);
523			sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
524			    p->p_pid, p->p_ucred->cr_ruid,
525			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
526			sbuf_finish(&sb);
527			devctl_notify_f("RCTL", "rule", "matched",
528			    sbuf_data(&sb), M_NOWAIT);
529			sbuf_delete(&sb);
530			free(buf, M_RCTL);
531			link->rrl_exceeded = 1;
532			continue;
533		case RCTL_ACTION_THROTTLE:
534			if (p->p_state != PRS_NORMAL)
535				continue;
536
537			/*
538			 * Make the process sleep for a fraction of second
539			 * proportional to the ratio of process' resource
540			 * utilization compared to the limit.  The point is
541			 * to penalize resource hogs: processes that consume
542			 * more of the available resources sleep for longer.
543			 *
544			 * We're trying to defer division until the very end,
545			 * to minimize the rounding effects.  The following
546			 * calculation could have been written in a clearer
547			 * way like this:
548			 *
549			 * sleep_ms = hz * p->p_racct->r_resources[resource] /
550			 *     rule->rr_amount;
551			 * sleep_ms *= rctl_throttle_pct / 100;
552			 * if (sleep_ms < rctl_throttle_min)
553			 *         sleep_ms = rctl_throttle_min;
554			 *
555			 */
556			sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
557			sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
558			if (sleep_ms < rctl_throttle_min * rule->rr_amount)
559				sleep_ms = rctl_throttle_min * rule->rr_amount;
560
561			/*
562			 * Multiply that by the ratio of the resource
563			 * consumption for the container compared to the limit,
564			 * squared.  In other words, a process in a container
565			 * that is two times over the limit will be throttled
566			 * four times as much for hitting the same rule.  The
567			 * point is to penalize processes more if the container
568			 * itself (eg certain UID or jail) is above the limit.
569			 */
570			if (available < 0)
571				sleep_ratio = -available / rule->rr_amount;
572			else
573				sleep_ratio = 0;
574			sleep_ratio = xmul(sleep_ratio, sleep_ratio);
575			sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
576			sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
577
578			/*
579			 * Finally the division.
580			 */
581			sleep_ms /= rule->rr_amount;
582
583			if (sleep_ms > rctl_throttle_max)
584				sleep_ms = rctl_throttle_max;
585#if 0
586			printf("%s: pid %d (%s), %jd of %jd, will sleep for %ld ms (ratio %ld, available %ld)\n",
587			   __func__, p->p_pid, p->p_comm,
588			   p->p_racct->r_resources[resource],
589			   rule->rr_amount, sleep_ms, sleep_ratio, available);
590#endif
591
592			KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
593			    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
594			racct_proc_throttle(p, sleep_ms);
595			continue;
596		default:
597			if (link->rrl_exceeded != 0)
598				continue;
599
600			if (p->p_state != PRS_NORMAL)
601				continue;
602
603			KASSERT(rule->rr_action > 0 &&
604			    rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
605			    ("rctl_enforce: unknown action %d",
606			     rule->rr_action));
607
608			/*
609			 * We're using the fact that RCTL_ACTION_SIG* values
610			 * are equal to their counterparts from sys/signal.h.
611			 */
612			kern_psignal(p, rule->rr_action);
613			link->rrl_exceeded = 1;
614			continue;
615		}
616	}
617
618	RCTL_RUNLOCK();
619
620	if (should_deny) {
621		/*
622		 * Return fake error code; the caller should change it
623		 * into one proper for the situation - EFSIZ, ENOMEM etc.
624		 */
625		return (EDOOFUS);
626	}
627
628	return (0);
629}
630
631uint64_t
632rctl_get_limit(struct proc *p, int resource)
633{
634	struct rctl_rule *rule;
635	struct rctl_rule_link *link;
636	uint64_t amount = UINT64_MAX;
637
638	ASSERT_RACCT_ENABLED();
639
640	RCTL_RLOCK();
641
642	/*
643	 * There may be more than one matching rule; go through all of them.
644	 * Denial should be done last, after logging and sending signals.
645	 */
646	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
647		rule = link->rrl_rule;
648		if (rule->rr_resource != resource)
649			continue;
650		if (rule->rr_action != RCTL_ACTION_DENY)
651			continue;
652		if (rule->rr_amount < amount)
653			amount = rule->rr_amount;
654	}
655
656	RCTL_RUNLOCK();
657
658	return (amount);
659}
660
661uint64_t
662rctl_get_available(struct proc *p, int resource)
663{
664	struct rctl_rule *rule;
665	struct rctl_rule_link *link;
666	int64_t available, minavailable, allocated;
667
668	minavailable = INT64_MAX;
669
670	ASSERT_RACCT_ENABLED();
671
672	RCTL_RLOCK();
673
674	/*
675	 * There may be more than one matching rule; go through all of them.
676	 * Denial should be done last, after logging and sending signals.
677	 */
678	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
679		rule = link->rrl_rule;
680		if (rule->rr_resource != resource)
681			continue;
682		if (rule->rr_action != RCTL_ACTION_DENY)
683			continue;
684		available = rctl_available_resource(p, rule);
685		if (available < minavailable)
686			minavailable = available;
687	}
688
689	RCTL_RUNLOCK();
690
691	/*
692	 * XXX: Think about this _hard_.
693	 */
694	allocated = p->p_racct->r_resources[resource];
695	if (minavailable < INT64_MAX - allocated)
696		minavailable += allocated;
697	if (minavailable < 0)
698		minavailable = 0;
699	return (minavailable);
700}
701
702static int
703rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
704{
705
706	ASSERT_RACCT_ENABLED();
707
708	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
709		if (rule->rr_subject_type != filter->rr_subject_type)
710			return (0);
711
712		switch (filter->rr_subject_type) {
713		case RCTL_SUBJECT_TYPE_PROCESS:
714			if (filter->rr_subject.rs_proc != NULL &&
715			    rule->rr_subject.rs_proc !=
716			    filter->rr_subject.rs_proc)
717				return (0);
718			break;
719		case RCTL_SUBJECT_TYPE_USER:
720			if (filter->rr_subject.rs_uip != NULL &&
721			    rule->rr_subject.rs_uip !=
722			    filter->rr_subject.rs_uip)
723				return (0);
724			break;
725		case RCTL_SUBJECT_TYPE_LOGINCLASS:
726			if (filter->rr_subject.rs_loginclass != NULL &&
727			    rule->rr_subject.rs_loginclass !=
728			    filter->rr_subject.rs_loginclass)
729				return (0);
730			break;
731		case RCTL_SUBJECT_TYPE_JAIL:
732			if (filter->rr_subject.rs_prison_racct != NULL &&
733			    rule->rr_subject.rs_prison_racct !=
734			    filter->rr_subject.rs_prison_racct)
735				return (0);
736			break;
737		default:
738			panic("rctl_rule_matches: unknown subject type %d",
739			    filter->rr_subject_type);
740		}
741	}
742
743	if (filter->rr_resource != RACCT_UNDEFINED) {
744		if (rule->rr_resource != filter->rr_resource)
745			return (0);
746	}
747
748	if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
749		if (rule->rr_action != filter->rr_action)
750			return (0);
751	}
752
753	if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
754		if (rule->rr_amount != filter->rr_amount)
755			return (0);
756	}
757
758	if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
759		if (rule->rr_per != filter->rr_per)
760			return (0);
761	}
762
763	return (1);
764}
765
766static int
767str2value(const char *str, int *value, struct dict *table)
768{
769	int i;
770
771	if (value == NULL)
772		return (EINVAL);
773
774	for (i = 0; table[i].d_name != NULL; i++) {
775		if (strcasecmp(table[i].d_name, str) == 0) {
776			*value =  table[i].d_value;
777			return (0);
778		}
779	}
780
781	return (EINVAL);
782}
783
784static int
785str2id(const char *str, id_t *value)
786{
787	char *end;
788
789	if (str == NULL)
790		return (EINVAL);
791
792	*value = strtoul(str, &end, 10);
793	if ((size_t)(end - str) != strlen(str))
794		return (EINVAL);
795
796	return (0);
797}
798
/*
 * Parse 'str' as a decimal number with strtoul() and store it in
 * '*value'.  Returns 0 on success, EINVAL when 'str' is NULL or is not
 * consumed entirely by the conversion, and ERANGE when the stored value
 * comes out negative.  '*value' is written in the latter two cases too.
 */
static int
str2int64(const char *str, int64_t *value)
{
	char *stop;

	if (str == NULL)
		return (EINVAL);

	*value = strtoul(str, &stop, 10);

	/* The conversion has to reach the terminating NUL. */
	if ((size_t)(stop - str) != strlen(str))
		return (EINVAL);

	return (*value < 0 ? ERANGE : 0);
}
816
817/*
818 * Connect the rule to the racct, increasing refcount for the rule.
819 */
820static void
821rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
822{
823	struct rctl_rule_link *link;
824
825	ASSERT_RACCT_ENABLED();
826	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
827
828	rctl_rule_acquire(rule);
829	link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
830	link->rrl_rule = rule;
831	link->rrl_exceeded = 0;
832
833	RCTL_WLOCK();
834	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
835	RCTL_WUNLOCK();
836}
837
838static int
839rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
840{
841	struct rctl_rule_link *link;
842
843	ASSERT_RACCT_ENABLED();
844	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
845	RCTL_WLOCK_ASSERT();
846
847	link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
848	if (link == NULL)
849		return (ENOMEM);
850	rctl_rule_acquire(rule);
851	link->rrl_rule = rule;
852	link->rrl_exceeded = 0;
853
854	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
855	return (0);
856}
857
858/*
859 * Remove limits for a rules matching the filter and release
860 * the refcounts for the rules, possibly freeing them.  Returns
861 * the number of limit structures removed.
862 */
863static int
864rctl_racct_remove_rules(struct racct *racct,
865    const struct rctl_rule *filter)
866{
867	int removed = 0;
868	struct rctl_rule_link *link, *linktmp;
869
870	ASSERT_RACCT_ENABLED();
871	RCTL_WLOCK_ASSERT();
872
873	LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
874		if (!rctl_rule_matches(link->rrl_rule, filter))
875			continue;
876
877		LIST_REMOVE(link, rrl_next);
878		rctl_rule_release(link->rrl_rule);
879		uma_zfree(rctl_rule_link_zone, link);
880		removed++;
881	}
882	return (removed);
883}
884
885static void
886rctl_rule_acquire_subject(struct rctl_rule *rule)
887{
888
889	ASSERT_RACCT_ENABLED();
890
891	switch (rule->rr_subject_type) {
892	case RCTL_SUBJECT_TYPE_UNDEFINED:
893	case RCTL_SUBJECT_TYPE_PROCESS:
894		break;
895	case RCTL_SUBJECT_TYPE_JAIL:
896		if (rule->rr_subject.rs_prison_racct != NULL)
897			prison_racct_hold(rule->rr_subject.rs_prison_racct);
898		break;
899	case RCTL_SUBJECT_TYPE_USER:
900		if (rule->rr_subject.rs_uip != NULL)
901			uihold(rule->rr_subject.rs_uip);
902		break;
903	case RCTL_SUBJECT_TYPE_LOGINCLASS:
904		if (rule->rr_subject.rs_loginclass != NULL)
905			loginclass_hold(rule->rr_subject.rs_loginclass);
906		break;
907	default:
908		panic("rctl_rule_acquire_subject: unknown subject type %d",
909		    rule->rr_subject_type);
910	}
911}
912
913static void
914rctl_rule_release_subject(struct rctl_rule *rule)
915{
916
917	ASSERT_RACCT_ENABLED();
918
919	switch (rule->rr_subject_type) {
920	case RCTL_SUBJECT_TYPE_UNDEFINED:
921	case RCTL_SUBJECT_TYPE_PROCESS:
922		break;
923	case RCTL_SUBJECT_TYPE_JAIL:
924		if (rule->rr_subject.rs_prison_racct != NULL)
925			prison_racct_free(rule->rr_subject.rs_prison_racct);
926		break;
927	case RCTL_SUBJECT_TYPE_USER:
928		if (rule->rr_subject.rs_uip != NULL)
929			uifree(rule->rr_subject.rs_uip);
930		break;
931	case RCTL_SUBJECT_TYPE_LOGINCLASS:
932		if (rule->rr_subject.rs_loginclass != NULL)
933			loginclass_free(rule->rr_subject.rs_loginclass);
934		break;
935	default:
936		panic("rctl_rule_release_subject: unknown subject type %d",
937		    rule->rr_subject_type);
938	}
939}
940
941struct rctl_rule *
942rctl_rule_alloc(int flags)
943{
944	struct rctl_rule *rule;
945
946	ASSERT_RACCT_ENABLED();
947
948	rule = uma_zalloc(rctl_rule_zone, flags);
949	if (rule == NULL)
950		return (NULL);
951	rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
952	rule->rr_subject.rs_proc = NULL;
953	rule->rr_subject.rs_uip = NULL;
954	rule->rr_subject.rs_loginclass = NULL;
955	rule->rr_subject.rs_prison_racct = NULL;
956	rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
957	rule->rr_resource = RACCT_UNDEFINED;
958	rule->rr_action = RCTL_ACTION_UNDEFINED;
959	rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
960	refcount_init(&rule->rr_refcount, 1);
961
962	return (rule);
963}
964
965struct rctl_rule *
966rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
967{
968	struct rctl_rule *copy;
969
970	ASSERT_RACCT_ENABLED();
971
972	copy = uma_zalloc(rctl_rule_zone, flags);
973	if (copy == NULL)
974		return (NULL);
975	copy->rr_subject_type = rule->rr_subject_type;
976	copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
977	copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
978	copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
979	copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
980	copy->rr_per = rule->rr_per;
981	copy->rr_resource = rule->rr_resource;
982	copy->rr_action = rule->rr_action;
983	copy->rr_amount = rule->rr_amount;
984	refcount_init(&copy->rr_refcount, 1);
985	rctl_rule_acquire_subject(copy);
986
987	return (copy);
988}
989
990void
991rctl_rule_acquire(struct rctl_rule *rule)
992{
993
994	ASSERT_RACCT_ENABLED();
995	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
996
997	refcount_acquire(&rule->rr_refcount);
998}
999
1000static void
1001rctl_rule_free(void *context, int pending)
1002{
1003	struct rctl_rule *rule;
1004
1005	rule = (struct rctl_rule *)context;
1006
1007	ASSERT_RACCT_ENABLED();
1008	KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1009
1010	/*
1011	 * We don't need locking here; rule is guaranteed to be inaccessible.
1012	 */
1013
1014	rctl_rule_release_subject(rule);
1015	uma_zfree(rctl_rule_zone, rule);
1016}
1017
1018void
1019rctl_rule_release(struct rctl_rule *rule)
1020{
1021
1022	ASSERT_RACCT_ENABLED();
1023	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1024
1025	if (refcount_release(&rule->rr_refcount)) {
1026		/*
1027		 * rctl_rule_release() is often called when iterating
1028		 * over all the uidinfo structures in the system,
1029		 * holding uihashtbl_lock.  Since rctl_rule_free()
1030		 * might end up calling uifree(), this would lead
1031		 * to lock recursion.  Use taskqueue to avoid this.
1032		 */
1033		TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1034		taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1035	}
1036}
1037
1038static int
1039rctl_rule_fully_specified(const struct rctl_rule *rule)
1040{
1041
1042	ASSERT_RACCT_ENABLED();
1043
1044	switch (rule->rr_subject_type) {
1045	case RCTL_SUBJECT_TYPE_UNDEFINED:
1046		return (0);
1047	case RCTL_SUBJECT_TYPE_PROCESS:
1048		if (rule->rr_subject.rs_proc == NULL)
1049			return (0);
1050		break;
1051	case RCTL_SUBJECT_TYPE_USER:
1052		if (rule->rr_subject.rs_uip == NULL)
1053			return (0);
1054		break;
1055	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1056		if (rule->rr_subject.rs_loginclass == NULL)
1057			return (0);
1058		break;
1059	case RCTL_SUBJECT_TYPE_JAIL:
1060		if (rule->rr_subject.rs_prison_racct == NULL)
1061			return (0);
1062		break;
1063	default:
1064		panic("rctl_rule_fully_specified: unknown subject type %d",
1065		    rule->rr_subject_type);
1066	}
1067	if (rule->rr_resource == RACCT_UNDEFINED)
1068		return (0);
1069	if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1070		return (0);
1071	if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1072		return (0);
1073	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1074		return (0);
1075
1076	return (1);
1077}
1078
1079static int
1080rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
1081{
1082	int error = 0;
1083	char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
1084	     *amountstr, *perstr;
1085	struct rctl_rule *rule;
1086	id_t id;
1087
1088	ASSERT_RACCT_ENABLED();
1089
1090	rule = rctl_rule_alloc(M_WAITOK);
1091
1092	subjectstr = strsep(&rulestr, ":");
1093	subject_idstr = strsep(&rulestr, ":");
1094	resourcestr = strsep(&rulestr, ":");
1095	actionstr = strsep(&rulestr, "=/");
1096	amountstr = strsep(&rulestr, "/");
1097	perstr = rulestr;
1098
1099	if (subjectstr == NULL || subjectstr[0] == '\0')
1100		rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1101	else {
1102		error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
1103		if (error != 0)
1104			goto out;
1105	}
1106
1107	if (subject_idstr == NULL || subject_idstr[0] == '\0') {
1108		rule->rr_subject.rs_proc = NULL;
1109		rule->rr_subject.rs_uip = NULL;
1110		rule->rr_subject.rs_loginclass = NULL;
1111		rule->rr_subject.rs_prison_racct = NULL;
1112	} else {
1113		switch (rule->rr_subject_type) {
1114		case RCTL_SUBJECT_TYPE_UNDEFINED:
1115			error = EINVAL;
1116			goto out;
1117		case RCTL_SUBJECT_TYPE_PROCESS:
1118			error = str2id(subject_idstr, &id);
1119			if (error != 0)
1120				goto out;
1121			sx_assert(&allproc_lock, SA_LOCKED);
1122			rule->rr_subject.rs_proc = pfind(id);
1123			if (rule->rr_subject.rs_proc == NULL) {
1124				error = ESRCH;
1125				goto out;
1126			}
1127			PROC_UNLOCK(rule->rr_subject.rs_proc);
1128			break;
1129		case RCTL_SUBJECT_TYPE_USER:
1130			error = str2id(subject_idstr, &id);
1131			if (error != 0)
1132				goto out;
1133			rule->rr_subject.rs_uip = uifind(id);
1134			break;
1135		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1136			rule->rr_subject.rs_loginclass =
1137			    loginclass_find(subject_idstr);
1138			if (rule->rr_subject.rs_loginclass == NULL) {
1139				error = ENAMETOOLONG;
1140				goto out;
1141			}
1142			break;
1143		case RCTL_SUBJECT_TYPE_JAIL:
1144			rule->rr_subject.rs_prison_racct =
1145			    prison_racct_find(subject_idstr);
1146			if (rule->rr_subject.rs_prison_racct == NULL) {
1147				error = ENAMETOOLONG;
1148				goto out;
1149			}
1150			break;
1151               default:
1152                       panic("rctl_string_to_rule: unknown subject type %d",
1153                           rule->rr_subject_type);
1154               }
1155	}
1156
1157	if (resourcestr == NULL || resourcestr[0] == '\0')
1158		rule->rr_resource = RACCT_UNDEFINED;
1159	else {
1160		error = str2value(resourcestr, &rule->rr_resource,
1161		    resourcenames);
1162		if (error != 0)
1163			goto out;
1164	}
1165
1166	if (actionstr == NULL || actionstr[0] == '\0')
1167		rule->rr_action = RCTL_ACTION_UNDEFINED;
1168	else {
1169		error = str2value(actionstr, &rule->rr_action, actionnames);
1170		if (error != 0)
1171			goto out;
1172	}
1173
1174	if (amountstr == NULL || amountstr[0] == '\0')
1175		rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1176	else {
1177		error = str2int64(amountstr, &rule->rr_amount);
1178		if (error != 0)
1179			goto out;
1180		if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
1181			if (rule->rr_amount > INT64_MAX / 1000000) {
1182				error = ERANGE;
1183				goto out;
1184			}
1185			rule->rr_amount *= 1000000;
1186		}
1187	}
1188
1189	if (perstr == NULL || perstr[0] == '\0')
1190		rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1191	else {
1192		error = str2value(perstr, &rule->rr_per, subjectnames);
1193		if (error != 0)
1194			goto out;
1195	}
1196
1197out:
1198	if (error == 0)
1199		*rulep = rule;
1200	else
1201		rctl_rule_release(rule);
1202
1203	return (error);
1204}
1205
1206/*
1207 * Link a rule with all the subjects it applies to.
1208 */
1209int
1210rctl_rule_add(struct rctl_rule *rule)
1211{
1212	struct proc *p;
1213	struct ucred *cred;
1214	struct uidinfo *uip;
1215	struct prison *pr;
1216	struct prison_racct *prr;
1217	struct loginclass *lc;
1218	struct rctl_rule *rule2;
1219	int match;
1220
1221	ASSERT_RACCT_ENABLED();
1222	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1223
1224	/*
1225	 * Some rules just don't make sense, like "deny" rule for an undeniable
1226	 * resource.  The exception are the RSS and %CPU resources - they are
1227	 * not deniable in the racct sense, but the limit is enforced in
1228	 * a different way.
1229	 */
1230	if (rule->rr_action == RCTL_ACTION_DENY &&
1231	    !RACCT_IS_DENIABLE(rule->rr_resource) &&
1232	    rule->rr_resource != RACCT_RSS &&
1233	    rule->rr_resource != RACCT_PCTCPU) {
1234		return (EOPNOTSUPP);
1235	}
1236
1237	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1238	    !RACCT_IS_DECAYING(rule->rr_resource)) {
1239		return (EOPNOTSUPP);
1240	}
1241
1242	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1243	    rule->rr_resource == RACCT_PCTCPU) {
1244		return (EOPNOTSUPP);
1245	}
1246
1247	if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1248	    RACCT_IS_SLOPPY(rule->rr_resource)) {
1249		return (EOPNOTSUPP);
1250	}
1251
1252	/*
1253	 * Make sure there are no duplicated rules.  Also, for the "deny"
1254	 * rules, remove ones differing only by "amount".
1255	 */
1256	if (rule->rr_action == RCTL_ACTION_DENY) {
1257		rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1258		rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1259		rctl_rule_remove(rule2);
1260		rctl_rule_release(rule2);
1261	} else
1262		rctl_rule_remove(rule);
1263
1264	switch (rule->rr_subject_type) {
1265	case RCTL_SUBJECT_TYPE_PROCESS:
1266		p = rule->rr_subject.rs_proc;
1267		KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1268
1269		rctl_racct_add_rule(p->p_racct, rule);
1270		/*
1271		 * In case of per-process rule, we don't have anything more
1272		 * to do.
1273		 */
1274		return (0);
1275
1276	case RCTL_SUBJECT_TYPE_USER:
1277		uip = rule->rr_subject.rs_uip;
1278		KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1279		rctl_racct_add_rule(uip->ui_racct, rule);
1280		break;
1281
1282	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1283		lc = rule->rr_subject.rs_loginclass;
1284		KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1285		rctl_racct_add_rule(lc->lc_racct, rule);
1286		break;
1287
1288	case RCTL_SUBJECT_TYPE_JAIL:
1289		prr = rule->rr_subject.rs_prison_racct;
1290		KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1291		rctl_racct_add_rule(prr->prr_racct, rule);
1292		break;
1293
1294	default:
1295		panic("rctl_rule_add: unknown subject type %d",
1296		    rule->rr_subject_type);
1297	}
1298
1299	/*
1300	 * Now go through all the processes and add the new rule to the ones
1301	 * it applies to.
1302	 */
1303	sx_assert(&allproc_lock, SA_LOCKED);
1304	FOREACH_PROC_IN_SYSTEM(p) {
1305		cred = p->p_ucred;
1306		switch (rule->rr_subject_type) {
1307		case RCTL_SUBJECT_TYPE_USER:
1308			if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1309			    cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1310				break;
1311			continue;
1312		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1313			if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1314				break;
1315			continue;
1316		case RCTL_SUBJECT_TYPE_JAIL:
1317			match = 0;
1318			for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1319				if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1320					match = 1;
1321					break;
1322				}
1323			}
1324			if (match)
1325				break;
1326			continue;
1327		default:
1328			panic("rctl_rule_add: unknown subject type %d",
1329			    rule->rr_subject_type);
1330		}
1331
1332		rctl_racct_add_rule(p->p_racct, rule);
1333	}
1334
1335	return (0);
1336}
1337
/*
 * "Pre" hook handed to the *_racct_foreach() iterators: takes the RCTL
 * write lock.  Paired with rctl_rule_post_callback().
 */
static void
rctl_rule_pre_callback(void)
{

	RCTL_WLOCK();
}
1344
/*
 * "Post" hook handed to the *_racct_foreach() iterators: drops the RCTL
 * write lock taken by rctl_rule_pre_callback().
 */
static void
rctl_rule_post_callback(void)
{

	RCTL_WUNLOCK();
}
1351
/*
 * Per-racct callback for rctl_rule_remove(): removes the rules matching
 * the filter (arg2) from one racct and adds the number removed to the
 * int counter that arg3 points to.  Runs with the RCTL write lock held.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	struct rctl_rule *filter;
	int *totalp;

	ASSERT_RACCT_ENABLED();
	RCTL_WLOCK_ASSERT();

	filter = arg2;
	totalp = arg3;

	*totalp += rctl_racct_remove_rules(racct, filter);
}
1365
1366/*
1367 * Remove all rules that match the filter.
1368 */
1369int
1370rctl_rule_remove(struct rctl_rule *filter)
1371{
1372	int found = 0;
1373	struct proc *p;
1374
1375	ASSERT_RACCT_ENABLED();
1376
1377	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1378	    filter->rr_subject.rs_proc != NULL) {
1379		p = filter->rr_subject.rs_proc;
1380		RCTL_WLOCK();
1381		found = rctl_racct_remove_rules(p->p_racct, filter);
1382		RCTL_WUNLOCK();
1383		if (found)
1384			return (0);
1385		return (ESRCH);
1386	}
1387
1388	loginclass_racct_foreach(rctl_rule_remove_callback,
1389	    rctl_rule_pre_callback, rctl_rule_post_callback,
1390	    filter, (void *)&found);
1391	ui_racct_foreach(rctl_rule_remove_callback,
1392	    rctl_rule_pre_callback, rctl_rule_post_callback,
1393	    filter, (void *)&found);
1394	prison_racct_foreach(rctl_rule_remove_callback,
1395	    rctl_rule_pre_callback, rctl_rule_post_callback,
1396	    filter, (void *)&found);
1397
1398	sx_assert(&allproc_lock, SA_LOCKED);
1399	RCTL_WLOCK();
1400	FOREACH_PROC_IN_SYSTEM(p) {
1401		found += rctl_racct_remove_rules(p->p_racct, filter);
1402	}
1403	RCTL_WUNLOCK();
1404
1405	if (found)
1406		return (0);
1407	return (ESRCH);
1408}
1409
1410/*
1411 * Appends a rule to the sbuf.
1412 */
1413static void
1414rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1415{
1416	int64_t amount;
1417
1418	ASSERT_RACCT_ENABLED();
1419
1420	sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1421
1422	switch (rule->rr_subject_type) {
1423	case RCTL_SUBJECT_TYPE_PROCESS:
1424		if (rule->rr_subject.rs_proc == NULL)
1425			sbuf_printf(sb, ":");
1426		else
1427			sbuf_printf(sb, "%d:",
1428			    rule->rr_subject.rs_proc->p_pid);
1429		break;
1430	case RCTL_SUBJECT_TYPE_USER:
1431		if (rule->rr_subject.rs_uip == NULL)
1432			sbuf_printf(sb, ":");
1433		else
1434			sbuf_printf(sb, "%d:",
1435			    rule->rr_subject.rs_uip->ui_uid);
1436		break;
1437	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1438		if (rule->rr_subject.rs_loginclass == NULL)
1439			sbuf_printf(sb, ":");
1440		else
1441			sbuf_printf(sb, "%s:",
1442			    rule->rr_subject.rs_loginclass->lc_name);
1443		break;
1444	case RCTL_SUBJECT_TYPE_JAIL:
1445		if (rule->rr_subject.rs_prison_racct == NULL)
1446			sbuf_printf(sb, ":");
1447		else
1448			sbuf_printf(sb, "%s:",
1449			    rule->rr_subject.rs_prison_racct->prr_name);
1450		break;
1451	default:
1452		panic("rctl_rule_to_sbuf: unknown subject type %d",
1453		    rule->rr_subject_type);
1454	}
1455
1456	amount = rule->rr_amount;
1457	if (amount != RCTL_AMOUNT_UNDEFINED &&
1458	    RACCT_IS_IN_MILLIONS(rule->rr_resource))
1459		amount /= 1000000;
1460
1461	sbuf_printf(sb, "%s:%s=%jd",
1462	    rctl_resource_name(rule->rr_resource),
1463	    rctl_action_name(rule->rr_action),
1464	    amount);
1465
1466	if (rule->rr_per != rule->rr_subject_type)
1467		sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1468}
1469
1470/*
1471 * Routine used by RCTL syscalls to read in input string.
1472 */
1473static int
1474rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1475{
1476	int error;
1477	char *str;
1478
1479	ASSERT_RACCT_ENABLED();
1480
1481	if (inbuflen <= 0)
1482		return (EINVAL);
1483	if (inbuflen > RCTL_MAX_INBUFSIZE)
1484		return (E2BIG);
1485
1486	str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1487	error = copyinstr(inbufp, str, inbuflen, NULL);
1488	if (error != 0) {
1489		free(str, M_RCTL);
1490		return (error);
1491	}
1492
1493	*inputstr = str;
1494
1495	return (0);
1496}
1497
1498/*
1499 * Routine used by RCTL syscalls to write out output string.
1500 */
1501static int
1502rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
1503{
1504	int error;
1505
1506	ASSERT_RACCT_ENABLED();
1507
1508	if (outputsbuf == NULL)
1509		return (0);
1510
1511	sbuf_finish(outputsbuf);
1512	if (outbuflen < sbuf_len(outputsbuf) + 1) {
1513		sbuf_delete(outputsbuf);
1514		return (ERANGE);
1515	}
1516	error = copyout(sbuf_data(outputsbuf), outbufp,
1517	    sbuf_len(outputsbuf) + 1);
1518	sbuf_delete(outputsbuf);
1519	return (error);
1520}
1521
1522static struct sbuf *
1523rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1524{
1525	int i;
1526	int64_t amount;
1527	struct sbuf *sb;
1528
1529	ASSERT_RACCT_ENABLED();
1530
1531	sb = sbuf_new_auto();
1532	for (i = 0; i <= RACCT_MAX; i++) {
1533		if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1534			continue;
1535		amount = racct->r_resources[i];
1536		if (RACCT_IS_IN_MILLIONS(i))
1537			amount /= 1000000;
1538		sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1539	}
1540	sbuf_setpos(sb, sbuf_len(sb) - 1);
1541	return (sb);
1542}
1543
1544int
1545sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1546{
1547	int error;
1548	char *inputstr;
1549	struct rctl_rule *filter;
1550	struct sbuf *outputsbuf = NULL;
1551	struct proc *p;
1552	struct uidinfo *uip;
1553	struct loginclass *lc;
1554	struct prison_racct *prr;
1555
1556	if (!racct_enable)
1557		return (ENOSYS);
1558
1559	error = priv_check(td, PRIV_RCTL_GET_RACCT);
1560	if (error != 0)
1561		return (error);
1562
1563	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1564	if (error != 0)
1565		return (error);
1566
1567	sx_slock(&allproc_lock);
1568	error = rctl_string_to_rule(inputstr, &filter);
1569	free(inputstr, M_RCTL);
1570	if (error != 0) {
1571		sx_sunlock(&allproc_lock);
1572		return (error);
1573	}
1574
1575	switch (filter->rr_subject_type) {
1576	case RCTL_SUBJECT_TYPE_PROCESS:
1577		p = filter->rr_subject.rs_proc;
1578		if (p == NULL) {
1579			error = EINVAL;
1580			goto out;
1581		}
1582		outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1583		break;
1584	case RCTL_SUBJECT_TYPE_USER:
1585		uip = filter->rr_subject.rs_uip;
1586		if (uip == NULL) {
1587			error = EINVAL;
1588			goto out;
1589		}
1590		outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1591		break;
1592	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1593		lc = filter->rr_subject.rs_loginclass;
1594		if (lc == NULL) {
1595			error = EINVAL;
1596			goto out;
1597		}
1598		outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1599		break;
1600	case RCTL_SUBJECT_TYPE_JAIL:
1601		prr = filter->rr_subject.rs_prison_racct;
1602		if (prr == NULL) {
1603			error = EINVAL;
1604			goto out;
1605		}
1606		outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1607		break;
1608	default:
1609		error = EINVAL;
1610	}
1611out:
1612	rctl_rule_release(filter);
1613	sx_sunlock(&allproc_lock);
1614	if (error != 0)
1615		return (error);
1616
1617	error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1618
1619	return (error);
1620}
1621
1622static void
1623rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1624{
1625	struct rctl_rule *filter = (struct rctl_rule *)arg2;
1626	struct rctl_rule_link *link;
1627	struct sbuf *sb = (struct sbuf *)arg3;
1628
1629	ASSERT_RACCT_ENABLED();
1630	RCTL_LOCK_ASSERT();
1631
1632	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1633		if (!rctl_rule_matches(link->rrl_rule, filter))
1634			continue;
1635		rctl_rule_to_sbuf(sb, link->rrl_rule);
1636		sbuf_printf(sb, ",");
1637	}
1638}
1639
1640int
1641sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1642{
1643	int error;
1644	size_t bufsize;
1645	char *inputstr, *buf;
1646	struct sbuf *sb;
1647	struct rctl_rule *filter;
1648	struct rctl_rule_link *link;
1649	struct proc *p;
1650
1651	if (!racct_enable)
1652		return (ENOSYS);
1653
1654	error = priv_check(td, PRIV_RCTL_GET_RULES);
1655	if (error != 0)
1656		return (error);
1657
1658	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1659	if (error != 0)
1660		return (error);
1661
1662	sx_slock(&allproc_lock);
1663	error = rctl_string_to_rule(inputstr, &filter);
1664	free(inputstr, M_RCTL);
1665	if (error != 0) {
1666		sx_sunlock(&allproc_lock);
1667		return (error);
1668	}
1669
1670	bufsize = uap->outbuflen;
1671	if (bufsize > rctl_maxbufsize) {
1672		sx_sunlock(&allproc_lock);
1673		return (E2BIG);
1674	}
1675
1676	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1677	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1678	KASSERT(sb != NULL, ("sbuf_new failed"));
1679
1680	FOREACH_PROC_IN_SYSTEM(p) {
1681		RCTL_RLOCK();
1682		LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1683			/*
1684			 * Non-process rules will be added to the buffer later.
1685			 * Adding them here would result in duplicated output.
1686			 */
1687			if (link->rrl_rule->rr_subject_type !=
1688			    RCTL_SUBJECT_TYPE_PROCESS)
1689				continue;
1690			if (!rctl_rule_matches(link->rrl_rule, filter))
1691				continue;
1692			rctl_rule_to_sbuf(sb, link->rrl_rule);
1693			sbuf_printf(sb, ",");
1694		}
1695		RCTL_RUNLOCK();
1696	}
1697
1698	loginclass_racct_foreach(rctl_get_rules_callback,
1699	    rctl_rule_pre_callback, rctl_rule_post_callback,
1700	    filter, sb);
1701	ui_racct_foreach(rctl_get_rules_callback,
1702	    rctl_rule_pre_callback, rctl_rule_post_callback,
1703	    filter, sb);
1704	prison_racct_foreach(rctl_get_rules_callback,
1705	    rctl_rule_pre_callback, rctl_rule_post_callback,
1706	    filter, sb);
1707	if (sbuf_error(sb) == ENOMEM) {
1708		error = ERANGE;
1709		goto out;
1710	}
1711
1712	/*
1713	 * Remove trailing ",".
1714	 */
1715	if (sbuf_len(sb) > 0)
1716		sbuf_setpos(sb, sbuf_len(sb) - 1);
1717
1718	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1719out:
1720	rctl_rule_release(filter);
1721	sx_sunlock(&allproc_lock);
1722	free(buf, M_RCTL);
1723	return (error);
1724}
1725
1726int
1727sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1728{
1729	int error;
1730	size_t bufsize;
1731	char *inputstr, *buf;
1732	struct sbuf *sb;
1733	struct rctl_rule *filter;
1734	struct rctl_rule_link *link;
1735
1736	if (!racct_enable)
1737		return (ENOSYS);
1738
1739	error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1740	if (error != 0)
1741		return (error);
1742
1743	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1744	if (error != 0)
1745		return (error);
1746
1747	sx_slock(&allproc_lock);
1748	error = rctl_string_to_rule(inputstr, &filter);
1749	free(inputstr, M_RCTL);
1750	if (error != 0) {
1751		sx_sunlock(&allproc_lock);
1752		return (error);
1753	}
1754
1755	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1756		rctl_rule_release(filter);
1757		sx_sunlock(&allproc_lock);
1758		return (EINVAL);
1759	}
1760	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1761		rctl_rule_release(filter);
1762		sx_sunlock(&allproc_lock);
1763		return (EOPNOTSUPP);
1764	}
1765	if (filter->rr_subject.rs_proc == NULL) {
1766		rctl_rule_release(filter);
1767		sx_sunlock(&allproc_lock);
1768		return (EINVAL);
1769	}
1770
1771	bufsize = uap->outbuflen;
1772	if (bufsize > rctl_maxbufsize) {
1773		rctl_rule_release(filter);
1774		sx_sunlock(&allproc_lock);
1775		return (E2BIG);
1776	}
1777
1778	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1779	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1780	KASSERT(sb != NULL, ("sbuf_new failed"));
1781
1782	RCTL_RLOCK();
1783	LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1784	    rrl_next) {
1785		rctl_rule_to_sbuf(sb, link->rrl_rule);
1786		sbuf_printf(sb, ",");
1787	}
1788	RCTL_RUNLOCK();
1789	if (sbuf_error(sb) == ENOMEM) {
1790		error = ERANGE;
1791		goto out;
1792	}
1793
1794	/*
1795	 * Remove trailing ",".
1796	 */
1797	if (sbuf_len(sb) > 0)
1798		sbuf_setpos(sb, sbuf_len(sb) - 1);
1799
1800	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1801out:
1802	rctl_rule_release(filter);
1803	sx_sunlock(&allproc_lock);
1804	free(buf, M_RCTL);
1805	return (error);
1806}
1807
1808int
1809sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1810{
1811	int error;
1812	struct rctl_rule *rule;
1813	char *inputstr;
1814
1815	if (!racct_enable)
1816		return (ENOSYS);
1817
1818	error = priv_check(td, PRIV_RCTL_ADD_RULE);
1819	if (error != 0)
1820		return (error);
1821
1822	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1823	if (error != 0)
1824		return (error);
1825
1826	sx_slock(&allproc_lock);
1827	error = rctl_string_to_rule(inputstr, &rule);
1828	free(inputstr, M_RCTL);
1829	if (error != 0) {
1830		sx_sunlock(&allproc_lock);
1831		return (error);
1832	}
1833	/*
1834	 * The 'per' part of a rule is optional.
1835	 */
1836	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1837	    rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1838		rule->rr_per = rule->rr_subject_type;
1839
1840	if (!rctl_rule_fully_specified(rule)) {
1841		error = EINVAL;
1842		goto out;
1843	}
1844
1845	error = rctl_rule_add(rule);
1846
1847out:
1848	rctl_rule_release(rule);
1849	sx_sunlock(&allproc_lock);
1850	return (error);
1851}
1852
1853int
1854sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1855{
1856	int error;
1857	struct rctl_rule *filter;
1858	char *inputstr;
1859
1860	if (!racct_enable)
1861		return (ENOSYS);
1862
1863	error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1864	if (error != 0)
1865		return (error);
1866
1867	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1868	if (error != 0)
1869		return (error);
1870
1871	sx_slock(&allproc_lock);
1872	error = rctl_string_to_rule(inputstr, &filter);
1873	free(inputstr, M_RCTL);
1874	if (error != 0) {
1875		sx_sunlock(&allproc_lock);
1876		return (error);
1877	}
1878
1879	error = rctl_rule_remove(filter);
1880	rctl_rule_release(filter);
1881	sx_sunlock(&allproc_lock);
1882
1883	return (error);
1884}
1885
1886/*
1887 * Update RCTL rule list after credential change.
1888 */
1889void
1890rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
1891{
1892	int rulecnt, i;
1893	struct rctl_rule_link *link, *newlink;
1894	struct uidinfo *newuip;
1895	struct loginclass *newlc;
1896	struct prison_racct *newprr;
1897	LIST_HEAD(, rctl_rule_link) newrules;
1898
1899	ASSERT_RACCT_ENABLED();
1900
1901	newuip = newcred->cr_ruidinfo;
1902	newlc = newcred->cr_loginclass;
1903	newprr = newcred->cr_prison->pr_prison_racct;
1904
1905	LIST_INIT(&newrules);
1906
1907again:
1908	/*
1909	 * First, count the rules that apply to the process with new
1910	 * credentials.
1911	 */
1912	rulecnt = 0;
1913	RCTL_RLOCK();
1914	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1915		if (link->rrl_rule->rr_subject_type ==
1916		    RCTL_SUBJECT_TYPE_PROCESS)
1917			rulecnt++;
1918	}
1919	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
1920		rulecnt++;
1921	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
1922		rulecnt++;
1923	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
1924		rulecnt++;
1925	RCTL_RUNLOCK();
1926
1927	/*
1928	 * Create temporary list.  We've dropped the rctl_lock in order
1929	 * to use M_WAITOK.
1930	 */
1931	for (i = 0; i < rulecnt; i++) {
1932		newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
1933		newlink->rrl_rule = NULL;
1934		newlink->rrl_exceeded = 0;
1935		LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
1936	}
1937
1938	newlink = LIST_FIRST(&newrules);
1939
1940	/*
1941	 * Assign rules to the newly allocated list entries.
1942	 */
1943	RCTL_WLOCK();
1944	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1945		if (link->rrl_rule->rr_subject_type ==
1946		    RCTL_SUBJECT_TYPE_PROCESS) {
1947			if (newlink == NULL)
1948				goto goaround;
1949			rctl_rule_acquire(link->rrl_rule);
1950			newlink->rrl_rule = link->rrl_rule;
1951			newlink->rrl_exceeded = link->rrl_exceeded;
1952			newlink = LIST_NEXT(newlink, rrl_next);
1953			rulecnt--;
1954		}
1955	}
1956
1957	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
1958		if (newlink == NULL)
1959			goto goaround;
1960		rctl_rule_acquire(link->rrl_rule);
1961		newlink->rrl_rule = link->rrl_rule;
1962		newlink->rrl_exceeded = link->rrl_exceeded;
1963		newlink = LIST_NEXT(newlink, rrl_next);
1964		rulecnt--;
1965	}
1966
1967	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
1968		if (newlink == NULL)
1969			goto goaround;
1970		rctl_rule_acquire(link->rrl_rule);
1971		newlink->rrl_rule = link->rrl_rule;
1972		newlink->rrl_exceeded = link->rrl_exceeded;
1973		newlink = LIST_NEXT(newlink, rrl_next);
1974		rulecnt--;
1975	}
1976
1977	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
1978		if (newlink == NULL)
1979			goto goaround;
1980		rctl_rule_acquire(link->rrl_rule);
1981		newlink->rrl_rule = link->rrl_rule;
1982		newlink->rrl_exceeded = link->rrl_exceeded;
1983		newlink = LIST_NEXT(newlink, rrl_next);
1984		rulecnt--;
1985	}
1986
1987	if (rulecnt == 0) {
1988		/*
1989		 * Free the old rule list.
1990		 */
1991		while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
1992			link = LIST_FIRST(&p->p_racct->r_rule_links);
1993			LIST_REMOVE(link, rrl_next);
1994			rctl_rule_release(link->rrl_rule);
1995			uma_zfree(rctl_rule_link_zone, link);
1996		}
1997
1998		/*
1999		 * Replace lists and we're done.
2000		 *
2001		 * XXX: Is there any way to switch list heads instead
2002		 *      of iterating here?
2003		 */
2004		while (!LIST_EMPTY(&newrules)) {
2005			newlink = LIST_FIRST(&newrules);
2006			LIST_REMOVE(newlink, rrl_next);
2007			LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
2008			    newlink, rrl_next);
2009		}
2010
2011		RCTL_WUNLOCK();
2012
2013		return;
2014	}
2015
2016goaround:
2017	RCTL_WUNLOCK();
2018
2019	/*
2020	 * Rule list changed while we were not holding the rctl_lock.
2021	 * Free the new list and try again.
2022	 */
2023	while (!LIST_EMPTY(&newrules)) {
2024		newlink = LIST_FIRST(&newrules);
2025		LIST_REMOVE(newlink, rrl_next);
2026		if (newlink->rrl_rule != NULL)
2027			rctl_rule_release(newlink->rrl_rule);
2028		uma_zfree(rctl_rule_link_zone, newlink);
2029	}
2030
2031	goto again;
2032}
2033
2034/*
2035 * Assign RCTL rules to the newly created process.
2036 */
2037int
2038rctl_proc_fork(struct proc *parent, struct proc *child)
2039{
2040	int error;
2041	struct rctl_rule_link *link;
2042	struct rctl_rule *rule;
2043
2044	LIST_INIT(&child->p_racct->r_rule_links);
2045
2046	ASSERT_RACCT_ENABLED();
2047	KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2048
2049	RCTL_WLOCK();
2050
2051	/*
2052	 * Go through limits applicable to the parent and assign them
2053	 * to the child.  Rules with 'process' subject have to be duplicated
2054	 * in order to make their rr_subject point to the new process.
2055	 */
2056	LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2057		if (link->rrl_rule->rr_subject_type ==
2058		    RCTL_SUBJECT_TYPE_PROCESS) {
2059			rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2060			if (rule == NULL)
2061				goto fail;
2062			KASSERT(rule->rr_subject.rs_proc == parent,
2063			    ("rule->rr_subject.rs_proc != parent"));
2064			rule->rr_subject.rs_proc = child;
2065			error = rctl_racct_add_rule_locked(child->p_racct,
2066			    rule);
2067			rctl_rule_release(rule);
2068			if (error != 0)
2069				goto fail;
2070		} else {
2071			error = rctl_racct_add_rule_locked(child->p_racct,
2072			    link->rrl_rule);
2073			if (error != 0)
2074				goto fail;
2075		}
2076	}
2077
2078	RCTL_WUNLOCK();
2079	return (0);
2080
2081fail:
2082	while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2083		link = LIST_FIRST(&child->p_racct->r_rule_links);
2084		LIST_REMOVE(link, rrl_next);
2085		rctl_rule_release(link->rrl_rule);
2086		uma_zfree(rctl_rule_link_zone, link);
2087	}
2088	RCTL_WUNLOCK();
2089	return (EAGAIN);
2090}
2091
2092/*
2093 * Release rules attached to the racct.
2094 */
2095void
2096rctl_racct_release(struct racct *racct)
2097{
2098	struct rctl_rule_link *link;
2099
2100	ASSERT_RACCT_ENABLED();
2101
2102	RCTL_WLOCK();
2103	while (!LIST_EMPTY(&racct->r_rule_links)) {
2104		link = LIST_FIRST(&racct->r_rule_links);
2105		LIST_REMOVE(link, rrl_next);
2106		rctl_rule_release(link->rrl_rule);
2107		uma_zfree(rctl_rule_link_zone, link);
2108	}
2109	RCTL_WUNLOCK();
2110}
2111
2112static void
2113rctl_init(void)
2114{
2115
2116	if (!racct_enable)
2117		return;
2118
2119	rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2120	    sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2121	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
2122	rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2123	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
2124
2125	if (rctl_throttle_min <= 0)
2126		rctl_throttle_min = 1;
2127	if (rctl_throttle_max <= 0)
2128		rctl_throttle_max = 2 * hz;
2129	if (rctl_throttle_pct <= 0)
2130		rctl_throttle_pct = 100;
2131	if (rctl_throttle_pct2 <= 0)
2132		rctl_throttle_pct2 = 100;
2133}
2134
2135#else /* !RCTL */
2136
/*
 * Stub for builds without RCTL: the call is not implemented, so it always
 * fails with ENOSYS.
 */
int
sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{

	return (ENOSYS);
}
2143
/*
 * Stub for builds without RCTL: the call is not implemented, so it always
 * fails with ENOSYS.
 */
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{

	return (ENOSYS);
}
2150
/*
 * Stub for builds without RCTL: the call is not implemented, so it always
 * fails with ENOSYS.
 */
int
sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{

	return (ENOSYS);
}
2157
/*
 * Stub for builds without RCTL: the call is not implemented, so it always
 * fails with ENOSYS.
 */
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{

	return (ENOSYS);
}
2164
/*
 * Stub for builds without RCTL: the call is not implemented, so it always
 * fails with ENOSYS.
 */
int
sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{

	return (ENOSYS);
}
2171
2172#endif /* !RCTL */
2173