/*-
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by Edward Tomasz Napierala under sponsorship
 * from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/kern/kern_racct.c 258622 2013-11-26 08:46:27Z avg $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_racct.c 258622 2013-11-26 08:46:27Z avg $");

#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/loginclass.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/umtx.h>
#include <machine/smp.h>

#ifdef RCTL
#include <sys/rctl.h>
#endif

#ifdef RACCT

FEATURE(racct, "Resource Accounting");

/*
 * Do not block processes that have their %cpu usage <= pcpu_threshold.
 */
static int pcpu_threshold = 1;

SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting");
SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
    0, "Processes with higher %cpu usage than this value can be throttled.");

/*
 * How many seconds to wait after a process starts before we trust the
 * scheduler's %cpu calculations.  Until then, we compute the %cpu usage
 * ourselves, by dividing the process runtime by the process wall clock
 * time.  After RACCT_PCPU_SECS pass, we use the value provided by the
 * scheduler.
 */
#define RACCT_PCPU_SECS		3

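/*
 * Global mutex protecting all racct containers; the resource counters in
 * them are only modified with this lock held.
 */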
static struct mtx racct_lock;
MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF);

static uma_zone_t racct_zone;

static void racct_sub_racct(struct racct *dest, const struct racct *src);
static void racct_sub_cred_locked(struct ucred *cred, int resource,
		uint64_t amount);
static void racct_add_cred_locked(struct ucred *cred, int resource,
		uint64_t amount);

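/*
 * DTrace (SDT) probes fired from the racct functions below, so that
 * resource accounting events can be traced.
 */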
SDT_PROVIDER_DEFINE(racct);
SDT_PROBE_DEFINE3(racct, kernel, rusage, add, "struct proc *", "int",
    "uint64_t");
SDT_PROBE_DEFINE3(racct, kernel, rusage, add__failure,
    "struct proc *", "int", "uint64_t");
SDT_PROBE_DEFINE3(racct, kernel, rusage, add__cred, "struct ucred *",
    "int", "uint64_t");
SDT_PROBE_DEFINE3(racct, kernel, rusage, add__force, "struct proc *",
    "int", "uint64_t");
SDT_PROBE_DEFINE3(racct, kernel, rusage, set, "struct proc *", "int",
    "uint64_t");
SDT_PROBE_DEFINE3(racct, kernel, rusage, set__failure,
    "struct proc *", "int", "uint64_t");
SDT_PROBE_DEFINE3(racct, kernel, rusage, sub, "struct proc *", "int",
    "uint64_t");
SDT_PROBE_DEFINE3(racct, kernel, rusage, sub__cred, "struct ucred *",
    "int", "uint64_t");
SDT_PROBE_DEFINE1(racct, kernel, racct, create, "struct racct *");
SDT_PROBE_DEFINE1(racct, kernel, racct, destroy, "struct racct *");
SDT_PROBE_DEFINE2(racct, kernel, racct, join, "struct racct *",
    "struct racct *");
SDT_PROBE_DEFINE2(racct, kernel, racct, join__failure,
    "struct racct *", "struct racct *");
SDT_PROBE_DEFINE2(racct, kernel, racct, leave, "struct racct *",
    "struct racct *");

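/*
 * Per-resource behaviour flags, indexed by resource ID (RACCT_*).  The
 * RACCT_IS_*() macros consult this table, e.g. to decide whether usage of
 * a given resource can be reclaimed, is inherited on fork, or may be
 * denied when a limit is exceeded.
 */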
int racct_types[] = {
	[RACCT_CPU] =
		RACCT_IN_MILLIONS,
	[RACCT_DATA] =
		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
	[RACCT_STACK] =
		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
	[RACCT_CORE] =
		RACCT_DENIABLE,
	[RACCT_RSS] =
		RACCT_RECLAIMABLE,
	[RACCT_MEMLOCK] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE,
	[RACCT_NPROC] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE,
	[RACCT_NOFILE] =
		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
	[RACCT_VMEM] =
		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
	[RACCT_NPTS] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_SWAP] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_NTHR] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE,
	[RACCT_MSGQQUEUED] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_MSGQSIZE] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_NMSGQ] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_NSEM] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_NSEMOP] =
		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
	[RACCT_NSHM] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_SHMSIZE] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_WALLCLOCK] =
		RACCT_IN_MILLIONS,
	[RACCT_PCTCPU] =
		RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS };

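/*
 * Decay factor for RACCT_DECAYING resources (currently only %cpu): on every
 * pass of racctd, the per-uid, per-loginclass and per-jail value of such a
 * resource is multiplied by RACCT_DECAY_FACTOR / FSCALE, i.e. only 30% of
 * the old value is retained.  racct_set() takes the same decay into account
 * when computing per-credential differences.
 */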
static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;

#ifdef SCHED_4BSD
/*
 * Contains intermediate values for %cpu calculations to avoid using floating
 * point in the kernel.
 * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20)
 * It is needed only for the 4BSD scheduler; in ULE, ccpu equals zero, so the
 * calculations are more straightforward.
 */
fixpt_t ccpu_exp[] = {
	[0] = FSCALE * 1,
	[1] = FSCALE * 0.95122942450071400909,
	[2] = FSCALE * 0.90483741803595957316,
	[3] = FSCALE * 0.86070797642505780722,
	[4] = FSCALE * 0.81873075307798185866,
	[5] = FSCALE * 0.77880078307140486824,
	[6] = FSCALE * 0.74081822068171786606,
	[7] = FSCALE * 0.70468808971871343435,
	[8] = FSCALE * 0.67032004603563930074,
	[9] = FSCALE * 0.63762815162177329314,
	[10] = FSCALE * 0.60653065971263342360,
	[11] = FSCALE * 0.57694981038048669531,
	[12] = FSCALE * 0.54881163609402643262,
	[13] = FSCALE * 0.52204577676101604789,
	[14] = FSCALE * 0.49658530379140951470,
	[15] = FSCALE * 0.47236655274101470713,
	[16] = FSCALE * 0.44932896411722159143,
	[17] = FSCALE * 0.42741493194872666992,
	[18] = FSCALE * 0.40656965974059911188,
	[19] = FSCALE * 0.38674102345450120691,
	[20] = FSCALE * 0.36787944117144232159,
	[21] = FSCALE * 0.34993774911115535467,
	[22] = FSCALE * 0.33287108369807955328,
	[23] = FSCALE * 0.31663676937905321821,
	[24] = FSCALE * 0.30119421191220209664,
	[25] = FSCALE * 0.28650479686019010032,
	[26] = FSCALE * 0.27253179303401260312,
	[27] = FSCALE * 0.25924026064589150757,
	[28] = FSCALE * 0.24659696394160647693,
	[29] = FSCALE * 0.23457028809379765313,
	[30] = FSCALE * 0.22313016014842982893,
	[31] = FSCALE * 0.21224797382674305771,
	[32] = FSCALE * 0.20189651799465540848,
	[33] = FSCALE * 0.19204990862075411423,
	[34] = FSCALE * 0.18268352405273465022,
	[35] = FSCALE * 0.17377394345044512668,
	[36] = FSCALE * 0.16529888822158653829,
	[37] = FSCALE * 0.15723716631362761621,
	[38] = FSCALE * 0.14956861922263505264,
	[39] = FSCALE * 0.14227407158651357185,
	[40] = FSCALE * 0.13533528323661269189,
	[41] = FSCALE * 0.12873490358780421886,
	[42] = FSCALE * 0.12245642825298191021,
	[43] = FSCALE * 0.11648415777349695786,
	[44] = FSCALE * 0.11080315836233388333,
	[45] = FSCALE * 0.10539922456186433678,
	[46] = FSCALE * 0.10025884372280373372,
	[47] = FSCALE * 0.09536916221554961888,
	[48] = FSCALE * 0.09071795328941250337,
	[49] = FSCALE * 0.08629358649937051097,
	[50] = FSCALE * 0.08208499862389879516,
	[51] = FSCALE * 0.07808166600115315231,
	[52] = FSCALE * 0.07427357821433388042,
	[53] = FSCALE * 0.07065121306042958674,
	[54] = FSCALE * 0.06720551273974976512,
	[55] = FSCALE * 0.06392786120670757270,
	[56] = FSCALE * 0.06081006262521796499,
	[57] = FSCALE * 0.05784432087483846296,
	[58] = FSCALE * 0.05502322005640722902,
	[59] = FSCALE * 0.05233970594843239308,
	[60] = FSCALE * 0.04978706836786394297,
	[61] = FSCALE * 0.04735892439114092119,
	[62] = FSCALE * 0.04504920239355780606,
	[63] = FSCALE * 0.04285212686704017991,
	[64] = FSCALE * 0.04076220397836621516,
	[65] = FSCALE * 0.03877420783172200988,
	[66] = FSCALE * 0.03688316740124000544,
	[67] = FSCALE * 0.03508435410084502588,
	[68] = FSCALE * 0.03337326996032607948,
	[69] = FSCALE * 0.03174563637806794323,
	[70] = FSCALE * 0.03019738342231850073,
	[71] = FSCALE * 0.02872463965423942912,
	[72] = FSCALE * 0.02732372244729256080,
	[73] = FSCALE * 0.02599112877875534358,
	[74] = FSCALE * 0.02472352647033939120,
	[75] = FSCALE * 0.02351774585600910823,
	[76] = FSCALE * 0.02237077185616559577,
	[77] = FSCALE * 0.02127973643837716938,
	[78] = FSCALE * 0.02024191144580438847,
	[79] = FSCALE * 0.01925470177538692429,
	[80] = FSCALE * 0.01831563888873418029,
	[81] = FSCALE * 0.01742237463949351138,
	[82] = FSCALE * 0.01657267540176124754,
	[83] = FSCALE * 0.01576441648485449082,
	[84] = FSCALE * 0.01499557682047770621,
	[85] = FSCALE * 0.01426423390899925527,
	[86] = FSCALE * 0.01356855901220093175,
	[87] = FSCALE * 0.01290681258047986886,
	[88] = FSCALE * 0.01227733990306844117,
	[89] = FSCALE * 0.01167856697039544521,
	[90] = FSCALE * 0.01110899653824230649,
	[91] = FSCALE * 0.01056720438385265337,
	[92] = FSCALE * 0.01005183574463358164,
	[93] = FSCALE * 0.00956160193054350793,
	[94] = FSCALE * 0.00909527710169581709,
	[95] = FSCALE * 0.00865169520312063417,
	[96] = FSCALE * 0.00822974704902002884,
	[97] = FSCALE * 0.00782837754922577143,
	[98] = FSCALE * 0.00744658307092434051,
	[99] = FSCALE * 0.00708340892905212004,
	[100] = FSCALE * 0.00673794699908546709,
	[101] = FSCALE * 0.00640933344625638184,
	[102] = FSCALE * 0.00609674656551563610,
	[103] = FSCALE * 0.00579940472684214321,
	[104] = FSCALE * 0.00551656442076077241,
	[105] = FSCALE * 0.00524751839918138427,
	[106] = FSCALE * 0.00499159390691021621,
	[107] = FSCALE * 0.00474815099941147558,
	[108] = FSCALE * 0.00451658094261266798,
	[109] = FSCALE * 0.00429630469075234057,
	[110] = FSCALE * 0.00408677143846406699,
};
#endif

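/*
 * Highest valid index into ccpu_exp[].  For processes that have been
 * swapped in for longer than this many seconds, the 4BSD correction factor
 * becomes negligible and plain FSCALE scaling is used instead.
 */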
#define	CCPU_EXP_MAX	110

/*
 * This function is analogous to the getpcpu() function in the ps(1) command.
 * They should both calculate in the same way so that the racct %cpu
 * calculations are consistent with the values shown by the ps(1) tool.
 * The calculations are more complex in the 4BSD scheduler because of the
 * value of the ccpu variable.  In ULE it is defined to be zero, which saves
 * us some work.  The returned value is the %cpu multiplied by 10^6, since
 * RACCT_PCTCPU is flagged RACCT_IN_MILLIONS.
 */
static uint64_t
racct_getpcpu(struct proc *p, u_int pcpu)
{
	u_int swtime;
#ifdef SCHED_4BSD
	fixpt_t pctcpu, pctcpu_next;
#endif
#ifdef SMP
	struct pcpu *pc;
	int found;
#endif
	fixpt_t p_pctcpu;
	struct thread *td;

	/*
	 * If the process is swapped out, we count its %cpu usage as zero.
	 * This behaviour is consistent with the userland ps(1) tool.
	 */
	if ((p->p_flag & P_INMEM) == 0)
		return (0);
	swtime = (ticks - p->p_swtick) / hz;

	/*
	 * For short-lived processes, sched_pctcpu() returns small values
	 * even for cpu-intensive processes.  Therefore we use our own
	 * estimate in this case.
	 */
	if (swtime < RACCT_PCPU_SECS)
		return (pcpu);

	p_pctcpu = 0;
	FOREACH_THREAD_IN_PROC(p, td) {
		if (td == PCPU_GET(idlethread))
			continue;
#ifdef SMP
		found = 0;
		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
			if (td == pc->pc_idlethread) {
				found = 1;
				break;
			}
		}
		if (found)
			continue;
#endif
		thread_lock(td);
#ifdef SCHED_4BSD
		pctcpu = sched_pctcpu(td);
		/* Also count the second that has not finished yet. */
		pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT;
		pctcpu_next += sched_pctcpu_delta(td);
		p_pctcpu += max(pctcpu, pctcpu_next);
#else
		/*
		 * In ULE the %cpu statistics are updated on every
		 * sched_pctcpu() call.  So special calculations to
		 * account for the latest (unfinished) second are
		 * not needed.
		 */
		p_pctcpu += sched_pctcpu(td);
#endif
		thread_unlock(td);
	}

#ifdef SCHED_4BSD
	if (swtime <= CCPU_EXP_MAX)
		return ((100 * (uint64_t)p_pctcpu * 1000000) /
		    (FSCALE - ccpu_exp[swtime]));
#endif

	return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE);
}

static void
racct_add_racct(struct racct *dest, const struct racct *src)
{
	int i;

	mtx_assert(&racct_lock, MA_OWNED);

	/*
	 * Update resource usage in dest.
	 */
	for (i = 0; i <= RACCT_MAX; i++) {
		KASSERT(dest->r_resources[i] >= 0,
		    ("%s: resource %d propagation meltdown: dest < 0",
		    __func__, i));
		KASSERT(src->r_resources[i] >= 0,
		    ("%s: resource %d propagation meltdown: src < 0",
		    __func__, i));
		dest->r_resources[i] += src->r_resources[i];
	}
}

static void
racct_sub_racct(struct racct *dest, const struct racct *src)
{
	int i;

	mtx_assert(&racct_lock, MA_OWNED);

	/*
	 * Update resource usage in dest.
	 */
	for (i = 0; i <= RACCT_MAX; i++) {
		if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) {
			KASSERT(dest->r_resources[i] >= 0,
			    ("%s: resource %d propagation meltdown: dest < 0",
			    __func__, i));
			KASSERT(src->r_resources[i] >= 0,
			    ("%s: resource %d propagation meltdown: src < 0",
			    __func__, i));
			KASSERT(src->r_resources[i] <= dest->r_resources[i],
			    ("%s: resource %d propagation meltdown: src > dest",
			    __func__, i));
		}
		if (RACCT_CAN_DROP(i)) {
			dest->r_resources[i] -= src->r_resources[i];
			if (dest->r_resources[i] < 0) {
				KASSERT(RACCT_IS_SLOPPY(i) ||
				    RACCT_IS_DECAYING(i),
				    ("%s: resource %d usage < 0", __func__, i));
				dest->r_resources[i] = 0;
			}
		}
	}
}

void
racct_create(struct racct **racctp)
{

	SDT_PROBE(racct, kernel, racct, create, racctp, 0, 0, 0, 0);

	KASSERT(*racctp == NULL, ("racct already allocated"));

	*racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO);
}

static void
racct_destroy_locked(struct racct **racctp)
{
	int i;
	struct racct *racct;

	SDT_PROBE(racct, kernel, racct, destroy, racctp, 0, 0, 0, 0);

	mtx_assert(&racct_lock, MA_OWNED);
	KASSERT(racctp != NULL, ("NULL racctp"));
	KASSERT(*racctp != NULL, ("NULL racct"));

	racct = *racctp;

	for (i = 0; i <= RACCT_MAX; i++) {
		if (RACCT_IS_SLOPPY(i))
			continue;
		if (!RACCT_IS_RECLAIMABLE(i))
			continue;
		KASSERT(racct->r_resources[i] == 0,
		    ("destroying non-empty racct: "
		    "%ju allocated for resource %d\n",
		    racct->r_resources[i], i));
	}
	uma_zfree(racct_zone, racct);
	*racctp = NULL;
}

void
racct_destroy(struct racct **racct)
{

	mtx_lock(&racct_lock);
	racct_destroy_locked(racct);
	mtx_unlock(&racct_lock);
}

/*
 * Increase consumption of 'resource' by 'amount' for 'racct'.
 * Unlike in other cases, 'amount' here may be less than zero.
 */
static void
racct_alloc_resource(struct racct *racct, int resource,
    uint64_t amount)
{

	mtx_assert(&racct_lock, MA_OWNED);
	KASSERT(racct != NULL, ("NULL racct"));

	racct->r_resources[resource] += amount;
	if (racct->r_resources[resource] < 0) {
		KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource),
		    ("%s: resource %d usage < 0", __func__, resource));
		racct->r_resources[resource] = 0;
	}

	/*
	 * There are some cases where the racct %cpu resource would grow
	 * beyond 100%.  For example, in racct_proc_exit() we add the
	 * terminating process's %cpu usage to the ucred racct containers;
	 * if too many processes terminate in a short time span, the ucred
	 * %cpu resource could grow too much.  Also, the 4BSD scheduler
	 * sometimes reports more than 100% cpu usage for a thread.  So we
	 * cap the value at 100% here.
	 */
	if ((resource == RACCT_PCTCPU) &&
	    (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000))
		racct->r_resources[RACCT_PCTCPU] = 100 * 1000000;
}

static int
racct_add_locked(struct proc *p, int resource, uint64_t amount)
{
#ifdef RCTL
	int error;
#endif

	SDT_PROBE(racct, kernel, rusage, add, p, resource, amount, 0, 0);

	/*
	 * We need proc lock to dereference p->p_ucred.
	 */
	PROC_LOCK_ASSERT(p, MA_OWNED);

#ifdef RCTL
	error = rctl_enforce(p, resource, amount);
	if (error && RACCT_IS_DENIABLE(resource)) {
		SDT_PROBE(racct, kernel, rusage, add__failure, p, resource,
		    amount, 0, 0);
		return (error);
	}
#endif
	racct_alloc_resource(p->p_racct, resource, amount);
	racct_add_cred_locked(p->p_ucred, resource, amount);

	return (0);
}

/*
 * Increase allocation of 'resource' by 'amount' for process 'p'.
 * Return 0 if it's below limits, or errno, if it's not.
 */
int
racct_add(struct proc *p, int resource, uint64_t amount)
{
	int error;

	mtx_lock(&racct_lock);
	error = racct_add_locked(p, resource, amount);
	mtx_unlock(&racct_lock);
	return (error);
}

static void
racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount)
{
	struct prison *pr;

	SDT_PROBE(racct, kernel, rusage, add__cred, cred, resource, amount,
	    0, 0);

	racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, amount);
	for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
		racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource,
		    amount);
	racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, amount);
}

/*
 * Increase allocation of 'resource' by 'amount' for credential 'cred'.
 * Doesn't check for limits and never fails.
 *
 * XXX: Shouldn't this ever return an error?
 */
void
racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
{

	mtx_lock(&racct_lock);
	racct_add_cred_locked(cred, resource, amount);
	mtx_unlock(&racct_lock);
}

/*
 * Increase allocation of 'resource' by 'amount' for process 'p'.
 * Doesn't check for limits and never fails.
 */
void
racct_add_force(struct proc *p, int resource, uint64_t amount)
{

	SDT_PROBE(racct, kernel, rusage, add__force, p, resource, amount, 0, 0);

	/*
	 * We need proc lock to dereference p->p_ucred.
	 */
	PROC_LOCK_ASSERT(p, MA_OWNED);

	mtx_lock(&racct_lock);
	racct_alloc_resource(p->p_racct, resource, amount);
	mtx_unlock(&racct_lock);
	racct_add_cred(p->p_ucred, resource, amount);
}

static int
racct_set_locked(struct proc *p, int resource, uint64_t amount)
{
	int64_t old_amount, decayed_amount;
	int64_t diff_proc, diff_cred;
#ifdef RCTL
	int error;
#endif

	SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0);

	/*
	 * We need proc lock to dereference p->p_ucred.
	 */
	PROC_LOCK_ASSERT(p, MA_OWNED);

	old_amount = p->p_racct->r_resources[resource];
	/*
	 * The diffs may be negative.
	 */
	diff_proc = amount - old_amount;
	if (RACCT_IS_DECAYING(resource)) {
		/*
		 * Resources in per-credential racct containers may decay.
		 * If this is the case, we need to calculate the difference
		 * between the new amount and the proportional value of the
		 * old amount that has decayed in the ucred racct containers.
		 */
		decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
		diff_cred = amount - decayed_amount;
	} else
		diff_cred = diff_proc;
#ifdef notyet
	KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
	    ("%s: usage of non-droppable resource %d dropping", __func__,
	     resource));
#endif
#ifdef RCTL
	if (diff_proc > 0) {
		error = rctl_enforce(p, resource, diff_proc);
		if (error && RACCT_IS_DENIABLE(resource)) {
			SDT_PROBE(racct, kernel, rusage, set__failure, p,
			    resource, amount, 0, 0);
			return (error);
		}
	}
#endif
	racct_alloc_resource(p->p_racct, resource, diff_proc);
	if (diff_cred > 0)
		racct_add_cred_locked(p->p_ucred, resource, diff_cred);
	else if (diff_cred < 0)
		racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);

	return (0);
}

/*
 * Set allocation of 'resource' to 'amount' for process 'p'.
 * Return 0 if it's below limits, or errno, if it's not.
 *
 * Note that decreasing the allocation always returns 0,
 * even if it's above the limit.
 */
int
racct_set(struct proc *p, int resource, uint64_t amount)
{
	int error;

	mtx_lock(&racct_lock);
	error = racct_set_locked(p, resource, amount);
	mtx_unlock(&racct_lock);
	return (error);
}

static void
racct_set_force_locked(struct proc *p, int resource, uint64_t amount)
{
	int64_t old_amount, decayed_amount;
	int64_t diff_proc, diff_cred;

	SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0);

	/*
	 * We need proc lock to dereference p->p_ucred.
	 */
	PROC_LOCK_ASSERT(p, MA_OWNED);

	old_amount = p->p_racct->r_resources[resource];
	/*
	 * The diffs may be negative.
	 */
	diff_proc = amount - old_amount;
	if (RACCT_IS_DECAYING(resource)) {
		/*
		 * Resources in per-credential racct containers may decay.
		 * If this is the case, we need to calculate the difference
		 * between the new amount and the proportional value of the
		 * old amount that has decayed in the ucred racct containers.
		 */
		decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
		diff_cred = amount - decayed_amount;
	} else
		diff_cred = diff_proc;

	racct_alloc_resource(p->p_racct, resource, diff_proc);
	if (diff_cred > 0)
		racct_add_cred_locked(p->p_ucred, resource, diff_cred);
	else if (diff_cred < 0)
		racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
}

void
racct_set_force(struct proc *p, int resource, uint64_t amount)
{
	mtx_lock(&racct_lock);
	racct_set_force_locked(p, resource, amount);
	mtx_unlock(&racct_lock);
}

/*
 * Returns amount of 'resource' the process 'p' can keep allocated.
 * Allocating more than that would be denied, unless the resource
 * is marked undeniable.  Amount of already allocated resource does
 * not matter.
 */
uint64_t
racct_get_limit(struct proc *p, int resource)
{

#ifdef RCTL
	return (rctl_get_limit(p, resource));
#else
	return (UINT64_MAX);
#endif
}

/*
 * Returns amount of 'resource' the process 'p' can keep allocated.
 * Allocating more than that would be denied, unless the resource
 * is marked undeniable.  Amount of already allocated resource does
 * matter.
 */
uint64_t
racct_get_available(struct proc *p, int resource)
{

#ifdef RCTL
	return (rctl_get_available(p, resource));
#else
	return (UINT64_MAX);
#endif
}

/*
 * Returns amount of the %cpu resource that process 'p' can add to its %cpu
 * utilization.  Adding more than that would lead to the process being
 * throttled.
 */
static int64_t
racct_pcpu_available(struct proc *p)
{

#ifdef RCTL
	return (rctl_pcpu_available(p));
#else
	return (INT64_MAX);
#endif
}

/*
 * Decrease allocation of 'resource' by 'amount' for process 'p'.
 */
void
racct_sub(struct proc *p, int resource, uint64_t amount)
{

	SDT_PROBE(racct, kernel, rusage, sub, p, resource, amount, 0, 0);

	/*
	 * We need proc lock to dereference p->p_ucred.
	 */
	PROC_LOCK_ASSERT(p, MA_OWNED);
	KASSERT(RACCT_CAN_DROP(resource),
	    ("%s: called for non-droppable resource %d", __func__, resource));

	mtx_lock(&racct_lock);
	KASSERT(amount <= p->p_racct->r_resources[resource],
	    ("%s: freeing %ju of resource %d, which is more "
	     "than allocated %jd for %s (pid %d)", __func__, amount, resource,
	    (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid));

	racct_alloc_resource(p->p_racct, resource, -amount);
	racct_sub_cred_locked(p->p_ucred, resource, amount);
	mtx_unlock(&racct_lock);
}

static void
racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount)
{
	struct prison *pr;

	SDT_PROBE(racct, kernel, rusage, sub__cred, cred, resource, amount,
	    0, 0);

#ifdef notyet
	KASSERT(RACCT_CAN_DROP(resource),
	    ("%s: called for resource %d which can not drop", __func__,
	     resource));
#endif

	racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, -amount);
	for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
		racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource,
		    -amount);
	racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, -amount);
}

/*
 * Decrease allocation of 'resource' by 'amount' for credential 'cred'.
 */
void
racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
{

	mtx_lock(&racct_lock);
	racct_sub_cred_locked(cred, resource, amount);
	mtx_unlock(&racct_lock);
}

/*
 * Inherit resource usage information from the parent process.
 */
int
racct_proc_fork(struct proc *parent, struct proc *child)
{
	int i, error = 0;

	/*
	 * Create racct for the child process.
	 */
	racct_create(&child->p_racct);

	PROC_LOCK(parent);
	PROC_LOCK(child);
	mtx_lock(&racct_lock);

#ifdef RCTL
	error = rctl_proc_fork(parent, child);
	if (error != 0)
		goto out;
#endif

	/* Init process cpu time. */
	child->p_prev_runtime = 0;
	child->p_throttled = 0;

	/*
	 * Inherit resource usage.
	 */
	for (i = 0; i <= RACCT_MAX; i++) {
		if (parent->p_racct->r_resources[i] == 0 ||
		    !RACCT_IS_INHERITABLE(i))
			continue;

		error = racct_set_locked(child, i,
		    parent->p_racct->r_resources[i]);
		if (error != 0)
			goto out;
	}

	error = racct_add_locked(child, RACCT_NPROC, 1);
	error += racct_add_locked(child, RACCT_NTHR, 1);

out:
	mtx_unlock(&racct_lock);
	PROC_UNLOCK(child);
	PROC_UNLOCK(parent);

	if (error != 0)
		racct_proc_exit(child);

	return (error);
}

/*
 * Called at the end of fork1(), to handle rules that require the process
 * to be fully initialized.
 */
void
racct_proc_fork_done(struct proc *child)
{

#ifdef RCTL
	PROC_LOCK(child);
	mtx_lock(&racct_lock);
	rctl_enforce(child, RACCT_NPROC, 0);
	rctl_enforce(child, RACCT_NTHR, 0);
	mtx_unlock(&racct_lock);
	PROC_UNLOCK(child);
#endif
}

void
racct_proc_exit(struct proc *p)
{
	int i;
	uint64_t runtime;
	struct timeval wallclock;
	uint64_t pct_estimate, pct;

	PROC_LOCK(p);
	/*
	 * We don't need to calculate rux; proc_reap() has already done this.
	 */
	runtime = cputick2usec(p->p_rux.rux_runtime);
#ifdef notyet
	KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime"));
#else
	if (runtime < p->p_prev_runtime)
		runtime = p->p_prev_runtime;
#endif
	microuptime(&wallclock);
	timevalsub(&wallclock, &p->p_stats->p_start);
	if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
		pct_estimate = (1000000 * runtime * 100) /
		    ((uint64_t)wallclock.tv_sec * 1000000 +
		    wallclock.tv_usec);
	} else
		pct_estimate = 0;
	pct = racct_getpcpu(p, pct_estimate);

	mtx_lock(&racct_lock);
	racct_set_locked(p, RACCT_CPU, runtime);
	racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);

	for (i = 0; i <= RACCT_MAX; i++) {
		if (p->p_racct->r_resources[i] == 0)
			continue;
		if (!RACCT_IS_RECLAIMABLE(i))
			continue;
		racct_set_locked(p, i, 0);
	}

	mtx_unlock(&racct_lock);
	PROC_UNLOCK(p);

#ifdef RCTL
	rctl_racct_release(p->p_racct);
#endif
	racct_destroy(&p->p_racct);
}

/*
 * Called after credentials change, to move resource utilisation
 * between raccts.
 */
void
racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
    struct ucred *newcred)
{
	struct uidinfo *olduip, *newuip;
	struct loginclass *oldlc, *newlc;
	struct prison *oldpr, *newpr, *pr;

	PROC_LOCK_ASSERT(p, MA_NOTOWNED);

	newuip = newcred->cr_ruidinfo;
	olduip = oldcred->cr_ruidinfo;
	newlc = newcred->cr_loginclass;
	oldlc = oldcred->cr_loginclass;
	newpr = newcred->cr_prison;
	oldpr = oldcred->cr_prison;

	mtx_lock(&racct_lock);
	if (newuip != olduip) {
		racct_sub_racct(olduip->ui_racct, p->p_racct);
		racct_add_racct(newuip->ui_racct, p->p_racct);
	}
	if (newlc != oldlc) {
		racct_sub_racct(oldlc->lc_racct, p->p_racct);
		racct_add_racct(newlc->lc_racct, p->p_racct);
	}
	if (newpr != oldpr) {
		for (pr = oldpr; pr != NULL; pr = pr->pr_parent)
			racct_sub_racct(pr->pr_prison_racct->prr_racct,
			    p->p_racct);
		for (pr = newpr; pr != NULL; pr = pr->pr_parent)
			racct_add_racct(pr->pr_prison_racct->prr_racct,
			    p->p_racct);
	}
	mtx_unlock(&racct_lock);

#ifdef RCTL
	rctl_proc_ucred_changed(p, newcred);
#endif
}

void
racct_move(struct racct *dest, struct racct *src)
{

	mtx_lock(&racct_lock);

	racct_add_racct(dest, src);
	racct_sub_racct(src, src);

	mtx_unlock(&racct_lock);
}

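/*
 * Mark the process as throttled and ask all of its threads to reschedule,
 * so that a process which exceeded its %cpu limit is taken off the cpu as
 * soon as possible.
 */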
static void
racct_proc_throttle(struct proc *p)
{
	struct thread *td;
#ifdef SMP
	int cpuid;
#endif

	PROC_LOCK_ASSERT(p, MA_OWNED);

	/*
	 * Do not block kernel processes.  Also do not block processes with
	 * low %cpu utilization to improve interactivity.
	 */
	if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) ||
	    (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold))
		return;
	p->p_throttled = 1;

	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		switch (td->td_state) {
		case TDS_RUNQ:
			/*
			 * If the thread is on the scheduler run-queue, we
			 * cannot just remove it from there.  So we set the
			 * TDF_NEEDRESCHED flag for the thread, so that once
			 * it is running, it is taken off the cpu as soon
			 * as possible.
			 */
			td->td_flags |= TDF_NEEDRESCHED;
			break;
		case TDS_RUNNING:
			/*
			 * If the thread is running, we request a context
			 * switch for it by setting the TDF_NEEDRESCHED flag.
			 */
			td->td_flags |= TDF_NEEDRESCHED;
#ifdef SMP
			cpuid = td->td_oncpu;
			if ((cpuid != NOCPU) && (td != curthread))
				ipi_cpu(cpuid, IPI_AST);
#endif
			break;
		default:
			break;
		}
		thread_unlock(td);
	}
}

static void
racct_proc_wakeup(struct proc *p)
{
	PROC_LOCK_ASSERT(p, MA_OWNED);

	if (p->p_throttled) {
		p->p_throttled = 0;
		wakeup(p->p_racct);
	}
}

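/*
 * Decay a single resource in the given racct container; used as a callback
 * for the per-uidinfo, per-loginclass and per-prison iterators below.
 */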
static void
racct_decay_resource(struct racct *racct, void *res, void *dummy)
{
	int resource;
	int64_t r_old, r_new;

	resource = *(int *)res;
	r_old = racct->r_resources[resource];

	/* If there is nothing to decay, just exit. */
	if (r_old <= 0)
		return;

	mtx_lock(&racct_lock);
	r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
	racct->r_resources[resource] = r_new;
	mtx_unlock(&racct_lock);
}

static void
racct_decay(int resource)
{
	ui_racct_foreach(racct_decay_resource, &resource, NULL);
	loginclass_racct_foreach(racct_decay_resource, &resource, NULL);
	prison_racct_foreach(racct_decay_resource, &resource, NULL);
}

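/*
 * Main loop of the racctd kernel thread.  Roughly once per second it decays
 * the %cpu resource, recomputes the cpu time, wall clock time and %cpu usage
 * of every process, and then throttles or wakes up processes according to
 * the amount of %cpu they still have available.
 */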
static void
racctd(void)
{
	struct thread *td;
	struct proc *p;
	struct timeval wallclock;
	uint64_t runtime;
	uint64_t pct, pct_estimate;

	for (;;) {
		racct_decay(RACCT_PCTCPU);

		sx_slock(&allproc_lock);

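		/*
		 * Zombie processes no longer compete for cpu time, so drop
		 * their %cpu usage to zero.
		 */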
		LIST_FOREACH(p, &zombproc, p_list) {
			PROC_LOCK(p);
			racct_set(p, RACCT_PCTCPU, 0);
			PROC_UNLOCK(p);
		}

		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			if (p->p_state != PRS_NORMAL) {
				PROC_UNLOCK(p);
				continue;
			}

			microuptime(&wallclock);
			timevalsub(&wallclock, &p->p_stats->p_start);
			PROC_SLOCK(p);
			FOREACH_THREAD_IN_PROC(p, td)
				ruxagg(p, td);
			runtime = cputick2usec(p->p_rux.rux_runtime);
			PROC_SUNLOCK(p);
#ifdef notyet
			KASSERT(runtime >= p->p_prev_runtime,
			    ("runtime < p_prev_runtime"));
#else
			if (runtime < p->p_prev_runtime)
				runtime = p->p_prev_runtime;
#endif
			p->p_prev_runtime = runtime;
			if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
				pct_estimate = (1000000 * runtime * 100) /
				    ((uint64_t)wallclock.tv_sec * 1000000 +
				    wallclock.tv_usec);
			} else
				pct_estimate = 0;
			pct = racct_getpcpu(p, pct_estimate);
			mtx_lock(&racct_lock);
			racct_set_force_locked(p, RACCT_PCTCPU, pct);
			racct_set_locked(p, RACCT_CPU, runtime);
			racct_set_locked(p, RACCT_WALLCLOCK,
			    (uint64_t)wallclock.tv_sec * 1000000 +
			    wallclock.tv_usec);
			mtx_unlock(&racct_lock);
			PROC_UNLOCK(p);
		}

		/*
		 * To ensure that processes are throttled in a fair way, we
		 * need to iterate over all processes again, and check the
		 * limits for the %cpu resource only after the ucred racct
		 * containers have been properly filled.
		 */
		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			if (p->p_state != PRS_NORMAL) {
				PROC_UNLOCK(p);
				continue;
			}

			if (racct_pcpu_available(p) <= 0)
				racct_proc_throttle(p);
			else if (p->p_throttled)
				racct_proc_wakeup(p);
			PROC_UNLOCK(p);
		}
		sx_sunlock(&allproc_lock);
		pause("-", hz);
	}
}

static struct kproc_desc racctd_kp = {
	"racctd",
	racctd,
	NULL
};
SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, kproc_start, &racctd_kp);

static void
racct_init(void)
{

	racct_zone = uma_zcreate("racct", sizeof(struct racct),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	/*
	 * XXX: Move this somewhere.
	 */
	prison0.pr_prison_racct = prison_racct_find("0");
}
SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL);

#else /* !RACCT */
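/*
 * Stubs used when the kernel is built without "options RACCT"; resource
 * accounting is compiled out and these calls become no-ops.
 */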

int
racct_add(struct proc *p, int resource, uint64_t amount)
{

	return (0);
}

void
racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
{
}

void
racct_add_force(struct proc *p, int resource, uint64_t amount)
{

	return;
}

int
racct_set(struct proc *p, int resource, uint64_t amount)
{

	return (0);
}

void
racct_set_force(struct proc *p, int resource, uint64_t amount)
{
}

void
racct_sub(struct proc *p, int resource, uint64_t amount)
{
}

void
racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
{
}

uint64_t
racct_get_limit(struct proc *p, int resource)
{

	return (UINT64_MAX);
}

uint64_t
racct_get_available(struct proc *p, int resource)
{

	return (UINT64_MAX);
}

void
racct_create(struct racct **racctp)
{
}

void
racct_destroy(struct racct **racctp)
{
}

int
racct_proc_fork(struct proc *parent, struct proc *child)
{

	return (0);
}

void
racct_proc_fork_done(struct proc *child)
{
}

void
racct_proc_exit(struct proc *p)
{
}

#endif /* !RACCT */
