kern_racct.c revision 243088
1309124Sdim/*-
2193323Sed * Copyright (c) 2010 The FreeBSD Foundation
3193323Sed * All rights reserved.
4193323Sed *
5193323Sed * This software was developed by Edward Tomasz Napierala under sponsorship
6193323Sed * from the FreeBSD Foundation.
7193323Sed *
8193323Sed * Redistribution and use in source and binary forms, with or without
9193323Sed * modification, are permitted provided that the following conditions
10193323Sed * are met:
11193323Sed * 1. Redistributions of source code must retain the above copyright
12193323Sed *    notice, this list of conditions and the following disclaimer.
13193323Sed * 2. Redistributions in binary form must reproduce the above copyright
14193323Sed *    notice, this list of conditions and the following disclaimer in the
15193323Sed *    documentation and/or other materials provided with the distribution.
16193323Sed *
17276479Sdim * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18261991Sdim * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19276479Sdim * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20193323Sed * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21249423Sdim * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22249423Sdim * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23276479Sdim * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24249423Sdim * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25249423Sdim * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26288943Sdim * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27288943Sdim * SUCH DAMAGE.
28226633Sdim *
29288943Sdim * $FreeBSD: head/sys/kern/kern_racct.c 243088 2012-11-15 15:55:49Z trasz $
30224145Sdim */
31193323Sed
32288943Sdim#include <sys/cdefs.h>
33193323Sed__FBSDID("$FreeBSD: head/sys/kern/kern_racct.c 243088 2012-11-15 15:55:49Z trasz $");
34210299Sed
35210299Sed#include "opt_kdtrace.h"
36210299Sed#include "opt_sched.h"
37210299Sed
38226633Sdim#include <sys/param.h>
39210299Sed#include <sys/systm.h>
40210299Sed#include <sys/eventhandler.h>
41234353Sdim#include <sys/jail.h>
42234353Sdim#include <sys/kernel.h>
43234353Sdim#include <sys/kthread.h>
44234353Sdim#include <sys/lock.h>
45234353Sdim#include <sys/loginclass.h>
46234353Sdim#include <sys/malloc.h>
47239462Sdim#include <sys/mutex.h>
48234353Sdim#include <sys/proc.h>
49234353Sdim#include <sys/racct.h>
50234353Sdim#include <sys/resourcevar.h>
51234353Sdim#include <sys/sbuf.h>
52276479Sdim#include <sys/sched.h>
53234353Sdim#include <sys/sdt.h>
54210299Sed#include <sys/smp.h>
55276479Sdim#include <sys/sx.h>
56210299Sed#include <sys/sysctl.h>
57210299Sed#include <sys/sysent.h>
58210299Sed#include <sys/sysproto.h>
59234353Sdim#include <sys/umtx.h>
60234353Sdim#include <machine/smp.h>
61234353Sdim
62234353Sdim#ifdef RCTL
63210299Sed#include <sys/rctl.h>
64210299Sed#endif
65210299Sed
66296417Sdim#ifdef RACCT
67234353Sdim
68234353SdimFEATURE(racct, "Resource Accounting");
69210299Sed
70234353Sdim/*
71210299Sed * Do not block processes that have their %cpu usage <= pcpu_threshold.
72234353Sdim */
73234353Sdimstatic int pcpu_threshold = 1;
74210299Sed
75210299SedSYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting");
76210299SedSYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
77234353Sdim    0, "Processes with higher %cpu usage than this value can be throttled.");
78296417Sdim
79234353Sdim/*
80234353Sdim * How many seconds it takes to use the scheduler %cpu calculations.  When a
81234353Sdim * process starts, we compute its %cpu usage by dividing its runtime by the
82234353Sdim * process wall clock time.  After RACCT_PCPU_SECS pass, we use the value
83296417Sdim * provided by the scheduler.
84234353Sdim */
85234353Sdim#define RACCT_PCPU_SECS		3
86234353Sdim
87210299Sedstatic struct mtx racct_lock;
88210299SedMTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF);
89296417Sdim
90296417Sdimstatic uma_zone_t racct_zone;
91296417Sdim
92296417Sdimstatic void racct_sub_racct(struct racct *dest, const struct racct *src);
93296417Sdimstatic void racct_sub_cred_locked(struct ucred *cred, int resource,
94296417Sdim		uint64_t amount);
95296417Sdimstatic void racct_add_cred_locked(struct ucred *cred, int resource,
96296417Sdim		uint64_t amount);
97296417Sdim
98309124SdimSDT_PROVIDER_DEFINE(racct);
99309124SdimSDT_PROBE_DEFINE3(racct, kernel, rusage, add, add, "struct proc *", "int",
100309124Sdim    "uint64_t");
101309124SdimSDT_PROBE_DEFINE3(racct, kernel, rusage, add_failure, add-failure,
102309124Sdim    "struct proc *", "int", "uint64_t");
103309124SdimSDT_PROBE_DEFINE3(racct, kernel, rusage, add_cred, add-cred, "struct ucred *",
104296417Sdim    "int", "uint64_t");
105296417SdimSDT_PROBE_DEFINE3(racct, kernel, rusage, add_force, add-force, "struct proc *",
106296417Sdim    "int", "uint64_t");
107296417SdimSDT_PROBE_DEFINE3(racct, kernel, rusage, set, set, "struct proc *", "int",
108296417Sdim    "uint64_t");
109195340SedSDT_PROBE_DEFINE3(racct, kernel, rusage, set_failure, set-failure,
110195340Sed    "struct proc *", "int", "uint64_t");
111195340SedSDT_PROBE_DEFINE3(racct, kernel, rusage, sub, sub, "struct proc *", "int",
112226633Sdim    "uint64_t");
113195340SedSDT_PROBE_DEFINE3(racct, kernel, rusage, sub_cred, sub-cred, "struct ucred *",
114195340Sed    "int", "uint64_t");
115195340SedSDT_PROBE_DEFINE1(racct, kernel, racct, create, create, "struct racct *");
116195340SedSDT_PROBE_DEFINE1(racct, kernel, racct, destroy, destroy, "struct racct *");
117195340SedSDT_PROBE_DEFINE2(racct, kernel, racct, join, join, "struct racct *",
118195340Sed    "struct racct *");
119195340SedSDT_PROBE_DEFINE2(racct, kernel, racct, join_failure, join-failure,
120195340Sed    "struct racct *", "struct racct *");
121193323SedSDT_PROBE_DEFINE2(racct, kernel, racct, leave, leave, "struct racct *",
122234353Sdim    "struct racct *");
123234353Sdim
124234353Sdimint racct_types[] = {
125234353Sdim	[RACCT_CPU] =
126234353Sdim		RACCT_IN_MILLIONS,
127234353Sdim	[RACCT_DATA] =
128234353Sdim		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
129234353Sdim	[RACCT_STACK] =
130193323Sed		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
131195340Sed	[RACCT_CORE] =
132193323Sed		RACCT_DENIABLE,
133193323Sed	[RACCT_RSS] =
134193323Sed		RACCT_RECLAIMABLE,
135193323Sed	[RACCT_MEMLOCK] =
136193323Sed		RACCT_RECLAIMABLE | RACCT_DENIABLE,
137193323Sed	[RACCT_NPROC] =
138193323Sed		RACCT_RECLAIMABLE | RACCT_DENIABLE,
139193323Sed	[RACCT_NOFILE] =
140193323Sed		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
141193323Sed	[RACCT_VMEM] =
142193323Sed		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
143193323Sed	[RACCT_NPTS] =
144193323Sed		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
145193323Sed	[RACCT_SWAP] =
146193323Sed		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
147210299Sed	[RACCT_NTHR] =
148193323Sed		RACCT_RECLAIMABLE | RACCT_DENIABLE,
149195340Sed	[RACCT_MSGQQUEUED] =
150198090Srdivacky		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
151210299Sed	[RACCT_MSGQSIZE] =
152210299Sed		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
153193323Sed	[RACCT_NMSGQ] =
154210299Sed		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
155210299Sed	[RACCT_NSEM] =
156210299Sed		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
157210299Sed	[RACCT_NSEMOP] =
158296417Sdim		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
159210299Sed	[RACCT_NSHM] =
160210299Sed		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
161193323Sed	[RACCT_SHMSIZE] =
162193323Sed		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
163210299Sed	[RACCT_WALLCLOCK] =
164193323Sed		RACCT_IN_MILLIONS,
165296417Sdim	[RACCT_PCTCPU] =
166210299Sed		RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS };
167193323Sed
168193323Sedstatic const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
169193323Sed
#ifdef SCHED_4BSD
/*
 * Precomputed fixed-point powers used by the %cpu calculations, so that no
 * floating point is needed at run time:
 *
 *	ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20)
 *
 * Only the 4BSD scheduler needs the table; in ULE ccpu equals zero and the
 * calculations are more straightforward.
 */
fixpt_t ccpu_exp[] = {
	[0]   = FSCALE * 1,
	[1]   = FSCALE * 0.95122942450071400909,
	[2]   = FSCALE * 0.90483741803595957316,
	[3]   = FSCALE * 0.86070797642505780722,
	[4]   = FSCALE * 0.81873075307798185866,
	[5]   = FSCALE * 0.77880078307140486824,
	[6]   = FSCALE * 0.74081822068171786606,
	[7]   = FSCALE * 0.70468808971871343435,
	[8]   = FSCALE * 0.67032004603563930074,
	[9]   = FSCALE * 0.63762815162177329314,
	[10]  = FSCALE * 0.60653065971263342360,
	[11]  = FSCALE * 0.57694981038048669531,
	[12]  = FSCALE * 0.54881163609402643262,
	[13]  = FSCALE * 0.52204577676101604789,
	[14]  = FSCALE * 0.49658530379140951470,
	[15]  = FSCALE * 0.47236655274101470713,
	[16]  = FSCALE * 0.44932896411722159143,
	[17]  = FSCALE * 0.42741493194872666992,
	[18]  = FSCALE * 0.40656965974059911188,
	[19]  = FSCALE * 0.38674102345450120691,
	[20]  = FSCALE * 0.36787944117144232159,
	[21]  = FSCALE * 0.34993774911115535467,
	[22]  = FSCALE * 0.33287108369807955328,
	[23]  = FSCALE * 0.31663676937905321821,
	[24]  = FSCALE * 0.30119421191220209664,
	[25]  = FSCALE * 0.28650479686019010032,
	[26]  = FSCALE * 0.27253179303401260312,
	[27]  = FSCALE * 0.25924026064589150757,
	[28]  = FSCALE * 0.24659696394160647693,
	[29]  = FSCALE * 0.23457028809379765313,
	[30]  = FSCALE * 0.22313016014842982893,
	[31]  = FSCALE * 0.21224797382674305771,
	[32]  = FSCALE * 0.20189651799465540848,
	[33]  = FSCALE * 0.19204990862075411423,
	[34]  = FSCALE * 0.18268352405273465022,
	[35]  = FSCALE * 0.17377394345044512668,
	[36]  = FSCALE * 0.16529888822158653829,
	[37]  = FSCALE * 0.15723716631362761621,
	[38]  = FSCALE * 0.14956861922263505264,
	[39]  = FSCALE * 0.14227407158651357185,
	[40]  = FSCALE * 0.13533528323661269189,
	[41]  = FSCALE * 0.12873490358780421886,
	[42]  = FSCALE * 0.12245642825298191021,
	[43]  = FSCALE * 0.11648415777349695786,
	[44]  = FSCALE * 0.11080315836233388333,
	[45]  = FSCALE * 0.10539922456186433678,
	[46]  = FSCALE * 0.10025884372280373372,
	[47]  = FSCALE * 0.09536916221554961888,
	[48]  = FSCALE * 0.09071795328941250337,
	[49]  = FSCALE * 0.08629358649937051097,
	[50]  = FSCALE * 0.08208499862389879516,
	[51]  = FSCALE * 0.07808166600115315231,
	[52]  = FSCALE * 0.07427357821433388042,
	[53]  = FSCALE * 0.07065121306042958674,
	[54]  = FSCALE * 0.06720551273974976512,
	[55]  = FSCALE * 0.06392786120670757270,
	[56]  = FSCALE * 0.06081006262521796499,
	[57]  = FSCALE * 0.05784432087483846296,
	[58]  = FSCALE * 0.05502322005640722902,
	[59]  = FSCALE * 0.05233970594843239308,
	[60]  = FSCALE * 0.04978706836786394297,
	[61]  = FSCALE * 0.04735892439114092119,
	[62]  = FSCALE * 0.04504920239355780606,
	[63]  = FSCALE * 0.04285212686704017991,
	[64]  = FSCALE * 0.04076220397836621516,
	[65]  = FSCALE * 0.03877420783172200988,
	[66]  = FSCALE * 0.03688316740124000544,
	[67]  = FSCALE * 0.03508435410084502588,
	[68]  = FSCALE * 0.03337326996032607948,
	[69]  = FSCALE * 0.03174563637806794323,
	[70]  = FSCALE * 0.03019738342231850073,
	[71]  = FSCALE * 0.02872463965423942912,
	[72]  = FSCALE * 0.02732372244729256080,
	[73]  = FSCALE * 0.02599112877875534358,
	[74]  = FSCALE * 0.02472352647033939120,
	[75]  = FSCALE * 0.02351774585600910823,
	[76]  = FSCALE * 0.02237077185616559577,
	[77]  = FSCALE * 0.02127973643837716938,
	[78]  = FSCALE * 0.02024191144580438847,
	[79]  = FSCALE * 0.01925470177538692429,
	[80]  = FSCALE * 0.01831563888873418029,
	[81]  = FSCALE * 0.01742237463949351138,
	[82]  = FSCALE * 0.01657267540176124754,
	[83]  = FSCALE * 0.01576441648485449082,
	[84]  = FSCALE * 0.01499557682047770621,
	[85]  = FSCALE * 0.01426423390899925527,
	[86]  = FSCALE * 0.01356855901220093175,
	[87]  = FSCALE * 0.01290681258047986886,
	[88]  = FSCALE * 0.01227733990306844117,
	[89]  = FSCALE * 0.01167856697039544521,
	[90]  = FSCALE * 0.01110899653824230649,
	[91]  = FSCALE * 0.01056720438385265337,
	[92]  = FSCALE * 0.01005183574463358164,
	[93]  = FSCALE * 0.00956160193054350793,
	[94]  = FSCALE * 0.00909527710169581709,
	[95]  = FSCALE * 0.00865169520312063417,
	[96]  = FSCALE * 0.00822974704902002884,
	[97]  = FSCALE * 0.00782837754922577143,
	[98]  = FSCALE * 0.00744658307092434051,
	[99]  = FSCALE * 0.00708340892905212004,
	[100] = FSCALE * 0.00673794699908546709,
	[101] = FSCALE * 0.00640933344625638184,
	[102] = FSCALE * 0.00609674656551563610,
	[103] = FSCALE * 0.00579940472684214321,
	[104] = FSCALE * 0.00551656442076077241,
	[105] = FSCALE * 0.00524751839918138427,
	[106] = FSCALE * 0.00499159390691021621,
	[107] = FSCALE * 0.00474815099941147558,
	[108] = FSCALE * 0.00451658094261266798,
	[109] = FSCALE * 0.00429630469075234057,
	[110] = FSCALE * 0.00408677143846406699,
};
#endif

/* Highest valid index into ccpu_exp[]; keep in sync with the table. */
#define	CCPU_EXP_MAX	110
294193323Sed
295193323Sed/*
296193323Sed * This function is analogical to the getpcpu() function in the ps(1) command.
297198090Srdivacky * They should both calculate in the same way so that the racct %cpu
298276479Sdim * calculations are consistent with the values showed by the ps(1) tool.
299193323Sed * The calculations are more complex in the 4BSD scheduler because of the value
300261991Sdim * of the ccpu variable.  In ULE it is defined to be zero which saves us some
301261991Sdim * work.
302193323Sed */
303193323Sedstatic uint64_t
304193323Sedracct_getpcpu(struct proc *p, u_int pcpu)
305193323Sed{
306193323Sed	u_int swtime;
307193323Sed#ifdef SCHED_4BSD
308198090Srdivacky	fixpt_t pctcpu, pctcpu_next;
309198090Srdivacky#endif
310198090Srdivacky#ifdef SMP
311193323Sed	struct pcpu *pc;
312198090Srdivacky	int found;
313226633Sdim#endif
314198090Srdivacky	fixpt_t p_pctcpu;
315198090Srdivacky	struct thread *td;
316198090Srdivacky
317198090Srdivacky	/*
318198090Srdivacky	 * If the process is swapped out, we count its %cpu usage as zero.
319198090Srdivacky	 * This behaviour is consistent with the userland ps(1) tool.
320198090Srdivacky	 */
321198090Srdivacky	if ((p->p_flag & P_INMEM) == 0)
322198090Srdivacky		return (0);
323207618Srdivacky	swtime = (ticks - p->p_swtick) / hz;
324198090Srdivacky
325198090Srdivacky	/*
326198090Srdivacky	 * For short-lived processes, the sched_pctcpu() returns small
327205407Srdivacky	 * values even for cpu intensive processes.  Therefore we use
328198090Srdivacky	 * our own estimate in this case.
329210299Sed	 */
330205407Srdivacky	if (swtime < RACCT_PCPU_SECS)
331205407Srdivacky		return (pcpu);
332198090Srdivacky
333210299Sed	p_pctcpu = 0;
334198090Srdivacky	FOREACH_THREAD_IN_PROC(p, td) {
335198090Srdivacky		if (td == PCPU_GET(idlethread))
336198090Srdivacky			continue;
337198090Srdivacky#ifdef SMP
338198090Srdivacky		found = 0;
339198090Srdivacky		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
340198090Srdivacky			if (td == pc->pc_idlethread) {
341198090Srdivacky				found = 1;
342226633Sdim				break;
343198090Srdivacky			}
344198090Srdivacky		}
345198090Srdivacky		if (found)
346198090Srdivacky			continue;
347198090Srdivacky#endif
348198090Srdivacky		thread_lock(td);
349198090Srdivacky#ifdef SCHED_4BSD
350207618Srdivacky		pctcpu = sched_pctcpu(td);
351198090Srdivacky		/* Count also the yet unfinished second. */
352198090Srdivacky		pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT;
353221345Sdim		pctcpu_next += sched_pctcpu_delta(td);
354261991Sdim		p_pctcpu += max(pctcpu, pctcpu_next);
355198090Srdivacky#else
356198090Srdivacky		/*
357210299Sed		 * In ULE the %cpu statistics are updated on every
358198090Srdivacky		 * sched_pctcpu() call.  So special calculations to
359198090Srdivacky		 * account for the latest (unfinished) second are
360198090Srdivacky		 * not needed.
361198090Srdivacky		 */
362198090Srdivacky		p_pctcpu += sched_pctcpu(td);
363198090Srdivacky#endif
364198090Srdivacky		thread_unlock(td);
365210299Sed	}
366198090Srdivacky
367198090Srdivacky#ifdef SCHED_4BSD
368198090Srdivacky	if (swtime <= CCPU_EXP_MAX)
369198090Srdivacky		return ((100 * (uint64_t)p_pctcpu * 1000000) /
370198090Srdivacky		    (FSCALE - ccpu_exp[swtime]));
371198090Srdivacky#endif
372198090Srdivacky
373198090Srdivacky	return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE);
374198090Srdivacky}
375198090Srdivacky
376198090Srdivackystatic void
377198090Srdivackyracct_add_racct(struct racct *dest, const struct racct *src)
378202878Srdivacky{
379198090Srdivacky	int i;
380198090Srdivacky
381198090Srdivacky	mtx_assert(&racct_lock, MA_OWNED);
382198090Srdivacky
383193323Sed	/*
384193323Sed	 * Update resource usage in dest.
385193323Sed	 */
386193323Sed	for (i = 0; i <= RACCT_MAX; i++) {
387193323Sed		KASSERT(dest->r_resources[i] >= 0,
388193323Sed		    ("%s: resource %d propagation meltdown: dest < 0",
389193323Sed		    __func__, i));
390193323Sed		KASSERT(src->r_resources[i] >= 0,
391193323Sed		    ("%s: resource %d propagation meltdown: src < 0",
392193323Sed		    __func__, i));
393193323Sed		dest->r_resources[i] += src->r_resources[i];
394193323Sed	}
395193323Sed}
396193323Sed
397193323Sedstatic void
398198090Srdivackyracct_sub_racct(struct racct *dest, const struct racct *src)
399198090Srdivacky{
400226633Sdim	int i;
401226633Sdim
402193323Sed	mtx_assert(&racct_lock, MA_OWNED);
403288943Sdim
404288943Sdim	/*
405193323Sed	 * Update resource usage in dest.
406198090Srdivacky	 */
407193323Sed	for (i = 0; i <= RACCT_MAX; i++) {
408193323Sed		if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) {
409198090Srdivacky			KASSERT(dest->r_resources[i] >= 0,
410198090Srdivacky			    ("%s: resource %d propagation meltdown: dest < 0",
411198090Srdivacky			    __func__, i));
412198090Srdivacky			KASSERT(src->r_resources[i] >= 0,
413288943Sdim			    ("%s: resource %d propagation meltdown: src < 0",
414261991Sdim			    __func__, i));
415200581Srdivacky			KASSERT(src->r_resources[i] <= dest->r_resources[i],
416193323Sed			    ("%s: resource %d propagation meltdown: src > dest",
417193323Sed			    __func__, i));
418193323Sed		}
419193323Sed		if (RACCT_CAN_DROP(i)) {
420193323Sed			dest->r_resources[i] -= src->r_resources[i];
421198090Srdivacky			if (dest->r_resources[i] < 0) {
422198090Srdivacky				KASSERT(RACCT_IS_SLOPPY(i) ||
423198090Srdivacky				    RACCT_IS_DECAYING(i),
424203954Srdivacky				    ("%s: resource %d usage < 0", __func__, i));
425261991Sdim				dest->r_resources[i] = 0;
426203954Srdivacky			}
427203954Srdivacky		}
428296417Sdim	}
429207618Srdivacky}
430288943Sdim
431203954Srdivackyvoid
432203954Srdivackyracct_create(struct racct **racctp)
433203954Srdivacky{
434203954Srdivacky
435203954Srdivacky	SDT_PROBE(racct, kernel, racct, create, racctp, 0, 0, 0, 0);
436203954Srdivacky
437203954Srdivacky	KASSERT(*racctp == NULL, ("racct already allocated"));
438203954Srdivacky
439296417Sdim	*racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO);
440203954Srdivacky}
441193323Sed
442203954Srdivackystatic void
443203954Srdivackyracct_destroy_locked(struct racct **racctp)
444203954Srdivacky{
445203954Srdivacky	int i;
446203954Srdivacky	struct racct *racct;
447193323Sed
448193323Sed	SDT_PROBE(racct, kernel, racct, destroy, racctp, 0, 0, 0, 0);
449198090Srdivacky
450198090Srdivacky	mtx_assert(&racct_lock, MA_OWNED);
451198090Srdivacky	KASSERT(racctp != NULL, ("NULL racctp"));
452198090Srdivacky	KASSERT(*racctp != NULL, ("NULL racct"));
453198090Srdivacky
454193323Sed	racct = *racctp;
455193323Sed
456193323Sed	for (i = 0; i <= RACCT_MAX; i++) {
457193323Sed		if (RACCT_IS_SLOPPY(i))
458193323Sed			continue;
459193323Sed		if (!RACCT_IS_RECLAIMABLE(i))
460226633Sdim			continue;
461198090Srdivacky		KASSERT(racct->r_resources[i] == 0,
462198090Srdivacky		    ("destroying non-empty racct: "
463198090Srdivacky		    "%ju allocated for resource %d\n",
464288943Sdim		    racct->r_resources[i], i));
465288943Sdim	}
466288943Sdim	uma_zfree(racct_zone, racct);
467288943Sdim	*racctp = NULL;
468288943Sdim}
469288943Sdim
470288943Sdimvoid
471288943Sdimracct_destroy(struct racct **racct)
472288943Sdim{
473288943Sdim
474288943Sdim	mtx_lock(&racct_lock);
475288943Sdim	racct_destroy_locked(racct);
476288943Sdim	mtx_unlock(&racct_lock);
477288943Sdim}
478194612Sed
479288943Sdim/*
480288943Sdim * Increase consumption of 'resource' by 'amount' for 'racct'
481193323Sed * and all its parents.  Differently from other cases, 'amount' here
482288943Sdim * may be less than zero.
483198090Srdivacky */
484198090Srdivackystatic void
485198090Srdivackyracct_alloc_resource(struct racct *racct, int resource,
486198090Srdivacky    uint64_t amount)
487198090Srdivacky{
488198090Srdivacky
489198090Srdivacky	mtx_assert(&racct_lock, MA_OWNED);
490198090Srdivacky	KASSERT(racct != NULL, ("NULL racct"));
491198090Srdivacky
492193323Sed	racct->r_resources[resource] += amount;
493226633Sdim	if (racct->r_resources[resource] < 0) {
494193323Sed		KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource),
495198090Srdivacky		    ("%s: resource %d usage < 0", __func__, resource));
496198090Srdivacky		racct->r_resources[resource] = 0;
497193323Sed	}
498193323Sed
499204642Srdivacky	/*
500193323Sed	 * There are some cases where the racct %cpu resource would grow
501193323Sed	 * beyond 100%.
502193323Sed	 * For example in racct_proc_exit() we add the process %cpu usage
503198090Srdivacky	 * to the ucred racct containers.  If too many processes terminated
504193323Sed	 * in a short time span, the ucred %cpu resource could grow too much.
505198090Srdivacky	 * Also, the 4BSD scheduler sometimes returns for a thread more than
506198090Srdivacky	 * 100% cpu usage.  So we set a boundary here to 100%.
507234353Sdim	 */
508296417Sdim	if ((resource == RACCT_PCTCPU) &&
509234353Sdim	    (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000))
510198090Srdivacky		racct->r_resources[RACCT_PCTCPU] = 100 * 1000000;
511194178Sed}
512193323Sed
513193323Sedstatic int
514193323Sedracct_add_locked(struct proc *p, int resource, uint64_t amount)
515193323Sed{
516288943Sdim#ifdef RCTL
517288943Sdim	int error;
518193323Sed#endif
519193323Sed
520193323Sed	SDT_PROBE(racct, kernel, rusage, add, p, resource, amount, 0, 0);
521195340Sed
522195340Sed	/*
523195340Sed	 * We need proc lock to dereference p->p_ucred.
524195340Sed	 */
525193323Sed	PROC_LOCK_ASSERT(p, MA_OWNED);
526193323Sed
527204792Srdivacky#ifdef RCTL
528204792Srdivacky	error = rctl_enforce(p, resource, amount);
529204792Srdivacky	if (error && RACCT_IS_DENIABLE(resource)) {
530204792Srdivacky		SDT_PROBE(racct, kernel, rusage, add_failure, p, resource,
531193323Sed		    amount, 0, 0);
532193323Sed		return (error);
533296417Sdim	}
534193323Sed#endif
535193323Sed	racct_alloc_resource(p->p_racct, resource, amount);
536193323Sed	racct_add_cred_locked(p->p_ucred, resource, amount);
537193323Sed
538204642Srdivacky	return (0);
539309124Sdim}
540204642Srdivacky
541204642Srdivacky/*
542296417Sdim * Increase allocation of 'resource' by 'amount' for process 'p'.
543204642Srdivacky * Return 0 if it's below limits, or errno, if it's not.
544204642Srdivacky */
545204642Srdivackyint
546204642Srdivackyracct_add(struct proc *p, int resource, uint64_t amount)
547204642Srdivacky{
548296417Sdim	int error;
549204642Srdivacky
550204642Srdivacky	mtx_lock(&racct_lock);
551198090Srdivacky	error = racct_add_locked(p, resource, amount);
552288943Sdim	mtx_unlock(&racct_lock);
553202878Srdivacky	return (error);
554204642Srdivacky}
555193323Sed
556193323Sedstatic void
557193323Sedracct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount)
558309124Sdim{
559309124Sdim	struct prison *pr;
560204642Srdivacky
561309124Sdim	SDT_PROBE(racct, kernel, rusage, add_cred, cred, resource, amount,
562309124Sdim	    0, 0);
563309124Sdim
564204642Srdivacky	racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, amount);
565314564Sdim	for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
566314564Sdim		racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource,
567296417Sdim		    amount);
568309124Sdim	racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, amount);
569309124Sdim}
570204642Srdivacky
571309124Sdim/*
572309124Sdim * Increase allocation of 'resource' by 'amount' for credential 'cred'.
573204642Srdivacky * Doesn't check for limits and never fails.
574309124Sdim *
575309124Sdim * XXX: Shouldn't this ever return an error?
576309124Sdim */
577309124Sdimvoid
578309124Sdimracct_add_cred(struct ucred *cred, int resource, uint64_t amount)
579309124Sdim{
580309124Sdim
581309124Sdim	mtx_lock(&racct_lock);
582309124Sdim	racct_add_cred_locked(cred, resource, amount);
583309124Sdim	mtx_unlock(&racct_lock);
584309124Sdim}
585309124Sdim
586309124Sdim/*
587204642Srdivacky * Increase allocation of 'resource' by 'amount' for process 'p'.
588204642Srdivacky * Doesn't check for limits and never fails.
589193323Sed */
590193323Sedvoid
591193323Sedracct_add_force(struct proc *p, int resource, uint64_t amount)
592204642Srdivacky{
593204642Srdivacky
594204642Srdivacky	SDT_PROBE(racct, kernel, rusage, add_force, p, resource, amount, 0, 0);
595204642Srdivacky
596204642Srdivacky	/*
597204642Srdivacky	 * We need proc lock to dereference p->p_ucred.
598204642Srdivacky	 */
599204642Srdivacky	PROC_LOCK_ASSERT(p, MA_OWNED);
600204642Srdivacky
601204642Srdivacky	mtx_lock(&racct_lock);
602204642Srdivacky	racct_alloc_resource(p->p_racct, resource, amount);
603204642Srdivacky	mtx_unlock(&racct_lock);
604204642Srdivacky	racct_add_cred(p->p_ucred, resource, amount);
605193323Sed}
606218893Sdim
607204642Srdivackystatic int
608218893Sdimracct_set_locked(struct proc *p, int resource, uint64_t amount)
609218893Sdim{
610296417Sdim	int64_t old_amount, decayed_amount;
611218893Sdim	int64_t diff_proc, diff_cred;
612218893Sdim#ifdef RCTL
613218893Sdim	int error;
614204642Srdivacky#endif
615218893Sdim
616276479Sdim	SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0);
617204642Srdivacky
618204642Srdivacky	/*
619296417Sdim	 * We need proc lock to dereference p->p_ucred.
620218893Sdim	 */
621276479Sdim	PROC_LOCK_ASSERT(p, MA_OWNED);
622204642Srdivacky
623204642Srdivacky	old_amount = p->p_racct->r_resources[resource];
624276479Sdim	/*
625204642Srdivacky	 * The diffs may be negative.
626204642Srdivacky	 */
627296417Sdim	diff_proc = amount - old_amount;
628296417Sdim	if (RACCT_IS_DECAYING(resource)) {
629218893Sdim		/*
630204642Srdivacky		 * Resources in per-credential racct containers may decay.
631218893Sdim		 * If this is the case, we need to calculate the difference
632218893Sdim		 * between the new amount and the proportional value of the
633218893Sdim		 * old amount that has decayed in the ucred racct containers.
634218893Sdim		 */
635218893Sdim		decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
636296417Sdim		diff_cred = amount - decayed_amount;
637296417Sdim	} else
638218893Sdim		diff_cred = diff_proc;
639218893Sdim#ifdef notyet
640204642Srdivacky	KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
641204642Srdivacky	    ("%s: usage of non-droppable resource %d dropping", __func__,
642198090Srdivacky	     resource));
643207618Srdivacky#endif
644207618Srdivacky#ifdef RCTL
645204642Srdivacky	if (diff_proc > 0) {
646204642Srdivacky		error = rctl_enforce(p, resource, diff_proc);
647204642Srdivacky		if (error && RACCT_IS_DENIABLE(resource)) {
648204642Srdivacky			SDT_PROBE(racct, kernel, rusage, set_failure, p,
649204642Srdivacky			    resource, amount, 0, 0);
650198090Srdivacky			return (error);
651204642Srdivacky		}
652204642Srdivacky	}
653212904Sdim#endif
654212904Sdim	racct_alloc_resource(p->p_racct, resource, diff_proc);
655212904Sdim	if (diff_cred > 0)
656212904Sdim		racct_add_cred_locked(p->p_ucred, resource, diff_cred);
657212904Sdim	else if (diff_cred < 0)
658204642Srdivacky		racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
659204642Srdivacky
660204642Srdivacky	return (0);
661204642Srdivacky}
662204642Srdivacky
663204642Srdivacky/*
664204642Srdivacky * Set allocation of 'resource' to 'amount' for process 'p'.
665234353Sdim * Return 0 if it's below limits, or errno, if it's not.
666234353Sdim *
667204642Srdivacky * Note that decreasing the allocation always returns 0,
668234353Sdim * even if it's above the limit.
669204642Srdivacky */
670204642Srdivackyint
671204642Srdivackyracct_set(struct proc *p, int resource, uint64_t amount)
672204642Srdivacky{
673198090Srdivacky	int error;
674204642Srdivacky
675193323Sed	mtx_lock(&racct_lock);
676207618Srdivacky	error = racct_set_locked(p, resource, amount);
677207618Srdivacky	mtx_unlock(&racct_lock);
678204642Srdivacky	return (error);
679226633Sdim}
680193323Sed
681204642Srdivackystatic void
682204642Srdivackyracct_set_force_locked(struct proc *p, int resource, uint64_t amount)
683204642Srdivacky{
684204642Srdivacky	int64_t old_amount, decayed_amount;
685204642Srdivacky	int64_t diff_proc, diff_cred;
686204642Srdivacky
687204642Srdivacky	SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0);
688218893Sdim
689204642Srdivacky	/*
690204642Srdivacky	 * We need proc lock to dereference p->p_ucred.
691204642Srdivacky	 */
692296417Sdim	PROC_LOCK_ASSERT(p, MA_OWNED);
693204642Srdivacky
694204642Srdivacky	old_amount = p->p_racct->r_resources[resource];
695204642Srdivacky	/*
696276479Sdim	 * The diffs may be negative.
697296417Sdim	 */
698204642Srdivacky	diff_proc = amount - old_amount;
699204642Srdivacky	if (RACCT_IS_DECAYING(resource)) {
700204642Srdivacky		/*
701204642Srdivacky		 * Resources in per-credential racct containers may decay.
702204642Srdivacky		 * If this is the case, we need to calculate the difference
703204642Srdivacky		 * between the new amount and the proportional value of the
704226633Sdim		 * old amount that has decayed in the ucred racct containers.
705204642Srdivacky		 */
706204642Srdivacky		decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
707204642Srdivacky		diff_cred = amount - decayed_amount;
708212904Sdim	} else
709212904Sdim		diff_cred = diff_proc;
710212904Sdim
711212904Sdim	racct_alloc_resource(p->p_racct, resource, diff_proc);
712212904Sdim	if (diff_cred > 0)
713212904Sdim		racct_add_cred_locked(p->p_ucred, resource, diff_cred);
714212904Sdim	else if (diff_cred < 0)
715212904Sdim		racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
716212904Sdim}
717204642Srdivacky
718226633Sdimvoid
719204642Srdivackyracct_set_force(struct proc *p, int resource, uint64_t amount)
720207618Srdivacky{
721207618Srdivacky	mtx_lock(&racct_lock);
722204642Srdivacky	racct_set_force_locked(p, resource, amount);
723207618Srdivacky	mtx_unlock(&racct_lock);
724207618Srdivacky}
725204642Srdivacky
/*
 * Return how much of 'resource' the process 'p' is allowed to keep
 * allocated.  Requests beyond that would be denied unless the resource
 * is marked undeniable.  The amount already allocated is not taken into
 * account.
 *
 * Without RCTL there is nothing to impose a limit, so UINT64_MAX is
 * returned.
 */
uint64_t
racct_get_limit(struct proc *p, int resource)
{
	uint64_t limit;

#ifdef RCTL
	limit = rctl_get_limit(p, resource);
#else
	limit = UINT64_MAX;
#endif

	return (limit);
}
742202878Srdivacky
/*
 * Return how much more of 'resource' the process 'p' may still allocate.
 * Requests beyond that would be denied unless the resource is marked
 * undeniable.  Unlike racct_get_limit(), the amount already allocated
 * does matter here.
 *
 * Without RCTL there is nothing to impose a limit, so UINT64_MAX is
 * returned.
 */
uint64_t
racct_get_available(struct proc *p, int resource)
{
	uint64_t avail;

#ifdef RCTL
	avail = rctl_get_available(p, resource);
#else
	avail = UINT64_MAX;
#endif

	return (avail);
}
759296417Sdim
/*
 * Return how much %cpu the process 'p' may still add to its current %cpu
 * utilization.  Going beyond that would get the process throttled.
 *
 * Without RCTL there is no %cpu limit, so INT64_MAX is returned.
 */
static int64_t
racct_pcpu_available(struct proc *p)
{
	int64_t avail;

#ifdef RCTL
	avail = rctl_pcpu_available(p);
#else
	avail = INT64_MAX;
#endif

	return (avail);
}
775321369Sdim
776321369Sdim/*
777321369Sdim * Decrease allocation of 'resource' by 'amount' for process 'p'.
778321369Sdim */
779321369Sdimvoid
780321369Sdimracct_sub(struct proc *p, int resource, uint64_t amount)
781321369Sdim{
782321369Sdim
783321369Sdim	SDT_PROBE(racct, kernel, rusage, sub, p, resource, amount, 0, 0);
784321369Sdim
785321369Sdim	/*
786321369Sdim	 * We need proc lock to dereference p->p_ucred.
787321369Sdim	 */
788321369Sdim	PROC_LOCK_ASSERT(p, MA_OWNED);
789321369Sdim	KASSERT(RACCT_CAN_DROP(resource),
790321369Sdim	    ("%s: called for non-droppable resource %d", __func__, resource));
791321369Sdim
792321369Sdim	mtx_lock(&racct_lock);
793321369Sdim	KASSERT(amount <= p->p_racct->r_resources[resource],
794321369Sdim	    ("%s: freeing %ju of resource %d, which is more "
795321369Sdim	     "than allocated %jd for %s (pid %d)", __func__, amount, resource,
796321369Sdim	    (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid));
797321369Sdim
798321369Sdim	racct_alloc_resource(p->p_racct, resource, -amount);
799321369Sdim	racct_sub_cred_locked(p->p_ucred, resource, amount);
800321369Sdim	mtx_unlock(&racct_lock);
801321369Sdim}
802321369Sdim
803204642Srdivackystatic void
804204642Srdivackyracct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount)
805321369Sdim{
806321369Sdim	struct prison *pr;
807204642Srdivacky
808204642Srdivacky	SDT_PROBE(racct, kernel, rusage, sub_cred, cred, resource, amount,
809204642Srdivacky	    0, 0);
810321369Sdim
811204642Srdivacky#ifdef notyet
812204642Srdivacky	KASSERT(RACCT_CAN_DROP(resource),
813321369Sdim	    ("%s: called for resource %d which can not drop", __func__,
814204642Srdivacky	     resource));
815204642Srdivacky#endif
816204642Srdivacky
817288943Sdim	racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, -amount);
818288943Sdim	for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
819288943Sdim		racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource,
820288943Sdim		    -amount);
821288943Sdim	racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, -amount);
822288943Sdim}
823288943Sdim
824288943Sdim/*
825288943Sdim * Decrease allocation of 'resource' by 'amount' for credential 'cred'.
826204642Srdivacky */
827193323Sedvoid
828193323Sedracct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
829204642Srdivacky{
830193323Sed
831193323Sed	mtx_lock(&racct_lock);
832193323Sed	racct_sub_cred_locked(cred, resource, amount);
833226633Sdim	mtx_unlock(&racct_lock);
834193323Sed}
835194178Sed
836193323Sed/*
837296417Sdim * Inherit resource usage information from the parent process.
838193323Sed */
839193323Sedint
840195340Sedracct_proc_fork(struct proc *parent, struct proc *child)
841193323Sed{
842193323Sed	int i, error = 0;
843194178Sed
844195340Sed	/*
845193323Sed	 * Create racct for the child process.
846193323Sed	 */
847193323Sed	racct_create(&child->p_racct);
848193323Sed
849193323Sed	PROC_LOCK(parent);
850198090Srdivacky	PROC_LOCK(child);
851193323Sed	mtx_lock(&racct_lock);
852193323Sed
853193323Sed#ifdef RCTL
854193323Sed	error = rctl_proc_fork(parent, child);
855207618Srdivacky	if (error != 0)
856193323Sed		goto out;
857221345Sdim#endif
858261991Sdim
859193323Sed	/* Init process cpu time. */
860193323Sed	child->p_prev_runtime = 0;
861193323Sed	child->p_throttled = 0;
862198090Srdivacky
863193323Sed	/*
864193323Sed	 * Inherit resource usage.
865193323Sed	 */
866193323Sed	for (i = 0; i <= RACCT_MAX; i++) {
867193323Sed		if (parent->p_racct->r_resources[i] == 0 ||
868193323Sed		    !RACCT_IS_INHERITABLE(i))
869226633Sdim			continue;
870226633Sdim
871226633Sdim		error = racct_set_locked(child, i,
872226633Sdim		    parent->p_racct->r_resources[i]);
873226633Sdim		if (error != 0)
874226633Sdim			goto out;
875226633Sdim	}
876226633Sdim
877226633Sdim	error = racct_add_locked(child, RACCT_NPROC, 1);
878226633Sdim	error += racct_add_locked(child, RACCT_NTHR, 1);
879226633Sdim
880226633Sdimout:
881226633Sdim	mtx_unlock(&racct_lock);
882226633Sdim	PROC_UNLOCK(child);
883296417Sdim	PROC_UNLOCK(parent);
884226633Sdim
885226633Sdim	if (error != 0)
886226633Sdim		racct_proc_exit(child);
887226633Sdim
888226633Sdim	return (error);
889226633Sdim}
890226633Sdim
/*
 * Called at the end of fork1(), once the child is fully initialized, to
 * handle rules that could not be processed earlier.  With RCTL this
 * re-runs enforcement for RACCT_NPROC and RACCT_NTHR with a zero
 * increment; without RCTL it does nothing.
 */
void
racct_proc_fork_done(struct proc *child)
{

#ifdef RCTL
	PROC_LOCK(child);
	mtx_lock(&racct_lock);

	rctl_enforce(child, RACCT_NPROC, 0);
	rctl_enforce(child, RACCT_NTHR, 0);

	mtx_unlock(&racct_lock);
	PROC_UNLOCK(child);
#endif
}
908234353Sdim
909234353Sdimvoid
910234353Sdimracct_proc_exit(struct proc *p)
911234353Sdim{
912234353Sdim	int i;
913276479Sdim	uint64_t runtime;
914234353Sdim	struct timeval wallclock;
915226633Sdim	uint64_t pct_estimate, pct;
916234353Sdim
917276479Sdim	PROC_LOCK(p);
918226633Sdim	/*
919226633Sdim	 * We don't need to calculate rux, proc_reap() has already done this.
920234353Sdim	 */
921234353Sdim	runtime = cputick2usec(p->p_rux.rux_runtime);
922296417Sdim#ifdef notyet
923234353Sdim	KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime"));
924276479Sdim#else
925234353Sdim	if (runtime < p->p_prev_runtime)
926226633Sdim		runtime = p->p_prev_runtime;
927234353Sdim#endif
928234353Sdim	microuptime(&wallclock);
929296417Sdim	timevalsub(&wallclock, &p->p_stats->p_start);
930226633Sdim	if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
931226633Sdim		pct_estimate = (1000000 * runtime * 100) /
932234353Sdim		    ((uint64_t)wallclock.tv_sec * 1000000 +
933296417Sdim		    wallclock.tv_usec);
934276479Sdim	} else
935234353Sdim		pct_estimate = 0;
936234353Sdim	pct = racct_getpcpu(p, pct_estimate);
937234353Sdim
938234353Sdim	mtx_lock(&racct_lock);
939234353Sdim	racct_set_locked(p, RACCT_CPU, runtime);
940234353Sdim	racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);
941234353Sdim
942234353Sdim	for (i = 0; i <= RACCT_MAX; i++) {
943234353Sdim		if (p->p_racct->r_resources[i] == 0)
944226633Sdim			continue;
945276479Sdim	    	if (!RACCT_IS_RECLAIMABLE(i))
946226633Sdim			continue;
947226633Sdim		racct_set_locked(p, i, 0);
948226633Sdim	}
949276479Sdim
950226633Sdim	mtx_unlock(&racct_lock);
951226633Sdim	PROC_UNLOCK(p);
952234353Sdim
953226633Sdim#ifdef RCTL
954234353Sdim	rctl_racct_release(p->p_racct);
955234353Sdim#endif
956309124Sdim	racct_destroy(&p->p_racct);
957309124Sdim}
958309124Sdim
959309124Sdim/*
960309124Sdim * Called after credentials change, to move resource utilisation
961309124Sdim * between raccts.
962309124Sdim */
963309124Sdimvoid
964309124Sdimracct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
965309124Sdim    struct ucred *newcred)
966309124Sdim{
967309124Sdim	struct uidinfo *olduip, *newuip;
968309124Sdim	struct loginclass *oldlc, *newlc;
969309124Sdim	struct prison *oldpr, *newpr, *pr;
970309124Sdim
971309124Sdim	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
972309124Sdim
973234353Sdim	newuip = newcred->cr_ruidinfo;
974234353Sdim	olduip = oldcred->cr_ruidinfo;
975234353Sdim	newlc = newcred->cr_loginclass;
976234353Sdim	oldlc = oldcred->cr_loginclass;
977296417Sdim	newpr = newcred->cr_prison;
978234353Sdim	oldpr = oldcred->cr_prison;
979234353Sdim
980234353Sdim	mtx_lock(&racct_lock);
981234353Sdim	if (newuip != olduip) {
982296417Sdim		racct_sub_racct(olduip->ui_racct, p->p_racct);
983296417Sdim		racct_add_racct(newuip->ui_racct, p->p_racct);
984226633Sdim	}
985234353Sdim	if (newlc != oldlc) {
986296417Sdim		racct_sub_racct(oldlc->lc_racct, p->p_racct);
987296417Sdim		racct_add_racct(newlc->lc_racct, p->p_racct);
988296417Sdim	}
989234353Sdim	if (newpr != oldpr) {
990234353Sdim		for (pr = oldpr; pr != NULL; pr = pr->pr_parent)
991234353Sdim			racct_sub_racct(pr->pr_prison_racct->prr_racct,
992234353Sdim			    p->p_racct);
993234353Sdim		for (pr = newpr; pr != NULL; pr = pr->pr_parent)
994234353Sdim			racct_add_racct(pr->pr_prison_racct->prr_racct,
995234353Sdim			    p->p_racct);
996234353Sdim	}
997234353Sdim	mtx_unlock(&racct_lock);
998296417Sdim
999234353Sdim#ifdef RCTL
1000226633Sdim	rctl_proc_ucred_changed(p, newcred);
1001296417Sdim#endif
1002309124Sdim}
1003234353Sdim
1004234353Sdimvoid
1005234353Sdimracct_move(struct racct *dest, struct racct *src)
1006226633Sdim{
1007226633Sdim
1008234353Sdim	mtx_lock(&racct_lock);
1009234353Sdim
1010234353Sdim	racct_add_racct(dest, src);
1011234353Sdim	racct_sub_racct(src, src);
1012234353Sdim
1013234353Sdim	mtx_unlock(&racct_lock);
1014234353Sdim}
1015234353Sdim
1016234353Sdimstatic void
1017234353Sdimracct_proc_throttle(struct proc *p)
1018234353Sdim{
1019234353Sdim	struct thread *td;
1020234353Sdim#ifdef SMP
1021234353Sdim	int cpuid;
1022234353Sdim#endif
1023234353Sdim
1024234353Sdim	PROC_LOCK_ASSERT(p, MA_OWNED);
1025234353Sdim
1026234353Sdim	/*
1027234353Sdim	 * Do not block kernel processes.  Also do not block processes with
1028234353Sdim	 * low %cpu utilization to improve interactivity.
1029234353Sdim	 */
1030234353Sdim	if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) ||
1031234353Sdim	    (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold))
1032234353Sdim		return;
1033234353Sdim	p->p_throttled = 1;
1034234353Sdim
1035234353Sdim	FOREACH_THREAD_IN_PROC(p, td) {
1036234353Sdim		switch (td->td_state) {
1037234353Sdim		case TDS_RUNQ:
1038234353Sdim			/*
1039234353Sdim			 * If the thread is on the scheduler run-queue, we can
1040234353Sdim			 * not just remove it from there.  So we set the flag
1041234353Sdim			 * TDF_NEEDRESCHED for the thread, so that once it is
1042234353Sdim			 * running, it is taken off the cpu as soon as possible.
1043234353Sdim			 */
1044234353Sdim			thread_lock(td);
1045234353Sdim			td->td_flags |= TDF_NEEDRESCHED;
1046234353Sdim			thread_unlock(td);
1047234353Sdim			break;
1048234353Sdim		case TDS_RUNNING:
1049234353Sdim			/*
1050234353Sdim			 * If the thread is running, we request a context
1051234353Sdim			 * switch for it by setting the TDF_NEEDRESCHED flag.
1052234353Sdim			 */
1053234353Sdim			thread_lock(td);
1054276479Sdim			td->td_flags |= TDF_NEEDRESCHED;
1055276479Sdim#ifdef SMP
1056309124Sdim			cpuid = td->td_oncpu;
1057309124Sdim			if ((cpuid != NOCPU) && (td != curthread))
1058276479Sdim				ipi_cpu(cpuid, IPI_AST);
1059276479Sdim#endif
1060276479Sdim			thread_unlock(td);
1061276479Sdim			break;
1062276479Sdim		default:
1063309124Sdim			break;
1064276479Sdim		}
1065276479Sdim	}
1066276479Sdim}
1067276479Sdim
1068276479Sdimstatic void
1069276479Sdimracct_proc_wakeup(struct proc *p)
1070276479Sdim{
1071296417Sdim	PROC_LOCK_ASSERT(p, MA_OWNED);
1072276479Sdim
1073276479Sdim	if (p->p_throttled) {
1074276479Sdim		p->p_throttled = 0;
1075276479Sdim		wakeup(p->p_racct);
1076276479Sdim	}
1077276479Sdim}
1078276479Sdim
1079276479Sdimstatic void
1080276479Sdimracct_decay_resource(struct racct *racct, void * res, void* dummy)
1081276479Sdim{
1082276479Sdim	int resource;
1083276479Sdim	int64_t r_old, r_new;
1084276479Sdim
1085276479Sdim	resource = *(int *)res;
1086276479Sdim	r_old = racct->r_resources[resource];
1087276479Sdim
1088276479Sdim	/* If there is nothing to decay, just exit. */
1089276479Sdim	if (r_old <= 0)
1090276479Sdim		return;
1091276479Sdim
1092276479Sdim	mtx_lock(&racct_lock);
1093276479Sdim	r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
1094276479Sdim	racct->r_resources[resource] = r_new;
1095276479Sdim	mtx_unlock(&racct_lock);
1096276479Sdim}
1097276479Sdim
1098276479Sdimstatic void
1099276479Sdimracct_decay(int resource)
1100276479Sdim{
1101276479Sdim	ui_racct_foreach(racct_decay_resource, &resource, NULL);
1102276479Sdim	loginclass_racct_foreach(racct_decay_resource, &resource, NULL);
1103288943Sdim	prison_racct_foreach(racct_decay_resource, &resource, NULL);
1104288943Sdim}
1105288943Sdim
1106288943Sdimstatic void
1107288943Sdimracctd(void)
1108288943Sdim{
1109288943Sdim	struct thread *td;
1110288943Sdim	struct proc *p;
1111288943Sdim	struct timeval wallclock;
1112288943Sdim	uint64_t runtime;
1113288943Sdim	uint64_t pct, pct_estimate;
1114288943Sdim
1115288943Sdim	for (;;) {
1116288943Sdim		racct_decay(RACCT_PCTCPU);
1117288943Sdim
1118288943Sdim		sx_slock(&allproc_lock);
1119288943Sdim
1120288943Sdim		LIST_FOREACH(p, &zombproc, p_list) {
1121288943Sdim			PROC_LOCK(p);
1122288943Sdim			racct_set(p, RACCT_PCTCPU, 0);
1123288943Sdim			PROC_UNLOCK(p);
1124288943Sdim		}
1125288943Sdim
1126288943Sdim		FOREACH_PROC_IN_SYSTEM(p) {
1127288943Sdim			PROC_LOCK(p);
1128288943Sdim			if (p->p_state != PRS_NORMAL) {
1129288943Sdim				PROC_UNLOCK(p);
1130288943Sdim				continue;
1131202878Srdivacky			}
1132202878Srdivacky
1133202878Srdivacky			microuptime(&wallclock);
1134202878Srdivacky			timevalsub(&wallclock, &p->p_stats->p_start);
1135202878Srdivacky			PROC_SLOCK(p);
1136202878Srdivacky			FOREACH_THREAD_IN_PROC(p, td)
1137226633Sdim				ruxagg(p, td);
1138276479Sdim			runtime = cputick2usec(p->p_rux.rux_runtime);
1139276479Sdim			PROC_SUNLOCK(p);
1140276479Sdim#ifdef notyet
1141224145Sdim			KASSERT(runtime >= p->p_prev_runtime,
1142224145Sdim			    ("runtime < p_prev_runtime"));
1143202878Srdivacky#else
1144226633Sdim			if (runtime < p->p_prev_runtime)
1145226633Sdim				runtime = p->p_prev_runtime;
1146276479Sdim#endif
1147276479Sdim			p->p_prev_runtime = runtime;
1148276479Sdim			if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
1149276479Sdim				pct_estimate = (1000000 * runtime * 100) /
1150276479Sdim				    ((uint64_t)wallclock.tv_sec * 1000000 +
1151276479Sdim				    wallclock.tv_usec);
1152276479Sdim			} else
1153296417Sdim				pct_estimate = 0;
1154296417Sdim			pct = racct_getpcpu(p, pct_estimate);
1155296417Sdim			mtx_lock(&racct_lock);
1156276479Sdim			racct_set_force_locked(p, RACCT_PCTCPU, pct);
1157327952Sdim			racct_set_locked(p, RACCT_CPU, runtime);
1158327952Sdim			racct_set_locked(p, RACCT_WALLCLOCK,
1159226633Sdim			    (uint64_t)wallclock.tv_sec * 1000000 +
1160202878Srdivacky			    wallclock.tv_usec);
1161327952Sdim			mtx_unlock(&racct_lock);
1162276479Sdim			PROC_UNLOCK(p);
1163276479Sdim		}
1164226633Sdim
1165276479Sdim		/*
1166276479Sdim		 * To ensure that processes are throttled in a fair way, we need
1167276479Sdim		 * to iterate over all processes again and check the limits
1168276479Sdim		 * for %cpu resource only after ucred racct containers have been
1169276479Sdim		 * properly filled.
1170276479Sdim		 */
1171276479Sdim		FOREACH_PROC_IN_SYSTEM(p) {
1172276479Sdim			PROC_LOCK(p);
1173327952Sdim			if (p->p_state != PRS_NORMAL) {
1174276479Sdim				PROC_UNLOCK(p);
1175276479Sdim				continue;
1176226633Sdim			}
1177327952Sdim
1178226633Sdim			if (racct_pcpu_available(p) <= 0)
1179276479Sdim				racct_proc_throttle(p);
1180234353Sdim			else if (p->p_throttled)
1181276479Sdim				racct_proc_wakeup(p);
1182327952Sdim			PROC_UNLOCK(p);
1183226633Sdim		}
1184226633Sdim		sx_sunlock(&allproc_lock);
1185276479Sdim		pause("-", hz);
1186276479Sdim	}
1187276479Sdim}
1188276479Sdim
1189276479Sdimstatic struct kproc_desc racctd_kp = {
1190276479Sdim	"racctd",
1191327952Sdim	racctd,
1192276479Sdim	NULL
1193276479Sdim};
1194276479SdimSYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, kproc_start, &racctd_kp);
1195276479Sdim
1196276479Sdimstatic void
1197276479Sdimracct_init(void)
1198276479Sdim{
1199276479Sdim
1200276479Sdim	racct_zone = uma_zcreate("racct", sizeof(struct racct),
1201327952Sdim	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
1202276479Sdim	/*
1203276479Sdim	 * XXX: Move this somewhere.
1204276479Sdim	 */
1205276479Sdim	prison0.pr_prison_racct = prison_racct_find("0");
1206276479Sdim}
1207276479SdimSYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL);
1208276479Sdim
1209276479Sdim#else /* !RACCT */
1210276479Sdim
/*
 * Stub for kernels built without RACCT: nothing is accounted and the
 * request always succeeds.
 */
int
racct_add(struct proc *p, int resource, uint64_t amount)
{

	return (0);
}
1217226633Sdim
/*
 * Stub for kernels built without RACCT: does nothing.
 */
void
racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
{

}
1222202878Srdivacky
/*
 * Stub for kernels built without RACCT: does nothing.
 */
void
racct_add_force(struct proc *p, int resource, uint64_t amount)
{

}
1229234353Sdim
/*
 * Stub for kernels built without RACCT: nothing is recorded and the
 * request always succeeds.
 */
int
racct_set(struct proc *p, int resource, uint64_t amount)
{

	return (0);
}
1236314564Sdim
/*
 * Stub for kernels built without RACCT: does nothing.
 */
void
racct_set_force(struct proc *p, int resource, uint64_t amount)
{

}
1241314564Sdim
/*
 * Stub for kernels built without RACCT: does nothing.
 */
void
racct_sub(struct proc *p, int resource, uint64_t amount)
{

}
1246224145Sdim
/*
 * Stub for kernels built without RACCT: does nothing.
 */
void
racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
{

}
1251234353Sdim
/*
 * Stub for kernels built without RACCT: no limit exists, so the largest
 * possible value is reported.
 */
uint64_t
racct_get_limit(struct proc *p, int resource)
{

	return (UINT64_MAX);
}
1258202878Srdivacky
/*
 * Stub for kernels built without RACCT: no limit exists, so the largest
 * possible value is reported.
 */
uint64_t
racct_get_available(struct proc *p, int resource)
{

	return (UINT64_MAX);
}
1265202878Srdivacky
/*
 * Stub for kernels built without RACCT: does nothing and leaves '*racctp'
 * untouched.
 */
void
racct_create(struct racct **racctp)
{

}
1270224145Sdim
/*
 * Stub for kernels built without RACCT: does nothing and leaves '*racctp'
 * untouched.
 */
void
racct_destroy(struct racct **racctp)
{

}
1275202878Srdivacky
/*
 * Stub for kernels built without RACCT: there is nothing to inherit, so
 * the call always succeeds.
 */
int
racct_proc_fork(struct proc *parent, struct proc *child)
{

	return (0);
}
1282202878Srdivacky
/*
 * Stub for kernels built without RACCT: does nothing.
 */
void
racct_proc_fork_done(struct proc *child)
{

}
1287202878Srdivacky
/*
 * Stub for kernels built without RACCT: does nothing.
 */
void
racct_proc_exit(struct proc *p)
{

}
1292288943Sdim
1293261991Sdim#endif /* !RACCT */
1294288943Sdim