1220137Strasz/*-
2220137Strasz * Copyright (c) 2010 The FreeBSD Foundation
3220137Strasz * All rights reserved.
4220137Strasz *
5220137Strasz * This software was developed by Edward Tomasz Napierala under sponsorship
6220137Strasz * from the FreeBSD Foundation.
7220137Strasz *
8220137Strasz * Redistribution and use in source and binary forms, with or without
9220137Strasz * modification, are permitted provided that the following conditions
10220137Strasz * are met:
11220137Strasz * 1. Redistributions of source code must retain the above copyright
12220137Strasz *    notice, this list of conditions and the following disclaimer.
13220137Strasz * 2. Redistributions in binary form must reproduce the above copyright
14220137Strasz *    notice, this list of conditions and the following disclaimer in the
15220137Strasz *    documentation and/or other materials provided with the distribution.
16220137Strasz *
17220137Strasz * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18220137Strasz * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19220137Strasz * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20220137Strasz * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21220137Strasz * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22220137Strasz * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23220137Strasz * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24220137Strasz * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25220137Strasz * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26220137Strasz * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27220137Strasz * SUCH DAMAGE.
28220137Strasz *
29220137Strasz * $FreeBSD: releng/11.0/sys/kern/kern_racct.c 298414 2016-04-21 16:22:52Z trasz $
30220137Strasz */
31220137Strasz
32220137Strasz#include <sys/cdefs.h>
33220137Strasz__FBSDID("$FreeBSD: releng/11.0/sys/kern/kern_racct.c 298414 2016-04-21 16:22:52Z trasz $");
34220137Strasz
35242139Strasz#include "opt_sched.h"
36220137Strasz
37220137Strasz#include <sys/param.h>
38297633Strasz#include <sys/buf.h>
39228430Savg#include <sys/systm.h>
40220137Strasz#include <sys/eventhandler.h>
41220137Strasz#include <sys/jail.h>
42220137Strasz#include <sys/kernel.h>
43220137Strasz#include <sys/kthread.h>
44220137Strasz#include <sys/lock.h>
45220137Strasz#include <sys/loginclass.h>
46220137Strasz#include <sys/malloc.h>
47220137Strasz#include <sys/mutex.h>
48220137Strasz#include <sys/proc.h>
49220137Strasz#include <sys/racct.h>
50220137Strasz#include <sys/resourcevar.h>
51220137Strasz#include <sys/sbuf.h>
52220137Strasz#include <sys/sched.h>
53220137Strasz#include <sys/sdt.h>
54242139Strasz#include <sys/smp.h>
55220137Strasz#include <sys/sx.h>
56242139Strasz#include <sys/sysctl.h>
57220137Strasz#include <sys/sysent.h>
58220137Strasz#include <sys/sysproto.h>
59220137Strasz#include <sys/umtx.h>
60242139Strasz#include <machine/smp.h>
61220137Strasz
62220137Strasz#ifdef RCTL
63220137Strasz#include <sys/rctl.h>
64220137Strasz#endif
65220137Strasz
66220137Strasz#ifdef RACCT
67220137Strasz
68220137StraszFEATURE(racct, "Resource Accounting");
69220137Strasz
70242139Strasz/*
71242139Strasz * Do not block processes that have their %cpu usage <= pcpu_threshold.
72242139Strasz */
73242139Straszstatic int pcpu_threshold = 1;
/*
 * Global switch for resource accounting.  The public entry points in this
 * file return early when it is zero.  The compiled-in default is selected
 * by RACCT_DEFAULT_TO_DISABLED.
 */
74282901Strasz#ifdef RACCT_DEFAULT_TO_DISABLED
75282213Straszint racct_enable = 0;
76282213Strasz#else
77282213Straszint racct_enable = 1;
78282213Strasz#endif
79242139Strasz
/*
 * Export both knobs under the kern.racct sysctl node: "enable" is declared
 * with CTLFLAG_RDTUN and "pcpu_threshold" with CTLFLAG_RW.
 */
80242139StraszSYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting");
81282213StraszSYSCTL_UINT(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable,
82282213Strasz    0, "Enable RACCT/RCTL");
83242139StraszSYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
84242139Strasz    0, "Processes with higher %cpu usage than this value can be throttled.");
85242139Strasz
86242139Strasz/*
87242139Strasz * Number of seconds that must pass before the scheduler %cpu calculations
88242139Strasz * are used.  When a process starts, we compute its %cpu usage by dividing its
89242139Strasz * runtime by the process wall clock time.  After RACCT_PCPU_SECS seconds have
90242139Strasz * passed, we use the value provided by the scheduler.
91242139Strasz */
92242139Strasz#define RACCT_PCPU_SECS		3
93242139Strasz
/*
 * Mutex initialized at boot through MTX_SYSINIT.
 * NOTE(review): presumably the lock taken by RACCT_LOCK()/RACCT_UNLOCK();
 * those macros are defined outside this file -- confirm.
 */
94298414Straszstruct mtx racct_lock;
95220137StraszMTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF);
96220137Strasz
/* UMA zone that racct_create() allocates from and the destroy path frees to. */
97220137Straszstatic uma_zone_t racct_zone;
98220137Strasz
/* Forward declarations for helpers that are called before their definitions. */
99220137Straszstatic void racct_sub_racct(struct racct *dest, const struct racct *src);
100220137Straszstatic void racct_sub_cred_locked(struct ucred *cred, int resource,
101220137Strasz		uint64_t amount);
102220137Straszstatic void racct_add_cred_locked(struct ucred *cred, int resource,
103220137Strasz		uint64_t amount);
104220137Strasz
/*
 * Statically defined tracing (SDT) probes for the "racct" provider.
 *
 * The "rusage" probes are fired by the racct_add*(), racct_set*() and
 * racct_sub*() routines; the "racct" probes trace the life cycle of
 * racct containers (create/destroy/join/leave).  The strings give the
 * argument types reported for each probe.
 */
105220137StraszSDT_PROVIDER_DEFINE(racct);
106292384SmarkjSDT_PROBE_DEFINE3(racct, , rusage, add,
107220137Strasz    "struct proc *", "int", "uint64_t");
108292384SmarkjSDT_PROBE_DEFINE3(racct, , rusage, add__failure,
109220137Strasz    "struct proc *", "int", "uint64_t");
110298414StraszSDT_PROBE_DEFINE3(racct, , rusage, add__buf,
111298414Strasz    "struct proc *", "const struct buf *", "int");
112292384SmarkjSDT_PROBE_DEFINE3(racct, , rusage, add__cred,
113292384Smarkj    "struct ucred *", "int", "uint64_t");
114292384SmarkjSDT_PROBE_DEFINE3(racct, , rusage, add__force,
115292384Smarkj    "struct proc *", "int", "uint64_t");
116292384SmarkjSDT_PROBE_DEFINE3(racct, , rusage, set,
117292384Smarkj    "struct proc *", "int", "uint64_t");
118292384SmarkjSDT_PROBE_DEFINE3(racct, , rusage, set__failure,
119292384Smarkj    "struct proc *", "int", "uint64_t");
120297489StraszSDT_PROBE_DEFINE3(racct, , rusage, set__force,
121297489Strasz    "struct proc *", "int", "uint64_t");
122292384SmarkjSDT_PROBE_DEFINE3(racct, , rusage, sub,
123292384Smarkj    "struct proc *", "int", "uint64_t");
124292384SmarkjSDT_PROBE_DEFINE3(racct, , rusage, sub__cred,
125292384Smarkj    "struct ucred *", "int", "uint64_t");
126292384SmarkjSDT_PROBE_DEFINE1(racct, , racct, create,
127220137Strasz    "struct racct *");
128292384SmarkjSDT_PROBE_DEFINE1(racct, , racct, destroy,
129292384Smarkj    "struct racct *");
130292384SmarkjSDT_PROBE_DEFINE2(racct, , racct, join,
131220137Strasz    "struct racct *", "struct racct *");
132292384SmarkjSDT_PROBE_DEFINE2(racct, , racct, join__failure,
133292384Smarkj    "struct racct *", "struct racct *");
134292384SmarkjSDT_PROBE_DEFINE2(racct, , racct, leave,
135292384Smarkj    "struct racct *", "struct racct *");
136220137Strasz
/*
 * Property flags for every resource, indexed by the RACCT_* resource
 * identifier.
 * NOTE(review): presumably this is the table consulted by the
 * RACCT_IS_*() and RACCT_CAN_DROP() macros used below; those macros are
 * defined outside this file -- confirm.
 */
137220137Straszint racct_types[] = {
138220137Strasz	[RACCT_CPU] =
139224036Strasz		RACCT_IN_MILLIONS,
140220137Strasz	[RACCT_DATA] =
141220137Strasz		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
142220137Strasz	[RACCT_STACK] =
143220137Strasz		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
144220137Strasz	[RACCT_CORE] =
145220137Strasz		RACCT_DENIABLE,
146220137Strasz	[RACCT_RSS] =
147220137Strasz		RACCT_RECLAIMABLE,
148220137Strasz	[RACCT_MEMLOCK] =
149220137Strasz		RACCT_RECLAIMABLE | RACCT_DENIABLE,
150220137Strasz	[RACCT_NPROC] =
151220137Strasz		RACCT_RECLAIMABLE | RACCT_DENIABLE,
152220137Strasz	[RACCT_NOFILE] =
153220137Strasz		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
154220137Strasz	[RACCT_VMEM] =
155220137Strasz		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
156220137Strasz	[RACCT_NPTS] =
157220137Strasz		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
158220137Strasz	[RACCT_SWAP] =
159220137Strasz		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
160220137Strasz	[RACCT_NTHR] =
161220137Strasz		RACCT_RECLAIMABLE | RACCT_DENIABLE,
162220137Strasz	[RACCT_MSGQQUEUED] =
163220137Strasz		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
164220137Strasz	[RACCT_MSGQSIZE] =
165220137Strasz		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
166220137Strasz	[RACCT_NMSGQ] =
167220137Strasz		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
168220137Strasz	[RACCT_NSEM] =
169220137Strasz		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
170220137Strasz	[RACCT_NSEMOP] =
171220137Strasz		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
172220137Strasz	[RACCT_NSHM] =
173220137Strasz		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
174220137Strasz	[RACCT_SHMSIZE] =
175220137Strasz		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
176220137Strasz	[RACCT_WALLCLOCK] =
177242139Strasz		RACCT_IN_MILLIONS,
178242139Strasz	[RACCT_PCTCPU] =
179297633Strasz		RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS,
180297633Strasz	[RACCT_READBPS] =
181297633Strasz		RACCT_DECAYING,
182297633Strasz	[RACCT_WRITEBPS] =
183297633Strasz		RACCT_DECAYING,
184297633Strasz	[RACCT_READIOPS] =
185297633Strasz		RACCT_DECAYING,
186297633Strasz	[RACCT_WRITEIOPS] =
187297633Strasz		RACCT_DECAYING };
188220137Strasz
/*
 * The fraction 0.3 expressed in FSCALE fixed point.  racct_set_locked()
 * multiplies the previous %cpu amount by this factor (and divides by FSCALE)
 * to estimate how much of it is still present in the credential containers.
 */
189242139Straszstatic const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
190242139Strasz
191242139Strasz#ifdef SCHED_4BSD
192242139Strasz/*
193242139Strasz * Contains intermediate values for %cpu calculations to avoid using floating
194242139Strasz * point in the kernel.
195242139Strasz * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20)
196242139Strasz * It is needed only for the 4BSD scheduler, because in ULE ccpu is equal to
197242139Strasz * zero, so the calculations are more straightforward.
198242139Strasz */
/*
 * Indexed by whole seconds.  racct_getpcpu() reads ccpu_exp[1] to project
 * the yet unfinished second and ccpu_exp[swtime] in its divisor, so the
 * table has to cover indices 0 through CCPU_EXP_MAX.
 */
199242139Straszfixpt_t ccpu_exp[] = {
200242139Strasz	[0] = FSCALE * 1,
201242139Strasz	[1] = FSCALE * 0.95122942450071400909,
202242139Strasz	[2] = FSCALE * 0.90483741803595957316,
203242139Strasz	[3] = FSCALE * 0.86070797642505780722,
204242139Strasz	[4] = FSCALE * 0.81873075307798185866,
205242139Strasz	[5] = FSCALE * 0.77880078307140486824,
206242139Strasz	[6] = FSCALE * 0.74081822068171786606,
207242139Strasz	[7] = FSCALE * 0.70468808971871343435,
208242139Strasz	[8] = FSCALE * 0.67032004603563930074,
209242139Strasz	[9] = FSCALE * 0.63762815162177329314,
210242139Strasz	[10] = FSCALE * 0.60653065971263342360,
211242139Strasz	[11] = FSCALE * 0.57694981038048669531,
212242139Strasz	[12] = FSCALE * 0.54881163609402643262,
213242139Strasz	[13] = FSCALE * 0.52204577676101604789,
214242139Strasz	[14] = FSCALE * 0.49658530379140951470,
215242139Strasz	[15] = FSCALE * 0.47236655274101470713,
216242139Strasz	[16] = FSCALE * 0.44932896411722159143,
217242139Strasz	[17] = FSCALE * 0.42741493194872666992,
218242139Strasz	[18] = FSCALE * 0.40656965974059911188,
219242139Strasz	[19] = FSCALE * 0.38674102345450120691,
220242139Strasz	[20] = FSCALE * 0.36787944117144232159,
221242139Strasz	[21] = FSCALE * 0.34993774911115535467,
222242139Strasz	[22] = FSCALE * 0.33287108369807955328,
223242139Strasz	[23] = FSCALE * 0.31663676937905321821,
224242139Strasz	[24] = FSCALE * 0.30119421191220209664,
225242139Strasz	[25] = FSCALE * 0.28650479686019010032,
226242139Strasz	[26] = FSCALE * 0.27253179303401260312,
227242139Strasz	[27] = FSCALE * 0.25924026064589150757,
228242139Strasz	[28] = FSCALE * 0.24659696394160647693,
229242139Strasz	[29] = FSCALE * 0.23457028809379765313,
230242139Strasz	[30] = FSCALE * 0.22313016014842982893,
231242139Strasz	[31] = FSCALE * 0.21224797382674305771,
232242139Strasz	[32] = FSCALE * 0.20189651799465540848,
233242139Strasz	[33] = FSCALE * 0.19204990862075411423,
234242139Strasz	[34] = FSCALE * 0.18268352405273465022,
235242139Strasz	[35] = FSCALE * 0.17377394345044512668,
236242139Strasz	[36] = FSCALE * 0.16529888822158653829,
237242139Strasz	[37] = FSCALE * 0.15723716631362761621,
238242139Strasz	[38] = FSCALE * 0.14956861922263505264,
239242139Strasz	[39] = FSCALE * 0.14227407158651357185,
240242139Strasz	[40] = FSCALE * 0.13533528323661269189,
241242139Strasz	[41] = FSCALE * 0.12873490358780421886,
242242139Strasz	[42] = FSCALE * 0.12245642825298191021,
243242139Strasz	[43] = FSCALE * 0.11648415777349695786,
244242139Strasz	[44] = FSCALE * 0.11080315836233388333,
245242139Strasz	[45] = FSCALE * 0.10539922456186433678,
246242139Strasz	[46] = FSCALE * 0.10025884372280373372,
247242139Strasz	[47] = FSCALE * 0.09536916221554961888,
248242139Strasz	[48] = FSCALE * 0.09071795328941250337,
249242139Strasz	[49] = FSCALE * 0.08629358649937051097,
250242139Strasz	[50] = FSCALE * 0.08208499862389879516,
251242139Strasz	[51] = FSCALE * 0.07808166600115315231,
252242139Strasz	[52] = FSCALE * 0.07427357821433388042,
253242139Strasz	[53] = FSCALE * 0.07065121306042958674,
254242139Strasz	[54] = FSCALE * 0.06720551273974976512,
255242139Strasz	[55] = FSCALE * 0.06392786120670757270,
256242139Strasz	[56] = FSCALE * 0.06081006262521796499,
257242139Strasz	[57] = FSCALE * 0.05784432087483846296,
258242139Strasz	[58] = FSCALE * 0.05502322005640722902,
259242139Strasz	[59] = FSCALE * 0.05233970594843239308,
260242139Strasz	[60] = FSCALE * 0.04978706836786394297,
261242139Strasz	[61] = FSCALE * 0.04735892439114092119,
262242139Strasz	[62] = FSCALE * 0.04504920239355780606,
263242139Strasz	[63] = FSCALE * 0.04285212686704017991,
264242139Strasz	[64] = FSCALE * 0.04076220397836621516,
265242139Strasz	[65] = FSCALE * 0.03877420783172200988,
266242139Strasz	[66] = FSCALE * 0.03688316740124000544,
267242139Strasz	[67] = FSCALE * 0.03508435410084502588,
268242139Strasz	[68] = FSCALE * 0.03337326996032607948,
269242139Strasz	[69] = FSCALE * 0.03174563637806794323,
270242139Strasz	[70] = FSCALE * 0.03019738342231850073,
271242139Strasz	[71] = FSCALE * 0.02872463965423942912,
272242139Strasz	[72] = FSCALE * 0.02732372244729256080,
273242139Strasz	[73] = FSCALE * 0.02599112877875534358,
274242139Strasz	[74] = FSCALE * 0.02472352647033939120,
275242139Strasz	[75] = FSCALE * 0.02351774585600910823,
276242139Strasz	[76] = FSCALE * 0.02237077185616559577,
277242139Strasz	[77] = FSCALE * 0.02127973643837716938,
278242139Strasz	[78] = FSCALE * 0.02024191144580438847,
279242139Strasz	[79] = FSCALE * 0.01925470177538692429,
280242139Strasz	[80] = FSCALE * 0.01831563888873418029,
281242139Strasz	[81] = FSCALE * 0.01742237463949351138,
282242139Strasz	[82] = FSCALE * 0.01657267540176124754,
283242139Strasz	[83] = FSCALE * 0.01576441648485449082,
284242139Strasz	[84] = FSCALE * 0.01499557682047770621,
285242139Strasz	[85] = FSCALE * 0.01426423390899925527,
286242139Strasz	[86] = FSCALE * 0.01356855901220093175,
287242139Strasz	[87] = FSCALE * 0.01290681258047986886,
288242139Strasz	[88] = FSCALE * 0.01227733990306844117,
289242139Strasz	[89] = FSCALE * 0.01167856697039544521,
290242139Strasz	[90] = FSCALE * 0.01110899653824230649,
291242139Strasz	[91] = FSCALE * 0.01056720438385265337,
292242139Strasz	[92] = FSCALE * 0.01005183574463358164,
293242139Strasz	[93] = FSCALE * 0.00956160193054350793,
294242139Strasz	[94] = FSCALE * 0.00909527710169581709,
295242139Strasz	[95] = FSCALE * 0.00865169520312063417,
296242139Strasz	[96] = FSCALE * 0.00822974704902002884,
297242139Strasz	[97] = FSCALE * 0.00782837754922577143,
298242139Strasz	[98] = FSCALE * 0.00744658307092434051,
299242139Strasz	[99] = FSCALE * 0.00708340892905212004,
300242139Strasz	[100] = FSCALE * 0.00673794699908546709,
301242139Strasz	[101] = FSCALE * 0.00640933344625638184,
302242139Strasz	[102] = FSCALE * 0.00609674656551563610,
303242139Strasz	[103] = FSCALE * 0.00579940472684214321,
304242139Strasz	[104] = FSCALE * 0.00551656442076077241,
305242139Strasz	[105] = FSCALE * 0.00524751839918138427,
306242139Strasz	[106] = FSCALE * 0.00499159390691021621,
307242139Strasz	[107] = FSCALE * 0.00474815099941147558,
308242139Strasz	[108] = FSCALE * 0.00451658094261266798,
309242139Strasz	[109] = FSCALE * 0.00429630469075234057,
310242139Strasz	[110] = FSCALE * 0.00408677143846406699,
311242139Strasz};
312242139Strasz#endif
313242139Strasz
/*
 * Largest index present in ccpu_exp[]; racct_getpcpu() only uses the table
 * while swtime <= CCPU_EXP_MAX.
 */
314242139Strasz#define	CCPU_EXP_MAX	110
315242139Strasz
316242139Strasz/*
317242139Strasz * This function is analogical to the getpcpu() function in the ps(1) command.
318242139Strasz * They should both calculate in the same way so that the racct %cpu
319242139Strasz * calculations are consistent with the values showed by the ps(1) tool.
320242139Strasz * The calculations are more complex in the 4BSD scheduler because of the value
321242139Strasz * of the ccpu variable.  In ULE it is defined to be zero which saves us some
322242139Strasz * work.
323242139Strasz */
324242139Straszstatic uint64_t
325242139Straszracct_getpcpu(struct proc *p, u_int pcpu)
326242139Strasz{
327242139Strasz	u_int swtime;
328242139Strasz#ifdef SCHED_4BSD
329242139Strasz	fixpt_t pctcpu, pctcpu_next;
330242139Strasz#endif
331242139Strasz#ifdef SMP
332242139Strasz	struct pcpu *pc;
333242139Strasz	int found;
334242139Strasz#endif
335242139Strasz	fixpt_t p_pctcpu;
336242139Strasz	struct thread *td;
337242139Strasz
338282213Strasz	ASSERT_RACCT_ENABLED();
339282213Strasz
340242139Strasz	/*
341242139Strasz	 * If the process is swapped out, we count its %cpu usage as zero.
342242139Strasz	 * This behaviour is consistent with the userland ps(1) tool.
343242139Strasz	 */
344242139Strasz	if ((p->p_flag & P_INMEM) == 0)
345242139Strasz		return (0);
346242139Strasz	swtime = (ticks - p->p_swtick) / hz;
347242139Strasz
348242139Strasz	/*
349242139Strasz	 * For short-lived processes, the sched_pctcpu() returns small
350242139Strasz	 * values even for cpu intensive processes.  Therefore we use
351242139Strasz	 * our own estimate in this case.
352242139Strasz	 */
353242139Strasz	if (swtime < RACCT_PCPU_SECS)
354242139Strasz		return (pcpu);
355242139Strasz
356242139Strasz	p_pctcpu = 0;
357242139Strasz	FOREACH_THREAD_IN_PROC(p, td) {
358242139Strasz		if (td == PCPU_GET(idlethread))
359242139Strasz			continue;
360242139Strasz#ifdef SMP
361242139Strasz		found = 0;
362242139Strasz		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
363242139Strasz			if (td == pc->pc_idlethread) {
364242139Strasz				found = 1;
365242139Strasz				break;
366242139Strasz			}
367242139Strasz		}
368242139Strasz		if (found)
369242139Strasz			continue;
370242139Strasz#endif
371242139Strasz		thread_lock(td);
372242139Strasz#ifdef SCHED_4BSD
373242139Strasz		pctcpu = sched_pctcpu(td);
374242139Strasz		/* Count also the yet unfinished second. */
375242139Strasz		pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT;
376242139Strasz		pctcpu_next += sched_pctcpu_delta(td);
377242139Strasz		p_pctcpu += max(pctcpu, pctcpu_next);
378242139Strasz#else
379242139Strasz		/*
380242139Strasz		 * In ULE the %cpu statistics are updated on every
381242139Strasz		 * sched_pctcpu() call.  So special calculations to
382242139Strasz		 * account for the latest (unfinished) second are
383242139Strasz		 * not needed.
384242139Strasz		 */
385242139Strasz		p_pctcpu += sched_pctcpu(td);
386242139Strasz#endif
387242139Strasz		thread_unlock(td);
388242139Strasz	}
389242139Strasz
390242139Strasz#ifdef SCHED_4BSD
391242139Strasz	if (swtime <= CCPU_EXP_MAX)
392242139Strasz		return ((100 * (uint64_t)p_pctcpu * 1000000) /
393242139Strasz		    (FSCALE - ccpu_exp[swtime]));
394242139Strasz#endif
395242139Strasz
396242139Strasz	return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE);
397242139Strasz}
398242139Strasz
399220137Straszstatic void
400220137Straszracct_add_racct(struct racct *dest, const struct racct *src)
401220137Strasz{
402220137Strasz	int i;
403220137Strasz
404282213Strasz	ASSERT_RACCT_ENABLED();
405297578Strasz	RACCT_LOCK_ASSERT();
406220137Strasz
407220137Strasz	/*
408220137Strasz	 * Update resource usage in dest.
409220137Strasz	 */
410220137Strasz	for (i = 0; i <= RACCT_MAX; i++) {
411220137Strasz		KASSERT(dest->r_resources[i] >= 0,
412243088Strasz		    ("%s: resource %d propagation meltdown: dest < 0",
413243088Strasz		    __func__, i));
414220137Strasz		KASSERT(src->r_resources[i] >= 0,
415243088Strasz		    ("%s: resource %d propagation meltdown: src < 0",
416243088Strasz		    __func__, i));
417220137Strasz		dest->r_resources[i] += src->r_resources[i];
418220137Strasz	}
419220137Strasz}
420220137Strasz
421220137Straszstatic void
422220137Straszracct_sub_racct(struct racct *dest, const struct racct *src)
423220137Strasz{
424220137Strasz	int i;
425220137Strasz
426282213Strasz	ASSERT_RACCT_ENABLED();
427297578Strasz	RACCT_LOCK_ASSERT();
428220137Strasz
429220137Strasz	/*
430220137Strasz	 * Update resource usage in dest.
431220137Strasz	 */
432220137Strasz	for (i = 0; i <= RACCT_MAX; i++) {
433243070Strasz		if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) {
434220137Strasz			KASSERT(dest->r_resources[i] >= 0,
435243088Strasz			    ("%s: resource %d propagation meltdown: dest < 0",
436243088Strasz			    __func__, i));
437220137Strasz			KASSERT(src->r_resources[i] >= 0,
438243088Strasz			    ("%s: resource %d propagation meltdown: src < 0",
439243088Strasz			    __func__, i));
440220137Strasz			KASSERT(src->r_resources[i] <= dest->r_resources[i],
441243088Strasz			    ("%s: resource %d propagation meltdown: src > dest",
442243088Strasz			    __func__, i));
443220137Strasz		}
444242139Strasz		if (RACCT_CAN_DROP(i)) {
445220137Strasz			dest->r_resources[i] -= src->r_resources[i];
446220137Strasz			if (dest->r_resources[i] < 0) {
447243070Strasz				KASSERT(RACCT_IS_SLOPPY(i) ||
448243070Strasz				    RACCT_IS_DECAYING(i),
449243088Strasz				    ("%s: resource %d usage < 0", __func__, i));
450220137Strasz				dest->r_resources[i] = 0;
451220137Strasz			}
452220137Strasz		}
453220137Strasz	}
454220137Strasz}
455220137Strasz
456220137Straszvoid
457220137Straszracct_create(struct racct **racctp)
458220137Strasz{
459220137Strasz
460282213Strasz	if (!racct_enable)
461282213Strasz		return;
462282213Strasz
463292384Smarkj	SDT_PROBE1(racct, , racct, create, racctp);
464220137Strasz
465220137Strasz	KASSERT(*racctp == NULL, ("racct already allocated"));
466220137Strasz
467220137Strasz	*racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO);
468220137Strasz}
469220137Strasz
470220137Straszstatic void
471220137Straszracct_destroy_locked(struct racct **racctp)
472220137Strasz{
473298045Strasz	struct racct *racct;
474220137Strasz	int i;
475220137Strasz
476282213Strasz	ASSERT_RACCT_ENABLED();
477282213Strasz
478292384Smarkj	SDT_PROBE1(racct, , racct, destroy, racctp);
479220137Strasz
480297578Strasz	RACCT_LOCK_ASSERT();
481220137Strasz	KASSERT(racctp != NULL, ("NULL racctp"));
482220137Strasz	KASSERT(*racctp != NULL, ("NULL racct"));
483220137Strasz
484220137Strasz	racct = *racctp;
485220137Strasz
486220137Strasz	for (i = 0; i <= RACCT_MAX; i++) {
487223844Strasz		if (RACCT_IS_SLOPPY(i))
488220137Strasz			continue;
489223844Strasz		if (!RACCT_IS_RECLAIMABLE(i))
490220137Strasz			continue;
491220137Strasz		KASSERT(racct->r_resources[i] == 0,
492220137Strasz		    ("destroying non-empty racct: "
493220137Strasz		    "%ju allocated for resource %d\n",
494220137Strasz		    racct->r_resources[i], i));
495220137Strasz	}
496220137Strasz	uma_zfree(racct_zone, racct);
497220137Strasz	*racctp = NULL;
498220137Strasz}
499220137Strasz
500220137Straszvoid
501220137Straszracct_destroy(struct racct **racct)
502220137Strasz{
503220137Strasz
504282213Strasz	if (!racct_enable)
505282213Strasz		return;
506282213Strasz
507297578Strasz	RACCT_LOCK();
508220137Strasz	racct_destroy_locked(racct);
509297578Strasz	RACCT_UNLOCK();
510220137Strasz}
511220137Strasz
512220137Strasz/*
513292162Strasz * Increase consumption of 'resource' by 'amount' for 'racct',
514292162Strasz * but not its parents.  Differently from other cases, 'amount' here
515220137Strasz * may be less than zero.
516220137Strasz */
517220137Straszstatic void
518284378Sjlhracct_adjust_resource(struct racct *racct, int resource,
519292161Strasz    int64_t amount)
520220137Strasz{
521220137Strasz
522282213Strasz	ASSERT_RACCT_ENABLED();
523297578Strasz	RACCT_LOCK_ASSERT();
524220137Strasz	KASSERT(racct != NULL, ("NULL racct"));
525220137Strasz
526220137Strasz	racct->r_resources[resource] += amount;
527220137Strasz	if (racct->r_resources[resource] < 0) {
528242139Strasz		KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource),
529243088Strasz		    ("%s: resource %d usage < 0", __func__, resource));
530220137Strasz		racct->r_resources[resource] = 0;
531220137Strasz	}
532242139Strasz
533242139Strasz	/*
534242139Strasz	 * There are some cases where the racct %cpu resource would grow
535290662Sjpaetzel	 * beyond 100% per core.  For example in racct_proc_exit() we add
536290662Sjpaetzel	 * the process %cpu usage to the ucred racct containers.  If too
537290662Sjpaetzel	 * many processes terminated in a short time span, the ucred %cpu
538290662Sjpaetzel	 * resource could grow too much.  Also, the 4BSD scheduler sometimes
539290662Sjpaetzel	 * returns for a thread more than 100% cpu usage. So we set a sane
540290662Sjpaetzel	 * boundary here to 100% * the maxumum number of CPUs.
541242139Strasz	 */
542242139Strasz	if ((resource == RACCT_PCTCPU) &&
543290662Sjpaetzel	    (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU))
544290662Sjpaetzel		racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU;
545220137Strasz}
546220137Strasz
547225944Straszstatic int
548297490Straszracct_add_locked(struct proc *p, int resource, uint64_t amount, int force)
549220137Strasz{
550220137Strasz#ifdef RCTL
551220137Strasz	int error;
552220137Strasz#endif
553220137Strasz
554282213Strasz	ASSERT_RACCT_ENABLED();
555282213Strasz
556220137Strasz	/*
557220137Strasz	 * We need proc lock to dereference p->p_ucred.
558220137Strasz	 */
559220137Strasz	PROC_LOCK_ASSERT(p, MA_OWNED);
560220137Strasz
561220137Strasz#ifdef RCTL
562297492Strasz	error = rctl_enforce(p, resource, amount);
563297492Strasz	if (error && !force && RACCT_IS_DENIABLE(resource)) {
564297492Strasz		SDT_PROBE3(racct, , rusage, add__failure, p, resource, amount);
565297492Strasz		return (error);
566220137Strasz	}
567220137Strasz#endif
568284378Sjlh	racct_adjust_resource(p->p_racct, resource, amount);
569220137Strasz	racct_add_cred_locked(p->p_ucred, resource, amount);
570220137Strasz
571220137Strasz	return (0);
572220137Strasz}
573220137Strasz
574225944Strasz/*
575225944Strasz * Increase allocation of 'resource' by 'amount' for process 'p'.
576225944Strasz * Return 0 if it's below limits, or errno, if it's not.
577225944Strasz */
578225944Straszint
579225944Straszracct_add(struct proc *p, int resource, uint64_t amount)
580225944Strasz{
581225944Strasz	int error;
582225944Strasz
583282213Strasz	if (!racct_enable)
584282213Strasz		return (0);
585282213Strasz
586297490Strasz	SDT_PROBE3(racct, , rusage, add, p, resource, amount);
587297490Strasz
588297578Strasz	RACCT_LOCK();
589297490Strasz	error = racct_add_locked(p, resource, amount, 0);
590297578Strasz	RACCT_UNLOCK();
591225944Strasz	return (error);
592225944Strasz}
593225944Strasz
594297491Strasz/*
595297491Strasz * Increase allocation of 'resource' by 'amount' for process 'p'.
596297491Strasz * Doesn't check for limits and never fails.
597297491Strasz */
598297491Straszvoid
599297491Straszracct_add_force(struct proc *p, int resource, uint64_t amount)
600297491Strasz{
601297491Strasz
602297491Strasz	if (!racct_enable)
603297491Strasz		return;
604297491Strasz
605297491Strasz	SDT_PROBE3(racct, , rusage, add__force, p, resource, amount);
606297491Strasz
607297578Strasz	RACCT_LOCK();
608297491Strasz	racct_add_locked(p, resource, amount, 1);
609297578Strasz	RACCT_UNLOCK();
610297491Strasz}
611297491Strasz
612220137Straszstatic void
613220137Straszracct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount)
614220137Strasz{
615220137Strasz	struct prison *pr;
616220137Strasz
617282213Strasz	ASSERT_RACCT_ENABLED();
618282213Strasz
619284378Sjlh	racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, amount);
620220137Strasz	for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
621284378Sjlh		racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource,
622221362Strasz		    amount);
623284378Sjlh	racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, amount);
624220137Strasz}
625220137Strasz
626220137Strasz/*
627220137Strasz * Increase allocation of 'resource' by 'amount' for credential 'cred'.
628220137Strasz * Doesn't check for limits and never fails.
629220137Strasz */
630220137Straszvoid
631220137Straszracct_add_cred(struct ucred *cred, int resource, uint64_t amount)
632220137Strasz{
633220137Strasz
634282213Strasz	if (!racct_enable)
635282213Strasz		return;
636282213Strasz
637298414Strasz	SDT_PROBE3(racct, , rusage, add__cred, cred, resource, amount);
638298414Strasz
639297578Strasz	RACCT_LOCK();
640220137Strasz	racct_add_cred_locked(cred, resource, amount);
641297578Strasz	RACCT_UNLOCK();
642220137Strasz}
643220137Strasz
644297633Strasz/*
645297633Strasz * Account for disk IO resource consumption.  Checks for limits,
646297633Strasz * but never fails, due to disk limits being undeniable.
647297633Strasz */
648297633Straszvoid
649297633Straszracct_add_buf(struct proc *p, const struct buf *bp, int is_write)
650297633Strasz{
651297633Strasz
652297633Strasz	ASSERT_RACCT_ENABLED();
653297633Strasz	PROC_LOCK_ASSERT(p, MA_OWNED);
654297633Strasz
655298414Strasz	SDT_PROBE3(racct, , rusage, add__buf, p, bp, is_write);
656298414Strasz
657297633Strasz	RACCT_LOCK();
658297633Strasz	if (is_write) {
659297633Strasz		racct_add_locked(curproc, RACCT_WRITEBPS, bp->b_bcount, 1);
660297633Strasz		racct_add_locked(curproc, RACCT_WRITEIOPS, 1, 1);
661297633Strasz	} else {
662297633Strasz		racct_add_locked(curproc, RACCT_READBPS, bp->b_bcount, 1);
663297633Strasz		racct_add_locked(curproc, RACCT_READIOPS, 1, 1);
664297633Strasz	}
665297633Strasz	RACCT_UNLOCK();
666297633Strasz}
667297633Strasz
668220137Straszstatic int
669297489Straszracct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
670220137Strasz{
671298045Strasz	int64_t old_amount, decayed_amount, diff_proc, diff_cred;
672220137Strasz#ifdef RCTL
673220137Strasz	int error;
674220137Strasz#endif
675220137Strasz
676282213Strasz	ASSERT_RACCT_ENABLED();
677282213Strasz
678220137Strasz	/*
679220137Strasz	 * We need proc lock to dereference p->p_ucred.
680220137Strasz	 */
681220137Strasz	PROC_LOCK_ASSERT(p, MA_OWNED);
682220137Strasz
683242139Strasz	old_amount = p->p_racct->r_resources[resource];
684242139Strasz	/*
685242139Strasz	 * The diffs may be negative.
686242139Strasz	 */
687242139Strasz	diff_proc = amount - old_amount;
688297633Strasz	if (resource == RACCT_PCTCPU) {
689242139Strasz		/*
690242139Strasz		 * Resources in per-credential racct containers may decay.
691242139Strasz		 * If this is the case, we need to calculate the difference
692242139Strasz		 * between the new amount and the proportional value of the
693242139Strasz		 * old amount that has decayed in the ucred racct containers.
694242139Strasz		 */
695242139Strasz		decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
696242139Strasz		diff_cred = amount - decayed_amount;
697242139Strasz	} else
698242139Strasz		diff_cred = diff_proc;
699220137Strasz#ifdef notyet
700242139Strasz	KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
701243088Strasz	    ("%s: usage of non-droppable resource %d dropping", __func__,
702220137Strasz	     resource));
703220137Strasz#endif
704220137Strasz#ifdef RCTL
705297492Strasz	if (diff_proc > 0) {
706242139Strasz		error = rctl_enforce(p, resource, diff_proc);
707297492Strasz		if (error && !force && RACCT_IS_DENIABLE(resource)) {
708292384Smarkj			SDT_PROBE3(racct, , rusage, set__failure, p, resource,
709292384Smarkj			    amount);
710220137Strasz			return (error);
711220137Strasz		}
712220137Strasz	}
713220137Strasz#endif
714284378Sjlh	racct_adjust_resource(p->p_racct, resource, diff_proc);
715242139Strasz	if (diff_cred > 0)
716242139Strasz		racct_add_cred_locked(p->p_ucred, resource, diff_cred);
717242139Strasz	else if (diff_cred < 0)
718242139Strasz		racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
719220137Strasz
720220137Strasz	return (0);
721220137Strasz}
722220137Strasz
723220137Strasz/*
724220137Strasz * Set allocation of 'resource' to 'amount' for process 'p'.
725220137Strasz * Return 0 if it's below limits, or errno, if it's not.
726220137Strasz *
727220137Strasz * Note that decreasing the allocation always returns 0,
728220137Strasz * even if it's above the limit.
729220137Strasz */
730220137Straszint
731220137Straszracct_set(struct proc *p, int resource, uint64_t amount)
732220137Strasz{
733220137Strasz	int error;
734220137Strasz
735282213Strasz	if (!racct_enable)
736282213Strasz		return (0);
737282213Strasz
738297489Strasz	SDT_PROBE3(racct, , rusage, set__force, p, resource, amount);
739297489Strasz
740297578Strasz	RACCT_LOCK();
741297489Strasz	error = racct_set_locked(p, resource, amount, 0);
742297578Strasz	RACCT_UNLOCK();
743220137Strasz	return (error);
744220137Strasz}
745220137Strasz
746297491Straszvoid
747297491Straszracct_set_force(struct proc *p, int resource, uint64_t amount)
748297491Strasz{
749297491Strasz
750297491Strasz	if (!racct_enable)
751297491Strasz		return;
752297491Strasz
753297491Strasz	SDT_PROBE3(racct, , rusage, set, p, resource, amount);
754297491Strasz
755297578Strasz	RACCT_LOCK();
756297491Strasz	racct_set_locked(p, resource, amount, 1);
757297578Strasz	RACCT_UNLOCK();
758297491Strasz}
759297491Strasz
/*
 * Report how much of 'resource' the process 'p' may keep allocated.
 * Allocating more than that would be denied, unless the resource is
 * marked undeniable.  The amount already allocated is not taken into
 * account.
 *
 * UINT64_MAX is returned when the kernel is built without RCTL or when
 * racct is disabled.
 */
uint64_t
racct_get_limit(struct proc *p, int resource)
{
	uint64_t limit;

	limit = UINT64_MAX;
#ifdef RCTL
	if (racct_enable) {
		/* The RCTL query is made with the racct lock held. */
		RACCT_LOCK();
		limit = rctl_get_limit(p, resource);
		RACCT_UNLOCK();
	}
#endif

	return (limit);
}
785220137Strasz
/*
 * Report how much of 'resource' the process 'p' may keep allocated.
 * Allocating more than that would be denied, unless the resource is
 * marked undeniable.  Unlike racct_get_limit(), the amount already
 * allocated does matter here.
 *
 * UINT64_MAX is returned when the kernel is built without RCTL or when
 * racct is disabled.
 */
uint64_t
racct_get_available(struct proc *p, int resource)
{
	uint64_t remaining;

	remaining = UINT64_MAX;
#ifdef RCTL
	if (racct_enable) {
		/* The RCTL query is made with the racct lock held. */
		RACCT_LOCK();
		remaining = rctl_get_available(p, resource);
		RACCT_UNLOCK();
	}
#endif

	return (remaining);
}
811220137Strasz
/*
 * Returns amount of the %cpu resource that process 'p' can add to its %cpu
 * utilization.  Adding more than that would lead to the process being
 * throttled.
 *
 * The result is signed: the caller in this file treats a value that is
 * zero or negative as "nothing left".  INT64_MAX is returned when the
 * kernel is built without RCTL.
 */
static int64_t
racct_pcpu_available(struct proc *p)
{
#ifdef RCTL
	/*
	 * Keep the local signed, matching the return type, so the value
	 * is not round-tripped through an unsigned type on its way out.
	 */
	int64_t available;

	ASSERT_RACCT_ENABLED();

	RACCT_LOCK();
	available = rctl_pcpu_available(p);
	RACCT_UNLOCK();

	return (available);
#else

	return (INT64_MAX);
#endif
}
835242139Strasz
836242139Strasz/*
837220137Strasz * Decrease allocation of 'resource' by 'amount' for process 'p'.
838220137Strasz */
839220137Straszvoid
840220137Straszracct_sub(struct proc *p, int resource, uint64_t amount)
841220137Strasz{
842220137Strasz
843282213Strasz	if (!racct_enable)
844282213Strasz		return;
845282213Strasz
846292384Smarkj	SDT_PROBE3(racct, , rusage, sub, p, resource, amount);
847220137Strasz
848220137Strasz	/*
849220137Strasz	 * We need proc lock to dereference p->p_ucred.
850220137Strasz	 */
851220137Strasz	PROC_LOCK_ASSERT(p, MA_OWNED);
852242139Strasz	KASSERT(RACCT_CAN_DROP(resource),
853243088Strasz	    ("%s: called for non-droppable resource %d", __func__, resource));
854220137Strasz
855297578Strasz	RACCT_LOCK();
856220137Strasz	KASSERT(amount <= p->p_racct->r_resources[resource],
857243088Strasz	    ("%s: freeing %ju of resource %d, which is more "
858243088Strasz	     "than allocated %jd for %s (pid %d)", __func__, amount, resource,
859220137Strasz	    (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid));
860220137Strasz
861284378Sjlh	racct_adjust_resource(p->p_racct, resource, -amount);
862220137Strasz	racct_sub_cred_locked(p->p_ucred, resource, amount);
863297578Strasz	RACCT_UNLOCK();
864220137Strasz}
865220137Strasz
866220137Straszstatic void
867220137Straszracct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount)
868220137Strasz{
869220137Strasz	struct prison *pr;
870220137Strasz
871282213Strasz	ASSERT_RACCT_ENABLED();
872282213Strasz
873284378Sjlh	racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, -amount);
874220137Strasz	for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
875284378Sjlh		racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource,
876221362Strasz		    -amount);
877284378Sjlh	racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, -amount);
878220137Strasz}
879220137Strasz
880220137Strasz/*
881220137Strasz * Decrease allocation of 'resource' by 'amount' for credential 'cred'.
882220137Strasz */
883220137Straszvoid
884220137Straszracct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
885220137Strasz{
886220137Strasz
887282213Strasz	if (!racct_enable)
888282213Strasz		return;
889282213Strasz
890298414Strasz	SDT_PROBE3(racct, , rusage, sub__cred, cred, resource, amount);
891298414Strasz
892298414Strasz#ifdef notyet
893298414Strasz	KASSERT(RACCT_CAN_DROP(resource),
894298414Strasz	    ("%s: called for resource %d which can not drop", __func__,
895298414Strasz	     resource));
896298414Strasz#endif
897298414Strasz
898297578Strasz	RACCT_LOCK();
899220137Strasz	racct_sub_cred_locked(cred, resource, amount);
900297578Strasz	RACCT_UNLOCK();
901220137Strasz}
902220137Strasz
903220137Strasz/*
904220137Strasz * Inherit resource usage information from the parent process.
905220137Strasz */
906220137Straszint
907220137Straszracct_proc_fork(struct proc *parent, struct proc *child)
908220137Strasz{
909220137Strasz	int i, error = 0;
910220137Strasz
911282213Strasz	if (!racct_enable)
912282213Strasz		return (0);
913282213Strasz
914220137Strasz	/*
915220137Strasz	 * Create racct for the child process.
916220137Strasz	 */
917220137Strasz	racct_create(&child->p_racct);
918220137Strasz
919220137Strasz	PROC_LOCK(parent);
920220137Strasz	PROC_LOCK(child);
921297578Strasz	RACCT_LOCK();
922220137Strasz
923225981Strasz#ifdef RCTL
924225981Strasz	error = rctl_proc_fork(parent, child);
925225981Strasz	if (error != 0)
926225981Strasz		goto out;
927225981Strasz#endif
928225981Strasz
929242139Strasz	/* Init process cpu time. */
930242139Strasz	child->p_prev_runtime = 0;
931242139Strasz	child->p_throttled = 0;
932242139Strasz
933220137Strasz	/*
934220137Strasz	 * Inherit resource usage.
935220137Strasz	 */
936220137Strasz	for (i = 0; i <= RACCT_MAX; i++) {
937220137Strasz		if (parent->p_racct->r_resources[i] == 0 ||
938223844Strasz		    !RACCT_IS_INHERITABLE(i))
939220137Strasz			continue;
940220137Strasz
941220137Strasz		error = racct_set_locked(child, i,
942297489Strasz		    parent->p_racct->r_resources[i], 0);
943225938Strasz		if (error != 0)
944220137Strasz			goto out;
945220137Strasz	}
946220137Strasz
947297490Strasz	error = racct_add_locked(child, RACCT_NPROC, 1, 0);
948297490Strasz	error += racct_add_locked(child, RACCT_NTHR, 1, 0);
949225944Strasz
950220137Straszout:
951297578Strasz	RACCT_UNLOCK();
952220137Strasz	PROC_UNLOCK(child);
953220137Strasz	PROC_UNLOCK(parent);
954220137Strasz
955235787Strasz	if (error != 0)
956235787Strasz		racct_proc_exit(child);
957235787Strasz
958220137Strasz	return (error);
959220137Strasz}
960220137Strasz
961225940Strasz/*
962225940Strasz * Called at the end of fork1(), to handle rules that require the process
963225940Strasz * to be fully initialized.
964225940Strasz */
965220137Straszvoid
966225940Straszracct_proc_fork_done(struct proc *child)
967225940Strasz{
968225940Strasz
969282213Strasz	if (!racct_enable)
970282213Strasz		return;
971282213Strasz
972298414Strasz	PROC_LOCK_ASSERT(child, MA_OWNED);
973298414Strasz
974298414Strasz#ifdef RCTL
975297578Strasz	RACCT_LOCK();
976225940Strasz	rctl_enforce(child, RACCT_NPROC, 0);
977225940Strasz	rctl_enforce(child, RACCT_NTHR, 0);
978297578Strasz	RACCT_UNLOCK();
979225940Strasz#endif
980225940Strasz}
981225940Strasz
982225940Straszvoid
983220137Straszracct_proc_exit(struct proc *p)
984220137Strasz{
985298045Strasz	struct timeval wallclock;
986298045Strasz	uint64_t pct_estimate, pct, runtime;
987225364Strasz	int i;
988220137Strasz
989282213Strasz	if (!racct_enable)
990282213Strasz		return;
991282213Strasz
992220137Strasz	PROC_LOCK(p);
993220137Strasz	/*
994220137Strasz	 * We don't need to calculate rux, proc_reap() has already done this.
995220137Strasz	 */
996220137Strasz	runtime = cputick2usec(p->p_rux.rux_runtime);
997220137Strasz#ifdef notyet
998220137Strasz	KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime"));
999220137Strasz#else
1000220137Strasz	if (runtime < p->p_prev_runtime)
1001220137Strasz		runtime = p->p_prev_runtime;
1002220137Strasz#endif
1003242139Strasz	microuptime(&wallclock);
1004242139Strasz	timevalsub(&wallclock, &p->p_stats->p_start);
1005242957Strasz	if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
1006242957Strasz		pct_estimate = (1000000 * runtime * 100) /
1007242957Strasz		    ((uint64_t)wallclock.tv_sec * 1000000 +
1008242957Strasz		    wallclock.tv_usec);
1009242957Strasz	} else
1010242957Strasz		pct_estimate = 0;
1011242139Strasz	pct = racct_getpcpu(p, pct_estimate);
1012242139Strasz
1013297578Strasz	RACCT_LOCK();
1014297489Strasz	racct_set_locked(p, RACCT_CPU, runtime, 0);
1015242139Strasz	racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);
1016220137Strasz
1017225364Strasz	for (i = 0; i <= RACCT_MAX; i++) {
1018225364Strasz		if (p->p_racct->r_resources[i] == 0)
1019225364Strasz			continue;
1020225364Strasz	    	if (!RACCT_IS_RECLAIMABLE(i))
1021225364Strasz			continue;
1022297489Strasz		racct_set_locked(p, i, 0, 0);
1023225364Strasz	}
1024225364Strasz
1025220137Strasz#ifdef RCTL
1026220137Strasz	rctl_racct_release(p->p_racct);
1027220137Strasz#endif
1028298414Strasz	racct_destroy_locked(&p->p_racct);
1029298414Strasz	RACCT_UNLOCK();
1030298414Strasz	PROC_UNLOCK(p);
1031220137Strasz}
1032220137Strasz
1033220137Strasz/*
1034220137Strasz * Called after credentials change, to move resource utilisation
1035220137Strasz * between raccts.
1036220137Strasz */
1037220137Straszvoid
1038220137Straszracct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
1039220137Strasz    struct ucred *newcred)
1040220137Strasz{
1041220137Strasz	struct uidinfo *olduip, *newuip;
1042220137Strasz	struct loginclass *oldlc, *newlc;
1043220137Strasz	struct prison *oldpr, *newpr, *pr;
1044220137Strasz
1045282213Strasz	if (!racct_enable)
1046282213Strasz		return;
1047282213Strasz
1048220137Strasz	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
1049220137Strasz
1050220137Strasz	newuip = newcred->cr_ruidinfo;
1051220137Strasz	olduip = oldcred->cr_ruidinfo;
1052220137Strasz	newlc = newcred->cr_loginclass;
1053220137Strasz	oldlc = oldcred->cr_loginclass;
1054220137Strasz	newpr = newcred->cr_prison;
1055220137Strasz	oldpr = oldcred->cr_prison;
1056220137Strasz
1057297578Strasz	RACCT_LOCK();
1058220137Strasz	if (newuip != olduip) {
1059220137Strasz		racct_sub_racct(olduip->ui_racct, p->p_racct);
1060220137Strasz		racct_add_racct(newuip->ui_racct, p->p_racct);
1061220137Strasz	}
1062220137Strasz	if (newlc != oldlc) {
1063220137Strasz		racct_sub_racct(oldlc->lc_racct, p->p_racct);
1064220137Strasz		racct_add_racct(newlc->lc_racct, p->p_racct);
1065220137Strasz	}
1066220137Strasz	if (newpr != oldpr) {
1067220137Strasz		for (pr = oldpr; pr != NULL; pr = pr->pr_parent)
1068221362Strasz			racct_sub_racct(pr->pr_prison_racct->prr_racct,
1069221362Strasz			    p->p_racct);
1070220137Strasz		for (pr = newpr; pr != NULL; pr = pr->pr_parent)
1071221362Strasz			racct_add_racct(pr->pr_prison_racct->prr_racct,
1072221362Strasz			    p->p_racct);
1073220137Strasz	}
1074297578Strasz	RACCT_UNLOCK();
1075220137Strasz
1076220137Strasz#ifdef RCTL
1077220137Strasz	rctl_proc_ucred_changed(p, newcred);
1078220137Strasz#endif
1079220137Strasz}
1080220137Strasz
/*
 * Transfer the usage recorded in 'src' to 'dest': 'src' is first added
 * to 'dest' and then subtracted from itself, all under the racct lock.
 */
void
racct_move(struct racct *dest, struct racct *src)
{

	ASSERT_RACCT_ENABLED();

	RACCT_LOCK();
	racct_add_racct(dest, src);
	racct_sub_racct(src, src);
	RACCT_UNLOCK();
}
1092232598Strasz
1093297633Strasz/*
1094297633Strasz * Make the process sleep in userret() for 'timeout' ticks.  Setting
1095297633Strasz * timeout to -1 makes it sleep until woken up by racct_proc_wakeup().
1096297633Strasz */
1097297633Straszvoid
1098297633Straszracct_proc_throttle(struct proc *p, int timeout)
1099242139Strasz{
1100242139Strasz	struct thread *td;
1101242139Strasz#ifdef SMP
1102242139Strasz	int cpuid;
1103242139Strasz#endif
1104242139Strasz
1105297633Strasz	KASSERT(timeout != 0, ("timeout %d", timeout));
1106282213Strasz	ASSERT_RACCT_ENABLED();
1107242139Strasz	PROC_LOCK_ASSERT(p, MA_OWNED);
1108242139Strasz
1109242139Strasz	/*
1110242139Strasz	 * Do not block kernel processes.  Also do not block processes with
1111242139Strasz	 * low %cpu utilization to improve interactivity.
1112242139Strasz	 */
1113297633Strasz	if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0)
1114242139Strasz		return;
1115242139Strasz
1116297633Strasz	if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout))
1117297633Strasz		return;
1118297633Strasz
1119297633Strasz	p->p_throttled = timeout;
1120297633Strasz
1121242139Strasz	FOREACH_THREAD_IN_PROC(p, td) {
1122248298Strasz		thread_lock(td);
1123242139Strasz		switch (td->td_state) {
1124242139Strasz		case TDS_RUNQ:
1125242139Strasz			/*
1126242139Strasz			 * If the thread is on the scheduler run-queue, we can
1127242139Strasz			 * not just remove it from there.  So we set the flag
1128242139Strasz			 * TDF_NEEDRESCHED for the thread, so that once it is
1129242139Strasz			 * running, it is taken off the cpu as soon as possible.
1130242139Strasz			 */
1131242139Strasz			td->td_flags |= TDF_NEEDRESCHED;
1132242139Strasz			break;
1133242139Strasz		case TDS_RUNNING:
1134242139Strasz			/*
1135242139Strasz			 * If the thread is running, we request a context
1136242139Strasz			 * switch for it by setting the TDF_NEEDRESCHED flag.
1137242139Strasz			 */
1138242139Strasz			td->td_flags |= TDF_NEEDRESCHED;
1139242139Strasz#ifdef SMP
1140242139Strasz			cpuid = td->td_oncpu;
1141242139Strasz			if ((cpuid != NOCPU) && (td != curthread))
1142242139Strasz				ipi_cpu(cpuid, IPI_AST);
1143242139Strasz#endif
1144242139Strasz			break;
1145242139Strasz		default:
1146242139Strasz			break;
1147242139Strasz		}
1148248298Strasz		thread_unlock(td);
1149242139Strasz	}
1150242139Strasz}
1151242139Strasz
1152242139Straszstatic void
1153242139Straszracct_proc_wakeup(struct proc *p)
1154242139Strasz{
1155282213Strasz
1156282213Strasz	ASSERT_RACCT_ENABLED();
1157282213Strasz
1158242139Strasz	PROC_LOCK_ASSERT(p, MA_OWNED);
1159242139Strasz
1160297633Strasz	if (p->p_throttled != 0) {
1161242139Strasz		p->p_throttled = 0;
1162242139Strasz		wakeup(p->p_racct);
1163242139Strasz	}
1164242139Strasz}
1165242139Strasz
1166242139Straszstatic void
1167297494Straszracct_decay_callback(struct racct *racct, void *dummy1, void *dummy2)
1168242139Strasz{
1169242139Strasz	int64_t r_old, r_new;
1170242139Strasz
1171282213Strasz	ASSERT_RACCT_ENABLED();
1172297578Strasz	RACCT_LOCK_ASSERT();
1173282213Strasz
1174297633Strasz#ifdef RCTL
1175297633Strasz	rctl_throttle_decay(racct, RACCT_READBPS);
1176297633Strasz	rctl_throttle_decay(racct, RACCT_WRITEBPS);
1177297633Strasz	rctl_throttle_decay(racct, RACCT_READIOPS);
1178297633Strasz	rctl_throttle_decay(racct, RACCT_WRITEIOPS);
1179297633Strasz#endif
1180297633Strasz
1181297494Strasz	r_old = racct->r_resources[RACCT_PCTCPU];
1182242139Strasz
1183242139Strasz	/* If there is nothing to decay, just exit. */
1184242139Strasz	if (r_old <= 0)
1185242139Strasz		return;
1186242139Strasz
1187242139Strasz	r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
1188297494Strasz	racct->r_resources[RACCT_PCTCPU] = r_new;
1189290857Strasz}
1190290857Strasz
/*
 * "Pre" hook handed to the *_racct_foreach() iterators by racct_decay():
 * acquires the racct lock that racct_decay_callback() asserts.
 */
static void
racct_decay_pre(void)
{

	RACCT_LOCK();
}
1197290857Strasz
/*
 * "Post" hook handed to the *_racct_foreach() iterators by racct_decay():
 * releases the racct lock.
 */
static void
racct_decay_post(void)
{

	RACCT_UNLOCK();
}
1204242139Strasz
1205242139Straszstatic void
1206297495Straszracct_decay(void)
1207242139Strasz{
1208282213Strasz
1209282213Strasz	ASSERT_RACCT_ENABLED();
1210282213Strasz
1211297494Strasz	ui_racct_foreach(racct_decay_callback, racct_decay_pre,
1212297494Strasz	    racct_decay_post, NULL, NULL);
1213297494Strasz	loginclass_racct_foreach(racct_decay_callback, racct_decay_pre,
1214297494Strasz	    racct_decay_post, NULL, NULL);
1215297494Strasz	prison_racct_foreach(racct_decay_callback, racct_decay_pre,
1216297494Strasz	    racct_decay_post, NULL, NULL);
1217242139Strasz}
1218242139Strasz
1219242139Straszstatic void
1220220137Straszracctd(void)
1221220137Strasz{
1222220137Strasz	struct thread *td;
1223220137Strasz	struct proc *p;
1224220137Strasz	struct timeval wallclock;
1225298045Strasz	uint64_t pct, pct_estimate, runtime;
1226220137Strasz
1227282213Strasz	ASSERT_RACCT_ENABLED();
1228282213Strasz
1229220137Strasz	for (;;) {
1230297494Strasz		racct_decay();
1231242139Strasz
1232220137Strasz		sx_slock(&allproc_lock);
1233220137Strasz
1234242139Strasz		LIST_FOREACH(p, &zombproc, p_list) {
1235242139Strasz			PROC_LOCK(p);
1236242139Strasz			racct_set(p, RACCT_PCTCPU, 0);
1237242139Strasz			PROC_UNLOCK(p);
1238242139Strasz		}
1239242139Strasz
1240220137Strasz		FOREACH_PROC_IN_SYSTEM(p) {
1241242139Strasz			PROC_LOCK(p);
1242242139Strasz			if (p->p_state != PRS_NORMAL) {
1243242139Strasz				PROC_UNLOCK(p);
1244220137Strasz				continue;
1245242139Strasz			}
1246220137Strasz
1247220137Strasz			microuptime(&wallclock);
1248220137Strasz			timevalsub(&wallclock, &p->p_stats->p_start);
1249275121Skib			PROC_STATLOCK(p);
1250232782Strasz			FOREACH_THREAD_IN_PROC(p, td)
1251220137Strasz				ruxagg(p, td);
1252220137Strasz			runtime = cputick2usec(p->p_rux.rux_runtime);
1253275121Skib			PROC_STATUNLOCK(p);
1254220137Strasz#ifdef notyet
1255220137Strasz			KASSERT(runtime >= p->p_prev_runtime,
1256220137Strasz			    ("runtime < p_prev_runtime"));
1257220137Strasz#else
1258220137Strasz			if (runtime < p->p_prev_runtime)
1259220137Strasz				runtime = p->p_prev_runtime;
1260220137Strasz#endif
1261220137Strasz			p->p_prev_runtime = runtime;
1262242957Strasz			if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
1263242957Strasz				pct_estimate = (1000000 * runtime * 100) /
1264242957Strasz				    ((uint64_t)wallclock.tv_sec * 1000000 +
1265242957Strasz				    wallclock.tv_usec);
1266242957Strasz			} else
1267242957Strasz				pct_estimate = 0;
1268242139Strasz			pct = racct_getpcpu(p, pct_estimate);
1269297578Strasz			RACCT_LOCK();
1270297633Strasz#ifdef RCTL
1271297633Strasz			rctl_throttle_decay(p->p_racct, RACCT_READBPS);
1272297633Strasz			rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS);
1273297633Strasz			rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
1274297633Strasz			rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
1275297633Strasz#endif
1276297489Strasz			racct_set_locked(p, RACCT_PCTCPU, pct, 1);
1277297489Strasz			racct_set_locked(p, RACCT_CPU, runtime, 0);
1278220137Strasz			racct_set_locked(p, RACCT_WALLCLOCK,
1279233126Sjh			    (uint64_t)wallclock.tv_sec * 1000000 +
1280297489Strasz			    wallclock.tv_usec, 0);
1281297578Strasz			RACCT_UNLOCK();
1282220137Strasz			PROC_UNLOCK(p);
1283220137Strasz		}
1284242139Strasz
1285242139Strasz		/*
1286242139Strasz		 * To ensure that processes are throttled in a fair way, we need
1287242139Strasz		 * to iterate over all processes again and check the limits
1288242139Strasz		 * for %cpu resource only after ucred racct containers have been
1289242139Strasz		 * properly filled.
1290242139Strasz		 */
1291242139Strasz		FOREACH_PROC_IN_SYSTEM(p) {
1292242139Strasz			PROC_LOCK(p);
1293242139Strasz			if (p->p_state != PRS_NORMAL) {
1294242139Strasz				PROC_UNLOCK(p);
1295242139Strasz				continue;
1296242139Strasz			}
1297242139Strasz
1298297633Strasz			if (racct_pcpu_available(p) <= 0) {
1299297633Strasz				if (p->p_racct->r_resources[RACCT_PCTCPU] >
1300297633Strasz				    pcpu_threshold)
1301297633Strasz					racct_proc_throttle(p, -1);
1302297633Strasz			} else if (p->p_throttled == -1) {
1303242139Strasz				racct_proc_wakeup(p);
1304297633Strasz			}
1305242139Strasz			PROC_UNLOCK(p);
1306242139Strasz		}
1307220137Strasz		sx_sunlock(&allproc_lock);
1308220137Strasz		pause("-", hz);
1309220137Strasz	}
1310220137Strasz}
1311220137Strasz
1312220137Straszstatic struct kproc_desc racctd_kp = {
1313220137Strasz	"racctd",
1314220137Strasz	racctd,
1315220137Strasz	NULL
1316220137Strasz};
1317220137Strasz
1318220137Straszstatic void
1319282213Straszracctd_init(void)
1320282213Strasz{
1321282213Strasz	if (!racct_enable)
1322282213Strasz		return;
1323282213Strasz
1324282213Strasz	kproc_start(&racctd_kp);
1325282213Strasz}
1326282213StraszSYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL);
1327282213Strasz
1328282213Straszstatic void
1329220137Straszracct_init(void)
1330220137Strasz{
1331282213Strasz	if (!racct_enable)
1332282213Strasz		return;
1333220137Strasz
1334220137Strasz	racct_zone = uma_zcreate("racct", sizeof(struct racct),
1335298050Strasz	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1336220137Strasz	/*
1337220137Strasz	 * XXX: Move this somewhere.
1338220137Strasz	 */
1339221362Strasz	prison0.pr_prison_racct = prison_racct_find("0");
1340220137Strasz}
1341220137StraszSYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL);
1342220137Strasz
1343220137Strasz#endif /* !RACCT */
1344