Deleted Added
full compact
kern_racct.c (260817) kern_racct.c (284665)
1/*-
2 * Copyright (c) 2010 The FreeBSD Foundation
3 * All rights reserved.
4 *
5 * This software was developed by Edward Tomasz Napierala under sponsorship
6 * from the FreeBSD Foundation.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
1/*-
2 * Copyright (c) 2010 The FreeBSD Foundation
3 * All rights reserved.
4 *
5 * This software was developed by Edward Tomasz Napierala under sponsorship
6 * from the FreeBSD Foundation.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * $FreeBSD: stable/10/sys/kern/kern_racct.c 260817 2014-01-17 10:58:59Z avg $
29 * $FreeBSD: stable/10/sys/kern/kern_racct.c 284665 2015-06-21 06:28:26Z trasz $
30 */
31
32#include <sys/cdefs.h>
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: stable/10/sys/kern/kern_racct.c 260817 2014-01-17 10:58:59Z avg $");
33__FBSDID("$FreeBSD: stable/10/sys/kern/kern_racct.c 284665 2015-06-21 06:28:26Z trasz $");
34
35#include "opt_kdtrace.h"
36#include "opt_sched.h"
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/eventhandler.h>
41#include <sys/jail.h>
42#include <sys/kernel.h>
43#include <sys/kthread.h>
44#include <sys/lock.h>
45#include <sys/loginclass.h>
46#include <sys/malloc.h>
47#include <sys/mutex.h>
48#include <sys/proc.h>
49#include <sys/racct.h>
50#include <sys/resourcevar.h>
51#include <sys/sbuf.h>
52#include <sys/sched.h>
53#include <sys/sdt.h>
54#include <sys/smp.h>
55#include <sys/sx.h>
56#include <sys/sysctl.h>
57#include <sys/sysent.h>
58#include <sys/sysproto.h>
59#include <sys/umtx.h>
60#include <machine/smp.h>
61
62#ifdef RCTL
63#include <sys/rctl.h>
64#endif
65
66#ifdef RACCT
67
68FEATURE(racct, "Resource Accounting");
69
70/*
71 * Do not block processes that have their %cpu usage <= pcpu_threshold.
72 */
73static int pcpu_threshold = 1;
34
35#include "opt_kdtrace.h"
36#include "opt_sched.h"
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/eventhandler.h>
41#include <sys/jail.h>
42#include <sys/kernel.h>
43#include <sys/kthread.h>
44#include <sys/lock.h>
45#include <sys/loginclass.h>
46#include <sys/malloc.h>
47#include <sys/mutex.h>
48#include <sys/proc.h>
49#include <sys/racct.h>
50#include <sys/resourcevar.h>
51#include <sys/sbuf.h>
52#include <sys/sched.h>
53#include <sys/sdt.h>
54#include <sys/smp.h>
55#include <sys/sx.h>
56#include <sys/sysctl.h>
57#include <sys/sysent.h>
58#include <sys/sysproto.h>
59#include <sys/umtx.h>
60#include <machine/smp.h>
61
62#ifdef RCTL
63#include <sys/rctl.h>
64#endif
65
66#ifdef RACCT
67
68FEATURE(racct, "Resource Accounting");
69
70/*
71 * Do not block processes that have their %cpu usage <= pcpu_threshold.
72 */
73static int pcpu_threshold = 1;
74#ifdef RACCT_DEFAULT_TO_DISABLED
75int racct_enable = 0;
76#else
77int racct_enable = 1;
78#endif
74
75SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting");
79
80SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting");
81SYSCTL_UINT(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable,
82 0, "Enable RACCT/RCTL");
76SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
77 0, "Processes with higher %cpu usage than this value can be throttled.");
78
79/*
80 * How many seconds must pass before we trust the scheduler %cpu calculations.
81 * When a process starts, we compute its %cpu usage by dividing its runtime by
82 * the process wall clock time. Once RACCT_PCPU_SECS seconds have passed, we
83 * use the value provided by the scheduler.
84 */
85#define RACCT_PCPU_SECS 3
86
87static struct mtx racct_lock;
88MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF);
89
90static uma_zone_t racct_zone;
91
92static void racct_sub_racct(struct racct *dest, const struct racct *src);
93static void racct_sub_cred_locked(struct ucred *cred, int resource,
94 uint64_t amount);
95static void racct_add_cred_locked(struct ucred *cred, int resource,
96 uint64_t amount);
97
98SDT_PROVIDER_DEFINE(racct);
99SDT_PROBE_DEFINE3(racct, kernel, rusage, add, "struct proc *", "int",
100 "uint64_t");
101SDT_PROBE_DEFINE3(racct, kernel, rusage, add__failure,
102 "struct proc *", "int", "uint64_t");
103SDT_PROBE_DEFINE3(racct, kernel, rusage, add__cred, "struct ucred *",
104 "int", "uint64_t");
105SDT_PROBE_DEFINE3(racct, kernel, rusage, add__force, "struct proc *",
106 "int", "uint64_t");
107SDT_PROBE_DEFINE3(racct, kernel, rusage, set, "struct proc *", "int",
108 "uint64_t");
109SDT_PROBE_DEFINE3(racct, kernel, rusage, set__failure,
110 "struct proc *", "int", "uint64_t");
111SDT_PROBE_DEFINE3(racct, kernel, rusage, sub, "struct proc *", "int",
112 "uint64_t");
113SDT_PROBE_DEFINE3(racct, kernel, rusage, sub__cred, "struct ucred *",
114 "int", "uint64_t");
115SDT_PROBE_DEFINE1(racct, kernel, racct, create, "struct racct *");
116SDT_PROBE_DEFINE1(racct, kernel, racct, destroy, "struct racct *");
117SDT_PROBE_DEFINE2(racct, kernel, racct, join, "struct racct *",
118 "struct racct *");
119SDT_PROBE_DEFINE2(racct, kernel, racct, join__failure,
120 "struct racct *", "struct racct *");
121SDT_PROBE_DEFINE2(racct, kernel, racct, leave, "struct racct *",
122 "struct racct *");
123
/*
 * Per-resource property table.  Each entry is selected by its RACCT_*
 * resource index (designated initializer) and holds the bitwise OR of the
 * property flags that apply to that resource.
 * NOTE(review): presumably consulted through the RACCT_IS_*() and
 * RACCT_CAN_DROP() helpers used elsewhere in this file; those are defined
 * outside this file -- confirm.
 */
124int racct_types[] = {
125 [RACCT_CPU] =
126 RACCT_IN_MILLIONS,
127 [RACCT_DATA] =
128 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
129 [RACCT_STACK] =
130 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
131 [RACCT_CORE] =
132 RACCT_DENIABLE,
133 [RACCT_RSS] =
134 RACCT_RECLAIMABLE,
135 [RACCT_MEMLOCK] =
136 RACCT_RECLAIMABLE | RACCT_DENIABLE,
137 [RACCT_NPROC] =
138 RACCT_RECLAIMABLE | RACCT_DENIABLE,
139 [RACCT_NOFILE] =
140 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
141 [RACCT_VMEM] =
142 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
143 [RACCT_NPTS] =
144 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
145 [RACCT_SWAP] =
146 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
147 [RACCT_NTHR] =
148 RACCT_RECLAIMABLE | RACCT_DENIABLE,
149 [RACCT_MSGQQUEUED] =
150 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
151 [RACCT_MSGQSIZE] =
152 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
153 [RACCT_NMSGQ] =
154 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
155 [RACCT_NSEM] =
156 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
157 [RACCT_NSEMOP] =
158 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
159 [RACCT_NSHM] =
160 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
161 [RACCT_SHMSIZE] =
162 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
163 [RACCT_WALLCLOCK] =
164 RACCT_IN_MILLIONS,
165 [RACCT_PCTCPU] =
166 RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS };
167
168static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
169
170#ifdef SCHED_4BSD
171/*
172 * Contains intermediate values for %cpu calculations to avoid using floating
173 * point in the kernel.
174 * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20)
175 * It is needed only for the 4BSD scheduler; in ULE ccpu is zero, so the
176 * calculations are more straightforward.
177 */
178fixpt_t ccpu_exp[] = {
179 [0] = FSCALE * 1,
180 [1] = FSCALE * 0.95122942450071400909,
181 [2] = FSCALE * 0.90483741803595957316,
182 [3] = FSCALE * 0.86070797642505780722,
183 [4] = FSCALE * 0.81873075307798185866,
184 [5] = FSCALE * 0.77880078307140486824,
185 [6] = FSCALE * 0.74081822068171786606,
186 [7] = FSCALE * 0.70468808971871343435,
187 [8] = FSCALE * 0.67032004603563930074,
188 [9] = FSCALE * 0.63762815162177329314,
189 [10] = FSCALE * 0.60653065971263342360,
190 [11] = FSCALE * 0.57694981038048669531,
191 [12] = FSCALE * 0.54881163609402643262,
192 [13] = FSCALE * 0.52204577676101604789,
193 [14] = FSCALE * 0.49658530379140951470,
194 [15] = FSCALE * 0.47236655274101470713,
195 [16] = FSCALE * 0.44932896411722159143,
196 [17] = FSCALE * 0.42741493194872666992,
197 [18] = FSCALE * 0.40656965974059911188,
198 [19] = FSCALE * 0.38674102345450120691,
199 [20] = FSCALE * 0.36787944117144232159,
200 [21] = FSCALE * 0.34993774911115535467,
201 [22] = FSCALE * 0.33287108369807955328,
202 [23] = FSCALE * 0.31663676937905321821,
203 [24] = FSCALE * 0.30119421191220209664,
204 [25] = FSCALE * 0.28650479686019010032,
205 [26] = FSCALE * 0.27253179303401260312,
206 [27] = FSCALE * 0.25924026064589150757,
207 [28] = FSCALE * 0.24659696394160647693,
208 [29] = FSCALE * 0.23457028809379765313,
209 [30] = FSCALE * 0.22313016014842982893,
210 [31] = FSCALE * 0.21224797382674305771,
211 [32] = FSCALE * 0.20189651799465540848,
212 [33] = FSCALE * 0.19204990862075411423,
213 [34] = FSCALE * 0.18268352405273465022,
214 [35] = FSCALE * 0.17377394345044512668,
215 [36] = FSCALE * 0.16529888822158653829,
216 [37] = FSCALE * 0.15723716631362761621,
217 [38] = FSCALE * 0.14956861922263505264,
218 [39] = FSCALE * 0.14227407158651357185,
219 [40] = FSCALE * 0.13533528323661269189,
220 [41] = FSCALE * 0.12873490358780421886,
221 [42] = FSCALE * 0.12245642825298191021,
222 [43] = FSCALE * 0.11648415777349695786,
223 [44] = FSCALE * 0.11080315836233388333,
224 [45] = FSCALE * 0.10539922456186433678,
225 [46] = FSCALE * 0.10025884372280373372,
226 [47] = FSCALE * 0.09536916221554961888,
227 [48] = FSCALE * 0.09071795328941250337,
228 [49] = FSCALE * 0.08629358649937051097,
229 [50] = FSCALE * 0.08208499862389879516,
230 [51] = FSCALE * 0.07808166600115315231,
231 [52] = FSCALE * 0.07427357821433388042,
232 [53] = FSCALE * 0.07065121306042958674,
233 [54] = FSCALE * 0.06720551273974976512,
234 [55] = FSCALE * 0.06392786120670757270,
235 [56] = FSCALE * 0.06081006262521796499,
236 [57] = FSCALE * 0.05784432087483846296,
237 [58] = FSCALE * 0.05502322005640722902,
238 [59] = FSCALE * 0.05233970594843239308,
239 [60] = FSCALE * 0.04978706836786394297,
240 [61] = FSCALE * 0.04735892439114092119,
241 [62] = FSCALE * 0.04504920239355780606,
242 [63] = FSCALE * 0.04285212686704017991,
243 [64] = FSCALE * 0.04076220397836621516,
244 [65] = FSCALE * 0.03877420783172200988,
245 [66] = FSCALE * 0.03688316740124000544,
246 [67] = FSCALE * 0.03508435410084502588,
247 [68] = FSCALE * 0.03337326996032607948,
248 [69] = FSCALE * 0.03174563637806794323,
249 [70] = FSCALE * 0.03019738342231850073,
250 [71] = FSCALE * 0.02872463965423942912,
251 [72] = FSCALE * 0.02732372244729256080,
252 [73] = FSCALE * 0.02599112877875534358,
253 [74] = FSCALE * 0.02472352647033939120,
254 [75] = FSCALE * 0.02351774585600910823,
255 [76] = FSCALE * 0.02237077185616559577,
256 [77] = FSCALE * 0.02127973643837716938,
257 [78] = FSCALE * 0.02024191144580438847,
258 [79] = FSCALE * 0.01925470177538692429,
259 [80] = FSCALE * 0.01831563888873418029,
260 [81] = FSCALE * 0.01742237463949351138,
261 [82] = FSCALE * 0.01657267540176124754,
262 [83] = FSCALE * 0.01576441648485449082,
263 [84] = FSCALE * 0.01499557682047770621,
264 [85] = FSCALE * 0.01426423390899925527,
265 [86] = FSCALE * 0.01356855901220093175,
266 [87] = FSCALE * 0.01290681258047986886,
267 [88] = FSCALE * 0.01227733990306844117,
268 [89] = FSCALE * 0.01167856697039544521,
269 [90] = FSCALE * 0.01110899653824230649,
270 [91] = FSCALE * 0.01056720438385265337,
271 [92] = FSCALE * 0.01005183574463358164,
272 [93] = FSCALE * 0.00956160193054350793,
273 [94] = FSCALE * 0.00909527710169581709,
274 [95] = FSCALE * 0.00865169520312063417,
275 [96] = FSCALE * 0.00822974704902002884,
276 [97] = FSCALE * 0.00782837754922577143,
277 [98] = FSCALE * 0.00744658307092434051,
278 [99] = FSCALE * 0.00708340892905212004,
279 [100] = FSCALE * 0.00673794699908546709,
280 [101] = FSCALE * 0.00640933344625638184,
281 [102] = FSCALE * 0.00609674656551563610,
282 [103] = FSCALE * 0.00579940472684214321,
283 [104] = FSCALE * 0.00551656442076077241,
284 [105] = FSCALE * 0.00524751839918138427,
285 [106] = FSCALE * 0.00499159390691021621,
286 [107] = FSCALE * 0.00474815099941147558,
287 [108] = FSCALE * 0.00451658094261266798,
288 [109] = FSCALE * 0.00429630469075234057,
289 [110] = FSCALE * 0.00408677143846406699,
290};
291#endif
292
293#define CCPU_EXP_MAX 110
294
295/*
296 * This function is analogous to the getpcpu() function in the ps(1) command.
297 * Both should calculate in the same way, so that the racct %cpu
298 * calculations are consistent with the values shown by the ps(1) tool.
299 * The calculations are more complex in the 4BSD scheduler because of the value
300 * of the ccpu variable. In ULE it is defined to be zero, which saves us some
301 * work.
302 */
303static uint64_t
304racct_getpcpu(struct proc *p, u_int pcpu)
305{
306 u_int swtime;
307#ifdef SCHED_4BSD
308 fixpt_t pctcpu, pctcpu_next;
309#endif
310#ifdef SMP
311 struct pcpu *pc;
312 int found;
313#endif
314 fixpt_t p_pctcpu;
315 struct thread *td;
316
83SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
84 0, "Processes with higher %cpu usage than this value can be throttled.");
85
86/*
87 * How many seconds must pass before we trust the scheduler %cpu calculations.
88 * When a process starts, we compute its %cpu usage by dividing its runtime by
89 * the process wall clock time. Once RACCT_PCPU_SECS seconds have passed, we
90 * use the value provided by the scheduler.
91 */
92#define RACCT_PCPU_SECS 3
93
94static struct mtx racct_lock;
95MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF);
96
97static uma_zone_t racct_zone;
98
99static void racct_sub_racct(struct racct *dest, const struct racct *src);
100static void racct_sub_cred_locked(struct ucred *cred, int resource,
101 uint64_t amount);
102static void racct_add_cred_locked(struct ucred *cred, int resource,
103 uint64_t amount);
104
105SDT_PROVIDER_DEFINE(racct);
106SDT_PROBE_DEFINE3(racct, kernel, rusage, add, "struct proc *", "int",
107 "uint64_t");
108SDT_PROBE_DEFINE3(racct, kernel, rusage, add__failure,
109 "struct proc *", "int", "uint64_t");
110SDT_PROBE_DEFINE3(racct, kernel, rusage, add__cred, "struct ucred *",
111 "int", "uint64_t");
112SDT_PROBE_DEFINE3(racct, kernel, rusage, add__force, "struct proc *",
113 "int", "uint64_t");
114SDT_PROBE_DEFINE3(racct, kernel, rusage, set, "struct proc *", "int",
115 "uint64_t");
116SDT_PROBE_DEFINE3(racct, kernel, rusage, set__failure,
117 "struct proc *", "int", "uint64_t");
118SDT_PROBE_DEFINE3(racct, kernel, rusage, sub, "struct proc *", "int",
119 "uint64_t");
120SDT_PROBE_DEFINE3(racct, kernel, rusage, sub__cred, "struct ucred *",
121 "int", "uint64_t");
122SDT_PROBE_DEFINE1(racct, kernel, racct, create, "struct racct *");
123SDT_PROBE_DEFINE1(racct, kernel, racct, destroy, "struct racct *");
124SDT_PROBE_DEFINE2(racct, kernel, racct, join, "struct racct *",
125 "struct racct *");
126SDT_PROBE_DEFINE2(racct, kernel, racct, join__failure,
127 "struct racct *", "struct racct *");
128SDT_PROBE_DEFINE2(racct, kernel, racct, leave, "struct racct *",
129 "struct racct *");
130
/*
 * Per-resource property table.  Each entry is selected by its RACCT_*
 * resource index (designated initializer) and holds the bitwise OR of the
 * property flags that apply to that resource.
 * NOTE(review): presumably consulted through the RACCT_IS_*() and
 * RACCT_CAN_DROP() helpers used elsewhere in this file; those are defined
 * outside this file -- confirm.
 */
131int racct_types[] = {
132 [RACCT_CPU] =
133 RACCT_IN_MILLIONS,
134 [RACCT_DATA] =
135 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
136 [RACCT_STACK] =
137 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
138 [RACCT_CORE] =
139 RACCT_DENIABLE,
140 [RACCT_RSS] =
141 RACCT_RECLAIMABLE,
142 [RACCT_MEMLOCK] =
143 RACCT_RECLAIMABLE | RACCT_DENIABLE,
144 [RACCT_NPROC] =
145 RACCT_RECLAIMABLE | RACCT_DENIABLE,
146 [RACCT_NOFILE] =
147 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
148 [RACCT_VMEM] =
149 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
150 [RACCT_NPTS] =
151 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
152 [RACCT_SWAP] =
153 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
154 [RACCT_NTHR] =
155 RACCT_RECLAIMABLE | RACCT_DENIABLE,
156 [RACCT_MSGQQUEUED] =
157 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
158 [RACCT_MSGQSIZE] =
159 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
160 [RACCT_NMSGQ] =
161 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
162 [RACCT_NSEM] =
163 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
164 [RACCT_NSEMOP] =
165 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
166 [RACCT_NSHM] =
167 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
168 [RACCT_SHMSIZE] =
169 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
170 [RACCT_WALLCLOCK] =
171 RACCT_IN_MILLIONS,
172 [RACCT_PCTCPU] =
173 RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS };
174
175static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
176
177#ifdef SCHED_4BSD
178/*
179 * Contains intermediate values for %cpu calculations to avoid using floating
180 * point in the kernel.
181 * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20)
182 * It is needed only for the 4BSD scheduler; in ULE ccpu is zero, so the
183 * calculations are more straightforward.
184 */
185fixpt_t ccpu_exp[] = {
186 [0] = FSCALE * 1,
187 [1] = FSCALE * 0.95122942450071400909,
188 [2] = FSCALE * 0.90483741803595957316,
189 [3] = FSCALE * 0.86070797642505780722,
190 [4] = FSCALE * 0.81873075307798185866,
191 [5] = FSCALE * 0.77880078307140486824,
192 [6] = FSCALE * 0.74081822068171786606,
193 [7] = FSCALE * 0.70468808971871343435,
194 [8] = FSCALE * 0.67032004603563930074,
195 [9] = FSCALE * 0.63762815162177329314,
196 [10] = FSCALE * 0.60653065971263342360,
197 [11] = FSCALE * 0.57694981038048669531,
198 [12] = FSCALE * 0.54881163609402643262,
199 [13] = FSCALE * 0.52204577676101604789,
200 [14] = FSCALE * 0.49658530379140951470,
201 [15] = FSCALE * 0.47236655274101470713,
202 [16] = FSCALE * 0.44932896411722159143,
203 [17] = FSCALE * 0.42741493194872666992,
204 [18] = FSCALE * 0.40656965974059911188,
205 [19] = FSCALE * 0.38674102345450120691,
206 [20] = FSCALE * 0.36787944117144232159,
207 [21] = FSCALE * 0.34993774911115535467,
208 [22] = FSCALE * 0.33287108369807955328,
209 [23] = FSCALE * 0.31663676937905321821,
210 [24] = FSCALE * 0.30119421191220209664,
211 [25] = FSCALE * 0.28650479686019010032,
212 [26] = FSCALE * 0.27253179303401260312,
213 [27] = FSCALE * 0.25924026064589150757,
214 [28] = FSCALE * 0.24659696394160647693,
215 [29] = FSCALE * 0.23457028809379765313,
216 [30] = FSCALE * 0.22313016014842982893,
217 [31] = FSCALE * 0.21224797382674305771,
218 [32] = FSCALE * 0.20189651799465540848,
219 [33] = FSCALE * 0.19204990862075411423,
220 [34] = FSCALE * 0.18268352405273465022,
221 [35] = FSCALE * 0.17377394345044512668,
222 [36] = FSCALE * 0.16529888822158653829,
223 [37] = FSCALE * 0.15723716631362761621,
224 [38] = FSCALE * 0.14956861922263505264,
225 [39] = FSCALE * 0.14227407158651357185,
226 [40] = FSCALE * 0.13533528323661269189,
227 [41] = FSCALE * 0.12873490358780421886,
228 [42] = FSCALE * 0.12245642825298191021,
229 [43] = FSCALE * 0.11648415777349695786,
230 [44] = FSCALE * 0.11080315836233388333,
231 [45] = FSCALE * 0.10539922456186433678,
232 [46] = FSCALE * 0.10025884372280373372,
233 [47] = FSCALE * 0.09536916221554961888,
234 [48] = FSCALE * 0.09071795328941250337,
235 [49] = FSCALE * 0.08629358649937051097,
236 [50] = FSCALE * 0.08208499862389879516,
237 [51] = FSCALE * 0.07808166600115315231,
238 [52] = FSCALE * 0.07427357821433388042,
239 [53] = FSCALE * 0.07065121306042958674,
240 [54] = FSCALE * 0.06720551273974976512,
241 [55] = FSCALE * 0.06392786120670757270,
242 [56] = FSCALE * 0.06081006262521796499,
243 [57] = FSCALE * 0.05784432087483846296,
244 [58] = FSCALE * 0.05502322005640722902,
245 [59] = FSCALE * 0.05233970594843239308,
246 [60] = FSCALE * 0.04978706836786394297,
247 [61] = FSCALE * 0.04735892439114092119,
248 [62] = FSCALE * 0.04504920239355780606,
249 [63] = FSCALE * 0.04285212686704017991,
250 [64] = FSCALE * 0.04076220397836621516,
251 [65] = FSCALE * 0.03877420783172200988,
252 [66] = FSCALE * 0.03688316740124000544,
253 [67] = FSCALE * 0.03508435410084502588,
254 [68] = FSCALE * 0.03337326996032607948,
255 [69] = FSCALE * 0.03174563637806794323,
256 [70] = FSCALE * 0.03019738342231850073,
257 [71] = FSCALE * 0.02872463965423942912,
258 [72] = FSCALE * 0.02732372244729256080,
259 [73] = FSCALE * 0.02599112877875534358,
260 [74] = FSCALE * 0.02472352647033939120,
261 [75] = FSCALE * 0.02351774585600910823,
262 [76] = FSCALE * 0.02237077185616559577,
263 [77] = FSCALE * 0.02127973643837716938,
264 [78] = FSCALE * 0.02024191144580438847,
265 [79] = FSCALE * 0.01925470177538692429,
266 [80] = FSCALE * 0.01831563888873418029,
267 [81] = FSCALE * 0.01742237463949351138,
268 [82] = FSCALE * 0.01657267540176124754,
269 [83] = FSCALE * 0.01576441648485449082,
270 [84] = FSCALE * 0.01499557682047770621,
271 [85] = FSCALE * 0.01426423390899925527,
272 [86] = FSCALE * 0.01356855901220093175,
273 [87] = FSCALE * 0.01290681258047986886,
274 [88] = FSCALE * 0.01227733990306844117,
275 [89] = FSCALE * 0.01167856697039544521,
276 [90] = FSCALE * 0.01110899653824230649,
277 [91] = FSCALE * 0.01056720438385265337,
278 [92] = FSCALE * 0.01005183574463358164,
279 [93] = FSCALE * 0.00956160193054350793,
280 [94] = FSCALE * 0.00909527710169581709,
281 [95] = FSCALE * 0.00865169520312063417,
282 [96] = FSCALE * 0.00822974704902002884,
283 [97] = FSCALE * 0.00782837754922577143,
284 [98] = FSCALE * 0.00744658307092434051,
285 [99] = FSCALE * 0.00708340892905212004,
286 [100] = FSCALE * 0.00673794699908546709,
287 [101] = FSCALE * 0.00640933344625638184,
288 [102] = FSCALE * 0.00609674656551563610,
289 [103] = FSCALE * 0.00579940472684214321,
290 [104] = FSCALE * 0.00551656442076077241,
291 [105] = FSCALE * 0.00524751839918138427,
292 [106] = FSCALE * 0.00499159390691021621,
293 [107] = FSCALE * 0.00474815099941147558,
294 [108] = FSCALE * 0.00451658094261266798,
295 [109] = FSCALE * 0.00429630469075234057,
296 [110] = FSCALE * 0.00408677143846406699,
297};
298#endif
299
300#define CCPU_EXP_MAX 110
301
302/*
303 * This function is analogous to the getpcpu() function in the ps(1) command.
304 * Both should calculate in the same way, so that the racct %cpu
305 * calculations are consistent with the values shown by the ps(1) tool.
306 * The calculations are more complex in the 4BSD scheduler because of the value
307 * of the ccpu variable. In ULE it is defined to be zero, which saves us some
308 * work.
309 */
310static uint64_t
311racct_getpcpu(struct proc *p, u_int pcpu)
312{
313 u_int swtime;
314#ifdef SCHED_4BSD
315 fixpt_t pctcpu, pctcpu_next;
316#endif
317#ifdef SMP
318 struct pcpu *pc;
319 int found;
320#endif
321 fixpt_t p_pctcpu;
322 struct thread *td;
323
324 ASSERT_RACCT_ENABLED();
325
317 /*
318 * If the process is swapped out, we count its %cpu usage as zero.
319 * This behaviour is consistent with the userland ps(1) tool.
320 */
321 if ((p->p_flag & P_INMEM) == 0)
322 return (0);
323 swtime = (ticks - p->p_swtick) / hz;
324
325 /*
326 * For short-lived processes, the sched_pctcpu() returns small
327 * values even for cpu intensive processes. Therefore we use
328 * our own estimate in this case.
329 */
330 if (swtime < RACCT_PCPU_SECS)
331 return (pcpu);
332
333 p_pctcpu = 0;
334 FOREACH_THREAD_IN_PROC(p, td) {
335 if (td == PCPU_GET(idlethread))
336 continue;
337#ifdef SMP
338 found = 0;
339 STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
340 if (td == pc->pc_idlethread) {
341 found = 1;
342 break;
343 }
344 }
345 if (found)
346 continue;
347#endif
348 thread_lock(td);
349#ifdef SCHED_4BSD
350 pctcpu = sched_pctcpu(td);
351 /* Count also the yet unfinished second. */
352 pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT;
353 pctcpu_next += sched_pctcpu_delta(td);
354 p_pctcpu += max(pctcpu, pctcpu_next);
355#else
356 /*
357 * In ULE the %cpu statistics are updated on every
358 * sched_pctcpu() call. So special calculations to
359 * account for the latest (unfinished) second are
360 * not needed.
361 */
362 p_pctcpu += sched_pctcpu(td);
363#endif
364 thread_unlock(td);
365 }
366
367#ifdef SCHED_4BSD
368 if (swtime <= CCPU_EXP_MAX)
369 return ((100 * (uint64_t)p_pctcpu * 1000000) /
370 (FSCALE - ccpu_exp[swtime]));
371#endif
372
373 return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE);
374}
375
376static void
377racct_add_racct(struct racct *dest, const struct racct *src)
378{
379 int i;
380
326 /*
327 * If the process is swapped out, we count its %cpu usage as zero.
328 * This behaviour is consistent with the userland ps(1) tool.
329 */
330 if ((p->p_flag & P_INMEM) == 0)
331 return (0);
332 swtime = (ticks - p->p_swtick) / hz;
333
334 /*
335 * For short-lived processes, the sched_pctcpu() returns small
336 * values even for cpu intensive processes. Therefore we use
337 * our own estimate in this case.
338 */
339 if (swtime < RACCT_PCPU_SECS)
340 return (pcpu);
341
342 p_pctcpu = 0;
343 FOREACH_THREAD_IN_PROC(p, td) {
344 if (td == PCPU_GET(idlethread))
345 continue;
346#ifdef SMP
347 found = 0;
348 STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
349 if (td == pc->pc_idlethread) {
350 found = 1;
351 break;
352 }
353 }
354 if (found)
355 continue;
356#endif
357 thread_lock(td);
358#ifdef SCHED_4BSD
359 pctcpu = sched_pctcpu(td);
360 /* Count also the yet unfinished second. */
361 pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT;
362 pctcpu_next += sched_pctcpu_delta(td);
363 p_pctcpu += max(pctcpu, pctcpu_next);
364#else
365 /*
366 * In ULE the %cpu statistics are updated on every
367 * sched_pctcpu() call. So special calculations to
368 * account for the latest (unfinished) second are
369 * not needed.
370 */
371 p_pctcpu += sched_pctcpu(td);
372#endif
373 thread_unlock(td);
374 }
375
376#ifdef SCHED_4BSD
377 if (swtime <= CCPU_EXP_MAX)
378 return ((100 * (uint64_t)p_pctcpu * 1000000) /
379 (FSCALE - ccpu_exp[swtime]));
380#endif
381
382 return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE);
383}
384
385static void
386racct_add_racct(struct racct *dest, const struct racct *src)
387{
388 int i;
389
390 ASSERT_RACCT_ENABLED();
381 mtx_assert(&racct_lock, MA_OWNED);
382
383 /*
384 * Update resource usage in dest.
385 */
386 for (i = 0; i <= RACCT_MAX; i++) {
387 KASSERT(dest->r_resources[i] >= 0,
388 ("%s: resource %d propagation meltdown: dest < 0",
389 __func__, i));
390 KASSERT(src->r_resources[i] >= 0,
391 ("%s: resource %d propagation meltdown: src < 0",
392 __func__, i));
393 dest->r_resources[i] += src->r_resources[i];
394 }
395}
396
397static void
398racct_sub_racct(struct racct *dest, const struct racct *src)
399{
400 int i;
401
391 mtx_assert(&racct_lock, MA_OWNED);
392
393 /*
394 * Update resource usage in dest.
395 */
396 for (i = 0; i <= RACCT_MAX; i++) {
397 KASSERT(dest->r_resources[i] >= 0,
398 ("%s: resource %d propagation meltdown: dest < 0",
399 __func__, i));
400 KASSERT(src->r_resources[i] >= 0,
401 ("%s: resource %d propagation meltdown: src < 0",
402 __func__, i));
403 dest->r_resources[i] += src->r_resources[i];
404 }
405}
406
407static void
408racct_sub_racct(struct racct *dest, const struct racct *src)
409{
410 int i;
411
412 ASSERT_RACCT_ENABLED();
402 mtx_assert(&racct_lock, MA_OWNED);
403
404 /*
405 * Update resource usage in dest.
406 */
407 for (i = 0; i <= RACCT_MAX; i++) {
408 if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) {
409 KASSERT(dest->r_resources[i] >= 0,
410 ("%s: resource %d propagation meltdown: dest < 0",
411 __func__, i));
412 KASSERT(src->r_resources[i] >= 0,
413 ("%s: resource %d propagation meltdown: src < 0",
414 __func__, i));
415 KASSERT(src->r_resources[i] <= dest->r_resources[i],
416 ("%s: resource %d propagation meltdown: src > dest",
417 __func__, i));
418 }
419 if (RACCT_CAN_DROP(i)) {
420 dest->r_resources[i] -= src->r_resources[i];
421 if (dest->r_resources[i] < 0) {
422 KASSERT(RACCT_IS_SLOPPY(i) ||
423 RACCT_IS_DECAYING(i),
424 ("%s: resource %d usage < 0", __func__, i));
425 dest->r_resources[i] = 0;
426 }
427 }
428 }
429}
430
431void
432racct_create(struct racct **racctp)
433{
434
413 mtx_assert(&racct_lock, MA_OWNED);
414
415 /*
416 * Update resource usage in dest.
417 */
418 for (i = 0; i <= RACCT_MAX; i++) {
419 if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) {
420 KASSERT(dest->r_resources[i] >= 0,
421 ("%s: resource %d propagation meltdown: dest < 0",
422 __func__, i));
423 KASSERT(src->r_resources[i] >= 0,
424 ("%s: resource %d propagation meltdown: src < 0",
425 __func__, i));
426 KASSERT(src->r_resources[i] <= dest->r_resources[i],
427 ("%s: resource %d propagation meltdown: src > dest",
428 __func__, i));
429 }
430 if (RACCT_CAN_DROP(i)) {
431 dest->r_resources[i] -= src->r_resources[i];
432 if (dest->r_resources[i] < 0) {
433 KASSERT(RACCT_IS_SLOPPY(i) ||
434 RACCT_IS_DECAYING(i),
435 ("%s: resource %d usage < 0", __func__, i));
436 dest->r_resources[i] = 0;
437 }
438 }
439 }
440}
441
442void
443racct_create(struct racct **racctp)
444{
445
446 if (!racct_enable)
447 return;
448
435 SDT_PROBE(racct, kernel, racct, create, racctp, 0, 0, 0, 0);
436
437 KASSERT(*racctp == NULL, ("racct already allocated"));
438
439 *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO);
440}
441
442static void
443racct_destroy_locked(struct racct **racctp)
444{
445 int i;
446 struct racct *racct;
447
449 SDT_PROBE(racct, kernel, racct, create, racctp, 0, 0, 0, 0);
450
451 KASSERT(*racctp == NULL, ("racct already allocated"));
452
453 *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO);
454}
455
456static void
457racct_destroy_locked(struct racct **racctp)
458{
459 int i;
460 struct racct *racct;
461
462 ASSERT_RACCT_ENABLED();
463
448 SDT_PROBE(racct, kernel, racct, destroy, racctp, 0, 0, 0, 0);
449
450 mtx_assert(&racct_lock, MA_OWNED);
451 KASSERT(racctp != NULL, ("NULL racctp"));
452 KASSERT(*racctp != NULL, ("NULL racct"));
453
454 racct = *racctp;
455
456 for (i = 0; i <= RACCT_MAX; i++) {
457 if (RACCT_IS_SLOPPY(i))
458 continue;
459 if (!RACCT_IS_RECLAIMABLE(i))
460 continue;
461 KASSERT(racct->r_resources[i] == 0,
462 ("destroying non-empty racct: "
463 "%ju allocated for resource %d\n",
464 racct->r_resources[i], i));
465 }
466 uma_zfree(racct_zone, racct);
467 *racctp = NULL;
468}
469
470void
471racct_destroy(struct racct **racct)
472{
473
464 SDT_PROBE(racct, kernel, racct, destroy, racctp, 0, 0, 0, 0);
465
466 mtx_assert(&racct_lock, MA_OWNED);
467 KASSERT(racctp != NULL, ("NULL racctp"));
468 KASSERT(*racctp != NULL, ("NULL racct"));
469
470 racct = *racctp;
471
472 for (i = 0; i <= RACCT_MAX; i++) {
473 if (RACCT_IS_SLOPPY(i))
474 continue;
475 if (!RACCT_IS_RECLAIMABLE(i))
476 continue;
477 KASSERT(racct->r_resources[i] == 0,
478 ("destroying non-empty racct: "
479 "%ju allocated for resource %d\n",
480 racct->r_resources[i], i));
481 }
482 uma_zfree(racct_zone, racct);
483 *racctp = NULL;
484}
485
486void
487racct_destroy(struct racct **racct)
488{
489
490 if (!racct_enable)
491 return;
492
474 mtx_lock(&racct_lock);
475 racct_destroy_locked(racct);
476 mtx_unlock(&racct_lock);
477}
478
479/*
480 * Increase consumption of 'resource' by 'amount' for 'racct'
481 * and all its parents. Differently from other cases, 'amount' here
482 * may be less than zero.
483 */
484static void
485racct_alloc_resource(struct racct *racct, int resource,
486 uint64_t amount)
487{
488
493 mtx_lock(&racct_lock);
494 racct_destroy_locked(racct);
495 mtx_unlock(&racct_lock);
496}
497
498/*
499 * Increase consumption of 'resource' by 'amount' for 'racct'
500 * and all its parents. Differently from other cases, 'amount' here
501 * may be less than zero.
502 */
503static void
504racct_alloc_resource(struct racct *racct, int resource,
505 uint64_t amount)
506{
507
508 ASSERT_RACCT_ENABLED();
489 mtx_assert(&racct_lock, MA_OWNED);
490 KASSERT(racct != NULL, ("NULL racct"));
491
492 racct->r_resources[resource] += amount;
493 if (racct->r_resources[resource] < 0) {
494 KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource),
495 ("%s: resource %d usage < 0", __func__, resource));
496 racct->r_resources[resource] = 0;
497 }
498
499 /*
500 * There are some cases where the racct %cpu resource would grow
501 * beyond 100%.
502 * For example in racct_proc_exit() we add the process %cpu usage
503 * to the ucred racct containers. If too many processes terminated
504 * in a short time span, the ucred %cpu resource could grow too much.
505 * Also, the 4BSD scheduler sometimes returns for a thread more than
506 * 100% cpu usage. So we set a boundary here to 100%.
507 */
508 if ((resource == RACCT_PCTCPU) &&
509 (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000))
510 racct->r_resources[RACCT_PCTCPU] = 100 * 1000000;
511}
512
513static int
514racct_add_locked(struct proc *p, int resource, uint64_t amount)
515{
516#ifdef RCTL
517 int error;
518#endif
519
509 mtx_assert(&racct_lock, MA_OWNED);
510 KASSERT(racct != NULL, ("NULL racct"));
511
512 racct->r_resources[resource] += amount;
513 if (racct->r_resources[resource] < 0) {
514 KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource),
515 ("%s: resource %d usage < 0", __func__, resource));
516 racct->r_resources[resource] = 0;
517 }
518
519 /*
520 * There are some cases where the racct %cpu resource would grow
521 * beyond 100%.
522 * For example in racct_proc_exit() we add the process %cpu usage
523 * to the ucred racct containers. If too many processes terminated
524 * in a short time span, the ucred %cpu resource could grow too much.
525 * Also, the 4BSD scheduler sometimes returns for a thread more than
526 * 100% cpu usage. So we set a boundary here to 100%.
527 */
528 if ((resource == RACCT_PCTCPU) &&
529 (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000))
530 racct->r_resources[RACCT_PCTCPU] = 100 * 1000000;
531}
532
533static int
534racct_add_locked(struct proc *p, int resource, uint64_t amount)
535{
536#ifdef RCTL
537 int error;
538#endif
539
540 ASSERT_RACCT_ENABLED();
541
520 SDT_PROBE(racct, kernel, rusage, add, p, resource, amount, 0, 0);
521
522 /*
523 * We need proc lock to dereference p->p_ucred.
524 */
525 PROC_LOCK_ASSERT(p, MA_OWNED);
526
527#ifdef RCTL
528 error = rctl_enforce(p, resource, amount);
529 if (error && RACCT_IS_DENIABLE(resource)) {
530 SDT_PROBE(racct, kernel, rusage, add__failure, p, resource,
531 amount, 0, 0);
532 return (error);
533 }
534#endif
535 racct_alloc_resource(p->p_racct, resource, amount);
536 racct_add_cred_locked(p->p_ucred, resource, amount);
537
538 return (0);
539}
540
541/*
542 * Increase allocation of 'resource' by 'amount' for process 'p'.
543 * Return 0 if it's below limits, or errno, if it's not.
544 */
545int
546racct_add(struct proc *p, int resource, uint64_t amount)
547{
548 int error;
549
542 SDT_PROBE(racct, kernel, rusage, add, p, resource, amount, 0, 0);
543
544 /*
545 * We need proc lock to dereference p->p_ucred.
546 */
547 PROC_LOCK_ASSERT(p, MA_OWNED);
548
549#ifdef RCTL
550 error = rctl_enforce(p, resource, amount);
551 if (error && RACCT_IS_DENIABLE(resource)) {
552 SDT_PROBE(racct, kernel, rusage, add__failure, p, resource,
553 amount, 0, 0);
554 return (error);
555 }
556#endif
557 racct_alloc_resource(p->p_racct, resource, amount);
558 racct_add_cred_locked(p->p_ucred, resource, amount);
559
560 return (0);
561}
562
563/*
564 * Increase allocation of 'resource' by 'amount' for process 'p'.
565 * Return 0 if it's below limits, or errno, if it's not.
566 */
567int
568racct_add(struct proc *p, int resource, uint64_t amount)
569{
570 int error;
571
572 if (!racct_enable)
573 return (0);
574
550 mtx_lock(&racct_lock);
551 error = racct_add_locked(p, resource, amount);
552 mtx_unlock(&racct_lock);
553 return (error);
554}
555
556static void
557racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount)
558{
559 struct prison *pr;
560
575 mtx_lock(&racct_lock);
576 error = racct_add_locked(p, resource, amount);
577 mtx_unlock(&racct_lock);
578 return (error);
579}
580
581static void
582racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount)
583{
584 struct prison *pr;
585
586 ASSERT_RACCT_ENABLED();
587
561 SDT_PROBE(racct, kernel, rusage, add__cred, cred, resource, amount,
562 0, 0);
563
564 racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, amount);
565 for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
566 racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource,
567 amount);
568 racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, amount);
569}
570
571/*
572 * Increase allocation of 'resource' by 'amount' for credential 'cred'.
573 * Doesn't check for limits and never fails.
574 *
575 * XXX: Shouldn't this ever return an error?
576 */
577void
578racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
579{
580
588 SDT_PROBE(racct, kernel, rusage, add__cred, cred, resource, amount,
589 0, 0);
590
591 racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, amount);
592 for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
593 racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource,
594 amount);
595 racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, amount);
596}
597
598/*
599 * Increase allocation of 'resource' by 'amount' for credential 'cred'.
600 * Doesn't check for limits and never fails.
601 *
602 * XXX: Shouldn't this ever return an error?
603 */
604void
605racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
606{
607
608 if (!racct_enable)
609 return;
610
581 mtx_lock(&racct_lock);
582 racct_add_cred_locked(cred, resource, amount);
583 mtx_unlock(&racct_lock);
584}
585
586/*
587 * Increase allocation of 'resource' by 'amount' for process 'p'.
588 * Doesn't check for limits and never fails.
589 */
590void
591racct_add_force(struct proc *p, int resource, uint64_t amount)
592{
593
611 mtx_lock(&racct_lock);
612 racct_add_cred_locked(cred, resource, amount);
613 mtx_unlock(&racct_lock);
614}
615
616/*
617 * Increase allocation of 'resource' by 'amount' for process 'p'.
618 * Doesn't check for limits and never fails.
619 */
620void
621racct_add_force(struct proc *p, int resource, uint64_t amount)
622{
623
624 if (!racct_enable)
625 return;
626
594 SDT_PROBE(racct, kernel, rusage, add__force, p, resource, amount, 0, 0);
595
596 /*
597 * We need proc lock to dereference p->p_ucred.
598 */
599 PROC_LOCK_ASSERT(p, MA_OWNED);
600
601 mtx_lock(&racct_lock);
602 racct_alloc_resource(p->p_racct, resource, amount);
603 mtx_unlock(&racct_lock);
604 racct_add_cred(p->p_ucred, resource, amount);
605}
606
607static int
608racct_set_locked(struct proc *p, int resource, uint64_t amount)
609{
610 int64_t old_amount, decayed_amount;
611 int64_t diff_proc, diff_cred;
612#ifdef RCTL
613 int error;
614#endif
615
627 SDT_PROBE(racct, kernel, rusage, add__force, p, resource, amount, 0, 0);
628
629 /*
630 * We need proc lock to dereference p->p_ucred.
631 */
632 PROC_LOCK_ASSERT(p, MA_OWNED);
633
634 mtx_lock(&racct_lock);
635 racct_alloc_resource(p->p_racct, resource, amount);
636 mtx_unlock(&racct_lock);
637 racct_add_cred(p->p_ucred, resource, amount);
638}
639
640static int
641racct_set_locked(struct proc *p, int resource, uint64_t amount)
642{
643 int64_t old_amount, decayed_amount;
644 int64_t diff_proc, diff_cred;
645#ifdef RCTL
646 int error;
647#endif
648
649 ASSERT_RACCT_ENABLED();
650
616 SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0);
617
618 /*
619 * We need proc lock to dereference p->p_ucred.
620 */
621 PROC_LOCK_ASSERT(p, MA_OWNED);
622
623 old_amount = p->p_racct->r_resources[resource];
624 /*
625 * The diffs may be negative.
626 */
627 diff_proc = amount - old_amount;
628 if (RACCT_IS_DECAYING(resource)) {
629 /*
630 * Resources in per-credential racct containers may decay.
631 * If this is the case, we need to calculate the difference
632 * between the new amount and the proportional value of the
633 * old amount that has decayed in the ucred racct containers.
634 */
635 decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
636 diff_cred = amount - decayed_amount;
637 } else
638 diff_cred = diff_proc;
639#ifdef notyet
640 KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
641 ("%s: usage of non-droppable resource %d dropping", __func__,
642 resource));
643#endif
644#ifdef RCTL
645 if (diff_proc > 0) {
646 error = rctl_enforce(p, resource, diff_proc);
647 if (error && RACCT_IS_DENIABLE(resource)) {
648 SDT_PROBE(racct, kernel, rusage, set__failure, p,
649 resource, amount, 0, 0);
650 return (error);
651 }
652 }
653#endif
654 racct_alloc_resource(p->p_racct, resource, diff_proc);
655 if (diff_cred > 0)
656 racct_add_cred_locked(p->p_ucred, resource, diff_cred);
657 else if (diff_cred < 0)
658 racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
659
660 return (0);
661}
662
663/*
664 * Set allocation of 'resource' to 'amount' for process 'p'.
665 * Return 0 if it's below limits, or errno, if it's not.
666 *
667 * Note that decreasing the allocation always returns 0,
668 * even if it's above the limit.
669 */
670int
671racct_set(struct proc *p, int resource, uint64_t amount)
672{
673 int error;
674
651 SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0);
652
653 /*
654 * We need proc lock to dereference p->p_ucred.
655 */
656 PROC_LOCK_ASSERT(p, MA_OWNED);
657
658 old_amount = p->p_racct->r_resources[resource];
659 /*
660 * The diffs may be negative.
661 */
662 diff_proc = amount - old_amount;
663 if (RACCT_IS_DECAYING(resource)) {
664 /*
665 * Resources in per-credential racct containers may decay.
666 * If this is the case, we need to calculate the difference
667 * between the new amount and the proportional value of the
668 * old amount that has decayed in the ucred racct containers.
669 */
670 decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
671 diff_cred = amount - decayed_amount;
672 } else
673 diff_cred = diff_proc;
674#ifdef notyet
675 KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
676 ("%s: usage of non-droppable resource %d dropping", __func__,
677 resource));
678#endif
679#ifdef RCTL
680 if (diff_proc > 0) {
681 error = rctl_enforce(p, resource, diff_proc);
682 if (error && RACCT_IS_DENIABLE(resource)) {
683 SDT_PROBE(racct, kernel, rusage, set__failure, p,
684 resource, amount, 0, 0);
685 return (error);
686 }
687 }
688#endif
689 racct_alloc_resource(p->p_racct, resource, diff_proc);
690 if (diff_cred > 0)
691 racct_add_cred_locked(p->p_ucred, resource, diff_cred);
692 else if (diff_cred < 0)
693 racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
694
695 return (0);
696}
697
698/*
699 * Set allocation of 'resource' to 'amount' for process 'p'.
700 * Return 0 if it's below limits, or errno, if it's not.
701 *
702 * Note that decreasing the allocation always returns 0,
703 * even if it's above the limit.
704 */
705int
706racct_set(struct proc *p, int resource, uint64_t amount)
707{
708 int error;
709
710 if (!racct_enable)
711 return (0);
712
675 mtx_lock(&racct_lock);
676 error = racct_set_locked(p, resource, amount);
677 mtx_unlock(&racct_lock);
678 return (error);
679}
680
681static void
682racct_set_force_locked(struct proc *p, int resource, uint64_t amount)
683{
684 int64_t old_amount, decayed_amount;
685 int64_t diff_proc, diff_cred;
686
713 mtx_lock(&racct_lock);
714 error = racct_set_locked(p, resource, amount);
715 mtx_unlock(&racct_lock);
716 return (error);
717}
718
719static void
720racct_set_force_locked(struct proc *p, int resource, uint64_t amount)
721{
722 int64_t old_amount, decayed_amount;
723 int64_t diff_proc, diff_cred;
724
725 ASSERT_RACCT_ENABLED();
726
687 SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0);
688
689 /*
690 * We need proc lock to dereference p->p_ucred.
691 */
692 PROC_LOCK_ASSERT(p, MA_OWNED);
693
694 old_amount = p->p_racct->r_resources[resource];
695 /*
696 * The diffs may be negative.
697 */
698 diff_proc = amount - old_amount;
699 if (RACCT_IS_DECAYING(resource)) {
700 /*
701 * Resources in per-credential racct containers may decay.
702 * If this is the case, we need to calculate the difference
703 * between the new amount and the proportional value of the
704 * old amount that has decayed in the ucred racct containers.
705 */
706 decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
707 diff_cred = amount - decayed_amount;
708 } else
709 diff_cred = diff_proc;
710
711 racct_alloc_resource(p->p_racct, resource, diff_proc);
712 if (diff_cred > 0)
713 racct_add_cred_locked(p->p_ucred, resource, diff_cred);
714 else if (diff_cred < 0)
715 racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
716}
717
718void
719racct_set_force(struct proc *p, int resource, uint64_t amount)
720{
727 SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0);
728
729 /*
730 * We need proc lock to dereference p->p_ucred.
731 */
732 PROC_LOCK_ASSERT(p, MA_OWNED);
733
734 old_amount = p->p_racct->r_resources[resource];
735 /*
736 * The diffs may be negative.
737 */
738 diff_proc = amount - old_amount;
739 if (RACCT_IS_DECAYING(resource)) {
740 /*
741 * Resources in per-credential racct containers may decay.
742 * If this is the case, we need to calculate the difference
743 * between the new amount and the proportional value of the
744 * old amount that has decayed in the ucred racct containers.
745 */
746 decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
747 diff_cred = amount - decayed_amount;
748 } else
749 diff_cred = diff_proc;
750
751 racct_alloc_resource(p->p_racct, resource, diff_proc);
752 if (diff_cred > 0)
753 racct_add_cred_locked(p->p_ucred, resource, diff_cred);
754 else if (diff_cred < 0)
755 racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
756}
757
758void
759racct_set_force(struct proc *p, int resource, uint64_t amount)
760{
761
762 if (!racct_enable)
763 return;
764
721 mtx_lock(&racct_lock);
722 racct_set_force_locked(p, resource, amount);
723 mtx_unlock(&racct_lock);
724}
725
726/*
727 * Returns amount of 'resource' the process 'p' can keep allocated.
728 * Allocating more than that would be denied, unless the resource
729 * is marked undeniable. Amount of already allocated resource does
730 * not matter.
731 */
732uint64_t
733racct_get_limit(struct proc *p, int resource)
734{
735
765 mtx_lock(&racct_lock);
766 racct_set_force_locked(p, resource, amount);
767 mtx_unlock(&racct_lock);
768}
769
770/*
771 * Returns amount of 'resource' the process 'p' can keep allocated.
772 * Allocating more than that would be denied, unless the resource
773 * is marked undeniable. Amount of already allocated resource does
774 * not matter.
775 */
776uint64_t
777racct_get_limit(struct proc *p, int resource)
778{
779
780 if (!racct_enable)
781 return (UINT64_MAX);
782
736#ifdef RCTL
737 return (rctl_get_limit(p, resource));
738#else
739 return (UINT64_MAX);
740#endif
741}
742
743/*
744 * Returns amount of 'resource' the process 'p' can keep allocated.
745 * Allocating more than that would be denied, unless the resource
746 * is marked undeniable. Amount of already allocated resource does
747 * matter.
748 */
749uint64_t
750racct_get_available(struct proc *p, int resource)
751{
752
783#ifdef RCTL
784 return (rctl_get_limit(p, resource));
785#else
786 return (UINT64_MAX);
787#endif
788}
789
790/*
791 * Returns amount of 'resource' the process 'p' can keep allocated.
792 * Allocating more than that would be denied, unless the resource
793 * is marked undeniable. Amount of already allocated resource does
794 * matter.
795 */
796uint64_t
797racct_get_available(struct proc *p, int resource)
798{
799
800 if (!racct_enable)
801 return (UINT64_MAX);
802
753#ifdef RCTL
754 return (rctl_get_available(p, resource));
755#else
756 return (UINT64_MAX);
757#endif
758}
759
760/*
761 * Returns amount of the %cpu resource that process 'p' can add to its %cpu
762 * utilization. Adding more than that would lead to the process being
763 * throttled.
764 */
765static int64_t
766racct_pcpu_available(struct proc *p)
767{
768
803#ifdef RCTL
804 return (rctl_get_available(p, resource));
805#else
806 return (UINT64_MAX);
807#endif
808}
809
810/*
811 * Returns amount of the %cpu resource that process 'p' can add to its %cpu
812 * utilization. Adding more than that would lead to the process being
813 * throttled.
814 */
815static int64_t
816racct_pcpu_available(struct proc *p)
817{
818
819 ASSERT_RACCT_ENABLED();
820
769#ifdef RCTL
770 return (rctl_pcpu_available(p));
771#else
772 return (INT64_MAX);
773#endif
774}
775
776/*
777 * Decrease allocation of 'resource' by 'amount' for process 'p'.
778 */
779void
780racct_sub(struct proc *p, int resource, uint64_t amount)
781{
782
821#ifdef RCTL
822 return (rctl_pcpu_available(p));
823#else
824 return (INT64_MAX);
825#endif
826}
827
828/*
829 * Decrease allocation of 'resource' by 'amount' for process 'p'.
830 */
831void
832racct_sub(struct proc *p, int resource, uint64_t amount)
833{
834
835 if (!racct_enable)
836 return;
837
783 SDT_PROBE(racct, kernel, rusage, sub, p, resource, amount, 0, 0);
784
785 /*
786 * We need proc lock to dereference p->p_ucred.
787 */
788 PROC_LOCK_ASSERT(p, MA_OWNED);
789 KASSERT(RACCT_CAN_DROP(resource),
790 ("%s: called for non-droppable resource %d", __func__, resource));
791
792 mtx_lock(&racct_lock);
793 KASSERT(amount <= p->p_racct->r_resources[resource],
794 ("%s: freeing %ju of resource %d, which is more "
795 "than allocated %jd for %s (pid %d)", __func__, amount, resource,
796 (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid));
797
798 racct_alloc_resource(p->p_racct, resource, -amount);
799 racct_sub_cred_locked(p->p_ucred, resource, amount);
800 mtx_unlock(&racct_lock);
801}
802
803static void
804racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount)
805{
806 struct prison *pr;
807
838 SDT_PROBE(racct, kernel, rusage, sub, p, resource, amount, 0, 0);
839
840 /*
841 * We need proc lock to dereference p->p_ucred.
842 */
843 PROC_LOCK_ASSERT(p, MA_OWNED);
844 KASSERT(RACCT_CAN_DROP(resource),
845 ("%s: called for non-droppable resource %d", __func__, resource));
846
847 mtx_lock(&racct_lock);
848 KASSERT(amount <= p->p_racct->r_resources[resource],
849 ("%s: freeing %ju of resource %d, which is more "
850 "than allocated %jd for %s (pid %d)", __func__, amount, resource,
851 (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid));
852
853 racct_alloc_resource(p->p_racct, resource, -amount);
854 racct_sub_cred_locked(p->p_ucred, resource, amount);
855 mtx_unlock(&racct_lock);
856}
857
858static void
859racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount)
860{
861 struct prison *pr;
862
863 ASSERT_RACCT_ENABLED();
864
808 SDT_PROBE(racct, kernel, rusage, sub__cred, cred, resource, amount,
809 0, 0);
810
811#ifdef notyet
812 KASSERT(RACCT_CAN_DROP(resource),
813 ("%s: called for resource %d which can not drop", __func__,
814 resource));
815#endif
816
817 racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, -amount);
818 for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
819 racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource,
820 -amount);
821 racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, -amount);
822}
823
824/*
825 * Decrease allocation of 'resource' by 'amount' for credential 'cred'.
826 */
827void
828racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
829{
830
865 SDT_PROBE(racct, kernel, rusage, sub__cred, cred, resource, amount,
866 0, 0);
867
868#ifdef notyet
869 KASSERT(RACCT_CAN_DROP(resource),
870 ("%s: called for resource %d which can not drop", __func__,
871 resource));
872#endif
873
874 racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, -amount);
875 for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
876 racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource,
877 -amount);
878 racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, -amount);
879}
880
881/*
882 * Decrease allocation of 'resource' by 'amount' for credential 'cred'.
883 */
884void
885racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
886{
887
888 if (!racct_enable)
889 return;
890
831 mtx_lock(&racct_lock);
832 racct_sub_cred_locked(cred, resource, amount);
833 mtx_unlock(&racct_lock);
834}
835
836/*
837 * Inherit resource usage information from the parent process.
838 */
839int
840racct_proc_fork(struct proc *parent, struct proc *child)
841{
842 int i, error = 0;
843
891 mtx_lock(&racct_lock);
892 racct_sub_cred_locked(cred, resource, amount);
893 mtx_unlock(&racct_lock);
894}
895
896/*
897 * Inherit resource usage information from the parent process.
898 */
899int
900racct_proc_fork(struct proc *parent, struct proc *child)
901{
902 int i, error = 0;
903
904 if (!racct_enable)
905 return (0);
906
844 /*
845 * Create racct for the child process.
846 */
847 racct_create(&child->p_racct);
848
849 PROC_LOCK(parent);
850 PROC_LOCK(child);
851 mtx_lock(&racct_lock);
852
853#ifdef RCTL
854 error = rctl_proc_fork(parent, child);
855 if (error != 0)
856 goto out;
857#endif
858
859 /* Init process cpu time. */
860 child->p_prev_runtime = 0;
861 child->p_throttled = 0;
862
863 /*
864 * Inherit resource usage.
865 */
866 for (i = 0; i <= RACCT_MAX; i++) {
867 if (parent->p_racct->r_resources[i] == 0 ||
868 !RACCT_IS_INHERITABLE(i))
869 continue;
870
871 error = racct_set_locked(child, i,
872 parent->p_racct->r_resources[i]);
873 if (error != 0)
874 goto out;
875 }
876
877 error = racct_add_locked(child, RACCT_NPROC, 1);
878 error += racct_add_locked(child, RACCT_NTHR, 1);
879
880out:
881 mtx_unlock(&racct_lock);
882 PROC_UNLOCK(child);
883 PROC_UNLOCK(parent);
884
885 if (error != 0)
886 racct_proc_exit(child);
887
888 return (error);
889}
890
891/*
892 * Called at the end of fork1(), to handle rules that require the process
893 * to be fully initialized.
894 */
895void
896racct_proc_fork_done(struct proc *child)
897{
898
899#ifdef RCTL
907 /*
908 * Create racct for the child process.
909 */
910 racct_create(&child->p_racct);
911
912 PROC_LOCK(parent);
913 PROC_LOCK(child);
914 mtx_lock(&racct_lock);
915
916#ifdef RCTL
917 error = rctl_proc_fork(parent, child);
918 if (error != 0)
919 goto out;
920#endif
921
922 /* Init process cpu time. */
923 child->p_prev_runtime = 0;
924 child->p_throttled = 0;
925
926 /*
927 * Inherit resource usage.
928 */
929 for (i = 0; i <= RACCT_MAX; i++) {
930 if (parent->p_racct->r_resources[i] == 0 ||
931 !RACCT_IS_INHERITABLE(i))
932 continue;
933
934 error = racct_set_locked(child, i,
935 parent->p_racct->r_resources[i]);
936 if (error != 0)
937 goto out;
938 }
939
940 error = racct_add_locked(child, RACCT_NPROC, 1);
941 error += racct_add_locked(child, RACCT_NTHR, 1);
942
943out:
944 mtx_unlock(&racct_lock);
945 PROC_UNLOCK(child);
946 PROC_UNLOCK(parent);
947
948 if (error != 0)
949 racct_proc_exit(child);
950
951 return (error);
952}
953
954/*
955 * Called at the end of fork1(), to handle rules that require the process
956 * to be fully initialized.
957 */
958void
959racct_proc_fork_done(struct proc *child)
960{
961
962#ifdef RCTL
963 if (!racct_enable)
964 return;
965
900 PROC_LOCK(child);
901 mtx_lock(&racct_lock);
902 rctl_enforce(child, RACCT_NPROC, 0);
903 rctl_enforce(child, RACCT_NTHR, 0);
904 mtx_unlock(&racct_lock);
905 PROC_UNLOCK(child);
906#endif
907}
908
909void
910racct_proc_exit(struct proc *p)
911{
912 int i;
913 uint64_t runtime;
914 struct timeval wallclock;
915 uint64_t pct_estimate, pct;
916
966 PROC_LOCK(child);
967 mtx_lock(&racct_lock);
968 rctl_enforce(child, RACCT_NPROC, 0);
969 rctl_enforce(child, RACCT_NTHR, 0);
970 mtx_unlock(&racct_lock);
971 PROC_UNLOCK(child);
972#endif
973}
974
975void
976racct_proc_exit(struct proc *p)
977{
978 int i;
979 uint64_t runtime;
980 struct timeval wallclock;
981 uint64_t pct_estimate, pct;
982
983 if (!racct_enable)
984 return;
985
917 PROC_LOCK(p);
918 /*
919 * We don't need to calculate rux, proc_reap() has already done this.
920 */
921 runtime = cputick2usec(p->p_rux.rux_runtime);
922#ifdef notyet
923 KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime"));
924#else
925 if (runtime < p->p_prev_runtime)
926 runtime = p->p_prev_runtime;
927#endif
928 microuptime(&wallclock);
929 timevalsub(&wallclock, &p->p_stats->p_start);
930 if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
931 pct_estimate = (1000000 * runtime * 100) /
932 ((uint64_t)wallclock.tv_sec * 1000000 +
933 wallclock.tv_usec);
934 } else
935 pct_estimate = 0;
936 pct = racct_getpcpu(p, pct_estimate);
937
938 mtx_lock(&racct_lock);
939 racct_set_locked(p, RACCT_CPU, runtime);
940 racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);
941
942 for (i = 0; i <= RACCT_MAX; i++) {
943 if (p->p_racct->r_resources[i] == 0)
944 continue;
945 if (!RACCT_IS_RECLAIMABLE(i))
946 continue;
947 racct_set_locked(p, i, 0);
948 }
949
950 mtx_unlock(&racct_lock);
951 PROC_UNLOCK(p);
952
953#ifdef RCTL
954 rctl_racct_release(p->p_racct);
955#endif
956 racct_destroy(&p->p_racct);
957}
958
959/*
960 * Called after credentials change, to move resource utilisation
961 * between raccts.
962 */
963void
964racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
965 struct ucred *newcred)
966{
967 struct uidinfo *olduip, *newuip;
968 struct loginclass *oldlc, *newlc;
969 struct prison *oldpr, *newpr, *pr;
970
986 PROC_LOCK(p);
987 /*
988 * We don't need to calculate rux, proc_reap() has already done this.
989 */
990 runtime = cputick2usec(p->p_rux.rux_runtime);
991#ifdef notyet
992 KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime"));
993#else
994 if (runtime < p->p_prev_runtime)
995 runtime = p->p_prev_runtime;
996#endif
997 microuptime(&wallclock);
998 timevalsub(&wallclock, &p->p_stats->p_start);
999 if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
1000 pct_estimate = (1000000 * runtime * 100) /
1001 ((uint64_t)wallclock.tv_sec * 1000000 +
1002 wallclock.tv_usec);
1003 } else
1004 pct_estimate = 0;
1005 pct = racct_getpcpu(p, pct_estimate);
1006
1007 mtx_lock(&racct_lock);
1008 racct_set_locked(p, RACCT_CPU, runtime);
1009 racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);
1010
1011 for (i = 0; i <= RACCT_MAX; i++) {
1012 if (p->p_racct->r_resources[i] == 0)
1013 continue;
1014 if (!RACCT_IS_RECLAIMABLE(i))
1015 continue;
1016 racct_set_locked(p, i, 0);
1017 }
1018
1019 mtx_unlock(&racct_lock);
1020 PROC_UNLOCK(p);
1021
1022#ifdef RCTL
1023 rctl_racct_release(p->p_racct);
1024#endif
1025 racct_destroy(&p->p_racct);
1026}
1027
1028/*
1029 * Called after credentials change, to move resource utilisation
1030 * between raccts.
1031 */
1032void
1033racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
1034 struct ucred *newcred)
1035{
1036 struct uidinfo *olduip, *newuip;
1037 struct loginclass *oldlc, *newlc;
1038 struct prison *oldpr, *newpr, *pr;
1039
1040 if (!racct_enable)
1041 return;
1042
971 PROC_LOCK_ASSERT(p, MA_NOTOWNED);
972
973 newuip = newcred->cr_ruidinfo;
974 olduip = oldcred->cr_ruidinfo;
975 newlc = newcred->cr_loginclass;
976 oldlc = oldcred->cr_loginclass;
977 newpr = newcred->cr_prison;
978 oldpr = oldcred->cr_prison;
979
980 mtx_lock(&racct_lock);
981 if (newuip != olduip) {
982 racct_sub_racct(olduip->ui_racct, p->p_racct);
983 racct_add_racct(newuip->ui_racct, p->p_racct);
984 }
985 if (newlc != oldlc) {
986 racct_sub_racct(oldlc->lc_racct, p->p_racct);
987 racct_add_racct(newlc->lc_racct, p->p_racct);
988 }
989 if (newpr != oldpr) {
990 for (pr = oldpr; pr != NULL; pr = pr->pr_parent)
991 racct_sub_racct(pr->pr_prison_racct->prr_racct,
992 p->p_racct);
993 for (pr = newpr; pr != NULL; pr = pr->pr_parent)
994 racct_add_racct(pr->pr_prison_racct->prr_racct,
995 p->p_racct);
996 }
997 mtx_unlock(&racct_lock);
998
999#ifdef RCTL
1000 rctl_proc_ucred_changed(p, newcred);
1001#endif
1002}
1003
1004void
1005racct_move(struct racct *dest, struct racct *src)
1006{
1007
1043 PROC_LOCK_ASSERT(p, MA_NOTOWNED);
1044
1045 newuip = newcred->cr_ruidinfo;
1046 olduip = oldcred->cr_ruidinfo;
1047 newlc = newcred->cr_loginclass;
1048 oldlc = oldcred->cr_loginclass;
1049 newpr = newcred->cr_prison;
1050 oldpr = oldcred->cr_prison;
1051
1052 mtx_lock(&racct_lock);
1053 if (newuip != olduip) {
1054 racct_sub_racct(olduip->ui_racct, p->p_racct);
1055 racct_add_racct(newuip->ui_racct, p->p_racct);
1056 }
1057 if (newlc != oldlc) {
1058 racct_sub_racct(oldlc->lc_racct, p->p_racct);
1059 racct_add_racct(newlc->lc_racct, p->p_racct);
1060 }
1061 if (newpr != oldpr) {
1062 for (pr = oldpr; pr != NULL; pr = pr->pr_parent)
1063 racct_sub_racct(pr->pr_prison_racct->prr_racct,
1064 p->p_racct);
1065 for (pr = newpr; pr != NULL; pr = pr->pr_parent)
1066 racct_add_racct(pr->pr_prison_racct->prr_racct,
1067 p->p_racct);
1068 }
1069 mtx_unlock(&racct_lock);
1070
1071#ifdef RCTL
1072 rctl_proc_ucred_changed(p, newcred);
1073#endif
1074}
1075
1076void
1077racct_move(struct racct *dest, struct racct *src)
1078{
1079
1080 ASSERT_RACCT_ENABLED();
1081
1008 mtx_lock(&racct_lock);
1009
1010 racct_add_racct(dest, src);
1011 racct_sub_racct(src, src);
1012
1013 mtx_unlock(&racct_lock);
1014}
1015
1016static void
1017racct_proc_throttle(struct proc *p)
1018{
1019 struct thread *td;
1020#ifdef SMP
1021 int cpuid;
1022#endif
1023
1082 mtx_lock(&racct_lock);
1083
1084 racct_add_racct(dest, src);
1085 racct_sub_racct(src, src);
1086
1087 mtx_unlock(&racct_lock);
1088}
1089
1090static void
1091racct_proc_throttle(struct proc *p)
1092{
1093 struct thread *td;
1094#ifdef SMP
1095 int cpuid;
1096#endif
1097
1098 ASSERT_RACCT_ENABLED();
1024 PROC_LOCK_ASSERT(p, MA_OWNED);
1025
1026 /*
1027 * Do not block kernel processes. Also do not block processes with
1028 * low %cpu utilization to improve interactivity.
1029 */
1030 if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) ||
1031 (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold))
1032 return;
1033 p->p_throttled = 1;
1034
1035 FOREACH_THREAD_IN_PROC(p, td) {
1036 thread_lock(td);
1037 switch (td->td_state) {
1038 case TDS_RUNQ:
1039 /*
1040 * If the thread is on the scheduler run-queue, we can
1041 * not just remove it from there. So we set the flag
1042 * TDF_NEEDRESCHED for the thread, so that once it is
1043 * running, it is taken off the cpu as soon as possible.
1044 */
1045 td->td_flags |= TDF_NEEDRESCHED;
1046 break;
1047 case TDS_RUNNING:
1048 /*
1049 * If the thread is running, we request a context
1050 * switch for it by setting the TDF_NEEDRESCHED flag.
1051 */
1052 td->td_flags |= TDF_NEEDRESCHED;
1053#ifdef SMP
1054 cpuid = td->td_oncpu;
1055 if ((cpuid != NOCPU) && (td != curthread))
1056 ipi_cpu(cpuid, IPI_AST);
1057#endif
1058 break;
1059 default:
1060 break;
1061 }
1062 thread_unlock(td);
1063 }
1064}
1065
1066static void
1067racct_proc_wakeup(struct proc *p)
1068{
1099 PROC_LOCK_ASSERT(p, MA_OWNED);
1100
1101 /*
1102 * Do not block kernel processes. Also do not block processes with
1103 * low %cpu utilization to improve interactivity.
1104 */
1105 if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) ||
1106 (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold))
1107 return;
1108 p->p_throttled = 1;
1109
1110 FOREACH_THREAD_IN_PROC(p, td) {
1111 thread_lock(td);
1112 switch (td->td_state) {
1113 case TDS_RUNQ:
1114 /*
1115 * If the thread is on the scheduler run-queue, we can
1116 * not just remove it from there. So we set the flag
1117 * TDF_NEEDRESCHED for the thread, so that once it is
1118 * running, it is taken off the cpu as soon as possible.
1119 */
1120 td->td_flags |= TDF_NEEDRESCHED;
1121 break;
1122 case TDS_RUNNING:
1123 /*
1124 * If the thread is running, we request a context
1125 * switch for it by setting the TDF_NEEDRESCHED flag.
1126 */
1127 td->td_flags |= TDF_NEEDRESCHED;
1128#ifdef SMP
1129 cpuid = td->td_oncpu;
1130 if ((cpuid != NOCPU) && (td != curthread))
1131 ipi_cpu(cpuid, IPI_AST);
1132#endif
1133 break;
1134 default:
1135 break;
1136 }
1137 thread_unlock(td);
1138 }
1139}
1140
1141static void
1142racct_proc_wakeup(struct proc *p)
1143{
1144
1145 ASSERT_RACCT_ENABLED();
1146
1069 PROC_LOCK_ASSERT(p, MA_OWNED);
1070
1071 if (p->p_throttled) {
1072 p->p_throttled = 0;
1073 wakeup(p->p_racct);
1074 }
1075}
1076
1077static void
1078racct_decay_resource(struct racct *racct, void * res, void* dummy)
1079{
1080 int resource;
1081 int64_t r_old, r_new;
1082
1147 PROC_LOCK_ASSERT(p, MA_OWNED);
1148
1149 if (p->p_throttled) {
1150 p->p_throttled = 0;
1151 wakeup(p->p_racct);
1152 }
1153}
1154
1155static void
1156racct_decay_resource(struct racct *racct, void * res, void* dummy)
1157{
1158 int resource;
1159 int64_t r_old, r_new;
1160
1161 ASSERT_RACCT_ENABLED();
1162
1083 resource = *(int *)res;
1084 r_old = racct->r_resources[resource];
1085
1086 /* If there is nothing to decay, just exit. */
1087 if (r_old <= 0)
1088 return;
1089
1090 mtx_lock(&racct_lock);
1091 r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
1092 racct->r_resources[resource] = r_new;
1093 mtx_unlock(&racct_lock);
1094}
1095
1096static void
1097racct_decay(int resource)
1098{
1163 resource = *(int *)res;
1164 r_old = racct->r_resources[resource];
1165
1166 /* If there is nothing to decay, just exit. */
1167 if (r_old <= 0)
1168 return;
1169
1170 mtx_lock(&racct_lock);
1171 r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
1172 racct->r_resources[resource] = r_new;
1173 mtx_unlock(&racct_lock);
1174}
1175
1176static void
1177racct_decay(int resource)
1178{
1179
1180 ASSERT_RACCT_ENABLED();
1181
1099 ui_racct_foreach(racct_decay_resource, &resource, NULL);
1100 loginclass_racct_foreach(racct_decay_resource, &resource, NULL);
1101 prison_racct_foreach(racct_decay_resource, &resource, NULL);
1102}
1103
1104static void
1105racctd(void)
1106{
1107 struct thread *td;
1108 struct proc *p;
1109 struct timeval wallclock;
1110 uint64_t runtime;
1111 uint64_t pct, pct_estimate;
1112
1182 ui_racct_foreach(racct_decay_resource, &resource, NULL);
1183 loginclass_racct_foreach(racct_decay_resource, &resource, NULL);
1184 prison_racct_foreach(racct_decay_resource, &resource, NULL);
1185}
1186
1187static void
1188racctd(void)
1189{
1190 struct thread *td;
1191 struct proc *p;
1192 struct timeval wallclock;
1193 uint64_t runtime;
1194 uint64_t pct, pct_estimate;
1195
1196 ASSERT_RACCT_ENABLED();
1197
1113 for (;;) {
1114 racct_decay(RACCT_PCTCPU);
1115
1116 sx_slock(&allproc_lock);
1117
1118 LIST_FOREACH(p, &zombproc, p_list) {
1119 PROC_LOCK(p);
1120 racct_set(p, RACCT_PCTCPU, 0);
1121 PROC_UNLOCK(p);
1122 }
1123
1124 FOREACH_PROC_IN_SYSTEM(p) {
1125 PROC_LOCK(p);
1126 if (p->p_state != PRS_NORMAL) {
1127 PROC_UNLOCK(p);
1128 continue;
1129 }
1130
1131 microuptime(&wallclock);
1132 timevalsub(&wallclock, &p->p_stats->p_start);
1133 PROC_SLOCK(p);
1134 FOREACH_THREAD_IN_PROC(p, td)
1135 ruxagg(p, td);
1136 runtime = cputick2usec(p->p_rux.rux_runtime);
1137 PROC_SUNLOCK(p);
1138#ifdef notyet
1139 KASSERT(runtime >= p->p_prev_runtime,
1140 ("runtime < p_prev_runtime"));
1141#else
1142 if (runtime < p->p_prev_runtime)
1143 runtime = p->p_prev_runtime;
1144#endif
1145 p->p_prev_runtime = runtime;
1146 if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
1147 pct_estimate = (1000000 * runtime * 100) /
1148 ((uint64_t)wallclock.tv_sec * 1000000 +
1149 wallclock.tv_usec);
1150 } else
1151 pct_estimate = 0;
1152 pct = racct_getpcpu(p, pct_estimate);
1153 mtx_lock(&racct_lock);
1154 racct_set_force_locked(p, RACCT_PCTCPU, pct);
1155 racct_set_locked(p, RACCT_CPU, runtime);
1156 racct_set_locked(p, RACCT_WALLCLOCK,
1157 (uint64_t)wallclock.tv_sec * 1000000 +
1158 wallclock.tv_usec);
1159 mtx_unlock(&racct_lock);
1160 PROC_UNLOCK(p);
1161 }
1162
1163 /*
1164 * To ensure that processes are throttled in a fair way, we need
1165 * to iterate over all processes again and check the limits
1166 * for %cpu resource only after ucred racct containers have been
1167 * properly filled.
1168 */
1169 FOREACH_PROC_IN_SYSTEM(p) {
1170 PROC_LOCK(p);
1171 if (p->p_state != PRS_NORMAL) {
1172 PROC_UNLOCK(p);
1173 continue;
1174 }
1175
1176 if (racct_pcpu_available(p) <= 0)
1177 racct_proc_throttle(p);
1178 else if (p->p_throttled)
1179 racct_proc_wakeup(p);
1180 PROC_UNLOCK(p);
1181 }
1182 sx_sunlock(&allproc_lock);
1183 pause("-", hz);
1184 }
1185}
1186
1187static struct kproc_desc racctd_kp = {
1188 "racctd",
1189 racctd,
1190 NULL
1191};
1198 for (;;) {
1199 racct_decay(RACCT_PCTCPU);
1200
1201 sx_slock(&allproc_lock);
1202
1203 LIST_FOREACH(p, &zombproc, p_list) {
1204 PROC_LOCK(p);
1205 racct_set(p, RACCT_PCTCPU, 0);
1206 PROC_UNLOCK(p);
1207 }
1208
1209 FOREACH_PROC_IN_SYSTEM(p) {
1210 PROC_LOCK(p);
1211 if (p->p_state != PRS_NORMAL) {
1212 PROC_UNLOCK(p);
1213 continue;
1214 }
1215
1216 microuptime(&wallclock);
1217 timevalsub(&wallclock, &p->p_stats->p_start);
1218 PROC_SLOCK(p);
1219 FOREACH_THREAD_IN_PROC(p, td)
1220 ruxagg(p, td);
1221 runtime = cputick2usec(p->p_rux.rux_runtime);
1222 PROC_SUNLOCK(p);
1223#ifdef notyet
1224 KASSERT(runtime >= p->p_prev_runtime,
1225 ("runtime < p_prev_runtime"));
1226#else
1227 if (runtime < p->p_prev_runtime)
1228 runtime = p->p_prev_runtime;
1229#endif
1230 p->p_prev_runtime = runtime;
1231 if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
1232 pct_estimate = (1000000 * runtime * 100) /
1233 ((uint64_t)wallclock.tv_sec * 1000000 +
1234 wallclock.tv_usec);
1235 } else
1236 pct_estimate = 0;
1237 pct = racct_getpcpu(p, pct_estimate);
1238 mtx_lock(&racct_lock);
1239 racct_set_force_locked(p, RACCT_PCTCPU, pct);
1240 racct_set_locked(p, RACCT_CPU, runtime);
1241 racct_set_locked(p, RACCT_WALLCLOCK,
1242 (uint64_t)wallclock.tv_sec * 1000000 +
1243 wallclock.tv_usec);
1244 mtx_unlock(&racct_lock);
1245 PROC_UNLOCK(p);
1246 }
1247
1248 /*
1249 * To ensure that processes are throttled in a fair way, we need
1250 * to iterate over all processes again and check the limits
1251 * for %cpu resource only after ucred racct containers have been
1252 * properly filled.
1253 */
1254 FOREACH_PROC_IN_SYSTEM(p) {
1255 PROC_LOCK(p);
1256 if (p->p_state != PRS_NORMAL) {
1257 PROC_UNLOCK(p);
1258 continue;
1259 }
1260
1261 if (racct_pcpu_available(p) <= 0)
1262 racct_proc_throttle(p);
1263 else if (p->p_throttled)
1264 racct_proc_wakeup(p);
1265 PROC_UNLOCK(p);
1266 }
1267 sx_sunlock(&allproc_lock);
1268 pause("-", hz);
1269 }
1270}
1271
1272static struct kproc_desc racctd_kp = {
1273 "racctd",
1274 racctd,
1275 NULL
1276};
1192SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, kproc_start, &racctd_kp);
1193
1194static void
1277
1278static void
1279racctd_init(void)
1280{
1281 if (!racct_enable)
1282 return;
1283
1284 kproc_start(&racctd_kp);
1285}
1286SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL);
1287
1288static void
1195racct_init(void)
1196{
1289racct_init(void)
1290{
1291 if (!racct_enable)
1292 return;
1197
1198 racct_zone = uma_zcreate("racct", sizeof(struct racct),
1199 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
1200 /*
1201 * XXX: Move this somewhere.
1202 */
1203 prison0.pr_prison_racct = prison_racct_find("0");
1204}
1205SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL);
1206
1207#else /* !RACCT */
1208
/*
 * !RACCT stub: nothing is accounted, so every request succeeds.
 * Always returns 0.
 */
int
racct_add(struct proc *p, int resource, uint64_t amount)
{

	(void)p;
	(void)resource;
	(void)amount;
	return (0);
}
1215
/*
 * !RACCT stub: intentionally does nothing.
 */
void
racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
{

	(void)cred;
	(void)resource;
	(void)amount;
}
1220
/*
 * !RACCT stub: intentionally does nothing.
 */
void
racct_add_force(struct proc *p, int resource, uint64_t amount)
{

	(void)p;
	(void)resource;
	(void)amount;
}
1227
/*
 * !RACCT stub: nothing is accounted, so every request succeeds.
 * Always returns 0.
 */
int
racct_set(struct proc *p, int resource, uint64_t amount)
{

	(void)p;
	(void)resource;
	(void)amount;
	return (0);
}
1234
/*
 * !RACCT stub: intentionally does nothing.
 */
void
racct_set_force(struct proc *p, int resource, uint64_t amount)
{

	(void)p;
	(void)resource;
	(void)amount;
}
1239
/*
 * !RACCT stub: intentionally does nothing.
 */
void
racct_sub(struct proc *p, int resource, uint64_t amount)
{

	(void)p;
	(void)resource;
	(void)amount;
}
1244
/*
 * !RACCT stub: intentionally does nothing.
 */
void
racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
{

	(void)cred;
	(void)resource;
	(void)amount;
}
1249
/*
 * !RACCT stub: always returns UINT64_MAX, whatever the process or
 * resource.
 */
uint64_t
racct_get_limit(struct proc *p, int resource)
{

	(void)p;
	(void)resource;
	return (UINT64_MAX);
}
1256
/*
 * !RACCT stub: always returns UINT64_MAX, whatever the process or
 * resource.
 */
uint64_t
racct_get_available(struct proc *p, int resource)
{

	(void)p;
	(void)resource;
	return (UINT64_MAX);
}
1263
/*
 * !RACCT stub: intentionally does nothing; '*racctp' is not touched.
 */
void
racct_create(struct racct **racctp)
{

	(void)racctp;
}
1268
/*
 * !RACCT stub: intentionally does nothing; '*racctp' is not touched.
 */
void
racct_destroy(struct racct **racctp)
{

	(void)racctp;
}
1273
/*
 * !RACCT stub: there is nothing to set up for the child.
 * Always returns 0.
 */
int
racct_proc_fork(struct proc *parent, struct proc *child)
{

	(void)parent;
	(void)child;
	return (0);
}
1280
/*
 * !RACCT stub: intentionally does nothing.
 */
void
racct_proc_fork_done(struct proc *child)
{

	(void)child;
}
1285
/*
 * !RACCT stub: intentionally does nothing.
 */
void
racct_proc_exit(struct proc *p)
{

	(void)p;
}
1290
1291#endif /* !RACCT */
1293
1294 racct_zone = uma_zcreate("racct", sizeof(struct racct),
1295 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
1296 /*
1297 * XXX: Move this somewhere.
1298 */
1299 prison0.pr_prison_racct = prison_racct_find("0");
1300}
1301SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL);
1302
1303#else /* !RACCT */
1304
1305int
1306racct_add(struct proc *p, int resource, uint64_t amount)
1307{
1308
1309 return (0);
1310}
1311
1312void
1313racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
1314{
1315}
1316
1317void
1318racct_add_force(struct proc *p, int resource, uint64_t amount)
1319{
1320
1321 return;
1322}
1323
1324int
1325racct_set(struct proc *p, int resource, uint64_t amount)
1326{
1327
1328 return (0);
1329}
1330
1331void
1332racct_set_force(struct proc *p, int resource, uint64_t amount)
1333{
1334}
1335
1336void
1337racct_sub(struct proc *p, int resource, uint64_t amount)
1338{
1339}
1340
1341void
1342racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
1343{
1344}
1345
1346uint64_t
1347racct_get_limit(struct proc *p, int resource)
1348{
1349
1350 return (UINT64_MAX);
1351}
1352
1353uint64_t
1354racct_get_available(struct proc *p, int resource)
1355{
1356
1357 return (UINT64_MAX);
1358}
1359
1360void
1361racct_create(struct racct **racctp)
1362{
1363}
1364
1365void
1366racct_destroy(struct racct **racctp)
1367{
1368}
1369
1370int
1371racct_proc_fork(struct proc *parent, struct proc *child)
1372{
1373
1374 return (0);
1375}
1376
1377void
1378racct_proc_fork_done(struct proc *child)
1379{
1380}
1381
1382void
1383racct_proc_exit(struct proc *p)
1384{
1385}
1386
1387#endif /* !RACCT */