kern_racct.c revision 243070
1/*- 2 * Copyright (c) 2010 The FreeBSD Foundation 3 * All rights reserved. 4 * 5 * This software was developed by Edward Tomasz Napierala under sponsorship 6 * from the FreeBSD Foundation. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * $FreeBSD: head/sys/kern/kern_racct.c 243070 2012-11-15 14:11:34Z trasz $ 30 */ 31 32#include <sys/cdefs.h> 33__FBSDID("$FreeBSD: head/sys/kern/kern_racct.c 243070 2012-11-15 14:11:34Z trasz $"); 34 35#include "opt_kdtrace.h" 36#include "opt_sched.h" 37 38#include <sys/param.h> 39#include <sys/systm.h> 40#include <sys/eventhandler.h> 41#include <sys/jail.h> 42#include <sys/kernel.h> 43#include <sys/kthread.h> 44#include <sys/lock.h> 45#include <sys/loginclass.h> 46#include <sys/malloc.h> 47#include <sys/mutex.h> 48#include <sys/proc.h> 49#include <sys/racct.h> 50#include <sys/resourcevar.h> 51#include <sys/sbuf.h> 52#include <sys/sched.h> 53#include <sys/sdt.h> 54#include <sys/smp.h> 55#include <sys/sx.h> 56#include <sys/sysctl.h> 57#include <sys/sysent.h> 58#include <sys/sysproto.h> 59#include <sys/umtx.h> 60#include <machine/smp.h> 61 62#ifdef RCTL 63#include <sys/rctl.h> 64#endif 65 66#ifdef RACCT 67 68FEATURE(racct, "Resource Accounting"); 69 70/* 71 * Do not block processes that have their %cpu usage <= pcpu_threshold. 72 */ 73static int pcpu_threshold = 1; 74 75SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting"); 76SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, 77 0, "Processes with higher %cpu usage than this value can be throttled."); 78 79/* 80 * How many seconds it takes to use the scheduler %cpu calculations. When a 81 * process starts, we compute its %cpu usage by dividing its runtime by the 82 * process wall clock time. After RACCT_PCPU_SECS pass, we use the value 83 * provided by the scheduler. 84 */ 85#define RACCT_PCPU_SECS 3 86 87static struct mtx racct_lock; 88MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF); 89 90static uma_zone_t racct_zone; 91 92static void racct_sub_racct(struct racct *dest, const struct racct *src); 93static void racct_sub_cred_locked(struct ucred *cred, int resource, 94 uint64_t amount); 95static void racct_add_cred_locked(struct ucred *cred, int resource, 96 uint64_t amount); 97 98SDT_PROVIDER_DEFINE(racct); 99SDT_PROBE_DEFINE3(racct, kernel, rusage, add, add, "struct proc *", "int", 100 "uint64_t"); 101SDT_PROBE_DEFINE3(racct, kernel, rusage, add_failure, add-failure, 102 "struct proc *", "int", "uint64_t"); 103SDT_PROBE_DEFINE3(racct, kernel, rusage, add_cred, add-cred, "struct ucred *", 104 "int", "uint64_t"); 105SDT_PROBE_DEFINE3(racct, kernel, rusage, add_force, add-force, "struct proc *", 106 "int", "uint64_t"); 107SDT_PROBE_DEFINE3(racct, kernel, rusage, set, set, "struct proc *", "int", 108 "uint64_t"); 109SDT_PROBE_DEFINE3(racct, kernel, rusage, set_failure, set-failure, 110 "struct proc *", "int", "uint64_t"); 111SDT_PROBE_DEFINE3(racct, kernel, rusage, sub, sub, "struct proc *", "int", 112 "uint64_t"); 113SDT_PROBE_DEFINE3(racct, kernel, rusage, sub_cred, sub-cred, "struct ucred *", 114 "int", "uint64_t"); 115SDT_PROBE_DEFINE1(racct, kernel, racct, create, create, "struct racct *"); 116SDT_PROBE_DEFINE1(racct, kernel, racct, destroy, destroy, "struct racct *"); 117SDT_PROBE_DEFINE2(racct, kernel, racct, join, join, "struct racct *", 118 "struct racct *"); 119SDT_PROBE_DEFINE2(racct, kernel, racct, join_failure, join-failure, 120 "struct racct *", "struct racct *"); 121SDT_PROBE_DEFINE2(racct, kernel, racct, leave, leave, "struct racct *", 122 "struct racct *"); 123 124int racct_types[] = { 125 [RACCT_CPU] = 126 RACCT_IN_MILLIONS, 127 [RACCT_DATA] = 128 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 129 [RACCT_STACK] = 130 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 131 [RACCT_CORE] = 132 RACCT_DENIABLE, 133 [RACCT_RSS] = 134 RACCT_RECLAIMABLE, 135 [RACCT_MEMLOCK] = 136 RACCT_RECLAIMABLE | RACCT_DENIABLE, 137 [RACCT_NPROC] = 138 RACCT_RECLAIMABLE | RACCT_DENIABLE, 139 [RACCT_NOFILE] = 140 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 141 [RACCT_VMEM] = 142 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 143 [RACCT_NPTS] = 144 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 145 [RACCT_SWAP] = 146 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 147 [RACCT_NTHR] = 148 RACCT_RECLAIMABLE | RACCT_DENIABLE, 149 [RACCT_MSGQQUEUED] = 150 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 151 [RACCT_MSGQSIZE] = 152 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 153 [RACCT_NMSGQ] = 154 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 155 [RACCT_NSEM] = 156 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 157 [RACCT_NSEMOP] = 158 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 159 [RACCT_NSHM] = 160 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 161 [RACCT_SHMSIZE] = 162 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 163 [RACCT_WALLCLOCK] = 164 RACCT_IN_MILLIONS, 165 [RACCT_PCTCPU] = 166 RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS }; 167 168static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE; 169 170#ifdef SCHED_4BSD 171/* 172 * Contains intermediate values for %cpu calculations to avoid using floating 173 * point in the kernel. 174 * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20) 175 * It is needed only for the 4BSD scheduler, because in ULE, the ccpu equals to 176 * zero so the calculations are more straightforward. 177 */ 178fixpt_t ccpu_exp[] = { 179 [0] = FSCALE * 1, 180 [1] = FSCALE * 0.95122942450071400909, 181 [2] = FSCALE * 0.90483741803595957316, 182 [3] = FSCALE * 0.86070797642505780722, 183 [4] = FSCALE * 0.81873075307798185866, 184 [5] = FSCALE * 0.77880078307140486824, 185 [6] = FSCALE * 0.74081822068171786606, 186 [7] = FSCALE * 0.70468808971871343435, 187 [8] = FSCALE * 0.67032004603563930074, 188 [9] = FSCALE * 0.63762815162177329314, 189 [10] = FSCALE * 0.60653065971263342360, 190 [11] = FSCALE * 0.57694981038048669531, 191 [12] = FSCALE * 0.54881163609402643262, 192 [13] = FSCALE * 0.52204577676101604789, 193 [14] = FSCALE * 0.49658530379140951470, 194 [15] = FSCALE * 0.47236655274101470713, 195 [16] = FSCALE * 0.44932896411722159143, 196 [17] = FSCALE * 0.42741493194872666992, 197 [18] = FSCALE * 0.40656965974059911188, 198 [19] = FSCALE * 0.38674102345450120691, 199 [20] = FSCALE * 0.36787944117144232159, 200 [21] = FSCALE * 0.34993774911115535467, 201 [22] = FSCALE * 0.33287108369807955328, 202 [23] = FSCALE * 0.31663676937905321821, 203 [24] = FSCALE * 0.30119421191220209664, 204 [25] = FSCALE * 0.28650479686019010032, 205 [26] = FSCALE * 0.27253179303401260312, 206 [27] = FSCALE * 0.25924026064589150757, 207 [28] = FSCALE * 0.24659696394160647693, 208 [29] = FSCALE * 0.23457028809379765313, 209 [30] = FSCALE * 0.22313016014842982893, 210 [31] = FSCALE * 0.21224797382674305771, 211 [32] = FSCALE * 0.20189651799465540848, 212 [33] = FSCALE * 0.19204990862075411423, 213 [34] = FSCALE * 0.18268352405273465022, 214 [35] = FSCALE * 0.17377394345044512668, 215 [36] = FSCALE * 0.16529888822158653829, 216 [37] = FSCALE * 0.15723716631362761621, 217 [38] = FSCALE * 0.14956861922263505264, 218 [39] = FSCALE * 0.14227407158651357185, 219 [40] = FSCALE * 0.13533528323661269189, 220 [41] = FSCALE * 0.12873490358780421886, 221 [42] = FSCALE * 0.12245642825298191021, 222 [43] = FSCALE * 0.11648415777349695786, 223 [44] = FSCALE * 0.11080315836233388333, 224 [45] = FSCALE * 0.10539922456186433678, 225 [46] = FSCALE * 0.10025884372280373372, 226 [47] = FSCALE * 0.09536916221554961888, 227 [48] = FSCALE * 0.09071795328941250337, 228 [49] = FSCALE * 0.08629358649937051097, 229 [50] = FSCALE * 0.08208499862389879516, 230 [51] = FSCALE * 0.07808166600115315231, 231 [52] = FSCALE * 0.07427357821433388042, 232 [53] = FSCALE * 0.07065121306042958674, 233 [54] = FSCALE * 0.06720551273974976512, 234 [55] = FSCALE * 0.06392786120670757270, 235 [56] = FSCALE * 0.06081006262521796499, 236 [57] = FSCALE * 0.05784432087483846296, 237 [58] = FSCALE * 0.05502322005640722902, 238 [59] = FSCALE * 0.05233970594843239308, 239 [60] = FSCALE * 0.04978706836786394297, 240 [61] = FSCALE * 0.04735892439114092119, 241 [62] = FSCALE * 0.04504920239355780606, 242 [63] = FSCALE * 0.04285212686704017991, 243 [64] = FSCALE * 0.04076220397836621516, 244 [65] = FSCALE * 0.03877420783172200988, 245 [66] = FSCALE * 0.03688316740124000544, 246 [67] = FSCALE * 0.03508435410084502588, 247 [68] = FSCALE * 0.03337326996032607948, 248 [69] = FSCALE * 0.03174563637806794323, 249 [70] = FSCALE * 0.03019738342231850073, 250 [71] = FSCALE * 0.02872463965423942912, 251 [72] = FSCALE * 0.02732372244729256080, 252 [73] = FSCALE * 0.02599112877875534358, 253 [74] = FSCALE * 0.02472352647033939120, 254 [75] = FSCALE * 0.02351774585600910823, 255 [76] = FSCALE * 0.02237077185616559577, 256 [77] = FSCALE * 0.02127973643837716938, 257 [78] = FSCALE * 0.02024191144580438847, 258 [79] = FSCALE * 0.01925470177538692429, 259 [80] = FSCALE * 0.01831563888873418029, 260 [81] = FSCALE * 0.01742237463949351138, 261 [82] = FSCALE * 0.01657267540176124754, 262 [83] = FSCALE * 0.01576441648485449082, 263 [84] = FSCALE * 0.01499557682047770621, 264 [85] = FSCALE * 0.01426423390899925527, 265 [86] = FSCALE * 0.01356855901220093175, 266 [87] = FSCALE * 0.01290681258047986886, 267 [88] = FSCALE * 0.01227733990306844117, 268 [89] = FSCALE * 0.01167856697039544521, 269 [90] = FSCALE * 0.01110899653824230649, 270 [91] = FSCALE * 0.01056720438385265337, 271 [92] = FSCALE * 0.01005183574463358164, 272 [93] = FSCALE * 0.00956160193054350793, 273 [94] = FSCALE * 0.00909527710169581709, 274 [95] = FSCALE * 0.00865169520312063417, 275 [96] = FSCALE * 0.00822974704902002884, 276 [97] = FSCALE * 0.00782837754922577143, 277 [98] = FSCALE * 0.00744658307092434051, 278 [99] = FSCALE * 0.00708340892905212004, 279 [100] = FSCALE * 0.00673794699908546709, 280 [101] = FSCALE * 0.00640933344625638184, 281 [102] = FSCALE * 0.00609674656551563610, 282 [103] = FSCALE * 0.00579940472684214321, 283 [104] = FSCALE * 0.00551656442076077241, 284 [105] = FSCALE * 0.00524751839918138427, 285 [106] = FSCALE * 0.00499159390691021621, 286 [107] = FSCALE * 0.00474815099941147558, 287 [108] = FSCALE * 0.00451658094261266798, 288 [109] = FSCALE * 0.00429630469075234057, 289 [110] = FSCALE * 0.00408677143846406699, 290}; 291#endif 292 293#define CCPU_EXP_MAX 110 294 295/* 296 * This function is analogical to the getpcpu() function in the ps(1) command. 297 * They should both calculate in the same way so that the racct %cpu 298 * calculations are consistent with the values showed by the ps(1) tool. 299 * The calculations are more complex in the 4BSD scheduler because of the value 300 * of the ccpu variable. In ULE it is defined to be zero which saves us some 301 * work. 302 */ 303static uint64_t 304racct_getpcpu(struct proc *p, u_int pcpu) 305{ 306 u_int swtime; 307#ifdef SCHED_4BSD 308 fixpt_t pctcpu, pctcpu_next; 309#endif 310#ifdef SMP 311 struct pcpu *pc; 312 int found; 313#endif 314 fixpt_t p_pctcpu; 315 struct thread *td; 316 317 /* 318 * If the process is swapped out, we count its %cpu usage as zero. 319 * This behaviour is consistent with the userland ps(1) tool. 320 */ 321 if ((p->p_flag & P_INMEM) == 0) 322 return (0); 323 swtime = (ticks - p->p_swtick) / hz; 324 325 /* 326 * For short-lived processes, the sched_pctcpu() returns small 327 * values even for cpu intensive processes. Therefore we use 328 * our own estimate in this case. 329 */ 330 if (swtime < RACCT_PCPU_SECS) 331 return (pcpu); 332 333 p_pctcpu = 0; 334 FOREACH_THREAD_IN_PROC(p, td) { 335 if (td == PCPU_GET(idlethread)) 336 continue; 337#ifdef SMP 338 found = 0; 339 STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { 340 if (td == pc->pc_idlethread) { 341 found = 1; 342 break; 343 } 344 } 345 if (found) 346 continue; 347#endif 348 thread_lock(td); 349#ifdef SCHED_4BSD 350 pctcpu = sched_pctcpu(td); 351 /* Count also the yet unfinished second. */ 352 pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT; 353 pctcpu_next += sched_pctcpu_delta(td); 354 p_pctcpu += max(pctcpu, pctcpu_next); 355#else 356 /* 357 * In ULE the %cpu statistics are updated on every 358 * sched_pctcpu() call. So special calculations to 359 * account for the latest (unfinished) second are 360 * not needed. 361 */ 362 p_pctcpu += sched_pctcpu(td); 363#endif 364 thread_unlock(td); 365 } 366 367#ifdef SCHED_4BSD 368 if (swtime <= CCPU_EXP_MAX) 369 return ((100 * (uint64_t)p_pctcpu * 1000000) / 370 (FSCALE - ccpu_exp[swtime])); 371#endif 372 373 return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE); 374} 375 376static void 377racct_add_racct(struct racct *dest, const struct racct *src) 378{ 379 int i; 380 381 mtx_assert(&racct_lock, MA_OWNED); 382 383 /* 384 * Update resource usage in dest. 385 */ 386 for (i = 0; i <= RACCT_MAX; i++) { 387 KASSERT(dest->r_resources[i] >= 0, 388 ("racct propagation meltdown: dest < 0")); 389 KASSERT(src->r_resources[i] >= 0, 390 ("racct propagation meltdown: src < 0")); 391 dest->r_resources[i] += src->r_resources[i]; 392 } 393} 394 395static void 396racct_sub_racct(struct racct *dest, const struct racct *src) 397{ 398 int i; 399 400 mtx_assert(&racct_lock, MA_OWNED); 401 402 /* 403 * Update resource usage in dest. 404 */ 405 for (i = 0; i <= RACCT_MAX; i++) { 406 if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) { 407 KASSERT(dest->r_resources[i] >= 0, 408 ("racct propagation meltdown: dest < 0")); 409 KASSERT(src->r_resources[i] >= 0, 410 ("racct propagation meltdown: src < 0")); 411 KASSERT(src->r_resources[i] <= dest->r_resources[i], 412 ("racct propagation meltdown: src > dest")); 413 } 414 if (RACCT_CAN_DROP(i)) { 415 dest->r_resources[i] -= src->r_resources[i]; 416 if (dest->r_resources[i] < 0) { 417 KASSERT(RACCT_IS_SLOPPY(i) || 418 RACCT_IS_DECAYING(i), 419 ("racct_sub_racct: usage < 0")); 420 dest->r_resources[i] = 0; 421 } 422 } 423 } 424} 425 426void 427racct_create(struct racct **racctp) 428{ 429 430 SDT_PROBE(racct, kernel, racct, create, racctp, 0, 0, 0, 0); 431 432 KASSERT(*racctp == NULL, ("racct already allocated")); 433 434 *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO); 435} 436 437static void 438racct_destroy_locked(struct racct **racctp) 439{ 440 int i; 441 struct racct *racct; 442 443 SDT_PROBE(racct, kernel, racct, destroy, racctp, 0, 0, 0, 0); 444 445 mtx_assert(&racct_lock, MA_OWNED); 446 KASSERT(racctp != NULL, ("NULL racctp")); 447 KASSERT(*racctp != NULL, ("NULL racct")); 448 449 racct = *racctp; 450 451 for (i = 0; i <= RACCT_MAX; i++) { 452 if (RACCT_IS_SLOPPY(i)) 453 continue; 454 if (!RACCT_IS_RECLAIMABLE(i)) 455 continue; 456 KASSERT(racct->r_resources[i] == 0, 457 ("destroying non-empty racct: " 458 "%ju allocated for resource %d\n", 459 racct->r_resources[i], i)); 460 } 461 uma_zfree(racct_zone, racct); 462 *racctp = NULL; 463} 464 465void 466racct_destroy(struct racct **racct) 467{ 468 469 mtx_lock(&racct_lock); 470 racct_destroy_locked(racct); 471 mtx_unlock(&racct_lock); 472} 473 474/* 475 * Increase consumption of 'resource' by 'amount' for 'racct' 476 * and all its parents. Differently from other cases, 'amount' here 477 * may be less than zero. 478 */ 479static void 480racct_alloc_resource(struct racct *racct, int resource, 481 uint64_t amount) 482{ 483 484 mtx_assert(&racct_lock, MA_OWNED); 485 KASSERT(racct != NULL, ("NULL racct")); 486 487 racct->r_resources[resource] += amount; 488 if (racct->r_resources[resource] < 0) { 489 KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource), 490 ("racct_alloc_resource: usage < 0")); 491 racct->r_resources[resource] = 0; 492 } 493 494 /* 495 * There are some cases where the racct %cpu resource would grow 496 * beyond 100%. 497 * For example in racct_proc_exit() we add the process %cpu usage 498 * to the ucred racct containers. If too many processes terminated 499 * in a short time span, the ucred %cpu resource could grow too much. 500 * Also, the 4BSD scheduler sometimes returns for a thread more than 501 * 100% cpu usage. So we set a boundary here to 100%. 502 */ 503 if ((resource == RACCT_PCTCPU) && 504 (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000)) 505 racct->r_resources[RACCT_PCTCPU] = 100 * 1000000; 506} 507 508static int 509racct_add_locked(struct proc *p, int resource, uint64_t amount) 510{ 511#ifdef RCTL 512 int error; 513#endif 514 515 SDT_PROBE(racct, kernel, rusage, add, p, resource, amount, 0, 0); 516 517 /* 518 * We need proc lock to dereference p->p_ucred. 519 */ 520 PROC_LOCK_ASSERT(p, MA_OWNED); 521 522#ifdef RCTL 523 error = rctl_enforce(p, resource, amount); 524 if (error && RACCT_IS_DENIABLE(resource)) { 525 SDT_PROBE(racct, kernel, rusage, add_failure, p, resource, 526 amount, 0, 0); 527 return (error); 528 } 529#endif 530 racct_alloc_resource(p->p_racct, resource, amount); 531 racct_add_cred_locked(p->p_ucred, resource, amount); 532 533 return (0); 534} 535 536/* 537 * Increase allocation of 'resource' by 'amount' for process 'p'. 538 * Return 0 if it's below limits, or errno, if it's not. 539 */ 540int 541racct_add(struct proc *p, int resource, uint64_t amount) 542{ 543 int error; 544 545 mtx_lock(&racct_lock); 546 error = racct_add_locked(p, resource, amount); 547 mtx_unlock(&racct_lock); 548 return (error); 549} 550 551static void 552racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) 553{ 554 struct prison *pr; 555 556 SDT_PROBE(racct, kernel, rusage, add_cred, cred, resource, amount, 557 0, 0); 558 559 racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, amount); 560 for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) 561 racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource, 562 amount); 563 racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, amount); 564} 565 566/* 567 * Increase allocation of 'resource' by 'amount' for credential 'cred'. 568 * Doesn't check for limits and never fails. 569 * 570 * XXX: Shouldn't this ever return an error? 571 */ 572void 573racct_add_cred(struct ucred *cred, int resource, uint64_t amount) 574{ 575 576 mtx_lock(&racct_lock); 577 racct_add_cred_locked(cred, resource, amount); 578 mtx_unlock(&racct_lock); 579} 580 581/* 582 * Increase allocation of 'resource' by 'amount' for process 'p'. 583 * Doesn't check for limits and never fails. 584 */ 585void 586racct_add_force(struct proc *p, int resource, uint64_t amount) 587{ 588 589 SDT_PROBE(racct, kernel, rusage, add_force, p, resource, amount, 0, 0); 590 591 /* 592 * We need proc lock to dereference p->p_ucred. 593 */ 594 PROC_LOCK_ASSERT(p, MA_OWNED); 595 596 mtx_lock(&racct_lock); 597 racct_alloc_resource(p->p_racct, resource, amount); 598 mtx_unlock(&racct_lock); 599 racct_add_cred(p->p_ucred, resource, amount); 600} 601 602static int 603racct_set_locked(struct proc *p, int resource, uint64_t amount) 604{ 605 int64_t old_amount, decayed_amount; 606 int64_t diff_proc, diff_cred; 607#ifdef RCTL 608 int error; 609#endif 610 611 SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0); 612 613 /* 614 * We need proc lock to dereference p->p_ucred. 615 */ 616 PROC_LOCK_ASSERT(p, MA_OWNED); 617 618 old_amount = p->p_racct->r_resources[resource]; 619 /* 620 * The diffs may be negative. 621 */ 622 diff_proc = amount - old_amount; 623 if (RACCT_IS_DECAYING(resource)) { 624 /* 625 * Resources in per-credential racct containers may decay. 626 * If this is the case, we need to calculate the difference 627 * between the new amount and the proportional value of the 628 * old amount that has decayed in the ucred racct containers. 629 */ 630 decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; 631 diff_cred = amount - decayed_amount; 632 } else 633 diff_cred = diff_proc; 634#ifdef notyet 635 KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource), 636 ("racct_set: usage of non-droppable resource %d dropping", 637 resource)); 638#endif 639#ifdef RCTL 640 if (diff_proc > 0) { 641 error = rctl_enforce(p, resource, diff_proc); 642 if (error && RACCT_IS_DENIABLE(resource)) { 643 SDT_PROBE(racct, kernel, rusage, set_failure, p, 644 resource, amount, 0, 0); 645 return (error); 646 } 647 } 648#endif 649 racct_alloc_resource(p->p_racct, resource, diff_proc); 650 if (diff_cred > 0) 651 racct_add_cred_locked(p->p_ucred, resource, diff_cred); 652 else if (diff_cred < 0) 653 racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); 654 655 return (0); 656} 657 658/* 659 * Set allocation of 'resource' to 'amount' for process 'p'. 660 * Return 0 if it's below limits, or errno, if it's not. 661 * 662 * Note that decreasing the allocation always returns 0, 663 * even if it's above the limit. 664 */ 665int 666racct_set(struct proc *p, int resource, uint64_t amount) 667{ 668 int error; 669 670 mtx_lock(&racct_lock); 671 error = racct_set_locked(p, resource, amount); 672 mtx_unlock(&racct_lock); 673 return (error); 674} 675 676static void 677racct_set_force_locked(struct proc *p, int resource, uint64_t amount) 678{ 679 int64_t old_amount, decayed_amount; 680 int64_t diff_proc, diff_cred; 681 682 SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0); 683 684 /* 685 * We need proc lock to dereference p->p_ucred. 686 */ 687 PROC_LOCK_ASSERT(p, MA_OWNED); 688 689 old_amount = p->p_racct->r_resources[resource]; 690 /* 691 * The diffs may be negative. 692 */ 693 diff_proc = amount - old_amount; 694 if (RACCT_IS_DECAYING(resource)) { 695 /* 696 * Resources in per-credential racct containers may decay. 697 * If this is the case, we need to calculate the difference 698 * between the new amount and the proportional value of the 699 * old amount that has decayed in the ucred racct containers. 700 */ 701 decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; 702 diff_cred = amount - decayed_amount; 703 } else 704 diff_cred = diff_proc; 705 706 racct_alloc_resource(p->p_racct, resource, diff_proc); 707 if (diff_cred > 0) 708 racct_add_cred_locked(p->p_ucred, resource, diff_cred); 709 else if (diff_cred < 0) 710 racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); 711} 712 713void 714racct_set_force(struct proc *p, int resource, uint64_t amount) 715{ 716 mtx_lock(&racct_lock); 717 racct_set_force_locked(p, resource, amount); 718 mtx_unlock(&racct_lock); 719} 720 721/* 722 * Returns amount of 'resource' the process 'p' can keep allocated. 723 * Allocating more than that would be denied, unless the resource 724 * is marked undeniable. Amount of already allocated resource does 725 * not matter. 726 */ 727uint64_t 728racct_get_limit(struct proc *p, int resource) 729{ 730 731#ifdef RCTL 732 return (rctl_get_limit(p, resource)); 733#else 734 return (UINT64_MAX); 735#endif 736} 737 738/* 739 * Returns amount of 'resource' the process 'p' can keep allocated. 740 * Allocating more than that would be denied, unless the resource 741 * is marked undeniable. Amount of already allocated resource does 742 * matter. 743 */ 744uint64_t 745racct_get_available(struct proc *p, int resource) 746{ 747 748#ifdef RCTL 749 return (rctl_get_available(p, resource)); 750#else 751 return (UINT64_MAX); 752#endif 753} 754 755/* 756 * Returns amount of the %cpu resource that process 'p' can add to its %cpu 757 * utilization. Adding more than that would lead to the process being 758 * throttled. 759 */ 760static int64_t 761racct_pcpu_available(struct proc *p) 762{ 763 764#ifdef RCTL 765 return (rctl_pcpu_available(p)); 766#else 767 return (INT64_MAX); 768#endif 769} 770 771/* 772 * Decrease allocation of 'resource' by 'amount' for process 'p'. 773 */ 774void 775racct_sub(struct proc *p, int resource, uint64_t amount) 776{ 777 778 SDT_PROBE(racct, kernel, rusage, sub, p, resource, amount, 0, 0); 779 780 /* 781 * We need proc lock to dereference p->p_ucred. 782 */ 783 PROC_LOCK_ASSERT(p, MA_OWNED); 784 KASSERT(RACCT_CAN_DROP(resource), 785 ("racct_sub: called for non-droppable resource %d", resource)); 786 787 mtx_lock(&racct_lock); 788 KASSERT(amount <= p->p_racct->r_resources[resource], 789 ("racct_sub: freeing %ju of resource %d, which is more " 790 "than allocated %jd for %s (pid %d)", amount, resource, 791 (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid)); 792 793 racct_alloc_resource(p->p_racct, resource, -amount); 794 racct_sub_cred_locked(p->p_ucred, resource, amount); 795 mtx_unlock(&racct_lock); 796} 797 798static void 799racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) 800{ 801 struct prison *pr; 802 803 SDT_PROBE(racct, kernel, rusage, sub_cred, cred, resource, amount, 804 0, 0); 805 806#ifdef notyet 807 KASSERT(RACCT_CAN_DROP(resource), 808 ("racct_sub_cred: called for resource %d which can not drop", 809 resource)); 810#endif 811 812 racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, -amount); 813 for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) 814 racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource, 815 -amount); 816 racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, -amount); 817} 818 819/* 820 * Decrease allocation of 'resource' by 'amount' for credential 'cred'. 821 */ 822void 823racct_sub_cred(struct ucred *cred, int resource, uint64_t amount) 824{ 825 826 mtx_lock(&racct_lock); 827 racct_sub_cred_locked(cred, resource, amount); 828 mtx_unlock(&racct_lock); 829} 830 831/* 832 * Inherit resource usage information from the parent process. 833 */ 834int 835racct_proc_fork(struct proc *parent, struct proc *child) 836{ 837 int i, error = 0; 838 839 /* 840 * Create racct for the child process. 841 */ 842 racct_create(&child->p_racct); 843 844 PROC_LOCK(parent); 845 PROC_LOCK(child); 846 mtx_lock(&racct_lock); 847 848#ifdef RCTL 849 error = rctl_proc_fork(parent, child); 850 if (error != 0) 851 goto out; 852#endif 853 854 /* Init process cpu time. */ 855 child->p_prev_runtime = 0; 856 child->p_throttled = 0; 857 858 /* 859 * Inherit resource usage. 860 */ 861 for (i = 0; i <= RACCT_MAX; i++) { 862 if (parent->p_racct->r_resources[i] == 0 || 863 !RACCT_IS_INHERITABLE(i)) 864 continue; 865 866 error = racct_set_locked(child, i, 867 parent->p_racct->r_resources[i]); 868 if (error != 0) 869 goto out; 870 } 871 872 error = racct_add_locked(child, RACCT_NPROC, 1); 873 error += racct_add_locked(child, RACCT_NTHR, 1); 874 875out: 876 mtx_unlock(&racct_lock); 877 PROC_UNLOCK(child); 878 PROC_UNLOCK(parent); 879 880 if (error != 0) 881 racct_proc_exit(child); 882 883 return (error); 884} 885 886/* 887 * Called at the end of fork1(), to handle rules that require the process 888 * to be fully initialized. 889 */ 890void 891racct_proc_fork_done(struct proc *child) 892{ 893 894#ifdef RCTL 895 PROC_LOCK(child); 896 mtx_lock(&racct_lock); 897 rctl_enforce(child, RACCT_NPROC, 0); 898 rctl_enforce(child, RACCT_NTHR, 0); 899 mtx_unlock(&racct_lock); 900 PROC_UNLOCK(child); 901#endif 902} 903 904void 905racct_proc_exit(struct proc *p) 906{ 907 int i; 908 uint64_t runtime; 909 struct timeval wallclock; 910 uint64_t pct_estimate, pct; 911 912 PROC_LOCK(p); 913 /* 914 * We don't need to calculate rux, proc_reap() has already done this. 915 */ 916 runtime = cputick2usec(p->p_rux.rux_runtime); 917#ifdef notyet 918 KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); 919#else 920 if (runtime < p->p_prev_runtime) 921 runtime = p->p_prev_runtime; 922#endif 923 microuptime(&wallclock); 924 timevalsub(&wallclock, &p->p_stats->p_start); 925 if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { 926 pct_estimate = (1000000 * runtime * 100) / 927 ((uint64_t)wallclock.tv_sec * 1000000 + 928 wallclock.tv_usec); 929 } else 930 pct_estimate = 0; 931 pct = racct_getpcpu(p, pct_estimate); 932 933 mtx_lock(&racct_lock); 934 racct_set_locked(p, RACCT_CPU, runtime); 935 racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct); 936 937 for (i = 0; i <= RACCT_MAX; i++) { 938 if (p->p_racct->r_resources[i] == 0) 939 continue; 940 if (!RACCT_IS_RECLAIMABLE(i)) 941 continue; 942 racct_set_locked(p, i, 0); 943 } 944 945 mtx_unlock(&racct_lock); 946 PROC_UNLOCK(p); 947 948#ifdef RCTL 949 rctl_racct_release(p->p_racct); 950#endif 951 racct_destroy(&p->p_racct); 952} 953 954/* 955 * Called after credentials change, to move resource utilisation 956 * between raccts. 957 */ 958void 959racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, 960 struct ucred *newcred) 961{ 962 struct uidinfo *olduip, *newuip; 963 struct loginclass *oldlc, *newlc; 964 struct prison *oldpr, *newpr, *pr; 965 966 PROC_LOCK_ASSERT(p, MA_NOTOWNED); 967 968 newuip = newcred->cr_ruidinfo; 969 olduip = oldcred->cr_ruidinfo; 970 newlc = newcred->cr_loginclass; 971 oldlc = oldcred->cr_loginclass; 972 newpr = newcred->cr_prison; 973 oldpr = oldcred->cr_prison; 974 975 mtx_lock(&racct_lock); 976 if (newuip != olduip) { 977 racct_sub_racct(olduip->ui_racct, p->p_racct); 978 racct_add_racct(newuip->ui_racct, p->p_racct); 979 } 980 if (newlc != oldlc) { 981 racct_sub_racct(oldlc->lc_racct, p->p_racct); 982 racct_add_racct(newlc->lc_racct, p->p_racct); 983 } 984 if (newpr != oldpr) { 985 for (pr = oldpr; pr != NULL; pr = pr->pr_parent) 986 racct_sub_racct(pr->pr_prison_racct->prr_racct, 987 p->p_racct); 988 for (pr = newpr; pr != NULL; pr = pr->pr_parent) 989 racct_add_racct(pr->pr_prison_racct->prr_racct, 990 p->p_racct); 991 } 992 mtx_unlock(&racct_lock); 993 994#ifdef RCTL 995 rctl_proc_ucred_changed(p, newcred); 996#endif 997} 998 999void 1000racct_move(struct racct *dest, struct racct *src) 1001{ 1002 1003 mtx_lock(&racct_lock); 1004 1005 racct_add_racct(dest, src); 1006 racct_sub_racct(src, src); 1007 1008 mtx_unlock(&racct_lock); 1009} 1010 1011static void 1012racct_proc_throttle(struct proc *p) 1013{ 1014 struct thread *td; 1015#ifdef SMP 1016 int cpuid; 1017#endif 1018 1019 PROC_LOCK_ASSERT(p, MA_OWNED); 1020 1021 /* 1022 * Do not block kernel processes. Also do not block processes with 1023 * low %cpu utilization to improve interactivity. 1024 */ 1025 if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) || 1026 (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold)) 1027 return; 1028 p->p_throttled = 1; 1029 1030 FOREACH_THREAD_IN_PROC(p, td) { 1031 switch (td->td_state) { 1032 case TDS_RUNQ: 1033 /* 1034 * If the thread is on the scheduler run-queue, we can 1035 * not just remove it from there. So we set the flag 1036 * TDF_NEEDRESCHED for the thread, so that once it is 1037 * running, it is taken off the cpu as soon as possible. 1038 */ 1039 thread_lock(td); 1040 td->td_flags |= TDF_NEEDRESCHED; 1041 thread_unlock(td); 1042 break; 1043 case TDS_RUNNING: 1044 /* 1045 * If the thread is running, we request a context 1046 * switch for it by setting the TDF_NEEDRESCHED flag. 1047 */ 1048 thread_lock(td); 1049 td->td_flags |= TDF_NEEDRESCHED; 1050#ifdef SMP 1051 cpuid = td->td_oncpu; 1052 if ((cpuid != NOCPU) && (td != curthread)) 1053 ipi_cpu(cpuid, IPI_AST); 1054#endif 1055 thread_unlock(td); 1056 break; 1057 default: 1058 break; 1059 } 1060 } 1061} 1062 1063static void 1064racct_proc_wakeup(struct proc *p) 1065{ 1066 PROC_LOCK_ASSERT(p, MA_OWNED); 1067 1068 if (p->p_throttled) { 1069 p->p_throttled = 0; 1070 wakeup(p->p_racct); 1071 } 1072} 1073 1074static void 1075racct_decay_resource(struct racct *racct, void * res, void* dummy) 1076{ 1077 int resource; 1078 int64_t r_old, r_new; 1079 1080 resource = *(int *)res; 1081 r_old = racct->r_resources[resource]; 1082 1083 /* If there is nothing to decay, just exit. */ 1084 if (r_old <= 0) 1085 return; 1086 1087 mtx_lock(&racct_lock); 1088 r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; 1089 racct->r_resources[resource] = r_new; 1090 mtx_unlock(&racct_lock); 1091} 1092 1093static void 1094racct_decay(int resource) 1095{ 1096 ui_racct_foreach(racct_decay_resource, &resource, NULL); 1097 loginclass_racct_foreach(racct_decay_resource, &resource, NULL); 1098 prison_racct_foreach(racct_decay_resource, &resource, NULL); 1099} 1100 1101static void 1102racctd(void) 1103{ 1104 struct thread *td; 1105 struct proc *p; 1106 struct timeval wallclock; 1107 uint64_t runtime; 1108 uint64_t pct, pct_estimate; 1109 1110 for (;;) { 1111 racct_decay(RACCT_PCTCPU); 1112 1113 sx_slock(&allproc_lock); 1114 1115 LIST_FOREACH(p, &zombproc, p_list) { 1116 PROC_LOCK(p); 1117 racct_set(p, RACCT_PCTCPU, 0); 1118 PROC_UNLOCK(p); 1119 } 1120 1121 FOREACH_PROC_IN_SYSTEM(p) { 1122 PROC_LOCK(p); 1123 if (p->p_state != PRS_NORMAL) { 1124 PROC_UNLOCK(p); 1125 continue; 1126 } 1127 1128 microuptime(&wallclock); 1129 timevalsub(&wallclock, &p->p_stats->p_start); 1130 PROC_SLOCK(p); 1131 FOREACH_THREAD_IN_PROC(p, td) 1132 ruxagg(p, td); 1133 runtime = cputick2usec(p->p_rux.rux_runtime); 1134 PROC_SUNLOCK(p); 1135#ifdef notyet 1136 KASSERT(runtime >= p->p_prev_runtime, 1137 ("runtime < p_prev_runtime")); 1138#else 1139 if (runtime < p->p_prev_runtime) 1140 runtime = p->p_prev_runtime; 1141#endif 1142 p->p_prev_runtime = runtime; 1143 if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { 1144 pct_estimate = (1000000 * runtime * 100) / 1145 ((uint64_t)wallclock.tv_sec * 1000000 + 1146 wallclock.tv_usec); 1147 } else 1148 pct_estimate = 0; 1149 pct = racct_getpcpu(p, pct_estimate); 1150 mtx_lock(&racct_lock); 1151 racct_set_force_locked(p, RACCT_PCTCPU, pct); 1152 racct_set_locked(p, RACCT_CPU, runtime); 1153 racct_set_locked(p, RACCT_WALLCLOCK, 1154 (uint64_t)wallclock.tv_sec * 1000000 + 1155 wallclock.tv_usec); 1156 mtx_unlock(&racct_lock); 1157 PROC_UNLOCK(p); 1158 } 1159 1160 /* 1161 * To ensure that processes are throttled in a fair way, we need 1162 * to iterate over all processes again and check the limits 1163 * for %cpu resource only after ucred racct containers have been 1164 * properly filled. 1165 */ 1166 FOREACH_PROC_IN_SYSTEM(p) { 1167 PROC_LOCK(p); 1168 if (p->p_state != PRS_NORMAL) { 1169 PROC_UNLOCK(p); 1170 continue; 1171 } 1172 1173 if (racct_pcpu_available(p) <= 0) 1174 racct_proc_throttle(p); 1175 else if (p->p_throttled) 1176 racct_proc_wakeup(p); 1177 PROC_UNLOCK(p); 1178 } 1179 sx_sunlock(&allproc_lock); 1180 pause("-", hz); 1181 } 1182} 1183 1184static struct kproc_desc racctd_kp = { 1185 "racctd", 1186 racctd, 1187 NULL 1188}; 1189SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, kproc_start, &racctd_kp); 1190 1191static void 1192racct_init(void) 1193{ 1194 1195 racct_zone = uma_zcreate("racct", sizeof(struct racct), 1196 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 1197 /* 1198 * XXX: Move this somewhere. 1199 */ 1200 prison0.pr_prison_racct = prison_racct_find("0"); 1201} 1202SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL); 1203 1204#else /* !RACCT */ 1205 1206int 1207racct_add(struct proc *p, int resource, uint64_t amount) 1208{ 1209 1210 return (0); 1211} 1212 1213void 1214racct_add_cred(struct ucred *cred, int resource, uint64_t amount) 1215{ 1216} 1217 1218void 1219racct_add_force(struct proc *p, int resource, uint64_t amount) 1220{ 1221 1222 return; 1223} 1224 1225int 1226racct_set(struct proc *p, int resource, uint64_t amount) 1227{ 1228 1229 return (0); 1230} 1231 1232void 1233racct_set_force(struct proc *p, int resource, uint64_t amount) 1234{ 1235} 1236 1237void 1238racct_sub(struct proc *p, int resource, uint64_t amount) 1239{ 1240} 1241 1242void 1243racct_sub_cred(struct ucred *cred, int resource, uint64_t amount) 1244{ 1245} 1246 1247uint64_t 1248racct_get_limit(struct proc *p, int resource) 1249{ 1250 1251 return (UINT64_MAX); 1252} 1253 1254uint64_t 1255racct_get_available(struct proc *p, int resource) 1256{ 1257 1258 return (UINT64_MAX); 1259} 1260 1261void 1262racct_create(struct racct **racctp) 1263{ 1264} 1265 1266void 1267racct_destroy(struct racct **racctp) 1268{ 1269} 1270 1271int 1272racct_proc_fork(struct proc *parent, struct proc *child) 1273{ 1274 1275 return (0); 1276} 1277 1278void 1279racct_proc_fork_done(struct proc *child) 1280{ 1281} 1282 1283void 1284racct_proc_exit(struct proc *p) 1285{ 1286} 1287 1288#endif /* !RACCT */ 1289