kern_racct.c revision 258622
1/*- 2 * Copyright (c) 2010 The FreeBSD Foundation 3 * All rights reserved. 4 * 5 * This software was developed by Edward Tomasz Napierala under sponsorship 6 * from the FreeBSD Foundation. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * $FreeBSD: head/sys/kern/kern_racct.c 258622 2013-11-26 08:46:27Z avg $ 30 */ 31 32#include <sys/cdefs.h> 33__FBSDID("$FreeBSD: head/sys/kern/kern_racct.c 258622 2013-11-26 08:46:27Z avg $"); 34 35#include "opt_sched.h" 36 37#include <sys/param.h> 38#include <sys/systm.h> 39#include <sys/eventhandler.h> 40#include <sys/jail.h> 41#include <sys/kernel.h> 42#include <sys/kthread.h> 43#include <sys/lock.h> 44#include <sys/loginclass.h> 45#include <sys/malloc.h> 46#include <sys/mutex.h> 47#include <sys/proc.h> 48#include <sys/racct.h> 49#include <sys/resourcevar.h> 50#include <sys/sbuf.h> 51#include <sys/sched.h> 52#include <sys/sdt.h> 53#include <sys/smp.h> 54#include <sys/sx.h> 55#include <sys/sysctl.h> 56#include <sys/sysent.h> 57#include <sys/sysproto.h> 58#include <sys/umtx.h> 59#include <machine/smp.h> 60 61#ifdef RCTL 62#include <sys/rctl.h> 63#endif 64 65#ifdef RACCT 66 67FEATURE(racct, "Resource Accounting"); 68 69/* 70 * Do not block processes that have their %cpu usage <= pcpu_threshold. 71 */ 72static int pcpu_threshold = 1; 73 74SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting"); 75SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, 76 0, "Processes with higher %cpu usage than this value can be throttled."); 77 78/* 79 * How many seconds it takes to use the scheduler %cpu calculations. When a 80 * process starts, we compute its %cpu usage by dividing its runtime by the 81 * process wall clock time. After RACCT_PCPU_SECS pass, we use the value 82 * provided by the scheduler. 83 */ 84#define RACCT_PCPU_SECS 3 85 86static struct mtx racct_lock; 87MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF); 88 89static uma_zone_t racct_zone; 90 91static void racct_sub_racct(struct racct *dest, const struct racct *src); 92static void racct_sub_cred_locked(struct ucred *cred, int resource, 93 uint64_t amount); 94static void racct_add_cred_locked(struct ucred *cred, int resource, 95 uint64_t amount); 96 97SDT_PROVIDER_DEFINE(racct); 98SDT_PROBE_DEFINE3(racct, kernel, rusage, add, "struct proc *", "int", 99 "uint64_t"); 100SDT_PROBE_DEFINE3(racct, kernel, rusage, add__failure, 101 "struct proc *", "int", "uint64_t"); 102SDT_PROBE_DEFINE3(racct, kernel, rusage, add__cred, "struct ucred *", 103 "int", "uint64_t"); 104SDT_PROBE_DEFINE3(racct, kernel, rusage, add__force, "struct proc *", 105 "int", "uint64_t"); 106SDT_PROBE_DEFINE3(racct, kernel, rusage, set, "struct proc *", "int", 107 "uint64_t"); 108SDT_PROBE_DEFINE3(racct, kernel, rusage, set__failure, 109 "struct proc *", "int", "uint64_t"); 110SDT_PROBE_DEFINE3(racct, kernel, rusage, sub, "struct proc *", "int", 111 "uint64_t"); 112SDT_PROBE_DEFINE3(racct, kernel, rusage, sub__cred, "struct ucred *", 113 "int", "uint64_t"); 114SDT_PROBE_DEFINE1(racct, kernel, racct, create, "struct racct *"); 115SDT_PROBE_DEFINE1(racct, kernel, racct, destroy, "struct racct *"); 116SDT_PROBE_DEFINE2(racct, kernel, racct, join, "struct racct *", 117 "struct racct *"); 118SDT_PROBE_DEFINE2(racct, kernel, racct, join__failure, 119 "struct racct *", "struct racct *"); 120SDT_PROBE_DEFINE2(racct, kernel, racct, leave, "struct racct *", 121 "struct racct *"); 122 123int racct_types[] = { 124 [RACCT_CPU] = 125 RACCT_IN_MILLIONS, 126 [RACCT_DATA] = 127 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 128 [RACCT_STACK] = 129 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 130 [RACCT_CORE] = 131 RACCT_DENIABLE, 132 [RACCT_RSS] = 133 RACCT_RECLAIMABLE, 134 [RACCT_MEMLOCK] = 135 RACCT_RECLAIMABLE | RACCT_DENIABLE, 136 [RACCT_NPROC] = 137 RACCT_RECLAIMABLE | RACCT_DENIABLE, 138 [RACCT_NOFILE] = 139 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 140 [RACCT_VMEM] = 141 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 142 [RACCT_NPTS] = 143 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 144 [RACCT_SWAP] = 145 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 146 [RACCT_NTHR] = 147 RACCT_RECLAIMABLE | RACCT_DENIABLE, 148 [RACCT_MSGQQUEUED] = 149 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 150 [RACCT_MSGQSIZE] = 151 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 152 [RACCT_NMSGQ] = 153 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 154 [RACCT_NSEM] = 155 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 156 [RACCT_NSEMOP] = 157 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 158 [RACCT_NSHM] = 159 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 160 [RACCT_SHMSIZE] = 161 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 162 [RACCT_WALLCLOCK] = 163 RACCT_IN_MILLIONS, 164 [RACCT_PCTCPU] = 165 RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS }; 166 167static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE; 168 169#ifdef SCHED_4BSD 170/* 171 * Contains intermediate values for %cpu calculations to avoid using floating 172 * point in the kernel. 173 * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20) 174 * It is needed only for the 4BSD scheduler, because in ULE, the ccpu equals to 175 * zero so the calculations are more straightforward. 176 */ 177fixpt_t ccpu_exp[] = { 178 [0] = FSCALE * 1, 179 [1] = FSCALE * 0.95122942450071400909, 180 [2] = FSCALE * 0.90483741803595957316, 181 [3] = FSCALE * 0.86070797642505780722, 182 [4] = FSCALE * 0.81873075307798185866, 183 [5] = FSCALE * 0.77880078307140486824, 184 [6] = FSCALE * 0.74081822068171786606, 185 [7] = FSCALE * 0.70468808971871343435, 186 [8] = FSCALE * 0.67032004603563930074, 187 [9] = FSCALE * 0.63762815162177329314, 188 [10] = FSCALE * 0.60653065971263342360, 189 [11] = FSCALE * 0.57694981038048669531, 190 [12] = FSCALE * 0.54881163609402643262, 191 [13] = FSCALE * 0.52204577676101604789, 192 [14] = FSCALE * 0.49658530379140951470, 193 [15] = FSCALE * 0.47236655274101470713, 194 [16] = FSCALE * 0.44932896411722159143, 195 [17] = FSCALE * 0.42741493194872666992, 196 [18] = FSCALE * 0.40656965974059911188, 197 [19] = FSCALE * 0.38674102345450120691, 198 [20] = FSCALE * 0.36787944117144232159, 199 [21] = FSCALE * 0.34993774911115535467, 200 [22] = FSCALE * 0.33287108369807955328, 201 [23] = FSCALE * 0.31663676937905321821, 202 [24] = FSCALE * 0.30119421191220209664, 203 [25] = FSCALE * 0.28650479686019010032, 204 [26] = FSCALE * 0.27253179303401260312, 205 [27] = FSCALE * 0.25924026064589150757, 206 [28] = FSCALE * 0.24659696394160647693, 207 [29] = FSCALE * 0.23457028809379765313, 208 [30] = FSCALE * 0.22313016014842982893, 209 [31] = FSCALE * 0.21224797382674305771, 210 [32] = FSCALE * 0.20189651799465540848, 211 [33] = FSCALE * 0.19204990862075411423, 212 [34] = FSCALE * 0.18268352405273465022, 213 [35] = FSCALE * 0.17377394345044512668, 214 [36] = FSCALE * 0.16529888822158653829, 215 [37] = FSCALE * 0.15723716631362761621, 216 [38] = FSCALE * 0.14956861922263505264, 217 [39] = FSCALE * 0.14227407158651357185, 218 [40] = FSCALE * 0.13533528323661269189, 219 [41] = FSCALE * 0.12873490358780421886, 220 [42] = FSCALE * 0.12245642825298191021, 221 [43] = FSCALE * 0.11648415777349695786, 222 [44] = FSCALE * 0.11080315836233388333, 223 [45] = FSCALE * 0.10539922456186433678, 224 [46] = FSCALE * 0.10025884372280373372, 225 [47] = FSCALE * 0.09536916221554961888, 226 [48] = FSCALE * 0.09071795328941250337, 227 [49] = FSCALE * 0.08629358649937051097, 228 [50] = FSCALE * 0.08208499862389879516, 229 [51] = FSCALE * 0.07808166600115315231, 230 [52] = FSCALE * 0.07427357821433388042, 231 [53] = FSCALE * 0.07065121306042958674, 232 [54] = FSCALE * 0.06720551273974976512, 233 [55] = FSCALE * 0.06392786120670757270, 234 [56] = FSCALE * 0.06081006262521796499, 235 [57] = FSCALE * 0.05784432087483846296, 236 [58] = FSCALE * 0.05502322005640722902, 237 [59] = FSCALE * 0.05233970594843239308, 238 [60] = FSCALE * 0.04978706836786394297, 239 [61] = FSCALE * 0.04735892439114092119, 240 [62] = FSCALE * 0.04504920239355780606, 241 [63] = FSCALE * 0.04285212686704017991, 242 [64] = FSCALE * 0.04076220397836621516, 243 [65] = FSCALE * 0.03877420783172200988, 244 [66] = FSCALE * 0.03688316740124000544, 245 [67] = FSCALE * 0.03508435410084502588, 246 [68] = FSCALE * 0.03337326996032607948, 247 [69] = FSCALE * 0.03174563637806794323, 248 [70] = FSCALE * 0.03019738342231850073, 249 [71] = FSCALE * 0.02872463965423942912, 250 [72] = FSCALE * 0.02732372244729256080, 251 [73] = FSCALE * 0.02599112877875534358, 252 [74] = FSCALE * 0.02472352647033939120, 253 [75] = FSCALE * 0.02351774585600910823, 254 [76] = FSCALE * 0.02237077185616559577, 255 [77] = FSCALE * 0.02127973643837716938, 256 [78] = FSCALE * 0.02024191144580438847, 257 [79] = FSCALE * 0.01925470177538692429, 258 [80] = FSCALE * 0.01831563888873418029, 259 [81] = FSCALE * 0.01742237463949351138, 260 [82] = FSCALE * 0.01657267540176124754, 261 [83] = FSCALE * 0.01576441648485449082, 262 [84] = FSCALE * 0.01499557682047770621, 263 [85] = FSCALE * 0.01426423390899925527, 264 [86] = FSCALE * 0.01356855901220093175, 265 [87] = FSCALE * 0.01290681258047986886, 266 [88] = FSCALE * 0.01227733990306844117, 267 [89] = FSCALE * 0.01167856697039544521, 268 [90] = FSCALE * 0.01110899653824230649, 269 [91] = FSCALE * 0.01056720438385265337, 270 [92] = FSCALE * 0.01005183574463358164, 271 [93] = FSCALE * 0.00956160193054350793, 272 [94] = FSCALE * 0.00909527710169581709, 273 [95] = FSCALE * 0.00865169520312063417, 274 [96] = FSCALE * 0.00822974704902002884, 275 [97] = FSCALE * 0.00782837754922577143, 276 [98] = FSCALE * 0.00744658307092434051, 277 [99] = FSCALE * 0.00708340892905212004, 278 [100] = FSCALE * 0.00673794699908546709, 279 [101] = FSCALE * 0.00640933344625638184, 280 [102] = FSCALE * 0.00609674656551563610, 281 [103] = FSCALE * 0.00579940472684214321, 282 [104] = FSCALE * 0.00551656442076077241, 283 [105] = FSCALE * 0.00524751839918138427, 284 [106] = FSCALE * 0.00499159390691021621, 285 [107] = FSCALE * 0.00474815099941147558, 286 [108] = FSCALE * 0.00451658094261266798, 287 [109] = FSCALE * 0.00429630469075234057, 288 [110] = FSCALE * 0.00408677143846406699, 289}; 290#endif 291 292#define CCPU_EXP_MAX 110 293 294/* 295 * This function is analogical to the getpcpu() function in the ps(1) command. 296 * They should both calculate in the same way so that the racct %cpu 297 * calculations are consistent with the values showed by the ps(1) tool. 298 * The calculations are more complex in the 4BSD scheduler because of the value 299 * of the ccpu variable. In ULE it is defined to be zero which saves us some 300 * work. 301 */ 302static uint64_t 303racct_getpcpu(struct proc *p, u_int pcpu) 304{ 305 u_int swtime; 306#ifdef SCHED_4BSD 307 fixpt_t pctcpu, pctcpu_next; 308#endif 309#ifdef SMP 310 struct pcpu *pc; 311 int found; 312#endif 313 fixpt_t p_pctcpu; 314 struct thread *td; 315 316 /* 317 * If the process is swapped out, we count its %cpu usage as zero. 318 * This behaviour is consistent with the userland ps(1) tool. 319 */ 320 if ((p->p_flag & P_INMEM) == 0) 321 return (0); 322 swtime = (ticks - p->p_swtick) / hz; 323 324 /* 325 * For short-lived processes, the sched_pctcpu() returns small 326 * values even for cpu intensive processes. Therefore we use 327 * our own estimate in this case. 328 */ 329 if (swtime < RACCT_PCPU_SECS) 330 return (pcpu); 331 332 p_pctcpu = 0; 333 FOREACH_THREAD_IN_PROC(p, td) { 334 if (td == PCPU_GET(idlethread)) 335 continue; 336#ifdef SMP 337 found = 0; 338 STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { 339 if (td == pc->pc_idlethread) { 340 found = 1; 341 break; 342 } 343 } 344 if (found) 345 continue; 346#endif 347 thread_lock(td); 348#ifdef SCHED_4BSD 349 pctcpu = sched_pctcpu(td); 350 /* Count also the yet unfinished second. */ 351 pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT; 352 pctcpu_next += sched_pctcpu_delta(td); 353 p_pctcpu += max(pctcpu, pctcpu_next); 354#else 355 /* 356 * In ULE the %cpu statistics are updated on every 357 * sched_pctcpu() call. So special calculations to 358 * account for the latest (unfinished) second are 359 * not needed. 360 */ 361 p_pctcpu += sched_pctcpu(td); 362#endif 363 thread_unlock(td); 364 } 365 366#ifdef SCHED_4BSD 367 if (swtime <= CCPU_EXP_MAX) 368 return ((100 * (uint64_t)p_pctcpu * 1000000) / 369 (FSCALE - ccpu_exp[swtime])); 370#endif 371 372 return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE); 373} 374 375static void 376racct_add_racct(struct racct *dest, const struct racct *src) 377{ 378 int i; 379 380 mtx_assert(&racct_lock, MA_OWNED); 381 382 /* 383 * Update resource usage in dest. 384 */ 385 for (i = 0; i <= RACCT_MAX; i++) { 386 KASSERT(dest->r_resources[i] >= 0, 387 ("%s: resource %d propagation meltdown: dest < 0", 388 __func__, i)); 389 KASSERT(src->r_resources[i] >= 0, 390 ("%s: resource %d propagation meltdown: src < 0", 391 __func__, i)); 392 dest->r_resources[i] += src->r_resources[i]; 393 } 394} 395 396static void 397racct_sub_racct(struct racct *dest, const struct racct *src) 398{ 399 int i; 400 401 mtx_assert(&racct_lock, MA_OWNED); 402 403 /* 404 * Update resource usage in dest. 405 */ 406 for (i = 0; i <= RACCT_MAX; i++) { 407 if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) { 408 KASSERT(dest->r_resources[i] >= 0, 409 ("%s: resource %d propagation meltdown: dest < 0", 410 __func__, i)); 411 KASSERT(src->r_resources[i] >= 0, 412 ("%s: resource %d propagation meltdown: src < 0", 413 __func__, i)); 414 KASSERT(src->r_resources[i] <= dest->r_resources[i], 415 ("%s: resource %d propagation meltdown: src > dest", 416 __func__, i)); 417 } 418 if (RACCT_CAN_DROP(i)) { 419 dest->r_resources[i] -= src->r_resources[i]; 420 if (dest->r_resources[i] < 0) { 421 KASSERT(RACCT_IS_SLOPPY(i) || 422 RACCT_IS_DECAYING(i), 423 ("%s: resource %d usage < 0", __func__, i)); 424 dest->r_resources[i] = 0; 425 } 426 } 427 } 428} 429 430void 431racct_create(struct racct **racctp) 432{ 433 434 SDT_PROBE(racct, kernel, racct, create, racctp, 0, 0, 0, 0); 435 436 KASSERT(*racctp == NULL, ("racct already allocated")); 437 438 *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO); 439} 440 441static void 442racct_destroy_locked(struct racct **racctp) 443{ 444 int i; 445 struct racct *racct; 446 447 SDT_PROBE(racct, kernel, racct, destroy, racctp, 0, 0, 0, 0); 448 449 mtx_assert(&racct_lock, MA_OWNED); 450 KASSERT(racctp != NULL, ("NULL racctp")); 451 KASSERT(*racctp != NULL, ("NULL racct")); 452 453 racct = *racctp; 454 455 for (i = 0; i <= RACCT_MAX; i++) { 456 if (RACCT_IS_SLOPPY(i)) 457 continue; 458 if (!RACCT_IS_RECLAIMABLE(i)) 459 continue; 460 KASSERT(racct->r_resources[i] == 0, 461 ("destroying non-empty racct: " 462 "%ju allocated for resource %d\n", 463 racct->r_resources[i], i)); 464 } 465 uma_zfree(racct_zone, racct); 466 *racctp = NULL; 467} 468 469void 470racct_destroy(struct racct **racct) 471{ 472 473 mtx_lock(&racct_lock); 474 racct_destroy_locked(racct); 475 mtx_unlock(&racct_lock); 476} 477 478/* 479 * Increase consumption of 'resource' by 'amount' for 'racct' 480 * and all its parents. Differently from other cases, 'amount' here 481 * may be less than zero. 482 */ 483static void 484racct_alloc_resource(struct racct *racct, int resource, 485 uint64_t amount) 486{ 487 488 mtx_assert(&racct_lock, MA_OWNED); 489 KASSERT(racct != NULL, ("NULL racct")); 490 491 racct->r_resources[resource] += amount; 492 if (racct->r_resources[resource] < 0) { 493 KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource), 494 ("%s: resource %d usage < 0", __func__, resource)); 495 racct->r_resources[resource] = 0; 496 } 497 498 /* 499 * There are some cases where the racct %cpu resource would grow 500 * beyond 100%. 501 * For example in racct_proc_exit() we add the process %cpu usage 502 * to the ucred racct containers. If too many processes terminated 503 * in a short time span, the ucred %cpu resource could grow too much. 504 * Also, the 4BSD scheduler sometimes returns for a thread more than 505 * 100% cpu usage. So we set a boundary here to 100%. 506 */ 507 if ((resource == RACCT_PCTCPU) && 508 (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000)) 509 racct->r_resources[RACCT_PCTCPU] = 100 * 1000000; 510} 511 512static int 513racct_add_locked(struct proc *p, int resource, uint64_t amount) 514{ 515#ifdef RCTL 516 int error; 517#endif 518 519 SDT_PROBE(racct, kernel, rusage, add, p, resource, amount, 0, 0); 520 521 /* 522 * We need proc lock to dereference p->p_ucred. 523 */ 524 PROC_LOCK_ASSERT(p, MA_OWNED); 525 526#ifdef RCTL 527 error = rctl_enforce(p, resource, amount); 528 if (error && RACCT_IS_DENIABLE(resource)) { 529 SDT_PROBE(racct, kernel, rusage, add__failure, p, resource, 530 amount, 0, 0); 531 return (error); 532 } 533#endif 534 racct_alloc_resource(p->p_racct, resource, amount); 535 racct_add_cred_locked(p->p_ucred, resource, amount); 536 537 return (0); 538} 539 540/* 541 * Increase allocation of 'resource' by 'amount' for process 'p'. 542 * Return 0 if it's below limits, or errno, if it's not. 543 */ 544int 545racct_add(struct proc *p, int resource, uint64_t amount) 546{ 547 int error; 548 549 mtx_lock(&racct_lock); 550 error = racct_add_locked(p, resource, amount); 551 mtx_unlock(&racct_lock); 552 return (error); 553} 554 555static void 556racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) 557{ 558 struct prison *pr; 559 560 SDT_PROBE(racct, kernel, rusage, add__cred, cred, resource, amount, 561 0, 0); 562 563 racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, amount); 564 for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) 565 racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource, 566 amount); 567 racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, amount); 568} 569 570/* 571 * Increase allocation of 'resource' by 'amount' for credential 'cred'. 572 * Doesn't check for limits and never fails. 573 * 574 * XXX: Shouldn't this ever return an error? 575 */ 576void 577racct_add_cred(struct ucred *cred, int resource, uint64_t amount) 578{ 579 580 mtx_lock(&racct_lock); 581 racct_add_cred_locked(cred, resource, amount); 582 mtx_unlock(&racct_lock); 583} 584 585/* 586 * Increase allocation of 'resource' by 'amount' for process 'p'. 587 * Doesn't check for limits and never fails. 588 */ 589void 590racct_add_force(struct proc *p, int resource, uint64_t amount) 591{ 592 593 SDT_PROBE(racct, kernel, rusage, add__force, p, resource, amount, 0, 0); 594 595 /* 596 * We need proc lock to dereference p->p_ucred. 597 */ 598 PROC_LOCK_ASSERT(p, MA_OWNED); 599 600 mtx_lock(&racct_lock); 601 racct_alloc_resource(p->p_racct, resource, amount); 602 mtx_unlock(&racct_lock); 603 racct_add_cred(p->p_ucred, resource, amount); 604} 605 606static int 607racct_set_locked(struct proc *p, int resource, uint64_t amount) 608{ 609 int64_t old_amount, decayed_amount; 610 int64_t diff_proc, diff_cred; 611#ifdef RCTL 612 int error; 613#endif 614 615 SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0); 616 617 /* 618 * We need proc lock to dereference p->p_ucred. 619 */ 620 PROC_LOCK_ASSERT(p, MA_OWNED); 621 622 old_amount = p->p_racct->r_resources[resource]; 623 /* 624 * The diffs may be negative. 625 */ 626 diff_proc = amount - old_amount; 627 if (RACCT_IS_DECAYING(resource)) { 628 /* 629 * Resources in per-credential racct containers may decay. 630 * If this is the case, we need to calculate the difference 631 * between the new amount and the proportional value of the 632 * old amount that has decayed in the ucred racct containers. 633 */ 634 decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; 635 diff_cred = amount - decayed_amount; 636 } else 637 diff_cred = diff_proc; 638#ifdef notyet 639 KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource), 640 ("%s: usage of non-droppable resource %d dropping", __func__, 641 resource)); 642#endif 643#ifdef RCTL 644 if (diff_proc > 0) { 645 error = rctl_enforce(p, resource, diff_proc); 646 if (error && RACCT_IS_DENIABLE(resource)) { 647 SDT_PROBE(racct, kernel, rusage, set__failure, p, 648 resource, amount, 0, 0); 649 return (error); 650 } 651 } 652#endif 653 racct_alloc_resource(p->p_racct, resource, diff_proc); 654 if (diff_cred > 0) 655 racct_add_cred_locked(p->p_ucred, resource, diff_cred); 656 else if (diff_cred < 0) 657 racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); 658 659 return (0); 660} 661 662/* 663 * Set allocation of 'resource' to 'amount' for process 'p'. 664 * Return 0 if it's below limits, or errno, if it's not. 665 * 666 * Note that decreasing the allocation always returns 0, 667 * even if it's above the limit. 668 */ 669int 670racct_set(struct proc *p, int resource, uint64_t amount) 671{ 672 int error; 673 674 mtx_lock(&racct_lock); 675 error = racct_set_locked(p, resource, amount); 676 mtx_unlock(&racct_lock); 677 return (error); 678} 679 680static void 681racct_set_force_locked(struct proc *p, int resource, uint64_t amount) 682{ 683 int64_t old_amount, decayed_amount; 684 int64_t diff_proc, diff_cred; 685 686 SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0); 687 688 /* 689 * We need proc lock to dereference p->p_ucred. 690 */ 691 PROC_LOCK_ASSERT(p, MA_OWNED); 692 693 old_amount = p->p_racct->r_resources[resource]; 694 /* 695 * The diffs may be negative. 696 */ 697 diff_proc = amount - old_amount; 698 if (RACCT_IS_DECAYING(resource)) { 699 /* 700 * Resources in per-credential racct containers may decay. 701 * If this is the case, we need to calculate the difference 702 * between the new amount and the proportional value of the 703 * old amount that has decayed in the ucred racct containers. 704 */ 705 decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; 706 diff_cred = amount - decayed_amount; 707 } else 708 diff_cred = diff_proc; 709 710 racct_alloc_resource(p->p_racct, resource, diff_proc); 711 if (diff_cred > 0) 712 racct_add_cred_locked(p->p_ucred, resource, diff_cred); 713 else if (diff_cred < 0) 714 racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); 715} 716 717void 718racct_set_force(struct proc *p, int resource, uint64_t amount) 719{ 720 mtx_lock(&racct_lock); 721 racct_set_force_locked(p, resource, amount); 722 mtx_unlock(&racct_lock); 723} 724 725/* 726 * Returns amount of 'resource' the process 'p' can keep allocated. 727 * Allocating more than that would be denied, unless the resource 728 * is marked undeniable. Amount of already allocated resource does 729 * not matter. 730 */ 731uint64_t 732racct_get_limit(struct proc *p, int resource) 733{ 734 735#ifdef RCTL 736 return (rctl_get_limit(p, resource)); 737#else 738 return (UINT64_MAX); 739#endif 740} 741 742/* 743 * Returns amount of 'resource' the process 'p' can keep allocated. 744 * Allocating more than that would be denied, unless the resource 745 * is marked undeniable. Amount of already allocated resource does 746 * matter. 747 */ 748uint64_t 749racct_get_available(struct proc *p, int resource) 750{ 751 752#ifdef RCTL 753 return (rctl_get_available(p, resource)); 754#else 755 return (UINT64_MAX); 756#endif 757} 758 759/* 760 * Returns amount of the %cpu resource that process 'p' can add to its %cpu 761 * utilization. Adding more than that would lead to the process being 762 * throttled. 763 */ 764static int64_t 765racct_pcpu_available(struct proc *p) 766{ 767 768#ifdef RCTL 769 return (rctl_pcpu_available(p)); 770#else 771 return (INT64_MAX); 772#endif 773} 774 775/* 776 * Decrease allocation of 'resource' by 'amount' for process 'p'. 777 */ 778void 779racct_sub(struct proc *p, int resource, uint64_t amount) 780{ 781 782 SDT_PROBE(racct, kernel, rusage, sub, p, resource, amount, 0, 0); 783 784 /* 785 * We need proc lock to dereference p->p_ucred. 786 */ 787 PROC_LOCK_ASSERT(p, MA_OWNED); 788 KASSERT(RACCT_CAN_DROP(resource), 789 ("%s: called for non-droppable resource %d", __func__, resource)); 790 791 mtx_lock(&racct_lock); 792 KASSERT(amount <= p->p_racct->r_resources[resource], 793 ("%s: freeing %ju of resource %d, which is more " 794 "than allocated %jd for %s (pid %d)", __func__, amount, resource, 795 (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid)); 796 797 racct_alloc_resource(p->p_racct, resource, -amount); 798 racct_sub_cred_locked(p->p_ucred, resource, amount); 799 mtx_unlock(&racct_lock); 800} 801 802static void 803racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) 804{ 805 struct prison *pr; 806 807 SDT_PROBE(racct, kernel, rusage, sub__cred, cred, resource, amount, 808 0, 0); 809 810#ifdef notyet 811 KASSERT(RACCT_CAN_DROP(resource), 812 ("%s: called for resource %d which can not drop", __func__, 813 resource)); 814#endif 815 816 racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, -amount); 817 for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) 818 racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource, 819 -amount); 820 racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, -amount); 821} 822 823/* 824 * Decrease allocation of 'resource' by 'amount' for credential 'cred'. 825 */ 826void 827racct_sub_cred(struct ucred *cred, int resource, uint64_t amount) 828{ 829 830 mtx_lock(&racct_lock); 831 racct_sub_cred_locked(cred, resource, amount); 832 mtx_unlock(&racct_lock); 833} 834 835/* 836 * Inherit resource usage information from the parent process. 837 */ 838int 839racct_proc_fork(struct proc *parent, struct proc *child) 840{ 841 int i, error = 0; 842 843 /* 844 * Create racct for the child process. 845 */ 846 racct_create(&child->p_racct); 847 848 PROC_LOCK(parent); 849 PROC_LOCK(child); 850 mtx_lock(&racct_lock); 851 852#ifdef RCTL 853 error = rctl_proc_fork(parent, child); 854 if (error != 0) 855 goto out; 856#endif 857 858 /* Init process cpu time. */ 859 child->p_prev_runtime = 0; 860 child->p_throttled = 0; 861 862 /* 863 * Inherit resource usage. 864 */ 865 for (i = 0; i <= RACCT_MAX; i++) { 866 if (parent->p_racct->r_resources[i] == 0 || 867 !RACCT_IS_INHERITABLE(i)) 868 continue; 869 870 error = racct_set_locked(child, i, 871 parent->p_racct->r_resources[i]); 872 if (error != 0) 873 goto out; 874 } 875 876 error = racct_add_locked(child, RACCT_NPROC, 1); 877 error += racct_add_locked(child, RACCT_NTHR, 1); 878 879out: 880 mtx_unlock(&racct_lock); 881 PROC_UNLOCK(child); 882 PROC_UNLOCK(parent); 883 884 if (error != 0) 885 racct_proc_exit(child); 886 887 return (error); 888} 889 890/* 891 * Called at the end of fork1(), to handle rules that require the process 892 * to be fully initialized. 893 */ 894void 895racct_proc_fork_done(struct proc *child) 896{ 897 898#ifdef RCTL 899 PROC_LOCK(child); 900 mtx_lock(&racct_lock); 901 rctl_enforce(child, RACCT_NPROC, 0); 902 rctl_enforce(child, RACCT_NTHR, 0); 903 mtx_unlock(&racct_lock); 904 PROC_UNLOCK(child); 905#endif 906} 907 908void 909racct_proc_exit(struct proc *p) 910{ 911 int i; 912 uint64_t runtime; 913 struct timeval wallclock; 914 uint64_t pct_estimate, pct; 915 916 PROC_LOCK(p); 917 /* 918 * We don't need to calculate rux, proc_reap() has already done this. 919 */ 920 runtime = cputick2usec(p->p_rux.rux_runtime); 921#ifdef notyet 922 KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); 923#else 924 if (runtime < p->p_prev_runtime) 925 runtime = p->p_prev_runtime; 926#endif 927 microuptime(&wallclock); 928 timevalsub(&wallclock, &p->p_stats->p_start); 929 if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { 930 pct_estimate = (1000000 * runtime * 100) / 931 ((uint64_t)wallclock.tv_sec * 1000000 + 932 wallclock.tv_usec); 933 } else 934 pct_estimate = 0; 935 pct = racct_getpcpu(p, pct_estimate); 936 937 mtx_lock(&racct_lock); 938 racct_set_locked(p, RACCT_CPU, runtime); 939 racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct); 940 941 for (i = 0; i <= RACCT_MAX; i++) { 942 if (p->p_racct->r_resources[i] == 0) 943 continue; 944 if (!RACCT_IS_RECLAIMABLE(i)) 945 continue; 946 racct_set_locked(p, i, 0); 947 } 948 949 mtx_unlock(&racct_lock); 950 PROC_UNLOCK(p); 951 952#ifdef RCTL 953 rctl_racct_release(p->p_racct); 954#endif 955 racct_destroy(&p->p_racct); 956} 957 958/* 959 * Called after credentials change, to move resource utilisation 960 * between raccts. 961 */ 962void 963racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, 964 struct ucred *newcred) 965{ 966 struct uidinfo *olduip, *newuip; 967 struct loginclass *oldlc, *newlc; 968 struct prison *oldpr, *newpr, *pr; 969 970 PROC_LOCK_ASSERT(p, MA_NOTOWNED); 971 972 newuip = newcred->cr_ruidinfo; 973 olduip = oldcred->cr_ruidinfo; 974 newlc = newcred->cr_loginclass; 975 oldlc = oldcred->cr_loginclass; 976 newpr = newcred->cr_prison; 977 oldpr = oldcred->cr_prison; 978 979 mtx_lock(&racct_lock); 980 if (newuip != olduip) { 981 racct_sub_racct(olduip->ui_racct, p->p_racct); 982 racct_add_racct(newuip->ui_racct, p->p_racct); 983 } 984 if (newlc != oldlc) { 985 racct_sub_racct(oldlc->lc_racct, p->p_racct); 986 racct_add_racct(newlc->lc_racct, p->p_racct); 987 } 988 if (newpr != oldpr) { 989 for (pr = oldpr; pr != NULL; pr = pr->pr_parent) 990 racct_sub_racct(pr->pr_prison_racct->prr_racct, 991 p->p_racct); 992 for (pr = newpr; pr != NULL; pr = pr->pr_parent) 993 racct_add_racct(pr->pr_prison_racct->prr_racct, 994 p->p_racct); 995 } 996 mtx_unlock(&racct_lock); 997 998#ifdef RCTL 999 rctl_proc_ucred_changed(p, newcred); 1000#endif 1001} 1002 1003void 1004racct_move(struct racct *dest, struct racct *src) 1005{ 1006 1007 mtx_lock(&racct_lock); 1008 1009 racct_add_racct(dest, src); 1010 racct_sub_racct(src, src); 1011 1012 mtx_unlock(&racct_lock); 1013} 1014 1015static void 1016racct_proc_throttle(struct proc *p) 1017{ 1018 struct thread *td; 1019#ifdef SMP 1020 int cpuid; 1021#endif 1022 1023 PROC_LOCK_ASSERT(p, MA_OWNED); 1024 1025 /* 1026 * Do not block kernel processes. Also do not block processes with 1027 * low %cpu utilization to improve interactivity. 1028 */ 1029 if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) || 1030 (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold)) 1031 return; 1032 p->p_throttled = 1; 1033 1034 FOREACH_THREAD_IN_PROC(p, td) { 1035 thread_lock(td); 1036 switch (td->td_state) { 1037 case TDS_RUNQ: 1038 /* 1039 * If the thread is on the scheduler run-queue, we can 1040 * not just remove it from there. So we set the flag 1041 * TDF_NEEDRESCHED for the thread, so that once it is 1042 * running, it is taken off the cpu as soon as possible. 1043 */ 1044 td->td_flags |= TDF_NEEDRESCHED; 1045 break; 1046 case TDS_RUNNING: 1047 /* 1048 * If the thread is running, we request a context 1049 * switch for it by setting the TDF_NEEDRESCHED flag. 1050 */ 1051 td->td_flags |= TDF_NEEDRESCHED; 1052#ifdef SMP 1053 cpuid = td->td_oncpu; 1054 if ((cpuid != NOCPU) && (td != curthread)) 1055 ipi_cpu(cpuid, IPI_AST); 1056#endif 1057 break; 1058 default: 1059 break; 1060 } 1061 thread_unlock(td); 1062 } 1063} 1064 1065static void 1066racct_proc_wakeup(struct proc *p) 1067{ 1068 PROC_LOCK_ASSERT(p, MA_OWNED); 1069 1070 if (p->p_throttled) { 1071 p->p_throttled = 0; 1072 wakeup(p->p_racct); 1073 } 1074} 1075 1076static void 1077racct_decay_resource(struct racct *racct, void * res, void* dummy) 1078{ 1079 int resource; 1080 int64_t r_old, r_new; 1081 1082 resource = *(int *)res; 1083 r_old = racct->r_resources[resource]; 1084 1085 /* If there is nothing to decay, just exit. */ 1086 if (r_old <= 0) 1087 return; 1088 1089 mtx_lock(&racct_lock); 1090 r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; 1091 racct->r_resources[resource] = r_new; 1092 mtx_unlock(&racct_lock); 1093} 1094 1095static void 1096racct_decay(int resource) 1097{ 1098 ui_racct_foreach(racct_decay_resource, &resource, NULL); 1099 loginclass_racct_foreach(racct_decay_resource, &resource, NULL); 1100 prison_racct_foreach(racct_decay_resource, &resource, NULL); 1101} 1102 1103static void 1104racctd(void) 1105{ 1106 struct thread *td; 1107 struct proc *p; 1108 struct timeval wallclock; 1109 uint64_t runtime; 1110 uint64_t pct, pct_estimate; 1111 1112 for (;;) { 1113 racct_decay(RACCT_PCTCPU); 1114 1115 sx_slock(&allproc_lock); 1116 1117 LIST_FOREACH(p, &zombproc, p_list) { 1118 PROC_LOCK(p); 1119 racct_set(p, RACCT_PCTCPU, 0); 1120 PROC_UNLOCK(p); 1121 } 1122 1123 FOREACH_PROC_IN_SYSTEM(p) { 1124 PROC_LOCK(p); 1125 if (p->p_state != PRS_NORMAL) { 1126 PROC_UNLOCK(p); 1127 continue; 1128 } 1129 1130 microuptime(&wallclock); 1131 timevalsub(&wallclock, &p->p_stats->p_start); 1132 PROC_SLOCK(p); 1133 FOREACH_THREAD_IN_PROC(p, td) 1134 ruxagg(p, td); 1135 runtime = cputick2usec(p->p_rux.rux_runtime); 1136 PROC_SUNLOCK(p); 1137#ifdef notyet 1138 KASSERT(runtime >= p->p_prev_runtime, 1139 ("runtime < p_prev_runtime")); 1140#else 1141 if (runtime < p->p_prev_runtime) 1142 runtime = p->p_prev_runtime; 1143#endif 1144 p->p_prev_runtime = runtime; 1145 if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { 1146 pct_estimate = (1000000 * runtime * 100) / 1147 ((uint64_t)wallclock.tv_sec * 1000000 + 1148 wallclock.tv_usec); 1149 } else 1150 pct_estimate = 0; 1151 pct = racct_getpcpu(p, pct_estimate); 1152 mtx_lock(&racct_lock); 1153 racct_set_force_locked(p, RACCT_PCTCPU, pct); 1154 racct_set_locked(p, RACCT_CPU, runtime); 1155 racct_set_locked(p, RACCT_WALLCLOCK, 1156 (uint64_t)wallclock.tv_sec * 1000000 + 1157 wallclock.tv_usec); 1158 mtx_unlock(&racct_lock); 1159 PROC_UNLOCK(p); 1160 } 1161 1162 /* 1163 * To ensure that processes are throttled in a fair way, we need 1164 * to iterate over all processes again and check the limits 1165 * for %cpu resource only after ucred racct containers have been 1166 * properly filled. 1167 */ 1168 FOREACH_PROC_IN_SYSTEM(p) { 1169 PROC_LOCK(p); 1170 if (p->p_state != PRS_NORMAL) { 1171 PROC_UNLOCK(p); 1172 continue; 1173 } 1174 1175 if (racct_pcpu_available(p) <= 0) 1176 racct_proc_throttle(p); 1177 else if (p->p_throttled) 1178 racct_proc_wakeup(p); 1179 PROC_UNLOCK(p); 1180 } 1181 sx_sunlock(&allproc_lock); 1182 pause("-", hz); 1183 } 1184} 1185 1186static struct kproc_desc racctd_kp = { 1187 "racctd", 1188 racctd, 1189 NULL 1190}; 1191SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, kproc_start, &racctd_kp); 1192 1193static void 1194racct_init(void) 1195{ 1196 1197 racct_zone = uma_zcreate("racct", sizeof(struct racct), 1198 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 1199 /* 1200 * XXX: Move this somewhere. 1201 */ 1202 prison0.pr_prison_racct = prison_racct_find("0"); 1203} 1204SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL); 1205 1206#else /* !RACCT */ 1207 1208int 1209racct_add(struct proc *p, int resource, uint64_t amount) 1210{ 1211 1212 return (0); 1213} 1214 1215void 1216racct_add_cred(struct ucred *cred, int resource, uint64_t amount) 1217{ 1218} 1219 1220void 1221racct_add_force(struct proc *p, int resource, uint64_t amount) 1222{ 1223 1224 return; 1225} 1226 1227int 1228racct_set(struct proc *p, int resource, uint64_t amount) 1229{ 1230 1231 return (0); 1232} 1233 1234void 1235racct_set_force(struct proc *p, int resource, uint64_t amount) 1236{ 1237} 1238 1239void 1240racct_sub(struct proc *p, int resource, uint64_t amount) 1241{ 1242} 1243 1244void 1245racct_sub_cred(struct ucred *cred, int resource, uint64_t amount) 1246{ 1247} 1248 1249uint64_t 1250racct_get_limit(struct proc *p, int resource) 1251{ 1252 1253 return (UINT64_MAX); 1254} 1255 1256uint64_t 1257racct_get_available(struct proc *p, int resource) 1258{ 1259 1260 return (UINT64_MAX); 1261} 1262 1263void 1264racct_create(struct racct **racctp) 1265{ 1266} 1267 1268void 1269racct_destroy(struct racct **racctp) 1270{ 1271} 1272 1273int 1274racct_proc_fork(struct proc *parent, struct proc *child) 1275{ 1276 1277 return (0); 1278} 1279 1280void 1281racct_proc_fork_done(struct proc *child) 1282{ 1283} 1284 1285void 1286racct_proc_exit(struct proc *p) 1287{ 1288} 1289 1290#endif /* !RACCT */ 1291