kern_racct.c revision 242957
1250323Sdteske/*- 2250323Sdteske * Copyright (c) 2010 The FreeBSD Foundation 3250323Sdteske * All rights reserved. 4252980Sdteske * 5250323Sdteske * This software was developed by Edward Tomasz Napierala under sponsorship 6250323Sdteske * from the FreeBSD Foundation. 7250323Sdteske * 8250323Sdteske * Redistribution and use in source and binary forms, with or without 9250323Sdteske * modification, are permitted provided that the following conditions 10250323Sdteske * are met: 11250323Sdteske * 1. Redistributions of source code must retain the above copyright 12250323Sdteske * notice, this list of conditions and the following disclaimer. 13250323Sdteske * 2. Redistributions in binary form must reproduce the above copyright 14250323Sdteske * notice, this list of conditions and the following disclaimer in the 15250323Sdteske * documentation and/or other materials provided with the distribution. 16250323Sdteske * 17250323Sdteske * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18250323Sdteske * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19250323Sdteske * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20250323Sdteske * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21250323Sdteske * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22250323Sdteske * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23250323Sdteske * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24250323Sdteske * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25250323Sdteske * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26250323Sdteske * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27250323Sdteske * SUCH DAMAGE. 28250323Sdteske * 29250323Sdteske * $FreeBSD: head/sys/kern/kern_racct.c 242957 2012-11-13 11:29:08Z trasz $ 30250323Sdteske */ 31250323Sdteske 32250323Sdteske#include <sys/cdefs.h> 33250323Sdteske__FBSDID("$FreeBSD: head/sys/kern/kern_racct.c 242957 2012-11-13 11:29:08Z trasz $"); 34250323Sdteske 35252745Sdteske#include "opt_kdtrace.h" 36252745Sdteske#include "opt_sched.h" 37250323Sdteske 38250323Sdteske#include <sys/param.h> 39252077Sdteske#include <sys/systm.h> 40250323Sdteske#include <sys/eventhandler.h> 41250323Sdteske#include <sys/jail.h> 42250323Sdteske#include <sys/kernel.h> 43250323Sdteske#include <sys/kthread.h> 44250323Sdteske#include <sys/lock.h> 45250323Sdteske#include <sys/loginclass.h> 46250323Sdteske#include <sys/malloc.h> 47250323Sdteske#include <sys/mutex.h> 48250323Sdteske#include <sys/proc.h> 49250323Sdteske#include <sys/racct.h> 50250323Sdteske#include <sys/resourcevar.h> 51250323Sdteske#include <sys/sbuf.h> 52250323Sdteske#include <sys/sched.h> 53252740Sdteske#include <sys/sdt.h> 54252745Sdteske#include <sys/smp.h> 55252745Sdteske#include <sys/sx.h> 56252745Sdteske#include <sys/sysctl.h> 57252745Sdteske#include <sys/sysent.h> 58252745Sdteske#include <sys/sysproto.h> 59252740Sdteske#include <sys/umtx.h> 60252740Sdteske#include <machine/smp.h> 61252745Sdteske 62252740Sdteske#ifdef RCTL 63252740Sdteske#include <sys/rctl.h> 64250323Sdteske#endif 65250323Sdteske 66250323Sdteske#ifdef RACCT 67250323Sdteske 68250323SdteskeFEATURE(racct, "Resource Accounting"); 69250323Sdteske 70250323Sdteske/* 71250323Sdteske * Do not block processes that have their %cpu usage <= pcpu_threshold. 72250323Sdteske */ 73250323Sdteskestatic int pcpu_threshold = 1; 74250323Sdteske 75250323SdteskeSYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting"); 76250323SdteskeSYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, 77250323Sdteske 0, "Processes with higher %cpu usage than this value can be throttled."); 78250323Sdteske 79250323Sdteske/* 80250323Sdteske * How many seconds it takes to use the scheduler %cpu calculations. When a 81250323Sdteske * process starts, we compute its %cpu usage by dividing its runtime by the 82250323Sdteske * process wall clock time. After RACCT_PCPU_SECS pass, we use the value 83250323Sdteske * provided by the scheduler. 84250323Sdteske */ 85250323Sdteske#define RACCT_PCPU_SECS 3 86250323Sdteske 87251355Sdteskestatic struct mtx racct_lock; 88250323SdteskeMTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF); 89250323Sdteske 90251355Sdteskestatic uma_zone_t racct_zone; 91250323Sdteske 92250323Sdteskestatic void racct_sub_racct(struct racct *dest, const struct racct *src); 93251355Sdteskestatic void racct_sub_cred_locked(struct ucred *cred, int resource, 94250538Sdteske uint64_t amount); 95250323Sdteskestatic void racct_add_cred_locked(struct ucred *cred, int resource, 96250323Sdteske uint64_t amount); 97250323Sdteske 98250538SdteskeSDT_PROVIDER_DEFINE(racct); 99250538SdteskeSDT_PROBE_DEFINE3(racct, kernel, rusage, add, add, "struct proc *", "int", 100250538Sdteske "uint64_t"); 101250538SdteskeSDT_PROBE_DEFINE3(racct, kernel, rusage, add_failure, add-failure, 102251354Sdteske "struct proc *", "int", "uint64_t"); 103250323SdteskeSDT_PROBE_DEFINE3(racct, kernel, rusage, add_cred, add-cred, "struct ucred *", 104250323Sdteske "int", "uint64_t"); 105250323SdteskeSDT_PROBE_DEFINE3(racct, kernel, rusage, add_force, add-force, "struct proc *", 106250538Sdteske "int", "uint64_t"); 107250538SdteskeSDT_PROBE_DEFINE3(racct, kernel, rusage, set, set, "struct proc *", "int", 108250538Sdteske "uint64_t"); 109250538SdteskeSDT_PROBE_DEFINE3(racct, kernel, rusage, set_failure, set-failure, 110251354Sdteske "struct proc *", "int", "uint64_t"); 111250323SdteskeSDT_PROBE_DEFINE3(racct, kernel, rusage, sub, sub, "struct proc *", "int", 112251354Sdteske "uint64_t"); 113251355SdteskeSDT_PROBE_DEFINE3(racct, kernel, rusage, sub_cred, sub-cred, "struct ucred *", 114250538Sdteske "int", "uint64_t"); 115250323SdteskeSDT_PROBE_DEFINE1(racct, kernel, racct, create, create, "struct racct *"); 116250323SdteskeSDT_PROBE_DEFINE1(racct, kernel, racct, destroy, destroy, "struct racct *"); 117251355SdteskeSDT_PROBE_DEFINE2(racct, kernel, racct, join, join, "struct racct *", 118250323Sdteske "struct racct *"); 119250323SdteskeSDT_PROBE_DEFINE2(racct, kernel, racct, join_failure, join-failure, 120250323Sdteske "struct racct *", "struct racct *"); 121250323SdteskeSDT_PROBE_DEFINE2(racct, kernel, racct, leave, leave, "struct racct *", 122250323Sdteske "struct racct *"); 123250323Sdteske 124250323Sdteskeint racct_types[] = { 125250323Sdteske [RACCT_CPU] = 126250323Sdteske RACCT_IN_MILLIONS, 127250323Sdteske [RACCT_DATA] = 128250323Sdteske RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 129250323Sdteske [RACCT_STACK] = 130250323Sdteske RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 131250323Sdteske [RACCT_CORE] = 132250323Sdteske RACCT_DENIABLE, 133250323Sdteske [RACCT_RSS] = 134250323Sdteske RACCT_RECLAIMABLE, 135250323Sdteske [RACCT_MEMLOCK] = 136250323Sdteske RACCT_RECLAIMABLE | RACCT_DENIABLE, 137250323Sdteske [RACCT_NPROC] = 138252771Sdteske RACCT_RECLAIMABLE | RACCT_DENIABLE, 139250323Sdteske [RACCT_NOFILE] = 140250323Sdteske RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 141250323Sdteske [RACCT_VMEM] = 142250323Sdteske RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 143250323Sdteske [RACCT_NPTS] = 144250323Sdteske RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 145250323Sdteske [RACCT_SWAP] = 146250323Sdteske RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 147250323Sdteske [RACCT_NTHR] = 148250323Sdteske RACCT_RECLAIMABLE | RACCT_DENIABLE, 149250323Sdteske [RACCT_MSGQQUEUED] = 150250323Sdteske RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 151250323Sdteske [RACCT_MSGQSIZE] = 152250323Sdteske RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 153250323Sdteske [RACCT_NMSGQ] = 154250323Sdteske RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 155250323Sdteske [RACCT_NSEM] = 156250323Sdteske RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 157250323Sdteske [RACCT_NSEMOP] = 158250323Sdteske RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 159250323Sdteske [RACCT_NSHM] = 160250323Sdteske RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 161252771Sdteske [RACCT_SHMSIZE] = 162250323Sdteske RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 163250323Sdteske [RACCT_WALLCLOCK] = 164250323Sdteske RACCT_IN_MILLIONS, 165250323Sdteske [RACCT_PCTCPU] = 166250323Sdteske RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS }; 167250323Sdteske 168250323Sdteskestatic const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE; 169250323Sdteske 170250323Sdteske#ifdef SCHED_4BSD 171250323Sdteske/* 172250323Sdteske * Contains intermediate values for %cpu calculations to avoid using floating 173252740Sdteske * point in the kernel. 174252740Sdteske * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20) 175252740Sdteske * It is needed only for the 4BSD scheduler, because in ULE, the ccpu equals to 176252740Sdteske * zero so the calculations are more straightforward. 177252740Sdteske */ 178252740Sdteskefixpt_t ccpu_exp[] = { 179252740Sdteske [0] = FSCALE * 1, 180252740Sdteske [1] = FSCALE * 0.95122942450071400909, 181252740Sdteske [2] = FSCALE * 0.90483741803595957316, 182250323Sdteske [3] = FSCALE * 0.86070797642505780722, 183250323Sdteske [4] = FSCALE * 0.81873075307798185866, 184250323Sdteske [5] = FSCALE * 0.77880078307140486824, 185250323Sdteske [6] = FSCALE * 0.74081822068171786606, 186250323Sdteske [7] = FSCALE * 0.70468808971871343435, 187250323Sdteske [8] = FSCALE * 0.67032004603563930074, 188250323Sdteske [9] = FSCALE * 0.63762815162177329314, 189250323Sdteske [10] = FSCALE * 0.60653065971263342360, 190250323Sdteske [11] = FSCALE * 0.57694981038048669531, 191250323Sdteske [12] = FSCALE * 0.54881163609402643262, 192250323Sdteske [13] = FSCALE * 0.52204577676101604789, 193250323Sdteske [14] = FSCALE * 0.49658530379140951470, 194250323Sdteske [15] = FSCALE * 0.47236655274101470713, 195250323Sdteske [16] = FSCALE * 0.44932896411722159143, 196250323Sdteske [17] = FSCALE * 0.42741493194872666992, 197250323Sdteske [18] = FSCALE * 0.40656965974059911188, 198250323Sdteske [19] = FSCALE * 0.38674102345450120691, 199250323Sdteske [20] = FSCALE * 0.36787944117144232159, 200250323Sdteske [21] = FSCALE * 0.34993774911115535467, 201250323Sdteske [22] = FSCALE * 0.33287108369807955328, 202250323Sdteske [23] = FSCALE * 0.31663676937905321821, 203250323Sdteske [24] = FSCALE * 0.30119421191220209664, 204250323Sdteske [25] = FSCALE * 0.28650479686019010032, 205250323Sdteske [26] = FSCALE * 0.27253179303401260312, 206250323Sdteske [27] = FSCALE * 0.25924026064589150757, 207250323Sdteske [28] = FSCALE * 0.24659696394160647693, 208250323Sdteske [29] = FSCALE * 0.23457028809379765313, 209250323Sdteske [30] = FSCALE * 0.22313016014842982893, 210250323Sdteske [31] = FSCALE * 0.21224797382674305771, 211250323Sdteske [32] = FSCALE * 0.20189651799465540848, 212250323Sdteske [33] = FSCALE * 0.19204990862075411423, 213250323Sdteske [34] = FSCALE * 0.18268352405273465022, 214250323Sdteske [35] = FSCALE * 0.17377394345044512668, 215250323Sdteske [36] = FSCALE * 0.16529888822158653829, 216250323Sdteske [37] = FSCALE * 0.15723716631362761621, 217250323Sdteske [38] = FSCALE * 0.14956861922263505264, 218250323Sdteske [39] = FSCALE * 0.14227407158651357185, 219250323Sdteske [40] = FSCALE * 0.13533528323661269189, 220250323Sdteske [41] = FSCALE * 0.12873490358780421886, 221250323Sdteske [42] = FSCALE * 0.12245642825298191021, 222252178Sdteske [43] = FSCALE * 0.11648415777349695786, 223250323Sdteske [44] = FSCALE * 0.11080315836233388333, 224250323Sdteske [45] = FSCALE * 0.10539922456186433678, 225250323Sdteske [46] = FSCALE * 0.10025884372280373372, 226250323Sdteske [47] = FSCALE * 0.09536916221554961888, 227250323Sdteske [48] = FSCALE * 0.09071795328941250337, 228250323Sdteske [49] = FSCALE * 0.08629358649937051097, 229250323Sdteske [50] = FSCALE * 0.08208499862389879516, 230250323Sdteske [51] = FSCALE * 0.07808166600115315231, 231252178Sdteske [52] = FSCALE * 0.07427357821433388042, 232250323Sdteske [53] = FSCALE * 0.07065121306042958674, 233250323Sdteske [54] = FSCALE * 0.06720551273974976512, 234250323Sdteske [55] = FSCALE * 0.06392786120670757270, 235250323Sdteske [56] = FSCALE * 0.06081006262521796499, 236250323Sdteske [57] = FSCALE * 0.05784432087483846296, 237250323Sdteske [58] = FSCALE * 0.05502322005640722902, 238250323Sdteske [59] = FSCALE * 0.05233970594843239308, 239250323Sdteske [60] = FSCALE * 0.04978706836786394297, 240250323Sdteske [61] = FSCALE * 0.04735892439114092119, 241250323Sdteske [62] = FSCALE * 0.04504920239355780606, 242250323Sdteske [63] = FSCALE * 0.04285212686704017991, 243250323Sdteske [64] = FSCALE * 0.04076220397836621516, 244250323Sdteske [65] = FSCALE * 0.03877420783172200988, 245250323Sdteske [66] = FSCALE * 0.03688316740124000544, 246250323Sdteske [67] = FSCALE * 0.03508435410084502588, 247250323Sdteske [68] = FSCALE * 0.03337326996032607948, 248250323Sdteske [69] = FSCALE * 0.03174563637806794323, 249250323Sdteske [70] = FSCALE * 0.03019738342231850073, 250250323Sdteske [71] = FSCALE * 0.02872463965423942912, 251250323Sdteske [72] = FSCALE * 0.02732372244729256080, 252250323Sdteske [73] = FSCALE * 0.02599112877875534358, 253250323Sdteske [74] = FSCALE * 0.02472352647033939120, 254250323Sdteske [75] = FSCALE * 0.02351774585600910823, 255250323Sdteske [76] = FSCALE * 0.02237077185616559577, 256250323Sdteske [77] = FSCALE * 0.02127973643837716938, 257250323Sdteske [78] = FSCALE * 0.02024191144580438847, 258250323Sdteske [79] = FSCALE * 0.01925470177538692429, 259250323Sdteske [80] = FSCALE * 0.01831563888873418029, 260250323Sdteske [81] = FSCALE * 0.01742237463949351138, 261250323Sdteske [82] = FSCALE * 0.01657267540176124754, 262250323Sdteske [83] = FSCALE * 0.01576441648485449082, 263250323Sdteske [84] = FSCALE * 0.01499557682047770621, 264250323Sdteske [85] = FSCALE * 0.01426423390899925527, 265250323Sdteske [86] = FSCALE * 0.01356855901220093175, 266250323Sdteske [87] = FSCALE * 0.01290681258047986886, 267250323Sdteske [88] = FSCALE * 0.01227733990306844117, 268250323Sdteske [89] = FSCALE * 0.01167856697039544521, 269250323Sdteske [90] = FSCALE * 0.01110899653824230649, 270250323Sdteske [91] = FSCALE * 0.01056720438385265337, 271250323Sdteske [92] = FSCALE * 0.01005183574463358164, 272250323Sdteske [93] = FSCALE * 0.00956160193054350793, 273250323Sdteske [94] = FSCALE * 0.00909527710169581709, 274250323Sdteske [95] = FSCALE * 0.00865169520312063417, 275250323Sdteske [96] = FSCALE * 0.00822974704902002884, 276250323Sdteske [97] = FSCALE * 0.00782837754922577143, 277250323Sdteske [98] = FSCALE * 0.00744658307092434051, 278250323Sdteske [99] = FSCALE * 0.00708340892905212004, 279250323Sdteske [100] = FSCALE * 0.00673794699908546709, 280250323Sdteske [101] = FSCALE * 0.00640933344625638184, 281250323Sdteske [102] = FSCALE * 0.00609674656551563610, 282250323Sdteske [103] = FSCALE * 0.00579940472684214321, 283250323Sdteske [104] = FSCALE * 0.00551656442076077241, 284250323Sdteske [105] = FSCALE * 0.00524751839918138427, 285250323Sdteske [106] = FSCALE * 0.00499159390691021621, 286250323Sdteske [107] = FSCALE * 0.00474815099941147558, 287250323Sdteske [108] = FSCALE * 0.00451658094261266798, 288250323Sdteske [109] = FSCALE * 0.00429630469075234057, 289250323Sdteske [110] = FSCALE * 0.00408677143846406699, 290250323Sdteske}; 291250323Sdteske#endif 292250323Sdteske 293250323Sdteske#define CCPU_EXP_MAX 110 294250323Sdteske 295250323Sdteske/* 296250323Sdteske * This function is analogical to the getpcpu() function in the ps(1) command. 297251236Sdteske * They should both calculate in the same way so that the racct %cpu 298251236Sdteske * calculations are consistent with the values showed by the ps(1) tool. 299250323Sdteske * The calculations are more complex in the 4BSD scheduler because of the value 300250323Sdteske * of the ccpu variable. In ULE it is defined to be zero which saves us some 301250323Sdteske * work. 302251264Sdteske */ 303251264Sdteskestatic uint64_t 304251264Sdteskeracct_getpcpu(struct proc *p, u_int pcpu) 305251264Sdteske{ 306251264Sdteske u_int swtime; 307251264Sdteske#ifdef SCHED_4BSD 308250323Sdteske fixpt_t pctcpu, pctcpu_next; 309250323Sdteske#endif 310250323Sdteske#ifdef SMP 311250323Sdteske struct pcpu *pc; 312250323Sdteske int found; 313250323Sdteske#endif 314251264Sdteske fixpt_t p_pctcpu; 315250323Sdteske struct thread *td; 316250323Sdteske 317250323Sdteske /* 318250323Sdteske * If the process is swapped out, we count its %cpu usage as zero. 319250323Sdteske * This behaviour is consistent with the userland ps(1) tool. 320251264Sdteske */ 321251264Sdteske if ((p->p_flag & P_INMEM) == 0) 322250323Sdteske return (0); 323251236Sdteske swtime = (ticks - p->p_swtick) / hz; 324251232Sdteske 325251232Sdteske /* 326251232Sdteske * For short-lived processes, the sched_pctcpu() returns small 327251232Sdteske * values even for cpu intensive processes. Therefore we use 328251232Sdteske * our own estimate in this case. 329251232Sdteske */ 330251236Sdteske if (swtime < RACCT_PCPU_SECS) 331251236Sdteske return (pcpu); 332251266Sdteske 333251266Sdteske p_pctcpu = 0; 334251266Sdteske FOREACH_THREAD_IN_PROC(p, td) { 335251266Sdteske if (td == PCPU_GET(idlethread)) 336251266Sdteske continue; 337251266Sdteske#ifdef SMP 338251266Sdteske found = 0; 339251266Sdteske STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { 340251266Sdteske if (td == pc->pc_idlethread) { 341251266Sdteske found = 1; 342251266Sdteske break; 343251266Sdteske } 344250323Sdteske } 345251236Sdteske if (found) 346250323Sdteske continue; 347250323Sdteske#endif 348250323Sdteske thread_lock(td); 349250323Sdteske#ifdef SCHED_4BSD 350250323Sdteske pctcpu = sched_pctcpu(td); 351250323Sdteske /* Count also the yet unfinished second. */ 352250323Sdteske pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT; 353250323Sdteske pctcpu_next += sched_pctcpu_delta(td); 354250323Sdteske p_pctcpu += max(pctcpu, pctcpu_next); 355250323Sdteske#else 356250323Sdteske /* 357250323Sdteske * In ULE the %cpu statistics are updated on every 358250323Sdteske * sched_pctcpu() call. So special calculations to 359250323Sdteske * account for the latest (unfinished) second are 360250323Sdteske * not needed. 361250323Sdteske */ 362250323Sdteske p_pctcpu += sched_pctcpu(td); 363250323Sdteske#endif 364250323Sdteske thread_unlock(td); 365250323Sdteske } 366250323Sdteske 367250323Sdteske#ifdef SCHED_4BSD 368250323Sdteske if (swtime <= CCPU_EXP_MAX) 369250323Sdteske return ((100 * (uint64_t)p_pctcpu * 1000000) / 370250323Sdteske (FSCALE - ccpu_exp[swtime])); 371250323Sdteske#endif 372250323Sdteske 373250323Sdteske return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE); 374250323Sdteske} 375250323Sdteske 376250323Sdteskestatic void 377250323Sdteskeracct_add_racct(struct racct *dest, const struct racct *src) 378251266Sdteske{ 379250323Sdteske int i; 380250323Sdteske 381250323Sdteske mtx_assert(&racct_lock, MA_OWNED); 382250323Sdteske 383250323Sdteske /* 384250323Sdteske * Update resource usage in dest. 385251361Sdteske */ 386251361Sdteske for (i = 0; i <= RACCT_MAX; i++) { 387251361Sdteske KASSERT(dest->r_resources[i] >= 0, 388251361Sdteske ("racct propagation meltdown: dest < 0")); 389250323Sdteske KASSERT(src->r_resources[i] >= 0, 390250323Sdteske ("racct propagation meltdown: src < 0")); 391250323Sdteske dest->r_resources[i] += src->r_resources[i]; 392251264Sdteske } 393251264Sdteske} 394251264Sdteske 395251264Sdteskestatic void 396251264Sdteskeracct_sub_racct(struct racct *dest, const struct racct *src) 397250323Sdteske{ 398250323Sdteske int i; 399250323Sdteske 400251264Sdteske mtx_assert(&racct_lock, MA_OWNED); 401250323Sdteske 402250323Sdteske /* 403250323Sdteske * Update resource usage in dest. 404251264Sdteske */ 405250323Sdteske for (i = 0; i <= RACCT_MAX; i++) { 406252178Sdteske if (!RACCT_IS_SLOPPY(i)) { 407250323Sdteske KASSERT(dest->r_resources[i] >= 0, 408250323Sdteske ("racct propagation meltdown: dest < 0")); 409250323Sdteske KASSERT(src->r_resources[i] >= 0, 410250323Sdteske ("racct propagation meltdown: src < 0")); 411251264Sdteske KASSERT(src->r_resources[i] <= dest->r_resources[i], 412250323Sdteske ("racct propagation meltdown: src > dest")); 413250323Sdteske } 414250323Sdteske if (RACCT_CAN_DROP(i)) { 415250323Sdteske dest->r_resources[i] -= src->r_resources[i]; 416250323Sdteske if (dest->r_resources[i] < 0) { 417250323Sdteske KASSERT(RACCT_IS_SLOPPY(i), 418250323Sdteske ("racct_sub_racct: usage < 0")); 419250323Sdteske dest->r_resources[i] = 0; 420250323Sdteske } 421250323Sdteske } 422250323Sdteske } 423250323Sdteske} 424250323Sdteske 425250323Sdteskevoid 426250323Sdteskeracct_create(struct racct **racctp) 427250323Sdteske{ 428250323Sdteske 429250323Sdteske SDT_PROBE(racct, kernel, racct, create, racctp, 0, 0, 0, 0); 430250323Sdteske 431250323Sdteske KASSERT(*racctp == NULL, ("racct already allocated")); 432250323Sdteske 433250323Sdteske *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO); 434250323Sdteske} 435250323Sdteske 436250323Sdteskestatic void 437250323Sdteskeracct_destroy_locked(struct racct **racctp) 438251264Sdteske{ 439251264Sdteske int i; 440251264Sdteske struct racct *racct; 441250323Sdteske 442250323Sdteske SDT_PROBE(racct, kernel, racct, destroy, racctp, 0, 0, 0, 0); 443250323Sdteske 444250323Sdteske mtx_assert(&racct_lock, MA_OWNED); 445250323Sdteske KASSERT(racctp != NULL, ("NULL racctp")); 446250323Sdteske KASSERT(*racctp != NULL, ("NULL racct")); 447250323Sdteske 448250323Sdteske racct = *racctp; 449250323Sdteske 450250323Sdteske for (i = 0; i <= RACCT_MAX; i++) { 451250323Sdteske if (RACCT_IS_SLOPPY(i)) 452250323Sdteske continue; 453250323Sdteske if (!RACCT_IS_RECLAIMABLE(i)) 454250323Sdteske continue; 455250323Sdteske KASSERT(racct->r_resources[i] == 0, 456250323Sdteske ("destroying non-empty racct: " 457250323Sdteske "%ju allocated for resource %d\n", 458250323Sdteske racct->r_resources[i], i)); 459250323Sdteske } 460250323Sdteske uma_zfree(racct_zone, racct); 461250323Sdteske *racctp = NULL; 462250323Sdteske} 463250323Sdteske 464250323Sdteskevoid 465250323Sdteskeracct_destroy(struct racct **racct) 466250323Sdteske{ 467250323Sdteske 468250323Sdteske mtx_lock(&racct_lock); 469250323Sdteske racct_destroy_locked(racct); 470250323Sdteske mtx_unlock(&racct_lock); 471250323Sdteske} 472250323Sdteske 473250323Sdteske/* 474250323Sdteske * Increase consumption of 'resource' by 'amount' for 'racct' 475250323Sdteske * and all its parents. Differently from other cases, 'amount' here 476250323Sdteske * may be less than zero. 477250323Sdteske */ 478250323Sdteskestatic void 479250323Sdteskeracct_alloc_resource(struct racct *racct, int resource, 480250538Sdteske uint64_t amount) 481250538Sdteske{ 482250538Sdteske 483250538Sdteske mtx_assert(&racct_lock, MA_OWNED); 484250538Sdteske KASSERT(racct != NULL, ("NULL racct")); 485250538Sdteske 486250323Sdteske racct->r_resources[resource] += amount; 487250323Sdteske if (racct->r_resources[resource] < 0) { 488250323Sdteske KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource), 489251232Sdteske ("racct_alloc_resource: usage < 0")); 490251357Sdteske racct->r_resources[resource] = 0; 491251232Sdteske } 492251232Sdteske 493251232Sdteske /* 494251232Sdteske * There are some cases where the racct %cpu resource would grow 495251232Sdteske * beyond 100%. 496251232Sdteske * For example in racct_proc_exit() we add the process %cpu usage 497251236Sdteske * to the ucred racct containers. If too many processes terminated 498251236Sdteske * in a short time span, the ucred %cpu resource could grow too much. 499251236Sdteske * Also, the 4BSD scheduler sometimes returns for a thread more than 500250323Sdteske * 100% cpu usage. So we set a boundary here to 100%. 501250323Sdteske */ 502250323Sdteske if ((resource == RACCT_PCTCPU) && 503250323Sdteske (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000)) 504250323Sdteske racct->r_resources[RACCT_PCTCPU] = 100 * 1000000; 505250323Sdteske} 506250323Sdteske 507250323Sdteskestatic int 508251232Sdteskeracct_add_locked(struct proc *p, int resource, uint64_t amount) 509251232Sdteske{ 510250323Sdteske#ifdef RCTL 511250323Sdteske int error; 512250323Sdteske#endif 513251232Sdteske 514251232Sdteske SDT_PROBE(racct, kernel, rusage, add, p, resource, amount, 0, 0); 515250323Sdteske 516250323Sdteske /* 517250323Sdteske * We need proc lock to dereference p->p_ucred. 518251236Sdteske */ 519251236Sdteske PROC_LOCK_ASSERT(p, MA_OWNED); 520250323Sdteske 521250323Sdteske#ifdef RCTL 522251236Sdteske error = rctl_enforce(p, resource, amount); 523250323Sdteske if (error && RACCT_IS_DENIABLE(resource)) { 524251236Sdteske SDT_PROBE(racct, kernel, rusage, add_failure, p, resource, 525251361Sdteske amount, 0, 0); 526250323Sdteske return (error); 527250323Sdteske } 528250323Sdteske#endif 529250323Sdteske racct_alloc_resource(p->p_racct, resource, amount); 530250323Sdteske racct_add_cred_locked(p->p_ucred, resource, amount); 531250323Sdteske 532250323Sdteske return (0); 533250323Sdteske} 534250323Sdteske 535250323Sdteske/* 536250323Sdteske * Increase allocation of 'resource' by 'amount' for process 'p'. 537251236Sdteske * Return 0 if it's below limits, or errno, if it's not. 538250323Sdteske */ 539250323Sdteskeint 540250323Sdteskeracct_add(struct proc *p, int resource, uint64_t amount) 541251264Sdteske{ 542251264Sdteske int error; 543251264Sdteske 544250323Sdteske mtx_lock(&racct_lock); 545250323Sdteske error = racct_add_locked(p, resource, amount); 546250323Sdteske mtx_unlock(&racct_lock); 547250323Sdteske return (error); 548251264Sdteske} 549251232Sdteske 550251264Sdteskestatic void 551251264Sdteskeracct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) 552251232Sdteske{ 553251232Sdteske struct prison *pr; 554251232Sdteske 555251232Sdteske SDT_PROBE(racct, kernel, rusage, add_cred, cred, resource, amount, 556251232Sdteske 0, 0); 557251232Sdteske 558251232Sdteske racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, amount); 559251236Sdteske for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) 560251236Sdteske racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource, 561250323Sdteske amount); 562250323Sdteske racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, amount); 563250323Sdteske} 564250323Sdteske 565250323Sdteske/* 566251232Sdteske * Increase allocation of 'resource' by 'amount' for credential 'cred'. 567251232Sdteske * Doesn't check for limits and never fails. 568250323Sdteske * 569250323Sdteske * XXX: Shouldn't this ever return an error? 570250323Sdteske */ 571250323Sdteskevoid 572251236Sdteskeracct_add_cred(struct ucred *cred, int resource, uint64_t amount) 573250323Sdteske{ 574250323Sdteske 575250323Sdteske mtx_lock(&racct_lock); 576250323Sdteske racct_add_cred_locked(cred, resource, amount); 577250323Sdteske mtx_unlock(&racct_lock); 578250323Sdteske} 579250323Sdteske 580250323Sdteske/* 581250323Sdteske * Increase allocation of 'resource' by 'amount' for process 'p'. 582250323Sdteske * Doesn't check for limits and never fails. 583250323Sdteske */ 584250323Sdteskevoid 585251264Sdteskeracct_add_force(struct proc *p, int resource, uint64_t amount) 586251264Sdteske{ 587250323Sdteske 588251264Sdteske SDT_PROBE(racct, kernel, rusage, add_force, p, resource, amount, 0, 0); 589252771Sdteske 590252771Sdteske /* 591251264Sdteske * We need proc lock to dereference p->p_ucred. 592251264Sdteske */ 593251264Sdteske PROC_LOCK_ASSERT(p, MA_OWNED); 594251264Sdteske 595250323Sdteske mtx_lock(&racct_lock); 596250323Sdteske racct_alloc_resource(p->p_racct, resource, amount); 597250323Sdteske mtx_unlock(&racct_lock); 598250323Sdteske racct_add_cred(p->p_ucred, resource, amount); 599250323Sdteske} 600250323Sdteske 601250323Sdteskestatic int 602250323Sdteskeracct_set_locked(struct proc *p, int resource, uint64_t amount) 603250323Sdteske{ 604250323Sdteske int64_t old_amount, decayed_amount; 605250323Sdteske int64_t diff_proc, diff_cred; 606250323Sdteske#ifdef RCTL 607250323Sdteske int error; 608250323Sdteske#endif 609251232Sdteske 610251232Sdteske SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0); 611251232Sdteske 612251232Sdteske /* 613251232Sdteske * We need proc lock to dereference p->p_ucred. 614251232Sdteske */ 615251232Sdteske PROC_LOCK_ASSERT(p, MA_OWNED); 616251232Sdteske 617251232Sdteske old_amount = p->p_racct->r_resources[resource]; 618251236Sdteske /* 619251236Sdteske * The diffs may be negative. 620250323Sdteske */ 621250323Sdteske diff_proc = amount - old_amount; 622250323Sdteske if (RACCT_IS_DECAYING(resource)) { 623250323Sdteske /* 624250323Sdteske * Resources in per-credential racct containers may decay. 625251232Sdteske * If this is the case, we need to calculate the difference 626251232Sdteske * between the new amount and the proportional value of the 627250323Sdteske * old amount that has decayed in the ucred racct containers. 628251758Sdteske */ 629251758Sdteske decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; 630250323Sdteske diff_cred = amount - decayed_amount; 631250323Sdteske } else 632251758Sdteske diff_cred = diff_proc; 633252742Sdteske#ifdef notyet 634252742Sdteske KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource), 635251758Sdteske ("racct_set: usage of non-droppable resource %d dropping", 636251758Sdteske resource)); 637251758Sdteske#endif 638251758Sdteske#ifdef RCTL 639251758Sdteske if (diff_proc > 0) { 640252771Sdteske error = rctl_enforce(p, resource, diff_proc); 641251758Sdteske if (error && RACCT_IS_DENIABLE(resource)) { 642252771Sdteske SDT_PROBE(racct, kernel, rusage, set_failure, p, 643252842Sdteske resource, amount, 0, 0); 644251758Sdteske return (error); 645251758Sdteske } 646251758Sdteske } 647251758Sdteske#endif 648252771Sdteske racct_alloc_resource(p->p_racct, resource, diff_proc); 649251758Sdteske if (diff_cred > 0) 650252771Sdteske racct_add_cred_locked(p->p_ucred, resource, diff_cred); 651252844Sdteske else if (diff_cred < 0) 652251758Sdteske racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); 653251758Sdteske 654251758Sdteske return (0); 655251758Sdteske} 656252771Sdteske 657251758Sdteske/* 658252771Sdteske * Set allocation of 'resource' to 'amount' for process 'p'. 659252775Sdteske * Return 0 if it's below limits, or errno, if it's not. 660251758Sdteske * 661251758Sdteske * Note that decreasing the allocation always returns 0, 662251758Sdteske * even if it's above the limit. 663251758Sdteske */ 664250323Sdteskeint 665250323Sdteskeracct_set(struct proc *p, int resource, uint64_t amount) 666250323Sdteske{ 667250323Sdteske int error; 668250323Sdteske 669250323Sdteske mtx_lock(&racct_lock); 670250323Sdteske error = racct_set_locked(p, resource, amount); 671250323Sdteske mtx_unlock(&racct_lock); 672250323Sdteske return (error); 673250323Sdteske} 674250323Sdteske 675250323Sdteskestatic void 676250323Sdteskeracct_set_force_locked(struct proc *p, int resource, uint64_t amount) 677250323Sdteske{ 678250323Sdteske int64_t old_amount, decayed_amount; 679250323Sdteske int64_t diff_proc, diff_cred; 680250323Sdteske 681250323Sdteske SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0); 682250323Sdteske 683250323Sdteske /* 684252773Sdteske * We need proc lock to dereference p->p_ucred. 685250323Sdteske */ 686250323Sdteske PROC_LOCK_ASSERT(p, MA_OWNED); 687250323Sdteske 688250323Sdteske old_amount = p->p_racct->r_resources[resource]; 689250323Sdteske /* 690250323Sdteske * The diffs may be negative. 691250323Sdteske */ 692251236Sdteske diff_proc = amount - old_amount; 693250323Sdteske if (RACCT_IS_DECAYING(resource)) { 694250323Sdteske /* 695250323Sdteske * Resources in per-credential racct containers may decay. 696250323Sdteske * If this is the case, we need to calculate the difference 697250323Sdteske * between the new amount and the proportional value of the 698250323Sdteske * old amount that has decayed in the ucred racct containers. 699250323Sdteske */ 700250323Sdteske decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; 701250323Sdteske diff_cred = amount - decayed_amount; 702250323Sdteske } else 703250323Sdteske diff_cred = diff_proc; 704250323Sdteske 705250323Sdteske racct_alloc_resource(p->p_racct, resource, diff_proc); 706250323Sdteske if (diff_cred > 0) 707250323Sdteske racct_add_cred_locked(p->p_ucred, resource, diff_cred); 708250323Sdteske else if (diff_cred < 0) 709250323Sdteske racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); 710250323Sdteske} 711250323Sdteske 712250323Sdteskevoid 713250323Sdteskeracct_set_force(struct proc *p, int resource, uint64_t amount) 714251236Sdteske{ 715250323Sdteske mtx_lock(&racct_lock); 716250323Sdteske racct_set_force_locked(p, resource, amount); 717250323Sdteske mtx_unlock(&racct_lock); 718250323Sdteske} 719250323Sdteske 720250323Sdteske/* 721250323Sdteske * Returns amount of 'resource' the process 'p' can keep allocated. 722250323Sdteske * Allocating more than that would be denied, unless the resource 723251236Sdteske * is marked undeniable. Amount of already allocated resource does 724251236Sdteske * not matter. 725250323Sdteske */ 726250323Sdteskeuint64_t 727250323Sdteskeracct_get_limit(struct proc *p, int resource) 728250323Sdteske{ 729250323Sdteske 730250323Sdteske#ifdef RCTL 731250323Sdteske return (rctl_get_limit(p, resource)); 732251236Sdteske#else 733250323Sdteske return (UINT64_MAX); 734250323Sdteske#endif 735250323Sdteske} 736250323Sdteske 737251236Sdteske/* 738250323Sdteske * Returns amount of 'resource' the process 'p' can keep allocated. 739250323Sdteske * Allocating more than that would be denied, unless the resource 740250323Sdteske * is marked undeniable. Amount of already allocated resource does 741251236Sdteske * matter. 742250323Sdteske */ 743250323Sdteskeuint64_t 744250323Sdteskeracct_get_available(struct proc *p, int resource) 745251236Sdteske{ 746250323Sdteske 747250323Sdteske#ifdef RCTL 748250323Sdteske return (rctl_get_available(p, resource)); 749251236Sdteske#else 750250323Sdteske return (UINT64_MAX); 751250323Sdteske#endif 752250323Sdteske} 753250323Sdteske 754250323Sdteske/* 755250323Sdteske * Returns amount of the %cpu resource that process 'p' can add to its %cpu 756250323Sdteske * utilization. Adding more than that would lead to the process being 757250323Sdteske * throttled. 758250323Sdteske */ 759250323Sdteskestatic int64_t 760250323Sdteskeracct_pcpu_available(struct proc *p) 761250323Sdteske{ 762250323Sdteske 763250323Sdteske#ifdef RCTL 764250323Sdteske return (rctl_pcpu_available(p)); 765250323Sdteske#else 766250323Sdteske return (INT64_MAX); 767250323Sdteske#endif 768250323Sdteske} 769250323Sdteske 770250323Sdteske/* 771250323Sdteske * Decrease allocation of 'resource' by 'amount' for process 'p'. 772250323Sdteske */ 773250323Sdteskevoid 774250323Sdteskeracct_sub(struct proc *p, int resource, uint64_t amount) 775250323Sdteske{ 776250323Sdteske 777250323Sdteske SDT_PROBE(racct, kernel, rusage, sub, p, resource, amount, 0, 0); 778250323Sdteske 779250323Sdteske /* 780250323Sdteske * We need proc lock to dereference p->p_ucred. 781250323Sdteske */ 782250323Sdteske PROC_LOCK_ASSERT(p, MA_OWNED); 783250323Sdteske KASSERT(RACCT_CAN_DROP(resource), 784250323Sdteske ("racct_sub: called for non-droppable resource %d", resource)); 785250323Sdteske 786250323Sdteske mtx_lock(&racct_lock); 787250323Sdteske KASSERT(amount <= p->p_racct->r_resources[resource], 788251236Sdteske ("racct_sub: freeing %ju of resource %d, which is more " 789250323Sdteske "than allocated %jd for %s (pid %d)", amount, resource, 790251361Sdteske (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid)); 791250323Sdteske 792250323Sdteske racct_alloc_resource(p->p_racct, resource, -amount); 793250323Sdteske racct_sub_cred_locked(p->p_ucred, resource, amount); 794250323Sdteske mtx_unlock(&racct_lock); 795250323Sdteske} 796250323Sdteske 797250323Sdteskestatic void 798250323Sdteskeracct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) 799250323Sdteske{ 800250323Sdteske struct prison *pr; 801250323Sdteske 802250323Sdteske SDT_PROBE(racct, kernel, rusage, sub_cred, cred, resource, amount, 803250323Sdteske 0, 0); 804251236Sdteske 805251236Sdteske#ifdef notyet 806250323Sdteske KASSERT(RACCT_CAN_DROP(resource), 807250323Sdteske ("racct_sub_cred: called for resource %d which can not drop", 808250323Sdteske resource)); 809250323Sdteske#endif 810250323Sdteske 811250323Sdteske racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, -amount); 812250323Sdteske for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) 813250323Sdteske racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource, 814250323Sdteske -amount); 815250323Sdteske racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, -amount); 816250323Sdteske} 817250323Sdteske 818250323Sdteske/* 819250323Sdteske * Decrease allocation of 'resource' by 'amount' for credential 'cred'. 820250323Sdteske */ 821250323Sdteskevoid 822250323Sdteskeracct_sub_cred(struct ucred *cred, int resource, uint64_t amount) 823250323Sdteske{ 824250323Sdteske 825250323Sdteske mtx_lock(&racct_lock); 826250323Sdteske racct_sub_cred_locked(cred, resource, amount); 827252745Sdteske mtx_unlock(&racct_lock); 828252745Sdteske} 829252745Sdteske 830252745Sdteske/* 831252745Sdteske * Inherit resource usage information from the parent process. 832252745Sdteske */ 833252745Sdteskeint 834252745Sdteskeracct_proc_fork(struct proc *parent, struct proc *child) 835252745Sdteske{ 836252745Sdteske int i, error = 0; 837252745Sdteske 838252745Sdteske /* 839252745Sdteske * Create racct for the child process. 840252745Sdteske */ 841252745Sdteske racct_create(&child->p_racct); 842252745Sdteske 843252745Sdteske PROC_LOCK(parent); 844252745Sdteske PROC_LOCK(child); 845252745Sdteske mtx_lock(&racct_lock); 846252745Sdteske 847252745Sdteske#ifdef RCTL 848252745Sdteske error = rctl_proc_fork(parent, child); 849252745Sdteske if (error != 0) 850252745Sdteske goto out; 851252745Sdteske#endif 852252745Sdteske 853252745Sdteske /* Init process cpu time. */ 854252745Sdteske child->p_prev_runtime = 0; 855252745Sdteske child->p_throttled = 0; 856252745Sdteske 857252745Sdteske /* 858252745Sdteske * Inherit resource usage. 859252745Sdteske */ 860252745Sdteske for (i = 0; i <= RACCT_MAX; i++) { 861252745Sdteske if (parent->p_racct->r_resources[i] == 0 || 862252745Sdteske !RACCT_IS_INHERITABLE(i)) 863252745Sdteske continue; 864252745Sdteske 865252745Sdteske error = racct_set_locked(child, i, 866252745Sdteske parent->p_racct->r_resources[i]); 867252745Sdteske if (error != 0) 868252745Sdteske goto out; 869252745Sdteske } 870252745Sdteske 871252745Sdteske error = racct_add_locked(child, RACCT_NPROC, 1); 872252745Sdteske error += racct_add_locked(child, RACCT_NTHR, 1); 873252745Sdteske 874252745Sdteskeout: 875252745Sdteske mtx_unlock(&racct_lock); 876252745Sdteske PROC_UNLOCK(child); 877252745Sdteske PROC_UNLOCK(parent); 878252745Sdteske 879252745Sdteske if (error != 0) 880252745Sdteske racct_proc_exit(child); 881252745Sdteske 882252745Sdteske return (error); 883252745Sdteske} 884252774Sdteske 885252745Sdteske/* 886252745Sdteske * Called at the end of fork1(), to handle rules that require the process 887252745Sdteske * to be fully initialized. 888252745Sdteske */ 889252745Sdteskevoid 890252745Sdteskeracct_proc_fork_done(struct proc *child) 891252745Sdteske{ 892252745Sdteske 893252745Sdteske#ifdef RCTL 894252745Sdteske PROC_LOCK(child); 895252745Sdteske mtx_lock(&racct_lock); 896252745Sdteske rctl_enforce(child, RACCT_NPROC, 0); 897252745Sdteske rctl_enforce(child, RACCT_NTHR, 0); 898252745Sdteske mtx_unlock(&racct_lock); 899252745Sdteske PROC_UNLOCK(child); 900252745Sdteske#endif 901252745Sdteske} 902252745Sdteske 903252745Sdteskevoid 904252745Sdteskeracct_proc_exit(struct proc *p) 905252745Sdteske{ 906252745Sdteske int i; 907252745Sdteske uint64_t runtime; 908252745Sdteske struct timeval wallclock; 909252745Sdteske uint64_t pct_estimate, pct; 910252745Sdteske 911252745Sdteske PROC_LOCK(p); 912252745Sdteske /* 913252745Sdteske * We don't need to calculate rux, proc_reap() has already done this. 914252745Sdteske */ 915252745Sdteske runtime = cputick2usec(p->p_rux.rux_runtime); 916252745Sdteske#ifdef notyet 917252745Sdteske KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); 918252745Sdteske#else 919252745Sdteske if (runtime < p->p_prev_runtime) 920252745Sdteske runtime = p->p_prev_runtime; 921252745Sdteske#endif 922252745Sdteske microuptime(&wallclock); 923252745Sdteske timevalsub(&wallclock, &p->p_stats->p_start); 924252745Sdteske if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { 925252745Sdteske pct_estimate = (1000000 * runtime * 100) / 926252745Sdteske ((uint64_t)wallclock.tv_sec * 1000000 + 927252745Sdteske wallclock.tv_usec); 928252745Sdteske } else 929252745Sdteske pct_estimate = 0; 930252745Sdteske pct = racct_getpcpu(p, pct_estimate); 931252745Sdteske 932252745Sdteske mtx_lock(&racct_lock); 933252745Sdteske racct_set_locked(p, RACCT_CPU, runtime); 934252745Sdteske racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct); 935252745Sdteske 936252745Sdteske for (i = 0; i <= RACCT_MAX; i++) { 937252745Sdteske if (p->p_racct->r_resources[i] == 0) 938252745Sdteske continue; 939252745Sdteske if (!RACCT_IS_RECLAIMABLE(i)) 940252745Sdteske continue; 941252745Sdteske racct_set_locked(p, i, 0); 942252745Sdteske } 943252745Sdteske 944252745Sdteske mtx_unlock(&racct_lock); 945252745Sdteske PROC_UNLOCK(p); 946252745Sdteske 947252745Sdteske#ifdef RCTL 948252745Sdteske rctl_racct_release(p->p_racct); 949252745Sdteske#endif 950252745Sdteske racct_destroy(&p->p_racct); 951252745Sdteske} 952252745Sdteske 953252745Sdteske/* 954252745Sdteske * Called after credentials change, to move resource utilisation 955252745Sdteske * between raccts. 956252745Sdteske */ 957252745Sdteskevoid 958252745Sdteskeracct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, 959252771Sdteske struct ucred *newcred) 960252771Sdteske{ 961252771Sdteske struct uidinfo *olduip, *newuip; 962252771Sdteske struct loginclass *oldlc, *newlc; 963252745Sdteske struct prison *oldpr, *newpr, *pr; 964252745Sdteske 965252745Sdteske PROC_LOCK_ASSERT(p, MA_NOTOWNED); 966252745Sdteske 967252745Sdteske newuip = newcred->cr_ruidinfo; 968252745Sdteske olduip = oldcred->cr_ruidinfo; 969252745Sdteske newlc = newcred->cr_loginclass; 970252745Sdteske oldlc = oldcred->cr_loginclass; 971252745Sdteske newpr = newcred->cr_prison; 972252745Sdteske oldpr = oldcred->cr_prison; 973252745Sdteske 974252745Sdteske mtx_lock(&racct_lock); 975252745Sdteske if (newuip != olduip) { 976252745Sdteske racct_sub_racct(olduip->ui_racct, p->p_racct); 977252745Sdteske racct_add_racct(newuip->ui_racct, p->p_racct); 978252745Sdteske } 979252745Sdteske if (newlc != oldlc) { 980252745Sdteske racct_sub_racct(oldlc->lc_racct, p->p_racct); 981252745Sdteske racct_add_racct(newlc->lc_racct, p->p_racct); 982252745Sdteske } 983252745Sdteske if (newpr != oldpr) { 984252745Sdteske for (pr = oldpr; pr != NULL; pr = pr->pr_parent) 985252745Sdteske racct_sub_racct(pr->pr_prison_racct->prr_racct, 986252745Sdteske p->p_racct); 987252745Sdteske for (pr = newpr; pr != NULL; pr = pr->pr_parent) 988252745Sdteske racct_add_racct(pr->pr_prison_racct->prr_racct, 989252745Sdteske p->p_racct); 990252745Sdteske } 991252745Sdteske mtx_unlock(&racct_lock); 992252745Sdteske 993252745Sdteske#ifdef RCTL 994252745Sdteske rctl_proc_ucred_changed(p, newcred); 995252745Sdteske#endif 996252745Sdteske} 997252745Sdteske 998252745Sdteskevoid 999252745Sdteskeracct_move(struct racct *dest, struct racct *src) 1000252745Sdteske{ 1001252745Sdteske 1002252745Sdteske mtx_lock(&racct_lock); 1003252745Sdteske 1004252745Sdteske racct_add_racct(dest, src); 1005252745Sdteske racct_sub_racct(src, src); 1006252745Sdteske 1007252745Sdteske mtx_unlock(&racct_lock); 1008252745Sdteske} 1009252745Sdteske 1010252745Sdteskestatic void 1011252745Sdteskeracct_proc_throttle(struct proc *p) 1012252745Sdteske{ 1013252745Sdteske struct thread *td; 1014252745Sdteske#ifdef SMP 1015252745Sdteske int cpuid; 1016252745Sdteske#endif 1017252745Sdteske 1018252745Sdteske PROC_LOCK_ASSERT(p, MA_OWNED); 1019252745Sdteske 1020252745Sdteske /* 1021252745Sdteske * Do not block kernel processes. Also do not block processes with 1022252745Sdteske * low %cpu utilization to improve interactivity. 1023252745Sdteske */ 1024252745Sdteske if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) || 1025252745Sdteske (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold)) 1026252745Sdteske return; 1027252745Sdteske p->p_throttled = 1; 1028252745Sdteske 1029252745Sdteske FOREACH_THREAD_IN_PROC(p, td) { 1030252745Sdteske switch (td->td_state) { 1031252745Sdteske case TDS_RUNQ: 1032252745Sdteske /* 1033252745Sdteske * If the thread is on the scheduler run-queue, we can 1034252745Sdteske * not just remove it from there. So we set the flag 1035252745Sdteske * TDF_NEEDRESCHED for the thread, so that once it is 1036252745Sdteske * running, it is taken off the cpu as soon as possible. 1037252745Sdteske */ 1038252745Sdteske thread_lock(td); 1039252745Sdteske td->td_flags |= TDF_NEEDRESCHED; 1040252745Sdteske thread_unlock(td); 1041252745Sdteske break; 1042252745Sdteske case TDS_RUNNING: 1043252745Sdteske /* 1044252745Sdteske * If the thread is running, we request a context 1045252745Sdteske * switch for it by setting the TDF_NEEDRESCHED flag. 1046252745Sdteske */ 1047252745Sdteske thread_lock(td); 1048252745Sdteske td->td_flags |= TDF_NEEDRESCHED; 1049252745Sdteske#ifdef SMP 1050252745Sdteske cpuid = td->td_oncpu; 1051252745Sdteske if ((cpuid != NOCPU) && (td != curthread)) 1052252745Sdteske ipi_cpu(cpuid, IPI_AST); 1053252745Sdteske#endif 1054252745Sdteske thread_unlock(td); 1055252745Sdteske break; 1056252745Sdteske default: 1057252745Sdteske break; 1058252745Sdteske } 1059252745Sdteske } 1060252775Sdteske} 1061252775Sdteske 1062252775Sdteskestatic void 1063252775Sdteskeracct_proc_wakeup(struct proc *p) 1064252775Sdteske{ 1065252775Sdteske PROC_LOCK_ASSERT(p, MA_OWNED); 1066252775Sdteske 1067252775Sdteske if (p->p_throttled) { 1068252775Sdteske p->p_throttled = 0; 1069252775Sdteske wakeup(p->p_racct); 1070252775Sdteske } 1071252775Sdteske} 1072252775Sdteske 1073252775Sdteskestatic void 1074252775Sdteskeracct_decay_resource(struct racct *racct, void * res, void* dummy) 1075252775Sdteske{ 1076252775Sdteske int resource; 1077252775Sdteske int64_t r_old, r_new; 1078252775Sdteske 1079252775Sdteske resource = *(int *)res; 1080252775Sdteske r_old = racct->r_resources[resource]; 1081252775Sdteske 1082252775Sdteske /* If there is nothing to decay, just exit. */ 1083252775Sdteske if (r_old <= 0) 1084252775Sdteske return; 1085252775Sdteske 1086252775Sdteske mtx_lock(&racct_lock); 1087252775Sdteske r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; 1088252775Sdteske racct->r_resources[resource] = r_new; 1089252775Sdteske mtx_unlock(&racct_lock); 1090252775Sdteske} 1091252775Sdteske 1092252775Sdteskestatic void 1093252775Sdteskeracct_decay(int resource) 1094252775Sdteske{ 1095252775Sdteske ui_racct_foreach(racct_decay_resource, &resource, NULL); 1096252775Sdteske loginclass_racct_foreach(racct_decay_resource, &resource, NULL); 1097252775Sdteske prison_racct_foreach(racct_decay_resource, &resource, NULL); 1098252775Sdteske} 1099252775Sdteske 1100252775Sdteskestatic void 1101252775Sdteskeracctd(void) 1102252775Sdteske{ 1103252775Sdteske struct thread *td; 1104252775Sdteske struct proc *p; 1105252775Sdteske struct timeval wallclock; 1106252775Sdteske uint64_t runtime; 1107252775Sdteske uint64_t pct, pct_estimate; 1108252775Sdteske 1109252775Sdteske for (;;) { 1110252775Sdteske racct_decay(RACCT_PCTCPU); 1111252775Sdteske 1112252775Sdteske sx_slock(&allproc_lock); 1113252775Sdteske 1114252775Sdteske LIST_FOREACH(p, &zombproc, p_list) { 1115252775Sdteske PROC_LOCK(p); 1116252775Sdteske racct_set(p, RACCT_PCTCPU, 0); 1117252775Sdteske PROC_UNLOCK(p); 1118252775Sdteske } 1119252775Sdteske 1120252775Sdteske FOREACH_PROC_IN_SYSTEM(p) { 1121252775Sdteske PROC_LOCK(p); 1122252775Sdteske if (p->p_state != PRS_NORMAL) { 1123252775Sdteske PROC_UNLOCK(p); 1124252775Sdteske continue; 1125252775Sdteske } 1126252775Sdteske 1127252775Sdteske microuptime(&wallclock); 1128252775Sdteske timevalsub(&wallclock, &p->p_stats->p_start); 1129252775Sdteske PROC_SLOCK(p); 1130252775Sdteske FOREACH_THREAD_IN_PROC(p, td) 1131252775Sdteske ruxagg(p, td); 1132252775Sdteske runtime = cputick2usec(p->p_rux.rux_runtime); 1133252775Sdteske PROC_SUNLOCK(p); 1134252775Sdteske#ifdef notyet 1135252775Sdteske KASSERT(runtime >= p->p_prev_runtime, 1136252775Sdteske ("runtime < p_prev_runtime")); 1137252775Sdteske#else 1138252775Sdteske if (runtime < p->p_prev_runtime) 1139252775Sdteske runtime = p->p_prev_runtime; 1140252775Sdteske#endif 1141252775Sdteske p->p_prev_runtime = runtime; 1142252775Sdteske if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { 1143252775Sdteske pct_estimate = (1000000 * runtime * 100) / 1144252775Sdteske ((uint64_t)wallclock.tv_sec * 1000000 + 1145252775Sdteske wallclock.tv_usec); 1146252775Sdteske } else 1147252775Sdteske pct_estimate = 0; 1148252775Sdteske pct = racct_getpcpu(p, pct_estimate); 1149252775Sdteske mtx_lock(&racct_lock); 1150252775Sdteske racct_set_force_locked(p, RACCT_PCTCPU, pct); 1151252775Sdteske racct_set_locked(p, RACCT_CPU, runtime); 1152252775Sdteske racct_set_locked(p, RACCT_WALLCLOCK, 1153252775Sdteske (uint64_t)wallclock.tv_sec * 1000000 + 1154252775Sdteske wallclock.tv_usec); 1155252775Sdteske mtx_unlock(&racct_lock); 1156252775Sdteske PROC_UNLOCK(p); 1157252775Sdteske } 1158252775Sdteske 1159252775Sdteske /* 1160252775Sdteske * To ensure that processes are throttled in a fair way, we need 1161252775Sdteske * to iterate over all processes again and check the limits 1162252775Sdteske * for %cpu resource only after ucred racct containers have been 1163252775Sdteske * properly filled. 1164252775Sdteske */ 1165252775Sdteske FOREACH_PROC_IN_SYSTEM(p) { 1166252775Sdteske PROC_LOCK(p); 1167252775Sdteske if (p->p_state != PRS_NORMAL) { 1168252775Sdteske PROC_UNLOCK(p); 1169252775Sdteske continue; 1170252775Sdteske } 1171252775Sdteske 1172252775Sdteske if (racct_pcpu_available(p) <= 0) 1173252775Sdteske racct_proc_throttle(p); 1174252775Sdteske else if (p->p_throttled) 1175252775Sdteske racct_proc_wakeup(p); 1176252775Sdteske PROC_UNLOCK(p); 1177252775Sdteske } 1178252775Sdteske sx_sunlock(&allproc_lock); 1179252775Sdteske pause("-", hz); 1180252775Sdteske } 1181252775Sdteske} 1182252775Sdteske 1183252775Sdteskestatic struct kproc_desc racctd_kp = { 1184252775Sdteske "racctd", 1185252775Sdteske racctd, 1186252775Sdteske NULL 1187252775Sdteske}; 1188252775SdteskeSYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, kproc_start, &racctd_kp); 1189252844Sdteske 1190252844Sdteskestatic void 1191252844Sdteskeracct_init(void) 1192252844Sdteske{ 1193252844Sdteske 1194252844Sdteske racct_zone = uma_zcreate("racct", sizeof(struct racct), 1195252844Sdteske NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 1196252844Sdteske /* 1197252844Sdteske * XXX: Move this somewhere. 1198250323Sdteske */ 1199250323Sdteske prison0.pr_prison_racct = prison_racct_find("0"); 1200250323Sdteske} 1201250323SdteskeSYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL); 1202250323Sdteske 1203#else /* !RACCT */ 1204 1205int 1206racct_add(struct proc *p, int resource, uint64_t amount) 1207{ 1208 1209 return (0); 1210} 1211 1212void 1213racct_add_cred(struct ucred *cred, int resource, uint64_t amount) 1214{ 1215} 1216 1217void 1218racct_add_force(struct proc *p, int resource, uint64_t amount) 1219{ 1220 1221 return; 1222} 1223 1224int 1225racct_set(struct proc *p, int resource, uint64_t amount) 1226{ 1227 1228 return (0); 1229} 1230 1231void 1232racct_set_force(struct proc *p, int resource, uint64_t amount) 1233{ 1234} 1235 1236void 1237racct_sub(struct proc *p, int resource, uint64_t amount) 1238{ 1239} 1240 1241void 1242racct_sub_cred(struct ucred *cred, int resource, uint64_t amount) 1243{ 1244} 1245 1246uint64_t 1247racct_get_limit(struct proc *p, int resource) 1248{ 1249 1250 return (UINT64_MAX); 1251} 1252 1253uint64_t 1254racct_get_available(struct proc *p, int resource) 1255{ 1256 1257 return (UINT64_MAX); 1258} 1259 1260void 1261racct_create(struct racct **racctp) 1262{ 1263} 1264 1265void 1266racct_destroy(struct racct **racctp) 1267{ 1268} 1269 1270int 1271racct_proc_fork(struct proc *parent, struct proc *child) 1272{ 1273 1274 return (0); 1275} 1276 1277void 1278racct_proc_fork_done(struct proc *child) 1279{ 1280} 1281 1282void 1283racct_proc_exit(struct proc *p) 1284{ 1285} 1286 1287#endif /* !RACCT */ 1288