1220137Strasz/*- 2220137Strasz * Copyright (c) 2010 The FreeBSD Foundation 3220137Strasz * All rights reserved. 4220137Strasz * 5220137Strasz * This software was developed by Edward Tomasz Napierala under sponsorship 6220137Strasz * from the FreeBSD Foundation. 7220137Strasz * 8220137Strasz * Redistribution and use in source and binary forms, with or without 9220137Strasz * modification, are permitted provided that the following conditions 10220137Strasz * are met: 11220137Strasz * 1. Redistributions of source code must retain the above copyright 12220137Strasz * notice, this list of conditions and the following disclaimer. 13220137Strasz * 2. Redistributions in binary form must reproduce the above copyright 14220137Strasz * notice, this list of conditions and the following disclaimer in the 15220137Strasz * documentation and/or other materials provided with the distribution. 16220137Strasz * 17220137Strasz * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18220137Strasz * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19220137Strasz * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20220137Strasz * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21220137Strasz * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22220137Strasz * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23220137Strasz * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24220137Strasz * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25220137Strasz * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26220137Strasz * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27220137Strasz * SUCH DAMAGE. 28220137Strasz * 29220137Strasz * $FreeBSD: releng/11.0/sys/kern/kern_racct.c 298414 2016-04-21 16:22:52Z trasz $ 30220137Strasz */ 31220137Strasz 32220137Strasz#include <sys/cdefs.h> 33220137Strasz__FBSDID("$FreeBSD: releng/11.0/sys/kern/kern_racct.c 298414 2016-04-21 16:22:52Z trasz $"); 34220137Strasz 35242139Strasz#include "opt_sched.h" 36220137Strasz 37220137Strasz#include <sys/param.h> 38297633Strasz#include <sys/buf.h> 39228430Savg#include <sys/systm.h> 40220137Strasz#include <sys/eventhandler.h> 41220137Strasz#include <sys/jail.h> 42220137Strasz#include <sys/kernel.h> 43220137Strasz#include <sys/kthread.h> 44220137Strasz#include <sys/lock.h> 45220137Strasz#include <sys/loginclass.h> 46220137Strasz#include <sys/malloc.h> 47220137Strasz#include <sys/mutex.h> 48220137Strasz#include <sys/proc.h> 49220137Strasz#include <sys/racct.h> 50220137Strasz#include <sys/resourcevar.h> 51220137Strasz#include <sys/sbuf.h> 52220137Strasz#include <sys/sched.h> 53220137Strasz#include <sys/sdt.h> 54242139Strasz#include <sys/smp.h> 55220137Strasz#include <sys/sx.h> 56242139Strasz#include <sys/sysctl.h> 57220137Strasz#include <sys/sysent.h> 58220137Strasz#include <sys/sysproto.h> 59220137Strasz#include <sys/umtx.h> 60242139Strasz#include <machine/smp.h> 61220137Strasz 62220137Strasz#ifdef RCTL 63220137Strasz#include <sys/rctl.h> 64220137Strasz#endif 65220137Strasz 66220137Strasz#ifdef RACCT 67220137Strasz 68220137StraszFEATURE(racct, "Resource Accounting"); 69220137Strasz 70242139Strasz/* 71242139Strasz * Do not block processes that have their %cpu usage <= pcpu_threshold. 72242139Strasz */ 73242139Straszstatic int pcpu_threshold = 1; 74282901Strasz#ifdef RACCT_DEFAULT_TO_DISABLED 75282213Straszint racct_enable = 0; 76282213Strasz#else 77282213Straszint racct_enable = 1; 78282213Strasz#endif 79242139Strasz 80242139StraszSYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting"); 81282213StraszSYSCTL_UINT(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable, 82282213Strasz 0, "Enable RACCT/RCTL"); 83242139StraszSYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, 84242139Strasz 0, "Processes with higher %cpu usage than this value can be throttled."); 85242139Strasz 86242139Strasz/* 87242139Strasz * How many seconds it takes to use the scheduler %cpu calculations. When a 88242139Strasz * process starts, we compute its %cpu usage by dividing its runtime by the 89242139Strasz * process wall clock time. After RACCT_PCPU_SECS pass, we use the value 90242139Strasz * provided by the scheduler. 91242139Strasz */ 92242139Strasz#define RACCT_PCPU_SECS 3 93242139Strasz 94298414Straszstruct mtx racct_lock; 95220137StraszMTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF); 96220137Strasz 97220137Straszstatic uma_zone_t racct_zone; 98220137Strasz 99220137Straszstatic void racct_sub_racct(struct racct *dest, const struct racct *src); 100220137Straszstatic void racct_sub_cred_locked(struct ucred *cred, int resource, 101220137Strasz uint64_t amount); 102220137Straszstatic void racct_add_cred_locked(struct ucred *cred, int resource, 103220137Strasz uint64_t amount); 104220137Strasz 105220137StraszSDT_PROVIDER_DEFINE(racct); 106292384SmarkjSDT_PROBE_DEFINE3(racct, , rusage, add, 107220137Strasz "struct proc *", "int", "uint64_t"); 108292384SmarkjSDT_PROBE_DEFINE3(racct, , rusage, add__failure, 109220137Strasz "struct proc *", "int", "uint64_t"); 110298414StraszSDT_PROBE_DEFINE3(racct, , rusage, add__buf, 111298414Strasz "struct proc *", "const struct buf *", "int"); 112292384SmarkjSDT_PROBE_DEFINE3(racct, , rusage, add__cred, 113292384Smarkj "struct ucred *", "int", "uint64_t"); 114292384SmarkjSDT_PROBE_DEFINE3(racct, , rusage, add__force, 115292384Smarkj "struct proc *", "int", "uint64_t"); 116292384SmarkjSDT_PROBE_DEFINE3(racct, , rusage, set, 117292384Smarkj "struct proc *", "int", "uint64_t"); 118292384SmarkjSDT_PROBE_DEFINE3(racct, , rusage, set__failure, 119292384Smarkj "struct proc *", "int", "uint64_t"); 120297489StraszSDT_PROBE_DEFINE3(racct, , rusage, set__force, 121297489Strasz "struct proc *", "int", "uint64_t"); 122292384SmarkjSDT_PROBE_DEFINE3(racct, , rusage, sub, 123292384Smarkj "struct proc *", "int", "uint64_t"); 124292384SmarkjSDT_PROBE_DEFINE3(racct, , rusage, sub__cred, 125292384Smarkj "struct ucred *", "int", "uint64_t"); 126292384SmarkjSDT_PROBE_DEFINE1(racct, , racct, create, 127220137Strasz "struct racct *"); 128292384SmarkjSDT_PROBE_DEFINE1(racct, , racct, destroy, 129292384Smarkj "struct racct *"); 130292384SmarkjSDT_PROBE_DEFINE2(racct, , racct, join, 131220137Strasz "struct racct *", "struct racct *"); 132292384SmarkjSDT_PROBE_DEFINE2(racct, , racct, join__failure, 133292384Smarkj "struct racct *", "struct racct *"); 134292384SmarkjSDT_PROBE_DEFINE2(racct, , racct, leave, 135292384Smarkj "struct racct *", "struct racct *"); 136220137Strasz 137220137Straszint racct_types[] = { 138220137Strasz [RACCT_CPU] = 139224036Strasz RACCT_IN_MILLIONS, 140220137Strasz [RACCT_DATA] = 141220137Strasz RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 142220137Strasz [RACCT_STACK] = 143220137Strasz RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 144220137Strasz [RACCT_CORE] = 145220137Strasz RACCT_DENIABLE, 146220137Strasz [RACCT_RSS] = 147220137Strasz RACCT_RECLAIMABLE, 148220137Strasz [RACCT_MEMLOCK] = 149220137Strasz RACCT_RECLAIMABLE | RACCT_DENIABLE, 150220137Strasz [RACCT_NPROC] = 151220137Strasz RACCT_RECLAIMABLE | RACCT_DENIABLE, 152220137Strasz [RACCT_NOFILE] = 153220137Strasz RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 154220137Strasz [RACCT_VMEM] = 155220137Strasz RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 156220137Strasz [RACCT_NPTS] = 157220137Strasz RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 158220137Strasz [RACCT_SWAP] = 159220137Strasz RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 160220137Strasz [RACCT_NTHR] = 161220137Strasz RACCT_RECLAIMABLE | RACCT_DENIABLE, 162220137Strasz [RACCT_MSGQQUEUED] = 163220137Strasz RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 164220137Strasz [RACCT_MSGQSIZE] = 165220137Strasz RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 166220137Strasz [RACCT_NMSGQ] = 167220137Strasz RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 168220137Strasz [RACCT_NSEM] = 169220137Strasz RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 170220137Strasz [RACCT_NSEMOP] = 171220137Strasz RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 172220137Strasz [RACCT_NSHM] = 173220137Strasz RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 174220137Strasz [RACCT_SHMSIZE] = 175220137Strasz RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 176220137Strasz [RACCT_WALLCLOCK] = 177242139Strasz RACCT_IN_MILLIONS, 178242139Strasz [RACCT_PCTCPU] = 179297633Strasz RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS, 180297633Strasz [RACCT_READBPS] = 181297633Strasz RACCT_DECAYING, 182297633Strasz [RACCT_WRITEBPS] = 183297633Strasz RACCT_DECAYING, 184297633Strasz [RACCT_READIOPS] = 185297633Strasz RACCT_DECAYING, 186297633Strasz [RACCT_WRITEIOPS] = 187297633Strasz RACCT_DECAYING }; 188220137Strasz 189242139Straszstatic const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE; 190242139Strasz 191242139Strasz#ifdef SCHED_4BSD 192242139Strasz/* 193242139Strasz * Contains intermediate values for %cpu calculations to avoid using floating 194242139Strasz * point in the kernel. 195242139Strasz * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20) 196242139Strasz * It is needed only for the 4BSD scheduler, because in ULE, the ccpu equals to 197242139Strasz * zero so the calculations are more straightforward. 198242139Strasz */ 199242139Straszfixpt_t ccpu_exp[] = { 200242139Strasz [0] = FSCALE * 1, 201242139Strasz [1] = FSCALE * 0.95122942450071400909, 202242139Strasz [2] = FSCALE * 0.90483741803595957316, 203242139Strasz [3] = FSCALE * 0.86070797642505780722, 204242139Strasz [4] = FSCALE * 0.81873075307798185866, 205242139Strasz [5] = FSCALE * 0.77880078307140486824, 206242139Strasz [6] = FSCALE * 0.74081822068171786606, 207242139Strasz [7] = FSCALE * 0.70468808971871343435, 208242139Strasz [8] = FSCALE * 0.67032004603563930074, 209242139Strasz [9] = FSCALE * 0.63762815162177329314, 210242139Strasz [10] = FSCALE * 0.60653065971263342360, 211242139Strasz [11] = FSCALE * 0.57694981038048669531, 212242139Strasz [12] = FSCALE * 0.54881163609402643262, 213242139Strasz [13] = FSCALE * 0.52204577676101604789, 214242139Strasz [14] = FSCALE * 0.49658530379140951470, 215242139Strasz [15] = FSCALE * 0.47236655274101470713, 216242139Strasz [16] = FSCALE * 0.44932896411722159143, 217242139Strasz [17] = FSCALE * 0.42741493194872666992, 218242139Strasz [18] = FSCALE * 0.40656965974059911188, 219242139Strasz [19] = FSCALE * 0.38674102345450120691, 220242139Strasz [20] = FSCALE * 0.36787944117144232159, 221242139Strasz [21] = FSCALE * 0.34993774911115535467, 222242139Strasz [22] = FSCALE * 0.33287108369807955328, 223242139Strasz [23] = FSCALE * 0.31663676937905321821, 224242139Strasz [24] = FSCALE * 0.30119421191220209664, 225242139Strasz [25] = FSCALE * 0.28650479686019010032, 226242139Strasz [26] = FSCALE * 0.27253179303401260312, 227242139Strasz [27] = FSCALE * 0.25924026064589150757, 228242139Strasz [28] = FSCALE * 0.24659696394160647693, 229242139Strasz [29] = FSCALE * 0.23457028809379765313, 230242139Strasz [30] = FSCALE * 0.22313016014842982893, 231242139Strasz [31] = FSCALE * 0.21224797382674305771, 232242139Strasz [32] = FSCALE * 0.20189651799465540848, 233242139Strasz [33] = FSCALE * 0.19204990862075411423, 234242139Strasz [34] = FSCALE * 0.18268352405273465022, 235242139Strasz [35] = FSCALE * 0.17377394345044512668, 236242139Strasz [36] = FSCALE * 0.16529888822158653829, 237242139Strasz [37] = FSCALE * 0.15723716631362761621, 238242139Strasz [38] = FSCALE * 0.14956861922263505264, 239242139Strasz [39] = FSCALE * 0.14227407158651357185, 240242139Strasz [40] = FSCALE * 0.13533528323661269189, 241242139Strasz [41] = FSCALE * 0.12873490358780421886, 242242139Strasz [42] = FSCALE * 0.12245642825298191021, 243242139Strasz [43] = FSCALE * 0.11648415777349695786, 244242139Strasz [44] = FSCALE * 0.11080315836233388333, 245242139Strasz [45] = FSCALE * 0.10539922456186433678, 246242139Strasz [46] = FSCALE * 0.10025884372280373372, 247242139Strasz [47] = FSCALE * 0.09536916221554961888, 248242139Strasz [48] = FSCALE * 0.09071795328941250337, 249242139Strasz [49] = FSCALE * 0.08629358649937051097, 250242139Strasz [50] = FSCALE * 0.08208499862389879516, 251242139Strasz [51] = FSCALE * 0.07808166600115315231, 252242139Strasz [52] = FSCALE * 0.07427357821433388042, 253242139Strasz [53] = FSCALE * 0.07065121306042958674, 254242139Strasz [54] = FSCALE * 0.06720551273974976512, 255242139Strasz [55] = FSCALE * 0.06392786120670757270, 256242139Strasz [56] = FSCALE * 0.06081006262521796499, 257242139Strasz [57] = FSCALE * 0.05784432087483846296, 258242139Strasz [58] = FSCALE * 0.05502322005640722902, 259242139Strasz [59] = FSCALE * 0.05233970594843239308, 260242139Strasz [60] = FSCALE * 0.04978706836786394297, 261242139Strasz [61] = FSCALE * 0.04735892439114092119, 262242139Strasz [62] = FSCALE * 0.04504920239355780606, 263242139Strasz [63] = FSCALE * 0.04285212686704017991, 264242139Strasz [64] = FSCALE * 0.04076220397836621516, 265242139Strasz [65] = FSCALE * 0.03877420783172200988, 266242139Strasz [66] = FSCALE * 0.03688316740124000544, 267242139Strasz [67] = FSCALE * 0.03508435410084502588, 268242139Strasz [68] = FSCALE * 0.03337326996032607948, 269242139Strasz [69] = FSCALE * 0.03174563637806794323, 270242139Strasz [70] = FSCALE * 0.03019738342231850073, 271242139Strasz [71] = FSCALE * 0.02872463965423942912, 272242139Strasz [72] = FSCALE * 0.02732372244729256080, 273242139Strasz [73] = FSCALE * 0.02599112877875534358, 274242139Strasz [74] = FSCALE * 0.02472352647033939120, 275242139Strasz [75] = FSCALE * 0.02351774585600910823, 276242139Strasz [76] = FSCALE * 0.02237077185616559577, 277242139Strasz [77] = FSCALE * 0.02127973643837716938, 278242139Strasz [78] = FSCALE * 0.02024191144580438847, 279242139Strasz [79] = FSCALE * 0.01925470177538692429, 280242139Strasz [80] = FSCALE * 0.01831563888873418029, 281242139Strasz [81] = FSCALE * 0.01742237463949351138, 282242139Strasz [82] = FSCALE * 0.01657267540176124754, 283242139Strasz [83] = FSCALE * 0.01576441648485449082, 284242139Strasz [84] = FSCALE * 0.01499557682047770621, 285242139Strasz [85] = FSCALE * 0.01426423390899925527, 286242139Strasz [86] = FSCALE * 0.01356855901220093175, 287242139Strasz [87] = FSCALE * 0.01290681258047986886, 288242139Strasz [88] = FSCALE * 0.01227733990306844117, 289242139Strasz [89] = FSCALE * 0.01167856697039544521, 290242139Strasz [90] = FSCALE * 0.01110899653824230649, 291242139Strasz [91] = FSCALE * 0.01056720438385265337, 292242139Strasz [92] = FSCALE * 0.01005183574463358164, 293242139Strasz [93] = FSCALE * 0.00956160193054350793, 294242139Strasz [94] = FSCALE * 0.00909527710169581709, 295242139Strasz [95] = FSCALE * 0.00865169520312063417, 296242139Strasz [96] = FSCALE * 0.00822974704902002884, 297242139Strasz [97] = FSCALE * 0.00782837754922577143, 298242139Strasz [98] = FSCALE * 0.00744658307092434051, 299242139Strasz [99] = FSCALE * 0.00708340892905212004, 300242139Strasz [100] = FSCALE * 0.00673794699908546709, 301242139Strasz [101] = FSCALE * 0.00640933344625638184, 302242139Strasz [102] = FSCALE * 0.00609674656551563610, 303242139Strasz [103] = FSCALE * 0.00579940472684214321, 304242139Strasz [104] = FSCALE * 0.00551656442076077241, 305242139Strasz [105] = FSCALE * 0.00524751839918138427, 306242139Strasz [106] = FSCALE * 0.00499159390691021621, 307242139Strasz [107] = FSCALE * 0.00474815099941147558, 308242139Strasz [108] = FSCALE * 0.00451658094261266798, 309242139Strasz [109] = FSCALE * 0.00429630469075234057, 310242139Strasz [110] = FSCALE * 0.00408677143846406699, 311242139Strasz}; 312242139Strasz#endif 313242139Strasz 314242139Strasz#define CCPU_EXP_MAX 110 315242139Strasz 316242139Strasz/* 317242139Strasz * This function is analogical to the getpcpu() function in the ps(1) command. 318242139Strasz * They should both calculate in the same way so that the racct %cpu 319242139Strasz * calculations are consistent with the values showed by the ps(1) tool. 320242139Strasz * The calculations are more complex in the 4BSD scheduler because of the value 321242139Strasz * of the ccpu variable. In ULE it is defined to be zero which saves us some 322242139Strasz * work. 323242139Strasz */ 324242139Straszstatic uint64_t 325242139Straszracct_getpcpu(struct proc *p, u_int pcpu) 326242139Strasz{ 327242139Strasz u_int swtime; 328242139Strasz#ifdef SCHED_4BSD 329242139Strasz fixpt_t pctcpu, pctcpu_next; 330242139Strasz#endif 331242139Strasz#ifdef SMP 332242139Strasz struct pcpu *pc; 333242139Strasz int found; 334242139Strasz#endif 335242139Strasz fixpt_t p_pctcpu; 336242139Strasz struct thread *td; 337242139Strasz 338282213Strasz ASSERT_RACCT_ENABLED(); 339282213Strasz 340242139Strasz /* 341242139Strasz * If the process is swapped out, we count its %cpu usage as zero. 342242139Strasz * This behaviour is consistent with the userland ps(1) tool. 343242139Strasz */ 344242139Strasz if ((p->p_flag & P_INMEM) == 0) 345242139Strasz return (0); 346242139Strasz swtime = (ticks - p->p_swtick) / hz; 347242139Strasz 348242139Strasz /* 349242139Strasz * For short-lived processes, the sched_pctcpu() returns small 350242139Strasz * values even for cpu intensive processes. Therefore we use 351242139Strasz * our own estimate in this case. 352242139Strasz */ 353242139Strasz if (swtime < RACCT_PCPU_SECS) 354242139Strasz return (pcpu); 355242139Strasz 356242139Strasz p_pctcpu = 0; 357242139Strasz FOREACH_THREAD_IN_PROC(p, td) { 358242139Strasz if (td == PCPU_GET(idlethread)) 359242139Strasz continue; 360242139Strasz#ifdef SMP 361242139Strasz found = 0; 362242139Strasz STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { 363242139Strasz if (td == pc->pc_idlethread) { 364242139Strasz found = 1; 365242139Strasz break; 366242139Strasz } 367242139Strasz } 368242139Strasz if (found) 369242139Strasz continue; 370242139Strasz#endif 371242139Strasz thread_lock(td); 372242139Strasz#ifdef SCHED_4BSD 373242139Strasz pctcpu = sched_pctcpu(td); 374242139Strasz /* Count also the yet unfinished second. */ 375242139Strasz pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT; 376242139Strasz pctcpu_next += sched_pctcpu_delta(td); 377242139Strasz p_pctcpu += max(pctcpu, pctcpu_next); 378242139Strasz#else 379242139Strasz /* 380242139Strasz * In ULE the %cpu statistics are updated on every 381242139Strasz * sched_pctcpu() call. So special calculations to 382242139Strasz * account for the latest (unfinished) second are 383242139Strasz * not needed. 384242139Strasz */ 385242139Strasz p_pctcpu += sched_pctcpu(td); 386242139Strasz#endif 387242139Strasz thread_unlock(td); 388242139Strasz } 389242139Strasz 390242139Strasz#ifdef SCHED_4BSD 391242139Strasz if (swtime <= CCPU_EXP_MAX) 392242139Strasz return ((100 * (uint64_t)p_pctcpu * 1000000) / 393242139Strasz (FSCALE - ccpu_exp[swtime])); 394242139Strasz#endif 395242139Strasz 396242139Strasz return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE); 397242139Strasz} 398242139Strasz 399220137Straszstatic void 400220137Straszracct_add_racct(struct racct *dest, const struct racct *src) 401220137Strasz{ 402220137Strasz int i; 403220137Strasz 404282213Strasz ASSERT_RACCT_ENABLED(); 405297578Strasz RACCT_LOCK_ASSERT(); 406220137Strasz 407220137Strasz /* 408220137Strasz * Update resource usage in dest. 409220137Strasz */ 410220137Strasz for (i = 0; i <= RACCT_MAX; i++) { 411220137Strasz KASSERT(dest->r_resources[i] >= 0, 412243088Strasz ("%s: resource %d propagation meltdown: dest < 0", 413243088Strasz __func__, i)); 414220137Strasz KASSERT(src->r_resources[i] >= 0, 415243088Strasz ("%s: resource %d propagation meltdown: src < 0", 416243088Strasz __func__, i)); 417220137Strasz dest->r_resources[i] += src->r_resources[i]; 418220137Strasz } 419220137Strasz} 420220137Strasz 421220137Straszstatic void 422220137Straszracct_sub_racct(struct racct *dest, const struct racct *src) 423220137Strasz{ 424220137Strasz int i; 425220137Strasz 426282213Strasz ASSERT_RACCT_ENABLED(); 427297578Strasz RACCT_LOCK_ASSERT(); 428220137Strasz 429220137Strasz /* 430220137Strasz * Update resource usage in dest. 431220137Strasz */ 432220137Strasz for (i = 0; i <= RACCT_MAX; i++) { 433243070Strasz if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) { 434220137Strasz KASSERT(dest->r_resources[i] >= 0, 435243088Strasz ("%s: resource %d propagation meltdown: dest < 0", 436243088Strasz __func__, i)); 437220137Strasz KASSERT(src->r_resources[i] >= 0, 438243088Strasz ("%s: resource %d propagation meltdown: src < 0", 439243088Strasz __func__, i)); 440220137Strasz KASSERT(src->r_resources[i] <= dest->r_resources[i], 441243088Strasz ("%s: resource %d propagation meltdown: src > dest", 442243088Strasz __func__, i)); 443220137Strasz } 444242139Strasz if (RACCT_CAN_DROP(i)) { 445220137Strasz dest->r_resources[i] -= src->r_resources[i]; 446220137Strasz if (dest->r_resources[i] < 0) { 447243070Strasz KASSERT(RACCT_IS_SLOPPY(i) || 448243070Strasz RACCT_IS_DECAYING(i), 449243088Strasz ("%s: resource %d usage < 0", __func__, i)); 450220137Strasz dest->r_resources[i] = 0; 451220137Strasz } 452220137Strasz } 453220137Strasz } 454220137Strasz} 455220137Strasz 456220137Straszvoid 457220137Straszracct_create(struct racct **racctp) 458220137Strasz{ 459220137Strasz 460282213Strasz if (!racct_enable) 461282213Strasz return; 462282213Strasz 463292384Smarkj SDT_PROBE1(racct, , racct, create, racctp); 464220137Strasz 465220137Strasz KASSERT(*racctp == NULL, ("racct already allocated")); 466220137Strasz 467220137Strasz *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO); 468220137Strasz} 469220137Strasz 470220137Straszstatic void 471220137Straszracct_destroy_locked(struct racct **racctp) 472220137Strasz{ 473298045Strasz struct racct *racct; 474220137Strasz int i; 475220137Strasz 476282213Strasz ASSERT_RACCT_ENABLED(); 477282213Strasz 478292384Smarkj SDT_PROBE1(racct, , racct, destroy, racctp); 479220137Strasz 480297578Strasz RACCT_LOCK_ASSERT(); 481220137Strasz KASSERT(racctp != NULL, ("NULL racctp")); 482220137Strasz KASSERT(*racctp != NULL, ("NULL racct")); 483220137Strasz 484220137Strasz racct = *racctp; 485220137Strasz 486220137Strasz for (i = 0; i <= RACCT_MAX; i++) { 487223844Strasz if (RACCT_IS_SLOPPY(i)) 488220137Strasz continue; 489223844Strasz if (!RACCT_IS_RECLAIMABLE(i)) 490220137Strasz continue; 491220137Strasz KASSERT(racct->r_resources[i] == 0, 492220137Strasz ("destroying non-empty racct: " 493220137Strasz "%ju allocated for resource %d\n", 494220137Strasz racct->r_resources[i], i)); 495220137Strasz } 496220137Strasz uma_zfree(racct_zone, racct); 497220137Strasz *racctp = NULL; 498220137Strasz} 499220137Strasz 500220137Straszvoid 501220137Straszracct_destroy(struct racct **racct) 502220137Strasz{ 503220137Strasz 504282213Strasz if (!racct_enable) 505282213Strasz return; 506282213Strasz 507297578Strasz RACCT_LOCK(); 508220137Strasz racct_destroy_locked(racct); 509297578Strasz RACCT_UNLOCK(); 510220137Strasz} 511220137Strasz 512220137Strasz/* 513292162Strasz * Increase consumption of 'resource' by 'amount' for 'racct', 514292162Strasz * but not its parents. Differently from other cases, 'amount' here 515220137Strasz * may be less than zero. 516220137Strasz */ 517220137Straszstatic void 518284378Sjlhracct_adjust_resource(struct racct *racct, int resource, 519292161Strasz int64_t amount) 520220137Strasz{ 521220137Strasz 522282213Strasz ASSERT_RACCT_ENABLED(); 523297578Strasz RACCT_LOCK_ASSERT(); 524220137Strasz KASSERT(racct != NULL, ("NULL racct")); 525220137Strasz 526220137Strasz racct->r_resources[resource] += amount; 527220137Strasz if (racct->r_resources[resource] < 0) { 528242139Strasz KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource), 529243088Strasz ("%s: resource %d usage < 0", __func__, resource)); 530220137Strasz racct->r_resources[resource] = 0; 531220137Strasz } 532242139Strasz 533242139Strasz /* 534242139Strasz * There are some cases where the racct %cpu resource would grow 535290662Sjpaetzel * beyond 100% per core. For example in racct_proc_exit() we add 536290662Sjpaetzel * the process %cpu usage to the ucred racct containers. If too 537290662Sjpaetzel * many processes terminated in a short time span, the ucred %cpu 538290662Sjpaetzel * resource could grow too much. Also, the 4BSD scheduler sometimes 539290662Sjpaetzel * returns for a thread more than 100% cpu usage. So we set a sane 540290662Sjpaetzel * boundary here to 100% * the maxumum number of CPUs. 541242139Strasz */ 542242139Strasz if ((resource == RACCT_PCTCPU) && 543290662Sjpaetzel (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU)) 544290662Sjpaetzel racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU; 545220137Strasz} 546220137Strasz 547225944Straszstatic int 548297490Straszracct_add_locked(struct proc *p, int resource, uint64_t amount, int force) 549220137Strasz{ 550220137Strasz#ifdef RCTL 551220137Strasz int error; 552220137Strasz#endif 553220137Strasz 554282213Strasz ASSERT_RACCT_ENABLED(); 555282213Strasz 556220137Strasz /* 557220137Strasz * We need proc lock to dereference p->p_ucred. 558220137Strasz */ 559220137Strasz PROC_LOCK_ASSERT(p, MA_OWNED); 560220137Strasz 561220137Strasz#ifdef RCTL 562297492Strasz error = rctl_enforce(p, resource, amount); 563297492Strasz if (error && !force && RACCT_IS_DENIABLE(resource)) { 564297492Strasz SDT_PROBE3(racct, , rusage, add__failure, p, resource, amount); 565297492Strasz return (error); 566220137Strasz } 567220137Strasz#endif 568284378Sjlh racct_adjust_resource(p->p_racct, resource, amount); 569220137Strasz racct_add_cred_locked(p->p_ucred, resource, amount); 570220137Strasz 571220137Strasz return (0); 572220137Strasz} 573220137Strasz 574225944Strasz/* 575225944Strasz * Increase allocation of 'resource' by 'amount' for process 'p'. 576225944Strasz * Return 0 if it's below limits, or errno, if it's not. 577225944Strasz */ 578225944Straszint 579225944Straszracct_add(struct proc *p, int resource, uint64_t amount) 580225944Strasz{ 581225944Strasz int error; 582225944Strasz 583282213Strasz if (!racct_enable) 584282213Strasz return (0); 585282213Strasz 586297490Strasz SDT_PROBE3(racct, , rusage, add, p, resource, amount); 587297490Strasz 588297578Strasz RACCT_LOCK(); 589297490Strasz error = racct_add_locked(p, resource, amount, 0); 590297578Strasz RACCT_UNLOCK(); 591225944Strasz return (error); 592225944Strasz} 593225944Strasz 594297491Strasz/* 595297491Strasz * Increase allocation of 'resource' by 'amount' for process 'p'. 596297491Strasz * Doesn't check for limits and never fails. 597297491Strasz */ 598297491Straszvoid 599297491Straszracct_add_force(struct proc *p, int resource, uint64_t amount) 600297491Strasz{ 601297491Strasz 602297491Strasz if (!racct_enable) 603297491Strasz return; 604297491Strasz 605297491Strasz SDT_PROBE3(racct, , rusage, add__force, p, resource, amount); 606297491Strasz 607297578Strasz RACCT_LOCK(); 608297491Strasz racct_add_locked(p, resource, amount, 1); 609297578Strasz RACCT_UNLOCK(); 610297491Strasz} 611297491Strasz 612220137Straszstatic void 613220137Straszracct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) 614220137Strasz{ 615220137Strasz struct prison *pr; 616220137Strasz 617282213Strasz ASSERT_RACCT_ENABLED(); 618282213Strasz 619284378Sjlh racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, amount); 620220137Strasz for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) 621284378Sjlh racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, 622221362Strasz amount); 623284378Sjlh racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, amount); 624220137Strasz} 625220137Strasz 626220137Strasz/* 627220137Strasz * Increase allocation of 'resource' by 'amount' for credential 'cred'. 628220137Strasz * Doesn't check for limits and never fails. 629220137Strasz */ 630220137Straszvoid 631220137Straszracct_add_cred(struct ucred *cred, int resource, uint64_t amount) 632220137Strasz{ 633220137Strasz 634282213Strasz if (!racct_enable) 635282213Strasz return; 636282213Strasz 637298414Strasz SDT_PROBE3(racct, , rusage, add__cred, cred, resource, amount); 638298414Strasz 639297578Strasz RACCT_LOCK(); 640220137Strasz racct_add_cred_locked(cred, resource, amount); 641297578Strasz RACCT_UNLOCK(); 642220137Strasz} 643220137Strasz 644297633Strasz/* 645297633Strasz * Account for disk IO resource consumption. Checks for limits, 646297633Strasz * but never fails, due to disk limits being undeniable. 647297633Strasz */ 648297633Straszvoid 649297633Straszracct_add_buf(struct proc *p, const struct buf *bp, int is_write) 650297633Strasz{ 651297633Strasz 652297633Strasz ASSERT_RACCT_ENABLED(); 653297633Strasz PROC_LOCK_ASSERT(p, MA_OWNED); 654297633Strasz 655298414Strasz SDT_PROBE3(racct, , rusage, add__buf, p, bp, is_write); 656298414Strasz 657297633Strasz RACCT_LOCK(); 658297633Strasz if (is_write) { 659297633Strasz racct_add_locked(curproc, RACCT_WRITEBPS, bp->b_bcount, 1); 660297633Strasz racct_add_locked(curproc, RACCT_WRITEIOPS, 1, 1); 661297633Strasz } else { 662297633Strasz racct_add_locked(curproc, RACCT_READBPS, bp->b_bcount, 1); 663297633Strasz racct_add_locked(curproc, RACCT_READIOPS, 1, 1); 664297633Strasz } 665297633Strasz RACCT_UNLOCK(); 666297633Strasz} 667297633Strasz 668220137Straszstatic int 669297489Straszracct_set_locked(struct proc *p, int resource, uint64_t amount, int force) 670220137Strasz{ 671298045Strasz int64_t old_amount, decayed_amount, diff_proc, diff_cred; 672220137Strasz#ifdef RCTL 673220137Strasz int error; 674220137Strasz#endif 675220137Strasz 676282213Strasz ASSERT_RACCT_ENABLED(); 677282213Strasz 678220137Strasz /* 679220137Strasz * We need proc lock to dereference p->p_ucred. 680220137Strasz */ 681220137Strasz PROC_LOCK_ASSERT(p, MA_OWNED); 682220137Strasz 683242139Strasz old_amount = p->p_racct->r_resources[resource]; 684242139Strasz /* 685242139Strasz * The diffs may be negative. 686242139Strasz */ 687242139Strasz diff_proc = amount - old_amount; 688297633Strasz if (resource == RACCT_PCTCPU) { 689242139Strasz /* 690242139Strasz * Resources in per-credential racct containers may decay. 691242139Strasz * If this is the case, we need to calculate the difference 692242139Strasz * between the new amount and the proportional value of the 693242139Strasz * old amount that has decayed in the ucred racct containers. 694242139Strasz */ 695242139Strasz decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; 696242139Strasz diff_cred = amount - decayed_amount; 697242139Strasz } else 698242139Strasz diff_cred = diff_proc; 699220137Strasz#ifdef notyet 700242139Strasz KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource), 701243088Strasz ("%s: usage of non-droppable resource %d dropping", __func__, 702220137Strasz resource)); 703220137Strasz#endif 704220137Strasz#ifdef RCTL 705297492Strasz if (diff_proc > 0) { 706242139Strasz error = rctl_enforce(p, resource, diff_proc); 707297492Strasz if (error && !force && RACCT_IS_DENIABLE(resource)) { 708292384Smarkj SDT_PROBE3(racct, , rusage, set__failure, p, resource, 709292384Smarkj amount); 710220137Strasz return (error); 711220137Strasz } 712220137Strasz } 713220137Strasz#endif 714284378Sjlh racct_adjust_resource(p->p_racct, resource, diff_proc); 715242139Strasz if (diff_cred > 0) 716242139Strasz racct_add_cred_locked(p->p_ucred, resource, diff_cred); 717242139Strasz else if (diff_cred < 0) 718242139Strasz racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); 719220137Strasz 720220137Strasz return (0); 721220137Strasz} 722220137Strasz 723220137Strasz/* 724220137Strasz * Set allocation of 'resource' to 'amount' for process 'p'. 725220137Strasz * Return 0 if it's below limits, or errno, if it's not. 726220137Strasz * 727220137Strasz * Note that decreasing the allocation always returns 0, 728220137Strasz * even if it's above the limit. 729220137Strasz */ 730220137Straszint 731220137Straszracct_set(struct proc *p, int resource, uint64_t amount) 732220137Strasz{ 733220137Strasz int error; 734220137Strasz 735282213Strasz if (!racct_enable) 736282213Strasz return (0); 737282213Strasz 738297489Strasz SDT_PROBE3(racct, , rusage, set__force, p, resource, amount); 739297489Strasz 740297578Strasz RACCT_LOCK(); 741297489Strasz error = racct_set_locked(p, resource, amount, 0); 742297578Strasz RACCT_UNLOCK(); 743220137Strasz return (error); 744220137Strasz} 745220137Strasz 746297491Straszvoid 747297491Straszracct_set_force(struct proc *p, int resource, uint64_t amount) 748297491Strasz{ 749297491Strasz 750297491Strasz if (!racct_enable) 751297491Strasz return; 752297491Strasz 753297491Strasz SDT_PROBE3(racct, , rusage, set, p, resource, amount); 754297491Strasz 755297578Strasz RACCT_LOCK(); 756297491Strasz racct_set_locked(p, resource, amount, 1); 757297578Strasz RACCT_UNLOCK(); 758297491Strasz} 759297491Strasz 760220137Strasz/* 761220137Strasz * Returns amount of 'resource' the process 'p' can keep allocated. 762220137Strasz * Allocating more than that would be denied, unless the resource 763220137Strasz * is marked undeniable. Amount of already allocated resource does 764220137Strasz * not matter. 765220137Strasz */ 766220137Straszuint64_t 767220137Straszracct_get_limit(struct proc *p, int resource) 768220137Strasz{ 769298414Strasz#ifdef RCTL 770298414Strasz uint64_t available; 771220137Strasz 772282213Strasz if (!racct_enable) 773282213Strasz return (UINT64_MAX); 774282213Strasz 775298414Strasz RACCT_LOCK(); 776298414Strasz available = rctl_get_limit(p, resource); 777298414Strasz RACCT_UNLOCK(); 778298414Strasz 779298414Strasz return (available); 780220137Strasz#else 781298414Strasz 782220137Strasz return (UINT64_MAX); 783220137Strasz#endif 784220137Strasz} 785220137Strasz 786220137Strasz/* 787220137Strasz * Returns amount of 'resource' the process 'p' can keep allocated. 788220137Strasz * Allocating more than that would be denied, unless the resource 789220137Strasz * is marked undeniable. Amount of already allocated resource does 790220137Strasz * matter. 791220137Strasz */ 792220137Straszuint64_t 793220137Straszracct_get_available(struct proc *p, int resource) 794220137Strasz{ 795298414Strasz#ifdef RCTL 796298414Strasz uint64_t available; 797220137Strasz 798282213Strasz if (!racct_enable) 799282213Strasz return (UINT64_MAX); 800282213Strasz 801298414Strasz RACCT_LOCK(); 802298414Strasz available = rctl_get_available(p, resource); 803298414Strasz RACCT_UNLOCK(); 804298414Strasz 805298414Strasz return (available); 806220137Strasz#else 807298414Strasz 808220137Strasz return (UINT64_MAX); 809220137Strasz#endif 810220137Strasz} 811220137Strasz 812220137Strasz/* 813242139Strasz * Returns amount of the %cpu resource that process 'p' can add to its %cpu 814242139Strasz * utilization. Adding more than that would lead to the process being 815242139Strasz * throttled. 816242139Strasz */ 817242139Straszstatic int64_t 818242139Straszracct_pcpu_available(struct proc *p) 819242139Strasz{ 820298414Strasz#ifdef RCTL 821298414Strasz uint64_t available; 822242139Strasz 823282213Strasz ASSERT_RACCT_ENABLED(); 824282213Strasz 825298414Strasz RACCT_LOCK(); 826298414Strasz available = rctl_pcpu_available(p); 827298414Strasz RACCT_UNLOCK(); 828298414Strasz 829298414Strasz return (available); 830242139Strasz#else 831298414Strasz 832242139Strasz return (INT64_MAX); 833242139Strasz#endif 834242139Strasz} 835242139Strasz 836242139Strasz/* 837220137Strasz * Decrease allocation of 'resource' by 'amount' for process 'p'. 838220137Strasz */ 839220137Straszvoid 840220137Straszracct_sub(struct proc *p, int resource, uint64_t amount) 841220137Strasz{ 842220137Strasz 843282213Strasz if (!racct_enable) 844282213Strasz return; 845282213Strasz 846292384Smarkj SDT_PROBE3(racct, , rusage, sub, p, resource, amount); 847220137Strasz 848220137Strasz /* 849220137Strasz * We need proc lock to dereference p->p_ucred. 850220137Strasz */ 851220137Strasz PROC_LOCK_ASSERT(p, MA_OWNED); 852242139Strasz KASSERT(RACCT_CAN_DROP(resource), 853243088Strasz ("%s: called for non-droppable resource %d", __func__, resource)); 854220137Strasz 855297578Strasz RACCT_LOCK(); 856220137Strasz KASSERT(amount <= p->p_racct->r_resources[resource], 857243088Strasz ("%s: freeing %ju of resource %d, which is more " 858243088Strasz "than allocated %jd for %s (pid %d)", __func__, amount, resource, 859220137Strasz (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid)); 860220137Strasz 861284378Sjlh racct_adjust_resource(p->p_racct, resource, -amount); 862220137Strasz racct_sub_cred_locked(p->p_ucred, resource, amount); 863297578Strasz RACCT_UNLOCK(); 864220137Strasz} 865220137Strasz 866220137Straszstatic void 867220137Straszracct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) 868220137Strasz{ 869220137Strasz struct prison *pr; 870220137Strasz 871282213Strasz ASSERT_RACCT_ENABLED(); 872282213Strasz 873284378Sjlh racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, -amount); 874220137Strasz for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) 875284378Sjlh racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, 876221362Strasz -amount); 877284378Sjlh racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, -amount); 878220137Strasz} 879220137Strasz 880220137Strasz/* 881220137Strasz * Decrease allocation of 'resource' by 'amount' for credential 'cred'. 882220137Strasz */ 883220137Straszvoid 884220137Straszracct_sub_cred(struct ucred *cred, int resource, uint64_t amount) 885220137Strasz{ 886220137Strasz 887282213Strasz if (!racct_enable) 888282213Strasz return; 889282213Strasz 890298414Strasz SDT_PROBE3(racct, , rusage, sub__cred, cred, resource, amount); 891298414Strasz 892298414Strasz#ifdef notyet 893298414Strasz KASSERT(RACCT_CAN_DROP(resource), 894298414Strasz ("%s: called for resource %d which can not drop", __func__, 895298414Strasz resource)); 896298414Strasz#endif 897298414Strasz 898297578Strasz RACCT_LOCK(); 899220137Strasz racct_sub_cred_locked(cred, resource, amount); 900297578Strasz RACCT_UNLOCK(); 901220137Strasz} 902220137Strasz 903220137Strasz/* 904220137Strasz * Inherit resource usage information from the parent process. 905220137Strasz */ 906220137Straszint 907220137Straszracct_proc_fork(struct proc *parent, struct proc *child) 908220137Strasz{ 909220137Strasz int i, error = 0; 910220137Strasz 911282213Strasz if (!racct_enable) 912282213Strasz return (0); 913282213Strasz 914220137Strasz /* 915220137Strasz * Create racct for the child process. 916220137Strasz */ 917220137Strasz racct_create(&child->p_racct); 918220137Strasz 919220137Strasz PROC_LOCK(parent); 920220137Strasz PROC_LOCK(child); 921297578Strasz RACCT_LOCK(); 922220137Strasz 923225981Strasz#ifdef RCTL 924225981Strasz error = rctl_proc_fork(parent, child); 925225981Strasz if (error != 0) 926225981Strasz goto out; 927225981Strasz#endif 928225981Strasz 929242139Strasz /* Init process cpu time. */ 930242139Strasz child->p_prev_runtime = 0; 931242139Strasz child->p_throttled = 0; 932242139Strasz 933220137Strasz /* 934220137Strasz * Inherit resource usage. 935220137Strasz */ 936220137Strasz for (i = 0; i <= RACCT_MAX; i++) { 937220137Strasz if (parent->p_racct->r_resources[i] == 0 || 938223844Strasz !RACCT_IS_INHERITABLE(i)) 939220137Strasz continue; 940220137Strasz 941220137Strasz error = racct_set_locked(child, i, 942297489Strasz parent->p_racct->r_resources[i], 0); 943225938Strasz if (error != 0) 944220137Strasz goto out; 945220137Strasz } 946220137Strasz 947297490Strasz error = racct_add_locked(child, RACCT_NPROC, 1, 0); 948297490Strasz error += racct_add_locked(child, RACCT_NTHR, 1, 0); 949225944Strasz 950220137Straszout: 951297578Strasz RACCT_UNLOCK(); 952220137Strasz PROC_UNLOCK(child); 953220137Strasz PROC_UNLOCK(parent); 954220137Strasz 955235787Strasz if (error != 0) 956235787Strasz racct_proc_exit(child); 957235787Strasz 958220137Strasz return (error); 959220137Strasz} 960220137Strasz 961225940Strasz/* 962225940Strasz * Called at the end of fork1(), to handle rules that require the process 963225940Strasz * to be fully initialized. 964225940Strasz */ 965220137Straszvoid 966225940Straszracct_proc_fork_done(struct proc *child) 967225940Strasz{ 968225940Strasz 969282213Strasz if (!racct_enable) 970282213Strasz return; 971282213Strasz 972298414Strasz PROC_LOCK_ASSERT(child, MA_OWNED); 973298414Strasz 974298414Strasz#ifdef RCTL 975297578Strasz RACCT_LOCK(); 976225940Strasz rctl_enforce(child, RACCT_NPROC, 0); 977225940Strasz rctl_enforce(child, RACCT_NTHR, 0); 978297578Strasz RACCT_UNLOCK(); 979225940Strasz#endif 980225940Strasz} 981225940Strasz 982225940Straszvoid 983220137Straszracct_proc_exit(struct proc *p) 984220137Strasz{ 985298045Strasz struct timeval wallclock; 986298045Strasz uint64_t pct_estimate, pct, runtime; 987225364Strasz int i; 988220137Strasz 989282213Strasz if (!racct_enable) 990282213Strasz return; 991282213Strasz 992220137Strasz PROC_LOCK(p); 993220137Strasz /* 994220137Strasz * We don't need to calculate rux, proc_reap() has already done this. 995220137Strasz */ 996220137Strasz runtime = cputick2usec(p->p_rux.rux_runtime); 997220137Strasz#ifdef notyet 998220137Strasz KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); 999220137Strasz#else 1000220137Strasz if (runtime < p->p_prev_runtime) 1001220137Strasz runtime = p->p_prev_runtime; 1002220137Strasz#endif 1003242139Strasz microuptime(&wallclock); 1004242139Strasz timevalsub(&wallclock, &p->p_stats->p_start); 1005242957Strasz if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { 1006242957Strasz pct_estimate = (1000000 * runtime * 100) / 1007242957Strasz ((uint64_t)wallclock.tv_sec * 1000000 + 1008242957Strasz wallclock.tv_usec); 1009242957Strasz } else 1010242957Strasz pct_estimate = 0; 1011242139Strasz pct = racct_getpcpu(p, pct_estimate); 1012242139Strasz 1013297578Strasz RACCT_LOCK(); 1014297489Strasz racct_set_locked(p, RACCT_CPU, runtime, 0); 1015242139Strasz racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct); 1016220137Strasz 1017225364Strasz for (i = 0; i <= RACCT_MAX; i++) { 1018225364Strasz if (p->p_racct->r_resources[i] == 0) 1019225364Strasz continue; 1020225364Strasz if (!RACCT_IS_RECLAIMABLE(i)) 1021225364Strasz continue; 1022297489Strasz racct_set_locked(p, i, 0, 0); 1023225364Strasz } 1024225364Strasz 1025220137Strasz#ifdef RCTL 1026220137Strasz rctl_racct_release(p->p_racct); 1027220137Strasz#endif 1028298414Strasz racct_destroy_locked(&p->p_racct); 1029298414Strasz RACCT_UNLOCK(); 1030298414Strasz PROC_UNLOCK(p); 1031220137Strasz} 1032220137Strasz 1033220137Strasz/* 1034220137Strasz * Called after credentials change, to move resource utilisation 1035220137Strasz * between raccts. 1036220137Strasz */ 1037220137Straszvoid 1038220137Straszracct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, 1039220137Strasz struct ucred *newcred) 1040220137Strasz{ 1041220137Strasz struct uidinfo *olduip, *newuip; 1042220137Strasz struct loginclass *oldlc, *newlc; 1043220137Strasz struct prison *oldpr, *newpr, *pr; 1044220137Strasz 1045282213Strasz if (!racct_enable) 1046282213Strasz return; 1047282213Strasz 1048220137Strasz PROC_LOCK_ASSERT(p, MA_NOTOWNED); 1049220137Strasz 1050220137Strasz newuip = newcred->cr_ruidinfo; 1051220137Strasz olduip = oldcred->cr_ruidinfo; 1052220137Strasz newlc = newcred->cr_loginclass; 1053220137Strasz oldlc = oldcred->cr_loginclass; 1054220137Strasz newpr = newcred->cr_prison; 1055220137Strasz oldpr = oldcred->cr_prison; 1056220137Strasz 1057297578Strasz RACCT_LOCK(); 1058220137Strasz if (newuip != olduip) { 1059220137Strasz racct_sub_racct(olduip->ui_racct, p->p_racct); 1060220137Strasz racct_add_racct(newuip->ui_racct, p->p_racct); 1061220137Strasz } 1062220137Strasz if (newlc != oldlc) { 1063220137Strasz racct_sub_racct(oldlc->lc_racct, p->p_racct); 1064220137Strasz racct_add_racct(newlc->lc_racct, p->p_racct); 1065220137Strasz } 1066220137Strasz if (newpr != oldpr) { 1067220137Strasz for (pr = oldpr; pr != NULL; pr = pr->pr_parent) 1068221362Strasz racct_sub_racct(pr->pr_prison_racct->prr_racct, 1069221362Strasz p->p_racct); 1070220137Strasz for (pr = newpr; pr != NULL; pr = pr->pr_parent) 1071221362Strasz racct_add_racct(pr->pr_prison_racct->prr_racct, 1072221362Strasz p->p_racct); 1073220137Strasz } 1074297578Strasz RACCT_UNLOCK(); 1075220137Strasz 1076220137Strasz#ifdef RCTL 1077220137Strasz rctl_proc_ucred_changed(p, newcred); 1078220137Strasz#endif 1079220137Strasz} 1080220137Strasz 1081232598Straszvoid 1082232598Straszracct_move(struct racct *dest, struct racct *src) 1083232598Strasz{ 1084232598Strasz 1085282213Strasz ASSERT_RACCT_ENABLED(); 1086282213Strasz 1087297578Strasz RACCT_LOCK(); 1088232598Strasz racct_add_racct(dest, src); 1089232598Strasz racct_sub_racct(src, src); 1090297578Strasz RACCT_UNLOCK(); 1091232598Strasz} 1092232598Strasz 1093297633Strasz/* 1094297633Strasz * Make the process sleep in userret() for 'timeout' ticks. Setting 1095297633Strasz * timeout to -1 makes it sleep until woken up by racct_proc_wakeup(). 1096297633Strasz */ 1097297633Straszvoid 1098297633Straszracct_proc_throttle(struct proc *p, int timeout) 1099242139Strasz{ 1100242139Strasz struct thread *td; 1101242139Strasz#ifdef SMP 1102242139Strasz int cpuid; 1103242139Strasz#endif 1104242139Strasz 1105297633Strasz KASSERT(timeout != 0, ("timeout %d", timeout)); 1106282213Strasz ASSERT_RACCT_ENABLED(); 1107242139Strasz PROC_LOCK_ASSERT(p, MA_OWNED); 1108242139Strasz 1109242139Strasz /* 1110242139Strasz * Do not block kernel processes. Also do not block processes with 1111242139Strasz * low %cpu utilization to improve interactivity. 1112242139Strasz */ 1113297633Strasz if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0) 1114242139Strasz return; 1115242139Strasz 1116297633Strasz if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout)) 1117297633Strasz return; 1118297633Strasz 1119297633Strasz p->p_throttled = timeout; 1120297633Strasz 1121242139Strasz FOREACH_THREAD_IN_PROC(p, td) { 1122248298Strasz thread_lock(td); 1123242139Strasz switch (td->td_state) { 1124242139Strasz case TDS_RUNQ: 1125242139Strasz /* 1126242139Strasz * If the thread is on the scheduler run-queue, we can 1127242139Strasz * not just remove it from there. So we set the flag 1128242139Strasz * TDF_NEEDRESCHED for the thread, so that once it is 1129242139Strasz * running, it is taken off the cpu as soon as possible. 1130242139Strasz */ 1131242139Strasz td->td_flags |= TDF_NEEDRESCHED; 1132242139Strasz break; 1133242139Strasz case TDS_RUNNING: 1134242139Strasz /* 1135242139Strasz * If the thread is running, we request a context 1136242139Strasz * switch for it by setting the TDF_NEEDRESCHED flag. 1137242139Strasz */ 1138242139Strasz td->td_flags |= TDF_NEEDRESCHED; 1139242139Strasz#ifdef SMP 1140242139Strasz cpuid = td->td_oncpu; 1141242139Strasz if ((cpuid != NOCPU) && (td != curthread)) 1142242139Strasz ipi_cpu(cpuid, IPI_AST); 1143242139Strasz#endif 1144242139Strasz break; 1145242139Strasz default: 1146242139Strasz break; 1147242139Strasz } 1148248298Strasz thread_unlock(td); 1149242139Strasz } 1150242139Strasz} 1151242139Strasz 1152242139Straszstatic void 1153242139Straszracct_proc_wakeup(struct proc *p) 1154242139Strasz{ 1155282213Strasz 1156282213Strasz ASSERT_RACCT_ENABLED(); 1157282213Strasz 1158242139Strasz PROC_LOCK_ASSERT(p, MA_OWNED); 1159242139Strasz 1160297633Strasz if (p->p_throttled != 0) { 1161242139Strasz p->p_throttled = 0; 1162242139Strasz wakeup(p->p_racct); 1163242139Strasz } 1164242139Strasz} 1165242139Strasz 1166242139Straszstatic void 1167297494Straszracct_decay_callback(struct racct *racct, void *dummy1, void *dummy2) 1168242139Strasz{ 1169242139Strasz int64_t r_old, r_new; 1170242139Strasz 1171282213Strasz ASSERT_RACCT_ENABLED(); 1172297578Strasz RACCT_LOCK_ASSERT(); 1173282213Strasz 1174297633Strasz#ifdef RCTL 1175297633Strasz rctl_throttle_decay(racct, RACCT_READBPS); 1176297633Strasz rctl_throttle_decay(racct, RACCT_WRITEBPS); 1177297633Strasz rctl_throttle_decay(racct, RACCT_READIOPS); 1178297633Strasz rctl_throttle_decay(racct, RACCT_WRITEIOPS); 1179297633Strasz#endif 1180297633Strasz 1181297494Strasz r_old = racct->r_resources[RACCT_PCTCPU]; 1182242139Strasz 1183242139Strasz /* If there is nothing to decay, just exit. */ 1184242139Strasz if (r_old <= 0) 1185242139Strasz return; 1186242139Strasz 1187242139Strasz r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; 1188297494Strasz racct->r_resources[RACCT_PCTCPU] = r_new; 1189290857Strasz} 1190290857Strasz 1191290857Straszstatic void 1192290857Straszracct_decay_pre(void) 1193290857Strasz{ 1194290857Strasz 1195297578Strasz RACCT_LOCK(); 1196290857Strasz} 1197290857Strasz 1198290857Straszstatic void 1199290857Straszracct_decay_post(void) 1200290857Strasz{ 1201290857Strasz 1202297578Strasz RACCT_UNLOCK(); 1203242139Strasz} 1204242139Strasz 1205242139Straszstatic void 1206297495Straszracct_decay(void) 1207242139Strasz{ 1208282213Strasz 1209282213Strasz ASSERT_RACCT_ENABLED(); 1210282213Strasz 1211297494Strasz ui_racct_foreach(racct_decay_callback, racct_decay_pre, 1212297494Strasz racct_decay_post, NULL, NULL); 1213297494Strasz loginclass_racct_foreach(racct_decay_callback, racct_decay_pre, 1214297494Strasz racct_decay_post, NULL, NULL); 1215297494Strasz prison_racct_foreach(racct_decay_callback, racct_decay_pre, 1216297494Strasz racct_decay_post, NULL, NULL); 1217242139Strasz} 1218242139Strasz 1219242139Straszstatic void 1220220137Straszracctd(void) 1221220137Strasz{ 1222220137Strasz struct thread *td; 1223220137Strasz struct proc *p; 1224220137Strasz struct timeval wallclock; 1225298045Strasz uint64_t pct, pct_estimate, runtime; 1226220137Strasz 1227282213Strasz ASSERT_RACCT_ENABLED(); 1228282213Strasz 1229220137Strasz for (;;) { 1230297494Strasz racct_decay(); 1231242139Strasz 1232220137Strasz sx_slock(&allproc_lock); 1233220137Strasz 1234242139Strasz LIST_FOREACH(p, &zombproc, p_list) { 1235242139Strasz PROC_LOCK(p); 1236242139Strasz racct_set(p, RACCT_PCTCPU, 0); 1237242139Strasz PROC_UNLOCK(p); 1238242139Strasz } 1239242139Strasz 1240220137Strasz FOREACH_PROC_IN_SYSTEM(p) { 1241242139Strasz PROC_LOCK(p); 1242242139Strasz if (p->p_state != PRS_NORMAL) { 1243242139Strasz PROC_UNLOCK(p); 1244220137Strasz continue; 1245242139Strasz } 1246220137Strasz 1247220137Strasz microuptime(&wallclock); 1248220137Strasz timevalsub(&wallclock, &p->p_stats->p_start); 1249275121Skib PROC_STATLOCK(p); 1250232782Strasz FOREACH_THREAD_IN_PROC(p, td) 1251220137Strasz ruxagg(p, td); 1252220137Strasz runtime = cputick2usec(p->p_rux.rux_runtime); 1253275121Skib PROC_STATUNLOCK(p); 1254220137Strasz#ifdef notyet 1255220137Strasz KASSERT(runtime >= p->p_prev_runtime, 1256220137Strasz ("runtime < p_prev_runtime")); 1257220137Strasz#else 1258220137Strasz if (runtime < p->p_prev_runtime) 1259220137Strasz runtime = p->p_prev_runtime; 1260220137Strasz#endif 1261220137Strasz p->p_prev_runtime = runtime; 1262242957Strasz if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { 1263242957Strasz pct_estimate = (1000000 * runtime * 100) / 1264242957Strasz ((uint64_t)wallclock.tv_sec * 1000000 + 1265242957Strasz wallclock.tv_usec); 1266242957Strasz } else 1267242957Strasz pct_estimate = 0; 1268242139Strasz pct = racct_getpcpu(p, pct_estimate); 1269297578Strasz RACCT_LOCK(); 1270297633Strasz#ifdef RCTL 1271297633Strasz rctl_throttle_decay(p->p_racct, RACCT_READBPS); 1272297633Strasz rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS); 1273297633Strasz rctl_throttle_decay(p->p_racct, RACCT_READIOPS); 1274297633Strasz rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS); 1275297633Strasz#endif 1276297489Strasz racct_set_locked(p, RACCT_PCTCPU, pct, 1); 1277297489Strasz racct_set_locked(p, RACCT_CPU, runtime, 0); 1278220137Strasz racct_set_locked(p, RACCT_WALLCLOCK, 1279233126Sjh (uint64_t)wallclock.tv_sec * 1000000 + 1280297489Strasz wallclock.tv_usec, 0); 1281297578Strasz RACCT_UNLOCK(); 1282220137Strasz PROC_UNLOCK(p); 1283220137Strasz } 1284242139Strasz 1285242139Strasz /* 1286242139Strasz * To ensure that processes are throttled in a fair way, we need 1287242139Strasz * to iterate over all processes again and check the limits 1288242139Strasz * for %cpu resource only after ucred racct containers have been 1289242139Strasz * properly filled. 1290242139Strasz */ 1291242139Strasz FOREACH_PROC_IN_SYSTEM(p) { 1292242139Strasz PROC_LOCK(p); 1293242139Strasz if (p->p_state != PRS_NORMAL) { 1294242139Strasz PROC_UNLOCK(p); 1295242139Strasz continue; 1296242139Strasz } 1297242139Strasz 1298297633Strasz if (racct_pcpu_available(p) <= 0) { 1299297633Strasz if (p->p_racct->r_resources[RACCT_PCTCPU] > 1300297633Strasz pcpu_threshold) 1301297633Strasz racct_proc_throttle(p, -1); 1302297633Strasz } else if (p->p_throttled == -1) { 1303242139Strasz racct_proc_wakeup(p); 1304297633Strasz } 1305242139Strasz PROC_UNLOCK(p); 1306242139Strasz } 1307220137Strasz sx_sunlock(&allproc_lock); 1308220137Strasz pause("-", hz); 1309220137Strasz } 1310220137Strasz} 1311220137Strasz 1312220137Straszstatic struct kproc_desc racctd_kp = { 1313220137Strasz "racctd", 1314220137Strasz racctd, 1315220137Strasz NULL 1316220137Strasz}; 1317220137Strasz 1318220137Straszstatic void 1319282213Straszracctd_init(void) 1320282213Strasz{ 1321282213Strasz if (!racct_enable) 1322282213Strasz return; 1323282213Strasz 1324282213Strasz kproc_start(&racctd_kp); 1325282213Strasz} 1326282213StraszSYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL); 1327282213Strasz 1328282213Straszstatic void 1329220137Straszracct_init(void) 1330220137Strasz{ 1331282213Strasz if (!racct_enable) 1332282213Strasz return; 1333220137Strasz 1334220137Strasz racct_zone = uma_zcreate("racct", sizeof(struct racct), 1335298050Strasz NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 1336220137Strasz /* 1337220137Strasz * XXX: Move this somewhere. 1338220137Strasz */ 1339221362Strasz prison0.pr_prison_racct = prison_racct_find("0"); 1340220137Strasz} 1341220137StraszSYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL); 1342220137Strasz 1343220137Strasz#endif /* !RACCT */ 1344