1220137Strasz/*- 2220137Strasz * Copyright (c) 2010 The FreeBSD Foundation 3220137Strasz * All rights reserved. 4220137Strasz * 5220137Strasz * This software was developed by Edward Tomasz Napierala under sponsorship 6220137Strasz * from the FreeBSD Foundation. 7220137Strasz * 8220137Strasz * Redistribution and use in source and binary forms, with or without 9220137Strasz * modification, are permitted provided that the following conditions 10220137Strasz * are met: 11220137Strasz * 1. Redistributions of source code must retain the above copyright 12220137Strasz * notice, this list of conditions and the following disclaimer. 13220137Strasz * 2. Redistributions in binary form must reproduce the above copyright 14220137Strasz * notice, this list of conditions and the following disclaimer in the 15220137Strasz * documentation and/or other materials provided with the distribution. 16220137Strasz * 17220137Strasz * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18220137Strasz * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19220137Strasz * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20220137Strasz * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21220137Strasz * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22220137Strasz * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23220137Strasz * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24220137Strasz * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25220137Strasz * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26220137Strasz * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27220137Strasz * SUCH DAMAGE. 
28220137Strasz * 29220137Strasz * $FreeBSD$ 30220137Strasz */ 31220137Strasz 32220137Strasz#include <sys/cdefs.h> 33220137Strasz__FBSDID("$FreeBSD$"); 34220137Strasz 35220137Strasz#include "opt_kdtrace.h" 36249444Strasz#include "opt_sched.h" 37220137Strasz 38220137Strasz#include <sys/param.h> 39230167Savg#include <sys/systm.h> 40220137Strasz#include <sys/eventhandler.h> 41220137Strasz#include <sys/jail.h> 42220137Strasz#include <sys/kernel.h> 43220137Strasz#include <sys/kthread.h> 44220137Strasz#include <sys/lock.h> 45220137Strasz#include <sys/loginclass.h> 46220137Strasz#include <sys/malloc.h> 47220137Strasz#include <sys/mutex.h> 48220137Strasz#include <sys/proc.h> 49220137Strasz#include <sys/racct.h> 50220137Strasz#include <sys/resourcevar.h> 51220137Strasz#include <sys/sbuf.h> 52220137Strasz#include <sys/sched.h> 53220137Strasz#include <sys/sdt.h> 54249444Strasz#include <sys/smp.h> 55220137Strasz#include <sys/sx.h> 56249444Strasz#include <sys/sysctl.h> 57220137Strasz#include <sys/sysent.h> 58220137Strasz#include <sys/sysproto.h> 59220137Strasz#include <sys/umtx.h> 60249444Strasz#include <machine/smp.h> 61220137Strasz 62220137Strasz#ifdef RCTL 63220137Strasz#include <sys/rctl.h> 64220137Strasz#endif 65220137Strasz 66220137Strasz#ifdef RACCT 67220137Strasz 68220137StraszFEATURE(racct, "Resource Accounting"); 69220137Strasz 70249444Strasz/* 71249444Strasz * Do not block processes that have their %cpu usage <= pcpu_threshold. 72249444Strasz */ 73249444Straszstatic int pcpu_threshold = 1; 74249444Strasz 75249444StraszSYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting"); 76249444StraszSYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, 77249444Strasz 0, "Processes with higher %cpu usage than this value can be throttled."); 78249444Strasz 79249444Strasz/* 80249444Strasz * How many seconds it takes to use the scheduler %cpu calculations. 
When a 81249444Strasz * process starts, we compute its %cpu usage by dividing its runtime by the 82249444Strasz * process wall clock time. After RACCT_PCPU_SECS pass, we use the value 83249444Strasz * provided by the scheduler. 84249444Strasz */ 85249444Strasz#define RACCT_PCPU_SECS 3 86249444Strasz 87220137Straszstatic struct mtx racct_lock; 88220137StraszMTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF); 89220137Strasz 90220137Straszstatic uma_zone_t racct_zone; 91220137Strasz 92220137Straszstatic void racct_sub_racct(struct racct *dest, const struct racct *src); 93220137Straszstatic void racct_sub_cred_locked(struct ucred *cred, int resource, 94220137Strasz uint64_t amount); 95220137Straszstatic void racct_add_cred_locked(struct ucred *cred, int resource, 96220137Strasz uint64_t amount); 97220137Strasz 98220137StraszSDT_PROVIDER_DEFINE(racct); 99262057SavgSDT_PROBE_DEFINE3(racct, kernel, rusage, add, "struct proc *", "int", 100220137Strasz "uint64_t"); 101262057SavgSDT_PROBE_DEFINE3(racct, kernel, rusage, add__failure, 102220137Strasz "struct proc *", "int", "uint64_t"); 103262057SavgSDT_PROBE_DEFINE3(racct, kernel, rusage, add__cred, "struct ucred *", 104220137Strasz "int", "uint64_t"); 105262057SavgSDT_PROBE_DEFINE3(racct, kernel, rusage, add__force, "struct proc *", 106220137Strasz "int", "uint64_t"); 107262057SavgSDT_PROBE_DEFINE3(racct, kernel, rusage, set, "struct proc *", "int", 108220137Strasz "uint64_t"); 109262057SavgSDT_PROBE_DEFINE3(racct, kernel, rusage, set__failure, 110220137Strasz "struct proc *", "int", "uint64_t"); 111262057SavgSDT_PROBE_DEFINE3(racct, kernel, rusage, sub, "struct proc *", "int", 112220137Strasz "uint64_t"); 113262057SavgSDT_PROBE_DEFINE3(racct, kernel, rusage, sub__cred, "struct ucred *", 114220137Strasz "int", "uint64_t"); 115262057SavgSDT_PROBE_DEFINE1(racct, kernel, racct, create, "struct racct *"); 116262057SavgSDT_PROBE_DEFINE1(racct, kernel, racct, destroy, "struct racct *"); 
 SDT_PROBE_DEFINE2(racct, kernel, racct, join, "struct racct *",
    "struct racct *");
SDT_PROBE_DEFINE2(racct, kernel, racct, join__failure,
    "struct racct *", "struct racct *");
SDT_PROBE_DEFINE2(racct, kernel, racct, leave, "struct racct *",
    "struct racct *");

/*
 * Property flags of every resource, indexed by its RACCT_* identifier.
 * Resources without an entry get the implicit value 0 (no flags).
 *
 * NOTE(review): the RACCT_IS_*() and RACCT_CAN_DROP() predicates used
 * throughout this file are defined outside of it (presumably in
 * <sys/racct.h>) and are expected to test these flags -- confirm.
 */
int racct_types[] = {
	[RACCT_CPU] =
		RACCT_IN_MILLIONS,
	[RACCT_DATA] =
		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
	[RACCT_STACK] =
		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
	[RACCT_CORE] =
		RACCT_DENIABLE,
	[RACCT_RSS] =
		RACCT_RECLAIMABLE,
	[RACCT_MEMLOCK] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE,
	[RACCT_NPROC] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE,
	[RACCT_NOFILE] =
		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
	[RACCT_VMEM] =
		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
	[RACCT_NPTS] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_SWAP] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_NTHR] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE,
	[RACCT_MSGQQUEUED] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_MSGQSIZE] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_NMSGQ] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_NSEM] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_NSEMOP] =
		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
	[RACCT_NSHM] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_SHMSIZE] =
		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
	[RACCT_WALLCLOCK] =
		RACCT_IN_MILLIONS,
	[RACCT_PCTCPU] =
		RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS };

/*
 * Fixed-point representation (FSCALE == 1.0) of the factor 0.3.
 * racct_set_locked() and racct_set_force_locked() multiply the previous
 * per-process value of a decaying resource by this factor to estimate the
 * share of it that is still accounted in the per-credential containers.
 */
static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;

#ifdef SCHED_4BSD
/*
 * Contains intermediate values for %cpu calculations to avoid using floating
 * point in the kernel.
 * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20)
 * It is needed only for the 4BSD scheduler, because in ULE the ccpu value is
 * zero, so the calculations are more straightforward.
 */
fixpt_t ccpu_exp[] = {
	[0] = FSCALE * 1,
	[1] = FSCALE * 0.95122942450071400909,
	[2] = FSCALE * 0.90483741803595957316,
	[3] = FSCALE * 0.86070797642505780722,
	[4] = FSCALE * 0.81873075307798185866,
	[5] = FSCALE * 0.77880078307140486824,
	[6] = FSCALE * 0.74081822068171786606,
	[7] = FSCALE * 0.70468808971871343435,
	[8] = FSCALE * 0.67032004603563930074,
	[9] = FSCALE * 0.63762815162177329314,
	[10] = FSCALE * 0.60653065971263342360,
	[11] = FSCALE * 0.57694981038048669531,
	[12] = FSCALE * 0.54881163609402643262,
	[13] = FSCALE * 0.52204577676101604789,
	[14] = FSCALE * 0.49658530379140951470,
	[15] = FSCALE * 0.47236655274101470713,
	[16] = FSCALE * 0.44932896411722159143,
	[17] = FSCALE * 0.42741493194872666992,
	[18] = FSCALE * 0.40656965974059911188,
	[19] = FSCALE * 0.38674102345450120691,
	[20] = FSCALE * 0.36787944117144232159,
	[21] = FSCALE * 0.34993774911115535467,
	[22] = FSCALE * 0.33287108369807955328,
	[23] = FSCALE * 0.31663676937905321821,
	[24] = FSCALE * 0.30119421191220209664,
	[25] = FSCALE * 0.28650479686019010032,
	[26] = FSCALE * 0.27253179303401260312,
	[27] = FSCALE * 0.25924026064589150757,
	[28] = FSCALE * 0.24659696394160647693,
	[29] = FSCALE * 0.23457028809379765313,
	[30] = FSCALE * 0.22313016014842982893,
	[31] = FSCALE * 0.21224797382674305771,
	[32] = FSCALE * 0.20189651799465540848,
	[33] = FSCALE * 0.19204990862075411423,
	[34] = FSCALE * 0.18268352405273465022,
	[35] = FSCALE * 0.17377394345044512668,
	[36] = FSCALE * 0.16529888822158653829,
	[37] = FSCALE * 0.15723716631362761621,
	[38] = FSCALE * 0.14956861922263505264,
	[39] = FSCALE * 0.14227407158651357185,
	[40] = FSCALE * 0.13533528323661269189,
	[41] = FSCALE * 0.12873490358780421886,
	[42] = FSCALE * 0.12245642825298191021,
	[43] = FSCALE * 0.11648415777349695786,
	[44] = FSCALE * 0.11080315836233388333,
	[45] = FSCALE * 0.10539922456186433678,
	[46] = FSCALE * 0.10025884372280373372,
	[47] = FSCALE * 0.09536916221554961888,
	[48] = FSCALE * 0.09071795328941250337,
	[49] = FSCALE * 0.08629358649937051097,
	[50] = FSCALE * 0.08208499862389879516,
	[51] = FSCALE * 0.07808166600115315231,
	[52] = FSCALE * 0.07427357821433388042,
	[53] = FSCALE * 0.07065121306042958674,
	[54] = FSCALE * 0.06720551273974976512,
	[55] = FSCALE * 0.06392786120670757270,
	[56] = FSCALE * 0.06081006262521796499,
	[57] = FSCALE * 0.05784432087483846296,
	[58] = FSCALE * 0.05502322005640722902,
	[59] = FSCALE * 0.05233970594843239308,
	[60] = FSCALE * 0.04978706836786394297,
	[61] = FSCALE * 0.04735892439114092119,
	[62] = FSCALE * 0.04504920239355780606,
	[63] = FSCALE * 0.04285212686704017991,
	[64] = FSCALE * 0.04076220397836621516,
	[65] = FSCALE * 0.03877420783172200988,
	[66] = FSCALE * 0.03688316740124000544,
	[67] = FSCALE * 0.03508435410084502588,
	[68] = FSCALE * 0.03337326996032607948,
	[69] = FSCALE * 0.03174563637806794323,
	[70] = FSCALE * 0.03019738342231850073,
	[71] = FSCALE * 0.02872463965423942912,
	[72] = FSCALE * 0.02732372244729256080,
	[73] = FSCALE * 0.02599112877875534358,
	[74] = FSCALE * 0.02472352647033939120,
	[75] = FSCALE * 0.02351774585600910823,
	[76] = FSCALE * 0.02237077185616559577,
	[77] = FSCALE * 0.02127973643837716938,
	[78] = FSCALE * 0.02024191144580438847,
	[79] = FSCALE * 0.01925470177538692429,
	[80] = FSCALE * 0.01831563888873418029,
	[81] = FSCALE * 0.01742237463949351138,
	[82] = FSCALE * 0.01657267540176124754,
	[83] = FSCALE * 0.01576441648485449082,
	[84] = FSCALE * 0.01499557682047770621,
	[85] = FSCALE * 0.01426423390899925527,
	[86] = FSCALE * 0.01356855901220093175,
	[87] = FSCALE * 0.01290681258047986886,
	[88] = FSCALE * 0.01227733990306844117,
	[89] = FSCALE * 0.01167856697039544521,
	[90] = FSCALE * 0.01110899653824230649,
	[91] = FSCALE * 0.01056720438385265337,
	[92] = FSCALE * 0.01005183574463358164,
	[93] = FSCALE * 0.00956160193054350793,
	[94] = FSCALE * 0.00909527710169581709,
	[95] = FSCALE * 0.00865169520312063417,
	[96] = FSCALE * 0.00822974704902002884,
	[97] = FSCALE * 0.00782837754922577143,
	[98] = FSCALE * 0.00744658307092434051,
	[99] = FSCALE * 0.00708340892905212004,
	[100] = FSCALE * 0.00673794699908546709,
	[101] = FSCALE * 0.00640933344625638184,
	[102] = FSCALE * 0.00609674656551563610,
	[103] = FSCALE * 0.00579940472684214321,
	[104] = FSCALE * 0.00551656442076077241,
	[105] = FSCALE * 0.00524751839918138427,
	[106] = FSCALE * 0.00499159390691021621,
	[107] = FSCALE * 0.00474815099941147558,
	[108] = FSCALE * 0.00451658094261266798,
	[109] = FSCALE * 0.00429630469075234057,
	[110] = FSCALE * 0.00408677143846406699,
};
#endif

/*
 * Highest index present in ccpu_exp[].  racct_getpcpu() only consults the
 * table while the elapsed time in seconds is <= CCPU_EXP_MAX; keep this in
 * sync with the table above.
 */
#define	CCPU_EXP_MAX	110

/*
 * This function is analogous to the getpcpu() function in the ps(1) command.
 * They should both calculate in the same way so that the racct %cpu
 * calculations are consistent with the values shown by the ps(1) tool.
 * The calculations are more complex in the 4BSD scheduler because of the value
 * of the ccpu variable.  In ULE it is defined to be zero which saves us some
 * work.
302249444Strasz */ 303249444Straszstatic uint64_t 304249444Straszracct_getpcpu(struct proc *p, u_int pcpu) 305249444Strasz{ 306249444Strasz u_int swtime; 307249444Strasz#ifdef SCHED_4BSD 308249444Strasz fixpt_t pctcpu, pctcpu_next; 309249444Strasz#endif 310249444Strasz#ifdef SMP 311249444Strasz struct pcpu *pc; 312249444Strasz int found; 313249444Strasz#endif 314249444Strasz fixpt_t p_pctcpu; 315249444Strasz struct thread *td; 316249444Strasz 317249444Strasz /* 318249444Strasz * If the process is swapped out, we count its %cpu usage as zero. 319249444Strasz * This behaviour is consistent with the userland ps(1) tool. 320249444Strasz */ 321249444Strasz if ((p->p_flag & P_INMEM) == 0) 322249444Strasz return (0); 323249444Strasz swtime = (ticks - p->p_swtick) / hz; 324249444Strasz 325249444Strasz /* 326249444Strasz * For short-lived processes, the sched_pctcpu() returns small 327249444Strasz * values even for cpu intensive processes. Therefore we use 328249444Strasz * our own estimate in this case. 329249444Strasz */ 330249444Strasz if (swtime < RACCT_PCPU_SECS) 331249444Strasz return (pcpu); 332249444Strasz 333249444Strasz p_pctcpu = 0; 334249444Strasz FOREACH_THREAD_IN_PROC(p, td) { 335249444Strasz if (td == PCPU_GET(idlethread)) 336249444Strasz continue; 337249444Strasz#ifdef SMP 338249444Strasz found = 0; 339249444Strasz STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { 340249444Strasz if (td == pc->pc_idlethread) { 341249444Strasz found = 1; 342249444Strasz break; 343249444Strasz } 344249444Strasz } 345249444Strasz if (found) 346249444Strasz continue; 347249444Strasz#endif 348249444Strasz thread_lock(td); 349249444Strasz#ifdef SCHED_4BSD 350249444Strasz pctcpu = sched_pctcpu(td); 351249444Strasz /* Count also the yet unfinished second. 
*/ 352249444Strasz pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT; 353249444Strasz pctcpu_next += sched_pctcpu_delta(td); 354249444Strasz p_pctcpu += max(pctcpu, pctcpu_next); 355249444Strasz#else 356249444Strasz /* 357249444Strasz * In ULE the %cpu statistics are updated on every 358249444Strasz * sched_pctcpu() call. So special calculations to 359249444Strasz * account for the latest (unfinished) second are 360249444Strasz * not needed. 361249444Strasz */ 362249444Strasz p_pctcpu += sched_pctcpu(td); 363249444Strasz#endif 364249444Strasz thread_unlock(td); 365249444Strasz } 366249444Strasz 367249444Strasz#ifdef SCHED_4BSD 368249444Strasz if (swtime <= CCPU_EXP_MAX) 369249444Strasz return ((100 * (uint64_t)p_pctcpu * 1000000) / 370249444Strasz (FSCALE - ccpu_exp[swtime])); 371249444Strasz#endif 372249444Strasz 373249444Strasz return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE); 374249444Strasz} 375249444Strasz 376220137Straszstatic void 377220137Straszracct_add_racct(struct racct *dest, const struct racct *src) 378220137Strasz{ 379220137Strasz int i; 380220137Strasz 381220137Strasz mtx_assert(&racct_lock, MA_OWNED); 382220137Strasz 383220137Strasz /* 384220137Strasz * Update resource usage in dest. 385220137Strasz */ 386220137Strasz for (i = 0; i <= RACCT_MAX; i++) { 387220137Strasz KASSERT(dest->r_resources[i] >= 0, 388249444Strasz ("%s: resource %d propagation meltdown: dest < 0", 389249444Strasz __func__, i)); 390220137Strasz KASSERT(src->r_resources[i] >= 0, 391249444Strasz ("%s: resource %d propagation meltdown: src < 0", 392249444Strasz __func__, i)); 393220137Strasz dest->r_resources[i] += src->r_resources[i]; 394220137Strasz } 395220137Strasz} 396220137Strasz 397220137Straszstatic void 398220137Straszracct_sub_racct(struct racct *dest, const struct racct *src) 399220137Strasz{ 400220137Strasz int i; 401220137Strasz 402220137Strasz mtx_assert(&racct_lock, MA_OWNED); 403220137Strasz 404220137Strasz /* 405220137Strasz * Update resource usage in dest. 
406220137Strasz */ 407220137Strasz for (i = 0; i <= RACCT_MAX; i++) { 408249444Strasz if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) { 409220137Strasz KASSERT(dest->r_resources[i] >= 0, 410249444Strasz ("%s: resource %d propagation meltdown: dest < 0", 411249444Strasz __func__, i)); 412220137Strasz KASSERT(src->r_resources[i] >= 0, 413249444Strasz ("%s: resource %d propagation meltdown: src < 0", 414249444Strasz __func__, i)); 415220137Strasz KASSERT(src->r_resources[i] <= dest->r_resources[i], 416249444Strasz ("%s: resource %d propagation meltdown: src > dest", 417249444Strasz __func__, i)); 418220137Strasz } 419249444Strasz if (RACCT_CAN_DROP(i)) { 420220137Strasz dest->r_resources[i] -= src->r_resources[i]; 421220137Strasz if (dest->r_resources[i] < 0) { 422249444Strasz KASSERT(RACCT_IS_SLOPPY(i) || 423249444Strasz RACCT_IS_DECAYING(i), 424249444Strasz ("%s: resource %d usage < 0", __func__, i)); 425220137Strasz dest->r_resources[i] = 0; 426220137Strasz } 427220137Strasz } 428220137Strasz } 429220137Strasz} 430220137Strasz 431220137Straszvoid 432220137Straszracct_create(struct racct **racctp) 433220137Strasz{ 434220137Strasz 435220137Strasz SDT_PROBE(racct, kernel, racct, create, racctp, 0, 0, 0, 0); 436220137Strasz 437220137Strasz KASSERT(*racctp == NULL, ("racct already allocated")); 438220137Strasz 439220137Strasz *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO); 440220137Strasz} 441220137Strasz 442220137Straszstatic void 443220137Straszracct_destroy_locked(struct racct **racctp) 444220137Strasz{ 445220137Strasz int i; 446220137Strasz struct racct *racct; 447220137Strasz 448220137Strasz SDT_PROBE(racct, kernel, racct, destroy, racctp, 0, 0, 0, 0); 449220137Strasz 450220137Strasz mtx_assert(&racct_lock, MA_OWNED); 451220137Strasz KASSERT(racctp != NULL, ("NULL racctp")); 452220137Strasz KASSERT(*racctp != NULL, ("NULL racct")); 453220137Strasz 454220137Strasz racct = *racctp; 455220137Strasz 456220137Strasz for (i = 0; i <= RACCT_MAX; i++) { 
457223844Strasz if (RACCT_IS_SLOPPY(i)) 458220137Strasz continue; 459223844Strasz if (!RACCT_IS_RECLAIMABLE(i)) 460220137Strasz continue; 461220137Strasz KASSERT(racct->r_resources[i] == 0, 462220137Strasz ("destroying non-empty racct: " 463220137Strasz "%ju allocated for resource %d\n", 464220137Strasz racct->r_resources[i], i)); 465220137Strasz } 466220137Strasz uma_zfree(racct_zone, racct); 467220137Strasz *racctp = NULL; 468220137Strasz} 469220137Strasz 470220137Straszvoid 471220137Straszracct_destroy(struct racct **racct) 472220137Strasz{ 473220137Strasz 474220137Strasz mtx_lock(&racct_lock); 475220137Strasz racct_destroy_locked(racct); 476220137Strasz mtx_unlock(&racct_lock); 477220137Strasz} 478220137Strasz 479220137Strasz/* 480220137Strasz * Increase consumption of 'resource' by 'amount' for 'racct' 481220137Strasz * and all its parents. Differently from other cases, 'amount' here 482220137Strasz * may be less than zero. 483220137Strasz */ 484220137Straszstatic void 485220137Straszracct_alloc_resource(struct racct *racct, int resource, 486220137Strasz uint64_t amount) 487220137Strasz{ 488220137Strasz 489220137Strasz mtx_assert(&racct_lock, MA_OWNED); 490220137Strasz KASSERT(racct != NULL, ("NULL racct")); 491220137Strasz 492220137Strasz racct->r_resources[resource] += amount; 493220137Strasz if (racct->r_resources[resource] < 0) { 494249444Strasz KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource), 495249444Strasz ("%s: resource %d usage < 0", __func__, resource)); 496220137Strasz racct->r_resources[resource] = 0; 497220137Strasz } 498249444Strasz 499249444Strasz /* 500249444Strasz * There are some cases where the racct %cpu resource would grow 501249444Strasz * beyond 100%. 502249444Strasz * For example in racct_proc_exit() we add the process %cpu usage 503249444Strasz * to the ucred racct containers. If too many processes terminated 504249444Strasz * in a short time span, the ucred %cpu resource could grow too much. 
505249444Strasz * Also, the 4BSD scheduler sometimes returns for a thread more than 506249444Strasz * 100% cpu usage. So we set a boundary here to 100%. 507249444Strasz */ 508249444Strasz if ((resource == RACCT_PCTCPU) && 509249444Strasz (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000)) 510249444Strasz racct->r_resources[RACCT_PCTCPU] = 100 * 1000000; 511220137Strasz} 512220137Strasz 513226092Straszstatic int 514226092Straszracct_add_locked(struct proc *p, int resource, uint64_t amount) 515220137Strasz{ 516220137Strasz#ifdef RCTL 517220137Strasz int error; 518220137Strasz#endif 519220137Strasz 520220137Strasz SDT_PROBE(racct, kernel, rusage, add, p, resource, amount, 0, 0); 521220137Strasz 522220137Strasz /* 523220137Strasz * We need proc lock to dereference p->p_ucred. 524220137Strasz */ 525220137Strasz PROC_LOCK_ASSERT(p, MA_OWNED); 526220137Strasz 527220137Strasz#ifdef RCTL 528220137Strasz error = rctl_enforce(p, resource, amount); 529223844Strasz if (error && RACCT_IS_DENIABLE(resource)) { 530262057Savg SDT_PROBE(racct, kernel, rusage, add__failure, p, resource, 531220137Strasz amount, 0, 0); 532220137Strasz return (error); 533220137Strasz } 534220137Strasz#endif 535220137Strasz racct_alloc_resource(p->p_racct, resource, amount); 536220137Strasz racct_add_cred_locked(p->p_ucred, resource, amount); 537220137Strasz 538220137Strasz return (0); 539220137Strasz} 540220137Strasz 541226092Strasz/* 542226092Strasz * Increase allocation of 'resource' by 'amount' for process 'p'. 543226092Strasz * Return 0 if it's below limits, or errno, if it's not. 
544226092Strasz */ 545226092Straszint 546226092Straszracct_add(struct proc *p, int resource, uint64_t amount) 547226092Strasz{ 548226092Strasz int error; 549226092Strasz 550226092Strasz mtx_lock(&racct_lock); 551226092Strasz error = racct_add_locked(p, resource, amount); 552226092Strasz mtx_unlock(&racct_lock); 553226092Strasz return (error); 554226092Strasz} 555226092Strasz 556220137Straszstatic void 557220137Straszracct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) 558220137Strasz{ 559220137Strasz struct prison *pr; 560220137Strasz 561262057Savg SDT_PROBE(racct, kernel, rusage, add__cred, cred, resource, amount, 562220137Strasz 0, 0); 563220137Strasz 564220137Strasz racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, amount); 565220137Strasz for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) 566221362Strasz racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource, 567221362Strasz amount); 568220137Strasz racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, amount); 569220137Strasz} 570220137Strasz 571220137Strasz/* 572220137Strasz * Increase allocation of 'resource' by 'amount' for credential 'cred'. 573220137Strasz * Doesn't check for limits and never fails. 574220137Strasz * 575220137Strasz * XXX: Shouldn't this ever return an error? 576220137Strasz */ 577220137Straszvoid 578220137Straszracct_add_cred(struct ucred *cred, int resource, uint64_t amount) 579220137Strasz{ 580220137Strasz 581220137Strasz mtx_lock(&racct_lock); 582220137Strasz racct_add_cred_locked(cred, resource, amount); 583220137Strasz mtx_unlock(&racct_lock); 584220137Strasz} 585220137Strasz 586220137Strasz/* 587220137Strasz * Increase allocation of 'resource' by 'amount' for process 'p'. 588220137Strasz * Doesn't check for limits and never fails. 
589220137Strasz */ 590220137Straszvoid 591220137Straszracct_add_force(struct proc *p, int resource, uint64_t amount) 592220137Strasz{ 593220137Strasz 594262057Savg SDT_PROBE(racct, kernel, rusage, add__force, p, resource, amount, 0, 0); 595220137Strasz 596220137Strasz /* 597220137Strasz * We need proc lock to dereference p->p_ucred. 598220137Strasz */ 599220137Strasz PROC_LOCK_ASSERT(p, MA_OWNED); 600220137Strasz 601220137Strasz mtx_lock(&racct_lock); 602220137Strasz racct_alloc_resource(p->p_racct, resource, amount); 603220137Strasz mtx_unlock(&racct_lock); 604220137Strasz racct_add_cred(p->p_ucred, resource, amount); 605220137Strasz} 606220137Strasz 607220137Straszstatic int 608220137Straszracct_set_locked(struct proc *p, int resource, uint64_t amount) 609220137Strasz{ 610249444Strasz int64_t old_amount, decayed_amount; 611249444Strasz int64_t diff_proc, diff_cred; 612220137Strasz#ifdef RCTL 613220137Strasz int error; 614220137Strasz#endif 615220137Strasz 616220137Strasz SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0); 617220137Strasz 618220137Strasz /* 619220137Strasz * We need proc lock to dereference p->p_ucred. 620220137Strasz */ 621220137Strasz PROC_LOCK_ASSERT(p, MA_OWNED); 622220137Strasz 623249444Strasz old_amount = p->p_racct->r_resources[resource]; 624249444Strasz /* 625249444Strasz * The diffs may be negative. 626249444Strasz */ 627249444Strasz diff_proc = amount - old_amount; 628249444Strasz if (RACCT_IS_DECAYING(resource)) { 629249444Strasz /* 630249444Strasz * Resources in per-credential racct containers may decay. 631249444Strasz * If this is the case, we need to calculate the difference 632249444Strasz * between the new amount and the proportional value of the 633249444Strasz * old amount that has decayed in the ucred racct containers. 
634249444Strasz */ 635249444Strasz decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; 636249444Strasz diff_cred = amount - decayed_amount; 637249444Strasz } else 638249444Strasz diff_cred = diff_proc; 639220137Strasz#ifdef notyet 640249444Strasz KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource), 641249444Strasz ("%s: usage of non-droppable resource %d dropping", __func__, 642220137Strasz resource)); 643220137Strasz#endif 644220137Strasz#ifdef RCTL 645249444Strasz if (diff_proc > 0) { 646249444Strasz error = rctl_enforce(p, resource, diff_proc); 647223844Strasz if (error && RACCT_IS_DENIABLE(resource)) { 648262057Savg SDT_PROBE(racct, kernel, rusage, set__failure, p, 649220137Strasz resource, amount, 0, 0); 650220137Strasz return (error); 651220137Strasz } 652220137Strasz } 653220137Strasz#endif 654249444Strasz racct_alloc_resource(p->p_racct, resource, diff_proc); 655249444Strasz if (diff_cred > 0) 656249444Strasz racct_add_cred_locked(p->p_ucred, resource, diff_cred); 657249444Strasz else if (diff_cred < 0) 658249444Strasz racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); 659220137Strasz 660220137Strasz return (0); 661220137Strasz} 662220137Strasz 663220137Strasz/* 664220137Strasz * Set allocation of 'resource' to 'amount' for process 'p'. 665220137Strasz * Return 0 if it's below limits, or errno, if it's not. 666220137Strasz * 667220137Strasz * Note that decreasing the allocation always returns 0, 668220137Strasz * even if it's above the limit. 
669220137Strasz */ 670220137Straszint 671220137Straszracct_set(struct proc *p, int resource, uint64_t amount) 672220137Strasz{ 673220137Strasz int error; 674220137Strasz 675220137Strasz mtx_lock(&racct_lock); 676220137Strasz error = racct_set_locked(p, resource, amount); 677220137Strasz mtx_unlock(&racct_lock); 678220137Strasz return (error); 679220137Strasz} 680220137Strasz 681249444Straszstatic void 682249444Straszracct_set_force_locked(struct proc *p, int resource, uint64_t amount) 683220137Strasz{ 684249444Strasz int64_t old_amount, decayed_amount; 685249444Strasz int64_t diff_proc, diff_cred; 686220137Strasz 687220137Strasz SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0); 688220137Strasz 689220137Strasz /* 690220137Strasz * We need proc lock to dereference p->p_ucred. 691220137Strasz */ 692220137Strasz PROC_LOCK_ASSERT(p, MA_OWNED); 693220137Strasz 694249444Strasz old_amount = p->p_racct->r_resources[resource]; 695249444Strasz /* 696249444Strasz * The diffs may be negative. 697249444Strasz */ 698249444Strasz diff_proc = amount - old_amount; 699249444Strasz if (RACCT_IS_DECAYING(resource)) { 700249444Strasz /* 701249444Strasz * Resources in per-credential racct containers may decay. 702249444Strasz * If this is the case, we need to calculate the difference 703249444Strasz * between the new amount and the proportional value of the 704249444Strasz * old amount that has decayed in the ucred racct containers. 
705249444Strasz */ 706249444Strasz decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; 707249444Strasz diff_cred = amount - decayed_amount; 708249444Strasz } else 709249444Strasz diff_cred = diff_proc; 710249444Strasz 711249444Strasz racct_alloc_resource(p->p_racct, resource, diff_proc); 712249444Strasz if (diff_cred > 0) 713249444Strasz racct_add_cred_locked(p->p_ucred, resource, diff_cred); 714249444Strasz else if (diff_cred < 0) 715249444Strasz racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); 716249444Strasz} 717249444Strasz 718249444Straszvoid 719249444Straszracct_set_force(struct proc *p, int resource, uint64_t amount) 720249444Strasz{ 721220137Strasz mtx_lock(&racct_lock); 722249444Strasz racct_set_force_locked(p, resource, amount); 723220137Strasz mtx_unlock(&racct_lock); 724220137Strasz} 725220137Strasz 726220137Strasz/* 727220137Strasz * Returns amount of 'resource' the process 'p' can keep allocated. 728220137Strasz * Allocating more than that would be denied, unless the resource 729220137Strasz * is marked undeniable. Amount of already allocated resource does 730220137Strasz * not matter. 731220137Strasz */ 732220137Straszuint64_t 733220137Straszracct_get_limit(struct proc *p, int resource) 734220137Strasz{ 735220137Strasz 736220137Strasz#ifdef RCTL 737220137Strasz return (rctl_get_limit(p, resource)); 738220137Strasz#else 739220137Strasz return (UINT64_MAX); 740220137Strasz#endif 741220137Strasz} 742220137Strasz 743220137Strasz/* 744220137Strasz * Returns amount of 'resource' the process 'p' can keep allocated. 745220137Strasz * Allocating more than that would be denied, unless the resource 746220137Strasz * is marked undeniable. Amount of already allocated resource does 747220137Strasz * matter. 
748220137Strasz */ 749220137Straszuint64_t 750220137Straszracct_get_available(struct proc *p, int resource) 751220137Strasz{ 752220137Strasz 753220137Strasz#ifdef RCTL 754220137Strasz return (rctl_get_available(p, resource)); 755220137Strasz#else 756220137Strasz return (UINT64_MAX); 757220137Strasz#endif 758220137Strasz} 759220137Strasz 760220137Strasz/* 761249444Strasz * Returns amount of the %cpu resource that process 'p' can add to its %cpu 762249444Strasz * utilization. Adding more than that would lead to the process being 763249444Strasz * throttled. 764249444Strasz */ 765249444Straszstatic int64_t 766249444Straszracct_pcpu_available(struct proc *p) 767249444Strasz{ 768249444Strasz 769249444Strasz#ifdef RCTL 770249444Strasz return (rctl_pcpu_available(p)); 771249444Strasz#else 772249444Strasz return (INT64_MAX); 773249444Strasz#endif 774249444Strasz} 775249444Strasz 776249444Strasz/* 777220137Strasz * Decrease allocation of 'resource' by 'amount' for process 'p'. 778220137Strasz */ 779220137Straszvoid 780220137Straszracct_sub(struct proc *p, int resource, uint64_t amount) 781220137Strasz{ 782220137Strasz 783220137Strasz SDT_PROBE(racct, kernel, rusage, sub, p, resource, amount, 0, 0); 784220137Strasz 785220137Strasz /* 786220137Strasz * We need proc lock to dereference p->p_ucred. 
787220137Strasz */ 788220137Strasz PROC_LOCK_ASSERT(p, MA_OWNED); 789249444Strasz KASSERT(RACCT_CAN_DROP(resource), 790249444Strasz ("%s: called for non-droppable resource %d", __func__, resource)); 791220137Strasz 792220137Strasz mtx_lock(&racct_lock); 793220137Strasz KASSERT(amount <= p->p_racct->r_resources[resource], 794249444Strasz ("%s: freeing %ju of resource %d, which is more " 795249444Strasz "than allocated %jd for %s (pid %d)", __func__, amount, resource, 796220137Strasz (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid)); 797220137Strasz 798220137Strasz racct_alloc_resource(p->p_racct, resource, -amount); 799220137Strasz racct_sub_cred_locked(p->p_ucred, resource, amount); 800220137Strasz mtx_unlock(&racct_lock); 801220137Strasz} 802220137Strasz 803220137Straszstatic void 804220137Straszracct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) 805220137Strasz{ 806220137Strasz struct prison *pr; 807220137Strasz 808262057Savg SDT_PROBE(racct, kernel, rusage, sub__cred, cred, resource, amount, 809220137Strasz 0, 0); 810220137Strasz 811220137Strasz#ifdef notyet 812249444Strasz KASSERT(RACCT_CAN_DROP(resource), 813249444Strasz ("%s: called for resource %d which can not drop", __func__, 814220137Strasz resource)); 815220137Strasz#endif 816220137Strasz 817220137Strasz racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, -amount); 818220137Strasz for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) 819221362Strasz racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource, 820221362Strasz -amount); 821220137Strasz racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, -amount); 822220137Strasz} 823220137Strasz 824220137Strasz/* 825220137Strasz * Decrease allocation of 'resource' by 'amount' for credential 'cred'. 
826220137Strasz */ 827220137Straszvoid 828220137Straszracct_sub_cred(struct ucred *cred, int resource, uint64_t amount) 829220137Strasz{ 830220137Strasz 831220137Strasz mtx_lock(&racct_lock); 832220137Strasz racct_sub_cred_locked(cred, resource, amount); 833220137Strasz mtx_unlock(&racct_lock); 834220137Strasz} 835220137Strasz 836220137Strasz/* 837220137Strasz * Inherit resource usage information from the parent process. 838220137Strasz */ 839220137Straszint 840220137Straszracct_proc_fork(struct proc *parent, struct proc *child) 841220137Strasz{ 842220137Strasz int i, error = 0; 843220137Strasz 844220137Strasz /* 845220137Strasz * Create racct for the child process. 846220137Strasz */ 847220137Strasz racct_create(&child->p_racct); 848220137Strasz 849220137Strasz PROC_LOCK(parent); 850220137Strasz PROC_LOCK(child); 851220137Strasz mtx_lock(&racct_lock); 852220137Strasz 853226092Strasz#ifdef RCTL 854226092Strasz error = rctl_proc_fork(parent, child); 855226092Strasz if (error != 0) 856226092Strasz goto out; 857226092Strasz#endif 858226092Strasz 859249444Strasz /* Init process cpu time. */ 860249444Strasz child->p_prev_runtime = 0; 861249444Strasz child->p_throttled = 0; 862249444Strasz 863220137Strasz /* 864220137Strasz * Inherit resource usage. 
865220137Strasz */ 866220137Strasz for (i = 0; i <= RACCT_MAX; i++) { 867220137Strasz if (parent->p_racct->r_resources[i] == 0 || 868223844Strasz !RACCT_IS_INHERITABLE(i)) 869220137Strasz continue; 870220137Strasz 871220137Strasz error = racct_set_locked(child, i, 872220137Strasz parent->p_racct->r_resources[i]); 873226092Strasz if (error != 0) 874220137Strasz goto out; 875220137Strasz } 876220137Strasz 877226092Strasz error = racct_add_locked(child, RACCT_NPROC, 1); 878226092Strasz error += racct_add_locked(child, RACCT_NTHR, 1); 879220137Strasz 880220137Straszout: 881220137Strasz mtx_unlock(&racct_lock); 882220137Strasz PROC_UNLOCK(child); 883220137Strasz PROC_UNLOCK(parent); 884220137Strasz 885236239Strasz if (error != 0) 886236239Strasz racct_proc_exit(child); 887236239Strasz 888220137Strasz return (error); 889220137Strasz} 890220137Strasz 891226092Strasz/* 892226092Strasz * Called at the end of fork1(), to handle rules that require the process 893226092Strasz * to be fully initialized. 894226092Strasz */ 895220137Straszvoid 896226092Straszracct_proc_fork_done(struct proc *child) 897226092Strasz{ 898226092Strasz 899226092Strasz#ifdef RCTL 900226092Strasz PROC_LOCK(child); 901226092Strasz mtx_lock(&racct_lock); 902226092Strasz rctl_enforce(child, RACCT_NPROC, 0); 903226092Strasz rctl_enforce(child, RACCT_NTHR, 0); 904226092Strasz mtx_unlock(&racct_lock); 905226092Strasz PROC_UNLOCK(child); 906226092Strasz#endif 907226092Strasz} 908226092Strasz 909226092Straszvoid 910220137Straszracct_proc_exit(struct proc *p) 911220137Strasz{ 912225364Strasz int i; 913220137Strasz uint64_t runtime; 914249444Strasz struct timeval wallclock; 915249444Strasz uint64_t pct_estimate, pct; 916220137Strasz 917220137Strasz PROC_LOCK(p); 918220137Strasz /* 919220137Strasz * We don't need to calculate rux, proc_reap() has already done this. 
920220137Strasz */ 921220137Strasz runtime = cputick2usec(p->p_rux.rux_runtime); 922220137Strasz#ifdef notyet 923220137Strasz KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); 924220137Strasz#else 925220137Strasz if (runtime < p->p_prev_runtime) 926220137Strasz runtime = p->p_prev_runtime; 927220137Strasz#endif 928249444Strasz microuptime(&wallclock); 929249444Strasz timevalsub(&wallclock, &p->p_stats->p_start); 930249444Strasz if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { 931249444Strasz pct_estimate = (1000000 * runtime * 100) / 932249444Strasz ((uint64_t)wallclock.tv_sec * 1000000 + 933249444Strasz wallclock.tv_usec); 934249444Strasz } else 935249444Strasz pct_estimate = 0; 936249444Strasz pct = racct_getpcpu(p, pct_estimate); 937249444Strasz 938225364Strasz mtx_lock(&racct_lock); 939225364Strasz racct_set_locked(p, RACCT_CPU, runtime); 940249444Strasz racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct); 941220137Strasz 942225364Strasz for (i = 0; i <= RACCT_MAX; i++) { 943225364Strasz if (p->p_racct->r_resources[i] == 0) 944225364Strasz continue; 945225364Strasz if (!RACCT_IS_RECLAIMABLE(i)) 946225364Strasz continue; 947225364Strasz racct_set_locked(p, i, 0); 948225364Strasz } 949225364Strasz 950225364Strasz mtx_unlock(&racct_lock); 951220137Strasz PROC_UNLOCK(p); 952220137Strasz 953220137Strasz#ifdef RCTL 954220137Strasz rctl_racct_release(p->p_racct); 955220137Strasz#endif 956220137Strasz racct_destroy(&p->p_racct); 957220137Strasz} 958220137Strasz 959220137Strasz/* 960220137Strasz * Called after credentials change, to move resource utilisation 961220137Strasz * between raccts. 
962220137Strasz */ 963220137Straszvoid 964220137Straszracct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, 965220137Strasz struct ucred *newcred) 966220137Strasz{ 967220137Strasz struct uidinfo *olduip, *newuip; 968220137Strasz struct loginclass *oldlc, *newlc; 969220137Strasz struct prison *oldpr, *newpr, *pr; 970220137Strasz 971220137Strasz PROC_LOCK_ASSERT(p, MA_NOTOWNED); 972220137Strasz 973220137Strasz newuip = newcred->cr_ruidinfo; 974220137Strasz olduip = oldcred->cr_ruidinfo; 975220137Strasz newlc = newcred->cr_loginclass; 976220137Strasz oldlc = oldcred->cr_loginclass; 977220137Strasz newpr = newcred->cr_prison; 978220137Strasz oldpr = oldcred->cr_prison; 979220137Strasz 980220137Strasz mtx_lock(&racct_lock); 981220137Strasz if (newuip != olduip) { 982220137Strasz racct_sub_racct(olduip->ui_racct, p->p_racct); 983220137Strasz racct_add_racct(newuip->ui_racct, p->p_racct); 984220137Strasz } 985220137Strasz if (newlc != oldlc) { 986220137Strasz racct_sub_racct(oldlc->lc_racct, p->p_racct); 987220137Strasz racct_add_racct(newlc->lc_racct, p->p_racct); 988220137Strasz } 989220137Strasz if (newpr != oldpr) { 990220137Strasz for (pr = oldpr; pr != NULL; pr = pr->pr_parent) 991221362Strasz racct_sub_racct(pr->pr_prison_racct->prr_racct, 992221362Strasz p->p_racct); 993220137Strasz for (pr = newpr; pr != NULL; pr = pr->pr_parent) 994221362Strasz racct_add_racct(pr->pr_prison_racct->prr_racct, 995221362Strasz p->p_racct); 996220137Strasz } 997220137Strasz mtx_unlock(&racct_lock); 998220137Strasz 999220137Strasz#ifdef RCTL 1000220137Strasz rctl_proc_ucred_changed(p, newcred); 1001220137Strasz#endif 1002220137Strasz} 1003220137Strasz 1004236253Straszvoid 1005236253Straszracct_move(struct racct *dest, struct racct *src) 1006236253Strasz{ 1007236253Strasz 1008236253Strasz mtx_lock(&racct_lock); 1009236253Strasz 1010236253Strasz racct_add_racct(dest, src); 1011236253Strasz racct_sub_racct(src, src); 1012236253Strasz 1013236253Strasz mtx_unlock(&racct_lock); 
1014236253Strasz} 1015236253Strasz 1016220137Straszstatic void 1017249444Straszracct_proc_throttle(struct proc *p) 1018249444Strasz{ 1019249444Strasz struct thread *td; 1020249444Strasz#ifdef SMP 1021249444Strasz int cpuid; 1022249444Strasz#endif 1023249444Strasz 1024249444Strasz PROC_LOCK_ASSERT(p, MA_OWNED); 1025249444Strasz 1026249444Strasz /* 1027249444Strasz * Do not block kernel processes. Also do not block processes with 1028249444Strasz * low %cpu utilization to improve interactivity. 1029249444Strasz */ 1030249444Strasz if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) || 1031249444Strasz (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold)) 1032249444Strasz return; 1033249444Strasz p->p_throttled = 1; 1034249444Strasz 1035249444Strasz FOREACH_THREAD_IN_PROC(p, td) { 1036249444Strasz thread_lock(td); 1037249444Strasz switch (td->td_state) { 1038249444Strasz case TDS_RUNQ: 1039249444Strasz /* 1040249444Strasz * If the thread is on the scheduler run-queue, we can 1041249444Strasz * not just remove it from there. So we set the flag 1042249444Strasz * TDF_NEEDRESCHED for the thread, so that once it is 1043249444Strasz * running, it is taken off the cpu as soon as possible. 1044249444Strasz */ 1045249444Strasz td->td_flags |= TDF_NEEDRESCHED; 1046249444Strasz break; 1047249444Strasz case TDS_RUNNING: 1048249444Strasz /* 1049249444Strasz * If the thread is running, we request a context 1050249444Strasz * switch for it by setting the TDF_NEEDRESCHED flag. 
1051249444Strasz */ 1052249444Strasz td->td_flags |= TDF_NEEDRESCHED; 1053249444Strasz#ifdef SMP 1054249444Strasz cpuid = td->td_oncpu; 1055249444Strasz if ((cpuid != NOCPU) && (td != curthread)) 1056249444Strasz ipi_cpu(cpuid, IPI_AST); 1057249444Strasz#endif 1058249444Strasz break; 1059249444Strasz default: 1060249444Strasz break; 1061249444Strasz } 1062249444Strasz thread_unlock(td); 1063249444Strasz } 1064249444Strasz} 1065249444Strasz 1066249444Straszstatic void 1067249444Straszracct_proc_wakeup(struct proc *p) 1068249444Strasz{ 1069249444Strasz PROC_LOCK_ASSERT(p, MA_OWNED); 1070249444Strasz 1071249444Strasz if (p->p_throttled) { 1072249444Strasz p->p_throttled = 0; 1073249444Strasz wakeup(p->p_racct); 1074249444Strasz } 1075249444Strasz} 1076249444Strasz 1077249444Straszstatic void 1078249444Straszracct_decay_resource(struct racct *racct, void * res, void* dummy) 1079249444Strasz{ 1080249444Strasz int resource; 1081249444Strasz int64_t r_old, r_new; 1082249444Strasz 1083249444Strasz resource = *(int *)res; 1084249444Strasz r_old = racct->r_resources[resource]; 1085249444Strasz 1086249444Strasz /* If there is nothing to decay, just exit. 
*/ 1087249444Strasz if (r_old <= 0) 1088249444Strasz return; 1089249444Strasz 1090249444Strasz mtx_lock(&racct_lock); 1091249444Strasz r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; 1092249444Strasz racct->r_resources[resource] = r_new; 1093249444Strasz mtx_unlock(&racct_lock); 1094249444Strasz} 1095249444Strasz 1096249444Straszstatic void 1097249444Straszracct_decay(int resource) 1098249444Strasz{ 1099249444Strasz ui_racct_foreach(racct_decay_resource, &resource, NULL); 1100249444Strasz loginclass_racct_foreach(racct_decay_resource, &resource, NULL); 1101249444Strasz prison_racct_foreach(racct_decay_resource, &resource, NULL); 1102249444Strasz} 1103249444Strasz 1104249444Straszstatic void 1105220137Straszracctd(void) 1106220137Strasz{ 1107220137Strasz struct thread *td; 1108220137Strasz struct proc *p; 1109220137Strasz struct timeval wallclock; 1110220137Strasz uint64_t runtime; 1111249444Strasz uint64_t pct, pct_estimate; 1112220137Strasz 1113220137Strasz for (;;) { 1114249444Strasz racct_decay(RACCT_PCTCPU); 1115249444Strasz 1116220137Strasz sx_slock(&allproc_lock); 1117220137Strasz 1118249444Strasz LIST_FOREACH(p, &zombproc, p_list) { 1119249444Strasz PROC_LOCK(p); 1120249444Strasz racct_set(p, RACCT_PCTCPU, 0); 1121249444Strasz PROC_UNLOCK(p); 1122249444Strasz } 1123249444Strasz 1124220137Strasz FOREACH_PROC_IN_SYSTEM(p) { 1125249444Strasz PROC_LOCK(p); 1126249444Strasz if (p->p_state != PRS_NORMAL) { 1127249444Strasz PROC_UNLOCK(p); 1128220137Strasz continue; 1129249444Strasz } 1130220137Strasz 1131220137Strasz microuptime(&wallclock); 1132220137Strasz timevalsub(&wallclock, &p->p_stats->p_start); 1133220137Strasz PROC_SLOCK(p); 1134235893Strasz FOREACH_THREAD_IN_PROC(p, td) 1135220137Strasz ruxagg(p, td); 1136220137Strasz runtime = cputick2usec(p->p_rux.rux_runtime); 1137220137Strasz PROC_SUNLOCK(p); 1138220137Strasz#ifdef notyet 1139220137Strasz KASSERT(runtime >= p->p_prev_runtime, 1140220137Strasz ("runtime < p_prev_runtime")); 1141220137Strasz#else 
1142220137Strasz if (runtime < p->p_prev_runtime) 1143220137Strasz runtime = p->p_prev_runtime; 1144220137Strasz#endif 1145220137Strasz p->p_prev_runtime = runtime; 1146249444Strasz if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { 1147249444Strasz pct_estimate = (1000000 * runtime * 100) / 1148249444Strasz ((uint64_t)wallclock.tv_sec * 1000000 + 1149249444Strasz wallclock.tv_usec); 1150249444Strasz } else 1151249444Strasz pct_estimate = 0; 1152249444Strasz pct = racct_getpcpu(p, pct_estimate); 1153220137Strasz mtx_lock(&racct_lock); 1154249444Strasz racct_set_force_locked(p, RACCT_PCTCPU, pct); 1155220137Strasz racct_set_locked(p, RACCT_CPU, runtime); 1156220137Strasz racct_set_locked(p, RACCT_WALLCLOCK, 1157233543Sjh (uint64_t)wallclock.tv_sec * 1000000 + 1158233543Sjh wallclock.tv_usec); 1159220137Strasz mtx_unlock(&racct_lock); 1160220137Strasz PROC_UNLOCK(p); 1161220137Strasz } 1162249444Strasz 1163249444Strasz /* 1164249444Strasz * To ensure that processes are throttled in a fair way, we need 1165249444Strasz * to iterate over all processes again and check the limits 1166249444Strasz * for %cpu resource only after ucred racct containers have been 1167249444Strasz * properly filled. 
1168249444Strasz */ 1169249444Strasz FOREACH_PROC_IN_SYSTEM(p) { 1170249444Strasz PROC_LOCK(p); 1171249444Strasz if (p->p_state != PRS_NORMAL) { 1172249444Strasz PROC_UNLOCK(p); 1173249444Strasz continue; 1174249444Strasz } 1175249444Strasz 1176249444Strasz if (racct_pcpu_available(p) <= 0) 1177249444Strasz racct_proc_throttle(p); 1178249444Strasz else if (p->p_throttled) 1179249444Strasz racct_proc_wakeup(p); 1180249444Strasz PROC_UNLOCK(p); 1181249444Strasz } 1182220137Strasz sx_sunlock(&allproc_lock); 1183220137Strasz pause("-", hz); 1184220137Strasz } 1185220137Strasz} 1186220137Strasz 1187220137Straszstatic struct kproc_desc racctd_kp = { 1188220137Strasz "racctd", 1189220137Strasz racctd, 1190220137Strasz NULL 1191220137Strasz}; 1192220137StraszSYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, kproc_start, &racctd_kp); 1193220137Strasz 1194220137Straszstatic void 1195220137Straszracct_init(void) 1196220137Strasz{ 1197220137Strasz 1198220137Strasz racct_zone = uma_zcreate("racct", sizeof(struct racct), 1199220137Strasz NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 1200220137Strasz /* 1201220137Strasz * XXX: Move this somewhere. 
1202220137Strasz */ 1203221362Strasz prison0.pr_prison_racct = prison_racct_find("0"); 1204220137Strasz} 1205220137StraszSYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL); 1206220137Strasz 1207220137Strasz#else /* !RACCT */ 1208220137Strasz 1209220137Straszint 1210220137Straszracct_add(struct proc *p, int resource, uint64_t amount) 1211220137Strasz{ 1212220137Strasz 1213220137Strasz return (0); 1214220137Strasz} 1215220137Strasz 1216220137Straszvoid 1217220137Straszracct_add_cred(struct ucred *cred, int resource, uint64_t amount) 1218220137Strasz{ 1219220137Strasz} 1220220137Strasz 1221220137Straszvoid 1222220137Straszracct_add_force(struct proc *p, int resource, uint64_t amount) 1223220137Strasz{ 1224220137Strasz 1225220137Strasz return; 1226220137Strasz} 1227220137Strasz 1228220137Straszint 1229220137Straszracct_set(struct proc *p, int resource, uint64_t amount) 1230220137Strasz{ 1231220137Strasz 1232220137Strasz return (0); 1233220137Strasz} 1234220137Strasz 1235220137Straszvoid 1236220372Straszracct_set_force(struct proc *p, int resource, uint64_t amount) 1237220372Strasz{ 1238220372Strasz} 1239220372Strasz 1240220372Straszvoid 1241220137Straszracct_sub(struct proc *p, int resource, uint64_t amount) 1242220137Strasz{ 1243220137Strasz} 1244220137Strasz 1245220137Straszvoid 1246220137Straszracct_sub_cred(struct ucred *cred, int resource, uint64_t amount) 1247220137Strasz{ 1248220137Strasz} 1249220137Strasz 1250220137Straszuint64_t 1251220137Straszracct_get_limit(struct proc *p, int resource) 1252220137Strasz{ 1253220137Strasz 1254220137Strasz return (UINT64_MAX); 1255220137Strasz} 1256220137Strasz 1257220372Straszuint64_t 1258220372Straszracct_get_available(struct proc *p, int resource) 1259220372Strasz{ 1260220372Strasz 1261220372Strasz return (UINT64_MAX); 1262220372Strasz} 1263220372Strasz 1264220137Straszvoid 1265220137Straszracct_create(struct racct **racctp) 1266220137Strasz{ 1267220137Strasz} 1268220137Strasz 1269220137Straszvoid 
1270220137Straszracct_destroy(struct racct **racctp) 1271220137Strasz{ 1272220137Strasz} 1273220137Strasz 1274220137Straszint 1275220137Straszracct_proc_fork(struct proc *parent, struct proc *child) 1276220137Strasz{ 1277220137Strasz 1278220137Strasz return (0); 1279220137Strasz} 1280220137Strasz 1281220137Straszvoid 1282226092Straszracct_proc_fork_done(struct proc *child) 1283226092Strasz{ 1284226092Strasz} 1285226092Strasz 1286226092Straszvoid 1287220137Straszracct_proc_exit(struct proc *p) 1288220137Strasz{ 1289220137Strasz} 1290220137Strasz 1291220137Strasz#endif /* !RACCT */ 1292