kern_racct.c revision 243088
/*-
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by Edward Tomasz Napierala under sponsorship
 * from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
28226633Sdim * 29288943Sdim * $FreeBSD: head/sys/kern/kern_racct.c 243088 2012-11-15 15:55:49Z trasz $ 30224145Sdim */ 31193323Sed 32288943Sdim#include <sys/cdefs.h> 33193323Sed__FBSDID("$FreeBSD: head/sys/kern/kern_racct.c 243088 2012-11-15 15:55:49Z trasz $"); 34210299Sed 35210299Sed#include "opt_kdtrace.h" 36210299Sed#include "opt_sched.h" 37210299Sed 38226633Sdim#include <sys/param.h> 39210299Sed#include <sys/systm.h> 40210299Sed#include <sys/eventhandler.h> 41234353Sdim#include <sys/jail.h> 42234353Sdim#include <sys/kernel.h> 43234353Sdim#include <sys/kthread.h> 44234353Sdim#include <sys/lock.h> 45234353Sdim#include <sys/loginclass.h> 46234353Sdim#include <sys/malloc.h> 47239462Sdim#include <sys/mutex.h> 48234353Sdim#include <sys/proc.h> 49234353Sdim#include <sys/racct.h> 50234353Sdim#include <sys/resourcevar.h> 51234353Sdim#include <sys/sbuf.h> 52276479Sdim#include <sys/sched.h> 53234353Sdim#include <sys/sdt.h> 54210299Sed#include <sys/smp.h> 55276479Sdim#include <sys/sx.h> 56210299Sed#include <sys/sysctl.h> 57210299Sed#include <sys/sysent.h> 58210299Sed#include <sys/sysproto.h> 59234353Sdim#include <sys/umtx.h> 60234353Sdim#include <machine/smp.h> 61234353Sdim 62234353Sdim#ifdef RCTL 63210299Sed#include <sys/rctl.h> 64210299Sed#endif 65210299Sed 66296417Sdim#ifdef RACCT 67234353Sdim 68234353SdimFEATURE(racct, "Resource Accounting"); 69210299Sed 70234353Sdim/* 71210299Sed * Do not block processes that have their %cpu usage <= pcpu_threshold. 72234353Sdim */ 73234353Sdimstatic int pcpu_threshold = 1; 74210299Sed 75210299SedSYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting"); 76210299SedSYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, 77234353Sdim 0, "Processes with higher %cpu usage than this value can be throttled."); 78296417Sdim 79234353Sdim/* 80234353Sdim * How many seconds it takes to use the scheduler %cpu calculations. 
When a 81234353Sdim * process starts, we compute its %cpu usage by dividing its runtime by the 82234353Sdim * process wall clock time. After RACCT_PCPU_SECS pass, we use the value 83296417Sdim * provided by the scheduler. 84234353Sdim */ 85234353Sdim#define RACCT_PCPU_SECS 3 86234353Sdim 87210299Sedstatic struct mtx racct_lock; 88210299SedMTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF); 89296417Sdim 90296417Sdimstatic uma_zone_t racct_zone; 91296417Sdim 92296417Sdimstatic void racct_sub_racct(struct racct *dest, const struct racct *src); 93296417Sdimstatic void racct_sub_cred_locked(struct ucred *cred, int resource, 94296417Sdim uint64_t amount); 95296417Sdimstatic void racct_add_cred_locked(struct ucred *cred, int resource, 96296417Sdim uint64_t amount); 97296417Sdim 98309124SdimSDT_PROVIDER_DEFINE(racct); 99309124SdimSDT_PROBE_DEFINE3(racct, kernel, rusage, add, add, "struct proc *", "int", 100309124Sdim "uint64_t"); 101309124SdimSDT_PROBE_DEFINE3(racct, kernel, rusage, add_failure, add-failure, 102309124Sdim "struct proc *", "int", "uint64_t"); 103309124SdimSDT_PROBE_DEFINE3(racct, kernel, rusage, add_cred, add-cred, "struct ucred *", 104296417Sdim "int", "uint64_t"); 105296417SdimSDT_PROBE_DEFINE3(racct, kernel, rusage, add_force, add-force, "struct proc *", 106296417Sdim "int", "uint64_t"); 107296417SdimSDT_PROBE_DEFINE3(racct, kernel, rusage, set, set, "struct proc *", "int", 108296417Sdim "uint64_t"); 109195340SedSDT_PROBE_DEFINE3(racct, kernel, rusage, set_failure, set-failure, 110195340Sed "struct proc *", "int", "uint64_t"); 111195340SedSDT_PROBE_DEFINE3(racct, kernel, rusage, sub, sub, "struct proc *", "int", 112226633Sdim "uint64_t"); 113195340SedSDT_PROBE_DEFINE3(racct, kernel, rusage, sub_cred, sub-cred, "struct ucred *", 114195340Sed "int", "uint64_t"); 115195340SedSDT_PROBE_DEFINE1(racct, kernel, racct, create, create, "struct racct *"); 116195340SedSDT_PROBE_DEFINE1(racct, kernel, racct, destroy, destroy, "struct racct *"); 
117195340SedSDT_PROBE_DEFINE2(racct, kernel, racct, join, join, "struct racct *", 118195340Sed "struct racct *"); 119195340SedSDT_PROBE_DEFINE2(racct, kernel, racct, join_failure, join-failure, 120195340Sed "struct racct *", "struct racct *"); 121193323SedSDT_PROBE_DEFINE2(racct, kernel, racct, leave, leave, "struct racct *", 122234353Sdim "struct racct *"); 123234353Sdim 124234353Sdimint racct_types[] = { 125234353Sdim [RACCT_CPU] = 126234353Sdim RACCT_IN_MILLIONS, 127234353Sdim [RACCT_DATA] = 128234353Sdim RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 129234353Sdim [RACCT_STACK] = 130193323Sed RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 131195340Sed [RACCT_CORE] = 132193323Sed RACCT_DENIABLE, 133193323Sed [RACCT_RSS] = 134193323Sed RACCT_RECLAIMABLE, 135193323Sed [RACCT_MEMLOCK] = 136193323Sed RACCT_RECLAIMABLE | RACCT_DENIABLE, 137193323Sed [RACCT_NPROC] = 138193323Sed RACCT_RECLAIMABLE | RACCT_DENIABLE, 139193323Sed [RACCT_NOFILE] = 140193323Sed RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 141193323Sed [RACCT_VMEM] = 142193323Sed RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 143193323Sed [RACCT_NPTS] = 144193323Sed RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 145193323Sed [RACCT_SWAP] = 146193323Sed RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 147210299Sed [RACCT_NTHR] = 148193323Sed RACCT_RECLAIMABLE | RACCT_DENIABLE, 149195340Sed [RACCT_MSGQQUEUED] = 150198090Srdivacky RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 151210299Sed [RACCT_MSGQSIZE] = 152210299Sed RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 153193323Sed [RACCT_NMSGQ] = 154210299Sed RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 155210299Sed [RACCT_NSEM] = 156210299Sed RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 157210299Sed [RACCT_NSEMOP] = 158296417Sdim RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, 159210299Sed [RACCT_NSHM] = 160210299Sed RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 161193323Sed 
[RACCT_SHMSIZE] = 162193323Sed RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, 163210299Sed [RACCT_WALLCLOCK] = 164193323Sed RACCT_IN_MILLIONS, 165296417Sdim [RACCT_PCTCPU] = 166210299Sed RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS }; 167193323Sed 168193323Sedstatic const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE; 169193323Sed 170193323Sed#ifdef SCHED_4BSD 171195340Sed/* 172195340Sed * Contains intermediate values for %cpu calculations to avoid using floating 173193323Sed * point in the kernel. 174193323Sed * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20) 175193323Sed * It is needed only for the 4BSD scheduler, because in ULE, the ccpu equals to 176193323Sed * zero so the calculations are more straightforward. 177193323Sed */ 178193323Sedfixpt_t ccpu_exp[] = { 179193323Sed [0] = FSCALE * 1, 180195340Sed [1] = FSCALE * 0.95122942450071400909, 181195340Sed [2] = FSCALE * 0.90483741803595957316, 182195340Sed [3] = FSCALE * 0.86070797642505780722, 183195340Sed [4] = FSCALE * 0.81873075307798185866, 184193323Sed [5] = FSCALE * 0.77880078307140486824, 185193323Sed [6] = FSCALE * 0.74081822068171786606, 186204792Srdivacky [7] = FSCALE * 0.70468808971871343435, 187204792Srdivacky [8] = FSCALE * 0.67032004603563930074, 188204792Srdivacky [9] = FSCALE * 0.63762815162177329314, 189204792Srdivacky [10] = FSCALE * 0.60653065971263342360, 190327952Sdim [11] = FSCALE * 0.57694981038048669531, 191327952Sdim [12] = FSCALE * 0.54881163609402643262, 192327952Sdim [13] = FSCALE * 0.52204577676101604789, 193327952Sdim [14] = FSCALE * 0.49658530379140951470, 194327952Sdim [15] = FSCALE * 0.47236655274101470713, 195327952Sdim [16] = FSCALE * 0.44932896411722159143, 196327952Sdim [17] = FSCALE * 0.42741493194872666992, 197327952Sdim [18] = FSCALE * 0.40656965974059911188, 198327952Sdim [19] = FSCALE * 0.38674102345450120691, 199327952Sdim [20] = FSCALE * 0.36787944117144232159, 200327952Sdim [21] = FSCALE * 0.34993774911115535467, 201327952Sdim [22] = FSCALE * 
0.33287108369807955328, 202327952Sdim [23] = FSCALE * 0.31663676937905321821, 203193323Sed [24] = FSCALE * 0.30119421191220209664, 204327952Sdim [25] = FSCALE * 0.28650479686019010032, 205296417Sdim [26] = FSCALE * 0.27253179303401260312, 206193323Sed [27] = FSCALE * 0.25924026064589150757, 207193323Sed [28] = FSCALE * 0.24659696394160647693, 208193323Sed [29] = FSCALE * 0.23457028809379765313, 209195340Sed [30] = FSCALE * 0.22313016014842982893, 210204642Srdivacky [31] = FSCALE * 0.21224797382674305771, 211261991Sdim [32] = FSCALE * 0.20189651799465540848, 212309124Sdim [33] = FSCALE * 0.19204990862075411423, 213204642Srdivacky [34] = FSCALE * 0.18268352405273465022, 214204642Srdivacky [35] = FSCALE * 0.17377394345044512668, 215296417Sdim [36] = FSCALE * 0.16529888822158653829, 216204642Srdivacky [37] = FSCALE * 0.15723716631362761621, 217204642Srdivacky [38] = FSCALE * 0.14956861922263505264, 218204642Srdivacky [39] = FSCALE * 0.14227407158651357185, 219204642Srdivacky [40] = FSCALE * 0.13533528323661269189, 220204642Srdivacky [41] = FSCALE * 0.12873490358780421886, 221296417Sdim [42] = FSCALE * 0.12245642825298191021, 222204642Srdivacky [43] = FSCALE * 0.11648415777349695786, 223204642Srdivacky [44] = FSCALE * 0.11080315836233388333, 224193323Sed [45] = FSCALE * 0.10539922456186433678, 225226633Sdim [46] = FSCALE * 0.10025884372280373372, 226261991Sdim [47] = FSCALE * 0.09536916221554961888, 227202878Srdivacky [48] = FSCALE * 0.09071795328941250337, 228204642Srdivacky [49] = FSCALE * 0.08629358649937051097, 229193323Sed [50] = FSCALE * 0.08208499862389879516, 230193323Sed [51] = FSCALE * 0.07808166600115315231, 231193323Sed [52] = FSCALE * 0.07427357821433388042, 232193323Sed [53] = FSCALE * 0.07065121306042958674, 233193323Sed [54] = FSCALE * 0.06720551273974976512, 234204642Srdivacky [55] = FSCALE * 0.06392786120670757270, 235193323Sed [56] = FSCALE * 0.06081006262521796499, 236193323Sed [57] = FSCALE * 0.05784432087483846296, 237193323Sed [58] = FSCALE * 
0.05502322005640722902, 238193323Sed [59] = FSCALE * 0.05233970594843239308, 239288943Sdim [60] = FSCALE * 0.04978706836786394297, 240288943Sdim [61] = FSCALE * 0.04735892439114092119, 241288943Sdim [62] = FSCALE * 0.04504920239355780606, 242193323Sed [63] = FSCALE * 0.04285212686704017991, 243198090Srdivacky [64] = FSCALE * 0.04076220397836621516, 244193323Sed [65] = FSCALE * 0.03877420783172200988, 245193323Sed [66] = FSCALE * 0.03688316740124000544, 246198090Srdivacky [67] = FSCALE * 0.03508435410084502588, 247198090Srdivacky [68] = FSCALE * 0.03337326996032607948, 248207618Srdivacky [69] = FSCALE * 0.03174563637806794323, 249198090Srdivacky [70] = FSCALE * 0.03019738342231850073, 250198090Srdivacky [71] = FSCALE * 0.02872463965423942912, 251198090Srdivacky [72] = FSCALE * 0.02732372244729256080, 252193323Sed [73] = FSCALE * 0.02599112877875534358, 253193323Sed [74] = FSCALE * 0.02472352647033939120, 254198090Srdivacky [75] = FSCALE * 0.02351774585600910823, 255198090Srdivacky [76] = FSCALE * 0.02237077185616559577, 256193323Sed [77] = FSCALE * 0.02127973643837716938, 257198090Srdivacky [78] = FSCALE * 0.02024191144580438847, 258198090Srdivacky [79] = FSCALE * 0.01925470177538692429, 259198090Srdivacky [80] = FSCALE * 0.01831563888873418029, 260296417Sdim [81] = FSCALE * 0.01742237463949351138, 261198090Srdivacky [82] = FSCALE * 0.01657267540176124754, 262198090Srdivacky [83] = FSCALE * 0.01576441648485449082, 263198090Srdivacky [84] = FSCALE * 0.01499557682047770621, 264198090Srdivacky [85] = FSCALE * 0.01426423390899925527, 265198090Srdivacky [86] = FSCALE * 0.01356855901220093175, 266198090Srdivacky [87] = FSCALE * 0.01290681258047986886, 267296417Sdim [88] = FSCALE * 0.01227733990306844117, 268296417Sdim [89] = FSCALE * 0.01167856697039544521, 269198090Srdivacky [90] = FSCALE * 0.01110899653824230649, 270198090Srdivacky [91] = FSCALE * 0.01056720438385265337, 271193323Sed [92] = FSCALE * 0.01005183574463358164, 272193323Sed [93] = FSCALE * 
0.00956160193054350793, 273193323Sed [94] = FSCALE * 0.00909527710169581709, 274193323Sed [95] = FSCALE * 0.00865169520312063417, 275193323Sed [96] = FSCALE * 0.00822974704902002884, 276198090Srdivacky [97] = FSCALE * 0.00782837754922577143, 277288943Sdim [98] = FSCALE * 0.00744658307092434051, 278288943Sdim [99] = FSCALE * 0.00708340892905212004, 279288943Sdim [100] = FSCALE * 0.00673794699908546709, 280288943Sdim [101] = FSCALE * 0.00640933344625638184, 281296417Sdim [102] = FSCALE * 0.00609674656551563610, 282288943Sdim [103] = FSCALE * 0.00579940472684214321, 283296417Sdim [104] = FSCALE * 0.00551656442076077241, 284288943Sdim [105] = FSCALE * 0.00524751839918138427, 285288943Sdim [106] = FSCALE * 0.00499159390691021621, 286193323Sed [107] = FSCALE * 0.00474815099941147558, 287198090Srdivacky [108] = FSCALE * 0.00451658094261266798, 288193323Sed [109] = FSCALE * 0.00429630469075234057, 289193323Sed [110] = FSCALE * 0.00408677143846406699, 290193323Sed}; 291198090Srdivacky#endif 292207618Srdivacky 293276479Sdim#define CCPU_EXP_MAX 110 294193323Sed 295193323Sed/* 296193323Sed * This function is analogical to the getpcpu() function in the ps(1) command. 297198090Srdivacky * They should both calculate in the same way so that the racct %cpu 298276479Sdim * calculations are consistent with the values showed by the ps(1) tool. 299193323Sed * The calculations are more complex in the 4BSD scheduler because of the value 300261991Sdim * of the ccpu variable. In ULE it is defined to be zero which saves us some 301261991Sdim * work. 
302193323Sed */ 303193323Sedstatic uint64_t 304193323Sedracct_getpcpu(struct proc *p, u_int pcpu) 305193323Sed{ 306193323Sed u_int swtime; 307193323Sed#ifdef SCHED_4BSD 308198090Srdivacky fixpt_t pctcpu, pctcpu_next; 309198090Srdivacky#endif 310198090Srdivacky#ifdef SMP 311193323Sed struct pcpu *pc; 312198090Srdivacky int found; 313226633Sdim#endif 314198090Srdivacky fixpt_t p_pctcpu; 315198090Srdivacky struct thread *td; 316198090Srdivacky 317198090Srdivacky /* 318198090Srdivacky * If the process is swapped out, we count its %cpu usage as zero. 319198090Srdivacky * This behaviour is consistent with the userland ps(1) tool. 320198090Srdivacky */ 321198090Srdivacky if ((p->p_flag & P_INMEM) == 0) 322198090Srdivacky return (0); 323207618Srdivacky swtime = (ticks - p->p_swtick) / hz; 324198090Srdivacky 325198090Srdivacky /* 326198090Srdivacky * For short-lived processes, the sched_pctcpu() returns small 327205407Srdivacky * values even for cpu intensive processes. Therefore we use 328198090Srdivacky * our own estimate in this case. 329210299Sed */ 330205407Srdivacky if (swtime < RACCT_PCPU_SECS) 331205407Srdivacky return (pcpu); 332198090Srdivacky 333210299Sed p_pctcpu = 0; 334198090Srdivacky FOREACH_THREAD_IN_PROC(p, td) { 335198090Srdivacky if (td == PCPU_GET(idlethread)) 336198090Srdivacky continue; 337198090Srdivacky#ifdef SMP 338198090Srdivacky found = 0; 339198090Srdivacky STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { 340198090Srdivacky if (td == pc->pc_idlethread) { 341198090Srdivacky found = 1; 342226633Sdim break; 343198090Srdivacky } 344198090Srdivacky } 345198090Srdivacky if (found) 346198090Srdivacky continue; 347198090Srdivacky#endif 348198090Srdivacky thread_lock(td); 349198090Srdivacky#ifdef SCHED_4BSD 350207618Srdivacky pctcpu = sched_pctcpu(td); 351198090Srdivacky /* Count also the yet unfinished second. 
*/ 352198090Srdivacky pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT; 353221345Sdim pctcpu_next += sched_pctcpu_delta(td); 354261991Sdim p_pctcpu += max(pctcpu, pctcpu_next); 355198090Srdivacky#else 356198090Srdivacky /* 357210299Sed * In ULE the %cpu statistics are updated on every 358198090Srdivacky * sched_pctcpu() call. So special calculations to 359198090Srdivacky * account for the latest (unfinished) second are 360198090Srdivacky * not needed. 361198090Srdivacky */ 362198090Srdivacky p_pctcpu += sched_pctcpu(td); 363198090Srdivacky#endif 364198090Srdivacky thread_unlock(td); 365210299Sed } 366198090Srdivacky 367198090Srdivacky#ifdef SCHED_4BSD 368198090Srdivacky if (swtime <= CCPU_EXP_MAX) 369198090Srdivacky return ((100 * (uint64_t)p_pctcpu * 1000000) / 370198090Srdivacky (FSCALE - ccpu_exp[swtime])); 371198090Srdivacky#endif 372198090Srdivacky 373198090Srdivacky return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE); 374198090Srdivacky} 375198090Srdivacky 376198090Srdivackystatic void 377198090Srdivackyracct_add_racct(struct racct *dest, const struct racct *src) 378202878Srdivacky{ 379198090Srdivacky int i; 380198090Srdivacky 381198090Srdivacky mtx_assert(&racct_lock, MA_OWNED); 382198090Srdivacky 383193323Sed /* 384193323Sed * Update resource usage in dest. 
385193323Sed */ 386193323Sed for (i = 0; i <= RACCT_MAX; i++) { 387193323Sed KASSERT(dest->r_resources[i] >= 0, 388193323Sed ("%s: resource %d propagation meltdown: dest < 0", 389193323Sed __func__, i)); 390193323Sed KASSERT(src->r_resources[i] >= 0, 391193323Sed ("%s: resource %d propagation meltdown: src < 0", 392193323Sed __func__, i)); 393193323Sed dest->r_resources[i] += src->r_resources[i]; 394193323Sed } 395193323Sed} 396193323Sed 397193323Sedstatic void 398198090Srdivackyracct_sub_racct(struct racct *dest, const struct racct *src) 399198090Srdivacky{ 400226633Sdim int i; 401226633Sdim 402193323Sed mtx_assert(&racct_lock, MA_OWNED); 403288943Sdim 404288943Sdim /* 405193323Sed * Update resource usage in dest. 406198090Srdivacky */ 407193323Sed for (i = 0; i <= RACCT_MAX; i++) { 408193323Sed if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) { 409198090Srdivacky KASSERT(dest->r_resources[i] >= 0, 410198090Srdivacky ("%s: resource %d propagation meltdown: dest < 0", 411198090Srdivacky __func__, i)); 412198090Srdivacky KASSERT(src->r_resources[i] >= 0, 413288943Sdim ("%s: resource %d propagation meltdown: src < 0", 414261991Sdim __func__, i)); 415200581Srdivacky KASSERT(src->r_resources[i] <= dest->r_resources[i], 416193323Sed ("%s: resource %d propagation meltdown: src > dest", 417193323Sed __func__, i)); 418193323Sed } 419193323Sed if (RACCT_CAN_DROP(i)) { 420193323Sed dest->r_resources[i] -= src->r_resources[i]; 421198090Srdivacky if (dest->r_resources[i] < 0) { 422198090Srdivacky KASSERT(RACCT_IS_SLOPPY(i) || 423198090Srdivacky RACCT_IS_DECAYING(i), 424203954Srdivacky ("%s: resource %d usage < 0", __func__, i)); 425261991Sdim dest->r_resources[i] = 0; 426203954Srdivacky } 427203954Srdivacky } 428296417Sdim } 429207618Srdivacky} 430288943Sdim 431203954Srdivackyvoid 432203954Srdivackyracct_create(struct racct **racctp) 433203954Srdivacky{ 434203954Srdivacky 435203954Srdivacky SDT_PROBE(racct, kernel, racct, create, racctp, 0, 0, 0, 0); 436203954Srdivacky 
437203954Srdivacky KASSERT(*racctp == NULL, ("racct already allocated")); 438203954Srdivacky 439296417Sdim *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO); 440203954Srdivacky} 441193323Sed 442203954Srdivackystatic void 443203954Srdivackyracct_destroy_locked(struct racct **racctp) 444203954Srdivacky{ 445203954Srdivacky int i; 446203954Srdivacky struct racct *racct; 447193323Sed 448193323Sed SDT_PROBE(racct, kernel, racct, destroy, racctp, 0, 0, 0, 0); 449198090Srdivacky 450198090Srdivacky mtx_assert(&racct_lock, MA_OWNED); 451198090Srdivacky KASSERT(racctp != NULL, ("NULL racctp")); 452198090Srdivacky KASSERT(*racctp != NULL, ("NULL racct")); 453198090Srdivacky 454193323Sed racct = *racctp; 455193323Sed 456193323Sed for (i = 0; i <= RACCT_MAX; i++) { 457193323Sed if (RACCT_IS_SLOPPY(i)) 458193323Sed continue; 459193323Sed if (!RACCT_IS_RECLAIMABLE(i)) 460226633Sdim continue; 461198090Srdivacky KASSERT(racct->r_resources[i] == 0, 462198090Srdivacky ("destroying non-empty racct: " 463198090Srdivacky "%ju allocated for resource %d\n", 464288943Sdim racct->r_resources[i], i)); 465288943Sdim } 466288943Sdim uma_zfree(racct_zone, racct); 467288943Sdim *racctp = NULL; 468288943Sdim} 469288943Sdim 470288943Sdimvoid 471288943Sdimracct_destroy(struct racct **racct) 472288943Sdim{ 473288943Sdim 474288943Sdim mtx_lock(&racct_lock); 475288943Sdim racct_destroy_locked(racct); 476288943Sdim mtx_unlock(&racct_lock); 477288943Sdim} 478194612Sed 479288943Sdim/* 480288943Sdim * Increase consumption of 'resource' by 'amount' for 'racct' 481193323Sed * and all its parents. Differently from other cases, 'amount' here 482288943Sdim * may be less than zero. 
483198090Srdivacky */ 484198090Srdivackystatic void 485198090Srdivackyracct_alloc_resource(struct racct *racct, int resource, 486198090Srdivacky uint64_t amount) 487198090Srdivacky{ 488198090Srdivacky 489198090Srdivacky mtx_assert(&racct_lock, MA_OWNED); 490198090Srdivacky KASSERT(racct != NULL, ("NULL racct")); 491198090Srdivacky 492193323Sed racct->r_resources[resource] += amount; 493226633Sdim if (racct->r_resources[resource] < 0) { 494193323Sed KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource), 495198090Srdivacky ("%s: resource %d usage < 0", __func__, resource)); 496198090Srdivacky racct->r_resources[resource] = 0; 497193323Sed } 498193323Sed 499204642Srdivacky /* 500193323Sed * There are some cases where the racct %cpu resource would grow 501193323Sed * beyond 100%. 502193323Sed * For example in racct_proc_exit() we add the process %cpu usage 503198090Srdivacky * to the ucred racct containers. If too many processes terminated 504193323Sed * in a short time span, the ucred %cpu resource could grow too much. 505198090Srdivacky * Also, the 4BSD scheduler sometimes returns for a thread more than 506198090Srdivacky * 100% cpu usage. So we set a boundary here to 100%. 507234353Sdim */ 508296417Sdim if ((resource == RACCT_PCTCPU) && 509234353Sdim (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000)) 510198090Srdivacky racct->r_resources[RACCT_PCTCPU] = 100 * 1000000; 511194178Sed} 512193323Sed 513193323Sedstatic int 514193323Sedracct_add_locked(struct proc *p, int resource, uint64_t amount) 515193323Sed{ 516288943Sdim#ifdef RCTL 517288943Sdim int error; 518193323Sed#endif 519193323Sed 520193323Sed SDT_PROBE(racct, kernel, rusage, add, p, resource, amount, 0, 0); 521195340Sed 522195340Sed /* 523195340Sed * We need proc lock to dereference p->p_ucred. 
524195340Sed */ 525193323Sed PROC_LOCK_ASSERT(p, MA_OWNED); 526193323Sed 527204792Srdivacky#ifdef RCTL 528204792Srdivacky error = rctl_enforce(p, resource, amount); 529204792Srdivacky if (error && RACCT_IS_DENIABLE(resource)) { 530204792Srdivacky SDT_PROBE(racct, kernel, rusage, add_failure, p, resource, 531193323Sed amount, 0, 0); 532193323Sed return (error); 533296417Sdim } 534193323Sed#endif 535193323Sed racct_alloc_resource(p->p_racct, resource, amount); 536193323Sed racct_add_cred_locked(p->p_ucred, resource, amount); 537193323Sed 538204642Srdivacky return (0); 539309124Sdim} 540204642Srdivacky 541204642Srdivacky/* 542296417Sdim * Increase allocation of 'resource' by 'amount' for process 'p'. 543204642Srdivacky * Return 0 if it's below limits, or errno, if it's not. 544204642Srdivacky */ 545204642Srdivackyint 546204642Srdivackyracct_add(struct proc *p, int resource, uint64_t amount) 547204642Srdivacky{ 548296417Sdim int error; 549204642Srdivacky 550204642Srdivacky mtx_lock(&racct_lock); 551198090Srdivacky error = racct_add_locked(p, resource, amount); 552288943Sdim mtx_unlock(&racct_lock); 553202878Srdivacky return (error); 554204642Srdivacky} 555193323Sed 556193323Sedstatic void 557193323Sedracct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) 558309124Sdim{ 559309124Sdim struct prison *pr; 560204642Srdivacky 561309124Sdim SDT_PROBE(racct, kernel, rusage, add_cred, cred, resource, amount, 562309124Sdim 0, 0); 563309124Sdim 564204642Srdivacky racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, amount); 565314564Sdim for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) 566314564Sdim racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource, 567296417Sdim amount); 568309124Sdim racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, amount); 569309124Sdim} 570204642Srdivacky 571309124Sdim/* 572309124Sdim * Increase allocation of 'resource' by 'amount' for credential 'cred'. 
573204642Srdivacky * Doesn't check for limits and never fails. 574309124Sdim * 575309124Sdim * XXX: Shouldn't this ever return an error? 576309124Sdim */ 577309124Sdimvoid 578309124Sdimracct_add_cred(struct ucred *cred, int resource, uint64_t amount) 579309124Sdim{ 580309124Sdim 581309124Sdim mtx_lock(&racct_lock); 582309124Sdim racct_add_cred_locked(cred, resource, amount); 583309124Sdim mtx_unlock(&racct_lock); 584309124Sdim} 585309124Sdim 586309124Sdim/* 587204642Srdivacky * Increase allocation of 'resource' by 'amount' for process 'p'. 588204642Srdivacky * Doesn't check for limits and never fails. 589193323Sed */ 590193323Sedvoid 591193323Sedracct_add_force(struct proc *p, int resource, uint64_t amount) 592204642Srdivacky{ 593204642Srdivacky 594204642Srdivacky SDT_PROBE(racct, kernel, rusage, add_force, p, resource, amount, 0, 0); 595204642Srdivacky 596204642Srdivacky /* 597204642Srdivacky * We need proc lock to dereference p->p_ucred. 598204642Srdivacky */ 599204642Srdivacky PROC_LOCK_ASSERT(p, MA_OWNED); 600204642Srdivacky 601204642Srdivacky mtx_lock(&racct_lock); 602204642Srdivacky racct_alloc_resource(p->p_racct, resource, amount); 603204642Srdivacky mtx_unlock(&racct_lock); 604204642Srdivacky racct_add_cred(p->p_ucred, resource, amount); 605193323Sed} 606218893Sdim 607204642Srdivackystatic int 608218893Sdimracct_set_locked(struct proc *p, int resource, uint64_t amount) 609218893Sdim{ 610296417Sdim int64_t old_amount, decayed_amount; 611218893Sdim int64_t diff_proc, diff_cred; 612218893Sdim#ifdef RCTL 613218893Sdim int error; 614204642Srdivacky#endif 615218893Sdim 616276479Sdim SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0); 617204642Srdivacky 618204642Srdivacky /* 619296417Sdim * We need proc lock to dereference p->p_ucred. 620218893Sdim */ 621276479Sdim PROC_LOCK_ASSERT(p, MA_OWNED); 622204642Srdivacky 623204642Srdivacky old_amount = p->p_racct->r_resources[resource]; 624276479Sdim /* 625204642Srdivacky * The diffs may be negative. 
626204642Srdivacky */ 627296417Sdim diff_proc = amount - old_amount; 628296417Sdim if (RACCT_IS_DECAYING(resource)) { 629218893Sdim /* 630204642Srdivacky * Resources in per-credential racct containers may decay. 631218893Sdim * If this is the case, we need to calculate the difference 632218893Sdim * between the new amount and the proportional value of the 633218893Sdim * old amount that has decayed in the ucred racct containers. 634218893Sdim */ 635218893Sdim decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; 636296417Sdim diff_cred = amount - decayed_amount; 637296417Sdim } else 638218893Sdim diff_cred = diff_proc; 639218893Sdim#ifdef notyet 640204642Srdivacky KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource), 641204642Srdivacky ("%s: usage of non-droppable resource %d dropping", __func__, 642198090Srdivacky resource)); 643207618Srdivacky#endif 644207618Srdivacky#ifdef RCTL 645204642Srdivacky if (diff_proc > 0) { 646204642Srdivacky error = rctl_enforce(p, resource, diff_proc); 647204642Srdivacky if (error && RACCT_IS_DENIABLE(resource)) { 648204642Srdivacky SDT_PROBE(racct, kernel, rusage, set_failure, p, 649204642Srdivacky resource, amount, 0, 0); 650198090Srdivacky return (error); 651204642Srdivacky } 652204642Srdivacky } 653212904Sdim#endif 654212904Sdim racct_alloc_resource(p->p_racct, resource, diff_proc); 655212904Sdim if (diff_cred > 0) 656212904Sdim racct_add_cred_locked(p->p_ucred, resource, diff_cred); 657212904Sdim else if (diff_cred < 0) 658204642Srdivacky racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); 659204642Srdivacky 660204642Srdivacky return (0); 661204642Srdivacky} 662204642Srdivacky 663204642Srdivacky/* 664204642Srdivacky * Set allocation of 'resource' to 'amount' for process 'p'. 665234353Sdim * Return 0 if it's below limits, or errno, if it's not. 666234353Sdim * 667204642Srdivacky * Note that decreasing the allocation always returns 0, 668234353Sdim * even if it's above the limit. 
669204642Srdivacky */ 670204642Srdivackyint 671204642Srdivackyracct_set(struct proc *p, int resource, uint64_t amount) 672204642Srdivacky{ 673198090Srdivacky int error; 674204642Srdivacky 675193323Sed mtx_lock(&racct_lock); 676207618Srdivacky error = racct_set_locked(p, resource, amount); 677207618Srdivacky mtx_unlock(&racct_lock); 678204642Srdivacky return (error); 679226633Sdim} 680193323Sed 681204642Srdivackystatic void 682204642Srdivackyracct_set_force_locked(struct proc *p, int resource, uint64_t amount) 683204642Srdivacky{ 684204642Srdivacky int64_t old_amount, decayed_amount; 685204642Srdivacky int64_t diff_proc, diff_cred; 686204642Srdivacky 687204642Srdivacky SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0); 688218893Sdim 689204642Srdivacky /* 690204642Srdivacky * We need proc lock to dereference p->p_ucred. 691204642Srdivacky */ 692296417Sdim PROC_LOCK_ASSERT(p, MA_OWNED); 693204642Srdivacky 694204642Srdivacky old_amount = p->p_racct->r_resources[resource]; 695204642Srdivacky /* 696276479Sdim * The diffs may be negative. 697296417Sdim */ 698204642Srdivacky diff_proc = amount - old_amount; 699204642Srdivacky if (RACCT_IS_DECAYING(resource)) { 700204642Srdivacky /* 701204642Srdivacky * Resources in per-credential racct containers may decay. 702204642Srdivacky * If this is the case, we need to calculate the difference 703204642Srdivacky * between the new amount and the proportional value of the 704226633Sdim * old amount that has decayed in the ucred racct containers. 
705204642Srdivacky */ 706204642Srdivacky decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; 707204642Srdivacky diff_cred = amount - decayed_amount; 708212904Sdim } else 709212904Sdim diff_cred = diff_proc; 710212904Sdim 711212904Sdim racct_alloc_resource(p->p_racct, resource, diff_proc); 712212904Sdim if (diff_cred > 0) 713212904Sdim racct_add_cred_locked(p->p_ucred, resource, diff_cred); 714212904Sdim else if (diff_cred < 0) 715212904Sdim racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); 716212904Sdim} 717204642Srdivacky 718226633Sdimvoid 719204642Srdivackyracct_set_force(struct proc *p, int resource, uint64_t amount) 720207618Srdivacky{ 721207618Srdivacky mtx_lock(&racct_lock); 722204642Srdivacky racct_set_force_locked(p, resource, amount); 723207618Srdivacky mtx_unlock(&racct_lock); 724207618Srdivacky} 725204642Srdivacky 726204642Srdivacky/* 727204642Srdivacky * Returns amount of 'resource' the process 'p' can keep allocated. 728234353Sdim * Allocating more than that would be denied, unless the resource 729204642Srdivacky * is marked undeniable. Amount of already allocated resource does 730202878Srdivacky * not matter. 731204642Srdivacky */ 732204642Srdivackyuint64_t 733204642Srdivackyracct_get_limit(struct proc *p, int resource) 734202878Srdivacky{ 735204642Srdivacky 736202878Srdivacky#ifdef RCTL 737204642Srdivacky return (rctl_get_limit(p, resource)); 738204642Srdivacky#else 739204642Srdivacky return (UINT64_MAX); 740204642Srdivacky#endif 741204642Srdivacky} 742202878Srdivacky 743193323Sed/* 744204642Srdivacky * Returns amount of 'resource' the process 'p' can keep allocated. 745204642Srdivacky * Allocating more than that would be denied, unless the resource 746193323Sed * is marked undeniable. Amount of already allocated resource does 747193323Sed * matter. 
748193323Sed */ 749226633Sdimuint64_t 750193323Sedracct_get_available(struct proc *p, int resource) 751204642Srdivacky{ 752204642Srdivacky 753204642Srdivacky#ifdef RCTL 754204642Srdivacky return (rctl_get_available(p, resource)); 755204642Srdivacky#else 756218893Sdim return (UINT64_MAX); 757193323Sed#endif 758204642Srdivacky} 759296417Sdim 760204642Srdivacky/* 761204642Srdivacky * Returns amount of the %cpu resource that process 'p' can add to its %cpu 762204642Srdivacky * utilization. Adding more than that would lead to the process being 763276479Sdim * throttled. 764321369Sdim */ 765321369Sdimstatic int64_t 766321369Sdimracct_pcpu_available(struct proc *p) 767321369Sdim{ 768321369Sdim 769321369Sdim#ifdef RCTL 770321369Sdim return (rctl_pcpu_available(p)); 771321369Sdim#else 772321369Sdim return (INT64_MAX); 773321369Sdim#endif 774321369Sdim} 775321369Sdim 776321369Sdim/* 777321369Sdim * Decrease allocation of 'resource' by 'amount' for process 'p'. 778321369Sdim */ 779321369Sdimvoid 780321369Sdimracct_sub(struct proc *p, int resource, uint64_t amount) 781321369Sdim{ 782321369Sdim 783321369Sdim SDT_PROBE(racct, kernel, rusage, sub, p, resource, amount, 0, 0); 784321369Sdim 785321369Sdim /* 786321369Sdim * We need proc lock to dereference p->p_ucred. 
787321369Sdim */ 788321369Sdim PROC_LOCK_ASSERT(p, MA_OWNED); 789321369Sdim KASSERT(RACCT_CAN_DROP(resource), 790321369Sdim ("%s: called for non-droppable resource %d", __func__, resource)); 791321369Sdim 792321369Sdim mtx_lock(&racct_lock); 793321369Sdim KASSERT(amount <= p->p_racct->r_resources[resource], 794321369Sdim ("%s: freeing %ju of resource %d, which is more " 795321369Sdim "than allocated %jd for %s (pid %d)", __func__, amount, resource, 796321369Sdim (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid)); 797321369Sdim 798321369Sdim racct_alloc_resource(p->p_racct, resource, -amount); 799321369Sdim racct_sub_cred_locked(p->p_ucred, resource, amount); 800321369Sdim mtx_unlock(&racct_lock); 801321369Sdim} 802321369Sdim 803204642Srdivackystatic void 804204642Srdivackyracct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) 805321369Sdim{ 806321369Sdim struct prison *pr; 807204642Srdivacky 808204642Srdivacky SDT_PROBE(racct, kernel, rusage, sub_cred, cred, resource, amount, 809204642Srdivacky 0, 0); 810321369Sdim 811204642Srdivacky#ifdef notyet 812204642Srdivacky KASSERT(RACCT_CAN_DROP(resource), 813321369Sdim ("%s: called for resource %d which can not drop", __func__, 814204642Srdivacky resource)); 815204642Srdivacky#endif 816204642Srdivacky 817288943Sdim racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, -amount); 818288943Sdim for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) 819288943Sdim racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource, 820288943Sdim -amount); 821288943Sdim racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, -amount); 822288943Sdim} 823288943Sdim 824288943Sdim/* 825288943Sdim * Decrease allocation of 'resource' by 'amount' for credential 'cred'. 
826204642Srdivacky */ 827193323Sedvoid 828193323Sedracct_sub_cred(struct ucred *cred, int resource, uint64_t amount) 829204642Srdivacky{ 830193323Sed 831193323Sed mtx_lock(&racct_lock); 832193323Sed racct_sub_cred_locked(cred, resource, amount); 833226633Sdim mtx_unlock(&racct_lock); 834193323Sed} 835194178Sed 836193323Sed/* 837296417Sdim * Inherit resource usage information from the parent process. 838193323Sed */ 839193323Sedint 840195340Sedracct_proc_fork(struct proc *parent, struct proc *child) 841193323Sed{ 842193323Sed int i, error = 0; 843194178Sed 844195340Sed /* 845193323Sed * Create racct for the child process. 846193323Sed */ 847193323Sed racct_create(&child->p_racct); 848193323Sed 849193323Sed PROC_LOCK(parent); 850198090Srdivacky PROC_LOCK(child); 851193323Sed mtx_lock(&racct_lock); 852193323Sed 853193323Sed#ifdef RCTL 854193323Sed error = rctl_proc_fork(parent, child); 855207618Srdivacky if (error != 0) 856193323Sed goto out; 857221345Sdim#endif 858261991Sdim 859193323Sed /* Init process cpu time. */ 860193323Sed child->p_prev_runtime = 0; 861193323Sed child->p_throttled = 0; 862198090Srdivacky 863193323Sed /* 864193323Sed * Inherit resource usage. 
865193323Sed */ 866193323Sed for (i = 0; i <= RACCT_MAX; i++) { 867193323Sed if (parent->p_racct->r_resources[i] == 0 || 868193323Sed !RACCT_IS_INHERITABLE(i)) 869226633Sdim continue; 870226633Sdim 871226633Sdim error = racct_set_locked(child, i, 872226633Sdim parent->p_racct->r_resources[i]); 873226633Sdim if (error != 0) 874226633Sdim goto out; 875226633Sdim } 876226633Sdim 877226633Sdim error = racct_add_locked(child, RACCT_NPROC, 1); 878226633Sdim error += racct_add_locked(child, RACCT_NTHR, 1); 879226633Sdim 880226633Sdimout: 881226633Sdim mtx_unlock(&racct_lock); 882226633Sdim PROC_UNLOCK(child); 883296417Sdim PROC_UNLOCK(parent); 884226633Sdim 885226633Sdim if (error != 0) 886226633Sdim racct_proc_exit(child); 887226633Sdim 888226633Sdim return (error); 889226633Sdim} 890226633Sdim 891226633Sdim/* 892226633Sdim * Called at the end of fork1(), to handle rules that require the process 893226633Sdim * to be fully initialized. 894327952Sdim */ 895226633Sdimvoid 896226633Sdimracct_proc_fork_done(struct proc *child) 897226633Sdim{ 898226633Sdim 899226633Sdim#ifdef RCTL 900234353Sdim PROC_LOCK(child); 901234353Sdim mtx_lock(&racct_lock); 902234353Sdim rctl_enforce(child, RACCT_NPROC, 0); 903234353Sdim rctl_enforce(child, RACCT_NTHR, 0); 904234353Sdim mtx_unlock(&racct_lock); 905234353Sdim PROC_UNLOCK(child); 906234353Sdim#endif 907234353Sdim} 908234353Sdim 909234353Sdimvoid 910234353Sdimracct_proc_exit(struct proc *p) 911234353Sdim{ 912234353Sdim int i; 913276479Sdim uint64_t runtime; 914234353Sdim struct timeval wallclock; 915226633Sdim uint64_t pct_estimate, pct; 916234353Sdim 917276479Sdim PROC_LOCK(p); 918226633Sdim /* 919226633Sdim * We don't need to calculate rux, proc_reap() has already done this. 
920234353Sdim */ 921234353Sdim runtime = cputick2usec(p->p_rux.rux_runtime); 922296417Sdim#ifdef notyet 923234353Sdim KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); 924276479Sdim#else 925234353Sdim if (runtime < p->p_prev_runtime) 926226633Sdim runtime = p->p_prev_runtime; 927234353Sdim#endif 928234353Sdim microuptime(&wallclock); 929296417Sdim timevalsub(&wallclock, &p->p_stats->p_start); 930226633Sdim if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { 931226633Sdim pct_estimate = (1000000 * runtime * 100) / 932234353Sdim ((uint64_t)wallclock.tv_sec * 1000000 + 933296417Sdim wallclock.tv_usec); 934276479Sdim } else 935234353Sdim pct_estimate = 0; 936234353Sdim pct = racct_getpcpu(p, pct_estimate); 937234353Sdim 938234353Sdim mtx_lock(&racct_lock); 939234353Sdim racct_set_locked(p, RACCT_CPU, runtime); 940234353Sdim racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct); 941234353Sdim 942234353Sdim for (i = 0; i <= RACCT_MAX; i++) { 943234353Sdim if (p->p_racct->r_resources[i] == 0) 944226633Sdim continue; 945276479Sdim if (!RACCT_IS_RECLAIMABLE(i)) 946226633Sdim continue; 947226633Sdim racct_set_locked(p, i, 0); 948226633Sdim } 949276479Sdim 950226633Sdim mtx_unlock(&racct_lock); 951226633Sdim PROC_UNLOCK(p); 952234353Sdim 953226633Sdim#ifdef RCTL 954234353Sdim rctl_racct_release(p->p_racct); 955234353Sdim#endif 956309124Sdim racct_destroy(&p->p_racct); 957309124Sdim} 958309124Sdim 959309124Sdim/* 960309124Sdim * Called after credentials change, to move resource utilisation 961309124Sdim * between raccts. 
962309124Sdim */ 963309124Sdimvoid 964309124Sdimracct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, 965309124Sdim struct ucred *newcred) 966309124Sdim{ 967309124Sdim struct uidinfo *olduip, *newuip; 968309124Sdim struct loginclass *oldlc, *newlc; 969309124Sdim struct prison *oldpr, *newpr, *pr; 970309124Sdim 971309124Sdim PROC_LOCK_ASSERT(p, MA_NOTOWNED); 972309124Sdim 973234353Sdim newuip = newcred->cr_ruidinfo; 974234353Sdim olduip = oldcred->cr_ruidinfo; 975234353Sdim newlc = newcred->cr_loginclass; 976234353Sdim oldlc = oldcred->cr_loginclass; 977296417Sdim newpr = newcred->cr_prison; 978234353Sdim oldpr = oldcred->cr_prison; 979234353Sdim 980234353Sdim mtx_lock(&racct_lock); 981234353Sdim if (newuip != olduip) { 982296417Sdim racct_sub_racct(olduip->ui_racct, p->p_racct); 983296417Sdim racct_add_racct(newuip->ui_racct, p->p_racct); 984226633Sdim } 985234353Sdim if (newlc != oldlc) { 986296417Sdim racct_sub_racct(oldlc->lc_racct, p->p_racct); 987296417Sdim racct_add_racct(newlc->lc_racct, p->p_racct); 988296417Sdim } 989234353Sdim if (newpr != oldpr) { 990234353Sdim for (pr = oldpr; pr != NULL; pr = pr->pr_parent) 991234353Sdim racct_sub_racct(pr->pr_prison_racct->prr_racct, 992234353Sdim p->p_racct); 993234353Sdim for (pr = newpr; pr != NULL; pr = pr->pr_parent) 994234353Sdim racct_add_racct(pr->pr_prison_racct->prr_racct, 995234353Sdim p->p_racct); 996234353Sdim } 997234353Sdim mtx_unlock(&racct_lock); 998296417Sdim 999234353Sdim#ifdef RCTL 1000226633Sdim rctl_proc_ucred_changed(p, newcred); 1001296417Sdim#endif 1002309124Sdim} 1003234353Sdim 1004234353Sdimvoid 1005234353Sdimracct_move(struct racct *dest, struct racct *src) 1006226633Sdim{ 1007226633Sdim 1008234353Sdim mtx_lock(&racct_lock); 1009234353Sdim 1010234353Sdim racct_add_racct(dest, src); 1011234353Sdim racct_sub_racct(src, src); 1012234353Sdim 1013234353Sdim mtx_unlock(&racct_lock); 1014234353Sdim} 1015234353Sdim 1016234353Sdimstatic void 1017234353Sdimracct_proc_throttle(struct proc 
*p) 1018234353Sdim{ 1019234353Sdim struct thread *td; 1020234353Sdim#ifdef SMP 1021234353Sdim int cpuid; 1022234353Sdim#endif 1023234353Sdim 1024234353Sdim PROC_LOCK_ASSERT(p, MA_OWNED); 1025234353Sdim 1026234353Sdim /* 1027234353Sdim * Do not block kernel processes. Also do not block processes with 1028234353Sdim * low %cpu utilization to improve interactivity. 1029234353Sdim */ 1030234353Sdim if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) || 1031234353Sdim (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold)) 1032234353Sdim return; 1033234353Sdim p->p_throttled = 1; 1034234353Sdim 1035234353Sdim FOREACH_THREAD_IN_PROC(p, td) { 1036234353Sdim switch (td->td_state) { 1037234353Sdim case TDS_RUNQ: 1038234353Sdim /* 1039234353Sdim * If the thread is on the scheduler run-queue, we can 1040234353Sdim * not just remove it from there. So we set the flag 1041234353Sdim * TDF_NEEDRESCHED for the thread, so that once it is 1042234353Sdim * running, it is taken off the cpu as soon as possible. 1043234353Sdim */ 1044234353Sdim thread_lock(td); 1045234353Sdim td->td_flags |= TDF_NEEDRESCHED; 1046234353Sdim thread_unlock(td); 1047234353Sdim break; 1048234353Sdim case TDS_RUNNING: 1049234353Sdim /* 1050234353Sdim * If the thread is running, we request a context 1051234353Sdim * switch for it by setting the TDF_NEEDRESCHED flag. 
1052234353Sdim */ 1053234353Sdim thread_lock(td); 1054276479Sdim td->td_flags |= TDF_NEEDRESCHED; 1055276479Sdim#ifdef SMP 1056309124Sdim cpuid = td->td_oncpu; 1057309124Sdim if ((cpuid != NOCPU) && (td != curthread)) 1058276479Sdim ipi_cpu(cpuid, IPI_AST); 1059276479Sdim#endif 1060276479Sdim thread_unlock(td); 1061276479Sdim break; 1062276479Sdim default: 1063309124Sdim break; 1064276479Sdim } 1065276479Sdim } 1066276479Sdim} 1067276479Sdim 1068276479Sdimstatic void 1069276479Sdimracct_proc_wakeup(struct proc *p) 1070276479Sdim{ 1071296417Sdim PROC_LOCK_ASSERT(p, MA_OWNED); 1072276479Sdim 1073276479Sdim if (p->p_throttled) { 1074276479Sdim p->p_throttled = 0; 1075276479Sdim wakeup(p->p_racct); 1076276479Sdim } 1077276479Sdim} 1078276479Sdim 1079276479Sdimstatic void 1080276479Sdimracct_decay_resource(struct racct *racct, void * res, void* dummy) 1081276479Sdim{ 1082276479Sdim int resource; 1083276479Sdim int64_t r_old, r_new; 1084276479Sdim 1085276479Sdim resource = *(int *)res; 1086276479Sdim r_old = racct->r_resources[resource]; 1087276479Sdim 1088276479Sdim /* If there is nothing to decay, just exit. 
*/ 1089276479Sdim if (r_old <= 0) 1090276479Sdim return; 1091276479Sdim 1092276479Sdim mtx_lock(&racct_lock); 1093276479Sdim r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; 1094276479Sdim racct->r_resources[resource] = r_new; 1095276479Sdim mtx_unlock(&racct_lock); 1096276479Sdim} 1097276479Sdim 1098276479Sdimstatic void 1099276479Sdimracct_decay(int resource) 1100276479Sdim{ 1101276479Sdim ui_racct_foreach(racct_decay_resource, &resource, NULL); 1102276479Sdim loginclass_racct_foreach(racct_decay_resource, &resource, NULL); 1103288943Sdim prison_racct_foreach(racct_decay_resource, &resource, NULL); 1104288943Sdim} 1105288943Sdim 1106288943Sdimstatic void 1107288943Sdimracctd(void) 1108288943Sdim{ 1109288943Sdim struct thread *td; 1110288943Sdim struct proc *p; 1111288943Sdim struct timeval wallclock; 1112288943Sdim uint64_t runtime; 1113288943Sdim uint64_t pct, pct_estimate; 1114288943Sdim 1115288943Sdim for (;;) { 1116288943Sdim racct_decay(RACCT_PCTCPU); 1117288943Sdim 1118288943Sdim sx_slock(&allproc_lock); 1119288943Sdim 1120288943Sdim LIST_FOREACH(p, &zombproc, p_list) { 1121288943Sdim PROC_LOCK(p); 1122288943Sdim racct_set(p, RACCT_PCTCPU, 0); 1123288943Sdim PROC_UNLOCK(p); 1124288943Sdim } 1125288943Sdim 1126288943Sdim FOREACH_PROC_IN_SYSTEM(p) { 1127288943Sdim PROC_LOCK(p); 1128288943Sdim if (p->p_state != PRS_NORMAL) { 1129288943Sdim PROC_UNLOCK(p); 1130288943Sdim continue; 1131202878Srdivacky } 1132202878Srdivacky 1133202878Srdivacky microuptime(&wallclock); 1134202878Srdivacky timevalsub(&wallclock, &p->p_stats->p_start); 1135202878Srdivacky PROC_SLOCK(p); 1136202878Srdivacky FOREACH_THREAD_IN_PROC(p, td) 1137226633Sdim ruxagg(p, td); 1138276479Sdim runtime = cputick2usec(p->p_rux.rux_runtime); 1139276479Sdim PROC_SUNLOCK(p); 1140276479Sdim#ifdef notyet 1141224145Sdim KASSERT(runtime >= p->p_prev_runtime, 1142224145Sdim ("runtime < p_prev_runtime")); 1143202878Srdivacky#else 1144226633Sdim if (runtime < p->p_prev_runtime) 1145226633Sdim runtime = 
p->p_prev_runtime; 1146276479Sdim#endif 1147276479Sdim p->p_prev_runtime = runtime; 1148276479Sdim if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { 1149276479Sdim pct_estimate = (1000000 * runtime * 100) / 1150276479Sdim ((uint64_t)wallclock.tv_sec * 1000000 + 1151276479Sdim wallclock.tv_usec); 1152276479Sdim } else 1153296417Sdim pct_estimate = 0; 1154296417Sdim pct = racct_getpcpu(p, pct_estimate); 1155296417Sdim mtx_lock(&racct_lock); 1156276479Sdim racct_set_force_locked(p, RACCT_PCTCPU, pct); 1157327952Sdim racct_set_locked(p, RACCT_CPU, runtime); 1158327952Sdim racct_set_locked(p, RACCT_WALLCLOCK, 1159226633Sdim (uint64_t)wallclock.tv_sec * 1000000 + 1160202878Srdivacky wallclock.tv_usec); 1161327952Sdim mtx_unlock(&racct_lock); 1162276479Sdim PROC_UNLOCK(p); 1163276479Sdim } 1164226633Sdim 1165276479Sdim /* 1166276479Sdim * To ensure that processes are throttled in a fair way, we need 1167276479Sdim * to iterate over all processes again and check the limits 1168276479Sdim * for %cpu resource only after ucred racct containers have been 1169276479Sdim * properly filled. 
1170276479Sdim */ 1171276479Sdim FOREACH_PROC_IN_SYSTEM(p) { 1172276479Sdim PROC_LOCK(p); 1173327952Sdim if (p->p_state != PRS_NORMAL) { 1174276479Sdim PROC_UNLOCK(p); 1175276479Sdim continue; 1176226633Sdim } 1177327952Sdim 1178226633Sdim if (racct_pcpu_available(p) <= 0) 1179276479Sdim racct_proc_throttle(p); 1180234353Sdim else if (p->p_throttled) 1181276479Sdim racct_proc_wakeup(p); 1182327952Sdim PROC_UNLOCK(p); 1183226633Sdim } 1184226633Sdim sx_sunlock(&allproc_lock); 1185276479Sdim pause("-", hz); 1186276479Sdim } 1187276479Sdim} 1188276479Sdim 1189276479Sdimstatic struct kproc_desc racctd_kp = { 1190276479Sdim "racctd", 1191327952Sdim racctd, 1192276479Sdim NULL 1193276479Sdim}; 1194276479SdimSYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, kproc_start, &racctd_kp); 1195276479Sdim 1196276479Sdimstatic void 1197276479Sdimracct_init(void) 1198276479Sdim{ 1199276479Sdim 1200276479Sdim racct_zone = uma_zcreate("racct", sizeof(struct racct), 1201327952Sdim NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 1202276479Sdim /* 1203276479Sdim * XXX: Move this somewhere. 
1204276479Sdim */ 1205276479Sdim prison0.pr_prison_racct = prison_racct_find("0"); 1206276479Sdim} 1207276479SdimSYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL); 1208276479Sdim 1209276479Sdim#else /* !RACCT */ 1210276479Sdim 1211296417Sdimint 1212276479Sdimracct_add(struct proc *p, int resource, uint64_t amount) 1213226633Sdim{ 1214226633Sdim 1215276479Sdim return (0); 1216226633Sdim} 1217226633Sdim 1218276479Sdimvoid 1219226633Sdimracct_add_cred(struct ucred *cred, int resource, uint64_t amount) 1220226633Sdim{ 1221203954Srdivacky} 1222202878Srdivacky 1223309124Sdimvoid 1224202878Srdivackyracct_add_force(struct proc *p, int resource, uint64_t amount) 1225234353Sdim{ 1226234353Sdim 1227234353Sdim return; 1228234353Sdim} 1229234353Sdim 1230234353Sdimint 1231234353Sdimracct_set(struct proc *p, int resource, uint64_t amount) 1232234353Sdim{ 1233234353Sdim 1234234353Sdim return (0); 1235314564Sdim} 1236314564Sdim 1237314564Sdimvoid 1238314564Sdimracct_set_force(struct proc *p, int resource, uint64_t amount) 1239314564Sdim{ 1240202878Srdivacky} 1241314564Sdim 1242314564Sdimvoid 1243224145Sdimracct_sub(struct proc *p, int resource, uint64_t amount) 1244296417Sdim{ 1245296417Sdim} 1246224145Sdim 1247234353Sdimvoid 1248234353Sdimracct_sub_cred(struct ucred *cred, int resource, uint64_t amount) 1249202878Srdivacky{ 1250234353Sdim} 1251234353Sdim 1252234353Sdimuint64_t 1253234353Sdimracct_get_limit(struct proc *p, int resource) 1254234353Sdim{ 1255202878Srdivacky 1256234353Sdim return (UINT64_MAX); 1257296417Sdim} 1258202878Srdivacky 1259288943Sdimuint64_t 1260288943Sdimracct_get_available(struct proc *p, int resource) 1261288943Sdim{ 1262288943Sdim 1263288943Sdim return (UINT64_MAX); 1264288943Sdim} 1265202878Srdivacky 1266221345Sdimvoid 1267221345Sdimracct_create(struct racct **racctp) 1268221345Sdim{ 1269224145Sdim} 1270224145Sdim 1271202878Srdivackyvoid 1272202878Srdivackyracct_destroy(struct racct **racctp) 1273202878Srdivacky{ 1274221345Sdim} 
1275202878Srdivacky 1276202878Srdivackyint 1277202878Srdivackyracct_proc_fork(struct proc *parent, struct proc *child) 1278202878Srdivacky{ 1279202878Srdivacky 1280202878Srdivacky return (0); 1281202878Srdivacky} 1282202878Srdivacky 1283234353Sdimvoid 1284234353Sdimracct_proc_fork_done(struct proc *child) 1285234353Sdim{ 1286202878Srdivacky} 1287202878Srdivacky 1288224145Sdimvoid 1289234353Sdimracct_proc_exit(struct proc *p) 1290288943Sdim{ 1291261991Sdim} 1292288943Sdim 1293261991Sdim#endif /* !RACCT */ 1294288943Sdim