/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/kern/sched_4bsd.c 331722 2018-03-29 02:50:57Z eadler $");

#include "opt_hwpmc_hooks.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/turnstile.h>
#include <sys/umtx.h>
#include <machine/pcb.h>
#include <machine/smp.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
int				dtrace_vtime_active;
dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;
#endif

/*
 * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
 * the range 100-256 Hz (approximately).
 */
#define	ESTCPULIM(e) \
    min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
    RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
#ifdef SMP
#define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
#else
#define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
#endif
#define	NICE_WEIGHT		1	/* Priorities per nice level. */
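/*
 * Illustrative note (not part of the original source): on a uniprocessor
 * kernel, INVERSE_ESTCPU_WEIGHT == 8 and NICE_WEIGHT == 1, and assuming the
 * usual values PRIO_MAX - PRIO_MIN == 40 and RQ_PPQ == 4, ESTCPULIM() clamps
 * ts_estcpu to 8 * (1 * 40 - 4) + 8 - 1 == 295.  Eight estcpu "points" then
 * correspond to one step of user priority in resetpriority() below.
 */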

#define	TS_NAME_LEN	(MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))

/*
 * The schedulable entity that runs a context.
 * This is an extension to the thread structure and is tailored to
 * the requirements of this scheduler.
 * All fields are protected by the scheduler lock.
 */
struct td_sched {
	fixpt_t		ts_pctcpu;	/* %cpu during p_swtime. */
	u_int		ts_estcpu;	/* Estimated cpu utilization. */
	int		ts_cpticks;	/* Ticks of cpu time. */
	int		ts_slptime;	/* Seconds !RUNNING. */
	int		ts_slice;	/* Remaining part of time slice. */
	int		ts_flags;
	struct runq	*ts_runq;	/* runq the thread is currently on */
#ifdef KTR
	char		ts_name[TS_NAME_LEN];
#endif
};

/* flags kept in td_flags */
#define	TDF_DIDRUN	TDF_SCHED0	/* thread actually ran. */
#define	TDF_BOUND	TDF_SCHED1	/* Bound to one CPU. */
#define	TDF_SLICEEND	TDF_SCHED2	/* Thread time slice is over. */

/* flags kept in ts_flags */
#define	TSF_AFFINITY	0x0001		/* Has a non-"full" CPU set. */

#define	SKE_RUNQ_PCPU(ts)						\
    ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)

#define	THREAD_CAN_SCHED(td, cpu)	\
    CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)

_Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <=
    sizeof(struct thread0_storage),
    "increase struct thread0_storage.t0st_sched size");

static struct mtx sched_lock;

static int	realstathz = 127; /* stathz is sometimes 0 and run off of hz. */
static int	sched_tdcnt;	/* Total runnable threads in the system. */
static int	sched_slice = 12; /* Thread run time before rescheduling. */

static void	setup_runqs(void);
static void	schedcpu(void);
static void	schedcpu_thread(void);
static void	sched_priority(struct thread *td, u_char prio);
static void	sched_setup(void *dummy);
static void	maybe_resched(struct thread *td);
static void	updatepri(struct thread *td);
static void	resetpriority(struct thread *td);
static void	resetpriority_thread(struct thread *td);
#ifdef SMP
static int	sched_pickcpu(struct thread *td);
static int	forward_wakeup(int cpunum);
static void	kick_other_cpu(int pri, int cpuid);
#endif

static struct kproc_desc sched_kp = {
	"schedcpu",
	schedcpu_thread,
	NULL
};
SYSINIT(schedcpu, SI_SUB_LAST, SI_ORDER_FIRST, kproc_start,
    &sched_kp);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);

static void sched_initticks(void *dummy);
SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
    NULL);

/*
 * Global run queue.
 */
static struct runq runq;

#ifdef SMP
/*
 * Per-CPU run queues
 */
static struct runq runq_pcpu[MAXCPU];
long runq_length[MAXCPU];

static cpuset_t idle_cpus_mask;
#endif

struct pcpuidlestat {
	u_int idlecalls;
	u_int oldidlecalls;
};
static DPCPU_DEFINE(struct pcpuidlestat, idlestat);

static void
setup_runqs(void)
{
#ifdef SMP
	int i;

	for (i = 0; i < MAXCPU; ++i)
		runq_init(&runq_pcpu[i]);
#endif

	runq_init(&runq);
}

static int
sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
{
	int error, new_val, period;

	period = 1000000 / realstathz;
	new_val = period * sched_slice;
	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val <= 0)
		return (EINVAL);
	sched_slice = imax(1, (new_val + period / 2) / period);
	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
	    realstathz);
	return (0);
}

SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");

SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
    "Scheduler name");
SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
    NULL, 0, sysctl_kern_quantum, "I",
    "Quantum for timeshare threads in microseconds");
SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
    "Quantum for timeshare threads in stathz ticks");
#ifdef SMP
/* Enable forwarding of wakeups to all other cpus */
static SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL,
    "Kernel SMP");

static int runq_fuzz = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");

static int forward_wakeup_enabled = 1;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
    &forward_wakeup_enabled, 0,
    "Forwarding of wakeup to idle CPUs");

static int forward_wakeups_requested = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
    &forward_wakeups_requested, 0,
    "Requests for Forwarding of wakeup to idle CPUs");

static int forward_wakeups_delivered = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
    &forward_wakeups_delivered, 0,
    "Completed Forwarding of wakeup to idle CPUs");

static int forward_wakeup_use_mask = 1;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
    &forward_wakeup_use_mask, 0,
    "Use the mask of idle cpus");

static int forward_wakeup_use_loop = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
    &forward_wakeup_use_loop, 0,
    "Use a loop to find idle cpus");

#endif
#if 0
static int sched_followon = 0;
SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
    &sched_followon, 0,
    "allow threads to share a quantum");
#endif

SDT_PROVIDER_DEFINE(sched);

SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *",
    "struct proc *", "uint8_t");
SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *",
    "struct proc *", "void *");
SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *",
    "struct proc *", "void *", "int");
SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *",
    "struct proc *", "uint8_t", "struct thread *");
SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int");
SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *",
    "struct proc *");
SDT_PROBE_DEFINE(sched, , , on__cpu);
SDT_PROBE_DEFINE(sched, , , remain__cpu);
SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *",
    "struct proc *");

static __inline void
sched_load_add(void)
{

	sched_tdcnt++;
	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
	SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt);
}

static __inline void
sched_load_rem(void)
{

	sched_tdcnt--;
	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
	SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt);
}
/*
 * Arrange to reschedule if necessary, taking the priorities and
 * schedulers into account.
 */
static void
maybe_resched(struct thread *td)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
}

/*
 * This function is called when a thread is about to be put on run queue
 * because it has been made runnable or its priority has been adjusted.  It
 * determines if the new thread should preempt the current thread.  If so,
 * it sets td_owepreempt to request a preemption.
 */
int
maybe_preempt(struct thread *td)
{
#ifdef PREEMPTION
	struct thread *ctd;
	int cpri, pri;

	/*
	 * The new thread should not preempt the current thread if any of the
	 * following conditions are true:
	 *
	 *  - The kernel is in the throes of crashing (panicstr).
	 *  - The current thread has a higher (numerically lower) or
	 *    equivalent priority.  Note that this prevents curthread from
	 *    trying to preempt to itself.
	 *  - The current thread has an inhibitor set or is in the process of
	 *    exiting.  In this case, the current thread is about to switch
	 *    out anyways, so there's no point in preempting.  If we did,
	 *    the current thread would not be properly resumed as well, so
	 *    just avoid that whole landmine.
	 *  - If the new thread's priority is not a realtime priority and
	 *    the current thread's priority is not an idle priority and
	 *    FULL_PREEMPTION is disabled.
	 *
	 * If all of these conditions are false, but the current thread is in
	 * a nested critical section, then we have to defer the preemption
	 * until we exit the critical section.  Otherwise, switch immediately
	 * to the new thread.
	 */
	ctd = curthread;
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("maybe_preempt: trying to run inhibited thread"));
	pri = td->td_priority;
	cpri = ctd->td_priority;
	if (panicstr != NULL || pri >= cpri /* || dumping */ ||
	    TD_IS_INHIBITED(ctd))
		return (0);
#ifndef FULL_PREEMPTION
	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
		return (0);
#endif

	CTR0(KTR_PROC, "maybe_preempt: scheduling preemption");
	ctd->td_owepreempt = 1;
	return (1);
#else
	return (0);
#endif
}

/*
 * Constants for digital decay and forget:
 *	90% of (ts_estcpu) usage in 5 * loadav time
 *	95% of (ts_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that schedclock() updates ts_estcpu and p_cpticks asynchronously.
 *
 * We wish to decay away 90% of ts_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		ts_estcpu *= decay;
 * will compute
 * 	ts_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *	For x close to zero, exp(x) =~ 1 + x, since
 *		exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *		therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *	For x close to zero, ln(1+x) =~ x, since
 *		ln(1+x) = x - x**2/2 + x**3/3 - ...	-1 < x < 1
 *		therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *	ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *	ln(factor) =~ (-2.30/5*loadav), or
 *	factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *	    exp(-1/b) =~ (b-1)/b =~ b/(b+1).			QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *	power*ln(b/(b+1)) =~ -2.30, or
 *	power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.	QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *	loadav: 1	2	3	4
 *	power:	5.68	10.32	14.94	19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_UINT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11

/*
 * Recompute process priorities, every hz ticks.
 * MP-safe, called without the Giant mutex.
 */
/* ARGSUSED */
static void
schedcpu(void)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct thread *td;
	struct proc *p;
	struct td_sched *ts;
	int awake;

	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		if (p->p_state == PRS_NEW) {
			PROC_UNLOCK(p);
			continue;
		}
		FOREACH_THREAD_IN_PROC(p, td) {
			awake = 0;
			ts = td_get_sched(td);
			thread_lock(td);
			/*
			 * Increment sleep time (if sleeping).  We
			 * ignore overflow, as above.
			 */
			/*
			 * The td_sched slptimes are not touched in wakeup
			 * because the thread may not HAVE everything in
			 * memory? XXX I think this is out of date.
			 */
			if (TD_ON_RUNQ(td)) {
				awake = 1;
				td->td_flags &= ~TDF_DIDRUN;
			} else if (TD_IS_RUNNING(td)) {
				awake = 1;
				/* Do not clear TDF_DIDRUN */
			} else if (td->td_flags & TDF_DIDRUN) {
				awake = 1;
				td->td_flags &= ~TDF_DIDRUN;
			}

			/*
			 * ts_pctcpu is only for ps and ttyinfo().
			 */
			ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT;
			/*
			 * If the td_sched has been idle the entire second,
			 * stop recalculating its priority until
			 * it wakes up.
			 */
			if (ts->ts_cpticks != 0) {
#if	(FSHIFT >= CCPU_SHIFT)
				ts->ts_pctcpu += (realstathz == 100)
				    ? ((fixpt_t) ts->ts_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t) ts->ts_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
				ts->ts_pctcpu += ((FSCALE - ccpu) *
				    (ts->ts_cpticks *
				    FSCALE / realstathz)) >> FSHIFT;
#endif
				ts->ts_cpticks = 0;
			}
			/*
			 * If there are ANY running threads in this process,
			 * then don't count it as sleeping.
			 * XXX: this is broken.
			 */
			if (awake) {
				if (ts->ts_slptime > 1) {
					/*
					 * In an ideal world, this should not
					 * happen, because whoever woke us
					 * up from the long sleep should have
					 * unwound the slptime and reset our
					 * priority before we run at the stale
					 * priority.  Should KASSERT at some
					 * point when all the cases are fixed.
					 */
					updatepri(td);
				}
				ts->ts_slptime = 0;
			} else
				ts->ts_slptime++;
			if (ts->ts_slptime > 1) {
				thread_unlock(td);
				continue;
			}
			ts->ts_estcpu = decay_cpu(loadfac, ts->ts_estcpu);
			resetpriority(td);
			resetpriority_thread(td);
			thread_unlock(td);
		}
		PROC_UNLOCK(p);
	}
	sx_sunlock(&allproc_lock);
}

/*
 * Main loop for a kthread that executes schedcpu once a second.
 */
static void
schedcpu_thread(void)
{

	for (;;) {
		schedcpu();
		pause("-", hz);
	}
}

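/*
 * Illustrative note (not part of the original source): a worked example of
 * the decay above.  With a load average of 1.0, loadfactor() gives 2.0 and
 * decay_cpu() multiplies ts_estcpu by 2.0 / (2.0 + 1.0) = 2/3 on each
 * once-per-second schedcpu() pass, and (2/3)^5.68 ~= 0.1, matching the
 * "power" table in the big comment block earlier in this file.  updatepri()
 * below applies the same per-second decay for time spent asleep, and simply
 * zeroes ts_estcpu after more than 5 * loadfactor() seconds of sleep.
 */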
/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max ts_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay ts_estcpu to zero.
 */
static void
updatepri(struct thread *td)
{
	struct td_sched *ts;
	fixpt_t loadfac;
	unsigned int newcpu;

	ts = td_get_sched(td);
	loadfac = loadfactor(averunnable.ldavg[0]);
	if (ts->ts_slptime > 5 * loadfac)
		ts->ts_estcpu = 0;
	else {
		newcpu = ts->ts_estcpu;
		ts->ts_slptime--;	/* was incremented in schedcpu() */
		while (newcpu && --ts->ts_slptime)
			newcpu = decay_cpu(loadfac, newcpu);
		ts->ts_estcpu = newcpu;
	}
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
static void
resetpriority(struct thread *td)
{
	u_int newpriority;

	if (td->td_pri_class != PRI_TIMESHARE)
		return;
	newpriority = PUSER +
	    td_get_sched(td)->ts_estcpu / INVERSE_ESTCPU_WEIGHT +
	    NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN);
	newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
	    PRI_MAX_TIMESHARE);
	sched_user_prio(td, newpriority);
}

/*
 * Update the thread's priority when the associated process's user
 * priority changes.
 */
static void
resetpriority_thread(struct thread *td)
{

	/* Only change threads with a time sharing user priority. */
	if (td->td_priority < PRI_MIN_TIMESHARE ||
	    td->td_priority > PRI_MAX_TIMESHARE)
		return;

	/* XXX the whole needresched thing is broken, but not silly. */
	maybe_resched(td);

	sched_prio(td, td->td_user_pri);
}

/* ARGSUSED */
static void
sched_setup(void *dummy)
{

	setup_runqs();

	/* Account for thread0. */
	sched_load_add();
}

/*
 * This routine determines time constants after stathz and hz are setup.
 */
static void
sched_initticks(void *dummy)
{

	realstathz = stathz ? stathz : hz;
	sched_slice = realstathz / 10;	/* ~100ms */
	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
	    realstathz);
}
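/*
 * Illustrative note (not part of the original source): assuming the common
 * defaults stathz == 127 and hz == 1000, sched_initticks() above sets
 * sched_slice = 127 / 10 = 12 stathz ticks, roughly 94ms.  The
 * kern.sched.quantum sysctl handler reports this as 12 * (1000000 / 127)
 * ~= 94488 microseconds, and sched_rr_interval() later in the file converts
 * it back to about 94 hz ticks.
 */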
/* External interfaces start here */

/*
 * Very early in the boot some setup of scheduler-specific
 * parts of proc0 and of some scheduler resources needs to be done.
 * Called from:
 *  proc0_init()
 */
void
schedinit(void)
{

	/*
	 * Set up the scheduler specific parts of thread0.
	 */
	thread0.td_lock = &sched_lock;
	td_get_sched(&thread0)->ts_slice = sched_slice;
	mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
}

int
sched_runnable(void)
{
#ifdef SMP
	return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
#else
	return runq_check(&runq);
#endif
}

int
sched_rr_interval(void)
{

	/* Convert sched_slice from stathz to hz. */
	return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz));
}

/*
 * We adjust the priority of the current process.  The priority of a
 * process gets worse as it accumulates CPU time.  The cpu usage
 * estimator (ts_estcpu) is increased here.  resetpriority() will
 * compute a different priority each time ts_estcpu increases by
 * INVERSE_ESTCPU_WEIGHT (until PRI_MAX_TIMESHARE is reached).  The
 * cpu usage estimator ramps up quite quickly when the process is
 * running (linearly), and decays away exponentially, at a rate which
 * is proportionally slower when the system is busy.  The basic
 * principle is that the system will 90% forget that the process used
 * a lot of CPU time in 5 * loadav seconds.  This causes the system to
 * favor processes which haven't run much recently, and to round-robin
 * among other processes.
 */
void
sched_clock(struct thread *td)
{
	struct pcpuidlestat *stat;
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td_get_sched(td);

	ts->ts_cpticks++;
	ts->ts_estcpu = ESTCPULIM(ts->ts_estcpu + 1);
	if ((ts->ts_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
		resetpriority(td);
		resetpriority_thread(td);
	}

	/*
	 * Force a context switch if the current thread has used up a full
	 * time slice (default is 100ms).
	 */
	if (!TD_IS_IDLETHREAD(td) && --ts->ts_slice <= 0) {
		ts->ts_slice = sched_slice;
		td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND;
	}

	stat = DPCPU_PTR(idlestat);
	stat->oldidlecalls = stat->idlecalls;
	stat->idlecalls = 0;
}
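/*
 * Illustrative note (not part of the original source): each statclock tick
 * spent running adds one to ts_estcpu above, and every INVERSE_ESTCPU_WEIGHT
 * ticks resetpriority() is rerun.  For example, assuming the usual values
 * PUSER == PRI_MIN_TIMESHARE and PRIO_MIN == -20, a nice-0 thread that has
 * accumulated ts_estcpu == 80 on a uniprocessor kernel (weight 8) gets a
 * user priority of PUSER + 80 / 8 + 1 * (0 - (-20)) == PUSER + 30, clamped
 * to the PRI_MIN_TIMESHARE..PRI_MAX_TIMESHARE range.
 */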
/*
 * Charge child's scheduling CPU usage to parent.
 */
void
sched_exit(struct proc *p, struct thread *td)
{

	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "proc exit",
	    "prio:%d", td->td_priority);

	PROC_LOCK_ASSERT(p, MA_OWNED);
	sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{

	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "exit",
	    "prio:%d", child->td_priority);
	thread_lock(td);
	td_get_sched(td)->ts_estcpu = ESTCPULIM(td_get_sched(td)->ts_estcpu +
	    td_get_sched(child)->ts_estcpu);
	thread_unlock(td);
	thread_lock(child);
	if ((child->td_flags & TDF_NOLOAD) == 0)
		sched_load_rem();
	thread_unlock(child);
}

void
sched_fork(struct thread *td, struct thread *childtd)
{
	sched_fork_thread(td, childtd);
}

void
sched_fork_thread(struct thread *td, struct thread *childtd)
{
	struct td_sched *ts, *tsc;

	childtd->td_oncpu = NOCPU;
	childtd->td_lastcpu = NOCPU;
	childtd->td_lock = &sched_lock;
	childtd->td_cpuset = cpuset_ref(td->td_cpuset);
	childtd->td_priority = childtd->td_base_pri;
	ts = td_get_sched(childtd);
	bzero(ts, sizeof(*ts));
	tsc = td_get_sched(td);
	ts->ts_estcpu = tsc->ts_estcpu;
	ts->ts_flags |= (tsc->ts_flags & TSF_AFFINITY);
	ts->ts_slice = 1;
}

void
sched_nice(struct proc *p, int nice)
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	p->p_nice = nice;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		resetpriority(td);
		resetpriority_thread(td);
		thread_unlock(td);
	}
}

void
sched_class(struct thread *td, int class)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_pri_class = class;
}

/*
 * Adjust the priority of a thread.
 */
static void
sched_priority(struct thread *td, u_char prio)
{

	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change",
	    "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED,
	    sched_tdname(curthread));
	SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio);
	if (td != curthread && prio > td->td_priority) {
		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
		    prio, KTR_ATTR_LINKED, sched_tdname(td));
		SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio,
		    curthread);
	}
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	if (td->td_priority == prio)
		return;
	td->td_priority = prio;
	if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) {
		sched_rem(td);
		sched_add(td, SRQ_BORING);
	}
}

/*
 * Update a thread's priority when it is lent another thread's
 * priority.
 */
void
sched_lend_prio(struct thread *td, u_char prio)
{

	td->td_flags |= TDF_BORROWING;
	sched_priority(td, prio);
}

/*
 * Restore a thread's priority when priority propagation is
 * over.  The prio argument is the minimum priority the thread
 * needs to have to satisfy other possible priority lending
 * requests.  If the thread's regular priority is less
 * important than prio the thread will keep a priority boost
 * of prio.
 */
void
sched_unlend_prio(struct thread *td, u_char prio)
{
	u_char base_pri;

	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
	    td->td_base_pri <= PRI_MAX_TIMESHARE)
		base_pri = td->td_user_pri;
	else
		base_pri = td->td_base_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_BORROWING;
		sched_prio(td, base_pri);
	} else
		sched_lend_prio(td, prio);
}
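/*
 * Illustrative note (not part of the original source): a rough example of
 * the lending pair above.  If a thread at priority 160 blocks on a lock
 * whose owner runs at 180, priority propagation calls
 * sched_lend_prio(owner, 160); the owner then runs at 160 with
 * TDF_BORROWING set.  When the boost is no longer needed,
 * sched_unlend_prio() is called with the most important priority still
 * required; if that value is numerically >= the owner's base (user)
 * priority, the borrowing flag is cleared and the base priority restored,
 * otherwise the smaller boost is re-lent.
 */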
void
sched_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	/* First, update the base priority. */
	td->td_base_pri = prio;

	/*
	 * If the thread is borrowing another thread's priority, don't ever
	 * lower the priority.
	 */
	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
		return;

	/* Change the real priority. */
	oldprio = td->td_priority;
	sched_priority(td, prio);

	/*
	 * If the thread is on a turnstile, then let the turnstile update
	 * its state.
	 */
	if (TD_ON_LOCK(td) && oldprio != prio)
		turnstile_adjust(td, oldprio);
}

void
sched_user_prio(struct thread *td, u_char prio)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_base_user_pri = prio;
	if (td->td_lend_user_pri <= prio)
		return;
	td->td_user_pri = prio;
}

void
sched_lend_user_prio(struct thread *td, u_char prio)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_lend_user_pri = prio;
	td->td_user_pri = min(prio, td->td_base_user_pri);
	if (td->td_priority > td->td_user_pri)
		sched_prio(td, td->td_user_pri);
	else if (td->td_priority != td->td_user_pri)
		td->td_flags |= TDF_NEEDRESCHED;
}

void
sched_sleep(struct thread *td, int pri)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_slptick = ticks;
	td_get_sched(td)->ts_slptime = 0;
	if (pri != 0 && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
		sched_prio(td, pri);
	if (TD_IS_SUSPENDED(td) || pri >= PSOCK)
		td->td_flags |= TDF_CANSWAP;
}

void
sched_switch(struct thread *td, struct thread *newtd, int flags)
{
	struct mtx *tmtx;
	struct td_sched *ts;
	struct proc *p;
	int preempted;

	tmtx = NULL;
	ts = td_get_sched(td);
	p = td->td_proc;

	THREAD_LOCK_ASSERT(td, MA_OWNED);

	/*
	 * Switch to the sched lock to fix things up and pick
	 * a new thread.
	 * Block the td_lock in order to avoid breaking the critical path.
	 */
	if (td->td_lock != &sched_lock) {
		mtx_lock_spin(&sched_lock);
		tmtx = thread_lock_block(td);
	}

	if ((td->td_flags & TDF_NOLOAD) == 0)
		sched_load_rem();

	td->td_lastcpu = td->td_oncpu;
	preempted = (td->td_flags & TDF_SLICEEND) == 0 &&
	    (flags & SW_PREEMPT) != 0;
	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
	td->td_owepreempt = 0;
	td->td_oncpu = NOCPU;

	/*
	 * At the last moment, if this thread is still marked RUNNING,
	 * then put it back on the run queue as it has not been suspended
	 * or stopped or any thing else similar.  We never put the idle
	 * threads on the run queue, however.
	 */
	if (td->td_flags & TDF_IDLETD) {
		TD_SET_CAN_RUN(td);
#ifdef SMP
		CPU_CLR(PCPU_GET(cpuid), &idle_cpus_mask);
#endif
	} else {
		if (TD_IS_RUNNING(td)) {
			/* Put us back on the run queue. */
			sched_add(td, preempted ?
			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
			    SRQ_OURSELF|SRQ_YIELDING);
		}
	}
	if (newtd) {
		/*
		 * The thread we are about to run needs to be counted
		 * as if it had been added to the run queue and selected.
		 * It came from:
		 * * A preemption
		 * * An upcall
		 * * A followon
		 */
		KASSERT((newtd->td_inhibitors == 0),
		    ("trying to run inhibited thread"));
		newtd->td_flags |= TDF_DIDRUN;
		TD_SET_RUNNING(newtd);
		if ((newtd->td_flags & TDF_NOLOAD) == 0)
			sched_load_add();
	} else {
		newtd = choosethread();
		MPASS(newtd->td_lock == &sched_lock);
	}

#if (KTR_COMPILE & KTR_SCHED) != 0
	if (TD_IS_IDLETHREAD(td))
		KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle",
		    "prio:%d", td->td_priority);
	else
		KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td),
		    "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
		    "lockname:\"%s\"", td->td_lockname);
#endif

	if (td != newtd) {
#ifdef HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif

		SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc);

		/* I feel sleepy */
		lock_profile_release_lock(&sched_lock.lock_object);
#ifdef KDTRACE_HOOKS
		/*
		 * If DTrace has set the active vtime enum to anything
		 * other than INACTIVE (0), then it should have set the
		 * function to call.
		 */
		if (dtrace_vtime_active)
			(*dtrace_vtime_switch_func)(newtd);
#endif

		cpu_switch(td, newtd, tmtx != NULL ? tmtx : td->td_lock);
		lock_profile_obtain_lock_success(&sched_lock.lock_object,
		    0, 0, __FILE__, __LINE__);
		/*
		 * Where am I?  What year is it?
		 * We are in the same thread that went to sleep above,
		 * but any amount of time may have passed.  All our context
		 * will still be available as will local variables.
		 * PCPU values however may have changed as we may have
		 * changed CPU so don't trust cached values of them.
		 * New threads will go to fork_exit() instead of here
		 * so if you change things here you may need to change
		 * things there too.
		 *
		 * If the thread above was exiting it will never wake
		 * up again here, so either it has saved everything it
		 * needed to, or the thread_wait() or wait() will
		 * need to reap it.
		 */

		SDT_PROBE0(sched, , , on__cpu);
#ifdef HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
#endif
	} else
		SDT_PROBE0(sched, , , remain__cpu);

	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
	    "prio:%d", td->td_priority);

#ifdef SMP
	if (td->td_flags & TDF_IDLETD)
		CPU_SET(PCPU_GET(cpuid), &idle_cpus_mask);
#endif
	sched_lock.mtx_lock = (uintptr_t)td;
	td->td_oncpu = PCPU_GET(cpuid);
	MPASS(td->td_lock == &sched_lock);
}

void
sched_wakeup(struct thread *td)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td_get_sched(td);
	td->td_flags &= ~TDF_CANSWAP;
	if (ts->ts_slptime > 1) {
		updatepri(td);
		resetpriority(td);
	}
	td->td_slptick = 0;
	ts->ts_slptime = 0;
	ts->ts_slice = sched_slice;
	sched_add(td, SRQ_BORING);
}

#ifdef SMP
static int
forward_wakeup(int cpunum)
{
	struct pcpu *pc;
	cpuset_t dontuse, map, map2;
	u_int id, me;
	int iscpuset;

	mtx_assert(&sched_lock, MA_OWNED);

	CTR0(KTR_RUNQ, "forward_wakeup()");

	if ((!forward_wakeup_enabled) ||
	     (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
		return (0);
	if (!smp_started || panicstr)
		return (0);

	forward_wakeups_requested++;

	/*
	 * Check the idle mask we received against what we calculated
	 * before in the old version.
	 */
	me = PCPU_GET(cpuid);

	/* Don't bother if we should be doing it ourself. */
	if (CPU_ISSET(me, &idle_cpus_mask) &&
	    (cpunum == NOCPU || me == cpunum))
		return (0);

	CPU_SETOF(me, &dontuse);
	CPU_OR(&dontuse, &stopped_cpus);
	CPU_OR(&dontuse, &hlt_cpus_mask);
	CPU_ZERO(&map2);
	if (forward_wakeup_use_loop) {
		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
			id = pc->pc_cpuid;
			if (!CPU_ISSET(id, &dontuse) &&
			    pc->pc_curthread == pc->pc_idlethread) {
				CPU_SET(id, &map2);
			}
		}
	}

	if (forward_wakeup_use_mask) {
		map = idle_cpus_mask;
		CPU_NAND(&map, &dontuse);

		/* If they are both on, compare and use loop if different. */
		if (forward_wakeup_use_loop) {
			if (CPU_CMP(&map, &map2)) {
				printf("map != map2, loop method preferred\n");
				map = map2;
			}
		}
	} else {
		map = map2;
	}

	/* If we only allow a specific CPU, then mask off all the others. */
	if (cpunum != NOCPU) {
		KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum."));
		iscpuset = CPU_ISSET(cpunum, &map);
		if (iscpuset == 0)
			CPU_ZERO(&map);
		else
			CPU_SETOF(cpunum, &map);
	}
	if (!CPU_EMPTY(&map)) {
		forward_wakeups_delivered++;
		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
			id = pc->pc_cpuid;
			if (!CPU_ISSET(id, &map))
				continue;
			if (cpu_idle_wakeup(pc->pc_cpuid))
				CPU_CLR(id, &map);
		}
		if (!CPU_EMPTY(&map))
			ipi_selected(map, IPI_AST);
		return (1);
	}
	if (cpunum == NOCPU)
		printf("forward_wakeup: Idle processor not found\n");
	return (0);
}
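/*
 * Illustrative note (not part of the original source): kick_other_cpu()
 * below is the targeted counterpart of forward_wakeup() above.  If the
 * chosen CPU is idle it is simply woken, via cpu_idle_wakeup() or an AST
 * IPI; otherwise it is only interrupted when the new thread's priority
 * beats the remote curthread's, using IPI_PREEMPT (restricted to
 * interrupt-class priorities unless FULL_PREEMPTION) when kernel
 * preemption is compiled in, and TDF_NEEDRESCHED plus a plain AST IPI
 * otherwise.
 */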
static void
kick_other_cpu(int pri, int cpuid)
{
	struct pcpu *pcpu;
	int cpri;

	pcpu = pcpu_find(cpuid);
	if (CPU_ISSET(cpuid, &idle_cpus_mask)) {
		forward_wakeups_delivered++;
		if (!cpu_idle_wakeup(cpuid))
			ipi_cpu(cpuid, IPI_AST);
		return;
	}

	cpri = pcpu->pc_curthread->td_priority;
	if (pri >= cpri)
		return;

#if defined(IPI_PREEMPTION) && defined(PREEMPTION)
#if !defined(FULL_PREEMPTION)
	if (pri <= PRI_MAX_ITHD)
#endif /* ! FULL_PREEMPTION */
	{
		ipi_cpu(cpuid, IPI_PREEMPT);
		return;
	}
#endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */

	pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
	ipi_cpu(cpuid, IPI_AST);
	return;
}
#endif /* SMP */

#ifdef SMP
static int
sched_pickcpu(struct thread *td)
{
	int best, cpu;

	mtx_assert(&sched_lock, MA_OWNED);

	if (td->td_lastcpu != NOCPU && THREAD_CAN_SCHED(td, td->td_lastcpu))
		best = td->td_lastcpu;
	else
		best = NOCPU;
	CPU_FOREACH(cpu) {
		if (!THREAD_CAN_SCHED(td, cpu))
			continue;

		if (best == NOCPU)
			best = cpu;
		else if (runq_length[cpu] < runq_length[best])
			best = cpu;
	}
	KASSERT(best != NOCPU, ("no valid CPUs"));

	return (best);
}
#endif

void
sched_add(struct thread *td, int flags)
#ifdef SMP
{
	cpuset_t tidlemsk;
	struct td_sched *ts;
	u_int cpu, cpuid;
	int forwarded = 0;
	int single_cpu = 0;

	ts = td_get_sched(td);
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("sched_add: trying to run inhibited thread"));
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("sched_add: bad thread state"));
	KASSERT(td->td_flags & TDF_INMEM,
	    ("sched_add: thread swapped out"));

	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
	    sched_tdname(curthread));
	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
	    KTR_ATTR_LINKED, sched_tdname(td));
	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
	    flags & SRQ_PREEMPTED);

	/*
	 * Now that the thread is moving to the run-queue, set the lock
	 * to the scheduler's lock.
	 */
	if (td->td_lock != &sched_lock) {
		mtx_lock_spin(&sched_lock);
		thread_lock_set(td, &sched_lock);
	}
	TD_SET_RUNQ(td);

	/*
	 * If SMP is started and the thread is pinned or otherwise limited to
	 * a specific set of CPUs, queue the thread to a per-CPU run queue.
	 * Otherwise, queue the thread to the global run queue.
	 *
	 * If SMP has not yet been started we must use the global run queue
	 * as per-CPU state may not be initialized yet and we may crash if we
	 * try to access the per-CPU run queues.
	 */
	if (smp_started && (td->td_pinned != 0 || td->td_flags & TDF_BOUND ||
	    ts->ts_flags & TSF_AFFINITY)) {
		if (td->td_pinned != 0)
			cpu = td->td_lastcpu;
		else if (td->td_flags & TDF_BOUND) {
			/* Find CPU from bound runq. */
			KASSERT(SKE_RUNQ_PCPU(ts),
			    ("sched_add: bound td_sched not on cpu runq"));
			cpu = ts->ts_runq - &runq_pcpu[0];
		} else
			/* Find a valid CPU for our cpuset */
			cpu = sched_pickcpu(td);
		ts->ts_runq = &runq_pcpu[cpu];
		single_cpu = 1;
		CTR3(KTR_RUNQ,
		    "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td,
		    cpu);
	} else {
		CTR2(KTR_RUNQ,
		    "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts,
		    td);
		cpu = NOCPU;
		ts->ts_runq = &runq;
	}

	if ((td->td_flags & TDF_NOLOAD) == 0)
		sched_load_add();
	runq_add(ts->ts_runq, td, flags);
	if (cpu != NOCPU)
		runq_length[cpu]++;

	cpuid = PCPU_GET(cpuid);
	if (single_cpu && cpu != cpuid) {
		kick_other_cpu(td->td_priority, cpu);
	} else {
		if (!single_cpu) {
			tidlemsk = idle_cpus_mask;
			CPU_NAND(&tidlemsk, &hlt_cpus_mask);
			CPU_CLR(cpuid, &tidlemsk);

			if (!CPU_ISSET(cpuid, &idle_cpus_mask) &&
			    ((flags & SRQ_INTR) == 0) &&
			    !CPU_EMPTY(&tidlemsk))
				forwarded = forward_wakeup(cpu);
		}

		if (!forwarded) {
			if (!maybe_preempt(td))
				maybe_resched(td);
		}
	}
}
#else /* SMP */
{
	struct td_sched *ts;

	ts = td_get_sched(td);
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("sched_add: trying to run inhibited thread"));
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("sched_add: bad thread state"));
	KASSERT(td->td_flags & TDF_INMEM,
	    ("sched_add: thread swapped out"));
	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
	    sched_tdname(curthread));
	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
	    KTR_ATTR_LINKED, sched_tdname(td));
	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
	    flags & SRQ_PREEMPTED);

	/*
	 * Now that the thread is moving to the run-queue, set the lock
	 * to the scheduler's lock.
	 */
	if (td->td_lock != &sched_lock) {
		mtx_lock_spin(&sched_lock);
		thread_lock_set(td, &sched_lock);
	}
	TD_SET_RUNQ(td);
	CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
	ts->ts_runq = &runq;

	if ((td->td_flags & TDF_NOLOAD) == 0)
		sched_load_add();
	runq_add(ts->ts_runq, td, flags);
	if (!maybe_preempt(td))
		maybe_resched(td);
}
#endif /* SMP */

void
sched_rem(struct thread *td)
{
	struct td_sched *ts;

	ts = td_get_sched(td);
	KASSERT(td->td_flags & TDF_INMEM,
	    ("sched_rem: thread swapped out"));
	KASSERT(TD_ON_RUNQ(td),
	    ("sched_rem: thread not on run queue"));
	mtx_assert(&sched_lock, MA_OWNED);
	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
	    sched_tdname(curthread));
	SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);

	if ((td->td_flags & TDF_NOLOAD) == 0)
		sched_load_rem();
#ifdef SMP
	if (ts->ts_runq != &runq)
		runq_length[ts->ts_runq - runq_pcpu]--;
#endif
	runq_remove(ts->ts_runq, td);
	TD_SET_CAN_RUN(td);
}
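/*
 * Illustrative note (not part of the original source): with the default
 * kern.sched.runq_fuzz of 1, the runq_choose_fuzz() call in sched_choose()
 * below behaves like runq_choose() and simply takes the head of the
 * highest-priority non-empty queue; larger values are intended to let it
 * look at up to that many threads at the head of the queue and prefer one
 * that last ran on the current CPU, trading strict priority order for
 * cache affinity.
 */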
void
sched_preempt(struct thread *td)
{

	SDT_PROBE2(sched, , , surrender, td, td->td_proc);
	thread_lock(td);
	if (td->td_critnest > 1)
		td->td_owepreempt = 1;
	else
		mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, NULL);
	thread_unlock(td);
}

void
sched_userret(struct thread *td)
{
	/*
	 * XXX we cheat slightly on the locking here to avoid locking in
	 * the usual case.  Setting td_priority here is essentially an
	 * incomplete workaround for not setting it properly elsewhere.
	 * Now that some interrupt handlers are threads, not setting it
	 * properly elsewhere can clobber it in the window between setting
	 * it here and returning to user mode, so don't waste time setting
	 * it perfectly here.
	 */
	KASSERT((td->td_flags & TDF_BORROWING) == 0,
	    ("thread with borrowed priority returning to userland"));
	if (td->td_priority != td->td_user_pri) {
		thread_lock(td);
		td->td_priority = td->td_user_pri;
		td->td_base_pri = td->td_user_pri;
		thread_unlock(td);
	}
}

void
sched_bind(struct thread *td, int cpu)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
	KASSERT(td == curthread, ("sched_bind: can only bind curthread"));

	ts = td_get_sched(td);

	td->td_flags |= TDF_BOUND;
#ifdef SMP
	ts->ts_runq = &runq_pcpu[cpu];
	if (PCPU_GET(cpuid) == cpu)
		return;

	mi_switch(SW_VOL, NULL);
#endif
}

void
sched_unbind(struct thread *td)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT(td == curthread, ("sched_unbind: can only unbind curthread"));
	td->td_flags &= ~TDF_BOUND;
}

int
sched_is_bound(struct thread *td)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	return (td->td_flags & TDF_BOUND);
}

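/*
 * A minimal usage sketch for the bind/unbind pair above (illustrative
 * only, compiled out; "target_cpu" is a placeholder variable, not part of
 * this file).  sched_bind() requires the thread lock, operates only on
 * curthread, and may context switch in order to migrate onto the target
 * CPU.
 */
#if 0
	thread_lock(curthread);
	sched_bind(curthread, target_cpu);	/* runs on target_cpu from here */
	thread_unlock(curthread);

	/* ... perform the per-CPU work ... */

	thread_lock(curthread);
	sched_unbind(curthread);		/* allow migration again */
	thread_unlock(curthread);
#endif
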
void
sched_relinquish(struct thread *td)
{
	thread_lock(td);
	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
	thread_unlock(td);
}

int
sched_load(void)
{
	return (sched_tdcnt);
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}

fixpt_t
sched_pctcpu(struct thread *td)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td_get_sched(td);
	return (ts->ts_pctcpu);
}

#ifdef RACCT
/*
 * Calculates the contribution to the thread cpu usage for the latest
 * (unfinished) second.
 */
fixpt_t
sched_pctcpu_delta(struct thread *td)
{
	struct td_sched *ts;
	fixpt_t delta;
	int realstathz;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td_get_sched(td);
	delta = 0;
	realstathz = stathz ? stathz : hz;
	if (ts->ts_cpticks != 0) {
#if	(FSHIFT >= CCPU_SHIFT)
		delta = (realstathz == 100)
		    ? ((fixpt_t) ts->ts_cpticks) <<
		    (FSHIFT - CCPU_SHIFT) :
		    100 * (((fixpt_t) ts->ts_cpticks)
		    << (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
		delta = ((FSCALE - ccpu) *
		    (ts->ts_cpticks *
		    FSCALE / realstathz)) >> FSHIFT;
#endif
	}

	return (delta);
}
#endif

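/*
 * ts_pctcpu (and the delta above) are binary fixed-point fractions scaled
 * by FSCALE (1 << FSHIFT).  As a rough integer conversion, a consumer
 * could derive a whole percentage with something like
 *	pct = (100 * sched_pctcpu(td)) >> FSHIFT;
 * (sketch only); userland consumers typically divide the exported value
 * by the kern.fscale sysctl instead.
 */
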
u_int
sched_estcpu(struct thread *td)
{

	return (td_get_sched(td)->ts_estcpu);
}

/*
 * The actual idle process.
 */
void
sched_idletd(void *dummy)
{
	struct pcpuidlestat *stat;

	THREAD_NO_SLEEPING();
	stat = DPCPU_PTR(idlestat);
	for (;;) {
		mtx_assert(&Giant, MA_NOTOWNED);

		while (sched_runnable() == 0) {
			cpu_idle(stat->idlecalls + stat->oldidlecalls > 64);
			stat->idlecalls++;
		}

		mtx_lock_spin(&sched_lock);
		mi_switch(SW_VOL | SWT_IDLE, NULL);
		mtx_unlock_spin(&sched_lock);
	}
}

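/*
 * The loop above is the per-CPU idle loop: it calls cpu_idle() until
 * sched_runnable() reports work and then switches away voluntarily
 * (SW_VOL | SWT_IDLE).  The "> 64" test is a heuristic: when idle has
 * been entered many times across the current and previous sampling
 * windows (idlecalls/oldidlecalls), cpu_idle() is passed a non-zero
 * "busy" hint so the machine-dependent code can skip more expensive
 * idle-entry work.
 */
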
/*
 * A CPU is entering for the first time or a thread is exiting.
 */
void
sched_throw(struct thread *td)
{
	/*
	 * Correct spinlock nesting.  The idle thread context that we are
	 * borrowing was created so that it would start out with a single
	 * spin lock (sched_lock) held in fork_trampoline().  Since we've
	 * explicitly acquired locks in this function, the nesting count
	 * is now 2 rather than 1.  Since we are nested, calling
	 * spinlock_exit() will simply adjust the counts without allowing
	 * spin lock using code to interrupt us.
	 */
	if (td == NULL) {
		mtx_lock_spin(&sched_lock);
		spinlock_exit();
		PCPU_SET(switchtime, cpu_ticks());
		PCPU_SET(switchticks, ticks);
	} else {
		lock_profile_release_lock(&sched_lock.lock_object);
		MPASS(td->td_lock == &sched_lock);
		td->td_lastcpu = td->td_oncpu;
		td->td_oncpu = NOCPU;
	}
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
	cpu_throw(td, choosethread());	/* doesn't return */
}

void
sched_fork_exit(struct thread *td)
{

	/*
	 * Finish setting up thread glue so that it begins execution in a
	 * non-nested critical section with sched_lock held but not recursed.
	 */
	td->td_oncpu = PCPU_GET(cpuid);
	sched_lock.mtx_lock = (uintptr_t)td;
	lock_profile_obtain_lock_success(&sched_lock.lock_object,
	    0, 0, __FILE__, __LINE__);
	THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);

	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
	    "prio:%d", td->td_priority);
	SDT_PROBE0(sched, , , on__cpu);
}

char *
sched_tdname(struct thread *td)
{
#ifdef KTR
	struct td_sched *ts;

	ts = td_get_sched(td);
	if (ts->ts_name[0] == '\0')
		snprintf(ts->ts_name, sizeof(ts->ts_name),
		    "%s tid %d", td->td_name, td->td_tid);
	return (ts->ts_name);
#else
	return (td->td_name);
#endif
}

#ifdef KTR
void
sched_clear_tdname(struct thread *td)
{
	struct td_sched *ts;

	ts = td_get_sched(td);
	ts->ts_name[0] = '\0';
}
#endif

void
sched_affinity(struct thread *td)
{
#ifdef SMP
	struct td_sched *ts;
	int cpu;

	THREAD_LOCK_ASSERT(td, MA_OWNED);

	/*
	 * Set the TSF_AFFINITY flag if there is at least one CPU this
	 * thread can't run on.
	 */
	ts = td_get_sched(td);
	ts->ts_flags &= ~TSF_AFFINITY;
	CPU_FOREACH(cpu) {
		if (!THREAD_CAN_SCHED(td, cpu)) {
			ts->ts_flags |= TSF_AFFINITY;
			break;
		}
	}

	/*
	 * If this thread can run on all CPUs, nothing else to do.
	 */
	if (!(ts->ts_flags & TSF_AFFINITY))
		return;

	/* Pinned threads and bound threads should be left alone. */
	if (td->td_pinned != 0 || td->td_flags & TDF_BOUND)
		return;

	switch (td->td_state) {
	case TDS_RUNQ:
		/*
		 * If we are on a per-CPU runqueue that is in the set,
		 * then nothing needs to be done.
		 */
		if (ts->ts_runq != &runq &&
		    THREAD_CAN_SCHED(td, ts->ts_runq - runq_pcpu))
			return;

		/* Put this thread on a valid per-CPU runqueue. */
		sched_rem(td);
		sched_add(td, SRQ_BORING);
		break;
	case TDS_RUNNING:
		/*
		 * See if our current CPU is in the set.  If not, force a
		 * context switch.
		 */
		if (THREAD_CAN_SCHED(td, td->td_oncpu))
			return;

		td->td_flags |= TDF_NEEDRESCHED;
		if (td != curthread)
			ipi_cpu(cpu, IPI_AST);
		break;
	default:
		break;
	}
#endif
}

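/*
 * sched_affinity() is invoked after a thread's CPU mask has been changed
 * (e.g. through the cpuset(2) family of calls); it only reacts to the new
 * mask, it does not modify it.  Threads that are sleeping or otherwise
 * not runnable need no action here: sched_add() will place them on a
 * permitted CPU, via sched_pickcpu(), when they next become runnable.
 */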