sched_4bsd.c revision 135051
/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/sched_4bsd.c 135051 2004-09-10 21:04:38Z julian $");

#define kse td_sched

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <machine/smp.h>

/*
 * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
 * the range 100-256 Hz (approximately).
 */
#define	ESTCPULIM(e) \
    min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
    RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
#ifdef SMP
#define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
#else
#define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
#endif
#define	NICE_WEIGHT	1		/* Priorities per nice level. */
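/*
 * A rough worked example of the clamp above: on a uniprocessor, and
 * assuming the usual PRIO_MIN/PRIO_MAX of -20/20 and an RQ_PPQ of 4,
 * ESTCPULIM(e) works out to min(e, 8 * (1 * 40 - 4) + 8 - 1), i.e.
 * kg_estcpu is never allowed to grow beyond 295.
 */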
/*
 * The schedulable entity that can be given a context to run.
 * A process may have several of these.  Probably one per processor
 * but possibly a few more.  In this universe they are grouped
 * with a KSEG that contains the priority and niceness
 * for the group.
 */
struct kse {
	TAILQ_ENTRY(kse) ke_kglist;	/* (*) Queue of KSEs in ke_ksegrp. */
	TAILQ_ENTRY(kse) ke_kgrlist;	/* (*) Queue of KSEs in this state. */
	TAILQ_ENTRY(kse) ke_procq;	/* (j/z) Run queue. */
	struct thread	*ke_thread;	/* (*) Active associated thread. */
	fixpt_t		ke_pctcpu;	/* (j) %cpu during p_swtime. */
	u_char		ke_oncpu;	/* (j) Which cpu we are on. */
	char		ke_rqindex;	/* (j) Run queue index. */
	enum {
		KES_THREAD = 0x0,	/* slaved to thread state */
		KES_ONRUNQ
	} ke_state;			/* (j) KSE status. */
	int		ke_cpticks;	/* (j) Ticks of cpu time. */
	struct runq	*ke_runq;	/* runq the kse is currently on */
	int		ke_pinned;	/* nested count of pinned to a cpu */
};

#define	ke_proc		ke_thread->td_proc
#define	ke_ksegrp	ke_thread->td_ksegrp

#define	td_kse		td_sched

/* flags kept in td_flags */
#define	TDF_DIDRUN	TDF_SCHED0	/* KSE actually ran. */
#define	TDF_EXIT	TDF_SCHED1	/* KSE is being killed. */
#define	TDF_BOUND	TDF_SCHED2

#define	ke_flags	ke_thread->td_flags
#define	KEF_DIDRUN	TDF_DIDRUN	/* KSE actually ran. */
#define	KEF_EXIT	TDF_EXIT	/* KSE is being killed. */
#define	KEF_BOUND	TDF_BOUND	/* stuck to one CPU */

#define	SKE_RUNQ_PCPU(ke)						\
    ((ke)->ke_runq != 0 && (ke)->ke_runq != &runq)

struct kg_sched {
	struct thread	*skg_last_assigned; /* (j) Last thread assigned to */
					    /* the system scheduler. */
	int	skg_avail_opennings;	/* (j) Num unfilled slots in group. */
	int	skg_concurrency;	/* (j) Num KSEs requested in group. */
	int	skg_runq_kses;		/* (j) Num KSEs on runq. */
};
#define	kg_last_assigned	kg_sched->skg_last_assigned
#define	kg_avail_opennings	kg_sched->skg_avail_opennings
#define	kg_concurrency		kg_sched->skg_concurrency
#define	kg_runq_kses		kg_sched->skg_runq_kses

/*
 * KSE_CAN_MIGRATE macro returns true if the kse can migrate between
 * cpus.
 */
#define	KSE_CAN_MIGRATE(ke)						\
    ((ke)->ke_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
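/*
 * Note that there are two separate ways for a kse to lose the right
 * to migrate: ke_pinned is a nesting count (presumably maintained by
 * sched_pin()/sched_unpin()) that temporarily nails a thread to its
 * current cpu, while KEF_BOUND is set by sched_bind() further down
 * and stays set until sched_unbind() clears it.
 */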
static struct kse kse0;
static struct kg_sched kg_sched0;

static int	sched_tdcnt;	/* Total runnable threads in the system. */
static int	sched_quantum;	/* Roundrobin scheduling quantum in ticks. */
#define	SCHED_QUANTUM	(hz / 10)	/* Default sched quantum */

static struct callout roundrobin_callout;

static void	slot_fill(struct ksegrp *kg);
static struct kse *sched_choose(void);	/* XXX Should be thread * */

static void	setup_runqs(void);
static void	roundrobin(void *arg);
static void	schedcpu(void);
static void	schedcpu_thread(void);
static void	sched_setup(void *dummy);
static void	maybe_resched(struct thread *td);
static void	updatepri(struct ksegrp *kg);
static void	resetpriority(struct ksegrp *kg);
#ifdef SMP
static int	forward_wakeup(int cpunum);
#endif

static struct kproc_desc sched_kp = {
	"schedcpu",
	schedcpu_thread,
	NULL
};
SYSINIT(schedcpu, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, kproc_start, &sched_kp)
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

/*
 * Global run queue.
 */
static struct runq runq;

#ifdef SMP
/*
 * Per-CPU run queues
 */
static struct runq runq_pcpu[MAXCPU];
#endif

static void
setup_runqs(void)
{
#ifdef SMP
	int i;

	for (i = 0; i < MAXCPU; ++i)
		runq_init(&runq_pcpu[i]);
#endif

	runq_init(&runq);
}

static int
sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
{
	int error, new_val;

	new_val = sched_quantum * tick;
	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val < tick)
		return (EINVAL);
	sched_quantum = new_val / tick;
	hogticks = 2 * sched_quantum;
	return (0);
}

SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");

SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
    "Scheduler name");

SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
    0, sizeof sched_quantum, sysctl_kern_quantum, "I",
    "Roundrobin scheduling quantum in microseconds");
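/*
 * A quick sanity check of the units above: sched_quantum is kept in
 * ticks (SCHED_QUANTUM == hz / 10) and the sysctl handler converts via
 * `tick' (microseconds per tick, 1000000 / hz).  With hz == 1000 that
 * is (1000 / 10) * (1000000 / 1000) == 100000us, so the knob reads
 * back as a 100ms round-robin quantum regardless of hz (modulo
 * rounding).
 */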
#ifdef SMP
/* Enable forwarding of wakeups to all other cpus */
SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL, "Kernel SMP");

static int forward_wakeup_enabled = 1;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
    &forward_wakeup_enabled, 0,
    "Forwarding of wakeup to idle CPUs");

static int forward_wakeups_requested = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
    &forward_wakeups_requested, 0,
    "Requests for forwarding of wakeup to idle CPUs");

static int forward_wakeups_delivered = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
    &forward_wakeups_delivered, 0,
    "Completed forwarding of wakeup to idle CPUs");

static int forward_wakeup_use_mask = 1;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
    &forward_wakeup_use_mask, 0,
    "Use the mask of idle cpus");

static int forward_wakeup_use_loop = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
    &forward_wakeup_use_loop, 0,
    "Use a loop to find idle cpus");

static int forward_wakeup_use_single = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, onecpu, CTLFLAG_RW,
    &forward_wakeup_use_single, 0,
    "Only signal one idle cpu");

static int forward_wakeup_use_htt = 0;
SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, htt2, CTLFLAG_RW,
    &forward_wakeup_use_htt, 0,
    "Account for htt");
#endif

static int sched_followon = 0;
SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
    &sched_followon, 0,
    "Allow threads to share a quantum");

static int sched_pfollowons = 0;
SYSCTL_INT(_kern_sched, OID_AUTO, pfollowons, CTLFLAG_RD,
    &sched_pfollowons, 0,
    "Number of followons done to a different ksegrp");

static int sched_kgfollowons = 0;
SYSCTL_INT(_kern_sched, OID_AUTO, kgfollowons, CTLFLAG_RD,
    &sched_kgfollowons, 0,
    "Number of followons done in a ksegrp");
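/*
 * All of the knobs above are ordinary sysctls, so they can be flipped
 * at runtime with the standard sysctl(8) tool, e.g.:
 *
 *	sysctl kern.sched.ipiwakeup.enabled=0
 *	sysctl kern.sched.followon=1
 *
 * The CTLFLAG_RD counters (requested, delivered, pfollowons and
 * kgfollowons) are read-only statistics.
 */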
/*
 * Arrange to reschedule if necessary, taking the priorities and
 * schedulers into account.
 */
static void
maybe_resched(struct thread *td)
{

	mtx_assert(&sched_lock, MA_OWNED);
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
}

/*
 * Force switch among equal priority processes every 100ms.
 * We don't actually need to force a context switch of the current process.
 * The act of firing the event triggers a context switch to softclock() and
 * then switching back out again which is equivalent to a preemption, thus
 * no further work is needed on the local CPU.
 */
/* ARGSUSED */
static void
roundrobin(void *arg)
{

#ifdef SMP
	mtx_lock_spin(&sched_lock);
	forward_roundrobin();
	mtx_unlock_spin(&sched_lock);
#endif

	callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL);
}

/*
 * Constants for digital decay and forget:
 *	90% of (kg_estcpu) usage in 5 * loadav time
 *	95% of (ke_pctcpu) usage in 60 seconds (load insensitive)
 *	Note that, as ps(1) mentions, this can let percentages
 *	total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that schedclock() updates kg_estcpu and p_cpticks asynchronously.
 *
 * We wish to decay away 90% of kg_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 *	for (i = 0; i < (5 * loadavg); i++)
 *		kg_estcpu *= decay;
 * will compute
 *	kg_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 *	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 *	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 *	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 *	b = 2 * loadavg
 * then
 *	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *	For x close to zero, exp(x) =~ 1 + x, since
 *	    exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *	    therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *	For x close to zero, ln(1+x) =~ x, since
 *	    ln(1+x) = x - x**2/2 + x**3/3 - ...		-1 < x < 1
 *	    therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *	ln(.1) =~ -2.30
 *
 * Proof of (1):
 *	Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	    solving for factor,
 *	    ln(factor) =~ (-2.30/5*loadav), or
 *	    factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *		exp(-1/b) =~ (b-1)/b =~ b/(b+1).			QED
 *
 * Proof of (2):
 *	Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	    solving for power,
 *	    power*ln(b/(b+1)) =~ -2.30, or
 *	    power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.	QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *	loadav: 1	2	3	4
 *	power:	5.68	10.32	14.94	19.55
 */
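/*
 * A worked instance of the above: with loadavg == 2 we get b == 4 and
 * decay == 4/5.  Ten seconds (5 * loadavg) of decay then leaves
 * (4/5)**10 =~ 0.107 of the original kg_estcpu, which is the promised
 * "90% forgotten" to within the accuracy of the approximations (the
 * table above gives the exact power, 10.32, needed to reach 0.1).
 */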
/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `ke_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
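/*
 * To see that ccpu really gives the advertised 95%-in-60s decay:
 * schedcpu() multiplies ke_pctcpu by ccpu once per second, and
 * (exp(-1/20))**60 == exp(-3) =~ 0.0498, so after a minute only ~5%
 * of the old value remains.  With FSHIFT == 11 (FSCALE 2048, the *11*
 * bits mentioned above) the fixed-point constant is
 * 0.95122... * 2048 =~ 1948.
 */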
/*
 * Recompute process priorities, every hz ticks.
 * MP-safe, called without the Giant mutex.
 */
/* ARGSUSED */
static void
schedcpu(void)
{
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct thread *td;
	struct proc *p;
	struct kse *ke;
	struct ksegrp *kg;
	int awake, realstathz;

	realstathz = stathz ? stathz : hz;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		/*
		 * Prevent state changes and protect run queue.
		 */
		mtx_lock_spin(&sched_lock);
		/*
		 * Increment time in/out of memory.  We ignore overflow; with
		 * 16-bit int's (remember them?) overflow takes 45 days.
		 */
		p->p_swtime++;
		FOREACH_KSEGRP_IN_PROC(p, kg) {
			awake = 0;
			FOREACH_THREAD_IN_GROUP(kg, td) {
				ke = td->td_kse;
				/*
				 * Increment sleep time (if sleeping).  We
				 * ignore overflow, as above.
				 */
				/*
				 * The kse slptimes are not touched in wakeup
				 * because the thread may not HAVE a KSE.
				 */
				if (ke->ke_state == KES_ONRUNQ) {
					awake = 1;
					ke->ke_flags &= ~KEF_DIDRUN;
				} else if ((ke->ke_state == KES_THREAD) &&
				    (TD_IS_RUNNING(td))) {
					awake = 1;
					/* Do not clear KEF_DIDRUN */
				} else if (ke->ke_flags & KEF_DIDRUN) {
					awake = 1;
					ke->ke_flags &= ~KEF_DIDRUN;
				}

				/*
				 * ke_pctcpu is only for ps and ttyinfo().
				 * Do it per kse, and add them up at the end?
				 * XXXKSE
				 */
				ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >>
				    FSHIFT;
				/*
				 * If the kse has been idle the entire second,
				 * stop recalculating its priority until
				 * it wakes up.
				 */
				if (ke->ke_cpticks == 0)
					continue;
#if	(FSHIFT >= CCPU_SHIFT)
				ke->ke_pctcpu += (realstathz == 100)
				    ? ((fixpt_t) ke->ke_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t) ke->ke_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
				ke->ke_pctcpu += ((FSCALE - ccpu) *
				    (ke->ke_cpticks *
				    FSCALE / realstathz)) >> FSHIFT;
#endif
				ke->ke_cpticks = 0;
			} /* end of kse loop */
			/*
			 * If there are ANY running threads in this KSEGRP,
			 * then don't count it as sleeping.
			 */
			if (awake) {
				if (kg->kg_slptime > 1) {
					/*
					 * In an ideal world, this should not
					 * happen, because whoever woke us
					 * up from the long sleep should have
					 * unwound the slptime and reset our
					 * priority before we run at the stale
					 * priority.  Should KASSERT at some
					 * point when all the cases are fixed.
					 */
					updatepri(kg);
				}
				kg->kg_slptime = 0;
			} else
				kg->kg_slptime++;
			if (kg->kg_slptime > 1)
				continue;
			kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu);
			resetpriority(kg);
			FOREACH_THREAD_IN_GROUP(kg, td) {
				if (td->td_priority >= PUSER) {
					sched_prio(td, kg->kg_user_pri);
				}
			}
		} /* end of ksegrp loop */
		mtx_unlock_spin(&sched_lock);
	} /* end of process loop */
	sx_sunlock(&allproc_lock);
}

/*
 * Main loop for a kthread that executes schedcpu once a second.
 */
static void
schedcpu_thread(void)
{
	int nowake;

	for (;;) {
		schedcpu();
		tsleep(&nowake, curthread->td_priority, "-", hz);
	}
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max kg_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay kg_estcpu to zero.
 */
static void
updatepri(struct ksegrp *kg)
{
	register fixpt_t loadfac;
	register unsigned int newcpu;

	loadfac = loadfactor(averunnable.ldavg[0]);
	if (kg->kg_slptime > 5 * loadfac)
		kg->kg_estcpu = 0;
	else {
		newcpu = kg->kg_estcpu;
		kg->kg_slptime--;	/* was incremented in schedcpu() */
		while (newcpu && --kg->kg_slptime)
			newcpu = decay_cpu(loadfac, newcpu);
		kg->kg_estcpu = newcpu;
	}
	resetpriority(kg);
}
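/*
 * A rough feel for the numbers: at a steady loadavg of 1.0 the decay
 * factor b/(b+1) is 2/3, and the loop above applies it roughly once
 * per second slept, so even a maximal kg_estcpu of 255 decays to zero
 * after about fourteen seconds of sleep (255 * (2/3)**14 < 1).
 */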
/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
static void
resetpriority(struct ksegrp *kg)
{
	register unsigned int newpriority;
	struct thread *td;

	if (kg->kg_pri_class == PRI_TIMESHARE) {
		newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT +
		    NICE_WEIGHT * (kg->kg_proc->p_nice - PRIO_MIN);
		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
		    PRI_MAX_TIMESHARE);
		kg->kg_user_pri = newpriority;
	}
	FOREACH_THREAD_IN_GROUP(kg, td) {
		maybe_resched(td);			/* XXXKSE silly */
	}
}
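/*
 * A worked example of the formula above (uniprocessor weights, i.e.
 * INVERSE_ESTCPU_WEIGHT == 8 and NICE_WEIGHT == 1): a thread at
 * nice 0 with kg_estcpu == 64 gets PUSER + 64 / 8 + (0 - (-20)) ==
 * PUSER + 28, which is then clamped into the
 * PRI_MIN_TIMESHARE..PRI_MAX_TIMESHARE band.
 */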
/* ARGSUSED */
static void
sched_setup(void *dummy)
{
	setup_runqs();

	if (sched_quantum == 0)
		sched_quantum = SCHED_QUANTUM;
	hogticks = 2 * sched_quantum;

	callout_init(&roundrobin_callout, CALLOUT_MPSAFE);

	/* Kick off timeout driven events by calling first time. */
	roundrobin(NULL);

	/* Account for thread0. */
	sched_tdcnt++;
}

/* External interfaces start here */
/*
 * Very early in the boot some setup of scheduler-specific
 * parts of proc0 and of some scheduler resources needs to be done.
 * Called from:
 *  proc0_init()
 */
void
schedinit(void)
{
	/*
	 * Set up the scheduler specific parts of proc0.
	 */
	proc0.p_sched = NULL;	/* XXX */
	ksegrp0.kg_sched = &kg_sched0;
	thread0.td_sched = &kse0;
	kse0.ke_thread = &thread0;
	kse0.ke_oncpu = NOCPU;	/* wrong.. can we use PCPU(cpuid) yet? */
	kse0.ke_state = KES_THREAD;
	kg_sched0.skg_concurrency = 1;
	kg_sched0.skg_avail_opennings = 0;	/* we are already running */
}

int
sched_runnable(void)
{
#ifdef SMP
	return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
#else
	return runq_check(&runq);
#endif
}

int
sched_rr_interval(void)
{
	if (sched_quantum == 0)
		sched_quantum = SCHED_QUANTUM;
	return (sched_quantum);
}

/*
 * We adjust the priority of the current process.  The priority of
 * a process gets worse as it accumulates CPU time.  The cpu usage
 * estimator (kg_estcpu) is increased here.  resetpriority() will
 * compute a different priority each time kg_estcpu increases by
 * INVERSE_ESTCPU_WEIGHT (until MAXPRI is reached).  The cpu usage
 * estimator ramps up quite quickly when the process is running
 * (linearly), and decays away exponentially, at a rate which is
 * proportionally slower when the system is busy.  The basic principle
 * is that the system will 90% forget that the process used a lot of
 * CPU time in 5 * loadav seconds.  This causes the system to favor
 * processes which haven't run much recently, and to round-robin
 * among other processes.
 */
void
sched_clock(struct thread *td)
{
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	kg = td->td_ksegrp;
	ke = td->td_kse;

	ke->ke_cpticks++;
	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1);
	if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
		resetpriority(kg);
		if (td->td_priority >= PUSER)
			td->td_priority = kg->kg_user_pri;
	}
}
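/*
 * In concrete terms (assuming the traditional stathz of ~128 and the
 * uniprocessor INVERSE_ESTCPU_WEIGHT of 8): a thread that monopolizes
 * a cpu gains one kg_estcpu per statclock tick, so its user priority
 * is re-evaluated, and typically worsened by one step, about every
 * 8 ticks, i.e. every 60-odd milliseconds.
 */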
661104964Sjeff */ 662104964Sjeffvoid 663132372Sjuliansched_exit(struct proc *p, struct thread *td) 664104964Sjeff{ 665132372Sjulian sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td); 666132372Sjulian sched_exit_thread(FIRST_THREAD_IN_PROC(p), td); 667113356Sjeff} 668113356Sjeff 669113356Sjeffvoid 670132372Sjuliansched_exit_ksegrp(struct ksegrp *kg, struct thread *childtd) 671113356Sjeff{ 672113923Sjhb 673113923Sjhb mtx_assert(&sched_lock, MA_OWNED); 674132372Sjulian kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + childtd->td_ksegrp->kg_estcpu); 675104964Sjeff} 676104964Sjeff 677104964Sjeffvoid 678113356Sjeffsched_exit_thread(struct thread *td, struct thread *child) 679104964Sjeff{ 680127894Sdfr if ((child->td_proc->p_flag & P_NOLOAD) == 0) 681125288Sjeff sched_tdcnt--; 682113356Sjeff} 683109145Sjeff 684113356Sjeffvoid 685134791Sjuliansched_fork(struct thread *td, struct thread *childtd) 686113356Sjeff{ 687134791Sjulian sched_fork_ksegrp(td, childtd->td_ksegrp); 688134791Sjulian sched_fork_thread(td, childtd); 689113356Sjeff} 690113356Sjeff 691113356Sjeffvoid 692132372Sjuliansched_fork_ksegrp(struct thread *td, struct ksegrp *child) 693113356Sjeff{ 694113923Sjhb mtx_assert(&sched_lock, MA_OWNED); 695132372Sjulian child->kg_estcpu = td->td_ksegrp->kg_estcpu; 696113356Sjeff} 697109145Sjeff 698113356Sjeffvoid 699134791Sjuliansched_fork_thread(struct thread *td, struct thread *childtd) 700113356Sjeff{ 701134791Sjulian sched_newthread(childtd); 702104964Sjeff} 703104964Sjeff 704104964Sjeffvoid 705130551Sjuliansched_nice(struct proc *p, int nice) 706104964Sjeff{ 707130551Sjulian struct ksegrp *kg; 708113873Sjhb 709130551Sjulian PROC_LOCK_ASSERT(p, MA_OWNED); 710113873Sjhb mtx_assert(&sched_lock, MA_OWNED); 711130551Sjulian p->p_nice = nice; 712130551Sjulian FOREACH_KSEGRP_IN_PROC(p, kg) { 713130551Sjulian resetpriority(kg); 714130551Sjulian } 715104964Sjeff} 716104964Sjeff 717113356Sjeffvoid 718113356Sjeffsched_class(struct ksegrp *kg, int class) 719113356Sjeff{ 720113923Sjhb mtx_assert(&sched_lock, MA_OWNED); 721113356Sjeff kg->kg_pri_class = class; 722113356Sjeff} 723113356Sjeff 724105127Sjulian/* 725105127Sjulian * Adjust the priority of a thread. 726105127Sjulian * This may include moving the thread within the KSEGRP, 727105127Sjulian * changing the assignment of a kse to the thread, 728105127Sjulian * and moving a KSE in the system run queue. 
void
sched_nice(struct proc *p, int nice)
{
	struct ksegrp *kg;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	p->p_nice = nice;
	FOREACH_KSEGRP_IN_PROC(p, kg) {
		resetpriority(kg);
	}
}

void
sched_class(struct ksegrp *kg, int class)
{
	mtx_assert(&sched_lock, MA_OWNED);
	kg->kg_pri_class = class;
}

/*
 * Adjust the priority of a thread.
 * This may include moving the thread within the KSEGRP,
 * changing the assignment of a kse to the thread,
 * and moving a KSE in the system run queue.
 */
void
sched_prio(struct thread *td, u_char prio)
{

	mtx_assert(&sched_lock, MA_OWNED);
	if (TD_ON_RUNQ(td)) {
		adjustrunqueue(td, prio);
	} else {
		td->td_priority = prio;
	}
}

void
sched_sleep(struct thread *td)
{

	mtx_assert(&sched_lock, MA_OWNED);
	td->td_ksegrp->kg_slptime = 0;
	td->td_base_pri = td->td_priority;
}

static void remrunqueue(struct thread *td);

void
sched_switch(struct thread *td, struct thread *newtd, int flags)
{
	struct kse *ke;
	struct ksegrp *kg;
	struct proc *p;

	ke = td->td_kse;
	p = td->td_proc;

	mtx_assert(&sched_lock, MA_OWNED);

	if ((p->p_flag & P_NOLOAD) == 0)
		sched_tdcnt--;

	/*
	 * We are volunteering to switch out, so we get to nominate
	 * a successor for the rest of our quantum.
	 * First try another thread in our ksegrp, and then look for
	 * other ksegrps in our process.
	 */
	if (sched_followon &&
	    (p->p_flag & P_HADTHREADS) &&
	    (flags & SW_VOL) &&
	    newtd == NULL) {
		/* Let's schedule another thread from this process. */
		kg = td->td_ksegrp;
		if ((newtd = TAILQ_FIRST(&kg->kg_runq))) {
			remrunqueue(newtd);
			sched_kgfollowons++;
		} else {
			FOREACH_KSEGRP_IN_PROC(p, kg) {
				if ((newtd = TAILQ_FIRST(&kg->kg_runq))) {
					sched_pfollowons++;
					remrunqueue(newtd);
					break;
				}
			}
		}
	}

	/*
	 * The thread we are about to run needs to be counted as if it had
	 * been added to the run queue and selected.
	 */
	if (newtd) {
		newtd->td_ksegrp->kg_avail_opennings--;
		newtd->td_kse->ke_flags |= KEF_DIDRUN;
		TD_SET_RUNNING(newtd);
		if ((newtd->td_proc->p_flag & P_NOLOAD) == 0)
			sched_tdcnt++;
	}

	td->td_lastcpu = td->td_oncpu;
	td->td_flags &= ~TDF_NEEDRESCHED;
	td->td_pflags &= ~TDP_OWEPREEMPT;
	td->td_oncpu = NOCPU;
	/*
	 * At the last moment, if this thread is still marked RUNNING,
	 * then put it back on the run queue as it has not been suspended
	 * or stopped or any thing else similar.  We never put the idle
	 * threads on the run queue, however.
	 */
	if (td == PCPU_GET(idlethread))
		TD_SET_CAN_RUN(td);
	else {
		td->td_ksegrp->kg_avail_opennings++;
		if (TD_IS_RUNNING(td)) {
			/* Put us back on the run queue (kse and all). */
			setrunqueue(td, SRQ_OURSELF|SRQ_YIELDING);
		} else if (p->p_flag & P_HADTHREADS) {
			/*
			 * We will not be on the run queue.  So we must be
			 * sleeping or similar.  As it's available,
			 * someone else can use the KSE if they need it.
			 */
			slot_fill(td->td_ksegrp);
		}
	}
	if (newtd == NULL)
		newtd = choosethread();
	if (td != newtd)
		cpu_switch(td, newtd);
	sched_lock.mtx_lock = (uintptr_t)td;
	td->td_oncpu = PCPU_GET(cpuid);
}
void
sched_wakeup(struct thread *td)
{
	struct ksegrp *kg;

	mtx_assert(&sched_lock, MA_OWNED);
	kg = td->td_ksegrp;
	if (kg->kg_slptime > 1)
		updatepri(kg);
	kg->kg_slptime = 0;
	setrunqueue(td, SRQ_BORING);
}
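/*
 * A sketch of the idea behind forward_wakeup() below: when a thread
 * becomes runnable it can be cheaper to prod an idle cpu with an
 * IPI_AST than to wait for the current cpu to get around to a
 * reschedule.  The candidate set of idle cpus is computed either from
 * the cached idle_cpus_mask, or by walking the pcpu list, or both
 * (cross-checked against each other), as selected by the
 * kern.sched.ipiwakeup.* knobs above.
 */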
#ifdef SMP
/* Enable HTT_2 if you have a 2-way HTT cpu. */
static int
forward_wakeup(int cpunum)
{
	cpumask_t map, me, dontuse;
	cpumask_t map2;
	struct pcpu *pc;
	cpumask_t id, map3;

	mtx_assert(&sched_lock, MA_OWNED);

	CTR0(KTR_RUNQ, "forward_wakeup()");

	if ((!forward_wakeup_enabled) ||
	    (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
		return (0);
	if (!smp_started || cold || panicstr)
		return (0);

	forward_wakeups_requested++;

	/*
	 * Check the idle mask we received against what we calculated
	 * before in the old version.
	 */
	me = PCPU_GET(cpumask);

	/* Don't bother if we should be doing it ourselves. */
	if ((me & idle_cpus_mask) && (cpunum == NOCPU || me == (1 << cpunum)))
		return (0);

	dontuse = me | stopped_cpus | hlt_cpus_mask;
	map3 = 0;
	if (forward_wakeup_use_loop) {
		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
			id = pc->pc_cpumask;
			if ((id & dontuse) == 0 &&
			    pc->pc_curthread == pc->pc_idlethread) {
				map3 |= id;
			}
		}
	}

	if (forward_wakeup_use_mask) {
		map = 0;
		map = idle_cpus_mask & ~dontuse;

		/* If they are both on, compare and use loop if different. */
		if (forward_wakeup_use_loop) {
			if (map != map3) {
				printf("map (%02X) != map3 (%02X)\n",
				    map, map3);
				map = map3;
			}
		}
	} else {
		map = map3;
	}
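	/*
	 * At this point `map' holds every permissible idle cpu; the
	 * rest of the function narrows it down to the bits we will
	 * actually IPI.  Two bit tricks appear below:
	 * (map & (map >> 1)) & 0x5555 keeps the even-numbered cpu of
	 * each fully idle HTT pair (assuming siblings occupy adjacent
	 * mask bits), and map & ((~map) + 1) isolates the lowest set
	 * bit, courtesy of two's-complement arithmetic.
	 */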
	/* If we only allow a specific CPU, then mask off all the others. */
	if (cpunum != NOCPU) {
		KASSERT((cpunum <= mp_maxcpus),
		    ("forward_wakeup: bad cpunum."));
		map &= (1 << cpunum);
	} else {
		/* Try to choose an idle die. */
		if (forward_wakeup_use_htt) {
			map2 = (map & (map >> 1)) & 0x5555;
			if (map2) {
				map = map2;
			}
		}

		/* Set only one bit. */
		if (forward_wakeup_use_single) {
			map = map & ((~map) + 1);
		}
	}
	if (map) {
		forward_wakeups_delivered++;
		ipi_selected(map, IPI_AST);
		return (1);
	}
	if (cpunum == NOCPU)
		printf("forward_wakeup: Idle processor not found\n");
	return (0);
}
#endif

void
sched_add(struct thread *td, int flags)
{
	struct kse *ke;
#ifdef SMP
	int forwarded = 0;
	int cpu;
#endif

	ke = td->td_kse;
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));

#ifdef SMP
	if (KSE_CAN_MIGRATE(ke)) {
		CTR2(KTR_RUNQ,
		    "sched_add: adding kse:%p (td:%p) to gbl runq", ke, td);
		cpu = NOCPU;
		ke->ke_runq = &runq;
	} else {
		if (!SKE_RUNQ_PCPU(ke))
			ke->ke_runq = &runq_pcpu[(cpu = PCPU_GET(cpuid))];
		else
			cpu = td->td_lastcpu;
		CTR3(KTR_RUNQ,
		    "sched_add: Put kse:%p(td:%p) on cpu%d runq", ke, td, cpu);
	}
#else
	CTR2(KTR_RUNQ, "sched_add: adding kse:%p (td:%p) to runq", ke, td);
	ke->ke_runq = &runq;
#endif
	/*
	 * If we are yielding (on the way out anyhow)
	 * or the thread being saved is US,
	 * then don't try to be smart about preemption
	 * or kicking off another CPU,
	 * as it won't help and may hinder.
	 * In the YIELDING case, we are about to run whoever is
	 * being put in the queue anyhow, and in the
	 * OURSELF case, we are putting ourself on the run queue
	 * which also only happens when we are about to yield.
	 */
	if ((flags & SRQ_YIELDING) == 0) {
#ifdef SMP
		cpumask_t me = PCPU_GET(cpumask);
		int idle = idle_cpus_mask & me;

		/*
		 * Only try to kick off another CPU if
		 * the thread is unpinned
		 * or pinned to another cpu,
		 * and there are other available and idle CPUs.
		 * If we are idle, or it's an interrupt,
		 * then skip straight to preemption.
		 */
		if ((!idle) && ((flags & SRQ_INTR) == 0) &&
		    (idle_cpus_mask & ~(hlt_cpus_mask | me)) &&
		    (KSE_CAN_MIGRATE(ke) ||
		    ke->ke_runq != &runq_pcpu[PCPU_GET(cpuid)])) {
			forwarded = forward_wakeup(cpu);
		}
		/*
		 * If we failed to kick off another cpu, then look to
		 * see if we should preempt this CPU.  Only allow this
		 * if it is not pinned or IS pinned to this CPU.
		 * If we are the idle thread, we also try to preempt,
		 * as it will be quicker and, being idle, we won't
		 * lose in doing so.
		 */
		if ((!forwarded) &&
		    (ke->ke_runq == &runq ||
		    ke->ke_runq == &runq_pcpu[PCPU_GET(cpuid)]))
#endif
		{
			if (maybe_preempt(td))
				return;
		}
	}
	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
		sched_tdcnt++;
	runq_add(ke->ke_runq, ke);
	ke->ke_ksegrp->kg_runq_kses++;
	ke->ke_state = KES_ONRUNQ;
	maybe_resched(td);
}
void
sched_rem(struct thread *td)
{
	struct kse *ke;

	ke = td->td_kse;
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_rem: process swapped out"));
	KASSERT((ke->ke_state == KES_ONRUNQ),
	    ("sched_rem: KSE not on run queue"));
	mtx_assert(&sched_lock, MA_OWNED);

	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
		sched_tdcnt--;
	runq_remove(ke->ke_runq, ke);

	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
}
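/*
 * Pick the best runnable kse.  Note the tie-break in the SMP case
 * below: the per-cpu queue is used only when its best kse has a
 * strictly better (numerically lower) priority than the best kse on
 * the global queue, so ties go to the global queue.
 */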
struct kse *
sched_choose(void)
{
	struct kse *ke;
	struct runq *rq;

#ifdef SMP
	struct kse *kecpu;

	rq = &runq;
	ke = runq_choose(&runq);
	kecpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);

	if (ke == NULL ||
	    (kecpu != NULL &&
	    kecpu->ke_thread->td_priority < ke->ke_thread->td_priority)) {
		CTR2(KTR_RUNQ, "choosing kse %p from pcpu runq %d", kecpu,
		    PCPU_GET(cpuid));
		ke = kecpu;
		rq = &runq_pcpu[PCPU_GET(cpuid)];
	} else {
		CTR1(KTR_RUNQ, "choosing kse %p from main runq", ke);
	}
#else
	rq = &runq;
	ke = runq_choose(&runq);
#endif

	if (ke != NULL) {
		runq_remove(rq, ke);
		ke->ke_state = KES_THREAD;
		ke->ke_ksegrp->kg_runq_kses--;

		KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
		    ("sched_choose: process swapped out"));
	}
	return (ke);
}

void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;

	/*
	 * XXX we cheat slightly on the locking here to avoid locking in
	 * the usual case.  Setting td_priority here is essentially an
	 * incomplete workaround for not setting it properly elsewhere.
	 * Now that some interrupt handlers are threads, not setting it
	 * properly elsewhere can clobber it in the window between setting
	 * it here and returning to user mode, so don't waste time setting
	 * it perfectly here.
	 */
	kg = td->td_ksegrp;
	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		mtx_unlock_spin(&sched_lock);
	}
}

void
sched_bind(struct thread *td, int cpu)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT(TD_IS_RUNNING(td),
	    ("sched_bind: cannot bind non-running thread"));

	ke = td->td_kse;

	ke->ke_flags |= KEF_BOUND;
#ifdef SMP
	ke->ke_runq = &runq_pcpu[cpu];
	if (PCPU_GET(cpuid) == cpu)
		return;

	ke->ke_state = KES_THREAD;

	mi_switch(SW_VOL, NULL);
#endif
}

void
sched_unbind(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	td->td_kse->ke_flags &= ~KEF_BOUND;
}

int
sched_load(void)
{
	return (sched_tdcnt);
}

int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct kse));
}

fixpt_t
sched_pctcpu(struct thread *td)
{
	struct kse *ke;

	ke = td->td_kse;
	return (ke->ke_pctcpu);
}

#define KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"
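/*
 * A note on the textual #include above: kern_switch.c supplies the
 * generic run queue and thread-slot machinery, and compiling it into
 * this file (guarded by KERN_SWITCH_INCLUDE) apparently lets that
 * code see the scheduler-private kse and kg_sched layouts defined at
 * the top of this file.
 */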