sched_ule.c revision 177169
/*-
 * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This file implements the ULE scheduler.  ULE supports independent CPU
 * run queues and fine grain locking.  It has superior interactive
 * performance under load even on uni-processor systems.
 *
 * etymology:
 *   ULE is the last three letters in schedule.  It owes its name to a
 * generic user created for a scheduling system by Paul Mikesell at
 * Isilon Systems and a general lack of creativity on the part of the author.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 177169 2008-03-14 15:22:38Z jhb $");

#include "opt_hwpmc_hooks.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/turnstile.h>
#include <sys/umtx.h>
#include <sys/vmmeter.h>
#include <sys/cpuset.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

#include <machine/cpu.h>
#include <machine/smp.h>

#if !defined(__i386__) && !defined(__amd64__) && !defined(__powerpc__) && !defined(__arm__)
#error "This architecture is not currently compatible with ULE"
#endif

#define	KTR_ULE	0

/*
 * Thread scheduler specific section.  All fields are protected
 * by the thread lock.
 */
struct td_sched {
	TAILQ_ENTRY(td_sched) ts_procq;	/* Run queue. */
	struct thread	*ts_thread;	/* Active associated thread. */
	struct runq	*ts_runq;	/* Run-queue we're queued on. */
	short		ts_flags;	/* TSF_* flags. */
	u_char		ts_rqindex;	/* Run queue index. */
	u_char		ts_cpu;		/* CPU that we have affinity for. */
	int		ts_rltick;	/* Real last tick, for affinity. */
	int		ts_slice;	/* Ticks of slice remaining. */
	u_int		ts_slptime;	/* Number of ticks we vol. slept */
	u_int		ts_runtime;	/* Number of ticks we were running */
	int		ts_ltick;	/* Last tick that we were running on */
	int		ts_ftick;	/* First tick that we were running on */
	int		ts_ticks;	/* Tick count */
};
/* flags kept in ts_flags */
#define	TSF_BOUND	0x0001		/* Thread can not migrate. */
#define	TSF_XFERABLE	0x0002		/* Thread was added as transferable. */

static struct td_sched td_sched0;

#define	THREAD_CAN_MIGRATE(td)	((td)->td_pinned == 0)
#define	THREAD_CAN_SCHED(td, cpu)	\
    CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_TICK_SECS:	Number of seconds to average the cpu usage across.
 * SCHED_TICK_TARG:	Number of hz ticks to average the cpu usage across.
 * SCHED_TICK_MAX:	Maximum number of ticks before scaling back.
 * SCHED_TICK_SHIFT:	Shift factor to avoid rounding away results.
 * SCHED_TICK_HZ:	Compute the number of hz ticks for a given ticks count.
 * SCHED_TICK_TOTAL:	Gives the amount of time we've been recording ticks.
 */
#define	SCHED_TICK_SECS		10
#define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
#define	SCHED_TICK_MAX		(SCHED_TICK_TARG + hz)
#define	SCHED_TICK_SHIFT	10
#define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
#define	SCHED_TICK_TOTAL(ts)	(max((ts)->ts_ltick - (ts)->ts_ftick, hz))

/*
 * These macros determine priorities for non-interactive threads.  They are
 * assigned a priority based on their recent cpu utilization as expressed
 * by the ratio of ticks to the tick total.  NHALF priorities at the start
 * and end of the MIN to MAX timeshare range are only reachable with negative
 * or positive nice respectively.
 *
 * PRI_RANGE:	Priority range for utilization dependent priorities.
 * PRI_NRESV:	Number of nice values.
 * PRI_TICKS:	Compute a priority in PRI_RANGE from the ticks count and total.
 * PRI_NICE:	Determines the part of the priority inherited from nice.
 */
#define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
#define	SCHED_PRI_MIN		(PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
#define	SCHED_PRI_MAX		(PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
#define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN)
#define	SCHED_PRI_TICKS(ts)						\
    (SCHED_TICK_HZ((ts)) /						\
    (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
#define	SCHED_PRI_NICE(nice)	(nice)

/*
 * These determine the interactivity of a process.  Interactivity differs from
 * cpu utilization in that it expresses the voluntary time slept vs time ran
 * while cpu utilization includes all time not running.  This more accurately
 * models the intent of the thread.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
#define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(30)

/*
 * tickincr:		Converts a stathz tick into a hz domain scaled by
 *			the shift factor.  Without the shift the error rate
 *			due to rounding would be unacceptably high.
 * realstathz:		stathz is sometimes 0 and run off of hz.
 * sched_slice:		Runtime of each thread before rescheduling.
 * preempt_thresh:	Priority threshold for preemption and remote IPIs.
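 *
 * (Illustrative example, not from the original source: with hz = 1000 and
 * stathz = 128, sched_initticks() computes tickincr = (1000 << 10) / 128 =
 * 8000, so each stathz tick accounts for roughly 7.8 hz ticks once the
 * SCHED_TICK_SHIFT scaling is undone.)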
 */
static int sched_interact = SCHED_INTERACT_THRESH;
static int realstathz;
static int tickincr;
static int sched_slice = 1;
#ifdef PREEMPTION
#ifdef FULL_PREEMPTION
static int preempt_thresh = PRI_MAX_IDLE;
#else
static int preempt_thresh = PRI_MIN_KERN;
#endif
#else
static int preempt_thresh = 0;
#endif
static int static_boost = 1;

/*
 * tdq - per processor runqs and statistics.  All fields are protected by the
 * tdq_lock.  The load and lowpri may be accessed without the lock to avoid
 * excess locking in sched_pickcpu().
 */
struct tdq {
	/* Ordered to improve efficiency of cpu_search() and switch(). */
	struct mtx	tdq_lock;		/* run queue lock. */
	struct cpu_group *tdq_cg;		/* Pointer to cpu topology. */
	int		tdq_load;		/* Aggregate load. */
	int		tdq_sysload;		/* For loadavg, !ITHD load. */
	int		tdq_transferable;	/* Transferable thread count. */
	u_char		tdq_lowpri;		/* Lowest priority thread. */
	u_char		tdq_ipipending;		/* IPI pending. */
	u_char		tdq_idx;		/* Current insert index. */
	u_char		tdq_ridx;		/* Current removal index. */
	struct runq	tdq_realtime;		/* real-time run queue. */
	struct runq	tdq_timeshare;		/* timeshare run queue. */
	struct runq	tdq_idle;		/* Queue of IDLE threads. */
	char		tdq_name[sizeof("sched lock") + 6];
} __aligned(64);

#ifdef SMP
struct cpu_group *cpu_top;

#define	SCHED_AFFINITY_DEFAULT	(max(1, hz / 1000))
#define	SCHED_AFFINITY(ts, t)	((ts)->ts_rltick > ticks - ((t) * affinity))

/*
 * Run-time tunables.
 */
static int rebalance = 1;
static int balance_interval = 128;	/* Default set in sched_initticks(). */
static int affinity;
static int steal_htt = 1;
static int steal_idle = 1;
static int steal_thresh = 2;

/*
 * One thread queue per processor.
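 * (For example, TDQ_CPU(2) below resolves to &tdq_cpu[2] and TDQ_ID()
 * recovers the cpu number by pointer arithmetic; the !SMP variants
 * collapse everything onto the single tdq_cpu instance.)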
 */
static struct tdq	tdq_cpu[MAXCPU];
static struct tdq	*balance_tdq;
static int balance_ticks;

#define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
#define	TDQ_CPU(x)	(&tdq_cpu[(x)])
#define	TDQ_ID(x)	((int)((x) - tdq_cpu))
#else	/* !SMP */
static struct tdq	tdq_cpu;

#define	TDQ_ID(x)	(0)
#define	TDQ_SELF()	(&tdq_cpu)
#define	TDQ_CPU(x)	(&tdq_cpu)
#endif

#define	TDQ_LOCK_ASSERT(t, type)	mtx_assert(TDQ_LOCKPTR((t)), (type))
#define	TDQ_LOCK(t)		mtx_lock_spin(TDQ_LOCKPTR((t)))
#define	TDQ_LOCK_FLAGS(t, f)	mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
#define	TDQ_UNLOCK(t)		mtx_unlock_spin(TDQ_LOCKPTR((t)))
#define	TDQ_LOCKPTR(t)		(&(t)->tdq_lock)

static void sched_priority(struct thread *);
static void sched_thread_priority(struct thread *, u_char);
static int sched_interact_score(struct thread *);
static void sched_interact_update(struct thread *);
static void sched_interact_fork(struct thread *);
static void sched_pctcpu_update(struct td_sched *);

/* Operations on per processor queues */
static struct td_sched * tdq_choose(struct tdq *);
static void tdq_setup(struct tdq *);
static void tdq_load_add(struct tdq *, struct td_sched *);
static void tdq_load_rem(struct tdq *, struct td_sched *);
static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
static inline int sched_shouldpreempt(int, int, int);
void tdq_print(int cpu);
static void runq_print(struct runq *rq);
static void tdq_add(struct tdq *, struct thread *, int);
#ifdef SMP
static int tdq_move(struct tdq *, struct tdq *);
static int tdq_idled(struct tdq *);
static void tdq_notify(struct tdq *, struct td_sched *);
static struct td_sched *tdq_steal(struct tdq *, int);
static struct td_sched *runq_steal(struct runq *, int);
static int sched_pickcpu(struct td_sched *, int);
static void sched_balance(void);
static int sched_balance_pair(struct tdq *, struct tdq *);
static inline struct tdq *sched_setcpu(struct td_sched *, int, int);
static inline struct mtx *thread_block_switch(struct thread *);
static inline void thread_unblock_switch(struct thread *, struct mtx *);
static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int);
#endif

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static void sched_initticks(void *dummy);
SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)

/*
 * Print the threads waiting on a run-queue.
 */
static void
runq_print(struct runq *rq)
{
	struct rqhead *rqh;
	struct td_sched *ts;
	int pri;
	int j;
	int i;

	for (i = 0; i < RQB_LEN; i++) {
		printf("\t\trunq bits %d 0x%zx\n",
		    i, rq->rq_status.rqb_bits[i]);
		for (j = 0; j < RQB_BPW; j++)
			if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
				pri = j + (i << RQB_L2BPW);
				rqh = &rq->rq_queues[pri];
				TAILQ_FOREACH(ts, rqh, ts_procq) {
					printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
					    ts->ts_thread, ts->ts_thread->td_name,
					    ts->ts_thread->td_priority, ts->ts_rqindex, pri);
				}
			}
	}
}

/*
 * Print the status of a per-cpu thread queue.  Should be a ddb show cmd.
 */
void
tdq_print(int cpu)
{
	struct tdq *tdq;

	tdq = TDQ_CPU(cpu);

	printf("tdq %d:\n", TDQ_ID(tdq));
	printf("\tlock %p\n", TDQ_LOCKPTR(tdq));
	printf("\tLock name: %s\n", tdq->tdq_name);
	printf("\tload: %d\n", tdq->tdq_load);
	printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
	printf("\trealtime runq:\n");
	runq_print(&tdq->tdq_realtime);
	printf("\ttimeshare runq:\n");
	runq_print(&tdq->tdq_timeshare);
	printf("\tidle runq:\n");
	runq_print(&tdq->tdq_idle);
	printf("\tload transferable: %d\n", tdq->tdq_transferable);
	printf("\tlowest priority: %d\n", tdq->tdq_lowpri);
}

static inline int
sched_shouldpreempt(int pri, int cpri, int remote)
{
	/*
	 * If the new priority is not better than the current priority there is
	 * nothing to do.
	 */
	if (pri >= cpri)
		return (0);
	/*
	 * Always preempt idle.
	 */
	if (cpri >= PRI_MIN_IDLE)
		return (1);
	/*
	 * If preemption is disabled don't preempt others.
	 */
	if (preempt_thresh == 0)
		return (0);
	/*
	 * Preempt if we exceed the threshold.
	 */
	if (pri <= preempt_thresh)
		return (1);
	/*
	 * If we're realtime or better and there is timeshare or worse running
	 * preempt only remote processors.
	 */
	if (remote && pri <= PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME)
		return (1);
	return (0);
}

#define	TS_RQ_PPQ	(((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
/*
 * Add a thread to the actual run-queue.  Keeps transferable counts up to
 * date with what is actually on the run-queue.  Selects the correct
 * queue position for timeshare threads.
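 *
 * (Illustrative, assuming PRI_MIN_TIMESHARE = 160 and RQ_NQS = 64, hence
 * TS_RQ_PPQ = 1: a priority-170 thread added while tdq_idx is 5 lands in
 * runq slot (10 + 5) % 64 = 15, behind everything queued in earlier slots
 * of the rotating timeshare queue.)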
 */
static __inline void
tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
{
	u_char pri;

	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);

	TD_SET_RUNQ(ts->ts_thread);
	if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
		tdq->tdq_transferable++;
		ts->ts_flags |= TSF_XFERABLE;
	}
	pri = ts->ts_thread->td_priority;
	if (pri <= PRI_MAX_REALTIME) {
		ts->ts_runq = &tdq->tdq_realtime;
	} else if (pri <= PRI_MAX_TIMESHARE) {
		ts->ts_runq = &tdq->tdq_timeshare;
		KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
		    ("Invalid priority %d on timeshare runq", pri));
		/*
		 * This queue contains only priorities between MIN and MAX
		 * timeshare.  Use the whole queue to represent these values.
		 */
		if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
			pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
			pri = (pri + tdq->tdq_idx) % RQ_NQS;
			/*
			 * This effectively shortens the queue by one so we
			 * can have a one slot difference between idx and
			 * ridx while we wait for threads to drain.
			 */
			if (tdq->tdq_ridx != tdq->tdq_idx &&
			    pri == tdq->tdq_ridx)
				pri = (unsigned char)(pri - 1) % RQ_NQS;
		} else
			pri = tdq->tdq_ridx;
		runq_add_pri(ts->ts_runq, ts, pri, flags);
		return;
	} else
		ts->ts_runq = &tdq->tdq_idle;
	runq_add(ts->ts_runq, ts, flags);
}

/*
 * Remove a thread from a run-queue.  This typically happens when a thread
 * is selected to run.  Running threads are not on the queue and the
 * transferable count does not reflect them.
 */
static __inline void
tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
{
	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	KASSERT(ts->ts_runq != NULL,
	    ("tdq_runq_remove: thread %p null ts_runq", ts->ts_thread));
	if (ts->ts_flags & TSF_XFERABLE) {
		tdq->tdq_transferable--;
		ts->ts_flags &= ~TSF_XFERABLE;
	}
	if (ts->ts_runq == &tdq->tdq_timeshare) {
		if (tdq->tdq_idx != tdq->tdq_ridx)
			runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx);
		else
			runq_remove_idx(ts->ts_runq, ts, NULL);
	} else
		runq_remove(ts->ts_runq, ts);
}

/*
 * Load is maintained for all threads RUNNING and ON_RUNQ.  Add the load
 * for this thread to the referenced thread queue.
 */
static void
tdq_load_add(struct tdq *tdq, struct td_sched *ts)
{
	int class;

	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
	class = PRI_BASE(ts->ts_thread->td_pri_class);
	tdq->tdq_load++;
	CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load);
	if (class != PRI_ITHD &&
	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
		tdq->tdq_sysload++;
}

/*
 * Remove the load from a thread that is transitioning to a sleep state or
 * exiting.
 */
static void
tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
{
	int class;

	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	class = PRI_BASE(ts->ts_thread->td_pri_class);
	if (class != PRI_ITHD &&
	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
		tdq->tdq_sysload--;
	KASSERT(tdq->tdq_load != 0,
	    ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
	tdq->tdq_load--;
	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
	ts->ts_runq = NULL;
}

/*
 * Set lowpri to its exact value by searching the run-queue and
 * evaluating curthread.  curthread may be passed as an optimization.
 */
static void
tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
{
	struct td_sched *ts;
	struct thread *td;

	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	if (ctd == NULL)
		ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread;
	ts = tdq_choose(tdq);
	if (ts)
		td = ts->ts_thread;
	if (ts == NULL || td->td_priority > ctd->td_priority)
		tdq->tdq_lowpri = ctd->td_priority;
	else
		tdq->tdq_lowpri = td->td_priority;
}

#ifdef SMP
struct cpu_search {
	cpumask_t cs_mask;	/* Mask of valid cpus. */
	u_int	cs_load;
	u_int	cs_cpu;
	int	cs_limit;	/* Min priority for low, min load for high. */
*/ 521176735Sjeff}; 522176735Sjeff 523176735Sjeff#define CPU_SEARCH_LOWEST 0x1 524176735Sjeff#define CPU_SEARCH_HIGHEST 0x2 525176735Sjeff#define CPU_SEARCH_BOTH (CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST) 526176735Sjeff 527176735Sjeff#define CPUMASK_FOREACH(cpu, mask) \ 528176735Sjeff for ((cpu) = 0; (cpu) < sizeof((mask)) * 8; (cpu)++) \ 529176735Sjeff if ((mask) & 1 << (cpu)) 530176735Sjeff 531177169Sjhbstatic __inline int cpu_search(struct cpu_group *cg, struct cpu_search *low, 532176735Sjeff struct cpu_search *high, const int match); 533176735Sjeffint cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low); 534176735Sjeffint cpu_search_highest(struct cpu_group *cg, struct cpu_search *high); 535176735Sjeffint cpu_search_both(struct cpu_group *cg, struct cpu_search *low, 536176735Sjeff struct cpu_search *high); 537176735Sjeff 538116069Sjeff/* 539176735Sjeff * This routine compares according to the match argument and should be 540176735Sjeff * reduced in actual instantiations via constant propagation and dead code 541176735Sjeff * elimination. 542176735Sjeff */ 543176735Sjeffstatic __inline int 544176735Sjeffcpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high, 545176735Sjeff const int match) 546176735Sjeff{ 547176735Sjeff struct tdq *tdq; 548176735Sjeff 549176735Sjeff tdq = TDQ_CPU(cpu); 550176735Sjeff if (match & CPU_SEARCH_LOWEST) 551176735Sjeff if (low->cs_mask & (1 << cpu) && 552176735Sjeff tdq->tdq_load < low->cs_load && 553176735Sjeff tdq->tdq_lowpri > low->cs_limit) { 554176735Sjeff low->cs_cpu = cpu; 555176735Sjeff low->cs_load = tdq->tdq_load; 556176735Sjeff } 557176735Sjeff if (match & CPU_SEARCH_HIGHEST) 558176735Sjeff if (high->cs_mask & (1 << cpu) && 559176735Sjeff tdq->tdq_load >= high->cs_limit && 560176735Sjeff tdq->tdq_load > high->cs_load && 561176735Sjeff tdq->tdq_transferable) { 562176735Sjeff high->cs_cpu = cpu; 563176735Sjeff high->cs_load = tdq->tdq_load; 564176735Sjeff } 565176735Sjeff return (tdq->tdq_load); 566176735Sjeff} 567176735Sjeff 568176735Sjeff/* 569176735Sjeff * Search the tree of cpu_groups for the lowest or highest loaded cpu 570176735Sjeff * according to the match argument. This routine actually compares the 571176735Sjeff * load on all paths through the tree and finds the least loaded cpu on 572176735Sjeff * the least loaded path, which may differ from the least loaded cpu in 573176735Sjeff * the system. This balances work among caches and busses. 574116069Sjeff * 575176735Sjeff * This inline is instantiated in three forms below using constants for the 576176735Sjeff * match argument. It is reduced to the minimum set for each case. It is 577176735Sjeff * also recursive to the depth of the tree. 
 */
static __inline int
cpu_search(struct cpu_group *cg, struct cpu_search *low,
    struct cpu_search *high, const int match)
{
	int total;

	total = 0;
	if (cg->cg_children) {
		struct cpu_search lgroup;
		struct cpu_search hgroup;
		struct cpu_group *child;
		u_int lload;
		int hload;
		int load;
		int i;

		lload = -1;
		hload = -1;
		for (i = 0; i < cg->cg_children; i++) {
			child = &cg->cg_child[i];
			if (match & CPU_SEARCH_LOWEST) {
				lgroup = *low;
				lgroup.cs_load = -1;
			}
			if (match & CPU_SEARCH_HIGHEST) {
				hgroup = *high;
				hgroup.cs_load = 0;
			}
			switch (match) {
			case CPU_SEARCH_LOWEST:
				load = cpu_search_lowest(child, &lgroup);
				break;
			case CPU_SEARCH_HIGHEST:
				load = cpu_search_highest(child, &hgroup);
				break;
			case CPU_SEARCH_BOTH:
				load = cpu_search_both(child, &lgroup, &hgroup);
				break;
			}
			total += load;
			if (match & CPU_SEARCH_LOWEST)
				if (load < lload || low->cs_cpu == -1) {
					*low = lgroup;
					lload = load;
				}
			if (match & CPU_SEARCH_HIGHEST)
				if (load > hload || high->cs_cpu == -1) {
					hload = load;
					*high = hgroup;
				}
		}
	} else {
		int cpu;

		CPUMASK_FOREACH(cpu, cg->cg_mask)
			total += cpu_compare(cpu, low, high, match);
	}
	return (total);
}

/*
 * cpu_search instantiations must pass constants to maintain the inline
 * optimization.
 */
int
cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low)
{
	return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
}

int
cpu_search_highest(struct cpu_group *cg, struct cpu_search *high)
{
	return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
}

int
cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
    struct cpu_search *high)
{
	return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
}

/*
 * Find the cpu with the least load via the least loaded path that has a
 * lowpri greater than pri.  A pri of -1 indicates any priority is
666176735Sjeff */ 667176735Sjeffstatic inline int 668176735Sjeffsched_lowest(struct cpu_group *cg, cpumask_t mask, int pri) 669176735Sjeff{ 670176735Sjeff struct cpu_search low; 671176735Sjeff 672176735Sjeff low.cs_cpu = -1; 673176735Sjeff low.cs_load = -1; 674176735Sjeff low.cs_mask = mask; 675176735Sjeff low.cs_limit = pri; 676176735Sjeff cpu_search_lowest(cg, &low); 677176735Sjeff return low.cs_cpu; 678176735Sjeff} 679176735Sjeff 680176735Sjeff/* 681176735Sjeff * Find the cpu with the highest load via the highest loaded path. 682176735Sjeff */ 683176735Sjeffstatic inline int 684176735Sjeffsched_highest(struct cpu_group *cg, cpumask_t mask, int minload) 685176735Sjeff{ 686176735Sjeff struct cpu_search high; 687176735Sjeff 688176735Sjeff high.cs_cpu = -1; 689176735Sjeff high.cs_load = 0; 690176735Sjeff high.cs_mask = mask; 691176735Sjeff high.cs_limit = minload; 692176735Sjeff cpu_search_highest(cg, &high); 693176735Sjeff return high.cs_cpu; 694176735Sjeff} 695176735Sjeff 696176735Sjeff/* 697176735Sjeff * Simultaneously find the highest and lowest loaded cpu reachable via 698176735Sjeff * cg. 699176735Sjeff */ 700176735Sjeffstatic inline void 701176735Sjeffsched_both(struct cpu_group *cg, cpumask_t mask, int *lowcpu, int *highcpu) 702176735Sjeff{ 703176735Sjeff struct cpu_search high; 704176735Sjeff struct cpu_search low; 705176735Sjeff 706176735Sjeff low.cs_cpu = -1; 707176735Sjeff low.cs_limit = -1; 708176735Sjeff low.cs_load = -1; 709176735Sjeff low.cs_mask = mask; 710176735Sjeff high.cs_load = 0; 711176735Sjeff high.cs_cpu = -1; 712176735Sjeff high.cs_limit = -1; 713176735Sjeff high.cs_mask = mask; 714176735Sjeff cpu_search_both(cg, &low, &high); 715176735Sjeff *lowcpu = low.cs_cpu; 716176735Sjeff *highcpu = high.cs_cpu; 717176735Sjeff return; 718176735Sjeff} 719176735Sjeff 720121790Sjeffstatic void 721176735Sjeffsched_balance_group(struct cpu_group *cg) 722116069Sjeff{ 723176735Sjeff cpumask_t mask; 724176735Sjeff int high; 725176735Sjeff int low; 726123487Sjeff int i; 727123487Sjeff 728176735Sjeff mask = -1; 729176735Sjeff for (;;) { 730176735Sjeff sched_both(cg, mask, &low, &high); 731176735Sjeff if (low == high || low == -1 || high == -1) 732176735Sjeff break; 733176735Sjeff if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low))) 734176735Sjeff break; 735123487Sjeff /* 736176735Sjeff * If we failed to move any threads determine which cpu 737176735Sjeff * to kick out of the set and try again. 738176735Sjeff */ 739176735Sjeff if (TDQ_CPU(high)->tdq_transferable == 0) 740176735Sjeff mask &= ~(1 << high); 741176735Sjeff else 742176735Sjeff mask &= ~(1 << low); 743123487Sjeff } 744176735Sjeff 745176735Sjeff for (i = 0; i < cg->cg_children; i++) 746176735Sjeff sched_balance_group(&cg->cg_child[i]); 747123487Sjeff} 748123487Sjeff 749123487Sjeffstatic void 750176735Sjeffsched_balance() 751123487Sjeff{ 752172409Sjeff struct tdq *tdq; 753123487Sjeff 754172409Sjeff /* 755172409Sjeff * Select a random time between .5 * balance_interval and 756172409Sjeff * 1.5 * balance_interval. 757172409Sjeff */ 758176735Sjeff balance_ticks = max(balance_interval / 2, 1); 759176735Sjeff balance_ticks += random() % balance_interval; 760171482Sjeff if (smp_started == 0 || rebalance == 0) 761171482Sjeff return; 762172409Sjeff tdq = TDQ_SELF(); 763172409Sjeff TDQ_UNLOCK(tdq); 764176735Sjeff sched_balance_group(cpu_top); 765172409Sjeff TDQ_LOCK(tdq); 766123487Sjeff} 767123487Sjeff 768171482Sjeff/* 769171482Sjeff * Lock two thread queues using their address to maintain lock order. 
 */
static void
tdq_lock_pair(struct tdq *one, struct tdq *two)
{
	if (one < two) {
		TDQ_LOCK(one);
		TDQ_LOCK_FLAGS(two, MTX_DUPOK);
	} else {
		TDQ_LOCK(two);
		TDQ_LOCK_FLAGS(one, MTX_DUPOK);
	}
}

/*
 * Unlock two thread queues.  Order is not important here.
 */
static void
tdq_unlock_pair(struct tdq *one, struct tdq *two)
{
	TDQ_UNLOCK(one);
	TDQ_UNLOCK(two);
}

/*
 * Transfer load between two imbalanced thread queues.
 */
static int
sched_balance_pair(struct tdq *high, struct tdq *low)
{
	int transferable;
	int high_load;
	int low_load;
	int moved;
	int move;
	int diff;
	int i;

	tdq_lock_pair(high, low);
	transferable = high->tdq_transferable;
	high_load = high->tdq_load;
	low_load = low->tdq_load;
	moved = 0;
	/*
	 * Determine what the imbalance is and then adjust that to how many
	 * threads we actually have to give up (transferable).
	 */
	if (transferable != 0) {
		diff = high_load - low_load;
		move = diff / 2;
		if (diff & 0x1)
			move++;
		move = min(move, transferable);
		for (i = 0; i < move; i++)
			moved += tdq_move(high, low);
		/*
		 * IPI the target cpu to force it to reschedule with the new
		 * workload.
		 */
		ipi_selected(1 << TDQ_ID(low), IPI_PREEMPT);
	}
	tdq_unlock_pair(high, low);
	return (moved);
}

/*
 * Move a thread from one thread queue to another.
 */
static int
tdq_move(struct tdq *from, struct tdq *to)
{
	struct td_sched *ts;
	struct thread *td;
	struct tdq *tdq;
	int cpu;

	TDQ_LOCK_ASSERT(from, MA_OWNED);
	TDQ_LOCK_ASSERT(to, MA_OWNED);

	tdq = from;
	cpu = TDQ_ID(to);
	ts = tdq_steal(tdq, cpu);
	if (ts == NULL)
		return (0);
	td = ts->ts_thread;
	/*
	 * Although the run queue is locked the thread may be blocked.  Lock
	 * it to clear this and acquire the run-queue lock.
	 */
	thread_lock(td);
	/* Drop recursive lock on from acquired via thread_lock(). */
	TDQ_UNLOCK(from);
	sched_rem(td);
	ts->ts_cpu = cpu;
	td->td_lock = TDQ_LOCKPTR(to);
	tdq_add(to, td, SRQ_YIELDING);
	return (1);
}

/*
 * This tdq has idled.  Try to steal a thread from another cpu and switch
 * to it.
 */
static int
tdq_idled(struct tdq *tdq)
{
	struct cpu_group *cg;
	struct tdq *steal;
	cpumask_t mask;
	int thresh;
	int cpu;

	if (smp_started == 0 || steal_idle == 0)
		return (1);
	mask = -1;
	mask &= ~PCPU_GET(cpumask);
	/* We don't want to be preempted while we're iterating. */
	spinlock_enter();
	for (cg = tdq->tdq_cg; cg != NULL; ) {
		if ((cg->cg_flags & (CG_FLAG_HTT | CG_FLAG_THREAD)) == 0)
			thresh = steal_thresh;
		else
			thresh = 1;
		cpu = sched_highest(cg, mask, thresh);
		if (cpu == -1) {
			cg = cg->cg_parent;
			continue;
		}
		steal = TDQ_CPU(cpu);
		mask &= ~(1 << cpu);
		tdq_lock_pair(tdq, steal);
		if (steal->tdq_load < thresh || steal->tdq_transferable == 0) {
			tdq_unlock_pair(tdq, steal);
			continue;
		}
		/*
		 * If a thread was added while interrupts were disabled don't
		 * steal one here.  If we fail to acquire one due to affinity
		 * restrictions loop again with this cpu removed from the
		 * set.
		 */
		if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) {
			tdq_unlock_pair(tdq, steal);
			continue;
		}
		spinlock_exit();
		TDQ_UNLOCK(steal);
		mi_switch(SW_VOL, NULL);
		thread_unlock(curthread);

		return (0);
	}
	spinlock_exit();
	return (1);
}

/*
 * Notify a remote cpu of new work.  Sends an IPI if criteria are met.
 */
static void
tdq_notify(struct tdq *tdq, struct td_sched *ts)
{
	int cpri;
	int pri;
	int cpu;

	if (tdq->tdq_ipipending)
		return;
	cpu = ts->ts_cpu;
	pri = ts->ts_thread->td_priority;
	cpri = pcpu_find(cpu)->pc_curthread->td_priority;
	if (!sched_shouldpreempt(pri, cpri, 1))
		return;
	tdq->tdq_ipipending = 1;
	ipi_selected(1 << cpu, IPI_PREEMPT);
}

/*
 * Steals load from a timeshare queue.  Honors the rotating queue head
 * index.
 */
static struct td_sched *
runq_steal_from(struct runq *rq, int cpu, u_char start)
{
	struct td_sched *ts;
	struct rqbits *rqb;
	struct rqhead *rqh;
	int first;
	int bit;
	int pri;
	int i;

	rqb = &rq->rq_status;
	bit = start & (RQB_BPW - 1);
	pri = 0;
	first = 0;
again:
	for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
		if (rqb->rqb_bits[i] == 0)
			continue;
		if (bit != 0) {
			for (pri = bit; pri < RQB_BPW; pri++)
				if (rqb->rqb_bits[i] & (1ul << pri))
					break;
			if (pri >= RQB_BPW)
				continue;
		} else
			pri = RQB_FFS(rqb->rqb_bits[i]);
		pri += (i << RQB_L2BPW);
		rqh = &rq->rq_queues[pri];
		TAILQ_FOREACH(ts, rqh, ts_procq) {
			if (first && THREAD_CAN_MIGRATE(ts->ts_thread) &&
			    THREAD_CAN_SCHED(ts->ts_thread, cpu))
				return (ts);
			first = 1;
		}
	}
	if (start != 0) {
		start = 0;
		goto again;
	}

	return (NULL);
}

/*
 * Steals load from a standard linear queue.
 */
static struct td_sched *
runq_steal(struct runq *rq, int cpu)
{
	struct rqhead *rqh;
	struct rqbits *rqb;
	struct td_sched *ts;
	int word;
	int bit;

	rqb = &rq->rq_status;
	for (word = 0; word < RQB_LEN; word++) {
		if (rqb->rqb_bits[word] == 0)
			continue;
		for (bit = 0; bit < RQB_BPW; bit++) {
			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
				continue;
			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
			TAILQ_FOREACH(ts, rqh, ts_procq)
				if (THREAD_CAN_MIGRATE(ts->ts_thread) &&
				    THREAD_CAN_SCHED(ts->ts_thread, cpu))
					return (ts);
		}
	}
	return (NULL);
}

/*
 * Attempt to steal a thread in priority order from a thread queue.
 */
static struct td_sched *
tdq_steal(struct tdq *tdq, int cpu)
{
	struct td_sched *ts;

	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	if ((ts = runq_steal(&tdq->tdq_realtime, cpu)) != NULL)
		return (ts);
	if ((ts = runq_steal_from(&tdq->tdq_timeshare, cpu, tdq->tdq_ridx))
	    != NULL)
		return (ts);
	return (runq_steal(&tdq->tdq_idle, cpu));
}

/*
 * Sets the thread lock and ts_cpu to match the requested cpu.  Unlocks the
 * current lock and returns with the assigned queue locked.
 */
static inline struct tdq *
sched_setcpu(struct td_sched *ts, int cpu, int flags)
{
	struct thread *td;
	struct tdq *tdq;

	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);

	tdq = TDQ_CPU(cpu);
	td = ts->ts_thread;
	ts->ts_cpu = cpu;

	/* If the lock matches just return the queue. */
	if (td->td_lock == TDQ_LOCKPTR(tdq))
		return (tdq);
#ifdef notyet
	/*
	 * If the thread isn't running its lockptr is a
	 * turnstile or a sleepqueue.  We can just lock_set without
	 * blocking.
	 */
	if (TD_CAN_RUN(td)) {
		TDQ_LOCK(tdq);
		thread_lock_set(td, TDQ_LOCKPTR(tdq));
		return (tdq);
	}
#endif
	/*
	 * The hard case, migration, we need to block the thread first to
	 * prevent order reversals with other cpus locks.
	 */
	thread_lock_block(td);
	TDQ_LOCK(tdq);
	thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
	return (tdq);
}

static int
sched_pickcpu(struct td_sched *ts, int flags)
{
	struct cpu_group *cg;
	struct thread *td;
	struct tdq *tdq;
	cpumask_t mask;
	int self;
	int pri;
	int cpu;

	self = PCPU_GET(cpuid);
	td = ts->ts_thread;
	if (smp_started == 0)
		return (self);
	/*
	 * Don't migrate a running thread from sched_switch().
	 */
	if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td))
		return (ts->ts_cpu);
	/*
	 * Prefer to run interrupt threads on the processors that generate
	 * the interrupt.
	 */
	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
	    curthread->td_intr_nesting_level)
		ts->ts_cpu = self;
	/*
	 * If the thread can run on the last cpu and the affinity has not
	 * expired or it is idle run it there.
	 */
	pri = td->td_priority;
	tdq = TDQ_CPU(ts->ts_cpu);
	if (THREAD_CAN_SCHED(td, ts->ts_cpu)) {
		if (tdq->tdq_lowpri > PRI_MIN_IDLE)
			return (ts->ts_cpu);
		if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri)
			return (ts->ts_cpu);
	}
	/*
	 * Search for the highest level in the tree that still has affinity.
	 */
	cg = NULL;
	for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent)
		if (SCHED_AFFINITY(ts, cg->cg_level))
			break;
	cpu = -1;
	mask = td->td_cpuset->cs_mask.__bits[0];
	if (cg)
		cpu = sched_lowest(cg, mask, pri);
	if (cpu == -1)
		cpu = sched_lowest(cpu_top, mask, -1);
	/*
	 * Compare the lowest loaded cpu to current cpu.
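	 * Fall back to self if this thread may run here, doing so would
	 * preempt a lower priority thread, and the cpu found above is not
	 * actually idle.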
	 */
	if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri &&
	    TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE)
		cpu = self;
	KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu."));
	return (cpu);
}
#endif

/*
 * Pick the highest priority task we have and return it.
 */
static struct td_sched *
tdq_choose(struct tdq *tdq)
{
	struct td_sched *ts;

	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	ts = runq_choose(&tdq->tdq_realtime);
	if (ts != NULL)
		return (ts);
	ts = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
	if (ts != NULL) {
		KASSERT(ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE,
		    ("tdq_choose: Invalid priority on timeshare queue %d",
		    ts->ts_thread->td_priority));
		return (ts);
	}

	ts = runq_choose(&tdq->tdq_idle);
	if (ts != NULL) {
		KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE,
		    ("tdq_choose: Invalid priority on idle queue %d",
		    ts->ts_thread->td_priority));
		return (ts);
	}

	return (NULL);
}

/*
 * Initialize a thread queue.
 */
static void
tdq_setup(struct tdq *tdq)
{

	if (bootverbose)
		printf("ULE: setup cpu %d\n", TDQ_ID(tdq));
	runq_init(&tdq->tdq_realtime);
	runq_init(&tdq->tdq_timeshare);
	runq_init(&tdq->tdq_idle);
	snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
	    "sched lock %d", (int)TDQ_ID(tdq));
	mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock",
	    MTX_SPIN | MTX_RECURSE);
}

#ifdef SMP
static void
sched_setup_smp(void)
{
	struct tdq *tdq;
	int i;

	cpu_top = smp_topo();
	for (i = 0; i < MAXCPU; i++) {
		if (CPU_ABSENT(i))
			continue;
		tdq = TDQ_CPU(i);
		tdq_setup(tdq);
		tdq->tdq_cg = smp_topo_find(cpu_top, i);
		if (tdq->tdq_cg == NULL)
			panic("Can't find cpu group for %d\n", i);
	}
	balance_tdq = TDQ_SELF();
	sched_balance();
}
#endif

/*
 * Setup the thread queues and initialize the topology based on MD
 * information.
 */
static void
sched_setup(void *dummy)
{
	struct tdq *tdq;

	tdq = TDQ_SELF();
#ifdef SMP
	sched_setup_smp();
#else
	tdq_setup(tdq);
#endif
	/*
	 * To avoid divide-by-zero, we set realstathz to a dummy value
	 * in case sched_clock() is called before sched_initticks().
	 */
	realstathz = hz;
	sched_slice = (realstathz/10);	/* ~100ms */
	tickincr = 1 << SCHED_TICK_SHIFT;

	/* Add thread0's load since it's running. */
	TDQ_LOCK(tdq);
	thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
	tdq_load_add(tdq, &td_sched0);
	tdq->tdq_lowpri = thread0.td_priority;
	TDQ_UNLOCK(tdq);
}

/*
 * This routine determines the tickincr after stathz and hz are setup.
 */
/* ARGSUSED */
static void
sched_initticks(void *dummy)
{
	int incr;

	realstathz = stathz ? stathz : hz;
	sched_slice = (realstathz/10);	/* ~100ms */

	/*
	 * tickincr is shifted out by 10 to avoid rounding errors due to
	 * hz not being evenly divisible by stathz on all platforms.
	 */
	incr = (hz << SCHED_TICK_SHIFT) / realstathz;
	/*
	 * This does not work for values of stathz that are more than
	 * 1 << SCHED_TICK_SHIFT * hz.  In practice this does not happen.
	 */
	if (incr == 0)
		incr = 1;
	tickincr = incr;
#ifdef SMP
	/*
	 * Set the default balance interval now that we know
	 * what realstathz is.
	 */
	balance_interval = realstathz;
	/*
	 * Set steal thresh to log2(mp_ncpu) but no greater than 3.  This
	 * prevents excess thrashing on large machines and excess idle on
	 * smaller machines.
	 */
	steal_thresh = min(ffs(mp_ncpus) - 1, 3);
	affinity = SCHED_AFFINITY_DEFAULT;
#endif
}


/*
 * This is the core of the interactivity algorithm.  Determines a score based
 * on past behavior.  It is the ratio of sleep time to run time scaled to
 * a [0, 100] integer.  This is the voluntary sleep time of a process, which
 * differs from the cpu usage because it does not account for time spent
 * waiting on a run-queue.  Would be prettier if we had floating point.
 */
static int
sched_interact_score(struct thread *td)
{
	struct td_sched *ts;
	int div;

	ts = td->td_sched;
	/*
	 * The score is only needed if this is likely to be an interactive
	 * task.  Don't go through the expense of computing it if there's
	 * no chance.
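	 * (Illustrative: with the default threshold of 30, a thread that has
	 * run at least as long as it has slept can never score below
	 * SCHED_INTERACT_HALF (50), so the exact value is irrelevant.)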
	 */
	if (sched_interact <= SCHED_INTERACT_HALF &&
	    ts->ts_runtime >= ts->ts_slptime)
		return (SCHED_INTERACT_HALF);

	if (ts->ts_runtime > ts->ts_slptime) {
		div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
	}
	if (ts->ts_slptime > ts->ts_runtime) {
		div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
		return (ts->ts_runtime / div);
	}
	/* runtime == slptime */
	if (ts->ts_runtime)
		return (SCHED_INTERACT_HALF);

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);
}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct thread *td)
{
	int score;
	int pri;

	if (td->td_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * If the score is interactive we place the thread in the realtime
	 * queue with a priority that is less than kernel and interrupt
	 * priorities.  These threads are not subject to nice restrictions.
	 *
	 * Scores greater than this are placed on the normal timeshare queue
	 * where the priority is partially decided by the most recent cpu
	 * utilization and the rest is decided by nice value.
	 *
	 * The nice value of the process has a linear effect on the calculated
	 * score.  Negative nice values make it easier for a thread to be
	 * considered interactive.
	 */
	score = imax(0, sched_interact_score(td) - td->td_proc->p_nice);
	if (score < sched_interact) {
		pri = PRI_MIN_REALTIME;
		pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
		    * score;
		KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
		    ("sched_priority: invalid interactive priority %d score %d",
		    pri, score));
	} else {
		pri = SCHED_PRI_MIN;
		if (td->td_sched->ts_ticks)
			pri += SCHED_PRI_TICKS(td->td_sched);
		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
		KASSERT(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE,
		    ("sched_priority: invalid priority %d: nice %d, "
		    "ticks %d ftick %d ltick %d tick pri %d",
		    pri, td->td_proc->p_nice, td->td_sched->ts_ticks,
		    td->td_sched->ts_ftick, td->td_sched->ts_ltick,
		    SCHED_PRI_TICKS(td->td_sched)));
	}
	sched_user_prio(td, pri);

	return;
}

/*
 * This routine enforces a maximum limit on the amount of scheduling history
 * kept.  It is called after either the slptime or runtime is adjusted.  This
 * function is ugly due to integer math.
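 *
 * (Illustrative walk of the cases below: once ts_runtime + ts_slptime
 * first exceeds SCHED_SLP_RUN_MAX both values are scaled by 4/5; past
 * 6/5ths of the limit they are halved; past twice the limit the larger
 * value is clamped to the maximum and the smaller to 1.)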
 */
static void
sched_interact_update(struct thread *td)
{
	struct td_sched *ts;
	u_int sum;

	ts = td->td_sched;
	sum = ts->ts_runtime + ts->ts_slptime;
	if (sum < SCHED_SLP_RUN_MAX)
		return;
	/*
	 * This only happens from two places:
	 * 1) We have added an unusual amount of run time from fork_exit.
	 * 2) We have added an unusual amount of sleep time from sched_sleep().
	 */
	if (sum > SCHED_SLP_RUN_MAX * 2) {
		if (ts->ts_runtime > ts->ts_slptime) {
			ts->ts_runtime = SCHED_SLP_RUN_MAX;
			ts->ts_slptime = 1;
		} else {
			ts->ts_slptime = SCHED_SLP_RUN_MAX;
			ts->ts_runtime = 1;
		}
		return;
	}
	/*
	 * If we have exceeded by more than 1/5th then the algorithm below
	 * will not bring us back into range.  Dividing by two here forces
	 * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX]
	 */
	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
		ts->ts_runtime /= 2;
		ts->ts_slptime /= 2;
		return;
	}
	ts->ts_runtime = (ts->ts_runtime / 5) * 4;
	ts->ts_slptime = (ts->ts_slptime / 5) * 4;
}

/*
 * Scale back the interactivity history when a child thread is created.  The
 * history is inherited from the parent but the thread may behave totally
 * differently.  For example, a shell spawning a compiler process.  We want
 * to learn that the compiler is behaving badly very quickly.
 */
static void
sched_interact_fork(struct thread *td)
{
	int ratio;
	int sum;

	sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime;
	if (sum > SCHED_SLP_RUN_FORK) {
		ratio = sum / SCHED_SLP_RUN_FORK;
		td->td_sched->ts_runtime /= ratio;
		td->td_sched->ts_slptime /= ratio;
	}
}

/*
 * Called from proc0_init() to setup the scheduler fields.
 */
void
schedinit(void)
{

	/*
	 * Set up the scheduler specific parts of proc0.
	 */
	proc0.p_sched = NULL; /* XXX */
	thread0.td_sched = &td_sched0;
	td_sched0.ts_ltick = ticks;
	td_sched0.ts_ftick = ticks;
	td_sched0.ts_thread = &thread0;
	td_sched0.ts_slice = sched_slice;
}

/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most sched_slice stathz ticks.
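 *
 * (Illustrative, assuming hz = 1000 and realstathz = 128: sched_slice is
 * 12 stathz ticks, so the value returned is 1000 / (128 / 12) = 100 hz
 * ticks, roughly the intended 100ms round-robin interval.)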
/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most sched_slice stathz ticks.
 */
int
sched_rr_interval(void)
{

	/* Convert sched_slice to hz */
	return (hz/(realstathz/sched_slice));
}

/*
 * Update the percent cpu tracking information when it is requested or
 * the total history exceeds the maximum.  We keep a sliding history of
 * tick counts that slowly decays.  This is less precise than the 4BSD
 * mechanism since it happens with less regular and frequent events.
 */
static void
sched_pctcpu_update(struct td_sched *ts)
{

	if (ts->ts_ticks == 0)
		return;
	if (ticks - (hz / 10) < ts->ts_ltick &&
	    SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX)
		return;
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */
	if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
		ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
			    SCHED_TICK_TARG;
	else
		ts->ts_ticks = 0;
	ts->ts_ltick = ticks;
	ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
}

/*
 * Adjust the priority of a thread.  Move it to the appropriate run-queue
 * if necessary.  This is the back-end for several priority related
 * functions.
 */
static void
sched_thread_priority(struct thread *td, u_char prio)
{
	struct td_sched *ts;
	struct tdq *tdq;
	int oldpri;

	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
	    td, td->td_name, td->td_priority, prio, curthread,
	    curthread->td_name);
	ts = td->td_sched;
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	if (td->td_priority == prio)
		return;

	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
		/*
		 * If the priority has been elevated due to priority
		 * propagation, we may have to move ourselves to a new
		 * queue.  This could be optimized to not re-add in some
		 * cases.
		 */
		sched_rem(td);
		td->td_priority = prio;
		sched_add(td, SRQ_BORROWING);
		return;
	}
	tdq = TDQ_CPU(ts->ts_cpu);
	oldpri = td->td_priority;
	td->td_priority = prio;
	if (TD_IS_RUNNING(td)) {
		if (prio < tdq->tdq_lowpri)
			tdq->tdq_lowpri = prio;
		else if (tdq->tdq_lowpri == oldpri)
			tdq_setlowpri(tdq, td);
	}
}
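/*
 * sched_thread_priority() above has two distinct cases.  If the thread
 * is waiting on a run queue and its priority improves (numerically
 * drops), e.g. because a turnstile lent it a waiter's priority, it must
 * be dequeued and re-added with SRQ_BORROWING so that it lands on the
 * correct, possibly realtime, runq.  If the thread is running, only the
 * queue's cached tdq_lowpri must be kept consistent, since remote cpus
 * consult it when making preemption decisions.
 */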
/*
 * Update a thread's priority when it is lent another thread's
 * priority.
 */
void
sched_lend_prio(struct thread *td, u_char prio)
{

	td->td_flags |= TDF_BORROWING;
	sched_thread_priority(td, prio);
}

/*
 * Restore a thread's priority when priority propagation is
 * over.  The prio argument is the minimum priority the thread
 * needs to have to satisfy other possible priority lending
 * requests.  If the thread's regular priority is less
 * important than prio, the thread will keep a priority boost
 * of prio.
 */
void
sched_unlend_prio(struct thread *td, u_char prio)
{
	u_char base_pri;

	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
	    td->td_base_pri <= PRI_MAX_TIMESHARE)
		base_pri = td->td_user_pri;
	else
		base_pri = td->td_base_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_BORROWING;
		sched_thread_priority(td, base_pri);
	} else
		sched_lend_prio(td, prio);
}

/*
 * Standard entry for setting the priority to an absolute value.
 */
void
sched_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	/* First, update the base priority. */
	td->td_base_pri = prio;

	/*
	 * If the thread is borrowing another thread's priority, don't
	 * ever lower the priority.
	 */
	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
		return;

	/* Change the real priority. */
	oldprio = td->td_priority;
	sched_thread_priority(td, prio);

	/*
	 * If the thread is on a turnstile, then let the turnstile update
	 * its state.
	 */
	if (TD_ON_LOCK(td) && oldprio != prio)
		turnstile_adjust(td, oldprio);
}
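/*
 * Hypothetical propagation showing how the entry points above compose
 * (the priority values are only illustrative): thread A holds a mutex
 * and runs at timeshare priority 180 when thread B at priority 80
 * blocks on that mutex.  The turnstile code calls
 * sched_lend_prio(A, 80), setting TDF_BORROWING.  When A releases the
 * mutex, sched_unlend_prio(A, prio) is handed the minimum priority
 * still required by any remaining waiters; it either clears the loan
 * and restores A's base priority through sched_thread_priority() or
 * re-lends at the new minimum.
 */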
/*
 * Set the base user priority; this does not affect the current running
 * priority.
 */
void
sched_user_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	td->td_base_user_pri = prio;
	if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
		return;
	oldprio = td->td_user_pri;
	td->td_user_pri = prio;
}

void
sched_lend_user_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	td->td_flags |= TDF_UBORROWING;
	oldprio = td->td_user_pri;
	td->td_user_pri = prio;
}

void
sched_unlend_user_prio(struct thread *td, u_char prio)
{
	u_char base_pri;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	base_pri = td->td_base_user_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_UBORROWING;
		sched_user_prio(td, base_pri);
	} else {
		sched_lend_user_prio(td, prio);
	}
}

/*
 * Add the thread passed as 'newtd' to the run queue before selecting
 * the next thread to run.  This is only used for KSE.
 */
static void
sched_switchin(struct tdq *tdq, struct thread *td)
{
#ifdef SMP
	spinlock_enter();
	TDQ_UNLOCK(tdq);
	thread_lock(td);
	spinlock_exit();
	sched_setcpu(td->td_sched, TDQ_ID(tdq), SRQ_YIELDING);
#else
	td->td_lock = TDQ_LOCKPTR(tdq);
#endif
	tdq_add(tdq, td, SRQ_YIELDING);
	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
}

/*
 * Block a thread for switching.  Similar to thread_block() but does not
 * bump the spin count.
 */
static inline struct mtx *
thread_block_switch(struct thread *td)
{
	struct mtx *lock;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	lock = td->td_lock;
	td->td_lock = &blocked_lock;
	mtx_unlock_spin(lock);

	return (lock);
}
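/*
 * thread_block_switch() and its counterpart thread_unblock_switch()
 * (below) bracket the window in which a thread's lock pointer is
 * unstable.  The usage pattern, sketched:
 *
 *	mtx = thread_block_switch(td);
 *		(td_lock now points at blocked_lock, so thread_lock()
 *		 callers on other cpus spin instead of proceeding)
 *	... requeue td on another tdq ...
 *	thread_unblock_switch(td, mtx);
 *		(a release store publishes a stable lock pointer again)
 */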
/*
 * Handle migration from sched_switch().  This happens only for
 * cpu binding.
 */
static struct mtx *
sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
{
	struct tdq *tdn;

	tdn = TDQ_CPU(td->td_sched->ts_cpu);
#ifdef SMP
	tdq_load_rem(tdq, td->td_sched);
	/*
	 * Do the lock dance required to avoid LOR.  We grab an extra
	 * spinlock nesting to prevent preemption while we're
	 * not holding either run-queue lock.
	 */
	spinlock_enter();
	thread_block_switch(td);	/* This releases the lock on tdq. */
	TDQ_LOCK(tdn);
	tdq_add(tdn, td, flags);
	tdq_notify(tdn, td->td_sched);
	/*
	 * After we unlock tdn the new cpu still can't switch into this
	 * thread until we've unblocked it in cpu_switch().  The lock
	 * pointers may match in the case of HTT cores.  Don't unlock here
	 * or we can deadlock when the other CPU runs the IPI handler.
	 */
	if (TDQ_LOCKPTR(tdn) != TDQ_LOCKPTR(tdq)) {
		TDQ_UNLOCK(tdn);
		TDQ_LOCK(tdq);
	}
	spinlock_exit();
#endif
	return (TDQ_LOCKPTR(tdn));
}

/*
 * Release a thread that was blocked with thread_block_switch().
 */
static inline void
thread_unblock_switch(struct thread *td, struct mtx *mtx)
{
	atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
	    (uintptr_t)mtx);
}

/*
 * Switch threads.  This function has to handle threads coming in while
 * blocked for some reason, running, or idle.  It also must deal with
 * migrating a thread from one queue to another as running threads may
 * be assigned elsewhere via binding.
 */
void
sched_switch(struct thread *td, struct thread *newtd, int flags)
{
	struct tdq *tdq;
	struct td_sched *ts;
	struct mtx *mtx;
	int srqflag;
	int cpuid;

	THREAD_LOCK_ASSERT(td, MA_OWNED);

	cpuid = PCPU_GET(cpuid);
	tdq = TDQ_CPU(cpuid);
	ts = td->td_sched;
	mtx = td->td_lock;
	ts->ts_rltick = ticks;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;
	td->td_owepreempt = 0;
	/*
	 * The lock pointer in an idle thread should never change.  Reset it
	 * to CAN_RUN as well.
	 */
	if (TD_IS_IDLETHREAD(td)) {
		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
		TD_SET_CAN_RUN(td);
	} else if (TD_IS_RUNNING(td)) {
		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
		srqflag = (flags & SW_PREEMPT) ?
		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
		    SRQ_OURSELF|SRQ_YIELDING;
		if (ts->ts_cpu == cpuid)
			tdq_runq_add(tdq, ts, srqflag);
		else
			mtx = sched_switch_migrate(tdq, td, srqflag);
	} else {
		/* This thread must be going to sleep. */
		TDQ_LOCK(tdq);
		mtx = thread_block_switch(td);
		tdq_load_rem(tdq, ts);
	}
	/*
	 * We enter here with the thread blocked and assigned to the
	 * appropriate cpu run-queue or sleep-queue and with the current
	 * thread-queue locked.
	 */
	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
	/*
	 * If KSE assigned a new thread just add it here and let choosethread
	 * select the best one.
	 */
	if (newtd != NULL)
		sched_switchin(tdq, newtd);
	newtd = choosethread();
	/*
	 * Call the MD code to switch contexts if necessary.
	 */
	if (td != newtd) {
#ifdef HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif
		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
		TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
		cpu_switch(td, newtd, mtx);
		/*
		 * We may return from cpu_switch on a different cpu.  However,
		 * we always return with td_lock pointing to the current cpu's
		 * run queue lock.
		 */
		cpuid = PCPU_GET(cpuid);
		tdq = TDQ_CPU(cpuid);
		lock_profile_obtain_lock_success(
		    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
#ifdef HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
#endif
	} else
		thread_unblock_switch(td, mtx);
	/*
	 * We should always get here with the lowest priority td possible.
	 */
	tdq->tdq_lowpri = td->td_priority;
	/*
	 * Assert that all went well and return.
	 */
	TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED);
	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
	td->td_oncpu = cpuid;
}

/*
 * Adjust thread priorities as a result of a nice request.
 */
void
sched_nice(struct proc *p, int nice)
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	PROC_SLOCK_ASSERT(p, MA_OWNED);

	p->p_nice = nice;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		sched_priority(td);
		sched_prio(td, td->td_base_user_pri);
		thread_unlock(td);
	}
}

/*
 * Record the sleep time for the interactivity scorer.
 */
void
sched_sleep(struct thread *td, int prio)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);

	td->td_slptick = ticks;
	if (TD_IS_SUSPENDED(td) || prio <= PSOCK)
		td->td_flags |= TDF_CANSWAP;
	if (static_boost && prio)
		sched_prio(td, prio);
}
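/*
 * Example: a thread blocking in msleep() with a priority argument of
 * PSOCK arrives here with prio == PSOCK, so it becomes eligible for
 * swap-out (TDF_CANSWAP) and, when the static_boost sysctl is enabled,
 * is boosted to that static kernel priority for the duration of the
 * sleep, mirroring 4BSD's static sleep priorities.  sched_userret()
 * strips any such boost before the thread returns to user-space.
 */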
/*
 * Schedule a thread to resume execution and record how long it voluntarily
 * slept.  We also update the pctcpu, interactivity, and priority.
 */
void
sched_wakeup(struct thread *td)
{
	struct td_sched *ts;
	int slptick;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td->td_sched;
	td->td_flags &= ~TDF_CANSWAP;
	/*
	 * If we slept for more than a tick update our interactivity and
	 * priority.
	 */
	slptick = td->td_slptick;
	td->td_slptick = 0;
	if (slptick && slptick != ticks) {
		u_int hzticks;

		hzticks = (ticks - slptick) << SCHED_TICK_SHIFT;
		ts->ts_slptime += hzticks;
		sched_interact_update(td);
		sched_pctcpu_update(ts);
	}
	/* Reset the slice value after we sleep. */
	ts->ts_slice = sched_slice;
	sched_add(td, SRQ_BORING);
}

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct thread *td, struct thread *child)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	sched_fork_thread(td, child);
	/*
	 * Penalize the parent and child for forking.
	 */
	sched_interact_fork(child);
	sched_priority(child);
	td->td_sched->ts_runtime += tickincr;
	sched_interact_update(td);
	sched_priority(td);
}

/*
 * Fork a new thread; it may be within the same process.
 */
void
sched_fork_thread(struct thread *td, struct thread *child)
{
	struct td_sched *ts;
	struct td_sched *ts2;

	/*
	 * Initialize child.
	 */
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	sched_newthread(child);
	child->td_lock = TDQ_LOCKPTR(TDQ_SELF());
	child->td_cpuset = cpuset_ref(td->td_cpuset);
	ts = td->td_sched;
	ts2 = child->td_sched;
	ts2->ts_cpu = ts->ts_cpu;
	ts2->ts_runq = NULL;
	/*
	 * Grab our parent's cpu estimation information and priority.
	 */
	ts2->ts_ticks = ts->ts_ticks;
	ts2->ts_ltick = ts->ts_ltick;
	ts2->ts_ftick = ts->ts_ftick;
	child->td_user_pri = td->td_user_pri;
	child->td_base_user_pri = td->td_base_user_pri;
	/*
	 * And update interactivity score.
	 */
	ts2->ts_slptime = ts->ts_slptime;
	ts2->ts_runtime = ts->ts_runtime;
	ts2->ts_slice = 1;	/* Attempt to quickly learn interactivity. */
}
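/*
 * Worked example of the fork path above: a long-running interactive
 * shell has a large, sleep-dominated history.  Its child inherits that
 * history verbatim in sched_fork_thread(), after which
 * sched_interact_fork() divides both components by
 * sum / SCHED_SLP_RUN_FORK when the total is large, so an exec'd
 * compiler only needs a short cpu burst before its own behaviour
 * dominates the score.  The single-tick ts_slice serves the same goal:
 * the child is re-evaluated after one tick instead of a full slice.
 */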
/*
 * Adjust the priority class of a thread.
 */
void
sched_class(struct thread *td, int class)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	if (td->td_pri_class == class)
		return;
	/*
	 * On SMP if we're on the RUNQ we must adjust the transferable
	 * count because we could be changing to or from an interrupt
	 * class.
	 */
	if (TD_ON_RUNQ(td)) {
		struct tdq *tdq;

		tdq = TDQ_CPU(td->td_sched->ts_cpu);
		if (THREAD_CAN_MIGRATE(td))
			tdq->tdq_transferable--;
		td->td_pri_class = class;
		if (THREAD_CAN_MIGRATE(td))
			tdq->tdq_transferable++;
	}
	td->td_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct thread *child)
{
	struct thread *td;

	CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
	    child, child->td_name, child->td_priority);

	PROC_SLOCK_ASSERT(p, MA_OWNED);
	td = FIRST_THREAD_IN_PROC(p);
	sched_exit_thread(td, child);
}

/*
 * Penalize another thread for the time spent on this one.  This helps to
 * worsen the priority and interactivity of processes which schedule batch
 * jobs such as make.  This has little effect on the make process itself but
 * causes new processes spawned by it to receive worse scores immediately.
 */
void
sched_exit_thread(struct thread *td, struct thread *child)
{

	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
	    child, child->td_name, child->td_priority);

	/*
	 * Give the child's runtime to the parent without returning the
	 * sleep time as a penalty to the parent.  This causes shells that
	 * launch expensive things to mark their children as expensive.
	 */
	thread_lock(td);
	td->td_sched->ts_runtime += child->td_sched->ts_runtime;
	sched_interact_update(td);
	sched_priority(td);
	thread_unlock(td);
}

void
sched_preempt(struct thread *td)
{
	struct tdq *tdq;

	thread_lock(td);
	tdq = TDQ_SELF();
	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	tdq->tdq_ipipending = 0;
	if (td->td_priority > tdq->tdq_lowpri) {
		if (td->td_critnest > 1)
			td->td_owepreempt = 1;
		else
			mi_switch(SW_INVOL | SW_PREEMPT, NULL);
	}
	thread_unlock(td);
}
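/*
 * sched_preempt() above is the service routine for the preemption IPI
 * raised by tdq_notify().  A typical remote wakeup runs: cpu0 enqueues
 * a high-priority thread on cpu1's tdq, sees that cpu1's tdq_lowpri is
 * worse, and IPIs cpu1; cpu1 lands here, clears tdq_ipipending, and
 * either switches immediately or, inside a critical section, defers by
 * setting td_owepreempt.
 */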
/*
 * Fix priorities on return to user-space.  Priorities may be elevated due
 * to static priorities in msleep() or similar.
 */
void
sched_userret(struct thread *td)
{
	/*
	 * XXX we cheat slightly on the locking here to avoid locking in
	 * the usual case.  Setting td_priority here is essentially an
	 * incomplete workaround for not setting it properly elsewhere.
	 * Now that some interrupt handlers are threads, not setting it
	 * properly elsewhere can clobber it in the window between setting
	 * it here and returning to user mode, so don't waste time setting
	 * it perfectly here.
	 */
	KASSERT((td->td_flags & TDF_BORROWING) == 0,
	    ("thread with borrowed priority returning to userland"));
	if (td->td_priority != td->td_user_pri) {
		thread_lock(td);
		td->td_priority = td->td_user_pri;
		td->td_base_pri = td->td_user_pri;
		tdq_setlowpri(TDQ_SELF(), td);
		thread_unlock(td);
	}
}

/*
 * Handle a stathz tick.  This is really only relevant for timeshare
 * threads.
 */
void
sched_clock(struct thread *td)
{
	struct tdq *tdq;
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	tdq = TDQ_SELF();
#ifdef SMP
	/*
	 * We run the long term load balancer infrequently on the first cpu.
	 */
	if (balance_tdq == tdq) {
		if (balance_ticks && --balance_ticks == 0)
			sched_balance();
	}
#endif
	/*
	 * Advance the insert index once for each tick to ensure that all
	 * threads get a chance to run.
	 */
	if (tdq->tdq_idx == tdq->tdq_ridx) {
		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
			tdq->tdq_ridx = tdq->tdq_idx;
	}
	ts = td->td_sched;
	if (td->td_pri_class & PRI_FIFO_BIT)
		return;
	if (td->td_pri_class == PRI_TIMESHARE) {
		/*
		 * We used a tick; charge it to the thread so
		 * that we can compute our interactivity.
		 */
		td->td_sched->ts_runtime += tickincr;
		sched_interact_update(td);
		sched_priority(td);
	}
	/*
	 * We used up one time slice.
	 */
	if (--ts->ts_slice > 0)
		return;
	/*
	 * We're out of time, force a requeue at userret().
	 */
	ts->ts_slice = sched_slice;
	td->td_flags |= TDF_NEEDRESCHED;
}
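/*
 * Two details of sched_clock() above are worth calling out.  The
 * timeshare runq is a calendar queue: advancing tdq_idx each stathz
 * tick rotates the insertion point, so even a nice +20 thread is
 * eventually reached by tdq_ridx.  And slices are counted in stathz
 * ticks; with the default sched_slice (on the order of a tenth of a
 * second), a purely cpu-bound thread sees TDF_NEEDRESCHED roughly ten
 * times per second.
 */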
/*
 * Called once per hz tick.  Used for cpu utilization information.  This
 * is easier than trying to scale based on stathz.
 */
void
sched_tick(void)
{
	struct td_sched *ts;

	ts = curthread->td_sched;
	/* Adjust ticks for pctcpu */
	ts->ts_ticks += 1 << SCHED_TICK_SHIFT;
	ts->ts_ltick = ticks;
	/*
	 * Update if we've exceeded our desired tick threshold by over one
	 * second.
	 */
	if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick)
		sched_pctcpu_update(ts);
}

/*
 * Return whether the current CPU has runnable tasks.  Used for in-kernel
 * cooperative idle threads.
 */
int
sched_runnable(void)
{
	struct tdq *tdq;
	int load;

	load = 1;

	tdq = TDQ_SELF();
	if ((curthread->td_flags & TDF_IDLETD) != 0) {
		if (tdq->tdq_load > 0)
			goto out;
	} else
		if (tdq->tdq_load - 1 > 0)
			goto out;
	load = 0;
out:
	return (load);
}

/*
 * Choose the highest priority thread to run.  The thread is removed from
 * the run-queue while running; however, the load remains.  For SMP we set
 * the tdq in the global idle bitmask if it idles here.
 */
struct thread *
sched_choose(void)
{
	struct td_sched *ts;
	struct tdq *tdq;

	tdq = TDQ_SELF();
	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	ts = tdq_choose(tdq);
	if (ts) {
		ts->ts_ltick = ticks;
		tdq_runq_rem(tdq, ts);
		return (ts->ts_thread);
	}
	return (PCPU_GET(idlethread));
}

/*
 * Set owepreempt if necessary.  Preemption never happens directly in ULE,
 * we always request it once we exit a critical section.
 */
static inline void
sched_setpreempt(struct thread *td)
{
	struct thread *ctd;
	int cpri;
	int pri;

	THREAD_LOCK_ASSERT(curthread, MA_OWNED);

	ctd = curthread;
	pri = td->td_priority;
	cpri = ctd->td_priority;
	if (pri < cpri)
		ctd->td_flags |= TDF_NEEDRESCHED;
	if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
		return;
	if (!sched_shouldpreempt(pri, cpri, 0))
		return;
	ctd->td_owepreempt = 1;
}
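/*
 * Note that sched_setpreempt() never switches directly: a better
 * priority only sets TDF_NEEDRESCHED, and td_owepreempt is set only
 * when sched_shouldpreempt() agrees, e.g. when the new priority beats
 * the preempt_thresh tunable.  The switch itself happens when the
 * running thread exits its critical section or crosses the next
 * kernel/user boundary.
 */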
/*
 * Add a thread to a thread queue.  Select the appropriate runq and add the
 * thread to it.  This is the internal function called when the tdq is
 * predetermined.
 */
void
tdq_add(struct tdq *tdq, struct thread *td, int flags)
{
	struct td_sched *ts;

	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("sched_add: trying to run inhibited thread"));
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("sched_add: bad thread state"));
	KASSERT(td->td_flags & TDF_INMEM,
	    ("sched_add: thread swapped out"));

	ts = td->td_sched;
	if (td->td_priority < tdq->tdq_lowpri)
		tdq->tdq_lowpri = td->td_priority;
	tdq_runq_add(tdq, ts, flags);
	tdq_load_add(tdq, ts);
}

/*
 * Select the target thread queue and add a thread to it.  Request
 * preemption or IPI a remote processor if required.
 */
void
sched_add(struct thread *td, int flags)
{
	struct tdq *tdq;
#ifdef SMP
	struct td_sched *ts;
	int cpu;
#endif
	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
	    td, td->td_name, td->td_priority, curthread,
	    curthread->td_name);
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	/*
	 * Recalculate the priority before we select the target cpu or
	 * run-queue.
	 */
	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
		sched_priority(td);
#ifdef SMP
	/*
	 * Pick the destination cpu and if it isn't ours transfer to the
	 * target cpu.
	 */
	ts = td->td_sched;
	cpu = sched_pickcpu(ts, flags);
	tdq = sched_setcpu(ts, cpu, flags);
	tdq_add(tdq, td, flags);
	if (cpu != PCPU_GET(cpuid)) {
		tdq_notify(tdq, ts);
		return;
	}
#else
	tdq = TDQ_SELF();
	TDQ_LOCK(tdq);
	/*
	 * Now that the thread is moving to the run-queue, set the lock
	 * to the scheduler's lock.
	 */
	thread_lock_set(td, TDQ_LOCKPTR(tdq));
	tdq_add(tdq, td, flags);
#endif
	if (!(flags & SRQ_YIELDING))
		sched_setpreempt(td);
}
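/*
 * Taking tdq_add() and sched_add() together, a wakeup destined for a
 * remote cpu proceeds as follows: sched_priority() refreshes the
 * priority, sched_pickcpu() chooses a target, sched_setcpu() migrates
 * the thread's lock to the target's tdq, tdq_add() enqueues it and
 * updates tdq_lowpri, and tdq_notify() decides whether the remote cpu
 * needs an IPI.  Only the local-cpu case falls through to the
 * sched_setpreempt() check at the bottom.
 */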
/*
 * Remove a thread from a run-queue without running it.  This is used
 * when we're stealing a thread from a remote queue.  Otherwise all threads
 * exit by calling sched_exit_thread() and sched_throw() themselves.
 */
void
sched_rem(struct thread *td)
{
	struct tdq *tdq;
	struct td_sched *ts;

	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
	    td, td->td_name, td->td_priority, curthread,
	    curthread->td_name);
	ts = td->td_sched;
	tdq = TDQ_CPU(ts->ts_cpu);
	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
	KASSERT(TD_ON_RUNQ(td),
	    ("sched_rem: thread not on run queue"));
	tdq_runq_rem(tdq, ts);
	tdq_load_rem(tdq, ts);
	TD_SET_CAN_RUN(td);
	if (td->td_priority == tdq->tdq_lowpri)
		tdq_setlowpri(tdq, NULL);
}

/*
 * Fetch cpu utilization information.  Updates on demand.
 */
fixpt_t
sched_pctcpu(struct thread *td)
{
	fixpt_t pctcpu;
	struct td_sched *ts;

	pctcpu = 0;
	ts = td->td_sched;
	if (ts == NULL)
		return (0);

	thread_lock(td);
	if (ts->ts_ticks) {
		int rtick;

		sched_pctcpu_update(ts);
		/* How many rtick per second? */
		rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
		pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
	}
	thread_unlock(td);

	return (pctcpu);
}

/*
 * Enforce affinity settings for a thread.  Called after adjustments to
 * cpumask.
 */
void
sched_affinity(struct thread *td)
{
#ifdef SMP
	struct td_sched *ts;
	int cpu;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td->td_sched;
	if (THREAD_CAN_SCHED(td, ts->ts_cpu))
		return;
	if (!TD_IS_RUNNING(td))
		return;
	td->td_flags |= TDF_NEEDRESCHED;
	if (!THREAD_CAN_MIGRATE(td))
		return;
	/*
	 * Assign the new cpu and force a switch before returning to
	 * userspace.  If the target thread is not running locally send
	 * an ipi to force the issue.
	 */
	cpu = ts->ts_cpu;
	ts->ts_cpu = sched_pickcpu(ts, 0);
	if (cpu != PCPU_GET(cpuid))
		ipi_selected(1 << cpu, IPI_PREEMPT);
#endif
}
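/*
 * Example flow for sched_affinity() above: a cpuset change shrinks a
 * running thread's mask so that ts_cpu is no longer legal.  The thread
 * is flagged TDF_NEEDRESCHED, a permitted cpu is chosen with
 * sched_pickcpu(), and if the thread is executing remotely an
 * IPI_PREEMPT pushes it through sched_switch(), whose migration path
 * moves it to a legal run queue.
 */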
/*
 * Bind a thread to a target cpu.
 */
void
sched_bind(struct thread *td, int cpu)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
	ts = td->td_sched;
	if (ts->ts_flags & TSF_BOUND)
		sched_unbind(td);
	ts->ts_flags |= TSF_BOUND;
	sched_pin();
	if (PCPU_GET(cpuid) == cpu)
		return;
	ts->ts_cpu = cpu;
	/* When we return from mi_switch we'll be on the correct cpu. */
	mi_switch(SW_VOL, NULL);
}

/*
 * Release a bound thread.
 */
void
sched_unbind(struct thread *td)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td->td_sched;
	if ((ts->ts_flags & TSF_BOUND) == 0)
		return;
	ts->ts_flags &= ~TSF_BOUND;
	sched_unpin();
}

int
sched_is_bound(struct thread *td)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	return (td->td_sched->ts_flags & TSF_BOUND);
}

/*
 * Basic yield call.
 */
void
sched_relinquish(struct thread *td)
{
	thread_lock(td);
	SCHED_STAT_INC(switch_relinquish);
	mi_switch(SW_VOL, NULL);
	thread_unlock(td);
}

/*
 * Return the total system load.
 */
int
sched_load(void)
{
#ifdef SMP
	int total;
	int i;

	total = 0;
	for (i = 0; i <= mp_maxid; i++)
		total += TDQ_CPU(i)->tdq_sysload;
	return (total);
#else
	return (TDQ_SELF()->tdq_sysload);
#endif
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}

/*
 * The actual idle process.
 */
void
sched_idletd(void *dummy)
{
	struct thread *td;
	struct tdq *tdq;

	td = curthread;
	tdq = TDQ_SELF();
	mtx_assert(&Giant, MA_NOTOWNED);
	/* ULE relies on preemption for idle interruption. */
	for (;;) {
#ifdef SMP
		if (tdq_idled(tdq))
			cpu_idle();
#else
		cpu_idle();
#endif
	}
}
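/*
 * Typical use of the binding interface above (a sketch, not taken from
 * a particular caller): a per-cpu worker pins itself for the duration
 * of some per-cpu operation:
 *
 *	thread_lock(curthread);
 *	sched_bind(curthread, cpu);	(switches us onto 'cpu')
 *	thread_unlock(curthread);
 *	... per-cpu work; TSF_BOUND prevents migration ...
 *	thread_lock(curthread);
 *	sched_unbind(curthread);
 *	thread_unlock(curthread);
 */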
/*
 * A CPU is entering the scheduler for the first time or a thread is
 * exiting.
 */
void
sched_throw(struct thread *td)
{
	struct thread *newtd;
	struct tdq *tdq;

	tdq = TDQ_SELF();
	if (td == NULL) {
		/* Correct spinlock nesting and acquire the correct lock. */
		TDQ_LOCK(tdq);
		spinlock_exit();
	} else {
		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
		tdq_load_rem(tdq, td->td_sched);
		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
	}
	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
	newtd = choosethread();
	TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
	PCPU_SET(switchtime, cpu_ticks());
	PCPU_SET(switchticks, ticks);
	cpu_throw(td, newtd);	/* doesn't return */
}

/*
 * This is called from fork_exit().  Just acquire the correct locks and
 * let fork do the rest of the work.
 */
void
sched_fork_exit(struct thread *td)
{
	struct td_sched *ts;
	struct tdq *tdq;
	int cpuid;

	/*
	 * Finish setting up thread glue so that it begins execution in a
	 * non-nested critical section with the scheduler lock held.
	 */
	cpuid = PCPU_GET(cpuid);
	tdq = TDQ_CPU(cpuid);
	ts = td->td_sched;
	if (TD_IS_IDLETHREAD(td))
		td->td_lock = TDQ_LOCKPTR(tdq);
	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
	td->td_oncpu = cpuid;
	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
	lock_profile_obtain_lock_success(
	    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
	tdq->tdq_lowpri = td->td_priority;
}

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0,
    "Scheduler");
SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
    "Scheduler name");
SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
    "Slice size for timeshare threads");
SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
    "Interactivity score threshold");
SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
    0, "Min priority for preemption, lower priorities have greater precedence");
SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost,
    0, "Controls whether static kernel priorities are assigned to sleeping threads.");
#ifdef SMP
SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
    "Number of hz ticks to keep thread affinity for");
SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
    "Enables the long-term load balancer");
SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
    &balance_interval, 0,
    "Average frequency in stathz ticks to run the long-term balancer");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
    "Steals work from another hyper-threaded core on idle");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
    "Attempts to steal work from other cores before idling");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
    "Minimum load on remote cpu before we'll steal");
#endif

/* ps compat.  All cpu percentages from ULE are weighted. */
static int ccpu = 0;
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

#define KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"