sched_ule.c revision 177368
1109864Sjeff/*- 2165762Sjeff * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org> 3109864Sjeff * All rights reserved. 4109864Sjeff * 5109864Sjeff * Redistribution and use in source and binary forms, with or without 6109864Sjeff * modification, are permitted provided that the following conditions 7109864Sjeff * are met: 8109864Sjeff * 1. Redistributions of source code must retain the above copyright 9109864Sjeff * notice unmodified, this list of conditions, and the following 10109864Sjeff * disclaimer. 11109864Sjeff * 2. Redistributions in binary form must reproduce the above copyright 12109864Sjeff * notice, this list of conditions and the following disclaimer in the 13109864Sjeff * documentation and/or other materials provided with the distribution. 14109864Sjeff * 15109864Sjeff * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16109864Sjeff * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17109864Sjeff * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18109864Sjeff * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19109864Sjeff * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20109864Sjeff * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21109864Sjeff * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22109864Sjeff * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23109864Sjeff * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24109864Sjeff * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25109864Sjeff */ 26109864Sjeff 27171482Sjeff/* 28171482Sjeff * This file implements the ULE scheduler. ULE supports independent CPU 29171482Sjeff * run queues and fine grain locking. It has superior interactive 30171482Sjeff * performance under load even on uni-processor systems. 
31171482Sjeff * 32171482Sjeff * etymology: 33172293Sjeff * ULE is the last three letters in schedule. It owes its name to a 34171482Sjeff * generic user created for a scheduling system by Paul Mikesell at 35171482Sjeff * Isilon Systems and a general lack of creativity on the part of the author. 36171482Sjeff */ 37171482Sjeff 38116182Sobrien#include <sys/cdefs.h> 39116182Sobrien__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 177368 2008-03-19 06:19:01Z jeff $"); 40116182Sobrien 41147565Speter#include "opt_hwpmc_hooks.h" 42147565Speter#include "opt_sched.h" 43134649Sscottl 44109864Sjeff#include <sys/param.h> 45109864Sjeff#include <sys/systm.h> 46131929Smarcel#include <sys/kdb.h> 47109864Sjeff#include <sys/kernel.h> 48109864Sjeff#include <sys/ktr.h> 49109864Sjeff#include <sys/lock.h> 50109864Sjeff#include <sys/mutex.h> 51109864Sjeff#include <sys/proc.h> 52112966Sjeff#include <sys/resource.h> 53122038Sjeff#include <sys/resourcevar.h> 54109864Sjeff#include <sys/sched.h> 55109864Sjeff#include <sys/smp.h> 56109864Sjeff#include <sys/sx.h> 57109864Sjeff#include <sys/sysctl.h> 58109864Sjeff#include <sys/sysproto.h> 59139453Sjhb#include <sys/turnstile.h> 60161599Sdavidxu#include <sys/umtx.h> 61109864Sjeff#include <sys/vmmeter.h> 62176735Sjeff#include <sys/cpuset.h> 63109864Sjeff#ifdef KTRACE 64109864Sjeff#include <sys/uio.h> 65109864Sjeff#include <sys/ktrace.h> 66109864Sjeff#endif 67109864Sjeff 68145256Sjkoshy#ifdef HWPMC_HOOKS 69145256Sjkoshy#include <sys/pmckern.h> 70145256Sjkoshy#endif 71145256Sjkoshy 72109864Sjeff#include <machine/cpu.h> 73121790Sjeff#include <machine/smp.h> 74109864Sjeff 75172887Sgrehan#if !defined(__i386__) && !defined(__amd64__) && !defined(__powerpc__) && !defined(__arm__) 76172345Sjeff#error "This architecture is not currently compatible with ULE" 77166190Sjeff#endif 78166190Sjeff 79171482Sjeff#define KTR_ULE 0 80166137Sjeff 81166137Sjeff/* 82171482Sjeff * Thread scheduler specific section. All fields are protected 83171482Sjeff * by the thread lock. 
84146954Sjeff */ 85164936Sjulianstruct td_sched { 86171482Sjeff TAILQ_ENTRY(td_sched) ts_procq; /* Run queue. */ 87171482Sjeff struct thread *ts_thread; /* Active associated thread. */ 88171482Sjeff struct runq *ts_runq; /* Run-queue we're queued on. */ 89171482Sjeff short ts_flags; /* TSF_* flags. */ 90171482Sjeff u_char ts_rqindex; /* Run queue index. */ 91164936Sjulian u_char ts_cpu; /* CPU that we have affinity for. */ 92177009Sjeff int ts_rltick; /* Real last tick, for affinity. */ 93171482Sjeff int ts_slice; /* Ticks of slice remaining. */ 94171482Sjeff u_int ts_slptime; /* Number of ticks we vol. slept */ 95171482Sjeff u_int ts_runtime; /* Number of ticks we were running */ 96164936Sjulian int ts_ltick; /* Last tick that we were running on */ 97164936Sjulian int ts_ftick; /* First tick that we were running on */ 98164936Sjulian int ts_ticks; /* Tick count */ 99134791Sjulian}; 100164936Sjulian/* flags kept in ts_flags */ 101166108Sjeff#define TSF_BOUND 0x0001 /* Thread can not migrate. */ 102166108Sjeff#define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */ 103121790Sjeff 104164936Sjulianstatic struct td_sched td_sched0; 105109864Sjeff 106176735Sjeff#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) 107176735Sjeff#define THREAD_CAN_SCHED(td, cpu) \ 108176735Sjeff CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) 109176735Sjeff 110109864Sjeff/* 111165762Sjeff * Cpu percentage computation macros and defines. 112111857Sjeff * 113165762Sjeff * SCHED_TICK_SECS: Number of seconds to average the cpu usage across. 114165762Sjeff * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across. 115165796Sjeff * SCHED_TICK_MAX: Maximum number of ticks before scaling back. 116165762Sjeff * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results. 117165762Sjeff * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count. 118165762Sjeff * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks. 
119165762Sjeff */ 120165762Sjeff#define SCHED_TICK_SECS 10 121165762Sjeff#define SCHED_TICK_TARG (hz * SCHED_TICK_SECS) 122165796Sjeff#define SCHED_TICK_MAX (SCHED_TICK_TARG + hz) 123165762Sjeff#define SCHED_TICK_SHIFT 10 124165762Sjeff#define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT) 125165830Sjeff#define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz)) 126165762Sjeff 127165762Sjeff/* 128165762Sjeff * These macros determine priorities for non-interactive threads. They are 129165762Sjeff * assigned a priority based on their recent cpu utilization as expressed 130165762Sjeff * by the ratio of ticks to the tick total. NHALF priorities at the start 131165762Sjeff * and end of the MIN to MAX timeshare range are only reachable with negative 132165762Sjeff * or positive nice respectively. 133165762Sjeff * 134165762Sjeff * PRI_RANGE: Priority range for utilization dependent priorities. 135116642Sjeff * PRI_NRESV: Number of nice values. 136165762Sjeff * PRI_TICKS: Compute a priority in PRI_RANGE from the ticks count and total. 137165762Sjeff * PRI_NICE: Determines the part of the priority inherited from nice. 138109864Sjeff */ 139165762Sjeff#define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN) 140121869Sjeff#define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) 141165762Sjeff#define SCHED_PRI_MIN (PRI_MIN_TIMESHARE + SCHED_PRI_NHALF) 142165762Sjeff#define SCHED_PRI_MAX (PRI_MAX_TIMESHARE - SCHED_PRI_NHALF) 143170787Sjeff#define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN) 144165762Sjeff#define SCHED_PRI_TICKS(ts) \ 145165762Sjeff (SCHED_TICK_HZ((ts)) / \ 146165827Sjeff (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE)) 147165762Sjeff#define SCHED_PRI_NICE(nice) (nice) 148109864Sjeff 149109864Sjeff/* 150165762Sjeff * These determine the interactivity of a process. 
Interactivity differs from 151165762Sjeff * cpu utilization in that it expresses the voluntary time slept vs time ran 152165762Sjeff * while cpu utilization includes all time not running. This more accurately 153165762Sjeff * models the intent of the thread. 154109864Sjeff * 155110645Sjeff * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate 156110645Sjeff * before throttling back. 157121868Sjeff * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. 158116365Sjeff * INTERACT_MAX: Maximum interactivity value. Smaller is better. 159111857Sjeff * INTERACT_THRESH: Threshhold for placement on the current runq. 160109864Sjeff */ 161165762Sjeff#define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT) 162165762Sjeff#define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT) 163116365Sjeff#define SCHED_INTERACT_MAX (100) 164116365Sjeff#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) 165121126Sjeff#define SCHED_INTERACT_THRESH (30) 166111857Sjeff 167109864Sjeff/* 168165762Sjeff * tickincr: Converts a stathz tick into a hz domain scaled by 169165762Sjeff * the shift factor. Without the shift the error rate 170165762Sjeff * due to rounding would be unacceptably high. 171165762Sjeff * realstathz: stathz is sometimes 0 and run off of hz. 172165762Sjeff * sched_slice: Runtime of each thread before rescheduling. 173171482Sjeff * preempt_thresh: Priority threshold for preemption and remote IPIs. 
174109864Sjeff */ 175165762Sjeffstatic int sched_interact = SCHED_INTERACT_THRESH; 176165762Sjeffstatic int realstathz; 177165762Sjeffstatic int tickincr; 178177009Sjeffstatic int sched_slice = 1; 179172345Sjeff#ifdef PREEMPTION 180172345Sjeff#ifdef FULL_PREEMPTION 181172345Sjeffstatic int preempt_thresh = PRI_MAX_IDLE; 182172345Sjeff#else 183171482Sjeffstatic int preempt_thresh = PRI_MIN_KERN; 184172345Sjeff#endif 185172345Sjeff#else 186172345Sjeffstatic int preempt_thresh = 0; 187172345Sjeff#endif 188177085Sjeffstatic int static_boost = 1; 189109864Sjeff 190109864Sjeff/* 191171482Sjeff * tdq - per processor runqs and statistics. All fields are protected by the 192171482Sjeff * tdq_lock. The load and lowpri may be accessed without to avoid excess 193171482Sjeff * locking in sched_pickcpu(); 194109864Sjeff */ 195164936Sjulianstruct tdq { 196177009Sjeff /* Ordered to improve efficiency of cpu_search() and switch(). */ 197177009Sjeff struct mtx tdq_lock; /* run queue lock. */ 198176735Sjeff struct cpu_group *tdq_cg; /* Pointer to cpu topology. */ 199171482Sjeff int tdq_load; /* Aggregate load. */ 200176735Sjeff int tdq_sysload; /* For loadavg, !ITHD load. */ 201177009Sjeff int tdq_transferable; /* Transferable thread count. */ 202177009Sjeff u_char tdq_lowpri; /* Lowest priority thread. */ 203177009Sjeff u_char tdq_ipipending; /* IPI pending. */ 204166557Sjeff u_char tdq_idx; /* Current insert index. */ 205166557Sjeff u_char tdq_ridx; /* Current removal index. */ 206177009Sjeff struct runq tdq_realtime; /* real-time run queue. */ 207177009Sjeff struct runq tdq_timeshare; /* timeshare run queue. */ 208177009Sjeff struct runq tdq_idle; /* Queue of IDLE threads. 
*/ 209176735Sjeff char tdq_name[sizeof("sched lock") + 6]; 210171482Sjeff} __aligned(64); 211109864Sjeff 212166108Sjeff 213123433Sjeff#ifdef SMP 214176735Sjeffstruct cpu_group *cpu_top; 215123433Sjeff 216176735Sjeff#define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000)) 217176735Sjeff#define SCHED_AFFINITY(ts, t) ((ts)->ts_rltick > ticks - ((t) * affinity)) 218166108Sjeff 219123433Sjeff/* 220166108Sjeff * Run-time tunables. 221166108Sjeff */ 222171506Sjeffstatic int rebalance = 1; 223172409Sjeffstatic int balance_interval = 128; /* Default set in sched_initticks(). */ 224166108Sjeffstatic int affinity; 225172409Sjeffstatic int steal_htt = 1; 226171506Sjeffstatic int steal_idle = 1; 227171506Sjeffstatic int steal_thresh = 2; 228166108Sjeff 229166108Sjeff/* 230165620Sjeff * One thread queue per processor. 231109864Sjeff */ 232164936Sjulianstatic struct tdq tdq_cpu[MAXCPU]; 233172409Sjeffstatic struct tdq *balance_tdq; 234172409Sjeffstatic int balance_ticks; 235129982Sjeff 236164936Sjulian#define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)]) 237164936Sjulian#define TDQ_CPU(x) (&tdq_cpu[(x)]) 238171713Sjeff#define TDQ_ID(x) ((int)((x) - tdq_cpu)) 239123433Sjeff#else /* !SMP */ 240164936Sjulianstatic struct tdq tdq_cpu; 241129982Sjeff 242170315Sjeff#define TDQ_ID(x) (0) 243164936Sjulian#define TDQ_SELF() (&tdq_cpu) 244164936Sjulian#define TDQ_CPU(x) (&tdq_cpu) 245110028Sjeff#endif 246109864Sjeff 247171482Sjeff#define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type)) 248171482Sjeff#define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) 249171482Sjeff#define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f)) 250171482Sjeff#define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) 251176735Sjeff#define TDQ_LOCKPTR(t) (&(t)->tdq_lock) 252171482Sjeff 253163709Sjbstatic void sched_priority(struct thread *); 254146954Sjeffstatic void sched_thread_priority(struct thread *, u_char); 255163709Sjbstatic int sched_interact_score(struct thread *); 256163709Sjbstatic void 
sched_interact_update(struct thread *); 257163709Sjbstatic void sched_interact_fork(struct thread *); 258164936Sjulianstatic void sched_pctcpu_update(struct td_sched *); 259109864Sjeff 260110267Sjeff/* Operations on per processor queues */ 261164936Sjulianstatic struct td_sched * tdq_choose(struct tdq *); 262164936Sjulianstatic void tdq_setup(struct tdq *); 263164936Sjulianstatic void tdq_load_add(struct tdq *, struct td_sched *); 264164936Sjulianstatic void tdq_load_rem(struct tdq *, struct td_sched *); 265164936Sjulianstatic __inline void tdq_runq_add(struct tdq *, struct td_sched *, int); 266164936Sjulianstatic __inline void tdq_runq_rem(struct tdq *, struct td_sched *); 267177005Sjeffstatic inline int sched_shouldpreempt(int, int, int); 268164936Sjulianvoid tdq_print(int cpu); 269165762Sjeffstatic void runq_print(struct runq *rq); 270171482Sjeffstatic void tdq_add(struct tdq *, struct thread *, int); 271110267Sjeff#ifdef SMP 272176735Sjeffstatic int tdq_move(struct tdq *, struct tdq *); 273171482Sjeffstatic int tdq_idled(struct tdq *); 274177005Sjeffstatic void tdq_notify(struct tdq *, struct td_sched *); 275176735Sjeffstatic struct td_sched *tdq_steal(struct tdq *, int); 276176735Sjeffstatic struct td_sched *runq_steal(struct runq *, int); 277171482Sjeffstatic int sched_pickcpu(struct td_sched *, int); 278172409Sjeffstatic void sched_balance(void); 279176735Sjeffstatic int sched_balance_pair(struct tdq *, struct tdq *); 280171482Sjeffstatic inline struct tdq *sched_setcpu(struct td_sched *, int, int); 281171482Sjeffstatic inline struct mtx *thread_block_switch(struct thread *); 282171482Sjeffstatic inline void thread_unblock_switch(struct thread *, struct mtx *); 283171713Sjeffstatic struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int); 284121790Sjeff#endif 285110028Sjeff 286165762Sjeffstatic void sched_setup(void *dummy); 287177253SrwatsonSYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL); 288165762Sjeff 
289165762Sjeffstatic void sched_initticks(void *dummy); 290177253SrwatsonSYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, 291177253Srwatson NULL); 292165762Sjeff 293171482Sjeff/* 294171482Sjeff * Print the threads waiting on a run-queue. 295171482Sjeff */ 296165762Sjeffstatic void 297165762Sjeffrunq_print(struct runq *rq) 298165762Sjeff{ 299165762Sjeff struct rqhead *rqh; 300165762Sjeff struct td_sched *ts; 301165762Sjeff int pri; 302165762Sjeff int j; 303165762Sjeff int i; 304165762Sjeff 305165762Sjeff for (i = 0; i < RQB_LEN; i++) { 306165762Sjeff printf("\t\trunq bits %d 0x%zx\n", 307165762Sjeff i, rq->rq_status.rqb_bits[i]); 308165762Sjeff for (j = 0; j < RQB_BPW; j++) 309165762Sjeff if (rq->rq_status.rqb_bits[i] & (1ul << j)) { 310165762Sjeff pri = j + (i << RQB_L2BPW); 311165762Sjeff rqh = &rq->rq_queues[pri]; 312165762Sjeff TAILQ_FOREACH(ts, rqh, ts_procq) { 313165762Sjeff printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n", 314173600Sjulian ts->ts_thread, ts->ts_thread->td_name, ts->ts_thread->td_priority, ts->ts_rqindex, pri); 315165762Sjeff } 316165762Sjeff } 317165762Sjeff } 318165762Sjeff} 319165762Sjeff 320171482Sjeff/* 321171482Sjeff * Print the status of a per-cpu thread queue. Should be a ddb show cmd. 
322171482Sjeff */ 323113357Sjeffvoid 324164936Sjuliantdq_print(int cpu) 325110267Sjeff{ 326164936Sjulian struct tdq *tdq; 327112994Sjeff 328164936Sjulian tdq = TDQ_CPU(cpu); 329112994Sjeff 330171713Sjeff printf("tdq %d:\n", TDQ_ID(tdq)); 331176735Sjeff printf("\tlock %p\n", TDQ_LOCKPTR(tdq)); 332176735Sjeff printf("\tLock name: %s\n", tdq->tdq_name); 333165620Sjeff printf("\tload: %d\n", tdq->tdq_load); 334171482Sjeff printf("\ttimeshare idx: %d\n", tdq->tdq_idx); 335165766Sjeff printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); 336165762Sjeff printf("\trealtime runq:\n"); 337165762Sjeff runq_print(&tdq->tdq_realtime); 338165762Sjeff printf("\ttimeshare runq:\n"); 339165762Sjeff runq_print(&tdq->tdq_timeshare); 340165762Sjeff printf("\tidle runq:\n"); 341165762Sjeff runq_print(&tdq->tdq_idle); 342165620Sjeff printf("\tload transferable: %d\n", tdq->tdq_transferable); 343171713Sjeff printf("\tlowest priority: %d\n", tdq->tdq_lowpri); 344113357Sjeff} 345112994Sjeff 346177005Sjeffstatic inline int 347177005Sjeffsched_shouldpreempt(int pri, int cpri, int remote) 348177005Sjeff{ 349177005Sjeff /* 350177005Sjeff * If the new priority is not better than the current priority there is 351177005Sjeff * nothing to do. 352177005Sjeff */ 353177005Sjeff if (pri >= cpri) 354177005Sjeff return (0); 355177005Sjeff /* 356177005Sjeff * Always preempt idle. 357177005Sjeff */ 358177005Sjeff if (cpri >= PRI_MIN_IDLE) 359177005Sjeff return (1); 360177005Sjeff /* 361177005Sjeff * If preemption is disabled don't preempt others. 362177005Sjeff */ 363177005Sjeff if (preempt_thresh == 0) 364177005Sjeff return (0); 365177005Sjeff /* 366177005Sjeff * Preempt if we exceed the threshold. 367177005Sjeff */ 368177005Sjeff if (pri <= preempt_thresh) 369177005Sjeff return (1); 370177005Sjeff /* 371177005Sjeff * If we're realtime or better and there is timeshare or worse running 372177005Sjeff * preempt only remote processors. 
373177005Sjeff */ 374177005Sjeff if (remote && pri <= PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME) 375177005Sjeff return (1); 376177005Sjeff return (0); 377177005Sjeff} 378177005Sjeff 379171482Sjeff#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS) 380171482Sjeff/* 381171482Sjeff * Add a thread to the actual run-queue. Keeps transferable counts up to 382171482Sjeff * date with what is actually on the run-queue. Selects the correct 383171482Sjeff * queue position for timeshare threads. 384171482Sjeff */ 385122744Sjeffstatic __inline void 386164936Sjuliantdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags) 387122744Sjeff{ 388177042Sjeff u_char pri; 389177042Sjeff 390171482Sjeff TDQ_LOCK_ASSERT(tdq, MA_OWNED); 391171482Sjeff THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); 392177009Sjeff 393177009Sjeff TD_SET_RUNQ(ts->ts_thread); 394165762Sjeff if (THREAD_CAN_MIGRATE(ts->ts_thread)) { 395165620Sjeff tdq->tdq_transferable++; 396164936Sjulian ts->ts_flags |= TSF_XFERABLE; 397123433Sjeff } 398177042Sjeff pri = ts->ts_thread->td_priority; 399177042Sjeff if (pri <= PRI_MAX_REALTIME) { 400177042Sjeff ts->ts_runq = &tdq->tdq_realtime; 401177042Sjeff } else if (pri <= PRI_MAX_TIMESHARE) { 402177042Sjeff ts->ts_runq = &tdq->tdq_timeshare; 403165762Sjeff KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE, 404165762Sjeff ("Invalid priority %d on timeshare runq", pri)); 405165762Sjeff /* 406165762Sjeff * This queue contains only priorities between MIN and MAX 407165762Sjeff * realtime. Use the whole queue to represent these values. 408165762Sjeff */ 409171713Sjeff if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) { 410165762Sjeff pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ; 411165762Sjeff pri = (pri + tdq->tdq_idx) % RQ_NQS; 412165766Sjeff /* 413165766Sjeff * This effectively shortens the queue by one so we 414165766Sjeff * can have a one slot difference between idx and 415165766Sjeff * ridx while we wait for threads to drain. 
416165766Sjeff */ 417165766Sjeff if (tdq->tdq_ridx != tdq->tdq_idx && 418165766Sjeff pri == tdq->tdq_ridx) 419167664Sjeff pri = (unsigned char)(pri - 1) % RQ_NQS; 420165762Sjeff } else 421165766Sjeff pri = tdq->tdq_ridx; 422165762Sjeff runq_add_pri(ts->ts_runq, ts, pri, flags); 423177042Sjeff return; 424165762Sjeff } else 425177009Sjeff ts->ts_runq = &tdq->tdq_idle; 426177042Sjeff runq_add(ts->ts_runq, ts, flags); 427177009Sjeff} 428177009Sjeff 429171482Sjeff/* 430171482Sjeff * Remove a thread from a run-queue. This typically happens when a thread 431171482Sjeff * is selected to run. Running threads are not on the queue and the 432171482Sjeff * transferable count does not reflect them. 433171482Sjeff */ 434122744Sjeffstatic __inline void 435164936Sjuliantdq_runq_rem(struct tdq *tdq, struct td_sched *ts) 436122744Sjeff{ 437171482Sjeff TDQ_LOCK_ASSERT(tdq, MA_OWNED); 438171482Sjeff KASSERT(ts->ts_runq != NULL, 439171482Sjeff ("tdq_runq_remove: thread %p null ts_runq", ts->ts_thread)); 440164936Sjulian if (ts->ts_flags & TSF_XFERABLE) { 441165620Sjeff tdq->tdq_transferable--; 442164936Sjulian ts->ts_flags &= ~TSF_XFERABLE; 443123433Sjeff } 444165766Sjeff if (ts->ts_runq == &tdq->tdq_timeshare) { 445165766Sjeff if (tdq->tdq_idx != tdq->tdq_ridx) 446165766Sjeff runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx); 447165766Sjeff else 448165766Sjeff runq_remove_idx(ts->ts_runq, ts, NULL); 449165766Sjeff } else 450165762Sjeff runq_remove(ts->ts_runq, ts); 451122744Sjeff} 452122744Sjeff 453171482Sjeff/* 454171482Sjeff * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load 455171482Sjeff * for this thread to the referenced thread queue. 
456171482Sjeff */ 457113357Sjeffstatic void 458164936Sjuliantdq_load_add(struct tdq *tdq, struct td_sched *ts) 459113357Sjeff{ 460121896Sjeff int class; 461171482Sjeff 462171482Sjeff TDQ_LOCK_ASSERT(tdq, MA_OWNED); 463171482Sjeff THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); 464164936Sjulian class = PRI_BASE(ts->ts_thread->td_pri_class); 465165620Sjeff tdq->tdq_load++; 466171713Sjeff CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load); 467166108Sjeff if (class != PRI_ITHD && 468166108Sjeff (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) 469165620Sjeff tdq->tdq_sysload++; 470110267Sjeff} 471113357Sjeff 472171482Sjeff/* 473171482Sjeff * Remove the load from a thread that is transitioning to a sleep state or 474171482Sjeff * exiting. 475171482Sjeff */ 476112994Sjeffstatic void 477164936Sjuliantdq_load_rem(struct tdq *tdq, struct td_sched *ts) 478110267Sjeff{ 479121896Sjeff int class; 480171482Sjeff 481171482Sjeff THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); 482171482Sjeff TDQ_LOCK_ASSERT(tdq, MA_OWNED); 483164936Sjulian class = PRI_BASE(ts->ts_thread->td_pri_class); 484166108Sjeff if (class != PRI_ITHD && 485166108Sjeff (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) 486165620Sjeff tdq->tdq_sysload--; 487171482Sjeff KASSERT(tdq->tdq_load != 0, 488171713Sjeff ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq))); 489165620Sjeff tdq->tdq_load--; 490165620Sjeff CTR1(KTR_SCHED, "load: %d", tdq->tdq_load); 491164936Sjulian ts->ts_runq = NULL; 492110267Sjeff} 493110267Sjeff 494176735Sjeff/* 495176735Sjeff * Set lowpri to its exact value by searching the run-queue and 496176735Sjeff * evaluating curthread. curthread may be passed as an optimization. 
497176735Sjeff */ 498176735Sjeffstatic void 499176735Sjefftdq_setlowpri(struct tdq *tdq, struct thread *ctd) 500176735Sjeff{ 501176735Sjeff struct td_sched *ts; 502176735Sjeff struct thread *td; 503176735Sjeff 504176735Sjeff TDQ_LOCK_ASSERT(tdq, MA_OWNED); 505176735Sjeff if (ctd == NULL) 506176735Sjeff ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread; 507176735Sjeff ts = tdq_choose(tdq); 508176735Sjeff if (ts) 509176735Sjeff td = ts->ts_thread; 510176735Sjeff if (ts == NULL || td->td_priority > ctd->td_priority) 511176735Sjeff tdq->tdq_lowpri = ctd->td_priority; 512176735Sjeff else 513176735Sjeff tdq->tdq_lowpri = td->td_priority; 514176735Sjeff} 515176735Sjeff 516113357Sjeff#ifdef SMP 517176735Sjeffstruct cpu_search { 518176735Sjeff cpumask_t cs_mask; /* Mask of valid cpus. */ 519176735Sjeff u_int cs_load; 520176735Sjeff u_int cs_cpu; 521176735Sjeff int cs_limit; /* Min priority for low min load for high. */ 522176735Sjeff}; 523176735Sjeff 524176735Sjeff#define CPU_SEARCH_LOWEST 0x1 525176735Sjeff#define CPU_SEARCH_HIGHEST 0x2 526176735Sjeff#define CPU_SEARCH_BOTH (CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST) 527176735Sjeff 528176735Sjeff#define CPUMASK_FOREACH(cpu, mask) \ 529176735Sjeff for ((cpu) = 0; (cpu) < sizeof((mask)) * 8; (cpu)++) \ 530176735Sjeff if ((mask) & 1 << (cpu)) 531176735Sjeff 532177169Sjhbstatic __inline int cpu_search(struct cpu_group *cg, struct cpu_search *low, 533176735Sjeff struct cpu_search *high, const int match); 534176735Sjeffint cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low); 535176735Sjeffint cpu_search_highest(struct cpu_group *cg, struct cpu_search *high); 536176735Sjeffint cpu_search_both(struct cpu_group *cg, struct cpu_search *low, 537176735Sjeff struct cpu_search *high); 538176735Sjeff 539116069Sjeff/* 540176735Sjeff * This routine compares according to the match argument and should be 541176735Sjeff * reduced in actual instantiations via constant propagation and dead code 542176735Sjeff * elimination. 
543176735Sjeff */ 544176735Sjeffstatic __inline int 545176735Sjeffcpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high, 546176735Sjeff const int match) 547176735Sjeff{ 548176735Sjeff struct tdq *tdq; 549176735Sjeff 550176735Sjeff tdq = TDQ_CPU(cpu); 551176735Sjeff if (match & CPU_SEARCH_LOWEST) 552176735Sjeff if (low->cs_mask & (1 << cpu) && 553176735Sjeff tdq->tdq_load < low->cs_load && 554176735Sjeff tdq->tdq_lowpri > low->cs_limit) { 555176735Sjeff low->cs_cpu = cpu; 556176735Sjeff low->cs_load = tdq->tdq_load; 557176735Sjeff } 558176735Sjeff if (match & CPU_SEARCH_HIGHEST) 559176735Sjeff if (high->cs_mask & (1 << cpu) && 560176735Sjeff tdq->tdq_load >= high->cs_limit && 561176735Sjeff tdq->tdq_load > high->cs_load && 562176735Sjeff tdq->tdq_transferable) { 563176735Sjeff high->cs_cpu = cpu; 564176735Sjeff high->cs_load = tdq->tdq_load; 565176735Sjeff } 566176735Sjeff return (tdq->tdq_load); 567176735Sjeff} 568176735Sjeff 569176735Sjeff/* 570176735Sjeff * Search the tree of cpu_groups for the lowest or highest loaded cpu 571176735Sjeff * according to the match argument. This routine actually compares the 572176735Sjeff * load on all paths through the tree and finds the least loaded cpu on 573176735Sjeff * the least loaded path, which may differ from the least loaded cpu in 574176735Sjeff * the system. This balances work among caches and busses. 575116069Sjeff * 576176735Sjeff * This inline is instantiated in three forms below using constants for the 577176735Sjeff * match argument. It is reduced to the minimum set for each case. It is 578176735Sjeff * also recursive to the depth of the tree. 
579116069Sjeff */ 580177169Sjhbstatic __inline int 581176735Sjeffcpu_search(struct cpu_group *cg, struct cpu_search *low, 582176735Sjeff struct cpu_search *high, const int match) 583176735Sjeff{ 584176735Sjeff int total; 585176735Sjeff 586176735Sjeff total = 0; 587176735Sjeff if (cg->cg_children) { 588176735Sjeff struct cpu_search lgroup; 589176735Sjeff struct cpu_search hgroup; 590176735Sjeff struct cpu_group *child; 591176735Sjeff u_int lload; 592176735Sjeff int hload; 593176735Sjeff int load; 594176735Sjeff int i; 595176735Sjeff 596176735Sjeff lload = -1; 597176735Sjeff hload = -1; 598176735Sjeff for (i = 0; i < cg->cg_children; i++) { 599176735Sjeff child = &cg->cg_child[i]; 600176735Sjeff if (match & CPU_SEARCH_LOWEST) { 601176735Sjeff lgroup = *low; 602176735Sjeff lgroup.cs_load = -1; 603176735Sjeff } 604176735Sjeff if (match & CPU_SEARCH_HIGHEST) { 605176735Sjeff hgroup = *high; 606176735Sjeff lgroup.cs_load = 0; 607176735Sjeff } 608176735Sjeff switch (match) { 609176735Sjeff case CPU_SEARCH_LOWEST: 610176735Sjeff load = cpu_search_lowest(child, &lgroup); 611176735Sjeff break; 612176735Sjeff case CPU_SEARCH_HIGHEST: 613176735Sjeff load = cpu_search_highest(child, &hgroup); 614176735Sjeff break; 615176735Sjeff case CPU_SEARCH_BOTH: 616176735Sjeff load = cpu_search_both(child, &lgroup, &hgroup); 617176735Sjeff break; 618176735Sjeff } 619176735Sjeff total += load; 620176735Sjeff if (match & CPU_SEARCH_LOWEST) 621176735Sjeff if (load < lload || low->cs_cpu == -1) { 622176735Sjeff *low = lgroup; 623176735Sjeff lload = load; 624176735Sjeff } 625176735Sjeff if (match & CPU_SEARCH_HIGHEST) 626176735Sjeff if (load > hload || high->cs_cpu == -1) { 627176735Sjeff hload = load; 628176735Sjeff *high = hgroup; 629176735Sjeff } 630176735Sjeff } 631176735Sjeff } else { 632176735Sjeff int cpu; 633176735Sjeff 634176735Sjeff CPUMASK_FOREACH(cpu, cg->cg_mask) 635176735Sjeff total += cpu_compare(cpu, low, high, match); 636176735Sjeff } 637176735Sjeff return (total); 
}

/*
 * cpu_search instantiations must pass constants to maintain the inline
 * optimization.
 */
int
cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low)
{
	return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
}

int
cpu_search_highest(struct cpu_group *cg, struct cpu_search *high)
{
	return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
}

int
cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
    struct cpu_search *high)
{
	return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
}

/*
 * Find the cpu with the least load via the least loaded path that has a
 * lowpri greater than pri.  A pri of -1 indicates any priority is
 * acceptable.  Returns the cpu id, or -1 if no cpu in 'mask' qualifies.
 */
static inline int
sched_lowest(struct cpu_group *cg, cpumask_t mask, int pri)
{
	struct cpu_search low;

	low.cs_cpu = -1;
	low.cs_load = -1;
	low.cs_mask = mask;
	low.cs_limit = pri;
	cpu_search_lowest(cg, &low);
	return low.cs_cpu;
}

/*
 * Find the cpu with the highest load via the highest loaded path.
 */
static inline int
sched_highest(struct cpu_group *cg, cpumask_t mask, int minload)
{
	struct cpu_search high;

	high.cs_cpu = -1;
	high.cs_load = 0;
	high.cs_mask = mask;
	high.cs_limit = minload;
	cpu_search_highest(cg, &high);
	return high.cs_cpu;
}

/*
 * Simultaneously find the highest and lowest loaded cpu reachable via
 * cg.
 */
static inline void
sched_both(struct cpu_group *cg, cpumask_t mask, int *lowcpu, int *highcpu)
{
	struct cpu_search high;
	struct cpu_search low;

	low.cs_cpu = -1;
	low.cs_limit = -1;
	low.cs_load = -1;
	low.cs_mask = mask;
	high.cs_load = 0;
	high.cs_cpu = -1;
	high.cs_limit = -1;
	high.cs_mask = mask;
	cpu_search_both(cg, &low, &high);
	*lowcpu = low.cs_cpu;
	*highcpu = high.cs_cpu;
	return;
}

/*
 * Balance load between the cpus reachable via 'cg' by repeatedly moving
 * threads from the most loaded to the least loaded queue, then recurse
 * into each child group.
 */
static void
sched_balance_group(struct cpu_group *cg)
{
	cpumask_t mask;
	int high;
	int low;
	int i;

	mask = -1;
	for (;;) {
		sched_both(cg, mask, &low, &high);
		/* Stop once no distinct high/low pair remains. */
		if (low == high || low == -1 || high == -1)
			break;
		if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low)))
			break;
		/*
		 * If we failed to move any threads determine which cpu
		 * to kick out of the set and try again.
		 */
		if (TDQ_CPU(high)->tdq_transferable == 0)
			mask &= ~(1 << high);
		else
			mask &= ~(1 << low);
	}

	for (i = 0; i < cg->cg_children; i++)
		sched_balance_group(&cg->cg_child[i]);
}

/*
 * Periodic load balancer entry point.  Re-arms the balance timer with a
 * randomized interval and, if SMP balancing is active, rebalances the
 * whole topology starting at cpu_top.  Drops the caller's tdq lock
 * around the traversal and reacquires it before returning.
 */
static void
sched_balance()
{
	struct tdq *tdq;

	/*
	 * Select a random time between .5 * balance_interval and
	 * 1.5 * balance_interval.
	 */
	balance_ticks = max(balance_interval / 2, 1);
	balance_ticks += random() % balance_interval;
	if (smp_started == 0 || rebalance == 0)
		return;
	tdq = TDQ_SELF();
	TDQ_UNLOCK(tdq);
	sched_balance_group(cpu_top);
	TDQ_LOCK(tdq);
}

/*
 * Lock two thread queues using their address to maintain lock order.
 */
static void
tdq_lock_pair(struct tdq *one, struct tdq *two)
{
	if (one < two) {
		TDQ_LOCK(one);
		TDQ_LOCK_FLAGS(two, MTX_DUPOK);
	} else {
		TDQ_LOCK(two);
		TDQ_LOCK_FLAGS(one, MTX_DUPOK);
	}
}

/*
 * Unlock two thread queues.  Order is not important here.
 */
static void
tdq_unlock_pair(struct tdq *one, struct tdq *two)
{
	TDQ_UNLOCK(one);
	TDQ_UNLOCK(two);
}

/*
 * Transfer load between two imbalanced thread queues.
 */
static int
sched_balance_pair(struct tdq *high, struct tdq *low)
{
	int transferable;
	int high_load;
	int low_load;
	int moved;
	int move;
	int diff;
	int i;

	tdq_lock_pair(high, low);
	transferable = high->tdq_transferable;
	high_load = high->tdq_load;
	low_load = low->tdq_load;
	moved = 0;
	/*
	 * Determine what the imbalance is and then adjust that to how many
	 * threads we actually have to give up (transferable).
	 */
	if (transferable != 0) {
		/* Move half the difference, rounding up. */
		diff = high_load - low_load;
		move = diff / 2;
		if (diff & 0x1)
			move++;
		move = min(move, transferable);
		for (i = 0; i < move; i++)
			moved += tdq_move(high, low);
		/*
		 * IPI the target cpu to force it to reschedule with the new
		 * workload.
		 */
		ipi_selected(1 << TDQ_ID(low), IPI_PREEMPT);
	}
	tdq_unlock_pair(high, low);
	/* Number of threads actually migrated. */
	return (moved);
}

/*
 * Move a thread from one thread queue to another.  Both queues must be
 * locked on entry; returns 1 on success, 0 if no thread could be stolen.
 */
static int
tdq_move(struct tdq *from, struct tdq *to)
{
	struct td_sched *ts;
	struct thread *td;
	struct tdq *tdq;
	int cpu;

	TDQ_LOCK_ASSERT(from, MA_OWNED);
	TDQ_LOCK_ASSERT(to, MA_OWNED);

	tdq = from;
	cpu = TDQ_ID(to);
	ts = tdq_steal(tdq, cpu);
	if (ts == NULL)
		return (0);
	td = ts->ts_thread;
	/*
	 * Although the run queue is locked the thread may be blocked.  Lock
	 * it to clear this and acquire the run-queue lock.
	 */
	thread_lock(td);
	/* Drop recursive lock on from acquired via thread_lock(). */
	TDQ_UNLOCK(from);
	sched_rem(td);
	ts->ts_cpu = cpu;
	td->td_lock = TDQ_LOCKPTR(to);
	tdq_add(to, td, SRQ_YIELDING);
	return (1);
}

/*
 * This tdq has idled.  Try to steal a thread from another cpu and switch
 * to it.  Returns 0 if a thread was stolen and switched to, 1 otherwise.
 */
static int
tdq_idled(struct tdq *tdq)
{
	struct cpu_group *cg;
	struct tdq *steal;
	cpumask_t mask;
	int thresh;
	int cpu;

	if (smp_started == 0 || steal_idle == 0)
		return (1);
	/* Consider every cpu except ourselves. */
	mask = -1;
	mask &= ~PCPU_GET(cpumask);
	/* We don't want to be preempted while we're iterating.
	 */
	spinlock_enter();
	/* Walk up the topology from our own group toward the root. */
	for (cg = tdq->tdq_cg; cg != NULL; ) {
		/*
		 * SMT/HTT siblings share execution resources, so steal
		 * from them at the lowest possible threshold.
		 */
		if ((cg->cg_flags & (CG_FLAG_HTT | CG_FLAG_THREAD)) == 0)
			thresh = steal_thresh;
		else
			thresh = 1;
		cpu = sched_highest(cg, mask, thresh);
		if (cpu == -1) {
			cg = cg->cg_parent;
			continue;
		}
		steal = TDQ_CPU(cpu);
		mask &= ~(1 << cpu);
		tdq_lock_pair(tdq, steal);
		/* Re-check under the lock; the load may have drained. */
		if (steal->tdq_load < thresh || steal->tdq_transferable == 0) {
			tdq_unlock_pair(tdq, steal);
			continue;
		}
		/*
		 * If a thread was added while interrupts were disabled don't
		 * steal one here.  If we fail to acquire one due to affinity
		 * restrictions loop again with this cpu removed from the
		 * set.
		 */
		if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) {
			tdq_unlock_pair(tdq, steal);
			continue;
		}
		spinlock_exit();
		TDQ_UNLOCK(steal);
		mi_switch(SW_VOL, NULL);
		thread_unlock(curthread);

		return (0);
	}
	spinlock_exit();
	return (1);
}

/*
 * Notify a remote cpu of new work.  Sends an IPI if criteria are met.
 */
static void
tdq_notify(struct tdq *tdq, struct td_sched *ts)
{
	int cpri;
	int pri;
	int cpu;

	/* An IPI is already on its way; don't send another. */
	if (tdq->tdq_ipipending)
		return;
	cpu = ts->ts_cpu;
	pri = ts->ts_thread->td_priority;
	cpri = pcpu_find(cpu)->pc_curthread->td_priority;
	if (!sched_shouldpreempt(pri, cpri, 1))
		return;
	tdq->tdq_ipipending = 1;
	ipi_selected(1 << cpu, IPI_PREEMPT);
}

/*
 * Steals load from a timeshare queue.  Honors the rotating queue head
 * index.  'start' is the current head; the scan wraps around once so
 * the whole circular queue is examined.
 */
static struct td_sched *
runq_steal_from(struct runq *rq, int cpu, u_char start)
{
	struct td_sched *ts;
	struct rqbits *rqb;
	struct rqhead *rqh;
	int first;
	int bit;
	int pri;
	int i;

	rqb = &rq->rq_status;
	bit = start & (RQB_BPW -1);
	pri = 0;
	first = 0;
again:
	for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
		if (rqb->rqb_bits[i] == 0)
			continue;
		if (bit != 0) {
			/* Resume mid-word on the first iteration. */
			for (pri = bit; pri < RQB_BPW; pri++)
				if (rqb->rqb_bits[i] & (1ul << pri))
					break;
			if (pri >= RQB_BPW)
				continue;
		} else
			pri = RQB_FFS(rqb->rqb_bits[i]);
		pri += (i << RQB_L2BPW);
		rqh = &rq->rq_queues[pri];
		TAILQ_FOREACH(ts, rqh, ts_procq) {
			/*
			 * Skip the very first runnable thread found (the
			 * current head) so we don't steal the thread that
			 * is about to run; only later candidates that can
			 * migrate and are permitted on 'cpu' qualify.
			 */
			if (first && THREAD_CAN_MIGRATE(ts->ts_thread) &&
			    THREAD_CAN_SCHED(ts->ts_thread, cpu))
				return (ts);
			first = 1;
		}
	}
	if (start != 0) {
		/* Wrap around and scan the words before 'start'. */
		start = 0;
		goto again;
	}

	return (NULL);
}

/*
 * Steals load from a standard linear queue.  Returns the first thread
 * that may migrate to 'cpu', or NULL.
 */
static struct td_sched *
runq_steal(struct runq *rq, int cpu)
{
	struct rqhead *rqh;
	struct rqbits *rqb;
	struct td_sched *ts;
	int word;
	int bit;

	rqb = &rq->rq_status;
	for (word = 0; word < RQB_LEN; word++) {
		if (rqb->rqb_bits[word] == 0)
			continue;
		for (bit = 0; bit < RQB_BPW; bit++) {
			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
				continue;
			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
			TAILQ_FOREACH(ts, rqh, ts_procq)
				if (THREAD_CAN_MIGRATE(ts->ts_thread) &&
				    THREAD_CAN_SCHED(ts->ts_thread, cpu))
					return (ts);
		}
	}
	return (NULL);
}

/*
 * Attempt to steal a thread in priority order from a thread queue.
 */
static struct td_sched *
tdq_steal(struct tdq *tdq, int cpu)
{
	struct td_sched *ts;

	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	/* Realtime first, then timeshare (honoring ridx), then idle. */
	if ((ts = runq_steal(&tdq->tdq_realtime, cpu)) != NULL)
		return (ts);
	if ((ts = runq_steal_from(&tdq->tdq_timeshare, cpu, tdq->tdq_ridx))
	    != NULL)
		return (ts);
	return (runq_steal(&tdq->tdq_idle, cpu));
}

/*
 * Sets the thread lock and ts_cpu to match the requested cpu.  Unlocks the
 * current lock and returns with the assigned queue locked.
 */
static inline struct tdq *
sched_setcpu(struct td_sched *ts, int cpu, int flags)
{
	struct thread *td;
	struct tdq *tdq;

	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);

	tdq = TDQ_CPU(cpu);
	td = ts->ts_thread;
	ts->ts_cpu = cpu;

	/* If the lock matches just return the queue. */
	if (td->td_lock == TDQ_LOCKPTR(tdq))
		return (tdq);
#ifdef notyet
	/*
	 * If the thread isn't running its lockptr is a
	 * turnstile or a sleepqueue.  We can just lock_set without
	 * blocking.
	 */
	if (TD_CAN_RUN(td)) {
		TDQ_LOCK(tdq);
		thread_lock_set(td, TDQ_LOCKPTR(tdq));
		return (tdq);
	}
#endif
	/*
	 * The hard case, migration, we need to block the thread first to
	 * prevent order reversals with other cpus locks.
	 */
	thread_lock_block(td);
	TDQ_LOCK(tdq);
	thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
	return (tdq);
}

/*
 * Select a cpu for a newly runnable thread.  Prefers, in order: the
 * current cpu for bound/non-migratable threads, the interrupting cpu for
 * ithreads, the last cpu while cache affinity holds, and finally the
 * least loaded cpu found via the topology tree.
 */
static int
sched_pickcpu(struct td_sched *ts, int flags)
{
	struct cpu_group *cg;
	struct thread *td;
	struct tdq *tdq;
	cpumask_t mask;
	int self;
	int pri;
	int cpu;

	self = PCPU_GET(cpuid);
	td = ts->ts_thread;
	if (smp_started == 0)
		return (self);
	/*
	 * Don't migrate a running thread from sched_switch().
	 */
	if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td))
		return (ts->ts_cpu);
	/*
	 * Prefer to run interrupt threads on the processors that generate
	 * the interrupt.
	 */
	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
	    curthread->td_intr_nesting_level)
		ts->ts_cpu = self;
	/*
	 * If the thread can run on the last cpu and the affinity has not
	 * expired or it is idle run it there.
	 */
	pri = td->td_priority;
	tdq = TDQ_CPU(ts->ts_cpu);
	if (THREAD_CAN_SCHED(td, ts->ts_cpu)) {
		if (tdq->tdq_lowpri > PRI_MIN_IDLE)
			return (ts->ts_cpu);
		if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri)
			return (ts->ts_cpu);
	}
	/*
	 * Search for the highest level in the tree that still has affinity.
	 */
	cg = NULL;
	for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent)
		if (SCHED_AFFINITY(ts, cg->cg_level))
			break;
	cpu = -1;
	/*
	 * NOTE(review): only the first word of the cpuset is consulted
	 * here; presumably sufficient for the supported cpu counts at
	 * this revision -- confirm against CPU_SETSIZE.
	 */
	mask = td->td_cpuset->cs_mask.__bits[0];
	if (cg)
		cpu = sched_lowest(cg, mask, pri);
	if (cpu == -1)
		cpu = sched_lowest(cpu_top, mask, -1);
	/*
	 * Compare the lowest loaded cpu to current cpu.
	 */
	if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri &&
	    TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE)
		cpu = self;
	KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu."));
	return (cpu);
}
#endif

/*
 * Pick the highest priority task we have and return it.
 */
static struct td_sched *
tdq_choose(struct tdq *tdq)
{
	struct td_sched *ts;

	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	ts = runq_choose(&tdq->tdq_realtime);
	if (ts != NULL)
		return (ts);
	ts = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
	if (ts != NULL) {
		KASSERT(ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE,
		    ("tdq_choose: Invalid priority on timeshare queue %d",
		    ts->ts_thread->td_priority));
		return (ts);
	}

	ts = runq_choose(&tdq->tdq_idle);
	if (ts != NULL) {
		KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE,
		    ("tdq_choose: Invalid priority on idle queue %d",
		    ts->ts_thread->td_priority));
		return (ts);
	}

	return (NULL);
}

/*
 * Initialize a thread queue.
 */
static void
tdq_setup(struct tdq *tdq)
{

	if (bootverbose)
		printf("ULE: setup cpu %d\n", TDQ_ID(tdq));
	runq_init(&tdq->tdq_realtime);
	runq_init(&tdq->tdq_timeshare);
	runq_init(&tdq->tdq_idle);
	snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
	    "sched lock %d", (int)TDQ_ID(tdq));
	mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock",
	    MTX_SPIN | MTX_RECURSE);
}

#ifdef SMP
/*
 * Initialize a per-cpu thread queue for every present cpu and attach
 * each to its topology group.
 */
static void
sched_setup_smp(void)
{
	struct tdq *tdq;
	int i;

	cpu_top = smp_topo();
	for (i = 0; i < MAXCPU; i++) {
		if (CPU_ABSENT(i))
			continue;
		tdq = TDQ_CPU(i);
		tdq_setup(tdq);
		tdq->tdq_cg = smp_topo_find(cpu_top, i);
		if (tdq->tdq_cg == NULL)
			panic("Can't find cpu group for %d\n", i);
	}
	balance_tdq = TDQ_SELF();
	sched_balance();
}
#endif

/*
 * Setup the thread queues and initialize the topology based on MD
 * information.
 */
static void
sched_setup(void *dummy)
{
	struct tdq *tdq;

	tdq = TDQ_SELF();
#ifdef SMP
	sched_setup_smp();
#else
	tdq_setup(tdq);
#endif
	/*
	 * To avoid divide-by-zero, we set realstathz a dummy value
	 * in case which sched_clock() called before sched_initticks().
	 */
	realstathz = hz;
	sched_slice = (realstathz/10);	/* ~100ms */
	tickincr = 1 << SCHED_TICK_SHIFT;

	/* Add thread0's load since it's running. */
	TDQ_LOCK(tdq);
	thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
	tdq_load_add(tdq, &td_sched0);
	tdq->tdq_lowpri = thread0.td_priority;
	TDQ_UNLOCK(tdq);
}

/*
 * This routine determines the tickincr after stathz and hz are setup.
 */
/* ARGSUSED */
static void
sched_initticks(void *dummy)
{
	int incr;

	realstathz = stathz ? stathz : hz;
	sched_slice = (realstathz/10);	/* ~100ms */

	/*
	 * tickincr is shifted out by 10 to avoid rounding errors due to
	 * hz not being evenly divisible by stathz on all platforms.
	 */
	incr = (hz << SCHED_TICK_SHIFT) / realstathz;
	/*
	 * This does not work for values of stathz that are more than
	 * 1 << SCHED_TICK_SHIFT * hz.  In practice this does not happen.
	 */
	if (incr == 0)
		incr = 1;
	tickincr = incr;
#ifdef SMP
	/*
	 * Set the default balance interval now that we know
	 * what realstathz is.
	 */
	balance_interval = realstathz;
	/*
	 * Set steal thresh to log2(mp_ncpu) but no greater than 4.  This
	 * prevents excess thrashing on large machines and excess idle on
	 * smaller machines.
	 */
	steal_thresh = min(ffs(mp_ncpus) - 1, 3);
	affinity = SCHED_AFFINITY_DEFAULT;
#endif
}


/*
 * This is the core of the interactivity algorithm.  Determines a score based
 * on past behavior.  It is the ratio of sleep time to run time scaled to
 * a [0, 100] integer.  This is the voluntary sleep time of a process, which
 * differs from the cpu usage because it does not account for time spent
 * waiting on a run-queue.  Would be prettier if we had floating point.
 */
static int
sched_interact_score(struct thread *td)
{
	struct td_sched *ts;
	int div;

	ts = td->td_sched;
	/*
	 * The score is only needed if this is likely to be an interactive
	 * task.  Don't go through the expense of computing it if there's
	 * no chance.
	 */
	if (sched_interact <= SCHED_INTERACT_HALF &&
	    ts->ts_runtime >= ts->ts_slptime)
		return (SCHED_INTERACT_HALF);

	/* More run than sleep: score in (HALF, MAX]. */
	if (ts->ts_runtime > ts->ts_slptime) {
		div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
	}
	/* More sleep than run: score in [0, HALF). */
	if (ts->ts_slptime > ts->ts_runtime) {
		div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
		return (ts->ts_runtime / div);
	}
	/* runtime == slptime */
	if (ts->ts_runtime)
		return (SCHED_INTERACT_HALF);

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);

}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct thread *td)
{
	int score;
	int pri;

	if (td->td_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * If the score is interactive we place the thread in the realtime
	 * queue with a priority that is less than kernel and interrupt
	 * priorities.  These threads are not subject to nice restrictions.
	 *
	 * Scores greater than this are placed on the normal timeshare queue
	 * where the priority is partially decided by the most recent cpu
	 * utilization and the rest is decided by nice value.
	 *
	 * The nice value of the process has a linear effect on the calculated
	 * score.  Negative nice values make it easier for a thread to be
	 * considered interactive.
	 */
	score = imax(0, sched_interact_score(td) - td->td_proc->p_nice);
	if (score < sched_interact) {
		pri = PRI_MIN_REALTIME;
		pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
		    * score;
		KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
		    ("sched_priority: invalid interactive priority %d score %d",
		    pri, score));
	} else {
		pri = SCHED_PRI_MIN;
		if (td->td_sched->ts_ticks)
			pri += SCHED_PRI_TICKS(td->td_sched);
		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
		KASSERT(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE,
		    ("sched_priority: invalid priority %d: nice %d, "
		    "ticks %d ftick %d ltick %d tick pri %d",
		    pri, td->td_proc->p_nice, td->td_sched->ts_ticks,
		    td->td_sched->ts_ftick, td->td_sched->ts_ltick,
		    SCHED_PRI_TICKS(td->td_sched)));
	}
	sched_user_prio(td, pri);

	return;
}

/*
 * This routine enforces a maximum limit on the amount of scheduling history
 * kept.  It is called after either the slptime or runtime is adjusted.  This
 * function is ugly due to integer math.
 */
static void
sched_interact_update(struct thread *td)
{
	struct td_sched *ts;
	u_int sum;

	ts = td->td_sched;
	sum = ts->ts_runtime + ts->ts_slptime;
	if (sum < SCHED_SLP_RUN_MAX)
		return;
	/*
	 * This only happens from two places:
	 * 1)  We have added an unusual amount of run time from fork_exit.
	 * 2)  We have added an unusual amount of sleep time from sched_sleep().
	 */
	if (sum > SCHED_SLP_RUN_MAX * 2) {
		/* Clamp the dominant component; keep the ratio extreme. */
		if (ts->ts_runtime > ts->ts_slptime) {
			ts->ts_runtime = SCHED_SLP_RUN_MAX;
			ts->ts_slptime = 1;
		} else {
			ts->ts_slptime = SCHED_SLP_RUN_MAX;
			ts->ts_runtime = 1;
		}
		return;
	}
	/*
	 * If we have exceeded by more than 1/5th then the algorithm below
	 * will not bring us back into range.  Dividing by two here forces
	 * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX]
	 */
	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
		ts->ts_runtime /= 2;
		ts->ts_slptime /= 2;
		return;
	}
	/* Decay both components by 1/5th, preserving their ratio. */
	ts->ts_runtime = (ts->ts_runtime / 5) * 4;
	ts->ts_slptime = (ts->ts_slptime / 5) * 4;
}

/*
 * Scale back the interactivity history when a child thread is created.  The
 * history is inherited from the parent but the thread may behave totally
 * differently.  For example, a shell spawning a compiler process.  We want
 * to learn that the compiler is behaving badly very quickly.
 */
static void
sched_interact_fork(struct thread *td)
{
	int ratio;
	int sum;

	sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime;
	if (sum > SCHED_SLP_RUN_FORK) {
		/* Shrink both components proportionally. */
		ratio = sum / SCHED_SLP_RUN_FORK;
		td->td_sched->ts_runtime /= ratio;
		td->td_sched->ts_slptime /= ratio;
	}
}

/*
 * Called from proc0_init() to setup the scheduler fields.
 */
void
schedinit(void)
{

	/*
	 * Set up the scheduler specific parts of proc0.
	 */
	proc0.p_sched = NULL; /* XXX */
	thread0.td_sched = &td_sched0;
	td_sched0.ts_ltick = ticks;
	td_sched0.ts_ftick = ticks;
	td_sched0.ts_thread = &thread0;
	td_sched0.ts_slice = sched_slice;
}

/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most sched_slice stathz ticks.
 */
int
sched_rr_interval(void)
{

	/* Convert sched_slice to hz */
	return (hz/(realstathz/sched_slice));
}

/*
 * Update the percent cpu tracking information when it is requested or
 * the total history exceeds the maximum.  We keep a sliding history of
 * tick counts that slowly decays.  This is less precise than the 4BSD
 * mechanism since it happens with less regular and frequent events.
 */
static void
sched_pctcpu_update(struct td_sched *ts)
{

	if (ts->ts_ticks == 0)
		return;
	/* Skip the update if the window is fresh and not saturated. */
	if (ticks - (hz / 10) < ts->ts_ltick &&
	    SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX)
		return;
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */
	if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
		ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
			    SCHED_TICK_TARG;
	else
		ts->ts_ticks = 0;
	ts->ts_ltick = ticks;
	ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
}

/*
 * Adjust the priority of a thread.  Move it to the appropriate run-queue
 * if necessary.  This is the back-end for several priority related
 * functions.
 */
static void
sched_thread_priority(struct thread *td, u_char prio)
{
	struct td_sched *ts;
	struct tdq *tdq;
	int oldpri;

	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
	    td, td->td_name, td->td_priority, prio, curthread,
	    curthread->td_name);
	ts = td->td_sched;
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	if (td->td_priority == prio)
		return;

	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
		/*
		 * If the priority has been elevated due to priority
		 * propagation, we may have to move ourselves to a new
		 * queue.  This could be optimized to not re-add in some
		 * cases.
		 */
		sched_rem(td);
		td->td_priority = prio;
		sched_add(td, SRQ_BORROWING);
		return;
	}
	tdq = TDQ_CPU(ts->ts_cpu);
	oldpri = td->td_priority;
	td->td_priority = prio;
	/* Keep the queue's cached lowest priority consistent. */
	if (TD_IS_RUNNING(td)) {
		if (prio < tdq->tdq_lowpri)
			tdq->tdq_lowpri = prio;
		else if (tdq->tdq_lowpri == oldpri)
			tdq_setlowpri(tdq, td);
	}
}

/*
 * Update a thread's priority when it is lent another thread's
 * priority.
 */
void
sched_lend_prio(struct thread *td, u_char prio)
{

	td->td_flags |= TDF_BORROWING;
	sched_thread_priority(td, prio);
}

/*
 * Restore a thread's priority when priority propagation is
 * over.  The prio argument is the minimum priority the thread
 * needs to have to satisfy other possible priority lending
 * requests.  If the thread's regular priority is less
 * important than prio, the thread will keep a priority boost
 * of prio.
 */
void
sched_unlend_prio(struct thread *td, u_char prio)
{
	u_char base_pri;

	/* Timeshare threads revert to their user priority. */
	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
	    td->td_base_pri <= PRI_MAX_TIMESHARE)
		base_pri = td->td_user_pri;
	else
		base_pri = td->td_base_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_BORROWING;
		sched_thread_priority(td, base_pri);
	} else
		sched_lend_prio(td, prio);
}

/*
 * Standard entry for setting the priority to an absolute value.
 */
void
sched_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	/* First, update the base priority. */
	td->td_base_pri = prio;

	/*
	 * If the thread is borrowing another thread's priority, don't
	 * ever lower the priority.
	 */
	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
		return;

	/* Change the real priority. */
	oldprio = td->td_priority;
	sched_thread_priority(td, prio);

	/*
	 * If the thread is on a turnstile, then let the turnstile update
	 * its state.
	 */
	if (TD_ON_LOCK(td) && oldprio != prio)
		turnstile_adjust(td, oldprio);
}

/*
 * Set the base user priority, does not affect current running priority.
1615171482Sjeff */ 1616139453Sjhbvoid 1617163709Sjbsched_user_prio(struct thread *td, u_char prio) 1618161599Sdavidxu{ 1619161599Sdavidxu u_char oldprio; 1620161599Sdavidxu 1621163709Sjb td->td_base_user_pri = prio; 1622164939Sjulian if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio) 1623164939Sjulian return; 1624163709Sjb oldprio = td->td_user_pri; 1625163709Sjb td->td_user_pri = prio; 1626161599Sdavidxu} 1627161599Sdavidxu 1628161599Sdavidxuvoid 1629161599Sdavidxusched_lend_user_prio(struct thread *td, u_char prio) 1630161599Sdavidxu{ 1631161599Sdavidxu u_char oldprio; 1632161599Sdavidxu 1633174536Sdavidxu THREAD_LOCK_ASSERT(td, MA_OWNED); 1634161599Sdavidxu td->td_flags |= TDF_UBORROWING; 1635164091Smaxim oldprio = td->td_user_pri; 1636163709Sjb td->td_user_pri = prio; 1637161599Sdavidxu} 1638161599Sdavidxu 1639161599Sdavidxuvoid 1640161599Sdavidxusched_unlend_user_prio(struct thread *td, u_char prio) 1641161599Sdavidxu{ 1642161599Sdavidxu u_char base_pri; 1643161599Sdavidxu 1644174536Sdavidxu THREAD_LOCK_ASSERT(td, MA_OWNED); 1645163709Sjb base_pri = td->td_base_user_pri; 1646161599Sdavidxu if (prio >= base_pri) { 1647161599Sdavidxu td->td_flags &= ~TDF_UBORROWING; 1648163709Sjb sched_user_prio(td, base_pri); 1649174536Sdavidxu } else { 1650161599Sdavidxu sched_lend_user_prio(td, prio); 1651174536Sdavidxu } 1652161599Sdavidxu} 1653161599Sdavidxu 1654171482Sjeff/* 1655171505Sjeff * Add the thread passed as 'newtd' to the run queue before selecting 1656171505Sjeff * the next thread to run. This is only used for KSE. 
 */
static void
sched_switchin(struct tdq *tdq, struct thread *td)
{
#ifdef SMP
	/*
	 * Re-lock the incoming thread under a spinlock section so we are
	 * never preempted while holding neither queue lock.
	 */
	spinlock_enter();
	TDQ_UNLOCK(tdq);
	thread_lock(td);
	spinlock_exit();
	sched_setcpu(td->td_sched, TDQ_ID(tdq), SRQ_YIELDING);
#else
	td->td_lock = TDQ_LOCKPTR(tdq);
#endif
	tdq_add(tdq, td, SRQ_YIELDING);
	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
}

/*
 * Block a thread for switching.  Similar to thread_block() but does not
 * bump the spin count.  Returns the previous lock so the caller can
 * restore it with thread_unblock_switch() after the switch completes.
 */
static inline struct mtx *
thread_block_switch(struct thread *td)
{
	struct mtx *lock;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	lock = td->td_lock;
	/* Point td_lock at the global blocked_lock sentinel while in flux. */
	td->td_lock = &blocked_lock;
	mtx_unlock_spin(lock);

	return (lock);
}

/*
 * Handle migration from sched_switch().  This happens only for
 * cpu binding.
 */
static struct mtx *
sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
{
	struct tdq *tdn;

	tdn = TDQ_CPU(td->td_sched->ts_cpu);
#ifdef SMP
	tdq_load_rem(tdq, td->td_sched);
	/*
	 * Do the lock dance required to avoid LOR.  We grab an extra
	 * spinlock nesting to prevent preemption while we're
	 * not holding either run-queue lock.
	 */
	spinlock_enter();
	thread_block_switch(td);	/* This releases the lock on tdq. */
	TDQ_LOCK(tdn);
	tdq_add(tdn, td, flags);
	tdq_notify(tdn, td->td_sched);
	/*
	 * After we unlock tdn the new cpu still can't switch into this
	 * thread until we've unblocked it in cpu_switch().  The lock
	 * pointers may match in the case of HTT cores.  Don't unlock here
	 * or we can deadlock when the other CPU runs the IPI handler.
	 */
	if (TDQ_LOCKPTR(tdn) != TDQ_LOCKPTR(tdq)) {
		TDQ_UNLOCK(tdn);
		TDQ_LOCK(tdq);
	}
	spinlock_exit();
#endif
	return (TDQ_LOCKPTR(tdn));
}

/*
 * Release a thread that was blocked with thread_block_switch().
 * The release store pairs with the acquire spin in thread_lock().
 */
static inline void
thread_unblock_switch(struct thread *td, struct mtx *mtx)
{
	atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
	    (uintptr_t)mtx);
}

/*
 * Switch threads.  This function has to handle threads coming in while
 * blocked for some reason, running, or idle.  It also must deal with
 * migrating a thread from one queue to another as running threads may
 * be assigned elsewhere via binding.
 */
void
sched_switch(struct thread *td, struct thread *newtd, int flags)
{
	struct tdq *tdq;
	struct td_sched *ts;
	struct mtx *mtx;
	int srqflag;
	int cpuid;

	THREAD_LOCK_ASSERT(td, MA_OWNED);

	cpuid = PCPU_GET(cpuid);
	tdq = TDQ_CPU(cpuid);
	ts = td->td_sched;
	mtx = td->td_lock;
	ts->ts_rltick = ticks;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;
	td->td_owepreempt = 0;
	/*
	 * The lock pointer in an idle thread should never change.  Reset it
	 * to CAN_RUN as well.
	 */
	if (TD_IS_IDLETHREAD(td)) {
		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
		TD_SET_CAN_RUN(td);
	} else if (TD_IS_RUNNING(td)) {
		/* Still runnable: requeue locally or migrate if bound away. */
		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
		srqflag = (flags & SW_PREEMPT) ?
		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
		    SRQ_OURSELF|SRQ_YIELDING;
		if (ts->ts_cpu == cpuid)
			tdq_runq_add(tdq, ts, srqflag);
		else
			mtx = sched_switch_migrate(tdq, td, srqflag);
	} else {
		/* This thread must be going to sleep. */
		TDQ_LOCK(tdq);
		mtx = thread_block_switch(td);
		tdq_load_rem(tdq, ts);
	}
	/*
	 * We enter here with the thread blocked and assigned to the
	 * appropriate cpu run-queue or sleep-queue and with the current
	 * thread-queue locked.
	 */
	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
	/*
	 * If KSE assigned a new thread just add it here and let choosethread
	 * select the best one.
	 */
	if (newtd != NULL)
		sched_switchin(tdq, newtd);
	newtd = choosethread();
	/*
	 * Call the MD code to switch contexts if necessary.
	 */
	if (td != newtd) {
#ifdef HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif
		/* Hand the run-queue lock to newtd across the MD switch. */
		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
		TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
		cpu_switch(td, newtd, mtx);
		/*
		 * We may return from cpu_switch on a different cpu.  However,
		 * we always return with td_lock pointing to the current cpu's
		 * run queue lock.
		 */
		cpuid = PCPU_GET(cpuid);
		tdq = TDQ_CPU(cpuid);
		lock_profile_obtain_lock_success(
		    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
#ifdef HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
#endif
	} else
		thread_unblock_switch(td, mtx);
	/*
	 * We should always get here with the lowest priority td possible.
	 */
	tdq->tdq_lowpri = td->td_priority;
	/*
	 * Assert that all went well and return.
	 */
	TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED);
	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
	td->td_oncpu = cpuid;
}

/*
 * Adjust thread priorities as a result of a nice request.
 * Re-derives each thread's timeshare priority from the new nice value.
 */
void
sched_nice(struct proc *p, int nice)
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);

	p->p_nice = nice;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		sched_priority(td);
		sched_prio(td, td->td_base_user_pri);
		thread_unlock(td);
	}
}

/*
 * Record the sleep time for the interactivity scorer.
 * prio, when non-zero, is a static kernel sleep priority (e.g. PSOCK).
 */
void
sched_sleep(struct thread *td, int prio)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);

	td->td_slptick = ticks;
	if (TD_IS_SUSPENDED(td) || prio <= PSOCK)
		td->td_flags |= TDF_CANSWAP;
	/* Optionally boost to the static priority while asleep (sysctl). */
	if (static_boost && prio)
		sched_prio(td, prio);
}

/*
 * Schedule a thread to resume execution and record how long it voluntarily
 * slept.  We also update the pctcpu, interactivity, and priority.
 */
void
sched_wakeup(struct thread *td)
{
	struct td_sched *ts;
	int slptick;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td->td_sched;
	td->td_flags &= ~TDF_CANSWAP;
	/*
	 * If we slept for more than a tick update our interactivity and
	 * priority.
 */
	slptick = td->td_slptick;
	td->td_slptick = 0;
	if (slptick && slptick != ticks) {
		u_int hzticks;

		/* Convert elapsed ticks into the fixed-point slptime units. */
		hzticks = (ticks - slptick) << SCHED_TICK_SHIFT;
		ts->ts_slptime += hzticks;
		sched_interact_update(td);
		sched_pctcpu_update(ts);
	}
	/* Reset the slice value after we sleep. */
	ts->ts_slice = sched_slice;
	sched_add(td, SRQ_BORING);
}

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct thread *td, struct thread *child)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	sched_fork_thread(td, child);
	/*
	 * Penalize the parent and child for forking.
	 */
	sched_interact_fork(child);
	sched_priority(child);
	td->td_sched->ts_runtime += tickincr;
	sched_interact_update(td);
	sched_priority(td);
}

/*
 * Fork a new thread, may be within the same process.
 * Copies the parent's scheduling state into the child.
 */
void
sched_fork_thread(struct thread *td, struct thread *child)
{
	struct td_sched *ts;
	struct td_sched *ts2;

	/*
	 * Initialize child.
	 */
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	sched_newthread(child);
	child->td_lock = TDQ_LOCKPTR(TDQ_SELF());
	child->td_cpuset = cpuset_ref(td->td_cpuset);
	ts = td->td_sched;
	ts2 = child->td_sched;
	ts2->ts_cpu = ts->ts_cpu;
	ts2->ts_runq = NULL;
	/*
	 * Grab our parents cpu estimation information and priority.
	 */
	ts2->ts_ticks = ts->ts_ticks;
	ts2->ts_ltick = ts->ts_ltick;
	ts2->ts_ftick = ts->ts_ftick;
	child->td_user_pri = td->td_user_pri;
	child->td_base_user_pri = td->td_base_user_pri;
	/*
	 * And update interactivity score.
	 */
	ts2->ts_slptime = ts->ts_slptime;
	ts2->ts_runtime = ts->ts_runtime;
	ts2->ts_slice = 1;	/* Attempt to quickly learn interactivity. */
}

/*
 * Adjust the priority class of a thread.
 */
void
sched_class(struct thread *td, int class)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	if (td->td_pri_class == class)
		return;
	/*
	 * On SMP if we're on the RUNQ we must adjust the transferable
	 * count because could be changing to or from an interrupt
	 * class.
	 */
	if (TD_ON_RUNQ(td)) {
		struct tdq *tdq;

		tdq = TDQ_CPU(td->td_sched->ts_cpu);
		if (THREAD_CAN_MIGRATE(td))
			tdq->tdq_transferable--;
		/*
		 * The class is updated between the two THREAD_CAN_MIGRATE()
		 * checks on purpose: migratability depends on the class, so
		 * the counter is adjusted under both the old and new class.
		 */
		td->td_pri_class = class;
		if (THREAD_CAN_MIGRATE(td))
			tdq->tdq_transferable++;
	}
	td->td_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct thread *child)
{
	struct thread *td;

	CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
	    child, child->td_name, child->td_priority);

	PROC_LOCK_ASSERT(p, MA_OWNED);
	td = FIRST_THREAD_IN_PROC(p);
	sched_exit_thread(td, child);
}

/*
 * Penalize another thread for the time spent on this one.  This helps to
 * worsen the priority and interactivity of processes which schedule batch
 * jobs such as make.  This has little effect on the make process itself but
 * causes new processes spawned by it to receive worse scores immediately.
 */
void
sched_exit_thread(struct thread *td, struct thread *child)
{

	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
	    child, child->td_name, child->td_priority);

	/*
	 * Give the child's runtime to the parent without returning the
	 * sleep time as a penalty to the parent.  This causes shells that
	 * launch expensive things to mark their children as expensive.
	 */
	thread_lock(td);
	td->td_sched->ts_runtime += child->td_sched->ts_runtime;
	sched_interact_update(td);
	sched_priority(td);
	thread_unlock(td);
}

/*
 * Handle a preemption IPI on this cpu: switch away if a higher-priority
 * thread is waiting, or defer via td_owepreempt inside a critical section.
 */
void
sched_preempt(struct thread *td)
{
	struct tdq *tdq;

	thread_lock(td);
	tdq = TDQ_SELF();
	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	tdq->tdq_ipipending = 0;
	if (td->td_priority > tdq->tdq_lowpri) {
		if (td->td_critnest > 1)
			td->td_owepreempt = 1;
		else
			mi_switch(SW_INVOL | SW_PREEMPT, NULL);
	}
	thread_unlock(td);
}

/*
 * Fix priorities on return to user-space.  Priorities may be elevated due
 * to static priorities in msleep() or similar.
 */
void
sched_userret(struct thread *td)
{
	/*
	 * XXX we cheat slightly on the locking here to avoid locking in
	 * the usual case.  Setting td_priority here is essentially an
	 * incomplete workaround for not setting it properly elsewhere.
	 * Now that some interrupt handlers are threads, not setting it
	 * properly elsewhere can clobber it in the window between setting
	 * it here and returning to user mode, so don't waste time setting
	 * it perfectly here.
	 */
	KASSERT((td->td_flags & TDF_BORROWING) == 0,
	    ("thread with borrowed priority returning to userland"));
	if (td->td_priority != td->td_user_pri) {
		thread_lock(td);
		td->td_priority = td->td_user_pri;
		td->td_base_pri = td->td_user_pri;
		tdq_setlowpri(TDQ_SELF(), td);
		thread_unlock(td);
	}
}

/*
 * Handle a stathz tick.  This is really only relevant for timeshare
 * threads.
 */
void
sched_clock(struct thread *td)
{
	struct tdq *tdq;
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	tdq = TDQ_SELF();
#ifdef SMP
	/*
	 * We run the long term load balancer infrequently on the first cpu.
	 */
	if (balance_tdq == tdq) {
		if (balance_ticks && --balance_ticks == 0)
			sched_balance();
	}
#endif
	/*
	 * Advance the insert index once for each tick to ensure that all
	 * threads get a chance to run.
	 */
	if (tdq->tdq_idx == tdq->tdq_ridx) {
		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
			tdq->tdq_ridx = tdq->tdq_idx;
	}
	ts = td->td_sched;
	/* Fixed-priority (FIFO) threads are not subject to slices. */
	if (td->td_pri_class & PRI_FIFO_BIT)
		return;
	if (td->td_pri_class == PRI_TIMESHARE) {
		/*
		 * We used a tick; charge it to the thread so
		 * that we can compute our interactivity.
		 */
		td->td_sched->ts_runtime += tickincr;
		sched_interact_update(td);
		sched_priority(td);
	}
	/*
	 * We used up one time slice.
	 */
	if (--ts->ts_slice > 0)
		return;
	/*
	 * We're out of time, force a requeue at userret().
	 */
	ts->ts_slice = sched_slice;
	td->td_flags |= TDF_NEEDRESCHED;
}

/*
 * Called once per hz tick.  Used for cpu utilization information.  This
 * is easier than trying to scale based on stathz.
 */
void
sched_tick(void)
{
	struct td_sched *ts;

	ts = curthread->td_sched;
	/* Adjust ticks for pctcpu */
	ts->ts_ticks += 1 << SCHED_TICK_SHIFT;
	ts->ts_ltick = ticks;
	/*
	 * Update if we've exceeded our desired tick threshhold by over one
	 * second.
	 */
	if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick)
		sched_pctcpu_update(ts);
}

/*
 * Return whether the current CPU has runnable tasks.  Used for in-kernel
 * cooperative idle threads.
2151171482Sjeff */ 2152109864Sjeffint 2153109864Sjeffsched_runnable(void) 2154109864Sjeff{ 2155164936Sjulian struct tdq *tdq; 2156115998Sjeff int load; 2157109864Sjeff 2158115998Sjeff load = 1; 2159115998Sjeff 2160164936Sjulian tdq = TDQ_SELF(); 2161121605Sjeff if ((curthread->td_flags & TDF_IDLETD) != 0) { 2162165620Sjeff if (tdq->tdq_load > 0) 2163121605Sjeff goto out; 2164121605Sjeff } else 2165165620Sjeff if (tdq->tdq_load - 1 > 0) 2166121605Sjeff goto out; 2167115998Sjeff load = 0; 2168115998Sjeffout: 2169115998Sjeff return (load); 2170109864Sjeff} 2171109864Sjeff 2172171482Sjeff/* 2173171482Sjeff * Choose the highest priority thread to run. The thread is removed from 2174171482Sjeff * the run-queue while running however the load remains. For SMP we set 2175171482Sjeff * the tdq in the global idle bitmask if it idles here. 2176171482Sjeff */ 2177166190Sjeffstruct thread * 2178109970Sjeffsched_choose(void) 2179109970Sjeff{ 2180171482Sjeff struct td_sched *ts; 2181164936Sjulian struct tdq *tdq; 2182109970Sjeff 2183164936Sjulian tdq = TDQ_SELF(); 2184171482Sjeff TDQ_LOCK_ASSERT(tdq, MA_OWNED); 2185164936Sjulian ts = tdq_choose(tdq); 2186164936Sjulian if (ts) { 2187177042Sjeff ts->ts_ltick = ticks; 2188164936Sjulian tdq_runq_rem(tdq, ts); 2189166190Sjeff return (ts->ts_thread); 2190109864Sjeff } 2191176735Sjeff return (PCPU_GET(idlethread)); 2192109864Sjeff} 2193109864Sjeff 2194171482Sjeff/* 2195171482Sjeff * Set owepreempt if necessary. Preemption never happens directly in ULE, 2196171482Sjeff * we always request it once we exit a critical section. 
 */
static inline void
sched_setpreempt(struct thread *td)
{
	struct thread *ctd;
	int cpri;
	int pri;

	THREAD_LOCK_ASSERT(curthread, MA_OWNED);

	ctd = curthread;
	pri = td->td_priority;
	cpri = ctd->td_priority;
	/* Always note that a reschedule is wanted if td is more important. */
	if (pri < cpri)
		ctd->td_flags |= TDF_NEEDRESCHED;
	if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
		return;
	if (!sched_shouldpreempt(pri, cpri, 0))
		return;
	/* Defer the actual switch to critical-section exit. */
	ctd->td_owepreempt = 1;
}

/*
 * Add a thread to a thread queue.  Select the appropriate runq and add the
 * thread to it.  This is the internal function called when the tdq is
 * predetermined.
 */
void
tdq_add(struct tdq *tdq, struct thread *td, int flags)
{
	struct td_sched *ts;

	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("sched_add: trying to run inhibited thread"));
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("sched_add: bad thread state"));
	KASSERT(td->td_flags & TDF_INMEM,
	    ("sched_add: thread swapped out"));

	ts = td->td_sched;
	/* Keep the cached lowest (most important) priority up to date. */
	if (td->td_priority < tdq->tdq_lowpri)
		tdq->tdq_lowpri = td->td_priority;
	tdq_runq_add(tdq, ts, flags);
	tdq_load_add(tdq, ts);
}

/*
 * Select the target thread queue and add a thread to it.  Request
 * preemption or IPI a remote processor if required.
 */
void
sched_add(struct thread *td, int flags)
{
	struct tdq *tdq;
#ifdef SMP
	struct td_sched *ts;
	int cpu;
#endif
	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
	    td, td->td_name, td->td_priority, curthread,
	    curthread->td_name);
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	/*
	 * Recalculate the priority before we select the target cpu or
	 * run-queue.
	 */
	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
		sched_priority(td);
#ifdef SMP
	/*
	 * Pick the destination cpu and if it isn't ours transfer to the
	 * target cpu.
	 */
	ts = td->td_sched;
	cpu = sched_pickcpu(ts, flags);
	tdq = sched_setcpu(ts, cpu, flags);
	tdq_add(tdq, td, flags);
	if (cpu != PCPU_GET(cpuid)) {
		/* Remote queue: IPI the target cpu instead of preempting. */
		tdq_notify(tdq, ts);
		return;
	}
#else
	tdq = TDQ_SELF();
	TDQ_LOCK(tdq);
	/*
	 * Now that the thread is moving to the run-queue, set the lock
	 * to the scheduler's lock.
	 */
	thread_lock_set(td, TDQ_LOCKPTR(tdq));
	tdq_add(tdq, td, flags);
#endif
	if (!(flags & SRQ_YIELDING))
		sched_setpreempt(td);
}

/*
 * Remove a thread from a run-queue without running it.  This is used
 * when we're stealing a thread from a remote queue.  Otherwise all threads
 * exit by calling sched_exit_thread() and sched_throw() themselves.
 */
void
sched_rem(struct thread *td)
{
	struct tdq *tdq;
	struct td_sched *ts;

	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
	    td, td->td_name, td->td_priority, curthread,
	    curthread->td_name);
	ts = td->td_sched;
	tdq = TDQ_CPU(ts->ts_cpu);
	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
	KASSERT(TD_ON_RUNQ(td),
	    ("sched_rem: thread not on run queue"));
	tdq_runq_rem(tdq, ts);
	tdq_load_rem(tdq, ts);
	TD_SET_CAN_RUN(td);
	/* If we removed the most important thread, recompute the cache. */
	if (td->td_priority == tdq->tdq_lowpri)
		tdq_setlowpri(tdq, NULL);
}

/*
 * Fetch cpu utilization information.  Updates on demand.
 */
fixpt_t
sched_pctcpu(struct thread *td)
{
	fixpt_t pctcpu;
	struct td_sched *ts;

	pctcpu = 0;
	ts = td->td_sched;
	if (ts == NULL)
		return (0);

	thread_lock(td);
	if (ts->ts_ticks) {
		int rtick;

		sched_pctcpu_update(ts);
		/* How many rtick per second ? */
		rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
		pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
	}
	thread_unlock(td);

	return (pctcpu);
}

/*
 * Enforce affinity settings for a thread.  Called after adjustments to
 * cpumask.
 */
void
sched_affinity(struct thread *td)
{
#ifdef SMP
	struct td_sched *ts;
	int cpu;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td->td_sched;
	/* Nothing to do if the current cpu is still permitted. */
	if (THREAD_CAN_SCHED(td, ts->ts_cpu))
		return;
	if (!TD_IS_RUNNING(td))
		return;
	td->td_flags |= TDF_NEEDRESCHED;
	if (!THREAD_CAN_MIGRATE(td))
		return;
	/*
	 * Assign the new cpu and force a switch before returning to
	 * userspace.  If the target thread is not running locally send
	 * an ipi to force the issue.
	 */
	cpu = ts->ts_cpu;
	ts->ts_cpu = sched_pickcpu(ts, 0);
	if (cpu != PCPU_GET(cpuid))
		ipi_selected(1 << cpu, IPI_PREEMPT);
#endif
}

/*
 * Bind a thread to a target cpu.
 */
void
sched_bind(struct thread *td, int cpu)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
	ts = td->td_sched;
	/* Re-binding: drop the old binding (and its pin) first. */
	if (ts->ts_flags & TSF_BOUND)
		sched_unbind(td);
	ts->ts_flags |= TSF_BOUND;
	sched_pin();
	if (PCPU_GET(cpuid) == cpu)
		return;
	ts->ts_cpu = cpu;
	/* When we return from mi_switch we'll be on the correct cpu. */
	mi_switch(SW_VOL, NULL);
}

/*
 * Release a bound thread.
2403171482Sjeff */ 2404122038Sjeffvoid 2405122038Sjeffsched_unbind(struct thread *td) 2406122038Sjeff{ 2407165762Sjeff struct td_sched *ts; 2408165762Sjeff 2409170293Sjeff THREAD_LOCK_ASSERT(td, MA_OWNED); 2410165762Sjeff ts = td->td_sched; 2411166137Sjeff if ((ts->ts_flags & TSF_BOUND) == 0) 2412166137Sjeff return; 2413165762Sjeff ts->ts_flags &= ~TSF_BOUND; 2414165762Sjeff sched_unpin(); 2415122038Sjeff} 2416122038Sjeff 2417109864Sjeffint 2418145256Sjkoshysched_is_bound(struct thread *td) 2419145256Sjkoshy{ 2420170293Sjeff THREAD_LOCK_ASSERT(td, MA_OWNED); 2421164936Sjulian return (td->td_sched->ts_flags & TSF_BOUND); 2422145256Sjkoshy} 2423145256Sjkoshy 2424171482Sjeff/* 2425171482Sjeff * Basic yield call. 2426171482Sjeff */ 2427159630Sdavidxuvoid 2428159630Sdavidxusched_relinquish(struct thread *td) 2429159630Sdavidxu{ 2430170293Sjeff thread_lock(td); 2431170293Sjeff SCHED_STAT_INC(switch_relinquish); 2432159630Sdavidxu mi_switch(SW_VOL, NULL); 2433170293Sjeff thread_unlock(td); 2434159630Sdavidxu} 2435159630Sdavidxu 2436171482Sjeff/* 2437171482Sjeff * Return the total system load. 2438171482Sjeff */ 2439145256Sjkoshyint 2440125289Sjeffsched_load(void) 2441125289Sjeff{ 2442125289Sjeff#ifdef SMP 2443125289Sjeff int total; 2444125289Sjeff int i; 2445125289Sjeff 2446125289Sjeff total = 0; 2447176735Sjeff for (i = 0; i <= mp_maxid; i++) 2448176735Sjeff total += TDQ_CPU(i)->tdq_sysload; 2449125289Sjeff return (total); 2450125289Sjeff#else 2451165620Sjeff return (TDQ_SELF()->tdq_sysload); 2452125289Sjeff#endif 2453125289Sjeff} 2454125289Sjeff 2455125289Sjeffint 2456109864Sjeffsched_sizeof_proc(void) 2457109864Sjeff{ 2458109864Sjeff return (sizeof(struct proc)); 2459109864Sjeff} 2460109864Sjeff 2461109864Sjeffint 2462109864Sjeffsched_sizeof_thread(void) 2463109864Sjeff{ 2464109864Sjeff return (sizeof(struct thread) + sizeof(struct td_sched)); 2465109864Sjeff} 2466159570Sdavidxu 2467166190Sjeff/* 2468166190Sjeff * The actual idle process. 
 */
void
sched_idletd(void *dummy)
{
	struct thread *td;
	struct tdq *tdq;

	td = curthread;
	tdq = TDQ_SELF();
	mtx_assert(&Giant, MA_NOTOWNED);
	/* ULE relies on preemption for idle interruption. */
	for (;;) {
#ifdef SMP
		/* Try to steal work from other queues before idling. */
		if (tdq_idled(tdq))
			cpu_idle();
#else
		cpu_idle();
#endif
	}
	/* Never returns. */
}

/*
 * A CPU is entering for the first time or a thread is exiting.
 */
void
sched_throw(struct thread *td)
{
	struct thread *newtd;
	struct tdq *tdq;

	tdq = TDQ_SELF();
	if (td == NULL) {
		/* Correct spinlock nesting and acquire the correct lock. */
		TDQ_LOCK(tdq);
		spinlock_exit();
	} else {
		/* An exiting thread must hold its own run-queue lock. */
		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
		tdq_load_rem(tdq, td->td_sched);
		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
	}
	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
	newtd = choosethread();
	/*
	 * Hand the run-queue lock directly to the incoming thread by
	 * writing it as the raw owner; cpu_throw() will not release it.
	 */
	TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
	PCPU_SET(switchtime, cpu_ticks());
	PCPU_SET(switchticks, ticks);
	cpu_throw(td, newtd);	/* doesn't return */
}

/*
 * This is called from fork_exit().  Just acquire the correct locks and
 * let fork do the rest of the work.
 */
void
sched_fork_exit(struct thread *td)
{
	struct td_sched *ts;
	struct tdq *tdq;
	int cpuid;

	/*
	 * Finish setting up thread glue so that it begins execution in a
	 * non-nested critical section with the scheduler lock held.
	 */
	cpuid = PCPU_GET(cpuid);
	tdq = TDQ_CPU(cpuid);
	ts = td->td_sched;
	/* Idle threads arrive without their lock pointed at the run queue. */
	if (TD_IS_IDLETHREAD(td))
		td->td_lock = TDQ_LOCKPTR(tdq);
	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
	td->td_oncpu = cpuid;
	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
	/* Balance the lock-profiling state for the inherited lock. */
	lock_profile_obtain_lock_success(
	    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
	tdq->tdq_lowpri = td->td_priority;
}

/* Tunables and read-only scheduler information exported via sysctl. */
static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0,
    "Scheduler");
SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
    "Scheduler name");
SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
    "Slice size for timeshare threads");
SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
    "Interactivity score threshold");
SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
    0,"Min priority for preemption, lower priorities have greater precedence");
SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost,
    0,"Controls whether static kernel priorities are assigned to sleeping threads.");
#ifdef SMP
SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
    "Number of hz ticks to keep thread affinity for");
SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
    "Enables the long-term load balancer");
SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
    &balance_interval, 0,
    "Average frequency in stathz ticks to run the long-term balancer");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
    "Steals work from another hyper-threaded core on idle");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
    "Attempts to steal work from other cores before idling");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
    "Minimum load on remote cpu before we'll steal");
#endif

/* ps compat.  All cpu percentages from ULE are weighted. */
static int ccpu = 0;
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");


/* Pull in the scheduler-independent switch code in this compilation unit. */
#define KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"