sched_ule.c revision 166190
/*-
 * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 166190 2007-01-23 08:50:34Z jeff $");

#include "opt_hwpmc_hooks.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/turnstile.h>
#include <sys/umtx.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

#include <machine/cpu.h>
#include <machine/smp.h>

#ifndef PREEMPTION
#error	"SCHED_ULE requires options PREEMPTION"
#endif

/*
 * TODO:
 *	Pick idle from affinity group or self group first.
 *	Implement pick_score.
 */

/*
 * Thread scheduler specific section.
 */
struct td_sched {
	TAILQ_ENTRY(td_sched) ts_procq;	/* (j/z) Run queue. */
	int		ts_flags;	/* (j) TSF_* flags. */
	struct thread	*ts_thread;	/* (*) Active associated thread. */
	u_char		ts_rqindex;	/* (j) Run queue index. */
	int		ts_slptime;
	int		ts_slice;
	struct runq	*ts_runq;
	u_char		ts_cpu;		/* CPU that we have affinity for. */
	/* The following variables are only used for pctcpu calculation */
	int		ts_ltick;	/* Last tick that we were running on */
	int		ts_ftick;	/* First tick that we were running on */
	int		ts_ticks;	/* Tick count */
#ifdef SMP
	int		ts_rltick;	/* Real last tick, for affinity. */
#endif

	/* originally from kg_sched */
	int	skg_slptime;		/* Number of ticks we vol. slept */
	int	skg_runtime;		/* Number of ticks we were running */
};
/* flags kept in ts_flags */
#define	TSF_BOUND	0x0001		/* Thread can not migrate. */
#define	TSF_XFERABLE	0x0002		/* Thread was added as transferable. */
#define	TSF_DIDRUN	0x2000		/* Thread actually ran. */

static struct td_sched td_sched0;

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_TICK_SECS:	Number of seconds to average the cpu usage across.
 * SCHED_TICK_TARG:	Number of hz ticks to average the cpu usage across.
 * SCHED_TICK_MAX:	Maximum number of ticks before scaling back.
 * SCHED_TICK_SHIFT:	Shift factor to avoid rounding away results.
 * SCHED_TICK_HZ:	Compute the number of hz ticks for a given ticks count.
 * SCHED_TICK_TOTAL:	Gives the amount of time we've been recording ticks.
 */
#define	SCHED_TICK_SECS		10
#define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
#define	SCHED_TICK_MAX		(SCHED_TICK_TARG + hz)
#define	SCHED_TICK_SHIFT	10
#define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
#define	SCHED_TICK_TOTAL(ts)	(max((ts)->ts_ltick - (ts)->ts_ftick, hz))

/*
 * These macros determine priorities for non-interactive threads.  They are
 * assigned a priority based on their recent cpu utilization as expressed
 * by the ratio of ticks to the tick total.  NHALF priorities at the start
 * and end of the MIN to MAX timeshare range are only reachable with negative
 * or positive nice respectively.
 *
 * PRI_RANGE:	Priority range for utilization dependent priorities.
 * PRI_NRESV:	Number of nice values.
 * PRI_TICKS:	Compute a priority in PRI_RANGE from the ticks count and total.
 * PRI_NICE:	Determines the part of the priority inherited from nice.
 */
#define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
#define	SCHED_PRI_MIN		(PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
#define	SCHED_PRI_MAX		(PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
#define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN + 1)
#define	SCHED_PRI_TICKS(ts)						\
    (SCHED_TICK_HZ((ts)) /						\
    (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
#define	SCHED_PRI_NICE(nice)	(nice)
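
/*
 * Worked example of the mapping above (illustrative numbers only, assuming
 * hz = 1000): SCHED_TICK_TOTAL() spans at most roughly ten seconds of
 * history.  A thread that accumulated about half of that window of cpu time
 * has SCHED_TICK_HZ() of roughly 5000, so SCHED_PRI_TICKS() lands near the
 * middle of SCHED_PRI_RANGE, and SCHED_PRI_NICE() then shifts the result by
 * the nice value.
 */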

/*
 * These determine the interactivity of a process.  Interactivity differs from
 * cpu utilization in that it expresses the voluntary time slept vs time ran
 * while cpu utilization includes all time not running.  This more accurately
 * models the intent of the thread.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
#define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(30)

/*
 * tickincr:		Converts a stathz tick into a hz domain scaled by
 *			the shift factor.  Without the shift the error rate
 *			due to rounding would be unacceptably high.
 * realstathz:		stathz is sometimes 0 and run off of hz.
 * sched_slice:		Runtime of each thread before rescheduling.
 */
static int sched_interact = SCHED_INTERACT_THRESH;
static int realstathz;
static int tickincr;
static int sched_slice;

/*
 * tdq - per processor runqs and statistics.
 */
struct tdq {
	struct runq	tdq_idle;	/* Queue of IDLE threads. */
	struct runq	tdq_timeshare;	/* timeshare run queue. */
	struct runq	tdq_realtime;	/* real-time run queue. */
	int		tdq_idx;	/* Current insert index. */
	int		tdq_ridx;	/* Current removal index. */
	int		tdq_load;	/* Aggregate load. */
	int		tdq_flags;	/* Thread queue flags */
#ifdef SMP
	int		tdq_transferable;
	LIST_ENTRY(tdq)	tdq_siblings;	/* Next in tdq group. */
	struct tdq_group *tdq_group;	/* Our processor group. */
#else
	int		tdq_sysload;	/* For loadavg, !ITHD load. */
#endif
};

#define	TDQF_BUSY	0x0001		/* Queue is marked as busy */

#ifdef SMP
/*
 * tdq groups are groups of processors which can cheaply share threads.  When
 * one processor in the group goes idle it will check the runqs of the other
 * processors in its group prior to halting and waiting for an interrupt.
 * These groups are suitable for SMT (Symmetric Multi-Threading) and not NUMA.
 * In a NUMA environment we'd want an idle bitmap per group and a two tiered
 * load balancer.
 */
struct tdq_group {
	int	tdg_cpus;		/* Count of CPUs in this tdq group. */
	cpumask_t tdg_cpumask;		/* Mask of cpus in this group. */
	cpumask_t tdg_idlemask;		/* Idle cpus in this group. */
	cpumask_t tdg_mask;		/* Bit mask for first cpu. */
	int	tdg_load;		/* Total load of this group. */
	int	tdg_transferable;	/* Transferable load of this group. */
	LIST_HEAD(, tdq) tdg_members;	/* Linked list of all members. */
};

#define	SCHED_AFFINITY_DEFAULT	(hz / 100)
#define	SCHED_AFFINITY(ts)	((ts)->ts_rltick > ticks - affinity)

/*
 * Run-time tunables.
 */
static int rebalance = 0;
static int pick_pri = 1;
static int affinity;
static int tryself = 1;
static int tryselfidle = 1;
static int ipi_ast = 0;
static int ipi_preempt = 1;
static int ipi_thresh = PRI_MIN_KERN;
static int steal_htt = 1;
static int steal_busy = 1;
static int busy_thresh = 4;

/*
 * One thread queue per processor.
 */
static volatile cpumask_t tdq_idle;
static volatile cpumask_t tdq_busy;
static int tdg_maxid;
static struct tdq	tdq_cpu[MAXCPU];
static struct tdq_group tdq_groups[MAXCPU];
static int bal_tick;
static int gbal_tick;
static int balance_groups;

#define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
#define	TDQ_CPU(x)	(&tdq_cpu[(x)])
#define	TDQ_ID(x)	((x) - tdq_cpu)
#define	TDQ_GROUP(x)	(&tdq_groups[(x)])
#else	/* !SMP */
static struct tdq	tdq_cpu;

#define	TDQ_SELF()	(&tdq_cpu)
#define	TDQ_CPU(x)	(&tdq_cpu)
#endif

static void sched_priority(struct thread *);
static void sched_thread_priority(struct thread *, u_char);
static int sched_interact_score(struct thread *);
static void sched_interact_update(struct thread *);
static void sched_interact_fork(struct thread *);
static void sched_pctcpu_update(struct td_sched *);
static inline void sched_pin_td(struct thread *td);
static inline void sched_unpin_td(struct thread *td);

/* Operations on per processor queues */
static struct td_sched * tdq_choose(struct tdq *);
static void tdq_setup(struct tdq *);
static void tdq_load_add(struct tdq *, struct td_sched *);
static void tdq_load_rem(struct tdq *, struct td_sched *);
static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
void tdq_print(int cpu);
static void runq_print(struct runq *rq);
#ifdef SMP
static int tdq_pickidle(struct tdq *, struct td_sched *);
static int tdq_pickpri(struct tdq *, struct td_sched *, int);
static struct td_sched *runq_steal(struct runq *);
static void sched_balance(void);
static void sched_balance_groups(void);
static void sched_balance_group(struct tdq_group *);
static void sched_balance_pair(struct tdq *, struct tdq *);
static void sched_smp_tick(struct thread *);
static void tdq_move(struct tdq *, int);
static int tdq_idled(struct tdq *);
static void tdq_notify(struct td_sched *);
static struct td_sched *tdq_steal(struct tdq *, int);

#define	THREAD_CAN_MIGRATE(td)	((td)->td_pinned == 0)
#endif

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static void sched_initticks(void *dummy);
SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)

static inline void
sched_pin_td(struct thread *td)
{
	td->td_pinned++;
}

static inline void
sched_unpin_td(struct thread *td)
{
	td->td_pinned--;
}

static void
runq_print(struct runq *rq)
{
	struct rqhead *rqh;
	struct td_sched *ts;
	int pri;
	int j;
	int i;

	for (i = 0; i < RQB_LEN; i++) {
		printf("\t\trunq bits %d 0x%zx\n",
		    i, rq->rq_status.rqb_bits[i]);
		for (j = 0; j < RQB_BPW; j++)
			if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
				pri = j + (i << RQB_L2BPW);
				rqh = &rq->rq_queues[pri];
				TAILQ_FOREACH(ts, rqh, ts_procq) {
					printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
					    ts->ts_thread, ts->ts_thread->td_proc->p_comm, ts->ts_thread->td_priority, ts->ts_rqindex, pri);
				}
			}
	}
}

void
tdq_print(int cpu)
{
	struct tdq *tdq;

	tdq = TDQ_CPU(cpu);

	printf("tdq:\n");
	printf("\tload: %d\n", tdq->tdq_load);
	printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
	printf("\trealtime runq:\n");
	runq_print(&tdq->tdq_realtime);
	printf("\ttimeshare runq:\n");
	runq_print(&tdq->tdq_timeshare);
	printf("\tidle runq:\n");
	runq_print(&tdq->tdq_idle);
#ifdef SMP
	printf("\tload transferable: %d\n", tdq->tdq_transferable);
#endif
}

static __inline void
tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
{
#ifdef SMP
	if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
		tdq->tdq_transferable++;
		tdq->tdq_group->tdg_transferable++;
		ts->ts_flags |= TSF_XFERABLE;
		if (tdq->tdq_transferable >= busy_thresh &&
		    (tdq->tdq_flags & TDQF_BUSY) == 0) {
			tdq->tdq_flags |= TDQF_BUSY;
			atomic_set_int(&tdq_busy, 1 << TDQ_ID(tdq));
		}
	}
#endif
	if (ts->ts_runq == &tdq->tdq_timeshare) {
		int pri;

		pri = ts->ts_thread->td_priority;
		KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
		    ("Invalid priority %d on timeshare runq", pri));
		/*
		 * This queue contains only priorities between MIN and MAX
		 * realtime.  Use the whole queue to represent these values.
		 */
#define	TS_RQ_PPQ	(((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
		if ((flags & SRQ_BORROWING) == 0) {
			pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
			pri = (pri + tdq->tdq_idx) % RQ_NQS;
			/*
			 * This effectively shortens the queue by one so we
			 * can have a one slot difference between idx and
			 * ridx while we wait for threads to drain.
			 */
			if (tdq->tdq_ridx != tdq->tdq_idx &&
			    pri == tdq->tdq_ridx)
				pri = (pri - 1) % RQ_NQS;
		} else
			pri = tdq->tdq_ridx;
		runq_add_pri(ts->ts_runq, ts, pri, flags);
	} else
		runq_add(ts->ts_runq, ts, flags);
}

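/*
 * Informal sketch of the timeshare insertion above: the priority is scaled
 * to a queue index relative to PRI_MIN_TIMESHARE and then rotated by
 * tdq_idx, so the timeshare runq behaves as a circular "calendar" queue.
 * Lower-priority threads land farther ahead of the current removal index
 * tdq_ridx and therefore wait longer before they are reached.
 */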
static __inline void
tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
{
#ifdef SMP
	if (ts->ts_flags & TSF_XFERABLE) {
		tdq->tdq_transferable--;
		tdq->tdq_group->tdg_transferable--;
		ts->ts_flags &= ~TSF_XFERABLE;
		if (tdq->tdq_transferable < busy_thresh &&
		    (tdq->tdq_flags & TDQF_BUSY)) {
			atomic_clear_int(&tdq_busy, 1 << TDQ_ID(tdq));
			tdq->tdq_flags &= ~TDQF_BUSY;
		}
	}
#endif
	if (ts->ts_runq == &tdq->tdq_timeshare) {
		if (tdq->tdq_idx != tdq->tdq_ridx)
			runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx);
		else
			runq_remove_idx(ts->ts_runq, ts, NULL);
		/*
		 * For timeshare threads we update the priority here so
		 * the priority reflects the time we've been sleeping.
		 */
		ts->ts_ltick = ticks;
		sched_pctcpu_update(ts);
		sched_priority(ts->ts_thread);
	} else
		runq_remove(ts->ts_runq, ts);
}

static void
tdq_load_add(struct tdq *tdq, struct td_sched *ts)
{
	int class;
	mtx_assert(&sched_lock, MA_OWNED);
	class = PRI_BASE(ts->ts_thread->td_pri_class);
	tdq->tdq_load++;
	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
	if (class != PRI_ITHD &&
	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
		tdq->tdq_group->tdg_load++;
#else
		tdq->tdq_sysload++;
#endif
}

static void
tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
{
	int class;
	mtx_assert(&sched_lock, MA_OWNED);
	class = PRI_BASE(ts->ts_thread->td_pri_class);
	if (class != PRI_ITHD &&
	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
		tdq->tdq_group->tdg_load--;
#else
		tdq->tdq_sysload--;
#endif
	tdq->tdq_load--;
	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
	ts->ts_runq = NULL;
}

#ifdef SMP
static void
sched_smp_tick(struct thread *td)
{
	struct tdq *tdq;

	tdq = TDQ_SELF();
	if (rebalance) {
		if (ticks >= bal_tick)
			sched_balance();
		if (ticks >= gbal_tick && balance_groups)
			sched_balance_groups();
	}
	td->td_sched->ts_rltick = ticks;
}

/*
 * sched_balance is a simple CPU load balancing algorithm.  It operates by
 * finding the least loaded and most loaded cpu and equalizing their load
 * by migrating some processes.
 *
 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 * installations will only have 2 cpus.  Secondly, load balancing too much at
 * once can have an unpleasant effect on the system.  The scheduler rarely has
 * enough information to make perfect decisions.  So this algorithm chooses
 * simplicity and more gradual effects on load in larger systems.
 *
 * It could be improved by considering the priorities and slices assigned to
 * each task prior to balancing them.  There are many pathological cases with
 * any approach and so the semi random algorithm below may work as well as any.
 *
 */
static void
sched_balance(void)
{
	struct tdq_group *high;
	struct tdq_group *low;
	struct tdq_group *tdg;
	int cnt;
	int i;

	bal_tick = ticks + (random() % (hz * 2));
	if (smp_started == 0)
		return;
	low = high = NULL;
	i = random() % (tdg_maxid + 1);
	for (cnt = 0; cnt <= tdg_maxid; cnt++) {
		tdg = TDQ_GROUP(i);
		/*
		 * Find the CPU with the highest load that has some
		 * threads to transfer.
		 */
		if ((high == NULL || tdg->tdg_load > high->tdg_load)
		    && tdg->tdg_transferable)
			high = tdg;
		if (low == NULL || tdg->tdg_load < low->tdg_load)
			low = tdg;
		if (++i > tdg_maxid)
			i = 0;
	}
	if (low != NULL && high != NULL && high != low)
		sched_balance_pair(LIST_FIRST(&high->tdg_members),
		    LIST_FIRST(&low->tdg_members));
}

static void
sched_balance_groups(void)
{
	int i;

	gbal_tick = ticks + (random() % (hz * 2));
	mtx_assert(&sched_lock, MA_OWNED);
	if (smp_started)
		for (i = 0; i <= tdg_maxid; i++)
			sched_balance_group(TDQ_GROUP(i));
}

static void
sched_balance_group(struct tdq_group *tdg)
{
	struct tdq *tdq;
	struct tdq *high;
	struct tdq *low;
	int load;

	if (tdg->tdg_transferable == 0)
		return;
	low = NULL;
	high = NULL;
	LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
		load = tdq->tdq_load;
		if (high == NULL || load > high->tdq_load)
			high = tdq;
		if (low == NULL || load < low->tdq_load)
			low = tdq;
	}
	if (high != NULL && low != NULL && high != low)
		sched_balance_pair(high, low);
}

static void
sched_balance_pair(struct tdq *high, struct tdq *low)
{
	int transferable;
	int high_load;
	int low_load;
	int move;
	int diff;
	int i;

	/*
	 * If we're transferring within a group we have to use this specific
	 * tdq's transferable count, otherwise we can steal from other members
	 * of the group.
	 */
	if (high->tdq_group == low->tdq_group) {
		transferable = high->tdq_transferable;
		high_load = high->tdq_load;
		low_load = low->tdq_load;
	} else {
		transferable = high->tdq_group->tdg_transferable;
		high_load = high->tdq_group->tdg_load;
		low_load = low->tdq_group->tdg_load;
	}
	if (transferable == 0)
		return;
	/*
	 * Determine what the imbalance is and then adjust that to how many
	 * threads we actually have to give up (transferable).
	 */
	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;
	move = min(move, transferable);
	for (i = 0; i < move; i++)
		tdq_move(high, TDQ_ID(low));
	return;
}

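/*
 * Worked example of the pairwise balance above (illustrative numbers only):
 * with a high load of 7 and a low load of 2, diff is 5, so move becomes 3
 * (half the imbalance, rounded up), clamped to the transferable count, and
 * tdq_move() is then called that many times.
 */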
static void
tdq_move(struct tdq *from, int cpu)
{
	struct tdq *tdq;
	struct tdq *to;
	struct td_sched *ts;

	tdq = from;
	to = TDQ_CPU(cpu);
	ts = tdq_steal(tdq, 1);
	if (ts == NULL) {
		struct tdq_group *tdg;

		tdg = tdq->tdq_group;
		LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
			if (tdq == from || tdq->tdq_transferable == 0)
				continue;
			ts = tdq_steal(tdq, 1);
			break;
		}
		if (ts == NULL)
			panic("tdq_move: No threads available with a "
			    "transferable count of %d\n",
			    tdg->tdg_transferable);
	}
	if (tdq == to)
		return;
	sched_rem(ts->ts_thread);
	ts->ts_cpu = cpu;
	sched_pin_td(ts->ts_thread);
	sched_add(ts->ts_thread, SRQ_YIELDING);
	sched_unpin_td(ts->ts_thread);
}

static int
tdq_idled(struct tdq *tdq)
{
	struct tdq_group *tdg;
	struct tdq *steal;
	struct td_sched *ts;

	tdg = tdq->tdq_group;
	/*
	 * If we're in a cpu group, try and steal threads from another cpu in
	 * the group before idling.
	 */
	if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
		LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
			if (steal == tdq || steal->tdq_transferable == 0)
				continue;
			ts = tdq_steal(steal, 0);
			if (ts)
				goto steal;
		}
	}
	if (steal_busy) {
		while (tdq_busy) {
			int cpu;

			cpu = ffs(tdq_busy);
			if (cpu == 0)
				break;
			cpu--;
			steal = TDQ_CPU(cpu);
			if (steal->tdq_transferable == 0)
				continue;
			ts = tdq_steal(steal, 1);
			if (ts == NULL)
				continue;
			CTR5(KTR_SCHED,
			    "tdq_idled: stealing td %p(%s) pri %d from %d busy 0x%X",
			    ts->ts_thread, ts->ts_thread->td_proc->p_comm,
			    ts->ts_thread->td_priority, cpu, tdq_busy);
			goto steal;
		}
	}
	/*
	 * We only set the idled bit when all of the cpus in the group are
	 * idle.  Otherwise we could get into a situation where a thread bounces
	 * back and forth between two idle cores on separate physical CPUs.
	 */
	tdg->tdg_idlemask |= PCPU_GET(cpumask);
	if (tdg->tdg_idlemask == tdg->tdg_cpumask)
		atomic_set_int(&tdq_idle, tdg->tdg_mask);
	return (1);
steal:
	sched_rem(ts->ts_thread);
	ts->ts_cpu = PCPU_GET(cpuid);
	sched_pin_td(ts->ts_thread);
	sched_add(ts->ts_thread, SRQ_YIELDING);
	sched_unpin_td(ts->ts_thread);

	return (0);
}

static void
tdq_notify(struct td_sched *ts)
{
	struct thread *td;
	struct pcpu *pcpu;
	int prio;
	int cpu;

	prio = ts->ts_thread->td_priority;
	cpu = ts->ts_cpu;
	pcpu = pcpu_find(cpu);
	td = pcpu->pc_curthread;

	/*
	 * If our priority is not better than the current priority there is
	 * nothing to do.
	 */
	if (prio > td->td_priority)
		return;
	/* Always set NEEDRESCHED. */
	td->td_flags |= TDF_NEEDRESCHED;
	/*
	 * IPI if we exceed the threshold or if the target cpu is running an
	 * idle thread.
	 */
	if (prio > ipi_thresh && td->td_priority < PRI_MIN_IDLE)
		return;
	if (ipi_ast)
		ipi_selected(1 << cpu, IPI_AST);
	else if (ipi_preempt)
		ipi_selected(1 << cpu, IPI_PREEMPT);
}

static struct td_sched *
runq_steal(struct runq *rq)
{
	struct rqhead *rqh;
	struct rqbits *rqb;
	struct td_sched *ts;
	int word;
	int bit;

	mtx_assert(&sched_lock, MA_OWNED);
	rqb = &rq->rq_status;
	for (word = 0; word < RQB_LEN; word++) {
		if (rqb->rqb_bits[word] == 0)
			continue;
		for (bit = 0; bit < RQB_BPW; bit++) {
			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
				continue;
			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
			TAILQ_FOREACH(ts, rqh, ts_procq) {
				if (THREAD_CAN_MIGRATE(ts->ts_thread))
					return (ts);
			}
		}
	}
	return (NULL);
}

static struct td_sched *
tdq_steal(struct tdq *tdq, int stealidle)
{
	struct td_sched *ts;

	/*
	 * Steal from next first to try to get a non-interactive task that
	 * may not have run for a while.
	 * XXX Need to effect steal order for timeshare threads.
	 */
	if ((ts = runq_steal(&tdq->tdq_realtime)) != NULL)
		return (ts);
	if ((ts = runq_steal(&tdq->tdq_timeshare)) != NULL)
		return (ts);
	if (stealidle)
		return (runq_steal(&tdq->tdq_idle));
	return (NULL);
}

int
tdq_pickidle(struct tdq *tdq, struct td_sched *ts)
{
	struct tdq_group *tdg;
	int self;
	int cpu;

	self = PCPU_GET(cpuid);
	if (smp_started == 0)
		return (self);
	/*
	 * If the current CPU has idled, just run it here.
	 */
	if ((tdq->tdq_group->tdg_idlemask & PCPU_GET(cpumask)) != 0)
		return (self);
	/*
	 * Try the last group we ran on.
	 */
	tdg = TDQ_CPU(ts->ts_cpu)->tdq_group;
	cpu = ffs(tdg->tdg_idlemask);
	if (cpu)
		return (cpu - 1);
	/*
	 * Search for an idle group.
	 */
	cpu = ffs(tdq_idle);
	if (cpu)
		return (cpu - 1);
	/*
	 * XXX If there are no idle groups, check for an idle core.
	 */
	/*
	 * No idle CPUs?
	 */
	return (self);
}

static int
tdq_pickpri(struct tdq *tdq, struct td_sched *ts, int flags)
{
	struct pcpu *pcpu;
	int lowpri;
	int lowcpu;
	int lowload;
	int load;
	int self;
	int pri;
	int cpu;

	self = PCPU_GET(cpuid);
	if (smp_started == 0)
		return (self);

	pri = ts->ts_thread->td_priority;
	/*
	 * Regardless of affinity, if the last cpu is idle send it there.
	 */
	pcpu = pcpu_find(ts->ts_cpu);
	if (pcpu->pc_curthread->td_priority > PRI_MIN_IDLE) {
		CTR5(KTR_SCHED,
		    "ts_cpu %d idle, ltick %d ticks %d pri %d curthread %d",
		    ts->ts_cpu, ts->ts_rltick, ticks, pri,
		    pcpu->pc_curthread->td_priority);
		return (ts->ts_cpu);
	}
	/*
	 * If we have affinity, try to place it on the cpu we last ran on.
	 */
	if (SCHED_AFFINITY(ts) && pcpu->pc_curthread->td_priority > pri) {
		CTR5(KTR_SCHED,
		    "affinity for %d, ltick %d ticks %d pri %d curthread %d",
		    ts->ts_cpu, ts->ts_rltick, ticks, pri,
		    pcpu->pc_curthread->td_priority);
		return (ts->ts_cpu);
	}
	/*
	 * Try ourself first; if we're running something lower priority this
	 * may have some locality with the waking thread and execute faster
	 * here.
	 */
	if (tryself) {
		/*
		 * If we're being awoken by an interrupt thread or the waker
		 * is going right to sleep, run here as well.
		 */
		if ((TDQ_SELF()->tdq_load == 1) && (flags & SRQ_YIELDING ||
		    curthread->td_pri_class == PRI_ITHD)) {
			CTR2(KTR_SCHED, "tryself load %d flags %d",
			    TDQ_SELF()->tdq_load, flags);
			return (self);
		}
	}
	/*
	 * Look for an idle group.
	 */
	CTR1(KTR_SCHED, "tdq_idle %X", tdq_idle);
	cpu = ffs(tdq_idle);
	if (cpu)
		return (cpu - 1);
	if (tryselfidle && pri < curthread->td_priority) {
		CTR1(KTR_SCHED, "tryself %d",
		    curthread->td_priority);
		return (self);
	}
	/*
	 * Now search for the cpu running the lowest priority thread with
	 * the least load.
	 */
	lowload = 0;
	lowpri = lowcpu = 0;
	for (cpu = 0; cpu <= mp_maxid; cpu++) {
		if (CPU_ABSENT(cpu))
			continue;
		pcpu = pcpu_find(cpu);
		pri = pcpu->pc_curthread->td_priority;
		CTR4(KTR_SCHED,
		    "cpu %d pri %d lowcpu %d lowpri %d",
		    cpu, pri, lowcpu, lowpri);
		if (pri < lowpri)
			continue;
		load = TDQ_CPU(cpu)->tdq_load;
		if (lowpri && lowpri == pri && load > lowload)
			continue;
		lowpri = pri;
		lowcpu = cpu;
		lowload = load;
	}

	return (lowcpu);
}

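/*
 * Selection order used by tdq_pickpri() above, summarized for reference:
 * an idle last-ran cpu wins, then the last-ran cpu when affinity still holds
 * and it is running a lower-priority thread, then the current cpu when it is
 * lightly loaded, then any idle group, and finally the cpu running the
 * lowest-priority thread with the least load.
 */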
#endif	/* SMP */

/*
 * Pick the highest priority task we have and return it.
 */

static struct td_sched *
tdq_choose(struct tdq *tdq)
{
	struct td_sched *ts;

	mtx_assert(&sched_lock, MA_OWNED);

	ts = runq_choose(&tdq->tdq_realtime);
	if (ts != NULL) {
		KASSERT(ts->ts_thread->td_priority <= PRI_MAX_REALTIME,
		    ("tdq_choose: Invalid priority on realtime queue %d",
		    ts->ts_thread->td_priority));
		return (ts);
	}
	ts = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
	if (ts != NULL) {
		KASSERT(ts->ts_thread->td_priority <= PRI_MAX_TIMESHARE &&
		    ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE,
		    ("tdq_choose: Invalid priority on timeshare queue %d",
		    ts->ts_thread->td_priority));
		return (ts);
	}

	ts = runq_choose(&tdq->tdq_idle);
	if (ts != NULL) {
		KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE,
		    ("tdq_choose: Invalid priority on idle queue %d",
		    ts->ts_thread->td_priority));
		return (ts);
	}

	return (NULL);
}

static void
tdq_setup(struct tdq *tdq)
{
	runq_init(&tdq->tdq_realtime);
	runq_init(&tdq->tdq_timeshare);
	runq_init(&tdq->tdq_idle);
	tdq->tdq_load = 0;
}

static void
sched_setup(void *dummy)
{
#ifdef SMP
	int i;
#endif

	/*
	 * To avoid divide-by-zero, we set realstathz to a dummy value here
	 * in case sched_clock() is called before sched_initticks().
	 */
	realstathz = hz;
	sched_slice = (realstathz/7);	/* 140ms */
	tickincr = 1 << SCHED_TICK_SHIFT;

#ifdef SMP
	balance_groups = 0;
	/*
	 * Initialize the tdqs.
	 */
	for (i = 0; i < MAXCPU; i++) {
		struct tdq *tdq;

		tdq = &tdq_cpu[i];
		tdq_setup(&tdq_cpu[i]);
	}
	if (smp_topology == NULL) {
		struct tdq_group *tdg;
		struct tdq *tdq;
		int cpus;

		for (cpus = 0, i = 0; i < MAXCPU; i++) {
			if (CPU_ABSENT(i))
				continue;
			tdq = &tdq_cpu[i];
			tdg = &tdq_groups[cpus];
			/*
			 * Setup a tdq group with one member.
			 */
			tdq->tdq_transferable = 0;
			tdq->tdq_group = tdg;
			tdg->tdg_cpus = 1;
			tdg->tdg_idlemask = 0;
			tdg->tdg_cpumask = tdg->tdg_mask = 1 << i;
			tdg->tdg_load = 0;
			tdg->tdg_transferable = 0;
			LIST_INIT(&tdg->tdg_members);
			LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings);
			cpus++;
		}
		tdg_maxid = cpus - 1;
	} else {
		struct tdq_group *tdg;
		struct cpu_group *cg;
		int j;

		for (i = 0; i < smp_topology->ct_count; i++) {
			cg = &smp_topology->ct_group[i];
			tdg = &tdq_groups[i];
			/*
			 * Initialize the group.
			 */
			tdg->tdg_idlemask = 0;
			tdg->tdg_load = 0;
			tdg->tdg_transferable = 0;
			tdg->tdg_cpus = cg->cg_count;
			tdg->tdg_cpumask = cg->cg_mask;
			LIST_INIT(&tdg->tdg_members);
			/*
			 * Find all of the group members and add them.
			 */
			for (j = 0; j < MAXCPU; j++) {
				if ((cg->cg_mask & (1 << j)) != 0) {
					if (tdg->tdg_mask == 0)
						tdg->tdg_mask = 1 << j;
					tdq_cpu[j].tdq_transferable = 0;
					tdq_cpu[j].tdq_group = tdg;
					LIST_INSERT_HEAD(&tdg->tdg_members,
					    &tdq_cpu[j], tdq_siblings);
				}
			}
			if (tdg->tdg_cpus > 1)
				balance_groups = 1;
		}
		tdg_maxid = smp_topology->ct_count - 1;
	}
	/*
	 * Stagger the group and global load balancer so they do not
	 * interfere with each other.
	 */
	bal_tick = ticks + hz;
	if (balance_groups)
		gbal_tick = ticks + (hz / 2);
#else
	tdq_setup(TDQ_SELF());
#endif
	mtx_lock_spin(&sched_lock);
	tdq_load_add(TDQ_SELF(), &td_sched0);
	mtx_unlock_spin(&sched_lock);
}

/* ARGSUSED */
static void
sched_initticks(void *dummy)
{
	mtx_lock_spin(&sched_lock);
	realstathz = stathz ? stathz : hz;
	sched_slice = (realstathz/7);	/* ~140ms */

	/*
	 * tickincr is shifted out by 10 to avoid rounding errors due to
	 * hz not being evenly divisible by stathz on all platforms.
	 */
	tickincr = (hz << SCHED_TICK_SHIFT) / realstathz;
	/*
	 * This does not work for values of stathz that are more than
	 * 1 << SCHED_TICK_SHIFT * hz.  In practice this does not happen.
	 */
	if (tickincr == 0)
		tickincr = 1;
#ifdef SMP
	affinity = SCHED_AFFINITY_DEFAULT;
#endif
	mtx_unlock_spin(&sched_lock);
}

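/*
 * Worked example of the conversion above (illustrative values only): with
 * hz = 1000 and stathz = 128, tickincr = (1000 << 10) / 128 = 8000, i.e.
 * each stathz tick credits roughly 7.8 hz ticks once the SCHED_TICK_SHIFT
 * scaling is shifted back out, and sched_slice = 128 / 7 = 18 stathz ticks,
 * or about 140ms.
 */
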
/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct thread *td)
{
	int score;
	int pri;

	if (td->td_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * If the score is interactive we place the thread in the realtime
	 * queue with a priority that is less than kernel and interrupt
	 * priorities.  These threads are not subject to nice restrictions.
	 *
	 * Scores greater than this are placed on the normal realtime queue
	 * where the priority is partially decided by the most recent cpu
	 * utilization and the rest is decided by nice value.
	 */
	score = sched_interact_score(td);
	if (score < sched_interact) {
		pri = PRI_MIN_REALTIME;
		pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
		    * score;
		KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
		    ("sched_priority: invalid interactive priority %d", pri));
	} else {
		pri = SCHED_PRI_MIN;
		if (td->td_sched->ts_ticks)
			pri += SCHED_PRI_TICKS(td->td_sched);
		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
		if (!(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE)) {
			static int once = 1;
			if (once) {
				printf("sched_priority: invalid priority %d",
				    pri);
				printf("nice %d, ticks %d ftick %d ltick %d tick pri %d\n",
				    td->td_proc->p_nice,
				    td->td_sched->ts_ticks,
				    td->td_sched->ts_ftick,
				    td->td_sched->ts_ltick,
				    SCHED_PRI_TICKS(td->td_sched));
				once = 0;
			}
			pri = min(max(pri, PRI_MIN_TIMESHARE),
			    PRI_MAX_TIMESHARE);
		}
	}
	sched_user_prio(td, pri);

	return;
}

/*
 * This routine enforces a maximum limit on the amount of scheduling history
 * kept.  It is called after either the slptime or runtime is adjusted.
 */
static void
sched_interact_update(struct thread *td)
{
	struct td_sched *ts;
	int sum;

	ts = td->td_sched;
	sum = ts->skg_runtime + ts->skg_slptime;
	if (sum < SCHED_SLP_RUN_MAX)
		return;
	/*
	 * This only happens from two places:
	 * 1) We have added an unusual amount of run time from fork_exit.
	 * 2) We have added an unusual amount of sleep time from sched_sleep().
	 */
	if (sum > SCHED_SLP_RUN_MAX * 2) {
		if (ts->skg_runtime > ts->skg_slptime) {
			ts->skg_runtime = SCHED_SLP_RUN_MAX;
			ts->skg_slptime = 1;
		} else {
			ts->skg_slptime = SCHED_SLP_RUN_MAX;
			ts->skg_runtime = 1;
		}
		return;
	}
	/*
	 * If we have exceeded by more than 1/5th then the algorithm below
	 * will not bring us back into range.  Dividing by two here forces
	 * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX]
	 */
	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
		ts->skg_runtime /= 2;
		ts->skg_slptime /= 2;
		return;
	}
	ts->skg_runtime = (ts->skg_runtime / 5) * 4;
	ts->skg_slptime = (ts->skg_slptime / 5) * 4;
}

static void
sched_interact_fork(struct thread *td)
{
	int ratio;
	int sum;

	sum = td->td_sched->skg_runtime + td->td_sched->skg_slptime;
	if (sum > SCHED_SLP_RUN_FORK) {
		ratio = sum / SCHED_SLP_RUN_FORK;
		td->td_sched->skg_runtime /= ratio;
		td->td_sched->skg_slptime /= ratio;
	}
}

static int
sched_interact_score(struct thread *td)
{
	int div;

	if (td->td_sched->skg_runtime > td->td_sched->skg_slptime) {
		div = max(1, td->td_sched->skg_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (td->td_sched->skg_slptime / div)));
	} if (td->td_sched->skg_slptime > td->td_sched->skg_runtime) {
		div = max(1, td->td_sched->skg_slptime / SCHED_INTERACT_HALF);
		return (td->td_sched->skg_runtime / div);
	}

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);

}

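/*
 * Worked example of the score above (illustrative only): a thread that has
 * slept twice as long as it has run scores runtime / (slptime / 50) = 25,
 * below the default sched_interact threshold of 30, and so counts as
 * interactive; one that has run twice as long as it has slept scores
 * 100 - 25 = 75 and is treated as a cpu hog.
 */
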
/*
 * Called from proc0_init() to bootstrap the scheduler.
 */
void
schedinit(void)
{

	/*
	 * Set up the scheduler specific parts of proc0.
	 */
	proc0.p_sched = NULL; /* XXX */
	thread0.td_sched = &td_sched0;
	td_sched0.ts_ltick = ticks;
	td_sched0.ts_ftick = ticks;
	td_sched0.ts_thread = &thread0;
}

/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most sched_slice stathz ticks.
 */
int
sched_rr_interval(void)
{

	/* Convert sched_slice to hz */
	return (hz/(realstathz/sched_slice));
}

static void
sched_pctcpu_update(struct td_sched *ts)
{

	if (ts->ts_ticks == 0)
		return;
	if (ticks - (hz / 10) < ts->ts_ltick &&
	    SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX)
		return;
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */
	if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
		ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
		    SCHED_TICK_TARG;
	else
		ts->ts_ticks = 0;
	ts->ts_ltick = ticks;
	ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
}

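/*
 * Informal summary of the update above: ts_ftick and ts_ltick bound a
 * sliding window of at most SCHED_TICK_TARG hz ticks (roughly ten seconds'
 * worth), and the accumulated tick count is rescaled so that %CPU is always
 * computed over approximately that window rather than over the thread's
 * whole lifetime.
 */
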
static void
sched_thread_priority(struct thread *td, u_char prio)
{
	struct td_sched *ts;

	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, prio, curthread,
	    curthread->td_proc->p_comm);
	ts = td->td_sched;
	mtx_assert(&sched_lock, MA_OWNED);
	if (td->td_priority == prio)
		return;

	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
		/*
		 * If the priority has been elevated due to priority
		 * propagation, we may have to move ourselves to a new
		 * queue.  This could be optimized to not re-add in some
		 * cases.
		 */
		sched_rem(td);
		td->td_priority = prio;
		sched_add(td, SRQ_BORROWING);
	} else
		td->td_priority = prio;
}

/*
 * Update a thread's priority when it is lent another thread's
 * priority.
 */
void
sched_lend_prio(struct thread *td, u_char prio)
{

	td->td_flags |= TDF_BORROWING;
	sched_thread_priority(td, prio);
}

/*
 * Restore a thread's priority when priority propagation is
 * over.  The prio argument is the minimum priority the thread
 * needs to have to satisfy other possible priority lending
 * requests.  If the thread's regular priority is less
 * important than prio, the thread will keep a priority boost
 * of prio.
 */
void
sched_unlend_prio(struct thread *td, u_char prio)
{
	u_char base_pri;

	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
	    td->td_base_pri <= PRI_MAX_TIMESHARE)
		base_pri = td->td_user_pri;
	else
		base_pri = td->td_base_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_BORROWING;
		sched_thread_priority(td, base_pri);
	} else
		sched_lend_prio(td, prio);
}

void
sched_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	/* First, update the base priority. */
	td->td_base_pri = prio;

	/*
	 * If the thread is borrowing another thread's priority, don't
	 * ever lower the priority.
	 */
	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
		return;

	/* Change the real priority. */
	oldprio = td->td_priority;
	sched_thread_priority(td, prio);

	/*
	 * If the thread is on a turnstile, then let the turnstile update
	 * its state.
	 */
	if (TD_ON_LOCK(td) && oldprio != prio)
		turnstile_adjust(td, oldprio);
}

void
sched_user_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	td->td_base_user_pri = prio;
	if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
		return;
	oldprio = td->td_user_pri;
	td->td_user_pri = prio;

	if (TD_ON_UPILOCK(td) && oldprio != prio)
		umtx_pi_adjust(td, oldprio);
}

void
sched_lend_user_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	td->td_flags |= TDF_UBORROWING;

	oldprio = td->td_user_pri;
	td->td_user_pri = prio;

	if (TD_ON_UPILOCK(td) && oldprio != prio)
		umtx_pi_adjust(td, oldprio);
}

void
sched_unlend_user_prio(struct thread *td, u_char prio)
{
	u_char base_pri;

	base_pri = td->td_base_user_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_UBORROWING;
		sched_user_prio(td, base_pri);
	} else
		sched_lend_user_prio(td, prio);
}

void
sched_switch(struct thread *td, struct thread *newtd, int flags)
{
	struct tdq *tdq;
	struct td_sched *ts;
	int preempt;

	mtx_assert(&sched_lock, MA_OWNED);

	preempt = flags & SW_PREEMPT;
	tdq = TDQ_SELF();
	ts = td->td_sched;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;
	td->td_owepreempt = 0;
	/*
	 * If the thread has been assigned it may be in the process of switching
	 * to the new cpu.  This is the case in sched_bind().
	 */
	if (td == PCPU_GET(idlethread)) {
		TD_SET_CAN_RUN(td);
	} else {
		tdq_load_rem(tdq, ts);
		if (TD_IS_RUNNING(td)) {
			/*
			 * Don't allow the thread to migrate
			 * from a preemption.
			 */
			if (preempt)
				sched_pin_td(td);
			sched_add(td, preempt ?
			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
			    SRQ_OURSELF|SRQ_YIELDING);
			if (preempt)
				sched_unpin_td(td);
		}
	}
	if (newtd != NULL) {
		/*
		 * If we bring in a thread account for it as if it had been
		 * added to the run queue and then chosen.
		 */
		newtd->td_sched->ts_flags |= TSF_DIDRUN;
		TD_SET_RUNNING(newtd);
		tdq_load_add(TDQ_SELF(), newtd->td_sched);
	} else
		newtd = choosethread();
	if (td != newtd) {
#ifdef HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif

		cpu_switch(td, newtd);
#ifdef HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
#endif
	}
	sched_lock.mtx_lock = (uintptr_t)td;
	td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_nice(struct proc *p, int nice)
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);

	p->p_nice = nice;
	FOREACH_THREAD_IN_PROC(p, td) {
		sched_priority(td);
		sched_prio(td, td->td_base_user_pri);
	}
}

void
sched_sleep(struct thread *td)
{

	mtx_assert(&sched_lock, MA_OWNED);

	td->td_sched->ts_slptime = ticks;
}

void
sched_wakeup(struct thread *td)
{
	int slptime;

	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * If we slept for more than a tick update our interactivity and
	 * priority.
	 */
	slptime = td->td_sched->ts_slptime;
	td->td_sched->ts_slptime = 0;
	if (slptime && slptime != ticks) {
		int hzticks;

		hzticks = (ticks - slptime) << SCHED_TICK_SHIFT;
		td->td_sched->skg_slptime += hzticks;
		sched_interact_update(td);
		sched_pctcpu_update(td->td_sched);
		sched_priority(td);
	}
	sched_add(td, SRQ_BORING);
}

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct thread *td, struct thread *child)
{
	mtx_assert(&sched_lock, MA_OWNED);
	sched_fork_thread(td, child);
	/*
	 * Penalize the parent and child for forking.
	 */
	sched_interact_fork(child);
	sched_priority(child);
	td->td_sched->skg_runtime += tickincr;
	sched_interact_update(td);
	sched_priority(td);
}

void
sched_fork_thread(struct thread *td, struct thread *child)
{
	struct td_sched *ts;
	struct td_sched *ts2;

	/*
	 * Initialize child.
	 */
	sched_newthread(child);
	ts = td->td_sched;
	ts2 = child->td_sched;
	ts2->ts_cpu = ts->ts_cpu;
	ts2->ts_runq = NULL;
	/*
	 * Grab our parent's cpu estimation information and priority.
	 */
	ts2->ts_ticks = ts->ts_ticks;
	ts2->ts_ltick = ts->ts_ltick;
	ts2->ts_ftick = ts->ts_ftick;
	child->td_user_pri = td->td_user_pri;
	child->td_base_user_pri = td->td_base_user_pri;
	/*
	 * And update interactivity score.
	 */
	ts2->skg_slptime = ts->skg_slptime;
	ts2->skg_runtime = ts->skg_runtime;
	ts2->ts_slice = 1;	/* Attempt to quickly learn interactivity. */
}

void
sched_class(struct thread *td, int class)
{

	mtx_assert(&sched_lock, MA_OWNED);
	if (td->td_pri_class == class)
		return;

#ifdef SMP
	/*
	 * On SMP if we're on the RUNQ we must adjust the transferable
	 * count because it could be changing to or from an interrupt
	 * class.
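	 * The count is dropped while the old class is still in effect and
	 * re-added after the class changes, so tdq_transferable and
	 * tdg_transferable stay consistent.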
	 */
	if (TD_ON_RUNQ(td)) {
		struct tdq *tdq;

		tdq = TDQ_CPU(td->td_sched->ts_cpu);
		if (THREAD_CAN_MIGRATE(td)) {
			tdq->tdq_transferable--;
			tdq->tdq_group->tdg_transferable--;
		}
		td->td_pri_class = class;
		if (THREAD_CAN_MIGRATE(td)) {
			tdq->tdq_transferable++;
			tdq->tdq_group->tdg_transferable++;
		}
	}
#endif
	td->td_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct thread *child)
{
	struct thread *td;

	CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
	    child, child->td_proc->p_comm, child->td_priority);

	td = FIRST_THREAD_IN_PROC(p);
	sched_exit_thread(td, child);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{

	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
	    child, child->td_proc->p_comm, child->td_priority);

	tdq_load_rem(TDQ_CPU(child->td_sched->ts_cpu), child->td_sched);
#ifdef KSE
	/*
	 * KSE forks and exits so often that this penalty causes short-lived
	 * threads to always be non-interactive.  This causes mozilla to
	 * crawl under load.
	 */
	if ((td->td_pflags & TDP_SA) && td->td_proc == child->td_proc)
		return;
#endif
	/*
	 * Give the child's runtime to the parent without returning the
	 * sleep time as a penalty to the parent.  This causes shells that
	 * launch expensive things to mark their children as expensive.
	 */
	td->td_sched->skg_runtime += child->td_sched->skg_runtime;
	sched_interact_update(td);
	sched_priority(td);
}

void
sched_userret(struct thread *td)
{
	/*
	 * XXX we cheat slightly on the locking here to avoid locking in
	 * the usual case.  Setting td_priority here is essentially an
	 * incomplete workaround for not setting it properly elsewhere.
	 * Now that some interrupt handlers are threads, not setting it
	 * properly elsewhere can clobber it in the window between setting
	 * it here and returning to user mode, so don't waste time setting
	 * it perfectly here.
	 */
	KASSERT((td->td_flags & TDF_BORROWING) == 0,
	    ("thread with borrowed priority returning to userland"));
	if (td->td_priority != td->td_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = td->td_user_pri;
		td->td_base_pri = td->td_user_pri;
		mtx_unlock_spin(&sched_lock);
	}
}

void
sched_clock(struct thread *td)
{
	struct tdq *tdq;
	struct td_sched *ts;

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP
	sched_smp_tick(td);
#endif
	tdq = TDQ_SELF();
	/*
	 * Advance the insert index once for each tick to ensure that all
	 * threads get a chance to run.
	 */
	if (tdq->tdq_idx == tdq->tdq_ridx) {
		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
			tdq->tdq_ridx = tdq->tdq_idx;
	}
	ts = td->td_sched;
	/*
	 * We only run the slicing code for TIMESHARE threads.
	 */
	if (td->td_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * We used a tick; charge it to the thread so that we can compute our
	 * interactivity.
	 */
	td->td_sched->skg_runtime += tickincr;
	sched_interact_update(td);
	/*
	 * We used up one time slice.
	 */
	if (--ts->ts_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
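	 * sched_priority() derives a new priority from the updated
	 * interactivity score and TDF_NEEDRESCHED requests a context
	 * switch at the next opportunity.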
	 */
	sched_priority(td);
	td->td_flags |= TDF_NEEDRESCHED;
}

int
sched_runnable(void)
{
	struct tdq *tdq;
	int load;

	load = 1;

	tdq = TDQ_SELF();
#ifdef SMP
	if (tdq_busy)
		goto out;
#endif
	if ((curthread->td_flags & TDF_IDLETD) != 0) {
		if (tdq->tdq_load > 0)
			goto out;
	} else
		if (tdq->tdq_load - 1 > 0)
			goto out;
	load = 0;
out:
	return (load);
}

struct thread *
sched_choose(void)
{
	struct tdq *tdq;
	struct td_sched *ts;

	mtx_assert(&sched_lock, MA_OWNED);
	tdq = TDQ_SELF();
#ifdef SMP
restart:
#endif
	ts = tdq_choose(tdq);
	if (ts) {
#ifdef SMP
		if (ts->ts_thread->td_priority > PRI_MIN_IDLE)
			if (tdq_idled(tdq) == 0)
				goto restart;
#endif
		tdq_runq_rem(tdq, ts);
		return (ts->ts_thread);
	}
#ifdef SMP
	if (tdq_idled(tdq) == 0)
		goto restart;
#endif
	return (PCPU_GET(idlethread));
}

static int
sched_preempt(struct thread *td)
{
	struct thread *ctd;
	int cpri;
	int pri;

	ctd = curthread;
	pri = td->td_priority;
	cpri = ctd->td_priority;
	if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
		return (0);
	/*
	 * Always preempt IDLE threads.  Otherwise only if the preempting
	 * thread is an ithread.
	 */
	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
		return (0);
	if (ctd->td_critnest > 1) {
		CTR1(KTR_PROC, "sched_preempt: in critical section %d",
		    ctd->td_critnest);
		ctd->td_owepreempt = 1;
		return (0);
	}
	/*
	 * Thread is runnable but not yet put on system run queue.
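	 * Our caller (sched_add()) has already marked it TD_ON_RUNQ but has
	 * not yet placed it on a run queue, so we can switch to it directly.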
	 */
	MPASS(TD_ON_RUNQ(td));
	TD_SET_RUNNING(td);
	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
	    td->td_proc->p_pid, td->td_proc->p_comm);
	mi_switch(SW_INVOL|SW_PREEMPT, td);
	return (1);
}

void
sched_add(struct thread *td, int flags)
{
	struct tdq *tdq;
	struct td_sched *ts;
	int preemptive;
	int class;
#ifdef SMP
	int cpuid;
	int cpumask;
#endif
	ts = td->td_sched;

	mtx_assert(&sched_lock, MA_OWNED);
	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, curthread,
	    curthread->td_proc->p_comm);
	KASSERT((td->td_inhibitors == 0),
	    ("sched_add: trying to run inhibited thread"));
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("sched_add: bad thread state"));
	KASSERT(td->td_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	KASSERT(ts->ts_runq == NULL,
	    ("sched_add: thread %p is still assigned to a run queue", td));
	TD_SET_RUNQ(td);
	tdq = TDQ_SELF();
	class = PRI_BASE(td->td_pri_class);
	preemptive = !(flags & SRQ_YIELDING);
	/*
	 * Recalculate the priority before we select the target cpu or
	 * run-queue.
	 */
	if (class == PRI_TIMESHARE)
		sched_priority(td);
	if (ts->ts_slice == 0)
		ts->ts_slice = sched_slice;
#ifdef SMP
	cpuid = PCPU_GET(cpuid);
	/*
	 * Pick the destination cpu and, if it isn't ours, transfer to the
	 * target cpu.
	 */
	if (THREAD_CAN_MIGRATE(td)) {
		if (td->td_priority <= PRI_MAX_ITHD) {
			CTR2(KTR_SCHED, "ithd %d < %d", td->td_priority, PRI_MAX_ITHD);
			ts->ts_cpu = cpuid;
		}
		if (pick_pri)
			ts->ts_cpu = tdq_pickpri(tdq, ts, flags);
		else
			ts->ts_cpu = tdq_pickidle(tdq, ts);
	} else
		CTR1(KTR_SCHED, "pinned %d", td->td_pinned);
	if (ts->ts_cpu != cpuid)
		preemptive = 0;
	tdq = TDQ_CPU(ts->ts_cpu);
	cpumask = 1 << ts->ts_cpu;
	/*
	 * If we had been idle, clear our bit in the group and potentially
	 * the global bitmap.
	 */
	if ((class != PRI_IDLE && class != PRI_ITHD) &&
	    (tdq->tdq_group->tdg_idlemask & cpumask) != 0) {
		/*
		 * Check to see if our group is unidling, and if so, remove it
		 * from the global idle mask.
		 */
		if (tdq->tdq_group->tdg_idlemask ==
		    tdq->tdq_group->tdg_cpumask)
			atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
		/*
		 * Now remove ourselves from the group specific idle mask.
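		 * Once this cpu has work queued it must no longer be
		 * advertised as idle to the cpu selection code.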
		 */
		tdq->tdq_group->tdg_idlemask &= ~cpumask;
	}
#endif
	/*
	 * Pick the run queue based on priority.
	 */
	if (td->td_priority <= PRI_MAX_REALTIME)
		ts->ts_runq = &tdq->tdq_realtime;
	else if (td->td_priority <= PRI_MAX_TIMESHARE)
		ts->ts_runq = &tdq->tdq_timeshare;
	else
		ts->ts_runq = &tdq->tdq_idle;
	if (preemptive && sched_preempt(td))
		return;
	tdq_runq_add(tdq, ts, flags);
	tdq_load_add(tdq, ts);
#ifdef SMP
	if (ts->ts_cpu != cpuid) {
		tdq_notify(ts);
		return;
	}
#endif
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
}

void
sched_rem(struct thread *td)
{
	struct tdq *tdq;
	struct td_sched *ts;

	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, curthread,
	    curthread->td_proc->p_comm);
	mtx_assert(&sched_lock, MA_OWNED);
	ts = td->td_sched;
	KASSERT(TD_ON_RUNQ(td),
	    ("sched_rem: thread not on run queue"));

	tdq = TDQ_CPU(ts->ts_cpu);
	tdq_runq_rem(tdq, ts);
	tdq_load_rem(tdq, ts);
	TD_SET_CAN_RUN(td);
}

fixpt_t
sched_pctcpu(struct thread *td)
{
	fixpt_t pctcpu;
	struct td_sched *ts;

	pctcpu = 0;
	ts = td->td_sched;
	if (ts == NULL)
		return (0);

	mtx_lock_spin(&sched_lock);
	if (ts->ts_ticks) {
		int rtick;

		sched_pctcpu_update(ts);
		/* How many rticks per second? */
		rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
		pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
	}
	td->td_proc->p_swtime = ts->ts_ltick - ts->ts_ftick;
	mtx_unlock_spin(&sched_lock);

	return (pctcpu);
}

void
sched_bind(struct thread *td, int cpu)
{
	struct td_sched *ts;

	mtx_assert(&sched_lock, MA_OWNED);
	ts = td->td_sched;
	if (ts->ts_flags & TSF_BOUND)
		sched_unbind(td);
	ts->ts_flags |= TSF_BOUND;
#ifdef SMP
	sched_pin();
	if (PCPU_GET(cpuid) == cpu)
		return;
	ts->ts_cpu = cpu;
	/* When we return from mi_switch we'll be on the correct cpu. */
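	/*
	 * The switch below requeues us via sched_add(); because we are
	 * pinned it keeps the ts_cpu set above and notifies the target cpu.
	 */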
	mi_switch(SW_VOL, NULL);
#endif
}

void
sched_unbind(struct thread *td)
{
	struct td_sched *ts;

	mtx_assert(&sched_lock, MA_OWNED);
	ts = td->td_sched;
	if ((ts->ts_flags & TSF_BOUND) == 0)
		return;
	ts->ts_flags &= ~TSF_BOUND;
#ifdef SMP
	sched_unpin();
#endif
}

int
sched_is_bound(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	return (td->td_sched->ts_flags & TSF_BOUND);
}

void
sched_relinquish(struct thread *td)
{
	mtx_lock_spin(&sched_lock);
	if (td->td_pri_class == PRI_TIMESHARE)
		sched_prio(td, PRI_MAX_TIMESHARE);
	mi_switch(SW_VOL, NULL);
	mtx_unlock_spin(&sched_lock);
}

int
sched_load(void)
{
#ifdef SMP
	int total;
	int i;

	total = 0;
	for (i = 0; i <= tdg_maxid; i++)
		total += TDQ_GROUP(i)->tdg_load;
	return (total);
#else
	return (TDQ_SELF()->tdq_sysload);
#endif
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}

void
sched_tick(void)
{
	struct td_sched *ts;

	ts = curthread->td_sched;
	/* Adjust ticks for pctcpu */
	ts->ts_ticks += 1 << SCHED_TICK_SHIFT;
	ts->ts_ltick = ticks;
	/*
	 * Update if we've exceeded our desired tick threshold by over one
	 * second.
	 */
	if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick)
		sched_pctcpu_update(ts);
}

/*
 * The actual idle process.
 */
void
sched_idletd(void *dummy)
{
	struct proc *p;
	struct thread *td;

	td = curthread;
	p = td->td_proc;
	mtx_assert(&Giant, MA_NOTOWNED);
	/* ULE relies on preemption for idle interruption. */
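	/*
	 * The loop below never polls the run queues; sched_add() on this
	 * cpu or tdq_notify() from a remote cpu preempts the idle thread
	 * when work becomes available.
	 */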
	for (;;)
		cpu_idle();
}

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
    "Scheduler name");
SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, tickincr, CTLFLAG_RD, &tickincr, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, realstathz, CTLFLAG_RD, &realstathz, 0, "");
#ifdef SMP
SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_affinity, CTLFLAG_RW,
    &affinity, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_tryself, CTLFLAG_RW,
    &tryself, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_tryselfidle, CTLFLAG_RW,
    &tryselfidle, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, ipi_preempt, CTLFLAG_RW, &ipi_preempt, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, ipi_ast, CTLFLAG_RW, &ipi_ast, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, ipi_thresh, CTLFLAG_RW, &ipi_thresh, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_busy, CTLFLAG_RW, &steal_busy, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, busy_thresh, CTLFLAG_RW, &busy_thresh, 0, "");
#endif

/* ps compat */
static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");


#define KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"
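
/*
 * Illustrative usage note (comment only, no effect on the build): the knobs
 * declared above appear under the kern.sched sysctl tree, e.g.
 *
 *	# sysctl kern.sched.name
 *	kern.sched.name: ule
 *	# sysctl kern.sched.slice
 *
 * Entries created with CTLFLAG_RD above are read-only; the CTLFLAG_RW ones
 * may be tuned at run time.
 */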