sched_ule.c revision 131481
/*-
 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 131481 2004-07-02 20:21:44Z jhb $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>
#include <machine/smp.h>

#define	KTR_ULE	KTR_NFS

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");

#define	ULE_NAME	"ule"
#define	ULE_NAME_LEN	3
SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, ULE_NAME, ULE_NAME_LEN,
    "System is using the ULE scheduler");

static int slice_min = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");

static int slice_max = 10;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");

int realstathz;
int tickincr = 1;

/*
 * These data structures are allocated within their parent data structure but
 * are scheduler specific.
 */

struct ke_sched {
	int		ske_slice;
	struct runq	*ske_runq;
	/* The following variables are only used for pctcpu calculation */
	int		ske_ltick;	/* Last tick that we were running on */
	int		ske_ftick;	/* First tick that we were running on */
	int		ske_ticks;	/* Tick count */
	/* CPU that we have affinity for. */
	u_char		ske_cpu;
};
#define	ke_slice	ke_sched->ske_slice
#define	ke_runq		ke_sched->ske_runq
#define	ke_ltick	ke_sched->ske_ltick
#define	ke_ftick	ke_sched->ske_ftick
#define	ke_ticks	ke_sched->ske_ticks
#define	ke_cpu		ke_sched->ske_cpu
#define	ke_assign	ke_procq.tqe_next

#define	KEF_ASSIGNED	KEF_SCHED0	/* KSE is being migrated. */
#define	KEF_BOUND	KEF_SCHED1	/* KSE can not migrate. */

struct kg_sched {
	int	skg_slptime;		/* Number of ticks we vol. slept */
	int	skg_runtime;		/* Number of ticks we were running */
};
#define	kg_slptime	kg_sched->skg_slptime
#define	kg_runtime	kg_sched->skg_runtime

struct td_sched {
	int	std_slptime;
};
#define	td_slptime	td_sched->std_slptime

struct td_sched td_sched;
struct ke_sched ke_sched;
struct kg_sched kg_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;

/*
 * The priority is primarily determined by the interactivity score.  Thus, we
 * give lower (better) priorities to kse groups that use less CPU.  The nice
 * value is then directly added to this to allow nice to have some effect
 * on latency.
 *
 * PRI_RANGE:	Total priority range for timeshare threads.
 * PRI_NRESV:	Number of nice values.
 * PRI_BASE:	The start of the dynamic range.
 */
#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	SCHED_PRI_NRESV		((PRIO_MAX - PRIO_MIN) + 1)
#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
#define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
#define	SCHED_PRI_INTERACT(score)					\
    ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
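
/*
 * A worked example of the mapping above, assuming the stock priority range
 * of this era (PRI_MIN_TIMESHARE 160, PRI_MAX_TIMESHARE 223; those values
 * live in sys/priority.h, not here): SCHED_PRI_RANGE is then 64, so
 * SCHED_PRI_INTERACT() turns a perfect score of 0 into an offset of 0, the
 * SCHED_INTERACT_THRESH score of 30 into 19, and the worst score of 100
 * into 64.  sched_priority() adds SCHED_PRI_BASE and p_nice to this offset
 * and clamps the result back into the timeshare range.
 */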

/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 5) << 10)
#define	SCHED_SLP_RUN_FORK	((hz / 2) << 10)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(30)

/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:	Determines the amount of slice granted to a scaled nice.
 * SLICE_NTHRESH:	The nice cutoff point for slice assignment.
 */
#define	SCHED_SLICE_MIN			(slice_min)
#define	SCHED_SLICE_MAX			(slice_max)
#define	SCHED_SLICE_INTERACTIVE		(slice_max)
#define	SCHED_SLICE_NTHRESH		(SCHED_PRI_NHALF - 1)
#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_SLICE_NICE(nice)						\
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))
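
/*
 * Illustrative numbers only, not normative: with hz = 1000 the defaults set
 * in sched_setup() below are slice_min = 10 and slice_max = 142 ticks,
 * giving SCHED_SLICE_RANGE = 133 and SCHED_SLICE_NTHRESH = 19.
 * SCHED_SLICE_NICE() then grants a nice offset of 0 the full 142 ticks, an
 * offset of 10 gets 142 - (10 * 133) / 19 = 72 ticks, and the last offset
 * inside the window, 19, gets 9 ticks (integer division), effectively
 * SCHED_SLICE_MIN.
 */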

/*
 * This macro determines whether or not the kse belongs on the current or
 * next run queue.
 */
#define	SCHED_INTERACTIVE(kg)						\
    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
#define	SCHED_CURR(kg, ke)						\
    (ke->ke_thread->td_priority < kg->kg_user_pri ||			\
    SCHED_INTERACTIVE(kg))

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */

#define	SCHED_CPU_TIME	10
#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)

/*
 * kseq - per processor runqs and statistics.
 */
struct kseq {
	struct runq	ksq_idle;		/* Queue of IDLE threads. */
	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
	struct runq	*ksq_next;		/* Next timeshare queue. */
	struct runq	*ksq_curr;		/* Current queue. */
	int		ksq_load_timeshare;	/* Load for timeshare. */
	int		ksq_load;		/* Aggregate load. */
	short		ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
	short		ksq_nicemin;		/* Least nice. */
#ifdef SMP
	int			ksq_transferable;
	LIST_ENTRY(kseq)	ksq_siblings;	/* Next in kseq group. */
	struct kseq_group	*ksq_group;	/* Our processor group. */
	volatile struct kse	*ksq_assigned;	/* assigned by another CPU. */
#else
	int		ksq_sysload;		/* For loadavg, !ITHD load. */
#endif
};

#ifdef SMP
/*
 * kseq groups are groups of processors which can cheaply share threads.  When
 * one processor in the group goes idle it will check the runqs of the other
 * processors in its group prior to halting and waiting for an interrupt.
 * These groups are suitable for SMT (Symmetric Multi-Threading) and not NUMA.
 * In a NUMA environment we'd want an idle bitmap per group and a two-tiered
 * load balancer.
 */
struct kseq_group {
	int	ksg_cpus;		/* Count of CPUs in this kseq group. */
	cpumask_t ksg_cpumask;		/* Mask of cpus in this group. */
	cpumask_t ksg_idlemask;		/* Idle cpus in this group. */
	cpumask_t ksg_mask;		/* Bit mask for first cpu. */
	int	ksg_load;		/* Total load of this group. */
	int	ksg_transferable;	/* Transferable load of this group. */
	LIST_HEAD(, kseq)	ksg_members; /* Linked list of all members. */
};
#endif

/*
 * One kse queue per processor.
 */
#ifdef SMP
static cpumask_t kseq_idle;
static int ksg_maxid;
static struct kseq	kseq_cpu[MAXCPU];
static struct kseq_group kseq_groups[MAXCPU];
static int bal_tick;
static int gbal_tick;

#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
#define	KSEQ_ID(x)	((x) - kseq_cpu)
#define	KSEQ_GROUP(x)	(&kseq_groups[(x)])
#else	/* !SMP */
static struct kseq	kseq_cpu;

#define	KSEQ_SELF()	(&kseq_cpu)
#define	KSEQ_CPU(x)	(&kseq_cpu)
#endif

static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
static void sched_interact_update(struct ksegrp *kg);
static void sched_interact_fork(struct ksegrp *kg);
static void sched_pctcpu_update(struct kse *ke);

/* Operations on per processor queues */
static struct kse * kseq_choose(struct kseq *kseq);
static void kseq_setup(struct kseq *kseq);
static void kseq_load_add(struct kseq *kseq, struct kse *ke);
static void kseq_load_rem(struct kseq *kseq, struct kse *ke);
static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke);
static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke);
static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class);
static struct kse *runq_steal(struct runq *rq);
static void sched_balance(void);
static void sched_balance_groups(void);
static void sched_balance_group(struct kseq_group *ksg);
static void sched_balance_pair(struct kseq *high, struct kseq *low);
static void kseq_move(struct kseq *from, int cpu);
static int kseq_idled(struct kseq *kseq);
static void kseq_notify(struct kse *ke, int cpu);
static void kseq_assign(struct kseq *);
static struct kse *kseq_steal(struct kseq *kseq, int stealidle);
/*
 * On P4 Xeons the round-robin interrupt delivery is broken.  As a result of
 * this, we can't pin interrupts to the cpu that they were delivered to,
 * otherwise all ithreads only run on CPU 0.
 */
#ifdef __i386__
#define	KSE_CAN_MIGRATE(ke, class)					\
    ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
#else /* !__i386__ */
#define	KSE_CAN_MIGRATE(ke, class)					\
    ((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 &&		\
    ((ke)->ke_flags & KEF_BOUND) == 0)
#endif /* !__i386__ */
#endif

void
kseq_print(int cpu)
{
	struct kseq *kseq;
	int i;

	kseq = KSEQ_CPU(cpu);

	printf("kseq:\n");
	printf("\tload: %d\n", kseq->ksq_load);
	printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare);
#ifdef SMP
	printf("\tload transferable: %d\n", kseq->ksq_transferable);
#endif
	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
	printf("\tnice counts:\n");
	for (i = 0; i < SCHED_PRI_NRESV; i++)
		if (kseq->ksq_nice[i])
			printf("\t\t%d = %d\n",
			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
}

static __inline void
kseq_runq_add(struct kseq *kseq, struct kse *ke)
{
#ifdef SMP
	if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) {
		kseq->ksq_transferable++;
		kseq->ksq_group->ksg_transferable++;
	}
#endif
	runq_add(ke->ke_runq, ke);
}

static __inline void
kseq_runq_rem(struct kseq *kseq, struct kse *ke)
{
#ifdef SMP
	if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) {
		kseq->ksq_transferable--;
		kseq->ksq_group->ksg_transferable--;
	}
#endif
	runq_remove(ke->ke_runq, ke);
}

static void
kseq_load_add(struct kseq *kseq, struct kse *ke)
{
	int class;

	mtx_assert(&sched_lock, MA_OWNED);
	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
	if (class == PRI_TIMESHARE)
		kseq->ksq_load_timeshare++;
	kseq->ksq_load++;
	if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
		kseq->ksq_group->ksg_load++;
#else
		kseq->ksq_sysload++;
#endif
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		CTR6(KTR_ULE,
		    "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
		    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
		    ke->ke_proc->p_nice, kseq->ksq_nicemin);
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_add(kseq, ke->ke_proc->p_nice);
}

static void
kseq_load_rem(struct kseq *kseq, struct kse *ke)
{
	int class;

	mtx_assert(&sched_lock, MA_OWNED);
	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
	if (class == PRI_TIMESHARE)
		kseq->ksq_load_timeshare--;
	if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
		kseq->ksq_group->ksg_load--;
#else
		kseq->ksq_sysload--;
#endif
	kseq->ksq_load--;
	ke->ke_runq = NULL;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_rem(kseq, ke->ke_proc->p_nice);
}

static void
kseq_nice_add(struct kseq *kseq, int nice)
{
	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
	if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1)
		kseq->ksq_nicemin = nice;
}

static void
kseq_nice_rem(struct kseq *kseq, int nice)
{
	int n;

	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	n = nice + SCHED_PRI_NHALF;
	kseq->ksq_nice[n]--;
	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));

	/*
	 * If this wasn't the smallest nice value or there are more in
	 * this bucket we can just return.  Otherwise we have to recalculate
	 * the smallest nice.
	 */
	if (nice != kseq->ksq_nicemin ||
	    kseq->ksq_nice[n] != 0 ||
	    kseq->ksq_load_timeshare == 0)
		return;

	for (; n < SCHED_PRI_NRESV; n++)
		if (kseq->ksq_nice[n]) {
			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
			return;
		}
}
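
/*
 * The bucket index used above is simply the nice value biased to be
 * non-negative: with PRIO_MIN = -20 and PRIO_MAX = 20, SCHED_PRI_NHALF is
 * 20, so nice -20 lands in ksq_nice[0], nice 0 in ksq_nice[20], and nice
 * +20 in ksq_nice[40].  ksq_nicemin tracks the smallest occupied value so
 * that sched_slice() can scale slices against the least nice timeshare
 * kseg on this cpu.
 */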

#ifdef SMP
/*
 * sched_balance is a simple CPU load balancing algorithm.  It operates by
 * finding the least loaded and most loaded cpu and equalizing their load
 * by migrating some processes.
 *
 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 * installations will only have 2 cpus.  Secondly, load balancing too much at
 * once can have an unpleasant effect on the system.  The scheduler rarely has
 * enough information to make perfect decisions.  So this algorithm chooses
 * simplicity and more gradual effects on load in larger systems.
 *
 * It could be improved by considering the priorities and slices assigned to
 * each task prior to balancing them.  There are many pathological cases with
 * any approach and so the semi-random algorithm below may work as well as any.
 *
 */
static void
sched_balance(void)
{
	struct kseq_group *high;
	struct kseq_group *low;
	struct kseq_group *ksg;
	int cnt;
	int i;

	if (smp_started == 0)
		goto out;
	low = high = NULL;
	i = random() % (ksg_maxid + 1);
	for (cnt = 0; cnt <= ksg_maxid; cnt++) {
		ksg = KSEQ_GROUP(i);
		/*
		 * Find the CPU with the highest load that has some
		 * threads to transfer.
		 */
		if ((high == NULL || ksg->ksg_load > high->ksg_load)
		    && ksg->ksg_transferable)
			high = ksg;
		if (low == NULL || ksg->ksg_load < low->ksg_load)
			low = ksg;
		if (++i > ksg_maxid)
			i = 0;
	}
	if (low != NULL && high != NULL && high != low)
		sched_balance_pair(LIST_FIRST(&high->ksg_members),
		    LIST_FIRST(&low->ksg_members));
out:
	bal_tick = ticks + (random() % (hz * 2));
}

static void
sched_balance_groups(void)
{
	int i;

	mtx_assert(&sched_lock, MA_OWNED);
	if (smp_started)
		for (i = 0; i <= ksg_maxid; i++)
			sched_balance_group(KSEQ_GROUP(i));
	gbal_tick = ticks + (random() % (hz * 2));
}

static void
sched_balance_group(struct kseq_group *ksg)
{
	struct kseq *kseq;
	struct kseq *high;
	struct kseq *low;
	int load;

	if (ksg->ksg_transferable == 0)
		return;
	low = NULL;
	high = NULL;
	LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
		load = kseq->ksq_load;
		if (high == NULL || load > high->ksq_load)
			high = kseq;
		if (low == NULL || load < low->ksq_load)
			low = kseq;
	}
	if (high != NULL && low != NULL && high != low)
		sched_balance_pair(high, low);
}

static void
sched_balance_pair(struct kseq *high, struct kseq *low)
{
	int transferable;
	int high_load;
	int low_load;
	int move;
	int diff;
	int i;

	/*
	 * If we're transferring within a group we have to use this specific
	 * kseq's transferable count, otherwise we can steal from other members
	 * of the group.
	 */
	if (high->ksq_group == low->ksq_group) {
		transferable = high->ksq_transferable;
		high_load = high->ksq_load;
		low_load = low->ksq_load;
	} else {
		transferable = high->ksq_group->ksg_transferable;
		high_load = high->ksq_group->ksg_load;
		low_load = low->ksq_group->ksg_load;
	}
	if (transferable == 0)
		return;
	/*
	 * Determine what the imbalance is and then adjust that to how many
	 * kses we actually have to give up (transferable).
	 */
	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;
	move = min(move, transferable);
	for (i = 0; i < move; i++)
		kseq_move(high, KSEQ_ID(low));
	return;
}
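
/*
 * A concrete instance of the arithmetic above: with a high load of 7 and a
 * low load of 2, diff is 5 and move rounds up to 3, leaving the pair at 4
 * and 5.  The min() against transferable keeps us from promising more kses
 * than may legally migrate; pinned and bound kses are already excluded
 * from that count by KSE_CAN_MIGRATE().
 */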
537123433Sjeff */ 538123487Sjeff if (high->ksq_group == low->ksq_group) { 539123487Sjeff transferable = high->ksq_transferable; 540123487Sjeff high_load = high->ksq_load; 541123487Sjeff low_load = low->ksq_load; 542123487Sjeff } else { 543123487Sjeff transferable = high->ksq_group->ksg_transferable; 544123487Sjeff high_load = high->ksq_group->ksg_load; 545123487Sjeff low_load = low->ksq_group->ksg_load; 546123487Sjeff } 547123433Sjeff if (transferable == 0) 548123487Sjeff return; 549123433Sjeff /* 550122744Sjeff * Determine what the imbalance is and then adjust that to how many 551123433Sjeff * kses we actually have to give up (transferable). 552122744Sjeff */ 553123487Sjeff diff = high_load - low_load; 554116069Sjeff move = diff / 2; 555116069Sjeff if (diff & 0x1) 556116069Sjeff move++; 557123433Sjeff move = min(move, transferable); 558116069Sjeff for (i = 0; i < move; i++) 559123487Sjeff kseq_move(high, KSEQ_ID(low)); 560116069Sjeff return; 561116069Sjeff} 562116069Sjeff 563121790Sjeffstatic void 564116069Sjeffkseq_move(struct kseq *from, int cpu) 565116069Sjeff{ 566123433Sjeff struct kseq *kseq; 567123433Sjeff struct kseq *to; 568116069Sjeff struct kse *ke; 569116069Sjeff 570123433Sjeff kseq = from; 571123433Sjeff to = KSEQ_CPU(cpu); 572123433Sjeff ke = kseq_steal(kseq, 1); 573123433Sjeff if (ke == NULL) { 574123433Sjeff struct kseq_group *ksg; 575123433Sjeff 576123433Sjeff ksg = kseq->ksq_group; 577123433Sjeff LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { 578123433Sjeff if (kseq == from || kseq->ksq_transferable == 0) 579123433Sjeff continue; 580123433Sjeff ke = kseq_steal(kseq, 1); 581123433Sjeff break; 582123433Sjeff } 583123433Sjeff if (ke == NULL) 584123433Sjeff panic("kseq_move: No KSEs available with a " 585123433Sjeff "transferable count of %d\n", 586123433Sjeff ksg->ksg_transferable); 587123433Sjeff } 588123433Sjeff if (kseq == to) 589123433Sjeff return; 590116069Sjeff ke->ke_state = KES_THREAD; 591123433Sjeff kseq_runq_rem(kseq, ke); 592123433Sjeff kseq_load_rem(kseq, ke); 593121923Sjeff kseq_notify(ke, cpu); 594116069Sjeff} 595110267Sjeff 596123433Sjeffstatic int 597123433Sjeffkseq_idled(struct kseq *kseq) 598121790Sjeff{ 599123433Sjeff struct kseq_group *ksg; 600123433Sjeff struct kseq *steal; 601123433Sjeff struct kse *ke; 602123433Sjeff 603123433Sjeff ksg = kseq->ksq_group; 604123433Sjeff /* 605123433Sjeff * If we're in a cpu group, try and steal kses from another cpu in 606123433Sjeff * the group before idling. 607123433Sjeff */ 608123433Sjeff if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) { 609123433Sjeff LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) { 610123433Sjeff if (steal == kseq || steal->ksq_transferable == 0) 611123433Sjeff continue; 612123433Sjeff ke = kseq_steal(steal, 0); 613123433Sjeff if (ke == NULL) 614123433Sjeff continue; 615123433Sjeff ke->ke_state = KES_THREAD; 616123433Sjeff kseq_runq_rem(steal, ke); 617123433Sjeff kseq_load_rem(steal, ke); 618123433Sjeff ke->ke_cpu = PCPU_GET(cpuid); 619123433Sjeff sched_add(ke->ke_thread); 620123433Sjeff return (0); 621123433Sjeff } 622123433Sjeff } 623123433Sjeff /* 624123433Sjeff * We only set the idled bit when all of the cpus in the group are 625123433Sjeff * idle. Otherwise we could get into a situation where a KSE bounces 626123433Sjeff * back and forth between two idle cores on seperate physical CPUs. 

static void
kseq_assign(struct kseq *kseq)
{
	struct kse *nke;
	struct kse *ke;

	do {
		(volatile struct kse *)ke = kseq->ksq_assigned;
	} while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL));
	for (; ke != NULL; ke = nke) {
		nke = ke->ke_assign;
		ke->ke_flags &= ~KEF_ASSIGNED;
		sched_add(ke->ke_thread);
	}
}

static void
kseq_notify(struct kse *ke, int cpu)
{
	struct kseq *kseq;
	struct thread *td;
	struct pcpu *pcpu;

	ke->ke_cpu = cpu;
	ke->ke_flags |= KEF_ASSIGNED;

	kseq = KSEQ_CPU(cpu);

	/*
	 * Place a KSE on another cpu's queue and force a resched.
	 */
	do {
		(volatile struct kse *)ke->ke_assign = kseq->ksq_assigned;
	} while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke));
	pcpu = pcpu_find(cpu);
	td = pcpu->pc_curthread;
	if (ke->ke_thread->td_priority < td->td_priority ||
	    td == pcpu->pc_idlethread) {
		td->td_flags |= TDF_NEEDRESCHED;
		ipi_selected(1 << cpu, IPI_AST);
	}
}
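
/*
 * The two cmpset loops above form a simple lock-free hand-off:
 * kseq_notify() pushes a kse onto the head of the remote cpu's
 * ksq_assigned list (linked through ke_assign), and kseq_assign()
 * detaches the entire list with a single atomic swap to NULL before
 * walking it.  Producers may race with each other on the push, but only
 * the owning cpu ever consumes, so a CAS on the list head is all the
 * synchronization required.
 */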

static struct kse *
runq_steal(struct runq *rq)
{
	struct rqhead *rqh;
	struct rqbits *rqb;
	struct kse *ke;
	int word;
	int bit;

	mtx_assert(&sched_lock, MA_OWNED);
	rqb = &rq->rq_status;
	for (word = 0; word < RQB_LEN; word++) {
		if (rqb->rqb_bits[word] == 0)
			continue;
		for (bit = 0; bit < RQB_BPW; bit++) {
			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
				continue;
			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
			TAILQ_FOREACH(ke, rqh, ke_procq) {
				if (KSE_CAN_MIGRATE(ke,
				    PRI_BASE(ke->ke_ksegrp->kg_pri_class)))
					return (ke);
			}
		}
	}
	return (NULL);
}

static struct kse *
kseq_steal(struct kseq *kseq, int stealidle)
{
	struct kse *ke;

	/*
	 * Steal from next first to try to get a non-interactive task that
	 * may not have run for a while.
	 */
	if ((ke = runq_steal(kseq->ksq_next)) != NULL)
		return (ke);
	if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
		return (ke);
	if (stealidle)
		return (runq_steal(&kseq->ksq_idle));
	return (NULL);
}

int
kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
{
	struct kseq_group *ksg;
	int cpu;

	if (smp_started == 0)
		return (0);
	cpu = 0;
	ksg = kseq->ksq_group;

	/*
	 * If there are any idle groups, give them our extra load.  The
	 * threshold at which we start to reassign kses has a large impact
	 * on the overall performance of the system.  Tuned too high and
	 * some CPUs may idle.  Too low and there will be excess migration
	 * and context switches.
	 */
	if (ksg->ksg_load > (ksg->ksg_cpus * 2) && kseq_idle) {
		/*
		 * Multiple cpus could find this bit simultaneously
		 * but the race shouldn't be terrible.
		 */
		cpu = ffs(kseq_idle);
		if (cpu)
			atomic_clear_int(&kseq_idle, 1 << (cpu - 1));
	}
	/*
	 * If another cpu in this group has idled, assign a thread over
	 * to them after checking to see if there are idled groups.
	 */
	if (cpu == 0 && kseq->ksq_load > 1 && ksg->ksg_idlemask) {
		cpu = ffs(ksg->ksg_idlemask);
		if (cpu)
			ksg->ksg_idlemask &= ~(1 << (cpu - 1));
	}
	/*
	 * Now that we've found an idle CPU, migrate the thread.
	 */
	if (cpu) {
		cpu--;
		ke->ke_runq = NULL;
		kseq_notify(ke, cpu);
		return (1);
	}
	return (0);
}

#endif	/* SMP */

/*
 * Pick the highest priority task we have and return it.
 */

static struct kse *
kseq_choose(struct kseq *kseq)
{
	struct kse *ke;
	struct runq *swap;

	mtx_assert(&sched_lock, MA_OWNED);
	swap = NULL;

	for (;;) {
		ke = runq_choose(kseq->ksq_curr);
		if (ke == NULL) {
			/*
			 * We already swapped once and didn't get anywhere.
			 */
			if (swap)
				break;
			swap = kseq->ksq_curr;
			kseq->ksq_curr = kseq->ksq_next;
			kseq->ksq_next = swap;
			continue;
		}
		/*
		 * If we encounter a slice of 0 the kse is in a
		 * TIMESHARE kse group and its nice was too far out
		 * of the range that receives slices.
		 */
		if (ke->ke_slice == 0) {
			runq_remove(ke->ke_runq, ke);
			sched_slice(ke);
			ke->ke_runq = kseq->ksq_next;
			runq_add(ke->ke_runq, ke);
			continue;
		}
		return (ke);
	}

	return (runq_choose(&kseq->ksq_idle));
}
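
/*
 * The curr/next pair manipulated above is what bounds starvation for
 * timeshare kses: expiring kses are requeued on ksq_next, so once
 * ksq_curr drains and the two queues swap, everything that already
 * expired gets a turn before anything that expires afterwards.
 * Interactive kses are placed directly on ksq_curr by SCHED_CURR() and
 * so never wait out a swap.
 */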

static void
kseq_setup(struct kseq *kseq)
{
	runq_init(&kseq->ksq_timeshare[0]);
	runq_init(&kseq->ksq_timeshare[1]);
	runq_init(&kseq->ksq_idle);
	kseq->ksq_curr = &kseq->ksq_timeshare[0];
	kseq->ksq_next = &kseq->ksq_timeshare[1];
	kseq->ksq_load = 0;
	kseq->ksq_load_timeshare = 0;
}

static void
sched_setup(void *dummy)
{
#ifdef SMP
	int balance_groups;
	int i;
#endif

	slice_min = (hz/100);	/* 10ms */
	slice_max = (hz/7);	/* ~140ms */

#ifdef SMP
	balance_groups = 0;
	/*
	 * Initialize the kseqs.
	 */
	for (i = 0; i < MAXCPU; i++) {
		struct kseq *ksq;

		ksq = &kseq_cpu[i];
		ksq->ksq_assigned = NULL;
		kseq_setup(&kseq_cpu[i]);
	}
	if (smp_topology == NULL) {
		struct kseq_group *ksg;
		struct kseq *ksq;

		for (i = 0; i < MAXCPU; i++) {
			ksq = &kseq_cpu[i];
			ksg = &kseq_groups[i];
			/*
			 * Setup a kseq group with one member.
			 */
			ksq->ksq_transferable = 0;
			ksq->ksq_group = ksg;
			ksg->ksg_cpus = 1;
			ksg->ksg_idlemask = 0;
			ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			LIST_INIT(&ksg->ksg_members);
			LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
		}
	} else {
		struct kseq_group *ksg;
		struct cpu_group *cg;
		int j;

		for (i = 0; i < smp_topology->ct_count; i++) {
			cg = &smp_topology->ct_group[i];
			ksg = &kseq_groups[i];
			/*
			 * Initialize the group.
			 */
			ksg->ksg_idlemask = 0;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			ksg->ksg_cpus = cg->cg_count;
			ksg->ksg_cpumask = cg->cg_mask;
			LIST_INIT(&ksg->ksg_members);
			/*
			 * Find all of the group members and add them.
			 */
			for (j = 0; j < MAXCPU; j++) {
				if ((cg->cg_mask & (1 << j)) != 0) {
					if (ksg->ksg_mask == 0)
						ksg->ksg_mask = 1 << j;
					kseq_cpu[j].ksq_transferable = 0;
					kseq_cpu[j].ksq_group = ksg;
					LIST_INSERT_HEAD(&ksg->ksg_members,
					    &kseq_cpu[j], ksq_siblings);
				}
			}
			if (ksg->ksg_cpus > 1)
				balance_groups = 1;
		}
		ksg_maxid = smp_topology->ct_count - 1;
	}
	/*
	 * Stagger the group and global load balancer so they do not
	 * interfere with each other.
	 */
	bal_tick = ticks + hz;
	if (balance_groups)
		gbal_tick = ticks + (hz / 2);
#else
	kseq_setup(KSEQ_SELF());
#endif
	mtx_lock_spin(&sched_lock);
	kseq_load_add(KSEQ_SELF(), &kse0);
	mtx_unlock_spin(&sched_lock);
}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct ksegrp *kg)
{
	int pri;

	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;

	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
	pri += SCHED_PRI_BASE;
	pri += kg->kg_proc->p_nice;

	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	kg->kg_user_pri = pri;

	return;
}

/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 */
static void
sched_slice(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;
	kseq = KSEQ_CPU(ke->ke_cpu);

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get the minimum slice so that we
	 * quickly notice if one abuses its advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the kseg's nice value relative to the least nice kseg
	 * on the run queue for this cpu.
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse distance from the last nice ksegrp.
	 *
	 * If the kse is outside of the window it will get no slice
	 * and will be reevaluated each time it is selected on the
	 * run queue.  The exception to this is nice 0 ksegs when
	 * a nice -20 is running.  They are always granted a minimum
	 * slice.
	 */
	if (!SCHED_INTERACTIVE(kg)) {
		int nice;

		nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin);
		if (kseq->ksq_load_timeshare == 0 ||
		    kg->kg_proc->p_nice < kseq->ksq_nicemin)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_SLICE_NTHRESH)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else if (kg->kg_proc->p_nice == 0)
			ke->ke_slice = SCHED_SLICE_MIN;
		else
			ke->ke_slice = 0;
	} else
		ke->ke_slice = SCHED_SLICE_INTERACTIVE;

	CTR6(KTR_ULE,
	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
	    ke, ke->ke_slice, kg->kg_proc->p_nice, kseq->ksq_nicemin,
	    kseq->ksq_load_timeshare, SCHED_INTERACTIVE(kg));

	return;
}
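
/*
 * Making the window above concrete (hz = 1000 defaults assumed, as in the
 * slice macro example earlier): with ksq_nicemin at -5, a kseg at nice -7
 * beats the minimum and receives SCHED_SLICE_MAX; nice 0 sits at offset 5
 * in the window and receives 142 - (5 * 133) / 19 = 107 ticks; nice +18
 * sits at offset 23, past SCHED_SLICE_NTHRESH, and receives a 0 slice.
 * Note also that SCHED_SLICE_INTERACTIVE is currently defined as
 * slice_max, so interactive ksegs in fact receive the maximum slice
 * despite the older "minimum slice" rationale above.
 */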

/*
 * This routine enforces a maximum limit on the amount of scheduling history
 * kept.  It is called after either the slptime or runtime is adjusted.
 * This routine will not operate correctly when slp or run times have been
 * adjusted to more than double their maximum.
 */
static void
sched_interact_update(struct ksegrp *kg)
{
	int sum;

	sum = kg->kg_runtime + kg->kg_slptime;
	if (sum < SCHED_SLP_RUN_MAX)
		return;
	/*
	 * If we have exceeded by more than 1/5th then the algorithm below
	 * will not bring us back into range.  Dividing by two here forces
	 * us into the range of [3/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX]
	 */
	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
		kg->kg_runtime /= 2;
		kg->kg_slptime /= 2;
		return;
	}
	kg->kg_runtime = (kg->kg_runtime / 5) * 4;
	kg->kg_slptime = (kg->kg_slptime / 5) * 4;
}

static void
sched_interact_fork(struct ksegrp *kg)
{
	int ratio;
	int sum;

	sum = kg->kg_runtime + kg->kg_slptime;
	if (sum > SCHED_SLP_RUN_FORK) {
		ratio = sum / SCHED_SLP_RUN_FORK;
		kg->kg_runtime /= ratio;
		kg->kg_slptime /= ratio;
	}
}

static int
sched_interact_score(struct ksegrp *kg)
{
	int div;

	if (kg->kg_runtime > kg->kg_slptime) {
		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
	}
	if (kg->kg_slptime > kg->kg_runtime) {
		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
		return (kg->kg_runtime / div);
	}

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);

}
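
/*
 * Worked examples of the score above: a kseg that sleeps twice as long as
 * it runs scores run / (slp / SCHED_INTERACT_HALF) = 25 and counts as
 * interactive under SCHED_INTERACT_THRESH (30); one that runs twice as
 * long as it sleeps scores 50 + (50 - 25) = 75.  When neither time
 * exceeds the other we fall through to the return (0) above.
 */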

/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most SCHED_SLICE_MAX.
 */
int
sched_rr_interval(void)
{
	return (SCHED_SLICE_MAX);
}

static void
sched_pctcpu_update(struct kse *ke)
{
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */
	if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
		/*
		 * Shift the tick count out so that the divide doesn't
		 * round away our results.
		 */
		ke->ke_ticks <<= 10;
		ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
			    SCHED_CPU_TICKS;
		ke->ke_ticks >>= 10;
	} else
		ke->ke_ticks = 0;
	ke->ke_ltick = ticks;
	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}
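
/*
 * The shifts above are plain fixed point arithmetic: <<10 carries ten
 * fractional bits through the divide so the rescale does not truncate to
 * zero.  For example, with hz = 1000 (SCHED_CPU_TICKS = 10000), 2500 run
 * ticks over an elapsed 5000 ticks becomes ((2500 << 10) / 5000) * 10000
 * >> 10 = 5000, i.e. the usage is renormalized to 50% of the standard
 * SCHED_CPU_TICKS window.
 */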

void
sched_prio(struct thread *td, u_char prio)
{
	struct kse *ke;

	ke = td->td_kse;
	mtx_assert(&sched_lock, MA_OWNED);
	if (TD_ON_RUNQ(td)) {
		/*
		 * If the priority has been elevated due to priority
		 * propagation, we may have to move ourselves to a new
		 * queue.  We still call adjustrunqueue below in case kse
		 * needs to fix things up.
		 */
		if (prio < td->td_priority && ke &&
		    (ke->ke_flags & KEF_ASSIGNED) == 0 &&
		    ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
			runq_remove(ke->ke_runq, ke);
			ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
			runq_add(ke->ke_runq, ke);
		}
		adjustrunqueue(td, prio);
	} else
		td->td_priority = prio;
}

void
sched_switch(struct thread *td, struct thread *newtd)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;

	td->td_last_kse = ke;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_OWEPREEMPT);

	/*
	 * If the KSE has been assigned it may be in the process of switching
	 * to the new cpu.  This is the case in sched_bind().
	 */
	if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
		if (td == PCPU_GET(idlethread))
			TD_SET_CAN_RUN(td);
		else if (TD_IS_RUNNING(td)) {
			kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
			setrunqueue(td);
		} else {
			if (ke->ke_runq) {
				kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
			} else if ((td->td_flags & TDF_IDLETD) == 0)
				backtrace();
			/*
			 * We will not be on the run queue.  So we must be
			 * sleeping or similar.
			 */
			if (td->td_proc->p_flag & P_SA)
				kse_reassign(ke);
		}
	}
	if (newtd == NULL)
		newtd = choosethread();
	else
		kseq_load_add(KSEQ_SELF(), newtd->td_kse);
	if (td != newtd)
		cpu_switch(td, newtd);
	sched_lock.mtx_lock = (uintptr_t)td;

	td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_nice(struct proc *p, int nice)
{
	struct ksegrp *kg;
	struct kse *ke;
	struct thread *td;
	struct kseq *kseq;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * We need to adjust the nice counts for running KSEs.
	 */
	FOREACH_KSEGRP_IN_PROC(p, kg) {
		if (kg->kg_pri_class == PRI_TIMESHARE) {
			FOREACH_KSE_IN_GROUP(kg, ke) {
				if (ke->ke_runq == NULL)
					continue;
				kseq = KSEQ_CPU(ke->ke_cpu);
				kseq_nice_rem(kseq, p->p_nice);
				kseq_nice_add(kseq, nice);
			}
		}
	}
	p->p_nice = nice;
	FOREACH_KSEGRP_IN_PROC(p, kg) {
		sched_priority(kg);
		FOREACH_THREAD_IN_GROUP(kg, td)
			td->td_flags |= TDF_NEEDRESCHED;
	}
}

void
sched_sleep(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_slptime = ticks;
	td->td_base_pri = td->td_priority;

	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
	    td->td_kse, td->td_slptime);
}

void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * Let the kseg know how long we slept for.  This is because process
	 * interactivity behavior is modeled in the kseg.
	 */
	if (td->td_slptime) {
		struct ksegrp *kg;
		int hzticks;

		kg = td->td_ksegrp;
		hzticks = (ticks - td->td_slptime) << 10;
		if (hzticks >= SCHED_SLP_RUN_MAX) {
			kg->kg_slptime = SCHED_SLP_RUN_MAX;
			kg->kg_runtime = 1;
		} else {
			kg->kg_slptime += hzticks;
			sched_interact_update(kg);
		}
		sched_priority(kg);
		if (td->td_kse)
			sched_slice(td->td_kse);
		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
		    td->td_kse, hzticks);
		td->td_slptime = 0;
	}
	setrunqueue(td);
}
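
/*
 * Sleep time is credited in the same <<10 fixed point used for
 * kg_runtime: at hz = 1000 a 100 tick (100ms) sleep adds 100 << 10 =
 * 102400 to kg_slptime above, which is directly comparable to the
 * tickincr << 10 charged per tick of run time in sched_clock().
 */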

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct proc *p, struct proc *p1)
{

	mtx_assert(&sched_lock, MA_OWNED);

	p1->p_nice = p->p_nice;
	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
}

void
sched_fork_kse(struct kse *ke, struct kse *child)
{

	child->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
	child->ke_cpu = ke->ke_cpu;
	child->ke_runq = NULL;

	/* Grab our parent's cpu estimation information. */
	child->ke_ticks = ke->ke_ticks;
	child->ke_ltick = ke->ke_ltick;
	child->ke_ftick = ke->ke_ftick;
}

void
sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);

	child->kg_slptime = kg->kg_slptime;
	child->kg_runtime = kg->kg_runtime;
	child->kg_user_pri = kg->kg_user_pri;
	sched_interact_fork(child);
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	CTR6(KTR_ULE, "sched_fork_ksegrp: %d(%d, %d) - %d(%d, %d)",
	    kg->kg_proc->p_pid, kg->kg_slptime, kg->kg_runtime,
	    child->kg_proc->p_pid, child->kg_slptime, child->kg_runtime);
}

void
sched_fork_thread(struct thread *td, struct thread *child)
{
}
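
/*
 * The net effect of the fork path above: the child starts with the
 * parent's sleep/run history compressed to at most SCHED_SLP_RUN_FORK by
 * sched_interact_fork() and a one tick slice, so it must demonstrate
 * interactivity quickly, while the parent is charged one tick of run
 * time so that repeated forking is not free.
 */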

void
sched_class(struct ksegrp *kg, int class)
{
	struct kseq *kseq;
	struct kse *ke;
	int nclass;
	int oclass;

	mtx_assert(&sched_lock, MA_OWNED);
	if (kg->kg_pri_class == class)
		return;

	nclass = PRI_BASE(class);
	oclass = PRI_BASE(kg->kg_pri_class);
	FOREACH_KSE_IN_GROUP(kg, ke) {
		if (ke->ke_state != KES_ONRUNQ &&
		    ke->ke_state != KES_THREAD)
			continue;
		kseq = KSEQ_CPU(ke->ke_cpu);

#ifdef SMP
		/*
		 * On SMP if we're on the RUNQ we must adjust the transferable
		 * count because we could be changing to or from an interrupt
		 * class.
		 */
		if (ke->ke_state == KES_ONRUNQ) {
			if (KSE_CAN_MIGRATE(ke, oclass)) {
				kseq->ksq_transferable--;
				kseq->ksq_group->ksg_transferable--;
			}
			if (KSE_CAN_MIGRATE(ke, nclass)) {
				kseq->ksq_transferable++;
				kseq->ksq_group->ksg_transferable++;
			}
		}
#endif
		if (oclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare--;
			kseq_nice_rem(kseq, kg->kg_proc->p_nice);
		}
		if (nclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare++;
			kseq_nice_add(kseq, kg->kg_proc->p_nice);
		}
	}

	kg->kg_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct proc *child)
{
	mtx_assert(&sched_lock, MA_OWNED);
	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child));
}

void
sched_exit_kse(struct kse *ke, struct kse *child)
{
	kseq_load_rem(KSEQ_CPU(child->ke_cpu), child);
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
	/* kg->kg_slptime += child->kg_slptime; */
	kg->kg_runtime += child->kg_runtime;
	sched_interact_update(kg);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{
}

void
sched_clock(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP
	if (ticks == bal_tick)
		sched_balance();
	if (ticks == gbal_tick)
		sched_balance_groups();
#endif
	/*
	 * sched_setup() apparently happens prior to stathz being set.  We
	 * need to resolve the timers earlier in the boot so we can avoid
	 * calculating this here.
	 */
	if (realstathz == 0) {
		realstathz = stathz ? stathz : hz;
		tickincr = hz / realstathz;
		/*
		 * XXX This does not work for values of stathz that are much
		 * larger than hz.
		 */
		if (tickincr == 0)
			tickincr = 1;
	}

	ke = td->td_kse;
	kg = ke->ke_ksegrp;

	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;

	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	if (td->td_flags & TDF_IDLETD)
		return;

	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
	/*
	 * We only do slicing code for TIMESHARE ksegrps.
	 */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * We used a tick; charge it to the ksegrp so that we can compute our
	 * interactivity.
	 */
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	/*
	 * We used up one time slice.
	 */
	if (--ke->ke_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
	 */
	kseq = KSEQ_SELF();
	kseq_load_rem(kseq, ke);
	sched_priority(kg);
	sched_slice(ke);
	if (SCHED_CURR(kg, ke))
		ke->ke_runq = kseq->ksq_curr;
	else
		ke->ke_runq = kseq->ksq_next;
	kseq_load_add(kseq, ke);
	td->td_flags |= TDF_NEEDRESCHED;
}

int
sched_runnable(void)
{
	struct kseq *kseq;
	int load;

	load = 1;

	kseq = KSEQ_SELF();
#ifdef SMP
	if (kseq->ksq_assigned) {
		mtx_lock_spin(&sched_lock);
		kseq_assign(kseq);
		mtx_unlock_spin(&sched_lock);
	}
#endif
	if ((curthread->td_flags & TDF_IDLETD) != 0) {
		if (kseq->ksq_load > 0)
			goto out;
	} else
		if (kseq->ksq_load - 1 > 0)
			goto out;
	load = 0;
out:
	return (load);
}

void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;

	kg = td->td_ksegrp;

	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		mtx_unlock_spin(&sched_lock);
	}
}

struct kse *
sched_choose(void)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	kseq = KSEQ_SELF();
#ifdef SMP
restart:
	if (kseq->ksq_assigned)
		kseq_assign(kseq);
#endif
	ke = kseq_choose(kseq);
	if (ke) {
#ifdef SMP
		if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
			if (kseq_idled(kseq) == 0)
				goto restart;
#endif
		kseq_runq_rem(kseq, ke);
		ke->ke_state = KES_THREAD;

		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
			    ke, ke->ke_runq, ke->ke_slice,
			    ke->ke_thread->td_priority);
		}
		return (ke);
	}
#ifdef SMP
	if (kseq_idled(kseq) == 0)
		goto restart;
#endif
	return (NULL);
}
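
/*
 * A note on the restart loop in sched_choose() above: on SMP, when the
 * local queues are empty or hold only an idle-class kse, kseq_idled()
 * gets a chance to steal a transferable kse from another cpu; a return
 * of 0 means the queue state may have changed, so selection restarts.
 * A minimal model of the control flow, with hypothetical helpers
 * standing in for the kseq primitives:
 *
 *	for (;;) {
 *		ke = pick_local();
 *		if (ke == NULL || is_idle_class(ke))
 *			if (try_steal() == 0)
 *				continue;       (re-examine the queues)
 *		return (ke);                    (may be NULL: go idle)
 *	}
 */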

void
sched_add(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;
	int class;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	kg = td->td_ksegrp;
	if (ke->ke_flags & KEF_ASSIGNED)
		return;
	kseq = KSEQ_SELF();
	KASSERT((ke->ke_thread != NULL),
	    ("sched_add: No thread on KSE"));
	KASSERT((ke->ke_thread->td_kse != NULL),
	    ("sched_add: No KSE on thread"));
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	KASSERT(ke->ke_runq == NULL,
	    ("sched_add: KSE %p is still assigned to a run queue", ke));

	class = PRI_BASE(kg->kg_pri_class);
	switch (class) {
	case PRI_ITHD:
	case PRI_REALTIME:
		ke->ke_runq = kseq->ksq_curr;
		ke->ke_slice = SCHED_SLICE_MAX;
		ke->ke_cpu = PCPU_GET(cpuid);
		break;
	case PRI_TIMESHARE:
		if (SCHED_CURR(kg, ke))
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = kseq->ksq_next;
		break;
	case PRI_IDLE:
		/*
		 * This is for priority propagation.
		 */
		if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = &kseq->ksq_idle;
		ke->ke_slice = SCHED_SLICE_MIN;
		break;
	default:
		panic("Unknown pri class.");
		break;
	}
#ifdef SMP
	if (ke->ke_cpu != PCPU_GET(cpuid)) {
		ke->ke_runq = NULL;
		kseq_notify(ke, ke->ke_cpu);
		return;
	}
	/*
	 * If we had been idle, clear our bit in the group and potentially
	 * the global bitmap.  If not, see if we should transfer this thread.
	 */
	if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
	    (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
		/*
		 * Check to see if our group is unidling, and if so, remove it
		 * from the global idle mask.
		 */
		if (kseq->ksq_group->ksg_idlemask ==
		    kseq->ksq_group->ksg_cpumask)
			atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
		/*
		 * Now remove ourselves from the group specific idle mask.
		 */
		kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
	} else if (kseq->ksq_load > 1 && KSE_CAN_MIGRATE(ke, class))
		if (kseq_transfer(kseq, ke, class))
			return;
#endif
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;

#ifdef SMP
	/*
	 * Only try to preempt if the thread is unpinned or pinned to the
	 * current CPU.
	 */
	if (KSE_CAN_MIGRATE(ke, class) || ke->ke_cpu == PCPU_GET(cpuid))
#endif
	if (maybe_preempt(td))
		return;
	ke->ke_ksegrp->kg_runq_kses++;
	ke->ke_state = KES_ONRUNQ;

	kseq_runq_add(kseq, ke);
	kseq_load_add(kseq, ke);
}

void
sched_rem(struct thread *td)
{
	struct kseq *kseq;
	struct kse *ke;

	ke = td->td_kse;
	/*
	 * It is safe to just return here because sched_rem() is only ever
	 * used in places where we're immediately going to add the
	 * kse back on again.  In that case it'll be added with the correct
	 * thread and priority when the caller drops the sched_lock.
	 */
	if (ke->ke_flags & KEF_ASSIGNED)
		return;
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_state == KES_ONRUNQ),
	    ("sched_rem: KSE not on run queue"));

	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq = KSEQ_CPU(ke->ke_cpu);
	kseq_runq_rem(kseq, ke);
	kseq_load_rem(kseq, ke);
}
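
/*
 * Worked example for the fixed-point math in sched_pctcpu() below.
 * Since FSCALE is 1 << FSHIFT, the expression
 *
 *	(FSCALE * ((FSCALE * rtick) / realstathz)) >> FSHIFT
 *
 * reduces to (FSCALE * rtick) / realstathz.  Under assumed values, a
 * kse that ran for half of the statclock ticks in the averaging window
 * earns rtick == realstathz / 2 and thus reports FSCALE / 2, which ps
 * displays as 50%.
 */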

fixpt_t
sched_pctcpu(struct thread *td)
{
	fixpt_t pctcpu;
	struct kse *ke;

	pctcpu = 0;
	ke = td->td_kse;
	if (ke == NULL)
		return (0);

	mtx_lock_spin(&sched_lock);
	if (ke->ke_ticks) {
		int rtick;

		/*
		 * Don't update more frequently than twice a second.  Allowing
		 * this causes the cpu usage to decay away too quickly due to
		 * rounding errors.
		 */
		if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
		    ke->ke_ltick < (ticks - (hz / 2)))
			sched_pctcpu_update(ke);
		/* How many rticks per second? */
		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
	}

	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
	mtx_unlock_spin(&sched_lock);

	return (pctcpu);
}

void
sched_bind(struct thread *td, int cpu)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	ke->ke_flags |= KEF_BOUND;
#ifdef SMP
	if (PCPU_GET(cpuid) == cpu)
		return;
	/* sched_rem without the runq_remove */
	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
	kseq_notify(ke, cpu);
	/* When we return from mi_switch we'll be on the correct cpu. */
	mi_switch(SW_VOL);
#endif
}

void
sched_unbind(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	td->td_kse->ke_flags &= ~KEF_BOUND;
}

int
sched_load(void)
{
#ifdef SMP
	int total;
	int i;

	total = 0;
	for (i = 0; i <= ksg_maxid; i++)
		total += KSEQ_GROUP(i)->ksg_load;
	return (total);
#else
	return (KSEQ_SELF()->ksq_sysload);
#endif
}

int
sched_sizeof_kse(void)
{
	return (sizeof(struct kse) + sizeof(struct ke_sched));
}

int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}
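
/*
 * Hedged usage sketch for sched_bind()/sched_unbind() above; the
 * locking pattern is an assumption modeled on contemporary callers,
 * not a fragment of this file.  sched_bind() may mi_switch() away and
 * return already running on the target cpu:
 *
 *	mtx_lock_spin(&sched_lock);
 *	sched_bind(td, cpu);            (may block until on 'cpu')
 *	mtx_unlock_spin(&sched_lock);
 *	... per-cpu work; KEF_BOUND keeps the thread on 'cpu' ...
 *	mtx_lock_spin(&sched_lock);
 *	sched_unbind(td);
 *	mtx_unlock_spin(&sched_lock);
 */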