sched_ule.c revision 123685
/*-
 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 123685 2003-12-20 14:03:14Z jeff $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>
#include <machine/smp.h>

#define	KTR_ULE	KTR_NFS

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");

static int sched_strict;
SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, "");

static int slice_min = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");

static int slice_max = 10;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");

int realstathz;
int tickincr = 1;

#ifdef SMP
/* Callouts to handle load balancing SMP systems. */
static struct callout kseq_lb_callout;
static struct callout kseq_group_callout;
#endif

/*
 * These data structures are allocated within their parent data structure but
 * are scheduler specific.
 */

struct ke_sched {
	int		ske_slice;
	struct runq	*ske_runq;
	/* The following variables are only used for pctcpu calculation */
	int		ske_ltick;	/* Last tick that we were running on */
	int		ske_ftick;	/* First tick that we were running on */
	int		ske_ticks;	/* Tick count */
	/* CPU that we have affinity for. */
	u_char		ske_cpu;
};
#define	ke_slice	ke_sched->ske_slice
#define	ke_runq		ke_sched->ske_runq
#define	ke_ltick	ke_sched->ske_ltick
#define	ke_ftick	ke_sched->ske_ftick
#define	ke_ticks	ke_sched->ske_ticks
#define	ke_cpu		ke_sched->ske_cpu
#define	ke_assign	ke_procq.tqe_next

#define	KEF_ASSIGNED	KEF_SCHED0	/* KSE is being migrated. */
#define	KEF_BOUND	KEF_SCHED1	/* KSE can not migrate. */

struct kg_sched {
	int	skg_slptime;		/* Number of ticks we vol. slept */
	int	skg_runtime;		/* Number of ticks we were running */
};
#define	kg_slptime	kg_sched->skg_slptime
#define	kg_runtime	kg_sched->skg_runtime

struct td_sched {
	int	std_slptime;
};
#define	td_slptime	td_sched->std_slptime

struct td_sched td_sched;
struct ke_sched ke_sched;
struct kg_sched kg_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;

/*
 * The priority is primarily determined by the interactivity score.  Thus, we
 * give lower(better) priorities to kse groups that use less CPU.  The nice
 * value is then directly added to this to allow nice to have some effect
 * on latency.
 *
 * PRI_RANGE:	Total priority range for timeshare threads.
 * PRI_NRESV:	Number of nice values.
 * PRI_BASE:	The start of the dynamic range.
 */
#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	SCHED_PRI_NRESV		((PRIO_MAX - PRIO_MIN) + 1)
#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
#define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
#define	SCHED_PRI_INTERACT(score)					\
    ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)

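/*
 * Illustrative example (not part of the original source): assuming the
 * stock priority layout where PRI_MIN_TIMESHARE..PRI_MAX_TIMESHARE spans
 * 64 priorities, SCHED_PRI_RANGE is 64.  An interactivity score right at
 * SCHED_INTERACT_THRESH (30) then maps to SCHED_PRI_INTERACT(30) =
 * 30 * 64 / 100 = 19 priorities above PRI_MIN_TIMESHARE, before the nice
 * value is added in sched_priority() below.
 */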
/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 5) << 10)
#define	SCHED_SLP_RUN_FORK	((hz / 2) << 10)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(30)

/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:	Determine the amount of slice granted to a scaled nice.
 * SLICE_NTHRESH:	The nice cutoff point for slice assignment.
 */
#define	SCHED_SLICE_MIN			(slice_min)
#define	SCHED_SLICE_MAX			(slice_max)
#define	SCHED_SLICE_INTERACTIVE		(slice_min * 4)
#define	SCHED_SLICE_NTHRESH		(SCHED_PRI_NHALF - 1)
#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_SLICE_NICE(nice)						\
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))
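/*
 * Illustrative example (not part of the original source): assuming hz is
 * 1000, sched_setup() below sets slice_min to 10 ticks and slice_max to
 * 142 ticks, giving SCHED_SLICE_RANGE = 133.  A kseg five nice values
 * away from the least nice kseg on its queue receives
 * SCHED_SLICE_NICE(5) = 142 - (5 * 133 / 19) = 107 ticks, while one that
 * is SCHED_SLICE_NTHRESH (19) steps away receives 142 - 133 = 9 ticks,
 * i.e. roughly SCHED_SLICE_MIN.
 */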

/*
 * This macro determines whether or not the kse belongs on the current or
 * next run queue.
 */
#define	SCHED_INTERACTIVE(kg)						\
    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
#define	SCHED_CURR(kg, ke)						\
    (ke->ke_thread->td_priority != kg->kg_user_pri ||			\
    SCHED_INTERACTIVE(kg))

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */

#define	SCHED_CPU_TIME	10
#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)

/*
 * kseq - per processor runqs and statistics.
 */
struct kseq {
	struct runq	ksq_idle;		/* Queue of IDLE threads. */
	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
	struct runq	*ksq_next;		/* Next timeshare queue. */
	struct runq	*ksq_curr;		/* Current queue. */
	int		ksq_load_timeshare;	/* Load for timeshare. */
	int		ksq_load;		/* Aggregate load. */
	short		ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
	short		ksq_nicemin;		/* Least nice. */
#ifdef SMP
	int		ksq_transferable;
	LIST_ENTRY(kseq) ksq_siblings;	/* Next in kseq group. */
	struct kseq_group *ksq_group;	/* Our processor group. */
	volatile struct kse *ksq_assigned;	/* assigned by another CPU. */
#endif
};

#ifdef SMP
/*
 * kseq groups are groups of processors which can cheaply share threads.  When
 * one processor in the group goes idle it will check the runqs of the other
 * processors in its group prior to halting and waiting for an interrupt.
 * These groups are suitable for SMT (Symmetric Multi-Threading) and not NUMA.
 * In a NUMA environment we'd want an idle bitmap per group and a two tiered
 * load balancer.
 */
struct kseq_group {
	int	ksg_cpus;		/* Count of CPUs in this kseq group. */
	int	ksg_cpumask;		/* Mask of cpus in this group. */
	int	ksg_idlemask;		/* Idle cpus in this group. */
	int	ksg_mask;		/* Bit mask for first cpu. */
	int	ksg_load;		/* Total load of this group. */
	int	ksg_transferable;	/* Transferable load of this group. */
	LIST_HEAD(, kseq)	ksg_members; /* Linked list of all members. */
};
#endif

/*
 * One kse queue per processor.
 */
#ifdef SMP
static int kseq_idle;
static int ksg_maxid;
static struct kseq	kseq_cpu[MAXCPU];
static struct kseq_group kseq_groups[MAXCPU];
#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
#define	KSEQ_ID(x)	((x) - kseq_cpu)
#define	KSEQ_GROUP(x)	(&kseq_groups[(x)])
#else	/* !SMP */
static struct kseq	kseq_cpu;
#define	KSEQ_SELF()	(&kseq_cpu)
#define	KSEQ_CPU(x)	(&kseq_cpu)
#endif

static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
static void sched_interact_update(struct ksegrp *kg);
static void sched_interact_fork(struct ksegrp *kg);
static void sched_pctcpu_update(struct kse *ke);

/* Operations on per processor queues */
static struct kse * kseq_choose(struct kseq *kseq);
static void kseq_setup(struct kseq *kseq);
static void kseq_load_add(struct kseq *kseq, struct kse *ke);
static void kseq_load_rem(struct kseq *kseq, struct kse *ke);
static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke);
static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke);
static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class);
static struct kse *runq_steal(struct runq *rq);
static void sched_balance(void *arg);
static void sched_balance_group(struct kseq_group *ksg);
static void sched_balance_pair(struct kseq *high, struct kseq *low);
static void kseq_move(struct kseq *from, int cpu);
static int kseq_idled(struct kseq *kseq);
static void kseq_notify(struct kse *ke, int cpu);
static void kseq_assign(struct kseq *);
static struct kse *kseq_steal(struct kseq *kseq, int stealidle);
#define	KSE_CAN_MIGRATE(ke, class)					\
    ((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 &&		\
    ((ke)->ke_flags & KEF_BOUND) == 0)
#endif

void
kseq_print(int cpu)
{
	struct kseq *kseq;
	int i;

	kseq = KSEQ_CPU(cpu);

	printf("kseq:\n");
	printf("\tload:           %d\n", kseq->ksq_load);
	printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare);
#ifdef SMP
	printf("\tload transferable: %d\n", kseq->ksq_transferable);
#endif
	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
	printf("\tnice counts:\n");
	for (i = 0; i < SCHED_PRI_NRESV; i++)
		if (kseq->ksq_nice[i])
			printf("\t\t%d = %d\n",
			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
}

static __inline void
kseq_runq_add(struct kseq *kseq, struct kse *ke)
{
#ifdef SMP
	if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) {
		kseq->ksq_transferable++;
		kseq->ksq_group->ksg_transferable++;
	}
#endif
	runq_add(ke->ke_runq, ke);
}

static __inline void
kseq_runq_rem(struct kseq *kseq, struct kse *ke)
{
#ifdef SMP
	if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) {
		kseq->ksq_transferable--;
		kseq->ksq_group->ksg_transferable--;
	}
#endif
	runq_remove(ke->ke_runq, ke);
}

static void
kseq_load_add(struct kseq *kseq, struct kse *ke)
{
	int class;
	mtx_assert(&sched_lock, MA_OWNED);
	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
	if (class == PRI_TIMESHARE)
		kseq->ksq_load_timeshare++;
	kseq->ksq_load++;
#ifdef SMP
	if (class != PRI_ITHD)
		kseq->ksq_group->ksg_load++;
#endif
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		CTR6(KTR_ULE,
		    "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
		    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
		    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
}

static void
kseq_load_rem(struct kseq *kseq, struct kse *ke)
{
	int class;
	mtx_assert(&sched_lock, MA_OWNED);
	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
	if (class == PRI_TIMESHARE)
		kseq->ksq_load_timeshare--;
#ifdef SMP
	if (class != PRI_ITHD)
		kseq->ksq_group->ksg_load--;
#endif
	kseq->ksq_load--;
	ke->ke_runq = NULL;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
}

static void
kseq_nice_add(struct kseq *kseq, int nice)
{
	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
	if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1)
		kseq->ksq_nicemin = nice;
}

static void
kseq_nice_rem(struct kseq *kseq, int nice)
{
	int n;

	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	n = nice + SCHED_PRI_NHALF;
	kseq->ksq_nice[n]--;
	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));

	/*
	 * If this wasn't the smallest nice value or there are more in
	 * this bucket we can just return.  Otherwise we have to recalculate
	 * the smallest nice.
	 */
	if (nice != kseq->ksq_nicemin ||
	    kseq->ksq_nice[n] != 0 ||
	    kseq->ksq_load_timeshare == 0)
		return;

	for (; n < SCHED_PRI_NRESV; n++)
		if (kseq->ksq_nice[n]) {
			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
			return;
		}
}

#ifdef SMP
/*
 * sched_balance is a simple CPU load balancing algorithm.  It operates by
 * finding the least loaded and most loaded cpu and equalizing their load
 * by migrating some processes.
 *
 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 * installations will only have 2 cpus.  Secondly, load balancing too much at
 * once can have an unpleasant effect on the system.  The scheduler rarely has
 * enough information to make perfect decisions.  So this algorithm chooses
 * simplicity and more gradual effects on load in larger systems.
 *
 * It could be improved by considering the priorities and slices assigned to
 * each task prior to balancing them.  There are many pathological cases with
 * any approach and so the semi random algorithm below may work as well as any.
 *
 */
static void
sched_balance(void *arg)
{
	struct kseq_group *high;
	struct kseq_group *low;
	struct kseq_group *ksg;
	int timo;
	int cnt;
	int i;

	mtx_lock_spin(&sched_lock);
	if (smp_started == 0)
		goto out;
	low = high = NULL;
	i = random() % (ksg_maxid + 1);
	for (cnt = 0; cnt <= ksg_maxid; cnt++) {
		ksg = KSEQ_GROUP(i);
		/*
		 * Find the CPU with the highest load that has some
		 * threads to transfer.
		 */
		if ((high == NULL || ksg->ksg_load > high->ksg_load)
		    && ksg->ksg_transferable)
			high = ksg;
		if (low == NULL || ksg->ksg_load < low->ksg_load)
			low = ksg;
		if (++i > ksg_maxid)
			i = 0;
	}
	if (low != NULL && high != NULL && high != low)
		sched_balance_pair(LIST_FIRST(&high->ksg_members),
		    LIST_FIRST(&low->ksg_members));
out:
	mtx_unlock_spin(&sched_lock);
	timo = random() % (hz * 2);
	callout_reset(&kseq_lb_callout, timo, sched_balance, NULL);
}

static void
sched_balance_groups(void *arg)
{
	int timo;
	int i;

	mtx_lock_spin(&sched_lock);
	if (smp_started)
		for (i = 0; i <= ksg_maxid; i++)
			sched_balance_group(KSEQ_GROUP(i));
	mtx_unlock_spin(&sched_lock);
	timo = random() % (hz * 2);
	callout_reset(&kseq_group_callout, timo, sched_balance_groups, NULL);
}

static void
sched_balance_group(struct kseq_group *ksg)
{
	struct kseq *kseq;
	struct kseq *high;
	struct kseq *low;
	int load;

	if (ksg->ksg_transferable == 0)
		return;
	low = NULL;
	high = NULL;
	LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
		load = kseq->ksq_load;
		if (kseq == KSEQ_CPU(0))
			load--;
		if (high == NULL || load > high->ksq_load)
			high = kseq;
		if (low == NULL || load < low->ksq_load)
			low = kseq;
	}
	if (high != NULL && low != NULL && high != low)
		sched_balance_pair(high, low);
}

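/*
 * Illustrative example (not part of the original source): if the busiest
 * queue reports a load of 7 and the least loaded a load of 2, the
 * difference of 5 below rounds up to a transfer of 3 KSEs, clamped by the
 * transferable count, leaving the pair at loads of 4 and 5.
 */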
static void
sched_balance_pair(struct kseq *high, struct kseq *low)
{
	int transferable;
	int high_load;
	int low_load;
	int move;
	int diff;
	int i;

	/*
	 * If we're transferring within a group we have to use this specific
	 * kseq's transferable count, otherwise we can steal from other members
	 * of the group.
	 */
	if (high->ksq_group == low->ksq_group) {
		transferable = high->ksq_transferable;
		high_load = high->ksq_load;
		low_load = low->ksq_load;
		/*
		 * XXX If we encounter cpu 0 we must remember to reduce its
		 * load by 1 to reflect the swi that is running the callout.
		 * At some point we should really fix load balancing of the
		 * swi and then this won't matter.
		 */
		if (high == KSEQ_CPU(0))
			high_load--;
		if (low == KSEQ_CPU(0))
			low_load--;
	} else {
		transferable = high->ksq_group->ksg_transferable;
		high_load = high->ksq_group->ksg_load;
		low_load = low->ksq_group->ksg_load;
	}
	if (transferable == 0)
		return;
	/*
	 * Determine what the imbalance is and then adjust that to how many
	 * kses we actually have to give up (transferable).
	 */
	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;
	move = min(move, transferable);
	for (i = 0; i < move; i++)
		kseq_move(high, KSEQ_ID(low));
	return;
}

static void
kseq_move(struct kseq *from, int cpu)
{
	struct kseq *kseq;
	struct kseq *to;
	struct kse *ke;

	kseq = from;
	to = KSEQ_CPU(cpu);
	ke = kseq_steal(kseq, 1);
	if (ke == NULL) {
		struct kseq_group *ksg;

		ksg = kseq->ksq_group;
		LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
			if (kseq == from || kseq->ksq_transferable == 0)
				continue;
			ke = kseq_steal(kseq, 1);
			break;
		}
		if (ke == NULL)
			panic("kseq_move: No KSEs available with a "
			    "transferable count of %d\n",
			    ksg->ksg_transferable);
	}
	if (kseq == to)
		return;
	ke->ke_state = KES_THREAD;
	kseq_runq_rem(kseq, ke);
	kseq_load_rem(kseq, ke);
	kseq_notify(ke, cpu);
}

static int
kseq_idled(struct kseq *kseq)
{
	struct kseq_group *ksg;
	struct kseq *steal;
	struct kse *ke;

	ksg = kseq->ksq_group;
	/*
	 * If we're in a cpu group, try to steal kses from another cpu in
	 * the group before idling.
	 */
	if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) {
		LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) {
			if (steal == kseq || steal->ksq_transferable == 0)
				continue;
			ke = kseq_steal(steal, 0);
			if (ke == NULL)
				continue;
			ke->ke_state = KES_THREAD;
			kseq_runq_rem(steal, ke);
			kseq_load_rem(steal, ke);
			ke->ke_cpu = PCPU_GET(cpuid);
			sched_add(ke->ke_thread);
			return (0);
		}
	}
	/*
	 * We only set the idled bit when all of the cpus in the group are
	 * idle.  Otherwise we could get into a situation where a KSE bounces
	 * back and forth between two idle cores on separate physical CPUs.
	 */
	ksg->ksg_idlemask |= PCPU_GET(cpumask);
	if (ksg->ksg_idlemask != ksg->ksg_cpumask)
		return (1);
	atomic_set_int(&kseq_idle, ksg->ksg_mask);
	return (1);
}

static void
kseq_assign(struct kseq *kseq)
{
	struct kse *nke;
	struct kse *ke;

	do {
		(volatile struct kse *)ke = kseq->ksq_assigned;
	} while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL));
	for (; ke != NULL; ke = nke) {
		nke = ke->ke_assign;
		ke->ke_flags &= ~KEF_ASSIGNED;
		sched_add(ke->ke_thread);
	}
}

static void
kseq_notify(struct kse *ke, int cpu)
{
	struct kseq *kseq;
	struct thread *td;
	struct pcpu *pcpu;

	ke->ke_cpu = cpu;
	ke->ke_flags |= KEF_ASSIGNED;

	kseq = KSEQ_CPU(cpu);

	/*
	 * Place a KSE on another cpu's queue and force a resched.
	 */
	do {
		(volatile struct kse *)ke->ke_assign = kseq->ksq_assigned;
	} while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke));
	pcpu = pcpu_find(cpu);
	td = pcpu->pc_curthread;
	if (ke->ke_thread->td_priority < td->td_priority ||
	    td == pcpu->pc_idlethread) {
		td->td_flags |= TDF_NEEDRESCHED;
		ipi_selected(1 << cpu, IPI_AST);
	}
}

static struct kse *
runq_steal(struct runq *rq)
{
	struct rqhead *rqh;
	struct rqbits *rqb;
	struct kse *ke;
	int word;
	int bit;

	mtx_assert(&sched_lock, MA_OWNED);
	rqb = &rq->rq_status;
	for (word = 0; word < RQB_LEN; word++) {
		if (rqb->rqb_bits[word] == 0)
			continue;
		for (bit = 0; bit < RQB_BPW; bit++) {
			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
				continue;
			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
			TAILQ_FOREACH(ke, rqh, ke_procq) {
				if (KSE_CAN_MIGRATE(ke,
				    PRI_BASE(ke->ke_ksegrp->kg_pri_class)))
					return (ke);
			}
		}
	}
	return (NULL);
}

static struct kse *
kseq_steal(struct kseq *kseq, int stealidle)
{
	struct kse *ke;

	/*
	 * Steal from next first to try to get a non-interactive task that
	 * may not have run for a while.
	 */
	if ((ke = runq_steal(kseq->ksq_next)) != NULL)
		return (ke);
	if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
		return (ke);
	if (stealidle)
		return (runq_steal(&kseq->ksq_idle));
	return (NULL);
}

int
kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
{
	struct kseq_group *ksg;
	int cpu;

	if (smp_started == 0)
		return (0);
	cpu = 0;
	ksg = kseq->ksq_group;

	/*
	 * If there are any idle groups, give them our extra load.  The
	 * threshold at which we start to reassign kses has a large impact
	 * on the overall performance of the system.  Tuned too high and
	 * some CPUs may idle.  Too low and there will be excess migration
	 * and context switches.
	 */
	/*
	 * XXX This ksg_transferable might work better if we were checking
	 * against a global group load.  As it is now, this prevents us from
	 * transferring a thread from a group that is potentially bogged down
	 * with non transferable load.
	 */
	if (ksg->ksg_transferable > ksg->ksg_cpus && kseq_idle) {
		/*
		 * Multiple cpus could find this bit simultaneously
		 * but the race shouldn't be terrible.
		 */
		cpu = ffs(kseq_idle);
		if (cpu)
			atomic_clear_int(&kseq_idle, 1 << (cpu - 1));
	}
	/*
	 * If another cpu in this group has idled, assign a thread over
	 * to them after checking to see if there are idled groups.
	 */
	if (cpu == 0 && kseq->ksq_load > 1 && ksg->ksg_idlemask) {
		cpu = ffs(ksg->ksg_idlemask);
		if (cpu)
			ksg->ksg_idlemask &= ~(1 << (cpu - 1));
	}
	/*
	 * Now that we've found an idle CPU, migrate the thread.
	 */
	if (cpu) {
		cpu--;
		ke->ke_runq = NULL;
		kseq_notify(ke, cpu);
		return (1);
	}
	return (0);
}

#endif	/* SMP */

/*
 * Pick the highest priority task we have and return it.
 */

static struct kse *
kseq_choose(struct kseq *kseq)
{
	struct kse *ke;
	struct runq *swap;

	mtx_assert(&sched_lock, MA_OWNED);
	swap = NULL;

	for (;;) {
		ke = runq_choose(kseq->ksq_curr);
		if (ke == NULL) {
			/*
			 * We already swapped once and didn't get anywhere.
			 */
			if (swap)
				break;
			swap = kseq->ksq_curr;
			kseq->ksq_curr = kseq->ksq_next;
			kseq->ksq_next = swap;
			continue;
		}
		/*
		 * If we encounter a slice of 0 the kse is in a
		 * TIMESHARE kse group and its nice was too far out
		 * of the range that receives slices.
		 */
		if (ke->ke_slice == 0) {
			runq_remove(ke->ke_runq, ke);
			sched_slice(ke);
			ke->ke_runq = kseq->ksq_next;
			runq_add(ke->ke_runq, ke);
			continue;
		}
		return (ke);
	}

	return (runq_choose(&kseq->ksq_idle));
}

static void
kseq_setup(struct kseq *kseq)
{
	runq_init(&kseq->ksq_timeshare[0]);
	runq_init(&kseq->ksq_timeshare[1]);
	runq_init(&kseq->ksq_idle);
	kseq->ksq_curr = &kseq->ksq_timeshare[0];
	kseq->ksq_next = &kseq->ksq_timeshare[1];
	kseq->ksq_load = 0;
	kseq->ksq_load_timeshare = 0;
}

static void
sched_setup(void *dummy)
{
#ifdef SMP
	int balance_groups;
	int i;
#endif

	slice_min = (hz/100);	/* 10ms */
	slice_max = (hz/7);	/* ~140ms */

#ifdef SMP
	balance_groups = 0;
	/*
	 * Initialize the kseqs.
	 */
	for (i = 0; i < MAXCPU; i++) {
		struct kseq *ksq;

		ksq = &kseq_cpu[i];
		ksq->ksq_assigned = NULL;
		kseq_setup(&kseq_cpu[i]);
	}
	if (smp_topology == NULL) {
		struct kseq_group *ksg;
		struct kseq *ksq;

		for (i = 0; i < MAXCPU; i++) {
			ksq = &kseq_cpu[i];
			ksg = &kseq_groups[i];
			/*
			 * Setup a kse group with one member.
			 */
			ksq->ksq_transferable = 0;
			ksq->ksq_group = ksg;
			ksg->ksg_cpus = 1;
			ksg->ksg_idlemask = 0;
			ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			LIST_INIT(&ksg->ksg_members);
			LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
		}
	} else {
		struct kseq_group *ksg;
		struct cpu_group *cg;
		int j;

		for (i = 0; i < smp_topology->ct_count; i++) {
			cg = &smp_topology->ct_group[i];
			ksg = &kseq_groups[i];
			/*
			 * Initialize the group.
			 */
			ksg->ksg_idlemask = 0;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			ksg->ksg_cpus = cg->cg_count;
			ksg->ksg_cpumask = cg->cg_mask;
			LIST_INIT(&ksg->ksg_members);
			/*
			 * Find all of the group members and add them.
			 */
			for (j = 0; j < MAXCPU; j++) {
				if ((cg->cg_mask & (1 << j)) != 0) {
					if (ksg->ksg_mask == 0)
						ksg->ksg_mask = 1 << j;
					kseq_cpu[j].ksq_transferable = 0;
					kseq_cpu[j].ksq_group = ksg;
					LIST_INSERT_HEAD(&ksg->ksg_members,
					    &kseq_cpu[j], ksq_siblings);
				}
			}
			if (ksg->ksg_cpus > 1)
				balance_groups = 1;
		}
		ksg_maxid = smp_topology->ct_count - 1;
	}
	callout_init(&kseq_lb_callout, CALLOUT_MPSAFE);
	callout_init(&kseq_group_callout, CALLOUT_MPSAFE);
	sched_balance(NULL);
	/*
	 * Stagger the group and global load balancer so they do not
	 * interfere with each other.
	 */
	if (balance_groups)
		callout_reset(&kseq_group_callout, hz / 2,
		    sched_balance_groups, NULL);
#else
	kseq_setup(KSEQ_SELF());
#endif
	mtx_lock_spin(&sched_lock);
	kseq_load_add(KSEQ_SELF(), &kse0);
	mtx_unlock_spin(&sched_lock);
}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct ksegrp *kg)
{
	int pri;

	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;

	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
	pri += SCHED_PRI_BASE;
	pri += kg->kg_nice;

	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	kg->kg_user_pri = pri;

	return;
}

/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 */
static void
sched_slice(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;
	kseq = KSEQ_CPU(ke->ke_cpu);

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get the minimum slice so that we
	 * quickly notice if it abuses its advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the ksegs nice value relative to the least nice kseg
	 * on the run queue for this cpu.
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse distance from the last nice ksegrp.
	 *
	 * If the kse is outside of the window it will get no slice
	 * and will be reevaluated each time it is selected on the
	 * run queue.  The exception to this is nice 0 ksegs when
	 * a nice -20 is running.  They are always granted a minimum
	 * slice.
	 */
	if (!SCHED_INTERACTIVE(kg)) {
		int nice;

		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
		if (kseq->ksq_load_timeshare == 0 ||
		    kg->kg_nice < kseq->ksq_nicemin)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_SLICE_NTHRESH)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else if (kg->kg_nice == 0)
			ke->ke_slice = SCHED_SLICE_MIN;
		else
			ke->ke_slice = 0;
	} else
		ke->ke_slice = SCHED_SLICE_INTERACTIVE;

	CTR6(KTR_ULE,
	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
	    kseq->ksq_load_timeshare, SCHED_INTERACTIVE(kg));

	return;
}

/*
 * This routine enforces a maximum limit on the amount of scheduling history
 * kept.  It is called after either the slptime or runtime is adjusted.
 * This routine will not operate correctly when slp or run times have been
 * adjusted to more than double their maximum.
 */
static void
sched_interact_update(struct ksegrp *kg)
{
	int sum;

	sum = kg->kg_runtime + kg->kg_slptime;
	if (sum < SCHED_SLP_RUN_MAX)
		return;
	/*
	 * If we have exceeded by more than 1/5th then the algorithm below
	 * will not bring us back into range.  Dividing by two here forces
	 * us into the range of [3/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX]
	 */
	if (sum > (SCHED_INTERACT_MAX / 5) * 6) {
		kg->kg_runtime /= 2;
		kg->kg_slptime /= 2;
		return;
	}
	kg->kg_runtime = (kg->kg_runtime / 5) * 4;
	kg->kg_slptime = (kg->kg_slptime / 5) * 4;
}

static void
sched_interact_fork(struct ksegrp *kg)
{
	int ratio;
	int sum;

	sum = kg->kg_runtime + kg->kg_slptime;
	if (sum > SCHED_SLP_RUN_FORK) {
		ratio = sum / SCHED_SLP_RUN_FORK;
		kg->kg_runtime /= ratio;
		kg->kg_slptime /= ratio;
	}
}

static int
sched_interact_score(struct ksegrp *kg)
{
	int div;

	if (kg->kg_runtime > kg->kg_slptime) {
		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
	} if (kg->kg_slptime > kg->kg_runtime) {
		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
		return (kg->kg_runtime / div);
	}

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);

}
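/*
 * Illustrative example (not part of the original source): with
 * kg_slptime = 3000 and kg_runtime = 1000 (any common unit), the sleepy
 * branch above computes div = max(1, 3000 / 50) = 60 and returns
 * 1000 / 60 = 16, which is below SCHED_INTERACT_THRESH (30), so the kseg
 * is considered interactive.  With the ratio reversed (kg_runtime = 3000,
 * kg_slptime = 1000) the busy branch returns 50 + (50 - 1000 / 60) = 84,
 * well above the threshold.
 */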

/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most SCHED_SLICE_MAX.
 */
int
sched_rr_interval(void)
{
	return (SCHED_SLICE_MAX);
}

static void
sched_pctcpu_update(struct kse *ke)
{
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */
	if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
		/*
		 * Shift the tick count out so that the divide doesn't
		 * round away our results.
		 */
		ke->ke_ticks <<= 10;
		ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
			    SCHED_CPU_TICKS;
		ke->ke_ticks >>= 10;
	} else
		ke->ke_ticks = 0;
	ke->ke_ltick = ticks;
	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}

void
sched_prio(struct thread *td, u_char prio)
{
	struct kse *ke;

	ke = td->td_kse;
	mtx_assert(&sched_lock, MA_OWNED);
	if (TD_ON_RUNQ(td)) {
		/*
		 * If the priority has been elevated due to priority
		 * propagation, we may have to move ourselves to a new
		 * queue.  We still call adjustrunqueue below in case kse
		 * needs to fix things up.
		 */
		if (prio < td->td_priority && ke &&
		    (ke->ke_flags & KEF_ASSIGNED) == 0 &&
		    ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
			runq_remove(ke->ke_runq, ke);
			ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
			runq_add(ke->ke_runq, ke);
		}
		adjustrunqueue(td, prio);
	} else
		td->td_priority = prio;
}

void
sched_switch(struct thread *td)
{
	struct thread *newtd;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;

	td->td_last_kse = ke;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;

	/*
	 * If the KSE has been assigned it may be in the process of switching
	 * to the new cpu.  This is the case in sched_bind().
	 */
	if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
		if (TD_IS_RUNNING(td)) {
			if (td->td_proc->p_flag & P_SA) {
				kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
				setrunqueue(td);
			} else
				kseq_runq_add(KSEQ_SELF(), ke);
		} else {
			if (ke->ke_runq)
				kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
			/*
			 * We will not be on the run queue.  So we must be
			 * sleeping or similar.
			 */
			if (td->td_proc->p_flag & P_SA)
				kse_reassign(ke);
		}
	}
	newtd = choosethread();
	if (td != newtd)
		cpu_switch(td, newtd);
	sched_lock.mtx_lock = (uintptr_t)td;

	td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_nice(struct ksegrp *kg, int nice)
{
	struct kse *ke;
	struct thread *td;
	struct kseq *kseq;

	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * We need to adjust the nice counts for running KSEs.
	 */
	if (kg->kg_pri_class == PRI_TIMESHARE)
		FOREACH_KSE_IN_GROUP(kg, ke) {
			if (ke->ke_runq == NULL)
				continue;
			kseq = KSEQ_CPU(ke->ke_cpu);
			kseq_nice_rem(kseq, kg->kg_nice);
			kseq_nice_add(kseq, nice);
		}
	kg->kg_nice = nice;
	sched_priority(kg);
	FOREACH_THREAD_IN_GROUP(kg, td)
		td->td_flags |= TDF_NEEDRESCHED;
}

void
sched_sleep(struct thread *td, u_char prio)
{
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_slptime = ticks;
	td->td_priority = prio;

	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
	    td->td_kse, td->td_slptime);
}

void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * Let the kseg know how long we slept for.  This is because process
	 * interactivity behavior is modeled in the kseg.
	 */
	if (td->td_slptime) {
		struct ksegrp *kg;
		int hzticks;

		kg = td->td_ksegrp;
		hzticks = (ticks - td->td_slptime) << 10;
		if (hzticks >= SCHED_SLP_RUN_MAX) {
			kg->kg_slptime = SCHED_SLP_RUN_MAX;
			kg->kg_runtime = 1;
		} else {
			kg->kg_slptime += hzticks;
			sched_interact_update(kg);
		}
		sched_priority(kg);
		if (td->td_kse)
			sched_slice(td->td_kse);
		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
		    td->td_kse, hzticks);
		td->td_slptime = 0;
	}
	setrunqueue(td);
}

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct proc *p, struct proc *p1)
{

	mtx_assert(&sched_lock, MA_OWNED);

	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
}

void
sched_fork_kse(struct kse *ke, struct kse *child)
{

	child->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
	child->ke_cpu = ke->ke_cpu;
	child->ke_runq = NULL;

	/* Grab our parent's cpu estimation information. */
	child->ke_ticks = ke->ke_ticks;
	child->ke_ltick = ke->ke_ltick;
	child->ke_ftick = ke->ke_ftick;
}

void
sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);

	child->kg_slptime = kg->kg_slptime;
	child->kg_runtime = kg->kg_runtime;
	child->kg_user_pri = kg->kg_user_pri;
	child->kg_nice = kg->kg_nice;
	sched_interact_fork(child);
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	CTR6(KTR_ULE, "sched_fork_ksegrp: %d(%d, %d) - %d(%d, %d)",
	    kg->kg_proc->p_pid, kg->kg_slptime, kg->kg_runtime,
	    child->kg_proc->p_pid, child->kg_slptime, child->kg_runtime);
}

void
sched_fork_thread(struct thread *td, struct thread *child)
{
}

void
sched_class(struct ksegrp *kg, int class)
{
	struct kseq *kseq;
	struct kse *ke;
	int nclass;
	int oclass;

	mtx_assert(&sched_lock, MA_OWNED);
	if (kg->kg_pri_class == class)
		return;

	nclass = PRI_BASE(class);
	oclass = PRI_BASE(kg->kg_pri_class);
	FOREACH_KSE_IN_GROUP(kg, ke) {
		if (ke->ke_state != KES_ONRUNQ &&
		    ke->ke_state != KES_THREAD)
			continue;
		kseq = KSEQ_CPU(ke->ke_cpu);

#ifdef SMP
		/*
		 * On SMP if we're on the RUNQ we must adjust the transferable
		 * count because it could be changing to or from an interrupt
		 * class.
		 */
		if (ke->ke_state == KES_ONRUNQ) {
			if (KSE_CAN_MIGRATE(ke, oclass)) {
				kseq->ksq_transferable--;
				kseq->ksq_group->ksg_transferable--;
			}
			if (KSE_CAN_MIGRATE(ke, nclass)) {
				kseq->ksq_transferable++;
				kseq->ksq_group->ksg_transferable++;
			}
		}
#endif
		if (oclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare--;
			kseq_nice_rem(kseq, kg->kg_nice);
		}
		if (nclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare++;
			kseq_nice_add(kseq, kg->kg_nice);
		}
	}

	kg->kg_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct proc *child)
{
	mtx_assert(&sched_lock, MA_OWNED);
	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child));
}

void
sched_exit_kse(struct kse *ke, struct kse *child)
{
	kseq_load_rem(KSEQ_CPU(child->ke_cpu), child);
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
	/* kg->kg_slptime += child->kg_slptime; */
	kg->kg_runtime += child->kg_runtime;
	sched_interact_update(kg);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{
}

void
sched_clock(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;

	/*
	 * sched_setup() apparently happens prior to stathz being set.  We
	 * need to resolve the timers earlier in the boot so we can avoid
	 * calculating this here.
	 */
	if (realstathz == 0) {
		realstathz = stathz ? stathz : hz;
		tickincr = hz / realstathz;
		/*
		 * XXX This does not work for values of stathz that are much
		 * larger than hz.
		 */
		if (tickincr == 0)
			tickincr = 1;
	}

	ke = td->td_kse;
	kg = ke->ke_ksegrp;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((td != NULL), ("schedclock: null thread pointer"));

	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;

	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	if (td->td_flags & TDF_IDLETD)
		return;

	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
	/*
	 * We only do slicing code for TIMESHARE ksegrps.
	 */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * We used a tick; charge it to the ksegrp so that we can compute our
	 * interactivity.
	 */
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	/*
	 * We used up one time slice.
	 */
	if (--ke->ke_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
void
sched_clock(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;

	/*
	 * sched_setup() apparently happens prior to stathz being set.  We
	 * need to resolve the timers earlier in the boot so we can avoid
	 * calculating this here.
	 */
	if (realstathz == 0) {
		realstathz = stathz ? stathz : hz;
		tickincr = hz / realstathz;
		/*
		 * XXX This does not work for values of stathz that are much
		 * larger than hz.
		 */
		if (tickincr == 0)
			tickincr = 1;
	}

	ke = td->td_kse;
	kg = ke->ke_ksegrp;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((td != NULL), ("schedclock: null thread pointer"));

	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;

	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	if (td->td_flags & TDF_IDLETD)
		return;

	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
	/*
	 * We only do slicing code for TIMESHARE ksegrps.
	 */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * We used a tick; charge it to the ksegrp so that we can compute our
	 * interactivity.
	 */
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	/*
	 * We used up one time slice.
	 */
	if (--ke->ke_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
	 */
	kseq = KSEQ_SELF();
	kseq_load_rem(kseq, ke);
	sched_priority(kg);
	sched_slice(ke);
	if (SCHED_CURR(kg, ke))
		ke->ke_runq = kseq->ksq_curr;
	else
		ke->ke_runq = kseq->ksq_next;
	kseq_load_add(kseq, ke);
	td->td_flags |= TDF_NEEDRESCHED;
}

int
sched_runnable(void)
{
	struct kseq *kseq;
	int load;

	load = 1;

	kseq = KSEQ_SELF();
#ifdef SMP
	if (kseq->ksq_assigned) {
		mtx_lock_spin(&sched_lock);
		kseq_assign(kseq);
		mtx_unlock_spin(&sched_lock);
	}
#endif
	if ((curthread->td_flags & TDF_IDLETD) != 0) {
		if (kseq->ksq_load > 0)
			goto out;
	} else
		if (kseq->ksq_load - 1 > 0)
			goto out;
	load = 0;
out:
	return (load);
}

void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;

	kg = td->td_ksegrp;

	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		mtx_unlock_spin(&sched_lock);
	}
}
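
/*
 * Select the next KSE to run on this CPU.  Any KSEs that other CPUs have
 * assigned to us are drained first, then the local queues are consulted.
 * On SMP, if nothing is runnable, or only an idle-class KSE is, kseq_idled()
 * is given a chance to find work elsewhere before we settle for what we
 * have.
 */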
struct kse *
sched_choose(void)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	kseq = KSEQ_SELF();
#ifdef SMP
restart:
	if (kseq->ksq_assigned)
		kseq_assign(kseq);
#endif
	ke = kseq_choose(kseq);
	if (ke) {
#ifdef SMP
		if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
			if (kseq_idled(kseq) == 0)
				goto restart;
#endif
		kseq_runq_rem(kseq, ke);
		ke->ke_state = KES_THREAD;

		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
			    ke, ke->ke_runq, ke->ke_slice,
			    ke->ke_thread->td_priority);
		}
		return (ke);
	}
#ifdef SMP
	if (kseq_idled(kseq) == 0)
		goto restart;
#endif
	return (NULL);
}

void
sched_add(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;
	int class;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	kg = td->td_ksegrp;
	if (ke->ke_flags & KEF_ASSIGNED)
		return;
	kseq = KSEQ_SELF();
	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
	KASSERT((ke->ke_thread->td_kse != NULL),
	    ("sched_add: No KSE on thread"));
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	KASSERT(ke->ke_runq == NULL,
	    ("sched_add: KSE %p is still assigned to a run queue", ke));

	class = PRI_BASE(kg->kg_pri_class);
	switch (class) {
	case PRI_ITHD:
	case PRI_REALTIME:
		ke->ke_runq = kseq->ksq_curr;
		ke->ke_slice = SCHED_SLICE_MAX;
		ke->ke_cpu = PCPU_GET(cpuid);
		break;
	case PRI_TIMESHARE:
		if (SCHED_CURR(kg, ke))
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = kseq->ksq_next;
		break;
	case PRI_IDLE:
		/*
		 * This is for priority propagation.
		 */
		if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = &kseq->ksq_idle;
		ke->ke_slice = SCHED_SLICE_MIN;
		break;
	default:
		panic("Unknown pri class.");
		break;
	}
#ifdef SMP
	if (ke->ke_cpu != PCPU_GET(cpuid)) {
		ke->ke_runq = NULL;
		kseq_notify(ke, ke->ke_cpu);
		return;
	}
	/*
	 * If we had been idle, clear our bit in the group and potentially
	 * the global bitmap.  If not, see if we should transfer this thread.
	 */
	if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
	    (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
		/*
		 * Check to see if our group is unidling, and if so, remove it
		 * from the global idle mask.
		 */
		if (kseq->ksq_group->ksg_idlemask ==
		    kseq->ksq_group->ksg_cpumask)
			atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
		/*
		 * Now remove ourselves from the group specific idle mask.
		 */
		kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
	} else if (kseq->ksq_load > 1 && KSE_CAN_MIGRATE(ke, class))
		if (kseq_transfer(kseq, ke, class))
			return;
#endif
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;

	ke->ke_ksegrp->kg_runq_kses++;
	ke->ke_state = KES_ONRUNQ;

	kseq_runq_add(kseq, ke);
	kseq_load_add(kseq, ke);
}

void
sched_rem(struct thread *td)
{
	struct kseq *kseq;
	struct kse *ke;

	ke = td->td_kse;
	/*
	 * It is safe to just return here because sched_rem() is only ever
	 * used in places where we're immediately going to add the
	 * kse back on again.  In that case it'll be added with the correct
	 * thread and priority when the caller drops the sched_lock.
	 */
	if (ke->ke_flags & KEF_ASSIGNED)
		return;
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));

	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq = KSEQ_CPU(ke->ke_cpu);
	kseq_runq_rem(kseq, ke);
	kseq_load_rem(kseq, ke);
}
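
/*
 * Report the recent %CPU usage of a thread as a fixpt_t fraction.  The math
 * below is ordinary fixed point: rtick is roughly the number of stat ticks
 * this KSE received per second over the sampling window, and the result is
 * approximately FSCALE * rtick / realstathz.
 *
 * Worked example (assuming the usual FSHIFT of 11, so FSCALE = 2048): with
 * realstathz = 128 and rtick = 64,
 *	(2048 * ((2048 * 64) / 128)) >> 11 = 1024 = FSCALE / 2,
 * i.e. 50% CPU.
 */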
fixpt_t
sched_pctcpu(struct thread *td)
{
	fixpt_t pctcpu;
	struct kse *ke;

	pctcpu = 0;
	ke = td->td_kse;
	if (ke == NULL)
		return (0);

	mtx_lock_spin(&sched_lock);
	if (ke->ke_ticks) {
		int rtick;

		/*
		 * Don't update more frequently than twice a second.  Allowing
		 * this causes the cpu usage to decay away too quickly due to
		 * rounding errors.
		 */
		if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
		    ke->ke_ltick < (ticks - (hz / 2)))
			sched_pctcpu_update(ke);
		/* How many rticks per second? */
		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
	}

	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
	mtx_unlock_spin(&sched_lock);

	return (pctcpu);
}

void
sched_bind(struct thread *td, int cpu)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	ke->ke_flags |= KEF_BOUND;
#ifdef SMP
	if (PCPU_GET(cpuid) == cpu)
		return;
	/* sched_rem without the runq_remove */
	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
	kseq_notify(ke, cpu);
	/* When we return from mi_switch we'll be on the correct cpu. */
	td->td_proc->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
#endif
}

void
sched_unbind(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	td->td_kse->ke_flags &= ~KEF_BOUND;
}

int
sched_sizeof_kse(void)
{
	return (sizeof(struct kse) + sizeof(struct ke_sched));
}

int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}
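
/*
 * Usage sketch for sched_bind()/sched_unbind() above (a hypothetical caller,
 * not part of this file): code that must execute on a specific CPU would do
 * roughly the following, with sched_bind() possibly calling mi_switch() to
 * migrate there:
 *
 *	mtx_lock_spin(&sched_lock);
 *	sched_bind(curthread, cpu);
 *	mtx_unlock_spin(&sched_lock);
 *	... work that must run on 'cpu' ...
 *	mtx_lock_spin(&sched_lock);
 *	sched_unbind(curthread);
 *	mtx_unlock_spin(&sched_lock);
 */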