sched_ule.c revision 123487
/*-
 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 123487 2003-12-12 07:33:51Z jeff $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>
#include <machine/smp.h>

#define	KTR_ULE	KTR_NFS

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");

static int sched_strict;
SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, "");

static int slice_min = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
static int slice_max = 10;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");

int realstathz;
int tickincr = 1;

#ifdef SMP
/* Callouts to handle load balancing SMP systems. */
static struct callout kseq_lb_callout;
static struct callout kseq_group_callout;
#endif

/*
 * These datastructures are allocated within their parent datastructure but
 * are scheduler specific.
 */

struct ke_sched {
	int		ske_slice;
	struct runq	*ske_runq;
	/* The following variables are only used for pctcpu calculation */
	int		ske_ltick;	/* Last tick that we were running on */
	int		ske_ftick;	/* First tick that we were running on */
	int		ske_ticks;	/* Tick count */
	/* CPU that we have affinity for. */
	u_char		ske_cpu;
};
#define	ke_slice	ke_sched->ske_slice
#define	ke_runq		ke_sched->ske_runq
#define	ke_ltick	ke_sched->ske_ltick
#define	ke_ftick	ke_sched->ske_ftick
#define	ke_ticks	ke_sched->ske_ticks
#define	ke_cpu		ke_sched->ske_cpu
#define	ke_assign	ke_procq.tqe_next

#define	KEF_ASSIGNED	KEF_SCHED0	/* KSE is being migrated. */
#define	KEF_BOUND	KEF_SCHED1	/* KSE can not migrate. */

struct kg_sched {
	int	skg_slptime;		/* Number of ticks we vol. slept */
	int	skg_runtime;		/* Number of ticks we were running */
};
#define	kg_slptime	kg_sched->skg_slptime
#define	kg_runtime	kg_sched->skg_runtime

struct td_sched {
	int	std_slptime;
};
#define	td_slptime	td_sched->std_slptime

struct td_sched td_sched;
struct ke_sched ke_sched;
struct kg_sched kg_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;

/*
 * The priority is primarily determined by the interactivity score.  Thus, we
 * give lower(better) priorities to kse groups that use less CPU.  The nice
 * value is then directly added to this to allow nice to have some effect
 * on latency.
 *
 * PRI_RANGE:	Total priority range for timeshare threads.
 * PRI_NRESV:	Number of nice values.
 * PRI_BASE:	The start of the dynamic range.
 */
#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	SCHED_PRI_NRESV		((PRIO_MAX - PRIO_MIN) + 1)
#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
#define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
#define	SCHED_PRI_INTERACT(score)					\
	((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
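/*
 * Illustrative note (not part of the original file): sched_priority() below
 * computes SCHED_PRI_INTERACT(score) + SCHED_PRI_BASE + kg_nice and clamps
 * the result to the timeshare range.  Assuming the timeshare range spans 64
 * priorities and SCHED_INTERACT_MAX is 100, a score of 50 would map to
 * roughly 50 * 64 / 100 = 32 steps above SCHED_PRI_BASE before nice is
 * added; the exact figures depend on the ranges in <sys/priority.h>.
 */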
/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 5) << 10)
#define	SCHED_SLP_RUN_FORK	((hz / 2) << 10)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(30)

/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:	Determines the amount of slice granted to a scaled nice.
 * SLICE_NTHRESH:	The nice cutoff point for slice assignment.
 */
#define	SCHED_SLICE_MIN			(slice_min)
#define	SCHED_SLICE_MAX			(slice_max)
#define	SCHED_SLICE_NTHRESH		(SCHED_PRI_NHALF - 1)
#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_SLICE_NICE(nice)						\
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))

/*
 * This macro determines whether or not the kse belongs on the current or
 * next run queue.
 */
#define	SCHED_INTERACTIVE(kg)						\
    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
#define	SCHED_CURR(kg, ke)						\
    (ke->ke_thread->td_priority != kg->kg_user_pri ||			\
    SCHED_INTERACTIVE(kg))

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */

#define	SCHED_CPU_TIME	10
#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)

/*
 * kseq - per processor runqs and statistics.
 */
struct kseq {
	struct runq	ksq_idle;		/* Queue of IDLE threads. */
	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
	struct runq	*ksq_next;		/* Next timeshare queue. */
	struct runq	*ksq_curr;		/* Current queue. */
	int		ksq_load_timeshare;	/* Load for timeshare. */
	int		ksq_load;		/* Aggregate load. */
	short		ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
	short		ksq_nicemin;		/* Least nice. */
#ifdef SMP
	int		ksq_transferable;
	LIST_ENTRY(kseq) ksq_siblings;	/* Next in kseq group. */
	struct kseq_group *ksq_group;	/* Our processor group. */
	volatile struct kse *ksq_assigned;	/* assigned by another CPU. */
#endif
};

#ifdef SMP
/*
 * kseq groups are groups of processors which can cheaply share threads.  When
 * one processor in the group goes idle it will check the runqs of the other
 * processors in its group prior to halting and waiting for an interrupt.
 * These groups are suitable for SMT (Symmetric Multi-Threading) and not NUMA.
 * In a NUMA environment we'd want an idle bitmap per group and a two tiered
 * load balancer.
 */
struct kseq_group {
	int	ksg_cpus;		/* Count of CPUs in this kseq group. */
	int	ksg_cpumask;		/* Mask of cpus in this group. */
	int	ksg_idlemask;		/* Idle cpus in this group. */
	int	ksg_mask;		/* Bit mask for first cpu. */
	int	ksg_load;		/* Total load of this group. */
	int	ksg_transferable;	/* Transferable load of this group. */
	LIST_HEAD(, kseq) ksg_members;	/* Linked list of all members. */
};
#endif

/*
 * One kse queue per processor.
 */
#ifdef SMP
static int kseq_idle;
static int ksg_maxid;
static struct kseq	kseq_cpu[MAXCPU];
static struct kseq_group kseq_groups[MAXCPU];
#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
#define	KSEQ_ID(x)	((x) - kseq_cpu)
#define	KSEQ_GROUP(x)	(&kseq_groups[(x)])
#else	/* !SMP */
static struct kseq	kseq_cpu;
#define	KSEQ_SELF()	(&kseq_cpu)
#define	KSEQ_CPU(x)	(&kseq_cpu)
#endif

static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
static void sched_interact_update(struct ksegrp *kg);
static void sched_interact_fork(struct ksegrp *kg);
static void sched_pctcpu_update(struct kse *ke);

/* Operations on per processor queues */
static struct kse * kseq_choose(struct kseq *kseq);
static void kseq_setup(struct kseq *kseq);
static void kseq_load_add(struct kseq *kseq, struct kse *ke);
static void kseq_load_rem(struct kseq *kseq, struct kse *ke);
static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke);
static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke);
static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class);
static struct kse *runq_steal(struct runq *rq);
static void sched_balance(void *arg);
static void sched_balance_group(struct kseq_group *ksg);
static void sched_balance_pair(struct kseq *high, struct kseq *low);
static void kseq_move(struct kseq *from, int cpu);
static int kseq_idled(struct kseq *kseq);
static void kseq_notify(struct kse *ke, int cpu);
static void kseq_assign(struct kseq *);
static struct kse *kseq_steal(struct kseq *kseq, int stealidle);
#define	KSE_CAN_MIGRATE(ke, class)					\
    ((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 &&		\
    ((ke)->ke_flags & KEF_BOUND) == 0)
#endif

void
kseq_print(int cpu)
{
	struct kseq *kseq;
	int i;

	kseq = KSEQ_CPU(cpu);

	printf("kseq:\n");
	printf("\tload: %d\n", kseq->ksq_load);
	printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare);
#ifdef SMP
	printf("\tload transferable: %d\n", kseq->ksq_transferable);
#endif
	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
	printf("\tnice counts:\n");
	for (i = 0; i < SCHED_PRI_NRESV; i++)
		if (kseq->ksq_nice[i])
			printf("\t\t%d = %d\n",
			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
}

static __inline void
kseq_runq_add(struct kseq *kseq, struct kse *ke)
{
#ifdef SMP
	if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) {
		kseq->ksq_transferable++;
		kseq->ksq_group->ksg_transferable++;
	}
#endif
	runq_add(ke->ke_runq, ke);
}

static __inline void
kseq_runq_rem(struct kseq *kseq, struct kse *ke)
{
#ifdef SMP
	if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) {
		kseq->ksq_transferable--;
		kseq->ksq_group->ksg_transferable--;
	}
#endif
	runq_remove(ke->ke_runq, ke);
}

static void
kseq_load_add(struct kseq *kseq, struct kse *ke)
{
	int class;

	mtx_assert(&sched_lock, MA_OWNED);
	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
	if (class == PRI_TIMESHARE)
		kseq->ksq_load_timeshare++;
	kseq->ksq_load++;
#ifdef SMP
	if (class != PRI_ITHD)
		kseq->ksq_group->ksg_load++;
#endif
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		CTR6(KTR_ULE,
		    "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
		    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
		    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
}

static void
kseq_load_rem(struct kseq *kseq, struct kse *ke)
{
	int class;

	mtx_assert(&sched_lock, MA_OWNED);
	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
	if (class == PRI_TIMESHARE)
		kseq->ksq_load_timeshare--;
#ifdef SMP
	if (class != PRI_ITHD)
		kseq->ksq_group->ksg_load--;
#endif
	kseq->ksq_load--;
	ke->ke_runq = NULL;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
}
static void
kseq_nice_add(struct kseq *kseq, int nice)
{
	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
	if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1)
		kseq->ksq_nicemin = nice;
}

static void
kseq_nice_rem(struct kseq *kseq, int nice)
{
	int n;

	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	n = nice + SCHED_PRI_NHALF;
	kseq->ksq_nice[n]--;
	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));

	/*
	 * If this wasn't the smallest nice value or there are more in
	 * this bucket we can just return.  Otherwise we have to recalculate
	 * the smallest nice.
	 */
	if (nice != kseq->ksq_nicemin ||
	    kseq->ksq_nice[n] != 0 ||
	    kseq->ksq_load_timeshare == 0)
		return;

	for (; n < SCHED_PRI_NRESV; n++)
		if (kseq->ksq_nice[n]) {
			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
			return;
		}
}

#ifdef SMP
/*
 * sched_balance is a simple CPU load balancing algorithm.  It operates by
 * finding the least loaded and most loaded cpu and equalizing their load
 * by migrating some processes.
 *
 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 * installations will only have 2 cpus.  Secondly, load balancing too much at
 * once can have an unpleasant effect on the system.  The scheduler rarely has
 * enough information to make perfect decisions.  So this algorithm chooses
 * simplicity and more gradual effects on load in larger systems.
 *
 * It could be improved by considering the priorities and slices assigned to
 * each task prior to balancing them.  There are many pathological cases with
 * any approach and so the semi random algorithm below may work as well as any.
 *
 */
static void
sched_balance(void *arg)
{
	struct kseq_group *high;
	struct kseq_group *low;
	struct kseq_group *ksg;
	int timo;
	int cnt;
	int i;

	mtx_lock_spin(&sched_lock);
	if (smp_started == 0)
		goto out;
	low = high = NULL;
	i = random() % (ksg_maxid + 1);
	for (cnt = 0; cnt <= ksg_maxid; cnt++) {
		ksg = KSEQ_GROUP(i);
		/*
		 * Find the CPU with the highest load that has some
		 * threads to transfer.
		 */
		if ((high == NULL || ksg->ksg_load > high->ksg_load)
		    && ksg->ksg_transferable)
			high = ksg;
		if (low == NULL || ksg->ksg_load < low->ksg_load)
			low = ksg;
		if (++i > ksg_maxid)
			i = 0;
	}
	if (low != NULL && high != NULL && high != low)
		sched_balance_pair(LIST_FIRST(&high->ksg_members),
		    LIST_FIRST(&low->ksg_members));
out:
	mtx_unlock_spin(&sched_lock);
	timo = random() % (hz * 2);
	callout_reset(&kseq_lb_callout, timo, sched_balance, NULL);
}

static void
sched_balance_groups(void *arg)
{
	int timo;
	int i;

	mtx_lock_spin(&sched_lock);
	if (smp_started)
		for (i = 0; i <= ksg_maxid; i++)
			sched_balance_group(KSEQ_GROUP(i));
	mtx_unlock_spin(&sched_lock);
	timo = random() % (hz * 2);
	callout_reset(&kseq_group_callout, timo, sched_balance_groups, NULL);
}

static void
sched_balance_group(struct kseq_group *ksg)
{
	struct kseq *kseq;
	struct kseq *high;
	struct kseq *low;
	int load;

	if (ksg->ksg_transferable == 0)
		return;
	low = NULL;
	high = NULL;
	LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
		load = kseq->ksq_load;
		if (kseq == KSEQ_CPU(0))
			load--;
		if (high == NULL || load > high->ksq_load)
			high = kseq;
		if (low == NULL || load < low->ksq_load)
			low = kseq;
	}
	if (high != NULL && low != NULL && high != low)
		sched_balance_pair(high, low);
}

static void
sched_balance_pair(struct kseq *high, struct kseq *low)
{
	int transferable;
	int high_load;
	int low_load;
	int move;
	int diff;
	int i;

	/*
	 * If we're transferring within a group we have to use this specific
	 * kseq's transferable count, otherwise we can steal from other members
	 * of the group.
	 */
	if (high->ksq_group == low->ksq_group) {
		transferable = high->ksq_transferable;
		high_load = high->ksq_load;
		low_load = low->ksq_load;
		/*
		 * XXX If we encounter cpu 0 we must remember to reduce its
		 * load by 1 to reflect the swi that is running the callout.
		 * At some point we should really fix load balancing of the
		 * swi and then this won't matter.
		 */
		if (high == KSEQ_CPU(0))
			high_load--;
		if (low == KSEQ_CPU(0))
			low_load--;
	} else {
		transferable = high->ksq_group->ksg_transferable;
		high_load = high->ksq_group->ksg_load;
		low_load = low->ksq_group->ksg_load;
	}
	if (transferable == 0)
		return;
	/*
	 * Determine what the imbalance is and then adjust that to how many
	 * kses we actually have to give up (transferable).
	 */
	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;
	move = min(move, transferable);
	for (i = 0; i < move; i++)
		kseq_move(high, KSEQ_ID(low));
	return;
}
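/*
 * Illustrative sketch (not in the original source): if the high kseq reports
 * a load of 7 and the low kseq a load of 2, diff is 5, so move rounds up to
 * 3 and is then clamped to the transferable count before kseq_move() is
 * called that many times.  The numbers here are made up purely to show the
 * rounding behaviour of sched_balance_pair() above.
 */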
static void
kseq_move(struct kseq *from, int cpu)
{
	struct kseq *kseq;
	struct kseq *to;
	struct kse *ke;

	kseq = from;
	to = KSEQ_CPU(cpu);
	ke = kseq_steal(kseq, 1);
	if (ke == NULL) {
		struct kseq_group *ksg;

		ksg = kseq->ksq_group;
		LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
			if (kseq == from || kseq->ksq_transferable == 0)
				continue;
			ke = kseq_steal(kseq, 1);
			break;
		}
		if (ke == NULL)
			panic("kseq_move: No KSEs available with a "
			    "transferable count of %d\n",
			    ksg->ksg_transferable);
	}
	if (kseq == to)
		return;
	ke->ke_state = KES_THREAD;
	kseq_runq_rem(kseq, ke);
	kseq_load_rem(kseq, ke);

	ke->ke_cpu = cpu;
	kseq_notify(ke, cpu);
}

static int
kseq_idled(struct kseq *kseq)
{
	struct kseq_group *ksg;
	struct kseq *steal;
	struct kse *ke;

	ksg = kseq->ksq_group;
	/*
	 * If we're in a cpu group, try and steal kses from another cpu in
	 * the group before idling.
	 */
	if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) {
		LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) {
			if (steal == kseq || steal->ksq_transferable == 0)
				continue;
			ke = kseq_steal(steal, 0);
			if (ke == NULL)
				continue;
			ke->ke_state = KES_THREAD;
			kseq_runq_rem(steal, ke);
			kseq_load_rem(steal, ke);
			ke->ke_cpu = PCPU_GET(cpuid);
			sched_add(ke->ke_thread);
			return (0);
		}
	}
	/*
	 * We only set the idled bit when all of the cpus in the group are
	 * idle.  Otherwise we could get into a situation where a KSE bounces
	 * back and forth between two idle cores on separate physical CPUs.
	 */
	ksg->ksg_idlemask |= PCPU_GET(cpumask);
	if (ksg->ksg_idlemask != ksg->ksg_cpumask)
		return (1);
	atomic_set_int(&kseq_idle, ksg->ksg_mask);
	return (1);
}

static void
kseq_assign(struct kseq *kseq)
{
	struct kse *nke;
	struct kse *ke;

	do {
		(volatile struct kse *)ke = kseq->ksq_assigned;
	} while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL));
	for (; ke != NULL; ke = nke) {
		nke = ke->ke_assign;
		ke->ke_flags &= ~KEF_ASSIGNED;
		sched_add(ke->ke_thread);
	}
}

static void
kseq_notify(struct kse *ke, int cpu)
{
	struct kseq *kseq;
	struct thread *td;
	struct pcpu *pcpu;

	ke->ke_flags |= KEF_ASSIGNED;

	kseq = KSEQ_CPU(cpu);

	/*
	 * Place a KSE on another cpu's queue and force a resched.
	 */
	do {
		(volatile struct kse *)ke->ke_assign = kseq->ksq_assigned;
	} while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke));
	pcpu = pcpu_find(cpu);
	td = pcpu->pc_curthread;
	if (ke->ke_thread->td_priority < td->td_priority ||
	    td == pcpu->pc_idlethread) {
		td->td_flags |= TDF_NEEDRESCHED;
		ipi_selected(1 << cpu, IPI_AST);
	}
}

static struct kse *
runq_steal(struct runq *rq)
{
	struct rqhead *rqh;
	struct rqbits *rqb;
	struct kse *ke;
	int word;
	int bit;

	mtx_assert(&sched_lock, MA_OWNED);
	rqb = &rq->rq_status;
	for (word = 0; word < RQB_LEN; word++) {
		if (rqb->rqb_bits[word] == 0)
			continue;
		for (bit = 0; bit < RQB_BPW; bit++) {
			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
				continue;
			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
			TAILQ_FOREACH(ke, rqh, ke_procq) {
				if (KSE_CAN_MIGRATE(ke,
				    PRI_BASE(ke->ke_ksegrp->kg_pri_class)))
					return (ke);
			}
		}
	}
	return (NULL);
}

static struct kse *
kseq_steal(struct kseq *kseq, int stealidle)
{
	struct kse *ke;

	/*
	 * Steal from next first to try to get a non-interactive task that
	 * may not have run for a while.
	 */
	if ((ke = runq_steal(kseq->ksq_next)) != NULL)
		return (ke);
	if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
		return (ke);
	if (stealidle)
		return (runq_steal(&kseq->ksq_idle));
	return (NULL);
}

int
kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
{
	struct kseq_group *ksg;
	int cpu;

	cpu = 0;
	ksg = kseq->ksq_group;

	/*
	 * XXX This ksg_transferable might work better if we were checking
	 * against a global group load.  As it is now, this prevents us from
	 * transferring a thread from a group that is potentially bogged down
	 * with non transferable load.
	 */
	if (ksg->ksg_transferable > ksg->ksg_cpus && kseq_idle) {
		/*
		 * Multiple cpus could find this bit simultaneously
		 * but the race shouldn't be terrible.
		 */
		cpu = ffs(kseq_idle);
		if (cpu)
			atomic_clear_int(&kseq_idle, 1 << (cpu - 1));
	}
	/*
	 * If another cpu in this group has idled, assign a thread over
	 * to them after checking to see if there are idled groups.
	 */
	if (cpu == 0 && kseq->ksq_load > 1 && ksg->ksg_idlemask) {
		cpu = ffs(ksg->ksg_idlemask);
		if (cpu)
			ksg->ksg_idlemask &= ~(1 << (cpu - 1));
	}
	/*
	 * Now that we've found an idle CPU, migrate the thread.
	 */
	if (cpu) {
		cpu--;
		ke->ke_cpu = cpu;
		ke->ke_runq = NULL;
		kseq_notify(ke, cpu);
		return (1);
	}
	return (0);
}

#endif	/* SMP */

/*
 * Pick the highest priority task we have and return it.
 */

static struct kse *
kseq_choose(struct kseq *kseq)
{
	struct kse *ke;
	struct runq *swap;

	mtx_assert(&sched_lock, MA_OWNED);
	swap = NULL;

	for (;;) {
		ke = runq_choose(kseq->ksq_curr);
		if (ke == NULL) {
			/*
			 * We already swapped once and didn't get anywhere.
			 */
			if (swap)
				break;
			swap = kseq->ksq_curr;
			kseq->ksq_curr = kseq->ksq_next;
			kseq->ksq_next = swap;
			continue;
		}
		/*
		 * If we encounter a slice of 0 the kse is in a
		 * TIMESHARE kse group and its nice was too far out
		 * of the range that receives slices.
		 */
		if (ke->ke_slice == 0) {
			runq_remove(ke->ke_runq, ke);
			sched_slice(ke);
			ke->ke_runq = kseq->ksq_next;
			runq_add(ke->ke_runq, ke);
			continue;
		}
		return (ke);
	}

	return (runq_choose(&kseq->ksq_idle));
}

static void
kseq_setup(struct kseq *kseq)
{
	runq_init(&kseq->ksq_timeshare[0]);
	runq_init(&kseq->ksq_timeshare[1]);
	runq_init(&kseq->ksq_idle);
	kseq->ksq_curr = &kseq->ksq_timeshare[0];
	kseq->ksq_next = &kseq->ksq_timeshare[1];
	kseq->ksq_load = 0;
	kseq->ksq_load_timeshare = 0;
}

static void
sched_setup(void *dummy)
{
#ifdef SMP
	int balance_groups;
	int i;
#endif

	slice_min = (hz/100);	/* 10ms */
	slice_max = (hz/7);	/* ~140ms */

#ifdef SMP
	balance_groups = 0;
	/*
	 * Initialize the kseqs.
	 */
	for (i = 0; i < MAXCPU; i++) {
		struct kseq *ksq;

		ksq = &kseq_cpu[i];
		ksq->ksq_assigned = NULL;
		kseq_setup(&kseq_cpu[i]);
	}
	if (smp_topology == NULL) {
		struct kseq_group *ksg;
		struct kseq *ksq;

		for (i = 0; i < MAXCPU; i++) {
			ksq = &kseq_cpu[i];
			ksg = &kseq_groups[i];
			/*
			 * Setup a kse group with one member.
			 */
			ksq->ksq_transferable = 0;
			ksq->ksq_group = ksg;
			ksg->ksg_cpus = 1;
			ksg->ksg_idlemask = 0;
			ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			LIST_INIT(&ksg->ksg_members);
			LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
		}
	} else {
		struct kseq_group *ksg;
		struct cpu_group *cg;
		int j;

		for (i = 0; i < smp_topology->ct_count; i++) {
			cg = &smp_topology->ct_group[i];
			ksg = &kseq_groups[i];
			/*
			 * Initialize the group.
			 */
			ksg->ksg_idlemask = 0;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			ksg->ksg_cpus = cg->cg_count;
			ksg->ksg_cpumask = cg->cg_mask;
			LIST_INIT(&ksg->ksg_members);
			/*
			 * Find all of the group members and add them.
			 */
			for (j = 0; j < MAXCPU; j++) {
				if ((cg->cg_mask & (1 << j)) != 0) {
					if (ksg->ksg_mask == 0)
						ksg->ksg_mask = 1 << j;
					kseq_cpu[j].ksq_transferable = 0;
					kseq_cpu[j].ksq_group = ksg;
					LIST_INSERT_HEAD(&ksg->ksg_members,
					    &kseq_cpu[j], ksq_siblings);
				}
			}
			if (ksg->ksg_cpus > 1)
				balance_groups = 1;
		}
		ksg_maxid = smp_topology->ct_count - 1;
	}
	callout_init(&kseq_lb_callout, CALLOUT_MPSAFE);
	callout_init(&kseq_group_callout, CALLOUT_MPSAFE);
	sched_balance(NULL);
	/*
	 * Stagger the group and global load balancer so they do not
	 * interfere with each other.
	 */
	if (balance_groups)
		callout_reset(&kseq_group_callout, hz / 2,
		    sched_balance_groups, NULL);
#else
	kseq_setup(KSEQ_SELF());
#endif
	mtx_lock_spin(&sched_lock);
	kseq_load_add(KSEQ_SELF(), &kse0);
	mtx_unlock_spin(&sched_lock);
}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct ksegrp *kg)
{
	int pri;

	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;

	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
	pri += SCHED_PRI_BASE;
	pri += kg->kg_nice;

	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	kg->kg_user_pri = pri;

	return;
}

/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 */
static void
sched_slice(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;
	kseq = KSEQ_CPU(ke->ke_cpu);

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get the minimum slice so that we
	 * quickly notice if it abuses its advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the ksegs nice value relative to the least nice kseg
	 * on the run queue for this cpu.
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse distance from the last nice ksegrp.
	 *
	 * If the kse is outside of the window it will get no slice
	 * and will be reevaluated each time it is selected on the
	 * run queue.  The exception to this is nice 0 ksegs when
	 * a nice -20 is running.  They are always granted a minimum
	 * slice.
	 */
	if (!SCHED_INTERACTIVE(kg)) {
		int nice;

		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
		if (kseq->ksq_load_timeshare == 0 ||
		    kg->kg_nice < kseq->ksq_nicemin)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_SLICE_NTHRESH)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else if (kg->kg_nice == 0)
			ke->ke_slice = SCHED_SLICE_MIN;
		else
			ke->ke_slice = 0;
	} else
		ke->ke_slice = SCHED_SLICE_MIN;

	CTR6(KTR_ULE,
	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
	    kseq->ksq_load_timeshare, SCHED_INTERACTIVE(kg));

	return;
}

/*
 * This routine enforces a maximum limit on the amount of scheduling history
 * kept.  It is called after either the slptime or runtime is adjusted.
 * This routine will not operate correctly when slp or run times have been
 * adjusted to more than double their maximum.
 */
static void
sched_interact_update(struct ksegrp *kg)
{
	int sum;

	sum = kg->kg_runtime + kg->kg_slptime;
	if (sum < SCHED_SLP_RUN_MAX)
		return;
	/*
	 * If we have exceeded by more than 1/5th then the algorithm below
	 * will not bring us back into range.  Dividing by two here forces
Dividing by two here forces 1029121868Sjeff * us into the range of [3/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] 1030121868Sjeff */ 1031121868Sjeff if (sum > (SCHED_INTERACT_MAX / 5) * 6) { 1032121868Sjeff kg->kg_runtime /= 2; 1033121868Sjeff kg->kg_slptime /= 2; 1034121868Sjeff return; 1035116463Sjeff } 1036121868Sjeff kg->kg_runtime = (kg->kg_runtime / 5) * 4; 1037121868Sjeff kg->kg_slptime = (kg->kg_slptime / 5) * 4; 1038116463Sjeff} 1039116463Sjeff 1040121868Sjeffstatic void 1041121868Sjeffsched_interact_fork(struct ksegrp *kg) 1042121868Sjeff{ 1043121868Sjeff int ratio; 1044121868Sjeff int sum; 1045121868Sjeff 1046121868Sjeff sum = kg->kg_runtime + kg->kg_slptime; 1047121868Sjeff if (sum > SCHED_SLP_RUN_FORK) { 1048121868Sjeff ratio = sum / SCHED_SLP_RUN_FORK; 1049121868Sjeff kg->kg_runtime /= ratio; 1050121868Sjeff kg->kg_slptime /= ratio; 1051121868Sjeff } 1052121868Sjeff} 1053121868Sjeff 1054111857Sjeffstatic int 1055111857Sjeffsched_interact_score(struct ksegrp *kg) 1056111857Sjeff{ 1057116365Sjeff int div; 1058111857Sjeff 1059111857Sjeff if (kg->kg_runtime > kg->kg_slptime) { 1060116365Sjeff div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF); 1061116365Sjeff return (SCHED_INTERACT_HALF + 1062116365Sjeff (SCHED_INTERACT_HALF - (kg->kg_slptime / div))); 1063116365Sjeff } if (kg->kg_slptime > kg->kg_runtime) { 1064116365Sjeff div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF); 1065116365Sjeff return (kg->kg_runtime / div); 1066111857Sjeff } 1067111857Sjeff 1068116365Sjeff /* 1069116365Sjeff * This can happen if slptime and runtime are 0. 1070116365Sjeff */ 1071116365Sjeff return (0); 1072111857Sjeff 1073111857Sjeff} 1074111857Sjeff 1075113357Sjeff/* 1076113357Sjeff * This is only somewhat accurate since given many processes of the same 1077113357Sjeff * priority they will switch when their slices run out, which will be 1078113357Sjeff * at most SCHED_SLICE_MAX. 1079113357Sjeff */ 1080109864Sjeffint 1081109864Sjeffsched_rr_interval(void) 1082109864Sjeff{ 1083109864Sjeff return (SCHED_SLICE_MAX); 1084109864Sjeff} 1085109864Sjeff 1086121790Sjeffstatic void 1087109864Sjeffsched_pctcpu_update(struct kse *ke) 1088109864Sjeff{ 1089109864Sjeff /* 1090109864Sjeff * Adjust counters and watermark for pctcpu calc. 1091116365Sjeff */ 1092120272Sjeff if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) { 1093120272Sjeff /* 1094120272Sjeff * Shift the tick count out so that the divide doesn't 1095120272Sjeff * round away our results. 1096120272Sjeff */ 1097120272Sjeff ke->ke_ticks <<= 10; 1098120272Sjeff ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) * 1099120272Sjeff SCHED_CPU_TICKS; 1100120272Sjeff ke->ke_ticks >>= 10; 1101120272Sjeff } else 1102120272Sjeff ke->ke_ticks = 0; 1103109864Sjeff ke->ke_ltick = ticks; 1104109864Sjeff ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; 1105109864Sjeff} 1106109864Sjeff 1107109864Sjeffvoid 1108109864Sjeffsched_prio(struct thread *td, u_char prio) 1109109864Sjeff{ 1110121605Sjeff struct kse *ke; 1111109864Sjeff 1112121605Sjeff ke = td->td_kse; 1113109864Sjeff mtx_assert(&sched_lock, MA_OWNED); 1114109864Sjeff if (TD_ON_RUNQ(td)) { 1115121605Sjeff /* 1116121605Sjeff * If the priority has been elevated due to priority 1117121605Sjeff * propagation, we may have to move ourselves to a new 1118121605Sjeff * queue. We still call adjustrunqueue below in case kse 1119121605Sjeff * needs to fix things up. 
/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most SCHED_SLICE_MAX.
 */
int
sched_rr_interval(void)
{
	return (SCHED_SLICE_MAX);
}

static void
sched_pctcpu_update(struct kse *ke)
{
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */
	if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
		/*
		 * Shift the tick count out so that the divide doesn't
		 * round away our results.
		 */
		ke->ke_ticks <<= 10;
		ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
			    SCHED_CPU_TICKS;
		ke->ke_ticks >>= 10;
	} else
		ke->ke_ticks = 0;
	ke->ke_ltick = ticks;
	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}

void
sched_prio(struct thread *td, u_char prio)
{
	struct kse *ke;

	ke = td->td_kse;
	mtx_assert(&sched_lock, MA_OWNED);
	if (TD_ON_RUNQ(td)) {
		/*
		 * If the priority has been elevated due to priority
		 * propagation, we may have to move ourselves to a new
		 * queue.  We still call adjustrunqueue below in case kse
		 * needs to fix things up.
		 */
		if (prio < td->td_priority && ke &&
		    (ke->ke_flags & KEF_ASSIGNED) == 0 &&
		    ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
			runq_remove(ke->ke_runq, ke);
			ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
			runq_add(ke->ke_runq, ke);
		}
		adjustrunqueue(td, prio);
	} else
		td->td_priority = prio;
}

void
sched_switch(struct thread *td)
{
	struct thread *newtd;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;

	td->td_last_kse = ke;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;

	/*
	 * If the KSE has been assigned it may be in the process of switching
	 * to the new cpu.  This is the case in sched_bind().
	 */
	if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
		if (TD_IS_RUNNING(td)) {
			if (td->td_proc->p_flag & P_SA) {
				kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
				setrunqueue(td);
			} else
				kseq_runq_add(KSEQ_SELF(), ke);
		} else {
			if (ke->ke_runq)
				kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
			/*
			 * We will not be on the run queue.  So we must be
			 * sleeping or similar.
			 */
			if (td->td_proc->p_flag & P_SA)
				kse_reassign(ke);
		}
	}
	newtd = choosethread();
	if (td != newtd)
		cpu_switch(td, newtd);
	sched_lock.mtx_lock = (uintptr_t)td;

	td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_nice(struct ksegrp *kg, int nice)
{
	struct kse *ke;
	struct thread *td;
	struct kseq *kseq;

	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * We need to adjust the nice counts for running KSEs.
	 */
	if (kg->kg_pri_class == PRI_TIMESHARE)
		FOREACH_KSE_IN_GROUP(kg, ke) {
			if (ke->ke_runq == NULL)
				continue;
			kseq = KSEQ_CPU(ke->ke_cpu);
			kseq_nice_rem(kseq, kg->kg_nice);
			kseq_nice_add(kseq, nice);
		}
	kg->kg_nice = nice;
	sched_priority(kg);
	FOREACH_THREAD_IN_GROUP(kg, td)
		td->td_flags |= TDF_NEEDRESCHED;
}

void
sched_sleep(struct thread *td, u_char prio)
{
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_slptime = ticks;
	td->td_priority = prio;

	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
	    td->td_kse, td->td_slptime);
}

void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * Let the kseg know how long we slept for.  This is because process
	 * interactivity behavior is modeled in the kseg.
	 */
	if (td->td_slptime) {
		struct ksegrp *kg;
		int hzticks;

		kg = td->td_ksegrp;
		hzticks = (ticks - td->td_slptime) << 10;
		if (hzticks >= SCHED_SLP_RUN_MAX) {
			kg->kg_slptime = SCHED_SLP_RUN_MAX;
			kg->kg_runtime = 1;
		} else {
			kg->kg_slptime += hzticks;
			sched_interact_update(kg);
		}
		sched_priority(kg);
		if (td->td_kse)
			sched_slice(td->td_kse);
		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
		    td->td_kse, hzticks);
		td->td_slptime = 0;
	}
	setrunqueue(td);
}

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct proc *p, struct proc *p1)
{

	mtx_assert(&sched_lock, MA_OWNED);

	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
}

void
sched_fork_kse(struct kse *ke, struct kse *child)
{

	child->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
	child->ke_cpu = ke->ke_cpu;
	child->ke_runq = NULL;

	/* Grab our parent's cpu estimation information. */
	child->ke_ticks = ke->ke_ticks;
	child->ke_ltick = ke->ke_ltick;
	child->ke_ftick = ke->ke_ftick;
}

void
sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);

	child->kg_slptime = kg->kg_slptime;
	child->kg_runtime = kg->kg_runtime;
	child->kg_user_pri = kg->kg_user_pri;
	child->kg_nice = kg->kg_nice;
	sched_interact_fork(child);
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	CTR6(KTR_ULE, "sched_fork_ksegrp: %d(%d, %d) - %d(%d, %d)",
	    kg->kg_proc->p_pid, kg->kg_slptime, kg->kg_runtime,
	    child->kg_proc->p_pid, child->kg_slptime, child->kg_runtime);
}

void
sched_fork_thread(struct thread *td, struct thread *child)
{
}

void
sched_class(struct ksegrp *kg, int class)
{
	struct kseq *kseq;
	struct kse *ke;
	int nclass;
	int oclass;

	mtx_assert(&sched_lock, MA_OWNED);
	if (kg->kg_pri_class == class)
		return;

	nclass = PRI_BASE(class);
	oclass = PRI_BASE(kg->kg_pri_class);
	FOREACH_KSE_IN_GROUP(kg, ke) {
		if (ke->ke_state != KES_ONRUNQ &&
		    ke->ke_state != KES_THREAD)
			continue;
		kseq = KSEQ_CPU(ke->ke_cpu);

#ifdef SMP
		/*
		 * On SMP if we're on the RUNQ we must adjust the transferable
		 * count because we could be changing to or from an interrupt
		 * class.
		 */
		if (ke->ke_state == KES_ONRUNQ) {
			if (KSE_CAN_MIGRATE(ke, oclass)) {
				kseq->ksq_transferable--;
				kseq->ksq_group->ksg_transferable--;
			}
			if (KSE_CAN_MIGRATE(ke, nclass)) {
				kseq->ksq_transferable++;
				kseq->ksq_group->ksg_transferable++;
			}
		}
#endif
		if (oclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare--;
			kseq_nice_rem(kseq, kg->kg_nice);
		}
		if (nclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare++;
			kseq_nice_add(kseq, kg->kg_nice);
		}
	}

	kg->kg_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct proc *child)
{
	mtx_assert(&sched_lock, MA_OWNED);
	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child));
}

void
sched_exit_kse(struct kse *ke, struct kse *child)
{
	kseq_load_rem(KSEQ_CPU(child->ke_cpu), child);
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
	/* kg->kg_slptime += child->kg_slptime; */
	kg->kg_runtime += child->kg_runtime;
	sched_interact_update(kg);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{
}

void
sched_clock(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;

	/*
	 * sched_setup() apparently happens prior to stathz being set.  We
	 * need to resolve the timers earlier in the boot so we can avoid
	 * calculating this here.
	 */
	if (realstathz == 0) {
		realstathz = stathz ? stathz : hz;
		tickincr = hz / realstathz;
		/*
		 * XXX This does not work for values of stathz that are much
		 * larger than hz.
		 */
		if (tickincr == 0)
			tickincr = 1;
	}

	ke = td->td_kse;
	kg = ke->ke_ksegrp;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((td != NULL), ("schedclock: null thread pointer"));

	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;

	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	if (td->td_flags & TDF_IDLETD)
		return;

	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
	/*
	 * We only do slicing code for TIMESHARE ksegrps.
	 */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * We used a tick; charge it to the ksegrp so that we can compute our
	 * interactivity.
	 */
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	/*
	 * We used up one time slice.
	 */
	if (--ke->ke_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
	 */
	/*
	 * We used up one time slice.
	 */
	if (--ke->ke_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
	 */
	kseq = KSEQ_SELF();
	kseq_load_rem(kseq, ke);
	sched_priority(kg);
	sched_slice(ke);
	if (SCHED_CURR(kg, ke))
		ke->ke_runq = kseq->ksq_curr;
	else
		ke->ke_runq = kseq->ksq_next;
	kseq_load_add(kseq, ke);
	td->td_flags |= TDF_NEEDRESCHED;
}

int
sched_runnable(void)
{
	struct kseq *kseq;
	int load;

	load = 1;

	kseq = KSEQ_SELF();
#ifdef SMP
	if (kseq->ksq_assigned) {
		mtx_lock_spin(&sched_lock);
		kseq_assign(kseq);
		mtx_unlock_spin(&sched_lock);
	}
#endif
	if ((curthread->td_flags & TDF_IDLETD) != 0) {
		if (kseq->ksq_load > 0)
			goto out;
	} else
		if (kseq->ksq_load - 1 > 0)
			goto out;
	load = 0;
out:
	return (load);
}

void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;

	kg = td->td_ksegrp;

	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		mtx_unlock_spin(&sched_lock);
	}
}

struct kse *
sched_choose(void)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	kseq = KSEQ_SELF();
#ifdef SMP
restart:
	if (kseq->ksq_assigned)
		kseq_assign(kseq);
#endif
	ke = kseq_choose(kseq);
	if (ke) {
#ifdef SMP
		if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
			if (kseq_idled(kseq) == 0)
				goto restart;
#endif
		kseq_runq_rem(kseq, ke);
		ke->ke_state = KES_THREAD;

		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
			    ke, ke->ke_runq, ke->ke_slice,
			    ke->ke_thread->td_priority);
		}
		return (ke);
	}
#ifdef SMP
	if (kseq_idled(kseq) == 0)
		goto restart;
#endif
	return (NULL);
}

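/*
 * Select a run queue for a newly runnable kse and enqueue it: interrupt
 * and realtime kses go on the current queue, timeshare kses go on the
 * current or next queue depending on their interactivity, and idle kses
 * go on the idle queue unless their priority has been propagated up.
 * On SMP the kse may instead be pushed to another CPU's kseq or handed
 * off to an idle group.
 */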
void
sched_add(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;
	int class;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	kg = td->td_ksegrp;
	if (ke->ke_flags & KEF_ASSIGNED)
		return;
	kseq = KSEQ_SELF();
	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
	KASSERT((ke->ke_thread->td_kse != NULL),
	    ("sched_add: No KSE on thread"));
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	KASSERT(ke->ke_runq == NULL,
	    ("sched_add: KSE %p is still assigned to a run queue", ke));

	class = PRI_BASE(kg->kg_pri_class);
	switch (class) {
	case PRI_ITHD:
	case PRI_REALTIME:
		ke->ke_runq = kseq->ksq_curr;
		ke->ke_slice = SCHED_SLICE_MAX;
		ke->ke_cpu = PCPU_GET(cpuid);
		break;
	case PRI_TIMESHARE:
		if (SCHED_CURR(kg, ke))
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = kseq->ksq_next;
		break;
	case PRI_IDLE:
		/*
		 * This is for priority propagation.
		 */
		if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = &kseq->ksq_idle;
		ke->ke_slice = SCHED_SLICE_MIN;
		break;
	default:
		panic("Unknown pri class.");
		break;
	}
#ifdef SMP
	if (ke->ke_cpu != PCPU_GET(cpuid)) {
		kseq_notify(ke, ke->ke_cpu);
		return;
	}
	/*
	 * If there are any idle groups, give them our extra load.  The
	 * threshold at which we start to reassign kses has a large impact
	 * on the overall performance of the system.  Tuned too high and
	 * some CPUs may idle.  Too low and there will be excess migration
	 * and context switches.
	 */
	if (kseq->ksq_load > 1 && KSE_CAN_MIGRATE(ke, class))
		if (kseq_transfer(kseq, ke, class))
			return;
	if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
	    (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
		/*
		 * Check to see if our group is unidling, and if so, remove it
		 * from the global idle mask.
		 */
		if (kseq->ksq_group->ksg_idlemask ==
		    kseq->ksq_group->ksg_cpumask)
			atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
		/*
		 * Now remove ourselves from the group specific idle mask.
		 */
		kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
	}
#endif
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;

	ke->ke_ksegrp->kg_runq_kses++;
	ke->ke_state = KES_ONRUNQ;

	kseq_runq_add(kseq, ke);
	kseq_load_add(kseq, ke);
}

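/*
 * Take a kse off of its run queue and back out the load accounting done
 * by sched_add().  The KEF_ASSIGNED early return is explained inline.
 */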
void
sched_rem(struct thread *td)
{
	struct kseq *kseq;
	struct kse *ke;

	ke = td->td_kse;
	/*
	 * It is safe to just return here because sched_rem() is only ever
	 * used in places where we're immediately going to add the
	 * kse back on again.  In that case it'll be added with the correct
	 * thread and priority when the caller drops the sched_lock.
	 */
	if (ke->ke_flags & KEF_ASSIGNED)
		return;
	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));

	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq = KSEQ_CPU(ke->ke_cpu);
	kseq_runq_rem(kseq, ke);
	kseq_load_rem(kseq, ke);
}

fixpt_t
sched_pctcpu(struct thread *td)
{
	fixpt_t pctcpu;
	struct kse *ke;

	pctcpu = 0;
	ke = td->td_kse;
	if (ke == NULL)
		return (0);

	mtx_lock_spin(&sched_lock);
	if (ke->ke_ticks) {
		int rtick;

		/*
		 * Don't update more frequently than twice a second.  Allowing
		 * this causes the cpu usage to decay away too quickly due to
		 * rounding errors.
		 */
		if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
		    ke->ke_ltick < (ticks - (hz / 2)))
			sched_pctcpu_update(ke);
		/* How many rticks per second? */
		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
	}

	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
	mtx_unlock_spin(&sched_lock);

	return (pctcpu);
}

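/*
 * Pin the current thread to a specific CPU.  On SMP, if we are not
 * already there, the kse is pushed to the target CPU's kseq and we
 * context switch; when sched_bind() returns we are running on 'cpu'.
 * A caller might use the interface roughly as follows (illustrative
 * sketch only, not taken from this file), holding sched_lock across the
 * bind and unbind calls as the assertions below require:
 *
 *	mtx_lock_spin(&sched_lock);
 *	sched_bind(curthread, cpu);	(may context switch to migrate)
 *	mtx_unlock_spin(&sched_lock);
 *	... do the per-CPU work; the thread remains bound ...
 *	mtx_lock_spin(&sched_lock);
 *	sched_unbind(curthread);
 *	mtx_unlock_spin(&sched_lock);
 */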
void
sched_bind(struct thread *td, int cpu)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	ke->ke_flags |= KEF_BOUND;
#ifdef SMP
	if (PCPU_GET(cpuid) == cpu)
		return;
	/* sched_rem without the runq_remove */
	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
	ke->ke_cpu = cpu;
	kseq_notify(ke, cpu);
	/* When we return from mi_switch we'll be on the correct cpu. */
	td->td_proc->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
#endif
}

void
sched_unbind(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	td->td_kse->ke_flags &= ~KEF_BOUND;
}

int
sched_sizeof_kse(void)
{
	return (sizeof(struct kse) + sizeof(struct ke_sched));
}

int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}