sched_ule.c revision 134415
/*-
 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 134415 2004-08-28 00:49:22Z peter $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>
#include <machine/smp.h>

#define	KTR_ULE	KTR_NFS

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");

SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
    "Scheduler name");

static int slice_min = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
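/*
 * Illustrative note (editorial addition, not in the original source):
 * slice_min and slice_max are expressed in scheduler ticks and are
 * runtime tunable through the kern.sched.slice_min and
 * kern.sched.slice_max sysctls declared here; sched_setup() below
 * overrides the initial values with hz/100 (10ms) and hz/7 (~140ms).
 */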
static int slice_max = 10;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");

int realstathz;
int tickincr = 1;

/*
 * These datastructures are allocated within their parent datastructure but
 * are scheduler specific.
 */

struct ke_sched {
	int		ske_slice;
	struct runq	*ske_runq;
	/* The following variables are only used for pctcpu calculation */
	int		ske_ltick;	/* Last tick that we were running on */
	int		ske_ftick;	/* First tick that we were running on */
	int		ske_ticks;	/* Tick count */
	/* CPU that we have affinity for. */
	u_char		ske_cpu;
};
#define	ke_slice	ke_sched->ske_slice
#define	ke_runq		ke_sched->ske_runq
#define	ke_ltick	ke_sched->ske_ltick
#define	ke_ftick	ke_sched->ske_ftick
#define	ke_ticks	ke_sched->ske_ticks
#define	ke_cpu		ke_sched->ske_cpu
#define	ke_assign	ke_procq.tqe_next

#define	KEF_ASSIGNED	KEF_SCHED0	/* KSE is being migrated. */
#define	KEF_BOUND	KEF_SCHED1	/* KSE can not migrate. */
#define	KEF_XFERABLE	KEF_SCHED2	/* KSE was added as transferable. */
#define	KEF_HOLD	KEF_SCHED3	/* KSE is temporarily bound. */

struct kg_sched {
	int	skg_slptime;		/* Number of ticks we vol. slept */
	int	skg_runtime;		/* Number of ticks we were running */
};
#define	kg_slptime	kg_sched->skg_slptime
#define	kg_runtime	kg_sched->skg_runtime

struct td_sched {
	int	std_slptime;
};
#define	td_slptime	td_sched->std_slptime

struct td_sched td_sched;
struct ke_sched ke_sched;
struct kg_sched kg_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;

/*
 * The priority is primarily determined by the interactivity score.  Thus, we
 * give lower(better) priorities to kse groups that use less CPU.  The nice
 * value is then directly added to this to allow nice to have some effect
 * on latency.
 *
 * PRI_RANGE:	Total priority range for timeshare threads.
 * PRI_NRESV:	Number of nice values.
 * PRI_BASE:	The start of the dynamic range.
 */
#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	SCHED_PRI_NRESV		((PRIO_MAX - PRIO_MIN) + 1)
#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
#define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
#define	SCHED_PRI_INTERACT(score)					\
	((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
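/*
 * Illustrative note (editorial addition, not in the original source):
 * assuming the stock 64-entry timeshare priority range of this era and
 * SCHED_INTERACT_MAX of 100, an interactivity score of 50 maps to about
 * the middle of that range: SCHED_PRI_INTERACT(50) = 50 * 64 / 100 = 32,
 * to which SCHED_PRI_BASE and the process nice value are then added in
 * sched_priority() below.
 */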
/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 5) << 10)
#define	SCHED_SLP_RUN_FORK	((hz / 2) << 10)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(30)

/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:	Determines the amount of slice granted to a scaled nice.
 * SLICE_NTHRESH:	The nice cutoff point for slice assignment.
 */
#define	SCHED_SLICE_MIN			(slice_min)
#define	SCHED_SLICE_MAX			(slice_max)
#define	SCHED_SLICE_INTERACTIVE		(slice_max)
#define	SCHED_SLICE_NTHRESH		(SCHED_PRI_NHALF - 1)
#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_SLICE_NICE(nice)						\
	(SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))
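/*
 * Illustrative note (editorial addition, not in the original source):
 * with the defaults set in sched_setup() and hz = 1000, slice_min is
 * 10 ticks, slice_max is 142 ticks and SCHED_SLICE_RANGE is 133.  A
 * non-interactive ksegrp whose nice value is 5 points above the least
 * nice ksegrp on the queue would then receive roughly
 * SCHED_SLICE_NICE(5) = 142 - (5 * 133 / 19) = 107 ticks.
 */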
/*
 * This macro determines whether or not the kse belongs on the current or
 * next run queue.
 */
#define	SCHED_INTERACTIVE(kg)						\
	(sched_interact_score(kg) < SCHED_INTERACT_THRESH)
#define	SCHED_CURR(kg, ke)						\
	(ke->ke_thread->td_priority < kg->kg_user_pri ||		\
	SCHED_INTERACTIVE(kg))

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */

#define	SCHED_CPU_TIME	10
#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)

/*
 * kseq - per processor runqs and statistics.
 */
struct kseq {
	struct runq	ksq_idle;		/* Queue of IDLE threads. */
	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
	struct runq	*ksq_next;		/* Next timeshare queue. */
	struct runq	*ksq_curr;		/* Current queue. */
	int		ksq_load_timeshare;	/* Load for timeshare. */
	int		ksq_load;		/* Aggregate load. */
	short		ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
	short		ksq_nicemin;		/* Least nice. */
#ifdef SMP
	int			ksq_transferable;
	LIST_ENTRY(kseq)	ksq_siblings;	/* Next in kseq group. */
	struct kseq_group	*ksq_group;	/* Our processor group. */
	volatile struct kse	*ksq_assigned;	/* assigned by another CPU. */
#else
	int		ksq_sysload;		/* For loadavg, !ITHD load. */
#endif
};

#ifdef SMP
/*
 * kseq groups are groups of processors which can cheaply share threads.  When
 * one processor in the group goes idle it will check the runqs of the other
 * processors in its group prior to halting and waiting for an interrupt.
 * These groups are suitable for SMT (Symmetric Multi-Threading) and not NUMA.
 * In a numa environment we'd want an idle bitmap per group and a two tiered
 * load balancer.
 */
struct kseq_group {
	int	ksg_cpus;		/* Count of CPUs in this kseq group. */
	cpumask_t ksg_cpumask;		/* Mask of cpus in this group. */
	cpumask_t ksg_idlemask;		/* Idle cpus in this group. */
	cpumask_t ksg_mask;		/* Bit mask for first cpu. */
	int	ksg_load;		/* Total load of this group. */
	int	ksg_transferable;	/* Transferable load of this group. */
	LIST_HEAD(, kseq) ksg_members;	/* Linked list of all members. */
};
#endif

/*
 * One kse queue per processor.
 */
#ifdef SMP
static cpumask_t kseq_idle;
static int ksg_maxid;
static struct kseq	kseq_cpu[MAXCPU];
static struct kseq_group kseq_groups[MAXCPU];
static int bal_tick;
static int gbal_tick;

#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
#define	KSEQ_ID(x)	((x) - kseq_cpu)
#define	KSEQ_GROUP(x)	(&kseq_groups[(x)])
#else	/* !SMP */
static struct kseq	kseq_cpu;

#define	KSEQ_SELF()	(&kseq_cpu)
#define	KSEQ_CPU(x)	(&kseq_cpu)
#endif

static void sched_add_internal(struct thread *td, int preemptive);
static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
static void sched_interact_update(struct ksegrp *kg);
static void sched_interact_fork(struct ksegrp *kg);
static void sched_pctcpu_update(struct kse *ke);

/* Operations on per processor queues */
static struct kse * kseq_choose(struct kseq *kseq);
static void kseq_setup(struct kseq *kseq);
static void kseq_load_add(struct kseq *kseq, struct kse *ke);
static void kseq_load_rem(struct kseq *kseq, struct kse *ke);
static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke);
static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke);
static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class);
static struct kse *runq_steal(struct runq *rq);
static void sched_balance(void);
static void sched_balance_groups(void);
static void sched_balance_group(struct kseq_group *ksg);
static void sched_balance_pair(struct kseq *high, struct kseq *low);
static void kseq_move(struct kseq *from, int cpu);
static int kseq_idled(struct kseq *kseq);
static void kseq_notify(struct kse *ke, int cpu);
static void kseq_assign(struct kseq *);
static struct kse *kseq_steal(struct kseq *kseq, int stealidle);
/*
 * On P4 Xeons the round-robin interrupt delivery is broken.  As a result of
 * this, we can't pin interrupts to the cpu that they were delivered to,
 * otherwise all ithreads only run on CPU 0.
 */
#ifdef __i386__
#define	KSE_CAN_MIGRATE(ke, class)					\
    ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
#else /* !__i386__ */
#define	KSE_CAN_MIGRATE(ke, class)					\
    ((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 &&		\
    ((ke)->ke_flags & KEF_BOUND) == 0)
#endif /* !__i386__ */
#endif

void
kseq_print(int cpu)
{
	struct kseq *kseq;
	int i;

	kseq = KSEQ_CPU(cpu);

	printf("kseq:\n");
	printf("\tload: %d\n", kseq->ksq_load);
	printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare);
#ifdef SMP
	printf("\tload transferable: %d\n", kseq->ksq_transferable);
#endif
	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
	printf("\tnice counts:\n");
	for (i = 0; i < SCHED_PRI_NRESV; i++)
		if (kseq->ksq_nice[i])
			printf("\t\t%d = %d\n",
			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
}

static __inline void
kseq_runq_add(struct kseq *kseq, struct kse *ke)
{
#ifdef SMP
	if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) {
		kseq->ksq_transferable++;
		kseq->ksq_group->ksg_transferable++;
		ke->ke_flags |= KEF_XFERABLE;
	}
#endif
	runq_add(ke->ke_runq, ke);
}

static __inline void
kseq_runq_rem(struct kseq *kseq, struct kse *ke)
{
#ifdef SMP
	if (ke->ke_flags & KEF_XFERABLE) {
		kseq->ksq_transferable--;
		kseq->ksq_group->ksg_transferable--;
		ke->ke_flags &= ~KEF_XFERABLE;
	}
#endif
	runq_remove(ke->ke_runq, ke);
}

static void
kseq_load_add(struct kseq *kseq, struct kse *ke)
{
	int class;
	mtx_assert(&sched_lock, MA_OWNED);
	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
	if (class == PRI_TIMESHARE)
		kseq->ksq_load_timeshare++;
	kseq->ksq_load++;
	if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
		kseq->ksq_group->ksg_load++;
#else
		kseq->ksq_sysload++;
#endif
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		CTR6(KTR_ULE,
		    "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
		    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
		    ke->ke_proc->p_nice, kseq->ksq_nicemin);
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_add(kseq, ke->ke_proc->p_nice);
}

static void
kseq_load_rem(struct kseq *kseq, struct kse *ke)
{
	int class;
	mtx_assert(&sched_lock, MA_OWNED);
	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
	if (class == PRI_TIMESHARE)
		kseq->ksq_load_timeshare--;
	if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
		kseq->ksq_group->ksg_load--;
#else
		kseq->ksq_sysload--;
#endif
	kseq->ksq_load--;
	ke->ke_runq = NULL;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_rem(kseq, ke->ke_proc->p_nice);
}

static void
kseq_nice_add(struct kseq *kseq, int nice)
{
	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
	if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1)
		kseq->ksq_nicemin = nice;
}

static void
kseq_nice_rem(struct kseq *kseq, int nice)
{
	int n;

	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	n = nice + SCHED_PRI_NHALF;
	kseq->ksq_nice[n]--;
	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));

	/*
	 * If this wasn't the smallest nice value or there are more in
	 * this bucket we can just return.  Otherwise we have to recalculate
	 * the smallest nice.
	 */
	if (nice != kseq->ksq_nicemin ||
	    kseq->ksq_nice[n] != 0 ||
	    kseq->ksq_load_timeshare == 0)
		return;

	for (; n < SCHED_PRI_NRESV; n++)
		if (kseq->ksq_nice[n]) {
			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
			return;
		}
}
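/*
 * Illustrative note (editorial addition, not in the original source):
 * ksq_nicemin tracks the least (most favourable) nice value among the
 * timeshare KSEs on this kseq.  For example, with one KSE at nice 0 and
 * one at nice -5, ksq_nicemin is -5; once the nice -5 KSE is removed,
 * kseq_nice_rem() rescans the bins above it and nicemin falls back to 0.
 * sched_slice() sizes time slices relative to this minimum.
 */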
#ifdef SMP
/*
 * sched_balance is a simple CPU load balancing algorithm.  It operates by
 * finding the least loaded and most loaded cpu and equalizing their load
 * by migrating some processes.
 *
 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 * installations will only have 2 cpus.  Secondly, load balancing too much at
 * once can have an unpleasant effect on the system.  The scheduler rarely has
 * enough information to make perfect decisions.  So this algorithm chooses
 * simplicity and more gradual effects on load in larger systems.
 *
 * It could be improved by considering the priorities and slices assigned to
 * each task prior to balancing them.  There are many pathological cases with
 * any approach and so the semi random algorithm below may work as well as any.
 *
 */
static void
sched_balance(void)
{
	struct kseq_group *high;
	struct kseq_group *low;
	struct kseq_group *ksg;
	int cnt;
	int i;

	if (smp_started == 0)
		goto out;
	low = high = NULL;
	i = random() % (ksg_maxid + 1);
	for (cnt = 0; cnt <= ksg_maxid; cnt++) {
		ksg = KSEQ_GROUP(i);
		/*
		 * Find the CPU with the highest load that has some
		 * threads to transfer.
		 */
		if ((high == NULL || ksg->ksg_load > high->ksg_load)
		    && ksg->ksg_transferable)
			high = ksg;
		if (low == NULL || ksg->ksg_load < low->ksg_load)
			low = ksg;
		if (++i > ksg_maxid)
			i = 0;
	}
	if (low != NULL && high != NULL && high != low)
		sched_balance_pair(LIST_FIRST(&high->ksg_members),
		    LIST_FIRST(&low->ksg_members));
out:
	bal_tick = ticks + (random() % (hz * 2));
}

static void
sched_balance_groups(void)
{
	int i;

	mtx_assert(&sched_lock, MA_OWNED);
	if (smp_started)
		for (i = 0; i <= ksg_maxid; i++)
			sched_balance_group(KSEQ_GROUP(i));
	gbal_tick = ticks + (random() % (hz * 2));
}

static void
sched_balance_group(struct kseq_group *ksg)
{
	struct kseq *kseq;
	struct kseq *high;
	struct kseq *low;
	int load;

	if (ksg->ksg_transferable == 0)
		return;
	low = NULL;
	high = NULL;
	LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
		load = kseq->ksq_load;
		if (high == NULL || load > high->ksq_load)
			high = kseq;
		if (low == NULL || load < low->ksq_load)
			low = kseq;
	}
	if (high != NULL && low != NULL && high != low)
		sched_balance_pair(high, low);
}
static void
sched_balance_pair(struct kseq *high, struct kseq *low)
{
	int transferable;
	int high_load;
	int low_load;
	int move;
	int diff;
	int i;

	/*
	 * If we're transferring within a group we have to use this specific
	 * kseq's transferable count, otherwise we can steal from other members
	 * of the group.
	 */
	if (high->ksq_group == low->ksq_group) {
		transferable = high->ksq_transferable;
		high_load = high->ksq_load;
		low_load = low->ksq_load;
	} else {
		transferable = high->ksq_group->ksg_transferable;
		high_load = high->ksq_group->ksg_load;
		low_load = low->ksq_group->ksg_load;
	}
	if (transferable == 0)
		return;
	/*
	 * Determine what the imbalance is and then adjust that to how many
	 * kses we actually have to give up (transferable).
	 */
	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;
	move = min(move, transferable);
	for (i = 0; i < move; i++)
		kseq_move(high, KSEQ_ID(low));
	return;
}

static void
kseq_move(struct kseq *from, int cpu)
{
	struct kseq *kseq;
	struct kseq *to;
	struct kse *ke;

	kseq = from;
	to = KSEQ_CPU(cpu);
	ke = kseq_steal(kseq, 1);
	if (ke == NULL) {
		struct kseq_group *ksg;

		ksg = kseq->ksq_group;
		LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
			if (kseq == from || kseq->ksq_transferable == 0)
				continue;
			ke = kseq_steal(kseq, 1);
			break;
		}
		if (ke == NULL)
			panic("kseq_move: No KSEs available with a "
			    "transferable count of %d\n",
			    ksg->ksg_transferable);
	}
	if (kseq == to)
		return;
	ke->ke_state = KES_THREAD;
	kseq_runq_rem(kseq, ke);
	kseq_load_rem(kseq, ke);
	kseq_notify(ke, cpu);
}
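/*
 * Illustrative note (editorial addition, not in the original source):
 * sched_balance_pair() moves half of the load difference, rounded up.
 * For example, with high_load = 7 and low_load = 3, diff is 4 and two
 * KSEs are moved (capped by the transferable count); kseq_move() then
 * hands each one to the target cpu via kseq_notify().
 */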
static int
kseq_idled(struct kseq *kseq)
{
	struct kseq_group *ksg;
	struct kseq *steal;
	struct kse *ke;

	ksg = kseq->ksq_group;
	/*
	 * If we're in a cpu group, try and steal kses from another cpu in
	 * the group before idling.
	 */
	if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) {
		LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) {
			if (steal == kseq || steal->ksq_transferable == 0)
				continue;
			ke = kseq_steal(steal, 0);
			if (ke == NULL)
				continue;
			ke->ke_state = KES_THREAD;
			kseq_runq_rem(steal, ke);
			kseq_load_rem(steal, ke);
			ke->ke_cpu = PCPU_GET(cpuid);
			sched_add_internal(ke->ke_thread, 0);
			return (0);
		}
	}
	/*
	 * We only set the idled bit when all of the cpus in the group are
	 * idle.  Otherwise we could get into a situation where a KSE bounces
	 * back and forth between two idle cores on separate physical CPUs.
	 */
	ksg->ksg_idlemask |= PCPU_GET(cpumask);
	if (ksg->ksg_idlemask != ksg->ksg_cpumask)
		return (1);
	atomic_set_int(&kseq_idle, ksg->ksg_mask);
	return (1);
}

static void
kseq_assign(struct kseq *kseq)
{
	struct kse *nke;
	struct kse *ke;

	do {
		*(volatile struct kse **)&ke = kseq->ksq_assigned;
	} while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL));
	for (; ke != NULL; ke = nke) {
		nke = ke->ke_assign;
		ke->ke_flags &= ~KEF_ASSIGNED;
		sched_add_internal(ke->ke_thread, 0);
	}
}

static void
kseq_notify(struct kse *ke, int cpu)
{
	struct kseq *kseq;
	struct thread *td;
	struct pcpu *pcpu;
	int prio;

	ke->ke_cpu = cpu;
	ke->ke_flags |= KEF_ASSIGNED;
	prio = ke->ke_thread->td_priority;

	kseq = KSEQ_CPU(cpu);

	/*
	 * Place a KSE on another cpu's queue and force a resched.
	 */
	do {
		*(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned;
	} while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke));
	/*
	 * Without sched_lock we could lose a race where we set NEEDRESCHED
	 * on a thread that is switched out before the IPI is delivered.  This
	 * would lead us to miss the resched.  This will be a problem once
	 * sched_lock is pushed down.
	 */
	pcpu = pcpu_find(cpu);
	td = pcpu->pc_curthread;
	if (ke->ke_thread->td_priority < td->td_priority ||
	    td == pcpu->pc_idlethread) {
		td->td_flags |= TDF_NEEDRESCHED;
		ipi_selected(1 << cpu, IPI_AST);
	}
}

static struct kse *
runq_steal(struct runq *rq)
{
	struct rqhead *rqh;
	struct rqbits *rqb;
	struct kse *ke;
	int word;
	int bit;

	mtx_assert(&sched_lock, MA_OWNED);
	rqb = &rq->rq_status;
	for (word = 0; word < RQB_LEN; word++) {
		if (rqb->rqb_bits[word] == 0)
			continue;
		for (bit = 0; bit < RQB_BPW; bit++) {
			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
				continue;
			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
			TAILQ_FOREACH(ke, rqh, ke_procq) {
				if (KSE_CAN_MIGRATE(ke,
				    PRI_BASE(ke->ke_ksegrp->kg_pri_class)))
					return (ke);
			}
		}
	}
	return (NULL);
}
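/*
 * Illustrative note (editorial addition, not in the original source):
 * ksq_assigned is a singly linked list of KSEs pushed to this cpu by
 * kseq_notify() with an atomic compare-and-set loop, so a remote cpu
 * never manipulates this cpu's run queues directly.  kseq_assign()
 * atomically detaches the whole list and enqueues each KSE locally.
 */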
static struct kse *
kseq_steal(struct kseq *kseq, int stealidle)
{
	struct kse *ke;

	/*
	 * Steal from next first to try to get a non-interactive task that
	 * may not have run for a while.
	 */
	if ((ke = runq_steal(kseq->ksq_next)) != NULL)
		return (ke);
	if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
		return (ke);
	if (stealidle)
		return (runq_steal(&kseq->ksq_idle));
	return (NULL);
}

int
kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
{
	struct kseq_group *ksg;
	int cpu;

	if (smp_started == 0)
		return (0);
	cpu = 0;
	/*
	 * If our load exceeds a certain threshold we should attempt to
	 * reassign this thread.  The first candidate is the cpu that
	 * originally ran the thread.  If it is idle, assign it there,
	 * otherwise, pick an idle cpu.
	 *
	 * The threshold at which we start to reassign kses has a large impact
	 * on the overall performance of the system.  Tuned too high and
	 * some CPUs may idle.  Too low and there will be excess migration
	 * and context switches.
	 */
	ksg = kseq->ksq_group;
	if (ksg->ksg_load > ksg->ksg_cpus && kseq_idle) {
		ksg = KSEQ_CPU(ke->ke_cpu)->ksq_group;
		if (kseq_idle & ksg->ksg_mask) {
			cpu = ffs(ksg->ksg_idlemask);
			if (cpu)
				goto migrate;
		}
		/*
		 * Multiple cpus could find this bit simultaneously
		 * but the race shouldn't be terrible.
		 */
		cpu = ffs(kseq_idle);
		if (cpu)
			goto migrate;
	}
	/*
	 * If another cpu in this group has idled, assign a thread over
	 * to them after checking to see if there are idled groups.
	 */
	ksg = kseq->ksq_group;
	if (ksg->ksg_idlemask) {
		cpu = ffs(ksg->ksg_idlemask);
		if (cpu)
			goto migrate;
	}
	/*
	 * No new CPU was found.
	 */
	return (0);
migrate:
	/*
	 * Now that we've found an idle CPU, migrate the thread.
	 */
	cpu--;
	ke->ke_runq = NULL;
	kseq_notify(ke, cpu);

	return (1);
}

#endif	/* SMP */

/*
 * Pick the highest priority task we have and return it.
 */

static struct kse *
kseq_choose(struct kseq *kseq)
{
	struct kse *ke;
	struct runq *swap;

	mtx_assert(&sched_lock, MA_OWNED);
	swap = NULL;

	for (;;) {
		ke = runq_choose(kseq->ksq_curr);
		if (ke == NULL) {
			/*
			 * We already swapped once and didn't get anywhere.
			 */
			if (swap)
				break;
			swap = kseq->ksq_curr;
			kseq->ksq_curr = kseq->ksq_next;
			kseq->ksq_next = swap;
			continue;
		}
		/*
		 * If we encounter a slice of 0 the kse is in a
		 * TIMESHARE kse group and its nice was too far out
		 * of the range that receives slices.
		 */
		if (ke->ke_slice == 0) {
			runq_remove(ke->ke_runq, ke);
			sched_slice(ke);
			ke->ke_runq = kseq->ksq_next;
			runq_add(ke->ke_runq, ke);
			continue;
		}
		return (ke);
	}

	return (runq_choose(&kseq->ksq_idle));
}

static void
kseq_setup(struct kseq *kseq)
{
	runq_init(&kseq->ksq_timeshare[0]);
	runq_init(&kseq->ksq_timeshare[1]);
	runq_init(&kseq->ksq_idle);
	kseq->ksq_curr = &kseq->ksq_timeshare[0];
	kseq->ksq_next = &kseq->ksq_timeshare[1];
	kseq->ksq_load = 0;
	kseq->ksq_load_timeshare = 0;
}

static void
sched_setup(void *dummy)
{
#ifdef SMP
	int balance_groups;
	int i;
#endif

	slice_min = (hz/100);	/* 10ms */
	slice_max = (hz/7);	/* ~140ms */

#ifdef SMP
	balance_groups = 0;
	/*
	 * Initialize the kseqs.
	 */
	for (i = 0; i < MAXCPU; i++) {
		struct kseq *ksq;

		ksq = &kseq_cpu[i];
		ksq->ksq_assigned = NULL;
		kseq_setup(&kseq_cpu[i]);
	}
	if (smp_topology == NULL) {
		struct kseq_group *ksg;
		struct kseq *ksq;

		for (i = 0; i < MAXCPU; i++) {
			ksq = &kseq_cpu[i];
			ksg = &kseq_groups[i];
			/*
			 * Setup a kseq group with one member.
			 */
			ksq->ksq_transferable = 0;
			ksq->ksq_group = ksg;
			ksg->ksg_cpus = 1;
			ksg->ksg_idlemask = 0;
			ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			LIST_INIT(&ksg->ksg_members);
			LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
		}
	} else {
		struct kseq_group *ksg;
		struct cpu_group *cg;
		int j;

		for (i = 0; i < smp_topology->ct_count; i++) {
			cg = &smp_topology->ct_group[i];
			ksg = &kseq_groups[i];
			/*
			 * Initialize the group.
			 */
			ksg->ksg_idlemask = 0;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			ksg->ksg_cpus = cg->cg_count;
			ksg->ksg_cpumask = cg->cg_mask;
			LIST_INIT(&ksg->ksg_members);
			/*
			 * Find all of the group members and add them.
			 */
			for (j = 0; j < MAXCPU; j++) {
				if ((cg->cg_mask & (1 << j)) != 0) {
					if (ksg->ksg_mask == 0)
						ksg->ksg_mask = 1 << j;
					kseq_cpu[j].ksq_transferable = 0;
					kseq_cpu[j].ksq_group = ksg;
					LIST_INSERT_HEAD(&ksg->ksg_members,
					    &kseq_cpu[j], ksq_siblings);
				}
			}
			if (ksg->ksg_cpus > 1)
				balance_groups = 1;
		}
		ksg_maxid = smp_topology->ct_count - 1;
	}
	/*
	 * Stagger the group and global load balancer so they do not
	 * interfere with each other.
	 */
	bal_tick = ticks + hz;
	if (balance_groups)
		gbal_tick = ticks + (hz / 2);
#else
	kseq_setup(KSEQ_SELF());
#endif
	mtx_lock_spin(&sched_lock);
	kseq_load_add(KSEQ_SELF(), &kse0);
	mtx_unlock_spin(&sched_lock);
}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct ksegrp *kg)
{
	int pri;

	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;

	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
	pri += SCHED_PRI_BASE;
	pri += kg->kg_proc->p_nice;

	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	kg->kg_user_pri = pri;

	return;
}

/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 */
static void
sched_slice(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;
	kseq = KSEQ_CPU(ke->ke_cpu);

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get a minimal slice so that we
	 * quickly notice if it abuses its advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the ksegs nice value relative to the least nice kseg
	 * on the run queue for this cpu.
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse distance from the last nice ksegrp.
	 *
	 * If the kse is outside of the window it will get no slice
	 * and will be reevaluated each time it is selected on the
	 * run queue.  The exception to this is nice 0 ksegs when
	 * a nice -20 is running.
	 * They are always granted a minimum slice.
	 */
	if (!SCHED_INTERACTIVE(kg)) {
		int nice;

		nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin);
		if (kseq->ksq_load_timeshare == 0 ||
		    kg->kg_proc->p_nice < kseq->ksq_nicemin)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_SLICE_NTHRESH)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else if (kg->kg_proc->p_nice == 0)
			ke->ke_slice = SCHED_SLICE_MIN;
		else
			ke->ke_slice = 0;
	} else
		ke->ke_slice = SCHED_SLICE_INTERACTIVE;

	CTR6(KTR_ULE,
	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
	    ke, ke->ke_slice, kg->kg_proc->p_nice, kseq->ksq_nicemin,
	    kseq->ksq_load_timeshare, SCHED_INTERACTIVE(kg));

	return;
}

/*
 * This routine enforces a maximum limit on the amount of scheduling history
 * kept.  It is called after either the slptime or runtime is adjusted.
 * This routine will not operate correctly when slp or run times have been
 * adjusted to more than double their maximum.
 */
static void
sched_interact_update(struct ksegrp *kg)
{
	int sum;

	sum = kg->kg_runtime + kg->kg_slptime;
	if (sum < SCHED_SLP_RUN_MAX)
		return;
	/*
	 * If we have exceeded by more than 1/5th then the algorithm below
	 * will not bring us back into range.  Dividing by two here forces
	 * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX]
	 */
	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
		kg->kg_runtime /= 2;
		kg->kg_slptime /= 2;
		return;
	}
	kg->kg_runtime = (kg->kg_runtime / 5) * 4;
	kg->kg_slptime = (kg->kg_slptime / 5) * 4;
}

static void
sched_interact_fork(struct ksegrp *kg)
{
	int ratio;
	int sum;

	sum = kg->kg_runtime + kg->kg_slptime;
	if (sum > SCHED_SLP_RUN_FORK) {
		ratio = sum / SCHED_SLP_RUN_FORK;
		kg->kg_runtime /= ratio;
		kg->kg_slptime /= ratio;
	}
}
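/*
 * Illustrative note (editorial addition, not in the original source):
 * sched_interact_score() below maps the sleep/run balance onto the range
 * 0..SCHED_INTERACT_MAX.  With its integer math, a ksegrp that slept
 * three times as long as it ran scores about 16 (interactive, below the
 * threshold of 30), while one that ran three times as long as it slept
 * scores about 84 (decidedly non-interactive).
 */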
static int
sched_interact_score(struct ksegrp *kg)
{
	int div;

	if (kg->kg_runtime > kg->kg_slptime) {
		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
	} if (kg->kg_slptime > kg->kg_runtime) {
		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
		return (kg->kg_runtime / div);
	}

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);

}

/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most SCHED_SLICE_MAX.
 */
int
sched_rr_interval(void)
{
	return (SCHED_SLICE_MAX);
}

static void
sched_pctcpu_update(struct kse *ke)
{
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */
	if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
		/*
		 * Shift the tick count out so that the divide doesn't
		 * round away our results.
		 */
		ke->ke_ticks <<= 10;
		ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
		    SCHED_CPU_TICKS;
		ke->ke_ticks >>= 10;
	} else
		ke->ke_ticks = 0;
	ke->ke_ltick = ticks;
	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}

void
sched_prio(struct thread *td, u_char prio)
{
	struct kse *ke;

	ke = td->td_kse;
	mtx_assert(&sched_lock, MA_OWNED);
	if (TD_ON_RUNQ(td)) {
		/*
		 * If the priority has been elevated due to priority
		 * propagation, we may have to move ourselves to a new
		 * queue.  We still call adjustrunqueue below in case kse
		 * needs to fix things up.
		 */
		if (prio < td->td_priority && ke &&
		    (ke->ke_flags & KEF_ASSIGNED) == 0 &&
		    ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
			runq_remove(ke->ke_runq, ke);
			ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
			runq_add(ke->ke_runq, ke);
		}
		/*
		 * Hold this kse on this cpu so that sched_prio() doesn't
		 * cause excessive migration.  We only want migration to
		 * happen as the result of a wakeup.
		 */
		ke->ke_flags |= KEF_HOLD;
		adjustrunqueue(td, prio);
	} else
		td->td_priority = prio;
}

void
sched_switch(struct thread *td, struct thread *newtd)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;

	td->td_last_kse = ke;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;
	td->td_pflags &= ~TDP_OWEPREEMPT;

	/*
	 * If the KSE has been assigned it may be in the process of switching
	 * to the new cpu.  This is the case in sched_bind().
	 */
	if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
		if (td == PCPU_GET(idlethread)) {
			TD_SET_CAN_RUN(td);
		} else if (TD_IS_RUNNING(td)) {
			kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
			/*
			 * Don't allow the kse to migrate from a preemption.
			 */
			ke->ke_flags |= KEF_HOLD;
			setrunqueue(td);
		} else {
			if (ke->ke_runq) {
				kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
			} else if ((td->td_flags & TDF_IDLETD) == 0)
				kdb_backtrace();
			/*
			 * We will not be on the run queue.  So we must be
			 * sleeping or similar.
			 */
			if (td->td_proc->p_flag & P_SA)
				kse_reassign(ke);
		}
	}
	if (newtd != NULL)
		kseq_load_add(KSEQ_SELF(), newtd->td_kse);
	else
		newtd = choosethread();
	if (td != newtd)
		cpu_switch(td, newtd);
	sched_lock.mtx_lock = (uintptr_t)td;

	td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_nice(struct proc *p, int nice)
{
	struct ksegrp *kg;
	struct kse *ke;
	struct thread *td;
	struct kseq *kseq;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * We need to adjust the nice counts for running KSEs.
	 */
	FOREACH_KSEGRP_IN_PROC(p, kg) {
		if (kg->kg_pri_class == PRI_TIMESHARE) {
			FOREACH_KSE_IN_GROUP(kg, ke) {
				if (ke->ke_runq == NULL)
					continue;
				kseq = KSEQ_CPU(ke->ke_cpu);
				kseq_nice_rem(kseq, p->p_nice);
				kseq_nice_add(kseq, nice);
			}
		}
	}
	p->p_nice = nice;
	FOREACH_KSEGRP_IN_PROC(p, kg) {
		sched_priority(kg);
		FOREACH_THREAD_IN_GROUP(kg, td)
			td->td_flags |= TDF_NEEDRESCHED;
	}
}

void
sched_sleep(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_slptime = ticks;
	td->td_base_pri = td->td_priority;

	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
	    td->td_kse, td->td_slptime);
}

void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * Let the kseg know how long we slept for.  This is because process
	 * interactivity behavior is modeled in the kseg.
	 */
	if (td->td_slptime) {
		struct ksegrp *kg;
		int hzticks;

		kg = td->td_ksegrp;
		hzticks = (ticks - td->td_slptime) << 10;
		if (hzticks >= SCHED_SLP_RUN_MAX) {
			kg->kg_slptime = SCHED_SLP_RUN_MAX;
			kg->kg_runtime = 1;
		} else {
			kg->kg_slptime += hzticks;
			sched_interact_update(kg);
		}
		sched_priority(kg);
		if (td->td_kse)
			sched_slice(td->td_kse);
		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
		    td->td_kse, hzticks);
		td->td_slptime = 0;
	}
	setrunqueue(td);
}

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct thread *td, struct proc *p1)
{

	mtx_assert(&sched_lock, MA_OWNED);

	p1->p_nice = td->td_proc->p_nice;
	sched_fork_ksegrp(td, FIRST_KSEGRP_IN_PROC(p1));
	sched_fork_kse(td, FIRST_KSE_IN_PROC(p1));
	sched_fork_thread(td, FIRST_THREAD_IN_PROC(p1));
}

void
sched_fork_kse(struct thread *td, struct kse *child)
{
	struct kse *ke = td->td_kse;

	child->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
	child->ke_cpu = ke->ke_cpu;
	child->ke_runq = NULL;

	/* Grab our parents cpu estimation information. */
	child->ke_ticks = ke->ke_ticks;
	child->ke_ltick = ke->ke_ltick;
	child->ke_ftick = ke->ke_ftick;
}

void
sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
{
	struct ksegrp *kg = td->td_ksegrp;
	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);

	child->kg_slptime = kg->kg_slptime;
	child->kg_runtime = kg->kg_runtime;
	child->kg_user_pri = kg->kg_user_pri;
	sched_interact_fork(child);
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	CTR6(KTR_ULE, "sched_fork_ksegrp: %d(%d, %d) - %d(%d, %d)",
	    kg->kg_proc->p_pid, kg->kg_slptime, kg->kg_runtime,
	    child->kg_proc->p_pid, child->kg_slptime, child->kg_runtime);
}

void
sched_fork_thread(struct thread *td, struct thread *child)
{
}

void
sched_class(struct ksegrp *kg, int class)
{
	struct kseq *kseq;
	struct kse *ke;
	int nclass;
	int oclass;

	mtx_assert(&sched_lock, MA_OWNED);
	if (kg->kg_pri_class == class)
		return;

	nclass = PRI_BASE(class);
	oclass = PRI_BASE(kg->kg_pri_class);
	FOREACH_KSE_IN_GROUP(kg, ke) {
		if (ke->ke_state != KES_ONRUNQ &&
		    ke->ke_state != KES_THREAD)
			continue;
		kseq = KSEQ_CPU(ke->ke_cpu);

#ifdef SMP
		/*
		 * On SMP if we're on the RUNQ we must adjust the transferable
		 * count because we could be changing to or from an interrupt
		 * class.
		 */
		if (ke->ke_state == KES_ONRUNQ) {
			if (KSE_CAN_MIGRATE(ke, oclass)) {
				kseq->ksq_transferable--;
				kseq->ksq_group->ksg_transferable--;
			}
			if (KSE_CAN_MIGRATE(ke, nclass)) {
				kseq->ksq_transferable++;
				kseq->ksq_group->ksg_transferable++;
			}
		}
#endif
		if (oclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare--;
			kseq_nice_rem(kseq, kg->kg_proc->p_nice);
		}
		if (nclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare++;
			kseq_nice_add(kseq, kg->kg_proc->p_nice);
		}
	}

	kg->kg_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	sched_exit_kse(FIRST_KSE_IN_PROC(p), td);
	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td);
}

void
sched_exit_kse(struct kse *ke, struct thread *td)
{
	kseq_load_rem(KSEQ_CPU(td->td_kse->ke_cpu), td->td_kse);
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct thread *td)
{
	/* kg->kg_slptime += td->td_ksegrp->kg_slptime; */
	kg->kg_runtime += td->td_ksegrp->kg_runtime;
	sched_interact_update(kg);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{
}

void
sched_clock(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	kseq = KSEQ_SELF();
#ifdef SMP
	if (ticks == bal_tick)
		sched_balance();
	if (ticks == gbal_tick)
		sched_balance_groups();
	/*
	 * We could have been assigned a non real-time thread without an
	 * IPI.
	 */
	if (kseq->ksq_assigned)
		kseq_assign(kseq);	/* Potentially sets NEEDRESCHED */
#endif
	/*
	 * sched_setup() apparently happens prior to stathz being set.  We
	 * need to resolve the timers earlier in the boot so we can avoid
	 * calculating this here.
	 */
	if (realstathz == 0) {
		realstathz = stathz ? stathz : hz;
		tickincr = hz / realstathz;
		/*
		 * XXX This does not work for values of stathz that are much
		 * larger than hz.
		 */
		if (tickincr == 0)
			tickincr = 1;
	}

	ke = td->td_kse;
	kg = ke->ke_ksegrp;

	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;

	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	if (td->td_flags & TDF_IDLETD)
		return;

	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
	/*
	 * We only do slicing code for TIMESHARE ksegrps.
	 */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * We used a tick; charge it to the ksegrp so that we can compute our
	 * interactivity.
	 */
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	/*
	 * We used up one time slice.
1488109864Sjeff */
1489122847Sjeff if (--ke->ke_slice > 0)
1490113357Sjeff return;
1491109864Sjeff /*
1492113357Sjeff * We're out of time, recompute priorities and requeue.
1493109864Sjeff */
1494122744Sjeff kseq_load_rem(kseq, ke);
1495113357Sjeff sched_priority(kg);
1496113357Sjeff sched_slice(ke);
1497113357Sjeff if (SCHED_CURR(kg, ke))
1498113357Sjeff ke->ke_runq = kseq->ksq_curr;
1499113357Sjeff else
1500113357Sjeff ke->ke_runq = kseq->ksq_next;
1501122744Sjeff kseq_load_add(kseq, ke);
1502113357Sjeff td->td_flags |= TDF_NEEDRESCHED;
1503109864Sjeff}
1504109864Sjeff
1505109864Sjeffint
1506109864Sjeffsched_runnable(void)
1507109864Sjeff{
1508109864Sjeff struct kseq *kseq;
1509115998Sjeff int load;
1510109864Sjeff
1511115998Sjeff load = 1;
1512115998Sjeff
1513110028Sjeff kseq = KSEQ_SELF();
1514121790Sjeff#ifdef SMP
1515122094Sjeff if (kseq->ksq_assigned) {
1516122094Sjeff mtx_lock_spin(&sched_lock);
1517121790Sjeff kseq_assign(kseq);
1518122094Sjeff mtx_unlock_spin(&sched_lock);
1519122094Sjeff }
1520121790Sjeff#endif
1521121605Sjeff if ((curthread->td_flags & TDF_IDLETD) != 0) {
1522121605Sjeff if (kseq->ksq_load > 0)
1523121605Sjeff goto out;
1524121605Sjeff } else
1525121605Sjeff if (kseq->ksq_load - 1 > 0)
1526121605Sjeff goto out;
1527115998Sjeff load = 0;
1528115998Sjeffout:
1529115998Sjeff return (load);
1530109864Sjeff}
1531109864Sjeff
1532109864Sjeffvoid
1533109864Sjeffsched_userret(struct thread *td)
1534109864Sjeff{
1535109864Sjeff struct ksegrp *kg;
1536121605Sjeff
1537121605Sjeff kg = td->td_ksegrp;
1538109864Sjeff
1539109864Sjeff if (td->td_priority != kg->kg_user_pri) {
1540109864Sjeff mtx_lock_spin(&sched_lock);
1541109864Sjeff td->td_priority = kg->kg_user_pri;
1542109864Sjeff mtx_unlock_spin(&sched_lock);
1543109864Sjeff }
1544109864Sjeff}
1545109864Sjeff
1546109864Sjeffstruct kse *
1547109970Sjeffsched_choose(void)
1548109970Sjeff{
1549110028Sjeff struct kseq *kseq;
1550109970Sjeff struct kse *ke;
1551109970Sjeff
1552115998Sjeff mtx_assert(&sched_lock, MA_OWNED);
1553121790Sjeff kseq = KSEQ_SELF();
1554113357Sjeff#ifdef SMP
1555123433Sjeffrestart:
1556121790Sjeff if (kseq->ksq_assigned)
1557121790Sjeff kseq_assign(kseq);
1558113357Sjeff#endif
1559121790Sjeff ke = kseq_choose(kseq);
1560109864Sjeff if (ke) {
1561121790Sjeff#ifdef SMP
1562121790Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
1563123433Sjeff if (kseq_idled(kseq) == 0)
1564123433Sjeff goto restart;
1565121790Sjeff#endif
1566122744Sjeff kseq_runq_rem(kseq, ke);
1567109864Sjeff ke->ke_state = KES_THREAD;
1568112966Sjeff
1569113357Sjeff if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
1570113357Sjeff CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
1571113357Sjeff ke, ke->ke_runq, ke->ke_slice,
1572113357Sjeff ke->ke_thread->td_priority);
1573113357Sjeff }
1574113357Sjeff return (ke);
1575109864Sjeff }
1576109970Sjeff#ifdef SMP
1577123433Sjeff if (kseq_idled(kseq) == 0)
1578123433Sjeff goto restart;
1579109970Sjeff#endif
1580113357Sjeff return (NULL);
1581109864Sjeff}
1582109864Sjeff
1583109864Sjeffvoid
1584121127Sjeffsched_add(struct thread *td)
1585109864Sjeff{
1586131839Sjhb
1587131839Sjhb sched_add_internal(td, 1);
1588131839Sjhb}
1589131839Sjhb
1590131839Sjhbstatic void
1591131839Sjhbsched_add_internal(struct thread *td, int preemptive)
1592131839Sjhb{
1593110267Sjeff struct kseq *kseq;
1594113357Sjeff struct ksegrp *kg;
1595121127Sjeff struct kse *ke;
1596133427Sjeff#ifdef SMP
1597133427Sjeff int canmigrate;
1598133427Sjeff#endif
1599121790Sjeff int class;
1600109864Sjeff
1601121790Sjeff mtx_assert(&sched_lock, MA_OWNED);
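 /*
 * A kse flagged KEF_ASSIGNED is already in transit to another cpu's
 * kseq and will be enqueued by kseq_assign(), so it is skipped below.
 */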
1602121127Sjeff ke = td->td_kse;
1603121127Sjeff kg = td->td_ksegrp;
1604121790Sjeff if (ke->ke_flags & KEF_ASSIGNED)
1605121790Sjeff return;
1606121790Sjeff kseq = KSEQ_SELF();
1607124958Sjeff KASSERT((ke->ke_thread != NULL),
1608124958Sjeff ("sched_add: No thread on KSE"));
1609109864Sjeff KASSERT((ke->ke_thread->td_kse != NULL),
1610110267Sjeff ("sched_add: No KSE on thread"));
1611109864Sjeff KASSERT(ke->ke_state != KES_ONRUNQ,
1612110267Sjeff ("sched_add: kse %p (%s) already in run queue", ke,
1613109864Sjeff ke->ke_proc->p_comm));
1614109864Sjeff KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
1615110267Sjeff ("sched_add: process swapped out"));
1616113387Sjeff KASSERT(ke->ke_runq == NULL,
1617113387Sjeff ("sched_add: KSE %p is still assigned to a run queue", ke));
1618109864Sjeff
1619121790Sjeff class = PRI_BASE(kg->kg_pri_class);
1620121790Sjeff switch (class) {
1621112994Sjeff case PRI_ITHD:
1622112994Sjeff case PRI_REALTIME:
1623113357Sjeff ke->ke_runq = kseq->ksq_curr;
1624113357Sjeff ke->ke_slice = SCHED_SLICE_MAX;
1625113660Sjeff ke->ke_cpu = PCPU_GET(cpuid);
1626112994Sjeff break;
1627112994Sjeff case PRI_TIMESHARE:
1628113387Sjeff if (SCHED_CURR(kg, ke))
1629113387Sjeff ke->ke_runq = kseq->ksq_curr;
1630113387Sjeff else
1631113387Sjeff ke->ke_runq = kseq->ksq_next;
1632113357Sjeff break;
1633112994Sjeff case PRI_IDLE:
1634113357Sjeff /*
1635113357Sjeff * This is for priority prop.
1636113357Sjeff */
1637121605Sjeff if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
1638113357Sjeff ke->ke_runq = kseq->ksq_curr;
1639113357Sjeff else
1640113357Sjeff ke->ke_runq = &kseq->ksq_idle;
1641113357Sjeff ke->ke_slice = SCHED_SLICE_MIN;
1642112994Sjeff break;
1643113357Sjeff default:
1644121868Sjeff panic("Unknown pri class.");
1645113357Sjeff break;
1646112994Sjeff }
1647121790Sjeff#ifdef SMP
1648133427Sjeff /*
1649133427Sjeff * Don't migrate running threads here. Force the long term balancer
1650133427Sjeff * to do it.
1651133427Sjeff */
1652133427Sjeff canmigrate = KSE_CAN_MIGRATE(ke, class);
1653133555Sjeff if (ke->ke_flags & KEF_HOLD) {
1654133555Sjeff ke->ke_flags &= ~KEF_HOLD;
1655133427Sjeff canmigrate = 0;
1656133555Sjeff }
1657133427Sjeff /*
1658133427Sjeff * If this thread is pinned or bound, notify the target cpu.
1659133427Sjeff */
1660133427Sjeff if (!canmigrate && ke->ke_cpu != PCPU_GET(cpuid)) {
1661123529Sjeff ke->ke_runq = NULL;
1662123433Sjeff kseq_notify(ke, ke->ke_cpu);
1663123433Sjeff return;
1664123433Sjeff }
1665121790Sjeff /*
1666123685Sjeff * If we had been idle, clear our bit in the group and potentially
1667123685Sjeff * the global bitmap. If not, see if we should transfer this thread.
1668121790Sjeff */
1669123433Sjeff if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
1670123433Sjeff (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
1671121790Sjeff /*
1672123433Sjeff * Check to see if our group is unidling, and if so, remove it
1673123433Sjeff * from the global idle mask.
1674121790Sjeff */
1675123433Sjeff if (kseq->ksq_group->ksg_idlemask ==
1676123433Sjeff kseq->ksq_group->ksg_cpumask)
1677123433Sjeff atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
1678123433Sjeff /*
1679123433Sjeff * Now remove ourselves from the group specific idle mask.
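 * With the bit clear, peer cpus will no longer select this cpu as an
 * idle target when deciding where to push work.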
1680123433Sjeff */
1681123433Sjeff kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
1682133427Sjeff } else if (kseq->ksq_load > 1 && canmigrate)
1683123685Sjeff if (kseq_transfer(kseq, ke, class))
1684123685Sjeff return;
1685133427Sjeff ke->ke_cpu = PCPU_GET(cpuid);
1686121790Sjeff#endif
1687133427Sjeff /*
1688133427Sjeff * XXX With preemption this is not necessary.
1689133427Sjeff */
1690133555Sjeff if (td->td_priority < curthread->td_priority &&
1691133555Sjeff ke->ke_runq == kseq->ksq_curr)
1692133555Sjeff curthread->td_flags |= TDF_NEEDRESCHED;
1693131839Sjhb if (preemptive && maybe_preempt(td))
1694131481Sjhb return;
1695109864Sjeff ke->ke_ksegrp->kg_runq_kses++;
1696109864Sjeff ke->ke_state = KES_ONRUNQ;
1697109864Sjeff
1698122744Sjeff kseq_runq_add(kseq, ke);
1699122744Sjeff kseq_load_add(kseq, ke);
1700109864Sjeff}
1701109864Sjeff
1702109864Sjeffvoid
1703121127Sjeffsched_rem(struct thread *td)
1704109864Sjeff{
1705113357Sjeff struct kseq *kseq;
1706121127Sjeff struct kse *ke;
1707113357Sjeff
1708121127Sjeff ke = td->td_kse;
1709121790Sjeff /*
1710121790Sjeff * It is safe to just return here because sched_rem() is only ever
1711121790Sjeff * used in places where we're immediately going to add the
1712121790Sjeff * kse back on again. In that case it'll be added with the correct
1713121790Sjeff * thread and priority when the caller drops the sched_lock.
1714121790Sjeff */
1715121790Sjeff if (ke->ke_flags & KEF_ASSIGNED)
1716121790Sjeff return;
1717109864Sjeff mtx_assert(&sched_lock, MA_OWNED);
1718124958Sjeff KASSERT((ke->ke_state == KES_ONRUNQ),
1719124958Sjeff ("sched_rem: KSE not on run queue"));
1720109864Sjeff
1721109864Sjeff ke->ke_state = KES_THREAD;
1722109864Sjeff ke->ke_ksegrp->kg_runq_kses--;
1723113357Sjeff kseq = KSEQ_CPU(ke->ke_cpu);
1724122744Sjeff kseq_runq_rem(kseq, ke);
1725122744Sjeff kseq_load_rem(kseq, ke);
1726109864Sjeff}
1727109864Sjeff
1728109864Sjefffixpt_t
1729121127Sjeffsched_pctcpu(struct thread *td)
1730109864Sjeff{
1731109864Sjeff fixpt_t pctcpu;
1732121127Sjeff struct kse *ke;
1733109864Sjeff
1734109864Sjeff pctcpu = 0;
1735121127Sjeff ke = td->td_kse;
1736121290Sjeff if (ke == NULL)
1737121290Sjeff return (0);
1738109864Sjeff
1739115998Sjeff mtx_lock_spin(&sched_lock);
1740109864Sjeff if (ke->ke_ticks) {
1741109864Sjeff int rtick;
1742109864Sjeff
1743116365Sjeff /*
1744116365Sjeff * Don't update more frequently than twice a second. Allowing
1745116365Sjeff * this causes the cpu usage to decay away too quickly due to
1746116365Sjeff * rounding errors.
1747116365Sjeff */
1748123435Sjeff if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
1749123435Sjeff ke->ke_ltick < (ticks - (hz / 2)))
1750116365Sjeff sched_pctcpu_update(ke);
1751109864Sjeff /* How many rtick per second? */
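 /*
 * ke_ticks accumulates over the SCHED_CPU_TICKS window; dividing the
 * per-second rate by realstathz gives the fraction of a cpu consumed,
 * expressed in fixed point (FSCALE).
 */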
1752116365Sjeff rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
1753110226Sscottl pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
1754109864Sjeff }
1755109864Sjeff
1756109864Sjeff ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
1757113865Sjhb mtx_unlock_spin(&sched_lock);
1758109864Sjeff
1759109864Sjeff return (pctcpu);
1760109864Sjeff}
1761109864Sjeff
1762122038Sjeffvoid
1763122038Sjeffsched_bind(struct thread *td, int cpu)
1764122038Sjeff{
1765122038Sjeff struct kse *ke;
1766122038Sjeff
1767122038Sjeff mtx_assert(&sched_lock, MA_OWNED);
1768122038Sjeff ke = td->td_kse;
1769122038Sjeff ke->ke_flags |= KEF_BOUND;
1770123433Sjeff#ifdef SMP
1771123433Sjeff if (PCPU_GET(cpuid) == cpu)
1772122038Sjeff return;
1773122038Sjeff /* sched_rem without the runq_remove */
1774122038Sjeff ke->ke_state = KES_THREAD;
1775122038Sjeff ke->ke_ksegrp->kg_runq_kses--;
1776122744Sjeff kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
1777122038Sjeff kseq_notify(ke, cpu);
1778122038Sjeff /* When we return from mi_switch we'll be on the correct cpu. */
1779131527Sphk mi_switch(SW_VOL, NULL);
1780122038Sjeff#endif
1781122038Sjeff}
1782122038Sjeff
1783122038Sjeffvoid
1784122038Sjeffsched_unbind(struct thread *td)
1785122038Sjeff{
1786122038Sjeff mtx_assert(&sched_lock, MA_OWNED);
1787122038Sjeff td->td_kse->ke_flags &= ~KEF_BOUND;
1788122038Sjeff}
1789122038Sjeff
1790109864Sjeffint
1791125289Sjeffsched_load(void)
1792125289Sjeff{
1793125289Sjeff#ifdef SMP
1794125289Sjeff int total;
1795125289Sjeff int i;
1796125289Sjeff
1797125289Sjeff total = 0;
1798125289Sjeff for (i = 0; i <= ksg_maxid; i++)
1799125289Sjeff total += KSEQ_GROUP(i)->ksg_load;
1800125289Sjeff return (total);
1801125289Sjeff#else
1802125289Sjeff return (KSEQ_SELF()->ksq_sysload);
1803125289Sjeff#endif
1804125289Sjeff}
1805125289Sjeff
1806125289Sjeffint
1807109864Sjeffsched_sizeof_kse(void)
1808109864Sjeff{
1809109864Sjeff return (sizeof(struct kse) + sizeof(struct ke_sched));
1810109864Sjeff}
1811109864Sjeff
1812109864Sjeffint
1813109864Sjeffsched_sizeof_ksegrp(void)
1814109864Sjeff{
1815109864Sjeff return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
1816109864Sjeff}
1817109864Sjeff
1818109864Sjeffint
1819109864Sjeffsched_sizeof_proc(void)
1820109864Sjeff{
1821109864Sjeff return (sizeof(struct proc));
1822109864Sjeff}
1823109864Sjeff
1824109864Sjeffint
1825109864Sjeffsched_sizeof_thread(void)
1826109864Sjeff{
1827109864Sjeff return (sizeof(struct thread) + sizeof(struct td_sched));
1828109864Sjeff}
1829